From 52c12f42f0870323d9d47400bf8d868e286cd3ab Mon Sep 17 00:00:00 2001
From: John Field
Date: Wed, 6 Feb 2019 11:06:20 +0000
Subject: [PATCH 1/3] basic dockerize

---
 README.md          | 12 +++++++++++-
 docker-compose.yml |  9 +++++++++
 2 files changed, 20 insertions(+), 1 deletion(-)
 create mode 100644 docker-compose.yml

diff --git a/README.md b/README.md
index a87d760..51af04f 100644
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@
 
 #### What?
 
-* We want to be able to structure our dataset (see "Campaign Lab Data Inventory").
+* We want to be able to structure our dataset - see [Campaign Lab Data Inventory](https://docs.google.com/spreadsheets/d/1s5zWhdXi0-YBUMkK2Le3cfENBsfc29vOnFhnfn8N6dU).
 * In order to do this, we first should define what the structure (schema) of the different data sources are.
 * This will help us down the line to create modules that transform our raw data into our target data, for later export into a database, R package, or any other tools for utilising the data in a highly structured and annotated format.
 
@@ -55,3 +55,13 @@
 * *source* is a link (if available) to the actual dataset.
 * The *description* is a one liner that describes the dataset
 * *properties* is a list of the *datapoints* that we want to *end up with after transforming the raw dataset*.
+
+
+### Dockerized
+I'm learning my way around data science and Python, so I'm working with Docker to improve reproducibility, among other benefits.
+For now, the entire repo is mounted into the image's workspace.
+
+* `docker-compose up`
+* Get a login URL (localhost:8888?token=...) from the output
+* `docker exec -it jupyter-notebook /bin/bash`
+* `python -c 'from london_election_results import get_data; print(get_data())'`
\ No newline at end of file
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 0000000..7352482
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,9 @@
+version: '3.1'
+services:
+  notebook:
+    image: jupyter/datascience-notebook
+    container_name: jupyter-notebook
+    ports:
+      - "8888:8888"
+    volumes:
+      - ./:/home/jovyan/work

From 0955ab0d2e9f8360c41bf2c1eeffe4d61d6596d4 Mon Sep 17 00:00:00 2001
From: John Field
Date: Thu, 7 Feb 2019 06:58:37 +0000
Subject: [PATCH 2/3] Add basic elasticsearch for dev ETL

---
 docker-compose.yml | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/docker-compose.yml b/docker-compose.yml
index 7352482..a1169cf 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -7,3 +7,26 @@ services:
       - "8888:8888"
     volumes:
       - ./:/home/jovyan/work
+  dejavu:
+    image: appbaseio/dejavu
+    container_name: dejavu
+    ports:
+      - "1358:1358"
+    volumes:
+      - ./:/home/jovyan/work
+  elasticsearch:
+    image: docker.elastic.co/elasticsearch/elasticsearch-oss:6.5.2
+    container_name: elasticsearch
+    ports:
+      - "9200:9200"
+    #volumes:
+    #  - ./env/elasticsearch.yml:/usr/share/elasticsearch/config/elasticsearch.yml
+    environment:
+      discovery.type: "single-node"
+      ES_JAVA_OPTS: "-Xmx256m -Xms256m"
+      http.port: "9200"
+      http.cors.enabled: "true"
+      http.cors.allow-origin: "http://localhost:1358,http://127.0.0.1:1358"
+      http.cors.allow-headers: "X-Requested-With,X-Auth-Token,Content-Type,Content-Length,Authorization"
+      http.cors.allow-methods: "OPTIONS, HEAD, GET, PUT, POST, DELETE"
+      http.cors.allow-credentials: "true"

From 61abbd1a660c5c0fd9d8f690255d423dcb838d3e Mon Sep 17 00:00:00 2001
From: John Field
Date: Thu, 7 Feb 2019 13:01:34 +0000
Subject: [PATCH 3/3] Actually, just use kibana

---
 README.md          | 27 +++++++++++++++++++--------
 docker-compose.yml | 43 ++++++++++++++++++++++---------------------
 2 files changed, 41 insertions(+), 29 deletions(-)

diff --git a/README.md b/README.md
index 51af04f..4b36bab 100644
--- a/README.md
+++ b/README.md
@@ -1,8 +1,10 @@
 ## Campaign Lab Data Pipeline
 
+For context, see [Campaign Lab Guide](https://github.com/CampaignLab/Campaign-Lab-Guide/blob/master/Campaign%20Lab%20Guide.md).
+
 #### What?
 
-* We want to be able to structure our dataset - see [Campaign Lab Data Inventory](https://docs.google.com/spreadsheets/d/1s5zWhdXi0-YBUMkK2Le3cfENBsfc29vOnFhnfn8N6dU).
+* We want to be able to structure our dataset from the Data Inventory.
 * In order to do this, we first should define what the structure (schema) of the different data sources are.
 * This will help us down the line to create modules that transform our raw data into our target data, for later export into a database, R package, or any other tools for utilising the data in a highly structured and annotated format.
 
@@ -57,11 +59,20 @@
 * *properties* is a list of the *datapoints* that we want to *end up with after transforming the raw dataset*.
 
 
-### Dockerized
-I'm learning my way around data science and Python, so I'm working with Docker to improve reproducibility, among other benefits.
-For now, the entire repo is mounted into the image's workspace.
+### Toolset
+(The author is learning his way around data science and Python; better approaches are welcome.)
+Datasets are expected to be largely static; transformers are intended to be run manually and eyeballed as needed, rather than automated.
+They can be run in a local environment.
+For reproducibility and dev tooling, they can also be run in a container environment via Docker.
+
+Run a specific command:
+`docker-compose run datascience python -c 'from london_election_results import get_data; print(get_data())'`
+
+Running the environment:
 
-* `docker-compose up`
-* Get a login URL (localhost:8888?token=...) from the output
-* `docker exec -it jupyter-notebook /bin/bash`
-* `python -c 'from london_election_results import get_data; print(get_data())'`
\ No newline at end of file
+* `docker-compose up`
+* `http://localhost:9200` (Elasticsearch)
+* `http://localhost:5601` (Kibana)
+* Import a CSV with e.g.
+* `docker-compose exec datascience elasticsearch_loader --es-host http://elasticsearch:9200 --index campaignlab --type campaignlab csv ../schemas/local_election_results_2018-05-03.csv`
+* Follow https://www.elastic.co/guide/en/kibana/current/tutorial-build-dashboard.html to visualise.
\ No newline at end of file
diff --git a/docker-compose.yml b/docker-compose.yml
index a1169cf..29b932c 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -1,32 +1,33 @@
 version: '3.1'
 services:
-  notebook:
-    image: jupyter/datascience-notebook
-    container_name: jupyter-notebook
+  datascience:
+    image: civisanalytics/datascience-python:4.2.0
+    container_name: datascience-python
     ports:
       - "8888:8888"
     volumes:
-      - ./:/home/jovyan/work
-  dejavu:
-    image: appbaseio/dejavu
-    container_name: dejavu
-    ports:
-      - "1358:1358"
-    volumes:
-      - ./:/home/jovyan/work
+      - ./:/pipeline
+    working_dir: "/pipeline/transformers"
+    tty: true
+    # Keep the container running idle, so commands can be exec'd against it.
+ command: [ "/bin/sh", "-c", "pip install elasticsearch-loader; tail -f /dev/null"] elasticsearch: - image: docker.elastic.co/elasticsearch/elasticsearch-oss:6.5.2 + image: docker.elastic.co/elasticsearch/elasticsearch-oss:6.6.0 container_name: elasticsearch ports: - "9200:9200" - #volumes: - # - ./env/elasticsearch.yml:/usr/share/elasticsearch/config/elasticsearch.yml environment: - discovery.type: "single-node" + CLUSTER_NAME: "campaignlab" + HTTP_PORT: "9200" + DISCOVERY_TYPE: "single-node" ES_JAVA_OPTS: "-Xmx256m -Xms256m" - http.port: "9200" - http.cors.enabled: "true" - http.cors.allow-origin: "http://localhost:1358,http://127.0.0.1:1358" - http.cors.allow-headers: "X-Requested-With,X-Auth-Token,Content-Type,Content-Length,Authorization" - http.cors.allow-methods: "OPTIONS, HEAD, GET, PUT, POST, DELETE" - http.cors.allow-credentials: "true" + + kibana: + image: docker.elastic.co/kibana/kibana-oss:6.6.0 + container_name: kibana + ports: + - "5601:5601" + - "8080:8080" + environment: + SERVER_NAME: "kibana" + ELASTICSEARCH_HOSTS: "http://elasticsearch:9200"