From 4eabe4b225396dc5dc064174da19c5268d51a1c0 Mon Sep 17 00:00:00 2001
From: averheecke-tpx
Date: Wed, 24 Jul 2024 11:47:24 +0100
Subject: [PATCH 1/5] remove pipeline directory

---
 pipeline/column.csv        | 1 -
 pipeline/combine.csv       | 1 -
 pipeline/concat.csv        | 1 -
 pipeline/convert.csv       | 1 -
 pipeline/default-value.csv | 1 -
 pipeline/default.csv       | 1 -
 pipeline/lookup.csv        | 1 -
 pipeline/old-entity.csv    | 1 -
 pipeline/patch.csv         | 1 -
 pipeline/skip.csv          | 1 -
 10 files changed, 10 deletions(-)
 delete mode 100644 pipeline/column.csv
 delete mode 100644 pipeline/combine.csv
 delete mode 100644 pipeline/concat.csv
 delete mode 100644 pipeline/convert.csv
 delete mode 100644 pipeline/default-value.csv
 delete mode 100644 pipeline/default.csv
 delete mode 100644 pipeline/lookup.csv
 delete mode 100644 pipeline/old-entity.csv
 delete mode 100644 pipeline/patch.csv
 delete mode 100644 pipeline/skip.csv

diff --git a/pipeline/column.csv b/pipeline/column.csv
deleted file mode 100644
index d12e860..0000000
--- a/pipeline/column.csv
+++ /dev/null
@@ -1 +0,0 @@
-dataset,resource,column,field
diff --git a/pipeline/combine.csv b/pipeline/combine.csv
deleted file mode 100644
index e9ad393..0000000
--- a/pipeline/combine.csv
+++ /dev/null
@@ -1 +0,0 @@
-dataset,endpoint,field,separator,entry-date,start-date,end-date
diff --git a/pipeline/concat.csv b/pipeline/concat.csv
deleted file mode 100644
index 6d1a8d3..0000000
--- a/pipeline/concat.csv
+++ /dev/null
@@ -1 +0,0 @@
-dataset,resource,field,fields,separator,entry-date,start-date,end-date
diff --git a/pipeline/convert.csv b/pipeline/convert.csv
deleted file mode 100644
index a95d41f..0000000
--- a/pipeline/convert.csv
+++ /dev/null
@@ -1 +0,0 @@
-dataset,resource,script
\ No newline at end of file
diff --git a/pipeline/default-value.csv b/pipeline/default-value.csv
deleted file mode 100644
index a604363..0000000
--- a/pipeline/default-value.csv
+++ /dev/null
@@ -1 +0,0 @@
-dataset,endpoint,field,value
\ No newline at end of file
diff --git a/pipeline/default.csv b/pipeline/default.csv
deleted file mode 100644
index e8c9137..0000000
--- a/pipeline/default.csv
+++ /dev/null
@@ -1 +0,0 @@
-dataset,resource,field,default-field,entry-date,start-date,end-date
\ No newline at end of file
diff --git a/pipeline/lookup.csv b/pipeline/lookup.csv
deleted file mode 100644
index abd27b4..0000000
--- a/pipeline/lookup.csv
+++ /dev/null
@@ -1 +0,0 @@
-prefix,resource,entry-number,organisation,reference,entity
\ No newline at end of file
diff --git a/pipeline/old-entity.csv b/pipeline/old-entity.csv
deleted file mode 100644
index 997ad81..0000000
--- a/pipeline/old-entity.csv
+++ /dev/null
@@ -1 +0,0 @@
-old-entity,status,entity,notes
\ No newline at end of file
diff --git a/pipeline/patch.csv b/pipeline/patch.csv
deleted file mode 100644
index f8aa2c3..0000000
--- a/pipeline/patch.csv
+++ /dev/null
@@ -1 +0,0 @@
-dataset,resource,field,pattern,value
\ No newline at end of file
diff --git a/pipeline/skip.csv b/pipeline/skip.csv
deleted file mode 100644
index 4c96897..0000000
--- a/pipeline/skip.csv
+++ /dev/null
@@ -1 +0,0 @@
-dataset,resource,pattern
\ No newline at end of file

From 9d0e4e2275835aa13996ac9f1856bfc204c6e2eb Mon Sep 17 00:00:00 2001
From: averheecke-tpx
Date: Wed, 24 Jul 2024 11:47:48 +0100
Subject: [PATCH 2/5] remove collection directory

---
 collection/endpoint.csv     | 1 -
 collection/old-resource.csv | 1 -
 collection/source.csv       | 1 -
 3 files changed, 3 deletions(-)
 delete mode 100644 collection/endpoint.csv
 delete mode 100644 collection/old-resource.csv
 delete mode 100644 collection/source.csv
diff --git a/collection/endpoint.csv b/collection/endpoint.csv
deleted file mode 100644
index b47265b..0000000
--- a/collection/endpoint.csv
+++ /dev/null
@@ -1 +0,0 @@
-endpoint,endpoint-url,parameters,plugin,entry-date,start-date,end-date
diff --git a/collection/old-resource.csv b/collection/old-resource.csv
deleted file mode 100644
index b7262ec..0000000
--- a/collection/old-resource.csv
+++ /dev/null
@@ -1 +0,0 @@
-old-resource,status,resource,notes
diff --git a/collection/source.csv b/collection/source.csv
deleted file mode 100644
index 0213c45..0000000
--- a/collection/source.csv
+++ /dev/null
@@ -1 +0,0 @@
-source,attribution,collection,documentation-url,endpoint,licence,organisation,pipelines,entry-date,start-date,end-date

From 2b6d8385f07135cdf1d278f5513eda4ed1c89373 Mon Sep 17 00:00:00 2001
From: averheecke-tpx
Date: Wed, 24 Jul 2024 11:48:59 +0100
Subject: [PATCH 3/5] updated run.yml, removed others

---
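Note: run.yml is reduced to a thin caller of the reusable workflow held in the
digital-land/collection-template repository. `workflow_call` makes callable_run.yml
callable from other repositories, and `secrets: inherit` forwards the calling
repository's secrets, so the per-environment AWS credentials need not be redeclared
in every collection. With the nightly cron left commented out, a run has to be
started by hand; a hedged sketch of doing that with the GitHub CLI, where
<org>/<collection-repo> is a placeholder rather than a repository named in this
patch:

    gh workflow run run.yml --repo <org>/<collection-repo> --ref main
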
 .github/workflows/callable_run.yml | 279 -----------------------------
 .github/workflows/run.yml          |  80 +--------
 .github/workflows/run_caller.yml   |  10 --
 3 files changed, 6 insertions(+), 363 deletions(-)
 delete mode 100644 .github/workflows/callable_run.yml
 delete mode 100644 .github/workflows/run_caller.yml

diff --git a/.github/workflows/callable_run.yml b/.github/workflows/callable_run.yml
deleted file mode 100644
index df53178..0000000
--- a/.github/workflows/callable_run.yml
+++ /dev/null
@@ -1,279 +0,0 @@
-name: Run collection
-on:
-  workflow_call:
-    secrets:
-      DLB_BOT_EMAIL:
-        required: true
-      DLB_BOT_TOKEN:
-        required: true
-      DLB_BOT_USERNAME:
-        required: true
-      AWS_S3_ACCESS_KEY_ID:
-        required: true
-      AWS_S3_SECRET_ACCESS_KEY:
-        required: true
-env:
-  DLB_BOT_EMAIL: ${{ secrets.DLB_BOT_EMAIL }}
-  DLB_BOT_TOKEN: ${{ secrets.DLB_BOT_TOKEN }}
-  DLB_BOT_USERNAME: ${{ secrets.DLB_BOT_USERNAME }}
-jobs:
-  build:
-    runs-on: ubuntu-latest
-    steps:
-
-      - name: Free up disk space
-        run: |
-          df -h
-          sudo rm -rf /usr/share/dotnet
-          sudo rm -rf /usr/local/lib/android
-          sudo rm -rf /opt/ghc
-          echo
-          df -h
-
-      - uses: actions/checkout@v3
-        with:
-          lfs: true
-
-      - uses: actions/setup-python@v4
-        with:
-          python-version: 3.8
-
-      - name: Configure git
-        run: |
-          git config user.email "${DLB_BOT_EMAIL}"
-          git config user.name "${DLB_BOT_USERNAME}"
-          git remote set-url origin https://${DLB_BOT_USERNAME}:${DLB_BOT_TOKEN}@github.com/${GITHUB_REPOSITORY}.git
-          git checkout ${GITHUB_REF_NAME}
-
-      - name: Update makerules
-        run: make makerules
-
-      - name: Commit updated makerules
-        run: make commit-makerules
-
-      - name: Install dependencies
-        run: make init
-
-      - name: Run the collector
-        run: make collect
-
-      - name: Configure Development AWS Credentials
-        uses: aws-actions/configure-aws-credentials@v1-node16
-        with:
-          aws-access-key-id: ${{secrets.DEVELOPMENT_AWS_ACCESS_KEY_ID}}
-          aws-secret-access-key: ${{secrets.DEVELOPMENT_AWS_ACCESS_SECRET}}
-          aws-region: eu-west-2
-
-      - name: Save collected resources to Development S3
-        env:
-          COLLECTION_DATASET_BUCKET_NAME: ${{secrets.DEVELOPMENT_DATA_S3_BUCKET}}
-        run: make save-resources
-
-      - name: Configure Staging AWS Credentials
-        uses: aws-actions/configure-aws-credentials@v1-node16
-        with:
-          aws-access-key-id: ${{secrets.STAGING_AWS_ACCESS_KEY_ID}}
-          aws-secret-access-key: ${{secrets.STAGING_AWS_ACCESS_SECRET}}
-          aws-region: eu-west-2
-
-      - name: Save collected resources to Staging S3
-        env:
-          COLLECTION_DATASET_BUCKET_NAME: ${{secrets.STAGING_DATA_S3_BUCKET}}
-        run: make save-resources
-
-      - name: Configure Production AWS Credentials
-        uses: aws-actions/configure-aws-credentials@v1-node16
-        with:
-          aws-access-key-id: ${{secrets.PROD_AWS_ACCESS_KEY_ID}}
-          aws-secret-access-key: ${{secrets.PROD_AWS_ACCESS_SECRET}}
-          aws-region: eu-west-2
-
-      - name: Save collected resources to Prod S3
-        env:
-          COLLECTION_DATASET_BUCKET_NAME: ${{secrets.PRODUCTION_DATA_S3_BUCKET}}
-        run: make save-resources
-
-      - name: Save logs to Prod S3
-        env:
-          COLLECTION_DATASET_BUCKET_NAME: ${{secrets.PRODUCTION_DATA_S3_BUCKET}}
-        run: make save-logs
-
-      - name: Build the collection database
-        run: make collection
-
-      - name: Configure Development AWS Credentials
-        uses: aws-actions/configure-aws-credentials@v1-node16
-        with:
-          aws-access-key-id: ${{secrets.DEVELOPMENT_AWS_ACCESS_KEY_ID}}
-          aws-secret-access-key: ${{secrets.DEVELOPMENT_AWS_ACCESS_SECRET}}
-          aws-region: eu-west-2
-
-      - name: Push collection database to Development S3
-        env:
-          COLLECTION_DATASET_BUCKET_NAME: ${{secrets.DEVELOPMENT_DATA_S3_BUCKET}}
-        run: make save-collection
-
-      - name: Configure Staging AWS Credentials
-        uses: aws-actions/configure-aws-credentials@v1-node16
-        with:
-          aws-access-key-id: ${{secrets.STAGING_AWS_ACCESS_KEY_ID}}
-          aws-secret-access-key: ${{secrets.STAGING_AWS_ACCESS_SECRET}}
-          aws-region: eu-west-2
-
-      - name: Push collection database to Staging S3
-        env:
-          COLLECTION_DATASET_BUCKET_NAME: ${{secrets.STAGING_DATA_S3_BUCKET}}
-        run: make save-collection
-
-      - name: Configure Production AWS Credentials
-        uses: aws-actions/configure-aws-credentials@v1-node16
-        with:
-          aws-access-key-id: ${{secrets.PROD_AWS_ACCESS_KEY_ID}}
-          aws-secret-access-key: ${{secrets.PROD_AWS_ACCESS_SECRET}}
-          aws-region: eu-west-2
-
-      - name: Push collection database to Prod S3
-        env:
-          COLLECTION_DATASET_BUCKET_NAME: ${{secrets.PRODUCTION_DATA_S3_BUCKET}}
-        run: make save-collection
-
-      - name: transform collected files
-        run: make transformed -j 2
-
-      - name: Configure Development AWS Credentials
-        uses: aws-actions/configure-aws-credentials@v1-node16
-        with:
-          aws-access-key-id: ${{secrets.DEVELOPMENT_AWS_ACCESS_KEY_ID}}
-          aws-secret-access-key: ${{secrets.DEVELOPMENT_AWS_ACCESS_SECRET}}
-          aws-region: eu-west-2
-
-      - name: Save transformed files to Development S3
-        env:
-          COLLECTION_DATASET_BUCKET_NAME: ${{secrets.DEVELOPMENT_DATA_S3_BUCKET}}
-        run: make save-transformed
-
-      - name: Configure Staging AWS Credentials
-        uses: aws-actions/configure-aws-credentials@v1-node16
-        with:
-          aws-access-key-id: ${{secrets.STAGING_AWS_ACCESS_KEY_ID}}
-          aws-secret-access-key: ${{secrets.STAGING_AWS_ACCESS_SECRET}}
-          aws-region: eu-west-2
-
-      - name: Save transformed files to Staging S3
-        env:
-          COLLECTION_DATASET_BUCKET_NAME: ${{secrets.STAGING_DATA_S3_BUCKET}}
-        run: make save-transformed
-
-      - name: Configure Production AWS Credentials
-        uses: aws-actions/configure-aws-credentials@v1-node16
-        with:
-          aws-access-key-id: ${{secrets.PROD_AWS_ACCESS_KEY_ID}}
-          aws-secret-access-key: ${{secrets.PROD_AWS_ACCESS_SECRET}}
-          aws-region: eu-west-2
-
-      - name: Save transformed files to Prod S3
-        env:
-          COLLECTION_DATASET_BUCKET_NAME: ${{secrets.PRODUCTION_DATA_S3_BUCKET}}
-        run: make save-transformed
-
-      - name: Build datasets from the transformed files
-        run: make dataset
-
-      - name: Configure Development AWS Credentials
-        if: always()
-        uses: aws-actions/configure-aws-credentials@v1-node16
-        with:
-          aws-access-key-id: ${{secrets.DEVELOPMENT_AWS_ACCESS_KEY_ID}}
-          aws-secret-access-key: ${{secrets.DEVELOPMENT_AWS_ACCESS_SECRET}}
-          aws-region: eu-west-2
-
-      - name: Save datasets to Development S3
-        env:
-          COLLECTION_DATASET_BUCKET_NAME: ${{secrets.DEVELOPMENT_DATA_S3_BUCKET}}
-          HOISTED_COLLECTION_DATASET_BUCKET_NAME: ${{secrets.DEVELOPMENT_DATA_S3_BUCKET}}
-        run: make save-dataset
-
-      - name: Save expectations to Development S3
-        if: always()
-        env:
-          COLLECTION_DATASET_BUCKET_NAME: ${{secrets.DEVELOPMENT_DATA_S3_BUCKET}}
-        run: make save-expectations
-
-      - name: Configure Staging AWS Credentials
-        if: always()
-        uses: aws-actions/configure-aws-credentials@v1-node16
-        with:
-          aws-access-key-id: ${{secrets.STAGING_AWS_ACCESS_KEY_ID}}
-          aws-secret-access-key: ${{secrets.STAGING_AWS_ACCESS_SECRET}}
-          aws-region: eu-west-2
-
-      - name: Save datasets to Staging S3
-        env:
-          COLLECTION_DATASET_BUCKET_NAME: ${{secrets.STAGING_DATA_S3_BUCKET}}
-          HOISTED_COLLECTION_DATASET_BUCKET_NAME: ${{secrets.STAGING_DATA_S3_BUCKET}}
-        run: make save-dataset
-
-      - name: Save expectations to Staging S3
-        if: always()
-        env:
-          COLLECTION_DATASET_BUCKET_NAME: ${{secrets.STAGING_DATA_S3_BUCKET}}
-        run: make save-expectations
-
-      - name: Configure Production AWS Credentials
-        if: always()
-        uses: aws-actions/configure-aws-credentials@v1-node16
-        with:
-          aws-access-key-id: ${{secrets.PROD_AWS_ACCESS_KEY_ID}}
-          aws-secret-access-key: ${{secrets.PROD_AWS_ACCESS_SECRET}}
-          aws-region: eu-west-2
-
-      - name: Save datasets to Prod S3
-        env:
-          COLLECTION_DATASET_BUCKET_NAME: ${{secrets.PRODUCTION_DATA_S3_BUCKET}}
-          HOISTED_COLLECTION_DATASET_BUCKET_NAME: ${{secrets.PRODUCTION_DATA_S3_BUCKET}}
-        run: make save-dataset
-
-      - name: Save expectations to Prod S3
-        if: always()
-        env:
-          COLLECTION_DATASET_BUCKET_NAME: ${{secrets.PRODUCTION_DATA_S3_BUCKET}}
-        run: make save-expectations
-
-  check-pipeline-errors:
-    runs-on: ubuntu-latest
-    needs:
-      - build
-    if: always() && contains(join(needs.*.result, ','), 'failure')
-    steps:
-      - name: send failure notification
-        uses: slackapi/slack-github-action@v1
-        env:
-          SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
-        with:
-          channel-id: 'planning-data-platform'
-          payload: |
-            {
-              "text": "Collection Run: ${{ github.repository }}",
-              "icon_emoji": ":warning:",
-              "username": "CollectionRunner",
-              "blocks": [
-                {
-                  "type": "header",
-                  "text": {
-                    "type": "plain_text",
-                    "text": "Collection Run Failed: ${{ github.repository }} "
-                  }
-                },
-                {
-                  "type": "divider"
-                },
-                {
-                  "type": "section",
-                  "text": {
-                    "type": "mrkdwn",
-                    "text": "The report for this run is available on "
-                  }
-                }
-              ]
-            }
-
diff --git a/.github/workflows/run.yml b/.github/workflows/run.yml
index 178485e..9c1a5e6 100644
--- a/.github/workflows/run.yml
+++ b/.github/workflows/run.yml
@@ -1,78 +1,10 @@
-name: Run collection
+name: Call Collection Run
 on:
-  schedule:
-    - cron: 0 0 * * *
+  # schedule:
+  #   - cron: 0 0 * * *
   workflow_dispatch: null
-env:
-  DLB_BOT_EMAIL: ${{ secrets.DLB_BOT_EMAIL }}
-  DLB_BOT_TOKEN: ${{ secrets.DLB_BOT_TOKEN }}
-  DLB_BOT_USERNAME: ${{ secrets.DLB_BOT_USERNAME }}
 jobs:
-  build:
-    runs-on: ubuntu-latest
-    steps:
+  call-workflow:
+    uses: digital-land/collection-template/.github/workflows/callable_run.yml@main
+    secrets: inherit
-
-      - name: Free up disk space
-        run: |
-          df -h
-          sudo rm -rf /usr/share/dotnet
-          sudo rm -rf /usr/local/lib/android
-          sudo rm -rf /opt/ghc
-          echo
-          df -h
-
-      - uses: actions/checkout@v2
-
-      - uses: actions/setup-python@v2
-        with:
-          python-version: 3.8
-
-      - name: Configure AWS Credentials
-        uses: aws-actions/configure-aws-credentials@v1
-        with:
-          aws-access-key-id: ${{secrets.AWS_S3_ACCESS_KEY_ID}}
-          aws-secret-access-key: ${{secrets.AWS_S3_SECRET_ACCESS_KEY}}
-          aws-region: eu-west-2
-
-      - name: Configure git
-        run: |
-          git config user.email "${DLB_BOT_EMAIL}"
-          git config user.name "${DLB_BOT_USERNAME}"
-          git remote set-url origin https://${DLB_BOT_USERNAME}:${DLB_BOT_TOKEN}@github.com/${GITHUB_REPOSITORY}.git
-          git checkout ${GITHUB_REF##*/}
-
-      - name: Update makerules
-        run: make makerules
-
-      - name: Commit updated makerules
-        run: make commit-makerules
-
-      - name: Install dependencies
-        run: make init
-
-      - name: Run the collector
-        run: make collect
-
-      - name: Commit collection logs
-        run: make commit-collection
-
-      - name: Save collected resources to S3
-        run: make save-resources
-
-      - name: Build the collection database
-        run: make collection
-
-      - name: Push collection database to S3
-        run: make save-collection
-
-      - name: transform collected files
-        run: make transformed
-
-      - name: Save transformed files to S3
-        run: make save-transformed
-
-      - name: Build datasets from the transformed files
-        run: make dataset
-
-      - name: Save datasets to S3
-        run: make save-dataset
 
diff --git a/.github/workflows/run_caller.yml b/.github/workflows/run_caller.yml
deleted file mode 100644
index 9a54829..0000000
--- a/.github/workflows/run_caller.yml
+++ /dev/null
@@ -1,10 +0,0 @@
-name: Call Collection Run
-on:
-  schedule:
-    - cron: 0 0 * * *
-  workflow_dispatch: null
-jobs:
-  call-workflow:
-    uses: digital-land/collection-template/.github/workflows/callable_run.yml@main
-    secrets: inherit
-

From 43c8529f679e347004e1e9d83dd7e7c62f62c4b9 Mon Sep 17 00:00:00 2001
From: averheecke-tpx
Date: Wed, 24 Jul 2024 12:04:56 +0100
Subject: [PATCH 4/5] updated makerules folder files to latest versions

---
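Note: these makerules move the source, endpoint, old-resource and pipeline CSVs out of
the repository and fetch them on demand: every URL and directory is a variable guarded
by an `ifeq ($(VAR),)` default, and the new `config` target downloads the files from
the central digital-land/config repository (with an S3-hosted fallback for the
pipeline files). Because make command-line assignments override those defaults, a
collection can point the download elsewhere. A hedged sketch, in which
`listed-building` and the fork URL are illustrative values rather than ones taken
from this patch:

    # fetch collection/*.csv and pipeline/*.csv for an explicitly named collection
    make config COLLECTION_NAME=listed-building

    # fetch the same files from a fork of the configuration repository instead
    make config CONFIG_URL=https://raw.githubusercontent.com/example-fork/config/main/
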
 makerules/collection.mk | 44 ++++++++++++++++++++---
 makerules/makerules.mk  | 35 ++++++++++++++++--
 makerules/pipeline.mk   | 80 +++++++++++++++++++++++++++++++----------
 3 files changed, 132 insertions(+), 27 deletions(-)

diff --git a/makerules/collection.mk b/makerules/collection.mk
index d24ff45..4071474 100644
--- a/makerules/collection.mk
+++ b/makerules/collection.mk
@@ -4,6 +4,10 @@
 	commit-collection\
 	clobber-today
 
+ifeq ($(COLLECTION_CONFIG_URL),)
+COLLECTION_CONFIG_URL=$(CONFIG_URL)collection/$(COLLECTION_NAME)/
+endif
+
 ifeq ($(COLLECTION_DIR),)
 COLLECTION_DIR=collection/
 endif
@@ -13,13 +17,21 @@ RESOURCE_DIR=$(COLLECTION_DIR)resource/
 endif
 
 ifeq ($(DATASTORE_URL),)
-DATASTORE_URL=https://$(COLLECTION_DATASET_BUCKET_NAME).s3.eu-west-2.amazonaws.com/
+DATASTORE_URL=https://files.planning.data.gov.uk/
 endif
 
 # data sources
 SOURCE_CSV=$(COLLECTION_DIR)source.csv
 ENDPOINT_CSV=$(COLLECTION_DIR)endpoint.csv
+OLD_RESOURCE_CSV=$(COLLECTION_DIR)old-resource.csv
+
+ifeq ($(COLLECTION_CONFIG_FILES),)
+COLLECTION_CONFIG_FILES=\
+	$(SOURCE_CSV)\
+	$(ENDPOINT_CSV)\
+	$(OLD_RESOURCE_CSV)
+endif
 
 # collection log
 LOG_DIR=$(COLLECTION_DIR)log/
@@ -30,16 +42,26 @@ COLLECTION_INDEX=\
 	$(COLLECTION_DIR)/log.csv\
 	$(COLLECTION_DIR)/resource.csv
 
+init::
+	@curl -o /dev/null -s -w "%{http_code}" '$(DATASTORE_URL)$(REPOSITORY)/$(COLLECTION_DIR)log.csv' > /tmp/log_status_code
+	@curl -o /dev/null -s -w "%{http_code}" '$(DATASTORE_URL)$(REPOSITORY)/$(COLLECTION_DIR)resource.csv' > /tmp/resource_status_code
+	@if [ $$(cat /tmp/log_status_code) -ne 403 ] && [ $$(cat /tmp/resource_status_code) -ne 403 ]; then \
+		curl -qfsL '$(DATASTORE_URL)$(REPOSITORY)/$(COLLECTION_DIR)log.csv' > $(COLLECTION_DIR)log.csv; \
+		curl -qfsL '$(DATASTORE_URL)$(REPOSITORY)/$(COLLECTION_DIR)resource.csv' > $(COLLECTION_DIR)resource.csv; \
+	fi
+	@rm -f /tmp/log_status_code /tmp/resource_status_code
+
+
 first-pass:: collect
 
 second-pass:: collection
 
-collect:: $(SOURCE_CSV) $(ENDPOINT_CSV)
+collect:: $(COLLECTION_CONFIG_FILES)
 	@mkdir -p $(RESOURCE_DIR)
-	digital-land collect $(ENDPOINT_CSV)
+	digital-land ${DIGITAL_LAND_OPTS} collect $(ENDPOINT_CSV)
 
 collection::
-	digital-land collection-save-csv
+	digital-land ${DIGITAL_LAND_OPTS} collection-save-csv
 
 clobber-today::
 	rm -rf $(LOG_FILES_TODAY) $(COLLECTION_INDEX)
@@ -48,7 +70,7 @@ makerules::
 	curl -qfsL '$(SOURCE_URL)/makerules/main/collection.mk' > makerules/collection.mk
 
 commit-collection::
-	git add collection
+	git add collection/log
 	git diff --quiet && git diff --staged --quiet || (git commit -m "Collection $(shell date +%F)"; git push origin $(BRANCH))
 
@@ -57,12 +79,24 @@ load-resources::
 
 save-resources::
 	aws s3 sync $(RESOURCE_DIR) s3://$(COLLECTION_DATASET_BUCKET_NAME)/$(REPOSITORY)/$(RESOURCE_DIR) --no-progress
 
+save-logs::
+	aws s3 sync $(COLLECTION_DIR)log s3://$(COLLECTION_DATASET_BUCKET_NAME)/$(REPOSITORY)/$(COLLECTION_DIR)log --no-progress
+
 save-collection::
 	aws s3 cp $(COLLECTION_DIR)log.csv s3://$(COLLECTION_DATASET_BUCKET_NAME)/$(REPOSITORY)/$(COLLECTION_DIR) --no-progress
 	aws s3 cp $(COLLECTION_DIR)resource.csv s3://$(COLLECTION_DATASET_BUCKET_NAME)/$(REPOSITORY)/$(COLLECTION_DIR) --no-progress
 endif
 
 collection/resource/%:
 	@mkdir -p collection/resource/
 	curl -qfsL '$(DATASTORE_URL)$(REPOSITORY)/$(RESOURCE_DIR)$(notdir $@)' > $@
+
+collection/%.csv:
+	@mkdir -p $(COLLECTION_DIR)
+	curl -qfsL '$(COLLECTION_CONFIG_URL)$(notdir $@)' > $@
+
+config:: $(COLLECTION_CONFIG_FILES)
+
+clean::
+	rm -f $(COLLECTION_CONFIG_FILES)
\ No newline at end of file
diff --git a/makerules/makerules.mk b/makerules/makerules.mk
index 3f544fc..22d0495 100644
--- a/makerules/makerules.mk
+++ b/makerules/makerules.mk
@@ -1,5 +1,3 @@
-SOURCE_URL=https://raw.githubusercontent.com/digital-land/
-
 # deduce the repository
 ifeq ($(REPOSITORY),)
 REPOSITORY=$(shell basename -s .git `git config --get remote.origin.url`)
@@ -6,19 +4,40 @@ endif
 
 ifeq ($(ENVIRONMENT),)
 ENVIRONMENT=production
 endif
+
+ifeq ($(SOURCE_URL),)
+SOURCE_URL=https://raw.githubusercontent.com/digital-land/
+endif
+
+ifeq ($(CONFIG_URL),)
+CONFIG_URL=https://raw.githubusercontent.com/digital-land/config/main/
+endif
+
+ifeq ($(COLLECTION_NAME),)
+COLLECTION_NAME=$(shell echo "$(REPOSITORY)"|sed 's/-collection$$//')
+endif
+
 ifeq ($(COLLECTION_DATASET_BUCKET_NAME),)
 COLLECTION_DATASET_BUCKET_NAME=digital-land-$(ENVIRONMENT)-collection-dataset
 endif
+
 ifeq ($(HOISTED_COLLECTION_DATASET_BUCKET_NAME),)
 HOISTED_COLLECTION_DATASET_BUCKET_NAME=digital-land-$(ENVIRONMENT)-collection-dataset-hoisted
 endif
+
 define dataset_url
 'https://$(COLLECTION_DATASET_BUCKET_NAME).s3.eu-west-2.amazonaws.com/$(2)-collection/dataset/$(1).sqlite3'
 endef
+
+ifeq ($(CACHE_DIR),)
+CACHE_DIR=var/cache/
+endif
+
+
 .PHONY: \
 	makerules\
 	specification\
+	config\
 	init\
 	first-pass\
 	second-pass\
@@ -116,12 +135,22 @@ specification::
 	curl -qfsL '$(SOURCE_URL)/specification/main/specification/schema.csv' > specification/schema.csv
 	curl -qfsL '$(SOURCE_URL)/specification/main/specification/schema-field.csv' > specification/schema-field.csv
 
+
 init:: specification
 endif
 
+# local copy of organsiation datapackage
+$(CACHE_DIR)organisation.csv:
+	@mkdir -p $(CACHE_DIR)
+	curl -qfs "https://files.planning.data.gov.uk/organisation-collection/dataset/organisation.csv" > $(CACHE_DIR)organisation.csv
+
+init:: config
+
+config::;
+
 commit-makerules::
 	git add makerules
 	git diff --quiet && git diff --staged --quiet || (git commit -m "Updated makerules $(shell date +%F)"; git push origin $(BRANCH))
 
 commit-collection::
-	@:
+	@:
\ No newline at end of file
diff --git a/makerules/pipeline.mk b/makerules/pipeline.mk
index 2c07619..d9121d9 100644
--- a/makerules/pipeline.mk
+++ b/makerules/pipeline.mk
@@ -4,11 +4,19 @@
 	commit-dataset
 
 # data sources
 
-# collected resources
+ifeq ($(PIPELINE_CONFIG_URL),)
+PIPELINE_CONFIG_URL=$(CONFIG_URL)pipeline/$(COLLECTION_NAME)/
+endif
+
 ifeq ($(COLLECTION_DIR),)
 COLLECTION_DIR=collection/
 endif
 
+ifeq ($(PIPELINE_DIR),)
+PIPELINE_DIR=pipeline/
+endif
+
+# collected resources
 ifeq ($(RESOURCE_DIR),)
 RESOURCE_DIR=$(COLLECTION_DIR)resource/
 endif
@@ -63,28 +71,44 @@ ifeq ($(EXPECTATION_DIR),)
 EXPECTATION_DIR = expectations/
 endif
 
+ifeq ($(PIPELINE_CONFIG_FILES),)
+PIPELINE_CONFIG_FILES=\
+	$(PIPELINE_DIR)column.csv\
+	$(PIPELINE_DIR)combine.csv\
+	$(PIPELINE_DIR)concat.csv\
+	$(PIPELINE_DIR)convert.csv\
+	$(PIPELINE_DIR)default.csv\
+	$(PIPELINE_DIR)default-value.csv\
+	$(PIPELINE_DIR)filter.csv\
+	$(PIPELINE_DIR)lookup.csv\
+	$(PIPELINE_DIR)old-entity.csv\
+	$(PIPELINE_DIR)patch.csv\
+	$(PIPELINE_DIR)skip.csv\
+	$(PIPELINE_DIR)transform.csv
+endif
+
 define run-pipeline
 	mkdir -p $(@D) $(ISSUE_DIR)$(notdir $(@D)) $(COLUMN_FIELD_DIR)$(notdir $(@D)) $(DATASET_RESOURCE_DIR)$(notdir $(@D))
-	digital-land --dataset $(notdir $(@D)) $(DIGITAL_LAND_FLAGS) pipeline $(1) --issue-dir $(ISSUE_DIR)$(notdir $(@D)) --column-field-dir $(COLUMN_FIELD_DIR)$(notdir $(@D)) --dataset-resource-dir $(DATASET_RESOURCE_DIR)$(notdir $(@D)) $(PIPELINE_FLAGS) $< $@
+	digital-land ${DIGITAL_LAND_OPTS} --dataset $(notdir $(@D)) $(DIGITAL_LAND_FLAGS) pipeline $(1) --issue-dir $(ISSUE_DIR)$(notdir $(@D)) --column-field-dir $(COLUMN_FIELD_DIR)$(notdir $(@D)) --dataset-resource-dir $(DATASET_RESOURCE_DIR)$(notdir $(@D)) $(PIPELINE_FLAGS) $< $@
 endef
 
 define build-dataset =
 	mkdir -p $(@D)
-	time digital-land --dataset $(notdir $(basename $@)) dataset-create --output-path $(basename $@).sqlite3 $(^)
+	time digital-land ${DIGITAL_LAND_OPTS} --dataset $(notdir $(basename $@)) dataset-create --output-path $(basename $@).sqlite3 $(^)
 	time datasette inspect $(basename $@).sqlite3 --inspect-file=$(basename $@).sqlite3.json
-	time digital-land --dataset $(notdir $(basename $@)) dataset-entries $(basename $@).sqlite3 $@
+	time digital-land ${DIGITAL_LAND_OPTS} --dataset $(notdir $(basename $@)) dataset-entries $(basename $@).sqlite3 $@
 	mkdir -p $(FLATTENED_DIR)
-	time digital-land --dataset $(notdir $(basename $@)) dataset-entries-flattened $@ $(FLATTENED_DIR)
+	time digital-land ${DIGITAL_LAND_OPTS} --dataset $(notdir $(basename $@)) dataset-entries-flattened $@ $(FLATTENED_DIR)
 	md5sum $@ $(basename $@).sqlite3
-	csvstack $(wildcard $(ISSUE_DIR)/$(notdir $(basename $@))/*.csv) > $(basename $@)-issue.csv
-	mkdir -p $(EXPECTATION_DIR)yamls/data_acceptance/
-	mkdir -p $(EXPECTATION_DIR)results/data_acceptance/$(notdir $(basename $@))
-	-curl -qsfL 'https://raw.githubusercontent.com/digital-land/expectations-config/main/dataset_acceptance/$(notdir $(basename $@)).yaml' > $(EXPECTATION_DIR)yamls/data_acceptance/$(notdir $(basename $@)).yaml
-	time digital-land expectations --results-path "$(EXPECTATION_DIR)results/data_acceptance/$(notdir $(basename $@))" --sqlite-dataset-path "$(basename $@).sqlite3" --data-quality-yaml "$(EXPECTATION_DIR)yamls/data_acceptance/$(notdir $(basename $@)).yaml"
+	csvstack $(ISSUE_DIR)$(notdir $(basename $@))/*.csv > $(basename $@)-issue.csv
+	mkdir -p $(EXPECTATION_DIR)
+	time digital-land ${DIGITAL_LAND_OPTS} expectations-dataset-checkpoint --output-dir=$(EXPECTATION_DIR) --specification-dir=specification --data-path=$(basename $@).sqlite3
+	csvstack $(EXPECTATION_DIR)/**/$(notdir $(basename $@))-results.csv > $(basename $@)-expectation-result.csv
+	csvstack $(EXPECTATION_DIR)/**/$(notdir $(basename $@))-issues.csv > $(basename $@)-expectation-issue.csv
 endef
 
 collection::
-	digital-land collection-pipeline-makerules > collection/pipeline.mk
+	digital-land ${DIGITAL_LAND_OPTS} collection-pipeline-makerules > collection/pipeline.mk
 
 -include collection/pipeline.mk
@@ -101,25 +125,24 @@ init::
 ifndef GDAL
 ifeq ($(UNAME),Darwin)
 	$(error GDAL tools not found in PATH)
 endif
+	sudo add-apt-repository ppa:ubuntugis/ppa
+	sudo apt-get update
 	sudo apt-get install gdal-bin
 endif
 	pyproj sync --file uk_os_OSTN15_NTv2_OSGBtoETRS.tif -v
 ifeq ($(UNAME),Linux)
-	sudo apt-get install libsqlite3-mod-spatialite
+	dpkg-query -W libsqlite3-mod-spatialite >/dev/null 2>&1 || sudo apt-get install libsqlite3-mod-spatialite
 endif
-
 
 clobber::
 	rm -rf $(DATASET_DIRS)
 
 clean::
 	rm -rf ./var
 
 # local copy of the organisation dataset
-init::
-	@mkdir -p $(CACHE_DIR)
-	curl -qfs "https://raw.githubusercontent.com/digital-land/organisation-dataset/main/collection/organisation.csv" > $(CACHE_DIR)organisation.csv
+init:: $(CACHE_DIR)organisation.csv
 
 makerules::
	curl -qfsL '$(SOURCE_URL)/makerules/main/pipeline.mk' > makerules/pipeline.mk
@@ -132,19 +155,23 @@ save-transformed::
 	aws s3 sync $(TRANSFORMED_DIR) s3://$(COLLECTION_DATASET_BUCKET_NAME)/$(REPOSITORY)/$(TRANSFORMED_DIR) --no-progress
 
 save-dataset::
 	aws s3 sync $(DATASET_DIR) s3://$(COLLECTION_DATASET_BUCKET_NAME)/$(REPOSITORY)/$(DATASET_DIR) --no-progress
 	@mkdir -p $(FLATTENED_DIR)
+ifeq ($(HOISTED_COLLECTION_DATASET_BUCKET_NAME),digital-land-$(ENVIRONMENT)-collection-dataset-hoisted)
 	aws s3 sync $(FLATTENED_DIR) s3://$(HOISTED_COLLECTION_DATASET_BUCKET_NAME)/data/ --no-progress
+else
+	aws s3 sync $(FLATTENED_DIR) s3://$(HOISTED_COLLECTION_DATASET_BUCKET_NAME)/dataset/ --no-progress --content-disposition attachment
+endif
 
 save-expectations::
-	@mkdir -p $(EXPECTATION_DIR)results/
-	aws s3 sync $(EXPECTATION_DIR)results/ s3://$(COLLECTION_DATASET_BUCKET_NAME)/$(REPOSITORY)/$(EXPECTATION_DIR)
+	@mkdir -p $(EXPECTATION_DIR)
+	aws s3 sync $(EXPECTATION_DIR) s3://$(COLLECTION_DATASET_BUCKET_NAME)/$(EXPECTATION_DIR) --exclude "*" --include "*.csv" --no-progress
 
 # convert an individual resource
 # .. this assumes conversion is the same for every dataset, but it may not be soon
 var/converted/%.csv: collection/resource/%
 	mkdir -p var/converted/
-	digital-land convert $<
+	digital-land ${DIGITAL_LAND_OPTS} convert $<
 
 transformed::
 	@mkdir -p $(TRANSFORMED_DIR)
@@ -155,3 +182,18 @@ datasette: metadata.json
 	 --setting sql_time_limit_ms 5000 \
 	 --load-extension $(SPATIALITE_EXTENSION) \
 	 --metadata metadata.json
+
+FALLBACK_CONFIG_URL := https://files.planning.data.gov.uk/config/pipeline/$(COLLECTION_NAME)/
+
+$(PIPELINE_DIR)%.csv:
+	@mkdir -p $(PIPELINE_DIR)
+	@if [ ! -f $@ ]; then \
+		echo "Config file $@ not found locally. Attempting to download..."; \
+		curl -qfsL '$(PIPELINE_CONFIG_URL)$(notdir $@)' -o $@ || \
+		(echo "File not found in config repo. Attempting to download from AWS..." && curl -qfsL '$(FALLBACK_CONFIG_URL)$(notdir $@)' -o $@); \
+	fi
+
+config:: $(PIPELINE_CONFIG_FILES)
+
+clean::
+	rm -f $(PIPELINE_CONFIG_FILES)
\ No newline at end of file

From 95682e7fb20f4df9f22b136f3bc61757219b6543 Mon Sep 17 00:00:00 2001
From: averheecke-tpx
Date: Fri, 11 Oct 2024 12:11:03 +0100
Subject: [PATCH 5/5] Delete .github/workflows/callable_run.yml

---
 .github/workflows/callable_run.yml | 297 -----------------------------
 1 file changed, 297 deletions(-)
 delete mode 100644 .github/workflows/callable_run.yml

diff --git a/.github/workflows/callable_run.yml b/.github/workflows/callable_run.yml
deleted file mode 100644
index e9fea50..0000000
--- a/.github/workflows/callable_run.yml
+++ /dev/null
@@ -1,297 +0,0 @@
-name: Run collection
-on:
-  workflow_call:
-    secrets:
-      DLB_BOT_EMAIL:
-        required: true
-      DLB_BOT_TOKEN:
-        required: true
-      DLB_BOT_USERNAME:
-        required: true
-      AWS_S3_ACCESS_KEY_ID:
-        required: true
-      AWS_S3_SECRET_ACCESS_KEY:
-        required: true
-env:
-  DLB_BOT_EMAIL: ${{ secrets.DLB_BOT_EMAIL }}
-  DLB_BOT_TOKEN: ${{ secrets.DLB_BOT_TOKEN }}
-  DLB_BOT_USERNAME: ${{ secrets.DLB_BOT_USERNAME }}
-jobs:
-  build:
-    runs-on: ubuntu-latest
-    steps:
-
-      - name: Free up disk space
-        run: |
-          df -h
-          sudo rm -rf /usr/share/dotnet
-          sudo rm -rf /usr/local/lib/android
-          sudo rm -rf /opt/ghc
-          echo
-          df -h
-
-      - uses: actions/checkout@v3
-        with:
-          lfs: true
-
-      - uses: actions/setup-python@v4
-        with:
-          python-version: 3.8
-
-      - name: Configure git
-        run: |
-          git config user.email "${DLB_BOT_EMAIL}"
-          git config user.name "${DLB_BOT_USERNAME}"
-          git remote set-url origin https://${DLB_BOT_USERNAME}:${DLB_BOT_TOKEN}@github.com/${GITHUB_REPOSITORY}.git
-          git checkout ${GITHUB_REF_NAME}
-
-      - name: Update makerules
-        run: make makerules
-
-      - name: Commit updated makerules
-        run: make commit-makerules
-
-      - name: Install dependencies
-        run: make init
-
-      - name: Run the collector
-        run: make collect
-
-      - name: Configure Development AWS Credentials
-        uses: aws-actions/configure-aws-credentials@v1-node16
-        with:
-          aws-access-key-id: ${{secrets.DEVELOPMENT_AWS_ACCESS_KEY_ID}}
-          aws-secret-access-key: ${{secrets.DEVELOPMENT_AWS_ACCESS_SECRET}}
-          aws-region: eu-west-2
-
-      - name: Save collected resources to Development S3
-        env:
-          COLLECTION_DATASET_BUCKET_NAME: ${{secrets.DEVELOPMENT_DATA_S3_BUCKET}}
-        run: make save-resources
-
-      - name: Configure Staging AWS Credentials
-        uses: aws-actions/configure-aws-credentials@v1-node16
-        with:
-          aws-access-key-id: ${{secrets.STAGING_AWS_ACCESS_KEY_ID}}
-          aws-secret-access-key: ${{secrets.STAGING_AWS_ACCESS_SECRET}}
-          aws-region: eu-west-2
-
-      - name: Save collected resources to Staging S3
-        env:
-          COLLECTION_DATASET_BUCKET_NAME: ${{secrets.STAGING_DATA_S3_BUCKET}}
-        run: make save-resources
-
-      - name: Configure Production AWS Credentials
-        uses: aws-actions/configure-aws-credentials@v1-node16
-        with:
-          aws-access-key-id: ${{secrets.PROD_AWS_ACCESS_KEY_ID}}
-          aws-secret-access-key: ${{secrets.PROD_AWS_ACCESS_SECRET}}
-          aws-region: eu-west-2
-
-      - name: Save collected resources to Prod S3
-        env:
-          COLLECTION_DATASET_BUCKET_NAME: ${{secrets.PRODUCTION_DATA_S3_BUCKET}}
-        run: make save-resources
-
-      - name: Save logs to Prod S3
-        env:
-          COLLECTION_DATASET_BUCKET_NAME: ${{secrets.PRODUCTION_DATA_S3_BUCKET}}
-        run: make save-logs
-
-      - name: Build the collection database
-        run: make collection
-
-      - name: Configure Development AWS Credentials
-        uses: aws-actions/configure-aws-credentials@v1-node16
-        with:
-          aws-access-key-id: ${{secrets.DEVELOPMENT_AWS_ACCESS_KEY_ID}}
-          aws-secret-access-key: ${{secrets.DEVELOPMENT_AWS_ACCESS_SECRET}}
-          aws-region: eu-west-2
-
-      - name: Push collection database to Development S3
-        env:
-          COLLECTION_DATASET_BUCKET_NAME: ${{secrets.DEVELOPMENT_DATA_S3_BUCKET}}
-        run: make save-collection
-
-      - name: Configure Staging AWS Credentials
-        uses: aws-actions/configure-aws-credentials@v1-node16
-        with:
-          aws-access-key-id: ${{secrets.STAGING_AWS_ACCESS_KEY_ID}}
-          aws-secret-access-key: ${{secrets.STAGING_AWS_ACCESS_SECRET}}
-          aws-region: eu-west-2
-
-      - name: Push collection database to Staging S3
-        env:
-          COLLECTION_DATASET_BUCKET_NAME: ${{secrets.STAGING_DATA_S3_BUCKET}}
-        run: make save-collection
-
-      - name: Configure Production AWS Credentials
-        uses: aws-actions/configure-aws-credentials@v1-node16
-        with:
-          aws-access-key-id: ${{secrets.PROD_AWS_ACCESS_KEY_ID}}
-          aws-secret-access-key: ${{secrets.PROD_AWS_ACCESS_SECRET}}
-          aws-region: eu-west-2
-
-      - name: Push collection database to Prod S3
-        env:
-          COLLECTION_DATASET_BUCKET_NAME: ${{secrets.PRODUCTION_DATA_S3_BUCKET}}
-        run: make save-collection
-
-      - name: transform collected files
-        run: make transformed -j 2
-
-      - name: Configure Development AWS Credentials
-        uses: aws-actions/configure-aws-credentials@v1-node16
-        with:
-          aws-access-key-id: ${{secrets.DEVELOPMENT_AWS_ACCESS_KEY_ID}}
-          aws-secret-access-key: ${{secrets.DEVELOPMENT_AWS_ACCESS_SECRET}}
-          aws-region: eu-west-2
-
-      - name: Save transformed files to Development S3
-        env:
-          COLLECTION_DATASET_BUCKET_NAME: ${{secrets.DEVELOPMENT_DATA_S3_BUCKET}}
-        run: make save-transformed
-
-      - name: Configure Staging AWS Credentials
-        uses: aws-actions/configure-aws-credentials@v1-node16
-        with:
-          aws-access-key-id: ${{secrets.STAGING_AWS_ACCESS_KEY_ID}}
-          aws-secret-access-key: ${{secrets.STAGING_AWS_ACCESS_SECRET}}
-          aws-region: eu-west-2
-
-      - name: Save transformed files to Staging S3
-        env:
-          COLLECTION_DATASET_BUCKET_NAME: ${{secrets.STAGING_DATA_S3_BUCKET}}
-        run: make save-transformed
-
-      - name: Configure Production AWS Credentials
-        uses: aws-actions/configure-aws-credentials@v1-node16
-        with:
-          aws-access-key-id: ${{secrets.PROD_AWS_ACCESS_KEY_ID}}
-          aws-secret-access-key: ${{secrets.PROD_AWS_ACCESS_SECRET}}
-          aws-region: eu-west-2
-
-      - name: Save transformed files to Prod S3
-        env:
-          COLLECTION_DATASET_BUCKET_NAME: ${{secrets.PRODUCTION_DATA_S3_BUCKET}}
-        run: make save-transformed
-
-      - name: Build datasets from the transformed files
-        run: make dataset
-
-      - name: Configure Development AWS Credentials
-        if: always()
-        uses: aws-actions/configure-aws-credentials@v1-node16
-        with:
-          aws-access-key-id: ${{secrets.DEVELOPMENT_AWS_ACCESS_KEY_ID}}
-          aws-secret-access-key: ${{secrets.DEVELOPMENT_AWS_ACCESS_SECRET}}
-          aws-region: eu-west-2
-
-      - name: Save datasets to Development S3
-        env:
-          COLLECTION_DATASET_BUCKET_NAME: ${{secrets.DEVELOPMENT_DATA_S3_BUCKET}}
-          HOISTED_COLLECTION_DATASET_BUCKET_NAME: ${{secrets.DEVELOPMENT_DATA_S3_BUCKET}}
-        run: make save-dataset
-
-      - name: Save expectations to Development S3
-        if: always()
-        env:
-          COLLECTION_DATASET_BUCKET_NAME: ${{secrets.DEVELOPMENT_DATA_S3_BUCKET}}
-        run: make save-expectations
-
-      - name: Save performance to Development S3
-        if: always()
-        env:
-          COLLECTION_DATASET_BUCKET_NAME: ${{secrets.DEVELOPMENT_DATA_S3_BUCKET}}
-        run: make save-performance
-
-      - name: Configure Staging AWS Credentials
-        if: always()
-        uses: aws-actions/configure-aws-credentials@v1-node16
-        with:
-          aws-access-key-id: ${{secrets.STAGING_AWS_ACCESS_KEY_ID}}
-          aws-secret-access-key: ${{secrets.STAGING_AWS_ACCESS_SECRET}}
-          aws-region: eu-west-2
-
-      - name: Save datasets to Staging S3
-        env:
-          COLLECTION_DATASET_BUCKET_NAME: ${{secrets.STAGING_DATA_S3_BUCKET}}
-          HOISTED_COLLECTION_DATASET_BUCKET_NAME: ${{secrets.STAGING_DATA_S3_BUCKET}}
-        run: make save-dataset
-
-      - name: Save expectations to Staging S3
-        if: always()
-        env:
-          COLLECTION_DATASET_BUCKET_NAME: ${{secrets.STAGING_DATA_S3_BUCKET}}
-        run: make save-expectations
-
-      - name: Save performance to Staging S3
-        if: always()
-        env:
-          COLLECTION_DATASET_BUCKET_NAME: ${{secrets.STAGING_DATA_S3_BUCKET}}
-        run: make save-performance
-
-      - name: Configure Production AWS Credentials
-        if: always()
-        uses: aws-actions/configure-aws-credentials@v1-node16
-        with:
-          aws-access-key-id: ${{secrets.PROD_AWS_ACCESS_KEY_ID}}
-          aws-secret-access-key: ${{secrets.PROD_AWS_ACCESS_SECRET}}
-          aws-region: eu-west-2
-
-      - name: Save datasets to Prod S3
-        env:
-          COLLECTION_DATASET_BUCKET_NAME: ${{secrets.PRODUCTION_DATA_S3_BUCKET}}
-          HOISTED_COLLECTION_DATASET_BUCKET_NAME: ${{secrets.PRODUCTION_DATA_S3_BUCKET}}
-        run: make save-dataset
-
-      - name: Save expectations to Prod S3
-        if: always()
-        env:
-          COLLECTION_DATASET_BUCKET_NAME: ${{secrets.PRODUCTION_DATA_S3_BUCKET}}
-        run: make save-expectations
-
-      - name: Save performance to Prod S3
-        if: always()
-        env:
-          COLLECTION_DATASET_BUCKET_NAME: ${{secrets.PRODUCTION_DATA_S3_BUCKET}}
-        run: make save-performance
-
-  check-pipeline-errors:
-    runs-on: ubuntu-latest
-    needs:
-      - build
-    if: always() && contains(join(needs.*.result, ','), 'failure')
-    steps:
-      - name: send failure notification
-        uses: slackapi/slack-github-action@v1
-        env:
-          SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
-        with:
-          channel-id: 'planning-data-platform'
-          payload: |
-            {
-              "text": "Collection Run: ${{ github.repository }}",
-              "icon_emoji": ":warning:",
-              "username": "CollectionRunner",
-              "blocks": [
-                {
-                  "type": "header",
-                  "text": {
-                    "type": "plain_text",
-                    "text": "Collection Run Failed: ${{ github.repository }} "
-                  }
-                },
-                {
-                  "type": "divider"
-                },
-                {
-                  "type": "section",
-                  "text": {
-                    "type": "mrkdwn",
-                    "text": "The report for this run is available on "
-                  }
-                }
-              ]
-            }
-