diff --git a/.github/scripts/standardise_csvs.py b/.github/scripts/standardise_csvs.py index 09d9cd36..e5fd0e7a 100644 --- a/.github/scripts/standardise_csvs.py +++ b/.github/scripts/standardise_csvs.py @@ -46,6 +46,17 @@ def standardise_csv(file_path, expected_columns, sort_cols=None): reader = csv.DictReader(f) rows = list(reader) + # Check for unexpected columns + actual_cols = list(reader.fieldnames or []) + unexpected = [col for col in actual_cols if col not in expected_columns] + if unexpected: + return f"✗ {file_path}: unexpected column(s) found that would be removed: {', '.join(unexpected)}" + + # Check for extra values beyond the header (e.g. stray trailing commas) + for row in rows: + if None in row: + return f"✗ {file_path}: one or more rows have more values than columns in the header" + # Sort rows if sort columns are specified if sort_cols: rows.sort(key=lambda row: _sort_key(row, sort_cols)) @@ -80,6 +91,7 @@ def main(): """Standardise all CSVs in all datasets across pipeline and collection.""" # Get the root directory (two levels up from this script) base_dir = os.path.join(os.path.dirname(__file__), '../..') + errors = [] for folder_type in ["collection", "pipeline"]: folder_path = os.path.join(base_dir, folder_type) @@ -103,8 +115,12 @@ def main(): result = standardise_csv(file_path, expected_columns, sort_cols) if result: print(result) + errors.append(result) else: print(f"⊘ (unknown) (not found)") + if errors: + sys.exit(1) + if __name__ == "__main__": main() diff --git a/.github/workflows/standardise_csvs.yml b/.github/workflows/standardise_csvs.yml new file mode 100644 index 00000000..c52f52f9 --- /dev/null +++ b/.github/workflows/standardise_csvs.yml @@ -0,0 +1,87 @@ +name: Standardise CSVs + +on: + schedule: + - cron: "00 22 * * *" + workflow_dispatch: + +concurrency: + group: standardise-csvs + cancel-in-progress: true + +jobs: + standardise: + runs-on: ubuntu-latest + steps: + - name: Generate GitHub App token + id: app-token + uses: 
actions/create-github-app-token@v1 + with: + app-id: ${{ secrets.APP_ID }} + private-key: ${{ secrets.APP_PRIVATE_KEY }} + + - name: Checkout repository + uses: actions/checkout@v4 + with: + token: ${{ steps.app-token.outputs.token }} + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: "3.11" + + - name: Configure git + run: | + git config user.name "github-actions-bot" + git config user.email "noreply@github.com" + + - name: Run standardise script + run: python .github/scripts/standardise_csvs.py + + - name: Commit and push if changed + run: | + git add collection/ pipeline/ + if git diff --staged --quiet; then + echo "No changes to commit" + exit 0 + fi + git commit -m "Standardise row ordering and line endings in CSVs" + git push origin main + + check-standardise-failure: + runs-on: ubuntu-latest + needs: + - standardise + if: github.event_name == 'schedule' && contains(join(needs.*.result, ','), 'failure') + steps: + - name: Send failure notification + uses: slackapi/slack-github-action@v1 + env: + SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} + with: + channel-id: "planning-data-alerts" + payload: | + { + "text": "Standardise CSVs has failed", + "icon_emoji": ":warning:", + "username": "Standardise CSVs", + "blocks": [ + { + "type": "header", + "text": { + "type": "plain_text", + "text": "Standardise CSVs has failed" + } + }, + { + "type": "divider" + }, + { + "type": "section", + "text": { + "type": "mrkdwn", + "text": "The report is available on the GitHub Actions run page." + } + } + ] + } diff --git a/.github/workflows/upload_to_S3.yml b/.github/workflows/upload_to_S3.yml index 02b8831f..0dc492b8 100644 --- a/.github/workflows/upload_to_S3.yml +++ b/.github/workflows/upload_to_S3.yml @@ -1,8 +1,10 @@ name: Upload to S3 on: - schedule: - - cron: '30 22 * * *' + workflow_run: + workflows: ["Standardise CSVs"] + types: + - completed workflow_dispatch: inputs: environment: