Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions .github/scripts/standardise_csvs.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,17 @@ def standardise_csv(file_path, expected_columns, sort_cols=None):
reader = csv.DictReader(f)
rows = list(reader)

# Check for unexpected columns
actual_cols = list(reader.fieldnames or [])
unexpected = [col for col in actual_cols if col not in expected_cols]
if unexpected:
return f"✗ {file_path}: unexpected column(s) found that would be removed: {', '.join(unexpected)}"

# Check for extra values beyond the header (e.g. stray trailing commas)
for row in rows:
if None in row:
return f"✗ {file_path}: one or more rows have more values than columns in the header"

# Sort rows if sort columns are specified
if sort_cols:
rows.sort(key=lambda row: _sort_key(row, sort_cols))
Expand Down Expand Up @@ -80,6 +91,7 @@ def main():
"""Standardise all CSVs in all datasets across pipeline and collection."""
# Get the root directory (two levels up from this script)
base_dir = os.path.join(os.path.dirname(__file__), '../..')
errors = []

for folder_type in ["collection", "pipeline"]:
folder_path = os.path.join(base_dir, folder_type)
Expand All @@ -103,8 +115,12 @@ def main():
result = standardise_csv(file_path, expected_columns, sort_cols)
if result:
print(result)
errors.append(result)
else:
print(f"⊘ {filename} (not found)")

if errors:
sys.exit(1)

if __name__ == "__main__":
main()
87 changes: 87 additions & 0 deletions .github/workflows/standardise_csvs.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
# Runs the CSV standardisation script over collection/ and pipeline/, commits
# any resulting changes to main, and posts a Slack alert when a scheduled run
# fails.
name: Standardise CSVs

on:
  schedule:
    # GitHub evaluates cron schedules in UTC: every day at 22:00.
    - cron: "00 22 * * *"
  workflow_dispatch:

# Only one run at a time; a newer run cancels one that is still in progress.
concurrency:
  group: standardise-csvs
  cancel-in-progress: true

jobs:
  standardise:
    runs-on: ubuntu-latest
    steps:
      - name: Generate GitHub App token
        id: app-token
        uses: actions/create-github-app-token@v1
        with:
          app-id: ${{ secrets.APP_ID }}
          private-key: ${{ secrets.APP_PRIVATE_KEY }}

      # Check out with the app token; actions/checkout persists it in the
      # local git config, so the push at the end of this job uses it too.
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          token: ${{ steps.app-token.outputs.token }}

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: "3.11"

      - name: Configure git
        run: |
          git config user.name "github-actions-bot"
          git config user.email "noreply@github.com"

      # A non-zero exit from the script fails this step, which skips the
      # commit step below and marks the job as failed.
      - name: Run standardise script
        run: python .github/scripts/standardise_csvs.py

      - name: Commit and push if changed
        run: |
          git add collection/ pipeline/
          if git diff --staged --quiet; then
            echo "No changes to commit"
            exit 0
          fi
          git commit -m "Standardise row ordering and line endings in CSVs"
          git push origin main

  check-standardise-failure:
    runs-on: ubuntu-latest
    needs:
      - standardise
    # always() is required here: without a status check function GitHub
    # applies an implicit success(), which skips this job whenever
    # `standardise` fails -- exactly the case the alert exists for.
    # Manual (workflow_dispatch) runs deliberately do not alert.
    if: always() && github.event_name == 'schedule' && contains(join(needs.*.result, ','), 'failure')
    steps:
      - name: Send failure notification
        uses: slackapi/slack-github-action@v1
        env:
          SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
        with:
          channel-id: "planning-data-alerts"
          payload: |
            {
              "text": "Standardise CSVs has failed",
              "icon_emoji": ":warning:",
              "username": "Standardise CSVs",
              "blocks": [
                {
                  "type": "header",
                  "text": {
                    "type": "plain_text",
                    "text": "Standardise CSVs has failed"
                  }
                },
                {
                  "type": "divider"
                },
                {
                  "type": "section",
                  "text": {
                    "type": "mrkdwn",
                    "text": "The report is available on <https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub>"
                  }
                }
              ]
            }
6 changes: 4 additions & 2 deletions .github/workflows/upload_to_S3.yml
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
name: Upload to S3

on:
schedule:
- cron: '30 22 * * *'
workflow_run:
workflows: ["Standardise CSVs"]
types:
- completed
workflow_dispatch:
inputs:
environment:
Expand Down
Loading