# Reddit Scraper #579
# This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# Workflow: runs scripts/scheduler.py on a schedule, uploads the merged CSVs it
# leaves in data_tmp/ as a short-lived artifact, then emails a link to the run.
name: Reddit Scraper

on:
  schedule:
    # Fires at minute 5 of every second hour (00:05, 02:05, ...), in UTC.
    - cron: "5 */2 * * *"
  # Allow manual runs from the GitHub UI.
  workflow_dispatch:

# Least privilege: the job only needs to read the repository for checkout.
permissions:
  contents: read

jobs:
  run-scraper:
    runs-on: ubuntu-latest
    steps:
      - name: Check out repository
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          # Quoted so the version stays a string rather than a float.
          python-version: "3.11"

      - name: Install dependencies
        run: |
          pip install asyncpraw pandas

      - name: Run scraper
        run: python scripts/scheduler.py
        env:
          # Reddit API credentials, passed to the script via the environment.
          REDDIT_CLIENT_ID: ${{ secrets.REDDIT_CLIENT_ID }}
          REDDIT_CLIENT_SECRET: ${{ secrets.REDDIT_CLIENT_SECRET }}

      - name: Create timestamped copies of merged CSVs
        run: |
          ts=$(date -u +'%Y%m%d_%H%M%S')
          for f in data_tmp/*_merged.csv; do
            # An unmatched glob stays literal; skip it instead of failing cp.
            [ -e "$f" ] || continue
            base=$(basename "$f" .csv)
            cp "$f" "data_tmp/${base}_${ts}.csv"
          done

      - name: Set artifact name with timestamp
        # Exported through GITHUB_ENV so the upload step can read it as
        # env.ARTIFACT_NAME.
        run: echo "ARTIFACT_NAME=reddit-scraper-data-$(date -u +'%Y%m%d_%H%M%S')" >> "$GITHUB_ENV"

      - name: Upload CSV artifacts
        uses: actions/upload-artifact@v4
        with:
          name: ${{ env.ARTIFACT_NAME }}
          # Matches only the timestamped copies made above (*_merged_<ts>.csv).
          path: data_tmp/*_merged_*.csv
          # Warn instead of failing when the pattern matches no files.
          if-no-files-found: warn
          retention-days: 3

      - name: Send results by email
        uses: dawidd6/action-send-mail@v6
        with:
          server_address: smtp.gmail.com
          server_port: 465
          secure: true
          username: ${{ secrets.SMTP_USERNAME }}
          password: ${{ secrets.SMTP_PASSWORD }}
          subject: "Reddit scraper run – CSVs ready"
          # NOTE(review): recipient is hard-coded; consider moving it to a
          # secret or repository variable.
          to: ra.nike28@gmail.com
          from: ${{ secrets.SMTP_USERNAME }}
          body: |
            The Reddit scraper has finished running.
            You can download the CSVs from this run here:
            ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
            Repository: ${{ github.repository }}
            Run number: ${{ github.run_number }}