-
Notifications
You must be signed in to change notification settings - Fork 0
69 lines (57 loc) · 2.08 KB
/
reddit_scraper.yml
File metadata and controls
69 lines (57 loc) · 2.08 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
# Scheduled Reddit scraper: runs scripts/scheduler.py, timestamps the merged
# CSVs it produces, uploads them as a run artifact, and emails a link to the run.
name: Reddit Scraper

on:
  schedule:
    # NOTE: fires at minute 5 of every 2nd hour (UTC) — NOT every 5 minutes.
    - cron: "5 */2 * * *"
  workflow_dispatch: # allow manual run from the GitHub UI

jobs:
  run-scraper:
    runs-on: ubuntu-latest
    steps:
      - name: Check out repository
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.11" # quoted: bare 3.11 would need exact-string match anyway

      - name: Install dependencies
        run: |
          pip install asyncpraw pandas

      # Credentials come from repo secrets; the scraper reads them from the env.
      - name: Run scraper
        run: python scripts/scheduler.py
        env:
          REDDIT_CLIENT_ID: ${{ secrets.REDDIT_CLIENT_ID }}
          REDDIT_CLIENT_SECRET: ${{ secrets.REDDIT_CLIENT_SECRET }}

      # Copy each data_tmp/*_merged.csv to a *_merged_<timestamp>.csv sibling so
      # the artifact glob below ( *_merged_*.csv ) picks up only timestamped copies.
      - name: Create timestamped copies of merged CSVs
        run: |
          ts=$(date -u +'%Y%m%d_%H%M%S')
          for f in data_tmp/*_merged.csv; do
            [ -e "$f" ] || continue
            base=$(basename "$f" .csv)
            cp "$f" "data_tmp/${base}_${ts}.csv"
          done

      # NOTE(review): this re-runs `date`, so the artifact-name timestamp can
      # differ by a second or two from the file timestamps above — harmless for
      # uniqueness, but confirm nothing parses the two as equal.
      - name: Set artifact name with timestamp
        run: echo "ARTIFACT_NAME=reddit-scraper-data-$(date -u +'%Y%m%d_%H%M%S')" >> "$GITHUB_ENV"

      - name: Upload CSV artifacts
        uses: actions/upload-artifact@v4
        with:
          name: ${{ env.ARTIFACT_NAME }}
          path: data_tmp/*_merged_*.csv
          if-no-files-found: warn # scraper may legitimately produce no new rows
          retention-days: 3

      - name: Send results by email
        uses: dawidd6/action-send-mail@v6
        with:
          server_address: smtp.gmail.com
          server_port: 465
          secure: true
          username: ${{ secrets.SMTP_USERNAME }}
          password: ${{ secrets.SMTP_PASSWORD }}
          subject: "Reddit scraper run – CSVs ready"
          to: ra.nike28@gmail.com
          from: ${{ secrets.SMTP_USERNAME }}
          body: |
            The Reddit scraper has finished running.
            You can download the CSVs from this run here:
            ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
            Repository: ${{ github.repository }}
            Run number: ${{ github.run_number }}