17 changes: 17 additions & 0 deletions tools/reddit_dump_reader/Cargo.toml
@@ -0,0 +1,17 @@
[package]
name = "reddit_dump_reader"
version = "0.1.0"
edition = "2024"

[dependencies]
anyhow = "1"
async-compression = { version = "0.4", features = ["tokio", "gzip", "zstd", "xz"] }
clap = { version = "4.5", features = ["derive", "env"] }
chrono = { version = "0.4", features = ["serde", "clock"] }
log = "0.4"
reqwest = { version = "0.12", features = ["json", "gzip", "brotli", "stream", "rustls-tls"] }
serde = { version = "1", features = ["derive"] }
serde_json = "1"
tokio = { version = "1", features = ["macros", "rt-multi-thread", "fs", "io-util", "time", "process"] }
tokio-util = { version = "0.7", features = ["io"] }
env_logger = "0.11"
41 changes: 41 additions & 0 deletions tools/reddit_dump_reader/README.md
@@ -0,0 +1,41 @@
# reddit_dump_reader

A lightweight tool to stream Reddit monthly dumps (comments + submissions) and push them to CleanApp's `/api/v3/reports/bulk_ingest` endpoint.

The tool only applies the optional allowlist/keyword filters described below; it does **not** attempt to pre-screen for CleanApp-style reports. Classification and downstream analysis remain the responsibility of the backend after ingestion.

## Usage

```bash
# Dry run to inspect converted items (prefer signed HTTPS URLs for GCS)
CLEANAPP_BACKEND_URL=https://backend.example.com \
CLEANAPP_FETCHER_TOKEN=secret \
./target/release/reddit_dump_reader \
--inputs https://files.pushshift.io/reddit/comments/RC_2024-05.zst \
--inputs https://files.pushshift.io/reddit/submissions/RS_2024-05.zst \
--mode both \
--max-items 5 \
--dry-run

# Live ingest with batching and filtering
./target/release/reddit_dump_reader \
--inputs RC_2024-06.zst RS_2024-06.zst \
--backend-url https://backend.example.com \
--fetcher-token secret \
--batch-size 1000 \
--concurrency 8 \
--subreddit-allowlist allow.txt \
--keyword-file keywords.txt
```
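The batching step described above (`--batch-size`, default 1000) can be sketched as follows. This is an illustrative std-only fragment, not the tool's actual code: `into_batches` is a hypothetical helper, and in the real pipeline each resulting chunk would become one POST to `/api/v3/reports/bulk_ingest`.

```rust
// Hypothetical sketch of --batch-size grouping. Items here are the
// already-converted records; each inner Vec maps to one bulk_ingest call.
fn into_batches(items: Vec<String>, batch_size: usize) -> Vec<Vec<String>> {
    items
        .chunks(batch_size.max(1)) // guard against a zero batch size
        .map(|chunk| chunk.to_vec())
        .collect()
}

fn main() {
    let items: Vec<String> = (0..5).map(|i| format!("item-{i}")).collect();
    let batches = into_batches(items, 2);
    // 5 items with batch size 2 -> 3 batches (2, 2, 1)
    println!("{} batches", batches.len());
}
```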

Flags:
- `--inputs <path-or-url>` (repeatable) accepts gzip-, zstd-, or xz-compressed NDJSON as well as plain NDJSON. Public or signed HTTPS URLs are preferred; `gs://` paths are supported by shelling out to `gsutil cat` when it is available in your environment.
- `--mode comments|submissions|both` to pick record types.
- `--max-items` to cap ingestion (helpful for smoke tests).
- `--batch-size` (default 1000) and `--concurrency` (default 8).
- `--subreddit-allowlist` and `--keyword-file` provide simple gating.
- `--dry-run` prints converted items instead of posting.
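The "simple gating" behind `--subreddit-allowlist` and `--keyword-file` can be sketched like this. It is a std-only illustration under stated assumptions, not the tool's actual implementation: `passes_filters` is a hypothetical function, and the case-insensitive matching is an assumption about how such gating is typically done.

```rust
// Illustrative gating sketch: an item passes if its subreddit is in the
// allowlist (when one is provided) AND its text contains at least one
// keyword (when a keyword file is provided). Absent filters pass everything.
fn passes_filters(
    subreddit: &str,
    body: &str,
    allowlist: Option<&[String]>,
    keywords: Option<&[String]>,
) -> bool {
    let sub_ok = allowlist
        .map(|list| list.iter().any(|s| s.eq_ignore_ascii_case(subreddit)))
        .unwrap_or(true);
    let body_lower = body.to_lowercase();
    let kw_ok = keywords
        .map(|kws| kws.iter().any(|w| body_lower.contains(&w.to_lowercase())))
        .unwrap_or(true);
    sub_ok && kw_ok
}

fn main() {
    let allow = vec!["CleanApp".to_string()];
    let kws = vec!["litter".to_string()];
    let ok = passes_filters("cleanapp", "Found litter downtown", Some(allow.as_slice()), Some(kws.as_slice()));
    // true: subreddit matched case-insensitively and a keyword is present
    println!("passes: {ok}");
}
```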

Environment variables:
- `CLEANAPP_BACKEND_URL` (default for `--backend-url`)
- `CLEANAPP_FETCHER_TOKEN` (default for `--fetcher-token`)
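Since clap is built with the `env` feature, a flag given on the command line takes precedence over its environment variable. That precedence can be sketched as below; `resolve_backend_url` is a hypothetical helper standing in for what clap's derive macro generates.

```rust
use std::env;

// Sketch of flag-over-env precedence: prefer the parsed --backend-url
// value, fall back to the CLEANAPP_BACKEND_URL environment variable.
fn resolve_backend_url(flag_value: Option<String>) -> Option<String> {
    flag_value.or_else(|| env::var("CLEANAPP_BACKEND_URL").ok())
}

fn main() {
    // The explicit flag wins regardless of the environment.
    let url = resolve_backend_url(Some("https://backend.example.com".to_string()));
    println!("{url:?}");
}
```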