-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathgap_detector.py
More file actions
196 lines (153 loc) · 6.96 KB
/
gap_detector.py
File metadata and controls
196 lines (153 loc) · 6.96 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
# Copyright (c) 2026 Nardo. AGPL-3.0 — see LICENSE
"""
Gap Detector — find stories our RSS feeds missed.
Cross-references crawled headlines against Google News trending RSS
to identify coverage gaps. Returns missing stories for pipeline inclusion.
Usage:
from gap_detector import detect_gaps
gaps = await detect_gaps(
crawled_headlines=["Fed raises rates...", "Oil prices surge..."],
category="world",
)
# gaps = [{"title": ..., "url": ..., "source": ..., "summary": ...}, ...]
"""
import asyncio
import logging
import re
from html import unescape
from xml.etree.ElementTree import ParseError
import aiohttp
import feedparser
logger = logging.getLogger(__name__)
# ── Google News RSS by category ───────────────────────────────────────────────
GNEWS_URLS: dict[str, str] = {
"world": "https://news.google.com/rss?topic=WORLD&hl=en",
"business": "https://news.google.com/rss?topic=BUSINESS&hl=en",
"technology": "https://news.google.com/rss?topic=TECHNOLOGY&hl=en",
"crypto": "https://news.google.com/rss/search?q=cryptocurrency&hl=en",
}
HEADERS = {
"User-Agent": (
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
"AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/122.0.0.0 Safari/537.36"
),
}
MAX_GAP_STORIES = 5
SIMILARITY_THRESHOLD = 0.2 # Jaccard below this = no match
# ── Jaccard similarity ────────────────────────────────────────────────────────
def _tokenize(text: str) -> set[str]:
"""Lowercase, strip punctuation, split into word tokens."""
text = re.sub(r"[^\w\s]", "", text.lower())
return set(text.split())
def _jaccard(a: str, b: str) -> float:
"""Jaccard similarity between two strings."""
tokens_a = _tokenize(a)
tokens_b = _tokenize(b)
if not tokens_a or not tokens_b:
return 0.0
intersection = tokens_a & tokens_b
union = tokens_a | tokens_b
return len(intersection) / len(union)
def _has_match(candidate: str, headlines: list[str], threshold: float = SIMILARITY_THRESHOLD) -> bool:
"""Check if candidate headline matches any existing headline."""
for h in headlines:
if _jaccard(candidate, h) >= threshold:
return True
return False
# ── Fetch Google News RSS ─────────────────────────────────────────────────────
async def _fetch_gnews(category: str, timeout: int = 15) -> list[dict]:
"""Fetch and parse Google News RSS for a category.
Returns list of {"title": ..., "url": ..., "source": ...}.
"""
url = GNEWS_URLS.get(category)
if not url:
logger.warning("Unknown category %r, falling back to 'world'", category)
url = GNEWS_URLS["world"]
try:
async with aiohttp.ClientSession(headers=HEADERS) as session:
async with session.get(url, timeout=aiohttp.ClientTimeout(total=timeout)) as resp:
if resp.status != 200:
logger.error("Google News RSS returned %d for %s", resp.status, category)
return []
raw = await resp.text()
except (aiohttp.ClientError, asyncio.TimeoutError) as e:
logger.error("Failed to fetch Google News RSS for %s: %s", category, e)
return []
feed = feedparser.parse(raw)
results = []
for entry in feed.entries[:30]: # Top 30 from Google News
title = unescape(entry.get("title", "")).strip()
link = entry.get("link", "")
# Google News titles often end with " - Source Name"
source = ""
if " - " in title:
parts = title.rsplit(" - ", 1)
title = parts[0].strip()
source = parts[1].strip()
results.append({"title": title, "url": link, "source": source})
logger.info("Fetched %d headlines from Google News (%s)", len(results), category)
return results
# ── Core gap detection ────────────────────────────────────────────────────────
async def detect_gaps(
crawled_headlines: list[str],
category: str = "world",
) -> list[dict]:
"""Find stories our feeds missed.
Args:
crawled_headlines: list of headline strings from our RSS feeds.
category: 'world', 'business', 'technology', 'crypto'.
Returns:
list of {"title", "url", "source", "summary"} for missing stories.
Maximum MAX_GAP_STORIES items.
"""
if not crawled_headlines:
logger.warning("No crawled headlines provided — skipping gap detection")
return []
# 1. Fetch Google News trending for same category
gnews = await _fetch_gnews(category)
if not gnews:
logger.warning("No Google News results for %s — skipping gap detection", category)
return []
# 2. Compare: find stories with NO match in our headlines
gaps = []
for item in gnews:
if not _has_match(item["title"], crawled_headlines):
gaps.append(item)
logger.info(
"Gap detection [%s]: %d Google News headlines, %d crawled, %d gaps found",
category, len(gnews), len(crawled_headlines), len(gaps),
)
# 3. Cap at MAX_GAP_STORIES
gaps = gaps[:MAX_GAP_STORIES]
# 4. Add empty summary (caller can enrich later via LLM)
for g in gaps:
g.setdefault("summary", "")
return gaps
# ── Standalone test ───────────────────────────────────────────────────────────
async def _test():
"""Quick test: detect gaps against a dummy set of headlines."""
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
sample_headlines = [
"Trump announces new tariffs on Chinese goods",
"Fed holds interest rates steady amid inflation concerns",
"Ukraine counteroffensive gains momentum in east",
"OpenAI releases GPT-5 with advanced reasoning",
"Bitcoin hits new all-time high above $100,000",
"EU passes landmark AI regulation bill",
"Apple unveils new MacBook Pro with M5 chip",
"Oil prices surge after OPEC production cuts",
"Tesla recalls 500,000 vehicles over safety issue",
"Japan earthquake triggers tsunami warning",
]
for cat in ("world", "business", "technology", "crypto"):
print(f"\n{'='*60}")
print(f"Category: {cat}")
print(f"{'='*60}")
gaps = await detect_gaps(sample_headlines, category=cat)
for i, g in enumerate(gaps, 1):
print(f" {i}. [{g['source']}] {g['title']}")
print(f" {g['url']}")
print("\nDone.")
if __name__ == "__main__":
asyncio.run(_test())