-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathingest.py
More file actions
107 lines (91 loc) · 3.17 KB
/
ingest.py
File metadata and controls
107 lines (91 loc) · 3.17 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import os
import uuid
from dotenv import load_dotenv
from pinecone import Pinecone
from sentence_transformers import SentenceTransformer
import requests
# Load .env so PINECONE_API_KEY / NEWS_API_KEY are available via os.getenv.
load_dotenv()
# NOTE(review): these clients are created at import time as a module-level
# side effect; importing this module requires a valid PINECONE_API_KEY.
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
index = pc.Index("techpulse")
# Sentence embedder used for all chunks (384-dim MiniLM model).
embedder = SentenceTransformer("all-MiniLM-L6-v2")
NEWS_API_KEY = os.getenv("NEWS_API_KEY")
# Maps NewsAPI source ids -> short display names used as the "outlet"
# metadata field on each vector.
TARGET_OUTLETS = {
"bbc-news": "BBC",
"reuters": "Reuters",
"cnn": "CNN",
"fox-news": "Fox News",
"associated-press": "AP",
"al-jazeera-english": "Al Jazeera",
}
#fetch
def fetch_news(page_size=100) -> list[dict]:
    """Fetch recent articles about "Iran war" from the target outlets.

    Queries the NewsAPI /v2/everything endpoint, restricted to the source
    ids in TARGET_OUTLETS, newest first.

    Args:
        page_size: Max number of articles to request (NewsAPI caps at 100).

    Returns:
        The list of article dicts from the API response (may be empty).

    Raises:
        requests.HTTPError: If the API returns a non-2xx status.
        requests.Timeout: If the request exceeds the timeout.
    """
    source_ids = ",".join(TARGET_OUTLETS.keys())
    url = "https://newsapi.org/v2/everything"
    params = {
        "q": "Iran war",
        "sources": source_ids,
        "pageSize": page_size,
        "sortBy": "publishedAt",
        "apiKey": NEWS_API_KEY,
    }
    # FIX: the original call had no timeout, so a stalled connection would
    # hang the whole ingest run indefinitely. 30s is generous for this API.
    resp = requests.get(url, params=params, timeout=30)
    resp.raise_for_status()
    articles = resp.json().get("articles", [])
    print(f"Fetched {len(articles)} articles from NewsAPI")
    return articles
#chunking
def chunk_articles(articles: list[dict], chunk_size=200) -> list[dict]:
    """Turn raw NewsAPI articles into metadata-rich chunk dicts.

    One chunk per article: the embeddable text is the title plus (when
    present) the description. Articles without a title are skipped.

    Args:
        articles: Article dicts as returned by the NewsAPI.
        chunk_size: Unused; kept for backward compatibility with callers.

    Returns:
        List of chunk dicts with keys: chunk_id, text, title, description,
        url, outlet, source_id, published_at.
    """
    chunks = []
    for a in articles:
        title = a.get("title") or ""
        description = a.get("description") or ""
        # FIX: NewsAPI may send explicit nulls ("source": {"id": null}).
        # .get(key, default) returns None in that case because the key
        # exists — use `or` so both missing and null fall back cleanly.
        source = a.get("source") or {}
        source_id = source.get("id") or "unknown"
        source_name = source.get("name") or "Unknown"
        if not title:
            continue
        # Combine title + description for a richer embedding signal.
        # FIX: the original applied .strip(". "), which strips the
        # CHARACTERS '.' and ' ' from both ends and could mangle text
        # with legitimate leading/trailing dots; the conditional already
        # covers the empty-description case, so no stripping is needed.
        text = f"{title}. {description}" if description else title
        chunks.append({
            "chunk_id": str(uuid.uuid4()),
            "text": text,
            "title": title,
            "description": description,
            "url": a.get("url", ""),
            "outlet": source_name,
            "source_id": source_id,
            "published_at": a.get("publishedAt", ""),
        })
    print(f"Created {len(chunks)} chunks")
    return chunks
#embed and upsert
def embed_and_upsert(chunks: list[dict], batch_size=50):
    """Embed chunk texts and upsert them into the Pinecone index.

    Clears ALL existing vectors in the index first (full re-ingest),
    then embeds every chunk text and upserts in batches.

    Args:
        chunks: Chunk dicts from chunk_articles (need "chunk_id" and "text").
        batch_size: Number of vectors per Pinecone upsert call.
    """
    index.delete(delete_all=True)
    print("Cleared existing Pinecone vectors")
    texts = [c["text"] for c in chunks]
    # FIX: the original encoded `texts` twice — once with a progress bar
    # (result discarded) and once without. Encode a single time, keeping
    # the progress bar, halving the most expensive step of the pipeline.
    embeddings = embedder.encode(texts, show_progress_bar=True)
    for i in range(0, len(chunks), batch_size):
        batch_chunks = chunks[i:i+batch_size]
        batch_embeddings = embeddings[i:i+batch_size]
        vectors = []
        for chunk, emb in zip(batch_chunks, batch_embeddings):
            vectors.append({
                "id": chunk["chunk_id"],
                "values": emb.tolist(),
                "metadata": {
                    "text": chunk["text"],
                    "title": chunk["title"],
                    "url": chunk["url"],
                    "outlet": chunk["outlet"],
                    "source_id": chunk["source_id"],
                    "published_at": chunk["published_at"],
                }
            })
        index.upsert(vectors=vectors)
        print(f"Upserted batch {i//batch_size + 1}")
    print(f"Done — {len(chunks)} chunks in Pinecone")
#main
if __name__ == "__main__":
    # End-to-end ingest: fetch -> chunk -> embed & upsert.
    fetched = fetch_news(page_size=100)
    embed_and_upsert(chunk_articles(fetched))