Skip to content

Commit bf93213

Browse files
authored
Merge pull request #2 from lucebert/feat/delete-previous-docs
Feat/delete previous docs
2 parents 4a7840d + 4534299 commit bf93213

File tree

5 files changed

+178
-84
lines changed

5 files changed

+178
-84
lines changed

src/index_graph/configuration.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,4 +19,5 @@ class IndexConfiguration(BaseConfiguration):
1919
This class defines the parameters needed for configuring the indexing and
2020
retrieval processes, including embedding model selection, retriever provider choice, and search parameters.
2121
"""
22-
api_key: str = field(default="", metadata={"description": "The API key for indexing documents."})
22+
api_key: str = field(default="", metadata={"description": "The API key for indexing documents."})
23+
pinecone_index: str = field(default="langchain-doc", metadata={"description": "The Pinecone index to use for indexing documents."})

src/index_graph/graph.py

Lines changed: 111 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -1,112 +1,159 @@
1-
"""This "graph" simply exposes an endpoint for a user to upload docs to be indexed."""
2-
31
import asyncio
42
import os
3+
import logging
54
from typing import List, Optional
5+
from datetime import datetime
6+
from pathlib import Path
7+
import gc
8+
from pinecone import Index
69

710
import requests
811
from langchain_community.document_loaders import WebBaseLoader
912
from langchain_core.documents import Document
1013
from langchain_core.runnables import RunnableConfig
1114
from langchain_text_splitters import RecursiveCharacterTextSplitter
1215
from langgraph.graph import END, START, StateGraph
13-
1416
from index_graph.configuration import IndexConfiguration
1517
from index_graph.state import IndexState, InputState
1618
from shared import retrieval
19+
from shared.utils import load_pinecone_index
1720

21+
# Configure logging for errors and status
22+
LOG_PATH = Path("indexing_errors.log")
23+
logging.basicConfig(
24+
filename=LOG_PATH,
25+
filemode="a",
26+
format="%(asctime)s [%(levelname)s] %(message)s",
27+
level=logging.INFO,
28+
)
1829

1930
def check_index_config(state: IndexState, *, config: Optional[RunnableConfig] = None) -> dict[str, str]:
    """Validate API key and supported retriever provider.

    Raises:
        ValueError: If the API key is missing or wrong, or if the configured
            retriever provider is anything other than Pinecone.
    """
    cfg = IndexConfiguration.from_runnable_config(config)

    # Guard clauses: fail fast on each misconfiguration in turn.
    if not cfg.api_key:
        raise ValueError("API key is required for document indexing.")

    if cfg.api_key != os.getenv("INDEX_API_KEY"):
        raise ValueError("Authentication failed: Invalid API key provided.")

    if cfg.retriever_provider != "pinecone":
        raise ValueError("Only Pinecone is currently supported for document indexing due to specific ID prefix requirements.")

    return {}
3344

3445
async def get_sitemap_urls(state: IndexState, *, config: Optional[RunnableConfig] = None) -> dict[str, str]:
    """Fetch all URLs from a sitemap (XML format).

    Args:
        state: Current index state; ``state.url_site_map`` holds the sitemap URL.
        config: Optional runnable configuration (unused here).

    Returns:
        A dict with ``urls_to_index``: every ``<loc>`` URL found in the sitemap.

    Raises:
        requests.HTTPError: If the sitemap request returns an error status.
        xml.etree.ElementTree.ParseError: If the response is not valid XML.
    """
    import xml.etree.ElementTree as ET

    url = state.url_site_map
    headers = {
        "Accept": "application/xml",
        "User-Agent": "Mozilla/5.0 (compatible; LangChainBot/1.0)",
    }
    # A timeout prevents the whole graph from hanging on an unresponsive host.
    response = requests.get(url, headers=headers, timeout=30)
    # Surface HTTP errors instead of trying to parse an error page as XML.
    response.raise_for_status()

    ns = "{http://www.sitemaps.org/schemas/sitemap/0.9}"
    root = ET.fromstring(response.text)
    # Keep only entries that actually carry a <loc>; a malformed <url> entry
    # previously caused an AttributeError on `.text`.
    urls_to_index = [
        loc.text
        for entry in root.findall(f"{ns}url")
        if (loc := entry.find(f"{ns}loc")) is not None and loc.text
    ]

    print(f"Found {len(urls_to_index)} URLs to index.")
    return {"urls_to_index": urls_to_index}
6465

65-
async def index_docs(
66-
state: IndexState, *, config: Optional[RunnableConfig] = None
67-
) -> dict[str, str]:
68-
"""Asynchronously index documents in the given state using the configured retriever.
69-
70-
This function takes the documents from the state, ensures they have a user ID,
71-
adds them to the retriever's index, and then signals for the documents to be
72-
deleted from the state.
73-
74-
If docs are not provided in the state, they will be loaded
75-
from the configuration.docs_file JSON file.
76-
77-
Args:
78-
state (IndexState): The current state containing documents and retriever.
79-
config (Optional[RunnableConfig]): Configuration for the indexing process.r
80-
"""
81-
# Process all URLs in parallel
82-
chunk_tasks = [index_url(url, config) for url in state.urls_to_index]
83-
await asyncio.gather(*chunk_tasks)
84-
85-
return {}
86-
87-
88-
async def index_url(url: str, config: IndexConfiguration) -> List[Document]:
89-
"""Index a web path."""
90-
loader = WebBaseLoader(
91-
web_paths=(url,),
92-
)
93-
docs = loader.load()
94-
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
95-
docs = text_splitter.split_documents(docs)
96-
97-
with retrieval.make_retriever(config) as retriever:
98-
99-
await retriever.vectorstore.aadd_texts(
100-
#namespace= "langgraph" if "langgraph" in url else "langchain",
101-
texts=[doc.page_content for doc in docs],
102-
metadatas=[doc.metadata for doc in docs],
103-
id_prefix=url,
104-
)
105-
106-
return docs
66+
async def index_docs(state: IndexState, *, config: Optional[RunnableConfig] = None) -> dict[str, str]:
    """Index documents from all URLs in batches of 100, without concurrency limitation.

    Args:
        state: Current index state; ``state.urls_to_index`` lists URLs to process.
        config: Runnable configuration forwarded to ``index_url``.

    Returns:
        Stringified success/failure counters for the run.
    """
    # Load the Pinecone index once and share it across every URL.
    index_name = os.environ["PINECONE_INDEX_NAME"]
    index = load_pinecone_index(index_name)

    success_count = 0
    fail_count = 0

    async def safe_index_url(url: str) -> None:
        """Index one URL, recording success/failure without propagating errors."""
        nonlocal success_count, fail_count
        try:
            # index_url() swallows its own errors and returns [] after its
            # retries are exhausted, so an empty result must be counted as a
            # failure — otherwise every failed URL is reported as a success.
            # NOTE(review): a page that legitimately splits into zero chunks is
            # also counted as failed here — confirm that is acceptable.
            docs = await index_url(url, config=config, index=index)
            if docs:
                success_count += 1
            else:
                fail_count += 1
                with open("failed_urls.txt", "a") as f:
                    f.write(f"{url}\n")
        except Exception as e:
            logging.error(f"Failed indexing {url}: {e}")
            with open("failed_urls.txt", "a") as f:
                f.write(f"{url}\n")
            fail_count += 1
        finally:
            # Free loader/splitter memory between large pages.
            gc.collect()

    # Process URLs in batches of 100.
    batch_size = 100
    total = len(state.urls_to_index)
    for i in range(0, total, batch_size):
        current_batch = state.urls_to_index[i:i + batch_size]
        print(f"🔄 Processing batch {i // batch_size + 1} / {(total + batch_size - 1) // batch_size}")
        tasks = [safe_index_url(url) for url in current_batch]
        await asyncio.gather(*tasks, return_exceptions=True)

    print(f"Indexed: {success_count} | Failed: {fail_count}")
    return {
        "success_count": str(success_count),
        "fail_count": str(fail_count),
    }
107102

108103

109-
# Define the graph
104+
async def index_url(url: str, config: IndexConfiguration, index: Index, retry: int = 1) -> List[Document]:
    """Delete old chunks and re-index content from a given URL.

    Args:
        url: Page to (re-)index; also used as the chunk-ID prefix in Pinecone.
        config: Configuration forwarded to the retriever factory.
        index: Pinecone index handle used to list/delete stale chunk IDs.
        retry: Remaining retry attempts after a failure.

    Returns:
        The indexed document chunks, or an empty list on final failure.
    """
    try:
        logging.info(f"Indexing: {url}")
        loader = WebBaseLoader(web_paths=(url,))
        docs = loader.load()
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        docs = text_splitter.split_documents(docs)

        # Timezone-aware timestamp: datetime.utcnow() is deprecated (3.12+).
        # The stored value now carries an explicit "+00:00" UTC offset.
        from datetime import timezone
        now_str = datetime.now(timezone.utc).isoformat()
        for doc in docs:
            doc.metadata["source_url"] = url
            doc.metadata["last_indexed_at"] = now_str

        texts = [doc.page_content for doc in docs]
        metadatas = [doc.metadata for doc in docs]
        chunk_ids = [f"{url}--chunk{i}" for i in range(len(texts))]

        print(f"Checking for existing chunks at prefix: {url}")
        # Pinecone's Index.list() is a generator that yields *pages* (lists) of
        # IDs; the previous code passed those nested lists straight to
        # delete(ids=...). Flatten defensively so either shape works.
        # TODO(review): confirm against the installed pinecone SDK version.
        existing_ids: List[str] = []
        for page in index.list(prefix=f"{url}"):
            if isinstance(page, list):
                existing_ids.extend(page)
            else:
                existing_ids.append(page)

        if existing_ids:
            index.delete(ids=existing_ids)
            # Print after the delete actually happened, not before.
            print(f"Deleted old chunks ({len(existing_ids)}) for {url}")
        else:
            print(f"No existing chunks found for {url}")

        async with retrieval.make_retriever(config) as (_, vectorstore):
            if hasattr(vectorstore, "aadd_texts"):
                await vectorstore.aadd_texts(
                    texts=texts,
                    metadatas=metadatas,
                    ids=chunk_ids
                )
            else:
                # Fallback for stores without aadd_texts: stash the ID in metadata.
                for i, doc in enumerate(docs):
                    doc.metadata["id"] = chunk_ids[i]
                await vectorstore.aadd_documents(docs)

        logging.info(f"Successfully indexed {url}")
        return docs

    except Exception as e:
        if retry > 0:
            logging.warning(f"⚠️ Retry {url} after error: {e}")
            await asyncio.sleep(1)
            return await index_url(url, config, index, retry=retry - 1)
        else:
            logging.error(f"Final failure for {url}: {e}")
            return []
155+
156+
# Define the graph structure
110157
builder = StateGraph(IndexState, input=InputState, config_schema=IndexConfiguration)
111158
builder.add_node(check_index_config)
112159
builder.add_node(index_docs)
@@ -115,6 +162,7 @@ async def index_url(url: str, config: IndexConfiguration) -> List[Document]:
115162
builder.add_edge("check_index_config", "get_sitemap_urls")
116163
builder.add_edge("get_sitemap_urls", "index_docs")
117164
builder.add_edge("index_docs", END)
118-
# Compile into a graph object that you can invoke and deploy.
165+
166+
# Compile the state graph for execution
119167
graph = builder.compile()
120168
graph.name = "IndexGraph"

src/retrieval_graph/graph.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,11 +14,10 @@
1414

1515
from retrieval_graph.configuration import AgentConfiguration
1616
from retrieval_graph.researcher_graph.graph import graph as researcher_graph
17-
from retrieval_graph.state import AgentState, InputState, Router
17+
from retrieval_graph.state import AgentState, InputState
1818
from shared.utils import format_docs, load_chat_model
1919

2020

21-
2221
async def respond_to_general_query(
2322
state: AgentState, *, config: RunnableConfig
2423
) -> dict[str, list[BaseMessage]]:

src/shared/retrieval.py

Lines changed: 51 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,8 @@
55
"""
66

77
import os
8-
from contextlib import contextmanager
9-
from typing import Generator
10-
8+
from contextlib import contextmanager, asynccontextmanager
9+
from typing import AsyncGenerator, Tuple
1110
from langchain_core.embeddings import Embeddings
1211
from langchain_core.runnables import RunnableConfig
1312
from langchain_core.vectorstores import VectorStoreRetriever
@@ -29,33 +28,68 @@ def make_text_encoder(model: str) -> Embeddings:
2928

3029

3130
## Retriever constructors
32-
@contextmanager
33-
def make_pinecone_retriever(
31+
@asynccontextmanager
async def make_pinecone_retriever(
    configuration: BaseConfiguration, embedding_model: Embeddings
) -> AsyncGenerator[Tuple[VectorStoreRetriever, "PineconeVectorStore"], None]:
    """Configure this agent to connect to a specific Pinecone index and return both retriever and vectorstore.

    Reads PINECONE_API_KEY, PINECONE_ENVIRONMENT and PINECONE_INDEX_NAME from
    the environment. Creates the index (serverless, 1536-dim, cosine) if it
    does not exist yet.

    Yields:
        A ``(retriever, vectorstore)`` tuple.
    """
    from langchain_pinecone import PineconeVectorStore
    from pinecone import Pinecone, ServerlessSpec

    # NOTE(review): the v3+ serverless Pinecone client takes only api_key;
    # confirm the installed SDK still accepts `environment=` — it may be ignored.
    pinecone_client = Pinecone(
        api_key=os.environ["PINECONE_API_KEY"],
        environment=os.environ["PINECONE_ENVIRONMENT"]
    )

    index_name = os.environ["PINECONE_INDEX_NAME"]
    indexes = pinecone_client.list_indexes().names()

    print("🔎 Index disponibles :", indexes)

    if index_name not in indexes:
        print(f"⚠️ L'index '{index_name}' n'existe pas. Création...")
        # Dimension 1536 matches OpenAI ada-002 / 3-small embeddings —
        # TODO(review): confirm it matches the configured embedding model.
        pinecone_client.create_index(
            name=index_name,
            dimension=1536,
            metric="cosine",
            spec=ServerlessSpec(
                cloud="aws",  # or "gcp"
                region="us-east-1"  # adapt
            )
        )
        print(f"✅ Index '{index_name}' créé.")

    vectorstore = PineconeVectorStore.from_existing_index(
        index_name=index_name,
        embedding=embedding_model
    )

    retriever = vectorstore.as_retriever(search_kwargs=configuration.search_kwargs)

    yield retriever, vectorstore
73+
74+
@asynccontextmanager
async def make_retriever(
    config: RunnableConfig,
) -> AsyncGenerator[Tuple[VectorStoreRetriever, object], None]:
    """
    Create a retriever for the agent, based on the current configuration.
    Returns both the retriever and the underlying vectorstore (if available).
    """
    configuration = BaseConfiguration.from_runnable_config(config)
    embedding_model = make_text_encoder(configuration.embedding_model)

    provider = configuration.retriever_provider
    if provider == "pinecone":
        async with make_pinecone_retriever(configuration, embedding_model) as pair:
            retriever, vectorstore = pair
            yield retriever, vectorstore
    else:
        supported = ", ".join(
            BaseConfiguration.__annotations__["retriever_provider"].__args__
        )
        raise ValueError(
            "Unrecognized retriever_provider in configuration. "
            f"Expected one of: {supported}\n"
            f"Got: {provider}"
        )

src/shared/utils.py

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,13 @@
44
format_docs: Convert documents to an xml-formatted string.
55
load_chat_model: Load a chat model from a model name.
66
"""
7-
7+
import os
88
from typing import Optional
99

1010
from langchain.chat_models import init_chat_model
1111
from langchain_core.documents import Document
1212
from langchain_core.language_models import BaseChatModel
13+
from pinecone import Index, Pinecone
1314

1415

1516
def _format_doc(doc: Document) -> str:
@@ -62,6 +63,17 @@ def format_docs(docs: Optional[list[Document]]) -> str:
6263
{formatted}
6364
</documents>"""
6465

66+
def load_pinecone_index(index_name: str) -> Index:
    """Return a handle to the named Pinecone index.

    Args:
        index_name (str): The name of the Pinecone index to load.

    Returns:
        Index: The Pinecone index.
    """
    api_key = os.environ["PINECONE_API_KEY"]
    return Pinecone(api_key=api_key).Index(index_name)
6577

6678
def load_chat_model(fully_specified_name: str) -> BaseChatModel:
6779
"""Load a chat model from a fully specified name.

0 commit comments

Comments
 (0)