Skip to content

Commit 68be152

Browse files
committed
Revert "Merge pull request #2 from lucebert/feat/delete-previous-docs"
This reverts commit bf93213, reversing changes made to 4a7840d.
1 parent bf93213 commit 68be152

5 files changed

Lines changed: 84 additions & 178 deletions

File tree

src/index_graph/configuration.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,5 +19,4 @@ class IndexConfiguration(BaseConfiguration):
1919
This class defines the parameters needed for configuring the indexing and
2020
retrieval processes, including embedding model selection, retriever provider choice, and search parameters.
2121
"""
22-
api_key: str = field(default="", metadata={"description": "The API key for indexing documents."})
23-
pinecone_index: str = field(default="langchain-doc", metadata={"description": "The Pinecone index to use for indexing documents."})
22+
api_key: str = field(default="", metadata={"description": "The API key for indexing documents."})

src/index_graph/graph.py

Lines changed: 63 additions & 111 deletions
Original file line numberDiff line numberDiff line change
@@ -1,159 +1,112 @@
1+
"""This "graph" simply exposes an endpoint for a user to upload docs to be indexed."""
2+
13
import asyncio
24
import os
3-
import logging
45
from typing import List, Optional
5-
from datetime import datetime
6-
from pathlib import Path
7-
import gc
8-
from pinecone import Index
96

107
import requests
118
from langchain_community.document_loaders import WebBaseLoader
129
from langchain_core.documents import Document
1310
from langchain_core.runnables import RunnableConfig
1411
from langchain_text_splitters import RecursiveCharacterTextSplitter
1512
from langgraph.graph import END, START, StateGraph
13+
1614
from index_graph.configuration import IndexConfiguration
1715
from index_graph.state import IndexState, InputState
1816
from shared import retrieval
19-
from shared.utils import load_pinecone_index
2017

21-
# Configure logging for errors and status
22-
LOG_PATH = Path("indexing_errors.log")
23-
logging.basicConfig(
24-
filename=LOG_PATH,
25-
filemode="a",
26-
format="%(asctime)s [%(levelname)s] %(message)s",
27-
level=logging.INFO,
28-
)
2918

3019
def check_index_config(state: IndexState, *, config: Optional[RunnableConfig] = None) -> dict[str, str]:
31-
"""Validate API key and supported retriever provider."""
20+
"""Check the API key and that the retriever provider is Pinecone."""
3221
configuration = IndexConfiguration.from_runnable_config(config)
3322

3423
if not configuration.api_key:
3524
raise ValueError("API key is required for document indexing.")
36-
25+
3726
if configuration.api_key != os.getenv("INDEX_API_KEY"):
3827
raise ValueError("Authentication failed: Invalid API key provided.")
39-
28+
4029
if configuration.retriever_provider != "pinecone":
4130
raise ValueError("Only Pinecone is currently supported for document indexing due to specific ID prefix requirements.")
42-
31+
4332
return {}
4433

4534
async def get_sitemap_urls(state: IndexState, *, config: Optional[RunnableConfig] = None) -> dict[str, str]:
46-
"""Fetch all URLs from a sitemap (XML format)."""
35+
"""Get the URLs from the sitemap."""
4736
url = state.url_site_map
48-
37+
38+
headers = {
39+
"Accept": "application/xml",
40+
"User-Agent": "Mozilla/5.0 (compatible; LangChainBot/1.0)",
41+
}
42+
response = requests.get(url, headers=headers)
43+
sitemap_content = response.text
4944
headers = {
5045
"Accept": "application/xml",
5146
"User-Agent": "Mozilla/5.0 (compatible; LangChainBot/1.0)",
5247
}
5348
response = requests.get(url, headers=headers)
5449
sitemap_content = response.text
5550

51+
# Extract URLs from sitemap (assuming XML format)
5652
import xml.etree.ElementTree as ET
53+
5754
root = ET.fromstring(sitemap_content)
55+
# Extract all URLs, removing frequency and other metadata
5856
urls_to_index = [
5957
url.find("{http://www.sitemaps.org/schemas/sitemap/0.9}loc").text
6058
for url in root.findall("{http://www.sitemaps.org/schemas/sitemap/0.9}url")
6159
]
6260

6361
print(f"Found {len(urls_to_index)} URLs to index.")
62+
6463
return {"urls_to_index": urls_to_index}
6564

66-
async def index_docs(state: IndexState, *, config: Optional[RunnableConfig] = None) -> dict[str, str]:
67-
"""Index documents from all URLs in batches of 100, without concurrency limitation."""
68-
# Load Pinecone index once
69-
index_name = os.environ["PINECONE_INDEX_NAME"]
70-
index = load_pinecone_index(index_name)
71-
72-
success_count = 0
73-
fail_count = 0
74-
75-
async def safe_index_url(url: str) -> None:
76-
nonlocal success_count, fail_count
77-
try:
78-
await index_url(url, config=config, index=index)
79-
success_count += 1
80-
except Exception as e:
81-
logging.error(f"Failed indexing {url}: {e}")
82-
with open("failed_urls.txt", "a") as f:
83-
f.write(f"{url}\n")
84-
fail_count += 1
85-
finally:
86-
gc.collect()
87-
88-
# Process URLs in batches of 100
89-
batch_size = 100
90-
total = len(state.urls_to_index)
91-
for i in range(0, total, batch_size):
92-
current_batch = state.urls_to_index[i:i + batch_size]
93-
print(f"🔄 Processing batch {i // batch_size + 1} / {(total + batch_size - 1) // batch_size}")
94-
tasks = [safe_index_url(url) for url in current_batch]
95-
await asyncio.gather(*tasks, return_exceptions=True)
96-
97-
print(f"Indexed: {success_count} | Failed: {fail_count}")
98-
return {
99-
"success_count": str(success_count),
100-
"fail_count": str(fail_count),
101-
}
65+
async def index_docs(
66+
state: IndexState, *, config: Optional[RunnableConfig] = None
67+
) -> dict[str, str]:
68+
"""Asynchronously index documents in the given state using the configured retriever.
69+
70+
This function takes the URLs listed in ``state.urls_to_index``, starts one
71+
``index_url`` call per URL, and awaits all of them together with
72+
``asyncio.gather``. It returns an empty dict.
73+
74+
Documents are not read from the state or from a JSON file here; each page is
75+
loaded and split into chunks inside ``index_url``.
76+
77+
Args:
78+
state (IndexState): The current state; ``urls_to_index`` is read from it.
79+
config (Optional[RunnableConfig]): Configuration for the indexing process.
80+
"""
81+
# Process all URLs in parallel
82+
chunk_tasks = [index_url(url, config) for url in state.urls_to_index]
83+
await asyncio.gather(*chunk_tasks)
84+
85+
return {}
10286

10387

104-
async def index_url(url: str, config: IndexConfiguration, index:Index, retry: int = 1) -> List[Document]:
105-
"""Delete old chunks and re-index content from a given URL."""
106-
try:
107-
logging.info(f"Indexing: {url}")
108-
loader = WebBaseLoader(web_paths=(url,))
109-
docs = loader.load()
110-
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
111-
docs = text_splitter.split_documents(docs)
112-
113-
now_str = datetime.utcnow().isoformat()
114-
for doc in docs:
115-
doc.metadata["source_url"] = url
116-
doc.metadata["last_indexed_at"] = now_str
117-
118-
texts = [doc.page_content for doc in docs]
119-
metadatas = [doc.metadata for doc in docs]
120-
chunk_ids = [f"{url}--chunk{i}" for i in range(len(texts))]
121-
122-
123-
print(f"Checking for existing chunks at prefix: {url}")
124-
existing_urls = list(index.list(prefix=f"{url}"))
125-
126-
if existing_urls:
127-
print(f"Deleted old chunks ({len(existing_urls)}) for {url}")
128-
index.delete(ids=existing_urls)
129-
else:
130-
print(f"No existing chunks found for {url}")
131-
132-
async with retrieval.make_retriever(config) as (_, vectorstore):
133-
if hasattr(vectorstore, "aadd_texts"):
134-
await vectorstore.aadd_texts(
135-
texts=texts,
136-
metadatas=metadatas,
137-
ids=chunk_ids
138-
)
139-
else:
140-
for i, doc in enumerate(docs):
141-
doc.metadata["id"] = chunk_ids[i]
142-
await vectorstore.aadd_documents(docs)
143-
144-
logging.info(f"Successfully indexed {url}")
145-
return docs
146-
147-
except Exception as e:
148-
if retry > 0:
149-
logging.warning(f"⚠️ Retry {url} after error: {e}")
150-
await asyncio.sleep(1)
151-
return await index_url(url, config,index, retry=retry - 1)
152-
else:
153-
logging.error(f"Final failure for {url}: {e}")
154-
return []
155-
156-
# Define the graph structure
88+
async def index_url(url: str, config: IndexConfiguration) -> List[Document]:
89+
"""Index a web path."""
90+
loader = WebBaseLoader(
91+
web_paths=(url,),
92+
)
93+
docs = loader.load()
94+
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
95+
docs = text_splitter.split_documents(docs)
96+
97+
with retrieval.make_retriever(config) as retriever:
98+
99+
await retriever.vectorstore.aadd_texts(
100+
#namespace= "langgraph" if "langgraph" in url else "langchain",
101+
texts=[doc.page_content for doc in docs],
102+
metadatas=[doc.metadata for doc in docs],
103+
id_prefix=url,
104+
)
105+
106+
return docs
107+
108+
109+
# Define the graph
157110
builder = StateGraph(IndexState, input=InputState, config_schema=IndexConfiguration)
158111
builder.add_node(check_index_config)
159112
builder.add_node(index_docs)
@@ -162,7 +115,6 @@ async def index_url(url: str, config: IndexConfiguration, index:Index, retry: in
162115
builder.add_edge("check_index_config", "get_sitemap_urls")
163116
builder.add_edge("get_sitemap_urls", "index_docs")
164117
builder.add_edge("index_docs", END)
165-
166-
# Compile the state graph for execution
118+
# Compile into a graph object that you can invoke and deploy.
167119
graph = builder.compile()
168120
graph.name = "IndexGraph"

src/retrieval_graph/graph.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,10 +14,11 @@
1414

1515
from retrieval_graph.configuration import AgentConfiguration
1616
from retrieval_graph.researcher_graph.graph import graph as researcher_graph
17-
from retrieval_graph.state import AgentState, InputState
17+
from retrieval_graph.state import AgentState, InputState, Router
1818
from shared.utils import format_docs, load_chat_model
1919

2020

21+
2122
async def respond_to_general_query(
2223
state: AgentState, *, config: RunnableConfig
2324
) -> dict[str, list[BaseMessage]]:

src/shared/retrieval.py

Lines changed: 17 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,9 @@
55
"""
66

77
import os
8-
from contextlib import contextmanager, asynccontextmanager
9-
from typing import AsyncGenerator, Tuple
8+
from contextlib import contextmanager
9+
from typing import Generator
10+
1011
from langchain_core.embeddings import Embeddings
1112
from langchain_core.runnables import RunnableConfig
1213
from langchain_core.vectorstores import VectorStoreRetriever
@@ -28,68 +29,33 @@ def make_text_encoder(model: str) -> Embeddings:
2829

2930

3031
## Retriever constructors
31-
@asynccontextmanager
32-
async def make_pinecone_retriever(
32+
@contextmanager
33+
def make_pinecone_retriever(
3334
configuration: BaseConfiguration, embedding_model: Embeddings
34-
) -> AsyncGenerator[Tuple[VectorStoreRetriever, "PineconeVectorStore"], None]:
35-
"""Configure this agent to connect to a specific Pinecone index and return both retriever and vectorstore."""
36-
35+
) -> Generator[VectorStoreRetriever, None, None]:
36+
"""Configure this agent to connect to a specific pinecone index."""
3737
from langchain_pinecone import PineconeVectorStore
38-
from pinecone import Pinecone, ServerlessSpec
39-
40-
pinecone_client = Pinecone(
41-
api_key=os.environ["PINECONE_API_KEY"],
42-
environment=os.environ["PINECONE_ENVIRONMENT"]
43-
)
44-
45-
index_name = os.environ["PINECONE_INDEX_NAME"]
46-
indexes = pinecone_client.list_indexes().names()
47-
48-
print("🔎 Index disponibles :", indexes)
4938

50-
if index_name not in indexes:
51-
print(f"⚠️ L'index '{index_name}' n'existe pas. Création...")
52-
# pinecone_client.create_index(name=index_name, dimension=1536, metric="cosine")
53-
54-
pinecone_client.create_index(
55-
name=index_name,
56-
dimension=1536,
57-
metric="cosine",
58-
spec=ServerlessSpec(
59-
cloud="aws", # or "gcp"
60-
region="us-east-1" # adapt
61-
)
62-
)
63-
print(f"✅ Index '{index_name}' créé.")
64-
65-
vectorstore = PineconeVectorStore.from_existing_index(
66-
index_name=index_name,
67-
embedding=embedding_model
39+
vstore = PineconeVectorStore.from_existing_index(
40+
os.environ["PINECONE_INDEX_NAME"], embedding=embedding_model
6841
)
42+
yield vstore.as_retriever(search_kwargs=configuration.search_kwargs)
6943

70-
retriever = vectorstore.as_retriever(search_kwargs=configuration.search_kwargs)
71-
72-
yield retriever, vectorstore
73-
74-
@asynccontextmanager
75-
async def make_retriever(
44+
@contextmanager
45+
def make_retriever(
7646
config: RunnableConfig,
77-
) -> AsyncGenerator[Tuple[VectorStoreRetriever, object], None]:
78-
"""
79-
Create a retriever for the agent, based on the current configuration.
80-
Returns both the retriever and the underlying vectorstore (if available).
81-
"""
47+
) -> Generator[VectorStoreRetriever, None, None]:
48+
"""Create a retriever for the agent, based on the current configuration."""
8249
configuration = BaseConfiguration.from_runnable_config(config)
8350
embedding_model = make_text_encoder(configuration.embedding_model)
84-
8551
match configuration.retriever_provider:
8652
case "pinecone":
87-
async with make_pinecone_retriever(configuration, embedding_model) as (retriever, vectorstore):
88-
yield retriever, vectorstore
53+
with make_pinecone_retriever(configuration, embedding_model) as retriever:
54+
yield retriever
8955

9056
case _:
9157
raise ValueError(
92-
"Unrecognized retriever_provider in configuration. "
58+
"Unrecognized retriever_provider in configuration. "
9359
f"Expected one of: {', '.join(BaseConfiguration.__annotations__['retriever_provider'].__args__)}\n"
9460
f"Got: {configuration.retriever_provider}"
9561
)

src/shared/utils.py

Lines changed: 1 addition & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,12 @@
44
format_docs: Convert documents to an xml-formatted string.
55
load_chat_model: Load a chat model from a model name.
66
"""
7-
import os
7+
88
from typing import Optional
99

1010
from langchain.chat_models import init_chat_model
1111
from langchain_core.documents import Document
1212
from langchain_core.language_models import BaseChatModel
13-
from pinecone import Index, Pinecone
1413

1514

1615
def _format_doc(doc: Document) -> str:
@@ -63,17 +62,6 @@ def format_docs(docs: Optional[list[Document]]) -> str:
6362
{formatted}
6463
</documents>"""
6564

66-
def load_pinecone_index(index_name: str) -> Index:
67-
"""Load a Pinecone index from a name.
68-
69-
Args:
70-
index_name (str): The name of the Pinecone index to load.
71-
72-
Returns:
73-
Index: The Pinecone index.
74-
"""
75-
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
76-
return pc.Index(index_name)
7765

7866
def load_chat_model(fully_specified_name: str) -> BaseChatModel:
7967
"""Load a chat model from a fully specified name.

0 commit comments

Comments
 (0)