improve index graph

lucebert · lucebert · commit fc3a4d6176a3 · 2025-03-04T02:01:03.000+01:00
diff --git a/.env.example b/.env.example
@@ -25,3 +25,6 @@ PINECONE_INDEX_NAME=...
 
 ## Mongo Atlas
 MONGODB_URI=... # Full connection string
+
+## Index API key
+INDEX_API_KEY=...
diff --git a/pyproject.toml b/pyproject.toml
@@ -22,6 +22,7 @@ dependencies = [
     "langchain-cohere>=0.2.4",
     "bs4>=0.0.2",
     "lxml>=5.3.0",
+    "pinecone>=6.0.1",
 ]
 
 [project.optional-dependencies]
diff --git a/src/index_graph/configuration.py b/src/index_graph/configuration.py
@@ -2,7 +2,7 @@
 
 from __future__ import annotations
 
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 
 from shared.configuration import BaseConfiguration
 
@@ -18,4 +18,5 @@ class IndexConfiguration(BaseConfiguration):
 
     This class defines the parameters needed for configuring the indexing and
     retrieval processes, including embedding model selection, retriever provider choice, and search parameters.
-    """
+    """
+    api_key: str = field(default="", metadata={"description": "The API key for indexing documents."})
diff --git a/src/index_graph/graph.py b/src/index_graph/graph.py
@@ -1,6 +1,7 @@
 """This "graph" simply exposes an endpoint for a user to upload docs to be indexed."""
 
 import asyncio
+import os
 from typing import List, Optional
 
 import requests
@@ -11,33 +12,35 @@
 from langgraph.graph import END, START, StateGraph
 
 from index_graph.configuration import IndexConfiguration
-from index_graph.state import IndexState
+from index_graph.state import IndexState, InputState
 from shared import retrieval
 
 
-async def index_docs(
-    state: IndexState, *, config: Optional[RunnableConfig] = None
-) -> dict[str, str]:
-    """Asynchronously index documents in the given state using the configured retriever.
-
-    This function takes the documents from the state, ensures they have a user ID,
-    adds them to the retriever's index, and then signals for the documents to be
-    deleted from the state.
-
-    If docs are not provided in the state, they will be loaded
-    from the configuration.docs_file JSON file.
+def check_index_config(state: IndexState, *, config: Optional[RunnableConfig] = None) -> dict[str, str]:
+    """Check the API key and that the configured retriever provider is Pinecone."""
+    configuration = IndexConfiguration.from_runnable_config(config)
 
-    Args:
-        state (IndexState): The current state containing documents and retriever.
-        config (Optional[RunnableConfig]): Configuration for the indexing process.r
-    """
-    if not config:
-        raise ValueError("Configuration required to run index_docs.")
+    if not configuration.api_key:
+        raise ValueError("API key is required for document indexing.")
+    
+    if configuration.api_key != os.getenv("INDEX_API_KEY"):
+        raise ValueError("Authentication failed: Invalid API key provided.")
+    
+    if configuration.retriever_provider != "pinecone":
+        raise ValueError("Only Pinecone is currently supported for document indexing due to specific ID prefix requirements.")
+    
+    return {}
 
+async def get_sitemap_urls(state: IndexState, *, config: Optional[RunnableConfig] = None) -> dict[str, str]:
+    """Get the URLs from the sitemap."""
     url = state.url_site_map
-
-    # Load and parse the sitemap
-
+    
+    headers = {
+        "Accept": "application/xml",
+        "User-Agent": "Mozilla/5.0 (compatible; LangChainBot/1.0)",
+    }
+    response = requests.get(url, headers=headers)
+    sitemap_content = response.text
     headers = {
         "Accept": "application/xml",
         "User-Agent": "Mozilla/5.0 (compatible; LangChainBot/1.0)",
@@ -50,64 +53,67 @@ async def index_docs(
 
     root = ET.fromstring(sitemap_content)
     # Extract all URLs, removing frequency and other metadata
-    urls = [
+    urls_to_index = [
         url.find("{http://www.sitemaps.org/schemas/sitemap/0.9}loc").text
         for url in root.findall("{http://www.sitemaps.org/schemas/sitemap/0.9}url")
     ]
 
-    # Process each URL
-    docs = []
+    print(f"Found {len(urls_to_index)} URLs to index.")
 
-    # Convert synchronous function to async
-    async def async_get_web_chunks(url):
-        return await asyncio.get_event_loop().run_in_executor(
-            None, get_web_chuncks, url
-        )
+    return {"urls_to_index": urls_to_index}
 
-    # Process all URLs in parallel
-    chunk_tasks = [async_get_web_chunks(url) for url in urls]
-    chunks_list = await asyncio.gather(*chunk_tasks)
+async def index_docs(
+    state: IndexState, *, config: Optional[RunnableConfig] = None
+) -> dict[str, str]:
+    """Asynchronously index documents in the given state using the configured retriever.
 
-    # Flatten the list of lists into single docs list
-    docs = [doc for chunks in chunks_list for doc in chunks]
+    This function starts one `index_url` task for every URL in
+    `state.urls_to_index`, awaits them all with `asyncio.gather`, and
+    returns an empty state update.
 
-    print(f"Indexing {len(docs)} documents from {url}.")
+    The URLs are read from `state.urls_to_index`, which the
+    `get_sitemap_urls` node fills in before this node runs.
 
-    with retrieval.make_retriever(config) as retriever:
-        batch_size = min(
-            500, max(1, len(docs))
-        )  # Target 500 docs per batch, but handle smaller doc counts
-        for i in range(0, len(docs), batch_size):
-            batch = docs[i : i + batch_size]
-            await retriever.aadd_documents(batch)
-            print(
-                f"Indexed batch {i//batch_size + 1} of {(len(docs) + batch_size - 1)//batch_size}"
-            )
+    Args:
+        state (IndexState): The current state containing the URLs to index.
+        config (Optional[RunnableConfig]): Configuration for the indexing process.
+    """
+    # Process all URLs in parallel
+    chunk_tasks = [index_url(url, config) for url in state.urls_to_index]
+    await asyncio.gather(*chunk_tasks)
 
-    return {"docs": docs}
+    return {}
 
 
-def get_web_chuncks(path: str) -> List[Document]:
+async def index_url(url: str, config: IndexConfiguration) -> List[Document]:
     """Index a web path."""
     loader = WebBaseLoader(
-        web_paths=(path,),
+        web_paths=(url,),
     )
     docs = loader.load()
-
-    print(f"Loaded {len(docs)} documents from {path}.")
-
     text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
-
     docs = text_splitter.split_documents(docs)
 
-    print(f"Split {len(docs)} documents into chunks.")
+    with retrieval.make_retriever(config) as retriever:
+
+        await retriever.vectorstore.aadd_texts(
+            #namespace= "langgraph" if "langgraph" in url else "langchain",
+            texts=[doc.page_content for doc in docs],
+            metadatas=[doc.metadata for doc in docs],
+            id_prefix=url,
+        )
+
     return docs
 
 
 # Define the graph
-builder = StateGraph(IndexState, config_schema=IndexConfiguration)
+builder = StateGraph(IndexState, input=InputState, config_schema=IndexConfiguration)
+builder.add_node(check_index_config)
 builder.add_node(index_docs)
-builder.add_edge(START, "index_docs")
+builder.add_node(get_sitemap_urls)
+builder.add_edge(START, "check_index_config")
+builder.add_edge("check_index_config", "get_sitemap_urls")
+builder.add_edge("get_sitemap_urls", "index_docs")
 builder.add_edge("index_docs", END)
 # Compile into a graph object that you can invoke and deploy.
 graph = builder.compile()
diff --git a/src/index_graph/state.py b/src/index_graph/state.py
@@ -1,18 +1,26 @@
 """State management for the index graph."""
 
-from dataclasses import dataclass
+from dataclasses import dataclass, field
+
+
+@dataclass(kw_only=True)
+class InputState:
+    """The input state for the index graph."""
+    
+    url_site_map: str
+    """The URL to the site map to index."""
 
 
 # The index state defines the simple IO for the single-node index graph
 @dataclass(kw_only=True)
-class IndexState:
+class IndexState(InputState):
     """Represents the state for document indexing and retrieval.
 
     This class defines the structure of the index state, which includes
     the documents to be indexed and the retriever used for searching
     these documents.
     """
 
+    urls_to_index: list[str] = field(default_factory=list)
+    """The URLs to index."""
 
-    url_site_map: str
-    """The URL to the site map to index."""
diff --git a/src/simple_rag/graph.py b/src/simple_rag/graph.py
@@ -45,7 +45,7 @@ async def generate(state: GraphState):
 
     # RAG generation
     # Prompt
-    prompt = hub.pull("self-rag")
+    prompt = hub.pull("langchaindoc/simple-rag")
 
     # LLM
     llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)

Original file line number	Diff line number	Diff line change
`@@ -22,6 +22,7 @@ dependencies = [`
`22`	`22`	`"langchain-cohere>=0.2.4",`
`23`	`23`	`"bs4>=0.0.2",`
`24`	`24`	`"lxml>=5.3.0",`
	`25`	`+ "pinecone>=6.0.1",`
`25`	`26`	`]`
`26`	`27`
`27`	`28`	`[project.optional-dependencies]`