+"""This "graph" simply exposes an endpoint for a user to upload docs to be indexed."""
+
 import asyncio
 import os
-import logging
 from typing import List, Optional
-from datetime import datetime
-from pathlib import Path
-import gc
-from pinecone import Index

 import requests
 from langchain_community.document_loaders import WebBaseLoader
 from langchain_core.documents import Document
 from langchain_core.runnables import RunnableConfig
 from langchain_text_splitters import RecursiveCharacterTextSplitter
 from langgraph.graph import END, START, StateGraph
+
 from index_graph.configuration import IndexConfiguration
 from index_graph.state import IndexState, InputState
 from shared import retrieval
-from shared.utils import load_pinecone_index

-# Configure logging for errors and status
-LOG_PATH = Path("indexing_errors.log")
-logging.basicConfig(
-    filename=LOG_PATH,
-    filemode="a",
-    format="%(asctime)s [%(levelname)s] %(message)s",
-    level=logging.INFO,
-)

 def check_index_config(state: IndexState, *, config: Optional[RunnableConfig] = None) -> dict[str, str]:
-    """Validate API key and supported retriever provider."""
+    """Check the API key."""
     configuration = IndexConfiguration.from_runnable_config(config)

     if not configuration.api_key:
         raise ValueError("API key is required for document indexing.")
-
+
     if configuration.api_key != os.getenv("INDEX_API_KEY"):
         raise ValueError("Authentication failed: Invalid API key provided.")
-
+
     if configuration.retriever_provider != "pinecone":
         raise ValueError("Only Pinecone is currently supported for document indexing due to specific ID prefix requirements.")
-
+
     return {}

 async def get_sitemap_urls(state: IndexState, *, config: Optional[RunnableConfig] = None) -> dict[str, str]:
-    """Fetch all URLs from a sitemap (XML format)."""
+    """Get the URLs from the sitemap."""
     url = state.url_site_map
-
+
     headers = {
         "Accept": "application/xml",
         "User-Agent": "Mozilla/5.0 (compatible; LangChainBot/1.0)",
     }
     response = requests.get(url, headers=headers)
     sitemap_content = response.text

+    # Extract URLs from sitemap (assuming XML format)
     import xml.etree.ElementTree as ET
+
     root = ET.fromstring(sitemap_content)
+    # Extract all URLs, removing frequency and other metadata
     urls_to_index = [
         url.find("{http://www.sitemaps.org/schemas/sitemap/0.9}loc").text
         for url in root.findall("{http://www.sitemaps.org/schemas/sitemap/0.9}url")
     ]

     print(f"Found {len(urls_to_index)} URLs to index.")
+
     return {"urls_to_index": urls_to_index}

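For reference, the namespace-qualified `findall`/`find` calls above assume a standard sitemap layout; a minimal sketch with hypothetical URLs shows the shape being parsed and that only the `<loc>` values survive:

```python
# Illustrative only: a minimal standards-compliant sitemap and the same parse
# logic used above; the URLs are placeholders.
import xml.etree.ElementTree as ET

SAMPLE_SITEMAP = """<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url><loc>https://example.com/docs/page-1</loc><changefreq>weekly</changefreq></url>
  <url><loc>https://example.com/docs/page-2</loc></url>
</urlset>"""

root = ET.fromstring(SAMPLE_SITEMAP)
locs = [
    url.find("{http://www.sitemaps.org/schemas/sitemap/0.9}loc").text
    for url in root.findall("{http://www.sitemaps.org/schemas/sitemap/0.9}url")
]
print(locs)  # ['https://example.com/docs/page-1', 'https://example.com/docs/page-2']
```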
-async def index_docs(state: IndexState, *, config: Optional[RunnableConfig] = None) -> dict[str, str]:
-    """Index documents from all URLs in batches of 100, without concurrency limitation."""
-    # Load Pinecone index once
-    index_name = os.environ["PINECONE_INDEX_NAME"]
-    index = load_pinecone_index(index_name)
-
-    success_count = 0
-    fail_count = 0
-
-    async def safe_index_url(url: str) -> None:
-        nonlocal success_count, fail_count
-        try:
-            await index_url(url, config=config, index=index)
-            success_count += 1
-        except Exception as e:
-            logging.error(f"Failed indexing {url}: {e}")
-            with open("failed_urls.txt", "a") as f:
-                f.write(f"{url}\n")
-            fail_count += 1
-        finally:
-            gc.collect()
-
-    # Process URLs in batches of 100
-    batch_size = 100
-    total = len(state.urls_to_index)
-    for i in range(0, total, batch_size):
-        current_batch = state.urls_to_index[i:i + batch_size]
-        print(f"🔄 Processing batch {i // batch_size + 1} / {(total + batch_size - 1) // batch_size}")
-        tasks = [safe_index_url(url) for url in current_batch]
-        await asyncio.gather(*tasks, return_exceptions=True)
-
-    print(f"Indexed: {success_count} | Failed: {fail_count}")
-    return {
-        "success_count": str(success_count),
-        "fail_count": str(fail_count),
-    }
+async def index_docs(
+    state: IndexState, *, config: Optional[RunnableConfig] = None
+) -> dict[str, str]:
+    """Asynchronously index documents from the URLs in the given state.
+
+    Each URL gathered from the sitemap is loaded, split into chunks, and added
+    to the configured retriever's index. All URLs are processed in parallel.
+
+    Args:
+        state (IndexState): The current state containing the URLs to index.
+        config (Optional[RunnableConfig]): Configuration for the indexing process.
+    """
+    # Process all URLs in parallel
+    chunk_tasks = [index_url(url, config) for url in state.urls_to_index]
+    await asyncio.gather(*chunk_tasks)
+
+    return {}


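Note that this version launches one task per URL with no concurrency cap, whereas the removed implementation above processed URLs in batches of 100. If the unbounded fan-out ever proves too aggressive, a semaphore is one way to bound it; the sketch below is illustrative only and not part of this commit (the helper name and the limit of 20 are made up):

```python
# Sketch only: bound the number of concurrently indexed URLs with a semaphore.
# `index_url` is the coroutine added in this commit; `limit` is an arbitrary example.
import asyncio

async def index_urls_bounded(urls, config, limit: int = 20) -> None:
    semaphore = asyncio.Semaphore(limit)

    async def bounded(url: str) -> None:
        async with semaphore:
            await index_url(url, config)

    await asyncio.gather(*(bounded(url) for url in urls))
```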
-async def index_url(url: str, config: IndexConfiguration, index: Index, retry: int = 1) -> List[Document]:
-    """Delete old chunks and re-index content from a given URL."""
-    try:
-        logging.info(f"Indexing: {url}")
-        loader = WebBaseLoader(web_paths=(url,))
-        docs = loader.load()
-        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
-        docs = text_splitter.split_documents(docs)
-
-        now_str = datetime.utcnow().isoformat()
-        for doc in docs:
-            doc.metadata["source_url"] = url
-            doc.metadata["last_indexed_at"] = now_str
-
-        texts = [doc.page_content for doc in docs]
-        metadatas = [doc.metadata for doc in docs]
-        chunk_ids = [f"{url}--chunk{i}" for i in range(len(texts))]
-
-
-        print(f"Checking for existing chunks at prefix: {url}")
-        existing_urls = list(index.list(prefix=f"{url}"))
-
-        if existing_urls:
-            print(f"Deleted old chunks ({len(existing_urls)}) for {url}")
-            index.delete(ids=existing_urls)
-        else:
-            print(f"No existing chunks found for {url}")
-
-        async with retrieval.make_retriever(config) as (_, vectorstore):
-            if hasattr(vectorstore, "aadd_texts"):
-                await vectorstore.aadd_texts(
-                    texts=texts,
-                    metadatas=metadatas,
-                    ids=chunk_ids
-                )
-            else:
-                for i, doc in enumerate(docs):
-                    doc.metadata["id"] = chunk_ids[i]
-                await vectorstore.aadd_documents(docs)
-
-        logging.info(f"Successfully indexed {url}")
-        return docs
-
-    except Exception as e:
-        if retry > 0:
-            logging.warning(f"⚠️ Retry {url} after error: {e}")
-            await asyncio.sleep(1)
-            return await index_url(url, config, index, retry=retry - 1)
-        else:
-            logging.error(f"Final failure for {url}: {e}")
-            return []
-
-# Define the graph structure
+async def index_url(url: str, config: IndexConfiguration) -> List[Document]:
+    """Index a web path."""
+    loader = WebBaseLoader(
+        web_paths=(url,),
+    )
+    docs = loader.load()
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
+    docs = text_splitter.split_documents(docs)
+
+    with retrieval.make_retriever(config) as retriever:
+
+        await retriever.vectorstore.aadd_texts(
+            # namespace="langgraph" if "langgraph" in url else "langchain",
+            texts=[doc.page_content for doc in docs],
+            metadatas=[doc.metadata for doc in docs],
+            id_prefix=url,
+        )
+
+    return docs
+
+
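The `id_prefix=url` passed to `aadd_texts` is what the Pinecone-only check in `check_index_config` alludes to: assuming the retriever wraps a `PineconeVectorStore`, whose `add_texts`/`aadd_texts` accept an `id_prefix`, every chunk ID is prefixed with its source URL, so stale chunks for a page can later be found and removed by prefix, much as the removed implementation above did. A rough sketch of that cleanup with the Pinecone client (the index name comes from `PINECONE_INDEX_NAME` as in the removed code; the API-key variable and example URL are assumptions):

```python
# Sketch only: delete previously indexed chunks for one URL by ID prefix
# before re-indexing it. Mirrors the removed implementation's intent.
import os

from pinecone import Pinecone

pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])  # assumed env var
index = pc.Index(os.environ["PINECONE_INDEX_NAME"])

url = "https://example.com/docs/page-1"  # placeholder
for id_page in index.list(prefix=url):  # list() yields pages of matching IDs
    index.delete(ids=id_page)
```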
+# Define the graph
 builder = StateGraph(IndexState, input=InputState, config_schema=IndexConfiguration)
 builder.add_node(check_index_config)
 builder.add_node(index_docs)
@@ -162,7 +115,6 @@ async def index_url(url: str, config: IndexConfiguration, index:Index, retry: in
 builder.add_edge("check_index_config", "get_sitemap_urls")
 builder.add_edge("get_sitemap_urls", "index_docs")
 builder.add_edge("index_docs", END)
-
-# Compile the state graph for execution
+# Compile into a graph object that you can invoke and deploy.
 graph = builder.compile()
 graph.name = "IndexGraph"
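For a quick local smoke test of the compiled graph, an invocation might look roughly like the following. The state field `url_site_map` and the `api_key`/`retriever_provider` options come from the state and configuration modules imported above; the sitemap URL is a placeholder, and the `configurable` mapping is the usual LangChain/LangGraph convention assumed here:

```python
# Minimal local invocation sketch; values are placeholders.
import asyncio
import os

result = asyncio.run(
    graph.ainvoke(
        {"url_site_map": "https://example.com/sitemap.xml"},
        config={
            "configurable": {
                "api_key": os.environ["INDEX_API_KEY"],
                "retriever_provider": "pinecone",
            }
        },
    )
)
print(result)
```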