Skip to content

Commit bf93213

Browse files
authored
Merge pull request #2 from lucebert/feat/delete-previous-docs
Feat/delete previous docs
2 parents 4a7840d + 4534299 commit bf93213

File tree

5 files changed

+178
-84
lines changed

5 files changed

+178
-84
lines changed

src/index_graph/configuration.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,4 +19,5 @@ class IndexConfiguration(BaseConfiguration):
1919
This class defines the parameters needed for configuring the indexing and
2020
retrieval processes, including embedding model selection, retriever provider choice, and search parameters.
2121
"""
22-
api_key: str = field(default="", metadata={"description": "The API key for indexing documents."})
22+
api_key: str = field(default="", metadata={"description": "The API key for indexing documents."})
23+
pinecone_index: str = field(default="langchain-doc", metadata={"description": "The Pinecone index to use for indexing documents."})

src/index_graph/graph.py

Lines changed: 111 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -1,112 +1,159 @@
1-
"""This "graph" simply exposes an endpoint for a user to upload docs to be indexed."""
2-
31
import asyncio
42
import os
3+
import logging
54
from typing import List, Optional
5+
from datetime import datetime
6+
from pathlib import Path
7+
import gc
8+
from pinecone import Index
69

710
import requests
811
from langchain_community.document_loaders import WebBaseLoader
912
from langchain_core.documents import Document
1013
from langchain_core.runnables import RunnableConfig
1114
from langchain_text_splitters import RecursiveCharacterTextSplitter
1215
from langgraph.graph import END, START, StateGraph
13-
1416
from index_graph.configuration import IndexConfiguration
1517
from index_graph.state import IndexState, InputState
1618
from shared import retrieval
19+
from shared.utils import load_pinecone_index
1720

21+
# Configure logging for errors and status
22+
LOG_PATH = Path("indexing_errors.log")
23+
logging.basicConfig(
24+
filename=LOG_PATH,
25+
filemode="a",
26+
format="%(asctime)s [%(levelname)s] %(message)s",
27+
level=logging.INFO,
28+
)
1829

1930
def check_index_config(state: IndexState, *, config: Optional[RunnableConfig] = None) -> dict[str, str]:
    """Validate API key and supported retriever provider.

    Raises:
        ValueError: If the API key is missing or wrong, or if the configured
            retriever provider is anything other than Pinecone.
    """
    cfg = IndexConfiguration.from_runnable_config(config)

    # Guard clauses: fail fast on each misconfiguration in turn.
    if not cfg.api_key:
        raise ValueError("API key is required for document indexing.")

    if cfg.api_key != os.getenv("INDEX_API_KEY"):
        raise ValueError("Authentication failed: Invalid API key provided.")

    if cfg.retriever_provider != "pinecone":
        raise ValueError("Only Pinecone is currently supported for document indexing due to specific ID prefix requirements.")

    return {}
3344

3445
async def get_sitemap_urls(state: IndexState, *, config: Optional[RunnableConfig] = None) -> dict[str, str]:
    """Fetch all URLs from a sitemap (XML format).

    Args:
        state: Current index state; ``state.url_site_map`` holds the sitemap URL.
        config: Optional runnable configuration (unused here).

    Returns:
        A dict with ``urls_to_index``: every ``<loc>`` URL found in the sitemap.

    Raises:
        requests.HTTPError: If the sitemap request returns an error status.
        xml.etree.ElementTree.ParseError: If the response is not valid XML.
    """
    import xml.etree.ElementTree as ET

    url = state.url_site_map
    headers = {
        "Accept": "application/xml",
        "User-Agent": "Mozilla/5.0 (compatible; LangChainBot/1.0)",
    }
    # A timeout prevents the whole graph from hanging on an unresponsive host.
    response = requests.get(url, headers=headers, timeout=30)
    # Surface HTTP errors instead of trying to parse an error page as XML.
    response.raise_for_status()

    ns = "{http://www.sitemaps.org/schemas/sitemap/0.9}"
    root = ET.fromstring(response.text)
    # Keep only entries that actually carry a <loc>; a malformed <url> entry
    # previously caused an AttributeError on `.text`.
    urls_to_index = [
        loc.text
        for entry in root.findall(f"{ns}url")
        if (loc := entry.find(f"{ns}loc")) is not None and loc.text
    ]

    print(f"Found {len(urls_to_index)} URLs to index.")
    return {"urls_to_index": urls_to_index}
6465

65-
async def index_docs(
66-
state: IndexState, *, config: Optional[RunnableConfig] = None
67-
) -> dict[str, str]:
68-
"""Asynchronously index documents in the given state using the configured retriever.
69-
70-
This function takes the documents from the state, ensures they have a user ID,
71-
adds them to the retriever's index, and then signals for the documents to be
72-
deleted from the state.
73-
74-
If docs are not provided in the state, they will be loaded
75-
from the configuration.docs_file JSON file.
76-
77-
Args:
78-
state (IndexState): The current state containing documents and retriever.
79-
config (Optional[RunnableConfig]): Configuration for the indexing process.r
80-
"""
81-
# Process all URLs in parallel
82-
chunk_tasks = [index_url(url, config) for url in state.urls_to_index]
83-
await asyncio.gather(*chunk_tasks)
84-
85-
return {}
86-
87-
88-
async def index_url(url: str, config: IndexConfiguration) -> List[Document]:
89-
"""Index a web path."""
90-
loader = WebBaseLoader(
91-
web_paths=(url,),
92-
)
93-
docs = loader.load()
94-
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
95-
docs = text_splitter.split_documents(docs)
96-
97-
with retrieval.make_retriever(config) as retriever:
98-
99-
await retriever.vectorstore.aadd_texts(
100-
#namespace= "langgraph" if "langgraph" in url else "langchain",
101-
texts=[doc.page_content for doc in docs],
102-
metadatas=[doc.metadata for doc in docs],
103-
id_prefix=url,
104-
)
105-
106-
return docs
66+
async def index_docs(state: IndexState, *, config: Optional[RunnableConfig] = None) -> dict[str, str]:
    """Index documents from all URLs in batches of 100, without concurrency limitation.

    Args:
        state: Current index state; ``state.urls_to_index`` lists URLs to process.
        config: Runnable configuration forwarded to ``index_url``.

    Returns:
        Stringified success/failure counters for the run.
    """
    # Load the Pinecone index once and share it across every URL.
    index_name = os.environ["PINECONE_INDEX_NAME"]
    index = load_pinecone_index(index_name)

    success_count = 0
    fail_count = 0

    async def safe_index_url(url: str) -> None:
        """Index one URL, recording success/failure without propagating errors."""
        nonlocal success_count, fail_count
        try:
            # index_url() swallows its own errors and returns [] after its
            # retries are exhausted, so an empty result must be counted as a
            # failure — otherwise every failed URL is reported as a success.
            # NOTE(review): a page that legitimately splits into zero chunks is
            # also counted as failed here — confirm that is acceptable.
            docs = await index_url(url, config=config, index=index)
            if docs:
                success_count += 1
            else:
                fail_count += 1
                with open("failed_urls.txt", "a") as f:
                    f.write(f"{url}\n")
        except Exception as e:
            logging.error(f"Failed indexing {url}: {e}")
            with open("failed_urls.txt", "a") as f:
                f.write(f"{url}\n")
            fail_count += 1
        finally:
            # Free loader/splitter memory between large pages.
            gc.collect()

    # Process URLs in batches of 100.
    batch_size = 100
    total = len(state.urls_to_index)
    for i in range(0, total, batch_size):
        current_batch = state.urls_to_index[i:i + batch_size]
        print(f"🔄 Processing batch {i // batch_size + 1} / {(total + batch_size - 1) // batch_size}")
        tasks = [safe_index_url(url) for url in current_batch]
        await asyncio.gather(*tasks, return_exceptions=True)

    print(f"Indexed: {success_count} | Failed: {fail_count}")
    return {
        "success_count": str(success_count),
        "fail_count": str(fail_count),
    }
107102

108103

109-
# Define the graph
104+
async def index_url(url: str, config: IndexConfiguration, index: Index, retry: int = 1) -> List[Document]:
    """Delete old chunks and re-index content from a given URL.

    Args:
        url: Page to (re-)index; also used as the chunk-ID prefix in Pinecone.
        config: Configuration forwarded to the retriever factory.
        index: Pinecone index handle used to list/delete stale chunk IDs.
        retry: Remaining retry attempts after a failure.

    Returns:
        The indexed document chunks, or an empty list on final failure.
    """
    try:
        logging.info(f"Indexing: {url}")
        loader = WebBaseLoader(web_paths=(url,))
        docs = loader.load()
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        docs = text_splitter.split_documents(docs)

        # Timezone-aware timestamp: datetime.utcnow() is deprecated (3.12+).
        # The stored value now carries an explicit "+00:00" UTC offset.
        from datetime import timezone
        now_str = datetime.now(timezone.utc).isoformat()
        for doc in docs:
            doc.metadata["source_url"] = url
            doc.metadata["last_indexed_at"] = now_str

        texts = [doc.page_content for doc in docs]
        metadatas = [doc.metadata for doc in docs]
        chunk_ids = [f"{url}--chunk{i}" for i in range(len(texts))]

        print(f"Checking for existing chunks at prefix: {url}")
        # Pinecone's Index.list() is a generator that yields *pages* (lists) of
        # IDs; the previous code passed those nested lists straight to
        # delete(ids=...). Flatten defensively so either shape works.
        # TODO(review): confirm against the installed pinecone SDK version.
        existing_ids: List[str] = []
        for page in index.list(prefix=f"{url}"):
            if isinstance(page, list):
                existing_ids.extend(page)
            else:
                existing_ids.append(page)

        if existing_ids:
            index.delete(ids=existing_ids)
            # Print after the delete actually happened, not before.
            print(f"Deleted old chunks ({len(existing_ids)}) for {url}")
        else:
            print(f"No existing chunks found for {url}")

        async with retrieval.make_retriever(config) as (_, vectorstore):
            if hasattr(vectorstore, "aadd_texts"):
                await vectorstore.aadd_texts(
                    texts=texts,
                    metadatas=metadatas,
                    ids=chunk_ids
                )
            else:
                # Fallback for stores without aadd_texts: stash the ID in metadata.
                for i, doc in enumerate(docs):
                    doc.metadata["id"] = chunk_ids[i]
                await vectorstore.aadd_documents(docs)

        logging.info(f"Successfully indexed {url}")
        return docs

    except Exception as e:
        if retry > 0:
            logging.warning(f"⚠️ Retry {url} after error: {e}")
            await asyncio.sleep(1)
            return await index_url(url, config, index, retry=retry - 1)
        else:
            logging.error(f"Final failure for {url}: {e}")
            return []
155+
156+
# Define the graph structure
110157
builder = StateGraph(IndexState, input=InputState, config_schema=IndexConfiguration)
111158
builder.add_node(check_index_config)
112159
builder.add_node(index_docs)
@@ -115,6 +162,7 @@ async def index_url(url: str, config: IndexConfiguration) -> List[Document]:
115162
builder.add_edge("check_index_config", "get_sitemap_urls")
116163
builder.add_edge("get_sitemap_urls", "index_docs")
117164
builder.add_edge("index_docs", END)
118-
# Compile into a graph object that you can invoke and deploy.
165+
166+
# Compile the state graph for execution
119167
graph = builder.compile()
120168
graph.name = "IndexGraph"

src/retrieval_graph/graph.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,11 +14,10 @@
1414

1515
from retrieval_graph.configuration import AgentConfiguration
1616
from retrieval_graph.researcher_graph.graph import graph as researcher_graph
17-
from retrieval_graph.state import AgentState, InputState, Router
17+
from retrieval_graph.state import AgentState, InputState
1818
from shared.utils import format_docs, load_chat_model
1919

2020

21-
2221
async def respond_to_general_query(
2322
state: AgentState, *, config: RunnableConfig
2423
) -> dict[str, list[BaseMessage]]:

src/shared/retrieval.py

Lines changed: 51 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,8 @@
55
"""
66

77
import os
8-
from contextlib import contextmanager
9-
from typing import Generator
10-
8+
from contextlib import contextmanager, asynccontextmanager
9+
from typing import AsyncGenerator, Tuple
1110
from langchain_core.embeddings import Embeddings
1211
from langchain_core.runnables import RunnableConfig
1312
from langchain_core.vectorstores import VectorStoreRetriever
@@ -29,33 +28,68 @@ def make_text_encoder(model: str) -> Embeddings:
2928

3029

3130
## Retriever constructors
32-
@contextmanager
33-
def make_pinecone_retriever(
31+
@asynccontextmanager
async def make_pinecone_retriever(
    configuration: BaseConfiguration, embedding_model: Embeddings
) -> AsyncGenerator[Tuple[VectorStoreRetriever, "PineconeVectorStore"], None]:
    """Configure this agent to connect to a specific Pinecone index and return both retriever and vectorstore.

    Reads PINECONE_API_KEY, PINECONE_ENVIRONMENT and PINECONE_INDEX_NAME from
    the environment. Creates the index (serverless, 1536-dim, cosine) if it
    does not exist yet.

    Yields:
        A ``(retriever, vectorstore)`` tuple.
    """
    from langchain_pinecone import PineconeVectorStore
    from pinecone import Pinecone, ServerlessSpec

    # NOTE(review): the v3+ serverless Pinecone client takes only api_key;
    # confirm the installed SDK still accepts `environment=` — it may be ignored.
    pinecone_client = Pinecone(
        api_key=os.environ["PINECONE_API_KEY"],
        environment=os.environ["PINECONE_ENVIRONMENT"]
    )

    index_name = os.environ["PINECONE_INDEX_NAME"]
    indexes = pinecone_client.list_indexes().names()

    print("🔎 Index disponibles :", indexes)

    if index_name not in indexes:
        print(f"⚠️ L'index '{index_name}' n'existe pas. Création...")
        # Dimension 1536 matches OpenAI ada-002 / 3-small embeddings —
        # TODO(review): confirm it matches the configured embedding model.
        pinecone_client.create_index(
            name=index_name,
            dimension=1536,
            metric="cosine",
            spec=ServerlessSpec(
                cloud="aws",  # or "gcp"
                region="us-east-1"  # adapt
            )
        )
        print(f"✅ Index '{index_name}' créé.")

    vectorstore = PineconeVectorStore.from_existing_index(
        index_name=index_name,
        embedding=embedding_model
    )

    retriever = vectorstore.as_retriever(search_kwargs=configuration.search_kwargs)

    yield retriever, vectorstore
73+
74+
@asynccontextmanager
async def make_retriever(
    config: RunnableConfig,
) -> AsyncGenerator[Tuple[VectorStoreRetriever, object], None]:
    """
    Create a retriever for the agent, based on the current configuration.
    Returns both the retriever and the underlying vectorstore (if available).
    """
    configuration = BaseConfiguration.from_runnable_config(config)
    embedding_model = make_text_encoder(configuration.embedding_model)

    provider = configuration.retriever_provider
    if provider == "pinecone":
        async with make_pinecone_retriever(configuration, embedding_model) as pair:
            retriever, vectorstore = pair
            yield retriever, vectorstore
    else:
        supported = ", ".join(
            BaseConfiguration.__annotations__["retriever_provider"].__args__
        )
        raise ValueError(
            "Unrecognized retriever_provider in configuration. "
            f"Expected one of: {supported}\n"
            f"Got: {provider}"
        )

src/shared/utils.py

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,13 @@
44
format_docs: Convert documents to an xml-formatted string.
55
load_chat_model: Load a chat model from a model name.
66
"""
7-
7+
import os
88
from typing import Optional
99

1010
from langchain.chat_models import init_chat_model
1111
from langchain_core.documents import Document
1212
from langchain_core.language_models import BaseChatModel
13+
from pinecone import Index, Pinecone
1314

1415

1516
def _format_doc(doc: Document) -> str:
@@ -62,6 +63,17 @@ def format_docs(docs: Optional[list[Document]]) -> str:
6263
{formatted}
6364
</documents>"""
6465

66+
def load_pinecone_index(index_name: str) -> Index:
    """Return a handle to the named Pinecone index.

    Args:
        index_name (str): The name of the Pinecone index to load.

    Returns:
        Index: The Pinecone index.
    """
    api_key = os.environ["PINECONE_API_KEY"]
    return Pinecone(api_key=api_key).Index(index_name)
6577

6678
def load_chat_model(fully_specified_name: str) -> BaseChatModel:
6779
"""Load a chat model from a fully specified name.

0 commit comments

Comments
 (0)