langchain-ce-app/vectorize_docs.py at main · midaz/langchain-ce-app · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
import os
import tempfile
import ssl
import certifi
import requests
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders.sitemap import SitemapLoader
from langchain_community.vectorstores import SKLearnVectorStore
from langchain_openai import OpenAIEmbeddings
import math


# Read the local .env file, if find_dotenv() locates one, before anything
# below needs its settings (presumably the OpenAI credentials used by
# OpenAIEmbeddings -- confirm against the deployment's .env).
from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv())


# Configure requests to use certifi's certificates
# NOTE(review): this only assigns a DEFAULT_CERTS attribute on the urllib3
# ssl_ helper module bundled with requests; confirm that the installed
# urllib3 version actually reads it, otherwise this line has no effect.
requests.packages.urllib3.util.ssl_.DEFAULT_CERTS = certifi.where()

def _as_threshold_retriever(vectorstore):
    """Wrap *vectorstore* in the similarity-threshold retriever used by this module."""
    return vectorstore.as_retriever(
        search_type="similarity_score_threshold",
        search_kwargs={
            "k": 4,  # Number of documents to retrieve
            "score_threshold": 0.5,  # Minimum similarity score
        },
    )


def _add_batch(vectorstore, batch, batch_number):
    """Add *batch* to the store, retrying one document at a time on failure.

    Returns True when the whole batch was added in a single call, False when
    the per-document fallback had to be used.
    """
    try:
        vectorstore.add_documents(batch)
        return True
    except Exception as batch_error:
        print(f"Error processing batch {batch_number}: {batch_error}")
        print("Trying with smaller batch...")

    # Best effort: a single bad document should not cost us the whole batch.
    for doc in batch:
        try:
            vectorstore.add_documents([doc])
        except Exception as doc_error:
            print(f"Error processing document: {doc_error}")
    return False


def get_vector_db_retriever():
    """Get or create a vector store retriever for Langchain documentation.

    The store is persisted as a parquet file named ``langchain_docs.parquet``
    in the system temp directory. If that file exists and can be loaded, it is
    reused. Otherwise the LangChain Python docs are downloaded via their
    sitemap, split into token-based chunks, embedded with ``OpenAIEmbeddings``
    in batches of 50, and persisted after every batch.

    Returns:
        A retriever over the store using ``similarity_score_threshold`` search
        with ``k=4`` and ``score_threshold=0.5``.
    """
    persist_path = os.path.join(tempfile.gettempdir(), "langchain_docs.parquet")
    print(f"Vector store path: {persist_path}")
    print(f"Vector store exists: {os.path.exists(persist_path)}")

    embd = OpenAIEmbeddings()

    # If vector store exists, then load it
    if os.path.exists(persist_path):
        print("Loading existing vector store...")
        try:
            # Constructing the store with an existing persist_path loads it.
            vectorstore = SKLearnVectorStore(
                embedding=embd,
                persist_path=persist_path,
                serializer="parquet",
            )
        except Exception as load_error:
            print(f"Error loading vector store: {load_error}")
            print("Will create new vector store instead")
            # The constructor call further down uses the same persist_path.
            # If the unreadable file stayed in place, that call would hit the
            # same load error and the rebuild would never start.
            try:
                os.remove(persist_path)
            except OSError as remove_error:
                print(f"Could not remove unreadable vector store: {remove_error}")
        else:
            print("Successfully loaded existing vector store")
            return _as_threshold_retriever(vectorstore)

    print("Creating new vector store...")
    # Otherwise, index Langchain documents and create new vector store
    # NOTE(review): verify_ssl=False turns off TLS certificate verification
    # for the crawl. Kept as-is because it looks like a deliberate workaround,
    # but it should be re-enabled once the local certificate setup works.
    python_docs_loader = SitemapLoader(
        web_path="https://python.langchain.com/sitemap.xml",
        continue_on_failure=True,
        verify_ssl=False,  # Disable SSL verification
    )

    print("Downloading Python documentation...")
    python_docs = python_docs_loader.load()
    print(f"Downloaded {len(python_docs)} Python documents")

    # Split documents using tiktoken encoder with better chunking strategy
    print("Splitting documents into chunks...")
    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=500,  # Keep small chunk size to stay under token limits
        chunk_overlap=50,  # Small overlap for context while staying under limits
        separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],  # More natural text splitting
    )
    doc_splits = text_splitter.split_documents(python_docs)
    print(f"Created {len(doc_splits)} chunks")
    if not doc_splits:
        # Nothing will be embedded or persisted below; make that visible
        # instead of silently handing back a retriever over an empty store.
        print("Warning: no chunks were produced; the vector store will be empty")

    # Create and persist vector store
    print("Creating vector store...")

    # Initialize empty vector store
    vectorstore = SKLearnVectorStore(
        embedding=embd,
        persist_path=persist_path,
        serializer="parquet",
    )

    # Process in smaller batches to handle token limits
    batch_size = 50  # Reduced batch size to stay well under token limits
    num_batches = math.ceil(len(doc_splits) / batch_size)

    batch_starts = range(0, len(doc_splits), batch_size)
    for batch_number, start_idx in enumerate(batch_starts, start=1):
        batch = doc_splits[start_idx:start_idx + batch_size]
        print(f"Processing batch {batch_number}/{num_batches} ({len(batch)} chunks)")

        added_whole_batch = _add_batch(vectorstore, batch, batch_number)

        # Persisting is handled separately from adding: a failed write must
        # not trigger the per-document retry, which would add the documents of
        # an already-added batch a second time.
        try:
            vectorstore.persist()
        except Exception as persist_error:
            print(f"Error persisting batch {batch_number}: {persist_error}")
            continue

        if added_whole_batch:
            print(f"Batch {batch_number} processed and persisted")
        else:
            print(f"Batch {batch_number} persisted after per-document retry")

    print(f"Vector store saved to: {persist_path}")
    print(f"Vector store file exists: {os.path.exists(persist_path)}")
    print(f"Vector store file size: {os.path.getsize(persist_path) if os.path.exists(persist_path) else 'N/A'} bytes")

    return _as_threshold_retriever(vectorstore)

def main():
    """Obtain the retriever and print previews for one sample query.

    Each hit is shown as its first 1000 characters between two 80-dash
    rules, followed by its metadata when that is non-empty.
    """
    retriever = get_vector_db_retriever()

    # Smoke-test the retriever with a fixed question.
    query = "How do I create a router chain in Langchain?"
    print(f"\nTesting retriever with query: {query}")

    # invoke() is used here rather than get_relevant_documents().
    docs = retriever.invoke(query)
    print(f"Found {len(docs)} relevant documents")

    print("\nRelevant document previews:")
    rule = "-" * 80
    for position, hit in enumerate(docs, start=1):
        print(f"\nDocument {position}:")
        print(rule)
        print(hit.page_content[:1000])
        print(rule)
        metadata = hit.metadata
        if metadata:
            print("Metadata:", metadata)
        print()

# Run the sample query only when this file is executed as a script, not when
# it is imported for get_vector_db_retriever().
if __name__ == "__main__":
    main()