-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
203 lines (161 loc) · 6.03 KB
/
main.py
File metadata and controls
203 lines (161 loc) · 6.03 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
from flask import Flask, request, jsonify
from flask_cors import CORS
import os
import tempfile
from werkzeug.utils import secure_filename
import uuid
import time
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_ollama import OllamaEmbeddings
from langchain_ollama.llms import OllamaLLM
from langchain_core.prompts import ChatPromptTemplate
app = Flask(__name__)
CORS(app) # Enable CORS for all routes
# Configuration
UPLOAD_FOLDER = 'pdfs/'
os.makedirs(UPLOAD_FOLDER, exist_ok=True)
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
app.config['MAX_CONTENT_LENGTH'] = 16 * 1024 * 1024 # 16MB max upload size
# Initialize embeddings and LLM model
# NOTE(review): both models are served by a local Ollama instance — the
# service assumes Ollama is reachable with these models pulled.
embeddings = OllamaEmbeddings(model="nomic-embed-text")
model = OllamaLLM(model="gemma3:4b")
# Store vector databases in memory
# Maps doc_id (uuid4 string) -> FAISS store. Held only in process memory,
# so every indexed document is lost on restart even though the PDF file
# itself persists in UPLOAD_FOLDER.
vector_stores = {}
# Chat prompt template
template = """
You are an assistant that answers questions. Using the following retrieved information, answer the user question. If you don't know the answer, say that you don't know.
Question: {question}
Context: {context}
Answer:
"""
prompt_template = ChatPromptTemplate.from_template(template)
@app.route('/upload', methods=['POST'])
def upload_pdf():
    """
    Upload a PDF, save it to UPLOAD_FOLDER, and build its vector store.

    Expects a multipart/form-data request with a 'file' part holding a PDF.
    On success returns JSON {'success': True, 'doc_id': ..., 'filename': ...};
    returns 400 for a missing or non-PDF file and 500 on processing errors.
    """
    if 'file' not in request.files:
        return jsonify({'error': 'No file part in the request'}), 400
    file = request.files['file']
    if file.filename == '':
        return jsonify({'error': 'No file selected'}), 400
    if not file.filename.lower().endswith('.pdf'):
        return jsonify({'error': 'Only PDF files are supported'}), 400
    try:
        # Unique ID ties the saved file to its in-memory vector store.
        doc_id = str(uuid.uuid4())
        filename = secure_filename(file.filename)
        # Save as "<doc_id>_<filename>": /documents and /delete rely on
        # this prefix scheme to recover and remove the original file.
        # (Bug fix: previously saved with a literal placeholder instead of
        # the sanitized filename, breaking filename recovery.)
        file_path = os.path.join(app.config['UPLOAD_FOLDER'], f"{doc_id}_{filename}")
        file.save(file_path)
        # Index the PDF; failures (corrupt PDF, embedding errors) fall
        # through to the error handler below.
        vector_stores[doc_id] = create_vector_store(file_path)
        return jsonify({
            'success': True,
            'doc_id': doc_id,
            'filename': filename
        })
    except Exception as e:
        # Top-level boundary: surface the failure to the client as JSON.
        return jsonify({'error': str(e)}), 500
@app.route('/query', methods=['POST'])
def query_document():
    """
    Answer a question against a previously uploaded document.

    Expects JSON {'doc_id': ..., 'question': ...}. Returns the generated
    answer plus a de-duplicated list of source filename/page references;
    400 for malformed input, 404 for an unknown doc_id, 500 on failure.
    """
    payload = request.json
    if not payload:
        return jsonify({'error': 'No data provided'}), 400
    doc_id = payload.get('doc_id')
    question = payload.get('question')
    if not doc_id or not question:
        return jsonify({'error': 'Missing doc_id or question'}), 400
    if doc_id not in vector_stores:
        return jsonify({'error': 'Document not found'}), 404
    try:
        # Similarity search, then LLM answer grounded in the hits.
        relevant = retrieve_docs(vector_stores[doc_id], question)
        answer = question_pdf(question, relevant)
        # Collect unique (filename, page) pairs for attribution.
        sources = []
        for chunk in relevant:
            if hasattr(chunk, 'metadata') and 'source' in chunk.metadata:
                entry = {
                    'filename': os.path.basename(chunk.metadata['source']),
                    'page': chunk.metadata.get('page', 'unknown')
                }
                if entry not in sources:
                    sources.append(entry)
        return jsonify({'answer': answer, 'sources': sources})
    except Exception as e:
        return jsonify({'error': str(e)}), 500
@app.route('/documents', methods=['GET'])
def list_documents():
    """
    List every document currently indexed in memory.

    Returns JSON {'documents': [{'doc_id': ..., 'filename': ...}, ...]}.
    """
    upload_dir = app.config['UPLOAD_FOLDER']
    documents = []
    for doc_id in vector_stores:
        # Files are stored as "<doc_id>_<original filename>"; strip the
        # prefix to recover what the user uploaded.
        prefix = f"{doc_id}_"
        matches = [name for name in os.listdir(upload_dir) if name.startswith(prefix)]
        original = matches[0].replace(prefix, '') if matches else "Unknown"
        documents.append({'doc_id': doc_id, 'filename': original})
    return jsonify({'documents': documents})
@app.route('/delete/<doc_id>', methods=['DELETE'])
def delete_document(doc_id):
    """
    Drop a document's vector store and remove its saved PDF.

    Returns {'success': True} on success, 404 for an unknown doc_id,
    500 if file removal fails.
    """
    if doc_id not in vector_stores:
        return jsonify({'error': 'Document not found'}), 404
    try:
        # Forget the in-memory index first, then clean up the disk copy.
        del vector_stores[doc_id]
        upload_dir = app.config['UPLOAD_FOLDER']
        prefix = f"{doc_id}_"
        for name in os.listdir(upload_dir):
            if name.startswith(prefix):
                os.remove(os.path.join(upload_dir, name))
        return jsonify({'success': True})
    except Exception as e:
        return jsonify({'error': str(e)}), 500
def create_vector_store(file_path):
    """
    Load a PDF, split it into overlapping chunks, and index them in FAISS.

    Uses the module-level Ollama embeddings; returns the FAISS store.
    """
    pages = PyPDFLoader(file_path).load()
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=2000,
        chunk_overlap=300,
        add_start_index=True
    )
    chunks = splitter.split_documents(pages)
    return FAISS.from_documents(chunks, embeddings)
def retrieve_docs(db, query, k=4):
    """
    Return the k document chunks in *db* most similar to *query*.

    *db* is any vector store exposing `similarity_search(query, k)`.
    """
    hits = db.similarity_search(query, k)
    return hits
def question_pdf(question, documents):
    """
    Generate an answer to *question* grounded in the retrieved *documents*.

    Joins the chunks' page content into a single context string, runs the
    module-level prompt/model chain, and returns the answer as a string.
    """
    context = "\n\n".join(doc.page_content for doc in documents)
    chain = prompt_template | model
    response = chain.invoke({"question": question, "context": context})
    # The LLM wrapper may return a message object or a bare string.
    if hasattr(response, 'content'):
        return response.content
    return str(response)
if __name__ == '__main__':
    # NOTE(review): debug=True combined with host 0.0.0.0 exposes the
    # Werkzeug interactive debugger to the whole network — disable debug
    # (or bind to 127.0.0.1) for any non-local deployment.
    app.run(host='0.0.0.0', port=5000, debug=True)