-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
203 lines (161 loc) · 6.03 KB
/
main.py
File metadata and controls
203 lines (161 loc) · 6.03 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
from flask import Flask, request, jsonify
from flask_cors import CORS
import os
import tempfile
from werkzeug.utils import secure_filename
import uuid
import time
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_ollama import OllamaEmbeddings
from langchain_ollama.llms import OllamaLLM
from langchain_core.prompts import ChatPromptTemplate
app = Flask(__name__)
CORS(app) # Enable CORS for all routes
# Configuration
UPLOAD_FOLDER = 'pdfs/'
os.makedirs(UPLOAD_FOLDER, exist_ok=True)
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
app.config['MAX_CONTENT_LENGTH'] = 16 * 1024 * 1024 # 16MB max upload size
# Initialize embeddings and LLM model
# NOTE(review): both models are served by a local Ollama instance — the
# service assumes Ollama is reachable with these models pulled.
embeddings = OllamaEmbeddings(model="nomic-embed-text")
model = OllamaLLM(model="gemma3:4b")
# Store vector databases in memory
# Maps doc_id (uuid4 string) -> FAISS store. Held only in process memory,
# so every indexed document is lost on restart even though the PDF file
# itself persists in UPLOAD_FOLDER.
vector_stores = {}
# Chat prompt template
template = """
You are an assistant that answers questions. Using the following retrieved information, answer the user question. If you don't know the answer, say that you don't know.
Question: {question}
Context: {context}
Answer:
"""
prompt_template = ChatPromptTemplate.from_template(template)
@app.route('/upload', methods=['POST'])
def upload_pdf():
    """
    Upload a PDF, save it to UPLOAD_FOLDER, and build its vector store.

    Expects a multipart/form-data request with a 'file' part holding a PDF.
    On success returns JSON {'success': True, 'doc_id': ..., 'filename': ...};
    returns 400 for a missing or non-PDF file and 500 on processing errors.
    """
    if 'file' not in request.files:
        return jsonify({'error': 'No file part in the request'}), 400
    file = request.files['file']
    if file.filename == '':
        return jsonify({'error': 'No file selected'}), 400
    if not file.filename.lower().endswith('.pdf'):
        return jsonify({'error': 'Only PDF files are supported'}), 400
    try:
        # Unique ID ties the saved file to its in-memory vector store.
        doc_id = str(uuid.uuid4())
        filename = secure_filename(file.filename)
        # Save as "<doc_id>_<filename>": /documents and /delete rely on
        # this prefix scheme to recover and remove the original file.
        # (Bug fix: previously saved with a literal placeholder instead of
        # the sanitized filename, breaking filename recovery.)
        file_path = os.path.join(app.config['UPLOAD_FOLDER'], f"{doc_id}_{filename}")
        file.save(file_path)
        # Index the PDF; failures (corrupt PDF, embedding errors) fall
        # through to the error handler below.
        vector_stores[doc_id] = create_vector_store(file_path)
        return jsonify({
            'success': True,
            'doc_id': doc_id,
            'filename': filename
        })
    except Exception as e:
        # Top-level boundary: surface the failure to the client as JSON.
        return jsonify({'error': str(e)}), 500
@app.route('/query', methods=['POST'])
def query_document():
    """
    Answer a question against a previously uploaded document.

    Expects JSON {'doc_id': ..., 'question': ...}. Returns the generated
    answer plus a de-duplicated list of source filename/page references;
    400 for malformed input, 404 for an unknown doc_id, 500 on failure.
    """
    payload = request.json
    if not payload:
        return jsonify({'error': 'No data provided'}), 400
    doc_id = payload.get('doc_id')
    question = payload.get('question')
    if not doc_id or not question:
        return jsonify({'error': 'Missing doc_id or question'}), 400
    if doc_id not in vector_stores:
        return jsonify({'error': 'Document not found'}), 404
    try:
        # Similarity search, then LLM answer grounded in the hits.
        relevant = retrieve_docs(vector_stores[doc_id], question)
        answer = question_pdf(question, relevant)
        # Collect unique (filename, page) pairs for attribution.
        sources = []
        for chunk in relevant:
            if hasattr(chunk, 'metadata') and 'source' in chunk.metadata:
                entry = {
                    'filename': os.path.basename(chunk.metadata['source']),
                    'page': chunk.metadata.get('page', 'unknown')
                }
                if entry not in sources:
                    sources.append(entry)
        return jsonify({'answer': answer, 'sources': sources})
    except Exception as e:
        return jsonify({'error': str(e)}), 500
@app.route('/documents', methods=['GET'])
def list_documents():
    """
    List every document currently indexed in memory.

    Returns JSON {'documents': [{'doc_id': ..., 'filename': ...}, ...]}.
    """
    upload_dir = app.config['UPLOAD_FOLDER']
    documents = []
    for doc_id in vector_stores:
        # Files are stored as "<doc_id>_<original filename>"; strip the
        # prefix to recover what the user uploaded.
        prefix = f"{doc_id}_"
        matches = [name for name in os.listdir(upload_dir) if name.startswith(prefix)]
        original = matches[0].replace(prefix, '') if matches else "Unknown"
        documents.append({'doc_id': doc_id, 'filename': original})
    return jsonify({'documents': documents})
@app.route('/delete/<doc_id>', methods=['DELETE'])
def delete_document(doc_id):
    """
    Drop a document's vector store and remove its saved PDF.

    Returns {'success': True} on success, 404 for an unknown doc_id,
    500 if file removal fails.
    """
    if doc_id not in vector_stores:
        return jsonify({'error': 'Document not found'}), 404
    try:
        # Forget the in-memory index first, then clean up the disk copy.
        del vector_stores[doc_id]
        upload_dir = app.config['UPLOAD_FOLDER']
        prefix = f"{doc_id}_"
        for name in os.listdir(upload_dir):
            if name.startswith(prefix):
                os.remove(os.path.join(upload_dir, name))
        return jsonify({'success': True})
    except Exception as e:
        return jsonify({'error': str(e)}), 500
def create_vector_store(file_path):
    """
    Load a PDF, split it into overlapping chunks, and index them in FAISS.

    Uses the module-level Ollama embeddings; returns the FAISS store.
    """
    pages = PyPDFLoader(file_path).load()
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=2000,
        chunk_overlap=300,
        add_start_index=True
    )
    chunks = splitter.split_documents(pages)
    return FAISS.from_documents(chunks, embeddings)
def retrieve_docs(db, query, k=4):
    """
    Return the k document chunks in *db* most similar to *query*.

    *db* is any vector store exposing `similarity_search(query, k)`.
    """
    hits = db.similarity_search(query, k)
    return hits
def question_pdf(question, documents):
    """
    Generate an answer to *question* grounded in the retrieved *documents*.

    Joins the chunks' page content into a single context string, runs the
    module-level prompt/model chain, and returns the answer as a string.
    """
    context = "\n\n".join(doc.page_content for doc in documents)
    chain = prompt_template | model
    response = chain.invoke({"question": question, "context": context})
    # The LLM wrapper may return a message object or a bare string.
    if hasattr(response, 'content'):
        return response.content
    return str(response)
if __name__ == '__main__':
    # NOTE(review): debug=True combined with host 0.0.0.0 exposes the
    # Werkzeug interactive debugger to the whole network — disable debug
    # (or bind to 127.0.0.1) for any non-local deployment.
    app.run(host='0.0.0.0', port=5000, debug=True)