-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathapp.py
More file actions
124 lines (103 loc) · 4.05 KB
/
app.py
File metadata and controls
124 lines (103 loc) · 4.05 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
from flask import Flask, request, jsonify
from langchain_community.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import SentenceTransformerEmbeddings
from pinecone import Pinecone, ServerlessSpec
from langchain_pinecone import PineconeVectorStore
import google.generativeai as genai
from apify_client import ApifyClient
import os
import time
from flask_cors import CORS
app = Flask(__name__)
CORS(app)

# SECURITY: these API keys are hard-coded and committed to source control.
# They should be considered leaked — rotate them and supply replacements via
# the environment or a secrets manager.
# Fix: use setdefault instead of unconditional assignment so an externally
# configured key is no longer silently clobbered by the baked-in fallback.
os.environ.setdefault("PINECONE_API_KEY", "pcsk_6ydvUG_N2kdjNryw3QE7YTG5fGMpRYLLG1N8M8871R1qmXHFdduRrcec5Msq5fp2fsJWaN")
os.environ.setdefault("GOOGLE_API_KEY", "AIzaSyCIRm7rkRsm5SjGuX8QIHaEKyLlRJZxlDE")

# Single Pinecone client created at import time and shared by all requests.
pinecone_api_key = os.environ["PINECONE_API_KEY"]
pc = Pinecone(api_key=pinecone_api_key)

# Directory where crawled page text is written before being indexed.
directory = "./Content"
# Helper functions
def load_docs(directory):
    """Load every document found under *directory* using DirectoryLoader."""
    return DirectoryLoader(directory).load()
def split_docs(documents, chunk_size=500, chunk_overlap=20):
    """Split *documents* into overlapping character chunks for embedding."""
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(documents)
def crawl_website(url, max_crawl_depth=5, max_crawl_pages=50):
    """Crawl *url* with Apify, dump the page text to disk, and index it in Pinecone.

    Parameters:
        url: start URL handed to the Apify crawler actor.
        max_crawl_depth: maximum link depth the crawler follows.
        max_crawl_pages: upper bound on pages crawled (was hard-coded to 50;
            now a backward-compatible parameter).

    Returns:
        A PineconeVectorStore built from the crawled, chunked documents.
    """
    # SECURITY: the Apify token is hard-coded and committed to source control;
    # rotate it and read it from the environment instead.
    client = ApifyClient("apify_api_5q5GdSg0k2cHWGiXvJBfzRBiWn8Dqw2KN32B")
    run_input = {
        "startUrls": [{"url": url}],
        "maxCrawlPages": max_crawl_pages,
        "maxCrawlDepth": max_crawl_depth,
    }
    run = client.actor("aYG0l9s7dbB7j3gbS").call(run_input=run_input)

    # Fix: create the output directory before opening the file — otherwise the
    # first crawl on a fresh checkout raises FileNotFoundError.
    os.makedirs(directory, exist_ok=True)
    with open(f"{directory}/data.txt", "w", encoding="utf-8") as file:
        for item in client.dataset(run["defaultDatasetId"]).iterate_items():
            text = item["text"]
            file.write(f"Text: {text}\n\n")

    documents = load_docs(directory)
    docs = split_docs(documents)

    embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

    # Create the index on first use; dimension 384 matches the output size of
    # the all-MiniLM-L6-v2 embedding model.
    index_name = "langchain-index1"
    existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]
    if index_name not in existing_indexes:
        pc.create_index(
            name=index_name,
            dimension=384,
            metric="cosine",
            spec=ServerlessSpec(cloud="aws", region="us-east-1"),
        )
        # Block until the freshly created index reports ready.
        # NOTE(review): the scraped source lost its indentation, so whether this
        # wait was inside the `if` is ambiguous; waiting only after creation is
        # the standard pattern — confirm against the original file.
        while not pc.describe_index(index_name).status["ready"]:
            time.sleep(1)

    # Removed an unused `index = pc.Index(index_name)` local from the original.
    docsearch = PineconeVectorStore.from_documents(
        docs, embeddings, index_name=index_name
    )
    return docsearch
def generate_response(docsearch, query):
    """Answer *query* with Gemini, grounded on similar chunks from *docsearch*."""
    genai.configure(api_key=os.environ["GOOGLE_API_KEY"])
    retrieved = docsearch.similarity_search(query)
    # The retrieved documents are interpolated directly into the prompt text.
    prompt = f"User's Query: {query}\n\nContext: {retrieved}\nGenerate a response using the provided context."
    model = genai.GenerativeModel("gemini-1.5-flash")
    return model.generate_content(prompt)
# Flask endpoints
@app.route('/crawl', methods=['POST'])
def crawl():
    """Crawl and index the website named in the JSON body: {"url": "..."}.

    Returns 200 on success, 400 when the URL is missing, 500 on crawl failure.
    """
    # Fix: request.json errors out (or is None) on a missing/non-JSON body,
    # producing a 500 instead of the intended 400. get_json(silent=True)
    # degrades gracefully so the validation below handles every bad input.
    data = request.get_json(silent=True) or {}
    url = data.get("url")
    if not url:
        return jsonify({"error": "URL is required"}), 400
    try:
        # Return value intentionally discarded — the crawl's side effect is
        # populating the Pinecone index (removed an unused local).
        crawl_website(url, 3)
        return jsonify({"message": "Website crawled successfully"}), 200
    except Exception as e:
        # Broad catch is deliberate at this HTTP boundary: report the failure
        # to the client as a 500 rather than crashing the worker.
        return jsonify({"error": str(e)}), 500
@app.route('/query', methods=['POST'])
def query():
    """Answer a user question from the JSON body: {"query": "..."}.

    Retrieves context from the existing Pinecone index and returns the
    Gemini-generated answer. 400 when the query is missing, 500 on failure.
    """
    # Fix: request.json errors out (or is None) on a missing/non-JSON body,
    # producing a 500 instead of the intended 400 (same fix as /crawl).
    data = request.get_json(silent=True) or {}
    user_query = data.get("query")
    if not user_query:
        return jsonify({"error": "Query is required"}), 400
    try:
        index_name = "langchain-index1"
        # Must match the model used at indexing time in crawl_website.
        embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
        docsearch = PineconeVectorStore.from_existing_index(index_name, embeddings)
        response = generate_response(docsearch, user_query)
        return jsonify({"response": response.text}), 200
    except Exception as e:
        # Broad catch at the HTTP boundary — surface the error as a 500.
        return jsonify({"error": str(e)}), 500
# Run the app
if __name__ == '__main__':
    # NOTE(review): debug=True enables the Werkzeug debugger/reloader and
    # allows arbitrary code execution from the browser — never use it in
    # production; gate it behind an environment variable for deploys.
    app.run(debug=True)