# main_not_optimized.py
import ollama
import requests
from bs4 import BeautifulSoup
import psycopg2
from pgvector.psycopg2 import register_vector
class WebpageQA:
    """Answer questions about a web page via embedding similarity search.

    The page text is split into non-empty lines, each line is embedded with an
    Ollama model, and the (text, embedding) pairs are stored in the PostgreSQL
    table ``webpage_embeddings`` (pgvector column).  A question is answered by
    returning the stored lines whose embeddings are closest to the question's
    embedding.

    An instance owns an open database connection.  Call :meth:`close` when
    done, or use the instance as a context manager.
    """

    def __init__(self, model="crewai-llama-model", db_params=None, embedding_dim=4096):
        """Connect to PostgreSQL and make sure the storage table exists.

        Args:
            model: Name of the Ollama model used to compute embeddings.
            db_params: Keyword arguments for ``psycopg2.connect``.  When
                omitted (or empty), local development defaults are used.
            embedding_dim: Dimension of the ``VECTOR`` column created for the
                embeddings.  It has to match the length of the vectors the
                chosen model returns.  Only used when the table is created;
                an already existing table keeps its dimension.

        Raises:
            ValueError: If ``embedding_dim`` is not a positive integer.
        """
        # Assigned first so close()/__del__ are safe even if setup fails below.
        self.conn = None
        self.cursor = None
        self.embedding_dim = self._validate_embedding_dim(embedding_dim)
        self.model = model
        self.text_data = []
        self.embeddings = []
        # NOTE: development-only default credentials; pass db_params for
        # anything that is not a local throw-away database.
        self.db_params = db_params or {
            'dbname': 'postgres',
            'user': 'postgres',
            'password': '123456',
            'host': 'localhost',
            'port': '5432',
        }
        try:
            self._connect_db()
            self._setup_table()
        except Exception:
            # Do not leave a half-open connection behind when setup fails.
            self.close()
            raise

    @staticmethod
    def _validate_embedding_dim(embedding_dim):
        """Return ``embedding_dim`` if it is a positive int, else raise ValueError."""
        is_int = isinstance(embedding_dim, int) and not isinstance(embedding_dim, bool)
        if not is_int or embedding_dim <= 0:
            raise ValueError(
                f"embedding_dim must be a positive integer, got {embedding_dim!r}"
            )
        return embedding_dim

    @staticmethod
    def _vector_literal(embedding):
        """Format a sequence of numbers as pgvector text input, e.g. ``[1.0,2.5]``."""
        return f'[{",".join(map(str, embedding))}]'

    def _connect_db(self):
        """Connects to the PostgreSQL database and registers pgvector."""
        self.conn = psycopg2.connect(**self.db_params)
        self.cursor = self.conn.cursor()
        # The extension has to exist before register_vector() is called:
        # per the pgvector-python docs, registration looks up the ``vector``
        # type in the database and fails if the extension is not installed.
        self.cursor.execute("CREATE EXTENSION IF NOT EXISTS vector")
        self.conn.commit()
        register_vector(self.conn)

    def _setup_table(self):
        """Sets up the table for storing embeddings."""
        # embedding_dim was validated as a positive int in __init__, so it is
        # safe to interpolate; DDL type modifiers cannot be bound parameters.
        self.cursor.execute(
            f"""
            CREATE TABLE IF NOT EXISTS webpage_embeddings (
                id SERIAL PRIMARY KEY,
                text TEXT,
                embedding VECTOR({self.embedding_dim})
            )
            """
        )
        self.conn.commit()

    def fetch_webpage(self, url, timeout=30):
        """Fetches the HTML content of a webpage.

        Args:
            url: Address of the page to download.
            timeout: Seconds to wait for the server before giving up, passed
                to ``requests.get``.  Without it a stalled server would block
                the call indefinitely.

        Returns:
            The response body as text.

        Raises:
            Exception: If the response status code is not 200.
        """
        response = requests.get(url, timeout=timeout)
        if response.status_code == 200:
            return response.text
        raise Exception(f"Failed to fetch the webpage: {url}")

    def process_html(self, html_content):
        """Extracts text from HTML using BeautifulSoup.

        Script and style elements are dropped, then the remaining text is
        split on newlines; the stripped, non-empty lines are stored in
        ``self.text_data`` (replacing any previous content).
        """
        soup = BeautifulSoup(html_content, "html.parser")
        for tag in soup(["script", "style"]):
            tag.decompose()  # Remove script and style elements
        stripped_lines = (line.strip() for line in soup.get_text().split("\n"))
        self.text_data = [line for line in stripped_lines if line]

    def generate_embeddings(self):
        """Generates embeddings using Ollama and stores them in PostgreSQL.

        Every entry of ``self.text_data`` is embedded and inserted into
        ``webpage_embeddings`` in a single transaction.  The new embeddings
        are appended to ``self.embeddings`` only after the commit succeeded,
        so the in-memory list never contains rows that were rolled back.
        """
        if not self.text_data:
            return
        # Compute all embeddings before touching the database so the
        # transaction is not held open while waiting for the model.
        new_embeddings = [self.get_embedding(text) for text in self.text_data]
        try:
            for text, embedding in zip(self.text_data, new_embeddings):
                self.cursor.execute(
                    "INSERT INTO webpage_embeddings (text, embedding) VALUES (%s, %s)",
                    (text, embedding),
                )
            self.conn.commit()
        except psycopg2.Error:
            # Leave the connection usable instead of stuck in an aborted
            # transaction.
            self.conn.rollback()
            raise
        self.embeddings.extend(new_embeddings)

    def get_embedding(self, text):
        """Fetches embedding from Ollama.

        Returns the ``"embedding"`` entry of the response produced by
        ``ollama.embeddings`` for ``text`` with ``self.model``.
        """
        response = ollama.embeddings(model=self.model, prompt=text)
        return response["embedding"]

    def search_similar_text(self, query, top_k=2):
        """Finds the most relevant text snippets using pgvector.

        Args:
            query: Text to search for.
            top_k: Maximum number of snippets to return.  Values below 1
                yield an empty list without querying the model or database.

        Returns:
            A list of stored text snippets ordered by ascending ``<->``
            distance between their embedding and the query embedding.
        """
        if top_k < 1:
            return []
        query_embedding = self.get_embedding(query)
        # Sent as text and cast with ::vector so the <-> operator receives a
        # vector operand.
        query_vector = self._vector_literal(query_embedding)
        try:
            self.cursor.execute(
                "SELECT text FROM webpage_embeddings ORDER BY embedding <-> %s::vector LIMIT %s",
                (query_vector, top_k),
            )
            rows = self.cursor.fetchall()
        except psycopg2.Error:
            # A failed statement aborts the transaction; roll back so later
            # queries on this connection still work.
            self.conn.rollback()
            raise
        return [row[0] for row in rows]

    def ask_question(self, query):
        """Returns the most relevant text snippets for a given question.

        The snippets found by :meth:`search_similar_text` (default ``top_k``)
        are joined with newlines into a single string.
        """
        results = self.search_similar_text(query)
        return "\n".join(results)

    def close(self):
        """Closes the cursor and the database connection.

        Safe to call more than once and on an instance whose setup failed.
        """
        cursor = getattr(self, "cursor", None)
        conn = getattr(self, "conn", None)
        self.cursor = None
        self.conn = None
        try:
            if cursor is not None:
                cursor.close()
        finally:
            # Close the connection even if closing the cursor failed.
            if conn is not None:
                conn.close()

    def __enter__(self):
        """Returns the instance itself for use in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Closes the database resources; exceptions are not suppressed."""
        self.close()
        return False

    def __del__(self):
        """Closes the database connection."""
        try:
            self.close()
        except Exception:
            # Best effort only: a finalizer may run during interpreter
            # shutdown, and an exception raised here cannot be handled by
            # any caller anyway.
            pass
def main():
    """Run the interactive command-line session.

    Asks for a URL, downloads the page, stores embeddings for its text lines
    in PostgreSQL, then answers questions in a loop until the user types
    'exit' or ends the input (Ctrl-D / Ctrl-C).
    """
    url = input("Enter a webpage URL: ").strip()

    # Local development credentials; adjust for your own database.
    qa_system = WebpageQA(db_params={
        'dbname': 'postgres',
        'user': 'postgres',
        'password': '123456',
        'host': 'localhost',
        'port': '5432'
    })

    print("Fetching webpage...")
    html_content = qa_system.fetch_webpage(url)

    print("Processing HTML...")
    qa_system.process_html(html_content)

    print("Generating embeddings and storing in PostgreSQL...")
    qa_system.generate_embeddings()

    while True:
        try:
            query = input("Ask a question about the page (or type 'exit' to quit): ").strip()
        except (EOFError, KeyboardInterrupt):
            # End of input or Ctrl-C at the prompt is treated like 'exit'
            # instead of crashing with a traceback.
            print()
            break
        if query.lower() == 'exit':
            break
        if not query:
            # Nothing to look up; do not send an empty prompt to the model.
            continue
        answer = qa_system.ask_question(query)
        print("\nAnswer:\n", answer, "\n")


if __name__ == "__main__":
    main()