rag2web/main_not_optimized.py at main · lysandroc/rag2web · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import ollama
import requests
from bs4 import BeautifulSoup
import psycopg2
from pgvector.psycopg2 import register_vector

class WebpageQA:
    """Index the text of a web page in PostgreSQL/pgvector and query it.

    Typical flow: ``fetch_webpage`` -> ``process_html`` ->
    ``generate_embeddings`` -> ``ask_question``.  Embeddings are produced by
    the Ollama model named in ``model`` and stored in the
    ``webpage_embeddings`` table.

    The instance owns one database connection.  Release it with ``close()``
    or by using the instance as a context manager; ``__del__`` is kept only
    as a last-resort fallback.
    """

    # NOTE(security): development defaults carried over unchanged so existing
    # callers keep working. Pass ``db_params`` explicitly for anything that is
    # not a local throw-away database.
    DEFAULT_DB_PARAMS = {
        'dbname': 'postgres',
        'user': 'postgres',
        'password': '123456',
        'host': 'localhost',
        'port': '5432'
    }

    def __init__(self, model="crewai-llama-model", db_params=None, embedding_dim=4096):
        """Connect to the database and make sure the embeddings table exists.

        Args:
            model: Name of the Ollama model used to compute embeddings.
            db_params: Keyword arguments for ``psycopg2.connect``.  When
                omitted (or empty) ``DEFAULT_DB_PARAMS`` is used.
            embedding_dim: Dimension of the ``VECTOR`` column created when the
                table does not exist yet.  It must match the length of the
                vectors returned by ``model``.  An already existing table is
                left as it is.

        Raises:
            ValueError: If ``embedding_dim`` is not a positive integer.
        """
        # Set the handles first so close()/__del__ are safe even when
        # validation or the connection attempt below fails.
        self.conn = None
        self.cursor = None

        embedding_dim = int(embedding_dim)
        if embedding_dim <= 0:
            raise ValueError("embedding_dim must be a positive integer")

        self.model = model
        self.embedding_dim = embedding_dim
        self.text_data = []
        self.embeddings = []
        # Copy so later changes to the caller's dict do not affect this object.
        self.db_params = dict(db_params) if db_params else dict(self.DEFAULT_DB_PARAMS)

        try:
            self._connect_db()
            self._setup_table()
        except Exception:
            # Do not leak a half-initialised connection.
            self.close()
            raise

    def __enter__(self):
        """Return the instance itself for use in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the database connection when leaving a ``with`` block."""
        self.close()

    def _connect_db(self):
        """Connects to the PostgreSQL database and registers pgvector."""
        self.conn = psycopg2.connect(**self.db_params)
        self.cursor = self.conn.cursor()
        # The extension must exist before register_vector() runs: the adapter
        # looks up the ``vector`` type and fails on a fresh database otherwise.
        self.cursor.execute("CREATE EXTENSION IF NOT EXISTS vector")
        self.conn.commit()
        register_vector(self.conn)

    def _setup_table(self):
        """Sets up the table for storing embeddings."""
        # Idempotent; kept here so this method also works when called alone.
        self.cursor.execute("CREATE EXTENSION IF NOT EXISTS vector")
        # embedding_dim was validated as a positive int in __init__, so it is
        # safe to place it in the DDL text.
        self.cursor.execute(
            "CREATE TABLE IF NOT EXISTS webpage_embeddings ("
            "id SERIAL PRIMARY KEY, "
            "text TEXT, "
            f"embedding VECTOR({self.embedding_dim})"
            ")"
        )
        self.conn.commit()

    def fetch_webpage(self, url, timeout=30):
        """Fetches the HTML content of a webpage.

        Args:
            url: Address of the page to download.
            timeout: Seconds to wait for the server before giving up, so a
                stalled server cannot block the program forever.

        Returns:
            The response body as text.

        Raises:
            Exception: If the final response status is not 200.
        """
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            raise Exception(f"Failed to fetch the webpage: {url}")
        return response.text

    def process_html(self, html_content):
        """Extracts text from HTML using BeautifulSoup.

        Stores the non-empty, whitespace-stripped lines of the visible text in
        ``self.text_data``, replacing any previous content.
        """
        soup = BeautifulSoup(html_content, "html.parser")
        for tag in soup(["script", "style"]):
            tag.extract()  # Remove script and style elements
        lines = (line.strip() for line in soup.get_text().split("\n"))
        self.text_data = [line for line in lines if line]

    def generate_embeddings(self):
        """Generates embeddings using Ollama and stores them in PostgreSQL.

        One row is inserted per entry of ``self.text_data``; rows already in
        the table are not removed.  All inserts are committed together.  If
        anything fails the transaction is rolled back, ``self.embeddings`` is
        left unchanged, and the exception is re-raised.
        """
        new_embeddings = []
        try:
            for text in self.text_data:
                embedding = self.get_embedding(text)
                new_embeddings.append(embedding)
                self.cursor.execute(
                    "INSERT INTO webpage_embeddings (text, embedding) VALUES (%s, %s)",
                    (text, embedding)
                )
            self.conn.commit()
        except Exception:
            # A failed statement leaves the transaction aborted; roll back so
            # the connection stays usable for later calls.
            self.conn.rollback()
            raise
        # Publish only after a successful commit so memory mirrors the table.
        self.embeddings.extend(new_embeddings)

    def get_embedding(self, text):
        """Fetches embedding from Ollama for ``text`` using ``self.model``."""
        response = ollama.embeddings(model=self.model, prompt=text)
        return response["embedding"]

    def search_similar_text(self, query, top_k=2):
        """Finds the most relevant text snippets using pgvector.

        Args:
            query: Text to embed and compare against the stored rows.
            top_k: Maximum number of snippets to return.

        Returns:
            A list with the ``text`` column of the ``top_k`` rows ordered by
            the ``<->`` distance to the query embedding.
        """
        query_embedding = self.get_embedding(query)
        # pgvector's text form is "[v1,v2,...]"; it is cast with ::vector below.
        query_embedding_str = f'[{",".join(map(str, query_embedding))}]'
        try:
            self.cursor.execute(
                "SELECT text FROM webpage_embeddings ORDER BY embedding <-> %s::vector LIMIT %s",
                (query_embedding_str, top_k)
            )
            rows = self.cursor.fetchall()
        except Exception:
            self.conn.rollback()
            raise
        # End the implicit read transaction so the session does not sit
        # idle-in-transaction while waiting for the next question.
        self.conn.commit()
        return [row[0] for row in rows]

    def ask_question(self, query):
        """Returns the most relevant text snippets for a given question.

        The snippets found by ``search_similar_text`` are joined with newlines.
        """
        return "\n".join(self.search_similar_text(query))

    def close(self):
        """Closes the cursor and the connection; safe to call more than once."""
        # getattr: the object may be only partially initialised (for example
        # created via __new__ or after __init__ raised early).
        cursor = getattr(self, "cursor", None)
        conn = getattr(self, "conn", None)
        self.cursor = None
        self.conn = None
        if cursor is not None:
            cursor.close()
        if conn is not None:
            conn.close()

    def __del__(self):
        """Closes the database connection."""
        self.close()

if __name__ == "__main__":
    # Interactive demo: index a single page, then answer questions about it
    # until the user types 'exit' (or sends EOF / Ctrl-C at the prompt).
    url = input("Enter a webpage URL: ").strip()
    qa_system = WebpageQA(db_params={
        'dbname': 'postgres',
        'user': 'postgres',
        'password': '123456',
        'host': 'localhost',
        'port': '5432'
    })

    print("Fetching webpage...")
    html_content = qa_system.fetch_webpage(url)

    print("Processing HTML...")
    qa_system.process_html(html_content)

    print("Generating embeddings and storing in PostgreSQL...")
    qa_system.generate_embeddings()

    while True:
        try:
            query = input("Ask a question about the page (or type 'exit' to quit): ").strip()
        except (EOFError, KeyboardInterrupt):
            # Treat Ctrl-D / Ctrl-C at the prompt as a normal exit instead of
            # ending the session with a traceback.
            print()
            break
        if query.lower() == 'exit':
            break
        if not query:
            # Nothing to embed; ask again rather than querying with "".
            continue
        answer = qa_system.ask_question(query)
        print("\nAnswer:\n", answer, "\n")