"""Bookmark URL summariser.

Small project that reads all URLs from a bookmarks list, verifies that each
URL works, reads the page, summarises it and stores the result in a database.
A Flask interface is planned as part 2; initially this script just tests that
the pipeline works.
"""
import requests
from bs4 import BeautifulSoup
from sqlalchemy import create_engine, Column, String, Text
from sqlalchemy.orm import declarative_base, sessionmaker
from urllib.parse import urlparse, urlunparse, parse_qs, urlencode
import ollama
import uuid
import os
# Database setup
# SQLAlchemy connection URL pointing at the SQLite file ``urls.db``.
DATABASE_URL = "sqlite:///urls.db" # Database URL for SQLite
# Declarative base class that the ORM model classes in this file inherit from.
Base = declarative_base() # Base class for SQLAlchemy models
# Define a model for storing URL records
class URLRecord(Base):
    """ORM model for one processed URL.

    Each row stores the URL together with the page heading, the summary and
    the tags that were saved for it.
    """

    # The declarative mapping reads the table name from the dunder attribute
    # ``__tablename__``; a plain ``tablename`` attribute is not recognised and
    # the class cannot be mapped to a table.
    __tablename__ = 'urls'

    # Unique identifier; a random UUID4 string is generated when none is given.
    id = Column(String, primary_key=True, default=lambda: str(uuid.uuid4()))
    # The URL itself (must be unique and not null).
    url = Column(String, unique=True, nullable=False)
    # Page heading.
    heading = Column(String)
    # Summary of the page content.
    summary = Column(Text)
    # Tags extracted from the content.
    tags = Column(Text)
# Create the database and the URLRecord table
# Engine bound to the SQLite database named by DATABASE_URL.
engine = create_engine(DATABASE_URL)
# Runs at import time: creates the tables for the models declared on Base.
Base.metadata.create_all(engine)
# Session factory bound to the engine above.
Session = sessionmaker(bind=engine)
# Single module-level session shared by the functions in this file.
session = Session()
# Class to interact with the Ollama LLM
class LlmClient:
    """Small wrapper around an Ollama client bound to a single model."""

    def __init__(self, ollama_instance_url: str, model: str):
        """Store the connection settings and create the Ollama client.

        Args:
            ollama_instance_url: URL of the Ollama instance.
            model: Name of the model to use for generation.
        """
        self.ollama_instance_url = ollama_instance_url
        self.model = model
        self.client = ollama.Client(host=ollama_instance_url)

    def get_llm_response(self, input: str = "") -> str:
        """Return the complete, cleaned LLM response for the prompt ``input``."""
        response = self.client.generate(
            model=self.model,
            prompt=input,
        )
        # The generated text is carried under the "response" key.
        return self.clean_output(response["response"])

    def get_llm_response_stream(self, input: str = ""):
        """Yield the cleaned LLM response for ``input`` one chunk at a time."""
        response = self.client.generate(
            model=self.model,
            prompt=input,
            stream=True,
        )
        for chunk in response:
            yield self.clean_output(chunk["response"])

    def clean_output(self, output: str) -> str:
        """Return ``output`` with the ending token removed.

        NOTE(review): the ending token is an empty string here, which makes
        this method a no-op. It looks like the original token literal may have
        been lost -- confirm which token (if any) should be stripped.
        """
        ending_token = ""
        if not ending_token:
            # Nothing to strip; replacing "" with "" leaves the text unchanged.
            return output
        return output.replace(ending_token, "")
# Remove tracking parameters from the URL
def strip_tracking(url):
    """Return ``url`` with tracking query parameters removed.

    Parameters whose name starts with ``amp_``, ``precache_`` or ``utm_`` are
    dropped; every other part of the URL is kept.

    Args:
        url: The URL to clean.

    Returns:
        The reconstructed URL string without the tracking parameters.
    """
    tracking_prefixes = ('amp_', 'precache_', 'utm_')
    parsed_url = urlparse(url)
    # keep_blank_values=True: by default parse_qs discards parameters with an
    # empty value (e.g. "a="), which would silently remove non-tracking
    # parameters from the URL.
    query = parse_qs(parsed_url.query, keep_blank_values=True)
    kept = {k: v for k, v in query.items() if not k.startswith(tracking_prefixes)}
    # doseq=True writes repeated parameters back as separate key=value pairs.
    stripped_query = urlencode(kept, doseq=True)
    return urlunparse(parsed_url._replace(query=stripped_query))
# Fetch the content of the URL
def fetch_url_data(url, timeout=10):
    """Download ``url`` and return its text content and parsed HTML.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout the request can block indefinitely on an unresponsive
            host and stall the whole run.

    Returns:
        ``(text, soup)`` for an HTTP 200 response, where ``soup`` is the
        ``BeautifulSoup`` document and ``text`` is its extracted text.
        ``(None, None)`` for any other status code or when the request fails.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as e:
        print(f"Request failed: {e}")
        return None, None

    if response.status_code != 200:
        print(f"Failed to retrieve the URL. Status code: {response.status_code}")
        return None, None

    soup = BeautifulSoup(response.content, 'html.parser')
    return soup.get_text(), soup
# Summarise the text using Ollama
def summarise_with_ollama(llm_client, text):
    """Ask the LLM for a summary of ``text`` and derive tags from its reply.

    Only the first 2000 characters of ``text`` are sent in the prompt.

    Args:
        llm_client: Object exposing ``get_llm_response(prompt)``.
        text: Page text to summarise.

    Returns:
        ``(short_summary, tags)``. ``short_summary`` is the first non-empty
        line of the reply, cut to 200 characters plus ``'...'`` when longer.
        ``tags`` is a comma-separated string of the distinct alphabetic words
        longer than three characters, in order of first appearance.
        ``(None, None)`` if anything fails while calling the LLM or parsing
        its reply.
    """
    try:
        # Structured prompt for summarisation
        prompt = f"Please summarise the following content and extract tags: \n\n{text[:2000]}"
        response = llm_client.get_llm_response(prompt)

        # Replies may start with blank lines; taking line 0 blindly would give
        # an empty summary, which the caller treats as a failure.
        summary = next(
            (line.strip() for line in response.splitlines() if line.strip()),
            "",
        )

        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # tag string is the same on every run (set order is not).
        words = (word for word in response.split() if word.isalpha() and len(word) > 3)
        tags = ', '.join(dict.fromkeys(words))

        short_summary = (summary[:200] + '...') if len(summary) > 200 else summary
        return short_summary, tags
    except Exception as e:
        # Deliberate best-effort boundary: any failure is reported and
        # signalled to the caller as (None, None).
        print(f"Error calling Ollama: {e}")
        return None, None
# Process a single URL
def process_url(llm_client, url):
    """Fetch, summarise and store a single URL.

    Tracking parameters are stripped first, and the stripped URL is what gets
    looked up, fetched and stored.

    Args:
        llm_client: Client passed through to ``summarise_with_ollama``.
        url: The URL to process.

    Returns:
        ``True`` when a new record was saved. ``False`` when the URL is
        already in the database, the page could not be fetched, the summary
        failed, or the record could not be committed.
    """
    # Local import: only the commit guard below needs it.
    from sqlalchemy.exc import SQLAlchemyError

    stripped_url = strip_tracking(url)

    existing_record = session.query(URLRecord).filter_by(url=stripped_url).first()
    if existing_record:
        print(f"URL already processed: {stripped_url}")
        return False

    text, soup = fetch_url_data(stripped_url)
    if not (text and soup):
        print("Failed to fetch URL data.")
        return False

    # ``title.string`` can be None even when a <title> tag exists (empty tag
    # or nested markup), so fall back to the placeholder in that case too.
    title_text = soup.title.string if soup.title else None
    heading = (title_text or "").strip() or "No title found"

    summary, tags = summarise_with_ollama(llm_client, text)
    if not summary:
        print("Failed to summarise text with Ollama.")
        return False

    new_record = URLRecord(url=stripped_url, heading=heading, summary=summary, tags=tags)
    session.add(new_record)
    try:
        session.commit()
    except SQLAlchemyError as e:
        # Without a rollback the shared session stays in a failed state and
        # every later query on it would raise.
        session.rollback()
        print(f"Failed to save URL record: {e}")
        return False

    print(f"URL: {stripped_url}")
    print(f"Heading: {heading}")
    print(f"Summary: {summary}")
    print(f"Tags: {tags}")
    return True
# Main function to process a list of URLs from a file
def main():
    """Process every URL listed in ``bmurllist.txt``.

    URLs already stored in the database, and duplicates within the file, are
    skipped. URLs that fail processing are written to ``broken_urls.txt``,
    one per line.
    """
    llm_client = LlmClient(ollama_instance_url="http://localhost:8000", model="mistral")

    # Read URLs from the file, one per line
    with open("bmurllist.txt", "r") as file:
        urls = [line.strip() for line in file]

    # URLs already stored in the database (saved in tracking-stripped form)
    processed_urls = {record.url for record in session.query(URLRecord.url).all()}
    broken_urls = []

    for url in urls:
        if not url:
            continue
        # Compare the stripped form: the database holds stripped URLs, so a
        # raw URL with tracking parameters would never match, would be handed
        # to process_url, rejected as "already processed", and then wrongly
        # reported as broken.
        stripped_url = strip_tracking(url)
        if stripped_url in processed_urls:
            continue
        if not process_url(llm_client, url):
            broken_urls.append(url)
        # Record the attempt so duplicates later in the file are skipped.
        processed_urls.add(stripped_url)

    # Write broken URLs to a file
    with open("broken_urls.txt", "w") as file:
        file.writelines(f"{url}\n" for url in broken_urls)
# Run main() only when the file is executed as a script, not when imported.
# The module name lives in ``__name__`` and equals "__main__" for a script;
# the bare ``name`` is undefined and would raise NameError.
if __name__ == "__main__":
    main()