-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathutils.py
More file actions
97 lines (82 loc) · 3.22 KB
/
utils.py
File metadata and controls
97 lines (82 loc) · 3.22 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
# utils.py
# -------------------------------
# Helper functions for Quotes Web Scraping Project
# -------------------------------
import json
import requests
from bs4 import BeautifulSoup
# ---------------------------------------------
# Function to get the full author link from a quote block
# ---------------------------------------------
def get_author_link(base_url, quote):
    """
    Build the absolute URL of an author's detail page.

    Args:
        base_url: Site root to prepend (no trailing slash expected).
        quote: A parsed quote element whose first <a> tag carries the
            relative href of the author page.

    Returns:
        The absolute author-page URL as a string.
    """
    relative_path = quote.find("a")["href"]
    return f"{base_url}{relative_path}"
# ---------------------------------------------
# Function to scrape quotes from multiple pages
# ---------------------------------------------
def scrape_quotes(base_url, page_number):
    """
    Collect quotes from pages 1..`page_number` of the quotes website.

    Each page is fetched over HTTP; pages that fail to download are
    reported and skipped rather than aborting the whole run, and any
    quote block that cannot be parsed is likewise reported and skipped.

    Args:
        base_url: Root URL of the site (no trailing slash expected).
        page_number: Number of pages to scrape, starting at page 1.

    Returns:
        A list of dicts with keys: text, author, tags, author_detail_link.
    """
    collected = []
    for page_idx in range(1, page_number + 1):
        page_url = f"{base_url}/page/{page_idx}/"
        try:
            resp = requests.get(page_url, timeout=10)
            resp.raise_for_status()
        except requests.RequestException as exc:
            # Best-effort: log the failure and move on to the next page.
            print(f"Error fetching page {page_idx}: {exc}")
            continue
        print(f"Successfully fetched page {page_idx}")

        parsed = BeautifulSoup(resp.text, "html.parser")
        for block in parsed.find_all("div", class_="quote"):
            try:
                entry = {
                    "text": block.find("span", class_="text").get_text(strip=True),
                    "author": block.find("small", class_="author").get_text(strip=True),
                    "tags": [t.get_text(strip=True)
                             for t in block.find_all("a", class_="tag")],
                    "author_detail_link": get_author_link(base_url, block),
                }
            except Exception as exc:
                # A malformed quote block shouldn't kill the scrape.
                print("Error extracting quote:", exc)
            else:
                collected.append(entry)
    return collected
# ---------------------------------------------
# Function to save scraped data into a JSON file
# ---------------------------------------------
def save_into_json_file(data, file_name):
    """
    Write a list of quote dictionaries to `file_name` as pretty-printed JSON.

    The file is written UTF-8 encoded with `ensure_ascii=False` so that
    non-ASCII characters in quote text are stored verbatim.

    Args:
        data: List of dictionaries to serialize.
        file_name: Destination path for the JSON file.
    """
    with open(file_name, "w", encoding="utf-8") as out_file:
        json.dump(data, out_file, indent=4, ensure_ascii=False)
    print(f"Done! {len(data)} quotes saved to {file_name}")
    print("*" * 50)
# ---------------------------------------------
# Function to read JSON file and print quotes with a specific tag
# ---------------------------------------------
def read_print_quotes_with_tag(file, tag):
    """
    Read a JSON quotes file and print every quote carrying the given tag.

    The tag comparison is case-insensitive on BOTH sides: the requested
    tag and each stored tag are lowercased before comparing. (The
    original code only lowercased the requested tag, so a stored tag
    with any uppercase letter — e.g. "Love" — could never match.)

    Args:
        file: Path to a JSON file as produced by save_into_json_file
            (a list of dicts, each with a "tags" list).
        tag: Tag name to filter on (case-insensitive).
    """
    with open(file, 'r', encoding="utf-8") as f:
        quotes_data = json.load(f)

    print(f"\nQuotes with the tag '{tag}':")
    print("-" * 50)

    tag_to_check = tag.lower()
    # Bug fix: lowercase the stored tags too so matching is truly
    # case-insensitive; use .get() so an entry without a "tags" key is
    # skipped instead of raising KeyError.
    filtered_quotes = [
        q for q in quotes_data
        if tag_to_check in (t.lower() for t in q.get("tags", []))
    ]

    if not filtered_quotes:
        print(f"No quotes found with tag: {tag}")
    else:
        for quote in filtered_quotes:
            print(f"\n{quote}\n")
            print("-" * 30)