-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathutils.py
More file actions
97 lines (82 loc) · 3.22 KB
/
utils.py
File metadata and controls
97 lines (82 loc) · 3.22 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
# utils.py
# -------------------------------
# Helper functions for Quotes Web Scraping Project
# -------------------------------
import json
import requests
from bs4 import BeautifulSoup
# ---------------------------------------------
# Function to get the full author link from a quote block
# ---------------------------------------------
def get_author_link(base_url, quote):
    """
    Build the absolute URL of an author's detail page.

    Args:
        base_url: Site root to prepend (no trailing slash expected).
        quote: A parsed quote element whose first <a> tag carries the
            relative href of the author page.

    Returns:
        The absolute author-page URL as a string.
    """
    relative_path = quote.find("a")["href"]
    return f"{base_url}{relative_path}"
# ---------------------------------------------
# Function to scrape quotes from multiple pages
# ---------------------------------------------
def scrape_quotes(base_url, page_number):
    """
    Collect quotes from pages 1..`page_number` of the quotes website.

    Each page is fetched over HTTP; pages that fail to download are
    reported and skipped rather than aborting the whole run, and any
    quote block that cannot be parsed is likewise reported and skipped.

    Args:
        base_url: Root URL of the site (no trailing slash expected).
        page_number: Number of pages to scrape, starting at page 1.

    Returns:
        A list of dicts with keys: text, author, tags, author_detail_link.
    """
    collected = []
    for page_idx in range(1, page_number + 1):
        page_url = f"{base_url}/page/{page_idx}/"
        try:
            resp = requests.get(page_url, timeout=10)
            resp.raise_for_status()
        except requests.RequestException as exc:
            # Best-effort: log the failure and move on to the next page.
            print(f"Error fetching page {page_idx}: {exc}")
            continue
        print(f"Successfully fetched page {page_idx}")

        parsed = BeautifulSoup(resp.text, "html.parser")
        for block in parsed.find_all("div", class_="quote"):
            try:
                entry = {
                    "text": block.find("span", class_="text").get_text(strip=True),
                    "author": block.find("small", class_="author").get_text(strip=True),
                    "tags": [t.get_text(strip=True)
                             for t in block.find_all("a", class_="tag")],
                    "author_detail_link": get_author_link(base_url, block),
                }
            except Exception as exc:
                # A malformed quote block shouldn't kill the scrape.
                print("Error extracting quote:", exc)
            else:
                collected.append(entry)
    return collected
# ---------------------------------------------
# Function to save scraped data into a JSON file
# ---------------------------------------------
def save_into_json_file(data, file_name):
    """
    Write a list of quote dictionaries to `file_name` as pretty-printed JSON.

    The file is written UTF-8 encoded with `ensure_ascii=False` so that
    non-ASCII characters in quote text are stored verbatim.

    Args:
        data: List of dictionaries to serialize.
        file_name: Destination path for the JSON file.
    """
    with open(file_name, "w", encoding="utf-8") as out_file:
        json.dump(data, out_file, indent=4, ensure_ascii=False)
    print(f"Done! {len(data)} quotes saved to {file_name}")
    print("*" * 50)
# ---------------------------------------------
# Function to read JSON file and print quotes with a specific tag
# ---------------------------------------------
def read_print_quotes_with_tag(file, tag):
    """
    Read a JSON quotes file and print every quote carrying the given tag.

    The tag comparison is case-insensitive on BOTH sides: the requested
    tag and each stored tag are lowercased before comparing. (The
    original code only lowercased the requested tag, so a stored tag
    with any uppercase letter — e.g. "Love" — could never match.)

    Args:
        file: Path to a JSON file as produced by save_into_json_file
            (a list of dicts, each with a "tags" list).
        tag: Tag name to filter on (case-insensitive).
    """
    with open(file, 'r', encoding="utf-8") as f:
        quotes_data = json.load(f)

    print(f"\nQuotes with the tag '{tag}':")
    print("-" * 50)

    tag_to_check = tag.lower()
    # Bug fix: lowercase the stored tags too so matching is truly
    # case-insensitive; use .get() so an entry without a "tags" key is
    # skipped instead of raising KeyError.
    filtered_quotes = [
        q for q in quotes_data
        if tag_to_check in (t.lower() for t in q.get("tags", []))
    ]

    if not filtered_quotes:
        print(f"No quotes found with tag: {tag}")
    else:
        for quote in filtered_quotes:
            print(f"\n{quote}\n")
            print("-" * 30)