-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy patheBay.py
More file actions
186 lines (152 loc) · 6.33 KB
/
eBay.py
File metadata and controls
186 lines (152 loc) · 6.33 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
import requests as req
from bs4 import BeautifulSoup as bsp
import pandas as pd
import csv
import json
def getPage(url):  # function to get the html of the scrapped page for readability
    """Download *url*, save a prettified snapshot to scrapped.html, and return the soup.

    Returns None when the request fails at the network level or the server
    answers with a non-2xx status (e.g. when scraping is refused), so every
    caller must handle a missing soup.
    """
    try:
        # timeout keeps one slow/hung page from stalling the whole crawl
        response = req.get(url, timeout=30)
    except req.RequestException as exc:  # DNS failure, refused connection, timeout, ...
        print("Request failed:", exc)
        return None
    if not response.ok:
        print("Server responded with exit code:", response.status_code)  # if scrapping is not allowed
        return None
    soup = bsp(response.text, "lxml")  # converting to html
    pretty = soup.prettify()  # increasing readability
    # debugging snapshot; NOTE: overwritten on every page fetched
    with open('scrapped.html', mode='w', encoding='utf-8') as htmlFile:  # specify encoding method for writing in a file
        htmlFile.write(pretty)
    return soup
def findContent(soup, info):  # function to extract the required elements from the website soup
    """Extract one row of listing data per article on an eBay results page.

    Mutates *info* in place: appends exactly one value to each column list
    per article (a blank ' ' when the field is absent on that card), keeping
    all column lists aligned row-wise. Returns None.
    """
    if soup is None:  # getPage failed (bad status / network error) — nothing to parse
        return
    # going from basic to specific
    ol = soup.find('div', id='srp-river-results')
    if ol is None:  # unexpected page layout — skip rather than crash the crawl
        return
    articles = ol.find_all('div', class_='s-item__info clearfix')
    for article in articles:
        # extracting the title (strip the "New Listing" badge eBay prepends)
        try:
            head = article.find('span', role='heading')
            title = head.text.strip()
            sub = 'New Listing'
            if sub in title:
                title = title.replace(sub, "")
            print(title)
            info['Title'].append(title)
        except AttributeError:  # element absent -> .text/.find on None
            info['Title'].append(' ')
        # extracting the prices
        try:
            prices = article.find('span', class_='s-item__price')
            price = prices.text.strip()
            print(price)
            info['Price'].append(price)
        except AttributeError:
            info['Price'].append(' ')
        # extracting the ratings
        try:
            stars = article.find('div', class_='x-star-rating')
            star = stars.find('span', class_='clipped').text
            print(star)
            info['Star-Rating'].append(star)
        except AttributeError:
            info['Star-Rating'].append(' ')
        # extracting the sales
        try:
            avail = article.find('span', class_='s-item__dynamic s-item__quantitySold')
            sales = avail.find('span', class_='BOLD').text.strip()
            print(sales)
            info['Sales'].append(sales)
        except AttributeError:
            info['Sales'].append(' ')
        # extracting the status — BUGFIX: the original appended once per
        # subtitle div (0 or 2+ values per article), which misaligned the
        # column lists; append exactly one value per article instead.
        status = ' '
        for secondary in article.find_all('div', class_='s-item__subtitle'):
            extra = secondary.find('span', class_='SECONDARY_INFO')
            if extra is not None:
                status = extra.text.strip()
                print(status)
                break  # first status badge wins
        info['Status'].append(status)
        # extracting the best-seller
        try:
            sell = article.find('span', class_='s-item__etrs')
            best = sell.find('span', class_='s-item__etrs-text')
            bestSeller = best.text.strip()
            print(bestSeller)
            info['Best-Seller'].append(bestSeller)
        except AttributeError:
            info['Best-Seller'].append(' ')
        # extracting the ratings-count
        try:
            rates = article.find('span', {'aria-hidden': 'false'})
            rating = rates.text.strip()
            print(rating)
            info['Ratings-Count'].append(rating)
        except AttributeError:
            info['Ratings-Count'].append(' ')
        # extracting the author (subtitle text shaped like "by Author | ...")
        try:
            writing = article.find('div', class_='s-item__subtitle')
            author = writing.text.strip().split('|')[0]
            sub = 'by'
            if sub in author:
                info['Author'].append(author)
                print(author)
            else:
                info['Author'].append(' ')
        except AttributeError:
            info['Author'].append(' ')
def _build_rows(info):
    """Convert the column-oriented *info* dict into a list of per-item row dicts."""
    fieldnames = list(info.keys())
    # one row per scraped item; every column list holds one value per item
    return [{key: info[key][i] for key in fieldnames}
            for i in range(len(info['Title']))]


def main():  # the function in which all the functionality takes place
    """Scrape eBay search pages for psychology books and write CSV + JSON output."""
    info = {  # column-oriented accumulator: one parallel list per output field
        'Title': [],
        'Price': [],
        'Star-Rating': [],
        'Sales': [],
        'Status': [],
        'Best-Seller': [],
        'Ratings-Count': [],
        'Author': []
    }
    for i in range(1, 170):  # pages 1..169 of the search results
        url = f'https://www.ebay.com/sch/i.html?_from=R40&_nkw=Psychology+Books&_sacat=0&_pgn={i}'
        soup = getPage(url)  # receiving the html element of the page
        findContent(soup, info)  # finding and extracting required stuff
    # build the rows once and reuse them for both output formats
    # (the original duplicated this loop verbatim for CSV and JSON)
    rows = _build_rows(info)
    # Write dictionary to CSV file
    csvFile = 'output.csv'
    with open(csvFile, mode='w', newline='', encoding='utf-8') as file:  # using an encoder for special characters
        writer = csv.DictWriter(file, fieldnames=list(info.keys()))
        writer.writeheader()
        writer.writerows(rows)
    # Write dictionary to JSON file
    json_file = 'scrapped.json'
    with open(json_file, mode='w', encoding='utf-8') as file:
        json.dump(rows, file, indent=4, ensure_ascii=False)


if __name__ == '__main__':
    main()