-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy patheBay.py
More file actions
186 lines (152 loc) · 6.33 KB
/
eBay.py
File metadata and controls
186 lines (152 loc) · 6.33 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
import requests as req
from bs4 import BeautifulSoup as bsp
import pandas as pd
import csv
import json
def getPage(url):  # function to get the html of the scrapped page for readability
    """Download *url*, save a prettified snapshot to scrapped.html, and return the soup.

    Returns None when the request fails at the network level or the server
    answers with a non-2xx status (e.g. when scraping is refused), so every
    caller must handle a missing soup.
    """
    try:
        # timeout keeps one slow/hung page from stalling the whole crawl
        response = req.get(url, timeout=30)
    except req.RequestException as exc:  # DNS failure, refused connection, timeout, ...
        print("Request failed:", exc)
        return None
    if not response.ok:
        print("Server responded with exit code:", response.status_code)  # if scrapping is not allowed
        return None
    soup = bsp(response.text, "lxml")  # converting to html
    pretty = soup.prettify()  # increasing readability
    # debugging snapshot; NOTE: overwritten on every page fetched
    with open('scrapped.html', mode='w', encoding='utf-8') as htmlFile:  # specify encoding method for writing in a file
        htmlFile.write(pretty)
    return soup
def findContent(soup, info):  # function to extract the required elements from the website soup
    """Extract one row of listing data per article on an eBay results page.

    Mutates *info* in place: appends exactly one value to each column list
    per article (a blank ' ' when the field is absent on that card), keeping
    all column lists aligned row-wise. Returns None.
    """
    if soup is None:  # getPage failed (bad status / network error) — nothing to parse
        return
    # going from basic to specific
    ol = soup.find('div', id='srp-river-results')
    if ol is None:  # unexpected page layout — skip rather than crash the crawl
        return
    articles = ol.find_all('div', class_='s-item__info clearfix')
    for article in articles:
        # extracting the title (strip the "New Listing" badge eBay prepends)
        try:
            head = article.find('span', role='heading')
            title = head.text.strip()
            sub = 'New Listing'
            if sub in title:
                title = title.replace(sub, "")
            print(title)
            info['Title'].append(title)
        except AttributeError:  # element absent -> .text/.find on None
            info['Title'].append(' ')
        # extracting the prices
        try:
            prices = article.find('span', class_='s-item__price')
            price = prices.text.strip()
            print(price)
            info['Price'].append(price)
        except AttributeError:
            info['Price'].append(' ')
        # extracting the ratings
        try:
            stars = article.find('div', class_='x-star-rating')
            star = stars.find('span', class_='clipped').text
            print(star)
            info['Star-Rating'].append(star)
        except AttributeError:
            info['Star-Rating'].append(' ')
        # extracting the sales
        try:
            avail = article.find('span', class_='s-item__dynamic s-item__quantitySold')
            sales = avail.find('span', class_='BOLD').text.strip()
            print(sales)
            info['Sales'].append(sales)
        except AttributeError:
            info['Sales'].append(' ')
        # extracting the status — BUGFIX: the original appended once per
        # subtitle div (0 or 2+ values per article), which misaligned the
        # column lists; append exactly one value per article instead.
        status = ' '
        for secondary in article.find_all('div', class_='s-item__subtitle'):
            extra = secondary.find('span', class_='SECONDARY_INFO')
            if extra is not None:
                status = extra.text.strip()
                print(status)
                break  # first status badge wins
        info['Status'].append(status)
        # extracting the best-seller
        try:
            sell = article.find('span', class_='s-item__etrs')
            best = sell.find('span', class_='s-item__etrs-text')
            bestSeller = best.text.strip()
            print(bestSeller)
            info['Best-Seller'].append(bestSeller)
        except AttributeError:
            info['Best-Seller'].append(' ')
        # extracting the ratings-count
        try:
            rates = article.find('span', {'aria-hidden': 'false'})
            rating = rates.text.strip()
            print(rating)
            info['Ratings-Count'].append(rating)
        except AttributeError:
            info['Ratings-Count'].append(' ')
        # extracting the author (subtitle text shaped like "by Author | ...")
        try:
            writing = article.find('div', class_='s-item__subtitle')
            author = writing.text.strip().split('|')[0]
            sub = 'by'
            if sub in author:
                info['Author'].append(author)
                print(author)
            else:
                info['Author'].append(' ')
        except AttributeError:
            info['Author'].append(' ')
def _build_rows(info):
    """Convert the column-oriented *info* dict into a list of per-item row dicts."""
    fieldnames = list(info.keys())
    # one row per scraped item; every column list holds one value per item
    return [{key: info[key][i] for key in fieldnames}
            for i in range(len(info['Title']))]


def main():  # the function in which all the functionality takes place
    """Scrape eBay search pages for psychology books and write CSV + JSON output."""
    info = {  # column-oriented accumulator: one parallel list per output field
        'Title': [],
        'Price': [],
        'Star-Rating': [],
        'Sales': [],
        'Status': [],
        'Best-Seller': [],
        'Ratings-Count': [],
        'Author': []
    }
    for i in range(1, 170):  # pages 1..169 of the search results
        url = f'https://www.ebay.com/sch/i.html?_from=R40&_nkw=Psychology+Books&_sacat=0&_pgn={i}'
        soup = getPage(url)  # receiving the html element of the page
        findContent(soup, info)  # finding and extracting required stuff
    # build the rows once and reuse them for both output formats
    # (the original duplicated this loop verbatim for CSV and JSON)
    rows = _build_rows(info)
    # Write dictionary to CSV file
    csvFile = 'output.csv'
    with open(csvFile, mode='w', newline='', encoding='utf-8') as file:  # using an encoder for special characters
        writer = csv.DictWriter(file, fieldnames=list(info.keys()))
        writer.writeheader()
        writer.writerows(rows)
    # Write dictionary to JSON file
    json_file = 'scrapped.json'
    with open(json_file, mode='w', encoding='utf-8') as file:
        json.dump(rows, file, indent=4, ensure_ascii=False)


if __name__ == '__main__':
    main()