-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathAmazon.py
More file actions
105 lines (95 loc) · 3.56 KB
/
Amazon.py
File metadata and controls
105 lines (95 loc) · 3.56 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import requests as req
from bs4 import BeautifulSoup as bsp
import pandas as pd
import time
from selenium import webdriver
import requests
from selenium.webdriver.edge.service import Service as EdgeService
from selenium.webdriver.edge.options import Options as EdgeOptions
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.firefox.options import Options
from fake_useragent import UserAgent
# ---------------------------------------------------------------------------
# Module-level configuration
# ---------------------------------------------------------------------------

# Site root; relative hrefs scraped from Amazon pages are resolved against it.
base = 'https://www.amazon.com'

# Static mobile-Chrome User-Agent header. giveProducts() sends a random
# fake_useragent value instead, so this dict is a fixed alternative.
headers = {
    "User-Agent": (
        'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/131.0.0.0 Mobile Safari/537.36'
    )
}

# Firefox/geckodriver setup (example Linux paths). generateMenu() starts
# Chrome, so `options` and `service` are only needed for a Firefox driver.
geckodriver_path = "/snap/bin/geckodriver"
firefox_path = "/usr/bin/firefox"  # Example for Linux

service = Service(executable_path=geckodriver_path)
options = Options()
options.binary_location = firefox_path
def generateMenu():
    """Collect the category links shown on the Amazon home page.

    Opens ``base`` in a Selenium-driven Chrome, scrolls once to the bottom of
    the page, waits ten seconds so late-loading cards can render, and parses
    the resulting page source for the category image cards.

    Returns:
        tuple[list[str], list[str]]: ``(links, categ)`` where ``links`` holds
        the absolute category URLs and ``categ`` the matching ``aria-label``
        texts (``None`` where a card has no label). The two lists are
        index-aligned and both empty when no card matches.
    """
    from urllib.parse import urljoin

    driver = webdriver.Chrome()
    try:
        driver.get(base)
        # Single scroll to the bottom; the sleep gives lazy content time to load.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(10)
        html = driver.page_source
    finally:
        # Always close the browser, even if navigation or the script call fails.
        driver.quit()

    soup = bsp(html, 'lxml')
    boxes = soup.find_all(
        'a',
        class_='a-link-normal _fluid-quad-image-label-v2_style_centerImage__30wh- aok-block image-window',
    )
    print(boxes)

    links = []
    categ = []
    for box in boxes:
        href = box.get('href')
        if not href:
            # An anchor without href cannot be turned into a URL; skip it so
            # links and categ stay aligned.
            continue
        link = urljoin(base, href)
        print(link)
        links.append(link)
        label = box.get('aria-label')
        categ.append(label)
        print(label)

    # Return the full label list, not just the label of the last card.
    return links, categ
def giveProducts(url):
    """Fetch one listing page and extract its product links and next page.

    Args:
        url: Absolute URL of the listing page to download.

    Returns:
        tuple[list[str], str | None]: ``(products, next_url)``. ``products``
        are absolute product URLs found on the page. ``next_url`` is the
        absolute URL of the following results page, or ``None`` when there is
        no next-page link. On a failed request or non-OK status the result is
        ``([], None)`` so the caller's pagination loop terminates cleanly.
    """
    from urllib.parse import urljoin

    print("VISITING : ", url)
    try:
        response = req.get(
            url,
            headers={"User-Agent": UserAgent().random},
            timeout=30,  # avoid hanging forever on a stalled connection
        )
    except req.RequestException as exc:
        print(f"Request failed for {url}: {exc}")
        return [], None

    if not response.ok:
        print(f"Product exited with Code : {response.status_code}")
        return [], None

    soup = bsp(response.text, 'lxml')
    print(soup)  # debug dump of the parsed page, kept from the original

    # find_all returns a list of tags; read href from each tag individually
    # and resolve it against the site root (hrefs are site-relative).
    anchors = soup.find_all(
        'a', class_='a-link-normal s-line-clamp-2 s-link-style a-text-normal'
    )
    products = [
        urljoin(base, anchor.get('href'))
        for anchor in anchors
        if anchor.get('href')
    ]

    # find returns None when the page has no next-page anchor.
    next_anchor = soup.find(
        'a',
        class_='s-pagination-item s-pagination-next s-pagination-button s-pagination-button-accessibility s-pagination-separator',
    )
    next_href = next_anchor.get('href') if next_anchor is not None else None
    if next_href:
        return products, urljoin(base, next_href)
    return products, None
if __name__ == '__main__':
    # Column-oriented store for scraped product fields. It is only
    # initialised here; nothing in the visible code fills or reads it.
    # NOTE(review): the columns seeded with [[]] start with one (empty) entry
    # while the others start with none -- confirm that is intended.
    info = {
        'Title': [],
        'Category': [],
        'Average_Rating': [],
        'Rating_Number': [],
        'Features': [[]],
        'Description': [[]],
        'Price': [],
        'Images': [[]],
        'Videos': [[]],
        'Store': [],
        'Categories': [],
        'Details': [[]],
        'Parent_asin': [],
        'Bought_Together': [],
    }

    # Discover the category pages from the home page.
    links, cat = generateMenu()
    print(links)

    # Walk each category page by page; giveProducts hands back the next page
    # URL, and a falsy value ends the pagination for that category.
    products = []
    for link in links:
        while link:
            product, link = giveProducts(link)
            products += product