Lama.py
import requests as req
from bs4 import BeautifulSoup as bsp
import pandas as pd
import csv
import json
import time
import random
from selenium import webdriver
from selenium.webdriver.common.by import By

info = {  # dictionary that will collect the scraped fields
    'Name': [],
    'Price': [],
    'Image': [],
    'Link': []
}
url = 'https://lamaretail.com/collections/woman-studio-collection'
# Get the navigation bar of the website
soup = bsp(req.get(url).content, 'html.parser')
# Find the ul tag with the class "site-nav site-navigation small--hide"
nav = soup.find('ul', class_='site-nav site-navigation small--hide')
# Get the li tags inside that ul tag
lis = nav.find_all('li')
# Each li tag contains a ul tag with the class "site-nav__dropdown text-left";
# extract the href attribute of the a tag inside every li tag of that dropdown
links = []
for li in lis:
    ul = li.find('ul', class_='site-nav__dropdown text-left')
    if ul is not None:
        for item in ul.find_all('li'):
            a = item.find('a')
            links.append(a['href'])
for i in range(len(links)):
    links[i] = 'https://lamaretail.com' + links[i]
print(links)
def get_valid_proxies():
    # Scrape the free proxy list and keep recently checked proxies that
    # support Google and/or HTTPS
    proxy_list_url = 'https://free-proxy-list.net/'
    response = req.get(proxy_list_url)
    soup = bsp(response.text, 'html.parser')
    proxy_data = []
    rows = soup.find_all('tr')[1:]  # skip the table header row
    for row in rows:
        columns = row.find_all('td')
        if len(columns) >= 8:
            ip_address = columns[0].text.strip()
            port = columns[1].text.strip()
            google_enabled = columns[5].text.strip().lower() == 'yes'
            https_enabled = columns[6].text.strip().lower() == 'yes'
            last_checked = columns[7].text.strip()
            # keep proxies whose 'Last Checked' value is under 15 minutes (or given in hours)
            if (last_checked.endswith('mins ago') and int(last_checked.split(' ')[0]) < 15) or last_checked.endswith('hours ago'):
                if google_enabled or https_enabled:
                    proxy_data.append({'ip_address': ip_address, 'port': port,
                                       'google_enabled': google_enabled, 'https_enabled': https_enabled})
    return proxy_data
def rotate_user_agent(proxy):
    # Build the request headers; proxy settings belong in requests' separate
    # `proxies` argument rather than in the headers, so they are returned as
    # their own mapping.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }
    if proxy:
        proxies = {
            'http': f'http://{proxy}',
            'https': f'http://{proxy}'
        }
    else:
        proxies = None
    return headers, proxies
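# The two helpers above are defined but never used in the scraping loop below.
# A minimal sketch (not called anywhere in this script) of how they could be
# combined to fetch a page through a random free proxy; the function name and
# the timeout value are assumptions, not part of the original code.
def fetch_with_random_proxy(page_url):
    proxy_list = get_valid_proxies()
    chosen = None
    if proxy_list:
        picked = random.choice(proxy_list)
        chosen = f"{picked['ip_address']}:{picked['port']}"
    headers, proxies = rotate_user_agent(chosen)
    return req.get(page_url, headers=headers, proxies=proxies, timeout=10)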
# Scraping the links one by one
for link in links:
    response = req.get(link)
    if not response.ok:
        print("Server responded with status code:", response.status_code)  # scraping may be blocked
    else:
        # Use Selenium to scroll the page (bottom, top, bottom again) so that
        # lazily loaded items keep appearing until the page height stops growing
        driver = webdriver.Chrome()
        driver.get(link)
        Previous_Height = driver.execute_script("return document.body.scrollHeight")
        while True:
            # Scroll to just above the footer, back up to the top, then down
            # again, until no new items are being loaded
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight - 1000);")
            time.sleep(5)
            driver.execute_script("window.scrollTo(0, 0);")
            time.sleep(5)
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight - 1000);")
            time.sleep(5)
            New_Height = driver.execute_script("return document.body.scrollHeight")
            if New_Height == Previous_Height:
                break
            Previous_Height = New_Height
        # If a popup appears, press its close button
        try:
            driver.find_element(By.CLASS_NAME, 'modal-close').click()
        except Exception:
            pass
        # Get the HTML content of the fully loaded page
        html = driver.page_source
        driver.quit()
        # Parse the HTML content
        soup = bsp(html, 'html.parser')
        pretty = soup.prettify()  # increase readability
        with open('scrapped.html', 'w', encoding='utf-8') as htmlFile:  # UTF-8 for special characters
            htmlFile.write(pretty)
        lists = soup.find('div', class_='grid grid--uniform')
        dresses = lists.find_all('div', class_='grid__item-image-wrapper')
        for dress in dresses:
            # getting the name
            try:
                dressName = dress.find('div', class_='grid-product__title grid-product__title--body')
                name = dressName.text.strip()
                print(name)
                info['Name'].append(name)
            except Exception:
                info['Name'].append(' ')
            # getting the price (search within this dress, not the whole page)
            try:
                dressPrice = dress.find('div', class_='grid-product__meta').find('div', class_='grid-product__price').find('span', class_='money')
                price = dressPrice.string.strip()  # strip any surrounding whitespace
                print(price)
                info['Price'].append(price)
            except Exception:
                info['Price'].append(' ')
            # getting the image
            try:
                dressImage = dress.find('div', class_='grid-product__image-mask').find('div', class_='image-wrap loaded').find('img')
                image = 'https:' + dressImage['src']
                print(image)
                info['Image'].append(image)
            except Exception:
                info['Image'].append(' ')
            # getting the link
            try:
                dressLink = dress.find('a', class_='grid-product__link')
                productLink = 'https://lamaretail.com' + dressLink['href']
                print(productLink)
                info['Link'].append(productLink)
            except Exception:
                print("no link found")
                info['Link'].append(' ')
# Write the dictionary to a CSV file
csvFile = 'lama.csv'
with open(csvFile, mode='w', newline='', encoding='utf-8') as file:  # use UTF-8 for special characters
    fieldnames = list(info.keys())  # the column headings
    writer = csv.DictWriter(file, fieldnames=fieldnames)
    writer.writeheader()
    # one row per scraped item, copying every value into its respective column
    for i in range(len(info['Name'])):
        row = {
            'Name': info['Name'][i],
            'Price': info['Price'][i],
            'Link': info['Link'][i],
            'Image': info['Image'][i]
        }
        writer.writerow(row)
# Write the dictionary to a JSON file
json_file = 'lama.json'
with open(json_file, mode='w', encoding='utf-8') as file:
    # Prepare a list of dictionaries for JSON serialization, one per scraped item
    json_data = []
    for i in range(len(info['Name'])):
        row = {
            'Name': info['Name'][i],
            'Price': info['Price'][i],
            'Link': info['Link'][i],
            'Image': info['Image'][i]
        }
        json_data.append(row)
    # Write the JSON data to the file
    json.dump(json_data, file, indent=4, ensure_ascii=False)
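# pandas is imported above but not otherwise used; as a minimal sketch, the same
# records could also be exported through a DataFrame. The output file names
# 'lama_pandas.csv' and 'lama_pandas.json' are assumptions, not part of the
# original script.
df = pd.DataFrame(info)
df.to_csv('lama_pandas.csv', index=False, encoding='utf-8')
df.to_json('lama_pandas.json', orient='records', indent=4, force_ascii=False)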