# Scrapers/Sapphire.py at main · TahaGPT/Scrapers · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
import time
from urllib.parse import urljoin

import pandas as pd
import requests
import requests as req
from bs4 import BeautifulSoup as bsp
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.options import Options as EdgeOptions
from selenium.webdriver.edge.service import Service as EdgeService
# Column-oriented accumulator for the scraped products. The four lists become
# the columns of the DataFrame written at the end of the script, so they must
# always have the same length.
info = {
    'Name': [],
    'Price': [],
    'Image': [],
    'Link': [],
}
url = 'https://pk.sapphireonline.pk/'

# Download the landing page and locate its navigation bar: a <ul> carrying the
# class string below, whose <li> items each wrap an <a href=...>.
# A timeout is set so an unresponsive server cannot hang the script forever.
soup = bsp(req.get(url, timeout=30).content, 'html.parser')
navbar = soup.find('ul', class_='t4s-nav__ul t4s-d-inline-flex t4s-flex-wrap t4s-align-items-center')
if navbar is None:
    # Fail with an explicit message instead of an AttributeError on None.
    raise RuntimeError(f'Navigation bar not found on {url}; the page markup may have changed.')

# Collect one absolute URL per navigation entry.
links = []
for item in navbar.find_all('li'):
    anchor = item.find('a')
    href = anchor.get('href') if anchor is not None else None
    if not href:
        # <li> without a usable link: nothing to scrape for this entry.
        continue
    # urljoin copes with root-relative ('/x'), relative ('x') and already
    # absolute hrefs; plain concatenation would yield '//' or a broken URL.
    absolute = urljoin(url, href)
    # find_all() also returns nested <li> items, so the same target can show
    # up more than once; keep the first occurrence only.
    if absolute not in links:
        links.append(absolute)

for link in links:
    print(link)
def get_valid_proxies():
    """Scrape https://free-proxy-list.net/ and return the recently checked proxies.

    Every table row with at least eight cells is inspected. Cell 0 is read as
    the IP address, cells 5 and 6 as yes/no flags (stored as ``google_enabled``
    and ``https_enabled``) and cell 7 as the "last checked" text. A row is kept
    when its last-checked text passes the freshness rule below and at least one
    of the two flags is set.

    Returns:
        list[dict]: one dict per accepted row with the keys ``'ip_address'``,
        ``'google_enabled'`` and ``'https_enabled'``. The list is empty when
        the site answers with a non-success status.

    Raises:
        requests.RequestException: if the request itself fails or times out.
    """
    proxy_list_url = 'https://free-proxy-list.net/'
    # Without a timeout a stalled connection would block the caller forever.
    response = requests.get(proxy_list_url, timeout=30)
    if not response.ok:
        # An error page contains no proxy table worth parsing.
        return []

    soup = bsp(response.text, 'html.parser')
    proxy_data = []
    # The first <tr> is skipped (treated as the header row).
    for row in soup.find_all('tr')[1:]:
        columns = row.find_all('td')
        if len(columns) < 8:
            continue

        ip_address = columns[0].text.strip()
        google_enabled = columns[5].text.strip().lower() == 'yes'
        https_enabled = columns[6].text.strip().lower() == 'yes'
        last_checked = columns[7].text.strip()

        # Freshness rule: "<n> mins ago" with n < 15, or anything ending in
        # "hours ago".
        # NOTE(review): accepting every "hours ago" row while rejecting
        # 15-59 minutes looks inconsistent - confirm the intended cut-off.
        if last_checked.endswith('hours ago'):
            is_fresh = True
        elif last_checked.endswith('mins ago'):
            try:
                is_fresh = int(last_checked.split(' ')[0]) < 15
            except ValueError:
                # Unexpected text in front of "mins ago": skip the row rather
                # than abort the whole scrape.
                is_fresh = False
        else:
            is_fresh = False

        if is_fresh and (google_enabled or https_enabled):
            proxy_data.append({
                'ip_address': ip_address,
                'google_enabled': google_enabled,
                'https_enabled': https_enabled,
            })

    return proxy_data

def rotate_user_agent(proxy):
    """Build the header dictionary used for a request.

    The dictionary always carries one fixed ``User-Agent`` string. When
    ``proxy`` is truthy, two further entries are added to the same dictionary:
    ``'http'`` and ``'https'``, each holding the proxy prefixed with the
    matching scheme.

    Args:
        proxy: proxy address to embed (e.g. ``'host:port'``), or a falsy value
            for no proxy entries.

    Returns:
        dict: ``{'User-Agent': ...}``, plus ``'http'`` and ``'https'`` keys
        when a proxy was given.
    """
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    result = {'User-Agent': user_agent}

    # No proxy supplied: the user agent alone is the whole answer.
    if not proxy:
        return result

    result['http'] = f'http://{proxy}'
    result['https'] = f'https://{proxy}'
    return result
# Scraping the links one by one
REQUEST_TIMEOUT = 30  # seconds allowed for the pre-flight HTTP request
SCROLL_PAUSE = 5      # seconds to wait after each scroll so new items can load
MISSING = ' '         # placeholder stored when a product field cannot be found


def _scroll_until_loaded(driver):
    """Scroll the open page up and down until its height stops growing.

    Each pass scrolls to 1000px above the bottom of the document, back to the
    top and down again, pausing after every move. The loop ends when a pass
    leaves ``document.body.scrollHeight`` unchanged. After a pass that did
    grow the page, a popup close button (class ``modal-close``) is clicked if
    one can be found.
    """
    previous_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight - 1000);")
        time.sleep(SCROLL_PAUSE)
        driver.execute_script("window.scrollTo(0, 0);")
        time.sleep(SCROLL_PAUSE)
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight - 1000);")
        time.sleep(SCROLL_PAUSE)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == previous_height:
            break
        previous_height = new_height
        # Best effort: close a popup if there is one. The old
        # find_element_by_class_name helper no longer exists in Selenium 4,
        # so the By-based lookup is used; a missing or unclickable button is
        # not an error.
        try:
            driver.find_element(By.CLASS_NAME, 'modal-close').click()
        except WebDriverException:
            pass


def _child_attribute(container, child_tag, attribute):
    """Return ``attribute`` of the first ``child_tag`` inside ``container``.

    Returns None when the container, the child or the attribute is missing.
    """
    if container is None:
        return None
    child = container.find(child_tag)
    if child is None:
        return None
    return child.get(attribute)


for link in links:
    # Pre-flight request: skip pages the server refuses to serve.
    try:
        response = req.get(link, timeout=REQUEST_TIMEOUT)
    except req.RequestException as error:
        print("Request failed for", link, "-", error)
        continue
    if not response.ok:
        print("Server responded with exit code:", response.status_code)  # if scraping is not allowed
        continue

    # Render the page in a real browser so lazily loaded products appear.
    driver = webdriver.Chrome()
    try:
        driver.get(link)
        _scroll_until_loaded(driver)
        html = driver.page_source
    finally:
        # Always release the browser, even when loading or scrolling fails.
        driver.quit()

    # Parse the rendered HTML and keep a prettified copy on disk
    # (the file is overwritten for every link).
    soup = bsp(html, 'html.parser')
    with open('scrapped.html', 'w', encoding='utf-8') as htmlFile:  # specify encoding method
        htmlFile.write(soup.prettify())

    # Every product card is a <div class="t4s-product-wrapper">. Each card
    # appends exactly one value to every column of `info`, so the columns stay
    # the same length and the DataFrame below can always be built.
    for product in soup.find_all('div', class_='t4s-product-wrapper'):
        # Name: text of <h3 class="t4s-product-title">
        title_tag = product.find('h3', class_='t4s-product-title')
        if title_tag is None:
            info['Name'].append(MISSING)
        else:
            name = title_tag.text.strip()
            print(name)
            info['Name'].append(name)

        # Price: text of <div class="t4s-product-price">
        price_tag = product.find('div', class_='t4s-product-price')
        if price_tag is None:
            info['Price'].append(MISSING)
        else:
            price = price_tag.text.strip()
            print(price)
            info['Price'].append(price)

        # Link: href of the first <a> inside the buttons <div>
        href = _child_attribute(
            product.find('div', class_='t4s-product-btns t4s-col-2 t4s-col-lg-5'), 'a', 'href')
        if href is None:
            info['Link'].append(MISSING)
        else:
            # urljoin avoids the doubled slash that plain concatenation gives
            # for root-relative hrefs and leaves absolute hrefs intact.
            product_url = urljoin(url, href)
            print(product_url)
            info['Link'].append(product_url)

        # Image: src of the first <img> inside the image <div>
        image = _child_attribute(
            product.find('div', class_='t4s-product-img t4s_ratio is-show-img2'), 'img', 'src')
        if image is None:
            info['Image'].append(MISSING)
        else:
            print(image)
            info['Image'].append(image)

# print(info)
# Write dictionary to CSV file
df = pd.DataFrame(info)
df.to_csv('Sapphire.csv')