Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
103 changes: 31 additions & 72 deletions legacy/LinkedinScrapper.py
Original file line number Diff line number Diff line change
@@ -1,62 +1,27 @@
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support import expected_conditions as expect
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import time
import random
import re
import pandas as pd
import numpy as np

from datetime import datetime

from selenium.common.exceptions import (
ElementNotVisibleException,
ElementClickInterceptedException,
WebDriverException,
TimeoutException,
)

from os.path import dirname, join


import os

# --- User configuration -------------------------------------------------
# Credentials are read from the environment so they are never committed or
# echoed into shell history; fall back to an interactive prompt when the
# variables are unset.  (The original assigned them twice — prompt, then an
# unconditional env-var overwrite that could silently set them to None.)
username = os.getenv('LINKEDIN_USERNAME') or input('Enter your linkedin Email : ')
password = os.getenv('LINKEDIN_PASSWORD') or input('Enter your linkedin Password : ')
file_name = input('Enter your file name : ')

# LinkedIn's people-search URL expects a URL-encoded query string.
search_query = input('Enter your search query : ').replace(" ", "%20")

place_name = input('Enter targed place name : ')


options = webdriver.ChromeOptions()
# Suppress Chrome's noisy "DevTools listening"/USB logging chatter.
options.add_experimental_option('excludeSwitches', ['enable-logging'])

# Single shared driver instance used by every function below.
# (The original created the driver twice, opening two browser windows.)
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
driver.maximize_window()


def loging():
    """Log in to LinkedIn with the credentials collected at startup.

    Navigates to the landing page, waits for the login form to render,
    fills in the email/password fields, and submits.  Relies on the
    module-level globals ``driver``, ``username`` and ``password``.
    """
    driver.get('https://www.linkedin.com')
    # Wait until the login form is actually visible before typing.
    WebDriverWait(driver, 10).until(
        expect.visibility_of_element_located((By.XPATH, '//input[@id="session_key"]'))
    )
    driver.find_element(By.XPATH, '//input[@id="session_key"]').send_keys(username)
    # Fix: the original sent the password and clicked "Sign in" twice in a
    # row, which double-submitted the form (and failed once the first click
    # navigated away from the login page).
    driver.find_element(By.XPATH, "//input[@id='session_password']").send_keys(password)
    driver.find_element(By.XPATH, "//button[contains(text(),'Sign in')]").click()


def scrap_available_profie():
Expand All @@ -73,20 +38,20 @@ def scrap_available_profie():

WebDriverWait(driver, 10).until(expect.visibility_of_element_located((By.XPATH, "//input[@placeholder='Add a location']")))
driver.find_element(By.XPATH, "//input[@placeholder='Add a location']").send_keys(place_name)
time.sleep(2)
driver.find_element(By.XPATH, "//*[contains(@id, 'basic-result-')]").click()
time.sleep(2)
driver.find_element(By.XPATH, '//button[text()="Locations"]').click()
time.sleep(2)
driver.find_element(By.XPATH, "//*[contains(@id, 'basic-result-')] ").click()
time.sleep(2)
driver.find_element(By.XPATH, '//button[text()="Locations"]').click()

time.sleep(4)
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
time.sleep(2)
# html = driver.page_source
# soup = BeautifulSoup(html)
# with open("output4.html", "w", encoding = 'utf-8') as file:
# file.write(str(soup.prettify()))
# pegination = driver.find_element(By.CSS_SELECTOR, '.artdeco-pagination.artdeco-pagination--has-controls.ember-view.pv5.ph2')
# list = driver.find_element(By.CLASS_NAME, 'artdeco-pagination__indicator')
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
time.sleep(2)
# html = driver.page_source
# soup = BeautifulSoup(html)
# with open("output4.html", "w", encoding = 'utf-8') as file:
# file.write(str(soup.prettify()))
# pegination = driver.find_element(By.CSS_SELECTOR, '.artdeco-pagination.artdeco-pagination--has-controls.ember-view.pv5.ph2')
# list = driver.find_element(By.CLASS_NAME, 'artdeco-pagination__indicator')

html = driver.page_source
soup = BeautifulSoup(html, features="html.parser")
Expand All @@ -111,7 +76,6 @@ def scrap_available_profie():
#print (designation.text)
Linked_in_designation.append(designation.text)


if(len(total_pages) == 0):
last_page = 1
else:
Expand All @@ -127,14 +91,14 @@ def scrap_available_profie():
driver.get(driver.current_url+'&page='+str(x))

time.sleep(4)
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
time.sleep(2)
# html = driver.page_source
# soup = BeautifulSoup(html)
# with open("output4.html", "w", encoding = 'utf-8') as file:
# file.write(str(soup.prettify()))
# pegination = driver.find_element(By.CSS_SELECTOR, '.artdeco-pagination.artdeco-pagination--has-controls.ember-view.pv5.ph2')
# list = driver.find_element(By.CLASS_NAME, 'artdeco-pagination__indicator')
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
time.sleep(2)
# html = driver.page_source
# soup = BeautifulSoup(html)
# with open("output4.html", "w", encoding = 'utf-8') as file:
# file.write(str(soup.prettify()))
# pegination = driver.find_element(By.CSS_SELECTOR, '.artdeco-pagination.artdeco-pagination--has-controls.ember-view.pv5.ph2')
# list = driver.find_element(By.CLASS_NAME, 'artdeco-pagination__indicator')

html = driver.page_source
soup = BeautifulSoup(html, features="html.parser")
Expand Down Expand Up @@ -162,10 +126,9 @@ def scrap_available_profie():
else:
count2 = count2 + 1


Linkedin_link = list(dict.fromkeys(Linkedin_link))
Linked_in_designation = list(dict.fromkeys(Linked_in_designation))

clean_linkedin_links = []
for L_l in Linkedin_link:
if 'headless?' not in L_l:
Expand All @@ -174,21 +137,17 @@ def scrap_available_profie():
print(len(clean_linkedin_links))
print(len(Linked_in_designation))


a = np.array(clean_linkedin_links)
b = np.array(Linked_in_designation)

df = pd.DataFrame({"Profile Link" : a})
df.to_csv(file_name+"_Profile_Link.csv", index=False)
df.to_csv(file_name+"_Profile_Link.csv", index=False)

df2 = pd.DataFrame({"Designation" : b})
df2.to_csv(file_name+"_Designation.csv", index=False)


df2.to_csv(file_name+"_Designation.csv", index=False)



# Entry point: authenticate first, give LinkedIn time to complete the
# post-login redirect (and any security checkpoint), then scrape results.
loging()
time.sleep(15)
scrap_available_profie()
# Fix: removed a stray trailing call to "srap_available_profie()" — a
# misspelled, undefined name that raised NameError after the scrape finished.