scape_linkedin.py
import time

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
COMPANY = 222852511            # Careem's numeric LinkedIn company id
COUNTRY = "pk"                 # geo region code for Pakistan
EMAIL = input("Email: ")
PASSWORD = input("Password: ")
EMPLOYEE = 1000                # maximum number of profiles to collect
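# COMPANY and COUNTRY are spliced into the facetCurrentCompany and
# facetGeoRegion query parameters of the people-search URL built below; both
# ids can typically be read off the address bar after applying the same
# filters manually on linkedin.com.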
linkedin = 'https://www.linkedin.com'
# Open browser
browser = webdriver.Chrome(service=Service('C:/Users/Aksa/Downloads/chromedriver_win32/chromedriver.exe'))
browser.get(linkedin)
time.sleep(6)
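# A fixed sleep is a crude way to wait for the login form to render; an
# explicit wait is more robust, e.g. (sketch, assuming the same Selenium 4
# install):
#     from selenium.webdriver.support.ui import WebDriverWait
#     from selenium.webdriver.support import expected_conditions as EC
#     WebDriverWait(browser, 10).until(
#         EC.presence_of_element_located((By.NAME, 'session_key')))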
# Log in to the website
email = browser.find_element(By.NAME, 'session_key')
password = browser.find_element(By.NAME, 'session_password')
email.send_keys(EMAIL)
password.send_keys(PASSWORD + Keys.RETURN)
time.sleep(3)
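# Note: accounts with two-step verification will stall at this point; one
# workaround is to complete the challenge manually in the opened browser
# window before the search starts.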
search = "https://www.linkedin.com/search/results/people/?facetCurrentCompany=%5B%"+ str(COMPANY) +"%22%5D&facetGeoRegion=%5B%22" + str(COUNTRY) + "%3A0%22%5D&origin=FACETED_SEARCH"
browser.get(search)
time.sleep(3)
browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
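# The results on each page are lazy-loaded; scrolling to the bottom forces
# all of the result cards to render before the page source is parsed.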
current_url = 'url_placeholder'  # sentinel so the first end-of-search check never matches
# Create empty dataframe
df = pd.DataFrame(columns=['name', 'title', 'location', 'profile'])
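# Declaring the columns up front keeps the schema stable as the page-level
# frames are concatenated in below.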
# Go through pages and download data
while True:
    # Stop once the URL shows the 100th (last available) page of the search
    if current_url.find('page=100') != -1:
        break
    # Stop if the URL has not changed since the last iteration (end of search)
    previous_url = current_url
    current_url = browser.current_url
    if current_url == previous_url:
        break
    # Start scraping and filling in the dataframe
    time.sleep(5)
    page = BeautifulSoup(browser.page_source, 'html.parser')
    page_names = page.find_all('span', class_='actor-name')
    page_titles = page.find_all('p', class_='subline-level-1')
    page_locations = page.find_all('p', class_='subline-level-2')
    page_profiles = page.find_all('a', class_='search-result__result-link')
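    # These class names (actor-name, subline-level-1, subline-level-2,
    # search-result__result-link) match LinkedIn's markup at the time this
    # script was written; they are likely to break whenever LinkedIn changes
    # its front end.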
    # Put scraped data into a dataframe
    names = list(map(lambda x: x.text, page_names))
    titles = list(map(lambda x: x.text.replace('\n', ''), page_titles))
    locations = list(map(lambda x: x.text.replace('\n', ''), page_locations))
    # Each result card carries two matching profile links (photo and name),
    # so every second one is kept
    profiles = list(map(lambda x: linkedin + x['href'], page_profiles))[::2]
    temp = pd.DataFrame({'name': names, 'title': titles, 'location': locations, 'profile': profiles})
    # Filter out anonymous results, which all appear under the name 'LinkedIn Member'
    temp = temp[temp['name'] != 'LinkedIn Member']
    # Append the new rows to df (DataFrame.append was removed in pandas 2.0)
    df = pd.concat([df, temp], ignore_index=True)
    # Stop once the number of retrieved records reaches the limit
    if df.shape[0] >= EMPLOYEE:
        break
    # Go to the next page ([-2] picks the 'Next' button's label span)
    nextt = browser.find_elements(By.CSS_SELECTOR, "button span.artdeco-button__text")[-2]
    browser.execute_script("arguments[0].click();", nextt)
    time.sleep(5)
# Reset the dataframe index (reassigned, since reset_index returns a copy)
df = df.reset_index(drop=True)
# Save the results to a CSV file
df.to_csv("output_search.csv", index=False)
# Close browser
browser.quit()
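# For unattended runs, the browser could be started headless instead; a
# minimal sketch, assuming the same Selenium 4 install:
#     options = webdriver.ChromeOptions()
#     options.add_argument('--headless=new')
#     browser = webdriver.Chrome(options=options)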