-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscript.py
More file actions
78 lines (65 loc) · 2.49 KB
/
script.py
File metadata and controls
78 lines (65 loc) · 2.49 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
# Simple script to scrape a website and save the links
# IMPORTANT NOTE: use export USER="whatever" and export PASS="whatever" before running the script (LINUX)
import os
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.options import Options
from selenium.webdriver.edge.service import Service
# Log in to the placement portal with a headless Edge browser, scrape the
# companies table, and save each company link to companies.csv (one per line).
url = "http://placement.daiict.ac.in/Student/Companies"

options = Options()
options.use_chromium = True
# FIX: the original did `options.add_argument = "--headless"`, which replaced
# the add_argument *method* with a string and never set the flag. It must be
# called as a method.
options.add_argument("--headless")
# Set the path to your Edge browser binary if needed:
# options.binary_location = "/usr/bin/microsoft-edge"
# service = Service(executable_path="/usr/bin/microsoft-edge")  # Update this path as needed

driver = webdriver.Edge(options=options)
try:
    driver.get(url)
    time.sleep(5)  # let the JS load

    # Find username & password input fields
    username_input = driver.find_element(By.ID, "username")
    password_input = driver.find_element(By.ID, "password")

    # FIX: the original typed the literal strings "USER"/"PASS" into the form.
    # Per the header comment, credentials come from the environment
    # (export USER=... / export PASS=... before running).
    username_input.send_keys(os.environ["USER"])
    password_input.send_keys(os.environ["PASS"])

    # Submit the form (click the login button)
    login_button = driver.find_element(By.XPATH, '//input[@type="submit"]')
    login_button.click()
    time.sleep(3)

    # Re-request the companies page now that the session is authenticated
    driver.get(url)
    time.sleep(3)

    # Snapshot the rendered page and parse it statically
    soup = BeautifulSoup(driver.page_source, 'html.parser')
finally:
    # FIX: the original leaked the browser process; always shut it down.
    driver.quit()

# Collect the href of the first <a> tag in each table row
companies_list = []
for row in soup.table.tbody.find_all("tr"):
    a_tag = row.find("a")
    if a_tag and a_tag.has_attr("href"):
        companies_list.append(a_tag["href"])

# Write one link per line
with open("companies.csv", "w") as csvfile:
    for line in companies_list:
        csvfile.write(line)
        csvfile.write("\n")
# Fix for the IndexError issue
def safe_extract_data(rows):
    """Return the text of the third <td> cell from each table row.

    Rows with fewer than three <td> cells are skipped, which avoids the
    IndexError a blind ``row.find_all('td')[2]`` would raise on short or
    header rows. As a quick sanity check, the count of rows that have at
    least one cell and the count of extracted values are printed.
    """
    # Count rows that contain at least one <td> (first-cell text, debug only).
    head = [r.find('td').get_text(strip=False) for r in rows if r.find('td')]
    print(len(head))

    # Third-cell text, guarded against rows with fewer than three cells.
    value = [
        cells[2].get_text(strip=False)
        for cells in (r.find_all('td') for r in rows)
        if len(cells) > 2
    ]
    print(len(value))
    return value
# Example usage:
# rows = soup.find_all('tr')
# values = []
# value = safe_extract_data(rows)
# values.append(value)