Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
venv
__pycache__
__pycache__
.env
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
# Project
# Project
testing/sk
1 change: 1 addition & 0 deletions SPARC
Submodule SPARC added at f30fe6
67 changes: 62 additions & 5 deletions app.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,73 @@
from flask import Flask, render_template, request, jsonify
from fpdf import FPDF
from docx import Document
import os

# Flask app configured to serve templates/static assets from the
# "Application" package directory rather than the default locations.
# (The earlier bare `app = Flask(__name__)` was a leftover diff line and
# made the first construction dead — removed.)
app = Flask(
    __name__,
    template_folder=os.path.join("Application", "templates"),
    static_folder=os.path.join("Application", "static"),
)


# In-memory, process-local store for the submitted profile links and the
# running chat transcript. NOTE(review): state is lost on restart and is
# shared across all clients — confirm that is acceptable.
data_store = {
    "linkedin": "",
    "github": "",
    "chat": [],
}

@app.route('/')
def index():
    """Render the single-page chat UI.

    (The diff remnant `def home():` above the renamed `def index():` left
    a def with no body — only the added `index` definition is kept.)
    """
    return render_template('index.html')

@app.route('/submit_links', methods=['POST'])
def submit_links():
    """Store the submitted LinkedIn and GitHub URLs in the in-memory store."""
    for field in ("linkedin", "github"):
        data_store[field] = request.form.get(field)
    return jsonify({"message": "Links submitted successfully!"})

@app.route('/chat', methods=['POST'])
def chat():
    """Record one chat turn and return a canned bot reply.

    Expects a JSON body with a "message" key; appends the user/bot pair to
    ``data_store["chat"]`` so /export can reproduce the transcript.
    (The previous handler body — a placeholder response with its own
    `return` — was a leftover removed diff hunk above the new code and has
    been dropped.)
    """
    user_message = request.json.get("message")
    bot_response = f"You said: {user_message}. Tell me more!"

    data_store["chat"].append({"user": user_message, "bot": bot_response})

    return jsonify({"response": bot_response})

@app.route('/export', methods=['POST'])
def export():
    """Write the stored links and chat transcript to a PDF or Word file.

    Reads ``export_type`` from the form ("pdf" or "word"), saves the file
    in the working directory, and returns a JSON status payload.
    """
    requested = request.form.get("export_type")

    if requested == "pdf":
        pdf = FPDF()
        pdf.add_page()
        pdf.set_font("Arial", size=12)
        # Header: centered title followed by the two profile links.
        pdf.cell(200, 10, txt="Chatbot Interaction Export", ln=True, align='C')
        pdf.cell(200, 10, txt=f"LinkedIn: {data_store['linkedin']}", ln=True)
        pdf.cell(200, 10, txt=f"GitHub: {data_store['github']}", ln=True)

        # One cell per line of the transcript, user turn first.
        for turn in data_store["chat"]:
            pdf.cell(200, 10, txt=f"User: {turn['user']}", ln=True)
            pdf.cell(200, 10, txt=f"Bot: {turn['bot']}", ln=True)

        pdf.output("chat_export.pdf")
        return jsonify({"message": "Exported as PDF successfully!", "file": "chat_export.pdf"})

    if requested == "word":
        doc = Document()
        doc.add_heading("Chatbot Interaction Export", level=1)
        doc.add_paragraph(f"LinkedIn: {data_store['linkedin']}")
        doc.add_paragraph(f"GitHub: {data_store['github']}")

        for turn in data_store["chat"]:
            doc.add_paragraph(f"User: {turn['user']}")
            doc.add_paragraph(f"Bot: {turn['bot']}")

        doc.save("chat_export.docx")
        return jsonify({"message": "Exported as Word successfully!", "file": "chat_export.docx"})

    return jsonify({"message": "Invalid export type!"})

# NOTE(review): debug=True enables the Werkzeug interactive debugger and
# auto-reload — fine for local development, but it must not be enabled in
# production (the debugger allows arbitrary code execution).
if __name__ == '__main__':
    app.run(debug=True)
74 changes: 74 additions & 0 deletions application/app/github.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
import json
import requests
import base64
import json

def save_json(filename, json_data):
    """Serialize *json_data* to ``<filename>.json`` in the working directory.

    The original passed ``indent=True``, which json treats as ``indent=1``
    (True == 1) — almost certainly not the intended pretty-print width.
    Uses a real indent and an explicit UTF-8 encoding.
    """
    with open('{}.json'.format(filename), 'w', encoding='utf-8') as fp:
        json.dump(json_data, fp, indent=4)


def extract_data(DataNeeded, DataFromGithub):
    """Return the subset of *DataFromGithub* whose keys appear in *DataNeeded*."""
    return {key: value for key, value in DataFromGithub.items() if key in DataNeeded}

class User:
    """Fetch public profile and repository data for one GitHub login via the REST API.

    NOTE(review): all requests are unauthenticated — the API rate limit is
    60 requests/hour per IP, and each repo costs one extra README request.
    """

    def __init__(self, Username):
        self.Username = Username
        # GitHub REST v3 user endpoint for this login.
        self.UserURL = 'https://api.github.com/users/{}'.format(self.Username)

    def get_user_stats(self):
        """Collect profile fields plus per-repo README excerpts.

        Side effects: sets ``self.UserData`` and writes ``output_of_User.json``.
        Returns the collected data as a pretty-printed JSON string.
        """
        # timeout= prevents an unresponsive API from hanging the call forever
        # (the original requests.get had no timeout).
        user_data = requests.get(self.UserURL, timeout=10).json()
        data_needed = [
            'name',
            'type',
            'company',
            'blog',
            'location',
            'email',
            'public_repos',
            'followers',
        ]
        self.UserData = extract_data(data_needed, user_data)

        repos_url = 'https://api.github.com/users/{}/repos'.format(self.Username)
        repos_data = requests.get(repos_url, timeout=10).json()
        # On errors (rate limit, unknown user) GitHub returns a dict with a
        # "message" key, not a list; iterating that dict would yield key
        # strings. Treat it as "no repositories".
        if not isinstance(repos_data, list):
            repos_data = []

        repos_list = []
        for repo in repos_data:
            repos_list.append({
                "name": repo.get("name"),
                "readme": self._fetch_readme(repo.get("name")),
            })

        self.UserData["repositories"] = repos_list

        save_json('output_of_User', self.UserData)

        return json.dumps(self.UserData, indent=4)

    def _fetch_readme(self, repo_name):
        """Return up to 5000 chars of the repo's README, or None if unavailable."""
        readme_url = "https://api.github.com/repos/{}/{}/readme".format(self.Username, repo_name)
        readme_response = requests.get(readme_url, timeout=10)
        if readme_response.status_code != 200:
            return None
        readme_json = readme_response.json()
        if "content" not in readme_json or readme_json.get("encoding") != "base64":
            return None
        try:
            decoded_content = base64.b64decode(readme_json["content"]).decode('utf-8')
        except Exception as e:
            # Keep the original best-effort behavior: report, don't raise.
            return "Error decoding README: {}".format(e)
        # Cap the excerpt so one huge README doesn't bloat the export.
        return decoded_content[:5000]

# username_input = 'username'
# user = User(username_input)
# data = user.get_user_stats()
# print(data)
161 changes: 161 additions & 0 deletions application/app/linkedin.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
import os
import json
import time
from dotenv import load_dotenv
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from selenium.webdriver.chrome.options import Options

# Chrome options for the module-level driver below.
chrome_options = Options()
chrome_options.add_argument("--disable-gpu")
# chrome_options.add_argument("--headless")
chrome_options.add_argument("--no-sandbox")

# NOTE(review): this launches a Chrome instance as a side effect of merely
# importing the module, and it appears redundant — the __main__ block
# rebinds `driver` via init_driver() and this instance is never quit.
# Confirm nothing else relies on it before removing.
driver = webdriver.Chrome(options=chrome_options)

# Credentials are read from a .env file / environment; both are None if
# the variables are unset, which would make linkedin_login fail.
load_dotenv()
LINKEDIN_USERNAME = os.getenv('LINKEDIN_USERNAME')
LINKEDIN_PASSWORD = os.getenv('LINKEDIN_PASSWORD')

def init_driver():
    """Create and return a maximized (non-headless) Chrome WebDriver."""
    opts = webdriver.ChromeOptions()
    # opts.add_argument('--headless')  # kept visible for manual debugging
    opts.add_argument('--no-sandbox')
    opts.add_argument('--disable-dev-shm-usage')
    browser = webdriver.Chrome(options=opts)
    browser.maximize_window()
    return browser

def linkedin_login(driver):
    """Sign in to LinkedIn using credentials from the environment."""
    driver.get("https://www.linkedin.com/login")
    wait = WebDriverWait(driver, 10)
    wait.until(EC.presence_of_element_located((By.ID, "username")))

    username_field = driver.find_element(By.ID, "username")
    username_field.send_keys(LINKEDIN_USERNAME)
    password_field = driver.find_element(By.ID, "password")
    password_field.send_keys(LINKEDIN_PASSWORD)
    driver.find_element(By.XPATH, "//button[@type='submit']").click()

    # The global nav search box only exists after a successful login.
    wait.until(EC.presence_of_element_located((By.ID, "global-nav-search")))

def _list_section(driver, section_id):
    """Return the text of each <li> inside the section *section_id*, or [] if absent."""
    try:
        section = driver.find_element(By.ID, section_id)
        return [item.text for item in section.find_elements(By.CSS_SELECTOR, "li")]
    except NoSuchElementException:
        return []


def scrape_linkedin_profile(driver, profile_url):
    """Scrapes LinkedIn profile data and returns a dictionary.

    Keys: name, company, education, licenses_certifications, projects,
    volunteering, skills, recommendations, honors_awards, posts_captions.
    Missing sections yield None (scalars) or [] (lists) instead of raising.

    The seven identical list-section try/except blocks of the original are
    folded into one data-driven loop over (key, section id) pairs; the dict
    key order is unchanged.
    """
    data = {}
    driver.get(profile_url)
    time.sleep(5)  # crude wait for the dynamic page — TODO: use WebDriverWait

    try:
        name_element = driver.find_element(By.CSS_SELECTOR, "h1.text-heading-xlarge")
        data['name'] = name_element.text
    except NoSuchElementException:
        data['name'] = None

    try:
        company_element = driver.find_element(By.XPATH, "//div[contains(@class, 'pv-entity__secondary-title')]")
        data['company'] = company_element.text.strip()
    except NoSuchElementException:
        data['company'] = None

    # All list-style sections share the same scrape pattern.
    for key, section_id in (
        ('education', 'education-section'),
        ('licenses_certifications', 'licenses-certifications-section'),
        ('projects', 'projects-section'),
        ('volunteering', 'volunteering-section'),
        ('skills', 'skills-section'),
        ('recommendations', 'recommendations-section'),
        ('honors_awards', 'honors-awards-section'),
    ):
        data[key] = _list_section(driver, section_id)

    try:
        posts_tab = driver.find_element(By.XPATH, "//a[contains(@href, '/detail/recent-activity/')]")
        posts_tab.click()
        time.sleep(5)  # Wait for posts to load
        posts = driver.find_elements(By.CSS_SELECTOR, "div.feed-shared-update-v2")
        posts_captions = []
        for post in posts:
            try:
                caption = post.find_element(By.CSS_SELECTOR, "span.break-words").text
                posts_captions.append(caption)
            except NoSuchElementException:
                # Posts without a text caption (e.g. pure reshares) are skipped.
                continue
        data['posts_captions'] = posts_captions
    except NoSuchElementException:
        data['posts_captions'] = []

    return data

def save_data(data, json_filename="linkedin_data.json", txt_filename="linkedin_data.txt"):
    """Persist the scraped profile *data* as pretty-printed JSON plus a plain-text dump."""
    with open(json_filename, "w", encoding="utf-8") as jf:
        json.dump(data, jf, indent=4, ensure_ascii=False)

    # One "key:\n<value>\n\n" paragraph per field, in insertion order.
    paragraphs = [f"{key}:\n{value}\n\n" for key, value in data.items()]
    with open(txt_filename, "w", encoding="utf-8") as tf:
        tf.write("".join(paragraphs))

if __name__ == "__main__":
    # profile_url = input("Enter the LinkedIn profile URL to scrape: ")
    # NOTE(review): a real person's profile URL is hard-coded for testing —
    # remove before sharing/committing, and restore the input() prompt.
    profile_url ="https://www.linkedin.com/in/sanskriti-joshi-408575266/"
    driver = init_driver()
    try:
        linkedin_login(driver)
        time.sleep(3)
        scraped_data = scrape_linkedin_profile(driver, profile_url)
        save_data(scraped_data)
        print("Scraping complete. Data saved in 'linkedin_data.json' and 'linkedin_data.txt'.")
    finally:
        try:
            driver.quit()
        # OSError here covers chromedriver's service process already being
        # gone; it is deliberately ignored so cleanup never masks an error.
        except OSError as e:
            print("Error terminating service process (ignored):", e)

Loading