Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
466 changes: 443 additions & 23 deletions backend/crawling/crawling_demo_jikhaeng.ipynb

Large diffs are not rendered by default.

Binary file added backend/crawling/downloads/NHN_공고8.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added backend/crawling/downloads/코웨이_공고7.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added backend/crawling/downloads/쿠팡_공고2.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added backend/crawling/downloads/클로봇_공고6.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added backend/crawling/직행_크롤링_결과.xlsx
Binary file not shown.
Binary file not shown.
Empty file.
Empty file.
26 changes: 26 additions & 0 deletions backend/test_crawl/insert_to_db.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
import os
from dotenv import load_dotenv
from sqlalchemy import create_engine, text
import pandas as pd


# 1. Load environment variables from a local .env file.
load_dotenv()
DATABASE_URL = os.getenv("DATABASE_URL")
print("🔗 DATABASE_URL:", DATABASE_URL)

# BUG FIX: create_engine() was called outside the try/except, so a missing
# DATABASE_URL crashed with an opaque SQLAlchemy ArgumentError.  Fail fast
# with a clear message instead.
if not DATABASE_URL:
    raise RuntimeError("DATABASE_URL is not set; add it to your .env file")

# 2. Create the SQLAlchemy engine.
engine = create_engine(DATABASE_URL)

# 3. Connection smoke test + sample query.
try:
    with engine.connect() as conn:
        # SQLAlchemy 2.0+ requires raw SQL to be wrapped in text().
        conn.execute(text("SELECT 1;"))
        print("✅ DB 연결 성공")

    # Fetch only 5 rows as a sanity check on the table contents.
    df = pd.read_sql("SELECT * FROM certifications LIMIT 5;", con=engine)
    print("📦 certifications 테이블 샘플:\n", df)

except Exception as e:
    print("❌ DB 연결 실패:", e)
173 changes: 173 additions & 0 deletions backend/test_crawl/recruit_crawler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,173 @@
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time

def crawl_zighang(job_name="서버·백엔드", max_clicks=10):
    """Crawl IT job postings from zighang.com filtered by job category.

    Opens https://zighang.com/it, selects the *job_name* category in the
    filter dialog, then opens each posting card (indices 2..max_clicks) in a
    new tab and scrapes company / experience / education / location plus the
    preferred-qualification and requirement sections.  Image-only postings
    fall back to storing the posting image URL.

    Args:
        job_name: Visible label of the category filter button to click.
        max_clicks: Highest posting index (inclusive) to visit.

    Returns:
        pandas.DataFrame with one row per posting scraped successfully;
        failed postings are skipped with a console message.
    """
    options = Options()
    options.add_argument("--headless")  # remove for a visible browser
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    driver = webdriver.Chrome(options=options)
    driver.set_window_size(1280, 1024)
    wait = WebDriverWait(driver, 20)

    try:
        driver.get("https://zighang.com/it")
        time.sleep(2)

        # Open the job-category filter dialog.
        arrow_xpath = '//*[@id="root"]/main/div[3]/div/div/div/div/div[2]/div/section/button[2]/div/img'
        wait.until(EC.element_to_be_clickable((By.XPATH, arrow_xpath))).click()
        time.sleep(1)

        # Pick the requested category button by its visible text.
        job_button_xpath = f'//button[normalize-space()="{job_name}"]'
        wait.until(EC.element_to_be_clickable((By.XPATH, job_button_xpath))).click()
        time.sleep(1)

        # Confirm via JS click: the button sits in a sticky footer that can
        # be overlapped by other elements.
        confirm_button = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'div.sticky.bottom-0 button.bg-primary')))
        driver.execute_script("arguments[0].scrollIntoView(true);", confirm_button)
        driver.execute_script("arguments[0].click();", confirm_button)
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'p.ds-web-title2')))

        original_tab = driver.current_window_handle
        results = []

        for i in range(2, max_clicks + 1):
            try:
                title_xpath = f'(//p[contains(@class, "ds-web-title2")])[{i}]'
                title_elem = wait.until(EC.presence_of_element_located((By.XPATH, title_xpath)))
                driver.execute_script("arguments[0].scrollIntoView(true);", title_elem)
                driver.execute_script("window.scrollBy(0, -200);")
                time.sleep(0.3)

                # The card's <a> ancestor opens the posting in a new tab.
                parent_link = title_elem.find_element(By.XPATH, "./ancestor::a[1]")
                driver.execute_script("arguments[0].click();", parent_link)
                time.sleep(2)

                new_tab = [tab for tab in driver.window_handles if tab != original_tab][0]
                driver.switch_to.window(new_tab)

                data = {}
                data["회사명"] = driver.find_element(By.XPATH, '//*[@id="root"]/main/div[2]/div[1]/div[1]/div[1]/div[1]/div/a').text
                data["경력"] = driver.find_element(By.XPATH, '//*[@id="root"]/main/div[2]/div[1]/div[1]/div[1]/div[5]/div/section/div[1]/div/div').text
                data["학력"] = driver.find_element(By.XPATH, '//*[@id="root"]/main/div[2]/div[1]/div[1]/div[1]/div[5]/div/section/div[3]/div/div').text
                data["근무지"] = driver.find_element(By.XPATH, '//*[@id="root"]/main/div[2]/div[1]/div[1]/div[1]/div[5]/div/section/div[2]/div/div').text
                data["직군"] = job_name

                # BUG FIX: these were bare "except:" clauses, which also
                # swallow KeyboardInterrupt/SystemExit; narrowed to Exception.
                try:
                    data["우대사항"] = driver.find_element(By.XPATH, '//h2[text()="우대사항"]/following-sibling::p').text
                except Exception:
                    data["우대사항"] = ""

                try:
                    data["자격요건"] = driver.find_element(By.XPATH, '//h2[text()="자격요건"]/following-sibling::p').text
                except Exception:
                    data["자격요건"] = ""

                # Image-only postings: fall back to the posting image URL.
                if data["우대사항"] == "" and data["자격요건"] == "":
                    try:
                        img_elem = driver.find_element(By.XPATH, '//*[@id="root"]/main/div[2]/div[1]/div[1]/div[4]/img')
                        data["이미지경로"] = img_elem.get_attribute("src")
                    except Exception:
                        data["이미지경로"] = ""
                else:
                    data["이미지경로"] = ""

                results.append(data)
                driver.close()
                driver.switch_to.window(original_tab)

            except Exception as e:
                print(f"❌ [{i}]번째 공고 실패: {e}")
                # BUG FIX: previously nothing restored the window state, so a
                # failure after the new tab opened left the driver on (or
                # with) an orphan tab and every later iteration failed.
                # Close stray tabs and return to the listing tab, matching
                # crawl_linkareer's recovery behaviour.
                for handle in driver.window_handles:
                    if handle != original_tab:
                        driver.switch_to.window(handle)
                        driver.close()
                driver.switch_to.window(original_tab)
                continue

        return pd.DataFrame(results)

    finally:
        driver.quit()




def crawl_linkareer(max_pages=5):
    """Crawl open recruitment postings from linkareer.com.

    Iterates listing pages 1..max_pages, opens each posting row in a new
    tab, and scrapes company name, company type, position, detailed position
    and qualification paragraphs from the detail view.

    Args:
        max_pages: Number of listing pages to visit.

    Returns:
        pandas.DataFrame with columns 회사명 / 기업형태 / 모집직무 /
        세부직무 / 자격요건, one row per posting scraped successfully.
    """
    chrome_options = Options()
    chrome_options.add_experimental_option("detach", True)
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    # chrome_options.add_argument("--headless")  # uncomment when needed
    driver = webdriver.Chrome(options=chrome_options)
    wait = WebDriverWait(driver, 10)

    results = []

    try:
        for page in range(1, max_pages + 1):
            list_url = f"https://linkareer.com/list/recruit?filterBy_activityTypeID=5&filterBy_categoryIDs=58&filterBy_status=OPEN&orderBy_direction=DESC&orderBy_field=RECENT&page={page}"
            driver.get(list_url)
            time.sleep(2)
            print(f"📄 {page}페이지 접속 완료")

            main_window = driver.current_window_handle
            row_count = len(driver.find_elements(By.XPATH, '//*[@id="__next"]/div[1]/div/main/div/section/div[2]/table/tbody/tr'))
            print(f"🔍 {row_count}개의 공고 탐색 예정")

            for i in range(1, row_count + 1):
                try:
                    link_element = driver.find_element(By.XPATH, f'//*[@id="__next"]/div[1]/div/main/div/section/div[2]/table/tbody/tr[{i}]/td[2]/div/a/div/p')
                    link_element.click()

                    # The click opens the posting in a new tab.
                    driver.switch_to.window(driver.window_handles[-1])

                    company_name = wait.until(EC.presence_of_element_located(
                        (By.XPATH, '//*[@id="__next"]/div[1]/div/main/div/div/section[1]/div/article/header/h2'))).text.strip()
                    company_type = wait.until(EC.presence_of_element_located(
                        (By.XPATH, '//*[@id="__next"]/div[1]/div/main/div/div/section[1]/div/article/div/dl[1]/dd'))).text.strip()
                    position_element = wait.until(EC.presence_of_element_located(
                        (By.XPATH, '//*[@id="__next"]/div[1]/div/main/div/div/section[1]/div/article/div/dl[5]/dd')))
                    position = position_element.text.strip()

                    p_elements = driver.find_elements(By.XPATH, '//*[@id="DETAIL"]/section[1]/div/p')
                    p_texts = [p.text.strip() for p in p_elements]

                    # Locate the "qualifications" and "position" headings.
                    qual_idx = next((idx for idx, text in enumerate(p_texts) if '자격요건' in text or '자격 요건' in text), None)
                    pos_idx = next((idx for idx, text in enumerate(p_texts) if '모집 직무' in text or '세부 직무' in text), None)

                    # Collect paragraphs after the heading until the next
                    # section keyword appears.
                    qualification_texts = []
                    if qual_idx is not None:
                        for t in p_texts[qual_idx + 1:]:
                            if any(keyword in t for keyword in ['지원', '혜택', '우대', '다음', '근무']):
                                break
                            qualification_texts.append(t)
                    qualification = "\n".join(qualification_texts)

                    detail_position = ''
                    if pos_idx is not None and pos_idx + 1 < len(p_texts):
                        detail_position = p_texts[pos_idx + 1]

                    results.append({
                        '회사명': company_name,
                        '기업형태': company_type,
                        '모집직무': position,
                        '세부직무': detail_position,
                        '자격요건': qualification
                    })

                    print(f"✅ {company_name} ({i}/{row_count}, page {page}) 크롤링 완료")
                    driver.close()
                    driver.switch_to.window(main_window)
                    time.sleep(1)

                except Exception as e:
                    print(f"❌ {page}페이지 {i}번째 공고 오류: {e}")
                    # BUG FIX: the old recovery called driver.close() on
                    # whatever window was *current* — if the failure happened
                    # before switching to the new tab, that closed the main
                    # listing window and broke every later iteration.  Close
                    # only the non-main tabs, then return to the listing.
                    for handle in driver.window_handles:
                        if handle != main_window:
                            driver.switch_to.window(handle)
                            driver.close()
                    driver.switch_to.window(main_window)
                    continue

        return pd.DataFrame(results)

    finally:
        driver.quit()
19 changes: 19 additions & 0 deletions backend/test_crawl/selenium_basic/app.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
import chromedriver_autoinstaller

# Install a chromedriver matching the locally installed Chrome version.
chromedriver_autoinstaller.install()

driver = webdriver.Chrome()
try:
    # 1. Open the target page with driver.get().
    driver.get("https://www.naver.com")
    time.sleep(3)

    # 2. Locate the shortcut element with find_element() + CSS selector.
    css_selector = "#shortcutArea > ul > li:nth-child(8) > a > span.service_name"
    group_navigation = driver.find_element(By.CSS_SELECTOR, css_selector)

    print(group_navigation.text)
    group_navigation.click()
    input()  # keep the browser open until the user presses Enter
finally:
    # BUG FIX: the browser process was never released before, leaking a
    # Chrome/chromedriver pair on every run (including error paths).
    driver.quit()
65 changes: 65 additions & 0 deletions backend/test_crawl/selenium_basic/selenium_tools.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
# BUG FIX: the module path was misspelled "webrdriver", so this script
# failed with ImportError before doing anything.
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

import chromedriver_autoinstaller

# Install a chromedriver matching the locally installed Chrome version.
chromedriver_autoinstaller.install()

driver = webdriver.Chrome()
driver.get("https://www.naver.com")

# # 1. Navigation tools (moving between pages)
# # get, back, forward and refresh move the browser between pages.

# # 1-1. get(): navigate to the given URL
# driver.get("https://www.naver.com")
# time.sleep(1)
# driver.get("https://www.google.com")

# # 1-2. back(): go back to the previous page
# driver.back()
# time.sleep(2)

# # 1-3. forward(): go forward to the next page
# driver.forward()
# time.sleep(2)

# # 1-4. refresh(): reload the current page
# driver.refresh()
# time.sleep(2)
# print("동작 끝")
# input()

# # 2. Browser information
# # 2-1. title: the current page title
# title = driver.title
# print("제목:", title)
# # 2-2. current_url: the current page URL
# current_url = driver.current_url
# print("현재 URL:", current_url)

# if "nid.naver.com" in current_url:
#     print("지금은 로그인 하는 로직이 필요함")
# else:
#     print("네이버 로그인 페이지가 아닙니다.")


# 3. Explicit driver wait
# 3-1. If the element finishes loading early (e.g. after ~3 s), the wait
#      returns as soon as it is found.
# 3-2. Otherwise it keeps polling for up to 30 seconds.
# 3-3. Past 30 seconds a TimeoutException is raised.

try:
    selector = "#shortcutArea > ul > li:nth-child(8) > a > span.service_name"
    # BUG FIX: presence_of_element_located() takes ONE (By, selector) locator
    # tuple; passing two positional arguments raised TypeError.
    WebDriverWait(driver, 30).until(EC.presence_of_element_located(
        (By.CSS_SELECTOR, selector)
    ))
except Exception:
    # Narrowed from a bare "except:" so Ctrl-C / SystemExit still propagate.
    print("예외 발생, 예외 처리 코드 실행하기")
else:
    # BUG FIX: the success messages previously printed even when the wait
    # timed out; they now run only when the element was actually found.
    print("엘리먼트 로딩 끝")
    print("다음 코드 실행")

input()
Loading