Skip to content

Commit 27ea6dc

Browse files
authored
Style(crawling): remove redundant code and merge scrapers into one file (#114)
1 parent 28e63e7 commit 27ea6dc

29 files changed

+211
-6762
lines changed

backend/crawling/01_DB_test.ipynb

Lines changed: 0 additions & 511 deletions
This file was deleted.

backend/crawling/README.md

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -6,15 +6,11 @@
66

77
## 🗂 개요
88
spectrackr의 기반이 되는 데이터베이스를 구축하기 위한 전처리 및 크롤링 파이프라인입니다.
9-
총 3가지 핵심 데이터를 수집합니다:
10-
• ✅ 합격자 정보 (applicants)
11-
• ✅ 자격증 정보 (certifications)
12-
• ✅ 기업의 모집 요강 (job_postings)
9+
1310

1411

1512
## 🧩 기능
16-
### 1. 합격자 크롤링
17-
###
13+
1814

1915
## 실행 순서
2016

backend/crawling/base SQL.sql

Lines changed: 0 additions & 37 deletions
This file was deleted.

backend/crawling/certifications.csv

Lines changed: 0 additions & 14 deletions
This file was deleted.

backend/crawling/cleaned_successful_applicants_specs.csv

Lines changed: 0 additions & 11 deletions
This file was deleted.

backend/crawling/crawl_jikhang.py

Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
"""Crawl IT job postings for one job category from zighang.com.

Opens the listing page, applies the job-category filter, then visits up to
MAX_CLICKS postings (each opens in a new browser tab), scrapes the company /
requirement fields, and saves the collected rows to an Excel file.
"""
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas as pd

# Chrome setup ("detach" keeps the browser window open after the script ends).
options = Options()
options.add_experimental_option("detach", True)
driver = webdriver.Chrome(options=options)
driver.set_window_size(1280, 1024)
wait = WebDriverWait(driver, 20)

# Open the listing page.
driver.get("https://zighang.com/it")
time.sleep(2)

# Job category to filter by.
job_name = "서버·백엔드"

# Open the job-category filter panel.
arrow_xpath = '//*[@id="root"]/main/div[3]/div/div/div/div/div[2]/div/section/button[2]/div/img'
wait.until(EC.element_to_be_clickable((By.XPATH, arrow_xpath))).click()
print("직무 필터 열기 성공")
time.sleep(1)

# Select the job category.
job_button_xpath = f'//button[normalize-space()="{job_name}"]'
wait.until(EC.element_to_be_clickable((By.XPATH, job_button_xpath))).click()
print(f" 직무 '{job_name}' 선택 완료")
time.sleep(1)

# Click the "view postings" confirm button (JS click avoids overlay issues).
confirm_button = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'div.sticky.bottom-0 button.bg-primary')))
driver.execute_script("arguments[0].scrollIntoView(true);", confirm_button)
driver.execute_script("arguments[0].click();", confirm_button)
print("공고 보기 버튼 클릭 완료")
wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'p.ds-web-title2')))
print("공고 리스트 로딩 완료")

# Iterate over postings. Starts at 2 — presumably index 1 is not a real
# posting card; TODO confirm against the live page layout.
MAX_CLICKS = 10
original_tab = driver.current_window_handle
results = []

for i in range(2, MAX_CLICKS + 1):
    try:
        title_xpath = f'(//p[contains(@class, "ds-web-title2")])[{i}]'
        title_elem = wait.until(EC.presence_of_element_located((By.XPATH, title_xpath)))
        driver.execute_script("arguments[0].scrollIntoView(true);", title_elem)
        driver.execute_script("window.scrollBy(0, -200);")  # undo sticky-header overlap
        time.sleep(0.3)
        parent_link = title_elem.find_element(By.XPATH, "./ancestor::a[1]")
        driver.execute_script("arguments[0].click();", parent_link)
        print(f"▶️ [{i}]번째 공고 클릭 → 새 탭 열림 예상")
        time.sleep(2)

        # Switch to the newly opened posting tab.
        new_tab = [tab for tab in driver.window_handles if tab != original_tab][0]
        driver.switch_to.window(new_tab)

        # Scrape the posting details.
        data = {}
        data["회사명"] = driver.find_element(By.XPATH, '//*[@id="root"]/main/div[2]/div[1]/div[1]/div[1]/div[1]/div/a').text
        data["경력"] = driver.find_element(By.XPATH, '//*[@id="root"]/main/div[2]/div[1]/div[1]/div[1]/div[5]/div/section/div[1]/div/div').text
        data["학력"] = driver.find_element(By.XPATH, '//*[@id="root"]/main/div[2]/div[1]/div[1]/div[1]/div[5]/div/section/div[3]/div/div').text
        data["근무지"] = driver.find_element(By.XPATH, '//*[@id="root"]/main/div[2]/div[1]/div[1]/div[1]/div[5]/div/section/div[2]/div/div').text
        data["직군"] = job_name

        # Optional sections: narrow to NoSuchElementException so real errors
        # (and Ctrl-C) are not silently swallowed by a bare except.
        try:
            data["우대사항"] = driver.find_element(By.XPATH, '//h2[text()="우대사항"]/following-sibling::p').text
            print(f"우대사항 크롤링 완료: {data['우대사항'][:10]}...")
        except NoSuchElementException:
            data["우대사항"] = ""

        try:
            data["자격요건"] = driver.find_element(By.XPATH, '//h2[text()="자격요건"]/following-sibling::p').text
            print(f"자격요건 크롤링 완료: {data['자격요건'][:10]}...")
        except NoSuchElementException:
            data["자격요건"] = ""

        # Postings with no text sections usually embed the details as an image.
        if data["우대사항"] == "" and data["자격요건"] == "":
            try:
                img_elem = driver.find_element(By.XPATH, '//*[@id="root"]/main/div[2]/div[1]/div[1]/div[4]/img')
                data["이미지경로"] = img_elem.get_attribute("src")
                print("이미지 URL 저장 완료")
            except NoSuchElementException:
                data["이미지경로"] = ""
                print("이미지 URL 저장 실패")
        else:
            data["이미지경로"] = ""

        results.append(data)
        driver.close()
        driver.switch_to.window(original_tab)
        print(f"🔙 기존 탭 복귀 완료\n")

    except Exception as e:
        print(f"[{i}]번째 공고 실패: {e}")
        # BUGFIX: close any stray posting tab and return to the listing tab;
        # without this, one failure after the tab opened breaks the
        # window-handle selection of every following iteration.
        for tab in driver.window_handles:
            if tab != original_tab:
                driver.switch_to.window(tab)
                driver.close()
        driver.switch_to.window(original_tab)
        continue

# Save results to Excel.
df = pd.DataFrame(results)
df.to_excel("직행_크롤링_결과.xlsx", index=False)
print("엑셀 저장 완료")
Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
"""Crawl open recruiting postings from linkareer.com list pages.

Walks list pages 1..5, opens each posting row in a new tab, scrapes the
company / position / qualification fields, and writes the rows to a CSV.
"""
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time  # BUGFIX: was imported twice; one import suffices

import pandas as pd

# Chrome setup ("detach" keeps the browser window open after the script ends).
chrome_options = Options()
chrome_options.add_experimental_option("detach", True)
driver = webdriver.Chrome(options=chrome_options)
wait = WebDriverWait(driver, 10)

results = []

# Page range to crawl (pages 1-5; widen the range as needed).
for page in range(1, 6):
    list_url = f"https://linkareer.com/list/recruit?filterBy_activityTypeID=5&filterBy_categoryIDs=58&filterBy_status=OPEN&orderBy_direction=DESC&orderBy_field=RECENT&page={page}"
    driver.get(list_url)
    time.sleep(2)

    print(f"📄 {page}페이지 접속 완료")

    # Remember the main (listing) tab so we can return to it.
    main_window = driver.current_window_handle

    # Number of posting rows on this page.
    row_count = len(driver.find_elements(By.XPATH, '//*[@id="__next"]/div[1]/div/main/div/section/div[2]/table/tbody/tr'))
    print(f"{row_count}개의 공고 탐색 예정")

    for i in range(1, row_count + 1):
        try:
            link_element = driver.find_element(By.XPATH, f'//*[@id="__next"]/div[1]/div/main/div/section/div[2]/table/tbody/tr[{i}]/td[2]/div/a/div/p')
            link_element.click()

            # The posting opens in a new tab; switch to it.
            driver.switch_to.window(driver.window_handles[-1])

            # Basic fields.
            company_name = wait.until(EC.presence_of_element_located((By.XPATH, '//*[@id="__next"]/div[1]/div/main/div/div/section[1]/div/article/header/h2'))).text.strip()
            company_type = wait.until(EC.presence_of_element_located((By.XPATH, '//*[@id="__next"]/div[1]/div/main/div/div/section[1]/div/article/div/dl[1]/dd'))).text.strip()

            position_element = wait.until(EC.presence_of_element_located((By.XPATH, '//*[@id="__next"]/div[1]/div/main/div/div/section[1]/div/article/div/dl[5]/dd')))
            position = position_element.text.strip()

            # Collect the text of every <p> in the detail section.
            p_elements = driver.find_elements(By.XPATH, '//*[@id="DETAIL"]/section[1]/div/p')
            p_texts = [p.text.strip() for p in p_elements]

            # Locate the "qualifications" and "position detail" headings.
            qual_idx = next((idx for idx, text in enumerate(p_texts) if '자격요건' in text or '자격 요건' in text), None)
            pos_idx = next((idx for idx, text in enumerate(p_texts) if '모집 직무' in text or '세부 직무' in text), None)

            # Qualification text: paragraphs after the heading, up to the next
            # section heading keyword.
            qualification_texts = []
            if qual_idx is not None:
                for t in p_texts[qual_idx + 1:]:
                    if any(keyword in t for keyword in ['지원', '혜택', '우대', '다음', '근무']):
                        break
                    qualification_texts.append(t)
            # Joined OUTSIDE the `if` so `qualification` is always defined,
            # even when no heading was found on the page.
            qualification = "\n".join(qualification_texts)

            # Detail position is the paragraph right after its heading.
            detail_position = ''
            if pos_idx is not None:
                detail_position = p_texts[pos_idx + 1] if pos_idx + 1 < len(p_texts) else ''

            results.append({
                '회사명': company_name,
                '기업형태': company_type,
                '모집직무': position,
                '세부직무': detail_position,
                '자격요건': qualification
            })

            print(f"{company_name} ({i}/{row_count}, page {page}) 크롤링 완료")

            # Close the posting tab and return to the listing.
            driver.close()
            driver.switch_to.window(main_window)

            time.sleep(1)

        except Exception as e:
            print(f"{page}페이지 {i}번째 공고 오류: {e}")
            # BUGFIX: explicitly switch to the stray tab before closing so we
            # never close the main listing tab by accident, then recover.
            if len(driver.window_handles) > 1:
                driver.switch_to.window(driver.window_handles[-1])
                driver.close()
            driver.switch_to.window(main_window)
            continue

driver.quit()

# Convert to DataFrame and save.
df = pd.DataFrame(results)
df.to_csv('linkareer_crawling.csv', index=False, encoding='utf-8-sig')

print("크롤링 완료! CSV 저장됨.")

0 commit comments

Comments
 (0)