Skip to content

Commit 27ea6dc

Browse files
authored
Style(crawling): remove redundant code and merge scrapers into one file (#114)
1 parent 28e63e7 commit 27ea6dc

29 files changed

+211
-6762
lines changed

backend/crawling/01_DB_test.ipynb

Lines changed: 0 additions & 511 deletions
This file was deleted.

backend/crawling/README.md

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -6,15 +6,11 @@
66

77
## 🗂 개요
88
spectrackr의 기반이 되는 데이터베이스를 구축하기 위한 전처리 및 크롤링 파이프라인입니다.
9-
총 3가지 핵심 데이터를 수집합니다:
10-
• ✅ 합격자 정보 (applicants)
11-
• ✅ 자격증 정보 (certifications)
12-
• ✅ 기업의 모집 요강 (job_postings)
9+
1310

1411

1512
## 🧩 기능
16-
### 1. 합격자 크롤링
17-
###
13+
1814

1915
## 실행 순서
2016

backend/crawling/base SQL.sql

Lines changed: 0 additions & 37 deletions
This file was deleted.

backend/crawling/certifications.csv

Lines changed: 0 additions & 14 deletions
This file was deleted.

backend/crawling/cleaned_successful_applicants_specs.csv

Lines changed: 0 additions & 11 deletions
This file was deleted.

backend/crawling/crawl_jikhang.py

Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
"""Crawl IT job postings for one job category from zighang.com.

Opens the listing page, applies the job-category filter, then visits up to
MAX_CLICKS postings (each opens in a new browser tab), scrapes the company /
requirement fields, and saves the collected rows to an Excel file.
"""
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas as pd

# Chrome setup ("detach" keeps the browser window open after the script ends).
options = Options()
options.add_experimental_option("detach", True)
driver = webdriver.Chrome(options=options)
driver.set_window_size(1280, 1024)
wait = WebDriverWait(driver, 20)

# Open the listing page.
driver.get("https://zighang.com/it")
time.sleep(2)

# Job category to filter by.
job_name = "서버·백엔드"

# Open the job-category filter panel.
arrow_xpath = '//*[@id="root"]/main/div[3]/div/div/div/div/div[2]/div/section/button[2]/div/img'
wait.until(EC.element_to_be_clickable((By.XPATH, arrow_xpath))).click()
print("직무 필터 열기 성공")
time.sleep(1)

# Select the job category.
job_button_xpath = f'//button[normalize-space()="{job_name}"]'
wait.until(EC.element_to_be_clickable((By.XPATH, job_button_xpath))).click()
print(f" 직무 '{job_name}' 선택 완료")
time.sleep(1)

# Click the "view postings" confirm button (JS click avoids overlay issues).
confirm_button = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'div.sticky.bottom-0 button.bg-primary')))
driver.execute_script("arguments[0].scrollIntoView(true);", confirm_button)
driver.execute_script("arguments[0].click();", confirm_button)
print("공고 보기 버튼 클릭 완료")
wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'p.ds-web-title2')))
print("공고 리스트 로딩 완료")

# Iterate over postings. Starts at 2 — presumably index 1 is not a real
# posting card; TODO confirm against the live page layout.
MAX_CLICKS = 10
original_tab = driver.current_window_handle
results = []

for i in range(2, MAX_CLICKS + 1):
    try:
        title_xpath = f'(//p[contains(@class, "ds-web-title2")])[{i}]'
        title_elem = wait.until(EC.presence_of_element_located((By.XPATH, title_xpath)))
        driver.execute_script("arguments[0].scrollIntoView(true);", title_elem)
        driver.execute_script("window.scrollBy(0, -200);")  # undo sticky-header overlap
        time.sleep(0.3)
        parent_link = title_elem.find_element(By.XPATH, "./ancestor::a[1]")
        driver.execute_script("arguments[0].click();", parent_link)
        print(f"▶️ [{i}]번째 공고 클릭 → 새 탭 열림 예상")
        time.sleep(2)

        # Switch to the newly opened posting tab.
        new_tab = [tab for tab in driver.window_handles if tab != original_tab][0]
        driver.switch_to.window(new_tab)

        # Scrape the posting details.
        data = {}
        data["회사명"] = driver.find_element(By.XPATH, '//*[@id="root"]/main/div[2]/div[1]/div[1]/div[1]/div[1]/div/a').text
        data["경력"] = driver.find_element(By.XPATH, '//*[@id="root"]/main/div[2]/div[1]/div[1]/div[1]/div[5]/div/section/div[1]/div/div').text
        data["학력"] = driver.find_element(By.XPATH, '//*[@id="root"]/main/div[2]/div[1]/div[1]/div[1]/div[5]/div/section/div[3]/div/div').text
        data["근무지"] = driver.find_element(By.XPATH, '//*[@id="root"]/main/div[2]/div[1]/div[1]/div[1]/div[5]/div/section/div[2]/div/div').text
        data["직군"] = job_name

        # Optional sections: narrow to NoSuchElementException so real errors
        # (and Ctrl-C) are not silently swallowed by a bare except.
        try:
            data["우대사항"] = driver.find_element(By.XPATH, '//h2[text()="우대사항"]/following-sibling::p').text
            print(f"우대사항 크롤링 완료: {data['우대사항'][:10]}...")
        except NoSuchElementException:
            data["우대사항"] = ""

        try:
            data["자격요건"] = driver.find_element(By.XPATH, '//h2[text()="자격요건"]/following-sibling::p').text
            print(f"자격요건 크롤링 완료: {data['자격요건'][:10]}...")
        except NoSuchElementException:
            data["자격요건"] = ""

        # Postings with no text sections usually embed the details as an image.
        if data["우대사항"] == "" and data["자격요건"] == "":
            try:
                img_elem = driver.find_element(By.XPATH, '//*[@id="root"]/main/div[2]/div[1]/div[1]/div[4]/img')
                data["이미지경로"] = img_elem.get_attribute("src")
                print("이미지 URL 저장 완료")
            except NoSuchElementException:
                data["이미지경로"] = ""
                print("이미지 URL 저장 실패")
        else:
            data["이미지경로"] = ""

        results.append(data)
        driver.close()
        driver.switch_to.window(original_tab)
        print(f"🔙 기존 탭 복귀 완료\n")

    except Exception as e:
        print(f"[{i}]번째 공고 실패: {e}")
        # BUGFIX: close any stray posting tab and return to the listing tab;
        # without this, one failure after the tab opened breaks the
        # window-handle selection of every following iteration.
        for tab in driver.window_handles:
            if tab != original_tab:
                driver.switch_to.window(tab)
                driver.close()
        driver.switch_to.window(original_tab)
        continue

# Save results to Excel.
df = pd.DataFrame(results)
df.to_excel("직행_크롤링_결과.xlsx", index=False)
print("엑셀 저장 완료")
Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
"""Crawl open recruiting postings from linkareer.com list pages.

Walks list pages 1..5, opens each posting row in a new tab, scrapes the
company / position / qualification fields, and writes the rows to a CSV.
"""
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time  # BUGFIX: was imported twice; one import suffices

import pandas as pd

# Chrome setup ("detach" keeps the browser window open after the script ends).
chrome_options = Options()
chrome_options.add_experimental_option("detach", True)
driver = webdriver.Chrome(options=chrome_options)
wait = WebDriverWait(driver, 10)

results = []

# Page range to crawl (pages 1-5; widen the range as needed).
for page in range(1, 6):
    list_url = f"https://linkareer.com/list/recruit?filterBy_activityTypeID=5&filterBy_categoryIDs=58&filterBy_status=OPEN&orderBy_direction=DESC&orderBy_field=RECENT&page={page}"
    driver.get(list_url)
    time.sleep(2)

    print(f"📄 {page}페이지 접속 완료")

    # Remember the main (listing) tab so we can return to it.
    main_window = driver.current_window_handle

    # Number of posting rows on this page.
    row_count = len(driver.find_elements(By.XPATH, '//*[@id="__next"]/div[1]/div/main/div/section/div[2]/table/tbody/tr'))
    print(f"{row_count}개의 공고 탐색 예정")

    for i in range(1, row_count + 1):
        try:
            link_element = driver.find_element(By.XPATH, f'//*[@id="__next"]/div[1]/div/main/div/section/div[2]/table/tbody/tr[{i}]/td[2]/div/a/div/p')
            link_element.click()

            # The posting opens in a new tab; switch to it.
            driver.switch_to.window(driver.window_handles[-1])

            # Basic fields.
            company_name = wait.until(EC.presence_of_element_located((By.XPATH, '//*[@id="__next"]/div[1]/div/main/div/div/section[1]/div/article/header/h2'))).text.strip()
            company_type = wait.until(EC.presence_of_element_located((By.XPATH, '//*[@id="__next"]/div[1]/div/main/div/div/section[1]/div/article/div/dl[1]/dd'))).text.strip()

            position_element = wait.until(EC.presence_of_element_located((By.XPATH, '//*[@id="__next"]/div[1]/div/main/div/div/section[1]/div/article/div/dl[5]/dd')))
            position = position_element.text.strip()

            # Collect the text of every <p> in the detail section.
            p_elements = driver.find_elements(By.XPATH, '//*[@id="DETAIL"]/section[1]/div/p')
            p_texts = [p.text.strip() for p in p_elements]

            # Locate the "qualifications" and "position detail" headings.
            qual_idx = next((idx for idx, text in enumerate(p_texts) if '자격요건' in text or '자격 요건' in text), None)
            pos_idx = next((idx for idx, text in enumerate(p_texts) if '모집 직무' in text or '세부 직무' in text), None)

            # Qualification text: paragraphs after the heading, up to the next
            # section heading keyword.
            qualification_texts = []
            if qual_idx is not None:
                for t in p_texts[qual_idx + 1:]:
                    if any(keyword in t for keyword in ['지원', '혜택', '우대', '다음', '근무']):
                        break
                    qualification_texts.append(t)
            # Joined OUTSIDE the `if` so `qualification` is always defined,
            # even when no heading was found on the page.
            qualification = "\n".join(qualification_texts)

            # Detail position is the paragraph right after its heading.
            detail_position = ''
            if pos_idx is not None:
                detail_position = p_texts[pos_idx + 1] if pos_idx + 1 < len(p_texts) else ''

            results.append({
                '회사명': company_name,
                '기업형태': company_type,
                '모집직무': position,
                '세부직무': detail_position,
                '자격요건': qualification
            })

            print(f"{company_name} ({i}/{row_count}, page {page}) 크롤링 완료")

            # Close the posting tab and return to the listing.
            driver.close()
            driver.switch_to.window(main_window)

            time.sleep(1)

        except Exception as e:
            print(f"{page}페이지 {i}번째 공고 오류: {e}")
            # BUGFIX: explicitly switch to the stray tab before closing so we
            # never close the main listing tab by accident, then recover.
            if len(driver.window_handles) > 1:
                driver.switch_to.window(driver.window_handles[-1])
                driver.close()
            driver.switch_to.window(main_window)
            continue

driver.quit()

# Convert to DataFrame and save.
df = pd.DataFrame(results)
df.to_csv('linkareer_crawling.csv', index=False, encoding='utf-8-sig')

print("크롤링 완료! CSV 저장됨.")

0 commit comments

Comments
 (0)