Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
466 changes: 443 additions & 23 deletions backend/crawling/crawling_demo_jikhaeng.ipynb

Large diffs are not rendered by default.

Binary file added backend/crawling/downloads/NHN_공고8.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added backend/crawling/downloads/코웨이_공고7.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added backend/crawling/downloads/쿠팡_공고2.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added backend/crawling/downloads/클로봇_공고6.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added backend/crawling/직행_크롤링_결과.xlsx
Binary file not shown.
Binary file not shown.
Empty file.
Empty file.
26 changes: 26 additions & 0 deletions backend/test_crawl/insert_to_db.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
import os
from dotenv import load_dotenv
from sqlalchemy import create_engine, text
import pandas as pd


# 1. Load environment variables from a local .env file.
load_dotenv()
DATABASE_URL = os.getenv("DATABASE_URL")
print("🔗 DATABASE_URL:", DATABASE_URL)

# BUG FIX: create_engine() was called outside the try/except, so a missing
# DATABASE_URL crashed with an opaque SQLAlchemy ArgumentError.  Fail fast
# with a clear message instead.
if not DATABASE_URL:
    raise RuntimeError("DATABASE_URL is not set; add it to your .env file")

# 2. Create the SQLAlchemy engine.
engine = create_engine(DATABASE_URL)

# 3. Connection smoke test + sample query.
try:
    with engine.connect() as conn:
        # SQLAlchemy 2.0+ requires raw SQL to be wrapped in text().
        conn.execute(text("SELECT 1;"))
        print("✅ DB 연결 성공")

    # Fetch only 5 rows as a sanity check on the table contents.
    df = pd.read_sql("SELECT * FROM certifications LIMIT 5;", con=engine)
    print("📦 certifications 테이블 샘플:\n", df)

except Exception as e:
    print("❌ DB 연결 실패:", e)
173 changes: 173 additions & 0 deletions backend/test_crawl/recruit_crawler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,173 @@
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time

def crawl_zighang(job_name="서버·백엔드", max_clicks=10):
    """Crawl IT job postings from zighang.com filtered by job category.

    Opens https://zighang.com/it, selects the *job_name* category in the
    filter dialog, then opens each posting card (indices 2..max_clicks) in a
    new tab and scrapes company / experience / education / location plus the
    preferred-qualification and requirement sections.  Image-only postings
    fall back to storing the posting image URL.

    Args:
        job_name: Visible label of the category filter button to click.
        max_clicks: Highest posting index (inclusive) to visit.

    Returns:
        pandas.DataFrame with one row per posting scraped successfully;
        failed postings are skipped with a console message.
    """
    options = Options()
    options.add_argument("--headless")  # remove for a visible browser
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    driver = webdriver.Chrome(options=options)
    driver.set_window_size(1280, 1024)
    wait = WebDriverWait(driver, 20)

    try:
        driver.get("https://zighang.com/it")
        time.sleep(2)

        # Open the job-category filter dialog.
        arrow_xpath = '//*[@id="root"]/main/div[3]/div/div/div/div/div[2]/div/section/button[2]/div/img'
        wait.until(EC.element_to_be_clickable((By.XPATH, arrow_xpath))).click()
        time.sleep(1)

        # Pick the requested category button by its visible text.
        job_button_xpath = f'//button[normalize-space()="{job_name}"]'
        wait.until(EC.element_to_be_clickable((By.XPATH, job_button_xpath))).click()
        time.sleep(1)

        # Confirm via JS click: the button sits in a sticky footer that can
        # be overlapped by other elements.
        confirm_button = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'div.sticky.bottom-0 button.bg-primary')))
        driver.execute_script("arguments[0].scrollIntoView(true);", confirm_button)
        driver.execute_script("arguments[0].click();", confirm_button)
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'p.ds-web-title2')))

        original_tab = driver.current_window_handle
        results = []

        for i in range(2, max_clicks + 1):
            try:
                title_xpath = f'(//p[contains(@class, "ds-web-title2")])[{i}]'
                title_elem = wait.until(EC.presence_of_element_located((By.XPATH, title_xpath)))
                driver.execute_script("arguments[0].scrollIntoView(true);", title_elem)
                driver.execute_script("window.scrollBy(0, -200);")
                time.sleep(0.3)

                # The card's <a> ancestor opens the posting in a new tab.
                parent_link = title_elem.find_element(By.XPATH, "./ancestor::a[1]")
                driver.execute_script("arguments[0].click();", parent_link)
                time.sleep(2)

                new_tab = [tab for tab in driver.window_handles if tab != original_tab][0]
                driver.switch_to.window(new_tab)

                data = {}
                data["회사명"] = driver.find_element(By.XPATH, '//*[@id="root"]/main/div[2]/div[1]/div[1]/div[1]/div[1]/div/a').text
                data["경력"] = driver.find_element(By.XPATH, '//*[@id="root"]/main/div[2]/div[1]/div[1]/div[1]/div[5]/div/section/div[1]/div/div').text
                data["학력"] = driver.find_element(By.XPATH, '//*[@id="root"]/main/div[2]/div[1]/div[1]/div[1]/div[5]/div/section/div[3]/div/div').text
                data["근무지"] = driver.find_element(By.XPATH, '//*[@id="root"]/main/div[2]/div[1]/div[1]/div[1]/div[5]/div/section/div[2]/div/div').text
                data["직군"] = job_name

                # BUG FIX: these were bare "except:" clauses, which also
                # swallow KeyboardInterrupt/SystemExit; narrowed to Exception.
                try:
                    data["우대사항"] = driver.find_element(By.XPATH, '//h2[text()="우대사항"]/following-sibling::p').text
                except Exception:
                    data["우대사항"] = ""

                try:
                    data["자격요건"] = driver.find_element(By.XPATH, '//h2[text()="자격요건"]/following-sibling::p').text
                except Exception:
                    data["자격요건"] = ""

                # Image-only postings: fall back to the posting image URL.
                if data["우대사항"] == "" and data["자격요건"] == "":
                    try:
                        img_elem = driver.find_element(By.XPATH, '//*[@id="root"]/main/div[2]/div[1]/div[1]/div[4]/img')
                        data["이미지경로"] = img_elem.get_attribute("src")
                    except Exception:
                        data["이미지경로"] = ""
                else:
                    data["이미지경로"] = ""

                results.append(data)
                driver.close()
                driver.switch_to.window(original_tab)

            except Exception as e:
                print(f"❌ [{i}]번째 공고 실패: {e}")
                # BUG FIX: previously nothing restored the window state, so a
                # failure after the new tab opened left the driver on (or
                # with) an orphan tab and every later iteration failed.
                # Close stray tabs and return to the listing tab, matching
                # crawl_linkareer's recovery behaviour.
                for handle in driver.window_handles:
                    if handle != original_tab:
                        driver.switch_to.window(handle)
                        driver.close()
                driver.switch_to.window(original_tab)
                continue

        return pd.DataFrame(results)

    finally:
        driver.quit()




def crawl_linkareer(max_pages=5):
    """Crawl open recruitment postings from linkareer.com.

    Iterates listing pages 1..max_pages, opens each posting row in a new
    tab, and scrapes company name, company type, position, detailed position
    and qualification paragraphs from the detail view.

    Args:
        max_pages: Number of listing pages to visit.

    Returns:
        pandas.DataFrame with columns 회사명 / 기업형태 / 모집직무 /
        세부직무 / 자격요건, one row per posting scraped successfully.
    """
    chrome_options = Options()
    chrome_options.add_experimental_option("detach", True)
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    # chrome_options.add_argument("--headless")  # uncomment when needed
    driver = webdriver.Chrome(options=chrome_options)
    wait = WebDriverWait(driver, 10)

    results = []

    try:
        for page in range(1, max_pages + 1):
            list_url = f"https://linkareer.com/list/recruit?filterBy_activityTypeID=5&filterBy_categoryIDs=58&filterBy_status=OPEN&orderBy_direction=DESC&orderBy_field=RECENT&page={page}"
            driver.get(list_url)
            time.sleep(2)
            print(f"📄 {page}페이지 접속 완료")

            main_window = driver.current_window_handle
            row_count = len(driver.find_elements(By.XPATH, '//*[@id="__next"]/div[1]/div/main/div/section/div[2]/table/tbody/tr'))
            print(f"🔍 {row_count}개의 공고 탐색 예정")

            for i in range(1, row_count + 1):
                try:
                    link_element = driver.find_element(By.XPATH, f'//*[@id="__next"]/div[1]/div/main/div/section/div[2]/table/tbody/tr[{i}]/td[2]/div/a/div/p')
                    link_element.click()

                    # The click opens the posting in a new tab.
                    driver.switch_to.window(driver.window_handles[-1])

                    company_name = wait.until(EC.presence_of_element_located(
                        (By.XPATH, '//*[@id="__next"]/div[1]/div/main/div/div/section[1]/div/article/header/h2'))).text.strip()
                    company_type = wait.until(EC.presence_of_element_located(
                        (By.XPATH, '//*[@id="__next"]/div[1]/div/main/div/div/section[1]/div/article/div/dl[1]/dd'))).text.strip()
                    position_element = wait.until(EC.presence_of_element_located(
                        (By.XPATH, '//*[@id="__next"]/div[1]/div/main/div/div/section[1]/div/article/div/dl[5]/dd')))
                    position = position_element.text.strip()

                    p_elements = driver.find_elements(By.XPATH, '//*[@id="DETAIL"]/section[1]/div/p')
                    p_texts = [p.text.strip() for p in p_elements]

                    # Locate the "qualifications" and "position" headings.
                    qual_idx = next((idx for idx, text in enumerate(p_texts) if '자격요건' in text or '자격 요건' in text), None)
                    pos_idx = next((idx for idx, text in enumerate(p_texts) if '모집 직무' in text or '세부 직무' in text), None)

                    # Collect paragraphs after the heading until the next
                    # section keyword appears.
                    qualification_texts = []
                    if qual_idx is not None:
                        for t in p_texts[qual_idx + 1:]:
                            if any(keyword in t for keyword in ['지원', '혜택', '우대', '다음', '근무']):
                                break
                            qualification_texts.append(t)
                    qualification = "\n".join(qualification_texts)

                    detail_position = ''
                    if pos_idx is not None and pos_idx + 1 < len(p_texts):
                        detail_position = p_texts[pos_idx + 1]

                    results.append({
                        '회사명': company_name,
                        '기업형태': company_type,
                        '모집직무': position,
                        '세부직무': detail_position,
                        '자격요건': qualification
                    })

                    print(f"✅ {company_name} ({i}/{row_count}, page {page}) 크롤링 완료")
                    driver.close()
                    driver.switch_to.window(main_window)
                    time.sleep(1)

                except Exception as e:
                    print(f"❌ {page}페이지 {i}번째 공고 오류: {e}")
                    # BUG FIX: the old recovery called driver.close() on
                    # whatever window was *current* — if the failure happened
                    # before switching to the new tab, that closed the main
                    # listing window and broke every later iteration.  Close
                    # only the non-main tabs, then return to the listing.
                    for handle in driver.window_handles:
                        if handle != main_window:
                            driver.switch_to.window(handle)
                            driver.close()
                    driver.switch_to.window(main_window)
                    continue

        return pd.DataFrame(results)

    finally:
        driver.quit()
19 changes: 19 additions & 0 deletions backend/test_crawl/selenium_basic/app.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
import chromedriver_autoinstaller

# Install a chromedriver matching the locally installed Chrome version.
chromedriver_autoinstaller.install()

driver = webdriver.Chrome()
try:
    # 1. Open the target page with driver.get().
    driver.get("https://www.naver.com")
    time.sleep(3)

    # 2. Locate the shortcut element with find_element() + CSS selector.
    css_selector = "#shortcutArea > ul > li:nth-child(8) > a > span.service_name"
    group_navigation = driver.find_element(By.CSS_SELECTOR, css_selector)

    print(group_navigation.text)
    group_navigation.click()
    input()  # keep the browser open until the user presses Enter
finally:
    # BUG FIX: the browser process was never released before, leaking a
    # Chrome/chromedriver pair on every run (including error paths).
    driver.quit()
65 changes: 65 additions & 0 deletions backend/test_crawl/selenium_basic/selenium_tools.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
# BUG FIX: the module path was misspelled "webrdriver", so this script
# failed with ImportError before doing anything.
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

import chromedriver_autoinstaller

# Install a chromedriver matching the locally installed Chrome version.
chromedriver_autoinstaller.install()

driver = webdriver.Chrome()
driver.get("https://www.naver.com")

# # 1. Navigation tools (moving between pages)
# # get, back, forward and refresh move the browser between pages.

# # 1-1. get(): navigate to the given URL
# driver.get("https://www.naver.com")
# time.sleep(1)
# driver.get("https://www.google.com")

# # 1-2. back(): go back to the previous page
# driver.back()
# time.sleep(2)

# # 1-3. forward(): go forward to the next page
# driver.forward()
# time.sleep(2)

# # 1-4. refresh(): reload the current page
# driver.refresh()
# time.sleep(2)
# print("동작 끝")
# input()

# # 2. Browser information
# # 2-1. title: the current page title
# title = driver.title
# print("제목:", title)
# # 2-2. current_url: the current page URL
# current_url = driver.current_url
# print("현재 URL:", current_url)

# if "nid.naver.com" in current_url:
#     print("지금은 로그인 하는 로직이 필요함")
# else:
#     print("네이버 로그인 페이지가 아닙니다.")


# 3. Explicit driver wait
# 3-1. If the element finishes loading early (e.g. after ~3 s), the wait
#      returns as soon as it is found.
# 3-2. Otherwise it keeps polling for up to 30 seconds.
# 3-3. Past 30 seconds a TimeoutException is raised.

try:
    selector = "#shortcutArea > ul > li:nth-child(8) > a > span.service_name"
    # BUG FIX: presence_of_element_located() takes ONE (By, selector) locator
    # tuple; passing two positional arguments raised TypeError.
    WebDriverWait(driver, 30).until(EC.presence_of_element_located(
        (By.CSS_SELECTOR, selector)
    ))
except Exception:
    # Narrowed from a bare "except:" so Ctrl-C / SystemExit still propagate.
    print("예외 발생, 예외 처리 코드 실행하기")
else:
    # BUG FIX: the success messages previously printed even when the wait
    # timed out; they now run only when the element was actually found.
    print("엘리먼트 로딩 끝")
    print("다음 코드 실행")

input()
Loading