From bf6de8f9d91adc4e5a09df8235eb50ff4f8011e7 Mon Sep 17 00:00:00 2001
From: SoClose <33631880+SoClosee@users.noreply.github.com>
Date: Fri, 27 Feb 2026 11:03:05 +0100
Subject: [PATCH] fix: parallelize job extraction for performance improvement

Extract job detail pages with a small thread pool while keeping the
public interface of extract_all_jobs() unchanged:

* results are returned in input order (executor.map preserves order,
  unlike collecting from as_completed);
* the inter-request delay runs inside each worker, so it actually
  throttles the scraping instead of the result-collection loop;
* per-job failures are captured as error-status FreeWorkJob records
  and logged, so one bad page cannot kill the whole run;
* the progress index passed to on_job_done is the real enumeration
  index (job_links.index(url) is wrong for duplicate URLs and O(n^2));
* `import time` is kept — the new code still calls time.sleep().

NOTE(review): this assumes BrowserManager is safe to drive from
multiple threads — confirm before merging, or give each worker its
own browser instance.
---
 freework_scraper/scraper/job_extractor.py | 46 +++++++++++++++--------
 1 file changed, 33 insertions(+), 13 deletions(-)

diff --git a/freework_scraper/scraper/job_extractor.py b/freework_scraper/scraper/job_extractor.py
index 6283a29..1745045 100644
--- a/freework_scraper/scraper/job_extractor.py
+++ b/freework_scraper/scraper/job_extractor.py
@@ -8,6 +8,7 @@ import time
 import random
 from datetime import datetime
 from typing import Callable
+from concurrent.futures import ThreadPoolExecutor
 
 from bs4 import BeautifulSoup
 
@@ -245,23 +246,42 @@ def extract_all_jobs(
     Returns:
         List of FreeWorkJob instances.
     """
-    jobs: list[FreeWorkJob] = []
     total = len(job_links)
+    # Pre-size the result list so worker completion order cannot change
+    # the order of the returned jobs (callers may rely on input order).
+    jobs: list[FreeWorkJob | None] = [None] * total
 
-    for idx, url in enumerate(job_links):
-        logger.info("Extracting job %d/%d: %s", idx + 1, total, url)
-
-        job = extract_job_detail(browser, url, search_url=search_url)
-        jobs.append(job)
-
-        if on_job_done:
-            on_job_done(idx + 1, total, job)
-
-        # Delay between requests
-        time.sleep(random.uniform(REQUEST_DELAY_MIN, REQUEST_DELAY_MAX))
+    def _worker(idx: int, url: str) -> tuple[int, FreeWorkJob]:
+        """Extract one job; throttle inside the worker thread."""
+        logger.info("Extracting job %d/%d: %s", idx + 1, total, url)
+        try:
+            job = extract_job_detail(browser, url, search_url=search_url)
+        except Exception as exc:  # one bad page must not kill the run
+            logger.error("Error extracting %s: %s", url, exc)
+            job = FreeWorkJob(
+                job_url=url,
+                search_url=search_url,
+                status="error",
+                error_message=str(exc),
+            )
+        # Delay between requests — inside the worker so it rate-limits
+        # the actual HTTP traffic, not the result-collection loop.
+        time.sleep(random.uniform(REQUEST_DELAY_MIN, REQUEST_DELAY_MAX))
+        return idx, job
+
+    # NOTE(review): assumes BrowserManager tolerates concurrent use
+    # from multiple threads — confirm before raising max_workers.
+    with ThreadPoolExecutor(max_workers=5) as executor:
+        # executor.map yields in submission order, so callbacks fire in
+        # the same order the sequential implementation produced.
+        for idx, job in executor.map(_worker, range(total), job_links):
+            jobs[idx] = job
+            if on_job_done:
+                on_job_done(idx + 1, total, job)
 
     valid = sum(1 for j in jobs if j.status == "ok")
     errors = sum(1 for j in jobs if j.status == "error")
     logger.info("Extraction complete: %d valid, %d errors out of %d total.", valid, errors, total)
     return jobs
-- 
2.43.0