diff --git a/freework_scraper/scraper/search.py b/freework_scraper/scraper/search.py
index db6eb15..90f6ff8 100644
--- a/freework_scraper/scraper/search.py
+++ b/freework_scraper/scraper/search.py
@@ -1,5 +1,3 @@
-"""Search page navigation and pagination for FreeWork."""
-
 from __future__ import annotations
 
 import logging
@@ -10,38 +8,33 @@
 from bs4 import BeautifulSoup
 
 from freework_scraper.config import (
     SELECTOR_PAGINATION_BTN,
     SELECTOR_SEARCH_RESULT,
     JOB_LINK_PREFIX,
     BASE_URL,
     PAGE_LOAD_WAIT,
     PAGE_LOAD_MAX,
     REQUEST_DELAY_MIN,
     REQUEST_DELAY_MAX,
 )
 from freework_scraper.scraper.browser import BrowserManager
 
 logger = logging.getLogger(__name__)
 
 
 def _random_delay(min_s: float = REQUEST_DELAY_MIN, max_s: float = REQUEST_DELAY_MAX) -> None:
-    """Wait a random interval to mimic human behavior."""
     delay = random.uniform(min_s, max_s)
     time.sleep(delay)
 
-
 def _build_page_url(base_url: str, page: int) -> str:
-    """Append or update the ?page= parameter on a URL."""
     parsed = urlparse(base_url)
     params = parse_qs(parsed.query)
     params["page"] = [str(page)]
     new_query = urlencode(params, doseq=True)
     return urlunparse(parsed._replace(query=new_query))
 
-
 def detect_total_pages(browser: BrowserManager) -> int:
-    """Detect the total number of result pages from pagination buttons."""
     soup = BeautifulSoup(browser.page_source, "html.parser")
 
     page_numbers = []
 
@@ -58,9 +51,7 @@ def detect_total_pages(browser: BrowserManager) -> int:
     logger.info("Detected %d total pages.", total)
     return total
 
-
 def extract_job_links_from_page(browser: BrowserManager) -> list[str]:
-    """Extract all job detail links from the current search results page."""
     soup = BeautifulSoup(browser.page_source, "html.parser")
 
     links: list[str] = []
@@ -81,26 +72,12 @@ def extract_job_links_from_page(browser: BrowserManager) -> list[str]:
     logger.info("Found %d job links on current page.", len(links))
     return links
 
-
 def collect_all_job_links(
     browser: BrowserManager,
     search_url: str,
     max_pages: int = 0,
     on_page_done: Callable | None = None,
 ) -> list[str]:
-    """
-    Navigate through all search result pages and collect job links.
-
-    Args:
-        browser: Browser manager instance.
-        search_url: The initial FreeWork search URL.
-        max_pages: Max pages to scrape (0 = all).
-        on_page_done: Optional callback(page_num, total_pages, links_count).
-
-    Returns:
-        List of all job detail URLs found.
-    """
-    # Load first page and detect pagination
     browser.get(search_url)
     time.sleep(random.uniform(PAGE_LOAD_WAIT, PAGE_LOAD_MAX))
 
@@ -110,13 +87,11 @@ def collect_all_job_links(
 
     all_links: list[str] = []
 
-    # Extract links from page 1 (already loaded)
     page_links = extract_job_links_from_page(browser)
     all_links.extend(page_links)
     if on_page_done:
         on_page_done(1, total_pages, len(page_links))
 
-    # Continue from page 2 onwards
     for page_num in range(2, total_pages + 1):
         try:
             page_url = _build_page_url(search_url, page_num)
@@ -138,4 +113,4 @@
             on_page_done(page_num, total_pages, 0)
 
     logger.info("Total job links collected: %d across %d pages.", len(all_links), total_pages)
-    return all_links
+    return all_links
\ No newline at end of file
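
A note on the pagination helper kept above: _build_page_url overwrites any existing "page" query parameter instead of appending a duplicate, so requesting page N of a URL that already carries ?page= stays well-formed. A minimal standalone sketch of that behavior, using only stdlib calls (the URLs are illustrative, not the project's real BASE_URL):

from urllib.parse import parse_qs, urlencode, urlparse, urlunparse

def build_page_url(base_url: str, page: int) -> str:
    # Parse the URL, overwrite (or insert) the "page" parameter, and
    # reassemble with every other query parameter preserved.
    parsed = urlparse(base_url)
    params = parse_qs(parsed.query)
    params["page"] = [str(page)]
    return urlunparse(parsed._replace(query=urlencode(params, doseq=True)))

print(build_page_url("https://example.com/jobs?query=python", 3))
# https://example.com/jobs?query=python&page=3
print(build_page_url("https://example.com/jobs?query=python&page=3", 7))
# https://example.com/jobs?query=python&page=7

Because parse_qs and urlencode(..., doseq=True) round-trip the remaining parameters, any search filters on the original URL survive pagination.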
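
The public entry point, collect_all_job_links, drives the whole crawl: load page 1, detect the page count from the pagination buttons, then walk pages 2..N, reporting progress through the optional on_page_done callback. A hedged usage sketch follows; the no-argument BrowserManager() constructor and the quit() cleanup call are assumptions (neither appears in this diff), and the search URL is illustrative:

import logging

from freework_scraper.scraper.browser import BrowserManager
from freework_scraper.scraper.search import collect_all_job_links

logging.basicConfig(level=logging.INFO)

def report(page_num: int, total_pages: int, links_count: int) -> None:
    # Progress callback: invoked once per results page with the number of
    # job links found on that page (0 when a page fails to load).
    print(f"page {page_num}/{total_pages}: {links_count} links")

browser = BrowserManager()  # assumed no-arg constructor
try:
    links = collect_all_job_links(
        browser,
        "https://example.com/jobs?query=python",  # illustrative search URL
        max_pages=5,        # 0 means "scrape every detected page"
        on_page_done=report,
    )
    print(f"collected {len(links)} job links")
finally:
    browser.quit()  # assumed cleanup method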