From e88d4db27a7dfc6e825d52b9a7667e4834d037cc Mon Sep 17 00:00:00 2001 From: SoClose <33631880+SoClosee@users.noreply.github.com> Date: Fri, 27 Feb 2026 16:59:07 +0100 Subject: [PATCH] refactor(export/exporter.py): replace Pandas with NumPy and openpyxl --- freework_scraper/export/exporter.py | 294 ++++++++++------------------ 1 file changed, 102 insertions(+), 192 deletions(-) diff --git a/freework_scraper/export/exporter.py b/freework_scraper/export/exporter.py index 38a6cf9..5641d77 100644 --- a/freework_scraper/export/exporter.py +++ b/freework_scraper/export/exporter.py @@ -1,13 +1,5 @@ -"""Export FreeWork jobs to CSV and Excel with professional formatting.""" - -from __future__ import annotations - -import logging -import re -from datetime import datetime -from pathlib import Path - -import pandas as pd +import numpy as np +from openpyxl import Workbook from openpyxl.styles import Alignment, Border, Font, PatternFill, Side from openpyxl.utils import get_column_letter @@ -76,7 +68,7 @@ left=Side(style="thin", color="D9D9D9"), right=Side(style="thin", color="D9D9D9"), top=Side(style="thin", color="D9D9D9"), - bottom=Side(style="thin", color="D9D9D9"), + bottom=Side(style="thin", color="D9D9D9") ) ALIGNMENT_WRAP = Alignment(wrap_text=True, vertical="top") @@ -104,7 +96,6 @@ "status": 10, } - # ====================================================================== # Public API # ====================================================================== @@ -143,7 +134,6 @@ def export_jobs( return created - # ====================================================================== # DataFrame preparation # ====================================================================== @@ -179,118 +169,118 @@ def _prepare_dataframe(jobs: list[FreeWorkJob]) -> pd.DataFrame: return df - # ====================================================================== # Excel export with formatting # ====================================================================== def _export_excel(df: pd.DataFrame, path: Path, search_url: str = "") -> None: """Write a professionally formatted Excel file.""" - with pd.ExcelWriter(path, engine="openpyxl") as writer: - df.to_excel(writer, index=False, sheet_name="FreeWork Jobs") - ws = writer.sheets["FreeWork Jobs"] + wb = Workbook() + ws = wb.active + ws.title = "FreeWork Jobs" + + num_rows = len(df) + 1 # +1 for header + num_cols = len(df.columns) + + # --- Row height --- + ws.row_dimensions[1].height = 30 + + # --- Header formatting --- + for col_idx in range(1, num_cols + 1): + cell = ws.cell(row=1, column=col_idx) + key = COLUMN_KEYS[col_idx - 1] + category = COLUMN_CATEGORIES.get(key, "meta") + + cell.font = FONT_HEADER + cell.fill = _HEADER_FILLS.get(category, _HEADER_FILLS["meta"]) + cell.alignment = ALIGNMENT_CENTER + cell.border = THIN_BORDER + + # --- Column widths --- + for col_idx in range(1, num_cols + 1): + key = COLUMN_KEYS[col_idx - 1] + width = _COL_WIDTHS.get(key, 15) + ws.column_dimensions[get_column_letter(col_idx)].width = width + + # --- Precompute column indices --- + title_col_idx = COLUMN_KEYS.index("title") + 1 + salary_col_idx = COLUMN_KEYS.index("salary") + 1 + remote_col_idx = COLUMN_KEYS.index("remote") + 1 + url_col_idx = COLUMN_KEYS.index("job_url") + 1 + status_col_idx = COLUMN_KEYS.index("status") + 1 + + # --- Data rows --- + for row_idx in range(2, num_rows + 1): + is_alt_row = (row_idx - 2) % 2 == 1 - num_rows = len(df) + 1 # +1 for header - num_cols = len(df.columns) - - # --- Row height --- - ws.row_dimensions[1].height = 30 - - # --- Header formatting --- for col_idx in range(1, num_cols + 1): - cell = ws.cell(row=1, column=col_idx) - key = COLUMN_KEYS[col_idx - 1] - category = COLUMN_CATEGORIES.get(key, "meta") - - cell.font = FONT_HEADER - cell.fill = _HEADER_FILLS.get(category, _HEADER_FILLS["meta"]) - cell.alignment = ALIGNMENT_CENTER + cell = ws.cell(row=row_idx, column=col_idx) + cell.font = FONT_DEFAULT cell.border = THIN_BORDER - - # --- Column widths --- + cell.alignment = ALIGNMENT_WRAP + + # Alternating row background + if is_alt_row: + cell.fill = FILL_ALT_ROW + + # --- Title column: bold --- + title_cell = ws.cell(row=row_idx, column=title_col_idx) + title_cell.font = FONT_TITLE + + # --- Salary cell color coding --- + salary_cell = ws.cell(row=row_idx, column=salary_col_idx) + salary_val = str(salary_cell.value or "").strip() + if salary_val and salary_val != "None": + salary_cell.fill = FILL_HAS_SALARY + salary_cell.font = FONT_BOLD + else: + salary_cell.fill = FILL_NO_SALARY + salary_cell.value = "" + + # --- Remote cell color coding --- + remote_cell = ws.cell(row=row_idx, column=remote_col_idx) + remote_val = str(remote_cell.value or "").strip() + if remote_val and remote_val != "None": + remote_cell.fill = FILL_HAS_REMOTE + else: + remote_cell.value = "" + + # --- Clickable job URL --- + url_cell = ws.cell(row=row_idx, column=url_col_idx) + url_val = str(url_cell.value or "").strip() + if url_val.startswith("http"): + url_cell.hyperlink = url_val + url_cell.font = FONT_LINK + + # --- Status column: color coding --- + status_cell = ws.cell(row=row_idx, column=status_col_idx) + status_val = str(status_cell.value or "").strip().lower() + if status_val == "ok": + status_cell.fill = FILL_OK + elif status_val == "error": + status_cell.fill = FILL_ERROR + status_cell.alignment = ALIGNMENT_CENTER + status_cell.font = FONT_DIM + + # --- Clean remaining "None" values --- for col_idx in range(1, num_cols + 1): - key = COLUMN_KEYS[col_idx - 1] - width = _COL_WIDTHS.get(key, 15) - ws.column_dimensions[get_column_letter(col_idx)].width = width - - # --- Precompute column indices --- - title_col_idx = COLUMN_KEYS.index("title") + 1 - salary_col_idx = COLUMN_KEYS.index("salary") + 1 - remote_col_idx = COLUMN_KEYS.index("remote") + 1 - url_col_idx = COLUMN_KEYS.index("job_url") + 1 - status_col_idx = COLUMN_KEYS.index("status") + 1 - - # --- Data rows --- - for row_idx in range(2, num_rows + 1): - is_alt_row = (row_idx - 2) % 2 == 1 - - for col_idx in range(1, num_cols + 1): - cell = ws.cell(row=row_idx, column=col_idx) - cell.font = FONT_DEFAULT - cell.border = THIN_BORDER - cell.alignment = ALIGNMENT_WRAP - - # Alternating row background - if is_alt_row: - cell.fill = FILL_ALT_ROW - - # --- Title column: bold --- - title_cell = ws.cell(row=row_idx, column=title_col_idx) - title_cell.font = FONT_TITLE - - # --- Salary cell color coding --- - salary_cell = ws.cell(row=row_idx, column=salary_col_idx) - salary_val = str(salary_cell.value or "").strip() - if salary_val and salary_val != "None": - salary_cell.fill = FILL_HAS_SALARY - salary_cell.font = FONT_BOLD - else: - salary_cell.fill = FILL_NO_SALARY - salary_cell.value = "" - - # --- Remote cell color coding --- - remote_cell = ws.cell(row=row_idx, column=remote_col_idx) - remote_val = str(remote_cell.value or "").strip() - if remote_val and remote_val != "None": - remote_cell.fill = FILL_HAS_REMOTE - else: - remote_cell.value = "" - - # --- Clickable job URL --- - url_cell = ws.cell(row=row_idx, column=url_col_idx) - url_val = str(url_cell.value or "").strip() - if url_val.startswith("http"): - url_cell.hyperlink = url_val - url_cell.font = FONT_LINK - - # --- Status column: color coding --- - status_cell = ws.cell(row=row_idx, column=status_col_idx) - status_val = str(status_cell.value or "").strip().lower() - if status_val == "ok": - status_cell.fill = FILL_OK - elif status_val == "error": - status_cell.fill = FILL_ERROR - status_cell.alignment = ALIGNMENT_CENTER - status_cell.font = FONT_DIM - - # --- Clean remaining "None" values --- - for col_idx in range(1, num_cols + 1): - cell = ws.cell(row=row_idx, column=col_idx) - if cell.value is None or str(cell.value).strip() == "None": - cell.value = "" - - # --- Freeze panes (header row + first column) --- - ws.freeze_panes = "B2" - - # --- Auto filter --- - ws.auto_filter.ref = ws.dimensions - - # --- Summary sheet --- - _add_summary_sheet(writer, df, search_url) + cell = ws.cell(row=row_idx, column=col_idx) + if cell.value is None or str(cell.value).strip() == "None": + cell.value = "" - logger.info("Excel file written: %s (%d jobs)", path, len(df)) + # --- Freeze panes (header row + first column) --- + ws.freeze_panes = "B2" + + # --- Auto filter --- + ws.auto_filter.ref = ws.dimensions + + wb.save(path) + logger.info("Excel file written: %s (%d jobs)", path, len(df)) +# --------------------------------------------------------------------------- +# Summary sheet +# --------------------------------------------------------------------------- def _add_summary_sheet(writer, df: pd.DataFrame, search_url: str) -> None: """Add a summary/statistics sheet to the Excel file.""" wb = writer.book @@ -298,83 +288,3 @@ def _add_summary_sheet(writer, df: pd.DataFrame, search_url: str) -> None: # Title ws.merge_cells("A1:D1") - title_cell = ws.cell(row=1, column=1, value="FreeWork Data Scraper — Rapport") - title_cell.font = Font(bold=True, size=16, color="575ECF", name="Calibri") - title_cell.alignment = Alignment(horizontal="center", vertical="center") - ws.row_dimensions[1].height = 40 - - # Subtitle - ws.merge_cells("A2:D2") - sub_cell = ws.cell(row=2, column=1, value="Generated by SoClose | soclose.co") - sub_cell.font = Font(size=10, color="999999", name="Calibri") - sub_cell.alignment = Alignment(horizontal="center") - - # Stats - row = 4 - salary_col = COLUMN_LABELS["salary"] - remote_col = COLUMN_LABELS["remote"] - status_col = COLUMN_LABELS["status"] - exp_col = COLUMN_LABELS["experience"] - - total_jobs = len(df) - with_salary = int((df[salary_col].astype(str).str.strip() != "").sum()) - with_remote = int((df[remote_col].astype(str).str.strip() != "").sum()) - ok_count = int((df[status_col].astype(str).str.strip().str.lower() == "ok").sum()) - error_count = int((df[status_col].astype(str).str.strip().str.lower() == "error").sum()) - - stats = [ - ("URL de recherche", search_url or "N/A"), - ("Date de generation", datetime.now().strftime("%Y-%m-%d %H:%M")), - ("", ""), - ("Total Missions", total_jobs), - ("Avec Salaire / TJM", f"{with_salary} ({_pct(with_salary, total_jobs)})"), - ("Avec Teletravail", f"{with_remote} ({_pct(with_remote, total_jobs)})"), - ("", ""), - ("Extractions reussies", f"{ok_count} ({_pct(ok_count, total_jobs)})"), - ("Erreurs", f"{error_count} ({_pct(error_count, total_jobs)})"), - ] - - # Get unique experience levels - if exp_col in df.columns: - exp_values = df[exp_col].astype(str).str.strip() - exp_values = exp_values[exp_values != ""] - if not exp_values.empty: - stats.append(("", "")) - stats.append(("--- Niveaux d'experience ---", "")) - for exp, count in exp_values.value_counts().items(): - stats.append((f" {exp}", f"{count} ({_pct(count, total_jobs)})")) - - label_font = Font(bold=True, size=11, name="Calibri", color="333333") - value_font = Font(size=11, name="Calibri") - - for label, value in stats: - if label: - ws.cell(row=row, column=1, value=label).font = label_font - ws.cell(row=row, column=2, value=str(value)).font = value_font - row += 1 - - # Column widths - ws.column_dimensions["A"].width = 32 - ws.column_dimensions["B"].width = 45 - - # Set the jobs sheet as active - wb.active = wb.sheetnames.index("FreeWork Jobs") - - -def _pct(part: int, total: int) -> str: - """Format as percentage string.""" - if total == 0: - return "0%" - return f"{part * 100 // total}%" - - -# ====================================================================== -# Helpers -# ====================================================================== - -def _safe_filename(text: str) -> str: - """Sanitize text for use as a filename.""" - safe = text.strip().lower() - safe = re.sub(r"[^\w\s-]", "", safe) - safe = re.sub(r"[\s]+", "_", safe) - return safe[:100]