Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,17 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [Unreleased]

### Fixed
- Fixed metadata preservation during recategorization: structured metadata fields (speech_type, speaker, role, event, etc.) are now preserved when recategorizing files from the unknown folder
- Fixed remaining count calculation to correctly handle cases where metadata exists but PDF files don't
- Fixed early return bug that prevented processing metadata entries when no PDF files were present

### Changed
- Improved recategorization function to process metadata entries even when no PDF files are present
- Enhanced remaining count calculation to account for PDFs without metadata and metadata entries without PDFs

## [0.1.0] - 2025-11-09

### Added
Expand Down
28 changes: 25 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -110,15 +110,25 @@ Convert only a specific date range (inclusive):
bis-scraper convert --start-date 2020-01-01 --end-date 2020-01-31
```

#### Run Both Steps
#### Re-categorize Unknown Files

Run both scraping and conversion in one command:
After scraping, some files may be placed in an `unknown/` folder if their institution couldn't be identified. If you've updated institution mappings in `constants.py`, you can re-categorize these files:

```bash
bis-scraper recategorize
```

This command moves files from the `unknown/` folder to their correct institution folders based on updated mappings. It processes both PDFs and text files together.

#### Run All Steps

Run scraping, recategorization, and conversion in one command:

```bash
bis-scraper run-all --start-date 2020-01-01 --end-date 2020-01-31
```

Note: The run-all command forwards the same date range to conversion, so only PDFs in that range are converted.
Note: The `run-all` command runs scraping, then recategorization, then conversion. The date range is forwarded to conversion, so only PDFs in that range are converted.

#### Cache Management

Expand Down Expand Up @@ -159,6 +169,7 @@ See [Scripts README](scripts/README.md) for more details.
from pathlib import Path
import datetime
from bis_scraper.scrapers.controller import scrape_bis
from bis_scraper.scrapers.recategorize import recategorize_unknown_files
from bis_scraper.converters.controller import convert_pdfs_dates

# Download speeches
Expand All @@ -172,6 +183,11 @@ result = scrape_bis(
limit=None
)

# Re-categorize files from unknown folder (if any)
recategorized_count, remaining_unknown = recategorize_unknown_files(Path("data"))
if recategorized_count > 0:
print(f"Re-categorized {recategorized_count} file(s) from unknown folder")

# Convert to text (filtering to the same date range)
convert_result = convert_pdfs_dates(
data_dir=Path("data"),
Expand All @@ -195,6 +211,7 @@ import datetime
import logging
from pathlib import Path
from bis_scraper.scrapers.controller import scrape_bis
from bis_scraper.scrapers.recategorize import recategorize_unknown_files
from bis_scraper.converters.controller import convert_pdfs
from bis_scraper.utils.constants import INSTITUTIONS
from bis_scraper.utils.institution_utils import get_all_institutions
Expand Down Expand Up @@ -239,6 +256,11 @@ scrape_result = scrape_bis(
limit=10
)

# Re-categorize files from unknown folder (if any)
recategorized_count, remaining_unknown = recategorize_unknown_files(data_dir)
if recategorized_count > 0:
print(f"Re-categorized {recategorized_count} file(s) from unknown folder")

# Convert all downloaded speeches
convert_result = convert_pdfs(
data_dir=data_dir,
Expand Down
38 changes: 36 additions & 2 deletions bis_scraper/cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,38 @@ def scrape(
click.echo("Scraping completed!")


@main.command()
@click.pass_context
def recategorize(ctx: click.Context) -> None:
    """Move files out of the unknown folder using current institution mappings.

    Speeches whose institution could not be resolved at scrape time end up in
    the unknown folder. After the mappings in constants.py have been updated,
    this command relocates those files to their proper institution folders.

    PDFs and their companion text files are handled as a unit so the pair
    stays consistent.
    """
    # Local import keeps the dependency off the CLI's module import path.
    # NOTE(review): presumably done to keep startup light / avoid a cycle — confirm.
    from bis_scraper.scrapers.recategorize import recategorize_unknown_files

    target_dir = ctx.obj["data_dir"]

    click.echo("Re-categorizing files from unknown folder...")
    click.echo(f"Data directory: {target_dir.absolute()}")

    moved, still_unknown = recategorize_unknown_files(target_dir)

    # Report what was moved (or that nothing needed moving).
    if moved:
        click.echo(f"Re-categorized {moved} file(s) from unknown folder")
    else:
        click.echo("No files found to re-categorize")

    # Surface any leftovers so the user knows the mappings are still incomplete.
    if still_unknown:
        click.echo(f"{still_unknown} file(s) still remain in unknown folder")

    click.echo("Re-categorization completed!")


@main.command()
@click.option(
"--start-date",
Expand Down Expand Up @@ -244,7 +276,7 @@ def run_all(
force: bool,
limit: Optional[int],
) -> None:
"""Run both scraping and conversion steps."""
"""Run scraping, recategorization, and conversion steps."""
ctx.invoke(
scrape,
start_date=start_date,
Expand All @@ -253,6 +285,8 @@ def run_all(
force=force,
limit=limit,
)
# Re-categorize files from unknown folder
ctx.invoke(recategorize)
# Call convert with the same date range
ctx.invoke(
convert,
Expand All @@ -276,7 +310,7 @@ def clear_cache(ctx: click.Context) -> None:
# Read cache to show info
import json

with open(cache_file, "r") as f:
with open(cache_file, "r", encoding="utf-8") as f:
cache_data = json.load(f)
num_dates = len(cache_data.get("dates", {}))

Expand Down
33 changes: 23 additions & 10 deletions bis_scraper/scrapers/bis_scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ def _load_date_cache(self) -> None:
"""Load the date cache from disk."""
if self.date_cache_file.exists():
try:
with open(self.date_cache_file, "r") as f:
with open(self.date_cache_file, "r", encoding="utf-8") as f:
cache_data = json.load(f)
# Convert cache format if needed (for backwards compatibility)
if isinstance(cache_data, dict) and "version" in cache_data:
Expand All @@ -110,7 +110,7 @@ def _save_date_cache(self) -> None:
"dates": self.checked_dates,
"updated": datetime.datetime.now().isoformat(),
}
with open(self.date_cache_file, "w") as f:
with open(self.date_cache_file, "w", encoding="utf-8") as f:
json.dump(cache_data, f, indent=2)
logger.debug(f"Saved date cache with {len(self.checked_dates)} dates")
except Exception as e:
Expand Down Expand Up @@ -147,6 +147,7 @@ def scrape_date(self, date_obj: datetime.date) -> bool:
# Track if we found any speeches for this date
date_had_speeches = False
files_found_count = 0
had_network_error = False # Track if we had network errors

# Format date for URL: YYMMDD (without century)
date_str = date_obj.strftime("%y%m%d")
Expand Down Expand Up @@ -198,19 +199,31 @@ def scrape_date(self, date_obj: datetime.date) -> bool:
return False

except Exception as e:
if response.status_code != 404: # Don't log 404s as errors
# Check if response exists before accessing status_code
# Network errors (DNS, connection) may occur before response is set
if "response" in locals() and response.status_code != 404:
logger.error(
f"Error scraping {date_obj.isoformat()} - {speech_code}: {str(e)}",
exc_info=True,
)
self.result.failed += 1

# Mark this date as fully checked in the cache
self.checked_dates[cache_key] = {
"checked_at": datetime.datetime.now().isoformat(),
"had_speeches": date_had_speeches,
"files_found": files_found_count,
}
elif "response" not in locals():
# Network error before response was created
logger.error(
f"Network error scraping {date_obj.isoformat()} - {speech_code}: {str(e)}",
exc_info=True,
)
self.result.failed += 1
had_network_error = True

# Only mark date as checked if we didn't have network errors
# Network errors mean we couldn't fully check the date, so we shouldn't cache it
if not had_network_error:
self.checked_dates[cache_key] = {
"checked_at": datetime.datetime.now().isoformat(),
"had_speeches": date_had_speeches,
"files_found": files_found_count,
}
# Save cache periodically (every 10 dates to balance performance and safety)
if len(self.checked_dates) % 10 == 0:
self._save_date_cache()
Expand Down
11 changes: 2 additions & 9 deletions bis_scraper/scrapers/controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@

from bis_scraper.models import ScrapingResult
from bis_scraper.scrapers.bis_scraper import BisScraper
from bis_scraper.scrapers.recategorize import recategorize_unknown_files
from bis_scraper.utils.constants import RAW_DATA_DIR
from bis_scraper.utils.file_utils import create_directory
from bis_scraper.utils.institution_utils import normalize_institution_name
Expand All @@ -18,7 +17,7 @@

def scrape_bis(
data_dir: Path,
log_dir: Path,
log_dir: Path, # noqa: ARG001
start_date: Optional[datetime.datetime] = None,
end_date: Optional[datetime.datetime] = None,
institutions: Optional[Tuple[str, ...]] = None,
Expand All @@ -29,7 +28,7 @@ def scrape_bis(

Args:
data_dir: Base directory for data storage
log_dir: Directory for log files
log_dir: Directory for log files (part of API signature for consistency, logging configured at CLI level)
start_date: Start date for scraping
end_date: End date for scraping
institutions: Specific institutions to scrape (default: all)
Expand Down Expand Up @@ -133,12 +132,6 @@ def scrape_bis(
# Get results
result = scraper.get_results()

# Re-categorize files from unknown folder if any exist
recategorized_count, remaining_unknown = recategorize_unknown_files(data_dir)
if recategorized_count > 0:
logger.info(f"Re-categorized {recategorized_count} file(s) from unknown folder")
print(f"Re-categorized {recategorized_count} file(s) from unknown folder")

# Log summary
elapsed_time = time.time() - start_time
hours, remainder = divmod(elapsed_time, 3600)
Expand Down
Loading