Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -367,6 +367,24 @@ print("\nSpeeches with most 'inflation' mentions:")
print(results_df.sort_values('inflation', ascending=False)[['file_code', 'inflation']].head())
```

## Known Limitations

### PDF Text Conversion

Some PDFs (approximately 8%) may fail to convert to text due to encoding issues in the source PDF files. This occurs when PDFs use non-standard font encodings that the text extraction library cannot process.

**What happens:**
- The PDF is downloaded successfully
- Text conversion fails with an encoding error
- The PDF file remains available for manual processing

**If you encounter this issue:**
- The PDF file is still available in the `pdfs/` directory
- You can open it directly in a PDF viewer
- For text extraction, you may need to use OCR software or contact the source institution for alternative formats

**Note:** This is a limitation of the source PDF files, not a bug in this package. The package handles these errors gracefully and continues processing other files.

## Development

### Setting Up Development Environment
Expand Down
14 changes: 11 additions & 3 deletions bis_scraper/converters/pdf_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,15 @@ def _convert_pdf(self, pdf_path: Path, output_dir: Path) -> None:
# Extract text from PDF
logger.debug(f"Converting {pdf_path}")
text = textract.process(str(pdf_path))
text_str = text.decode("utf-8")

# Handle different return types from textract
if text is None:
raise ValueError(f"textract returned None for {pdf_path}")
elif isinstance(text, bytes):
text_str = text.decode("utf-8")
else:
# textract may return a string directly
text_str = str(text)

# Save text to file
with open(txt_path, "w", encoding="utf-8") as f:
Expand All @@ -172,8 +180,8 @@ def _convert_pdf(self, pdf_path: Path, output_dir: Path) -> None:
print(f"Error: {error_message}") # Print to stdout for CLI feedback
self.result.failed += 1
self.result.errors[file_code] = str(e)
# Re-raise the exception to be caught by the calling function
raise
# Don't re-raise - let convert_institution handle continuation
return

def get_results(self) -> ConversionResult:
"""Get the conversion results.
Expand Down
7 changes: 7 additions & 0 deletions bis_scraper/scrapers/controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

from bis_scraper.models import ScrapingResult
from bis_scraper.scrapers.bis_scraper import BisScraper
from bis_scraper.scrapers.recategorize import recategorize_unknown_files
from bis_scraper.utils.constants import RAW_DATA_DIR
from bis_scraper.utils.file_utils import create_directory
from bis_scraper.utils.institution_utils import normalize_institution_name
Expand Down Expand Up @@ -132,6 +133,12 @@ def scrape_bis(
# Get results
result = scraper.get_results()

# Re-categorize files from unknown folder if any exist
recategorized_count, remaining_unknown = recategorize_unknown_files(data_dir)
if recategorized_count > 0:
logger.info(f"Re-categorized {recategorized_count} file(s) from unknown folder")
print(f"Re-categorized {recategorized_count} file(s) from unknown folder")

# Log summary
elapsed_time = time.time() - start_time
hours, remainder = divmod(elapsed_time, 3600)
Expand Down
164 changes: 164 additions & 0 deletions bis_scraper/scrapers/recategorize.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
"""Function to re-categorize files from unknown folder."""

import json
import logging
import shutil
from datetime import datetime
from pathlib import Path
from typing import Tuple

from bis_scraper.utils.constants import RAW_DATA_DIR, TXT_DATA_DIR
from bis_scraper.utils.file_utils import (
get_institution_directory,
save_metadata_to_json,
)
from bis_scraper.utils.institution_utils import get_institution_from_metadata

logger = logging.getLogger(__name__)


def recategorize_unknown_files(data_dir: Path) -> Tuple[int, int]:
    """Re-categorize files from unknown folder using updated institution mappings.

    This function checks the unknown folder and attempts to re-categorize files
    based on their metadata. This is useful when institution mappings are updated
    in constants.py after files have already been downloaded.

    Args:
        data_dir: Base directory for data storage

    Returns:
        Tuple of (files_recategorized, files_remaining) counts
    """
    output_dir = data_dir / RAW_DATA_DIR
    unknown_dir = output_dir / "unknown"

    # If unknown folder doesn't exist, nothing to do
    if not unknown_dir.exists():
        return (0, 0)

    metadata_file = unknown_dir / "metadata.json"
    if not metadata_file.exists():
        logger.debug("No metadata.json found in unknown folder")
        return (0, 0)

    # Load metadata; a corrupt or unreadable file is non-fatal — just bail out.
    try:
        with open(metadata_file, "r", encoding="utf-8") as f:
            metadata_data = json.load(f)
    except (json.JSONDecodeError, IOError) as e:
        logger.warning(f"Error reading metadata.json from unknown folder: {e}")
        return (0, 0)

    recategorized = 0
    remaining_metadata: dict = {}

    # Process each entry. Guard clauses keep an entry in the unknown folder
    # whenever it cannot (yet) be re-categorized.
    for speech_code, metadata_entry in metadata_data.items():
        raw_text = metadata_entry.get("raw_text", "")
        if not raw_text:
            # No raw_text to match against — keep entry as unknown.
            remaining_metadata[speech_code] = metadata_entry
            continue

        # Try to extract institution from metadata; falsy (None/empty)
        # means no mapping matched.
        institution = get_institution_from_metadata(raw_text)
        if not institution:
            remaining_metadata[speech_code] = metadata_entry
            continue

        pdf_filename = f"{speech_code}.pdf"
        txt_filename = f"{speech_code}.txt"
        pdf_path = unknown_dir / pdf_filename
        if not pdf_path.exists():
            # Metadata exists but PDF is missing on disk — keep entry.
            remaining_metadata[speech_code] = metadata_entry
            continue

        # Resolve target locations for the PDF and any converted text file.
        target_pdf_dir = get_institution_directory(output_dir, institution)
        target_pdf_path = target_pdf_dir / pdf_filename
        texts_dir = data_dir / TXT_DATA_DIR
        unknown_texts_dir = texts_dir / "unknown"
        target_txt_dir = get_institution_directory(texts_dir, institution)
        txt_path = unknown_texts_dir / txt_filename
        target_txt_path = target_txt_dir / txt_filename

        try:
            # Ensure the target PDF directory exists before moving — mirrors
            # the explicit mkdir done for the text directory below (fixes an
            # inconsistency where only the txt target was created).
            target_pdf_dir.mkdir(parents=True, exist_ok=True)
            shutil.move(str(pdf_path), str(target_pdf_path))
            logger.info(
                f"Re-categorized {speech_code} PDF from unknown to {institution}"
            )

            # Move the converted text file alongside the PDF, if it exists.
            if txt_path.exists():
                target_txt_dir.mkdir(parents=True, exist_ok=True)
                shutil.move(str(txt_path), str(target_txt_path))
                logger.info(
                    f"Re-categorized {speech_code} text file from unknown to {institution}"
                )

            # Re-save metadata next to the moved PDF. The stored date is an
            # ISO string; tolerate a malformed or missing value.
            date_obj = None
            date_str = metadata_entry.get("date")
            if date_str:
                try:
                    date_obj = datetime.fromisoformat(date_str).date()
                except ValueError:
                    pass

            save_metadata_to_json(
                target_pdf_dir, speech_code, raw_text, date_obj
            )

            recategorized += 1
        except Exception as e:
            logger.error(
                f"Error moving {speech_code} to {institution}: {e}",
                exc_info=True,
            )
            # Keep entry if move failed
            remaining_metadata[speech_code] = metadata_entry

    # Rewrite the unknown-folder metadata, or clean up if nothing remains.
    if remaining_metadata:
        with open(metadata_file, "w", encoding="utf-8") as f:
            json.dump(remaining_metadata, f, indent=2, ensure_ascii=False)
    else:
        # No remaining files: remove metadata.json and empty unknown folders.
        metadata_file.unlink()
        try:
            unknown_dir.rmdir()  # Succeeds only when the folder is empty
            logger.info("Removed empty unknown PDF folder")
        except OSError:
            # Folder not empty (might have other files)
            pass

        # Also try to remove the unknown texts folder if empty.
        unknown_texts_dir = data_dir / TXT_DATA_DIR / "unknown"
        if unknown_texts_dir.exists():
            try:
                # Only .txt files count as content for the emptiness check;
                # other stray files still make rmdir raise, which is fine.
                if not list(unknown_texts_dir.glob("*.txt")):
                    unknown_texts_dir.rmdir()
                    logger.info("Removed empty unknown texts folder")
            except OSError:
                # Folder not empty or other error
                pass

    remaining_count = len(remaining_metadata)
    if recategorized > 0:
        logger.info(
            f"Re-categorized {recategorized} file(s) from unknown folder. "
            f"{remaining_count} file(s) still unknown."
        )

    return (recategorized, remaining_count)
3 changes: 3 additions & 0 deletions bis_scraper/utils/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@
"Central Bank of Colombia",
"Central Bank of Curaçao and Sint Maarten",
"Central Bank of Cyprus",
"Central Bank of Eswatini",
"Central Bank of Iceland",
"Central Bank of Ireland",
"Central Bank of Jordan",
Expand Down Expand Up @@ -196,6 +197,7 @@
],
"central bank of norway": ["norges bank"],
"bank of france": ["banque de france"],
"bank of portugal": ["banco de portugal"],
"netherlands bank": ["nederlandsche bank"],
"south african reserve bank": ["bank of south africa"],
"hong kong monetary authority": ["hong kong monetary"],
Expand All @@ -209,4 +211,5 @@
"central bank of the republic of turkey": ["bank of turkey"],
"people's bank of china": ["bank of china"],
"reserve bank of australia": ["australian reserve bank", "bank of australia"],
"saudi arabian monetary agency": ["saudi central bank"],
}
20 changes: 20 additions & 0 deletions docs/api.md
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,8 @@ Returns a `ConversionResult` object with the following attributes:
- `failed`: Number of PDFs that failed to convert
- `errors`: Dictionary mapping file codes to error messages

**Note:** Some PDFs (approximately 8%) may fail to convert due to encoding issues in the source PDF files. These PDFs are still downloaded successfully and remain available in the `pdfs/` directory for manual processing. See [Known Limitations](#known-limitations) for more details.

### `convert_pdfs_dates()`

Convert PDFs with an optional inclusive date range filter.
Expand Down Expand Up @@ -287,6 +289,24 @@ scrape_and_upload_to_gcs(

- **Error Handling**: Add appropriate error handling and retry logic for production use.

## Known Limitations

### PDF Text Conversion

Some PDFs (approximately 8%) may fail to convert to text due to encoding issues in the source PDF files. This occurs when PDFs use non-standard font encodings that the text extraction library cannot process.

**What happens:**
- The PDF is downloaded successfully
- Text conversion fails with an encoding error
- The PDF file remains available for manual processing

**If you encounter this issue:**
- The PDF file is still available in the `pdfs/` directory
- You can open it directly in a PDF viewer
- For text extraction, you may need to use OCR software or contact the source institution for alternative formats

**Note:** This is a limitation of the source PDF files, not a bug in this package. The package handles these errors gracefully and continues processing other files.

## Command-Line Interface (CLI)

The package also provides a command-line interface that can be used in your terminal after installation:
Expand Down
6 changes: 3 additions & 3 deletions scripts/run_full_scrape.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,12 @@
#LIMIT_NUM=5

# Data storage locations
DATA_DIR="$HOME/bis_full_data"
LOG_DIR="$HOME/bis_full_data/logs"
DATA_DIR="../data/bis_full_data"
LOG_DIR="../data/bis_full_data/logs"

# Date range (format: YYYY-MM-DD)
#START_DATE="1997-01-01" # BIS speeches start around 1997
START_DATE="2025-10-01" # BIS speeches start around 1997
START_DATE="2025-08-01" # recent scrape window only; full BIS history starts around 1997 (see commented line above)
#START_DATE=$(date +%Y-%m-%d) # Today's date
END_DATE=$(date +%Y-%m-%d) # Today's date

Expand Down
4 changes: 1 addition & 3 deletions tests/unit/test_pdf_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -227,9 +227,7 @@ def side_effect(file_path):
# Check results
result = converter.get_results()
self.assertEqual(result.successful, 1) # First file converted
# The PDF converter counts errors both in the converter's internal state
# and in the exception handler in convert_institution, resulting in 2 counts
self.assertEqual(result.failed, 2) # Second file failed, counted in two places
self.assertEqual(result.failed, 1) # Second file failed
self.assertIn("220102b", result.errors) # Error recorded


Expand Down
Loading