diff --git a/README.md b/README.md index 9fd815e..ec00a45 100644 --- a/README.md +++ b/README.md @@ -367,6 +367,24 @@ print("\nSpeeches with most 'inflation' mentions:") print(results_df.sort_values('inflation', ascending=False)[['file_code', 'inflation']].head()) ``` +## Known Limitations + +### PDF Text Conversion + +Some PDFs (approximately 8%) may fail to convert to text due to encoding issues in the source PDF files. This occurs when PDFs use non-standard font encodings that the text extraction library cannot process. + +**What happens:** +- The PDF is downloaded successfully +- Text conversion fails with an encoding error +- The PDF file remains available for manual processing + +**If you encounter this issue:** +- The PDF file is still available in the `pdfs/` directory +- You can open it directly in a PDF viewer +- For text extraction, you may need to use OCR software or contact the source institution for alternative formats + +**Note:** This is a limitation of the source PDF files, not a bug in this package. The package handles these errors gracefully and continues processing other files. 
+ ## Development ### Setting Up Development Environment diff --git a/bis_scraper/converters/pdf_converter.py b/bis_scraper/converters/pdf_converter.py index fe3724b..2a4277e 100644 --- a/bis_scraper/converters/pdf_converter.py +++ b/bis_scraper/converters/pdf_converter.py @@ -154,7 +154,15 @@ def _convert_pdf(self, pdf_path: Path, output_dir: Path) -> None: # Extract text from PDF logger.debug(f"Converting {pdf_path}") text = textract.process(str(pdf_path)) - text_str = text.decode("utf-8") + + # Handle different return types from textract + if text is None: + raise ValueError(f"textract returned None for {pdf_path}") + elif isinstance(text, bytes): + text_str = text.decode("utf-8") + else: + # textract may return a string directly + text_str = str(text) # Save text to file with open(txt_path, "w", encoding="utf-8") as f: @@ -172,8 +180,8 @@ def _convert_pdf(self, pdf_path: Path, output_dir: Path) -> None: print(f"Error: {error_message}") # Print to stdout for CLI feedback self.result.failed += 1 self.result.errors[file_code] = str(e) - # Re-raise the exception to be caught by the calling function - raise + # Don't re-raise - let convert_institution handle continuation + return def get_results(self) -> ConversionResult: """Get the conversion results. 
diff --git a/bis_scraper/scrapers/controller.py b/bis_scraper/scrapers/controller.py index 40c6629..ef7f99d 100644 --- a/bis_scraper/scrapers/controller.py +++ b/bis_scraper/scrapers/controller.py @@ -8,6 +8,7 @@ from bis_scraper.models import ScrapingResult from bis_scraper.scrapers.bis_scraper import BisScraper +from bis_scraper.scrapers.recategorize import recategorize_unknown_files from bis_scraper.utils.constants import RAW_DATA_DIR from bis_scraper.utils.file_utils import create_directory from bis_scraper.utils.institution_utils import normalize_institution_name @@ -132,6 +133,12 @@ def scrape_bis( # Get results result = scraper.get_results() + # Re-categorize files from unknown folder if any exist + recategorized_count, remaining_unknown = recategorize_unknown_files(data_dir) + if recategorized_count > 0: + logger.info(f"Re-categorized {recategorized_count} file(s) from unknown folder") + print(f"Re-categorized {recategorized_count} file(s) from unknown folder") + # Log summary elapsed_time = time.time() - start_time hours, remainder = divmod(elapsed_time, 3600) diff --git a/bis_scraper/scrapers/recategorize.py b/bis_scraper/scrapers/recategorize.py new file mode 100644 index 0000000..47aea90 --- /dev/null +++ b/bis_scraper/scrapers/recategorize.py @@ -0,0 +1,164 @@ +"""Function to re-categorize files from unknown folder.""" + +import json +import logging +import shutil +from datetime import datetime +from pathlib import Path +from typing import Tuple + +from bis_scraper.utils.constants import RAW_DATA_DIR, TXT_DATA_DIR +from bis_scraper.utils.file_utils import ( + get_institution_directory, + save_metadata_to_json, +) +from bis_scraper.utils.institution_utils import get_institution_from_metadata + +logger = logging.getLogger(__name__) + + +def recategorize_unknown_files(data_dir: Path) -> Tuple[int, int]: + """Re-categorize files from unknown folder using updated institution mappings. 
+ + This function checks the unknown folder and attempts to re-categorize files + based on their metadata. This is useful when institution mappings are updated + in constants.py after files have already been downloaded. + + Args: + data_dir: Base directory for data storage + + Returns: + Tuple of (files_recategorized, files_remaining) counts + """ + output_dir = data_dir / RAW_DATA_DIR + unknown_dir = output_dir / "unknown" + + # If unknown folder doesn't exist, nothing to do + if not unknown_dir.exists(): + return (0, 0) + + metadata_file = unknown_dir / "metadata.json" + if not metadata_file.exists(): + logger.debug("No metadata.json found in unknown folder") + return (0, 0) + + # Load metadata + try: + with open(metadata_file, "r", encoding="utf-8") as f: + metadata_data = json.load(f) + except (json.JSONDecodeError, IOError) as e: + logger.warning(f"Error reading metadata.json from unknown folder: {e}") + return (0, 0) + + recategorized = 0 + remaining_metadata = {} + + # Process each entry in metadata + for speech_code, metadata_entry in metadata_data.items(): + raw_text = metadata_entry.get("raw_text", "") + if not raw_text: + # Keep entry if no raw_text available + remaining_metadata[speech_code] = metadata_entry + continue + + # Try to extract institution from metadata + institution = get_institution_from_metadata(raw_text) + + if institution: # None means not found, any string means found + # Found a valid institution - move the files + pdf_filename = f"{speech_code}.pdf" + txt_filename = f"{speech_code}.txt" + pdf_path = unknown_dir / pdf_filename + + if pdf_path.exists(): + # Get target institution directories + target_pdf_dir = get_institution_directory(output_dir, institution) + target_pdf_path = target_pdf_dir / pdf_filename + + # Get text directories + texts_dir = data_dir / TXT_DATA_DIR + unknown_texts_dir = texts_dir / "unknown" + target_txt_dir = get_institution_directory(texts_dir, institution) + txt_path = unknown_texts_dir / txt_filename + 
target_txt_path = target_txt_dir / txt_filename + + # Move PDF file + try: + shutil.move(str(pdf_path), str(target_pdf_path)) + logger.info( + f"Re-categorized {speech_code} PDF from unknown to {institution}" + ) + + # Move text file if it exists + if txt_path.exists(): + # Ensure target text directory exists + target_txt_dir.mkdir(parents=True, exist_ok=True) + shutil.move(str(txt_path), str(target_txt_path)) + logger.info( + f"Re-categorized {speech_code} text file from unknown to {institution}" + ) + + # Save metadata to target directory + date_str = metadata_entry.get("date") + date_obj = None + if date_str: + try: + date_obj = datetime.fromisoformat(date_str).date() + except ValueError: + pass + + save_metadata_to_json( + target_pdf_dir, speech_code, raw_text, date_obj + ) + + recategorized += 1 + except Exception as e: + logger.error( + f"Error moving {speech_code} to {institution}: {e}", + exc_info=True, + ) + # Keep entry if move failed + remaining_metadata[speech_code] = metadata_entry + else: + # PDF doesn't exist, but keep metadata entry + remaining_metadata[speech_code] = metadata_entry + else: + # Still unknown, keep entry + remaining_metadata[speech_code] = metadata_entry + + # Update metadata.json in unknown folder + if remaining_metadata: + with open(metadata_file, "w", encoding="utf-8") as f: + json.dump(remaining_metadata, f, indent=2, ensure_ascii=False) + else: + # No remaining files, remove metadata.json and unknown folders if empty + metadata_file.unlink() + try: + unknown_dir.rmdir() # Remove if empty + logger.info("Removed empty unknown PDF folder") + except OSError: + # Folder not empty (might have other files) + pass + + # Also try to remove unknown texts folder if empty + texts_dir = data_dir / TXT_DATA_DIR + unknown_texts_dir = texts_dir / "unknown" + if unknown_texts_dir.exists(): + try: + # Check if folder is empty (only check for .txt files, ignore other files) + txt_files = list(unknown_texts_dir.glob("*.txt")) + if not txt_files: + 
unknown_texts_dir.rmdir() + logger.info("Removed empty unknown texts folder") + except OSError: + # Folder not empty or other error + pass + + remaining_count = len(remaining_metadata) + if recategorized > 0: + logger.info( + f"Re-categorized {recategorized} file(s) from unknown folder. " + f"{remaining_count} file(s) still unknown." + ) + + return (recategorized, remaining_count) diff --git a/bis_scraper/utils/constants.py b/bis_scraper/utils/constants.py index 068ddee..a1330a6 100644 --- a/bis_scraper/utils/constants.py +++ b/bis_scraper/utils/constants.py @@ -53,6 +53,7 @@ "Central Bank of Colombia", "Central Bank of CuraƧao and Sint Maarten", "Central Bank of Cyprus", + "Central Bank of Eswatini", "Central Bank of Iceland", "Central Bank of Ireland", "Central Bank of Jordan", @@ -196,6 +197,7 @@ ], "central bank of norway": ["norges bank"], "bank of france": ["banque de france"], + "bank of portugal": ["banco de portugal"], "netherlands bank": ["nederlandsche bank"], "south african reserve bank": ["bank of south africa"], "hong kong monetary authority": ["hong kong monetary"], @@ -209,4 +211,5 @@ "central bank of the republic of turkey": ["bank of turkey"], "people's bank of china": ["bank of china"], "reserve bank of australia": ["australian reserve bank", "bank of australia"], + "saudi arabian monetary agency": ["saudi central bank"], } diff --git a/docs/api.md b/docs/api.md index 9a6cfff..583d663 100644 --- a/docs/api.md +++ b/docs/api.md @@ -82,6 +82,8 @@ Returns a `ConversionResult` object with the following attributes: - `failed`: Number of PDFs that failed to convert - `errors`: Dictionary mapping file codes to error messages +**Note:** Some PDFs (approximately 8%) may fail to convert due to encoding issues in the source PDF files. These PDFs are still downloaded successfully and remain available in the `pdfs/` directory for manual processing. See [Known Limitations](#known-limitations) for more details. 
+ ### `convert_pdfs_dates()` Convert PDFs with an optional inclusive date range filter. @@ -287,6 +289,24 @@ scrape_and_upload_to_gcs( - **Error Handling**: Add appropriate error handling and retry logic for production use. +## Known Limitations + +### PDF Text Conversion + +Some PDFs (approximately 8%) may fail to convert to text due to encoding issues in the source PDF files. This occurs when PDFs use non-standard font encodings that the text extraction library cannot process. + +**What happens:** +- The PDF is downloaded successfully +- Text conversion fails with an encoding error +- The PDF file remains available for manual processing + +**If you encounter this issue:** +- The PDF file is still available in the `pdfs/` directory +- You can open it directly in a PDF viewer +- For text extraction, you may need to use OCR software or contact the source institution for alternative formats + +**Note:** This is a limitation of the source PDF files, not a bug in this package. The package handles these errors gracefully and continues processing other files. 
+ ## Command-Line Interface (CLI) The package also provides a command-line interface that can be used in your terminal after installation: diff --git a/scripts/run_full_scrape.sh b/scripts/run_full_scrape.sh index 77af5e4..fda6669 100755 --- a/scripts/run_full_scrape.sh +++ b/scripts/run_full_scrape.sh @@ -7,12 +7,12 @@ #LIMIT_NUM=5 # Data storage locations -DATA_DIR="$HOME/bis_full_data" -LOG_DIR="$HOME/bis_full_data/logs" +DATA_DIR="../data/bis_full_data" +LOG_DIR="../data/bis_full_data/logs" # Date range (format: YYYY-MM-DD) #START_DATE="1997-01-01" # BIS speeches start around 1997 -START_DATE="2025-10-01" # BIS speeches start around 1997 +START_DATE="2025-08-01" # Recent speeches only (the full archive starts around 1997) #START_DATE=$(date +%Y-%m-%d) # Today's date END_DATE=$(date +%Y-%m-%d) # Today's date diff --git a/tests/unit/test_pdf_converter.py b/tests/unit/test_pdf_converter.py index f893cde..36b82df 100644 --- a/tests/unit/test_pdf_converter.py +++ b/tests/unit/test_pdf_converter.py @@ -227,9 +227,7 @@ def side_effect(file_path): # Check results result = converter.get_results() self.assertEqual(result.successful, 1) # First file converted - # The PDF converter counts errors both in the converter's internal state - # and in the exception handler in convert_institution, resulting in 2 counts - self.assertEqual(result.failed, 2) # Second file failed, counted in two places + self.assertEqual(result.failed, 1) # Second file failed self.assertIn("220102b", result.errors) # Error recorded diff --git a/tests/unit/test_recategorize.py b/tests/unit/test_recategorize.py new file mode 100644 index 0000000..98ce390 --- /dev/null +++ b/tests/unit/test_recategorize.py @@ -0,0 +1,256 @@ +"""Unit tests for recategorize functionality.""" + +import json +import tempfile +import unittest +from pathlib import Path + +from bis_scraper.scrapers.recategorize import recategorize_unknown_files +from bis_scraper.utils.constants import RAW_DATA_DIR, TXT_DATA_DIR + + +class 
TestRecategorize(unittest.TestCase): + """Test recategorize_unknown_files function.""" + + def setUp(self) -> None: + """Set up test fixtures.""" + # Create a temporary directory for tests + self.temp_dir = Path(tempfile.mkdtemp()) + self.output_dir = self.temp_dir / RAW_DATA_DIR + self.texts_dir = self.temp_dir / TXT_DATA_DIR + self.unknown_dir = self.output_dir / "unknown" + self.unknown_texts_dir = self.texts_dir / "unknown" + self.unknown_dir.mkdir(parents=True) + self.unknown_texts_dir.mkdir(parents=True) + + def tearDown(self) -> None: + """Tear down test fixtures.""" + # Clean up temporary directory + import shutil + + if self.temp_dir.exists(): + shutil.rmtree(self.temp_dir) + + def test_no_unknown_folder(self) -> None: + """Test when unknown folder doesn't exist.""" + # Remove unknown folder + self.unknown_dir.rmdir() + self.output_dir.rmdir() + + recategorized, remaining = recategorize_unknown_files(self.temp_dir) + self.assertEqual(recategorized, 0) + self.assertEqual(remaining, 0) + + def test_no_metadata_file(self) -> None: + """Test when metadata.json doesn't exist.""" + recategorized, remaining = recategorize_unknown_files(self.temp_dir) + self.assertEqual(recategorized, 0) + self.assertEqual(remaining, 0) + + def test_recategorize_successful(self) -> None: + """Test successful re-categorization of files.""" + # Create metadata.json with ECB speech + metadata_data = { + "200101a": { + "raw_text": "Speech by Mr. John Smith, Governor of the European Central Bank, at the Conference", + "speech_type": "Speech", + "speaker": "Mr. 
John Smith", + "role": "Governor of the European Central Bank", + "date": "2020-01-01", + } + } + metadata_file = self.unknown_dir / "metadata.json" + with open(metadata_file, "w", encoding="utf-8") as f: + json.dump(metadata_data, f, indent=2) + + # Create PDF file + pdf_file = self.unknown_dir / "200101a.pdf" + pdf_file.write_bytes(b"%PDF-1.4\nTest PDF content") + + # Create corresponding text file + txt_file = self.unknown_texts_dir / "200101a.txt" + txt_file.write_text("Test text content", encoding="utf-8") + + # Run recategorize + recategorized, remaining = recategorize_unknown_files(self.temp_dir) + + # Check results + self.assertEqual(recategorized, 1) + self.assertEqual(remaining, 0) + + # Check PDF file was moved + ecb_pdf_dir = self.output_dir / "european_central_bank" + self.assertTrue(ecb_pdf_dir.exists()) + self.assertTrue((ecb_pdf_dir / "200101a.pdf").exists()) + self.assertFalse(pdf_file.exists()) + + # Check text file was moved + ecb_txt_dir = self.texts_dir / "european_central_bank" + self.assertTrue(ecb_txt_dir.exists()) + self.assertTrue((ecb_txt_dir / "200101a.txt").exists()) + self.assertFalse(txt_file.exists()) + + # Check metadata was moved + self.assertTrue((ecb_pdf_dir / "metadata.json").exists()) + with open(ecb_pdf_dir / "metadata.json", "r", encoding="utf-8") as f: + moved_metadata = json.load(f) + self.assertIn("200101a", moved_metadata) + + # Check unknown folders were removed (empty) + self.assertFalse(self.unknown_dir.exists()) + self.assertFalse(self.unknown_texts_dir.exists()) + + def test_recategorize_partial(self) -> None: + """Test partial re-categorization when some files can't be categorized.""" + # Create metadata with one recognizable and one unrecognizable + metadata_data = { + "200101a": { + "raw_text": "Speech by Mr. 
John Smith, Governor of the European Central Bank", + "date": "2020-01-01", + }, + "200101b": { + "raw_text": "Some unrecognizable text without institution", + "date": "2020-01-01", + }, + } + metadata_file = self.unknown_dir / "metadata.json" + with open(metadata_file, "w", encoding="utf-8") as f: + json.dump(metadata_data, f, indent=2) + + # Create PDF files + (self.unknown_dir / "200101a.pdf").write_bytes(b"%PDF-1.4\nTest PDF") + (self.unknown_dir / "200101b.pdf").write_bytes(b"%PDF-1.4\nTest PDF") + + # Create text files + (self.unknown_texts_dir / "200101a.txt").write_text( + "Test text", encoding="utf-8" + ) + (self.unknown_texts_dir / "200101b.txt").write_text( + "Test text", encoding="utf-8" + ) + + # Run recategorize + recategorized, remaining = recategorize_unknown_files(self.temp_dir) + + # Check results + self.assertEqual(recategorized, 1) + self.assertEqual(remaining, 1) + + # Check ECB files were moved + ecb_pdf_dir = self.output_dir / "european_central_bank" + ecb_txt_dir = self.texts_dir / "european_central_bank" + self.assertTrue((ecb_pdf_dir / "200101a.pdf").exists()) + self.assertTrue((ecb_txt_dir / "200101a.txt").exists()) + + # Check unknown files remain + self.assertTrue((self.unknown_dir / "200101b.pdf").exists()) + self.assertTrue((self.unknown_texts_dir / "200101b.txt").exists()) + self.assertTrue(metadata_file.exists()) + + # Check remaining metadata + with open(metadata_file, "r", encoding="utf-8") as f: + remaining_metadata = json.load(f) + self.assertIn("200101b", remaining_metadata) + self.assertNotIn("200101a", remaining_metadata) + + def test_missing_pdf_file(self) -> None: + """Test when PDF file doesn't exist.""" + # Create metadata but no PDF + metadata_data = { + "200101a": { + "raw_text": "Speech by Mr. 
John Smith, Governor of the European Central Bank", + "date": "2020-01-01", + } + } + metadata_file = self.unknown_dir / "metadata.json" + with open(metadata_file, "w", encoding="utf-8") as f: + json.dump(metadata_data, f, indent=2) + + # Create text file (but no PDF) + (self.unknown_texts_dir / "200101a.txt").write_text( + "Test text", encoding="utf-8" + ) + + # Run recategorize + recategorized, remaining = recategorize_unknown_files(self.temp_dir) + + # Should not recategorize (no PDF) + self.assertEqual(recategorized, 0) + self.assertEqual(remaining, 1) + + # Metadata and text file should remain + self.assertTrue(metadata_file.exists()) + self.assertTrue((self.unknown_texts_dir / "200101a.txt").exists()) + + def test_pdf_without_text_file(self) -> None: + """Test when PDF exists but text file doesn't.""" + # Create metadata with ECB speech + metadata_data = { + "200101a": { + "raw_text": "Speech by Mr. John Smith, Governor of the European Central Bank", + "date": "2020-01-01", + } + } + metadata_file = self.unknown_dir / "metadata.json" + with open(metadata_file, "w", encoding="utf-8") as f: + json.dump(metadata_data, f, indent=2) + + # Create PDF file but no text file + pdf_file = self.unknown_dir / "200101a.pdf" + pdf_file.write_bytes(b"%PDF-1.4\nTest PDF content") + + # Run recategorize + recategorized, remaining = recategorize_unknown_files(self.temp_dir) + + # Should recategorize (PDF exists, text file optional) + self.assertEqual(recategorized, 1) + self.assertEqual(remaining, 0) + + # Check PDF was moved + ecb_pdf_dir = self.output_dir / "european_central_bank" + self.assertTrue((ecb_pdf_dir / "200101a.pdf").exists()) + + # Text file should not exist in target (wasn't created) + ecb_txt_dir = self.texts_dir / "european_central_bank" + if ecb_txt_dir.exists(): + self.assertFalse((ecb_txt_dir / "200101a.txt").exists()) + + def test_no_raw_text(self) -> None: + """Test when metadata has no raw_text.""" + # Create metadata without raw_text + metadata_data = { 
+ "200101a": { + "speech_type": "Speech", + "date": "2020-01-01", + } + } + metadata_file = self.unknown_dir / "metadata.json" + with open(metadata_file, "w", encoding="utf-8") as f: + json.dump(metadata_data, f, indent=2) + + # Create PDF file + (self.unknown_dir / "200101a.pdf").write_bytes(b"%PDF-1.4\nTest PDF") + + # Run recategorize + recategorized, remaining = recategorize_unknown_files(self.temp_dir) + + # Should not recategorize (no raw_text to extract institution from) + self.assertEqual(recategorized, 0) + self.assertEqual(remaining, 1) + + def test_invalid_json(self) -> None: + """Test handling of invalid JSON.""" + # Create invalid JSON file + metadata_file = self.unknown_dir / "metadata.json" + metadata_file.write_text("{ invalid json }", encoding="utf-8") + + # Run recategorize (should not crash) + recategorized, remaining = recategorize_unknown_files(self.temp_dir) + + # Should return 0, 0 (graceful failure) + self.assertEqual(recategorized, 0) + self.assertEqual(remaining, 0) + + +if __name__ == "__main__": + unittest.main()