Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -367,6 +367,24 @@ print("\nSpeeches with most 'inflation' mentions:")
print(results_df.sort_values('inflation', ascending=False)[['file_code', 'inflation']].head())
```

## Known Limitations

### PDF Text Conversion

Some PDFs (approximately 8%) may fail to convert to text due to encoding issues in the source PDF files. This occurs when PDFs use non-standard font encodings that the text extraction library cannot process.

**What happens:**
- The PDF is downloaded successfully
- Text conversion fails with an encoding error
- The PDF file remains available for manual processing

**If you encounter this issue:**
- The PDF file is still available in the `pdfs/` directory
- You can open it directly in a PDF viewer
- For text extraction, you may need to use OCR software or contact the source institution for alternative formats

**Note:** This is a limitation of the source PDF files, not a bug in this package. The package handles these errors gracefully and continues processing other files.

## Development

### Setting Up Development Environment
Expand Down
14 changes: 11 additions & 3 deletions bis_scraper/converters/pdf_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,15 @@ def _convert_pdf(self, pdf_path: Path, output_dir: Path) -> None:
# Extract text from PDF
logger.debug(f"Converting {pdf_path}")
text = textract.process(str(pdf_path))
text_str = text.decode("utf-8")

# Handle different return types from textract
if text is None:
raise ValueError(f"textract returned None for {pdf_path}")
elif isinstance(text, bytes):
text_str = text.decode("utf-8")
else:
# textract may return a string directly
text_str = str(text)

# Save text to file
with open(txt_path, "w", encoding="utf-8") as f:
Expand All @@ -172,8 +180,8 @@ def _convert_pdf(self, pdf_path: Path, output_dir: Path) -> None:
print(f"Error: {error_message}") # Print to stdout for CLI feedback
self.result.failed += 1
self.result.errors[file_code] = str(e)
# Re-raise the exception to be caught by the calling function
raise
# Don't re-raise - let convert_institution handle continuation
return

def get_results(self) -> ConversionResult:
"""Get the conversion results.
Expand Down
7 changes: 7 additions & 0 deletions bis_scraper/scrapers/controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

from bis_scraper.models import ScrapingResult
from bis_scraper.scrapers.bis_scraper import BisScraper
from bis_scraper.scrapers.recategorize import recategorize_unknown_files
from bis_scraper.utils.constants import RAW_DATA_DIR
from bis_scraper.utils.file_utils import create_directory
from bis_scraper.utils.institution_utils import normalize_institution_name
Expand Down Expand Up @@ -132,6 +133,12 @@ def scrape_bis(
# Get results
result = scraper.get_results()

# Re-categorize files from unknown folder if any exist
recategorized_count, remaining_unknown = recategorize_unknown_files(data_dir)
if recategorized_count > 0:
logger.info(f"Re-categorized {recategorized_count} file(s) from unknown folder")
print(f"Re-categorized {recategorized_count} file(s) from unknown folder")

# Log summary
elapsed_time = time.time() - start_time
hours, remainder = divmod(elapsed_time, 3600)
Expand Down
164 changes: 164 additions & 0 deletions bis_scraper/scrapers/recategorize.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
"""Function to re-categorize files from unknown folder."""

import json
import logging
import shutil
from datetime import datetime
from pathlib import Path
from typing import Tuple

from bis_scraper.utils.constants import RAW_DATA_DIR, TXT_DATA_DIR
from bis_scraper.utils.file_utils import (
get_institution_directory,
save_metadata_to_json,
)
from bis_scraper.utils.institution_utils import get_institution_from_metadata

logger = logging.getLogger(__name__)


def recategorize_unknown_files(data_dir: Path) -> Tuple[int, int]:
    """Re-categorize files from unknown folder using updated institution mappings.

    This function checks the unknown folder and attempts to re-categorize files
    based on their metadata. This is useful when institution mappings are updated
    in constants.py after files have already been downloaded.

    Args:
        data_dir: Base directory for data storage

    Returns:
        Tuple of (files_recategorized, files_remaining) counts
    """
    output_dir = data_dir / RAW_DATA_DIR
    unknown_dir = output_dir / "unknown"

    # If unknown folder doesn't exist, nothing to do
    if not unknown_dir.exists():
        return (0, 0)

    metadata_file = unknown_dir / "metadata.json"
    if not metadata_file.exists():
        logger.debug("No metadata.json found in unknown folder")
        return (0, 0)

    # Load metadata; a corrupt or unreadable file is non-fatal — just bail out.
    try:
        with open(metadata_file, "r", encoding="utf-8") as f:
            metadata_data = json.load(f)
    except (json.JSONDecodeError, IOError) as e:
        logger.warning(f"Error reading metadata.json from unknown folder: {e}")
        return (0, 0)

    recategorized = 0
    remaining_metadata: dict = {}

    # Process each entry. Guard clauses keep an entry in the unknown folder
    # whenever it cannot (yet) be re-categorized.
    for speech_code, metadata_entry in metadata_data.items():
        raw_text = metadata_entry.get("raw_text", "")
        if not raw_text:
            # No raw_text to match against — keep entry as unknown.
            remaining_metadata[speech_code] = metadata_entry
            continue

        # Try to extract institution from metadata; falsy (None/empty)
        # means no mapping matched.
        institution = get_institution_from_metadata(raw_text)
        if not institution:
            remaining_metadata[speech_code] = metadata_entry
            continue

        pdf_filename = f"{speech_code}.pdf"
        txt_filename = f"{speech_code}.txt"
        pdf_path = unknown_dir / pdf_filename
        if not pdf_path.exists():
            # Metadata exists but PDF is missing on disk — keep entry.
            remaining_metadata[speech_code] = metadata_entry
            continue

        # Resolve target locations for the PDF and any converted text file.
        target_pdf_dir = get_institution_directory(output_dir, institution)
        target_pdf_path = target_pdf_dir / pdf_filename
        texts_dir = data_dir / TXT_DATA_DIR
        unknown_texts_dir = texts_dir / "unknown"
        target_txt_dir = get_institution_directory(texts_dir, institution)
        txt_path = unknown_texts_dir / txt_filename
        target_txt_path = target_txt_dir / txt_filename

        try:
            # Ensure the target PDF directory exists before moving — mirrors
            # the explicit mkdir done for the text directory below (fixes an
            # inconsistency where only the txt target was created).
            target_pdf_dir.mkdir(parents=True, exist_ok=True)
            shutil.move(str(pdf_path), str(target_pdf_path))
            logger.info(
                f"Re-categorized {speech_code} PDF from unknown to {institution}"
            )

            # Move the converted text file alongside the PDF, if it exists.
            if txt_path.exists():
                target_txt_dir.mkdir(parents=True, exist_ok=True)
                shutil.move(str(txt_path), str(target_txt_path))
                logger.info(
                    f"Re-categorized {speech_code} text file from unknown to {institution}"
                )

            # Re-save metadata next to the moved PDF. The stored date is an
            # ISO string; tolerate a malformed or missing value.
            date_obj = None
            date_str = metadata_entry.get("date")
            if date_str:
                try:
                    date_obj = datetime.fromisoformat(date_str).date()
                except ValueError:
                    pass

            save_metadata_to_json(
                target_pdf_dir, speech_code, raw_text, date_obj
            )

            recategorized += 1
        except Exception as e:
            logger.error(
                f"Error moving {speech_code} to {institution}: {e}",
                exc_info=True,
            )
            # Keep entry if move failed
            remaining_metadata[speech_code] = metadata_entry

    # Rewrite the unknown-folder metadata, or clean up if nothing remains.
    if remaining_metadata:
        with open(metadata_file, "w", encoding="utf-8") as f:
            json.dump(remaining_metadata, f, indent=2, ensure_ascii=False)
    else:
        # No remaining files: remove metadata.json and empty unknown folders.
        metadata_file.unlink()
        try:
            unknown_dir.rmdir()  # Succeeds only when the folder is empty
            logger.info("Removed empty unknown PDF folder")
        except OSError:
            # Folder not empty (might have other files)
            pass

        # Also try to remove the unknown texts folder if empty.
        unknown_texts_dir = data_dir / TXT_DATA_DIR / "unknown"
        if unknown_texts_dir.exists():
            try:
                # Only .txt files count as content for the emptiness check;
                # other stray files still make rmdir raise, which is fine.
                if not list(unknown_texts_dir.glob("*.txt")):
                    unknown_texts_dir.rmdir()
                    logger.info("Removed empty unknown texts folder")
            except OSError:
                # Folder not empty or other error
                pass

    remaining_count = len(remaining_metadata)
    if recategorized > 0:
        logger.info(
            f"Re-categorized {recategorized} file(s) from unknown folder. "
            f"{remaining_count} file(s) still unknown."
        )

    return (recategorized, remaining_count)
3 changes: 3 additions & 0 deletions bis_scraper/utils/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@
"Central Bank of Colombia",
"Central Bank of Curaçao and Sint Maarten",
"Central Bank of Cyprus",
"Central Bank of Eswatini",
"Central Bank of Iceland",
"Central Bank of Ireland",
"Central Bank of Jordan",
Expand Down Expand Up @@ -196,6 +197,7 @@
],
"central bank of norway": ["norges bank"],
"bank of france": ["banque de france"],
"bank of portugal": ["banco de portugal"],
"netherlands bank": ["nederlandsche bank"],
"south african reserve bank": ["bank of south africa"],
"hong kong monetary authority": ["hong kong monetary"],
Expand All @@ -209,4 +211,5 @@
"central bank of the republic of turkey": ["bank of turkey"],
"people's bank of china": ["bank of china"],
"reserve bank of australia": ["australian reserve bank", "bank of australia"],
"saudi arabian monetary agency": ["saudi central bank"],
}
20 changes: 20 additions & 0 deletions docs/api.md
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,8 @@ Returns a `ConversionResult` object with the following attributes:
- `failed`: Number of PDFs that failed to convert
- `errors`: Dictionary mapping file codes to error messages

**Note:** Some PDFs (approximately 8%) may fail to convert due to encoding issues in the source PDF files. These PDFs are still downloaded successfully and remain available in the `pdfs/` directory for manual processing. See [Known Limitations](#known-limitations) for more details.

### `convert_pdfs_dates()`

Convert PDFs with an optional inclusive date range filter.
Expand Down Expand Up @@ -287,6 +289,24 @@ scrape_and_upload_to_gcs(

- **Error Handling**: Add appropriate error handling and retry logic for production use.

## Known Limitations

### PDF Text Conversion

Some PDFs (approximately 8%) may fail to convert to text due to encoding issues in the source PDF files. This occurs when PDFs use non-standard font encodings that the text extraction library cannot process.

**What happens:**
- The PDF is downloaded successfully
- Text conversion fails with an encoding error
- The PDF file remains available for manual processing

**If you encounter this issue:**
- The PDF file is still available in the `pdfs/` directory
- You can open it directly in a PDF viewer
- For text extraction, you may need to use OCR software or contact the source institution for alternative formats

**Note:** This is a limitation of the source PDF files, not a bug in this package. The package handles these errors gracefully and continues processing other files.

## Command-Line Interface (CLI)

The package also provides a command-line interface that can be used in your terminal after installation:
Expand Down
6 changes: 3 additions & 3 deletions scripts/run_full_scrape.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,12 @@
#LIMIT_NUM=5

# Data storage locations
DATA_DIR="$HOME/bis_full_data"
LOG_DIR="$HOME/bis_full_data/logs"
DATA_DIR="../data/bis_full_data"
LOG_DIR="../data/bis_full_data/logs"

# Date range (format: YYYY-MM-DD)
#START_DATE="1997-01-01" # BIS speeches start around 1997
START_DATE="2025-10-01" # BIS speeches start around 1997
START_DATE="2025-08-01" # recent scrape window only; full BIS history starts around 1997 (see commented line above)
#START_DATE=$(date +%Y-%m-%d) # Today's date
END_DATE=$(date +%Y-%m-%d) # Today's date

Expand Down
4 changes: 1 addition & 3 deletions tests/unit/test_pdf_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -227,9 +227,7 @@ def side_effect(file_path):
# Check results
result = converter.get_results()
self.assertEqual(result.successful, 1) # First file converted
# The PDF converter counts errors both in the converter's internal state
# and in the exception handler in convert_institution, resulting in 2 counts
self.assertEqual(result.failed, 2) # Second file failed, counted in two places
self.assertEqual(result.failed, 1) # Second file failed
self.assertIn("220102b", result.errors) # Error recorded


Expand Down
Loading