Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,17 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [Unreleased]

### Fixed
- Fixed metadata preservation during recategorization: structured metadata fields (speech_type, speaker, role, event, etc.) are now preserved when recategorizing files from the unknown folder
- Fixed remaining count calculation to correctly handle cases where metadata exists but PDF files don't
- Fixed early return bug that prevented processing metadata entries when no PDF files were present

### Changed
- Improved recategorization function to process metadata entries even when no PDF files are present
- Enhanced remaining count calculation to account for PDFs without metadata and metadata entries without PDFs

## [0.1.0] - 2025-11-09

### Added
Expand Down
28 changes: 25 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -110,15 +110,25 @@ Convert only a specific date range (inclusive):
bis-scraper convert --start-date 2020-01-01 --end-date 2020-01-31
```

#### Run Both Steps
#### Re-categorize Unknown Files

Run both scraping and conversion in one command:
After scraping, some files may be placed in an `unknown/` folder if their institution couldn't be identified. If you've updated institution mappings in `constants.py`, you can re-categorize these files:

```bash
bis-scraper recategorize
```

This command moves files from the `unknown/` folder to their correct institution folders based on updated mappings. It processes both PDFs and text files together.

#### Run All Steps

Run scraping, recategorization, and conversion in one command:

```bash
bis-scraper run-all --start-date 2020-01-01 --end-date 2020-01-31
```

Note: The run-all command forwards the same date range to conversion, so only PDFs in that range are converted.
Note: The `run-all` command runs scraping, then recategorization, then conversion. The date range is forwarded to conversion, so only PDFs in that range are converted.

#### Cache Management

Expand Down Expand Up @@ -159,6 +169,7 @@ See [Scripts README](scripts/README.md) for more details.
from pathlib import Path
import datetime
from bis_scraper.scrapers.controller import scrape_bis
from bis_scraper.scrapers.recategorize import recategorize_unknown_files
from bis_scraper.converters.controller import convert_pdfs_dates

# Download speeches
Expand All @@ -172,6 +183,11 @@ result = scrape_bis(
limit=None
)

# Re-categorize files from unknown folder (if any)
recategorized_count, remaining_unknown = recategorize_unknown_files(Path("data"))
if recategorized_count > 0:
print(f"Re-categorized {recategorized_count} file(s) from unknown folder")

# Convert to text (filtering to the same date range)
convert_result = convert_pdfs_dates(
data_dir=Path("data"),
Expand All @@ -195,6 +211,7 @@ import datetime
import logging
from pathlib import Path
from bis_scraper.scrapers.controller import scrape_bis
from bis_scraper.scrapers.recategorize import recategorize_unknown_files
from bis_scraper.converters.controller import convert_pdfs
from bis_scraper.utils.constants import INSTITUTIONS
from bis_scraper.utils.institution_utils import get_all_institutions
Expand Down Expand Up @@ -239,6 +256,11 @@ scrape_result = scrape_bis(
limit=10
)

# Re-categorize files from unknown folder (if any)
recategorized_count, remaining_unknown = recategorize_unknown_files(data_dir)
if recategorized_count > 0:
print(f"Re-categorized {recategorized_count} file(s) from unknown folder")

# Convert all downloaded speeches
convert_result = convert_pdfs(
data_dir=data_dir,
Expand Down
38 changes: 36 additions & 2 deletions bis_scraper/cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,38 @@ def scrape(
click.echo("Scraping completed!")


@main.command()
@click.pass_context
def recategorize(ctx: click.Context) -> None:
    """Move files out of the unknown folder using current institution mappings.

    Speeches whose institution could not be resolved at scrape time end up in
    the unknown folder. After the mappings in constants.py have been updated,
    this command relocates those files to their proper institution folders.

    PDFs and their companion text files are handled as a unit so the pair
    stays consistent.
    """
    # Local import keeps the dependency off the CLI's module import path.
    # NOTE(review): presumably done to keep startup light / avoid a cycle — confirm.
    from bis_scraper.scrapers.recategorize import recategorize_unknown_files

    target_dir = ctx.obj["data_dir"]

    click.echo("Re-categorizing files from unknown folder...")
    click.echo(f"Data directory: {target_dir.absolute()}")

    moved, still_unknown = recategorize_unknown_files(target_dir)

    # Report what was moved (or that nothing needed moving).
    if moved:
        click.echo(f"Re-categorized {moved} file(s) from unknown folder")
    else:
        click.echo("No files found to re-categorize")

    # Surface any leftovers so the user knows the mappings are still incomplete.
    if still_unknown:
        click.echo(f"{still_unknown} file(s) still remain in unknown folder")

    click.echo("Re-categorization completed!")


@main.command()
@click.option(
"--start-date",
Expand Down Expand Up @@ -244,7 +276,7 @@ def run_all(
force: bool,
limit: Optional[int],
) -> None:
"""Run both scraping and conversion steps."""
"""Run scraping, recategorization, and conversion steps."""
ctx.invoke(
scrape,
start_date=start_date,
Expand All @@ -253,6 +285,8 @@ def run_all(
force=force,
limit=limit,
)
# Re-categorize files from unknown folder
ctx.invoke(recategorize)
# Call convert with the same date range
ctx.invoke(
convert,
Expand All @@ -276,7 +310,7 @@ def clear_cache(ctx: click.Context) -> None:
# Read cache to show info
import json

with open(cache_file, "r") as f:
with open(cache_file, "r", encoding="utf-8") as f:
cache_data = json.load(f)
num_dates = len(cache_data.get("dates", {}))

Expand Down
33 changes: 23 additions & 10 deletions bis_scraper/scrapers/bis_scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ def _load_date_cache(self) -> None:
"""Load the date cache from disk."""
if self.date_cache_file.exists():
try:
with open(self.date_cache_file, "r") as f:
with open(self.date_cache_file, "r", encoding="utf-8") as f:
cache_data = json.load(f)
# Convert cache format if needed (for backwards compatibility)
if isinstance(cache_data, dict) and "version" in cache_data:
Expand All @@ -110,7 +110,7 @@ def _save_date_cache(self) -> None:
"dates": self.checked_dates,
"updated": datetime.datetime.now().isoformat(),
}
with open(self.date_cache_file, "w") as f:
with open(self.date_cache_file, "w", encoding="utf-8") as f:
json.dump(cache_data, f, indent=2)
logger.debug(f"Saved date cache with {len(self.checked_dates)} dates")
except Exception as e:
Expand Down Expand Up @@ -147,6 +147,7 @@ def scrape_date(self, date_obj: datetime.date) -> bool:
# Track if we found any speeches for this date
date_had_speeches = False
files_found_count = 0
had_network_error = False # Track if we had network errors

# Format date for URL: YYMMDD (without century)
date_str = date_obj.strftime("%y%m%d")
Expand Down Expand Up @@ -198,19 +199,31 @@ def scrape_date(self, date_obj: datetime.date) -> bool:
return False

except Exception as e:
if response.status_code != 404: # Don't log 404s as errors
# Check if response exists before accessing status_code
# Network errors (DNS, connection) may occur before response is set
if "response" in locals() and response.status_code != 404:
logger.error(
f"Error scraping {date_obj.isoformat()} - {speech_code}: {str(e)}",
exc_info=True,
)
self.result.failed += 1

# Mark this date as fully checked in the cache
self.checked_dates[cache_key] = {
"checked_at": datetime.datetime.now().isoformat(),
"had_speeches": date_had_speeches,
"files_found": files_found_count,
}
elif "response" not in locals():
# Network error before response was created
logger.error(
f"Network error scraping {date_obj.isoformat()} - {speech_code}: {str(e)}",
exc_info=True,
)
self.result.failed += 1
had_network_error = True

# Only mark date as checked if we didn't have network errors
# Network errors mean we couldn't fully check the date, so we shouldn't cache it
if not had_network_error:
self.checked_dates[cache_key] = {
"checked_at": datetime.datetime.now().isoformat(),
"had_speeches": date_had_speeches,
"files_found": files_found_count,
}
# Save cache periodically (every 10 dates to balance performance and safety)
if len(self.checked_dates) % 10 == 0:
self._save_date_cache()
Expand Down
11 changes: 2 additions & 9 deletions bis_scraper/scrapers/controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@

from bis_scraper.models import ScrapingResult
from bis_scraper.scrapers.bis_scraper import BisScraper
from bis_scraper.scrapers.recategorize import recategorize_unknown_files
from bis_scraper.utils.constants import RAW_DATA_DIR
from bis_scraper.utils.file_utils import create_directory
from bis_scraper.utils.institution_utils import normalize_institution_name
Expand All @@ -18,7 +17,7 @@

def scrape_bis(
data_dir: Path,
log_dir: Path,
log_dir: Path, # noqa: ARG001
start_date: Optional[datetime.datetime] = None,
end_date: Optional[datetime.datetime] = None,
institutions: Optional[Tuple[str, ...]] = None,
Expand All @@ -29,7 +28,7 @@ def scrape_bis(

Args:
data_dir: Base directory for data storage
log_dir: Directory for log files
log_dir: Directory for log files (part of API signature for consistency, logging configured at CLI level)
start_date: Start date for scraping
end_date: End date for scraping
institutions: Specific institutions to scrape (default: all)
Expand Down Expand Up @@ -133,12 +132,6 @@ def scrape_bis(
# Get results
result = scraper.get_results()

# Re-categorize files from unknown folder if any exist
recategorized_count, remaining_unknown = recategorize_unknown_files(data_dir)
if recategorized_count > 0:
logger.info(f"Re-categorized {recategorized_count} file(s) from unknown folder")
print(f"Re-categorized {recategorized_count} file(s) from unknown folder")

# Log summary
elapsed_time = time.time() - start_time
hours, remainder = divmod(elapsed_time, 3600)
Expand Down
Loading