diff --git a/pyproject.toml b/pyproject.toml index 5081712..158a0ed 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -42,11 +42,13 @@ ignore = [ "D205", "D212", "D402", + "EM102", "G004", "PLR0912", "PLR0913", "PLR0915", "S321", + "TRY003" ] # allow autofix behavior for specified rules diff --git a/tests/sources/test_transformer.py b/tests/sources/test_transformer.py index d67fcd0..bec9cbb 100644 --- a/tests/sources/test_transformer.py +++ b/tests/sources/test_transformer.py @@ -23,6 +23,51 @@ def test_load_exclusion_list(source_transformer, mock_s3_exclusion_list): ] +def test_next_iter_sets_action_skip_when_record_is_excluded(tmp_path): + class ExcludingTransformer(Transformer): + @classmethod + def parse_source_file(cls, _source_file: str): + return iter(()) + + @classmethod + def get_main_titles(cls, _source_record): + return ["Title"] + + def get_source_link(self, source_record): + return str(source_record["link"]) + + def get_timdex_record_id(self, source_record): + return f"cool-repo:{source_record['id']}" + + @classmethod + def get_source_record_id(cls, source_record): + return str(source_record["id"]) + + @classmethod + def record_is_deleted(cls, _source_record): + return False + + def record_is_excluded(self, source_record): + source_link = self.get_source_link(source_record) + return source_link in (self.exclusion_list or []) + + exclusion_list_path = tmp_path / "exclusions.csv" + exclusion_list_path.write_text("https://example.com/exclude-me\n") + transformer = ExcludingTransformer( + "cool-repo", + iter([{"id": "123", "link": "https://example.com/exclude-me"}]), + exclusion_list_path=str(exclusion_list_path), + ) + + dataset_record = next(transformer) + assert dataset_record.action == "skip" + assert dataset_record.transformed_record is None + assert json.loads(dataset_record.source_record) == { + "id": "123", + "link": "https://example.com/exclude-me", + } + + def test_transformer_get_transformer_returns_correct_class_name(): assert Transformer.get_transformer("jpal") == Datacite diff --git a/tests/test_cli.py b/tests/test_cli.py index c8acd2b..a2dee8e 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -2,6 +2,7 @@ from unittest import mock from transmogrifier.cli import main +from transmogrifier.exceptions import CriticalError def test_transform_no_sentry_not_verbose( @@ -211,3 +212,36 @@ def test_transform_no_memory_fault_for_threaded_bs4_parsing(monkeypatch, tmp_pat check=False, ) assert result.returncode == 0 + + +def test_transform_critical_error_prevents_writing( + caplog, runner, source_input_file, source_transformer, empty_dataset_location +): + caplog.set_level("INFO") + + with ( + mock.patch( + "transmogrifier.cli.Transformer.load", return_value=source_transformer + ), + mock.patch.object( + source_transformer, + "get_timdex_record_id", + side_effect=CriticalError("Catastrophic failure, no records will work!"), + ), + ): + result = runner.invoke( + main, + [ + "-s", + "libguides", + "-i", + source_input_file, + "--output-location", + empty_dataset_location, + ], + ) + + assert isinstance(result.exception, CriticalError) + assert str(result.exception) == "Catastrophic failure, no records will work!" + assert result.exit_code != 0 + assert "Completed transform, total records processed" not in caplog.text diff --git a/transmogrifier/exceptions.py b/transmogrifier/exceptions.py index 8022d9c..0307bb2 100644 --- a/transmogrifier/exceptions.py +++ b/transmogrifier/exceptions.py @@ -19,3 +19,10 @@ class SkippedRecordEvent(Exception): # noqa: N818 def __init__(self, message: str | None = None, source_record_id: str | None = None): super().__init__(message) self.source_record_id = source_record_id + + +class CriticalError(Exception): + """Exception raised for critical errors that should terminate the run.""" + + def __init__(self, message: str | None = None): + super().__init__(message) diff --git a/transmogrifier/sources/transformer.py b/transmogrifier/sources/transformer.py index 9a7f281..e9d5daa 100644 --- a/transmogrifier/sources/transformer.py +++ b/transmogrifier/sources/transformer.py @@ -23,7 +23,11 @@ import transmogrifier.models as timdex from transmogrifier.config import SOURCES -from transmogrifier.exceptions import DeletedRecordEvent, SkippedRecordEvent +from transmogrifier.exceptions import ( + CriticalError, + DeletedRecordEvent, + SkippedRecordEvent, +) from transmogrifier.helpers import ( generate_citation, validate_date, @@ -102,15 +106,22 @@ def load_exclusion_list(self) -> list[str]: Args: exclusion_list_path: Path to exclusion list file (s3://bucket/key or local path). + + Raises: + On error loading or parsing the file, raises CriticalError which will + terminate the run. """ - with smart_open.open(self.exclusion_list_path, "r") as exclusion_list: - rows = exclusion_list.readlines() - exclusion_list = [row.strip() for row in rows if row.strip()] + try: + with smart_open.open(self.exclusion_list_path, "r") as exclusion_list: + rows = exclusion_list.readlines() + exclusion_list = [row.strip() for row in rows if row.strip()] + except Exception as exc: + raise CriticalError(f"Could not load exclusion list: {exc}") from exc + logger.info( f"Loaded exclusion list from {self.exclusion_list_path} with " f"{len(exclusion_list)} entries" ) - logger.debug(exclusion_list) return exclusion_list @final @@ -149,6 +160,9 @@ def __next__(self) -> DatasetRecord: self.skipped_record_count += 1 action = "skip" + except CriticalError: + raise + except Exception as exception: self.error_record_count += 1 message = f"Unhandled exception during record transformation: {exception}"