2 changes: 2 additions & 0 deletions pyproject.toml
@@ -42,11 +42,13 @@ ignore = [
"D205",
"D212",
"D402",
"EM102",
"G004",
"PLR0912",
"PLR0913",
"PLR0915",
"S321",
"TRY003"
]

# allow autofix behavior for specified rules
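Note: the `EM102` addition lines up with the new `raise CriticalError(f"Could not load exclusion list: {exc}")` call in `transformer.py` further down; Ruff's EM102 check flags f-string literals passed directly to an exception constructor, so ignoring it keeps that pattern lint-clean. A minimal illustrative sketch (the class and function here are stand-ins, not project code):

```python
class CriticalError(Exception):
    """Stand-in for transmogrifier.exceptions.CriticalError (illustrative only)."""


def read_exclusion_list(path: str) -> str:
    try:
        with open(path) as file:
            return file.read()
    except OSError as exc:
        # Ruff's EM102 rule would flag this f-string literal inside the raise;
        # adding "EM102" to the ignore list permits the pattern project-wide.
        raise CriticalError(f"Could not load exclusion list: {exc}") from exc
```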
45 changes: 45 additions & 0 deletions tests/sources/test_transformer.py
@@ -23,6 +23,51 @@ def test_load_exclusion_list(source_transformer, mock_s3_exclusion_list):
]


def test_next_iter_sets_action_skip_when_record_is_excluded(tmp_path):
class ExcludingTransformer(Transformer):
@classmethod
def parse_source_file(cls, _source_file: str):
return iter(())

@classmethod
def get_main_titles(cls, _source_record):
return ["Title"]

def get_source_link(self, source_record):
return str(source_record["link"])

def get_timdex_record_id(self, source_record):
return f"cool-repo:{source_record['id']}"

@classmethod
def get_source_record_id(cls, source_record):
return str(source_record["id"])

@classmethod
def record_is_deleted(cls, _source_record):
return False

def record_is_excluded(self, source_record):
source_link = self.get_source_link(source_record)
return source_link in (self.exclusion_list or [])

exclusion_list_path = tmp_path / "exclusions.csv"
exclusion_list_path.write_text("https://example.com/exclude-me\n")
transformer = ExcludingTransformer(
"cool-repo",
iter([{"id": "123", "link": "https://example.com/exclude-me"}]),
exclusion_list_path=str(exclusion_list_path),
)

dataset_record = next(transformer)
assert dataset_record.action == "skip"
assert dataset_record.transformed_record is None
assert json.loads(dataset_record.source_record) == {
"id": "123",
"link": "https://example.com/exclude-me",
}


def test_transformer_get_transformer_returns_correct_class_name():
assert Transformer.get_transformer("jpal") == Datacite

34 changes: 34 additions & 0 deletions tests/test_cli.py
@@ -2,6 +2,7 @@
from unittest import mock

from transmogrifier.cli import main
from transmogrifier.exceptions import CriticalError


def test_transform_no_sentry_not_verbose(
@@ -211,3 +212,36 @@ def test_transform_no_memory_fault_for_threaded_bs4_parsing(monkeypatch, tmp_pat
check=False,
)
assert result.returncode == 0


def test_transform_critical_error_prevents_writing(
caplog, runner, source_input_file, source_transformer, empty_dataset_location
):
caplog.set_level("INFO")

with (
mock.patch(
"transmogrifier.cli.Transformer.load", return_value=source_transformer
),
mock.patch.object(
source_transformer,
"get_timdex_record_id",
side_effect=CriticalError("Catastrophic failure, no records will work!"),
),
):
result = runner.invoke(
main,
[
"-s",
"libguides",
"-i",
source_input_file,
"--output-location",
empty_dataset_location,
],
)

assert isinstance(result.exception, CriticalError)
assert str(result.exception) == "Catastrophic failure, no records will work!"
assert result.exit_code != 0
assert "Completed transform, total records processed" not in caplog.text
7 changes: 7 additions & 0 deletions transmogrifier/exceptions.py
@@ -19,3 +19,10 @@ class SkippedRecordEvent(Exception): # noqa: N818
def __init__(self, message: str | None = None, source_record_id: str | None = None):
super().__init__(message)
self.source_record_id = source_record_id


class CriticalError(Exception):
"""Exception raised for critical errors that should terminate the run."""

def __init__(self, message: str | None = None):
super().__init__(message)
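
For context, the `except CriticalError: raise` added to `__next__` in `transformer.py` below is what distinguishes this exception from `SkippedRecordEvent`: skip-level events are absorbed per record, while a `CriticalError` propagates and ends the run before further records are written. A simplified sketch of that contract (the loop here is illustrative, not the project's actual pipeline):

```python
from transmogrifier.exceptions import CriticalError, SkippedRecordEvent


def transform_all(source_records, transform):
    transformed = []
    for source_record in source_records:
        try:
            transformed.append(transform(source_record))
        except SkippedRecordEvent:
            continue  # this record is dropped; the run keeps going
        except CriticalError:
            raise  # nothing more is processed; the whole run terminates
    return transformed
```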
24 changes: 19 additions & 5 deletions transmogrifier/sources/transformer.py
@@ -23,7 +23,11 @@

import transmogrifier.models as timdex
from transmogrifier.config import SOURCES
from transmogrifier.exceptions import DeletedRecordEvent, SkippedRecordEvent
from transmogrifier.exceptions import (
CriticalError,
DeletedRecordEvent,
SkippedRecordEvent,
)
from transmogrifier.helpers import (
generate_citation,
validate_date,
@@ -102,15 +106,22 @@ def load_exclusion_list(self) -> list[str]:
Args:
exclusion_list_path: Path to exclusion list file (s3://bucket/key or local
path).

Raises:
On error loading or parsing the file, raises CriticalError which will
terminate the run.
"""
with smart_open.open(self.exclusion_list_path, "r") as exclusion_list:
rows = exclusion_list.readlines()
exclusion_list = [row.strip() for row in rows if row.strip()]
try:
with smart_open.open(self.exclusion_list_path, "r") as exclusion_list:
rows = exclusion_list.readlines()
exclusion_list = [row.strip() for row in rows if row.strip()]
except Exception as exc:
raise CriticalError(f"Could not load exclusion list: {exc}") from exc

logger.info(
f"Loaded exclusion list from {self.exclusion_list_path} with "
f"{len(exclusion_list)} entries"
)
logger.debug(exclusion_list)
return exclusion_list

@final
@@ -149,6 +160,9 @@ def __next__(self) -> DatasetRecord:
self.skipped_record_count += 1
action = "skip"

except CriticalError:
raise

except Exception as exception:
self.error_record_count += 1
message = f"Unhandled exception during record transformation: {exception}"
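
Taken together with the new `ExcludingTransformer` test above, the expected shape of the exclusion list file is one source link per line, with blank lines dropped during loading. A minimal sketch of that format (the file name and second link are illustrative, not from this PR):

```python
from pathlib import Path

# Illustrative exclusion list: one source link per line, blank lines ignored.
exclusion_file = Path("exclusions.csv")
exclusion_file.write_text(
    "https://example.com/exclude-me\n\nhttps://example.com/also-excluded\n"
)

# load_exclusion_list() strips and filters lines into a list like this one:
links = [
    line.strip() for line in exclusion_file.read_text().splitlines() if line.strip()
]
assert links == [
    "https://example.com/exclude-me",
    "https://example.com/also-excluded",
]
```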