Merged
15 changes: 2 additions & 13 deletions .github/pull-request-template.md
@@ -15,16 +15,5 @@ YES | NO
### What are the relevant tickets?
- Include links to Jira Software and/or Jira Service Management tickets here.

### Developer
- [ ] All new ENV is documented in README
- [ ] All new ENV has been added to staging and production environments
- [ ] All related Jira tickets are linked in commit message(s)
- [ ] Stakeholder approval has been confirmed (or is not needed)

### Code Reviewer(s)
- [ ] The commit message is clear and follows our guidelines (not just this PR message)
- [ ] There are appropriate tests covering any new functionality
- [ ] The provided documentation is sufficient for understanding any new functionality introduced
- [ ] Any manual tests have been performed and verified
- [ ] New dependencies are appropriate or there were no changes

### Code review
* Code review best practices are documented [here](https://mitlibraries.github.io/guides/collaboration/code_review.html) and you are encouraged to have a constructive dialogue with your reviewers about their preferences and expectations.
2 changes: 1 addition & 1 deletion Makefile
@@ -42,7 +42,7 @@ ruff:
pipenv run ruff check .

safety: # Check for security vulnerabilities and verify Pipfile.lock is up-to-date
pipenv run pip-audit --ignore-vuln GHSA-4xh5-x5gv-qwph
pipenv run pip-audit
pipenv verify

# apply changes to resolve any linting errors
3 changes: 3 additions & 0 deletions Pipfile
@@ -6,6 +6,8 @@ name = "pypi"
[packages]
attrs = "*"
beautifulsoup4 = "==4.12.3"
boto3 = "*"
boto3-stubs = "*"
click = "*"
jsonlines = "*"
lxml = "*"
@@ -20,6 +22,7 @@ timdex-dataset-api = {git = "https://github.com/MITLibraries/timdex-dataset-api.
black = "*"
coveralls = "*"
ipython = "*"
moto = "*"
mypy = "*"
pre-commit = "*"
pytest = "*"
1,986 changes: 1,111 additions & 875 deletions Pipfile.lock

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions README.md
@@ -77,6 +77,7 @@ Options:
-s, --source [alma|aspace|dspace|jpal|libguides|gismit|gisogm|researchdatabases|whoas|zenodo]
Source records were harvested from, must
choose from list of options [required]
-e, --exclusion-list-path TEXT S3 or local path to exclusion list.
-r, --run-id TEXT Identifier for Transmogrifier run. This can
be used to group transformed records
produced by Transmogrifier, even if they
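The new `-e / --exclusion-list-path` option accepts either an S3 URI or a local file path. As a hypothetical illustration only, it could be exercised through click's `CliRunner` (matching the `runner` fixture in `tests/conftest.py`); the `transmogrifier.cli.main` entry point and the input/output options omitted from this README excerpt are assumptions, not part of this diff.

```python
# Hypothetical usage sketch (not part of the PR). Assumes the click command
# lives at transmogrifier.cli.main and that the input/output options elided
# from this README excerpt are also supplied.
from click.testing import CliRunner

from transmogrifier.cli import main  # assumed entry point

runner = CliRunner()
result = runner.invoke(
    main,
    [
        "--source", "libguides",
        "--exclusion-list-path", "s3://test-bucket/libguides/config/libguides-exclusions.csv",
        "--run-id", "run-abc-123",
        # ...plus the input/output options omitted from this excerpt
    ],
)
print(result.exit_code, result.output)
```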
1 change: 1 addition & 0 deletions pyproject.toml
@@ -38,6 +38,7 @@ ignore = [
"D205",
"D212",
"D402",
"G004",
"PLR0912",
"PLR0913",
"PLR0915",
37 changes: 29 additions & 8 deletions tests/conftest.py
@@ -1,5 +1,7 @@
import boto3
import pytest
from click.testing import CliRunner
from moto import mock_aws

import transmogrifier.models as timdex
from transmogrifier.config import SOURCES, load_external_config
@@ -19,6 +21,7 @@ def _test_config():
SOURCES["cool-repo"] = {
"name": "A Cool Repository",
"base-url": "https://example.com/",
"transform-class": "transmogrifier.sources.xml.datacite.Datacite",
}


@@ -39,6 +42,24 @@ def _bad_config():
SOURCES.pop("bad-module-path")


@pytest.fixture(scope="session")
def mock_s3():
with mock_aws():
s3 = boto3.client("s3", region_name="us-east-1")
s3.create_bucket(Bucket="test-bucket")
yield s3


@pytest.fixture(scope="session")
def mock_s3_exclusion_list(mock_s3):
mock_s3.put_object(
Bucket="test-bucket",
Key="libguides/config/libguides-exclusions.csv",
Body="https://libguides.mit.edu/excluded1\n"
"https://libguides.mit.edu/excluded2\n",
)
Comment on lines +45 to +60
:chef's kiss:

Thanks for going this route! Perhaps we won't need the extensibility this provides, but feels like a good pattern.

Honestly, this all reminds me how easy mocking S3 can be... and realizing there are other instances we could (should) have mocked S3 (ignoring this pattern piece even).
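For context on the comment above, here is a minimal self-contained sketch of the `moto` + `boto3` pattern the new fixtures use; the bucket and key names below are illustrative, not taken from the PR.

```python
# Minimal illustration of the moto + boto3 mocking pattern used by the
# fixtures above. Bucket and key names are made up for this sketch.
import boto3
from moto import mock_aws

with mock_aws():  # all boto3 calls inside this block hit moto's in-memory S3
    s3 = boto3.client("s3", region_name="us-east-1")
    s3.create_bucket(Bucket="example-bucket")
    s3.put_object(
        Bucket="example-bucket",
        Key="exclusions.csv",
        Body="https://example.com/a\n",
    )
    body = s3.get_object(Bucket="example-bucket", Key="exclusions.csv")["Body"].read()
    assert body.decode() == "https://example.com/a\n"
```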



@pytest.fixture
def runner():
return CliRunner()
@@ -257,33 +278,33 @@ def empty_dataset_location(tmp_path):


@pytest.fixture
def libguides_input_file():
def source_input_file():
return (
"tests/fixtures/dataset/libguides-2024-06-03-full-extracted-records-to-index.xml"
)


@pytest.fixture
def empty_libguides_input_file():
def empty_source_input_file():
return (
"tests/fixtures/dataset/libguides-2024-06-04-full-extracted-records-to-index.xml"
)


@pytest.fixture
def libguides_transformer(monkeypatch, run_id, libguides_input_file):
def source_transformer(monkeypatch, run_id, source_input_file):
return Transformer.load(
"libguides",
libguides_input_file,
"cool-repo",
source_input_file,
run_id=run_id,
)


@pytest.fixture
def libguides_transformer_with_timestamp(monkeypatch, run_id, libguides_input_file):
def source_transformer_with_timestamp(monkeypatch, run_id, source_input_file):
return Transformer.load(
"libguides",
libguides_input_file,
"cool-repo",
source_input_file,
run_id=run_id,
run_timestamp="2024-06-03T15:30:45",
)
73 changes: 42 additions & 31 deletions tests/sources/test_transformer.py
@@ -13,13 +13,23 @@
from transmogrifier.sources.xml.datacite import Datacite


def test_load_exclusion_list(source_transformer, mock_s3_exclusion_list):
source_transformer.exclusion_list_path = (
"s3://test-bucket/libguides/config/libguides-exclusions.csv"
)
assert source_transformer.load_exclusion_list() == [
"https://libguides.mit.edu/excluded1",
"https://libguides.mit.edu/excluded2",
]
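The test above fixes the contract for `load_exclusion_list`: given an `exclusion_list_path`, it returns the listed URLs as a list of strings. A simplified implementation consistent with that contract might look like the sketch below; this is hypothetical, not the code added in this PR, and the local-path branch and parsing details are assumptions.

```python
# Hypothetical sketch of an exclusion-list loader matching the test above.
# Reads a newline-delimited list of URLs from an s3:// URI or a local path.
import boto3


def load_exclusion_list(exclusion_list_path: str) -> list[str]:
    """Return the non-empty lines of the exclusion list as a list of URLs."""
    if exclusion_list_path.startswith("s3://"):
        bucket, _, key = exclusion_list_path.removeprefix("s3://").partition("/")
        raw = boto3.client("s3").get_object(Bucket=bucket, Key=key)["Body"].read()
        text = raw.decode("utf-8")
    else:
        with open(exclusion_list_path, encoding="utf-8") as f:
            text = f.read()
    return [line.strip() for line in text.splitlines() if line.strip()]
```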


def test_transformer_get_transformer_returns_correct_class_name():
assert Transformer.get_transformer("jpal") == Datacite


def test_transformer_get_transformer_source_missing_class_name_raises_error():
with pytest.raises(KeyError):
Transformer.get_transformer("cool-repo")
Transformer.get_transformer("wrong-repo")


@pytest.mark.usefixtures("_bad_config")
@@ -84,9 +94,9 @@ def test_create_locations_from_spatial_subjects_success(timdex_record_required_f


def test_transformer_get_run_data_from_source_file_and_run_id(
libguides_transformer, libguides_input_file, run_id
source_transformer, source_input_file, run_id
):
assert libguides_transformer.get_run_data(libguides_input_file, run_id) == {
assert source_transformer.get_run_data(source_input_file, run_id) == {
"source": "libguides",
"run_date": "2024-06-03",
"run_type": "full",
@@ -96,11 +106,11 @@ def test_transformer_get_run_data_from_source_file_and_run_id(


def test_transformer_get_run_data_with_explicit_run_timestamp(
libguides_transformer, libguides_input_file, run_id
source_transformer, source_input_file, run_id
):
run_timestamp = "2024-06-03T15:30:45"
assert libguides_transformer.get_run_data(
libguides_input_file,
assert source_transformer.get_run_data(
source_input_file,
run_id=run_id,
run_timestamp=run_timestamp,
) == {
@@ -113,31 +123,31 @@ def test_transformer_get_run_data_with_explicit_run_timestamp(


def test_transformer_get_run_data_mints_timestamp_from_run_date(
libguides_transformer, libguides_input_file, run_id
source_transformer, source_input_file, run_id
):
result = libguides_transformer.get_run_data(libguides_input_file, run_id)
result = source_transformer.get_run_data(source_input_file, run_id)
assert result["run_timestamp"] == "2024-06-03T00:00:00"


def test_transformer_get_run_data_no_source_file_raise_error(
monkeypatch, libguides_transformer
monkeypatch, source_transformer
):
monkeypatch.setenv("WORKSPACE", "dev")
with pytest.raises(
ValueError,
match="'source_file' parameter is required outside of test environments",
):
libguides_transformer.get_run_data(None, "run-abc-123")
source_transformer.get_run_data(None, "run-abc-123")


def test_transformer_next_iter_yields_dataset_records(libguides_transformer):
assert isinstance(next(libguides_transformer), DatasetRecord)
def test_transformer_next_iter_yields_dataset_records(source_transformer):
assert isinstance(next(source_transformer), DatasetRecord)


def test_transform_next_iter_sets_valid_source_and_transformed_records(
libguides_transformer,
source_transformer,
):
record = next(libguides_transformer)
record = next(source_transformer)

# parse source record XML
assert isinstance(record.source_record, bytes)
@@ -151,10 +161,10 @@ def test_transform_next_iter_sets_valid_source_and_transformed_records(


def test_transform_next_iter_uses_run_data_parsed_from_source_file(
libguides_transformer, libguides_input_file, run_id
source_transformer, source_input_file, run_id
):
record = next(libguides_transformer)
run_data = libguides_transformer.get_run_data(libguides_input_file, run_id)
record = next(source_transformer)
run_data = source_transformer.get_run_data(source_input_file, run_id)
assert (
record.run_date
== datetime.datetime.strptime(run_data["run_date"], "%Y-%m-%d")
@@ -175,7 +185,7 @@ def test_transform_next_iter_uses_run_data_parsed_from_source_file(
],
)
def test_transformer_action_column_based_on_transformation_exception_handling(
libguides_transformer, transform_exception, expected_action
source_transformer, transform_exception, expected_action
):
"""While Transmogrifier is often considered just an application to transform a source
record into a TIMDEX record, it also serves the purpose of determining if a source
@@ -189,17 +199,17 @@ def test_transformer_action_column_based_on_transformation_exception_handling(
"""

if transform_exception:
with mock.patch.object(libguides_transformer, "transform") as mocked_transform:
with mock.patch.object(source_transformer, "transform") as mocked_transform:
mocked_transform.side_effect = transform_exception
record = next(libguides_transformer)
record = next(source_transformer)
assert mocked_transform.called
else:
record = next(libguides_transformer)
record = next(source_transformer)
assert record.action == expected_action


def test_transformer_provenance_object_added_to_transformed_record(libguides_transformer):
dataset_record = next(libguides_transformer)
def test_transformer_provenance_object_added_to_transformed_record(source_transformer):
dataset_record = next(source_transformer)
transformed_record = json.loads(dataset_record.transformed_record)
assert transformed_record["timdex_provenance"] == {
"source": "libguides",
@@ -210,20 +220,20 @@ def test_transformer_provenance_object_added_to_transformed_record(libguides_tra


def test_transformer_provenance_object_run_record_offset_increments(
libguides_transformer,
source_transformer,
):
for i in range(4):
dataset_record = next(libguides_transformer)
dataset_record = next(source_transformer)
if dataset_record.action != "index":
continue
transformed_record = json.loads(dataset_record.transformed_record)
assert transformed_record["timdex_provenance"]["run_record_offset"] == i


def test_transformer_load_with_run_timestamp_parameter(
libguides_transformer_with_timestamp, libguides_input_file, run_id
source_transformer_with_timestamp, source_input_file, run_id
):
assert libguides_transformer_with_timestamp.run_data == {
assert source_transformer_with_timestamp.run_data == {
"source": "libguides",
"run_date": "2024-06-03",
"run_type": "full",
@@ -233,9 +243,9 @@ def test_transformer_load_with_run_timestamp_parameter(


def test_transformer_load_without_run_timestamp_parameter(
libguides_transformer, libguides_input_file, run_id
source_transformer, source_input_file, run_id
):
assert libguides_transformer.run_data == {
assert source_transformer.run_data == {
"source": "libguides",
"run_date": "2024-06-03",
"run_type": "full",
@@ -245,10 +255,11 @@ def test_transformer_load_without_run_timestamp_parameter(


def test_transformer_dataset_write_includes_run_timestamp_column(
tmp_path, libguides_transformer
tmp_path,
source_transformer,
):
dataset_location = tmp_path / "dataset"
written_files = libguides_transformer.write_to_parquet_dataset(str(dataset_location))
written_files = source_transformer.write_to_parquet_dataset(str(dataset_location))

parquet_file = written_files[0]
assert "run_timestamp" in parquet_file.metadata.schema.names