From 59a696ebe7c2b998f88ca1bfb159ad827aeda7ab Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?=
Date: Thu, 29 Feb 2024 15:49:21 +0100
Subject: [PATCH 01/39] State in the readme that this is a fork

---
 README.md | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 160502e8..33ec1be3 100644
--- a/README.md
+++ b/README.md
@@ -7,6 +7,18 @@
 ![license](https://img.shields.io/github/license/vmenger/deduce)
 [![black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
 
+# About this fork
+
+This is Matěj Korvas's fork of the original Deduce tool, available at
+https://github.com/vmenger/deduce, forked on 2024-02-29. The latest version
+available here adds some extra or changed functionality on top of the
+original tool -- perhaps an obvious point, but the license requires me to
+state it clearly.
+
+Use at your own risk.
+
+The original README documentation follows.
+
 # deduce
 
 > Deduce 3.0.0 is out! It is way more accurate, and faster too. It's fully backward compatible, but some functionality is scheduled for removal, read more about it here: [docs/migrating-to-v3](https://deduce.readthedocs.io/en/latest/migrating.html)
@@ -141,4 +153,4 @@ For setting up the dev environment and contributing guidelines, see: [docs/contr
 
 ## License
 
-This project is licensed under the GNU General Public License v3.0 - see the [LICENSE.md](LICENSE.md) file for details
\ No newline at end of file
+This project is licensed under the GNU General Public License v3.0 - see the [LICENSE.md](LICENSE.md) file for details

From 4947a91e03718765f1c48ab2c1a0f99f3aebb79f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?=
Date: Thu, 29 Feb 2024 15:28:45 +0100
Subject: [PATCH 02/39] Log progress of loading lookup structs

---
 deduce/deduce.py         | 6 ++++++
 deduce/lookup_structs.py | 9 +++++++++
 2 files changed, 15 insertions(+)

diff --git a/deduce/deduce.py b/deduce/deduce.py
index a0d935c2..bf73c082 100644
--- a/deduce/deduce.py
+++ b/deduce/deduce.py
@@ -84,12 +84,16 @@ def __init__(  # pylint: disable=R0913
             config = config_file
 
+        logging.info('Going to init config.')
         self.config = self._initialize_config(
             load_base_config=load_base_config, user_config=config
         )
 
         self.lookup_data_path = self._initialize_lookup_data_path(lookup_data_path)
+
+        logging.info('Going to init tokenizers.')
         self.tokenizers = {"default": self._initialize_tokenizer(self.lookup_data_path)}
+        logging.debug('Done initing tokenizers.')
 
         self.lookup_structs = get_lookup_structs(
             lookup_path=self.lookup_data_path,
@@ -97,9 +101,11 @@ def __init__(  # pylint: disable=R0913
             deduce_version=__version__,
             build=build_lookup_structs,
         )
+        logging.info('Done loading lookup structs.')
 
         extras = {"tokenizer": self.tokenizers["default"], "ds": self.lookup_structs}
 
+        logging.info('Going to load the Deduce processor.')
         self.processors = _DeduceProcessorLoader().load(
             config=self.config, extras=extras
         )
diff --git a/deduce/lookup_structs.py b/deduce/lookup_structs.py
index 104dfe2e..b116fd41 100644
--- a/deduce/lookup_structs.py
+++ b/deduce/lookup_structs.py
@@ -61,6 +61,7 @@ def load_raw_itemset(path: Path) -> set[str]:
         The raw items, as a set of strings.
""" + logging.info("Loading itemset %s...", path) items = optional_load_items(path / "items.txt") exceptions = optional_load_items(path / "exceptions.txt") @@ -85,8 +86,10 @@ def load_raw_itemset(path: Path) -> set[str]: transform_config = optional_load_json(path / "transform.json") if transform_config is not None: + logging.info("Applying transformation to %s...", path) items = apply_transform(items, transform_config) + logging.info("Done loading %s.", path) return items @@ -219,11 +222,13 @@ def get_lookup_structs( """ + logging.debug("lookup_path = %s", lookup_path) if not build: lookup_structs = load_lookup_structs_from_cache(lookup_path, deduce_version) if lookup_structs is not None: + logging.info("Loaded lookup structs from the cache.") return lookup_structs logging.info( @@ -258,16 +263,20 @@ def get_lookup_structs( ) for name in defaults: + logging.info("Adding the %s defaults...", name) lookup_set = dd.ds.LookupSet() lookup_set.add_items_from_iterable(base_items[name]) lookup_structs[name] = lookup_set for name, set_init_function in _LOOKUP_SET_LOADERS.items(): + logging.info("Initing the %s set...", name) lookup_structs[name] = set_init_function(base_items) for name, trie_init_function in _LOOKUP_TRIE_LOADERS.items(): + logging.info("Initing the %s trie...", name) lookup_structs[name] = trie_init_function(base_items, tokenizer) + logging.info("Going to cache lookup structs.") if save_cache: cache_lookup_structs( lookup_structs=lookup_structs, From d7a68ad5b6e9d68ebf67264d0e26fd160cc528b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Fri, 1 Mar 2024 23:34:24 +0100 Subject: [PATCH 03/39] Titlecase all-caps street names --- deduce/lookup_struct_loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deduce/lookup_struct_loader.py b/deduce/lookup_struct_loader.py index 7d1f642d..cd869b47 100644 --- a/deduce/lookup_struct_loader.py +++ b/deduce/lookup_struct_loader.py @@ -145,7 +145,7 @@ def load_street_lookup( ) -> dd.ds.LookupTrie: """Load street LookupTrie.""" - street = dd.ds.LookupSet() + street = dd.ds.LookupSet(matching_pipeline=[dd.str.LowercaseTail()]) street.add_items_from_iterable( raw_itemsets["street"], From 7c9abb8282e6c3025eee01d527b2f504f56568e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Mon, 4 Mar 2024 10:42:29 +0100 Subject: [PATCH 04/39] Enable skipping of pickling lookup structs --- deduce/deduce.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/deduce/deduce.py b/deduce/deduce.py index bf73c082..a12aadbb 100644 --- a/deduce/deduce.py +++ b/deduce/deduce.py @@ -41,7 +41,7 @@ class Deduce(dd.DocDeid): # pylint: disable=R0903 """ - Main class for de-identifiation. + Main class for de-identification. Inherits from ``docdeid.DocDeid``, and as such, most information on deidentifying text with a Deduce object is available there. @@ -55,7 +55,7 @@ class Deduce(dd.DocDeid): # pylint: disable=R0903 are overwritten, and other defaults are kept. When `load_base_config` is set to `False`, no defaults are loaded and only configuration from `config` is applied. - looup_data_path: The path to look for lookup data, by default included in + lookup_data_path: The path to look for lookup data, by default included in the package. If you want to make changes to source files, it's recommended to copy the source data and pointing deduce to this folder with this argument. 
@@ -70,6 +70,7 @@ def __init__( # pylint: disable=R0913 config_file: Optional[str] = None, lookup_data_path: Union[str, Path] = _LOOKUP_LIST_PATH, build_lookup_structs: bool = False, + save_lookup_structs: bool = True, ) -> None: super().__init__() @@ -100,6 +101,7 @@ def __init__( # pylint: disable=R0913 tokenizer=self.tokenizers["default"], deduce_version=__version__, build=build_lookup_structs, + save_cache=save_lookup_structs, ) logging.info('Done loading lookup structs.') @@ -177,6 +179,7 @@ def _get_multi_token_annotator(args: dict, extras: dict) -> dd.process.Annotator args.update( lookup_values=lookup_struct.items(), matching_pipeline=lookup_struct.matching_pipeline, + # XXX Sure the trailing "]" is intentional? tokenizer=extras["tokenizer]"], ) elif isinstance(lookup_struct, dd.ds.LookupTrie): From 535999eba5dd6fd7ba3a3ae6e248de4f75e58025 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Mon, 4 Mar 2024 10:42:51 +0100 Subject: [PATCH 05/39] Proofread CONTRIBUTING.md --- CONTRIBUTING.md | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 2bcfc365..7ee48db5 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -3,42 +3,42 @@ Thanks for considering making an addition to this project! These contributing guidelines should help make your life easier. Before starting, some things to consider: -* For larger features, it would be helpful to get in touch first (through issue/email) +* For larger features, it would be helpful to get in touch first (through issue/email). * A lot of the logic is in `docdeid`, please consider making a PR there for things that are not specific to `deduce`. -* `deduce` is a rule-based de-identifier -* In case you would like to see any rules added/removed/changed, a decent substantiation (with examples) of the potential improvement is useful +* `deduce` is a rule-based de-identifier. +* In case you would like to see any rules added/removed/changed, a decent substantiation (with examples) of the potential improvement is useful. ## Setting up the environment -* This project uses poetry for package management. Install it with ```pip install poetry``` -* Set up the environment is easy, just use ```poetry install``` +* This project uses poetry for package management. Install it with ``pip install poetry``. +* Setting up the environment is easy, just use ``poetry install``. * The makefile contains some useful commands when developing: - * `make format` formats the package code - * `make lint` runs the linters (check the output) - * `make clean` removes build/test artifacts, etc + * `make format` formats the package code; + * `make lint` runs the linters (check the output); + * `make clean` removes build/test artifacts, etc. * And for docs: - * `make build-docs` builds the docs + * `make build-docs` builds the docs. -## Runing the tests +## Running the tests ```bash pytest . ``` -## PR checlist +## PR checklist -* Verify that tests are passing -* Verify that tests are updated/added according to changes -* Run the formatters (`make format`) -* Run the linters (`make lint`) -* Add a section to the changelog -* Add a description to your PR +* Verify that tests are passing. +* Verify that tests are updated/added according to changes. +* Run the formatters (`make format`). +* Run the linters (`make lint`). +* Add a section to the changelog. +* Add a description to your PR. 
If all the steps above are followed, this ensures a quick review and release of your contribution. ## Releasing * Readthedocs has a webhook connected to pushes on the main branch. It will trigger and update automatically. -* Create a [release on github](https://github.com/vmenger/docdeid/releases/new), create a tag with the right version, manually copy and paste from the changelog -* Build pipeline and release to PyPi trigger automatically on release +* Create a [release on Github](https://github.com/vmenger/docdeid/releases/new), create a tag with the right version, manually copy and paste from the changelog. +* Build pipeline and release to PyPI trigger automatically on release. Any other questions/issues not covered here? Please just get in touch! \ No newline at end of file From 026218fa010e1531753253d8ba8cde1c8babcbb2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Mon, 4 Mar 2024 10:50:56 +0100 Subject: [PATCH 06/39] Test with all-caps "IJSWEG" --- tests/pipeline/test_deduce.py | 40 ++++++++++++++++++----------------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/tests/pipeline/test_deduce.py b/tests/pipeline/test_deduce.py index 3bc80771..e652db7d 100644 --- a/tests/pipeline/test_deduce.py +++ b/tests/pipeline/test_deduce.py @@ -4,7 +4,7 @@ text = ( "betreft: Jan Jansen, bsn 111222333, patnr 000334433. De patient J. Jansen is 64 " - "jaar oud en woonachtig in Utrecht. Hij werd op 10 oktober 2018 door arts " + "jaar oud en woonachtig in Utrecht, IJSWEG 10r. Hij werd op 10 oktober 2018 door arts " "Peter de Visser ontslagen van de kliniek van het UMCU. Voor nazorg kan hij " "worden bereikt via j.JNSEN.123@gmail.com of (06)12345678." ) @@ -15,22 +15,21 @@ def test_annotate(self, model): metadata = {"patient": Person(first_names=["Jan"], surname="Jansen")} doc = model.deidentify(text, metadata=metadata) - expected_annotations = dd.AnnotationSet( - [ + expected_annotations = { dd.Annotation( text="(06)12345678", - start_char=272, - end_char=284, + start_char=284, + end_char=296, tag="telefoonnummer", ), dd.Annotation(text="111222333", start_char=25, end_char=34, tag="bsn"), dd.Annotation( - text="Peter de Visser", start_char=153, end_char=168, tag="persoon" + text="Peter de Visser", start_char=165, end_char=180, tag="persoon" ), dd.Annotation( text="j.JNSEN.123@gmail.com", - start_char=247, - end_char=268, + start_char=259, + end_char=280, tag="emailadres", ), dd.Annotation( @@ -40,7 +39,7 @@ def test_annotate(self, model): text="Jan Jansen", start_char=9, end_char=19, tag="patient" ), dd.Annotation( - text="10 oktober 2018", start_char=127, end_char=142, tag="datum" + text="10 oktober 2018", start_char=139, end_char=154, tag="datum" ), dd.Annotation(text="64", start_char=77, end_char=79, tag="leeftijd"), dd.Annotation(text="000334433", start_char=42, end_char=51, tag="id"), @@ -48,12 +47,14 @@ def test_annotate(self, model): text="Utrecht", start_char=106, end_char=113, tag="locatie" ), dd.Annotation( - text="UMCU", start_char=202, end_char=206, tag="ziekenhuis" + text="IJSWEG 10r", start_char=115, end_char=125, tag="locatie" ), - ] - ) + dd.Annotation( + text="UMCU", start_char=214, end_char=218, tag="ziekenhuis" + ), + } - assert doc.annotations == set(expected_annotations) + assert set(doc.annotations) == expected_annotations def test_deidentify(self, model): metadata = {"patient": Person(first_names=["Jan"], surname="Jansen")} @@ -61,8 +62,8 @@ def test_deidentify(self, model): expected_deidentified = ( "betreft: [PATIENT], bsn [BSN-1], patnr 
[ID-1]. De patient [PATIENT] is " - "[LEEFTIJD-1] jaar oud en woonachtig in [LOCATIE-1]. Hij werd op " - "[DATUM-1] door arts [PERSOON-1] ontslagen van de kliniek van het " + "[LEEFTIJD-1] jaar oud en woonachtig in [LOCATIE-1], [LOCATIE-2]. Hij werd " + "op [DATUM-1] door arts [PERSOON-1] ontslagen van de kliniek van het " "[ZIEKENHUIS-1]. Voor nazorg kan hij worden bereikt via [EMAILADRES-1] " "of [TELEFOONNUMMER-1]." ) @@ -77,10 +78,11 @@ def test_annotate_intext(self, model): "betreft: Jan Jansen, bsn 111222333, " "patnr 000334433. De patient J. Jansen is " "64 jaar oud en woonachtig in Utrecht" - ". Hij werd op 10 oktober 2018 door arts " - "Peter de Visser ontslagen van de kliniek van het " - "UMCU. Voor nazorg kan hij worden bereikt " - "via j.JNSEN.123@gmail.com of " + ", IJSWEG 10r. Hij werd op 10 " + "oktober 2018 door arts Peter de " + "Visser ontslagen van de kliniek van het " + "UMCU. Voor nazorg kan hij worden " + "bereikt via j.JNSEN.123@gmail.com of " "(06)12345678." ) From 6062d46af64c23150ec4f476fdcc75a557395d36 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Mon, 4 Mar 2024 13:44:59 +0100 Subject: [PATCH 07/39] Titlecase also when loading resources --- deduce/lookup_struct_loader.py | 1 + 1 file changed, 1 insertion(+) diff --git a/deduce/lookup_struct_loader.py b/deduce/lookup_struct_loader.py index cd869b47..01105927 100644 --- a/deduce/lookup_struct_loader.py +++ b/deduce/lookup_struct_loader.py @@ -152,6 +152,7 @@ def load_street_lookup( cleaning_pipeline=[ dd.str.StripString(), dd.str.FilterByLength(min_len=4), + dd.str.LowercaseTail(), ], ) From 712ebe45df92a0c3cd4ef9ec1de178d2406f07d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Mon, 4 Mar 2024 16:17:19 +0100 Subject: [PATCH 08/39] Make `TestLookupStruct` work regardless of cwd --- tests/unit/test_lookup_struct.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/unit/test_lookup_struct.py b/tests/unit/test_lookup_struct.py index 978ac121..ffb4f4b8 100644 --- a/tests/unit/test_lookup_struct.py +++ b/tests/unit/test_lookup_struct.py @@ -1,4 +1,6 @@ import io + +from os.path import dirname, realpath from pathlib import Path from unittest.mock import patch @@ -12,7 +14,8 @@ validate_lookup_struct_cache, ) -DATA_PATH = Path(".").cwd() / "tests" / "data" / "lookup" +_TESTS_DIR = dirname(dirname(realpath(__file__))) +DATA_PATH = Path(_TESTS_DIR) / "data" / "lookup" class TestLookupStruct: From c5bc0b68b2a7c7ac412010dd768c26e532ccd12c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Mon, 4 Mar 2024 16:32:17 +0100 Subject: [PATCH 09/39] Use `pytest-datadir` for data fixtures Beware! `poetry.lock` is not up-to-date in this commit (and most recent commits wouldn't work with the current last released version of `docdeid`, anyway). 
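
For reference, the `shared_datadir` fixture that `pytest-datadir` provides is
a `pathlib.Path` to a per-test temporary copy of the `data` directory sitting
next to the test module; the copy is what makes the tests independent of the
current working directory. A sketch of the pattern (the test itself is made
up):

```python
def test_reads_fixture(shared_datadir):
    # `shared_datadir` points at a temporary copy of tests/regression/data,
    # so a test may even modify the files without affecting other tests.
    cases = (shared_datadir / "names.json").read_text(encoding="utf-8")
    assert cases
```
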
--- pyproject.toml | 2 +- .../data}/ages.json | 0 .../data}/dates.json | 0 .../data}/emails.json | 0 .../data}/identifiers.json | 0 .../data}/institutions.json | 0 .../data}/locations.json | 0 .../data}/names.json | 0 .../data}/phone_numbers.json | 0 .../data}/urls.json | 0 tests/regression/test_regression.py | 36 +++++------ .../data/lookup/cache/lookup_structs.pickle | Bin .../data/lookup/src/lst_test/exceptions.txt | 0 .../data/lookup/src/lst_test/items.txt | 0 .../data/lookup/src/lst_test/transform.json | 0 .../data/lookup/src/lst_test_nested/items.txt | 0 .../src/lst_test_nested/lst_sublist/items.txt | 0 tests/{ => unit}/data/small.json | 0 tests/unit/test_lookup_struct.py | 60 ++++++++++-------- tests/unit/test_utils.py | 17 ++--- 20 files changed, 62 insertions(+), 53 deletions(-) rename tests/{data/regression_cases => regression/data}/ages.json (100%) rename tests/{data/regression_cases => regression/data}/dates.json (100%) rename tests/{data/regression_cases => regression/data}/emails.json (100%) rename tests/{data/regression_cases => regression/data}/identifiers.json (100%) rename tests/{data/regression_cases => regression/data}/institutions.json (100%) rename tests/{data/regression_cases => regression/data}/locations.json (100%) rename tests/{data/regression_cases => regression/data}/names.json (100%) rename tests/{data/regression_cases => regression/data}/phone_numbers.json (100%) rename tests/{data/regression_cases => regression/data}/urls.json (100%) rename tests/{ => unit}/data/lookup/cache/lookup_structs.pickle (100%) rename tests/{ => unit}/data/lookup/src/lst_test/exceptions.txt (100%) rename tests/{ => unit}/data/lookup/src/lst_test/items.txt (100%) rename tests/{ => unit}/data/lookup/src/lst_test/transform.json (100%) rename tests/{ => unit}/data/lookup/src/lst_test_nested/items.txt (100%) rename tests/{ => unit}/data/lookup/src/lst_test_nested/lst_sublist/items.txt (100%) rename tests/{ => unit}/data/small.json (100%) diff --git a/pyproject.toml b/pyproject.toml index e370140e..f59175de 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,10 +27,10 @@ author = "Vincent Menger" [tool.poetry.dependencies] python = "^3.9" rapidfuzz = "^2.11.1" -docdeid = "1.0.0" regex = "^2022.9.13" frozendict = "^2.3.10" deprecated = "^1.2.14" +pytest-datadir = "^1.5.0" [tool.poetry.group.dev] optional = false diff --git a/tests/data/regression_cases/ages.json b/tests/regression/data/ages.json similarity index 100% rename from tests/data/regression_cases/ages.json rename to tests/regression/data/ages.json diff --git a/tests/data/regression_cases/dates.json b/tests/regression/data/dates.json similarity index 100% rename from tests/data/regression_cases/dates.json rename to tests/regression/data/dates.json diff --git a/tests/data/regression_cases/emails.json b/tests/regression/data/emails.json similarity index 100% rename from tests/data/regression_cases/emails.json rename to tests/regression/data/emails.json diff --git a/tests/data/regression_cases/identifiers.json b/tests/regression/data/identifiers.json similarity index 100% rename from tests/data/regression_cases/identifiers.json rename to tests/regression/data/identifiers.json diff --git a/tests/data/regression_cases/institutions.json b/tests/regression/data/institutions.json similarity index 100% rename from tests/data/regression_cases/institutions.json rename to tests/regression/data/institutions.json diff --git a/tests/data/regression_cases/locations.json b/tests/regression/data/locations.json similarity index 100% rename from 
tests/data/regression_cases/locations.json rename to tests/regression/data/locations.json diff --git a/tests/data/regression_cases/names.json b/tests/regression/data/names.json similarity index 100% rename from tests/data/regression_cases/names.json rename to tests/regression/data/names.json diff --git a/tests/data/regression_cases/phone_numbers.json b/tests/regression/data/phone_numbers.json similarity index 100% rename from tests/data/regression_cases/phone_numbers.json rename to tests/regression/data/phone_numbers.json diff --git a/tests/data/regression_cases/urls.json b/tests/regression/data/urls.json similarity index 100% rename from tests/data/regression_cases/urls.json rename to tests/regression/data/urls.json diff --git a/tests/regression/test_regression.py b/tests/regression/test_regression.py index 3bdb442e..1a102d3a 100644 --- a/tests/regression/test_regression.py +++ b/tests/regression/test_regression.py @@ -39,65 +39,65 @@ def annotators_from_group(model: Deduce, group: str) -> set[str]: class TestRegression: - def test_regression_name(self, model): + def test_regression_name(self, model, shared_datadir): regression_test( model=model, - examples_file="tests/data/regression_cases/names.json", + examples_file=shared_datadir / "names.json", enabled=annotators_from_group(model, "names"), ) - def test_regression_location(self, model): + def test_regression_location(self, model, shared_datadir): regression_test( model=model, - examples_file="tests/data/regression_cases/locations.json", + examples_file=shared_datadir / "locations.json", enabled=annotators_from_group(model, "locations"), ) - def test_regression_institution(self, model): + def test_regression_institution(self, model, shared_datadir): regression_test( model=model, - examples_file="tests/data/regression_cases/institutions.json", + examples_file=shared_datadir / "institutions.json", enabled=annotators_from_group(model, "institutions"), ) - def test_regression_date(self, model): + def test_regression_date(self, model, shared_datadir): regression_test( model=model, - examples_file="tests/data/regression_cases/dates.json", + examples_file=shared_datadir / "dates.json", enabled=annotators_from_group(model, "dates"), ) - def test_regression_age(self, model): + def test_regression_age(self, model, shared_datadir): regression_test( model=model, - examples_file="tests/data/regression_cases/ages.json", + examples_file=shared_datadir / "ages.json", enabled=annotators_from_group(model, "ages"), ) - def test_regression_identifier(self, model): + def test_regression_identifier(self, model, shared_datadir): regression_test( model=model, - examples_file="tests/data/regression_cases/identifiers.json", + examples_file=shared_datadir / "identifiers.json", enabled=annotators_from_group(model, "identifiers"), ) - def test_regression_phone(self, model): + def test_regression_phone(self, model, shared_datadir): regression_test( model=model, - examples_file="tests/data/regression_cases/phone_numbers.json", + examples_file=shared_datadir / "phone_numbers.json", enabled=annotators_from_group(model, "phone_numbers"), ) - def test_regression_email(self, model): + def test_regression_email(self, model, shared_datadir): regression_test( model=model, - examples_file="tests/data/regression_cases/emails.json", + examples_file=shared_datadir / "emails.json", enabled=annotators_from_group(model, "email_addresses"), ) - def test_regression_url(self, model): + def test_regression_url(self, model, shared_datadir): regression_test( model=model, - 
examples_file="tests/data/regression_cases/urls.json", + examples_file=shared_datadir / "urls.json", enabled=annotators_from_group(model, "urls"), ) diff --git a/tests/data/lookup/cache/lookup_structs.pickle b/tests/unit/data/lookup/cache/lookup_structs.pickle similarity index 100% rename from tests/data/lookup/cache/lookup_structs.pickle rename to tests/unit/data/lookup/cache/lookup_structs.pickle diff --git a/tests/data/lookup/src/lst_test/exceptions.txt b/tests/unit/data/lookup/src/lst_test/exceptions.txt similarity index 100% rename from tests/data/lookup/src/lst_test/exceptions.txt rename to tests/unit/data/lookup/src/lst_test/exceptions.txt diff --git a/tests/data/lookup/src/lst_test/items.txt b/tests/unit/data/lookup/src/lst_test/items.txt similarity index 100% rename from tests/data/lookup/src/lst_test/items.txt rename to tests/unit/data/lookup/src/lst_test/items.txt diff --git a/tests/data/lookup/src/lst_test/transform.json b/tests/unit/data/lookup/src/lst_test/transform.json similarity index 100% rename from tests/data/lookup/src/lst_test/transform.json rename to tests/unit/data/lookup/src/lst_test/transform.json diff --git a/tests/data/lookup/src/lst_test_nested/items.txt b/tests/unit/data/lookup/src/lst_test_nested/items.txt similarity index 100% rename from tests/data/lookup/src/lst_test_nested/items.txt rename to tests/unit/data/lookup/src/lst_test_nested/items.txt diff --git a/tests/data/lookup/src/lst_test_nested/lst_sublist/items.txt b/tests/unit/data/lookup/src/lst_test_nested/lst_sublist/items.txt similarity index 100% rename from tests/data/lookup/src/lst_test_nested/lst_sublist/items.txt rename to tests/unit/data/lookup/src/lst_test_nested/lst_sublist/items.txt diff --git a/tests/data/small.json b/tests/unit/data/small.json similarity index 100% rename from tests/data/small.json rename to tests/unit/data/small.json diff --git a/tests/unit/test_lookup_struct.py b/tests/unit/test_lookup_struct.py index ffb4f4b8..da78f404 100644 --- a/tests/unit/test_lookup_struct.py +++ b/tests/unit/test_lookup_struct.py @@ -1,7 +1,5 @@ import io -from os.path import dirname, realpath -from pathlib import Path from unittest.mock import patch import docdeid as dd @@ -14,14 +12,12 @@ validate_lookup_struct_cache, ) -_TESTS_DIR = dirname(dirname(realpath(__file__))) -DATA_PATH = Path(_TESTS_DIR) / "data" / "lookup" - class TestLookupStruct: - def test_load_raw_itemset(self): + def test_load_raw_itemset(self, shared_datadir): - raw_itemset = load_raw_itemset(DATA_PATH / "src" / "lst_test") + raw_itemset = load_raw_itemset( + shared_datadir / "lookup" / "src" / "lst_test") assert len(raw_itemset) == 5 assert "de Vries" in raw_itemset @@ -31,16 +27,18 @@ def test_load_raw_itemset(self): assert "Pieters" in raw_itemset assert "Wolter" not in raw_itemset - def test_load_raw_itemset_nested(self): + def test_load_raw_itemset_nested(self, shared_datadir): - raw_itemset = load_raw_itemset(DATA_PATH / "src" / "lst_test_nested") + raw_itemset = load_raw_itemset( + shared_datadir / "lookup" / "src" / "lst_test_nested") assert raw_itemset == {"a", "b", "c", "d"} - def test_load_raw_itemsets(self): + def test_load_raw_itemsets(self, shared_datadir): raw_itemsets = load_raw_itemsets( - base_path=DATA_PATH, subdirs=["lst_test", "lst_test_nested"] + base_path=shared_datadir / "lookup", + subdirs=["lst_test", "lst_test_nested"] ) assert "test" in raw_itemsets @@ -48,7 +46,7 @@ def test_load_raw_itemsets(self): assert "test_nested" in raw_itemsets assert len(raw_itemsets["test_nested"]) == 4 - def 
test_validate_lookup_struct_cache_valid(self): + def test_validate_lookup_struct_cache_valid(self, shared_datadir): cache = { "deduce_version": "2.5.0", @@ -62,10 +60,12 @@ class MockStats: with patch("pathlib.Path.glob", return_value=[1, 2, 3]): with patch("os.stat", return_value=MockStats()): assert validate_lookup_struct_cache( - cache=cache, base_path=DATA_PATH, deduce_version="2.5.0" + cache=cache, + base_path=shared_datadir / "lookup", + deduce_version="2.5.0" ) - def test_validate_lookup_struct_cache_file_changes(self): + def test_validate_lookup_struct_cache_file_changes(self, shared_datadir): cache = { "deduce_version": "2.5.0", @@ -79,45 +79,53 @@ class MockStats: with patch("pathlib.Path.glob", return_value=[1, 2, 3]): with patch("os.stat", return_value=MockStats()): assert not validate_lookup_struct_cache( - cache=cache, base_path=DATA_PATH, deduce_version="2.5.0" + cache=cache, + base_path=shared_datadir / "lookup", + deduce_version="2.5.0" ) - @patch("deduce.lookup_structs.validate_lookup_struct_cache", return_value=True) - def test_load_lookup_structs_from_cache(self, _): + @patch("deduce.lookup_structs.validate_lookup_struct_cache", + return_value=True) + def test_load_lookup_structs_from_cache(self, _, shared_datadir): ds_collection = load_lookup_structs_from_cache( - base_path=DATA_PATH, deduce_version="_" + base_path=shared_datadir / "lookup", + deduce_version="_" ) assert len(ds_collection) == 2 assert "test" in ds_collection assert "test_nested" in ds_collection - @patch("deduce.lookup_structs.validate_lookup_struct_cache", return_value=True) - def test_load_lookup_structs_from_cache_nofile(self, _): + @patch("deduce.lookup_structs.validate_lookup_struct_cache", + return_value=True) + def test_load_lookup_structs_from_cache_nofile(self, _, shared_datadir): ds_collection = load_lookup_structs_from_cache( - base_path=DATA_PATH / "non_existing_dir", deduce_version="_" + base_path=shared_datadir / "non_existing_dir", + deduce_version="_" ) assert ds_collection is None - @patch("deduce.lookup_structs.validate_lookup_struct_cache", return_value=False) - def test_load_lookup_structs_from_cache_invalid(self, _): + @patch("deduce.lookup_structs.validate_lookup_struct_cache", + return_value=False) + def test_load_lookup_structs_from_cache_invalid(self, _, shared_datadir): ds_collection = load_lookup_structs_from_cache( - base_path=DATA_PATH, deduce_version="_" + base_path=shared_datadir / "lookup", + deduce_version="_" ) assert ds_collection is None @patch("builtins.open", return_value=io.BytesIO()) @patch("pickle.dump") - def test_cache_lookup_structs(self, _, mock_pickle_dump): + def test_cache_lookup_structs(self, _, mock_pickle_dump, shared_datadir): cache_lookup_structs( lookup_structs=dd.ds.DsCollection(), - base_path=DATA_PATH, + base_path=shared_datadir / "lookup", deduce_version="2.5.0", ) diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py index 0f0dff1a..ad055420 100644 --- a/tests/unit/test_utils.py +++ b/tests/unit/test_utils.py @@ -229,26 +229,27 @@ def test_apply_transform_no_strip_lines(self): class TestOptionalLoad: - def test_optional_load_items(self): + def test_optional_load_items(self, shared_datadir): - path = Path("tests/data/lookup/src/lst_test_nested/items.txt") + path = (shared_datadir / + "lookup" / "src" / "lst_test_nested" / "items.txt") assert utils.optional_load_items(path) == {"a", "b"} - def test_optional_load_items_nonexisting(self): + def test_optional_load_items_nonexisting(self, shared_datadir): - path = 
Path("tests/data/non/existing/file.txt") + path = shared_datadir / "non" / "existing" / "file.txt" assert utils.optional_load_items(path) is None - def test_optional_load_json(self): + def test_optional_load_json(self, shared_datadir): - path = Path("tests/data/small.json") + path = shared_datadir / "small.json" assert utils.optional_load_json(path) == {"test": True} - def test_optional_load_json_nonexisting(self): + def test_optional_load_json_nonexisting(self, shared_datadir): - path = Path("tests/data/non/existing/file.json") + path = shared_datadir / "non" / "existing" / "file.json" assert utils.optional_load_json(path) is None From 9ebca78d60bbba8cd518ed8b35b09f3330ff8c6a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Mon, 4 Mar 2024 23:20:02 +0100 Subject: [PATCH 10/39] Minimize data fixtures for tests --- tests/conftest.py | 8 - .../lst_healthcare_institution/exceptions.txt | 3 + .../lst_healthcare_institution/items.txt | 3 + .../src/institutions/lst_hospital/items.txt | 5 + .../institutions/lst_hospital_abbr/items.txt | 2 + .../lst_placename/lst_municipality/items.txt | 2 + .../lst_placename/lst_province/items.txt | 2 + .../lst_placename/lst_region/items.txt | 5 + .../lst_residence/exceptions.txt | 5 + .../lst_placename/lst_residence/items.txt | 2 + .../src/locations/lst_street/exceptions.txt | 5 + .../lookup/src/locations/lst_street/items.txt | 3 + .../src/locations/lst_street/streets_bag.txt | 3 + .../src/names/lst_first_name/exceptions.txt | 5 + .../lookup/src/names/lst_first_name/items.txt | 4 + .../lookup/src/names/lst_initial/items.txt | 5 + .../lookup/src/names/lst_interfix/items.txt | 5 + .../names/lst_interfix_surname/exceptions.txt | 3 + .../src/names/lst_interfix_surname/items.txt | 3 + .../lookup/src/names/lst_prefix/items.txt | 5 + .../src/names/lst_surname/exceptions.txt | 5 + .../lookup/src/names/lst_surname/items.txt | 4 + .../whitelist/lst_common_word/exceptions.txt | 5 + .../src/whitelist/lst_common_word/items.txt | 6 + .../whitelist/lst_eponymous_disease/items.txt | 5 + .../lst_eponymous_single/items.txt | 5 + .../src/whitelist/lst_medical_term/items.txt | 5 + .../src/whitelist/lst_stop_word/items.txt | 5 + tests/pipeline/test_deduce.py | 9 + tests/regression/data/lookup/src/__init__.py | 17 + .../lst_healthcare_institution/exceptions.txt | 3 + .../lst_healthcare_institution/items.txt | 241 ++++++ .../lst_healthcare_institution/transform.json | 51 ++ .../src/institutions/lst_hospital/items.txt | 39 + .../institutions/lst_hospital/transform.json | 101 +++ .../institutions/lst_hospital_abbr/items.txt | 21 + .../lst_placename/lst_municipality/items.txt | 8 + .../lst_placename/lst_province/items.txt | 3 + .../lst_placename/lst_region/items.txt | 9 + .../lst_residence/exceptions.txt | 7 + .../lst_placename/lst_residence/items.txt | 29 + .../locations/lst_placename/transform.json | 189 +++++ .../src/locations/lst_street/exceptions.txt | 7 + .../lookup/src/locations/lst_street/items.txt | 236 ++++++ .../src/locations/lst_street/streets_bag.txt | 255 +++++++ .../src/locations/lst_street/transform.json | 712 ++++++++++++++++++ .../src/names/lst_first_name/exceptions.txt | 10 + .../lookup/src/names/lst_first_name/items.txt | 18 + .../lookup/src/names/lst_initial/items.txt | 54 ++ .../lookup/src/names/lst_interfix/items.txt | 44 ++ .../names/lst_interfix_surname/exceptions.txt | 3 + .../src/names/lst_interfix_surname/items.txt | 18 + .../lookup/src/names/lst_prefix/items.txt | 45 ++ .../src/names/lst_surname/exceptions.txt | 7 + 
.../lookup/src/names/lst_surname/items.txt | 32 + .../whitelist/lst_common_word/exceptions.txt | 6 + .../src/whitelist/lst_common_word/items.txt | 12 + .../whitelist/lst_eponymous_disease/items.txt | 5 + .../lst_eponymous_single/items.txt | 5 + .../lst_eponymous_single/transform.json | 22 + .../lst_eponymous_disease/transform.json | 39 + .../src/whitelist/lst_medical_term/items.txt | 18 + .../src/whitelist/lst_stop_word/items.txt | 6 + tests/regression/test_regression.py | 13 + 64 files changed, 2404 insertions(+), 8 deletions(-) delete mode 100644 tests/conftest.py create mode 100644 tests/pipeline/data/lookup/src/institutions/lst_healthcare_institution/exceptions.txt create mode 100644 tests/pipeline/data/lookup/src/institutions/lst_healthcare_institution/items.txt create mode 100644 tests/pipeline/data/lookup/src/institutions/lst_hospital/items.txt create mode 100644 tests/pipeline/data/lookup/src/institutions/lst_hospital_abbr/items.txt create mode 100644 tests/pipeline/data/lookup/src/locations/lst_placename/lst_municipality/items.txt create mode 100644 tests/pipeline/data/lookup/src/locations/lst_placename/lst_province/items.txt create mode 100644 tests/pipeline/data/lookup/src/locations/lst_placename/lst_region/items.txt create mode 100644 tests/pipeline/data/lookup/src/locations/lst_placename/lst_residence/exceptions.txt create mode 100644 tests/pipeline/data/lookup/src/locations/lst_placename/lst_residence/items.txt create mode 100644 tests/pipeline/data/lookup/src/locations/lst_street/exceptions.txt create mode 100644 tests/pipeline/data/lookup/src/locations/lst_street/items.txt create mode 100644 tests/pipeline/data/lookup/src/locations/lst_street/streets_bag.txt create mode 100644 tests/pipeline/data/lookup/src/names/lst_first_name/exceptions.txt create mode 100644 tests/pipeline/data/lookup/src/names/lst_first_name/items.txt create mode 100644 tests/pipeline/data/lookup/src/names/lst_initial/items.txt create mode 100644 tests/pipeline/data/lookup/src/names/lst_interfix/items.txt create mode 100644 tests/pipeline/data/lookup/src/names/lst_interfix_surname/exceptions.txt create mode 100644 tests/pipeline/data/lookup/src/names/lst_interfix_surname/items.txt create mode 100644 tests/pipeline/data/lookup/src/names/lst_prefix/items.txt create mode 100644 tests/pipeline/data/lookup/src/names/lst_surname/exceptions.txt create mode 100644 tests/pipeline/data/lookup/src/names/lst_surname/items.txt create mode 100644 tests/pipeline/data/lookup/src/whitelist/lst_common_word/exceptions.txt create mode 100644 tests/pipeline/data/lookup/src/whitelist/lst_common_word/items.txt create mode 100644 tests/pipeline/data/lookup/src/whitelist/lst_eponymous_disease/items.txt create mode 100644 tests/pipeline/data/lookup/src/whitelist/lst_eponymous_disease/lst_eponymous_single/items.txt create mode 100644 tests/pipeline/data/lookup/src/whitelist/lst_medical_term/items.txt create mode 100644 tests/pipeline/data/lookup/src/whitelist/lst_stop_word/items.txt create mode 100644 tests/regression/data/lookup/src/__init__.py create mode 100644 tests/regression/data/lookup/src/institutions/lst_healthcare_institution/exceptions.txt create mode 100644 tests/regression/data/lookup/src/institutions/lst_healthcare_institution/items.txt create mode 100644 tests/regression/data/lookup/src/institutions/lst_healthcare_institution/transform.json create mode 100644 tests/regression/data/lookup/src/institutions/lst_hospital/items.txt create mode 100644 
tests/regression/data/lookup/src/institutions/lst_hospital/transform.json create mode 100644 tests/regression/data/lookup/src/institutions/lst_hospital_abbr/items.txt create mode 100644 tests/regression/data/lookup/src/locations/lst_placename/lst_municipality/items.txt create mode 100644 tests/regression/data/lookup/src/locations/lst_placename/lst_province/items.txt create mode 100644 tests/regression/data/lookup/src/locations/lst_placename/lst_region/items.txt create mode 100644 tests/regression/data/lookup/src/locations/lst_placename/lst_residence/exceptions.txt create mode 100644 tests/regression/data/lookup/src/locations/lst_placename/lst_residence/items.txt create mode 100644 tests/regression/data/lookup/src/locations/lst_placename/transform.json create mode 100644 tests/regression/data/lookup/src/locations/lst_street/exceptions.txt create mode 100644 tests/regression/data/lookup/src/locations/lst_street/items.txt create mode 100644 tests/regression/data/lookup/src/locations/lst_street/streets_bag.txt create mode 100644 tests/regression/data/lookup/src/locations/lst_street/transform.json create mode 100644 tests/regression/data/lookup/src/names/lst_first_name/exceptions.txt create mode 100644 tests/regression/data/lookup/src/names/lst_first_name/items.txt create mode 100644 tests/regression/data/lookup/src/names/lst_initial/items.txt create mode 100644 tests/regression/data/lookup/src/names/lst_interfix/items.txt create mode 100644 tests/regression/data/lookup/src/names/lst_interfix_surname/exceptions.txt create mode 100644 tests/regression/data/lookup/src/names/lst_interfix_surname/items.txt create mode 100644 tests/regression/data/lookup/src/names/lst_prefix/items.txt create mode 100644 tests/regression/data/lookup/src/names/lst_surname/exceptions.txt create mode 100644 tests/regression/data/lookup/src/names/lst_surname/items.txt create mode 100644 tests/regression/data/lookup/src/whitelist/lst_common_word/exceptions.txt create mode 100644 tests/regression/data/lookup/src/whitelist/lst_common_word/items.txt create mode 100644 tests/regression/data/lookup/src/whitelist/lst_eponymous_disease/items.txt create mode 100644 tests/regression/data/lookup/src/whitelist/lst_eponymous_disease/lst_eponymous_single/items.txt create mode 100644 tests/regression/data/lookup/src/whitelist/lst_eponymous_disease/lst_eponymous_single/transform.json create mode 100644 tests/regression/data/lookup/src/whitelist/lst_eponymous_disease/transform.json create mode 100644 tests/regression/data/lookup/src/whitelist/lst_medical_term/items.txt create mode 100644 tests/regression/data/lookup/src/whitelist/lst_stop_word/items.txt diff --git a/tests/conftest.py b/tests/conftest.py deleted file mode 100644 index e7139230..00000000 --- a/tests/conftest.py +++ /dev/null @@ -1,8 +0,0 @@ -import pytest - -from deduce import Deduce - - -@pytest.fixture(scope="session") -def model(): - return Deduce(build_lookup_structs=True) diff --git a/tests/pipeline/data/lookup/src/institutions/lst_healthcare_institution/exceptions.txt b/tests/pipeline/data/lookup/src/institutions/lst_healthcare_institution/exceptions.txt new file mode 100644 index 00000000..8441a19d --- /dev/null +++ b/tests/pipeline/data/lookup/src/institutions/lst_healthcare_institution/exceptions.txt @@ -0,0 +1,3 @@ +Oktober +PSOAS +Prisma diff --git a/tests/pipeline/data/lookup/src/institutions/lst_healthcare_institution/items.txt b/tests/pipeline/data/lookup/src/institutions/lst_healthcare_institution/items.txt new file mode 100644 index 00000000..1aece757 --- 
/dev/null +++ b/tests/pipeline/data/lookup/src/institutions/lst_healthcare_institution/items.txt @@ -0,0 +1,3 @@ +De SeT Residentie +De Zorggroep, Woonzorgcentrum Bosscherhof +Oktober diff --git a/tests/pipeline/data/lookup/src/institutions/lst_hospital/items.txt b/tests/pipeline/data/lookup/src/institutions/lst_hospital/items.txt new file mode 100644 index 00000000..65fc1cf9 --- /dev/null +++ b/tests/pipeline/data/lookup/src/institutions/lst_hospital/items.txt @@ -0,0 +1,5 @@ +Weesperplein Ziekenhuis +De Stadsmaten +Sint Vincentius Ziekenhuis +Wilhelmina Kinder +Zaans Medisch Centrum diff --git a/tests/pipeline/data/lookup/src/institutions/lst_hospital_abbr/items.txt b/tests/pipeline/data/lookup/src/institutions/lst_hospital_abbr/items.txt new file mode 100644 index 00000000..f32bd3b0 --- /dev/null +++ b/tests/pipeline/data/lookup/src/institutions/lst_hospital_abbr/items.txt @@ -0,0 +1,2 @@ +MUMC +UMCU diff --git a/tests/pipeline/data/lookup/src/locations/lst_placename/lst_municipality/items.txt b/tests/pipeline/data/lookup/src/locations/lst_placename/lst_municipality/items.txt new file mode 100644 index 00000000..101dadfc --- /dev/null +++ b/tests/pipeline/data/lookup/src/locations/lst_placename/lst_municipality/items.txt @@ -0,0 +1,2 @@ +Hengelo +Utrecht diff --git a/tests/pipeline/data/lookup/src/locations/lst_placename/lst_province/items.txt b/tests/pipeline/data/lookup/src/locations/lst_placename/lst_province/items.txt new file mode 100644 index 00000000..45804100 --- /dev/null +++ b/tests/pipeline/data/lookup/src/locations/lst_placename/lst_province/items.txt @@ -0,0 +1,2 @@ +Friesland +Utrecht diff --git a/tests/pipeline/data/lookup/src/locations/lst_placename/lst_region/items.txt b/tests/pipeline/data/lookup/src/locations/lst_placename/lst_region/items.txt new file mode 100644 index 00000000..0afab38a --- /dev/null +++ b/tests/pipeline/data/lookup/src/locations/lst_placename/lst_region/items.txt @@ -0,0 +1,5 @@ +Oostergo +Prins Alexanderpolder +Rijk Van Nijmegen +Noordenveld +Rottum diff --git a/tests/pipeline/data/lookup/src/locations/lst_placename/lst_residence/exceptions.txt b/tests/pipeline/data/lookup/src/locations/lst_placename/lst_residence/exceptions.txt new file mode 100644 index 00000000..504b7724 --- /dev/null +++ b/tests/pipeline/data/lookup/src/locations/lst_placename/lst_residence/exceptions.txt @@ -0,0 +1,5 @@ +Raar +Smal +Zittend +Brand +Ie diff --git a/tests/pipeline/data/lookup/src/locations/lst_placename/lst_residence/items.txt b/tests/pipeline/data/lookup/src/locations/lst_placename/lst_residence/items.txt new file mode 100644 index 00000000..1486dde6 --- /dev/null +++ b/tests/pipeline/data/lookup/src/locations/lst_placename/lst_residence/items.txt @@ -0,0 +1,2 @@ +Felland +Utrecht diff --git a/tests/pipeline/data/lookup/src/locations/lst_street/exceptions.txt b/tests/pipeline/data/lookup/src/locations/lst_street/exceptions.txt new file mode 100644 index 00000000..4cb5b7ab --- /dev/null +++ b/tests/pipeline/data/lookup/src/locations/lst_street/exceptions.txt @@ -0,0 +1,5 @@ +Segment +Generaal +Berg +Vrij +Oost diff --git a/tests/pipeline/data/lookup/src/locations/lst_street/items.txt b/tests/pipeline/data/lookup/src/locations/lst_street/items.txt new file mode 100644 index 00000000..d7e8904c --- /dev/null +++ b/tests/pipeline/data/lookup/src/locations/lst_street/items.txt @@ -0,0 +1,3 @@ +Blauw Druifje +IJsweg +Visser diff --git a/tests/pipeline/data/lookup/src/locations/lst_street/streets_bag.txt 
b/tests/pipeline/data/lookup/src/locations/lst_street/streets_bag.txt new file mode 100644 index 00000000..73533e51 --- /dev/null +++ b/tests/pipeline/data/lookup/src/locations/lst_street/streets_bag.txt @@ -0,0 +1,3 @@ +Blauw druifje +IJsweg +Visser diff --git a/tests/pipeline/data/lookup/src/names/lst_first_name/exceptions.txt b/tests/pipeline/data/lookup/src/names/lst_first_name/exceptions.txt new file mode 100644 index 00000000..5c793052 --- /dev/null +++ b/tests/pipeline/data/lookup/src/names/lst_first_name/exceptions.txt @@ -0,0 +1,5 @@ +Lung +Ad +Ace +Man +Heino diff --git a/tests/pipeline/data/lookup/src/names/lst_first_name/items.txt b/tests/pipeline/data/lookup/src/names/lst_first_name/items.txt new file mode 100644 index 00000000..eaa20709 --- /dev/null +++ b/tests/pipeline/data/lookup/src/names/lst_first_name/items.txt @@ -0,0 +1,4 @@ +Annes +Jan +Jansen +Peter diff --git a/tests/pipeline/data/lookup/src/names/lst_initial/items.txt b/tests/pipeline/data/lookup/src/names/lst_initial/items.txt new file mode 100644 index 00000000..b4925874 --- /dev/null +++ b/tests/pipeline/data/lookup/src/names/lst_initial/items.txt @@ -0,0 +1,5 @@ +P +Š +Ñ +A +Ã diff --git a/tests/pipeline/data/lookup/src/names/lst_interfix/items.txt b/tests/pipeline/data/lookup/src/names/lst_interfix/items.txt new file mode 100644 index 00000000..73318d7b --- /dev/null +++ b/tests/pipeline/data/lookup/src/names/lst_interfix/items.txt @@ -0,0 +1,5 @@ +uit de +von +in het +de +in 't diff --git a/tests/pipeline/data/lookup/src/names/lst_interfix_surname/exceptions.txt b/tests/pipeline/data/lookup/src/names/lst_interfix_surname/exceptions.txt new file mode 100644 index 00000000..87b8f311 --- /dev/null +++ b/tests/pipeline/data/lookup/src/names/lst_interfix_surname/exceptions.txt @@ -0,0 +1,3 @@ +Amersfoort +Utrecht +Veenendaal diff --git a/tests/pipeline/data/lookup/src/names/lst_interfix_surname/items.txt b/tests/pipeline/data/lookup/src/names/lst_interfix_surname/items.txt new file mode 100644 index 00000000..da71be6d --- /dev/null +++ b/tests/pipeline/data/lookup/src/names/lst_interfix_surname/items.txt @@ -0,0 +1,3 @@ +Olst +Utrecht +Visser diff --git a/tests/pipeline/data/lookup/src/names/lst_prefix/items.txt b/tests/pipeline/data/lookup/src/names/lst_prefix/items.txt new file mode 100644 index 00000000..74c2f9a2 --- /dev/null +++ b/tests/pipeline/data/lookup/src/names/lst_prefix/items.txt @@ -0,0 +1,5 @@ +de weledelgeleerde +ing. +mr. +dr.h.c. +mevr. 
diff --git a/tests/pipeline/data/lookup/src/names/lst_surname/exceptions.txt b/tests/pipeline/data/lookup/src/names/lst_surname/exceptions.txt new file mode 100644 index 00000000..4f5351e9 --- /dev/null +++ b/tests/pipeline/data/lookup/src/names/lst_surname/exceptions.txt @@ -0,0 +1,5 @@ +Oost +Tel +Klein +Broer +Lang diff --git a/tests/pipeline/data/lookup/src/names/lst_surname/items.txt b/tests/pipeline/data/lookup/src/names/lst_surname/items.txt new file mode 100644 index 00000000..770a5ece --- /dev/null +++ b/tests/pipeline/data/lookup/src/names/lst_surname/items.txt @@ -0,0 +1,4 @@ +Jansen +Killaars +Peter +Visser diff --git a/tests/pipeline/data/lookup/src/whitelist/lst_common_word/exceptions.txt b/tests/pipeline/data/lookup/src/whitelist/lst_common_word/exceptions.txt new file mode 100644 index 00000000..3a2cb72d --- /dev/null +++ b/tests/pipeline/data/lookup/src/whitelist/lst_common_word/exceptions.txt @@ -0,0 +1,5 @@ +boos +zondag +hemel +bel +helder diff --git a/tests/pipeline/data/lookup/src/whitelist/lst_common_word/items.txt b/tests/pipeline/data/lookup/src/whitelist/lst_common_word/items.txt new file mode 100644 index 00000000..615e97f5 --- /dev/null +++ b/tests/pipeline/data/lookup/src/whitelist/lst_common_word/items.txt @@ -0,0 +1,6 @@ +oog +oktober +raam +soms +we +wijs diff --git a/tests/pipeline/data/lookup/src/whitelist/lst_eponymous_disease/items.txt b/tests/pipeline/data/lookup/src/whitelist/lst_eponymous_disease/items.txt new file mode 100644 index 00000000..569997d0 --- /dev/null +++ b/tests/pipeline/data/lookup/src/whitelist/lst_eponymous_disease/items.txt @@ -0,0 +1,5 @@ +Gerbec-Morgagni-Adams-Stokes +Non-Hodgkin +Diamond-Blackfan +Baller-Gerold +Alpers-Huttenlocher diff --git a/tests/pipeline/data/lookup/src/whitelist/lst_eponymous_disease/lst_eponymous_single/items.txt b/tests/pipeline/data/lookup/src/whitelist/lst_eponymous_disease/lst_eponymous_single/items.txt new file mode 100644 index 00000000..000ca362 --- /dev/null +++ b/tests/pipeline/data/lookup/src/whitelist/lst_eponymous_disease/lst_eponymous_single/items.txt @@ -0,0 +1,5 @@ +ziekte van Eales +ziekte van Glanzmann +Krabbe ziekte +Dupuytren ziekte +ziekte van Laron diff --git a/tests/pipeline/data/lookup/src/whitelist/lst_medical_term/items.txt b/tests/pipeline/data/lookup/src/whitelist/lst_medical_term/items.txt new file mode 100644 index 00000000..5e56f639 --- /dev/null +++ b/tests/pipeline/data/lookup/src/whitelist/lst_medical_term/items.txt @@ -0,0 +1,5 @@ +mdl +kinetic +functies +zetpil +schaambeen diff --git a/tests/pipeline/data/lookup/src/whitelist/lst_stop_word/items.txt b/tests/pipeline/data/lookup/src/whitelist/lst_stop_word/items.txt new file mode 100644 index 00000000..bc5fa458 --- /dev/null +++ b/tests/pipeline/data/lookup/src/whitelist/lst_stop_word/items.txt @@ -0,0 +1,5 @@ +nu +zonder +doen +kan +een diff --git a/tests/pipeline/test_deduce.py b/tests/pipeline/test_deduce.py index e652db7d..19620b28 100644 --- a/tests/pipeline/test_deduce.py +++ b/tests/pipeline/test_deduce.py @@ -1,4 +1,7 @@ +import pytest + import docdeid as dd +from deduce import Deduce from deduce.person import Person @@ -10,6 +13,12 @@ ) +@pytest.fixture +def model(shared_datadir): + return Deduce(save_lookup_structs=False, + lookup_data_path=shared_datadir / "lookup") + + class TestDeduce: def test_annotate(self, model): metadata = {"patient": Person(first_names=["Jan"], surname="Jansen")} diff --git a/tests/regression/data/lookup/src/__init__.py b/tests/regression/data/lookup/src/__init__.py new file mode 100644 
index 00000000..ae186018 --- /dev/null +++ b/tests/regression/data/lookup/src/__init__.py @@ -0,0 +1,17 @@ +all_lists = [ + "institutions/lst_healthcare_institution", + "institutions/lst_hospital", + "institutions/lst_hospital_abbr", + "locations/lst_placename", + "locations/lst_street", + "names/lst_first_name", + "names/lst_initial", + "names/lst_interfix", + "names/lst_interfix_surname", + "names/lst_prefix", + "names/lst_surname", + "whitelist/lst_common_word", + "whitelist/lst_eponymous_disease", + "whitelist/lst_medical_term", + "whitelist/lst_stop_word", +] diff --git a/tests/regression/data/lookup/src/institutions/lst_healthcare_institution/exceptions.txt b/tests/regression/data/lookup/src/institutions/lst_healthcare_institution/exceptions.txt new file mode 100644 index 00000000..8441a19d --- /dev/null +++ b/tests/regression/data/lookup/src/institutions/lst_healthcare_institution/exceptions.txt @@ -0,0 +1,3 @@ +Oktober +PSOAS +Prisma diff --git a/tests/regression/data/lookup/src/institutions/lst_healthcare_institution/items.txt b/tests/regression/data/lookup/src/institutions/lst_healthcare_institution/items.txt new file mode 100644 index 00000000..124a1778 --- /dev/null +++ b/tests/regression/data/lookup/src/institutions/lst_healthcare_institution/items.txt @@ -0,0 +1,241 @@ +Alphega apotheek Wilhelmina +Alrijne Apotheek Leiden +Alrijne Apotheek Leiderdorp +Alrijne Zorggroep Fysiotherapie +Alrijne Zorggroep, Verpleeghuis Leythenrode +Alrijne Zorggroep, Verpleeghuis Oudshoorn +Altrecht +Altrecht Acute Psychiatrie +Altrecht Angststoornissen +Altrecht Bipolair +Altrecht Eetstoornissen Rintveld +Altrecht Kinder- en jeugdpsychiatrie +Altrecht Neuropsychiatrie Vesalius +Altrecht Ouderenpsychiatrie +Altrecht Persoonlijkheidsstoornissen +Altrecht Psychiatrie en verslaving +Altrecht Psychosomatiek Eikenboom +Altrecht Voortgezette Klinische Behandeling +Altrecht Vroege psychose ABC +Altrecht senior +Altrecht, Expertise Centrum Diagnostiek +Annelieke Janssen Fysiotherapie, Manuele Therapie en Training +Antonius Apotheek +Antonius Hypercare +Antonius Kindzorg Thuis +Antonius Zorg Thuis +Antonius Zorggroep +Apotheek Antoniushove +Apotheek Centraal Militair Hospitaal +Apotheek Erasmusplein +Apotheek Wilhelmina +BENU Apotheek Antoniusveld +BENU Apotheek Erasmus +BENU Apotheek Wilhelminapark +BrabantZorg, Sint Antonius +Buurtzorg Utrecht Wilhelminapark +Canté praktijk voor Pedagogiek en Psychologie +Careaz Antoniushove +Careyn +Careyn 1e lijns ergotherapie Utrecht Stad +Careyn 1e lijns ergotherapie Zuid Hollandse Eilanden +Careyn 1e lijns ergotherapie regio Utrecht West +Careyn 1e-lijns Fysiotherapie Utrecht-Stad +Careyn 1e-lijns fysiotherapie Zuid-Hollandse Eilanden +Careyn Bernissesteyn +Careyn Blankenburg +Careyn Buitenhof +Careyn De Ark +Careyn De Geuzenveste +Careyn De Plantage +Careyn De Prinses +Careyn De Rozenhoek +Careyn De Vier Ambachten +Careyn De Vloot +Careyn Dierenriem +Careyn Fysiotherapie +Careyn Geriatrische Revalidatie +Careyn Grootenhoek +Careyn Hart van Groenewoud +Careyn Hart van Rozenburg +Careyn Hart van Zuidland +Careyn Herman Gorterhof +Careyn Het Waterschapshuis +Careyn Maria-Oord +Careyn Maria-Oord, Geriatrische revalidatie +Careyn Nieuw Chartreuse +Careyn Parkwijk +Careyn Rosendael +Careyn Rozenhof +Careyn Snavelenburg +Careyn Swellengrebel +Careyn Tamarinde +Careyn Tamarinde, Geriatrische revalidatie +Careyn Thuiszorg en Wijkverpleging +Careyn Thuiszorg en Wijkverpleging Breda +Careyn Thuiszorg en Wijkverpleging De Lier, Maasdijk, Maasland, Schipluiden, Den Hoorn en 
Maassluis +Careyn Thuiszorg en Wijkverpleging Goeree-Overflakkee +Careyn Thuiszorg en Wijkverpleging Hellevoetsluis +Careyn Thuiszorg en Wijkverpleging Hoeksche Waard +Careyn Thuiszorg en Wijkverpleging Montfoort +Careyn Thuiszorg en Wijkverpleging Naaldwijk, Honselersdijk, 's-Gravenzande, Monster, Poeldijk, Kwintsheul en Wateringen +Careyn Thuiszorg en Wijkverpleging Nissewaard +Careyn Thuiszorg en Wijkverpleging Oudewater, Woerden, Montfoort +Careyn Thuiszorg en Wijkverpleging Pijnacker-Nootdorp en Delft +Careyn Thuiszorg en Wijkverpleging Rozenburg +Careyn Thuiszorg en Wijkverpleging Schiedam +Careyn Thuiszorg en Wijkverpleging Stichtse Vecht, Maarssen, Breukelen, Loenen, Nigtevegt +Careyn Thuiszorg en Wijkverpleging Utrecht Overvecht +Careyn Thuiszorg en Wijkverpleging Vlaardingen +Careyn Thuiszorg en Wijkverpleging Voorne-Bernisse +Careyn Thuiszorg en Wijkverpleging Zeist +Careyn Thuiszorg en Wijkverpleging, CVA-nazorg team Zuid-Hollandse Eilanden +Careyn Thuiszorg en Wijkverpleging, Wondexpertiseteam Zuid-Hollandse Eilanden +Careyn Torenhoeve +Careyn Verpleging en Verzorging thuis Harmelen +Careyn Voeding en Dieet +Careyn Warande +Careyn Weddesteyn +Careyn Weddesteyn, Geriatrische revalidatie +Careyn Woerdblok +Careyn Zes Rozen +Careyn Zorgcentrum +Careyn, Thuiszorg en Wijkverpleging, Palliatief team Zuid-Hollandse Eilanden +Careyn, Thuiszorg en Wijkverpleging, Specialistisch team Zuid-Hollandse Eilanden +CareynThuiszorg en Wijkverpleging Ronde Venen, Mijdrecht, Vinkeveen, Wilnis, Abcoude +Compas Huisartsenpraktijk +Daan & Van Ardenne Huisartsen +De Clavers Goes-Noord Erasmuspark +De Hoogstraat +De Hoogstraat Revalidatie +De Kind- en Jeugdspecialist +De Koperhorst +De Rijnhoven, Antoniushof zorg thuis, Wijkverpleging en verzorging +De SeT Residentie +De Zorggroep, Woonzorgcentrum Bosscherhof +Dialyseafdeling Alrijne Ziekenhuis Alphen aan den Rijn +Dialyseafdeling Alrijne Ziekenhuis Leiderdorp +Dialyseafdeling Canisius-Wilhelmina Ziekenhuis +Dialyseafdeling Erasmus MC +Dialyseafdeling HMC Antoniushove +Dialysecentrum Canisius-Wilhelmina Ziekenhuis Druten +Diëtheek Woerden, St. Antonius Ziekenhuis +Erasmus Care - Tandartsen Mariastraat +Erasmus MC - Kanker Instituut +Erasmus MC - Poliklinische apotheek +Erasmus Psy +Evean Erasmushuis +Fokusproject Utrecht Antonius +Fysio Steins Hoogstraat +Fysio-Oedeem-Manuele therapie Janssen Vaals +Fysiohuis, vestiging Wilhelminastraat +Fysiotherapie Antoniusveld +Fysiotherapie De Clavers Wilhelminadorp +Fysiotherapie De Jong, Wilhelminakade +Fysiotherapie Janneke Janssen +Fysiotherapie Joke Janssen +Fysiotherapie Wilhelminapark +Fysiotherapiepraktijk M.J.C.C. Janssen +Fysiotherapiepraktijk Van den Berg, Alphen aan den Rijn, Alrijne Ziekenhuis +GGz Centraal, Zon en Schild +GOED Ridderkerk, Huisartsenpraktijk M.A. Janssen +Gezondheidscentrum Wilhelminapier +Hallux podotherapie Den Bosch Antonius +Hoogstraat +Huid- en Oedeemtherapie Gerdy Janssen +Huisartsen WateringseVeld - Schuring, Janssens, Van Beek & Veldhoven +Huisartsen Wolters en Janssen +Huisartsengroep De Poort, Praktijk K.J. Janssen-van Hemmen +Huisartsenpraktijk A.G.M. 
Janssen +Huisartsenpraktijk Antoniusveld +Huisartsenpraktijk De Hoogstraat +Huisartsenpraktijk Janssen Breda +Huisartsenpraktijk Janssen Eindhoven +IJsselheem, Zorghuis Wilhelmina van Sonsbeeck +Innofeet Kampen, Isala poli +Innofeet Zwolle, Isala +Isala Fertiliteitscentrum +Isala Klinieken +Isala Meppel - HD thuishemodialyse +Isala Zwolle - hemodialyse, peritoneale dialyse en thuisdialyse +Janssen & Partners +Janssen Huidtherapie +Janssen Manuele Therapie E.S. +Janssen Podologie +Janssen Psychiatrie +Janssen Psychologie & Cognitieve Gedragstherapie +Janssen van Dijke Fysiotherapeuten +Laurens Antonius Binnenweg +Livit Orthopedie Rotterdam, Servicepunt Erasmus MC Zorgwinkel +Logopediepraktijk Canisius +Logopediepraktijk M.C. Heidema en H.L. Janssen +Lunet zorg, woonlocatie Hoogstraat +Marjon Janssen Wervelendespieren +Mea Fysio Leeuwarden - Erasmus +Meander +Meander Medisch Centrum, afdeling Trombosedienst +Medisch Centrum Aalst, huisarts Janssen +Medisch Centrum Antoniusveld, Diëtisten +Medisch Centrum Nieuwenhagen, Huisartsenpraktijk Janssen +Medisch Centrum Sint Anna, Huisartsenpraktijk Janssen en Van Loon +Mijzo - Sint Antonius +Mondzorg Erasmus +Noorderbreedte, Woonzorgcentrum Erasmus +O.M.G. Janssen, tandarts-implantoloog +ORO - Wilhelminastraat +Oktober +Orthodontiepraktijk Wilhelminapark +P.P.C. Janssen, zelfstandig bedrijfsarts +Pameijer, werkplek Atelier Juffrouw Janssen +Paul Janssen, arts acupunctuur +Podologiepraktijk Janssen Buitensport +Praktijk Orthomanuele geneeskunde R. Janssen +Praktijk voor Logopedie Janssen +Praktijk voor Psychiatrie en Psychotherapie A.M. Janssen +Praktijk voor psychotherapie Caroline Janssen +Psychiatrische afdeling van het Canisius-Wilhelmina Ziekenhuis +Psychologen Wilhelmina +Pysychologiepraktijk G.A.J. Janssen +Radar Woonvorm Wilhelminasingel +Reinaerde, Serviceteam Koningin Wilhelminalaan +SVRZ ZorgThuis Goes - Erasmuspark +Sint Annaklooster, Antonius +St Antonius Apotheek Nieuwegein +St Antonius Apotheek Utrecht +St Antonius Dialysecentrum +St Antonius Spatadercentrum +St Antonius SportsClinic +St. Antonius Cardicare +Surplus, woonzorgcentrum Antonius Abt +Tandartsen- en verwijspraktijk Wilhelminapark +Tandartsenpraktijk Janssen +Tandartsenpraktijk S. Lodder en A. 
Lodder-Janssen +Tandartsenpraktijk Wilhelminapark +Tandartsenpraktijk Wilhelminaweg, Praktijk voor Orale Implantologie Veluwezoom +Tandartspraktijk Janssen en Linssen +Tandartspraktijk Janssen&Janssen +Tandheelkundig Centrum Wilhelminapier +Thuiszorgwinkel Medipoint | Careyn | Spijkenisse +Thuiszorgwinkel Medipoint | Careyn | Vlaardingen +Trombosedienst Isala +Trombosedienst Isala Meppel +UMC Utrecht, Dialyseafdeling +UMC Utrecht, Wilhelmina Kinderziekenhuis, afdeling Kinderperitoneale dialyse +Verzorgingshuis De Koperhorst +Voetencentrum Wender, Beuningen Wilhelminalaan +Voetencentrum Wender, Rotterdam Wilhelminapier +Wijk en Janssen Psychologisch Advies +Wilhelmina Apotheek +Wilhelmina Ziekenhuis Assen (WZA) +WilhelminaOord +Zon en Schild +ZorgSaam Antonius +Zorgboerderij Wilhelminahoeve +Zorggroep Elde Maasduinen, zorglocatie Antoniushof +Zorggroep Solis Paramedisch Centrum PW Janssen, diëtetiek +Zorggroep Solis Paramedisch Centrum PW Janssen, ergotherapie +Zorggroep Solis Paramedisch Centrum PW Janssen, fysiotherapie +Zorggroep Solis Paramedisch Centrum PW Janssen, logopedie +Zorggroep Solis, Geriatrische Revalidatie Zorg PW Janssen +Zorggroep Solis, Verpleeghuis PW Janssen +Zorgwinkel Erasmus MC diff --git a/tests/regression/data/lookup/src/institutions/lst_healthcare_institution/transform.json b/tests/regression/data/lookup/src/institutions/lst_healthcare_institution/transform.json new file mode 100644 index 00000000..a56ed996 --- /dev/null +++ b/tests/regression/data/lookup/src/institutions/lst_healthcare_institution/transform.json @@ -0,0 +1,51 @@ +{ + "transforms": { + "instelling": { + "Huisartsenpraktijk": [ + "Huisartsenpraktijk", + "huisartsenpraktijk", + "Huisartspraktijk", + "huisartspraktijk" + ] + }, + "prefix": { + "\\bDe\\b": [ + "De", + "de" + ] + }, + "punct": { + "\\.": [ + ".", + "" + ], + "-": [ + "-", + "", + " " + ], + " & ": [ + " & ", + " en " + ] + }, + "spell": { + "y": [ + "y", + "ij" + ], + "Y": [ + "Y", + "IJ" + ], + "ij": [ + "ij", + "y" + ], + "IJ": [ + "IJ", + "Y" + ] + } + } +} \ No newline at end of file diff --git a/tests/regression/data/lookup/src/institutions/lst_hospital/items.txt b/tests/regression/data/lookup/src/institutions/lst_hospital/items.txt new file mode 100644 index 00000000..829115f9 --- /dev/null +++ b/tests/regression/data/lookup/src/institutions/lst_hospital/items.txt @@ -0,0 +1,39 @@ +Academisch Ziekenhuis Amsterdam +Academisch Ziekenhuis Groningen +Academisch Ziekenhuis Leiden +Academisch Ziekenhuis Maastricht +Academisch Ziekenhuis Nijmegen +Academisch Ziekenhuis Rotterdam +Academisch Ziekenhuis Utrecht +Alrijne +Alrijne Ziekenhuis +Antonius +Antonius Ziekenhuis +Canisius-Wilhelmina Ziekenhuis +Centraal Militair Hospitaal +Centraal Militair Hospitaal Utrecht +De Stadsmaten +Erasmus +Erasmus Medisch Centrum +Isala +Isala Ziekenhuis +Meander +Meander Medisch Centrum +Militair Hospitaal +Máxima +P.W. 
Janssen Ziekenhuis +Prinses Máxima Centrum +Sint Antonius Stichting +Sint Antonius Ziekenhuis +Sint Vincentius Ziekenhuis +Stads en Academisch Ziekenhuis +Universitair Medisch Centrum +Universitair Medisch Centrum Utrecht +Weesperplein Ziekenhuis +Wilhelmina Gasthuis +Wilhelmina Kinder +Wilhelmina Kinder Ziekenhuis +Wilhelmina Ziekenhuis +Zaans Medisch Centrum +het Lange Land +het Lange Land Ziekenhuis diff --git a/tests/regression/data/lookup/src/institutions/lst_hospital/transform.json b/tests/regression/data/lookup/src/institutions/lst_hospital/transform.json new file mode 100644 index 00000000..dca99a2b --- /dev/null +++ b/tests/regression/data/lookup/src/institutions/lst_hospital/transform.json @@ -0,0 +1,101 @@ +{ + "transforms": { + "zkh": { + " (Ziekenhuis|Gasthuis|Kliniek)": [ + " Ziekenhuis", + " Ziekenhuizen", + " Zkh", + " Zkh.", + " Gasthuis", + " Kliniek", + " Klinieken", + " ziekenhuis", + " ziekenhuizen", + " zkh", + " zkh.", + " gasthuis", + " kliniek", + " klinieken", + "ziekenhuis", + "ziekenhuizen", + "zkh", + "zkh.", + "gasthuis", + "kliniek", + "klinieken" + ], + "^(Ziekenhuis|Gasthuis|Kliniek)": [ + "Ziekenhuis", + "Zkh", + "Zkh.", + "Gasthuis", + "Kliniek", + "ziekenhuis", + "zkh", + "zkh.", + "gasthuis", + "kliniek" + ], + "Medisch Centrum": [ + "Medisch Centrum", + "MC" + ] + }, + "zkh_2": { + "Universitair Medisch Centrum": [ + "Universitair Medisch Centrum", + "UMC" + ] + }, + "prefix": { + "\\bhet\\b": [ + "Het", + "het", + "'T", + "'t", + "`T", + "`t", + "T", + "t", + "" + ], + "\\bSint\\b": [ + "Sint", + "sint", + "St.", + "st.", + "st", + "" + ] + }, + "punct": { + "\\.": [ + ".", + "" + ], + "-": [ + "-", + "", + " " + ] + }, + "spelling": { + "y": [ + "y", + "ij" + ], + "Y": [ + "Y", + "IJ" + ], + "ij": [ + "ij", + "y" + ], + "IJ": [ + "IJ", + "Y" + ] + } + } +} \ No newline at end of file diff --git a/tests/regression/data/lookup/src/institutions/lst_hospital_abbr/items.txt b/tests/regression/data/lookup/src/institutions/lst_hospital_abbr/items.txt new file mode 100644 index 00000000..2c06e0be --- /dev/null +++ b/tests/regression/data/lookup/src/institutions/lst_hospital_abbr/items.txt @@ -0,0 +1,21 @@ +AMC +AZU +CMH +Diak +EKZ +EMC +ETZ +JBZ +LUMC +MCL +MMC +MUMC +PMC +UMC +UMCG +UMCN +UMCU +VMC +VUMC +WKZ +ZMC diff --git a/tests/regression/data/lookup/src/locations/lst_placename/lst_municipality/items.txt b/tests/regression/data/lookup/src/locations/lst_placename/lst_municipality/items.txt new file mode 100644 index 00000000..dc5fa599 --- /dev/null +++ b/tests/regression/data/lookup/src/locations/lst_placename/lst_municipality/items.txt @@ -0,0 +1,8 @@ +Alphen-Chaam +Goeree-Overflakkee +Hengelo +Molenlanden +Nederweert +Sluis +Smallingerland +Utrecht diff --git a/tests/regression/data/lookup/src/locations/lst_placename/lst_province/items.txt b/tests/regression/data/lookup/src/locations/lst_placename/lst_province/items.txt new file mode 100644 index 00000000..deebd57e --- /dev/null +++ b/tests/regression/data/lookup/src/locations/lst_placename/lst_province/items.txt @@ -0,0 +1,3 @@ +Friesland +Noord-Brabant +Utrecht diff --git a/tests/regression/data/lookup/src/locations/lst_placename/lst_region/items.txt b/tests/regression/data/lookup/src/locations/lst_placename/lst_region/items.txt new file mode 100644 index 00000000..df64de53 --- /dev/null +++ b/tests/regression/data/lookup/src/locations/lst_placename/lst_region/items.txt @@ -0,0 +1,9 @@ +Eemvallei +Friese Wouden +Goeree-Overflakkee +Noordenveld +Oostergo +Overflakkee +Prins Alexanderpolder +Rijk Van 
Nijmegen +Rottum diff --git a/tests/regression/data/lookup/src/locations/lst_placename/lst_residence/exceptions.txt b/tests/regression/data/lookup/src/locations/lst_placename/lst_residence/exceptions.txt new file mode 100644 index 00000000..b8d3363e --- /dev/null +++ b/tests/regression/data/lookup/src/locations/lst_placename/lst_residence/exceptions.txt @@ -0,0 +1,7 @@ +Bosch +Brand +Ie +Leeuwen +Raar +Smal +Zittend diff --git a/tests/regression/data/lookup/src/locations/lst_placename/lst_residence/items.txt b/tests/regression/data/lookup/src/locations/lst_placename/lst_residence/items.txt new file mode 100644 index 00000000..a5766a3e --- /dev/null +++ b/tests/regression/data/lookup/src/locations/lst_placename/lst_residence/items.txt @@ -0,0 +1,29 @@ +Alphen +Bolsward +Bontekoe +Boornbergum +Borkel En Schaft +Bosch +Bosch En Duin +Boven-Leeuwen +Bovenkerk +Bovenkerk (Nh) +Bovenkerk (Zh) +Chaam +Drieënhuizen +Felland +Friesland +Heer +Hoogstraat +Hoogstraat (Li) +Hoogstraat (Nb) +Janssenstichting +Leeuwen +Nederweert +Noord +Sint Antoniusbank +Sluis +Utrecht +Wilhelminadorp +Wilhelminaoord +Zuidgeest diff --git a/tests/regression/data/lookup/src/locations/lst_placename/transform.json b/tests/regression/data/lookup/src/locations/lst_placename/transform.json new file mode 100644 index 00000000..70239c9d --- /dev/null +++ b/tests/regression/data/lookup/src/locations/lst_placename/transform.json @@ -0,0 +1,189 @@ +{ + "transforms": { + "prefix": { + "\\bhet\\b": [ + "Het", + "het", + "'T", + "'t", + "`T", + "`t", + "T", + "t" + ], + "\\bSint\\b": [ + "Sint", + "sint", + "St.", + "st." + ], + "\\bit\\b": [ + "It", + "it", + "Het", + "het", + "'T", + "'t", + "`T", + "`t", + "T", + "t" + ] + }, + "prop": { + "(\\b|^)Aan\\b": [ + "Aan", + "aan" + ], + "(\\b|^)Bij\\b": [ + "Bij", + "bij" + ], + "(\\b|^)De\\b": [ + "De", + "de" + ], + "(\\b|^)Den\\b": [ + "Den", + "den" + ], + "(\\b|^)En\\b": [ + "En", + "en" + ], + "(\\b|^)Het\\b": [ + "Het", + "het", + "'T", + "'t", + "`T", + "`t", + "T", + "t" + ], + "(\\b|^)In\\b": [ + "In", + "in" + ], + "(\\b|^)Oan\\b": [ + "Oan", + "oan" + ], + "(\\b|^)Of\\b": [ + "Of", + "of" + ], + "(\\b|^)Op\\b": [ + "Op", + "op" + ], + "(\\b|^)Over\\b": [ + "Over", + "over" + ], + "(\\b|^)'S\\b": [ + "'S", + "'s" + ], + "(\\b|^)Ter\\b": [ + "Ter", + "ter" + ], + "(\\b|^)Van\\b": [ + "Van", + "van", + "v.", + "V." 
+ ] + }, + "province": { + "(?<=\\()Fr(?=\\))": [ + "Fr", + "FR", + "Frl", + "FRL", + "F" + ], + "(?<=\\()Gr(?=\\))": [ + "Gr", + "GR", + "Gn", + "GN", + "G" + ], + "(?<=\\()Dr(?=\\))": [ + "Dr", + "DR", + "Dn", + "DN", + "D" + ], + "(?<=\\()Ov(?=\\))": [ + "Ov", + "OV", + "O" + ], + "(?<=\\()Nh(?=\\))": [ + "Nh", + "NH" + ], + "(?<=\\()Ut(?=\\))": [ + "Ut", + "UT", + "U" + ], + "(?<=\\()Gld(?=\\))": [ + "Gld", + "GLD", + "G" + ], + "(?<=\\()Li(?=\\))": [ + "Li", + "LI", + "L" + ], + "(?<=\\()Nb(?=\\))": [ + "Nb", + "NB" + ], + "(?<=\\()Zh(?=\\))": [ + "Zh", + "ZH" + ], + "(?<=\\()Ze(?=\\))": [ + "Ze", + "ZE", + "Z" + ] + }, + "punct": { + "\\.": [ + ".", + "" + ], + "-": [ + "-", + "", + " " + ] + }, + "spell": { + "y": [ + "y", + "ij" + ], + "Y": [ + "Y", + "IJ" + ], + "ij": [ + "ij", + "y" + ], + "IJ": [ + "IJ", + "Y" + ] + } + } +} \ No newline at end of file diff --git a/tests/regression/data/lookup/src/locations/lst_street/exceptions.txt b/tests/regression/data/lookup/src/locations/lst_street/exceptions.txt new file mode 100644 index 00000000..8cfa7df6 --- /dev/null +++ b/tests/regression/data/lookup/src/locations/lst_street/exceptions.txt @@ -0,0 +1,7 @@ +Berg +Bosch +Generaal +Noord +Oost +Segment +Vrij diff --git a/tests/regression/data/lookup/src/locations/lst_street/items.txt b/tests/regression/data/lookup/src/locations/lst_street/items.txt new file mode 100644 index 00000000..053f192b --- /dev/null +++ b/tests/regression/data/lookup/src/locations/lst_street/items.txt @@ -0,0 +1,236 @@ +Abraham Martinus Sorgstraat +Aertjanssenstraat +Antonius Bieleveltstraat +Antonius Deusinglaan +Antonius Heggelaan +Antonius Matthaeuslaan +Antonius O.H. Tellegenlaan +Antonius Struijckenstraat +Antonius van Gilsweg +Antonius van Lieropstraat +Antoniushof +Antoniuslaan +Antoniusmeule +Antoniuspark +Antoniusplein +Antoniusschutlaan +Antoniusstede +Antoniusstraat +Antoniusweg +Arnold Janssenlaan +Arnoldus Janssenstraat +August Janssenweg +Bisschop Janssensstraat +Blauw Druifje +Bosch +Brabant +Burgemeester Canisiusstraat +Burgemeester Erasmusstraat +Burgemeester Janssen van Sonlaan +Burgemeester Janssenring +Burgemeester Janssensstraat +Burgemeester Janssenstraat +Canisiushof +Canisiusstraat +Christina Weidner-Slorsstraat +Cor Janssenstraat +Desiderius Erasmusstraat +Dick de Hoogstraat +Directeur Janssenstraat +Doctor Janssenslaan +Doctor Janssenstraat +Doctor L.J.F. 
Janssenstraat +Doctor Paul Janssenweg +Dokter Janssenplein +Duin +Erasmusdomein +Erasmusflat +Erasmusgracht +Erasmushage +Erasmushof +Erasmuslaan +Erasmuspad +Erasmuspark +Erasmusplaats +Erasmusplein +Erasmussingel +Erasmusstate +Erasmusstraat +Erasmustuin +Erasmusweg +Florastraat +Gerrit Hoogstraatenlaan +Goeree +Goossen Janssenstraat +Groot Hoogstraat +Grote Hoogstraat +Harrie Janssenstraat +Hoogstraat +Hoogstraat Zijpad +Hoogstraatje +IJsweg +Jan Janssenpad +Janssen & Fritsenplein +Janssen van Raaystraat +Janssen-Dingsweg +Janssen-Stichting +Janssenlaan +Janssensstraat +Janssenstichting +Janssenstraat +Janssenweg +Joachim Kleinsorgstraat +Juliana Wilhelmina van der Noordaatuin +Juliana Wilhelmina van der Noordatuin +Kapelaan Janssenstraat +Kleine Hoogstraat +Koningin Wilhelmina Boulevard +Koningin Wilhelminahaven NZ +Koningin Wilhelminahaven ZOZ +Koningin Wilhelminahaven ZZ +Koningin Wilhelminahof +Koningin Wilhelminakade +Koningin Wilhelminalaan +Koningin Wilhelminapark +Koningin Wilhelminaplein +Koningin Wilhelminasingel +Koningin Wilhelminastraa +Koningin Wilhelminastraat +Koningin Wilhelminaweg +Koningin-Wilhelminalaan +Korte Hoogstraat +Laan +Laantje achter Wilhelmina +Lange Landen +Lange Landweg +Laurent Janssensstraat +Loristraat +Lorkstraat +Louisalaan +Magda Janssenslaan +Magda Janssenspad +Magda Janssensstraat +Magda Janssenstraat +Maria Montessoristraat +Meander +Meester Janssenpad +Meester Janssenweg +Monseigneur F. Janssenstraat +Montessoristraat +Nieuwe Hoogstraat +Noord +Oude +Oude Hoogstraat +Overflakkee +P. W. Janssenweg +P.W. Janssenlaan +Park Hoogstraaten +Pastoor Antoniusstraat +Pastoor Janssenhof +Pastoor Janssenlaan +Pastoor Janssens van Calmthoutweg +Pastoor Janssensplantsoen +Pastoor Janssensstraat +Pastoor Janssenstraat +Pastoor Janssensweg +Pastoor Leonardus Canisiusstraat +Pastoor-Janssenstraat +Pater Arnold Janssenpad +Pater Janssenstraat +Peter Janssenweg +Petrus Canisiusstraat +Pierre Janssenstraat +Pieter +Pieter Verhoogstraat +Pieter de Hoogstraat +Pr Wilhelminastraat +Prinses Ariane Wilhelminapad +Prinses Wilhelminalaan +Prinses Wilhelminasingel +Prinses Wilhelminastraat +Prinses Wilhelminaweg +Professor Doctor J.C. 
Schoutelaan +Professor Loréstraat +Protterstrjitte +Secretaris Janssenstraat +Sint Antonius Abt Hof +Sint Antoniusbank +Sint Antoniusdreef +Sint Antoniusgilde +Sint Antoniushof +Sint Antoniuslaan +Sint Antoniuspad +Sint Antoniuspark +Sint Antoniusplein +Sint Antoniusstraat +Sint Antoniusveldweg +Sint Antoniusweg +Sint Canisiussingel +Sint Petrus Canisiuslaan +Sint-Antoniusstraat +Slot +Sluis +Smallingerland +To Janssenstraat +Verlengde Hoogstraat +Verlengde Wilhelminalaan +Verlengde Wilhelminastraat +Visser +Vlietsorgstraat +Wethouder Janssenpad +Wilhelmina Bladergroenstraat +Wilhelmina Bladergroenweg +Wilhelmina Blombergplein +Wilhelmina Druckererf +Wilhelmina Druckerhoeve +Wilhelmina Druckerhof +Wilhelmina Druckerlaan +Wilhelmina Druckerpad +Wilhelmina Druckerstraat +Wilhelmina Druckertuin +Wilhelmina Druckerweg +Wilhelmina Geevestraat +Wilhelmina Hofman-Pootstraat +Wilhelmina Nijhoffstraat +Wilhelmina Sangersstraat +Wilhelmina Schweickhardtplein +Wilhelmina Smitstraat +Wilhelmina Voorwindenkade +Wilhelmina van Essenlaan +Wilhelmina van Haeftendreef +Wilhelmina van Pruisenlaan +Wilhelmina van Pruisenweg +Wilhelminadijk +Wilhelminadreef +Wilhelminadwarsweg +Wilhelminahoeve +Wilhelminahof +Wilhelminahofweg +Wilhelminakade +Wilhelminakanaal Noord +Wilhelminakanaal Oost +Wilhelminakanaal Zuid +Wilhelminakanaalstraat +Wilhelminalaan +Wilhelminapark +Wilhelminaparkflat +Wilhelminapassage +Wilhelminaplantsoen +Wilhelminaplein +Wilhelminapolderweg +Wilhelminasingel +Wilhelminasluis +Wilhelminastichting +Wilhelminastraat +Wilhelminastrjitte +Wilhelminatorenvoetpad +Wilhelminaveld +Wilhelminaweg +Wilhelminawijk +Wilhelminawyk +Wilhelminazijstraat +Willem Loréstraat +Zuidgeest +de Hoogstraat +de Lormstraat +het Lange Land diff --git a/tests/regression/data/lookup/src/locations/lst_street/streets_bag.txt b/tests/regression/data/lookup/src/locations/lst_street/streets_bag.txt new file mode 100644 index 00000000..db02c7b3 --- /dev/null +++ b/tests/regression/data/lookup/src/locations/lst_street/streets_bag.txt @@ -0,0 +1,255 @@ +Abraham Martinus Sorgstraat +Aertjanssenstraat +Antonius Bieleveltstraat +Antonius Deusinglaan +Antonius Heggelaan +Antonius Matthaeuslaan +Antonius O.H. Tellegenlaan +Antonius Struijckenstraat +Antonius van Gilsweg +Antonius van Lieropstraat +Antoniushof +Antoniuslaan +Antoniusmeule +Antoniuspark +Antoniusplein +Antoniusschutlaan +Antoniusstede +Antoniusstraat +Antoniusweg +Arnold Janssenlaan +Arnoldus Janssenstraat +August Janssenweg +Bisschop Janssensstraat +Blauw druifje +Bosch +Brabant +Burg Canisiusstraat +Burg Janssen v Sonln +Burg Janssenstraat +Burg. Janssenring +Burg. Janssensstraat +Burg.Janssensstraat +Burg.Janssenstraat +Burgemeester Erasmusstraat +Burgemeester Janssenstraat +Canisiushof +Canisiusstraat +Christina Weidner-Slorsstraat +Cor Janssenstraat +De Hoogstraat +De Lormstraat +Desiderius Erasmusstraat +Dick de Hoogstraat +Directeur Janssenstraat +Dokter Janssenplein +Dr Janssenstraat +Dr. Janssenslaan +Dr. L.J.F. Janssenstraat +Dr. 
Paul Janssenweg +Duin +Erasmusdomein +Erasmusflat +Erasmusgracht +Erasmushage +Erasmushof +Erasmuslaan +Erasmuspad +Erasmuspark +Erasmusplaats +Erasmusplein +Erasmussingel +Erasmusstate +Erasmusstraat +Erasmustuin +Erasmusweg +Florastraat +Gerrit Hoogstraatenlaan +Goeree +Goossen Janssenstraat +Groot Hoogstraat +Grote Hoogstraat +Harrie Janssenstraat +Het Lange Land +Hoogstraat +Hoogstraat Zijpad +Hoogstraatje +IJsweg +Jan Janssenpad +Janssen & Fritsenplein +Janssen van Raaystraat +Janssen-Dingsweg +Janssen-Stichting +Janssenlaan +Janssensstraat +Janssenstichting +Janssenstraat +Janssenweg +Joachim Kleinsorgstraat +Juliana Wilhelmina van der Noordaatuin +Juliana Wilhelmina van der Noordatuin +Kapelaan Janssenstraat +Kleine Hoogstraat +Kon Wilhelminastraat +Kon Wilhelminaweg +Kon. Wilhelminastraat +Kon.Wilhelminalaan +Koningin Wilhelmina Boulevard +Koningin Wilhelminahaven NZ +Koningin Wilhelminahaven ZOZ +Koningin Wilhelminahaven ZZ +Koningin Wilhelminahof +Koningin Wilhelminakade +Koningin Wilhelminalaan +Koningin Wilhelminapark +Koningin Wilhelminaplein +Koningin Wilhelminasingel +Koningin Wilhelminastr +Koningin Wilhelminastraa +Koningin Wilhelminastraat +Koningin Wilhelminaweg +Koningin-Wilhelminalaan +Korte Hoogstraat +Laan +Laantje achter Wilhelmina +Lange Landen +Lange Landweg +Laurent Janssensstraat +Loristraat +Lorkstraat +Louisalaan +Magda Janssenslaan +Magda Janssenspad +Magda Janssensstraat +Magda Janssenstraat +Maria Montessoristraat +Meander +Meester Janssenpad +Meester Janssenweg +Mgr. F. Janssenstraat +Montessoristraat +Nieuwe Hoogstraat +Noord +Oude +Oude Hoogstraat +Overflakkee +P W Janssenweg +P.W. Janssenlaan +Park Hoogstraaten +Pastoor Antoniusstraat +Pastoor Janssenhof +Pastoor Janssenlaan +Pastoor Janssens van Calmthoutweg +Pastoor Janssensplantsoen +Pastoor Janssensstraat +Pastoor Janssenstraat +Pastoor Janssensweg +Pastoor Leonardus Canisiusstraat +Pastoor-Janssenstraat +Pater Arnold Janssenpad +Pater Janssenstraat +Peter Janssenweg +Petrus Canisiusstraat +Pierre Janssenstraat +Pieter +Pieter Verhoogstraat +Pieter de Hoogstraat +Pr Wilhelminastraat +Prinses Ariane Wilhelminapad +Prinses Wilhelminalaan +Prinses Wilhelminasingel +Prinses Wilhelminastraat +Prinses Wilhelminaweg +Prof. Dr. J.C. Schoutelaan +Professor Loréstraat +Protterstrjitte +Secr. Janssenstraat +Secretaris Janssenstraat +Sint Antonius Abt Hof +Sint Antoniusbank +Sint Antoniusdreef +Sint Antoniusgilde +Sint Antoniushof +Sint Antoniuslaan +Sint Antoniusplein +Sint Antoniusstraat +Sint Antoniusweg +Sint-Antoniusstraat +Slot +Sluis +Smallingerland +St Antoniusstraat +St Antoniusweg +St Petrus Canisiuslaan +St. Antoniuslaan +St. Antoniuspad +St. Antoniuspark +St. Antoniusplein +St. Antoniusstraat +St. Antoniusveldweg +St. Antoniusweg +St. 
Canisiussingel +St.-Antoniusstraat +St.Antoniusstraat +St.Antoniusweg +To Janssenstraat +Verlengde Hoogstraat +Verlengde Wilhelminalaan +Verlengde Wilhelminastraat +Visser +Vlietsorgstraat +Wethouder Janssenpad +Wilhelmina Bladergroenstraat +Wilhelmina Bladergroenweg +Wilhelmina Blombergplein +Wilhelmina Druckererf +Wilhelmina Druckerhoeve +Wilhelmina Druckerhof +Wilhelmina Druckerlaan +Wilhelmina Druckerpad +Wilhelmina Druckerstraat +Wilhelmina Druckertuin +Wilhelmina Druckerweg +Wilhelmina Geevestraat +Wilhelmina Hofman-Pootstraat +Wilhelmina Nijhoffstraat +Wilhelmina Sangersstraat +Wilhelmina Schweickhardtplein +Wilhelmina Smitstraat +Wilhelmina Voorwindenkade +Wilhelmina van Essenlaan +Wilhelmina van Haeftendreef +Wilhelmina van Pruisenlaan +Wilhelmina van Pruisenweg +Wilhelminadijk +Wilhelminadreef +Wilhelminadwarsweg +Wilhelminahoeve +Wilhelminahof +Wilhelminahofweg +Wilhelminakade +Wilhelminakanaal Noord +Wilhelminakanaal Oost +Wilhelminakanaal Zuid +Wilhelminakanaalstraat +Wilhelminalaan +Wilhelminapark +Wilhelminaparkflat +Wilhelminapassage +Wilhelminaplantsoen +Wilhelminaplein +Wilhelminapolderweg +Wilhelminasingel +Wilhelminasluis +Wilhelminastichting +Wilhelminastraat +Wilhelminastrjitte +Wilhelminatorenvoetpad +Wilhelminaveld +Wilhelminaweg +Wilhelminawijk +Wilhelminawyk +Wilhelminazijstraat +Willem Loréstraat +Zuidgeest +het Lange Land diff --git a/tests/regression/data/lookup/src/locations/lst_street/transform.json b/tests/regression/data/lookup/src/locations/lst_street/transform.json new file mode 100644 index 00000000..44cca14c --- /dev/null +++ b/tests/regression/data/lookup/src/locations/lst_street/transform.json @@ -0,0 +1,712 @@ +{ + "transforms": { + "prefix": { + "\\bAbraham\\b": [ + "Abraham", + "Abr.", + "abr." + ], + "\\bAdmiraal\\b": [ + "Admiraal", + "Adm.", + "adm." + ], + "\\bAlbert\\b": [ + "Albert", + "Alb.", + "alb." + ], + "\\bBurgemeester\\b": [ + "Burgemeester", + "Burg.", + "burg." + ], + "\\bChris\\b": [ + "Chris", + "Chr.", + "chr." + ], + "\\bCommissaris\\b": [ + "Commissaris", + "Comm.", + "comm." + ], + "\\bDominee\\b": [ + "Dominee", + "Ds.", + "ds." + ], + "\\bDoctor\\b": [ + "Doctor", + "Dr.", + "dr." + ], + "\\bDokter\\b": [ + "Dokter", + "Dr.", + "dr." + ], + "\\bDoctorandus\\b": [ + "Doctorandus", + "Drs.", + "drs." + ], + "\\bFamilie\\b": [ + "Familie", + "Fam.", + "fam." + ], + "\\bGebroeders\\b": [ + "Gebroeders", + "Gebr.", + "gebr.", + "Gebrs.", + "gebrs." + ], + "\\bGeneraal\\b": [ + "Generaal", + "Gen.", + "gen." + ], + "\\bHertog\\b": [ + "Hertog", + "Hert.", + "hert." + ], + "\\bIngenieur\\b": [ + "Ingenieur", + "Ir.", + "ir.", + "Ing.", + "ing." + ], + "\\bJacobus\\b": [ + "Jacobus", + "Jac.", + "jac." + ], + "\\bJacob\\b": [ + "Jacobus", + "Jac.", + "jac." + ], + "\\bJacqueline\\b": [ + "Jacqueline", + "Jacq.", + "jacq." + ], + "\\bJonkhkeer\\b": [ + "Jonkhkeer", + "Jhr.", + "jhr." + ], + "\\bJonkvrouw\\b": [ + "Jonkvrouw", + "Jkvr.", + "jkvr." + ], + "\\bJohan\\b": [ + "Johan", + "Joh.", + "joh." + ], + "\\bKardinaal\\b": [ + "Kardinaal", + "Kard.", + "kard." + ], + "\\bKolonel\\b": [ + "Kolonel", + "Kol.", + "kol." + ], + "\\bKoningin\\b": [ + "Koningin", + "Kon.", + "kon." + ], + "\\bKoning\\b": [ + "Koning", + "Kon.", + "kon." + ], + "\\bMajoor\\b": [ + "Majoor", + "Maj.", + "maj." + ], + "\\bMevrouw\\b": [ + "Mevrouw", + "Mevr.", + "mevr." + ], + "\\bMinister\\b": [ + "Minister", + "Min.", + "min." + ], + "\\bMeester\\b": [ + "Meester", + "Mr.", + "mr." + ], + "\\bMonseigneur\\b": [ + "Monseigneur", + "Mgr.", + "mgr." 
+ ], + "\\bPrinses\\b": [ + "Prinses", + "Pr.", + "pr." + ], + "\\bProfessor\\b": [ + "Professor", + "Prof.", + "prof." + ], + "\\bRector\\b": [ + "Rector", + "Rect.", + "rect." + ], + "\\bSecretaris\\b": [ + "Secretaris", + "Secr.", + "secr." + ], + "\\bSenior\\b": [ + "Senior", + "Sr.", + "sr." + ], + "\\bSint\\b": [ + "Sint", + "sint", + "St.", + "st." + ], + "\\bTheo\\b": [ + "Theo", + "Th.", + "th." + ], + "\\bVeldmaarschalk\\b": [ + "Veldmaarschalk", + "Veldm.", + "Veldm" + ], + "\\bVicaris\\b": [ + "Vicaris", + "Vic.", + "vic." + ], + "\\bZuster\\b": [ + "Zuster", + "Zr.", + "zr." + ] + }, + "prop": { + "\\baan\\b": [ + "Aan", + "aan" + ], + "\\bachter\\b": [ + "Achter", + "achter" + ], + "\\band\\b": [ + "And", + "and" + ], + "\\bbie\\b": [ + "Bie", + "bie" + ], + "\\bbij\\b": [ + "Bij", + "bij" + ], + "\\bbinnenzijde\\b": [ + "Binnenzijde", + "binnenzijde", + "BZ", + "Bz", + "bz" + ], + "\\bbuitenzijde\\b": [ + "Buitenzijde", + "buitenzijde", + "BZ", + "Bz", + "bz" + ], + "\\bda\\b": [ + "Da", + "da" + ], + "\\bde\\b": [ + "De", + "de" + ], + "\\bdel\\b": [ + "Del", + "del" + ], + "\\bden\\b": [ + "Den", + "den" + ], + "\\bder\\b": [ + "Der", + "der" + ], + "\\bdes\\b": [ + "Des", + "des" + ], + "\\bdi\\b": [ + "Di", + "di" + ], + "\\bdie\\b": [ + "Die", + "die" + ], + "\\bdoor\\b": [ + "Door", + "door" + ], + "\\bdu\\b": [ + "Du", + "du" + ], + "\\bein\\b": [ + "Ein", + "ein" + ], + "\\ben\\b": [ + "En", + "en" + ], + "\\bfan\\b": [ + "Fan", + "fan" + ], + "\\bge\\b": [ + "Ge", + "ge" + ], + "\\bgen\\b": [ + "Gen", + "gen" + ], + "\\bhet\\b": [ + "Het", + "het", + "'T", + "'t", + "`T", + "`t", + "T", + "t" + ], + "\\bin\\b": [ + "In", + "in" + ], + "\\bis\\b": [ + "Is", + "is" + ], + "\\bit\\b": [ + "It", + "it", + "Het", + "het", + "'T", + "'t", + "`T", + "`t", + "T", + "t" + ], + "\\bla\\b": [ + "La", + "la" + ], + "\\blangs\\b": [ + "Langs", + "langs" + ], + "\\ble\\b": [ + "Le", + "le" + ], + "\\bnaar\\b": [ + "Naar", + "naar" + ], + "\\bnabij\\b": [ + "Nabij", + "nabij" + ], + "\\boan\\b": [ + "Oan", + "oan" + ], + "\\bof\\b": [ + "Of", + "of" + ], + "\\bom\\b": [ + "Om", + "om" + ], + "\\bonder\\b": [ + "Onder", + "onder" + ], + "\\bop\\b": [ + "Op", + "op" + ], + "\\bover\\b": [ + "Over", + "over" + ], + "\\bsur\\b": [ + "Sur", + "sur" + ], + "\\bte\\b": [ + "Te", + "te" + ], + "\\bten\\b": [ + "Ten", + "ten" + ], + "\\bter\\b": [ + "Ter", + "ter" + ], + "\\btot\\b": [ + "Tot", + "tot" + ], + "\\btusschen\\b": [ + "Tusschen", + "tusschen" + ], + "\\btussen\\b": [ + "Tussen", + "tussen" + ], + "\\but\\b": [ + "Ut", + "ut" + ], + "\\buten\\b": [ + "Uten", + "uten" + ], + "\\bvan\\b": [ + "Van", + "van", + "v.", + "V." 
+ ], + "\\bvon\\b": [ + "Von", + "von" + ], + "\\bvoor\\b": [ + "Voor", + "voor" + ] + }, + "windrichting": { + "\\bNoord$": [ + "Noord", + "noord", + "N" + ], + "\\bOost$": [ + "Oost", + "oost", + "O" + ], + "\\bZuid$": [ + "Zuid", + "zuid", + "Z" + ], + "\\bWest$": [ + "West", + "west", + "W" + ], + "NZ$": [ + "N.Z.", + "N.z.", + "n.z.", + "Noordzijde", + "noordzijde", + "" + ], + "OZ$": [ + "O.Z.", + "O.z.", + "o.z.", + "Oostzijde", + "oostzijde", + "" + ], + "ZZ$": [ + "Z.Z.", + "Z.z.", + "z.z.", + "Zuidzijde", + "zuidzijde", + "" + ], + "WZ$": [ + "W.Z.", + "W.z.", + "w.z.", + "Westzijde", + "westzijde", + "" + ], + "NO$": [ + "N.O.", + "N.o.", + "n.o.", + "" + ], + "NW$": [ + "N.W.", + "N.w.", + "n.w.", + "" + ], + "ZO$": [ + "Z.O.", + "Z.o.", + "z.o.", + "" + ], + "ZW$": [ + "Z.W.", + "Z.w.", + "z.w.", + "" + ] + }, + "suffix": { + "dreef$": [ + "dreef", + "drf" + ], + "gracht$": [ + "gracht", + "gr" + ], + "hof$": [ + "hof", + "hf" + ], + "laan$": [ + "laan", + "ln" + ], + "markt$": [ + "markt", + "mrkt" + ], + "pad$": [ + "pad", + "pd" + ], + "park$": [ + "park", + "prk" + ], + "plantsoen$": [ + "plantsoen", + "plnts", + "pltsn" + ], + "plein$": [ + "plein", + "pln" + ], + "singel$": [ + "singel", + "sngl" + ], + "steeg$": [ + "steeg", + "stg", + "st" + ], + "straat$": [ + "straat", + "str" + ], + "weg$": [ + "weg", + "wg" + ] + }, + "loc": { + "\\bAcker\\b": [ + "Acker", + "acker" + ], + "\\bAkker\\b": [ + "Akker", + "akker" + ], + "\\bBoulevard\\b": [ + "Boulevard", + "boulevard" + ], + "\\bDijk\\b": [ + "Dijk", + "dijk" + ], + "\\bDreef\\b": [ + "Dreef", + "dreef" + ], + "\\bDwarsweg\\b": [ + "Dwarsweg", + "dwarsweg" + ], + "\\bDyk\\b": [ + "Dyk", + "dyk" + ], + "\\bErf\\b": [ + "Erf", + "erf" + ], + "\\bHeide\\b": [ + "Heide", + "heide" + ], + "\\bHof\\b": [ + "Hof", + "hof" + ], + "\\bKade\\b": [ + "Kade", + "kade" + ], + "\\bKanaal\\b": [ + "Kanaal", + "kanaal" + ], + "\\bLaan\\b": [ + "Laan", + "laan" + ], + "\\bPad\\b": [ + "Pad", + "pad" + ], + "\\bPark\\b": [ + "Park", + "park" + ], + "\\bPlantsoen\\b": [ + "Plantsoen", + "plantsoen" + ], + "\\bPlein\\b": [ + "Plein", + "plein" + ], + "\\bReed\\b": [ + "Reed", + "reed" + ], + "\\bRotonde\\b": [ + "Rotonde", + "rotonde" + ], + "\\bSloot\\b": [ + "Sloot", + "sloot" + ], + "\\bSluis\\b": [ + "Sluis", + "sluis" + ], + "\\bSteeg\\b": [ + "Steeg", + "steeg" + ], + "\\bStraat\\b": [ + "Straat", + "straat" + ], + "\\bTunnel\\b": [ + "Tunnel", + "tunnel" + ], + "\\bWal\\b": [ + "Wal", + "wal" + ], + "\\bWeg\\b": [ + "Weg", + "weg" + ], + "\\bWei\\b": [ + "Wei", + "wei" + ], + "\\bWijk\\b": [ + "Wijk", + "wijk" + ], + "\\bVen\\b": [ + "Ven", + "ven" + ] + }, + "punct": { + "\\.": [ + ".", + "" + ], + "-": [ + "-", + "", + " " + ] + }, + "spelling": { + "y": [ + "y", + "ij" + ], + "Y": [ + "Y", + "IJ" + ], + "ij": [ + "ij", + "y" + ], + "IJ": [ + "IJ", + "Y" + ] + } + } +} \ No newline at end of file diff --git a/tests/regression/data/lookup/src/names/lst_first_name/exceptions.txt b/tests/regression/data/lookup/src/names/lst_first_name/exceptions.txt new file mode 100644 index 00000000..c10fbe0f --- /dev/null +++ b/tests/regression/data/lookup/src/names/lst_first_name/exceptions.txt @@ -0,0 +1,10 @@ +Ace +Ad +Antonius +Canisius +Erasmus +Heino +Lung +Man +Meander +Wilhelmina diff --git a/tests/regression/data/lookup/src/names/lst_first_name/items.txt b/tests/regression/data/lookup/src/names/lst_first_name/items.txt new file mode 100644 index 00000000..bc7e7431 --- /dev/null +++ 
b/tests/regression/data/lookup/src/names/lst_first_name/items.txt @@ -0,0 +1,18 @@ +Ahmed +Annes +Antonius +Canisius +Daan +Erasmus +Jan +Jan-Willem +Jansen +Kees +Marie +Maxima +Meander +Peter +Piet +Pieter +Wilhelmina +Willem diff --git a/tests/regression/data/lookup/src/names/lst_initial/items.txt b/tests/regression/data/lookup/src/names/lst_initial/items.txt new file mode 100644 index 00000000..cdfdac38 --- /dev/null +++ b/tests/regression/data/lookup/src/names/lst_initial/items.txt @@ -0,0 +1,54 @@ +A +B +C +Ch +Chr +D +E +F +G +H +I +J +K +L +M +N +O +P +Ph +Q +R +S +T +Th +U +V +W +X +Y +Z +À +Á +Â +Ã +Ä +Å +Ç +È +É +Ê +Ë +Ì +Í +Î +Ï +Ñ +Ó +Ô +Õ +Ö +Ø +Ù +Ü +Š diff --git a/tests/regression/data/lookup/src/names/lst_interfix/items.txt b/tests/regression/data/lookup/src/names/lst_interfix/items.txt new file mode 100644 index 00000000..96d23a2e --- /dev/null +++ b/tests/regression/data/lookup/src/names/lst_interfix/items.txt @@ -0,0 +1,44 @@ +'t +aan de +bij de +d' +da +de +de la +del +den +der +di +dos +du +el +in 't +in den +in het +l' +la +le +lo +op 't +op de +op den +op het +te +ten +ter +uit de +v +v. +v.d. +v/d +van +van 't +van de +van den +van der +van het +van t +vd +vd. +von +zur diff --git a/tests/regression/data/lookup/src/names/lst_interfix_surname/exceptions.txt b/tests/regression/data/lookup/src/names/lst_interfix_surname/exceptions.txt new file mode 100644 index 00000000..87b8f311 --- /dev/null +++ b/tests/regression/data/lookup/src/names/lst_interfix_surname/exceptions.txt @@ -0,0 +1,3 @@ +Amersfoort +Utrecht +Veenendaal diff --git a/tests/regression/data/lookup/src/names/lst_interfix_surname/items.txt b/tests/regression/data/lookup/src/names/lst_interfix_surname/items.txt new file mode 100644 index 00000000..55e81785 --- /dev/null +++ b/tests/regression/data/lookup/src/names/lst_interfix_surname/items.txt @@ -0,0 +1,18 @@ +Alphen +Bakker +Bosch +Boven +Duin +Groot +Heer +Laan +Leeuwen +Noord +Olst +Oude +Schaft +Slot +Sluis +Utrecht +Visser +Wouden diff --git a/tests/regression/data/lookup/src/names/lst_prefix/items.txt b/tests/regression/data/lookup/src/names/lst_prefix/items.txt new file mode 100644 index 00000000..6111ce14 --- /dev/null +++ b/tests/regression/data/lookup/src/names/lst_prefix/items.txt @@ -0,0 +1,45 @@ +bacc +bacc. +bc +bc. +collega +de Hooggeleerde +de Weledelgeleerde +de Weledelzeergeleerde +de heer +de hooggeleerde +de weledelgeleerde +de weledelzeergeleerde +dhr +dhr. +dr +dr. +dr.h.c +dr.h.c. +dra +dra. +drs +drs. +ds +ds. +ing +ing. +ir +ir. +kand +kand. +lec +lec. +mej +mej. +meneer +mevr +mevr. +mevrouw +mijnheer +mr +mr. +mw +mw. 
+prof +prof.de Weledelgeleerde diff --git a/tests/regression/data/lookup/src/names/lst_surname/exceptions.txt b/tests/regression/data/lookup/src/names/lst_surname/exceptions.txt new file mode 100644 index 00000000..67ad9f99 --- /dev/null +++ b/tests/regression/data/lookup/src/names/lst_surname/exceptions.txt @@ -0,0 +1,7 @@ +Bosch +Broer +Groot +Klein +Lang +Oost +Tel diff --git a/tests/regression/data/lookup/src/names/lst_surname/items.txt b/tests/regression/data/lookup/src/names/lst_surname/items.txt new file mode 100644 index 00000000..03d38368 --- /dev/null +++ b/tests/regression/data/lookup/src/names/lst_surname/items.txt @@ -0,0 +1,32 @@ +Ahmadi +Ahmed +Bakker +Bontekoe +Bosch +Boven +Bruins +Bruins Slot +Damhuis +Duin +Goeree +Groot +Groot Wassink +Jansen +Janssen +Janssens +Kees +Killaars +Laan +Nijhuis +Oude Nijhuis +Peter +Piet +Pieter +Schaft +Slot +Sluis +Visser +Wassink +Wiegmans +Zoutenbier +Zuidgeest diff --git a/tests/regression/data/lookup/src/whitelist/lst_common_word/exceptions.txt b/tests/regression/data/lookup/src/whitelist/lst_common_word/exceptions.txt new file mode 100644 index 00000000..0eb5e047 --- /dev/null +++ b/tests/regression/data/lookup/src/whitelist/lst_common_word/exceptions.txt @@ -0,0 +1,6 @@ +bel +boos +groot +helder +hemel +zondag diff --git a/tests/regression/data/lookup/src/whitelist/lst_common_word/items.txt b/tests/regression/data/lookup/src/whitelist/lst_common_word/items.txt new file mode 100644 index 00000000..7bd329c7 --- /dev/null +++ b/tests/regression/data/lookup/src/whitelist/lst_common_word/items.txt @@ -0,0 +1,12 @@ +boven +en +groot +heer +noord +oktober +oog +raam +slot +soms +we +wijs diff --git a/tests/regression/data/lookup/src/whitelist/lst_eponymous_disease/items.txt b/tests/regression/data/lookup/src/whitelist/lst_eponymous_disease/items.txt new file mode 100644 index 00000000..f6c5cb15 --- /dev/null +++ b/tests/regression/data/lookup/src/whitelist/lst_eponymous_disease/items.txt @@ -0,0 +1,5 @@ +Alpers-Huttenlocher +Baller-Gerold +Diamond-Blackfan +Gerbec-Morgagni-Adams-Stokes +Non-Hodgkin diff --git a/tests/regression/data/lookup/src/whitelist/lst_eponymous_disease/lst_eponymous_single/items.txt b/tests/regression/data/lookup/src/whitelist/lst_eponymous_disease/lst_eponymous_single/items.txt new file mode 100644 index 00000000..33d5c540 --- /dev/null +++ b/tests/regression/data/lookup/src/whitelist/lst_eponymous_disease/lst_eponymous_single/items.txt @@ -0,0 +1,5 @@ +Dupuytren ziekte +Krabbe ziekte +ziekte van Eales +ziekte van Glanzmann +ziekte van Laron diff --git a/tests/regression/data/lookup/src/whitelist/lst_eponymous_disease/lst_eponymous_single/transform.json b/tests/regression/data/lookup/src/whitelist/lst_eponymous_disease/lst_eponymous_single/transform.json new file mode 100644 index 00000000..8f0e933a --- /dev/null +++ b/tests/regression/data/lookup/src/whitelist/lst_eponymous_disease/lst_eponymous_single/transform.json @@ -0,0 +1,22 @@ +{ + "transforms": { + "ziekte_1": { + " ziekte$": [ + " ziekte", + "' ziekte", + "'s ziekte" + ] + }, + "ziekte_2": { + "ziekte": [ + "ziekte", + "syndroom", + "afwijking", + "tumor", + "reactie", + "complex", + "aandoening" + ] + } + } +} \ No newline at end of file diff --git a/tests/regression/data/lookup/src/whitelist/lst_eponymous_disease/transform.json b/tests/regression/data/lookup/src/whitelist/lst_eponymous_disease/transform.json new file mode 100644 index 00000000..1975e8bf --- /dev/null +++ b/tests/regression/data/lookup/src/whitelist/lst_eponymous_disease/transform.json 
@@ -0,0 +1,39 @@
+{
+  "transforms": {
+    "ziekte_1": {
+      " ziekte$": [
+        " ziekte",
+        "' ziekte",
+        "'s ziekte"
+      ]
+    },
+    "ziekte_2": {
+      "ziekte": [
+        "ziekte",
+        "syndroom",
+        "afwijking",
+        "tumor",
+        "reactie",
+        "complex",
+        "aandoening"
+      ]
+    },
+    "prop": {
+      "\\bVon": [
+        "Von",
+        "von"
+      ]
+    },
+    "punct": {
+      "\\.": [
+        ".",
+        ""
+      ],
+      "-": [
+        "-",
+        "",
+        " "
+      ]
+    }
+  }
+}
\ No newline at end of file
diff --git a/tests/regression/data/lookup/src/whitelist/lst_medical_term/items.txt b/tests/regression/data/lookup/src/whitelist/lst_medical_term/items.txt
new file mode 100644
index 00000000..5d5e74c2
--- /dev/null
+++ b/tests/regression/data/lookup/src/whitelist/lst_medical_term/items.txt
@@ -0,0 +1,18 @@
+alzheimer
+auricularis
+brachialis
+canisius
+cava
+coli
+functies
+inferior
+kinetic
+mdl
+multiple
+neuroloog
+olanzapine
+schaambeen
+suralis
+ulna
+weledelgeleerde
+zetpil
diff --git a/tests/regression/data/lookup/src/whitelist/lst_stop_word/items.txt b/tests/regression/data/lookup/src/whitelist/lst_stop_word/items.txt
new file mode 100644
index 00000000..d1a981ec
--- /dev/null
+++ b/tests/regression/data/lookup/src/whitelist/lst_stop_word/items.txt
@@ -0,0 +1,6 @@
+doen
+een
+en
+kan
+nu
+zonder
diff --git a/tests/regression/test_regression.py b/tests/regression/test_regression.py
index 1a102d3a..28862722 100644
--- a/tests/regression/test_regression.py
+++ b/tests/regression/test_regression.py
@@ -1,11 +1,23 @@
 import json
 from typing import Optional
 
+import pytest
+
 from docdeid import Annotation, AnnotationSet
 
 from deduce import Deduce
 
 
+@pytest.fixture
+def model(shared_datadir):
+    # FIXME pytest-datadir creates a fresh copy of `shared_datadir` for every
+    #  test, so this fixture cannot be reused across the tests in this module
+    #  or package.
+    return Deduce(build_lookup_structs=True,
+                  save_lookup_structs=False,
+                  lookup_data_path=shared_datadir / "lookup")
+
+
 def regression_test(
     model: Deduce,
    examples_file: str,
From 7924ff3952ac78bd86183fa7088d97e3cca7be40 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?=
Date: Mon, 4 Mar 2024 23:21:34 +0100
Subject: [PATCH 11/39] Make `ensure_path` a plain util function

---
 deduce/deduce.py         | 23 ++++++++++-------------
 deduce/utils.py          |  9 ++++++++-
 tests/unit/test_utils.py |  2 --
 3 files changed, 18 insertions(+), 16 deletions(-)

diff --git a/deduce/deduce.py b/deduce/deduce.py
index a12aadbb..c527a18c 100644
--- a/deduce/deduce.py
+++ b/deduce/deduce.py
@@ -29,6 +29,7 @@
 
 __version__ = importlib.metadata.version(__package__ or __name__)
 
+from deduce.utils import ensure_path
 
 _BASE_PATH = Path(os.path.dirname(__file__)).parent
 _LOOKUP_LIST_PATH = _BASE_PATH / "deduce" / "data" / "lookup"
@@ -59,8 +60,19 @@ class Deduce(dd.DocDeid):  # pylint: disable=R0903
         the package. If you want to make changes to source files, it's
         recommended to copy the source data and pointing deduce to this folder
         with this argument.
-        build_lookup_structs: Will always reload and rebuild lookup structs rather than
-            using the cache when this is set to `True`.
+        build_lookup_structs: Will always reload and rebuild lookup structs
+            rather than using the cache when this is set to `True`.
+        save_lookup_structs: Whether to pickle (cache) the lookup structs
+            after building, so later runs can load them quickly. Default: `True`.
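+
+    Example:
+        A minimal usage sketch (the lookup-data path is hypothetical;
+        point it at your own copy of the lookup data)::
+
+            deduce = Deduce(lookup_data_path="my_lookup_data",
+                            build_lookup_structs=True,
+                            save_lookup_structs=False)
+            doc = deduce.deidentify("Jan Jansen woont in Utrecht.")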
""" def __init__( # pylint: disable=R0913 @@ -90,10 +93,11 @@ def __init__( # pylint: disable=R0913 load_base_config=load_base_config, user_config=config ) - self.lookup_data_path = self._initialize_lookup_data_path(lookup_data_path) + self.lookup_data_path = ensure_path(lookup_data_path) logging.info('Going to init tokenizers.') - self.tokenizers = {"default": self._initialize_tokenizer(self.lookup_data_path)} + self.tokenizers = { + "default": self._initialize_tokenizer(self.lookup_data_path)} logging.debug('Done initing tokenizers.') self.lookup_structs = get_lookup_structs( @@ -105,7 +109,8 @@ def __init__( # pylint: disable=R0913 ) logging.info('Done loading lookup structs.') - extras = {"tokenizer": self.tokenizers["default"], "ds": self.lookup_structs} + extras = {"tokenizer": self.tokenizers["default"], + "ds": self.lookup_structs} logging.info('Going to load the Deduce processor.') self.processors = _DeduceProcessorLoader().load( @@ -142,14 +147,6 @@ def _initialize_config( return frozendict(config) - @staticmethod - def _initialize_lookup_data_path(lookup_data_path: Union[str, Path]) -> Path: - - if isinstance(lookup_data_path, str): - lookup_data_path = Path(lookup_data_path) - - return lookup_data_path - @staticmethod def _initialize_tokenizer(lookup_data_path: Path) -> dd.Tokenizer: diff --git a/deduce/utils.py b/deduce/utils.py index b8822abd..b39657de 100644 --- a/deduce/utils.py +++ b/deduce/utils.py @@ -3,7 +3,7 @@ import json import re from pathlib import Path -from typing import Optional +from typing import Optional, Union import docdeid as dd from docdeid import Tokenizer @@ -281,3 +281,10 @@ def lookup_set_to_trie( trie.add_item([token.text for token in tokenizer.tokenize(item)]) return trie + + +def ensure_path(path_or_str: Union[str, Path]) -> Path: + """\ + Casts the argument as a `Path` if it's not a `Path` already. + """ + return path_or_str if isinstance(path_or_str, Path) else Path(path_or_str) diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py index ad055420..e4620c78 100644 --- a/tests/unit/test_utils.py +++ b/tests/unit/test_utils.py @@ -1,5 +1,3 @@ -from pathlib import Path - import docdeid as dd import pytest From f2d967581e9d0c8d76dae9e90f1e79aa2b4d9c4b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Tue, 5 Mar 2024 13:29:58 +0100 Subject: [PATCH 12/39] Add a rant (a FIXME) about transformations --- deduce/utils.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/deduce/utils.py b/deduce/utils.py index b39657de..3cea9c4e 100644 --- a/deduce/utils.py +++ b/deduce/utils.py @@ -213,6 +213,23 @@ def apply_transform(items: set[str], transform_config: dict) -> set[str]: to_add = [] for item in items: + # FIXME Why _add_ the result of `str_variations` rather than + # replace the original item? In most cases, manual effort was + # exerted to include also the original string in + # the replacements, however some transformations do not include + # it (e.g. for "(?<=\\()Ut(?=\\))", the surrounding parens are + # always dropped). I guess that these transformations do not + # include the original version because it's supposed to be + # dropped. Or if the original version ("(Ut)" in this case) was + # supposed to be kept, by not including it explicitly yet + # _adding_ all variations to the set of terms, the net effect is + # that just all _other_ transformations within the string will + # be excluded in the version that keeps the original "(Ut)". 
+ # + # We should either avoid combining the result of `str_variations` + # with the original set, `{item}`, or _always_ apply the void + # transformation so as to save effort in writing + # the `transform.json` configs and prevent subtle bugs. to_add += str_variations(item, transform) items.update(to_add) From 7f46345e11e94250ab7d359721799c508993a5fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Tue, 5 Mar 2024 16:01:53 +0100 Subject: [PATCH 13/39] Reproduce the "de Quervain" issue Leaving the test case commented out for now. This won't be a frequent problem but it's something I noticed when first trying out this tool. --- .../pipeline/data/lookup/src/names/lst_interfix/items.txt | 7 ++++--- .../pipeline/data/lookup/src/names/lst_surname/items.txt | 2 ++ .../lst_eponymous_disease/lst_eponymous_single/items.txt | 1 + tests/pipeline/test_deduce.py | 8 ++++++++ 4 files changed, 15 insertions(+), 3 deletions(-) diff --git a/tests/pipeline/data/lookup/src/names/lst_interfix/items.txt b/tests/pipeline/data/lookup/src/names/lst_interfix/items.txt index 73318d7b..70721d37 100644 --- a/tests/pipeline/data/lookup/src/names/lst_interfix/items.txt +++ b/tests/pipeline/data/lookup/src/names/lst_interfix/items.txt @@ -1,5 +1,6 @@ -uit de -von -in het +aan de de in 't +in het +uit de +von \ No newline at end of file diff --git a/tests/pipeline/data/lookup/src/names/lst_surname/items.txt b/tests/pipeline/data/lookup/src/names/lst_surname/items.txt index 770a5ece..97509bb7 100644 --- a/tests/pipeline/data/lookup/src/names/lst_surname/items.txt +++ b/tests/pipeline/data/lookup/src/names/lst_surname/items.txt @@ -2,3 +2,5 @@ Jansen Killaars Peter Visser +Quervain +de Quervain \ No newline at end of file diff --git a/tests/pipeline/data/lookup/src/whitelist/lst_eponymous_disease/lst_eponymous_single/items.txt b/tests/pipeline/data/lookup/src/whitelist/lst_eponymous_disease/lst_eponymous_single/items.txt index 000ca362..4ce5d915 100644 --- a/tests/pipeline/data/lookup/src/whitelist/lst_eponymous_disease/lst_eponymous_single/items.txt +++ b/tests/pipeline/data/lookup/src/whitelist/lst_eponymous_disease/lst_eponymous_single/items.txt @@ -3,3 +3,4 @@ ziekte van Glanzmann Krabbe ziekte Dupuytren ziekte ziekte van Laron +de Quervain ziekte \ No newline at end of file diff --git a/tests/pipeline/test_deduce.py b/tests/pipeline/test_deduce.py index 19620b28..0a36a5f9 100644 --- a/tests/pipeline/test_deduce.py +++ b/tests/pipeline/test_deduce.py @@ -10,12 +10,19 @@ "jaar oud en woonachtig in Utrecht, IJSWEG 10r. Hij werd op 10 oktober 2018 door arts " "Peter de Visser ontslagen van de kliniek van het UMCU. Voor nazorg kan hij " "worden bereikt via j.JNSEN.123@gmail.com of (06)12345678." + # FIXME "aan de" is joined to one token (due to "lst_interfix/items.txt"), + # preventing "de Quervain ziekte" from matching. Furthermore, when I + # managed to get this term censored, the "aan" word was censored, too. + # Use a simple whitespace/punctuation-based tokenizer for that annotator + # to fix this issue. + # " De patient lijdt aan de Quervain ziekte." ) @pytest.fixture def model(shared_datadir): return Deduce(save_lookup_structs=False, + build_lookup_structs=True, lookup_data_path=shared_datadir / "lookup") @@ -93,6 +100,7 @@ def test_annotate_intext(self, model): "UMCU. Voor nazorg kan hij worden " "bereikt via j.JNSEN.123@gmail.com of " "(06)12345678." + # " De patient lijdt aan de Quervain ziekte." 
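+    #
+    #  A sketch of such a tokenizer (purely hypothetical; it is not wired
+    #  into any annotator yet):
+    #
+    #      import re
+    #
+    #      def ws_punct_tokenize(text: str) -> list[str]:
+    #          # Words and single punctuation marks become separate tokens,
+    #          # so "aan de" stays two tokens and "de Quervain ziekte"
+    #          # can still match as a token sequence.
+    #          return re.findall(r"\w+|[^\w\s]", text)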
) assert dd.utils.annotate_intext(doc) == expected_intext_annotated From 3620b0858c78157f48a0e15ebae0783cb877a7f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Tue, 5 Mar 2024 16:40:19 +0100 Subject: [PATCH 14/39] Test overzealous matching of patients --- tests/pipeline/test_deduce.py | 104 +++++++++++++++++++++------------- 1 file changed, 65 insertions(+), 39 deletions(-) diff --git a/tests/pipeline/test_deduce.py b/tests/pipeline/test_deduce.py index 0a36a5f9..d9472aec 100644 --- a/tests/pipeline/test_deduce.py +++ b/tests/pipeline/test_deduce.py @@ -9,7 +9,9 @@ "betreft: Jan Jansen, bsn 111222333, patnr 000334433. De patient J. Jansen is 64 " "jaar oud en woonachtig in Utrecht, IJSWEG 10r. Hij werd op 10 oktober 2018 door arts " "Peter de Visser ontslagen van de kliniek van het UMCU. Voor nazorg kan hij " - "worden bereikt via j.JNSEN.123@gmail.com of (06)12345678." + "worden bereikt via j.JNSEN.123@gmail.com of (06)12345678. " + "Vader, Peter Jansen, 104 jr, woont ook in Utrecht. Met collegiale groeten, " + "Jan de Visser." # FIXME "aan de" is joined to one token (due to "lst_interfix/items.txt"), # preventing "de Quervain ziekte" from matching. Furthermore, when I # managed to get this term censored, the "aan" word was censored, too. @@ -32,42 +34,56 @@ def test_annotate(self, model): doc = model.deidentify(text, metadata=metadata) expected_annotations = { - dd.Annotation( - text="(06)12345678", - start_char=284, - end_char=296, - tag="telefoonnummer", - ), - dd.Annotation(text="111222333", start_char=25, end_char=34, tag="bsn"), - dd.Annotation( - text="Peter de Visser", start_char=165, end_char=180, tag="persoon" - ), - dd.Annotation( - text="j.JNSEN.123@gmail.com", - start_char=259, - end_char=280, - tag="emailadres", - ), - dd.Annotation( - text="J. Jansen", start_char=64, end_char=73, tag="patient" - ), - dd.Annotation( - text="Jan Jansen", start_char=9, end_char=19, tag="patient" - ), - dd.Annotation( - text="10 oktober 2018", start_char=139, end_char=154, tag="datum" - ), - dd.Annotation(text="64", start_char=77, end_char=79, tag="leeftijd"), - dd.Annotation(text="000334433", start_char=42, end_char=51, tag="id"), - dd.Annotation( - text="Utrecht", start_char=106, end_char=113, tag="locatie" - ), - dd.Annotation( - text="IJSWEG 10r", start_char=115, end_char=125, tag="locatie" - ), - dd.Annotation( - text="UMCU", start_char=214, end_char=218, tag="ziekenhuis" - ), + dd.Annotation( + text="(06)12345678", + start_char=284, + end_char=296, + tag="telefoonnummer", + ), + dd.Annotation(text="111222333", start_char=25, end_char=34, tag="bsn"), + dd.Annotation( + text="Peter de Visser", start_char=165, end_char=180, tag="persoon" + ), + dd.Annotation( + text="j.JNSEN.123@gmail.com", + start_char=259, + end_char=280, + tag="emailadres", + ), + dd.Annotation( + text="J. 
Jansen", start_char=64, end_char=73, tag="patient" + ), + dd.Annotation( + text="Jan Jansen", start_char=9, end_char=19, tag="patient" + ), + dd.Annotation( + text="10 oktober 2018", start_char=139, end_char=154, tag="datum" + ), + dd.Annotation(text="64", start_char=77, end_char=79, tag="leeftijd"), + dd.Annotation(text="000334433", start_char=42, end_char=51, tag="id"), + dd.Annotation( + text="Utrecht", start_char=106, end_char=113, tag="locatie" + ), + dd.Annotation( + text="IJSWEG 10r", start_char=115, end_char=125, tag="locatie" + ), + dd.Annotation( + text="UMCU", start_char=214, end_char=218, tag="ziekenhuis" + ), + dd.Annotation( + text="Peter Jansen", start_char=305, end_char=317, + tag="persoon" + ), + dd.Annotation( + text="104", start_char=319, end_char=322, tag="leeftijd" + ), + dd.Annotation( + text="Utrecht", start_char=340, end_char=347, tag="locatie" + ), + dd.Annotation( + text="Jan de Visser", start_char=373, end_char=386, + tag="persoon" + ), } assert set(doc.annotations) == expected_annotations @@ -81,7 +97,13 @@ def test_deidentify(self, model): "[LEEFTIJD-1] jaar oud en woonachtig in [LOCATIE-1], [LOCATIE-2]. Hij werd " "op [DATUM-1] door arts [PERSOON-1] ontslagen van de kliniek van het " "[ZIEKENHUIS-1]. Voor nazorg kan hij worden bereikt via [EMAILADRES-1] " - "of [TELEFOONNUMMER-1]." + "of [TELEFOONNUMMER-1]. Vader, [PERSOON-2], [LEEFTIJD-2] jr, woont " + # XXX Btw, if we wanted more perfect security, we should + # not give away whether two mentions of age (or street or + # anything) were equal before deidentification or not. + # Concretely, it shouldn't matter whether LEEFTIJD-1 is the same + # as LEEFTIJD-2. + "ook in [LOCATIE-1]. Met collegiale groeten, [PERSOON-3]." ) assert doc.deidentified_text == expected_deidentified @@ -101,6 +123,10 @@ def test_annotate_intext(self, model): "bereikt via j.JNSEN.123@gmail.com of " "(06)12345678." # " De patient lijdt aan de Quervain ziekte." + " Vader, Peter Jansen, " + "104 jr, woont ook in " + "Utrecht. Met collegiale groeten, " + "Jan de Visser." 
) - assert dd.utils.annotate_intext(doc) == expected_intext_annotated + assert dd.utils.annotate_intext(doc) == expected_intext_annotated \ No newline at end of file From cabbecac679a51067093840ffbe2fda4c291c72c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Tue, 5 Mar 2024 22:52:34 +0100 Subject: [PATCH 15/39] Label only entities with all pat subtags as pat --- base_config.json | 32 ++++++++++++--- deduce/annotation_processor.py | 35 +++++++++------- deduce/annotator.py | 74 +++++++++++++++++++++------------- 3 files changed, 94 insertions(+), 47 deletions(-) diff --git a/base_config.json b/base_config.json index d0f37871..621bc5fd 100644 --- a/base_config.json +++ b/base_config.json @@ -177,6 +177,32 @@ "args": { "iterative": true, "pattern": [ + { + "name": "patient_left", + "direction": "left", + "pre_tag": [ + "achternaam_patient" + ], + "tag": "voornaam_patient+achternaam_patient", + "pattern": [ + { + "lookup": "patient.first_names" + } + ] + }, + { + "name": "patient_right", + "direction": "right", + "pre_tag": [ + "voornaam_patient" + ], + "tag": "voornaam_patient+achternaam_patient", + "pattern": [ + { + "lookup": "patient.surname" + } + ] + }, { "name": "interfix_right", "direction": "right", @@ -295,11 +321,7 @@ "skip": ["."], "pattern": [ { - "and": [ - { - "lookup": "prefix" - } - ] + "lookup": "prefix" } ] } diff --git a/deduce/annotation_processor.py b/deduce/annotation_processor.py index de482f8a..1f5ff689 100644 --- a/deduce/annotation_processor.py +++ b/deduce/annotation_processor.py @@ -56,13 +56,15 @@ def _adjacent_annotations_replacement( class PersonAnnotationConverter(dd.process.AnnotationProcessor): """ - Responsible for processing the annotations produced by all name annotators (regular - and context-based). - - Any overlap with annotations that are contain "pseudo" in their tag are removed, as - are those annotations. Then resolves overlap between remaining annotations, and maps - the tags to either "patient" or "persoon", based on whether "patient" is in the tag - (e.g. voornaam_patient => patient, achternaam_onbekend => persoon). + Responsible for processing the annotations produced by all name annotators + (regular and context-based). + + Any overlap with annotations that contain "pseudo" in their tag is removed, + as are those annotations. Then resolves overlap between remaining + annotations, and maps the tags to either "patient" or "persoon", based on + whether "patient" is in all constituent tags + (e.g. voornaam_patient+achternaam_patient => patient, + achternaam_onbekend => persoon). 
""" def __init__(self) -> None: @@ -89,16 +91,19 @@ def process_annotations( annotations, text=text ) - return dd.AnnotationSet( + real_annos = (anno for anno in new_annotations + if "pseudo" not in anno.tag and anno.text.strip()) + with_patient = ( dd.Annotation( - text=annotation.text, - start_char=annotation.start_char, - end_char=annotation.end_char, - tag="patient" if "patient" in annotation.tag else "persoon", + text=anno.text, + start_char=anno.start_char, + end_char=anno.end_char, + tag="patient" if all( + "patient" in subtag for subtag in anno.tag.split('+') + ) else "persoon", ) - for annotation in new_annotations - if ("pseudo" not in annotation.tag and len(annotation.text.strip()) != 0) - ) + for anno in real_annos) + return dd.AnnotationSet(with_patient) class RemoveAnnotations(dd.process.AnnotationProcessor): diff --git a/deduce/annotator.py b/deduce/annotator.py index 9fdcf9a3..a483cfc7 100644 --- a/deduce/annotator.py +++ b/deduce/annotator.py @@ -2,7 +2,7 @@ import re import warnings -from typing import Literal, Optional +from typing import Literal, Optional, Any import docdeid as dd from docdeid import Annotation, Document, Tokenizer @@ -81,9 +81,9 @@ def match(cls, pattern_position: dict, **kwargs) -> bool: # pylint: disable=R09 and not any(ch.isdigit() for ch in kwargs.get("token").text) ) == value if func == "lookup": - return kwargs.get("token").text in kwargs.get("ds")[value] + return cls._lookup(value, **kwargs) if func == "neg_lookup": - return kwargs.get("token").text not in kwargs.get("ds")[value] + return not cls._lookup(value, **kwargs) if func == "and": return all( _PatternPositionMatcher.match(pattern_position=x, **kwargs) @@ -97,6 +97,21 @@ def match(cls, pattern_position: dict, **kwargs) -> bool: # pylint: disable=R09 raise NotImplementedError(f"No known logic for pattern {func}") + @classmethod + def _lookup(cls, ent_type: str, **kwargs) -> bool: + token = kwargs.get("token").text + if '.' in ent_type: + meta_key, meta_attr = ent_type.split('.', 1) + try: + meta_val = getattr(kwargs['metadata'][meta_key], meta_attr) + except (TypeError, KeyError, AttributeError): + return False + else: + return (token == meta_val if isinstance(meta_val, str) + else token in meta_val) + else: + return token in kwargs.get("ds")[ent_type] + class TokenPatternAnnotator(dd.process.Annotator): """ @@ -158,14 +173,14 @@ def _get_chained_token( return token - def _match_sequence( # pylint: disable=R0913 - self, - text: str, - pattern: list[dict], - start_token: dd.tokenizer.Token, - direction: Literal["left", "right"] = "right", - skip: Optional[set[str]] = None, - ) -> Optional[dd.Annotation]: + def _match_sequence(self, + text: str, + pattern: list[dict], + start_token: dd.tokenizer.Token, + direction: Literal["left", "right"] = "right", + skip: Optional[set[str]] = None, + metadata: Optional[dict[str, Any]]=None) \ + -> Optional[dd.Annotation]: """ Sequentially match a pattern against a specified start_token. @@ -175,6 +190,7 @@ def _match_sequence( # pylint: disable=R0913 start_token: The start token to match. direction: The direction to match, choice of "left" or "right". skip: Any string values that should be skipped in matching. + metadata: Document metadata (like the patient name). Returns: An Annotation if matching is possible, None otherwise. 
@@ -190,7 +206,10 @@ def _match_sequence( # pylint: disable=R0913 for pattern_position in pattern: if current_token is None or not _PatternPositionMatcher.match( - pattern_position=pattern_position, token=current_token, ds=self.ds + pattern_position=pattern_position, + token=current_token, + ds=self.ds, + metadata=metadata, ): return None @@ -235,8 +254,8 @@ def annotate(self, doc: dd.Document) -> list[dd.Annotation]: for token in tokens: annotation = self._match_sequence( - doc.text, self.pattern, token, direction="right", skip=self.skip - ) + doc.text, self.pattern, token, + direction="right", skip=self.skip) if annotation is not None: annotations.append(annotation) @@ -264,9 +283,12 @@ def __init__( self.iterative = iterative super().__init__(*args, **kwargs, ds=ds, tag="_") - def _apply_context_pattern( - self, text: str, annotations: dd.AnnotationSet, context_pattern: dict - ) -> dd.AnnotationSet: + def _apply_context_pattern(self, + text: str, + annotations: dd.AnnotationSet, + context_pattern: dict, + metadata: Optional[dict[str, Any]]=None) \ + -> dd.AnnotationSet: direction = context_pattern["direction"] skip = set(context_pattern.get("skip", [])) @@ -285,12 +307,8 @@ def _apply_context_pattern( _DIRECTION_MAP[direction]["start_token"](annotation), attr, skip ) new_annotation = self._match_sequence( - text, - context_pattern["pattern"], - start_token, - direction=direction, - skip=skip, - ) + text, context_pattern["pattern"], start_token, + direction=direction, skip=skip, metadata=metadata) if new_annotation: left_ann, right_ann = _DIRECTION_MAP[direction]["order"]( @@ -312,7 +330,8 @@ def _apply_context_pattern( return annotations - def _annotate(self, text: str, annotations: dd.AnnotationSet) -> dd.AnnotationSet: + def _annotate(self, text: str, annotations: dd.AnnotationSet, + metadata=None) -> dd.AnnotationSet: """ Does the annotation, by calling _apply_context_pattern, and then optionally recursing. Also keeps track of the (un)changed annotations, so they are not @@ -321,6 +340,7 @@ def _annotate(self, text: str, annotations: dd.AnnotationSet) -> dd.AnnotationSe Args: text: The input text. annotations: The input annotations. + metadata: Document metadata (like the patient name). Returns: An extended set of annotations, based on the patterns provided. @@ -330,8 +350,7 @@ def _annotate(self, text: str, annotations: dd.AnnotationSet) -> dd.AnnotationSe for context_pattern in self.pattern: annotations = self._apply_context_pattern( - text, annotations, context_pattern - ) + text, annotations, context_pattern, metadata) if self.iterative: @@ -356,7 +375,8 @@ def annotate(self, doc: dd.Document) -> list[dd.Annotation]: An empty list, as annotations are modified and not added. 
""" - doc.annotations = self._annotate(doc.text, doc.annotations) + doc.annotations = self._annotate( + doc.text, doc.annotations, doc.metadata) return [] From 3b7c20cddf5ef9308c08a03ebf8e55aecef47707 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Wed, 6 Mar 2024 11:40:29 +0100 Subject: [PATCH 16/39] Add a more extensive test for patient name --- .../lookup/src/names/lst_first_name/items.txt | 2 +- .../data/lookup/src/names/lst_interfix/items.txt | 1 + tests/pipeline/test_deduce.py | 16 +++++++++++++++- 3 files changed, 17 insertions(+), 2 deletions(-) diff --git a/tests/pipeline/data/lookup/src/names/lst_first_name/items.txt b/tests/pipeline/data/lookup/src/names/lst_first_name/items.txt index eaa20709..8bf097e6 100644 --- a/tests/pipeline/data/lookup/src/names/lst_first_name/items.txt +++ b/tests/pipeline/data/lookup/src/names/lst_first_name/items.txt @@ -1,4 +1,4 @@ Annes Jan Jansen -Peter +Peter \ No newline at end of file diff --git a/tests/pipeline/data/lookup/src/names/lst_interfix/items.txt b/tests/pipeline/data/lookup/src/names/lst_interfix/items.txt index 70721d37..a00f7851 100644 --- a/tests/pipeline/data/lookup/src/names/lst_interfix/items.txt +++ b/tests/pipeline/data/lookup/src/names/lst_interfix/items.txt @@ -3,4 +3,5 @@ de in 't in het uit de +van den von \ No newline at end of file diff --git a/tests/pipeline/test_deduce.py b/tests/pipeline/test_deduce.py index d9472aec..09c29f04 100644 --- a/tests/pipeline/test_deduce.py +++ b/tests/pipeline/test_deduce.py @@ -129,4 +129,18 @@ def test_annotate_intext(self, model): "Jan de Visser." ) - assert dd.utils.annotate_intext(doc) == expected_intext_annotated \ No newline at end of file + assert dd.utils.annotate_intext(doc) == expected_intext_annotated + + def test_patient_2(self, model): + metadata = {"patient": Person(first_names=["Jan"], surname="Jansen")} + doc = ("Lorem ipsum JANSEN sit amet, Peter Jansen adipiscing elit. " + "Curabitur J. Jansen sapien, J. P. Jansen a vestibulum quis, " + "facilisis vel J Jansen. Jan de Visser iaculis gravida nulla. " + "Etiam quis Jan van den Jansen.") + want = ("Lorem ipsum [PATIENT] sit amet, [PERSOON-1] adipiscing elit. " + "Curabitur [PATIENT] sapien, [PERSOON-2] a vestibulum quis, " + "facilisis vel [PATIENT]. [PERSOON-3] iaculis gravida nulla. " + "Etiam quis [PERSOON-4].") + + deid = model.deidentify(doc, metadata=metadata) + assert deid.deidentified_text == want From 1eadb3ce82b79de456e9b06ea867a78bfc1d5505 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Wed, 6 Mar 2024 14:01:07 +0100 Subject: [PATCH 17/39] Titlecase patient names when matching --- deduce/utils.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/deduce/utils.py b/deduce/utils.py index 3cea9c4e..81525e8a 100644 --- a/deduce/utils.py +++ b/deduce/utils.py @@ -9,8 +9,16 @@ from docdeid import Tokenizer from rapidfuzz.distance import DamerauLevenshtein +from docdeid.str import LowercaseTail -def str_match(str_1: str, str_2: str, max_edit_distance: Optional[int] = None) -> bool: + +_TITLECASER = LowercaseTail() + + +def str_match(str_1: str, str_2: str, + max_edit_distance: Optional[int] = None, + titlecase: bool = True, + ) -> bool: """ Match two strings, potentially in a fuzzy way. @@ -23,13 +31,16 @@ def str_match(str_1: str, str_2: str, max_edit_distance: Optional[int] = None) - Returns: ``True`` if the strings match, ``False`` otherwise. 
""" + norm_1, norm_2 = ((_TITLECASER.process(str_1), _TITLECASER.process(str_2)) + if titlecase + else (str_1, str_2)) if max_edit_distance is not None: return ( - DamerauLevenshtein.distance(str_1, str_2, score_cutoff=max_edit_distance) + DamerauLevenshtein.distance(norm_1, norm_2, score_cutoff=max_edit_distance) <= max_edit_distance ) - return str_1 == str_2 + return norm_1 == norm_2 def class_for_name(module_name: str, class_name: str) -> type: From 010840b872ca9e2ef59f81dd7e35e95a7a4ca666 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Wed, 6 Mar 2024 14:05:58 +0100 Subject: [PATCH 18/39] Make (patient, persoon) become persoon Otherwise, random names are labeled as "patient", which will be wrong in most cases. --- deduce/annotation_processor.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/deduce/annotation_processor.py b/deduce/annotation_processor.py index 1f5ff689..a7e152bb 100644 --- a/deduce/annotation_processor.py +++ b/deduce/annotation_processor.py @@ -37,17 +37,18 @@ def _adjacent_annotations_replacement( """ Replace two annotations that have equal tags with a new annotation. - If one of the two annotations has the patient tag, the new annotation will also - be tagged patient. In other cases, the tags are already equal. + If one of the two annotations has the "patient" tag (and the other is + either "patient" or "persoon"), the other annotation will be used. + In other cases, the tags are always equal. """ if left_annotation.tag != right_annotation.tag: - replacement_tag = "patient" + replacement_tag = "persoon" else: replacement_tag = left_annotation.tag return dd.Annotation( - text=text[left_annotation.start_char : right_annotation.end_char], + text=text[left_annotation.start_char:right_annotation.end_char], start_char=left_annotation.start_char, end_char=right_annotation.end_char, tag=replacement_tag, From 4ec583658eb7ce66f3fa518ddaca88198f246c7d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Wed, 6 Mar 2024 15:16:22 +0100 Subject: [PATCH 19/39] Retain first-/surname distinction longer... ...and use it to determine where patient name is to be merged with a neighbouring person mention and when not. --- deduce/annotation_processor.py | 48 ++++++++++++++++++++++++---------- deduce/deduce.py | 15 +++++++++++ 2 files changed, 49 insertions(+), 14 deletions(-) diff --git a/deduce/annotation_processor.py b/deduce/annotation_processor.py index a7e152bb..c4d0ff8c 100644 --- a/deduce/annotation_processor.py +++ b/deduce/annotation_processor.py @@ -6,9 +6,16 @@ class DeduceMergeAdjacentAnnotations(dd.process.MergeAdjacentAnnotations): - """Merge adjacent tags, according to deduce logic: adjacent annotations with mixed - patient/person tags are replaced with a patient annotation, in other cases only - annotations with equal tags are considered adjacent.""" + """\ + Merges adjacent tags, according to Deduce logic: + + - adjacent annotations with mixed patient/person tags are replaced + with the "persoon" annotation; + - adjacent annotations with patient tags of which one is the surname + are replaced with the "patient" annotation; and + - adjacent annotations with other patient tags are replaced with + the "part_of_patient" annotation. + """ def _tags_match(self, left_tag: str, right_tag: str) -> bool: """ @@ -23,10 +30,14 @@ def _tags_match(self, left_tag: str, right_tag: str) -> bool: ``True`` if tags match, ``False`` otherwise. 
""" - return (left_tag == right_tag) or {left_tag, right_tag} == { - "patient", - "persoon", - } + patient_part = [tag.endswith('_patient') + for tag in (left_tag, right_tag)] + # FIXME Ideally, we should be first looking for a `*_patient` tag in + # both directions and only failing that, merge with an adjacent + # "persoon" tag. + return (left_tag == right_tag or + all(patient_part) or + (patient_part[0] and right_tag == "persoon")) def _adjacent_annotations_replacement( self, @@ -42,10 +53,14 @@ def _adjacent_annotations_replacement( In other cases, the tags are always equal. """ - if left_annotation.tag != right_annotation.tag: - replacement_tag = "persoon" - else: - replacement_tag = left_annotation.tag + ltag = left_annotation.tag + rtag = right_annotation.tag + replacement_tag = ( + ltag if ltag == rtag else + "persoon" if rtag == "persoon" else + "patient" if any(tag.startswith("achternaam") for tag in + (ltag, rtag)) else + "part_of_patient") return dd.Annotation( text=text[left_annotation.start_char:right_annotation.end_char], @@ -99,13 +114,18 @@ def process_annotations( text=anno.text, start_char=anno.start_char, end_char=anno.end_char, - tag="patient" if all( - "patient" in subtag for subtag in anno.tag.split('+') - ) else "persoon", + tag=PersonAnnotationConverter._resolve_tag(anno.tag) ) for anno in real_annos) return dd.AnnotationSet(with_patient) + @classmethod + def _resolve_tag(cls, tag: str) -> str: + if '+' not in tag: + return tag + return ('patient' if all('patient' in part for part in tag.split('+')) + else 'persoon') + class RemoveAnnotations(dd.process.AnnotationProcessor): """Removes all annotations with corresponding tags.""" diff --git a/deduce/deduce.py b/deduce/deduce.py index c527a18c..9c0ce619 100644 --- a/deduce/deduce.py +++ b/deduce/deduce.py @@ -400,6 +400,21 @@ def _load_post_processors( ), ) + post_group.add_processor( + "patient_cleaner", + CleanAnnotationTag( + tag_map={ + "voornaam_patient": "patient", + "initiaal_patient": "patient", + "achternaam_patient": "patient", + "part_of_patient": "persoon", + # TODO We should probably merge this new "person" + # annotation with neighbouring annotations in yet another + # postprocessing step. 
+ } + ), + ) + post_group.add_processor( "redactor", DeduceRedactor( From ca4977f9f77c755c85fc6c91ac4a1ba5c5aefd66 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Wed, 6 Mar 2024 15:37:18 +0100 Subject: [PATCH 20/39] Fix tests for `PersonAnnotationConverter` --- deduce/annotation_processor.py | 2 +- tests/unit/test_annotation_processor.py | 32 ++++++++++++++++++------- 2 files changed, 24 insertions(+), 10 deletions(-) diff --git a/deduce/annotation_processor.py b/deduce/annotation_processor.py index c4d0ff8c..84465ed2 100644 --- a/deduce/annotation_processor.py +++ b/deduce/annotation_processor.py @@ -122,7 +122,7 @@ def process_annotations( @classmethod def _resolve_tag(cls, tag: str) -> str: if '+' not in tag: - return tag + return tag if 'patient' in tag else 'persoon' return ('patient' if all('patient' in part for part in tag.split('+')) else 'persoon') diff --git a/tests/unit/test_annotation_processor.py b/tests/unit/test_annotation_processor.py index 8e3ce8f5..d08db864 100644 --- a/tests/unit/test_annotation_processor.py +++ b/tests/unit/test_annotation_processor.py @@ -14,16 +14,23 @@ def test_tags_match(self): assert proc._tags_match("a", "a") assert proc._tags_match("huisnummer", "huisnummer") + + # XXX Dubious behaviour: assert proc._tags_match("patient", "patient") + assert proc._tags_match("persoon", "persoon") - assert proc._tags_match("patient", "persoon") - assert proc._tags_match("persoon", "patient") + assert proc._tags_match("initiaal_patient", "persoon") + assert proc._tags_match("initiaal_patient", "name_patient") assert not proc._tags_match("a", "b") assert not proc._tags_match("patient", "huisnummer") assert not proc._tags_match("huisnummer", "patient") assert not proc._tags_match("persoon", "huisnummer") assert not proc._tags_match("huisnummer", "persoon") + assert not proc._tags_match("patient", "persoon") + assert not proc._tags_match("persoon", "patient") + assert not proc._tags_match("name_patient", "patient") + assert not proc._tags_match("persoon", "initiaal_patient") def test_annotation_replacement_equal_tags(self): proc = DeduceMergeAdjacentAnnotations() @@ -84,8 +91,10 @@ def test_patient_no_overlap(self): expected_annotations = dd.AnnotationSet( [ - dd.Annotation(text="Jan", start_char=0, end_char=3, tag="patient"), - dd.Annotation(text="Jansen", start_char=4, end_char=10, tag="patient"), + dd.Annotation(text="Jan", start_char=0, end_char=3, + tag="voornaam_patient"), + dd.Annotation(text="Jansen", start_char=4, end_char=10, + tag="achternaam_patient"), ] ) @@ -107,7 +116,8 @@ def test_patient_with_overlap(self): ) expected_annotations = dd.AnnotationSet( - [dd.Annotation(text="Jan Jansen", start_char=0, end_char=10, tag="patient")] + [dd.Annotation(text="Jan Jansen", start_char=0, end_char=10, + tag="naam_patient")] ) assert proc.process_annotations(annotations, text) == expected_annotations @@ -129,8 +139,10 @@ def test_mixed_no_overlap(self): expected_annotations = dd.AnnotationSet( [ - dd.Annotation(text="Jan", start_char=0, end_char=3, tag="patient"), - dd.Annotation(text="Jansen", start_char=4, end_char=10, tag="persoon"), + dd.Annotation(text="Jan", start_char=0, end_char=3, + tag="voornaam_patient"), + dd.Annotation(text="Jansen", start_char=4, end_char=10, + tag="persoon"), ] ) @@ -153,8 +165,10 @@ def test_mixed_with_overlap(self): expected_annotations = dd.AnnotationSet( [ - dd.Annotation(text="Jan", start_char=0, end_char=3, tag="patient"), - dd.Annotation(text=" Jansen", start_char=3, end_char=10, tag="persoon"), + 
dd.Annotation(text="Jan", start_char=0, end_char=3, + tag="voornaam_patient"), + dd.Annotation(text=" Jansen", start_char=3, end_char=10, + tag="persoon"), ] ) From 8710bcdfa19d8bf48fa5b8d81b4857261726d4f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Wed, 6 Mar 2024 15:40:05 +0100 Subject: [PATCH 21/39] Update documentation slightly --- deduce/annotation_processor.py | 2 +- deduce/utils.py | 4 ++-- docs/source/tutorial.md | 16 ++++++++-------- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/deduce/annotation_processor.py b/deduce/annotation_processor.py index 84465ed2..5568af70 100644 --- a/deduce/annotation_processor.py +++ b/deduce/annotation_processor.py @@ -140,7 +140,7 @@ def process_annotations( class CleanAnnotationTag(dd.process.AnnotationProcessor): - """Cleans annotation tags based on the corresponding mapping.""" + """Renames tags using a mapping.""" def __init__(self, tag_map: dict[str, str]) -> None: self.tag_map = tag_map diff --git a/deduce/utils.py b/deduce/utils.py index 81525e8a..0ba63c76 100644 --- a/deduce/utils.py +++ b/deduce/utils.py @@ -65,8 +65,8 @@ def initialize_class(cls: type, args: dict, extras: dict) -> object: items in extras are passed to the class initializer if they are present. Args: - cls: The class to initialze. - args: The arguments to pass to the initalizer. + cls: The class to initialize. + args: The arguments to pass to the initializer. extras: A superset of arguments that should be passed to the initializer. Will be checked against the class. diff --git a/docs/source/tutorial.md b/docs/source/tutorial.md index 69b894aa..d2084925 100644 --- a/docs/source/tutorial.md +++ b/docs/source/tutorial.md @@ -57,14 +57,14 @@ It's possible to add, remove, apply subsets, or to implement custom annotators, In addition to annotators, a `docdeid` de-identifier contains annotation processors, which do some operation to the set of annotations generated previously, and redactors, which take the annotation and replace them in the text. Other processors included in `deduce` are listed below: -| **Name** | **Group** | **Description** | -|-----------------------------|-----------------|-------------------------------------------------------------------------------------------------------| -| person_annotation_converter | names | Maps name tags to either PERSON or PATIENT, and removes overlap with 'pseudo_name'. | -| remove_street_tags | locations | Removes any matched street names that are not followed by a housenumber | -| clean_street_tags | locations | Cleans up street tags, e.g. straat+huisnummer -> locatie | -| overlap_resolver | post_processing | Makes sure overlap among annotations is resolved. | -| merge_adjacent_annotations | post_processing | If there are any adjacent annotations with the same tag, they are merged into a single annotation. | -| redactor | post_processing | Takes care of replacing the annotated PHIs with `[TAG]` (e.g. `[LOCATION-1]`, `[DATE-2]`) | +| **Name** | **Group** | **Description** | +|-----------------------------|-----------------|-------------------------------------------------------------------------------------------------------------------------------| +| person_annotation_converter | names | Collapses competing annotations for the same span to either "persoon" or "\*patient", and removes overlap with 'pseudo_name'. | +| remove_street_tags | locations | Removes any matched street names that are not followed by a housenumber. 
                                                     |
+| clean_street_tags           | locations       | Renames compound street tags, e.g. "straat+huisnummer", to "locatie".                              |
+| overlap_resolver            | post_processing | Makes sure overlap among annotations is resolved.                                                   |
+| merge_adjacent_annotations  | post_processing | If there are any adjacent annotations with the same tag, they are merged into a single annotation. |
+| redactor                    | post_processing | Takes care of replacing the annotated PHIs with `[TAG]` (e.g. `[LOCATION-1]`, `[DATE-2]`).          |
 
 ### Lookup sets
 
From bf9a910671300c1ff5b2427c83d99d2e3b621274 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?=
Date: Wed, 6 Mar 2024 22:04:29 +0100
Subject: [PATCH 22/39] Enable matching tags of tokens

---
 base_config.json             |  4 +-
 deduce/annotator.py          | 78 +++++++++++++++++++++++++++---------
 tests/unit/test_annotator.py | 25 ++++++++++--
 3 files changed, 83 insertions(+), 24 deletions(-)

diff --git a/base_config.json b/base_config.json
index 621bc5fd..d1841ad7 100644
--- a/base_config.json
+++ b/base_config.json
@@ -186,7 +186,7 @@
                     "tag": "voornaam_patient+achternaam_patient",
                     "pattern": [
                         {
-                            "lookup": "patient.first_names"
+                            "tag": "voornaam_patient"
                         }
                     ]
                 },
@@ -199,7 +199,7 @@
                     "tag": "voornaam_patient+achternaam_patient",
                     "pattern": [
                         {
-                            "lookup": "patient.surname"
+                            "tag": "achternaam_patient"
                         }
                     ]
                 },
diff --git a/deduce/annotator.py b/deduce/annotator.py
index a483cfc7..819925bd 100644
--- a/deduce/annotator.py
+++ b/deduce/annotator.py
@@ -2,10 +2,12 @@
 
 import re
 import warnings
+from collections import defaultdict
+from collections.abc import Iterable, Mapping
 from typing import Literal, Optional, Any
 
 import docdeid as dd
-from docdeid import Annotation, Document, Tokenizer
+from docdeid import Annotation, Document, Tokenizer, TokenList
 from docdeid.process import RegexpAnnotator
 
 from deduce.utils import str_match
@@ -84,6 +86,9 @@ def match(cls, pattern_position: dict, **kwargs) -> bool:  # pylint: disable=R09
             return cls._lookup(value, **kwargs)
         if func == "neg_lookup":
             return not cls._lookup(value, **kwargs)
+        if func == "tag":
+            annos = kwargs.get("annos", ())
+            return any(anno.tag == value for anno in annos)
         if func == "and":
             return all(
                 _PatternPositionMatcher.match(pattern_position=x, **kwargs)
@@ -177,9 +182,12 @@ def _match_sequence(self,
                         text: str,
                         pattern: list[dict],
                         start_token: dd.tokenizer.Token,
+                        annos_by_token: defaultdict[dd.tokenizer.Token,
+                                                    Iterable[dd.Annotation]],
                         direction: Literal["left", "right"] = "right",
                         skip: Optional[set[str]] = None,
-                        metadata: Optional[dict[str, Any]]=None) \
+                        metadata: Optional[dict[str, Any]] = None,
+                        ) \
             -> Optional[dd.Annotation]:
         """
         Sequentially match a pattern against a specified start_token.
@@ -188,6 +196,7 @@
             text: The original document text.
             pattern: The pattern to match.
             start_token: The start token to match.
+            annos_by_token: Map from tokens to annotations covering it.
             direction: The direction to match, choice of "left" or "right".
             skip: Any string values that should be skipped in matching.
             metadata: Document metadata (like the patient name).
@@ -208,6 +217,7 @@ def _match_sequence(self, if current_token is None or not _PatternPositionMatcher.match( pattern_position=pattern_position, token=current_token, + annos=annos_by_token[current_token], ds=self.ds, metadata=metadata, ): @@ -251,10 +261,13 @@ def annotate(self, doc: dd.Document) -> list[dd.Annotation]: matching_pipeline=self._matching_pipeline, ) + annos_by_token = TokenPatternAnnotator._index_by_token( + doc.annotations, doc.token_lists) + for token in tokens: annotation = self._match_sequence( - doc.text, self.pattern, token, + doc.text, self.pattern, token, annos_by_token, direction="right", skip=self.skip) if annotation is not None: @@ -262,6 +275,26 @@ def annotate(self, doc: dd.Document) -> list[dd.Annotation]: return annotations + @classmethod + def _index_by_token(cls, annotations, token_lists) \ + -> defaultdict[str, set[dd.Annotation]]: + """\ + Assigns existing annotations to tokens. + """ + annos_by_token = defaultdict(set) + for token_list in token_lists.values(): + # TODO Improve efficiency, simplify. + for anno in annotations: + found_first = False + for token in token_list: + if anno.start_char < token.end_char: + found_first = True + if token.start_char >= anno.end_char: + break + if found_first: + annos_by_token[token].add(anno) + return annos_by_token + class ContextAnnotator(TokenPatternAnnotator): """ @@ -283,16 +316,18 @@ def __init__( self.iterative = iterative super().__init__(*args, **kwargs, ds=ds, tag="_") - def _apply_context_pattern(self, - text: str, - annotations: dd.AnnotationSet, + def _apply_context_pattern(self, text: str, annotations: dd.AnnotationSet, + token_lists: Mapping[str, TokenList], context_pattern: dict, - metadata: Optional[dict[str, Any]]=None) \ + metadata: Optional[dict[str, Any]] = None) \ -> dd.AnnotationSet: direction = context_pattern["direction"] skip = set(context_pattern.get("skip", [])) + annos_by_token = TokenPatternAnnotator._index_by_token(annotations, + token_lists) + for annotation in annotations.copy(): tag = list(_DIRECTION_MAP[direction]["order"](annotation.tag.split("+")))[ @@ -307,7 +342,7 @@ def _apply_context_pattern(self, _DIRECTION_MAP[direction]["start_token"](annotation), attr, skip ) new_annotation = self._match_sequence( - text, context_pattern["pattern"], start_token, + text, context_pattern["pattern"], start_token, annos_by_token, direction=direction, skip=skip, metadata=metadata) if new_annotation: @@ -331,15 +366,18 @@ def _apply_context_pattern(self, return annotations def _annotate(self, text: str, annotations: dd.AnnotationSet, + token_lists: Mapping[str, TokenList], metadata=None) -> dd.AnnotationSet: """ - Does the annotation, by calling _apply_context_pattern, and then optionally - recursing. Also keeps track of the (un)changed annotations, so they are not - repeatedly processed. + Does the annotation, by calling _apply_context_pattern, and then + optionally recursing. Also keeps track of the (un)changed annotations, + so they are not repeatedly processed. Args: text: The input text. annotations: The input annotations. + token_lists: Token lists available in this pipeline, indexed by + the tokenizer name. metadata: Document metadata (like the patient name). 
Returns: @@ -349,18 +387,20 @@ def _annotate(self, text: str, annotations: dd.AnnotationSet, original_annotations = annotations.copy() for context_pattern in self.pattern: - annotations = self._apply_context_pattern( - text, annotations, context_pattern, metadata) + annotations = self._apply_context_pattern(text, annotations, + token_lists, + context_pattern, + metadata) if self.iterative: - changed = dd.AnnotationSet(annotations.difference(original_annotations)) + changed = dd.AnnotationSet( + annotations.difference(original_annotations)) annotations = dd.AnnotationSet( - annotations.intersection(original_annotations) - ) + annotations.intersection(original_annotations)) if changed: - annotations.update(self._annotate(text, changed)) + annotations.update(self._annotate(text, changed, token_lists)) return annotations @@ -375,8 +415,8 @@ def annotate(self, doc: dd.Document) -> list[dd.Annotation]: An empty list, as annotations are modified and not added. """ - doc.annotations = self._annotate( - doc.text, doc.annotations, doc.metadata) + doc.annotations = self._annotate(doc.text, doc.annotations, + doc.token_lists, doc.metadata) return [] diff --git a/tests/unit/test_annotator.py b/tests/unit/test_annotator.py index 2f7daec3..11739f89 100644 --- a/tests/unit/test_annotator.py +++ b/tests/unit/test_annotator.py @@ -1,4 +1,5 @@ import re +from collections import defaultdict from unittest.mock import patch import docdeid as dd @@ -213,13 +214,16 @@ def test_match_sequence(self, pattern_doc, ds): tpa = TokenPatternAnnotator(pattern=[{}], ds=ds, tag="_") assert tpa._match_sequence( - pattern_doc.text, start_token=pattern_doc.get_tokens()[3], pattern=pattern + pattern_doc.text, start_token=pattern_doc.get_tokens()[3], + pattern=pattern, + annos_by_token=defaultdict(list), ) == dd.Annotation(text="Andries Meijer", start_char=12, end_char=26, tag="_") assert ( tpa._match_sequence( pattern_doc.text, start_token=pattern_doc.get_tokens()[7], pattern=pattern, + annos_by_token=defaultdict(list), ) is None ) @@ -233,6 +237,7 @@ def test_match_sequence_left(self, pattern_doc, ds): pattern_doc.text, start_token=pattern_doc.get_tokens()[4], pattern=pattern, + annos_by_token=defaultdict(list), direction="left", ) == dd.Annotation(text="Andries Meijer", start_char=12, end_char=26, tag="_") @@ -240,6 +245,7 @@ def test_match_sequence_left(self, pattern_doc, ds): tpa._match_sequence( pattern_doc.text, start_token=pattern_doc.get_tokens()[8], + annos_by_token=defaultdict(list), direction="left", pattern=pattern, ) @@ -255,6 +261,7 @@ def test_match_sequence_skip(self, pattern_doc, ds): pattern_doc.text, start_token=pattern_doc.get_tokens()[4], pattern=pattern, + annos_by_token=defaultdict(list), skip={"-"}, ) == dd.Annotation(text="Meijer-Heerma", start_char=20, end_char=33, tag="_") assert ( @@ -262,6 +269,7 @@ def test_match_sequence_skip(self, pattern_doc, ds): pattern_doc.text, start_token=pattern_doc.get_tokens()[4], pattern=pattern, + annos_by_token=defaultdict(list), skip=set(), ) is None @@ -297,6 +305,7 @@ def test_apply_context_pattern(self, pattern_doc): assert annotator._apply_context_pattern( pattern_doc.text, annotations, + {}, { "pattern": [{"like_name": True}], "direction": "right", @@ -333,6 +342,7 @@ def test_apply_context_pattern_left(self, pattern_doc): assert annotator._apply_context_pattern( pattern_doc.text, annotations, + {}, { "pattern": [{"like_name": True}], "direction": "left", @@ -369,6 +379,7 @@ def test_apply_context_pattern_skip(self, pattern_doc): assert 
annotator._apply_context_pattern( pattern_doc.text, annotations, + {}, { "pattern": [{"like_name": True}], "direction": "right", @@ -419,7 +430,11 @@ def test_annotate_multiple(self, pattern_doc): ] ) - assert annotator._annotate(pattern_doc.text, annotations) == dd.AnnotationSet( + assert annotator._annotate( + pattern_doc.text, + annotations, + {}, + ) == dd.AnnotationSet( { dd.Annotation( text="Andries Meijer-Heerma", @@ -456,7 +471,11 @@ def test_annotate_iterative(self, pattern_doc): ] ) - assert annotator._annotate(pattern_doc.text, annotations) == dd.AnnotationSet( + assert annotator._annotate( + pattern_doc.text, + annotations, + {}, + ) == dd.AnnotationSet( { dd.Annotation( text="Andries Meijer-Heerma", From 27c27ceccee18043bf9bb7fdd3dac06e67025f54 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Thu, 7 Mar 2024 11:43:04 +0100 Subject: [PATCH 23/39] (Almost) automatically format code --- deduce/annotation_processor.py | 86 +++++++++-------- deduce/annotator.py | 117 ++++++++++++++---------- deduce/deduce.py | 16 ++-- deduce/utils.py | 26 +++--- tests/pipeline/test_deduce.py | 61 ++++++------ tests/regression/test_regression.py | 10 +- tests/unit/test_annotation_processor.py | 33 ++++--- tests/unit/test_annotator.py | 3 +- tests/unit/test_lookup_struct.py | 32 +++---- tests/unit/test_utils.py | 3 +- 10 files changed, 201 insertions(+), 186 deletions(-) diff --git a/deduce/annotation_processor.py b/deduce/annotation_processor.py index 5568af70..0c55eb94 100644 --- a/deduce/annotation_processor.py +++ b/deduce/annotation_processor.py @@ -6,15 +6,15 @@ class DeduceMergeAdjacentAnnotations(dd.process.MergeAdjacentAnnotations): - """\ + """ Merges adjacent tags, according to Deduce logic: - - adjacent annotations with mixed patient/person tags are replaced - with the "persoon" annotation; - - adjacent annotations with patient tags of which one is the surname - are replaced with the "patient" annotation; and - - adjacent annotations with other patient tags are replaced with - the "part_of_patient" annotation. + - adjacent annotations with mixed patient/person tags are replaced + with the "persoon" annotation; + - adjacent annotations with patient tags of which one is the surname + are replaced with the "patient" annotation; and + - adjacent annotations with other patient tags are replaced with + the "part_of_patient" annotation. """ def _tags_match(self, left_tag: str, right_tag: str) -> bool: @@ -30,14 +30,15 @@ def _tags_match(self, left_tag: str, right_tag: str) -> bool: ``True`` if tags match, ``False`` otherwise. """ - patient_part = [tag.endswith('_patient') - for tag in (left_tag, right_tag)] + patient_part = [tag.endswith("_patient") for tag in (left_tag, right_tag)] # FIXME Ideally, we should be first looking for a `*_patient` tag in # both directions and only failing that, merge with an adjacent # "persoon" tag. - return (left_tag == right_tag or - all(patient_part) or - (patient_part[0] and right_tag == "persoon")) + return ( + left_tag == right_tag + or all(patient_part) + or (patient_part[0] and right_tag == "persoon") + ) def _adjacent_annotations_replacement( self, @@ -48,22 +49,25 @@ def _adjacent_annotations_replacement( """ Replace two annotations that have equal tags with a new annotation. - If one of the two annotations has the "patient" tag (and the other is - either "patient" or "persoon"), the other annotation will be used. - In other cases, the tags are always equal. 
+ If one of the two annotations has the "patient" tag (and the other is either + "patient" or "persoon"), the other annotation will be used. In other cases, the + tags are always equal. """ ltag = left_annotation.tag rtag = right_annotation.tag replacement_tag = ( - ltag if ltag == rtag else - "persoon" if rtag == "persoon" else - "patient" if any(tag.startswith("achternaam") for tag in - (ltag, rtag)) else - "part_of_patient") + ltag + if ltag == rtag + else "persoon" + if rtag == "persoon" + else "patient" + if any(tag.startswith("achternaam") for tag in (ltag, rtag)) + else "part_of_patient" + ) return dd.Annotation( - text=text[left_annotation.start_char:right_annotation.end_char], + text=text[left_annotation.start_char : right_annotation.end_char], start_char=left_annotation.start_char, end_char=right_annotation.end_char, tag=replacement_tag, @@ -72,15 +76,14 @@ def _adjacent_annotations_replacement( class PersonAnnotationConverter(dd.process.AnnotationProcessor): """ - Responsible for processing the annotations produced by all name annotators - (regular and context-based). - - Any overlap with annotations that contain "pseudo" in their tag is removed, - as are those annotations. Then resolves overlap between remaining - annotations, and maps the tags to either "patient" or "persoon", based on - whether "patient" is in all constituent tags - (e.g. voornaam_patient+achternaam_patient => patient, - achternaam_onbekend => persoon). + Responsible for processing the annotations produced by all name annotators (regular + and context-based). + + Any overlap with annotations that contain "pseudo" in their tag is removed, as are + those annotations. Then resolves overlap between remaining annotations, and maps the + tags to either "patient" or "persoon", based on whether "patient" is in all + constituent tags (e.g. voornaam_patient+achternaam_patient => patient, + achternaam_onbekend => persoon). 
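+
+    For example, a combined tag such as voornaam_patient+achternaam_onbekend
+    resolves to "persoon", since not every one of its subtags contains
+    "patient".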
""" def __init__(self) -> None: @@ -107,24 +110,31 @@ def process_annotations( annotations, text=text ) - real_annos = (anno for anno in new_annotations - if "pseudo" not in anno.tag and anno.text.strip()) + real_annos = ( + anno + for anno in new_annotations + if "pseudo" not in anno.tag and anno.text.strip() + ) with_patient = ( dd.Annotation( text=anno.text, start_char=anno.start_char, end_char=anno.end_char, - tag=PersonAnnotationConverter._resolve_tag(anno.tag) + tag=PersonAnnotationConverter._resolve_tag(anno.tag), ) - for anno in real_annos) + for anno in real_annos + ) return dd.AnnotationSet(with_patient) @classmethod def _resolve_tag(cls, tag: str) -> str: - if '+' not in tag: - return tag if 'patient' in tag else 'persoon' - return ('patient' if all('patient' in part for part in tag.split('+')) - else 'persoon') + if "+" not in tag: + return tag if "patient" in tag else "persoon" + return ( + "patient" + if all("patient" in part for part in tag.split("+")) + else "persoon" + ) class RemoveAnnotations(dd.process.AnnotationProcessor): diff --git a/deduce/annotator.py b/deduce/annotator.py index 819925bd..bd3c08e9 100644 --- a/deduce/annotator.py +++ b/deduce/annotator.py @@ -4,7 +4,7 @@ import warnings from collections import defaultdict from collections.abc import Iterable, Mapping -from typing import Literal, Optional, Any +from typing import Any, Literal, Optional import docdeid as dd from docdeid import Annotation, Document, Tokenizer, TokenList @@ -105,15 +105,18 @@ def match(cls, pattern_position: dict, **kwargs) -> bool: # pylint: disable=R09 @classmethod def _lookup(cls, ent_type: str, **kwargs) -> bool: token = kwargs.get("token").text - if '.' in ent_type: - meta_key, meta_attr = ent_type.split('.', 1) + if "." in ent_type: + meta_key, meta_attr = ent_type.split(".", 1) try: - meta_val = getattr(kwargs['metadata'][meta_key], meta_attr) + meta_val = getattr(kwargs["metadata"][meta_key], meta_attr) except (TypeError, KeyError, AttributeError): return False else: - return (token == meta_val if isinstance(meta_val, str) - else token in meta_val) + return ( + token == meta_val + if isinstance(meta_val, str) + else token in meta_val + ) else: return token in kwargs.get("ds")[ent_type] @@ -178,17 +181,16 @@ def _get_chained_token( return token - def _match_sequence(self, - text: str, - pattern: list[dict], - start_token: dd.tokenizer.Token, - annos_by_token: defaultdict[dd.tokenizer.Token, - Iterable[dd.Annotation]], - direction: Literal["left", "right"] = "right", - skip: Optional[set[str]] = None, - metadata: Optional[dict[str, Any]] = None, - ) \ - -> Optional[dd.Annotation]: + def _match_sequence( + self, + text: str, + pattern: list[dict], + start_token: dd.tokenizer.Token, + annos_by_token: defaultdict[dd.tokenizer.Token, Iterable[dd.Annotation]], + direction: Literal["left", "right"] = "right", + skip: Optional[set[str]] = None, + metadata: Optional[dict[str, Any]] = None, + ) -> Optional[dd.Annotation]: """ Sequentially match a pattern against a specified start_token. 
@@ -262,13 +264,19 @@ def annotate(self, doc: dd.Document) -> list[dd.Annotation]: ) annos_by_token = TokenPatternAnnotator._index_by_token( - doc.annotations, doc.token_lists) + doc.annotations, doc.token_lists + ) for token in tokens: annotation = self._match_sequence( - doc.text, self.pattern, token, annos_by_token, - direction="right", skip=self.skip) + doc.text, + self.pattern, + token, + annos_by_token, + direction="right", + skip=self.skip, + ) if annotation is not None: annotations.append(annotation) @@ -276,11 +284,10 @@ def annotate(self, doc: dd.Document) -> list[dd.Annotation]: return annotations @classmethod - def _index_by_token(cls, annotations, token_lists) \ - -> defaultdict[str, set[dd.Annotation]]: - """\ - Assigns existing annotations to tokens. - """ + def _index_by_token( + cls, annotations, token_lists + ) -> defaultdict[str, set[dd.Annotation]]: + """Assigns existing annotations to tokens.""" annos_by_token = defaultdict(set) for token_list in token_lists.values(): # TODO Improve efficiency, simplify. @@ -316,17 +323,19 @@ def __init__( self.iterative = iterative super().__init__(*args, **kwargs, ds=ds, tag="_") - def _apply_context_pattern(self, text: str, annotations: dd.AnnotationSet, - token_lists: Mapping[str, TokenList], - context_pattern: dict, - metadata: Optional[dict[str, Any]] = None) \ - -> dd.AnnotationSet: + def _apply_context_pattern( + self, + text: str, + annotations: dd.AnnotationSet, + token_lists: Mapping[str, TokenList], + context_pattern: dict, + metadata: Optional[dict[str, Any]] = None, + ) -> dd.AnnotationSet: direction = context_pattern["direction"] skip = set(context_pattern.get("skip", [])) - annos_by_token = TokenPatternAnnotator._index_by_token(annotations, - token_lists) + annos_by_token = TokenPatternAnnotator._index_by_token(annotations, token_lists) for annotation in annotations.copy(): @@ -342,8 +351,14 @@ def _apply_context_pattern(self, text: str, annotations: dd.AnnotationSet, _DIRECTION_MAP[direction]["start_token"](annotation), attr, skip ) new_annotation = self._match_sequence( - text, context_pattern["pattern"], start_token, annos_by_token, - direction=direction, skip=skip, metadata=metadata) + text, + context_pattern["pattern"], + start_token, + annos_by_token, + direction=direction, + skip=skip, + metadata=metadata, + ) if new_annotation: left_ann, right_ann = _DIRECTION_MAP[direction]["order"]( @@ -365,13 +380,17 @@ def _apply_context_pattern(self, text: str, annotations: dd.AnnotationSet, return annotations - def _annotate(self, text: str, annotations: dd.AnnotationSet, - token_lists: Mapping[str, TokenList], - metadata=None) -> dd.AnnotationSet: + def _annotate( + self, + text: str, + annotations: dd.AnnotationSet, + token_lists: Mapping[str, TokenList], + metadata=None, + ) -> dd.AnnotationSet: """ - Does the annotation, by calling _apply_context_pattern, and then - optionally recursing. Also keeps track of the (un)changed annotations, - so they are not repeatedly processed. + Does the annotation, by calling _apply_context_pattern, and then optionally + recursing. Also keeps track of the (un)changed annotations, so they are not + repeatedly processed. Args: text: The input text. 
@@ -387,17 +406,16 @@ def _annotate(self, text: str, annotations: dd.AnnotationSet, original_annotations = annotations.copy() for context_pattern in self.pattern: - annotations = self._apply_context_pattern(text, annotations, - token_lists, - context_pattern, - metadata) + annotations = self._apply_context_pattern( + text, annotations, token_lists, context_pattern, metadata + ) if self.iterative: - changed = dd.AnnotationSet( - annotations.difference(original_annotations)) + changed = dd.AnnotationSet(annotations.difference(original_annotations)) annotations = dd.AnnotationSet( - annotations.intersection(original_annotations)) + annotations.intersection(original_annotations) + ) if changed: annotations.update(self._annotate(text, changed, token_lists)) @@ -415,8 +433,9 @@ def annotate(self, doc: dd.Document) -> list[dd.Annotation]: An empty list, as annotations are modified and not added. """ - doc.annotations = self._annotate(doc.text, doc.annotations, - doc.token_lists, doc.metadata) + doc.annotations = self._annotate( + doc.text, doc.annotations, doc.token_lists, doc.metadata + ) return [] diff --git a/deduce/deduce.py b/deduce/deduce.py index 9c0ce619..8610f6d3 100644 --- a/deduce/deduce.py +++ b/deduce/deduce.py @@ -88,17 +88,16 @@ def __init__( # pylint: disable=R0913 config = config_file - logging.info('Going to init config.') + logging.info("Going to init config.") self.config = self._initialize_config( load_base_config=load_base_config, user_config=config ) self.lookup_data_path = ensure_path(lookup_data_path) - logging.info('Going to init tokenizers.') - self.tokenizers = { - "default": self._initialize_tokenizer(self.lookup_data_path)} - logging.debug('Done initing tokenizers.') + logging.info("Going to init tokenizers.") + self.tokenizers = {"default": self._initialize_tokenizer(self.lookup_data_path)} + logging.debug("Done initing tokenizers.") self.lookup_structs = get_lookup_structs( lookup_path=self.lookup_data_path, @@ -107,12 +106,11 @@ def __init__( # pylint: disable=R0913 build=build_lookup_structs, save_cache=save_lookup_structs, ) - logging.info('Done loading lookup structs.') + logging.info("Done loading lookup structs.") - extras = {"tokenizer": self.tokenizers["default"], - "ds": self.lookup_structs} + extras = {"tokenizer": self.tokenizers["default"], "ds": self.lookup_structs} - logging.info('Going to load the Deduce processor.') + logging.info("Going to load the Deduce processor.") self.processors = _DeduceProcessorLoader().load( config=self.config, extras=extras ) diff --git a/deduce/utils.py b/deduce/utils.py index 0ba63c76..63c3e675 100644 --- a/deduce/utils.py +++ b/deduce/utils.py @@ -7,18 +7,18 @@ import docdeid as dd from docdeid import Tokenizer -from rapidfuzz.distance import DamerauLevenshtein - from docdeid.str import LowercaseTail - +from rapidfuzz.distance import DamerauLevenshtein _TITLECASER = LowercaseTail() -def str_match(str_1: str, str_2: str, - max_edit_distance: Optional[int] = None, - titlecase: bool = True, - ) -> bool: +def str_match( + str_1: str, + str_2: str, + max_edit_distance: Optional[int] = None, + titlecase: bool = True, +) -> bool: """ Match two strings, potentially in a fuzzy way. @@ -31,9 +31,11 @@ def str_match(str_1: str, str_2: str, Returns: ``True`` if the strings match, ``False`` otherwise. 
""" - norm_1, norm_2 = ((_TITLECASER.process(str_1), _TITLECASER.process(str_2)) - if titlecase - else (str_1, str_2)) + norm_1, norm_2 = ( + (_TITLECASER.process(str_1), _TITLECASER.process(str_2)) + if titlecase + else (str_1, str_2) + ) if max_edit_distance is not None: return ( DamerauLevenshtein.distance(norm_1, norm_2, score_cutoff=max_edit_distance) @@ -312,7 +314,5 @@ def lookup_set_to_trie( def ensure_path(path_or_str: Union[str, Path]) -> Path: - """\ - Casts the argument as a `Path` if it's not a `Path` already. - """ + """Casts the argument as a `Path` if it's not a `Path` already.""" return path_or_str if isinstance(path_or_str, Path) else Path(path_or_str) diff --git a/tests/pipeline/test_deduce.py b/tests/pipeline/test_deduce.py index 09c29f04..ae4029e9 100644 --- a/tests/pipeline/test_deduce.py +++ b/tests/pipeline/test_deduce.py @@ -1,8 +1,7 @@ +import docdeid as dd import pytest -import docdeid as dd from deduce import Deduce - from deduce.person import Person text = ( @@ -23,9 +22,11 @@ @pytest.fixture def model(shared_datadir): - return Deduce(save_lookup_structs=False, - build_lookup_structs=True, - lookup_data_path=shared_datadir / "lookup") + return Deduce( + save_lookup_structs=False, + build_lookup_structs=True, + lookup_data_path=shared_datadir / "lookup", + ) class TestDeduce: @@ -50,39 +51,25 @@ def test_annotate(self, model): end_char=280, tag="emailadres", ), - dd.Annotation( - text="J. Jansen", start_char=64, end_char=73, tag="patient" - ), - dd.Annotation( - text="Jan Jansen", start_char=9, end_char=19, tag="patient" - ), + dd.Annotation(text="J. Jansen", start_char=64, end_char=73, tag="patient"), + dd.Annotation(text="Jan Jansen", start_char=9, end_char=19, tag="patient"), dd.Annotation( text="10 oktober 2018", start_char=139, end_char=154, tag="datum" ), dd.Annotation(text="64", start_char=77, end_char=79, tag="leeftijd"), dd.Annotation(text="000334433", start_char=42, end_char=51, tag="id"), - dd.Annotation( - text="Utrecht", start_char=106, end_char=113, tag="locatie" - ), + dd.Annotation(text="Utrecht", start_char=106, end_char=113, tag="locatie"), dd.Annotation( text="IJSWEG 10r", start_char=115, end_char=125, tag="locatie" ), + dd.Annotation(text="UMCU", start_char=214, end_char=218, tag="ziekenhuis"), dd.Annotation( - text="UMCU", start_char=214, end_char=218, tag="ziekenhuis" + text="Peter Jansen", start_char=305, end_char=317, tag="persoon" ), + dd.Annotation(text="104", start_char=319, end_char=322, tag="leeftijd"), + dd.Annotation(text="Utrecht", start_char=340, end_char=347, tag="locatie"), dd.Annotation( - text="Peter Jansen", start_char=305, end_char=317, - tag="persoon" - ), - dd.Annotation( - text="104", start_char=319, end_char=322, tag="leeftijd" - ), - dd.Annotation( - text="Utrecht", start_char=340, end_char=347, tag="locatie" - ), - dd.Annotation( - text="Jan de Visser", start_char=373, end_char=386, - tag="persoon" + text="Jan de Visser", start_char=373, end_char=386, tag="persoon" ), } @@ -133,14 +120,18 @@ def test_annotate_intext(self, model): def test_patient_2(self, model): metadata = {"patient": Person(first_names=["Jan"], surname="Jansen")} - doc = ("Lorem ipsum JANSEN sit amet, Peter Jansen adipiscing elit. " - "Curabitur J. Jansen sapien, J. P. Jansen a vestibulum quis, " - "facilisis vel J Jansen. Jan de Visser iaculis gravida nulla. " - "Etiam quis Jan van den Jansen.") - want = ("Lorem ipsum [PATIENT] sit amet, [PERSOON-1] adipiscing elit. 
" - "Curabitur [PATIENT] sapien, [PERSOON-2] a vestibulum quis, " - "facilisis vel [PATIENT]. [PERSOON-3] iaculis gravida nulla. " - "Etiam quis [PERSOON-4].") + doc = ( + "Lorem ipsum JANSEN sit amet, Peter Jansen adipiscing elit. " + "Curabitur J. Jansen sapien, J. P. Jansen a vestibulum quis, " + "facilisis vel J Jansen. Jan de Visser iaculis gravida nulla. " + "Etiam quis Jan van den Jansen." + ) + want = ( + "Lorem ipsum [PATIENT] sit amet, [PERSOON-1] adipiscing elit. " + "Curabitur [PATIENT] sapien, [PERSOON-2] a vestibulum quis, " + "facilisis vel [PATIENT]. [PERSOON-3] iaculis gravida nulla. " + "Etiam quis [PERSOON-4]." + ) deid = model.deidentify(doc, metadata=metadata) assert deid.deidentified_text == want diff --git a/tests/regression/test_regression.py b/tests/regression/test_regression.py index 28862722..75b0b18b 100644 --- a/tests/regression/test_regression.py +++ b/tests/regression/test_regression.py @@ -2,7 +2,6 @@ from typing import Optional import pytest - from docdeid import Annotation, AnnotationSet from deduce import Deduce @@ -13,10 +12,11 @@ def model(shared_datadir): # FIXME Sorry, due to the design decision of pytest-datadir to create a new copy # of `shared_datadir` for every test, we cannot reuse this fixture # for all tests in this module or package. - return Deduce(build_lookup_structs=True, - save_lookup_structs=False, - lookup_data_path=shared_datadir / "lookup") - + return Deduce( + build_lookup_structs=True, + save_lookup_structs=False, + lookup_data_path=shared_datadir / "lookup", + ) def regression_test( diff --git a/tests/unit/test_annotation_processor.py b/tests/unit/test_annotation_processor.py index d08db864..4fdb00e9 100644 --- a/tests/unit/test_annotation_processor.py +++ b/tests/unit/test_annotation_processor.py @@ -91,10 +91,12 @@ def test_patient_no_overlap(self): expected_annotations = dd.AnnotationSet( [ - dd.Annotation(text="Jan", start_char=0, end_char=3, - tag="voornaam_patient"), - dd.Annotation(text="Jansen", start_char=4, end_char=10, - tag="achternaam_patient"), + dd.Annotation( + text="Jan", start_char=0, end_char=3, tag="voornaam_patient" + ), + dd.Annotation( + text="Jansen", start_char=4, end_char=10, tag="achternaam_patient" + ), ] ) @@ -116,8 +118,11 @@ def test_patient_with_overlap(self): ) expected_annotations = dd.AnnotationSet( - [dd.Annotation(text="Jan Jansen", start_char=0, end_char=10, - tag="naam_patient")] + [ + dd.Annotation( + text="Jan Jansen", start_char=0, end_char=10, tag="naam_patient" + ) + ] ) assert proc.process_annotations(annotations, text) == expected_annotations @@ -139,10 +144,10 @@ def test_mixed_no_overlap(self): expected_annotations = dd.AnnotationSet( [ - dd.Annotation(text="Jan", start_char=0, end_char=3, - tag="voornaam_patient"), - dd.Annotation(text="Jansen", start_char=4, end_char=10, - tag="persoon"), + dd.Annotation( + text="Jan", start_char=0, end_char=3, tag="voornaam_patient" + ), + dd.Annotation(text="Jansen", start_char=4, end_char=10, tag="persoon"), ] ) @@ -165,10 +170,10 @@ def test_mixed_with_overlap(self): expected_annotations = dd.AnnotationSet( [ - dd.Annotation(text="Jan", start_char=0, end_char=3, - tag="voornaam_patient"), - dd.Annotation(text=" Jansen", start_char=3, end_char=10, - tag="persoon"), + dd.Annotation( + text="Jan", start_char=0, end_char=3, tag="voornaam_patient" + ), + dd.Annotation(text=" Jansen", start_char=3, end_char=10, tag="persoon"), ] ) diff --git a/tests/unit/test_annotator.py b/tests/unit/test_annotator.py index 11739f89..cd7804f7 100644 --- 
a/tests/unit/test_annotator.py +++ b/tests/unit/test_annotator.py @@ -214,7 +214,8 @@ def test_match_sequence(self, pattern_doc, ds): tpa = TokenPatternAnnotator(pattern=[{}], ds=ds, tag="_") assert tpa._match_sequence( - pattern_doc.text, start_token=pattern_doc.get_tokens()[3], + pattern_doc.text, + start_token=pattern_doc.get_tokens()[3], pattern=pattern, annos_by_token=defaultdict(list), ) == dd.Annotation(text="Andries Meijer", start_char=12, end_char=26, tag="_") diff --git a/tests/unit/test_lookup_struct.py b/tests/unit/test_lookup_struct.py index da78f404..7004b474 100644 --- a/tests/unit/test_lookup_struct.py +++ b/tests/unit/test_lookup_struct.py @@ -1,5 +1,4 @@ import io - from unittest.mock import patch import docdeid as dd @@ -16,8 +15,7 @@ class TestLookupStruct: def test_load_raw_itemset(self, shared_datadir): - raw_itemset = load_raw_itemset( - shared_datadir / "lookup" / "src" / "lst_test") + raw_itemset = load_raw_itemset(shared_datadir / "lookup" / "src" / "lst_test") assert len(raw_itemset) == 5 assert "de Vries" in raw_itemset @@ -30,15 +28,15 @@ def test_load_raw_itemset(self, shared_datadir): def test_load_raw_itemset_nested(self, shared_datadir): raw_itemset = load_raw_itemset( - shared_datadir / "lookup" / "src" / "lst_test_nested") + shared_datadir / "lookup" / "src" / "lst_test_nested" + ) assert raw_itemset == {"a", "b", "c", "d"} def test_load_raw_itemsets(self, shared_datadir): raw_itemsets = load_raw_itemsets( - base_path=shared_datadir / "lookup", - subdirs=["lst_test", "lst_test_nested"] + base_path=shared_datadir / "lookup", subdirs=["lst_test", "lst_test_nested"] ) assert "test" in raw_itemsets @@ -62,7 +60,7 @@ class MockStats: assert validate_lookup_struct_cache( cache=cache, base_path=shared_datadir / "lookup", - deduce_version="2.5.0" + deduce_version="2.5.0", ) def test_validate_lookup_struct_cache_file_changes(self, shared_datadir): @@ -81,40 +79,34 @@ class MockStats: assert not validate_lookup_struct_cache( cache=cache, base_path=shared_datadir / "lookup", - deduce_version="2.5.0" + deduce_version="2.5.0", ) - @patch("deduce.lookup_structs.validate_lookup_struct_cache", - return_value=True) + @patch("deduce.lookup_structs.validate_lookup_struct_cache", return_value=True) def test_load_lookup_structs_from_cache(self, _, shared_datadir): ds_collection = load_lookup_structs_from_cache( - base_path=shared_datadir / "lookup", - deduce_version="_" + base_path=shared_datadir / "lookup", deduce_version="_" ) assert len(ds_collection) == 2 assert "test" in ds_collection assert "test_nested" in ds_collection - @patch("deduce.lookup_structs.validate_lookup_struct_cache", - return_value=True) + @patch("deduce.lookup_structs.validate_lookup_struct_cache", return_value=True) def test_load_lookup_structs_from_cache_nofile(self, _, shared_datadir): ds_collection = load_lookup_structs_from_cache( - base_path=shared_datadir / "non_existing_dir", - deduce_version="_" + base_path=shared_datadir / "non_existing_dir", deduce_version="_" ) assert ds_collection is None - @patch("deduce.lookup_structs.validate_lookup_struct_cache", - return_value=False) + @patch("deduce.lookup_structs.validate_lookup_struct_cache", return_value=False) def test_load_lookup_structs_from_cache_invalid(self, _, shared_datadir): ds_collection = load_lookup_structs_from_cache( - base_path=shared_datadir / "lookup", - deduce_version="_" + base_path=shared_datadir / "lookup", deduce_version="_" ) assert ds_collection is None diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py index 
e4620c78..1bb70e71 100644 --- a/tests/unit/test_utils.py +++ b/tests/unit/test_utils.py @@ -229,8 +229,7 @@ def test_apply_transform_no_strip_lines(self): class TestOptionalLoad: def test_optional_load_items(self, shared_datadir): - path = (shared_datadir / - "lookup" / "src" / "lst_test_nested" / "items.txt") + path = shared_datadir / "lookup" / "src" / "lst_test_nested" / "items.txt" assert utils.optional_load_items(path) == {"a", "b"} From 6a8d1f581fa5d47ee8fd178ad138986e384963b0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Thu, 7 Mar 2024 14:38:01 +0100 Subject: [PATCH 24/39] Address issues reported by Flake8 --- deduce/annotator.py | 8 +++++--- tests/pipeline/test_deduce.py | 4 ++-- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/deduce/annotator.py b/deduce/annotator.py index bd3c08e9..e2199012 100644 --- a/deduce/annotator.py +++ b/deduce/annotator.py @@ -7,7 +7,7 @@ from typing import Any, Literal, Optional import docdeid as dd -from docdeid import Annotation, Document, Tokenizer, TokenList +from docdeid import Annotation, Document, Tokenizer, TokenList, MetaData from docdeid.process import RegexpAnnotator from deduce.utils import str_match @@ -285,7 +285,9 @@ def annotate(self, doc: dd.Document) -> list[dd.Annotation]: @classmethod def _index_by_token( - cls, annotations, token_lists + cls, + annotations: Iterable[dd.Annotation], + token_lists: Mapping[str, TokenList], ) -> defaultdict[str, set[dd.Annotation]]: """Assigns existing annotations to tokens.""" annos_by_token = defaultdict(set) @@ -385,7 +387,7 @@ def _annotate( text: str, annotations: dd.AnnotationSet, token_lists: Mapping[str, TokenList], - metadata=None, + metadata: MetaData = None, ) -> dd.AnnotationSet: """ Does the annotation, by calling _apply_context_pattern, and then optionally diff --git a/tests/pipeline/test_deduce.py b/tests/pipeline/test_deduce.py index ae4029e9..b1136619 100644 --- a/tests/pipeline/test_deduce.py +++ b/tests/pipeline/test_deduce.py @@ -6,8 +6,8 @@ text = ( "betreft: Jan Jansen, bsn 111222333, patnr 000334433. De patient J. Jansen is 64 " - "jaar oud en woonachtig in Utrecht, IJSWEG 10r. Hij werd op 10 oktober 2018 door arts " - "Peter de Visser ontslagen van de kliniek van het UMCU. Voor nazorg kan hij " + "jaar oud en woonachtig in Utrecht, IJSWEG 10r. Hij werd op 10 oktober 2018 door " + "arts Peter de Visser ontslagen van de kliniek van het UMCU. Voor nazorg kan hij " "worden bereikt via j.JNSEN.123@gmail.com of (06)12345678. " "Vader, Peter Jansen, 104 jr, woont ook in Utrecht. Met collegiale groeten, " "Jan de Visser." From 0c10ba44b115d9dd9ff70c8d53f36158c7d2e0ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Thu, 7 Mar 2024 17:08:17 +0100 Subject: [PATCH 25/39] Simplify `MultiTokenLookupAnnotator`... ...as required by pylint. 
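
In rough outline, the conversion now done eagerly in
`_get_multi_token_annotator` looks like the sketch below (the standalone
helper and its name are made up here purely for illustration; the real
code lives inline in the loader, see the diff that follows):

    from docdeid.ds import LookupSet, LookupTrie

    def _set_to_trie(lookup_set: LookupSet, tokenizer) -> LookupTrie:
        # Reuse the set's matching pipeline so matching behaviour is kept.
        trie = LookupTrie(matching_pipeline=lookup_set.matching_pipeline)
        # Tokenize each phrase; skip phrases that tokenize to nothing.
        for phrase in filter(None, map(tokenizer.tokenize, lookup_set)):
            trie.add_item([token.text for token in phrase])
        return trie

Instead of passing raw lookup values plus a tokenizer through to
`MultiTokenLookupAnnotator`, the loader now always hands it a ready-made
`LookupTrie`.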
--- deduce/annotator.py | 2 +- deduce/deduce.py | 33 +++++++++++++++++++++------------ 2 files changed, 22 insertions(+), 13 deletions(-) diff --git a/deduce/annotator.py b/deduce/annotator.py index e2199012..72eae339 100644 --- a/deduce/annotator.py +++ b/deduce/annotator.py @@ -7,7 +7,7 @@ from typing import Any, Literal, Optional import docdeid as dd -from docdeid import Annotation, Document, Tokenizer, TokenList, MetaData +from docdeid import Annotation, Document, MetaData, Tokenizer, TokenList from docdeid.process import RegexpAnnotator from deduce.utils import str_match diff --git a/deduce/deduce.py b/deduce/deduce.py index 8610f6d3..59f2f136 100644 --- a/deduce/deduce.py +++ b/deduce/deduce.py @@ -29,6 +29,8 @@ __version__ = importlib.metadata.version(__package__ or __name__) +from docdeid.ds import LookupSet, LookupTrie + from deduce.utils import ensure_path _BASE_PATH = Path(os.path.dirname(__file__)).parent @@ -168,25 +170,32 @@ class _DeduceProcessorLoader: # pylint: disable=R0903 @staticmethod def _get_multi_token_annotator(args: dict, extras: dict) -> dd.process.Annotator: - lookup_struct = extras["ds"][args["lookup_values"]] + lookup_struct = extras["ds"][args.pop("lookup_values")] + + if isinstance(lookup_struct, LookupTrie): + lookup_trie = lookup_struct + elif isinstance(lookup_struct, LookupSet): + try: + tokenizer = args["tokenizer"] + except KeyError: + # This indicates an error in the code, not in configuration, as + # the "tokenizer" key is always added to `extras` where `extras` is + # defined -- in `Deduce.__init__`. + raise ValueError( + "When constructing a MultiTokenLookupAnnotator from a LookupSet, " + "a tokenizer must be given." + ) - if isinstance(lookup_struct, dd.ds.LookupSet): - args.update( - lookup_values=lookup_struct.items(), - matching_pipeline=lookup_struct.matching_pipeline, - # XXX Sure the trailing "]" is intentional? - tokenizer=extras["tokenizer]"], - ) - elif isinstance(lookup_struct, dd.ds.LookupTrie): - args.update(trie=lookup_struct) - del args["lookup_values"] + lookup_trie = LookupTrie(matching_pipeline=lookup_struct.matching_pipeline) + for phrase in filter(None, map(tokenizer.tokenize, lookup_struct)): + lookup_trie.add_item([token.text for token in phrase]) else: raise ValueError( f"Don't know how to present lookup structure with type " f"{type(lookup_struct)} to MultiTokenLookupAnnotator" ) - return dd.process.MultiTokenLookupAnnotator(**args) + return dd.process.MultiTokenLookupAnnotator(trie=lookup_trie, **args) @deprecated( "The multi_token annotatortype is deprecated and will be removed in a " From 7cc15384511998667b7c25887ed80c7d14f817eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Thu, 7 Mar 2024 22:19:45 +0100 Subject: [PATCH 26/39] Properly assign priority to patient v. other tags --- deduce/annotation_processor.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/deduce/annotation_processor.py b/deduce/annotation_processor.py index 0c55eb94..ab73494c 100644 --- a/deduce/annotation_processor.py +++ b/deduce/annotation_processor.py @@ -87,13 +87,20 @@ class PersonAnnotationConverter(dd.process.AnnotationProcessor): """ def __init__(self) -> None: - def map_tag_to_prio(tag: str) -> int: - if "pseudo" in tag: - return 0 - if "patient" in tag: - return 1 - - return 2 + def map_tag_to_prio(tag: str) -> (int, int, int): + """ + Maps from the tag of a mention to its priority. The lower, the higher + priority. + + The return value is a tuple of: + 1. Is this a pseudo tag? 
If it is, it's a priority. + 2. How many subtags does the tag have? The more, the higher priority. + 3. Is this a patient tag? If it is, it's a priority. + """ + is_pseudo = "pseudo" in tag + num_subtags = tag.count("+") + 1 + is_patient = tag.count("patient") == num_subtags + return (-int(is_pseudo), -num_subtags, -int(is_patient)) self._overlap_resolver = dd.process.OverlapResolver( sort_by=("tag", "length"), From f299b14e4b8fd31932d36093d4246ab43dacf0f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Fri, 8 Mar 2024 18:38:33 +0100 Subject: [PATCH 27/39] Make pylint happier --- deduce/annotator.py | 39 ++++++++++++++---------------------- deduce/deduce.py | 6 ++++-- deduce/utils.py | 2 +- tests/unit/test_annotator.py | 22 ++++++++------------ 4 files changed, 28 insertions(+), 41 deletions(-) diff --git a/deduce/annotator.py b/deduce/annotator.py index 72eae339..b3162e4f 100644 --- a/deduce/annotator.py +++ b/deduce/annotator.py @@ -117,7 +117,7 @@ def _lookup(cls, ent_type: str, **kwargs) -> bool: if isinstance(meta_val, str) else token in meta_val ) - else: + else: # pylint: disable=R1705 return token in kwargs.get("ds")[ent_type] @@ -283,6 +283,7 @@ def annotate(self, doc: dd.Document) -> list[dd.Annotation]: return annotations + # TODO Test. @classmethod def _index_by_token( cls, @@ -382,45 +383,37 @@ def _apply_context_pattern( return annotations - def _annotate( - self, - text: str, - annotations: dd.AnnotationSet, - token_lists: Mapping[str, TokenList], - metadata: MetaData = None, - ) -> dd.AnnotationSet: + def _get_annotations(self, doc: Document) -> dd.AnnotationSet: """ - Does the annotation, by calling _apply_context_pattern, and then optionally - recursing. Also keeps track of the (un)changed annotations, so they are not - repeatedly processed. + Computes the annotation for `doc` and returns it. + + Does this by calling _apply_context_pattern and then optionally recursing. + Also keeps track of the (un)changed annotations, so they are not repeatedly + processed. Args: - text: The input text. - annotations: The input annotations. - token_lists: Token lists available in this pipeline, indexed by - the tokenizer name. - metadata: Document metadata (like the patient name). + doc: The input document. Returns: An extended set of annotations, based on the patterns provided. """ - original_annotations = annotations.copy() + annotations = doc.annotations.copy() for context_pattern in self.pattern: annotations = self._apply_context_pattern( - text, annotations, token_lists, context_pattern, metadata + doc.text, annotations, doc.token_lists, context_pattern, doc.metadata ) if self.iterative: - changed = dd.AnnotationSet(annotations.difference(original_annotations)) + changed = dd.AnnotationSet(annotations.difference(doc.annotations)) annotations = dd.AnnotationSet( - annotations.intersection(original_annotations) + annotations.intersection(doc.annotations) ) if changed: - annotations.update(self._annotate(text, changed, token_lists)) + annotations.update(self._get_annotations(doc)) return annotations @@ -435,9 +428,7 @@ def annotate(self, doc: dd.Document) -> list[dd.Annotation]: An empty list, as annotations are modified and not added. 
""" - doc.annotations = self._annotate( - doc.text, doc.annotations, doc.token_lists, doc.metadata - ) + doc.annotations = self._get_annotations(doc) return [] diff --git a/deduce/deduce.py b/deduce/deduce.py index 59f2f136..615c2bcf 100644 --- a/deduce/deduce.py +++ b/deduce/deduce.py @@ -10,10 +10,12 @@ from pathlib import Path from typing import Any, Optional, Union -import docdeid as dd from deprecated import deprecated from frozendict import frozendict +import docdeid as dd +from docdeid.ds import LookupSet, LookupTrie + from deduce import utils from deduce.annotation_processor import ( CleanAnnotationTag, @@ -29,7 +31,6 @@ __version__ = importlib.metadata.version(__package__ or __name__) -from docdeid.ds import LookupSet, LookupTrie from deduce.utils import ensure_path @@ -181,6 +182,7 @@ def _get_multi_token_annotator(args: dict, extras: dict) -> dd.process.Annotator # This indicates an error in the code, not in configuration, as # the "tokenizer" key is always added to `extras` where `extras` is # defined -- in `Deduce.__init__`. + # pylint: disable=W0707 raise ValueError( "When constructing a MultiTokenLookupAnnotator from a LookupSet, " "a tokenizer must be given." diff --git a/deduce/utils.py b/deduce/utils.py index 63c3e675..1801ac55 100644 --- a/deduce/utils.py +++ b/deduce/utils.py @@ -136,7 +136,7 @@ def repl_segments(s: str, matches: list[tuple]) -> list[list[str]]: (5, 8, ["Mr.", "Meester"]). Returns: - A list of options that together segement the entire string, e.g. [["Prof.", + A list of options that together segment the entire string, e.g. [["Prof.", "Professor"], [" "], ["Meester", "Mr."], [" Lievenslaan"]]. """ diff --git a/tests/unit/test_annotator.py b/tests/unit/test_annotator.py index cd7804f7..bf973589 100644 --- a/tests/unit/test_annotator.py +++ b/tests/unit/test_annotator.py @@ -418,7 +418,7 @@ def test_annotate_multiple(self, pattern_doc): annotator = ContextAnnotator(pattern=pattern, iterative=False) - annotations = dd.AnnotationSet( + pattern_doc.annotations = dd.AnnotationSet( [ dd.Annotation( text="Andries", @@ -431,11 +431,8 @@ def test_annotate_multiple(self, pattern_doc): ] ) - assert annotator._annotate( - pattern_doc.text, - annotations, - {}, - ) == dd.AnnotationSet( + assert (annotator._get_annotations(pattern_doc) == + dd.AnnotationSet( { dd.Annotation( text="Andries Meijer-Heerma", @@ -444,7 +441,7 @@ def test_annotate_multiple(self, pattern_doc): tag="voornaam+naam+naam", ) } - ) + )) def test_annotate_iterative(self, pattern_doc): pattern = [ @@ -459,7 +456,7 @@ def test_annotate_iterative(self, pattern_doc): annotator = ContextAnnotator(pattern=pattern, iterative=True) - annotations = dd.AnnotationSet( + pattern_doc.annotations = dd.AnnotationSet( [ dd.Annotation( text="Andries", @@ -472,11 +469,8 @@ def test_annotate_iterative(self, pattern_doc): ] ) - assert annotator._annotate( - pattern_doc.text, - annotations, - {}, - ) == dd.AnnotationSet( + assert (annotator._get_annotations(pattern_doc) == + dd.AnnotationSet( { dd.Annotation( text="Andries Meijer-Heerma", @@ -485,7 +479,7 @@ def test_annotate_iterative(self, pattern_doc): tag="voornaam+naam+naam", ) } - ) + )) class TestPatientNameAnnotator: From 21546dabe1568ef38ba8a925e9a5e9bb1d702f37 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Mon, 11 Mar 2024 10:02:23 +0100 Subject: [PATCH 28/39] Reduce num of args of `_apply_context_pattern` This makes Pylint happier and the code simpler. 
--- deduce/annotator.py | 165 ++++++++++++++++++----------------- tests/unit/test_annotator.py | 25 +++--- 2 files changed, 95 insertions(+), 95 deletions(-) diff --git a/deduce/annotator.py b/deduce/annotator.py index b3162e4f..97834622 100644 --- a/deduce/annotator.py +++ b/deduce/annotator.py @@ -7,7 +7,7 @@ from typing import Any, Literal, Optional import docdeid as dd -from docdeid import Annotation, Document, MetaData, Tokenizer, TokenList +from docdeid import Annotation, Document, Token, Tokenizer, TokenList from docdeid.process import RegexpAnnotator from deduce.utils import str_match @@ -170,9 +170,7 @@ def __init__( super().__init__(*args, **kwargs) @staticmethod - def _get_chained_token( - token: dd.Token, attr: str, skip: set[str] - ) -> Optional[dd.Token]: + def _get_chained_token(token: Token, attr: str, skip: set[str]) -> Optional[Token]: while True: token = getattr(token, attr)() @@ -185,12 +183,12 @@ def _match_sequence( self, text: str, pattern: list[dict], - start_token: dd.tokenizer.Token, - annos_by_token: defaultdict[dd.tokenizer.Token, Iterable[dd.Annotation]], + start_token: Token, + annos_by_token: defaultdict[Token, Iterable[Annotation]], direction: Literal["left", "right"] = "right", skip: Optional[set[str]] = None, metadata: Optional[dict[str, Any]] = None, - ) -> Optional[dd.Annotation]: + ) -> Optional[Annotation]: """ Sequentially match a pattern against a specified start_token. @@ -226,13 +224,15 @@ def _match_sequence( return None end_token = current_token - current_token = self._get_chained_token(current_token, attr, skip) + current_token = TokenPatternAnnotator._get_chained_token( + current_token, attr, skip + ) start_token, end_token = _DIRECTION_MAP[direction]["order"]( (start_token, end_token) ) - return dd.Annotation( + return Annotation( text=text[start_token.start_char : end_token.end_char], start_char=start_token.start_char, end_char=end_token.end_char, @@ -242,7 +242,7 @@ def _match_sequence( end_token=end_token, ) - def annotate(self, doc: dd.Document) -> list[dd.Annotation]: + def annotate(self, doc: Document) -> list[Annotation]: """ Annotate the document, by matching the pattern against all tokens. @@ -287,9 +287,9 @@ def annotate(self, doc: dd.Document) -> list[dd.Annotation]: @classmethod def _index_by_token( cls, - annotations: Iterable[dd.Annotation], + annotations: Iterable[Annotation], token_lists: Mapping[str, TokenList], - ) -> defaultdict[str, set[dd.Annotation]]: + ) -> defaultdict[Token, set[Annotation]]: """Assigns existing annotations to tokens.""" annos_by_token = defaultdict(set) for token_list in token_lists.values(): @@ -328,62 +328,70 @@ def __init__( def _apply_context_pattern( self, - text: str, - annotations: dd.AnnotationSet, - token_lists: Mapping[str, TokenList], + doc: Document, context_pattern: dict, - metadata: Optional[dict[str, Any]] = None, + orig_annos: Optional[dd.AnnotationSet] = None, ) -> dd.AnnotationSet: - direction = context_pattern["direction"] - skip = set(context_pattern.get("skip", [])) - - annos_by_token = TokenPatternAnnotator._index_by_token(annotations, token_lists) + # TODO Maybe we should index all annotations here, not just the `new` ones. 
+ annos_by_token = TokenPatternAnnotator._index_by_token( + orig_annos, doc.token_lists + ) - for annotation in annotations.copy(): + return dd.AnnotationSet( + self._maybe_merge_anno(anno, context_pattern, doc, annos_by_token) + for anno in orig_annos + ) - tag = list(_DIRECTION_MAP[direction]["order"](annotation.tag.split("+")))[ - -1 - ] + def _maybe_merge_anno( + self, + annotation: Annotation, + context_pattern: dict, + doc: Document, + annos_by_token: defaultdict[str, Iterable[Annotation]], + ) -> Annotation: + direction = context_pattern["direction"] + skip = set(context_pattern.get("skip", [])) - if tag not in context_pattern["pre_tag"]: - continue + tag = list(_DIRECTION_MAP[direction]["order"](annotation.tag.split("+")))[-1] - attr = _DIRECTION_MAP[direction]["attr"] - start_token = self._get_chained_token( - _DIRECTION_MAP[direction]["start_token"](annotation), attr, skip - ) - new_annotation = self._match_sequence( - text, - context_pattern["pattern"], - start_token, - annos_by_token, - direction=direction, - skip=skip, - metadata=metadata, - ) + if tag not in context_pattern["pre_tag"]: + return annotation - if new_annotation: - left_ann, right_ann = _DIRECTION_MAP[direction]["order"]( - (annotation, new_annotation) - ) + attr = _DIRECTION_MAP[direction]["attr"] + start_token = TokenPatternAnnotator._get_chained_token( + _DIRECTION_MAP[direction]["start_token"](annotation), attr, skip + ) + new_annotation = self._match_sequence( + doc.text, + context_pattern["pattern"], + start_token, + annos_by_token, + direction=direction, + skip=skip, + metadata=doc.metadata, + ) - merged_annotation = dd.Annotation( - text=text[left_ann.start_char : right_ann.end_char], - start_char=left_ann.start_char, - end_char=right_ann.end_char, - start_token=left_ann.start_token, - end_token=right_ann.end_token, - tag=context_pattern["tag"].format(tag=annotation.tag), - priority=annotation.priority, - ) + if not new_annotation: + return annotation - annotations.remove(annotation) - annotations.add(merged_annotation) + left_ann, right_ann = _DIRECTION_MAP[direction]["order"]( + (annotation, new_annotation) + ) - return annotations + return Annotation( + text=doc.text[left_ann.start_char : right_ann.end_char], + start_char=left_ann.start_char, + end_char=right_ann.end_char, + start_token=left_ann.start_token, + end_token=right_ann.end_token, + tag=context_pattern["tag"].format(tag=annotation.tag), + priority=annotation.priority, + ) - def _get_annotations(self, doc: Document) -> dd.AnnotationSet: + def _get_annotations( + self, doc: Document, orig_annos: Optional[dd.AnnotationSet] = None + ) -> dd.AnnotationSet: """ Computes the annotation for `doc` and returns it. @@ -393,31 +401,30 @@ def _get_annotations(self, doc: Document) -> dd.AnnotationSet: Args: doc: The input document. + orig_annos: Current set of annotations. If `None`, `doc.annotations` will + be consulted. Returns: An extended set of annotations, based on the patterns provided. 
""" - annotations = doc.annotations.copy() + if orig_annos is None: + orig_annos = doc.annotations + annotations = orig_annos.copy() for context_pattern in self.pattern: - annotations = self._apply_context_pattern( - doc.text, annotations, doc.token_lists, context_pattern, doc.metadata - ) + annotations = self._apply_context_pattern(doc, context_pattern, annotations) - if self.iterative: - - changed = dd.AnnotationSet(annotations.difference(doc.annotations)) + if self.iterative and (new := dd.AnnotationSet(annotations - orig_annos)): + # XXX Are we sure that other annotations than `new` don't matter anymore + # to the operation of the `_get_annotations` method? annotations = dd.AnnotationSet( - annotations.intersection(doc.annotations) + (annotations - new) | self._get_annotations(doc, new) ) - if changed: - annotations.update(self._get_annotations(doc)) - return annotations - def annotate(self, doc: dd.Document) -> list[dd.Annotation]: + def annotate(self, doc: Document) -> list[Annotation]: """ Wrapper for annotating. @@ -451,8 +458,8 @@ def __init__(self, tokenizer: Tokenizer, *args, **kwargs) -> None: @staticmethod def _match_first_names( - doc: dd.Document, token: dd.Token - ) -> Optional[tuple[dd.Token, dd.Token]]: + doc: Document, token: Token + ) -> Optional[tuple[Token, Token]]: for first_name in doc.metadata["patient"].first_names: @@ -466,8 +473,8 @@ def _match_first_names( @staticmethod def _match_initial_from_name( - doc: dd.Document, token: dd.Token - ) -> Optional[tuple[dd.Token, dd.Token]]: + doc: Document, token: Token + ) -> Optional[tuple[Token, Token]]: for _, first_name in enumerate(doc.metadata["patient"].first_names): if str_match(token.text, first_name[0]): @@ -481,16 +488,14 @@ def _match_initial_from_name( return None @staticmethod - def _match_initials( - doc: dd.Document, token: dd.Token - ) -> Optional[tuple[dd.Token, dd.Token]]: + def _match_initials(doc: Document, token: Token) -> Optional[tuple[Token, Token]]: if str_match(token.text, doc.metadata["patient"].initials): return token, token return None - def next_with_skip(self, token: dd.Token) -> Optional[dd.Token]: + def next_with_skip(self, token: Token) -> Optional[Token]: """Find the next token, while skipping certain punctuation.""" while True: @@ -502,8 +507,8 @@ def next_with_skip(self, token: dd.Token) -> Optional[dd.Token]: return token def _match_surname( - self, doc: dd.Document, token: dd.Token - ) -> Optional[tuple[dd.Token, dd.Token]]: + self, doc: Document, token: Token + ) -> Optional[tuple[Token, Token]]: if doc.metadata["surname_pattern"] is None: doc.metadata["surname_pattern"] = self.tokenizer.tokenize( @@ -571,7 +576,7 @@ def annotate(self, doc: Document) -> list[Annotation]: start_token, end_token = match annotations.append( - dd.Annotation( + Annotation( text=doc.text[start_token.start_char : end_token.end_char], start_char=start_token.start_char, end_char=end_token.end_char, @@ -758,7 +763,7 @@ def annotate(self, doc: Document) -> list[Annotation]: class PhoneNumberAnnotator(dd.process.Annotator): """ Annotates phone numbers, based on a regexp and min and max number of digits. - Additionally employs some logic like detecting parentheses and hyphens. + Additionally, employs some logic like detecting parentheses and hyphens. Args: phone_regexp: The regexp to detect phone numbers. 
diff --git a/tests/unit/test_annotator.py b/tests/unit/test_annotator.py index bf973589..0d6eeca0 100644 --- a/tests/unit/test_annotator.py +++ b/tests/unit/test_annotator.py @@ -304,15 +304,14 @@ def test_apply_context_pattern(self, pattern_doc): ) assert annotator._apply_context_pattern( - pattern_doc.text, - annotations, - {}, + pattern_doc, { "pattern": [{"like_name": True}], "direction": "right", "pre_tag": "voornaam", "tag": "{tag}+naam", }, + annotations, ) == dd.AnnotationSet( [ dd.Annotation( @@ -341,15 +340,14 @@ def test_apply_context_pattern_left(self, pattern_doc): ) assert annotator._apply_context_pattern( - pattern_doc.text, - annotations, - {}, + pattern_doc, { "pattern": [{"like_name": True}], "direction": "left", "pre_tag": "achternaam", "tag": "naam+{tag}", }, + annotations, ) == dd.AnnotationSet( [ dd.Annotation( @@ -378,9 +376,7 @@ def test_apply_context_pattern_skip(self, pattern_doc): ) assert annotator._apply_context_pattern( - pattern_doc.text, - annotations, - {}, + pattern_doc, { "pattern": [{"like_name": True}], "direction": "right", @@ -388,6 +384,7 @@ def test_apply_context_pattern_skip(self, pattern_doc): "pre_tag": "achternaam", "tag": "{tag}+naam", }, + annotations, ) == dd.AnnotationSet( [ dd.Annotation( @@ -431,8 +428,7 @@ def test_annotate_multiple(self, pattern_doc): ] ) - assert (annotator._get_annotations(pattern_doc) == - dd.AnnotationSet( + assert annotator._get_annotations(pattern_doc) == dd.AnnotationSet( { dd.Annotation( text="Andries Meijer-Heerma", @@ -441,7 +437,7 @@ def test_annotate_multiple(self, pattern_doc): tag="voornaam+naam+naam", ) } - )) + ) def test_annotate_iterative(self, pattern_doc): pattern = [ @@ -469,8 +465,7 @@ def test_annotate_iterative(self, pattern_doc): ] ) - assert (annotator._get_annotations(pattern_doc) == - dd.AnnotationSet( + assert annotator._get_annotations(pattern_doc) == dd.AnnotationSet( { dd.Annotation( text="Andries Meijer-Heerma", @@ -479,7 +474,7 @@ def test_annotate_iterative(self, pattern_doc): tag="voornaam+naam+naam", ) } - )) + ) class TestPatientNameAnnotator: From ff0fd50ad86d56db703f34278a6b8277c2cd15b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Mon, 11 Mar 2024 12:47:56 +0100 Subject: [PATCH 29/39] Move `SequenceTokenizer` to Docdeid This is needed so as to reduce the number of arguments for the `_match_sequence` method and creates a cleaner inheritance hierarchy between annotators, too. 
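As a reviewer aid: with this change, a raw pattern dict from
`base_config.json` is parsed into the docdeid dataclasses roughly as sketched
below. This is distilled from the new `ContextAnnotator.__init__` in the diff
that follows (`SequencePattern` and `as_token_pattern` are imported there from
`docdeid.process.annotator`); the pattern values are illustrative, taken from
the tests:

```python
from docdeid.process.annotator import SequencePattern, as_token_pattern

raw = {
    "pre_tag": "voornaam",
    "tag": "{tag}+naam",
    "direction": "right",
    "pattern": [{"like_name": True}],
}

seq_pattern = SequencePattern(
    raw.get("direction", "right"),  # match to the left or right
    set(raw.get("skip", ())),       # token texts to skip over, e.g. {"-"}
    list(map(as_token_pattern, raw["pattern"])),  # dicts -> token patterns
)
```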
--- base_config.json | 14 +- deduce/annotator.py | 357 +++++------------------------------ deduce/deduce.py | 15 +- tests/unit/test_annotator.py | 123 +++--------- tests/unit/test_utils.py | 15 +- 5 files changed, 92 insertions(+), 432 deletions(-) diff --git a/base_config.json b/base_config.json index d1841ad7..bf1c3522 100644 --- a/base_config.json +++ b/base_config.json @@ -14,7 +14,7 @@ "redactor_close_char": "]", "annotators": { "prefix_with_initial": { - "annotator_type": "deduce.annotator.TokenPatternAnnotator", + "annotator_type": "docdeid.process.SequenceAnnotator", "group": "names", "args": { "tag": "prefix+initiaal", @@ -37,7 +37,7 @@ } }, "prefix_with_interfix": { - "annotator_type": "deduce.annotator.TokenPatternAnnotator", + "annotator_type": "docdeid.process.SequenceAnnotator", "group": "names", "args": { "tag": "prefix+interfix+naam", @@ -56,7 +56,7 @@ } }, "prefix_with_name": { - "annotator_type": "deduce.annotator.TokenPatternAnnotator", + "annotator_type": "docdeid.process.SequenceAnnotator", "group": "names", "args": { "tag": "prefix+naam", @@ -79,7 +79,7 @@ } }, "interfix_with_name": { - "annotator_type": "deduce.annotator.TokenPatternAnnotator", + "annotator_type": "docdeid.process.SequenceAnnotator", "group": "names", "args": { "tag": "interfix+achternaam", @@ -102,7 +102,7 @@ } }, "initial_with_name": { - "annotator_type": "deduce.annotator.TokenPatternAnnotator", + "annotator_type": "docdeid.process.SequenceAnnotator", "group": "names", "args": { "tag": "initiaal+naam", @@ -128,7 +128,7 @@ } }, "initial_interfix": { - "annotator_type": "deduce.annotator.TokenPatternAnnotator", + "annotator_type": "docdeid.process.SequenceAnnotator", "group": "names", "args": { "tag": "initiaal+interfix+naam", @@ -347,7 +347,7 @@ } }, "street_pattern": { - "annotator_type": "deduce.annotator.TokenPatternAnnotator", + "annotator_type": "docdeid.process.SequenceAnnotator", "group": "locations", "args": { "pattern": [ diff --git a/deduce/annotator.py b/deduce/annotator.py index 97834622..dcc3904e 100644 --- a/deduce/annotator.py +++ b/deduce/annotator.py @@ -3,310 +3,32 @@ import re import warnings from collections import defaultdict -from collections.abc import Iterable, Mapping -from typing import Any, Literal, Optional +from collections.abc import Iterable +from dataclasses import dataclass +from typing import Optional import docdeid as dd -from docdeid import Annotation, Document, Token, Tokenizer, TokenList -from docdeid.process import RegexpAnnotator +from docdeid import Annotation, Document, Token, Tokenizer +from docdeid.process import _DIRECTION_MAP, Annotator, RegexpAnnotator from deduce.utils import str_match +from docdeid.process.annotator import SequencePattern, SequenceAnnotator, \ + as_token_pattern warnings.simplefilter(action="default") -_DIRECTION_MAP = { - "left": { - "attr": "previous", - "order": reversed, - "start_token": lambda annotation: annotation.start_token, - }, - "right": { - "attr": "next", - "order": lambda pattern: pattern, - "start_token": lambda annotation: annotation.end_token, - }, -} - - -class _PatternPositionMatcher: # pylint: disable=R0903 - """Checks if a token matches against a single pattern.""" - - @classmethod - def match(cls, pattern_position: dict, **kwargs) -> bool: # pylint: disable=R0911 - """ - Matches a pattern position (a dict with one key). Other information should be - presented as kwargs. - - Args: - pattern_position: A dictionary with a single key, e.g. 
{'is_initial': True} - kwargs: Any other information, like the token or ds - - Returns: - True if the pattern position matches, false otherwise. - """ - if len(pattern_position) > 1: - raise ValueError( - f"Cannot parse token pattern ({pattern_position}) with more than 1 key" - ) - - func, value = next(iter(pattern_position.items())) - - if func == "equal": - return kwargs.get("token").text == value - if func == "re_match": - return re.match(value, kwargs.get("token").text) is not None - if func == "is_initial": - - warnings.warn( - "is_initial matcher pattern is deprecated and will be removed " - "in a future version", - DeprecationWarning, - ) - - return ( - ( - len(kwargs.get("token").text) == 1 - and kwargs.get("token").text[0].isupper() - ) - or kwargs.get("token").text in {"Ch", "Chr", "Ph", "Th"} - ) == value - if func == "is_initials": - return ( - len(kwargs.get("token").text) <= 4 - and kwargs.get("token").text.isupper() - ) == value - if func == "like_name": - return ( - len(kwargs.get("token").text) >= 3 - and kwargs.get("token").text.istitle() - and not any(ch.isdigit() for ch in kwargs.get("token").text) - ) == value - if func == "lookup": - return cls._lookup(value, **kwargs) - if func == "neg_lookup": - return not cls._lookup(value, **kwargs) - if func == "tag": - annos = kwargs.get("annos", ()) - return any(anno.tag == value for anno in annos) - if func == "and": - return all( - _PatternPositionMatcher.match(pattern_position=x, **kwargs) - for x in value - ) - if func == "or": - return any( - _PatternPositionMatcher.match(pattern_position=x, **kwargs) - for x in value - ) - - raise NotImplementedError(f"No known logic for pattern {func}") - - @classmethod - def _lookup(cls, ent_type: str, **kwargs) -> bool: - token = kwargs.get("token").text - if "." in ent_type: - meta_key, meta_attr = ent_type.split(".", 1) - try: - meta_val = getattr(kwargs["metadata"][meta_key], meta_attr) - except (TypeError, KeyError, AttributeError): - return False - else: - return ( - token == meta_val - if isinstance(meta_val, str) - else token in meta_val - ) - else: # pylint: disable=R1705 - return token in kwargs.get("ds")[ent_type] - - -class TokenPatternAnnotator(dd.process.Annotator): +@dataclass +class ContextPattern: """ - Annotates based on token patterns, which should be provided as a list of dicts. Each - position in the list denotes a token position, e.g.: [{'is_initial': True}, - {'like_name': True}] matches sequences of two tokens, where the first one is an - initial, and the second one is like a name. - - Arguments: - pattern: The pattern - ds: Any datastructures, that can be used for lookup or other logic - skip: Any string values that should be skipped in matching (e.g. periods) + Pattern for matching a sequence of tokens anchored on a certain starting tag. """ + pre_tag: Optional[set[str]] + tag: str + seq_pattern: SequencePattern - def __init__( - self, - pattern: list[dict], - *args, - ds: Optional[dd.ds.DsCollection] = None, - skip: Optional[list[str]] = None, - **kwargs, - ) -> None: - self.pattern = pattern - self.ds = ds - self.skip = set(skip or []) - - self._start_words = None - self._matching_pipeline = None - - if len(self.pattern) > 0 and "lookup" in self.pattern[0]: - - if self.ds is None: - raise RuntimeError( - "Created pattern with lookup in TokenPatternAnnotator, but " - "no lookup structures provided." 
- ) - - lookup_list = self.ds[self.pattern[0]["lookup"]] - - if not isinstance(lookup_list, dd.ds.LookupSet): - raise ValueError( - f"Expected a LookupSet, but got a " f"{type(lookup_list)}." - ) - - self._start_words = lookup_list.items() - self._matching_pipeline = lookup_list.matching_pipeline - - super().__init__(*args, **kwargs) - - @staticmethod - def _get_chained_token(token: Token, attr: str, skip: set[str]) -> Optional[Token]: - while True: - token = getattr(token, attr)() - - if token is None or token.text not in skip: - break - - return token - - def _match_sequence( - self, - text: str, - pattern: list[dict], - start_token: Token, - annos_by_token: defaultdict[Token, Iterable[Annotation]], - direction: Literal["left", "right"] = "right", - skip: Optional[set[str]] = None, - metadata: Optional[dict[str, Any]] = None, - ) -> Optional[Annotation]: - """ - Sequentially match a pattern against a specified start_token. - - Args: - text: The original document text. - pattern: The pattern to match. - start_token: The start token to match. - annos_by_token: Map from tokens to annotations covering it. - direction: The direction to match, choice of "left" or "right". - skip: Any string values that should be skipped in matching. - metadata: Document metadata (like the patient name). - - Returns: - An Annotation if matching is possible, None otherwise. - """ - - skip = skip or set() - - attr = _DIRECTION_MAP[direction]["attr"] - pattern = _DIRECTION_MAP[direction]["order"](pattern) - - current_token = start_token - end_token = start_token - - for pattern_position in pattern: - if current_token is None or not _PatternPositionMatcher.match( - pattern_position=pattern_position, - token=current_token, - annos=annos_by_token[current_token], - ds=self.ds, - metadata=metadata, - ): - return None - - end_token = current_token - current_token = TokenPatternAnnotator._get_chained_token( - current_token, attr, skip - ) - - start_token, end_token = _DIRECTION_MAP[direction]["order"]( - (start_token, end_token) - ) - - return Annotation( - text=text[start_token.start_char : end_token.end_char], - start_char=start_token.start_char, - end_char=end_token.end_char, - tag=self.tag, - priority=self.priority, - start_token=start_token, - end_token=end_token, - ) - - def annotate(self, doc: Document) -> list[Annotation]: - """ - Annotate the document, by matching the pattern against all tokens. - - Args: - doc: The document being processed. - - Returns: - A list of Annotation. - """ - - annotations = [] - - tokens = doc.get_tokens() - - if self._start_words is not None: - tokens = tokens.token_lookup( - lookup_values=self._start_words, - matching_pipeline=self._matching_pipeline, - ) - - annos_by_token = TokenPatternAnnotator._index_by_token( - doc.annotations, doc.token_lists - ) - for token in tokens: - - annotation = self._match_sequence( - doc.text, - self.pattern, - token, - annos_by_token, - direction="right", - skip=self.skip, - ) - - if annotation is not None: - annotations.append(annotation) - - return annotations - - # TODO Test. - @classmethod - def _index_by_token( - cls, - annotations: Iterable[Annotation], - token_lists: Mapping[str, TokenList], - ) -> defaultdict[Token, set[Annotation]]: - """Assigns existing annotations to tokens.""" - annos_by_token = defaultdict(set) - for token_list in token_lists.values(): - # TODO Improve efficiency, simplify. 
- for anno in annotations: - found_first = False - for token in token_list: - if anno.start_char < token.end_char: - found_first = True - if token.start_char >= anno.end_char: - break - if found_first: - annos_by_token[token].add(anno) - return annos_by_token - - -class ContextAnnotator(TokenPatternAnnotator): +class ContextAnnotator(Annotator): """ Extends existing annotations to the left or right, based on specified patterns. @@ -318,23 +40,33 @@ class ContextAnnotator(TokenPatternAnnotator): def __init__( self, + pattern: list[dict], # TODO Rename to "patterns" or similar. *args, ds: Optional[dd.ds.DsCollection] = None, iterative: bool = True, **kwargs, ) -> None: + self.ds = ds self.iterative = iterative - super().__init__(*args, **kwargs, ds=ds, tag="_") + self._patterns = [ + ContextPattern(pat['pre_tag'], + pat['tag'], + SequencePattern(pat.get('direction', "right"), + set(pat.get('skip', ())), + list(map(as_token_pattern, pat['pattern'])))) + for pat in pattern + ] + super().__init__(*args, **kwargs, tag='_') # XXX Not sure why exactly '_'. def _apply_context_pattern( self, doc: Document, - context_pattern: dict, + context_pattern: ContextPattern, orig_annos: Optional[dd.AnnotationSet] = None, ) -> dd.AnnotationSet: # TODO Maybe we should index all annotations here, not just the `new` ones. - annos_by_token = TokenPatternAnnotator._index_by_token( + annos_by_token = SequenceAnnotator._index_by_token( orig_annos, doc.token_lists ) @@ -346,31 +78,28 @@ def _apply_context_pattern( def _maybe_merge_anno( self, annotation: Annotation, - context_pattern: dict, + context_pattern: ContextPattern, doc: Document, annos_by_token: defaultdict[str, Iterable[Annotation]], ) -> Annotation: - direction = context_pattern["direction"] - skip = set(context_pattern.get("skip", [])) + direction = context_pattern.seq_pattern.direction + skip = context_pattern.seq_pattern.skip tag = list(_DIRECTION_MAP[direction]["order"](annotation.tag.split("+")))[-1] - if tag not in context_pattern["pre_tag"]: + if tag not in context_pattern.pre_tag: return annotation attr = _DIRECTION_MAP[direction]["attr"] - start_token = TokenPatternAnnotator._get_chained_token( + start_token = SequenceAnnotator._get_chained_token( _DIRECTION_MAP[direction]["start_token"](annotation), attr, skip ) - new_annotation = self._match_sequence( - doc.text, - context_pattern["pattern"], - start_token, - annos_by_token, - direction=direction, - skip=skip, - metadata=doc.metadata, - ) + + new_annotation = self._match_sequence(doc, + context_pattern.seq_pattern, + start_token, + annos_by_token, + self.ds) if not new_annotation: return annotation @@ -385,7 +114,7 @@ def _maybe_merge_anno( end_char=right_ann.end_char, start_token=left_ann.start_token, end_token=right_ann.end_token, - tag=context_pattern["tag"].format(tag=annotation.tag), + tag=context_pattern.tag.format(tag=annotation.tag), priority=annotation.priority, ) @@ -412,7 +141,7 @@ def _get_annotations( orig_annos = doc.annotations annotations = orig_annos.copy() - for context_pattern in self.pattern: + for context_pattern in self._patterns: annotations = self._apply_context_pattern(doc, context_pattern, annotations) if self.iterative and (new := dd.AnnotationSet(annotations - orig_annos)): @@ -829,3 +558,9 @@ def annotate(self, doc: Document) -> list[Annotation]: ) return annotations + + +# TODO Drop this. It's confusing given the other annotator of the same name defined +# in Docdeid. 
+# For sake of backward compatibility: +TokenPatternAnnotator = SequenceAnnotator \ No newline at end of file diff --git a/deduce/deduce.py b/deduce/deduce.py index 615c2bcf..6f3cee87 100644 --- a/deduce/deduce.py +++ b/deduce/deduce.py @@ -10,11 +10,11 @@ from pathlib import Path from typing import Any, Optional, Union +import docdeid as dd from deprecated import deprecated -from frozendict import frozendict -import docdeid as dd from docdeid.ds import LookupSet, LookupTrie +from frozendict import frozendict from deduce import utils from deduce.annotation_processor import ( @@ -23,7 +23,8 @@ PersonAnnotationConverter, RemoveAnnotations, ) -from deduce.annotator import ContextAnnotator, TokenPatternAnnotator +from deduce.annotator import ContextAnnotator +from docdeid.process.annotator import SequenceAnnotator, TokenPatternAnnotator from deduce.lookup_struct_loader import load_interfix_lookup, load_prefix_lookup from deduce.lookup_structs import get_lookup_structs, load_raw_itemsets from deduce.redactor import DeduceRedactor @@ -212,20 +213,20 @@ def _get_multi_token_annotator_old(self, *args, **kwargs) -> dd.process.Annotato @deprecated( "The token_pattern annotatortype is deprecated and will be removed in " "a future version. Please set annotator_type field to " - "deduce.annotator.TokenPatternAnnotator. See " + "docdeid.process.SequenceAnnotator. See " "https://github.com/vmenger/deduce/blob/main/base_config.json for " "examples." ) def _get_token_pattern_annotator(args: dict, extras: dict) -> dd.process.Annotator: - return TokenPatternAnnotator(**args, ds=extras["ds"]) + return SequenceAnnotator(**args, ds=extras["ds"]) @staticmethod @deprecated( "The dd_token_pattern annotatortype is deprecated and will be removed " "in a future version. For patient name patterns, please use " "deduce.annotator.PatientNameAnnotator. For other patterns, please " - "switch to deduce.annotator.TokenPatternAnnotator. See " + "switch to docdeid.process.SequenceAnnotator. See " "https://github.com/vmenger/deduce/blob/main/base_config.json for " "examples." 
) @@ -240,7 +241,7 @@ def _get_dd_token_pattern_annotator( pattern = utils.initialize_class(cls, args=pattern_args, extras=extras) - return dd.process.TokenPatternAnnotator(pattern=pattern) + return TokenPatternAnnotator(pattern=pattern) @staticmethod @deprecated( diff --git a/tests/unit/test_annotator.py b/tests/unit/test_annotator.py index 0d6eeca0..bebfd45b 100644 --- a/tests/unit/test_annotator.py +++ b/tests/unit/test_annotator.py @@ -8,11 +8,16 @@ from deduce.annotator import ( BsnAnnotator, ContextAnnotator, + ContextPattern, PatientNameAnnotator, PhoneNumberAnnotator, - RegexpPseudoAnnotator, - TokenPatternAnnotator, + RegexpPseudoAnnotator +) +from docdeid.process.annotator import ( + as_token_pattern, _PatternPositionMatcher, + SequencePattern, + SimpleTokenPattern, ) from deduce.person import Person from deduce.tokenizer import DeduceTokenizer @@ -92,6 +97,10 @@ def test_equal(self): assert _PatternPositionMatcher.match({"equal": "test"}, token=token("test")) assert not _PatternPositionMatcher.match({"equal": "_"}, token=token("test")) + def test_equal_with_dataclass(self): + assert _PatternPositionMatcher.match(SimpleTokenPattern("equal", "test"), + token=token("test")) + def test_re_match(self): assert _PatternPositionMatcher.match({"re_match": "[a-z]"}, token=token("abc")) assert _PatternPositionMatcher.match( @@ -207,85 +216,6 @@ def test_match_or(self): ) -class TestTokenPatternAnnotator: - def test_match_sequence(self, pattern_doc, ds): - pattern = [{"lookup": "first_names"}, {"like_name": True}] - - tpa = TokenPatternAnnotator(pattern=[{}], ds=ds, tag="_") - - assert tpa._match_sequence( - pattern_doc.text, - start_token=pattern_doc.get_tokens()[3], - pattern=pattern, - annos_by_token=defaultdict(list), - ) == dd.Annotation(text="Andries Meijer", start_char=12, end_char=26, tag="_") - assert ( - tpa._match_sequence( - pattern_doc.text, - start_token=pattern_doc.get_tokens()[7], - pattern=pattern, - annos_by_token=defaultdict(list), - ) - is None - ) - - def test_match_sequence_left(self, pattern_doc, ds): - pattern = [{"lookup": "first_names"}, {"like_name": True}] - - tpa = TokenPatternAnnotator(pattern=[{}], ds=ds, tag="_") - - assert tpa._match_sequence( - pattern_doc.text, - start_token=pattern_doc.get_tokens()[4], - pattern=pattern, - annos_by_token=defaultdict(list), - direction="left", - ) == dd.Annotation(text="Andries Meijer", start_char=12, end_char=26, tag="_") - - assert ( - tpa._match_sequence( - pattern_doc.text, - start_token=pattern_doc.get_tokens()[8], - annos_by_token=defaultdict(list), - direction="left", - pattern=pattern, - ) - is None - ) - - def test_match_sequence_skip(self, pattern_doc, ds): - pattern = [{"lookup": "surnames"}, {"like_name": True}] - - tpa = TokenPatternAnnotator(pattern=[{}], ds=ds, tag="_") - - assert tpa._match_sequence( - pattern_doc.text, - start_token=pattern_doc.get_tokens()[4], - pattern=pattern, - annos_by_token=defaultdict(list), - skip={"-"}, - ) == dd.Annotation(text="Meijer-Heerma", start_char=20, end_char=33, tag="_") - assert ( - tpa._match_sequence( - pattern_doc.text, - start_token=pattern_doc.get_tokens()[4], - pattern=pattern, - annos_by_token=defaultdict(list), - skip=set(), - ) - is None - ) - - def test_annotate(self, pattern_doc, ds): - pattern = [{"lookup": "first_names"}, {"like_name": True}] - - tpa = TokenPatternAnnotator(pattern=pattern, ds=ds, tag="_") - - assert tpa.annotate(pattern_doc) == [ - dd.Annotation(text="Andries Meijer", start_char=12, end_char=26, tag="_") - ] - - class 
TestContextAnnotator: def test_apply_context_pattern(self, pattern_doc): annotator = ContextAnnotator(pattern=[]) @@ -305,12 +235,10 @@ def test_apply_context_pattern(self, pattern_doc): assert annotator._apply_context_pattern( pattern_doc, - { - "pattern": [{"like_name": True}], - "direction": "right", - "pre_tag": "voornaam", - "tag": "{tag}+naam", - }, + ContextPattern("voornaam", + "{tag}+naam", + SequencePattern("right", set(), + [as_token_pattern({"like_name": True})])), annotations, ) == dd.AnnotationSet( [ @@ -341,12 +269,10 @@ def test_apply_context_pattern_left(self, pattern_doc): assert annotator._apply_context_pattern( pattern_doc, - { - "pattern": [{"like_name": True}], - "direction": "left", - "pre_tag": "achternaam", - "tag": "naam+{tag}", - }, + ContextPattern("achternaam", + "naam+{tag}", + SequencePattern("left", set(), + [as_token_pattern({"like_name": True})])), annotations, ) == dd.AnnotationSet( [ @@ -377,13 +303,10 @@ def test_apply_context_pattern_skip(self, pattern_doc): assert annotator._apply_context_pattern( pattern_doc, - { - "pattern": [{"like_name": True}], - "direction": "right", - "skip": ["-"], - "pre_tag": "achternaam", - "tag": "{tag}+naam", - }, + ContextPattern("achternaam", + "{tag}+naam", + SequencePattern("right", {"-"}, + [as_token_pattern({"like_name": True})])), annotations, ) == dd.AnnotationSet( [ diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py index 1bb70e71..3b7211c3 100644 --- a/tests/unit/test_utils.py +++ b/tests/unit/test_utils.py @@ -2,7 +2,8 @@ import pytest from deduce import utils -from deduce.annotator import TokenPatternAnnotator +from docdeid.process.annotator import SequenceAnnotator +from deduce.annotator import BsnAnnotator class TestStrMatch: @@ -28,17 +29,17 @@ def test_str_match_fuzzy(self): class TestClassForName: def test_class_for_name(self): assert ( - utils.class_for_name( - module_name="deduce.annotator", class_name="TokenPatternAnnotator" + utils.class_for_name( + module_name="deduce.annotator", class_name="BsnAnnotator" ) - == TokenPatternAnnotator + == BsnAnnotator ) class TestInitializeClass: def test_initialize_class(self): - cls = TokenPatternAnnotator + cls = SequenceAnnotator tag = "_" pattern = [{"key": "value"}] @@ -52,7 +53,7 @@ def test_initialize_class(self): def test_initialize_class_with_extras(self): - cls = TokenPatternAnnotator + cls = SequenceAnnotator tag = "_" pattern = [{"key": "value"}] @@ -66,7 +67,7 @@ def test_initialize_class_with_extras(self): assert annotator.tag == tag assert annotator.pattern == pattern - assert annotator.ds is ds + assert annotator.dicts is ds class TestOverwriteDict: From ece3fa9b1e57c4e04d7ffb61c4612bdff629a256 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Mon, 11 Mar 2024 19:57:59 +0100 Subject: [PATCH 30/39] Replace `_DIRECTION_MAP` with an enum --- deduce/annotator.py | 42 ++++++++++++++++++++---------------- tests/unit/test_annotator.py | 10 ++++++--- 2 files changed, 31 insertions(+), 21 deletions(-) diff --git a/deduce/annotator.py b/deduce/annotator.py index dcc3904e..392b7cd0 100644 --- a/deduce/annotator.py +++ b/deduce/annotator.py @@ -8,12 +8,15 @@ from typing import Optional import docdeid as dd -from docdeid import Annotation, Document, Token, Tokenizer -from docdeid.process import _DIRECTION_MAP, Annotator, RegexpAnnotator - from deduce.utils import str_match -from docdeid.process.annotator import SequencePattern, SequenceAnnotator, \ - as_token_pattern +from docdeid import Annotation, Document, Token, Tokenizer 
+from docdeid.direction import Direction +from docdeid.process import Annotator, RegexpAnnotator +from docdeid.process.annotator import ( + as_token_pattern, + SequenceAnnotator, + SequencePattern, +) warnings.simplefilter(action="default") @@ -51,9 +54,10 @@ def __init__( self._patterns = [ ContextPattern(pat['pre_tag'], pat['tag'], - SequencePattern(pat.get('direction', "right"), - set(pat.get('skip', ())), - list(map(as_token_pattern, pat['pattern'])))) + SequencePattern( + Direction.from_string(pat.get('direction', "right")), + set(pat.get('skip', ())), + list(map(as_token_pattern, pat['pattern'])))) for pat in pattern ] super().__init__(*args, **kwargs, tag='_') # XXX Not sure why exactly '_'. @@ -82,18 +86,22 @@ def _maybe_merge_anno( doc: Document, annos_by_token: defaultdict[str, Iterable[Annotation]], ) -> Annotation: - direction = context_pattern.seq_pattern.direction - skip = context_pattern.seq_pattern.skip - tag = list(_DIRECTION_MAP[direction]["order"](annotation.tag.split("+")))[-1] + dir_ = context_pattern.seq_pattern.direction + tag = list(dir_.iter(annotation.tag.split("+")))[-1] if tag not in context_pattern.pre_tag: return annotation - attr = _DIRECTION_MAP[direction]["attr"] - start_token = SequenceAnnotator._get_chained_token( - _DIRECTION_MAP[direction]["start_token"](annotation), attr, skip - ) + anno_start = (annotation.end_token if dir_ is Direction.RIGHT else + annotation.start_token) + skip = context_pattern.seq_pattern.skip + tokens = (token for token in anno_start.iter_to(dir_) + if token.text not in skip) + try: + start_token = next(tokens) + except StopIteration: + return annotation new_annotation = self._match_sequence(doc, context_pattern.seq_pattern, @@ -104,9 +112,7 @@ def _maybe_merge_anno( if not new_annotation: return annotation - left_ann, right_ann = _DIRECTION_MAP[direction]["order"]( - (annotation, new_annotation) - ) + left_ann, right_ann = dir_.iter((annotation, new_annotation)) return Annotation( text=doc.text[left_ann.start_char : right_ann.end_char], diff --git a/tests/unit/test_annotator.py b/tests/unit/test_annotator.py index bebfd45b..63c11472 100644 --- a/tests/unit/test_annotator.py +++ b/tests/unit/test_annotator.py @@ -13,6 +13,7 @@ PhoneNumberAnnotator, RegexpPseudoAnnotator ) +from docdeid.direction import Direction from docdeid.process.annotator import ( as_token_pattern, _PatternPositionMatcher, @@ -237,7 +238,8 @@ def test_apply_context_pattern(self, pattern_doc): pattern_doc, ContextPattern("voornaam", "{tag}+naam", - SequencePattern("right", set(), + SequencePattern(Direction.RIGHT, + set(), [as_token_pattern({"like_name": True})])), annotations, ) == dd.AnnotationSet( @@ -271,7 +273,8 @@ def test_apply_context_pattern_left(self, pattern_doc): pattern_doc, ContextPattern("achternaam", "naam+{tag}", - SequencePattern("left", set(), + SequencePattern(Direction.LEFT, + set(), [as_token_pattern({"like_name": True})])), annotations, ) == dd.AnnotationSet( @@ -305,7 +308,8 @@ def test_apply_context_pattern_skip(self, pattern_doc): pattern_doc, ContextPattern("achternaam", "{tag}+naam", - SequencePattern("right", {"-"}, + SequencePattern(Direction.RIGHT, + {"-"}, [as_token_pattern({"like_name": True})])), annotations, ) == dd.AnnotationSet( From bb7aa3c308b1ca69182d71225b3dc1d0baa3bc08 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Mon, 11 Mar 2024 22:16:11 +0100 Subject: [PATCH 31/39] Improve and test `annos_by_token()` --- deduce/annotator.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git 
a/deduce/annotator.py b/deduce/annotator.py
index 392b7cd0..5162bb04 100644
--- a/deduce/annotator.py
+++ b/deduce/annotator.py
@@ -66,13 +66,11 @@ def _apply_context_pattern(
         self,
         doc: Document,
         context_pattern: ContextPattern,
-        orig_annos: Optional[dd.AnnotationSet] = None,
+        orig_annos: dd.AnnotationSet,
     ) -> dd.AnnotationSet:
 
         # TODO Maybe we should index all annotations here, not just the `new` ones.
-        annos_by_token = SequenceAnnotator._index_by_token(
-            orig_annos, doc.token_lists
-        )
+        annos_by_token = orig_annos.annos_by_token(doc)
 
         return dd.AnnotationSet(
             self._maybe_merge_anno(anno, context_pattern, doc, annos_by_token)

From 481b23f0c0a0e021db7f8d56634901cb0c12f56e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?=
Date: Tue, 12 Mar 2024 12:33:41 +0100
Subject: [PATCH 32/39] Fix an off-by-one error in iterating context tokens

This bug sent Deduce into an endless loop: the iteration over candidate
context tokens started at the annotation's own boundary token instead of
the one past it, so a pattern could keep re-matching the same span. I
wonder whether an endless loop is still possible with carefully crafted
inputs; if so, that remains to be fixed.

---
 deduce/annotator.py | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/deduce/annotator.py b/deduce/annotator.py
index 5162bb04..42282b2f 100644
--- a/deduce/annotator.py
+++ b/deduce/annotator.py
@@ -5,6 +5,7 @@
 from collections import defaultdict
 from collections.abc import Iterable
 from dataclasses import dataclass
+from itertools import islice
 from typing import Optional
 
 import docdeid as dd
@@ -91,13 +92,14 @@ def _maybe_merge_anno(
         if tag not in context_pattern.pre_tag:
             return annotation
 
-        anno_start = (annotation.end_token if dir_ is Direction.RIGHT else
-                      annotation.start_token)
+        last_anno_token = (annotation.end_token if dir_ is Direction.RIGHT else
+                           annotation.start_token)
         skip = context_pattern.seq_pattern.skip
-        tokens = (token for token in anno_start.iter_to(dir_)
-                  if token.text not in skip)
+        following_tokens = islice(last_anno_token.iter_to(dir_), 1, None)
+        nonskip_tokens = (token for token in following_tokens
+                          if token.text not in skip)
         try:
-            start_token = next(tokens)
+            start_token = next(nonskip_tokens)
         except StopIteration:
             return annotation
 

From d259c0d2907e58c1a520af8a5bd06724c9a859e6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?=
Date: Fri, 12 Jul 2024 18:40:12 +0200
Subject: [PATCH 33/39] Document how to run tests better + cosmetics

---
 CONTRIBUTING.md          | 4 ++--
 deduce/lookup_structs.py | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 7ee48db5..08842170 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -22,7 +22,7 @@ Before starting, some things to consider:
 ## Running the tests
 
 ```bash
-pytest .
+poetry run pytest .
 ```
 
 ## PR checklist
@@ -41,4 +41,4 @@ If all the steps above are followed, this ensures a quick review and release of
 * Create a [release on Github](https://github.com/vmenger/docdeid/releases/new), create a tag with the right version, manually copy and paste from the changelog.
 * Build pipeline and release to PyPI trigger automatically on release.
 
-Any other questions/issues not covered here? Please just get in touch!
\ No newline at end of file
+Any other questions/issues not covered here? Please just get in touch!
diff --git a/deduce/lookup_structs.py b/deduce/lookup_structs.py
index b116fd41..720294f1 100644
--- a/deduce/lookup_structs.py
+++ b/deduce/lookup_structs.py
@@ -102,7 +102,7 @@ def load_raw_itemsets(base_path: Path, subdirs: list[str]) -> dict[str, set[str]
         subdirs: The lists to load.
 
     Returns:
-        The raw itemsetes, represented as a dictionary mapping the name of the
+        The raw itemsets, represented as a dictionary mapping the name of the
         lookup list to a set of strings.
     """
 

From 82c04edc3abbbf67cc6c9b956b82222122e17b87 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?=
Date: Fri, 12 Jul 2024 19:10:30 +0200
Subject: [PATCH 34/39] Resolve minor issues from the merge

---
 deduce/deduce.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/deduce/deduce.py b/deduce/deduce.py
index 0f890fe4..f207e9ad 100644
--- a/deduce/deduce.py
+++ b/deduce/deduce.py
@@ -6,11 +6,11 @@
 import logging
 import os
 import sys
+import warnings
 from pathlib import Path
 from typing import Any, Optional, Union
 
 import docdeid as dd
-from deprecated import deprecated
 from docdeid.ds import LookupSet, LookupTrie
 from frozendict import frozendict
 
@@ -56,6 +56,8 @@ class Deduce(dd.DocDeid):  # pylint: disable=R0903
         are overwritten, and other defaults are kept. When `load_base_config` is set
         to `False`, no defaults are loaded and only configuration from `config` is
         applied.
+        config_file: (Deprecated!) Same as `config` but it's expected to be a `str`
+            holding the path to the config JSON file.
         lookup_data_path: The path to look for lookup data, by default included in
         the package. If you want to make changes to source files, it's recommended
         to copy the source data and pointing deduce to this folder with this

From 1cc4bda3d3bce5cead560c33211e3df4340b2578 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?=
Date: Mon, 28 Oct 2024 15:41:46 +0100
Subject: [PATCH 35/39] Pass older annotations to iterations of the context
 annotator

This is a change I had locally that surfaced during merge-conflict
resolution. It may be necessary for some tests to pass (and, by extension,
for the system to work correctly at runtime, too).

---
 deduce/annotator.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/deduce/annotator.py b/deduce/annotator.py
index 6a0b6d1f..87e9b6b7 100644
--- a/deduce/annotator.py
+++ b/deduce/annotator.py
@@ -154,7 +154,8 @@ def _get_annotations(
             # XXX Are we sure that other annotations than `new` don't matter anymore
             # to the operation of the `_get_annotations` method?
             annotations = dd.AnnotationSet(
-                (annotations - new) | self._get_annotations(doc, new)
+                (annotations - new) |
+                self._get_annotations(doc, dd.AnnotationSet(annotations | new))
             )
 
         return annotations

From 6bd6ce4dec9e8c860628dee5fa9bb664cabde437 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?=
Date: Wed, 8 Jan 2025 14:18:59 +0100
Subject: [PATCH 36/39] Move `annos_by_token` to `Document`

---
 deduce/annotator.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/deduce/annotator.py b/deduce/annotator.py
index 87e9b6b7..0f4a7388 100644
--- a/deduce/annotator.py
+++ b/deduce/annotator.py
@@ -71,7 +71,7 @@ def _apply_context_pattern(
     ) -> dd.AnnotationSet:
 
         # TODO Maybe we should index all annotations here, not just the `new` ones.
- annos_by_token = orig_annos.annos_by_token(doc) + annos_by_token = doc.annos_by_token(orig_annos) return dd.AnnotationSet( self._maybe_merge_anno(anno, context_pattern, doc, annos_by_token) From 392e21d9f1489b5f68e736d21cbc1fa50ebdf0ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Wed, 8 Jan 2025 15:48:43 +0100 Subject: [PATCH 37/39] Rename `SequenceAnnotator.dicts` to `ds` --- tests/unit/test_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py index 3b7211c3..e901fd7a 100644 --- a/tests/unit/test_utils.py +++ b/tests/unit/test_utils.py @@ -67,7 +67,7 @@ def test_initialize_class_with_extras(self): assert annotator.tag == tag assert annotator.pattern == pattern - assert annotator.dicts is ds + assert annotator.ds is ds class TestOverwriteDict: From 34a6716beeb4ad89133dfb49ef920a752f9592cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Wed, 8 Jan 2025 16:33:18 +0100 Subject: [PATCH 38/39] Replace `list(map(f, xs))` with list comprehension --- deduce/annotator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deduce/annotator.py b/deduce/annotator.py index 0f4a7388..166d9f87 100644 --- a/deduce/annotator.py +++ b/deduce/annotator.py @@ -58,7 +58,7 @@ def __init__( SequencePattern( Direction.from_string(pat.get('direction', "right")), set(pat.get('skip', ())), - list(map(as_token_pattern, pat['pattern'])))) + [as_token_pattern(it) for it in pat['pattern']])) for pat in pattern ] super().__init__(*args, **kwargs, tag='_') # XXX Not sure why exactly '_'. From 02af57b5b2cb5879c0c52687902123ead981cdf3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Korvas?= Date: Wed, 8 Jan 2025 18:02:48 +0100 Subject: [PATCH 39/39] Re-add `MultiTokenLookupAnnotator` accepting a `LookupSet` --- deduce/deduce.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/deduce/deduce.py b/deduce/deduce.py index 0d18cde4..496b0afb 100644 --- a/deduce/deduce.py +++ b/deduce/deduce.py @@ -175,7 +175,7 @@ def _get_multi_token_annotator(args: dict, extras: dict) -> dd.process.Annotator lookup_struct = extras["ds"][args.pop("lookup_values")] if isinstance(lookup_struct, LookupTrie): - lookup_trie = lookup_struct + return dd.process.MultiTokenTrieAnnotator(trie=lookup_struct, **args) elif isinstance(lookup_struct, LookupSet): try: tokenizer = args["tokenizer"] @@ -189,17 +189,16 @@ def _get_multi_token_annotator(args: dict, extras: dict) -> dd.process.Annotator "a tokenizer must be given." ) - lookup_trie = LookupTrie(matching_pipeline=lookup_struct.matching_pipeline) - for phrase in filter(None, map(tokenizer.tokenize, lookup_struct)): - lookup_trie.add_item([token.text for token in phrase]) + return dd.process.MultiTokenLookupAnnotator( + lookup_values=lookup_struct, + matching_pipeline=lookup_struct.matching_pipeline, + tokenizer=tokenizer) else: raise ValueError( f"Don't know how to present lookup structure with type " f"{type(lookup_struct)} to MultiTokenLookupAnnotator" ) - return dd.process.MultiTokenLookupAnnotator(trie=lookup_trie, **args) - @staticmethod def _get_annotator_from_class( annotator_type: str, args: dict, extras: dict