41 commits
59a696e
State in the readme that this is a fork
matej-ibis-ai Feb 29, 2024
4947a91
Log progress of loading lookup structs
matej-ibis-ai Feb 29, 2024
d7a68ad
Titlecase all-caps street names
matej-ibis-ai Mar 1, 2024
7c9abb8
Enable skipping of pickling lookup structs
matej-ibis-ai Mar 4, 2024
535999e
Proofread CONTRIBUTING.md
matej-ibis-ai Mar 4, 2024
026218f
Test with all-caps "IJSWEG"
matej-ibis-ai Mar 4, 2024
6062d46
Titlecase also when loading resources
matej-ibis-ai Mar 4, 2024
712ebe4
Make `TestLookupStruct` work regardless of cwd
matej-ibis-ai Mar 4, 2024
c5bc0b6
Use `pytest-datadir` for data fixtures
matej-ibis-ai Mar 4, 2024
9ebca78
Minimize data fixtures for tests
matej-ibis-ai Mar 4, 2024
7924ff3
Make `ensure_path` a plain util function
matej-ibis-ai Mar 4, 2024
f2d9675
Add a rant (a FIXME) about transformations
matej-ibis-ai Mar 5, 2024
7f46345
Reproduce the "de Quervain" issue
matej-ibis-ai Mar 5, 2024
3620b08
Test overzealous matching of patients
matej-ibis-ai Mar 5, 2024
cabbeca
Label only entities with all pat subtags as pat
matej-ibis-ai Mar 5, 2024
3b7c20c
Add a more extensive test for patient name
matej-ibis-ai Mar 6, 2024
1eadb3c
Titlecase patient names when matching
matej-ibis-ai Mar 6, 2024
010840b
Make (patient, persoon) become persoon
matej-ibis-ai Mar 6, 2024
4ec5836
Retain first-/surname distinction longer...
matej-ibis-ai Mar 6, 2024
ca4977f
Fix tests for `PersonAnnotationConverter`
matej-ibis-ai Mar 6, 2024
8710bcd
Update documentation slightly
matej-ibis-ai Mar 6, 2024
bf9a910
Enable matching tags of tokens
matej-ibis-ai Mar 6, 2024
27c27ce
(Almost) automatically format code
matej-ibis-ai Mar 7, 2024
6a8d1f5
Address issues reported by Flake8
matej-ibis-ai Mar 7, 2024
0c10ba4
Simplify `MultiTokenLookupAnnotator`...
matej-ibis-ai Mar 7, 2024
7cc1538
Properly assign priority to patient v. other tags
matej-ibis-ai Mar 7, 2024
f299b14
Make pylint happier
matej-ibis-ai Mar 8, 2024
21546da
Reduce num of args of `_apply_context_pattern`
matej-ibis-ai Mar 11, 2024
ff0fd50
Move `SequenceTokenizer` to Docdeid
matej-ibis-ai Mar 11, 2024
ece3fa9
Replace `_DIRECTION_MAP` with an enum
matej-ibis-ai Mar 11, 2024
bb7aa3c
Improve and test `annos_by_token()`
matej-ibis-ai Mar 11, 2024
481b23f
Fix a one-off error in iterating context tokens
matej-ibis-ai Mar 12, 2024
d259c0d
Document how to run tests better + cosmetics
matej-ibis-ai Jul 12, 2024
043618b
Merge remote-tracking branch 'origin/main'
matej-ibis-ai Jul 12, 2024
82c04ed
Resolve minor issues from the merge
matej-ibis-ai Jul 12, 2024
1cc4bda
Pass older annotations to iterations of the conx annor
matej-ibis-ai Oct 28, 2024
d772c9c
Merge changes from Deduce-3.0.3
matej-ibis-ai Oct 28, 2024
6bd6ce4
Move `annos_by_token` to `Document`
matej-ibis-ai Jan 8, 2025
392e21d
Rename `SequenceAnnotator.dicts` to `ds`
matej-ibis-ai Jan 8, 2025
34a6716
Replace `list(map(f, xs))` with list comprehension
matej-ibis-ai Jan 8, 2025
02af57b
Re-add `MultiTokenLookupAnnotator` accepting a `LookupSet`
matej-ibis-ai Jan 8, 2025
42 changes: 21 additions & 21 deletions CONTRIBUTING.md
@@ -3,42 +3,42 @@
Thanks for considering making an addition to this project! These contributing guidelines should help make your life easier.

Before starting, some things to consider:
* For larger features, it would be helpful to get in touch first (through issue/email)
* For larger features, it would be helpful to get in touch first (through issue/email).
* A lot of the logic is in `docdeid`, please consider making a PR there for things that are not specific to `deduce`.
* `deduce` is a rule-based de-identifier
* In case you would like to see any rules added/removed/changed, a decent substantiation (with examples) of the potential improvement is useful
* `deduce` is a rule-based de-identifier.
* In case you would like to see any rules added/removed/changed, a decent substantiation (with examples) of the potential improvement is useful.

## Setting up the environment

* This project uses poetry for package management. Install it with ```pip install poetry```
* Set up the environment is easy, just use ```poetry install```
* This project uses poetry for package management. Install it with ``pip install poetry``.
* Setting up the environment is easy, just use ``poetry install``.
* The makefile contains some useful commands when developing:
* `make format` formats the package code
* `make lint` runs the linters (check the output)
* `make clean` removes build/test artifacts, etc
* `make format` formats the package code;
* `make lint` runs the linters (check the output);
* `make clean` removes build/test artifacts, etc.
* And for docs:
* `make build-docs` builds the docs
* `make build-docs` builds the docs.

## Runing the tests
## Running the tests

```bash
pytest .
poetry run pytest .
```

## PR checlist
## PR checklist

* Verify that tests are passing
* Verify that tests are updated/added according to changes
* Run the formatters (`make format`)
* Run the linters (`make lint`)
* Add a section to the changelog
* Add a description to your PR
* Verify that tests are passing.
* Verify that tests are updated/added according to changes.
* Run the formatters (`make format`).
* Run the linters (`make lint`).
* Add a section to the changelog.
* Add a description to your PR.

If all the steps above are followed, this ensures a quick review and release of your contribution.

## Releasing
* Readthedocs has a webhook connected to pushes on the main branch. It will trigger and update automatically.
* Create a [release on github](https://github.com/vmenger/docdeid/releases/new), create a tag with the right version, manually copy and paste from the changelog
* Build pipeline and release to PyPi trigger automatically on release
* Create a [release on Github](https://github.com/vmenger/docdeid/releases/new), create a tag with the right version, manually copy and paste from the changelog.
* Build pipeline and release to PyPI trigger automatically on release.

Any other questions/issues not covered here? Please just get in touch!
Any other questions/issues not covered here? Please just get in touch!
14 changes: 13 additions & 1 deletion README.md
@@ -7,6 +7,18 @@
![license](https://img.shields.io/github/license/vmenger/deduce)
[![black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)

# About this fork

This is Matěj Korvas's fork of the original Deduce tool, available at
https://github.com/vmenger/deduce, forked on 2024-02-29. The latest version
available here has some extra or different functionality on top of the
original tool -- which is maybe obvious but the license requires me to
state it clearly.

Use at your own risk.

Original Readme documentation follows.

# deduce

> Deduce 3.0.0 is out! It is way more accurate, and faster too. It's fully backward compatible, but some functionality is scheduled for removal, read more about it here: [docs/migrating-to-v3](https://deduce.readthedocs.io/en/latest/migrating.html)
@@ -141,4 +153,4 @@ For setting up the dev environment and contributing guidelines, see: [docs/contr

## License

This project is licensed under the GNU General Public License v3.0 - see the [LICENSE.md](LICENSE.md) file for details
This project is licensed under the GNU General Public License v3.0 - see the [LICENSE.md](LICENSE.md) file for details
46 changes: 34 additions & 12 deletions base_config.json
@@ -14,7 +14,7 @@
"redactor_close_char": "]",
"annotators": {
"prefix_with_initial": {
"annotator_type": "deduce.annotator.TokenPatternAnnotator",
"annotator_type": "docdeid.process.SequenceAnnotator",
"group": "names",
"args": {
"tag": "prefix+initiaal",
@@ -37,7 +37,7 @@
}
},
"prefix_with_interfix": {
"annotator_type": "deduce.annotator.TokenPatternAnnotator",
"annotator_type": "docdeid.process.SequenceAnnotator",
"group": "names",
"args": {
"tag": "prefix+interfix+naam",
@@ -56,7 +56,7 @@
}
},
"prefix_with_name": {
"annotator_type": "deduce.annotator.TokenPatternAnnotator",
"annotator_type": "docdeid.process.SequenceAnnotator",
"group": "names",
"args": {
"tag": "prefix+naam",
@@ -79,7 +79,7 @@
}
},
"interfix_with_name": {
"annotator_type": "deduce.annotator.TokenPatternAnnotator",
"annotator_type": "docdeid.process.SequenceAnnotator",
"group": "names",
"args": {
"tag": "interfix+achternaam",
@@ -102,7 +102,7 @@
}
},
"initial_with_name": {
"annotator_type": "deduce.annotator.TokenPatternAnnotator",
"annotator_type": "docdeid.process.SequenceAnnotator",
"group": "names",
"args": {
"tag": "initiaal+naam",
@@ -128,7 +128,7 @@
}
},
"initial_interfix": {
"annotator_type": "deduce.annotator.TokenPatternAnnotator",
"annotator_type": "docdeid.process.SequenceAnnotator",
"group": "names",
"args": {
"tag": "initiaal+interfix+naam",
@@ -177,6 +177,32 @@
"args": {
"iterative": true,
"pattern": [
{
"name": "patient_left",
"direction": "left",
"pre_tag": [
"achternaam_patient"
],
"tag": "voornaam_patient+achternaam_patient",
"pattern": [
{
"tag": "vornaam_patient"
}
]
},
{
"name": "patient_right",
"direction": "right",
"pre_tag": [
"voornaam_patient"
],
"tag": "voornaam_patient+achternaam_patient",
"pattern": [
{
"tag": "achternaam_patient"
}
]
},
{
"name": "interfix_right",
"direction": "right",
@@ -295,11 +321,7 @@
"skip": ["."],
"pattern": [
{
"and": [
{
"lookup": "prefix"
}
]
"lookup": "prefix"
}
]
}
@@ -325,7 +347,7 @@
}
},
"street_pattern": {
"annotator_type": "deduce.annotator.TokenPatternAnnotator",
"annotator_type": "docdeid.process.SequenceAnnotator",
"group": "locations",
"args": {
"pattern": [
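
The new `patient_left` and `patient_right` context patterns above extend an existing patient-name annotation when the neighbouring annotation carries the complementary tag, producing the combined tag `voornaam_patient+achternaam_patient`. Below is a minimal, self-contained sketch of that merging behaviour; the `Span` dataclass, the `extend_patient_name` helper, and the example name are illustrative assumptions, not the actual docdeid/deduce API.

```python
# Illustrative sketch of the behaviour described by the "patient_left" /
# "patient_right" context patterns above; not the real docdeid/deduce API.
from dataclasses import dataclass


@dataclass
class Span:
    text: str
    tag: str


def extend_patient_name(spans: list[Span]) -> list[Span]:
    """Merge adjacent (voornaam_patient, achternaam_patient) spans into one."""
    merged: list[Span] = []
    i = 0
    while i < len(spans):
        cur = spans[i]
        nxt = spans[i + 1] if i + 1 < len(spans) else None
        if (
            nxt is not None
            and cur.tag == "voornaam_patient"
            and nxt.tag == "achternaam_patient"
        ):
            # The combined span gets the joint tag, as in the config above.
            merged.append(
                Span(f"{cur.text} {nxt.text}", "voornaam_patient+achternaam_patient")
            )
            i += 2
        else:
            merged.append(cur)
            i += 1
    return merged


# Hypothetical example: a first name followed by the patient's surname is
# merged into a single span tagged "voornaam_patient+achternaam_patient".
print(extend_patient_name([Span("Jan", "voornaam_patient"),
                           Span("Jansen", "achternaam_patient")]))
```
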
107 changes: 75 additions & 32 deletions deduce/annotation_processor.py
@@ -6,9 +6,16 @@


class DeduceMergeAdjacentAnnotations(dd.process.MergeAdjacentAnnotations):
"""Merge adjacent tags, according to deduce logic: adjacent annotations with mixed
patient/person tags are replaced with a patient annotation, in other cases only
annotations with equal tags are considered adjacent."""
"""
Merges adjacent tags, according to Deduce logic:

- adjacent annotations with mixed patient/person tags are replaced
with the "persoon" annotation;
- adjacent annotations with patient tags of which one is the surname
are replaced with the "patient" annotation; and
- adjacent annotations with other patient tags are replaced with
the "part_of_patient" annotation.
"""

def _tags_match(self, left_tag: str, right_tag: str) -> bool:
"""
@@ -23,10 +30,15 @@ def _tags_match(self, left_tag: str, right_tag: str) -> bool:
``True`` if tags match, ``False`` otherwise.
"""

return (left_tag == right_tag) or {left_tag, right_tag} == {
"patient",
"persoon",
}
patient_part = [tag.endswith("_patient") for tag in (left_tag, right_tag)]
# FIXME Ideally, we should be first looking for a `*_patient` tag in
# both directions and only failing that, merge with an adjacent
# "persoon" tag.
return (
left_tag == right_tag
or all(patient_part)
or (patient_part[0] and right_tag == "persoon")
)

def _adjacent_annotations_replacement(
self,
@@ -37,14 +49,22 @@ def _adjacent_annotations_replacement(
"""
Replace two annotations that have equal tags with a new annotation.

If one of the two annotations has the patient tag, the new annotation will also
be tagged patient. In other cases, the tags are already equal.
If one of the two annotations has the "patient" tag (and the other is either
"patient" or "persoon"), the other annotation will be used. In other cases, the
tags are always equal.
"""

if left_annotation.tag != right_annotation.tag:
replacement_tag = "patient"
else:
replacement_tag = left_annotation.tag
ltag = left_annotation.tag
rtag = right_annotation.tag
replacement_tag = (
ltag
if ltag == rtag
else "persoon"
if rtag == "persoon"
else "patient"
if any(tag.startswith("achternaam") for tag in (ltag, rtag))
else "part_of_patient"
)

return dd.Annotation(
text=text[left_annotation.start_char : right_annotation.end_char],
@@ -59,20 +79,28 @@ class PersonAnnotationConverter(dd.process.AnnotationProcessor):
Responsible for processing the annotations produced by all name annotators (regular
and context-based).

Any overlap with annotations that are contain "pseudo" in their tag are removed, as
are those annotations. Then resolves overlap between remaining annotations, and maps
the tags to either "patient" or "persoon", based on whether "patient" is in the tag
(e.g. voornaam_patient => patient, achternaam_onbekend => persoon).
Any overlap with annotations that contain "pseudo" in their tag is removed, as are
those annotations. Then resolves overlap between remaining annotations, and maps the
tags to either "patient" or "persoon", based on whether "patient" is in all
constituent tags (e.g. voornaam_patient+achternaam_patient => patient,
achternaam_onbekend => persoon).
"""

def __init__(self) -> None:
def map_tag_to_prio(tag: str) -> int:
if "pseudo" in tag:
return 0
if "patient" in tag:
return 1

return 2
def map_tag_to_prio(tag: str) -> tuple[int, int, int]:
"""
Maps from the tag of a mention to its priority. The lower, the higher
priority.

The return value is a tuple of:
1. Is this a pseudo tag? If it is, it's a priority.
2. How many subtags does the tag have? The more, the higher priority.
3. Is this a patient tag? If it is, it's a priority.
"""
is_pseudo = "pseudo" in tag
num_subtags = tag.count("+") + 1
is_patient = tag.count("patient") == num_subtags
return (-int(is_pseudo), -num_subtags, -int(is_patient))

self._overlap_resolver = dd.process.OverlapResolver(
sort_by=("tag", "length"),
@@ -89,15 +117,30 @@ def process_annotations(
annotations, text=text
)

return dd.AnnotationSet(
real_annos = (
anno
for anno in new_annotations
if "pseudo" not in anno.tag and anno.text.strip()
)
with_patient = (
dd.Annotation(
text=annotation.text,
start_char=annotation.start_char,
end_char=annotation.end_char,
tag="patient" if "patient" in annotation.tag else "persoon",
text=anno.text,
start_char=anno.start_char,
end_char=anno.end_char,
tag=PersonAnnotationConverter._resolve_tag(anno.tag),
)
for annotation in new_annotations
if ("pseudo" not in annotation.tag and len(annotation.text.strip()) != 0)
for anno in real_annos
)
return dd.AnnotationSet(with_patient)

@classmethod
def _resolve_tag(cls, tag: str) -> str:
if "+" not in tag:
return tag if "patient" in tag else "persoon"
return (
"patient"
if all("patient" in part for part in tag.split("+"))
else "persoon"
)


@@ -114,7 +157,7 @@


class CleanAnnotationTag(dd.process.AnnotationProcessor):
"""Cleans annotation tags based on the corresponding mapping."""
"""Renames tags using a mapping."""

def __init__(self, tag_map: dict[str, str]) -> None:
self.tag_map = tag_map
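
To make the new priority and tag-resolution rules in `PersonAnnotationConverter` concrete, here is a small standalone sketch that mirrors the `map_tag_to_prio` and `_resolve_tag` logic from the diff above on a few example tags. The standalone copies are assumed to behave like the methods in the module; this is an illustration, not an import of the real code.

```python
# Standalone illustration of the tag priority and resolution logic added in
# deduce/annotation_processor.py above (not the real module).

def map_tag_to_prio(tag: str) -> tuple[int, int, int]:
    # Lower tuples sort first: pseudo tags win, then tags with more subtags,
    # then tags whose subtags all refer to the patient.
    is_pseudo = "pseudo" in tag
    num_subtags = tag.count("+") + 1
    is_patient = tag.count("patient") == num_subtags
    return (-int(is_pseudo), -num_subtags, -int(is_patient))


def resolve_tag(tag: str) -> str:
    # An annotation resolves to "patient" only if every subtag is a patient tag.
    if "+" not in tag:
        return tag if "patient" in tag else "persoon"
    return (
        "patient"
        if all("patient" in part for part in tag.split("+"))
        else "persoon"
    )


tags = [
    "voornaam_patient+achternaam_patient",   # all subtags patient -> patient
    "voornaam_patient+achternaam_onbekend",  # mixed subtags       -> persoon
    "achternaam_onbekend",                   # single non-patient  -> persoon
]
# Sorting by map_tag_to_prio puts the fully-patient, multi-subtag tag first.
for t in sorted(tags, key=map_tag_to_prio):
    print(t, "->", resolve_tag(t))
```
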