From ad477e309ac35ae9bff84d3ed94ea23e8f547f35 Mon Sep 17 00:00:00 2001
From: Fabian Rosenthal <frosenthal@cc.systems>
Date: Tue, 23 Sep 2025 08:53:14 +0200
Subject: [PATCH 1/5] chore(typing): Disallow untyped defs

---
 epidoc_parser/api.py       | 92 ++++++++++++++++++++------------------
 epidoc_parser/body.py      | 12 +++--
 epidoc_parser/header.py    | 20 +++++----
 epidoc_parser/normalize.py | 14 +++++-
 mypy.ini                   |  2 +
 5 files changed, 82 insertions(+), 58 deletions(-)
 create mode 100644 mypy.ini
diff --git a/epidoc_parser/api.py b/epidoc_parser/api.py
index 87a96a7..17a4585 100644
--- a/epidoc_parser/api.py
+++ b/epidoc_parser/api.py
@@ -1,10 +1,10 @@
-from typing import Optional
+from typing import Optional, Self, TextIO
 
 from bs4 import BeautifulSoup
 
 from .body import _Edition, _Head
 from .header import _History, _ProfileDesc
-from .normalize import _normalize, _normalized_get_text
+from .normalize import _normalize, _normalized_get_text, _must_find_sub_tag
 
 
 class EpiDoc:
@@ -14,10 +14,10 @@ class EpiDoc:
     authority: Optional[str] = None
     availability: Optional[str] = None
     material = None
-    origin_dates: list[str] = []
+    origin_dates: list[dict[str, str]] = []
     origin_place: dict[str, str] = {}
-    provenances: dict[str, str] = {}
-    terms: list[str] = []
+    provenances: dict[str, list[dict[str, str]]] = {}
+    terms: list[dict[str, str]] = []
     languages: dict[str, str] = {}
     commentary = None
     edition_language = None
@@ -28,22 +28,22 @@ class EpiDoc:
     @classmethod
     def create(
         cls,
-        title,
-        idno,
-        authority=None,
-        availability=None,
-        material=None,
-        origin_dates=None,
-        origin_place=None,
-        provenances=None,
-        terms=None,
-        languages=None,
-        commentary=None,
-        edition_language=None,
-        edition_foreign_languages=None,
-        reprint_from=None,
-        reprint_in=None,
-    ):
+        title: str,
+        idno: dict[str, str],
+        authority: Optional[str] = None,
+        availability: Optional[str] = None,
+        material: Optional[str] = None,
+        origin_dates: Optional[list[dict[str, str]]] = None,
+        origin_place: Optional[dict[str, str]] = None,
+        provenances: Optional[dict[str, list[dict[str, str]]]] = None,
+        terms: Optional[list[dict[str, str]]] = None,
+        languages: Optional[dict[str, str]] = None,
+        commentary: Optional[str] = None,
+        edition_language: Optional[str] = None,
+        edition_foreign_languages: Optional[dict[str, int]] = None,
+        reprint_from: Optional[list[str]] = None,
+        reprint_in: Optional[list[str]] = None,
+    ) -> Self:
         h = cls()
         h.title = title
         h.idno = idno
@@ -72,23 +72,24 @@ def create(
             h.reprint_in = reprint_in
         return h
 
-    def __repr__(self):
+    def __repr__(self) -> str:
         return f'<EpiDoc "{self.title}">'
 
 
-def load(fp):
+def load(fp: TextIO) -> EpiDoc:
     return loads(fp.read())
 
 
-def loads(s):
+def loads(s: str) -> EpiDoc:
     soup = BeautifulSoup(s, features="lxml")
     doc = EpiDoc()
 
-    teiheader = soup.teiheader
-    filedesc = teiheader.filedesc
-    doc.title = filedesc.titlestmt.title.getText()
+    teiheader = _must_find_sub_tag(soup, "teiheader")
+    filedesc = _must_find_sub_tag(teiheader, "filedesc")
+    title = _must_find_sub_tag(filedesc, "titlestmt", "title")
+    doc.title = title.getText()
     idnos = {}
-    publication_stmt = filedesc.publicationstmt
+    publication_stmt = _must_find_sub_tag(filedesc, "publicationstmt")
     for idno in publication_stmt.find_all("idno"):
         typ = _normalize(idno.attrs.get("type"))
         value = _normalize(idno.getText())
@@ -99,35 +100,40 @@ def loads(s):
     authority = publication_stmt.find("authority")
     if authority:
         doc.authority = _normalized_get_text(authority)
-    availability = publication_stmt.find("availability")
+    availability = publication_stmt.availability
     if availability:
         availability_text = _normalized_get_text(availability)
         license = availability.find("ref", type="license")
-        if license:
+        if availability_text and isinstance(license, Tag):
             license_target = license.attrs.get("target")
             if license_target:
                 availability_text += f" {license_target}"
         doc.availability = availability_text
 
-    msdesc = filedesc.sourcedesc.msdesc
-    if msdesc:
-        physdesc = msdesc.physdesc
-        if physdesc:
-            support = physdesc.objectdesc.support
-            if hasattr(support, "material"):
-                doc.material = _normalize(_normalized_get_text(support.material))
-        history = msdesc.history
-        if history:
-            doc.origin_dates = _History.origin_dates(history)
-            doc.origin_place = _History.origin_place(history)
-            doc.provenances = _History.provenances(history)
+    sourcedesc = filedesc.sourcedesc
+    if sourcedesc:
+        msdesc = sourcedesc.msdesc
+        if msdesc:
+            physdesc = msdesc.physdesc
+            if physdesc:
+                objectdesc = physdesc.objectdesc
+                if objectdesc:
+                    support = objectdesc.support
+                    if support and hasattr(support, "material"):
+                        doc.material = _normalize(_normalized_get_text(support.material))
+
+            history = msdesc.history
+            if history:
+                doc.origin_dates = _History.origin_dates(history)
+                doc.origin_place = _History.origin_place(history)
+                doc.provenances = _History.provenances(history)
 
     profile_desc = teiheader.profiledesc
     if profile_desc:
         doc.languages = _ProfileDesc.lang_usage(profile_desc)
         doc.terms = _ProfileDesc.keyword_terms(profile_desc)
 
-    body = soup.body
+    body = _must_find_sub_tag(soup, "body")
     commentary = body.find("div", type="commentary", subtype="general")
     if commentary:
         doc.commentary = _normalized_get_text(commentary)
diff --git a/epidoc_parser/body.py b/epidoc_parser/body.py
index 2141eae..6579658 100644
--- a/epidoc_parser/body.py
+++ b/epidoc_parser/body.py
@@ -1,3 +1,5 @@
+from typing import Optional, Any
+
 from bs4 import Tag
 
 from .normalize import _normalize
@@ -5,15 +7,17 @@
 
 class _Edition:
     @staticmethod
-    def _edition(body: Tag):
+    def _edition(body: Tag) -> Optional[Any]:
         return body.find("div", type="edition")  # Note: limit to xml:space="preserve"?
 
     @staticmethod
-    def language(body: Tag):
+    def language(body: Tag) -> Optional[str]:
         edition = _Edition._edition(body)
         if edition:
             return _normalize(edition.attrs.get("xml:lang"))
 
+        return None
+
     @staticmethod
     def foreign_languages(body: Tag) -> dict[str, int]:
         edition = _Edition._edition(body)
@@ -33,7 +37,7 @@ class _Head:
     @staticmethod
     def reprint_from(body: Tag) -> list[str]:
         result: list[str] = []
-        for elem in body.findAll("ref", type="reprint-from"):
+        for elem in body.find_all("ref", type="reprint-from"):
             n = _normalize(elem.attrs.get("n"))
             if n:
                 result.append(n)
@@ -42,7 +46,7 @@ def reprint_from(body: Tag) -> list[str]:
     @staticmethod
     def reprint_in(body: Tag) -> list[str]:
         result: list[str] = []
-        for elem in body.findAll("ref", type="reprint-in"):
+        for elem in body.find_all("ref", type="reprint-in"):
             n = _normalize(elem.attrs.get("n"))
             if n:
                 result.append(n)
diff --git a/epidoc_parser/header.py b/epidoc_parser/header.py
index 87bc943..356f9af 100644
--- a/epidoc_parser/header.py
+++ b/epidoc_parser/header.py
@@ -7,13 +7,13 @@
 
 class _History:
     @staticmethod
-    def origin_dates(history: Tag) -> list[Any]:
+    def origin_dates(history: Tag) -> list[dict[str, str]]:
         origin = history.origin
         if origin is None:
             return []
 
-        result = []
-        for elem in origin.findAll("origdate"):  # type: ignore
+        result: list[dict[str, str]] = []
+        for elem in origin.find_all("origdate"):  # type: ignore
             date = _normalized_attrs(elem)
             date["text"] = _normalized_get_text(elem)
             result.append(date)
@@ -36,7 +36,7 @@ def origin_place(history: Tag) -> dict[str, Any]:
     @staticmethod
     def provenances(history: Tag) -> dict[str, Any]:
         result: dict[str, Any] = {}
-        for elem in history.findAll("provenance"):
+        for elem in history.find_all("provenance"):
             typ = _normalize(elem.attrs.get("type"))
             if typ is None:
                 continue
@@ -44,10 +44,10 @@ def provenances(history: Tag) -> dict[str, Any]:
         return result
 
     @staticmethod
-    def _provenance(provenance: Tag):
+    def _provenance(provenance: Tag) -> list[Any]:
         result = []
         # Note: For some it's provenance.p.placename
-        for elem in provenance.findAll("placename"):
+        for elem in provenance.find_all("placename"):
             place = _normalized_attrs(elem)
             place["text"] = _normalized_get_text(elem)
             if "ref" in place:
@@ -68,7 +68,7 @@ def keyword_terms(profile_desc: Tag) -> list[dict[str, Any]]:
             return []
 
         result: list[dict[str, Any]] = []
-        for elem in keywords.findAll("term"):  # type: ignore
+        for elem in keywords.find_all("term"):  # type: ignore
             term = _normalized_attrs(elem)
             term["text"] = _normalized_get_text(elem)
             result.append(term)
@@ -80,7 +80,9 @@ def lang_usage(profile_desc: Tag) -> dict[str, str]:
         lang_usage = profile_desc.langusage
         if lang_usage is None:
             return result
-        for elem in lang_usage.findAll("language"):
+        for elem in lang_usage.find_all("language"):
             ident = _normalize(elem.attrs.get("ident"))
-            result[ident] = _normalized_get_text(elem)
+            text = _normalized_get_text(elem)
+            if text is not None:
+                result[ident] = text
         return result
diff --git a/epidoc_parser/normalize.py b/epidoc_parser/normalize.py
index 44ae738..ecde729 100644
--- a/epidoc_parser/normalize.py
+++ b/epidoc_parser/normalize.py
@@ -1,4 +1,4 @@
-from typing import TypeVar, Any
+from typing import TypeVar, Any, Optional
 
 from bs4 import Tag
 
@@ -11,7 +11,7 @@ def _normalize(v: T) -> T:
     return v
 
 
-def _normalized_get_text(raw):
+def _normalized_get_text(raw: Any) -> Optional[str]:
     if not raw:
         return None
     parsed = raw.getText().strip().replace("\n", "")
@@ -23,3 +23,13 @@ def _normalized_attrs(raw: Tag) -> dict[str, Any]:
     for name, value in raw.attrs.items():
         parsed[_normalize(name)] = _normalize(value)
     return parsed
+
+
+def _must_find_sub_tag(tag: Tag, *keys: str) -> Tag:
+    """Follow keys with successive find() calls and return the last tag found."""
+    current_tag = tag
+    for key in keys:
+        sub_tag = current_tag.find(key)
+        assert isinstance(sub_tag, Tag), f"missing <{key}> in <{current_tag.name}>"
+        current_tag = sub_tag
+    return current_tag
diff --git a/mypy.ini b/mypy.ini
new file mode 100644
index 0000000..82aa7eb
--- /dev/null
+++ b/mypy.ini
@@ -0,0 +1,2 @@
+[mypy]
+disallow_untyped_defs = True

From 8f1fa02d494a21800b4bb28e0406d7314a89fb32 Mon Sep 17 00:00:00 2001
From: Xennis <code@xennis.org>
Date: Tue, 23 Sep 2025 18:22:29 +0200
Subject: [PATCH 2/5] chore: Ignore bs4 XMLParsedAsHTMLWarning

---
 epidoc_parser/api.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/epidoc_parser/api.py b/epidoc_parser/api.py
index 17a4585..cfa30b8 100644
--- a/epidoc_parser/api.py
+++ b/epidoc_parser/api.py
@@ -1,6 +1,7 @@
+import warnings
 from typing import Optional, Self, TextIO
 
-from bs4 import BeautifulSoup
+from bs4 import BeautifulSoup, Tag, XMLParsedAsHTMLWarning
 
 from .body import _Edition, _Head
 from .header import _History, _ProfileDesc
@@ -81,6 +82,7 @@ def load(fp: TextIO) -> EpiDoc:
 
 
 def loads(s: str) -> EpiDoc:
+    warnings.filterwarnings("ignore", category=XMLParsedAsHTMLWarning)
     soup = BeautifulSoup(s, features="lxml")
     doc = EpiDoc()
 

From 9660a36367e96e06209ddeddd7ab9eaa02884722 Mon Sep 17 00:00:00 2001
From: Xennis <code@xennis.org>
Date: Tue, 23 Sep 2025 18:22:45 +0200
Subject: [PATCH 3/5] fix: Update integration test import after rename to epidoc_parser

---
 tests/integration_api.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/integration_api.py b/tests/integration_api.py
index a358526..3c960a3 100644
--- a/tests/integration_api.py
+++ b/tests/integration_api.py
@@ -3,7 +3,7 @@
 import os
 import unittest
 
-import epidoc
+import epidoc_parser
 
 TESTDATA_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "testdata", "idp.data")
 
@@ -22,7 +22,7 @@ def setUp(self):
     def load(filename):
         with open(filename) as f:
             try:
-                epidoc.load(f)
+                epidoc_parser.load(f)
             except Exception as e:
                 # Raised exception are ignored by executor.map
                 return Exception(f"{filename} has error {e.__class__.__name__}: {e}")

From 8c9c92fcf50f641a124221bf1879585a2ffae2b7 Mon Sep 17 00:00:00 2001
From: Xennis <code@xennis.org>
Date: Tue, 23 Sep 2025 18:26:28 +0200
Subject: [PATCH 4/5] chore: Reduce the ignored type checks

---
 epidoc_parser/header.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/epidoc_parser/header.py b/epidoc_parser/header.py
index 356f9af..022b0ef 100644
--- a/epidoc_parser/header.py
+++ b/epidoc_parser/header.py
@@ -13,7 +13,7 @@ def origin_dates(history: Tag) -> list[dict[str, str]]:
             return []
 
         result: list[dict[str, str]] = []
-        for elem in origin.find_all("origdate"):  # type: ignore
+        for elem in origin.find_all("origdate"):
             date = _normalized_attrs(elem)
             date["text"] = _normalized_get_text(elem)
             result.append(date)
@@ -25,7 +25,7 @@ def origin_place(history: Tag) -> dict[str, Any]:
         if origin is None:
             return {}
 
-        origin_place = origin.origplace  # type: ignore
+        origin_place = origin.origplace
         if not origin_place:
             return {}
 
@@ -68,7 +68,7 @@ def keyword_terms(profile_desc: Tag) -> list[dict[str, Any]]:
             return []
 
         result: list[dict[str, Any]] = []
-        for elem in keywords.find_all("term"):  # type: ignore
+        for elem in keywords.find_all("term"):
             term = _normalized_attrs(elem)
             term["text"] = _normalized_get_text(elem)
             result.append(term)

From 8aceb4d4d43f47f7a45487102e434fc30878e3ae Mon Sep 17 00:00:00 2001
From: Xennis <code@xennis.org>
Date: Tue, 23 Sep 2025 18:30:21 +0200
Subject: [PATCH 5/5] chore: Drop Python 3.10

---
 .github/workflows/python.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml
index ce55b89..95f612a 100644
--- a/.github/workflows/python.yml
+++ b/.github/workflows/python.yml
@@ -5,7 +5,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: [ '3.10', '3.11', '3.12', '3.13' ]
+        python-version: [ '3.11', '3.12', '3.13' ]
     steps:
     - name: Checkout
       uses: actions/checkout@v4