From ad477e309ac35ae9bff84d3ed94ea23e8f547f35 Mon Sep 17 00:00:00 2001 From: Fabian Rosenthal Date: Tue, 23 Sep 2025 08:53:14 +0200 Subject: [PATCH 1/5] chore(typing): Disallow untyped defs --- epidoc_parser/api.py | 92 ++++++++++++++++++++------------------ epidoc_parser/body.py | 12 +++-- epidoc_parser/header.py | 20 +++++---- epidoc_parser/normalize.py | 14 +++++- mypy.ini | 2 + 5 files changed, 82 insertions(+), 58 deletions(-) create mode 100644 mypy.ini diff --git a/epidoc_parser/api.py b/epidoc_parser/api.py index 87a96a7..17a4585 100644 --- a/epidoc_parser/api.py +++ b/epidoc_parser/api.py @@ -1,10 +1,10 @@ -from typing import Optional +from typing import Optional, Self, TextIO from bs4 import BeautifulSoup from .body import _Edition, _Head from .header import _History, _ProfileDesc -from .normalize import _normalize, _normalized_get_text +from .normalize import _normalize, _normalized_get_text, _must_find_sub_tag class EpiDoc: @@ -14,10 +14,10 @@ class EpiDoc: authority: Optional[str] = None availability: Optional[str] = None material = None - origin_dates: list[str] = [] + origin_dates: list[dict[str, str]] = [] origin_place: dict[str, str] = {} - provenances: dict[str, str] = {} - terms: list[str] = [] + provenances: dict[str, list[dict[str, str]]] = {} + terms: list[dict[str, str]] = [] languages: dict[str, str] = {} commentary = None edition_language = None @@ -28,22 +28,22 @@ class EpiDoc: @classmethod def create( cls, - title, - idno, - authority=None, - availability=None, - material=None, - origin_dates=None, - origin_place=None, - provenances=None, - terms=None, - languages=None, - commentary=None, - edition_language=None, - edition_foreign_languages=None, - reprint_from=None, - reprint_in=None, - ): + title: str, + idno: dict[str, str], + authority: Optional[str] = None, + availability: Optional[str] = None, + material: Optional[str] = None, + origin_dates: Optional[list[dict[str, str]]] = None, + origin_place: Optional[dict[str, str]] = None, + provenances: Optional[dict[str, list[dict[str, str]]]] = None, + terms: Optional[list[dict[str, str]]] = None, + languages: Optional[dict[str, str]] = None, + commentary: Optional[str] = None, + edition_language: Optional[str] = None, + edition_foreign_languages: Optional[dict[str, int]] = None, + reprint_from: Optional[list[str]] = None, + reprint_in: Optional[list[str]] = None, + ) -> Self: h = cls() h.title = title h.idno = idno @@ -72,23 +72,24 @@ def create( h.reprint_in = reprint_in return h - def __repr__(self): + def __repr__(self) -> str: return f'' -def load(fp): +def load(fp: TextIO) -> EpiDoc: return loads(fp.read()) -def loads(s): +def loads(s: str) -> EpiDoc: soup = BeautifulSoup(s, features="lxml") doc = EpiDoc() - teiheader = soup.teiheader - filedesc = teiheader.filedesc - doc.title = filedesc.titlestmt.title.getText() + teiheader = _must_find_sub_tag(soup, "teiheader") + filedesc = _must_find_sub_tag(teiheader, "filedesc") + title = _must_find_sub_tag(filedesc, "titlestmt", "title") + doc.title = title.getText() idnos = {} - publication_stmt = filedesc.publicationstmt + publication_stmt = _must_find_sub_tag(filedesc, "publicationstmt") for idno in publication_stmt.find_all("idno"): typ = _normalize(idno.attrs.get("type")) value = _normalize(idno.getText()) @@ -99,35 +100,40 @@ def loads(s): authority = publication_stmt.find("authority") if authority: doc.authority = _normalized_get_text(authority) - availability = publication_stmt.find("availability") + availability = publication_stmt.availability if availability: availability_text = _normalized_get_text(availability) license = availability.find("ref", type="license") - if license: + if availability_text and isinstance(license, Tag): license_target = license.attrs.get("target") if license_target: availability_text += f" {license_target}" doc.availability = availability_text - msdesc = filedesc.sourcedesc.msdesc - if msdesc: - physdesc = msdesc.physdesc - if physdesc: - support = physdesc.objectdesc.support - if hasattr(support, "material"): - doc.material = _normalize(_normalized_get_text(support.material)) - history = msdesc.history - if history: - doc.origin_dates = _History.origin_dates(history) - doc.origin_place = _History.origin_place(history) - doc.provenances = _History.provenances(history) + sourcedesc = filedesc.sourcedesc + if sourcedesc: + msdesc = sourcedesc.msdesc + if msdesc: + physdesc = msdesc.physdesc + if physdesc: + objectdesc = physdesc.objectdesc + if objectdesc: + support = objectdesc.support + if support and hasattr(support, "material"): + doc.material = _normalize(_normalized_get_text(support.material)) + + history = msdesc.history + if history: + doc.origin_dates = _History.origin_dates(history) + doc.origin_place = _History.origin_place(history) + doc.provenances = _History.provenances(history) profile_desc = teiheader.profiledesc if profile_desc: doc.languages = _ProfileDesc.lang_usage(profile_desc) doc.terms = _ProfileDesc.keyword_terms(profile_desc) - body = soup.body + body = _must_find_sub_tag(soup, "body") commentary = body.find("div", type="commentary", subtype="general") if commentary: doc.commentary = _normalized_get_text(commentary) diff --git a/epidoc_parser/body.py b/epidoc_parser/body.py index 2141eae..6579658 100644 --- a/epidoc_parser/body.py +++ b/epidoc_parser/body.py @@ -1,3 +1,5 @@ +from typing import Optional, Any + from bs4 import Tag from .normalize import _normalize @@ -5,15 +7,17 @@ class _Edition: @staticmethod - def _edition(body: Tag): + def _edition(body: Tag) -> Optional[Any]: return body.find("div", type="edition") # Note: limit to xml:space="preserve"? @staticmethod - def language(body: Tag): + def language(body: Tag) -> Optional[str]: edition = _Edition._edition(body) if edition: return _normalize(edition.attrs.get("xml:lang")) + return None + @staticmethod def foreign_languages(body: Tag) -> dict[str, int]: edition = _Edition._edition(body) @@ -33,7 +37,7 @@ class _Head: @staticmethod def reprint_from(body: Tag) -> list[str]: result: list[str] = [] - for elem in body.findAll("ref", type="reprint-from"): + for elem in body.find_all("ref", type="reprint-from"): n = _normalize(elem.attrs.get("n")) if n: result.append(n) @@ -42,7 +46,7 @@ def reprint_from(body: Tag) -> list[str]: @staticmethod def reprint_in(body: Tag) -> list[str]: result: list[str] = [] - for elem in body.findAll("ref", type="reprint-in"): + for elem in body.find_all("ref", type="reprint-in"): n = _normalize(elem.attrs.get("n")) if n: result.append(n) diff --git a/epidoc_parser/header.py b/epidoc_parser/header.py index 87bc943..356f9af 100644 --- a/epidoc_parser/header.py +++ b/epidoc_parser/header.py @@ -7,13 +7,13 @@ class _History: @staticmethod - def origin_dates(history: Tag) -> list[Any]: + def origin_dates(history: Tag) -> list[dict[str, str]]: origin = history.origin if origin is None: return [] - result = [] - for elem in origin.findAll("origdate"): # type: ignore + result: list[dict[str, str]] = [] + for elem in origin.find_all("origdate"): # type: ignore date = _normalized_attrs(elem) date["text"] = _normalized_get_text(elem) result.append(date) @@ -36,7 +36,7 @@ def origin_place(history: Tag) -> dict[str, Any]: @staticmethod def provenances(history: Tag) -> dict[str, Any]: result: dict[str, Any] = {} - for elem in history.findAll("provenance"): + for elem in history.find_all("provenance"): typ = _normalize(elem.attrs.get("type")) if typ is None: continue @@ -44,10 +44,10 @@ def provenances(history: Tag) -> dict[str, Any]: return result @staticmethod - def _provenance(provenance: Tag): + def _provenance(provenance: Tag) -> list[Any]: result = [] # Note: For some it's provenance.p.placename - for elem in provenance.findAll("placename"): + for elem in provenance.find_all("placename"): place = _normalized_attrs(elem) place["text"] = _normalized_get_text(elem) if "ref" in place: @@ -68,7 +68,7 @@ def keyword_terms(profile_desc: Tag) -> list[dict[str, Any]]: return [] result: list[dict[str, Any]] = [] - for elem in keywords.findAll("term"): # type: ignore + for elem in keywords.find_all("term"): # type: ignore term = _normalized_attrs(elem) term["text"] = _normalized_get_text(elem) result.append(term) @@ -80,7 +80,9 @@ def lang_usage(profile_desc: Tag) -> dict[str, str]: lang_usage = profile_desc.langusage if lang_usage is None: return result - for elem in lang_usage.findAll("language"): + for elem in lang_usage.find_all("language"): ident = _normalize(elem.attrs.get("ident")) - result[ident] = _normalized_get_text(elem) + text = _normalized_get_text(elem) + if text is not None: + result[ident] = text return result diff --git a/epidoc_parser/normalize.py b/epidoc_parser/normalize.py index 44ae738..ecde729 100644 --- a/epidoc_parser/normalize.py +++ b/epidoc_parser/normalize.py @@ -1,4 +1,4 @@ -from typing import TypeVar, Any +from typing import TypeVar, Any, Optional from bs4 import Tag @@ -11,7 +11,7 @@ def _normalize(v: T) -> T: return v -def _normalized_get_text(raw): +def _normalized_get_text(raw: Any) -> Optional[str]: if not raw: return None parsed = raw.getText().strip().replace("\n", "") @@ -23,3 +23,13 @@ def _normalized_attrs(raw: Tag) -> dict[str, Any]: for name, value in raw.attrs.items(): parsed[_normalize(name)] = _normalize(value) return parsed + + +def _must_find_sub_tag(tag: Tag, *keys: str) -> Tag: + current_tag = tag + for key in keys: + sub_tag = current_tag.find(key) + assert isinstance(sub_tag, Tag), f"${key} is not None" + current_tag = sub_tag + + return current_tag diff --git a/mypy.ini b/mypy.ini new file mode 100644 index 0000000..82aa7eb --- /dev/null +++ b/mypy.ini @@ -0,0 +1,2 @@ +[mypy] +disallow_untyped_defs = True From 8f1fa02d494a21800b4bb28e0406d7314a89fb32 Mon Sep 17 00:00:00 2001 From: Xennis Date: Tue, 23 Sep 2025 18:22:29 +0200 Subject: [PATCH 2/5] chore: Ignore XML might be HTML warning --- epidoc_parser/api.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/epidoc_parser/api.py b/epidoc_parser/api.py index 17a4585..cfa30b8 100644 --- a/epidoc_parser/api.py +++ b/epidoc_parser/api.py @@ -1,6 +1,7 @@ +import warnings from typing import Optional, Self, TextIO -from bs4 import BeautifulSoup +from bs4 import BeautifulSoup, Tag, XMLParsedAsHTMLWarning from .body import _Edition, _Head from .header import _History, _ProfileDesc @@ -81,6 +82,7 @@ def load(fp: TextIO) -> EpiDoc: def loads(s: str) -> EpiDoc: + warnings.filterwarnings("ignore", category=XMLParsedAsHTMLWarning) soup = BeautifulSoup(s, features="lxml") doc = EpiDoc() From 9660a36367e96e06209ddeddd7ab9eaa02884722 Mon Sep 17 00:00:00 2001 From: Xennis Date: Tue, 23 Sep 2025 18:22:45 +0200 Subject: [PATCH 3/5] fix: Integration due to renaming --- tests/integration_api.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/integration_api.py b/tests/integration_api.py index a358526..3c960a3 100644 --- a/tests/integration_api.py +++ b/tests/integration_api.py @@ -3,7 +3,7 @@ import os import unittest -import epidoc +import epidoc_parser TESTDATA_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "testdata", "idp.data") @@ -22,7 +22,7 @@ def setUp(self): def load(filename): with open(filename) as f: try: - epidoc.load(f) + epidoc_parser.load(f) except Exception as e: # Raised exception are ignored by executor.map return Exception(f"{filename} has error {e.__class__.__name__}: {e}") From 8c9c92fcf50f641a124221bf1879585a2ffae2b7 Mon Sep 17 00:00:00 2001 From: Xennis Date: Tue, 23 Sep 2025 18:26:28 +0200 Subject: [PATCH 4/5] chore: Reduce the ignored type checks --- epidoc_parser/header.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/epidoc_parser/header.py b/epidoc_parser/header.py index 356f9af..022b0ef 100644 --- a/epidoc_parser/header.py +++ b/epidoc_parser/header.py @@ -13,7 +13,7 @@ def origin_dates(history: Tag) -> list[dict[str, str]]: return [] result: list[dict[str, str]] = [] - for elem in origin.find_all("origdate"): # type: ignore + for elem in origin.find_all("origdate"): date = _normalized_attrs(elem) date["text"] = _normalized_get_text(elem) result.append(date) @@ -25,7 +25,7 @@ def origin_place(history: Tag) -> dict[str, Any]: if origin is None: return {} - origin_place = origin.origplace # type: ignore + origin_place = origin.origplace if not origin_place: return {} @@ -68,7 +68,7 @@ def keyword_terms(profile_desc: Tag) -> list[dict[str, Any]]: return [] result: list[dict[str, Any]] = [] - for elem in keywords.find_all("term"): # type: ignore + for elem in keywords.find_all("term"): term = _normalized_attrs(elem) term["text"] = _normalized_get_text(elem) result.append(term) From 8aceb4d4d43f47f7a45487102e434fc30878e3ae Mon Sep 17 00:00:00 2001 From: Xennis Date: Tue, 23 Sep 2025 18:30:21 +0200 Subject: [PATCH 5/5] chore: Drop Python 3.10 --- .github/workflows/python.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index ce55b89..95f612a 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -5,7 +5,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [ '3.10', '3.11', '3.12', '3.13' ] + python-version: [ '3.11', '3.12', '3.13' ] steps: - name: Checkout uses: actions/checkout@v4