Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/python.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: [ '3.10', '3.11', '3.12', '3.13' ]
python-version: [ '3.11', '3.12', '3.13' ]
steps:
- name: Checkout
uses: actions/checkout@v4
Expand Down
96 changes: 52 additions & 44 deletions epidoc_parser/api.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
from typing import Optional
import warnings
from typing import Optional, Self, TextIO

from bs4 import BeautifulSoup
from bs4 import BeautifulSoup, Tag, XMLParsedAsHTMLWarning

from .body import _Edition, _Head
from .header import _History, _ProfileDesc
from .normalize import _normalize, _normalized_get_text
from .normalize import _normalize, _normalized_get_text, _must_find_sub_tag


class EpiDoc:
Expand All @@ -14,10 +15,10 @@ class EpiDoc:
authority: Optional[str] = None
availability: Optional[str] = None
material = None
origin_dates: list[str] = []
origin_dates: list[dict[str, str]] = []
origin_place: dict[str, str] = {}
provenances: dict[str, str] = {}
terms: list[str] = []
provenances: dict[str, list[dict[str, str]]] = {}
terms: list[dict[str, str]] = []
languages: dict[str, str] = {}
commentary = None
edition_language = None
Expand All @@ -28,22 +29,22 @@ class EpiDoc:
@classmethod
def create(
cls,
title,
idno,
authority=None,
availability=None,
material=None,
origin_dates=None,
origin_place=None,
provenances=None,
terms=None,
languages=None,
commentary=None,
edition_language=None,
edition_foreign_languages=None,
reprint_from=None,
reprint_in=None,
):
title: str,
idno: dict[str, str],
authority: Optional[str] = None,
availability: Optional[str] = None,
material: Optional[str] = None,
origin_dates: Optional[list[dict[str, str]]] = None,
origin_place: Optional[dict[str, str]] = None,
provenances: Optional[dict[str, list[dict[str, str]]]] = None,
terms: Optional[list[dict[str, str]]] = None,
languages: Optional[dict[str, str]] = None,
commentary: Optional[str] = None,
edition_language: Optional[str] = None,
edition_foreign_languages: Optional[dict[str, int]] = None,
reprint_from: Optional[list[str]] = None,
reprint_in: Optional[list[str]] = None,
) -> Self:
h = cls()
h.title = title
h.idno = idno
Expand Down Expand Up @@ -72,23 +73,25 @@ def create(
h.reprint_in = reprint_in
return h

def __repr__(self):
def __repr__(self) -> str:
return f'<EpiDoc "{self.title}">'


def load(fp):
def load(fp: TextIO) -> EpiDoc:
return loads(fp.read())


def loads(s):
def loads(s: str) -> EpiDoc:
warnings.filterwarnings("ignore", category=XMLParsedAsHTMLWarning)
soup = BeautifulSoup(s, features="lxml")
doc = EpiDoc()

teiheader = soup.teiheader
filedesc = teiheader.filedesc
doc.title = filedesc.titlestmt.title.getText()
teiheader = _must_find_sub_tag(soup, "teiheader")
filedesc = _must_find_sub_tag(teiheader, "filedesc")
title = _must_find_sub_tag(filedesc, "titlestmt", "title")
doc.title = title.getText()
idnos = {}
publication_stmt = filedesc.publicationstmt
publication_stmt = _must_find_sub_tag(filedesc, "publicationstmt")
for idno in publication_stmt.find_all("idno"):
typ = _normalize(idno.attrs.get("type"))
value = _normalize(idno.getText())
Expand All @@ -99,35 +102,40 @@ def loads(s):
authority = publication_stmt.find("authority")
if authority:
doc.authority = _normalized_get_text(authority)
availability = publication_stmt.find("availability")
availability = publication_stmt.availability
if availability:
availability_text = _normalized_get_text(availability)
license = availability.find("ref", type="license")
if license:
if availability_text and isinstance(license, Tag):
license_target = license.attrs.get("target")
if license_target:
availability_text += f" {license_target}"
doc.availability = availability_text

msdesc = filedesc.sourcedesc.msdesc
if msdesc:
physdesc = msdesc.physdesc
if physdesc:
support = physdesc.objectdesc.support
if hasattr(support, "material"):
doc.material = _normalize(_normalized_get_text(support.material))
history = msdesc.history
if history:
doc.origin_dates = _History.origin_dates(history)
doc.origin_place = _History.origin_place(history)
doc.provenances = _History.provenances(history)
sourcedesc = filedesc.sourcedesc
if sourcedesc:
msdesc = sourcedesc.msdesc
if msdesc:
physdesc = msdesc.physdesc
if physdesc:
objectdesc = physdesc.objectdesc
if objectdesc:
support = objectdesc.support
if support and hasattr(support, "material"):
doc.material = _normalize(_normalized_get_text(support.material))

history = msdesc.history
if history:
doc.origin_dates = _History.origin_dates(history)
doc.origin_place = _History.origin_place(history)
doc.provenances = _History.provenances(history)

profile_desc = teiheader.profiledesc
if profile_desc:
doc.languages = _ProfileDesc.lang_usage(profile_desc)
doc.terms = _ProfileDesc.keyword_terms(profile_desc)

body = soup.body
body = _must_find_sub_tag(soup, "body")
commentary = body.find("div", type="commentary", subtype="general")
if commentary:
doc.commentary = _normalized_get_text(commentary)
Expand Down
12 changes: 8 additions & 4 deletions epidoc_parser/body.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,23 @@
from typing import Optional, Any

from bs4 import Tag

from .normalize import _normalize


class _Edition:
@staticmethod
def _edition(body: Tag):
def _edition(body: Tag) -> Optional[Any]:
return body.find("div", type="edition") # Note: limit to xml:space="preserve"?

@staticmethod
def language(body: Tag):
def language(body: Tag) -> Optional[str]:
edition = _Edition._edition(body)
if edition:
return _normalize(edition.attrs.get("xml:lang"))

return None

@staticmethod
def foreign_languages(body: Tag) -> dict[str, int]:
edition = _Edition._edition(body)
Expand All @@ -33,7 +37,7 @@ class _Head:
@staticmethod
def reprint_from(body: Tag) -> list[str]:
result: list[str] = []
for elem in body.findAll("ref", type="reprint-from"):
for elem in body.find_all("ref", type="reprint-from"):
n = _normalize(elem.attrs.get("n"))
if n:
result.append(n)
Expand All @@ -42,7 +46,7 @@ def reprint_from(body: Tag) -> list[str]:
@staticmethod
def reprint_in(body: Tag) -> list[str]:
result: list[str] = []
for elem in body.findAll("ref", type="reprint-in"):
for elem in body.find_all("ref", type="reprint-in"):
n = _normalize(elem.attrs.get("n"))
if n:
result.append(n)
Expand Down
22 changes: 12 additions & 10 deletions epidoc_parser/header.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,13 @@

class _History:
@staticmethod
def origin_dates(history: Tag) -> list[Any]:
def origin_dates(history: Tag) -> list[dict[str, str]]:
origin = history.origin
if origin is None:
return []

result = []
for elem in origin.findAll("origdate"): # type: ignore
result: list[dict[str, str]] = []
for elem in origin.find_all("origdate"):
date = _normalized_attrs(elem)
date["text"] = _normalized_get_text(elem)
result.append(date)
Expand All @@ -25,7 +25,7 @@ def origin_place(history: Tag) -> dict[str, Any]:
if origin is None:
return {}

origin_place = origin.origplace # type: ignore
origin_place = origin.origplace
if not origin_place:
return {}

Expand All @@ -36,18 +36,18 @@ def origin_place(history: Tag) -> dict[str, Any]:
@staticmethod
def provenances(history: Tag) -> dict[str, Any]:
result: dict[str, Any] = {}
for elem in history.findAll("provenance"):
for elem in history.find_all("provenance"):
typ = _normalize(elem.attrs.get("type"))
if typ is None:
continue
result[typ] = result.get(typ, []) + _History._provenance(elem)
return result

@staticmethod
def _provenance(provenance: Tag):
def _provenance(provenance: Tag) -> list[Any]:
result = []
# Note: For some it's provenance.p.placename
for elem in provenance.findAll("placename"):
for elem in provenance.find_all("placename"):
place = _normalized_attrs(elem)
place["text"] = _normalized_get_text(elem)
if "ref" in place:
Expand All @@ -68,7 +68,7 @@ def keyword_terms(profile_desc: Tag) -> list[dict[str, Any]]:
return []

result: list[dict[str, Any]] = []
for elem in keywords.findAll("term"): # type: ignore
for elem in keywords.find_all("term"):
term = _normalized_attrs(elem)
term["text"] = _normalized_get_text(elem)
result.append(term)
Expand All @@ -80,7 +80,9 @@ def lang_usage(profile_desc: Tag) -> dict[str, str]:
lang_usage = profile_desc.langusage
if lang_usage is None:
return result
for elem in lang_usage.findAll("language"):
for elem in lang_usage.find_all("language"):
ident = _normalize(elem.attrs.get("ident"))
result[ident] = _normalized_get_text(elem)
text = _normalized_get_text(elem)
if text is not None:
result[ident] = text
return result
14 changes: 12 additions & 2 deletions epidoc_parser/normalize.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import TypeVar, Any
from typing import TypeVar, Any, Optional

from bs4 import Tag

Expand All @@ -11,7 +11,7 @@ def _normalize(v: T) -> T:
return v


def _normalized_get_text(raw):
def _normalized_get_text(raw: Any) -> Optional[str]:
if not raw:
return None
parsed = raw.getText().strip().replace("\n", "")
Expand All @@ -23,3 +23,13 @@ def _normalized_attrs(raw: Tag) -> dict[str, Any]:
for name, value in raw.attrs.items():
parsed[_normalize(name)] = _normalize(value)
return parsed


def _must_find_sub_tag(tag: Tag, *keys: str) -> Tag:
    """Walk down from ``tag`` through the sub-tags named by ``keys``.

    Each key is looked up with ``find`` on the tag reached by the previous
    key, starting at ``tag``. When no keys are given, ``tag`` itself is
    returned.

    Args:
        tag: The tag to start the lookup from.
        *keys: Tag names to resolve in order, each relative to the previous
            result.

    Returns:
        The tag reached after resolving the last key.

    Raises:
        AssertionError: If any key does not resolve to a ``Tag``.
    """
    current_tag = tag
    for key in keys:
        sub_tag = current_tag.find(key)
        # Raise explicitly rather than with an ``assert`` statement: asserts
        # are removed under ``python -O``, which would let a missing tag slip
        # through. AssertionError is kept so callers see the same exception
        # type as before.
        if not isinstance(sub_tag, Tag):
            raise AssertionError(f"required tag <{key}> not found")
        current_tag = sub_tag

    return current_tag
2 changes: 2 additions & 0 deletions mypy.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
[mypy]
disallow_untyped_defs = True
4 changes: 2 additions & 2 deletions tests/integration_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import os
import unittest

import epidoc
import epidoc_parser

TESTDATA_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "testdata", "idp.data")

Expand All @@ -22,7 +22,7 @@ def setUp(self):
def load(filename):
with open(filename) as f:
try:
epidoc.load(f)
epidoc_parser.load(f)
except Exception as e:
# Raised exceptions are ignored by executor.map
return Exception(f"{filename} has error {e.__class__.__name__}: {e}")
Expand Down