Skip to content

Commit 7613416

Browse files
mspinolaeie and matteo.spinola authored
Fix v1.1.0 charset_normalizer packaging regression (#24)
Co-authored-by: matteo.spinola <matteo.spinola@ctao-consortium.org>
1 parent cab4fd1 commit 7613416

3 files changed

Lines changed: 149 additions & 35 deletions

File tree

MarkItDown.spec

Lines changed: 3 additions & 35 deletions
Original file line number | Diff line number | Diff line change
@@ -1,42 +1,10 @@
11
# -*- mode: python ; coding: utf-8 -*-
22
import os
33
from PyInstaller.utils.hooks import collect_data_files, collect_submodules
4+
from markitdowngui.build_config import build_datas, build_hiddenimports
45

5-
# Keep hidden imports minimal and focused on runtime-dynamic conversion modules.
6-
hiddenimports = [
7-
"packaging.version",
8-
"requests",
9-
]
10-
hiddenimports += collect_submodules("markitdown")
11-
for package in (
12-
"azure.ai.documentintelligence",
13-
"azure.identity",
14-
"pypdfium2",
15-
"pypdfium2_raw",
16-
"pytesseract",
17-
):
18-
try:
19-
hiddenimports += collect_submodules(package)
20-
except Exception as e:
21-
print(f"Warning: Could not collect hidden imports for {package}: {e}")
22-
23-
datas = [
24-
("markitdowngui/resources/markitdown-gui.ico", "markitdowngui/resources"),
25-
("markitdowngui/resources/moon.svg", "markitdowngui/resources"),
26-
("markitdowngui/resources/sun.svg", "markitdowngui/resources"),
27-
("LICENSE", "."),
28-
]
29-
30-
try:
31-
datas += collect_data_files("magika")
32-
except Exception as e:
33-
print(f"Warning: Could not collect magika data files: {e}")
34-
35-
for package in ("pypdfium2", "pypdfium2_raw"):
36-
try:
37-
datas += collect_data_files(package)
38-
except Exception as e:
39-
print(f"Warning: Could not collect data files for {package}: {e}")
6+
hiddenimports = build_hiddenimports(collect_submodules, warn=print)
7+
datas = build_datas(collect_data_files, warn=print)
408

419
a = Analysis(
4210
["markitdowngui/main.py"],

markitdowngui/build_config.py

Lines changed: 76 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,76 @@
1+
from __future__ import annotations
2+
3+
from collections.abc import Callable
4+
5+
# Module names that are always part of the hidden-import list, independent of
# what the submodule collector returns. The charset_normalizer entries are
# spelled out explicitly so they stay present even if collection for that
# package yields nothing.
BASE_HIDDENIMPORTS = (
    "packaging.version",
    "requests",
    "charset_normalizer",
    "charset_normalizer.md",
    "charset_normalizer.md__mypyc",
)

# Packages whose submodules are collected without any error handling: a
# failure while collecting these propagates to the caller.
MANDATORY_HIDDENIMPORT_PACKAGES = (
    "markitdown",
    "charset_normalizer",
)

# Packages whose submodules are collected on a best-effort basis: a failure
# is reported through the ``warn`` callback (when given) and then skipped.
OPTIONAL_HIDDENIMPORT_PACKAGES = (
    "azure.ai.documentintelligence",
    "azure.identity",
    "pypdfium2",
    "pypdfium2_raw",
    "pytesseract",
)

# (source path, destination directory) pairs that are always bundled.
BASE_DATAS = (
    ("markitdowngui/resources/markitdown-gui.ico", "markitdowngui/resources"),
    ("markitdowngui/resources/moon.svg", "markitdowngui/resources"),
    ("markitdowngui/resources/sun.svg", "markitdowngui/resources"),
    ("LICENSE", "."),
)

# Packages whose data files are collected on a best-effort basis.
OPTIONAL_DATA_PACKAGES = (
    "magika",
    "pypdfium2",
    "pypdfium2_raw",
)
34+
35+
36+
def _dedupe(items: list[str]) -> list[str]:
    """Return *items* without duplicates, keeping first-occurrence order.

    Args:
        items: Strings that may contain repeated entries.

    Returns:
        A new list in which each distinct string appears once, at the
        position of its first occurrence.
    """
    # dict keys are unique and keep insertion order, so a round trip through
    # dict.fromkeys drops repeats without reordering.
    return list(dict.fromkeys(items))
38+
39+
40+
def build_hiddenimports(
    collect_submodules: Callable[[str], list[str]],
    *,
    warn: Callable[[str], None] | None = None,
) -> list[str]:
    """Assemble the de-duplicated hidden-import list for the build.

    The result starts with ``BASE_HIDDENIMPORTS``, followed by the submodules
    of every mandatory package and then of every optional package that could
    be collected.

    Args:
        collect_submodules: Callable that maps a package name to the list of
            its submodule names.
        warn: Optional callback that receives a message when collection fails
            for an optional package. When ``None`` the failure is skipped
            silently.

    Returns:
        Module names with duplicates removed, in first-occurrence order.

    Raises:
        Exception: Whatever ``collect_submodules`` raises for a package in
            ``MANDATORY_HIDDENIMPORT_PACKAGES``; those calls are not guarded.
    """
    hiddenimports = list(BASE_HIDDENIMPORTS)

    # Mandatory packages: let a collection failure propagate to the caller.
    for package in MANDATORY_HIDDENIMPORT_PACKAGES:
        hiddenimports.extend(collect_submodules(package))

    # Optional packages: best effort, report the failure and carry on.
    for package in OPTIONAL_HIDDENIMPORT_PACKAGES:
        try:
            hiddenimports.extend(collect_submodules(package))
        except Exception as exc:
            if warn is not None:
                warn(
                    f"Warning: Could not collect hidden imports for {package}: {exc}"
                )

    return _dedupe(hiddenimports)
60+
61+
62+
def build_datas(
    collect_data_files: Callable[[str], list[tuple[str, str]]],
    *,
    warn: Callable[[str], None] | None = None,
) -> list[tuple[str, str]]:
    """Assemble the list of data files to bundle with the build.

    The result starts with ``BASE_DATAS``, followed by the data files of
    every package in ``OPTIONAL_DATA_PACKAGES`` that could be collected.

    Args:
        collect_data_files: Callable that maps a package name to a list of
            ``(source, destination)`` pairs.
        warn: Optional callback that receives a message when collection fails
            for a package. When ``None`` the failure is skipped silently.

    Returns:
        ``(source, destination)`` pairs in collection order. Entries are not
        de-duplicated.
    """
    datas = list(BASE_DATAS)

    # Every package here is optional: report the failure and carry on.
    for package in OPTIONAL_DATA_PACKAGES:
        try:
            datas.extend(collect_data_files(package))
        except Exception as exc:
            if warn is not None:
                warn(f"Warning: Could not collect data files for {package}: {exc}")

    return datas

tests/test_build_config.py

Lines changed: 70 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,70 @@
1+
from markitdowngui import build_config
2+
3+
4+
def test_build_hiddenimports_includes_charset_normalizer_mypyc_runtime():
    """The charset_normalizer modules are present and mandatory packages are collected first."""
    calls = []

    def fake_collect(package: str) -> list[str]:
        # Record the call order, then answer from a fixed table; an
        # unexpected package name surfaces as a KeyError.
        calls.append(package)
        return {
            "markitdown": ["markitdown._markdown"],
            "charset_normalizer": ["charset_normalizer.api"],
            "azure.ai.documentintelligence": ["azure.ai.documentintelligence._client"],
            "azure.identity": [],
            "pypdfium2": [],
            "pypdfium2_raw": [],
            "pytesseract": [],
        }[package]

    hiddenimports = build_config.build_hiddenimports(fake_collect)

    assert "charset_normalizer" in hiddenimports
    assert "charset_normalizer.md" in hiddenimports
    assert "charset_normalizer.md__mypyc" in hiddenimports
    assert "markitdown._markdown" in hiddenimports
    assert calls[:2] == ["markitdown", "charset_normalizer"]
26+
27+
28+
def test_build_hiddenimports_warns_and_keeps_required_modules_when_optional_collection_fails():
    """A failing optional package yields one warning and keeps the base modules."""
    warnings = []

    def fake_collect(package: str) -> list[str]:
        if package == "markitdown":
            return []
        if package == "charset_normalizer":
            return []
        if package == "pytesseract":
            # Simulate an optional dependency that is not installed.
            raise RuntimeError("missing optional package")
        return []

    hiddenimports = build_config.build_hiddenimports(
        fake_collect,
        warn=warnings.append,
    )

    # Collection returned nothing for charset_normalizer, so this entry can
    # only come from the explicit base list.
    assert "charset_normalizer.md__mypyc" in hiddenimports
    assert warnings == [
        "Warning: Could not collect hidden imports for pytesseract: missing optional package"
    ]
49+
50+
51+
def test_build_datas_keeps_base_files_and_warns_for_missing_optional_packages():
    """Base data files survive and a failing package produces exactly one warning."""
    warnings = []

    def fake_collect(package: str) -> list[tuple[str, str]]:
        if package == "magika":
            return [("magika/model.onnx", "magika")]
        if package == "pypdfium2":
            # Simulate a package whose data files cannot be collected.
            raise RuntimeError("missing pdf runtime")
        if package == "pypdfium2_raw":
            return [("pdfium.dll", "pypdfium2_raw")]
        raise AssertionError(f"Unexpected package: {package}")

    datas = build_config.build_datas(fake_collect, warn=warnings.append)

    assert ("LICENSE", ".") in datas
    assert ("magika/model.onnx", "magika") in datas
    assert ("pdfium.dll", "pypdfium2_raw") in datas
    assert warnings == [
        "Warning: Could not collect data files for pypdfium2: missing pdf runtime"
    ]

0 commit comments

Comments
 (0)