diff --git a/MarkItDown.spec b/MarkItDown.spec
index 3e100dc..613231d 100644
--- a/MarkItDown.spec
+++ b/MarkItDown.spec
@@ -1,42 +1,10 @@
 # -*- mode: python ; coding: utf-8 -*-
 import os
 from PyInstaller.utils.hooks import collect_data_files, collect_submodules
+from markitdowngui.build_config import build_datas, build_hiddenimports
 
-# Keep hidden imports minimal and focused on runtime-dynamic conversion modules.
-hiddenimports = [
-    "packaging.version",
-    "requests",
-]
-hiddenimports += collect_submodules("markitdown")
-for package in (
-    "azure.ai.documentintelligence",
-    "azure.identity",
-    "pypdfium2",
-    "pypdfium2_raw",
-    "pytesseract",
-):
-    try:
-        hiddenimports += collect_submodules(package)
-    except Exception as e:
-        print(f"Warning: Could not collect hidden imports for {package}: {e}")
-
-datas = [
-    ("markitdowngui/resources/markitdown-gui.ico", "markitdowngui/resources"),
-    ("markitdowngui/resources/moon.svg", "markitdowngui/resources"),
-    ("markitdowngui/resources/sun.svg", "markitdowngui/resources"),
-    ("LICENSE", "."),
-]
-
-try:
-    datas += collect_data_files("magika")
-except Exception as e:
-    print(f"Warning: Could not collect magika data files: {e}")
-
-for package in ("pypdfium2", "pypdfium2_raw"):
-    try:
-        datas += collect_data_files(package)
-    except Exception as e:
-        print(f"Warning: Could not collect data files for {package}: {e}")
+hiddenimports = build_hiddenimports(collect_submodules, warn=print)
+datas = build_datas(collect_data_files, warn=print)
 
 a = Analysis(
     ["markitdowngui/main.py"],
diff --git a/markitdowngui/build_config.py b/markitdowngui/build_config.py
new file mode 100644
index 0000000..4347271
--- /dev/null
+++ b/markitdowngui/build_config.py
@@ -0,0 +1,76 @@
+from __future__ import annotations
+
+from collections.abc import Callable
+
+BASE_HIDDENIMPORTS = (
+    "packaging.version",
+    "requests",
+    "charset_normalizer",
+    "charset_normalizer.md",
+    "charset_normalizer.md__mypyc",
+)
+MANDATORY_HIDDENIMPORT_PACKAGES = (
+    "markitdown",
+    "charset_normalizer",
+)
+OPTIONAL_HIDDENIMPORT_PACKAGES = (
+    "azure.ai.documentintelligence",
+    "azure.identity",
+    "pypdfium2",
+    "pypdfium2_raw",
+    "pytesseract",
+)
+BASE_DATAS = (
+    ("markitdowngui/resources/markitdown-gui.ico", "markitdowngui/resources"),
+    ("markitdowngui/resources/moon.svg", "markitdowngui/resources"),
+    ("markitdowngui/resources/sun.svg", "markitdowngui/resources"),
+    ("LICENSE", "."),
+)
+OPTIONAL_DATA_PACKAGES = (
+    "magika",
+    "pypdfium2",
+    "pypdfium2_raw",
+)
+
+
+def _dedupe(items: list[str]) -> list[str]:
+    return list(dict.fromkeys(items))
+
+
+def build_hiddenimports(
+    collect_submodules: Callable[[str], list[str]],
+    *,
+    warn: Callable[[str], None] | None = None,
+) -> list[str]:
+    hiddenimports = list(BASE_HIDDENIMPORTS)
+
+    for package in MANDATORY_HIDDENIMPORT_PACKAGES:
+        hiddenimports.extend(collect_submodules(package))
+
+    for package in OPTIONAL_HIDDENIMPORT_PACKAGES:
+        try:
+            hiddenimports.extend(collect_submodules(package))
+        except Exception as exc:
+            if warn is not None:
+                warn(
+                    f"Warning: Could not collect hidden imports for {package}: {exc}"
+                )
+
+    return _dedupe(hiddenimports)
+
+
+def build_datas(
+    collect_data_files: Callable[[str], list[tuple[str, str]]],
+    *,
+    warn: Callable[[str], None] | None = None,
+) -> list[tuple[str, str]]:
+    datas = list(BASE_DATAS)
+
+    for package in OPTIONAL_DATA_PACKAGES:
+        try:
+            datas.extend(collect_data_files(package))
+        except Exception as exc:
+            if warn is not None:
+                warn(f"Warning: Could not collect data files for {package}: {exc}")
+
+    return datas
diff --git a/tests/test_build_config.py b/tests/test_build_config.py
new file mode 100644
index 0000000..f4efc9d
--- /dev/null
+++ b/tests/test_build_config.py
@@ -0,0 +1,70 @@
+from markitdowngui import build_config
+
+
+def test_build_hiddenimports_includes_charset_normalizer_mypyc_runtime():
+    calls = []
+
+    def fake_collect(package: str) -> list[str]:
+        calls.append(package)
+        return {
+            "markitdown": ["markitdown._markdown"],
+            "charset_normalizer": ["charset_normalizer.api"],
+            "azure.ai.documentintelligence": ["azure.ai.documentintelligence._client"],
+            "azure.identity": [],
+            "pypdfium2": [],
+            "pypdfium2_raw": [],
+            "pytesseract": [],
+        }[package]
+
+    hiddenimports = build_config.build_hiddenimports(fake_collect)
+
+    assert "charset_normalizer" in hiddenimports
+    assert "charset_normalizer.md" in hiddenimports
+    assert "charset_normalizer.md__mypyc" in hiddenimports
+    assert "markitdown._markdown" in hiddenimports
+    assert calls[:2] == ["markitdown", "charset_normalizer"]
+
+
+def test_build_hiddenimports_warns_and_keeps_required_modules_when_optional_collection_fails():
+    warnings = []
+
+    def fake_collect(package: str) -> list[str]:
+        if package == "markitdown":
+            return []
+        if package == "charset_normalizer":
+            return []
+        if package == "pytesseract":
+            raise RuntimeError("missing optional package")
+        return []
+
+    hiddenimports = build_config.build_hiddenimports(
+        fake_collect,
+        warn=warnings.append,
+    )
+
+    assert "charset_normalizer.md__mypyc" in hiddenimports
+    assert warnings == [
+        "Warning: Could not collect hidden imports for pytesseract: missing optional package"
+    ]
+
+
+def test_build_datas_keeps_base_files_and_warns_for_missing_optional_packages():
+    warnings = []
+
+    def fake_collect(package: str) -> list[tuple[str, str]]:
+        if package == "magika":
+            return [("magika/model.onnx", "magika")]
+        if package == "pypdfium2":
+            raise RuntimeError("missing pdf runtime")
+        if package == "pypdfium2_raw":
+            return [("pdfium.dll", "pypdfium2_raw")]
+        raise AssertionError(f"Unexpected package: {package}")
+
+    datas = build_config.build_datas(fake_collect, warn=warnings.append)
+
+    assert ("LICENSE", ".") in datas
+    assert ("magika/model.onnx", "magika") in datas
+    assert ("pdfium.dll", "pypdfium2_raw") in datas
+    assert warnings == [
+        "Warning: Could not collect data files for pypdfium2: missing pdf runtime"
+    ]