Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 3 additions & 35 deletions MarkItDown.spec
Original file line number Diff line number Diff line change
@@ -1,42 +1,10 @@
# -*- mode: python ; coding: utf-8 -*-
import os
from PyInstaller.utils.hooks import collect_data_files, collect_submodules
from markitdowngui.build_config import build_datas, build_hiddenimports

# Hidden imports and data files are assembled by markitdowngui.build_config so
# the collection rules live in one importable module. Collecting them inline
# here as well would only repeat the same work (and print the same warnings)
# before the results were overwritten by the two assignments below.
hiddenimports = build_hiddenimports(collect_submodules, warn=print)
datas = build_datas(collect_data_files, warn=print)

a = Analysis(
["markitdowngui/main.py"],
Expand Down
76 changes: 76 additions & 0 deletions markitdowngui/build_config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
from __future__ import annotations

from collections.abc import Callable

# Module names that seed every list returned by build_hiddenimports(),
# independent of what the submodule collector reports.
BASE_HIDDENIMPORTS = (
    "packaging.version",
    "requests",
    "charset_normalizer",
    "charset_normalizer.md",
    "charset_normalizer.md__mypyc",
)

# Packages whose submodules build_hiddenimports() collects without any
# exception handling: a collector failure for one of these propagates.
MANDATORY_HIDDENIMPORT_PACKAGES = ("markitdown", "charset_normalizer")

# Packages whose submodules build_hiddenimports() collects on a best-effort
# basis: a collector failure is reported through ``warn`` and then skipped.
OPTIONAL_HIDDENIMPORT_PACKAGES = (
    "azure.ai.documentintelligence",
    "azure.identity",
    "pypdfium2",
    "pypdfium2_raw",
    "pytesseract",
)

# Entries that seed every list returned by build_datas(): the GUI resource
# files and the LICENSE file.
# NOTE(review): presumably (source path, destination directory) pairs in the
# shape PyInstaller expects for ``datas`` - confirm against the spec file.
BASE_DATAS = (
    ("markitdowngui/resources/markitdown-gui.ico", "markitdowngui/resources"),
    ("markitdowngui/resources/moon.svg", "markitdowngui/resources"),
    ("markitdowngui/resources/sun.svg", "markitdowngui/resources"),
    ("LICENSE", "."),
)

# Packages whose data files build_datas() collects on a best-effort basis:
# a collector failure is reported through ``warn`` and then skipped.
OPTIONAL_DATA_PACKAGES = ("magika", "pypdfium2", "pypdfium2_raw")


def _dedupe(items: list[str]) -> list[str]:
return list(dict.fromkeys(items))


def build_hiddenimports(
    collect_submodules: Callable[[str], list[str]],
    *,
    warn: Callable[[str], None] | None = None,
) -> list[str]:
    """Assemble the de-duplicated hidden-import list for the build.

    The result starts with ``BASE_HIDDENIMPORTS``, followed by whatever
    ``collect_submodules`` returns for each mandatory package and then for
    each optional package, with later duplicates removed.

    Args:
        collect_submodules: Callable that maps a package name to the module
            names to add for it.
        warn: Optional sink for warning messages. When ``None``, failures of
            optional packages are skipped silently.

    Returns:
        Module names in first-seen order, each appearing once.

    Raises:
        Exception: Whatever ``collect_submodules`` raises for a package in
            ``MANDATORY_HIDDENIMPORT_PACKAGES``; those calls are not guarded.
    """
    modules = [*BASE_HIDDENIMPORTS]

    # Mandatory packages are collected unguarded so a failure reaches the caller.
    for package in MANDATORY_HIDDENIMPORT_PACKAGES:
        modules += collect_submodules(package)

    # Optional packages are best-effort: report the failure (if asked) and move on.
    for package in OPTIONAL_HIDDENIMPORT_PACKAGES:
        try:
            modules += collect_submodules(package)
        except Exception as exc:
            if warn is None:
                continue
            warn(
                f"Warning: Could not collect hidden imports for {package}: {exc}"
            )

    return _dedupe(modules)


def build_datas(
    collect_data_files: Callable[[str], list[tuple[str, str]]],
    *,
    warn: Callable[[str], None] | None = None,
) -> list[tuple[str, str]]:
    """Assemble the data-file entries for the build.

    The result starts with ``BASE_DATAS``, followed by whatever
    ``collect_data_files`` returns for each package in
    ``OPTIONAL_DATA_PACKAGES``. A package whose collection raises is skipped.

    Args:
        collect_data_files: Callable that maps a package name to the data-file
            entries to add for it.
        warn: Optional sink for warning messages. When ``None``, collection
            failures are skipped silently.

    Returns:
        The combined entries, in collection order (no de-duplication).
    """
    entries = [*BASE_DATAS]

    # Every data package is best-effort: report the failure (if asked) and move on.
    for package in OPTIONAL_DATA_PACKAGES:
        try:
            entries += collect_data_files(package)
        except Exception as exc:
            if warn is None:
                continue
            warn(f"Warning: Could not collect data files for {package}: {exc}")

    return entries
70 changes: 70 additions & 0 deletions tests/test_build_config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
from markitdowngui import build_config


def test_build_hiddenimports_includes_charset_normalizer_mypyc_runtime():
    """The charset_normalizer modules are present and mandatory packages go first."""
    requested = []
    # Canned collector output; an unexpected package name raises KeyError.
    submodules_by_package = {
        "markitdown": ["markitdown._markdown"],
        "charset_normalizer": ["charset_normalizer.api"],
        "azure.ai.documentintelligence": ["azure.ai.documentintelligence._client"],
        "azure.identity": [],
        "pypdfium2": [],
        "pypdfium2_raw": [],
        "pytesseract": [],
    }

    def fake_collect(package: str) -> list[str]:
        requested.append(package)
        return submodules_by_package[package]

    result = build_config.build_hiddenimports(fake_collect)

    for expected in (
        "charset_normalizer",
        "charset_normalizer.md",
        "charset_normalizer.md__mypyc",
        "markitdown._markdown",
    ):
        assert expected in result
    # The mandatory packages are collected before any optional one.
    assert requested[:2] == ["markitdown", "charset_normalizer"]


def test_build_hiddenimports_warns_and_keeps_required_modules_when_optional_collection_fails():
    """A failing optional package produces one warning and leaves the base list intact."""
    emitted = []

    def fake_collect(package: str) -> list[str]:
        # Only pytesseract fails; every other package yields no submodules.
        if package == "pytesseract":
            raise RuntimeError("missing optional package")
        return []

    result = build_config.build_hiddenimports(fake_collect, warn=emitted.append)

    assert "charset_normalizer.md__mypyc" in result
    assert emitted == [
        "Warning: Could not collect hidden imports for pytesseract: missing optional package"
    ]


def test_build_datas_keeps_base_files_and_warns_for_missing_optional_packages():
    """Base entries survive, successful packages contribute, and a failure is warned."""
    emitted = []
    # Packages that collect successfully, with their canned data-file entries.
    available = {
        "magika": [("magika/model.onnx", "magika")],
        "pypdfium2_raw": [("pdfium.dll", "pypdfium2_raw")],
    }

    def fake_collect(package: str) -> list[tuple[str, str]]:
        if package == "pypdfium2":
            raise RuntimeError("missing pdf runtime")
        if package not in available:
            raise AssertionError(f"Unexpected package: {package}")
        return available[package]

    result = build_config.build_datas(fake_collect, warn=emitted.append)

    for expected in (
        ("LICENSE", "."),
        ("magika/model.onnx", "magika"),
        ("pdfium.dll", "pypdfium2_raw"),
    ):
        assert expected in result
    assert emitted == [
        "Warning: Could not collect data files for pypdfium2: missing pdf runtime"
    ]