diff --git a/README.md b/README.md index 3f13987..be0bcd6 100644 --- a/README.md +++ b/README.md @@ -9,8 +9,9 @@ It focuses on fast multi-file conversion to Markdown with a modern Fluent-style ## Features -- Queue-based file workflow with drag and drop. -- Batch conversion with start, pause/resume, cancel, and progress feedback. +- Queue-based file workflow with drag and drop. +- Paste website URLs and convert article content to Markdown with the hosted Defuddle API. +- Batch conversion with start, pause/resume, cancel, and progress feedback. - Results view with per-file selection and Markdown preview. - Preview modes: rendered Markdown view and raw Markdown view. - Save modes: export as one combined file or separate files. @@ -23,10 +24,10 @@ It focuses on fast multi-file conversion to Markdown with a modern Fluent-style Download prebuilt binaries from [Releases](https://github.com/imadreamerboy/markitdown-gui/releases), or run from source. -### Prerequisites - -- Python `3.10+` -- `uv` (recommended) +### Prerequisites + +- Python `3.10+` +- `uv` (recommended) Install dependencies: @@ -40,14 +41,23 @@ Alternative: pip install -e .[dev] ``` -### OCR Notes +### OCR Notes - OCR is optional and disabled by default. - Local OCR requires a system `tesseract` binary. Install it from the [official Tesseract project](https://github.com/tesseract-ocr/tesseract). If it is not on your `PATH`, set the executable path in Settings. - Azure OCR requires an Azure Document Intelligence endpoint in Settings. - Azure Document Intelligence pricing includes [500 free pages per month](https://azure.microsoft.com/en-us/products/ai-foundry/tools/document-intelligence#Pricing) at the time of writing. - For API-key auth, set `AZURE_OCR_API_KEY`. -- If `AZURE_OCR_API_KEY` is not set, Azure OCR falls back to Azure identity credentials supported by `DefaultAzureCredential`. +- If `AZURE_OCR_API_KEY` is not set, Azure OCR falls back to Azure identity credentials supported by `DefaultAzureCredential`. + +### Website URL Notes + +- Website conversion uses the hosted [Defuddle](https://defuddle.md/) API. +- The app sends the pasted `http://` or `https://` URL to `https://defuddle.md/` and stores the returned Markdown in the normal results view. +- Defuddle responses typically include YAML frontmatter metadata at the top when available. +- According to the [Defuddle Terms](https://defuddle.md/terms), unauthenticated requests are limited to `1,000` requests per month per IP address as of March 14, 2026. +- Because requests are sent directly from the desktop app, that free-tier limit applies to the user's own network IP. +- Website conversion requires an internet connection and depends on the external Defuddle service being available. ## Run the App diff --git a/markitdowngui/core/conversion.py b/markitdowngui/core/conversion.py index f2fb90c..cad4a56 100644 --- a/markitdowngui/core/conversion.py +++ b/markitdowngui/core/conversion.py @@ -1,22 +1,30 @@ from __future__ import annotations -from dataclasses import dataclass -from itertools import islice -import os -from pathlib import Path - -from PySide6.QtCore import QThread, Signal - -IMAGE_EXTENSIONS = {".bmp", ".gif", ".jpeg", ".jpg", ".png", ".tiff", ".webp"} -DOCINTEL_IMAGE_EXTENSIONS = {".bmp", ".jpeg", ".jpg", ".png", ".tiff"} -PDF_EXTENSION = ".pdf" -PDF_RENDER_SCALE = 3.0 -LOCAL_OCR_TIMEOUT_SECONDS = 60 -AZURE_OCR_API_KEY_ENV_VAR = "AZURE_OCR_API_KEY" -CONVERSION_ERROR_PREFIX = "Error converting " -BACKEND_AZURE = "azure" -BACKEND_LOCAL = "local" -BACKEND_NATIVE = "native" +from dataclasses import dataclass +from itertools import islice +import os +from pathlib import Path +from urllib.parse import quote + +import requests + +from PySide6.QtCore import QThread, Signal + +from markitdowngui.core.input_sources import is_web_url + +IMAGE_EXTENSIONS = {".bmp", ".gif", ".jpeg", ".jpg", ".png", ".tiff", ".webp"} +DOCINTEL_IMAGE_EXTENSIONS = {".bmp", ".jpeg", ".jpg", ".png", ".tiff"} +PDF_EXTENSION = ".pdf" +PDF_RENDER_SCALE = 3.0 +LOCAL_OCR_TIMEOUT_SECONDS = 60 +DEFUDDLE_REQUEST_TIMEOUT_SECONDS = 30 +DEFUDDLE_API_BASE_URL = "https://defuddle.md/" +AZURE_OCR_API_KEY_ENV_VAR = "AZURE_OCR_API_KEY" +CONVERSION_ERROR_PREFIX = "Error converting " +BACKEND_AZURE = "azure" +BACKEND_DEFUDDLE = "defuddle" +BACKEND_LOCAL = "local" +BACKEND_NATIVE = "native" @dataclass(frozen=True) @@ -144,13 +152,20 @@ def test_azure_ocr_connection(options: ConversionOptions) -> str: return "api_key" -def convert_file_with_details( - file_path: str, - options: ConversionOptions | None = None, -) -> ConversionOutcome: - """Convert a single file to Markdown text and report which backend produced it.""" - effective_options = options or ConversionOptions() - extension = Path(file_path).suffix.lower() +def convert_file_with_details( + file_path: str, + options: ConversionOptions | None = None, +) -> ConversionOutcome: + """Convert a single file to Markdown text and report which backend produced it.""" + effective_options = options or ConversionOptions() + + if is_web_url(file_path): + return ConversionOutcome( + markdown=_convert_url_with_defuddle(file_path), + backend=BACKEND_DEFUDDLE, + ) + + extension = Path(file_path).suffix.lower() if not effective_options.ocr_enabled: return ConversionOutcome( @@ -259,10 +274,10 @@ def _convert_pdf_with_ocr_fallback( ) -def _convert_with_markitdown( - file_path: str, - options: ConversionOptions, - *, +def _convert_with_markitdown( + file_path: str, + options: ConversionOptions, + *, use_docintel: bool = False, ) -> str: # Delay heavy imports until conversion is requested. @@ -274,8 +289,42 @@ def _convert_with_markitdown( kwargs["docintel_credential"], _auth_method = _build_docintel_credential() md = MarkItDown(**kwargs) - result = md.convert(file_path) - return result.text_content or "" + result = md.convert(file_path) + return result.text_content or "" + + +def _convert_url_with_defuddle(url: str) -> str: + request_url = _build_defuddle_request_url(url) + + try: + response = requests.get( + request_url, + timeout=DEFUDDLE_REQUEST_TIMEOUT_SECONDS, + ) + except requests.Timeout as exc: + raise RuntimeError( + "Website conversion timed out while waiting for the Defuddle service." + ) from exc + except requests.RequestException as exc: + raise RuntimeError( + f"Website conversion failed to reach the Defuddle service: {exc}" + ) from exc + + if response.status_code == 429: + raise RuntimeError( + "Defuddle rate limit reached. The free tier allows up to 1,000 requests per month per IP." + ) + + if not response.ok: + message = response.text.strip() + raise RuntimeError(message or "Defuddle failed to convert the URL.") + + return response.text.strip() + + +def _build_defuddle_request_url(url: str) -> str: + encoded_url = quote(url.strip(), safe="") + return f"{DEFUDDLE_API_BASE_URL}{encoded_url}" def _convert_image_with_local_ocr(file_path: str, options: ConversionOptions) -> str: diff --git a/markitdowngui/core/input_sources.py b/markitdowngui/core/input_sources.py new file mode 100644 index 0000000..2b9b1fd --- /dev/null +++ b/markitdowngui/core/input_sources.py @@ -0,0 +1,50 @@ +from __future__ import annotations + +import re +from pathlib import Path, PureWindowsPath +from urllib.parse import unquote, urlparse + +WEB_URL_SCHEMES = {"http", "https"} +UNSAFE_FILENAME_CHARS = re.compile(r"[^A-Za-z0-9._-]+") + + +def is_web_url(value: str) -> bool: + candidate = value.strip() + if not candidate: + return False + if any(ch.isspace() or ord(ch) < 32 for ch in candidate): + return False + + parsed = urlparse(candidate) + return parsed.scheme.lower() in WEB_URL_SCHEMES and bool(parsed.netloc) + + +def _source_path(source: str) -> Path | PureWindowsPath: + candidate = source.strip() + if "\\" in candidate: + return PureWindowsPath(candidate) + return Path(candidate) + + +def source_display_name(source: str) -> str: + return source.strip() if is_web_url(source) else _source_path(source).name or source + + +def source_output_stem(source: str) -> str: + if not is_web_url(source): + return _source_path(source).stem or "converted" + + parsed = urlparse(source.strip()) + path_parts = [part for part in parsed.path.split("/") if part] + slug = unquote(path_parts[-1]) if path_parts else "" + query = parsed.query.split("&", 1)[0] if parsed.query else "" + + segments = [parsed.netloc] + if slug: + segments.append(slug) + elif query: + segments.append(query) + + candidate = "-".join(segments) + sanitized = UNSAFE_FILENAME_CHARS.sub("-", candidate).strip("._-") + return sanitized or "website" diff --git a/markitdowngui/ui/components/url_input_bar.py b/markitdowngui/ui/components/url_input_bar.py new file mode 100644 index 0000000..8062a74 --- /dev/null +++ b/markitdowngui/ui/components/url_input_bar.py @@ -0,0 +1,41 @@ +from __future__ import annotations + +from PySide6.QtCore import Signal +from PySide6.QtWidgets import QHBoxLayout, QWidget +from qfluentwidgets import LineEdit, PushButton + + +class UrlInputBar(QWidget): + url_submitted = Signal(str) + + def __init__(self, translate, parent=None): + super().__init__(parent=parent) + self.translate = translate + + layout = QHBoxLayout(self) + layout.setContentsMargins(0, 0, 0, 0) + layout.setSpacing(8) + + self.url_edit = LineEdit(self) + self.url_edit.returnPressed.connect(self.submit_url) + + self.submit_button = PushButton(self) + self.submit_button.clicked.connect(self.submit_url) + + layout.addWidget(self.url_edit, 1) + layout.addWidget(self.submit_button) + + self.retranslate_ui(translate) + + def submit_url(self) -> None: + value = self.url_edit.text().strip() + if value: + self.url_submitted.emit(value) + + def clear(self) -> None: + self.url_edit.clear() + + def retranslate_ui(self, translate) -> None: + self.translate = translate + self.url_edit.setPlaceholderText(self.translate("home_url_placeholder")) + self.submit_button.setText(self.translate("home_add_url_button")) diff --git a/markitdowngui/ui/help_interface.py b/markitdowngui/ui/help_interface.py index 5eff907..52e4553 100644 --- a/markitdowngui/ui/help_interface.py +++ b/markitdowngui/ui/help_interface.py @@ -76,13 +76,19 @@ def _build_ui(self) -> None: repo_btn = HyperlinkButton() repo_btn.setText(self.translate("help_open_repository")) - repo_btn.setIcon(FIF.GITHUB) - repo_btn.setUrl(QUrl("https://github.com/imadreamerboy/markitdown-gui")) - layout.addWidget(repo_btn, 0, Qt.AlignmentFlag.AlignLeft) - - azure_pricing_btn = HyperlinkButton() - azure_pricing_btn.setText(self.translate("help_open_azure_ocr_pricing")) - azure_pricing_btn.setIcon(FIF.LINK) + repo_btn.setIcon(FIF.GITHUB) + repo_btn.setUrl(QUrl("https://github.com/imadreamerboy/markitdown-gui")) + layout.addWidget(repo_btn, 0, Qt.AlignmentFlag.AlignLeft) + + defuddle_docs_btn = HyperlinkButton() + defuddle_docs_btn.setText(self.translate("help_open_defuddle_docs")) + defuddle_docs_btn.setIcon(FIF.LINK) + defuddle_docs_btn.setUrl(QUrl("https://defuddle.md/docs")) + layout.addWidget(defuddle_docs_btn, 0, Qt.AlignmentFlag.AlignLeft) + + azure_pricing_btn = HyperlinkButton() + azure_pricing_btn.setText(self.translate("help_open_azure_ocr_pricing")) + azure_pricing_btn.setIcon(FIF.LINK) azure_pricing_btn.setUrl( QUrl( "https://azure.microsoft.com/en-us/products/ai-foundry/tools/document-intelligence#Pricing" @@ -96,8 +102,22 @@ def _build_ui(self) -> None: tesseract_btn.setUrl(QUrl("https://github.com/tesseract-ocr/tesseract")) layout.addWidget(tesseract_btn, 0, Qt.AlignmentFlag.AlignLeft) - layout.addWidget(TitleLabel(self.translate("help_faq_title"))) - + layout.addWidget(TitleLabel(self.translate("help_faq_title"))) + + layout.addWidget( + self._build_faq_card( + FIF.GLOBE, + "help_faq_defuddle_question", + "help_faq_defuddle_answer", + ) + ) + layout.addWidget( + self._build_faq_card( + FIF.GLOBE, + "help_faq_defuddle_limits_question", + "help_faq_defuddle_limits_answer", + ) + ) layout.addWidget( self._build_faq_card( FIF.FOLDER, diff --git a/markitdowngui/ui/home_interface.py b/markitdowngui/ui/home_interface.py index 832d65f..e1718f0 100644 --- a/markitdowngui/ui/home_interface.py +++ b/markitdowngui/ui/home_interface.py @@ -32,19 +32,26 @@ SegmentedWidget, ) -from markitdowngui.core.conversion import ( - BACKEND_AZURE, - BACKEND_LOCAL, - BACKEND_NATIVE, - ConversionOptions, - ConversionWorker, -) -from markitdowngui.core.file_utils import FileManager -from markitdowngui.core.settings import SettingsManager -from markitdowngui.ui.components.file_panel import FilePanel -from markitdowngui.ui.dialogs.shortcuts import ShortcutDialog -from markitdowngui.ui.home_state import next_state_after_queue_change -from markitdowngui.ui.themes import markdown_html_css +from markitdowngui.core.conversion import ( + BACKEND_AZURE, + BACKEND_DEFUDDLE, + BACKEND_LOCAL, + BACKEND_NATIVE, + ConversionOptions, + ConversionWorker, +) +from markitdowngui.core.file_utils import FileManager +from markitdowngui.core.input_sources import ( + is_web_url, + source_display_name, + source_output_stem, +) +from markitdowngui.core.settings import SettingsManager +from markitdowngui.ui.components.file_panel import FilePanel +from markitdowngui.ui.components.url_input_bar import UrlInputBar +from markitdowngui.ui.dialogs.shortcuts import ShortcutDialog +from markitdowngui.ui.home_state import next_state_after_queue_change +from markitdowngui.ui.themes import markdown_html_css from markitdowngui.utils.logger import AppLogger from markitdowngui.utils.translations import DEFAULT_LANG @@ -114,6 +121,10 @@ def _build_ui(self) -> None: self.empty_select_btn.setIcon(FIF.FOLDER_ADD) self.empty_select_btn.clicked.connect(self.browse_files) empty_layout.addWidget(self.empty_select_btn, 0, Qt.AlignmentFlag.AlignHCenter) + self.empty_url_input = UrlInputBar(self.translate, self.empty_card) + self.empty_url_input.setMaximumWidth(560) + self.empty_url_input.url_submitted.connect(self.submit_url) + empty_layout.addWidget(self.empty_url_input, 0, Qt.AlignmentFlag.AlignHCenter) empty_layout.addWidget( self.supported_formats_label, 0, @@ -128,13 +139,17 @@ def _build_ui(self) -> None: queue_header = QHBoxLayout() queue_header.setSpacing(8) self.queue_title = BodyLabel(self.translate("home_queue_title")) - self.add_files_btn = PillPushButton(self.translate("home_add_files_button")) - self.add_files_btn.setIcon(FIF.ADD) - self.add_files_btn.clicked.connect(self.browse_files) - queue_header.addWidget(self.queue_title) - queue_header.addStretch(1) - queue_header.addWidget(self.add_files_btn) - queue_layout.addLayout(queue_header) + self.add_files_btn = PillPushButton(self.translate("home_add_files_button")) + self.add_files_btn.setIcon(FIF.ADD) + self.add_files_btn.clicked.connect(self.browse_files) + queue_header.addWidget(self.queue_title) + queue_header.addStretch(1) + queue_header.addWidget(self.add_files_btn) + queue_layout.addLayout(queue_header) + + self.queue_url_input = UrlInputBar(self.translate, self.queue_card) + self.queue_url_input.url_submitted.connect(self.submit_url) + queue_layout.addWidget(self.queue_url_input) self.filePanel = FilePanel(self.translate) self.filePanel.files_added.connect(self.handle_files_added) @@ -371,44 +386,59 @@ def dragEnterEvent(self, event) -> None: if event.mimeData().hasUrls(): event.acceptProposedAction() - def dropEvent(self, event) -> None: - if not event.mimeData().hasUrls(): - return - files = [url.toLocalFile() for url in event.mimeData().urls() if url.toLocalFile()] - if files: - self._add_files_to_queue(files) - event.acceptProposedAction() - - def browse_files(self) -> None: - files, _ = QFileDialog.getOpenFileNames( - self, - self.translate("select_files_title"), - "", - self.translate("all_files_filter"), - ) - if files: - self._add_files_to_queue(files) - - def handle_files_added(self, files: list[str]) -> None: - self._add_files_to_queue(files, add_to_panel=False) - - def _add_files_to_queue(self, files: list[str], add_to_panel: bool = True) -> None: - existing = set(self.filePanel.get_all_files()) - added = False - for file in files: - if not file or file in existing: - continue - if add_to_panel: - self.filePanel.add_file(file) - existing.add(file) - added = True - self.handleNewFile(file) - - if added: - had_results = bool(self.conversionResults) - self._set_state_queue() - self._clear_result_views(reset_progress=had_results) - self._update_queue_title() + def dropEvent(self, event) -> None: + if not event.mimeData().hasUrls(): + return + files = [url.toLocalFile() for url in event.mimeData().urls() if url.toLocalFile()] + if files: + self._add_sources_to_queue(files) + event.acceptProposedAction() + + def browse_files(self) -> None: + files, _ = QFileDialog.getOpenFileNames( + self, + self.translate("select_files_title"), + "", + self.translate("all_files_filter"), + ) + if files: + self._add_sources_to_queue(files) + + def handle_files_added(self, files: list[str]) -> None: + self._add_sources_to_queue(files, add_to_panel=False) + + def submit_url(self, url: str) -> None: + candidate = url.strip() + if not is_web_url(candidate): + QMessageBox.warning( + self, + self.translate("home_url_invalid_title"), + self.translate("home_url_invalid_message"), + ) + return + + self._add_sources_to_queue([candidate]) + self.empty_url_input.clear() + self.queue_url_input.clear() + + def _add_sources_to_queue(self, sources: list[str], add_to_panel: bool = True) -> None: + existing = set(self.filePanel.get_all_files()) + added = False + for source in sources: + if not source or source in existing: + continue + if add_to_panel: + self.filePanel.add_file(source) + existing.add(source) + added = True + if not is_web_url(source): + self.handleNewFile(source) + + if added: + had_results = bool(self.conversionResults) + self._set_state_queue() + self._clear_result_views(reset_progress=had_results) + self._update_queue_title() def handleNewFile(self, filepath: str) -> None: try: @@ -455,40 +485,44 @@ def cancel_conversion(self) -> None: self.pause_button.setChecked(False) AppLogger.info(self.translate("conversion_cancelled_log")) - def convert_files(self) -> None: - if self.worker and self.worker.isRunning(): - QMessageBox.warning( - self, - self.translate("conversion_in_progress_title"), + def convert_files(self) -> None: + if self.worker and self.worker.isRunning(): + QMessageBox.warning( + self, + self.translate("conversion_in_progress_title"), self.translate("conversion_in_progress_message"), ) return - files = self.filePanel.get_all_files() - if not files: - QMessageBox.warning( - self, - self.translate("no_files_to_convert_title"), - self.translate("no_files_to_convert_message"), - ) - return - - valid_files = [f for f in files if os.path.exists(f) and os.access(f, os.R_OK)] - if not valid_files: - QMessageBox.warning( - self, - self.translate("no_valid_files_title"), - self.translate("no_valid_files_message"), - ) + sources = self.filePanel.get_all_files() + if not sources: + QMessageBox.warning( + self, + self.translate("no_files_to_convert_title"), + self.translate("no_files_to_convert_message"), + ) + return + + valid_sources = [ + source + for source in sources + if is_web_url(source) or (os.path.exists(source) and os.access(source, os.R_OK)) + ] + if not valid_sources: + QMessageBox.warning( + self, + self.translate("no_valid_files_title"), + self.translate("no_valid_files_message"), + ) return - try: - batch_size = self.settings_manager.get_batch_size() - options = self._build_conversion_options() - self.worker = ConversionWorker(valid_files, batch_size, options) - self.worker.progress.connect(self.update_progress) - self.worker.finished.connect(self.handle_conversion_finished) - self.worker.error.connect(self.handle_conversion_error) + try: + batch_size = self.settings_manager.get_batch_size() + options = self._build_conversion_options() + self.worker = ConversionWorker(valid_sources, batch_size, options) + self.worker.progress.connect(self.update_progress) + self.worker.finished.connect(self.handle_conversion_finished) + self.worker.error.connect(self.handle_conversion_error) self.pause_button.setEnabled(True) self.cancel_button.setEnabled(True) @@ -515,13 +549,13 @@ def _build_conversion_options(self) -> ConversionOptions: tesseract_path=self.settings_manager.get_tesseract_path(), ) - def update_progress(self, progress: int, current_file: str) -> None: - text = self.translate("conversion_progress_format").format( - progress=progress, file=os.path.basename(current_file) - ) - self.progress.setValue(progress) - self.progress.setFormat(text) - self.progress_status.setText(text) + def update_progress(self, progress: int, current_file: str) -> None: + text = self.translate("conversion_progress_format").format( + progress=progress, file=source_display_name(current_file) + ) + self.progress.setValue(progress) + self.progress.setFormat(text) + self.progress_status.setText(text) def handle_conversion_finished(self, results: dict[str, str]) -> None: self.conversionResults = results @@ -563,12 +597,13 @@ def handle_conversion_finished(self, results: dict[str, str]) -> None: parent=self, ) - def _format_processing_backend_summary(self) -> str: - counts = { - BACKEND_AZURE: 0, - BACKEND_LOCAL: 0, - BACKEND_NATIVE: 0, - } + def _format_processing_backend_summary(self) -> str: + counts = { + BACKEND_AZURE: 0, + BACKEND_DEFUDDLE: 0, + BACKEND_LOCAL: 0, + BACKEND_NATIVE: 0, + } for file_path, backend in self.processingBackends.items(): if file_path in self.failedConversionFiles: @@ -576,14 +611,15 @@ def _format_processing_backend_summary(self) -> str: if backend in counts: counts[backend] += 1 - parts: list[str] = [] - for backend, label_key in ( - (BACKEND_AZURE, "conversion_backend_azure"), - (BACKEND_LOCAL, "conversion_backend_local"), - (BACKEND_NATIVE, "conversion_backend_native"), - ): - count = counts[backend] - if count: + parts: list[str] = [] + for backend, label_key in ( + (BACKEND_AZURE, "conversion_backend_azure"), + (BACKEND_DEFUDDLE, "conversion_backend_defuddle"), + (BACKEND_LOCAL, "conversion_backend_local"), + (BACKEND_NATIVE, "conversion_backend_native"), + ): + count = counts[backend] + if count: parts.append( self.translate("conversion_backend_summary_item").format( label=self.translate(label_key), @@ -611,16 +647,16 @@ def _reset_controls(self) -> None: self.convert_button.setEnabled(True) self.worker = None - def _populate_result_view(self) -> None: - self.result_file_list.clear() - for file in self.conversionResults.keys(): - item_text = os.path.basename(file) - self.result_file_list.addItem(item_text) - self.result_file_list.item(self.result_file_list.count() - 1).setData( - Qt.ItemDataRole.UserRole, file - ) - if self.result_file_list.count() > 0: - self.result_file_list.setCurrentRow(0) + def _populate_result_view(self) -> None: + self.result_file_list.clear() + for source in self.conversionResults.keys(): + item_text = source_display_name(source) + self.result_file_list.addItem(item_text) + self.result_file_list.item(self.result_file_list.count() - 1).setData( + Qt.ItemDataRole.UserRole, source + ) + if self.result_file_list.count() > 0: + self.result_file_list.setCurrentRow(0) def _on_result_file_changed(self, current, _previous) -> None: if not current: @@ -629,7 +665,9 @@ def _on_result_file_changed(self, current, _previous) -> None: return file_path = current.data(Qt.ItemDataRole.UserRole) self._set_preview_file_caption( - self.translate("home_preview_for_file").format(file=os.path.basename(file_path)) + self.translate("home_preview_for_file").format( + file=source_display_name(file_path) + ) ) markdown = self.conversionResults.get(file_path, "") self._set_markdown_preview(markdown) @@ -696,8 +734,11 @@ def save_combined_output(self) -> None: if not output_path.lower().endswith(output_ext.lower()): output_path += output_ext - parts = [f"File: {file}\n{content}" for file, content in self.conversionResults.items()] - combined_output = "\n\n".join(parts) + parts = [ + self.translate("conversion_source_heading").format(source=source) + f"\n{content}" + for source, content in self.conversionResults.items() + ] + combined_output = "\n\n".join(parts) try: self.file_manager.save_markdown_file(output_path, combined_output) @@ -722,14 +763,14 @@ def save_individual_outputs(self) -> None: if not output_dir: return - output_ext = self.settings_manager.get_default_output_format() - for input_file, content in self.conversionResults.items(): - base_name = os.path.splitext(os.path.basename(input_file))[0] - output_path = os.path.join(output_dir, f"{base_name}{output_ext}") - counter = 1 - while os.path.exists(output_path): - output_path = os.path.join(output_dir, f"{base_name}_{counter}{output_ext}") - counter += 1 + output_ext = self.settings_manager.get_default_output_format() + for input_file, content in self.conversionResults.items(): + base_name = source_output_stem(input_file) + output_path = os.path.join(output_dir, f"{base_name}{output_ext}") + counter = 1 + while os.path.exists(output_path): + output_path = os.path.join(output_dir, f"{base_name}_{counter}{output_ext}") + counter += 1 try: self.file_manager.save_markdown_file(output_path, content) except Exception: diff --git a/markitdowngui/utils/translations.py b/markitdowngui/utils/translations.py index 5823c32..6a5bf0b 100644 --- a/markitdowngui/utils/translations.py +++ b/markitdowngui/utils/translations.py @@ -23,9 +23,14 @@ "help_description": "Quick access to updates, shortcuts, project links, and OCR resources.", "help_open_releases": "Open Releases Page", "help_open_repository": "Open Repository", + "help_open_defuddle_docs": "Defuddle Docs", "help_open_azure_ocr_pricing": "Azure OCR Pricing", "help_open_tesseract": "Tesseract Installation", "help_faq_title": "FAQ", + "help_faq_defuddle_question": "How does website URL conversion work?", + "help_faq_defuddle_answer": "When you paste an http:// or https:// URL, the app sends that URL to the hosted Defuddle service and stores the Markdown response in the same results view used for files.\n\nDefuddle returns Markdown with YAML frontmatter metadata at the top when available.\n\nThis means website conversion depends on the external Defuddle service and an internet connection.", + "help_faq_defuddle_limits_question": "What are the limits for website conversion?", + "help_faq_defuddle_limits_answer": "The Defuddle terms say unauthenticated requests are limited to 1,000 requests per month per IP address.\n\nBecause this app calls the service directly from your machine, that limit applies to your own network IP, not a shared server.\n\nIf the service returns a rate limit or is unavailable, website conversion will fail until the limit resets or the service is back.", "help_faq_tesseract_windows_question": "How do I set the Tesseract path on Windows?", "help_faq_tesseract_windows_answer": "Install Tesseract first.\n\nIf tesseract.exe is already on PATH, leave the field empty.\nOtherwise, browse to the executable in Settings.\n\nCommon locations:\nC:\\Program Files\\Tesseract-OCR\\tesseract.exe\nC:\\Program Files (x86)\\Tesseract-OCR\\tesseract.exe\n\nYou can also run `where tesseract` in PowerShell.", "help_faq_tesseract_macos_question": "How do I set the Tesseract path on macOS?", @@ -38,10 +43,10 @@ "help_faq_local_fallback_answer": "Azure OCR is tried first for supported PDFs and images when an endpoint is configured.\n\nIf Azure authentication or endpoint setup fails, the app falls back to local Tesseract OCR.\n\nCheck the endpoint.\nMake sure AZURE_OCR_API_KEY is set for the app process.\nRestart the app after changing environment variables.", "home_empty_title": "Drag and drop files here", "home_empty_state_title": "Start a new conversion", - "home_empty_subtitle": "or click to select files", - "home_supported_formats": "Supported formats: docx, pptx, xlsx/xls, pdf, epub, html, txt, csv, json, xml, images, zip", - "home_queue_title": "Selected files", - "home_queue_title_with_count": "Files to Convert ({count})", + "home_empty_subtitle": "Add files or paste a website URL", + "home_supported_formats": "Supported inputs: docx, pptx, xlsx/xls, pdf, epub, html, txt, csv, json, xml, images, zip, website URLs", + "home_queue_title": "Selected inputs", + "home_queue_title_with_count": "Items to Convert ({count})", "home_results_title": "Conversion Complete!", "home_source_label": "Source", "home_source_placeholder": "Select a converted file to inspect the source preview.", @@ -54,6 +59,10 @@ "home_rendered_view_button": "Rendered", "home_raw_view_button": "Raw Markdown", "home_add_files_button": "Add Files", + "home_add_url_button": "Add URL", + "home_url_placeholder": "https://example.com/article", + "home_url_invalid_title": "Invalid URL", + "home_url_invalid_message": "Enter a full website URL starting with http:// or https://.", "home_remove_selected_button": "Remove Selected", "home_clear_queue_button": "Clear Queue", "home_back_to_queue_button": "Back to Queue", @@ -118,13 +127,13 @@ "pause_button": "Pause", "resume_button": "Resume", "cancel_button": "Cancel", - "convert_files_button": "Convert Files", + "convert_files_button": "Convert", "output_save_all_in_one_checkbox": "Save all files in one document", "output_save_all_in_one_tooltip": "When unchecked, each file will be saved separately", "copy_output_button": "Copy Output", "save_output_button": "Save Output", - "no_files_to_convert_title": "No Files", - "no_files_to_convert_message": "Please add files to convert.", + "no_files_to_convert_title": "No Input", + "no_files_to_convert_message": "Add at least one file or website URL to convert.", "conversion_progress_format": "{progress}% - Processing: {file}", "conversion_complete_message": "Conversion Complete", "conversion_partial_failure_title": "Conversion Completed With Errors", @@ -132,8 +141,10 @@ "conversion_backend_summary": "Backends used: {details}", "conversion_backend_summary_item": "{label}: {count}", "conversion_backend_azure": "Azure OCR", + "conversion_backend_defuddle": "Defuddle", "conversion_backend_local": "Local OCR", "conversion_backend_native": "Native extraction", + "conversion_source_heading": "Source: {source}", "conversion_error_title": "Error", "save_combined_title": "Save Combined Markdown Output", "markdown_files_filter": "Markdown Files (*.md);;All Files (*)", @@ -199,8 +210,8 @@ # Error handling translations "conversion_in_progress_title": "Conversion In Progress", "conversion_in_progress_message": "A conversion is already in progress. Please wait for it to complete or cancel it first.", - "no_valid_files_title": "No Valid Files", - "no_valid_files_message": "No valid files found. Please check that the files exist and are readable.", + "no_valid_files_title": "No Valid Input", + "no_valid_files_message": "No readable files or valid website URLs were found.", "settings_error_title": "Settings Error", "settings_error_message": "Error loading format settings: {error}", "markitdown_config_error_title": "Configuration Error", @@ -233,9 +244,14 @@ "help_description": "快速访问更新、快捷键、项目链接和 OCR 相关资源。", "help_open_releases": "打开发布页面", "help_open_repository": "打开仓库", + "help_open_defuddle_docs": "Defuddle 文档", "help_open_azure_ocr_pricing": "Azure OCR 定价", "help_open_tesseract": "Tesseract 安装说明", "help_faq_title": "常见问题", + "help_faq_defuddle_question": "网页 URL 转换是如何工作的?", + "help_faq_defuddle_answer": "当你粘贴以 http:// 或 https:// 开头的 URL 时,应用会把该 URL 发送到托管的 Defuddle 服务,并将返回的 Markdown 放入与文件转换相同的结果视图中。\n\nDefuddle 在可用时会返回包含 YAML frontmatter 元数据的 Markdown。\n\n这意味着网页转换依赖外部 Defuddle 服务和网络连接。", + "help_faq_defuddle_limits_question": "网页转换有哪些限制?", + "help_faq_defuddle_limits_answer": "Defuddle 的条款说明,未认证请求限制为每个 IP 地址每月 1,000 次。\n\n由于这个应用直接从你的设备调用该服务,这个限制作用于你自己的网络 IP,而不是共享服务器。\n\n如果服务返回限流或暂时不可用,网页转换会失败,直到额度重置或服务恢复。", "help_faq_tesseract_windows_question": "Windows 上如何设置 Tesseract 路径?", "help_faq_tesseract_windows_answer": "先安装 Tesseract。\n\n如果 tesseract.exe 已在 PATH 中,可将该字段留空。\n否则请在设置中浏览到可执行文件。\n\n常见路径:\nC:\\Program Files\\Tesseract-OCR\\tesseract.exe\nC:\\Program Files (x86)\\Tesseract-OCR\\tesseract.exe\n\n也可以在 PowerShell 中运行 `where tesseract`。", "help_faq_tesseract_macos_question": "macOS 上如何设置 Tesseract 路径?", @@ -248,10 +264,10 @@ "help_faq_local_fallback_answer": "只要已配置终结点,支持的 PDF 和图片会先尝试 Azure OCR。\n\n如果 Azure 认证或终结点配置失败,应用会回退到本地 Tesseract OCR。\n\n请检查终结点。\n确认应用进程能读取到 AZURE_OCR_API_KEY。\n修改环境变量后请重启应用。", "home_empty_title": "将文件拖放到这里", "home_empty_state_title": "开始新的转换", - "home_empty_subtitle": "或点击选择文件", - "home_supported_formats": "支持格式: docx, pptx, xlsx/xls, pdf, epub, html, txt, csv, json, xml, 图片, zip", - "home_queue_title": "已选择文件", - "home_queue_title_with_count": "待转换文件 ({count})", + "home_empty_subtitle": "添加文件或粘贴网页 URL", + "home_supported_formats": "支持的输入: docx, pptx, xlsx/xls, pdf, epub, html, txt, csv, json, xml, 图片, zip, 网页 URL", + "home_queue_title": "已选择输入", + "home_queue_title_with_count": "待转换项目 ({count})", "home_results_title": "转换完成!", "home_source_label": "源内容", "home_source_placeholder": "选择已转换文件以查看源内容预览。", @@ -264,6 +280,10 @@ "home_rendered_view_button": "渲染视图", "home_raw_view_button": "原始 Markdown", "home_add_files_button": "添加文件", + "home_add_url_button": "添加 URL", + "home_url_placeholder": "https://example.com/article", + "home_url_invalid_title": "URL 无效", + "home_url_invalid_message": "请输入以 http:// 或 https:// 开头的完整网页 URL。", "home_remove_selected_button": "移除所选", "home_clear_queue_button": "清空队列", "home_back_to_queue_button": "返回队列", @@ -328,13 +348,13 @@ "pause_button": "暂停", "resume_button": "恢复", "cancel_button": "取消", - "convert_files_button": "转换文件", + "convert_files_button": "开始转换", "output_save_all_in_one_checkbox": "将所有文件保存在一个文档中", "output_save_all_in_one_tooltip": "取消选中时,每个文件将单独保存", "copy_output_button": "复制输出", "save_output_button": "保存输出", - "no_files_to_convert_title": "无文件", - "no_files_to_convert_message": "请添加要转换的文件。", + "no_files_to_convert_title": "无输入", + "no_files_to_convert_message": "请至少添加一个文件或网页 URL。", "conversion_progress_format": "{progress}% - 正在处理: {file}", "conversion_complete_message": "转换完成", "conversion_partial_failure_title": "转换已完成,但存在错误", @@ -342,8 +362,10 @@ "conversion_backend_summary": "使用的后端: {details}", "conversion_backend_summary_item": "{label}: {count}", "conversion_backend_azure": "Azure OCR", + "conversion_backend_defuddle": "Defuddle", "conversion_backend_local": "本地 OCR", "conversion_backend_native": "原生提取", + "conversion_source_heading": "来源: {source}", "conversion_error_title": "错误", "save_combined_title": "保存合并的 Markdown 输出", "markdown_files_filter": "Markdown 文件 (*.md);;所有文件 (*)", @@ -401,8 +423,8 @@ # Error handling translations "conversion_in_progress_title": "转换进行中", "conversion_in_progress_message": "转换已在进行中。请等待完成或先取消当前转换。", - "no_valid_files_title": "无有效文件", - "no_valid_files_message": "未找到有效文件。请检查文件是否存在且可读取。", + "no_valid_files_title": "无有效输入", + "no_valid_files_message": "未找到可读取的文件或有效的网页 URL。", "settings_error_title": "设置错误", "settings_error_message": "加载格式设置时出错: {error}", "markitdown_config_error_title": "配置错误", diff --git a/tests/core/test_conversion.py b/tests/core/test_conversion.py index 348b9c6..b52e4b9 100644 --- a/tests/core/test_conversion.py +++ b/tests/core/test_conversion.py @@ -37,8 +37,8 @@ def conversion(monkeypatch): return importlib.reload(module) -def test_convert_file_uses_markitdown_when_ocr_disabled(monkeypatch, conversion): - calls = [] +def test_convert_file_uses_markitdown_when_ocr_disabled(monkeypatch, conversion): + calls = [] def fake_convert(file_path, options, use_docintel=False): calls.append((file_path, use_docintel)) @@ -51,12 +51,73 @@ def fake_convert(file_path, options, use_docintel=False): conversion.ConversionOptions(ocr_enabled=False), ) - assert result == "native text" - assert calls == [("scan.png", False)] - - -def test_convert_image_prefers_docintel_when_configured(monkeypatch, conversion): - calls = [] + assert result == "native text" + assert calls == [("scan.png", False)] + + +def test_convert_url_uses_defuddle_http_api(monkeypatch, conversion): + captured = {} + + class FakeResponse: + status_code = 200 + ok = True + text = "# Article\n" + + def fake_get(url, **kwargs): + captured["url"] = url + captured["kwargs"] = kwargs + return FakeResponse() + + monkeypatch.setattr(conversion.requests, "get", fake_get) + + outcome = conversion.convert_file_with_details("https://example.com/article") + + assert outcome.markdown == "# Article" + assert outcome.backend == conversion.BACKEND_DEFUDDLE + expected_url = conversion._build_defuddle_request_url("https://example.com/article") + assert captured["url"] == expected_url + assert captured["kwargs"]["timeout"] == conversion.DEFUDDLE_REQUEST_TIMEOUT_SECONDS + + +def test_build_defuddle_request_url_encodes_embedded_url(conversion): + request_url = conversion._build_defuddle_request_url( + "https://example.com/article?a=1&key=abc#intro" + ) + + assert ( + request_url + == "https://defuddle.md/https%3A%2F%2Fexample.com%2Farticle%3Fa%3D1%26key%3Dabc%23intro" + ) + + +def test_convert_url_surfaces_rate_limit(monkeypatch, conversion): + class FakeResponse: + status_code = 429 + ok = False + text = "Too many requests" + + monkeypatch.setattr(conversion.requests, "get", lambda *_args, **_kwargs: FakeResponse()) + + with pytest.raises(RuntimeError) as exc_info: + conversion.convert_file("https://example.com/article") + + assert "1,000 requests per month per IP" in str(exc_info.value) + + +def test_convert_url_surfaces_request_errors(monkeypatch, conversion): + def fake_get(*_args, **_kwargs): + raise conversion.requests.RequestException("network down") + + monkeypatch.setattr(conversion.requests, "get", fake_get) + + with pytest.raises(RuntimeError) as exc_info: + conversion.convert_file("https://example.com/article") + + assert "failed to reach the Defuddle service" in str(exc_info.value) + + +def test_convert_image_prefers_docintel_when_configured(monkeypatch, conversion): + calls = [] def fake_convert(file_path, options, use_docintel=False): calls.append(use_docintel) diff --git a/tests/core/test_input_sources.py b/tests/core/test_input_sources.py new file mode 100644 index 0000000..5000d29 --- /dev/null +++ b/tests/core/test_input_sources.py @@ -0,0 +1,28 @@ +from markitdowngui.core.input_sources import ( + is_web_url, + source_display_name, + source_output_stem, +) + + +def test_is_web_url_accepts_http_and_https(): + assert is_web_url("https://example.com/article") is True + assert is_web_url("http://example.com/article") is True + assert is_web_url("example.com/article") is False + assert is_web_url(r"C:\docs\article.html") is False + assert is_web_url("https://example.com/hello world") is False + assert is_web_url("https://example.com/hello\tworld") is False + + +def test_source_display_name_uses_basename_for_files(): + assert source_display_name(r"C:\docs\article.html") == "article.html" + + +def test_source_display_name_preserves_full_url(): + url = "https://example.com/posts/hello-world?ref=test" + assert source_display_name(url) == url + + +def test_source_output_stem_sanitizes_urls(): + stem = source_output_stem("https://example.com/posts/hello-world?ref=test") + assert stem == "example.com-hello-world" diff --git a/tests/utils/test_translations.py b/tests/utils/test_translations.py index 414b37a..fd0179d 100644 --- a/tests/utils/test_translations.py +++ b/tests/utils/test_translations.py @@ -42,6 +42,10 @@ def test_home_translation_keys_exist(): "home_raw_view_button", "home_copy_markdown_button", "home_save_markdown_button", + "home_add_url_button", + "home_url_placeholder", + "home_url_invalid_title", + "home_url_invalid_message", "home_preview_file_default", "home_preview_for_file", "home_save_mode_label", @@ -69,11 +73,18 @@ def test_home_translation_keys_exist(): "conversion_backend_summary", "conversion_backend_summary_item", "conversion_backend_azure", + "conversion_backend_defuddle", "conversion_backend_local", "conversion_backend_native", + "conversion_source_heading", + "help_open_defuddle_docs", "help_open_azure_ocr_pricing", "help_open_tesseract", "help_faq_title", + "help_faq_defuddle_question", + "help_faq_defuddle_answer", + "help_faq_defuddle_limits_question", + "help_faq_defuddle_limits_answer", "help_faq_tesseract_windows_question", "help_faq_tesseract_windows_answer", "help_faq_tesseract_macos_question",