1 change: 1 addition & 0 deletions .gitignore
@@ -216,3 +216,4 @@ scripts/oneclick/.env.example
__pycache__/
*.pyc
*.pyo
run.sh
64 changes: 64 additions & 0 deletions config/custom/omnidocbench_qwen_mllm.yaml
@@ -0,0 +1,64 @@
api_version: gage/v1alpha1
kind: PipelineConfig
metadata:
  name: omnidocbench_qwen_mllm
  description: A multi-modal evaluation for OmniDocBench using a Qwen MLLM.

custom:
  steps:
    - step: inference
    - step: auto_eval

# 1. Dataset Configuration
datasets:
  - dataset_id: omnidocbench_val
    loader: jsonl
    params:
      path: /mnt/sdb1/ywt/OmniDocBench-main/OmniDocBench1_5/omnidocbench15_gage_r.jsonl
      preprocess: omnidoc_image_standardizer
      preprocess_kwargs:
        question_field: prompt
        content_field: image
        content_root: /mnt/sdb1/ywt/OmniDocBench-main/OmniDocBench1_5/images
      doc_to_visual: gage_eval.assets.datasets.utils.multimodal:embed_local_message_images
      # doc_to_visual_kwargs automatically inherits fields of the same name from preprocess_kwargs

# 2. Backend Configuration
backends:
  - backend_id: omnidocbench_qwen_mllm_backend
    type: litellm
    config:
      provider: openai
      api_base: http://127.0.0.1:8685/v1  # replace with the actual Vision-LLM service address
      model: Qwen/Qwen3-Omni-30B-A3B-Instruct
      generation_parameters:
        max_new_tokens: 4096  # keep outputs short to limit memory use and latency
        temperature: 0.1
      async_max_concurrency: 2  # light concurrency for local MacBook runs

# 3. Role Adapter Configuration
role_adapters:
  - adapter_id: omnidocbench_qwen_vl
    role_type: dut_model
    backend_id: omnidocbench_qwen_mllm_backend
    # Capability must match what the backend/adapter supports for multi-modal input
    capabilities:
      - vision_chat

# 4. Metric Configuration
metrics:
  - metric_id: omnidocbench_all_metric
    # This implementation needs to be created as per test-1113.md
    implementation: gage_eval.metrics.builtin.ominidoc_all_metric:OmniDocBenchMetric
    # implementation: OmniDocBenchMetric
    aggregation: omnidoclazycalc

# 5. Task Configuration
tasks:
  - task_id: doc_parsing_eval
    dataset_id: omnidocbench_val
    max_samples: 20  # quick smoke run on a MacBook; can be overridden via CLI
    reporting:
      sinks:
        - type: console
        - type: file
          params:
            output_path: ${GAGE_EVAL_SAVE_DIR:-./runs}/omnidocbench_events.jsonl
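
For orientation, this is what a single record in the JSONL file above is assumed to look like, given `question_field: prompt` and `content_field: image` in `preprocess_kwargs`; the real OmniDocBench export likely carries additional ground-truth fields. A minimal sketch:

```python
# make_smoke_record.py: writes one illustrative record (hypothetical schema,
# inferred from the question_field/content_field mapping in the config above).
import json

record = {
    "prompt": "Parse this document page into Markdown.",  # question_field
    "image": "page_0001.jpg",  # content_field, resolved against content_root
}

with open("omnidocbench15_gage_r.jsonl", "a", encoding="utf-8") as fh:
    fh.write(json.dumps(record, ensure_ascii=False) + "\n")
```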
2 changes: 2 additions & 0 deletions src/gage_eval/assets/datasets/loaders/jsonl_loader.py
@@ -54,6 +54,8 @@ def load(self, hub_handle: Optional[DatasetHubHandle], *, trace=None) -> DataSou
        doc_to_text = resolve_doc_to_callable(self.spec, "doc_to_text")
        doc_to_visual = resolve_doc_to_callable(self.spec, "doc_to_visual")
        doc_to_audio = resolve_doc_to_callable(self.spec, "doc_to_audio")
        # apply_preprocess requires registered preprocessors, but importing
        # preprocessors.builtin at module load time fails because of a circular
        # registry import, so the import is deferred to call time.
        from gage_eval.assets.datasets import preprocessors  # noqa: F401
        records = apply_preprocess(
            raw_records,
            self.spec,
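
The deferred import added above works around a circular import between the loader and the registry-populating builtin module. A self-contained sketch of the pattern, with hypothetical module names standing in for the real gage_eval layout:

```python
# circular_import_demo.py: why the import happens inside load() rather than
# at module top level. Module names here are illustrative.
import pathlib
import sys
import tempfile
import textwrap

pkg = pathlib.Path(tempfile.mkdtemp())

# loader.py defines a helper that builtin.py needs, and imports builtin
# lazily, inside the function that needs the registry populated.
(pkg / "loader.py").write_text(textwrap.dedent("""
    HELPER = "shared by builtin"

    def load():
        import builtin  # deferred: loader is fully initialized by now
        return builtin.REGISTERED
"""))

# builtin.py imports loader at the top: the back-edge of the cycle.
(pkg / "builtin.py").write_text(textwrap.dedent("""
    import loader
    REGISTERED = ["omnidoc_image_standardizer", loader.HELPER]
"""))

sys.path.insert(0, str(pkg))
import loader

# Works. If loader.py instead imported builtin at the top, builtin's
# loader.HELPER lookup would hit a partially initialized loader module.
print(loader.load())
```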
3 changes: 2 additions & 1 deletion src/gage_eval/assets/datasets/preprocessors/__init__.py
@@ -1,3 +1,4 @@
"""Preprocessor utilities."""
from . import builtin

__all__ = []
__all__ = ["builtin"]
9 changes: 9 additions & 0 deletions src/gage_eval/assets/datasets/preprocessors/builtin.py
@@ -6,6 +6,7 @@
from gage_eval.assets.datasets.preprocessors.default_preprocessor import DefaultPreprocessor
from gage_eval.assets.datasets.preprocessors.multi_choice_preprocessor import MultiChoicePreprocessor as NewMultiChoice
from gage_eval.assets.datasets.preprocessors.docvqa_preprocessor import DocVQAPreprocessor as NewDocVQA
from gage_eval.assets.datasets.preprocessors.omnidoc_preprocessor import OmniDocPreprocessor as NewOmniDoc
from gage_eval.assets.datasets.preprocessors.mathvista_preprocessor import (
    MathVistaPreprocessor as NewMathVista,
    MathVistaStructOnlyPreprocessor as NewMathVistaStructOnly,
@@ -72,6 +73,14 @@ class MultiChoicePreprocessor(NewMultiChoice):
class DocVQAPreprocessor(NewDocVQA):
    pass

@registry.asset(
    "dataset_preprocessors",
    "omnidoc_image_standardizer",
    desc="OmniDocBench multimodal preprocessor (new)",
    tags=("prompt", "vision", "omnidoc"),
)
class OmniDocPreprocessor(NewOmniDoc):
    pass

@registry.asset(
    "dataset_preprocessors",
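
The string passed to `registry.asset` is what the YAML config references via `preprocess: omnidoc_image_standardizer`. A minimal sketch of that name-to-class resolution, with hypothetical names in place of the real registry API:

```python
# registry_sketch.py: illustrative only; the real gage_eval registry differs,
# but the string-to-class mapping works along these lines.
from typing import Dict, Type

_ASSETS: Dict[str, Type] = {}

def asset(kind: str, name: str, **meta):
    """Register a class under (kind, name) so configs can reference it by string."""
    def deco(cls: Type) -> Type:
        _ASSETS[f"{kind}/{name}"] = cls
        return cls
    return deco

@asset("dataset_preprocessors", "omnidoc_image_standardizer")
class OmniDocPreprocessor:
    pass

# A config value like `preprocess: omnidoc_image_standardizer` resolves here:
cls = _ASSETS["dataset_preprocessors/omnidoc_image_standardizer"]
print(cls.__name__)  # OmniDocPreprocessor
```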
121 changes: 121 additions & 0 deletions src/gage_eval/assets/datasets/preprocessors/omnidoc_preprocessor.py
@@ -0,0 +1,121 @@
"""Class-based DocVQA preprocessor (new implementation)."""

from __future__ import annotations

from pathlib import Path
from typing import Any, Dict, List
import base64
import mimetypes

from gage_eval.assets.datasets.utils.multimodal import (
    _derive_root,
    collect_content_fragments,
    embed_local_image_as_data_url,
)
from gage_eval.assets.datasets.preprocessors.base import BasePreprocessor
from gage_eval.assets.datasets.utils.mapping import extract_field
from gage_eval.assets.datasets.utils.normalization import ensure_chat_template_flags

def encode_image_to_data_uri(image_path: str) -> str:
    """Read a local image file and return it as a base64 data URI."""
    mime_type, _ = mimetypes.guess_type(image_path)
    if mime_type is None:
        mime_type = "image/jpeg"

    with open(image_path, "rb") as image_file:
        base64_data = base64.b64encode(image_file.read()).decode("utf-8")

    return f"data:{mime_type};base64,{base64_data}"

class OmniDocPreprocessor(BasePreprocessor):
    """Normalize OmniDocBench samples with text + image content."""

    def to_sample(
        self,
        record: Dict[str, Any],
        *,
        question_field: str = "question",
        content_field: str = "image",
        content_root: str | None = None,
        data_path: str | None = None,
        system_prompt: str | None = None,
        instruction: str | None = None,
        **kwargs: Any,
    ) -> Dict[str, Any]:

        sample = dict(record)
        question = extract_field(sample, question_field)
        if question is None:
            raise ValueError(f"OmniDoc sample missing question field '{question_field}'")

        # 1. Resolve Root Path. Record data_path in the dataset metadata so
        # that _derive_root can fall back to it.
        if data_path:
            sample.setdefault("_dataset_metadata", {})["path"] = data_path

        resolved_root = _derive_root(sample, content_root)
        if resolved_root and isinstance(resolved_root, str) and not resolved_root.startswith(("http://", "https://", "data:")):
            try:
                resolved_root = str(Path(resolved_root).expanduser().resolve())
            except Exception:
                resolved_root = str(Path(resolved_root).expanduser())

        # 2. Construct Content
        text_content = str(question).strip()
        if instruction:
            text_content = f"{text_content}\n\n{instruction.strip()}"

        user_content_parts = [{"type": "text", "text": text_content}]

        # 3. Embed Local Images. embed_local_image_as_data_url is expected to
        # rewrite sample[content_field] into a base64 data URL in place, so its
        # return value is not needed here.
        fragments = collect_content_fragments(sample, content_field=content_field, content_root=resolved_root)

        embed_local_image_as_data_url(
            sample,
            image_field=content_field,
            strict=False,
            cache_dir=None,
            content_root=content_root,
        )

        # 4. Build Messages
        messages: List[Dict[str, Any]] = []
        if system_prompt:
            messages.append({"role": "system", "content": [{"type": "text", "text": system_prompt.strip()}]})

        visual_fragments = [{"type": "image_url", "image_url": sample[content_field]}]
        user_content_parts.extend(visual_fragments)
        messages.append({"role": "user", "content": user_content_parts})

sample["messages"] = messages
sample["prompt"] = question
sample["chat_template_mode"] = "preprocess"
sample["rendered_by"] = "preprocess"
sample["template_source"] = "manual"
sample["cache_suffix"] = "-converted"

ensure_chat_template_flags(sample)

        # 5. Finalize Metadata
        final_image_name = fragments
        metadata = dict(sample.get("metadata") or {})
        metadata.pop("image_root", None)
        metadata.update({
            "question_field": question_field,
            "content_field": content_field,
        })
        if final_image_name:
            metadata["image_name"] = final_image_name
        if resolved_root:
            metadata["content_root"] = resolved_root

        sample["metadata"] = metadata
        sample["inputs"] = sample.get("inputs") or {"prompt": question}
        return sample


__all__ = ["OmniDocPreprocessor"]
1 change: 0 additions & 1 deletion src/gage_eval/evaluation/sample_loop.py
@@ -123,7 +123,6 @@ def run(self, planner: TaskPlanner, role_manager: RoleManager, trace: Observabil
            daemon=True,
        )
        producer.start()

        if ff_mode:
            self._run_fire_and_forget(
                sample_queue,