From 53b06463f1c713dcd9017f711505601d86ebb5b2 Mon Sep 17 00:00:00 2001
From: chupei <njuchupei@gmail.com>
Date: Thu, 20 Nov 2025 10:50:08 +0800
Subject: [PATCH 1/5] feat: update code content_list

---
 llm_web_kit/extractor/html/recognizer/cccode.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/llm_web_kit/extractor/html/recognizer/cccode.py b/llm_web_kit/extractor/html/recognizer/cccode.py
index 0baf436c..9c7aa8df 100644
--- a/llm_web_kit/extractor/html/recognizer/cccode.py
+++ b/llm_web_kit/extractor/html/recognizer/cccode.py
@@ -88,29 +88,29 @@ def recognize(
 
     @override
     def to_content_list_node(self, base_url:str, parsed_content: HtmlElement, raw_html_segment:str) -> dict:
-        """
-        把代码元素转换为content list node.
+        """把代码元素转换为content list node. 注意：此方法只处理块级代码(CC_CODE)，
+        行内代码(CC_CODE_INLINE)由TextParagraphRecognizer处理.
+
         Args:
             base_url:
             parsed_content: HtmlElement对象
             raw_html_segment:
 
         Returns:
-
         """
         d = {
             'type': 'code',
-            # "bbox": [],
-            'raw_content': raw_html_segment,
-            'inline': parsed_content.get('inline', 'false') == 'true',
+            'bbox': [],
             'content': {
                 'code_content': parsed_content.text,
             },
         }
 
+        # 可选字段：language
         if lang := parsed_content.get('language', None):
             d['content']['language'] = lang
 
+        # 可选字段：by（代码高亮工具）
         if by := parsed_content.get('by', None):
             d['content']['by'] = by
 

From ecf739bedf265e5fe305e286838b812f9f2d3e04 Mon Sep 17 00:00:00 2001
From: chupei <njuchupei@gmail.com>
Date: Thu, 20 Nov 2025 11:19:42 +0800
Subject: [PATCH 2/5] feat: update math content_list

---
 .../extractor/html/recognizer/ccmath.py       | 25 +++++++++++--------
 .../extractor/html/recognizer/test_math.py    |  2 +-
 .../extractor/test_extractor_chain.py         |  1 -
 .../input/assets/content_json.json            |  5 ++--
 4 files changed, 17 insertions(+), 16 deletions(-)

diff --git a/llm_web_kit/extractor/html/recognizer/ccmath.py b/llm_web_kit/extractor/html/recognizer/ccmath.py
index b83a2edc..ade75dcb 100644
--- a/llm_web_kit/extractor/html/recognizer/ccmath.py
+++ b/llm_web_kit/extractor/html/recognizer/ccmath.py
@@ -53,23 +53,26 @@ def recognize(self, base_url: str, main_html_lst: List[Tuple[HtmlElement, HtmlEl
 
     @override
     def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_html_segment: str) -> dict:
-        """将content转换成content_list_node.
-        每种类型的html元素都有自己的content-list格式：参考 docs/specification/output_format/content_list_spec.md
-        例如代码的返回格式：
+        """将content转换成content_list_node. 每种类型的html元素都有自己的content-list格式：参考
+        docs/specification/output_format/content_list_spec.md.
+
+        返回格式示例：
         ```json
             {
-                "type": "equation-inline", # 数学公式类型，一共equation-inline和equation-interline两种
-                "raw_content": "<ccmath type="latex" by="mathjax">$u_{x_0}^{in}(x)$</ccmath>",
+                "type": "equation-interline",
+                "bbox": [x1, y1, x2, y2],
                 "content": {
-                    "math_content": "u_{x_0}^{in}(x)",
+                    "math_content": "a^2 + b^2 = c^2",
                     "math_type": "latex",
                     "by": "mathjax"
                 }
             }
-            ```
+        ```
 
-            Args:
-                content: str: 要转换的content
+        Args:
+            base_url: 基础URL
+            parsed_content: 解析后的HtmlElement对象
+            raw_html_segment: 原始HTML片段
 
         Returns:
             dict: content_list_node
@@ -86,7 +89,7 @@ def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_h
             math_content = self.cm.wrap_math_md(math_content)
             return {
                 'type': DocElementType.EQUATION_INTERLINE,
-                'raw_content': raw_html_segment,
+                'bbox': [],
                 'content': {
                     'math_content': math_content,
                     'math_type': inter_ele[0].get('type'),  # 数学语言类型
@@ -97,7 +100,7 @@ def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_h
             math_content = in_els[0].text
             return {
                 'type': DocElementType.EQUATION_INLINE,
-                'raw_content': raw_html_segment,
+                'bbox': [],
                 'content': {
                     'math_content': math_content,
                     'math_type': in_els[0].get('type'),  # 数学语言类型
diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_math.py b/tests/llm_web_kit/extractor/html/recognizer/test_math.py
index 9a9af500..20572874 100644
--- a/tests/llm_web_kit/extractor/html/recognizer/test_math.py
+++ b/tests/llm_web_kit/extractor/html/recognizer/test_math.py
@@ -287,7 +287,7 @@
         ),
         'expected': {
             'type': 'equation-interline',
-            'raw_content': '<span class="math-container">$$h \\approx {{GM} \\over c^2} \\times {1 \\over r} \\times {v^2 \\over c^2}$$</span>',
+            'bbox': [],
             'content': {
                 'math_content': 'h \\approx {{GM} \\over c^2} \\times {1 \\over r} \\times {v^2 \\over c^2}',
                 'math_type': 'latex',
diff --git a/tests/llm_web_kit/extractor/test_extractor_chain.py b/tests/llm_web_kit/extractor/test_extractor_chain.py
index 0981c9ca..29f00da6 100644
--- a/tests/llm_web_kit/extractor/test_extractor_chain.py
+++ b/tests/llm_web_kit/extractor/test_extractor_chain.py
@@ -145,7 +145,6 @@ def test_html_pipeline(self):
         self.assertEqual(html_content['type'], DocElementType.CODE)
         self.assertEqual(len(html_content['content']['code_content']), 251)
         self.assertEqual(html_content['content']['by'], 'tag_pre_code')
-        self.assertEqual(html_content['inline'], False)
 
         # 有序列表
         html_content = html_content_list[10]
diff --git a/tests/llm_web_kit/input/assets/content_json.json b/tests/llm_web_kit/input/assets/content_json.json
index 34236da4..672c3fc8 100644
--- a/tests/llm_web_kit/input/assets/content_json.json
+++ b/tests/llm_web_kit/input/assets/content_json.json
@@ -69,8 +69,7 @@
       },
       {
         "type": "code",
-        "raw_content": "<div class=\"tFormatCodeBlock supportThreadCodeBlock\" style=\"border: 1px  solid  #7f9db9;overflow-y: auto;\" id=\"highlighter_46368\">\n    <div style=\"background-color: #ffffff;\"><span style=\"margin-left: 0px !important;\"><code style=\"color: #000000;\">Private Sub sitemenu_ItemCreated(ByVal sender As Object, ByVal e As Telerik.Web.UI.RadMenuEventArgs) Handles sitemenu.ItemCreated</code></span>\n    </div>\n    <div style=\"background-color: #f8f8f8;\"><span><code>        </code><span style=\"margin-left: 24px !important;\"><code style=\"color: #000000;\">Dim item As RadMenuItem = sitemenu.FindItemByUrl(Request.Url.PathAndQuery)</code></span></span>\n    </div>\n    <div style=\"background-color: #ffffff;\"><span><code>        </code><span style=\"margin-left: 24px !important;\"><code style=\"color: #000000;\">If item IsNot Nothing Then</code></span></span>\n    </div>\n    <div style=\"background-color: #f8f8f8;\"><span><code>            </code><span style=\"margin-left: 36px !important;\"><code style=\"color: #000000;\">item.CssClass = \"focused\"</code></span></span>\n    </div>\n    <div style=\"background-color: #ffffff;\"><span><code>        </code><span style=\"margin-left: 24px !important;\"><code style=\"color: #000000;\">End If</code></span></span>\n    </div>\n    <div style=\"background-color: #f8f8f8;\"><span style=\"margin-left: 0px !important;\"> </span></div>\n    <div style=\"background-color: #ffffff;\"><span><code>    </code><span style=\"margin-left: 12px !important;\"><code style=\"color: #000000;\">End Sub</code></span></span>\n    </div>\n</div>\n\n",
-        "inline": false,
+        "bbox": [],
         "content": {
           "code_content": "Private Sub sitemenu_ItemCreated(ByVal sender As Object, ByVal e As Telerik.Web.UI.RadMenuEventArgs) Handles sitemenu.ItemCreated\n        Dim item As RadMenuItem = sitemenu.FindItemByUrl(Request.Url.PathAndQuery)\n        If item IsNot Nothing Then\n            item.CssClass = \"focused\"\n        End If\n\n    End Sub",
           "by": "tag_code"
@@ -164,7 +163,7 @@
       },
       {
         "type": "equation-interline",
-        "raw_content": "<p>$$a^2 + b^2 = c^2$$</p>",
+        "bbox": [],
         "content": {
           "math_content": "a^2 + b^2 = c^2",
           "math_type": "latex",

From e940e7ba467e662575e161cbfe983ccd2cd1ee3b Mon Sep 17 00:00:00 2001
From: chupei <njuchupei@gmail.com>
Date: Thu, 20 Nov 2025 11:22:59 +0800
Subject: [PATCH 3/5] docs: show empty bbox in math content_list docstring example

---
 llm_web_kit/extractor/html/recognizer/ccmath.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llm_web_kit/extractor/html/recognizer/ccmath.py b/llm_web_kit/extractor/html/recognizer/ccmath.py
index ade75dcb..32884087 100644
--- a/llm_web_kit/extractor/html/recognizer/ccmath.py
+++ b/llm_web_kit/extractor/html/recognizer/ccmath.py
@@ -60,7 +60,7 @@ def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_h
         ```json
             {
                 "type": "equation-interline",
-                "bbox": [x1, y1, x2, y2],
+                "bbox": [],
                 "content": {
                     "math_content": "a^2 + b^2 = c^2",
                     "math_type": "latex",

From aac019de977061e27ab8186bb8a3b051287b8fbb Mon Sep 17 00:00:00 2001
From: chupei <njuchupei@gmail.com>
Date: Thu, 20 Nov 2025 14:06:50 +0800
Subject: [PATCH 4/5] refactor: remove DataJson.to_main_html and update tests

---
 llm_web_kit/input/datajson.py                    | 16 ----------------
 .../extractor/html/recognizer/test_text.py       |  3 ++-
 .../extractor/test_extractor_chain.py            |  4 ----
 3 files changed, 2 insertions(+), 21 deletions(-)

diff --git a/llm_web_kit/input/datajson.py b/llm_web_kit/input/datajson.py
index 56ed8272..32d662bd 100644
--- a/llm_web_kit/input/datajson.py
+++ b/llm_web_kit/input/datajson.py
@@ -152,22 +152,6 @@ def to_mm_md(self, exclude_nodes=[], exclude_inline_types=[], use_raw_image_url=
         md = self.__to_md(exclude_nodes, exclude_inline_types, use_raw_image_url)
         return md
 
-    def to_main_html(self) -> str:
-        """拼接和每个content_list_node对应的html内容，返回一个完整的html文档.
-
-        Args:
-            content_lst_node (dict): content_list里定义的每种元素块
-        Returns:
-            str: html格式
-        """
-        content_lst = self._get_data()
-        html = ''
-        for page in content_lst:
-            for content_lst_node in page:
-                raw_html = content_lst_node['raw_content']
-                html += raw_html
-        return html
-
     def to_json(self, pretty=False) -> str:
         content_lst = self._get_data()
         if pretty:
diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_text.py b/tests/llm_web_kit/extractor/html/recognizer/test_text.py
index 0c0f8db2..7f2cd7a2 100644
--- a/tests/llm_web_kit/extractor/html/recognizer/test_text.py
+++ b/tests/llm_web_kit/extractor/html/recognizer/test_text.py
@@ -331,7 +331,8 @@ def test_interactive_element(self):
         }
         input_data = DataJson(test_data)
         result = chain.extract(input_data)
-        main_html = result.get_content_list().to_main_html()
+        # 验证 main_html 中没有交互元素
+        main_html = result.get('main_html', '')
         assert '<input' not in main_html
 
     def test_normalize_space1(self):
diff --git a/tests/llm_web_kit/extractor/test_extractor_chain.py b/tests/llm_web_kit/extractor/test_extractor_chain.py
index 29f00da6..7ae20fc2 100644
--- a/tests/llm_web_kit/extractor/test_extractor_chain.py
+++ b/tests/llm_web_kit/extractor/test_extractor_chain.py
@@ -175,10 +175,6 @@ def test_html_pipeline(self):
         self.assertNotEqual(md_content[-2], '\n')
         self.assertEqual(md_content[-1], '\n')
 
-        # main_html
-        main_html = result.get_content_list().to_main_html()  # 获取main_html内容
-        self.assertEqual(main_html, self.main_html_expected_content)  # 如果遇到嵌套的html, 则返回原始html的时候还是应当拼接替换一下 TODO
-
     def test_html_pipeline_suit_2(self):
         """测试第二个数据：这个数据会丢失一些文本信息."""
         chain = ExtractSimpleFactory.create(self.config)

From b0b0600cffe5eb95e7a285ba066bfd1de31ec298 Mon Sep 17 00:00:00 2001
From: chupei <njuchupei@gmail.com>
Date: Thu, 20 Nov 2025 14:10:45 +0800
Subject: [PATCH 5/5] test: read main_html without default in test_interactive_element

---
 tests/llm_web_kit/extractor/html/recognizer/test_text.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_text.py b/tests/llm_web_kit/extractor/html/recognizer/test_text.py
index 7f2cd7a2..e3c6119f 100644
--- a/tests/llm_web_kit/extractor/html/recognizer/test_text.py
+++ b/tests/llm_web_kit/extractor/html/recognizer/test_text.py
@@ -332,7 +332,7 @@ def test_interactive_element(self):
         input_data = DataJson(test_data)
         result = chain.extract(input_data)
         # 验证 main_html 中没有交互元素
-        main_html = result.get('main_html', '')
+        main_html = result.get('main_html')
         assert '<input' not in main_html
 
     def test_normalize_space1(self):