From 53b06463f1c713dcd9017f711505601d86ebb5b2 Mon Sep 17 00:00:00 2001 From: chupei Date: Thu, 20 Nov 2025 10:50:08 +0800 Subject: [PATCH 1/5] feat: update code content_list --- llm_web_kit/extractor/html/recognizer/cccode.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/llm_web_kit/extractor/html/recognizer/cccode.py b/llm_web_kit/extractor/html/recognizer/cccode.py index 0baf436c..9c7aa8df 100644 --- a/llm_web_kit/extractor/html/recognizer/cccode.py +++ b/llm_web_kit/extractor/html/recognizer/cccode.py @@ -88,29 +88,29 @@ def recognize( @override def to_content_list_node(self, base_url:str, parsed_content: HtmlElement, raw_html_segment:str) -> dict: - """ - 把代码元素转换为content list node. + """把代码元素转换为content list node. 注意:此方法只处理块级代码(CC_CODE),行内代码(CC_CODE_INLIN + E)由TextParagraphRecognizer处理. + Args: base_url: parsed_content: HtmlElement对象 raw_html_segment: Returns: - """ d = { 'type': 'code', - # "bbox": [], - 'raw_content': raw_html_segment, - 'inline': parsed_content.get('inline', 'false') == 'true', + 'bbox': [], 'content': { 'code_content': parsed_content.text, }, } + # 可选字段:language if lang := parsed_content.get('language', None): d['content']['language'] = lang + # 可选字段:by(代码高亮工具) if by := parsed_content.get('by', None): d['content']['by'] = by From ecf739bedf265e5fe305e286838b812f9f2d3e04 Mon Sep 17 00:00:00 2001 From: chupei Date: Thu, 20 Nov 2025 11:19:42 +0800 Subject: [PATCH 2/5] feat: update math content_list --- .../extractor/html/recognizer/ccmath.py | 25 +++++++++++-------- .../extractor/html/recognizer/test_math.py | 2 +- .../extractor/test_extractor_chain.py | 1 - .../input/assets/content_json.json | 5 ++-- 4 files changed, 17 insertions(+), 16 deletions(-) diff --git a/llm_web_kit/extractor/html/recognizer/ccmath.py b/llm_web_kit/extractor/html/recognizer/ccmath.py index b83a2edc..ade75dcb 100644 --- a/llm_web_kit/extractor/html/recognizer/ccmath.py +++ b/llm_web_kit/extractor/html/recognizer/ccmath.py @@ -53,23 +53,26 @@ def recognize(self, base_url: str, main_html_lst: List[Tuple[HtmlElement, HtmlEl @override def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_html_segment: str) -> dict: - """将content转换成content_list_node. - 每种类型的html元素都有自己的content-list格式:参考 docs/specification/output_format/content_list_spec.md - 例如代码的返回格式: + """将content转换成content_list_node. 每种类型的html元素都有自己的content-list格式:参考 + docs/specification/output_format/content_list_spec.md. + + 返回格式示例: ```json { - "type": "equation-inline", # 数学公式类型,一共equation-inline和equation-interline两种 - "raw_content": "$u_{x_0}^{in}(x)$", + "type": "equation-interline", + "bbox": [x1, y1, x2, y2], "content": { - "math_content": "u_{x_0}^{in}(x)", + "math_content": "a^2 + b^2 = c^2", "math_type": "latex", "by": "mathjax" } } - ``` + ``` - Args: - content: str: 要转换的content + Args: + base_url: 基础URL + parsed_content: 解析后的HtmlElement对象 + raw_html_segment: 原始HTML片段 Returns: dict: content_list_node @@ -86,7 +89,7 @@ def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_h math_content = self.cm.wrap_math_md(math_content) return { 'type': DocElementType.EQUATION_INTERLINE, - 'raw_content': raw_html_segment, + 'bbox': [], 'content': { 'math_content': math_content, 'math_type': inter_ele[0].get('type'), # 数学语言类型 @@ -97,7 +100,7 @@ def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_h math_content = in_els[0].text return { 'type': DocElementType.EQUATION_INLINE, - 'raw_content': raw_html_segment, + 'bbox': [], 'content': { 'math_content': math_content, 'math_type': in_els[0].get('type'), # 数学语言类型 diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_math.py b/tests/llm_web_kit/extractor/html/recognizer/test_math.py index 9a9af500..20572874 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_math.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_math.py @@ -287,7 +287,7 @@ ), 'expected': { 'type': 'equation-interline', - 'raw_content': '$$h \\approx {{GM} \\over c^2} \\times {1 \\over r} \\times {v^2 \\over c^2}$$', + 'bbox': [], 'content': { 'math_content': 'h \\approx {{GM} \\over c^2} \\times {1 \\over r} \\times {v^2 \\over c^2}', 'math_type': 'latex', diff --git a/tests/llm_web_kit/extractor/test_extractor_chain.py b/tests/llm_web_kit/extractor/test_extractor_chain.py index 0981c9ca..29f00da6 100644 --- a/tests/llm_web_kit/extractor/test_extractor_chain.py +++ b/tests/llm_web_kit/extractor/test_extractor_chain.py @@ -145,7 +145,6 @@ def test_html_pipeline(self): self.assertEqual(html_content['type'], DocElementType.CODE) self.assertEqual(len(html_content['content']['code_content']), 251) self.assertEqual(html_content['content']['by'], 'tag_pre_code') - self.assertEqual(html_content['inline'], False) # 有序列表 html_content = html_content_list[10] diff --git a/tests/llm_web_kit/input/assets/content_json.json b/tests/llm_web_kit/input/assets/content_json.json index 34236da4..672c3fc8 100644 --- a/tests/llm_web_kit/input/assets/content_json.json +++ b/tests/llm_web_kit/input/assets/content_json.json @@ -69,8 +69,7 @@ }, { "type": "code", - "raw_content": "
\n
Private Sub sitemenu_ItemCreated(ByVal sender As Object, ByVal e As Telerik.Web.UI.RadMenuEventArgs) Handles sitemenu.ItemCreated\n
\n
        Dim item As RadMenuItem = sitemenu.FindItemByUrl(Request.Url.PathAndQuery)\n
\n
        If item IsNot Nothing Then\n
\n
            item.CssClass = \"focused\"\n
\n
        End If\n
\n
 
\n
    End Sub\n
\n
\n\n", - "inline": false, + "bbox": [], "content": { "code_content": "Private Sub sitemenu_ItemCreated(ByVal sender As Object, ByVal e As Telerik.Web.UI.RadMenuEventArgs) Handles sitemenu.ItemCreated\n Dim item As RadMenuItem = sitemenu.FindItemByUrl(Request.Url.PathAndQuery)\n If item IsNot Nothing Then\n item.CssClass = \"focused\"\n End If\n\n End Sub", "by": "tag_code" @@ -164,7 +163,7 @@ }, { "type": "equation-interline", - "raw_content": "

$$a^2 + b^2 = c^2$$

", + "bbox": [], "content": { "math_content": "a^2 + b^2 = c^2", "math_type": "latex", From e940e7ba467e662575e161cbfe983ccd2cd1ee3b Mon Sep 17 00:00:00 2001 From: chupei Date: Thu, 20 Nov 2025 11:22:59 +0800 Subject: [PATCH 3/5] x --- llm_web_kit/extractor/html/recognizer/ccmath.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm_web_kit/extractor/html/recognizer/ccmath.py b/llm_web_kit/extractor/html/recognizer/ccmath.py index ade75dcb..32884087 100644 --- a/llm_web_kit/extractor/html/recognizer/ccmath.py +++ b/llm_web_kit/extractor/html/recognizer/ccmath.py @@ -60,7 +60,7 @@ def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_h ```json { "type": "equation-interline", - "bbox": [x1, y1, x2, y2], + "bbox": [], "content": { "math_content": "a^2 + b^2 = c^2", "math_type": "latex", From aac019de977061e27ab8186bb8a3b051287b8fbb Mon Sep 17 00:00:00 2001 From: chupei Date: Thu, 20 Nov 2025 14:06:50 +0800 Subject: [PATCH 4/5] x --- llm_web_kit/input/datajson.py | 16 ---------------- .../extractor/html/recognizer/test_text.py | 3 ++- .../extractor/test_extractor_chain.py | 4 ---- 3 files changed, 2 insertions(+), 21 deletions(-) diff --git a/llm_web_kit/input/datajson.py b/llm_web_kit/input/datajson.py index 56ed8272..32d662bd 100644 --- a/llm_web_kit/input/datajson.py +++ b/llm_web_kit/input/datajson.py @@ -152,22 +152,6 @@ def to_mm_md(self, exclude_nodes=[], exclude_inline_types=[], use_raw_image_url= md = self.__to_md(exclude_nodes, exclude_inline_types, use_raw_image_url) return md - def to_main_html(self) -> str: - """拼接和每个content_list_node对应的html内容,返回一个完整的html文档. - - Args: - content_lst_node (dict): content_list里定义的每种元素块 - Returns: - str: html格式 - """ - content_lst = self._get_data() - html = '' - for page in content_lst: - for content_lst_node in page: - raw_html = content_lst_node['raw_content'] - html += raw_html - return html - def to_json(self, pretty=False) -> str: content_lst = self._get_data() if pretty: diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_text.py b/tests/llm_web_kit/extractor/html/recognizer/test_text.py index 0c0f8db2..7f2cd7a2 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_text.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_text.py @@ -331,7 +331,8 @@ def test_interactive_element(self): } input_data = DataJson(test_data) result = chain.extract(input_data) - main_html = result.get_content_list().to_main_html() + # 验证 main_html 中没有交互元素 + main_html = result.get('main_html', '') assert ' Date: Thu, 20 Nov 2025 14:10:45 +0800 Subject: [PATCH 5/5] x --- tests/llm_web_kit/extractor/html/recognizer/test_text.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_text.py b/tests/llm_web_kit/extractor/html/recognizer/test_text.py index 7f2cd7a2..e3c6119f 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_text.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_text.py @@ -332,7 +332,7 @@ def test_interactive_element(self): input_data = DataJson(test_data) result = chain.extract(input_data) # 验证 main_html 中没有交互元素 - main_html = result.get('main_html', '') + main_html = result.get('main_html') assert '