From a6ce74c9fba3a7596793df62b82b392903419831 Mon Sep 17 00:00:00 2001 From: Yanggq <1041206149@qq.com> Date: Fri, 10 Oct 2025 16:07:02 +0800 Subject: [PATCH 1/4] =?UTF-8?q?fix:=20=E5=87=8F=E5=B0=91=E5=B0=86=E5=85=AC?= =?UTF-8?q?=E5=BC=8F'$'=E9=94=99=E8=AF=AF=E8=AF=86=E5=88=AB=E4=B8=BA?= =?UTF-8?q?=E8=B4=A7=E5=B8=81'$'=E7=9A=84=E6=83=85=E5=86=B5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../extractor/html/recognizer/ccmath.py | 19 +++++------ llm_web_kit/libs/html_utils.py | 34 ++++++++++++++----- 2 files changed, 34 insertions(+), 19 deletions(-) diff --git a/llm_web_kit/extractor/html/recognizer/ccmath.py b/llm_web_kit/extractor/html/recognizer/ccmath.py index 780ba583..717573c2 100644 --- a/llm_web_kit/extractor/html/recognizer/ccmath.py +++ b/llm_web_kit/extractor/html/recognizer/ccmath.py @@ -5,8 +5,7 @@ from llm_web_kit.exception.exception import ( HtmlMathMathjaxRenderRecognizerException, HtmlMathRecognizerException) -from llm_web_kit.extractor.html.recognizer.cc_math import (tag_common_modify, - tag_img, tag_math, +from llm_web_kit.extractor.html.recognizer.cc_math import (tag_img, tag_math, tag_mjx, tag_script) from llm_web_kit.extractor.html.recognizer.cc_math.common import (CCMATH, CSDN, ZHIHU) @@ -140,14 +139,14 @@ def process_ccmath_html(self, cc_html: str, o_html: str, math_render: BaseMathRe tag_script.process_zhihu_custom_tag(self.cm, math_render_type, node) # tag = span, class 为 math-containerm, 或者 mathjax 或者 wp-katex-eq - if node.tag == 'span' and node.get('class') and ( - 'math-container' in node.get('class') or - 'mathjax' in node.get('class') or - 'wp-katex-eq' in node.get('class') or - 'x-ck12-mathEditor' in node.get('class') or - 'tex' in node.get('class') - ): - tag_common_modify.modify_tree(self.cm, math_render_type, original_html, node, parent) + # if node.tag == 'span' and node.get('class') and ( + # 'math-container' in node.get('class') or + # 'mathjax' in node.get('class') or + # 'wp-katex-eq' in node.get('class') or + # 'x-ck12-mathEditor' in node.get('class') or + # 'tex' in node.get('class') + # ): + # tag_common_modify.modify_tree(self.cm, math_render_type, original_html, node, parent) # math tags if node.tag == 'math' or node.tag.endswith(':math'): diff --git a/llm_web_kit/libs/html_utils.py b/llm_web_kit/libs/html_utils.py index d1d0a648..229ab66e 100644 --- a/llm_web_kit/libs/html_utils.py +++ b/llm_web_kit/libs/html_utils.py @@ -569,26 +569,42 @@ def groups(self): def optimized_dollar_matching(text): - """美元金额匹配.""" - # 用于存储需要修改的位置和替换内容 + """美元金额匹配,避免误判数学公式.""" replacements = [] pattern = r'(? Date: Fri, 10 Oct 2025 16:26:52 +0800 Subject: [PATCH 2/4] x --- tests/llm_web_kit/input/assets/content_json.json | 2 +- tests/llm_web_kit/input/test_datajson.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/llm_web_kit/input/assets/content_json.json b/tests/llm_web_kit/input/assets/content_json.json index d8f038a2..34236da4 100644 --- a/tests/llm_web_kit/input/assets/content_json.json +++ b/tests/llm_web_kit/input/assets/content_json.json @@ -168,7 +168,7 @@ "content": { "math_content": "a^2 + b^2 = c^2", "math_type": "latex", - "by": "None" + "by": "mathjax_mock" } }, { diff --git a/tests/llm_web_kit/input/test_datajson.py b/tests/llm_web_kit/input/test_datajson.py index 6996fc38..2854f201 100644 --- a/tests/llm_web_kit/input/test_datajson.py +++ b/tests/llm_web_kit/input/test_datajson.py @@ -564,7 +564,7 @@ def test_to_plain_md(self): self.assertNotIn('flower.mp4', mm_md) content_json = json_loads(base_dir.joinpath('assets/content_json.json').read_text(encoding='utf-8')) - self.assertEqual(json_json['content_list'], content_json['content_list']) + assert json_json['content_list'] == content_json['content_list'] plain_md_main = extract_content_from_main_html(url, raw_html, 'plain_md') mm_md_main = extract_content_from_html_with_magic_html(url, raw_html, 'mm_md') From 08f6fae6bfa5deb1983f11d00bb9fb297e64fc90 Mon Sep 17 00:00:00 2001 From: Yanggq <1041206149@qq.com> Date: Fri, 10 Oct 2025 16:37:05 +0800 Subject: [PATCH 3/4] add --- tests/llm_web_kit/extractor/html/recognizer/test_math.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_math.py b/tests/llm_web_kit/extractor/html/recognizer/test_math.py index 79d010aa..490c4486 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_math.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_math.py @@ -955,11 +955,11 @@ def test_latex_not_closed(self): def test_dollar_sign(self): """美元符合与公式共存的情况.""" - html_content = """
referring $18.1 to $18.1 the packet center $ p$ and apparently coinciding with the particle velocity
""" + html_content = """referring $18.1 to $18.1 the packet center $ p$ and $9 \cdot 10^9$ apparently coinciding with the particle velocity and $18.1
""" parts = self.math_recognizer.recognize('https://www.baidu.com', [(html_to_element(html_content), html_to_element(html_content))], html_content) - assert element_to_html(parts[0][0]) == 'referring \\$18.1 to \\$18.1 the packet center
referring \\$18.1 to \\$18.1 the packet center
referring $18.1 to $18.1 the packet center $ p$ and apparently coinciding with the particle velocity
""" + html_content = """referring $18.1 to $18.1 the packet center $ p$ and $9 + 10^9$ apparently coinciding with the particle velocity and $18.1
""" parts = self.math_recognizer.recognize('https://www.baidu.com', [(html_to_element(html_content), html_to_element(html_content))], html_content) - assert element_to_html(parts[0][0]) == 'referring \\$18.1 to \\$18.1 the packet center
referring \\$18.1 to \\$18.1 the packet center