diff --git a/llm_web_kit/extractor/html/recognizer/ccmath.py b/llm_web_kit/extractor/html/recognizer/ccmath.py index 780ba583..717573c2 100644 --- a/llm_web_kit/extractor/html/recognizer/ccmath.py +++ b/llm_web_kit/extractor/html/recognizer/ccmath.py @@ -5,8 +5,7 @@ from llm_web_kit.exception.exception import ( HtmlMathMathjaxRenderRecognizerException, HtmlMathRecognizerException) -from llm_web_kit.extractor.html.recognizer.cc_math import (tag_common_modify, - tag_img, tag_math, +from llm_web_kit.extractor.html.recognizer.cc_math import (tag_img, tag_math, tag_mjx, tag_script) from llm_web_kit.extractor.html.recognizer.cc_math.common import (CCMATH, CSDN, ZHIHU) @@ -140,14 +139,14 @@ def process_ccmath_html(self, cc_html: str, o_html: str, math_render: BaseMathRe tag_script.process_zhihu_custom_tag(self.cm, math_render_type, node) # tag = span, class 为 math-containerm, 或者 mathjax 或者 wp-katex-eq - if node.tag == 'span' and node.get('class') and ( - 'math-container' in node.get('class') or - 'mathjax' in node.get('class') or - 'wp-katex-eq' in node.get('class') or - 'x-ck12-mathEditor' in node.get('class') or - 'tex' in node.get('class') - ): - tag_common_modify.modify_tree(self.cm, math_render_type, original_html, node, parent) + # if node.tag == 'span' and node.get('class') and ( + # 'math-container' in node.get('class') or + # 'mathjax' in node.get('class') or + # 'wp-katex-eq' in node.get('class') or + # 'x-ck12-mathEditor' in node.get('class') or + # 'tex' in node.get('class') + # ): + # tag_common_modify.modify_tree(self.cm, math_render_type, original_html, node, parent) # math tags if node.tag == 'math' or node.tag.endswith(':math'): diff --git a/llm_web_kit/libs/html_utils.py b/llm_web_kit/libs/html_utils.py index d1d0a648..229ab66e 100644 --- a/llm_web_kit/libs/html_utils.py +++ b/llm_web_kit/libs/html_utils.py @@ -569,26 +569,42 @@ def groups(self): def optimized_dollar_matching(text): - """美元金额匹配.""" - # 用于存储需要修改的位置和替换内容 + """美元金额匹配,避免误判数学公式.""" replacements = [] pattern = r'(?referring $18.1 to $18.1 the packet center $ p$ and apparently coinciding with the particle velocity
""" + html_content = """referring $18.1 to $18.1 the packet center $ p$ and $9 + 10^9$ apparently coinciding with the particle velocity and $18.1
""" parts = self.math_recognizer.recognize('https://www.baidu.com', [(html_to_element(html_content), html_to_element(html_content))], html_content) - assert element_to_html(parts[0][0]) == 'referring \\$18.1 to \\$18.1 the packet center
referring \\$18.1 to \\$18.1 the packet center