ccprocessor · 1041206149 · Oct 22, 2025 · Oct 22, 2025 · Oct 22, 2025 · Oct 22, 2025
diff --git a/llm_web_kit/extractor/html/recognizer/cc_math/common.py b/llm_web_kit/extractor/html/recognizer/cc_math/common.py
@@ -538,62 +538,58 @@ def fix_mathml_superscript(self, mathml_str):
             parent.remove(msup)
         return etree.tostring(root, encoding='unicode', pretty_print=True)
 
-    def replace_math(self, new_tag: str, math_type: str, math_render: str, node: HtmlElement, func, asciimath_wrap: bool = False) -> HtmlElement:
-        # pattern re数学公式匹配 func 公式预处理 默认不处理
-        # ascii公式处理逻辑转移到mathjax渲染器方案中
-        if asciimath_wrap:
-            return node
-
-        pattern_type = MATH_TYPE_PATTERN.DISPLAYMATH if new_tag == CCMATH_INTERLINE else MATH_TYPE_PATTERN.INLINEMATH
-        original_text = node.text or ''
-
-        def is_ccmath_wrapped(match_text, original_text: str) -> bool:
-            if not match_text or not original_text:
-                return False
-            start_idx = match_text.start()
-            end_idx = match_text.end()
-            before_match = original_text[:start_idx].strip()
-            after_match = original_text[end_idx:].strip()
-            if 'ccmath' in before_match and 'ccmath' in after_match:
-                return True
-            if pattern_type == MATH_TYPE_PATTERN.DISPLAYMATH:
-                for start, end in MATH_TYPE_TO_DISPLAY[MathType.LATEX][MATH_TYPE_PATTERN.INLINEMATH]:
-                    if start in before_match and end in after_match:
-                        return True
-            return False
-
-        def process(match_text):
-            try:
-                match = match_text.group(0)
-                if is_ccmath_wrapped(match_text, original_text):
-                    return match
-                wrapped_text = func(match) if func else match
-                # html保留原始的，而不是传入修改过的wrapped_text
-                original_wrapped = wrapped_text
-                wrapped_text = self.wrap_math_md(wrapped_text)
-                if not wrapped_text:
-                    return match
-                new_span = build_cc_element(
-                    html_tag_name=new_tag,
-                    text=wrapped_text,
-                    tail='',
-                    type=math_type,
-                    by=math_render,
-                    html=original_wrapped
-                )
-            except Exception:
-                return match
-            return element_to_html(new_span)
+    def replace_math(self, new_tag: str, math_type: str, math_render: str, node: HtmlElement, func) -> HtmlElement:
+        """替换数学公式节点.
+
+        Args:
+            new_tag: 新标签名称(CCMATH_INLINE 或 CCMATH_INTERLINE)
+            math_type: 数学公式类型(MathType.LATEX 等)
+            math_render: 渲染器类型
+            node: 当前HTML节点
+            func: 公式预处理函数(可选)
+
+        Returns:
+            处理后的节点
+        """
         try:
-            for start, end in MATH_TYPE_TO_DISPLAY[math_type][pattern_type]:
-                pattern = f'{re.escape(start)}.*?{re.escape(end)}'.replace(r'\.\*\?', '.*?')
-                regex = re.compile(pattern, re.DOTALL)
-                original_text = re.sub(regex, process, original_text)
-        except Exception:
-            node.text = self.build_cc_exception_tag(original_text, math_type, math_render)
-            return node
-        node.text = original_text
-        return html_to_element(element_to_html_unescaped(node))
+            text = node.text
+            if not text or not text_strip(text):
+                return node
+
+            # 预处理公式
+            if func:
+                text = func(text)
+
+            # 去除分隔符并标准化
+            formula = self.wrap_math_md(text)
+
+            # 处理特殊类型
+            if math_type == MathType.ASCIIMATH:
+                formula = self.extract_asciimath(formula)
+                formula = self.wrap_math_md(formula)
+
+            # 构建新节点
+            new_span = build_cc_element(
+                html_tag_name=new_tag,
+                text=formula,
+                tail=text_strip(node.tail),
+                type=math_type,
+                by=math_render,
+                html=element_to_html(node)
+            )
+
+            return new_span
+
+        except Exception as e:
+            # 处理失败时返回失败标记节点
+            return build_cc_element(
+                html_tag_name=CCMATH_HANDLE_FAILED,
+                text=node.text if node.text else '',
+                tail=text_strip(node.tail),
+                type=math_type,
+                by=math_render,
+                html=element_to_html(node)
+            )
 
     def build_cc_exception_tag(self, text, math_type, math_render) -> str:
         return element_to_html(build_cc_element(

diff --git a/llm_web_kit/extractor/html/recognizer/cc_math/tag_common_modify.py b/llm_web_kit/extractor/html/recognizer/cc_math/tag_common_modify.py
@@ -2,12 +2,12 @@
 
 from llm_web_kit.exception.exception import HtmlMathRecognizerException
 from llm_web_kit.extractor.html.recognizer.cc_math.common import (CCMATH,
-                                                                  MathType,
                                                                   text_strip)
 from llm_web_kit.libs.html_utils import replace_element
 
 
-def modify_tree(cm: CCMATH, math_render: str, o_html: str, node: HtmlElement, parent: HtmlElement):
+
+def modify_tree(cm: CCMATH, math_render: str, o_html: str, node: HtmlElement):
     try:
         text = node.text
         tag_math_type_list = cm.get_equation_type(o_html)
@@ -18,17 +18,9 @@ def modify_tree(cm: CCMATH, math_render: str, o_html: str, node: HtmlElement, pa
             tail = node.tail
             new_span.tail = None
             for new_tag, math_type in tag_math_type_list:
-                asciimath_wrap = True if math_type == MathType.ASCIIMATH else False
-                new_span = cm.replace_math(new_tag, math_type, math_render, new_span, None,asciimath_wrap)
+                new_span = cm.replace_math(new_tag, math_type, math_render, new_span, None)
             new_span.tail = tail
             replace_element(node,new_span)
-            # if math_type == MathType.ASCIIMATH:
-            #     text = cm.wrap_math_md(text)
-            #     text = cm.extract_asciimath(text)
-            #     new_span = build_cc_element(html_tag_name=new_tag, text=cm.wrap_math_md(text), tail=text_strip(node.tail), type=math_type, by=math_render, html=o_html)
-            #     replace_element(node, new_span)
-            # elif math_type == MathType.LATEX:
-            #     new_span = build_cc_element(html_tag_name=new_tag, text=cm.wrap_math_md(text), tail=text_strip(node.tail), type=math_type, by=math_render, html=o_html)
-            #     replace_element(node, new_span)
+
     except Exception as e:
         raise HtmlMathRecognizerException(f'Error processing script mathtex: {e}')
diff --git a/llm_web_kit/extractor/html/recognizer/ccmath.py b/llm_web_kit/extractor/html/recognizer/ccmath.py
@@ -140,14 +140,8 @@ def process_ccmath_html(self, cc_html: str, o_html: str, math_render: BaseMathRe
                     tag_script.process_zhihu_custom_tag(self.cm, math_render_type, node)
 
                 # tag = span， class 为 math-containerm， 或者 mathjax 或者 wp-katex-eq
-                if node.tag == 'span' and node.get('class') and (
-                        'math-container' in node.get('class') or
-                        'mathjax' in node.get('class') or
-                        'wp-katex-eq' in node.get('class') or
-                        'x-ck12-mathEditor' in node.get('class') or
-                        'tex' in node.get('class')
-                ):
-                    tag_common_modify.modify_tree(self.cm, math_render_type, original_html, node, parent)
+                if node.tag == 'span' and node.get('class') and 'math-container' in node.get('class'):
+                    tag_common_modify.modify_tree(self.cm, math_render_type, original_html, node)
 
                 # math tags
                 if node.tag == 'math' or node.tag.endswith(':math'):

diff --git a/...ctor/html/recognizer/assets/ccmath/stackexchange_1_span-math-container_latex_mathjax.html b/...ctor/html/recognizer/assets/ccmath/stackexchange_1_span-math-container_latex_mathjax.html
@@ -1406,7 +1406,7 @@ <h2 class="mb0" data-answercount="2">
 <div class="answercell post-layout--right">
     <span class="d-none">$\begingroup$</span>
     <div class="s-prose js-post-body" itemprop="text">
-<p>When gravitational waves reach Earth, <a href="https://en.wikipedia.org/wiki/Gravitational_wave#Difficulties" rel="nofollow noreferrer">they usually give a strain</a> of <span class="math-container">$\delta L \over L$$=10^{-21}$</span>.</p>
+<p>When gravitational waves reach Earth, <a href="https://en.wikipedia.org/wiki/Gravitational_wave#Difficulties" rel="nofollow noreferrer">they usually give a strain</a> of <span class="math-container">$\delta L \over L = 10^{-21}$</span>.</p>
 <p>If we assume that they scale with the distance the same way electromagnetic waves do, thus following the inverse square law, we can get an estimate of the distance needed.</p>
 <p>LIGO detected the <a href="https://en.wikipedia.org/wiki/First_observation_of_gravitational_waves" rel="nofollow noreferrer">first merger</a> of black holes at 1.3 billion light years away.</p>
 <p>If we would get to 1 light year away from the merger, under the above hypothesis we would get a strain of <span class="math-container">$10^{-21} \times (1.3 \cdot 10^9)^2=10^{-3}$</span>. This means that on 1 meter length we would notice a 1 mm oscillation, which is something we are able to sense.</p>

diff --git a/.../recognizer/assets/ccmath/stackexchange_1_span-math-container_latex_mathjax_inline_1.html b/.../recognizer/assets/ccmath/stackexchange_1_span-math-container_latex_mathjax_inline_1.html
@@ -7,8 +7,7 @@
 M_{\odot}
 M_{\odot}
 M_{\odot}
-\delta L \over L
-=10^{-21}
+\delta L \over L = 10^{-21}
 10^{-21} \times (1.3 \cdot 10^9)^2=10^{-3}
 1/r
 10^{-21} \times (1.3 \cdot 10^9)=10^{-9}

diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_math.py b/tests/llm_web_kit/extractor/html/recognizer/test_math.py
@@ -511,6 +511,7 @@ def test_math_recognizer_html(self):
             # print('answers::::::::', answers)
             # self.write_to_html(answers, test_case['input'][0])
             # 检查行内公式抽取正确性
+            # 检查行内公式抽取正确性
             if test_case.get('expected_inline', None):
                 # 从所有parts中提取所有行内公式
                 all_inline_formulas = []
@@ -521,16 +522,40 @@ def test_math_recognizer_html(self):
                         for inline_elem in inline_elements:
                             formula = inline_elem.text.replace('\n', '').strip()
                             all_inline_formulas.append(formula)
-                # print(f"Found {len(all_inline_formulas)} total inline formulas")
-                # print(f"Total new_parts: {len(new_parts)}")
+
                 expect_inline_text = base_dir.joinpath(test_case['expected_inline']).read_text(encoding='utf-8').strip()
                 expect_inline_formulas = [formula for formula in expect_inline_text.split('\n') if formula]
-                # print(f"Expected {len(expect_inline_formulas)} inline formulas")
+
+                # 如果数量不匹配，输出详细信息
+                if len(all_inline_formulas) != len(expect_inline_formulas):
+                    print("\n" + "=" * 80)
+                    print("行内公式抽取出错!")
+                    print("=" * 80)
+                    print(f"出错样例: {test_case['input']}")
+                    print(f"预期公式数: {len(expect_inline_formulas)}")
+                    print(f"实际公式数: {len(all_inline_formulas)}")
+                    print("\n预期公式列表:")
+                    for i, formula in enumerate(expect_inline_formulas, 1):
+                        print(f"  {i}. {formula}")
+                    print("\n实际公式列表:")
+                    for i, formula in enumerate(all_inline_formulas, 1):
+                        print(f"  {i}. {formula}")
+
+                    # 找出差异
+                    print("\n差异分析:")
+                    if len(all_inline_formulas) > len(expect_inline_formulas):
+                        print(f"多提取了 {len(all_inline_formulas) - len(expect_inline_formulas)} 个公式:")
+                        extra_formulas = all_inline_formulas[len(expect_inline_formulas):]
+                        for i, formula in enumerate(extra_formulas, 1):
+                            print(f"  {i}. {formula}")
+                    else:
+                        print(f"少提取了 {len(expect_inline_formulas) - len(all_inline_formulas)} 个公式:")
+                        missing_formulas = expect_inline_formulas[len(all_inline_formulas):]
+                        for i, formula in enumerate(missing_formulas, 1):
+                            print(f"  {i}. {formula}")
+                    print("=" * 80 + "\n")
+
                 self.assertEqual(len(all_inline_formulas), len(expect_inline_formulas))
-                for expect, formula in zip(expect_inline_formulas, all_inline_formulas):
-                    # print('inline expect::::::::', expect)
-                    # print('inline answer::::::::', formula)
-                    self.assertEqual(expect, formula)
 
     def write_to_html(self, answers, file_name):
         file_name = file_name.split('.')[0]

diff --git a/tests/llm_web_kit/input/assets/content_json.json b/tests/llm_web_kit/input/assets/content_json.json
@@ -168,7 +168,7 @@
         "content": {
           "math_content": "a^2 + b^2 = c^2",
           "math_type": "latex",
-          "by": "None"
+          "by": "mathjax_mock"
         }
       },
       {