ccprocessor · 1041206149 · Oct 10, 2025
diff --git a/llm_web_kit/extractor/html/recognizer/cc_math/common.py b/llm_web_kit/extractor/html/recognizer/cc_math/common.py
@@ -5,7 +5,6 @@
 from typing import List, Tuple
 
 from lxml import etree
-from lxml.html import HtmlElement
 
 # 在导入前就设置严格的日志控制
 logging.basicConfig(level=logging.WARNING, force=True)
@@ -20,7 +19,6 @@
 from llm_web_kit.extractor.html.recognizer.recognizer import CCTag
 from llm_web_kit.libs.doc_element_type import DocElementType
 from llm_web_kit.libs.html_utils import (build_cc_element, element_to_html,
-                                         element_to_html_unescaped,
                                          html_to_element)
 from llm_web_kit.libs.text_utils import normalize_ctl_text
 
@@ -538,63 +536,6 @@ def fix_mathml_superscript(self, mathml_str):
             parent.remove(msup)
         return etree.tostring(root, encoding='unicode', pretty_print=True)
 
-    def replace_math(self, new_tag: str, math_type: str, math_render: str, node: HtmlElement, func, asciimath_wrap: bool = False) -> HtmlElement:
-        # pattern re数学公式匹配 func 公式预处理 默认不处理
-        # ascii公式处理逻辑转移到mathjax渲染器方案中
-        if asciimath_wrap:
-            return node
-
-        pattern_type = MATH_TYPE_PATTERN.DISPLAYMATH if new_tag == CCMATH_INTERLINE else MATH_TYPE_PATTERN.INLINEMATH
-        original_text = node.text or ''
-
-        def is_ccmath_wrapped(match_text, original_text: str) -> bool:
-            if not match_text or not original_text:
-                return False
-            start_idx = match_text.start()
-            end_idx = match_text.end()
-            before_match = original_text[:start_idx].strip()
-            after_match = original_text[end_idx:].strip()
-            if 'ccmath' in before_match and 'ccmath' in after_match:
-                return True
-            if pattern_type == MATH_TYPE_PATTERN.DISPLAYMATH:
-                for start, end in MATH_TYPE_TO_DISPLAY[MathType.LATEX][MATH_TYPE_PATTERN.INLINEMATH]:
-                    if start in before_match and end in after_match:
-                        return True
-            return False
-
-        def process(match_text):
-            try:
-                match = match_text.group(0)
-                if is_ccmath_wrapped(match_text, original_text):
-                    return match
-                wrapped_text = func(match) if func else match
-                # html保留原始的，而不是传入修改过的wrapped_text
-                original_wrapped = wrapped_text
-                wrapped_text = self.wrap_math_md(wrapped_text)
-                if not wrapped_text:
-                    return match
-                new_span = build_cc_element(
-                    html_tag_name=new_tag,
-                    text=wrapped_text,
-                    tail='',
-                    type=math_type,
-                    by=math_render,
-                    html=original_wrapped
-                )
-            except Exception:
-                return match
-            return element_to_html(new_span)
-        try:
-            for start, end in MATH_TYPE_TO_DISPLAY[math_type][pattern_type]:
-                pattern = f'{re.escape(start)}.*?{re.escape(end)}'.replace(r'\.\*\?', '.*?')
-                regex = re.compile(pattern, re.DOTALL)
-                original_text = re.sub(regex, process, original_text)
-        except Exception:
-            node.text = self.build_cc_exception_tag(original_text, math_type, math_render)
-            return node
-        node.text = original_text
-        return html_to_element(element_to_html_unescaped(node))
-
     def build_cc_exception_tag(self, text, math_type, math_render) -> str:
         return element_to_html(build_cc_element(
             html_tag_name=CCMATH_HANDLE_FAILED,
@@ -621,12 +562,6 @@ def build_cc_exception_tag(self, text, math_type, math_render) -> str:
     print(cm.wrap_math_md(r'$$a^2 + b^2 = c^2$$'))
     print(cm.wrap_math_md(r'\(a^2 + b^2 = c^2\)'))
     print(cm.extract_asciimath('x=(-b +- sqrt(b^2 - 4ac))/(2a)'))
-    print(cm.replace_math('ccmath-interline','asciimath','',html_to_element(r'<p>`x=(-b +- sqrt(b^2 - 4ac))/(2a)`</p>'),None,True))
-    print(cm.replace_math('ccmath-interline','asciimath','',html_to_element(r'<p>like this: \`E=mc^2\`</p>'),None,True))
-    print(cm.replace_math('ccmath-interline','asciimath','',html_to_element(r'<p>A `3xx3` matrix,`((1,2,3),(4,5,6),(7,8,9))`, and a `2xx1` matrix, or vector, `((1),(0))`.</p>'),None,True))
-    print(cm.replace_math('ccmath-interline','asciimath','',html_to_element(r'<p>`(x+1)/x^2``1/3245`</p>'),None,True))
-    print(cm.replace_math('ccmath-interline','latex','',html_to_element(r'<p>start $$f(a,b,c) = (a^2+b^2+c^2)^3$$end</p>'),None,False))
-    print(cm.replace_math('ccmath-inline','latex','',html_to_element(r'<p>\( \newcommand{\norm}[1]{\| #1 \|}\)</p>'),None,False))
     # cm.url = 'mathhelpforum.com'
     # print(cm.wrap_math_md_custom(r'<br />\begin{align} a^2+b=c\end{align}\<br />'))
     # print(cm.wrap_math_md_custom(r'<br />dz=\frac{1}{2}\frac{dx}{\cos ^2 x}<br />'))
diff --git a/llm_web_kit/extractor/html/recognizer/cc_math/render/mathjax.py b/llm_web_kit/extractor/html/recognizer/cc_math/render/mathjax.py
@@ -290,12 +290,20 @@ def _find_math_in_element(self, element: HtmlElement, inline_pattern: re.Pattern
 
         # 先处理tail，再处理text，text的判断会多一些
         if element.tail:
+            # Order matters: try display (interline) math first, then inline math, and only then handle dollar amounts
+            original_tail = element.tail
+
             # 处理行间公式（优先处理，因为可能包含行内公式）
             element.tail = self._process_math_in_text(element, element.tail, display_pattern, True, True)
             # 处理行内公式
             if element.tail:  # 检查是否还有文本需要处理
                 element.tail = self._process_math_in_text(element, element.tail, inline_pattern, False, True)
 
+            # Fall back to optimized_dollar_matching only when neither pass above changed
+            # the tail (i.e. no delimited formula was matched) and it still contains '$'
+            if element.tail == original_tail and '$' in element.tail:
+                element.tail = optimized_dollar_matching(element.tail)
+
         # 跳过特定标签
         skip_tags = MATHJAX_OPTIONS['skipTags']
         if element.tag in skip_tags:
@@ -314,11 +322,16 @@ def _find_math_in_element(self, element: HtmlElement, inline_pattern: re.Pattern
 
         # 处理当前节点的文本
         if element.text:
+            original_text = element.text
+
             # 处理行间公式（优先处理，因为可能包含行内公式）
-            element.text = self._process_math_in_text(element, element.text, display_pattern, True)
+            element.text = self._process_math_in_text(element, element.text, display_pattern, True, False)
             # 处理行内公式
-            if element.text:  # 检查是否还有文本需要处理
-                element.text = self._process_math_in_text(element, element.text, inline_pattern, False)
+            if element.text:
+                element.text = self._process_math_in_text(element, element.text, inline_pattern, False, False)
+            # Fall back to optimized_dollar_matching only when neither pass above changed the text
+            if element.text == original_text and '$' in element.text:
+                element.text = optimized_dollar_matching(element.text)
 
         # 获取子节点的副本，以避免在迭代过程中修改列表
         children = list(element)
@@ -411,7 +424,7 @@ def _process_math_in_text(
             tem_match_display.clear()
         # 如果没有匹配到分隔符形式的公式，直接返回原文本
         if not matches:
-            return optimized_dollar_matching(text)
+            return text
 
         # 从后向前处理，以避免位置偏移
         result = text
@@ -487,7 +500,7 @@ def _process_math_in_text(
             last_position = start_pos
 
         # 返回处理后的文本
-        return optimized_dollar_matching(result)
+        return result
 
     def _is_escaped_delimiter(self, text: str, pos: int) -> bool:
         """检查分隔符是否被转义.

diff --git a/llm_web_kit/extractor/html/recognizer/ccmath.py b/llm_web_kit/extractor/html/recognizer/ccmath.py
@@ -5,8 +5,7 @@
 
 from llm_web_kit.exception.exception import (
     HtmlMathMathjaxRenderRecognizerException, HtmlMathRecognizerException)
-from llm_web_kit.extractor.html.recognizer.cc_math import (tag_common_modify,
-                                                           tag_img, tag_math,
+from llm_web_kit.extractor.html.recognizer.cc_math import (tag_img, tag_math,
                                                            tag_mjx, tag_script)
 from llm_web_kit.extractor.html.recognizer.cc_math.common import (CCMATH, CSDN,
                                                                   ZHIHU)
@@ -139,16 +138,6 @@ def process_ccmath_html(self, cc_html: str, o_html: str, math_render: BaseMathRe
                 if ZHIHU.DOMAIN in self.cm.url and node.tag == 'span' and node.get('class') == ZHIHU.MATH:
                     tag_script.process_zhihu_custom_tag(self.cm, math_render_type, node)
 
-                # tag = span， class 为 math-containerm， 或者 mathjax 或者 wp-katex-eq
-                if node.tag == 'span' and node.get('class') and (
-                        'math-container' in node.get('class') or
-                        'mathjax' in node.get('class') or
-                        'wp-katex-eq' in node.get('class') or
-                        'x-ck12-mathEditor' in node.get('class') or
-                        'tex' in node.get('class')
-                ):
-                    tag_common_modify.modify_tree(self.cm, math_render_type, original_html, node, parent)
-
                 # math tags
                 if node.tag == 'math' or node.tag.endswith(':math'):
                     # print(f"匹配到数学标签: {node.tag}")

diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_math.py b/tests/llm_web_kit/extractor/html/recognizer/test_math.py
@@ -463,16 +463,9 @@ def test_math_recognizer(self):
     def test_math_recognizer_html(self):
         for test_case in TEST_CASES_HTML:
             raw_html_path = base_dir.joinpath(test_case['input'][0])
-            # print('raw_html_path::::::::', raw_html_path)
             base_url = test_case['base_url']
             raw_html = raw_html_path.read_text(encoding='utf-8')
-            parts = self.math_recognizer.recognize(base_url, [(html_to_element(raw_html), html_to_element(raw_html))], raw_html)
-            # print(parts)
-            # 将parts列表中第一个元素拼接保存到文件，带随机数
-            # import random
-            # with open('parts'+str(random.randint(1, 100))+".html", 'w') as f:
-            #     for part in parts:
-            #         f.write(str(part[0]))
+
             # 创建预处理器并清理隐藏元素
             pre_extractor = HTMLFileFormatNoClipCleanTagsPreExtractor({})
             data_json = DataJson({'html': raw_html, 'url': base_url})
@@ -485,34 +478,28 @@ def test_math_recognizer_html(self):
                 [(html_to_element(cleaned_html), html_to_element(cleaned_html))],
                 cleaned_html
             )
-            # 检查行间公式抽取正确性
+
+            # Check interline formulas: both the number of extracted formulas and the
+            # text of each one are asserted against the expected file (not just printed).
             new_parts = []
             for part in parts:
                 new_parts.append((element_to_html(part[0]), element_to_html(part[1])))
-            parts = [part[0] for part in new_parts if CCTag.CC_MATH_INTERLINE in part[0]]
+
+            interline_parts = [part[0] for part in new_parts if CCTag.CC_MATH_INTERLINE in part[0]]
             expect_text = base_dir.joinpath(test_case['expected']).read_text(encoding='utf-8').strip()
             expect_formulas = [formula for formula in expect_text.split('\n') if formula]
-            if len(parts) != len(expect_formulas):
-                print("出错样例：", test_case['input'])
-                print("期望公式数：", len(expect_formulas), "实际公式数：", len(parts))
-                print("期望公式：", expect_formulas)
-                print("实际公式：", parts)
-            self.assertEqual(len(parts), len(expect_formulas))
-            # answers = []
-            for expect, part in zip(expect_formulas, parts):
+            self.assertEqual(
+                len(interline_parts), len(expect_formulas),
+                f"interline formula count mismatch for {test_case['input']}: "
+                f"expected {expect_formulas}, got {interline_parts}")
+            for expect, part in zip(expect_formulas, interline_parts):
                 a_tree = html_to_element(part)
                 a_result = a_tree.xpath(f'.//{CCTag.CC_MATH_INTERLINE}')[0]
                 answer = a_result.text.replace('\n', '').strip()
-                # print('part::::::::', part)
-                # print('expect::::::::', expect)
-                # print('answer::::::::', answer)
-                # answers.append(answer)
                 self.assertEqual(expect, answer)
-            # print('answers::::::::', answers)
-            # self.write_to_html(answers, test_case['input'][0])
-            # 检查行内公式抽取正确性
+
+            # Check inline formulas
             if test_case.get('expected_inline', None):
-                # 从所有parts中提取所有行内公式
                 all_inline_formulas = []
                 for part in new_parts:
                     if CCTag.CC_MATH_INLINE in part[0]:
@@ -521,15 +508,23 @@ def test_math_recognizer_html(self):
                         for inline_elem in inline_elements:
                             formula = inline_elem.text.replace('\n', '').strip()
                             all_inline_formulas.append(formula)
-                # print(f"Found {len(all_inline_formulas)} total inline formulas")
-                # print(f"Total new_parts: {len(new_parts)}")
                 expect_inline_text = base_dir.joinpath(test_case['expected_inline']).read_text(encoding='utf-8').strip()
                 expect_inline_formulas = [formula for formula in expect_inline_text.split('\n') if formula]
-                # print(f"Expected {len(expect_inline_formulas)} inline formulas")
+                print(f"行内公式 - 期望: {len(expect_inline_formulas)}, 实际: {len(all_inline_formulas)}")
+                # 打印所有实际提取的行内公式
+                print("\n所有实际提取的行内公式:")
+                for i, formula in enumerate(all_inline_formulas, 1):
+                    print(f"  {i}. {formula}")
+                # 打印所有期望的行内公式
+                print("\n所有期望的行内公式:")
+                for i, formula in enumerate(expect_inline_formulas, 1):
+                    print(f"  {i}. {formula}")
+                # 找出差异
+                print("\n差异分析:")
+                if len(all_inline_formulas) != len(expect_inline_formulas):
+                    print("数量不匹配!")
                 self.assertEqual(len(all_inline_formulas), len(expect_inline_formulas))
                 for expect, formula in zip(expect_inline_formulas, all_inline_formulas):
-                    # print('inline expect::::::::', expect)
-                    # print('inline answer::::::::', formula)
                     self.assertEqual(expect, formula)
 
     def write_to_html(self, answers, file_name):