ccprocessor · 1041206149 · Oct 10, 2025 · Oct 10, 2025 · Oct 10, 2025 · Oct 10, 2025
diff --git a/llm_web_kit/extractor/html/recognizer/ccmath.py b/llm_web_kit/extractor/html/recognizer/ccmath.py
@@ -5,8 +5,7 @@
 
 from llm_web_kit.exception.exception import (
     HtmlMathMathjaxRenderRecognizerException, HtmlMathRecognizerException)
-from llm_web_kit.extractor.html.recognizer.cc_math import (tag_common_modify,
-                                                           tag_img, tag_math,
+from llm_web_kit.extractor.html.recognizer.cc_math import (tag_img, tag_math,
                                                            tag_mjx, tag_script)
 from llm_web_kit.extractor.html.recognizer.cc_math.common import (CCMATH, CSDN,
                                                                   ZHIHU)
@@ -140,14 +139,14 @@ def process_ccmath_html(self, cc_html: str, o_html: str, math_render: BaseMathRe
                     tag_script.process_zhihu_custom_tag(self.cm, math_render_type, node)
 
                 # tag = span， class 为 math-containerm， 或者 mathjax 或者 wp-katex-eq
-                if node.tag == 'span' and node.get('class') and (
-                        'math-container' in node.get('class') or
-                        'mathjax' in node.get('class') or
-                        'wp-katex-eq' in node.get('class') or
-                        'x-ck12-mathEditor' in node.get('class') or
-                        'tex' in node.get('class')
-                ):
-                    tag_common_modify.modify_tree(self.cm, math_render_type, original_html, node, parent)
+                # if node.tag == 'span' and node.get('class') and (
+                #         'math-container' in node.get('class') or
+                #         'mathjax' in node.get('class') or
+                #         'wp-katex-eq' in node.get('class') or
+                #         'x-ck12-mathEditor' in node.get('class') or
+                #         'tex' in node.get('class')
+                # ):
+                #     tag_common_modify.modify_tree(self.cm, math_render_type, original_html, node, parent)
 
                 # math tags
                 if node.tag == 'math' or node.tag.endswith(':math'):

diff --git a/llm_web_kit/libs/html_utils.py b/llm_web_kit/libs/html_utils.py
@@ -569,26 +569,42 @@ def groups(self):
 
 
 def optimized_dollar_matching(text):
-    """美元金额匹配."""
-    # 用于存储需要修改的位置和替换内容
+    """Escape the $ of dollar amounts with a backslash, skipping matches that look like inline math."""
     replacements = []
 
     pattern = r'(?<!\\)(\$\d{1,3}(?:,\d{3})*(?:\.\d{1,})?)'
-    matches_result = re.finditer(pattern, text)
+    matches_result = list(re.finditer(pattern, text))
+
     for match in matches_result:
-        # 获取匹配的起始和结束位置
         start, end = match.start(), match.end()
-        # 检查匹配后的字符（如果存在）
+
+        # 检查匹配后的字符
         if end < len(text):
             next_char = text[end]
-            # 只有当后接字符不在列表中时才进行替换
-            if next_char not in ["^", "$", "\\", "/"]:
-                replacements.append((start, end, match.group()))
+            # An amount directly followed by ^, $, \ or / is treated as math, not currency: leave it unescaped
+            if next_char in ["^", "$", "\\", "/"]:
+                continue
+
+        # Look ahead for a later unescaped $ that could pair with this one as a closing math delimiter
+        remaining_text = text[end:]
+        # 查找下一个未转义的$
+        next_dollar_match = re.search(r'(?<!\\)\$', remaining_text)
+
+        if next_dollar_match:
+            next_dollar_pos = end + next_dollar_match.start()
+            # 检查第二个$后面的字符
+            after_second_dollar = text[next_dollar_pos + 1:next_dollar_pos + 2]
+
+            # 如果第二个$后面不是数字或为空,则认为是公式,跳过转义
+            if not after_second_dollar or not after_second_dollar.isdigit():
+                continue
+
+        # 如果通过所有检查,则进行转义
+        replacements.append((start, end, match.group()))
 
     if replacements:
         text_chars = list(text)
         for start, end, original_match in sorted(replacements, reverse=True):
-            # 只转义金额前的$符号
             escaped_match = f"\\{original_match}"
             text_chars[start:end] = list(escaped_match)
         return ''.join(text_chars)

diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_math.py b/tests/llm_web_kit/extractor/html/recognizer/test_math.py
@@ -955,11 +955,11 @@ def test_latex_not_closed(self):
 
     def test_dollar_sign(self):
         """美元符合与公式共存的情况."""
-        html_content = """<p>referring $18.1 to $18.1 the packet center $ p$ and apparently coinciding with the particle velocity</p>"""
+        html_content = """<p>referring $18.1 to $18.1 the packet center $ p$ and <span class="math-container">$9 + 10^9$</span> apparently coinciding with the particle velocity and $18.1</p>"""
         parts = self.math_recognizer.recognize('https://www.baidu.com',
                                                [(html_to_element(html_content), html_to_element(html_content))],
                                                html_content)
-        assert element_to_html(parts[0][0]) == '<p>referring \\$18.1 to \\$18.1 the packet center <ccmath-inline type="latex" by="mathjax_mock" html="$ p$">p</ccmath-inline> and apparently coinciding with the particle velocity</p>'
+        assert element_to_html(parts[0][0]) == '<p>referring \\$18.1 to \\$18.1 the packet center <ccmath-inline type="latex" by="mathjax_mock" html="$ p$">p</ccmath-inline> and <span class="math-container"><ccmath-inline type="latex" by="mathjax_mock" html="$9 + 10^9$">9 + 10^9</ccmath-inline></span> apparently coinciding with the particle velocity and \\$18.1</p>'
 
     def test_begin_end(self):
         """$begin end$的嵌套组合识别时候$$没有处理."""

diff --git a/tests/llm_web_kit/input/assets/content_json.json b/tests/llm_web_kit/input/assets/content_json.json
@@ -168,7 +168,7 @@
         "content": {
           "math_content": "a^2 + b^2 = c^2",
           "math_type": "latex",
-          "by": "None"
+          "by": "mathjax_mock"
         }
       },
       {

diff --git a/tests/llm_web_kit/input/test_datajson.py b/tests/llm_web_kit/input/test_datajson.py
@@ -564,7 +564,7 @@ def test_to_plain_md(self):
         self.assertNotIn('flower.mp4', mm_md)
 
         content_json = json_loads(base_dir.joinpath('assets/content_json.json').read_text(encoding='utf-8'))
-        self.assertEqual(json_json['content_list'], content_json['content_list'])
+        assert json_json['content_list'] == content_json['content_list']
 
         plain_md_main = extract_content_from_main_html(url, raw_html, 'plain_md')
         mm_md_main = extract_content_from_html_with_magic_html(url, raw_html, 'mm_md')