ccprocessor · 1041206149 · Oct 10, 2025 · Oct 10, 2025 · Oct 10, 2025
diff --git a/llm_web_kit/extractor/html/recognizer/ccmath.py b/llm_web_kit/extractor/html/recognizer/ccmath.py
@@ -5,8 +5,7 @@
 
 from llm_web_kit.exception.exception import (
     HtmlMathMathjaxRenderRecognizerException, HtmlMathRecognizerException)
-from llm_web_kit.extractor.html.recognizer.cc_math import (tag_common_modify,
-                                                           tag_img, tag_math,
+from llm_web_kit.extractor.html.recognizer.cc_math import (tag_img, tag_math,
                                                            tag_mjx, tag_script)
 from llm_web_kit.extractor.html.recognizer.cc_math.common import (CCMATH, CSDN,
                                                                   ZHIHU)
@@ -140,14 +139,14 @@ def process_ccmath_html(self, cc_html: str, o_html: str, math_render: BaseMathRe
                     tag_script.process_zhihu_custom_tag(self.cm, math_render_type, node)
 
                 # tag = span， class 为 math-containerm， 或者 mathjax 或者 wp-katex-eq
-                if node.tag == 'span' and node.get('class') and (
-                        'math-container' in node.get('class') or
-                        'mathjax' in node.get('class') or
-                        'wp-katex-eq' in node.get('class') or
-                        'x-ck12-mathEditor' in node.get('class') or
-                        'tex' in node.get('class')
-                ):
-                    tag_common_modify.modify_tree(self.cm, math_render_type, original_html, node, parent)
+                # if node.tag == 'span' and node.get('class') and (
+                #         'math-container' in node.get('class') or
+                #         'mathjax' in node.get('class') or
+                #         'wp-katex-eq' in node.get('class') or
+                #         'x-ck12-mathEditor' in node.get('class') or
+                #         'tex' in node.get('class')
+                # ):
+                #     tag_common_modify.modify_tree(self.cm, math_render_type, original_html, node, parent)
 
                 # math tags
                 if node.tag == 'math' or node.tag.endswith(':math'):

diff --git a/llm_web_kit/libs/html_utils.py b/llm_web_kit/libs/html_utils.py
@@ -569,26 +569,42 @@ def groups(self):
 
 
 def optimized_dollar_matching(text):
-    """美元金额匹配."""
-    # 用于存储需要修改的位置和替换内容
+    """美元金额匹配,避免误判数学公式."""
     replacements = []
 
     pattern = r'(?<!\\)(\$\d{1,3}(?:,\d{3})*(?:\.\d{1,})?)'
-    matches_result = re.finditer(pattern, text)
+    matches_result = list(re.finditer(pattern, text))
+
     for match in matches_result:
-        # 获取匹配的起始和结束位置
         start, end = match.start(), match.end()
-        # 检查匹配后的字符（如果存在）
+
+        # 检查匹配后的字符
         if end < len(text):
             next_char = text[end]
-            # 只有当后接字符不在列表中时才进行替换
-            if next_char not in ["^", "$", "\\", "/"]:
-                replacements.append((start, end, match.group()))
+            # 原有逻辑:排除数学运算符
+            if next_char in ["^", "$", "\\", "/"]:
+                continue
+
+        # 新增逻辑:检查后续是否存在配对的$符号
+        remaining_text = text[end:]
+        # 查找下一个未转义的$
+        next_dollar_match = re.search(r'(?<!\\)\$', remaining_text)
+
+        if next_dollar_match:
+            next_dollar_pos = end + next_dollar_match.start()
+            # 检查第二个$后面的字符
+            after_second_dollar = text[next_dollar_pos + 1:next_dollar_pos + 2]
+
+            # 如果第二个$后面不是数字或为空,则认为是公式,跳过转义
+            if not after_second_dollar or not after_second_dollar.isdigit():
+                continue
+
+        # 如果通过所有检查,则进行转义
+        replacements.append((start, end, match.group()))
 
     if replacements:
         text_chars = list(text)
         for start, end, original_match in sorted(replacements, reverse=True):
-            # 只转义金额前的$符号
             escaped_match = f"\\{original_match}"
             text_chars[start:end] = list(escaped_match)
         return ''.join(text_chars)