ccprocessor · e06084 · Oct 9, 2025 · Jun 9, 2025 · Jun 9, 2025 · Jun 9, 2025
diff --git a/llm_web_kit/extractor/html/recognizer/cc_math/render/mathjax.py b/llm_web_kit/extractor/html/recognizer/cc_math/render/mathjax.py
@@ -1,10 +1,14 @@
 import re
 from typing import Any, Dict, List
 
+from pylatexenc import latexwalker
+
 from llm_web_kit.extractor.html.recognizer.cc_math.common import CCMATH
 from llm_web_kit.extractor.html.recognizer.cc_math.render.render import (
     BaseMathRender, MathRenderType)
-from llm_web_kit.libs.html_utils import HtmlElement, html_to_element
+from llm_web_kit.libs.html_utils import (HtmlElement, SimpleMatch,
+                                         html_to_element,
+                                         optimized_dollar_matching)
 from llm_web_kit.libs.text_utils import normalize_ctl_text
 
 # 添加MATHJAX_OPTIONS变量定义
@@ -29,6 +33,21 @@
     'AM_CHTML'
 ]
 
+# Environment names treated as standalone (display) formulas
+independent_math_environments = [
+    'displaymath',
+    'equation',
+    'equation*',
+    'align',
+    'align*',
+    'gather',
+    'gather*',
+    'multline',
+    'multline*',
+    'vmatrix',
+    'Vmatrix'
+]
+
 
 class MathJaxRender(BaseMathRender):
     """MathJax渲染器实现."""
@@ -246,10 +265,10 @@ def find_math(self, root: HtmlElement) -> None:
             display_patterns.append(pattern)
 
         # 添加对环境的支持
-        if MATHJAX_OPTIONS.get('processEnvironments', True):
-            # 通用匹配任何 \begin{...}\end{...} 环境的模式，保证环境名称相同时才匹配
-            env_pattern = r'(\\begin\{(?P<env>[^}]+)\}.*?\\end\{(?P=env)\})'
-            display_patterns.append(env_pattern)
+        # if MATHJAX_OPTIONS.get('processEnvironments', True):
+        #     # 通用匹配任何 \begin{...}\end{...} 环境的模式，保证环境名称相同时才匹配
+        #     env_pattern = r'(\\begin\{(?P<env>[^}]+)\}.*?\\end\{(?P=env)\})'
+        #     display_patterns.append(env_pattern)
 
         # 编译正则表达式
         inline_pattern = re.compile('|'.join(inline_patterns), re.DOTALL)
@@ -358,11 +377,41 @@ def _process_math_in_text(
             return text
 
         # 首先查找所有分隔符形式的匹配
-        matches = list(pattern.finditer(text))
-
+        if not is_display:
+            matches = list(pattern.finditer(text))
+        else:
+            matches = []
+            tem_match_display = []
+            walker = latexwalker.LatexWalker(text)
+            nodelist, pos, len_ = walker.get_latex_nodes(pos=0)
+            for node in nodelist:
+                # 标准的数学环境
+                if node.isNodeType(latexwalker.LatexMathNode):
+                    # 判断是行内公式还是独立公式
+                    if node.displaytype == 'inline':
+                        pass
+                    elif node.displaytype == 'display':
+                        tem_match_display.append(node.latex_verbatim())
+                        fake_match = SimpleMatch(text, node.pos, node.len)
+                        matches.append(fake_match)
+                # 其他数学环境
+                if (node.isNodeType(latexwalker.LatexEnvironmentNode) and
+                        hasattr(node, 'environmentname') and
+                        node.environmentname in independent_math_environments):
+                    tem_match_display.append(node.latex_verbatim())
+                    fake_match = SimpleMatch(text, node.pos, node.len)
+                    matches.append(fake_match)
+            # 公式自定义边界逻辑
+            new_display_patterns = [item for item in pattern.pattern.split('|') if "$" not in item]
+            custom_pattern = re.compile('|'.join(new_display_patterns), re.DOTALL)
+            custom_matches = list(custom_pattern.finditer(text))
+            for item in custom_matches:
+                if item.group() not in tem_match_display:
+                    matches.append(item)
+            tem_match_display.clear()
         # 如果没有匹配到分隔符形式的公式，直接返回原文本
         if not matches:
-            return text
+            return optimized_dollar_matching(text)
 
         # 从后向前处理，以避免位置偏移
         result = text
@@ -438,7 +487,7 @@ def _process_math_in_text(
             last_position = start_pos
 
         # 返回处理后的文本
-        return result
+        return optimized_dollar_matching(result)
 
     def _is_escaped_delimiter(self, text: str, pos: int) -> bool:
         """检查分隔符是否被转义.

diff --git a/llm_web_kit/extractor/html/recognizer/cc_math/tag_math.py b/llm_web_kit/extractor/html/recognizer/cc_math/tag_math.py
@@ -7,8 +7,9 @@
 from llm_web_kit.extractor.html.recognizer.cc_math.common import (CCMATH,
                                                                   MathType,
                                                                   text_strip)
-from llm_web_kit.libs.html_utils import (build_cc_element, element_to_html,
-                                         replace_element)
+from llm_web_kit.libs.html_utils import (build_cc_element,
+                                         check_and_balance_delimiters,
+                                         element_to_html, replace_element)
 
 
 def modify_tree(cm: CCMATH, math_render: str, o_html: str, node: HtmlElement, parent: HtmlElement):
@@ -55,6 +56,7 @@ def modify_tree(cm: CCMATH, math_render: str, o_html: str, node: HtmlElement, pa
             # 处理未转义的%为\%
             if latex:
                 latex = re.sub(r'(?<!\\)%', r'\\%', latex)
+                latex = check_and_balance_delimiters(latex)
             text = cm.wrap_math_md(latex)
             if text:
                 # Set the html of the new span tag to the text

diff --git a/llm_web_kit/extractor/html/recognizer/title.py b/llm_web_kit/extractor/html/recognizer/title.py
@@ -134,6 +134,8 @@ def __extract_title_text_recusive(el: HtmlElement, with_tail: bool = True) -> li
 
             if el.tag == CCTag.CC_CODE_INLINE:
                 blks.append(f'`{el.text}`')
+            elif el.tag == CCTag.CC_MATH_INLINE:
+                blks.append(f'${el.text.strip()}$')
             elif el.tag in ['br']:
                 blks.extend(['$br$'])
             else:

diff --git a/llm_web_kit/input/datajson.py b/llm_web_kit/input/datajson.py
@@ -55,7 +55,7 @@ def __init__(self):
         self.__text_end = '\n'
         self.__list_item_start = '-'  # md里的列表项前缀
         self.__list_para_prefix = '  '  # 两个空格，md里的列表项非第一个段落的前缀：如果多个段落的情况，第二个以及之后的段落前缀
-        self.__md_special_chars = ['#', '`', '$']  # TODO 拼装table的时候还应该转义掉|符号
+        self.__md_special_chars = ['#', '`']  # TODO 拼装table的时候还应该转义掉|符号
         self.__nodes_document_type = [DocElementType.MM_NODE_LIST, DocElementType.PARAGRAPH, DocElementType.LIST,
                                       DocElementType.SIMPLE_TABLE, DocElementType.COMPLEX_TABLE, DocElementType.TITLE,
                                       DocElementType.IMAGE, DocElementType.AUDIO, DocElementType.VIDEO,

diff --git a/llm_web_kit/libs/html_utils.py b/llm_web_kit/libs/html_utils.py
@@ -483,6 +483,41 @@ def restore_sub_sup_from_text_regex(processed_content):
     return re.sub(pattern, lambda m: replacement_map[m.group(0)], processed_content)
 
 
+def check_and_balance_delimiters(latex_str):
+    r"""Drop unpaired ``\left`` / ``\right`` commands from a LaTeX string.
+
+    Commands are paired in order of appearance: each ``\right`` closes the most
+    recent open ``\left``. Only the unpaired command is removed; the delimiter
+    that follows it (e.g. ``(`` or ``\}``) is kept. Longer control words such
+    as ``\leftarrow`` or ``\rightarrow`` are not size commands and are ignored.
+
+    Args:
+        latex_str (str): Input LaTeX string.
+
+    Returns:
+        str: The string with every unpaired ``\left`` / ``\right`` removed.
+    """
+    # The lookahead rejects control words that merely start with left/right.
+    pattern = re.compile(r'\\(?:left|right)(?![A-Za-z])')
+    open_lefts = []  # spans of \left commands still waiting for a \right
+    unpaired = []  # spans of commands to delete
+
+    for match in pattern.finditer(latex_str):
+        if match.group() == r'\left':
+            open_lefts.append(match.span())
+        elif open_lefts:
+            open_lefts.pop()
+        else:
+            unpaired.append(match.span())
+    unpaired.extend(open_lefts)
+
+    # Delete from the end so earlier offsets stay valid.
+    for start, end in sorted(unpaired, reverse=True):
+        latex_str = latex_str[:start] + latex_str[end:]
+
+    return latex_str
+
+
 def get_plain_text_fast(html_source: str) -> str:
     """使用lxml快速获取html中的纯文本.
 
@@ -506,3 +541,56 @@ def get_plain_text_fast(html_source: str) -> str:
     texts = doc.xpath('//text()')
     full_text = ' '.join(text.strip() for text in texts if text.strip())
     return full_text
+
+
+class SimpleMatch:
+    """Minimal stand-in for ``re.Match``.
+
+    Built from the original string, a start offset and a length. Only group 0
+    (the whole match) exists: ``group``, ``start`` and ``end`` return ``None``
+    for any other group number, and ``groups()`` is always empty.
+    """
+    def __init__(self, original_string, start_pos, length):
+        self._string = original_string
+        self._start, self._end = start_pos, start_pos + length
+        self._match = original_string[self._start:self._end]  # matched text
+
+    def group(self, group_num=0):
+        return self._match if group_num == 0 else None
+
+    def start(self, group_num=0):
+        return self._start if group_num == 0 else None
+
+    def end(self, group_num=0):
+        return self._end if group_num == 0 else None
+
+    def groups(self):
+        return ()  # capture groups are not supported
+
+
+def optimized_dollar_matching(text):
+    r"""Backslash-escape the ``$`` that starts a dollar amount.
+
+    An amount is an unescaped ``$`` followed by digits, with optional ``,ddd``
+    groups and an optional decimal part (``$5``, ``$1,200``, ``$3.50``). It is
+    left untouched when the next character is ``^``, ``$``, ``\`` or ``/``
+    (presumably because it then belongs to a formula such as ``$2^n$``). An
+    amount at the very end of the text has no next character and is escaped.
+
+    Args:
+        text (str): Text to scan.
+
+    Returns:
+        str: The text with ``\`` inserted before the ``$`` of each amount.
+    """
+    formula_hints = ('^', '$', '\\', '/')
+
+    def escape_amount(match):
+        # The check is done here rather than with a regex lookahead so that a
+        # rejected match is skipped whole instead of being retried shorter.
+        if text.startswith(formula_hints, match.end()):
+            return match.group()
+        return '\\' + match.group()
+
+    pattern = r'(?<!\\)(\$\d{1,3}(?:,\d{3})*(?:\.\d{1,})?)'
+    return re.sub(pattern, escape_amount, text)
diff --git a/requirements/runtime.txt b/requirements/runtime.txt
@@ -26,6 +26,7 @@ py-asciimath==0.3.0
 pyahocorasick==2.0.0
 pydantic==2.11.7
 pydantic-settings==2.10.1
+pylatexenc==2.10
 python-dotenv==1.1.1
 python-multipart==0.0.20
 scikit-learn>=1.6.1