diff --git a/llm_web_kit/extractor/html/recognizer/cc_math/render/mathjax.py b/llm_web_kit/extractor/html/recognizer/cc_math/render/mathjax.py index 06ac62a9..2f1cc3ad 100644 --- a/llm_web_kit/extractor/html/recognizer/cc_math/render/mathjax.py +++ b/llm_web_kit/extractor/html/recognizer/cc_math/render/mathjax.py @@ -1,10 +1,14 @@ import re from typing import Any, Dict, List +from pylatexenc import latexwalker + from llm_web_kit.extractor.html.recognizer.cc_math.common import CCMATH from llm_web_kit.extractor.html.recognizer.cc_math.render.render import ( BaseMathRender, MathRenderType) -from llm_web_kit.libs.html_utils import HtmlElement, html_to_element +from llm_web_kit.libs.html_utils import (HtmlElement, SimpleMatch, + html_to_element, + optimized_dollar_matching) from llm_web_kit.libs.text_utils import normalize_ctl_text # 添加MATHJAX_OPTIONS变量定义 @@ -29,6 +33,21 @@ 'AM_CHTML' ] +# 独立公式环境 +independent_math_environments = [ + 'displaymath', + 'equation', + 'equation*', + 'align', + 'align*', + 'gather', + 'gather*', + 'multline', + 'multline*', + 'vmatrix', + 'Vmatrix' +] + class MathJaxRender(BaseMathRender): """MathJax渲染器实现.""" @@ -246,10 +265,10 @@ def find_math(self, root: HtmlElement) -> None: display_patterns.append(pattern) # 添加对环境的支持 - if MATHJAX_OPTIONS.get('processEnvironments', True): - # 通用匹配任何 \begin{...}\end{...} 环境的模式,保证环境名称相同时才匹配 - env_pattern = r'(\\begin\{(?P[^}]+)\}.*?\\end\{(?P=env)\})' - display_patterns.append(env_pattern) + # if MATHJAX_OPTIONS.get('processEnvironments', True): + # # 通用匹配任何 \begin{...}\end{...} 环境的模式,保证环境名称相同时才匹配 + # env_pattern = r'(\\begin\{(?P[^}]+)\}.*?\\end\{(?P=env)\})' + # display_patterns.append(env_pattern) # 编译正则表达式 inline_pattern = re.compile('|'.join(inline_patterns), re.DOTALL) @@ -358,11 +377,41 @@ def _process_math_in_text( return text # 首先查找所有分隔符形式的匹配 - matches = list(pattern.finditer(text)) - + if not is_display: + matches = list(pattern.finditer(text)) + else: + matches = [] + tem_match_display = [] + 
walker = latexwalker.LatexWalker(text) + nodelist, pos, len_ = walker.get_latex_nodes(pos=0) + for node in nodelist: + # 标准的数学环境 + if node.isNodeType(latexwalker.LatexMathNode): + # 判断是行内公式还是独立公式 + if node.displaytype == 'inline': + pass + elif node.displaytype == 'display': + tem_match_display.append(node.latex_verbatim()) + fake_match = SimpleMatch(text, node.pos, node.len) + matches.append(fake_match) + # 其他数学环境 + if (node.isNodeType(latexwalker.LatexEnvironmentNode) and + hasattr(node, 'environmentname') and + node.environmentname in independent_math_environments): + tem_match_display.append(node.latex_verbatim()) + fake_match = SimpleMatch(text, node.pos, node.len) + matches.append(fake_match) + # 公式自定义边界逻辑 + new_display_patterns = [item for item in pattern.pattern.split('|') if "$" not in item] + custom_pattern = re.compile('|'.join(new_display_patterns), re.DOTALL) + custom_matches = list(custom_pattern.finditer(text)) + for item in custom_matches: + if item.group() not in tem_match_display: + matches.append(item) + tem_match_display.clear() # 如果没有匹配到分隔符形式的公式,直接返回原文本 if not matches: - return text + return optimized_dollar_matching(text) # 从后向前处理,以避免位置偏移 result = text @@ -438,7 +487,7 @@ def _process_math_in_text( last_position = start_pos # 返回处理后的文本 - return result + return optimized_dollar_matching(result) def _is_escaped_delimiter(self, text: str, pos: int) -> bool: """检查分隔符是否被转义. 
diff --git a/llm_web_kit/extractor/html/recognizer/cc_math/tag_math.py b/llm_web_kit/extractor/html/recognizer/cc_math/tag_math.py index aed792c9..ef295565 100644 --- a/llm_web_kit/extractor/html/recognizer/cc_math/tag_math.py +++ b/llm_web_kit/extractor/html/recognizer/cc_math/tag_math.py @@ -7,8 +7,9 @@ from llm_web_kit.extractor.html.recognizer.cc_math.common import (CCMATH, MathType, text_strip) -from llm_web_kit.libs.html_utils import (build_cc_element, element_to_html, - replace_element) +from llm_web_kit.libs.html_utils import (build_cc_element, + check_and_balance_delimiters, + element_to_html, replace_element) def modify_tree(cm: CCMATH, math_render: str, o_html: str, node: HtmlElement, parent: HtmlElement): @@ -55,6 +56,7 @@ def modify_tree(cm: CCMATH, math_render: str, o_html: str, node: HtmlElement, pa # 处理未转义的%为\% if latex: latex = re.sub(r'(? li if el.tag == CCTag.CC_CODE_INLINE: blks.append(f'`{el.text}`') + elif el.tag == CCTag.CC_MATH_INLINE: + blks.append(f'${el.text.strip()}$') elif el.tag in ['br']: blks.extend(['$br$']) else: diff --git a/llm_web_kit/input/datajson.py b/llm_web_kit/input/datajson.py index 11a4b5f1..56ed8272 100644 --- a/llm_web_kit/input/datajson.py +++ b/llm_web_kit/input/datajson.py @@ -55,7 +55,7 @@ def __init__(self): self.__text_end = '\n' self.__list_item_start = '-' # md里的列表项前缀 self.__list_para_prefix = ' ' # 两个空格,md里的列表项非第一个段落的前缀:如果多个段落的情况,第二个以及之后的段落前缀 - self.__md_special_chars = ['#', '`', '$'] # TODO 拼装table的时候还应该转义掉|符号 + self.__md_special_chars = ['#', '`'] # TODO 拼装table的时候还应该转义掉|符号 self.__nodes_document_type = [DocElementType.MM_NODE_LIST, DocElementType.PARAGRAPH, DocElementType.LIST, DocElementType.SIMPLE_TABLE, DocElementType.COMPLEX_TABLE, DocElementType.TITLE, DocElementType.IMAGE, DocElementType.AUDIO, DocElementType.VIDEO, diff --git a/llm_web_kit/libs/html_utils.py b/llm_web_kit/libs/html_utils.py index 05d6783b..d1d0a648 100644 --- a/llm_web_kit/libs/html_utils.py +++ b/llm_web_kit/libs/html_utils.py @@ 
def check_and_balance_delimiters(latex_str):
    r"""Strip unpaired ``\left`` / ``\right`` commands from a LaTeX string.

    ``\left`` and ``\right`` must come in pairs, otherwise the formula does
    not compile. This function pairs them up in reading order (each
    ``\right`` closes the most recent still-open ``\left``) and deletes every
    command that ends up without a partner. Only the command itself is
    removed; the delimiter that follows it (``(``, ``\{``, ``.``, ...) stays
    in the output.

    Only the exact control words are considered: longer commands that merely
    start with the same letters, such as ``\leftarrow``, ``\rightarrow`` or
    ``\leftrightarrow``, are left untouched.

    Args:
        latex_str (str): The LaTeX source to check.

    Returns:
        str: ``latex_str`` with every unmatched ``\left`` / ``\right``
        removed. Empty (falsy) input is returned unchanged.
    """
    if not latex_str:
        return latex_str

    # A LaTeX control word is a backslash followed by ASCII letters, so a
    # trailing letter means we are looking at a different command
    # (e.g. \rightarrow) and must not treat it as a sizing command.
    sizing_command = re.compile(r'\\(?:left|right)(?![A-Za-z])')

    open_lefts = []  # spans of \left commands still waiting for a \right
    unmatched = []   # spans of commands that have to be deleted

    for found in sizing_command.finditer(latex_str):
        if found.group() == r'\left':
            open_lefts.append(found.span())
        elif open_lefts:
            open_lefts.pop()
        else:
            # A \right with no \left open before it.
            unmatched.append(found.span())

    # Whatever is still open at the end never got closed.
    unmatched.extend(open_lefts)
    if not unmatched:
        return latex_str

    # Rebuild the string once, skipping the unmatched command spans.
    kept = []
    cursor = 0
    for start, end in sorted(unmatched):
        kept.append(latex_str[cursor:start])
        cursor = end
    kept.append(latex_str[cursor:])
    return ''.join(kept)
@@ -506,3 +541,56 @@ def get_plain_text_fast(html_source: str) -> str: texts = doc.xpath('//text()') full_text = ' '.join(text.strip() for text in texts if text.strip()) return full_text + + +class SimpleMatch: + """一个简单的模拟 re.Match 的对象。 根据提供的原始字符串、起始位置和长度来模拟匹配结果。""" + def __init__(self, original_string, start_pos, length): + self._string = original_string + self._start = start_pos + self._end = start_pos + length + self._match = original_string[start_pos:self._end] # 提取匹配的字符串 + + def group(self, group_num=0): + if group_num == 0: + return self._match + + def start(self, group_num=0): + if group_num == 0: + return self._start + + def end(self, group_num=0): + if group_num == 0: + return self._end + + def groups(self): + # 返回空元组,因为不支持捕获组 + return () + + +def optimized_dollar_matching(text): + """美元金额匹配.""" + # 用于存储需要修改的位置和替换内容 + replacements = [] + + pattern = r'(?=1.6.1 diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_math.py b/tests/llm_web_kit/extractor/html/recognizer/test_math.py index d25cc630..79d010aa 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_math.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_math.py @@ -565,6 +565,410 @@ def test_to_content_list_node(self): ) self.assertIn('No ccmath element found in content', str(exc_info.exception)) + def test_fix_re_match(self): + """修复正则无法正确匹配$...$$...$$...$这种连续公式.""" + html_content = r"""

$\newcommand{\cE}[2]{\mathbf{E}(#1\ |\ #2)}$$\newcommand{\cP}[2]{\mathbf{P}(#1\ |\ #2)}$$\renewcommand{\P}[1]{\mathbf{P}(#1)}$$\newcommand{\E}[1]{\mathbf{E}(#1)}$$\newcommand{\F}{\mathcal{F}}$$\newcommand{\G}{\mathcal{G}}$$\newcommand{\ind}[1]{\mathbf{1}_{#1}}$ + To motivate this note, I’ll pose the following problem:

""" + parts = self.math_recognizer.recognize('https://www.baidu.com', [(html_to_element(html_content), html_to_element(html_content))], html_content) + assert element_to_html(parts[0][0]) == '

\\newcommand{\\cE}[2]{\\mathbf{E}(#1\\ |\\ #2)}\\newcommand{\\cP}[2]{\\mathbf{P}(#1\\ |\\ #2)}\\renewcommand{\\P}[1]{\\mathbf{P}(#1)}\\newcommand{\\E}[1]{\\mathbf{E}(#1)}\\newcommand{\\F}{\\mathcal{F}}\\newcommand{\\G}{\\mathcal{G}}\\newcommand{\\ind}[1]{\\mathbf{1}_{#1}}\n To motivate this note, I’ll pose the following problem:

' + + def test_latex_not_closed(self): + """移除LaTeX字符多余的left或right.""" + html_content = """

+ + + + + { + + + + + + ∇ + + + ⋅ + + + + ( + + + + + R + + + 2 + + + + ∇ + + + φ + + + + ) + + + + = + + + 0 + + +   + + +   + + +   + + +   + + +   + + +   + + +   + + +   + + +   + + +   + + +   + + +   + + +   + + +   + + +   + + +   + + +   + + +   + + +   + + +   + + +   + + +   + + +   + + +   + + +   + + +   + + +   + + +   + + + ( + + + 6 + + + ) + + + + + + + D + + + + ( + + + + + r + + + + , + + + + k + + + + , + + + ω + + + + ) + + + + ≡ + + + + c + + + + 2 + + + + k + + + 0 + + + + + + + [ + + + + + k + + + 2 + + + + − + + + + + + ( + + + + n + + + + k + + + 0 + + + + + ) + + + + + 2 + + + + + ] + + + + + + + + W + + + + ( + + + + + r + + + + , + + + ω + + + + ) + + + + = + + + 0 + + +   + + +   + + +   + + +   + + +   + + +   + + +   + + +   + + +   + + +   + + + (7) + + + + + + + +

""" + parts = self.math_recognizer.recognize('https://www.baidu.com', + [(html_to_element(html_content), html_to_element(html_content))], + html_content) + assert '\\{\\begin{array}{l}\\nabla \\cdot \\left({R}^{2}\\nabla \\phi \\right)=0\\text{ }\\text{ }\\text{ }\\text{ }\\text{ }\\text{ }\\text{ }\\text{ }\\text{ }\\text{ }\\text{ }\\text{ }\\text{ }\\text{ }\\text{ }\\text{ }\\text{\\hspace{0.17em}}\\text{\\hspace{0.17em}}\\text{\\hspace{0.17em}}\\text{\\hspace{0.17em}}\\text{\\hspace{0.17em}}\\text{\\hspace{0.17em}}\\text{\\hspace{0.17em}}\\text{ }\\text{ }\\text{\\hspace{0.17em}}\\text{\\hspace{0.17em}}\\text{\\hspace{0.17em}}\\left(6\\right)\\\\ D\\left(r,k,\\omega \\right)\\equiv \\frac{c}{2{k}_{0}}\\left[{k}^{2}-{\\left(n{k}_{0}\\right)}^{2}\\right]+W\\left(r,\\omega \\right)=0\\text{ }\\text{ }\\text{ }\\text{ }\\text{ }\\text{\\hspace{0.17em}}\\text{\\hspace{0.17em}}\\text{ }\\text{\\hspace{0.17em}}\\text{\\hspace{0.17em}}\\left(7\\right)\\end{array}' in element_to_html(parts[0][0]) + + def test_dollar_sign(self): + """美元符合与公式共存的情况.""" + html_content = """

referring $18.1 to $18.1 the packet center $ p$ and apparently coinciding with the particle velocity

""" + parts = self.math_recognizer.recognize('https://www.baidu.com', + [(html_to_element(html_content), html_to_element(html_content))], + html_content) + assert element_to_html(parts[0][0]) == '

referring \\$18.1 to \\$18.1 the packet center p and apparently coinciding with the particle velocity

' + + def test_begin_end(self): + """$begin end$的嵌套组合识别时候$$没有处理.""" + html_content = r"""

$\begin{array}{1 1}(a)\;xy=c\\(b)\;xy=c^2\\(c)\;x^2+y^2=a^2\\(d)\;x^2+y^2=1\end{array}$

""" + parts = self.math_recognizer.recognize('https://www.baidu.com', + [(html_to_element(html_content), html_to_element(html_content))], + html_content) + assert element_to_html(parts[0][0]) == '

\\begin{array}{1 1}(a)\\;xy=c\\\\(b)\\;xy=c^2\\\\(c)\\;x^2+y^2=a^2\\\\(d)\\;x^2+y^2=1\\end{array}

' + class TestCCMATH(unittest.TestCase): def setUp(self): diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_title.py b/tests/llm_web_kit/extractor/html/recognizer/test_title.py index 2edfbeab..b07ee1fe 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_title.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_title.py @@ -46,3 +46,21 @@ def test_title1(title_recognizer): html_content = file.read() result = title_recognizer.recognize('http://www.baidu.com', [(main_html_content, main_html_content)], html_content) assert 'Compare vibrational frequencies for two calculations for C<sub>3</sub> (carbon trimer)' in element_to_html(result[1][0]) + + +def test_title_has_formula(title_recognizer): + """ + 标题含有公式 + Args: + title_recognizer: + + Returns: + + """ + html_content = r"""

+ + Vector Meson Production in the Final State $K^+ K^- \pi^+ \pi^-$ Photon-photon Collisions + +

""" + result = title_recognizer.recognize('http://www.baidu.com', [(html_content, html_content)], html_content) + assert r"Vector Meson Production in the Final State $K^+ K^- \pi^+ \pi^-$ Photon-photon Collisions" in element_to_html(result[0][0]) diff --git a/tests/llm_web_kit/extractor/test_extractor_chain.py b/tests/llm_web_kit/extractor/test_extractor_chain.py index 533470ef..0ceedc0b 100644 --- a/tests/llm_web_kit/extractor/test_extractor_chain.py +++ b/tests/llm_web_kit/extractor/test_extractor_chain.py @@ -512,7 +512,7 @@ def test_table_only_include_tr(self): input_data = DataJson(test_data) result = chain.extract(input_data) result_md = result.get_content_list().to_nlp_md() - assert 'List Price: $11.80' in result_md + assert r'List Price: \$11.80' in result_md def test_table_only_one_td(self): """测试table只有一个td."""