[^}]+)\}.*?\\end\{(?P=env)\})'
+ # display_patterns.append(env_pattern)
# 编译正则表达式
inline_pattern = re.compile('|'.join(inline_patterns), re.DOTALL)
@@ -358,11 +377,41 @@ def _process_math_in_text(
return text
# 首先查找所有分隔符形式的匹配
- matches = list(pattern.finditer(text))
-
+ if not is_display:
+ matches = list(pattern.finditer(text))
+ else:
+ matches = []
+ tem_match_display = []
+ walker = latexwalker.LatexWalker(text)
+ nodelist, pos, len_ = walker.get_latex_nodes(pos=0)
+ for node in nodelist:
+ # 标准的数学环境
+ if node.isNodeType(latexwalker.LatexMathNode):
+ # 判断是行内公式还是独立公式
+ if node.displaytype == 'inline':
+ pass
+ elif node.displaytype == 'display':
+ tem_match_display.append(node.latex_verbatim())
+ fake_match = SimpleMatch(text, node.pos, node.len)
+ matches.append(fake_match)
+ # 其他数学环境
+ if (node.isNodeType(latexwalker.LatexEnvironmentNode) and
+ hasattr(node, 'environmentname') and
+ node.environmentname in independent_math_environments):
+ tem_match_display.append(node.latex_verbatim())
+ fake_match = SimpleMatch(text, node.pos, node.len)
+ matches.append(fake_match)
+ # 公式自定义边界逻辑
+ new_display_patterns = [item for item in pattern.pattern.split('|') if "$" not in item]
+ custom_pattern = re.compile('|'.join(new_display_patterns), re.DOTALL)
+ custom_matches = list(custom_pattern.finditer(text))
+ for item in custom_matches:
+ if item.group() not in tem_match_display:
+ matches.append(item)
+ tem_match_display.clear()
# 如果没有匹配到分隔符形式的公式,直接返回原文本
if not matches:
- return text
+ return optimized_dollar_matching(text)
# 从后向前处理,以避免位置偏移
result = text
@@ -438,7 +487,7 @@ def _process_math_in_text(
last_position = start_pos
# 返回处理后的文本
- return result
+ return optimized_dollar_matching(result)
def _is_escaped_delimiter(self, text: str, pos: int) -> bool:
"""检查分隔符是否被转义.
diff --git a/llm_web_kit/extractor/html/recognizer/cc_math/tag_math.py b/llm_web_kit/extractor/html/recognizer/cc_math/tag_math.py
index aed792c9..ef295565 100644
--- a/llm_web_kit/extractor/html/recognizer/cc_math/tag_math.py
+++ b/llm_web_kit/extractor/html/recognizer/cc_math/tag_math.py
@@ -7,8 +7,9 @@
from llm_web_kit.extractor.html.recognizer.cc_math.common import (CCMATH,
MathType,
text_strip)
-from llm_web_kit.libs.html_utils import (build_cc_element, element_to_html,
- replace_element)
+from llm_web_kit.libs.html_utils import (build_cc_element,
+ check_and_balance_delimiters,
+ element_to_html, replace_element)
def modify_tree(cm: CCMATH, math_render: str, o_html: str, node: HtmlElement, parent: HtmlElement):
@@ -55,6 +56,7 @@ def modify_tree(cm: CCMATH, math_render: str, o_html: str, node: HtmlElement, pa
# 处理未转义的%为\%
if latex:
latex = re.sub(r'(? li
if el.tag == CCTag.CC_CODE_INLINE:
blks.append(f'`{el.text}`')
+ elif el.tag == CCTag.CC_MATH_INLINE:
+ blks.append(f'${el.text.strip()}$')
elif el.tag in ['br']:
blks.extend(['$br$'])
else:
diff --git a/llm_web_kit/input/datajson.py b/llm_web_kit/input/datajson.py
index 11a4b5f1..56ed8272 100644
--- a/llm_web_kit/input/datajson.py
+++ b/llm_web_kit/input/datajson.py
@@ -55,7 +55,7 @@ def __init__(self):
self.__text_end = '\n'
self.__list_item_start = '-' # md里的列表项前缀
self.__list_para_prefix = ' ' # 两个空格,md里的列表项非第一个段落的前缀:如果多个段落的情况,第二个以及之后的段落前缀
- self.__md_special_chars = ['#', '`', '$'] # TODO 拼装table的时候还应该转义掉|符号
+ self.__md_special_chars = ['#', '`'] # TODO 拼装table的时候还应该转义掉|符号
self.__nodes_document_type = [DocElementType.MM_NODE_LIST, DocElementType.PARAGRAPH, DocElementType.LIST,
DocElementType.SIMPLE_TABLE, DocElementType.COMPLEX_TABLE, DocElementType.TITLE,
DocElementType.IMAGE, DocElementType.AUDIO, DocElementType.VIDEO,
diff --git a/llm_web_kit/libs/html_utils.py b/llm_web_kit/libs/html_utils.py
index 05d6783b..d1d0a648 100644
--- a/llm_web_kit/libs/html_utils.py
+++ b/llm_web_kit/libs/html_utils.py
@@ -483,6 +483,41 @@ def restore_sub_sup_from_text_regex(processed_content):
return re.sub(pattern, lambda m: replacement_map[m.group(0)], processed_content)
+def check_and_balance_delimiters(latex_str):
+    r"""Remove unpaired ``\left`` / ``\right`` commands, keeping their delimiters.
+
+    The string is scanned left to right. Each ``\right`` closes the most
+    recent still-open ``\left``. A ``\right`` with nothing open, and every
+    ``\left`` still open at the end, has only the command itself deleted, so
+    the delimiter that followed it (``(``, ``\{``, ``.`` ...) stays in place.
+
+    Only the exact commands are matched: longer control words that merely
+    start with the same letters (``\rightarrow``, ``\leftarrow``,
+    ``\leftrightarrow`` ...) are left untouched.
+
+    Args:
+        latex_str (str): Input LaTeX string.
+
+    Returns:
+        str: The input with unpaired ``\left`` / ``\right`` commands removed.
+            Empty (or otherwise falsy) input is returned unchanged.
+    """
+    if not latex_str:
+        return latex_str
+
+    # The negative lookahead is essential: without it ``\rightarrow`` would be
+    # read as an unmatched ``\right`` and mangled into ``arrow``.
+    pattern = re.compile(r'\\(?:left|right)(?![A-Za-z])')
+
+    open_lefts = []  # (start, end) spans of ``\left`` commands not yet closed
+    unpaired = []    # (start, end) spans of commands that must be deleted
+    for match in pattern.finditer(latex_str):
+        if match.group() == r'\left':
+            open_lefts.append(match.span())
+        elif open_lefts:
+            open_lefts.pop()
+        else:
+            unpaired.append(match.span())
+
+    # Whatever is still open never found its ``\right``.
+    unpaired.extend(open_lefts)
+    if not unpaired:
+        return latex_str
+
+    # Rebuild the string once, skipping every span scheduled for removal.
+    pieces = []
+    cursor = 0
+    for start, end in sorted(unpaired):
+        pieces.append(latex_str[cursor:start])
+        cursor = end
+    pieces.append(latex_str[cursor:])
+    return ''.join(pieces)
+
+
def get_plain_text_fast(html_source: str) -> str:
"""使用lxml快速获取html中的纯文本.
@@ -506,3 +541,56 @@ def get_plain_text_fast(html_source: str) -> str:
texts = doc.xpath('//text()')
full_text = ' '.join(text.strip() for text in texts if text.strip())
return full_text
+
+
+class SimpleMatch:
+    """Minimal stand-in for ``re.Match`` built from a string, an offset and a length.
+
+    Only the whole match (group 0) is modelled; there are no capture groups.
+    Asking ``group``/``start``/``end``/``span`` for any other group number
+    returns ``None`` instead of raising, unlike a real ``re.Match``.
+
+    Args:
+        original_string (str): The text the match refers to.
+        start_pos (int): Index of the first matched character.
+        length (int): Number of matched characters.
+    """
+
+    def __init__(self, original_string, start_pos, length):
+        self._string = original_string
+        self._start = start_pos
+        self._end = start_pos + length
+        # Matched substring, sliced once up front.
+        self._match = original_string[start_pos:self._end]
+
+    def group(self, group_num=0):
+        """Return the matched text for group 0, ``None`` for any other group."""
+        if group_num == 0:
+            return self._match
+        return None
+
+    def start(self, group_num=0):
+        """Return the start index for group 0, ``None`` for any other group."""
+        if group_num == 0:
+            return self._start
+        return None
+
+    def end(self, group_num=0):
+        """Return the end index (exclusive) for group 0, ``None`` otherwise."""
+        if group_num == 0:
+            return self._end
+        return None
+
+    def span(self, group_num=0):
+        """Return ``(start, end)`` for group 0, ``None`` for any other group."""
+        if group_num == 0:
+            return (self._start, self._end)
+        return None
+
+    def groups(self):
+        """Return an empty tuple: capture groups are not supported."""
+        return ()
+
+
+def optimized_dollar_matching(text):
+ """美元金额匹配."""
+ # 用于存储需要修改的位置和替换内容
+ replacements = []
+
+ pattern = r'(?=1.6.1
diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_math.py b/tests/llm_web_kit/extractor/html/recognizer/test_math.py
index d25cc630..79d010aa 100644
--- a/tests/llm_web_kit/extractor/html/recognizer/test_math.py
+++ b/tests/llm_web_kit/extractor/html/recognizer/test_math.py
@@ -565,6 +565,410 @@ def test_to_content_list_node(self):
)
self.assertIn('No ccmath element found in content', str(exc_info.exception))
+ def test_fix_re_match(self):
+ """Fix the regex failing to match consecutive formulas of the form $...$$...$$...$."""
+ html_content = r"""$\newcommand{\cE}[2]{\mathbf{E}(#1\ |\ #2)}$$\newcommand{\cP}[2]{\mathbf{P}(#1\ |\ #2)}$$\renewcommand{\P}[1]{\mathbf{P}(#1)}$$\newcommand{\E}[1]{\mathbf{E}(#1)}$$\newcommand{\F}{\mathcal{F}}$$\newcommand{\G}{\mathcal{G}}$$\newcommand{\ind}[1]{\mathbf{1}_{#1}}$
+ To motivate this note, I’ll pose the following problem:
"""
+ parts = self.math_recognizer.recognize('https://www.baidu.com', [(html_to_element(html_content), html_to_element(html_content))], html_content)
+ assert element_to_html(parts[0][0]) == '\\newcommand{\\cE}[2]{\\mathbf{E}(#1\\ |\\ #2)}\\newcommand{\\cP}[2]{\\mathbf{P}(#1\\ |\\ #2)}\\renewcommand{\\P}[1]{\\mathbf{P}(#1)}\\newcommand{\\E}[1]{\\mathbf{E}(#1)}\\newcommand{\\F}{\\mathcal{F}}\\newcommand{\\G}{\\mathcal{G}}\\newcommand{\\ind}[1]{\\mathbf{1}_{#1}}\n To motivate this note, I’ll pose the following problem:
'
+
+ def test_latex_not_closed(self):
+ """Remove surplus left or right commands from LaTeX strings."""
+ html_content = """
+
+
"""
+ parts = self.math_recognizer.recognize('https://www.baidu.com',
+ [(html_to_element(html_content), html_to_element(html_content))],
+ html_content)
+ assert '\\{\\begin{array}{l}\\nabla \\cdot \\left({R}^{2}\\nabla \\phi \\right)=0\\text{ }\\text{ }\\text{ }\\text{ }\\text{ }\\text{ }\\text{ }\\text{ }\\text{ }\\text{ }\\text{ }\\text{ }\\text{ }\\text{ }\\text{ }\\text{ }\\text{\\hspace{0.17em}}\\text{\\hspace{0.17em}}\\text{\\hspace{0.17em}}\\text{\\hspace{0.17em}}\\text{\\hspace{0.17em}}\\text{\\hspace{0.17em}}\\text{\\hspace{0.17em}}\\text{ }\\text{ }\\text{\\hspace{0.17em}}\\text{\\hspace{0.17em}}\\text{\\hspace{0.17em}}\\left(6\\right)\\\\ D\\left(r,k,\\omega \\right)\\equiv \\frac{c}{2{k}_{0}}\\left[{k}^{2}-{\\left(n{k}_{0}\\right)}^{2}\\right]+W\\left(r,\\omega \\right)=0\\text{ }\\text{ }\\text{ }\\text{ }\\text{ }\\text{\\hspace{0.17em}}\\text{\\hspace{0.17em}}\\text{ }\\text{\\hspace{0.17em}}\\text{\\hspace{0.17em}}\\left(7\\right)\\end{array}' in element_to_html(parts[0][0])
+
+ def test_dollar_sign(self):
+ """Currency dollar signs coexisting with formulas in the same text."""
+ html_content = """referring $18.1 to $18.1 the packet center $ p$ and apparently coinciding with the particle velocity
"""
+ parts = self.math_recognizer.recognize('https://www.baidu.com',
+ [(html_to_element(html_content), html_to_element(html_content))],
+ html_content)
+ assert element_to_html(parts[0][0]) == 'referring \\$18.1 to \\$18.1 the packet center p and apparently coinciding with the particle velocity
'
+
+ def test_begin_end(self):
+ """The $ delimiters were not handled when recognizing a nested $begin ... end$ combination."""
+ html_content = r"""$\begin{array}{1 1}(a)\;xy=c\\(b)\;xy=c^2\\(c)\;x^2+y^2=a^2\\(d)\;x^2+y^2=1\end{array}$
"""
+ parts = self.math_recognizer.recognize('https://www.baidu.com',
+ [(html_to_element(html_content), html_to_element(html_content))],
+ html_content)
+ assert element_to_html(parts[0][0]) == '\\begin{array}{1 1}(a)\\;xy=c\\\\(b)\\;xy=c^2\\\\(c)\\;x^2+y^2=a^2\\\\(d)\\;x^2+y^2=1\\end{array}
'
+
class TestCCMATH(unittest.TestCase):
def setUp(self):
diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_title.py b/tests/llm_web_kit/extractor/html/recognizer/test_title.py
index 2edfbeab..b07ee1fe 100644
--- a/tests/llm_web_kit/extractor/html/recognizer/test_title.py
+++ b/tests/llm_web_kit/extractor/html/recognizer/test_title.py
@@ -46,3 +46,21 @@ def test_title1(title_recognizer):
html_content = file.read()
result = title_recognizer.recognize('http://www.baidu.com', [(main_html_content, main_html_content)], html_content)
assert 'Compare vibrational frequencies for two calculations for C<sub>3</sub> (carbon trimer)' in element_to_html(result[1][0])
+
+
+def test_title_has_formula(title_recognizer):
+ """
+ The title contains a formula.
+ Args:
+ title_recognizer: recognizer under test (presumably a pytest fixture; confirm)
+
+ Returns:
+
+ """
+ html_content = r""""""
+ result = title_recognizer.recognize('http://www.baidu.com', [(html_content, html_content)], html_content)
+ assert r"Vector Meson Production in the Final State $K^+ K^- \pi^+ \pi^-$ Photon-photon Collisions" in element_to_html(result[0][0])
diff --git a/tests/llm_web_kit/extractor/test_extractor_chain.py b/tests/llm_web_kit/extractor/test_extractor_chain.py
index 533470ef..0ceedc0b 100644
--- a/tests/llm_web_kit/extractor/test_extractor_chain.py
+++ b/tests/llm_web_kit/extractor/test_extractor_chain.py
@@ -512,7 +512,7 @@ def test_table_only_include_tr(self):
input_data = DataJson(test_data)
result = chain.extract(input_data)
result_md = result.get_content_list().to_nlp_md()
- assert 'List Price: $11.80' in result_md
+ assert r'List Price: \$11.80' in result_md
def test_table_only_one_td(self):
"""测试table只有一个td."""