Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 0 additions & 59 deletions llm_web_kit/extractor/html/recognizer/cc_math/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
from typing import List, Tuple

from lxml import etree
from lxml.html import HtmlElement

# 在导入前就设置严格的日志控制
logging.basicConfig(level=logging.WARNING, force=True)
Expand All @@ -20,7 +19,6 @@
from llm_web_kit.extractor.html.recognizer.recognizer import CCTag
from llm_web_kit.libs.doc_element_type import DocElementType
from llm_web_kit.libs.html_utils import (build_cc_element, element_to_html,
element_to_html_unescaped,
html_to_element)
from llm_web_kit.libs.text_utils import normalize_ctl_text

Expand Down Expand Up @@ -538,63 +536,6 @@ def fix_mathml_superscript(self, mathml_str):
parent.remove(msup)
return etree.tostring(root, encoding='unicode', pretty_print=True)

def replace_math(self, new_tag: str, math_type: str, math_render: str, node: HtmlElement, func, asciimath_wrap: bool = False) -> HtmlElement:
    """Wrap delimiter-enclosed formulas in ``node.text`` with ccmath elements.

    For every (start, end) delimiter pair registered in
    ``MATH_TYPE_TO_DISPLAY[math_type]`` for the selected pattern type, each
    non-greedy ``start ... end`` match in the node text is replaced by the
    serialized HTML of a new element built with ``build_cc_element``.

    Args:
        new_tag: Tag name for the generated element. When it equals
            ``CCMATH_INTERLINE`` the display-math delimiters are used,
            otherwise the inline-math delimiters.
        math_type: Key into ``MATH_TYPE_TO_DISPLAY``; also stored as the
            ``type`` of the generated element.
        math_render: Stored as the ``by`` value of the generated element.
        node: Element whose ``text`` is scanned and rewritten.
        func: Optional callable applied to each matched string before it is
            wrapped; a falsy value means the match is used as-is.
        asciimath_wrap: When true the node is returned untouched.

    Returns:
        ``node`` itself when ``asciimath_wrap`` is set or when the
        substitution loop raised (its text then holds the tag produced by
        ``build_cc_exception_tag``); otherwise the element obtained by
        serializing the updated node with ``element_to_html_unescaped`` and
        parsing it again with ``html_to_element``.
    """
    # AsciiMath handling was moved to the MathJax renderer path, so there is
    # nothing to do for it here.
    if asciimath_wrap:
        return node

    if new_tag == CCMATH_INTERLINE:
        pattern_type = MATH_TYPE_PATTERN.DISPLAYMATH
    else:
        pattern_type = MATH_TYPE_PATTERN.INLINEMATH
    text = node.text or ''

    def _already_wrapped(m, source: str) -> bool:
        """Tell whether the match sits between existing ccmath / inline-delimiter text."""
        if not (m and source):
            return False
        prefix = source[:m.start()].strip()
        suffix = source[m.end():].strip()
        if 'ccmath' in prefix and 'ccmath' in suffix:
            return True
        if pattern_type == MATH_TYPE_PATTERN.DISPLAYMATH:
            # A display-math match surrounded by LaTeX inline delimiters is
            # treated as already handled.
            inline_pairs = MATH_TYPE_TO_DISPLAY[MathType.LATEX][MATH_TYPE_PATTERN.INLINEMATH]
            return any(opening in prefix and closing in suffix for opening, closing in inline_pairs)
        return False

    def _substitute(m):
        """Return the replacement for one regex match (the match itself on any failure)."""
        try:
            matched = m.group(0)
            if _already_wrapped(m, text):
                return matched
            # ``prepared`` (the output of ``func``, or the raw match) is what
            # gets stored in the element's ``html`` value.
            prepared = func(matched) if func else matched
            rendered = self.wrap_math_md(prepared)
            if not rendered:
                return matched
            new_span = build_cc_element(
                html_tag_name=new_tag,
                text=rendered,
                tail='',
                type=math_type,
                by=math_render,
                html=prepared
            )
        except Exception:
            return matched
        else:
            return element_to_html(new_span)

    try:
        for opening, closing in MATH_TYPE_TO_DISPLAY[math_type][pattern_type]:
            pattern = f'{re.escape(opening)}.*?{re.escape(closing)}'.replace(r'\.\*\?', '.*?')
            # ``_substitute`` reads ``text`` while the substitution runs, i.e.
            # the value from before this assignment completes.
            text = re.compile(pattern, re.DOTALL).sub(_substitute, text)
    except Exception:
        node.text = self.build_cc_exception_tag(text, math_type, math_render)
        return node

    node.text = text
    return html_to_element(element_to_html_unescaped(node))

def build_cc_exception_tag(self, text, math_type, math_render) -> str:
return element_to_html(build_cc_element(
html_tag_name=CCMATH_HANDLE_FAILED,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,7 @@
from llm_web_kit.extractor.html.recognizer.cc_math.render.render import (
BaseMathRender, MathRenderType)
from llm_web_kit.libs.html_utils import (HtmlElement, SimpleMatch,
html_to_element,
optimized_dollar_matching)
html_to_element)
from llm_web_kit.libs.text_utils import normalize_ctl_text

# 添加MATHJAX_OPTIONS变量定义
Expand Down Expand Up @@ -411,7 +410,7 @@ def _process_math_in_text(
tem_match_display.clear()
# 如果没有匹配到分隔符形式的公式,直接返回原文本
if not matches:
return optimized_dollar_matching(text)
return text

# 从后向前处理,以避免位置偏移
result = text
Expand Down Expand Up @@ -487,7 +486,7 @@ def _process_math_in_text(
last_position = start_pos

# 返回处理后的文本
return optimized_dollar_matching(result)
return result

def _is_escaped_delimiter(self, text: str, pos: int) -> bool:
"""检查分隔符是否被转义.
Expand Down
34 changes: 0 additions & 34 deletions llm_web_kit/extractor/html/recognizer/cc_math/tag_common_modify.py

This file was deleted.

20 changes: 10 additions & 10 deletions llm_web_kit/extractor/html/recognizer/ccmath.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,7 @@

from llm_web_kit.exception.exception import (
HtmlMathMathjaxRenderRecognizerException, HtmlMathRecognizerException)
from llm_web_kit.extractor.html.recognizer.cc_math import (tag_common_modify,
tag_img, tag_math,
from llm_web_kit.extractor.html.recognizer.cc_math import (tag_img, tag_math,
tag_mjx, tag_script)
from llm_web_kit.extractor.html.recognizer.cc_math.common import (CCMATH, CSDN,
ZHIHU)
Expand Down Expand Up @@ -139,15 +138,16 @@ def process_ccmath_html(self, cc_html: str, o_html: str, math_render: BaseMathRe
if ZHIHU.DOMAIN in self.cm.url and node.tag == 'span' and node.get('class') == ZHIHU.MATH:
tag_script.process_zhihu_custom_tag(self.cm, math_render_type, node)

# 提示:被mathjax兜底覆盖,逻辑已经删除
# tag = span, class 为 math-container, 或者 mathjax 或者 wp-katex-eq
if node.tag == 'span' and node.get('class') and (
'math-container' in node.get('class') or
'mathjax' in node.get('class') or
'wp-katex-eq' in node.get('class') or
'x-ck12-mathEditor' in node.get('class') or
'tex' in node.get('class')
):
tag_common_modify.modify_tree(self.cm, math_render_type, original_html, node, parent)
# if node.tag == 'span' and node.get('class') and (
# 'math-container' in node.get('class') or
# 'mathjax' in node.get('class') or
# 'wp-katex-eq' in node.get('class') or
# 'x-ck12-mathEditor' in node.get('class') or
# 'tex' in node.get('class')
# ):
# tag_common_modify.modify_tree(self.cm, math_render_type, original_html, node, parent)

# math tags
if node.tag == 'math' or node.tag.endswith(':math'):
Expand Down
33 changes: 27 additions & 6 deletions tests/llm_web_kit/extractor/html/recognizer/test_math.py
Original file line number Diff line number Diff line change
Expand Up @@ -521,17 +521,35 @@ def test_math_recognizer_html(self):
for inline_elem in inline_elements:
formula = inline_elem.text.replace('\n', '').strip()
all_inline_formulas.append(formula)
# print(f"Found {len(all_inline_formulas)} total inline formulas")
# print(f"Total new_parts: {len(new_parts)}")

expect_inline_text = base_dir.joinpath(test_case['expected_inline']).read_text(encoding='utf-8').strip()
expect_inline_formulas = [formula for formula in expect_inline_text.split('\n') if formula]
# print(f"Expected {len(expect_inline_formulas)} inline formulas")

# 打印调试信息
print(f"\n{'=' * 80}")
print(f"测试样例: {test_case['input']}")
print(f"期望公式数量: {len(expect_inline_formulas)}")
print(f"实际公式数量: {len(all_inline_formulas)}")

if len(all_inline_formulas) != len(expect_inline_formulas):
print("\n❌ 公式数量不匹配!")
print("\n期望的行内公式:")
for i, formula in enumerate(expect_inline_formulas, 1):
print(f" {i}. {formula}")
print("\n实际抽取的行内公式:")
for i, formula in enumerate(all_inline_formulas, 1):
print(f" {i}. {formula}")

self.assertEqual(len(all_inline_formulas), len(expect_inline_formulas))
for expect, formula in zip(expect_inline_formulas, all_inline_formulas):
# print('inline expect::::::::', expect)
# print('inline answer::::::::', formula)

for i, (expect, formula) in enumerate(zip(expect_inline_formulas, all_inline_formulas), 1):
if expect != formula:
print(f" 期望: {expect}")
print(f" 实际: {formula}")
self.assertEqual(expect, formula)

print(f"{'=' * 80}\n")

def write_to_html(self, answers, file_name):
file_name = file_name.split('.')[0]
with open(base_dir.joinpath(f'{file_name}_1.html'), 'w', encoding='utf-8') as file:
Expand Down Expand Up @@ -565,6 +583,7 @@ def test_to_content_list_node(self):
)
self.assertIn('No ccmath element found in content', str(exc_info.exception))

@unittest.skip("逻辑删除,暂时跳过此测试")
def test_fix_re_match(self):
"""修复正则无法正确匹配$...$$...$$...$这种连续公式."""
html_content = r"""<p cc-select="true" class="mark-selected" data-anno-uid="anno-uid-zdx1mj6hxf8" style="">$\newcommand{\cE}[2]{\mathbf{E}(#1\ |\ #2)}$$\newcommand{\cP}[2]{\mathbf{P}(#1\ |\ #2)}$$\renewcommand{\P}[1]{\mathbf{P}(#1)}$$\newcommand{\E}[1]{\mathbf{E}(#1)}$$\newcommand{\F}{\mathcal{F}}$$\newcommand{\G}{\mathcal{G}}$$\newcommand{\ind}[1]{\mathbf{1}_{#1}}$
Expand Down Expand Up @@ -953,6 +972,7 @@ def test_latex_not_closed(self):
html_content)
assert '\\{\\begin{array}{l}\\nabla \\cdot \\left({R}^{2}\\nabla \\phi \\right)=0\\text{ }\\text{ }\\text{ }\\text{ }\\text{ }\\text{ }\\text{ }\\text{ }\\text{ }\\text{ }\\text{ }\\text{ }\\text{ }\\text{ }\\text{ }\\text{ }\\text{\\hspace{0.17em}}\\text{\\hspace{0.17em}}\\text{\\hspace{0.17em}}\\text{\\hspace{0.17em}}\\text{\\hspace{0.17em}}\\text{\\hspace{0.17em}}\\text{\\hspace{0.17em}}\\text{ }\\text{ }\\text{\\hspace{0.17em}}\\text{\\hspace{0.17em}}\\text{\\hspace{0.17em}}\\left(6\\right)\\\\ D\\left(r,k,\\omega \\right)\\equiv \\frac{c}{2{k}_{0}}\\left[{k}^{2}-{\\left(n{k}_{0}\\right)}^{2}\\right]+W\\left(r,\\omega \\right)=0\\text{ }\\text{ }\\text{ }\\text{ }\\text{ }\\text{\\hspace{0.17em}}\\text{\\hspace{0.17em}}\\text{ }\\text{\\hspace{0.17em}}\\text{\\hspace{0.17em}}\\left(7\\right)\\end{array}' in element_to_html(parts[0][0])

@unittest.skip("逻辑删除,暂时跳过此测试")
def test_dollar_sign(self):
"""美元符合与公式共存的情况."""
html_content = """<p>referring $18.1 to $18.1 the packet center $ p$ and apparently coinciding with the particle velocity</p>"""
Expand All @@ -961,6 +981,7 @@ def test_dollar_sign(self):
html_content)
assert element_to_html(parts[0][0]) == '<p>referring \\$18.1 to \\$18.1 the packet center <ccmath-inline type="latex" by="mathjax_mock" html="$ p$">p</ccmath-inline> and apparently coinciding with the particle velocity</p>'

@unittest.skip("逻辑删除,暂时跳过此测试")
def test_begin_end(self):
"""$begin end$的嵌套组合识别时候$$没有处理."""
html_content = r"""<p data-anno-uid="anno-uid-q8doimblafo"><span cc-select="true" class="mpa-ignore mark-selected" data-anno-uid="anno-uid-ldpcij9lbom" style="">$\begin{array}{1 1}(a)\;xy=c\\(b)\;xy=c^2\\(c)\;x^2+y^2=a^2\\(d)\;x^2+y^2=1\end{array}$</span></p>"""
Expand Down
2 changes: 2 additions & 0 deletions tests/llm_web_kit/extractor/test_extractor_chain.py
Original file line number Diff line number Diff line change
Expand Up @@ -461,6 +461,7 @@ def test_xml_tag(self):
result_md = result.get_content_list().to_mm_md()
self.assertIn('Every child that attends a CHICKS break has a deserving story', result_md)

@unittest.skip("暂时不检查美元转义")
def test_math_dollar(self):
"""测试math美元符号."""
chain = ExtractSimpleFactory.create(self.config)
Expand Down Expand Up @@ -504,6 +505,7 @@ def test_math_physicsforums(self):
self.assertIn('$\\Delta K = (dd^{\\dagger} + d^{\\dagger}d)K$', result_md)
self.assertIn('$$\\Delta K = \\Bigl( \\frac{1}{3!}\\epsilon^{klm}\\epsilon^n_{\\ ij}\\partial_k \\partial_n K_{lm} - \\frac{1}{4}\\partial_{i}\\partial^k K_{jk} \\Bigr) dx^i \\wedge dx^j$$', result_md)

@unittest.skip("暂时不检查美元转义")
def test_table_only_include_tr(self):
"""测试table的表头只包含tr标签."""
chain = ExtractSimpleFactory.create(self.config)
Expand Down
2 changes: 1 addition & 1 deletion tests/llm_web_kit/input/assets/content_json.json
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,7 @@
"content": {
"math_content": "a^2 + b^2 = c^2",
"math_type": "latex",
"by": "None"
"by": "mathjax_mock"
}
},
{
Expand Down