Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
106 changes: 51 additions & 55 deletions llm_web_kit/extractor/html/recognizer/cc_math/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -538,62 +538,58 @@ def fix_mathml_superscript(self, mathml_str):
parent.remove(msup)
return etree.tostring(root, encoding='unicode', pretty_print=True)

def replace_math(self, new_tag: str, math_type: str, math_render: str, node: HtmlElement, func, asciimath_wrap: bool = False) -> HtmlElement:
# pattern re数学公式匹配 func 公式预处理 默认不处理
# ascii公式处理逻辑转移到mathjax渲染器方案中
if asciimath_wrap:
return node

pattern_type = MATH_TYPE_PATTERN.DISPLAYMATH if new_tag == CCMATH_INTERLINE else MATH_TYPE_PATTERN.INLINEMATH
original_text = node.text or ''

def is_ccmath_wrapped(match_text, original_text: str) -> bool:
if not match_text or not original_text:
return False
start_idx = match_text.start()
end_idx = match_text.end()
before_match = original_text[:start_idx].strip()
after_match = original_text[end_idx:].strip()
if 'ccmath' in before_match and 'ccmath' in after_match:
return True
if pattern_type == MATH_TYPE_PATTERN.DISPLAYMATH:
for start, end in MATH_TYPE_TO_DISPLAY[MathType.LATEX][MATH_TYPE_PATTERN.INLINEMATH]:
if start in before_match and end in after_match:
return True
return False

def process(match_text):
try:
match = match_text.group(0)
if is_ccmath_wrapped(match_text, original_text):
return match
wrapped_text = func(match) if func else match
# html保留原始的,而不是传入修改过的wrapped_text
original_wrapped = wrapped_text
wrapped_text = self.wrap_math_md(wrapped_text)
if not wrapped_text:
return match
new_span = build_cc_element(
html_tag_name=new_tag,
text=wrapped_text,
tail='',
type=math_type,
by=math_render,
html=original_wrapped
)
except Exception:
return match
return element_to_html(new_span)
def replace_math(self, new_tag: str, math_type: str, math_render: str, node: HtmlElement, func) -> HtmlElement:
"""替换数学公式节点.

Args:
new_tag: 新标签名称(CCMATH_INLINE 或 CCMATH_INTERLINE)
math_type: 数学公式类型(MathType.LATEX 等)
math_render: 渲染器类型
node: 当前HTML节点
func: 公式预处理函数(可选)

Returns:
处理后的节点
"""
try:
for start, end in MATH_TYPE_TO_DISPLAY[math_type][pattern_type]:
pattern = f'{re.escape(start)}.*?{re.escape(end)}'.replace(r'\.\*\?', '.*?')
regex = re.compile(pattern, re.DOTALL)
original_text = re.sub(regex, process, original_text)
except Exception:
node.text = self.build_cc_exception_tag(original_text, math_type, math_render)
return node
node.text = original_text
return html_to_element(element_to_html_unescaped(node))
text = node.text
if not text or not text_strip(text):
return node

# 预处理公式
if func:
text = func(text)

# 去除分隔符并标准化
formula = self.wrap_math_md(text)

# 处理特殊类型
if math_type == MathType.ASCIIMATH:
formula = self.extract_asciimath(formula)
formula = self.wrap_math_md(formula)

# 构建新节点
new_span = build_cc_element(
html_tag_name=new_tag,
text=formula,
tail=text_strip(node.tail),
type=math_type,
by=math_render,
html=element_to_html(node)
)

return new_span

except Exception as e:
# 处理失败时返回失败标记节点
return build_cc_element(
html_tag_name=CCMATH_HANDLE_FAILED,
text=node.text if node.text else '',
tail=text_strip(node.tail),
type=math_type,
by=math_render,
html=element_to_html(node)
)

def build_cc_exception_tag(self, text, math_type, math_render) -> str:
return element_to_html(build_cc_element(
Expand Down
16 changes: 4 additions & 12 deletions llm_web_kit/extractor/html/recognizer/cc_math/tag_common_modify.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,12 @@

from llm_web_kit.exception.exception import HtmlMathRecognizerException
from llm_web_kit.extractor.html.recognizer.cc_math.common import (CCMATH,
MathType,
text_strip)
from llm_web_kit.libs.html_utils import replace_element


def modify_tree(cm: CCMATH, math_render: str, o_html: str, node: HtmlElement, parent: HtmlElement):

def modify_tree(cm: CCMATH, math_render: str, o_html: str, node: HtmlElement):
try:
text = node.text
tag_math_type_list = cm.get_equation_type(o_html)
Expand All @@ -18,17 +18,9 @@ def modify_tree(cm: CCMATH, math_render: str, o_html: str, node: HtmlElement, pa
tail = node.tail
new_span.tail = None
for new_tag, math_type in tag_math_type_list:
asciimath_wrap = True if math_type == MathType.ASCIIMATH else False
new_span = cm.replace_math(new_tag, math_type, math_render, new_span, None,asciimath_wrap)
new_span = cm.replace_math(new_tag, math_type, math_render, new_span, None)
new_span.tail = tail
replace_element(node,new_span)
# if math_type == MathType.ASCIIMATH:
# text = cm.wrap_math_md(text)
# text = cm.extract_asciimath(text)
# new_span = build_cc_element(html_tag_name=new_tag, text=cm.wrap_math_md(text), tail=text_strip(node.tail), type=math_type, by=math_render, html=o_html)
# replace_element(node, new_span)
# elif math_type == MathType.LATEX:
# new_span = build_cc_element(html_tag_name=new_tag, text=cm.wrap_math_md(text), tail=text_strip(node.tail), type=math_type, by=math_render, html=o_html)
# replace_element(node, new_span)

except Exception as e:
raise HtmlMathRecognizerException(f'Error processing script mathtex: {e}')
10 changes: 2 additions & 8 deletions llm_web_kit/extractor/html/recognizer/ccmath.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,14 +140,8 @@ def process_ccmath_html(self, cc_html: str, o_html: str, math_render: BaseMathRe
tag_script.process_zhihu_custom_tag(self.cm, math_render_type, node)

# tag = span, class 为 math-containerm, 或者 mathjax 或者 wp-katex-eq
if node.tag == 'span' and node.get('class') and (
'math-container' in node.get('class') or
'mathjax' in node.get('class') or
'wp-katex-eq' in node.get('class') or
'x-ck12-mathEditor' in node.get('class') or
'tex' in node.get('class')
):
tag_common_modify.modify_tree(self.cm, math_render_type, original_html, node, parent)
if node.tag == 'span' and node.get('class') and 'math-container' in node.get('class'):
tag_common_modify.modify_tree(self.cm, math_render_type, original_html, node)

# math tags
if node.tag == 'math' or node.tag.endswith(':math'):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1406,7 +1406,7 @@ <h2 class="mb0" data-answercount="2">
<div class="answercell post-layout--right">
<span class="d-none">$\begingroup$</span>
<div class="s-prose js-post-body" itemprop="text">
<p>When gravitational waves reach Earth, <a href="https://en.wikipedia.org/wiki/Gravitational_wave#Difficulties" rel="nofollow noreferrer">they usually give a strain</a> of <span class="math-container">$\delta L \over L$$=10^{-21}$</span>.</p>
<p>When gravitational waves reach Earth, <a href="https://en.wikipedia.org/wiki/Gravitational_wave#Difficulties" rel="nofollow noreferrer">they usually give a strain</a> of <span class="math-container">$\delta L \over L = 10^{-21}$</span>.</p>
<p>If we assume that they scale with the distance the same way electromagnetic waves do, thus following the inverse square law, we can get an estimate of the distance needed.</p>
<p>LIGO detected the <a href="https://en.wikipedia.org/wiki/First_observation_of_gravitational_waves" rel="nofollow noreferrer">first merger</a> of black holes at 1.3 billion light years away.</p>
<p>If we would get to 1 light year away from the merger, under the above hypothesis we would get a strain of <span class="math-container">$10^{-21} \times (1.3 \cdot 10^9)^2=10^{-3}$</span>. This means that on 1 meter length we would notice a 1 mm oscillation, which is something we are able to sense.</p>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,7 @@
M_{\odot}
M_{\odot}
M_{\odot}
\delta L \over L
=10^{-21}
\delta L \over L = 10^{-21}
10^{-21} \times (1.3 \cdot 10^9)^2=10^{-3}
1/r
10^{-21} \times (1.3 \cdot 10^9)=10^{-9}
Expand Down
39 changes: 32 additions & 7 deletions tests/llm_web_kit/extractor/html/recognizer/test_math.py
Original file line number Diff line number Diff line change
Expand Up @@ -511,6 +511,7 @@ def test_math_recognizer_html(self):
# print('answers::::::::', answers)
# self.write_to_html(answers, test_case['input'][0])
# 检查行内公式抽取正确性
# 检查行内公式抽取正确性
if test_case.get('expected_inline', None):
# 从所有parts中提取所有行内公式
all_inline_formulas = []
Expand All @@ -521,16 +522,40 @@ def test_math_recognizer_html(self):
for inline_elem in inline_elements:
formula = inline_elem.text.replace('\n', '').strip()
all_inline_formulas.append(formula)
# print(f"Found {len(all_inline_formulas)} total inline formulas")
# print(f"Total new_parts: {len(new_parts)}")

expect_inline_text = base_dir.joinpath(test_case['expected_inline']).read_text(encoding='utf-8').strip()
expect_inline_formulas = [formula for formula in expect_inline_text.split('\n') if formula]
# print(f"Expected {len(expect_inline_formulas)} inline formulas")

# 如果数量不匹配,输出详细信息
if len(all_inline_formulas) != len(expect_inline_formulas):
print("\n" + "=" * 80)
print("行内公式抽取出错!")
print("=" * 80)
print(f"出错样例: {test_case['input']}")
print(f"预期公式数: {len(expect_inline_formulas)}")
print(f"实际公式数: {len(all_inline_formulas)}")
print("\n预期公式列表:")
for i, formula in enumerate(expect_inline_formulas, 1):
print(f" {i}. {formula}")
print("\n实际公式列表:")
for i, formula in enumerate(all_inline_formulas, 1):
print(f" {i}. {formula}")

# 找出差异
print("\n差异分析:")
if len(all_inline_formulas) > len(expect_inline_formulas):
print(f"多提取了 {len(all_inline_formulas) - len(expect_inline_formulas)} 个公式:")
extra_formulas = all_inline_formulas[len(expect_inline_formulas):]
for i, formula in enumerate(extra_formulas, 1):
print(f" {i}. {formula}")
else:
print(f"少提取了 {len(expect_inline_formulas) - len(all_inline_formulas)} 个公式:")
missing_formulas = expect_inline_formulas[len(all_inline_formulas):]
for i, formula in enumerate(missing_formulas, 1):
print(f" {i}. {formula}")
print("=" * 80 + "\n")

self.assertEqual(len(all_inline_formulas), len(expect_inline_formulas))
for expect, formula in zip(expect_inline_formulas, all_inline_formulas):
# print('inline expect::::::::', expect)
# print('inline answer::::::::', formula)
self.assertEqual(expect, formula)

def write_to_html(self, answers, file_name):
file_name = file_name.split('.')[0]
Expand Down
2 changes: 1 addition & 1 deletion tests/llm_web_kit/input/assets/content_json.json
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,7 @@
"content": {
"math_content": "a^2 + b^2 = c^2",
"math_type": "latex",
"by": "None"
"by": "mathjax_mock"
}
},
{
Expand Down
Loading