diff --git a/llm_web_kit/extractor/html/recognizer/cc_math/common.py b/llm_web_kit/extractor/html/recognizer/cc_math/common.py
index d9178840..8637628b 100644
--- a/llm_web_kit/extractor/html/recognizer/cc_math/common.py
+++ b/llm_web_kit/extractor/html/recognizer/cc_math/common.py
@@ -538,62 +538,58 @@ def fix_mathml_superscript(self, mathml_str):
parent.remove(msup)
return etree.tostring(root, encoding='unicode', pretty_print=True)
- def replace_math(self, new_tag: str, math_type: str, math_render: str, node: HtmlElement, func, asciimath_wrap: bool = False) -> HtmlElement:
- # pattern re数学公式匹配 func 公式预处理 默认不处理
- # ascii公式处理逻辑转移到mathjax渲染器方案中
- if asciimath_wrap:
- return node
-
- pattern_type = MATH_TYPE_PATTERN.DISPLAYMATH if new_tag == CCMATH_INTERLINE else MATH_TYPE_PATTERN.INLINEMATH
- original_text = node.text or ''
-
- def is_ccmath_wrapped(match_text, original_text: str) -> bool:
- if not match_text or not original_text:
- return False
- start_idx = match_text.start()
- end_idx = match_text.end()
- before_match = original_text[:start_idx].strip()
- after_match = original_text[end_idx:].strip()
- if 'ccmath' in before_match and 'ccmath' in after_match:
- return True
- if pattern_type == MATH_TYPE_PATTERN.DISPLAYMATH:
- for start, end in MATH_TYPE_TO_DISPLAY[MathType.LATEX][MATH_TYPE_PATTERN.INLINEMATH]:
- if start in before_match and end in after_match:
- return True
- return False
-
- def process(match_text):
- try:
- match = match_text.group(0)
- if is_ccmath_wrapped(match_text, original_text):
- return match
- wrapped_text = func(match) if func else match
- # html保留原始的,而不是传入修改过的wrapped_text
- original_wrapped = wrapped_text
- wrapped_text = self.wrap_math_md(wrapped_text)
- if not wrapped_text:
- return match
- new_span = build_cc_element(
- html_tag_name=new_tag,
- text=wrapped_text,
- tail='',
- type=math_type,
- by=math_render,
- html=original_wrapped
- )
- except Exception:
- return match
- return element_to_html(new_span)
+    def replace_math(self, new_tag: str, math_type: str, math_render: str, node: HtmlElement, func) -> HtmlElement:
+        """Build a cc-math element from the text of ``node``.
+
+        Args:
+            new_tag: tag name of the new element (CCMATH_INLINE or CCMATH_INTERLINE).
+            math_type: formula type, e.g. MathType.LATEX or MathType.ASCIIMATH.
+            math_render: renderer identifier, passed on as the ``by`` argument.
+            node: HTML node whose ``text`` holds the formula.
+            func: optional callable applied to the text first; skipped when falsy.
+
+        Returns:
+            The new element; ``node`` unchanged if it has no text; a CCMATH_HANDLE_FAILED element on error.
+        """
         try:
-            for start, end in MATH_TYPE_TO_DISPLAY[math_type][pattern_type]:
-                pattern = f'{re.escape(start)}.*?{re.escape(end)}'.replace(r'\.\*\?', '.*?')
-                regex = re.compile(pattern, re.DOTALL)
-                original_text = re.sub(regex, process, original_text)
-        except Exception:
-            node.text = self.build_cc_exception_tag(original_text, math_type, math_render)
-            return node
-        node.text = original_text
-        return html_to_element(element_to_html_unescaped(node))
+            text = node.text
+            if not text or not text_strip(text):
+                return node
+
+            # Optional caller-supplied preprocessing of the raw formula text
+            if func:
+                text = func(text)
+
+            # Strip the delimiters and normalise the formula
+            formula = self.wrap_math_md(text)
+
+            # AsciiMath needs an extra extraction pass, then the result is wrapped again
+            if math_type == MathType.ASCIIMATH:
+                formula = self.extract_asciimath(formula)
+                formula = self.wrap_math_md(formula)
+
+            # Build the replacement element, keeping the original markup in ``html``
+            new_span = build_cc_element(
+                html_tag_name=new_tag,
+                text=formula,
+                tail=text_strip(node.tail),
+                type=math_type,
+                by=math_render,
+                html=element_to_html(node)
+            )
+
+            return new_span
+
+        except Exception:
+            # Best effort: on any failure return a failure-marker element instead of raising
+            return build_cc_element(
+                html_tag_name=CCMATH_HANDLE_FAILED,
+                text=node.text or '',
+                tail=text_strip(node.tail),
+                type=math_type,
+                by=math_render,
+                html=element_to_html(node)
+            )
def build_cc_exception_tag(self, text, math_type, math_render) -> str:
return element_to_html(build_cc_element(
diff --git a/llm_web_kit/extractor/html/recognizer/cc_math/tag_common_modify.py b/llm_web_kit/extractor/html/recognizer/cc_math/tag_common_modify.py
index 260d4b80..cc94277b 100644
--- a/llm_web_kit/extractor/html/recognizer/cc_math/tag_common_modify.py
+++ b/llm_web_kit/extractor/html/recognizer/cc_math/tag_common_modify.py
@@ -2,12 +2,12 @@
from llm_web_kit.exception.exception import HtmlMathRecognizerException
from llm_web_kit.extractor.html.recognizer.cc_math.common import (CCMATH,
- MathType,
text_strip)
from llm_web_kit.libs.html_utils import replace_element
-def modify_tree(cm: CCMATH, math_render: str, o_html: str, node: HtmlElement, parent: HtmlElement):
+
+def modify_tree(cm: CCMATH, math_render: str, o_html: str, node: HtmlElement):
try:
text = node.text
tag_math_type_list = cm.get_equation_type(o_html)
@@ -18,17 +18,9 @@ def modify_tree(cm: CCMATH, math_render: str, o_html: str, node: HtmlElement, pa
tail = node.tail
new_span.tail = None
for new_tag, math_type in tag_math_type_list:
- asciimath_wrap = True if math_type == MathType.ASCIIMATH else False
- new_span = cm.replace_math(new_tag, math_type, math_render, new_span, None,asciimath_wrap)
+ new_span = cm.replace_math(new_tag, math_type, math_render, new_span, None)
new_span.tail = tail
replace_element(node,new_span)
- # if math_type == MathType.ASCIIMATH:
- # text = cm.wrap_math_md(text)
- # text = cm.extract_asciimath(text)
- # new_span = build_cc_element(html_tag_name=new_tag, text=cm.wrap_math_md(text), tail=text_strip(node.tail), type=math_type, by=math_render, html=o_html)
- # replace_element(node, new_span)
- # elif math_type == MathType.LATEX:
- # new_span = build_cc_element(html_tag_name=new_tag, text=cm.wrap_math_md(text), tail=text_strip(node.tail), type=math_type, by=math_render, html=o_html)
- # replace_element(node, new_span)
+
except Exception as e:
raise HtmlMathRecognizerException(f'Error processing script mathtex: {e}')
diff --git a/llm_web_kit/extractor/html/recognizer/ccmath.py b/llm_web_kit/extractor/html/recognizer/ccmath.py
index 780ba583..1a2d0958 100644
--- a/llm_web_kit/extractor/html/recognizer/ccmath.py
+++ b/llm_web_kit/extractor/html/recognizer/ccmath.py
@@ -140,14 +140,8 @@ def process_ccmath_html(self, cc_html: str, o_html: str, math_render: BaseMathRe
tag_script.process_zhihu_custom_tag(self.cm, math_render_type, node)
# tag = span, class 为 math-containerm, 或者 mathjax 或者 wp-katex-eq
- if node.tag == 'span' and node.get('class') and (
- 'math-container' in node.get('class') or
- 'mathjax' in node.get('class') or
- 'wp-katex-eq' in node.get('class') or
- 'x-ck12-mathEditor' in node.get('class') or
- 'tex' in node.get('class')
- ):
- tag_common_modify.modify_tree(self.cm, math_render_type, original_html, node, parent)
+ if node.tag == 'span' and node.get('class') and 'math-container' in node.get('class'):
+ tag_common_modify.modify_tree(self.cm, math_render_type, original_html, node)
# math tags
if node.tag == 'math' or node.tag.endswith(':math'):
diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/stackexchange_1_span-math-container_latex_mathjax.html b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/stackexchange_1_span-math-container_latex_mathjax.html
index c509d756..ea085333 100644
--- a/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/stackexchange_1_span-math-container_latex_mathjax.html
+++ b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/stackexchange_1_span-math-container_latex_mathjax.html
@@ -1406,7 +1406,7 @@
$\begingroup$
-
When gravitational waves reach Earth, they usually give a strain of $\delta L \over L$$=10^{-21}$.
+
When gravitational waves reach Earth, they usually give a strain of $\delta L \over L = 10^{-21}$.
If we assume that they scale with the distance the same way electromagnetic waves do, thus following the inverse square law, we can get an estimate of the distance needed.
LIGO detected the first merger of black holes at 1.3 billion light years away.
If we would get to 1 light year away from the merger, under the above hypothesis we would get a strain of $10^{-21} \times (1.3 \cdot 10^9)^2=10^{-3}$. This means that on 1 meter length we would notice a 1 mm oscillation, which is something we are able to sense.
diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/stackexchange_1_span-math-container_latex_mathjax_inline_1.html b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/stackexchange_1_span-math-container_latex_mathjax_inline_1.html
index 771c4c49..b8fc706a 100644
--- a/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/stackexchange_1_span-math-container_latex_mathjax_inline_1.html
+++ b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/stackexchange_1_span-math-container_latex_mathjax_inline_1.html
@@ -7,8 +7,7 @@
M_{\odot}
M_{\odot}
M_{\odot}
-\delta L \over L
-=10^{-21}
+\delta L \over L = 10^{-21}
10^{-21} \times (1.3 \cdot 10^9)^2=10^{-3}
1/r
10^{-21} \times (1.3 \cdot 10^9)=10^{-9}
diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_math.py b/tests/llm_web_kit/extractor/html/recognizer/test_math.py
index 79d010aa..679690a0 100644
--- a/tests/llm_web_kit/extractor/html/recognizer/test_math.py
+++ b/tests/llm_web_kit/extractor/html/recognizer/test_math.py
@@ -511,6 +511,7 @@ def test_math_recognizer_html(self):
# print('answers::::::::', answers)
# self.write_to_html(answers, test_case['input'][0])
# 检查行内公式抽取正确性
+ # Collect every extracted inline formula and compare the count with the expected list
if test_case.get('expected_inline', None):
# 从所有parts中提取所有行内公式
all_inline_formulas = []
@@ -521,16 +522,40 @@ def test_math_recognizer_html(self):
for inline_elem in inline_elements:
formula = inline_elem.text.replace('\n', '').strip()
all_inline_formulas.append(formula)
- # print(f"Found {len(all_inline_formulas)} total inline formulas")
- # print(f"Total new_parts: {len(new_parts)}")
+
expect_inline_text = base_dir.joinpath(test_case['expected_inline']).read_text(encoding='utf-8').strip()
expect_inline_formulas = [formula for formula in expect_inline_text.split('\n') if formula]
- # print(f"Expected {len(expect_inline_formulas)} inline formulas")
+
+ # 如果数量不匹配,输出详细信息
+ if len(all_inline_formulas) != len(expect_inline_formulas):
+ print("\n" + "=" * 80)
+ print("行内公式抽取出错!")
+ print("=" * 80)
+ print(f"出错样例: {test_case['input']}")
+ print(f"预期公式数: {len(expect_inline_formulas)}")
+ print(f"实际公式数: {len(all_inline_formulas)}")
+ print("\n预期公式列表:")
+ for i, formula in enumerate(expect_inline_formulas, 1):
+ print(f" {i}. {formula}")
+ print("\n实际公式列表:")
+ for i, formula in enumerate(all_inline_formulas, 1):
+ print(f" {i}. {formula}")
+
+ # 找出差异
+ print("\n差异分析:")
+ if len(all_inline_formulas) > len(expect_inline_formulas):
+ print(f"多提取了 {len(all_inline_formulas) - len(expect_inline_formulas)} 个公式:")
+ extra_formulas = all_inline_formulas[len(expect_inline_formulas):]
+ for i, formula in enumerate(extra_formulas, 1):
+ print(f" {i}. {formula}")
+ else:
+ print(f"少提取了 {len(expect_inline_formulas) - len(all_inline_formulas)} 个公式:")
+ missing_formulas = expect_inline_formulas[len(all_inline_formulas):]
+ for i, formula in enumerate(missing_formulas, 1):
+ print(f" {i}. {formula}")
+ print("=" * 80 + "\n")
+
self.assertEqual(len(all_inline_formulas), len(expect_inline_formulas))
- for expect, formula in zip(expect_inline_formulas, all_inline_formulas):
- # print('inline expect::::::::', expect)
- # print('inline answer::::::::', formula)
- self.assertEqual(expect, formula)
def write_to_html(self, answers, file_name):
file_name = file_name.split('.')[0]
diff --git a/tests/llm_web_kit/input/assets/content_json.json b/tests/llm_web_kit/input/assets/content_json.json
index d8f038a2..34236da4 100644
--- a/tests/llm_web_kit/input/assets/content_json.json
+++ b/tests/llm_web_kit/input/assets/content_json.json
@@ -168,7 +168,7 @@
"content": {
"math_content": "a^2 + b^2 = c^2",
"math_type": "latex",
- "by": "None"
+ "by": "mathjax_mock"
}
},
{