diff --git a/llm_web_kit/extractor/html/recognizer/cc_math/common.py b/llm_web_kit/extractor/html/recognizer/cc_math/common.py
index d9178840..c45e7e63 100644
--- a/llm_web_kit/extractor/html/recognizer/cc_math/common.py
+++ b/llm_web_kit/extractor/html/recognizer/cc_math/common.py
@@ -5,7 +5,6 @@
from typing import List, Tuple
from lxml import etree
-from lxml.html import HtmlElement
# 在导入前就设置严格的日志控制
logging.basicConfig(level=logging.WARNING, force=True)
@@ -20,7 +19,6 @@
from llm_web_kit.extractor.html.recognizer.recognizer import CCTag
from llm_web_kit.libs.doc_element_type import DocElementType
from llm_web_kit.libs.html_utils import (build_cc_element, element_to_html,
- element_to_html_unescaped,
html_to_element)
from llm_web_kit.libs.text_utils import normalize_ctl_text
@@ -538,63 +536,6 @@ def fix_mathml_superscript(self, mathml_str):
parent.remove(msup)
return etree.tostring(root, encoding='unicode', pretty_print=True)
- def replace_math(self, new_tag: str, math_type: str, math_render: str, node: HtmlElement, func, asciimath_wrap: bool = False) -> HtmlElement:
- # pattern re数学公式匹配 func 公式预处理 默认不处理
- # ascii公式处理逻辑转移到mathjax渲染器方案中
- if asciimath_wrap:
- return node
-
- pattern_type = MATH_TYPE_PATTERN.DISPLAYMATH if new_tag == CCMATH_INTERLINE else MATH_TYPE_PATTERN.INLINEMATH
- original_text = node.text or ''
-
- def is_ccmath_wrapped(match_text, original_text: str) -> bool:
- if not match_text or not original_text:
- return False
- start_idx = match_text.start()
- end_idx = match_text.end()
- before_match = original_text[:start_idx].strip()
- after_match = original_text[end_idx:].strip()
- if 'ccmath' in before_match and 'ccmath' in after_match:
- return True
- if pattern_type == MATH_TYPE_PATTERN.DISPLAYMATH:
- for start, end in MATH_TYPE_TO_DISPLAY[MathType.LATEX][MATH_TYPE_PATTERN.INLINEMATH]:
- if start in before_match and end in after_match:
- return True
- return False
-
- def process(match_text):
- try:
- match = match_text.group(0)
- if is_ccmath_wrapped(match_text, original_text):
- return match
- wrapped_text = func(match) if func else match
- # html保留原始的,而不是传入修改过的wrapped_text
- original_wrapped = wrapped_text
- wrapped_text = self.wrap_math_md(wrapped_text)
- if not wrapped_text:
- return match
- new_span = build_cc_element(
- html_tag_name=new_tag,
- text=wrapped_text,
- tail='',
- type=math_type,
- by=math_render,
- html=original_wrapped
- )
- except Exception:
- return match
- return element_to_html(new_span)
- try:
- for start, end in MATH_TYPE_TO_DISPLAY[math_type][pattern_type]:
- pattern = f'{re.escape(start)}.*?{re.escape(end)}'.replace(r'\.\*\?', '.*?')
- regex = re.compile(pattern, re.DOTALL)
- original_text = re.sub(regex, process, original_text)
- except Exception:
- node.text = self.build_cc_exception_tag(original_text, math_type, math_render)
- return node
- node.text = original_text
- return html_to_element(element_to_html_unescaped(node))
-
def build_cc_exception_tag(self, text, math_type, math_render) -> str:
return element_to_html(build_cc_element(
html_tag_name=CCMATH_HANDLE_FAILED,
@@ -621,12 +562,6 @@ def build_cc_exception_tag(self, text, math_type, math_render) -> str:
print(cm.wrap_math_md(r'$$a^2 + b^2 = c^2$$'))
print(cm.wrap_math_md(r'\(a^2 + b^2 = c^2\)'))
print(cm.extract_asciimath('x=(-b +- sqrt(b^2 - 4ac))/(2a)'))
- print(cm.replace_math('ccmath-interline','asciimath','',html_to_element(r'
`x=(-b +- sqrt(b^2 - 4ac))/(2a)`
'),None,True))
- print(cm.replace_math('ccmath-interline','asciimath','',html_to_element(r'like this: \`E=mc^2\`
'),None,True))
- print(cm.replace_math('ccmath-interline','asciimath','',html_to_element(r'A `3xx3` matrix,`((1,2,3),(4,5,6),(7,8,9))`, and a `2xx1` matrix, or vector, `((1),(0))`.
'),None,True))
- print(cm.replace_math('ccmath-interline','asciimath','',html_to_element(r'`(x+1)/x^2``1/3245`
'),None,True))
- print(cm.replace_math('ccmath-interline','latex','',html_to_element(r'start $$f(a,b,c) = (a^2+b^2+c^2)^3$$end
'),None,False))
- print(cm.replace_math('ccmath-inline','latex','',html_to_element(r'\( \newcommand{\norm}[1]{\| #1 \|}\)
'),None,False))
# cm.url = 'mathhelpforum.com'
# print(cm.wrap_math_md_custom(r'
\begin{align} a^2+b=c\end{align}\
'))
# print(cm.wrap_math_md_custom(r'
dz=\frac{1}{2}\frac{dx}{\cos ^2 x}
'))
diff --git a/llm_web_kit/extractor/html/recognizer/cc_math/render/mathjax.py b/llm_web_kit/extractor/html/recognizer/cc_math/render/mathjax.py
index 2f1cc3ad..ffe97caf 100644
--- a/llm_web_kit/extractor/html/recognizer/cc_math/render/mathjax.py
+++ b/llm_web_kit/extractor/html/recognizer/cc_math/render/mathjax.py
@@ -290,12 +290,20 @@ def _find_math_in_element(self, element: HtmlElement, inline_pattern: re.Pattern
# 先处理tail,再处理text,text的判断会多一些
if element.tail:
+ # Order matters: try display (interline) math first, then inline math, and only afterwards handle currency amounts.
+ original_tail = element.tail
+
# 处理行间公式(优先处理,因为可能包含行内公式)
element.tail = self._process_math_in_text(element, element.tail, display_pattern, True, True)
# 处理行内公式
if element.tail: # 检查是否还有文本需要处理
element.tail = self._process_math_in_text(element, element.tail, inline_pattern, False, True)
+ # Call optimized_dollar_matching only when neither pass above changed the tail.
+ # Unchanged text is taken to mean that no delimited math formula was matched.
+ if element.tail == original_tail and '$' in element.tail:
+ element.tail = optimized_dollar_matching(element.tail)
+
# 跳过特定标签
skip_tags = MATHJAX_OPTIONS['skipTags']
if element.tag in skip_tags:
@@ -314,11 +322,16 @@ def _find_math_in_element(self, element: HtmlElement, inline_pattern: re.Pattern
# 处理当前节点的文本
if element.text:
+ original_text = element.text
+
# 处理行间公式(优先处理,因为可能包含行内公式)
- element.text = self._process_math_in_text(element, element.text, display_pattern, True)
+ element.text = self._process_math_in_text(element, element.text, display_pattern, True, False)
# 处理行内公式
- if element.text: # 检查是否还有文本需要处理
- element.text = self._process_math_in_text(element, element.text, inline_pattern, False)
+ if element.text:
+ element.text = self._process_math_in_text(element, element.text, inline_pattern, False, False)
+ # Call optimized_dollar_matching only when neither pass above changed the text.
+ if element.text == original_text and '$' in element.text:
+ element.text = optimized_dollar_matching(element.text)
# 获取子节点的副本,以避免在迭代过程中修改列表
children = list(element)
@@ -411,7 +424,7 @@ def _process_math_in_text(
tem_match_display.clear()
# 如果没有匹配到分隔符形式的公式,直接返回原文本
if not matches:
- return optimized_dollar_matching(text)
+ return text
# 从后向前处理,以避免位置偏移
result = text
@@ -487,7 +500,7 @@ def _process_math_in_text(
last_position = start_pos
# 返回处理后的文本
- return optimized_dollar_matching(result)
+ return result
def _is_escaped_delimiter(self, text: str, pos: int) -> bool:
"""检查分隔符是否被转义.
diff --git a/llm_web_kit/extractor/html/recognizer/ccmath.py b/llm_web_kit/extractor/html/recognizer/ccmath.py
index 780ba583..fdf67275 100644
--- a/llm_web_kit/extractor/html/recognizer/ccmath.py
+++ b/llm_web_kit/extractor/html/recognizer/ccmath.py
@@ -5,8 +5,7 @@
from llm_web_kit.exception.exception import (
HtmlMathMathjaxRenderRecognizerException, HtmlMathRecognizerException)
-from llm_web_kit.extractor.html.recognizer.cc_math import (tag_common_modify,
- tag_img, tag_math,
+from llm_web_kit.extractor.html.recognizer.cc_math import (tag_img, tag_math,
tag_mjx, tag_script)
from llm_web_kit.extractor.html.recognizer.cc_math.common import (CCMATH, CSDN,
ZHIHU)
@@ -139,16 +138,6 @@ def process_ccmath_html(self, cc_html: str, o_html: str, math_render: BaseMathRe
if ZHIHU.DOMAIN in self.cm.url and node.tag == 'span' and node.get('class') == ZHIHU.MATH:
tag_script.process_zhihu_custom_tag(self.cm, math_render_type, node)
- # tag = span, class 为 math-containerm, 或者 mathjax 或者 wp-katex-eq
- if node.tag == 'span' and node.get('class') and (
- 'math-container' in node.get('class') or
- 'mathjax' in node.get('class') or
- 'wp-katex-eq' in node.get('class') or
- 'x-ck12-mathEditor' in node.get('class') or
- 'tex' in node.get('class')
- ):
- tag_common_modify.modify_tree(self.cm, math_render_type, original_html, node, parent)
-
# math tags
if node.tag == 'math' or node.tag.endswith(':math'):
# print(f"匹配到数学标签: {node.tag}")
diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_math.py b/tests/llm_web_kit/extractor/html/recognizer/test_math.py
index 79d010aa..37d3226f 100644
--- a/tests/llm_web_kit/extractor/html/recognizer/test_math.py
+++ b/tests/llm_web_kit/extractor/html/recognizer/test_math.py
@@ -463,16 +463,9 @@ def test_math_recognizer(self):
def test_math_recognizer_html(self):
for test_case in TEST_CASES_HTML:
raw_html_path = base_dir.joinpath(test_case['input'][0])
- # print('raw_html_path::::::::', raw_html_path)
base_url = test_case['base_url']
raw_html = raw_html_path.read_text(encoding='utf-8')
- parts = self.math_recognizer.recognize(base_url, [(html_to_element(raw_html), html_to_element(raw_html))], raw_html)
- # print(parts)
- # 将parts列表中第一个元素拼接保存到文件,带随机数
- # import random
- # with open('parts'+str(random.randint(1, 100))+".html", 'w') as f:
- # for part in parts:
- # f.write(str(part[0]))
+
# 创建预处理器并清理隐藏元素
pre_extractor = HTMLFileFormatNoClipCleanTagsPreExtractor({})
data_json = DataJson({'html': raw_html, 'url': base_url})
@@ -485,34 +478,21 @@ def test_math_recognizer_html(self):
[(html_to_element(cleaned_html), html_to_element(cleaned_html))],
cleaned_html
)
- # 检查行间公式抽取正确性
+
+ # 检查行间公式
new_parts = []
for part in parts:
new_parts.append((element_to_html(part[0]), element_to_html(part[1])))
- parts = [part[0] for part in new_parts if CCTag.CC_MATH_INTERLINE in part[0]]
+
+ interline_parts = [part[0] for part in new_parts if CCTag.CC_MATH_INTERLINE in part[0]]
expect_text = base_dir.joinpath(test_case['expected']).read_text(encoding='utf-8').strip()
expect_formulas = [formula for formula in expect_text.split('\n') if formula]
- if len(parts) != len(expect_formulas):
- print("出错样例:", test_case['input'])
- print("期望公式数:", len(expect_formulas), "实际公式数:", len(parts))
- print("期望公式:", expect_formulas)
- print("实际公式:", parts)
- self.assertEqual(len(parts), len(expect_formulas))
- # answers = []
- for expect, part in zip(expect_formulas, parts):
- a_tree = html_to_element(part)
- a_result = a_tree.xpath(f'.//{CCTag.CC_MATH_INTERLINE}')[0]
- answer = a_result.text.replace('\n', '').strip()
- # print('part::::::::', part)
- # print('expect::::::::', expect)
- # print('answer::::::::', answer)
- # answers.append(answer)
- self.assertEqual(expect, answer)
- # print('answers::::::::', answers)
- # self.write_to_html(answers, test_case['input'][0])
- # 检查行内公式抽取正确性
+
+ print(f"\n测试用例: {test_case['input']}")
+ print(f"行间公式 - 期望: {len(expect_formulas)}, 实际: {len(interline_parts)}")
+
+ # 检查行内公式
if test_case.get('expected_inline', None):
- # 从所有parts中提取所有行内公式
all_inline_formulas = []
for part in new_parts:
if CCTag.CC_MATH_INLINE in part[0]:
@@ -521,15 +501,23 @@ def test_math_recognizer_html(self):
for inline_elem in inline_elements:
formula = inline_elem.text.replace('\n', '').strip()
all_inline_formulas.append(formula)
- # print(f"Found {len(all_inline_formulas)} total inline formulas")
- # print(f"Total new_parts: {len(new_parts)}")
expect_inline_text = base_dir.joinpath(test_case['expected_inline']).read_text(encoding='utf-8').strip()
expect_inline_formulas = [formula for formula in expect_inline_text.split('\n') if formula]
- # print(f"Expected {len(expect_inline_formulas)} inline formulas")
+ print(f"行内公式 - 期望: {len(expect_inline_formulas)}, 实际: {len(all_inline_formulas)}")
+ # 打印所有实际提取的行内公式
+ print("\n所有实际提取的行内公式:")
+ for i, formula in enumerate(all_inline_formulas, 1):
+ print(f" {i}. {formula}")
+ # 打印所有期望的行内公式
+ print("\n所有期望的行内公式:")
+ for i, formula in enumerate(expect_inline_formulas, 1):
+ print(f" {i}. {formula}")
+ # 找出差异
+ print("\n差异分析:")
+ if len(all_inline_formulas) != len(expect_inline_formulas):
+ print("数量不匹配!")
self.assertEqual(len(all_inline_formulas), len(expect_inline_formulas))
for expect, formula in zip(expect_inline_formulas, all_inline_formulas):
- # print('inline expect::::::::', expect)
- # print('inline answer::::::::', formula)
self.assertEqual(expect, formula)
def write_to_html(self, answers, file_name):