Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 0 additions & 65 deletions llm_web_kit/extractor/html/recognizer/cc_math/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
from typing import List, Tuple

from lxml import etree
from lxml.html import HtmlElement

# 在导入前就设置严格的日志控制
logging.basicConfig(level=logging.WARNING, force=True)
Expand All @@ -20,7 +19,6 @@
from llm_web_kit.extractor.html.recognizer.recognizer import CCTag
from llm_web_kit.libs.doc_element_type import DocElementType
from llm_web_kit.libs.html_utils import (build_cc_element, element_to_html,
element_to_html_unescaped,
html_to_element)
from llm_web_kit.libs.text_utils import normalize_ctl_text

Expand Down Expand Up @@ -538,63 +536,6 @@ def fix_mathml_superscript(self, mathml_str):
parent.remove(msup)
return etree.tostring(root, encoding='unicode', pretty_print=True)

def replace_math(self, new_tag: str, math_type: str, math_render: str, node: HtmlElement, func, asciimath_wrap: bool = False) -> HtmlElement:
    """Wrap delimiter-enclosed formulas found in ``node.text`` in cc math elements.

    For every ``(start, end)`` delimiter pair listed under
    ``MATH_TYPE_TO_DISPLAY[math_type][pattern_type]`` a non-greedy, DOTALL
    regex is built and each match in ``node.text`` is replaced by the
    serialized element returned by ``build_cc_element``.

    Args:
        new_tag: Tag name for the generated element. When it equals
            ``CCMATH_INTERLINE`` the display-math delimiters are used,
            otherwise the inline-math delimiters.
        math_type: Key into ``MATH_TYPE_TO_DISPLAY``; also stored as the
            ``type`` of the generated element.
        math_render: Stored as the ``by`` of the generated element.
        node: Element whose ``text`` is scanned and rewritten.
        func: Optional callable applied to each matched string before it is
            wrapped; a falsy value means the match is used as-is.
        asciimath_wrap: When true the node is returned untouched.

    Returns:
        ``node`` itself when ``asciimath_wrap`` is true or when the
        substitution loop raises; otherwise a new element obtained by
        serializing ``node`` with ``element_to_html_unescaped`` and parsing
        the result again with ``html_to_element``.
    """
    # pattern: regex matching of math formulas; func: formula pre-processing hook (no processing by default)
    # asciimath handling has been moved to the mathjax renderer path, so nothing is done here
    if asciimath_wrap:
        return node

    pattern_type = MATH_TYPE_PATTERN.DISPLAYMATH if new_tag == CCMATH_INTERLINE else MATH_TYPE_PATTERN.INLINEMATH
    original_text = node.text or ''

    def is_ccmath_wrapped(match_text, original_text: str) -> bool:
        # Decide whether a match should be left alone, judging from the text
        # that precedes and follows it in ``original_text``.
        if not match_text or not original_text:
            return False
        start_idx = match_text.start()
        end_idx = match_text.end()
        before_match = original_text[:start_idx].strip()
        after_match = original_text[end_idx:].strip()
        # 'ccmath' on both sides: presumably the match already sits inside a
        # previously generated ccmath element -- heuristic substring check only.
        if 'ccmath' in before_match and 'ccmath' in after_match:
            return True
        if pattern_type == MATH_TYPE_PATTERN.DISPLAYMATH:
            # For display math, also skip when a LaTeX inline start delimiter
            # occurs before the match and its end delimiter occurs after it.
            for start, end in MATH_TYPE_TO_DISPLAY[MathType.LATEX][MATH_TYPE_PATTERN.INLINEMATH]:
                if start in before_match and end in after_match:
                    return True
        return False

    def process(match_text):
        # re.sub callback: returns the replacement string for one match.
        # Any failure falls back to returning the matched text unchanged.
        try:
            match = match_text.group(0)
            # ``original_text`` here is the enclosing variable, i.e. the string
            # currently being substituted, so the match offsets line up with it.
            if is_ccmath_wrapped(match_text, original_text):
                return match
            wrapped_text = func(match) if func else match
            # the html attribute keeps the text as it was before wrap_math_md, not the modified wrapped_text
            original_wrapped = wrapped_text
            wrapped_text = self.wrap_math_md(wrapped_text)
            if not wrapped_text:
                return match
            new_span = build_cc_element(
                html_tag_name=new_tag,
                text=wrapped_text,
                tail='',
                type=math_type,
                by=math_render,
                html=original_wrapped
            )
        except Exception:
            return match
        return element_to_html(new_span)
    try:
        for start, end in MATH_TYPE_TO_DISPLAY[math_type][pattern_type]:
            # Delimiters are escaped literally; the trailing replace() turns an
            # escaped '.*?' inside a delimiter back into a live wildcard.
            pattern = f'{re.escape(start)}.*?{re.escape(end)}'.replace(r'\.\*\?', '.*?')
            regex = re.compile(pattern, re.DOTALL)
            # Rebinding original_text means later delimiter pairs (and the
            # is_ccmath_wrapped check) see the output of earlier substitutions.
            original_text = re.sub(regex, process, original_text)
    except Exception:
        # On failure, replace the node text with the serialized failure tag
        # and return the node without re-parsing it.
        node.text = self.build_cc_exception_tag(original_text, math_type, math_render)
        return node
    node.text = original_text
    # NOTE(review): the serialize/parse round trip presumably turns the inserted
    # markup into real child elements -- confirm against the html_utils helpers.
    return html_to_element(element_to_html_unescaped(node))

def build_cc_exception_tag(self, text, math_type, math_render) -> str:
return element_to_html(build_cc_element(
html_tag_name=CCMATH_HANDLE_FAILED,
Expand All @@ -621,12 +562,6 @@ def build_cc_exception_tag(self, text, math_type, math_render) -> str:
print(cm.wrap_math_md(r'$$a^2 + b^2 = c^2$$'))
print(cm.wrap_math_md(r'\(a^2 + b^2 = c^2\)'))
print(cm.extract_asciimath('x=(-b +- sqrt(b^2 - 4ac))/(2a)'))
print(cm.replace_math('ccmath-interline','asciimath','',html_to_element(r'<p>`x=(-b +- sqrt(b^2 - 4ac))/(2a)`</p>'),None,True))
print(cm.replace_math('ccmath-interline','asciimath','',html_to_element(r'<p>like this: \`E=mc^2\`</p>'),None,True))
print(cm.replace_math('ccmath-interline','asciimath','',html_to_element(r'<p>A `3xx3` matrix,`((1,2,3),(4,5,6),(7,8,9))`, and a `2xx1` matrix, or vector, `((1),(0))`.</p>'),None,True))
print(cm.replace_math('ccmath-interline','asciimath','',html_to_element(r'<p>`(x+1)/x^2``1/3245`</p>'),None,True))
print(cm.replace_math('ccmath-interline','latex','',html_to_element(r'<p>start $$f(a,b,c) = (a^2+b^2+c^2)^3$$end</p>'),None,False))
print(cm.replace_math('ccmath-inline','latex','',html_to_element(r'<p>\( \newcommand{\norm}[1]{\| #1 \|}\)</p>'),None,False))
# cm.url = 'mathhelpforum.com'
# print(cm.wrap_math_md_custom(r'<br />\begin{align} a^2+b=c\end{align}\<br />'))
# print(cm.wrap_math_md_custom(r'<br />dz=\frac{1}{2}\frac{dx}{\cos ^2 x}<br />'))
23 changes: 18 additions & 5 deletions llm_web_kit/extractor/html/recognizer/cc_math/render/mathjax.py
Original file line number Diff line number Diff line change
Expand Up @@ -290,12 +290,20 @@ def _find_math_in_element(self, element: HtmlElement, inline_pattern: re.Pattern

# 先处理tail,再处理text,text的判断会多一些
if element.tail:
# ⚠️ 关键修改:先尝试行间公式,再尝试行内公式,最后才处理金额
original_tail = element.tail

# 处理行间公式(优先处理,因为可能包含行内公式)
element.tail = self._process_math_in_text(element, element.tail, display_pattern, True, True)
# 处理行内公式
if element.tail: # 检查是否还有文本需要处理
element.tail = self._process_math_in_text(element, element.tail, inline_pattern, False, True)

# 3. 只有当前两步都没有处理文本时,才调用 optimized_dollar_matching
# 判断条件:文本内容没有变化,说明没有匹配到数学公式
if element.tail == original_tail and '$' in element.tail:
element.tail = optimized_dollar_matching(element.tail)

# 跳过特定标签
skip_tags = MATHJAX_OPTIONS['skipTags']
if element.tag in skip_tags:
Expand All @@ -314,11 +322,16 @@ def _find_math_in_element(self, element: HtmlElement, inline_pattern: re.Pattern

# 处理当前节点的文本
if element.text:
original_text = element.text

# 处理行间公式(优先处理,因为可能包含行内公式)
element.text = self._process_math_in_text(element, element.text, display_pattern, True)
element.text = self._process_math_in_text(element, element.text, display_pattern, True, False)
# 处理行内公式
if element.text: # 检查是否还有文本需要处理
element.text = self._process_math_in_text(element, element.text, inline_pattern, False)
if element.text:
element.text = self._process_math_in_text(element, element.text, inline_pattern, False, False)
# 3. 只有当前两步都没有处理文本时,才调用 optimized_dollar_matching
if element.text == original_text and '$' in element.text:
element.text = optimized_dollar_matching(element.text)

# 获取子节点的副本,以避免在迭代过程中修改列表
children = list(element)
Expand Down Expand Up @@ -411,7 +424,7 @@ def _process_math_in_text(
tem_match_display.clear()
# 如果没有匹配到分隔符形式的公式,直接返回原文本
if not matches:
return optimized_dollar_matching(text)
return text

# 从后向前处理,以避免位置偏移
result = text
Expand Down Expand Up @@ -487,7 +500,7 @@ def _process_math_in_text(
last_position = start_pos

# 返回处理后的文本
return optimized_dollar_matching(result)
return result

def _is_escaped_delimiter(self, text: str, pos: int) -> bool:
"""检查分隔符是否被转义.
Expand Down
13 changes: 1 addition & 12 deletions llm_web_kit/extractor/html/recognizer/ccmath.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,7 @@

from llm_web_kit.exception.exception import (
HtmlMathMathjaxRenderRecognizerException, HtmlMathRecognizerException)
from llm_web_kit.extractor.html.recognizer.cc_math import (tag_common_modify,
tag_img, tag_math,
from llm_web_kit.extractor.html.recognizer.cc_math import (tag_img, tag_math,
tag_mjx, tag_script)
from llm_web_kit.extractor.html.recognizer.cc_math.common import (CCMATH, CSDN,
ZHIHU)
Expand Down Expand Up @@ -139,16 +138,6 @@ def process_ccmath_html(self, cc_html: str, o_html: str, math_render: BaseMathRe
if ZHIHU.DOMAIN in self.cm.url and node.tag == 'span' and node.get('class') == ZHIHU.MATH:
tag_script.process_zhihu_custom_tag(self.cm, math_render_type, node)

# tag = span, class 为 math-containerm, 或者 mathjax 或者 wp-katex-eq
if node.tag == 'span' and node.get('class') and (
'math-container' in node.get('class') or
'mathjax' in node.get('class') or
'wp-katex-eq' in node.get('class') or
'x-ck12-mathEditor' in node.get('class') or
'tex' in node.get('class')
):
tag_common_modify.modify_tree(self.cm, math_render_type, original_html, node, parent)

# math tags
if node.tag == 'math' or node.tag.endswith(':math'):
# print(f"匹配到数学标签: {node.tag}")
Expand Down
58 changes: 23 additions & 35 deletions tests/llm_web_kit/extractor/html/recognizer/test_math.py
Original file line number Diff line number Diff line change
Expand Up @@ -463,16 +463,9 @@ def test_math_recognizer(self):
def test_math_recognizer_html(self):
for test_case in TEST_CASES_HTML:
raw_html_path = base_dir.joinpath(test_case['input'][0])
# print('raw_html_path::::::::', raw_html_path)
base_url = test_case['base_url']
raw_html = raw_html_path.read_text(encoding='utf-8')
parts = self.math_recognizer.recognize(base_url, [(html_to_element(raw_html), html_to_element(raw_html))], raw_html)
# print(parts)
# 将parts列表中第一个元素拼接保存到文件,带随机数
# import random
# with open('parts'+str(random.randint(1, 100))+".html", 'w') as f:
# for part in parts:
# f.write(str(part[0]))

# 创建预处理器并清理隐藏元素
pre_extractor = HTMLFileFormatNoClipCleanTagsPreExtractor({})
data_json = DataJson({'html': raw_html, 'url': base_url})
Expand All @@ -485,34 +478,21 @@ def test_math_recognizer_html(self):
[(html_to_element(cleaned_html), html_to_element(cleaned_html))],
cleaned_html
)
# 检查行间公式抽取正确性

# 检查行间公式
new_parts = []
for part in parts:
new_parts.append((element_to_html(part[0]), element_to_html(part[1])))
parts = [part[0] for part in new_parts if CCTag.CC_MATH_INTERLINE in part[0]]

interline_parts = [part[0] for part in new_parts if CCTag.CC_MATH_INTERLINE in part[0]]
expect_text = base_dir.joinpath(test_case['expected']).read_text(encoding='utf-8').strip()
expect_formulas = [formula for formula in expect_text.split('\n') if formula]
if len(parts) != len(expect_formulas):
print("出错样例:", test_case['input'])
print("期望公式数:", len(expect_formulas), "实际公式数:", len(parts))
print("期望公式:", expect_formulas)
print("实际公式:", parts)
self.assertEqual(len(parts), len(expect_formulas))
# answers = []
for expect, part in zip(expect_formulas, parts):
a_tree = html_to_element(part)
a_result = a_tree.xpath(f'.//{CCTag.CC_MATH_INTERLINE}')[0]
answer = a_result.text.replace('\n', '').strip()
# print('part::::::::', part)
# print('expect::::::::', expect)
# print('answer::::::::', answer)
# answers.append(answer)
self.assertEqual(expect, answer)
# print('answers::::::::', answers)
# self.write_to_html(answers, test_case['input'][0])
# 检查行内公式抽取正确性

print(f"\n测试用例: {test_case['input']}")
print(f"行间公式 - 期望: {len(expect_formulas)}, 实际: {len(interline_parts)}")

# 检查行内公式
if test_case.get('expected_inline', None):
# 从所有parts中提取所有行内公式
all_inline_formulas = []
for part in new_parts:
if CCTag.CC_MATH_INLINE in part[0]:
Expand All @@ -521,15 +501,23 @@ def test_math_recognizer_html(self):
for inline_elem in inline_elements:
formula = inline_elem.text.replace('\n', '').strip()
all_inline_formulas.append(formula)
# print(f"Found {len(all_inline_formulas)} total inline formulas")
# print(f"Total new_parts: {len(new_parts)}")
expect_inline_text = base_dir.joinpath(test_case['expected_inline']).read_text(encoding='utf-8').strip()
expect_inline_formulas = [formula for formula in expect_inline_text.split('\n') if formula]
# print(f"Expected {len(expect_inline_formulas)} inline formulas")
print(f"行内公式 - 期望: {len(expect_inline_formulas)}, 实际: {len(all_inline_formulas)}")
# 打印所有实际提取的行内公式
print("\n所有实际提取的行内公式:")
for i, formula in enumerate(all_inline_formulas, 1):
print(f" {i}. {formula}")
# 打印所有期望的行内公式
print("\n所有期望的行内公式:")
for i, formula in enumerate(expect_inline_formulas, 1):
print(f" {i}. {formula}")
# 找出差异
print("\n差异分析:")
if len(all_inline_formulas) != len(expect_inline_formulas):
print("数量不匹配!")
self.assertEqual(len(all_inline_formulas), len(expect_inline_formulas))
for expect, formula in zip(expect_inline_formulas, all_inline_formulas):
# print('inline expect::::::::', expect)
# print('inline answer::::::::', formula)
self.assertEqual(expect, formula)

def write_to_html(self, answers, file_name):
Expand Down
Loading