diff --git a/llm_web_kit/extractor/html/recognizer/cc_math/common.py b/llm_web_kit/extractor/html/recognizer/cc_math/common.py index d9178840..c45e7e63 100644 --- a/llm_web_kit/extractor/html/recognizer/cc_math/common.py +++ b/llm_web_kit/extractor/html/recognizer/cc_math/common.py @@ -5,7 +5,6 @@ from typing import List, Tuple from lxml import etree -from lxml.html import HtmlElement # 在导入前就设置严格的日志控制 logging.basicConfig(level=logging.WARNING, force=True) @@ -20,7 +19,6 @@ from llm_web_kit.extractor.html.recognizer.recognizer import CCTag from llm_web_kit.libs.doc_element_type import DocElementType from llm_web_kit.libs.html_utils import (build_cc_element, element_to_html, - element_to_html_unescaped, html_to_element) from llm_web_kit.libs.text_utils import normalize_ctl_text @@ -538,63 +536,6 @@ def fix_mathml_superscript(self, mathml_str): parent.remove(msup) return etree.tostring(root, encoding='unicode', pretty_print=True) - def replace_math(self, new_tag: str, math_type: str, math_render: str, node: HtmlElement, func, asciimath_wrap: bool = False) -> HtmlElement: - # pattern re数学公式匹配 func 公式预处理 默认不处理 - # ascii公式处理逻辑转移到mathjax渲染器方案中 - if asciimath_wrap: - return node - - pattern_type = MATH_TYPE_PATTERN.DISPLAYMATH if new_tag == CCMATH_INTERLINE else MATH_TYPE_PATTERN.INLINEMATH - original_text = node.text or '' - - def is_ccmath_wrapped(match_text, original_text: str) -> bool: - if not match_text or not original_text: - return False - start_idx = match_text.start() - end_idx = match_text.end() - before_match = original_text[:start_idx].strip() - after_match = original_text[end_idx:].strip() - if 'ccmath' in before_match and 'ccmath' in after_match: - return True - if pattern_type == MATH_TYPE_PATTERN.DISPLAYMATH: - for start, end in MATH_TYPE_TO_DISPLAY[MathType.LATEX][MATH_TYPE_PATTERN.INLINEMATH]: - if start in before_match and end in after_match: - return True - return False - - def process(match_text): - try: - match = match_text.group(0) - if is_ccmath_wrapped(match_text, original_text): - return match - wrapped_text = func(match) if func else match - # html保留原始的,而不是传入修改过的wrapped_text - original_wrapped = wrapped_text - wrapped_text = self.wrap_math_md(wrapped_text) - if not wrapped_text: - return match - new_span = build_cc_element( - html_tag_name=new_tag, - text=wrapped_text, - tail='', - type=math_type, - by=math_render, - html=original_wrapped - ) - except Exception: - return match - return element_to_html(new_span) - try: - for start, end in MATH_TYPE_TO_DISPLAY[math_type][pattern_type]: - pattern = f'{re.escape(start)}.*?{re.escape(end)}'.replace(r'\.\*\?', '.*?') - regex = re.compile(pattern, re.DOTALL) - original_text = re.sub(regex, process, original_text) - except Exception: - node.text = self.build_cc_exception_tag(original_text, math_type, math_render) - return node - node.text = original_text - return html_to_element(element_to_html_unescaped(node)) - def build_cc_exception_tag(self, text, math_type, math_render) -> str: return element_to_html(build_cc_element( html_tag_name=CCMATH_HANDLE_FAILED, @@ -621,12 +562,6 @@ def build_cc_exception_tag(self, text, math_type, math_render) -> str: print(cm.wrap_math_md(r'$$a^2 + b^2 = c^2$$')) print(cm.wrap_math_md(r'\(a^2 + b^2 = c^2\)')) print(cm.extract_asciimath('x=(-b +- sqrt(b^2 - 4ac))/(2a)')) - print(cm.replace_math('ccmath-interline','asciimath','',html_to_element(r'

`x=(-b +- sqrt(b^2 - 4ac))/(2a)`

'),None,True)) - print(cm.replace_math('ccmath-interline','asciimath','',html_to_element(r'

like this: \`E=mc^2\`

'),None,True)) - print(cm.replace_math('ccmath-interline','asciimath','',html_to_element(r'

A `3xx3` matrix,`((1,2,3),(4,5,6),(7,8,9))`, and a `2xx1` matrix, or vector, `((1),(0))`.

'),None,True)) - print(cm.replace_math('ccmath-interline','asciimath','',html_to_element(r'

`(x+1)/x^2``1/3245`

'),None,True)) - print(cm.replace_math('ccmath-interline','latex','',html_to_element(r'

start $$f(a,b,c) = (a^2+b^2+c^2)^3$$end

'),None,False)) - print(cm.replace_math('ccmath-inline','latex','',html_to_element(r'

\( \newcommand{\norm}[1]{\| #1 \|}\)

'),None,False)) # cm.url = 'mathhelpforum.com' # print(cm.wrap_math_md_custom(r'
\begin{align} a^2+b=c\end{align}\
')) # print(cm.wrap_math_md_custom(r'
dz=\frac{1}{2}\frac{dx}{\cos ^2 x}
')) diff --git a/llm_web_kit/extractor/html/recognizer/cc_math/render/mathjax.py b/llm_web_kit/extractor/html/recognizer/cc_math/render/mathjax.py index 2f1cc3ad..ffe97caf 100644 --- a/llm_web_kit/extractor/html/recognizer/cc_math/render/mathjax.py +++ b/llm_web_kit/extractor/html/recognizer/cc_math/render/mathjax.py @@ -290,12 +290,20 @@ def _find_math_in_element(self, element: HtmlElement, inline_pattern: re.Pattern # 先处理tail,再处理text,text的判断会多一些 if element.tail: + # ⚠️ 关键修改:先尝试行间公式,再尝试行内公式,最后才处理金额 + original_tail = element.tail + # 处理行间公式(优先处理,因为可能包含行内公式) element.tail = self._process_math_in_text(element, element.tail, display_pattern, True, True) # 处理行内公式 if element.tail: # 检查是否还有文本需要处理 element.tail = self._process_math_in_text(element, element.tail, inline_pattern, False, True) + # 3. 只有当前两步都没有处理文本时,才调用 optimized_dollar_matching + # 判断条件:文本内容没有变化,说明没有匹配到数学公式 + if element.tail == original_tail and '$' in element.tail: + element.tail = optimized_dollar_matching(element.tail) + # 跳过特定标签 skip_tags = MATHJAX_OPTIONS['skipTags'] if element.tag in skip_tags: @@ -314,11 +322,16 @@ def _find_math_in_element(self, element: HtmlElement, inline_pattern: re.Pattern # 处理当前节点的文本 if element.text: + original_text = element.text + # 处理行间公式(优先处理,因为可能包含行内公式) - element.text = self._process_math_in_text(element, element.text, display_pattern, True) + element.text = self._process_math_in_text(element, element.text, display_pattern, True, False) # 处理行内公式 - if element.text: # 检查是否还有文本需要处理 - element.text = self._process_math_in_text(element, element.text, inline_pattern, False) + if element.text: + element.text = self._process_math_in_text(element, element.text, inline_pattern, False, False) + # 3. 只有当前两步都没有处理文本时,才调用 optimized_dollar_matching + if element.text == original_text and '$' in element.text: + element.text = optimized_dollar_matching(element.text) # 获取子节点的副本,以避免在迭代过程中修改列表 children = list(element) @@ -411,7 +424,7 @@ def _process_math_in_text( tem_match_display.clear() # 如果没有匹配到分隔符形式的公式,直接返回原文本 if not matches: - return optimized_dollar_matching(text) + return text # 从后向前处理,以避免位置偏移 result = text @@ -487,7 +500,7 @@ def _process_math_in_text( last_position = start_pos # 返回处理后的文本 - return optimized_dollar_matching(result) + return result def _is_escaped_delimiter(self, text: str, pos: int) -> bool: """检查分隔符是否被转义. diff --git a/llm_web_kit/extractor/html/recognizer/ccmath.py b/llm_web_kit/extractor/html/recognizer/ccmath.py index 780ba583..fdf67275 100644 --- a/llm_web_kit/extractor/html/recognizer/ccmath.py +++ b/llm_web_kit/extractor/html/recognizer/ccmath.py @@ -5,8 +5,7 @@ from llm_web_kit.exception.exception import ( HtmlMathMathjaxRenderRecognizerException, HtmlMathRecognizerException) -from llm_web_kit.extractor.html.recognizer.cc_math import (tag_common_modify, - tag_img, tag_math, +from llm_web_kit.extractor.html.recognizer.cc_math import (tag_img, tag_math, tag_mjx, tag_script) from llm_web_kit.extractor.html.recognizer.cc_math.common import (CCMATH, CSDN, ZHIHU) @@ -139,16 +138,6 @@ def process_ccmath_html(self, cc_html: str, o_html: str, math_render: BaseMathRe if ZHIHU.DOMAIN in self.cm.url and node.tag == 'span' and node.get('class') == ZHIHU.MATH: tag_script.process_zhihu_custom_tag(self.cm, math_render_type, node) - # tag = span, class 为 math-containerm, 或者 mathjax 或者 wp-katex-eq - if node.tag == 'span' and node.get('class') and ( - 'math-container' in node.get('class') or - 'mathjax' in node.get('class') or - 'wp-katex-eq' in node.get('class') or - 'x-ck12-mathEditor' in node.get('class') or - 'tex' in node.get('class') - ): - tag_common_modify.modify_tree(self.cm, math_render_type, original_html, node, parent) - # math tags if node.tag == 'math' or node.tag.endswith(':math'): # print(f"匹配到数学标签: {node.tag}") diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_math.py b/tests/llm_web_kit/extractor/html/recognizer/test_math.py index 79d010aa..37d3226f 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_math.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_math.py @@ -463,16 +463,9 @@ def test_math_recognizer(self): def test_math_recognizer_html(self): for test_case in TEST_CASES_HTML: raw_html_path = base_dir.joinpath(test_case['input'][0]) - # print('raw_html_path::::::::', raw_html_path) base_url = test_case['base_url'] raw_html = raw_html_path.read_text(encoding='utf-8') - parts = self.math_recognizer.recognize(base_url, [(html_to_element(raw_html), html_to_element(raw_html))], raw_html) - # print(parts) - # 将parts列表中第一个元素拼接保存到文件,带随机数 - # import random - # with open('parts'+str(random.randint(1, 100))+".html", 'w') as f: - # for part in parts: - # f.write(str(part[0])) + # 创建预处理器并清理隐藏元素 pre_extractor = HTMLFileFormatNoClipCleanTagsPreExtractor({}) data_json = DataJson({'html': raw_html, 'url': base_url}) @@ -485,34 +478,21 @@ def test_math_recognizer_html(self): [(html_to_element(cleaned_html), html_to_element(cleaned_html))], cleaned_html ) - # 检查行间公式抽取正确性 + + # 检查行间公式 new_parts = [] for part in parts: new_parts.append((element_to_html(part[0]), element_to_html(part[1]))) - parts = [part[0] for part in new_parts if CCTag.CC_MATH_INTERLINE in part[0]] + + interline_parts = [part[0] for part in new_parts if CCTag.CC_MATH_INTERLINE in part[0]] expect_text = base_dir.joinpath(test_case['expected']).read_text(encoding='utf-8').strip() expect_formulas = [formula for formula in expect_text.split('\n') if formula] - if len(parts) != len(expect_formulas): - print("出错样例:", test_case['input']) - print("期望公式数:", len(expect_formulas), "实际公式数:", len(parts)) - print("期望公式:", expect_formulas) - print("实际公式:", parts) - self.assertEqual(len(parts), len(expect_formulas)) - # answers = [] - for expect, part in zip(expect_formulas, parts): - a_tree = html_to_element(part) - a_result = a_tree.xpath(f'.//{CCTag.CC_MATH_INTERLINE}')[0] - answer = a_result.text.replace('\n', '').strip() - # print('part::::::::', part) - # print('expect::::::::', expect) - # print('answer::::::::', answer) - # answers.append(answer) - self.assertEqual(expect, answer) - # print('answers::::::::', answers) - # self.write_to_html(answers, test_case['input'][0]) - # 检查行内公式抽取正确性 + + print(f"\n测试用例: {test_case['input']}") + print(f"行间公式 - 期望: {len(expect_formulas)}, 实际: {len(interline_parts)}") + + # 检查行内公式 if test_case.get('expected_inline', None): - # 从所有parts中提取所有行内公式 all_inline_formulas = [] for part in new_parts: if CCTag.CC_MATH_INLINE in part[0]: @@ -521,15 +501,23 @@ def test_math_recognizer_html(self): for inline_elem in inline_elements: formula = inline_elem.text.replace('\n', '').strip() all_inline_formulas.append(formula) - # print(f"Found {len(all_inline_formulas)} total inline formulas") - # print(f"Total new_parts: {len(new_parts)}") expect_inline_text = base_dir.joinpath(test_case['expected_inline']).read_text(encoding='utf-8').strip() expect_inline_formulas = [formula for formula in expect_inline_text.split('\n') if formula] - # print(f"Expected {len(expect_inline_formulas)} inline formulas") + print(f"行内公式 - 期望: {len(expect_inline_formulas)}, 实际: {len(all_inline_formulas)}") + # 打印所有实际提取的行内公式 + print("\n所有实际提取的行内公式:") + for i, formula in enumerate(all_inline_formulas, 1): + print(f" {i}. {formula}") + # 打印所有期望的行内公式 + print("\n所有期望的行内公式:") + for i, formula in enumerate(expect_inline_formulas, 1): + print(f" {i}. {formula}") + # 找出差异 + print("\n差异分析:") + if len(all_inline_formulas) != len(expect_inline_formulas): + print("数量不匹配!") self.assertEqual(len(all_inline_formulas), len(expect_inline_formulas)) for expect, formula in zip(expect_inline_formulas, all_inline_formulas): - # print('inline expect::::::::', expect) - # print('inline answer::::::::', formula) self.assertEqual(expect, formula) def write_to_html(self, answers, file_name):