From a564e16518081cb0ff4d6f0e57096da154a686f9 Mon Sep 17 00:00:00 2001 From: Yanggq <1041206149@qq.com> Date: Wed, 22 Oct 2025 16:06:48 +0800 Subject: [PATCH 1/3] =?UTF-8?q?=E7=AE=80=E5=8C=96math=E6=8A=BD=E5=8F=96?= =?UTF-8?q?=E9=80=BB=E8=BE=91?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../html/recognizer/cc_math/common.py | 82 ++++++------------- .../recognizer/cc_math/tag_common_modify.py | 15 +--- .../extractor/html/recognizer/ccmath.py | 10 +-- ...e_1_span-math-container_latex_mathjax.html | 2 +- ...math-container_latex_mathjax_inline_1.html | 3 +- .../extractor/html/recognizer/test_math.py | 39 +++++++-- .../input/assets/content_json.json | 2 +- 7 files changed, 67 insertions(+), 86 deletions(-) diff --git a/llm_web_kit/extractor/html/recognizer/cc_math/common.py b/llm_web_kit/extractor/html/recognizer/cc_math/common.py index d9178840..9dea5eba 100644 --- a/llm_web_kit/extractor/html/recognizer/cc_math/common.py +++ b/llm_web_kit/extractor/html/recognizer/cc_math/common.py @@ -20,7 +20,6 @@ from llm_web_kit.extractor.html.recognizer.recognizer import CCTag from llm_web_kit.libs.doc_element_type import DocElementType from llm_web_kit.libs.html_utils import (build_cc_element, element_to_html, - element_to_html_unescaped, html_to_element) from llm_web_kit.libs.text_utils import normalize_ctl_text @@ -538,62 +537,35 @@ def fix_mathml_superscript(self, mathml_str): parent.remove(msup) return etree.tostring(root, encoding='unicode', pretty_print=True) - def replace_math(self, new_tag: str, math_type: str, math_render: str, node: HtmlElement, func, asciimath_wrap: bool = False) -> HtmlElement: - # pattern re数学公式匹配 func 公式预处理 默认不处理 - # ascii公式处理逻辑转移到mathjax渲染器方案中 - if asciimath_wrap: - return node + def replace_math(self, new_tag: str, math_type: str, math_render: str, node: HtmlElement, func) -> HtmlElement: + """替换数学公式节点. + + Args: + new_tag: 新标签名称(CCMATH_INLINE 或 CCMATH_INTERLINE) + math_type: 数学公式类型(MathType.LATEX 等) + math_render: 渲染器类型 + node: 当前HTML节点 + func: 公式预处理函数(可选) - pattern_type = MATH_TYPE_PATTERN.DISPLAYMATH if new_tag == CCMATH_INTERLINE else MATH_TYPE_PATTERN.INLINEMATH - original_text = node.text or '' - - def is_ccmath_wrapped(match_text, original_text: str) -> bool: - if not match_text or not original_text: - return False - start_idx = match_text.start() - end_idx = match_text.end() - before_match = original_text[:start_idx].strip() - after_match = original_text[end_idx:].strip() - if 'ccmath' in before_match and 'ccmath' in after_match: - return True - if pattern_type == MATH_TYPE_PATTERN.DISPLAYMATH: - for start, end in MATH_TYPE_TO_DISPLAY[MathType.LATEX][MATH_TYPE_PATTERN.INLINEMATH]: - if start in before_match and end in after_match: - return True - return False - - def process(match_text): - try: - match = match_text.group(0) - if is_ccmath_wrapped(match_text, original_text): - return match - wrapped_text = func(match) if func else match - # html保留原始的,而不是传入修改过的wrapped_text - original_wrapped = wrapped_text - wrapped_text = self.wrap_math_md(wrapped_text) - if not wrapped_text: - return match - new_span = build_cc_element( - html_tag_name=new_tag, - text=wrapped_text, - tail='', - type=math_type, - by=math_render, - html=original_wrapped - ) - except Exception: - return match - return element_to_html(new_span) - try: - for start, end in MATH_TYPE_TO_DISPLAY[math_type][pattern_type]: - pattern = f'{re.escape(start)}.*?{re.escape(end)}'.replace(r'\.\*\?', '.*?') - regex = re.compile(pattern, re.DOTALL) - original_text = re.sub(regex, process, original_text) - except Exception: - node.text = self.build_cc_exception_tag(original_text, math_type, math_render) + Returns: + 处理后的节点 + """ + text = node.text + if not text or not text_strip(text): return node - node.text = original_text - return html_to_element(element_to_html_unescaped(node)) + + formula = self.wrap_math_md(text) + # 构建新节点 + new_span = build_cc_element( + html_tag_name=new_tag, + text=formula, + tail=text_strip(node.tail), + type=math_type, + by=math_render, + html=element_to_html(node) + ) + + return new_span def build_cc_exception_tag(self, text, math_type, math_render) -> str: return element_to_html(build_cc_element( diff --git a/llm_web_kit/extractor/html/recognizer/cc_math/tag_common_modify.py b/llm_web_kit/extractor/html/recognizer/cc_math/tag_common_modify.py index 260d4b80..6801cf43 100644 --- a/llm_web_kit/extractor/html/recognizer/cc_math/tag_common_modify.py +++ b/llm_web_kit/extractor/html/recognizer/cc_math/tag_common_modify.py @@ -2,12 +2,11 @@ from llm_web_kit.exception.exception import HtmlMathRecognizerException from llm_web_kit.extractor.html.recognizer.cc_math.common import (CCMATH, - MathType, text_strip) from llm_web_kit.libs.html_utils import replace_element -def modify_tree(cm: CCMATH, math_render: str, o_html: str, node: HtmlElement, parent: HtmlElement): +def modify_tree(cm: CCMATH, math_render: str, o_html: str, node: HtmlElement): try: text = node.text tag_math_type_list = cm.get_equation_type(o_html) @@ -18,17 +17,9 @@ def modify_tree(cm: CCMATH, math_render: str, o_html: str, node: HtmlElement, pa tail = node.tail new_span.tail = None for new_tag, math_type in tag_math_type_list: - asciimath_wrap = True if math_type == MathType.ASCIIMATH else False - new_span = cm.replace_math(new_tag, math_type, math_render, new_span, None,asciimath_wrap) + new_span = cm.replace_math(new_tag, math_type, math_render, new_span, None) new_span.tail = tail replace_element(node,new_span) - # if math_type == MathType.ASCIIMATH: - # text = cm.wrap_math_md(text) - # text = cm.extract_asciimath(text) - # new_span = build_cc_element(html_tag_name=new_tag, text=cm.wrap_math_md(text), tail=text_strip(node.tail), type=math_type, by=math_render, html=o_html) - # replace_element(node, new_span) - # elif math_type == MathType.LATEX: - # new_span = build_cc_element(html_tag_name=new_tag, text=cm.wrap_math_md(text), tail=text_strip(node.tail), type=math_type, by=math_render, html=o_html) - # replace_element(node, new_span) + except Exception as e: raise HtmlMathRecognizerException(f'Error processing script mathtex: {e}') diff --git a/llm_web_kit/extractor/html/recognizer/ccmath.py b/llm_web_kit/extractor/html/recognizer/ccmath.py index 780ba583..1a2d0958 100644 --- a/llm_web_kit/extractor/html/recognizer/ccmath.py +++ b/llm_web_kit/extractor/html/recognizer/ccmath.py @@ -140,14 +140,8 @@ def process_ccmath_html(self, cc_html: str, o_html: str, math_render: BaseMathRe tag_script.process_zhihu_custom_tag(self.cm, math_render_type, node) # tag = span, class 为 math-containerm, 或者 mathjax 或者 wp-katex-eq - if node.tag == 'span' and node.get('class') and ( - 'math-container' in node.get('class') or - 'mathjax' in node.get('class') or - 'wp-katex-eq' in node.get('class') or - 'x-ck12-mathEditor' in node.get('class') or - 'tex' in node.get('class') - ): - tag_common_modify.modify_tree(self.cm, math_render_type, original_html, node, parent) + if node.tag == 'span' and node.get('class') and 'math-container' in node.get('class'): + tag_common_modify.modify_tree(self.cm, math_render_type, original_html, node) # math tags if node.tag == 'math' or node.tag.endswith(':math'): diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/stackexchange_1_span-math-container_latex_mathjax.html b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/stackexchange_1_span-math-container_latex_mathjax.html index c509d756..ea085333 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/stackexchange_1_span-math-container_latex_mathjax.html +++ b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/stackexchange_1_span-math-container_latex_mathjax.html @@ -1406,7 +1406,7 @@

$\begingroup$
-

When gravitational waves reach Earth, they usually give a strain of $\delta L \over L$$=10^{-21}$.

+

When gravitational waves reach Earth, they usually give a strain of $\delta L \over L = 10^{-21}$.

If we assume that they scale with the distance the same way electromagnetic waves do, thus following the inverse square law, we can get an estimate of the distance needed.

LIGO detected the first merger of black holes at 1.3 billion light years away.

If we would get to 1 light year away from the merger, under the above hypothesis we would get a strain of $10^{-21} \times (1.3 \cdot 10^9)^2=10^{-3}$. This means that on 1 meter length we would notice a 1 mm oscillation, which is something we are able to sense.

diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/stackexchange_1_span-math-container_latex_mathjax_inline_1.html b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/stackexchange_1_span-math-container_latex_mathjax_inline_1.html index 771c4c49..b8fc706a 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/stackexchange_1_span-math-container_latex_mathjax_inline_1.html +++ b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/stackexchange_1_span-math-container_latex_mathjax_inline_1.html @@ -7,8 +7,7 @@ M_{\odot} M_{\odot} M_{\odot} -\delta L \over L -=10^{-21} +\delta L \over L = 10^{-21} 10^{-21} \times (1.3 \cdot 10^9)^2=10^{-3} 1/r 10^{-21} \times (1.3 \cdot 10^9)=10^{-9} diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_math.py b/tests/llm_web_kit/extractor/html/recognizer/test_math.py index 79d010aa..679690a0 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_math.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_math.py @@ -511,6 +511,7 @@ def test_math_recognizer_html(self): # print('answers::::::::', answers) # self.write_to_html(answers, test_case['input'][0]) # 检查行内公式抽取正确性 + # 检查行内公式抽取正确性 if test_case.get('expected_inline', None): # 从所有parts中提取所有行内公式 all_inline_formulas = [] @@ -521,16 +522,40 @@ def test_math_recognizer_html(self): for inline_elem in inline_elements: formula = inline_elem.text.replace('\n', '').strip() all_inline_formulas.append(formula) - # print(f"Found {len(all_inline_formulas)} total inline formulas") - # print(f"Total new_parts: {len(new_parts)}") + expect_inline_text = base_dir.joinpath(test_case['expected_inline']).read_text(encoding='utf-8').strip() expect_inline_formulas = [formula for formula in expect_inline_text.split('\n') if formula] - # print(f"Expected {len(expect_inline_formulas)} inline formulas") + + # 如果数量不匹配,输出详细信息 + if len(all_inline_formulas) != len(expect_inline_formulas): + print("\n" + "=" * 80) + print("行内公式抽取出错!") + print("=" * 80) + print(f"出错样例: {test_case['input']}") + print(f"预期公式数: {len(expect_inline_formulas)}") + print(f"实际公式数: {len(all_inline_formulas)}") + print("\n预期公式列表:") + for i, formula in enumerate(expect_inline_formulas, 1): + print(f" {i}. {formula}") + print("\n实际公式列表:") + for i, formula in enumerate(all_inline_formulas, 1): + print(f" {i}. {formula}") + + # 找出差异 + print("\n差异分析:") + if len(all_inline_formulas) > len(expect_inline_formulas): + print(f"多提取了 {len(all_inline_formulas) - len(expect_inline_formulas)} 个公式:") + extra_formulas = all_inline_formulas[len(expect_inline_formulas):] + for i, formula in enumerate(extra_formulas, 1): + print(f" {i}. {formula}") + else: + print(f"少提取了 {len(expect_inline_formulas) - len(all_inline_formulas)} 个公式:") + missing_formulas = expect_inline_formulas[len(all_inline_formulas):] + for i, formula in enumerate(missing_formulas, 1): + print(f" {i}. {formula}") + print("=" * 80 + "\n") + self.assertEqual(len(all_inline_formulas), len(expect_inline_formulas)) - for expect, formula in zip(expect_inline_formulas, all_inline_formulas): - # print('inline expect::::::::', expect) - # print('inline answer::::::::', formula) - self.assertEqual(expect, formula) def write_to_html(self, answers, file_name): file_name = file_name.split('.')[0] diff --git a/tests/llm_web_kit/input/assets/content_json.json b/tests/llm_web_kit/input/assets/content_json.json index d8f038a2..34236da4 100644 --- a/tests/llm_web_kit/input/assets/content_json.json +++ b/tests/llm_web_kit/input/assets/content_json.json @@ -168,7 +168,7 @@ "content": { "math_content": "a^2 + b^2 = c^2", "math_type": "latex", - "by": "None" + "by": "mathjax_mock" } }, { From 08daa672a20b4cb8033798fef92998aeb797c69d Mon Sep 17 00:00:00 2001 From: Yanggq <1041206149@qq.com> Date: Wed, 22 Oct 2025 16:17:11 +0800 Subject: [PATCH 2/3] =?UTF-8?q?=E7=AE=80=E5=8C=96math=E6=8A=BD=E5=8F=96?= =?UTF-8?q?=E9=80=BB=E8=BE=91?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../html/recognizer/cc_math/common.py | 18 +++++++++++------- .../recognizer/cc_math/tag_common_modify.py | 2 +- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/llm_web_kit/extractor/html/recognizer/cc_math/common.py b/llm_web_kit/extractor/html/recognizer/cc_math/common.py index 9dea5eba..c1ce7377 100644 --- a/llm_web_kit/extractor/html/recognizer/cc_math/common.py +++ b/llm_web_kit/extractor/html/recognizer/cc_math/common.py @@ -537,7 +537,7 @@ def fix_mathml_superscript(self, mathml_str): parent.remove(msup) return etree.tostring(root, encoding='unicode', pretty_print=True) - def replace_math(self, new_tag: str, math_type: str, math_render: str, node: HtmlElement, func) -> HtmlElement: + def replace_math(self, new_tag: str, math_type: str, math_render: str, node: HtmlElement) -> HtmlElement: """替换数学公式节点. Args: @@ -593,12 +593,16 @@ def build_cc_exception_tag(self, text, math_type, math_render) -> str: print(cm.wrap_math_md(r'$$a^2 + b^2 = c^2$$')) print(cm.wrap_math_md(r'\(a^2 + b^2 = c^2\)')) print(cm.extract_asciimath('x=(-b +- sqrt(b^2 - 4ac))/(2a)')) - print(cm.replace_math('ccmath-interline','asciimath','',html_to_element(r'

`x=(-b +- sqrt(b^2 - 4ac))/(2a)`

'),None,True)) - print(cm.replace_math('ccmath-interline','asciimath','',html_to_element(r'

like this: \`E=mc^2\`

'),None,True)) - print(cm.replace_math('ccmath-interline','asciimath','',html_to_element(r'

A `3xx3` matrix,`((1,2,3),(4,5,6),(7,8,9))`, and a `2xx1` matrix, or vector, `((1),(0))`.

'),None,True)) - print(cm.replace_math('ccmath-interline','asciimath','',html_to_element(r'

`(x+1)/x^2``1/3245`

'),None,True)) - print(cm.replace_math('ccmath-interline','latex','',html_to_element(r'

start $$f(a,b,c) = (a^2+b^2+c^2)^3$$end

'),None,False)) - print(cm.replace_math('ccmath-inline','latex','',html_to_element(r'

\( \newcommand{\norm}[1]{\| #1 \|}\)

'),None,False)) + print(cm.replace_math('ccmath-interline','asciimath','', html_to_element(r'

`x=(-b +- sqrt(b^2 - 4ac))/(2a)`

'), + True)) + print(cm.replace_math('ccmath-interline','asciimath','', html_to_element(r'

like this: \`E=mc^2\`

'), True)) + print(cm.replace_math('ccmath-interline','asciimath','', html_to_element(r'

A `3xx3` matrix,`((1,2,3),(4,5,6),(7,8,9))`, and a `2xx1` matrix, or vector, `((1),(0))`.

'), + True)) + print(cm.replace_math('ccmath-interline','asciimath','', html_to_element(r'

`(x+1)/x^2``1/3245`

'), True)) + print(cm.replace_math('ccmath-interline','latex','', html_to_element(r'

start $$f(a,b,c) = (a^2+b^2+c^2)^3$$end

'), + False)) + print(cm.replace_math('ccmath-inline','latex','', html_to_element(r'

\( \newcommand{\norm}[1]{\| #1 \|}\)

'), + False)) # cm.url = 'mathhelpforum.com' # print(cm.wrap_math_md_custom(r'
\begin{align} a^2+b=c\end{align}\
')) # print(cm.wrap_math_md_custom(r'
dz=\frac{1}{2}\frac{dx}{\cos ^2 x}
')) diff --git a/llm_web_kit/extractor/html/recognizer/cc_math/tag_common_modify.py b/llm_web_kit/extractor/html/recognizer/cc_math/tag_common_modify.py index 6801cf43..aaa4a9de 100644 --- a/llm_web_kit/extractor/html/recognizer/cc_math/tag_common_modify.py +++ b/llm_web_kit/extractor/html/recognizer/cc_math/tag_common_modify.py @@ -17,7 +17,7 @@ def modify_tree(cm: CCMATH, math_render: str, o_html: str, node: HtmlElement): tail = node.tail new_span.tail = None for new_tag, math_type in tag_math_type_list: - new_span = cm.replace_math(new_tag, math_type, math_render, new_span, None) + new_span = cm.replace_math(new_tag, math_type, math_render, new_span) new_span.tail = tail replace_element(node,new_span) From 1ee8049828f0b162929d123636437b0cf24d1825 Mon Sep 17 00:00:00 2001 From: Yanggq <1041206149@qq.com> Date: Wed, 22 Oct 2025 16:27:34 +0800 Subject: [PATCH 3/3] =?UTF-8?q?=E7=AE=80=E5=8C=96math=E6=8A=BD=E5=8F=96?= =?UTF-8?q?=E9=80=BB=E8=BE=91?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../html/recognizer/cc_math/common.py | 10 ----- .../extractor/html/recognizer/test_math.py | 39 ++++--------------- 2 files changed, 7 insertions(+), 42 deletions(-) diff --git a/llm_web_kit/extractor/html/recognizer/cc_math/common.py b/llm_web_kit/extractor/html/recognizer/cc_math/common.py index c1ce7377..1d5ef3a2 100644 --- a/llm_web_kit/extractor/html/recognizer/cc_math/common.py +++ b/llm_web_kit/extractor/html/recognizer/cc_math/common.py @@ -593,16 +593,6 @@ def build_cc_exception_tag(self, text, math_type, math_render) -> str: print(cm.wrap_math_md(r'$$a^2 + b^2 = c^2$$')) print(cm.wrap_math_md(r'\(a^2 + b^2 = c^2\)')) print(cm.extract_asciimath('x=(-b +- sqrt(b^2 - 4ac))/(2a)')) - print(cm.replace_math('ccmath-interline','asciimath','', html_to_element(r'

`x=(-b +- sqrt(b^2 - 4ac))/(2a)`

'), - True)) - print(cm.replace_math('ccmath-interline','asciimath','', html_to_element(r'

like this: \`E=mc^2\`

'), True)) - print(cm.replace_math('ccmath-interline','asciimath','', html_to_element(r'

A `3xx3` matrix,`((1,2,3),(4,5,6),(7,8,9))`, and a `2xx1` matrix, or vector, `((1),(0))`.

'), - True)) - print(cm.replace_math('ccmath-interline','asciimath','', html_to_element(r'

`(x+1)/x^2``1/3245`

'), True)) - print(cm.replace_math('ccmath-interline','latex','', html_to_element(r'

start $$f(a,b,c) = (a^2+b^2+c^2)^3$$end

'), - False)) - print(cm.replace_math('ccmath-inline','latex','', html_to_element(r'

\( \newcommand{\norm}[1]{\| #1 \|}\)

'), - False)) # cm.url = 'mathhelpforum.com' # print(cm.wrap_math_md_custom(r'
\begin{align} a^2+b=c\end{align}\
')) # print(cm.wrap_math_md_custom(r'
dz=\frac{1}{2}\frac{dx}{\cos ^2 x}
')) diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_math.py b/tests/llm_web_kit/extractor/html/recognizer/test_math.py index 679690a0..79d010aa 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_math.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_math.py @@ -511,7 +511,6 @@ def test_math_recognizer_html(self): # print('answers::::::::', answers) # self.write_to_html(answers, test_case['input'][0]) # 检查行内公式抽取正确性 - # 检查行内公式抽取正确性 if test_case.get('expected_inline', None): # 从所有parts中提取所有行内公式 all_inline_formulas = [] @@ -522,40 +521,16 @@ def test_math_recognizer_html(self): for inline_elem in inline_elements: formula = inline_elem.text.replace('\n', '').strip() all_inline_formulas.append(formula) - + # print(f"Found {len(all_inline_formulas)} total inline formulas") + # print(f"Total new_parts: {len(new_parts)}") expect_inline_text = base_dir.joinpath(test_case['expected_inline']).read_text(encoding='utf-8').strip() expect_inline_formulas = [formula for formula in expect_inline_text.split('\n') if formula] - - # 如果数量不匹配,输出详细信息 - if len(all_inline_formulas) != len(expect_inline_formulas): - print("\n" + "=" * 80) - print("行内公式抽取出错!") - print("=" * 80) - print(f"出错样例: {test_case['input']}") - print(f"预期公式数: {len(expect_inline_formulas)}") - print(f"实际公式数: {len(all_inline_formulas)}") - print("\n预期公式列表:") - for i, formula in enumerate(expect_inline_formulas, 1): - print(f" {i}. {formula}") - print("\n实际公式列表:") - for i, formula in enumerate(all_inline_formulas, 1): - print(f" {i}. {formula}") - - # 找出差异 - print("\n差异分析:") - if len(all_inline_formulas) > len(expect_inline_formulas): - print(f"多提取了 {len(all_inline_formulas) - len(expect_inline_formulas)} 个公式:") - extra_formulas = all_inline_formulas[len(expect_inline_formulas):] - for i, formula in enumerate(extra_formulas, 1): - print(f" {i}. {formula}") - else: - print(f"少提取了 {len(expect_inline_formulas) - len(all_inline_formulas)} 个公式:") - missing_formulas = expect_inline_formulas[len(all_inline_formulas):] - for i, formula in enumerate(missing_formulas, 1): - print(f" {i}. {formula}") - print("=" * 80 + "\n") - + # print(f"Expected {len(expect_inline_formulas)} inline formulas") self.assertEqual(len(all_inline_formulas), len(expect_inline_formulas)) + for expect, formula in zip(expect_inline_formulas, all_inline_formulas): + # print('inline expect::::::::', expect) + # print('inline answer::::::::', formula) + self.assertEqual(expect, formula) def write_to_html(self, answers, file_name): file_name = file_name.split('.')[0]