diff --git a/llm_web_kit/extractor/html/recognizer/cc_math/common.py b/llm_web_kit/extractor/html/recognizer/cc_math/common.py
index d9178840..1d5ef3a2 100644
--- a/llm_web_kit/extractor/html/recognizer/cc_math/common.py
+++ b/llm_web_kit/extractor/html/recognizer/cc_math/common.py
@@ -20,7 +20,6 @@
from llm_web_kit.extractor.html.recognizer.recognizer import CCTag
from llm_web_kit.libs.doc_element_type import DocElementType
from llm_web_kit.libs.html_utils import (build_cc_element, element_to_html,
- element_to_html_unescaped,
html_to_element)
from llm_web_kit.libs.text_utils import normalize_ctl_text
@@ -538,62 +537,35 @@ def fix_mathml_superscript(self, mathml_str):
parent.remove(msup)
return etree.tostring(root, encoding='unicode', pretty_print=True)
- def replace_math(self, new_tag: str, math_type: str, math_render: str, node: HtmlElement, func, asciimath_wrap: bool = False) -> HtmlElement:
- # pattern re数学公式匹配 func 公式预处理 默认不处理
- # ascii公式处理逻辑转移到mathjax渲染器方案中
- if asciimath_wrap:
- return node
+ def replace_math(self, new_tag: str, math_type: str, math_render: str, node: HtmlElement) -> HtmlElement:
+ """替换数学公式节点.
+
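+ Args:
+ new_tag: tag name of the new element (CCMATH_INLINE or CCMATH_INTERLINE)
+ math_type: math formula type (MathType.LATEX, etc.)
+ math_render: renderer type
+ node: current HTML node; its text is passed to wrap_math_md as the formula,
+ and the node is returned as-is when its text is empty or text_strip(text) is falsy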
- pattern_type = MATH_TYPE_PATTERN.DISPLAYMATH if new_tag == CCMATH_INTERLINE else MATH_TYPE_PATTERN.INLINEMATH
- original_text = node.text or ''
-
- def is_ccmath_wrapped(match_text, original_text: str) -> bool:
- if not match_text or not original_text:
- return False
- start_idx = match_text.start()
- end_idx = match_text.end()
- before_match = original_text[:start_idx].strip()
- after_match = original_text[end_idx:].strip()
- if 'ccmath' in before_match and 'ccmath' in after_match:
- return True
- if pattern_type == MATH_TYPE_PATTERN.DISPLAYMATH:
- for start, end in MATH_TYPE_TO_DISPLAY[MathType.LATEX][MATH_TYPE_PATTERN.INLINEMATH]:
- if start in before_match and end in after_match:
- return True
- return False
-
- def process(match_text):
- try:
- match = match_text.group(0)
- if is_ccmath_wrapped(match_text, original_text):
- return match
- wrapped_text = func(match) if func else match
- # html保留原始的,而不是传入修改过的wrapped_text
- original_wrapped = wrapped_text
- wrapped_text = self.wrap_math_md(wrapped_text)
- if not wrapped_text:
- return match
- new_span = build_cc_element(
- html_tag_name=new_tag,
- text=wrapped_text,
- tail='',
- type=math_type,
- by=math_render,
- html=original_wrapped
- )
- except Exception:
- return match
- return element_to_html(new_span)
- try:
- for start, end in MATH_TYPE_TO_DISPLAY[math_type][pattern_type]:
- pattern = f'{re.escape(start)}.*?{re.escape(end)}'.replace(r'\.\*\?', '.*?')
- regex = re.compile(pattern, re.DOTALL)
- original_text = re.sub(regex, process, original_text)
- except Exception:
- node.text = self.build_cc_exception_tag(original_text, math_type, math_render)
+ Returns:
+ The new element built by build_cc_element, or the original node unchanged when its text is empty
+ """
+ text = node.text
+ if not text or not text_strip(text):
return node
- node.text = original_text
- return html_to_element(element_to_html_unescaped(node))
+
+ formula = self.wrap_math_md(text)
+ # 构建新节点
+ new_span = build_cc_element(
+ html_tag_name=new_tag,
+ text=formula,
+ tail=text_strip(node.tail),
+ type=math_type,
+ by=math_render,
+ html=element_to_html(node)
+ )
+
+ return new_span
def build_cc_exception_tag(self, text, math_type, math_render) -> str:
return element_to_html(build_cc_element(
@@ -621,12 +593,6 @@ def build_cc_exception_tag(self, text, math_type, math_render) -> str:
print(cm.wrap_math_md(r'$$a^2 + b^2 = c^2$$'))
print(cm.wrap_math_md(r'\(a^2 + b^2 = c^2\)'))
print(cm.extract_asciimath('x=(-b +- sqrt(b^2 - 4ac))/(2a)'))
- print(cm.replace_math('ccmath-interline','asciimath','',html_to_element(r'
`x=(-b +- sqrt(b^2 - 4ac))/(2a)`
'),None,True))
- print(cm.replace_math('ccmath-interline','asciimath','',html_to_element(r'like this: \`E=mc^2\`
'),None,True))
- print(cm.replace_math('ccmath-interline','asciimath','',html_to_element(r'A `3xx3` matrix,`((1,2,3),(4,5,6),(7,8,9))`, and a `2xx1` matrix, or vector, `((1),(0))`.
'),None,True))
- print(cm.replace_math('ccmath-interline','asciimath','',html_to_element(r'`(x+1)/x^2``1/3245`
'),None,True))
- print(cm.replace_math('ccmath-interline','latex','',html_to_element(r'start $$f(a,b,c) = (a^2+b^2+c^2)^3$$end
'),None,False))
- print(cm.replace_math('ccmath-inline','latex','',html_to_element(r'\( \newcommand{\norm}[1]{\| #1 \|}\)
'),None,False))
# cm.url = 'mathhelpforum.com'
# print(cm.wrap_math_md_custom(r'
\begin{align} a^2+b=c\end{align}\
'))
# print(cm.wrap_math_md_custom(r'
dz=\frac{1}{2}\frac{dx}{\cos ^2 x}
'))
diff --git a/llm_web_kit/extractor/html/recognizer/cc_math/tag_common_modify.py b/llm_web_kit/extractor/html/recognizer/cc_math/tag_common_modify.py
index 260d4b80..aaa4a9de 100644
--- a/llm_web_kit/extractor/html/recognizer/cc_math/tag_common_modify.py
+++ b/llm_web_kit/extractor/html/recognizer/cc_math/tag_common_modify.py
@@ -2,12 +2,11 @@
from llm_web_kit.exception.exception import HtmlMathRecognizerException
from llm_web_kit.extractor.html.recognizer.cc_math.common import (CCMATH,
- MathType,
text_strip)
from llm_web_kit.libs.html_utils import replace_element
-def modify_tree(cm: CCMATH, math_render: str, o_html: str, node: HtmlElement, parent: HtmlElement):
+def modify_tree(cm: CCMATH, math_render: str, o_html: str, node: HtmlElement):
try:
text = node.text
tag_math_type_list = cm.get_equation_type(o_html)
@@ -18,17 +17,9 @@ def modify_tree(cm: CCMATH, math_render: str, o_html: str, node: HtmlElement, pa
tail = node.tail
new_span.tail = None
for new_tag, math_type in tag_math_type_list:
- asciimath_wrap = True if math_type == MathType.ASCIIMATH else False
- new_span = cm.replace_math(new_tag, math_type, math_render, new_span, None,asciimath_wrap)
+ new_span = cm.replace_math(new_tag, math_type, math_render, new_span)
new_span.tail = tail
replace_element(node,new_span)
- # if math_type == MathType.ASCIIMATH:
- # text = cm.wrap_math_md(text)
- # text = cm.extract_asciimath(text)
- # new_span = build_cc_element(html_tag_name=new_tag, text=cm.wrap_math_md(text), tail=text_strip(node.tail), type=math_type, by=math_render, html=o_html)
- # replace_element(node, new_span)
- # elif math_type == MathType.LATEX:
- # new_span = build_cc_element(html_tag_name=new_tag, text=cm.wrap_math_md(text), tail=text_strip(node.tail), type=math_type, by=math_render, html=o_html)
- # replace_element(node, new_span)
+
except Exception as e:
raise HtmlMathRecognizerException(f'Error processing script mathtex: {e}')
diff --git a/llm_web_kit/extractor/html/recognizer/ccmath.py b/llm_web_kit/extractor/html/recognizer/ccmath.py
index 780ba583..1a2d0958 100644
--- a/llm_web_kit/extractor/html/recognizer/ccmath.py
+++ b/llm_web_kit/extractor/html/recognizer/ccmath.py
@@ -140,14 +140,8 @@ def process_ccmath_html(self, cc_html: str, o_html: str, math_render: BaseMathRe
tag_script.process_zhihu_custom_tag(self.cm, math_render_type, node)
# tag = span, class 为 math-containerm, 或者 mathjax 或者 wp-katex-eq
- if node.tag == 'span' and node.get('class') and (
- 'math-container' in node.get('class') or
- 'mathjax' in node.get('class') or
- 'wp-katex-eq' in node.get('class') or
- 'x-ck12-mathEditor' in node.get('class') or
- 'tex' in node.get('class')
- ):
- tag_common_modify.modify_tree(self.cm, math_render_type, original_html, node, parent)
+ if node.tag == 'span' and node.get('class') and 'math-container' in node.get('class'):
+ tag_common_modify.modify_tree(self.cm, math_render_type, original_html, node)
# math tags
if node.tag == 'math' or node.tag.endswith(':math'):
diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/stackexchange_1_span-math-container_latex_mathjax.html b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/stackexchange_1_span-math-container_latex_mathjax.html
index c509d756..ea085333 100644
--- a/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/stackexchange_1_span-math-container_latex_mathjax.html
+++ b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/stackexchange_1_span-math-container_latex_mathjax.html
@@ -1406,7 +1406,7 @@
$\begingroup$
-
When gravitational waves reach Earth, they usually give a strain of $\delta L \over L$$=10^{-21}$.
+
When gravitational waves reach Earth, they usually give a strain of $\delta L \over L = 10^{-21}$.
If we assume that they scale with the distance the same way electromagnetic waves do, thus following the inverse square law, we can get an estimate of the distance needed.
LIGO detected the first merger of black holes at 1.3 billion light years away.
If we would get to 1 light year away from the merger, under the above hypothesis we would get a strain of $10^{-21} \times (1.3 \cdot 10^9)^2=10^{-3}$. This means that on 1 meter length we would notice a 1 mm oscillation, which is something we are able to sense.
diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/stackexchange_1_span-math-container_latex_mathjax_inline_1.html b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/stackexchange_1_span-math-container_latex_mathjax_inline_1.html
index 771c4c49..b8fc706a 100644
--- a/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/stackexchange_1_span-math-container_latex_mathjax_inline_1.html
+++ b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/stackexchange_1_span-math-container_latex_mathjax_inline_1.html
@@ -7,8 +7,7 @@
M_{\odot}
M_{\odot}
M_{\odot}
-\delta L \over L
-=10^{-21}
+\delta L \over L = 10^{-21}
10^{-21} \times (1.3 \cdot 10^9)^2=10^{-3}
1/r
10^{-21} \times (1.3 \cdot 10^9)=10^{-9}
diff --git a/tests/llm_web_kit/input/assets/content_json.json b/tests/llm_web_kit/input/assets/content_json.json
index d8f038a2..34236da4 100644
--- a/tests/llm_web_kit/input/assets/content_json.json
+++ b/tests/llm_web_kit/input/assets/content_json.json
@@ -168,7 +168,7 @@
"content": {
"math_content": "a^2 + b^2 = c^2",
"math_type": "latex",
- "by": "None"
+ "by": "mathjax_mock"
}
},
{