Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 9 additions & 10 deletions llm_web_kit/extractor/html/recognizer/ccmath.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,7 @@

from llm_web_kit.exception.exception import (
HtmlMathMathjaxRenderRecognizerException, HtmlMathRecognizerException)
from llm_web_kit.extractor.html.recognizer.cc_math import (tag_common_modify,
tag_img, tag_math,
from llm_web_kit.extractor.html.recognizer.cc_math import (tag_img, tag_math,
tag_mjx, tag_script)
from llm_web_kit.extractor.html.recognizer.cc_math.common import (CCMATH, CSDN,
ZHIHU)
Expand Down Expand Up @@ -140,14 +139,14 @@ def process_ccmath_html(self, cc_html: str, o_html: str, math_render: BaseMathRe
tag_script.process_zhihu_custom_tag(self.cm, math_render_type, node)

# tag = span, class 为 math-container, 或者 mathjax 或者 wp-katex-eq
if node.tag == 'span' and node.get('class') and (
'math-container' in node.get('class') or
'mathjax' in node.get('class') or
'wp-katex-eq' in node.get('class') or
'x-ck12-mathEditor' in node.get('class') or
'tex' in node.get('class')
):
tag_common_modify.modify_tree(self.cm, math_render_type, original_html, node, parent)
# if node.tag == 'span' and node.get('class') and (
# 'math-container' in node.get('class') or
# 'mathjax' in node.get('class') or
# 'wp-katex-eq' in node.get('class') or
# 'x-ck12-mathEditor' in node.get('class') or
# 'tex' in node.get('class')
# ):
# tag_common_modify.modify_tree(self.cm, math_render_type, original_html, node, parent)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

tag_common_modify相关的代码是不是可以删除了


# math tags
if node.tag == 'math' or node.tag.endswith(':math'):
Expand Down
34 changes: 25 additions & 9 deletions llm_web_kit/libs/html_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -569,26 +569,42 @@ def groups(self):


def optimized_dollar_matching(text):
"""美元金额匹配."""
# 用于存储需要修改的位置和替换内容
"""美元金额匹配,避免误判数学公式."""
replacements = []

pattern = r'(?<!\\)(\$\d{1,3}(?:,\d{3})*(?:\.\d{1,})?)'
matches_result = re.finditer(pattern, text)
matches_result = list(re.finditer(pattern, text))

for match in matches_result:
# 获取匹配的起始和结束位置
start, end = match.start(), match.end()
# 检查匹配后的字符(如果存在)

# 检查匹配后的字符
if end < len(text):
next_char = text[end]
# 只有当后接字符不在列表中时才进行替换
if next_char not in ["^", "$", "\\", "/"]:
replacements.append((start, end, match.group()))
# 原有逻辑:排除数学运算符
if next_char in ["^", "$", "\\", "/"]:
continue

# 新增逻辑:检查后续是否存在配对的$符号
remaining_text = text[end:]
# 查找下一个未转义的$
next_dollar_match = re.search(r'(?<!\\)\$', remaining_text)

if next_dollar_match:
next_dollar_pos = end + next_dollar_match.start()
# 检查第二个$后面的字符
after_second_dollar = text[next_dollar_pos + 1:next_dollar_pos + 2]

# 如果第二个$后面不是数字或为空,则认为是公式,跳过转义
if not after_second_dollar or not after_second_dollar.isdigit():
continue

# 如果通过所有检查,则进行转义
replacements.append((start, end, match.group()))

if replacements:
text_chars = list(text)
for start, end, original_match in sorted(replacements, reverse=True):
# 只转义金额前的$符号
escaped_match = f"\\{original_match}"
text_chars[start:end] = list(escaped_match)
return ''.join(text_chars)
Expand Down
4 changes: 2 additions & 2 deletions tests/llm_web_kit/extractor/html/recognizer/test_math.py
Original file line number Diff line number Diff line change
Expand Up @@ -955,11 +955,11 @@ def test_latex_not_closed(self):

def test_dollar_sign(self):
"""美元符合与公式共存的情况."""
html_content = """<p>referring $18.1 to $18.1 the packet center $ p$ and apparently coinciding with the particle velocity</p>"""
html_content = """<p>referring $18.1 to $18.1 the packet center $ p$ and <span class="math-container">$9 + 10^9$</span> apparently coinciding with the particle velocity and $18.1</p>"""
parts = self.math_recognizer.recognize('https://www.baidu.com',
[(html_to_element(html_content), html_to_element(html_content))],
html_content)
assert element_to_html(parts[0][0]) == '<p>referring \\$18.1 to \\$18.1 the packet center <ccmath-inline type="latex" by="mathjax_mock" html="$ p$">p</ccmath-inline> and apparently coinciding with the particle velocity</p>'
assert element_to_html(parts[0][0]) == '<p>referring \\$18.1 to \\$18.1 the packet center <ccmath-inline type="latex" by="mathjax_mock" html="$ p$">p</ccmath-inline> and <span class="math-container"><ccmath-inline type="latex" by="mathjax_mock" html="$9 + 10^9$">9 + 10^9</ccmath-inline></span> apparently coinciding with the particle velocity and \\$18.1</p>'

def test_begin_end(self):
"""$begin end$的嵌套组合识别时候$$没有处理."""
Expand Down
2 changes: 1 addition & 1 deletion tests/llm_web_kit/input/assets/content_json.json
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,7 @@
"content": {
"math_content": "a^2 + b^2 = c^2",
"math_type": "latex",
"by": "None"
"by": "mathjax_mock"
}
},
{
Expand Down
2 changes: 1 addition & 1 deletion tests/llm_web_kit/input/test_datajson.py
Original file line number Diff line number Diff line change
Expand Up @@ -564,7 +564,7 @@ def test_to_plain_md(self):
self.assertNotIn('flower.mp4', mm_md)

content_json = json_loads(base_dir.joinpath('assets/content_json.json').read_text(encoding='utf-8'))
self.assertEqual(json_json['content_list'], content_json['content_list'])
assert json_json['content_list'] == content_json['content_list']

plain_md_main = extract_content_from_main_html(url, raw_html, 'plain_md')
mm_md_main = extract_content_from_html_with_magic_html(url, raw_html, 'mm_md')
Expand Down