Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 9 additions & 10 deletions llm_web_kit/extractor/html/recognizer/ccmath.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,7 @@

from llm_web_kit.exception.exception import (
HtmlMathMathjaxRenderRecognizerException, HtmlMathRecognizerException)
from llm_web_kit.extractor.html.recognizer.cc_math import (tag_common_modify,
tag_img, tag_math,
from llm_web_kit.extractor.html.recognizer.cc_math import (tag_img, tag_math,
tag_mjx, tag_script)
from llm_web_kit.extractor.html.recognizer.cc_math.common import (CCMATH, CSDN,
ZHIHU)
Expand Down Expand Up @@ -140,14 +139,14 @@ def process_ccmath_html(self, cc_html: str, o_html: str, math_render: BaseMathRe
tag_script.process_zhihu_custom_tag(self.cm, math_render_type, node)

# tag = span, class 为 math-container, 或者 mathjax 或者 wp-katex-eq
if node.tag == 'span' and node.get('class') and (
'math-container' in node.get('class') or
'mathjax' in node.get('class') or
'wp-katex-eq' in node.get('class') or
'x-ck12-mathEditor' in node.get('class') or
'tex' in node.get('class')
):
tag_common_modify.modify_tree(self.cm, math_render_type, original_html, node, parent)
# if node.tag == 'span' and node.get('class') and (
# 'math-container' in node.get('class') or
# 'mathjax' in node.get('class') or
# 'wp-katex-eq' in node.get('class') or
# 'x-ck12-mathEditor' in node.get('class') or
# 'tex' in node.get('class')
# ):
# tag_common_modify.modify_tree(self.cm, math_render_type, original_html, node, parent)

# math tags
if node.tag == 'math' or node.tag.endswith(':math'):
Expand Down
34 changes: 25 additions & 9 deletions llm_web_kit/libs/html_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -569,26 +569,42 @@ def groups(self):


def optimized_dollar_matching(text):
"""美元金额匹配."""
# 用于存储需要修改的位置和替换内容
"""美元金额匹配,避免误判数学公式."""
replacements = []

pattern = r'(?<!\\)(\$\d{1,3}(?:,\d{3})*(?:\.\d{1,})?)'
matches_result = re.finditer(pattern, text)
matches_result = list(re.finditer(pattern, text))

for match in matches_result:
# 获取匹配的起始和结束位置
start, end = match.start(), match.end()
# 检查匹配后的字符(如果存在)

# 检查匹配后的字符
if end < len(text):
next_char = text[end]
# 只有当后接字符不在列表中时才进行替换
if next_char not in ["^", "$", "\\", "/"]:
replacements.append((start, end, match.group()))
# 原有逻辑:排除数学运算符
if next_char in ["^", "$", "\\", "/"]:
continue

# 新增逻辑:检查后续是否存在配对的$符号
remaining_text = text[end:]
# 查找下一个未转义的$
next_dollar_match = re.search(r'(?<!\\)\$', remaining_text)

if next_dollar_match:
next_dollar_pos = end + next_dollar_match.start()
# 检查第二个$后面的字符
after_second_dollar = text[next_dollar_pos + 1:next_dollar_pos + 2]

# 如果第二个$后面不是数字或为空,则认为是公式,跳过转义
if not after_second_dollar or not after_second_dollar.isdigit():
continue

# 如果通过所有检查,则进行转义
replacements.append((start, end, match.group()))

if replacements:
text_chars = list(text)
for start, end, original_match in sorted(replacements, reverse=True):
# 只转义金额前的$符号
escaped_match = f"\\{original_match}"
text_chars[start:end] = list(escaped_match)
return ''.join(text_chars)
Expand Down
Loading
Loading