Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -49,3 +49,5 @@ llm_web_kit.egg-info/*
.llm-web-kit.jsonc
.llm-web-kit-pageclassify.jsonc
tests/llm_web_kit/extractor/ygq_testmd
output.md
output.jsonl
435 changes: 257 additions & 178 deletions docs/specification/output_format/content_list_spec.md

Large diffs are not rendered by default.

161 changes: 161 additions & 0 deletions llm_web_kit/api/services/request_log_service.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
"""请求日志服务.

提供请求日志的创建、更新和查询功能。
"""

import uuid
from datetime import datetime
from typing import Optional

from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession

from ..dependencies import get_logger
from ..models.db_models import RequestLog

logger = get_logger(__name__)


class RequestLogService:
    """Service for creating, updating and querying request logs.

    All methods are best-effort: when the database session is not
    configured (``session is None``) or a database error occurs, they
    log the problem and return a neutral value instead of raising, so
    request logging can never break the main request flow.
    """

    @staticmethod
    def generate_request_id() -> str:
        """Generate a unique request id (a UUID4 string)."""
        return str(uuid.uuid4())

    @staticmethod
    async def create_log(
        session: Optional[AsyncSession],
        request_id: str,
        input_type: str,
        input_html: Optional[str] = None,
        url: Optional[str] = None,
    ) -> Optional[RequestLog]:
        """Create a request log record with status ``'processing'``.

        Args:
            session: Database session; ``None`` disables logging.
            request_id: Unique request id (see :meth:`generate_request_id`).
            input_type: Input type (``html_content``, ``url`` or ``file``).
            input_html: Raw input HTML content, if any.
            url: Source URL, if any.

        Returns:
            The newly created log row, or ``None`` when the database is
            not configured or the insert failed.
        """
        if session is None:
            logger.debug("数据库会话为空,跳过日志记录")
            return None
        try:
            # Take a single timestamp so created_at and updated_at are
            # exactly equal on insert (two now() calls could differ by
            # microseconds).
            now = datetime.now()
            log = RequestLog(
                request_id=request_id,
                input_type=input_type,
                input_html=input_html,
                url=url,
                status='processing',
                created_at=now,
                updated_at=now,
            )
            session.add(log)
            await session.flush()  # write immediately so the row gets its id
            logger.info(f"创建请求日志: request_id={request_id}, input_type={input_type}, status=processing")
            return log
        except Exception as e:
            # logger.exception keeps the same message as before but also
            # records the traceback, which logger.error was dropping.
            logger.exception(f"创建请求日志失败: {e}")
            return None

    @staticmethod
    async def update_log_success(
        session: Optional[AsyncSession],
        request_id: str,
        output_markdown: Optional[str] = None,
    ) -> bool:
        """Mark the request log as successful and store its output.

        Args:
            session: Database session; ``None`` disables logging.
            request_id: Request id of the row to update.
            output_markdown: Generated Markdown output, if any.

        Returns:
            ``True`` when the row was found and updated, else ``False``.
        """
        return await RequestLogService._finalize_log(
            session,
            request_id,
            'success',
            f"更新请求日志为成功: request_id={request_id}, status=success",
            output_markdown=output_markdown,
        )

    @staticmethod
    async def update_log_failure(
        session: Optional[AsyncSession],
        request_id: str,
        error_message: str,
    ) -> bool:
        """Mark the request log as failed and store the error message.

        Args:
            session: Database session; ``None`` disables logging.
            request_id: Request id of the row to update.
            error_message: Human-readable failure description.

        Returns:
            ``True`` when the row was found and updated, else ``False``.
        """
        return await RequestLogService._finalize_log(
            session,
            request_id,
            'fail',
            f"更新请求日志为失败: request_id={request_id}, status=fail",
            error_message=error_message,
        )

    @staticmethod
    async def get_log_by_request_id(
        session: Optional[AsyncSession],
        request_id: str,
    ) -> Optional[RequestLog]:
        """Look up a log row by request id.

        Args:
            session: Database session; ``None`` disables logging.
            request_id: Request id to search for.

        Returns:
            The matching log row, or ``None`` when the database is not
            configured, the row does not exist, or the query failed.
        """
        if session is None:
            return None
        try:
            return await RequestLogService._fetch_log(session, request_id)
        except Exception as e:
            logger.exception(f"查询请求日志失败: {e}")
            return None

    # ---- private helpers -------------------------------------------------

    @staticmethod
    async def _fetch_log(
        session: AsyncSession,
        request_id: str,
    ) -> Optional[RequestLog]:
        """Return the RequestLog row for ``request_id``, or ``None``."""
        result = await session.execute(
            select(RequestLog).where(RequestLog.request_id == request_id)
        )
        return result.scalar_one_or_none()

    @staticmethod
    async def _finalize_log(
        session: Optional[AsyncSession],
        request_id: str,
        status: str,
        success_message: str,
        **fields,
    ) -> bool:
        """Set ``status`` plus extra ``fields`` on an existing log row.

        Shared implementation behind :meth:`update_log_success` and
        :meth:`update_log_failure` (previously copy-pasted in both).

        Args:
            session: Database session; ``None`` disables logging.
            request_id: Request id of the row to update.
            status: New status value (``'success'`` or ``'fail'``).
            success_message: Exact info-log line to emit on success.
            **fields: Additional column values to assign on the row.

        Returns:
            ``True`` when the row was found and updated, else ``False``.
        """
        if session is None:
            return False
        try:
            log = await RequestLogService._fetch_log(session, request_id)
            if log is None:
                logger.warning(f"未找到请求日志: request_id={request_id}")
                return False
            log.status = status
            for name, value in fields.items():
                setattr(log, name, value)
            log.updated_at = datetime.now()
            await session.flush()
            logger.info(success_message)
            return True
        except Exception as e:
            logger.exception(f"更新请求日志失败: {e}")
            return False
12 changes: 6 additions & 6 deletions llm_web_kit/extractor/html/recognizer/cccode.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,29 +88,29 @@ def recognize(

@override
def to_content_list_node(self, base_url:str, parsed_content: HtmlElement, raw_html_segment:str) -> dict:
"""
把代码元素转换为content list node.
"""把代码元素转换为content list node. 注意:此方法只处理块级代码(CC_CODE),行内代码(CC_CODE_INLIN
E)由TextParagraphRecognizer处理.

Args:
base_url:
parsed_content: HtmlElement对象
raw_html_segment:

Returns:

"""
d = {
'type': 'code',
# "bbox": [],
'raw_content': raw_html_segment,
'inline': parsed_content.get('inline', 'false') == 'true',
'bbox': [],
'content': {
'code_content': parsed_content.text,
},
}

# 可选字段:language
if lang := parsed_content.get('language', None):
d['content']['language'] = lang

# 可选字段:by(代码高亮工具)
if by := parsed_content.get('by', None):
d['content']['by'] = by

Expand Down
25 changes: 14 additions & 11 deletions llm_web_kit/extractor/html/recognizer/ccmath.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,23 +53,26 @@ def recognize(self, base_url: str, main_html_lst: List[Tuple[HtmlElement, HtmlEl

@override
def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_html_segment: str) -> dict:
"""将content转换成content_list_node.
每种类型的html元素都有自己的content-list格式:参考 docs/specification/output_format/content_list_spec.md
例如代码的返回格式:
"""将content转换成content_list_node. 每种类型的html元素都有自己的content-list格式:参考
docs/specification/output_format/content_list_spec.md.

返回格式示例:
```json
{
"type": "equation-inline", # 数学公式类型,一共equation-inline和equation-interline两种
"raw_content": "<ccmath type="latex" by="mathjax">$u_{x_0}^{in}(x)$</ccmath>",
"type": "equation-interline",
"bbox": [],
"content": {
"math_content": "u_{x_0}^{in}(x)",
"math_content": "a^2 + b^2 = c^2",
"math_type": "latex",
"by": "mathjax"
}
}
```
```

Args:
content: str: 要转换的content
Args:
base_url: 基础URL
parsed_content: 解析后的HtmlElement对象
raw_html_segment: 原始HTML片段

Returns:
dict: content_list_node
Expand All @@ -86,7 +89,7 @@ def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_h
math_content = self.cm.wrap_math_md(math_content)
return {
'type': DocElementType.EQUATION_INTERLINE,
'raw_content': raw_html_segment,
'bbox': [],
'content': {
'math_content': math_content,
'math_type': inter_ele[0].get('type'), # 数学语言类型
Expand All @@ -97,7 +100,7 @@ def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_h
math_content = in_els[0].text
return {
'type': DocElementType.EQUATION_INLINE,
'raw_content': raw_html_segment,
'bbox': [],
'content': {
'math_content': math_content,
'math_type': in_els[0].get('type'), # 数学语言类型
Expand Down
7 changes: 5 additions & 2 deletions llm_web_kit/extractor/html/recognizer/image.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,15 +53,18 @@ def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_h
raise HtmlImageRecognizerException(f'No ccimage element found in content: {parsed_content}')

def __ccimg_to_content_list(self, raw_html_segment: str, html_obj: HtmlElement) -> dict:
caption = html_obj.get('caption')
footnote = html_obj.get('footnote')
result = {
'type': DocElementType.IMAGE,
'raw_content': raw_html_segment,
'bbox': [],
'content': {
'url': html_obj.text if html_obj.get('format') == 'url' else None,
'data': html_obj.text if html_obj.get('format') == 'base64' else None,
'alt': html_obj.get('alt'),
'title': html_obj.get('title'),
'caption': html_obj.get('caption')
'caption': [caption] if caption else [],
'footnote': [footnote] if footnote else []
}
}
return result
Expand Down
8 changes: 6 additions & 2 deletions llm_web_kit/extractor/html/recognizer/list.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import json
import re
from typing import Any, List, Tuple

from lxml import html as lxml_html
Expand Down Expand Up @@ -44,7 +45,7 @@ def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_h

ele_node = {
'type': DocElementType.LIST,
'raw_content': raw_html_segment,
'bbox': [],
'content': {
'items': content_list,
'list_attribute': list_attribute,
Expand Down Expand Up @@ -196,6 +197,8 @@ def __extract_list_item_text_recusive(el: HtmlElement):
if len(paragraph) > 0 and paragraph[-1]['t'] == ParagraphTextType.TEXT:
paragraph[-1]['c'] += _new_tail
else:
if len(paragraph) > 0 and el.tag not in inline_tags:
_new_tail = '$br$' + _new_tail
paragraph.append({'c': _new_tail, 't': ParagraphTextType.TEXT})

if paragraph:
Expand All @@ -212,7 +215,8 @@ def __extract_list_item_text_recusive(el: HtmlElement):
text_paragraph.append(new_paragraph)

for n, item in enumerate(text_paragraph):
tem_json = json.dumps(item).replace('$br$', '\\n\\n')
tem_json = json.dumps(item, ensure_ascii=False)
tem_json = re.sub(r'(\s*\$br\$\s*)+', r'\\n', tem_json)
text_paragraph[n] = json.loads(tem_json)

return text_paragraph
Expand Down
24 changes: 16 additions & 8 deletions llm_web_kit/extractor/html/recognizer/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,17 +100,25 @@ def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_h
# 使用传入的 raw_html_segment 或将 parsed_content 转换为字符串
if table_type:
cc_table_type = DocElementType.COMPLEX_TABLE
d = {
'type': cc_table_type,
'content': {
'html': html_content,
'table_nest_level': table_nest_level,
"caption": [],
"footnote": []
}
}
else:
cc_table_type = DocElementType.SIMPLE_TABLE
d = {
'type': cc_table_type,
'raw_content': raw_html_segment,
'content': {
'html': html_content,
'is_complex': table_type,
'table_nest_level': table_nest_level
d = {
'type': cc_table_type,
'content': {
'html': html_content,
"caption": [],
"footnote": []
}
}
}
return d

def __is_contain_cc_html(self, cc_html: HtmlElement) -> bool:
Expand Down
4 changes: 2 additions & 2 deletions llm_web_kit/extractor/html/recognizer/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_h
el = parsed_content
node = {
'type': DocElementType.PARAGRAPH,
'raw_content': raw_html_segment,
# 'raw_content': raw_html_segment,
'content': json.loads(el.text),
}
return node
Expand Down Expand Up @@ -271,7 +271,7 @@ def __get_paragraph_text_recusive(el: HtmlElement, text: str) -> str:

for item in para_text:
if item['c'] is not None:
item['c'] = restore_sub_sup_from_text_regex(item['c']).replace('$br$', PARAGRAPH_SEPARATOR)
item['c'] = restore_sub_sup_from_text_regex(item['c']).replace('$br$', '\n')
else:
item['c'] = ""

Expand Down
2 changes: 1 addition & 1 deletion llm_web_kit/extractor/html/recognizer/title.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_h
return None
cctitle_content_node = {
'type': DocElementType.TITLE,
'raw_content': raw_html_segment,
# 'raw_content': raw_html_segment,
'content': {
'title_content': text,
'level': level
Expand Down
Loading