diff --git a/llm_web_kit/api/services/request_log_service.py b/llm_web_kit/api/services/request_log_service.py
new file mode 100644
index 00000000..1de0950d
--- /dev/null
+++ b/llm_web_kit/api/services/request_log_service.py
@@ -0,0 +1,161 @@
+"""请求日志服务.
+
+提供请求日志的创建、更新和查询功能。
+"""
+
+import uuid
+from datetime import datetime
+from typing import Optional
+
+from sqlalchemy import select
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from ..dependencies import get_logger
+from ..models.db_models import RequestLog
+
+logger = get_logger(__name__)
+
+
+class RequestLogService:
+    """Static helpers that create, update and look up ``RequestLog`` rows.
+
+    A ``None`` session makes every database method skip the database, and
+    errors are logged and reported as ``None``/``False`` instead of raised.
+    """
+
+    @staticmethod
+    def generate_request_id() -> str:
+        """Return a new random UUID4 string to use as a request ID."""
+        return str(uuid.uuid4())
+
+    @staticmethod
+    async def _find_log(session: AsyncSession, request_id: str) -> Optional[RequestLog]:
+        """Select the row whose ``request_id`` matches; exceptions propagate."""
+        stmt = select(RequestLog).where(RequestLog.request_id == request_id)
+        return (await session.execute(stmt)).scalar_one_or_none()
+
+    @staticmethod
+    async def _finish_log(session, request_id, status, status_label, **fields) -> bool:
+        """Set ``status`` and ``fields`` on the request's row; shared by both updaters.
+
+        ``status_label`` is the status wording used in the info log message.
+        Returns ``True`` once the change is flushed, ``False`` when the session
+        is ``None``, no row matches, or the database raised.
+        """
+        if session is None:
+            return False
+        try:
+            log = await RequestLogService._find_log(session, request_id)
+            if log is None:
+                logger.warning(f"未找到请求日志: request_id={request_id}")
+                return False
+            log.status = status
+            for name, value in fields.items():
+                setattr(log, name, value)
+            log.updated_at = datetime.now()
+            await session.flush()
+            logger.info(f"更新请求日志为{status_label}: request_id={request_id}, status={status}")
+            return True
+        except Exception as e:  # broad as in the original: log and return False, never raise
+            logger.error(f"更新请求日志失败: {e}")
+            return False
+
+    @staticmethod
+    async def create_log(
+        session: Optional[AsyncSession],
+        request_id: str,
+        input_type: str,
+        input_html: Optional[str] = None,
+        url: Optional[str] = None,
+    ) -> Optional[RequestLog]:
+        """Create a log row with status ``'processing'`` for a new request.
+
+        Args:
+            session: Database session, or ``None`` if no database is configured.
+            request_id: Request ID.
+            input_type: Input type (html_content, url, file).
+            input_html: Input HTML content.
+            url: URL address.
+        Returns:
+            The created row, or ``None`` if the session is ``None`` or the write failed.
+        """
+        if session is None:
+            logger.debug("数据库会话为空,跳过日志记录")
+            return None
+        try:
+            now = datetime.now()  # read the clock once so created_at == updated_at
+            log = RequestLog(
+                request_id=request_id,
+                input_type=input_type,
+                input_html=input_html,
+                url=url,
+                status='processing',
+                created_at=now,
+                updated_at=now,
+            )
+            session.add(log)
+            await session.flush()  # write immediately (original note: to obtain the ID)
+            logger.info(f"创建请求日志: request_id={request_id}, input_type={input_type}, status=processing")
+            return log
+        except Exception as e:
+            logger.error(f"创建请求日志失败: {e}")
+            return None
+
+    @staticmethod
+    async def update_log_success(
+        session: Optional[AsyncSession],
+        request_id: str,
+        output_markdown: Optional[str] = None,
+    ) -> bool:
+        """Mark the request's log row as ``'success'`` and store the output.
+
+        Args:
+            session: Database session, or ``None`` if no database is configured.
+            request_id: Request ID.
+            output_markdown: Output Markdown content.
+        Returns:
+            Whether the row was updated.
+        """
+        return await RequestLogService._finish_log(
+            session, request_id, 'success', '成功', output_markdown=output_markdown
+        )
+
+    @staticmethod
+    async def update_log_failure(
+        session: Optional[AsyncSession],
+        request_id: str,
+        error_message: str,
+    ) -> bool:
+        """Mark the request's log row as ``'fail'`` and store the error message.
+
+        Args:
+            session: Database session, or ``None`` if no database is configured.
+            request_id: Request ID.
+            error_message: Error message.
+        Returns:
+            Whether the row was updated.
+        """
+        return await RequestLogService._finish_log(
+            session, request_id, 'fail', '失败', error_message=error_message
+        )
+
+    @staticmethod
+    async def get_log_by_request_id(
+        session: Optional[AsyncSession],
+        request_id: str,
+    ) -> Optional[RequestLog]:
+        """Look up a log row by request ID.
+
+        Args:
+            session: Database session, or ``None`` if no database is configured.
+            request_id: Request ID.
+        Returns:
+            The row, or ``None`` if not found, the session is ``None``, or the query failed.
+        """
+        if session is None:
+            return None
+        try:
+            return await RequestLogService._find_log(session, request_id)
+        except Exception as e:
+            logger.error(f"查询请求日志失败: {e}")
+            return None
diff --git a/llm_web_kit/extractor/html/recognizer/image.py b/llm_web_kit/extractor/html/recognizer/image.py
index 53f612dc..6e1d192b 100644
--- a/llm_web_kit/extractor/html/recognizer/image.py
+++ b/llm_web_kit/extractor/html/recognizer/image.py
@@ -53,15 +53,18 @@ def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_h
raise HtmlImageRecognizerException(f'No ccimage element found in content: {parsed_content}')
def __ccimg_to_content_list(self, raw_html_segment: str, html_obj: HtmlElement) -> dict:
+ caption = html_obj.get('caption')  # raw 'caption' attribute value; wrapped into a list below
+ footnote = html_obj.get('footnote')  # raw 'footnote' attribute value; wrapped into a list below
result = {
'type': DocElementType.IMAGE,
- 'raw_content': raw_html_segment,
+ 'bbox': [],  # always empty here. NOTE(review): raw_html_segment is no longer read in this function - confirm no consumer still needs raw_content
'content': {
'url': html_obj.text if html_obj.get('format') == 'url' else None,
'data': html_obj.text if html_obj.get('format') == 'base64' else None,
'alt': html_obj.get('alt'),
'title': html_obj.get('title'),
- 'caption': html_obj.get('caption')
+ 'caption': [caption] if caption else [],  # one-element list, or [] when the value is falsy (e.g. missing or empty)
+ 'footnote': [footnote] if footnote else []  # same list-or-empty shape as caption
}
}
return result
diff --git a/llm_web_kit/extractor/html/recognizer/list.py b/llm_web_kit/extractor/html/recognizer/list.py
index 61f113b8..723f292f 100644
--- a/llm_web_kit/extractor/html/recognizer/list.py
+++ b/llm_web_kit/extractor/html/recognizer/list.py
@@ -44,7 +44,7 @@ def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_h
ele_node = {
'type': DocElementType.LIST,
- 'raw_content': raw_html_segment,
+ 'bbox': [],
'content': {
'items': content_list,
'list_attribute': list_attribute,
diff --git a/llm_web_kit/input/datajson.py b/llm_web_kit/input/datajson.py
index 32d662bd..c9b533c2 100644
--- a/llm_web_kit/input/datajson.py
+++ b/llm_web_kit/input/datajson.py
@@ -280,9 +280,9 @@ def __content_lst_node_2_md(self, content_lst_node: dict, exclude_inline_types:
image_data = content_lst_node['content'].get('data', '')
image_alt = content_lst_node['content'].get('alt', '')
image_title = content_lst_node['content'].get('title', '')
- image_caption = content_lst_node['content'].get('caption', '')
+ image_caption = content_lst_node['content'].get('caption', [])
image_url = content_lst_node['content'].get('url', '')
-
+ image_footnote = content_lst_node['content'].get('footnote', [])
if not image_path and not image_data:
image_path = sha256_hash(image_url)
@@ -299,11 +299,16 @@ def __content_lst_node_2_md(self, content_lst_node: dict, exclude_inline_types:
else:
image_title = ''
- if image_caption:
- image_caption = image_caption.strip()
+ if len(image_caption) > 0:
+ image_caption = image_caption[0].strip()
else:
image_caption = ''
+ if len(image_footnote) > 0:
+ image_footnote = image_footnote[0].strip()
+ else:
+ image_footnote = ''
+
image_des = image_title if image_title else ''
# 优先使用data, 其次path.其中data是base64编码的图片,path是图片的url
if image_data:
@@ -322,6 +327,9 @@ def __content_lst_node_2_md(self, content_lst_node: dict, exclude_inline_types:
else:
image_with_caption = image
+ if image_footnote:
+ image_with_caption = f'{image_with_caption}\n\n{image_footnote}'
+
return image_with_caption
elif node_type == DocElementType.AUDIO:
return '' # TODO: 音频格式
diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_image.py b/tests/llm_web_kit/extractor/html/recognizer/test_image.py
index 6396c6d1..281f0b6d 100644
--- a/tests/llm_web_kit/extractor/html/recognizer/test_image.py
+++ b/tests/llm_web_kit/extractor/html/recognizer/test_image.py
@@ -82,9 +82,9 @@
'url': 'xxx',
'parsed_content': """
",
+ "bbox": [],
"content": {
"url": "http://example.com/image.png",
"data": null,
"alt": "image",
"title": null,
- "caption": ""
+ "caption": [],
+ "footnote": []
}
},
{
@@ -203,7 +204,7 @@
},
{
"type": "list",
- "raw_content": "