Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
161 changes: 161 additions & 0 deletions llm_web_kit/api/services/request_log_service.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
"""请求日志服务.

提供请求日志的创建、更新和查询功能。
"""

import uuid
from datetime import datetime
from typing import Optional

from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession

from ..dependencies import get_logger
from ..models.db_models import RequestLog

logger = get_logger(__name__)


class RequestLogService:
    """Service for creating, updating, and querying request-log records.

    Every method is defensive: when the database session is ``None`` (database
    not configured) or a database error occurs, the method logs and returns a
    neutral value (``None``/``False``) instead of raising, so request logging
    never blocks request handling.
    """

    @staticmethod
    def generate_request_id() -> str:
        """Generate a unique request ID (a random UUID4 string)."""
        return str(uuid.uuid4())

    @staticmethod
    async def create_log(
        session: Optional[AsyncSession],
        request_id: str,
        input_type: str,
        input_html: Optional[str] = None,
        url: Optional[str] = None,
    ) -> Optional[RequestLog]:
        """Create a request-log row with status ``'processing'``.

        Args:
            session: Database session; if ``None``, logging is skipped.
            request_id: Unique request ID.
            input_type: Input type (``html_content``, ``url``, ``file``).
            input_html: Raw input HTML content, if any.
            url: Source URL, if any.

        Returns:
            The created log record, or ``None`` if the database is not
            configured or the insert failed.
        """
        if session is None:
            logger.debug('数据库会话为空,跳过日志记录')
            return None
        try:
            # Capture one timestamp so created_at == updated_at on insert.
            # NOTE(review): timestamps are naive local time — confirm the DB
            # schema expects local time rather than UTC.
            now = datetime.now()
            log = RequestLog(
                request_id=request_id,
                input_type=input_type,
                input_html=input_html,
                url=url,
                status='processing',
                created_at=now,
                updated_at=now,
            )
            session.add(log)
            # Flush immediately so the row (and its generated ID) exists
            # within the transaction before the caller commits.
            await session.flush()
            logger.info('创建请求日志: request_id=%s, input_type=%s, status=processing',
                        request_id, input_type)
            return log
        except Exception as e:
            logger.error('创建请求日志失败: %s', e)
            return None

    @staticmethod
    async def _update_status(
        session: AsyncSession,
        request_id: str,
        status: str,
        status_label: str,
        **fields,
    ) -> bool:
        """Shared implementation for the status-update methods.

        Looks up the row by ``request_id``, sets ``status`` plus any extra
        column values given as keyword arguments, refreshes ``updated_at``,
        and flushes.

        Args:
            session: Database session (must not be ``None``).
            request_id: Unique request ID to look up.
            status: New value for the ``status`` column.
            status_label: Human-readable label used in the info log line.
            **fields: Additional ``RequestLog`` attributes to assign.

        Returns:
            ``True`` if the row was found and updated, ``False`` otherwise.
        """
        try:
            result = await session.execute(
                select(RequestLog).where(RequestLog.request_id == request_id)
            )
            log = result.scalar_one_or_none()
            if log is None:
                logger.warning('未找到请求日志: request_id=%s', request_id)
                return False
            log.status = status
            for name, value in fields.items():
                setattr(log, name, value)
            log.updated_at = datetime.now()
            await session.flush()
            logger.info('更新请求日志为%s: request_id=%s, status=%s',
                        status_label, request_id, status)
            return True
        except Exception as e:
            logger.error('更新请求日志失败: %s', e)
            return False

    @staticmethod
    async def update_log_success(
        session: Optional[AsyncSession],
        request_id: str,
        output_markdown: Optional[str] = None,
    ) -> bool:
        """Mark a request log as successful.

        Args:
            session: Database session; if ``None``, returns ``False``.
            request_id: Unique request ID.
            output_markdown: Produced Markdown output, if any.

        Returns:
            Whether the update succeeded.
        """
        if session is None:
            return False
        return await RequestLogService._update_status(
            session, request_id, 'success', '成功',
            output_markdown=output_markdown,
        )

    @staticmethod
    async def update_log_failure(
        session: Optional[AsyncSession],
        request_id: str,
        error_message: str,
    ) -> bool:
        """Mark a request log as failed.

        Args:
            session: Database session; if ``None``, returns ``False``.
            request_id: Unique request ID.
            error_message: Error description to store.

        Returns:
            Whether the update succeeded.
        """
        if session is None:
            return False
        return await RequestLogService._update_status(
            session, request_id, 'fail', '失败',
            error_message=error_message,
        )

    @staticmethod
    async def get_log_by_request_id(
        session: Optional[AsyncSession],
        request_id: str,
    ) -> Optional[RequestLog]:
        """Fetch a request log by its request ID.

        Args:
            session: Database session; if ``None``, returns ``None``.
            request_id: Unique request ID.

        Returns:
            The log record, or ``None`` if not found, the database is not
            configured, or the query failed.
        """
        if session is None:
            return None
        try:
            result = await session.execute(
                select(RequestLog).where(RequestLog.request_id == request_id)
            )
            return result.scalar_one_or_none()
        except Exception as e:
            logger.error('查询请求日志失败: %s', e)
            return None
7 changes: 5 additions & 2 deletions llm_web_kit/extractor/html/recognizer/image.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,15 +53,18 @@ def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_h
raise HtmlImageRecognizerException(f'No ccimage element found in content: {parsed_content}')

def __ccimg_to_content_list(self, raw_html_segment: str, html_obj: HtmlElement) -> dict:
caption = html_obj.get('caption')
footnote = html_obj.get('footnote')
result = {
'type': DocElementType.IMAGE,
'raw_content': raw_html_segment,
'bbox': [],
'content': {
'url': html_obj.text if html_obj.get('format') == 'url' else None,
'data': html_obj.text if html_obj.get('format') == 'base64' else None,
'alt': html_obj.get('alt'),
'title': html_obj.get('title'),
'caption': html_obj.get('caption')
'caption': [caption] if caption else [],
'footnote': [footnote] if footnote else []
}
}
return result
Expand Down
2 changes: 1 addition & 1 deletion llm_web_kit/extractor/html/recognizer/list.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_h

ele_node = {
'type': DocElementType.LIST,
'raw_content': raw_html_segment,
'bbox': [],
'content': {
'items': content_list,
'list_attribute': list_attribute,
Expand Down
16 changes: 12 additions & 4 deletions llm_web_kit/input/datajson.py
Original file line number Diff line number Diff line change
Expand Up @@ -280,9 +280,9 @@ def __content_lst_node_2_md(self, content_lst_node: dict, exclude_inline_types:
image_data = content_lst_node['content'].get('data', '')
image_alt = content_lst_node['content'].get('alt', '')
image_title = content_lst_node['content'].get('title', '')
image_caption = content_lst_node['content'].get('caption', '')
image_caption = content_lst_node['content'].get('caption', [])
image_url = content_lst_node['content'].get('url', '')

image_footnote = content_lst_node['content'].get('footnote', [])
if not image_path and not image_data:
image_path = sha256_hash(image_url)

Expand All @@ -299,11 +299,16 @@ def __content_lst_node_2_md(self, content_lst_node: dict, exclude_inline_types:
else:
image_title = ''

if image_caption:
image_caption = image_caption.strip()
if len(image_caption) > 0:
image_caption = image_caption[0].strip()
else:
image_caption = ''

if len(image_footnote) > 0:
image_footnote = image_footnote[0].strip()
else:
image_footnote = ''

image_des = image_title if image_title else ''
# 优先使用data, 其次path.其中data是base64编码的图片,path是图片的url
if image_data:
Expand All @@ -322,6 +327,9 @@ def __content_lst_node_2_md(self, content_lst_node: dict, exclude_inline_types:
else:
image_with_caption = image

if image_footnote:
image_with_caption = f'{image_with_caption}\n\n{image_footnote}'

return image_with_caption
elif node_type == DocElementType.AUDIO:
return '' # TODO: 音频格式
Expand Down
8 changes: 4 additions & 4 deletions tests/llm_web_kit/extractor/html/recognizer/test_image.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,9 +82,9 @@
'url': 'xxx',
'parsed_content': """<ccimage by="img" html='&lt;img src="http://15.demooo.pl/wp-content/themes/starter/dist/images/logos/janser-logo.svg" alt="Janser Logo"&gt;' format="url" alt="Janser Logo">http://15.demooo.pl/wp-content/themes/starter/dist/images/logos/janser-logo.svg</ccimage>""",
'html': '...',
'expected': {'type': 'image', 'raw_content': '...', 'content': {
'expected': {'type': 'image', 'bbox': [], 'content': {
'url': 'http://15.demooo.pl/wp-content/themes/starter/dist/images/logos/janser-logo.svg', 'data': None,
'alt': 'Janser Logo', 'title': None, 'caption': None}},
'alt': 'Janser Logo', 'title': None, 'caption': [], 'footnote': []}},
'alt': 'Janser Logo',
'img_url': 'http://15.demooo.pl/wp-content/themes/starter/dist/images/logos/janser-logo.svg'
},
Expand All @@ -94,9 +94,9 @@

' format="url" alt="Układanie wykładzin">http://15.demooo.pl/wp-content/uploads/2022/08/ukladanie-wykladzin.svg</ccimage>""",
'html': '...',
'expected': {'type': 'image', 'raw_content': '...',
'expected': {'type': 'image', 'bbox': [],
'content': {'url': 'http://15.demooo.pl/wp-content/uploads/2022/08/ukladanie-wykladzin.svg',
'data': None, 'alt': 'Układanie wykładzin', 'title': None, 'caption': None}},
'data': None, 'alt': 'Układanie wykładzin', 'title': None, 'caption': [], 'footnote': []}},
'alt': 'Układanie wykładzin',
'img_url': 'http://15.demooo.pl/wp-content/uploads/2022/08/ukladanie-wykladzin.svg'
},
Expand Down
17 changes: 8 additions & 9 deletions tests/llm_web_kit/extractor/html/recognizer/test_list.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,28 +19,28 @@ def setUp(self):
self.__list_with_ul_text_content = None
self.__list_with_sub_no_prefix_content = None

with open(f'{os.path.dirname(os.path.abspath(__file__))}/assets/recognizer/simple_list.html', 'r') as file:
with open(f'{os.path.dirname(os.path.abspath(__file__))}/assets/recognizer/simple_list.html', 'r', encoding='utf-8') as file:
self.__simple_list_content = file.read()

with open(f'{os.path.dirname(os.path.abspath(__file__))}/assets/recognizer/complex_list.html', 'r') as file:
with open(f'{os.path.dirname(os.path.abspath(__file__))}/assets/recognizer/complex_list.html', 'r', encoding='utf-8') as file:
self.__complex_list_content = file.read()

with open(f'{os.path.dirname(os.path.abspath(__file__))}/assets/recognizer/test-list-item.html', 'r') as file:
with open(f'{os.path.dirname(os.path.abspath(__file__))}/assets/recognizer/test-list-item.html', 'r', encoding='utf-8') as file:
self.__with_empty_list_item_content = file.read()

with open(f'{os.path.dirname(os.path.abspath(__file__))}/assets/recognizer/list_sub_sup.html', 'r') as file:
with open(f'{os.path.dirname(os.path.abspath(__file__))}/assets/recognizer/list_sub_sup.html', 'r', encoding='utf-8') as file:
self.__list_with_sub_sup_content = file.read()

with open(f'{os.path.dirname(os.path.abspath(__file__))}/assets/recognizer/list_br_and_cctags.html', 'r') as file:
with open(f'{os.path.dirname(os.path.abspath(__file__))}/assets/recognizer/list_br_and_cctags.html', 'r', encoding='utf-8') as file:
self.__list_with_br_and_cctags_content = file.read()

with open(f'{os.path.dirname(os.path.abspath(__file__))}/assets/recognizer/list_with_sub_sup_tail.html', 'r') as file:
with open(f'{os.path.dirname(os.path.abspath(__file__))}/assets/recognizer/list_with_sub_sup_tail.html', 'r', encoding='utf-8') as file:
self.__list_with_sub_sup_tail_content = file.read()

with open(f'{os.path.dirname(os.path.abspath(__file__))}/assets/recognizer/list_with_ul_text.html', 'r') as file:
with open(f'{os.path.dirname(os.path.abspath(__file__))}/assets/recognizer/list_with_ul_text.html', 'r', encoding='utf-8') as file:
self.__list_with_ul_text_content = file.read()

with open(f'{os.path.dirname(os.path.abspath(__file__))}/assets/recognizer/list_with_sub_no_prefix.html', 'r') as file:
with open(f'{os.path.dirname(os.path.abspath(__file__))}/assets/recognizer/list_with_sub_no_prefix.html', 'r', encoding='utf-8') as file:
self.__list_with_sub_no_prefix_content = file.read()

def test_simple_list(self):
Expand Down Expand Up @@ -158,7 +158,6 @@ def test_to_content_list_node(self):
# 验证返回的内容结构正确
assert 'type' in content_node, '返回的content_node缺少type字段'
assert 'content' in content_node, '返回的content_node缺少content字段'
assert 'raw_content' in content_node, '返回的content_node缺少raw_content字段'

# 验证content字段包含必要的内容
assert 'items' in content_node['content'], 'content字段缺少items'
Expand Down
5 changes: 4 additions & 1 deletion tests/llm_web_kit/extractor/test_extractor_chain.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,10 +102,12 @@ def test_html_pipeline(self):
# 然后是img
html_content = html_content_list[2]
self.assertEqual(html_content['type'], DocElementType.IMAGE)
self.assertEqual(html_content['bbox'], [])
self.assertEqual(html_content['content']['title'], 'image-title')
self.assertEqual(html_content['content']['alt'], 'image-alt')
self.assertEqual(html_content['content']['url'], 'https://www.test.com/test.png')
self.assertEqual(html_content['content']['caption'], '')
self.assertEqual(html_content['content']['caption'], [])
self.assertEqual(html_content['content']['footnote'], [])

# 然后是simple table
html_content = html_content_list[4]
Expand All @@ -121,6 +123,7 @@ def test_html_pipeline(self):
# 然后是list
html_content = html_content_list[6]
self.assertEqual(html_content['type'], DocElementType.LIST)
self.assertEqual(html_content['bbox'], [])
self.assertEqual(len(html_content['content']['items']), 2)
self.assertEqual(html_content['content']['list_attribute'], 'unordered')
self.assertEqual(html_content['content']['items'][0]['c'], '1')
Expand Down
9 changes: 5 additions & 4 deletions tests/llm_web_kit/input/assets/content_json.json
Original file line number Diff line number Diff line change
Expand Up @@ -182,13 +182,14 @@
},
{
"type": "image",
"raw_content": "<img src=\"http://example.com/image.png\" alt=\"image\">",
"bbox": [],
"content": {
"url": "http://example.com/image.png",
"data": null,
"alt": "image",
"title": null,
"caption": ""
"caption": [],
"footnote": []
}
},
{
Expand All @@ -203,7 +204,7 @@
},
{
"type": "list",
"raw_content": "<ul><li>UL1 <span>UL1.1</span></li><li>UL2</li></ul>",
"bbox": [],
"content": {
"items": [
{
Expand All @@ -219,7 +220,7 @@
},
{
"type": "list",
"raw_content": "<dl><dt>HTML</dt><dd>瓒呮枃鏈爣璁拌瑷€</dd><dt>CSS</dt><dd>灞傚彔鏍峰紡琛�</dd></dl>",
"bbox": [],
"content": {
"items": [
{
Expand Down
3 changes: 2 additions & 1 deletion tests/llm_web_kit/input/test_datajson.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,8 @@ def test_datajson_exclude_nodes_to_mmd(self):
'data': None,
'alt': 'Curtindo o apartamento com piscina no centro de SP. ',
'title': 'Curtindo o apartamento com piscina no centro de SP. ',
'caption': None
'caption': [],
'footnote': ['test image footnote']
}
}]]
}
Expand Down