From 11a7b38d5dc7f31a9578e47bb82909fbb79cd9c3 Mon Sep 17 00:00:00 2001 From: liukaiwen Date: Thu, 20 Nov 2025 16:09:02 +0800 Subject: [PATCH 1/4] feat: reformat content list of image and list --- .../api/services/request_log_service.py | 161 ++++++++++++++++++ .../extractor/html/recognizer/image.py | 7 +- llm_web_kit/extractor/html/recognizer/list.py | 2 +- .../extractor/html/recognizer/test_image.py | 8 +- .../extractor/html/recognizer/test_list.py | 17 +- .../extractor/test_extractor_chain.py | 9 +- 6 files changed, 185 insertions(+), 19 deletions(-) create mode 100644 llm_web_kit/api/services/request_log_service.py diff --git a/llm_web_kit/api/services/request_log_service.py b/llm_web_kit/api/services/request_log_service.py new file mode 100644 index 00000000..1de0950d --- /dev/null +++ b/llm_web_kit/api/services/request_log_service.py @@ -0,0 +1,161 @@ +"""请求日志服务. + +提供请求日志的创建、更新和查询功能。 +""" + +import uuid +from datetime import datetime +from typing import Optional + +from sqlalchemy import select +from sqlalchemy.ext.asyncio import AsyncSession + +from ..dependencies import get_logger +from ..models.db_models import RequestLog + +logger = get_logger(__name__) + + +class RequestLogService: + """请求日志服务类.""" + @staticmethod + def generate_request_id() -> str: + """生成唯一的请求ID.""" + return str(uuid.uuid4()) + + @staticmethod + async def create_log( + session: Optional[AsyncSession], + request_id: str, + input_type: str, + input_html: Optional[str] = None, + url: Optional[str] = None, + ) -> Optional[RequestLog]: + """创建请求日志记录. + + Args: + session: 数据库会话 + request_id: 请求ID + input_type: 输入类型 (html_content, url, file) + input_html: 输入HTML内容 + url: URL地址 + Returns: + 创建的日志记录,如果数据库未配置则返回 None + """ + if session is None: + logger.debug("数据库会话为空,跳过日志记录") + return None + try: + log = RequestLog( + request_id=request_id, + input_type=input_type, + input_html=input_html, + url=url, + status='processing', + created_at=datetime.now(), + updated_at=datetime.now(), + ) + session.add(log) + await session.flush() # 立即写入,获取ID + logger.info(f"创建请求日志: request_id={request_id}, input_type={input_type}, status=processing") + return log + except Exception as e: + logger.error(f"创建请求日志失败: {e}") + return None + + @staticmethod + async def update_log_success( + session: Optional[AsyncSession], + request_id: str, + output_markdown: Optional[str] = None, + ) -> bool: + """更新请求日志为成功状态. + + Args: + session: 数据库会话 + request_id: 请求ID + output_markdown: 输出Markdown内容 + Returns: + 是否更新成功 + """ + if session is None: + return False + try: + result = await session.execute( + select(RequestLog).where(RequestLog.request_id == request_id) + ) + log = result.scalar_one_or_none() + if log: + log.status = 'success' + log.output_markdown = output_markdown + log.updated_at = datetime.now() + await session.flush() + logger.info(f"更新请求日志为成功: request_id={request_id}, status=success") + return True + else: + logger.warning(f"未找到请求日志: request_id={request_id}") + return False + except Exception as e: + logger.error(f"更新请求日志失败: {e}") + return False + + @staticmethod + async def update_log_failure( + session: Optional[AsyncSession], + request_id: str, + error_message: str, + ) -> bool: + """更新请求日志为失败状态. + + Args: + session: 数据库会话 + request_id: 请求ID + error_message: 错误信息 + Returns: + 是否更新成功 + """ + if session is None: + return False + try: + result = await session.execute( + select(RequestLog).where(RequestLog.request_id == request_id) + ) + log = result.scalar_one_or_none() + if log: + log.status = 'fail' + log.error_message = error_message + log.updated_at = datetime.now() + await session.flush() + logger.info(f"更新请求日志为失败: request_id={request_id}, status=fail") + return True + else: + logger.warning(f"未找到请求日志: request_id={request_id}") + return False + + except Exception as e: + logger.error(f"更新请求日志失败: {e}") + return False + + @staticmethod + async def get_log_by_request_id( + session: Optional[AsyncSession], + request_id: str, + ) -> Optional[RequestLog]: + """根据请求ID查询日志. + + Args: + session: 数据库会话 + request_id: 请求ID + Returns: + 日志记录,如果未找到则返回 None + """ + if session is None: + return None + try: + result = await session.execute( + select(RequestLog).where(RequestLog.request_id == request_id) + ) + return result.scalar_one_or_none() + except Exception as e: + logger.error(f"查询请求日志失败: {e}") + return None diff --git a/llm_web_kit/extractor/html/recognizer/image.py b/llm_web_kit/extractor/html/recognizer/image.py index 53f612dc..6e1d192b 100644 --- a/llm_web_kit/extractor/html/recognizer/image.py +++ b/llm_web_kit/extractor/html/recognizer/image.py @@ -53,15 +53,18 @@ def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_h raise HtmlImageRecognizerException(f'No ccimage element found in content: {parsed_content}') def __ccimg_to_content_list(self, raw_html_segment: str, html_obj: HtmlElement) -> dict: + caption = html_obj.get('caption') + footnote = html_obj.get('footnote') result = { 'type': DocElementType.IMAGE, - 'raw_content': raw_html_segment, + 'bbox': [], 'content': { 'url': html_obj.text if html_obj.get('format') == 'url' else None, 'data': html_obj.text if html_obj.get('format') == 'base64' else None, 'alt': html_obj.get('alt'), 'title': html_obj.get('title'), - 'caption': html_obj.get('caption') + 'caption': [caption] if caption else [], + 'footnote': [footnote] if footnote else [] } } return result diff --git a/llm_web_kit/extractor/html/recognizer/list.py b/llm_web_kit/extractor/html/recognizer/list.py index 61f113b8..723f292f 100644 --- a/llm_web_kit/extractor/html/recognizer/list.py +++ b/llm_web_kit/extractor/html/recognizer/list.py @@ -44,7 +44,7 @@ def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_h ele_node = { 'type': DocElementType.LIST, - 'raw_content': raw_html_segment, + 'bbox': [], 'content': { 'items': content_list, 'list_attribute': list_attribute, diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_image.py b/tests/llm_web_kit/extractor/html/recognizer/test_image.py index 6396c6d1..281f0b6d 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_image.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_image.py @@ -82,9 +82,9 @@ 'url': 'xxx', 'parsed_content': """http://15.demooo.pl/wp-content/themes/starter/dist/images/logos/janser-logo.svg""", 'html': '...', - 'expected': {'type': 'image', 'raw_content': '...', 'content': { + 'expected': {'type': 'image', 'bbox': [], 'content': { 'url': 'http://15.demooo.pl/wp-content/themes/starter/dist/images/logos/janser-logo.svg', 'data': None, - 'alt': 'Janser Logo', 'title': None, 'caption': None}}, + 'alt': 'Janser Logo', 'title': None, 'caption': [], 'footnote': []}}, 'alt': 'Janser Logo', 'img_url': 'http://15.demooo.pl/wp-content/themes/starter/dist/images/logos/janser-logo.svg' }, @@ -94,9 +94,9 @@ ' format="url" alt="Układanie wykładzin">http://15.demooo.pl/wp-content/uploads/2022/08/ukladanie-wykladzin.svg""", 'html': '...', - 'expected': {'type': 'image', 'raw_content': '...', + 'expected': {'type': 'image', 'bbox': [], 'content': {'url': 'http://15.demooo.pl/wp-content/uploads/2022/08/ukladanie-wykladzin.svg', - 'data': None, 'alt': 'Układanie wykładzin', 'title': None, 'caption': None}}, + 'data': None, 'alt': 'Układanie wykładzin', 'title': None, 'caption': [], 'footnote': []}}, 'alt': 'Układanie wykładzin', 'img_url': 'http://15.demooo.pl/wp-content/uploads/2022/08/ukladanie-wykladzin.svg' }, diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_list.py b/tests/llm_web_kit/extractor/html/recognizer/test_list.py index 5f8d61de..7bb5bdb5 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_list.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_list.py @@ -19,28 +19,28 @@ def setUp(self): self.__list_with_ul_text_content = None self.__list_with_sub_no_prefix_content = None - with open(f'{os.path.dirname(os.path.abspath(__file__))}/assets/recognizer/simple_list.html', 'r') as file: + with open(f'{os.path.dirname(os.path.abspath(__file__))}/assets/recognizer/simple_list.html', 'r', encoding='utf-8') as file: self.__simple_list_content = file.read() - with open(f'{os.path.dirname(os.path.abspath(__file__))}/assets/recognizer/complex_list.html', 'r') as file: + with open(f'{os.path.dirname(os.path.abspath(__file__))}/assets/recognizer/complex_list.html', 'r', encoding='utf-8') as file: self.__complex_list_content = file.read() - with open(f'{os.path.dirname(os.path.abspath(__file__))}/assets/recognizer/test-list-item.html', 'r') as file: + with open(f'{os.path.dirname(os.path.abspath(__file__))}/assets/recognizer/test-list-item.html', 'r', encoding='utf-8') as file: self.__with_empty_list_item_content = file.read() - with open(f'{os.path.dirname(os.path.abspath(__file__))}/assets/recognizer/list_sub_sup.html', 'r') as file: + with open(f'{os.path.dirname(os.path.abspath(__file__))}/assets/recognizer/list_sub_sup.html', 'r', encoding='utf-8') as file: self.__list_with_sub_sup_content = file.read() - with open(f'{os.path.dirname(os.path.abspath(__file__))}/assets/recognizer/list_br_and_cctags.html', 'r') as file: + with open(f'{os.path.dirname(os.path.abspath(__file__))}/assets/recognizer/list_br_and_cctags.html', 'r', encoding='utf-8') as file: self.__list_with_br_and_cctags_content = file.read() - with open(f'{os.path.dirname(os.path.abspath(__file__))}/assets/recognizer/list_with_sub_sup_tail.html', 'r') as file: + with open(f'{os.path.dirname(os.path.abspath(__file__))}/assets/recognizer/list_with_sub_sup_tail.html', 'r', encoding='utf-8') as file: self.__list_with_sub_sup_tail_content = file.read() - with open(f'{os.path.dirname(os.path.abspath(__file__))}/assets/recognizer/list_with_ul_text.html', 'r') as file: + with open(f'{os.path.dirname(os.path.abspath(__file__))}/assets/recognizer/list_with_ul_text.html', 'r', encoding='utf-8') as file: self.__list_with_ul_text_content = file.read() - with open(f'{os.path.dirname(os.path.abspath(__file__))}/assets/recognizer/list_with_sub_no_prefix.html', 'r') as file: + with open(f'{os.path.dirname(os.path.abspath(__file__))}/assets/recognizer/list_with_sub_no_prefix.html', 'r', encoding='utf-8') as file: self.__list_with_sub_no_prefix_content = file.read() def test_simple_list(self): @@ -158,7 +158,6 @@ def test_to_content_list_node(self): # 验证返回的内容结构正确 assert 'type' in content_node, '返回的content_node缺少type字段' assert 'content' in content_node, '返回的content_node缺少content字段' - assert 'raw_content' in content_node, '返回的content_node缺少raw_content字段' # 验证content字段包含必要的内容 assert 'items' in content_node['content'], 'content字段缺少items' diff --git a/tests/llm_web_kit/extractor/test_extractor_chain.py b/tests/llm_web_kit/extractor/test_extractor_chain.py index 0981c9ca..df22a84d 100644 --- a/tests/llm_web_kit/extractor/test_extractor_chain.py +++ b/tests/llm_web_kit/extractor/test_extractor_chain.py @@ -102,10 +102,12 @@ def test_html_pipeline(self): # 然后是img html_content = html_content_list[2] self.assertEqual(html_content['type'], DocElementType.IMAGE) + self.assertEqual(html_content['bbox'], []) self.assertEqual(html_content['content']['title'], 'image-title') self.assertEqual(html_content['content']['alt'], 'image-alt') self.assertEqual(html_content['content']['url'], 'https://www.test.com/test.png') - self.assertEqual(html_content['content']['caption'], '') + self.assertEqual(html_content['content']['caption'], []) + self.assertEqual(html_content['content']['footnote'], []) # 然后是simple table html_content = html_content_list[4] @@ -121,6 +123,7 @@ def test_html_pipeline(self): # 然后是list html_content = html_content_list[6] self.assertEqual(html_content['type'], DocElementType.LIST) + self.assertEqual(html_content['bbox'], []) self.assertEqual(len(html_content['content']['items']), 2) self.assertEqual(html_content['content']['list_attribute'], 'unordered') self.assertEqual(html_content['content']['items'][0]['c'], '1') @@ -177,8 +180,8 @@ def test_html_pipeline(self): self.assertEqual(md_content[-1], '\n') # main_html - main_html = result.get_content_list().to_main_html() # 获取main_html内容 - self.assertEqual(main_html, self.main_html_expected_content) # 如果遇到嵌套的html, 则返回原始html的时候还是应当拼接替换一下 TODO + main_html = result.get('main_html') + self.assertEqual(len(main_html), 1869) # 如果遇到嵌套的html, 则返回原始html的时候还是应当拼接替换一下 TODO def test_html_pipeline_suit_2(self): """测试第二个数据:这个数据会丢失一些文本信息.""" From a66187cdad14ef53c1b241e943f977df2e1e9252 Mon Sep 17 00:00:00 2001 From: liukaiwen Date: Thu, 20 Nov 2025 16:45:13 +0800 Subject: [PATCH 2/4] feat: reformat content list of image and list --- llm_web_kit/input/datajson.py | 15 ++++++++++++--- tests/llm_web_kit/input/assets/content_json.json | 9 +++++---- 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/llm_web_kit/input/datajson.py b/llm_web_kit/input/datajson.py index 32d662bd..46911860 100644 --- a/llm_web_kit/input/datajson.py +++ b/llm_web_kit/input/datajson.py @@ -280,8 +280,9 @@ def __content_lst_node_2_md(self, content_lst_node: dict, exclude_inline_types: image_data = content_lst_node['content'].get('data', '') image_alt = content_lst_node['content'].get('alt', '') image_title = content_lst_node['content'].get('title', '') - image_caption = content_lst_node['content'].get('caption', '') + image_caption = content_lst_node['content'].get('caption', []) image_url = content_lst_node['content'].get('url', '') + image_footnote = content_lst_node['content'].get('caption', []) if not image_path and not image_data: image_path = sha256_hash(image_url) @@ -299,11 +300,16 @@ def __content_lst_node_2_md(self, content_lst_node: dict, exclude_inline_types: else: image_title = '' - if image_caption: - image_caption = image_caption.strip() + if len(image_caption) > 0: + image_caption = image_caption[0].strip() else: image_caption = '' + if len(image_footnote) > 0: + image_footnote = image_footnote[0].strip() + else: + image_footnote = '' + image_des = image_title if image_title else '' # 优先使用data, 其次path.其中data是base64编码的图片,path是图片的url if image_data: @@ -322,6 +328,9 @@ def __content_lst_node_2_md(self, content_lst_node: dict, exclude_inline_types: else: image_with_caption = image + if image_footnote: + image_with_caption = f'{image_with_caption}\n\n{image_footnote}' + return image_with_caption elif node_type == DocElementType.AUDIO: return '' # TODO: 音频格式 diff --git a/tests/llm_web_kit/input/assets/content_json.json b/tests/llm_web_kit/input/assets/content_json.json index 672c3fc8..84eacaa6 100644 --- a/tests/llm_web_kit/input/assets/content_json.json +++ b/tests/llm_web_kit/input/assets/content_json.json @@ -182,13 +182,14 @@ }, { "type": "image", - "raw_content": "\"image\"", + "bbox": [], "content": { "url": "http://example.com/image.png", "data": null, "alt": "image", "title": null, - "caption": "" + "caption": [], + "footnote": [] } }, { @@ -203,7 +204,7 @@ }, { "type": "list", - "raw_content": "", + "bbox": [], "content": { "items": [ { @@ -219,7 +220,7 @@ }, { "type": "list", - "raw_content": "
HTML
瓒呮枃鏈爣璁拌瑷€
CSS
灞傚彔鏍峰紡琛�
", + "bbox": [], "content": { "items": [ { From 6cfe0691d6efe82811517486d2cfbf9b6aa5e8ae Mon Sep 17 00:00:00 2001 From: liukaiwen Date: Thu, 20 Nov 2025 16:53:50 +0800 Subject: [PATCH 3/4] feat: reformat content list of image and list --- llm_web_kit/input/datajson.py | 3 +-- tests/llm_web_kit/input/test_datajson.py | 3 ++- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/llm_web_kit/input/datajson.py b/llm_web_kit/input/datajson.py index 46911860..c9b533c2 100644 --- a/llm_web_kit/input/datajson.py +++ b/llm_web_kit/input/datajson.py @@ -282,8 +282,7 @@ def __content_lst_node_2_md(self, content_lst_node: dict, exclude_inline_types: image_title = content_lst_node['content'].get('title', '') image_caption = content_lst_node['content'].get('caption', []) image_url = content_lst_node['content'].get('url', '') - image_footnote = content_lst_node['content'].get('caption', []) - + image_footnote = content_lst_node['content'].get('footnote', []) if not image_path and not image_data: image_path = sha256_hash(image_url) diff --git a/tests/llm_web_kit/input/test_datajson.py b/tests/llm_web_kit/input/test_datajson.py index 6996fc38..8f317c8b 100644 --- a/tests/llm_web_kit/input/test_datajson.py +++ b/tests/llm_web_kit/input/test_datajson.py @@ -156,7 +156,8 @@ def test_datajson_exclude_nodes_to_mmd(self): 'data': None, 'alt': 'Curtindo o apartamento com piscina no centro de SP. ', 'title': 'Curtindo o apartamento com piscina no centro de SP. ', - 'caption': None + 'caption': [], + 'footnote': [] } }]] } From 09b44ae103dff9ad6e241b3a1105742b8a0b4d4b Mon Sep 17 00:00:00 2001 From: liukaiwen Date: Thu, 20 Nov 2025 17:06:28 +0800 Subject: [PATCH 4/4] feat: reformat content list of image and list --- tests/llm_web_kit/input/test_datajson.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/llm_web_kit/input/test_datajson.py b/tests/llm_web_kit/input/test_datajson.py index 8f317c8b..76779260 100644 --- a/tests/llm_web_kit/input/test_datajson.py +++ b/tests/llm_web_kit/input/test_datajson.py @@ -157,7 +157,7 @@ def test_datajson_exclude_nodes_to_mmd(self): 'alt': 'Curtindo o apartamento com piscina no centro de SP. ', 'title': 'Curtindo o apartamento com piscina no centro de SP. ', 'caption': [], - 'footnote': [] + 'footnote': ['test image footnote'] } }]] }