From 11a7b38d5dc7f31a9578e47bb82909fbb79cd9c3 Mon Sep 17 00:00:00 2001
From: liukaiwen <kl3157@columbia.edu>
Date: Thu, 20 Nov 2025 16:09:02 +0800
Subject: [PATCH 1/4] feat: reformat content list of image and list

---
 .../api/services/request_log_service.py       | 161 ++++++++++++++++++
 .../extractor/html/recognizer/image.py        |   7 +-
 llm_web_kit/extractor/html/recognizer/list.py |   2 +-
 .../extractor/html/recognizer/test_image.py   |   8 +-
 .../extractor/html/recognizer/test_list.py    |  17 +-
 .../extractor/test_extractor_chain.py         |   9 +-
 6 files changed, 185 insertions(+), 19 deletions(-)
 create mode 100644 llm_web_kit/api/services/request_log_service.py

diff --git a/llm_web_kit/api/services/request_log_service.py b/llm_web_kit/api/services/request_log_service.py
new file mode 100644
index 00000000..1de0950d
--- /dev/null
+++ b/llm_web_kit/api/services/request_log_service.py
@@ -0,0 +1,161 @@
+"""Request log service.
+
+Provides creation, update, and query of request logs.
+"""
+
+import uuid
+from datetime import datetime
+from typing import Optional
+
+from sqlalchemy import select
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from ..dependencies import get_logger
+from ..models.db_models import RequestLog
+
+logger = get_logger(__name__)
+
+
+class RequestLogService:
+    """Service class for request log records (create, update, query)."""
+    @staticmethod
+    def generate_request_id() -> str:
+        """Generate a unique request ID (string form of a random UUID4)."""
+        return str(uuid.uuid4())
+
+    @staticmethod
+    async def create_log(
+        session: Optional[AsyncSession],
+        request_id: str,
+        input_type: str,
+        input_html: Optional[str] = None,
+        url: Optional[str] = None,
+    ) -> Optional[RequestLog]:
+        """Create a request log record with status 'processing'.
+
+        Args:
+            session: database session; when None, logging is skipped
+            request_id: request ID
+            input_type: input type (html_content, url, file)
+            input_html: input HTML content
+            url: URL address
+        Returns:
+            The created log record; None if session is None or creation raised an error
+        """
+        if session is None:
+            logger.debug("数据库会话为空，跳过日志记录")
+            return None
+        try:
+            log = RequestLog(
+                request_id=request_id,
+                input_type=input_type,
+                input_html=input_html,
+                url=url,
+                status='processing',
+                created_at=datetime.now(),
+                updated_at=datetime.now(),
+            )
+            session.add(log)
+            await session.flush()  # write immediately to obtain the ID
+            logger.info(f"创建请求日志: request_id={request_id}, input_type={input_type}, status=processing")
+            return log
+        except Exception as e:
+            logger.error(f"创建请求日志失败: {e}")
+            return None
+
+    @staticmethod
+    async def update_log_success(
+        session: Optional[AsyncSession],
+        request_id: str,
+        output_markdown: Optional[str] = None,
+    ) -> bool:
+        """Mark the request log matching request_id as 'success'.
+
+        Args:
+            session: database session
+            request_id: request ID
+            output_markdown: output Markdown content stored on the record
+        Returns:
+            True if updated; False if session is None, no record matched, or an error was raised
+        """
+        if session is None:
+            return False
+        try:
+            result = await session.execute(
+                select(RequestLog).where(RequestLog.request_id == request_id)
+            )
+            log = result.scalar_one_or_none()
+            if log:
+                log.status = 'success'
+                log.output_markdown = output_markdown
+                log.updated_at = datetime.now()
+                await session.flush()
+                logger.info(f"更新请求日志为成功: request_id={request_id}, status=success")
+                return True
+            else:
+                logger.warning(f"未找到请求日志: request_id={request_id}")
+                return False
+        except Exception as e:
+            logger.error(f"更新请求日志失败: {e}")
+            return False
+
+    @staticmethod
+    async def update_log_failure(
+        session: Optional[AsyncSession],
+        request_id: str,
+        error_message: str,
+    ) -> bool:
+        """Mark the request log matching request_id as 'fail'.
+
+        Args:
+            session: database session
+            request_id: request ID
+            error_message: error message stored on the record
+        Returns:
+            True if updated; False if session is None, no record matched, or an error was raised
+        """
+        if session is None:
+            return False
+        try:
+            result = await session.execute(
+                select(RequestLog).where(RequestLog.request_id == request_id)
+            )
+            log = result.scalar_one_or_none()
+            if log:
+                log.status = 'fail'
+                log.error_message = error_message
+                log.updated_at = datetime.now()
+                await session.flush()
+                logger.info(f"更新请求日志为失败: request_id={request_id}, status=fail")
+                return True
+            else:
+                logger.warning(f"未找到请求日志: request_id={request_id}")
+                return False
+
+        except Exception as e:
+            logger.error(f"更新请求日志失败: {e}")
+            return False
+
+    @staticmethod
+    async def get_log_by_request_id(
+        session: Optional[AsyncSession],
+        request_id: str,
+    ) -> Optional[RequestLog]:
+        """Look up a log record by request ID.
+
+        Args:
+            session: database session
+            request_id: request ID
+        Returns:
+            The log record; None if not found, session is None, or the query raised an error
+        """
+        if session is None:
+            return None
+        try:
+            result = await session.execute(
+                select(RequestLog).where(RequestLog.request_id == request_id)
+            )
+            return result.scalar_one_or_none()
+        except Exception as e:
+            logger.error(f"查询请求日志失败: {e}")
+            return None
diff --git a/llm_web_kit/extractor/html/recognizer/image.py b/llm_web_kit/extractor/html/recognizer/image.py
index 53f612dc..6e1d192b 100644
--- a/llm_web_kit/extractor/html/recognizer/image.py
+++ b/llm_web_kit/extractor/html/recognizer/image.py
@@ -53,15 +53,18 @@ def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_h
             raise HtmlImageRecognizerException(f'No ccimage element found in content: {parsed_content}')
 
     def __ccimg_to_content_list(self, raw_html_segment: str, html_obj: HtmlElement) -> dict:
+        caption = html_obj.get('caption')
+        footnote = html_obj.get('footnote')
         result = {
             'type': DocElementType.IMAGE,
-            'raw_content': raw_html_segment,
+            'bbox': [],
             'content': {
                 'url': html_obj.text if html_obj.get('format') == 'url' else None,
                 'data': html_obj.text if html_obj.get('format') == 'base64' else None,
                 'alt': html_obj.get('alt'),
                 'title': html_obj.get('title'),
-                'caption': html_obj.get('caption')
+                'caption': [caption] if caption else [],
+                'footnote': [footnote] if footnote else []
             }
         }
         return result
diff --git a/llm_web_kit/extractor/html/recognizer/list.py b/llm_web_kit/extractor/html/recognizer/list.py
index 61f113b8..723f292f 100644
--- a/llm_web_kit/extractor/html/recognizer/list.py
+++ b/llm_web_kit/extractor/html/recognizer/list.py
@@ -44,7 +44,7 @@ def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_h
 
         ele_node = {
             'type': DocElementType.LIST,
-            'raw_content': raw_html_segment,
+            'bbox': [],
             'content': {
                 'items': content_list,
                 'list_attribute': list_attribute,
diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_image.py b/tests/llm_web_kit/extractor/html/recognizer/test_image.py
index 6396c6d1..281f0b6d 100644
--- a/tests/llm_web_kit/extractor/html/recognizer/test_image.py
+++ b/tests/llm_web_kit/extractor/html/recognizer/test_image.py
@@ -82,9 +82,9 @@
         'url': 'xxx',
         'parsed_content': """<ccimage by="img" html='&lt;img src="http://15.demooo.pl/wp-content/themes/starter/dist/images/logos/janser-logo.svg" alt="Janser Logo"&gt;' format="url" alt="Janser Logo">http://15.demooo.pl/wp-content/themes/starter/dist/images/logos/janser-logo.svg</ccimage>""",
         'html': '...',
-        'expected': {'type': 'image', 'raw_content': '...', 'content': {
+        'expected': {'type': 'image', 'bbox': [], 'content': {
             'url': 'http://15.demooo.pl/wp-content/themes/starter/dist/images/logos/janser-logo.svg', 'data': None,
-            'alt': 'Janser Logo', 'title': None, 'caption': None}},
+            'alt': 'Janser Logo', 'title': None, 'caption': [], 'footnote': []}},
         'alt': 'Janser Logo',
         'img_url': 'http://15.demooo.pl/wp-content/themes/starter/dist/images/logos/janser-logo.svg'
     },
@@ -94,9 +94,9 @@
 
                                     ' format="url" alt="Układanie wykładzin">http://15.demooo.pl/wp-content/uploads/2022/08/ukladanie-wykladzin.svg</ccimage>""",
         'html': '...',
-        'expected': {'type': 'image', 'raw_content': '...',
+        'expected': {'type': 'image', 'bbox': [],
                      'content': {'url': 'http://15.demooo.pl/wp-content/uploads/2022/08/ukladanie-wykladzin.svg',
-                                 'data': None, 'alt': 'Układanie wykładzin', 'title': None, 'caption': None}},
+                                 'data': None, 'alt': 'Układanie wykładzin', 'title': None, 'caption': [], 'footnote': []}},
         'alt': 'Układanie wykładzin',
         'img_url': 'http://15.demooo.pl/wp-content/uploads/2022/08/ukladanie-wykladzin.svg'
     },
diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_list.py b/tests/llm_web_kit/extractor/html/recognizer/test_list.py
index 5f8d61de..7bb5bdb5 100644
--- a/tests/llm_web_kit/extractor/html/recognizer/test_list.py
+++ b/tests/llm_web_kit/extractor/html/recognizer/test_list.py
@@ -19,28 +19,28 @@ def setUp(self):
         self.__list_with_ul_text_content = None
         self.__list_with_sub_no_prefix_content = None
 
-        with open(f'{os.path.dirname(os.path.abspath(__file__))}/assets/recognizer/simple_list.html', 'r') as file:
+        with open(f'{os.path.dirname(os.path.abspath(__file__))}/assets/recognizer/simple_list.html', 'r', encoding='utf-8') as file:
             self.__simple_list_content = file.read()
 
-        with open(f'{os.path.dirname(os.path.abspath(__file__))}/assets/recognizer/complex_list.html', 'r') as file:
+        with open(f'{os.path.dirname(os.path.abspath(__file__))}/assets/recognizer/complex_list.html', 'r', encoding='utf-8') as file:
             self.__complex_list_content = file.read()
 
-        with open(f'{os.path.dirname(os.path.abspath(__file__))}/assets/recognizer/test-list-item.html', 'r') as file:
+        with open(f'{os.path.dirname(os.path.abspath(__file__))}/assets/recognizer/test-list-item.html', 'r', encoding='utf-8') as file:
             self.__with_empty_list_item_content = file.read()
 
-        with open(f'{os.path.dirname(os.path.abspath(__file__))}/assets/recognizer/list_sub_sup.html', 'r') as file:
+        with open(f'{os.path.dirname(os.path.abspath(__file__))}/assets/recognizer/list_sub_sup.html', 'r', encoding='utf-8') as file:
             self.__list_with_sub_sup_content = file.read()
 
-        with open(f'{os.path.dirname(os.path.abspath(__file__))}/assets/recognizer/list_br_and_cctags.html', 'r') as file:
+        with open(f'{os.path.dirname(os.path.abspath(__file__))}/assets/recognizer/list_br_and_cctags.html', 'r', encoding='utf-8') as file:
             self.__list_with_br_and_cctags_content = file.read()
 
-        with open(f'{os.path.dirname(os.path.abspath(__file__))}/assets/recognizer/list_with_sub_sup_tail.html', 'r') as file:
+        with open(f'{os.path.dirname(os.path.abspath(__file__))}/assets/recognizer/list_with_sub_sup_tail.html', 'r', encoding='utf-8') as file:
             self.__list_with_sub_sup_tail_content = file.read()
 
-        with open(f'{os.path.dirname(os.path.abspath(__file__))}/assets/recognizer/list_with_ul_text.html', 'r') as file:
+        with open(f'{os.path.dirname(os.path.abspath(__file__))}/assets/recognizer/list_with_ul_text.html', 'r', encoding='utf-8') as file:
             self.__list_with_ul_text_content = file.read()
 
-        with open(f'{os.path.dirname(os.path.abspath(__file__))}/assets/recognizer/list_with_sub_no_prefix.html', 'r') as file:
+        with open(f'{os.path.dirname(os.path.abspath(__file__))}/assets/recognizer/list_with_sub_no_prefix.html', 'r', encoding='utf-8') as file:
             self.__list_with_sub_no_prefix_content = file.read()
 
     def test_simple_list(self):
@@ -158,7 +158,6 @@ def test_to_content_list_node(self):
         # 验证返回的内容结构正确
         assert 'type' in content_node, '返回的content_node缺少type字段'
         assert 'content' in content_node, '返回的content_node缺少content字段'
-        assert 'raw_content' in content_node, '返回的content_node缺少raw_content字段'
 
         # 验证content字段包含必要的内容
         assert 'items' in content_node['content'], 'content字段缺少items'
diff --git a/tests/llm_web_kit/extractor/test_extractor_chain.py b/tests/llm_web_kit/extractor/test_extractor_chain.py
index 0981c9ca..df22a84d 100644
--- a/tests/llm_web_kit/extractor/test_extractor_chain.py
+++ b/tests/llm_web_kit/extractor/test_extractor_chain.py
@@ -102,10 +102,12 @@ def test_html_pipeline(self):
         # 然后是img
         html_content = html_content_list[2]
         self.assertEqual(html_content['type'], DocElementType.IMAGE)
+        self.assertEqual(html_content['bbox'], [])
         self.assertEqual(html_content['content']['title'], 'image-title')
         self.assertEqual(html_content['content']['alt'], 'image-alt')
         self.assertEqual(html_content['content']['url'], 'https://www.test.com/test.png')
-        self.assertEqual(html_content['content']['caption'], '')
+        self.assertEqual(html_content['content']['caption'], [])
+        self.assertEqual(html_content['content']['footnote'], [])
 
         # 然后是simple table
         html_content = html_content_list[4]
@@ -121,6 +123,7 @@ def test_html_pipeline(self):
         # 然后是list
         html_content = html_content_list[6]
         self.assertEqual(html_content['type'], DocElementType.LIST)
+        self.assertEqual(html_content['bbox'], [])
         self.assertEqual(len(html_content['content']['items']), 2)
         self.assertEqual(html_content['content']['list_attribute'], 'unordered')
         self.assertEqual(html_content['content']['items'][0]['c'], '1')
@@ -177,8 +180,8 @@ def test_html_pipeline(self):
         self.assertEqual(md_content[-1], '\n')
 
         # main_html
-        main_html = result.get_content_list().to_main_html()  # 获取main_html内容
-        self.assertEqual(main_html, self.main_html_expected_content)  # 如果遇到嵌套的html, 则返回原始html的时候还是应当拼接替换一下 TODO
+        main_html = result.get('main_html')
+        self.assertEqual(len(main_html), 1869)  # 如果遇到嵌套的html, 则返回原始html的时候还是应当拼接替换一下 TODO
 
     def test_html_pipeline_suit_2(self):
         """测试第二个数据：这个数据会丢失一些文本信息."""

From a66187cdad14ef53c1b241e943f977df2e1e9252 Mon Sep 17 00:00:00 2001
From: liukaiwen <kl3157@columbia.edu>
Date: Thu, 20 Nov 2025 16:45:13 +0800
Subject: [PATCH 2/4] feat: reformat content list of image and list

---
 llm_web_kit/input/datajson.py                    | 15 ++++++++++++---
 tests/llm_web_kit/input/assets/content_json.json |  9 +++++----
 2 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/llm_web_kit/input/datajson.py b/llm_web_kit/input/datajson.py
index 32d662bd..46911860 100644
--- a/llm_web_kit/input/datajson.py
+++ b/llm_web_kit/input/datajson.py
@@ -280,8 +280,9 @@ def __content_lst_node_2_md(self, content_lst_node: dict, exclude_inline_types:
             image_data = content_lst_node['content'].get('data', '')
             image_alt = content_lst_node['content'].get('alt', '')
             image_title = content_lst_node['content'].get('title', '')
-            image_caption = content_lst_node['content'].get('caption', '')
+            image_caption = content_lst_node['content'].get('caption', [])
             image_url = content_lst_node['content'].get('url', '')
+            image_footnote = content_lst_node['content'].get('caption', [])
 
             if not image_path and not image_data:
                 image_path = sha256_hash(image_url)
@@ -299,11 +300,16 @@ def __content_lst_node_2_md(self, content_lst_node: dict, exclude_inline_types:
             else:
                 image_title = ''
 
-            if image_caption:
-                image_caption = image_caption.strip()
+            if len(image_caption) > 0:
+                image_caption = image_caption[0].strip()
             else:
                 image_caption = ''
 
+            if len(image_footnote) > 0:
+                image_footnote = image_footnote[0].strip()
+            else:
+                image_footnote = ''
+
             image_des = image_title if image_title else ''
             # 优先使用data, 其次path.其中data是base64编码的图片，path是图片的url
             if image_data:
@@ -322,6 +328,9 @@ def __content_lst_node_2_md(self, content_lst_node: dict, exclude_inline_types:
             else:
                 image_with_caption = image
 
+            if image_footnote:
+                image_with_caption = f'{image_with_caption}\n\n{image_footnote}'
+
             return image_with_caption
         elif node_type == DocElementType.AUDIO:
             return ''  # TODO: 音频格式
diff --git a/tests/llm_web_kit/input/assets/content_json.json b/tests/llm_web_kit/input/assets/content_json.json
index 672c3fc8..84eacaa6 100644
--- a/tests/llm_web_kit/input/assets/content_json.json
+++ b/tests/llm_web_kit/input/assets/content_json.json
@@ -182,13 +182,14 @@
       },
       {
         "type": "image",
-        "raw_content": "<img src=\"http://example.com/image.png\" alt=\"image\">",
+        "bbox": [],
         "content": {
           "url": "http://example.com/image.png",
           "data": null,
           "alt": "image",
           "title": null,
-          "caption": ""
+          "caption": [],
+          "footnote": []
         }
       },
       {
@@ -203,7 +204,7 @@
       },
       {
         "type": "list",
-        "raw_content": "<ul><li>UL1 <span>UL1.1</span></li><li>UL2</li></ul>",
+        "bbox": [],
         "content": {
           "items": [
             {
@@ -219,7 +220,7 @@
       },
       {
         "type": "list",
-        "raw_content": "<dl><dt>HTML</dt><dd>瓒呮枃鏈爣璁拌瑷€</dd><dt>CSS</dt><dd>灞傚彔鏍峰紡琛�</dd></dl>",
+        "bbox": [],
         "content": {
           "items": [
             {

From 6cfe0691d6efe82811517486d2cfbf9b6aa5e8ae Mon Sep 17 00:00:00 2001
From: liukaiwen <kl3157@columbia.edu>
Date: Thu, 20 Nov 2025 16:53:50 +0800
Subject: [PATCH 3/4] feat: reformat content list of image and list

---
 llm_web_kit/input/datajson.py            | 3 +--
 tests/llm_web_kit/input/test_datajson.py | 3 ++-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/llm_web_kit/input/datajson.py b/llm_web_kit/input/datajson.py
index 46911860..c9b533c2 100644
--- a/llm_web_kit/input/datajson.py
+++ b/llm_web_kit/input/datajson.py
@@ -282,8 +282,7 @@ def __content_lst_node_2_md(self, content_lst_node: dict, exclude_inline_types:
             image_title = content_lst_node['content'].get('title', '')
             image_caption = content_lst_node['content'].get('caption', [])
             image_url = content_lst_node['content'].get('url', '')
-            image_footnote = content_lst_node['content'].get('caption', [])
-
+            image_footnote = content_lst_node['content'].get('footnote', [])
             if not image_path and not image_data:
                 image_path = sha256_hash(image_url)
 
diff --git a/tests/llm_web_kit/input/test_datajson.py b/tests/llm_web_kit/input/test_datajson.py
index 6996fc38..8f317c8b 100644
--- a/tests/llm_web_kit/input/test_datajson.py
+++ b/tests/llm_web_kit/input/test_datajson.py
@@ -156,7 +156,8 @@ def test_datajson_exclude_nodes_to_mmd(self):
                     'data': None,
                     'alt': 'Curtindo o apartamento com piscina no centro de SP. ',
                     'title': 'Curtindo o apartamento com piscina no centro de SP. ',
-                    'caption': None
+                    'caption': [],
+                    'footnote': []
                 }
             }]]
         }

From 09b44ae103dff9ad6e241b3a1105742b8a0b4d4b Mon Sep 17 00:00:00 2001
From: liukaiwen <kl3157@columbia.edu>
Date: Thu, 20 Nov 2025 17:06:28 +0800
Subject: [PATCH 4/4] feat: reformat content list of image and list

---
 tests/llm_web_kit/input/test_datajson.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/llm_web_kit/input/test_datajson.py b/tests/llm_web_kit/input/test_datajson.py
index 8f317c8b..76779260 100644
--- a/tests/llm_web_kit/input/test_datajson.py
+++ b/tests/llm_web_kit/input/test_datajson.py
@@ -157,7 +157,7 @@ def test_datajson_exclude_nodes_to_mmd(self):
                     'alt': 'Curtindo o apartamento com piscina no centro de SP. ',
                     'title': 'Curtindo o apartamento com piscina no centro de SP. ',
                     'caption': [],
-                    'footnote': []
+                    'footnote': ['test image footnote']
                 }
             }]]
         }