Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -49,3 +49,5 @@ llm_web_kit.egg-info/*
.llm-web-kit.jsonc
.llm-web-kit-pageclassify.jsonc
tests/llm_web_kit/extractor/ygq_testmd
output.md
output.jsonl
435 changes: 257 additions & 178 deletions docs/specification/output_format/content_list_spec.md

Large diffs are not rendered by default.

161 changes: 161 additions & 0 deletions llm_web_kit/api/services/request_log_service.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
"""请求日志服务.

提供请求日志的创建、更新和查询功能。
"""

import uuid
from datetime import datetime
from typing import Optional

from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession

from ..dependencies import get_logger
from ..models.db_models import RequestLog

logger = get_logger(__name__)


class RequestLogService:
    """Service for creating, updating and querying request logs.

    All methods are best-effort: when the database session is not
    configured (``session is None``) or a database error occurs, they
    log the problem and return a neutral value instead of raising, so
    request logging can never break the main request flow.
    """

    @staticmethod
    def generate_request_id() -> str:
        """Generate a unique request id (a UUID4 string)."""
        return str(uuid.uuid4())

    @staticmethod
    async def create_log(
        session: Optional[AsyncSession],
        request_id: str,
        input_type: str,
        input_html: Optional[str] = None,
        url: Optional[str] = None,
    ) -> Optional[RequestLog]:
        """Create a request log record with status ``'processing'``.

        Args:
            session: Database session; ``None`` disables logging.
            request_id: Unique request id (see :meth:`generate_request_id`).
            input_type: Input type (``html_content``, ``url`` or ``file``).
            input_html: Raw input HTML content, if any.
            url: Source URL, if any.

        Returns:
            The newly created log row, or ``None`` when the database is
            not configured or the insert failed.
        """
        if session is None:
            logger.debug("数据库会话为空,跳过日志记录")
            return None
        try:
            # Take a single timestamp so created_at and updated_at are
            # exactly equal on insert (two now() calls could differ by
            # microseconds).
            now = datetime.now()
            log = RequestLog(
                request_id=request_id,
                input_type=input_type,
                input_html=input_html,
                url=url,
                status='processing',
                created_at=now,
                updated_at=now,
            )
            session.add(log)
            await session.flush()  # write immediately so the row gets its id
            logger.info(f"创建请求日志: request_id={request_id}, input_type={input_type}, status=processing")
            return log
        except Exception as e:
            # logger.exception keeps the same message as before but also
            # records the traceback, which logger.error was dropping.
            logger.exception(f"创建请求日志失败: {e}")
            return None

    @staticmethod
    async def update_log_success(
        session: Optional[AsyncSession],
        request_id: str,
        output_markdown: Optional[str] = None,
    ) -> bool:
        """Mark the request log as successful and store its output.

        Args:
            session: Database session; ``None`` disables logging.
            request_id: Request id of the row to update.
            output_markdown: Generated Markdown output, if any.

        Returns:
            ``True`` when the row was found and updated, else ``False``.
        """
        return await RequestLogService._finalize_log(
            session,
            request_id,
            'success',
            f"更新请求日志为成功: request_id={request_id}, status=success",
            output_markdown=output_markdown,
        )

    @staticmethod
    async def update_log_failure(
        session: Optional[AsyncSession],
        request_id: str,
        error_message: str,
    ) -> bool:
        """Mark the request log as failed and store the error message.

        Args:
            session: Database session; ``None`` disables logging.
            request_id: Request id of the row to update.
            error_message: Human-readable failure description.

        Returns:
            ``True`` when the row was found and updated, else ``False``.
        """
        return await RequestLogService._finalize_log(
            session,
            request_id,
            'fail',
            f"更新请求日志为失败: request_id={request_id}, status=fail",
            error_message=error_message,
        )

    @staticmethod
    async def get_log_by_request_id(
        session: Optional[AsyncSession],
        request_id: str,
    ) -> Optional[RequestLog]:
        """Look up a log row by request id.

        Args:
            session: Database session; ``None`` disables logging.
            request_id: Request id to search for.

        Returns:
            The matching log row, or ``None`` when the database is not
            configured, the row does not exist, or the query failed.
        """
        if session is None:
            return None
        try:
            return await RequestLogService._fetch_log(session, request_id)
        except Exception as e:
            logger.exception(f"查询请求日志失败: {e}")
            return None

    # ---- private helpers -------------------------------------------------

    @staticmethod
    async def _fetch_log(
        session: AsyncSession,
        request_id: str,
    ) -> Optional[RequestLog]:
        """Return the RequestLog row for ``request_id``, or ``None``."""
        result = await session.execute(
            select(RequestLog).where(RequestLog.request_id == request_id)
        )
        return result.scalar_one_or_none()

    @staticmethod
    async def _finalize_log(
        session: Optional[AsyncSession],
        request_id: str,
        status: str,
        success_message: str,
        **fields,
    ) -> bool:
        """Set ``status`` plus extra ``fields`` on an existing log row.

        Shared implementation behind :meth:`update_log_success` and
        :meth:`update_log_failure` (previously copy-pasted in both).

        Args:
            session: Database session; ``None`` disables logging.
            request_id: Request id of the row to update.
            status: New status value (``'success'`` or ``'fail'``).
            success_message: Exact info-log line to emit on success.
            **fields: Additional column values to assign on the row.

        Returns:
            ``True`` when the row was found and updated, else ``False``.
        """
        if session is None:
            return False
        try:
            log = await RequestLogService._fetch_log(session, request_id)
            if log is None:
                logger.warning(f"未找到请求日志: request_id={request_id}")
                return False
            log.status = status
            for name, value in fields.items():
                setattr(log, name, value)
            log.updated_at = datetime.now()
            await session.flush()
            logger.info(success_message)
            return True
        except Exception as e:
            logger.exception(f"更新请求日志失败: {e}")
            return False
12 changes: 6 additions & 6 deletions llm_web_kit/extractor/html/recognizer/cccode.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,29 +88,29 @@ def recognize(

@override
def to_content_list_node(self, base_url:str, parsed_content: HtmlElement, raw_html_segment:str) -> dict:
"""
把代码元素转换为content list node.
"""把代码元素转换为content list node. 注意:此方法只处理块级代码(CC_CODE),行内代码(CC_CODE_INLIN
E)由TextParagraphRecognizer处理.

Args:
base_url:
parsed_content: HtmlElement对象
raw_html_segment:

Returns:

"""
d = {
'type': 'code',
# "bbox": [],
'raw_content': raw_html_segment,
'inline': parsed_content.get('inline', 'false') == 'true',
'bbox': [],
'content': {
'code_content': parsed_content.text,
},
}

# 可选字段:language
if lang := parsed_content.get('language', None):
d['content']['language'] = lang

# 可选字段:by(代码高亮工具)
if by := parsed_content.get('by', None):
d['content']['by'] = by

Expand Down
25 changes: 14 additions & 11 deletions llm_web_kit/extractor/html/recognizer/ccmath.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,23 +53,26 @@ def recognize(self, base_url: str, main_html_lst: List[Tuple[HtmlElement, HtmlEl

@override
def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_html_segment: str) -> dict:
"""将content转换成content_list_node.
每种类型的html元素都有自己的content-list格式:参考 docs/specification/output_format/content_list_spec.md
例如代码的返回格式:
"""将content转换成content_list_node. 每种类型的html元素都有自己的content-list格式:参考
docs/specification/output_format/content_list_spec.md.

返回格式示例:
```json
{
"type": "equation-inline", # 数学公式类型,一共equation-inline和equation-interline两种
"raw_content": "<ccmath type="latex" by="mathjax">$u_{x_0}^{in}(x)$</ccmath>",
"type": "equation-interline",
"bbox": [],
"content": {
"math_content": "u_{x_0}^{in}(x)",
"math_content": "a^2 + b^2 = c^2",
"math_type": "latex",
"by": "mathjax"
}
}
```
```

Args:
content: str: 要转换的content
Args:
base_url: 基础URL
parsed_content: 解析后的HtmlElement对象
raw_html_segment: 原始HTML片段

Returns:
dict: content_list_node
Expand All @@ -86,7 +89,7 @@ def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_h
math_content = self.cm.wrap_math_md(math_content)
return {
'type': DocElementType.EQUATION_INTERLINE,
'raw_content': raw_html_segment,
'bbox': [],
'content': {
'math_content': math_content,
'math_type': inter_ele[0].get('type'), # 数学语言类型
Expand All @@ -97,7 +100,7 @@ def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_h
math_content = in_els[0].text
return {
'type': DocElementType.EQUATION_INLINE,
'raw_content': raw_html_segment,
'bbox': [],
'content': {
'math_content': math_content,
'math_type': in_els[0].get('type'), # 数学语言类型
Expand Down
7 changes: 5 additions & 2 deletions llm_web_kit/extractor/html/recognizer/image.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,15 +53,18 @@ def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_h
raise HtmlImageRecognizerException(f'No ccimage element found in content: {parsed_content}')

def __ccimg_to_content_list(self, raw_html_segment: str, html_obj: HtmlElement) -> dict:
caption = html_obj.get('caption')
footnote = html_obj.get('footnote')
result = {
'type': DocElementType.IMAGE,
'raw_content': raw_html_segment,
'bbox': [],
'content': {
'url': html_obj.text if html_obj.get('format') == 'url' else None,
'data': html_obj.text if html_obj.get('format') == 'base64' else None,
'alt': html_obj.get('alt'),
'title': html_obj.get('title'),
'caption': html_obj.get('caption')
'caption': [caption] if caption else [],
'footnote': [footnote] if footnote else []
}
}
return result
Expand Down
8 changes: 6 additions & 2 deletions llm_web_kit/extractor/html/recognizer/list.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import json
import re
from typing import Any, List, Tuple

from lxml import html as lxml_html
Expand Down Expand Up @@ -44,7 +45,7 @@ def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_h

ele_node = {
'type': DocElementType.LIST,
'raw_content': raw_html_segment,
'bbox': [],
'content': {
'items': content_list,
'list_attribute': list_attribute,
Expand Down Expand Up @@ -196,6 +197,8 @@ def __extract_list_item_text_recusive(el: HtmlElement):
if len(paragraph) > 0 and paragraph[-1]['t'] == ParagraphTextType.TEXT:
paragraph[-1]['c'] += _new_tail
else:
if len(paragraph) > 0 and el.tag not in inline_tags:
_new_tail = '$br$' + _new_tail
paragraph.append({'c': _new_tail, 't': ParagraphTextType.TEXT})

if paragraph:
Expand All @@ -212,7 +215,8 @@ def __extract_list_item_text_recusive(el: HtmlElement):
text_paragraph.append(new_paragraph)

for n, item in enumerate(text_paragraph):
tem_json = json.dumps(item).replace('$br$', '\\n\\n')
tem_json = json.dumps(item, ensure_ascii=False)
tem_json = re.sub(r'(\s*\$br\$\s*)+', r'\\n', tem_json)
text_paragraph[n] = json.loads(tem_json)

return text_paragraph
Expand Down
24 changes: 16 additions & 8 deletions llm_web_kit/extractor/html/recognizer/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,17 +100,25 @@ def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_h
# 使用传入的 raw_html_segment 或将 parsed_content 转换为字符串
if table_type:
cc_table_type = DocElementType.COMPLEX_TABLE
d = {
'type': cc_table_type,
'content': {
'html': html_content,
'table_nest_level': table_nest_level,
"caption": [],
"footnote": []
}
}
else:
cc_table_type = DocElementType.SIMPLE_TABLE
d = {
'type': cc_table_type,
'raw_content': raw_html_segment,
'content': {
'html': html_content,
'is_complex': table_type,
'table_nest_level': table_nest_level
d = {
'type': cc_table_type,
'content': {
'html': html_content,
"caption": [],
"footnote": []
}
}
}
return d

def __is_contain_cc_html(self, cc_html: HtmlElement) -> bool:
Expand Down
4 changes: 2 additions & 2 deletions llm_web_kit/extractor/html/recognizer/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_h
el = parsed_content
node = {
'type': DocElementType.PARAGRAPH,
'raw_content': raw_html_segment,
# 'raw_content': raw_html_segment,
'content': json.loads(el.text),
}
return node
Expand Down Expand Up @@ -271,7 +271,7 @@ def __get_paragraph_text_recusive(el: HtmlElement, text: str) -> str:

for item in para_text:
if item['c'] is not None:
item['c'] = restore_sub_sup_from_text_regex(item['c']).replace('$br$', PARAGRAPH_SEPARATOR)
item['c'] = restore_sub_sup_from_text_regex(item['c']).replace('$br$', '\n')
else:
item['c'] = ""

Expand Down
2 changes: 1 addition & 1 deletion llm_web_kit/extractor/html/recognizer/title.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_h
return None
cctitle_content_node = {
'type': DocElementType.TITLE,
'raw_content': raw_html_segment,
# 'raw_content': raw_html_segment,
'content': {
'title_content': text,
'level': level
Expand Down
Loading