diff --git a/.gitignore b/.gitignore index 44fad1cc..216ffd1a 100644 --- a/.gitignore +++ b/.gitignore @@ -49,3 +49,5 @@ llm_web_kit.egg-info/* .llm-web-kit.jsonc .llm-web-kit-pageclassify.jsonc tests/llm_web_kit/extractor/ygq_testmd +output.md +output.jsonl diff --git a/docs/specification/output_format/content_list_spec.md b/docs/specification/output_format/content_list_spec.md index c46cdcd4..82956390 100644 --- a/docs/specification/output_format/content_list_spec.md +++ b/docs/specification/output_format/content_list_spec.md @@ -3,170 +3,137 @@ ## 目的 定义content_list的目的是为了统一流水线输出的数据格式,无论是网页、电子书、富文本pdf,word,ppt等,都可以转化到这个格式。 -使得不同的下游任务可以快速根据content_list导出所需要的数据格式。 +使得不同的下游任务可以: + +1. 快速根据content_list导出所需要的数据格式。 +2. 利用content_list筛选包含某些元素的内容 > 目的不是用户最终使用的格式,而是为了快速转化为下游任务需要的格式,例如大语言模型不需要图和音视频而多模态模型需要用。 -## 样例 +## 详细说明 遵循原则 - content_list 是被分解的文档,每个元素是文章中的一段内容,可以使文本、图片、代码、音视频等。 - 每个元素的表达方式是不一样的,受制于其`type`类型,逐层深入。 -- 为了调试找问题方便,留下了`raw_content`字段,用于存储原始的文本内容。 -- 整体结构是一个二维数组,每个元素是一个数组,表示一页内容。如果页面为空,则需要填充一个空数组进行占位,默认二维数组下标即为**页码**。 +- 整体结构是一个二维数组,第一维表示一页内容。如果页面为空,则需要填充一个**空数组进行占位**,二维数组下标即为**页码**。 + +整体结构 ```json -[ - [ - { - "type": "code", - "raw_content": "def add(a, b):\\n return a + b", - "inline": false, - "content": { - "code_content": "def add(a, b):\\n return a + b", - "by": "tag_code" - } - }, - { - "type": "equation-interline", - "raw_content": "

$$a^2 + b^2 = c^2$$

", - "content": { - "math_content": "a^2 + b^2 = c^2", - "math_type": "latex", - "by": "mathjax_mock" - } - }, - { - "type": "image", - "raw_content": "
\"Screen
What it ACTUALLY looks like
", - "content": { - "url": "http://static4.wikia.nocookie.net/__cb20120619225143/central/images/thumb/3/30/Screen_Shot_2012-06-19_at_6.25.45_PM.png/180px-Screen_Shot_2012-06-19_at_6.25.45_PM.png", - "data": null, - "alt": "Screen Shot 2012-06-19 at 6.25.45 PM", - "title": null, - "caption": "What it ACTUALLY looks like" - } - }, - { - "type": "simple_table", - "raw_content": "
项目
A1
", - "content": { - "html": "
项目
A1
", - "is_complex": false, - "table_nest_level": 1 - } - }, - { - "type": "complex_table", - "raw_content": "
指标数据
20232024
营收1015
", - "content": { - "html": "
指标数据
20232024
营收1015
", - "is_complex": true, - "table_nest_level": "1" - } - }, - { - "type": "list", - "raw_content": "
外层列表项
  1. 行内公式: E=mc^2
  2. 行内代码: x = 1
外层另一个列表项
  • 第二层菜单项
  • ", - "content": { - "items": [ - { - "c": "外层列表项" - }, - { - "child_list": { - "list_attribute": "ordered", - "items": [ - { - "c": "行内公式: $E=mc^2$" - }, - { - "c": "行内代码: `x = 1`" - } - ] - } - }, - { - "c": "外层另一个列表项" - }, - { - "child_list": { - "list_attribute": "unordered", - "items": [ - { - "c": "第二层菜单项" - } - ] - } - } - ], - "list_attribute": "definition", - "list_nest_level": "2" - } - }, - { - "type": "title", - "raw_content": "

    大模型好,大模型棒1

    ", - "content": { - "title_content": "大模型好,大模型棒1", - "level": "1" - } - }, - { - "type": "paragraph", - "raw_content": "Who Is In Your Top 3 Mentalists Of All Time? <code>x = 1</code> <ccmath-inline type=\"latex\" by=\"mathjax_mock\" html=\"$E=mc^2$\">E=mc^2</ccmath-inline> • MAGICIANSANDMAGIC.COM", - "content": [ - { - "c": "Who Is In Your Top 3 Mentalists Of All Time? x = 1", - "t": "text" - }, - { - "c": "E=mc^2", - "t": "equation-inline" - }, - { - "c": "• MAGICIANSANDMAGIC.COM", - "t": "text" +[ //文档结构开始 + [ //这里是第一页的内容开始,里面每个字典表述一个文档元素的全部信息 + { + "type":"code", + "bbox":[x0, y0, x1, y1], + "content":{ + //content内容根据type不同而不同 + } + }, + { + "type":"image", + "bbox":[x0, y0, x1, y1], + "content":{ + // ... } - ] - } + }, + ... + ],//第一页内容结束 + [ + //这里是第二页的内容,如果没有内容则必须留空(例如某一页PDF是空白) ], - [] -] +] //结构结束 ``` +使用数组而非类似`page_index=1` 的方式组织是为了能够快速索引到某一页的数据。 +对于网页大多数情况下只有一页。 + +支持的文档元素类型 + +| ---- | 网页 | 文档 | 说明 | +| ------------------ | ---- | ---- | ------------------------------------------------------------------------------------ | +| code | ✅ | ✅ | 代码 | +| algorithm | ❌ | ✅ | 伪代码 | +| equation-interline | ✅ | ✅ | 行间公式 | +| image | ✅ | ✅ | 图片 | +| simple_table | ✅ | ✅ | 可转化为markdown的表格 | +| complex_table | ✅ | ✅ | 含有合并单元格的表格,不可转为markdown | +| list | ✅ | ✅ | 列表 | +| ref_list | ❌ | ✅ | 论文参考文献列表 | +| title | ✅ | ✅ | 标题 | +| paragraph | ✅ | ✅ | 文字可表示内容,内部可以含有多种文字类型,`纯文本`,`行内公式`,`行内代码`,`拼音`等 | +| audio | ✅ | ❌ | 音频,只在网页数据里有 | +| video | ✅ | ❌ | 视频,只在网页数据里有 | +| page_header | ❌ | ✅ | 文档页眉 | +| page_footer | ❌ | ✅ | 文档页脚 | +| page_number | ❌ | ✅ | 文档页码 | +| page_aside_text | ❌ | ✅ | 文档边注 | +| page_footnote | ❌ | ✅ | 文档论文脚注 | + +其中`paragraph`又由以下几种类型组成: + +- `equation-inline`代表行内公式 +- `text`代表普通纯文本 +- `code-inline`代表行内代码,例如“执行linux`ls`命令” + ## 字段定义 ### 代码段 +代表多行的独立代码段 + +- 在PDF里伪代码被分入 `algorithm`类 +- 在网页里则不区分可运行代码和伪代码,但是有`行内代码`,行内代码位于`paragraph` + ```json { "type": "code", - "raw_content": "def add(a, b):\\n return a + b", - "inline": false, + "bbox":[x1, y1, x2, 
y2], "content": { + "caption":["下面是一段python求和函数"], "code_content": "def add(a, b):\\n return a + b", + "language":"python", "by": "tag_code" } } ``` -| 字段 | 类型 | 描述 | 是否必须 | -| -------------------- | ------ | ----------------------------- | -------- | -| type | string | 值固定为code | 是 | -| raw_content | string | 原始文本内容 | 可选 | -| inline | bool | 是否为行内代码 | 是 | -| content.code_content | string | 干净的,格式化过的代码内容 | 是 | -| content.language | string | 代码语言,python\\cpp\\php... | 可选 | -| content.by | string | 哪种代码高亮工具 、自定义规则 | 是 | +| 字段 | 类型 | 描述 | 是否必须 | +| -------------------- | ------ | ---------------------------------------------- | -------- | +| type | string | 值固定为code | 是 | +| content.code_content | string | 干净的,格式化过的代码内容 | 是 | +| content.caption | list | 代码标题,可以有多个。网页没有此字段 | 否 | +| content.language | string | 代码语言,python\\cpp\\php... | 可选 | +| content.by | string | 哪种代码高亮工具 、自定义规则,目前只在网页里有 | 可选 | -### 公式段 +### 伪代码 + +> ⚠️只在文档中出现,网页中无 + +```json +{ + "type":"algorithm", + "bbox": [x1, y1, x2, y2], + "content":{ + "algorithm_content":"循环:\n当x<0时停止", + "caption":["title-1", "title-2"] + } +} + +``` + +| 字段 | 类型 | 描述 | 是否必须 | +| ------------------------- | ------ | ------------------------------------ | -------- | +| type | string | 固定为algorithm,代表伪代码内容 | 是 | +| content.algorithm_content | string | 干净的,格式化过的代码内容 | 是 | +| content.caption | list | 代码标题,可以有多个。网页没有此字段 | 否 | + +### 行间公式段 ```json { "type": "equation-interline", - "raw_content": "

    $$a^2 + b^2 = c^2$$

    ", + "bbox": [x1, y1, x2, y2], "content": { "math_content": "a^2 + b^2 = c^2", "math_type": "latex", @@ -178,7 +145,6 @@ | 字段 | 类型 | 描述 | 是否必须 | | -------------------- | ------ | --------------------------------------------------------------- | -------- | | type | string | 可选为equation-interline或者equation-inline | 是 | -| raw_content | string | 原始文本内容 | 可选 | | content.math_content | string | 干净的,格式化过的公式内容。无论是行内还是行间公式两边都不能有$ | 是 | | content.math_type | string | 公式语言类型,latex\\mathml\\asciimath | 可选 | | content.by | string | 原html中使用公式渲染器,mathjax\\katex | 可选 | @@ -188,40 +154,42 @@ ```json { "type": "image", - "raw_content": "
    \"Screen
    What it ACTUALLY looks like
    ", + "bbox": [x1, y1, x2, y2], "content": { "url": "http://static4.wikia.nocookie.net/__cb20120619225143/central/images/thumb/3/30/Screen_Shot_2012-06-19_at_6.25.45_PM.png/180px-Screen_Shot_2012-06-19_at_6.25.45_PM.png", "data": null, "alt": "Screen Shot 2012-06-19 at 6.25.45 PM", "title": null, - "caption": "What it ACTUALLY looks like" + "caption": ["What it ACTUALLY looks like"], + "footnote":[] } } ``` -| 字段 | 类型 | 描述 | 是否必须 | -| --------------- | ------ | -------------------- | -------- | -| type | string | 值固定为image | 是 | -| raw_content | string | 原始文本内容 | 可选 | -| content.url | string | 图片的url地址 | 可选 | -| content.data | string | base64形式的图片数据 | 可选 | -| content.alt | string | 图片的alt属性 | 可选 | -| content.title | string | 图片的title属性 | 可选 | -| content.caption | string | 图片的caption属性 | 可选 | +| 字段 | 类型 | 描述 | 是否必须 | +| ---------------- | ------ | -------------------- | -------- | +| type | string | 值固定为image | 是 | +| content.url | string | 图片的url地址 | 可选 | +| content.data | string | base64形式的图片数据 | 可选 | +| content.alt | string | 图片的alt属性 | 可选 | +| content.title | string | 图片的title属性 | 可选 | +| content.caption | list | 图片的caption属性 | 可选 | +| content.footnote | list | 图片的footnote属性 | 可选 | > `content.url`和`content.data`二者必须有一个,数据使用优先级是`data`>`url`。 -### 音频段(未实现) +### 音频段 + +> ⚠️网页中有,文档中没有 ```json { "type": "audio", - "raw_content": null, "content": { "sources": ["https://www.example.com/audio.mp3"], "path": "s3://llm-media/audio.mp3", "title": "example audio", - "caption": "text from somewhere" + "caption": ["text from somewhere"] } } ``` @@ -230,13 +198,14 @@ | --------------- | ------ | ------------------ | -------- | | type | string | 值固定为audio | 是 | | bbox | array | \[x1, y1, x2, y2\] | 可选 | -| raw_content | string | 原始文本内容 | 可选 | | content.sources | array | 音频的url地址 | 可选 | | content.path | string | 音频的存储路径 | 可选 | | content.title | string | 音频的title属性 | 可选 | -| content.caption | string | 音频的caption属性 | 可选 | +| content.caption | list | 音频的caption属性 | 可选 | + +### 
视频段 -### 视频段(未实现) +> ⚠️网页中有,文档中没有 ```json { @@ -247,7 +216,7 @@ "sources": ["https://www.example.com/video.avi"], "path": "s3://llm-media/video.mp4", "title": "example video", - "caption": "text from somewhere" + "caption": ["text from somewhere"] } } ``` @@ -260,36 +229,60 @@ | content.sources | array | 视频的url地址 | 可选 | | content.path | string | 视频的存储路径 | 可选 | | content.title | string | 视频的title属性 | 可选 | -| content.caption | string | 视频的caption属性 | 可选 | +| content.caption | list | 视频的caption属性 | 可选 | -### 表格段 +### 复杂表格\[含跨行、列合并,嵌套\] ```json { "type": "complex_table", - "raw_content": "
    指标数据
    20232024
    营收1015
    ", + "bbox": [x1, y1, x2, y2], "content": { "html": "
    指标数据
    20232024
    营收1015
    ", - "is_complex": true, - "table_nest_level": "1" + "table_nest_level": "1", + "caption":[], + "footnote":[] } } ``` -| 字段 | 类型 | 描述 | 是否必须 | -| ------------------------ | ------- | ------------------------------------------------- | -------- | -| type | string | 可选值为simple_table、complex_table | 是 | -| raw_content | string | 原始文本内容 | 可选 | -| content.html | string | 表格的html内容 | 是 | -| content.is_complex | boolean | 是否是复杂表格(跨行、跨列的/嵌套表格, 默认为false | 可选 | -| content.table_nest_level | int | table嵌套层级(单个table为1,两层为2,以此类推) | 可选 | +| 字段 | 类型 | 描述 | 是否必须 | +| ------------------------ | ------ | --------------------------------------------- | -------- | +| type | string | 可选值为simple_table、complex_table | 是 | +| bbox | 四元组 | 元素位置坐标 | 可选 | +| content.html | string | 表格的html内容 | 是 | +| content.table_nest_level | int | table嵌套层级(单个table为1,两层为2,以此类推) | 可选 | +| content.caption | list | 表格的caption内容 | 否 | +| content.footnote | list | 表格的footnote内容 | 否 | -### 列表段 +### 简单表格 \[可用markdown表达的表格\] + +```json +{ + "type":"simple_table", + "bbox": [x1, y1, x2, y2], + "content": { + "html":"", + "caption":[], + "footnote":[] + } +} +``` + +| 字段 | 类型 | 描述 | 是否必须 | +| ---------------- | ------ | ----------------------------------- | -------- | +| type | string | 可选值为simple_table、complex_table | 是 | +| bbox | 四元组 | 元素位置坐标 | 可选 | +| content.html | string | 表格的html内容 | 是 | +| content.caption | list | 表格的caption内容 | 否 | +| content.footnote | list | 表格的footnote内容 | 否 | +### 列表段\[支持嵌套\] ```json { "type": "list", - "raw_content": "
    外层列表项
    1. 行内公式: E=mc^2
    2. 行内代码: x = 1
    外层另一个列表项
  • 第二层菜单项
  • ", + "bbox": [x1, y1, x2, y2], "content": { "items": [ { @@ -331,7 +324,7 @@ | 字段 | 类型 | 描述 | 是否必须 | | ----------------------- | ------ | --------------------------------------------------------- | -------- | | type | string | 值固定为list | 是 | -| raw_content | string | 原始文本内容 | 可选 | +| bbox | 四元组 | 元素位置坐标 | 可选 | | content.items | array | 列表项,每个元素是N个段落,段落里的元素是文本、公式或代码 | 是 | | content.list_attribute | string | unordered/ordered/definition | 可选 | | content.list_nest_level | int | list的嵌套层级(单层list list_nest_level为1) | 可选 | @@ -339,15 +332,30 @@ items字段说明 - `items`是一个二维数组,每个元素是一个段落,段落里的元素是文本、公式、markdown或行内代码。 -- 每个元素是一个对象,包含字段:c和t。 c是内容,t是类型。 -- t的取值有4种:`text`、`equation-inline`、`md`、`code-inline`。 +- 每个元素是一个对象,包含字段:c和t。 c是内容content首字母,t是类型type首字母。 +- t的取值同`paragraph` + +### ref_list 参考文献列表 + +> ⚠️仅在文档论文中有 + +```json +{ + "type":"ref_list", + "bbox": [x1, y1, x2, y2], + "content":{ + // same as list + } +} + +``` ### 标题段 ```json { "type": "title", - "raw_content": "

    大模型好,大模型棒1

    ", + "bbox": [x1, y1, x2, y2], "content": { "title_content": "大模型好,大模型棒1", "level": "1" @@ -358,7 +366,7 @@ | 字段 | 类型 | 描述 | 是否必须 | | --------------------- | ------ | -------------------- | -------- | | type | string | 值固定为title | 是 | -| raw_content | string | 原始文本内容 | 可选 | +| bbox | 四元组 | 元素位置坐标 | 可选 | | content.title_content | string | 标题内容 | 是 | | content.level | int | 标题级别,1-N, 1最大 | 可选 | @@ -367,7 +375,7 @@ ```json { "type": "paragraph", - "raw_content": "Who Is In Your Top 3 Mentalists Of All Time? <code>x = 1</code> <ccmath-inline type=\"latex\" by=\"mathjax_mock\" html=\"$E=mc^2$\">E=mc^2</ccmath-inline> • MAGICIANSANDMAGIC.COM", + "bbox": [x1, y1, x2, y2], "content": [ { "c": "Who Is In Your Top 3 Mentalists Of All Time? x = 1", @@ -385,17 +393,88 @@ } ``` -| 字段 | 类型 | 描述 | 是否必须 | -| ----------- | ------ | --------------------------------------------------------------- | -------- | -| type | string | 值固定为paragraph | 是 | -| raw_content | string | 原始文本内容 | 可选 | -| content | array | 段落内容,每个元素是一个对象,包含字段c和t。 c是内容,t是类型。 | 是 | +| 字段 | 类型 | 描述 | 是否必须 | +| ------- | ------ | --------------------------------------------------------------- | -------- | +| type | string | 值固定为paragraph | 是 | +| bbox | 四元组 | 元素位置坐标 | 可选 | +| content | array | 段落内容,每个元素是一个对象,包含字段c和t。 c是内容,t是类型。 | 是 | content字段说明 - content是一个数组,每个元素是一个对象,包含字段:`c`和`t`。 c是内容,t是类型。 -- t的取值有4种:`text`、`equation-inline`、`md`、`code-inline`。 +- `t`的取值有以下几种: + - `text` : 普通文本 + - `equation-inline`: 行内公式 + - `md`:markdown格式的文本,通常用于格式化从网上下来的markdown文档 + - `code-inline`:行内代码,例如 “执行linux的`ls`命令查看文件” + - `phonetic`:拼音 + +### 页眉 + +> ⚠️只在文档中出现,网页中没有此项 + +```json +{ + "type": "page_header", + "bbox":[x0, y0, x1, y1], + "content": { + "page_header_content": "大模型学习资料", + } +} +``` + +### 页脚 + +> ⚠️只在文档中出现,网页中没有此项 + +```json +{ + "type": "page_footer", + "bbox":[x0, y0, x1, y1], + "content": { + "page_footer_content": "~~内部资料请勿外传~~", + } +} +``` + +### 页码 + +> ⚠️只在文档中出现,网页中没有此项 + +```json +{ + "type": "page_number", + 
"bbox":[x0, y0, x1, y1], + "content": { + "page_number_content": "--12--", + } +} +``` -## 参考 +### 边注 -- [图文交错数据标准格式(2.1)](https://aicarrier.feishu.cn/wiki/L1vUwB0Ozi9vZBkdrzycaHwAn0e) +> ⚠️只在文档中出现,网页中没有此项 + +```json +{ + "type": "page_aside_text", + "bbox":[x0, y0, x1, y1], + "content": { + "page_aside_text_content": "LLM大模型学习资料", + } +} +``` + +### 论文脚注 + +> ⚠️只在文档中出现,网页中没有此项 + +```json +{ + "type": "page_footnote", + "bbox":[x0, y0, x1, y1], + "content": { + "page_footnote_content": "大模型的代表公司有OpenAI、Claude等", + } +} +``` diff --git a/llm_web_kit/api/services/request_log_service.py b/llm_web_kit/api/services/request_log_service.py new file mode 100644 index 00000000..1de0950d --- /dev/null +++ b/llm_web_kit/api/services/request_log_service.py @@ -0,0 +1,161 @@ +"""请求日志服务. + +提供请求日志的创建、更新和查询功能。 +""" + +import uuid +from datetime import datetime +from typing import Optional + +from sqlalchemy import select +from sqlalchemy.ext.asyncio import AsyncSession + +from ..dependencies import get_logger +from ..models.db_models import RequestLog + +logger = get_logger(__name__) + + +class RequestLogService: + """请求日志服务类.""" + @staticmethod + def generate_request_id() -> str: + """生成唯一的请求ID.""" + return str(uuid.uuid4()) + + @staticmethod + async def create_log( + session: Optional[AsyncSession], + request_id: str, + input_type: str, + input_html: Optional[str] = None, + url: Optional[str] = None, + ) -> Optional[RequestLog]: + """创建请求日志记录. 
+ + Args: + session: 数据库会话 + request_id: 请求ID + input_type: 输入类型 (html_content, url, file) + input_html: 输入HTML内容 + url: URL地址 + Returns: + 创建的日志记录,如果数据库未配置则返回 None + """ + if session is None: + logger.debug("数据库会话为空,跳过日志记录") + return None + try: + log = RequestLog( + request_id=request_id, + input_type=input_type, + input_html=input_html, + url=url, + status='processing', + created_at=datetime.now(), + updated_at=datetime.now(), + ) + session.add(log) + await session.flush() # 立即写入,获取ID + logger.info(f"创建请求日志: request_id={request_id}, input_type={input_type}, status=processing") + return log + except Exception as e: + logger.error(f"创建请求日志失败: {e}") + return None + + @staticmethod + async def update_log_success( + session: Optional[AsyncSession], + request_id: str, + output_markdown: Optional[str] = None, + ) -> bool: + """更新请求日志为成功状态. + + Args: + session: 数据库会话 + request_id: 请求ID + output_markdown: 输出Markdown内容 + Returns: + 是否更新成功 + """ + if session is None: + return False + try: + result = await session.execute( + select(RequestLog).where(RequestLog.request_id == request_id) + ) + log = result.scalar_one_or_none() + if log: + log.status = 'success' + log.output_markdown = output_markdown + log.updated_at = datetime.now() + await session.flush() + logger.info(f"更新请求日志为成功: request_id={request_id}, status=success") + return True + else: + logger.warning(f"未找到请求日志: request_id={request_id}") + return False + except Exception as e: + logger.error(f"更新请求日志失败: {e}") + return False + + @staticmethod + async def update_log_failure( + session: Optional[AsyncSession], + request_id: str, + error_message: str, + ) -> bool: + """更新请求日志为失败状态. 
+ + Args: + session: 数据库会话 + request_id: 请求ID + error_message: 错误信息 + Returns: + 是否更新成功 + """ + if session is None: + return False + try: + result = await session.execute( + select(RequestLog).where(RequestLog.request_id == request_id) + ) + log = result.scalar_one_or_none() + if log: + log.status = 'fail' + log.error_message = error_message + log.updated_at = datetime.now() + await session.flush() + logger.info(f"更新请求日志为失败: request_id={request_id}, status=fail") + return True + else: + logger.warning(f"未找到请求日志: request_id={request_id}") + return False + + except Exception as e: + logger.error(f"更新请求日志失败: {e}") + return False + + @staticmethod + async def get_log_by_request_id( + session: Optional[AsyncSession], + request_id: str, + ) -> Optional[RequestLog]: + """根据请求ID查询日志. + + Args: + session: 数据库会话 + request_id: 请求ID + Returns: + 日志记录,如果未找到则返回 None + """ + if session is None: + return None + try: + result = await session.execute( + select(RequestLog).where(RequestLog.request_id == request_id) + ) + return result.scalar_one_or_none() + except Exception as e: + logger.error(f"查询请求日志失败: {e}") + return None diff --git a/llm_web_kit/extractor/html/recognizer/cccode.py b/llm_web_kit/extractor/html/recognizer/cccode.py index 0baf436c..9c7aa8df 100644 --- a/llm_web_kit/extractor/html/recognizer/cccode.py +++ b/llm_web_kit/extractor/html/recognizer/cccode.py @@ -88,29 +88,29 @@ def recognize( @override def to_content_list_node(self, base_url:str, parsed_content: HtmlElement, raw_html_segment:str) -> dict: - """ - 把代码元素转换为content list node. + """把代码元素转换为content list node. 注意:此方法只处理块级代码(CC_CODE),行内代码(CC_CODE_INLIN + E)由TextParagraphRecognizer处理. 
+ Args: base_url: parsed_content: HtmlElement对象 raw_html_segment: Returns: - """ d = { 'type': 'code', - # "bbox": [], - 'raw_content': raw_html_segment, - 'inline': parsed_content.get('inline', 'false') == 'true', + 'bbox': [], 'content': { 'code_content': parsed_content.text, }, } + # 可选字段:language if lang := parsed_content.get('language', None): d['content']['language'] = lang + # 可选字段:by(代码高亮工具) if by := parsed_content.get('by', None): d['content']['by'] = by diff --git a/llm_web_kit/extractor/html/recognizer/ccmath.py b/llm_web_kit/extractor/html/recognizer/ccmath.py index b83a2edc..32884087 100644 --- a/llm_web_kit/extractor/html/recognizer/ccmath.py +++ b/llm_web_kit/extractor/html/recognizer/ccmath.py @@ -53,23 +53,26 @@ def recognize(self, base_url: str, main_html_lst: List[Tuple[HtmlElement, HtmlEl @override def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_html_segment: str) -> dict: - """将content转换成content_list_node. - 每种类型的html元素都有自己的content-list格式:参考 docs/specification/output_format/content_list_spec.md - 例如代码的返回格式: + """将content转换成content_list_node. 每种类型的html元素都有自己的content-list格式:参考 + docs/specification/output_format/content_list_spec.md. 
+ + 返回格式示例: ```json { - "type": "equation-inline", # 数学公式类型,一共equation-inline和equation-interline两种 - "raw_content": "$u_{x_0}^{in}(x)$", + "type": "equation-interline", + "bbox": [], "content": { - "math_content": "u_{x_0}^{in}(x)", + "math_content": "a^2 + b^2 = c^2", "math_type": "latex", "by": "mathjax" } } - ``` + ``` - Args: - content: str: 要转换的content + Args: + base_url: 基础URL + parsed_content: 解析后的HtmlElement对象 + raw_html_segment: 原始HTML片段 Returns: dict: content_list_node @@ -86,7 +89,7 @@ def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_h math_content = self.cm.wrap_math_md(math_content) return { 'type': DocElementType.EQUATION_INTERLINE, - 'raw_content': raw_html_segment, + 'bbox': [], 'content': { 'math_content': math_content, 'math_type': inter_ele[0].get('type'), # 数学语言类型 @@ -97,7 +100,7 @@ def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_h math_content = in_els[0].text return { 'type': DocElementType.EQUATION_INLINE, - 'raw_content': raw_html_segment, + 'bbox': [], 'content': { 'math_content': math_content, 'math_type': in_els[0].get('type'), # 数学语言类型 diff --git a/llm_web_kit/extractor/html/recognizer/image.py b/llm_web_kit/extractor/html/recognizer/image.py index 53f612dc..6e1d192b 100644 --- a/llm_web_kit/extractor/html/recognizer/image.py +++ b/llm_web_kit/extractor/html/recognizer/image.py @@ -53,15 +53,18 @@ def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_h raise HtmlImageRecognizerException(f'No ccimage element found in content: {parsed_content}') def __ccimg_to_content_list(self, raw_html_segment: str, html_obj: HtmlElement) -> dict: + caption = html_obj.get('caption') + footnote = html_obj.get('footnote') result = { 'type': DocElementType.IMAGE, - 'raw_content': raw_html_segment, + 'bbox': [], 'content': { 'url': html_obj.text if html_obj.get('format') == 'url' else None, 'data': html_obj.text if html_obj.get('format') == 'base64' else None, 'alt': 
html_obj.get('alt'), 'title': html_obj.get('title'), - 'caption': html_obj.get('caption') + 'caption': [caption] if caption else [], + 'footnote': [footnote] if footnote else [] } } return result diff --git a/llm_web_kit/extractor/html/recognizer/list.py b/llm_web_kit/extractor/html/recognizer/list.py index 61f113b8..a33a7712 100644 --- a/llm_web_kit/extractor/html/recognizer/list.py +++ b/llm_web_kit/extractor/html/recognizer/list.py @@ -1,4 +1,5 @@ import json +import re from typing import Any, List, Tuple from lxml import html as lxml_html @@ -44,7 +45,7 @@ def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_h ele_node = { 'type': DocElementType.LIST, - 'raw_content': raw_html_segment, + 'bbox': [], 'content': { 'items': content_list, 'list_attribute': list_attribute, @@ -196,6 +197,8 @@ def __extract_list_item_text_recusive(el: HtmlElement): if len(paragraph) > 0 and paragraph[-1]['t'] == ParagraphTextType.TEXT: paragraph[-1]['c'] += _new_tail else: + if len(paragraph) > 0 and el.tag not in inline_tags: + _new_tail = '$br$' + _new_tail paragraph.append({'c': _new_tail, 't': ParagraphTextType.TEXT}) if paragraph: @@ -212,7 +215,8 @@ def __extract_list_item_text_recusive(el: HtmlElement): text_paragraph.append(new_paragraph) for n, item in enumerate(text_paragraph): - tem_json = json.dumps(item).replace('$br$', '\\n\\n') + tem_json = json.dumps(item, ensure_ascii=False) + tem_json = re.sub(r'(\s*\$br\$\s*)+', r'\\n', tem_json) text_paragraph[n] = json.loads(tem_json) return text_paragraph diff --git a/llm_web_kit/extractor/html/recognizer/table.py b/llm_web_kit/extractor/html/recognizer/table.py index 239776cd..7ec23317 100644 --- a/llm_web_kit/extractor/html/recognizer/table.py +++ b/llm_web_kit/extractor/html/recognizer/table.py @@ -100,17 +100,25 @@ def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_h # 使用传入的 raw_html_segment 或将 parsed_content 转换为字符串 if table_type: cc_table_type = 
DocElementType.COMPLEX_TABLE + d = { + 'type': cc_table_type, + 'content': { + 'html': html_content, + 'table_nest_level': table_nest_level, + "caption": [], + "footnote": [] + } + } else: cc_table_type = DocElementType.SIMPLE_TABLE - d = { - 'type': cc_table_type, - 'raw_content': raw_html_segment, - 'content': { - 'html': html_content, - 'is_complex': table_type, - 'table_nest_level': table_nest_level + d = { + 'type': cc_table_type, + 'content': { + 'html': html_content, + "caption": [], + "footnote": [] + } } - } return d def __is_contain_cc_html(self, cc_html: HtmlElement) -> bool: diff --git a/llm_web_kit/extractor/html/recognizer/text.py b/llm_web_kit/extractor/html/recognizer/text.py index db90f4a7..0b5b08cc 100644 --- a/llm_web_kit/extractor/html/recognizer/text.py +++ b/llm_web_kit/extractor/html/recognizer/text.py @@ -93,7 +93,7 @@ def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_h el = parsed_content node = { 'type': DocElementType.PARAGRAPH, - 'raw_content': raw_html_segment, + # 'raw_content': raw_html_segment, 'content': json.loads(el.text), } return node @@ -271,7 +271,7 @@ def __get_paragraph_text_recusive(el: HtmlElement, text: str) -> str: for item in para_text: if item['c'] is not None: - item['c'] = restore_sub_sup_from_text_regex(item['c']).replace('$br$', PARAGRAPH_SEPARATOR) + item['c'] = restore_sub_sup_from_text_regex(item['c']).replace('$br$', '\n') else: item['c'] = "" diff --git a/llm_web_kit/extractor/html/recognizer/title.py b/llm_web_kit/extractor/html/recognizer/title.py index 6fa1cd59..f8ee7635 100644 --- a/llm_web_kit/extractor/html/recognizer/title.py +++ b/llm_web_kit/extractor/html/recognizer/title.py @@ -36,7 +36,7 @@ def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_h return None cctitle_content_node = { 'type': DocElementType.TITLE, - 'raw_content': raw_html_segment, + # 'raw_content': raw_html_segment, 'content': { 'title_content': text, 'level': level diff --git 
a/llm_web_kit/input/datajson.py b/llm_web_kit/input/datajson.py index 56ed8272..b02bc7d1 100644 --- a/llm_web_kit/input/datajson.py +++ b/llm_web_kit/input/datajson.py @@ -51,7 +51,7 @@ class StructureMapper(ABC): def __init__(self): self.__txt_para_splitter = '\n' - self.__md_para_splitter = '\n\n' + self.__md_para_splitter = '' self.__text_end = '\n' self.__list_item_start = '-' # md里的列表项前缀 self.__list_para_prefix = ' ' # 两个空格,md里的列表项非第一个段落的前缀:如果多个段落的情况,第二个以及之后的段落前缀 @@ -117,6 +117,8 @@ def __to_md(self, exclude_nodes=[], exclude_inline_types=[], use_raw_image_url=F if content_lst_node['type'] not in exclude_nodes: txt_content = self.__content_lst_node_2_md(content_lst_node, exclude_inline_types, use_raw_image_url) + if len(md_blocks) > 0 and not txt_content.startswith("\n") and not md_blocks[-1].endswith("\n"): # 若段落间没有换行,则添加换行 + md_blocks.append("\n\n") if txt_content and len(txt_content) > 0: md_blocks.append(txt_content) @@ -152,22 +154,6 @@ def to_mm_md(self, exclude_nodes=[], exclude_inline_types=[], use_raw_image_url= md = self.__to_md(exclude_nodes, exclude_inline_types, use_raw_image_url) return md - def to_main_html(self) -> str: - """拼接和每个content_list_node对应的html内容,返回一个完整的html文档. 
- - Args: - content_lst_node (dict): content_list里定义的每种元素块 - Returns: - str: html格式 - """ - content_lst = self._get_data() - html = '' - for page in content_lst: - for content_lst_node in page: - raw_html = content_lst_node['raw_content'] - html += raw_html - return html - def to_json(self, pretty=False) -> str: content_lst = self._get_data() if pretty: @@ -296,9 +282,9 @@ def __content_lst_node_2_md(self, content_lst_node: dict, exclude_inline_types: image_data = content_lst_node['content'].get('data', '') image_alt = content_lst_node['content'].get('alt', '') image_title = content_lst_node['content'].get('title', '') - image_caption = content_lst_node['content'].get('caption', '') + image_caption = content_lst_node['content'].get('caption', []) image_url = content_lst_node['content'].get('url', '') - + image_footnote = content_lst_node['content'].get('footnote', []) if not image_path and not image_data: image_path = sha256_hash(image_url) @@ -315,11 +301,16 @@ def __content_lst_node_2_md(self, content_lst_node: dict, exclude_inline_types: else: image_title = '' - if image_caption: - image_caption = image_caption.strip() + if len(image_caption) > 0: + image_caption = image_caption[0].strip() else: image_caption = '' + if len(image_footnote) > 0: + image_footnote = image_footnote[0].strip() + else: + image_footnote = '' + image_des = image_title if image_title else '' # 优先使用data, 其次path.其中data是base64编码的图片,path是图片的url if image_data: @@ -338,6 +329,9 @@ def __content_lst_node_2_md(self, content_lst_node: dict, exclude_inline_types: else: image_with_caption = image + if image_footnote: + image_with_caption = f'{image_with_caption}\n\n{image_footnote}' + return image_with_caption elif node_type == DocElementType.AUDIO: return '' # TODO: 音频格式 diff --git a/llm_web_kit/main_html_parser/parser/layout_batch_parser.py b/llm_web_kit/main_html_parser/parser/layout_batch_parser.py index 112d9ac9..62bcbc52 100644 --- a/llm_web_kit/main_html_parser/parser/layout_batch_parser.py 
+++ b/llm_web_kit/main_html_parser/parser/layout_batch_parser.py @@ -302,7 +302,7 @@ def htmll_to_content2(self, body_str): else: parent.text = (parent.text or '') + (element.tail or '') parent.remove(element) - self.add_newline_after_tags(body, ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'br', 'div', 'p', 'li']) + # self.add_newline_after_tags(body, ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'br', 'div', 'p', 'li']) output = [] main_content = re.split(r'\n{1,}', self.get_text_with_newlines(body)) for line in main_content: diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_to_content_list_complex_res.json b/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_to_content_list_complex_res.json index 5bb85151..ba341040 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_to_content_list_complex_res.json +++ b/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_to_content_list_complex_res.json @@ -1,9 +1,9 @@ { "type": "complex_table", - "raw_content": "<table><caption>ফেব্রুয়ারি ২০২৪</caption><thead><tr><th>সোম</th><th>মঙ্গল</th><th>বুধ</th><th>বৃহ</th><th>শুক্র</th><th>শনি</th><th>রবি</th></tr></thead><tfoot><tr><td colspan=\\\"3\\\">« জানুয়ারি</td><td></td><td colspan=\\\"3\\\"></td></tr></tfoot><tbody><tr><td colspan=\\\"3\\\"></td><td>১</td><td>২</td><td>৩</td><td>৪</td></tr><tr><td>৫</td><td>৬</td><td>৭</td><td>৮</td><td>৯</td><td>১০</td><td>১১</td></tr><tr><td>১২</td><td>১৩</td><td>১৪</td><td>১৫</td><td>১৬</td><td>১৭</td><td>১৮</td></tr><tr><td>১৯</td><td>২০</td><td>২১</td><td>২২</td><td>২৩</td><td>২৪</td><td>২৫</td></tr><tr><td>২৬</td><td>২৭</td><td>২৮</td><td>২৯</td><td colspan=\\\"3\\\"></td></tr></tbody></table>", "content": { "html": "
    ফেব্রুয়ারি ২০২৪
    সোমমঙ্গলবুধবৃহশুক্রশনিরবি
    « জানুয়ারি
    ১০১১
    ১২১৩১৪১৫১৬১৭১৮
    ১৯২০২১২২২৩২৪২৫
    ২৬২৭২৮২৯
    ", - "is_complex": true, - "table_nest_level": null + "table_nest_level": null, + "caption": [], + "footnote": [] } } diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_to_content_list_simple_res.json b/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_to_content_list_simple_res.json index 357f2843..95c43154 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_to_content_list_simple_res.json +++ b/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_to_content_list_simple_res.json @@ -1 +1 @@ -{"type": "simple_table", "raw_content": "<table><tr><td>1</td><td>2</td></tr><tr><td>3</td><td>4</td></tr></table>1234", "content": {"html": "
    12
    34
    ", "is_complex": false}} \ No newline at end of file +{"type": "simple_table", "content": {"html": "
    12
    34
    ", "caption": [], "footnote": []}} \ No newline at end of file diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_image.py b/tests/llm_web_kit/extractor/html/recognizer/test_image.py index 6396c6d1..281f0b6d 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_image.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_image.py @@ -82,9 +82,9 @@ 'url': 'xxx', 'parsed_content': """http://15.demooo.pl/wp-content/themes/starter/dist/images/logos/janser-logo.svg""", 'html': '...', - 'expected': {'type': 'image', 'raw_content': '...', 'content': { + 'expected': {'type': 'image', 'bbox': [], 'content': { 'url': 'http://15.demooo.pl/wp-content/themes/starter/dist/images/logos/janser-logo.svg', 'data': None, - 'alt': 'Janser Logo', 'title': None, 'caption': None}}, + 'alt': 'Janser Logo', 'title': None, 'caption': [], 'footnote': []}}, 'alt': 'Janser Logo', 'img_url': 'http://15.demooo.pl/wp-content/themes/starter/dist/images/logos/janser-logo.svg' }, @@ -94,9 +94,9 @@ ' format="url" alt="Układanie wykładzin">http://15.demooo.pl/wp-content/uploads/2022/08/ukladanie-wykladzin.svg""", 'html': '...', - 'expected': {'type': 'image', 'raw_content': '...', + 'expected': {'type': 'image', 'bbox': [], 'content': {'url': 'http://15.demooo.pl/wp-content/uploads/2022/08/ukladanie-wykladzin.svg', - 'data': None, 'alt': 'Układanie wykładzin', 'title': None, 'caption': None}}, + 'data': None, 'alt': 'Układanie wykładzin', 'title': None, 'caption': [], 'footnote': []}}, 'alt': 'Układanie wykładzin', 'img_url': 'http://15.demooo.pl/wp-content/uploads/2022/08/ukladanie-wykladzin.svg' }, diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_list.py b/tests/llm_web_kit/extractor/html/recognizer/test_list.py index 5f8d61de..7bb5bdb5 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_list.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_list.py @@ -19,28 +19,28 @@ def setUp(self): self.__list_with_ul_text_content = None 
self.__list_with_sub_no_prefix_content = None - with open(f'{os.path.dirname(os.path.abspath(__file__))}/assets/recognizer/simple_list.html', 'r') as file: + with open(f'{os.path.dirname(os.path.abspath(__file__))}/assets/recognizer/simple_list.html', 'r', encoding='utf-8') as file: self.__simple_list_content = file.read() - with open(f'{os.path.dirname(os.path.abspath(__file__))}/assets/recognizer/complex_list.html', 'r') as file: + with open(f'{os.path.dirname(os.path.abspath(__file__))}/assets/recognizer/complex_list.html', 'r', encoding='utf-8') as file: self.__complex_list_content = file.read() - with open(f'{os.path.dirname(os.path.abspath(__file__))}/assets/recognizer/test-list-item.html', 'r') as file: + with open(f'{os.path.dirname(os.path.abspath(__file__))}/assets/recognizer/test-list-item.html', 'r', encoding='utf-8') as file: self.__with_empty_list_item_content = file.read() - with open(f'{os.path.dirname(os.path.abspath(__file__))}/assets/recognizer/list_sub_sup.html', 'r') as file: + with open(f'{os.path.dirname(os.path.abspath(__file__))}/assets/recognizer/list_sub_sup.html', 'r', encoding='utf-8') as file: self.__list_with_sub_sup_content = file.read() - with open(f'{os.path.dirname(os.path.abspath(__file__))}/assets/recognizer/list_br_and_cctags.html', 'r') as file: + with open(f'{os.path.dirname(os.path.abspath(__file__))}/assets/recognizer/list_br_and_cctags.html', 'r', encoding='utf-8') as file: self.__list_with_br_and_cctags_content = file.read() - with open(f'{os.path.dirname(os.path.abspath(__file__))}/assets/recognizer/list_with_sub_sup_tail.html', 'r') as file: + with open(f'{os.path.dirname(os.path.abspath(__file__))}/assets/recognizer/list_with_sub_sup_tail.html', 'r', encoding='utf-8') as file: self.__list_with_sub_sup_tail_content = file.read() - with open(f'{os.path.dirname(os.path.abspath(__file__))}/assets/recognizer/list_with_ul_text.html', 'r') as file: + with 
open(f'{os.path.dirname(os.path.abspath(__file__))}/assets/recognizer/list_with_ul_text.html', 'r', encoding='utf-8') as file: self.__list_with_ul_text_content = file.read() - with open(f'{os.path.dirname(os.path.abspath(__file__))}/assets/recognizer/list_with_sub_no_prefix.html', 'r') as file: + with open(f'{os.path.dirname(os.path.abspath(__file__))}/assets/recognizer/list_with_sub_no_prefix.html', 'r', encoding='utf-8') as file: self.__list_with_sub_no_prefix_content = file.read() def test_simple_list(self): @@ -158,7 +158,6 @@ def test_to_content_list_node(self): # 验证返回的内容结构正确 assert 'type' in content_node, '返回的content_node缺少type字段' assert 'content' in content_node, '返回的content_node缺少content字段' - assert 'raw_content' in content_node, '返回的content_node缺少raw_content字段' # 验证content字段包含必要的内容 assert 'items' in content_node['content'], 'content字段缺少items' diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_math.py b/tests/llm_web_kit/extractor/html/recognizer/test_math.py index 9a9af500..20572874 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_math.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_math.py @@ -287,7 +287,7 @@ ), 'expected': { 'type': 'equation-interline', - 'raw_content': '$$h \\approx {{GM} \\over c^2} \\times {1 \\over r} \\times {v^2 \\over c^2}$$', + 'bbox': [], 'content': { 'math_content': 'h \\approx {{GM} \\over c^2} \\times {1 \\over r} \\times {v^2 \\over c^2}', 'math_type': 'latex', diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_table.py b/tests/llm_web_kit/extractor/html/recognizer/test_table.py index eb58bb98..fe17c919 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_table.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_table.py @@ -131,8 +131,6 @@ def test_table_to_content_list_node_simple(self): expect = base_dir.joinpath(test_case['expected'][0]) expect_json = expect.read_text(encoding='utf-8') assert result['type'] == json.loads(expect_json)['type'] - assert 
result['content']['is_complex'] == json.loads(expect_json)['content']['is_complex'] - assert result['raw_content'] == json.loads(expect_json)['raw_content'] self.assertTrue(result['content']['html'].startswith('')) self.assertTrue(result['content']['html'].endswith('
    ')) diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_text.py b/tests/llm_web_kit/extractor/html/recognizer/test_text.py index 0c0f8db2..0bd90084 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_text.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_text.py @@ -31,7 +31,7 @@ def test_text_1(self): '中共中央政治局召开会议审议《成-2020年10月16日新闻联播', 'zh') == '知识乱象\n中共中央政治局召开会议审议《成-2020年10月16日新闻联播' result = self.text_recognize.recognize('http://www.baidu.com', [(html_to_element(html_content), html_to_element(html_content))], html_content) - assert '知识乱象\\n\\n 中共中央政治局' in element_to_html_unescaped(result[587][0]) + assert '知识乱象\\n 中共中央政治局' in element_to_html_unescaped(result[587][0]) def test_text_2(self): """ @@ -53,7 +53,7 @@ def test_text_2(self): input_data = DataJson(test_data) result = chain.extract(input_data) content_md = result.get_content_list().to_mm_md() - assert 'Selecting Rivet Sets:\n\n To develop maximum power' in content_md + assert 'Selecting Rivet Sets:\n To develop maximum power' in content_md def test_text_3(self): """ @@ -75,7 +75,7 @@ def test_text_3(self): input_data = DataJson(test_data) result = chain.extract(input_data) content_md = result.get_content_list().to_mm_md() - assert "1. The problem statement, all variables and given/known data\n\n A woman of height 1.7 meters stands directly in front of a convex mirror 2.0 meters away. The mirror has a radius of curvature, R=-50cm. Find the location and size of a woman's image using the ray diagram and mirror/lens equation.\n\n----------\n\n 2. The speed of light in a material is 2.50x10^8 meters per second. What is the index of refraction of the material?\n\n 2. Relevant equations\n\n 3. The attempt at a solution\n\n 1. di=22.22\n\n 2. Dont know" in content_md + assert "1. The problem statement, all variables and given/known data\n A woman of height 1.7 meters stands directly in front of a convex mirror 2.0 meters away. The mirror has a radius of curvature, R=-50cm. 
Find the location and size of a woman's image using the ray diagram and mirror/lens equation.\n\n----------\n\n 2. The speed of light in a material is 2.50x10^8 meters per second. What is the index of refraction of the material?\n\n 2. Relevant equations\n\n 3. The attempt at a solution\n 1. di=22.22\n\n 2. Dont know" in content_md def test_text_4(self): """ @@ -97,7 +97,7 @@ def test_text_4(self): input_data = DataJson(test_data) result = chain.extract(input_data) content_md = result.get_content_list().to_mm_md() - assert '1. The problem statement, all variables and given/known data\n\n 2. Relevant equations\n\n See attachment\n\n 3. The attempt at a solution\n\n I solved the problem' in content_md + assert '1. The problem statement, all variables and given/known data\n 2. Relevant equations\n\n See attachment\n\n 3. The attempt at a solution\n I solved the problem' in content_md def test_text_5(self): """ @@ -119,7 +119,7 @@ def test_text_5(self): input_data = DataJson(test_data) result = chain.extract(input_data) content_md = result.get_content_list().to_mm_md() - assert 'Please Note:\n\n 1. Charge the battery on receiving even if it will not be used soon.\n\n 2. Charge the battery EVERY MONTH if not in use for long periods to prevent over-discharging of the battery. This can cause irreparable damage to it.' in content_md + assert 'Please Note:\n 1. Charge the battery on receiving even if it will not be used soon.\n 2. Charge the battery EVERY MONTH if not in use for long periods to prevent over-discharging of the battery. This can cause irreparable damage to it.' 
in content_md def test_text_6(self): """ @@ -165,7 +165,7 @@ def test_text_8(self): with open(Path(__file__).parent.parent.parent / 'assets/extractor_chain_input/good_data/html/text8.html', 'r') as file: html_content = file.read() result = self.text_recognize.recognize('http://www.baidu.com', [(html_to_element(html_content), html_to_element(html_content))], html_content) - assert "40xy\' -ln(x^8) = 0\\n\\n\\n\\n Initial Condition: y(1)=31\\n\\n\\n\\n Work:" in element_to_html_unescaped(result[69][0]) and BaseHTMLElementRecognizer.is_cc_html(result[69][0]) + assert "40xy\' -ln(x^8) = 0\\n\\n Initial Condition: y(1)=31\\n\\n Work:" in element_to_html_unescaped(result[69][0]) and BaseHTMLElementRecognizer.is_cc_html(result[69][0]) def test_text_9(self): """ @@ -177,7 +177,7 @@ def test_text_9(self): with open(Path(__file__).parent.parent.parent / 'assets/extractor_chain_input/good_data/html/text9.html', 'r') as file: html_content = file.read() result = self.text_recognize.recognize('http://www.baidu.com', [(html_to_element(html_content), html_to_element(html_content))], html_content) - assert '1) Consider the formula f(x)=lim(n-->infinity)((x^n)/(1+x^n)).\\n\\n Let D={x:f(x) is an element of R}. Calculate f(x) for all x elements of D and determine where f: D-->R is continuous.\\n\\n\\n\\n 2) Let f: D-->R and suppose that f(x) greater than equal 0 for all x elements of D. Define sqrt(f)-->R by (sqrt(f))(x) = sqrt(f(x)). If f is continuous at c elements of D, prove that sqrt(f) is continuous at c.' in element_to_html_unescaped(result[63][0]) and BaseHTMLElementRecognizer.is_cc_html(result[63][0]) + assert '1) Consider the formula f(x)=lim(n-->infinity)((x^n)/(1+x^n)).\\n Let D={x:f(x) is an element of R}. Calculate f(x) for all x elements of D and determine where f: D-->R is continuous.\\n\\n 2) Let f: D-->R and suppose that f(x) greater than equal 0 for all x elements of D. Define sqrt(f)-->R by (sqrt(f))(x) = sqrt(f(x)). 
If f is continuous at c elements of D, prove that sqrt(f) is continuous at c.' in element_to_html_unescaped(result[63][0]) and BaseHTMLElementRecognizer.is_cc_html(result[63][0]) def test_text_10(self): """ @@ -199,7 +199,7 @@ def test_text_10(self): input_data = DataJson(test_data) result = chain.extract(input_data) content_md = result.get_content_list().to_mm_md() - assert 'So far I have 2 sets of questions (but I\'m onlin in the 2nd chapter now\n\n![:smile:](d80757e36ca9835f7237339959a1fa1d929bb5c5297acb457475459d6da12278 "Smile :smile:")\n\n)\n\n 1)\n\n In the book' in content_md + assert 'So far I have 2 sets of questions (but I\'m onlin in the 2nd chapter now\n\n![:smile:](d80757e36ca9835f7237339959a1fa1d929bb5c5297acb457475459d6da12278 "Smile :smile:")\n\n)\n\n 1)\n In the book' in content_md def test_text_11(self): """ @@ -331,7 +331,8 @@ def test_interactive_element(self): } input_data = DataJson(test_data) result = chain.extract(input_data) - main_html = result.get_content_list().to_main_html() + # 验证 main_html 中没有交互元素 + main_html = result.get('main_html') assert '
    ==========================title====================================
    ", "content": [ { "c": "==========================title====================================", @@ -23,7 +22,6 @@ }, { "type": "title", - "raw_content": "

    Title Test

    ", "content": { "title_content": "Title Test", "level": "1" @@ -31,7 +29,6 @@ }, { "type": "paragraph", - "raw_content": "
    ==========================code inline====================================
    ", "content": [ { "c": "==========================code inline====================================", @@ -41,7 +38,6 @@ }, { "type": "paragraph", - "raw_content": "
  • Dead simple\n Include prism.css and prism.js, use proper HTML5 code tags (code.language-xxxx), done!\n
  • ", "content": [ { "c": "Dead simple Include prism.css and prism.js, use proper HTML5 code tags (", @@ -59,7 +55,6 @@ }, { "type": "paragraph", - "raw_content": "
    ==========================code====================================
    ", "content": [ { "c": "==========================code====================================", @@ -69,8 +64,7 @@ }, { "type": "code", - "raw_content": "
    \n
    Private Sub sitemenu_ItemCreated(ByVal sender As Object, ByVal e As Telerik.Web.UI.RadMenuEventArgs) Handles sitemenu.ItemCreated\n
    \n
            Dim item As RadMenuItem = sitemenu.FindItemByUrl(Request.Url.PathAndQuery)\n
    \n
            If item IsNot Nothing Then\n
    \n
                item.CssClass = \"focused\"\n
    \n
            End If\n
    \n
     
    \n
        End Sub\n
    \n
    \n\n", - "inline": false, + "bbox": [], "content": { "code_content": "Private Sub sitemenu_ItemCreated(ByVal sender As Object, ByVal e As Telerik.Web.UI.RadMenuEventArgs) Handles sitemenu.ItemCreated\n Dim item As RadMenuItem = sitemenu.FindItemByUrl(Request.Url.PathAndQuery)\n If item IsNot Nothing Then\n item.CssClass = \"focused\"\n End If\n\n End Sub", "by": "tag_code" @@ -78,7 +72,6 @@ }, { "type": "paragraph", - "raw_content": "
    ==========================simple table====================================
    ", "content": [ { "c": "==========================simple table====================================", @@ -88,16 +81,14 @@ }, { "type": "simple_table", - "raw_content": "
    1.12.1
    3.14.1
    ", "content": { "html": "
    1.12.1
    3.14.1
    ", - "is_complex": false, - "table_nest_level": "1" + "caption": [], + "footnote": [] } }, { "type": "paragraph", - "raw_content": "
    ==========================complex table====================================
    ", "content": [ { "c": "==========================complex table====================================", @@ -107,16 +98,15 @@ }, { "type": "complex_table", - "raw_content": "
    123
    4
    567
    ", "content": { "html": "
    123
    4
    567
    ", - "is_complex": true, - "table_nest_level": "1" + "table_nest_level": "1", + "caption": [], + "footnote": [] } }, { "type": "paragraph", - "raw_content": "
    ==========================equation inline====================================
    ", "content": [ { "c": "==========================equation inline====================================", @@ -126,7 +116,6 @@ }, { "type": "paragraph", - "raw_content": "

    测试行内公式x=4

    ", "content": [ { "c": "测试行内公式", @@ -144,7 +133,6 @@ }, { "type": "paragraph", - "raw_content": "
    ==========================equation interline====================================
    ", "content": [ { "c": "==========================equation interline====================================", @@ -154,7 +142,6 @@ }, { "type": "paragraph", - "raw_content": "

    公式如下:

    ", "content": [ { "c": "公式如下:", @@ -164,7 +151,7 @@ }, { "type": "equation-interline", - "raw_content": "

    $$a^2 + b^2 = c^2$$

    ", + "bbox": [], "content": { "math_content": "a^2 + b^2 = c^2", "math_type": "latex", @@ -173,7 +160,6 @@ }, { "type": "paragraph", - "raw_content": "
    ==========================img====================================
    ", "content": [ { "c": "==========================img====================================", @@ -183,18 +169,18 @@ }, { "type": "image", - "raw_content": "\"image\"", + "bbox": [], "content": { "url": "http://example.com/image.png", "data": null, "alt": "image", "title": null, - "caption": "" + "caption": [], + "footnote": [] } }, { "type": "paragraph", - "raw_content": "
    ==========================list====================================
    ", "content": [ { "c": "==========================list====================================", @@ -204,7 +190,7 @@ }, { "type": "list", - "raw_content": "", + "bbox": [], "content": { "items": [ { @@ -220,7 +206,7 @@ }, { "type": "list", - "raw_content": "
    HTML
    瓒呮枃鏈爣璁拌瑷€
    CSS
    灞傚彔鏍峰紡琛�
    ", + "bbox": [], "content": { "items": [ { @@ -242,7 +228,6 @@ }, { "type": "paragraph", - "raw_content": "
    ==========================paragraph====================================
    ", "content": [ { "c": "==========================paragraph====================================", @@ -252,7 +237,6 @@ }, { "type": "paragraph", - "raw_content": "

    test paragraph

    ", "content": [ { "c": "test paragraph", @@ -262,7 +246,6 @@ }, { "type": "paragraph", - "raw_content": "
    ==========================audio====================================
    ", "content": [ { "c": "==========================audio====================================", @@ -272,7 +255,6 @@ }, { "type": "paragraph", - "raw_content": "
    ==========================video====================================
    ", "content": [ { "c": "==========================video====================================", @@ -282,7 +264,6 @@ }, { "type": "paragraph", - "raw_content": "
    ", "content": [ { "c": "Download the WEBM or MP4 video.", diff --git a/tests/llm_web_kit/input/test_datajson.py b/tests/llm_web_kit/input/test_datajson.py index 6996fc38..76779260 100644 --- a/tests/llm_web_kit/input/test_datajson.py +++ b/tests/llm_web_kit/input/test_datajson.py @@ -156,7 +156,8 @@ def test_datajson_exclude_nodes_to_mmd(self): 'data': None, 'alt': 'Curtindo o apartamento com piscina no centro de SP. ', 'title': 'Curtindo o apartamento com piscina no centro de SP. ', - 'caption': None + 'caption': [], + 'footnote': ['test image footnote'] } }]] } diff --git a/tests/llm_web_kit/main_html_parser/parser/assets/input_layout_batch_parser/test_fix_all_newlines.html b/tests/llm_web_kit/main_html_parser/parser/assets/input_layout_batch_parser/test_fix_all_newlines.html new file mode 100644 index 00000000..7e0592ce --- /dev/null +++ b/tests/llm_web_kit/main_html_parser/parser/assets/input_layout_batch_parser/test_fix_all_newlines.html @@ -0,0 +1,773 @@ + + + + + + Versace Bright Crystal EDT Perfume for Women 90ml | yangonbranded + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    top of page

    Versace Bright Crystal EDT Perfume for Women (In stock)

    @ 90ml retail packaging - K 319,000

    @ 90ml tester packaging (အဖုံးပါ) - K 259,000

     

    Made in Italy

     

    အမြဲပူအိုက်တဲ့ မြန်မာနိုင်ငံရာသီဥတုမှာ သုံးဖို့အဆင်ပြေတဲ့ Versace Bright Crystal က သလဲသီး၊ Yuzu လိမ္မော်၊ ရေခဲရနံ့ တွေနဲ့ Peony၊ စံကားဝါ၊ ကြာပန်း ရနံ့သင်းသင်းလေးတွေကို ပေါင်းထားတဲ့ fresh juicy floral ရနံ့ဖြစ်ပြီး drydown မှာ မပြင်းလွန်းတဲ့ ပယင်း၊ ကတိုး နှင့် မဟော်ဂနီရနံ့တွေသာ ပါလို့ classy ဖြစ်ပြီး လူကိုလန်းဆန်းစေတဲ့ soft and subtle airy scent ရနံ့သင်းသင်းလေး ဖြစ်ပါတယ်။

     

    ပေါ့ပေါ့ပါးပါး ခေါင်းမမူး ခေါင်းမကိုက်နိုင်တဲ့ အနံ့ fresh and clean feel ရှိတဲ့အနံ့ ဖြစ်ပြီး vanilla, powder, pepper နဲ့ aqua ရနံ့တွေကို ရှောင်ထားတဲ့ light scent ဖြစ်လို့ အဲဒီ note တွေမကြိုက်တဲ့သူတွေ ရေမွှေးပြင်းမကြိုက်တဲ့သူတွေ အတွက်ပိုသင့်ပါတယ်။

    မိန်းကလေးတော်တော်များသိပြီး သုံးပြီးဖြစ်လို့ နာမည်ကျော်ကြားပြီးဖြစ်တဲ့ အီတလီနိုင်ငံလုပ် ရေမွှေးဖြစ်ပါတယ်။

    Retail packaging နဲ့ tester packaging နှစ်မျိုးလုံးရှိပါတယ်။

    Tester packaging မှာအဖုံးပါတာမို့ ပုလင်းက retail packaging အတိုင်းဖြစ်ပါတယ်။ အပြင်စက္ကူဗူးကပဲ tester packaging ဗူးဖြစ်နေတာပါ။

    လက်ဆောင်ပေးဖို့ဝယ်တာဆိုရင်တော့ ဗူးခွံအမြင်လှတဲ့ retail packaging ကိုပဲဝယ်ဖို့ recommend လုပ်ပါတယ်။

    https://www.yangonbrandedperfume.com/product-page/versace-bright-crystal-edt-perfume-for-women-90ml-2

     

    အခြား In stock ရနိုင်တဲ့ Versace perfume တွေကို https://www.yangonbrandedperfume.com/versace မှာကြည့်နိုင်ပါတယ်။

    Versace Bright Crystal EDT Perfume for Women 90ml

    K319,000.00Price
        +
      • +

        ရေမွှေးတွေကို အိမ်အရောက်ပို့စနစ် home delivery နဲ့ဖြစ်ဖြစ်၊ Viber မှာ order တင်ပြီး ရန်ကုန်အိမ်မှာကိုယ်တိုင်လာယူတာဖြစ်ဖြစ် မှာယူနိုင်ပါတယ်။ ဖုံး/Viber 0943065356 ကိုဆက်ပြီး မေးနိုင်ပါတယ်။ Viber channel ကို join ထားရင် နေ့တိုင်း ဈေးလျှော့ထားတဲ့ရေမွှေးတွေနဲ့ review တွေဖတ်နိုင်ပါတယ်။

        +
      • +
      • +

        Yangon Branded ဆိုင်နာမည် တစ်မျိုးတည်းဖြင့်သာ ၂၀၁၁ ခုနှစ်မှစ၍ စဉ်ဆက်မပျက် ရောင်းလာခြင်းဖြစ်သည်။ ပုံမှန်ဝယ်ယူအားပေးသူ ရာပေါင်းများစွာ ရှိပြီးသားမို့ Yangon Branded ဆိုင်သတင်းကို အသိ၊မိတ်ဆွေထံ မဝယ်ခင် မေးကြည့်ပြီးမှသာ ဝယ်ယူရန် တိုက်တွန်းလိုပါတယ်။

        +
      • +
      • +

        ကိုယ်တိုင်တင်သွင်းလာသော Branded ရေမွှေးအစစ်များသက်သက်ကို သင့်တော်သောဈေးဖြင့် ရောင်းပါသည်။ အဆင့်မမှီရေမွှေးများ၊ replica ဆိုသောရေမွှေးများ လုံးဝမရောင်းပါ။

        +
      • +

      You are visitor number

      bottom of page
      + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
      +
      top of page
      +
      +
      +
      +
      +
      + +
      +
      +
      +
      +
      +
      +
      +
      +

      Perfume for Women

      +
      +
      + + +
      +
      +

      All Products

      +
      +
      + +
      +
      +
      + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
      +
      +
      +
      +
      +
      +
      +
      +
      + +
        +
      • +

        ရေမွှေးတွေကို အိမ်အရောက်ပို့စနစ် home delivery နဲ့ဖြစ်ဖြစ်၊ Viber မှာ order တင်ပြီး ရန်ကုန်အိမ်မှာကိုယ်တိုင်လာယူတာဖြစ်ဖြစ် မှာယူနိုင်ပါတယ်။ ဖုံး/Viber 0943065356 ကိုဆက်ပြီး မေးနိုင်ပါတယ်။ Viber channel ကို join ထားရင် နေ့တိုင်း ဈေးလျှော့ထားတဲ့ရေမွှေးတွေနဲ့ review တွေဖတ်နိုင်ပါတယ်။

        +
      • +
      • +

        Yangon Branded ဆိုင်နာမည် တစ်မျိုးတည်းဖြင့်သာ ၂၀၁၁ ခုနှစ်မှစ၍ စဉ်ဆက်မပျက် ရောင်းလာခြင်းဖြစ်သည်။ ပုံမှန်ဝယ်ယူအားပေးသူ ရာပေါင်းများစွာ ရှိပြီးသားမို့ Yangon Branded ဆိုင်သတင်းကို အသိ၊မိတ်ဆွေထံ မဝယ်ခင် မေးကြည့်ပြီးမှသာ ဝယ်ယူရန် တိုက်တွန်းလိုပါတယ်။

        +
      • +
      • +

        ကိုယ်တိုင်တင်သွင်းလာသော Branded ရေမွှေးအစစ်များသက်သက်ကို သင့်တော်သောဈေးဖြင့် ရောင်းပါသည်။ အဆင့်မမှီရေမွှေးများ၊ replica ဆိုသောရေမွှေးများ လုံးဝမရောင်းပါ။

        +
      • +
      +

      You are visitor number

      +
      +
      +
      +
      +
      bottom of page
      +
      + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/tests/llm_web_kit/main_html_parser/parser/test_layout_parser.py b/tests/llm_web_kit/main_html_parser/parser/test_layout_parser.py index 658f880e..9c2aa5c2 100644 --- a/tests/llm_web_kit/main_html_parser/parser/test_layout_parser.py +++ b/tests/llm_web_kit/main_html_parser/parser/test_layout_parser.py @@ -474,3 +474,31 @@ def test_code_newline(self): parts = parser.parse(pre_data) main_html = parts[PreDataJsonKey.MAIN_HTML] assert 'conda install bioconductor-annotationdbi' in main_html + + def test_fix_newlines(self): + # 构造测试html + typical_raw_tag_html = base_dir.joinpath( + 'assets/input_layout_batch_parser/test_fix_all_newlines.html').read_text( + encoding='utf-8') + html_source = base_dir.joinpath( + 'assets/input_layout_batch_parser/test_fix_all_newlines.html').read_text( + encoding='utf-8') + # 简化网页 + # 模型结果格式改写 + llm_path = 'assets/input_layout_batch_parser/test_code_newline.json' + llm_response = json.loads(base_dir.joinpath(llm_path).read_text(encoding='utf-8')) + pre_data = {'typical_raw_tag_html': typical_raw_tag_html, 'typical_raw_html': typical_raw_tag_html, + 'llm_response': llm_response, 'html_source': html_source} + pre_data = PreDataJson(pre_data) + # 映射 + parser = MapItemToHtmlTagsParser({}) + pre_data = parser.parse(pre_data) + + # 推广 + pre_data[PreDataJsonKey.DYNAMIC_ID_ENABLE] = True + pre_data[PreDataJsonKey.DYNAMIC_CLASSID_ENABLE] = True + pre_data[PreDataJsonKey.MORE_NOISE_ENABLE] = True + parser = LayoutBatchParser({}) + parts = parser.parse(pre_data) + main_html = parts[PreDataJsonKey.MAIN_HTML_BODY] + assert len(main_html) == 39746