diff --git a/llm_web_kit/extractor/html/recognizer/cccode.py b/llm_web_kit/extractor/html/recognizer/cccode.py
index 0baf436c..9c7aa8df 100644
--- a/llm_web_kit/extractor/html/recognizer/cccode.py
+++ b/llm_web_kit/extractor/html/recognizer/cccode.py
@@ -88,29 +88,29 @@ def recognize(
@override
def to_content_list_node(self, base_url:str, parsed_content: HtmlElement, raw_html_segment:str) -> dict:
- """
- 把代码元素转换为content list node.
+ """把代码元素转换为content list node. 注意:此方法只处理块级代码(CC_CODE),
+ 行内代码(CC_CODE_INLINE)由TextParagraphRecognizer处理.
+
Args:
base_url:
parsed_content: HtmlElement对象
raw_html_segment:
Returns:
-
"""
d = {
'type': 'code',
- # "bbox": [],
- 'raw_content': raw_html_segment,
- 'inline': parsed_content.get('inline', 'false') == 'true',
+ 'bbox': [],
'content': {
'code_content': parsed_content.text,
},
}
+ # 可选字段:language
if lang := parsed_content.get('language', None):
d['content']['language'] = lang
+ # 可选字段:by(代码高亮工具)
if by := parsed_content.get('by', None):
d['content']['by'] = by
diff --git a/llm_web_kit/extractor/html/recognizer/ccmath.py b/llm_web_kit/extractor/html/recognizer/ccmath.py
index b83a2edc..32884087 100644
--- a/llm_web_kit/extractor/html/recognizer/ccmath.py
+++ b/llm_web_kit/extractor/html/recognizer/ccmath.py
@@ -53,23 +53,26 @@ def recognize(self, base_url: str, main_html_lst: List[Tuple[HtmlElement, HtmlEl
@override
def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_html_segment: str) -> dict:
- """将content转换成content_list_node.
- 每种类型的html元素都有自己的content-list格式:参考 docs/specification/output_format/content_list_spec.md
- 例如代码的返回格式:
+ """将content转换成content_list_node. 每种类型的html元素都有自己的content-list格式:参考
+ docs/specification/output_format/content_list_spec.md.
+
+ 返回格式示例:
```json
{
- "type": "equation-inline", # 数学公式类型,一共equation-inline和equation-interline两种
- "raw_content": "$u_{x_0}^{in}(x)$",
+ "type": "equation-interline",
+ "bbox": [],
"content": {
- "math_content": "u_{x_0}^{in}(x)",
+ "math_content": "a^2 + b^2 = c^2",
"math_type": "latex",
"by": "mathjax"
}
}
- ```
+ ```
- Args:
- content: str: 要转换的content
+ Args:
+ base_url: 基础URL
+ parsed_content: 解析后的HtmlElement对象
+ raw_html_segment: 原始HTML片段
Returns:
dict: content_list_node
@@ -86,7 +89,7 @@ def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_h
math_content = self.cm.wrap_math_md(math_content)
return {
'type': DocElementType.EQUATION_INTERLINE,
- 'raw_content': raw_html_segment,
+ 'bbox': [],
'content': {
'math_content': math_content,
'math_type': inter_ele[0].get('type'), # 数学语言类型
@@ -97,7 +100,7 @@ def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_h
math_content = in_els[0].text
return {
'type': DocElementType.EQUATION_INLINE,
- 'raw_content': raw_html_segment,
+ 'bbox': [],
'content': {
'math_content': math_content,
'math_type': in_els[0].get('type'), # 数学语言类型
diff --git a/llm_web_kit/input/datajson.py b/llm_web_kit/input/datajson.py
index 56ed8272..32d662bd 100644
--- a/llm_web_kit/input/datajson.py
+++ b/llm_web_kit/input/datajson.py
@@ -152,22 +152,6 @@ def to_mm_md(self, exclude_nodes=[], exclude_inline_types=[], use_raw_image_url=
md = self.__to_md(exclude_nodes, exclude_inline_types, use_raw_image_url)
return md
- def to_main_html(self) -> str:
- """拼接和每个content_list_node对应的html内容,返回一个完整的html文档.
-
- Args:
- content_lst_node (dict): content_list里定义的每种元素块
- Returns:
- str: html格式
- """
- content_lst = self._get_data()
- html = ''
- for page in content_lst:
- for content_lst_node in page:
- raw_html = content_lst_node['raw_content']
- html += raw_html
- return html
-
def to_json(self, pretty=False) -> str:
content_lst = self._get_data()
if pretty:
diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_math.py b/tests/llm_web_kit/extractor/html/recognizer/test_math.py
index 9a9af500..20572874 100644
--- a/tests/llm_web_kit/extractor/html/recognizer/test_math.py
+++ b/tests/llm_web_kit/extractor/html/recognizer/test_math.py
@@ -287,7 +287,7 @@
),
'expected': {
'type': 'equation-interline',
- 'raw_content': '$$h \\approx {{GM} \\over c^2} \\times {1 \\over r} \\times {v^2 \\over c^2}$$',
+ 'bbox': [],
'content': {
'math_content': 'h \\approx {{GM} \\over c^2} \\times {1 \\over r} \\times {v^2 \\over c^2}',
'math_type': 'latex',
diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_text.py b/tests/llm_web_kit/extractor/html/recognizer/test_text.py
index 0c0f8db2..e3c6119f 100644
--- a/tests/llm_web_kit/extractor/html/recognizer/test_text.py
+++ b/tests/llm_web_kit/extractor/html/recognizer/test_text.py
@@ -331,7 +331,8 @@ def test_interactive_element(self):
}
input_data = DataJson(test_data)
result = chain.extract(input_data)
- main_html = result.get_content_list().to_main_html()
+ # 验证 main_html 中没有交互元素
+ main_html = result.get('main_html')
assert '\n
Private Sub sitemenu_ItemCreated(ByVal sender As Object, ByVal e As Telerik.Web.UI.RadMenuEventArgs) Handles sitemenu.ItemCreated\n
\n Dim item As RadMenuItem = sitemenu.FindItemByUrl(Request.Url.PathAndQuery)\n
\n If item IsNot Nothing Then\n
\n item.CssClass = \"focused\"\n
\n End If\n
\n
\n End Sub\n
\n\n\n",
- "inline": false,
+ "bbox": [],
"content": {
"code_content": "Private Sub sitemenu_ItemCreated(ByVal sender As Object, ByVal e As Telerik.Web.UI.RadMenuEventArgs) Handles sitemenu.ItemCreated\n Dim item As RadMenuItem = sitemenu.FindItemByUrl(Request.Url.PathAndQuery)\n If item IsNot Nothing Then\n item.CssClass = \"focused\"\n End If\n\n End Sub",
"by": "tag_code"
@@ -164,7 +163,7 @@
},
{
"type": "equation-interline",
- "raw_content": "$$a^2 + b^2 = c^2$$
",
+ "bbox": [],
"content": {
"math_content": "a^2 + b^2 = c^2",
"math_type": "latex",