Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions llm_web_kit/extractor/html/recognizer/cccode.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,29 +88,29 @@ def recognize(

@override
def to_content_list_node(self, base_url:str, parsed_content: HtmlElement, raw_html_segment:str) -> dict:
"""
把代码元素转换为content list node.
"""把代码元素转换为content list node. 注意:此方法只处理块级代码(CC_CODE),行内代码(CC_CODE_INLIN
E)由TextParagraphRecognizer处理.

Args:
base_url:
parsed_content: HtmlElement对象
raw_html_segment:

Returns:

"""
d = {
'type': 'code',
# "bbox": [],
'raw_content': raw_html_segment,
'inline': parsed_content.get('inline', 'false') == 'true',
'bbox': [],
'content': {
'code_content': parsed_content.text,
},
}

# 可选字段:language
if lang := parsed_content.get('language', None):
d['content']['language'] = lang

# 可选字段:by(代码高亮工具)
if by := parsed_content.get('by', None):
d['content']['by'] = by

Expand Down
25 changes: 14 additions & 11 deletions llm_web_kit/extractor/html/recognizer/ccmath.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,23 +53,26 @@ def recognize(self, base_url: str, main_html_lst: List[Tuple[HtmlElement, HtmlEl

@override
def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_html_segment: str) -> dict:
"""将content转换成content_list_node.
每种类型的html元素都有自己的content-list格式:参考 docs/specification/output_format/content_list_spec.md
例如代码的返回格式:
"""将content转换成content_list_node. 每种类型的html元素都有自己的content-list格式:参考
docs/specification/output_format/content_list_spec.md.

返回格式示例:
```json
{
"type": "equation-inline", # 数学公式类型,一共equation-inline和equation-interline两种
"raw_content": "<ccmath type="latex" by="mathjax">$u_{x_0}^{in}(x)$</ccmath>",
"type": "equation-interline",
"bbox": [],
"content": {
"math_content": "u_{x_0}^{in}(x)",
"math_content": "a^2 + b^2 = c^2",
"math_type": "latex",
"by": "mathjax"
}
}
```
```

Args:
content: str: 要转换的content
Args:
base_url: 基础URL
parsed_content: 解析后的HtmlElement对象
raw_html_segment: 原始HTML片段

Returns:
dict: content_list_node
Expand All @@ -86,7 +89,7 @@ def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_h
math_content = self.cm.wrap_math_md(math_content)
return {
'type': DocElementType.EQUATION_INTERLINE,
'raw_content': raw_html_segment,
'bbox': [],
'content': {
'math_content': math_content,
'math_type': inter_ele[0].get('type'), # 数学语言类型
Expand All @@ -97,7 +100,7 @@ def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_h
math_content = in_els[0].text
return {
'type': DocElementType.EQUATION_INLINE,
'raw_content': raw_html_segment,
'bbox': [],
'content': {
'math_content': math_content,
'math_type': in_els[0].get('type'), # 数学语言类型
Expand Down
16 changes: 0 additions & 16 deletions llm_web_kit/input/datajson.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,22 +152,6 @@ def to_mm_md(self, exclude_nodes=[], exclude_inline_types=[], use_raw_image_url=
md = self.__to_md(exclude_nodes, exclude_inline_types, use_raw_image_url)
return md

def to_main_html(self) -> str:
    """Concatenate the raw HTML of every content-list node into one string.

    Walks the nested structure returned by ``self._get_data()`` (an iterable
    of pages, each an iterable of nodes) in order and joins each node's
    ``'raw_content'`` value with no separator.

    Returns:
        str: the concatenated HTML; an empty string when there are no nodes.

    Raises:
        KeyError: if a node has no ``'raw_content'`` entry
            (assuming nodes are dicts -- confirm against the producers).
    """
    # A single join is linear in the total output size; repeated ``+=`` on a
    # growing string can degrade to quadratic copying.
    return ''.join(
        node['raw_content']
        for page in self._get_data()
        for node in page
    )


def to_json(self, pretty=False) -> str:
content_lst = self._get_data()
if pretty:
Expand Down
2 changes: 1 addition & 1 deletion tests/llm_web_kit/extractor/html/recognizer/test_math.py
Original file line number Diff line number Diff line change
Expand Up @@ -287,7 +287,7 @@
),
'expected': {
'type': 'equation-interline',
'raw_content': '<span class="math-container">$$h \\approx {{GM} \\over c^2} \\times {1 \\over r} \\times {v^2 \\over c^2}$$</span>',
'bbox': [],
'content': {
'math_content': 'h \\approx {{GM} \\over c^2} \\times {1 \\over r} \\times {v^2 \\over c^2}',
'math_type': 'latex',
Expand Down
3 changes: 2 additions & 1 deletion tests/llm_web_kit/extractor/html/recognizer/test_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -331,7 +331,8 @@ def test_interactive_element(self):
}
input_data = DataJson(test_data)
result = chain.extract(input_data)
main_html = result.get_content_list().to_main_html()
# 验证 main_html 中没有交互元素
main_html = result.get('main_html')
assert '<input' not in main_html

def test_normalize_space1(self):
Expand Down
5 changes: 0 additions & 5 deletions tests/llm_web_kit/extractor/test_extractor_chain.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,6 @@ def test_html_pipeline(self):
self.assertEqual(html_content['type'], DocElementType.CODE)
self.assertEqual(len(html_content['content']['code_content']), 251)
self.assertEqual(html_content['content']['by'], 'tag_pre_code')
self.assertEqual(html_content['inline'], False)

# 有序列表
html_content = html_content_list[10]
Expand Down Expand Up @@ -176,10 +175,6 @@ def test_html_pipeline(self):
self.assertNotEqual(md_content[-2], '\n')
self.assertEqual(md_content[-1], '\n')

# main_html
main_html = result.get_content_list().to_main_html() # 获取main_html内容
self.assertEqual(main_html, self.main_html_expected_content) # 如果遇到嵌套的html, 则返回原始html的时候还是应当拼接替换一下 TODO

def test_html_pipeline_suit_2(self):
"""测试第二个数据:这个数据会丢失一些文本信息."""
chain = ExtractSimpleFactory.create(self.config)
Expand Down
5 changes: 2 additions & 3 deletions tests/llm_web_kit/input/assets/content_json.json
Original file line number Diff line number Diff line change
Expand Up @@ -69,8 +69,7 @@
},
{
"type": "code",
"raw_content": "<div class=\"tFormatCodeBlock supportThreadCodeBlock\" style=\"border: 1px solid #7f9db9;overflow-y: auto;\" id=\"highlighter_46368\">\n <div style=\"background-color: #ffffff;\"><span style=\"margin-left: 0px !important;\"><code style=\"color: #000000;\">Private Sub sitemenu_ItemCreated(ByVal sender As Object, ByVal e As Telerik.Web.UI.RadMenuEventArgs) Handles sitemenu.ItemCreated</code></span>\n </div>\n <div style=\"background-color: #f8f8f8;\"><span><code>        </code><span style=\"margin-left: 24px !important;\"><code style=\"color: #000000;\">Dim item As RadMenuItem = sitemenu.FindItemByUrl(Request.Url.PathAndQuery)</code></span></span>\n </div>\n <div style=\"background-color: #ffffff;\"><span><code>        </code><span style=\"margin-left: 24px !important;\"><code style=\"color: #000000;\">If item IsNot Nothing Then</code></span></span>\n </div>\n <div style=\"background-color: #f8f8f8;\"><span><code>            </code><span style=\"margin-left: 36px !important;\"><code style=\"color: #000000;\">item.CssClass = \"focused\"</code></span></span>\n </div>\n <div style=\"background-color: #ffffff;\"><span><code>        </code><span style=\"margin-left: 24px !important;\"><code style=\"color: #000000;\">End If</code></span></span>\n </div>\n <div style=\"background-color: #f8f8f8;\"><span style=\"margin-left: 0px !important;\"> </span></div>\n <div style=\"background-color: #ffffff;\"><span><code>    </code><span style=\"margin-left: 12px !important;\"><code style=\"color: #000000;\">End Sub</code></span></span>\n </div>\n</div>\n\n",
"inline": false,
"bbox": [],
"content": {
"code_content": "Private Sub sitemenu_ItemCreated(ByVal sender As Object, ByVal e As Telerik.Web.UI.RadMenuEventArgs) Handles sitemenu.ItemCreated\n Dim item As RadMenuItem = sitemenu.FindItemByUrl(Request.Url.PathAndQuery)\n If item IsNot Nothing Then\n item.CssClass = \"focused\"\n End If\n\n End Sub",
"by": "tag_code"
Expand Down Expand Up @@ -164,7 +163,7 @@
},
{
"type": "equation-interline",
"raw_content": "<p>$$a^2 + b^2 = c^2$$</p>",
"bbox": [],
"content": {
"math_content": "a^2 + b^2 = c^2",
"math_type": "latex",
Expand Down