Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 16 additions & 8 deletions llm_web_kit/extractor/html/recognizer/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,17 +100,25 @@ def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_h
# 使用传入的 raw_html_segment 或将 parsed_content 转换为字符串
if table_type:
cc_table_type = DocElementType.COMPLEX_TABLE
d = {
'type': cc_table_type,
'content': {
'html': html_content,
'table_nest_level': table_nest_level,
"caption": [],
"footnote": []
}
}
else:
cc_table_type = DocElementType.SIMPLE_TABLE
d = {
'type': cc_table_type,
# 'raw_content': raw_html_segment,
'content': {
'html': html_content,
'is_complex': table_type,
'table_nest_level': table_nest_level
d = {
'type': cc_table_type,
'content': {
'html': html_content,
"caption": [],
"footnote": []
}
}
}
return d

def __is_contain_cc_html(self, cc_html: HtmlElement) -> bool:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@
"type": "complex_table",
"content": {
"html": "<table><caption>ফেব্রুয়ারি ২০২৪</caption><thead><tr><th>সোম</th><th>মঙ্গল</th><th>বুধ</th><th>বৃহ</th><th>শুক্র</th><th>শনি</th><th>রবি</th></tr></thead><tfoot><tr><td colspan=\\\"3\\\">« জানুয়ারি</td><td></td><td colspan=\\\"3\\\"></td></tr></tfoot><tbody><tr><td colspan=\\\"3\\\"></td><td>১</td><td>২</td><td>৩</td><td>৪</td></tr><tr><td>৫</td><td>৬</td><td>৭</td><td>৮</td><td>৯</td><td>১০</td><td>১১</td></tr><tr><td>১২</td><td>১৩</td><td>১৪</td><td>১৫</td><td>১৬</td><td>১৭</td><td>১৮</td></tr><tr><td>১৯</td><td>২০</td><td>২১</td><td>২২</td><td>২৩</td><td>২৪</td><td>২৫</td></tr><tr><td>২৬</td><td>২৭</td><td>২৮</td><td>২৯</td><td colspan=\\\"3\\\"></td></tr></tbody></table>",
"is_complex": true,
"table_nest_level": null
"table_nest_level": null,
"caption": [],
"footnote": []
}
}
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"type": "simple_table", "content": {"html": "<table><tr><td>1</td><td>2</td></tr><tr><td>3</td><td>4</td></tr></table>", "is_complex": false}}
{"type": "simple_table", "content": {"html": "<table><tr><td>1</td><td>2</td></tr><tr><td>3</td><td>4</td></tr></table>", "caption": [], "footnote": []}}
1 change: 0 additions & 1 deletion tests/llm_web_kit/extractor/html/recognizer/test_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,6 @@ def test_table_to_content_list_node_simple(self):
expect = base_dir.joinpath(test_case['expected'][0])
expect_json = expect.read_text(encoding='utf-8')
assert result['type'] == json.loads(expect_json)['type']
assert result['content']['is_complex'] == json.loads(expect_json)['content']['is_complex']
self.assertTrue(result['content']['html'].startswith('<table>'))
self.assertTrue(result['content']['html'].endswith('</table>'))

Expand Down
22 changes: 10 additions & 12 deletions tests/llm_web_kit/extractor/test_extractor_chain.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,13 +112,11 @@ def test_html_pipeline(self):
# 然后是simple table
html_content = html_content_list[4]
self.assertEqual(html_content['type'], DocElementType.SIMPLE_TABLE)
self.assertEqual(html_content['content']['is_complex'], False)
assert html_content['content']['html'].startswith('<table')

# 然后是complex table
html_content = html_content_list[5]
self.assertEqual(html_content['type'], DocElementType.COMPLEX_TABLE)
self.assertEqual(html_content['content']['is_complex'], True)

# 然后是list
html_content = html_content_list[6]
Expand Down Expand Up @@ -541,8 +539,8 @@ def test_table_span_error(self):
test_data = self.data_json[31]
input_data = DataJson(test_data)
result = chain.extract(input_data)
result_flag = result.get_content_list()._get_data()[0][0]['content']['is_complex']
assert result_flag is True
result_flag = result.get_content_list()._get_data()[0][0]['type']
assert result_flag == "complex_table"

def test_table_colspan_error(self):
"""测试table的colspan标签为字符串引起的异常错误."""
Expand All @@ -551,8 +549,8 @@ def test_table_colspan_error(self):
test_data = self.data_json[32]
input_data = DataJson(test_data)
result = chain.extract(input_data)
result_flag = result.get_content_list()._get_data()[0][15]['content']['is_complex']
assert result_flag is False
result_flag = result.get_content_list()._get_data()[0][15]['type']
assert result_flag == "simple_table"

def test_table_colspan_percent_err(self):
"""测试table的colspan标签为百分数引起的异常错误."""
Expand All @@ -561,8 +559,8 @@ def test_table_colspan_percent_err(self):
test_data = self.data_json[33]
input_data = DataJson(test_data)
result = chain.extract(input_data)
result_flag = result.get_content_list()._get_data()[0][0]['content']['is_complex']
assert result_flag is True
result_flag = result.get_content_list()._get_data()[0][0]['type']
assert result_flag == "complex_table"

def test_table_colspan_str_error(self):
"""测试table的colspan标签为字符串引起的异常错误."""
Expand All @@ -571,8 +569,8 @@ def test_table_colspan_str_error(self):
test_data = self.data_json[34]
input_data = DataJson(test_data)
result = chain.extract(input_data)
result_flag = result.get_content_list()._get_data()[0][28]['content']['is_complex']
assert result_flag is False
result_flag = result.get_content_list()._get_data()[0][28]['type']
assert result_flag == "simple_table"

def test_table_invalid_percent(self):
"""测试table的colspan标签为百分数引起的异常错误."""
Expand All @@ -581,8 +579,8 @@ def test_table_invalid_percent(self):
test_data = self.data_json[35]
input_data = DataJson(test_data)
result = chain.extract(input_data)
result_flag = result.get_content_list()._get_data()[0][0]['content']['is_complex']
assert result_flag is False
result_flag = result.get_content_list()._get_data()[0][0]['type']
assert result_flag == "simple_table"

def test_maigc_html(self):
"""测试magic-html."""
Expand Down
9 changes: 5 additions & 4 deletions tests/llm_web_kit/input/assets/content_json.json
Original file line number Diff line number Diff line change
Expand Up @@ -83,8 +83,8 @@
"type": "simple_table",
"content": {
"html": "<table><tr><td>1.1</td><td>2.1</td></tr><tr><td>3.1</td><td>4.1</td></tr></table>",
"is_complex": false,
"table_nest_level": "1"
"caption": [],
"footnote": []
}
},
{
Expand All @@ -100,8 +100,9 @@
"type": "complex_table",
"content": {
"html": "<table><tr><td rowspan=\"2\">1</td><td>2</td><td>3</td></tr><tr><td colspan=\"2\">4</td></tr><tr><td>5</td><td>6</td><td>7</td></tr></table>",
"is_complex": true,
"table_nest_level": "1"
"table_nest_level": "1",
"caption": [],
"footnote": []
}
},
{
Expand Down