diff --git a/llm_web_kit/extractor/html/recognizer/table.py b/llm_web_kit/extractor/html/recognizer/table.py index c2449bb7..7ec23317 100644 --- a/llm_web_kit/extractor/html/recognizer/table.py +++ b/llm_web_kit/extractor/html/recognizer/table.py @@ -100,17 +100,25 @@ def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_h # 使用传入的 raw_html_segment 或将 parsed_content 转换为字符串 if table_type: cc_table_type = DocElementType.COMPLEX_TABLE + d = { + 'type': cc_table_type, + 'content': { + 'html': html_content, + 'table_nest_level': table_nest_level, + "caption": [], + "footnote": [] + } + } else: cc_table_type = DocElementType.SIMPLE_TABLE - d = { - 'type': cc_table_type, - # 'raw_content': raw_html_segment, - 'content': { - 'html': html_content, - 'is_complex': table_type, - 'table_nest_level': table_nest_level + d = { + 'type': cc_table_type, + 'content': { + 'html': html_content, + "caption": [], + "footnote": [] + } } - } return d def __is_contain_cc_html(self, cc_html: HtmlElement) -> bool: diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_to_content_list_complex_res.json b/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_to_content_list_complex_res.json index 9216b23b..ba341040 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_to_content_list_complex_res.json +++ b/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_to_content_list_complex_res.json @@ -2,7 +2,8 @@ "type": "complex_table", "content": { "html": "
ফেব্রুয়ারি ২০২৪
সোমমঙ্গলবুধবৃহশুক্রশনিরবি
« জানুয়ারি
১০১১
১২১৩১৪১৫১৬১৭১৮
১৯২০২১২২২৩২৪২৫
২৬২৭২৮২৯
", - "is_complex": true, - "table_nest_level": null + "table_nest_level": null, + "caption": [], + "footnote": [] } } diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_to_content_list_simple_res.json b/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_to_content_list_simple_res.json index 57412c32..95c43154 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_to_content_list_simple_res.json +++ b/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_to_content_list_simple_res.json @@ -1 +1 @@ -{"type": "simple_table", "content": {"html": "
12
34
", "is_complex": false}} \ No newline at end of file +{"type": "simple_table", "content": {"html": "
12
34
", "caption": [], "footnote": []}} \ No newline at end of file diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_table.py b/tests/llm_web_kit/extractor/html/recognizer/test_table.py index bc91d52d..fe17c919 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_table.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_table.py @@ -131,7 +131,6 @@ def test_table_to_content_list_node_simple(self): expect = base_dir.joinpath(test_case['expected'][0]) expect_json = expect.read_text(encoding='utf-8') assert result['type'] == json.loads(expect_json)['type'] - assert result['content']['is_complex'] == json.loads(expect_json)['content']['is_complex'] self.assertTrue(result['content']['html'].startswith('')) self.assertTrue(result['content']['html'].endswith('
')) diff --git a/tests/llm_web_kit/extractor/test_extractor_chain.py b/tests/llm_web_kit/extractor/test_extractor_chain.py index d5af6d7e..44680347 100644 --- a/tests/llm_web_kit/extractor/test_extractor_chain.py +++ b/tests/llm_web_kit/extractor/test_extractor_chain.py @@ -112,13 +112,11 @@ def test_html_pipeline(self): # 然后是simple table html_content = html_content_list[4] self.assertEqual(html_content['type'], DocElementType.SIMPLE_TABLE) - self.assertEqual(html_content['content']['is_complex'], False) assert html_content['content']['html'].startswith('1.12.13.14.1", - "is_complex": false, - "table_nest_level": "1" + "caption": [], + "footnote": [] } }, { @@ -100,8 +100,9 @@ "type": "complex_table", "content": { "html": "
123
4
567
", - "is_complex": true, - "table_nest_level": "1" + "table_nest_level": "1", + "caption": [], + "footnote": [] } }, {