diff --git a/llm_web_kit/main_html_parser/simplify_html/simplify_html.py b/llm_web_kit/main_html_parser/simplify_html/simplify_html.py
index 3c37da8e..05b8352e 100644
--- a/llm_web_kit/main_html_parser/simplify_html/simplify_html.py
+++ b/llm_web_kit/main_html_parser/simplify_html/simplify_html.py
@@ -3,6 +3,7 @@
 import uuid
 from typing import Dict, List, Tuple
 
+from bs4 import BeautifulSoup
 from lxml import etree, html
 from selectolax.parser import HTMLParser
 
@@ -858,8 +859,13 @@ def simplify_html(html_str) -> etree.Element:
         _xpath_mapping: xpath映射
     """
     # 使用selectolax的HTMLParser来修复html
-    soup = HTMLParser(html_str)
-    fixed_html = soup.html
+    try:
+        soup = HTMLParser(html_str)
+        fixed_html = soup.html
+    except Exception:
+        # selectolax raised on this input; fall back to BeautifulSoup's html.parser.
+        soup = BeautifulSoup(html_str, 'html.parser')
+        fixed_html = str(soup)
     preprocessed_html = remove_xml_declaration(fixed_html)
 
     # 注释通过lxml的HTMLParser的remove_comments参数处理
diff --git a/tests/llm_web_kit/main_html_parser/parser/assets/test_html_data/simplify_cases/unusual_encoding.html b/tests/llm_web_kit/main_html_parser/parser/assets/test_html_data/simplify_cases/unusual_encoding.html
new file mode 100644
index 00000000..969074b3
Binary files /dev/null and b/tests/llm_web_kit/main_html_parser/parser/assets/test_html_data/simplify_cases/unusual_encoding.html differ
diff --git a/tests/llm_web_kit/main_html_parser/parser/test_tag_simplifier.py b/tests/llm_web_kit/main_html_parser/parser/test_tag_simplifier.py
index 45962b1e..e28cc0ce 100644
--- a/tests/llm_web_kit/main_html_parser/parser/test_tag_simplifier.py
+++ b/tests/llm_web_kit/main_html_parser/parser/test_tag_simplifier.py
@@ -472,6 +472,25 @@ def test_tag_simplifier_block_select(self):
         self.assertIsNotNone(p_element.get("_item_id"))
         self.assertIsNotNone(p_element.get("cc-select"))
 
+    def test_tag_simplifier_unusual_encoding(self):
+        """The unusual_encoding fixture simplifies with matching item ids in both outputs."""
+        file_path = base_dir / 'assets/test_html_data/simplify_cases/unusual_encoding.html'
+        with open(file_path, 'r', encoding='utf-8') as file:
+            raw_html = file.read()
+
+        data_dict = {PreDataJsonKey.TYPICAL_RAW_HTML: raw_html}
+        pre_data = PreDataJson(data_dict)
+
+        pre_data_result = HtmlTagSimplifierParser({}).parse(pre_data)
+
+        simplifier_raw_html = pre_data_result.get(PreDataJsonKey.TYPICAL_SIMPLIFIED_HTML, '')
+        simple_id_count = self.check_and_find_max_item_id(simplifier_raw_html)
+        self.assertEqual(simple_id_count, 102)
+
+        raw_tag_html = pre_data_result.get(PreDataJsonKey.TYPICAL_RAW_TAG_HTML, '')
+        tag_id_count = self.check_and_find_max_item_id(raw_tag_html)
+        self.assertEqual(tag_id_count, simple_id_count)
+
 
 if __name__ == '__main__':
     unittest.main()