diff --git a/llm_web_kit/main_html_parser/simplify_html/simplify_html.py b/llm_web_kit/main_html_parser/simplify_html/simplify_html.py
index 3c37da8e..05b8352e 100644
--- a/llm_web_kit/main_html_parser/simplify_html/simplify_html.py
+++ b/llm_web_kit/main_html_parser/simplify_html/simplify_html.py
@@ -3,6 +3,7 @@
import uuid
from typing import Dict, List, Tuple
+from bs4 import BeautifulSoup
from lxml import etree, html
from selectolax.parser import HTMLParser
@@ -858,8 +859,12 @@ def simplify_html(html_str) -> etree.Element:
_xpath_mapping: xpath映射
"""
# 使用selectolax的HTMLParser来修复html
- soup = HTMLParser(html_str)
- fixed_html = soup.html
+ try:
+ soup = HTMLParser(html_str)
+ fixed_html = soup.html
+ except Exception:
+ soup = BeautifulSoup(html_str, 'html.parser')
+ fixed_html = str(soup)
preprocessed_html = remove_xml_declaration(fixed_html)
# 注释通过lxml的HTMLParser的remove_comments参数处理
diff --git a/tests/llm_web_kit/main_html_parser/parser/assets/test_html_data/simplify_cases/unusual_encoding.html b/tests/llm_web_kit/main_html_parser/parser/assets/test_html_data/simplify_cases/unusual_encoding.html
new file mode 100644
index 00000000..969074b3
Binary files /dev/null and b/tests/llm_web_kit/main_html_parser/parser/assets/test_html_data/simplify_cases/unusual_encoding.html differ
diff --git a/tests/llm_web_kit/main_html_parser/parser/test_tag_simplifier.py b/tests/llm_web_kit/main_html_parser/parser/test_tag_simplifier.py
index 45962b1e..e28cc0ce 100644
--- a/tests/llm_web_kit/main_html_parser/parser/test_tag_simplifier.py
+++ b/tests/llm_web_kit/main_html_parser/parser/test_tag_simplifier.py
@@ -472,6 +472,24 @@ def test_tag_simplifier_block_select(self):
self.assertIsNotNone(p_element.get("_item_id"))
self.assertIsNotNone(p_element.get("cc-select"))
+ def test_tag_simplifier_unusual_encoding(self):
+     """Run the tag simplifier on the ``unusual_encoding.html`` fixture.
+
+     Passes the fixture text through ``HtmlTagSimplifierParser`` and checks
+     that ``check_and_find_max_item_id`` reports 102 for the simplified
+     HTML and the same value for the raw tagged HTML.
+     """
+     file_path = base_dir / 'assets/test_html_data/simplify_cases/unusual_encoding.html'
+     # Decode explicitly rather than with the locale's preferred encoding,
+     # so the parser receives the same text on every platform.
+     # NOTE(review): assumes the fixture is UTF-8-decodable -- confirm.
+     with open(file_path, 'r', encoding='utf-8') as fixture:
+         raw_html = fixture.read()
+
+     data_dict = {PreDataJsonKey.TYPICAL_RAW_HTML: raw_html}
+     pre_data = PreDataJson(data_dict)
+
+     pre_data_result = HtmlTagSimplifierParser({}).parse(pre_data)
+
+     # Value reported for the simplified HTML produced from this fixture.
+     simplifier_raw_html = pre_data_result.get(PreDataJsonKey.TYPICAL_SIMPLIFIED_HTML, '')
+     simple_id_count = self.check_and_find_max_item_id(simplifier_raw_html)
+     self.assertEqual(simple_id_count, 102)
+
+     # The raw tagged HTML must report the same value as the simplified HTML.
+     raw_tag_html = pre_data_result.get(PreDataJsonKey.TYPICAL_RAW_TAG_HTML, '')
+     tag_id_count = self.check_and_find_max_item_id(raw_tag_html)
+     self.assertEqual(tag_id_count, simple_id_count)
+
if __name__ == '__main__':
unittest.main()