diff --git a/.gitignore b/.gitignore
index 44fad1cc..216ffd1a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -49,3 +49,5 @@ llm_web_kit.egg-info/*
.llm-web-kit.jsonc
.llm-web-kit-pageclassify.jsonc
tests/llm_web_kit/extractor/ygq_testmd
+output.md
+output.jsonl
diff --git a/llm_web_kit/extractor/html/recognizer/table.py b/llm_web_kit/extractor/html/recognizer/table.py
index 239776cd..c2449bb7 100644
--- a/llm_web_kit/extractor/html/recognizer/table.py
+++ b/llm_web_kit/extractor/html/recognizer/table.py
@@ -104,7 +104,7 @@ def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_h
cc_table_type = DocElementType.SIMPLE_TABLE
d = {
'type': cc_table_type,
- 'raw_content': raw_html_segment,
+ # 'raw_content': raw_html_segment,
'content': {
'html': html_content,
'is_complex': table_type,
diff --git a/llm_web_kit/extractor/html/recognizer/text.py b/llm_web_kit/extractor/html/recognizer/text.py
index db90f4a7..d6b3e857 100644
--- a/llm_web_kit/extractor/html/recognizer/text.py
+++ b/llm_web_kit/extractor/html/recognizer/text.py
@@ -93,7 +93,7 @@ def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_h
el = parsed_content
node = {
'type': DocElementType.PARAGRAPH,
- 'raw_content': raw_html_segment,
+ # 'raw_content': raw_html_segment,
'content': json.loads(el.text),
}
return node
diff --git a/llm_web_kit/extractor/html/recognizer/title.py b/llm_web_kit/extractor/html/recognizer/title.py
index 6fa1cd59..f8ee7635 100644
--- a/llm_web_kit/extractor/html/recognizer/title.py
+++ b/llm_web_kit/extractor/html/recognizer/title.py
@@ -36,7 +36,7 @@ def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_h
return None
cctitle_content_node = {
'type': DocElementType.TITLE,
- 'raw_content': raw_html_segment,
+ # 'raw_content': raw_html_segment,
'content': {
'title_content': text,
'level': level
diff --git a/llm_web_kit/main_html_parser/parser/layout_batch_parser.py b/llm_web_kit/main_html_parser/parser/layout_batch_parser.py
index 112d9ac9..62bcbc52 100644
--- a/llm_web_kit/main_html_parser/parser/layout_batch_parser.py
+++ b/llm_web_kit/main_html_parser/parser/layout_batch_parser.py
@@ -302,7 +302,7 @@ def htmll_to_content2(self, body_str):
else:
parent.text = (parent.text or '') + (element.tail or '')
parent.remove(element)
- self.add_newline_after_tags(body, ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'br', 'div', 'p', 'li'])
+ # self.add_newline_after_tags(body, ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'br', 'div', 'p', 'li'])
output = []
main_content = re.split(r'\n{1,}', self.get_text_with_newlines(body))
for line in main_content:
diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_to_content_list_complex_res.json b/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_to_content_list_complex_res.json
index 5bb85151..9216b23b 100644
--- a/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_to_content_list_complex_res.json
+++ b/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_to_content_list_complex_res.json
@@ -1,6 +1,5 @@
{
"type": "complex_table",
- "raw_content": "<table><caption>ফেব্রুয়ারি ২০২৪</caption><thead><tr><th>সোম</th><th>মঙ্গল</th><th>বুধ</th><th>বৃহ</th><th>শুক্র</th><th>শনি</th><th>রবি</th></tr></thead><tfoot><tr><td colspan=\\\"3\\\">« জানুয়ারি</td><td></td><td colspan=\\\"3\\\"></td></tr></tfoot><tbody><tr><td colspan=\\\"3\\\"></td><td>১</td><td>২</td><td>৩</td><td>৪</td></tr><tr><td>৫</td><td>৬</td><td>৭</td><td>৮</td><td>৯</td><td>১০</td><td>১১</td></tr><tr><td>১২</td><td>১৩</td><td>১৪</td><td>১৫</td><td>১৬</td><td>১৭</td><td>১৮</td></tr><tr><td>১৯</td><td>২০</td><td>২১</td><td>২২</td><td>২৩</td><td>২৪</td><td>২৫</td></tr><tr><td>২৬</td><td>২৭</td><td>২৮</td><td>২৯</td><td colspan=\\\"3\\\"></td></tr></tbody></table>",
"content": {
"html": "
ফেব্রুয়ারি ২০২৪| সোম | মঙ্গল | বুধ | বৃহ | শুক্র | শনি | রবি |
|---|
| « জানুয়ারি | | |
| ১ | ২ | ৩ | ৪ |
| ৫ | ৬ | ৭ | ৮ | ৯ | ১০ | ১১ |
| ১২ | ১৩ | ১৪ | ১৫ | ১৬ | ১৭ | ১৮ |
| ১৯ | ২০ | ২১ | ২২ | ২৩ | ২৪ | ২৫ |
| ২৬ | ২৭ | ২৮ | ২৯ | |
",
"is_complex": true,
diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_to_content_list_simple_res.json b/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_to_content_list_simple_res.json
index 357f2843..57412c32 100644
--- a/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_to_content_list_simple_res.json
+++ b/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_to_content_list_simple_res.json
@@ -1 +1 @@
-{"type": "simple_table", "raw_content": "<table><tr><td>1</td><td>2</td></tr><tr><td>3</td><td>4</td></tr></table>| 1 | 2 |
| 3 | 4 |
", "content": {"html": "", "is_complex": false}}
\ No newline at end of file
+{"type": "simple_table", "content": {"html": "", "is_complex": false}}
\ No newline at end of file
diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_table.py b/tests/llm_web_kit/extractor/html/recognizer/test_table.py
index eb58bb98..bc91d52d 100644
--- a/tests/llm_web_kit/extractor/html/recognizer/test_table.py
+++ b/tests/llm_web_kit/extractor/html/recognizer/test_table.py
@@ -132,7 +132,6 @@ def test_table_to_content_list_node_simple(self):
expect_json = expect.read_text(encoding='utf-8')
assert result['type'] == json.loads(expect_json)['type']
assert result['content']['is_complex'] == json.loads(expect_json)['content']['is_complex']
- assert result['raw_content'] == json.loads(expect_json)['raw_content']
self.assertTrue(result['content']['html'].startswith(''))
self.assertTrue(result['content']['html'].endswith('
'))
diff --git a/tests/llm_web_kit/input/assets/content_json.json b/tests/llm_web_kit/input/assets/content_json.json
index 84eacaa6..a5bf4ef8 100644
--- a/tests/llm_web_kit/input/assets/content_json.json
+++ b/tests/llm_web_kit/input/assets/content_json.json
@@ -13,7 +13,6 @@
[
{
"type": "paragraph",
- "raw_content": "==========================title====================================
",
"content": [
{
"c": "==========================title====================================",
@@ -23,7 +22,6 @@
},
{
"type": "title",
- "raw_content": "Title Test
",
"content": {
"title_content": "Title Test",
"level": "1"
@@ -31,7 +29,6 @@
},
{
"type": "paragraph",
- "raw_content": "==========================code inline====================================
",
"content": [
{
"c": "==========================code inline====================================",
@@ -41,7 +38,6 @@
},
{
"type": "paragraph",
- "raw_content": "Dead simple\n Include prism.css and prism.js, use proper HTML5 code tags (code.language-xxxx), done!\n ",
"content": [
{
"c": "Dead simple Include prism.css and prism.js, use proper HTML5 code tags (",
@@ -59,7 +55,6 @@
},
{
"type": "paragraph",
- "raw_content": "==========================code====================================
",
"content": [
{
"c": "==========================code====================================",
@@ -77,7 +72,6 @@
},
{
"type": "paragraph",
- "raw_content": "==========================simple table====================================
",
"content": [
{
"c": "==========================simple table====================================",
@@ -87,7 +81,6 @@
},
{
"type": "simple_table",
- "raw_content": "",
"content": {
"html": "",
"is_complex": false,
@@ -96,7 +89,6 @@
},
{
"type": "paragraph",
- "raw_content": "==========================complex table====================================
",
"content": [
{
"c": "==========================complex table====================================",
@@ -106,7 +98,6 @@
},
{
"type": "complex_table",
- "raw_content": "",
"content": {
"html": "",
"is_complex": true,
@@ -115,7 +106,6 @@
},
{
"type": "paragraph",
- "raw_content": "==========================equation inline====================================
",
"content": [
{
"c": "==========================equation inline====================================",
@@ -125,7 +115,6 @@
},
{
"type": "paragraph",
- "raw_content": "",
"content": [
{
"c": "测试行内公式",
@@ -143,7 +132,6 @@
},
{
"type": "paragraph",
- "raw_content": "==========================equation interline====================================
",
"content": [
{
"c": "==========================equation interline====================================",
@@ -153,7 +141,6 @@
},
{
"type": "paragraph",
- "raw_content": "",
"content": [
{
"c": "公式如下:",
@@ -172,7 +159,6 @@
},
{
"type": "paragraph",
- "raw_content": "==========================img====================================
",
"content": [
{
"c": "==========================img====================================",
@@ -194,7 +180,6 @@
},
{
"type": "paragraph",
- "raw_content": "==========================list====================================
",
"content": [
{
"c": "==========================list====================================",
@@ -242,7 +227,6 @@
},
{
"type": "paragraph",
- "raw_content": "==========================paragraph====================================
",
"content": [
{
"c": "==========================paragraph====================================",
@@ -252,7 +236,6 @@
},
{
"type": "paragraph",
- "raw_content": "",
"content": [
{
"c": "test paragraph",
@@ -262,7 +245,6 @@
},
{
"type": "paragraph",
- "raw_content": "==========================audio====================================
",
"content": [
{
"c": "==========================audio====================================",
@@ -272,7 +254,6 @@
},
{
"type": "paragraph",
- "raw_content": "==========================video====================================
",
"content": [
{
"c": "==========================video====================================",
@@ -282,7 +263,6 @@
},
{
"type": "paragraph",
- "raw_content": "",
"content": [
{
"c": "Download the WEBM or MP4 video.",
diff --git a/tests/llm_web_kit/main_html_parser/parser/assets/input_layout_batch_parser/test_fix_all_newlines.html b/tests/llm_web_kit/main_html_parser/parser/assets/input_layout_batch_parser/test_fix_all_newlines.html
new file mode 100644
index 00000000..7e0592ce
--- /dev/null
+++ b/tests/llm_web_kit/main_html_parser/parser/assets/input_layout_batch_parser/test_fix_all_newlines.html
@@ -0,0 +1,773 @@
+
+
+
+
+
+ Versace Bright Crystal EDT Perfume for Women 90ml | yangonbranded
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+top of page
Versace Bright Crystal EDT Perfume for Women (In stock)
@ 90ml retail packaging - K 319,000
@ 90ml tester packaging (အဖုံးပါ) - K 259,000
Made in Italy
အမြဲပူအိုက်တဲ့ မြန်မာနိုင်ငံရာသီဥတုမှာ သုံးဖို့အဆင်ပြေတဲ့ Versace Bright Crystal က သလဲသီး၊ Yuzu လိမ္မော်၊ ရေခဲရနံ့ တွေနဲ့ Peony၊ စံကားဝါ၊ ကြာပန်း ရနံ့သင်းသင်းလေးတွေကို ပေါင်းထားတဲ့ fresh juicy floral ရနံ့ဖြစ်ပြီး drydown မှာ မပြင်းလွန်းတဲ့ ပယင်း၊ ကတိုး နှင့် မဟော်ဂနီရနံ့တွေသာ ပါလို့ classy ဖြစ်ပြီး လူကိုလန်းဆန်းစေတဲ့ soft and subtle airy scent ရနံ့သင်းသင်းလေး ဖြစ်ပါတယ်။
ပေါ့ပေါ့ပါးပါး ခေါင်းမမူး ခေါင်းမကိုက်နိုင်တဲ့ အနံ့ fresh and clean feel ရှိတဲ့အနံ့ ဖြစ်ပြီး vanilla, powder, pepper နဲ့ aqua ရနံ့တွေကို ရှောင်ထားတဲ့ light scent ဖြစ်လို့ အဲဒီ note တွေမကြိုက်တဲ့သူတွေ ရေမွှေးပြင်းမကြိုက်တဲ့သူတွေ အတွက်ပိုသင့်ပါတယ်။
မိန်းကလေးတော်တော်များသိပြီး သုံးပြီးဖြစ်လို့ နာမည်ကျော်ကြားပြီးဖြစ်တဲ့ အီတလီနိုင်ငံလုပ် ရေမွှေးဖြစ်ပါတယ်။
Retail packaging နဲ့ tester packaging နှစ်မျိုးလုံးရှိပါတယ်။
Tester packaging မှာအဖုံးပါတာမို့ ပုလင်းက retail packaging အတိုင်းဖြစ်ပါတယ်။ အပြင်စက္ကူဗူးကပဲ tester packaging ဗူးဖြစ်နေတာပါ။
လက်ဆောင်ပေးဖို့ဝယ်တာဆိုရင်တော့ ဗူးခွံအမြင်လှတဲ့ retail packaging ကိုပဲဝယ်ဖို့ recommend လုပ်ပါတယ်။
https://www.yangonbrandedperfume.com/product-page/versace-bright-crystal-edt-perfume-for-women-90ml-2
အခြား In stock ရနိုင်တဲ့ Versace perfume တွေကို https://www.yangonbrandedperfume.com/versace မှာကြည့်နိုင်ပါတယ်။
Versace Bright Crystal EDT Perfume for Women 90ml
bottom of page
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
top of page
+
+
+
+
+
+
+
+
+
Perfume for Women
+
+
+
+
+
+
+
+
+
+
All Products
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
bottom of page
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/tests/llm_web_kit/main_html_parser/parser/test_layout_parser.py b/tests/llm_web_kit/main_html_parser/parser/test_layout_parser.py
index 658f880e..9c2aa5c2 100644
--- a/tests/llm_web_kit/main_html_parser/parser/test_layout_parser.py
+++ b/tests/llm_web_kit/main_html_parser/parser/test_layout_parser.py
@@ -474,3 +474,31 @@ def test_code_newline(self):
parts = parser.parse(pre_data)
main_html = parts[PreDataJsonKey.MAIN_HTML]
assert 'conda install bioconductor-annotationdbi' in main_html
+
+ def test_fix_newlines(self):
+ # Regression test for the newline fix: load the fixture page (the same file serves as both the typical and the source HTML).
+ typical_raw_tag_html = base_dir.joinpath(
+ 'assets/input_layout_batch_parser/test_fix_all_newlines.html').read_text(
+ encoding='utf-8')
+ html_source = base_dir.joinpath(
+ 'assets/input_layout_batch_parser/test_fix_all_newlines.html').read_text(
+ encoding='utf-8')
+ # Page simplification: nothing is simplified here; the raw fixture is passed as both 'typical_raw_tag_html' and 'typical_raw_html'.
+ # Load the LLM response. NOTE(review): this reuses test_code_newline.json from another test - confirm it matches this fixture.
+ llm_path = 'assets/input_layout_batch_parser/test_code_newline.json'
+ llm_response = json.loads(base_dir.joinpath(llm_path).read_text(encoding='utf-8'))
+ pre_data = {'typical_raw_tag_html': typical_raw_tag_html, 'typical_raw_html': typical_raw_tag_html,
+ 'llm_response': llm_response, 'html_source': html_source}
+ pre_data = PreDataJson(pre_data)
+ # Mapping step: run MapItemToHtmlTagsParser over the pre-data.
+ parser = MapItemToHtmlTagsParser({})
+ pre_data = parser.parse(pre_data)
+
+ # Generalization step: enable the dynamic id / class-id and more-noise flags, then run LayoutBatchParser.
+ pre_data[PreDataJsonKey.DYNAMIC_ID_ENABLE] = True
+ pre_data[PreDataJsonKey.DYNAMIC_CLASSID_ENABLE] = True
+ pre_data[PreDataJsonKey.MORE_NOISE_ENABLE] = True
+ parser = LayoutBatchParser({})
+ parts = parser.parse(pre_data)
+ main_html = parts[PreDataJsonKey.MAIN_HTML_BODY]
+ assert len(main_html) == 39746  # pins the exact MAIN_HTML_BODY length; NOTE(review): brittle, must be updated on any output change