From fe4adac274cbc1594cfaee9c01568cd44a7f4cd5 Mon Sep 17 00:00:00 2001 From: drunkpig <60862764+drunkpig@users.noreply.github.com> Date: Mon, 24 Nov 2025 15:48:40 +0800 Subject: [PATCH 1/3] feat: use new spec to represent title and paragraph --- llm_web_kit/extractor/html/recognizer/text.py | 2 +- llm_web_kit/extractor/html/recognizer/title.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/llm_web_kit/extractor/html/recognizer/text.py b/llm_web_kit/extractor/html/recognizer/text.py index db90f4a7..d6b3e857 100644 --- a/llm_web_kit/extractor/html/recognizer/text.py +++ b/llm_web_kit/extractor/html/recognizer/text.py @@ -93,7 +93,7 @@ def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_h el = parsed_content node = { 'type': DocElementType.PARAGRAPH, - 'raw_content': raw_html_segment, + # 'raw_content': raw_html_segment, 'content': json.loads(el.text), } return node diff --git a/llm_web_kit/extractor/html/recognizer/title.py b/llm_web_kit/extractor/html/recognizer/title.py index 6fa1cd59..f8ee7635 100644 --- a/llm_web_kit/extractor/html/recognizer/title.py +++ b/llm_web_kit/extractor/html/recognizer/title.py @@ -36,7 +36,7 @@ def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_h return None cctitle_content_node = { 'type': DocElementType.TITLE, - 'raw_content': raw_html_segment, + # 'raw_content': raw_html_segment, 'content': { 'title_content': text, 'level': level From c97d63d715e3673cc9fce66c7783536a5b1079fc Mon Sep 17 00:00:00 2001 From: Kaiwen Liu Date: Mon, 24 Nov 2025 16:41:52 +0800 Subject: [PATCH 2/3] fix: fix all newlines (#593) --- .../parser/layout_batch_parser.py | 2 +- .../test_fix_all_newlines.html | 773 +++++ .../test_fix_all_newlines.json | 40 + .../test_fix_all_newlines_tag.html | 2874 +++++++++++++++++ .../parser/test_layout_parser.py | 28 + 5 files changed, 3716 insertions(+), 1 deletion(-) create mode 100644 tests/llm_web_kit/main_html_parser/parser/assets/input_layout_batch_parser/test_fix_all_newlines.html create mode 100644 tests/llm_web_kit/main_html_parser/parser/assets/input_layout_batch_parser/test_fix_all_newlines.json create mode 100644 tests/llm_web_kit/main_html_parser/parser/assets/input_layout_batch_parser/test_fix_all_newlines_tag.html diff --git a/llm_web_kit/main_html_parser/parser/layout_batch_parser.py b/llm_web_kit/main_html_parser/parser/layout_batch_parser.py index 112d9ac9..62bcbc52 100644 --- a/llm_web_kit/main_html_parser/parser/layout_batch_parser.py +++ b/llm_web_kit/main_html_parser/parser/layout_batch_parser.py @@ -302,7 +302,7 @@ def htmll_to_content2(self, body_str): else: parent.text = (parent.text or '') + (element.tail or '') parent.remove(element) - self.add_newline_after_tags(body, ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'br', 'div', 'p', 'li']) + # self.add_newline_after_tags(body, ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'br', 'div', 'p', 'li']) output = [] main_content = re.split(r'\n{1,}', self.get_text_with_newlines(body)) for line in main_content: diff --git a/tests/llm_web_kit/main_html_parser/parser/assets/input_layout_batch_parser/test_fix_all_newlines.html b/tests/llm_web_kit/main_html_parser/parser/assets/input_layout_batch_parser/test_fix_all_newlines.html new file mode 100644 index 00000000..7e0592ce --- /dev/null +++ b/tests/llm_web_kit/main_html_parser/parser/assets/input_layout_batch_parser/test_fix_all_newlines.html @@ -0,0 +1,773 @@ + + + + + + Versace Bright Crystal EDT Perfume for Women 90ml | yangonbranded + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
top of page

Versace Bright Crystal EDT Perfume for Women (In stock)

@ 90ml retail packaging - K 319,000

@ 90ml tester packaging (အဖုံးပါ) - K 259,000

 

Made in Italy

 

အမြဲပူအိုက်တဲ့ မြန်မာနိုင်ငံရာသီဥတုမှာ သုံးဖို့အဆင်ပြေတဲ့ Versace Bright Crystal က သလဲသီး၊ Yuzu လိမ္မော်၊ ရေခဲရနံ့ တွေနဲ့ Peony၊ စံကားဝါ၊ ကြာပန်း ရနံ့သင်းသင်းလေးတွေကို ပေါင်းထားတဲ့ fresh juicy floral ရနံ့ဖြစ်ပြီး drydown မှာ မပြင်းလွန်းတဲ့ ပယင်း၊ ကတိုး နှင့် မဟော်ဂနီရနံ့တွေသာ ပါလို့ classy ဖြစ်ပြီး လူကိုလန်းဆန်းစေတဲ့ soft and subtle airy scent ရနံ့သင်းသင်းလေး ဖြစ်ပါတယ်။

 

ပေါ့ပေါ့ပါးပါး ခေါင်းမမူး ခေါင်းမကိုက်နိုင်တဲ့ အနံ့ fresh and clean feel ရှိတဲ့အနံ့ ဖြစ်ပြီး vanilla, powder, pepper နဲ့ aqua ရနံ့တွေကို ရှောင်ထားတဲ့ light scent ဖြစ်လို့ အဲဒီ note တွေမကြိုက်တဲ့သူတွေ ရေမွှေးပြင်းမကြိုက်တဲ့သူတွေ အတွက်ပိုသင့်ပါတယ်။

မိန်းကလေးတော်တော်များသိပြီး သုံးပြီးဖြစ်လို့ နာမည်ကျော်ကြားပြီးဖြစ်တဲ့ အီတလီနိုင်ငံလုပ် ရေမွှေးဖြစ်ပါတယ်။

Retail packaging နဲ့ tester packaging နှစ်မျိုးလုံးရှိပါတယ်။

Tester packaging မှာအဖုံးပါတာမို့ ပုလင်းက retail packaging အတိုင်းဖြစ်ပါတယ်။ အပြင်စက္ကူဗူးကပဲ tester packaging ဗူးဖြစ်နေတာပါ။

လက်ဆောင်ပေးဖို့ဝယ်တာဆိုရင်တော့ ဗူးခွံအမြင်လှတဲ့ retail packaging ကိုပဲဝယ်ဖို့ recommend လုပ်ပါတယ်။

https://www.yangonbrandedperfume.com/product-page/versace-bright-crystal-edt-perfume-for-women-90ml-2

 

အခြား In stock ရနိုင်တဲ့ Versace perfume တွေကို https://www.yangonbrandedperfume.com/versace မှာကြည့်နိုင်ပါတယ်။

Versace Bright Crystal EDT Perfume for Women 90ml

K319,000.00Price
      +
    • +

      ရေမွှေးတွေကို အိမ်အရောက်ပို့စနစ် home delivery နဲ့ဖြစ်ဖြစ်၊ Viber မှာ order တင်ပြီး ရန်ကုန်အိမ်မှာကိုယ်တိုင်လာယူတာဖြစ်ဖြစ် မှာယူနိုင်ပါတယ်။ ဖုံး/Viber 0943065356 ကိုဆက်ပြီး မေးနိုင်ပါတယ်။ Viber channel ကို join ထားရင် နေ့တိုင်း ဈေးလျှော့ထားတဲ့ရေမွှေးတွေနဲ့ review တွေဖတ်နိုင်ပါတယ်။

      +
    • +
    • +

      Yangon Branded ဆိုင်နာမည် တစ်မျိုးတည်းဖြင့်သာ ၂၀၁၁ ခုနှစ်မှစ၍ စဉ်ဆက်မပျက် ရောင်းလာခြင်းဖြစ်သည်။ ပုံမှန်ဝယ်ယူအားပေးသူ ရာပေါင်းများစွာ ရှိပြီးသားမို့ Yangon Branded ဆိုင်သတင်းကို အသိ၊မိတ်ဆွေထံ မဝယ်ခင် မေးကြည့်ပြီးမှသာ ဝယ်ယူရန် တိုက်တွန်းလိုပါတယ်။

      +
    • +
    • +

      ကိုယ်တိုင်တင်သွင်းလာသော Branded ရေမွှေးအစစ်များသက်သက်ကို သင့်တော်သောဈေးဖြင့် ရောင်းပါသည်။ အဆင့်မမှီရေမွှေးများ၊ replica ဆိုသောရေမွှေးများ လုံးဝမရောင်းပါ။

      +
    • +

    You are visitor number

    bottom of page
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    top of page
    +
    +
    +
    +
    +
    + +
    +
    +
    +
    +
    +
    +
    +
    +

    Perfume for Women

    +
    +
    + + +
    +
    +

    All Products

    +
    +
    + +
    +
    +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    +
    +
    +
    +
    +
    + +
      +
    • +

      ရေမွှေးတွေကို အိမ်အရောက်ပို့စနစ် home delivery နဲ့ဖြစ်ဖြစ်၊ Viber မှာ order တင်ပြီး ရန်ကုန်အိမ်မှာကိုယ်တိုင်လာယူတာဖြစ်ဖြစ် မှာယူနိုင်ပါတယ်။ ဖုံး/Viber 0943065356 ကိုဆက်ပြီး မေးနိုင်ပါတယ်။ Viber channel ကို join ထားရင် နေ့တိုင်း ဈေးလျှော့ထားတဲ့ရေမွှေးတွေနဲ့ review တွေဖတ်နိုင်ပါတယ်။

      +
    • +
    • +

      Yangon Branded ဆိုင်နာမည် တစ်မျိုးတည်းဖြင့်သာ ၂၀၁၁ ခုနှစ်မှစ၍ စဉ်ဆက်မပျက် ရောင်းလာခြင်းဖြစ်သည်။ ပုံမှန်ဝယ်ယူအားပေးသူ ရာပေါင်းများစွာ ရှိပြီးသားမို့ Yangon Branded ဆိုင်သတင်းကို အသိ၊မိတ်ဆွေထံ မဝယ်ခင် မေးကြည့်ပြီးမှသာ ဝယ်ယူရန် တိုက်တွန်းလိုပါတယ်။

      +
    • +
    • +

      ကိုယ်တိုင်တင်သွင်းလာသော Branded ရေမွှေးအစစ်များသက်သက်ကို သင့်တော်သောဈေးဖြင့် ရောင်းပါသည်။ အဆင့်မမှီရေမွှေးများ၊ replica ဆိုသောရေမွှေးများ လုံးဝမရောင်းပါ။

      +
    • +
    +

    You are visitor number

    +
    +
    +
    +
    +
    bottom of page
    +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/tests/llm_web_kit/main_html_parser/parser/test_layout_parser.py b/tests/llm_web_kit/main_html_parser/parser/test_layout_parser.py index 658f880e..9c2aa5c2 100644 --- a/tests/llm_web_kit/main_html_parser/parser/test_layout_parser.py +++ b/tests/llm_web_kit/main_html_parser/parser/test_layout_parser.py @@ -474,3 +474,31 @@ def test_code_newline(self): parts = parser.parse(pre_data) main_html = parts[PreDataJsonKey.MAIN_HTML] assert 'conda install bioconductor-annotationdbi' in main_html + + def test_fix_newlines(self): + # 构造测试html + typical_raw_tag_html = base_dir.joinpath( + 'assets/input_layout_batch_parser/test_fix_all_newlines.html').read_text( + encoding='utf-8') + html_source = base_dir.joinpath( + 'assets/input_layout_batch_parser/test_fix_all_newlines.html').read_text( + encoding='utf-8') + # 简化网页 + # 模型结果格式改写 + llm_path = 'assets/input_layout_batch_parser/test_code_newline.json' + llm_response = json.loads(base_dir.joinpath(llm_path).read_text(encoding='utf-8')) + pre_data = {'typical_raw_tag_html': typical_raw_tag_html, 'typical_raw_html': typical_raw_tag_html, + 'llm_response': llm_response, 'html_source': html_source} + pre_data = PreDataJson(pre_data) + # 映射 + parser = MapItemToHtmlTagsParser({}) + pre_data = parser.parse(pre_data) + + # 推广 + pre_data[PreDataJsonKey.DYNAMIC_ID_ENABLE] = True + pre_data[PreDataJsonKey.DYNAMIC_CLASSID_ENABLE] = True + pre_data[PreDataJsonKey.MORE_NOISE_ENABLE] = True + parser = LayoutBatchParser({}) + parts = parser.parse(pre_data) + main_html = parts[PreDataJsonKey.MAIN_HTML_BODY] + assert len(main_html) == 39746 From 42d9ba448513dfc1d7f8a20835c525cee4f231f5 Mon Sep 17 00:00:00 2001 From: xuchao Date: Mon, 24 Nov 2025 19:07:46 +0800 Subject: [PATCH 3/3] feat: use new spec to represent title and paragraph --- .gitignore | 2 ++ .../extractor/html/recognizer/table.py | 2 +- .../table_to_content_list_complex_res.json | 1 - .../table_to_content_list_simple_res.json | 2 +- .../extractor/html/recognizer/test_table.py | 1 - .../input/assets/content_json.json | 20 ------------------- 6 files changed, 4 insertions(+), 24 deletions(-) diff --git a/.gitignore b/.gitignore index 44fad1cc..216ffd1a 100644 --- a/.gitignore +++ b/.gitignore @@ -49,3 +49,5 @@ llm_web_kit.egg-info/* .llm-web-kit.jsonc .llm-web-kit-pageclassify.jsonc tests/llm_web_kit/extractor/ygq_testmd +output.md +output.jsonl diff --git a/llm_web_kit/extractor/html/recognizer/table.py b/llm_web_kit/extractor/html/recognizer/table.py index 239776cd..c2449bb7 100644 --- a/llm_web_kit/extractor/html/recognizer/table.py +++ b/llm_web_kit/extractor/html/recognizer/table.py @@ -104,7 +104,7 @@ def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_h cc_table_type = DocElementType.SIMPLE_TABLE d = { 'type': cc_table_type, - 'raw_content': raw_html_segment, + # 'raw_content': raw_html_segment, 'content': { 'html': html_content, 'is_complex': table_type, diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_to_content_list_complex_res.json b/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_to_content_list_complex_res.json index 5bb85151..9216b23b 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_to_content_list_complex_res.json +++ b/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_to_content_list_complex_res.json @@ -1,6 +1,5 @@ { "type": "complex_table", - "raw_content": "<table><caption>ফেব্রুয়ারি ২০২৪</caption><thead><tr><th>সোম</th><th>মঙ্গল</th><th>বুধ</th><th>বৃহ</th><th>শুক্র</th><th>শনি</th><th>রবি</th></tr></thead><tfoot><tr><td colspan=\\\"3\\\">« জানুয়ারি</td><td></td><td colspan=\\\"3\\\"></td></tr></tfoot><tbody><tr><td colspan=\\\"3\\\"></td><td>১</td><td>২</td><td>৩</td><td>৪</td></tr><tr><td>৫</td><td>৬</td><td>৭</td><td>৮</td><td>৯</td><td>১০</td><td>১১</td></tr><tr><td>১২</td><td>১৩</td><td>১৪</td><td>১৫</td><td>১৬</td><td>১৭</td><td>১৮</td></tr><tr><td>১৯</td><td>২০</td><td>২১</td><td>২২</td><td>২৩</td><td>২৪</td><td>২৫</td></tr><tr><td>২৬</td><td>২৭</td><td>২৮</td><td>২৯</td><td colspan=\\\"3\\\"></td></tr></tbody></table>", "content": { "html": "
    ফেব্রুয়ারি ২০২৪
    সোমমঙ্গলবুধবৃহশুক্রশনিরবি
    « জানুয়ারি
    ১০১১
    ১২১৩১৪১৫১৬১৭১৮
    ১৯২০২১২২২৩২৪২৫
    ২৬২৭২৮২৯
    ", "is_complex": true, diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_to_content_list_simple_res.json b/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_to_content_list_simple_res.json index 357f2843..57412c32 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_to_content_list_simple_res.json +++ b/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_to_content_list_simple_res.json @@ -1 +1 @@ -{"type": "simple_table", "raw_content": "<table><tr><td>1</td><td>2</td></tr><tr><td>3</td><td>4</td></tr></table>1234", "content": {"html": "
    12
    34
    ", "is_complex": false}} \ No newline at end of file +{"type": "simple_table", "content": {"html": "
    12
    34
    ", "is_complex": false}} \ No newline at end of file diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_table.py b/tests/llm_web_kit/extractor/html/recognizer/test_table.py index eb58bb98..bc91d52d 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_table.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_table.py @@ -132,7 +132,6 @@ def test_table_to_content_list_node_simple(self): expect_json = expect.read_text(encoding='utf-8') assert result['type'] == json.loads(expect_json)['type'] assert result['content']['is_complex'] == json.loads(expect_json)['content']['is_complex'] - assert result['raw_content'] == json.loads(expect_json)['raw_content'] self.assertTrue(result['content']['html'].startswith('')) self.assertTrue(result['content']['html'].endswith('
    ')) diff --git a/tests/llm_web_kit/input/assets/content_json.json b/tests/llm_web_kit/input/assets/content_json.json index 84eacaa6..a5bf4ef8 100644 --- a/tests/llm_web_kit/input/assets/content_json.json +++ b/tests/llm_web_kit/input/assets/content_json.json @@ -13,7 +13,6 @@ [ { "type": "paragraph", - "raw_content": "
    ==========================title====================================
    ", "content": [ { "c": "==========================title====================================", @@ -23,7 +22,6 @@ }, { "type": "title", - "raw_content": "

    Title Test

    ", "content": { "title_content": "Title Test", "level": "1" @@ -31,7 +29,6 @@ }, { "type": "paragraph", - "raw_content": "
    ==========================code inline====================================
    ", "content": [ { "c": "==========================code inline====================================", @@ -41,7 +38,6 @@ }, { "type": "paragraph", - "raw_content": "
  • Dead simple\n Include prism.css and prism.js, use proper HTML5 code tags (code.language-xxxx), done!\n
  • ", "content": [ { "c": "Dead simple Include prism.css and prism.js, use proper HTML5 code tags (", @@ -59,7 +55,6 @@ }, { "type": "paragraph", - "raw_content": "
    ==========================code====================================
    ", "content": [ { "c": "==========================code====================================", @@ -77,7 +72,6 @@ }, { "type": "paragraph", - "raw_content": "
    ==========================simple table====================================
    ", "content": [ { "c": "==========================simple table====================================", @@ -87,7 +81,6 @@ }, { "type": "simple_table", - "raw_content": "
    1.12.1
    3.14.1
    ", "content": { "html": "
    1.12.1
    3.14.1
    ", "is_complex": false, @@ -96,7 +89,6 @@ }, { "type": "paragraph", - "raw_content": "
    ==========================complex table====================================
    ", "content": [ { "c": "==========================complex table====================================", @@ -106,7 +98,6 @@ }, { "type": "complex_table", - "raw_content": "
    123
    4
    567
    ", "content": { "html": "
    123
    4
    567
    ", "is_complex": true, @@ -115,7 +106,6 @@ }, { "type": "paragraph", - "raw_content": "
    ==========================equation inline====================================
    ", "content": [ { "c": "==========================equation inline====================================", @@ -125,7 +115,6 @@ }, { "type": "paragraph", - "raw_content": "

    测试行内公式x=4

    ", "content": [ { "c": "测试行内公式", @@ -143,7 +132,6 @@ }, { "type": "paragraph", - "raw_content": "
    ==========================equation interline====================================
    ", "content": [ { "c": "==========================equation interline====================================", @@ -153,7 +141,6 @@ }, { "type": "paragraph", - "raw_content": "

    公式如下:

    ", "content": [ { "c": "公式如下:", @@ -172,7 +159,6 @@ }, { "type": "paragraph", - "raw_content": "
    ==========================img====================================
    ", "content": [ { "c": "==========================img====================================", @@ -194,7 +180,6 @@ }, { "type": "paragraph", - "raw_content": "
    ==========================list====================================
    ", "content": [ { "c": "==========================list====================================", @@ -242,7 +227,6 @@ }, { "type": "paragraph", - "raw_content": "
    ==========================paragraph====================================
    ", "content": [ { "c": "==========================paragraph====================================", @@ -252,7 +236,6 @@ }, { "type": "paragraph", - "raw_content": "

    test paragraph

    ", "content": [ { "c": "test paragraph", @@ -262,7 +245,6 @@ }, { "type": "paragraph", - "raw_content": "
    ==========================audio====================================
    ", "content": [ { "c": "==========================audio====================================", @@ -272,7 +254,6 @@ }, { "type": "paragraph", - "raw_content": "
    ==========================video====================================
    ", "content": [ { "c": "==========================video====================================", @@ -282,7 +263,6 @@ }, { "type": "paragraph", - "raw_content": "
    ", "content": [ { "c": "Download the WEBM or MP4 video.",