diff --git a/.gitignore b/.gitignore index 44fad1cc..216ffd1a 100644 --- a/.gitignore +++ b/.gitignore @@ -49,3 +49,5 @@ llm_web_kit.egg-info/* .llm-web-kit.jsonc .llm-web-kit-pageclassify.jsonc tests/llm_web_kit/extractor/ygq_testmd +output.md +output.jsonl diff --git a/llm_web_kit/extractor/html/recognizer/table.py b/llm_web_kit/extractor/html/recognizer/table.py index 239776cd..c2449bb7 100644 --- a/llm_web_kit/extractor/html/recognizer/table.py +++ b/llm_web_kit/extractor/html/recognizer/table.py @@ -104,7 +104,7 @@ def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_h cc_table_type = DocElementType.SIMPLE_TABLE d = { 'type': cc_table_type, - 'raw_content': raw_html_segment, + # 'raw_content': raw_html_segment, 'content': { 'html': html_content, 'is_complex': table_type, diff --git a/llm_web_kit/extractor/html/recognizer/text.py b/llm_web_kit/extractor/html/recognizer/text.py index db90f4a7..d6b3e857 100644 --- a/llm_web_kit/extractor/html/recognizer/text.py +++ b/llm_web_kit/extractor/html/recognizer/text.py @@ -93,7 +93,7 @@ def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_h el = parsed_content node = { 'type': DocElementType.PARAGRAPH, - 'raw_content': raw_html_segment, + # 'raw_content': raw_html_segment, 'content': json.loads(el.text), } return node diff --git a/llm_web_kit/extractor/html/recognizer/title.py b/llm_web_kit/extractor/html/recognizer/title.py index 6fa1cd59..f8ee7635 100644 --- a/llm_web_kit/extractor/html/recognizer/title.py +++ b/llm_web_kit/extractor/html/recognizer/title.py @@ -36,7 +36,7 @@ def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_h return None cctitle_content_node = { 'type': DocElementType.TITLE, - 'raw_content': raw_html_segment, + # 'raw_content': raw_html_segment, 'content': { 'title_content': text, 'level': level diff --git a/llm_web_kit/main_html_parser/parser/layout_batch_parser.py b/llm_web_kit/main_html_parser/parser/layout_batch_parser.py index 112d9ac9..62bcbc52 100644 --- a/llm_web_kit/main_html_parser/parser/layout_batch_parser.py +++ b/llm_web_kit/main_html_parser/parser/layout_batch_parser.py @@ -302,7 +302,7 @@ def htmll_to_content2(self, body_str): else: parent.text = (parent.text or '') + (element.tail or '') parent.remove(element) - self.add_newline_after_tags(body, ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'br', 'div', 'p', 'li']) + # self.add_newline_after_tags(body, ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'br', 'div', 'p', 'li']) output = [] main_content = re.split(r'\n{1,}', self.get_text_with_newlines(body)) for line in main_content: diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_to_content_list_complex_res.json b/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_to_content_list_complex_res.json index 5bb85151..9216b23b 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_to_content_list_complex_res.json +++ b/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_to_content_list_complex_res.json @@ -1,6 +1,5 @@ { "type": "complex_table", - "raw_content": "<table><caption>ফেব্রুয়ারি ২০২৪</caption><thead><tr><th>সোম</th><th>মঙ্গল</th><th>বুধ</th><th>বৃহ</th><th>শুক্র</th><th>শনি</th><th>রবি</th></tr></thead><tfoot><tr><td colspan=\\\"3\\\">« জানুয়ারি</td><td></td><td colspan=\\\"3\\\"></td></tr></tfoot><tbody><tr><td colspan=\\\"3\\\"></td><td>১</td><td>২</td><td>৩</td><td>৪</td></tr><tr><td>৫</td><td>৬</td><td>৭</td><td>৮</td><td>৯</td><td>১০</td><td>১১</td></tr><tr><td>১২</td><td>১৩</td><td>১৪</td><td>১৫</td><td>১৬</td><td>১৭</td><td>১৮</td></tr><tr><td>১৯</td><td>২০</td><td>২১</td><td>২২</td><td>২৩</td><td>২৪</td><td>২৫</td></tr><tr><td>২৬</td><td>২৭</td><td>২৮</td><td>২৯</td><td colspan=\\\"3\\\"></td></tr></tbody></table>", "content": { "html": "
ফেব্রুয়ারি ২০২৪
সোমমঙ্গলবুধবৃহশুক্রশনিরবি
« জানুয়ারি
১০১১
১২১৩১৪১৫১৬১৭১৮
১৯২০২১২২২৩২৪২৫
২৬২৭২৮২৯
", "is_complex": true, diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_to_content_list_simple_res.json b/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_to_content_list_simple_res.json index 357f2843..57412c32 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_to_content_list_simple_res.json +++ b/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_to_content_list_simple_res.json @@ -1 +1 @@ -{"type": "simple_table", "raw_content": "<table><tr><td>1</td><td>2</td></tr><tr><td>3</td><td>4</td></tr></table>1234", "content": {"html": "
12
34
", "is_complex": false}} \ No newline at end of file +{"type": "simple_table", "content": {"html": "
12
34
", "is_complex": false}} \ No newline at end of file diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_table.py b/tests/llm_web_kit/extractor/html/recognizer/test_table.py index eb58bb98..bc91d52d 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_table.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_table.py @@ -132,7 +132,6 @@ def test_table_to_content_list_node_simple(self): expect_json = expect.read_text(encoding='utf-8') assert result['type'] == json.loads(expect_json)['type'] assert result['content']['is_complex'] == json.loads(expect_json)['content']['is_complex'] - assert result['raw_content'] == json.loads(expect_json)['raw_content'] self.assertTrue(result['content']['html'].startswith('')) self.assertTrue(result['content']['html'].endswith('
')) diff --git a/tests/llm_web_kit/input/assets/content_json.json b/tests/llm_web_kit/input/assets/content_json.json index 84eacaa6..a5bf4ef8 100644 --- a/tests/llm_web_kit/input/assets/content_json.json +++ b/tests/llm_web_kit/input/assets/content_json.json @@ -13,7 +13,6 @@ [ { "type": "paragraph", - "raw_content": "
==========================title====================================
", "content": [ { "c": "==========================title====================================", @@ -23,7 +22,6 @@ }, { "type": "title", - "raw_content": "

Title Test

", "content": { "title_content": "Title Test", "level": "1" @@ -31,7 +29,6 @@ }, { "type": "paragraph", - "raw_content": "
==========================code inline====================================
", "content": [ { "c": "==========================code inline====================================", @@ -41,7 +38,6 @@ }, { "type": "paragraph", - "raw_content": "
  • Dead simple\n Include prism.css and prism.js, use proper HTML5 code tags (code.language-xxxx), done!\n
  • ", "content": [ { "c": "Dead simple Include prism.css and prism.js, use proper HTML5 code tags (", @@ -59,7 +55,6 @@ }, { "type": "paragraph", - "raw_content": "
    ==========================code====================================
    ", "content": [ { "c": "==========================code====================================", @@ -77,7 +72,6 @@ }, { "type": "paragraph", - "raw_content": "
    ==========================simple table====================================
    ", "content": [ { "c": "==========================simple table====================================", @@ -87,7 +81,6 @@ }, { "type": "simple_table", - "raw_content": "
    1.12.1
    3.14.1
    ", "content": { "html": "
    1.12.1
    3.14.1
    ", "is_complex": false, @@ -96,7 +89,6 @@ }, { "type": "paragraph", - "raw_content": "
    ==========================complex table====================================
    ", "content": [ { "c": "==========================complex table====================================", @@ -106,7 +98,6 @@ }, { "type": "complex_table", - "raw_content": "
    123
    4
    567
    ", "content": { "html": "
    123
    4
    567
    ", "is_complex": true, @@ -115,7 +106,6 @@ }, { "type": "paragraph", - "raw_content": "
    ==========================equation inline====================================
    ", "content": [ { "c": "==========================equation inline====================================", @@ -125,7 +115,6 @@ }, { "type": "paragraph", - "raw_content": "

    测试行内公式x=4

    ", "content": [ { "c": "测试行内公式", @@ -143,7 +132,6 @@ }, { "type": "paragraph", - "raw_content": "
    ==========================equation interline====================================
    ", "content": [ { "c": "==========================equation interline====================================", @@ -153,7 +141,6 @@ }, { "type": "paragraph", - "raw_content": "

    公式如下:

    ", "content": [ { "c": "公式如下:", @@ -172,7 +159,6 @@ }, { "type": "paragraph", - "raw_content": "
    ==========================img====================================
    ", "content": [ { "c": "==========================img====================================", @@ -194,7 +180,6 @@ }, { "type": "paragraph", - "raw_content": "
    ==========================list====================================
    ", "content": [ { "c": "==========================list====================================", @@ -242,7 +227,6 @@ }, { "type": "paragraph", - "raw_content": "
    ==========================paragraph====================================
    ", "content": [ { "c": "==========================paragraph====================================", @@ -252,7 +236,6 @@ }, { "type": "paragraph", - "raw_content": "

    test paragraph

    ", "content": [ { "c": "test paragraph", @@ -262,7 +245,6 @@ }, { "type": "paragraph", - "raw_content": "
    ==========================audio====================================
    ", "content": [ { "c": "==========================audio====================================", @@ -272,7 +254,6 @@ }, { "type": "paragraph", - "raw_content": "
    ==========================video====================================
    ", "content": [ { "c": "==========================video====================================", @@ -282,7 +263,6 @@ }, { "type": "paragraph", - "raw_content": "
    ", "content": [ { "c": "Download the WEBM or MP4 video.", diff --git a/tests/llm_web_kit/main_html_parser/parser/assets/input_layout_batch_parser/test_fix_all_newlines.html b/tests/llm_web_kit/main_html_parser/parser/assets/input_layout_batch_parser/test_fix_all_newlines.html new file mode 100644 index 00000000..7e0592ce --- /dev/null +++ b/tests/llm_web_kit/main_html_parser/parser/assets/input_layout_batch_parser/test_fix_all_newlines.html @@ -0,0 +1,773 @@ + + + + + + Versace Bright Crystal EDT Perfume for Women 90ml | yangonbranded + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    top of page

    Versace Bright Crystal EDT Perfume for Women (In stock)

    @ 90ml retail packaging - K 319,000

    @ 90ml tester packaging (အဖုံးပါ) - K 259,000

     

    Made in Italy

     

    အမြဲပူအိုက်တဲ့ မြန်မာနိုင်ငံရာသီဥတုမှာ သုံးဖို့အဆင်ပြေတဲ့ Versace Bright Crystal က သလဲသီး၊ Yuzu လိမ္မော်၊ ရေခဲရနံ့ တွေနဲ့ Peony၊ စံကားဝါ၊ ကြာပန်း ရနံ့သင်းသင်းလေးတွေကို ပေါင်းထားတဲ့ fresh juicy floral ရနံ့ဖြစ်ပြီး drydown မှာ မပြင်းလွန်းတဲ့ ပယင်း၊ ကတိုး နှင့် မဟော်ဂနီရနံ့တွေသာ ပါလို့ classy ဖြစ်ပြီး လူကိုလန်းဆန်းစေတဲ့ soft and subtle airy scent ရနံ့သင်းသင်းလေး ဖြစ်ပါတယ်။

     

    ပေါ့ပေါ့ပါးပါး ခေါင်းမမူး ခေါင်းမကိုက်နိုင်တဲ့ အနံ့ fresh and clean feel ရှိတဲ့အနံ့ ဖြစ်ပြီး vanilla, powder, pepper နဲ့ aqua ရနံ့တွေကို ရှောင်ထားတဲ့ light scent ဖြစ်လို့ အဲဒီ note တွေမကြိုက်တဲ့သူတွေ ရေမွှေးပြင်းမကြိုက်တဲ့သူတွေ အတွက်ပိုသင့်ပါတယ်။

    မိန်းကလေးတော်တော်များသိပြီး သုံးပြီးဖြစ်လို့ နာမည်ကျော်ကြားပြီးဖြစ်တဲ့ အီတလီနိုင်ငံလုပ် ရေမွှေးဖြစ်ပါတယ်။

    Retail packaging နဲ့ tester packaging နှစ်မျိုးလုံးရှိပါတယ်။

    Tester packaging မှာအဖုံးပါတာမို့ ပုလင်းက retail packaging အတိုင်းဖြစ်ပါတယ်။ အပြင်စက္ကူဗူးကပဲ tester packaging ဗူးဖြစ်နေတာပါ။

    လက်ဆောင်ပေးဖို့ဝယ်တာဆိုရင်တော့ ဗူးခွံအမြင်လှတဲ့ retail packaging ကိုပဲဝယ်ဖို့ recommend လုပ်ပါတယ်။

    https://www.yangonbrandedperfume.com/product-page/versace-bright-crystal-edt-perfume-for-women-90ml-2

     

    အခြား In stock ရနိုင်တဲ့ Versace perfume တွေကို https://www.yangonbrandedperfume.com/versace မှာကြည့်နိုင်ပါတယ်။

    Versace Bright Crystal EDT Perfume for Women 90ml

    K319,000.00Price
        +
      • +

        ရေမွှေးတွေကို အိမ်အရောက်ပို့စနစ် home delivery နဲ့ဖြစ်ဖြစ်၊ Viber မှာ order တင်ပြီး ရန်ကုန်အိမ်မှာကိုယ်တိုင်လာယူတာဖြစ်ဖြစ် မှာယူနိုင်ပါတယ်။ ဖုံး/Viber 0943065356 ကိုဆက်ပြီး မေးနိုင်ပါတယ်။ Viber channel ကို join ထားရင် နေ့တိုင်း ဈေးလျှော့ထားတဲ့ရေမွှေးတွေနဲ့ review တွေဖတ်နိုင်ပါတယ်။

        +
      • +
      • +

        Yangon Branded ဆိုင်နာမည် တစ်မျိုးတည်းဖြင့်သာ ၂၀၁၁ ခုနှစ်မှစ၍ စဉ်ဆက်မပျက် ရောင်းလာခြင်းဖြစ်သည်။ ပုံမှန်ဝယ်ယူအားပေးသူ ရာပေါင်းများစွာ ရှိပြီးသားမို့ Yangon Branded ဆိုင်သတင်းကို အသိ၊မိတ်ဆွေထံ မဝယ်ခင် မေးကြည့်ပြီးမှသာ ဝယ်ယူရန် တိုက်တွန်းလိုပါတယ်။

        +
      • +
      • +

        ကိုယ်တိုင်တင်သွင်းလာသော Branded ရေမွှေးအစစ်များသက်သက်ကို သင့်တော်သောဈေးဖြင့် ရောင်းပါသည်။ အဆင့်မမှီရေမွှေးများ၊ replica ဆိုသောရေမွှေးများ လုံးဝမရောင်းပါ။

        +
      • +

      You are visitor number

      bottom of page
      + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
      +
      top of page
      +
      +
      +
      +
      +
      + +
      +
      +
      +
      +
      +
      +
      +
      +

      Perfume for Women

      +
      +
      + + +
      +
      +

      All Products

      +
      +
      + +
      +
      +
      + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
      +
      +
      +
      +
      +
      +
      +
      +
      + +
        +
      • +

        ရေမွှေးတွေကို အိမ်အရောက်ပို့စနစ် home delivery နဲ့ဖြစ်ဖြစ်၊ Viber မှာ order တင်ပြီး ရန်ကုန်အိမ်မှာကိုယ်တိုင်လာယူတာဖြစ်ဖြစ် မှာယူနိုင်ပါတယ်။ ဖုံး/Viber 0943065356 ကိုဆက်ပြီး မေးနိုင်ပါတယ်။ Viber channel ကို join ထားရင် နေ့တိုင်း ဈေးလျှော့ထားတဲ့ရေမွှေးတွေနဲ့ review တွေဖတ်နိုင်ပါတယ်။

        +
      • +
      • +

        Yangon Branded ဆိုင်နာမည် တစ်မျိုးတည်းဖြင့်သာ ၂၀၁၁ ခုနှစ်မှစ၍ စဉ်ဆက်မပျက် ရောင်းလာခြင်းဖြစ်သည်။ ပုံမှန်ဝယ်ယူအားပေးသူ ရာပေါင်းများစွာ ရှိပြီးသားမို့ Yangon Branded ဆိုင်သတင်းကို အသိ၊မိတ်ဆွေထံ မဝယ်ခင် မေးကြည့်ပြီးမှသာ ဝယ်ယူရန် တိုက်တွန်းလိုပါတယ်။

        +
      • +
      • +

        ကိုယ်တိုင်တင်သွင်းလာသော Branded ရေမွှေးအစစ်များသက်သက်ကို သင့်တော်သောဈေးဖြင့် ရောင်းပါသည်။ အဆင့်မမှီရေမွှေးများ၊ replica ဆိုသောရေမွှေးများ လုံးဝမရောင်းပါ။

        +
      • +
      +

      You are visitor number

      +
      +
      +
      +
      +
      bottom of page
      +
      + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/tests/llm_web_kit/main_html_parser/parser/test_layout_parser.py b/tests/llm_web_kit/main_html_parser/parser/test_layout_parser.py index 658f880e..9c2aa5c2 100644 --- a/tests/llm_web_kit/main_html_parser/parser/test_layout_parser.py +++ b/tests/llm_web_kit/main_html_parser/parser/test_layout_parser.py @@ -474,3 +474,31 @@ def test_code_newline(self): parts = parser.parse(pre_data) main_html = parts[PreDataJsonKey.MAIN_HTML] assert 'conda install bioconductor-annotationdbi' in main_html + + def test_fix_newlines(self): + # 构造测试html + typical_raw_tag_html = base_dir.joinpath( + 'assets/input_layout_batch_parser/test_fix_all_newlines.html').read_text( + encoding='utf-8') + html_source = base_dir.joinpath( + 'assets/input_layout_batch_parser/test_fix_all_newlines.html').read_text( + encoding='utf-8') + # 简化网页 + # 模型结果格式改写 + llm_path = 'assets/input_layout_batch_parser/test_code_newline.json' + llm_response = json.loads(base_dir.joinpath(llm_path).read_text(encoding='utf-8')) + pre_data = {'typical_raw_tag_html': typical_raw_tag_html, 'typical_raw_html': typical_raw_tag_html, + 'llm_response': llm_response, 'html_source': html_source} + pre_data = PreDataJson(pre_data) + # 映射 + parser = MapItemToHtmlTagsParser({}) + pre_data = parser.parse(pre_data) + + # 推广 + pre_data[PreDataJsonKey.DYNAMIC_ID_ENABLE] = True + pre_data[PreDataJsonKey.DYNAMIC_CLASSID_ENABLE] = True + pre_data[PreDataJsonKey.MORE_NOISE_ENABLE] = True + parser = LayoutBatchParser({}) + parts = parser.parse(pre_data) + main_html = parts[PreDataJsonKey.MAIN_HTML_BODY] + assert len(main_html) == 39746