diff --git a/llm_web_kit/extractor/html/recognizer/list.py b/llm_web_kit/extractor/html/recognizer/list.py index 723f292f..a33a7712 100644 --- a/llm_web_kit/extractor/html/recognizer/list.py +++ b/llm_web_kit/extractor/html/recognizer/list.py @@ -1,4 +1,5 @@ import json +import re from typing import Any, List, Tuple from lxml import html as lxml_html @@ -196,6 +197,8 @@ def __extract_list_item_text_recusive(el: HtmlElement): if len(paragraph) > 0 and paragraph[-1]['t'] == ParagraphTextType.TEXT: paragraph[-1]['c'] += _new_tail else: + if len(paragraph) > 0 and el.tag not in inline_tags: + _new_tail = '$br$' + _new_tail paragraph.append({'c': _new_tail, 't': ParagraphTextType.TEXT}) if paragraph: @@ -212,7 +215,8 @@ def __extract_list_item_text_recusive(el: HtmlElement): text_paragraph.append(new_paragraph) for n, item in enumerate(text_paragraph): - tem_json = json.dumps(item).replace('$br$', '\\n\\n') + tem_json = json.dumps(item, ensure_ascii=False) + tem_json = re.sub(r'(\s*\$br\$\s*)+', r'\\n', tem_json) text_paragraph[n] = json.loads(tem_json) return text_paragraph diff --git a/llm_web_kit/extractor/html/recognizer/text.py b/llm_web_kit/extractor/html/recognizer/text.py index d6b3e857..0b5b08cc 100644 --- a/llm_web_kit/extractor/html/recognizer/text.py +++ b/llm_web_kit/extractor/html/recognizer/text.py @@ -271,7 +271,7 @@ def __get_paragraph_text_recusive(el: HtmlElement, text: str) -> str: for item in para_text: if item['c'] is not None: - item['c'] = restore_sub_sup_from_text_regex(item['c']).replace('$br$', PARAGRAPH_SEPARATOR) + item['c'] = restore_sub_sup_from_text_regex(item['c']).replace('$br$', '\n') else: item['c'] = "" diff --git a/llm_web_kit/input/datajson.py b/llm_web_kit/input/datajson.py index c9b533c2..b02bc7d1 100644 --- a/llm_web_kit/input/datajson.py +++ b/llm_web_kit/input/datajson.py @@ -51,7 +51,7 @@ class StructureMapper(ABC): def __init__(self): self.__txt_para_splitter = '\n' - self.__md_para_splitter = '\n\n' + 
self.__md_para_splitter = '' self.__text_end = '\n' self.__list_item_start = '-' # md里的列表项前缀 self.__list_para_prefix = ' ' # 两个空格,md里的列表项非第一个段落的前缀:如果多个段落的情况,第二个以及之后的段落前缀 @@ -117,6 +117,8 @@ def __to_md(self, exclude_nodes=[], exclude_inline_types=[], use_raw_image_url=F if content_lst_node['type'] not in exclude_nodes: txt_content = self.__content_lst_node_2_md(content_lst_node, exclude_inline_types, use_raw_image_url) if txt_content and len(txt_content) > 0: + if len(md_blocks) > 0 and not txt_content.startswith("\n") and not md_blocks[-1].endswith("\n"): # only for non-empty content: insert a blank line when neither neighbouring block supplies a newline + md_blocks.append("\n\n") md_blocks.append(txt_content) diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_text.py b/tests/llm_web_kit/extractor/html/recognizer/test_text.py index e3c6119f..0bd90084 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_text.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_text.py @@ -31,7 +31,7 @@ def test_text_1(self): '中共中央政治局召开会议审议《成-2020年10月16日新闻联播', 'zh') == '知识乱象\n中共中央政治局召开会议审议《成-2020年10月16日新闻联播' result = self.text_recognize.recognize('http://www.baidu.com', [(html_to_element(html_content), html_to_element(html_content))], html_content) - assert '知识乱象\\n\\n 中共中央政治局' in element_to_html_unescaped(result[587][0]) + assert '知识乱象\\n 中共中央政治局' in element_to_html_unescaped(result[587][0]) def test_text_2(self): """ @@ -53,7 +53,7 @@ def test_text_2(self): input_data = DataJson(test_data) result = chain.extract(input_data) content_md = result.get_content_list().to_mm_md() - assert 'Selecting Rivet Sets:\n\n To develop maximum power' in content_md + assert 'Selecting Rivet Sets:\n To develop maximum power' in content_md def test_text_3(self): """ @@ -75,7 +75,7 @@ def test_text_3(self): input_data = DataJson(test_data) result = chain.extract(input_data) content_md = result.get_content_list().to_mm_md() - assert "1. 
The problem statement, all variables and given/known data\n\n A woman of height 1.7 meters stands directly in front of a convex mirror 2.0 meters away. The mirror has a radius of curvature, R=-50cm. Find the location and size of a woman's image using the ray diagram and mirror/lens equation.\n\n----------\n\n 2. The speed of light in a material is 2.50x10^8 meters per second. What is the index of refraction of the material?\n\n 2. Relevant equations\n\n 3. The attempt at a solution\n\n 1. di=22.22\n\n 2. Dont know" in content_md + assert "1. The problem statement, all variables and given/known data\n A woman of height 1.7 meters stands directly in front of a convex mirror 2.0 meters away. The mirror has a radius of curvature, R=-50cm. Find the location and size of a woman's image using the ray diagram and mirror/lens equation.\n\n----------\n\n 2. The speed of light in a material is 2.50x10^8 meters per second. What is the index of refraction of the material?\n\n 2. Relevant equations\n\n 3. The attempt at a solution\n 1. di=22.22\n\n 2. Dont know" in content_md def test_text_4(self): """ @@ -97,7 +97,7 @@ def test_text_4(self): input_data = DataJson(test_data) result = chain.extract(input_data) content_md = result.get_content_list().to_mm_md() - assert '1. The problem statement, all variables and given/known data\n\n 2. Relevant equations\n\n See attachment\n\n 3. The attempt at a solution\n\n I solved the problem' in content_md + assert '1. The problem statement, all variables and given/known data\n 2. Relevant equations\n\n See attachment\n\n 3. The attempt at a solution\n I solved the problem' in content_md def test_text_5(self): """ @@ -119,7 +119,7 @@ def test_text_5(self): input_data = DataJson(test_data) result = chain.extract(input_data) content_md = result.get_content_list().to_mm_md() - assert 'Please Note:\n\n 1. Charge the battery on receiving even if it will not be used soon.\n\n 2. 
Charge the battery EVERY MONTH if not in use for long periods to prevent over-discharging of the battery. This can cause irreparable damage to it.' in content_md + assert 'Please Note:\n 1. Charge the battery on receiving even if it will not be used soon.\n 2. Charge the battery EVERY MONTH if not in use for long periods to prevent over-discharging of the battery. This can cause irreparable damage to it.' in content_md def test_text_6(self): """ @@ -165,7 +165,7 @@ def test_text_8(self): with open(Path(__file__).parent.parent.parent / 'assets/extractor_chain_input/good_data/html/text8.html', 'r') as file: html_content = file.read() result = self.text_recognize.recognize('http://www.baidu.com', [(html_to_element(html_content), html_to_element(html_content))], html_content) - assert "40xy\' -ln(x^8) = 0\\n\\n\\n\\n Initial Condition: y(1)=31\\n\\n\\n\\n Work:" in element_to_html_unescaped(result[69][0]) and BaseHTMLElementRecognizer.is_cc_html(result[69][0]) + assert "40xy\' -ln(x^8) = 0\\n\\n Initial Condition: y(1)=31\\n\\n Work:" in element_to_html_unescaped(result[69][0]) and BaseHTMLElementRecognizer.is_cc_html(result[69][0]) def test_text_9(self): """ @@ -177,7 +177,7 @@ def test_text_9(self): with open(Path(__file__).parent.parent.parent / 'assets/extractor_chain_input/good_data/html/text9.html', 'r') as file: html_content = file.read() result = self.text_recognize.recognize('http://www.baidu.com', [(html_to_element(html_content), html_to_element(html_content))], html_content) - assert '1) Consider the formula f(x)=lim(n-->infinity)((x^n)/(1+x^n)).\\n\\n Let D={x:f(x) is an element of R}. Calculate f(x) for all x elements of D and determine where f: D-->R is continuous.\\n\\n\\n\\n 2) Let f: D-->R and suppose that f(x) greater than equal 0 for all x elements of D. Define sqrt(f)-->R by (sqrt(f))(x) = sqrt(f(x)). If f is continuous at c elements of D, prove that sqrt(f) is continuous at c.' 
in element_to_html_unescaped(result[63][0]) and BaseHTMLElementRecognizer.is_cc_html(result[63][0]) + assert '1) Consider the formula f(x)=lim(n-->infinity)((x^n)/(1+x^n)).\\n Let D={x:f(x) is an element of R}. Calculate f(x) for all x elements of D and determine where f: D-->R is continuous.\\n\\n 2) Let f: D-->R and suppose that f(x) greater than equal 0 for all x elements of D. Define sqrt(f)-->R by (sqrt(f))(x) = sqrt(f(x)). If f is continuous at c elements of D, prove that sqrt(f) is continuous at c.' in element_to_html_unescaped(result[63][0]) and BaseHTMLElementRecognizer.is_cc_html(result[63][0]) def test_text_10(self): """ @@ -199,7 +199,7 @@ def test_text_10(self): input_data = DataJson(test_data) result = chain.extract(input_data) content_md = result.get_content_list().to_mm_md() - assert 'So far I have 2 sets of questions (but I\'m onlin in the 2nd chapter now\n\n![:smile:](d80757e36ca9835f7237339959a1fa1d929bb5c5297acb457475459d6da12278 "Smile :smile:")\n\n)\n\n 1)\n\n In the book' in content_md + assert 'So far I have 2 sets of questions (but I\'m onlin in the 2nd chapter now\n\n![:smile:](d80757e36ca9835f7237339959a1fa1d929bb5c5297acb457475459d6da12278 "Smile :smile:")\n\n)\n\n 1)\n In the book' in content_md def test_text_11(self): """ @@ -381,7 +381,7 @@ def test_normalize_space2(self): input_data = DataJson(test_data) result = chain.extract(input_data) content_md = result.get_content_list().to_mm_md() - assert 'December 10th 2009, 06:42 PM\n\n fearless901\n\n Can someone please tell me my code wont work, error after error\n\n\n\n im need to write code to get height and time of the fluid in a reservoir, help guys. is my functions wrong? \n\n \n\n \n\n\n\n Code' in content_md + assert 'December 10th 2009, 06:42 PM\nfearless901\nCan someone please tell me my code wont work, error after error\nim need to write code to get height and time of the fluid in a reservoir, help guys. 
is my functions wrong?\nCode' in content_md def test_normalize_space3(self): """ @@ -405,7 +405,7 @@ def test_normalize_space3(self): input_data = DataJson(test_data) result = chain.extract(input_data) content_md = result.get_content_list().to_mm_md() - assert '### Volume 6, Issue 3, 01 February 1965\n\n- INFRARED LASER ACTION AND LIFETIMES IN ARGON II\n\n F. A. Horrigan , S. H. Koozekanani and R. A. Paananen\n\n Scitation Author Page\n\n PubMed\n\n Google Scholar\n\n Source' in content_md + assert '### Volume 6, Issue 3, 01 February 1965\n\n- INFRARED LASER ACTION AND LIFETIMES IN ARGON II\nF. A. Horrigan , S. H. Koozekanani and R. A. Paananen\nScitation Author Page\nPubMed\nGoogle Scholar\nSource' in content_md def test_normalize_space4(self): """ @@ -429,7 +429,7 @@ def test_normalize_space4(self): input_data = DataJson(test_data) result = chain.extract(input_data) content_md = result.get_content_list().to_mm_md() - assert '1. DrDu\n\n Lieber Hendrik, \n\n kannst Du hierzu was beitragen? \n\n Ich finde keinen rechten Grund' in content_md + assert '1. DrDu\nLieber Hendrik,\nkannst Du hierzu was beitragen?\nIch finde keinen rechten Grund' in content_md assert 'Show Ignored Content' not in content_md # 这个是隐藏标签,不应该被识别出来 def test_Lack_content1(self): @@ -478,7 +478,7 @@ def test_para_br(self): input_data = DataJson(test_data) result = chain.extract(input_data) content_md = result.get_content_list().to_mm_md() - assert 'The interquartile range formula is the first quartile subtracted from the third quartile:\n\n $IQR = Q_{3}-Q_{1}' in content_md + assert 'The interquartile range formula is the first quartile subtracted from the third quartile:\n $IQR = Q_{3}-Q_{1}' in content_md def test_para_has_none(self): """