diff --git a/llm_web_kit/extractor/html/recognizer/list.py b/llm_web_kit/extractor/html/recognizer/list.py
index 723f292f..a33a7712 100644
--- a/llm_web_kit/extractor/html/recognizer/list.py
+++ b/llm_web_kit/extractor/html/recognizer/list.py
@@ -1,4 +1,5 @@
import json
+import re
from typing import Any, List, Tuple
from lxml import html as lxml_html
@@ -196,6 +197,8 @@ def __extract_list_item_text_recusive(el: HtmlElement):
if len(paragraph) > 0 and paragraph[-1]['t'] == ParagraphTextType.TEXT:
paragraph[-1]['c'] += _new_tail
else:
+ if len(paragraph) > 0 and el.tag not in inline_tags:
+ _new_tail = '$br$' + _new_tail
paragraph.append({'c': _new_tail, 't': ParagraphTextType.TEXT})
if paragraph:
@@ -212,7 +215,8 @@ def __extract_list_item_text_recusive(el: HtmlElement):
text_paragraph.append(new_paragraph)
for n, item in enumerate(text_paragraph):
- tem_json = json.dumps(item).replace('$br$', '\\n\\n')
+ tem_json = json.dumps(item, ensure_ascii=False)
+ tem_json = re.sub(r'(\s*\$br\$\s*)+', r'\\n', tem_json)
text_paragraph[n] = json.loads(tem_json)
return text_paragraph
diff --git a/llm_web_kit/extractor/html/recognizer/text.py b/llm_web_kit/extractor/html/recognizer/text.py
index d6b3e857..0b5b08cc 100644
--- a/llm_web_kit/extractor/html/recognizer/text.py
+++ b/llm_web_kit/extractor/html/recognizer/text.py
@@ -271,7 +271,7 @@ def __get_paragraph_text_recusive(el: HtmlElement, text: str) -> str:
for item in para_text:
if item['c'] is not None:
- item['c'] = restore_sub_sup_from_text_regex(item['c']).replace('$br$', PARAGRAPH_SEPARATOR)
+ item['c'] = restore_sub_sup_from_text_regex(item['c']).replace('$br$', '\n')
else:
item['c'] = ""
diff --git a/llm_web_kit/input/datajson.py b/llm_web_kit/input/datajson.py
index c9b533c2..b02bc7d1 100644
--- a/llm_web_kit/input/datajson.py
+++ b/llm_web_kit/input/datajson.py
@@ -51,7 +51,7 @@ class StructureMapper(ABC):
def __init__(self):
self.__txt_para_splitter = '\n'
- self.__md_para_splitter = '\n\n'
+ self.__md_para_splitter = ''
self.__text_end = '\n'
self.__list_item_start = '-' # md里的列表项前缀
self.__list_para_prefix = ' ' # 两个空格,md里的列表项非第一个段落的前缀:如果多个段落的情况,第二个以及之后的段落前缀
@@ -117,6 +117,8 @@ def __to_md(self, exclude_nodes=[], exclude_inline_types=[], use_raw_image_url=F
if content_lst_node['type'] not in exclude_nodes:
txt_content = self.__content_lst_node_2_md(content_lst_node, exclude_inline_types,
use_raw_image_url)
+ if txt_content and md_blocks and not txt_content.startswith("\n") and not md_blocks[-1].endswith("\n"):  # only real (non-empty) content gets a blank-line separator, and only when neither neighbour already supplies a newline
+ md_blocks.append("\n\n")
if txt_content and len(txt_content) > 0:
md_blocks.append(txt_content)
diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_text.py b/tests/llm_web_kit/extractor/html/recognizer/test_text.py
index e3c6119f..0bd90084 100644
--- a/tests/llm_web_kit/extractor/html/recognizer/test_text.py
+++ b/tests/llm_web_kit/extractor/html/recognizer/test_text.py
@@ -31,7 +31,7 @@ def test_text_1(self):
'中共中央政治局召开会议审议《成-2020年10月16日新闻联播',
'zh') == '知识乱象\n中共中央政治局召开会议审议《成-2020年10月16日新闻联播'
result = self.text_recognize.recognize('http://www.baidu.com', [(html_to_element(html_content), html_to_element(html_content))], html_content)
- assert '知识乱象\\n\\n 中共中央政治局' in element_to_html_unescaped(result[587][0])
+ assert '知识乱象\\n 中共中央政治局' in element_to_html_unescaped(result[587][0])
def test_text_2(self):
"""
@@ -53,7 +53,7 @@ def test_text_2(self):
input_data = DataJson(test_data)
result = chain.extract(input_data)
content_md = result.get_content_list().to_mm_md()
- assert 'Selecting Rivet Sets:\n\n To develop maximum power' in content_md
+ assert 'Selecting Rivet Sets:\n To develop maximum power' in content_md
def test_text_3(self):
"""
@@ -75,7 +75,7 @@ def test_text_3(self):
input_data = DataJson(test_data)
result = chain.extract(input_data)
content_md = result.get_content_list().to_mm_md()
- assert "1. The problem statement, all variables and given/known data\n\n A woman of height 1.7 meters stands directly in front of a convex mirror 2.0 meters away. The mirror has a radius of curvature, R=-50cm. Find the location and size of a woman's image using the ray diagram and mirror/lens equation.\n\n----------\n\n 2. The speed of light in a material is 2.50x10^8 meters per second. What is the index of refraction of the material?\n\n 2. Relevant equations\n\n 3. The attempt at a solution\n\n 1. di=22.22\n\n 2. Dont know" in content_md
+ assert "1. The problem statement, all variables and given/known data\n A woman of height 1.7 meters stands directly in front of a convex mirror 2.0 meters away. The mirror has a radius of curvature, R=-50cm. Find the location and size of a woman's image using the ray diagram and mirror/lens equation.\n\n----------\n\n 2. The speed of light in a material is 2.50x10^8 meters per second. What is the index of refraction of the material?\n\n 2. Relevant equations\n\n 3. The attempt at a solution\n 1. di=22.22\n\n 2. Dont know" in content_md
def test_text_4(self):
"""
@@ -97,7 +97,7 @@ def test_text_4(self):
input_data = DataJson(test_data)
result = chain.extract(input_data)
content_md = result.get_content_list().to_mm_md()
- assert '1. The problem statement, all variables and given/known data\n\n 2. Relevant equations\n\n See attachment\n\n 3. The attempt at a solution\n\n I solved the problem' in content_md
+ assert '1. The problem statement, all variables and given/known data\n 2. Relevant equations\n\n See attachment\n\n 3. The attempt at a solution\n I solved the problem' in content_md
def test_text_5(self):
"""
@@ -119,7 +119,7 @@ def test_text_5(self):
input_data = DataJson(test_data)
result = chain.extract(input_data)
content_md = result.get_content_list().to_mm_md()
- assert 'Please Note:\n\n 1. Charge the battery on receiving even if it will not be used soon.\n\n 2. Charge the battery EVERY MONTH if not in use for long periods to prevent over-discharging of the battery. This can cause irreparable damage to it.' in content_md
+ assert 'Please Note:\n 1. Charge the battery on receiving even if it will not be used soon.\n 2. Charge the battery EVERY MONTH if not in use for long periods to prevent over-discharging of the battery. This can cause irreparable damage to it.' in content_md
def test_text_6(self):
"""
@@ -165,7 +165,7 @@ def test_text_8(self):
with open(Path(__file__).parent.parent.parent / 'assets/extractor_chain_input/good_data/html/text8.html', 'r') as file:
html_content = file.read()
result = self.text_recognize.recognize('http://www.baidu.com', [(html_to_element(html_content), html_to_element(html_content))], html_content)
- assert "40xy\' -ln(x^8) = 0\\n\\n\\n\\n Initial Condition: y(1)=31\\n\\n\\n\\n Work:" in element_to_html_unescaped(result[69][0]) and BaseHTMLElementRecognizer.is_cc_html(result[69][0])
+ assert "40xy\' -ln(x^8) = 0\\n\\n Initial Condition: y(1)=31\\n\\n Work:" in element_to_html_unescaped(result[69][0]) and BaseHTMLElementRecognizer.is_cc_html(result[69][0])
def test_text_9(self):
"""
@@ -177,7 +177,7 @@ def test_text_9(self):
with open(Path(__file__).parent.parent.parent / 'assets/extractor_chain_input/good_data/html/text9.html', 'r') as file:
html_content = file.read()
result = self.text_recognize.recognize('http://www.baidu.com', [(html_to_element(html_content), html_to_element(html_content))], html_content)
- assert '1) Consider the formula f(x)=lim(n-->infinity)((x^n)/(1+x^n)).\\n\\n Let D={x:f(x) is an element of R}. Calculate f(x) for all x elements of D and determine where f: D-->R is continuous.\\n\\n\\n\\n 2) Let f: D-->R and suppose that f(x) greater than equal 0 for all x elements of D. Define sqrt(f)-->R by (sqrt(f))(x) = sqrt(f(x)). If f is continuous at c elements of D, prove that sqrt(f) is continuous at c.' in element_to_html_unescaped(result[63][0]) and BaseHTMLElementRecognizer.is_cc_html(result[63][0])
+ assert '1) Consider the formula f(x)=lim(n-->infinity)((x^n)/(1+x^n)).\\n Let D={x:f(x) is an element of R}. Calculate f(x) for all x elements of D and determine where f: D-->R is continuous.\\n\\n 2) Let f: D-->R and suppose that f(x) greater than equal 0 for all x elements of D. Define sqrt(f)-->R by (sqrt(f))(x) = sqrt(f(x)). If f is continuous at c elements of D, prove that sqrt(f) is continuous at c.' in element_to_html_unescaped(result[63][0]) and BaseHTMLElementRecognizer.is_cc_html(result[63][0])
def test_text_10(self):
"""
@@ -199,7 +199,7 @@ def test_text_10(self):
input_data = DataJson(test_data)
result = chain.extract(input_data)
content_md = result.get_content_list().to_mm_md()
- assert 'So far I have 2 sets of questions (but I\'m onlin in the 2nd chapter now\n\n\n\n)\n\n 1)\n\n In the book' in content_md
+ assert 'So far I have 2 sets of questions (but I\'m onlin in the 2nd chapter now\n\n\n\n)\n\n 1)\n In the book' in content_md
def test_text_11(self):
"""
@@ -381,7 +381,7 @@ def test_normalize_space2(self):
input_data = DataJson(test_data)
result = chain.extract(input_data)
content_md = result.get_content_list().to_mm_md()
- assert 'December 10th 2009, 06:42 PM\n\n fearless901\n\n Can someone please tell me my code wont work, error after error\n\n\n\n im need to write code to get height and time of the fluid in a reservoir, help guys. is my functions wrong? \n\n \n\n \n\n\n\n Code' in content_md
+ assert 'December 10th 2009, 06:42 PM\nfearless901\nCan someone please tell me my code wont work, error after error\nim need to write code to get height and time of the fluid in a reservoir, help guys. is my functions wrong?\nCode' in content_md
def test_normalize_space3(self):
"""
@@ -405,7 +405,7 @@ def test_normalize_space3(self):
input_data = DataJson(test_data)
result = chain.extract(input_data)
content_md = result.get_content_list().to_mm_md()
- assert '### Volume 6, Issue 3, 01 February 1965\n\n- INFRARED LASER ACTION AND LIFETIMES IN ARGON II\n\n F. A. Horrigan , S. H. Koozekanani and R. A. Paananen\n\n Scitation Author Page\n\n PubMed\n\n Google Scholar\n\n Source' in content_md
+ assert '### Volume 6, Issue 3, 01 February 1965\n\n- INFRARED LASER ACTION AND LIFETIMES IN ARGON II\nF. A. Horrigan , S. H. Koozekanani and R. A. Paananen\nScitation Author Page\nPubMed\nGoogle Scholar\nSource' in content_md
def test_normalize_space4(self):
"""
@@ -429,7 +429,7 @@ def test_normalize_space4(self):
input_data = DataJson(test_data)
result = chain.extract(input_data)
content_md = result.get_content_list().to_mm_md()
- assert '1. DrDu\n\n Lieber Hendrik, \n\n kannst Du hierzu was beitragen? \n\n Ich finde keinen rechten Grund' in content_md
+ assert '1. DrDu\nLieber Hendrik,\nkannst Du hierzu was beitragen?\nIch finde keinen rechten Grund' in content_md
assert 'Show Ignored Content' not in content_md # 这个是隐藏标签,不应该被识别出来
def test_Lack_content1(self):
@@ -478,7 +478,7 @@ def test_para_br(self):
input_data = DataJson(test_data)
result = chain.extract(input_data)
content_md = result.get_content_list().to_mm_md()
- assert 'The interquartile range formula is the first quartile subtracted from the third quartile:\n\n $IQR = Q_{3}-Q_{1}' in content_md
+ assert 'The interquartile range formula is the first quartile subtracted from the third quartile:\n $IQR = Q_{3}-Q_{1}' in content_md
def test_para_has_none(self):
"""