Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion llm_web_kit/extractor/html/recognizer/list.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import json
import re
from typing import Any, List, Tuple

from lxml import html as lxml_html
Expand Down Expand Up @@ -196,6 +197,8 @@ def __extract_list_item_text_recusive(el: HtmlElement):
if len(paragraph) > 0 and paragraph[-1]['t'] == ParagraphTextType.TEXT:
paragraph[-1]['c'] += _new_tail
else:
if len(paragraph) > 0 and el.tag not in inline_tags:
_new_tail = '$br$' + _new_tail
paragraph.append({'c': _new_tail, 't': ParagraphTextType.TEXT})

if paragraph:
Expand All @@ -212,7 +215,8 @@ def __extract_list_item_text_recusive(el: HtmlElement):
text_paragraph.append(new_paragraph)

for n, item in enumerate(text_paragraph):
tem_json = json.dumps(item).replace('$br$', '\\n\\n')
tem_json = json.dumps(item, ensure_ascii=False)
tem_json = re.sub(r'(\s*\$br\$\s*)+', r'\\n', tem_json)
text_paragraph[n] = json.loads(tem_json)

return text_paragraph
Expand Down
2 changes: 1 addition & 1 deletion llm_web_kit/extractor/html/recognizer/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -271,7 +271,7 @@ def __get_paragraph_text_recusive(el: HtmlElement, text: str) -> str:

for item in para_text:
if item['c'] is not None:
item['c'] = restore_sub_sup_from_text_regex(item['c']).replace('$br$', PARAGRAPH_SEPARATOR)
item['c'] = restore_sub_sup_from_text_regex(item['c']).replace('$br$', '\n')
else:
item['c'] = ""

Expand Down
4 changes: 3 additions & 1 deletion llm_web_kit/input/datajson.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ class StructureMapper(ABC):

def __init__(self):
self.__txt_para_splitter = '\n'
self.__md_para_splitter = '\n\n'
self.__md_para_splitter = ''
self.__text_end = '\n'
self.__list_item_start = '-' # md里的列表项前缀
self.__list_para_prefix = ' ' # 两个空格,md里的列表项非第一个段落的前缀:如果多个段落的情况,第二个以及之后的段落前缀
Expand Down Expand Up @@ -117,6 +117,8 @@ def __to_md(self, exclude_nodes=[], exclude_inline_types=[], use_raw_image_url=F
if content_lst_node['type'] not in exclude_nodes:
txt_content = self.__content_lst_node_2_md(content_lst_node, exclude_inline_types,
use_raw_image_url)
if len(md_blocks) > 0 and not txt_content.startswith("\n") and not md_blocks[-1].endswith("\n"): # 若段落间没有换行,则添加换行
md_blocks.append("\n\n")
if txt_content and len(txt_content) > 0:
md_blocks.append(txt_content)

Expand Down
24 changes: 12 additions & 12 deletions tests/llm_web_kit/extractor/html/recognizer/test_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ def test_text_1(self):
'中共中央政治局召开会议审议《成-2020年10月16日新闻联播',
'zh') == '知识乱象\n中共中央政治局召开会议审议《成-2020年10月16日新闻联播'
result = self.text_recognize.recognize('http://www.baidu.com', [(html_to_element(html_content), html_to_element(html_content))], html_content)
assert '知识乱象\\n\\n 中共中央政治局' in element_to_html_unescaped(result[587][0])
assert '知识乱象\\n 中共中央政治局' in element_to_html_unescaped(result[587][0])

def test_text_2(self):
"""
Expand All @@ -53,7 +53,7 @@ def test_text_2(self):
input_data = DataJson(test_data)
result = chain.extract(input_data)
content_md = result.get_content_list().to_mm_md()
assert 'Selecting Rivet Sets:\n\n To develop maximum power' in content_md
assert 'Selecting Rivet Sets:\n To develop maximum power' in content_md

def test_text_3(self):
"""
Expand All @@ -75,7 +75,7 @@ def test_text_3(self):
input_data = DataJson(test_data)
result = chain.extract(input_data)
content_md = result.get_content_list().to_mm_md()
assert "1. The problem statement, all variables and given/known data\n\n A woman of height 1.7 meters stands directly in front of a convex mirror 2.0 meters away. The mirror has a radius of curvature, R=-50cm. Find the location and size of a woman's image using the ray diagram and mirror/lens equation.\n\n----------\n\n 2. The speed of light in a material is 2.50x10^8 meters per second. What is the index of refraction of the material?\n\n 2. Relevant equations\n\n 3. The attempt at a solution\n\n 1. di=22.22\n\n 2. Dont know" in content_md
assert "1. The problem statement, all variables and given/known data\n A woman of height 1.7 meters stands directly in front of a convex mirror 2.0 meters away. The mirror has a radius of curvature, R=-50cm. Find the location and size of a woman's image using the ray diagram and mirror/lens equation.\n\n----------\n\n 2. The speed of light in a material is 2.50x10^8 meters per second. What is the index of refraction of the material?\n\n 2. Relevant equations\n\n 3. The attempt at a solution\n 1. di=22.22\n\n 2. Dont know" in content_md

def test_text_4(self):
"""
Expand All @@ -97,7 +97,7 @@ def test_text_4(self):
input_data = DataJson(test_data)
result = chain.extract(input_data)
content_md = result.get_content_list().to_mm_md()
assert '1. The problem statement, all variables and given/known data\n\n 2. Relevant equations\n\n See attachment\n\n 3. The attempt at a solution\n\n I solved the problem' in content_md
assert '1. The problem statement, all variables and given/known data\n 2. Relevant equations\n\n See attachment\n\n 3. The attempt at a solution\n I solved the problem' in content_md

def test_text_5(self):
"""
Expand All @@ -119,7 +119,7 @@ def test_text_5(self):
input_data = DataJson(test_data)
result = chain.extract(input_data)
content_md = result.get_content_list().to_mm_md()
assert 'Please Note:\n\n 1. Charge the battery on receiving even if it will not be used soon.\n\n 2. Charge the battery EVERY MONTH if not in use for long periods to prevent over-discharging of the battery. This can cause irreparable damage to it.' in content_md
assert 'Please Note:\n 1. Charge the battery on receiving even if it will not be used soon.\n 2. Charge the battery EVERY MONTH if not in use for long periods to prevent over-discharging of the battery. This can cause irreparable damage to it.' in content_md

def test_text_6(self):
"""
Expand Down Expand Up @@ -165,7 +165,7 @@ def test_text_8(self):
with open(Path(__file__).parent.parent.parent / 'assets/extractor_chain_input/good_data/html/text8.html', 'r') as file:
html_content = file.read()
result = self.text_recognize.recognize('http://www.baidu.com', [(html_to_element(html_content), html_to_element(html_content))], html_content)
assert "40xy\' -ln(x^8) = 0\\n\\n\\n\\n Initial Condition: y(1)=31\\n\\n\\n\\n Work:" in element_to_html_unescaped(result[69][0]) and BaseHTMLElementRecognizer.is_cc_html(result[69][0])
assert "40xy\' -ln(x^8) = 0\\n\\n Initial Condition: y(1)=31\\n\\n Work:" in element_to_html_unescaped(result[69][0]) and BaseHTMLElementRecognizer.is_cc_html(result[69][0])

def test_text_9(self):
"""
Expand All @@ -177,7 +177,7 @@ def test_text_9(self):
with open(Path(__file__).parent.parent.parent / 'assets/extractor_chain_input/good_data/html/text9.html', 'r') as file:
html_content = file.read()
result = self.text_recognize.recognize('http://www.baidu.com', [(html_to_element(html_content), html_to_element(html_content))], html_content)
assert '1) Consider the formula f(x)=lim(n-->infinity)((x^n)/(1+x^n)).\\n\\n Let D={x:f(x) is an element of R}. Calculate f(x) for all x elements of D and determine where f: D-->R is continuous.\\n\\n\\n\\n 2) Let f: D-->R and suppose that f(x) greater than equal 0 for all x elements of D. Define sqrt(f)-->R by (sqrt(f))(x) = sqrt(f(x)). If f is continuous at c elements of D, prove that sqrt(f) is continuous at c.' in element_to_html_unescaped(result[63][0]) and BaseHTMLElementRecognizer.is_cc_html(result[63][0])
assert '1) Consider the formula f(x)=lim(n-->infinity)((x^n)/(1+x^n)).\\n Let D={x:f(x) is an element of R}. Calculate f(x) for all x elements of D and determine where f: D-->R is continuous.\\n\\n 2) Let f: D-->R and suppose that f(x) greater than equal 0 for all x elements of D. Define sqrt(f)-->R by (sqrt(f))(x) = sqrt(f(x)). If f is continuous at c elements of D, prove that sqrt(f) is continuous at c.' in element_to_html_unescaped(result[63][0]) and BaseHTMLElementRecognizer.is_cc_html(result[63][0])

def test_text_10(self):
"""
Expand All @@ -199,7 +199,7 @@ def test_text_10(self):
input_data = DataJson(test_data)
result = chain.extract(input_data)
content_md = result.get_content_list().to_mm_md()
assert 'So far I have 2 sets of questions (but I\'m onlin in the 2nd chapter now\n\n![:smile:](d80757e36ca9835f7237339959a1fa1d929bb5c5297acb457475459d6da12278 "Smile :smile:")\n\n)\n\n 1)\n\n In the book' in content_md
assert 'So far I have 2 sets of questions (but I\'m onlin in the 2nd chapter now\n\n![:smile:](d80757e36ca9835f7237339959a1fa1d929bb5c5297acb457475459d6da12278 "Smile :smile:")\n\n)\n\n 1)\n In the book' in content_md

def test_text_11(self):
"""
Expand Down Expand Up @@ -381,7 +381,7 @@ def test_normalize_space2(self):
input_data = DataJson(test_data)
result = chain.extract(input_data)
content_md = result.get_content_list().to_mm_md()
assert 'December 10th 2009, 06:42 PM\n\n fearless901\n\n Can someone please tell me my code wont work, error after error\n\n\n\n im need to write code to get height and time of the fluid in a reservoir, help guys. is my functions wrong? \n\n \n\n \n\n\n\n Code' in content_md
assert 'December 10th 2009, 06:42 PM\nfearless901\nCan someone please tell me my code wont work, error after error\nim need to write code to get height and time of the fluid in a reservoir, help guys. is my functions wrong?\nCode' in content_md

def test_normalize_space3(self):
"""
Expand All @@ -405,7 +405,7 @@ def test_normalize_space3(self):
input_data = DataJson(test_data)
result = chain.extract(input_data)
content_md = result.get_content_list().to_mm_md()
assert '### Volume 6, Issue 3, 01 February 1965\n\n- INFRARED LASER ACTION AND LIFETIMES IN ARGON II\n\n F. A. Horrigan , S. H. Koozekanani and R. A. Paananen\n\n Scitation Author Page\n\n PubMed\n\n Google Scholar\n\n Source' in content_md
assert '### Volume 6, Issue 3, 01 February 1965\n\n- INFRARED LASER ACTION AND LIFETIMES IN ARGON II\nF. A. Horrigan , S. H. Koozekanani and R. A. Paananen\nScitation Author Page\nPubMed\nGoogle Scholar\nSource' in content_md

def test_normalize_space4(self):
"""
Expand All @@ -429,7 +429,7 @@ def test_normalize_space4(self):
input_data = DataJson(test_data)
result = chain.extract(input_data)
content_md = result.get_content_list().to_mm_md()
assert '1. DrDu\n\n Lieber Hendrik, \n\n kannst Du hierzu was beitragen? \n\n Ich finde keinen rechten Grund' in content_md
assert '1. DrDu\nLieber Hendrik,\nkannst Du hierzu was beitragen?\nIch finde keinen rechten Grund' in content_md
assert 'Show Ignored Content' not in content_md # 这个是隐藏标签,不应该被识别出来

def test_Lack_content1(self):
Expand Down Expand Up @@ -478,7 +478,7 @@ def test_para_br(self):
input_data = DataJson(test_data)
result = chain.extract(input_data)
content_md = result.get_content_list().to_mm_md()
assert 'The interquartile range formula is the first quartile subtracted from the third quartile:\n\n $IQR = Q_{3}-Q_{1}' in content_md
assert 'The interquartile range formula is the first quartile subtracted from the third quartile:\n $IQR = Q_{3}-Q_{1}' in content_md

def test_para_has_none(self):
"""
Expand Down