From 85dc9c06d6567cd77970ed68e0218629608ea100 Mon Sep 17 00:00:00 2001 From: houlinfeng Date: Mon, 9 Jun 2025 16:03:36 +0800 Subject: [PATCH 01/31] =?UTF-8?q?feat:=E6=B8=85=E7=90=86=E5=85=83=E7=B4=A0?= =?UTF-8?q?=E5=B1=9E=E6=80=A7=EF=BC=8C=E4=BF=9D=E7=95=99=E5=9B=BE=E7=89=87?= =?UTF-8?q?=E7=9A=84=E6=9C=89=E6=95=88src=EF=BC=88=E6=8E=92=E9=99=A4base64?= =?UTF-8?q?=EF=BC=89=E3=80=81alt=EF=BC=8C=E4=BB=A5=E5=8F=8A=E6=89=80?= =?UTF-8?q?=E6=9C=89=E5=85=83=E7=B4=A0=E7=9A=84class=E5=92=8Cid"?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../simplify_html/simplify_html.py | 21 ++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/llm_web_kit/main_html_parser/simplify_html/simplify_html.py b/llm_web_kit/main_html_parser/simplify_html/simplify_html.py index b47de995..94e5d629 100644 --- a/llm_web_kit/main_html_parser/simplify_html/simplify_html.py +++ b/llm_web_kit/main_html_parser/simplify_html/simplify_html.py @@ -417,23 +417,34 @@ def is_meaningful_content(element) -> bool: def clean_attributes(element): - """清理元素属性,只保留图片的有效src以及所有元素的class和id.""" + """清理元素属性,保留图片的有效src(排除base64)、alt,以及所有元素的class和id.""" if element.tag == 'img': + # 获取图片相关属性 src = element.get('src', '').strip() + alt = element.get('alt', '').strip() class_attr = element.get('class', '').strip() id_attr = element.get('id', '').strip() - element.attrib.clear() # 先清除所有属性 - if src: + + element.attrib.clear() # 清除所有属性 + + # 保留非base64的src + if src and not src.startswith('data:image/'): element.set('src', src) + # 保留alt(如果非空) + if alt: + element.set('alt', alt) + # 保留class和id(如果非空) if class_attr: element.set('class', class_attr) if id_attr: element.set('id', id_attr) else: - # 对于其他元素,只保留class和id + # 非图片元素:只保留class和id class_attr = element.get('class', '').strip() id_attr = element.get('id', '').strip() - element.attrib.clear() # 先清除所有属性 + + element.attrib.clear() # 清除所有属性 + if class_attr: element.set('class', class_attr) if id_attr: From 
6cffbb6f93c4f44b24fde8964fdf44e03b3cbdb8 Mon Sep 17 00:00:00 2001 From: houlinfeng Date: Mon, 9 Jun 2025 19:15:51 +0800 Subject: [PATCH 02/31] =?UTF-8?q?feat:=20=E7=B2=BE=E7=AE=80=E6=8E=A7?= =?UTF-8?q?=E5=88=B6=E6=98=AF=E5=90=A6=E8=8E=B7=E5=8F=96XPATH?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- llm_web_kit/input/pre_data_json.py | 1 + llm_web_kit/main_html_parser/parser/tag_simplifier.py | 3 ++- .../llm_web_kit/main_html_parser/parser/test_tag_simplifier.py | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/llm_web_kit/input/pre_data_json.py b/llm_web_kit/input/pre_data_json.py index c8d602c0..2153073f 100644 --- a/llm_web_kit/input/pre_data_json.py +++ b/llm_web_kit/input/pre_data_json.py @@ -15,6 +15,7 @@ class PreDataJsonKey: TYPICAL_RAW_HTML = 'typical_raw_html' TYPICAL_RAW_TAG_HTML = 'typical_raw_tag_html' + IS_XPATH = True XPATH_MAPPING = 'xpath_mapping' TYPICAL_SIMPLIFIED_HTML = 'typical_simplified_html' # 模型打标字典 diff --git a/llm_web_kit/main_html_parser/parser/tag_simplifier.py b/llm_web_kit/main_html_parser/parser/tag_simplifier.py index 1d705c1b..eede6cfb 100644 --- a/llm_web_kit/main_html_parser/parser/tag_simplifier.py +++ b/llm_web_kit/main_html_parser/parser/tag_simplifier.py @@ -19,11 +19,12 @@ def parse(self, pre_data: PreDataJson) -> PreDataJson: """ # 获取输入数据 typical_raw_html = pre_data.get(PreDataJsonKey.TYPICAL_RAW_HTML, '') + is_xpath = pre_data.get(PreDataJsonKey.IS_XPATH, True) # layout_file_list = pre_data.get(PreDataJsonKey.LAYOUT_FILE_LIST, []) # 执行HTML标签简化逻辑 try: - simplified_html, original_html, _ = simplify_html(typical_raw_html) + simplified_html, original_html, _ = simplify_html(typical_raw_html, is_xpath=is_xpath) except TagSimplifiedParserException as e1: raise e1 except Exception as e2: diff --git a/tests/llm_web_kit/main_html_parser/parser/test_tag_simplifier.py b/tests/llm_web_kit/main_html_parser/parser/test_tag_simplifier.py index 3d656811..ea5d57e9 100644 --- 
a/tests/llm_web_kit/main_html_parser/parser/test_tag_simplifier.py +++ b/tests/llm_web_kit/main_html_parser/parser/test_tag_simplifier.py @@ -57,7 +57,7 @@ def test_tag_simplifier4(self): file_path = base_dir / 'assets/test_html_data/1.html' with open(file_path, 'r', encoding='utf-8') as file: raw_html = file.read() - data_dict = {PreDataJsonKey.TYPICAL_RAW_HTML: raw_html} + data_dict = {PreDataJsonKey.TYPICAL_RAW_HTML: raw_html, PreDataJsonKey.IS_XPATH: False} pre_data = PreDataJson(data_dict) pre_data_result = HtmlTagSimplifierParser({}).parse(pre_data) simplifier_raw_html = pre_data_result.get(PreDataJsonKey.TYPICAL_SIMPLIFIED_HTML, '') From c96bbf9a242919d996ce0ccfb895fac5e30d2023 Mon Sep 17 00:00:00 2001 From: houlinfeng Date: Mon, 9 Jun 2025 19:44:34 +0800 Subject: [PATCH 03/31] =?UTF-8?q?feat:=20=E7=B2=BE=E7=AE=80=E6=8E=A7?= =?UTF-8?q?=E5=88=B6=E6=98=AF=E5=90=A6=E8=8E=B7=E5=8F=96XPATH?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- llm_web_kit/input/pre_data_json.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm_web_kit/input/pre_data_json.py b/llm_web_kit/input/pre_data_json.py index 2153073f..7902d093 100644 --- a/llm_web_kit/input/pre_data_json.py +++ b/llm_web_kit/input/pre_data_json.py @@ -15,7 +15,7 @@ class PreDataJsonKey: TYPICAL_RAW_HTML = 'typical_raw_html' TYPICAL_RAW_TAG_HTML = 'typical_raw_tag_html' - IS_XPATH = True + IS_XPATH = 'is_xpath' XPATH_MAPPING = 'xpath_mapping' TYPICAL_SIMPLIFIED_HTML = 'typical_simplified_html' # 模型打标字典 From 61fd634b15f55917c4e447df6ec2335da4eb260d Mon Sep 17 00:00:00 2001 From: houlinfeng Date: Tue, 17 Jun 2025 16:40:22 +0800 Subject: [PATCH 04/31] =?UTF-8?q?feat:=20=E8=87=AA=E5=AE=9A=E4=B9=89?= =?UTF-8?q?=E6=A0=87=E7=AD=BE'marked-tail',=20'marked-text'=E9=85=8D?= =?UTF-8?q?=E7=BD=AE=E4=B8=BA=E8=A1=8C=E5=86=85=E6=A0=87=E7=AD=BE?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- 
llm_web_kit/main_html_parser/simplify_html/simplify_html.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm_web_kit/main_html_parser/simplify_html/simplify_html.py b/llm_web_kit/main_html_parser/simplify_html/simplify_html.py index 94e5d629..679b0742 100644 --- a/llm_web_kit/main_html_parser/simplify_html/simplify_html.py +++ b/llm_web_kit/main_html_parser/simplify_html/simplify_html.py @@ -12,7 +12,7 @@ 'samp', 'blink', 'b', 'code', 'nobr', 'strike', 'bdo', 'basefont', 'abbr', 'var', 'i', 'cccode-inline', 'select', 's', 'pic', 'label', 'mark', 'object', 'dd', 'dt', 'ccmath-inline', 'svg', 'li', 'button', 'a', 'font', 'dfn', 'sup', 'kbd', 'q', 'script', 'acronym', 'option', 'img', 'big', 'cite', - 'em', + 'em', 'marked-tail', 'marked-text' # 'td', 'th' } From a73c2b40da775e822ad43f2e1b1654a9d91d8737 Mon Sep 17 00:00:00 2001 From: houlinfeng Date: Thu, 19 Jun 2025 19:07:03 +0800 Subject: [PATCH 05/31] =?UTF-8?q?fix:=20=E9=87=8D=E5=91=BD=E5=90=8D?= =?UTF-8?q?=E8=87=AA=E5=AE=9A=E4=B9=89=E6=A0=87=E7=AD=BE=E5=90=8D=E7=A7=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../main_html_parser/simplify_html/simplify_html.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/llm_web_kit/main_html_parser/simplify_html/simplify_html.py b/llm_web_kit/main_html_parser/simplify_html/simplify_html.py index 679b0742..312fb3cd 100644 --- a/llm_web_kit/main_html_parser/simplify_html/simplify_html.py +++ b/llm_web_kit/main_html_parser/simplify_html/simplify_html.py @@ -47,6 +47,9 @@ # '-header', '_header', # 有特例,可能自定义的header中有标题,先注释 } +# 自定义标签 +tail_block_tag = "cc-alg-uc-text" + def add_data_uids(dom: html.HtmlElement) -> None: """为DOM所有节点添加data-uid属性(递归所有子节点)""" @@ -762,7 +765,7 @@ def process_paragraphs(paragraphs: List[Dict[str, str]], uid_map: Dict[str, html # trailing_text = last_child.tail # 创建wrapper元素 - wrapper = etree.Element('cc-alg-uc-tex') + wrapper = 
etree.Element(tail_block_tag) wrapper.set('_item_id', current_id) # 设置前面的文本 @@ -798,7 +801,7 @@ def process_paragraphs(paragraphs: List[Dict[str, str]], uid_map: Dict[str, html # 检查父节点的text if original_parent.text and original_parent.text.strip() == root_for_xpath.text.strip(): # 创建wrapper - wrapper = etree.Element('cc-alg-uc-tex') + wrapper = etree.Element(tail_block_tag) wrapper.set('_item_id', current_id) wrapper.text = original_parent.text @@ -818,7 +821,7 @@ def process_paragraphs(paragraphs: List[Dict[str, str]], uid_map: Dict[str, html for child in original_parent.iterchildren(): if child.tail and child.tail.strip() == root_for_xpath.text.strip(): # 创建wrapper - wrapper = etree.Element('cc-alg-uc-tex') + wrapper = etree.Element(tail_block_tag) wrapper.set('_item_id', current_id) wrapper.text = child.tail @@ -835,7 +838,7 @@ def process_paragraphs(paragraphs: List[Dict[str, str]], uid_map: Dict[str, html # 如果没有找到匹配的文本节点,使用父节点作为包裹对象 if not found: - wrapper = etree.Element('cc-alg-uc-tex') + wrapper = etree.Element(tail_block_tag) wrapper.set('_item_id', current_id) wrapper.text = root_for_xpath.text # 将父节点的内容移动到wrapper中 From 87d9f822620c6263ed52ba9836c6354a357f8cfa Mon Sep 17 00:00:00 2001 From: houlinfeng Date: Thu, 19 Jun 2025 19:14:04 +0800 Subject: [PATCH 06/31] =?UTF-8?q?fix:=20=E9=87=8D=E5=91=BD=E5=90=8D?= =?UTF-8?q?=E8=87=AA=E5=AE=9A=E4=B9=89=E6=A0=87=E7=AD=BE=E5=90=8D=E7=A7=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- llm_web_kit/main_html_parser/simplify_html/simplify_html.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm_web_kit/main_html_parser/simplify_html/simplify_html.py b/llm_web_kit/main_html_parser/simplify_html/simplify_html.py index 312fb3cd..f472c3a8 100644 --- a/llm_web_kit/main_html_parser/simplify_html/simplify_html.py +++ b/llm_web_kit/main_html_parser/simplify_html/simplify_html.py @@ -48,7 +48,7 @@ } # 自定义标签 -tail_block_tag = "cc-alg-uc-text" +tail_block_tag 
= 'cc-alg-uc-text' def add_data_uids(dom: html.HtmlElement) -> None: From ff910e9d750d2d9eacb75da8e7a89528855c0ad8 Mon Sep 17 00:00:00 2001 From: houlinfeng Date: Thu, 19 Jun 2025 19:44:35 +0800 Subject: [PATCH 07/31] =?UTF-8?q?fix:=20=E5=8E=BB=E6=8E=89=E5=86=97?= =?UTF-8?q?=E4=BD=99=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../main_html_parser/simplify_html/simplify_html.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/llm_web_kit/main_html_parser/simplify_html/simplify_html.py b/llm_web_kit/main_html_parser/simplify_html/simplify_html.py index f472c3a8..bfff8f29 100644 --- a/llm_web_kit/main_html_parser/simplify_html/simplify_html.py +++ b/llm_web_kit/main_html_parser/simplify_html/simplify_html.py @@ -833,21 +833,8 @@ def process_paragraphs(paragraphs: List[Dict[str, str]], uid_map: Dict[str, html index = parent.index(child) parent.insert(index + 1, wrapper) - found = True break - # 如果没有找到匹配的文本节点,使用父节点作为包裹对象 - if not found: - wrapper = etree.Element(tail_block_tag) - wrapper.set('_item_id', current_id) - wrapper.text = root_for_xpath.text - # 将父节点的内容移动到wrapper中 - for child in list(original_parent.iterchildren()): - wrapper.append(child) - original_parent.remove(child) - - # 添加wrapper到父节点 - original_parent.append(wrapper) else: # 块级元素直接设置属性 original_parent.set('_item_id', current_id) From d7e09d1ca72f5061aa19694a1e92a0d7db544404 Mon Sep 17 00:00:00 2001 From: houlinfeng Date: Thu, 10 Jul 2025 19:57:25 +0800 Subject: [PATCH 08/31] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8D=E5=A4=9A?= =?UTF-8?q?=E8=AF=AD=E7=A7=8D=E6=8B=BC=E6=8E=A5=E8=A7=84=E5=88=99?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- llm_web_kit/extractor/html/extractor.py | 8 +- llm_web_kit/extractor/html/recognizer/text.py | 24 +- .../good_data/html/br.html | 780 ++++++++++++++++++ .../good_data/html/zh.html | 663 +++++++++++++++ .../extractor/html/recognizer/test_text.py 
| 44 + .../extractor/test_extractor_chain.py | 2 +- 6 files changed, 1507 insertions(+), 14 deletions(-) create mode 100644 tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/br.html create mode 100644 tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/zh.html diff --git a/llm_web_kit/extractor/html/extractor.py b/llm_web_kit/extractor/html/extractor.py index 6f71c271..7ec6e1b1 100644 --- a/llm_web_kit/extractor/html/extractor.py +++ b/llm_web_kit/extractor/html/extractor.py @@ -90,6 +90,7 @@ def _do_extract(self, data_json: DataJson) -> DataJson: raw_html:str = data_json['html'] base_url:str = data_json['url'] main_html:str = data_json['main_html'] + language:str = data_json.get('language', 'en') # page_layout_type:str = data_json.get('page_layout_type', HTMLPageLayoutType.LAYOUT_ARTICLE) # 默认是文章类型 # main_html, method, title = self._extract_main_html(raw_html, base_url, page_layout_type) @@ -97,8 +98,9 @@ def _do_extract(self, data_json: DataJson) -> DataJson: parsed_html = [(main_html_element, raw_html)] for extract_func in [self._extract_code, self._extract_table, self._extract_math, self._extract_list, self._extract_image, - self._extract_title, self._extract_paragraph]: + self._extract_title]: parsed_html = extract_func(base_url, parsed_html, raw_html) + parsed_html = self._extract_paragraph(base_url, parsed_html, raw_html, language) # 过滤掉包含script和style标签的元素 filtered_parsed_html = [] @@ -222,7 +224,7 @@ def _extract_title(self, base_url:str, html_lst:List[Tuple[str,str]], raw_html:s lst = self.__title_recognizer.recognize(base_url, html_lst, raw_html) return lst - def _extract_paragraph(self, base_url:str, html_lst:List[Tuple[str,str]], raw_html:str) -> List[Tuple[str,str]]: + def _extract_paragraph(self, base_url:str, html_lst:List[Tuple[str,str]], raw_html:str, language:str) -> List[Tuple[str,str]]: """从html文本中提取段落. 
Args: @@ -233,7 +235,7 @@ def _extract_paragraph(self, base_url:str, html_lst:List[Tuple[str,str]], raw_ht Returns: """ - lst = self.__paragraph_recognizer.recognize(base_url, html_lst, raw_html) + lst = self.__paragraph_recognizer.recognize(base_url, html_lst, raw_html, language) return lst def __is_valid_node(self, node: dict) -> bool: diff --git a/llm_web_kit/extractor/html/recognizer/text.py b/llm_web_kit/extractor/html/recognizer/text.py index 22cc07f9..65b60d45 100644 --- a/llm_web_kit/extractor/html/recognizer/text.py +++ b/llm_web_kit/extractor/html/recognizer/text.py @@ -67,6 +67,9 @@ 'mjx-container', 'mjx-assistive-mml', 'strike', 'wbr', 'ins' } +# 词间无分隔符的语言 +no_separation_language = ['zh', 'ja', 'ko', 'wuu', 'th', 'km', 'lo', 'bo', 'ii', 'jv'] + class TextParagraphRecognizer(BaseHTMLElementRecognizer): """解析文本段落元素.""" @@ -93,7 +96,7 @@ def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_h return node @override - def recognize(self, base_url:str, main_html_lst: List[Tuple[HtmlElement | str, HtmlElement | str]], raw_html:str) -> List[Tuple[HtmlElement, HtmlElement]]: + def recognize(self, base_url:str, main_html_lst: List[Tuple[HtmlElement | str, HtmlElement | str]], raw_html:str, language:str = 'en') -> List[Tuple[HtmlElement, HtmlElement]]: """父类,解析文本段落元素. Args: @@ -111,11 +114,11 @@ def recognize(self, base_url:str, main_html_lst: List[Tuple[HtmlElement | str, H new_html_lst.append((html_element, raw_html_element)) else: lst = list(self.__extract_paragraphs(html_element)) - new_lst = self.__to_cctext_lst(lst) + new_lst = self.__to_cctext_lst(lst, language) new_html_lst.extend(new_lst) return new_html_lst - def __to_cctext_lst(self, lst: List[Tuple[HtmlElement | str, HtmlElement | str]]) -> List[Tuple[HtmlElement, HtmlElement]]: + def __to_cctext_lst(self, lst: List[Tuple[HtmlElement | str, HtmlElement | str]], language:str) -> List[Tuple[HtmlElement, HtmlElement]]: """将lst[Element, raw_html] 进行处理. 提出Element里的文字,做成<>标签. 
Args: @@ -129,7 +132,7 @@ def __to_cctext_lst(self, lst: List[Tuple[HtmlElement | str, HtmlElement | str]] el_element = html_to_element(el) if isinstance(el, str) else el raw_html_element = html_to_element(raw_html) if isinstance(raw_html, str) else raw_html - para_text = self.__get_paragraph_text(el_element) + para_text = self.__get_paragraph_text(el_element, language) if para_text: cctext_el = self._build_cc_element(CCTag.CC_TEXT, json.dumps(para_text, ensure_ascii=False, indent=4), '', html=element_to_html_unescaped(raw_html_element)) new_lst.append((cctext_el, raw_html_element)) @@ -185,20 +188,20 @@ def __combine_text(self, text1:str, text2:str, lang='en') -> str: lang: str: 语言 TODO 实现根据语言连接文本的不同方式, 还有就是一些特殊符号开头的连接不加空格。 """ text1 = text1.strip(' ') if text1 else '' - text2 = text2.strip(' ') if text2 else '' - if lang == 'zh': + text2 = text2.rstrip(' ') if text2 else '' + if lang in no_separation_language: txt = text1 + text2 return self.replace_entities(txt.strip(), entities_map) else: # 根据text1的最后一个字符和text2的第一个字符判断两个text之间的连接 if (text2[0] in string.punctuation) or (text2[0] in special_symbols) or (text2[0] in other_symbols) or (text1 and text1[-1] in other_symbols): words_sep = '' - else : + else: words_sep = ' ' txt = text1 + words_sep + text2 return self.replace_entities(txt.strip(), entities_map) - def __get_paragraph_text(self, root: HtmlElement) -> List[dict]: + def __get_paragraph_text(self, root: HtmlElement, language:str = 'en') -> List[dict]: """ 获取段落全部的文本. 
对于段落里的行内公式需要特定处理,转换为段落格式: @@ -235,7 +238,7 @@ def __get_paragraph_text_recusive(el: HtmlElement, text: str) -> str: pass else: if el.text and el.text.strip(): - text = self.__combine_text(text, el.text.strip()) + text = self.__combine_text(text, el.text.strip(), language) for child in el: text = __get_paragraph_text_recusive(child, text) @@ -244,7 +247,8 @@ def __get_paragraph_text_recusive(el: HtmlElement, text: str) -> str: if is_sub_sup: text += el.tail else: - text = self.__combine_text(text, el.tail.strip()) + new_tail = f' {el.tail.strip()}' if el.tail.startswith(' ') and el.tail.strip()[0] in string.punctuation else el.tail.strip() + text = self.__combine_text(text, new_tail, language) return text diff --git a/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/br.html b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/br.html new file mode 100644 index 00000000..d9b7ab22 --- /dev/null +++ b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/br.html @@ -0,0 +1,780 @@ + + + + +Faustina an Henañ - Wikipedia + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Mont d’an endalc’had +
+
+
+ + + + +
+
+ + + + + +
+
+
+
+
+
+
+
+
+
+
+ +
+
+
+ +
+
+
+
+
+ +

Faustina an Henañ

+ + +
+
+
+
+ +
+
+ + + +
+
+
+
+
+ + +
+
+
+
+
+
+ +
Eus Wikipedia
+
+
+ + +
Ur pennad Faustina zo ivez.
+
Faostina an Henañ
+
Aureus da Faustina an Henañ.
+

Annia Galeria Faustina, lesanvet Faustina Maior pe Faustina an Henañ (c. 100 - c. 140), a oa un impalaerez roman, pried da Antoninus Pius, impalaer eus 138 da 161. +

+ + + +

Ganet e oa e Roma, en un tiegezh roman eus Hispania. Merc'h e oa d'ar c'honsul Marcus Annius Verus. He mamm, Rupilia Faustina, a oa gournizez d'an impalaer Trajan. Nizez e oa da Vibia Sabina (83136 pe 137), pried an impalaer Hadrian. +

C'hoar e oa da Varcus Annius Libo, konsul e 128, ha da Varcus Annius Verus, praetor, marvet e 124, a oa pried da Zomitia Lucilla Minor ha tad da Varcus Aurelius ; ur c'hoar he doa ivez, Annia Cornificia Faustina[1]. +

+ +

Dimeziñ a reas da Antoninus Pius etre 110 ha 115. Pevar bugel o doe : +

+ + +
    +
  1. Historia Augusta, Buhez de Marcus-Aurelius, 1. +
  2. +
+ + + + +
+
+ +
+
+ +
+ +
+
+
+
+
+ + + +
+ + +
+
+ +
+
+
+
    + +
+
+ + + +
\ No newline at end of file diff --git a/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/zh.html b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/zh.html new file mode 100644 index 00000000..2d550c07 --- /dev/null +++ b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/zh.html @@ -0,0 +1,663 @@ + + + + + + + 浙江省淡水水产研究所 + + + + + + + + + + + + + + + +
+ + + +
+ +
+ + + + +
+ +
+ + +
+ +
+ + + +
+
+
+ + + + + +
+ + +
+
+
+ + +
+ + + + +
+ + + + +
+ + + + + +
+ + + +
+ +
+
+ + + + +
+ + + + + + + +
+ +
+ +
+
+
浙江省淡水水产研究所召开干部职工大会暨党风廉政建设大会
+ +
+ +
+

+ IMG_8429.JPG +

+

为贯彻落实省农业农村厅干部职工大会精神,2月19日下午,浙江省淡水水产研究所召开干部职工大会暨党风廉政建设大会。会议传达了王通林厅长重要讲话精神,全体职工观看了廉政警示教育片,层层签订了党风廉政建设责任书。会议要求,全所干部职工要围绕农业科技创新工作,比学赶超、奋勇争先,以“开局就要奔跑、起步就要跃进”的姿态迅速投入工作;对照干部队伍建设8个“还有之”和党风廉政建设5个“还有之”开展查摆自纠。一要把好方向盘厘清工作思路,明确主次,确保重要工作做踏实、细小工作不遗漏;二要紧盯绩效表抓好工作落实,树牢问题导向和目标导向,坚持“主动想、重点干、善协调、出经验”工作理念,鼓励全所干部职工创先争优,全力打造第一等的标志性成果;三要拧紧安全阀守牢工作底线,按照无规定疫病苗种场的建设要求,扎实做好水生动物疫病防控工作,确保种源生物安全,从实从细抓好安全生产工作和党风廉政建设,营造清正廉洁、干净干事的良好氛围。

+ +

 

+ +
+ + +
+
+ +
+
+ +
+ +
+ + + + + + + + + + + +
+ + + + + + + + \ No newline at end of file diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_text.py b/tests/llm_web_kit/extractor/html/recognizer/test_text.py index db392b29..5173e55d 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_text.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_text.py @@ -266,3 +266,47 @@ def test_text_line_exception(self): result = chain.extract(input_data) content_md = result.get_content_list().to_mm_md() assert 'sensi dell’art.33' in content_md + + def test_no_separation_language(self): + """ + Returns: + + """ + chain = ExtractSimpleFactory.create(self.config) + self.assertIsNotNone(chain) + test_data = { + 'track_id': 'text_md', + 'dataset_name': 'text_md', + 'url': 'https://www.zjfish.org/Talent/Detail/876289162604750/2174994673059649', + 'data_source_category': 'HTML', + 'path': 'zh.html', + 'file_bytes': 1000, + 'meta_info': {'input_datetime': '2020-01-01 00:00:00'}, + 'language': 'zh' + } + input_data = DataJson(test_data) + result = chain.extract(input_data) + content_md = result.get_content_list().to_mm_md() + assert '成果; 三' not in content_md + + def test_tail_space(self): + """ + Returns: + + """ + chain = ExtractSimpleFactory.create(self.config) + self.assertIsNotNone(chain) + test_data = { + 'track_id': 'text_md', + 'dataset_name': 'text_md', + 'url': 'https://br.wikipedia.org/wiki/Faustina_an_Hena%C3%B1', + 'data_source_category': 'HTML', + 'path': 'br.html', + 'file_bytes': 1000, + 'meta_info': {'input_datetime': '2020-01-01 00:00:00'}, + 'language': 'br' + } + input_data = DataJson(test_data) + result = chain.extract(input_data) + content_md = result.get_content_list().to_mm_md() + assert 'Henañ (c' in content_md diff --git a/tests/llm_web_kit/extractor/test_extractor_chain.py b/tests/llm_web_kit/extractor/test_extractor_chain.py index 14eb2ec1..8c209e15 100644 --- a/tests/llm_web_kit/extractor/test_extractor_chain.py +++ b/tests/llm_web_kit/extractor/test_extractor_chain.py @@ -451,7 
+451,7 @@ def test_para_is_short(self): input_data = DataJson(test_data) result = chain.extract(input_data) content_txt = result.get_content_list().to_nlp_md() - assert len(content_txt) == 1982 + assert len(content_txt) == 1983 def test_xml_tag(self): """测试xml标签.""" From 9113a0e968cda6c9b2d014bf5aba3427bce96c5f Mon Sep 17 00:00:00 2001 From: houlinfeng Date: Wed, 16 Jul 2025 15:20:24 +0800 Subject: [PATCH 09/31] =?UTF-8?q?fix:=20=E6=96=B0=E5=A2=9Elanguage?= =?UTF-8?q?=E5=8F=82=E6=95=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- llm_web_kit/extractor/html/extractor.py | 37 +++++++++---------- .../extractor/html/recognizer/audio.py | 2 +- .../extractor/html/recognizer/cccode.py | 3 +- .../extractor/html/recognizer/ccmath.py | 2 +- .../extractor/html/recognizer/image.py | 2 +- llm_web_kit/extractor/html/recognizer/list.py | 2 +- .../extractor/html/recognizer/recognizer.py | 2 +- .../extractor/html/recognizer/table.py | 3 +- .../extractor/html/recognizer/title.py | 2 +- .../extractor/html/recognizer/video.py | 2 +- 10 files changed, 29 insertions(+), 28 deletions(-) diff --git a/llm_web_kit/extractor/html/extractor.py b/llm_web_kit/extractor/html/extractor.py index 5dfeb208..f8509bf8 100644 --- a/llm_web_kit/extractor/html/extractor.py +++ b/llm_web_kit/extractor/html/extractor.py @@ -98,9 +98,8 @@ def _do_extract(self, data_json: DataJson) -> DataJson: parsed_html = [(main_html_element, raw_html)] for extract_func in [self._extract_code, self._extract_table, self._extract_math, self._extract_list, self._extract_image, - self._extract_title]: - parsed_html = extract_func(base_url, parsed_html, raw_html) - parsed_html = self._extract_paragraph(base_url, parsed_html, raw_html, language) + self._extract_title, self._extract_paragraph]: + parsed_html = extract_func(base_url, parsed_html, raw_html, language) # 过滤掉包含script和style标签的元素,在这里改,是因为math提取需要保留script标签 filtered_parsed_html = [] @@ -113,7 +112,7 @@ def 
_do_extract(self, data_json: DataJson) -> DataJson: # data_json['title'] = title return data_json - def _extract_code(self, base_url:str, html_lst:List[Tuple[HtmlElement, HtmlElement]], raw_html:str) -> List[Tuple[HtmlElement,HtmlElement]]: + def _extract_code(self, base_url:str, html_lst:List[Tuple[HtmlElement, HtmlElement]], raw_html:str, language:str) -> List[Tuple[HtmlElement,HtmlElement]]: """从html文本中提取代码. Args: @@ -123,10 +122,10 @@ def _extract_code(self, base_url:str, html_lst:List[Tuple[HtmlElement, HtmlEleme Returns: """ - lst = self.__code_recognizer.recognize(base_url, html_lst, raw_html) + lst = self.__code_recognizer.recognize(base_url, html_lst, raw_html, language) return lst - def _extract_math(self, base_url:str, html_lst:List[Tuple[str,str]], raw_html:str) -> List[Tuple[str,str]]: + def _extract_math(self, base_url:str, html_lst:List[Tuple[str,str]], raw_html:str, language:str) -> List[Tuple[str,str]]: """从html文本中提取数学公式. Args: @@ -137,10 +136,10 @@ def _extract_math(self, base_url:str, html_lst:List[Tuple[str,str]], raw_html:st Returns: """ - lst = self.__math_recognizer.recognize(base_url, html_lst, raw_html) + lst = self.__math_recognizer.recognize(base_url, html_lst, raw_html, language) return lst - def _extract_image(self, base_url:str, html_lst:List[Tuple[str,str]], raw_html:str) -> List[Tuple[str,str]]: + def _extract_image(self, base_url:str, html_lst:List[Tuple[str,str]], raw_html:str, language:str) -> List[Tuple[str,str]]: """从html文本中提取图片. 
Args: @@ -151,10 +150,10 @@ def _extract_image(self, base_url:str, html_lst:List[Tuple[str,str]], raw_html:s Returns: """ - lst = self.__image_recognizer.recognize(base_url, html_lst, raw_html) + lst = self.__image_recognizer.recognize(base_url, html_lst, raw_html, language) return lst - def _extract_audio(self, base_url:str, html_lst:List[Tuple[str,str]], raw_html:str) -> List[Tuple[str,str]]: + def _extract_audio(self, base_url:str, html_lst:List[Tuple[str,str]], raw_html:str, language:str) -> List[Tuple[str,str]]: """从html文本中提取音频. Args: @@ -165,10 +164,10 @@ def _extract_audio(self, base_url:str, html_lst:List[Tuple[str,str]], raw_html:s Returns: """ - lst = self.__audio_recognizer.recognize(base_url, html_lst, raw_html) + lst = self.__audio_recognizer.recognize(base_url, html_lst, raw_html, language) return lst - def _extract_video(self, base_url:str, html_lst:List[Tuple[str,str]], raw_html:str) -> List[Tuple[str,str]]: + def _extract_video(self, base_url:str, html_lst:List[Tuple[str,str]], raw_html:str, language:str) -> List[Tuple[str,str]]: """从html文本中提取视频. Args: @@ -179,10 +178,10 @@ def _extract_video(self, base_url:str, html_lst:List[Tuple[str,str]], raw_html:s Returns: """ - lst = self.__video_recognizer.recognize(base_url, html_lst, raw_html) + lst = self.__video_recognizer.recognize(base_url, html_lst, raw_html, language) return lst - def _extract_table(self, base_url:str, html_lst:List[Tuple[str,str]], raw_html:str) -> List[Tuple[str,str]]: + def _extract_table(self, base_url:str, html_lst:List[Tuple[str,str]], raw_html:str, language:str) -> List[Tuple[str,str]]: """从html文本中提取表格. 
Args: @@ -193,10 +192,10 @@ def _extract_table(self, base_url:str, html_lst:List[Tuple[str,str]], raw_html:s Returns: """ - lst = self.__table_recognizer.recognize(base_url, html_lst, raw_html) + lst = self.__table_recognizer.recognize(base_url, html_lst, raw_html, language) return lst - def _extract_list(self, base_url:str, html_lst:List[Tuple[str,str]], raw_html:str) -> List[Tuple[str,str]]: + def _extract_list(self, base_url:str, html_lst:List[Tuple[str,str]], raw_html:str, language:str) -> List[Tuple[str,str]]: """从html文本中提取列表. Args: @@ -207,10 +206,10 @@ def _extract_list(self, base_url:str, html_lst:List[Tuple[str,str]], raw_html:st Returns: """ - lst = self.__list_recognizer.recognize(base_url, html_lst, raw_html) + lst = self.__list_recognizer.recognize(base_url, html_lst, raw_html, language) return lst - def _extract_title(self, base_url:str, html_lst:List[Tuple[str,str]], raw_html:str) -> List[Tuple[str,str]]: + def _extract_title(self, base_url:str, html_lst:List[Tuple[str,str]], raw_html:str, language:str) -> List[Tuple[str,str]]: """从html文本中提取标题. 
Args: @@ -221,7 +220,7 @@ def _extract_title(self, base_url:str, html_lst:List[Tuple[str,str]], raw_html:s Returns: """ - lst = self.__title_recognizer.recognize(base_url, html_lst, raw_html) + lst = self.__title_recognizer.recognize(base_url, html_lst, raw_html, language) return lst def _extract_paragraph(self, base_url:str, html_lst:List[Tuple[str,str]], raw_html:str, language:str) -> List[Tuple[str,str]]: diff --git a/llm_web_kit/extractor/html/recognizer/audio.py b/llm_web_kit/extractor/html/recognizer/audio.py index f9e74a7b..b43ead6c 100644 --- a/llm_web_kit/extractor/html/recognizer/audio.py +++ b/llm_web_kit/extractor/html/recognizer/audio.py @@ -10,7 +10,7 @@ class AudioRecognizer(BaseHTMLElementRecognizer): """解析音频元素.""" @override - def recognize(self, base_url:str, main_html_lst: List[Tuple[HtmlElement,HtmlElement]], raw_html:str) -> List[Tuple[HtmlElement,HtmlElement]]: + def recognize(self, base_url:str, main_html_lst: List[Tuple[HtmlElement,HtmlElement]], raw_html:str, language:str = 'en') -> List[Tuple[HtmlElement,HtmlElement]]: """父类,解析音频元素. Args: diff --git a/llm_web_kit/extractor/html/recognizer/cccode.py b/llm_web_kit/extractor/html/recognizer/cccode.py index b84ce207..0baf436c 100644 --- a/llm_web_kit/extractor/html/recognizer/cccode.py +++ b/llm_web_kit/extractor/html/recognizer/cccode.py @@ -27,7 +27,8 @@ def recognize( self, base_url: str, main_html_lst: List[Tuple[HtmlElement, HtmlElement]], - raw_html: str + raw_html: str, + language:str = 'en' ) -> List[Tuple[HtmlElement, HtmlElement]]: """父类,解析代码元素. 
diff --git a/llm_web_kit/extractor/html/recognizer/ccmath.py b/llm_web_kit/extractor/html/recognizer/ccmath.py index 47896359..585c4f06 100644 --- a/llm_web_kit/extractor/html/recognizer/ccmath.py +++ b/llm_web_kit/extractor/html/recognizer/ccmath.py @@ -27,7 +27,7 @@ def __init__(self): self.cm = CCMATH() @override - def recognize(self, base_url: str, main_html_lst: List[Tuple[HtmlElement, HtmlElement]], raw_html: str) -> List[Tuple[HtmlElement, HtmlElement]]: + def recognize(self, base_url: str, main_html_lst: List[Tuple[HtmlElement, HtmlElement]], raw_html: str, language:str = 'en') -> List[Tuple[HtmlElement, HtmlElement]]: """父类,解析数学公式元素. Args: diff --git a/llm_web_kit/extractor/html/recognizer/image.py b/llm_web_kit/extractor/html/recognizer/image.py index 495bdc0f..30f8241d 100644 --- a/llm_web_kit/extractor/html/recognizer/image.py +++ b/llm_web_kit/extractor/html/recognizer/image.py @@ -67,7 +67,7 @@ def __ccimg_to_content_list(self, raw_html_segment: str, html_obj: HtmlElement) return result @override - def recognize(self, base_url: str, main_html_lst: List[Tuple[HtmlElement, HtmlElement]], raw_html: str) -> List[ + def recognize(self, base_url: str, main_html_lst: List[Tuple[HtmlElement, HtmlElement]], raw_html: str, language:str = 'en') -> List[ Tuple[HtmlElement, HtmlElement]]: """父类,解析图片元素. 
diff --git a/llm_web_kit/extractor/html/recognizer/list.py b/llm_web_kit/extractor/html/recognizer/list.py index 38d57f38..6be54bef 100644 --- a/llm_web_kit/extractor/html/recognizer/list.py +++ b/llm_web_kit/extractor/html/recognizer/list.py @@ -48,7 +48,7 @@ def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_h return ele_node @override - def recognize(self, base_url: str, main_html_lst: List[Tuple[HtmlElement, HtmlElement]], raw_html: str) -> List[Tuple[HtmlElement, HtmlElement]]: + def recognize(self, base_url: str, main_html_lst: List[Tuple[HtmlElement, HtmlElement]], raw_html: str, language:str = 'en') -> List[Tuple[HtmlElement, HtmlElement]]: """父类,解析列表元素. Args: diff --git a/llm_web_kit/extractor/html/recognizer/recognizer.py b/llm_web_kit/extractor/html/recognizer/recognizer.py index 570bf129..63497c9e 100644 --- a/llm_web_kit/extractor/html/recognizer/recognizer.py +++ b/llm_web_kit/extractor/html/recognizer/recognizer.py @@ -29,7 +29,7 @@ class BaseHTMLElementRecognizer(ABC): """基本的元素解析类.""" @abstractmethod - def recognize(self, base_url:str, main_html_lst: List[Tuple[HtmlElement, HtmlElement]], raw_html:str) -> List[Tuple[HtmlElement, HtmlElement]]: + def recognize(self, base_url:str, main_html_lst: List[Tuple[HtmlElement, HtmlElement]], raw_html:str, language:str) -> List[Tuple[HtmlElement, HtmlElement]]: """父类,解析html中的元素. Args: diff --git a/llm_web_kit/extractor/html/recognizer/table.py b/llm_web_kit/extractor/html/recognizer/table.py index fbcc8eec..d0faa969 100644 --- a/llm_web_kit/extractor/html/recognizer/table.py +++ b/llm_web_kit/extractor/html/recognizer/table.py @@ -24,7 +24,8 @@ def __init__(self): def recognize(self, base_url: str, main_html_lst: List[Tuple[HtmlElement, HtmlElement]], - raw_html: str) -> List[Tuple[HtmlElement, HtmlElement]]: + raw_html: str, + language:str = 'en') -> List[Tuple[HtmlElement, HtmlElement]]: """父类,解析表格元素. 
Args: diff --git a/llm_web_kit/extractor/html/recognizer/title.py b/llm_web_kit/extractor/html/recognizer/title.py index 4d7ea4fd..2f711d4b 100644 --- a/llm_web_kit/extractor/html/recognizer/title.py +++ b/llm_web_kit/extractor/html/recognizer/title.py @@ -39,7 +39,7 @@ def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_h return cctitle_content_node @override - def recognize(self, base_url: str, main_html_lst: List[Tuple[HtmlElement, HtmlElement]], raw_html: str) -> List[Tuple[HtmlElement, HtmlElement]]: + def recognize(self, base_url: str, main_html_lst: List[Tuple[HtmlElement, HtmlElement]], raw_html: str, language:str = 'en') -> List[Tuple[HtmlElement, HtmlElement]]: """父类,解析标题元素. Args: diff --git a/llm_web_kit/extractor/html/recognizer/video.py b/llm_web_kit/extractor/html/recognizer/video.py index bed7df5a..ac61650b 100644 --- a/llm_web_kit/extractor/html/recognizer/video.py +++ b/llm_web_kit/extractor/html/recognizer/video.py @@ -10,7 +10,7 @@ class VideoRecognizer(BaseHTMLElementRecognizer): """解析视元素.""" @override - def recognize(self, base_url:str, main_html_lst: List[Tuple[HtmlElement,HtmlElement]], raw_html:str) -> List[Tuple[HtmlElement,HtmlElement]]: + def recognize(self, base_url:str, main_html_lst: List[Tuple[HtmlElement,HtmlElement]], raw_html:str, language:str = 'en') -> List[Tuple[HtmlElement,HtmlElement]]: """父类,解析视频元素. 
Args: From 0e6deeece7df568ec59ff0da9331d2be0dd6f7b8 Mon Sep 17 00:00:00 2001 From: houlinfeng Date: Wed, 16 Jul 2025 15:39:52 +0800 Subject: [PATCH 10/31] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8D=E6=97=A0?= =?UTF-8?q?=E6=B3=95=E5=A4=84=E7=90=86xml?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- llm_web_kit/main_html_parser/typical_html/typical_html.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm_web_kit/main_html_parser/typical_html/typical_html.py b/llm_web_kit/main_html_parser/typical_html/typical_html.py index 72e9e24b..22ef66a9 100644 --- a/llm_web_kit/main_html_parser/typical_html/typical_html.py +++ b/llm_web_kit/main_html_parser/typical_html/typical_html.py @@ -39,7 +39,7 @@ def select_representative_html(html_strings): continue # 将字符串转换为文件对象 - file_obj = StringIO(html_dict['html']) + file_obj = StringIO(html_str) tree = html.parse(file_obj) # 找到body元素 From f39020697c1f09762e73583a8586c97d711fdafa Mon Sep 17 00:00:00 2001 From: houlinfeng Date: Tue, 22 Jul 2025 16:52:13 +0800 Subject: [PATCH 11/31] =?UTF-8?q?feat:=20noclip=E7=AE=A1=E7=BA=BF=E6=96=B0?= =?UTF-8?q?=E5=A2=9E=E9=A2=84=E5=A4=84=E7=90=86=EF=BC=9A=E5=88=A0=E9=99=A4?= =?UTF-8?q?=E8=A1=A8=E5=8D=95=E4=BA=A4=E4=BA=92=E5=BC=8F=E5=85=83=E7=B4=A0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../config/pipe_tpl/noclip_html_test.jsonc | 42 + llm_web_kit/extractor/html/pre_extractor.py | 73 ++ .../html/delete_interactive_element1.html | 814 ++++++++++++++++++ .../delete_interactive_element1_main.html | 73 ++ .../extractor/html/recognizer/test_text.py | 23 + 5 files changed, 1025 insertions(+) create mode 100644 llm_web_kit/config/pipe_tpl/noclip_html_test.jsonc create mode 100644 tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/delete_interactive_element1.html create mode 100644 
tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/delete_interactive_element1_main.html diff --git a/llm_web_kit/config/pipe_tpl/noclip_html_test.jsonc b/llm_web_kit/config/pipe_tpl/noclip_html_test.jsonc new file mode 100644 index 00000000..0e471ce6 --- /dev/null +++ b/llm_web_kit/config/pipe_tpl/noclip_html_test.jsonc @@ -0,0 +1,42 @@ +{ + "extractor_pipe": { + "enable": true, + "validate_input_format": false, + "pre_extractor": [ + { + "enable": true, + "python_class": "llm_web_kit.extractor.html.pre_extractor.TestHTMLFileToDataJsonPreExtractor", + "class_init_kwargs": { + "html_parent_dir": "tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/" + } + }, + { + "enable": true, + "python_class": "llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatNoClipPreExtractor", + "class_init_kwargs": {} + }, + { + "enable": true, + "python_class": "llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatFilterTablePreExtractor" + }, + { + "enable": true, + "python_class": "llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatCleanTagsPreExtractor", + "class_init_kwargs": {} + } + ], + "extractor": [ + { + "enable": true, + "python_class": "llm_web_kit.extractor.html.extractor.NoClipHTMLFIleFormatorExtractor", + "class_init_kwargs": {} + } + ], + "post_extractor": [ + { + "enable": true, + "python_class": "llm_web_kit.extractor.html.post_extractor.HTMLStripSpacePostExtractor" + } + ] + } +} diff --git a/llm_web_kit/extractor/html/pre_extractor.py b/llm_web_kit/extractor/html/pre_extractor.py index a9323a9d..d95f06e5 100644 --- a/llm_web_kit/extractor/html/pre_extractor.py +++ b/llm_web_kit/extractor/html/pre_extractor.py @@ -111,3 +111,76 @@ def __clean_invisible_elements(self, data_json: DataJson) -> str: for element in elements: remove_element(element) return element_to_html(tree) + + +class TestHTMLFileToDataJsonPreExtractor(HTMLFileFormatFilterPreExtractor): + 
"""为了方便noclip管线对测试数据进行测试,根据路径读取html文件和main_html文件,然后转换为DataJson格式。""" + + def __init__(self, config: dict, html_parent_dir: str): + """ + 初始化函数 + Args: + config: + html_parent_dir: + """ + super().__init__(config) + self.__html_parent_path = html_parent_dir + + @override + def _do_pre_extract(self, data_json: DataJson) -> DataJson: + """对输入的html和main_html拼装到DataJson中,形成标准输入格式.""" + proj_root_dir = get_proj_root_dir() + html_file_path = os.path.join(proj_root_dir, self.__html_parent_path, data_json.get('path')) + main_html_file_path = os.path.join(proj_root_dir, self.__html_parent_path, data_json.get('main_path')) + + with open(html_file_path, 'r', encoding='utf-8') as f: + html = f.read() + data_json['html'] = html + del data_json['path'] + + with open(main_html_file_path, 'r', encoding='utf-8') as f: + main_html = f.read() + data_json['main_html'] = main_html + del data_json['main_path'] + return data_json + + +class HTMLFileFormatNoClipPreExtractor(HTMLFileFormatFilterPreExtractor): + """noclip管线对main_html预处理.""" + def __init__(self, config: dict): + super().__init__(config) + + @override + def _do_pre_extract(self, data_json: DataJson) -> DataJson: + data_json['main_html'] = self.__clean_interactive_elements(data_json) + return data_json + + def __clean_interactive_elements(self, data_json: DataJson) -> str: + """清除main_html中交互式元素.""" + html_content = data_json['main_html'] + tree = html_to_element(html_content) + interactive_tags = ['input', 'select', 'textarea', 'button'] + # 删除内的交互标签及关联label + for tag in interactive_tags: + for element in tree.xpath(f'//body//{tag}'): + # 删除标签本身 + parent = element.getparent() + if parent is not None: + parent.remove(element) + + # 删除关联的label(通过for属性匹配) + if 'id' in element.attrib: + for label in tree.xpath(f'//body//label[@for="{element.attrib["id"]}"]'): + label.getparent().remove(label) + + # 处理
内的交互标签及关联label + for form in tree.xpath('//form'): + # 删除表单内所有交互标签 + form_elements = form.xpath('.//input | .//select | .//textarea | .//button | .//label | .//img') + for element in form_elements: + element.getparent().remove(element) + + # 检查表单是否为空(无子元素或仅剩空白文本) + if len(form.getchildren()) == 0 or not form.text_content().strip(): + form.getparent().remove(form) + return element_to_html(tree) diff --git a/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/delete_interactive_element1.html b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/delete_interactive_element1.html new file mode 100644 index 00000000..4d93c41e --- /dev/null +++ b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/delete_interactive_element1.html @@ -0,0 +1,814 @@ + + + + + + + + + Serve the Gay Muscle Stud | Straight Master + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+
+
+
+
+ +

Serve the Gay Muscle Stud

+ +
+

Gay Muscle Stud

+

You know you’ve been thinking about serving a gay muscle stud like me.

+

I am the Alpha Male that makes you so weak you can’t do anything but submit to my + needs. Your an inferior little bottom boy who wants to please a real man like me.

+

Whether it’s my ripped body, hot  meat or my tight virgin ass. Most likely you can’t + say the same for your hole, can you?

+

Kneel Before the Gay Muscle Stud

+

You’d cough your holes up to me in a heart beat so don’t fool yourself. It + boils down to this; I am the Master and you are a slave. So take your place and kneel in + the presence of a real gay muscle stud. Always up for anything so you bring it on and + see if you can shock me; it’s impossible.

+

If you are not lucky enough to find me when I am online, you can always get used by one + of my fellow gay + muscle hunks.

+
+
+ + +
+

Leave + a Reply

+

Your email address will not be published.

+

+

+

+ +

+

+

+

+
+
+
+
+
+ +
+
+ +
+ + + + \ No newline at end of file diff --git a/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/delete_interactive_element1_main.html b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/delete_interactive_element1_main.html new file mode 100644 index 00000000..24b5a35c --- /dev/null +++ b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/delete_interactive_element1_main.html @@ -0,0 +1,73 @@ + + +
+
+
+
+
+
+

Gay Muscle Stud

+

You know you’ve been thinking about serving a gay muscle stud like me.

+

I am the Alpha Male that makes you so weak you can’t do anything but submit to my needs. + Your an inferior little bottom boy who wants to please a real man like me.

+

Whether it’s my ripped body, hot  meat or my tight virgin ass. Most likely you can’t say + the same for your hole, can you?

+

Kneel Before the Gay Muscle Stud

+

You’d cough your holes up to me in a heart beat so don’t fool yourself. It boils down to + this; I am the Master and you are a slave. So take your place and kneel in the presence + of a real gay muscle stud. Always up for anything so you bring it on and see if you can + shock me; it’s impossible.

+

If you are not lucky enough to find me when I am online, you can always get used by one + of my fellow gay + muscle hunks.

+
+
+
+

Leave + a Reply

+

Your email address will not be published. + +

+

+

+ +

+

+
+
+
+
+
+
+
+
+ + \ No newline at end of file diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_text.py b/tests/llm_web_kit/extractor/html/recognizer/test_text.py index 5173e55d..a011299d 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_text.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_text.py @@ -310,3 +310,26 @@ def test_tail_space(self): result = chain.extract(input_data) content_md = result.get_content_list().to_mm_md() assert 'Henañ (c' in content_md + + def test_interactive_element(self): + """ + Returns: + + """ + chain = ExtractSimpleFactory.create(load_pipe_tpl('noclip_html_test')) + self.assertIsNotNone(chain) + test_data = { + 'track_id': 'text_md', + 'dataset_name': 'text_md', + 'url': 'https://br.wikipedia.org/wiki/Faustina_an_Hena%C3%B1', + 'data_source_category': 'HTML', + 'path': 'delete_interactive_element1.html', + 'main_path': 'delete_interactive_element1_main.html', + 'file_bytes': 1000, + 'meta_info': {'input_datetime': '2020-01-01 00:00:00'}, + 'language': 'en' + } + input_data = DataJson(test_data) + result = chain.extract(input_data) + main_html = result.get_content_list().to_main_html() + assert ' Date: Thu, 24 Jul 2025 11:32:51 +0800 Subject: [PATCH 12/31] =?UTF-8?q?feat:=20noclip=E7=AE=A1=E7=BA=BF=E6=96=B0?= =?UTF-8?q?=E5=A2=9E=E9=A2=84=E5=A4=84=E7=90=86=E9=85=8D=E7=BD=AE?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- llm_web_kit/config/pipe_tpl/noclip_html.jsonc | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/llm_web_kit/config/pipe_tpl/noclip_html.jsonc b/llm_web_kit/config/pipe_tpl/noclip_html.jsonc index 09f20bd9..25692972 100644 --- a/llm_web_kit/config/pipe_tpl/noclip_html.jsonc +++ b/llm_web_kit/config/pipe_tpl/noclip_html.jsonc @@ -3,6 +3,11 @@ "enable": true, "validate_input_format": false, "pre_extractor": [ + { + "enable": true, + "python_class": "llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatNoClipPreExtractor", + "class_init_kwargs": {} + 
}, { "enable": true, "python_class": "llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatFilterTablePreExtractor" From 2b0cdd4bdd50b04c42bbe0373103f5dd4de22f20 Mon Sep 17 00:00:00 2001 From: houlinfeng Date: Thu, 31 Jul 2025 19:29:13 +0800 Subject: [PATCH 13/31] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8Dtitle=E3=80=81li?= =?UTF-8?q?st=E3=80=81table=E3=80=81text=E7=AE=A1=E7=BA=BF=E4=B8=AD?= =?UTF-8?q?=E6=8D=A2=E8=A1=8C=E4=B8=8D=E6=AD=A3=E7=A1=AE=E4=BB=A5=E5=8F=8A?= =?UTF-8?q?=E7=BC=BA=E5=A4=B1=E5=86=85=E5=AE=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- llm_web_kit/extractor/html/extractor.py | 2 +- llm_web_kit/extractor/html/recognizer/list.py | 49 +- .../extractor/html/recognizer/table.py | 3 + llm_web_kit/extractor/html/recognizer/text.py | 17 +- .../extractor/html/recognizer/title.py | 25 +- llm_web_kit/libs/html_utils.py | 23 + .../good_data/html/Lack_content1.html | 1227 ++++++++++ .../good_data/html/Lack_content1_main.html | 99 + .../good_data/html/text_normalize_space1.html | 1298 +++++++++++ .../html/text_normalize_space1_main.html | 1131 +++++++++ .../good_data/html/text_normalize_space2.html | 141 ++ .../html/text_normalize_space2_main.html | 303 +++ .../good_data/html/text_normalize_space3.html | 2012 +++++++++++++++++ .../html/text_normalize_space3_main.html | 1392 ++++++++++++ .../recognizer/assets/cccode/mathworks.md | 66 +- .../recognizer/assets/cccode/mathworks.txt | 66 +- .../recognizer/assets/recognizer/title1.html | 207 ++ .../assets/recognizer/title1_main.html | 204 ++ .../extractor/html/recognizer/test_table.py | 4 +- .../extractor/html/recognizer/test_text.py | 96 + .../extractor/html/recognizer/test_title.py | 18 + 21 files changed, 8249 insertions(+), 134 deletions(-) create mode 100644 tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/Lack_content1.html create mode 100644 tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/Lack_content1_main.html 
create mode 100644 tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/text_normalize_space1.html create mode 100644 tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/text_normalize_space1_main.html create mode 100644 tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/text_normalize_space2.html create mode 100644 tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/text_normalize_space2_main.html create mode 100644 tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/text_normalize_space3.html create mode 100644 tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/text_normalize_space3_main.html create mode 100644 tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/title1.html create mode 100644 tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/title1_main.html diff --git a/llm_web_kit/extractor/html/extractor.py b/llm_web_kit/extractor/html/extractor.py index f8509bf8..0335fc7e 100644 --- a/llm_web_kit/extractor/html/extractor.py +++ b/llm_web_kit/extractor/html/extractor.py @@ -95,7 +95,7 @@ def _do_extract(self, data_json: DataJson) -> DataJson: # main_html, method, title = self._extract_main_html(raw_html, base_url, page_layout_type) main_html_element = html_to_element(main_html) - parsed_html = [(main_html_element, raw_html)] + parsed_html = [(main_html_element, main_html)] for extract_func in [self._extract_code, self._extract_table, self._extract_math, self._extract_list, self._extract_image, self._extract_title, self._extract_paragraph]: diff --git a/llm_web_kit/extractor/html/recognizer/list.py b/llm_web_kit/extractor/html/recognizer/list.py index 6be54bef..58d0690d 100644 --- a/llm_web_kit/extractor/html/recognizer/list.py +++ b/llm_web_kit/extractor/html/recognizer/list.py @@ -8,9 +8,12 @@ from llm_web_kit.extractor.html.recognizer.recognizer import ( BaseHTMLElementRecognizer, CCTag) from 
llm_web_kit.libs.doc_element_type import DocElementType, ParagraphTextType -from llm_web_kit.libs.html_utils import process_sub_sup_tags +from llm_web_kit.libs.html_utils import (html_normalize_space, + process_sub_sup_tags) from llm_web_kit.libs.text_utils import normalize_text_segment +from .text import inline_tags + class ListAttribute(): """列表属性.""" @@ -130,7 +133,7 @@ def __extract_list_item_text_recusive(el: HtmlElement): elif el.tag == CCTag.CC_CODE_INLINE and el.text and el.text.strip(): paragraph.append({'c': f'`{el.text}`', 't': ParagraphTextType.CODE_INLINE}) elif el.tag == 'br': - paragraph.append({'c': '\n\n', 't': ParagraphTextType.TEXT}) + paragraph.append({'c': '$br$', 't': ParagraphTextType.TEXT}) elif el.tag == 'sub' or el.tag == 'sup': # 处理sub和sup标签,转换为GitHub Flavored Markdown格式 current_text = '' @@ -154,9 +157,17 @@ def __extract_list_item_text_recusive(el: HtmlElement): result['child_list'] = child_list else: if el.text and el.text.strip(): - paragraph.append({'c': el.text, 't': ParagraphTextType.TEXT}) + _new_text = html_normalize_space(el.text.strip()) + if len(el) == 0 and el.tag not in inline_tags: + _new_text += '$br$' + paragraph.append({'c': _new_text, 't': ParagraphTextType.TEXT}) el.text = None - for child in el.getchildren(): + + for child in el: + if child.tag not in inline_tags: + if paragraph: + paragraph[-1]['c'] += '$br$' + p = __extract_list_item_text_recusive(child) if len(p) > 0: # 如果子元素有child_list,需要保存 @@ -166,24 +177,42 @@ def __extract_list_item_text_recusive(el: HtmlElement): if 'c' in p: if p['c'] != '': paragraph.append({'c': p['c'], 't': p.get('t', ParagraphTextType.TEXT)}) + else: + if paragraph: + last_paragraph = paragraph[-1]['c'] + if last_paragraph == '$br$': + del paragraph[-1] + else: + if last_paragraph.endswith('$br$'): + paragraph[-1]['c'] = last_paragraph[:-4] + if el.tag != 'li' and el.tail and el.tail.strip(): + _new_tail = html_normalize_space(el.tail.strip()) if is_sub_sup: # 
如果尾部文本跟在sub/sup后面,直接附加到最后一个文本段落中 if len(paragraph) > 0 and paragraph[-1]['t'] == ParagraphTextType.TEXT: - paragraph[-1]['c'] += el.tail + paragraph[-1]['c'] += _new_tail else: - paragraph.append({'c': el.tail, 't': ParagraphTextType.TEXT}) + paragraph.append({'c': _new_tail, 't': ParagraphTextType.TEXT}) else: - paragraph.append({'c': el.tail, 't': ParagraphTextType.TEXT}) + paragraph.append({'c': _new_tail, 't': ParagraphTextType.TEXT}) + if paragraph: # item['c'].strip(): 会导致前面处理br标签,添加的\n\n失效 result['c'] = ' '.join(normalize_text_segment(item['c'].strip()) for item in paragraph) return result - list_item_tags = ('li', 'dd', 'dt', 'ul', 'div', 'p') + list_item_tags = ('li', 'dd', 'dt', 'ul', 'div', 'p', 'span') if child.tag in list_item_tags: paragraph = __extract_list_item_text_recusive(child) if len(paragraph) > 0: - text_paragraph.append(paragraph) + tem_json = json.dumps(paragraph).replace('$br$\"}', '\"}') + new_paragraph = json.loads(tem_json) + text_paragraph.append(new_paragraph) + + for n, item in enumerate(text_paragraph): + tem_json = json.dumps(item).replace('$br$', '\\n\\n') + text_paragraph[n] = json.loads(tem_json) + return text_paragraph def __get_list_content_list(self, ele: HtmlElement, list_nest_level: int) -> list: @@ -201,7 +230,7 @@ def __get_list_content_list(self, ele: HtmlElement, list_nest_level: int) -> lis # 处理根元素文本 if ele.text and ele.text.strip(): # 检查元素是否包含数学或代码相关属性 - text_content = ele.text.strip() + text_content = html_normalize_space(ele.text.strip()) root_item = { 'c': text_content, 't': ParagraphTextType.TEXT, diff --git a/llm_web_kit/extractor/html/recognizer/table.py b/llm_web_kit/extractor/html/recognizer/table.py index 91a464a5..45402887 100644 --- a/llm_web_kit/extractor/html/recognizer/table.py +++ b/llm_web_kit/extractor/html/recognizer/table.py @@ -289,6 +289,8 @@ def __get_table_body(self, table_type, table_nest_level, table_root): def __do_extract_tables(self, root: HtmlElement) -> None: """递归处理所有子标签.""" if root.tag 
in ['table']: + temp_tail = root.tail + root.tail = None table_raw_html = self._element_to_html(root) table_type = self.__get_table_type(root) table_nest_level = self.__is_table_nested(root) @@ -297,6 +299,7 @@ def __do_extract_tables(self, root: HtmlElement) -> None: cc_element = self._build_cc_element( CCTag.CC_TABLE, table_body, tail_text, table_type=table_type, table_nest_level=table_nest_level, html=table_raw_html) + cc_element.tail = temp_tail self._replace_element(root, cc_element) return for child in root.iterchildren(): diff --git a/llm_web_kit/extractor/html/recognizer/text.py b/llm_web_kit/extractor/html/recognizer/text.py index 65b60d45..2bbd1f47 100644 --- a/llm_web_kit/extractor/html/recognizer/text.py +++ b/llm_web_kit/extractor/html/recognizer/text.py @@ -12,7 +12,8 @@ BaseHTMLElementRecognizer, CCTag) from llm_web_kit.libs.doc_element_type import DocElementType, ParagraphTextType from llm_web_kit.libs.html_utils import (element_to_html_unescaped, - html_to_element, process_sub_sup_tags) + html_normalize_space, html_to_element, + process_sub_sup_tags) special_symbols = [ # TODO 从文件读取 '®', # 注册商标符号 @@ -231,29 +232,33 @@ def __get_paragraph_text_recusive(el: HtmlElement, text: str) -> str: text = '' para_text.append({'c': el.text, 't': ParagraphTextType.CODE_INLINE}) elif el.tag in ['br']: - text += PARAGRAPH_SEPARATOR # TODO 这个地方直接加换行是错误点做法,需要利用数据结构来保证段落。 + text += '$br$' elif el.tag == 'sub' or el.tag == 'sup': text = process_sub_sup_tags(el, text, recursive=False) elif el.tag == 'audio': # 避免audio被识别为paragraph pass else: if el.text and el.text.strip(): - text = self.__combine_text(text, el.text.strip(), language) + tem_text = html_normalize_space(text) + _text = html_normalize_space(el.text.strip()) + text = self.__combine_text(tem_text, _text, language) for child in el: text = __get_paragraph_text_recusive(child, text) # 处理尾部文本 if el.tail and el.tail.strip(): if is_sub_sup: - text += el.tail + _new_tail = html_normalize_space(el.tail.strip()) + 
text += _new_tail else: - new_tail = f' {el.tail.strip()}' if el.tail.startswith(' ') and el.tail.strip()[0] in string.punctuation else el.tail.strip() + _new_tail = html_normalize_space(el.tail.strip()) + new_tail = f' {_new_tail}' if el.tail.startswith(' ') and el.tail.strip()[0] in string.punctuation else _new_tail text = self.__combine_text(text, new_tail, language) return text if final := __get_paragraph_text_recusive(root, ''): - para_text.append({'c': final, 't': ParagraphTextType.TEXT}) + para_text.append({'c': final.replace('$br$', PARAGRAPH_SEPARATOR), 't': ParagraphTextType.TEXT}) return para_text diff --git a/llm_web_kit/extractor/html/recognizer/title.py b/llm_web_kit/extractor/html/recognizer/title.py index 9638516e..2d158578 100644 --- a/llm_web_kit/extractor/html/recognizer/title.py +++ b/llm_web_kit/extractor/html/recognizer/title.py @@ -8,7 +8,10 @@ from llm_web_kit.extractor.html.recognizer.recognizer import ( BaseHTMLElementRecognizer, CCTag) from llm_web_kit.libs.doc_element_type import DocElementType -from llm_web_kit.libs.html_utils import process_sub_sup_tags +from llm_web_kit.libs.html_utils import (html_normalize_space, + process_sub_sup_tags) + +from .text import PARAGRAPH_SEPARATOR, inline_tags class TitleRecognizer(BaseHTMLElementRecognizer): @@ -124,16 +127,20 @@ def __extract_title_text_recusive(el: HtmlElement, with_tail: bool = True) -> li if el.tag == CCTag.CC_CODE_INLINE: blks.append(f'`{el.text}`') - elif el.tag in ['sub', 'sup']: - # 使用process_sub_sup_tags保留原始的sub/sup标签 - processed_text = process_sub_sup_tags(el, '', 'en', True) - if processed_text: - blks.append(processed_text) + elif el.tag in ['br']: + blks.extend(['$br$']) else: - blks.append((el.text or '').strip()) + if el.text and el.text.strip(): + _new_text = html_normalize_space(el.text.strip()) + if blks and el.tag not in inline_tags: + blks.extend(['$br$']) + blks.append(_new_text) for child in el.getchildren(): - blks.extend(__extract_title_text_recusive(child)) + 
if child.tag == 'sub' or child.tag == 'sup': + blks.extend([process_sub_sup_tags(child, '', recursive=False), child.tail]) + else: + blks.extend(__extract_title_text_recusive(child)) if with_tail: blks.append((el.tail or '').strip()) @@ -143,7 +150,7 @@ def __extract_title_text_recusive(el: HtmlElement, with_tail: bool = True) -> li # 根元素不保留结尾 blks = __extract_title_text_recusive(header_el, False) - return ' '.join(blk for blk in blks if blk) + return ' '.join(blk for blk in blks if blk).replace('$br$', PARAGRAPH_SEPARATOR) def __get_attribute(self, html:HtmlElement) -> Tuple[int, str]: """获取element的属性.""" diff --git a/llm_web_kit/libs/html_utils.py b/llm_web_kit/libs/html_utils.py index 0cd88500..72edbf56 100644 --- a/llm_web_kit/libs/html_utils.py +++ b/llm_web_kit/libs/html_utils.py @@ -3,6 +3,8 @@ import string from copy import deepcopy +from lxml import html as lxmlhtml +from lxml.etree import ParseError from lxml.html import HtmlElement, HTMLParser, fromstring, tostring special_symbols = [ # TODO 从文件读取 @@ -430,3 +432,24 @@ def get_cc_select_html(element: HtmlElement) -> HtmlElement: container.append(elem_copy) return container + + +def html_normalize_space(text: str) -> str: + """ + 标准化html中字符串中的空白字符 + Args: + text: + + Returns: + + """ + if not text.strip(): + return '' + try: + tem_text_el = lxmlhtml.fromstring(text.strip()) + _text = tem_text_el.xpath('normalize-space()') + return _text + except ParseError: + return '' + except Exception: + return text diff --git a/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/Lack_content1.html b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/Lack_content1.html new file mode 100644 index 00000000..7cd7313d --- /dev/null +++ b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/Lack_content1.html @@ -0,0 +1,1227 @@ + + + + + + + + + +Optical transitions between Landau levels: AA-stacked bilayer graphene + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + +Skip to main content + + + + + +
+ +
+
+ +
+
+
+ +
+
+
+
+
+ +
+
+banner image +
+
+
+
No data available.
+
Please log in to see this content.
+
You have no subscription access to this content.
+
No metrics data to plot.
+
The attempt to load metrics for this article has failed.
+
The attempt to plot a graph for these metrics has failed.
+
The full text of this article is not currently available.
+
+
+
+
+ +
+
+
+
+ +
+
+
+ + +
+
+
+ +
+
+
/content/aip/journal/apl/97/10/10.1063/1.3488806
+
+
+
http://aip.metastore.ingenta.com/content/aip/journal/apl/97/10/10.1063/1.3488806
+
+
+
+
+
+ +
/content/aip/journal/apl/97/10/10.1063/1.3488806
+
+Loading +

Data & Media loading...

+
+
+
+
+
+Loading +

Article metrics loading...

+
+
+
/content/aip/journal/apl/97/10/10.1063/1.3488806
+
2010-09-09
+
2016-08-27
+
+
+
+ +
+
+
+Loading +

Full text loading...

+
+
+
+
+

+Most read this month + +

+
Article
+
content/aip/journal/apl
+
Journal
+
5
+
3
+Loading +
+
+

+Most cited this month + +

+
+ ++ More +- Less +
+
+
+
+
+ +
+ + + + + + + + + +
+
+
+

Access Key

+
    +
  • FFree Content
  • +
  • OAOpen Access Content
  • +
  • SSubscribed Content
  • +
  • TFree Trial Content
  • +
+
+
+
+ +
+
+ +
+
+ + + +
+752b84549af89a08dbdd7fdb8b9568b5 +journal.articlezxybnytfddd +
+ + + + + + +
/content/realmedia?fmt=ahah&adPositionList=
+
&advertTargetUrl=//oascentral.aip.org/RealMedia/ads/&sitePageValue=apl.aip.org/97/10/10.1063/1.3488806&pageURL=http://scitation.aip.org/content/aip/journal/apl/97/10/10.1063/1.3488806'
+
x100,x101,x102,x103,
+
Position1,Position2,Position3,
+
Right1,Right2,Right3,
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/Lack_content1_main.html b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/Lack_content1_main.html new file mode 100644 index 00000000..36b9cef0 --- /dev/null +++ b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/Lack_content1_main.html @@ -0,0 +1,99 @@ + + +
+
+
+
+
+
+ + +
+ +
+ +
+
+ + +
+ +
+
+ +
+Loading +

Full text loading...

+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ + + \ No newline at end of file diff --git a/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/text_normalize_space1.html b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/text_normalize_space1.html new file mode 100644 index 00000000..1e57ea2b --- /dev/null +++ b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/text_normalize_space1.html @@ -0,0 +1,1298 @@ + + + + +CCCBDB Ionizaition Energy + + + + + + + + + + + + + + +
+ + + +

XVII.C.1.

+ + +

Calculated Ionization Energy for HF (Hydrogen fluoride)

+Experimental Ionization Energy is 16.03 ± 0.04 eV
+Please note! +These calculated ionizataion energies have the vibrational zero-point energy +(zpe) included, +but the zpe has NOT been scaled. +Click on an entry for more details, including the ionization energy with a scaled +zpe. + +
+ +Original data displayed. Press to display differences. +
+
+Differences displayed. Press to display original data. + +
+ +
+Ionization Energies in eV + + + + + + + + + + + + + + + + + + + + +
Methods with predefined basis sets
semi-empiricalPM3 
PM6 
compositeCBS-Q16.048

+Ionization Energies in eV + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + +
Methods with standard basis sets
STO-3G3-21G3-21G*6-31G6-31G*6-31G**6-31+G**6-311G*6-311G**6-31G(2df,p)TZVPcc-pVDZcc-pVTZcc-pVQZaug-cc-pVDZaug-cc-pVTZaug-cc-pVQZcc-pV(T+d)Z
hartree fockHF10.31313.33913.33913.96913.89313.92914.26514.10614.11213.89514.16713.98414.12314.15614.27114.18214.17314.123
density functionalLSDA12.24615.41715.41716.26116.29116.33016.98716.67616.71316.342 16.39316.844 17.009  16.844
SVWN 15.417    16.987           
BLYP11.47914.42314.42315.24815.26615.30316.01815.63015.65815.295 15.34515.812    15.812
B1B9511.71014.62214.62215.36515.33815.41015.94915.67015.69615.402 15.46615.747 15.921  15.747
B3LYP11.70214.65414.65415.43615.44015.47616.08415.77015.79415.46616.00415.52515.91416.01216.09716.07316.07215.914
B3LYPultrafine    15.441             
B3PW9111.73614.73414.73415.45915.45515.48916.01915.75715.78015.473 15.54615.865    15.865
mPW1PW9111.68514.65914.67915.38915.36215.39515.92315.65015.68815.394 15.45015.753    15.753
M06-2X    15.434             
PBEPBE11.56414.56814.56815.34815.36115.39916.05215.68915.72115.386 15.45015.856    15.856
HSEh1PBE    15.317             
Moller Plesset perturbationMP2FC10.52014.20514.20515.10615.31215.40415.93715.52415.62215.56415.81415.40315.96816.16816.04916.16616.24415.968
MP2FU 14.207  15.31615.40915.94115.53315.632  15.40815.98216.18316.053 16.26015.982
MP3    15.109             
MP4 14.140  15.128       15.790     
B2PLYP    15.298 15.927           
Configuration interactionCID    15.031  15.212          
CISD    15.021             
Quadratic configuration interactionQCISD 14.122  15.10115.19315.69215.30315.387  15.18915.684    15.684
QCISD(T)    15.108      15.19715.744    15.744
Coupled ClusterCCD 14.128  15.11215.20015.67115.300   15.19515.675    15.675
CCSD(T)           15.19715.74115.93815.81915.93316.00915.741
CCSD(T)=FULL    15.111        15.953  16.025 
+ + +
+Ionization Energies in eV + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Methods with effective core potentials (select basis sets)
CEP-31GCEP-31G*CEP-121GCEP-121G*LANL2DZSDD
hartree fockHF14.17514.08314.19014.11814.13314.131
density functionalB3LYP15.68915.65815.72215.72215.74715.738
Moller Plesset perturbationMP2FC15.41415.62515.50815.70515.41415.418
+ + +For descriptions of the methods (AM1, HF, MP2, ...) and basis sets (3-21G, 3-21G*, 6-31G, ...) see the +glossary in section I.C. +Predefined means the basis set used is determined by the method. + + + + + \ No newline at end of file diff --git a/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/text_normalize_space1_main.html b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/text_normalize_space1_main.html new file mode 100644 index 00000000..bade166c --- /dev/null +++ b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/text_normalize_space1_main.html @@ -0,0 +1,1131 @@ + + +

Calculated Ionization Energy for HF (Hydrogen fluoride)

+ +Experimental Ionization Energy is 16.03 ± 0.04 eV
+ +Please note! +These calculated ionizataion energies have the vibrational zero-point energy +(zpe) included, +but the zpe has NOT been scaled. +Click on an entry for more details, including the ionization energy with a scaled +zpe. + +
+Ionization Energies in eV + + + + + + + + + + + + + + + + + + + + +
Methods with predefined basis sets
semi-empiricalPM3 
PM6 
compositeCBS-Q16.048

+Ionization Energies in eV + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + +
Methods with standard basis sets
STO-3G3-21G3-21G*6-31G6-31G*6-31G**6-31+G**6-311G*6-311G**6-31G(2df,p)TZVPcc-pVDZcc-pVTZcc-pVQZaug-cc-pVDZaug-cc-pVTZaug-cc-pVQZcc-pV(T+d)Z
hartree fockHF10.31313.33913.33913.96913.89313.92914.26514.10614.11213.89514.16713.98414.12314.15614.27114.18214.17314.123
density functionalLSDA12.24615.41715.41716.26116.29116.33016.98716.67616.71316.342 16.39316.844 17.009  16.844
SVWN 15.417    16.987           
BLYP11.47914.42314.42315.24815.26615.30316.01815.63015.65815.295 15.34515.812    15.812
B1B9511.71014.62214.62215.36515.33815.41015.94915.67015.69615.402 15.46615.747 15.921  15.747
B3LYP11.70214.65414.65415.43615.44015.47616.08415.77015.79415.46616.00415.52515.91416.01216.09716.07316.07215.914
B3LYPultrafine    15.441             
B3PW9111.73614.73414.73415.45915.45515.48916.01915.75715.78015.473 15.54615.865    15.865
mPW1PW9111.68514.65914.67915.38915.36215.39515.92315.65015.68815.394 15.45015.753    15.753
M06-2X    15.434             
PBEPBE11.56414.56814.56815.34815.36115.39916.05215.68915.72115.386 15.45015.856    15.856
HSEh1PBE    15.317             
Moller Plesset perturbationMP2FC10.52014.20514.20515.10615.31215.40415.93715.52415.62215.56415.81415.40315.96816.16816.04916.16616.24415.968
MP2FU 14.207  15.31615.40915.94115.53315.632  15.40815.98216.18316.053 16.26015.982
MP3    15.109             
MP4 14.140  15.128       15.790     
B2PLYP    15.298 15.927           
Configuration interactionCID    15.031  15.212          
CISD    15.021             
Quadratic configuration interactionQCISD 14.122  15.10115.19315.69215.30315.387  15.18915.684    15.684
QCISD(T)    15.108      15.19715.744    15.744
Coupled ClusterCCD 14.128  15.11215.20015.67115.300   15.19515.675    15.675
CCSD(T)           15.19715.74115.93815.81915.93316.00915.741
CCSD(T)=FULL    15.111        15.953  16.025 
+ + +
+Ionization Energies in eV + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Methods with effective core potentials (select basis sets)
CEP-31GCEP-31G*CEP-121GCEP-121G*LANL2DZSDD
hartree fockHF14.17514.08314.19014.11814.13314.131
density functionalB3LYP15.68915.65815.72215.72215.74715.738
Moller Plesset perturbationMP2FC15.41415.62515.50815.70515.41415.418
+ + +For descriptions of the methods (AM1, HF, MP2, ...) and basis sets (3-21G, 3-21G*, 6-31G, ...) see the +glossary in section I.C. +Predefined means the basis set used is determined by the method. + + + \ No newline at end of file diff --git a/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/text_normalize_space2.html b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/text_normalize_space2.html new file mode 100644 index 00000000..716b7061 --- /dev/null +++ b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/text_normalize_space2.html @@ -0,0 +1,141 @@ + + Can someone please tell me my code wont work, error after error
  • December 10th 2009, 06:42 PM
    fearless901
    Can someone please tell me my code wont work, error after error
    im need to write code to get height and time of the fluid in a reservoir, help guys. is my functions wrong?


    Code:

    %  VALUES GIVEN AND ASSUMED
    +%DIMENSIONS AND OTHER CONSTANS
    +L=5;                                              %Length of Tube
    +x=0:.1:5;                                          %Length vector
    +dx=0.1;                                            %  FOR THE DISTANCE
    +Ao=1;                                              %Cross-Sectional Area of undeformed Tube
    +n=(L./dx)+1;                                      %number of nodes
    +Ro=10^3;                                          %Density of Fluid(Kg/m^3)
    +Co=10^3;                                          %propagation speed in an initially undeformed
    +kp=2/3*10^9;                                        %Tube law proportionality constant

    +% BOUNDARY CONDITIONS
    +u=zeros(1,n);                                      %Initializes velocity vector to zero
    +uo=-10;                                            %Velocity of withdrawn fluid(SET BY US)
    +u(1,1)=uo;                                          %Set the initial velocity to the velocity of withdrawn fluid
    +un=u;
    +t=0;                                              %Initializing t to zero
    +dt=0.00005;                                        %STEP SIZE FOR THE TIME
    +tmax=0.05;                                          %Total duration of simulation
    +A=ones(1,n);                                        %Initial Tube Area vector
    +An=A;                                              %INITIAL AREA

    +eval('A,kp,Ro,Ao,n');                              %calling evalc function
    +i_end=input('Enter 0 for open end  or 1 for closed end TUBE: ');
    +ip=input('Enter 0 for A vs x or 1 for u vs x: ');    %  Graphic preference

    +GRAPHICS(x,A,u,ip,t)                                %calling graphics function

    +while t<tmax       
    +      %clearing variables
    +   
    +    Co=Cn;   
    +    uo=un;
    +    Ao=An;
    +    DUA=0;
    +    Cn=c;
    +   
    +    while 1
    +     
    +        B=zeros((2.*n),1);
    +        M=zeros((2.*n),4);
    +     
    +        sol=zeros(1,(2.*n));
    +     
    +       
    +        [M,B]=setUA(dx,dt,n,i_end,Co,Ao,Cn,un,An)
    +       
    +        dUA=solveUA(M,B)
    +        for i=1:n
    +           
    +            dU=dUA(i,2*i+1);
    +            dA=dUA(i,2*i);
    +            un=uo+dU;
    +            An=Ao+dA;
    +           
    +        end
    +     
    +        c=evalc(An,kp,Ro,Ao);
    +        if max(abs(dUA-DUA))<5*10^-6,break,end         
    +        DUA=dUA;
    +    end
    +    t=t+dt;
    +    i_end=input('Enter 0 for open end scenario or 1 for closed end scenario: ');
    +    ip=input('Enter 0 for A vs x or 1 for U vs x: ');
    +    GRPHCS(x,A,U,ip,t)   
    +end

    (Headbang) (Headbang) (Headbang)
  • December 10th 2009, 10:48 PM
    CaptainBlack
    Quote:

    + Originally Posted by fearless901 View Post
    im need to write code to get height and time of the fluid in a reservoir, help guys. is my functions wrong?


    Code:

    %  VALUES GIVEN AND ASSUMED
    +%DIMENSIONS AND OTHER CONSTANS
    +L=5;                                              %Length of Tube
    +x=0:.1:5;                                          %Length vector
    +dx=0.1;                                            %  FOR THE DISTANCE
    +Ao=1;                                              %Cross-Sectional Area of undeformed Tube
    +n=(L./dx)+1;                                      %number of nodes
    +Ro=10^3;                                          %Density of Fluid(Kg/m^3)
    +Co=10^3;                                          %propagation speed in an initially undeformed
    +kp=2/3*10^9;                                        %Tube law proportionality constant

    +% BOUNDARY CONDITIONS
    +u=zeros(1,n);                                      %Initializes velocity vector to zero
    +uo=-10;                                            %Velocity of withdrawn fluid(SET BY US)
    +u(1,1)=uo;                                          %Set the initial velocity to the velocity of withdrawn fluid
    +un=u;
    +t=0;                                              %Initializing t to zero
    +dt=0.00005;                                        %STEP SIZE FOR THE TIME
    +tmax=0.05;                                          %Total duration of simulation
    +A=ones(1,n);                                        %Initial Tube Area vector
    +An=A;                                              %INITIAL AREA

    +eval('A,kp,Ro,Ao,n');                              %calling evalc function
    +i_end=input('Enter 0 for open end  or 1 for closed end TUBE: ');
    +ip=input('Enter 0 for A vs x or 1 for u vs x: ');    %  Graphic preference

    +GRAPHICS(x,A,u,ip,t)                                %calling graphics function

    +while t<tmax       
    +      %clearing variables

    +    Co=Cn;   
    +    uo=un;
    +    Ao=An;
    +    DUA=0;
    +    Cn=c;

    +    while 1

    +        B=zeros((2.*n),1);
    +        M=zeros((2.*n),4);

    +        sol=zeros(1,(2.*n));


    +        [M,B]=setUA(dx,dt,n,i_end,Co,Ao,Cn,un,An)

    +        dUA=solveUA(M,B)
    +        for i=1:n

    +            dU=dUA(i,2*i+1);
    +            dA=dUA(i,2*i);
    +            un=uo+dU;
    +            An=Ao+dA;

    +        end

    +        c=evalc(An,kp,Ro,Ao);
    +        if max(abs(dUA-DUA))<5*10^-6,break,end         
    +        DUA=dUA;
    +    end
    +    t=t+dt;
    +    i_end=input('Enter 0 for open end scenario or 1 for closed end scenario: ');
    +    ip=input('Enter 0 for A vs x or 1 for U vs x: ');
    +    GRPHCS(x,A,U,ip,t)   
    +end

    (Headbang) (Headbang) (Headbang)

    Tell us what the first error message is, or better yet just tell us all of them.

    +Also, what do you think this line does:

    +eval('A,kp,Ro,Ao,n'); %calling evalc function

    +And:

    +Consider single stepping through the code in the debugger.

    +CB
\ No newline at end of file diff --git a/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/text_normalize_space2_main.html b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/text_normalize_space2_main.html new file mode 100644 index 00000000..8eb18a3b --- /dev/null +++ b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/text_normalize_space2_main.html @@ -0,0 +1,303 @@ +
  • December 10th 2009, 06:42 PM
    + fearless901 +
    +
    Can someone please tell me my code wont work, error after error
    +
    im need to write code to get height and time of the fluid in a reservoir, help guys. is my functions wrong?
    +
    +
    +
    Code:
    +
    %  VALUES GIVEN AND ASSUMED
    + +%DIMENSIONS AND OTHER CONSTANS
    + +L=5;                                              %Length of Tube
    + +x=0:.1:5;                                          %Length vector
    + +dx=0.1;                                            %  FOR THE DISTANCE
    + +Ao=1;                                              %Cross-Sectional Area of undeformed Tube
    + +n=(L./dx)+1;                                      %number of nodes
    + +Ro=10^3;                                          %Density of Fluid(Kg/m^3)
    + +Co=10^3;                                          %propagation speed in an initially undeformed
    + +kp=2/3*10^9;                                        %Tube law proportionality constant
    +
    + +% BOUNDARY CONDITIONS
    + +u=zeros(1,n);                                      %Initializes velocity vector to zero
    + +uo=-10;                                            %Velocity of withdrawn fluid(SET BY US)
    + +u(1,1)=uo;                                          %Set the initial velocity to the velocity of withdrawn fluid
    + +un=u;
    + +t=0;                                              %Initializing t to zero
    + +dt=0.00005;                                        %STEP SIZE FOR THE TIME
    + +tmax=0.05;                                          %Total duration of simulation
    + +A=ones(1,n);                                        %Initial Tube Area vector
    + +An=A;                                              %INITIAL AREA
    + + 
    + +eval('A,kp,Ro,Ao,n');                              %calling evalc function
    + +i_end=input('Enter 0 for open end  or 1 for closed end TUBE: ');
    + +ip=input('Enter 0 for A vs x or 1 for u vs x: ');    %  Graphic preference
    + + 
    + +GRAPHICS(x,A,u,ip,t)                                %calling graphics function
    + + 
    + +while t<tmax       
    + +      %clearing variables
    + +   
    + +    Co=Cn;   
    + +    uo=un;
    + +    Ao=An;
    + +    DUA=0;
    + +    Cn=c;
    + +   
    + +    while 1
    + +     
    + +        B=zeros((2.*n),1);
    + +        M=zeros((2.*n),4);
    + +     
    + +        sol=zeros(1,(2.*n));
    + +     
    + +       
    + +        [M,B]=setUA(dx,dt,n,i_end,Co,Ao,Cn,un,An)
    + +       
    + +        dUA=solveUA(M,B)
    + +        for i=1:n
    + +           
    + +            dU=dUA(i,2*i+1);
    + +            dA=dUA(i,2*i);
    + +            un=uo+dU;
    + +            An=Ao+dA;
    + +           
    + +        end
    + +     
    + +        c=evalc(An,kp,Ro,Ao);
    + +        if max(abs(dUA-DUA))<5*10^-6,break,end         
    + +        DUA=dUA;
    + +    end
    + +    t=t+dt;
    + +    i_end=input('Enter 0 for open end scenario or 1 for closed end scenario: ');
    + +    ip=input('Enter 0 for A vs x or 1 for U vs x: ');
    + +    GRPHCS(x,A,U,ip,t)   
    + +end

    +(Headbang) (Headbang) (Headbang)
    +
  • +
  • December 10th 2009, 10:48 PM
    + CaptainBlack +
    +
    Quote:
    +

    + Originally Posted by fearless901 View Post
    +
    im need to write code to get height and time of the fluid in a reservoir, help guys. is my functions wrong?
    +
    +
    +
    Code:
    +
    %  VALUES GIVEN AND ASSUMED
    + +%DIMENSIONS AND OTHER CONSTANS
    + +L=5;                                              %Length of Tube
    + +x=0:.1:5;                                          %Length vector
    + +dx=0.1;                                            %  FOR THE DISTANCE
    + +Ao=1;                                              %Cross-Sectional Area of undeformed Tube
    + +n=(L./dx)+1;                                      %number of nodes
    + +Ro=10^3;                                          %Density of Fluid(Kg/m^3)
    + +Co=10^3;                                          %propagation speed in an initially undeformed
    + +kp=2/3*10^9;                                        %Tube law proportionality constant
    + + 
    + +% BOUNDARY CONDITIONS
    + +u=zeros(1,n);                                      %Initializes velocity vector to zero
    + +uo=-10;                                            %Velocity of withdrawn fluid(SET BY US)
    + +u(1,1)=uo;                                          %Set the initial velocity to the velocity of withdrawn fluid
    + +un=u;
    + +t=0;                                              %Initializing t to zero
    + +dt=0.00005;                                        %STEP SIZE FOR THE TIME
    + +tmax=0.05;                                          %Total duration of simulation
    + +A=ones(1,n);                                        %Initial Tube Area vector
    + +An=A;                                              %INITIAL AREA
    + + 
    + +eval('A,kp,Ro,Ao,n');                              %calling evalc function
    + +i_end=input('Enter 0 for open end  or 1 for closed end TUBE: ');
    + +ip=input('Enter 0 for A vs x or 1 for u vs x: ');    %  Graphic preference
    + + 
    + +GRAPHICS(x,A,u,ip,t)                                %calling graphics function
    + + 
    + +while t<tmax       
    + +      %clearing variables
    + + 
    + +    Co=Cn;   
    + +    uo=un;
    + +    Ao=An;
    + +    DUA=0;
    + +    Cn=c;
    + + 
    + +    while 1
    + + 
    + +        B=zeros((2.*n),1);
    + +        M=zeros((2.*n),4);
    + + 
    + +        sol=zeros(1,(2.*n));
    + + 
    + + 
    + +        [M,B]=setUA(dx,dt,n,i_end,Co,Ao,Cn,un,An)
    + + 
    + +        dUA=solveUA(M,B)
    + +        for i=1:n
    + + 
    + +            dU=dUA(i,2*i+1);
    + +            dA=dUA(i,2*i);
    + +            un=uo+dU;
    + +            An=Ao+dA;
    + + 
    + +        end
    + + 
    + +        c=evalc(An,kp,Ro,Ao);
    + +        if max(abs(dUA-DUA))<5*10^-6,break,end         
    + +        DUA=dUA;
    + +    end
    + +    t=t+dt;
    + +    i_end=input('Enter 0 for open end scenario or 1 for closed end scenario: ');
    + +    ip=input('Enter 0 for A vs x or 1 for U vs x: ');
    + +    GRPHCS(x,A,U,ip,t)   
    + +end

    +(Headbang) (Headbang) (Headbang)
    +
    +
    +Tell us what the first error message is, or better yet just tell us all of them.
    +
    + +Also, what do you think this line does:
    +
    + +eval('A,kp,Ro,Ao,n'); %calling evalc function
    +
    + +And:
    +
    + +Consider single stepping through the code in the debugger.
    +
    + +CB
    +
  • +
+ \ No newline at end of file diff --git a/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/text_normalize_space3.html b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/text_normalize_space3.html new file mode 100644 index 00000000..9d95fe14 --- /dev/null +++ b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/text_normalize_space3.html @@ -0,0 +1,2012 @@ + + + + + +Applied Physics Letters - Volume 6, Issue 3 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
NOTICE:
+ +

AIP Publishing manuscript submission and processing system (PXP) is currently unavailable to users in China. We are working to resolve the issue as quickly as possible. We apologize for the inconvenience.

+

尊敬的中国作者和评审人:AIP Publishing (AIP出版公司)的论文发布系统(PXP)目前遇到一些技术问题。我们将为您尽快解决。因此带来的不便,我们向您表达我们诚挚的歉意! +

+

Thank you for your patience during this process.

+
+
+
1887
+
+
+
+ +
+
+
+
+ + +
+ +
+
+banner image +
+
+ +
+
+Cover image Placeholder +
+
+

+Volume 6, +Issue 3, +01 February 1965 +

+
+
+Show +/ +Hide +descriptions +
+
+
+

Index of content:

+ +
+
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+ + + + +
+ + +
+
+
+
+
+
+
+

+Most Read This Month + + + +

+
Article
+
content/aip/journal/apl
+
Journal
+
5
+
3
+Loading +
+
+

+Most Cited This Month + + + +

+
+ ++ More +- Less +
+
+
+
+
+
+
+ +
+
+
+
+ +
+
+
+
+
+
This is a required field
+
Please enter a valid email address
+
+

Oops! This section, does not exist...

Use the links on this page to find existing content.

+
+
+ + +
+38d913362403a4a254fa758cb6527ba5 +journal.issuezxybnytfddd +
+
Scitation: Applied Physics Letters - Volume 6, Issue 3
+
http://aip.metastore.ingenta.com/content/aip/journal/apl/6/3
+
+
BROWSE_VIEW_TOC
+
+ + +
+
    +
  • journal/journal.issue
  • +
  • aip/apl
  • +
  • /content/aip/journal/apl/6/3
  • +
  • apl.aip.org/6/3/Right1,Right2,Right3
  • +
+
+ + + + + + + + + + + + + + + + + + + + + diff --git a/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/text_normalize_space3_main.html b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/text_normalize_space3_main.html new file mode 100644 index 00000000..47abea71 --- /dev/null +++ b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/text_normalize_space3_main.html @@ -0,0 +1,1392 @@ + + + +
+
+
+
+
+
+

+Volume 6, +Issue 3, +01 February 1965 +

+ +
+ +
+
+
+
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+ + + \ No newline at end of file diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/cccode/mathworks.md b/tests/llm_web_kit/extractor/html/recognizer/assets/cccode/mathworks.md index 5eda3398..c8745567 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/assets/cccode/mathworks.md +++ b/tests/llm_web_kit/extractor/html/recognizer/assets/cccode/mathworks.md @@ -4,18 +4,12 @@ ### Use Integers for Index Variables -In MATLAB® code that you want to convert to single precision, -it is a best practice to use integers for index variables. However, -if the code does not use integers for index variables, when possible `convertToSingle` tries to detect -the index variables and select `int32` types for -them. +In MATLAB®code that you want to convert to single precision, it is a best practice to use integers for index variables. However, if the code does not use integers for index variables, when possible `convertToSingle` tries to detect the index variables and select `int32` types for them. ### Limit Use of `assert` Statements -- Do not use `assert` statements -to define the properties of input arguments. -- Do not use `assert` statements -to test the type of a variable. For example, do not use +- Do not use `assert` statements to define the properties of input arguments. +- Do not use `assert` statements to test the type of a variable. For example, do not use ``` assert(isa(a, 'double')) @@ -23,54 +17,27 @@ assert(isa(a, 'double')) ### Initialize MATLAB Class Properties in Constructor -Do not initialize MATLAB class properties in the `properties` block. -Instead, use the constructor to initialize the class properties. +Do not initialize MATLAB class properties in the `properties` block. Instead, use the constructor to initialize the class properties. ### Provide a Test File That Calls Your MATLAB Function -Separate your core algorithm from other code that you use to -test and verify the results. 
Create a test file that calls your double-precision MATLAB algorithm. -You can use the test file to: +Separate your core algorithm from other code that you use to test and verify the results. Create a test file that calls your double-precision MATLAB algorithm. You can use the test file to: -- Automatically define properties of the top-level function -inputs. -- Verify that the double-precision algorithm behaves -as you expect. The double-precision behavior is the baseline against -which you compare the behavior of the single-precision versions of -your algorithm. -- Compare the behavior of the single-precision version -of your algorithm to the double-precision baseline. +- Automatically define properties of the top-level function inputs. +- Verify that the double-precision algorithm behaves as you expect. The double-precision behavior is the baseline against which you compare the behavior of the single-precision versions of your algorithm. +- Compare the behavior of the single-precision version of your algorithm to the double-precision baseline. -For best results, the test file must exercise the algorithm -over its full operating range. +For best results, the test file must exercise the algorithm over its full operating range. ### Prepare Your Code for Code Generation -MATLAB code that you want to convert to single precision -must comply with code generation requirements. See MATLAB Language Features Supported for C/C++ Code Generation. +MATLAB code that you want to convert to single precision must comply with code generation requirements. See MATLAB Language Features Supported for C/C++ Code Generation. -To help you identify unsupported functions or constructs in -your MATLAB code, add the `%#codegen` pragma -to the top of your MATLAB file. When you edit your code in the MATLAB editor, -the MATLAB Code Analyzer flags functions and constructs that -are not supported for code generation. See Check Code Using the MATLAB Code Analyzer. 
When you use the MATLAB - Coder™ app, -the app screens your code for code generation readiness. At the function -line, you can use the Code Generation Readiness Tool. See Check Code Using the Code Generation Readiness Tool. +To help you identify unsupported functions or constructs in your MATLAB code, add the `%#codegen` pragma to the top of your MATLAB file. When you edit your code in the MATLAB editor, the MATLAB Code Analyzer flags functions and constructs that are not supported for code generation. See Check Code Using the MATLAB Code Analyzer. When you use the MATLAB Coder™ app, the app screens your code for code generation readiness. At the function line, you can use the Code Generation Readiness Tool. See Check Code Using the Code Generation Readiness Tool. ### Use the `-args` Option to Specify Input Properties -When you generate single-precision MATLAB code, if you -specify a test file, you do not have to specify argument properties -with the `-args` option. In this case, the code generator -runs the test file to determine the properties of the input types. -However, running the test file can slow the code generation. It is -a best practice to pass the properties to the `-args` option -so that `convertToSingle` does not run the test -file to determine the argument properties. If you have a MATLAB - Coder license, -you can use `coder.getArgTypes` to determine the -argument properties. For example: +When you generate single-precision MATLAB code, if you specify a test file, you do not have to specify argument properties with the `-args` option. In this case, the code generator runs the test file to determine the properties of the input types. However, running the test file can slow the code generation. It is a best practice to pass the properties to the `-args` option so that `convertToSingle` does not run the test file to determine the argument properties. 
If you have a MATLAB Coder license, you can use `coder.getArgTypes` to determine the argument properties. For example: ``` types = coder.getArgTypes('myfun_test', 'myfun'); @@ -80,14 +47,7 @@ convertToSingle -config scfg -args types myfun ### Test Numerics and Log I/O Data -When you use the convertToSingle function to -generate single-precision MATLAB code, enable numerics testing -and I/O data logging for comparison plots. To use numerics testing, -you must provide a test file that calls your MATLAB function. -To enable numerics testing and I/O data logging, create a `coder.SingleConfig` object. -Set the `TestBenchName` , `TestNumerics` , -and `LogIOForComparisonPlotting` properties. For -example: +When you use the convertToSingle function to generate single-precision MATLAB code, enable numerics testing and I/O data logging for comparison plots. To use numerics testing, you must provide a test file that calls your MATLAB function. To enable numerics testing and I/O data logging, create a `coder.SingleConfig` object. Set the `TestBenchName` , `TestNumerics` , and `LogIOForComparisonPlotting` properties. For example: ``` scfg = coder.config('single'); diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/cccode/mathworks.txt b/tests/llm_web_kit/extractor/html/recognizer/assets/cccode/mathworks.txt index 901f1ad6..7b4f582d 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/assets/cccode/mathworks.txt +++ b/tests/llm_web_kit/extractor/html/recognizer/assets/cccode/mathworks.txt @@ -1,73 +1,33 @@ 主要内容 Single-Precision Conversion Best Practices Use Integers for Index Variables -In MATLAB® code that you want to convert to single precision, -it is a best practice to use integers for index variables. However, -if the code does not use integers for index variables, when possible `convertToSingle` tries to detect -the index variables and select `int32` types for -them. 
+In MATLAB®code that you want to convert to single precision, it is a best practice to use integers for index variables. However, if the code does not use integers for index variables, when possible `convertToSingle` tries to detect the index variables and select `int32` types for them. Limit Use of `assert` Statements -- Do not use `assert` statements -to define the properties of input arguments. -- Do not use `assert` statements -to test the type of a variable. For example, do not use +- Do not use `assert` statements to define the properties of input arguments. +- Do not use `assert` statements to test the type of a variable. For example, do not use ``` assert(isa(a, 'double')) ``` Initialize MATLAB Class Properties in Constructor -Do not initialize MATLAB class properties in the `properties` block. -Instead, use the constructor to initialize the class properties. +Do not initialize MATLAB class properties in the `properties` block. Instead, use the constructor to initialize the class properties. Provide a Test File That Calls Your MATLAB Function -Separate your core algorithm from other code that you use to -test and verify the results. Create a test file that calls your double-precision MATLAB algorithm. -You can use the test file to: -- Automatically define properties of the top-level function -inputs. -- Verify that the double-precision algorithm behaves -as you expect. The double-precision behavior is the baseline against -which you compare the behavior of the single-precision versions of -your algorithm. -- Compare the behavior of the single-precision version -of your algorithm to the double-precision baseline. -For best results, the test file must exercise the algorithm -over its full operating range. +Separate your core algorithm from other code that you use to test and verify the results. Create a test file that calls your double-precision MATLAB algorithm. 
You can use the test file to: +- Automatically define properties of the top-level function inputs. +- Verify that the double-precision algorithm behaves as you expect. The double-precision behavior is the baseline against which you compare the behavior of the single-precision versions of your algorithm. +- Compare the behavior of the single-precision version of your algorithm to the double-precision baseline. +For best results, the test file must exercise the algorithm over its full operating range. Prepare Your Code for Code Generation -MATLAB code that you want to convert to single precision -must comply with code generation requirements. See MATLAB Language Features Supported for C/C++ Code Generation. -To help you identify unsupported functions or constructs in -your MATLAB code, add the `%#codegen` pragma -to the top of your MATLAB file. When you edit your code in the MATLAB editor, -the MATLAB Code Analyzer flags functions and constructs that -are not supported for code generation. See Check Code Using the MATLAB Code Analyzer. When you use the MATLAB - Coder™ app, -the app screens your code for code generation readiness. At the function -line, you can use the Code Generation Readiness Tool. See Check Code Using the Code Generation Readiness Tool. +MATLAB code that you want to convert to single precision must comply with code generation requirements. See MATLAB Language Features Supported for C/C++ Code Generation. +To help you identify unsupported functions or constructs in your MATLAB code, add the `%#codegen` pragma to the top of your MATLAB file. When you edit your code in the MATLAB editor, the MATLAB Code Analyzer flags functions and constructs that are not supported for code generation. See Check Code Using the MATLAB Code Analyzer. When you use the MATLAB Coder™ app, the app screens your code for code generation readiness. At the function line, you can use the Code Generation Readiness Tool. See Check Code Using the Code Generation Readiness Tool. 
Use the `-args` Option to Specify Input Properties -When you generate single-precision MATLAB code, if you -specify a test file, you do not have to specify argument properties -with the `-args` option. In this case, the code generator -runs the test file to determine the properties of the input types. -However, running the test file can slow the code generation. It is -a best practice to pass the properties to the `-args` option -so that `convertToSingle` does not run the test -file to determine the argument properties. If you have a MATLAB - Coder license, -you can use `coder.getArgTypes` to determine the -argument properties. For example: +When you generate single-precision MATLAB code, if you specify a test file, you do not have to specify argument properties with the `-args` option. In this case, the code generator runs the test file to determine the properties of the input types. However, running the test file can slow the code generation. It is a best practice to pass the properties to the `-args` option so that `convertToSingle` does not run the test file to determine the argument properties. If you have a MATLAB Coder license, you can use `coder.getArgTypes` to determine the argument properties. For example: ``` types = coder.getArgTypes('myfun_test', 'myfun'); scfg = coder.config('single'); convertToSingle -config scfg -args types myfun ``` Test Numerics and Log I/O Data -When you use the convertToSingle function to -generate single-precision MATLAB code, enable numerics testing -and I/O data logging for comparison plots. To use numerics testing, -you must provide a test file that calls your MATLAB function. -To enable numerics testing and I/O data logging, create a `coder.SingleConfig` object. -Set the `TestBenchName` , `TestNumerics` , -and `LogIOForComparisonPlotting` properties. For -example: +When you use the convertToSingle function to generate single-precision MATLAB code, enable numerics testing and I/O data logging for comparison plots. 
To use numerics testing, you must provide a test file that calls your MATLAB function. To enable numerics testing and I/O data logging, create a `coder.SingleConfig` object. Set the `TestBenchName` , `TestNumerics` , and `LogIOForComparisonPlotting` properties. For example: ``` scfg = coder.config('single'); scfg.TestBenchName = 'mytest'; diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/title1.html b/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/title1.html new file mode 100644 index 00000000..25941a72 --- /dev/null +++ b/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/title1.html @@ -0,0 +1,207 @@ + + + + + + +Compare vibarational frequencies from two calculations + + + + + + + + + + + + + + + + +
+ + + +

III.G.7. (XIII.B.3.) (XIII.D.5.)

+ +

Compare vibrational frequencies for two calculations +for C3 (carbon trimer)

+ +

+

A = +HF/6-31G* +
+
B = +MP2FC/6-31G* +
+

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
 symmetry frequency (cm-1) reduced mass (amu) IR Intensity (km mol-1)
mode numberAB ABdiff.ratio ABdiff.ratio ABdiff.ratio
1ΣgΣg 12281179 49.51.042 12.00012.000 0.0001.000 0.000.00 0.00 
2ΣuΣu 22432743-500.70.817 12.00012.000 0.0001.000 3932.189050.54 -5118.360.434
3ΠuΠu 137274-136.60.501 12.00012.000 0.0001.000 2.131.28 0.851.668
4ΠuΠu 130i274-404.3-0.476 12.00012.000 0.0001.000 17.741.28 16.4613.874
scaled by  0.89850.943
+ +

See section III.C.1 List or set vibrational scaling factors +to change the scale factors used here. +
See section III.C.2 +Calculate a vibrational scaling factor for a given set of molecules +to determine the least squares best scaling factor. + + + + diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/title1_main.html b/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/title1_main.html new file mode 100644 index 00000000..4c91facd --- /dev/null +++ b/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/title1_main.html @@ -0,0 +1,204 @@ + + + + + + + + + + + + + +
+ +return to home page + +

III.G.7. (XIII.B.3.) (XIII.D.5.)

+
+ +

Compare vibrational frequencies for two calculations +for C3 (carbon trimer)

+ + +

+

+

A = +HF/6-31G* +
+ +
+
B = +MP2FC/6-31G* +
+ +

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
 symmetry frequency (cm-1) + reduced mass (amu) IR Intensity (km mol-1)
mode numberAB ABdiff.ratio ABdiff.ratio ABdiff.ratio
1ΣgΣg 12281179 49.51.042 12.00012.000 0.0001.000 0.000.00 0.00 
2ΣuΣu 22432743-500.70.817 12.00012.000 0.0001.000 3932.189050.54 -5118.360.434
3ΠuΠu 137274-136.60.501 12.00012.000 0.0001.000 2.131.28 0.851.668
4ΠuΠu 130i +274-404.3-0.476 12.00012.000 0.0001.000 17.741.28 16.4613.874
scaled by  0.89850.943
+ +

See section III.C.1 List or set vibrational scaling factors +to change the scale factors used here. +
+See section III.C.2 +Calculate a vibrational scaling factor for a given set of molecules +to determine the least squares best scaling factor. + +

+ + \ No newline at end of file diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_table.py b/tests/llm_web_kit/extractor/html/recognizer/test_table.py index e5bca016..267db287 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_table.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_table.py @@ -115,11 +115,11 @@ def test_simple_complex_table(self): simple_table_tag = parts[1][0].xpath(f'.//{CCTag.CC_TABLE}')[0] simple_table_type = simple_table_tag.attrib assert simple_table_type['table_type'] == 'simple' - assert simple_table_type == {'table_type': 'simple', 'table_nest_level': '1', 'html': '\n \n \n \n \n \n \n \n \n
12
34
\n\n'} + assert simple_table_type == {'table_type': 'simple', 'table_nest_level': '1', 'html': '\n \n \n \n \n \n \n \n \n
12
34
'} complex_table_tag = parts[2][0].xpath(f'.//{CCTag.CC_TABLE}')[0] complex_table_type = complex_table_tag.attrib assert complex_table_type['table_type'] == 'complex' - assert complex_table_type == {'table_type': 'complex', 'table_nest_level': '1', 'html': '\n \n \n \n \n \n \n \n \n \n \n \n \n \n
123
4
567
\n '} + assert complex_table_type == {'table_type': 'complex', 'table_nest_level': '1', 'html': '\n \n \n \n \n \n \n \n \n \n \n \n \n \n
123
4
567
'} def test_table_to_content_list_node_simple(self): """测试table的 to content list node方法.""" diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_text.py b/tests/llm_web_kit/extractor/html/recognizer/test_text.py index a011299d..c3517bff 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_text.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_text.py @@ -333,3 +333,99 @@ def test_interactive_element(self): result = chain.extract(input_data) main_html = result.get_content_list().to_main_html() assert 'TEST: `import *` TEST' pass + + +def test_title1(title_recognizer): + """ + 测试修复标题被隔断 + Args: + title_recognizer: + + Returns: + + """ + with open(f'{os.path.dirname(os.path.abspath(__file__))}/assets/recognizer/title1_main.html', 'r') as file: + main_html_content = file.read() + + with open(f'{os.path.dirname(os.path.abspath(__file__))}/assets/recognizer/title1.html', 'r') as file: + html_content = file.read() + result = title_recognizer.recognize('http://www.baidu.com', [(main_html_content, main_html_content)], html_content) + assert 'Compare vibrational frequencies for two calculations for C <sub>3</sub> (carbon trimer)' in element_to_html(result[1][0]) From 0fbac261db65c758dbb3176791547efe3f177bad Mon Sep 17 00:00:00 2001 From: houlinfeng Date: Thu, 31 Jul 2025 19:55:03 +0800 Subject: [PATCH 14/31] =?UTF-8?q?fix:=20=E5=8E=BB=E6=8E=89=E5=86=97?= =?UTF-8?q?=E4=BD=99=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- llm_web_kit/extractor/html/recognizer/title.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/llm_web_kit/extractor/html/recognizer/title.py b/llm_web_kit/extractor/html/recognizer/title.py index 2d158578..cf105b9f 100644 --- a/llm_web_kit/extractor/html/recognizer/title.py +++ b/llm_web_kit/extractor/html/recognizer/title.py @@ -11,7 +11,7 @@ from llm_web_kit.libs.html_utils import (html_normalize_space, process_sub_sup_tags) -from .text import 
PARAGRAPH_SEPARATOR, inline_tags +from .text import PARAGRAPH_SEPARATOR class TitleRecognizer(BaseHTMLElementRecognizer): @@ -132,8 +132,6 @@ def __extract_title_text_recusive(el: HtmlElement, with_tail: bool = True) -> li else: if el.text and el.text.strip(): _new_text = html_normalize_space(el.text.strip()) - if blks and el.tag not in inline_tags: - blks.extend(['$br$']) blks.append(_new_text) for child in el.getchildren(): From 0bd9d284e39ff829277957c283d08ce9f89e51e9 Mon Sep 17 00:00:00 2001 From: houlinfeng Date: Thu, 31 Jul 2025 20:46:31 +0800 Subject: [PATCH 15/31] =?UTF-8?q?fix:=20=E5=A2=9E=E5=8A=A0=E6=B5=8B?= =?UTF-8?q?=E8=AF=95=E7=94=A8=E4=BE=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- llm_web_kit/extractor/html/recognizer/list.py | 2 - llm_web_kit/libs/html_utils.py | 3 - .../good_data/html/text_normalize_space4.html | 1650 +++++++++++++++++ .../html/text_normalize_space4_main.html | 389 ++++ .../extractor/html/recognizer/test_text.py | 24 + 5 files changed, 2063 insertions(+), 5 deletions(-) create mode 100644 tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/text_normalize_space4.html create mode 100644 tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/text_normalize_space4_main.html diff --git a/llm_web_kit/extractor/html/recognizer/list.py b/llm_web_kit/extractor/html/recognizer/list.py index 58d0690d..d91caa23 100644 --- a/llm_web_kit/extractor/html/recognizer/list.py +++ b/llm_web_kit/extractor/html/recognizer/list.py @@ -192,8 +192,6 @@ def __extract_list_item_text_recusive(el: HtmlElement): # 如果尾部文本跟在sub/sup后面,直接附加到最后一个文本段落中 if len(paragraph) > 0 and paragraph[-1]['t'] == ParagraphTextType.TEXT: paragraph[-1]['c'] += _new_tail - else: - paragraph.append({'c': _new_tail, 't': ParagraphTextType.TEXT}) else: paragraph.append({'c': _new_tail, 't': ParagraphTextType.TEXT}) diff --git a/llm_web_kit/libs/html_utils.py b/llm_web_kit/libs/html_utils.py index 
72edbf56..6d971738 100644 --- a/llm_web_kit/libs/html_utils.py +++ b/llm_web_kit/libs/html_utils.py @@ -4,7 +4,6 @@ from copy import deepcopy from lxml import html as lxmlhtml -from lxml.etree import ParseError from lxml.html import HtmlElement, HTMLParser, fromstring, tostring special_symbols = [ # TODO 从文件读取 @@ -449,7 +448,5 @@ def html_normalize_space(text: str) -> str: tem_text_el = lxmlhtml.fromstring(text.strip()) _text = tem_text_el.xpath('normalize-space()') return _text - except ParseError: - return '' except Exception: return text diff --git a/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/text_normalize_space4.html b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/text_normalize_space4.html new file mode 100644 index 00000000..1256ea38 --- /dev/null +++ b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/text_normalize_space4.html @@ -0,0 +1,1650 @@ + + + + + + + + + + + + + + + + + + + + + vanhees71 | Physics Forums - The Fusion of Science and Community + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+ + + +
+ + + + + + + + + + +
+
+
+

+ +

+ + +
+
+
+ + + + + + +
+ + + + +
+ + +
+
+
+ + + + + + + + + + + + + + + + + + +
+ + + + +
+ + + + + + + + + + + + + + +
+ +
+
+ + Dismiss Notice + +
+ +
+ +
+ Join Physics Forums Today!
+The friendliest, high quality science and math community on the planet! Everyone who loves science is here! +
+
+
+ +
+ + + + + + + + + + + + +
+ +
+ + + + + + + + + + + + + + + + + + +
+ +
+
+ + + vanhees71 + + +
+ + + + + +
+
+ + + + + +
Joined:
+
Jul 16, 2010
+ +
Messages:
+
8,249
+ +
Likes Received:
+
2,326
+ +
Trophy Points:
+
454
+ + + + + + +
+
Featured Threads:
+
+ 1 + +
+
+ + + + + + + + + + +
+
+ + + + +
+ +
+

+ Following + 27 +

+
+
    + +
  1. + Chestermiller +
  2. + +
  3. + vela +
  4. + +
  5. + mfb +
  6. + +
  7. + jtbell +
  8. + +
  9. + stevendaryl +
  10. + +
  11. + Dr. Courtney +
  12. + +
+
+ + + +
+ + + +
+

+ Followers + 36 +

+
+
    + +
  1. + Demystifier +
  2. + +
  3. + Orodruin +
  4. + +
  5. + aleazk +
  6. + +
  7. + weirdoguy +
  8. + +
  9. + shinobi20 +
  10. + +
  11. + davidbenari +
  12. + +
+
+ + + +
+ +
+ + + + + +
+
+ + +
Gender:
+
Male
+ + + + + + +
Location:
+
Frankfurt/Main, Germany
+ + + +
Occupation:
+
Postdoc at Goethe Universtity Frankfurt (Germany)
+ + +
Badges:
+ + + + +
+ Science Advisor +
+ + + + + + + + + + + + + + + + + +
+
+ + + + + + + + + + +
+ +
+ +
+
+
+ + +
+ +

vanhees71

+ +

+ Male, from Frankfurt/Main, Germany +

+ + + + + +
+ + +
+ + +
+ +
+ + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + + + + + + +
+
+ +
+ + +
+ + + + + + + +
+ + +
+ + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/text_normalize_space4_main.html b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/text_normalize_space4_main.html new file mode 100644 index 00000000..d2483b56 --- /dev/null +++ b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/text_normalize_space4_main.html @@ -0,0 +1,389 @@ + + + + +
+ + + + + + + +
+ + + + +
+
+
+ + + + + + + + + + + + + + + + + + +
+ +
+ + +
+
+ 777 888 +
+ +
+ + + + + + + + + + + + + + +
+ +
+ + +
+ + + +
+ + +
+ + + + + \ No newline at end of file diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_text.py b/tests/llm_web_kit/extractor/html/recognizer/test_text.py index c3517bff..0445d0c6 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_text.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_text.py @@ -406,6 +406,30 @@ def test_normalize_space3(self): content_md = result.get_content_list().to_mm_md() assert '### Volume 6, Issue 3, 01 February 1965\n\n- INFRARED LASER ACTION AND LIFETIMES IN ARGON II\n\n F. A. Horrigan , S. H. Koozekanani and R. A. Paananen\n\n Scitation Author Page\n\n PubMed\n\n Google Scholar\n\n Source' in content_md + def test_normalize_space4(self): + """ + 测试换行不正确 + Returns: + + """ + chain = ExtractSimpleFactory.create(load_pipe_tpl('noclip_html_test')) + self.assertIsNotNone(chain) + test_data = { + 'track_id': 'text_md', + 'dataset_name': 'text_md', + 'url': 'https://br.wikipedia.org/wiki/Faustina_an_Hena%C3%B1', + 'data_source_category': 'HTML', + 'path': 'text_normalize_space4.html', + 'main_path': 'text_normalize_space4_main.html', + 'file_bytes': 1000, + 'meta_info': {'input_datetime': '2020-01-01 00:00:00'}, + 'language': 'en' + } + input_data = DataJson(test_data) + result = chain.extract(input_data) + content_md = result.get_content_list().to_mm_md() + assert 'Show Ignored Content\n 1. DrDu\n\n Lieber Hendrik, \n\n kannst Du hierzu was beitragen? 
\n\n Ich finde keinen rechten Grund' in content_md + def test_Lack_content1(self): """ 测试换缺少内容 From 12cccf6f58d9363824a18b113ae76efb597d4ca8 Mon Sep 17 00:00:00 2001 From: houlinfeng Date: Fri, 8 Aug 2025 18:05:51 +0800 Subject: [PATCH 16/31] =?UTF-8?q?refactor:=20=E9=87=8D=E6=9E=84simple?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- llm_web_kit/extractor/html/recognizer/text.py | 2 +- llm_web_kit/simple.py | 51 +++++++++++-------- tests/llm_web_kit/simple/test_simple.py | 12 +++-- 3 files changed, 38 insertions(+), 27 deletions(-) diff --git a/llm_web_kit/extractor/html/recognizer/text.py b/llm_web_kit/extractor/html/recognizer/text.py index 2bbd1f47..7768ef50 100644 --- a/llm_web_kit/extractor/html/recognizer/text.py +++ b/llm_web_kit/extractor/html/recognizer/text.py @@ -195,7 +195,7 @@ def __combine_text(self, text1:str, text2:str, lang='en') -> str: return self.replace_entities(txt.strip(), entities_map) else: # 根据text1的最后一个字符和text2的第一个字符判断两个text之间的连接 - if (text2[0] in string.punctuation) or (text2[0] in special_symbols) or (text2[0] in other_symbols) or (text1 and text1[-1] in other_symbols): + if (text2 and text2[0] in string.punctuation) or (text2 and text2[0] in special_symbols) or (text2 and text2[0] in other_symbols) or (text1 and text1[-1] in other_symbols): words_sep = '' else: words_sep = ' ' diff --git a/llm_web_kit/simple.py b/llm_web_kit/simple.py index ae687351..fefb535a 100644 --- a/llm_web_kit/simple.py +++ b/llm_web_kit/simple.py @@ -5,12 +5,14 @@ from llm_web_kit.config.cfg_reader import load_pipe_tpl from llm_web_kit.extractor.extractor_chain import ExtractSimpleFactory -from llm_web_kit.extractor.html.extractor import ( - HTMLPageLayoutType, MagicHTMLFIleFormatorExtractor, - NoClipHTMLFIleFormatorExtractor) from llm_web_kit.input.datajson import DataJson +class PipeType: + HTML = 'html' + NOCLIP = 'noclip_html' + + class ExtractorType: HTML = 'html' PDF = 'pdf' @@ -19,24 +21,32 @@ class 
ExtractorType: class ExtractorFactory: """factory class for extractor.""" - html_extractor = None + magic_html_extractor = None + noclip_html_extractor = None pdf_extractor = None ebook_extractor = None @staticmethod - def get_extractor(extractor_type: str): + def get_extractor(extractor_type: str, pipe_tpl_name: str): if extractor_type == ExtractorType.HTML: - if ExtractorFactory.html_extractor is None: - extractor_cfg = load_pipe_tpl('html') - chain = ExtractSimpleFactory.create(extractor_cfg) - ExtractorFactory.html_extractor = chain - return ExtractorFactory.html_extractor + if pipe_tpl_name == PipeType.HTML: + if ExtractorFactory.magic_html_extractor is None: + extractor_cfg = load_pipe_tpl(pipe_tpl_name) + chain = ExtractSimpleFactory.create(extractor_cfg) + ExtractorFactory.magic_html_extractor = chain + return ExtractorFactory.magic_html_extractor + if pipe_tpl_name == PipeType.NOCLIP: + if ExtractorFactory.noclip_html_extractor is None: + extractor_cfg = load_pipe_tpl(pipe_tpl_name) + chain = ExtractSimpleFactory.create(extractor_cfg) + ExtractorFactory.noclip_html_extractor = chain + return ExtractorFactory.noclip_html_extractor else: raise ValueError(f'Invalid extractor type: {extractor_type}') def __extract_main_html_by_no_clip_html(url:str, html_content: str, raw_html:str) -> DataJson: - extractor = NoClipHTMLFIleFormatorExtractor(load_pipe_tpl('noclip_html')) + extractor = ExtractorFactory.get_extractor(ExtractorType.HTML, PipeType.NOCLIP) if raw_html == '': raw_html = html_content input_data_dict = { @@ -54,14 +64,8 @@ def __extract_main_html_by_no_clip_html(url:str, html_content: str, raw_html:str return result -def __extract_main_html_by_maigic_html(url:str, html_str: str, page_layout_type:str) -> DataJson: - magic_html_extractor = MagicHTMLFIleFormatorExtractor(load_pipe_tpl('html')) - main_html, method, title = magic_html_extractor._extract_main_html(html_str, url, page_layout_type) - return main_html, title - - def __extract_html(url:str, 
html_content: str) -> DataJson: - extractor = ExtractorFactory.get_extractor(ExtractorType.HTML) + extractor = ExtractorFactory.get_extractor(ExtractorType.HTML, PipeType.HTML) input_data_dict = { 'track_id': str(uuid.uuid4()), 'url': url, @@ -94,7 +98,10 @@ def extract_html_to_mm_md(url:str, html_content: str, clip_html=True, raw_html=' return result.get_content_list().to_mm_md() -def extract_main_html_by_maigic_html(url:str, html_str: str, page_layout_type:str = HTMLPageLayoutType.LAYOUT_ARTICLE) -> str: - """extract main html.""" - result = __extract_main_html_by_maigic_html(url, html_str, page_layout_type) - return result[0], result[1] +def extract_main_html(url:str, html_content: str, clip_html=True, raw_html='') -> str: + if clip_html: + result = __extract_html(url, html_content) + else: + result = __extract_main_html_by_no_clip_html(url, html_content, raw_html) + main_html = result.get('main_html') + return main_html diff --git a/tests/llm_web_kit/simple/test_simple.py b/tests/llm_web_kit/simple/test_simple.py index 233eaead..3931b511 100644 --- a/tests/llm_web_kit/simple/test_simple.py +++ b/tests/llm_web_kit/simple/test_simple.py @@ -2,7 +2,7 @@ import unittest from llm_web_kit.simple import (extract_html_to_md, extract_html_to_mm_md, - extract_main_html_by_maigic_html) + extract_main_html) class TestSimple(unittest.TestCase): @@ -136,9 +136,13 @@ def test_extract_pure_html_to_mm_md(self): mm_md = extract_html_to_mm_md(self.url, self.html_content, clip_html=True) self.assertEqual(mm_md, '# Test Content\n\nThis is a test paragraph.\n\n![Test Image](e5db82b5bf63d49d80c5533616892d3386f43955369520986d67653c700fc53c)\n') - def test_extract_magic_html(self): - magic_html, title = extract_main_html_by_maigic_html(self.url, self.html_content) - self.assertEqual(magic_html, '

Test Content

This is a test paragraph.

Test Image
') + def test_extract_magic_main_html(self): + magic_main_html = extract_main_html(self.url, self.html_content, clip_html=True) + self.assertEqual(magic_main_html, '

Test Content

This is a test paragraph.

Test Image
') + + def test_extract_noclip_main_html(self): + magic_main_html = extract_main_html(self.url, self.html_content, clip_html=False, raw_html=self.html_content) + self.assertEqual(magic_main_html, '

Test Content

This is a test paragraph.

Test Image') def test_extract_real_html_to_md(self): md = extract_html_to_md(self.url, self.real_html_content, clip_html=False) From 1a1fb18db75094cb41daa72c7ccd30d5314fcf42 Mon Sep 17 00:00:00 2001 From: houlinfeng Date: Thu, 14 Aug 2025 20:01:36 +0800 Subject: [PATCH 17/31] =?UTF-8?q?fix:=20=E5=85=81=E8=AE=B8=E8=8E=B7?= =?UTF-8?q?=E5=8F=96=E9=9D=9E=E6=A0=87=E5=87=86=E7=BB=93=E6=9E=84=E7=9A=84?= =?UTF-8?q?list=E7=9A=84content=5Flist?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- llm_web_kit/extractor/html/recognizer/list.py | 15 ++++---- .../extractor/html/recognizer/test_list.py | 35 +++++++++++++++++++ 2 files changed, 43 insertions(+), 7 deletions(-) diff --git a/llm_web_kit/extractor/html/recognizer/list.py b/llm_web_kit/extractor/html/recognizer/list.py index d91caa23..c3599fc8 100644 --- a/llm_web_kit/extractor/html/recognizer/list.py +++ b/llm_web_kit/extractor/html/recognizer/list.py @@ -199,13 +199,14 @@ def __extract_list_item_text_recusive(el: HtmlElement): # item['c'].strip(): 会导致前面处理br标签,添加的\n\n失效 result['c'] = ' '.join(normalize_text_segment(item['c'].strip()) for item in paragraph) return result - list_item_tags = ('li', 'dd', 'dt', 'ul', 'div', 'p', 'span') - if child.tag in list_item_tags: - paragraph = __extract_list_item_text_recusive(child) - if len(paragraph) > 0: - tem_json = json.dumps(paragraph).replace('$br$\"}', '\"}') - new_paragraph = json.loads(tem_json) - text_paragraph.append(new_paragraph) + # list_item_tags = ('li', 'dd', 'dt', 'ul', 'div', 'p', 'span') + # if child.tag in list_item_tags: + # 去掉if限制条件,允许非标准结构的列表通过 + paragraph = __extract_list_item_text_recusive(child) + if len(paragraph) > 0: + tem_json = json.dumps(paragraph).replace('$br$\"}', '\"}') + new_paragraph = json.loads(tem_json) + text_paragraph.append(new_paragraph) for n, item in enumerate(text_paragraph): tem_json = json.dumps(item).replace('$br$', '\\n\\n') diff --git 
a/tests/llm_web_kit/extractor/html/recognizer/test_list.py b/tests/llm_web_kit/extractor/html/recognizer/test_list.py index dbe79347..5f8d61de 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_list.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_list.py @@ -375,3 +375,38 @@ def test_get_attribute_standalone_improved(self): error_msg = str(context.exception) self.assertIn('中没有cclist标签', error_msg) self.assertIn(element.tag, error_msg) + + def test_no_standard_get_list_content_list(self): + """测试非标准结构的list获取content_list.""" + # 获取私有方法 __get_list_content_list + get_list_content_list_method = getattr(self.__list_recognize, '_ListRecognizer__get_list_content_list') + + # 创建测试数据 + test_elements = [ + html_to_element('''
    +
    +

    How to Process Oxidized Lead Zinc Ore by Flotation

    +

    How to Process Oxidized Lead Zinc Ore by Flotation. Metallurgical Content. The + Flowsheet. Crushing Section; GRINDING; Conditioning and Flotation; Thickening and + Filtering; Sampling; ORE TESTING LABORATORY; The problem of treating oxidized lead + zinc ores for the production of high grade lead zinc concentrates is a complex

    +
    +
    +
    +

    ore dressing flotation machine,fluorite ore flotation

    +

    Ore dressing flotation machine is widely used to conduct flotation of copper ore, + lead zinc ore, glod ore, etc. Mail to sales@sinofote

    +
    +
    +
    +

    Zinc Ore Mining Crusher wffofoundation

    +

    Zinc ore mining process can 14 2016 31 Mar Lead zinc ore dressing equipment zinc ore + Once processing in the flotation circuit was complete, the zinc

    +
    +
    +
''') + ] + + for i, element in enumerate(test_elements): + list_content_list = get_list_content_list_method(element, 1) + assert len(list_content_list) == 3 From 8d30d9325ad4e1026a1813106d790747bb636eba Mon Sep 17 00:00:00 2001 From: houlinfeng Date: Mon, 25 Aug 2025 02:49:49 +0800 Subject: [PATCH 18/31] =?UTF-8?q?feat:=20noclip=E7=AE=A1=E7=BA=BFmain=5Fht?= =?UTF-8?q?ml=E9=A2=84=E5=A4=84=E7=90=86=E4=BD=BF=E7=94=A8selectolax?= =?UTF-8?q?=E4=BF=AE=E5=A4=8Dhtml?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- llm_web_kit/extractor/html/pre_extractor.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/llm_web_kit/extractor/html/pre_extractor.py b/llm_web_kit/extractor/html/pre_extractor.py index d2e08ffa..064430d3 100644 --- a/llm_web_kit/extractor/html/pre_extractor.py +++ b/llm_web_kit/extractor/html/pre_extractor.py @@ -1,6 +1,7 @@ import os from overrides import override +from selectolax.parser import HTMLParser from llm_web_kit.extractor.config import INVISIBLE_TAGS from llm_web_kit.extractor.pre_extractor import \ @@ -154,7 +155,9 @@ def _do_pre_extract(self, data_json: DataJson) -> DataJson: def __clean_interactive_elements(self, data_json: DataJson) -> str: """清除main_html中交互式元素.""" html_content = data_json['main_html'] - tree = html_to_element(html_content) + selectolax_tree = HTMLParser(html_content) + fixed_html = selectolax_tree.html + tree = html_to_element(fixed_html) # 删除main_html中的script和style标签 for script_element in tree.xpath('//script'): remove_element(script_element) From c36134aa7f9d1046ff555cf75cbf26b28936d493 Mon Sep 17 00:00:00 2001 From: houlinfeng Date: Mon, 25 Aug 2025 02:51:15 +0800 Subject: [PATCH 19/31] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8D=E5=A4=8D?= =?UTF-8?q?=E6=9D=82=E5=B5=8C=E5=A5=97table=E6=8F=90=E5=8F=96=E9=80=BB?= =?UTF-8?q?=E8=BE=91?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../extractor/html/recognizer/table.py | 
124 ++-- .../assets/recognizer/nested_table1.html | 404 ++++++++++++ .../assets/recognizer/nested_table2.html | 575 ++++++++++++++++++ .../assets/recognizer/nested_table3.html | 179 ++++++ .../assets/recognizer/nested_table4.html | 458 ++++++++++++++ .../extractor/html/recognizer/test_table.py | 52 +- .../extractor/test_extractor_chain.py | 18 +- 7 files changed, 1742 insertions(+), 68 deletions(-) create mode 100644 tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/nested_table1.html create mode 100644 tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/nested_table2.html create mode 100644 tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/nested_table3.html create mode 100644 tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/nested_table4.html diff --git a/llm_web_kit/extractor/html/recognizer/table.py b/llm_web_kit/extractor/html/recognizer/table.py index 45402887..aa96060c 100644 --- a/llm_web_kit/extractor/html/recognizer/table.py +++ b/llm_web_kit/extractor/html/recognizer/table.py @@ -1,4 +1,3 @@ -from itertools import chain from typing import Any, List, Tuple from lxml.html import HtmlElement @@ -9,9 +8,30 @@ from llm_web_kit.extractor.html.recognizer.recognizer import ( BaseHTMLElementRecognizer, CCTag) from llm_web_kit.libs.doc_element_type import DocElementType -from llm_web_kit.libs.html_utils import process_sub_sup_tags, remove_element +from llm_web_kit.libs.html_utils import (html_normalize_space, + process_sub_sup_tags) from llm_web_kit.libs.text_utils import normalize_text_segment +from .text import inline_tags + +# 空元素 +VOID_ELEMENTS = { + 'area', 'base', 'br', 'col', 'embed', 'hr', + 'img', 'input', 'link', 'meta', 'param', + 'source', 'track', 'wbr' +} + +KEEP_ATTRS = {'colspan', 'rowspan'} + + +def table_clean_attributes(element): + attrs = list(element.attrib.keys()) + for attr in attrs: + if attr not in KEEP_ATTRS: + del element.attrib[attr] + for child in element.iterchildren(): + 
table_clean_attributes(child) + class TableRecognizer(BaseHTMLElementRecognizer): """解析table元素.""" @@ -175,6 +195,9 @@ def __check_table_include_math_code(self, raw_html: HtmlElement): raw_html=math_raw_html ) result = [] + if not math_res_parts: + if raw_html.tag == 'br' or raw_html.xpath('.//br'): + result.append("\n\n") for math_item in math_res_parts: ele_item = math_item[0] @@ -211,79 +234,76 @@ def process_node(node): if node.tail and node.tail.strip(): result.append(node.tail.strip()) else: + if node.tag == 'br' or node.tag not in inline_tags: + result.append('\n\n') + # 提取当前节点的文本 if node.text and node.text.strip(): cleaned_text = node.text.strip() - result.append(cleaned_text) - # 处理节点的tail(元素闭合后的文本) - if node.tail and node.tail.strip(): - cleaned_tail = node.tail.strip() - result.append(cleaned_tail) + result.append(html_normalize_space(cleaned_text)) + # 递归处理子节点 for child in node: process_node(child) + # 处理节点的tail(元素闭合后的文本) + if node.tail and node.tail.strip(): + if node.tag not in inline_tags: + result.append('\n\n') + cleaned_tail = node.tail.strip() + result.append(html_normalize_space(cleaned_tail)) # 从根节点开始处理 process_node(ele_item) return result def __simplify_td_th_content(self, table_nest_level, elem: HtmlElement) -> None: """简化 和 内容,保留嵌套表格结构.""" - if elem.tag in ['td', 'th']: - parse_res = [] - # 检查是否存在嵌套的表格 - if table_nest_level > 1: - if elem.text and elem.text.strip(): - parse_res.append(elem.text.strip()) - elem.text = None # 防止后续重复处理 - # 存在嵌套表格,递归处理子节点 + if (elem.tag in ['td', 'th', 'table'] or + any(child.tag in ['table', 'td', 'th'] for child in elem.iterchildren()) or + elem.xpath('.//table') or elem.xpath('.//td') or elem.xpath('.//th')): + if len(elem) > 0: + # 需要继续遍历的情况 for child in elem.iterchildren(): - if child.tag == 'table': - # 对嵌套表格递归调用简化处理 - self.__simplify_td_th_content(table_nest_level, child) - else: - # 处理非表格元素 - math_res = self.__check_table_include_math_code(child) - parse_res.extend(math_res) - remove_element(child) - # 
将非表格内容拼接后放在表格前面 - if parse_res: - elem.text = ' '.join(normalize_text_segment(item) for item in parse_res) + self.__simplify_td_th_content(table_nest_level, child) else: - # 没有嵌套表格,直接简化 math_res = self.__check_table_include_math_code(elem) - parse_res.extend(math_res) - for item in list(elem.iterchildren()): - remove_element(item) - if parse_res: - elem.text = ' '.join(normalize_text_segment(item) for item in parse_res) - return - # 非 td/th 元素继续递归处理 - for child in elem.iterchildren(): - self.__simplify_td_th_content(table_nest_level, child) + math_res_text = ' '.join(normalize_text_segment(item) for item in math_res) + elem.text = math_res_text + else: + math_res = self.__check_table_include_math_code(elem) + elem.clear() + math_res_text = ' '.join(normalize_text_segment(item) for item in math_res) + if elem.tag in VOID_ELEMENTS: + elem_pre = elem.getprevious() + if elem_pre is not None: + elem_pre.tail = math_res_text + else: + elem_parent = elem.getparent() + if elem_parent is not None: + elem_parent_text = elem_parent.text + ' ' if elem_parent is not None and elem_parent.text is not None else '' + elem_parent.text = elem_parent_text + math_res_text + else: + elem.text = math_res_text def __get_table_body(self, table_type, table_nest_level, table_root): """获取并处理table body,返回处理后的HTML字符串。""" if table_type == 'empty': content = table_root.text_content() return content - allowed_attributes = ['colspan', 'rowspan'] # 清理除了colspan和rowspan之外的属性 - if len(table_root.attrib) > 0: - cleaned_attrs = {k: v for k, v in table_root.attrib.items() if k in allowed_attributes} - table_root.attrib.clear() - table_root.attrib.update(cleaned_attrs) - # text进行strip操作,tail保留(部分内容留在tail中) - for elem in chain([table_root], table_root.iterchildren()): - if elem.text is not None: - elem.text = elem.text.strip() - if elem.tail is not None: - elem.tail = elem.tail.strip() - # 单元格内的多标签内容进行简化,空格拼接,公式、代码识别 self.__simplify_td_th_content(table_nest_level, table_root) - # 迭代 - for child in 
table_root.iterchildren(): - if child is not None: - self.__get_table_body(table_type, table_nest_level, child) + table_clean_attributes(table_root) + + # doc = html.fromstring(html_content) + for element in table_root.iter(): + # 清理元素前后的空白(不影响.text和.tail的内容) + if element.text is not None: + element.text = element.text.lstrip('\n\t ') + if element.tail is not None: + if "\n\n" in element.tail: + element.tail = "\n\n" + element.tail.lstrip('\n\t ') + else: + element.tail = element.tail.lstrip('\n\t ') + return self._element_to_html_entity(table_root) def __do_extract_tables(self, root: HtmlElement) -> None: diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/nested_table1.html b/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/nested_table1.html new file mode 100644 index 00000000..8af0b5ef --- /dev/null +++ b/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/nested_table1.html @@ -0,0 +1,404 @@ + + + + + + + + + + + + + + + + + + + + + + ANSTO Publications Online: An investigation into transition metal ion binding properties of silk fibers and particles using radioisotopes. + + + + + + + + + + + + + + + + + + +
+   +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + + + +

+ +ANSTO Publications Online > +
+Journal Publications > +
+Journal Articles > + +

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + Please use this identifier to cite or link to this item: + http://apo.ansto.gov.au/dspace/handle/10238/2935 +
+
+ + +
+ + + + + + + + + + +
Title: An investigation into transition metal ion binding properties of silk fibers and particles using radioisotopes.
Authors: Rajkhowa, R
Naik, R
Wang, L
Smith, SV
Wang, X
Keywords: Radioisotopes
Transition Elements
Binding Energy
Fibers
Absorption
Ions
Issue Date: 15-Mar-2011
Publisher: Wiley-Blackwell
Citation: Rajkhowa, R., Naik, R., Wang, L., Smith, S. V., & Wang, X. (2011). An investigation into transition metal ion binding properties of silk fibers and particles using radioisotopes. Journal of Applied Polymer Science, 119(6), 3630-3639.
Abstract: Silk is a structural protein fiber that is stable over a wide pH range making it attractive for use in medical and environmental applications. Variation in amino acid composition has the potential for selective binding for ions under varying conditions. Here we report on the metal ion separation potential of Mulberry and Eri silk fibers and powders over a range of pH. Highly sensitive radiotracer probes, 64Cu2+, 109Cd2+, and 57Co2+ were used to study the absorption of their respective stable metal ions Cu2+, Cd2+, and Co2+ into and from the silk sorbents. The total amount of each metal ion absorbed and time taken to reach equilibrium occurred in the following order: Cu2+ > Cd2+ > Co2+. In all cases the silk powders absorbed metal ions faster than their respective silk fibers. Intensive degumming of the fibers and powders significantly reduced the time to absorb respective metal ions and the time to reach equilibrium was reduced from hours to 5–15 min at pH 8. Once bound, 45–100% of the metal ions were released from the sorbents after exposure to pH 3 buffer for 30 min. The transition metal ion loading capacity for the silk sorbents was considerably higher than that found for commercial ion exchange resins (AG MP-50 and AG 50W-X2) under similar conditions. Interestingly, total Cu2+ bound was found to be higher than theoretically predicted values based on known specific Cu2+ binding sites (AHGGYSGY), suggesting that additional (new) sites for transition metal ion binding sites are present in silk fibers. © 2011, Wiley-Blackwell. The definitive version is available at www3.interscience.wiley.com
URI: http://dx.doi.org/10.1002/app.33059
http://apo.ansto.gov.au/dspace/handle/10238/2935
ISSN: 0021-8995
Appears in Collections:Journal Articles

+

Files in This Item:

+

There are no files associated with this item.

+
+ + + +
+ +
+ + +
+ +
+ + +
+
+ + +
+
+ + + + + + +

Items in APO are protected by copyright, with all rights reserved, unless otherwise indicated.

+ + + + + + + + + + + + + +

 

+
+ + + + + + +
+ Valid XHTML 1.0! + + DSpace Software Copyright © 2002-2010  Duraspace - + Feedback + + +
+
+ + diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/nested_table2.html b/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/nested_table2.html new file mode 100644 index 00000000..b36c5caa --- /dev/null +++ b/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/nested_table2.html @@ -0,0 +1,575 @@ + + + +ARTP 1020 - Drawing II - Modern Campus Catalog™ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + + +
+ + +
+
+
+
+
+
+
+
+
+
+
+ + +
+
+
+
+
+
+
+
+
+ +
+
+ + + + + + + + +
+ +
    + + + + + + + + + + + + + + + +
  +   + May 24, 2024 +   + + + + + +
+ 2013-2014 VSCC Catalog  + +
+ + + + + +
+
+ + +
+
   + +   +
+
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + +
+ + 2013-2014 VSCC Catalog [ARCHIVED CATALOG]

+ + + Add to Portfolio (opens a new window) +
+

ARTP 1020 - Drawing II


This course emphasizes conceptual skills and contemporary approaches to media and subject, based on the fundamental aspects of drawing–line, tone, space, form, and composition.; Observational skill development is maintained.; Figure drawing may be introduced.  [This course was previously ART 102.]

PREREQUISITES: ARTP 1010 or permission of instructor.
Hours: Six contact hours per week.
Credits: (3)


+

Click here for the Spring 2024 Class Schedule

+

Click here for the Summer 2024 Class Schedule

+

Click here for the Fall 2024 Class Schedule

+

+

+ + Add to Portfolio (opens a new window) +
+
+
+ \ No newline at end of file diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/nested_table3.html b/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/nested_table3.html new file mode 100644 index 00000000..4cc557d0 --- /dev/null +++ b/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/nested_table3.html @@ -0,0 +1,179 @@ + + + +Lists of Peaks + + + + +
+ +
LoJ Logo
+
Lists of Peaks

+
+ +
+
+
  
+Search by State: +   +
+
+ +

+Update your list: +   +
+   
+Lists & Stats: +   +


Cameron, Mount
+
Random peak: Cameron, Mount  14,238' Rise: 138' State: CO  Photo by: Kiefer Thomas

Total peaks listed: 164,392   Total peaks with 300'+ rise: 98,979
Total peaks with images: 17,239   Total members: 2,740
Total trip reports: 4,015   Total peaks with reports: 5,866
Total peaks ascended: 41,563   Total ascents recorded: 334,618


 LoJ on Facebook
+© 2005-2014 listsofjohn.com
Legal Notice and Disclaimer
+
    +
+ +
+ \ No newline at end of file diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/nested_table4.html b/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/nested_table4.html new file mode 100644 index 00000000..776a3a32 --- /dev/null +++ b/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/nested_table4.html @@ -0,0 +1,458 @@ + + + + + Molecular line emissions from pre main sequence objects - Open Research Online + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + +
+ + + + +
+ + + + + +
+ + +
+
+ + + + +
+

+ + +Molecular line emissions from pre main sequence objects + + +

+
+ +

+ + + + Saraceno, P.; Benedettini, M.; Caux, E.; Ceccarelli, M. C.; Clegg, P. E.; Correia, J. C.; di Giorgio, A. M.; Giannini, T.; Griffin, M. J.; Leeks, S. J.; Liseau, R.; Lorenzetti, D.; Molinari, S.; Nisini, B.; Smith, H.; Spinoglio, L.; Tomassi, E. and White, G. J. + (1997). + + +Molecular line emissions from pre main sequence objects. + + + In: The first ISO workshop on Analytical Spectroscopy, 6-8 October 1997, Madrid, Spain, p. 291. + + + + + + + +

+ + + + + + + + + + + Full text available as: + + + + + + + + + + +
+ + [img]
Preview
+ + + +
+ + + + + + PDF (Version of Record) + - Requires a PDF viewer such as GSview, Xpdf or Adobe Acrobat Reader + + + +
+ + + Download (239Kb) + + + + +
    + + + + + + + +
+
+ + + + + + + + + + + + + + + + + +
URL:http://cdsads.u-strasbg.fr/abs/1997ESASP.419..291S
Google Scholar:Look up in Google Scholar
+ + + +

Abstract

+

We present some preliminary results obtained with the LWS G.T. programme on the study of young objects driving molecular outflows. In particular, we discuss the importance of molecular emission in these sources and address the role of the H20 cooling.

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Item Type: + Conference Item + + + +
Copyright Holders:1997 European Space Agency
Extra Information:Proceedings of the first ISO workshop on Analytical Spectroscopy, Madrid, Spain, 6-8 October 1997. Editors: A.M. Heras, K. Leech, N. R. Trams, and Michael Perry. Noordwijk, The Netherlands : ESA Publications Division, c1997. (ESA SP-419), 1997., pp.291-292
Academic Unit/Department:Science > Physical Sciences
Interdisciplinary Research Centre:Centre for Earth, Planetary, Space and Astronomical Research (CEPSAR)
Item ID:32696
Depositing User: + +Glenn White + +
Date Deposited:22 Mar 2012 10:56
Last Modified:22 Mar 2012 22:37
URI:http://oro.open.ac.uk/id/eprint/32696
+ + + + + + + +
+

Actions (login may be required)

+ + + + + + + + + + + + + +
View Item
+ Report issue / request change +
+ +
+ +
+ +
+
+

Policies | Disclaimer

+
+ + +
+ + + + + \ No newline at end of file diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_table.py b/tests/llm_web_kit/extractor/html/recognizer/test_table.py index 267db287..fb3ba4ff 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_table.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_table.py @@ -66,7 +66,7 @@ def test_only_involve_table(self): parts = self.rec.recognize(base_url, [(html_to_element(raw_html), html_to_element(raw_html))], raw_html) self.assertEqual(len(parts), 2) table_body = parts[1][0].text_content() - assert table_body == r'
Mrs S Hindle
ShowCCRCC
Driffield 5th October 2006CH. Ricksbury Royal HeroCH. Keyingham Branwell
Manchester 16th January 2008CH. Lochbuie GeordieMerryoth Maeve
Darlington 20th September 2009CH. Maibee Make BelieveCH. Loranka Just Like Heaven JW
Blackpool 22nd June 2012CH. Loranka Sherrie BabyDear Magic Touch De La Fi Au Songeur
Welsh Kennel Club 2014Brymarden Carolina SunriseCh. Wandris Evan Elp Us
Welsh Kennel Club 2014Ch. Charnell Clematis of SalegreenCH. Byermoor Queens Maid
' + assert table_body == '
Mrs S Hindle
ShowCCRCC
Driffield 5th October 2006CH. Ricksbury Royal HeroCH. Keyingham Branwell
Manchester 16th January 2008CH. Lochbuie GeordieMerryoth Maeve
Darlington 20th September 2009CH. Maibee Make BelieveCH. Loranka Just Like Heaven JW
Blackpool 22nd June 2012CH. Loranka Sherrie BabyDear Magic Touch De La Fi Au Songeur
Welsh Kennel Club 2014Brymarden Carolina SunriseCh. Wandris Evan Elp Us
Welsh Kennel Club 2014Ch. Charnell Clematis of SalegreenCH. Byermoor Queens Maid
' def test_table_include_img_label(self): """table是否包含img标签.""" @@ -89,8 +89,7 @@ def test_cc_simple_table(self): parts = self.rec.recognize(base_url, [(html_to_element(raw_html), html_to_element(raw_html))], raw_html) assert len(parts) == 3 content = parts[1][0].text_content() - assert content == r"""
Рейтинг:Рейтинг 5.00 из 5 на основе опроса 3 пользователей
Тип товара:Препараты для - омоложения
Форма:Крем
Объем:50 мл
Рецепт:Отпускается без рецепта
Способ хранения:Хранить при температуре 4-20°
Примечание:Беречь от детей
Оплата:Наличными/банковской картой
Доступность в Северске:В наличии
Доставка:2-7 Дней
Цена:84 ₽
""" + assert content == '
Рейтинг:
Рейтинг 5.00 из 5 на основе опроса 3 пользователей
Тип товара:Препараты для омоложения
Форма:Крем
Объем:50 мл
Рецепт:Отпускается без рецепта
Способ хранения:Хранить при температуре 4-20°
Примечание:Беречь от детей
Оплата:Наличными/банковской картой
Доступность в Северске:В наличии
Доставка:2-7 Дней
Цена:84 ₽
' def test_cc_complex_table(self): """cc跨行跨列的表格.""" @@ -101,7 +100,7 @@ def test_cc_complex_table(self): parts = self.rec.recognize(base_url, [(html_to_element(raw_html), html_to_element(raw_html))], raw_html) assert len(parts) == 3 content = parts[1][0].text_content() - assert content == r'
ফেব্রুয়ারি ২০২৪
সোমমঙ্গলবুধবৃহশুক্রশনিরবি
« জানুয়ারি
১০১১
১২১৩১৪১৫১৬১৭১৮
১৯২০২১২২২৩২৪২৫
২৬২৭২৮২৯
' + assert content == '\n\n\n\n
ফেব্রুয়ারি ২০২৪
সোমমঙ্গলবুধবৃহশুক্রশনিরবি
« জানুয়ারি
১০১১
১২১৩১৪১৫১৬১৭১৮
১৯২০২১২২২৩২৪২৫
২৬২৭২৮২৯
' table_type = parts[1][0].xpath(f'.//{CCTag.CC_TABLE}')[0] assert table_type.attrib['table_type'] == 'complex' @@ -131,7 +130,6 @@ def test_table_to_content_list_node_simple(self): result = self.rec.to_content_list_node(base_url, html_to_element(parsed_content), raw_html) expect = base_dir.joinpath(test_case['expected'][0]) expect_json = expect.read_text(encoding='utf-8') - print(result) assert result['type'] == json.loads(expect_json)['type'] assert result['content']['is_complex'] == json.loads(expect_json)['content']['is_complex'] assert result['raw_content'] == json.loads(expect_json)['raw_content'] @@ -157,10 +155,10 @@ def test_table_involve_equation(self): raw_html = raw_html_path.read_text(encoding='utf-8') parts = self.rec.recognize(base_url, [(html_to_element(raw_html), html_to_element(raw_html))], raw_html) complex_table_tag = parts[1][0].xpath(f'.//{CCTag.CC_TABLE}') - assert complex_table_tag[0].text == r'
Name of the probability distributionProbability distribution functionMeanVariance
Binomial distribution${\displaystyle \Pr \,(X=k)={\binom {n}{k}}p^{k}(1-p)^{n-k}}$${\displaystyle np}$${\displaystyle np(1-p)}$
Geometric distribution${\displaystyle \Pr \,(X=k)=(1-p)^{k-1}p}$${\displaystyle {\frac {1}{p}}}$${\displaystyle {\frac {(1-p)}{p^{2}}}}$
Normal distribution${\displaystyle f\left(x\mid \mu ,\sigma ^{2}\right)={\frac {1}{\sqrt {2\pi \sigma ^{2}}}}e^{-{\frac {(x-\mu )^{2}}{2\sigma ^{2}}}}}$${\displaystyle \mu }$${\displaystyle \sigma ^{2}}$
Uniform distribution (continuous)${\displaystyle f(x\mid a,b)={\begin{cases}{\frac {1}{b-a}}&{\text{for }}a\leq x\leq b,\\[3pt]0&{\text{for }}xb\end{cases}}}$${\displaystyle {\frac {a+b}{2}}}$${\displaystyle {\frac {(b-a)^{2}}{12}}}$
Exponential distribution${\displaystyle f(x\mid \lambda )=\lambda e^{-\lambda x}}$${\displaystyle {\frac {1}{\lambda }}}$${\displaystyle {\frac {1}{\lambda ^{2}}}}$
Poisson distribution${\displaystyle f(k\mid \lambda )={\frac {e^{-\lambda }\lambda ^{k}}{k!}}}$${\displaystyle \lambda }$${\displaystyle \lambda }$
' + assert complex_table_tag[0].text == '
Name of the probability distributionProbability distribution functionMeanVariance
Binomial distribution${\\displaystyle \\Pr \\,(X=k)={\\binom {n}{k}}p^{k}(1-p)^{n-k}}$ \n\n${\\displaystyle np}$ \n\n${\\displaystyle np(1-p)}$ \n\n
Geometric distribution${\\displaystyle \\Pr \\,(X=k)=(1-p)^{k-1}p}$ \n\n${\\displaystyle {\\frac {1}{p}}}$ \n\n${\\displaystyle {\\frac {(1-p)}{p^{2}}}}$ \n\n
Normal distribution${\\displaystyle f\\left(x\\mid \\mu ,\\sigma ^{2}\\right)={\\frac {1}{\\sqrt {2\\pi \\sigma ^{2}}}}e^{-{\\frac {(x-\\mu )^{2}}{2\\sigma ^{2}}}}}$ \n\n${\\displaystyle \\mu }$ \n\n${\\displaystyle \\sigma ^{2}}$ \n\n
Uniform distribution (continuous)${\\displaystyle f(x\\mid a,b)={\\begin{cases}{\\frac {1}{b-a}}&{\\text{for }}a\\leq x\\leq b,\\\\[3pt]0&{\\text{for }}xb\\end{cases}}}$ \n\n${\\displaystyle {\\frac {a+b}{2}}}$ \n\n${\\displaystyle {\\frac {(b-a)^{2}}{12}}}$ \n\n
Exponential distribution${\\displaystyle f(x\\mid \\lambda )=\\lambda e^{-\\lambda x}}$ \n\n${\\displaystyle {\\frac {1}{\\lambda }}}$ \n\n${\\displaystyle {\\frac {1}{\\lambda ^{2}}}}$ \n\n
Poisson distribution${\\displaystyle f(k\\mid \\lambda )={\\frac {e^{-\\lambda }\\lambda ^{k}}{k!}}}$ \n\n${\\displaystyle \\lambda }$ \n\n${\\displaystyle \\lambda }$ \n\n
' parts = self.rec.recognize(base_url, [(html_to_element(raw_html), html_to_element(raw_html))], raw_html) complex_table_tag = parts[1][0].xpath(f'.//{CCTag.CC_TABLE}') - assert complex_table_tag[0].text == r'
Name of the probability distributionProbability distribution functionMeanVariance
Binomial distribution${\displaystyle \Pr \,(X=k)={\binom {n}{k}}p^{k}(1-p)^{n-k}}$${\displaystyle np}$${\displaystyle np(1-p)}$
Geometric distribution${\displaystyle \Pr \,(X=k)=(1-p)^{k-1}p}$${\displaystyle {\frac {1}{p}}}$${\displaystyle {\frac {(1-p)}{p^{2}}}}$
Normal distribution${\displaystyle f\left(x\mid \mu ,\sigma ^{2}\right)={\frac {1}{\sqrt {2\pi \sigma ^{2}}}}e^{-{\frac {(x-\mu )^{2}}{2\sigma ^{2}}}}}$${\displaystyle \mu }$${\displaystyle \sigma ^{2}}$
Uniform distribution (continuous)${\displaystyle f(x\mid a,b)={\begin{cases}{\frac {1}{b-a}}&{\text{for }}a\leq x\leq b,\\[3pt]0&{\text{for }}xb\end{cases}}}$${\displaystyle {\frac {a+b}{2}}}$${\displaystyle {\frac {(b-a)^{2}}{12}}}$
Exponential distribution${\displaystyle f(x\mid \lambda )=\lambda e^{-\lambda x}}$${\displaystyle {\frac {1}{\lambda }}}$${\displaystyle {\frac {1}{\lambda ^{2}}}}$
Poisson distribution${\displaystyle f(k\mid \lambda )={\frac {e^{-\lambda }\lambda ^{k}}{k!}}}$${\displaystyle \lambda }$${\displaystyle \lambda }$
' + assert complex_table_tag[0].text == '
Name of the probability distributionProbability distribution functionMeanVariance
Binomial distribution${\\displaystyle \\Pr \\,(X=k)={\\binom {n}{k}}p^{k}(1-p)^{n-k}}$ \n\n${\\displaystyle np}$ \n\n${\\displaystyle np(1-p)}$ \n\n
Geometric distribution${\\displaystyle \\Pr \\,(X=k)=(1-p)^{k-1}p}$ \n\n${\\displaystyle {\\frac {1}{p}}}$ \n\n${\\displaystyle {\\frac {(1-p)}{p^{2}}}}$ \n\n
Normal distribution${\\displaystyle f\\left(x\\mid \\mu ,\\sigma ^{2}\\right)={\\frac {1}{\\sqrt {2\\pi \\sigma ^{2}}}}e^{-{\\frac {(x-\\mu )^{2}}{2\\sigma ^{2}}}}}$ \n\n${\\displaystyle \\mu }$ \n\n${\\displaystyle \\sigma ^{2}}$ \n\n
Uniform distribution (continuous)${\\displaystyle f(x\\mid a,b)={\\begin{cases}{\\frac {1}{b-a}}&{\\text{for }}a\\leq x\\leq b,\\\\[3pt]0&{\\text{for }}xb\\end{cases}}}$ \n\n${\\displaystyle {\\frac {a+b}{2}}}$ \n\n${\\displaystyle {\\frac {(b-a)^{2}}{12}}}$ \n\n
Exponential distribution${\\displaystyle f(x\\mid \\lambda )=\\lambda e^{-\\lambda x}}$ \n\n${\\displaystyle {\\frac {1}{\\lambda }}}$ \n\n${\\displaystyle {\\frac {1}{\\lambda ^{2}}}}$ \n\n
Poisson distribution${\\displaystyle f(k\\mid \\lambda )={\\frac {e^{-\\lambda }\\lambda ^{k}}{k!}}}$ \n\n${\\displaystyle \\lambda }$ \n\n${\\displaystyle \\lambda }$ \n\n
' def test_table_involve_after_code(self): """test table involve code, code被提取出去了,过滤掉空的和坏的table.""" @@ -196,3 +194,43 @@ def test_table_involve_complex_code(self): expect_path = base_dir.joinpath(test_case['expected'][3]) content = open(expect_path, 'r', encoding='utf-8').read() assert complex_table_tag[0].text == content.strip('\n') + + def test_nested_table1(self): + """复杂嵌套表格.""" + raw_html_path = base_dir.joinpath('assets/recognizer/nested_table1.html') + base_url = 'https://en.m.wikipedia.org/wiki/Variance' + raw_html = raw_html_path.read_text(encoding='utf-8') + parts = self.rec.recognize(base_url, [(html_to_element(raw_html), html_to_element(raw_html))], raw_html) + assert len(parts) == 3 + content = parts[2][0].text_content() + assert '\n\n
\n\n
Advanced Search
\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n
Home
Browse
Communities \n\n & Collections
Issue Date
Author
Title
Subject
Sign on to:
Receive email \n\n updates
My APO\n\n
authorized users
Edit Profile
About DSpace
\n\n

ANSTO Publications Online > \n\n Journal Publications > \n\n Journal Articles >

\n\n
Please use this identifier to cite or link to this item: http://apo.ansto.gov.au/dspace/handle/10238/2935
\n\n
' in content + + def test_nested_table2(self): + """复杂嵌套表格.""" + raw_html_path = base_dir.joinpath('assets/recognizer/nested_table2.html') + base_url = 'https://en.m.wikipedia.org/wiki/Variance' + raw_html = raw_html_path.read_text(encoding='utf-8') + parts = self.rec.recognize(base_url, [(html_to_element(raw_html), html_to_element(raw_html))], raw_html) + assert len(parts) == 2 + content = parts[1][0].text_content() + assert '
Title:An investigation into transition metal ion binding properties of silk fibers and particles using radioisotopes.
Authors:Rajkhowa, R\n\n
Naik, R\n\n
Wang, L\n\n
Smith, SV\n\n
Wang, X
Keywords:Radioisotopes \n\n Transition Elements
\n\nBinding Energy
\n\nFibers
\n\nAbsorption
\n\nIons
Issue Date:15-Mar-2011
Publisher:Wiley-Blackwell
Citation:
jQuery(document).ready( function($) { if ($(\'#gateway-page\').length) { jQuery("body").addClass("fontyourface layout-one-sidebar layout-sidebar-first wide hff-43 pff-43 sff-43 slff-43 fixed-header-enabled slideout-side-right transparent-header-active path-node page-node-type-page"); }}); \n\n .acalog-custom .region--light-typography.region--dark-background a {font-weight:normal;} .acalog-custom ul.icons-list {margin:0} .acalog-custom ul.icons-list li {margin:5px 12px 5px 0;} #gateway-footer-copyright {background:#f6f8f9; font-family:\'Libre Franklin\', Helvetica Neue, Arial, sans-serif; padding:20px;} \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n window.dataLayer = window.dataLayer || []; function gtag(){dataLayer.push(arguments);} gtag(\'js\', new Date()); gtag(\'config\', \'G-L4J2WT8RM8\'); \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n Main Numbers: \n\n (615) 452-8600 \n\n (888) 335-8722 \n\n \n\n \n\n \n\n \n\n facebook \n\n instagram \n\n twitter \n\n youtube \n\n \n\n \n\n \n\n \n\n Campuses \n\n \n\n Gallatin \n\n Cookeville \n\n Livingston \n\n Springfield \n\n \n\n \n\n \n\n \n\n \n\n Academic Divisions \n\n \n\n Business & Technology \n\n Health Sciences \n\n Humanities & Fine Arts \n\n Mathematics & Science \n\n Nursing \n\n Social Science & Education \n\n \n\n \n\n \n\n \n\n \n\n Resources \n\n \n\n Accreditation \n\n Bookstore \n\n Campus Police \n\n Contact Us \n\n Employee Directory \n\n IT Help Desk \n\n Library \n\n Marketing & Communications
Volunteer State Community College
May 24, 2024
2013-2014 VSCC Catalog
Select a Catalog \n\n 2024-2025 Undergraduate Catalog \n\n 2023-2024 Undergraduate Catalog [ARCHIVED CATALOG] \n\n 2022-2023' in content + + def test_nested_table3(self): + """复杂嵌套表格.""" + raw_html_path = base_dir.joinpath('assets/recognizer/nested_table3.html') + base_url = 'https://en.m.wikipedia.org/wiki/Variance' + raw_html = raw_html_path.read_text(encoding='utf-8') + parts = self.rec.recognize(base_url, [(html_to_element(raw_html), html_to_element(raw_html))], raw_html) + assert len(parts) == 3 + content = parts[2][0].text_content() + assert "
What's New - Recent Content \n\n \n\n Members' Peak Updates \n\n Recent Trip Reports \n\n Recent Trip Report Comments \n\n Recently added Images \n\n Recently added Peaks \n\n List Completers \n\n \n\n Height List Completers \n\n Elevation List Completers \n\n County Summit Completers \n\n Wilderness Area Completers \n\n Member Profiles & Stats \n\n \n\n Member Profiles - Summary Stats \n\n Member Stats by Date Range & Charts \n\n Calendar Grid Completions \n\n Peaks Repeated \n\n Most Climbed Peaks \n\n Unclimbed Peaks \n\n US Peak Totals by State \n\n Member Tools \n\n \n\n Closest 50 Peaks by Member \n\n \n\n Closest 50 Map \n\n Closest 50 List \n\n Download your Peak List \n\n Search Trip Reports \n\n Unclimbed by Custom Group \n\n Export CSV, GPX, POI, TOPO! Files \n\n Elevation Threshold Progress Maps \n\n State Highest # Progress Maps \n\n County Summit Progress Maps \n\n Statewide County Summit Maps \n\n Prominence Progress Maps \n\n State Quads Progress Maps \n\n Quadrangle Lookup \n\n Distance Calculator \n\n Slope Angle Calculator \n\n Stats Category Leaders \n\n US Highest 1,000 Peaks \n\n \n\n US Highest 1,000 Member Area \n\n 1,000 Highest Peak List \n\n US Steepest 1,000 Peaks \n\n \n\n Steepness Member Area \n\n View 1,000 Steepest List \n\n US 2,000' Prominence \n\n \n\n US Prominence Member Area \n\n View US Prominence Peak Profiles \n\n View Member 5k Completion Maps \n\n Prominence Progress Maps \n\n US County Highpoints \n\n \n\n County Highpoints Member Area \n\n Highpoint Profiles - By State \n\n View Member's Completion Maps \n\n US State Highpoints \n\n \n\n US State Highpoints Member Area \n\n View State Highpoints List \n\n View Member's Completion Maps \n\n US Wilderness Area Peaks \n\n \n\n Wilderness Summits Member Area \n\n Wilderness Area Detail by State \n\n Wilderness HPs Member Area \n\n US National Park Peaks \n\n \n\n National Park Peaks Member Area \n\n National Park Peaks Detail by State" in content + + def 
test_nested_table4(self): + """复杂嵌套表格.""" + raw_html_path = base_dir.joinpath('assets/recognizer/nested_table4.html') + base_url = 'https://en.m.wikipedia.org/wiki/Variance' + raw_html = raw_html_path.read_text(encoding='utf-8') + parts = self.rec.recognize(base_url, [(html_to_element(raw_html), html_to_element(raw_html))], raw_html) + assert len(parts) == 4 + content = parts[2][0].text_content() + assert '

Molecular line emissions from pre main sequence objects

Saraceno, P. ; Benedettini, M. ; Caux, E. ; Ceccarelli, M. C. ; Clegg, P. E. ; Correia, J. C. ; di Giorgio, A. M. ; Giannini, T. ; Griffin, M. J. ; Leeks, S. J. ; Liseau, R. ; Lorenzetti, D. ; Molinari, S. ; Nisini, B. ; Smith, H. ; Spinoglio, L. ; Tomassi, E. and White, G. J. (1997). \n\n Molecular line emissions from pre main sequence objects. \n\n In: The first ISO workshop on Analytical Spectroscopy , 6-8 October 1997, Madrid, Spain, p. 291. \n\n Full text available as:

\n\n
Preview
\n\n
PDF (Version of Record) - Requires a PDF viewer such asGSview ,Xpdf orAdobe Acrobat Reader\n\n
Download (239Kb)
    \n\n\n\n
    URL:http://cdsads.u-strasbg.fr/abs/1997ESASP.419..291S
    Google Scholar:Look up in Google Scholar
    \n\n

    Abstract

    We present some preliminary results obtained with the LWS G.T. programme on the study of young objects driving molecular outflows. In particular, we discuss the importance of molecular emission in these sources and address the role of the H 2 0 cooling.

    \n\n\n\n\n\n\n\n\n\n\n\n\n\n' in content diff --git a/tests/llm_web_kit/extractor/test_extractor_chain.py b/tests/llm_web_kit/extractor/test_extractor_chain.py index 7f1bf8c9..a5f68540 100644 --- a/tests/llm_web_kit/extractor/test_extractor_chain.py +++ b/tests/llm_web_kit/extractor/test_extractor_chain.py @@ -325,7 +325,7 @@ def test_table_involve_inline_code(self): input_data = DataJson(test_data) result = chain.extract(input_data) content_list = result.get_content_list()._get_data()[0][0]['content']['html'] - assert content_list == r"""
    Item Type:Conference Item
    Copyright Holders:1997 European Space Agency
    Extra Information:Proceedings of the first ISO workshop on Analytical Spectroscopy, Madrid, Spain, 6-8 October 1997. Editors: A.M. Heras, K. Leech, N. R. Trams, and Michael Perry. Noordwijk, The Netherlands : ESA Publications Division, c1997. (ESA SP-419), 1997., pp.291-292
    Academic Unit/Department:Science > Physical Sciences
    Interdisciplinary Research Centre:Centre for Earth, Planetary, Space and Astronomical Research (CEPSAR)
    Item ID:32696
    Depositing User:Glenn White
    FunctionDescriptionExample
    `print()`Prints a message to the console.`print("Hello, World!")`
    `len()`Returns the length of an object.`len([1, 2, 3])`
    `range()`Generates a sequence of numbers.`range(1, 10)`
    """ + assert content_list == '
    FunctionDescriptionExample
    `print()`Prints a message to the console.`print("Hello, World!")`
    `len()`Returns the length of an object.`len([1, 2, 3])`
    `range()`Generates a sequence of numbers.`range(1, 10)`
    ' def test_table_tail_text(self): """table的tail文本保留.""" @@ -347,11 +347,11 @@ def test_table_element_include_enter(self): input_data = DataJson(test_data) result = chain.extract(input_data) content_md = result.get_content_list().to_mm_md() - assert """| عنوان فارسی | توسعه مالی و هزینه سرمایه حقوق سهامداران: شواهدی از چین | + assert '''| عنوان فارسی | توسعه مالی و هزینه سرمایه حقوق سهامداران: شواهدی از چین | |---|---| | عنوان انگلیسی | Financial development and the cost of equity capital: Evidence from China | -| کلمات کلیدی : |   توسعه مالی؛ هزینه سرمایه حقوق سهامداران؛ قانون و امور مالی؛ چین | -| درسهای مرتبط | حسابداری |""" in content_md +| کلمات کلیدی : |   توسعه مالی؛ هزینه سرمایه حقوق سهامداران؛ قانون و امور مالی؛ چین | +| درسهای مرتبط | حسابداری |''' in content_md def test_list_empty(self): """list抽取为空,原因是嵌套的img标签没有text.""" @@ -374,7 +374,7 @@ def test_table_include_math_p(self): result = chain.extract(input_data) content_list = result.get_content_list()._get_data() assert len(content_list[0]) == 17 - assert content_list[0][3]['content']['html'] == r"
    up vote 17 down vote favorite 5I'm having problems with exercises on proving whether or not a given number is prime. Is $83^{27} + 1$ prime? prime-numbers factoring
    " + assert content_list[0][3]['content']['html'] == "
    up vote 17 down vote favorite \n\n 5
    I'm having problems with exercises on proving whether or not a given number is prime. Is $83^{27} + 1$ prime?
    prime-numbers factoring
    " def test_table_include_math_p_2(self): """table包含math和其他内容.""" @@ -386,7 +386,7 @@ def test_table_include_math_p_2(self): md_content = result.get_content_list().to_nlp_md() # with open('output_badcase_p2.md', 'w', encoding='utf-8') as f: # f.write(md_content) - self.assertIn(r'
    单位换算:数学公式区块: $1\text{km}={10}^{3}\text{m}$
    长度质量时间
    数学公式 $1m={10}^{2}\mathrm{cm}$数学公式 $1\mathrm{kg}={10}^{3}g$数学公式 $1h=3600s$
    运动学:数学公式 $v=\frac{dx}{dt}$ 数学公式 $a=\frac{dv}{dt}$
    ', md_content) + self.assertIn('
    单位换算:

    数学公式区块: $1\\text{km}={10}^{3}\\text{m}$

    ', md_content) def test_clean_tags(self): """测试clean_tag的preExtractor是否生效.""" @@ -491,7 +491,7 @@ def test_more_nt(self): result_content_list = result.get_content_list()._get_data() result = result_content_list[0][2]['content']['html'] assert '\n\t' not in result - assert len(result) == 1878 + assert len(result) == 2205 def test_math_physicsforums(self): """测试math_physicsforums网页中数学公式是[tex]和[itex]包裹的,且中间还有
    标签分割.""" @@ -512,7 +512,7 @@ def test_table_only_include_tr(self): input_data = DataJson(test_data) result = chain.extract(input_data) result_md = result.get_content_list().to_nlp_md() - assert 'List Price:$11.80' in result_md + assert 'List Price: $11.80' in result_md def test_table_only_one_td(self): """测试table只有一个td.""" @@ -636,7 +636,7 @@ def test_table_lack_pre_content(self): input_data = DataJson(test_data) result = chain.extract(input_data) result_content_list = result.get_content_list()._get_data() - assert result_content_list[0][22]['content']['html'] == r"""
    长度质量时间
    お名前 【必須】お名前(カナ)
    ご連絡先 【いずれか必須】※メール受信制限をしている方は、@chintai.co.jpからのメールを受信できるよう設定の変更をお願い致します。
    メールアドレス電話番号
    """ + assert result_content_list[0][22]['content']['html'] == '
    お名前【必須】お名前(カナ)
    ご連絡先【いずれか必須】

    メールアドレス

    電話番号

    ※メール受信制限をしている方は、@chintai.co.jpからのメールを受信できるよう設定の変更をお願い致します。

    ' def test_td_include_specila_symbol(self): """测试td包含特殊符号|,需要转义.""" From ca49f1a7ce941cef22a3d86a010dc2e19b2e8f1c Mon Sep 17 00:00:00 2001 From: houlinfeng Date: Mon, 1 Sep 2025 15:47:55 +0800 Subject: [PATCH 20/31] =?UTF-8?q?fix:=20image=E6=8F=90=E5=8F=96caption?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- llm_web_kit/extractor/html/recognizer/image.py | 3 +-- llm_web_kit/input/datajson.py | 10 ++++++++-- tests/llm_web_kit/extractor/test_extractor_chain.py | 2 +- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/llm_web_kit/extractor/html/recognizer/image.py b/llm_web_kit/extractor/html/recognizer/image.py index 30f8241d..53f612dc 100644 --- a/llm_web_kit/extractor/html/recognizer/image.py +++ b/llm_web_kit/extractor/html/recognizer/image.py @@ -157,8 +157,7 @@ def __parse_img_elements(self, base_url: str, img_elements: HtmlElement, html_ob 'html': raw_img_html, # 保留原始 标签作为属性值 'format': 'url', # 指定图片格式,url|base } - if elem.text and elem.text.strip(): - attributes['caption'] = elem.text.strip() + attributes['caption'] = elem.xpath('normalize-space()') if tag in ['embed', 'object', 'iframe', 'video', 'audio', 'canvas']: if not [img_elem for img_elem in self.IMG_LABEL if img_elem in raw_img_html.lower()]: diff --git a/llm_web_kit/input/datajson.py b/llm_web_kit/input/datajson.py index 3ded4c2b..bf970f93 100644 --- a/llm_web_kit/input/datajson.py +++ b/llm_web_kit/input/datajson.py @@ -298,7 +298,7 @@ def __content_lst_node_2_md(self, content_lst_node: dict, exclude_inline_types: else: image_caption = '' - image_des = image_title if image_title else image_caption if image_caption else '' + image_des = image_title if image_title else '' # 优先使用data, 其次path.其中data是base64编码的图片,path是图片的url if image_data: if image_des: @@ -310,7 +310,13 @@ def __content_lst_node_2_md(self, content_lst_node: dict, exclude_inline_types: image = f'![{image_alt}]({image_path} "{image_des}")' else: image = 
f'![{image_alt}]({image_path})' - return image + + if image_caption: + image_with_caption = f'{image}\n\n{image_caption}' + else: + image_with_caption = image + + return image_with_caption elif node_type == DocElementType.AUDIO: return '' # TODO: 音频格式 elif node_type == DocElementType.VIDEO: diff --git a/tests/llm_web_kit/extractor/test_extractor_chain.py b/tests/llm_web_kit/extractor/test_extractor_chain.py index a5f68540..5255efe9 100644 --- a/tests/llm_web_kit/extractor/test_extractor_chain.py +++ b/tests/llm_web_kit/extractor/test_extractor_chain.py @@ -105,7 +105,7 @@ def test_html_pipeline(self): self.assertEqual(html_content['content']['title'], 'image-title') self.assertEqual(html_content['content']['alt'], 'image-alt') self.assertEqual(html_content['content']['url'], 'https://www.test.com/test.png') - self.assertEqual(html_content['content']['caption'], None) + self.assertEqual(html_content['content']['caption'], '') # 然后是simple table html_content = html_content_list[4] From b4c641e17f27e19d6fa6561278d373b34bf72dcb Mon Sep 17 00:00:00 2001 From: houlinfeng Date: Mon, 1 Sep 2025 20:01:53 +0800 Subject: [PATCH 21/31] =?UTF-8?q?feat:=20image=E6=8F=90=E5=8F=96caption?= =?UTF-8?q?=E7=9A=84=E5=8D=95=E6=B5=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../extractor/html/recognizer/test_image.py | 33 +++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_image.py b/tests/llm_web_kit/extractor/html/recognizer/test_image.py index 73a4c3a0..6396c6d1 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_image.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_image.py @@ -350,3 +350,36 @@ def test_complex_heading_image_removal(self): img_in_p.extend(p.xpath('.//img')) self.assertEqual(len(img_in_p), 0, '段落中不应该有img标签') + + def test_image_caption(self): + complex_html = """ +
    +
    + Roger Moore in + + For + Your Eyes Only + . Photo Courtesy: United + Artists/Everett Collection + +
    +
    + """ + element = html_to_element(complex_html) + base_url = 'http://example.com' + parts = self.img_recognizer.recognize(base_url, [(element, element)], complex_html) + html = element_to_html(parts[0][0]) + self.assertIn('caption="Roger Moore in For Your Eyes Only . Photo Courtesy: United Artists/Everett Collection', html) From 12c05ca52fad42157afa49e94ec08fac2149b204 Mon Sep 17 00:00:00 2001 From: houlinfeng Date: Thu, 11 Sep 2025 17:48:37 +0800 Subject: [PATCH 22/31] =?UTF-8?q?fix:=201.=E7=AE=80=E5=8C=96=E8=A1=A8?= =?UTF-8?q?=E6=A0=BC=EF=BC=8C=E5=8E=BB=E6=8E=89=E9=9D=9E=E8=A1=A8=E6=A0=BC?= =?UTF-8?q?=E6=A0=87=E7=AD=BE=202.=E4=BF=AE=E5=A4=8Dtable=E3=80=81list?= =?UTF-8?q?=E3=80=81title=E3=80=81text=E4=B8=AD=E4=B8=8A=E4=B8=8B=E6=A0=87?= =?UTF-8?q?=E4=B8=8E=E4=B8=BB=E4=BD=93=E8=A2=AB=E5=88=86=E9=9A=94=E5=BC=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- llm_web_kit/extractor/html/recognizer/list.py | 13 +++- .../extractor/html/recognizer/table.py | 64 +++++++++++++++---- llm_web_kit/extractor/html/recognizer/text.py | 30 +++++---- .../extractor/html/recognizer/title.py | 30 +++++---- llm_web_kit/libs/html_utils.py | 32 ++++++++++ .../recognizer/assets/cccode/mathworks.md | 2 +- .../recognizer/assets/cccode/mathworks.txt | 2 +- .../extractor/html/recognizer/test_table.py | 18 +++--- .../extractor/html/recognizer/test_title.py | 10 ++- .../extractor/test_extractor_chain.py | 8 +-- 10 files changed, 147 insertions(+), 62 deletions(-) diff --git a/llm_web_kit/extractor/html/recognizer/list.py b/llm_web_kit/extractor/html/recognizer/list.py index c3599fc8..61f113b8 100644 --- a/llm_web_kit/extractor/html/recognizer/list.py +++ b/llm_web_kit/extractor/html/recognizer/list.py @@ -1,6 +1,7 @@ import json from typing import Any, List, Tuple +from lxml import html as lxml_html from lxml.html import HtmlElement from overrides import override @@ -8,8 +9,10 @@ from llm_web_kit.extractor.html.recognizer.recognizer import ( 
BaseHTMLElementRecognizer, CCTag) from llm_web_kit.libs.doc_element_type import DocElementType, ParagraphTextType -from llm_web_kit.libs.html_utils import (html_normalize_space, - process_sub_sup_tags) +from llm_web_kit.libs.html_utils import (html_normalize_space, html_to_element, + process_sub_sup_tags, + replace_sub_sup_with_text_regex, + restore_sub_sup_from_text_regex) from llm_web_kit.libs.text_utils import normalize_text_segment from .text import inline_tags @@ -224,7 +227,9 @@ def __get_list_content_list(self, ele: HtmlElement, list_nest_level: int) -> lis Returns: list: 包含列表项内容的列表,即items """ - + ele_html = lxml_html.tostring(ele, encoding='utf-8').decode() + replace_tree_html = replace_sub_sup_with_text_regex(ele_html) + ele = html_to_element(replace_tree_html) content_list = [] # 处理根元素文本 if ele.text and ele.text.strip(): @@ -239,6 +244,8 @@ def __get_list_content_list(self, ele: HtmlElement, list_nest_level: int) -> lis for child in ele.iterchildren(): text_paragraph = self.__extract_list_item_text(child) if len(text_paragraph) > 0: + json_paragraph = restore_sub_sup_from_text_regex(json.dumps(text_paragraph)) + text_paragraph = json.loads(json_paragraph) content_list.extend(text_paragraph) return content_list diff --git a/llm_web_kit/extractor/html/recognizer/table.py b/llm_web_kit/extractor/html/recognizer/table.py index e4cc78ce..a70e6290 100644 --- a/llm_web_kit/extractor/html/recognizer/table.py +++ b/llm_web_kit/extractor/html/recognizer/table.py @@ -1,6 +1,9 @@ +import re from typing import Any, List, Tuple +from lxml import html from lxml.html import HtmlElement +from lxml.html.clean import Cleaner from overrides import override from llm_web_kit.exception.exception import HtmlTableRecognizerException @@ -8,12 +11,34 @@ from llm_web_kit.extractor.html.recognizer.recognizer import ( BaseHTMLElementRecognizer, CCTag) from llm_web_kit.libs.doc_element_type import DocElementType -from llm_web_kit.libs.html_utils import (html_normalize_space, - 
process_sub_sup_tags) +from llm_web_kit.libs.html_utils import (element_to_html_unescaped, + html_normalize_space, html_to_element, + process_sub_sup_tags, + replace_sub_sup_with_text_regex, + restore_sub_sup_from_text_regex) from llm_web_kit.libs.text_utils import normalize_text_segment from .text import inline_tags +new_inline_tags = inline_tags.union({'table', 'tr', 'td', 'th', 'thead', 'tbody', 'tfoot', 'caption'}) + +allow_tags = ['table', 'tr', 'td', 'th', 'thead', 'tbody', 'tfoot', 'caption', 'sub', 'sup', 'ccmath-inline', 'ccmath-interline', 'cccode', 'cccode-inline'] + +cleaner = Cleaner( + safe_attrs_only=False, + page_structure=False, + style=True, + scripts=True, + comments=True, + links=False, + meta=True, + embedded=True, + frames=True, + forms=True, + annoying_tags=True, + allow_tags=allow_tags +) + # 空元素 VOID_ELEMENTS = { 'area', 'base', 'br', 'col', 'embed', 'hr', @@ -234,7 +259,7 @@ def process_node(node): if node.tail and node.tail.strip(): result.append(node.tail.strip()) else: - if node.tag == 'br' or node.tag not in inline_tags: + if node.tag == 'br' or node.tag not in new_inline_tags: result.append('\n\n') # 提取当前节点的文本 @@ -247,7 +272,7 @@ def process_node(node): process_node(child) # 处理节点的tail(元素闭合后的文本) if node.tail and node.tail.strip(): - if node.tag not in inline_tags: + if node.tag not in new_inline_tags: result.append('\n\n') cleaned_tail = node.tail.strip() result.append(html_normalize_space(cleaned_tail)) @@ -274,7 +299,10 @@ def __simplify_td_th_content(self, table_nest_level, elem: HtmlElement) -> None: else: math_res = self.__check_table_include_math_code(elem) elem.clear() - math_res_text = ' '.join(normalize_text_segment(item) for item in math_res) + if elem.tag not in new_inline_tags: + math_res_text = ' '.join(normalize_text_segment(item) for item in math_res) + "\n\n" + else: + math_res_text = ' '.join(normalize_text_segment(item) for item in math_res) if elem.tag in VOID_ELEMENTS: elem_pre = elem.getprevious() if elem_pre is 
not None: @@ -292,22 +320,34 @@ def __get_table_body(self, table_type, table_nest_level, table_root): if table_type == 'empty': content = table_root.text_content() return content + table_html = html.tostring(table_root, encoding='utf-8').decode() + replace_tree_html = replace_sub_sup_with_text_regex(table_html) + table_root = html_to_element(replace_tree_html) + # 清理除了colspan和rowspan之外的属性 self.__simplify_td_th_content(table_nest_level, table_root) table_clean_attributes(table_root) + clean_html = cleaner.clean_html(self._element_to_html_entity(table_root)) + new_table_root = self._build_html_tree(clean_html) - # doc = html.fromstring(html_content) - for element in table_root.iter(): + pattern = re.compile(r'(\s*\n\s*\n\s*|\n{2,})') + for element in new_table_root.iter(): # 清理元素前后的空白(不影响.text和.tail的内容) if element.text is not None: - element.text = element.text.lstrip('\n\t ') + if element.tag in allow_tags: + element.text = re.sub(pattern, '\n\n', element.text.strip()) + else: + element.text = re.sub(pattern, '\n\n', element.text.lstrip()) if element.tail is not None: - if "\n\n" in element.tail: - element.tail = "\n\n" + element.tail.lstrip('\n\t ') + if element.tag not in new_inline_tags: + element.tail = "\n\n" + re.sub(pattern, '\n\n', element.tail.lstrip()) else: - element.tail = element.tail.lstrip('\n\t ') + element.tail = re.sub(pattern, '\n\n', element.tail.lstrip()).rstrip() + + tree_html = element_to_html_unescaped(new_table_root) + restore_tree_html = restore_sub_sup_from_text_regex(tree_html) - return self._element_to_html_entity(table_root) + return restore_tree_html def __do_extract_tables(self, root: HtmlElement) -> None: """递归处理所有子标签.""" diff --git a/llm_web_kit/extractor/html/recognizer/text.py b/llm_web_kit/extractor/html/recognizer/text.py index 0052872c..6dc7e346 100644 --- a/llm_web_kit/extractor/html/recognizer/text.py +++ b/llm_web_kit/extractor/html/recognizer/text.py @@ -13,7 +13,9 @@ from llm_web_kit.libs.doc_element_type import 
DocElementType, ParagraphTextType from llm_web_kit.libs.html_utils import (element_to_html_unescaped, html_normalize_space, html_to_element, - process_sub_sup_tags) + process_sub_sup_tags, + replace_sub_sup_with_text_regex, + restore_sub_sup_from_text_regex) special_symbols = [ # TODO 从文件读取 '®', # 注册商标符号 @@ -65,7 +67,7 @@ 'samp', 'script', 'select', 'small', 'span', 'strong', 'sub', 'sup', 'textarea', 'time', 'var', 'u', 's', 'cccode-inline', 'ccmath-inline', 'marked-tail', 'marked-text', 'math','mspace', 'font', 'nobr', 'bdi', - 'mjx-container', 'mjx-assistive-mml', 'strike', 'wbr', 'ins' + 'mjx-container', 'mjx-assistive-mml', 'strike', 'wbr', 'ins', 'xhtml' } # 词间无分隔符的语言 @@ -205,7 +207,10 @@ def __combine_text(self, text1:str, text2:str, lang='en') -> str: if (text2 and text2[0] in string.punctuation) or (text2 and text2[0] in special_symbols) or (text2 and text2[0] in other_symbols) or (text1 and text1[-1] in other_symbols): words_sep = '' else: - words_sep = ' ' + if text2.startswith('tem_sub_') or text2.startswith('tem_sup_') or text1.endswith("tem_sub_start") or text1.endswith("tem_sup_start"): + words_sep = '' + else: + words_sep = ' ' txt = text1 + words_sep + text2 return self.replace_entities(txt.strip(), entities_map) @@ -222,12 +227,13 @@ def __get_paragraph_text(self, root: HtmlElement, language:str = 'en') -> List[d Args: el: 代表一个段落的html元素 """ + _html = html.tostring(root, encoding='utf-8').decode() + replace_tree_html = replace_sub_sup_with_text_regex(_html) + root = html_to_element(replace_tree_html) + para_text = [] def __get_paragraph_text_recusive(el: HtmlElement, text: str) -> str: - # 标记当前元素是否是sub或sup类型 - is_sub_sup = el.tag == 'sub' or el.tag == 'sup' - if el.tag == CCTag.CC_MATH_INLINE: if text: para_text.append({'c': text, 't': ParagraphTextType.TEXT}) @@ -254,19 +260,17 @@ def __get_paragraph_text_recusive(el: HtmlElement, text: str) -> str: # 处理尾部文本 if el.tail and el.tail.strip(): - if is_sub_sup: - _new_tail = 
html_normalize_space(el.tail.strip()) - text += _new_tail - else: - _new_tail = html_normalize_space(el.tail.strip()) - new_tail = f' {_new_tail}' if el.tail.startswith(' ') and el.tail.strip()[0] in string.punctuation else _new_tail - text = self.__combine_text(text, new_tail, language) + _new_tail = html_normalize_space(el.tail.strip()) + new_tail = f' {_new_tail}' if el.tail.startswith(' ') and el.tail.strip()[0] in string.punctuation else _new_tail + text = self.__combine_text(text, new_tail, language) return text if final := __get_paragraph_text_recusive(root, ''): para_text.append({'c': final.replace('$br$', PARAGRAPH_SEPARATOR), 't': ParagraphTextType.TEXT}) + for item in para_text: + item['c'] = restore_sub_sup_from_text_regex(item['c']) return para_text def __extract_paragraphs(self, root: HtmlElement): diff --git a/llm_web_kit/extractor/html/recognizer/title.py b/llm_web_kit/extractor/html/recognizer/title.py index cf105b9f..d3205344 100644 --- a/llm_web_kit/extractor/html/recognizer/title.py +++ b/llm_web_kit/extractor/html/recognizer/title.py @@ -1,6 +1,7 @@ from typing import List, Tuple # from lxml.etree import _Element as HtmlElement +from lxml import html as lxml_html from lxml.html import HtmlElement from overrides import override @@ -8,8 +9,9 @@ from llm_web_kit.extractor.html.recognizer.recognizer import ( BaseHTMLElementRecognizer, CCTag) from llm_web_kit.libs.doc_element_type import DocElementType -from llm_web_kit.libs.html_utils import (html_normalize_space, - process_sub_sup_tags) +from llm_web_kit.libs.html_utils import (html_normalize_space, html_to_element, + replace_sub_sup_with_text_regex, + restore_sub_sup_from_text_regex) from .text import PARAGRAPH_SEPARATOR @@ -90,10 +92,14 @@ def __do_extract_title(self, root:HtmlElement) -> None: """ # 匹配需要替换的标签 if root.tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']: + if root.tail and root.tail.strip(): + tail_text = root.tail.strip() + else: + tail_text = '' + root.tail = None title_text = 
self.__extract_title_text(root) title_raw_html = self._element_to_html(root) title_level = str(self.__extract_title_level(root.tag)) - tail_text = root.tail cc_element = self._build_cc_element(CCTag.CC_TITLE, title_text, tail_text, level=title_level, html=title_raw_html) self._replace_element(root, cc_element) return @@ -122,8 +128,9 @@ def __extract_title_text(self, header_el:HtmlElement) -> str: Returns: str: 标题的文本 """ + blks = [] + def __extract_title_text_recusive(el: HtmlElement, with_tail: bool = True) -> list[str]: - blks = [] if el.tag == CCTag.CC_CODE_INLINE: blks.append(f'`{el.text}`') @@ -134,21 +141,18 @@ def __extract_title_text_recusive(el: HtmlElement, with_tail: bool = True) -> li _new_text = html_normalize_space(el.text.strip()) blks.append(_new_text) - for child in el.getchildren(): - if child.tag == 'sub' or child.tag == 'sup': - blks.extend([process_sub_sup_tags(child, '', recursive=False), child.tail]) - else: - blks.extend(__extract_title_text_recusive(child)) - if with_tail: blks.append((el.tail or '').strip()) return blks - # 根元素不保留结尾 - blks = __extract_title_text_recusive(header_el, False) + _html = lxml_html.tostring(header_el, encoding='utf-8').decode() + replace_tree_html = replace_sub_sup_with_text_regex(_html) + header_el = html_to_element(replace_tree_html) - return ' '.join(blk for blk in blks if blk).replace('$br$', PARAGRAPH_SEPARATOR) + for child in header_el.iter(): + __extract_title_text_recusive(child, True) + return restore_sub_sup_from_text_regex(' '.join(blk for blk in blks if blk).replace('$br$', PARAGRAPH_SEPARATOR)) def __get_attribute(self, html:HtmlElement) -> Tuple[int, str]: """获取element的属性.""" diff --git a/llm_web_kit/libs/html_utils.py b/llm_web_kit/libs/html_utils.py index fede6cf0..21718f2a 100644 --- a/llm_web_kit/libs/html_utils.py +++ b/llm_web_kit/libs/html_utils.py @@ -452,6 +452,38 @@ def html_normalize_space(text: str) -> str: return text +def replace_sub_sup_with_text_regex(html_content): + """使用正则表达式将 
HTML 中的 、 标签替换为特殊标记。""" + + def replacer(match): + tag = match.group(0).lower() + if tag.startswith('': + return 'tem_sub_end' + if tag.startswith('': + return 'tem_sup_end' + return tag + + pattern = r']*>' + return re.sub(pattern, replacer, html_content, flags=re.IGNORECASE) + + +def restore_sub_sup_from_text_regex(processed_content): + """将、的替换标记还原为原始的 HTML 标签。""" + replacement_map = { + 'tem_sub_start': '', + 'tem_sub_end': '', + 'tem_sup_start': '', + 'tem_sup_end': '' + } + + pattern = '|'.join(re.escape(key) for key in replacement_map.keys()) + return re.sub(pattern, lambda m: replacement_map[m.group(0)], processed_content) + + def get_plain_text_fast(html_source: str) -> str: """使用lxml快速获取html中的纯文本. diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/cccode/mathworks.md b/tests/llm_web_kit/extractor/html/recognizer/assets/cccode/mathworks.md index c8745567..bd8a9517 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/assets/cccode/mathworks.md +++ b/tests/llm_web_kit/extractor/html/recognizer/assets/cccode/mathworks.md @@ -4,7 +4,7 @@ ### Use Integers for Index Variables -In MATLAB®code that you want to convert to single precision, it is a best practice to use integers for index variables. However, if the code does not use integers for index variables, when possible `convertToSingle` tries to detect the index variables and select `int32` types for them. +In MATLAB® code that you want to convert to single precision, it is a best practice to use integers for index variables. However, if the code does not use integers for index variables, when possible `convertToSingle` tries to detect the index variables and select `int32` types for them. 
### Limit Use of `assert` Statements diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/cccode/mathworks.txt b/tests/llm_web_kit/extractor/html/recognizer/assets/cccode/mathworks.txt index 7b4f582d..2bdc3018 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/assets/cccode/mathworks.txt +++ b/tests/llm_web_kit/extractor/html/recognizer/assets/cccode/mathworks.txt @@ -1,7 +1,7 @@ 主要内容 Single-Precision Conversion Best Practices Use Integers for Index Variables -In MATLAB®code that you want to convert to single precision, it is a best practice to use integers for index variables. However, if the code does not use integers for index variables, when possible `convertToSingle` tries to detect the index variables and select `int32` types for them. +In MATLAB® code that you want to convert to single precision, it is a best practice to use integers for index variables. However, if the code does not use integers for index variables, when possible `convertToSingle` tries to detect the index variables and select `int32` types for them. Limit Use of `assert` Statements - Do not use `assert` statements to define the properties of input arguments. - Do not use `assert` statements to test the type of a variable. For example, do not use diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_table.py b/tests/llm_web_kit/extractor/html/recognizer/test_table.py index fb3ba4ff..c80d3568 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_table.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_table.py @@ -66,7 +66,7 @@ def test_only_involve_table(self): parts = self.rec.recognize(base_url, [(html_to_element(raw_html), html_to_element(raw_html))], raw_html) self.assertEqual(len(parts), 2) table_body = parts[1][0].text_content() - assert table_body == '
    Mrs S Hindle
    ShowCCRCC
    Driffield 5th October 2006CH. Ricksbury Royal HeroCH. Keyingham Branwell
    Manchester 16th January 2008CH. Lochbuie GeordieMerryoth Maeve
    Darlington 20th September 2009CH. Maibee Make BelieveCH. Loranka Just Like Heaven JW
    Blackpool 22nd June 2012CH. Loranka Sherrie BabyDear Magic Touch De La Fi Au Songeur
    Welsh Kennel Club 2014Brymarden Carolina SunriseCh. Wandris Evan Elp Us
    Welsh Kennel Club 2014Ch. Charnell Clematis of SalegreenCH. Byermoor Queens Maid
    ' + assert table_body == '
    Mrs S Hindle
    ShowCCRCC
    Driffield 5th October 2006CH. Ricksbury Royal HeroCH. Keyingham Branwell
    Manchester 16th January 2008CH. Lochbuie GeordieMerryoth Maeve
    Darlington 20th September 2009CH. Maibee Make BelieveCH. Loranka Just Like Heaven JW
    Blackpool 22nd June 2012CH. Loranka Sherrie BabyDear Magic Touch De La Fi Au Songeur
    Welsh Kennel Club 2014Brymarden Carolina SunriseCh. Wandris Evan Elp Us
    Welsh Kennel Club 2014Ch. Charnell Clematis of SalegreenCH. Byermoor Queens Maid
    ' def test_table_include_img_label(self): """table是否包含img标签.""" @@ -89,7 +89,7 @@ def test_cc_simple_table(self): parts = self.rec.recognize(base_url, [(html_to_element(raw_html), html_to_element(raw_html))], raw_html) assert len(parts) == 3 content = parts[1][0].text_content() - assert content == '
    Рейтинг:
    Рейтинг 5.00 из 5 на основе опроса 3 пользователей
    Тип товара:Препараты для омоложения
    Форма:Крем
    Объем:50 мл
    Рецепт:Отпускается без рецепта
    Способ хранения:Хранить при температуре 4-20°
    Примечание:Беречь от детей
    Оплата:Наличными/банковской картой
    Доступность в Северске:В наличии
    Доставка:2-7 Дней
    Цена:84 ₽
    ' + assert content == '
    Рейтинг:Рейтинг 5.00 из 5 на основе опроса 3 пользователей
    Тип товара:Препараты для омоложения
    Форма:Крем
    Объем:50 мл
    Рецепт:Отпускается без рецепта
    Способ хранения:Хранить при температуре 4-20°
    Примечание:Беречь от детей
    Оплата:Наличными/банковской картой
    Доступность в Северске:В наличии
    Доставка:2-7 Дней
    Цена:84 ₽
    ' def test_cc_complex_table(self): """cc跨行跨列的表格.""" @@ -100,7 +100,7 @@ def test_cc_complex_table(self): parts = self.rec.recognize(base_url, [(html_to_element(raw_html), html_to_element(raw_html))], raw_html) assert len(parts) == 3 content = parts[1][0].text_content() - assert content == '\n\n\n\n
    ফেব্রুয়ারি ২০২৪
    সোমমঙ্গলবুধবৃহশুক্রশনিরবি
    « জানুয়ারি
    ১০১১
    ১২১৩১৪১৫১৬১৭১৮
    ১৯২০২১২২২৩২৪২৫
    ২৬২৭২৮২৯
    ' + assert content == '
    ফেব্রুয়ারি ২০২৪
    সোমমঙ্গলবুধবৃহশুক্রশনিরবি
    « জানুয়ারি
    ১০১১
    ১২১৩১৪১৫১৬১৭১৮
    ১৯২০২১২২২৩২৪২৫
    ২৬২৭২৮২৯
    ' table_type = parts[1][0].xpath(f'.//{CCTag.CC_TABLE}')[0] assert table_type.attrib['table_type'] == 'complex' @@ -155,10 +155,10 @@ def test_table_involve_equation(self): raw_html = raw_html_path.read_text(encoding='utf-8') parts = self.rec.recognize(base_url, [(html_to_element(raw_html), html_to_element(raw_html))], raw_html) complex_table_tag = parts[1][0].xpath(f'.//{CCTag.CC_TABLE}') - assert complex_table_tag[0].text == '
    Name of the probability distributionProbability distribution functionMeanVariance
    Binomial distribution${\\displaystyle \\Pr \\,(X=k)={\\binom {n}{k}}p^{k}(1-p)^{n-k}}$ \n\n${\\displaystyle np}$ \n\n${\\displaystyle np(1-p)}$ \n\n
    Geometric distribution${\\displaystyle \\Pr \\,(X=k)=(1-p)^{k-1}p}$ \n\n${\\displaystyle {\\frac {1}{p}}}$ \n\n${\\displaystyle {\\frac {(1-p)}{p^{2}}}}$ \n\n
    Normal distribution${\\displaystyle f\\left(x\\mid \\mu ,\\sigma ^{2}\\right)={\\frac {1}{\\sqrt {2\\pi \\sigma ^{2}}}}e^{-{\\frac {(x-\\mu )^{2}}{2\\sigma ^{2}}}}}$ \n\n${\\displaystyle \\mu }$ \n\n${\\displaystyle \\sigma ^{2}}$ \n\n
    Uniform distribution (continuous)${\\displaystyle f(x\\mid a,b)={\\begin{cases}{\\frac {1}{b-a}}&{\\text{for }}a\\leq x\\leq b,\\\\[3pt]0&{\\text{for }}xb\\end{cases}}}$ \n\n${\\displaystyle {\\frac {a+b}{2}}}$ \n\n${\\displaystyle {\\frac {(b-a)^{2}}{12}}}$ \n\n
    Exponential distribution${\\displaystyle f(x\\mid \\lambda )=\\lambda e^{-\\lambda x}}$ \n\n${\\displaystyle {\\frac {1}{\\lambda }}}$ \n\n${\\displaystyle {\\frac {1}{\\lambda ^{2}}}}$ \n\n
    Poisson distribution${\\displaystyle f(k\\mid \\lambda )={\\frac {e^{-\\lambda }\\lambda ^{k}}{k!}}}$ \n\n${\\displaystyle \\lambda }$ \n\n${\\displaystyle \\lambda }$ \n\n
    ' + assert complex_table_tag[0].text == '
    Name of the probability distributionProbability distribution functionMeanVariance
    Binomial distribution${\\displaystyle \\Pr \\,(X=k)={\\binom {n}{k}}p^{k}(1-p)^{n-k}}$${\\displaystyle np}$${\\displaystyle np(1-p)}$
    Geometric distribution${\\displaystyle \\Pr \\,(X=k)=(1-p)^{k-1}p}$${\\displaystyle {\\frac {1}{p}}}$${\\displaystyle {\\frac {(1-p)}{p^{2}}}}$
    Normal distribution${\\displaystyle f\\left(x\\mid \\mu ,\\sigma ^{2}\\right)={\\frac {1}{\\sqrt {2\\pi \\sigma ^{2}}}}e^{-{\\frac {(x-\\mu )^{2}}{2\\sigma ^{2}}}}}$${\\displaystyle \\mu }$${\\displaystyle \\sigma ^{2}}$
    Uniform distribution (continuous)${\\displaystyle f(x\\mid a,b)={\\begin{cases}{\\frac {1}{b-a}}&{\\text{for }}a\\leq x\\leq b,\\\\[3pt]0&{\\text{for }}xb\\end{cases}}}$${\\displaystyle {\\frac {a+b}{2}}}$${\\displaystyle {\\frac {(b-a)^{2}}{12}}}$
    Exponential distribution${\\displaystyle f(x\\mid \\lambda )=\\lambda e^{-\\lambda x}}$${\\displaystyle {\\frac {1}{\\lambda }}}$${\\displaystyle {\\frac {1}{\\lambda ^{2}}}}$
    Poisson distribution${\\displaystyle f(k\\mid \\lambda )={\\frac {e^{-\\lambda }\\lambda ^{k}}{k!}}}$${\\displaystyle \\lambda }$${\\displaystyle \\lambda }$
    ' parts = self.rec.recognize(base_url, [(html_to_element(raw_html), html_to_element(raw_html))], raw_html) complex_table_tag = parts[1][0].xpath(f'.//{CCTag.CC_TABLE}') - assert complex_table_tag[0].text == '
    Name of the probability distributionProbability distribution functionMeanVariance
    Binomial distribution${\\displaystyle \\Pr \\,(X=k)={\\binom {n}{k}}p^{k}(1-p)^{n-k}}$ \n\n${\\displaystyle np}$ \n\n${\\displaystyle np(1-p)}$ \n\n
    Geometric distribution${\\displaystyle \\Pr \\,(X=k)=(1-p)^{k-1}p}$ \n\n${\\displaystyle {\\frac {1}{p}}}$ \n\n${\\displaystyle {\\frac {(1-p)}{p^{2}}}}$ \n\n
    Normal distribution${\\displaystyle f\\left(x\\mid \\mu ,\\sigma ^{2}\\right)={\\frac {1}{\\sqrt {2\\pi \\sigma ^{2}}}}e^{-{\\frac {(x-\\mu )^{2}}{2\\sigma ^{2}}}}}$ \n\n${\\displaystyle \\mu }$ \n\n${\\displaystyle \\sigma ^{2}}$ \n\n
    Uniform distribution (continuous)${\\displaystyle f(x\\mid a,b)={\\begin{cases}{\\frac {1}{b-a}}&{\\text{for }}a\\leq x\\leq b,\\\\[3pt]0&{\\text{for }}xb\\end{cases}}}$ \n\n${\\displaystyle {\\frac {a+b}{2}}}$ \n\n${\\displaystyle {\\frac {(b-a)^{2}}{12}}}$ \n\n
    Exponential distribution${\\displaystyle f(x\\mid \\lambda )=\\lambda e^{-\\lambda x}}$ \n\n${\\displaystyle {\\frac {1}{\\lambda }}}$ \n\n${\\displaystyle {\\frac {1}{\\lambda ^{2}}}}$ \n\n
    Poisson distribution${\\displaystyle f(k\\mid \\lambda )={\\frac {e^{-\\lambda }\\lambda ^{k}}{k!}}}$ \n\n${\\displaystyle \\lambda }$ \n\n${\\displaystyle \\lambda }$ \n\n
    ' + assert complex_table_tag[0].text == '
    Name of the probability distributionProbability distribution functionMeanVariance
    Binomial distribution${\\displaystyle \\Pr \\,(X=k)={\\binom {n}{k}}p^{k}(1-p)^{n-k}}$${\\displaystyle np}$${\\displaystyle np(1-p)}$
    Geometric distribution${\\displaystyle \\Pr \\,(X=k)=(1-p)^{k-1}p}$${\\displaystyle {\\frac {1}{p}}}$${\\displaystyle {\\frac {(1-p)}{p^{2}}}}$
    Normal distribution${\\displaystyle f\\left(x\\mid \\mu ,\\sigma ^{2}\\right)={\\frac {1}{\\sqrt {2\\pi \\sigma ^{2}}}}e^{-{\\frac {(x-\\mu )^{2}}{2\\sigma ^{2}}}}}$${\\displaystyle \\mu }$${\\displaystyle \\sigma ^{2}}$
    Uniform distribution (continuous)${\\displaystyle f(x\\mid a,b)={\\begin{cases}{\\frac {1}{b-a}}&{\\text{for }}a\\leq x\\leq b,\\\\[3pt]0&{\\text{for }}xb\\end{cases}}}$${\\displaystyle {\\frac {a+b}{2}}}$${\\displaystyle {\\frac {(b-a)^{2}}{12}}}$
    Exponential distribution${\\displaystyle f(x\\mid \\lambda )=\\lambda e^{-\\lambda x}}$${\\displaystyle {\\frac {1}{\\lambda }}}$${\\displaystyle {\\frac {1}{\\lambda ^{2}}}}$
    Poisson distribution${\\displaystyle f(k\\mid \\lambda )={\\frac {e^{-\\lambda }\\lambda ^{k}}{k!}}}$${\\displaystyle \\lambda }$${\\displaystyle \\lambda }$
    ' def test_table_involve_after_code(self): """test table involve code, code被提取出去了,过滤掉空的和坏的table.""" @@ -203,7 +203,7 @@ def test_nested_table1(self): parts = self.rec.recognize(base_url, [(html_to_element(raw_html), html_to_element(raw_html))], raw_html) assert len(parts) == 3 content = parts[2][0].text_content() - assert '\n\n
    \n\n
    Advanced Search
    \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n
    Home
    Browse
    Communities \n\n & Collections
    Issue Date
    Author
    Title
    Subject
    Sign on to:
    Receive email \n\n updates
    My APO\n\n
    authorized users
    Edit Profile
    About DSpace
    \n\n

    ANSTO Publications Online > \n\n Journal Publications > \n\n Journal Articles >

    \n\n
    Please use this identifier to cite or link to this item: http://apo.ansto.gov.au/dspace/handle/10238/2935
    \n\n
    ' in content + assert '
    Title:An investigation into transition metal ion binding properties of silk fibers and particles using radioisotopes.
    Authors:Rajkhowa, R\n\n
    Naik, R\n\n
    Wang, L\n\n
    Smith, SV\n\n
    Wang, X
    Keywords:Radioisotopes \n\n Transition Elements
    \n\nBinding Energy
    \n\nFibers
    \n\nAbsorption
    \n\nIons
    Issue Date:15-Mar-2011
    Publisher:Wiley-Blackwell
    Citation:
    Search APO
    Advanced Search
    Home
    Browse
    Communities\n\n& Collections
    Issue Date
    Author
    Title
    Subject
    Sign on to:
    Receive email\n\nupdates
    My APO\n\nauthorized users
    Edit Profile
    Help
    About DSpace
    ANSTO Publications Online >\n\nJournal Publications >\n\nJournal Articles >
    Please use this identifier to cite or link to this item: http://apo.ansto.gov.au/dspace/handle/10238/2935
    ' in content def test_nested_table2(self): """复杂嵌套表格.""" @@ -213,7 +213,7 @@ def test_nested_table2(self): parts = self.rec.recognize(base_url, [(html_to_element(raw_html), html_to_element(raw_html))], raw_html) assert len(parts) == 2 content = parts[1][0].text_content() - assert '
    Title:An investigation into transition metal ion binding properties of silk fibers and particles using radioisotopes.
    Authors:Rajkhowa, R\n\nNaik, R\n\nWang, L\n\nSmith, SV\n\nWang, X
    Keywords:Radioisotopes\n\nTransition Elements\n\nBinding Energy\n\nFibers\n\nAbsorption\n\nIons
    Issue Date:15-Mar-2011
    Publisher:Wiley-Blackwell
    jQuery(document).ready( function($) { if ($(\'#gateway-page\').length) { jQuery("body").addClass("fontyourface layout-one-sidebar layout-sidebar-first wide hff-43 pff-43 sff-43 slff-43 fixed-header-enabled slideout-side-right transparent-header-active path-node page-node-type-page"); }}); \n\n .acalog-custom .region--light-typography.region--dark-background a {font-weight:normal;} .acalog-custom ul.icons-list {margin:0} .acalog-custom ul.icons-list li {margin:5px 12px 5px 0;} #gateway-footer-copyright {background:#f6f8f9; font-family:\'Libre Franklin\', Helvetica Neue, Arial, sans-serif; padding:20px;} \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n window.dataLayer = window.dataLayer || []; function gtag(){dataLayer.push(arguments);} gtag(\'js\', new Date()); gtag(\'config\', \'G-L4J2WT8RM8\'); \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n Main Numbers: \n\n (615) 452-8600 \n\n (888) 335-8722 \n\n \n\n \n\n \n\n \n\n facebook \n\n instagram \n\n twitter \n\n youtube \n\n \n\n \n\n \n\n \n\n Campuses \n\n \n\n Gallatin \n\n Cookeville \n\n Livingston \n\n Springfield \n\n \n\n \n\n \n\n \n\n \n\n Academic Divisions \n\n \n\n Business & Technology \n\n Health Sciences \n\n Humanities & Fine Arts \n\n Mathematics & Science \n\n Nursing \n\n Social Science & Education \n\n \n\n \n\n \n\n \n\n \n\n Resources \n\n \n\n Accreditation \n\n Bookstore \n\n Campus Police \n\n Contact Us \n\n Employee Directory \n\n IT Help Desk \n\n Library \n\n Marketing & Communications
    Volunteer State Community College
    May 24, 2024
    2013-2014 VSCC Catalog
    Select a Catalog \n\n 2024-2025 Undergraduate Catalog \n\n 2023-2024 Undergraduate Catalog [ARCHIVED CATALOG] \n\n 2022-2023' in content + assert '
    jQuery(document).ready( function($) { if ($(\'#gateway-page\').length) { jQuery("body").addClass("fontyourface layout-one-sidebar layout-sidebar-first wide hff-43 pff-43 sff-43 slff-43 fixed-header-enabled slideout-side-right transparent-header-active path-node page-node-type-page"); }});\n\n.acalog-custom .region--light-typography.region--dark-background a {font-weight:normal;} .acalog-custom ul.icons-list {margin:0} .acalog-custom ul.icons-list li {margin:5px 12px 5px 0;} #gateway-footer-copyright {background:#f6f8f9; font-family:\'Libre Franklin\', Helvetica Neue, Arial, sans-serif; padding:20px;}\n\nwindow.dataLayer = window.dataLayer || []; function gtag(){dataLayer.push(arguments);} gtag(\'js\', new Date()); gtag(\'config\', \'G-L4J2WT8RM8\');\n\nMain Numbers:\n\n(615) 452-8600\n\n(888) 335-8722\n\nfacebook\n\ninstagram\n\ntwitter\n\nyoutube\n\nCampuses\n\nGallatin\n\nCookeville\n\nLivingston\n\nSpringfield\n\nAcademic Divisions\n\nBusiness & Technology\n\nHealth Sciences\n\nHumanities & Fine Arts\n\nMathematics & Science\n\nNursing\n\nSocial Science & Education\n\nResources\n\nAccreditation\n\nBookstore\n\nCampus Police\n\nContact Us\n\nEmployee Directory\n\nIT Help Desk\n\nLibrary\n\nMarketing & Communications
    Volunteer State Community College
    May 24, 2024
    2013-2014 VSCC Catalog
    Select a Catalog\n\n2024-2025 Undergraduate Catalog\n\n2023-2024 Undergraduate Catalog [ARCHIVED CATALOG]\n\n2022-2023' in content def test_nested_table3(self): """复杂嵌套表格.""" @@ -223,7 +223,7 @@ def test_nested_table3(self): parts = self.rec.recognize(base_url, [(html_to_element(raw_html), html_to_element(raw_html))], raw_html) assert len(parts) == 3 content = parts[2][0].text_content() - assert "
    What's New - Recent Content \n\n \n\n Members' Peak Updates \n\n Recent Trip Reports \n\n Recent Trip Report Comments \n\n Recently added Images \n\n Recently added Peaks \n\n List Completers \n\n \n\n Height List Completers \n\n Elevation List Completers \n\n County Summit Completers \n\n Wilderness Area Completers \n\n Member Profiles & Stats \n\n \n\n Member Profiles - Summary Stats \n\n Member Stats by Date Range & Charts \n\n Calendar Grid Completions \n\n Peaks Repeated \n\n Most Climbed Peaks \n\n Unclimbed Peaks \n\n US Peak Totals by State \n\n Member Tools \n\n \n\n Closest 50 Peaks by Member \n\n \n\n Closest 50 Map \n\n Closest 50 List \n\n Download your Peak List \n\n Search Trip Reports \n\n Unclimbed by Custom Group \n\n Export CSV, GPX, POI, TOPO! Files \n\n Elevation Threshold Progress Maps \n\n State Highest # Progress Maps \n\n County Summit Progress Maps \n\n Statewide County Summit Maps \n\n Prominence Progress Maps \n\n State Quads Progress Maps \n\n Quadrangle Lookup \n\n Distance Calculator \n\n Slope Angle Calculator \n\n Stats Category Leaders \n\n US Highest 1,000 Peaks \n\n \n\n US Highest 1,000 Member Area \n\n 1,000 Highest Peak List \n\n US Steepest 1,000 Peaks \n\n \n\n Steepness Member Area \n\n View 1,000 Steepest List \n\n US 2,000' Prominence \n\n \n\n US Prominence Member Area \n\n View US Prominence Peak Profiles \n\n View Member 5k Completion Maps \n\n Prominence Progress Maps \n\n US County Highpoints \n\n \n\n County Highpoints Member Area \n\n Highpoint Profiles - By State \n\n View Member's Completion Maps \n\n US State Highpoints \n\n \n\n US State Highpoints Member Area \n\n View State Highpoints List \n\n View Member's Completion Maps \n\n US Wilderness Area Peaks \n\n \n\n Wilderness Summits Member Area \n\n Wilderness Area Detail by State \n\n Wilderness HPs Member Area \n\n US National Park Peaks \n\n \n\n National Park Peaks Member Area \n\n National Park Peaks Detail by State" in content + assert "
    What's New - Recent Content\n\nMembers' Peak Updates\n\nRecent Trip Reports\n\nRecent Trip Report Comments\n\nRecently added Images\n\nRecently added Peaks\n\nList Completers\n\nHeight List Completers\n\nElevation List Completers\n\nCounty Summit Completers\n\nWilderness Area Completers\n\nMember Profiles & Stats\n\nMember Profiles - Summary Stats\n\nMember Stats by Date Range & Charts\n\nCalendar Grid Completions\n\nPeaks Repeated\n\nMost Climbed Peaks\n\nUnclimbed Peaks\n\nUS Peak Totals by State\n\nMember Tools\n\nClosest 50 Peaks by Member\n\nClosest 50 Map\n\nClosest 50 List\n\nDownload your Peak List\n\nSearch Trip Reports\n\nUnclimbed by Custom Group\n\nExport CSV, GPX, POI, TOPO! Files\n\nElevation Threshold Progress Maps\n\nState Highest # Progress Maps\n\nCounty Summit Progress Maps\n\nStatewide County Summit Maps\n\nProminence Progress Maps\n\nState Quads Progress Maps\n\nQuadrangle Lookup\n\nDistance Calculator\n\nSlope Angle Calculator\n\nStats Category Leaders\n\nUS Highest 1,000 Peaks\n\nUS Highest 1,000 Member Area\n\n1,000 Highest Peak List\n\nUS Steepest 1,000 Peaks\n\nSteepness Member Area\n\nView 1,000 Steepest List\n\nUS 2,000' Prominence\n\nUS Prominence Member Area\n\nView US Prominence Peak Profiles\n\nView Member 5k Completion Maps\n\nProminence Progress Maps\n\nUS County Highpoints\n\nCounty Highpoints Member Area\n\nHighpoint Profiles - By State\n\nView Member's Completion Maps\n\nUS State Highpoints\n\nUS State Highpoints Member Area\n\nView State Highpoints List\n\nView Member's Completion Maps\n\nUS Wilderness Area Peaks\n\nWilderness Summits Member Area\n\nWilderness Area Detail by State" in content def test_nested_table4(self): """复杂嵌套表格.""" @@ -233,4 +233,4 @@ def test_nested_table4(self): parts = self.rec.recognize(base_url, [(html_to_element(raw_html), html_to_element(raw_html))], raw_html) assert len(parts) == 4 content = parts[2][0].text_content() - assert '

    Molecular line emissions from pre main sequence objects

    Saraceno, P. ; Benedettini, M. ; Caux, E. ; Ceccarelli, M. C. ; Clegg, P. E. ; Correia, J. C. ; di Giorgio, A. M. ; Giannini, T. ; Griffin, M. J. ; Leeks, S. J. ; Liseau, R. ; Lorenzetti, D. ; Molinari, S. ; Nisini, B. ; Smith, H. ; Spinoglio, L. ; Tomassi, E. and White, G. J. (1997). \n\n Molecular line emissions from pre main sequence objects. \n\n In: The first ISO workshop on Analytical Spectroscopy , 6-8 October 1997, Madrid, Spain, p. 291. \n\n Full text available as:

    \n\n
    Preview
    \n\n
    PDF (Version of Record) - Requires a PDF viewer such asGSview ,Xpdf orAdobe Acrobat Reader\n\n
    Download (239Kb)
      \n\n\n\n
      URL:http://cdsads.u-strasbg.fr/abs/1997ESASP.419..291S
      Google Scholar:Look up in Google Scholar
      \n\n

      Abstract

      We present some preliminary results obtained with the LWS G.T. programme on the study of young objects driving molecular outflows. In particular, we discuss the importance of molecular emission in these sources and address the role of the H 2 0 cooling.

      \n\n\n\n\n\n\n\n\n\n\n\n\n\n' in content + assert '
      Item Type:Conference Item
      Copyright Holders:1997 European Space Agency
      Extra Information:Proceedings of the first ISO workshop on Analytical Spectroscopy, Madrid, Spain, 6-8 October 1997. Editors: A.M. Heras, K. Leech, N. R. Trams, and Michael Perry. Noordwijk, The Netherlands : ESA Publications Division, c1997. (ESA SP-419), 1997., pp.291-292
      Academic Unit/Department:Science > Physical Sciences
      Interdisciplinary Research Centre:Centre for Earth, Planetary, Space and Astronomical Research (CEPSAR)
      Item ID:32696
      Depositing User:Glenn White
      Molecular line emissions from pre main sequence objects\n\nSaraceno, P. ; Benedettini, M. ; Caux, E. ; Ceccarelli, M. C. ; Clegg, P. E. ; Correia, J. C. ; di Giorgio, A. M. ; Giannini, T. ; Griffin, M. J. ; Leeks, S. J. ; Liseau, R. ; Lorenzetti, D. ; Molinari, S. ; Nisini, B. ; Smith, H. ; Spinoglio, L. ; Tomassi, E. and White, G. J. (1997). Molecular line emissions from pre main sequence objects. In: The first ISO workshop on Analytical Spectroscopy , 6-8 October 1997, Madrid, Spain, p. 291.\n\nFull text available as:
      Preview
      PDF (Version of Record) - Requires a PDF viewer such asGSview ,Xpdf orAdobe Acrobat Reader\n\nDownload (239Kb)
      URL:http://cdsads.u-strasbg.fr/abs/1997ESASP.419..291S
      Google Scholar:Look up in Google Scholar
      Abstract\n\nWe present some preliminary results obtained with the LWS G.T. programme on the study of young objects driving molecular outflows. In particular, we discuss the importance of molecular emission in these sources and address the role of the H20 cooling.' in content diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_title.py b/tests/llm_web_kit/extractor/html/recognizer/test_title.py index a0b71fa4..2edfbeab 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_title.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_title.py @@ -18,17 +18,15 @@ def test_title_recognizer(title_recognizer): result = title_recognizer.recognize('http://www.baidu.com', [(html_content, html_content)], html_content) assert len(result) == 10 - assert element_to_html(result[0][0]) == """大模型好,大模型棒1""" - assert element_to_html(result[6][0]) == """大模型好,大模型棒5 大模型很棒""" + assert element_to_html(result[0][0]) == """大模型好,大模型棒1""" + assert element_to_html(result[6][0]) == """大模型好,大模型棒5 大模型很棒""" def test_title_tails_and_levels(title_recognizer): html_content = """

      TEST:import *TEST

      Tail

      aaa

      """ result = title_recognizer.recognize('http://www.baidu.com', [(html_content, html_content)], html_content) assert len(result) == 2 - assert element_to_html(result[0][0]) == '
      TEST: `import *` TEST
      ' + assert element_to_html(result[0][0]) == '
      TEST: `import *` TEST
      ' pass @@ -47,4 +45,4 @@ def test_title1(title_recognizer): with open(f'{os.path.dirname(os.path.abspath(__file__))}/assets/recognizer/title1.html', 'r') as file: html_content = file.read() result = title_recognizer.recognize('http://www.baidu.com', [(main_html_content, main_html_content)], html_content) - assert 'Compare vibrational frequencies for two calculations for C <sub>3</sub> (carbon trimer)' in element_to_html(result[1][0]) + assert 'Compare vibrational frequencies for two calculations for C<sub>3</sub> (carbon trimer)' in element_to_html(result[1][0]) diff --git a/tests/llm_web_kit/extractor/test_extractor_chain.py b/tests/llm_web_kit/extractor/test_extractor_chain.py index 5255efe9..533470ef 100644 --- a/tests/llm_web_kit/extractor/test_extractor_chain.py +++ b/tests/llm_web_kit/extractor/test_extractor_chain.py @@ -374,7 +374,7 @@ def test_table_include_math_p(self): result = chain.extract(input_data) content_list = result.get_content_list()._get_data() assert len(content_list[0]) == 17 - assert content_list[0][3]['content']['html'] == "
      Item Type:Conference Item
      Copyright Holders:1997 European Space Agency
      Extra Information:Proceedings of the first ISO workshop on Analytical Spectroscopy, Madrid, Spain, 6-8 October 1997. Editors: A.M. Heras, K. Leech, N. R. Trams, and Michael Perry. Noordwijk, The Netherlands : ESA Publications Division, c1997. (ESA SP-419), 1997., pp.291-292
      Academic Unit/Department:Science > Physical Sciences
      Interdisciplinary Research Centre:Centre for Earth, Planetary, Space and Astronomical Research (CEPSAR)
      Item ID:32696
      Depositing User:Glenn White
      up vote 17 down vote favorite \n\n 5
      I'm having problems with exercises on proving whether or not a given number is prime. Is $83^{27} + 1$ prime?
      prime-numbers factoring
      " + assert content_list[0][3]['content']['html'] == "
      up vote 17 down vote favorite\n\n5I'm having problems with exercises on proving whether or not a given number is prime. Is $83^{27} + 1$ prime?\n\nprime-numbers factoring
      " def test_table_include_math_p_2(self): """table包含math和其他内容.""" @@ -386,7 +386,7 @@ def test_table_include_math_p_2(self): md_content = result.get_content_list().to_nlp_md() # with open('output_badcase_p2.md', 'w', encoding='utf-8') as f: # f.write(md_content) - self.assertIn('
      单位换算:

      数学公式区块: $1\\text{km}={10}^{3}\\text{m}$

      ', md_content) + self.assertIn('
      长度质量时间
      单位换算:数学公式区块: $1\\text{km}={10}^{3}\\text{m}$', md_content) def test_clean_tags(self): """测试clean_tag的preExtractor是否生效.""" @@ -491,7 +491,7 @@ def test_more_nt(self): result_content_list = result.get_content_list()._get_data() result = result_content_list[0][2]['content']['html'] assert '\n\t' not in result - assert len(result) == 2205 + assert len(result) == 1893 def test_math_physicsforums(self): """测试math_physicsforums网页中数学公式是[tex]和[itex]包裹的,且中间还有
      标签分割.""" @@ -636,7 +636,7 @@ def test_table_lack_pre_content(self): input_data = DataJson(test_data) result = chain.extract(input_data) result_content_list = result.get_content_list()._get_data() - assert result_content_list[0][22]['content']['html'] == '
      长度质量时间
      お名前【必須】お名前(カナ)
      ご連絡先【いずれか必須】

      メールアドレス

      電話番号

      ※メール受信制限をしている方は、@chintai.co.jpからのメールを受信できるよう設定の変更をお願い致します。

      ' + assert result_content_list[0][22]['content']['html'] == '
      お名前【必須】お名前(カナ)
      ご連絡先【いずれか必須】
      メールアドレス電話番号
      ※メール受信制限をしている方は、@chintai.co.jpからのメールを受信できるよう設定の変更をお願い致します。
      ' def test_td_include_specila_symbol(self): """测试td包含特殊符号|,需要转义.""" From 3a6ff2b616304161e0dceab1e68b1ec79db23d0b Mon Sep 17 00:00:00 2001 From: houlinfeng Date: Thu, 11 Sep 2025 18:30:18 +0800 Subject: [PATCH 23/31] =?UTF-8?q?fix:=20=E5=8E=BB=E6=8E=89=E5=86=97?= =?UTF-8?q?=E4=BD=99=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- llm_web_kit/extractor/html/recognizer/table.py | 10 ++-------- llm_web_kit/libs/html_utils.py | 1 - 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/llm_web_kit/extractor/html/recognizer/table.py b/llm_web_kit/extractor/html/recognizer/table.py index a70e6290..0d6e2b8e 100644 --- a/llm_web_kit/extractor/html/recognizer/table.py +++ b/llm_web_kit/extractor/html/recognizer/table.py @@ -334,15 +334,9 @@ def __get_table_body(self, table_type, table_nest_level, table_root): for element in new_table_root.iter(): # 清理元素前后的空白(不影响.text和.tail的内容) if element.text is not None: - if element.tag in allow_tags: - element.text = re.sub(pattern, '\n\n', element.text.strip()) - else: - element.text = re.sub(pattern, '\n\n', element.text.lstrip()) + element.text = re.sub(pattern, '\n\n', element.text.strip()) if element.tail is not None: - if element.tag not in new_inline_tags: - element.tail = "\n\n" + re.sub(pattern, '\n\n', element.tail.lstrip()) - else: - element.tail = re.sub(pattern, '\n\n', element.tail.lstrip()).rstrip() + element.tail = re.sub(pattern, '\n\n', element.tail.lstrip()).rstrip() tree_html = element_to_html_unescaped(new_table_root) restore_tree_html = restore_sub_sup_from_text_regex(tree_html) diff --git a/llm_web_kit/libs/html_utils.py b/llm_web_kit/libs/html_utils.py index 21718f2a..05d6783b 100644 --- a/llm_web_kit/libs/html_utils.py +++ b/llm_web_kit/libs/html_utils.py @@ -465,7 +465,6 @@ def replacer(match): return 'tem_sup_start' if tag == '': return 'tem_sup_end' - return tag pattern = r']*>' return re.sub(pattern, replacer, html_content, 
flags=re.IGNORECASE) From ad22fb563fe1512bcb37f7131d1714df8cdd826c Mon Sep 17 00:00:00 2001 From: houlinfeng Date: Thu, 11 Sep 2025 22:30:58 +0800 Subject: [PATCH 24/31] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8D=E6=AE=B5?= =?UTF-8?q?=E8=90=BD=E7=BB=93=E5=B0=BE=E4=B8=BA=E6=8D=A2=E8=A1=8C=E6=97=B6?= =?UTF-8?q?datajson=E6=8B=BC=E6=8E=A5=E5=BC=82=E5=B8=B8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- llm_web_kit/extractor/html/recognizer/text.py | 4 +- llm_web_kit/input/datajson.py | 1 - .../good_data/html/para_br.html | 54312 ++++++++++++++++ .../good_data/html/para_br_main.html | 68 + .../extractor/html/recognizer/test_text.py | 24 + 5 files changed, 54406 insertions(+), 3 deletions(-) create mode 100644 tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/para_br.html create mode 100644 tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/para_br_main.html diff --git a/llm_web_kit/extractor/html/recognizer/text.py b/llm_web_kit/extractor/html/recognizer/text.py index 6dc7e346..4f99d9e6 100644 --- a/llm_web_kit/extractor/html/recognizer/text.py +++ b/llm_web_kit/extractor/html/recognizer/text.py @@ -267,10 +267,10 @@ def __get_paragraph_text_recusive(el: HtmlElement, text: str) -> str: return text if final := __get_paragraph_text_recusive(root, ''): - para_text.append({'c': final.replace('$br$', PARAGRAPH_SEPARATOR), 't': ParagraphTextType.TEXT}) + para_text.append({'c': final, 't': ParagraphTextType.TEXT}) for item in para_text: - item['c'] = restore_sub_sup_from_text_regex(item['c']) + item['c'] = restore_sub_sup_from_text_regex(item['c']).replace('$br$', PARAGRAPH_SEPARATOR) return para_text def __extract_paragraphs(self, root: HtmlElement): diff --git a/llm_web_kit/input/datajson.py b/llm_web_kit/input/datajson.py index e16d4ecf..11a4b5f1 100644 --- a/llm_web_kit/input/datajson.py +++ b/llm_web_kit/input/datajson.py @@ -519,7 +519,6 @@ def __join_one_para(self, para: list, 
exclude_inline_types: list = []) -> str: c = el['c'] if not c or not c.strip(): continue - c = c.strip() new_c = self.__escape_md_special_chars(c) # 转义特殊字符 one_para.append(new_c) elif el['t'] == ParagraphTextType.EQUATION_INLINE: diff --git a/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/para_br.html b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/para_br.html new file mode 100644 index 00000000..459eca7e --- /dev/null +++ b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/para_br.html @@ -0,0 +1,54312 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + How To Find Interquartile Range - UpSkillMe + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
      + +
      + +
      +
      +
      +
      +
      +
      + +

      How To Find Interquartile Range

      + +
      How To Find Interquartile Range +
      +

      Image Credits: Pinterest.

      +

      The interquartile range (IQR) is the difference between the third and the first quartiles. It is a + measure of dispersion. Quartiles are the values that divide a list of numbers into quarters. Here is how + to find the Interquartile Range. 

      +

      The interquartile range formula is the first quartile subtracted from the third quartile: 
                +                                           IQR = Q_{3}-Q_{1}

      + +

      How To Find Interquartile Range for + an Odd Set of Numbers

      +
        +
      1. Order the numbers from least to greatest.

        Given Data Set: 5, 7, 9, 3, 13, + 11, 17, 15, 21, 19, 23

        +

        Order Number: 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23.

        +
      2. +
      3. Find the median. The median is the data value in the middle of the set. The median in + the given data set is 13 since 13 is in the middle of the set.

        Median: 3, 5, 7, 9, + 11, 13, 15, 17, 19, 21, 23

        +
      4. +
      5. Place the parentheses around the numbers before and after the median. It makes + Q1 and Q3 easier to spot.

        (3, 5, 7, 9, 11), 13, (15, 17, 19, 21, 23) +

        +
      6. +
      7. Find the median of both the lower and upper half of the data. Think Q1 as a median in + the lower half of the data and Q3 as a median for the upper half of data.

        (3, 5, 7, + 9, 11) = 7 = Q1 and (15, 17, 19, 21, 23) = 19 = Q3 +

        +
      8. +
      9. Subtract Q1 from Q3 to find the interquartile range

        Q3 – Q1 = 19 – 7 = + 12

        +
      10. +
      +

      How To Find Interquartile Range + for an Even Set of Numbers

      +
        +
      1. Order the numbers from least to greatest.

        Given Data Set: 42, 51, 62, 47, + 38, 50, 54, 43

        +

        Order Number: 38, 42, 43, 47, 50, 51, 54, 62.

        +
      2. +
      3. Make a mark in the center of the data:

        Median: 38, 42, 43, 47,| + 50, 51, 54, 62

        +
      4. +
      5. Place the parentheses around the numbers before and after the median. It makes + Q1 and Q3 easier to spot.

        (38, 42, 43, 47),| (50, 51, 54, 62).

        +
      6. +
      7. Find the median. We have the even data sets so the median is the average of + the middle two numbers.

        (38, 42, 43, 47) = \frac{42+43}{2} = \frac{85}{2} = 42.5 = Q1 +

        +

        (50, 51, 54, 62) = \frac{51+54}{2} = \frac{105}{2} = 52.5 = Q3 +

        +
      8. +
      9. Subtract Q1 from Q3 to find the interquartile range.

        Q3 – Q1 = 52.5 – + 42.5 = 10

        +
      10. +
      + + +
      +
      +
      +
      +
      +
      +
      + + + + +
      +
      + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/para_br_main.html b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/para_br_main.html new file mode 100644 index 00000000..7a41eab0 --- /dev/null +++ b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/para_br_main.html @@ -0,0 +1,68 @@ +
      +

      Image Credits: Pinterest.

      +

      The interquartile range (IQR) is the difference between the third and the first quartiles. It is a + measure of dispersion. Quartiles are the values that divide a list of numbers into quarters. Here is how + to find the Interquartile Range. 

      +

      The interquartile range formula is the first quartile subtracted from the third quartile: 
                +                                           IQR = Q_{3}-Q_{1}

      + +

      How To Find Interquartile Range for + an Odd Set of Numbers

      +
        +
      1. Order the numbers from least to greatest.

        Given Data Set: 5, 7, 9, 3, 13, + 11, 17, 15, 21, 19, 23

        +

        Order Number: 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23.

        +
      2. +
      3. Find the median. The median is the data value in the middle of the set. The median in + the given data set is 13 since 13 is in the middle of the set.

        Median: 3, 5, 7, 9, + 11, 13, 15, 17, 19, 21, 23

        +
      4. +
      5. Place the parentheses around the numbers before and after the median. It makes + Q1 and Q3 easier to spot.

        (3, 5, 7, 9, 11), 13, (15, 17, 19, 21, 23) +

        +
      6. +
      7. Find the median of both the lower and upper half of the data. Think Q1 as a median in + the lower half of the data and Q3 as a median for the upper half of data.

        (3, 5, 7, + 9, 11) = 7 = Q1 and (15, 17, 19, 21, 23) = 19 = Q3 +

        +
      8. +
      9. Subtract Q1 from Q3 to find the interquartile range

        Q3 – Q1 = 19 – 7 = + 12

        +
      10. +
      +

      How To Find Interquartile Range + for an Even Set of Numbers

      +
        +
      1. Order the numbers from least to greatest.

        Given Data Set: 42, 51, 62, 47, + 38, 50, 54, 43

        +

        Order Number: 38, 42, 43, 47, 50, 51, 54, 62.

        +
      2. +
      3. Make a mark in the center of the data:

        Median: 38, 42, 43, 47,| + 50, 51, 54, 62

        +
      4. +
      5. Place the parentheses around the numbers before and after the median. It makes + Q1 and Q3 easier to spot.

        (38, 42, 43, 47),| (50, 51, 54, 62).

        +
      6. +
      7. Find the median. We have the even data sets so the median is the average of + the middle two numbers.

        (38, 42, 43, 47) = \frac{42+43}{2} = \frac{85}{2} = 42.5 = Q1 +

        +

        (50, 51, 54, 62) = \frac{51+54}{2} = \frac{105}{2} = 52.5 = Q3 +

        +
      8. +
      9. Subtract Q1 from Q3 to find the interquartile range.

        Q3 – Q1 = 52.5 – + 42.5 = 10

        +
      10. +
      + + +
      + + \ No newline at end of file diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_text.py b/tests/llm_web_kit/extractor/html/recognizer/test_text.py index 9dd16050..674b9dc3 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_text.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_text.py @@ -455,6 +455,30 @@ def test_Lack_content1(self): content_md = result.get_content_list().to_mm_md() assert 'a) Electronic mail: airegg.py90g@nctu.edu.tw .' in content_md + def test_para_br(self): + """ + 测试修复段落结尾为\n\n + Returns: + + """ + chain = ExtractSimpleFactory.create(load_pipe_tpl('noclip_html_test')) + self.assertIsNotNone(chain) + test_data = { + 'track_id': 'text_md', + 'dataset_name': 'text_md', + 'url': 'https://br.wikipedia.org/wiki/Faustina_an_Hena%C3%B1', + 'data_source_category': 'HTML', + 'path': 'para_br.html', + 'main_path': 'para_br_main.html', + 'file_bytes': 1000, + 'meta_info': {'input_datetime': '2020-01-01 00:00:00'}, + 'language': 'en' + } + input_data = DataJson(test_data) + result = chain.extract(input_data) + content_md = result.get_content_list().to_mm_md() + assert 'The interquartile range formula is the first quartile subtracted from the third quartile:\n\n $IQR = Q_{3}-Q_{1}' in content_md + def test_empty_string_fix(self): """ 测试修复字符串索引越界问题 - 当文本处理中出现空字符串时不应抛出IndexError From 9161e8fc7704027f297dc4a2777e0d3f9c6224e3 Mon Sep 17 00:00:00 2001 From: houlinfeng Date: Wed, 17 Sep 2025 11:34:41 +0800 Subject: [PATCH 25/31] =?UTF-8?q?fix:=20=E5=85=BC=E5=AE=B9=E6=AE=B5?= =?UTF-8?q?=E8=90=BD=E5=8F=AF=E8=83=BD=E4=B8=BANone=E7=9A=84=E6=83=85?= =?UTF-8?q?=E5=86=B5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- llm_web_kit/extractor/html/recognizer/text.py | 6 +- .../good_data/html/para_has_none.html | 1245 +++++++++++++++++ .../extractor/html/recognizer/test_text.py | 24 + 3 files changed, 1274 insertions(+), 1 deletion(-) create mode 100644 
tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/para_has_none.html diff --git a/llm_web_kit/extractor/html/recognizer/text.py b/llm_web_kit/extractor/html/recognizer/text.py index 4f99d9e6..db90f4a7 100644 --- a/llm_web_kit/extractor/html/recognizer/text.py +++ b/llm_web_kit/extractor/html/recognizer/text.py @@ -270,7 +270,11 @@ def __get_paragraph_text_recusive(el: HtmlElement, text: str) -> str: para_text.append({'c': final, 't': ParagraphTextType.TEXT}) for item in para_text: - item['c'] = restore_sub_sup_from_text_regex(item['c']).replace('$br$', PARAGRAPH_SEPARATOR) + if item['c'] is not None: + item['c'] = restore_sub_sup_from_text_regex(item['c']).replace('$br$', PARAGRAPH_SEPARATOR) + else: + item['c'] = "" + return para_text def __extract_paragraphs(self, root: HtmlElement): diff --git a/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/para_has_none.html b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/para_has_none.html new file mode 100644 index 00000000..de42f2bf --- /dev/null +++ b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/para_has_none.html @@ -0,0 +1,1245 @@ + + + Zk multiplecomposer using Spring use composer variable in zul - ZK Forum
      0

      Zk multiplecomposer using Spring use composer variable in zul

      + + + asked + + + + 2018-01-10 05:05:54 +0800 +

      javiut gravatar image javiut 委内瑞拉玻利瓦尔共和国国旗
      90 1 5

      I have a view that uses a composer. Inside it there are 5 columnChildren, and each one is the parent component of a macroComponent which also uses a composer.

      MyView.zul which has 5 columnChildrens

      <window id="win" apply='com.MyComposerController'>
      +<portallayout maximizedMode="whole">
      +    <portalchildren style="padding:5px" width="50%">
      +        <panel sclass="innerPanel" title="${c:l('zul.m.zeus.ac.001')}" border="normal">
      +            <panelchildren>
      +                <columnlayout>
      +                    <columnchildren id='columnChildrenForInfo'/><!-PARENT FOR THE MACROURI TEMPLATE->
      +                </columnlayout>
      +            </panelchildren>
      +        </panel>
      +        <panel sclass="innerPanel" title="${c:l('zul.m.zeus.ac.005')}" border="normal">
      +            <panelchildren>
      +                <columnlayout>
      +                    <columnchildren id='columnChildrenForPetition'/><!-PARENT FOR THE MACROURI TEMPLATE->
      +                </columnlayout>
      +            </panelchildren>
      +        </panel>
      +        <panel sclass="innerPanel" title="${c:l('zul.m.zeus.ac.009')}" border="normal">
      +            <panelchildren>
      +                <columnlayout>
      +                    <columnchildren id='columnChildrenForErrors'/><!-PARENT FOR THE MACROURI TEMPLATE->
      +                </columnlayout>
      +            </panelchildren>
      +        </panel>
      +    </portalchildren>
      +    <portalchildren style="padding:5px" width="50%">
      +        <panel sclass="innerPanel" title="${c:l('zul.m.zeus.ac.013')}" border="normal">
      +            <panelchildren>
      +                <columnlayout>
      +                    <columnchildren id='columnChildrenForEsperaNoAtendidos'/><!-PARENT FOR THE MACROURI TEMPLATE->
      +                </columnlayout>
      +            </panelchildren>
      +        </panel>
      +        <panel sclass="innerPanel" title="${c:l('zul.m.zeus.ac.018')}" border="normal">
      +            <panelchildren>
      +                <columnlayout>
      +                    <columnchildren id='columnChildrenForDocs'/><!-PARENT FOR THE MACROURI TEMPLATE->
      +                </columnlayout>
      +            </panelchildren>
      +        </panel>
      +    </portalchildren>
      +</portallayout>
      +

      </window>

      Later I create, in each columnChildren, the following macroUri template +MyViewDetail.zul

      <zk>
      +                            <panel>
      +                                <panelchildren>
      +<listbox id="results" style="margin-right:5px;margin-top:5px;margin-left:5px">
      +
      +<listhead>
      +   <listheader/>
      +   <listheader/>
      +   <listheader/>
      +</listhead>
      +<template name="model">
      +<listitem onClick='$composer.listitemOnClick(event);'>
      +    <listcell/>
      +    <listcell/>
      +    <listcell/>
      + </listitem>
      +</template>
      +</listbox>
      +</panelchildren>
      +</panel>
      +</zk>
      +

      this macroComponent is mapped in lang-addon.xml like this

      <component>
      +    <component-name>composerDetailController</component-name>
      +    <component-class>DetailController</component-class>
      +    <macro-uri>myViewDetail.zul</macro-uri>
      +</component>
      +

      Also, the detail composer is a Spring bean, which I declare like this

      <bean id="composerDetailControllerBean" class="...." autowire="byName" scope="prototype"/>
      +

      I bind it like this.

          final DetailController composerDetailControllerBean = (DetailController)SpringUtil.getBean("composerDetailControllerBean");//EXTRACT IT FROM SPRING CONTAINER
      +    composerDetailControllerBean.setParent(columnChildren);//EACH DETAIL CONTROLLER HAS A COLUMN CHILDREN AS A PARENT
      +    detail.afterCompose();
      +    detail.inicializa();//SOME BUSINESS LOGIC
      +

      Everything is working like a charm, but I have one problem. As you can see, the listitem in the template doesn't have an id, because if I give it an id a NONUNIQUEIDINSPACE exception is thrown. However, I need to listen to the onClick event on it, so I have tried the following

      onClick='$composer.listitemOnClick(event);'
      +

      But the $composer variable points to the first composer, i.e. the MyView.zul composer, whereas the method and the business logic are in the template's composer, DetailController. Note that I don't do this

      <div id="compA" apply="some.package.ComposerA">
      +   <div id="compB" apply="some.package.ComposerB"/> i dont use apply in the detailComposer i bind it through lang-addon.xml file
      +</div>
      +
      +<div apply="some.package.ComposerA">
      +   <custom-attributes composerName="myCompA"/>i dont use apply in the detailComposer i bind it through lang-addon.xml file i cannot use customAttributes
      +</div>
      +

      I mean I don't set an explicit name for the detail composer, and I don't know how to refer to it in the zul code

      I cannot map the composer name to a variable. The only thing I tried was to set a composerName using custom-attributes, following the custom-attributes approach mentioned above

      lang-addon.xml file

      <component>
      +    <component-name>composerDetailController</component-name>
      +    <component-class>DetailController</component-class>
      +    <macro-uri>myViewDetail.zul</macro-uri>
      +  <custom-attribute>
      +     <attribute-name>composerName</attribute-name>
      +     <attribute-value>composerDetail</attribute-value>
      +  </custom-attribute>
      +</component>
      +
      +'$composerDetail.listitemOnClick(event);'
      +

      Trying to map the custom attribute-name in the lang-addOn and use it in the zul but without success.

      I get Caused by: Sourced file: inline evaluation of: $composerDetail.listitemOnClick(event);'' : $composerDetail .listitemOnClick ( event ) $composerDetail.listitemOnClick(event);'' : Attempt to resolve method: listitemOnClick() on undefined variable or class name: $composerDetail : at Line: 14 : in file: inline evaluation of:

      I also tried this in MyViewDetail.zul

      <zk>
      +    <custom-attributes composerName="composerDetail"/>
      +</zk>
      +

      But that does not work either.

      In summary, I don't know how to set a name for the detailComposer, and so I cannot use it in MyViewDetail.zul

      delete flag offensive retag edit

      + + + 1 Answer + + +

      + Sort by » + oldest newest most voted
      0

      + + + answered + + + + 2018-01-10 16:03:56 +0800 +

      cor3000 gravatar image cor3000
      4406 2 7

      ZK Team

      I can't really follow your explanations... and maybe I don't have to. +I think you are trying to solve a problem that shouldn't be there in the first place.

      Wiring a listener to dynamically created items such as listitems is best done using event-forwarding to a non dynamic component such as the surrounding listbox.

      <listbox id="results" style="margin-right:5px;margin-top:5px;margin-left:5px">
      +   ...
      +   <template name="model">
      +      <!-- don't do this, this is zscript/beanshell -->
      +      <!-- <listitem onClick='$composer.listitemOnClick(event);'> -->
      +      <!-- use event forwarding -->
      +      <listitem forward='onClick=results.onListitemClick'>
      +   </template>
      +</listbox>
      +

      Then in your composer you can easily bind an event listener to the results listbox:

      @Listen("onListitemClick=#results")
      +public void listitemOnClick(ForwardEvent event) {
      +    //get the original MouseEvent and click target
      +    MouseEvent me = (MouseEvent) event.getOrigin();
      +    Listitem listitem = me.getTarget();
      +}
      +

      I think that should avoid the complications you are encountering.

      Robert

      + link + publish delete flag offensive edit
      + + + Your answer + + +
      Please start posting your answer anonymously - your answer will be saved within the current session and published after you log in or create a new account. Please try to give a substantial answer, for discussions, please use comments and please do remember to vote (after you log in)!

      + [hide preview] +
      Support Options
      • Email Support
      • Training
      • Consulting
      • Outsourcing
      Learn More
      \ No newline at end of file diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_text.py b/tests/llm_web_kit/extractor/html/recognizer/test_text.py index 674b9dc3..1d5c200b 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_text.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_text.py @@ -479,6 +479,30 @@ def test_para_br(self): content_md = result.get_content_list().to_mm_md() assert 'The interquartile range formula is the first quartile subtracted from the third quartile:\n\n $IQR = Q_{3}-Q_{1}' in content_md + def test_para_has_none(self): + """ + 兼容段落可能为None的情况 + Returns: + + """ + chain = ExtractSimpleFactory.create(load_pipe_tpl('noclip_html_test')) + self.assertIsNotNone(chain) + test_data = { + 'track_id': 'text_md', + 'dataset_name': 'text_md', + 'url': 'https://br.wikipedia.org/wiki/Faustina_an_Hena%C3%B1', + 'data_source_category': 'HTML', + 'path': 'para_has_none.html', + 'main_path': 'para_has_none.html', + 'file_bytes': 1000, + 'meta_info': {'input_datetime': '2020-01-01 00:00:00'}, + 'language': 'en' + } + input_data = DataJson(test_data) + result = chain.extract(input_data) + content_md = result.get_content_list().to_mm_md() + assert content_md + def test_empty_string_fix(self): """ 测试修复字符串索引越界问题 - 当文本处理中出现空字符串时不应抛出IndexError From f501ba4b44cc92073a45ff22698378959724dbd6 Mon Sep 17 00:00:00 2001 From: houlinfeng Date: Wed, 24 Sep 2025 15:08:32 +0800 Subject: [PATCH 26/31] =?UTF-8?q?fix:=201.=E4=BF=AE=E5=A4=8D=E6=A0=87?= =?UTF-8?q?=E9=A2=98=E4=B8=AD=E5=85=AC=E5=BC=8F=E5=9C=A8md=E6=B8=B2?= =?UTF-8?q?=E6=9F=93=E5=BC=82=E5=B8=B8=202.=E4=BF=AE=E5=A4=8D=E6=AD=A3?= =?UTF-8?q?=E5=88=99=E6=97=A0=E6=B3=95=E6=AD=A3=E7=A1=AE=E5=8C=B9=E9=85=8D?= =?UTF-8?q?$...$$...$$...$=E8=BF=99=E7=A7=8D=E8=BF=9E=E7=BB=AD=E5=85=AC?= =?UTF-8?q?=E5=BC=8F=203.=E4=BF=AE=E5=A4=8D=E5=A4=84=E7=90=86=E4=B8=8D?= =?UTF-8?q?=E9=97=AD=E5=90=88=E7=9A=84=E5=85=AC=E5=BC=8F=E9=80=BB=E8=BE=91?= =?UTF-8?q?=204.=E5=8E=BB=E6=8E=89$=E8=BD=AC=E4=B9=89?= 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../html/recognizer/cc_math/render/mathjax.py | 44 +- .../html/recognizer/cc_math/tag_math.py | 6 +- .../extractor/html/recognizer/title.py | 2 + llm_web_kit/input/datajson.py | 2 +- llm_web_kit/libs/html_utils.py | 76 ++++ requirements/runtime.txt | 1 + .../extractor/html/recognizer/test_math.py | 388 ++++++++++++++++++ .../extractor/html/recognizer/test_title.py | 18 + .../extractor/test_extractor_chain.py | 2 +- 9 files changed, 532 insertions(+), 7 deletions(-) diff --git a/llm_web_kit/extractor/html/recognizer/cc_math/render/mathjax.py b/llm_web_kit/extractor/html/recognizer/cc_math/render/mathjax.py index 06ac62a9..b5b1ac88 100644 --- a/llm_web_kit/extractor/html/recognizer/cc_math/render/mathjax.py +++ b/llm_web_kit/extractor/html/recognizer/cc_math/render/mathjax.py @@ -1,10 +1,13 @@ import re from typing import Any, Dict, List +from pylatexenc import latexwalker + from llm_web_kit.extractor.html.recognizer.cc_math.common import CCMATH from llm_web_kit.extractor.html.recognizer.cc_math.render.render import ( BaseMathRender, MathRenderType) -from llm_web_kit.libs.html_utils import HtmlElement, html_to_element +from llm_web_kit.libs.html_utils import (HtmlElement, SimpleMatch, + html_to_element) from llm_web_kit.libs.text_utils import normalize_ctl_text # 添加MATHJAX_OPTIONS变量定义 @@ -358,8 +361,43 @@ def _process_math_in_text( return text # 首先查找所有分隔符形式的匹配 - matches = list(pattern.finditer(text)) - + if not is_display: + matches = list(pattern.finditer(text)) + else: + matches = [] + # 独立公式环境 + independent_math_environments = [ + 'displaymath', + 'equation', + 'equation*', + 'align', + 'align*', + 'gather', + 'gather*', + 'multline', + 'multline*', + 'vmatrix', + 'Vmatrix' + ] + walker = latexwalker.LatexWalker(text) + nodelist, pos, len_ = walker.get_latex_nodes(pos=0) + for node in nodelist: + # 标准的数学环境 + if node.isNodeType(latexwalker.LatexMathNode): + # 判断是行内公式还是独立公式 
+ if node.displaytype == 'inline': + pass + elif node.displaytype == 'display': + fake_match = SimpleMatch(text, node.pos, node.len) + matches.append(fake_match) + # 其他数学环境 + if (node.isNodeType(latexwalker.LatexEnvironmentNode) and + hasattr(node, 'environmentname') and + node.environmentname in independent_math_environments): + fake_match = SimpleMatch(text, node.pos, node.len) + matches.append(fake_match) + tex_pattern = re.compile('\\[tex\\](.*?)\\[/tex\\]', re.DOTALL) + matches.extend(list(tex_pattern.finditer(text))) # 如果没有匹配到分隔符形式的公式,直接返回原文本 if not matches: return text diff --git a/llm_web_kit/extractor/html/recognizer/cc_math/tag_math.py b/llm_web_kit/extractor/html/recognizer/cc_math/tag_math.py index aed792c9..ef295565 100644 --- a/llm_web_kit/extractor/html/recognizer/cc_math/tag_math.py +++ b/llm_web_kit/extractor/html/recognizer/cc_math/tag_math.py @@ -7,8 +7,9 @@ from llm_web_kit.extractor.html.recognizer.cc_math.common import (CCMATH, MathType, text_strip) -from llm_web_kit.libs.html_utils import (build_cc_element, element_to_html, - replace_element) +from llm_web_kit.libs.html_utils import (build_cc_element, + check_and_balance_delimiters, + element_to_html, replace_element) def modify_tree(cm: CCMATH, math_render: str, o_html: str, node: HtmlElement, parent: HtmlElement): @@ -55,6 +56,7 @@ def modify_tree(cm: CCMATH, math_render: str, o_html: str, node: HtmlElement, pa # 处理未转义的%为\% if latex: latex = re.sub(r'(? 
li if el.tag == CCTag.CC_CODE_INLINE: blks.append(f'`{el.text}`') + elif el.tag == CCTag.CC_MATH_INLINE: + blks.append(f'${el.text.strip()}$') elif el.tag in ['br']: blks.extend(['$br$']) else: diff --git a/llm_web_kit/input/datajson.py b/llm_web_kit/input/datajson.py index 11a4b5f1..56ed8272 100644 --- a/llm_web_kit/input/datajson.py +++ b/llm_web_kit/input/datajson.py @@ -55,7 +55,7 @@ def __init__(self): self.__text_end = '\n' self.__list_item_start = '-' # md里的列表项前缀 self.__list_para_prefix = ' ' # 两个空格,md里的列表项非第一个段落的前缀:如果多个段落的情况,第二个以及之后的段落前缀 - self.__md_special_chars = ['#', '`', '$'] # TODO 拼装table的时候还应该转义掉|符号 + self.__md_special_chars = ['#', '`'] # TODO 拼装table的时候还应该转义掉|符号 self.__nodes_document_type = [DocElementType.MM_NODE_LIST, DocElementType.PARAGRAPH, DocElementType.LIST, DocElementType.SIMPLE_TABLE, DocElementType.COMPLEX_TABLE, DocElementType.TITLE, DocElementType.IMAGE, DocElementType.AUDIO, DocElementType.VIDEO, diff --git a/llm_web_kit/libs/html_utils.py b/llm_web_kit/libs/html_utils.py index 05d6783b..ff728654 100644 --- a/llm_web_kit/libs/html_utils.py +++ b/llm_web_kit/libs/html_utils.py @@ -483,6 +483,41 @@ def restore_sub_sup_from_text_regex(processed_content): return re.sub(pattern, lambda m: replacement_map[m.group(0)], processed_content) +def check_and_balance_delimiters(latex_str): + """检查LaTeX字符串中的left和right是否成对,并移除多余的left或right,但保留分隔符。 + + Args: + latex_str (str): 输入的LaTeX字符串 + + Returns: + str: 处理后的字符串,多余的left或right已被移除,分隔符保留。 + """ + stack = [] + to_remove = [] + pattern = re.compile(r'(\\left|\\right)(\\[{}()[\]]|\.|)') + + matches = list(pattern.finditer(latex_str)) + for match in matches: + start_idx = match.start() # 整个匹配的起始位置 + command = match.group(1) # 匹配到的命令,是 '\left' 或 '\right' + + if command == r'\left': + stack.append((start_idx, len(command))) + elif command == r'\right': + if stack: + stack.pop() + else: + to_remove.append((start_idx, len(command))) + + for left_start, left_cmd_len in stack: + to_remove.append((left_start, 
left_cmd_len)) + + for pos, cmd_len in sorted(to_remove, reverse=True): + latex_str = latex_str[:pos] + latex_str[pos + cmd_len:] + + return latex_str + + def get_plain_text_fast(html_source: str) -> str: """使用lxml快速获取html中的纯文本. @@ -506,3 +541,44 @@ def get_plain_text_fast(html_source: str) -> str: texts = doc.xpath('//text()') full_text = ' '.join(text.strip() for text in texts if text.strip()) return full_text + + +class SimpleMatch: + """一个简单的模拟 re.Match 的对象。 根据提供的原始字符串、起始位置和长度来模拟匹配结果。""" + def __init__(self, original_string, start_pos, length): + self._string = original_string + self._start = start_pos + self._end = start_pos + length + self._match = original_string[start_pos:self._end] # 提取匹配的字符串 + + def group(self, group_num=0): + if group_num == 0: + return self._match + else: + # 这个简单的模拟不支持捕获组,调用 group(>0) 可能抛出异常或返回 None + # 可以根据需要扩展以支持捕获组 + raise IndexError("no such group") + + def start(self, group_num=0): + if group_num == 0: + return self._start + else: + # 同样,不支持捕获组 + raise IndexError("no such group") + + def end(self, group_num=0): + if group_num == 0: + return self._end + else: + raise IndexError("no such group") + + def span(self, group_num=0): + return (self.start(group_num), self.end(group_num)) + + def groups(self): + # 返回空元组,因为不支持捕获组 + return () + + def groupdict(self): + # 返回空字典,因为不支持命名组 + return {} diff --git a/requirements/runtime.txt b/requirements/runtime.txt index ba3671e1..6e12c89f 100644 --- a/requirements/runtime.txt +++ b/requirements/runtime.txt @@ -26,6 +26,7 @@ py-asciimath==0.3.0 pyahocorasick==2.0.0 pydantic==2.11.7 pydantic-settings==2.10.1 +pylatexenc==2.10 python-dotenv==1.1.1 python-multipart==0.0.20 scikit-learn>=1.6.1 diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_math.py b/tests/llm_web_kit/extractor/html/recognizer/test_math.py index d25cc630..0e92cb92 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_math.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_math.py @@ -565,6 +565,394 @@ def 
test_to_content_list_node(self): ) self.assertIn('No ccmath element found in content', str(exc_info.exception)) + def test_fix_re_match(self): + """修复正则无法正确匹配$...$$...$$...$这种连续公式.""" + html_content = r"""

      $\newcommand{\cE}[2]{\mathbf{E}(#1\ |\ #2)}$$\newcommand{\cP}[2]{\mathbf{P}(#1\ |\ #2)}$$\renewcommand{\P}[1]{\mathbf{P}(#1)}$$\newcommand{\E}[1]{\mathbf{E}(#1)}$$\newcommand{\F}{\mathcal{F}}$$\newcommand{\G}{\mathcal{G}}$$\newcommand{\ind}[1]{\mathbf{1}_{#1}}$ + To motivate this note, I’ll pose the following problem:

      """ + parts = self.math_recognizer.recognize('https://www.baidu.com', [(html_to_element(html_content), html_to_element(html_content))], html_content) + assert element_to_html(parts[0][0]) == '

      \\newcommand{\\cE}[2]{\\mathbf{E}(#1\\ |\\ #2)}\\newcommand{\\cP}[2]{\\mathbf{P}(#1\\ |\\ #2)}\\renewcommand{\\P}[1]{\\mathbf{P}(#1)}\\newcommand{\\E}[1]{\\mathbf{E}(#1)}\\newcommand{\\F}{\\mathcal{F}}\\newcommand{\\G}{\\mathcal{G}}\\newcommand{\\ind}[1]{\\mathbf{1}_{#1}}\n To motivate this note, I’ll pose the following problem:

      ' + + def test_latex_not_closed(self): + """移除LaTeX字符多余的left或right.""" + html_content = """

      + + + + + { + + + + + + ∇ + + + ⋅ + + + + ( + + + + + R + + + 2 + + + + ∇ + + + φ + + + + ) + + + + = + + + 0 + + +   + + +   + + +   + + +   + + +   + + +   + + +   + + +   + + +   + + +   + + +   + + +   + + +   + + +   + + +   + + +   + + +   + + +   + + +   + + +   + + +   + + +   + + +   + + +   + + +   + + +   + + +   + + +   + + + ( + + + 6 + + + ) + + + + + + + D + + + + ( + + + + + r + + + + , + + + + k + + + + , + + + ω + + + + ) + + + + ≡ + + + + c + + + + 2 + + + + k + + + 0 + + + + + + + [ + + + + + k + + + 2 + + + + − + + + + + + ( + + + + n + + + + k + + + 0 + + + + + ) + + + + + 2 + + + + + ] + + + + + + + + W + + + + ( + + + + + r + + + + , + + + ω + + + + ) + + + + = + + + 0 + + +   + + +   + + +   + + +   + + +   + + +   + + +   + + +   + + +   + + +   + + + (7) + + + + + + + +

      """ + parts = self.math_recognizer.recognize('https://www.baidu.com', + [(html_to_element(html_content), html_to_element(html_content))], + html_content) + assert '\\{\\begin{array}{l}\\nabla \\cdot \\left({R}^{2}\\nabla \\phi \\right)=0\\text{ }\\text{ }\\text{ }\\text{ }\\text{ }\\text{ }\\text{ }\\text{ }\\text{ }\\text{ }\\text{ }\\text{ }\\text{ }\\text{ }\\text{ }\\text{ }\\text{\\hspace{0.17em}}\\text{\\hspace{0.17em}}\\text{\\hspace{0.17em}}\\text{\\hspace{0.17em}}\\text{\\hspace{0.17em}}\\text{\\hspace{0.17em}}\\text{\\hspace{0.17em}}\\text{ }\\text{ }\\text{\\hspace{0.17em}}\\text{\\hspace{0.17em}}\\text{\\hspace{0.17em}}\\left(6\\right)\\\\ D\\left(r,k,\\omega \\right)\\equiv \\frac{c}{2{k}_{0}}\\left[{k}^{2}-{\\left(n{k}_{0}\\right)}^{2}\\right]+W\\left(r,\\omega \\right)=0\\text{ }\\text{ }\\text{ }\\text{ }\\text{ }\\text{\\hspace{0.17em}}\\text{\\hspace{0.17em}}\\text{ }\\text{\\hspace{0.17em}}\\text{\\hspace{0.17em}}\\left(7\\right)\\end{array}' in element_to_html(parts[0][0]) + class TestCCMATH(unittest.TestCase): def setUp(self): diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_title.py b/tests/llm_web_kit/extractor/html/recognizer/test_title.py index 2edfbeab..b07ee1fe 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_title.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_title.py @@ -46,3 +46,21 @@ def test_title1(title_recognizer): html_content = file.read() result = title_recognizer.recognize('http://www.baidu.com', [(main_html_content, main_html_content)], html_content) assert 'Compare vibrational frequencies for two calculations for C<sub>3</sub> (carbon trimer)' in element_to_html(result[1][0]) + + +def test_title_has_formula(title_recognizer): + """ + 标题含有公式 + Args: + title_recognizer: + + Returns: + + """ + html_content = r"""

      + + Vector Meson Production in the Final State $K^+ K^- \pi^+ \pi^-$ Photon-photon Collisions + +

      """ + result = title_recognizer.recognize('http://www.baidu.com', [(html_content, html_content)], html_content) + assert r"Vector Meson Production in the Final State $K^+ K^- \pi^+ \pi^-$ Photon-photon Collisions" in element_to_html(result[0][0]) diff --git a/tests/llm_web_kit/extractor/test_extractor_chain.py b/tests/llm_web_kit/extractor/test_extractor_chain.py index 533470ef..361f6487 100644 --- a/tests/llm_web_kit/extractor/test_extractor_chain.py +++ b/tests/llm_web_kit/extractor/test_extractor_chain.py @@ -469,7 +469,7 @@ def test_math_dollar(self): input_data = DataJson(test_data) result = chain.extract(input_data) result_md = result.get_content_list().to_nlp_md() - self.assertIn(r'\$16.8 million', result_md) + self.assertIn(r'$16.8 million', result_md) def test_math_non_asciimath(self): """测试普通文本中的``不应该被识别为asciimath.""" From 21edfef48853517d60cb9e3e895bab88fbdca489 Mon Sep 17 00:00:00 2001 From: houlinfeng Date: Wed, 24 Sep 2025 15:27:22 +0800 Subject: [PATCH 27/31] =?UTF-8?q?=E5=8E=BB=E6=8E=89=E5=86=97=E4=BD=99?= =?UTF-8?q?=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- llm_web_kit/libs/html_utils.py | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/llm_web_kit/libs/html_utils.py b/llm_web_kit/libs/html_utils.py index ff728654..f9245a21 100644 --- a/llm_web_kit/libs/html_utils.py +++ b/llm_web_kit/libs/html_utils.py @@ -554,31 +554,15 @@ def __init__(self, original_string, start_pos, length): def group(self, group_num=0): if group_num == 0: return self._match - else: - # 这个简单的模拟不支持捕获组,调用 group(>0) 可能抛出异常或返回 None - # 可以根据需要扩展以支持捕获组 - raise IndexError("no such group") def start(self, group_num=0): if group_num == 0: return self._start - else: - # 同样,不支持捕获组 - raise IndexError("no such group") def end(self, group_num=0): if group_num == 0: return self._end - else: - raise IndexError("no such group") - - def span(self, group_num=0): - return (self.start(group_num), 
self.end(group_num)) def groups(self): # 返回空元组,因为不支持捕获组 return () - - def groupdict(self): - # 返回空字典,因为不支持命名组 - return {} From 1eb819888bf67b4071b2877108a19019e3163da5 Mon Sep 17 00:00:00 2001 From: houlinfeng Date: Sun, 28 Sep 2025 17:03:15 +0800 Subject: [PATCH 28/31] =?UTF-8?q?fix:=E7=BE=8E=E5=85=83=E7=AC=A6=E5=8F=B7?= =?UTF-8?q?=E4=B8=8E=E5=85=AC=E5=BC=8F=E5=85=B1=E5=AD=98=E6=97=B6=EF=BC=8C?= =?UTF-8?q?=E8=BD=AC=E4=B9=89=E7=BE=8E=E5=85=83=E7=AC=A6=E5=8F=B7?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../html/recognizer/cc_math/render/mathjax.py | 7 +++-- llm_web_kit/libs/html_utils.py | 28 +++++++++++++++++++ .../extractor/html/recognizer/test_math.py | 8 ++++++ .../extractor/test_extractor_chain.py | 2 +- 4 files changed, 41 insertions(+), 4 deletions(-) diff --git a/llm_web_kit/extractor/html/recognizer/cc_math/render/mathjax.py b/llm_web_kit/extractor/html/recognizer/cc_math/render/mathjax.py index b5b1ac88..60497452 100644 --- a/llm_web_kit/extractor/html/recognizer/cc_math/render/mathjax.py +++ b/llm_web_kit/extractor/html/recognizer/cc_math/render/mathjax.py @@ -7,7 +7,8 @@ from llm_web_kit.extractor.html.recognizer.cc_math.render.render import ( BaseMathRender, MathRenderType) from llm_web_kit.libs.html_utils import (HtmlElement, SimpleMatch, - html_to_element) + html_to_element, + optimized_dollar_matching) from llm_web_kit.libs.text_utils import normalize_ctl_text # 添加MATHJAX_OPTIONS变量定义 @@ -400,7 +401,7 @@ def _process_math_in_text( matches.extend(list(tex_pattern.finditer(text))) # 如果没有匹配到分隔符形式的公式,直接返回原文本 if not matches: - return text + return optimized_dollar_matching(text) # 从后向前处理,以避免位置偏移 result = text @@ -476,7 +477,7 @@ def _process_math_in_text( last_position = start_pos # 返回处理后的文本 - return result + return optimized_dollar_matching(result) def _is_escaped_delimiter(self, text: str, pos: int) -> bool: """检查分隔符是否被转义. 
diff --git a/llm_web_kit/libs/html_utils.py b/llm_web_kit/libs/html_utils.py index f9245a21..d1d0a648 100644 --- a/llm_web_kit/libs/html_utils.py +++ b/llm_web_kit/libs/html_utils.py @@ -566,3 +566,31 @@ def end(self, group_num=0): def groups(self): # 返回空元组,因为不支持捕获组 return () + + +def optimized_dollar_matching(text): + """美元金额匹配.""" + # 用于存储需要修改的位置和替换内容 + replacements = [] + + pattern = r'(?referring $18.1 to $18.1 the packet center $ p$ and apparently coinciding with the particle velocity

      """ + parts = self.math_recognizer.recognize('https://www.baidu.com', + [(html_to_element(html_content), html_to_element(html_content))], + html_content) + assert element_to_html(parts[0][0]) == '

      referring \\$18.1 to \\$18.1 the packet center p and apparently coinciding with the particle velocity

      ' + class TestCCMATH(unittest.TestCase): def setUp(self): diff --git a/tests/llm_web_kit/extractor/test_extractor_chain.py b/tests/llm_web_kit/extractor/test_extractor_chain.py index 361f6487..363b28ec 100644 --- a/tests/llm_web_kit/extractor/test_extractor_chain.py +++ b/tests/llm_web_kit/extractor/test_extractor_chain.py @@ -512,7 +512,7 @@ def test_table_only_include_tr(self): input_data = DataJson(test_data) result = chain.extract(input_data) result_md = result.get_content_list().to_nlp_md() - assert 'List Price: $11.80' in result_md + assert r'List Price: \$11.80' in result_md def test_table_only_one_td(self): """测试table只有一个td.""" From b451d42aacf62d7f458e19dd469e6f9ea474b114 Mon Sep 17 00:00:00 2001 From: houlinfeng Date: Thu, 9 Oct 2025 16:12:53 +0800 Subject: [PATCH 29/31] =?UTF-8?q?fix:=E6=B7=BB=E5=8A=A0=E5=85=AC=E5=BC=8F?= =?UTF-8?q?=E8=87=AA=E5=AE=9A=E4=B9=89=E8=BE=B9=E7=95=8C=E9=80=BB=E8=BE=91?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../html/recognizer/cc_math/render/mathjax.py | 21 +++++++++++++------ .../extractor/test_extractor_chain.py | 2 +- 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/llm_web_kit/extractor/html/recognizer/cc_math/render/mathjax.py b/llm_web_kit/extractor/html/recognizer/cc_math/render/mathjax.py index 60497452..7e06089e 100644 --- a/llm_web_kit/extractor/html/recognizer/cc_math/render/mathjax.py +++ b/llm_web_kit/extractor/html/recognizer/cc_math/render/mathjax.py @@ -250,10 +250,10 @@ def find_math(self, root: HtmlElement) -> None: display_patterns.append(pattern) # 添加对环境的支持 - if MATHJAX_OPTIONS.get('processEnvironments', True): - # 通用匹配任何 \begin{...}\end{...} 环境的模式,保证环境名称相同时才匹配 - env_pattern = r'(\\begin\{(?P[^}]+)\}.*?\\end\{(?P=env)\})' - display_patterns.append(env_pattern) + # if MATHJAX_OPTIONS.get('processEnvironments', True): + # # 通用匹配任何 \begin{...}\end{...} 环境的模式,保证环境名称相同时才匹配 + # env_pattern = r'(\\begin\{(?P[^}]+)\}.*?\\end\{(?P=env)\})' + 
# display_patterns.append(env_pattern) # 编译正则表达式 inline_pattern = re.compile('|'.join(inline_patterns), re.DOTALL) @@ -366,6 +366,7 @@ def _process_math_in_text( matches = list(pattern.finditer(text)) else: matches = [] + tem_match_display = [] # 独立公式环境 independent_math_environments = [ 'displaymath', @@ -389,16 +390,24 @@ def _process_math_in_text( if node.displaytype == 'inline': pass elif node.displaytype == 'display': + tem_match_display.append(node.latex_verbatim()) fake_match = SimpleMatch(text, node.pos, node.len) matches.append(fake_match) # 其他数学环境 if (node.isNodeType(latexwalker.LatexEnvironmentNode) and hasattr(node, 'environmentname') and node.environmentname in independent_math_environments): + tem_match_display.append(node.latex_verbatim()) fake_match = SimpleMatch(text, node.pos, node.len) matches.append(fake_match) - tex_pattern = re.compile('\\[tex\\](.*?)\\[/tex\\]', re.DOTALL) - matches.extend(list(tex_pattern.finditer(text))) + # 公式自定义边界逻辑 + new_display_patterns = [item for item in pattern.pattern.split('|') if "$" not in item] + custom_pattern = re.compile('|'.join(new_display_patterns), re.DOTALL) + custom_matches = list(custom_pattern.finditer(text)) + for item in custom_matches: + if item.group() not in tem_match_display: + matches.append(item) + tem_match_display.clear() # 如果没有匹配到分隔符形式的公式,直接返回原文本 if not matches: return optimized_dollar_matching(text) diff --git a/tests/llm_web_kit/extractor/test_extractor_chain.py b/tests/llm_web_kit/extractor/test_extractor_chain.py index 363b28ec..0ceedc0b 100644 --- a/tests/llm_web_kit/extractor/test_extractor_chain.py +++ b/tests/llm_web_kit/extractor/test_extractor_chain.py @@ -469,7 +469,7 @@ def test_math_dollar(self): input_data = DataJson(test_data) result = chain.extract(input_data) result_md = result.get_content_list().to_nlp_md() - self.assertIn(r'$16.8 million', result_md) + self.assertIn(r'\$16.8 million', result_md) def test_math_non_asciimath(self): """测试普通文本中的``不应该被识别为asciimath.""" From 
78bcd9f3d5505fefb7a610171e445032ca1fd9d6 Mon Sep 17 00:00:00 2001 From: houlinfeng Date: Thu, 9 Oct 2025 16:37:09 +0800 Subject: [PATCH 30/31] =?UTF-8?q?fix:=E6=B7=BB=E5=8A=A0=E6=B5=8B=E8=AF=95?= =?UTF-8?q?=E7=94=A8=E4=BE=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../html/recognizer/cc_math/render/mathjax.py | 29 ++++++++++--------- .../extractor/html/recognizer/test_math.py | 8 +++++ 2 files changed, 23 insertions(+), 14 deletions(-) diff --git a/llm_web_kit/extractor/html/recognizer/cc_math/render/mathjax.py b/llm_web_kit/extractor/html/recognizer/cc_math/render/mathjax.py index 7e06089e..2f1cc3ad 100644 --- a/llm_web_kit/extractor/html/recognizer/cc_math/render/mathjax.py +++ b/llm_web_kit/extractor/html/recognizer/cc_math/render/mathjax.py @@ -33,6 +33,21 @@ 'AM_CHTML' ] +# 独立公式环境 +independent_math_environments = [ + 'displaymath', + 'equation', + 'equation*', + 'align', + 'align*', + 'gather', + 'gather*', + 'multline', + 'multline*', + 'vmatrix', + 'Vmatrix' +] + class MathJaxRender(BaseMathRender): """MathJax渲染器实现.""" @@ -367,20 +382,6 @@ def _process_math_in_text( else: matches = [] tem_match_display = [] - # 独立公式环境 - independent_math_environments = [ - 'displaymath', - 'equation', - 'equation*', - 'align', - 'align*', - 'gather', - 'gather*', - 'multline', - 'multline*', - 'vmatrix', - 'Vmatrix' - ] walker = latexwalker.LatexWalker(text) nodelist, pos, len_ = walker.get_latex_nodes(pos=0) for node in nodelist: diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_math.py b/tests/llm_web_kit/extractor/html/recognizer/test_math.py index 23dd5d6d..79d010aa 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_math.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_math.py @@ -961,6 +961,14 @@ def test_dollar_sign(self): html_content) assert element_to_html(parts[0][0]) == '

      referring \\$18.1 to \\$18.1 the packet center p and apparently coinciding with the particle velocity

      ' + def test_begin_end(self): + """$begin end$的嵌套组合识别时候$$没有处理.""" + html_content = r"""

      $\begin{array}{1 1}(a)\;xy=c\\(b)\;xy=c^2\\(c)\;x^2+y^2=a^2\\(d)\;x^2+y^2=1\end{array}$

      """ + parts = self.math_recognizer.recognize('https://www.baidu.com', + [(html_to_element(html_content), html_to_element(html_content))], + html_content) + assert element_to_html(parts[0][0]) == '

      \\begin{array}{1 1}(a)\\;xy=c\\\\(b)\\;xy=c^2\\\\(c)\\;x^2+y^2=a^2\\\\(d)\\;x^2+y^2=1\\end{array}

      ' + class TestCCMATH(unittest.TestCase): def setUp(self): From ced9dc490846ba9d874227b56cc8f8901c0ae909 Mon Sep 17 00:00:00 2001 From: houlinfeng Date: Mon, 13 Oct 2025 15:31:28 +0800 Subject: [PATCH 31/31] =?UTF-8?q?fix:1.=E4=BF=AE=E5=A4=8D=E8=A1=A8?= =?UTF-8?q?=E6=A0=BC=E5=86=85=E5=85=AC=E5=BC=8F=E6=B2=A1=E6=9C=89=E8=A2=AB?= =?UTF-8?q?$$=E5=8C=85=E8=A3=B9=E6=B8=B2=E6=9F=93=E5=BC=82=E5=B8=B8=202.?= =?UTF-8?q?=E5=8E=BB=E6=8E=89=E5=8E=9F=E7=94=9FHTML=E5=85=B7=E6=9C=89hidde?= =?UTF-8?q?n=E5=B1=9E=E6=80=A7=E7=9A=84=E9=9A=90=E8=97=8F=E6=A0=87?= =?UTF-8?q?=E7=AD=BE?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- llm_web_kit/extractor/config.py | 1 + .../html/recognizer/cc_math/tag_math.py | 11 +- .../extractor/html/recognizer/table.py | 9 +- .../html/clean_invisible_elements.html | 3834 +++++++++++++++++ .../assets/recognizer/table_has_formula.html | 1964 +++++++++ .../extractor/html/recognizer/test_table.py | 8 + .../extractor/html/recognizer/test_text.py | 24 + 7 files changed, 5842 insertions(+), 9 deletions(-) create mode 100644 tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/clean_invisible_elements.html create mode 100644 tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_has_formula.html diff --git a/llm_web_kit/extractor/config.py b/llm_web_kit/extractor/config.py index c34075b0..821f26df 100644 --- a/llm_web_kit/extractor/config.py +++ b/llm_web_kit/extractor/config.py @@ -3,6 +3,7 @@ {'url': '*', 'tag': '//div[starts-with(@class, "advert") or starts-with(@name, "advert") or starts-with(@id, "advert")]'}, {'url': '*', 'tag': '//div[contains(@style, "display: none")]'}, {'url': '*', 'tag': '//div[contains(@style, "display:none")]'}, + {'url': '*', 'tag': '//*[@hidden and not(@hidden="false")]'}, {'url': 'stackexchange.com', 'tag': '//*[contains(@class, "d-none")]'}, # 任意标签,class包含d-none,限制在stackexchange.com网站 {'url': 'mathoverflow.net', 'tag': '//*[contains(@class, 
"d-none")]'}, # 任意标签,class包含d-none,限制在mathoverflow.net网站 {'url': 'blog.csdn.net', 'tag': '//span[contains(@class, "katex-html")]'}, # 仅针对 blog.csdn.net 域名,删除所有 class 包含 katex-html 的 标签及其内容(用于移除数学公式渲染的 HTML 部分) diff --git a/llm_web_kit/extractor/html/recognizer/cc_math/tag_math.py b/llm_web_kit/extractor/html/recognizer/cc_math/tag_math.py index ef295565..c7a50281 100644 --- a/llm_web_kit/extractor/html/recognizer/cc_math/tag_math.py +++ b/llm_web_kit/extractor/html/recognizer/cc_math/tag_math.py @@ -24,11 +24,12 @@ def modify_tree(cm: CCMATH, math_render: str, o_html: str, node: HtmlElement, pa if len(annotation_tags) > 0: annotation_tag = annotation_tags[0] text = annotation_tag.text - style_value = parent.get('style') - if style_value: - normalized_style_value = style_value.lower().strip().replace(' ', '').replace(';', '') - if 'display: none' in normalized_style_value: - parent.style = '' + if parent: + style_value = parent.get('style') + if style_value: + normalized_style_value = style_value.lower().strip().replace(' ', '').replace(';', '') + if 'display: none' in normalized_style_value: + parent.style = '' text = cm.wrap_math_md(text) if text: new_span = build_cc_element(html_tag_name=new_tag, text=text, tail=text_strip(node.tail), type=math_type, by=math_render, html=o_html) diff --git a/llm_web_kit/extractor/html/recognizer/table.py b/llm_web_kit/extractor/html/recognizer/table.py index 0d6e2b8e..239776cd 100644 --- a/llm_web_kit/extractor/html/recognizer/table.py +++ b/llm_web_kit/extractor/html/recognizer/table.py @@ -1,3 +1,4 @@ +import copy import re from typing import Any, List, Tuple @@ -212,16 +213,16 @@ def __get_table_type(self, child: HtmlElement) -> str: def __check_table_include_math_code(self, raw_html: HtmlElement): """检查table中的内容,包括普通文本、数学公式和代码.""" - math_raw_html = self._element_to_html(raw_html) - math_html = raw_html + tem_raw_html = copy.deepcopy(raw_html) + math_raw_html = self._element_to_html(tem_raw_html) math_res_parts = 
self.math_recognizer.recognize( base_url='', - main_html_lst=[(math_html, math_html)], + main_html_lst=[(tem_raw_html, tem_raw_html)], raw_html=math_raw_html ) result = [] if not math_res_parts: - if raw_html.tag == 'br' or raw_html.xpath('.//br'): + if tem_raw_html.tag == 'br' or tem_raw_html.xpath('.//br'): result.append("\n\n") for math_item in math_res_parts: ele_item = math_item[0] diff --git a/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/clean_invisible_elements.html b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/clean_invisible_elements.html new file mode 100644 index 00000000..5ca2c395 --- /dev/null +++ b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/clean_invisible_elements.html @@ -0,0 +1,3834 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Heylo FAQs | Frequently Asked Questions about Our Products & Service + – Heylo MG + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Skip to content + +
      + + +
      +
      + + +
      + + + + + + + +
      + + +
      + + +
      +
      +
      +

      + FAQs +

      +
      +
      +
      +
      + + +
      +
      + + + + + + + + + + + + + + + + + + + + + + + + + + +
      + + + + + + + + + + + + + + + + + + + + + + + + + + + +
      + + +
      + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
      + + + + + + + + + + + + + + + + + + + + + + + + + + + +
      + + +
      + + +
      + +
      + +
      +
      + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
      +
      + + + +
      +
      + diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_has_formula.html b/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_has_formula.html new file mode 100644 index 00000000..a050772e --- /dev/null +++ b/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_has_formula.html @@ -0,0 +1,1964 @@ + + + +Icentia11k Single Lead Continuous Raw Electrocardiogram Dataset v1.0 + + + + + + + + + + + + + + + + + +
      +
      +

      + Database + Open Access +

      +

      Icentia11k Single Lead Continuous Raw Electrocardiogram Dataset

      +

      + +Shawn Tan + ,  + + Satya Ortiz-Gagné + ,  + + Nicolas Beaudoin-Gagnon + ,  + + Pierre Fecteau + ,  + + Aaron Courville + ,  + + Yoshua Bengio + ,  + + Joseph Paul Cohen + +

      +

      Published: April 12, 2022. Version: 1.0

      +
      + +
      + +
      +
      +

      +When using this resource, please cite: +(show more options) +
      Tan, S., Ortiz-Gagné, S., Beaudoin-Gagnon, N., Fecteau, P., Courville, A., Bengio, Y., & Cohen, J. P. (2022). Icentia11k Single Lead Continuous Raw Electrocardiogram Dataset (version 1.0). PhysioNet. https://doi.org/10.13026/kk0v-r952.
      +

      + +Additionally, please cite the original publication: +

      Tan, S., Androz, G., Ortiz-Gagné, S., Chamseddine, A., Fecteau, P., Courville, A., Bengio, Y., & Cohen, J. P. (2021, October 21). Icentia11K: An Unsupervised Representation Learning Dataset for Arrhythmia Subtype Discovery. Computing in Cardiology Conference (CinC).

      +

      +Please include the standard citation for PhysioNet: +(show more options) +
      Goldberger, A., Amaral, L., Glass, L., Hausdorff, J., Ivanov, P. C., Mark, R., ... & Stanley, H. E. (2000). PhysioBank, PhysioToolkit, and PhysioNet: Components of a new research resource for complex physiologic signals. Circulation [Online]. 101 (23), pp. e215–e220.
      +

      + +
      +

      Abstract

      +

      This is a dataset of continuous raw electrocardiogram (ECG) signals containing 11 thousand patients and 2 billion labelled beats. The signals were recorded with a 16-bit resolution at 250Hz with a fixed chest mounted single lead probe for up to 2 weeks. The average age of the patient is 62.2±17.4 years. 20 technologists annotated each beat's type (Normal, Premature Atrial Contraction, Premature Ventricular contraction) and rhythm (Normal Sinusal Rhythm, Atrial Fibrillation, Atrial Flutter).

      +
      +

      Background

      +

      Arrhythmia detection is presently performed by cardiologists or technologists familiar with ECG readings. Recently, supervised machine learning has been successfully applied to perform automated detection of many arrhythmias [1,2,3,4]. However, there may be ECG anomalies that warrant further investigation because they do not fit the morphology of presently known arrhythmia. We seek to use a data driven approach to finding these differences that cardiologists have anecdotally observed. Existing public ECG datasets include the MIMIC-III Waveform Database and the ECG-ViEW II dataset [5,6]. Here we present Icentia11k, a dataset of continuous raw electrocardiogram (ECG) signals containing 11 thousand patients and 2 billion labelled beats.

      +
      +

      Methods

      +

      Our data is collected by the CardioSTAT, a single-lead heart monitor device from Icentia [7]. The raw signals were recorded with a 16-bit resolution and sampled at 250Hz with the CardioSTAT in a modified lead 1 position. The wealth of data this provides us can allow us to improve on the techniques currently used by the medical industry to process days worth of ECG data, and perhaps to catch anomalous events earlier than currently possible.

      +

      The dataset is processed from data provided by 11,000 patients who used the CardioSTAT device predominantly in Ontario, Canada, from various medical centers. While the device captures ECG data for up to two weeks, the majority of the prescribed duration of wear was one week.

      +

      The data is analyzed by Icentia's team of 20 technologists who performed annotation using proprietary analysis tools. Initial beat detection is performed automatically, and then a technologist analyses the record, labelling beat and rhythm types while performing a full disclosure analysis (i.e. they see the whole recording). Finally, the analysis is approved by a senior technologist before making it to the dataset.

      +

      The ethics institutional review boards at the Université de Montréal approved the study and release of data (CERSES-19-065-D).

      +
      +

      Data Description

      +

      We segment each patient record into segments of $2^{20}+1$ signal samples (≈70 minutes). This longer time context was informed by discussions with technologists: the context is useful for rhythm detection. We made it a power of two with a middle sample to allow for easier convolution stack parameterization. From this, we randomly select 50 of the segments and their respective labels from the list of segments. The goal here is to reduce the size of the dataset while maintaining a fair representation of each patient.

      +

      Data structure

      +

      The data is structured into patients and segments.

      +

      Patient level (3-14 days)

      +

      At this level, the data can capture features which vary in a systematic way and not isolated events, like the placement of the probes or patient specific noise.

      +

      Segment level (1,048,577 int16 samples, approximately 1 hour)

      +

      A cardiologist can look at a specific segment and identify patterns which indicate a disease while ignoring noise from the signal such as a unique signal amplitude. Looking at trends in the segment help to correctly identify arrhythmia as half an hour provides the necessary context to observe the stress of a specific activity.

      +

      Aggregate statistics

      +

      Aggregate statistics are shown below:

      + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
      Statistic# (units)
      Number of patients11,000
      Number of labeled beats2,774,054,987
      Sample rate250Hz
      Segment size $2^{20}+1$ = 1,048,577
      Total number of segments541,794 (not all patients have enough for 50 segments)
      +

      Beats are annotated in ann.symbols at the R timepoint in the QRS complex. The timepoint in the rec.signal for each annotation is found in ann.sample. The table below shows the counts for beats over the entire dataset. There are also annotations with a '+' symbol which just mean there is a rhythm annotation (next table).

      + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
      SymbolBeat DescriptionCount
      NNormal2,061,141,216
      SESSV (PAC): Premature or ectopic supraventricular beat, premature atrial contraction19,346,728
      VESV (PVC): Premature ventricular contraction, premature ventricular contraction17,203,041
      QUndefined: Unclassifiable beat676,364,002
      +

      Rhythms are annotated in ann.aux_note at each timepoint. For example a normal sinusal rhythm will start with a '(N' annotation and then end with a ')' annotation. The entire sequence in between is annotated as a normal sinusal rhythm. Below are the counts of each annotated region which could be one beat or thousands.

      + + + + + + + + + + + + + + + + + + + + + + + + + +
      SymbolRhythm LabelsCount
      (N ... )NSR (Normal sinusal rhythm)16,083,158
      (AFIB ... )AFib (Atrial fibrillation)848,564
      (AFL ... )AFlutter (Atrial flutter)313,251
      +

      Details on how the dataset is encoded into wfdb format are available on GitHub [8].

      +
      +

      Usage Notes

      +

      By releasing this dataset, we seek to enable the research community to develop better models for detection of arrhythmia and related heart disease. The dataset is described in more detail in our accompanying paper [9], which also describes our efforts to evaluate existing models for classification of arrhythmia. Code for working with the data, including executable notebooks, is available on GitHub [8].

      +

      Example code

      +

      To look at patient 9000 and segment 0 the filename would be: p09/p09000/p09000_s00 and it can be loaded using wfdb as follows:

      +
      import wfdb
      +patient_id=9000
      +segment_id=0
      +start=2000
      +length=1024
      +filename = f'{data_path}/p0{str(patient_id)[:1]}/p{patient_id:05d}/p{patient_id:05d}_s{segment_id:02d}'
      +rec = wfdb.rdrecord(filename, sampfrom=start, sampto=start+length)
      +ann = wfdb.rdann(filename, "atr", sampfrom=start, sampto=start+length, shift_samps=True)
      +wfdb.plot_wfdb(rec, ann, plot_sym=True, figsize=(15,4));
      +

      Limitations

      +

      It should be noted that since the people who wear the device are patients, the dataset does not represent a true random sample of the global population. For one, the average age of the patient is 62.2±17.4 years of age. Furthermore, whereas the CardioSTAT can be worn by any patient, it is mostly used as a third-line exam, so the majority of records in the dataset exhibit arrhythmias. No particular effort was made on patient selection, except that data collection was conducted over the years 2017 and 2018.

      +
      +

      Release Notes

      +

      Version 1.0: First release on PhysioNet. Prior to this release data was made available on AcademicTorrents [10].

      +
      +

      Ethics

      +

      The authors declare no ethics concerns. The ethics institutional review boards at the University of Montreal approved the study and release of data (#CERSES-19-065-D).

      +
      +

      Acknowledgements

      +

      We thank Leon Glass, Yannick Le Devehat, Germain Ethier, and Margaux Luck, Kris Sankaran, and Gabriele Prato for useful discussions. This work is partially funded by a grant from Icentia, Fonds de Recherche en Santé du Québec, and the Institut de valorisation des donnees (IVADO). This work utilized the supercomputing facilities managed by Compute Canada and Calcul Quebec. We thank AcademicTorrents.com for making data available for our research.

      +
      +

      Conflicts of Interest

      +

      None

      +
      +

      References

      +
        +
      1. Hannun AY, Rajpurkar P, Haghpanahi M, Tison GH, Bourn C, Turakhia MP, Ng AY. Cardiologist-level arrhythmia detection and classification in ambulatory electrocardiograms using a deep neural network. Nature Medicine 2019
      2. +
      3. Yıldırım O, Pławiak P, Tan RS, Acharya UR. Arrhythmia detection using deep convolutional neural network with long duration ecg signals. Computers in biology and medicine 2018.
      4. +
      5. Minchole A, Rodriguez B. Artificial intelligence for the electrocardiogram. Nature Medicine 1 2019.
      6. +
      7. Porumb M, Iadanza E, Massaro S, Pecchia L. A convolutional neural network approach to detect congestive heart failure. Biomedical Signal Processing and Control 2020.
      8. +
      9. Johnson, A., Pollard, T., & Mark, R. (2016). MIMIC-III Clinical Database (version 1.4). PhysioNet. https://doi.org/10.13026/C2XW26.
      10. +
      11. Kim YG, Shin D, Park MY, Lee S, Jeon MS, Yoon D, Park RW. ECG-ViEW II, a freely accessible electrocardiogram database. PloS one 2017.
      12. +
      13. Icentia website. https://www.icentia.com/
      14. +
      15. Icentia11k project on GitHub. https://github.com/shawntan/icentia-ecg/tree/master/physionet
      16. +
      17. Tan, S., Androz, G., Ortiz-Gagné, S., Chamseddine, A., Fecteau, P., Courville, A., Bengio, Y., & Cohen, J. P. (2021, October 21). Icentia11K: An Unsupervised Representation Learning Dataset for Arrhythmia Subtype Discovery. Computing in Cardiology Conference (CinC). https://www.cinc.org/2021/Program/accepted/229_Preprint.pdf
      18. +
      19. Icentia11k Dataset on Academic Torrents. https://academictorrents.com/details/af04abfe9a3c96b30e5dd029eb185e19a7055272
      20. +
      +
      +
      + + +
      + +
      +
      Share
      +
      + + + + + +
      +
      +
      +
      Access
      +
      +

      +Access Policy: +
      + Anyone can access the files, as long as they conform to the terms of the specified license. +

      +

      +License (for files): +
      +Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International Public License
      +

      +
      +
      +
      +
      Discovery
      + +
      +
      +
      Corresponding Author
      +
      +You must be logged in to view the contact information. +
      +
      +
      + +
      +

      Files

      +

      Total uncompressed size: 1.1 TB. + +

      Access the files
      +
        +
      • Download the ZIP file (188.3 GB)
      • +
      • Download the files using your terminal:
        wget -r -N -c -np https://physionet.org/files/icentia11k-continuous-ecg/1.0/
      • +
      +

      Visualize waveforms

      +
      +
      + Folder Navigation: + <base>/p06/p06073 +
      + ++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
      NameSizeModified
      Parent Directory
      RECORDS + +(download) + +550 B2022-02-13
      p06073_s00.atr + +(download) + +36.9 KB2022-02-13
      p06073_s00.dat + +(download) + +2 MB2022-02-13
      p06073_s00.hea + +(download) + +87 B2022-02-13
      p06073_s01.atr + +(download) + +43.0 KB2022-02-13
      p06073_s01.dat + +(download) + +2 MB2022-02-13
      p06073_s01.hea + +(download) + +86 B2022-02-13
      p06073_s02.atr + +(download) + +36.8 KB2022-02-13
      p06073_s02.dat + +(download) + +2 MB2022-02-13
      p06073_s02.hea + +(download) + +85 B2022-02-13
      p06073_s03.atr + +(download) + +35.4 KB2022-02-13
      p06073_s03.dat + +(download) + +2 MB2022-02-13
      p06073_s03.hea + +(download) + +87 B2022-02-13
      p06073_s04.atr + +(download) + +43.1 KB2022-02-13
      p06073_s04.dat + +(download) + +2 MB2022-02-13
      p06073_s04.hea + +(download) + +87 B2022-02-13
      p06073_s05.atr + +(download) + +41.4 KB2022-02-13
      p06073_s05.dat + +(download) + +2 MB2022-02-13
      p06073_s05.hea + +(download) + +87 B2022-02-13
      p06073_s06.atr + +(download) + +35.3 KB2022-02-13
      p06073_s06.dat + +(download) + +2 MB2022-02-13
      p06073_s06.hea + +(download) + +87 B2022-02-13
      p06073_s07.atr + +(download) + +37.3 KB2022-02-13
      p06073_s07.dat + +(download) + +2 MB2022-02-13
      p06073_s07.hea + +(download) + +86 B2022-02-13
      p06073_s08.atr + +(download) + +39.6 KB2022-02-13
      p06073_s08.dat + +(download) + +2 MB2022-02-13
      p06073_s08.hea + +(download) + +86 B2022-02-13
      p06073_s09.atr + +(download) + +43.7 KB2022-02-13
      p06073_s09.dat + +(download) + +2 MB2022-02-13
      p06073_s09.hea + +(download) + +87 B2022-02-13
      p06073_s10.atr + +(download) + +40.8 KB2022-02-13
      p06073_s10.dat + +(download) + +2 MB2022-02-13
      p06073_s10.hea + +(download) + +87 B2022-02-13
      p06073_s11.atr + +(download) + +44.8 KB2022-02-13
      p06073_s11.dat + +(download) + +2 MB2022-02-13
      p06073_s11.hea + +(download) + +87 B2022-02-13
      p06073_s12.atr + +(download) + +35.0 KB2022-02-13
      p06073_s12.dat + +(download) + +2 MB2022-02-13
      p06073_s12.hea + +(download) + +87 B2022-02-13
      p06073_s13.atr + +(download) + +37.7 KB2022-02-13
      p06073_s13.dat + +(download) + +2 MB2022-02-13
      p06073_s13.hea + +(download) + +87 B2022-02-13
      p06073_s14.atr + +(download) + +41.7 KB2022-02-13
      p06073_s14.dat + +(download) + +2 MB2022-02-13
      p06073_s14.hea + +(download) + +86 B2022-02-13
      p06073_s15.atr + +(download) + +39.8 KB2022-02-13
      p06073_s15.dat + +(download) + +2 MB2022-02-13
      p06073_s15.hea + +(download) + +86 B2022-02-13
      p06073_s16.atr + +(download) + +35.5 KB2022-02-13
      p06073_s16.dat + +(download) + +2 MB2022-02-13
      p06073_s16.hea + +(download) + +87 B2022-02-13
      p06073_s17.atr + +(download) + +37.3 KB2022-02-13
      p06073_s17.dat + +(download) + +2 MB2022-02-13
      p06073_s17.hea + +(download) + +87 B2022-02-13
      p06073_s18.atr + +(download) + +40.2 KB2022-02-13
      p06073_s18.dat + +(download) + +2 MB2022-02-13
      p06073_s18.hea + +(download) + +87 B2022-02-13
      p06073_s19.atr + +(download) + +36.9 KB2022-02-13
      p06073_s19.dat + +(download) + +2 MB2022-02-13
      p06073_s19.hea + +(download) + +87 B2022-02-13
      p06073_s20.atr + +(download) + +37.6 KB2022-02-13
      p06073_s20.dat + +(download) + +2 MB2022-02-13
      p06073_s20.hea + +(download) + +86 B2022-02-13
      p06073_s21.atr + +(download) + +33.3 KB2022-02-13
      p06073_s21.dat + +(download) + +2 MB2022-02-13
      p06073_s21.hea + +(download) + +85 B2022-02-13
      p06073_s22.atr + +(download) + +36.4 KB2022-02-13
      p06073_s22.dat + +(download) + +2 MB2022-02-13
      p06073_s22.hea + +(download) + +87 B2022-02-13
      p06073_s23.atr + +(download) + +36.3 KB2022-02-13
      p06073_s23.dat + +(download) + +2 MB2022-02-13
      p06073_s23.hea + +(download) + +86 B2022-02-13
      p06073_s24.atr + +(download) + +40.0 KB2022-02-13
      p06073_s24.dat + +(download) + +2 MB2022-02-13
      p06073_s24.hea + +(download) + +86 B2022-02-13
      p06073_s25.atr + +(download) + +37.8 KB2022-02-13
      p06073_s25.dat + +(download) + +2 MB2022-02-13
      p06073_s25.hea + +(download) + +85 B2022-02-13
      p06073_s26.atr + +(download) + +41.0 KB2022-02-13
      p06073_s26.dat + +(download) + +2 MB2022-02-13
      p06073_s26.hea + +(download) + +87 B2022-02-13
      p06073_s27.atr + +(download) + +40.7 KB2022-02-13
      p06073_s27.dat + +(download) + +2 MB2022-02-13
      p06073_s27.hea + +(download) + +86 B2022-02-13
      p06073_s28.atr + +(download) + +39.8 KB2022-02-13
      p06073_s28.dat + +(download) + +2 MB2022-02-13
      p06073_s28.hea + +(download) + +86 B2022-02-13
      p06073_s29.atr + +(download) + +33.9 KB2022-02-13
      p06073_s29.dat + +(download) + +2 MB2022-02-13
      p06073_s29.hea + +(download) + +86 B2022-02-13
      p06073_s30.atr + +(download) + +40.9 KB2022-02-13
      p06073_s30.dat + +(download) + +2 MB2022-02-13
      p06073_s30.hea + +(download) + +87 B2022-02-13
      p06073_s31.atr + +(download) + +41.7 KB2022-02-13
      p06073_s31.dat + +(download) + +2 MB2022-02-13
      p06073_s31.hea + +(download) + +86 B2022-02-13
      p06073_s32.atr + +(download) + +40.3 KB2022-02-13
      p06073_s32.dat + +(download) + +2 MB2022-02-13
      p06073_s32.hea + +(download) + +87 B2022-02-13
      p06073_s33.atr + +(download) + +40.3 KB2022-02-13
      p06073_s33.dat + +(download) + +2 MB2022-02-13
      p06073_s33.hea + +(download) + +87 B2022-02-13
      p06073_s34.atr + +(download) + +39.0 KB2022-02-13
      p06073_s34.dat + +(download) + +2 MB2022-02-13
      p06073_s34.hea + +(download) + +87 B2022-02-13
      p06073_s35.atr + +(download) + +36.7 KB2022-02-13
      p06073_s35.dat + +(download) + +2 MB2022-02-13
      p06073_s35.hea + +(download) + +86 B2022-02-13
      p06073_s36.atr + +(download) + +42.0 KB2022-02-13
      p06073_s36.dat + +(download) + +2 MB2022-02-13
      p06073_s36.hea + +(download) + +87 B2022-02-13
      p06073_s37.atr + +(download) + +40.6 KB2022-02-13
      p06073_s37.dat + +(download) + +2 MB2022-02-13
      p06073_s37.hea + +(download) + +85 B2022-02-13
      p06073_s38.atr + +(download) + +40.9 KB2022-02-13
      p06073_s38.dat + +(download) + +2 MB2022-02-13
      p06073_s38.hea + +(download) + +87 B2022-02-13
      p06073_s39.atr + +(download) + +40.5 KB2022-02-13
      p06073_s39.dat + +(download) + +2 MB2022-02-13
      p06073_s39.hea + +(download) + +87 B2022-02-13
      p06073_s40.atr + +(download) + +39.5 KB2022-02-13
      p06073_s40.dat + +(download) + +2 MB2022-02-13
      p06073_s40.hea + +(download) + +87 B2022-02-13
      p06073_s41.atr + +(download) + +40.7 KB2022-02-13
      p06073_s41.dat + +(download) + +2 MB2022-02-13
      p06073_s41.hea + +(download) + +87 B2022-02-13
      p06073_s42.atr + +(download) + +40.1 KB2022-02-13
      p06073_s42.dat + +(download) + +2 MB2022-02-13
      p06073_s42.hea + +(download) + +87 B2022-02-13
      p06073_s43.atr + +(download) + +39.6 KB2022-02-13
      p06073_s43.dat + +(download) + +2 MB2022-02-13
      p06073_s43.hea + +(download) + +87 B2022-02-13
      p06073_s44.atr + +(download) + +42.0 KB2022-02-13
      p06073_s44.dat + +(download) + +2 MB2022-02-13
      p06073_s44.hea + +(download) + +87 B2022-02-13
      p06073_s45.atr + +(download) + +37.5 KB2022-02-13
      p06073_s45.dat + +(download) + +2 MB2022-02-13
      p06073_s45.hea + +(download) + +86 B2022-02-13
      p06073_s46.atr + +(download) + +41.6 KB2022-02-13
      p06073_s46.dat + +(download) + +2 MB2022-02-13
      p06073_s46.hea + +(download) + +87 B2022-02-13
      p06073_s47.atr + +(download) + +40.3 KB2022-02-13
      p06073_s47.dat + +(download) + +2 MB2022-02-13
      p06073_s47.hea + +(download) + +87 B2022-02-13
      p06073_s48.atr + +(download) + +36.0 KB2022-02-13
      p06073_s48.dat + +(download) + +2 MB2022-02-13
      p06073_s48.hea + +(download) + +86 B2022-02-13
      p06073_s49.atr + +(download) + +36.2 KB2022-02-13
      p06073_s49.dat + +(download) + +2 MB2022-02-13
      p06073_s49.hea + +(download) + +87 B2022-02-13
      + +
      +
      +
      +
      + + +
      +
      +
      +
      +

      PhysioNet is a repository of freely-available medical research data, managed by the MIT Laboratory for Computational Physiology.

      +

      Supported by the National Institute of Biomedical Imaging and Bioengineering (NIBIB) under NIH grant number R01EB030362.

      +

      For more accessibility options, see the MIT Accessibility Page.

      +

      Back to top

      +
      +
      +
      +
      + + + + + + + \ No newline at end of file diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_table.py b/tests/llm_web_kit/extractor/html/recognizer/test_table.py index c80d3568..eb58bb98 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_table.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_table.py @@ -234,3 +234,11 @@ def test_nested_table4(self): assert len(parts) == 4 content = parts[2][0].text_content() assert '
      Molecular line emissions from pre main sequence objects\n\nSaraceno, P. ; Benedettini, M. ; Caux, E. ; Ceccarelli, M. C. ; Clegg, P. E. ; Correia, J. C. ; di Giorgio, A. M. ; Giannini, T. ; Griffin, M. J. ; Leeks, S. J. ; Liseau, R. ; Lorenzetti, D. ; Molinari, S. ; Nisini, B. ; Smith, H. ; Spinoglio, L. ; Tomassi, E. and White, G. J. (1997). Molecular line emissions from pre main sequence objects. In: The first ISO workshop on Analytical Spectroscopy , 6-8 October 1997, Madrid, Spain, p. 291.\n\nFull text available as:
      Preview
      PDF (Version of Record) - Requires a PDF viewer such asGSview ,Xpdf orAdobe Acrobat Reader\n\nDownload (239Kb)
      URL:http://cdsads.u-strasbg.fr/abs/1997ESASP.419..291S
      Google Scholar:Look up in Google Scholar
      Abstract\n\nWe present some preliminary results obtained with the LWS G.T. programme on the study of young objects driving molecular outflows. In particular, we discuss the importance of molecular emission in these sources and address the role of the H20 cooling.' in content + + def test_table_has_formula(self): + """表格含有公式.""" + raw_html_path = base_dir.joinpath('assets/recognizer/table_has_formula.html') + base_url = 'https://en.m.wikipedia.org/wiki/Variance' + raw_html = raw_html_path.read_text(encoding='utf-8') + parts = self.rec.recognize(base_url, [(html_to_element(raw_html), html_to_element(raw_html))], raw_html) + assert parts[5][0].text_content() == '
      Item Type:Conference Item
      Copyright Holders:1997 European Space Agency
      Extra Information:Proceedings of the first ISO workshop on Analytical Spectroscopy, Madrid, Spain, 6-8 October 1997. Editors: A.M. Heras, K. Leech, N. R. Trams, and Michael Perry. Noordwijk, The Netherlands : ESA Publications Division, c1997. (ESA SP-419), 1997., pp.291-292
      Academic Unit/Department:Science > Physical Sciences
      Interdisciplinary Research Centre:Centre for Earth, Planetary, Space and Astronomical Research (CEPSAR)
      Item ID:32696
      Depositing User:Glenn White
      Statistic# (units)
      Number of patients11,000
      Number of labeled beats2,774,054,987
      Sample rate250Hz
      Segment size$2^{20}+1$= 1,048,577
      Total number of segments541,794 (not all patients have enough for 50 segments)
      ' diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_text.py b/tests/llm_web_kit/extractor/html/recognizer/test_text.py index 1d5c200b..0c0f8db2 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_text.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_text.py @@ -503,6 +503,30 @@ def test_para_has_none(self): content_md = result.get_content_list().to_mm_md() assert content_md + def test_clean_invisible_elements(self): + """ + 清理隐藏标签 + Returns: + + """ + chain = ExtractSimpleFactory.create(load_pipe_tpl('noclip_html_test')) + self.assertIsNotNone(chain) + test_data = { + 'track_id': 'text_md', + 'dataset_name': 'text_md', + 'url': 'https://br.wikipedia.org/wiki/Faustina_an_Hena%C3%B1', + 'data_source_category': 'HTML', + 'path': 'clean_invisible_elements.html', + 'main_path': 'clean_invisible_elements.html', + 'file_bytes': 1000, + 'meta_info': {'input_datetime': '2020-01-01 00:00:00'}, + 'language': 'en' + } + input_data = DataJson(test_data) + result = chain.extract(input_data) + content_md = result.get_content_list().to_mm_md() + assert "Choosing a selection results in a full page refresh." not in content_md + def test_empty_string_fix(self): """ 测试修复字符串索引越界问题 - 当文本处理中出现空字符串时不应抛出IndexError