From 823c8fc8797efd5ab7acc315b36365678ccdca3f Mon Sep 17 00:00:00 2001 From: linfeng <56671143+LollipopsAndWine@users.noreply.github.com> Date: Fri, 12 Sep 2025 09:56:58 +0800 Subject: [PATCH 01/11] Merge pull request #561 from LollipopsAndWine/dev MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fix: 修复段落结尾为换行时datajson拼接异常 --- llm_web_kit/extractor/html/recognizer/text.py | 4 +- llm_web_kit/input/datajson.py | 1 - .../good_data/html/para_br.html | 54312 ++++++++++++++++ .../good_data/html/para_br_main.html | 68 + .../extractor/html/recognizer/test_text.py | 24 + 5 files changed, 54406 insertions(+), 3 deletions(-) create mode 100644 tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/para_br.html create mode 100644 tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/para_br_main.html diff --git a/llm_web_kit/extractor/html/recognizer/text.py b/llm_web_kit/extractor/html/recognizer/text.py index 6dc7e346..4f99d9e6 100644 --- a/llm_web_kit/extractor/html/recognizer/text.py +++ b/llm_web_kit/extractor/html/recognizer/text.py @@ -267,10 +267,10 @@ def __get_paragraph_text_recusive(el: HtmlElement, text: str) -> str: return text if final := __get_paragraph_text_recusive(root, ''): - para_text.append({'c': final.replace('$br$', PARAGRAPH_SEPARATOR), 't': ParagraphTextType.TEXT}) + para_text.append({'c': final, 't': ParagraphTextType.TEXT}) for item in para_text: - item['c'] = restore_sub_sup_from_text_regex(item['c']) + item['c'] = restore_sub_sup_from_text_regex(item['c']).replace('$br$', PARAGRAPH_SEPARATOR) return para_text def __extract_paragraphs(self, root: HtmlElement): diff --git a/llm_web_kit/input/datajson.py b/llm_web_kit/input/datajson.py index e16d4ecf..11a4b5f1 100644 --- a/llm_web_kit/input/datajson.py +++ b/llm_web_kit/input/datajson.py @@ -519,7 +519,6 @@ def __join_one_para(self, para: list, exclude_inline_types: list = []) -> str: c = el['c'] if not c or not c.strip(): continue - c = c.strip() new_c = self.__escape_md_special_chars(c) # 转义特殊字符 one_para.append(new_c) elif el['t'] == ParagraphTextType.EQUATION_INLINE: diff --git a/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/para_br.html b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/para_br.html new file mode 100644 index 00000000..459eca7e --- /dev/null +++ b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/para_br.html @@ -0,0 +1,54312 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + How To Find Interquartile Range - UpSkillMe + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+
+ +
+ +
+
+
+
+
+
+ +

How To Find Interquartile Range

+ +
How To Find Interquartile Range +
+

Image Credits: Pinterest.

+

The interquartile range (IQR) is the difference between the third and the first quartiles. It is a + measure of dispersion. Quartiles are the values that divide a list of numbers into quarters. Here is how + to find the Interquartile Range. 

+

The interquartile range formula is the first quartile subtracted from the third quartile: 
          +                                           IQR = Q_{3}-Q_{1}

+ +

How To Find Interquartile Range for + an Odd Set of Numbers

+
    +
  1. Order the numbers from least to greatest.

    Given Data Set: 5, 7, 9, 3, 13, + 11, 17, 15, 21, 19, 23

    +

    Order Number: 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23.

    +
  2. +
  3. Find the median. The median is the data value in the middle of the set. The median in + the given data set is 13 since 13 is in the middle of the set.

    Median: 3, 5, 7, 9, + 11, 13, 15, 17, 19, 21, 23

    +
  4. +
  5. Place the parentheses around the numbers before and after the median. It makes + Q1 and Q3 easier to spot.

    (3, 5, 7, 9, 11), 13, (15, 17, 19, 21, 23) +

    +
  6. +
  7. Find the median of both the lower and upper half of the data. Think Q1 as a median in + the lower half of the data and Q3 as a median for the upper half of data.

    (3, 5, 7, + 9, 11) = 7 = Q1 and (15, 17, 19, 21, 23) = 19 = Q3 +

    +
  8. +
  9. Subtract Q1 from Q3 to find the interquartile range

    Q3 – Q1 = 19 – 7 = + 12

    +
  10. +
+

How To Find Interquartile Range + for an Even Set of Numbers

+
    +
  1. Order the numbers from least to greatest.

    Given Data Set: 42, 51, 62, 47, + 38, 50, 54, 43

    +

    Order Number: 38, 42, 43, 47, 50, 51, 54, 62.

    +
  2. +
  3. Make a mark in the center of the data:

    Median: 38, 42, 43, 47,| + 50, 51, 54, 62

    +
  4. +
  5. Place the parentheses around the numbers before and after the median. It makes + Q1 and Q3 easier to spot.

    (38, 42, 43, 47),| (50, 51, 54, 62).

    +
  6. +
  7. Find the median. We have the even data sets so the median is the average of + the middle two numbers.

    (38, 42, 43, 47) = \frac{42+43}{2} = \frac{85}{2} = 42.5 = Q1 +

    +

    (50, 51, 54, 62) = \frac{51+54}{2} = \frac{105}{2} = 52.5 = Q3 +

    +
  8. +
  9. Subtract Q1 from Q3 to find the interquartile range.

    Q3 – Q1 = 52.5 – + 42.5 = 10

    +
  10. +
+ + +
+
+
+
+
+
+
+ + + + +
+
+ + + + + + + + + + + + + + + \ No newline at end of file diff --git a/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/para_br_main.html b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/para_br_main.html new file mode 100644 index 00000000..7a41eab0 --- /dev/null +++ b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/para_br_main.html @@ -0,0 +1,68 @@ +
+

Image Credits: Pinterest.

+

The interquartile range (IQR) is the difference between the third and the first quartiles. It is a + measure of dispersion. Quartiles are the values that divide a list of numbers into quarters. Here is how + to find the Interquartile Range. 

+

The interquartile range formula is the first quartile subtracted from the third quartile: 
          +                                           IQR = Q_{3}-Q_{1}

+
+

Contents

+ +
+

How To Find Interquartile Range for + an Odd Set of Numbers

+
    +
  1. Order the numbers from least to greatest.

    Given Data Set: 5, 7, 9, 3, 13, + 11, 17, 15, 21, 19, 23

    +

    Order Number: 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23.

    +
  2. +
  3. Find the median. The median is the data value in the middle of the set. The median in + the given data set is 13 since 13 is in the middle of the set.

    Median: 3, 5, 7, 9, + 11, 13, 15, 17, 19, 21, 23

    +
  4. +
  5. Place the parentheses around the numbers before and after the median. It makes + Q1 and Q3 easier to spot.

    (3, 5, 7, 9, 11), 13, (15, 17, 19, 21, 23) +

    +
  6. +
  7. Find the median of both the lower and upper half of the data. Think Q1 as a median in + the lower half of the data and Q3 as a median for the upper half of data.

    (3, 5, 7, + 9, 11) = 7 = Q1 and (15, 17, 19, 21, 23) = 19 = Q3 +

    +
  8. +
  9. Subtract Q1 from Q3 to find the interquartile range

    Q3 – Q1 = 19 – 7 = + 12

    +
  10. +
+

How To Find Interquartile Range + for an Even Set of Numbers

+
    +
  1. Order the numbers from least to greatest.

    Given Data Set: 42, 51, 62, 47, + 38, 50, 54, 43

    +

    Order Number: 38, 42, 43, 47, 50, 51, 54, 62.

    +
  2. +
  3. Make a mark in the center of the data:

    Median: 38, 42, 43, 47,| + 50, 51, 54, 62

    +
  4. +
  5. Place the parentheses around the numbers before and after the median. It makes + Q1 and Q3 easier to spot.

    (38, 42, 43, 47),| (50, 51, 54, 62).

    +
  6. +
  7. Find the median. We have the even data sets so the median is the average of + the middle two numbers.

    (38, 42, 43, 47) = \frac{42+43}{2} = \frac{85}{2} = 42.5 = Q1 +

    +

    (50, 51, 54, 62) = \frac{51+54}{2} = \frac{105}{2} = 52.5 = Q3 +

    +
  8. +
  9. Subtract Q1 from Q3 to find the interquartile range.

    Q3 – Q1 = 52.5 – + 42.5 = 10

    +
  10. +
+ + +
+ + \ No newline at end of file diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_text.py b/tests/llm_web_kit/extractor/html/recognizer/test_text.py index 9dd16050..674b9dc3 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_text.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_text.py @@ -455,6 +455,30 @@ def test_Lack_content1(self): content_md = result.get_content_list().to_mm_md() assert 'a) Electronic mail: airegg.py90g@nctu.edu.tw .' in content_md + def test_para_br(self): + """ + 测试修复段落结尾为\n\n + Returns: + + """ + chain = ExtractSimpleFactory.create(load_pipe_tpl('noclip_html_test')) + self.assertIsNotNone(chain) + test_data = { + 'track_id': 'text_md', + 'dataset_name': 'text_md', + 'url': 'https://br.wikipedia.org/wiki/Faustina_an_Hena%C3%B1', + 'data_source_category': 'HTML', + 'path': 'para_br.html', + 'main_path': 'para_br_main.html', + 'file_bytes': 1000, + 'meta_info': {'input_datetime': '2020-01-01 00:00:00'}, + 'language': 'en' + } + input_data = DataJson(test_data) + result = chain.extract(input_data) + content_md = result.get_content_list().to_mm_md() + assert 'The interquartile range formula is the first quartile subtracted from the third quartile:\n\n $IQR = Q_{3}-Q_{1}' in content_md + def test_empty_string_fix(self): """ 测试修复字符串索引越界问题 - 当文本处理中出现空字符串时不应抛出IndexError From bec5aa428e02379f2fbe2ff2e3f61cf3af37a72d Mon Sep 17 00:00:00 2001 From: renpengli01 Date: Wed, 17 Sep 2025 14:30:43 +0800 Subject: [PATCH 02/11] fix: add post_main_html_processer_demo.py & update post main html (#563) --- .../html/post_main_html_processer/post_llm.py | 63 ++-- .../post_main_html_processer/post_mapping.py | 15 +- .../post_main_html_processer_demo.py | 274 ++++++++++++++++++ .../post_main_html_processer/test_post_llm.py | 54 +++- 4 files changed, 371 insertions(+), 35 deletions(-) create mode 100644 tests/llm_web_kit/extractor/html/post_main_html_processer/post_main_html_processer_demo.py diff --git a/llm_web_kit/extractor/html/post_main_html_processer/post_llm.py b/llm_web_kit/extractor/html/post_main_html_processer/post_llm.py index d30120cf..c1713276 100644 --- a/llm_web_kit/extractor/html/post_main_html_processer/post_llm.py +++ b/llm_web_kit/extractor/html/post_main_html_processer/post_llm.py @@ -35,19 +35,8 @@ def clean_json_data(md_text: str) -> dict: return json_data -def get_llm_response(input_lst: List, api_key: str, url: str, model_name: str, is_llm: bool = True, - max_retry: int = 3) -> dict: - if not is_llm: - post_llm_response = base_dir.joinpath('assets/llm_res.json').read_text(encoding='utf-8') - return json_loads(post_llm_response) - - from openai import BadRequestError, OpenAI - - client = OpenAI( - # 若没有配置环境变量,请用百炼API Key将下行替换为:api_key='sk-xxx', - api_key=api_key, - base_url=url, - ) +def request_model(input_lst: List, api_key: str, url: str, model_name: str) -> str: + from openai import OpenAI html_count = len(input_lst) eg_input_lst, eg_output = __get_eg_data() @@ -87,23 +76,45 @@ def get_llm_response(input_lst: List, api_key: str, url: str, model_name: str, i ################ Now return your result:""" + + client = OpenAI( + # 若没有配置环境变量,请用百炼API Key将下行替换为:api_key='sk-xxx', + api_key=api_key, + base_url=url, + ) + + completion = client.chat.completions.create( + model=model_name, + # 此处以qwen-plus为例,可按需更换模型名称。模型列表:https://help.aliyun.com/zh/model-studio/getting-started/models + extra_body={'enable_thinking': False}, + messages=[ + {'role': 'system', 'content': 'You are a HTML semantics expert.'}, + {'role': 'user', 'content': prompt} + + ], + ) + + rtn = completion.model_dump_json() + return rtn + + +def get_llm_response(input_lst: List, api_key: str, url: str, model_name: str, is_llm: bool = True, + max_retry: int = 3) -> dict: + from openai import BadRequestError + + if not is_llm: + post_llm_response = base_dir.joinpath('assets/llm_res.json').read_text(encoding='utf-8') + return json_loads(post_llm_response) + try: - completion = client.chat.completions.create( - model=model_name, - # 此处以qwen-plus为例,可按需更换模型名称。模型列表:https://help.aliyun.com/zh/model-studio/getting-started/models - extra_body={'enable_thinking': False}, - messages=[ - {'role': 'system', 'content': 'You are a HTML semantics expert.'}, - {'role': 'user', 'content': prompt} - - ], - ) - - rtn = completion.model_dump_json() + rtn = request_model(input_lst, api_key, url, model_name) rtn_detail = json_loads(rtn) post_llm_response = rtn_detail.get('choices', [])[0].get('message', {}).get('content', '') return clean_json_data(post_llm_response) - except BadRequestError: + except BadRequestError as e: + if 'Range of input length should be' in str(e): + if len(input_lst) > 1 and max_retry > 0: + return get_llm_response(input_lst[:len(input_lst) - 1], api_key, url, model_name, is_llm, max_retry - 1) return None except Exception: if max_retry > 0: diff --git a/llm_web_kit/extractor/html/post_main_html_processer/post_mapping.py b/llm_web_kit/extractor/html/post_main_html_processer/post_mapping.py index 27dc876b..0f29f51a 100644 --- a/llm_web_kit/extractor/html/post_main_html_processer/post_mapping.py +++ b/llm_web_kit/extractor/html/post_main_html_processer/post_mapping.py @@ -8,11 +8,11 @@ def mapping_html_by_rules(html_content: str, xpaths_to_remove: List[dict]) -> tuple[str, bool]: """从HTML中删除指定XPath匹配的所有节点. - 参数: + Args: html_content (str): 原始HTML内容 xpaths_to_remove (list): 需要删除的元素列表 - 返回: + Returns: str: 处理后的HTML bool: 推广是否成功 """ @@ -22,10 +22,11 @@ def mapping_html_by_rules(html_content: str, xpaths_to_remove: List[dict]) -> tu is_success = False tree = html_to_element(html_content) + # 获取所有元素节点 + all_elements = [element for element in tree.iter() if isinstance(element, html.HtmlElement)] + for remove_node in xpaths_to_remove: xpath_content = remove_node.get('xpath') - # 获取所有元素节点 - all_elements = [element for element in tree.iter() if isinstance(element, html.HtmlElement)] for node in tree.xpath(xpath_content): # 获取节点内容占比 content_rate = __calculate_node_content_ratio(tree, node) @@ -45,11 +46,11 @@ def mapping_html_by_rules(html_content: str, xpaths_to_remove: List[dict]) -> tu def __calculate_node_content_ratio(tree: html.HtmlElement, node: html.HtmlElement) -> float: """计算节点内容占比. - 参数: + Args: tree(html.HtmlElement): 根节点对象 - node (html.HtmlElement): 节点对象 + node(html.HtmlElement): 节点对象 - 返回: + Returns: float: 节点内容占比 """ # 获取节点的文本内容 diff --git a/tests/llm_web_kit/extractor/html/post_main_html_processer/post_main_html_processer_demo.py b/tests/llm_web_kit/extractor/html/post_main_html_processer/post_main_html_processer_demo.py new file mode 100644 index 00000000..e1ee388d --- /dev/null +++ b/tests/llm_web_kit/extractor/html/post_main_html_processer/post_main_html_processer_demo.py @@ -0,0 +1,274 @@ +import time +from pathlib import Path +from typing import Dict, Generator, List, Union + +from loguru import logger +from lxml import html + +from llm_web_kit.extractor.html.post_main_html_processer.choose_html import \ + select_typical_htmls +from llm_web_kit.extractor.html.post_main_html_processer.post_llm import \ + get_llm_response +from llm_web_kit.extractor.html.post_main_html_processer.post_mapping import \ + mapping_html_by_rules +from llm_web_kit.libs.html_utils import element_to_html, html_to_element + + +def get_post_html(html_files: List[Union[Path, str]], api_key: str, llm_url: str, model_name: str, + choose_html_n: int = 10) -> Generator[Dict, None, None]: + """对main html进行后处理,旨在清洗html头部和尾部的噪声内容. + + Args: + html_files: 元素结构为 object or str + api_key: 模型的api_key + llm_url: 模型url + model_name: 模型name + choose_html_n: 选择choose_html_n个html进行html代表的选择 + + Returns: + 迭代器 + { + "post_llm_response":[{},{}], # 模型返回的结果 + "post_llm_spend_time":0, # 请求模型耗时 + "post_llm_paths":["", "", ""], # 代表html的path + "main_html":" ... ", # 原始 main html + "post_html":" ... ", # 后处理结果 post html + "post_map_successful":true, # bool, 后处理是否成功 + "marked_html":" ... " # 用于测试的标记html,其中红色框中灰度的内容为后处理删除的内容 + } + """ + if not html_files: + return + # 这里随机选择choose_html_n个html + html_strings = [ + {'html': f.read_text(encoding='utf-8'), 'filename': str(f)} + for idx, f in enumerate(html_files) + if idx < choose_html_n + ] + + post_llm_response = [] + line = {} + + try: + # 获取典型html + selected_htmls = select_typical_htmls(html_strings, select_n=3) + post_llm_paths = [f['filename'] for f in selected_htmls] + # 获取llm响应 + start_post_llm_time = time.time() + # is_llm 表示llm request的开关,True,表示请求llm, False,表示使用默认模型response,仅用于测试 + post_llm_response = get_llm_response([f['html'] for f in selected_htmls], api_key, llm_url, model_name, + is_llm=False) + post_llm_spend_time = int(time.time() - start_post_llm_time) + line.update({'post_llm_response': post_llm_response, 'post_llm_spend_time': post_llm_spend_time, + 'post_llm_paths': post_llm_paths}) + except Exception as e: + logger.error(f'The error is {e}, input path: {html_files}') + + # 进行推广处理 + yield from get_data_to_map(html_files, post_llm_response, line) + + +def get_data_to_map(html_files: List, post_llm_response: List[dict], line: Dict) -> Generator[dict, None, None]: + """获取数据进行推广处理. + + Args: + html_files: 元素结构为 object or str + post_llm_response: 模型返回的结果 + line: 需要保留的字段字典 + + Returns: + 迭代器 + { + "post_llm_response":[{},{}], # 模型返回的结果 + "post_llm_spend_time":0, # 请求模型耗时 + "post_llm_paths":["", "", ""], # 代表html的path + "main_html":" ... ", # 原始 main html + "post_html":" ... ", # 后处理结果 post html + "post_map_successful":true, # bool, 后处理是否成功 + "marked_html":" ... " # 用于测试的标记html,其中红色框中灰度的内容为后处理删除的内容 + } + """ + for idx, html_file in enumerate(html_files): + html_str = html_file.read_text(encoding='utf-8') + if post_llm_response: + post_html, post_map_successful = mapping_html_by_rules(html_str, post_llm_response) + else: + post_html, post_map_successful = html_str, False + line['main_html'] = html_str + line['post_html'] = post_html + line['post_map_successful'] = post_map_successful + + # 获取对比结果,这里只有测试使用,生产不需要这个内容 + # ------start----- + xpath_list = [i['xpath'] for i in post_llm_response] + marker = HTMLMarker(html_str) + marked_html = marker.process( + xpath_list, + # output_html=f'./assets/marked_{idx}.html' # 标注结果保存地址,默认不保存 + ) + line['marked_html'] = marked_html + # -----end----- + + # 返回最终结果 + yield line + + +class HTMLMarker: + def __init__(self, html_source): + """初始化HTML标记器. + + Args: + html_source: HTML内容(字符串) + """ + self.tree = html_to_element(html_source) + + def __calculate_node_content_ratio(self, tree: html.HtmlElement, node: html.HtmlElement) -> float: + """计算节点内容占比. + + Args: + tree(html.HtmlElement): 根节点对象 + node(html.HtmlElement): 节点对象 + + Returns: + float: 节点内容占比 + """ + # 获取节点的文本内容 + text_content = node.text_content() + + total_contents = tree.text_content() + content_rate = len(text_content) / len(total_contents) if total_contents else 0 + return content_rate + + def __analyze_node_position(self, all_elements: List[html.HtmlElement], target_node: html.HtmlElement) -> str: + # 计算总节点数 + total_nodes = len(all_elements) + + # 新增逻辑:检查元素是否在