Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -49,3 +49,5 @@ llm_web_kit.egg-info/*
.llm-web-kit.jsonc
.llm-web-kit-pageclassify.jsonc
tests/llm_web_kit/extractor/ygq_testmd
output.md
output.jsonl
2 changes: 1 addition & 1 deletion llm_web_kit/extractor/html/recognizer/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_h
cc_table_type = DocElementType.SIMPLE_TABLE
d = {
'type': cc_table_type,
'raw_content': raw_html_segment,
# 'raw_content': raw_html_segment,
'content': {
'html': html_content,
'is_complex': table_type,
Expand Down
2 changes: 1 addition & 1 deletion llm_web_kit/extractor/html/recognizer/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_h
el = parsed_content
node = {
'type': DocElementType.PARAGRAPH,
'raw_content': raw_html_segment,
# 'raw_content': raw_html_segment,
'content': json.loads(el.text),
}
return node
Expand Down
2 changes: 1 addition & 1 deletion llm_web_kit/extractor/html/recognizer/title.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_h
return None
cctitle_content_node = {
'type': DocElementType.TITLE,
'raw_content': raw_html_segment,
# 'raw_content': raw_html_segment,
'content': {
'title_content': text,
'level': level
Expand Down
2 changes: 1 addition & 1 deletion llm_web_kit/main_html_parser/parser/layout_batch_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -302,7 +302,7 @@ def htmll_to_content2(self, body_str):
else:
parent.text = (parent.text or '') + (element.tail or '')
parent.remove(element)
self.add_newline_after_tags(body, ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'br', 'div', 'p', 'li'])
# self.add_newline_after_tags(body, ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'br', 'div', 'p', 'li'])
output = []
main_content = re.split(r'\n{1,}', self.get_text_with_newlines(body))
for line in main_content:
Expand Down

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"type": "simple_table", "raw_content": "<cctable table_type=\\\"simple\\\" html=\\\"&lt;table&gt;&lt;tr&gt;&lt;td&gt;1&lt;/td&gt;&lt;td&gt;2&lt;/td&gt;&lt;/tr&gt;&lt;tr&gt;&lt;td&gt;3&lt;/td&gt;&lt;td&gt;4&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;\\\">&lt;table&gt;&lt;tr&gt;&lt;td&gt;1&lt;/td&gt;&lt;td&gt;2&lt;/td&gt;&lt;/tr&gt;&lt;tr&gt;&lt;td&gt;3&lt;/td&gt;&lt;td&gt;4&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;<tr><td>1</td><td>2</td></tr><tr><td>3</td><td>4</td></tr></cctable>", "content": {"html": "<table><tr><td>1</td><td>2</td></tr><tr><td>3</td><td>4</td></tr></table>", "is_complex": false}}
{"type": "simple_table", "content": {"html": "<table><tr><td>1</td><td>2</td></tr><tr><td>3</td><td>4</td></tr></table>", "is_complex": false}}
1 change: 0 additions & 1 deletion tests/llm_web_kit/extractor/html/recognizer/test_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,6 @@ def test_table_to_content_list_node_simple(self):
expect_json = expect.read_text(encoding='utf-8')
assert result['type'] == json.loads(expect_json)['type']
assert result['content']['is_complex'] == json.loads(expect_json)['content']['is_complex']
assert result['raw_content'] == json.loads(expect_json)['raw_content']
self.assertTrue(result['content']['html'].startswith('<table>'))
self.assertTrue(result['content']['html'].endswith('</table>'))

Expand Down
20 changes: 0 additions & 20 deletions tests/llm_web_kit/input/assets/content_json.json
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
[
{
"type": "paragraph",
"raw_content": "<div><div>==========================title====================================</div></div>",
"content": [
{
"c": "==========================title====================================",
Expand All @@ -23,15 +22,13 @@
},
{
"type": "title",
"raw_content": "<h1>Title Test</h1>",
"content": {
"title_content": "Title Test",
"level": "1"
}
},
{
"type": "paragraph",
"raw_content": "<div><div>==========================code inline====================================</div></div>",
"content": [
{
"c": "==========================code inline====================================",
Expand All @@ -41,7 +38,6 @@
},
{
"type": "paragraph",
"raw_content": "<div><div><li><strong>Dead simple</strong>\n Include prism.css and prism.js, use proper HTML5 code tags (<cccode-inline by=\"tag_code\" html=\"&lt;code&gt;code.language-xxxx&lt;/code&gt;), done!\n \" inline=\"true\">code.language-xxxx</cccode-inline>), done!\n </li></div></div>",
"content": [
{
"c": "Dead simple Include prism.css and prism.js, use proper HTML5 code tags (",
Expand All @@ -59,7 +55,6 @@
},
{
"type": "paragraph",
"raw_content": "<div><div>==========================code====================================</div></div>",
"content": [
{
"c": "==========================code====================================",
Expand All @@ -77,7 +72,6 @@
},
{
"type": "paragraph",
"raw_content": "<div><div>==========================simple table====================================</div></div>",
"content": [
{
"c": "==========================simple table====================================",
Expand All @@ -87,7 +81,6 @@
},
{
"type": "simple_table",
"raw_content": "<table><tr><td>1.1</td><td>2.1</td></tr><tr><td>3.1</td><td>4.1</td></tr></table>",
"content": {
"html": "<table><tr><td>1.1</td><td>2.1</td></tr><tr><td>3.1</td><td>4.1</td></tr></table>",
"is_complex": false,
Expand All @@ -96,7 +89,6 @@
},
{
"type": "paragraph",
"raw_content": "<div><div>==========================complex table====================================</div></div>",
"content": [
{
"c": "==========================complex table====================================",
Expand All @@ -106,7 +98,6 @@
},
{
"type": "complex_table",
"raw_content": "<table><tr><td rowspan=\"2\">1</td><td>2</td><td>3</td></tr><tr><td colspan=\"2\">4</td></tr><tr><td>5</td><td>6</td><td>7</td></tr></table>",
"content": {
"html": "<table><tr><td rowspan=\"2\">1</td><td>2</td><td>3</td></tr><tr><td colspan=\"2\">4</td></tr><tr><td>5</td><td>6</td><td>7</td></tr></table>",
"is_complex": true,
Expand All @@ -115,7 +106,6 @@
},
{
"type": "paragraph",
"raw_content": "<div><div>==========================equation inline====================================</div></div>",
"content": [
{
"c": "==========================equation inline====================================",
Expand All @@ -125,7 +115,6 @@
},
{
"type": "paragraph",
"raw_content": "<div><p>测试行内公式<ccmath-inline type=\"latex\" by=\"mathjax_mock\" html=\"$x=4$\">x=4</ccmath-inline>。</p></div>",
"content": [
{
"c": "测试行内公式",
Expand All @@ -143,7 +132,6 @@
},
{
"type": "paragraph",
"raw_content": "<div><div>==========================equation interline====================================</div></div>",
"content": [
{
"c": "==========================equation interline====================================",
Expand All @@ -153,7 +141,6 @@
},
{
"type": "paragraph",
"raw_content": "<div><p>公式如下:</p></div>",
"content": [
{
"c": "公式如下:",
Expand All @@ -172,7 +159,6 @@
},
{
"type": "paragraph",
"raw_content": "<div><div>==========================img====================================</div></div>",
"content": [
{
"c": "==========================img====================================",
Expand All @@ -194,7 +180,6 @@
},
{
"type": "paragraph",
"raw_content": "<div><div>==========================list====================================</div></div>",
"content": [
{
"c": "==========================list====================================",
Expand Down Expand Up @@ -242,7 +227,6 @@
},
{
"type": "paragraph",
"raw_content": "<div><div>==========================paragraph====================================</div></div>",
"content": [
{
"c": "==========================paragraph====================================",
Expand All @@ -252,7 +236,6 @@
},
{
"type": "paragraph",
"raw_content": "<div><p>test paragraph</p></div>",
"content": [
{
"c": "test paragraph",
Expand All @@ -262,7 +245,6 @@
},
{
"type": "paragraph",
"raw_content": "<div><div>==========================audio====================================</div></div>",
"content": [
{
"c": "==========================audio====================================",
Expand All @@ -272,7 +254,6 @@
},
{
"type": "paragraph",
"raw_content": "<div><div>==========================video====================================</div></div>",
"content": [
{
"c": "==========================video====================================",
Expand All @@ -282,7 +263,6 @@
},
{
"type": "paragraph",
"raw_content": "<div><video controls=\"\" width=\"250\">Download the<a href=\"/shared-assets/videos/flower.webm\">WEBM</a>\n or\n <a href=\"/shared-assets/videos/flower.mp4\">MP4</a>\n video.\n</video></div>",
"content": [
{
"c": "Download the WEBM or MP4 video.",
Expand Down
Loading