-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathhtml_parser.py
More file actions
80 lines (59 loc) · 2.58 KB
/
html_parser.py
File metadata and controls
80 lines (59 loc) · 2.58 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import re
import csv
import html
def extract_text_from_html(html_content):
    """Return *html_content* with every ``<...>`` tag deleted.

    Only the markup itself is dropped: the text between tags is kept as-is,
    and character entities (``&amp;`` etc.) are not decoded here.
    """
    # A tag is "<", any run of characters other than ">", then ">".
    tag_pattern = re.compile(r'<[^>]*>')
    text_only = tag_pattern.sub('', html_content)
    return text_only
def parse_html_content(html_content):
    """Parse the HTML content and extract item IDs and names.

    The input is treated as a sequence of entries separated by line-break
    tags (``<br>``, ``<br/>`` or ``<br />``, in any letter case). An entry is
    kept only if it contains an ``<a href="#ID">`` anchor and still has some
    text once the anchors and all other tags have been removed.

    Args:
        html_content: Raw HTML text to scan.

    Returns:
        A list of ``(item_id, item_name)`` tuples in input order. ``item_id``
        is the fragment (the part after ``#``) of the first anchor in the
        entry. ``item_name`` is the entry's remaining text with entities
        decoded, whitespace collapsed, and a leading ``(prefix)`` and/or
        ``[prefix]`` removed.
    """
    items = []

    # Accept every common spelling of the line-break tag, not only "<br>".
    for line in re.split(r'<br\s*/?>', html_content, flags=re.IGNORECASE):
        if not line.strip():
            continue

        # The item ID is the fragment of the first in-page anchor.
        id_match = re.search(r'<a href="#([^"]+)">', line)
        if not id_match:
            continue
        item_id = id_match.group(1)

        # Drop the anchor elements (tags and link text), then any other tags.
        content = re.sub(r'<a href="#[^"]+">.*?</a>', '', line)
        item_name_with_prefix = extract_text_from_html(content)

        # Decode HTML entities (like &nbsp;) BEFORE normalising whitespace:
        # &nbsp; decodes to U+00A0, which must be collapsed and stripped like
        # any other whitespace, otherwise it hides the prefix from the
        # anchored patterns below and defeats the empty-name check.
        item_name_with_prefix = html.unescape(item_name_with_prefix)
        item_name_with_prefix = re.sub(r'\s+', ' ', item_name_with_prefix).strip()

        # Skip items with empty names
        if not item_name_with_prefix:
            continue

        # Remove a leading (prefix); also consume the whitespace after it so
        # that a following [prefix] is still at the start of the string.
        item_name = re.sub(r'^\([^)]+\)\s*', '', item_name_with_prefix)
        item_name = re.sub(r'^\[[^\]]+\]', '', item_name)  # Remove [prefix]
        item_name = item_name.strip()

        items.append((item_id, item_name))

    return items
def main(input_path="item.txt", output_path="item_translations.csv"):
    """Convert an HTML item listing into a two-column CSV file.

    Reads ``input_path`` as UTF-8, extracts ``(ID, name)`` pairs with
    ``parse_html_content``, writes them to ``output_path`` below an
    ``ID, Original Name`` header row, then prints a summary and up to five
    sample entries. Any exception raised along the way is caught and
    reported with ``print`` instead of propagating.

    Args:
        input_path: Path of the HTML text file to read. Defaults to
            ``item.txt``.
        output_path: Path of the CSV file to create or overwrite. Defaults
            to ``item_translations.csv``.
    """
    try:
        # Read the input HTML file
        with open(input_path, "r", encoding="utf-8") as file:
            html_content = file.read()
        # Print the number of characters that were read.
        print(len(html_content))

        # Parse the HTML content
        items = parse_html_content(html_content)

        # Write to CSV; newline='' leaves line-ending handling to the csv module.
        with open(output_path, "w", encoding="utf-8", newline='') as file:
            writer = csv.writer(file)
            writer.writerow(["ID", "Original Name"])
            writer.writerows(items)

        print(f"Successfully extracted {len(items)} items to {output_path}")

        # Print a few examples for verification
        print("\nExample entries:")
        for item_id, item_name in items[:5]:
            print(f"{item_id}: {item_name}")
    except Exception as e:
        # Top-level boundary of the script: report the failure rather than
        # letting a traceback escape.
        print(f"Error processing file: {e}")
# Script entry point: run the extraction only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()