-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathUnicode Name Database Generator.py
More file actions
96 lines (78 loc) · 3.43 KB
/
Unicode Name Database Generator.py
File metadata and controls
96 lines (78 loc) · 3.43 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
"""
=========================================================
Universal Unicode Name Database Generator
=========================================================
[ SETUP & USAGE ]
1. Purpose : Extracts all named Unicode characters from Python's
internal database and exports them into a lightweight
JavaScript module (ES6) or a raw JSON file. Ideal for
front-end apps needing offline Unicode character lookups.
2. Run : Configure the settings below and execute.
No external libraries/dependencies required.
=========================================================
"""
import unicodedata
import json
import os
# ================= CONFIGURATION =================
# 1. Output File Settings
# Base name of the output file, without extension. The lowercased
# EXPORT_FORMAT value is appended as the extension when the file is written.
OUTPUT_FILENAME = "unicodeNames"
# Export format, matched case-insensitively:
#   "js"   -> ES6 module exporting a `UNICODE_NAMES` constant
#   "json" -> raw JSON object
# Any other value makes the generator print an error and write no file.
EXPORT_FORMAT = "js"
# 2. Text Formatting
# Casing applied to every character name, matched case-insensitively:
#   "title" -> "Bengali Letter Ka"
#   "upper" -> "BENGALI LETTER KA"
#   "lower" -> "bengali letter ka"
# Any unrecognised value is treated as "upper".
NAME_CASING = "title"
# =================================================
def format_character_name(raw_name, casing=None):
    """Return *raw_name* re-cased according to the requested style.

    Args:
        raw_name: Official Unicode character name, e.g. "BENGALI LETTER KA".
        casing: Optional casing style: "title", "upper" or "lower", matched
            case-insensitively. When omitted, the module-level NAME_CASING
            setting is used, so existing one-argument calls are unaffected.

    Returns:
        The name in title case, lower case or upper case. Any unrecognised
        style falls back to upper case.
    """
    style = (NAME_CASING if casing is None else casing).lower()
    if style == "title":
        # NOTE: str.title() starts a new word after every non-letter, so a
        # hex suffix such as "IDEOGRAPH-FA0B" becomes "Ideograph-Fa0B".
        return raw_name.title()
    if style == "lower":
        return raw_name.lower()
    # "upper" and every unrecognised style end up here.
    return raw_name.upper()
def _collect_unicode_names():
    """Return a dict mapping each named code point's hex key to its name."""
    unicode_map = {}
    for code_point in range(0x110000):
        # Passing a default makes unnamed code points (e.g. control chars)
        # return None instead of raising ValueError, so nothing has to be
        # raised and caught for every skipped code point.
        raw_name = unicodedata.name(chr(code_point), None)
        if raw_name is None:
            continue
        # Key is at least 4 uppercase hex digits, e.g. "0995" or "1F600".
        unicode_map[f"{code_point:04X}"] = format_character_name(raw_name)
    return unicode_map


def _render_output(unicode_map, format_type):
    """Serialize the map as an ES6 module ("js") or as raw JSON ("json")."""
    if format_type == "js":
        payload = json.dumps(unicode_map, separators=(',', ':'))
        return (
            "// This file is auto-generated by the Universal Unicode Database Generator.\n"
            f"export const UNICODE_NAMES = {payload};"
        )
    # Pretty-print only small maps; larger ones are written compactly.
    indent = 2 if len(unicode_map) < 1000 else None
    return json.dumps(unicode_map, indent=indent, separators=(',', ':'))


def generate_unicode_names():
    """Scan every Unicode code point and export the named ones to a file.

    Reads the module-level settings OUTPUT_FILENAME and EXPORT_FORMAT, and
    formats each name with format_character_name(). The result is written
    to "<OUTPUT_FILENAME>.<format>" as UTF-8 text. Progress and errors are
    reported on stdout.

    Returns:
        None. An unsupported EXPORT_FORMAT or a failed write is reported
        with an "[ERROR]" message rather than an exception.
    """
    # Validate the configuration first so that a typo in EXPORT_FORMAT is
    # reported immediately instead of after the full scan.
    format_type = EXPORT_FORMAT.lower()
    if format_type not in ("js", "json"):
        print(f"[ERROR] Unsupported EXPORT_FORMAT: '{EXPORT_FORMAT}'. Use 'js' or 'json'.")
        return
    final_filename = f"{OUTPUT_FILENAME}.{format_type}"

    print("Scanning Unicode Range (0x0000 to 0x10FFFF)... Please wait.")
    unicode_map = _collect_unicode_names()
    print(f"Extraction complete! Found {len(unicode_map)} named characters.")

    print("Building output file...")
    content = _render_output(unicode_map, format_type)

    # Keep the try body limited to the file write so that only genuine save
    # failures are reported as such. ValueError covers an invalid file name
    # (for example one containing a NUL character).
    try:
        with open(final_filename, "w", encoding="utf-8") as f:
            f.write(content)
    except (OSError, ValueError) as e:
        print(f"[ERROR] Could not save file: {e}")
        return

    message = f"\u2714 Success! Saved to '{os.path.abspath(final_filename)}'."
    try:
        print(message)
    except UnicodeEncodeError:
        # stdout cannot encode the check mark (or the path); the file was
        # still written, so report success in plain ASCII.
        print(message.encode("ascii", "replace").decode("ascii"))
# Run the generator only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    generate_unicode_names()