# Source: Wiki-Scripts/Unicode Name Database Generator.py at main · MSSakib03/Wiki-Scripts · GitHub

"""
=========================================================
Universal Unicode Name Database Generator
=========================================================
[ SETUP & USAGE ]
1. Purpose     : Extracts all named Unicode characters from Python's
                 internal database and exports them into a lightweight
                 JavaScript module (ES6) or a raw JSON file. Ideal for
                 front-end apps needing offline Unicode character lookups.
2. Run         : Configure the settings below and execute.
                 No external libraries/dependencies required.
=========================================================
"""

import unicodedata
import json
import os

# ================= CONFIGURATION =================

# 1. Output File Settings
# The output path is built as "<OUTPUT_FILENAME>.<EXPORT_FORMAT lowercased>"
# and opened as given, so a bare name is written relative to the current
# working directory.
OUTPUT_FILENAME = "unicodeNames"        # Name of the output file (without extension)
# Compared case-insensitively; any value other than "js" / "json" makes the
# generator print an error and write nothing.
EXPORT_FORMAT = "js"                    # Options: "js" (ES6 module) or "json" (Raw JSON)

# 2. Text Formatting
# Options (compared case-insensitively):
# "title" -> "Bengali Letter Ka"
# "upper" -> "BENGALI LETTER KA"
# "lower" -> "bengali letter ka"
# Any other value is treated like "upper".
NAME_CASING = "title"

# =================================================

def format_character_name(raw_name, casing=None):
    """Return *raw_name* re-cased according to the requested style.

    Args:
        raw_name: Character name to format, e.g. "BENGALI LETTER KA".
        casing: One of "title", "lower" or "upper", matched
            case-insensitively. When omitted (``None``), the module-level
            ``NAME_CASING`` setting is used.

    Returns:
        The name in title, lower or upper case. Any unrecognized casing
        value falls back to upper case.
    """
    if casing is None:
        # Looked up at call time so the configured value is always honoured.
        casing = NAME_CASING
    casing = casing.lower()
    if casing == "title":
        return raw_name.title()
    if casing == "lower":
        return raw_name.lower()
    # "upper" and every unrecognized value end up here.
    return raw_name.upper()

def _collect_unicode_names():
    """Return a dict mapping hex code-point keys to formatted character names."""
    unicode_map = {}
    for code_point in range(0x110000):
        # Supplying a default makes unicodedata.name() return None for code
        # points without a name (e.g. control characters) instead of raising
        # ValueError, so the scan does not pay for an exception per gap.
        raw_name = unicodedata.name(chr(code_point), None)
        if raw_name is None:
            continue
        # Key is upper-case hex, zero-padded to at least 4 digits
        # (e.g. "0995" or "1F600").
        unicode_map[f"{code_point:04X}"] = format_character_name(raw_name)
    return unicode_map


def _render_unicode_map(unicode_map, format_type):
    """Serialize *unicode_map* as an ES6 module ("js") or raw JSON ("json")."""
    if format_type == "js":
        return (
            "// This file is auto-generated by the Universal Unicode Database Generator.\n"
            "export const UNICODE_NAMES = "
            + json.dumps(unicode_map, separators=(',', ':'))
            + ";"
        )
    # Raw JSON: only small maps (< 1000 entries) are pretty-printed.
    indent = 2 if len(unicode_map) < 1000 else None
    return json.dumps(unicode_map, indent=indent, separators=(',', ':'))


def generate_unicode_names():
    """Scan every Unicode code point and write the name table to disk.

    Reads the module-level ``EXPORT_FORMAT`` and ``OUTPUT_FILENAME`` settings,
    collects the name of every named code point in 0x0000-0x10FFFF, and writes
    the result to "<OUTPUT_FILENAME>.<format>" as either an ES6 module or raw
    JSON. Progress and errors are reported with ``print``.

    Returns:
        None. An unsupported ``EXPORT_FORMAT`` or a failed write is reported
        on stdout rather than raised.
    """
    # Validate the configuration first so a typo is reported immediately
    # instead of after scanning the whole code-point range.
    format_type = EXPORT_FORMAT.lower()
    if format_type not in ("js", "json"):
        print(f"[ERROR] Unsupported EXPORT_FORMAT: '{EXPORT_FORMAT}'. Use 'js' or 'json'.")
        return

    print("Scanning Unicode Range (0x0000 to 0x10FFFF)... Please wait.")
    unicode_map = _collect_unicode_names()

    print(f"Extraction complete! Found {len(unicode_map)} named characters.")
    print("Building output file...")

    final_filename = f"{OUTPUT_FILENAME}.{format_type}"
    content = _render_unicode_map(unicode_map, format_type)

    # Save the file. This is the script's top-level boundary: any failure is
    # reported to the user instead of being raised.
    try:
        with open(final_filename, "w", encoding="utf-8") as f:
            f.write(content)
        print(f"\u2714 Success! Saved to '{os.path.abspath(final_filename)}'.")
    except Exception as e:
        print(f"[ERROR] Could not save file: {e}")

# Run the generator only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    generate_unicode_names()