Wiki-Scripts/Unicode CSV metadata generator.py at main · MSSakib03/Wiki-Scripts · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
"""
=========================================================
Simple Font Batch Processor & CSV Generator
=========================================================
[ SETUP & USAGE ]
1. Purpose     : Scans a local folder for files whose NAMES contain a
                 Unicode code point in hex (e.g., U+05D2). Generates a
                 standardized Wikimedia Commons filename, smart description,
                 and categories based SOLELY on the specified Font Name,
                 without Unicode block sorting.
2. Run         : Configure the settings below and execute.
=========================================================
"""

import os
import csv
import re
import unicodedata

# ================= CONFIGURATION =================
# Every value below is read by the functions further down in this file.
# Edit them before running; the defaults are placeholders.

# 1. File & Folder Paths
# TARGET_FOLDER  : folder whose direct entries are listed (no recursion).
# OUTPUT_CSV     : CSV written by generate_csv(); a relative path is resolved
#                  against the current working directory.
# FILE_EXTENSION : suffix used to select input files; it is also appended to
#                  every generated target filename.
TARGET_FOLDER = r"YOUR_LOCAL_FOLDER_PATH_HERE"
OUTPUT_CSV = "upload_metadata.csv"
FILE_EXTENSION = ".svg"

# 2. Glyph & Font Settings
PROJECT_NAME = "Your Font Name"               # Font name used in filename and description, e.g., "Libartinus" or "Noto Sans Hebrew"
GLYPH_TYPE = ""                               # Optional: e.g., "Dotted" -> description starts with "Dotted style ...". Leave empty ("") to omit.

# 3. Wikimedia Commons Template Settings
AUTHOR_NAME = "Author or Designer Name"       # Goes into "|author= * Font: ...", e.g., "John Doe" or "Google"
UPLOADER_NAME = "Your Wiki Username"          # Used for the [[User:...]] link and the "uploads by" category, e.g., "MS Sakib"
# Inserted verbatim after "|source= " in the {{Information}} template.
SOURCE_INFO = "{{own}} derived from [http://link.com Source]"
# Inserted verbatim under the license header. The year inside this tag is
# separate from CREATION_DATE; update both if they should agree.
LICENSE_TAG = "{{OFL|1=https://scripts.sil.org/OFL|2=2026|3=1.1}}" # Default OFL template format
CREATION_DATE = "2026"                        # Goes into "|date=", e.g., "2026" or "2025"

# 4. Base Category Setting
# Two categories are generated from this value:
#   [[Category:<BASE_CATEGORY>]]
#   [[Category:<BASE_CATEGORY> uploads by <UPLOADER_NAME>]]
# Both use the 6-digit hex code point as the sort key.
BASE_CATEGORY = "Your Base Category Name"     # Will generate: "Category:[Base Category]"

# =================================================

def extract_unicode_data(filename):
    """Extract a Unicode code point from *filename* and look up its metadata.

    Two notations are recognized:

    * explicit ``U+XXXX`` / ``U-XXXX`` (upper-case ``U``), and
    * a bare lower-case ``uXXXX``.

    The explicit notation is tried first. The bare ``u`` form is only a
    fallback because a ``u`` followed by a hex digit occurs inside ordinary
    words (the "uf" in "Kufi", the "ud" in "Judson") and would otherwise
    shadow the real code point later in the name.

    Args:
        filename: File name (or any string) to scan.

    Returns:
        A ``(display_char, name, hex_str)`` tuple.

        * ``display_char`` is the character itself, or ``""`` when its
          general category starts with ``C`` (control, format, surrogate,
          private use, unassigned).
        * ``name`` is the title-cased Unicode name, or
          ``"Unknown Character"`` when the character has no name.
        * ``hex_str`` is the upper-case hex code point, left-padded to at
          least four digits.

        When no usable code point is found, or the value lies beyond
        U+10FFFF, the placeholder ``(None, "Unknown", "0000")`` is returned.
    """
    not_found = (None, "Unknown", "0000")

    match = re.search(r'U[+-]([0-9a-fA-F]+)', filename)
    if match is None:
        # Fallback: bare "u" directly followed by hex digits.
        match = re.search(r'u([0-9a-fA-F]+)', filename)
    if match is None:
        return not_found

    hex_str = match.group(1).upper().zfill(4)
    code_point = int(hex_str, 16)

    # chr() raises ValueError just above the Unicode range but OverflowError
    # for very long hex runs; an explicit range check covers both cases.
    if code_point > 0x10FFFF:
        return not_found

    char = chr(code_point)

    try:
        # e.g., "HEBREW LETTER GIMEL" -> "Hebrew Letter Gimel"
        name = unicodedata.name(char).title()
    except ValueError:
        name = "Unknown Character"

    # Avoid emitting invisible/control characters into descriptions.
    display_char = "" if unicodedata.category(char).startswith('C') else char

    return display_char, name, hex_str

def build_data(filename):
    """Build the Commons upload name and wikitext page body for one file.

    Args:
        filename: Local file name that is expected to carry a code point.

    Returns:
        ``(target_filename, wikitext)``, or ``(None, None)`` when
        ``extract_unicode_data`` yields the ``"0000"`` code point.
    """
    glyph, glyph_name, code_hex = extract_unicode_data(filename)

    # "0000" is the placeholder for "nothing recognized"; a file genuinely
    # named for U+0000 yields the same value and is skipped as well.
    if code_hex == "0000":
        return None, None

    # --- Upload name: "{Name} in {Font} - U+{Hex}{ext}" ---
    # The glyph itself is deliberately left out of the file name; runs of
    # whitespace are collapsed to single spaces.
    upload_name = re.sub(
        r'\s+',
        ' ',
        f"{glyph_name} in {PROJECT_NAME} - U+{code_hex}{FILE_EXTENSION}",
    ).strip()

    # --- Description: includes the glyph (when printable) and a lookup link ---
    glyph_part = f"{glyph} " if glyph else ""
    description = (
        f"{glyph_name} {glyph_part}in {PROJECT_NAME} - "
        f"[https://www.compart.com/en/unicode/U+{code_hex} U+{code_hex}]"
    )
    style_prefix = GLYPH_TYPE.strip()
    if style_prefix:
        description = f"{style_prefix} style {description}"

    # --- Wikitext page body ---
    # Zero-padded 6-digit hex so category listings sort by code point.
    sort_key = format(int(code_hex, 16), "06X")

    page_lines = [
        "== {{int:filedesc}} ==",
        "{{Information",
        f"|description= {{{{en|1={description}.}}}}",
        f"|date={CREATION_DATE}",
        f"|source= {SOURCE_INFO}",
        f"|author= * Font: {AUTHOR_NAME}",
        f"* SVG: [[User:{UPLOADER_NAME}|{UPLOADER_NAME}]]",
        "|permission=",
        "|other versions=",
        "}}",
        "",
        "== {{int:license-header}} ==",
        f"{LICENSE_TAG}",
        "",
        f"[[Category:{BASE_CATEGORY}|{sort_key}]]",
        f"[[Category:{BASE_CATEGORY} uploads by {UPLOADER_NAME}|{sort_key}]]",
    ]
    wikitext = "\n".join(page_lines) + "\n"

    return upload_name, wikitext

def generate_csv():
    """Scan TARGET_FOLDER and write one metadata row per recognized file.

    Each entry of TARGET_FOLDER whose name ends with FILE_EXTENSION
    (compared case-insensitively) is passed to ``build_data``. Files for
    which it returns no target filename are counted as skipped. The rest are
    written to OUTPUT_CSV with the columns ``local_filepath``, ``filename``
    and ``description``.

    Entries are processed in sorted name order so repeated runs produce the
    same CSV. The folder is scanned before OUTPUT_CSV is opened, so an
    existing CSV is not overwritten when there is nothing to process.

    Returns:
        None. Progress and the final summary are printed to stdout.
    """
    # isdir (not exists): a regular file at this path would make listdir fail.
    if not os.path.isdir(TARGET_FOLDER):
        print(f"[ERROR] Directory not found: {TARGET_FOLDER}")
        return

    print("Scanning folder and generating standardized metadata...")

    # Lower-case both sides so ".SVG" in the config or on disk still matches.
    wanted_suffix = FILE_EXTENSION.lower()
    file_list = sorted(
        entry
        for entry in os.listdir(TARGET_FOLDER)
        if entry.lower().endswith(wanted_suffix)
    )

    if not file_list:
        print(f"No {FILE_EXTENSION} files found in {TARGET_FOLDER}.")
        return

    count = 0
    skipped = 0

    with open(OUTPUT_CSV, "w", encoding="utf-8", newline="") as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(["local_filepath", "filename", "description"])

        for entry in file_list:
            target_filename, desc_template = build_data(entry)

            if target_filename is None:
                skipped += 1
                continue

            full_path = os.path.join(TARGET_FOLDER, entry)
            writer.writerow([full_path, target_filename, desc_template])
            count += 1

            if count % 1000 == 0:
                print(f"Processed {count} files...")

    print(f"\n\u2714 DONE — Successfully generated metadata for {count} files!")
    if skipped > 0:
        print(f"\u26A0 Skipped {skipped} files (No Unicode hex found in their names).")
    print(f"Saved at: {os.path.abspath(OUTPUT_CSV)}")

# Run the generator only when this file is executed as a script, so importing
# the module does not trigger a folder scan.
if __name__ == "__main__":
    generate_csv()