From 03e1282ba8fa977a7001c00413cc36add67d271d Mon Sep 17 00:00:00 2001 From: Sid Vishnoi <8426945+sidvishnoi@users.noreply.github.com> Date: Sat, 22 Jun 2024 21:02:01 +0530 Subject: [PATCH 1/3] feat(/api/unicode/names): add unicode codepoint to name service feat(/api/unicode/update): update data from r12a's data https://github.com/r12a/shared/blob/gh-pages/code/all-names.js Webhook not yet configured --- app.ts | 2 + routes/api/index.ts | 8 ++++ routes/api/unicode/index.ts | 21 +++++++++ routes/api/unicode/lib/scraper.ts | 69 ++++++++++++++++++++++++++++ routes/api/unicode/lib/store-init.ts | 5 ++ routes/api/unicode/lib/store.ts | 33 +++++++++++++ routes/api/unicode/names.ts | 56 ++++++++++++++++++++++ routes/api/unicode/update.ts | 26 +++++++++++ routes/api/unicode/update.worker.ts | 8 ++++ scripts/update-data-sources.ts | 5 ++ 10 files changed, 233 insertions(+) create mode 100644 routes/api/index.ts create mode 100644 routes/api/unicode/index.ts create mode 100644 routes/api/unicode/lib/scraper.ts create mode 100644 routes/api/unicode/lib/store-init.ts create mode 100644 routes/api/unicode/lib/store.ts create mode 100644 routes/api/unicode/names.ts create mode 100644 routes/api/unicode/update.ts create mode 100644 routes/api/unicode/update.worker.ts diff --git a/app.ts b/app.ts index 27228cd1..0b67279d 100644 --- a/app.ts +++ b/app.ts @@ -15,6 +15,7 @@ import caniuseRouter from "./routes/caniuse/index.js"; import githubRouter from "./routes/github/index.js"; import respecRouter from "./routes/respec/index.js"; import w3cRouter from "./routes/w3c/index.js"; +import apiRouter from "./routes/api/index.js"; import wellKnownRouter from "./routes/well-known/index.js"; import docsRouter from "./routes/docs/index.js"; @@ -45,6 +46,7 @@ app.use("/caniuse", caniuseRouter); app.use("/github/:org/:repo", githubRouter); app.use("/respec", respecRouter); app.use("/w3c", w3cRouter); +app.use("/api", apiRouter); app.use("/.well-known", wellKnownRouter); app.use("/docs", docsRouter); app.get("/", (_req, res) => res.redirect("/docs/")); diff --git a/routes/api/index.ts b/routes/api/index.ts new file mode 100644 index 00000000..23790e77 --- /dev/null +++ b/routes/api/index.ts @@ -0,0 +1,8 @@ +import express from "express"; +import unicode from "./unicode/index.js"; + +const router = express.Router({ mergeParams: true }); + +router.use("/unicode", unicode); + +export default router; diff --git a/routes/api/unicode/index.ts b/routes/api/unicode/index.ts new file mode 100644 index 00000000..8d2d143e --- /dev/null +++ b/routes/api/unicode/index.ts @@ -0,0 +1,21 @@ +import path from "node:path"; + +import express from "express"; +import cors from "cors"; + +import { env, ms } from "../../../utils/misc.js"; + +import namesRoute from "./names.js"; +import updateRoute from "./update.js"; + +const DATA_DIR = env("DATA_DIR"); + +const router = express.Router({ mergeParams: true }); + +router + .options("/names", cors({ methods: ["POST", "GET"], maxAge: ms("1day") })) + .post("/names", express.json({ limit: "2mb" }), cors(), namesRoute); +router.post("/update", updateRoute); +router.use("/data", express.static(path.join(DATA_DIR, "unicode"))); + +export default router; diff --git a/routes/api/unicode/lib/scraper.ts b/routes/api/unicode/lib/scraper.ts new file mode 100644 index 00000000..8d488149 --- /dev/null +++ b/routes/api/unicode/lib/scraper.ts @@ -0,0 +1,69 @@ +import path from "node:path"; +import { tmpdir } from "node:os"; +import { createWriteStream, writeFile } from "node:fs"; +import { appendFile, mkdir, rm } from "node:fs/promises"; +import { Readable } from "node:stream"; +import { ReadableStream } from "node:stream/web"; +import { finished } from "node:stream/promises"; + +import { env } from "../../../../utils/misc.js"; +import sh from "../../../../utils/sh.js"; + +const DATA_DIR = env("DATA_DIR"); + +const INPUT_DATA_SOURCE = `https://raw.githubusercontent.com/r12a/shared/gh-pages/code/all-names.js`; +const OUT_DIR_BASE = path.join(DATA_DIR, "unicode"); +const OUT_FILE_BY_CODEPOINT = path.resolve( + OUT_DIR_BASE, + "./codepoint-to-name.json", +); + +const defaultOptions = { forceUpdate: false }; +type Options = typeof defaultOptions; + +export default async function main(options: Partial = {}) { + options = { ...defaultOptions, ...options } as Options; + const hasUpdated = await updateInputSource(); + if (!hasUpdated && !options.forceUpdate) { + console.log("Nothing to update"); + return false; + } + + return true; +} + +async function updateInputSource() { + await mkdir(OUT_DIR_BASE, { recursive: true }); + + const namesJs = path.join(tmpdir(), "unicode-all-names.js"); + await rm(namesJs, { force: true }); + // download file and convert its data to JSON + console.log(`Downloading`, INPUT_DATA_SOURCE); + await downloadFile(INPUT_DATA_SOURCE, namesJs); + console.log("Converting to JSON"); + await appendFile( + namesJs, + "\n\n" + + String.raw`require("fs").writeFileSync( + "${OUT_FILE_BY_CODEPOINT}", + JSON.stringify(charData, null, 2).replace( + /[\u007f-\uffff]/g, + c => "\\\\u" + ("0000" + c.charCodeAt(0).toString(16)).slice(-4) + ) + )`, + ); + await sh(`node ${namesJs}`); + console.log("Wrote to", OUT_FILE_BY_CODEPOINT); + await rm(namesJs, { force: true }); + + return true; +} + +async function downloadFile(url: string, destination: string) { + const res = await fetch(url); + await mkdir(path.dirname(destination), { recursive: true }); + const outStream = createWriteStream(destination, { flags: "wx" }); + await finished( + Readable.fromWeb(res.body as ReadableStream).pipe(outStream), + ); +} diff --git a/routes/api/unicode/lib/store-init.ts b/routes/api/unicode/lib/store-init.ts new file mode 100644 index 00000000..11ea147b --- /dev/null +++ b/routes/api/unicode/lib/store-init.ts @@ -0,0 +1,5 @@ +import { Store } from "./store.js"; + +export const store = new Store(); + +export type { Store }; diff --git a/routes/api/unicode/lib/store.ts b/routes/api/unicode/lib/store.ts new file mode 100644 index 00000000..58dbb030 --- /dev/null +++ b/routes/api/unicode/lib/store.ts @@ -0,0 +1,33 @@ +import path from "path"; +import { readFileSync } from "fs"; + +import { env } from "../../../../utils/misc.js"; + +export class Store { + version = -1; + private codepointToName: Map = new Map(); + + constructor() { + this.fill(); + } + + /** Fill the store with its contents from the filesystem. */ + fill() { + this.codepointToName = new Map( + Object.entries(readJson("codepoint-to-name.json")), + ); + this.version = Date.now(); + } + + + getNameByCodepoint(codepoint: string) { + return this.codepointToName.get(String.raw`\u` + codepoint) ?? null; + } +} + +function readJson(filename: string) { + const DATA_DIR = env("DATA_DIR"); + const dataFile = path.resolve(DATA_DIR, `./unicode/${filename}`); + const text = readFileSync(dataFile, "utf8"); + return JSON.parse(text); +} diff --git a/routes/api/unicode/names.ts b/routes/api/unicode/names.ts new file mode 100644 index 00000000..a0d957e1 --- /dev/null +++ b/routes/api/unicode/names.ts @@ -0,0 +1,56 @@ +import type { Request, Response } from "express"; + +import { store, type Store } from "./lib/store-init.js"; + +interface Query { + codepoint: string; +} +interface Result { + name: string; +} + +type Options = Record; + +interface RequestBody { + queries: Query[]; + options?: Options; +} +type IRequest = Request; + +interface ResponseData { + data: Array<{ query: Query; result: Result | null }>; + metadata: { lastParsedAt: string }; +} + +export default function route(req: IRequest, res: Response) { + const { options = {}, queries = [] } = req.body; + const data: ResponseData["data"] = queries.map(query => ({ + query, + result: search(query, store, options), + })); + + Object.assign(res.locals, { + errors: getErrorCount(data), + queries: queries.length, + }); + + const result: ResponseData = { + data, + metadata: { + lastParsedAt: "", + }, + }; + res.json(result); +} + +function search(query: Query, store: Store, _options: Options): Result | null { + if (query.codepoint) { + const name = store.getNameByCodepoint(query.codepoint); + return typeof name === "string" ? { name } : null; + } + return null; +} + +function getErrorCount(results: ResponseData["data"]) { + return results.filter(({ result }) => !result).length; +} diff --git a/routes/api/unicode/update.ts b/routes/api/unicode/update.ts new file mode 100644 index 00000000..cee0ec71 --- /dev/null +++ b/routes/api/unicode/update.ts @@ -0,0 +1,26 @@ +import path from "path"; +import { legacyDirname } from "../../../utils/misc.js"; +import { BackgroundTaskQueue } from "../../../utils/background-task-queue.js"; +import { store } from "./lib/store-init.js"; +import type { Request, Response } from "express"; + +const workerFile = path.join(legacyDirname(import.meta), "update.worker.js"); +const taskQueue = new BackgroundTaskQueue( + workerFile, + "unicode_update", +); + +export default async function route(req: Request, res: Response) { + const job = taskQueue.add({}); + try { + const { updated } = await job.run(); + if (updated) { + store.fill(); + } + } catch { + res.status(500); + } finally { + res.locals.job = job.id; + res.send(job.id); + } +} diff --git a/routes/api/unicode/update.worker.ts b/routes/api/unicode/update.worker.ts new file mode 100644 index 00000000..d159772c --- /dev/null +++ b/routes/api/unicode/update.worker.ts @@ -0,0 +1,8 @@ +import unicodeScraper from "./lib/scraper.js"; + +interface Input {} + +export default async function unicodeUpdate(_input: Input) { + const updated = await unicodeScraper(); + return { updated }; +} diff --git a/scripts/update-data-sources.ts b/scripts/update-data-sources.ts index e6687cb3..c57754a2 100644 --- a/scripts/update-data-sources.ts +++ b/scripts/update-data-sources.ts @@ -3,6 +3,7 @@ import { mkdir } from "fs/promises"; import { env } from "../utils/misc.js"; import caniuse from "../routes/caniuse/lib/scraper.js"; import xref from "../routes/xref/lib/scraper.js"; +import unicode from "../routes/api/unicode/lib/scraper.js"; import w3cGroupsList from "./update-w3c-groups-list.js"; // ensure the data directory exists @@ -16,6 +17,10 @@ console.group("xref"); await xref({ forceUpdate: true }); console.groupEnd(); +console.group("unicode"); +await unicode({ forceUpdate: true }); +console.groupEnd(); + console.group("W3C Groups List"); await w3cGroupsList(); console.groupEnd(); From ab3c933b3830ab8ec8191ec252e5ad72656f1840 Mon Sep 17 00:00:00 2001 From: Sid Vishnoi <8426945+sidvishnoi@users.noreply.github.com> Date: Sun, 7 Jul 2024 19:24:44 +0530 Subject: [PATCH 2/3] use unicode data --- routes/api/unicode/lib/scraper.ts | 58 +++++++++++++++++++++---------- routes/api/unicode/lib/store.ts | 14 ++++---- routes/api/unicode/names.ts | 15 ++++---- 3 files changed, 56 insertions(+), 31 deletions(-) diff --git a/routes/api/unicode/lib/scraper.ts b/routes/api/unicode/lib/scraper.ts index 8d488149..9d0bc479 100644 --- a/routes/api/unicode/lib/scraper.ts +++ b/routes/api/unicode/lib/scraper.ts @@ -1,17 +1,17 @@ import path from "node:path"; import { tmpdir } from "node:os"; -import { createWriteStream, writeFile } from "node:fs"; -import { appendFile, mkdir, rm } from "node:fs/promises"; +import { createReadStream, createWriteStream } from "node:fs"; +import { mkdir, rm } from "node:fs/promises"; import { Readable } from "node:stream"; import { ReadableStream } from "node:stream/web"; import { finished } from "node:stream/promises"; +import { createInterface } from "node:readline/promises"; import { env } from "../../../../utils/misc.js"; -import sh from "../../../../utils/sh.js"; const DATA_DIR = env("DATA_DIR"); -const INPUT_DATA_SOURCE = `https://raw.githubusercontent.com/r12a/shared/gh-pages/code/all-names.js`; +export const INPUT_DATA_SOURCE = `https://unicode.org/Public/UNIDATA/UnicodeData.txt`; const OUT_DIR_BASE = path.join(DATA_DIR, "unicode"); const OUT_FILE_BY_CODEPOINT = path.resolve( OUT_DIR_BASE, @@ -32,33 +32,53 @@ export default async function main(options: Partial = {}) { return true; } +// download file and convert its data to JSON async function updateInputSource() { await mkdir(OUT_DIR_BASE, { recursive: true }); const namesJs = path.join(tmpdir(), "unicode-all-names.js"); await rm(namesJs, { force: true }); - // download file and convert its data to JSON - console.log(`Downloading`, INPUT_DATA_SOURCE); + await rm(OUT_FILE_BY_CODEPOINT, { force: true }); + + console.log(`Downloading`, INPUT_DATA_SOURCE, "to", namesJs); await downloadFile(INPUT_DATA_SOURCE, namesJs); - console.log("Converting to JSON"); - await appendFile( - namesJs, - "\n\n" + - String.raw`require("fs").writeFileSync( - "${OUT_FILE_BY_CODEPOINT}", - JSON.stringify(charData, null, 2).replace( - /[\u007f-\uffff]/g, - c => "\\\\u" + ("0000" + c.charCodeAt(0).toString(16)).slice(-4) - ) - )`, - ); - await sh(`node ${namesJs}`); + + console.log("Converting to JSON and writing to", OUT_FILE_BY_CODEPOINT); + const rl = createInterface({ + input: createReadStream(namesJs), + crlfDelay: Infinity, + }); + const dest = createWriteStream(OUT_FILE_BY_CODEPOINT, { flags: "a" }); + dest.write("[\n"); + for await (const line of rl) { + const parsed = parseLine(line); + if (!parsed) continue; + dest.write(JSON.stringify(parsed) + ",\n"); + } + dest.write(`["null", {"name": ""}]`); + dest.write("\n]\n"); + await new Promise(resolve => dest.end(resolve)); + console.log("Wrote to", OUT_FILE_BY_CODEPOINT); await rm(namesJs, { force: true }); return true; } +// Parse a line based on https://www.unicode.org/Public/5.1.0/ucd/UCD.html#UnicodeData.txt +// e.g. 0001;;Cc;0;BN;;;;;N;START OF HEADING;;;; +// -> 0001 -> {name: "[control]", generalCategory: "Cc", ...} +function parseLine(line: string) { + if (line.startsWith("#")) { + return null; // comments + } + + const parts = line.split(";"); + const codepoint = parts[0]; + const name = parts[1].replace(/[<>]/g, s => (s === "<" ? "[" : "]")); + return [codepoint, { name }] as const; +} + async function downloadFile(url: string, destination: string) { const res = await fetch(url); await mkdir(path.dirname(destination), { recursive: true }); diff --git a/routes/api/unicode/lib/store.ts b/routes/api/unicode/lib/store.ts index 58dbb030..ba4a6c70 100644 --- a/routes/api/unicode/lib/store.ts +++ b/routes/api/unicode/lib/store.ts @@ -2,10 +2,11 @@ import path from "path"; import { readFileSync } from "fs"; import { env } from "../../../../utils/misc.js"; +import { INPUT_DATA_SOURCE } from "./scraper.js"; export class Store { version = -1; - private codepointToName: Map = new Map(); + private codepointToName: Map = new Map(); constructor() { this.fill(); @@ -13,15 +14,16 @@ export class Store { /** Fill the store with its contents from the filesystem. */ fill() { - this.codepointToName = new Map( - Object.entries(readJson("codepoint-to-name.json")), - ); + this.codepointToName = new Map(readJson("codepoint-to-name.json")); this.version = Date.now(); } + getNameByHexCodePoint(hex: string) { + return this.codepointToName.get(hex) ?? null; + } - getNameByCodepoint(codepoint: string) { - return this.codepointToName.get(String.raw`\u` + codepoint) ?? null; + get dataSource() { + return INPUT_DATA_SOURCE; } } diff --git a/routes/api/unicode/names.ts b/routes/api/unicode/names.ts index a0d957e1..a213dae3 100644 --- a/routes/api/unicode/names.ts +++ b/routes/api/unicode/names.ts @@ -3,7 +3,8 @@ import type { Request, Response } from "express"; import { store, type Store } from "./lib/store-init.js"; interface Query { - codepoint: string; + /** Codepoint as hex */ + hex: string; } interface Result { name: string; @@ -19,7 +20,7 @@ type IRequest = Request; interface ResponseData { data: Array<{ query: Query; result: Result | null }>; - metadata: { lastParsedAt: string }; + metadata: { lastParsedAt: string; dataSource: string }; } export default function route(req: IRequest, res: Response) { @@ -37,16 +38,18 @@ export default function route(req: IRequest, res: Response) { const result: ResponseData = { data, metadata: { - lastParsedAt: "", + lastParsedAt: store.version.toString(), + dataSource: store.dataSource, }, }; res.json(result); } function search(query: Query, store: Store, _options: Options): Result | null { - if (query.codepoint) { - const name = store.getNameByCodepoint(query.codepoint); - return typeof name === "string" ? { name } : null; + if (query.hex) { + query.hex = query.hex.toUpperCase().padStart(4, "0"); + const data = store.getNameByHexCodePoint(query.hex); + return data; } return null; } From 150924d3d7ca1f974724cdf0eb5c4086ffb23440 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcos=20C=C3=A1ceres?= Date: Sun, 1 Mar 2026 13:29:05 +1100 Subject: [PATCH 3/3] Update routes/api/unicode/lib/scraper.ts --- routes/api/unicode/lib/scraper.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/routes/api/unicode/lib/scraper.ts b/routes/api/unicode/lib/scraper.ts index 9d0bc479..11c8f667 100644 --- a/routes/api/unicode/lib/scraper.ts +++ b/routes/api/unicode/lib/scraper.ts @@ -25,7 +25,7 @@ export default async function main(options: Partial = {}) { options = { ...defaultOptions, ...options } as Options; const hasUpdated = await updateInputSource(); if (!hasUpdated && !options.forceUpdate) { - console.log("Nothing to update"); + console.log("Nothing to update from Unicode."); return false; }