diff --git a/assets/js/bitstring.mjs b/assets/js/bitstring.mjs index 40168b7a9..dfbcd8181 100644 --- a/assets/js/bitstring.mjs +++ b/assets/js/bitstring.mjs @@ -247,6 +247,27 @@ export default class Bitstring { } } + // Decodes a UTF-8 sequence starting at the given position. + // Returns the decoded Unicode code point value. + // bytes: Uint8Array containing the UTF-8 encoded data + // start: byte index where the sequence begins + // length: number of bytes in the UTF-8 sequence (1-4) + static decodeUtf8CodePoint(bytes, start, length) { + if (length === 1) return bytes[start]; + + // First byte masks: 2-byte=0x1f, 3-byte=0x0f, 4-byte=0x07 + const firstByteMasks = {2: 0x1f, 3: 0x0f, 4: 0x07}; + + let codePoint = bytes[start] & firstByteMasks[length]; + + // Process continuation bytes (all use 0x3f mask, shift by 6 each) + for (let i = 1; i < length; i++) { + codePoint = (codePoint << 6) | (bytes[start + i] & 0x3f); + } + + return codePoint; + } + static fromBits(bits) { const bitCount = bits.length; const byteCount = Math.ceil(bitCount / 8); @@ -572,6 +593,29 @@ export default class Bitstring { return bitstring.text !== false; } + // Validates that a code point is within UTF-8 rules: + // - Not an overlong encoding (using more bytes than necessary) + // - Not a UTF-16 surrogate (U+D800–U+DFFF) + // - Not above maximum Unicode (U+10FFFF) + static isValidUtf8CodePoint(codePoint, encodingLength) { + // Check for overlong encodings (security issue) + const minValueForLength = {1: 0, 2: 0x80, 3: 0x800, 4: 0x10000}; + + // Reject code points that could have been encoded with fewer bytes (overlong) + if (codePoint < minValueForLength[encodingLength]) return false; + // Reject UTF-16 surrogates (U+D800–U+DFFF) + if (codePoint >= 0xd800 && codePoint <= 0xdfff) return false; + // Reject code points beyond Unicode range (> U+10FFFF) + if (codePoint > 0x10ffff) return false; + + return true; + } + + // Checks if a byte is a valid UTF-8 continuation byte (10xxxxxx). + static isValidUtf8ContinuationByte(byte) { + return (byte & 0xc0) === 0x80; + } + static maybeResolveHex(bitstring) { if (bitstring.hex === null) { $.maybeSetBytesFromText(bitstring); diff --git a/assets/js/erlang/unicode.mjs b/assets/js/erlang/unicode.mjs index 8479f3528..064b25164 100644 --- a/assets/js/erlang/unicode.mjs +++ b/assets/js/erlang/unicode.mjs @@ -95,55 +95,6 @@ const Erlang_Unicode = { // and rejecting overlong encodings, surrogates, and out-of-range values. // Time complexity: O(n) where n is the number of bytes. const findValidUtf8Length = (bytes) => { - // Checks if a byte is a valid UTF-8 continuation byte (10xxxxxx). - const isValidContinuation = (byte) => (byte & 0xc0) === 0x80; - - // Decodes a UTF-8 sequence starting at the given position. - // Returns the decoded Unicode code point value. - const decodeCodePoint = (start, length) => { - if (length === 1) { - return bytes[start]; - } - - if (length === 2) { - return ((bytes[start] & 0x1f) << 6) | (bytes[start + 1] & 0x3f); - } - - if (length === 3) { - return ( - ((bytes[start] & 0x0f) << 12) | - ((bytes[start + 1] & 0x3f) << 6) | - (bytes[start + 2] & 0x3f) - ); - } - - // length === 4 - return ( - ((bytes[start] & 0x07) << 18) | - ((bytes[start + 1] & 0x3f) << 12) | - ((bytes[start + 2] & 0x3f) << 6) | - (bytes[start + 3] & 0x3f) - ); - }; - - // Validates that a code point is within UTF-8 rules: - // - Not an overlong encoding (using more bytes than necessary) - // - Not a UTF-16 surrogate (U+D800–U+DFFF) - // - Not above maximum Unicode (U+10FFFF) - const isValidCodePoint = (codePoint, encodingLength) => { - // Check for overlong encodings (security issue) - const minValueForLength = [0, 0, 0x80, 0x800, 0x10000]; - if (codePoint < minValueForLength[encodingLength]) return false; - - // Reject UTF-16 surrogates (U+D800–U+DFFF) - if (codePoint >= 0xd800 && codePoint <= 0xdfff) return false; - - // Reject code points beyond Unicode range (> U+10FFFF) - if (codePoint > 0x10ffff) return false; - - return true; - }; - // Validates a complete UTF-8 sequence at the given position. // Checks: sufficient bytes, valid continuations, and valid code point. const isValidSequence = (start, length) => { @@ -152,13 +103,14 @@ const Erlang_Unicode = { // Verify all continuation bytes have correct pattern (10xxxxxx) for (let i = 1; i < length; i++) { - if (!isValidContinuation(bytes[start + i])) return false; + if (!Bitstring.isValidUtf8ContinuationByte(bytes[start + i])) + return false; } // Decode and validate the code point value - const codePoint = decodeCodePoint(start, length); + const codePoint = Bitstring.decodeUtf8CodePoint(bytes, start, length); - return isValidCodePoint(codePoint, length); + return Bitstring.isValidUtf8CodePoint(codePoint, length); }; // Checks if there's a truncated (incomplete) sequence at position. @@ -174,7 +126,8 @@ const Erlang_Unicode = { // Check all available continuation bytes for (let i = 1; i < availableBytes; i++) { - if (!isValidContinuation(bytes[start + i])) return false; + if (!Bitstring.isValidUtf8ContinuationByte(bytes[start + i])) + return false; } return true; @@ -384,55 +337,6 @@ const Erlang_Unicode = { // and rejecting overlong encodings, surrogates, and out-of-range values. // Time complexity: O(n) where n is the number of bytes. const findValidUtf8Length = (bytes) => { - // Checks if a byte is a valid UTF-8 continuation byte (10xxxxxx). - const isValidContinuation = (byte) => (byte & 0xc0) === 0x80; - - // Decodes a UTF-8 sequence starting at the given position. - // Returns the decoded Unicode code point value. - const decodeCodePoint = (start, length) => { - if (length === 1) { - return bytes[start]; - } - - if (length === 2) { - return ((bytes[start] & 0x1f) << 6) | (bytes[start + 1] & 0x3f); - } - - if (length === 3) { - return ( - ((bytes[start] & 0x0f) << 12) | - ((bytes[start + 1] & 0x3f) << 6) | - (bytes[start + 2] & 0x3f) - ); - } - - // length === 4 - return ( - ((bytes[start] & 0x07) << 18) | - ((bytes[start + 1] & 0x3f) << 12) | - ((bytes[start + 2] & 0x3f) << 6) | - (bytes[start + 3] & 0x3f) - ); - }; - - // Validates that a code point is within UTF-8 rules: - // - Not an overlong encoding (using more bytes than necessary) - // - Not a UTF-16 surrogate (U+D800–U+DFFF) - // - Not above maximum Unicode (U+10FFFF) - const isValidCodePoint = (codePoint, encodingLength) => { - // Check for overlong encodings (security issue) - const minValueForLength = [0, 0, 0x80, 0x800, 0x10000]; - if (codePoint < minValueForLength[encodingLength]) return false; - - // Reject UTF-16 surrogates (U+D800–U+DFFF) - if (codePoint >= 0xd800 && codePoint <= 0xdfff) return false; - - // Reject code points beyond Unicode range (> U+10FFFF) - if (codePoint > 0x10ffff) return false; - - return true; - }; - // Validates a complete UTF-8 sequence at the given position. // Checks: sufficient bytes, valid continuations, and valid code point. const isValidSequence = (start, length) => { @@ -441,13 +345,14 @@ const Erlang_Unicode = { // Verify all continuation bytes have correct pattern (10xxxxxx) for (let i = 1; i < length; i++) { - if (!isValidContinuation(bytes[start + i])) return false; + if (!Bitstring.isValidUtf8ContinuationByte(bytes[start + i])) + return false; } // Decode and validate the code point value - const codePoint = decodeCodePoint(start, length); + const codePoint = Bitstring.decodeUtf8CodePoint(bytes, start, length); - return isValidCodePoint(codePoint, length); + return Bitstring.isValidUtf8CodePoint(codePoint, length); }; // Main loop: scan forward, validating each sequence @@ -700,54 +605,6 @@ const Erlang_Unicode = { // and rejecting overlong encodings, surrogates, and out-of-range values. // Time complexity: O(n) where n is the number of bytes. const findValidUtf8Length = (bytes) => { - // Checks if a byte is a valid UTF-8 continuation byte (10xxxxxx). - const isValidContinuation = (byte) => (byte & 0xc0) === 0x80; - - // Decodes a UTF-8 sequence starting at the given position. - // Returns the decoded Unicode code point value. - const decodeCodePoint = (start, length) => { - if (length === 1) { - return bytes[start]; - } - - if (length === 2) { - return ((bytes[start] & 0x1f) << 6) | (bytes[start + 1] & 0x3f); - } - - if (length === 3) { - return ( - ((bytes[start] & 0x0f) << 12) | - ((bytes[start + 1] & 0x3f) << 6) | - (bytes[start + 2] & 0x3f) - ); - } - // length === 4 - return ( - ((bytes[start] & 0x07) << 18) | - ((bytes[start + 1] & 0x3f) << 12) | - ((bytes[start + 2] & 0x3f) << 6) | - (bytes[start + 3] & 0x3f) - ); - }; - - // Validates that a code point is within UTF-8 rules: - // - Not an overlong encoding (using more bytes than necessary) - // - Not a UTF-16 surrogate (U+D800–U+DFFF) - // - Not above maximum Unicode (U+10FFFF) - const isValidCodePoint = (codePoint, encodingLength) => { - // Check for overlong encodings (security issue) - const minValueForLength = [0, 0, 0x80, 0x800, 0x10000]; - if (codePoint < minValueForLength[encodingLength]) return false; - - // Reject UTF-16 surrogates (U+D800–U+DFFF) - if (codePoint >= 0xd800 && codePoint <= 0xdfff) return false; - - // Reject code points beyond Unicode range (> U+10FFFF) - if (codePoint > 0x10ffff) return false; - - return true; - }; - // Validates a complete UTF-8 sequence at the given position. // Checks: sufficient bytes, valid continuations, and valid code point. const isValidSequence = (start, length) => { @@ -756,13 +613,14 @@ const Erlang_Unicode = { // Verify all continuation bytes have correct pattern (10xxxxxx) for (let i = 1; i < length; i++) { - if (!isValidContinuation(bytes[start + i])) return false; + if (!Bitstring.isValidUtf8ContinuationByte(bytes[start + i])) + return false; } // Decode and validate the code point value - const codePoint = decodeCodePoint(start, length); + const codePoint = Bitstring.decodeUtf8CodePoint(bytes, start, length); - return isValidCodePoint(codePoint, length); + return Bitstring.isValidUtf8CodePoint(codePoint, length); }; // Main loop: scan forward, validating each sequence @@ -867,55 +725,6 @@ const Erlang_Unicode = { // and rejecting overlong encodings, surrogates, and out-of-range values. // Time complexity: O(n) where n is the number of bytes. const findValidUtf8Length = (bytes) => { - // Checks if a byte is a valid UTF-8 continuation byte (10xxxxxx). - const isValidContinuation = (byte) => (byte & 0xc0) === 0x80; - - // Decodes a UTF-8 sequence starting at the given position. - // Returns the decoded Unicode code point value. - const decodeCodePoint = (start, length) => { - if (length === 1) { - return bytes[start]; - } - - if (length === 2) { - return ((bytes[start] & 0x1f) << 6) | (bytes[start + 1] & 0x3f); - } - - if (length === 3) { - return ( - ((bytes[start] & 0x0f) << 12) | - ((bytes[start + 1] & 0x3f) << 6) | - (bytes[start + 2] & 0x3f) - ); - } - - // length === 4 - return ( - ((bytes[start] & 0x07) << 18) | - ((bytes[start + 1] & 0x3f) << 12) | - ((bytes[start + 2] & 0x3f) << 6) | - (bytes[start + 3] & 0x3f) - ); - }; - - // Validates that a code point is within UTF-8 rules: - // - Not an overlong encoding (using more bytes than necessary) - // - Not a UTF-16 surrogate (U+D800–U+DFFF) - // - Not above maximum Unicode (U+10FFFF) - const isValidCodePoint = (codePoint, encodingLength) => { - // Check for overlong encodings (security issue) - const minValueForLength = [0, 0, 0x80, 0x800, 0x10000]; - if (codePoint < minValueForLength[encodingLength]) return false; - - // Reject UTF-16 surrogates (U+D800–U+DFFF) - if (codePoint >= 0xd800 && codePoint <= 0xdfff) return false; - - // Reject code points beyond Unicode range (> U+10FFFF) - if (codePoint > 0x10ffff) return false; - - return true; - }; - // Validates a complete UTF-8 sequence at the given position. // Checks: sufficient bytes, valid continuations, and valid code point. const isValidSequence = (start, length) => { @@ -924,12 +733,13 @@ const Erlang_Unicode = { // Verify all continuation bytes have correct pattern (10xxxxxx) for (let i = 1; i < length; i++) { - if (!isValidContinuation(bytes[start + i])) return false; + if (!Bitstring.isValidUtf8ContinuationByte(bytes[start + i])) + return false; } // Decode and validate the code point value - const codePoint = decodeCodePoint(start, length); - return isValidCodePoint(codePoint, length); + const codePoint = Bitstring.decodeUtf8CodePoint(bytes, start, length); + return Bitstring.isValidUtf8CodePoint(codePoint, length); }; // Main loop: scan forward, validating each sequence @@ -1033,55 +843,6 @@ const Erlang_Unicode = { // and rejecting overlong encodings, surrogates, and out-of-range values. // Time complexity: O(n) where n is the number of bytes. const findValidUtf8Length = (bytes) => { - // Checks if a byte is a valid UTF-8 continuation byte (10xxxxxx). - const isValidContinuation = (byte) => (byte & 0xc0) === 0x80; - - // Decodes a UTF-8 sequence starting at the given position. - // Returns the decoded Unicode code point value. - const decodeCodePoint = (start, length) => { - if (length === 1) { - return bytes[start]; - } - - if (length === 2) { - return ((bytes[start] & 0x1f) << 6) | (bytes[start + 1] & 0x3f); - } - - if (length === 3) { - return ( - ((bytes[start] & 0x0f) << 12) | - ((bytes[start + 1] & 0x3f) << 6) | - (bytes[start + 2] & 0x3f) - ); - } - - // length === 4 - return ( - ((bytes[start] & 0x07) << 18) | - ((bytes[start + 1] & 0x3f) << 12) | - ((bytes[start + 2] & 0x3f) << 6) | - (bytes[start + 3] & 0x3f) - ); - }; - - // Validates that a code point is within UTF-8 rules: - // - Not an overlong encoding (using more bytes than necessary) - // - Not a UTF-16 surrogate (U+D800–U+DFFF) - // - Not above maximum Unicode (U+10FFFF) - const isValidCodePoint = (codePoint, encodingLength) => { - // Check for overlong encodings (security issue) - const minValueForLength = [0, 0, 0x80, 0x800, 0x10000]; - if (codePoint < minValueForLength[encodingLength]) return false; - - // Reject UTF-16 surrogates (U+D800–U+DFFF) - if (codePoint >= 0xd800 && codePoint <= 0xdfff) return false; - - // Reject code points beyond Unicode range (> U+10FFFF) - if (codePoint > 0x10ffff) return false; - - return true; - }; - // Validates a complete UTF-8 sequence at the given position. // Checks: sufficient bytes, valid continuations, and valid code point. const isValidSequence = (start, length) => { @@ -1090,15 +851,15 @@ const Erlang_Unicode = { // Verify all continuation bytes have correct pattern (10xxxxxx) for (let i = 1; i < length; i++) { - if (!isValidContinuation(bytes[start + i])) { + if (!Bitstring.isValidUtf8ContinuationByte(bytes[start + i])) { return false; } } // Decode and validate the code point value - const codePoint = decodeCodePoint(start, length); + const codePoint = Bitstring.decodeUtf8CodePoint(bytes, start, length); - return isValidCodePoint(codePoint, length); + return Bitstring.isValidUtf8CodePoint(codePoint, length); }; // Main loop: scan forward, validating each sequence diff --git a/test/javascript/bitstring_test.mjs b/test/javascript/bitstring_test.mjs index d822d1ffa..6f1916f56 100644 --- a/test/javascript/bitstring_test.mjs +++ b/test/javascript/bitstring_test.mjs @@ -1231,6 +1231,43 @@ describe("Bitstring", () => { }); }); + describe("decodeUtf8CodePoint()", () => { + it("decodes 1-byte UTF-8 sequence (ASCII)", () => { + // 'A' = 0x41 = U+0041 + const bytes = new Uint8Array([0x41]); + const codePoint = Bitstring.decodeUtf8CodePoint(bytes, 0, 1); + assert.equal(codePoint, 0x41); + }); + + it("decodes 2-byte UTF-8 sequence", () => { + // '£' = 0xC2 0xA3 = U+00A3 (pound sign) + const bytes = new Uint8Array([0xc2, 0xa3]); + const codePoint = Bitstring.decodeUtf8CodePoint(bytes, 0, 2); + assert.equal(codePoint, 0xa3); + }); + + it("decodes 3-byte UTF-8 sequence", () => { + // '€' = 0xE2 0x82 0xAC = U+20AC (euro sign) + const bytes = new Uint8Array([0xe2, 0x82, 0xac]); + const codePoint = Bitstring.decodeUtf8CodePoint(bytes, 0, 3); + assert.equal(codePoint, 0x20ac); + }); + + it("decodes 4-byte UTF-8 sequence", () => { + // '𐍈' = 0xF0 0x90 0x8D 0x88 = U+10348 (Gothic letter hwair) + const bytes = new Uint8Array([0xf0, 0x90, 0x8d, 0x88]); + const codePoint = Bitstring.decodeUtf8CodePoint(bytes, 0, 4); + assert.equal(codePoint, 0x10348); + }); + + it("decodes from non-zero start position", () => { + // Test decoding '£' starting at position 2 + const bytes = new Uint8Array([0x41, 0x42, 0xc2, 0xa3]); + const codePoint = Bitstring.decodeUtf8CodePoint(bytes, 2, 2); + assert.equal(codePoint, 0xa3); + }); + }); + describe("fromBits()", () => { it("empty", () => { const result = Bitstring.fromBits([]); @@ -5227,6 +5264,50 @@ describe("Bitstring", () => { }); }); + describe("isValidUtf8ContinuationByte()", () => { + it("valid continuation byte (10xxxxxx pattern)", () => { + assert.isTrue(Bitstring.isValidUtf8ContinuationByte(0x80)); // 10000000 + assert.isTrue(Bitstring.isValidUtf8ContinuationByte(0xbf)); // 10111111 + }); + + it("invalid continuation byte (not 10xxxxxx pattern)", () => { + assert.isFalse(Bitstring.isValidUtf8ContinuationByte(0x00)); // 00000000 (ASCII) + assert.isFalse(Bitstring.isValidUtf8ContinuationByte(0x7f)); // 01111111 (ASCII) + assert.isFalse(Bitstring.isValidUtf8ContinuationByte(0xc0)); // 11000000 (2-byte start) + assert.isFalse(Bitstring.isValidUtf8ContinuationByte(0xff)); // 11111111 (invalid) + }); + }); + + describe("isValidUtf8CodePoint()", () => { + it("valid codepoint", () => { + assert.isTrue(Bitstring.isValidUtf8CodePoint(0x41, 1)); // ASCII 'A' + assert.isTrue(Bitstring.isValidUtf8CodePoint(0xa9, 2)); // © (copyright) + assert.isTrue(Bitstring.isValidUtf8CodePoint(0x20ac, 3)); // € (euro) + assert.isTrue(Bitstring.isValidUtf8CodePoint(0x10348, 4)); // 𐍈 (Gothic letter) + assert.isTrue(Bitstring.isValidUtf8CodePoint(0x10ffff, 4)); // Maximum valid Unicode + }); + + it("overlong encoding (codepoint too small for encoding length)", () => { + // 'A' (0x41) must use 1 byte, not 2 + assert.isFalse(Bitstring.isValidUtf8CodePoint(0x41, 2)); + // 0x7FF requires 2 bytes, but attempting 3-byte encoding + assert.isFalse(Bitstring.isValidUtf8CodePoint(0x7ff, 3)); + // 0xFFFF requires 3 bytes, but attempting 4-byte encoding + assert.isFalse(Bitstring.isValidUtf8CodePoint(0xffff, 4)); + }); + + it("UTF-16 surrogate (U+D800–U+DFFF)", () => { + assert.isFalse(Bitstring.isValidUtf8CodePoint(0xd800, 3)); // Start of surrogate range + assert.isFalse(Bitstring.isValidUtf8CodePoint(0xdc00, 3)); // Middle of surrogate range + assert.isFalse(Bitstring.isValidUtf8CodePoint(0xdfff, 3)); // End of surrogate range + }); + + it("beyond Unicode range (> U+10FFFF)", () => { + assert.isFalse(Bitstring.isValidUtf8CodePoint(0x110000, 4)); + assert.isFalse(Bitstring.isValidUtf8CodePoint(0x200000, 4)); + }); + }); + describe("maybeResolveHex()", () => { it("when hex field is already set", () => { const bitstring = Type.bitstring("Hologram");