From 9a49bec70d22ca8a76e8fe1aa9292fd705e2bf3e Mon Sep 17 00:00:00 2001 From: Michael Ward Date: Tue, 17 Feb 2026 20:00:30 +0000 Subject: [PATCH 1/7] Extracts UTF-8 code point decoding to the BitString utility class, adds parameter validation --- assets/js/bitstring.mjs | 21 ++++ assets/js/erlang/unicode.mjs | 149 +---------------------------- test/javascript/bitstring_test.mjs | 37 +++++++ 3 files changed, 63 insertions(+), 144 deletions(-) diff --git a/assets/js/bitstring.mjs b/assets/js/bitstring.mjs index 40168b7a9..f4c9b8695 100644 --- a/assets/js/bitstring.mjs +++ b/assets/js/bitstring.mjs @@ -247,6 +247,27 @@ export default class Bitstring { } } + // Decodes a UTF-8 sequence starting at the given position. + // Returns the decoded Unicode code point value. + // bytes: Uint8Array containing the UTF-8 encoded data + // start: byte index where the sequence begins + // length: number of bytes in the UTF-8 sequence (1-4) + static decodeUtf8CodePoint(bytes, start, length) { + if (length === 1) return bytes[start]; + + // First byte masks: 2-byte=0x1f, 3-byte=0x0f, 4-byte=0x07 + const firstByteMasks = {2: 0x1f, 3: 0x0f, 4: 0x07}; + + let codePoint = bytes[start] & firstByteMasks[length]; + + // Process continuation bytes (all use 0x3f mask, shift by 6 each) + for (let i = 1; i < length; i++) { + codePoint = (codePoint << 6) | (bytes[start + i] & 0x3f); + } + + return codePoint; + } + static fromBits(bits) { const bitCount = bits.length; const byteCount = Math.ceil(bitCount / 8); diff --git a/assets/js/erlang/unicode.mjs b/assets/js/erlang/unicode.mjs index 8479f3528..c47b0b2e2 100644 --- a/assets/js/erlang/unicode.mjs +++ b/assets/js/erlang/unicode.mjs @@ -98,34 +98,6 @@ const Erlang_Unicode = { // Checks if a byte is a valid UTF-8 continuation byte (10xxxxxx). const isValidContinuation = (byte) => (byte & 0xc0) === 0x80; - // Decodes a UTF-8 sequence starting at the given position. - // Returns the decoded Unicode code point value. - const decodeCodePoint = (start, length) => { - if (length === 1) { - return bytes[start]; - } - - if (length === 2) { - return ((bytes[start] & 0x1f) << 6) | (bytes[start + 1] & 0x3f); - } - - if (length === 3) { - return ( - ((bytes[start] & 0x0f) << 12) | - ((bytes[start + 1] & 0x3f) << 6) | - (bytes[start + 2] & 0x3f) - ); - } - - // length === 4 - return ( - ((bytes[start] & 0x07) << 18) | - ((bytes[start + 1] & 0x3f) << 12) | - ((bytes[start + 2] & 0x3f) << 6) | - (bytes[start + 3] & 0x3f) - ); - }; - // Validates that a code point is within UTF-8 rules: // - Not an overlong encoding (using more bytes than necessary) // - Not a UTF-16 surrogate (U+D800–U+DFFF) @@ -156,7 +128,7 @@ const Erlang_Unicode = { } // Decode and validate the code point value - const codePoint = decodeCodePoint(start, length); + const codePoint = Bitstring.decodeUtf8CodePoint(bytes, start, length); return isValidCodePoint(codePoint, length); }; @@ -387,34 +359,6 @@ const Erlang_Unicode = { // Checks if a byte is a valid UTF-8 continuation byte (10xxxxxx). const isValidContinuation = (byte) => (byte & 0xc0) === 0x80; - // Decodes a UTF-8 sequence starting at the given position. - // Returns the decoded Unicode code point value. - const decodeCodePoint = (start, length) => { - if (length === 1) { - return bytes[start]; - } - - if (length === 2) { - return ((bytes[start] & 0x1f) << 6) | (bytes[start + 1] & 0x3f); - } - - if (length === 3) { - return ( - ((bytes[start] & 0x0f) << 12) | - ((bytes[start + 1] & 0x3f) << 6) | - (bytes[start + 2] & 0x3f) - ); - } - - // length === 4 - return ( - ((bytes[start] & 0x07) << 18) | - ((bytes[start + 1] & 0x3f) << 12) | - ((bytes[start + 2] & 0x3f) << 6) | - (bytes[start + 3] & 0x3f) - ); - }; - // Validates that a code point is within UTF-8 rules: // - Not an overlong encoding (using more bytes than necessary) // - Not a UTF-16 surrogate (U+D800–U+DFFF) @@ -445,7 +389,7 @@ const Erlang_Unicode = { } // Decode and validate the code point value - const codePoint = decodeCodePoint(start, length); + const codePoint = Bitstring.decodeUtf8CodePoint(bytes, start, length); return isValidCodePoint(codePoint, length); }; @@ -703,33 +647,6 @@ const Erlang_Unicode = { // Checks if a byte is a valid UTF-8 continuation byte (10xxxxxx). const isValidContinuation = (byte) => (byte & 0xc0) === 0x80; - // Decodes a UTF-8 sequence starting at the given position. - // Returns the decoded Unicode code point value. - const decodeCodePoint = (start, length) => { - if (length === 1) { - return bytes[start]; - } - - if (length === 2) { - return ((bytes[start] & 0x1f) << 6) | (bytes[start + 1] & 0x3f); - } - - if (length === 3) { - return ( - ((bytes[start] & 0x0f) << 12) | - ((bytes[start + 1] & 0x3f) << 6) | - (bytes[start + 2] & 0x3f) - ); - } - // length === 4 - return ( - ((bytes[start] & 0x07) << 18) | - ((bytes[start + 1] & 0x3f) << 12) | - ((bytes[start + 2] & 0x3f) << 6) | - (bytes[start + 3] & 0x3f) - ); - }; - // Validates that a code point is within UTF-8 rules: // - Not an overlong encoding (using more bytes than necessary) // - Not a UTF-16 surrogate (U+D800–U+DFFF) @@ -760,7 +677,7 @@ const Erlang_Unicode = { } // Decode and validate the code point value - const codePoint = decodeCodePoint(start, length); + const codePoint = Bitstring.decodeUtf8CodePoint(bytes, start, length); return isValidCodePoint(codePoint, length); }; @@ -870,34 +787,6 @@ const Erlang_Unicode = { // Checks if a byte is a valid UTF-8 continuation byte (10xxxxxx). const isValidContinuation = (byte) => (byte & 0xc0) === 0x80; - // Decodes a UTF-8 sequence starting at the given position. - // Returns the decoded Unicode code point value. - const decodeCodePoint = (start, length) => { - if (length === 1) { - return bytes[start]; - } - - if (length === 2) { - return ((bytes[start] & 0x1f) << 6) | (bytes[start + 1] & 0x3f); - } - - if (length === 3) { - return ( - ((bytes[start] & 0x0f) << 12) | - ((bytes[start + 1] & 0x3f) << 6) | - (bytes[start + 2] & 0x3f) - ); - } - - // length === 4 - return ( - ((bytes[start] & 0x07) << 18) | - ((bytes[start + 1] & 0x3f) << 12) | - ((bytes[start + 2] & 0x3f) << 6) | - (bytes[start + 3] & 0x3f) - ); - }; - // Validates that a code point is within UTF-8 rules: // - Not an overlong encoding (using more bytes than necessary) // - Not a UTF-16 surrogate (U+D800–U+DFFF) @@ -928,7 +817,7 @@ const Erlang_Unicode = { } // Decode and validate the code point value - const codePoint = decodeCodePoint(start, length); + const codePoint = Bitstring.decodeUtf8CodePoint(bytes, start, length); return isValidCodePoint(codePoint, length); }; @@ -1036,34 +925,6 @@ const Erlang_Unicode = { // Checks if a byte is a valid UTF-8 continuation byte (10xxxxxx). const isValidContinuation = (byte) => (byte & 0xc0) === 0x80; - // Decodes a UTF-8 sequence starting at the given position. - // Returns the decoded Unicode code point value. - const decodeCodePoint = (start, length) => { - if (length === 1) { - return bytes[start]; - } - - if (length === 2) { - return ((bytes[start] & 0x1f) << 6) | (bytes[start + 1] & 0x3f); - } - - if (length === 3) { - return ( - ((bytes[start] & 0x0f) << 12) | - ((bytes[start + 1] & 0x3f) << 6) | - (bytes[start + 2] & 0x3f) - ); - } - - // length === 4 - return ( - ((bytes[start] & 0x07) << 18) | - ((bytes[start + 1] & 0x3f) << 12) | - ((bytes[start + 2] & 0x3f) << 6) | - (bytes[start + 3] & 0x3f) - ); - }; - // Validates that a code point is within UTF-8 rules: // - Not an overlong encoding (using more bytes than necessary) // - Not a UTF-16 surrogate (U+D800–U+DFFF) @@ -1096,7 +957,7 @@ const Erlang_Unicode = { } // Decode and validate the code point value - const codePoint = decodeCodePoint(start, length); + const codePoint = Bitstring.decodeUtf8CodePoint(bytes, start, length); return isValidCodePoint(codePoint, length); }; diff --git a/test/javascript/bitstring_test.mjs b/test/javascript/bitstring_test.mjs index d822d1ffa..88b725447 100644 --- a/test/javascript/bitstring_test.mjs +++ b/test/javascript/bitstring_test.mjs @@ -1231,6 +1231,43 @@ describe("Bitstring", () => { }); }); + describe("decodeUtf8CodePoint()", () => { + it("decodes 1-byte UTF-8 sequence (ASCII)", () => { + // 'A' = 0x41 = U+0041 + const bytes = new Uint8Array([0x41]); + const codePoint = Bitstring.decodeUtf8CodePoint(bytes, 0, 1); + assert.equal(codePoint, 0x41); + }); + + it("decodes 2-byte UTF-8 sequence", () => { + // '£' = 0xC2 0xA3 = U+00A3 (pound sign) + const bytes = new Uint8Array([0xc2, 0xa3]); + const codePoint = Bitstring.decodeUtf8CodePoint(bytes, 0, 2); + assert.equal(codePoint, 0xa3); + }); + + it("decodes 3-byte UTF-8 sequence", () => { + // '€' = 0xE2 0x82 0xAC = U+20AC (euro sign) + const bytes = new Uint8Array([0xe2, 0x82, 0xac]); + const codePoint = Bitstring.decodeUtf8CodePoint(bytes, 0, 3); + assert.equal(codePoint, 0x20ac); + }); + + it("decodes 4-byte UTF-8 sequence", () => { + // '𐍈' = 0xF0 0x90 0x8D 0x88 = U+10348 (Gothic letter hwair) + const bytes = new Uint8Array([0xf0, 0x90, 0x8d, 0x88]); + const codePoint = Bitstring.decodeUtf8CodePoint(bytes, 0, 4); + assert.equal(codePoint, 0x10348); + }); + + it("decodes from non-zero start position", () => { + // Test decoding '£' starting at position 2 + const bytes = new Uint8Array([0x41, 0x42, 0xc2, 0xa3]); + const codePoint = Bitstring.decodeUtf8CodePoint(bytes, 2, 2); + assert.equal(codePoint, 0xa3); + }); + }); + describe("fromBits()", () => { it("empty", () => { const result = Bitstring.fromBits([]); From 105d2d91dd0e3bc511079f260848483f0c378872 Mon Sep 17 00:00:00 2001 From: Michael Ward Date: Wed, 18 Feb 2026 15:01:43 +0000 Subject: [PATCH 2/7] Extracts UTF-8 continuation byte validation to BitString class --- assets/js/bitstring.mjs | 5 +++++ assets/js/erlang/unicode.mjs | 32 ++++++++++-------------------- test/javascript/bitstring_test.mjs | 14 +++++++++++++ 3 files changed, 30 insertions(+), 21 deletions(-) diff --git a/assets/js/bitstring.mjs b/assets/js/bitstring.mjs index f4c9b8695..881118d75 100644 --- a/assets/js/bitstring.mjs +++ b/assets/js/bitstring.mjs @@ -593,6 +593,11 @@ export default class Bitstring { return bitstring.text !== false; } + // Checks if a byte is a valid UTF-8 continuation byte (10xxxxxx). + static isValidUtf8ContinuationByte(byte) { + return (byte & 0xc0) === 0x80; + } + static maybeResolveHex(bitstring) { if (bitstring.hex === null) { $.maybeSetBytesFromText(bitstring); diff --git a/assets/js/erlang/unicode.mjs b/assets/js/erlang/unicode.mjs index c47b0b2e2..e54076495 100644 --- a/assets/js/erlang/unicode.mjs +++ b/assets/js/erlang/unicode.mjs @@ -95,9 +95,6 @@ const Erlang_Unicode = { // and rejecting overlong encodings, surrogates, and out-of-range values. // Time complexity: O(n) where n is the number of bytes. const findValidUtf8Length = (bytes) => { - // Checks if a byte is a valid UTF-8 continuation byte (10xxxxxx). - const isValidContinuation = (byte) => (byte & 0xc0) === 0x80; - // Validates that a code point is within UTF-8 rules: // - Not an overlong encoding (using more bytes than necessary) // - Not a UTF-16 surrogate (U+D800–U+DFFF) @@ -124,7 +121,8 @@ const Erlang_Unicode = { // Verify all continuation bytes have correct pattern (10xxxxxx) for (let i = 1; i < length; i++) { - if (!isValidContinuation(bytes[start + i])) return false; + if (!Bitstring.isValidUtf8ContinuationByte(bytes[start + i])) + return false; } // Decode and validate the code point value @@ -146,7 +144,8 @@ const Erlang_Unicode = { // Check all available continuation bytes for (let i = 1; i < availableBytes; i++) { - if (!isValidContinuation(bytes[start + i])) return false; + if (!Bitstring.isValidUtf8ContinuationByte(bytes[start + i])) + return false; } return true; @@ -356,9 +355,6 @@ const Erlang_Unicode = { // and rejecting overlong encodings, surrogates, and out-of-range values. // Time complexity: O(n) where n is the number of bytes. const findValidUtf8Length = (bytes) => { - // Checks if a byte is a valid UTF-8 continuation byte (10xxxxxx). - const isValidContinuation = (byte) => (byte & 0xc0) === 0x80; - // Validates that a code point is within UTF-8 rules: // - Not an overlong encoding (using more bytes than necessary) // - Not a UTF-16 surrogate (U+D800–U+DFFF) @@ -385,7 +381,8 @@ const Erlang_Unicode = { // Verify all continuation bytes have correct pattern (10xxxxxx) for (let i = 1; i < length; i++) { - if (!isValidContinuation(bytes[start + i])) return false; + if (!Bitstring.isValidUtf8ContinuationByte(bytes[start + i])) + return false; } // Decode and validate the code point value @@ -644,9 +641,6 @@ const Erlang_Unicode = { // and rejecting overlong encodings, surrogates, and out-of-range values. // Time complexity: O(n) where n is the number of bytes. const findValidUtf8Length = (bytes) => { - // Checks if a byte is a valid UTF-8 continuation byte (10xxxxxx). - const isValidContinuation = (byte) => (byte & 0xc0) === 0x80; - // Validates that a code point is within UTF-8 rules: // - Not an overlong encoding (using more bytes than necessary) // - Not a UTF-16 surrogate (U+D800–U+DFFF) @@ -673,7 +667,8 @@ const Erlang_Unicode = { // Verify all continuation bytes have correct pattern (10xxxxxx) for (let i = 1; i < length; i++) { - if (!isValidContinuation(bytes[start + i])) return false; + if (!Bitstring.isValidUtf8ContinuationByte(bytes[start + i])) + return false; } // Decode and validate the code point value @@ -784,9 +779,6 @@ const Erlang_Unicode = { // and rejecting overlong encodings, surrogates, and out-of-range values. // Time complexity: O(n) where n is the number of bytes. const findValidUtf8Length = (bytes) => { - // Checks if a byte is a valid UTF-8 continuation byte (10xxxxxx). - const isValidContinuation = (byte) => (byte & 0xc0) === 0x80; - // Validates that a code point is within UTF-8 rules: // - Not an overlong encoding (using more bytes than necessary) // - Not a UTF-16 surrogate (U+D800–U+DFFF) @@ -813,7 +805,8 @@ const Erlang_Unicode = { // Verify all continuation bytes have correct pattern (10xxxxxx) for (let i = 1; i < length; i++) { - if (!isValidContinuation(bytes[start + i])) return false; + if (!Bitstring.isValidUtf8ContinuationByte(bytes[start + i])) + return false; } // Decode and validate the code point value @@ -922,9 +915,6 @@ const Erlang_Unicode = { // and rejecting overlong encodings, surrogates, and out-of-range values. // Time complexity: O(n) where n is the number of bytes. const findValidUtf8Length = (bytes) => { - // Checks if a byte is a valid UTF-8 continuation byte (10xxxxxx). - const isValidContinuation = (byte) => (byte & 0xc0) === 0x80; - // Validates that a code point is within UTF-8 rules: // - Not an overlong encoding (using more bytes than necessary) // - Not a UTF-16 surrogate (U+D800–U+DFFF) @@ -951,7 +941,7 @@ const Erlang_Unicode = { // Verify all continuation bytes have correct pattern (10xxxxxx) for (let i = 1; i < length; i++) { - if (!isValidContinuation(bytes[start + i])) { + if (!Bitstring.isValidUtf8ContinuationByte(bytes[start + i])) { return false; } } diff --git a/test/javascript/bitstring_test.mjs b/test/javascript/bitstring_test.mjs index 88b725447..0a53ca436 100644 --- a/test/javascript/bitstring_test.mjs +++ b/test/javascript/bitstring_test.mjs @@ -5264,6 +5264,20 @@ describe("Bitstring", () => { }); }); + describe("isValidUtf8ContinuationByte()", () => { + it("valid continuation byte (10xxxxxx pattern)", () => { + assert.isTrue(Bitstring.isValidUtf8ContinuationByte(0x80)); // 10000000 + assert.isTrue(Bitstring.isValidUtf8ContinuationByte(0xbf)); // 10111111 + }); + + it("invalid continuation byte (not 10xxxxxx pattern)", () => { + assert.isFalse(Bitstring.isValidUtf8ContinuationByte(0x00)); // 00000000 (ASCII) + assert.isFalse(Bitstring.isValidUtf8ContinuationByte(0x7f)); // 01111111 (ASCII) + assert.isFalse(Bitstring.isValidUtf8ContinuationByte(0xc0)); // 11000000 (2-byte start) + assert.isFalse(Bitstring.isValidUtf8ContinuationByte(0xff)); // 11111111 (invalid) + }); + }); + describe("maybeResolveHex()", () => { it("when hex field is already set", () => { const bitstring = Type.bitstring("Hologram"); From 7015625b214ae92310278d6d214735ccda8be3fa Mon Sep 17 00:00:00 2001 From: Michael Ward Date: Wed, 18 Feb 2026 15:33:08 +0000 Subject: [PATCH 3/7] Extracts Utf-8 code point validation to BitString class --- assets/js/bitstring.mjs | 18 ++++++ assets/js/erlang/unicode.mjs | 100 ++--------------------------- test/javascript/bitstring_test.mjs | 30 +++++++++ 3 files changed, 53 insertions(+), 95 deletions(-) diff --git a/assets/js/bitstring.mjs b/assets/js/bitstring.mjs index 881118d75..dfbcd8181 100644 --- a/assets/js/bitstring.mjs +++ b/assets/js/bitstring.mjs @@ -593,6 +593,24 @@ export default class Bitstring { return bitstring.text !== false; } + // Validates that a code point is within UTF-8 rules: + // - Not an overlong encoding (using more bytes than necessary) + // - Not a UTF-16 surrogate (U+D800–U+DFFF) + // - Not above maximum Unicode (U+10FFFF) + static isValidUtf8CodePoint(codePoint, encodingLength) { + // Check for overlong encodings (security issue) + const minValueForLength = {1: 0, 2: 0x80, 3: 0x800, 4: 0x10000}; + + // Reject code points that could have been encoded with fewer bytes (overlong) + if (codePoint < minValueForLength[encodingLength]) return false; + // Reject UTF-16 surrogates (U+D800–U+DFFF) + if (codePoint >= 0xd800 && codePoint <= 0xdfff) return false; + // Reject code points beyond Unicode range (> U+10FFFF) + if (codePoint > 0x10ffff) return false; + + return true; + } + // Checks if a byte is a valid UTF-8 continuation byte (10xxxxxx). static isValidUtf8ContinuationByte(byte) { return (byte & 0xc0) === 0x80; diff --git a/assets/js/erlang/unicode.mjs b/assets/js/erlang/unicode.mjs index e54076495..064b25164 100644 --- a/assets/js/erlang/unicode.mjs +++ b/assets/js/erlang/unicode.mjs @@ -95,24 +95,6 @@ const Erlang_Unicode = { // and rejecting overlong encodings, surrogates, and out-of-range values. // Time complexity: O(n) where n is the number of bytes. const findValidUtf8Length = (bytes) => { - // Validates that a code point is within UTF-8 rules: - // - Not an overlong encoding (using more bytes than necessary) - // - Not a UTF-16 surrogate (U+D800–U+DFFF) - // - Not above maximum Unicode (U+10FFFF) - const isValidCodePoint = (codePoint, encodingLength) => { - // Check for overlong encodings (security issue) - const minValueForLength = [0, 0, 0x80, 0x800, 0x10000]; - if (codePoint < minValueForLength[encodingLength]) return false; - - // Reject UTF-16 surrogates (U+D800–U+DFFF) - if (codePoint >= 0xd800 && codePoint <= 0xdfff) return false; - - // Reject code points beyond Unicode range (> U+10FFFF) - if (codePoint > 0x10ffff) return false; - - return true; - }; - // Validates a complete UTF-8 sequence at the given position. // Checks: sufficient bytes, valid continuations, and valid code point. const isValidSequence = (start, length) => { @@ -128,7 +110,7 @@ const Erlang_Unicode = { // Decode and validate the code point value const codePoint = Bitstring.decodeUtf8CodePoint(bytes, start, length); - return isValidCodePoint(codePoint, length); + return Bitstring.isValidUtf8CodePoint(codePoint, length); }; // Checks if there's a truncated (incomplete) sequence at position. @@ -355,24 +337,6 @@ const Erlang_Unicode = { // and rejecting overlong encodings, surrogates, and out-of-range values. // Time complexity: O(n) where n is the number of bytes. const findValidUtf8Length = (bytes) => { - // Validates that a code point is within UTF-8 rules: - // - Not an overlong encoding (using more bytes than necessary) - // - Not a UTF-16 surrogate (U+D800–U+DFFF) - // - Not above maximum Unicode (U+10FFFF) - const isValidCodePoint = (codePoint, encodingLength) => { - // Check for overlong encodings (security issue) - const minValueForLength = [0, 0, 0x80, 0x800, 0x10000]; - if (codePoint < minValueForLength[encodingLength]) return false; - - // Reject UTF-16 surrogates (U+D800–U+DFFF) - if (codePoint >= 0xd800 && codePoint <= 0xdfff) return false; - - // Reject code points beyond Unicode range (> U+10FFFF) - if (codePoint > 0x10ffff) return false; - - return true; - }; - // Validates a complete UTF-8 sequence at the given position. // Checks: sufficient bytes, valid continuations, and valid code point. const isValidSequence = (start, length) => { @@ -388,7 +352,7 @@ const Erlang_Unicode = { // Decode and validate the code point value const codePoint = Bitstring.decodeUtf8CodePoint(bytes, start, length); - return isValidCodePoint(codePoint, length); + return Bitstring.isValidUtf8CodePoint(codePoint, length); }; // Main loop: scan forward, validating each sequence @@ -641,24 +605,6 @@ const Erlang_Unicode = { // and rejecting overlong encodings, surrogates, and out-of-range values. // Time complexity: O(n) where n is the number of bytes. const findValidUtf8Length = (bytes) => { - // Validates that a code point is within UTF-8 rules: - // - Not an overlong encoding (using more bytes than necessary) - // - Not a UTF-16 surrogate (U+D800–U+DFFF) - // - Not above maximum Unicode (U+10FFFF) - const isValidCodePoint = (codePoint, encodingLength) => { - // Check for overlong encodings (security issue) - const minValueForLength = [0, 0, 0x80, 0x800, 0x10000]; - if (codePoint < minValueForLength[encodingLength]) return false; - - // Reject UTF-16 surrogates (U+D800–U+DFFF) - if (codePoint >= 0xd800 && codePoint <= 0xdfff) return false; - - // Reject code points beyond Unicode range (> U+10FFFF) - if (codePoint > 0x10ffff) return false; - - return true; - }; - // Validates a complete UTF-8 sequence at the given position. // Checks: sufficient bytes, valid continuations, and valid code point. const isValidSequence = (start, length) => { @@ -674,7 +620,7 @@ const Erlang_Unicode = { // Decode and validate the code point value const codePoint = Bitstring.decodeUtf8CodePoint(bytes, start, length); - return isValidCodePoint(codePoint, length); + return Bitstring.isValidUtf8CodePoint(codePoint, length); }; // Main loop: scan forward, validating each sequence @@ -779,24 +725,6 @@ const Erlang_Unicode = { // and rejecting overlong encodings, surrogates, and out-of-range values. // Time complexity: O(n) where n is the number of bytes. const findValidUtf8Length = (bytes) => { - // Validates that a code point is within UTF-8 rules: - // - Not an overlong encoding (using more bytes than necessary) - // - Not a UTF-16 surrogate (U+D800–U+DFFF) - // - Not above maximum Unicode (U+10FFFF) - const isValidCodePoint = (codePoint, encodingLength) => { - // Check for overlong encodings (security issue) - const minValueForLength = [0, 0, 0x80, 0x800, 0x10000]; - if (codePoint < minValueForLength[encodingLength]) return false; - - // Reject UTF-16 surrogates (U+D800–U+DFFF) - if (codePoint >= 0xd800 && codePoint <= 0xdfff) return false; - - // Reject code points beyond Unicode range (> U+10FFFF) - if (codePoint > 0x10ffff) return false; - - return true; - }; - // Validates a complete UTF-8 sequence at the given position. // Checks: sufficient bytes, valid continuations, and valid code point. const isValidSequence = (start, length) => { @@ -811,7 +739,7 @@ const Erlang_Unicode = { // Decode and validate the code point value const codePoint = Bitstring.decodeUtf8CodePoint(bytes, start, length); - return isValidCodePoint(codePoint, length); + return Bitstring.isValidUtf8CodePoint(codePoint, length); }; // Main loop: scan forward, validating each sequence @@ -915,24 +843,6 @@ const Erlang_Unicode = { // and rejecting overlong encodings, surrogates, and out-of-range values. // Time complexity: O(n) where n is the number of bytes. const findValidUtf8Length = (bytes) => { - // Validates that a code point is within UTF-8 rules: - // - Not an overlong encoding (using more bytes than necessary) - // - Not a UTF-16 surrogate (U+D800–U+DFFF) - // - Not above maximum Unicode (U+10FFFF) - const isValidCodePoint = (codePoint, encodingLength) => { - // Check for overlong encodings (security issue) - const minValueForLength = [0, 0, 0x80, 0x800, 0x10000]; - if (codePoint < minValueForLength[encodingLength]) return false; - - // Reject UTF-16 surrogates (U+D800–U+DFFF) - if (codePoint >= 0xd800 && codePoint <= 0xdfff) return false; - - // Reject code points beyond Unicode range (> U+10FFFF) - if (codePoint > 0x10ffff) return false; - - return true; - }; - // Validates a complete UTF-8 sequence at the given position. // Checks: sufficient bytes, valid continuations, and valid code point. const isValidSequence = (start, length) => { @@ -949,7 +859,7 @@ const Erlang_Unicode = { // Decode and validate the code point value const codePoint = Bitstring.decodeUtf8CodePoint(bytes, start, length); - return isValidCodePoint(codePoint, length); + return Bitstring.isValidUtf8CodePoint(codePoint, length); }; // Main loop: scan forward, validating each sequence diff --git a/test/javascript/bitstring_test.mjs b/test/javascript/bitstring_test.mjs index 0a53ca436..6f1916f56 100644 --- a/test/javascript/bitstring_test.mjs +++ b/test/javascript/bitstring_test.mjs @@ -5278,6 +5278,36 @@ describe("Bitstring", () => { }); }); + describe("isValidUtf8CodePoint()", () => { + it("valid codepoint", () => { + assert.isTrue(Bitstring.isValidUtf8CodePoint(0x41, 1)); // ASCII 'A' + assert.isTrue(Bitstring.isValidUtf8CodePoint(0xa9, 2)); // © (copyright) + assert.isTrue(Bitstring.isValidUtf8CodePoint(0x20ac, 3)); // € (euro) + assert.isTrue(Bitstring.isValidUtf8CodePoint(0x10348, 4)); // 𐍈 (Gothic letter) + assert.isTrue(Bitstring.isValidUtf8CodePoint(0x10ffff, 4)); // Maximum valid Unicode + }); + + it("overlong encoding (codepoint too small for encoding length)", () => { + // 'A' (0x41) must use 1 byte, not 2 + assert.isFalse(Bitstring.isValidUtf8CodePoint(0x41, 2)); + // 0x7FF requires 2 bytes, but attempting 3-byte encoding + assert.isFalse(Bitstring.isValidUtf8CodePoint(0x7ff, 3)); + // 0xFFFF requires 3 bytes, but attempting 4-byte encoding + assert.isFalse(Bitstring.isValidUtf8CodePoint(0xffff, 4)); + }); + + it("UTF-16 surrogate (U+D800–U+DFFF)", () => { + assert.isFalse(Bitstring.isValidUtf8CodePoint(0xd800, 3)); // Start of surrogate range + assert.isFalse(Bitstring.isValidUtf8CodePoint(0xdc00, 3)); // Middle of surrogate range + assert.isFalse(Bitstring.isValidUtf8CodePoint(0xdfff, 3)); // End of surrogate range + }); + + it("beyond Unicode range (> U+10FFFF)", () => { + assert.isFalse(Bitstring.isValidUtf8CodePoint(0x110000, 4)); + assert.isFalse(Bitstring.isValidUtf8CodePoint(0x200000, 4)); + }); + }); + describe("maybeResolveHex()", () => { it("when hex field is already set", () => { const bitstring = Type.bitstring("Hologram"); From b9bfe2d681c06d792a3599abf8deebf68d907316 Mon Sep 17 00:00:00 2001 From: Michael Ward Date: Wed, 18 Feb 2026 16:34:55 +0000 Subject: [PATCH 4/7] Extracts UTF-8 sequence validation to BitString class --- assets/js/bitstring.mjs | 19 +++++ assets/js/erlang/unicode.mjs | 130 +++++++---------------------- test/javascript/bitstring_test.mjs | 84 +++++++++++++++---- 3 files changed, 120 insertions(+), 113 deletions(-) diff --git a/assets/js/bitstring.mjs b/assets/js/bitstring.mjs index dfbcd8181..5bee0cc43 100644 --- a/assets/js/bitstring.mjs +++ b/assets/js/bitstring.mjs @@ -616,6 +616,25 @@ export default class Bitstring { return (byte & 0xc0) === 0x80; } + // Validates a UTF-8 sequence at the given position assuming the leader byte + // has already been confirmed valid for `length` (e.g. via getUtf8SequenceLength). + // Checks: sufficient bytes, valid continuation bytes, and valid code point. + // Precondition: `length` is the value returned by getUtf8SequenceLength(bytes[start]). + static isValidUtf8Sequence(bytes, start, length) { + // Check if we have enough bytes + if (start + length > bytes.length) return false; + + // Verify all continuation bytes have correct pattern (10xxxxxx) + for (let i = 1; i < length; i++) { + if (!$.isValidUtf8ContinuationByte(bytes[start + i])) return false; + } + + // Decode and validate the code point value + const codePoint = $.decodeUtf8CodePoint(bytes, start, length); + + return $.isValidUtf8CodePoint(codePoint, length); + } + static maybeResolveHex(bitstring) { if (bitstring.hex === null) { $.maybeSetBytesFromText(bitstring); diff --git a/assets/js/erlang/unicode.mjs b/assets/js/erlang/unicode.mjs index 064b25164..c9ee533ba 100644 --- a/assets/js/erlang/unicode.mjs +++ b/assets/js/erlang/unicode.mjs @@ -95,24 +95,6 @@ const Erlang_Unicode = { // and rejecting overlong encodings, surrogates, and out-of-range values. // Time complexity: O(n) where n is the number of bytes. const findValidUtf8Length = (bytes) => { - // Validates a complete UTF-8 sequence at the given position. - // Checks: sufficient bytes, valid continuations, and valid code point. - const isValidSequence = (start, length) => { - // Check if we have enough bytes - if (start + length > bytes.length) return false; - - // Verify all continuation bytes have correct pattern (10xxxxxx) - for (let i = 1; i < length; i++) { - if (!Bitstring.isValidUtf8ContinuationByte(bytes[start + i])) - return false; - } - - // Decode and validate the code point value - const codePoint = Bitstring.decodeUtf8CodePoint(bytes, start, length); - - return Bitstring.isValidUtf8CodePoint(codePoint, length); - }; - // Checks if there's a truncated (incomplete) sequence at position. // Returns true if bytes could be a valid prefix of a UTF-8 sequence. const isTruncatedSequence = (start) => { @@ -138,7 +120,11 @@ const Erlang_Unicode = { while (pos < bytes.length) { const seqLength = Bitstring.getUtf8SequenceLength(bytes[pos]); - if (seqLength === false || !isValidSequence(pos, seqLength)) break; + if ( + seqLength === false || + !Bitstring.isValidUtf8Sequence(bytes, pos, seqLength) + ) + break; pos += seqLength; } @@ -337,30 +323,16 @@ const Erlang_Unicode = { // and rejecting overlong encodings, surrogates, and out-of-range values. // Time complexity: O(n) where n is the number of bytes. const findValidUtf8Length = (bytes) => { - // Validates a complete UTF-8 sequence at the given position. - // Checks: sufficient bytes, valid continuations, and valid code point. - const isValidSequence = (start, length) => { - // Check if we have enough bytes - if (start + length > bytes.length) return false; - - // Verify all continuation bytes have correct pattern (10xxxxxx) - for (let i = 1; i < length; i++) { - if (!Bitstring.isValidUtf8ContinuationByte(bytes[start + i])) - return false; - } - - // Decode and validate the code point value - const codePoint = Bitstring.decodeUtf8CodePoint(bytes, start, length); - - return Bitstring.isValidUtf8CodePoint(codePoint, length); - }; - - // Main loop: scan forward, validating each sequence + // Scan forward, validating each sequence let pos = 0; while (pos < bytes.length) { const seqLength = Bitstring.getUtf8SequenceLength(bytes[pos]); - if (seqLength === false || !isValidSequence(pos, seqLength)) break; + if ( + seqLength === false || + !Bitstring.isValidUtf8Sequence(bytes, pos, seqLength) + ) + break; pos += seqLength; } @@ -605,29 +577,16 @@ const Erlang_Unicode = { // and rejecting overlong encodings, surrogates, and out-of-range values. // Time complexity: O(n) where n is the number of bytes. const findValidUtf8Length = (bytes) => { - // Validates a complete UTF-8 sequence at the given position. - // Checks: sufficient bytes, valid continuations, and valid code point. - const isValidSequence = (start, length) => { - // Check if we have enough bytes - if (start + length > bytes.length) return false; - - // Verify all continuation bytes have correct pattern (10xxxxxx) - for (let i = 1; i < length; i++) { - if (!Bitstring.isValidUtf8ContinuationByte(bytes[start + i])) - return false; - } - - // Decode and validate the code point value - const codePoint = Bitstring.decodeUtf8CodePoint(bytes, start, length); - - return Bitstring.isValidUtf8CodePoint(codePoint, length); - }; - - // Main loop: scan forward, validating each sequence + // Scan forward, validating each sequence let pos = 0; + while (pos < bytes.length) { const seqLength = Bitstring.getUtf8SequenceLength(bytes[pos]); - if (seqLength === false || !isValidSequence(pos, seqLength)) break; + if ( + seqLength === false || + !Bitstring.isValidUtf8Sequence(bytes, pos, seqLength) + ) + break; pos += seqLength; } @@ -725,28 +684,16 @@ const Erlang_Unicode = { // and rejecting overlong encodings, surrogates, and out-of-range values. // Time complexity: O(n) where n is the number of bytes. const findValidUtf8Length = (bytes) => { - // Validates a complete UTF-8 sequence at the given position. - // Checks: sufficient bytes, valid continuations, and valid code point. - const isValidSequence = (start, length) => { - // Check if we have enough bytes - if (start + length > bytes.length) return false; - - // Verify all continuation bytes have correct pattern (10xxxxxx) - for (let i = 1; i < length; i++) { - if (!Bitstring.isValidUtf8ContinuationByte(bytes[start + i])) - return false; - } - - // Decode and validate the code point value - const codePoint = Bitstring.decodeUtf8CodePoint(bytes, start, length); - return Bitstring.isValidUtf8CodePoint(codePoint, length); - }; - - // Main loop: scan forward, validating each sequence + // scan forward, validating each sequence let pos = 0; + while (pos < bytes.length) { const seqLength = Bitstring.getUtf8SequenceLength(bytes[pos]); - if (seqLength === false || !isValidSequence(pos, seqLength)) break; + if ( + seqLength === false || + !Bitstring.isValidUtf8Sequence(bytes, pos, seqLength) + ) + break; pos += seqLength; } @@ -843,30 +790,15 @@ const Erlang_Unicode = { // and rejecting overlong encodings, surrogates, and out-of-range values. // Time complexity: O(n) where n is the number of bytes. const findValidUtf8Length = (bytes) => { - // Validates a complete UTF-8 sequence at the given position. - // Checks: sufficient bytes, valid continuations, and valid code point. - const isValidSequence = (start, length) => { - // Check if we have enough bytes - if (start + length > bytes.length) return false; - - // Verify all continuation bytes have correct pattern (10xxxxxx) - for (let i = 1; i < length; i++) { - if (!Bitstring.isValidUtf8ContinuationByte(bytes[start + i])) { - return false; - } - } - - // Decode and validate the code point value - const codePoint = Bitstring.decodeUtf8CodePoint(bytes, start, length); - - return Bitstring.isValidUtf8CodePoint(codePoint, length); - }; - - // Main loop: scan forward, validating each sequence + // Scan forward, validating each sequence let pos = 0; while (pos < bytes.length) { const seqLength = Bitstring.getUtf8SequenceLength(bytes[pos]); - if (seqLength === false || !isValidSequence(pos, seqLength)) break; + if ( + seqLength === false || + !Bitstring.isValidUtf8Sequence(bytes, pos, seqLength) + ) + break; pos += seqLength; } diff --git a/test/javascript/bitstring_test.mjs b/test/javascript/bitstring_test.mjs index 6f1916f56..a2b906ad7 100644 --- a/test/javascript/bitstring_test.mjs +++ b/test/javascript/bitstring_test.mjs @@ -5264,20 +5264,6 @@ describe("Bitstring", () => { }); }); - describe("isValidUtf8ContinuationByte()", () => { - it("valid continuation byte (10xxxxxx pattern)", () => { - assert.isTrue(Bitstring.isValidUtf8ContinuationByte(0x80)); // 10000000 - assert.isTrue(Bitstring.isValidUtf8ContinuationByte(0xbf)); // 10111111 - }); - - it("invalid continuation byte (not 10xxxxxx pattern)", () => { - assert.isFalse(Bitstring.isValidUtf8ContinuationByte(0x00)); // 00000000 (ASCII) - assert.isFalse(Bitstring.isValidUtf8ContinuationByte(0x7f)); // 01111111 (ASCII) - assert.isFalse(Bitstring.isValidUtf8ContinuationByte(0xc0)); // 11000000 (2-byte start) - assert.isFalse(Bitstring.isValidUtf8ContinuationByte(0xff)); // 11111111 (invalid) - }); - }); - describe("isValidUtf8CodePoint()", () => { it("valid codepoint", () => { assert.isTrue(Bitstring.isValidUtf8CodePoint(0x41, 1)); // ASCII 'A' @@ -5308,6 +5294,76 @@ describe("Bitstring", () => { }); }); + describe("isValidUtf8ContinuationByte()", () => { + it("valid continuation byte (10xxxxxx pattern)", () => { + assert.isTrue(Bitstring.isValidUtf8ContinuationByte(0x80)); // 10000000 + assert.isTrue(Bitstring.isValidUtf8ContinuationByte(0xbf)); // 10111111 + }); + + it("invalid continuation byte (not 10xxxxxx pattern)", () => { + assert.isFalse(Bitstring.isValidUtf8ContinuationByte(0x00)); // 00000000 (ASCII) + assert.isFalse(Bitstring.isValidUtf8ContinuationByte(0x7f)); // 01111111 (ASCII) + assert.isFalse(Bitstring.isValidUtf8ContinuationByte(0xc0)); // 11000000 (2-byte start) + assert.isFalse(Bitstring.isValidUtf8ContinuationByte(0xff)); // 11111111 (invalid) + }); + }); + + describe("isValidUtf8Sequence()", () => { + it("valid 1-byte sequence (ASCII)", () => { + // ASCII 'A' + const bytes = new Uint8Array([0x41]); + assert.isTrue(Bitstring.isValidUtf8Sequence(bytes, 0, 1)); + }); + + it("valid 2-byte sequence", () => { + // é (U+00E9): 0xC3 0xA9 + const bytes = new Uint8Array([0xc3, 0xa9]); + assert.isTrue(Bitstring.isValidUtf8Sequence(bytes, 0, 2)); + }); + + it("valid 3-byte sequence", () => { + // € (U+20AC): 0xE2 0x82 0xAC + const bytes = new Uint8Array([0xe2, 0x82, 0xac]); + assert.isTrue(Bitstring.isValidUtf8Sequence(bytes, 0, 3)); + }); + + it("valid 4-byte sequence", () => { + // 𐍈 (U+10348): 0xF0 0x90 0x8D 0x88 + const bytes = new Uint8Array([0xf0, 0x90, 0x8d, 0x88]); + assert.isTrue(Bitstring.isValidUtf8Sequence(bytes, 0, 4)); + }); + + it("not enough bytes available", () => { + const bytes = new Uint8Array([0xc3, 0xa9]); // 2 bytes + // Try to validate 3-byte sequence starting at position 0 + assert.isFalse(Bitstring.isValidUtf8Sequence(bytes, 0, 3)); + }); + + it("invalid continuation byte", () => { + // 0xC3 starts a 2-byte sequence, but 0x41 (ASCII 'A') is not a valid continuation + const bytes = new Uint8Array([0xc3, 0x41]); + assert.isFalse(Bitstring.isValidUtf8Sequence(bytes, 0, 2)); + }); + + it("overlong encoding", () => { + // 'A' (0x41) encoded as 2-byte sequence: 0xC1 0x81 (overlong) + const bytes = new Uint8Array([0xc1, 0x81]); + assert.isFalse(Bitstring.isValidUtf8Sequence(bytes, 0, 2)); + }); + + it("UTF-16 surrogate", () => { + // U+D800 (surrogate) encoded as 3-byte sequence: 0xED 0xA0 0x80 + const bytes = new Uint8Array([0xed, 0xa0, 0x80]); + assert.isFalse(Bitstring.isValidUtf8Sequence(bytes, 0, 3)); + }); + + it("beyond Unicode range", () => { + // U+110000 (beyond max) encoded as 4-byte sequence: 0xF4 0x90 0x80 0x80 + const bytes = new Uint8Array([0xf4, 0x90, 0x80, 0x80]); + assert.isFalse(Bitstring.isValidUtf8Sequence(bytes, 0, 4)); + }); + }); + describe("maybeResolveHex()", () => { it("when hex field is already set", () => { const bitstring = Type.bitstring("Hologram"); From 9ab7bc37b117477b18319b0f1e2d8193bd1e9c5d Mon Sep 17 00:00:00 2001 From: Michael Ward Date: Wed, 18 Feb 2026 17:08:58 +0000 Subject: [PATCH 5/7] Extracts truncated UTF-8 sequence validation to BitString class --- assets/js/bitstring.mjs | 21 ++++++++ assets/js/erlang/unicode.mjs | 31 +++--------- test/javascript/bitstring_test.mjs | 79 ++++++++++++++++++++++++++++++ 3 files changed, 107 insertions(+), 24 deletions(-) diff --git a/assets/js/bitstring.mjs b/assets/js/bitstring.mjs index 5bee0cc43..d80dd9937 100644 --- a/assets/js/bitstring.mjs +++ b/assets/js/bitstring.mjs @@ -635,6 +635,27 @@ export default class Bitstring { return $.isValidUtf8CodePoint(codePoint, length); } + // Checks if there's a truncated (incomplete) UTF-8 sequence at the given position. + // Returns true if bytes could be a valid prefix of a UTF-8 sequence. + // bytes: Uint8Array containing UTF-8 encoded data + // start: byte index to check for truncation + static isTruncatedUtf8Sequence(bytes, start) { + const leaderByte = bytes[start]; + const expectedLength = $.getUtf8SequenceLength(leaderByte); + + if (expectedLength === false) return false; + + const availableBytes = bytes.length - start; + if (availableBytes >= expectedLength) return false; + + // Check all available continuation bytes + for (let i = 1; i < availableBytes; i++) { + if (!$.isValidUtf8ContinuationByte(bytes[start + i])) return false; + } + + return true; + } + static maybeResolveHex(bitstring) { if (bitstring.hex === null) { $.maybeSetBytesFromText(bitstring); diff --git a/assets/js/erlang/unicode.mjs b/assets/js/erlang/unicode.mjs index c9ee533ba..a433c4ff0 100644 --- a/assets/js/erlang/unicode.mjs +++ b/assets/js/erlang/unicode.mjs @@ -95,27 +95,7 @@ const Erlang_Unicode = { // and rejecting overlong encodings, surrogates, and out-of-range values. // Time complexity: O(n) where n is the number of bytes. const findValidUtf8Length = (bytes) => { - // Checks if there's a truncated (incomplete) sequence at position. - // Returns true if bytes could be a valid prefix of a UTF-8 sequence. - const isTruncatedSequence = (start) => { - const leaderByte = bytes[start]; - const expectedLength = Bitstring.getUtf8SequenceLength(leaderByte); - - if (expectedLength === false) return false; - - const availableBytes = bytes.length - start; - if (availableBytes >= expectedLength) return false; - - // Check all available continuation bytes - for (let i = 1; i < availableBytes; i++) { - if (!Bitstring.isValidUtf8ContinuationByte(bytes[start + i])) - return false; - } - - return true; - }; - - // Main loop: scan forward, validating each sequence + // Scan forward, validating each sequence let pos = 0; while (pos < bytes.length) { @@ -128,7 +108,7 @@ const Erlang_Unicode = { pos += seqLength; } - return {validLength: pos, isTruncated: isTruncatedSequence(pos)}; + return pos; }; // Converts a binary to a list of codepoints. @@ -160,7 +140,7 @@ const Erlang_Unicode = { const handleInvalidUtf8FromBinary = (invalidBinary) => { Bitstring.maybeSetBytesFromText(invalidBinary); const bytes = invalidBinary.bytes ?? new Uint8Array(0); - const {validLength, isTruncated} = findValidUtf8Length(bytes); + const validLength = findValidUtf8Length(bytes); const validPrefix = Bitstring.fromBytes(bytes.slice(0, validLength)); const invalidRest = Bitstring.fromBytes(bytes.slice(validLength)); @@ -168,6 +148,8 @@ const Erlang_Unicode = { const codepoints = validLength > 0 ? convertBinaryToCodepoints(validPrefix) : []; + const isTruncated = Bitstring.isTruncatedUtf8Sequence(bytes, validLength); + if (isTruncated) { return createIncompleteTuple(codepoints, invalidRest); } @@ -187,7 +169,8 @@ const Erlang_Unicode = { // Check if it's a truncated sequence Bitstring.maybeSetBytesFromText(invalidBinary); const bytes = invalidBinary.bytes ?? new Uint8Array(0); - const {isTruncated} = findValidUtf8Length(bytes); + const validLength = findValidUtf8Length(bytes); + const isTruncated = Bitstring.isTruncatedUtf8Sequence(bytes, validLength); if (isTruncated) { // Incomplete: rest is the binary directly (not wrapped in list) diff --git a/test/javascript/bitstring_test.mjs b/test/javascript/bitstring_test.mjs index a2b906ad7..dd371b89b 100644 --- a/test/javascript/bitstring_test.mjs +++ b/test/javascript/bitstring_test.mjs @@ -5264,6 +5264,85 @@ describe("Bitstring", () => { }); }); + describe("isTruncatedUtf8Sequence()", () => { + // Happy path: truncated 2-byte sequence + it("returns true for truncated 2-byte sequence with valid continuation byte", () => { + // 0xC2 requires 2 bytes, but only 1 byte available (0x80 is valid continuation) + const bytes = new Uint8Array([0xc2]); + assert.equal(Bitstring.isTruncatedUtf8Sequence(bytes, 0), true); + }); + + // Happy path: truncated 3-byte sequence + it("returns true for truncated 3-byte sequence with valid continuation bytes", () => { + // 0xE2 requires 3 bytes, but only 2 bytes available (both valid continuations) + const bytes = new Uint8Array([0xe2, 0x82]); + assert.equal(Bitstring.isTruncatedUtf8Sequence(bytes, 0), true); + }); + + // Happy path: truncated 4-byte sequence + it("returns true for truncated 4-byte sequence with valid continuation bytes", () => { + // 0xF0 requires 4 bytes, but only 3 bytes available (all valid continuations) + const bytes = new Uint8Array([0xf0, 0x90, 0x8d]); + assert.equal(Bitstring.isTruncatedUtf8Sequence(bytes, 0), true); + }); + + // Edge case: start position in middle of data + it("returns true for truncated sequence starting at non-zero position", () => { + // Valid ASCII prefix, then truncated 2-byte sequence + const bytes = new Uint8Array([0x41, 0xc2]); // 'A' + truncated '£' + assert.equal(Bitstring.isTruncatedUtf8Sequence(bytes, 1), true); + }); + + // Edge case: multiple valid continuation bytes before truncation + it("returns true for 4-byte sequence with 2 valid continuation bytes (truncated)", () => { + // 0xF0 (4-byte) with 2 valid continuation bytes available + const bytes = new Uint8Array([0xf0, 0x90, 0x8d]); + assert.equal(Bitstring.isTruncatedUtf8Sequence(bytes, 0), true); + }); + + // False path: invalid leader byte + it("returns false for invalid leader byte", () => { + // 0xC0 is invalid (overlong encoding marker) + const bytes = new Uint8Array([0xc0]); + assert.equal(Bitstring.isTruncatedUtf8Sequence(bytes, 0), false); + }); + + // False path: invalid leader byte (out of range) + it("returns false for leader byte >= 0xF5", () => { + // 0xF5 and above are invalid (> U+10FFFF) + const bytes = new Uint8Array([0xf5]); + assert.equal(Bitstring.isTruncatedUtf8Sequence(bytes, 0), false); + }); + + // False path: enough bytes available + it("returns false when enough bytes are available for complete sequence", () => { + // 0xC2 requires 2 bytes, and 2 bytes are available + const bytes = new Uint8Array([0xc2, 0xa3]); + assert.equal(Bitstring.isTruncatedUtf8Sequence(bytes, 0), false); + }); + + // False path: invalid continuation byte in truncated sequence + it("returns false when continuation byte is invalid", () => { + // 0xC2 requires 2 bytes, but only 1 available with invalid continuation (0x00) + const bytes = new Uint8Array([0xc2, 0x00]); + assert.equal(Bitstring.isTruncatedUtf8Sequence(bytes, 0), false); + }); + + // False path: ASCII byte + it("returns false for ASCII byte (1-byte sequence)", () => { + // ASCII bytes are 1-byte sequences, always complete + const bytes = new Uint8Array([0x41]); // 'A' + assert.equal(Bitstring.isTruncatedUtf8Sequence(bytes, 0), false); + }); + + // False path: truncated sequence with invalid continuation at end + it("returns false when truncated sequence has invalid continuation byte at start", () => { + // 0xE2 requires 3 bytes, 2 available, but second byte (0x00) is invalid continuation + const bytes = new Uint8Array([0xe2, 0x00]); + assert.equal(Bitstring.isTruncatedUtf8Sequence(bytes, 0), false); + }); + }); + describe("isValidUtf8CodePoint()", () => { it("valid codepoint", () => { assert.isTrue(Bitstring.isValidUtf8CodePoint(0x41, 1)); // ASCII 'A' From f458bf4970d30c63e6443d4f2e5766396535b4c9 Mon Sep 17 00:00:00 2001 From: Michael Ward Date: Wed, 18 Feb 2026 23:07:11 +0000 Subject: [PATCH 6/7] Extracts valid UTF-8 sequence length to BitString class --- assets/js/bitstring.mjs | 21 ++++ assets/js/erlang/unicode.mjs | 116 ++------------------- test/javascript/bitstring_test.mjs | 156 +++++++++++++++++++++++------ 3 files changed, 153 insertions(+), 140 deletions(-) diff --git a/assets/js/bitstring.mjs b/assets/js/bitstring.mjs index d80dd9937..72d37b5a3 100644 --- a/assets/js/bitstring.mjs +++ b/assets/js/bitstring.mjs @@ -511,6 +511,27 @@ export default class Bitstring { return false; // Invalid leader byte } + // Scans forward once to find the longest valid UTF-8 prefix. + // Validates UTF-8 by checking byte structure, decoding code points, + // and rejecting overlong encodings, surrogates, and out-of-range values. + // Time complexity: O(n) where n is the number of bytes. + static getValidUtf8Length(bytes) { + let pos = 0; + + while (pos < bytes.length) { + const seqLength = $.getUtf8SequenceLength(bytes[pos]); + if ( + seqLength === false || + !$.isValidUtf8Sequence(bytes, pos, seqLength) + ) { + break; + } + pos += seqLength; + } + + return pos; + } + static isEmpty(bitstring) { return bitstring.text === "" || bitstring.bytes?.length === 0; } diff --git a/assets/js/erlang/unicode.mjs b/assets/js/erlang/unicode.mjs index a433c4ff0..15424f5ab 100644 --- a/assets/js/erlang/unicode.mjs +++ b/assets/js/erlang/unicode.mjs @@ -90,27 +90,6 @@ const Erlang_Unicode = { "characters_to_list/1": (data) => { // Helpers - // Scans forward once to find the longest valid UTF-8 prefix. - // Validates UTF-8 by checking byte structure, decoding code points, - // and rejecting overlong encodings, surrogates, and out-of-range values. - // Time complexity: O(n) where n is the number of bytes. - const findValidUtf8Length = (bytes) => { - // Scan forward, validating each sequence - let pos = 0; - - while (pos < bytes.length) { - const seqLength = Bitstring.getUtf8SequenceLength(bytes[pos]); - if ( - seqLength === false || - !Bitstring.isValidUtf8Sequence(bytes, pos, seqLength) - ) - break; - pos += seqLength; - } - - return pos; - }; - // Converts a binary to a list of codepoints. const convertBinaryToCodepoints = (binary, preDecodedText = null) => { const text = @@ -140,7 +119,7 @@ const Erlang_Unicode = { const handleInvalidUtf8FromBinary = (invalidBinary) => { Bitstring.maybeSetBytesFromText(invalidBinary); const bytes = invalidBinary.bytes ?? new Uint8Array(0); - const validLength = findValidUtf8Length(bytes); + const validLength = Bitstring.getValidUtf8Length(bytes); const validPrefix = Bitstring.fromBytes(bytes.slice(0, validLength)); const invalidRest = Bitstring.fromBytes(bytes.slice(validLength)); @@ -169,7 +148,7 @@ const Erlang_Unicode = { // Check if it's a truncated sequence Bitstring.maybeSetBytesFromText(invalidBinary); const bytes = invalidBinary.bytes ?? new Uint8Array(0); - const validLength = findValidUtf8Length(bytes); + const validLength = Bitstring.getValidUtf8Length(bytes); const isTruncated = Bitstring.isTruncatedUtf8Sequence(bytes, validLength); if (isTruncated) { @@ -301,27 +280,6 @@ const Erlang_Unicode = { "characters_to_nfc_binary/1": (data) => { // Helpers - // Scans forward once to find the longest valid UTF-8 prefix. - // Validates UTF-8 by checking byte structure, decoding code points, - // and rejecting overlong encodings, surrogates, and out-of-range values. - // Time complexity: O(n) where n is the number of bytes. - const findValidUtf8Length = (bytes) => { - // Scan forward, validating each sequence - let pos = 0; - - while (pos < bytes.length) { - const seqLength = Bitstring.getUtf8SequenceLength(bytes[pos]); - if ( - seqLength === false || - !Bitstring.isValidUtf8Sequence(bytes, pos, seqLength) - ) - break; - pos += seqLength; - } - - return pos; - }; - // Validates that rest is a list containing a binary (from invalid UTF-8). // Raises ArgumentError if it's a list of invalid codepoints instead. const validateListRest = (rest) => { @@ -360,7 +318,7 @@ const Erlang_Unicode = { // Finds the UTF-8 validity boundary, normalizes the valid prefix, // and returns error tuple with normalized prefix and invalid remainder. const handleInvalidUtf8 = (bytes) => { - const validLength = findValidUtf8Length(bytes); + const validLength = Bitstring.getValidUtf8Length(bytes); const validPrefix = Bitstring.fromBytes(bytes.slice(0, validLength)); const invalidRest = Bitstring.fromBytes(bytes.slice(validLength)); const validText = Bitstring.toText(validPrefix); @@ -555,27 +513,6 @@ const Erlang_Unicode = { "characters_to_nfd_binary/1": (data) => { // Helpers - // Scans forward once to find the longest valid UTF-8 prefix. - // Validates UTF-8 by checking byte structure, decoding code points, - // and rejecting overlong encodings, surrogates, and out-of-range values. - // Time complexity: O(n) where n is the number of bytes. - const findValidUtf8Length = (bytes) => { - // Scan forward, validating each sequence - let pos = 0; - - while (pos < bytes.length) { - const seqLength = Bitstring.getUtf8SequenceLength(bytes[pos]); - if ( - seqLength === false || - !Bitstring.isValidUtf8Sequence(bytes, pos, seqLength) - ) - break; - pos += seqLength; - } - - return pos; - }; - // Validates that rest is a list containing a binary (from invalid UTF-8). // Raises ArgumentError if it's a list of invalid codepoints instead. const validateListRest = (rest) => { @@ -614,7 +551,7 @@ const Erlang_Unicode = { // Finds the UTF-8 validity boundary, normalizes the valid prefix, // and returns error tuple with normalized prefix and invalid remainder. const handleInvalidUtf8 = (bytes) => { - const validLength = findValidUtf8Length(bytes); + const validLength = Bitstring.getValidUtf8Length(bytes); const validPrefix = Bitstring.fromBytes(bytes.slice(0, validLength)); const invalidRest = Bitstring.fromBytes(bytes.slice(validLength)); const validText = Bitstring.toText(validPrefix); @@ -662,27 +599,6 @@ const Erlang_Unicode = { "characters_to_nfkc_binary/1": (data) => { // Helpers - // Scans forward once to find the longest valid UTF-8 prefix. - // Validates UTF-8 by checking byte structure, decoding code points, - // and rejecting overlong encodings, surrogates, and out-of-range values. - // Time complexity: O(n) where n is the number of bytes. - const findValidUtf8Length = (bytes) => { - // scan forward, validating each sequence - let pos = 0; - - while (pos < bytes.length) { - const seqLength = Bitstring.getUtf8SequenceLength(bytes[pos]); - if ( - seqLength === false || - !Bitstring.isValidUtf8Sequence(bytes, pos, seqLength) - ) - break; - pos += seqLength; - } - - return pos; - }; - // Validates that rest is a list containing a binary (from invalid UTF-8). // Raises ArgumentError if it's a list of invalid codepoints instead. const validateListRest = (rest) => { @@ -721,7 +637,7 @@ const Erlang_Unicode = { // Finds the UTF-8 validity boundary, normalizes the valid prefix, // and returns error tuple with normalized prefix and invalid remainder. const handleInvalidUtf8 = (bytes) => { - const validLength = findValidUtf8Length(bytes); + const validLength = Bitstring.getValidUtf8Length(bytes); const validPrefix = Bitstring.fromBytes(bytes.slice(0, validLength)); const invalidRest = Bitstring.fromBytes(bytes.slice(validLength)); const validText = Bitstring.toText(validPrefix); @@ -768,26 +684,6 @@ const Erlang_Unicode = { "characters_to_nfkd_binary/1": (data) => { // Helpers - // Scans forward once to find the longest valid UTF-8 prefix. - // Validates UTF-8 by checking byte structure, decoding code points, - // and rejecting overlong encodings, surrogates, and out-of-range values. - // Time complexity: O(n) where n is the number of bytes. - const findValidUtf8Length = (bytes) => { - // Scan forward, validating each sequence - let pos = 0; - while (pos < bytes.length) { - const seqLength = Bitstring.getUtf8SequenceLength(bytes[pos]); - if ( - seqLength === false || - !Bitstring.isValidUtf8Sequence(bytes, pos, seqLength) - ) - break; - pos += seqLength; - } - - return pos; - }; - // Validates that rest is a list containing a binary (from invalid UTF-8). // Raises ArgumentError if it's a list of invalid codepoints instead. const validateListRest = (rest) => { @@ -826,7 +722,7 @@ const Erlang_Unicode = { // Finds the UTF-8 validity boundary, normalizes the valid prefix, // and returns error tuple with normalized prefix and invalid remainder. const handleInvalidUtf8 = (bytes) => { - const validLength = findValidUtf8Length(bytes); + const validLength = Bitstring.getValidUtf8Length(bytes); const validPrefix = Bitstring.fromBytes(bytes.slice(0, validLength)); const invalidRest = Bitstring.fromBytes(bytes.slice(validLength)); const validText = Bitstring.toText(validPrefix); diff --git a/test/javascript/bitstring_test.mjs b/test/javascript/bitstring_test.mjs index dd371b89b..1bffe47ff 100644 --- a/test/javascript/bitstring_test.mjs +++ b/test/javascript/bitstring_test.mjs @@ -5087,6 +5087,132 @@ describe("Bitstring", () => { assert.deepStrictEqual(result, expected); }); + describe("getUtf8SequenceLength()", () => { + it("returns 1 for 0x41 (ASCII)", () => { + assert.equal(Bitstring.getUtf8SequenceLength(0x41), 1); + }); + + it("returns 2 for 0xC2 (2-byte leader)", () => { + assert.equal(Bitstring.getUtf8SequenceLength(0xc2), 2); + }); + + it("returns 3 for 0xE0 (3-byte leader)", () => { + assert.equal(Bitstring.getUtf8SequenceLength(0xe0), 3); + }); + + it("returns 4 for 0xF0 (4-byte leader)", () => { + assert.equal(Bitstring.getUtf8SequenceLength(0xf0), 4); + }); + + it("returns false for 0xC0 (invalid: overlong encoding)", () => { + assert.equal(Bitstring.getUtf8SequenceLength(0xc0), false); + }); + + it("returns false for 0xF5 (invalid: > U+10FFFF)", () => { + assert.equal(Bitstring.getUtf8SequenceLength(0xf5), false); + }); + + it("returns false for 0x80 (invalid: continuation byte)", () => { + assert.equal(Bitstring.getUtf8SequenceLength(0x80), false); + }); + }); + + describe("getValidUtf8Length()", () => { + it("empty bytes array", () => { + const bytes = new Uint8Array([]); + assert.equal(Bitstring.getValidUtf8Length(bytes), 0); + }); + + it("single valid ASCII byte", () => { + // 'A' (0x41): 0xxxxxxx pattern + const bytes = new Uint8Array([0x41]); + assert.equal(Bitstring.getValidUtf8Length(bytes), 1); + }); + + it("single valid 2-byte sequence", () => { + // é (U+00E9): 0xC3 0xA9 + const bytes = new Uint8Array([0xc3, 0xa9]); + assert.equal(Bitstring.getValidUtf8Length(bytes), 2); + }); + + it("single valid 3-byte sequence", () => { + // € (U+20AC): 0xE2 0x82 0xAC + const bytes = new Uint8Array([0xe2, 0x82, 0xac]); + assert.equal(Bitstring.getValidUtf8Length(bytes), 3); + }); + + it("single valid 4-byte sequence", () => { + // 𐍈 (U+10348): 0xF0 0x90 0x8D 0x88 + const bytes = new Uint8Array([0xf0, 0x90, 0x8d, 0x88]); + assert.equal(Bitstring.getValidUtf8Length(bytes), 4); + }); + + it("multiple consecutive valid sequences", () => { + // 'A' (1 byte) + é (2 bytes) + € (3 bytes) + const bytes = new Uint8Array([ + 0x41, // ASCII 'A' + 0xc3, + 0xa9, // é + 0xe2, + 0x82, + 0xac, // € + ]); + assert.equal(Bitstring.getValidUtf8Length(bytes), 6); + }); + + it("invalid leader byte (0xC0)", () => { + // 0xC0 is invalid (overlong encoding) + const bytes = new Uint8Array([0xc0, 0xa9]); + assert.equal(Bitstring.getValidUtf8Length(bytes), 0); + }); + + it("invalid leader byte (>= 0xF5)", () => { + // 0xF5+ is invalid (beyond Unicode range) + const bytes = new Uint8Array([0xf5, 0x90, 0x80, 0x80]); + assert.equal(Bitstring.getValidUtf8Length(bytes), 0); + }); + + it("invalid continuation byte", () => { + // 0xC3 starts 2-byte sequence, but 0x41 (ASCII) is not a valid continuation + const bytes = new Uint8Array([0xc3, 0x41]); + assert.equal(Bitstring.getValidUtf8Length(bytes), 0); + }); + + it("truncated 2-byte sequence", () => { + // 0xC3 expects 1 continuation byte, but we only have the leader + const bytes = new Uint8Array([0xc3]); + assert.equal(Bitstring.getValidUtf8Length(bytes), 0); + }); + + it("truncated 3-byte sequence with one byte missing", () => { + // 0xE2 0x82 expects one more continuation byte + const bytes = new Uint8Array([0xe2, 0x82]); + assert.equal(Bitstring.getValidUtf8Length(bytes), 0); + }); + + it("valid sequences followed by invalid byte", () => { + // 'A' (1 byte) + é (2 bytes) + invalid 0xC0 + const bytes = new Uint8Array([ + 0x41, // ASCII 'A' + 0xc3, + 0xa9, // é + 0xc0, // Invalid + ]); + assert.equal(Bitstring.getValidUtf8Length(bytes), 3); + }); + + it("valid sequences followed by truncated sequence", () => { + // 'A' (1 byte) + é (2 bytes) + incomplete 2-byte start + const bytes = new Uint8Array([ + 0x41, // ASCII 'A' + 0xc3, + 0xa9, // é + 0xc3, // Incomplete 2-byte + ]); + assert.equal(Bitstring.getValidUtf8Length(bytes), 3); + }); + }); + describe("isEmpty()", () => { describe("empty", () => { it("with text field", () => { @@ -7596,34 +7722,4 @@ describe("Bitstring", () => { }); }); }); - - describe("getUtf8SequenceLength()", () => { - it("returns 1 for 0x41 (ASCII)", () => { - assert.equal(Bitstring.getUtf8SequenceLength(0x41), 1); - }); - - it("returns 2 for 0xC2 (2-byte leader)", () => { - assert.equal(Bitstring.getUtf8SequenceLength(0xc2), 2); - }); - - it("returns 3 for 0xE0 (3-byte leader)", () => { - assert.equal(Bitstring.getUtf8SequenceLength(0xe0), 3); - }); - - it("returns 4 for 0xF0 (4-byte leader)", () => { - assert.equal(Bitstring.getUtf8SequenceLength(0xf0), 4); - }); - - it("returns false for 0xC0 (invalid: overlong encoding)", () => { - assert.equal(Bitstring.getUtf8SequenceLength(0xc0), false); - }); - - it("returns false for 0xF5 (invalid: > U+10FFFF)", () => { - assert.equal(Bitstring.getUtf8SequenceLength(0xf5), false); - }); - - it("returns false for 0x80 (invalid: continuation byte)", () => { - assert.equal(Bitstring.getUtf8SequenceLength(0x80), false); - }); - }); }); From 8d3af8d6cc6b7e95d18b149766ab6e8022c47fea Mon Sep 17 00:00:00 2001 From: Michael Ward Date: Thu, 19 Feb 2026 01:09:35 +0000 Subject: [PATCH 7/7] Extracts BitString to code points array to BitString class --- assets/js/bitstring.mjs | 9 ++++ assets/js/erlang/unicode.mjs | 25 +++------- test/javascript/bitstring_test.mjs | 79 ++++++++++++++++++++++++++++++ 3 files changed, 95 insertions(+), 18 deletions(-) diff --git a/assets/js/bitstring.mjs b/assets/js/bitstring.mjs index 72d37b5a3..58da3f58c 100644 --- a/assets/js/bitstring.mjs +++ b/assets/js/bitstring.mjs @@ -829,6 +829,15 @@ export default class Bitstring { }; } + // Converts a bitstring to an array of codepoint integers. + // Returns a plain array (not wrapped in Type.list). + static toCodepointArray(bitstring) { + $.maybeSetTextFromBytes(bitstring); + return Array.from(bitstring.text, (char) => + Type.integer(char.codePointAt(0)), + ); + } + static toCodepoints(bitstring) { $.maybeSetTextFromBytes(bitstring); return Type.charlist(bitstring.text); diff --git a/assets/js/erlang/unicode.mjs b/assets/js/erlang/unicode.mjs index 15424f5ab..f8e651c95 100644 --- a/assets/js/erlang/unicode.mjs +++ b/assets/js/erlang/unicode.mjs @@ -90,14 +90,6 @@ const Erlang_Unicode = { "characters_to_list/1": (data) => { // Helpers - // Converts a binary to a list of codepoints. - const convertBinaryToCodepoints = (binary, preDecodedText = null) => { - const text = - preDecodedText !== null ? preDecodedText : Bitstring.toText(binary); - - return Array.from(text).map((char) => Type.integer(char.codePointAt(0))); - }; - // Converts a single codepoint integer to a UTF-8 encoded binary. const convertCodepointToBinary = (codepoint) => { const segment = Type.bitstringSegment(codepoint, {type: "utf8"}); @@ -125,7 +117,7 @@ const Erlang_Unicode = { const invalidRest = Bitstring.fromBytes(bytes.slice(validLength)); const codepoints = - validLength > 0 ? convertBinaryToCodepoints(validPrefix) : []; + validLength > 0 ? Bitstring.toCodepointArray(validPrefix) : []; const isTruncated = Bitstring.isTruncatedUtf8Sequence(bytes, validLength); @@ -142,7 +134,7 @@ const Erlang_Unicode = { // Convert all valid chunks to codepoints const codepoints = chunks.length > 0 - ? convertBinaryToCodepoints(Bitstring.concat(chunks)) + ? Bitstring.toCodepointArray(Bitstring.concat(chunks)) : []; // Check if it's a truncated sequence @@ -171,7 +163,7 @@ const Erlang_Unicode = { ) => { const codepoints = chunks.length > 0 - ? convertBinaryToCodepoints(Bitstring.concat(chunks)) + ? Bitstring.toCodepointArray(Bitstring.concat(chunks)) : []; // Build the rest list with invalid code point and remaining elements @@ -237,7 +229,7 @@ const Erlang_Unicode = { return handleInvalidUtf8FromBinary(data); } - const codepoints = convertBinaryToCodepoints(data, text); + const codepoints = Bitstring.toCodepointArray(data); return Type.list(codepoints); } @@ -269,7 +261,7 @@ const Erlang_Unicode = { } const binary = Bitstring.concat(chunks); - const codepoints = convertBinaryToCodepoints(binary); + const codepoints = Bitstring.toCodepointArray(binary); return Type.list(codepoints); }, @@ -372,12 +364,9 @@ const Erlang_Unicode = { binary, preDecodedText = null, ) => { - const text = - preDecodedText !== null ? preDecodedText : Bitstring.toText(binary); - + const text = preDecodedText ?? binary.text ?? Bitstring.toText(binary); const normalized = text.normalize("NFC"); - - return Array.from(normalized).map((char) => + return Array.from(normalized, (char) => Type.integer(char.codePointAt(0)), ); }; diff --git a/test/javascript/bitstring_test.mjs b/test/javascript/bitstring_test.mjs index 1bffe47ff..15c7e3a21 100644 --- a/test/javascript/bitstring_test.mjs +++ b/test/javascript/bitstring_test.mjs @@ -6242,6 +6242,85 @@ describe("Bitstring", () => { }); }); + describe("toCodepointArray()", () => { + it("single codepoint (1 byte)", () => { + const bitstring = Type.bitstring("A"); + const result = Bitstring.toCodepointArray(bitstring); + const expected = [Type.integer(65)]; + + assert.deepStrictEqual(result, expected); + }); + + it("single codepoint (2 bytes)", () => { + const bitstring = Type.bitstring("£"); + const result = Bitstring.toCodepointArray(bitstring); + const expected = [Type.integer(163)]; + + assert.deepStrictEqual(result, expected); + }); + + it("single codepoint (3 bytes)", () => { + const bitstring = Type.bitstring("€"); + const result = Bitstring.toCodepointArray(bitstring); + const expected = [Type.integer(8364)]; + + assert.deepStrictEqual(result, expected); + }); + + it("single codepoint (4 bytes)", () => { + const bitstring = Type.bitstring("𐍈"); + const result = Bitstring.toCodepointArray(bitstring); + const expected = [Type.integer(66376)]; + + assert.deepStrictEqual(result, expected); + }); + + it("multiple codepoints", () => { + const bitstring = Type.bitstring("A£€𐍈"); + const result = Bitstring.toCodepointArray(bitstring); + const expected = [ + Type.integer(65), + Type.integer(163), + Type.integer(8364), + Type.integer(66376), + ]; + + assert.deepStrictEqual(result, expected); + }); + + it("empty bitstring", () => { + const bitstring = Type.bitstring(""); + const result = Bitstring.toCodepointArray(bitstring); + const expected = []; + + assert.deepStrictEqual(result, expected); + }); + + it("uses cached text when available", () => { + const bitstring = Type.bitstring("cached"); + // Bitstring already has text cached from Type.bitstring() + const result = Bitstring.toCodepointArray(bitstring); + const expected = [ + Type.integer(99), + Type.integer(97), + Type.integer(99), + Type.integer(104), + Type.integer(101), + Type.integer(100), + ]; + + assert.deepStrictEqual(result, expected); + }); + + it("decodes bytes and caches text when needed", () => { + const bitstring = Bitstring.fromBytes([97, 98, 99]); + const result = Bitstring.toCodepointArray(bitstring); + const expected = [Type.integer(97), Type.integer(98), Type.integer(99)]; + + assert.deepStrictEqual(result, expected); + }); + }); + describe("toCodepoints()", () => { describe("single codepoint", () => { it("$ (1 byte)", () => {