From 9a49bec70d22ca8a76e8fe1aa9292fd705e2bf3e Mon Sep 17 00:00:00 2001
From: Michael Ward <michael@scripthungry.com>
Date: Tue, 17 Feb 2026 20:00:30 +0000
Subject: [PATCH 1/7] Extracts UTF-8 code point decoding to the Bitstring
 utility class, adds unit tests

---
 assets/js/bitstring.mjs            |  21 ++++
 assets/js/erlang/unicode.mjs       | 149 +----------------------------
 test/javascript/bitstring_test.mjs |  37 +++++++
 3 files changed, 63 insertions(+), 144 deletions(-)

diff --git a/assets/js/bitstring.mjs b/assets/js/bitstring.mjs
index 40168b7a9..f4c9b8695 100644
--- a/assets/js/bitstring.mjs
+++ b/assets/js/bitstring.mjs
@@ -247,6 +247,27 @@ export default class Bitstring {
     }
   }
 
+  // Decodes a UTF-8 sequence starting at the given position.
+  // Returns the decoded Unicode code point value.
+  // bytes: Uint8Array containing the UTF-8 encoded data
+  // start: byte index of the sequence's first (leader) byte
+  // length: bytes in the sequence (1-4); neither it nor the bytes are checked
+  static decodeUtf8CodePoint(bytes, start, length) {
+    if (length === 1) return bytes[start];
+
+    // Payload-bit masks for the leader byte, keyed by sequence length
+    const firstByteMasks = {2: 0x1f, 3: 0x0f, 4: 0x07};
+
+    let codePoint = bytes[start] & firstByteMasks[length];
+
+    // Append the low 6 bits (mask 0x3f) of each continuation byte in turn
+    for (let i = 1; i < length; i++) {
+      codePoint = (codePoint << 6) | (bytes[start + i] & 0x3f);
+    }
+
+    return codePoint;
+  }
+
   static fromBits(bits) {
     const bitCount = bits.length;
     const byteCount = Math.ceil(bitCount / 8);
diff --git a/assets/js/erlang/unicode.mjs b/assets/js/erlang/unicode.mjs
index 8479f3528..c47b0b2e2 100644
--- a/assets/js/erlang/unicode.mjs
+++ b/assets/js/erlang/unicode.mjs
@@ -98,34 +98,6 @@ const Erlang_Unicode = {
       // Checks if a byte is a valid UTF-8 continuation byte (10xxxxxx).
       const isValidContinuation = (byte) => (byte & 0xc0) === 0x80;
 
-      // Decodes a UTF-8 sequence starting at the given position.
-      // Returns the decoded Unicode code point value.
-      const decodeCodePoint = (start, length) => {
-        if (length === 1) {
-          return bytes[start];
-        }
-
-        if (length === 2) {
-          return ((bytes[start] & 0x1f) << 6) | (bytes[start + 1] & 0x3f);
-        }
-
-        if (length === 3) {
-          return (
-            ((bytes[start] & 0x0f) << 12) |
-            ((bytes[start + 1] & 0x3f) << 6) |
-            (bytes[start + 2] & 0x3f)
-          );
-        }
-
-        // length === 4
-        return (
-          ((bytes[start] & 0x07) << 18) |
-          ((bytes[start + 1] & 0x3f) << 12) |
-          ((bytes[start + 2] & 0x3f) << 6) |
-          (bytes[start + 3] & 0x3f)
-        );
-      };
-
       // Validates that a code point is within UTF-8 rules:
       // - Not an overlong encoding (using more bytes than necessary)
       // - Not a UTF-16 surrogate (U+D800–U+DFFF)
@@ -156,7 +128,7 @@ const Erlang_Unicode = {
         }
 
         // Decode and validate the code point value
-        const codePoint = decodeCodePoint(start, length);
+        const codePoint = Bitstring.decodeUtf8CodePoint(bytes, start, length);
 
         return isValidCodePoint(codePoint, length);
       };
@@ -387,34 +359,6 @@ const Erlang_Unicode = {
       // Checks if a byte is a valid UTF-8 continuation byte (10xxxxxx).
       const isValidContinuation = (byte) => (byte & 0xc0) === 0x80;
 
-      // Decodes a UTF-8 sequence starting at the given position.
-      // Returns the decoded Unicode code point value.
-      const decodeCodePoint = (start, length) => {
-        if (length === 1) {
-          return bytes[start];
-        }
-
-        if (length === 2) {
-          return ((bytes[start] & 0x1f) << 6) | (bytes[start + 1] & 0x3f);
-        }
-
-        if (length === 3) {
-          return (
-            ((bytes[start] & 0x0f) << 12) |
-            ((bytes[start + 1] & 0x3f) << 6) |
-            (bytes[start + 2] & 0x3f)
-          );
-        }
-
-        // length === 4
-        return (
-          ((bytes[start] & 0x07) << 18) |
-          ((bytes[start + 1] & 0x3f) << 12) |
-          ((bytes[start + 2] & 0x3f) << 6) |
-          (bytes[start + 3] & 0x3f)
-        );
-      };
-
       // Validates that a code point is within UTF-8 rules:
       // - Not an overlong encoding (using more bytes than necessary)
       // - Not a UTF-16 surrogate (U+D800–U+DFFF)
@@ -445,7 +389,7 @@ const Erlang_Unicode = {
         }
 
         // Decode and validate the code point value
-        const codePoint = decodeCodePoint(start, length);
+        const codePoint = Bitstring.decodeUtf8CodePoint(bytes, start, length);
 
         return isValidCodePoint(codePoint, length);
       };
@@ -703,33 +647,6 @@ const Erlang_Unicode = {
       // Checks if a byte is a valid UTF-8 continuation byte (10xxxxxx).
       const isValidContinuation = (byte) => (byte & 0xc0) === 0x80;
 
-      // Decodes a UTF-8 sequence starting at the given position.
-      // Returns the decoded Unicode code point value.
-      const decodeCodePoint = (start, length) => {
-        if (length === 1) {
-          return bytes[start];
-        }
-
-        if (length === 2) {
-          return ((bytes[start] & 0x1f) << 6) | (bytes[start + 1] & 0x3f);
-        }
-
-        if (length === 3) {
-          return (
-            ((bytes[start] & 0x0f) << 12) |
-            ((bytes[start + 1] & 0x3f) << 6) |
-            (bytes[start + 2] & 0x3f)
-          );
-        }
-        // length === 4
-        return (
-          ((bytes[start] & 0x07) << 18) |
-          ((bytes[start + 1] & 0x3f) << 12) |
-          ((bytes[start + 2] & 0x3f) << 6) |
-          (bytes[start + 3] & 0x3f)
-        );
-      };
-
       // Validates that a code point is within UTF-8 rules:
       // - Not an overlong encoding (using more bytes than necessary)
       // - Not a UTF-16 surrogate (U+D800–U+DFFF)
@@ -760,7 +677,7 @@ const Erlang_Unicode = {
         }
 
         // Decode and validate the code point value
-        const codePoint = decodeCodePoint(start, length);
+        const codePoint = Bitstring.decodeUtf8CodePoint(bytes, start, length);
 
         return isValidCodePoint(codePoint, length);
       };
@@ -870,34 +787,6 @@ const Erlang_Unicode = {
       // Checks if a byte is a valid UTF-8 continuation byte (10xxxxxx).
       const isValidContinuation = (byte) => (byte & 0xc0) === 0x80;
 
-      // Decodes a UTF-8 sequence starting at the given position.
-      // Returns the decoded Unicode code point value.
-      const decodeCodePoint = (start, length) => {
-        if (length === 1) {
-          return bytes[start];
-        }
-
-        if (length === 2) {
-          return ((bytes[start] & 0x1f) << 6) | (bytes[start + 1] & 0x3f);
-        }
-
-        if (length === 3) {
-          return (
-            ((bytes[start] & 0x0f) << 12) |
-            ((bytes[start + 1] & 0x3f) << 6) |
-            (bytes[start + 2] & 0x3f)
-          );
-        }
-
-        // length === 4
-        return (
-          ((bytes[start] & 0x07) << 18) |
-          ((bytes[start + 1] & 0x3f) << 12) |
-          ((bytes[start + 2] & 0x3f) << 6) |
-          (bytes[start + 3] & 0x3f)
-        );
-      };
-
       // Validates that a code point is within UTF-8 rules:
       // - Not an overlong encoding (using more bytes than necessary)
       // - Not a UTF-16 surrogate (U+D800–U+DFFF)
@@ -928,7 +817,7 @@ const Erlang_Unicode = {
         }
 
         // Decode and validate the code point value
-        const codePoint = decodeCodePoint(start, length);
+        const codePoint = Bitstring.decodeUtf8CodePoint(bytes, start, length);
         return isValidCodePoint(codePoint, length);
       };
 
@@ -1036,34 +925,6 @@ const Erlang_Unicode = {
       // Checks if a byte is a valid UTF-8 continuation byte (10xxxxxx).
       const isValidContinuation = (byte) => (byte & 0xc0) === 0x80;
 
-      // Decodes a UTF-8 sequence starting at the given position.
-      // Returns the decoded Unicode code point value.
-      const decodeCodePoint = (start, length) => {
-        if (length === 1) {
-          return bytes[start];
-        }
-
-        if (length === 2) {
-          return ((bytes[start] & 0x1f) << 6) | (bytes[start + 1] & 0x3f);
-        }
-
-        if (length === 3) {
-          return (
-            ((bytes[start] & 0x0f) << 12) |
-            ((bytes[start + 1] & 0x3f) << 6) |
-            (bytes[start + 2] & 0x3f)
-          );
-        }
-
-        // length === 4
-        return (
-          ((bytes[start] & 0x07) << 18) |
-          ((bytes[start + 1] & 0x3f) << 12) |
-          ((bytes[start + 2] & 0x3f) << 6) |
-          (bytes[start + 3] & 0x3f)
-        );
-      };
-
       // Validates that a code point is within UTF-8 rules:
       // - Not an overlong encoding (using more bytes than necessary)
       // - Not a UTF-16 surrogate (U+D800–U+DFFF)
@@ -1096,7 +957,7 @@ const Erlang_Unicode = {
         }
 
         // Decode and validate the code point value
-        const codePoint = decodeCodePoint(start, length);
+        const codePoint = Bitstring.decodeUtf8CodePoint(bytes, start, length);
 
         return isValidCodePoint(codePoint, length);
       };
diff --git a/test/javascript/bitstring_test.mjs b/test/javascript/bitstring_test.mjs
index d822d1ffa..88b725447 100644
--- a/test/javascript/bitstring_test.mjs
+++ b/test/javascript/bitstring_test.mjs
@@ -1231,6 +1231,43 @@ describe("Bitstring", () => {
     });
   });
 
+  describe("decodeUtf8CodePoint()", () => {
+    it("decodes 1-byte UTF-8 sequence (ASCII)", () => {
+      // 'A' = 0x41 = U+0041
+      const bytes = new Uint8Array([0x41]);
+      const codePoint = Bitstring.decodeUtf8CodePoint(bytes, 0, 1);
+      assert.equal(codePoint, 0x41);
+    });
+
+    it("decodes 2-byte UTF-8 sequence", () => {
+      // '£' = 0xC2 0xA3 = U+00A3 (pound sign)
+      const bytes = new Uint8Array([0xc2, 0xa3]);
+      const codePoint = Bitstring.decodeUtf8CodePoint(bytes, 0, 2);
+      assert.equal(codePoint, 0xa3);
+    });
+
+    it("decodes 3-byte UTF-8 sequence", () => {
+      // '€' = 0xE2 0x82 0xAC = U+20AC (euro sign)
+      const bytes = new Uint8Array([0xe2, 0x82, 0xac]);
+      const codePoint = Bitstring.decodeUtf8CodePoint(bytes, 0, 3);
+      assert.equal(codePoint, 0x20ac);
+    });
+
+    it("decodes 4-byte UTF-8 sequence", () => {
+      // '𐍈' = 0xF0 0x90 0x8D 0x88 = U+10348 (Gothic letter hwair)
+      const bytes = new Uint8Array([0xf0, 0x90, 0x8d, 0x88]);
+      const codePoint = Bitstring.decodeUtf8CodePoint(bytes, 0, 4);
+      assert.equal(codePoint, 0x10348);
+    });
+
+    it("decodes from non-zero start position", () => {
+      // Test decoding '£' starting at position 2
+      const bytes = new Uint8Array([0x41, 0x42, 0xc2, 0xa3]);
+      const codePoint = Bitstring.decodeUtf8CodePoint(bytes, 2, 2);
+      assert.equal(codePoint, 0xa3);
+    });
+  });
+
   describe("fromBits()", () => {
     it("empty", () => {
       const result = Bitstring.fromBits([]);

From 105d2d91dd0e3bc511079f260848483f0c378872 Mon Sep 17 00:00:00 2001
From: Michael Ward <michael@scripthungry.com>
Date: Wed, 18 Feb 2026 15:01:43 +0000
Subject: [PATCH 2/7] Extracts UTF-8 continuation byte validation to Bitstring
 class

---
 assets/js/bitstring.mjs            |  5 +++++
 assets/js/erlang/unicode.mjs       | 32 ++++++++++--------------------
 test/javascript/bitstring_test.mjs | 14 +++++++++++++
 3 files changed, 30 insertions(+), 21 deletions(-)

diff --git a/assets/js/bitstring.mjs b/assets/js/bitstring.mjs
index f4c9b8695..881118d75 100644
--- a/assets/js/bitstring.mjs
+++ b/assets/js/bitstring.mjs
@@ -593,6 +593,11 @@ export default class Bitstring {
     return bitstring.text !== false;
   }
 
+  // Checks if a byte is a valid UTF-8 continuation byte (10xxxxxx).
+  static isValidUtf8ContinuationByte(byte) {
+    return (byte & 0xc0) === 0x80;
+  }
+
   static maybeResolveHex(bitstring) {
     if (bitstring.hex === null) {
       $.maybeSetBytesFromText(bitstring);
diff --git a/assets/js/erlang/unicode.mjs b/assets/js/erlang/unicode.mjs
index c47b0b2e2..e54076495 100644
--- a/assets/js/erlang/unicode.mjs
+++ b/assets/js/erlang/unicode.mjs
@@ -95,9 +95,6 @@ const Erlang_Unicode = {
     // and rejecting overlong encodings, surrogates, and out-of-range values.
     // Time complexity: O(n) where n is the number of bytes.
     const findValidUtf8Length = (bytes) => {
-      // Checks if a byte is a valid UTF-8 continuation byte (10xxxxxx).
-      const isValidContinuation = (byte) => (byte & 0xc0) === 0x80;
-
       // Validates that a code point is within UTF-8 rules:
       // - Not an overlong encoding (using more bytes than necessary)
       // - Not a UTF-16 surrogate (U+D800–U+DFFF)
@@ -124,7 +121,8 @@ const Erlang_Unicode = {
 
         // Verify all continuation bytes have correct pattern (10xxxxxx)
         for (let i = 1; i < length; i++) {
-          if (!isValidContinuation(bytes[start + i])) return false;
+          if (!Bitstring.isValidUtf8ContinuationByte(bytes[start + i]))
+            return false;
         }
 
         // Decode and validate the code point value
@@ -146,7 +144,8 @@ const Erlang_Unicode = {
 
         // Check all available continuation bytes
         for (let i = 1; i < availableBytes; i++) {
-          if (!isValidContinuation(bytes[start + i])) return false;
+          if (!Bitstring.isValidUtf8ContinuationByte(bytes[start + i]))
+            return false;
         }
 
         return true;
@@ -356,9 +355,6 @@ const Erlang_Unicode = {
     // and rejecting overlong encodings, surrogates, and out-of-range values.
     // Time complexity: O(n) where n is the number of bytes.
     const findValidUtf8Length = (bytes) => {
-      // Checks if a byte is a valid UTF-8 continuation byte (10xxxxxx).
-      const isValidContinuation = (byte) => (byte & 0xc0) === 0x80;
-
       // Validates that a code point is within UTF-8 rules:
       // - Not an overlong encoding (using more bytes than necessary)
       // - Not a UTF-16 surrogate (U+D800–U+DFFF)
@@ -385,7 +381,8 @@ const Erlang_Unicode = {
 
         // Verify all continuation bytes have correct pattern (10xxxxxx)
         for (let i = 1; i < length; i++) {
-          if (!isValidContinuation(bytes[start + i])) return false;
+          if (!Bitstring.isValidUtf8ContinuationByte(bytes[start + i]))
+            return false;
         }
 
         // Decode and validate the code point value
@@ -644,9 +641,6 @@ const Erlang_Unicode = {
     // and rejecting overlong encodings, surrogates, and out-of-range values.
     // Time complexity: O(n) where n is the number of bytes.
     const findValidUtf8Length = (bytes) => {
-      // Checks if a byte is a valid UTF-8 continuation byte (10xxxxxx).
-      const isValidContinuation = (byte) => (byte & 0xc0) === 0x80;
-
       // Validates that a code point is within UTF-8 rules:
       // - Not an overlong encoding (using more bytes than necessary)
       // - Not a UTF-16 surrogate (U+D800–U+DFFF)
@@ -673,7 +667,8 @@ const Erlang_Unicode = {
 
         // Verify all continuation bytes have correct pattern (10xxxxxx)
         for (let i = 1; i < length; i++) {
-          if (!isValidContinuation(bytes[start + i])) return false;
+          if (!Bitstring.isValidUtf8ContinuationByte(bytes[start + i]))
+            return false;
         }
 
         // Decode and validate the code point value
@@ -784,9 +779,6 @@ const Erlang_Unicode = {
     // and rejecting overlong encodings, surrogates, and out-of-range values.
     // Time complexity: O(n) where n is the number of bytes.
     const findValidUtf8Length = (bytes) => {
-      // Checks if a byte is a valid UTF-8 continuation byte (10xxxxxx).
-      const isValidContinuation = (byte) => (byte & 0xc0) === 0x80;
-
       // Validates that a code point is within UTF-8 rules:
       // - Not an overlong encoding (using more bytes than necessary)
       // - Not a UTF-16 surrogate (U+D800–U+DFFF)
@@ -813,7 +805,8 @@ const Erlang_Unicode = {
 
         // Verify all continuation bytes have correct pattern (10xxxxxx)
         for (let i = 1; i < length; i++) {
-          if (!isValidContinuation(bytes[start + i])) return false;
+          if (!Bitstring.isValidUtf8ContinuationByte(bytes[start + i]))
+            return false;
         }
 
         // Decode and validate the code point value
@@ -922,9 +915,6 @@ const Erlang_Unicode = {
     // and rejecting overlong encodings, surrogates, and out-of-range values.
     // Time complexity: O(n) where n is the number of bytes.
     const findValidUtf8Length = (bytes) => {
-      // Checks if a byte is a valid UTF-8 continuation byte (10xxxxxx).
-      const isValidContinuation = (byte) => (byte & 0xc0) === 0x80;
-
       // Validates that a code point is within UTF-8 rules:
       // - Not an overlong encoding (using more bytes than necessary)
       // - Not a UTF-16 surrogate (U+D800–U+DFFF)
@@ -951,7 +941,7 @@ const Erlang_Unicode = {
 
         // Verify all continuation bytes have correct pattern (10xxxxxx)
         for (let i = 1; i < length; i++) {
-          if (!isValidContinuation(bytes[start + i])) {
+          if (!Bitstring.isValidUtf8ContinuationByte(bytes[start + i])) {
             return false;
           }
         }
diff --git a/test/javascript/bitstring_test.mjs b/test/javascript/bitstring_test.mjs
index 88b725447..0a53ca436 100644
--- a/test/javascript/bitstring_test.mjs
+++ b/test/javascript/bitstring_test.mjs
@@ -5264,6 +5264,20 @@ describe("Bitstring", () => {
     });
   });
 
+  describe("isValidUtf8ContinuationByte()", () => {
+    it("valid continuation byte (10xxxxxx pattern)", () => {
+      assert.isTrue(Bitstring.isValidUtf8ContinuationByte(0x80)); // 10000000
+      assert.isTrue(Bitstring.isValidUtf8ContinuationByte(0xbf)); // 10111111
+    });
+
+    it("invalid continuation byte (not 10xxxxxx pattern)", () => {
+      assert.isFalse(Bitstring.isValidUtf8ContinuationByte(0x00)); // 00000000 (ASCII)
+      assert.isFalse(Bitstring.isValidUtf8ContinuationByte(0x7f)); // 01111111 (ASCII)
+      assert.isFalse(Bitstring.isValidUtf8ContinuationByte(0xc0)); // 11000000 (2-byte start)
+      assert.isFalse(Bitstring.isValidUtf8ContinuationByte(0xff)); // 11111111 (invalid)
+    });
+  });
+
   describe("maybeResolveHex()", () => {
     it("when hex field is already set", () => {
       const bitstring = Type.bitstring("Hologram");

From 7015625b214ae92310278d6d214735ccda8be3fa Mon Sep 17 00:00:00 2001
From: Michael Ward <michael@scripthungry.com>
Date: Wed, 18 Feb 2026 15:33:08 +0000
Subject: [PATCH 3/7] Extracts UTF-8 code point validation to Bitstring class

---
 assets/js/bitstring.mjs            |  18 ++++++
 assets/js/erlang/unicode.mjs       | 100 ++---------------------------
 test/javascript/bitstring_test.mjs |  30 +++++++++
 3 files changed, 53 insertions(+), 95 deletions(-)

diff --git a/assets/js/bitstring.mjs b/assets/js/bitstring.mjs
index 881118d75..dfbcd8181 100644
--- a/assets/js/bitstring.mjs
+++ b/assets/js/bitstring.mjs
@@ -593,6 +593,24 @@ export default class Bitstring {
     return bitstring.text !== false;
   }
 
+  // Validates that a code point is within UTF-8 rules:
+  // - Not an overlong encoding (using more bytes than necessary)
+  // - Not a UTF-16 surrogate (U+D800–U+DFFF)
+  // - Not above maximum Unicode (U+10FFFF)
+  static isValidUtf8CodePoint(codePoint, encodingLength) {
+    // Smallest code point that needs each encoding length (in bytes)
+    const minValueForLength = {1: 0, 2: 0x80, 3: 0x800, 4: 0x10000};
+
+    // Reject overlong encodings; lengths outside 1-4 skip this check
+    if (codePoint < minValueForLength[encodingLength]) return false;
+    // Reject UTF-16 surrogates (U+D800–U+DFFF)
+    if (codePoint >= 0xd800 && codePoint <= 0xdfff) return false;
+    // Reject code points beyond Unicode range (> U+10FFFF)
+    if (codePoint > 0x10ffff) return false;
+
+    return true;
+  }
+
   // Checks if a byte is a valid UTF-8 continuation byte (10xxxxxx).
   static isValidUtf8ContinuationByte(byte) {
     return (byte & 0xc0) === 0x80;
diff --git a/assets/js/erlang/unicode.mjs b/assets/js/erlang/unicode.mjs
index e54076495..064b25164 100644
--- a/assets/js/erlang/unicode.mjs
+++ b/assets/js/erlang/unicode.mjs
@@ -95,24 +95,6 @@ const Erlang_Unicode = {
     // and rejecting overlong encodings, surrogates, and out-of-range values.
     // Time complexity: O(n) where n is the number of bytes.
     const findValidUtf8Length = (bytes) => {
-      // Validates that a code point is within UTF-8 rules:
-      // - Not an overlong encoding (using more bytes than necessary)
-      // - Not a UTF-16 surrogate (U+D800–U+DFFF)
-      // - Not above maximum Unicode (U+10FFFF)
-      const isValidCodePoint = (codePoint, encodingLength) => {
-        // Check for overlong encodings (security issue)
-        const minValueForLength = [0, 0, 0x80, 0x800, 0x10000];
-        if (codePoint < minValueForLength[encodingLength]) return false;
-
-        // Reject UTF-16 surrogates (U+D800–U+DFFF)
-        if (codePoint >= 0xd800 && codePoint <= 0xdfff) return false;
-
-        // Reject code points beyond Unicode range (> U+10FFFF)
-        if (codePoint > 0x10ffff) return false;
-
-        return true;
-      };
-
       // Validates a complete UTF-8 sequence at the given position.
       // Checks: sufficient bytes, valid continuations, and valid code point.
       const isValidSequence = (start, length) => {
@@ -128,7 +110,7 @@ const Erlang_Unicode = {
         // Decode and validate the code point value
         const codePoint = Bitstring.decodeUtf8CodePoint(bytes, start, length);
 
-        return isValidCodePoint(codePoint, length);
+        return Bitstring.isValidUtf8CodePoint(codePoint, length);
       };
 
       // Checks if there's a truncated (incomplete) sequence at position.
@@ -355,24 +337,6 @@ const Erlang_Unicode = {
     // and rejecting overlong encodings, surrogates, and out-of-range values.
     // Time complexity: O(n) where n is the number of bytes.
     const findValidUtf8Length = (bytes) => {
-      // Validates that a code point is within UTF-8 rules:
-      // - Not an overlong encoding (using more bytes than necessary)
-      // - Not a UTF-16 surrogate (U+D800–U+DFFF)
-      // - Not above maximum Unicode (U+10FFFF)
-      const isValidCodePoint = (codePoint, encodingLength) => {
-        // Check for overlong encodings (security issue)
-        const minValueForLength = [0, 0, 0x80, 0x800, 0x10000];
-        if (codePoint < minValueForLength[encodingLength]) return false;
-
-        // Reject UTF-16 surrogates (U+D800–U+DFFF)
-        if (codePoint >= 0xd800 && codePoint <= 0xdfff) return false;
-
-        // Reject code points beyond Unicode range (> U+10FFFF)
-        if (codePoint > 0x10ffff) return false;
-
-        return true;
-      };
-
       // Validates a complete UTF-8 sequence at the given position.
       // Checks: sufficient bytes, valid continuations, and valid code point.
       const isValidSequence = (start, length) => {
@@ -388,7 +352,7 @@ const Erlang_Unicode = {
         // Decode and validate the code point value
         const codePoint = Bitstring.decodeUtf8CodePoint(bytes, start, length);
 
-        return isValidCodePoint(codePoint, length);
+        return Bitstring.isValidUtf8CodePoint(codePoint, length);
       };
 
       // Main loop: scan forward, validating each sequence
@@ -641,24 +605,6 @@ const Erlang_Unicode = {
     // and rejecting overlong encodings, surrogates, and out-of-range values.
     // Time complexity: O(n) where n is the number of bytes.
     const findValidUtf8Length = (bytes) => {
-      // Validates that a code point is within UTF-8 rules:
-      // - Not an overlong encoding (using more bytes than necessary)
-      // - Not a UTF-16 surrogate (U+D800–U+DFFF)
-      // - Not above maximum Unicode (U+10FFFF)
-      const isValidCodePoint = (codePoint, encodingLength) => {
-        // Check for overlong encodings (security issue)
-        const minValueForLength = [0, 0, 0x80, 0x800, 0x10000];
-        if (codePoint < minValueForLength[encodingLength]) return false;
-
-        // Reject UTF-16 surrogates (U+D800–U+DFFF)
-        if (codePoint >= 0xd800 && codePoint <= 0xdfff) return false;
-
-        // Reject code points beyond Unicode range (> U+10FFFF)
-        if (codePoint > 0x10ffff) return false;
-
-        return true;
-      };
-
       // Validates a complete UTF-8 sequence at the given position.
       // Checks: sufficient bytes, valid continuations, and valid code point.
       const isValidSequence = (start, length) => {
@@ -674,7 +620,7 @@ const Erlang_Unicode = {
         // Decode and validate the code point value
         const codePoint = Bitstring.decodeUtf8CodePoint(bytes, start, length);
 
-        return isValidCodePoint(codePoint, length);
+        return Bitstring.isValidUtf8CodePoint(codePoint, length);
       };
 
       // Main loop: scan forward, validating each sequence
@@ -779,24 +725,6 @@ const Erlang_Unicode = {
     // and rejecting overlong encodings, surrogates, and out-of-range values.
     // Time complexity: O(n) where n is the number of bytes.
     const findValidUtf8Length = (bytes) => {
-      // Validates that a code point is within UTF-8 rules:
-      // - Not an overlong encoding (using more bytes than necessary)
-      // - Not a UTF-16 surrogate (U+D800–U+DFFF)
-      // - Not above maximum Unicode (U+10FFFF)
-      const isValidCodePoint = (codePoint, encodingLength) => {
-        // Check for overlong encodings (security issue)
-        const minValueForLength = [0, 0, 0x80, 0x800, 0x10000];
-        if (codePoint < minValueForLength[encodingLength]) return false;
-
-        // Reject UTF-16 surrogates (U+D800–U+DFFF)
-        if (codePoint >= 0xd800 && codePoint <= 0xdfff) return false;
-
-        // Reject code points beyond Unicode range (> U+10FFFF)
-        if (codePoint > 0x10ffff) return false;
-
-        return true;
-      };
-
       // Validates a complete UTF-8 sequence at the given position.
       // Checks: sufficient bytes, valid continuations, and valid code point.
       const isValidSequence = (start, length) => {
@@ -811,7 +739,7 @@ const Erlang_Unicode = {
 
         // Decode and validate the code point value
         const codePoint = Bitstring.decodeUtf8CodePoint(bytes, start, length);
-        return isValidCodePoint(codePoint, length);
+        return Bitstring.isValidUtf8CodePoint(codePoint, length);
       };
 
       // Main loop: scan forward, validating each sequence
@@ -915,24 +843,6 @@ const Erlang_Unicode = {
     // and rejecting overlong encodings, surrogates, and out-of-range values.
     // Time complexity: O(n) where n is the number of bytes.
     const findValidUtf8Length = (bytes) => {
-      // Validates that a code point is within UTF-8 rules:
-      // - Not an overlong encoding (using more bytes than necessary)
-      // - Not a UTF-16 surrogate (U+D800–U+DFFF)
-      // - Not above maximum Unicode (U+10FFFF)
-      const isValidCodePoint = (codePoint, encodingLength) => {
-        // Check for overlong encodings (security issue)
-        const minValueForLength = [0, 0, 0x80, 0x800, 0x10000];
-        if (codePoint < minValueForLength[encodingLength]) return false;
-
-        // Reject UTF-16 surrogates (U+D800–U+DFFF)
-        if (codePoint >= 0xd800 && codePoint <= 0xdfff) return false;
-
-        // Reject code points beyond Unicode range (> U+10FFFF)
-        if (codePoint > 0x10ffff) return false;
-
-        return true;
-      };
-
       // Validates a complete UTF-8 sequence at the given position.
       // Checks: sufficient bytes, valid continuations, and valid code point.
       const isValidSequence = (start, length) => {
@@ -949,7 +859,7 @@ const Erlang_Unicode = {
         // Decode and validate the code point value
         const codePoint = Bitstring.decodeUtf8CodePoint(bytes, start, length);
 
-        return isValidCodePoint(codePoint, length);
+        return Bitstring.isValidUtf8CodePoint(codePoint, length);
       };
 
       // Main loop: scan forward, validating each sequence
diff --git a/test/javascript/bitstring_test.mjs b/test/javascript/bitstring_test.mjs
index 0a53ca436..6f1916f56 100644
--- a/test/javascript/bitstring_test.mjs
+++ b/test/javascript/bitstring_test.mjs
@@ -5278,6 +5278,36 @@ describe("Bitstring", () => {
     });
   });
 
+  describe("isValidUtf8CodePoint()", () => {
+    it("valid codepoint", () => {
+      assert.isTrue(Bitstring.isValidUtf8CodePoint(0x41, 1)); // ASCII 'A'
+      assert.isTrue(Bitstring.isValidUtf8CodePoint(0xa9, 2)); // © (copyright)
+      assert.isTrue(Bitstring.isValidUtf8CodePoint(0x20ac, 3)); // € (euro)
+      assert.isTrue(Bitstring.isValidUtf8CodePoint(0x10348, 4)); // 𐍈 (Gothic letter)
+      assert.isTrue(Bitstring.isValidUtf8CodePoint(0x10ffff, 4)); // Maximum valid Unicode
+    });
+
+    it("overlong encoding (codepoint too small for encoding length)", () => {
+      // 'A' (0x41) must use 1 byte, not 2
+      assert.isFalse(Bitstring.isValidUtf8CodePoint(0x41, 2));
+      // 0x7FF requires 2 bytes, but attempting 3-byte encoding
+      assert.isFalse(Bitstring.isValidUtf8CodePoint(0x7ff, 3));
+      // 0xFFFF requires 3 bytes, but attempting 4-byte encoding
+      assert.isFalse(Bitstring.isValidUtf8CodePoint(0xffff, 4));
+    });
+
+    it("UTF-16 surrogate (U+D800–U+DFFF)", () => {
+      assert.isFalse(Bitstring.isValidUtf8CodePoint(0xd800, 3)); // Start of surrogate range
+      assert.isFalse(Bitstring.isValidUtf8CodePoint(0xdc00, 3)); // Middle of surrogate range
+      assert.isFalse(Bitstring.isValidUtf8CodePoint(0xdfff, 3)); // End of surrogate range
+    });
+
+    it("beyond Unicode range (> U+10FFFF)", () => {
+      assert.isFalse(Bitstring.isValidUtf8CodePoint(0x110000, 4));
+      assert.isFalse(Bitstring.isValidUtf8CodePoint(0x200000, 4));
+    });
+  });
+
   describe("maybeResolveHex()", () => {
     it("when hex field is already set", () => {
       const bitstring = Type.bitstring("Hologram");

From b9bfe2d681c06d792a3599abf8deebf68d907316 Mon Sep 17 00:00:00 2001
From: Michael Ward <michael@scripthungry.com>
Date: Wed, 18 Feb 2026 16:34:55 +0000
Subject: [PATCH 4/7] Extracts UTF-8 sequence validation to Bitstring class

---
 assets/js/bitstring.mjs            |  19 +++++
 assets/js/erlang/unicode.mjs       | 130 +++++++----------------------
 test/javascript/bitstring_test.mjs |  84 +++++++++++++++----
 3 files changed, 120 insertions(+), 113 deletions(-)

diff --git a/assets/js/bitstring.mjs b/assets/js/bitstring.mjs
index dfbcd8181..5bee0cc43 100644
--- a/assets/js/bitstring.mjs
+++ b/assets/js/bitstring.mjs
@@ -616,6 +616,25 @@ export default class Bitstring {
     return (byte & 0xc0) === 0x80;
   }
 
+  // Validates the `length`-byte UTF-8 sequence that starts at bytes[start].
+  // Checks: sufficient bytes, valid continuation bytes, and valid code point.
+  // Precondition: the leader byte's length prefix is not verified here, so
+  // `length` must be the result of getUtf8SequenceLength(bytes[start]).
+  static isValidUtf8Sequence(bytes, start, length) {
+    // A sequence that would run past the end of `bytes` is incomplete
+    if (start + length > bytes.length) return false;
+
+    // Every byte after the leader must match the 10xxxxxx pattern
+    for (let i = 1; i < length; i++) {
+      if (!$.isValidUtf8ContinuationByte(bytes[start + i])) return false;
+    }
+
+    // Decode, then reject overlong, surrogate and out-of-range values
+    const codePoint = $.decodeUtf8CodePoint(bytes, start, length);
+
+    return $.isValidUtf8CodePoint(codePoint, length);
+  }
+
   static maybeResolveHex(bitstring) {
     if (bitstring.hex === null) {
       $.maybeSetBytesFromText(bitstring);
diff --git a/assets/js/erlang/unicode.mjs b/assets/js/erlang/unicode.mjs
index 064b25164..c9ee533ba 100644
--- a/assets/js/erlang/unicode.mjs
+++ b/assets/js/erlang/unicode.mjs
@@ -95,24 +95,6 @@ const Erlang_Unicode = {
     // and rejecting overlong encodings, surrogates, and out-of-range values.
     // Time complexity: O(n) where n is the number of bytes.
     const findValidUtf8Length = (bytes) => {
-      // Validates a complete UTF-8 sequence at the given position.
-      // Checks: sufficient bytes, valid continuations, and valid code point.
-      const isValidSequence = (start, length) => {
-        // Check if we have enough bytes
-        if (start + length > bytes.length) return false;
-
-        // Verify all continuation bytes have correct pattern (10xxxxxx)
-        for (let i = 1; i < length; i++) {
-          if (!Bitstring.isValidUtf8ContinuationByte(bytes[start + i]))
-            return false;
-        }
-
-        // Decode and validate the code point value
-        const codePoint = Bitstring.decodeUtf8CodePoint(bytes, start, length);
-
-        return Bitstring.isValidUtf8CodePoint(codePoint, length);
-      };
-
       // Checks if there's a truncated (incomplete) sequence at position.
       // Returns true if bytes could be a valid prefix of a UTF-8 sequence.
       const isTruncatedSequence = (start) => {
@@ -138,7 +120,11 @@ const Erlang_Unicode = {
 
       while (pos < bytes.length) {
         const seqLength = Bitstring.getUtf8SequenceLength(bytes[pos]);
-        if (seqLength === false || !isValidSequence(pos, seqLength)) break;
+        if (
+          seqLength === false ||
+          !Bitstring.isValidUtf8Sequence(bytes, pos, seqLength)
+        )
+          break;
         pos += seqLength;
       }
 
@@ -337,30 +323,16 @@ const Erlang_Unicode = {
     // and rejecting overlong encodings, surrogates, and out-of-range values.
     // Time complexity: O(n) where n is the number of bytes.
     const findValidUtf8Length = (bytes) => {
-      // Validates a complete UTF-8 sequence at the given position.
-      // Checks: sufficient bytes, valid continuations, and valid code point.
-      const isValidSequence = (start, length) => {
-        // Check if we have enough bytes
-        if (start + length > bytes.length) return false;
-
-        // Verify all continuation bytes have correct pattern (10xxxxxx)
-        for (let i = 1; i < length; i++) {
-          if (!Bitstring.isValidUtf8ContinuationByte(bytes[start + i]))
-            return false;
-        }
-
-        // Decode and validate the code point value
-        const codePoint = Bitstring.decodeUtf8CodePoint(bytes, start, length);
-
-        return Bitstring.isValidUtf8CodePoint(codePoint, length);
-      };
-
-      // Main loop: scan forward, validating each sequence
+      // Scan forward, validating each sequence
       let pos = 0;
 
       while (pos < bytes.length) {
         const seqLength = Bitstring.getUtf8SequenceLength(bytes[pos]);
-        if (seqLength === false || !isValidSequence(pos, seqLength)) break;
+        if (
+          seqLength === false ||
+          !Bitstring.isValidUtf8Sequence(bytes, pos, seqLength)
+        )
+          break;
         pos += seqLength;
       }
 
@@ -605,29 +577,16 @@ const Erlang_Unicode = {
     // and rejecting overlong encodings, surrogates, and out-of-range values.
     // Time complexity: O(n) where n is the number of bytes.
     const findValidUtf8Length = (bytes) => {
-      // Validates a complete UTF-8 sequence at the given position.
-      // Checks: sufficient bytes, valid continuations, and valid code point.
-      const isValidSequence = (start, length) => {
-        // Check if we have enough bytes
-        if (start + length > bytes.length) return false;
-
-        // Verify all continuation bytes have correct pattern (10xxxxxx)
-        for (let i = 1; i < length; i++) {
-          if (!Bitstring.isValidUtf8ContinuationByte(bytes[start + i]))
-            return false;
-        }
-
-        // Decode and validate the code point value
-        const codePoint = Bitstring.decodeUtf8CodePoint(bytes, start, length);
-
-        return Bitstring.isValidUtf8CodePoint(codePoint, length);
-      };
-
-      // Main loop: scan forward, validating each sequence
+      // Scan forward, validating each sequence
       let pos = 0;
+
       while (pos < bytes.length) {
         const seqLength = Bitstring.getUtf8SequenceLength(bytes[pos]);
-        if (seqLength === false || !isValidSequence(pos, seqLength)) break;
+        if (
+          seqLength === false ||
+          !Bitstring.isValidUtf8Sequence(bytes, pos, seqLength)
+        )
+          break;
         pos += seqLength;
       }
 
@@ -725,28 +684,16 @@ const Erlang_Unicode = {
     // and rejecting overlong encodings, surrogates, and out-of-range values.
     // Time complexity: O(n) where n is the number of bytes.
     const findValidUtf8Length = (bytes) => {
-      // Validates a complete UTF-8 sequence at the given position.
-      // Checks: sufficient bytes, valid continuations, and valid code point.
-      const isValidSequence = (start, length) => {
-        // Check if we have enough bytes
-        if (start + length > bytes.length) return false;
-
-        // Verify all continuation bytes have correct pattern (10xxxxxx)
-        for (let i = 1; i < length; i++) {
-          if (!Bitstring.isValidUtf8ContinuationByte(bytes[start + i]))
-            return false;
-        }
-
-        // Decode and validate the code point value
-        const codePoint = Bitstring.decodeUtf8CodePoint(bytes, start, length);
-        return Bitstring.isValidUtf8CodePoint(codePoint, length);
-      };
-
-      // Main loop: scan forward, validating each sequence
+      // scan forward, validating each sequence
       let pos = 0;
+
       while (pos < bytes.length) {
         const seqLength = Bitstring.getUtf8SequenceLength(bytes[pos]);
-        if (seqLength === false || !isValidSequence(pos, seqLength)) break;
+        if (
+          seqLength === false ||
+          !Bitstring.isValidUtf8Sequence(bytes, pos, seqLength)
+        )
+          break;
         pos += seqLength;
       }
 
@@ -843,30 +790,15 @@ const Erlang_Unicode = {
     // and rejecting overlong encodings, surrogates, and out-of-range values.
     // Time complexity: O(n) where n is the number of bytes.
     const findValidUtf8Length = (bytes) => {
-      // Validates a complete UTF-8 sequence at the given position.
-      // Checks: sufficient bytes, valid continuations, and valid code point.
-      const isValidSequence = (start, length) => {
-        // Check if we have enough bytes
-        if (start + length > bytes.length) return false;
-
-        // Verify all continuation bytes have correct pattern (10xxxxxx)
-        for (let i = 1; i < length; i++) {
-          if (!Bitstring.isValidUtf8ContinuationByte(bytes[start + i])) {
-            return false;
-          }
-        }
-
-        // Decode and validate the code point value
-        const codePoint = Bitstring.decodeUtf8CodePoint(bytes, start, length);
-
-        return Bitstring.isValidUtf8CodePoint(codePoint, length);
-      };
-
-      // Main loop: scan forward, validating each sequence
+      // Scan forward, validating each sequence
       let pos = 0;
       while (pos < bytes.length) {
         const seqLength = Bitstring.getUtf8SequenceLength(bytes[pos]);
-        if (seqLength === false || !isValidSequence(pos, seqLength)) break;
+        if (
+          seqLength === false ||
+          !Bitstring.isValidUtf8Sequence(bytes, pos, seqLength)
+        )
+          break;
         pos += seqLength;
       }
 
diff --git a/test/javascript/bitstring_test.mjs b/test/javascript/bitstring_test.mjs
index 6f1916f56..a2b906ad7 100644
--- a/test/javascript/bitstring_test.mjs
+++ b/test/javascript/bitstring_test.mjs
@@ -5264,20 +5264,6 @@ describe("Bitstring", () => {
     });
   });
 
-  describe("isValidUtf8ContinuationByte()", () => {
-    it("valid continuation byte (10xxxxxx pattern)", () => {
-      assert.isTrue(Bitstring.isValidUtf8ContinuationByte(0x80)); // 10000000
-      assert.isTrue(Bitstring.isValidUtf8ContinuationByte(0xbf)); // 10111111
-    });
-
-    it("invalid continuation byte (not 10xxxxxx pattern)", () => {
-      assert.isFalse(Bitstring.isValidUtf8ContinuationByte(0x00)); // 00000000 (ASCII)
-      assert.isFalse(Bitstring.isValidUtf8ContinuationByte(0x7f)); // 01111111 (ASCII)
-      assert.isFalse(Bitstring.isValidUtf8ContinuationByte(0xc0)); // 11000000 (2-byte start)
-      assert.isFalse(Bitstring.isValidUtf8ContinuationByte(0xff)); // 11111111 (invalid)
-    });
-  });
-
   describe("isValidUtf8CodePoint()", () => {
     it("valid codepoint", () => {
       assert.isTrue(Bitstring.isValidUtf8CodePoint(0x41, 1)); // ASCII 'A'
@@ -5308,6 +5294,76 @@ describe("Bitstring", () => {
     });
   });
 
+  describe("isValidUtf8ContinuationByte()", () => {
+    it("valid continuation byte (10xxxxxx pattern)", () => {
+      assert.isTrue(Bitstring.isValidUtf8ContinuationByte(0x80)); // 10000000
+      assert.isTrue(Bitstring.isValidUtf8ContinuationByte(0xbf)); // 10111111
+    });
+
+    it("invalid continuation byte (not 10xxxxxx pattern)", () => {
+      assert.isFalse(Bitstring.isValidUtf8ContinuationByte(0x00)); // 00000000 (ASCII)
+      assert.isFalse(Bitstring.isValidUtf8ContinuationByte(0x7f)); // 01111111 (ASCII)
+      assert.isFalse(Bitstring.isValidUtf8ContinuationByte(0xc0)); // 11000000 (2-byte start)
+      assert.isFalse(Bitstring.isValidUtf8ContinuationByte(0xff)); // 11111111 (invalid)
+    });
+  });
+
+  describe("isValidUtf8Sequence()", () => {
+    it("valid 1-byte sequence (ASCII)", () => {
+      // ASCII 'A'
+      const bytes = new Uint8Array([0x41]);
+      assert.isTrue(Bitstring.isValidUtf8Sequence(bytes, 0, 1));
+    });
+
+    it("valid 2-byte sequence", () => {
+      // é (U+00E9): 0xC3 0xA9
+      const bytes = new Uint8Array([0xc3, 0xa9]);
+      assert.isTrue(Bitstring.isValidUtf8Sequence(bytes, 0, 2));
+    });
+
+    it("valid 3-byte sequence", () => {
+      // € (U+20AC): 0xE2 0x82 0xAC
+      const bytes = new Uint8Array([0xe2, 0x82, 0xac]);
+      assert.isTrue(Bitstring.isValidUtf8Sequence(bytes, 0, 3));
+    });
+
+    it("valid 4-byte sequence", () => {
+      // 𐍈 (U+10348): 0xF0 0x90 0x8D 0x88
+      const bytes = new Uint8Array([0xf0, 0x90, 0x8d, 0x88]);
+      assert.isTrue(Bitstring.isValidUtf8Sequence(bytes, 0, 4));
+    });
+
+    it("not enough bytes available", () => {
+      const bytes = new Uint8Array([0xc3, 0xa9]); // 2 bytes
+      // Try to validate 3-byte sequence starting at position 0
+      assert.isFalse(Bitstring.isValidUtf8Sequence(bytes, 0, 3));
+    });
+
+    it("invalid continuation byte", () => {
+      // 0xC3 starts a 2-byte sequence, but 0x41 (ASCII 'A') is not a valid continuation
+      const bytes = new Uint8Array([0xc3, 0x41]);
+      assert.isFalse(Bitstring.isValidUtf8Sequence(bytes, 0, 2));
+    });
+
+    it("overlong encoding", () => {
+      // 'A' (0x41) encoded as 2-byte sequence: 0xC1 0x81 (overlong)
+      const bytes = new Uint8Array([0xc1, 0x81]);
+      assert.isFalse(Bitstring.isValidUtf8Sequence(bytes, 0, 2));
+    });
+
+    it("UTF-16 surrogate", () => {
+      // U+D800 (surrogate) encoded as 3-byte sequence: 0xED 0xA0 0x80
+      const bytes = new Uint8Array([0xed, 0xa0, 0x80]);
+      assert.isFalse(Bitstring.isValidUtf8Sequence(bytes, 0, 3));
+    });
+
+    it("beyond Unicode range", () => {
+      // U+110000 (beyond max) encoded as 4-byte sequence: 0xF4 0x90 0x80 0x80
+      const bytes = new Uint8Array([0xf4, 0x90, 0x80, 0x80]);
+      assert.isFalse(Bitstring.isValidUtf8Sequence(bytes, 0, 4));
+    });
+  });
+
   describe("maybeResolveHex()", () => {
     it("when hex field is already set", () => {
       const bitstring = Type.bitstring("Hologram");

From 9ab7bc37b117477b18319b0f1e2d8193bd1e9c5d Mon Sep 17 00:00:00 2001
From: Michael Ward <michael@scripthungry.com>
Date: Wed, 18 Feb 2026 17:08:58 +0000
Subject: [PATCH 5/7] Extracts truncated UTF-8 sequence validation to BitString
 class

---
 assets/js/bitstring.mjs            | 21 ++++++++
 assets/js/erlang/unicode.mjs       | 31 +++---------
 test/javascript/bitstring_test.mjs | 79 ++++++++++++++++++++++++++++++
 3 files changed, 107 insertions(+), 24 deletions(-)

diff --git a/assets/js/bitstring.mjs b/assets/js/bitstring.mjs
index 5bee0cc43..d80dd9937 100644
--- a/assets/js/bitstring.mjs
+++ b/assets/js/bitstring.mjs
@@ -635,6 +635,27 @@ export default class Bitstring {
     return $.isValidUtf8CodePoint(codePoint, length);
   }
 
+  // Checks if there's a truncated (incomplete) UTF-8 sequence at the given position.
+  // Returns true if bytes could be a valid prefix of a UTF-8 sequence.
+  // bytes: Uint8Array containing UTF-8 encoded data
+  // start: byte index to check for truncation
+  static isTruncatedUtf8Sequence(bytes, start) {
+    const leaderByte = bytes[start];
+    const expectedLength = $.getUtf8SequenceLength(leaderByte);
+
+    if (expectedLength === false) return false;
+
+    const availableBytes = bytes.length - start;
+    if (availableBytes >= expectedLength) return false;
+
+    // Check all available continuation bytes
+    for (let i = 1; i < availableBytes; i++) {
+      if (!$.isValidUtf8ContinuationByte(bytes[start + i])) return false;
+    }
+
+    return true;
+  }
+
   static maybeResolveHex(bitstring) {
     if (bitstring.hex === null) {
       $.maybeSetBytesFromText(bitstring);
diff --git a/assets/js/erlang/unicode.mjs b/assets/js/erlang/unicode.mjs
index c9ee533ba..a433c4ff0 100644
--- a/assets/js/erlang/unicode.mjs
+++ b/assets/js/erlang/unicode.mjs
@@ -95,27 +95,7 @@ const Erlang_Unicode = {
     // and rejecting overlong encodings, surrogates, and out-of-range values.
     // Time complexity: O(n) where n is the number of bytes.
     const findValidUtf8Length = (bytes) => {
-      // Checks if there's a truncated (incomplete) sequence at position.
-      // Returns true if bytes could be a valid prefix of a UTF-8 sequence.
-      const isTruncatedSequence = (start) => {
-        const leaderByte = bytes[start];
-        const expectedLength = Bitstring.getUtf8SequenceLength(leaderByte);
-
-        if (expectedLength === false) return false;
-
-        const availableBytes = bytes.length - start;
-        if (availableBytes >= expectedLength) return false;
-
-        // Check all available continuation bytes
-        for (let i = 1; i < availableBytes; i++) {
-          if (!Bitstring.isValidUtf8ContinuationByte(bytes[start + i]))
-            return false;
-        }
-
-        return true;
-      };
-
-      // Main loop: scan forward, validating each sequence
+      // Scan forward, validating each sequence
       let pos = 0;
 
       while (pos < bytes.length) {
@@ -128,7 +108,7 @@ const Erlang_Unicode = {
         pos += seqLength;
       }
 
-      return {validLength: pos, isTruncated: isTruncatedSequence(pos)};
+      return pos;
     };
 
     // Converts a binary to a list of codepoints.
@@ -160,7 +140,7 @@ const Erlang_Unicode = {
     const handleInvalidUtf8FromBinary = (invalidBinary) => {
       Bitstring.maybeSetBytesFromText(invalidBinary);
       const bytes = invalidBinary.bytes ?? new Uint8Array(0);
-      const {validLength, isTruncated} = findValidUtf8Length(bytes);
+      const validLength = findValidUtf8Length(bytes);
 
       const validPrefix = Bitstring.fromBytes(bytes.slice(0, validLength));
       const invalidRest = Bitstring.fromBytes(bytes.slice(validLength));
@@ -168,6 +148,8 @@ const Erlang_Unicode = {
       const codepoints =
         validLength > 0 ? convertBinaryToCodepoints(validPrefix) : [];
 
+      const isTruncated = Bitstring.isTruncatedUtf8Sequence(bytes, validLength);
+
       if (isTruncated) {
         return createIncompleteTuple(codepoints, invalidRest);
       }
@@ -187,7 +169,8 @@ const Erlang_Unicode = {
       // Check if it's a truncated sequence
       Bitstring.maybeSetBytesFromText(invalidBinary);
       const bytes = invalidBinary.bytes ?? new Uint8Array(0);
-      const {isTruncated} = findValidUtf8Length(bytes);
+      const validLength = findValidUtf8Length(bytes);
+      const isTruncated = Bitstring.isTruncatedUtf8Sequence(bytes, validLength);
 
       if (isTruncated) {
         // Incomplete: rest is the binary directly (not wrapped in list)
diff --git a/test/javascript/bitstring_test.mjs b/test/javascript/bitstring_test.mjs
index a2b906ad7..dd371b89b 100644
--- a/test/javascript/bitstring_test.mjs
+++ b/test/javascript/bitstring_test.mjs
@@ -5264,6 +5264,85 @@ describe("Bitstring", () => {
     });
   });
 
+  describe("isTruncatedUtf8Sequence()", () => {
+    // Happy path: truncated 2-byte sequence
+    it("returns true for truncated 2-byte sequence with valid continuation byte", () => {
+      // 0xC2 requires 2 bytes, but only the leader byte is available (no continuation bytes to check)
+      const bytes = new Uint8Array([0xc2]);
+      assert.equal(Bitstring.isTruncatedUtf8Sequence(bytes, 0), true);
+    });
+
+    // Happy path: truncated 3-byte sequence
+    it("returns true for truncated 3-byte sequence with valid continuation bytes", () => {
+      // 0xE2 requires 3 bytes, but only 2 bytes available (leader plus 1 valid continuation)
+      const bytes = new Uint8Array([0xe2, 0x82]);
+      assert.equal(Bitstring.isTruncatedUtf8Sequence(bytes, 0), true);
+    });
+
+    // Happy path: truncated 4-byte sequence
+    it("returns true for truncated 4-byte sequence with valid continuation bytes", () => {
+      // 0xF0 requires 4 bytes, but only 3 bytes available (leader plus 2 valid continuations)
+      const bytes = new Uint8Array([0xf0, 0x90, 0x8d]);
+      assert.equal(Bitstring.isTruncatedUtf8Sequence(bytes, 0), true);
+    });
+
+    // Edge case: start position in middle of data
+    it("returns true for truncated sequence starting at non-zero position", () => {
+      // Valid ASCII prefix, then truncated 2-byte sequence
+      const bytes = new Uint8Array([0x41, 0xc2]); // 'A' + truncated '£'
+      assert.equal(Bitstring.isTruncatedUtf8Sequence(bytes, 1), true);
+    });
+
+    // Edge case: multiple valid continuation bytes before truncation
+    it("returns true for 4-byte sequence with 2 valid continuation bytes (truncated)", () => {
+      // 0xF0 (4-byte) with 2 valid continuation bytes available
+      const bytes = new Uint8Array([0xf0, 0x90, 0x8d]);
+      assert.equal(Bitstring.isTruncatedUtf8Sequence(bytes, 0), true);
+    });
+
+    // False path: invalid leader byte
+    it("returns false for invalid leader byte", () => {
+      // 0xC0 is invalid (overlong encoding marker)
+      const bytes = new Uint8Array([0xc0]);
+      assert.equal(Bitstring.isTruncatedUtf8Sequence(bytes, 0), false);
+    });
+
+    // False path: invalid leader byte (out of range)
+    it("returns false for leader byte >= 0xF5", () => {
+      // 0xF5 and above are invalid (> U+10FFFF)
+      const bytes = new Uint8Array([0xf5]);
+      assert.equal(Bitstring.isTruncatedUtf8Sequence(bytes, 0), false);
+    });
+
+    // False path: enough bytes available
+    it("returns false when enough bytes are available for complete sequence", () => {
+      // 0xC2 requires 2 bytes, and 2 bytes are available
+      const bytes = new Uint8Array([0xc2, 0xa3]);
+      assert.equal(Bitstring.isTruncatedUtf8Sequence(bytes, 0), false);
+    });
+
+    // False path: invalid continuation byte in truncated sequence
+    it("returns false when continuation byte is invalid", () => {
+      // 0xC2 requires 2 bytes and 2 are present, so the sequence is not truncated (0x00 is merely an invalid continuation)
+      const bytes = new Uint8Array([0xc2, 0x00]);
+      assert.equal(Bitstring.isTruncatedUtf8Sequence(bytes, 0), false);
+    });
+
+    // False path: ASCII byte
+    it("returns false for ASCII byte (1-byte sequence)", () => {
+      // ASCII bytes are 1-byte sequences, always complete
+      const bytes = new Uint8Array([0x41]); // 'A'
+      assert.equal(Bitstring.isTruncatedUtf8Sequence(bytes, 0), false);
+    });
+
+    // False path: truncated sequence whose only available continuation byte is invalid
+    it("returns false when truncated sequence has invalid continuation byte at start", () => {
+      // 0xE2 requires 3 bytes, 2 available, but second byte (0x00) is invalid continuation
+      const bytes = new Uint8Array([0xe2, 0x00]);
+      assert.equal(Bitstring.isTruncatedUtf8Sequence(bytes, 0), false);
+    });
+  });
+
   describe("isValidUtf8CodePoint()", () => {
     it("valid codepoint", () => {
       assert.isTrue(Bitstring.isValidUtf8CodePoint(0x41, 1)); // ASCII 'A'

From f458bf4970d30c63e6443d4f2e5766396535b4c9 Mon Sep 17 00:00:00 2001
From: Michael Ward <michael@scripthungry.com>
Date: Wed, 18 Feb 2026 23:07:11 +0000
Subject: [PATCH 6/7] Extracts valid UTF-8 sequence length to BitString class

---
 assets/js/bitstring.mjs            |  21 ++++
 assets/js/erlang/unicode.mjs       | 116 ++-------------------
 test/javascript/bitstring_test.mjs | 156 +++++++++++++++++++++++------
 3 files changed, 153 insertions(+), 140 deletions(-)

diff --git a/assets/js/bitstring.mjs b/assets/js/bitstring.mjs
index d80dd9937..72d37b5a3 100644
--- a/assets/js/bitstring.mjs
+++ b/assets/js/bitstring.mjs
@@ -511,6 +511,27 @@ export default class Bitstring {
     return false; // Invalid leader byte
   }
 
+  // Scans forward once to find the longest valid UTF-8 prefix.
+  // Validates UTF-8 by checking byte structure, decoding code points,
+  // and rejecting overlong encodings, surrogates, and out-of-range values.
+  // Time complexity: O(n) where n is the number of bytes.
+  static getValidUtf8Length(bytes) {
+    let pos = 0;
+
+    while (pos < bytes.length) {
+      const seqLength = $.getUtf8SequenceLength(bytes[pos]);
+      if (
+        seqLength === false ||
+        !$.isValidUtf8Sequence(bytes, pos, seqLength)
+      ) {
+        break;
+      }
+      pos += seqLength;
+    }
+
+    return pos;
+  }
+
   static isEmpty(bitstring) {
     return bitstring.text === "" || bitstring.bytes?.length === 0;
   }
diff --git a/assets/js/erlang/unicode.mjs b/assets/js/erlang/unicode.mjs
index a433c4ff0..15424f5ab 100644
--- a/assets/js/erlang/unicode.mjs
+++ b/assets/js/erlang/unicode.mjs
@@ -90,27 +90,6 @@ const Erlang_Unicode = {
   "characters_to_list/1": (data) => {
     // Helpers
 
-    // Scans forward once to find the longest valid UTF-8 prefix.
-    // Validates UTF-8 by checking byte structure, decoding code points,
-    // and rejecting overlong encodings, surrogates, and out-of-range values.
-    // Time complexity: O(n) where n is the number of bytes.
-    const findValidUtf8Length = (bytes) => {
-      // Scan forward, validating each sequence
-      let pos = 0;
-
-      while (pos < bytes.length) {
-        const seqLength = Bitstring.getUtf8SequenceLength(bytes[pos]);
-        if (
-          seqLength === false ||
-          !Bitstring.isValidUtf8Sequence(bytes, pos, seqLength)
-        )
-          break;
-        pos += seqLength;
-      }
-
-      return pos;
-    };
-
     // Converts a binary to a list of codepoints.
     const convertBinaryToCodepoints = (binary, preDecodedText = null) => {
       const text =
@@ -140,7 +119,7 @@ const Erlang_Unicode = {
     const handleInvalidUtf8FromBinary = (invalidBinary) => {
       Bitstring.maybeSetBytesFromText(invalidBinary);
       const bytes = invalidBinary.bytes ?? new Uint8Array(0);
-      const validLength = findValidUtf8Length(bytes);
+      const validLength = Bitstring.getValidUtf8Length(bytes);
 
       const validPrefix = Bitstring.fromBytes(bytes.slice(0, validLength));
       const invalidRest = Bitstring.fromBytes(bytes.slice(validLength));
@@ -169,7 +148,7 @@ const Erlang_Unicode = {
       // Check if it's a truncated sequence
       Bitstring.maybeSetBytesFromText(invalidBinary);
       const bytes = invalidBinary.bytes ?? new Uint8Array(0);
-      const validLength = findValidUtf8Length(bytes);
+      const validLength = Bitstring.getValidUtf8Length(bytes);
       const isTruncated = Bitstring.isTruncatedUtf8Sequence(bytes, validLength);
 
       if (isTruncated) {
@@ -301,27 +280,6 @@ const Erlang_Unicode = {
   "characters_to_nfc_binary/1": (data) => {
     // Helpers
 
-    // Scans forward once to find the longest valid UTF-8 prefix.
-    // Validates UTF-8 by checking byte structure, decoding code points,
-    // and rejecting overlong encodings, surrogates, and out-of-range values.
-    // Time complexity: O(n) where n is the number of bytes.
-    const findValidUtf8Length = (bytes) => {
-      // Scan forward, validating each sequence
-      let pos = 0;
-
-      while (pos < bytes.length) {
-        const seqLength = Bitstring.getUtf8SequenceLength(bytes[pos]);
-        if (
-          seqLength === false ||
-          !Bitstring.isValidUtf8Sequence(bytes, pos, seqLength)
-        )
-          break;
-        pos += seqLength;
-      }
-
-      return pos;
-    };
-
     // Validates that rest is a list containing a binary (from invalid UTF-8).
     // Raises ArgumentError if it's a list of invalid codepoints instead.
     const validateListRest = (rest) => {
@@ -360,7 +318,7 @@ const Erlang_Unicode = {
     // Finds the UTF-8 validity boundary, normalizes the valid prefix,
     // and returns error tuple with normalized prefix and invalid remainder.
     const handleInvalidUtf8 = (bytes) => {
-      const validLength = findValidUtf8Length(bytes);
+      const validLength = Bitstring.getValidUtf8Length(bytes);
       const validPrefix = Bitstring.fromBytes(bytes.slice(0, validLength));
       const invalidRest = Bitstring.fromBytes(bytes.slice(validLength));
       const validText = Bitstring.toText(validPrefix);
@@ -555,27 +513,6 @@ const Erlang_Unicode = {
   "characters_to_nfd_binary/1": (data) => {
     // Helpers
 
-    // Scans forward once to find the longest valid UTF-8 prefix.
-    // Validates UTF-8 by checking byte structure, decoding code points,
-    // and rejecting overlong encodings, surrogates, and out-of-range values.
-    // Time complexity: O(n) where n is the number of bytes.
-    const findValidUtf8Length = (bytes) => {
-      // Scan forward, validating each sequence
-      let pos = 0;
-
-      while (pos < bytes.length) {
-        const seqLength = Bitstring.getUtf8SequenceLength(bytes[pos]);
-        if (
-          seqLength === false ||
-          !Bitstring.isValidUtf8Sequence(bytes, pos, seqLength)
-        )
-          break;
-        pos += seqLength;
-      }
-
-      return pos;
-    };
-
     // Validates that rest is a list containing a binary (from invalid UTF-8).
     // Raises ArgumentError if it's a list of invalid codepoints instead.
     const validateListRest = (rest) => {
@@ -614,7 +551,7 @@ const Erlang_Unicode = {
     // Finds the UTF-8 validity boundary, normalizes the valid prefix,
     // and returns error tuple with normalized prefix and invalid remainder.
     const handleInvalidUtf8 = (bytes) => {
-      const validLength = findValidUtf8Length(bytes);
+      const validLength = Bitstring.getValidUtf8Length(bytes);
       const validPrefix = Bitstring.fromBytes(bytes.slice(0, validLength));
       const invalidRest = Bitstring.fromBytes(bytes.slice(validLength));
       const validText = Bitstring.toText(validPrefix);
@@ -662,27 +599,6 @@ const Erlang_Unicode = {
   "characters_to_nfkc_binary/1": (data) => {
     // Helpers
 
-    // Scans forward once to find the longest valid UTF-8 prefix.
-    // Validates UTF-8 by checking byte structure, decoding code points,
-    // and rejecting overlong encodings, surrogates, and out-of-range values.
-    // Time complexity: O(n) where n is the number of bytes.
-    const findValidUtf8Length = (bytes) => {
-      // scan forward, validating each sequence
-      let pos = 0;
-
-      while (pos < bytes.length) {
-        const seqLength = Bitstring.getUtf8SequenceLength(bytes[pos]);
-        if (
-          seqLength === false ||
-          !Bitstring.isValidUtf8Sequence(bytes, pos, seqLength)
-        )
-          break;
-        pos += seqLength;
-      }
-
-      return pos;
-    };
-
     // Validates that rest is a list containing a binary (from invalid UTF-8).
     // Raises ArgumentError if it's a list of invalid codepoints instead.
     const validateListRest = (rest) => {
@@ -721,7 +637,7 @@ const Erlang_Unicode = {
     // Finds the UTF-8 validity boundary, normalizes the valid prefix,
     // and returns error tuple with normalized prefix and invalid remainder.
     const handleInvalidUtf8 = (bytes) => {
-      const validLength = findValidUtf8Length(bytes);
+      const validLength = Bitstring.getValidUtf8Length(bytes);
       const validPrefix = Bitstring.fromBytes(bytes.slice(0, validLength));
       const invalidRest = Bitstring.fromBytes(bytes.slice(validLength));
       const validText = Bitstring.toText(validPrefix);
@@ -768,26 +684,6 @@ const Erlang_Unicode = {
   "characters_to_nfkd_binary/1": (data) => {
     // Helpers
 
-    // Scans forward once to find the longest valid UTF-8 prefix.
-    // Validates UTF-8 by checking byte structure, decoding code points,
-    // and rejecting overlong encodings, surrogates, and out-of-range values.
-    // Time complexity: O(n) where n is the number of bytes.
-    const findValidUtf8Length = (bytes) => {
-      // Scan forward, validating each sequence
-      let pos = 0;
-      while (pos < bytes.length) {
-        const seqLength = Bitstring.getUtf8SequenceLength(bytes[pos]);
-        if (
-          seqLength === false ||
-          !Bitstring.isValidUtf8Sequence(bytes, pos, seqLength)
-        )
-          break;
-        pos += seqLength;
-      }
-
-      return pos;
-    };
-
     // Validates that rest is a list containing a binary (from invalid UTF-8).
     // Raises ArgumentError if it's a list of invalid codepoints instead.
     const validateListRest = (rest) => {
@@ -826,7 +722,7 @@ const Erlang_Unicode = {
     // Finds the UTF-8 validity boundary, normalizes the valid prefix,
     // and returns error tuple with normalized prefix and invalid remainder.
     const handleInvalidUtf8 = (bytes) => {
-      const validLength = findValidUtf8Length(bytes);
+      const validLength = Bitstring.getValidUtf8Length(bytes);
       const validPrefix = Bitstring.fromBytes(bytes.slice(0, validLength));
       const invalidRest = Bitstring.fromBytes(bytes.slice(validLength));
       const validText = Bitstring.toText(validPrefix);
diff --git a/test/javascript/bitstring_test.mjs b/test/javascript/bitstring_test.mjs
index dd371b89b..1bffe47ff 100644
--- a/test/javascript/bitstring_test.mjs
+++ b/test/javascript/bitstring_test.mjs
@@ -5087,6 +5087,132 @@ describe("Bitstring", () => {
     assert.deepStrictEqual(result, expected);
   });
 
+  describe("getUtf8SequenceLength()", () => {
+    it("returns 1 for 0x41 (ASCII)", () => {
+      assert.equal(Bitstring.getUtf8SequenceLength(0x41), 1);
+    });
+
+    it("returns 2 for 0xC2 (2-byte leader)", () => {
+      assert.equal(Bitstring.getUtf8SequenceLength(0xc2), 2);
+    });
+
+    it("returns 3 for 0xE0 (3-byte leader)", () => {
+      assert.equal(Bitstring.getUtf8SequenceLength(0xe0), 3);
+    });
+
+    it("returns 4 for 0xF0 (4-byte leader)", () => {
+      assert.equal(Bitstring.getUtf8SequenceLength(0xf0), 4);
+    });
+
+    it("returns false for 0xC0 (invalid: overlong encoding)", () => {
+      assert.equal(Bitstring.getUtf8SequenceLength(0xc0), false);
+    });
+
+    it("returns false for 0xF5 (invalid: > U+10FFFF)", () => {
+      assert.equal(Bitstring.getUtf8SequenceLength(0xf5), false);
+    });
+
+    it("returns false for 0x80 (invalid: continuation byte)", () => {
+      assert.equal(Bitstring.getUtf8SequenceLength(0x80), false);
+    });
+  });
+
+  describe("getValidUtf8Length()", () => {
+    it("empty bytes array", () => {
+      const bytes = new Uint8Array([]);
+      assert.equal(Bitstring.getValidUtf8Length(bytes), 0);
+    });
+
+    it("single valid ASCII byte", () => {
+      // 'A' (0x41): 0xxxxxxx pattern
+      const bytes = new Uint8Array([0x41]);
+      assert.equal(Bitstring.getValidUtf8Length(bytes), 1);
+    });
+
+    it("single valid 2-byte sequence", () => {
+      // é (U+00E9): 0xC3 0xA9
+      const bytes = new Uint8Array([0xc3, 0xa9]);
+      assert.equal(Bitstring.getValidUtf8Length(bytes), 2);
+    });
+
+    it("single valid 3-byte sequence", () => {
+      // € (U+20AC): 0xE2 0x82 0xAC
+      const bytes = new Uint8Array([0xe2, 0x82, 0xac]);
+      assert.equal(Bitstring.getValidUtf8Length(bytes), 3);
+    });
+
+    it("single valid 4-byte sequence", () => {
+      // 𐍈 (U+10348): 0xF0 0x90 0x8D 0x88
+      const bytes = new Uint8Array([0xf0, 0x90, 0x8d, 0x88]);
+      assert.equal(Bitstring.getValidUtf8Length(bytes), 4);
+    });
+
+    it("multiple consecutive valid sequences", () => {
+      // 'A' (1 byte) + é (2 bytes) + € (3 bytes)
+      const bytes = new Uint8Array([
+        0x41, // ASCII 'A'
+        0xc3,
+        0xa9, // é
+        0xe2,
+        0x82,
+        0xac, // €
+      ]);
+      assert.equal(Bitstring.getValidUtf8Length(bytes), 6);
+    });
+
+    it("invalid leader byte (0xC0)", () => {
+      // 0xC0 is invalid (overlong encoding)
+      const bytes = new Uint8Array([0xc0, 0xa9]);
+      assert.equal(Bitstring.getValidUtf8Length(bytes), 0);
+    });
+
+    it("invalid leader byte (>= 0xF5)", () => {
+      // 0xF5+ is invalid (beyond Unicode range)
+      const bytes = new Uint8Array([0xf5, 0x90, 0x80, 0x80]);
+      assert.equal(Bitstring.getValidUtf8Length(bytes), 0);
+    });
+
+    it("invalid continuation byte", () => {
+      // 0xC3 starts 2-byte sequence, but 0x41 (ASCII) is not a valid continuation
+      const bytes = new Uint8Array([0xc3, 0x41]);
+      assert.equal(Bitstring.getValidUtf8Length(bytes), 0);
+    });
+
+    it("truncated 2-byte sequence", () => {
+      // 0xC3 expects 1 continuation byte, but we only have the leader
+      const bytes = new Uint8Array([0xc3]);
+      assert.equal(Bitstring.getValidUtf8Length(bytes), 0);
+    });
+
+    it("truncated 3-byte sequence with one byte missing", () => {
+      // 0xE2 0x82 expects one more continuation byte
+      const bytes = new Uint8Array([0xe2, 0x82]);
+      assert.equal(Bitstring.getValidUtf8Length(bytes), 0);
+    });
+
+    it("valid sequences followed by invalid byte", () => {
+      // 'A' (1 byte) + é (2 bytes) + invalid 0xC0
+      const bytes = new Uint8Array([
+        0x41, // ASCII 'A'
+        0xc3,
+        0xa9, // é
+        0xc0, // Invalid
+      ]);
+      assert.equal(Bitstring.getValidUtf8Length(bytes), 3);
+    });
+
+    it("valid sequences followed by truncated sequence", () => {
+      // 'A' (1 byte) + é (2 bytes) + incomplete 2-byte start
+      const bytes = new Uint8Array([
+        0x41, // ASCII 'A'
+        0xc3,
+        0xa9, // é
+        0xc3, // Incomplete 2-byte
+      ]);
+      assert.equal(Bitstring.getValidUtf8Length(bytes), 3);
+    });
+  });
+
   describe("isEmpty()", () => {
     describe("empty", () => {
       it("with text field", () => {
@@ -7596,34 +7722,4 @@ describe("Bitstring", () => {
       });
     });
   });
-
-  describe("getUtf8SequenceLength()", () => {
-    it("returns 1 for 0x41 (ASCII)", () => {
-      assert.equal(Bitstring.getUtf8SequenceLength(0x41), 1);
-    });
-
-    it("returns 2 for 0xC2 (2-byte leader)", () => {
-      assert.equal(Bitstring.getUtf8SequenceLength(0xc2), 2);
-    });
-
-    it("returns 3 for 0xE0 (3-byte leader)", () => {
-      assert.equal(Bitstring.getUtf8SequenceLength(0xe0), 3);
-    });
-
-    it("returns 4 for 0xF0 (4-byte leader)", () => {
-      assert.equal(Bitstring.getUtf8SequenceLength(0xf0), 4);
-    });
-
-    it("returns false for 0xC0 (invalid: overlong encoding)", () => {
-      assert.equal(Bitstring.getUtf8SequenceLength(0xc0), false);
-    });
-
-    it("returns false for 0xF5 (invalid: > U+10FFFF)", () => {
-      assert.equal(Bitstring.getUtf8SequenceLength(0xf5), false);
-    });
-
-    it("returns false for 0x80 (invalid: continuation byte)", () => {
-      assert.equal(Bitstring.getUtf8SequenceLength(0x80), false);
-    });
-  });
 });

From 8d3af8d6cc6b7e95d18b149766ab6e8022c47fea Mon Sep 17 00:00:00 2001
From: Michael Ward <michael@scripthungry.com>
Date: Thu, 19 Feb 2026 01:09:35 +0000
Subject: [PATCH 7/7] Extracts bitstring-to-codepoint-array conversion to the
 Bitstring class

---
 assets/js/bitstring.mjs            |  9 ++++
 assets/js/erlang/unicode.mjs       | 25 +++-------
 test/javascript/bitstring_test.mjs | 79 ++++++++++++++++++++++++++++++
 3 files changed, 95 insertions(+), 18 deletions(-)

diff --git a/assets/js/bitstring.mjs b/assets/js/bitstring.mjs
index 72d37b5a3..58da3f58c 100644
--- a/assets/js/bitstring.mjs
+++ b/assets/js/bitstring.mjs
@@ -829,6 +829,15 @@ export default class Bitstring {
     };
   }
 
+  // Converts a bitstring to a plain array (not wrapped in Type.list) of
+  // Type.integer codepoints, decoding its text from bytes first if unset.
+  static toCodepointArray(bitstring) {
+    $.maybeSetTextFromBytes(bitstring);
+    return Array.from(bitstring.text, (char) =>
+      Type.integer(char.codePointAt(0)),
+    );
+  }
+
   static toCodepoints(bitstring) {
     $.maybeSetTextFromBytes(bitstring);
     return Type.charlist(bitstring.text);
diff --git a/assets/js/erlang/unicode.mjs b/assets/js/erlang/unicode.mjs
index 15424f5ab..f8e651c95 100644
--- a/assets/js/erlang/unicode.mjs
+++ b/assets/js/erlang/unicode.mjs
@@ -90,14 +90,6 @@ const Erlang_Unicode = {
   "characters_to_list/1": (data) => {
     // Helpers
 
-    // Converts a binary to a list of codepoints.
-    const convertBinaryToCodepoints = (binary, preDecodedText = null) => {
-      const text =
-        preDecodedText !== null ? preDecodedText : Bitstring.toText(binary);
-
-      return Array.from(text).map((char) => Type.integer(char.codePointAt(0)));
-    };
-
     // Converts a single codepoint integer to a UTF-8 encoded binary.
     const convertCodepointToBinary = (codepoint) => {
       const segment = Type.bitstringSegment(codepoint, {type: "utf8"});
@@ -125,7 +117,7 @@ const Erlang_Unicode = {
       const invalidRest = Bitstring.fromBytes(bytes.slice(validLength));
 
       const codepoints =
-        validLength > 0 ? convertBinaryToCodepoints(validPrefix) : [];
+        validLength > 0 ? Bitstring.toCodepointArray(validPrefix) : [];
 
       const isTruncated = Bitstring.isTruncatedUtf8Sequence(bytes, validLength);
 
@@ -142,7 +134,7 @@ const Erlang_Unicode = {
       // Convert all valid chunks to codepoints
       const codepoints =
         chunks.length > 0
-          ? convertBinaryToCodepoints(Bitstring.concat(chunks))
+          ? Bitstring.toCodepointArray(Bitstring.concat(chunks))
           : [];
 
       // Check if it's a truncated sequence
@@ -171,7 +163,7 @@ const Erlang_Unicode = {
     ) => {
       const codepoints =
         chunks.length > 0
-          ? convertBinaryToCodepoints(Bitstring.concat(chunks))
+          ? Bitstring.toCodepointArray(Bitstring.concat(chunks))
           : [];
 
       // Build the rest list with invalid code point and remaining elements
@@ -237,7 +229,7 @@ const Erlang_Unicode = {
         return handleInvalidUtf8FromBinary(data);
       }
 
-      const codepoints = convertBinaryToCodepoints(data, text);
+      const codepoints = Bitstring.toCodepointArray(data);
 
       return Type.list(codepoints);
     }
@@ -269,7 +261,7 @@ const Erlang_Unicode = {
     }
 
     const binary = Bitstring.concat(chunks);
-    const codepoints = convertBinaryToCodepoints(binary);
+    const codepoints = Bitstring.toCodepointArray(binary);
 
     return Type.list(codepoints);
   },
@@ -372,12 +364,9 @@ const Erlang_Unicode = {
       binary,
       preDecodedText = null,
     ) => {
-      const text =
-        preDecodedText !== null ? preDecodedText : Bitstring.toText(binary);
-
+      const text = preDecodedText ?? binary.text ?? Bitstring.toText(binary);
       const normalized = text.normalize("NFC");
-
-      return Array.from(normalized).map((char) =>
+      return Array.from(normalized, (char) =>
         Type.integer(char.codePointAt(0)),
       );
     };
diff --git a/test/javascript/bitstring_test.mjs b/test/javascript/bitstring_test.mjs
index 1bffe47ff..15c7e3a21 100644
--- a/test/javascript/bitstring_test.mjs
+++ b/test/javascript/bitstring_test.mjs
@@ -6242,6 +6242,85 @@ describe("Bitstring", () => {
     });
   });
 
+  describe("toCodepointArray()", () => {
+    it("single codepoint (1 byte)", () => {
+      const bitstring = Type.bitstring("A");
+      const result = Bitstring.toCodepointArray(bitstring);
+      const expected = [Type.integer(65)];
+
+      assert.deepStrictEqual(result, expected);
+    });
+
+    it("single codepoint (2 bytes)", () => {
+      const bitstring = Type.bitstring("£");
+      const result = Bitstring.toCodepointArray(bitstring);
+      const expected = [Type.integer(163)];
+
+      assert.deepStrictEqual(result, expected);
+    });
+
+    it("single codepoint (3 bytes)", () => {
+      const bitstring = Type.bitstring("€");
+      const result = Bitstring.toCodepointArray(bitstring);
+      const expected = [Type.integer(8364)];
+
+      assert.deepStrictEqual(result, expected);
+    });
+
+    it("single codepoint (4 bytes)", () => {
+      const bitstring = Type.bitstring("𐍈");
+      const result = Bitstring.toCodepointArray(bitstring);
+      const expected = [Type.integer(66376)];
+
+      assert.deepStrictEqual(result, expected);
+    });
+
+    it("multiple codepoints", () => {
+      const bitstring = Type.bitstring("A£€𐍈");
+      const result = Bitstring.toCodepointArray(bitstring);
+      const expected = [
+        Type.integer(65),
+        Type.integer(163),
+        Type.integer(8364),
+        Type.integer(66376),
+      ];
+
+      assert.deepStrictEqual(result, expected);
+    });
+
+    it("empty bitstring", () => {
+      const bitstring = Type.bitstring("");
+      const result = Bitstring.toCodepointArray(bitstring);
+      const expected = [];
+
+      assert.deepStrictEqual(result, expected);
+    });
+
+    it("uses cached text when available", () => {
+      const bitstring = Type.bitstring("cached");
+      // Bitstring already has text cached from Type.bitstring()
+      const result = Bitstring.toCodepointArray(bitstring);
+      const expected = [
+        Type.integer(99),
+        Type.integer(97),
+        Type.integer(99),
+        Type.integer(104),
+        Type.integer(101),
+        Type.integer(100),
+      ];
+
+      assert.deepStrictEqual(result, expected);
+    });
+
+    it("decodes bytes and caches text when needed", () => {
+      const bitstring = Bitstring.fromBytes([97, 98, 99]);
+      const expected = [Type.integer(97), Type.integer(98), Type.integer(99)];
+      assert.deepStrictEqual(Bitstring.toCodepointArray(bitstring), expected);
+      // The decoded text must be stored back on the bitstring ("caches text").
+      assert.strictEqual(bitstring.text, "abc");
+    });
+  });
+
   describe("toCodepoints()", () => {
     describe("single codepoint", () => {
       it("$ (1 byte)", () => {