diff --git a/src/workerd/api/encoding.c++ b/src/workerd/api/encoding.c++
index 5aca3f5c1f1..a6fcbc2acd9 100644
--- a/src/workerd/api/encoding.c++
+++ b/src/workerd/api/encoding.c++
@@ -492,6 +492,37 @@ kj::Maybe<jsg::JsString> AsciiDecoder::decode(
   return js.str(result.asPtr());
 }
 
+kj::Maybe<jsg::JsString> XUserDefinedDecoder::decode(
+    jsg::Lock& js, kj::ArrayPtr<const kj::byte> buffer, bool flush) {
+  // x-user-defined encoding per WHATWG spec:
+  // https://encoding.spec.whatwg.org/#x-user-defined-decoder
+  // - 0x00-0x7F: code point = byte (ASCII identity)
+  // - 0x80-0xFF: code point = 0xF780 + (byte - 0x80) = 0xF700 + byte
+
+  // Check if we have any high bytes that need remapping
+  bool hasHighBytes =
+      !simdutf::validate_ascii(reinterpret_cast<const char*>(buffer.begin()), buffer.size());
+
+  if (!hasHighBytes) {
+    // Fast path: all ASCII bytes, identity mapping
+    return js.str(buffer);
+  }
+
+  // Slow path: at least one byte >= 0x80, need uint16_t for PUA mapping
+  auto result = kj::heapArray<uint16_t>(buffer.size());
+  for (size_t i = 0; i < buffer.size(); i++) {
+    auto byte = buffer[i];
+    if (byte < 0x80) {
+      result[i] = byte;
+    } else {
+      // Map 0x80-0xFF to U+F780-U+F7FF (Private Use Area)
+      result[i] = 0xF700 + byte;
+    }
+  }
+
+  return js.str(result.asPtr());
+}
+
 void IcuDecoder::reset() {
   bomSeen = false;
   return ucnv_reset(inner.get());
@@ -505,6 +536,9 @@ Decoder& TextDecoder::getImpl() {
     KJ_CASE_ONEOF(dec, IcuDecoder) {
       return dec;
     }
+    KJ_CASE_ONEOF(dec, XUserDefinedDecoder) {
+      return dec;
+    }
   }
   KJ_UNREACHABLE;
 }
@@ -522,15 +556,18 @@ jsg::Ref<TextDecoder> TextDecoder::constructor(jsg::Lock& js,
 
   KJ_IF_SOME(label, maybeLabel) {
     encoding = getEncodingForLabel(label);
-    JSG_REQUIRE(encoding != Encoding::Replacement && encoding != Encoding::X_User_Defined &&
-            encoding != Encoding::INVALID,
-        RangeError, errorMessage(label));
+    JSG_REQUIRE(encoding != Encoding::Replacement && encoding != Encoding::INVALID, RangeError,
+        errorMessage(label));
   }
 
   if (encoding == Encoding::Windows_1252) {
     return js.alloc<TextDecoder>(AsciiDecoder(), options);
   }
 
+  if (encoding == Encoding::X_User_Defined) {
+    return js.alloc<TextDecoder>(XUserDefinedDecoder(), options);
+  }
+
   return js.alloc<TextDecoder>(
       JSG_REQUIRE_NONNULL(IcuDecoder::create(encoding, options.fatal, options.ignoreBOM),
           RangeError, errorMessage(getEncodingId(encoding))),
@@ -559,6 +596,9 @@ kj::Maybe<jsg::JsString> TextDecoder::decodePtr(
     KJ_CASE_ONEOF(dec, IcuDecoder) {
       return dec.decode(js, buffer, flush);
     }
+    KJ_CASE_ONEOF(dec, XUserDefinedDecoder) {
+      return dec.decode(js, buffer, flush);
+    }
   }
   KJ_UNREACHABLE;
 }
diff --git a/src/workerd/api/encoding.h b/src/workerd/api/encoding.h
index a5b55c57af6..37720124a6f 100644
--- a/src/workerd/api/encoding.h
+++ b/src/workerd/api/encoding.h
@@ -94,6 +94,25 @@ class AsciiDecoder final: public Decoder {
       jsg::Lock& js, kj::ArrayPtr<const kj::byte> buffer, bool flush = false) override;
 };
 
+// Decoder implementation for x-user-defined encoding.
+// Per WHATWG spec (https://encoding.spec.whatwg.org/#x-user-defined-decoder):
+// - Bytes 0x00-0x7F map to themselves (ASCII identity)
+// - Bytes 0x80-0xFF map to U+F780 + (byte - 0x80) = U+F700 + byte
+class XUserDefinedDecoder final: public Decoder {
+ public:
+  XUserDefinedDecoder() = default;
+  XUserDefinedDecoder(XUserDefinedDecoder&&) = default;
+  XUserDefinedDecoder& operator=(XUserDefinedDecoder&&) = default;
+  KJ_DISALLOW_COPY(XUserDefinedDecoder);
+
+  Encoding getEncoding() override {
+    return Encoding::X_User_Defined;
+  }
+
+  kj::Maybe<jsg::JsString> decode(
+      jsg::Lock& js, kj::ArrayPtr<const kj::byte> buffer, bool flush = false) override;
+};
+
 // Decoder implementation that uses ICU's built-in conversion APIs.
 // ICU's decoder is fairly comprehensive, covering the full range
 // of encodings required by the Encoding specification.
@@ -138,7 +157,7 @@ class IcuDecoder final: public Decoder {
 // https://encoding.spec.whatwg.org/#interface-textdecoder
 class TextDecoder final: public jsg::Object {
  public:
-  using DecoderImpl = kj::OneOf<AsciiDecoder, IcuDecoder>;
+  using DecoderImpl = kj::OneOf<AsciiDecoder, IcuDecoder, XUserDefinedDecoder>;
 
   struct ConstructorOptions {
     bool fatal = false;
diff --git a/src/workerd/api/tests/encoding-test.js b/src/workerd/api/tests/encoding-test.js
index a5bdb47a3c4..81ac2a30f12 100644
--- a/src/workerd/api/tests/encoding-test.js
+++ b/src/workerd/api/tests/encoding-test.js
@@ -622,7 +622,7 @@ export const allTheDecoders = {
       ['unicodefeff', 'utf-16le'],
       ['utf-16', 'utf-16le'],
       ['utf-16le', 'utf-16le'],
-      ['x-user-defined', undefined],
+      ['x-user-defined', 'x-user-defined'],
       // Test that match is case-insensitive
       ['UTF-8', 'utf-8'],
       ['UtF-8', 'utf-8'],
@@ -742,3 +742,65 @@ export const textDecoderStream = {
     strictEqual(enc.encoding, 'utf-8');
   },
 };
+
+// Test x-user-defined encoding per WHATWG spec
+// https://encoding.spec.whatwg.org/#x-user-defined-decoder
+export const xUserDefinedDecode = {
+  test() {
+    const decoder = new TextDecoder('x-user-defined');
+    strictEqual(decoder.encoding, 'x-user-defined');
+    strictEqual(decoder.fatal, false);
+    strictEqual(decoder.ignoreBOM, false);
+
+    // Test ASCII bytes (0x00-0x7F) - identity mapping
+    strictEqual(decoder.decode(Uint8Array.of(0x41)), 'A');
+    strictEqual(decoder.decode(Uint8Array.of(0x00)), '\u0000');
+    strictEqual(decoder.decode(Uint8Array.of(0x7f)), '\u007F');
+
+    // Test high bytes (0x80-0xFF) - map to Private Use Area U+F780-U+F7FF
+    strictEqual(decoder.decode(Uint8Array.of(0x80)), '\uF780');
+    strictEqual(decoder.decode(Uint8Array.of(0x81)), '\uF781');
+    strictEqual(decoder.decode(Uint8Array.of(0xff)), '\uF7FF');
+
+    // Test mixed sequence
+    const mixed = new Uint8Array([0x00, 0x7f, 0x80, 0x81, 0xff]);
+    strictEqual(decoder.decode(mixed), '\u0000\u007F\uF780\uF781\uF7FF');
+
+    // Test empty input
+    strictEqual(decoder.decode(new Uint8Array([])), '');
+    strictEqual(decoder.decode(), '');
+
+    // Test pure ASCII input (fast path)
+    strictEqual(
+      decoder.decode(new Uint8Array([0x48, 0x65, 0x6c, 0x6c, 0x6f])),
+      'Hello'
+    );
+
+    // Test streaming (x-user-defined is single-byte, streaming is trivial)
+    const streamDecoder = new TextDecoder('x-user-defined');
+    let result = '';
+    result += streamDecoder.decode(Uint8Array.of(0x41), { stream: true });
+    result += streamDecoder.decode(Uint8Array.of(0x80), { stream: true });
+    result += streamDecoder.decode(Uint8Array.of(0xff), { stream: true });
+    result += streamDecoder.decode();
+    strictEqual(result, 'A\uF780\uF7FF');
+  },
+};
+
+// Test x-user-defined with fatal option (all 256 bytes are valid)
+export const xUserDefinedFatal = {
+  test() {
+    const decoder = new TextDecoder('x-user-defined', { fatal: true });
+    strictEqual(decoder.fatal, true);
+
+    // All 256 byte values are valid, fatal mode should never throw
+    for (let byte = 0; byte < 256; byte++) {
+      const decoded = decoder.decode(Uint8Array.of(byte));
+      if (byte < 0x80) {
+        strictEqual(decoded.codePointAt(0), byte);
+      } else {
+        strictEqual(decoded.codePointAt(0), 0xf700 + byte);
+      }
+    }
+  },
+};
diff --git a/src/wpt/encoding-test.ts b/src/wpt/encoding-test.ts
index 48aa67a3503..b73f7d43660 100644
--- a/src/wpt/encoding-test.ts
+++ b/src/wpt/encoding-test.ts
@@ -527,18 +527,10 @@ export default {
   'textdecoder-fatal-streaming.any.js': {},
   'textdecoder-fatal.any.js': {},
   'textdecoder-ignorebom.any.js': {},
-  'textdecoder-labels.any.js': {
-    comment: 'x-user-defined is rejected at TextDecoder construction time',
-    expectedFailures: ['x-user-defined => x-user-defined'],
-  },
+  'textdecoder-labels.any.js': {},
   'textdecoder-streaming.any.js': {},
   'textdecoder-utf16-surrogates.any.js': {},
-  'textencoder-constructor-non-utf.any.js': {
-    comment: 'x-user-defined is rejected at TextDecoder construction time',
-    expectedFailures: [
-      'Encoding argument supported for decode: x-user-defined',
-    ],
-  },
+  'textencoder-constructor-non-utf.any.js': {},
   'textencoder-utf16-surrogates.any.js': {},
   'unsupported-encodings.any.js': {
     comment: 'XMLHttpRequest is not defined',