Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 43 additions & 3 deletions src/workerd/api/encoding.c++
Original file line number Diff line number Diff line change
Expand Up @@ -492,6 +492,37 @@ kj::Maybe<jsg::JsString> AsciiDecoder::decode(
return js.str(result.asPtr());
}

kj::Maybe<jsg::JsString> XUserDefinedDecoder::decode(
    jsg::Lock& js, kj::ArrayPtr<const kj::byte> buffer, bool flush) {
  // Implements the WHATWG x-user-defined decoder
  // (https://encoding.spec.whatwg.org/#x-user-defined-decoder):
  //   byte 0x00-0x7F -> the same code point (plain ASCII)
  //   byte 0x80-0xFF -> U+F780 + (byte - 0x80), which equals U+F700 + byte
  // Each byte is translated on its own, so `flush` is not consulted here.

  const auto* rawChars = reinterpret_cast<const char*>(buffer.begin());
  const size_t length = buffer.size();

  if (simdutf::validate_ascii(rawChars, length)) {
    // Nothing at or above 0x80: the bytes are handed to js.str() unchanged.
    return js.str(buffer);
  }

  // At least one byte is >= 0x80. The remapped values do not fit in a single
  // byte, so the output is built from 16-bit code units.
  auto codeUnits = kj::heapArray<uint16_t>(length);
  size_t outIndex = 0;
  for (const kj::byte value: buffer) {
    // ASCII stays as-is; 0x80-0xFF lands in the Private Use Area U+F780-U+F7FF.
    codeUnits[outIndex++] = value < 0x80 ? value : 0xF700 + value;
  }

  return js.str(codeUnits.asPtr());
}

void IcuDecoder::reset() {
bomSeen = false;
return ucnv_reset(inner.get());
Expand All @@ -505,6 +536,9 @@ Decoder& TextDecoder::getImpl() {
KJ_CASE_ONEOF(dec, IcuDecoder) {
return dec;
}
KJ_CASE_ONEOF(dec, XUserDefinedDecoder) {
return dec;
}
}
KJ_UNREACHABLE;
}
Expand All @@ -522,15 +556,18 @@ jsg::Ref<TextDecoder> TextDecoder::constructor(jsg::Lock& js,

KJ_IF_SOME(label, maybeLabel) {
encoding = getEncodingForLabel(label);
JSG_REQUIRE(encoding != Encoding::Replacement && encoding != Encoding::X_User_Defined &&
encoding != Encoding::INVALID,
RangeError, errorMessage(label));
JSG_REQUIRE(encoding != Encoding::Replacement && encoding != Encoding::INVALID, RangeError,
errorMessage(label));
}

if (encoding == Encoding::Windows_1252) {
return js.alloc<TextDecoder>(AsciiDecoder(), options);
}

if (encoding == Encoding::X_User_Defined) {
return js.alloc<TextDecoder>(XUserDefinedDecoder(), options);
}

return js.alloc<TextDecoder>(
JSG_REQUIRE_NONNULL(IcuDecoder::create(encoding, options.fatal, options.ignoreBOM),
RangeError, errorMessage(getEncodingId(encoding))),
Expand Down Expand Up @@ -559,6 +596,9 @@ kj::Maybe<jsg::JsString> TextDecoder::decodePtr(
KJ_CASE_ONEOF(dec, IcuDecoder) {
return dec.decode(js, buffer, flush);
}
KJ_CASE_ONEOF(dec, XUserDefinedDecoder) {
return dec.decode(js, buffer, flush);
}
}
KJ_UNREACHABLE;
}
Expand Down
21 changes: 20 additions & 1 deletion src/workerd/api/encoding.h
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,25 @@ class AsciiDecoder final: public Decoder {
jsg::Lock& js, kj::ArrayPtr<const kj::byte> buffer, bool flush = false) override;
};

// Decoder for the "x-user-defined" encoding described by the WHATWG Encoding
// Standard (https://encoding.spec.whatwg.org/#x-user-defined-decoder).
//
// Byte-to-code-point mapping:
//   0x00-0x7F  ->  the byte value itself (ASCII identity)
//   0x80-0xFF  ->  U+F780 + (byte - 0x80), i.e. U+F700 + byte
//
// Instances are default-constructible and movable, but not copyable.
class XUserDefinedDecoder final: public Decoder {
 public:
  XUserDefinedDecoder() = default;
  KJ_DISALLOW_COPY(XUserDefinedDecoder);
  XUserDefinedDecoder(XUserDefinedDecoder&&) = default;
  XUserDefinedDecoder& operator=(XUserDefinedDecoder&&) = default;

  // Converts the bytes in `buffer` to a JS string using the mapping above.
  kj::Maybe<jsg::JsString> decode(
      jsg::Lock& js, kj::ArrayPtr<const kj::byte> buffer, bool flush = false) override;

  // Reports the encoding handled by this decoder.
  Encoding getEncoding() override {
    return Encoding::X_User_Defined;
  }
};

// Decoder implementation that uses ICU's built-in conversion APIs.
// ICU's decoder is fairly comprehensive, covering the full range
// of encodings required by the Encoding specification.
Expand Down Expand Up @@ -138,7 +157,7 @@ class IcuDecoder final: public Decoder {
// https://encoding.spec.whatwg.org/#interface-textdecoder
class TextDecoder final: public jsg::Object {
public:
using DecoderImpl = kj::OneOf<AsciiDecoder, IcuDecoder>;
using DecoderImpl = kj::OneOf<AsciiDecoder, IcuDecoder, XUserDefinedDecoder>;

struct ConstructorOptions {
bool fatal = false;
Expand Down
64 changes: 63 additions & 1 deletion src/workerd/api/tests/encoding-test.js
Original file line number Diff line number Diff line change
Expand Up @@ -622,7 +622,7 @@ export const allTheDecoders = {
['unicodefeff', 'utf-16le'],
['utf-16', 'utf-16le'],
['utf-16le', 'utf-16le'],
['x-user-defined', undefined],
['x-user-defined', 'x-user-defined'],
// Test that match is case-insensitive
['UTF-8', 'utf-8'],
['UtF-8', 'utf-8'],
Expand Down Expand Up @@ -742,3 +742,65 @@ export const textDecoderStream = {
strictEqual(enc.encoding, 'utf-8');
},
};

// Exercises the x-user-defined decoder against the mapping in the WHATWG spec:
// https://encoding.spec.whatwg.org/#x-user-defined-decoder
export const xUserDefinedDecode = {
  test() {
    const dec = new TextDecoder('x-user-defined');
    strictEqual(dec.encoding, 'x-user-defined');
    strictEqual(dec.fatal, false);
    strictEqual(dec.ignoreBOM, false);

    // One byte in, one code point out.
    //   0x00-0x7F decode to themselves (ASCII identity).
    //   0x80-0xFF decode into the Private Use Area, U+F780-U+F7FF.
    const singleByteCases = [
      [0x41, 'A'],
      [0x00, '\u0000'],
      [0x7f, '\u007F'],
      [0x80, '\uF780'],
      [0x81, '\uF781'],
      [0xff, '\uF7FF'],
    ];
    for (const [input, expected] of singleByteCases) {
      strictEqual(dec.decode(Uint8Array.of(input)), expected);
    }

    // A buffer mixing ASCII and high bytes.
    const mixedBytes = new Uint8Array([0x00, 0x7f, 0x80, 0x81, 0xff]);
    strictEqual(dec.decode(mixedBytes), '\u0000\u007F\uF780\uF781\uF7FF');

    // Empty input, both as an empty array and as an omitted argument.
    strictEqual(dec.decode(new Uint8Array([])), '');
    strictEqual(dec.decode(), '');

    // ASCII-only input (the decoder's fast path).
    const helloBytes = new Uint8Array([0x48, 0x65, 0x6c, 0x6c, 0x6f]);
    strictEqual(dec.decode(helloBytes), 'Hello');

    // Streaming: the encoding is single-byte, so chunk boundaries cannot split
    // a character. Feed one byte per call and finish with a flushing decode().
    const streaming = new TextDecoder('x-user-defined');
    let assembled = '';
    for (const chunk of [0x41, 0x80, 0xff]) {
      assembled += streaming.decode(Uint8Array.of(chunk), { stream: true });
    }
    assembled += streaming.decode();
    strictEqual(assembled, 'A\uF780\uF7FF');
  },
};

// In fatal mode the x-user-defined decoder must still accept every byte,
// because all 256 byte values have a defined mapping.
export const xUserDefinedFatal = {
  test() {
    const strict = new TextDecoder('x-user-defined', { fatal: true });
    strictEqual(strict.fatal, true);

    // Walk the whole byte range; none of these decodes is expected to throw.
    for (let value = 0x00; value <= 0xff; value += 1) {
      const text = strict.decode(Uint8Array.of(value));
      // ASCII maps to itself; 0x80-0xFF maps to U+F700 + byte.
      const expectedCodePoint = value < 0x80 ? value : 0xf700 + value;
      strictEqual(text.codePointAt(0), expectedCodePoint);
    }
  },
};
12 changes: 2 additions & 10 deletions src/wpt/encoding-test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -527,18 +527,10 @@ export default {
'textdecoder-fatal-streaming.any.js': {},
'textdecoder-fatal.any.js': {},
'textdecoder-ignorebom.any.js': {},
'textdecoder-labels.any.js': {
comment: 'x-user-defined is rejected at TextDecoder construction time',
expectedFailures: ['x-user-defined => x-user-defined'],
},
'textdecoder-labels.any.js': {},
'textdecoder-streaming.any.js': {},
'textdecoder-utf16-surrogates.any.js': {},
'textencoder-constructor-non-utf.any.js': {
comment: 'x-user-defined is rejected at TextDecoder construction time',
expectedFailures: [
'Encoding argument supported for decode: x-user-defined',
],
},
'textencoder-constructor-non-utf.any.js': {},
'textencoder-utf16-surrogates.any.js': {},
'unsupported-encodings.any.js': {
comment: 'XMLHttpRequest is not defined',
Expand Down
Loading