From 202609248e28aeb90c36b4d88d2b1cf39264ba88 Mon Sep 17 00:00:00 2001 From: Joshua Jones Date: Sat, 21 Feb 2026 17:12:09 -0500 Subject: [PATCH 1/2] add test that transforms a Latin encoded message into a UTF-8 encoded message. --- transform_test.go | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/transform_test.go b/transform_test.go index 49ffc2f..4b3b850 100644 --- a/transform_test.go +++ b/transform_test.go @@ -1970,3 +1970,45 @@ func BenchmarkMapValue(b *testing.B) { _, _ = msg.Transform(MapValue("PID-5.1", upper)) } } + +func TestMapAllValuesLatin1ToUTF8(t *testing.T) { + // The raw message contains Latin-1 bytes in several PID fields: + // \xFC = ü (PID-5.1 "Müller", PID-11.3 "München") + // \xE7 = ç (PID-5.2 "François") + // \xDF = ß (PID-11.1 "Straße 5") + raw := "MSH|^~\\&|LAB|FAC|EMR|FAC2|20240101||ADT^A01|1|P|2.5.1\r" + + "PID|1||123||M\xFCller^Fran\xE7ois^J||19800101|F|||Stra\xDFe 5^^M\xFCnchen" + + msg := parseTestMessage(t, raw) + + result, err := msg.Transform(MapAllValues(latin1ToUTF8)) + if err != nil { + t.Fatalf("Transform(MapAllValues) failed: %v", err) + } + + // Verify delimiters are unchanged. + if result.Delimiters() != msg.Delimiters() { + t.Errorf("delimiters changed: got %v, want %v", result.Delimiters(), msg.Delimiters()) + } + + cases := []struct { + loc string + want string + }{ + {"MSH-9.1", "ADT"}, // ASCII component — unchanged + {"PID-3", "123"}, // ASCII field — unchanged + {"PID-5.1", "Müller"}, // \xFC → ü + {"PID-5.2", "François"}, // \xE7 → ç + {"PID-5.3", "J"}, // ASCII component inside mixed-charset field — unchanged + {"PID-7", "19800101"}, // ASCII date — unchanged + {"PID-11.1", "Straße 5"}, // \xDF → ß; space is ASCII + {"PID-11.3", "München"}, // \xFC → ü; PID-11.2 is empty + } + + for _, tc := range cases { + got := result.Get(tc.loc).String() + if got != tc.want { + t.Errorf("%s = %q, want %q", tc.loc, got, tc.want) + } + } +} From 440bbcfe1ef6d7cfe60c6df17e15bcf5ef152ac4 Mon Sep 17 00:00:00 2001 From: Joshua Jones Date: Sat, 21 Feb 2026 18:21:02 -0500 Subject: [PATCH 2/2] reworked the value mapping --- accessor.go | 16 +++-- ack.go | 16 ++--- builder_test.go | 2 +- charset.go | 24 +++---- charset_test.go | 37 ++++++++-- doc.go | 6 +- transform.go | 6 +- transform_test.go | 173 +++++++++++++++++++++++++++++++++++++--------- validate.go | 4 +- 9 files changed, 217 insertions(+), 67 deletions(-) diff --git a/accessor.go b/accessor.go index 9e67c6d..cb8e8bf 100644 --- a/accessor.go +++ b/accessor.go @@ -211,12 +211,19 @@ type Value struct { // String returns the unescaped string value. Returns an empty string for a // zero (not-found) Value. func (v Value) String() string { - return string(Unescape(v.raw, v.delims)) + return string(v.Bytes()) } -// Bytes returns the raw bytes without escape processing. Returns nil for a -// zero (not-found) Value. +// Bytes returns the unescaped bytes of this value. Equivalent to calling +// String() and converting, but avoids the string allocation for binary use. +// Returns nil for a zero (not-found) Value. func (v Value) Bytes() []byte { + return Unescape(v.raw, v.delims) +} + +// Raw returns the raw, possibly-escaped bytes of this value as they appear +// in the HL7 message (before escape processing). Returns nil for a zero Value. +func (v Value) Raw() []byte { return v.raw } @@ -250,7 +257,8 @@ func (v Value) HasValue() bool { // msg.Get("PID-3.1").String() // Patient ID // msg.Get("PID-5.1").String() // Family name // msg.Get("OBX(0)-5").String() // First OBX segment, field 5 -// msg.Get("PID-3.1").Bytes() // raw bytes without unescaping +// msg.Get("PID-3.1").Bytes() // unescaped bytes +// msg.Get("PID-3.1").Raw() // raw bytes without unescaping func (m *Message) Get(location string) Value { loc, err := ParseLocation(location) if err != nil { diff --git a/ack.go b/ack.go index 04e355c..09ce7a5 100644 --- a/ack.go +++ b/ack.go @@ -96,14 +96,14 @@ func (m *Message) Ack(code AckCode, controlID string, opts ...AckOption) ([]byte // Pre-calculate buffer size to avoid growing. // MSH: "MSH" + separator + encoding chars + 10 delimited fields // MSA: "MSA" + code + control ID + optional text - sendApp := msh.Field(5).Bytes() - sendFac := msh.Field(6).Bytes() - recvApp := msh.Field(3).Bytes() - recvFac := msh.Field(4).Bytes() - triggerEvent := msh.Field(9).Rep(0).Component(2).Bytes() - procID := msh.Field(11).Bytes() - versionID := msh.Field(12).Bytes() - origControlID := msh.Field(10).Bytes() + sendApp := msh.Field(5).Raw() + sendFac := msh.Field(6).Raw() + recvApp := msh.Field(3).Raw() + recvFac := msh.Field(4).Raw() + triggerEvent := msh.Field(9).Rep(0).Component(2).Raw() + procID := msh.Field(11).Raw() + versionID := msh.Field(12).Raw() + origControlID := msh.Field(10).Raw() // Estimate size: MSH header + fields + MSA segment. size := 3 + 1 + 4 + // "MSH" + separator + encoding chars diff --git a/builder_test.go b/builder_test.go index b313c00..44b9f10 100644 --- a/builder_test.go +++ b/builder_test.go @@ -204,7 +204,7 @@ func TestBuilderEscaping(t *testing.T) { // Raw bytes should contain escape sequences. pid := findSeg(msg, "PID") - raw := pid.Field(3).Bytes() + raw := pid.Field(3).Raw() rawStr := string(raw) for _, seq := range []string{`\F\`, `\S\`, `\R\`, `\E\`, `\T\`} { if !strings.Contains(rawStr, seq) { diff --git a/charset.go b/charset.go index 5f49f7a..dcfe91d 100644 --- a/charset.go +++ b/charset.go @@ -14,11 +14,13 @@ package hl7 -// ValueMapper converts field-content bytes to a new representation. +// ValueMapper converts a Value to a new byte representation. // It is used at read time (DecodeString) to decode stored bytes, and at // write time (MapAllValues) to transform content before storage. // -// The mapper receives post-unescape bytes and must return the transformed bytes. +// The mapper receives the full Value, allowing it to inspect IsNull(), +// IsEmpty(), and HasValue() before deciding how to convert. Call v.Bytes() +// to obtain the post-unescape bytes for byte-level conversion. // When used with DecodeString, returning an error aborts the decode. // When used with MapAllValues, returning an error aborts the transform. // @@ -27,24 +29,22 @@ package hl7 // DecodeString behaves identically to String() — no conversion is applied and // no extra allocation is incurred beyond the Unescape fast path. // -// Note: Unescape runs before the ValueMapper, so the mapper receives -// resolved byte values. Charset escape sequences (\C..\ and \M..\) are passed -// through verbatim by Unescape; a sophisticated ValueMapper may parse them, -// but a simple byte-level mapper will treat them as-is. -type ValueMapper func(data []byte) ([]byte, error) +// Note: v.Bytes() returns post-unescape bytes. Charset escape sequences +// (\C..\ and \M..\) are passed through verbatim by Unescape; a sophisticated +// ValueMapper may parse them, but a simple byte-level mapper will treat them as-is. +type ValueMapper func(v Value) ([]byte, error) // DecodeString returns the unescaped, charset-decoded string value of the -// Value. Unescape runs first; if decode is nil the unescaped bytes are cast -// to string with no further allocation. +// Value. If decode is nil the unescaped bytes are cast to string with no +// further allocation. // // DecodeString is promoted to Field, Repetition, Component, and Subcomponent // via their embedded Value. func (v Value) DecodeString(decode ValueMapper) (string, error) { - unescaped := Unescape(v.raw, v.delims) if decode == nil { - return string(unescaped), nil + return v.String(), nil } - converted, err := decode(unescaped) + converted, err := decode(v) if err != nil { return "", err } diff --git a/charset_test.go b/charset_test.go index a87e2f0..47242fd 100644 --- a/charset_test.go +++ b/charset_test.go @@ -22,7 +22,8 @@ import ( // latin1ToUTF8 is a test-only ValueMapper that converts Latin-1 (ISO 8859-1) // bytes to UTF-8. Each byte in the range 0x80–0xFF is expanded to its UTF-8 // two-byte sequence; bytes below 0x80 are ASCII and pass through unchanged. -func latin1ToUTF8(data []byte) ([]byte, error) { +func latin1ToUTF8(v Value) ([]byte, error) { + data := v.Bytes() out := make([]byte, 0, len(data)) for _, b := range data { if b < 0x80 { @@ -35,17 +36,45 @@ func latin1ToUTF8(data []byte) ([]byte, error) { return out, nil } +// utf16LEToUTF8 is a test-only ValueMapper that converts UTF-16 little-endian +// bytes to UTF-8. The input length must be even; an error is returned otherwise. +// Only BMP characters (U+0000..U+FFFF) are decoded; surrogate pairs are not needed +// for the Latin/Western European test data used here. +// Null ("") and empty values are returned unchanged to preserve HL7 semantics. +func utf16LEToUTF8(v Value) ([]byte, error) { + if !v.HasValue() { + return v.Bytes(), nil // preserve null ("") and empty unchanged + } + data := v.Bytes() + if len(data)%2 != 0 { + return nil, errors.New("utf16: odd byte count") + } + out := make([]byte, 0, len(data)) + for i := 0; i < len(data); i += 2 { + r := rune(uint16(data[i]) | uint16(data[i+1])<<8) + switch { + case r < 0x80: + out = append(out, byte(r)) + case r < 0x800: + out = append(out, byte(0xC0|(r>>6)), byte(0x80|(r&0x3F))) + default: + out = append(out, byte(0xE0|(r>>12)), byte(0x80|((r>>6)&0x3F)), byte(0x80|(r&0x3F))) + } + } + return out, nil +} + // errHighBit is returned by errorDecoder when a byte with the high bit set is seen. var errHighBit = errors.New("high bit set") // errorDecoder rejects any byte with the high bit set. -func errorDecoder(data []byte) ([]byte, error) { - for _, b := range data { +func errorDecoder(v Value) ([]byte, error) { + for _, b := range v.Bytes() { if b&0x80 != 0 { return nil, errHighBit } } - return data, nil + return v.Bytes(), nil } // --- Value.DecodeString --- diff --git a/doc.go b/doc.go index 0dbd3e1..709ddb2 100644 --- a/doc.go +++ b/doc.go @@ -46,7 +46,8 @@ // msg.Get("MSH-9.1").String() // Message code (unescaped string) // msg.Get("PID-3[1].4.2").String() // 2nd repetition of PID-3, component 4, subcomponent 2 // msg.Get("OBX(1)-5").String() // 2nd OBX segment (0-based), field 5 -// msg.Get("PID-3.1").Bytes() // raw bytes without unescaping +// msg.Get("PID-3.1").Bytes() // unescaped bytes +// msg.Get("PID-3.1").Raw() // raw bytes without unescaping // // Get returns a zero Value when the location is invalid or the value is not // present — consistent with how Field(n), Rep(n), and Component(n) return zero @@ -56,7 +57,8 @@ // // Field values in non-UTF-8 charsets (e.g. Latin-1 / ISO 8859-1 declared in // MSH-18) can be decoded with DecodeString. A ValueMapper is a -// func([]byte) ([]byte, error) that converts post-unescape bytes to UTF-8. +// func(Value) ([]byte, error) that converts a Value to UTF-8 bytes; call +// v.Bytes() inside the mapper to get the post-unescape bytes to convert. // DecodeString is available on Value, Field, Repetition, Component, and // Subcomponent: // diff --git a/transform.go b/transform.go index 513f2d4..d392fbd 100644 --- a/transform.go +++ b/transform.go @@ -1064,10 +1064,10 @@ func mapDelimited(raw []byte, sep byte, fn func([]byte) ([]byte, error)) ([]byte return out, nil } -// mapLeafValue unescapes raw, applies mapper, and re-escapes the result. +// mapLeafValue applies mapper to raw and re-escapes the result. +// The mapper receives the full Value and calls v.Bytes() for unescaped bytes. func mapLeafValue(raw []byte, delims Delimiters, mapper ValueMapper) ([]byte, error) { - unescaped := Unescape(raw, delims) - mapped, err := mapper(unescaped) + mapped, err := mapper(Value{raw: raw, delims: delims}) if err != nil { return nil, err } diff --git a/transform_test.go b/transform_test.go index 4b3b850..48ecfc2 100644 --- a/transform_test.go +++ b/transform_test.go @@ -666,7 +666,7 @@ func TestTransformDelimiterConversionReEscapes(t *testing.T) { } // Verify the raw bytes contain the escape sequence, not a bare "|". - if raw := string(result.Get("OBX-5").Bytes()); raw != "before$S$after" { + if raw := string(result.Get("OBX-5").Raw()); raw != "before$S$after" { t.Errorf("OBX-5 raw = %q, want %q", raw, "before$S$after") } } @@ -695,7 +695,7 @@ func TestTransformDelimiterConversionEscapesNewCollisions(t *testing.T) { t.Errorf("PID-5.1 = %q, want %q", got, "Do#e") } // Raw bytes should contain the escape sequence, not a bare "#". - if raw := string(result.Get("PID-5.1").Bytes()); raw != "Do$F$e" { + if raw := string(result.Get("PID-5.1").Raw()); raw != "Do$F$e" { t.Errorf("PID-5.1 raw = %q, want %q", raw, "Do$F$e") } } @@ -1581,7 +1581,7 @@ func TestMapAllValuesIdentity(t *testing.T) { raw := "MSH|^~\\&|S|F|R|RF|20240101||ADT^A01|1|P|2.5.1\rPID|1||123||Doe^John||19800101" msg := parseTestMessage(t, raw) - identity := func(b []byte) ([]byte, error) { return b, nil } + identity := func(v Value) ([]byte, error) { return v.Bytes(), nil } result, err := msg.Transform(MapAllValues(identity)) if err != nil { @@ -1605,7 +1605,7 @@ func TestMapAllValuesIdentity(t *testing.T) { func TestMapAllValuesUppercase(t *testing.T) { msg := parseTestMessage(t, "MSH|^~\\&|S|F|R|RF|20240101||ADT^A01|1|P|2.5.1\rPID|1||123||doe^john") - upper := func(b []byte) ([]byte, error) { return bytes.ToUpper(b), nil } + upper := func(v Value) ([]byte, error) { return bytes.ToUpper(v.Bytes()), nil } result, err := msg.Transform(MapAllValues(upper)) if err != nil { @@ -1637,9 +1637,9 @@ func TestMapAllValuesNull(t *testing.T) { msg := parseTestMessage(t, "MSH|^~\\&|S|F|R|RF|20240101||ADT^A01|1|P|2.5.1\rPID|1||\"\"|") callCount := 0 - capture := func(b []byte) ([]byte, error) { + capture := func(v Value) ([]byte, error) { callCount++ - return b, nil // pass through + return v.Bytes(), nil // pass through } result, err := msg.Transform(MapAllValues(capture)) @@ -1661,11 +1661,11 @@ func TestMapAllValuesNullReplaced(t *testing.T) { // Mapper returns non-empty for a null ("") leaf — value should be stored. msg := parseTestMessage(t, "MSH|^~\\&|S|F|R|RF|20240101||ADT^A01|1|P|2.5.1\rPID|1||\"\"") - replaceNull := func(b []byte) ([]byte, error) { - if bytes.Equal(b, []byte(`""`)) { + replaceNull := func(v Value) ([]byte, error) { + if v.IsNull() { return []byte("REPLACED"), nil } - return b, nil + return v.Bytes(), nil } result, err := msg.Transform(MapAllValues(replaceNull)) @@ -1681,11 +1681,11 @@ func TestMapAllValuesEmpty(t *testing.T) { // Mapper IS called with empty bytes; it can return a value. msg := parseTestMessage(t, "MSH|^~\\&|S|F|R|RF|20240101||ADT^A01|1|P|2.5.1\rPID|1|||5678") - replaceEmpty := func(b []byte) ([]byte, error) { - if len(b) == 0 { + replaceEmpty := func(v Value) ([]byte, error) { + if v.IsEmpty() { return []byte("FILLED"), nil } - return b, nil + return v.Bytes(), nil } result, err := msg.Transform(MapAllValues(replaceEmpty)) @@ -1705,7 +1705,7 @@ func TestMapAllValuesEmpty(t *testing.T) { func TestMapAllValuesMSHDelimFields(t *testing.T) { msg := parseTestMessage(t, "MSH|^~\\&|S|F|R|RF|20240101||ADT^A01|1|P|2.5.1\rPID|1||123") - upper := func(b []byte) ([]byte, error) { return bytes.ToUpper(b), nil } + upper := func(v Value) ([]byte, error) { return bytes.ToUpper(v.Bytes()), nil } result, err := msg.Transform(MapAllValues(upper)) if err != nil { @@ -1727,7 +1727,7 @@ func TestMapAllValuesSubcomponents(t *testing.T) { msg := parseTestMessage(t, "MSH|^~\\&|S|F|R|RF|20240101||ADT^A01|1|P|2.5.1\rPID|1||a&b") - upper := func(b []byte) ([]byte, error) { return bytes.ToUpper(b), nil } + upper := func(v Value) ([]byte, error) { return bytes.ToUpper(v.Bytes()), nil } result, err := msg.Transform(MapAllValues(upper)) if err != nil { @@ -1748,7 +1748,7 @@ func TestMapAllValuesSubcomponentsPartialChange(t *testing.T) { // Exercises the lazy-init separator logic when i > 0 is the first change. msg := parseTestMessage(t, "MSH|^~\\&|S|F|R|RF|20240101||ADT^A01|1|P|2.5.1\rPID|1||A&b") - upper := func(b []byte) ([]byte, error) { return bytes.ToUpper(b), nil } + upper := func(v Value) ([]byte, error) { return bytes.ToUpper(v.Bytes()), nil } result, err := msg.Transform(MapAllValues(upper)) if err != nil { @@ -1767,7 +1767,7 @@ func TestMapAllValuesRepetitions(t *testing.T) { // Field val1~val2 — mapper applied to each repetition independently. msg := parseTestMessage(t, "MSH|^~\\&|S|F|R|RF|20240101||ADT^A01|1|P|2.5.1\rPID|1||rep1~rep2") - upper := func(b []byte) ([]byte, error) { return bytes.ToUpper(b), nil } + upper := func(v Value) ([]byte, error) { return bytes.ToUpper(v.Bytes()), nil } result, err := msg.Transform(MapAllValues(upper)) if err != nil { @@ -1787,7 +1787,7 @@ func TestMapAllValuesEscapeRoundTrip(t *testing.T) { // Mapper receives unescaped "|"; output is re-escaped before storage. msg := parseTestMessage(t, "MSH|^~\\&|S|F|R|RF|20240101||ADT^A01|1|P|2.5.1\rPID|1||before\\F\\after") - identity := func(b []byte) ([]byte, error) { return b, nil } + identity := func(v Value) ([]byte, error) { return v.Bytes(), nil } result, err := msg.Transform(MapAllValues(identity)) if err != nil { @@ -1799,7 +1799,7 @@ func TestMapAllValuesEscapeRoundTrip(t *testing.T) { t.Errorf("PID-3 = %q, want %q", got, "before|after") } // Raw bytes should contain the re-escaped sequence. - if raw := result.Get("PID-3").Bytes(); !bytes.Contains(raw, []byte(`\F\`)) { + if raw := result.Get("PID-3").Raw(); !bytes.Contains(raw, []byte(`\F\`)) { t.Errorf("PID-3 raw %q should contain \\F\\", raw) } } @@ -1808,11 +1808,11 @@ func TestMapAllValuesError(t *testing.T) { msg := parseTestMessage(t, "MSH|^~\\&|S|F|R|RF|20240101||ADT^A01|1|P|2.5.1\rPID|1||123") sentinel := errors.New("mapper error") - errMapper := func(b []byte) ([]byte, error) { - if len(b) > 0 { + errMapper := func(v Value) ([]byte, error) { + if v.HasValue() { return nil, sentinel } - return b, nil + return v.Bytes(), nil } _, err := msg.Transform(MapAllValues(errMapper)) @@ -1825,7 +1825,7 @@ func TestMapAllValuesMixedChanges(t *testing.T) { // MapAllValues and Replace in the same Transform call. msg := parseTestMessage(t, "MSH|^~\\&|S|F|R|RF|20240101||ADT^A01|1|P|2.5.1\rPID|1||123||doe^john") - upper := func(b []byte) ([]byte, error) { return bytes.ToUpper(b), nil } + upper := func(v Value) ([]byte, error) { return bytes.ToUpper(v.Bytes()), nil } result, err := msg.Transform( MapAllValues(upper), @@ -1855,7 +1855,7 @@ func BenchmarkMapAllValues(b *testing.B) { b.Fatalf("ParseMessage failed: %v", err) } - identity := func(b []byte) ([]byte, error) { return b, nil } + identity := func(v Value) ([]byte, error) { return v.Bytes(), nil } b.ResetTimer() b.ReportAllocs() @@ -1869,7 +1869,7 @@ func BenchmarkMapAllValues(b *testing.B) { func TestMapValue(t *testing.T) { msg := parseTestMessage(t, "MSH|^~\\&|S|F|R|RF|20240101||ADT^A01|1|P|2.5.1\rPID|1||123||Doe^John") - upper := func(b []byte) ([]byte, error) { return bytes.ToUpper(b), nil } + upper := func(v Value) ([]byte, error) { return bytes.ToUpper(v.Bytes()), nil } result, err := msg.Transform(MapValue("PID-5.1", upper)) if err != nil { @@ -1895,11 +1895,11 @@ func TestMapValueAbsentLocation(t *testing.T) { // non-nil, a new field should be created. msg := parseTestMessage(t, "MSH|^~\\&|S|F|R|RF|20240101||ADT^A01|1|P|2.5.1\rPID|1||123") - fill := func(b []byte) ([]byte, error) { - if b == nil || len(b) == 0 { + fill := func(v Value) ([]byte, error) { + if v.IsEmpty() { return []byte("CREATED"), nil } - return b, nil + return v.Bytes(), nil } result, err := msg.Transform(MapValue("PID-19", fill)) @@ -1915,7 +1915,7 @@ func TestMapValueAbsentLocation(t *testing.T) { func TestMapValueInvalidLocation(t *testing.T) { msg := parseTestMessage(t, "MSH|^~\\&|S|F|R|RF|20240101||ADT^A01|1|P|2.5.1") - identity := func(b []byte) ([]byte, error) { return b, nil } + identity := func(v Value) ([]byte, error) { return v.Bytes(), nil } _, err := msg.Transform(MapValue("invalid", identity)) if !errors.Is(err, ErrInvalidLocation) { @@ -1927,7 +1927,7 @@ func TestMapValueError(t *testing.T) { msg := parseTestMessage(t, "MSH|^~\\&|S|F|R|RF|20240101||ADT^A01|1|P|2.5.1\rPID|1||123") sentinel := errors.New("map error") - errMapper := func(b []byte) ([]byte, error) { return nil, sentinel } + errMapper := func(v Value) ([]byte, error) { return nil, sentinel } _, err := msg.Transform(MapValue("PID-3", errMapper)) if !errors.Is(err, sentinel) { @@ -1938,7 +1938,7 @@ func TestMapValueError(t *testing.T) { func TestMapValueMSHDelimReject(t *testing.T) { msg := parseTestMessage(t, "MSH|^~\\&|S|F|R|RF|20240101||ADT^A01|1|P|2.5.1") - identity := func(b []byte) ([]byte, error) { return b, nil } + identity := func(v Value) ([]byte, error) { return v.Bytes(), nil } _, err := msg.Transform(MapValue("MSH-1", identity)) if !errors.Is(err, ErrMSHDelimiterField) { @@ -1962,7 +1962,7 @@ func BenchmarkMapValue(b *testing.B) { b.Fatalf("ParseMessage failed: %v", err) } - upper := func(b []byte) ([]byte, error) { return bytes.ToUpper(b), nil } + upper := func(v Value) ([]byte, error) { return bytes.ToUpper(v.Bytes()), nil } b.ResetTimer() b.ReportAllocs() @@ -2012,3 +2012,114 @@ func TestMapAllValuesLatin1ToUTF8(t *testing.T) { } } } + +func TestMapAllValuesUTF16ToUTF8(t *testing.T) { + // enc encodes a UTF-8 string as UTF-16 LE bytes (BMP only). + enc := func(s string) string { + b := make([]byte, 0, len(s)*2) + for _, r := range s { + b = append(b, byte(r), byte(r>>8)) + } + return string(b) + } + + // ALL field values are UTF-16 LE encoded so the strict mapper (which errors + // on odd-length input) succeeds on every leaf including pure-ASCII fields. + // Structural bytes (segment type, field/component separators, \r) remain ASCII. + raw := "MSH|^~\\&|" + enc("LAB") + "|" + enc("FAC") + "|" + enc("EMR") + "|" + + enc("FAC2") + "|" + enc("20240101") + "||" + enc("ADT") + "^" + enc("A01") + + "|" + enc("1") + "|" + enc("P") + "|" + enc("2.5.1") + "\r" + + "PID|" + enc("1") + "||" + enc("123") + "||" + + enc("Müller") + "^" + enc("François") + "^" + enc("J") + + "||" + enc("19800101") + "|" + enc("F") + "|||" + + enc("Straße 5") + "^^" + enc("München") + + msg := parseTestMessage(t, raw) + + result, err := msg.Transform(MapAllValues(utf16LEToUTF8)) + if err != nil { + t.Fatalf("Transform(MapAllValues) failed: %v", err) + } + + if result.Delimiters() != msg.Delimiters() { + t.Errorf("delimiters changed: got %v, want %v", result.Delimiters(), msg.Delimiters()) + } + + cases := []struct { + loc string + want string + }{ + {"MSH-9.1", "ADT"}, // pure-ASCII UTF-16 decoded correctly + {"PID-3", "123"}, // pure-ASCII UTF-16 decoded correctly + {"PID-5.1", "Müller"}, // \xFC\x00 → ü + {"PID-5.2", "François"}, // \xE7\x00 → ç + {"PID-5.3", "J"}, // ASCII inside multi-component field + {"PID-7", "19800101"}, // date: pure-ASCII UTF-16 + {"PID-11.1", "Straße 5"}, // \xDF\x00 → ß; space and digit pass through + {"PID-11.3", "München"}, // \xFC\x00 → ü; PID-11.2 is empty (^^ skips it) + } + + for _, tc := range cases { + got := result.Get(tc.loc).String() + if got != tc.want { + t.Errorf("%s = %q, want %q", tc.loc, got, tc.want) + } + } +} + +func TestMapAllValuesNullPreservedStrictMapper(t *testing.T) { + // utf16LEToUTF8 is a strict mapper that decodes every leaf as UTF-16 LE. + // The HL7 null sentinel ("") is ASCII (0x22 0x22). A strict UTF-16 LE mapper + // that does not guard against null will decode 0x22 0x22 as U+2222 (∢), + // silently corrupting the null field. This test asserts that null is preserved. + enc := func(s string) string { + b := make([]byte, 0, len(s)*2) + for _, r := range s { + b = append(b, byte(r), byte(r>>8)) + } + return string(b) + } + + raw := "MSH|^~\\&|" + enc("LAB") + "|" + enc("FAC") + "\r" + + "PID|" + enc("1") + "||\"\"||" + enc("Smith") + + msg := parseTestMessage(t, raw) + + result, err := msg.Transform(MapAllValues(utf16LEToUTF8)) + if err != nil { + t.Fatalf("Transform(MapAllValues) failed: %v", err) + } + + if !result.Segments()[1].Field(3).IsNull() { + t.Errorf("PID-3 should be null after MapAllValues(utf16LEToUTF8), got %q", + result.Get("PID-3").String()) + } +} + +func TestMapAllValuesNullCombinedWithNullChange(t *testing.T) { + // Null("PID-3") writes "" into PID-3; MapAllValues(utf16LEToUTF8) runs next. + // A mapper without a null guard would decode "" (0x22 0x22) as U+2222, corrupting + // the explicit null that was just written. This test asserts the null survives. + enc := func(s string) string { + b := make([]byte, 0, len(s)*2) + for _, r := range s { + b = append(b, byte(r), byte(r>>8)) + } + return string(b) + } + + raw := "MSH|^~\\&|" + enc("LAB") + "|" + enc("FAC") + "\r" + + "PID|" + enc("1") + "||" + enc("123") + "||" + enc("Smith") + + msg := parseTestMessage(t, raw) + + result, err := msg.Transform(Null("PID-3"), MapAllValues(utf16LEToUTF8)) + if err != nil { + t.Fatalf("Transform failed: %v", err) + } + + if !result.Segments()[1].Field(3).IsNull() { + t.Errorf("PID-3 should be null after Null+MapAllValues, got %q", + result.Get("PID-3").String()) + } +} diff --git a/validate.go b/validate.go index 7300d3c..521e0b2 100644 --- a/validate.go +++ b/validate.go @@ -320,7 +320,7 @@ func (v *validator) validateSegmentFields(seg *Segment, def *SegmentDef, segLoc } func (v *validator) validateRep(rep Repetition, fd FieldDef, segLoc string, repIdx, repCount int) { - raw := rep.Bytes() + raw := rep.Raw() if fd.Value != "" && string(raw) != fd.Value { loc := buildFieldLoc(segLoc, fd.Index, repIdx, repCount) @@ -372,7 +372,7 @@ func (v *validator) validateComposite(rep Repetition, dtDef *DataTypeDef, segLoc if comp.IsEmpty() { continue } - raw := comp.Bytes() + raw := comp.Raw() if cd.Value != "" && string(raw) != cd.Value { loc := buildCompLoc(segLoc, fieldIdx, repIdx, repCount, cd.Index) v.addIssue(SeverityError, loc, CodeInvalidValue,