From ff4583e219342d41c3a95d3aeb927655773ab85a Mon Sep 17 00:00:00 2001 From: Joshua Jones Date: Sun, 22 Feb 2026 12:25:02 -0500 Subject: [PATCH 1/3] capture decision --- AGENTS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/AGENTS.md b/AGENTS.md index 5b2d869..0a1a73c 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -18,6 +18,7 @@ Single-package Go library (`package hl7`) for parsing HL7 version 2.x messages i - **Full serialization or Marshal API.** The library does not provide a general `Marshal` function. Message construction is handled by `MessageBuilder` (from-scratch) and `Transform` (modify existing), both of which produce `*Message` via `ParseMessage`. There are no per-field setters on parsed messages. - **Built-in schema definitions.** The library does not ship with HL7v2 segment, data type, or table definitions. Users provide their own `Schema` struct to `msg.Validate()`. This keeps the library general-purpose and avoids bundling version-specific definitions. - **No field location constants.** HL7 field positions (e.g., `PID-5.1`) are not stable across HL7 v2 versions or vendor implementations. The library does not provide named constants for terser-style location strings; callers define their own. +- **No predicate-based segment filtering.** Methods like `SegmentsOfType`, `SegmentsWhere`, or combinator functions (`And`, `Or`) were considered and rejected. The plain `Segments()` loop with an inline `if` or `switch` is idiomatic Go, immediately readable without library knowledge, and handles all filtering cases in a single pass. The abstraction does not eliminate domain knowledge — callers still decide which types and fields matter — it only restructures where that logic lives. The API surface cost exceeds the ergonomic benefit. 
## Building and Testing From b4ae148bd1e11eca8ffa267e32e32f6810d339a5 Mon Sep 17 00:00:00 2001 From: Joshua Jones Date: Sun, 22 Feb 2026 13:01:48 -0500 Subject: [PATCH 2/3] performance enhancements --- delimiters.go | 16 ++++++++++++---- escape.go | 13 ++++++------- message.go | 15 ++++++--------- 3 files changed, 24 insertions(+), 20 deletions(-) diff --git a/delimiters.go b/delimiters.go index c495676..eccbb94 100644 --- a/delimiters.go +++ b/delimiters.go @@ -77,8 +77,8 @@ func nthSlice(data []byte, delim byte, n int) []byte { } idx := 0 start := 0 - for i := 0; i <= len(data); i++ { - if i == len(data) || data[i] == delim { + for i := 0; i < len(data); i++ { + if data[i] == delim { if idx == n { return data[start:i] } @@ -86,6 +86,10 @@ func nthSlice(data []byte, delim byte, n int) []byte { start = i + 1 } } + // Final piece (no trailing delimiter). + if idx == n { + return data[start:] + } return nil } @@ -97,8 +101,8 @@ func nthRange(data []byte, delim byte, n int) (start, end int, found bool) { } idx := 0 s := 0 - for i := 0; i <= len(data); i++ { - if i == len(data) || data[i] == delim { + for i := 0; i < len(data); i++ { + if data[i] == delim { if idx == n { return s, i, true } @@ -106,6 +110,10 @@ func nthRange(data []byte, delim byte, n int) (start, end int, found bool) { s = i + 1 } } + // Final piece (no trailing delimiter). + if idx == n { + return s, len(data), true + } return 0, 0, false } diff --git a/escape.go b/escape.go index 2746c89..b6f3f02 100644 --- a/escape.go +++ b/escape.go @@ -165,14 +165,13 @@ func Escape(data []byte, delims Delimiters) []byte { } // containsAny returns true if data contains any of the delimiter characters. +// Uses bytes.IndexByte for SIMD-accelerated scanning on amd64/arm64. 
func containsAny(data []byte, delims Delimiters) bool { - for _, b := range data { - if b == delims.Field || b == delims.Component || b == delims.SubComponent || - b == delims.Repetition || b == delims.Escape { - return true - } - } - return false + return bytes.IndexByte(data, delims.Field) >= 0 || + bytes.IndexByte(data, delims.Component) >= 0 || + bytes.IndexByte(data, delims.SubComponent) >= 0 || + bytes.IndexByte(data, delims.Repetition) >= 0 || + bytes.IndexByte(data, delims.Escape) >= 0 } // appendHexDecode decodes pairs of hex characters and appends the resulting diff --git a/message.go b/message.go index 6c7bddf..86e5348 100644 --- a/message.go +++ b/message.go @@ -70,15 +70,12 @@ func ParseMessage(data []byte) (*Message, error) { // the segment terminator (\r). Accepts \r, \n, and \r\n for real-world // compatibility. Empty lines are skipped. func splitSegments(data []byte, delims Delimiters) []Segment { - // Pre-count terminators for allocation. - n := 1 - for _, b := range data { - if b == '\r' || b == '\n' { - n++ - } - } - - segments := make([]Segment, 0, n) + // Heuristic capacity: typical HL7 segments are 40-120 bytes; dividing by + // 80 with a +4 floor gives a good estimate in one pass, avoiding a separate + // pre-counting scan. The +4 floor ensures ADD-merged messages with fewer + // bytes-per-segment are still pre-sized adequately. Slight over-allocation + // is an acceptable tradeoff for eliminating the extra buffer scan. 
+ segments := make([]Segment, 0, len(data)/80+4) start := 0 for i := 0; i < len(data); i++ { From a29e8d44e84b934a98f63b4d78a6f653973f60c2 Mon Sep 17 00:00:00 2001 From: Joshua Jones Date: Sun, 22 Feb 2026 13:27:59 -0500 Subject: [PATCH 3/3] add example for re-encoding a message --- examples/charset/main.go | 106 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 examples/charset/main.go diff --git a/examples/charset/main.go b/examples/charset/main.go new file mode 100644 index 0000000..d932345 --- /dev/null +++ b/examples/charset/main.go @@ -0,0 +1,106 @@ +// Example charset demonstrates decoding a Latin-1 (ISO 8859-1) encoded HL7v2 +// message into UTF-8. It shows both selective field decoding with DecodeString +// and whole-message conversion with MapAllValues. +package main + +import ( + "bytes" + "fmt" + "log" + + "github.com/senojj/hl7" +) + +// Sample ADT^A01 message encoded in Latin-1 (ISO 8859-1). +// MSH-18 declares the charset as "8859/1" per HL7 v2.5.1 Table 0211. +// High-byte characters embedded using Go hex escapes: +// +// 0xFC = ü (Müller — family name) +// 0xF6 = ö (Jörg — middle name) +// 0xDF = ß (Hauptstraße — street) +var sampleMessage = []byte("" + + "MSH|^~\\&|SENDER|FAC|RECV|FAC|20260101||ADT^A01^ADT_A01|MSG001|P|2.5.1|||NE|AL|DEU|8859/1\r" + + "PID|1||123456||M\xFCller^Hans^J\xF6rg||19620315|M|||Hauptstra\xDFe 5^^Berlin^^10115^DE\r" + + "PV1|1|I\r") + +// latin1ToUTF8 is a ValueMapper that converts Latin-1 (ISO 8859-1) bytes to +// UTF-8. Bytes below 0x80 are ASCII and pass through unchanged; bytes in the +// range 0x80–0xFF map directly to Unicode U+0080–U+00FF and expand to two +// UTF-8 bytes. Empty and null values are returned unchanged to preserve HL7 +// null semantics. 
+func latin1ToUTF8(v hl7.Value) ([]byte, error) {
+	if !v.HasValue() {
+		return v.Bytes(), nil
+	}
+	data := v.Bytes()
+	out := make([]byte, 0, len(data))
+	for _, b := range data {
+		if b < 0x80 {
+			out = append(out, b)
+		} else {
+			// Latin-1 0x80–0xFF maps to U+0080–U+00FF: emit the two-byte UTF-8 form (110000xx 10xxxxxx).
+			out = append(out, 0xC0|(b>>6), 0x80|(b&0x3F))
+		}
+	}
+	return out, nil
+}
+
+func main() {
+	// Parse the Latin-1 encoded message.
+	msg, err := hl7.ParseMessage(sampleMessage)
+	if err != nil {
+		log.Fatalf("parse: %v", err)
+	}
+
+	// Read MSH-18 to confirm the declared charset.
+	charset := msg.Get("MSH-18").String()
+	fmt.Println("Charset declared in MSH-18:", charset)
+	fmt.Println()
+
+	// Selective decoding — decode individual fields using DecodeString.
+	// This is suitable when only a few fields need charset conversion.
+	family, err := msg.Get("PID-5.1").DecodeString(latin1ToUTF8)
+	if err != nil {
+		log.Fatalf("decode PID-5.1: %v", err)
+	}
+	given, err := msg.Get("PID-5.2").DecodeString(latin1ToUTF8)
+	if err != nil {
+		log.Fatalf("decode PID-5.2: %v", err)
+	}
+	middle, err := msg.Get("PID-5.3").DecodeString(latin1ToUTF8)
+	if err != nil {
+		log.Fatalf("decode PID-5.3: %v", err)
+	}
+	addr, err := msg.Get("PID-11.1").DecodeString(latin1ToUTF8)
+	if err != nil {
+		log.Fatalf("decode PID-11.1: %v", err)
+	}
+
+	fmt.Println("Patient name (decoded):", family+", "+given+" "+middle)
+	fmt.Println("Address (decoded):", addr)
+	fmt.Println()
+
+	// Full-message conversion — re-encode every field value to UTF-8 and
+	// update MSH-18 in a single atomic Transform call.
+	utf8Msg, err := msg.Transform(
+		hl7.MapAllValues(latin1ToUTF8),
+		hl7.Replace("MSH-18", "UNICODE UTF-8"),
+	)
+	if err != nil {
+		log.Fatalf("transform: %v", err)
+	}
+
+	// Write the converted message (no MLLP framing for readability).
+	var out bytes.Buffer
+	writer := hl7.NewWriter(&out)
+	if err := writer.WriteMessage(utf8Msg); err != nil {
+		log.Fatalf("write: %v", err)
+	}
+
+	fmt.Println("Converted message (UTF-8):")
+	for _, seg := range bytes.Split(out.Bytes(), []byte{'\r'}) {
+		if len(seg) > 0 {
+			fmt.Println(" ", string(seg))
+		}
+	}
+}