senojj · senojj · Feb 22, 2026 · Feb 22, 2026 · Feb 22, 2026 · Feb 22, 2026
@@ -18,6 +18,7 @@ Single-package Go library (`package hl7`) for parsing HL7 version 2.x messages i
 - **Full serialization or Marshal API.** The library does not provide a general `Marshal` function. Message construction is handled by `MessageBuilder` (from-scratch) and `Transform` (modify existing), both of which produce `*Message` via `ParseMessage`. There are no per-field setters on parsed messages.
 - **Built-in schema definitions.** The library does not ship with HL7v2 segment, data type, or table definitions. Users provide their own `Schema` struct to `msg.Validate()`. This keeps the library general-purpose and avoids bundling version-specific definitions.
 - **No field location constants.** HL7 field positions (e.g., `PID-5.1`) are not stable across HL7 v2 versions or vendor implementations. The library does not provide named constants for terser-style location strings; callers define their own.
+- **No predicate-based segment filtering.** Methods like `SegmentsOfType`, `SegmentsWhere`, or combinator functions (`And`, `Or`) were considered and rejected. The plain `Segments()` loop with an inline `if` or `switch` is idiomatic Go, immediately readable without library knowledge, and handles all filtering cases in a single pass. The abstraction does not eliminate domain knowledge — callers still decide which types and fields matter — it only restructures where that logic lives. The API surface cost exceeds the ergonomic benefit.
 
 ## Building and Testing
 

@@ -77,15 +77,19 @@ func nthSlice(data []byte, delim byte, n int) []byte {
 	}
 	idx := 0
 	start := 0
-	for i := 0; i <= len(data); i++ {
-		if i == len(data) || data[i] == delim {
+	for i := 0; i < len(data); i++ {
+		if data[i] == delim {
 			if idx == n {
 				return data[start:i]
 			}
 			idx++
 			start = i + 1
 		}
 	}
+	// Final piece (no trailing delimiter).
+	if idx == n {
+		return data[start:]
+	}
 	return nil
 }
 
@@ -97,15 +101,19 @@ func nthRange(data []byte, delim byte, n int) (start, end int, found bool) {
 	}
 	idx := 0
 	s := 0
-	for i := 0; i <= len(data); i++ {
-		if i == len(data) || data[i] == delim {
+	for i := 0; i < len(data); i++ {
+		if data[i] == delim {
 			if idx == n {
 				return s, i, true
 			}
 			idx++
 			s = i + 1
 		}
 	}
+	// Final piece (no trailing delimiter).
+	if idx == n {
+		return s, len(data), true
+	}
 	return 0, 0, false
 }
 

@@ -165,14 +165,13 @@ func Escape(data []byte, delims Delimiters) []byte {
 }
 
 // containsAny returns true if data contains any of the delimiter characters.
+// Uses bytes.IndexByte for SIMD-accelerated scanning on amd64/arm64.
 func containsAny(data []byte, delims Delimiters) bool {
-	for _, b := range data {
-		if b == delims.Field || b == delims.Component || b == delims.SubComponent ||
-			b == delims.Repetition || b == delims.Escape {
-			return true
-		}
-	}
-	return false
+	return bytes.IndexByte(data, delims.Field) >= 0 ||
+		bytes.IndexByte(data, delims.Component) >= 0 ||
+		bytes.IndexByte(data, delims.SubComponent) >= 0 ||
+		bytes.IndexByte(data, delims.Repetition) >= 0 ||
+		bytes.IndexByte(data, delims.Escape) >= 0
 }
 
 // appendHexDecode decodes pairs of hex characters and appends the resulting

@@ -0,0 +1,106 @@
+// Example charset demonstrates decoding a Latin-1 (ISO 8859-1) encoded HL7v2
+// message into UTF-8. It shows both selective field decoding with DecodeString
+// and whole-message conversion with MapAllValues.
+package main
+
+import (
+	"bytes"
+	"fmt"
+	"log"
+
+	"github.com/senojj/hl7"
+)
+
+// sampleMessage is an ADT^A01 message encoded in Latin-1 (ISO 8859-1).
+// MSH-18 declares the charset as "8859/1" per HL7 v2.5.1 Table 0211.
+// High-byte characters are embedded using Go hex escapes:
+//
+//	0xFC = ü  (Müller — family name, PID-5.1)
+//	0xF6 = ö  (Jörg — middle name, PID-5.3)
+//	0xDF = ß  (Hauptstraße — street, PID-11.1)
+var sampleMessage = []byte("" +
+	"MSH|^~\\&|SENDER|FAC|RECV|FAC|20260101||ADT^A01^ADT_A01|MSG001|P|2.5.1|||NE|AL|DEU|8859/1\r" +
+	"PID|1||123456||M\xFCller^Hans^J\xF6rg||19620315|M|||Hauptstra\xDFe 5^^Berlin^^10115^DE\r" +
+	"PV1|1|I\r")
+
+// latin1ToUTF8 is a ValueMapper that transcodes Latin-1 (ISO 8859-1) bytes
+// into UTF-8. ASCII bytes (below 0x80) are copied as-is; each byte in
+// 0x80–0xFF is the code point U+0080–U+00FF of the same number and is written
+// as a two-byte UTF-8 sequence. A value for which HasValue reports false
+// (empty or null) is handed back untouched so HL7 null semantics survive.
+// The returned error is always nil.
+func latin1ToUTF8(v hl7.Value) ([]byte, error) {
+	if !v.HasValue() {
+		return v.Bytes(), nil
+	}
+	src := v.Bytes()
+	dst := make([]byte, 0, len(src))
+	for i := 0; i < len(src); i++ {
+		c := src[i]
+		if c >= 0x80 {
+			// Two-byte form 110000xx 10xxxxxx: the top two bits of c go
+			// into the lead byte, the low six into the continuation byte.
+			lead := 0xC0 | (c >> 6)
+			cont := 0x80 | (c & 0x3F)
+			dst = append(dst, lead, cont)
+			continue
+		}
+		dst = append(dst, c)
+	}
+	return dst, nil
+}
+
+// main parses the Latin-1 sample message, prints the charset declared in
+// MSH-18, decodes selected PID fields to UTF-8 one at a time, then converts
+// the whole message to UTF-8 with a single Transform call and prints each
+// non-empty segment of the result on its own line. Any error ends the
+// program through log.Fatalf.
+func main() {
+	// Parse the Latin-1 encoded message.
+	msg, err := hl7.ParseMessage(sampleMessage)
+	if err != nil {
+		// Fatalf with an explicit format is used throughout: log.Fatal
+		// follows fmt.Print spacing, which inserts no space after a string
+		// operand and would print "parse:<err>".
+		log.Fatalf("parse: %v", err)
+	}
+
+	// Read MSH-18 to confirm the declared charset.
+	charset := msg.Get("MSH-18").String()
+	fmt.Println("Charset declared in MSH-18:", charset)
+	fmt.Println()
+
+	// Selective decoding — decode individual fields using DecodeString.
+	// This is suitable when only a few fields need charset conversion.
+	family, err := msg.Get("PID-5.1").DecodeString(latin1ToUTF8)
+	if err != nil {
+		log.Fatalf("decode PID-5.1: %v", err)
+	}
+	given, err := msg.Get("PID-5.2").DecodeString(latin1ToUTF8)
+	if err != nil {
+		log.Fatalf("decode PID-5.2: %v", err)
+	}
+	middle, err := msg.Get("PID-5.3").DecodeString(latin1ToUTF8)
+	if err != nil {
+		log.Fatalf("decode PID-5.3: %v", err)
+	}
+	addr, err := msg.Get("PID-11.1").DecodeString(latin1ToUTF8)
+	if err != nil {
+		log.Fatalf("decode PID-11.1: %v", err)
+	}
+
+	fmt.Println("Patient name (decoded):", family+", "+given+" "+middle)
+	fmt.Println("Address (decoded):", addr)
+	fmt.Println()
+
+	// Full-message conversion — re-encode every field value to UTF-8 and
+	// update MSH-18 in a single atomic Transform call.
+	utf8Msg, err := msg.Transform(
+		hl7.MapAllValues(latin1ToUTF8),
+		hl7.Replace("MSH-18", "UNICODE UTF-8"),
+	)
+	if err != nil {
+		log.Fatalf("transform: %v", err)
+	}
+
+	// Write the converted message (no MLLP framing for readability).
+	var out bytes.Buffer
+	writer := hl7.NewWriter(&out)
+	if err := writer.WriteMessage(utf8Msg); err != nil {
+		log.Fatalf("write: %v", err)
+	}
+
+	// Split the written bytes on the segment terminator so each segment is
+	// printed on its own line; empty pieces are skipped.
+	fmt.Println("Converted message (UTF-8):")
+	for _, seg := range bytes.Split(out.Bytes(), []byte{'\r'}) {
+		if len(seg) > 0 {
+			fmt.Println(" ", string(seg))
+		}
+	}
+}
@@ -70,15 +70,12 @@ func ParseMessage(data []byte) (*Message, error) {
 // the segment terminator (\r). Accepts \r, \n, and \r\n for real-world
 // compatibility. Empty lines are skipped.
 func splitSegments(data []byte, delims Delimiters) []Segment {
-	// Pre-count terminators for allocation.
-	n := 1
-	for _, b := range data {
-		if b == '\r' || b == '\n' {
-			n++
-		}
-	}
-
-	segments := make([]Segment, 0, n)
+	// Heuristic capacity: typical HL7 segments are 40-120 bytes, so
+	// len(data)/80+4 estimates the segment count in constant time and the
+	// buffer is scanned only once, by the loop below, instead of twice. The +4
+	// keeps short and ADD-merged messages (fewer bytes per segment) pre-sized
+	// adequately. Slight over-allocation is an acceptable tradeoff for dropping the pre-count.
+	segments := make([]Segment, 0, len(data)/80+4)
 	start := 0
 
 	for i := 0; i < len(data); i++ {