From eff721cd6c175e2ab037fc53892bb9d8706bc7ae Mon Sep 17 00:00:00 2001 From: Seth Hall Date: Mon, 22 Dec 2025 19:56:42 -0500 Subject: [PATCH 1/2] Add Timestamp type for compact date storage - Add DataValue::Timestamp(i64) storing Unix epoch seconds - Use MMDB extended type 128 to avoid collision with MaxMind types - Auto-detect ISO 8601 strings during JSON deserialization - Serialize timestamps back to ISO 8601 for JSON consumers - Update schema validation to accept Timestamp for first_seen/last_seen - Add chrono dependency for date parsing/formatting - Update binary-format.md and data-types-ref.md documentation Reduces storage from 27-byte ISO 8601 strings to 8-byte integers. --- Cargo.lock | 11 ++ Cargo.toml | 3 + book/src/reference/binary-format.md | 11 ++ book/src/reference/data-types-ref.md | 25 ++++ crates/matchy-data-format/Cargo.toml | 1 + crates/matchy-data-format/src/lib.rs | 155 ++++++++++++++++++++ crates/matchy-data-format/src/validation.rs | 3 +- crates/matchy/Cargo.toml | 1 + crates/matchy/src/bin/cli_utils.rs | 9 ++ crates/matchy/src/c_api/matchy.rs | 17 +++ crates/matchy/src/schema_validation.rs | 36 ++++- 11 files changed, 269 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ed004768..4b2a3b75 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -302,6 +302,15 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724" +[[package]] +name = "chrono" +version = "0.4.42" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "145052bdd345b87320e369255277e3fb5152762ad123a901ef5c262dd38fe8d2" +dependencies = [ + "num-traits", +] + [[package]] name = "ciborium" version = "0.2.2" @@ -1085,6 +1094,7 @@ dependencies = [ "bech32", "bs58", "cbindgen", + "chrono", "clap", "criterion", "crossbeam-channel", @@ -1136,6 +1146,7 @@ dependencies = [ name = "matchy-data-format" version = "1.2.2" dependencies = [ + "chrono", "serde", "serde_json", ] diff --git a/Cargo.toml b/Cargo.toml index 8a274873..062b0dd8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -63,6 +63,9 @@ libc = "0.2" # Time (wasm-compatible) web-time = "1.1" +# Timestamp parsing/formatting +chrono = { version = "0.4", default-features = false, features = ["alloc"] } + # Dev dependencies (shared) criterion = "0.8" tempfile = "3.8" diff --git a/book/src/reference/binary-format.md b/book/src/reference/binary-format.md index 19f53835..afa3414e 100644 --- a/book/src/reference/binary-format.md +++ b/book/src/reference/binary-format.md @@ -124,9 +124,20 @@ MMDB-format data types: | Boolean | 14 | 0 bytes | Value in type byte | | Float | 15 | 4 bytes | IEEE 754 | | Array | 11 | Variable | Ordered list | +| Timestamp | 128 | 8 bytes | Matchy extension (Unix epoch seconds) | See [MaxMind DB Format](https://maxmind.github.io/MaxMind-DB/) for encoding details. +### Matchy Extended Types + +Matchy extends the MMDB format with additional types using codes 128+: + +| Type | Code | Size | Notes | +|------|------|------|-------| +| Timestamp | 128 | 8 bytes | Unix epoch seconds (signed i64) | + +These types are stored using the MMDB extended type mechanism (raw byte = code - 7). Timestamp values are serialized to JSON as ISO 8601 strings (e.g., `2025-10-02T18:44:31Z`) for human readability while stored compactly as 8 bytes instead of 27-byte strings. + ## PARAGLOB Section Format When glob patterns are present, the PARAGLOB section contains: diff --git a/book/src/reference/data-types-ref.md b/book/src/reference/data-types-ref.md index b4804d1c..e2293600 100644 --- a/book/src/reference/data-types-ref.md +++ b/book/src/reference/data-types-ref.md @@ -17,6 +17,7 @@ Matchy databases store arbitrary data with each entry using the `DataValue` type - **Bytes**: Arbitrary binary data - **Array**: Ordered list of values - **Map**: Key-value mappings +- **Timestamp**: Unix epoch seconds (compact storage for ISO 8601 timestamps) See [Data Types](../guide/data-types.md) for conceptual overview. @@ -35,6 +36,7 @@ pub enum DataValue { Bytes(Vec), Array(Vec), Map(HashMap), + Timestamp(i64), // Unix epoch seconds } ``` @@ -84,6 +86,28 @@ let tags = DataValue::Array(vec![ data.insert("tags".to_string(), tags); ``` +## Working with Timestamps + +Timestamps store Unix epoch seconds compactly (8 bytes vs 27-byte ISO 8601 strings): + +```rust +use matchy::DataValue; + +let first_seen = DataValue::Timestamp(1727891071); +data.insert("first_seen".to_string(), first_seen); +``` + +ISO 8601 strings in JSON input are automatically parsed into Timestamps during deserialization: + +```json +{ + "entry": "1.2.3.4", + "first_seen": "2025-10-02T18:44:31Z" +} +``` + +When serialized back to JSON, Timestamps render as ISO 8601 strings for readability. + ## Nested Structures ```rust @@ -180,6 +204,7 @@ DataValue types are serialized to the MMDB binary format: | Bytes | bytes | Length-prefixed | | Array | array | Recursive | | Map | map | Key-value pairs | +| Timestamp | ext 128 | 8 bytes, Matchy extension | See [Binary Format](binary-format.md) for encoding details. diff --git a/crates/matchy-data-format/Cargo.toml b/crates/matchy-data-format/Cargo.toml index 42359df5..18bbd05e 100644 --- a/crates/matchy-data-format/Cargo.toml +++ b/crates/matchy-data-format/Cargo.toml @@ -10,6 +10,7 @@ publish = false # Internal crate for now # Minimal dependencies for serialization support [dependencies] serde.workspace = true +chrono.workspace = true [dev-dependencies] serde_json.workspace = true diff --git a/crates/matchy-data-format/src/lib.rs b/crates/matchy-data-format/src/lib.rs index e0dea897..3a1ee9b1 100644 --- a/crates/matchy-data-format/src/lib.rs +++ b/crates/matchy-data-format/src/lib.rs @@ -27,9 +27,26 @@ //! //! See: +use chrono::{DateTime, TimeZone, Utc}; use std::collections::HashMap; use std::hash::{Hash, Hasher}; +/// Extended type ID for Timestamp (Matchy extension, avoids collision with MaxMind types 1-15) +const TIMESTAMP_EXTENDED_TYPE: u8 = 121; // Type 128 = 7 + 121 + +fn try_parse_iso8601(s: &str) -> Option { + DateTime::parse_from_rfc3339(s) + .ok() + .map(|dt| dt.timestamp()) +} + +fn format_iso8601(epoch: i64) -> String { + Utc.timestamp_opt(epoch, 0) + .single() + .map(|dt| dt.to_rfc3339_opts(chrono::SecondsFormat::Secs, true)) + .unwrap_or_else(|| format!("{epoch}")) +} + mod validation; pub use validation::{ validate_data_section, validate_data_value_pointers, validate_data_value_utf8, @@ -74,6 +91,15 @@ pub enum DataValue { Bool(bool), /// IEEE 754 single precision float Float(f32), + /// Unix timestamp (seconds since 1970-01-01 00:00:00 UTC) + /// + /// Stored compactly as a variable-length i64 using Matchy extended type 128. + /// Serializes to/from ISO 8601 strings (e.g., "2025-10-02T18:44:31Z") in JSON, + /// making the optimization transparent to API consumers. + /// + /// This is a Matchy extension to the MMDB format. Standard MMDB readers + /// will not recognize this type. + Timestamp(i64), } // Custom serialization that excludes Pointer (internal format detail) @@ -98,6 +124,7 @@ impl serde::Serialize for DataValue { Self::Array(a) => a.serialize(serializer), Self::Bool(b) => serializer.serialize_bool(*b), Self::Float(f) => serializer.serialize_f32(*f), + Self::Timestamp(epoch) => serializer.serialize_str(&format_iso8601(*epoch)), } } } @@ -173,10 +200,16 @@ impl<'de> serde::Deserialize<'de> for DataValue { } fn visit_str(self, v: &str) -> Result { + if let Some(epoch) = try_parse_iso8601(v) { + return Ok(DataValue::Timestamp(epoch)); + } Ok(DataValue::String(v.to_string())) } fn visit_string(self, v: String) -> Result { + if let Some(epoch) = try_parse_iso8601(&v) { + return Ok(DataValue::Timestamp(epoch)); + } Ok(DataValue::String(v)) } @@ -254,6 +287,7 @@ impl Hash for DataValue { // For floats, hash the bit representation to handle NaN consistently v.to_bits().hash(state); } + Self::Timestamp(v) => v.hash(state), } } } @@ -381,6 +415,7 @@ impl DataEncoder { DataValue::Array(a) => Self::encode_array(a, buffer), DataValue::Bool(b) => Self::encode_bool(*b, buffer), DataValue::Float(f) => Self::encode_float(*f, buffer), + DataValue::Timestamp(t) => Self::encode_timestamp(*t, buffer), } } @@ -616,6 +651,13 @@ impl DataEncoder { buffer.extend_from_slice(&f.to_be_bytes()); } + // Type 128: Timestamp (Matchy extension, extended type 121) + fn encode_timestamp(epoch: i64, buffer: &mut Vec) { + buffer.push(0x08); // Type 0 << 5, size 8 + buffer.push(TIMESTAMP_EXTENDED_TYPE); + buffer.extend_from_slice(&epoch.to_be_bytes()); + } + /// Encode control byte with size for standard types fn encode_with_size(type_id: u8, size: usize, buffer: &mut Vec) { let type_bits = type_id << 5; @@ -724,6 +766,7 @@ impl<'a> DataDecoder<'a> { 11 => self.decode_array(cursor, size_from_ctrl), // Extended type 4 14 => Ok(DataValue::Bool(size_from_ctrl != 0)), // Extended type 7 15 => self.decode_float(cursor, size_from_ctrl), // Extended type 8 + 128 => self.decode_timestamp(cursor, size_from_ctrl), // Matchy extension _ => { eprintln!( "Unknown extended type: raw_ext_type={}, type_id={}, size_from_ctrl={}, offset={}", @@ -994,6 +1037,26 @@ impl<'a> DataDecoder<'a> { Ok(DataValue::Float(f32::from_be_bytes(bytes))) } + fn decode_timestamp( + &self, + cursor: &mut usize, + size_bits: u8, + ) -> Result { + if size_bits != 8 { + return Err("Timestamp must be 8 bytes"); + } + + if *cursor + 8 > self.buffer.len() { + return Err("Timestamp data out of bounds"); + } + + let mut bytes = [0u8; 8]; + bytes.copy_from_slice(&self.buffer[*cursor..*cursor + 8]); + *cursor += 8; + + Ok(DataValue::Timestamp(i64::from_be_bytes(bytes))) + } + fn decode_size(&self, cursor: &mut usize, size_bits: u8) -> Result { match size_bits { 0..=28 => Ok(size_bits as usize), @@ -1391,4 +1454,96 @@ mod tests { assert!(result.is_ok()); assert_eq!(result.unwrap(), DataValue::Int32(i32::MIN)); } + + #[test] + fn test_timestamp_binary_roundtrip() { + let mut encoder = DataEncoder::new(); + let epoch = 1727894671i64; // 2024-10-02T18:44:31Z + let value = DataValue::Timestamp(epoch); + let offset = encoder.encode(&value); + + let bytes = encoder.into_bytes(); + let decoder = DataDecoder::new(&bytes, 0); + let decoded = decoder.decode(offset).unwrap(); + + assert_eq!(decoded, DataValue::Timestamp(epoch)); + } + + #[test] + fn test_timestamp_json_serialize() { + let value = DataValue::Timestamp(1727894671); + let json = serde_json::to_string(&value).unwrap(); + assert_eq!(json, "\"2024-10-02T18:44:31Z\""); + } + + #[test] + fn test_timestamp_json_deserialize() { + let json = "\"2024-10-02T18:44:31Z\""; + let value: DataValue = serde_json::from_str(json).unwrap(); + assert_eq!(value, DataValue::Timestamp(1727894671)); + } + + #[test] + fn test_timestamp_with_fractional_seconds() { + let json = "\"2024-10-02T18:44:31.123456Z\""; + let value: DataValue = serde_json::from_str(json).unwrap(); + if let DataValue::Timestamp(epoch) = value { + assert_eq!(epoch, 1727894671); + } else { + panic!("Expected Timestamp, got {value:?}"); + } + } + + #[test] + fn test_non_timestamp_string_stays_string() { + let json = "\"hello world\""; + let value: DataValue = serde_json::from_str(json).unwrap(); + assert_eq!(value, DataValue::String("hello world".to_string())); + } + + #[test] + fn test_timestamp_negative_epoch() { + let mut encoder = DataEncoder::new(); + let epoch = -86400i64; // 1969-12-31 + let value = DataValue::Timestamp(epoch); + let offset = encoder.encode(&value); + + let bytes = encoder.into_bytes(); + let decoder = DataDecoder::new(&bytes, 0); + let decoded = decoder.decode(offset).unwrap(); + + assert_eq!(decoded, DataValue::Timestamp(epoch)); + } + + #[test] + fn test_timestamp_in_map() { + let mut encoder = DataEncoder::new(); + let mut map = HashMap::new(); + map.insert("first_seen".to_string(), DataValue::Timestamp(1727894671)); + map.insert("last_seen".to_string(), DataValue::Timestamp(1727981071)); + map.insert("name".to_string(), DataValue::String("test".to_string())); + + let offset = encoder.encode(&DataValue::Map(map.clone())); + + let bytes = encoder.into_bytes(); + let decoder = DataDecoder::new(&bytes, 0); + let decoded = decoder.decode(offset).unwrap(); + + if let DataValue::Map(decoded_map) = decoded { + assert_eq!( + decoded_map.get("first_seen"), + Some(&DataValue::Timestamp(1727894671)) + ); + assert_eq!( + decoded_map.get("last_seen"), + Some(&DataValue::Timestamp(1727981071)) + ); + assert_eq!( + decoded_map.get("name"), + Some(&DataValue::String("test".to_string())) + ); + } else { + panic!("Expected Map, got {decoded:?}"); + } + } } diff --git a/crates/matchy-data-format/src/validation.rs b/crates/matchy-data-format/src/validation.rs index ef8ad33d..fce45cab 100644 --- a/crates/matchy-data-format/src/validation.rs +++ b/crates/matchy-data-format/src/validation.rs @@ -147,7 +147,8 @@ pub fn validate_value_strings_utf8(value: &DataValue) -> Result { | DataValue::Uint64(_) | DataValue::Uint128(_) | DataValue::Bool(_) - | DataValue::Float(_) => {} + | DataValue::Float(_) + | DataValue::Timestamp(_) => {} } Ok(count) diff --git a/crates/matchy/Cargo.toml b/crates/matchy/Cargo.toml index 8e3c728b..f46e82f9 100644 --- a/crates/matchy/Cargo.toml +++ b/crates/matchy/Cargo.toml @@ -86,6 +86,7 @@ bs58.workspace = true sha2.workspace = true tiny-keccak.workspace = true bech32.workspace = true +chrono.workspace = true clap = { version = "4.5", features = ["derive", "cargo"], optional = true } csv = { version = "1.3", optional = true } diff --git a/crates/matchy/src/bin/cli_utils.rs b/crates/matchy/src/bin/cli_utils.rs index 1bc55b95..beeb1b94 100644 --- a/crates/matchy/src/bin/cli_utils.rs +++ b/crates/matchy/src/bin/cli_utils.rs @@ -1,4 +1,5 @@ use anyhow::Result; +use chrono::TimeZone; use matchy::DataValue; use serde_json::json; use std::collections::HashMap; @@ -197,6 +198,10 @@ pub fn data_value_to_json(data: &DataValue) -> serde_json::Value { json!(items.iter().map(data_value_to_json).collect::>()) } DataValue::Pointer(_) => json!(""), + DataValue::Timestamp(epoch) => { + let dt = chrono::Utc.timestamp_opt(*epoch, 0).unwrap(); + json!(dt.to_rfc3339()) + } } } @@ -361,5 +366,9 @@ pub fn format_data_value(data: &DataValue, indent: &str) -> String { } } DataValue::Pointer(_) => "".to_string(), + DataValue::Timestamp(epoch) => { + let dt = chrono::Utc.timestamp_opt(*epoch, 0).unwrap(); + format!("\"{}\"", dt.to_rfc3339()) + } } } diff --git a/crates/matchy/src/c_api/matchy.rs b/crates/matchy/src/c_api/matchy.rs index 6ca8cd14..1f180cbe 100644 --- a/crates/matchy/src/c_api/matchy.rs +++ b/crates/matchy/src/c_api/matchy.rs @@ -7,6 +7,7 @@ use crate::database::{Database, ReloadEvent}; use crate::schema_validation::SchemaValidator; use crate::schemas::{get_schema_info, is_known_database_type}; use crate::DatabaseBuilder; +use chrono::TimeZone; use matchy_data_format::DataValue; use matchy_match_mode::MatchMode; use std::collections::HashMap; @@ -1583,6 +1584,22 @@ impl matchy_entry_data_t { matchy_entry_data_value_u { float_value: *f }, 4, ), + DataValue::Timestamp(epoch) => { + let iso = chrono::Utc + .timestamp_opt(*epoch, 0) + .single() + .map(|dt| dt.to_rfc3339_opts(chrono::SecondsFormat::Secs, true)) + .unwrap_or_else(|| format!("{epoch}")); + let c_str = CString::new(iso.as_str()).ok()?; + let ptr = c_str.as_ptr(); + let len = iso.len(); + string_cache.push(c_str); + ( + MATCHY_DATA_TYPE_UTF8_STRING, + matchy_entry_data_value_u { utf8_string: ptr }, + u32::try_from(len).unwrap_or(u32::MAX), + ) + } }; Some(Self { diff --git a/crates/matchy/src/schema_validation.rs b/crates/matchy/src/schema_validation.rs index a5b8287c..e504f98e 100644 --- a/crates/matchy/src/schema_validation.rs +++ b/crates/matchy/src/schema_validation.rs @@ -370,8 +370,8 @@ fn validate_threatdb(data: &HashMap) -> Vec) -> Vec Date: Tue, 23 Dec 2025 00:58:01 +0000 Subject: [PATCH 2/2] chore(deps): bump serde_json from 1.0.145 to 1.0.146 Bumps [serde_json](https://github.com/serde-rs/json) from 1.0.145 to 1.0.146. - [Release notes](https://github.com/serde-rs/json/releases) - [Commits](https://github.com/serde-rs/json/compare/v1.0.145...v1.0.146) --- updated-dependencies: - dependency-name: serde_json dependency-version: 1.0.146 dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- Cargo.lock | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 4b2a3b75..e26164ee 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1841,9 +1841,9 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.145" +version = "1.0.146" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "402a6f66d8c709116cf22f558eab210f5a50187f702eb4d7e5ef38d9a7f1c79c" +checksum = "217ca874ae0207aac254aa02c957ded05585a90892cc8d87f9e5fa49669dadd8" dependencies = [ "itoa", "memchr",