Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 13 additions & 2 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 3 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,9 @@ libc = "0.2"
# Time (wasm-compatible)
web-time = "1.1"

# Timestamp parsing/formatting
chrono = { version = "0.4", default-features = false, features = ["alloc"] }

# Dev dependencies (shared)
criterion = "0.8"
tempfile = "3.8"
Expand Down
11 changes: 11 additions & 0 deletions book/src/reference/binary-format.md
Original file line number Diff line number Diff line change
Expand Up @@ -124,9 +124,20 @@ MMDB-format data types:
| Boolean | 14 | 0 bytes | Value in type byte |
| Float | 15 | 4 bytes | IEEE 754 |
| Array | 11 | Variable | Ordered list |
| Timestamp | 128 | 8 bytes | Matchy extension (Unix epoch seconds) |

See [MaxMind DB Format](https://maxmind.github.io/MaxMind-DB/) for encoding details.

### Matchy Extended Types

Matchy extends the MMDB format with additional types using codes 128+:

| Type | Code | Size | Notes |
|------|------|------|-------|
| Timestamp | 128 | 8 bytes | Unix epoch seconds (signed i64) |

These types are stored using the MMDB extended type mechanism (raw byte = code - 7). Timestamp values are serialized to JSON as ISO 8601 strings (e.g., `2025-10-02T18:44:31Z`) for human readability while stored compactly as 8 bytes instead of 27-byte strings.

## PARAGLOB Section Format

When glob patterns are present, the PARAGLOB section contains:
Expand Down
25 changes: 25 additions & 0 deletions book/src/reference/data-types-ref.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ Matchy databases store arbitrary data with each entry using the `DataValue` type
- **Bytes**: Arbitrary binary data
- **Array**: Ordered list of values
- **Map**: Key-value mappings
- **Timestamp**: Unix epoch seconds (compact storage for ISO 8601 timestamps)

See [Data Types](../guide/data-types.md) for conceptual overview.

Expand All @@ -35,6 +36,7 @@ pub enum DataValue {
Bytes(Vec<u8>),
Array(Vec<DataValue>),
Map(HashMap<String, DataValue>),
Timestamp(i64), // Unix epoch seconds
}
```

Expand Down Expand Up @@ -84,6 +86,28 @@ let tags = DataValue::Array(vec![
data.insert("tags".to_string(), tags);
```

## Working with Timestamps

Timestamps store Unix epoch seconds compactly (8 bytes vs 27-byte ISO 8601 strings):

```rust
use matchy::DataValue;

let first_seen = DataValue::Timestamp(1727891071);
data.insert("first_seen".to_string(), first_seen);
```

ISO 8601 (RFC 3339) strings in JSON input are automatically parsed into Timestamps during deserialization; fractional seconds are dropped and UTC offsets are normalized to UTC:

```json
{
"entry": "1.2.3.4",
"first_seen": "2025-10-02T18:44:31Z"
}
```

When serialized back to JSON, Timestamps render as ISO 8601 strings for readability.

## Nested Structures

```rust
Expand Down Expand Up @@ -180,6 +204,7 @@ DataValue types are serialized to the MMDB binary format:
| Bytes | bytes | Length-prefixed |
| Array | array | Recursive |
| Map | map | Key-value pairs |
| Timestamp | ext 128 | 8 bytes, Matchy extension |

See [Binary Format](binary-format.md) for encoding details.

Expand Down
1 change: 1 addition & 0 deletions crates/matchy-data-format/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ publish = false # Internal crate for now
# Minimal dependencies for serialization support
[dependencies]
serde.workspace = true
chrono.workspace = true

[dev-dependencies]
serde_json.workspace = true
Expand Down
155 changes: 155 additions & 0 deletions crates/matchy-data-format/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,26 @@
//!
//! See: <https://maxmind.github.io/MaxMind-DB/>

use chrono::{DateTime, TimeZone, Utc};
use std::collections::HashMap;
use std::hash::{Hash, Hasher};

/// Extended type ID for Timestamp (Matchy extension, avoids collision with MaxMind types 1-15)
///
/// This is the raw extended-type byte written right after the control byte.
/// The logical type ID is this value plus 7, i.e. 128.
const TIMESTAMP_EXTENDED_TYPE: u8 = 121; // Type 128 = 7 + 121

/// Try to interpret `s` as an RFC 3339 timestamp.
///
/// Returns the Unix epoch in whole seconds on success (any sub-second part is
/// not carried over), or `None` when `s` is not a valid RFC 3339 string.
fn try_parse_iso8601(s: &str) -> Option<i64> {
    match DateTime::parse_from_rfc3339(s) {
        Ok(parsed) => Some(parsed.timestamp()),
        Err(_) => None,
    }
}

/// Render Unix epoch seconds as a UTC timestamp string with second precision
/// and a `Z` suffix (e.g. `2024-10-02T18:44:31Z`).
///
/// If chrono cannot map `epoch` to a single UTC instant, the plain decimal
/// number is returned instead so serialization never fails.
fn format_iso8601(epoch: i64) -> String {
    match Utc.timestamp_opt(epoch, 0).single() {
        Some(instant) => instant.to_rfc3339_opts(chrono::SecondsFormat::Secs, true),
        None => epoch.to_string(),
    }
}

mod validation;
pub use validation::{
validate_data_section, validate_data_value_pointers, validate_data_value_utf8,
Expand Down Expand Up @@ -74,6 +91,15 @@ pub enum DataValue {
Bool(bool),
/// IEEE 754 single precision float
Float(f32),
/// Unix timestamp (seconds since 1970-01-01 00:00:00 UTC)
///
/// Stored compactly as a fixed 8-byte big-endian i64 using Matchy extended type 128.
/// Serializes to/from ISO 8601 strings (e.g., "2025-10-02T18:44:31Z") in JSON,
/// making the optimization transparent to API consumers.
///
/// This is a Matchy extension to the MMDB format. Standard MMDB readers
/// will not recognize this type.
Timestamp(i64),
}

// Custom serialization that excludes Pointer (internal format detail)
Expand All @@ -98,6 +124,7 @@ impl serde::Serialize for DataValue {
Self::Array(a) => a.serialize(serializer),
Self::Bool(b) => serializer.serialize_bool(*b),
Self::Float(f) => serializer.serialize_f32(*f),
Self::Timestamp(epoch) => serializer.serialize_str(&format_iso8601(*epoch)),
}
}
}
Expand Down Expand Up @@ -173,10 +200,16 @@ impl<'de> serde::Deserialize<'de> for DataValue {
}

// Borrowed strings: anything that parses as an RFC 3339 timestamp becomes a
// `Timestamp`; every other string is copied into a `String` value.
fn visit_str<E>(self, v: &str) -> Result<DataValue, E> {
    Ok(match try_parse_iso8601(v) {
        Some(epoch) => DataValue::Timestamp(epoch),
        None => DataValue::String(v.to_string()),
    })
}

// Owned strings: same timestamp detection as `visit_str`, but a non-timestamp
// string is moved into the value rather than copied.
fn visit_string<E>(self, v: String) -> Result<DataValue, E> {
    Ok(match try_parse_iso8601(&v) {
        Some(epoch) => DataValue::Timestamp(epoch),
        None => DataValue::String(v),
    })
}

Expand Down Expand Up @@ -254,6 +287,7 @@ impl Hash for DataValue {
// For floats, hash the bit representation to handle NaN consistently
v.to_bits().hash(state);
}
Self::Timestamp(v) => v.hash(state),
}
}
}
Expand Down Expand Up @@ -381,6 +415,7 @@ impl DataEncoder {
DataValue::Array(a) => Self::encode_array(a, buffer),
DataValue::Bool(b) => Self::encode_bool(*b, buffer),
DataValue::Float(f) => Self::encode_float(*f, buffer),
DataValue::Timestamp(t) => Self::encode_timestamp(*t, buffer),
}
}

Expand Down Expand Up @@ -616,6 +651,13 @@ impl DataEncoder {
buffer.extend_from_slice(&f.to_be_bytes());
}

// Type 128: Timestamp (Matchy extension, extended type 121)
//
// Layout: control byte, extended-type byte, then the epoch as 8 big-endian bytes.
fn encode_timestamp(epoch: i64, buffer: &mut Vec<u8>) {
    // Control byte: type bits 0 (an extended-type byte follows), size field 8.
    const CONTROL: u8 = 0x08;
    buffer.extend_from_slice(&[CONTROL, TIMESTAMP_EXTENDED_TYPE]);
    buffer.extend_from_slice(&epoch.to_be_bytes());
}

/// Encode control byte with size for standard types
fn encode_with_size(type_id: u8, size: usize, buffer: &mut Vec<u8>) {
let type_bits = type_id << 5;
Expand Down Expand Up @@ -724,6 +766,7 @@ impl<'a> DataDecoder<'a> {
11 => self.decode_array(cursor, size_from_ctrl), // Extended type 4
14 => Ok(DataValue::Bool(size_from_ctrl != 0)), // Extended type 7
15 => self.decode_float(cursor, size_from_ctrl), // Extended type 8
128 => self.decode_timestamp(cursor, size_from_ctrl), // Matchy extension
_ => {
eprintln!(
"Unknown extended type: raw_ext_type={}, type_id={}, size_from_ctrl={}, offset={}",
Expand Down Expand Up @@ -994,6 +1037,26 @@ impl<'a> DataDecoder<'a> {
Ok(DataValue::Float(f32::from_be_bytes(bytes)))
}

/// Decode a Timestamp payload starting at `*cursor`.
///
/// `size_bits` is the size field taken from the control byte and must be
/// exactly 8. On success the cursor is advanced past the 8 payload bytes and
/// the big-endian value is returned as `DataValue::Timestamp`.
///
/// # Errors
///
/// Returns an error if the size field is not 8 or if fewer than 8 bytes are
/// available at the cursor. The cursor is left untouched on error.
fn decode_timestamp(
    &self,
    cursor: &mut usize,
    size_bits: u8,
) -> Result<DataValue, &'static str> {
    if size_bits != 8 {
        return Err("Timestamp must be 8 bytes");
    }

    // checked_add + get: a cursor near usize::MAX must yield an error rather
    // than an arithmetic overflow or an out-of-range slice panic.
    let end = cursor
        .checked_add(8)
        .ok_or("Timestamp data out of bounds")?;
    let raw = self
        .buffer
        .get(*cursor..end)
        .ok_or("Timestamp data out of bounds")?;

    let mut bytes = [0u8; 8];
    bytes.copy_from_slice(raw);
    *cursor = end;

    Ok(DataValue::Timestamp(i64::from_be_bytes(bytes)))
}

fn decode_size(&self, cursor: &mut usize, size_bits: u8) -> Result<usize, &'static str> {
match size_bits {
0..=28 => Ok(size_bits as usize),
Expand Down Expand Up @@ -1391,4 +1454,96 @@ mod tests {
assert!(result.is_ok());
assert_eq!(result.unwrap(), DataValue::Int32(i32::MIN));
}

/// A Timestamp survives an encode/decode round trip through the binary format.
#[test]
fn test_timestamp_binary_roundtrip() {
    // 2024-10-02T18:44:31Z
    let expected = DataValue::Timestamp(1727894671i64);

    let mut encoder = DataEncoder::new();
    let offset = encoder.encode(&expected);
    let encoded = encoder.into_bytes();

    let decoder = DataDecoder::new(&encoded, 0);
    let actual = decoder.decode(offset).unwrap();

    assert_eq!(actual, expected);
}

/// A Timestamp is rendered to JSON as a quoted UTC timestamp string.
#[test]
fn test_timestamp_json_serialize() {
    let rendered = serde_json::to_string(&DataValue::Timestamp(1727894671)).unwrap();
    assert_eq!(rendered, "\"2024-10-02T18:44:31Z\"");
}

/// A JSON timestamp string is deserialized into a Timestamp value.
#[test]
fn test_timestamp_json_deserialize() {
    let parsed = serde_json::from_str::<DataValue>("\"2024-10-02T18:44:31Z\"").unwrap();
    assert_eq!(parsed, DataValue::Timestamp(1727894671));
}

/// Fractional seconds in the input are accepted and reduced to whole seconds.
#[test]
fn test_timestamp_with_fractional_seconds() {
    let value = serde_json::from_str::<DataValue>("\"2024-10-02T18:44:31.123456Z\"").unwrap();
    match value {
        DataValue::Timestamp(epoch) => assert_eq!(epoch, 1727894671),
        _ => panic!("Expected Timestamp, got {value:?}"),
    }
}

/// Ordinary text must not be mistaken for a timestamp.
#[test]
fn test_non_timestamp_string_stays_string() {
    let parsed = serde_json::from_str::<DataValue>("\"hello world\"").unwrap();
    let expected = DataValue::String("hello world".to_string());
    assert_eq!(parsed, expected);
}

/// Pre-1970 (negative) epochs round-trip through the binary format unchanged.
#[test]
fn test_timestamp_negative_epoch() {
    // 1969-12-31
    let expected = DataValue::Timestamp(-86400i64);

    let mut encoder = DataEncoder::new();
    let offset = encoder.encode(&expected);
    let encoded = encoder.into_bytes();

    let decoder = DataDecoder::new(&encoded, 0);
    let actual = decoder.decode(offset).unwrap();

    assert_eq!(actual, expected);
}

/// Timestamps nested inside a map round-trip alongside ordinary values.
#[test]
fn test_timestamp_in_map() {
    let mut encoder = DataEncoder::new();
    let mut map = HashMap::new();
    map.insert("first_seen".to_string(), DataValue::Timestamp(1727894671));
    map.insert("last_seen".to_string(), DataValue::Timestamp(1727981071));
    map.insert("name".to_string(), DataValue::String("test".to_string()));

    // The map is not needed after encoding, so move it instead of cloning.
    let offset = encoder.encode(&DataValue::Map(map));

    let bytes = encoder.into_bytes();
    let decoder = DataDecoder::new(&bytes, 0);
    let decoded = decoder.decode(offset).unwrap();

    if let DataValue::Map(decoded_map) = decoded {
        // Guard against extra or missing entries, not just wrong values.
        assert_eq!(decoded_map.len(), 3);
        assert_eq!(
            decoded_map.get("first_seen"),
            Some(&DataValue::Timestamp(1727894671))
        );
        assert_eq!(
            decoded_map.get("last_seen"),
            Some(&DataValue::Timestamp(1727981071))
        );
        assert_eq!(
            decoded_map.get("name"),
            Some(&DataValue::String("test".to_string()))
        );
    } else {
        panic!("Expected Map, got {decoded:?}");
    }
}
}
3 changes: 2 additions & 1 deletion crates/matchy-data-format/src/validation.rs
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,8 @@ pub fn validate_value_strings_utf8(value: &DataValue) -> Result<u32, String> {
| DataValue::Uint64(_)
| DataValue::Uint128(_)
| DataValue::Bool(_)
| DataValue::Float(_) => {}
| DataValue::Float(_)
| DataValue::Timestamp(_) => {}
}

Ok(count)
Expand Down
1 change: 1 addition & 0 deletions crates/matchy/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ bs58.workspace = true
sha2.workspace = true
tiny-keccak.workspace = true
bech32.workspace = true
chrono.workspace = true

clap = { version = "4.5", features = ["derive", "cargo"], optional = true }
csv = { version = "1.3", optional = true }
Expand Down
Loading
Loading