From 1393edea5d88903d6597c28f66e61a41a02f864e Mon Sep 17 00:00:00 2001 From: Dima Date: Tue, 14 Oct 2025 15:19:52 +0200 Subject: [PATCH] Improve wording about error cases in VariantShredding --- VariantShredding.md | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/VariantShredding.md b/VariantShredding.md index 4a98a31d..cfc05fd5 100644 --- a/VariantShredding.md +++ b/VariantShredding.md @@ -22,7 +22,7 @@ The Variant type is designed to store and process semi-structured data efficiently, even with heterogeneous values. Query engines encode each Variant value in a self-describing format, and store it as a group containing `value` and `metadata` binary fields in Parquet. Since data is often partially homogeneous, it can be beneficial to extract certain fields into separate Parquet columns to further improve performance. -This process is **shredding**. +This process is called **shredding**. Shredding enables the use of Parquet's columnar representation for more compact data encoding, column statistics for data skipping, and partial projections. 
@@ -202,21 +202,22 @@ As a result, reads when both `value` and `typed_value` are defined may be incons The table below shows how the series of objects in the first column would be stored: -| Event object | `value` | `typed_value` | `typed_value.event_type.value` | `typed_value.event_type.typed_value` | `typed_value.event_ts.value` | `typed_value.event_ts.typed_value` | Notes | -|------------------------------------------------------------------------------------|-----------------------------------|---------------|--------------------------------|--------------------------------------|------------------------------|------------------------------------|--------------------------------------------------| -| `{"event_type": "noop", "event_ts": 1729794114937}` | null | non-null | null | `noop` | null | 1729794114937 | Fully shredded object | -| `{"event_type": "login", "event_ts": 1729794146402, "email": "user@example.com"}` | `{"email": "user@example.com"}` | non-null | null | `login` | null | 1729794146402 | Partially shredded object | -| `{"error_msg": "malformed: ..."}` | `{"error_msg", "malformed: ..."}` | non-null | null | null | null | null | Object with all shredded fields missing | -| `"malformed: not an object"` | `malformed: not an object` | null | | | | | Not an object (stored as Variant string) | -| `{"event_ts": 1729794240241, "click": "_button"}` | `{"click": "_button"}` | non-null | null | null | null | 1729794240241 | Field `event_type` is missing | -| `{"event_type": null, "event_ts": 1729794954163}` | null | non-null | `00` (field exists, is null) | null | null | 1729794954163 | Field `event_type` is present and is null | -| `{"event_type": "noop", "event_ts": "2024-10-24"}` | null | non-null | null | `noop` | `"2024-10-24"` | null | Field `event_ts` is present but not a timestamp | -| `{ }` | null | non-null | null | null | null | null | Object is present but empty | -| null | `00` (null) | null | | | | | Object/value is null | -| missing | null | 
null | | | | | Object/value is missing | -| INVALID | `{"event_type": "login"}` | non-null | null | `login` | null | 1729795057774 | INVALID: Shredded field is present in `value` | -| INVALID | `"a"` | non-null | null | null | null | null | INVALID: `typed_value` is present for non-object | -| INVALID | `02 00` (object with 0 fields) | null | | | | | INVALID: `typed_value` is null for object | +| Event object | `value` | `typed_value` | `typed_value.event_type.value` | `typed_value.event_type.typed_value` | `typed_value.event_ts.value` | `typed_value.event_ts.typed_value` | Notes | +|-----------------------------------------------------------------------------------|-----------------------------------|---------------|--------------------------------|--------------------------------------|------------------------------|------------------------------------|----------------------------------------------------------------------------| +| `{"event_type": "noop", "event_ts": 1729794114937}` | null | non-null | null | `noop` | null | 1729794114937 | Fully shredded object | +| `{"event_type": "login", "event_ts": 1729794146402, "email": "user@example.com"}` | `{"email": "user@example.com"}` | non-null | null | `login` | null | 1729794146402 | Partially shredded object | +| `{"error_msg": "malformed: ..."}` | `{"error_msg": "malformed: ..."}` | non-null | null | null | null | null | Object with all shredded fields missing | +| `"malformed: not an object"` | `malformed: not an object` | null | | | | | Not an object (stored as Variant string) | +| `{"event_ts": 1729794240241, "click": "_button"}` | `{"click": "_button"}` | non-null | null | null | null | 1729794240241 | Field `event_type` is missing | +| `{"event_type": null, "event_ts": 1729794954163}` | null | non-null | `00` (field exists, is null) | null | null | 1729794954163 | Field `event_type` is present and is null | +| `{"event_type": "noop", "event_ts": "2024-10-24"}` | null | non-null | null | `noop` | 
`"2024-10-24"` | null | Field `event_ts` is present but not a timestamp | +| `{ }` | null | non-null | null | null | null | null | Object is present but empty | +| null | `00` (null) | null | | | | | Object/value is null | +| missing | null | null | | | | | Object/value is missing | +| INVALID: `{"event_type": "login", "event_ts": 1729795057774}` | `{"event_type": "login"}` | non-null | null | `login` | null | 1729795057774 | INVALID: Shredded field is present in `value` | +| INVALID: `{"event_type": "login"}` | `{"event_type": "login"}` | null | | | | | INVALID: Shredded field is present in `value`, while `typed_value` is null | +| INVALID: `"a"` | `"a"` | non-null | null | null | null | null | INVALID: `typed_value` is present and `value` is not an object | +| INVALID: `{}` | `02 00` (object with 0 fields) | null | | | | | INVALID: `typed_value` is null for object | Invalid cases in the table above must not be produced by writers. Readers must return an object when `typed_value` is non-null containing the shredded fields.