From f4231a88d8a24c99c3e3c60e01e47515d3bd426d Mon Sep 17 00:00:00 2001 From: synicix Date: Thu, 6 Nov 2025 09:07:16 +0000 Subject: [PATCH 01/53] Update settings for vs code --- .vscode/settings.json | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 .vscode/settings.json diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..ae1d1ab --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,31 @@ +{ + "[markdown]": { + "editor.defaultFormatter": null + }, + "editor.formatOnPaste": false, + "editor.formatOnSave": true, + "editor.rulers": [ + 100 + ], + "files.autoSave": "off", + "files.insertFinalNewline": true, + "gitlens.showWhatsNewAfterUpgrades": false, + "lldb.consoleMode": "evaluate", + "rust-analyzer.check.command": "clippy", + "rust-analyzer.checkOnSave": true, + "rust-analyzer.runnables.extraTestBinaryArgs": [ + "--nocapture" + ], + "rust-analyzer.rustfmt.extraArgs": [ + "--config", + "max_width=100" + ], + "jupyter.kernels.excludePythonEnvironments": [ + "/bin/python3", + "/usr/bin/python3" + ], + "notebook.formatOnSave.enabled": true, + "notebook.output.scrolling": true, + "python.defaultInterpreterPath": "~/.local/share/base/bin/python3", + "python.terminal.activateEnvironment": false +} From 6e3767675908643abc8e36bce86ba29c292c6772 Mon Sep 17 00:00:00 2001 From: synicix Date: Thu, 6 Nov 2025 09:07:40 +0000 Subject: [PATCH 02/53] Add custom hasher framework --- Cargo.toml | 9 +- cspell.json | 7 +- src/hasher.rs | 378 +++++++++++++++++++++++++++++++++++++++++++++++++ src/lib.rs | 1 + src/pyarrow.rs | 17 +-- 5 files changed, 392 insertions(+), 20 deletions(-) create mode 100644 src/hasher.rs diff --git a/Cargo.toml b/Cargo.toml index e97d73e..92c47cc 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -4,8 +4,15 @@ version = "0.1.3" edition = "2024" [dependencies] -arrow = { version = "56.0.0", features = ["ffi"] } +arrow = { version = "57.0.0", features = ["ffi"] } arrow-digest = "56.0.0" +arrow-schema = { version = "57.0.0", features = ["serde"] } +digest = "0.10.7" +hex = "0.4.3" +postcard = "1.1.3" +pretty_assertions = "1.4.1" +serde = "1.0.228" +serde_json = "1.0" sha2 = "0.10.9" # automated CFFI + bindings in other languages uniffi = { version = "0.29.4", features = ["cli", "tokio"] } diff --git a/cspell.json b/cspell.json index 3c38ecf..48de8d0 100644 --- a/cspell.json +++ b/cspell.json @@ -3,11 +3,10 @@ "datatypes", "pyarrow", "pythonapi", - "uniffi" - ], - "ignoreWords": [ - + "uniffi", + "uids" ], + "ignoreWords": [], "useGitignore": false, "ignorePaths": [ "Cargo.lock", diff --git a/src/hasher.rs b/src/hasher.rs new file mode 100644 index 0000000..dc04a42 --- /dev/null +++ b/src/hasher.rs @@ -0,0 +1,378 @@ +use std::collections::BTreeMap; + +use arrow::{ + array::{Array, BooleanArray, RecordBatch, StructArray}, + datatypes::{DataType, Schema}, +}; +use arrow_schema::Field; +use digest::Digest; +use postcard::to_vec; + +const NULL_BYTES: &[u8] = b"NULL"; + +struct ArrowDigester { + schema: Schema, + schema_digest: Vec, + fields_digest_buffer: BTreeMap, +} + +impl ArrowDigester { + fn new(schema: Schema) -> Self { + // Hash the schema first + let schema_digest = Self::hash_schema(&schema); + + // Flatten all nested fields into a single map, this allows us to hash each field individually and efficiently + let mut fields_digest_buffer = BTreeMap::new(); + schema.fields.into_iter().for_each(|field| { + Self::extract_fields_name(field, "", &mut fields_digest_buffer); + }); + + // Store it in the new struct for 
now
+        ArrowDigester {
+            schema,
+            schema_digest,
+            fields_digest_buffer,
+        }
+    }
+
+    /// Hash a array directly without needing to create an ArrowDigester instance on the user side
+    pub fn hash_array(array: &dyn Array) -> Vec<u8> {
+        let mut digest = D::new();
+        Self::array_digest_update(array.data_type(), array, &mut digest);
+        digest.finalize().to_vec()
+    }
+
+    /// Hash record batch directly without needing to create an ArrowDigester instance on the user side
+    pub fn hash_record_batch(record_batch: &RecordBatch) -> Vec<u8> {
+        let mut digester = ArrowDigester::<D>::new(record_batch.schema().as_ref().clone());
+        digester.update(record_batch.clone());
+        digester.finalize()
+    }
+
+    /// Internal recursive function to extract field names from nested structs effectively flattening the schema
+    /// The format is parent_child_grandchild_etc... for nested fields and will be stored in fields_digest_buffer
+    fn extract_fields_name(
+        field: &Field,
+        parent_field_name: &str,
+        fields_digest_buffer: &mut BTreeMap<String, D>,
+    ) {
+        // Check if field is a nested type of struct
+        match field.data_type() {
+            DataType::Struct(fields) => {
+                // We will add fields in alphabetical order
+                fields.into_iter().for_each(|field| {
+                    Self::extract_fields_name(field, parent_field_name, fields_digest_buffer);
+                });
+            }
+            _ => {
+                // Base case, just add the field name
+                let field_name = if parent_field_name.is_empty() {
+                    field.name().to_string()
+                } else {
+                    format!("{}_{}", parent_field_name, field.name())
+                };
+
+                fields_digest_buffer.insert(field_name, D::new());
+            }
+        }
+    }
+
+    fn hash_fixed_size_array(array: &dyn Array, digest: &mut D, element_size: i32) {
+        let array_data = array.to_data();
+
+        // Get the slice with offset accounted for if there is any
+        let slice = array_data.buffers()[0]
+            .as_slice()
+            .get(array_data.offset() * element_size as usize..)
+            .expect("Failed to get buffer slice for FixedSizeBinaryArray");
+
+        // Deal with null
+        match array_data.nulls() {
+            Some(null_buffer) => {
+                // There are nulls, so we need to incrementally hash each value
+                for i in 0..array_data.len() {
+                    if null_buffer.is_valid(i) {
+                        let data_pos = i * element_size as usize;
+                        digest.update(&slice[data_pos..data_pos + element_size as usize]);
+                    } else {
+                        digest.update(NULL_BYTES);
+                    }
+                }
+            }
+            None => {
+                // No nulls, we can hash the entire buffer directly
+                digest.update(slice);
+            }
+        }
+    }
+
+    /// Serialize the schema into a BTreeMap for field name and its digest
+    fn hash_schema(schema: &Schema) -> Vec<u8> {
+        let fields_digest = schema
+            .fields
+            .into_iter()
+            .map(|field| (field.name(), to_vec::<_, 256>(field).unwrap()))
+            .collect::<BTreeMap<_, _>>();
+
+        // Hash the entire thing to the digest
+        D::digest(to_vec::<_, 1024>(&fields_digest).unwrap()).to_vec()
+    }
+
+    /// Hash a record batch and update the internal digests
+    fn update(&mut self, record_batch: RecordBatch) {
+        // Verify schema matches
+        if *record_batch.schema() != self.schema {
+            panic!("Record batch schema does not match ArrowDigester schema");
+        }
+
+        // Iterate through each field and update its digest
+        self.fields_digest_buffer
+            .iter_mut()
+            .for_each(|(field_name, digest)| {
+                // Determine if field name is nested
+                let field_name_hierarchy = field_name.split('_').collect::<Vec<&str>>();
+
+                if field_name_hierarchy.len() == 1 {
+                    Self::array_digest_update(
+                        record_batch
+                            .schema()
+                            .field_with_name(field_name)
+                            .unwrap()
+                            .data_type(),
+                        record_batch.column_by_name(field_name).unwrap(),
+                        digest,
+                    );
+                } else {
+                    Self::update_nested_field(
+                        &field_name_hierarchy,
+                        0,
+                        record_batch
+                            .column_by_name(field_name_hierarchy[0])
+                            .unwrap()
+                            .as_any()
+                            .downcast_ref::<StructArray>()
+                            .expect("Failed to downcast to StructArray"),
+                        digest,
+                    );
+                }
+            });
+    }
+
+    /// Recursive function to update nested field digests (structs within structs)
+    fn update_nested_field(
+        field_name_hierarchy: &Vec<&str>,
+        current_level: usize,
+        array: &StructArray,
+        digest: &mut D,
+    ) {
+        if field_name_hierarchy.len() == current_level {
+            // Base case, it should be a non-struct field
+            Self::array_digest_update(
+                array
+                    .column_by_name(field_name_hierarchy[0])
+                    .unwrap()
+                    .data_type(),
+                array
+                    .column_by_name(field_name_hierarchy[0])
+                    .unwrap()
+                    .as_ref(),
+                digest,
+            );
+        } else {
+            // Recursive case, it should be a struct field
+            let next_array = array
+                .column_by_name(field_name_hierarchy[current_level])
+                .unwrap()
+                .as_any()
+                .downcast_ref::<StructArray>()
+                .expect("Failed to downcast to StructArray");
+
+            Self::update_nested_field(field_name_hierarchy, current_level + 1, next_array, digest);
+        }
+    }
+
+    /// This will consume the ArrowDigester and produce the final combined digest where the schema
+    /// digest is fed in first, followed by each field digest in alphabetical order of field names
+    pub fn finalize(self) -> Vec<u8> {
+        // Finalize all the sub digest and combine them into a single digest
+        let mut final_digest = D::new();
+
+        // digest the schema first
+        final_digest.update(&self.schema_digest);
+
+        // Then digest each field digest in order
+        self.fields_digest_buffer
+            .into_iter()
+            .for_each(|(_, digest)| {
+                let field_hash = digest.finalize();
+                final_digest.update(&field_hash);
+            });
+
+        final_digest.finalize().to_vec()
+    }
+
+    fn array_digest_update(data_type: &DataType, array: &dyn Array, digest: &mut D) {
+        match data_type {
+            DataType::Null => todo!(),
+            DataType::Boolean => {
+                // Bool Array is stored a bit differently, so we can't use the standard fixed buffer approach
+                let bool_array = array
+                    .as_any()
+                    .downcast_ref::<BooleanArray>()
+                    .expect("Failed to downcast to BooleanArray");
+
+                bool_array.into_iter().for_each(|value| match value {
+                    Some(b) => digest.update([b as u8]),
+                    None => digest.update(NULL_BYTES),
+                });
+            }
+            DataType::Int8 => Self::hash_fixed_size_array(array, digest, 1),
+            DataType::Int16 => Self::hash_fixed_size_array(array, digest, 2),
+            DataType::Int32 => Self::hash_fixed_size_array(array, digest, 4),
+            DataType::Int64 => Self::hash_fixed_size_array(array, digest, 8),
+            DataType::UInt8 => Self::hash_fixed_size_array(array, digest, 1),
+            DataType::UInt16 => Self::hash_fixed_size_array(array, digest, 2),
+            DataType::UInt32 => Self::hash_fixed_size_array(array, digest, 4),
+            DataType::UInt64 => Self::hash_fixed_size_array(array, digest, 8),
+            DataType::Float16 => Self::hash_fixed_size_array(array, digest, 2),
+            DataType::Float32 => Self::hash_fixed_size_array(array, digest, 4),
+            DataType::Float64 => Self::hash_fixed_size_array(array, digest, 8),
+            DataType::Timestamp(_, _) => todo!(),
+            DataType::Date32 => Self::hash_fixed_size_array(array, digest, 4),
+            DataType::Date64 => Self::hash_fixed_size_array(array, digest, 8),
+            DataType::Time32(_) => todo!(),
+            DataType::Time64(_) => todo!(),
+            DataType::Duration(_) => todo!(),
+            DataType::Interval(_) => todo!(),
+            DataType::Binary => todo!(),
+            DataType::FixedSizeBinary(_) => todo!(),
+            DataType::LargeBinary => todo!(),
+            DataType::BinaryView => todo!(),
+            DataType::Utf8 => todo!(),
+            DataType::LargeUtf8 => todo!(),
+            DataType::Utf8View => todo!(),
+            DataType::List(_) => todo!(),
+            DataType::ListView(_) => todo!(),
+            DataType::FixedSizeList(_, _) => todo!(),
+            DataType::LargeList(_) => todo!(),
+            DataType::LargeListView(_) => todo!(),
+            DataType::Struct(_) => todo!(),
+            DataType::Union(_, _) => todo!(),
+            DataType::Dictionary(_, _) => todo!(),
+            DataType::Decimal32(_, _) => todo!(),
+            DataType::Decimal64(_, _) => todo!(),
+            DataType::Decimal128(_, _) => todo!(),
+            DataType::Decimal256(_, _) => todo!(),
+            DataType::Map(_, _) => todo!(),
+            DataType::RunEndEncoded(_, _) => todo!(),
+        };
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::sync::Arc;
+
+    use arrow::array::{ArrayRef, BooleanArray, Int32Array, RecordBatch};
+    use arrow_schema::{DataType, Field, Schema};
+    use pretty_assertions::assert_eq;
+    use sha2::Sha256;
+
+    use crate::hasher::ArrowDigester;
+
+    #[test]
+    fn boolean_array_hashing() {
+        let bool_array = BooleanArray::from(vec![Some(true), None, Some(false), Some(true)]);
+        let hash = hex::encode(ArrowDigester::<Sha256>::hash_array(&bool_array));
+        println!("{}", hash);
+        assert_eq!(
+            hash,
+            "d7b7a73916d3f0c693ebcfa94fe2eee163d31a38ba8fe44ef81c5ffbff50c9be"
+        );
+    }
+
+    /// Test int32 array hashing which is really meant to test fixed size element array hashing
+    #[test]
+    fn int32_array_hashing() {
+        let int_array = arrow::array::Int32Array::from(vec![Some(42), None, Some(-7), Some(0)]);
+        let hash = hex::encode(ArrowDigester::<Sha256>::hash_array(&int_array));
+        println!("{}", hash);
+        assert_eq!(
+            hash,
+            "bb36e54f5e2d937a05bb716a8d595f1c8da67fda48feeb7ab5b071a69e63d648"
+        );
+    }
+
+    #[test]
+    fn commutative_tables() {
+        let uids = Arc::new(Int32Array::from(vec![Some(1), Some(2), Some(3), Some(4)])) as ArrayRef;
+        let fake_data = Arc::new(BooleanArray::from(vec![
+            Some(true),
+            Some(false),
+            None,
+            Some(true),
+        ])) as ArrayRef;
+
+        // Create two record batches with same data but different order
+        let batch1 = RecordBatch::try_new(
Arc::new(Schema::new(vec![ + Field::new("uids", DataType::Int32, false), + Field::new("flags", DataType::Boolean, true), + ])), + vec![uids.clone(), fake_data.clone()], + ); + + let batch2 = RecordBatch::try_new( + Arc::new(Schema::new(vec![ + Field::new("flags", DataType::Boolean, true), + Field::new("uids", DataType::Int32, false), + ])), + vec![fake_data.clone(), uids.clone()], + ); + + // Hash both record batches + assert_eq!( + ArrowDigester::::hash_record_batch(batch1.as_ref().unwrap()), + ArrowDigester::::hash_record_batch(batch2.as_ref().unwrap()) + ); + } + + #[test] + fn record_batch_hashing() { + let schema = Arc::new(Schema::new(vec![ + Field::new("uids", DataType::Int32, false), + Field::new("flags", DataType::Boolean, true), + ])); + + // Create two record batches with different data to simulate loading at different times + let uids = Arc::new(Int32Array::from(vec![Some(1), Some(2), Some(3), Some(4)])) as ArrayRef; + let fake_data = Arc::new(BooleanArray::from(vec![ + Some(true), + Some(false), + None, + Some(true), + ])); + + let batch1 = RecordBatch::try_new(schema.clone(), vec![uids, fake_data]).unwrap(); + + let uids2 = + Arc::new(Int32Array::from(vec![Some(5), Some(6), Some(7), Some(8)])) as ArrayRef; + let fake_data2 = Arc::new(BooleanArray::from(vec![ + Some(false), + Some(true), + Some(true), + None, + ])); + + let batch2 = RecordBatch::try_new(schema.clone(), vec![uids2, fake_data2]).unwrap(); + + // Hash both record batches + let mut digester = ArrowDigester::::new((*schema).clone()); + digester.update(batch1); + digester.update(batch2); + assert_eq!( + hex::encode(digester.finalize()), + "9ba289655f0c7dd359ababc5a6f6188b352e45483623fbbf8b967723e2b798f8" + ); + } +} diff --git a/src/lib.rs b/src/lib.rs index e1de1e6..c2f7f2c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -4,3 +4,4 @@ extern crate uniffi as uniffi_external; uniffi_external::setup_scaffolding!(); mod pyarrow; +mod hasher; diff --git a/src/pyarrow.rs b/src/pyarrow.rs index 90d1b68..ce51a35 100644 --- a/src/pyarrow.rs +++ b/src/pyarrow.rs @@ -1,7 +1,5 @@ use arrow::array::{RecordBatch, StructArray}; use arrow::ffi::{FFI_ArrowArray, FFI_ArrowSchema, from_ffi}; -use arrow_digest::{RecordDigest, RecordDigestV0}; -use sha2::Sha256; /// Process an Arrow table via C Data Interface /// @@ -16,20 +14,9 @@ pub fn process_arrow_table(array_ptr: u64, schema_ptr: u64) -> Vec { from_ffi(ffi_array, &ffi_schema).expect("Failed to import Arrow array data") }; - // // Convert FFI schema to Arrow Schema - // let schema = - // Schema::try_from(&ffi_schema).expect("Failed to convert FFI schema to Arrow schema"); - - // Import array data from FFI - - // Create StructArray from the array data - let struct_array = StructArray::from(array_data); - // Create RecordBatch from StructArray - let record_batch = RecordBatch::from(struct_array); + let _record_batch = RecordBatch::from(StructArray::from(array_data)); // Hash the table - let hash = RecordDigestV0::::digest(&record_batch); - // format!("{:x}", hash) - hash.to_vec() + Vec::new() } From 28761b3723e603d71b6a378a3e4423c981865a60 Mon Sep 17 00:00:00 2001 From: synicix Date: Thu, 6 Nov 2025 09:11:27 +0000 Subject: [PATCH 03/53] Fix clippy errors --- src/hasher.rs | 4 ++-- src/lib.rs | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/hasher.rs b/src/hasher.rs index dc04a42..37ddc0e 100644 --- a/src/hasher.rs +++ b/src/hasher.rs @@ -10,14 +10,14 @@ use postcard::to_vec; const NULL_BYTES: &[u8] = b"NULL"; -struct ArrowDigester { +pub struct ArrowDigester { 
schema: Schema, schema_digest: Vec, fields_digest_buffer: BTreeMap, } impl ArrowDigester { - fn new(schema: Schema) -> Self { + pub fn new(schema: Schema) -> Self { // Hash the schema first let schema_digest = Self::hash_schema(&schema); diff --git a/src/lib.rs b/src/lib.rs index c2f7f2c..ea6e98e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -3,5 +3,5 @@ extern crate uniffi as uniffi_external; uniffi_external::setup_scaffolding!(); +pub mod hasher; mod pyarrow; -mod hasher; From e6336c58c0296fce0f3bc86ef50b643ac56084e5 Mon Sep 17 00:00:00 2001 From: synicix Date: Fri, 7 Nov 2025 07:39:38 +0000 Subject: [PATCH 04/53] Add list hashing --- notebooks/Example Python Usage.ipynb | 16 +- src/hasher.rs | 221 ++++++++++++++++++--------- 2 files changed, 155 insertions(+), 82 deletions(-) diff --git a/notebooks/Example Python Usage.ipynb b/notebooks/Example Python Usage.ipynb index ba23c98..d08c3ce 100644 --- a/notebooks/Example Python Usage.ipynb +++ b/notebooks/Example Python Usage.ipynb @@ -30,6 +30,7 @@ "import ctypes\n", "import arrow_hasher as ah\n", "\n", + "\n", "def hash_arrow_table(table: pa.Table):\n", " # Covert table to record batch first (so we can extract the pointers), since the default behavior is 1 batch, we can just get the first element\n", " # After that we can extract the PyCapsules\n", @@ -40,18 +41,21 @@ " PyCapsule_GetPointer.argtypes = [ctypes.py_object, ctypes.c_char_p]\n", " PyCapsule_GetPointer.restype = ctypes.c_void_p\n", "\n", - " return ah.process_arrow_table(PyCapsule_GetPointer(array_capsule, b\"arrow_array\"), PyCapsule_GetPointer(schema_capsule, b\"arrow_schema\"))\n", + " return ah.process_arrow_table(\n", + " PyCapsule_GetPointer(array_capsule, b\"arrow_array\"),\n", + " PyCapsule_GetPointer(schema_capsule, b\"arrow_schema\"),\n", + " )\n", + "\n", "\n", "# Create a simple Arrow table\n", "data = {\n", - " 'id': [1, 2, 3, 4, 5],\n", - " 'name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],\n", - " 'value': [10.5, 20.3, 15.7, 30.2, 25.8]\n", + " \"id\": [1, 2, 3, 4, 5],\n", + " \"name\": [\"Alice\", \"Bob\", \"Charlie\", \"David\", \"Eve\"],\n", + " \"value\": [10.5, 20.3, 15.7, 30.2, 25.8],\n", "}\n", "table = pa.table(data)\n", "\n", - "hash_arrow_table(table)\n", - "\n" + "hash_arrow_table(table)" ] }, { diff --git a/src/hasher.rs b/src/hasher.rs index 37ddc0e..3535cab 100644 --- a/src/hasher.rs +++ b/src/hasher.rs @@ -1,7 +1,10 @@ use std::collections::BTreeMap; use arrow::{ - array::{Array, BooleanArray, RecordBatch, StructArray}, + array::{ + Array, BooleanArray, GenericListArray, LargeListArray, ListArray, OffsetSizeTrait, + RecordBatch, StructArray, + }, datatypes::{DataType, Schema}, }; use arrow_schema::Field; @@ -49,61 +52,24 @@ impl ArrowDigester { digester.finalize() } - /// Internal recursive function to extract field names from nested structs effectively flattening the schema - /// The format is parent_child_grandchild_etc... 
for nested fields and will be stored in fields_digest_buffer - fn extract_fields_name( - field: &Field, - parent_field_name: &str, - fields_digest_buffer: &mut BTreeMap, - ) { - // Check if field is a nested type of struct - match field.data_type() { - DataType::Struct(fields) => { - // We will add fields in alphabetical order - fields.into_iter().for_each(|field| { - Self::extract_fields_name(field, parent_field_name, fields_digest_buffer); - }); - } - _ => { - // Base case, just add the field name - let field_name = if parent_field_name.is_empty() { - field.name().to_string() - } else { - format!("{}_{}", parent_field_name, field.name()) - }; - - fields_digest_buffer.insert(field_name, D::new()); - } - } - } + /// This will consume the ArrowDigester and produce the final combined digest where the schema + /// digest is fed in first, followed by each field digest in alphabetical order of field names + pub fn finalize(self) -> Vec { + // Finalize all the sub digest and combine them into a single digest + let mut final_digest = D::new(); - fn hash_fixed_size_array(array: &dyn Array, digest: &mut D, element_size: i32) { - let array_data = array.to_data(); + // digest the schema first + final_digest.update(&self.schema_digest); - // Get the slice with offset accounted for if there is any - let slice = array_data.buffers()[0] - .as_slice() - .get(array_data.offset() * element_size as usize..) - .expect("Failed to get buffer slice for FixedSizeBinaryArray"); + // Then digest each field digest in order + self.fields_digest_buffer + .into_iter() + .for_each(|(_, digest)| { + let field_hash = digest.finalize(); + final_digest.update(&field_hash); + }); - // Deal with null - match array_data.nulls() { - Some(null_buffer) => { - // There are nulls, so we need to incrementally hash each value - for i in 0..array_data.len() { - if null_buffer.is_valid(i) { - let data_pos = i * element_size as usize; - digest.update(&slice[data_pos..data_pos + element_size as usize]); - } else { - digest.update(NULL_BYTES); - } - } - } - None => { - // No nulls, we can hash the entire buffer directly - digest.update(slice); - } - } + final_digest.finalize().to_vec() } /// Serialize the schema into a BTreeMap for field name and its digest @@ -191,26 +157,6 @@ impl ArrowDigester { } } - /// This will consume the ArrowDigester and produce the final combined digest where the schema - /// digest is fed in first, followed by each field digest in alphabetical order of field names - pub fn finalize(self) -> Vec { - // Finalize all the sub digest and combine them into a single digest - let mut final_digest = D::new(); - - // digest the schema first - final_digest.update(&self.schema_digest); - - // Then digest each field digest in order - self.fields_digest_buffer - .into_iter() - .for_each(|(_, digest)| { - let field_hash = digest.finalize(); - final_digest.update(&field_hash); - }); - - final_digest.finalize().to_vec() - } - fn array_digest_update(data_type: &DataType, array: &dyn Array, digest: &mut D) { match data_type { DataType::Null => todo!(), @@ -251,10 +197,28 @@ impl ArrowDigester { DataType::Utf8 => todo!(), DataType::LargeUtf8 => todo!(), DataType::Utf8View => todo!(), - DataType::List(_) => todo!(), + DataType::List(field) => { + Self::hash_list_array( + array + .as_any() + .downcast_ref::() + .expect("Failed to downcast to ListArray"), + field.data_type(), + digest, + ); + } DataType::ListView(_) => todo!(), DataType::FixedSizeList(_, _) => todo!(), - DataType::LargeList(_) => todo!(), + 
DataType::LargeList(field) => { + Self::hash_list_array( + array + .as_any() + .downcast_ref::() + .expect("Failed to downcast to LargeListArray"), + field.data_type(), + digest, + ); + } DataType::LargeListView(_) => todo!(), DataType::Struct(_) => todo!(), DataType::Union(_, _) => todo!(), @@ -267,13 +231,100 @@ impl ArrowDigester { DataType::RunEndEncoded(_, _) => todo!(), }; } + + fn hash_fixed_size_array(array: &dyn Array, digest: &mut D, element_size: i32) { + let array_data = array.to_data(); + + // Get the slice with offset accounted for if there is any + let slice = array_data.buffers()[0] + .as_slice() + .get(array_data.offset() * element_size as usize..) + .expect("Failed to get buffer slice for FixedSizeBinaryArray"); + + // Deal with null + match array_data.nulls() { + Some(null_buffer) => { + // There are nulls, so we need to incrementally hash each value + for i in 0..array_data.len() { + if null_buffer.is_valid(i) { + let data_pos = i * element_size as usize; + digest.update(&slice[data_pos..data_pos + element_size as usize]); + } else { + digest.update(NULL_BYTES); + } + } + } + None => { + // No nulls, we can hash the entire buffer directly + digest.update(slice); + } + } + } + + fn hash_list_array( + array: &GenericListArray, + field_data_type: &DataType, + digest: &mut D, + ) { + match array.nulls() { + Some(null_buf) => { + for i in 0..array.len() { + if null_buf.is_valid(i) { + Self::array_digest_update( + &field_data_type, + array.value(i).as_ref(), + digest, + ); + } else { + digest.update(NULL_BYTES); + } + } + } + None => { + for i in 0..array.len() { + Self::array_digest_update(&field_data_type, array.value(i).as_ref(), digest); + } + } + } + } + + /// Internal recursive function to extract field names from nested structs effectively flattening the schema + /// The format is parent_child_grandchild_etc... 
for nested fields and will be stored in fields_digest_buffer + fn extract_fields_name( + field: &Field, + parent_field_name: &str, + fields_digest_buffer: &mut BTreeMap, + ) { + // Check if field is a nested type of struct + match field.data_type() { + DataType::Struct(fields) => { + // We will add fields in alphabetical order + fields.into_iter().for_each(|field| { + Self::extract_fields_name(field, parent_field_name, fields_digest_buffer); + }); + } + _ => { + // Base case, just add the field name + let field_name = if parent_field_name.is_empty() { + field.name().to_string() + } else { + format!("{}_{}", parent_field_name, field.name()) + }; + + fields_digest_buffer.insert(field_name, D::new()); + } + } + } } #[cfg(test)] mod tests { use std::sync::Arc; - use arrow::array::{ArrayRef, BooleanArray, Int32Array, RecordBatch}; + use arrow::{ + array::{ArrayRef, BooleanArray, Int32Array, RecordBatch}, + datatypes::Int32Type, + }; use arrow_schema::{DataType, Field, Schema}; use pretty_assertions::assert_eq; use sha2::Sha256; @@ -303,6 +354,24 @@ mod tests { ); } + // List array hashing test + #[test] + fn list_array_hashing() { + let list_array = arrow::array::ListArray::from_iter_primitive::(vec![ + Some(vec![Some(1), Some(2), Some(3)]), + None, + Some(vec![Some(4), Some(5)]), + Some(vec![Some(6)]), + ]); + + let hash = hex::encode(ArrowDigester::::hash_array(&list_array)); + println!("{}", hash); + assert_eq!( + hash, + "d30c8845c58f71bcec4910c65a91328af2cc86d26001662270da3a3d5222dd36" + ); + } + #[test] fn commutative_tables() { let uids = Arc::new(Int32Array::from(vec![Some(1), Some(2), Some(3), Some(4)])) as ArrayRef; From 1fc9bf591dde10a6f20b30ed3c6a62ef31e608ce Mon Sep 17 00:00:00 2001 From: synicix Date: Fri, 7 Nov 2025 08:26:15 +0000 Subject: [PATCH 05/53] Add decimal hashing --- src/hasher.rs | 66 +++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 62 insertions(+), 4 deletions(-) diff --git a/src/hasher.rs b/src/hasher.rs index 3535cab..da5858b 100644 --- a/src/hasher.rs +++ b/src/hasher.rs @@ -223,10 +223,12 @@ impl ArrowDigester { DataType::Struct(_) => todo!(), DataType::Union(_, _) => todo!(), DataType::Dictionary(_, _) => todo!(), - DataType::Decimal32(_, _) => todo!(), - DataType::Decimal64(_, _) => todo!(), - DataType::Decimal128(_, _) => todo!(), - DataType::Decimal256(_, _) => todo!(), + DataType::Decimal32(precision, scale) + | DataType::Decimal64(precision, scale) + | DataType::Decimal128(precision, scale) + | DataType::Decimal256(precision, scale) => { + Self::hash_decimal(precision, scale, array, digest) + } DataType::Map(_, _) => todo!(), DataType::RunEndEncoded(_, _) => todo!(), }; @@ -288,6 +290,21 @@ impl ArrowDigester { } } + fn hash_decimal(precision: &u8, scale: &i8, array: &dyn Array, digest: &mut D) { + // Include the precision and scale in the hash + digest.update([*precision]); + digest.update([*scale as u8]); + + // Hash the underlying fixed size array based on precision + match precision { + 1..=9 => Self::hash_fixed_size_array(array, digest, 4), + 10..=18 => Self::hash_fixed_size_array(array, digest, 8), + 19..=38 => Self::hash_fixed_size_array(array, digest, 16), + 39..=76 => Self::hash_fixed_size_array(array, digest, 32), + _ => panic!("Unsupported decimal precision: {}", precision), + } + } + /// Internal recursive function to extract field names from nested structs effectively flattening the schema /// The format is parent_child_grandchild_etc... 
for nested fields and will be stored in fields_digest_buffer fn extract_fields_name( @@ -330,6 +347,7 @@ mod tests { use sha2::Sha256; use crate::hasher::ArrowDigester; + use arrow::array::Decimal128Array; #[test] fn boolean_array_hashing() { @@ -372,6 +390,46 @@ mod tests { ); } + // Test all types of decimal hashing + #[test] + fn decimal_array_hashing() { + // Test Decimal32 (precision 1-9) + let decimal32_array = + Decimal128Array::from_iter(vec![Some(123), None, Some(-456), Some(0)]) + .with_precision_and_scale(9, 2) + .unwrap(); + + assert_eq!( + hex::encode(ArrowDigester::::hash_array(&decimal32_array)), + "bd639e8df756f0bd194f18572e89ea180307e6d46e88d96ade52b61e196c3268" + ); + + // Test Decimal64 (precision 10-18) + let decimal64_array = + Decimal128Array::from_iter(vec![Some(1234567890123), None, Some(-9876543210), Some(0)]) + .with_precision_and_scale(15, 3) + .unwrap(); + assert_eq!( + hex::encode(ArrowDigester::::hash_array(&decimal64_array)), + "ca1f8a6fb179ddafad1e02738ad2d869da187c72a9b815d8e12a85692525d231" + ); + + // Test Decimal128 (precision 19-38) + let decimal128_array = Decimal128Array::from_iter(vec![ + Some(123456789012345678901234567), + None, + Some(-987654321098765432109876543), + Some(0), + ]) + .with_precision_and_scale(38, 5) + .unwrap(); + assert_eq!( + hex::encode(ArrowDigester::::hash_array(&decimal128_array)), + "d2a1a2d8c87193032d46a541405e1bf60124d08a7c431ce3fe55f26508b400f3" + ); + // Verify that different precisions/scales produce different hashe + } + #[test] fn commutative_tables() { let uids = Arc::new(Int32Array::from(vec![Some(1), Some(2), Some(3), Some(4)])) as ArrayRef; From 72b1fb8935846d686e0440b3203ea9cbb1532d2d Mon Sep 17 00:00:00 2001 From: synicix Date: Fri, 7 Nov 2025 08:27:10 +0000 Subject: [PATCH 06/53] Rename hasher to arrow_digester --- src/{hasher.rs => arrow_digester.rs} | 2 +- src/lib.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) rename src/{hasher.rs => arrow_digester.rs} (99%) diff --git a/src/hasher.rs b/src/arrow_digester.rs similarity index 99% rename from src/hasher.rs rename to src/arrow_digester.rs index da5858b..98e448b 100644 --- a/src/hasher.rs +++ b/src/arrow_digester.rs @@ -346,7 +346,7 @@ mod tests { use pretty_assertions::assert_eq; use sha2::Sha256; - use crate::hasher::ArrowDigester; + use crate::arrow_digester::ArrowDigester; use arrow::array::Decimal128Array; #[test] diff --git a/src/lib.rs b/src/lib.rs index ea6e98e..a713335 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -3,5 +3,5 @@ extern crate uniffi as uniffi_external; uniffi_external::setup_scaffolding!(); -pub mod hasher; +pub mod arrow_digester; mod pyarrow; From e377ee8f039ed2ffa9d699fe285a8c37b96bac82 Mon Sep 17 00:00:00 2001 From: synicix Date: Sat, 8 Nov 2025 02:20:40 +0000 Subject: [PATCH 07/53] Add String hashing --- src/arrow_digester.rs | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/src/arrow_digester.rs b/src/arrow_digester.rs index 98e448b..b47f626 100644 --- a/src/arrow_digester.rs +++ b/src/arrow_digester.rs @@ -272,11 +272,7 @@ impl ArrowDigester { Some(null_buf) => { for i in 0..array.len() { if null_buf.is_valid(i) { - Self::array_digest_update( - &field_data_type, - array.value(i).as_ref(), - digest, - ); + Self::array_digest_update(field_data_type, array.value(i).as_ref(), digest); } else { digest.update(NULL_BYTES); } @@ -284,7 +280,7 @@ impl ArrowDigester { } None => { for i in 0..array.len() { - Self::array_digest_update(&field_data_type, array.value(i).as_ref(), digest); + 
Self::array_digest_update(field_data_type, array.value(i).as_ref(), digest); } } } @@ -427,7 +423,6 @@ mod tests { hex::encode(ArrowDigester::::hash_array(&decimal128_array)), "d2a1a2d8c87193032d46a541405e1bf60124d08a7c431ce3fe55f26508b400f3" ); - // Verify that different precisions/scales produce different hashe } #[test] From df3a21b9bba98bd0891d57201e8d67a98d0d3956 Mon Sep 17 00:00:00 2001 From: synicix Date: Sat, 8 Nov 2025 03:04:32 +0000 Subject: [PATCH 08/53] Add binary and string hashing --- src/arrow_digester.rs | 180 +++++++++++++++++++++++++++++++++++------- 1 file changed, 152 insertions(+), 28 deletions(-) diff --git a/src/arrow_digester.rs b/src/arrow_digester.rs index b47f626..810b59d 100644 --- a/src/arrow_digester.rs +++ b/src/arrow_digester.rs @@ -2,8 +2,9 @@ use std::collections::BTreeMap; use arrow::{ array::{ - Array, BooleanArray, GenericListArray, LargeListArray, ListArray, OffsetSizeTrait, - RecordBatch, StructArray, + Array, BinaryArray, BooleanArray, GenericBinaryArray, GenericListArray, GenericStringArray, + LargeBinaryArray, LargeListArray, LargeStringArray, ListArray, OffsetSizeTrait, + RecordBatch, StringArray, StructArray, }, datatypes::{DataType, Schema}, }; @@ -172,30 +173,56 @@ impl ArrowDigester { None => digest.update(NULL_BYTES), }); } - DataType::Int8 => Self::hash_fixed_size_array(array, digest, 1), - DataType::Int16 => Self::hash_fixed_size_array(array, digest, 2), - DataType::Int32 => Self::hash_fixed_size_array(array, digest, 4), - DataType::Int64 => Self::hash_fixed_size_array(array, digest, 8), - DataType::UInt8 => Self::hash_fixed_size_array(array, digest, 1), - DataType::UInt16 => Self::hash_fixed_size_array(array, digest, 2), - DataType::UInt32 => Self::hash_fixed_size_array(array, digest, 4), - DataType::UInt64 => Self::hash_fixed_size_array(array, digest, 8), - DataType::Float16 => Self::hash_fixed_size_array(array, digest, 2), - DataType::Float32 => Self::hash_fixed_size_array(array, digest, 4), - DataType::Float64 => Self::hash_fixed_size_array(array, digest, 8), + DataType::Int8 => Self::hash_fixed_size_array(array, digest, &1), + DataType::Int16 => Self::hash_fixed_size_array(array, digest, &2), + DataType::Int32 => Self::hash_fixed_size_array(array, digest, &4), + DataType::Int64 => Self::hash_fixed_size_array(array, digest, &8), + DataType::UInt8 => Self::hash_fixed_size_array(array, digest, &1), + DataType::UInt16 => Self::hash_fixed_size_array(array, digest, &2), + DataType::UInt32 => Self::hash_fixed_size_array(array, digest, &4), + DataType::UInt64 => Self::hash_fixed_size_array(array, digest, &8), + DataType::Float16 => Self::hash_fixed_size_array(array, digest, &2), + DataType::Float32 => Self::hash_fixed_size_array(array, digest, &4), + DataType::Float64 => Self::hash_fixed_size_array(array, digest, &8), DataType::Timestamp(_, _) => todo!(), - DataType::Date32 => Self::hash_fixed_size_array(array, digest, 4), - DataType::Date64 => Self::hash_fixed_size_array(array, digest, 8), + DataType::Date32 => Self::hash_fixed_size_array(array, digest, &4), + DataType::Date64 => Self::hash_fixed_size_array(array, digest, &8), DataType::Time32(_) => todo!(), DataType::Time64(_) => todo!(), DataType::Duration(_) => todo!(), DataType::Interval(_) => todo!(), - DataType::Binary => todo!(), - DataType::FixedSizeBinary(_) => todo!(), - DataType::LargeBinary => todo!(), + DataType::Binary => Self::hash_binary_array( + array + .as_any() + .downcast_ref::() + .expect("Failed to downcast to BinaryArray"), + digest, + ), + 
DataType::FixedSizeBinary(element_size) => { + Self::hash_fixed_size_array(array, digest, element_size) + } + DataType::LargeBinary => Self::hash_binary_array( + array + .as_any() + .downcast_ref::() + .expect("Failed to downcast to LargeBinaryArray"), + digest, + ), DataType::BinaryView => todo!(), - DataType::Utf8 => todo!(), - DataType::LargeUtf8 => todo!(), + DataType::Utf8 => Self::hash_string_array( + array + .as_any() + .downcast_ref::() + .expect("Failed to downcast to StringArray"), + digest, + ), + DataType::LargeUtf8 => Self::hash_string_array( + array + .as_any() + .downcast_ref::() + .expect("Failed to downcast to LargeStringArray"), + digest, + ), DataType::Utf8View => todo!(), DataType::List(field) => { Self::hash_list_array( @@ -234,13 +261,14 @@ impl ArrowDigester { }; } - fn hash_fixed_size_array(array: &dyn Array, digest: &mut D, element_size: i32) { + fn hash_fixed_size_array(array: &dyn Array, digest: &mut D, element_size: &i32) { let array_data = array.to_data(); + let element_size_usize = *element_size as usize; // Get the slice with offset accounted for if there is any let slice = array_data.buffers()[0] .as_slice() - .get(array_data.offset() * element_size as usize..) + .get(array_data.offset() * element_size_usize..) .expect("Failed to get buffer slice for FixedSizeBinaryArray"); // Deal with null @@ -249,8 +277,8 @@ impl ArrowDigester { // There are nulls, so we need to incrementally hash each value for i in 0..array_data.len() { if null_buffer.is_valid(i) { - let data_pos = i * element_size as usize; - digest.update(&slice[data_pos..data_pos + element_size as usize]); + let data_pos = i * element_size_usize; + digest.update(&slice[data_pos..data_pos + element_size_usize]); } else { digest.update(NULL_BYTES); } @@ -263,6 +291,48 @@ impl ArrowDigester { } } + fn hash_binary_array(array: &GenericBinaryArray, digest: &mut D) { + match array.nulls() { + Some(null_buf) => { + for i in 0..array.len() { + if null_buf.is_valid(i) { + let value = array.value(i); + digest.update(value); + } else { + digest.update(NULL_BYTES); + } + } + } + None => { + for i in 0..array.len() { + let value = array.value(i); + digest.update(value); + } + } + } + } + + fn hash_string_array(array: &GenericStringArray, digest: &mut D) { + match array.nulls() { + Some(null_buf) => { + for i in 0..array.len() { + if null_buf.is_valid(i) { + let value = array.value(i); + digest.update(value.as_bytes()); + } else { + digest.update(NULL_BYTES); + } + } + } + None => { + for i in 0..array.len() { + let value = array.value(i); + digest.update(value.as_bytes()); + } + } + } + } + fn hash_list_array( array: &GenericListArray, field_data_type: &DataType, @@ -293,10 +363,10 @@ impl ArrowDigester { // Hash the underlying fixed size array based on precision match precision { - 1..=9 => Self::hash_fixed_size_array(array, digest, 4), - 10..=18 => Self::hash_fixed_size_array(array, digest, 8), - 19..=38 => Self::hash_fixed_size_array(array, digest, 16), - 39..=76 => Self::hash_fixed_size_array(array, digest, 32), + 1..=9 => Self::hash_fixed_size_array(array, digest, &4), + 10..=18 => Self::hash_fixed_size_array(array, digest, &8), + 19..=38 => Self::hash_fixed_size_array(array, digest, &16), + 39..=76 => Self::hash_fixed_size_array(array, digest, &32), _ => panic!("Unsupported decimal precision: {}", precision), } } @@ -368,6 +438,60 @@ mod tests { ); } + /// Test binary array hashing + #[test] + fn binary_array_hashing() { + let binary_array = arrow::array::BinaryArray::from(vec![ + Some(b"hello".as_ref()), + 
None, + Some(b"world".as_ref()), + Some(b"".as_ref()), + ]); + let hash = hex::encode(ArrowDigester::::hash_array(&binary_array)); + assert_eq!( + hash, + "078347d3063fb5bbe0bdbd3315cf8e5e140733ea34e6b73cbc0838b60a9c8012" + ); + + // Test large binary array with same data to ensure consistency + let large_binary_array = arrow::array::LargeBinaryArray::from(vec![ + Some(b"hello".as_ref()), + None, + Some(b"world".as_ref()), + Some(b"".as_ref()), + ]); + + assert_eq!( + hex::encode(ArrowDigester::::hash_array(&large_binary_array)), + hash + ); + } + + // Test String hashing + #[test] + fn string_array_hashing() { + let string_array = + arrow::array::StringArray::from(vec![Some("hello"), None, Some("world"), Some("")]); + let hash = hex::encode(ArrowDigester::::hash_array(&string_array)); + assert_eq!( + hash, + "078347d3063fb5bbe0bdbd3315cf8e5e140733ea34e6b73cbc0838b60a9c8012" + ); + + // Test large string array with same data to ensure consistency + let large_string_array = arrow::array::LargeStringArray::from(vec![ + Some("hello"), + None, + Some("world"), + Some(""), + ]); + + assert_eq!( + hex::encode(ArrowDigester::::hash_array(&large_string_array)), + hash + ); + } + // List array hashing test #[test] fn list_array_hashing() { From 1f6577aa53c70930336d9e15981c08c5ae0e647e Mon Sep 17 00:00:00 2001 From: synicix Date: Sat, 8 Nov 2025 03:34:31 +0000 Subject: [PATCH 09/53] Add time hashing --- src/arrow_digester.rs | 65 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 62 insertions(+), 3 deletions(-) diff --git a/src/arrow_digester.rs b/src/arrow_digester.rs index 810b59d..ffe48bb 100644 --- a/src/arrow_digester.rs +++ b/src/arrow_digester.rs @@ -8,7 +8,7 @@ use arrow::{ }, datatypes::{DataType, Schema}, }; -use arrow_schema::Field; +use arrow_schema::{Field, TimeUnit}; use digest::Digest; use postcard::to_vec; @@ -187,8 +187,8 @@ impl ArrowDigester { DataType::Timestamp(_, _) => todo!(), DataType::Date32 => Self::hash_fixed_size_array(array, digest, &4), DataType::Date64 => Self::hash_fixed_size_array(array, digest, &8), - DataType::Time32(_) => todo!(), - DataType::Time64(_) => todo!(), + DataType::Time32(time_unit) => Self::hash_time_array(array, time_unit, digest, &4), + DataType::Time64(time_unit) => Self::hash_time_array(array, time_unit, digest, &8), DataType::Duration(_) => todo!(), DataType::Interval(_) => todo!(), DataType::Binary => Self::hash_binary_array( @@ -312,6 +312,24 @@ impl ArrowDigester { } } + fn hash_time_array( + array: &dyn Array, + time_unit: &TimeUnit, + digest: &mut D, + element_size: &i32, + ) { + // We need to update the digest with the time unit first to ensure different time units produce different hashes + digest.update([match time_unit { + TimeUnit::Second => 0u8, + TimeUnit::Millisecond => 1u8, + TimeUnit::Microsecond => 2u8, + TimeUnit::Nanosecond => 3u8, + }]); + + // Now hash the underlying fixed size array based on time unit + Self::hash_fixed_size_array(array, digest, element_size); + } + fn hash_string_array(array: &GenericStringArray, digest: &mut D) { match array.nulls() { Some(null_buf) => { @@ -438,6 +456,47 @@ mod tests { ); } + /// Test time array hashing + #[test] + fn time32_array_hashing() { + let time_array = + arrow::array::Time32SecondArray::from(vec![Some(1000), None, Some(5000), Some(0)]); + let hash = hex::encode(ArrowDigester::::hash_array(&time_array)); + println!("{}", hash); + assert_eq!( + hash, + "b5d70eca0650399a9b00440e3cd9985e58b0f033d446bdd5947f96a62397002a" + ); + } + + #[test] + fn time64_array_hashing() { + let 
time_array = arrow::array::Time64MicrosecondArray::from(vec![ + Some(1000000), + None, + Some(5000000), + Some(0), + ]); + let hash = hex::encode(ArrowDigester::::hash_array(&time_array)); + println!("{}", hash); + assert_eq!( + hash, + "1f0847660ea421c266f226293d2f0c54ea5de0c168ac7e4bebfabf6d348a6d18" + ); + } + + #[test] + fn time_array_different_units_produce_different_hashes() { + let time32_second = arrow::array::Time32SecondArray::from(vec![Some(1000), Some(2000)]); + let time32_millis = + arrow::array::Time32MillisecondArray::from(vec![Some(1000), Some(2000)]); + + let hash_second = hex::encode(ArrowDigester::::hash_array(&time32_second)); + let hash_millis = hex::encode(ArrowDigester::::hash_array(&time32_millis)); + + assert_ne!(hash_second, hash_millis); + } + /// Test binary array hashing #[test] fn binary_array_hashing() { From 1fff3448e0c3b4f6a97782da22041c39f40cd4c4 Mon Sep 17 00:00:00 2001 From: synicix Date: Thu, 13 Nov 2025 08:09:56 +0000 Subject: [PATCH 10/53] Change to new custom hasher and remove old one. --- Cargo.toml | 1 - src/pyarrow.rs | 8 ++++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 92c47cc..1838d0e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -5,7 +5,6 @@ edition = "2024" [dependencies] arrow = { version = "57.0.0", features = ["ffi"] } -arrow-digest = "56.0.0" arrow-schema = { version = "57.0.0", features = ["serde"] } digest = "0.10.7" hex = "0.4.3" diff --git a/src/pyarrow.rs b/src/pyarrow.rs index ce51a35..00ecdec 100644 --- a/src/pyarrow.rs +++ b/src/pyarrow.rs @@ -1,5 +1,8 @@ use arrow::array::{RecordBatch, StructArray}; use arrow::ffi::{FFI_ArrowArray, FFI_ArrowSchema, from_ffi}; +use sha2::Sha256; + +use crate::arrow_digester::ArrowDigester; /// Process an Arrow table via C Data Interface /// @@ -14,9 +17,6 @@ pub fn process_arrow_table(array_ptr: u64, schema_ptr: u64) -> Vec { from_ffi(ffi_array, &ffi_schema).expect("Failed to import Arrow array data") }; - // Create RecordBatch from StructArray - let _record_batch = RecordBatch::from(StructArray::from(array_data)); - // Hash the table - Vec::new() + ArrowDigester::::hash_record_batch(&RecordBatch::from(StructArray::from(array_data))) } From 5a4371a6d4b8d51091fa5e29b72f95b9e5fe9e38 Mon Sep 17 00:00:00 2001 From: synicix Date: Thu, 13 Nov 2025 10:29:13 +0000 Subject: [PATCH 11/53] Add rust tests --- .github/workflows/clippy.yml | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 .github/workflows/clippy.yml diff --git a/.github/workflows/clippy.yml b/.github/workflows/clippy.yml new file mode 100644 index 0000000..1171147 --- /dev/null +++ b/.github/workflows/clippy.yml @@ -0,0 +1,21 @@ +name: rust_checks +on: + - pull_request +jobs: + rust-syntax-style-format-and-integration: + runs-on: ubuntu-latest + env: + CARGO_TERM_COLOR: always + steps: + - uses: actions/checkout@v4 + - name: Install Rust + components + uses: actions-rust-lang/setup-rust-toolchain@v1 + with: + toolchain: 1.90.0 + components: rustfmt,clippy + - name: Install code coverage + uses: taiki-e/install-action@cargo-llvm-cov + - name: Run syntax and style tests + run: cargo clippy --all-targets -- -D warnings + - name: Run format test + run: cargo fmt --check From 934e6e7b23c213f132c07dd9157f0697ba4d4261 Mon Sep 17 00:00:00 2001 From: synicix Date: Fri, 14 Nov 2025 03:30:28 +0000 Subject: [PATCH 12/53] Fix all clippy recommendations --- Cargo.toml | 63 ++++++++- src/arrow_digester.rs | 297 +++++++++++++++++++++++------------------- src/pyarrow.rs | 16 ++- 3 
files changed, 238 insertions(+), 138 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 1838d0e..94ce866 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,7 +1,14 @@ [package] name = "starfix" -version = "0.1.3" -edition = "2024" +version = "0.0.0" +edition = '2024' +description = "Package for hashing Arrow's data structures uniquely for identifying and comparing data efficiently." +authors = ["synicix "] +readme = "README.md" +repository = "https://github.com/nauticalab/starfix" +license = "MIT OR Apache-2.0" +keywords = ["arrow", "hashing"] +categories = ["hashing", "arrow"] [dependencies] arrow = { version = "57.0.0", features = ["ffi"] } @@ -28,3 +35,55 @@ crate-type = ["rlib", "cdylib"] [package.metadata.release] publish = false + + +[lints.clippy] +cargo = "deny" +complexity = "deny" +correctness = "deny" +nursery = "deny" +pedantic = "deny" +perf = "deny" +restriction = "deny" +style = "deny" +suspicious = "deny" + +min_ident_chars = { level = "allow", priority = 127 } # allow for variables that is one char +arbitrary_source_item_ordering = { level = "allow", priority = 127 } # allow arbitrary ordering to keep relevant code nearby +as_conversions = { level = "allow", priority = 127 } # allow casting +blanket_clippy_restriction_lints = { level = "allow", priority = 127 } # allow setting all restrictions so we can omit specific ones +default_numeric_fallback = { level = "allow", priority = 127 } # allow type inferred by numeric literal, detection is buggy +disallowed_script_idents = { level = "allow", priority = 127 } # skip since we use only ascii +exhaustive_enums = { level = "allow", priority = 127 } # remove requirement to label enum as exhaustive +exhaustive_structs = { level = "allow", priority = 127 } # revisit once lib is ready to be used externally +field_scoped_visibility_modifiers = { level = "allow", priority = 127 } # allow field-level visibility modifiers +float_arithmetic = { level = "allow", priority = 127 } # allow float arithmetic +impl_trait_in_params = { level = "allow", priority = 127 } # impl in params ok +implicit_return = { level = "allow", priority = 127 } # missing return ok +iter_over_hash_type = { level = "allow", priority = 127 } # allow iterating over unordered iterables like `HashMap` +little_endian_bytes = { level = "allow", priority = 127 } # allow to_le_bytes / from_le_bytes +missing_docs_in_private_items = { level = "allow", priority = 127 } # missing docs on private ok +missing_inline_in_public_items = { level = "allow", priority = 127 } # let rust compiler determine best inline logic +missing_trait_methods = { level = "allow", priority = 127 } # allow in favor of rustc `implement the missing item` +module_name_repetitions = { level = "allow", priority = 127 } # allow use of module name in type names +multiple_crate_versions = { level = "allow", priority = 127 } # allow since list of exceptions changes frequently from external +multiple_inherent_impl = { level = "allow", priority = 127 } # required in best practice to limit exposure over UniFFI +must_use_candidate = { level = "allow", priority = 127 } # omitting #[must_use] ok +mod_module_files = { level = "allow", priority = 127 } # mod directories ok +non_ascii_literal = { level = "allow", priority = 127 } # non-ascii char in string literal ok +partial_pub_fields = { level = "allow", priority = 127 } # partial struct pub fields ok +pattern_type_mismatch = { level = "allow", priority = 127 } # allow in favor of clippy::ref_patterns +print_stderr = { level = "allow", priority = 127 } # stderr 
prints ok +print_stdout = { level = "allow", priority = 127 } # stdout prints ok +pub_use = { level = "allow", priority = 127 } # ok to structure source into many files but clean up import +pub_with_shorthand = { level = "allow", priority = 127 } # allow use of pub(super) +question_mark_used = { level = "allow", priority = 127 } # allow question operator +self_named_module_files = { level = "allow", priority = 127 } # mod files ok +separated_literal_suffix = { level = "allow", priority = 127 } # literal suffixes should be separated by underscore +single_call_fn = { level = "allow", priority = 127 } # allow functions called only once, which allows better code organization +single_char_lifetime_names = { level = "allow", priority = 127 } # single char lifetimes ok +std_instead_of_alloc = { level = "allow", priority = 127 } # we should use std when possible +std_instead_of_core = { level = "allow", priority = 127 } # we should use std when possible +string_add = { level = "allow", priority = 127 } # simple concat ok +use_debug = { level = "warn", priority = 127 } # debug print +wildcard_enum_match_arm = { level = "allow", priority = 127 } # allow wildcard match arm in enums diff --git a/src/arrow_digester.rs b/src/arrow_digester.rs index ffe48bb..0e1f8c1 100644 --- a/src/arrow_digester.rs +++ b/src/arrow_digester.rs @@ -1,3 +1,9 @@ +#![expect( + clippy::expect_used, + clippy::todo, + clippy::panic, + reason = "First iteration of code, will add proper error handling later. Allow for unsupported data types for now" +)] use std::collections::BTreeMap; use arrow::{ @@ -32,28 +38,28 @@ impl ArrowDigester { }); // Store it in the new struct for now - ArrowDigester { + Self { schema, schema_digest, fields_digest_buffer, } } - /// Hash a array directly without needing to create an ArrowDigester instance on the user side + /// Hash a array directly without needing to create an `ArrowDigester` instance on the user side pub fn hash_array(array: &dyn Array) -> Vec { let mut digest = D::new(); Self::array_digest_update(array.data_type(), array, &mut digest); digest.finalize().to_vec() } - /// Hash record batch directly without needing to create an ArrowDigester instance on the user side + /// Hash record batch directly without needing to create an `ArrowDigester` instance on the user side pub fn hash_record_batch(record_batch: &RecordBatch) -> Vec { - let mut digester = ArrowDigester::::new(record_batch.schema().as_ref().clone()); - digester.update(record_batch.clone()); + let mut digester = Self::new(record_batch.schema().as_ref().clone()); + digester.update(&record_batch.clone()); digester.finalize() } - /// This will consume the ArrowDigester and produce the final combined digest where the schema + /// This will consume the `ArrowDigester` and produce the final combined digest where the schema /// digest is fed in first, followed by each field digest in alphabetical order of field names pub fn finalize(self) -> Vec { // Finalize all the sub digest and combine them into a single digest @@ -73,24 +79,33 @@ impl ArrowDigester { final_digest.finalize().to_vec() } - /// Serialize the schema into a BTreeMap for field name and its digest + /// Serialize the schema into a `BTreeMap` for field name and its digest fn hash_schema(schema: &Schema) -> Vec { let fields_digest = schema .fields .into_iter() - .map(|field| (field.name(), to_vec::<_, 256>(field).unwrap())) + .map(|field| { + ( + field.name(), + to_vec::<_, 256>(field).expect("Failed to serialize field of schema"), + ) + }) .collect::>(); // Hash the 
entire thing to the digest - D::digest(to_vec::<_, 1024>(&fields_digest).unwrap()).to_vec() + D::digest( + to_vec::<_, 1024>(&fields_digest).expect("Failed to serialize field_digest to bytes"), + ) + .to_vec() } /// Hash a record batch and update the internal digests - fn update(&mut self, record_batch: RecordBatch) { + fn update(&mut self, record_batch: &RecordBatch) { // Verify schema matches - if *record_batch.schema() != self.schema { - panic!("Record batch schema does not match ArrowDigester schema"); - } + assert!( + !(*record_batch.schema() != self.schema), + "Record batch schema does not match ArrowDigester schema" + ); // Iterate through each field and update its digest self.fields_digest_buffer @@ -104,9 +119,11 @@ impl ArrowDigester { record_batch .schema() .field_with_name(field_name) - .unwrap() + .expect("Failed to get field with name") .data_type(), - record_batch.column_by_name(field_name).unwrap(), + record_batch + .column_by_name(field_name) + .expect("Failed to get column by name"), digest, ); } else { @@ -114,8 +131,12 @@ impl ArrowDigester { &field_name_hierarchy, 0, record_batch - .column_by_name(field_name_hierarchy[0]) - .unwrap() + .column_by_name( + field_name_hierarchy + .first() + .expect("Failed to get field name at idx 0, list is empty!"), + ) + .expect("Failed to get column by name") .as_any() .downcast_ref::() .expect("Failed to downcast to StructArray"), @@ -133,28 +154,36 @@ impl ArrowDigester { digest: &mut D, ) { if field_name_hierarchy.len() == current_level { + let array_data = array + .column_by_name( + field_name_hierarchy + .first() + .expect("Failed to get field name at idx 0, list is empty!"), + ) + .expect("Failed to get column by name"); // Base case, it should be a non-struct field - Self::array_digest_update( - array - .column_by_name(field_name_hierarchy[0]) - .unwrap() - .data_type(), - array - .column_by_name(field_name_hierarchy[0]) - .unwrap() - .as_ref(), - digest, - ); + Self::array_digest_update(array_data.data_type(), array_data.as_ref(), digest); } else { // Recursive case, it should be a struct field let next_array = array - .column_by_name(field_name_hierarchy[current_level]) - .unwrap() + .column_by_name( + field_name_hierarchy + .get(current_level) + .expect("Failed to get field name at current level"), + ) + .expect("Failed to get column by name") .as_any() .downcast_ref::() .expect("Failed to downcast to StructArray"); - Self::update_nested_field(field_name_hierarchy, current_level + 1, next_array, digest); + Self::update_nested_field( + field_name_hierarchy, + current_level + .checked_add(1) + .expect("Field nesting level overflow"), + next_array, + digest, + ); } } @@ -169,26 +198,23 @@ impl ArrowDigester { .expect("Failed to downcast to BooleanArray"); bool_array.into_iter().for_each(|value| match value { - Some(b) => digest.update([b as u8]), + Some(b) => digest.update([u8::from(b)]), None => digest.update(NULL_BYTES), }); } - DataType::Int8 => Self::hash_fixed_size_array(array, digest, &1), - DataType::Int16 => Self::hash_fixed_size_array(array, digest, &2), - DataType::Int32 => Self::hash_fixed_size_array(array, digest, &4), - DataType::Int64 => Self::hash_fixed_size_array(array, digest, &8), - DataType::UInt8 => Self::hash_fixed_size_array(array, digest, &1), - DataType::UInt16 => Self::hash_fixed_size_array(array, digest, &2), - DataType::UInt32 => Self::hash_fixed_size_array(array, digest, &4), - DataType::UInt64 => Self::hash_fixed_size_array(array, digest, &8), - DataType::Float16 => 
Self::hash_fixed_size_array(array, digest, &2), - DataType::Float32 => Self::hash_fixed_size_array(array, digest, &4), - DataType::Float64 => Self::hash_fixed_size_array(array, digest, &8), + DataType::Int8 | DataType::UInt8 => Self::hash_fixed_size_array(array, digest, 1), + DataType::Int16 | DataType::UInt16 | DataType::Float16 => { + Self::hash_fixed_size_array(array, digest, 2); + } + DataType::Int32 | DataType::UInt32 | DataType::Float32 | DataType::Date32 => { + Self::hash_fixed_size_array(array, digest, 4); + } + DataType::Int64 | DataType::UInt64 | DataType::Float64 | DataType::Date64 => { + Self::hash_fixed_size_array(array, digest, 8); + } DataType::Timestamp(_, _) => todo!(), - DataType::Date32 => Self::hash_fixed_size_array(array, digest, &4), - DataType::Date64 => Self::hash_fixed_size_array(array, digest, &8), - DataType::Time32(time_unit) => Self::hash_time_array(array, time_unit, digest, &4), - DataType::Time64(time_unit) => Self::hash_time_array(array, time_unit, digest, &8), + DataType::Time32(time_unit) => Self::hash_time_array(array, *time_unit, digest, 4), + DataType::Time64(time_unit) => Self::hash_time_array(array, *time_unit, digest, 8), DataType::Duration(_) => todo!(), DataType::Interval(_) => todo!(), DataType::Binary => Self::hash_binary_array( @@ -199,7 +225,7 @@ impl ArrowDigester { digest, ), DataType::FixedSizeBinary(element_size) => { - Self::hash_fixed_size_array(array, digest, element_size) + Self::hash_fixed_size_array(array, digest, *element_size); } DataType::LargeBinary => Self::hash_binary_array( array @@ -254,21 +280,30 @@ impl ArrowDigester { | DataType::Decimal64(precision, scale) | DataType::Decimal128(precision, scale) | DataType::Decimal256(precision, scale) => { - Self::hash_decimal(precision, scale, array, digest) + Self::hash_decimal(*precision, *scale, array, digest); } DataType::Map(_, _) => todo!(), DataType::RunEndEncoded(_, _) => todo!(), - }; + } } - fn hash_fixed_size_array(array: &dyn Array, digest: &mut D, element_size: &i32) { + #[expect(clippy::cast_sign_loss, reason = "element_size is always positive")] + fn hash_fixed_size_array(array: &dyn Array, digest: &mut D, element_size: i32) { let array_data = array.to_data(); - let element_size_usize = *element_size as usize; + let element_size_usize = element_size as usize; // Get the slice with offset accounted for if there is any - let slice = array_data.buffers()[0] + let slice = array_data + .buffers() + .first() + .expect("Unable to get first buffer to determine offset") .as_slice() - .get(array_data.offset() * element_size_usize..) 
+ .get( + array_data + .offset() + .checked_mul(element_size_usize) + .expect("Offset multiplication overflow").., + ) .expect("Failed to get buffer slice for FixedSizeBinaryArray"); // Deal with null @@ -277,8 +312,17 @@ impl ArrowDigester { // There are nulls, so we need to incrementally hash each value for i in 0..array_data.len() { if null_buffer.is_valid(i) { - let data_pos = i * element_size_usize; - digest.update(&slice[data_pos..data_pos + element_size_usize]); + let data_pos = i + .checked_mul(element_size_usize) + .expect("Data position multiplication overflow"); + let end_pos = data_pos + .checked_add(element_size_usize) + .expect("End position addition overflow"); + if let Some(data_slice) = slice.get(data_pos..end_pos) { + digest.update(data_slice); + } else { + digest.update(NULL_BYTES); + } } else { digest.update(NULL_BYTES); } @@ -312,18 +356,13 @@ impl ArrowDigester { } } - fn hash_time_array( - array: &dyn Array, - time_unit: &TimeUnit, - digest: &mut D, - element_size: &i32, - ) { + fn hash_time_array(array: &dyn Array, time_unit: TimeUnit, digest: &mut D, element_size: i32) { // We need to update the digest with the time unit first to ensure different time units produce different hashes digest.update([match time_unit { - TimeUnit::Second => 0u8, - TimeUnit::Millisecond => 1u8, - TimeUnit::Microsecond => 2u8, - TimeUnit::Nanosecond => 3u8, + TimeUnit::Second => 0_u8, + TimeUnit::Millisecond => 1_u8, + TimeUnit::Microsecond => 2_u8, + TimeUnit::Nanosecond => 3_u8, }]); // Now hash the underlying fixed size array based on time unit @@ -374,56 +413,59 @@ impl ArrowDigester { } } - fn hash_decimal(precision: &u8, scale: &i8, array: &dyn Array, digest: &mut D) { + #[expect(clippy::cast_sign_loss, reason = "Scale should always be non-negative")] + fn hash_decimal(precision: u8, scale: i8, array: &dyn Array, digest: &mut D) { // Include the precision and scale in the hash - digest.update([*precision]); - digest.update([*scale as u8]); + digest.update([precision]); + digest.update([scale as u8]); // Hash the underlying fixed size array based on precision match precision { - 1..=9 => Self::hash_fixed_size_array(array, digest, &4), - 10..=18 => Self::hash_fixed_size_array(array, digest, &8), - 19..=38 => Self::hash_fixed_size_array(array, digest, &16), - 39..=76 => Self::hash_fixed_size_array(array, digest, &32), - _ => panic!("Unsupported decimal precision: {}", precision), + 1..=9 => Self::hash_fixed_size_array(array, digest, 4), + 10..=18 => Self::hash_fixed_size_array(array, digest, 8), + 19..=38 => Self::hash_fixed_size_array(array, digest, 16), + 39..=76 => Self::hash_fixed_size_array(array, digest, 32), + _ => panic!("Unsupported decimal precision: {precision}"), } } /// Internal recursive function to extract field names from nested structs effectively flattening the schema - /// The format is parent_child_grandchild_etc... for nested fields and will be stored in fields_digest_buffer + /// The format is `parent_child_grandchild_etc`... 
for nested fields and will be stored in `fields_digest_buffer` fn extract_fields_name( field: &Field, parent_field_name: &str, fields_digest_buffer: &mut BTreeMap, ) { // Check if field is a nested type of struct - match field.data_type() { - DataType::Struct(fields) => { - // We will add fields in alphabetical order - fields.into_iter().for_each(|field| { - Self::extract_fields_name(field, parent_field_name, fields_digest_buffer); - }); - } - _ => { - // Base case, just add the field name - let field_name = if parent_field_name.is_empty() { - field.name().to_string() - } else { - format!("{}_{}", parent_field_name, field.name()) - }; - - fields_digest_buffer.insert(field_name, D::new()); - } + if let DataType::Struct(fields) = field.data_type() { + // We will add fields in alphabetical order + fields.into_iter().for_each(|field_inner| { + Self::extract_fields_name(field_inner, parent_field_name, fields_digest_buffer); + }); + } else { + // Base case, just add the field name + let field_name = if parent_field_name.is_empty() { + field.name().clone() + } else { + format!("{}_{}", parent_field_name, field.name()) + }; + + fields_digest_buffer.insert(field_name, D::new()); } } } #[cfg(test)] mod tests { + #![expect(clippy::unwrap_used, reason = "Okay in test")] use std::sync::Arc; use arrow::{ - array::{ArrayRef, BooleanArray, Int32Array, RecordBatch}, + array::{ + ArrayRef, BinaryArray, BooleanArray, Int32Array, LargeBinaryArray, LargeStringArray, + ListArray, RecordBatch, StringArray, Time32MillisecondArray, Time32SecondArray, + Time64MicrosecondArray, + }, datatypes::Int32Type, }; use arrow_schema::{DataType, Field, Schema}; @@ -437,7 +479,6 @@ mod tests { fn boolean_array_hashing() { let bool_array = BooleanArray::from(vec![Some(true), None, Some(false), Some(true)]); let hash = hex::encode(ArrowDigester::::hash_array(&bool_array)); - println!("{}", hash); assert_eq!( hash, "d7b7a73916d3f0c693ebcfa94fe2eee163d31a38ba8fe44ef81c5ffbff50c9be" @@ -447,9 +488,8 @@ mod tests { /// Test int32 array hashing which is really meant to test fixed size element array hashing #[test] fn int32_array_hashing() { - let int_array = arrow::array::Int32Array::from(vec![Some(42), None, Some(-7), Some(0)]); + let int_array = Int32Array::from(vec![Some(42), None, Some(-7), Some(0)]); let hash = hex::encode(ArrowDigester::::hash_array(&int_array)); - println!("{}", hash); assert_eq!( hash, "bb36e54f5e2d937a05bb716a8d595f1c8da67fda48feeb7ab5b071a69e63d648" @@ -459,10 +499,8 @@ mod tests { /// Test time array hashing #[test] fn time32_array_hashing() { - let time_array = - arrow::array::Time32SecondArray::from(vec![Some(1000), None, Some(5000), Some(0)]); + let time_array = Time32SecondArray::from(vec![Some(1000), None, Some(5000), Some(0)]); let hash = hex::encode(ArrowDigester::::hash_array(&time_array)); - println!("{}", hash); assert_eq!( hash, "b5d70eca0650399a9b00440e3cd9985e58b0f033d446bdd5947f96a62397002a" @@ -471,14 +509,9 @@ mod tests { #[test] fn time64_array_hashing() { - let time_array = arrow::array::Time64MicrosecondArray::from(vec![ - Some(1000000), - None, - Some(5000000), - Some(0), - ]); + let time_array = + Time64MicrosecondArray::from(vec![Some(1_000_000), None, Some(5_000_000), Some(0)]); let hash = hex::encode(ArrowDigester::::hash_array(&time_array)); - println!("{}", hash); assert_eq!( hash, "1f0847660ea421c266f226293d2f0c54ea5de0c168ac7e4bebfabf6d348a6d18" @@ -487,9 +520,8 @@ mod tests { #[test] fn time_array_different_units_produce_different_hashes() { - let time32_second = 
arrow::array::Time32SecondArray::from(vec![Some(1000), Some(2000)]); - let time32_millis = - arrow::array::Time32MillisecondArray::from(vec![Some(1000), Some(2000)]); + let time32_second = Time32SecondArray::from(vec![Some(1000), Some(2000)]); + let time32_millis = Time32MillisecondArray::from(vec![Some(1000), Some(2000)]); let hash_second = hex::encode(ArrowDigester::::hash_array(&time32_second)); let hash_millis = hex::encode(ArrowDigester::::hash_array(&time32_millis)); @@ -500,7 +532,7 @@ mod tests { /// Test binary array hashing #[test] fn binary_array_hashing() { - let binary_array = arrow::array::BinaryArray::from(vec![ + let binary_array = BinaryArray::from(vec![ Some(b"hello".as_ref()), None, Some(b"world".as_ref()), @@ -513,7 +545,7 @@ mod tests { ); // Test large binary array with same data to ensure consistency - let large_binary_array = arrow::array::LargeBinaryArray::from(vec![ + let large_binary_array = LargeBinaryArray::from(vec![ Some(b"hello".as_ref()), None, Some(b"world".as_ref()), @@ -529,8 +561,7 @@ mod tests { // Test String hashing #[test] fn string_array_hashing() { - let string_array = - arrow::array::StringArray::from(vec![Some("hello"), None, Some("world"), Some("")]); + let string_array = StringArray::from(vec![Some("hello"), None, Some("world"), Some("")]); let hash = hex::encode(ArrowDigester::::hash_array(&string_array)); assert_eq!( hash, @@ -538,12 +569,8 @@ mod tests { ); // Test large string array with same data to ensure consistency - let large_string_array = arrow::array::LargeStringArray::from(vec![ - Some("hello"), - None, - Some("world"), - Some(""), - ]); + let large_string_array = + LargeStringArray::from(vec![Some("hello"), None, Some("world"), Some("")]); assert_eq!( hex::encode(ArrowDigester::::hash_array(&large_string_array)), @@ -554,7 +581,7 @@ mod tests { // List array hashing test #[test] fn list_array_hashing() { - let list_array = arrow::array::ListArray::from_iter_primitive::(vec![ + let list_array = ListArray::from_iter_primitive::(vec![ Some(vec![Some(1), Some(2), Some(3)]), None, Some(vec![Some(4), Some(5)]), @@ -562,7 +589,6 @@ mod tests { ]); let hash = hex::encode(ArrowDigester::::hash_array(&list_array)); - println!("{}", hash); assert_eq!( hash, "d30c8845c58f71bcec4910c65a91328af2cc86d26001662270da3a3d5222dd36" @@ -584,10 +610,14 @@ mod tests { ); // Test Decimal64 (precision 10-18) - let decimal64_array = - Decimal128Array::from_iter(vec![Some(1234567890123), None, Some(-9876543210), Some(0)]) - .with_precision_and_scale(15, 3) - .unwrap(); + let decimal64_array = Decimal128Array::from_iter(vec![ + Some(1_234_567_890_123), + None, + Some(-9_876_543_210), + Some(0), + ]) + .with_precision_and_scale(15, 3) + .unwrap(); assert_eq!( hex::encode(ArrowDigester::::hash_array(&decimal64_array)), "ca1f8a6fb179ddafad1e02738ad2d869da187c72a9b815d8e12a85692525d231" @@ -595,9 +625,9 @@ mod tests { // Test Decimal128 (precision 19-38) let decimal128_array = Decimal128Array::from_iter(vec![ - Some(123456789012345678901234567), + Some(123_456_789_012_345_678_901_234_567), None, - Some(-987654321098765432109876543), + Some(-987_654_321_098_765_432_109_876_543), Some(0), ]) .with_precision_and_scale(38, 5) @@ -624,7 +654,7 @@ mod tests { Field::new("uids", DataType::Int32, false), Field::new("flags", DataType::Boolean, true), ])), - vec![uids.clone(), fake_data.clone()], + vec![Arc::clone(&uids), Arc::clone(&fake_data)], ); let batch2 = RecordBatch::try_new( @@ -632,7 +662,7 @@ mod tests { Field::new("flags", DataType::Boolean, true), 
Field::new("uids", DataType::Int32, false), ])), - vec![fake_data.clone(), uids.clone()], + vec![fake_data, uids], ); // Hash both record batches @@ -658,7 +688,7 @@ mod tests { Some(true), ])); - let batch1 = RecordBatch::try_new(schema.clone(), vec![uids, fake_data]).unwrap(); + let batch1 = RecordBatch::try_new(Arc::clone(&schema), vec![uids, fake_data]).unwrap(); let uids2 = Arc::new(Int32Array::from(vec![Some(5), Some(6), Some(7), Some(8)])) as ArrayRef; @@ -669,12 +699,11 @@ mod tests { None, ])); - let batch2 = RecordBatch::try_new(schema.clone(), vec![uids2, fake_data2]).unwrap(); - + let batch2 = RecordBatch::try_new(Arc::clone(&schema), vec![uids2, fake_data2]).unwrap(); // Hash both record batches let mut digester = ArrowDigester::::new((*schema).clone()); - digester.update(batch1); - digester.update(batch2); + digester.update(&batch1); + digester.update(&batch2); assert_eq!( hex::encode(digester.finalize()), "9ba289655f0c7dd359ababc5a6f6188b352e45483623fbbf8b967723e2b798f8" diff --git a/src/pyarrow.rs b/src/pyarrow.rs index 00ecdec..b77f9cc 100644 --- a/src/pyarrow.rs +++ b/src/pyarrow.rs @@ -8,12 +8,24 @@ use crate::arrow_digester::ArrowDigester; /// /// # Safety /// The pointers must be valid Arrow C Data Interface structs from Python's pyarrow + #[uniffi::export] pub fn process_arrow_table(array_ptr: u64, schema_ptr: u64) -> Vec { + #[expect( + unsafe_code, + reason = "Need to convert raw pointers to Arrow data structures" + )] + #[expect( + clippy::multiple_unsafe_ops_per_block, + clippy::expect_used, + reason = "Okay since we are doing the same operation of dereferencing pointers, Will add proper errors later" + )] + // SAFETY: + // Need to conduct unsafe operations to convert raw pointers to Arrow data structures let array_data = unsafe { // Construct ArrayData from FFI structures - let ffi_array = FFI_ArrowArray::from_raw(array_ptr as *mut _); - let ffi_schema = FFI_ArrowSchema::from_raw(schema_ptr as *mut _); + let ffi_array = FFI_ArrowArray::from_raw(array_ptr as *mut FFI_ArrowArray); + let ffi_schema = FFI_ArrowSchema::from_raw(schema_ptr as *mut FFI_ArrowSchema); from_ffi(ffi_array, &ffi_schema).expect("Failed to import Arrow array data") }; From da3a8926653c5eecf30f00aa801ed5a510f8ed9d Mon Sep 17 00:00:00 2001 From: synicix Date: Fri, 14 Nov 2025 03:49:15 +0000 Subject: [PATCH 13/53] Remove incorrect categories --- Cargo.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 94ce866..d5c4a7d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -8,7 +8,6 @@ readme = "README.md" repository = "https://github.com/nauticalab/starfix" license = "MIT OR Apache-2.0" keywords = ["arrow", "hashing"] -categories = ["hashing", "arrow"] [dependencies] arrow = { version = "57.0.0", features = ["ffi"] } From 80653d1b252a45464ab525ee76d4920543281023 Mon Sep 17 00:00:00 2001 From: synicix Date: Fri, 14 Nov 2025 03:53:35 +0000 Subject: [PATCH 14/53] Update categories --- Cargo.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/Cargo.toml b/Cargo.toml index d5c4a7d..aed4941 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -8,6 +8,7 @@ readme = "README.md" repository = "https://github.com/nauticalab/starfix" license = "MIT OR Apache-2.0" keywords = ["arrow", "hashing"] +categories = ["algorithms"] [dependencies] arrow = { version = "57.0.0", features = ["ffi"] } From 8da263c62a82deec7158f6ba7e32facddf831c42 Mon Sep 17 00:00:00 2001 From: synicix Date: Fri, 14 Nov 2025 23:29:25 +0000 Subject: [PATCH 15/53] Update clippy actions --- .github/workflows/clippy.yml | 2 -- 
1 file changed, 2 deletions(-) diff --git a/.github/workflows/clippy.yml b/.github/workflows/clippy.yml index 1171147..ff98561 100644 --- a/.github/workflows/clippy.yml +++ b/.github/workflows/clippy.yml @@ -13,8 +13,6 @@ jobs: with: toolchain: 1.90.0 components: rustfmt,clippy - - name: Install code coverage - uses: taiki-e/install-action@cargo-llvm-cov - name: Run syntax and style tests run: cargo clippy --all-targets -- -D warnings - name: Run format test From 47307028227f5843b44984bfea1bcda9ef88aad5 Mon Sep 17 00:00:00 2001 From: synicix Date: Thu, 4 Dec 2025 03:24:33 +0000 Subject: [PATCH 16/53] Update read me to include section about hashing --- README.md | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 9abcf18..8cb0ddb 100644 --- a/README.md +++ b/README.md @@ -5,4 +5,15 @@ Hashing Lib for Arrow Data Tables using C_Pointers with Arrow_Digest from: https The repo is setup to use dev containers of VSCode. After starting up the container and connecting to it the process to install the rust lib and a python package is: ```maturin develop --uv``` -NOTE: After every code edit in rust code, you will need to rerun the command to rebuild it then restart the kernel in the Jupyter Notebook side \ No newline at end of file +NOTE: After every code edit in rust code, you will need to rerun the command to rebuild it then restart the kernel in the Jupyter Notebook side + + +# Hashing System Overview +ArrowDigester stores the digest for multiple components of the arrow data_table before combining them + +- schema: Each field name is serialize via PostCard: https://docs.rs/postcard/latest/postcard/ + - Was chosen since I was originally using JSON but wanted something even faster, hence postcard. It is design to be very resource efficient + +- fields_digest_buffer: Flattens all nested schema with the '__' delimiter between the parent and sub level in this format parent_field_name__child_field_name + +- Upon finalization of the hash, the instance consume itself due to digest.finalize consuming self under ``field_digest_buffer``. Following that it adds it to a final digest in this order: schema + field_digest_buffer (lexical order of the field name) From 2b9da46ecc01e0d68de842d66f52f664660ba834 Mon Sep 17 00:00:00 2001 From: synicix Date: Thu, 4 Dec 2025 03:25:28 +0000 Subject: [PATCH 17/53] Change delimiter from _ to __ --- src/arrow_digester.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/arrow_digester.rs b/src/arrow_digester.rs index 0e1f8c1..0ef8458 100644 --- a/src/arrow_digester.rs +++ b/src/arrow_digester.rs @@ -430,7 +430,7 @@ impl ArrowDigester { } /// Internal recursive function to extract field names from nested structs effectively flattening the schema - /// The format is `parent_child_grandchild_etc`... for nested fields and will be stored in `fields_digest_buffer` + /// The format is `parent__child__grandchild__etc`... 
for nested fields and will be stored in `fields_digest_buffer` fn extract_fields_name( field: &Field, parent_field_name: &str, @@ -447,7 +447,7 @@ impl ArrowDigester { let field_name = if parent_field_name.is_empty() { field.name().clone() } else { - format!("{}_{}", parent_field_name, field.name()) + format!("{}__{}", parent_field_name, field.name()) }; fields_digest_buffer.insert(field_name, D::new()); From 485a544d8e75290381550e26a0e4d9b98ffe3d43 Mon Sep 17 00:00:00 2001 From: synicix Date: Thu, 4 Dec 2025 03:39:49 +0000 Subject: [PATCH 18/53] Add test for field name extraction and fix logic bug --- src/arrow_digester.rs | 65 +++++++++++++++++++++++++++++++++++++------ 1 file changed, 56 insertions(+), 9 deletions(-) diff --git a/src/arrow_digester.rs b/src/arrow_digester.rs index 0ef8458..eeb183f 100644 --- a/src/arrow_digester.rs +++ b/src/arrow_digester.rs @@ -438,19 +438,33 @@ impl ArrowDigester { ) { // Check if field is a nested type of struct if let DataType::Struct(fields) = field.data_type() { + println!( + "Extracting nested struct field: {} with parent: {}", + field.name(), + parent_field_name + ); // We will add fields in alphabetical order fields.into_iter().for_each(|field_inner| { - Self::extract_fields_name(field_inner, parent_field_name, fields_digest_buffer); + Self::extract_fields_name( + field_inner, + Self::construct_field_name_hierarchy(parent_field_name, field.name()).as_str(), + fields_digest_buffer, + ); }); } else { - // Base case, just add the field name - let field_name = if parent_field_name.is_empty() { - field.name().clone() - } else { - format!("{}__{}", parent_field_name, field.name()) - }; - - fields_digest_buffer.insert(field_name, D::new()); + // Base case, just add the the combine field name to the map + fields_digest_buffer.insert( + Self::construct_field_name_hierarchy(parent_field_name, field.name()), + D::new(), + ); + } + } + + fn construct_field_name_hierarchy(parent_field_name: &str, field_name: &str) -> String { + if parent_field_name.is_empty() { + field_name.to_owned() + } else { + format!("{parent_field_name}__{field_name}") } } } @@ -709,4 +723,37 @@ mod tests { "9ba289655f0c7dd359ababc5a6f6188b352e45483623fbbf8b967723e2b798f8" ); } + + #[test] + fn field_names() { + // Test nested struct field name extraction + let schema = Schema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new( + "nested", + DataType::Struct( + vec![ + Field::new("name", DataType::Utf8, true), + Field::new( + "deep", + DataType::Struct( + vec![Field::new("value", DataType::Int64, false)].into(), + ), + false, + ), + ] + .into(), + ), + false, + ), + ]); + + let digester = ArrowDigester::::new(schema); + let field_names: Vec<&String> = digester.fields_digest_buffer.keys().collect(); + + assert_eq!(field_names.len(), 3); + assert!(field_names.contains(&&"id".to_owned())); + assert!(field_names.contains(&&"nested__name".to_owned())); + assert!(field_names.contains(&&"nested__deep__value".to_owned())); + } } From c2ff003bcfb7e81f2a438672dc40c00c56c38480 Mon Sep 17 00:00:00 2001 From: synicix Date: Thu, 4 Dec 2025 03:41:55 +0000 Subject: [PATCH 19/53] Up the version due to bug --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index aed4941..d88f3ae 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "starfix" -version = "0.0.0" +version = "0.0.1" edition = '2024' description = "Package for hashing Arrow's data structures uniquely for identifying and comparing data efficiently." 
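The nested-field flattening fixed in PATCH 18 above boils down to carrying the parent path while recursing into `DataType::Struct` children. A standalone sketch of that traversal (illustrative only; it uses the `arrow-schema` types already in Cargo.toml and the `__` delimiter in effect at this point in the series):

```rust
use arrow_schema::{DataType, Field};

// Flatten nested struct fields into parent__child__... leaf names.
fn flatten(field: &Field, parent: &str, out: &mut Vec<String>) {
    let name = if parent.is_empty() {
        field.name().clone()
    } else {
        format!("{parent}__{}", field.name())
    };
    if let DataType::Struct(children) = field.data_type() {
        for child in children.iter() {
            flatten(child.as_ref(), &name, out);
        }
    } else {
        out.push(name);
    }
}

fn main() {
    let deep = Field::new(
        "deep",
        DataType::Struct(vec![Field::new("value", DataType::Int64, false)].into()),
        false,
    );
    let nested = Field::new(
        "nested",
        DataType::Struct(vec![Field::new("name", DataType::Utf8, true), deep].into()),
        false,
    );
    let mut names = Vec::new();
    flatten(&nested, "", &mut names);
    assert_eq!(names, ["nested__name", "nested__deep__value"]);
}
```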
authors = ["synicix "] From 669647ac5890c088b8d7939ccc1e0a1eaee6e2d3 Mon Sep 17 00:00:00 2001 From: Synicix Date: Wed, 3 Dec 2025 19:46:27 -0800 Subject: [PATCH 20/53] Update README.md Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 8cb0ddb..8c1b7b5 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ NOTE: After every code edit in rust code, you will need to rerun the command to # Hashing System Overview -ArrowDigester stores the digest for multiple components of the arrow data_table before combining them +ArrowDigester stores the digest for multiple components of the arrow data table before combining them - schema: Each field name is serialize via PostCard: https://docs.rs/postcard/latest/postcard/ - Was chosen since I was originally using JSON but wanted something even faster, hence postcard. It is design to be very resource efficient From 882adca636f04a82c89f4767ce0ca3c9e6025ccd Mon Sep 17 00:00:00 2001 From: Synicix Date: Wed, 3 Dec 2025 19:46:40 -0800 Subject: [PATCH 21/53] Update README.md Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 8c1b7b5..60da8ae 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ NOTE: After every code edit in rust code, you will need to rerun the command to # Hashing System Overview ArrowDigester stores the digest for multiple components of the arrow data table before combining them -- schema: Each field name is serialize via PostCard: https://docs.rs/postcard/latest/postcard/ +- schema: Each field name is serialized via PostCard: https://docs.rs/postcard/latest/postcard/ - Was chosen since I was originally using JSON but wanted something even faster, hence postcard. 
It is design to be very resource efficient - fields_digest_buffer: Flattens all nested schema with the '__' delimiter between the parent and sub level in this format parent_field_name__child_field_name From 3f083ba087dea1361df342e287be815e6eca0a41 Mon Sep 17 00:00:00 2001 From: Synicix Date: Wed, 3 Dec 2025 19:47:43 -0800 Subject: [PATCH 22/53] Update src/arrow_digester.rs Not sure why clippy didn't catch this Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/arrow_digester.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/arrow_digester.rs b/src/arrow_digester.rs index eeb183f..e9b347a 100644 --- a/src/arrow_digester.rs +++ b/src/arrow_digester.rs @@ -55,7 +55,7 @@ impl ArrowDigester { /// Hash record batch directly without needing to create an `ArrowDigester` instance on the user side pub fn hash_record_batch(record_batch: &RecordBatch) -> Vec { let mut digester = Self::new(record_batch.schema().as_ref().clone()); - digester.update(&record_batch.clone()); + digester.update(record_batch); digester.finalize() } From a908bba54ddbb1033bc2c69dbac05ec4aa525927 Mon Sep 17 00:00:00 2001 From: Synicix Date: Wed, 3 Dec 2025 19:47:51 -0800 Subject: [PATCH 23/53] Update src/arrow_digester.rs Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/arrow_digester.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/arrow_digester.rs b/src/arrow_digester.rs index e9b347a..c377102 100644 --- a/src/arrow_digester.rs +++ b/src/arrow_digester.rs @@ -45,7 +45,7 @@ impl ArrowDigester { } } - /// Hash a array directly without needing to create an `ArrowDigester` instance on the user side + /// Hash an array directly without needing to create an `ArrowDigester` instance on the user side pub fn hash_array(array: &dyn Array) -> Vec { let mut digest = D::new(); Self::array_digest_update(array.data_type(), array, &mut digest); From 9dce44e4ce2e412aca8e73ed3dfa3799cb24df99 Mon Sep 17 00:00:00 2001 From: synicix Date: Thu, 4 Dec 2025 03:49:14 +0000 Subject: [PATCH 24/53] Change Vec to array since it is more flexiable --- src/arrow_digester.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/arrow_digester.rs b/src/arrow_digester.rs index c377102..08a561c 100644 --- a/src/arrow_digester.rs +++ b/src/arrow_digester.rs @@ -148,7 +148,7 @@ impl ArrowDigester { /// Recursive function to update nested field digests (structs within structs) fn update_nested_field( - field_name_hierarchy: &Vec<&str>, + field_name_hierarchy: &[&str], current_level: usize, array: &StructArray, digest: &mut D, From 616101b1e80768a5b93d3fbbbed5893881b6b9d7 Mon Sep 17 00:00:00 2001 From: synicix Date: Thu, 4 Dec 2025 03:53:23 +0000 Subject: [PATCH 25/53] Add check for hash --- src/arrow_digester.rs | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/src/arrow_digester.rs b/src/arrow_digester.rs index 08a561c..dd1003b 100644 --- a/src/arrow_digester.rs +++ b/src/arrow_digester.rs @@ -20,6 +20,8 @@ use postcard::to_vec; const NULL_BYTES: &[u8] = b"NULL"; +const DELIMITER_FOR_NESTED_FIELD: &str = "__"; + pub struct ArrowDigester { schema: Schema, schema_digest: Vec, @@ -112,7 +114,9 @@ impl ArrowDigester { .iter_mut() .for_each(|(field_name, digest)| { // Determine if field name is nested - let field_name_hierarchy = field_name.split('_').collect::>(); + let field_name_hierarchy = field_name + .split(DELIMITER_FOR_NESTED_FIELD) + .collect::>(); if field_name_hierarchy.len() == 1 { Self::array_digest_update( 
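As a side note on the lookup above: because the per-field digests are keyed by flattened field name in a `BTreeMap`, the final hash depends only on the names and their data, never on the column order of the incoming batch. A tiny std-only illustration with hypothetical field names:

```rust
use std::collections::BTreeMap;

fn main() {
    let mut a = BTreeMap::new();
    a.insert("uids", 1);
    a.insert("flags", 2);

    let mut b = BTreeMap::new();
    b.insert("flags", 2);
    b.insert("uids", 1);

    // Both maps iterate as (flags, uids) regardless of insertion order,
    // which is why swapping columns in a RecordBatch leaves the hash unchanged.
    assert!(a.keys().eq(b.keys()));
}
```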
@@ -464,7 +468,7 @@ impl ArrowDigester { if parent_field_name.is_empty() { field_name.to_owned() } else { - format!("{parent_field_name}__{field_name}") + format!("{parent_field_name}{DELIMITER_FOR_NESTED_FIELD}{field_name}") } } } @@ -755,5 +759,11 @@ mod tests { assert!(field_names.contains(&&"id".to_owned())); assert!(field_names.contains(&&"nested__name".to_owned())); assert!(field_names.contains(&&"nested__deep__value".to_owned())); + + // Check the digest + assert_eq!( + hex::encode(digester.finalize()), + "9c5861a91a66e9e5e4dc16b12b6c9e23acaa8fc6a62519fe8e388ce39daa4fd5" + ) } } From 39218866506c16bf6e1c96af3b628049916b4f7f Mon Sep 17 00:00:00 2001 From: synicix Date: Thu, 4 Dec 2025 03:57:22 +0000 Subject: [PATCH 26/53] Fix double negation --- src/arrow_digester.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/arrow_digester.rs b/src/arrow_digester.rs index dd1003b..097901a 100644 --- a/src/arrow_digester.rs +++ b/src/arrow_digester.rs @@ -105,7 +105,7 @@ impl ArrowDigester { fn update(&mut self, record_batch: &RecordBatch) { // Verify schema matches assert!( - !(*record_batch.schema() != self.schema), + *record_batch.schema() == self.schema, "Record batch schema does not match ArrowDigester schema" ); @@ -764,6 +764,6 @@ mod tests { assert_eq!( hex::encode(digester.finalize()), "9c5861a91a66e9e5e4dc16b12b6c9e23acaa8fc6a62519fe8e388ce39daa4fd5" - ) + ); } } From bcb984c59d505d329958a2edbd642e8b1f0e4299 Mon Sep 17 00:00:00 2001 From: synicix Date: Thu, 4 Dec 2025 04:01:15 +0000 Subject: [PATCH 27/53] Update field name to deal with possible underflow --- src/arrow_digester.rs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/arrow_digester.rs b/src/arrow_digester.rs index 097901a..bffc85c 100644 --- a/src/arrow_digester.rs +++ b/src/arrow_digester.rs @@ -157,7 +157,12 @@ impl ArrowDigester { array: &StructArray, digest: &mut D, ) { - if field_name_hierarchy.len() == current_level { + if field_name_hierarchy + .len() + .checked_sub(1) + .expect("field_name_hierarchy underflow occurred") + == current_level + { let array_data = array .column_by_name( field_name_hierarchy From 535615f8a408390654d0db2e9ad19108b49dec92 Mon Sep 17 00:00:00 2001 From: synicix Date: Thu, 4 Dec 2025 04:42:36 +0000 Subject: [PATCH 28/53] Fix nested field hash --- src/arrow_digester.rs | 66 +++++++++++++++++++++++++++++++------------ 1 file changed, 48 insertions(+), 18 deletions(-) diff --git a/src/arrow_digester.rs b/src/arrow_digester.rs index bffc85c..973a97c 100644 --- a/src/arrow_digester.rs +++ b/src/arrow_digester.rs @@ -157,16 +157,20 @@ impl ArrowDigester { array: &StructArray, digest: &mut D, ) { + let current_level_plus_one = current_level + .checked_add(1) + .expect("Field nesting level overflow"); + if field_name_hierarchy .len() .checked_sub(1) - .expect("field_name_hierarchy underflow occurred") - == current_level + .expect("field_name_hierarchy underflow") + == current_level_plus_one { let array_data = array .column_by_name( field_name_hierarchy - .first() + .last() .expect("Failed to get field name at idx 0, list is empty!"), ) .expect("Failed to get column by name"); @@ -177,7 +181,7 @@ impl ArrowDigester { let next_array = array .column_by_name( field_name_hierarchy - .get(current_level) + .get(current_level_plus_one) .expect("Failed to get field name at current level"), ) .expect("Failed to get column by name") @@ -187,9 +191,7 @@ impl ArrowDigester { Self::update_nested_field( field_name_hierarchy, - current_level - .checked_add(1) - 
.expect("Field nesting level overflow"), + current_level_plus_one, next_array, digest, ); @@ -447,11 +449,6 @@ impl ArrowDigester { ) { // Check if field is a nested type of struct if let DataType::Struct(fields) = field.data_type() { - println!( - "Extracting nested struct field: {} with parent: {}", - field.name(), - parent_field_name - ); // We will add fields in alphabetical order fields.into_iter().for_each(|field_inner| { Self::extract_fields_name( @@ -485,9 +482,9 @@ mod tests { use arrow::{ array::{ - ArrayRef, BinaryArray, BooleanArray, Int32Array, LargeBinaryArray, LargeStringArray, - ListArray, RecordBatch, StringArray, Time32MillisecondArray, Time32SecondArray, - Time64MicrosecondArray, + ArrayRef, BinaryArray, BooleanArray, Int32Array, Int64Array, LargeBinaryArray, + LargeStringArray, ListArray, RecordBatch, StringArray, StructArray, + Time32MillisecondArray, Time32SecondArray, Time64MicrosecondArray, }, datatypes::Int32Type, }; @@ -734,7 +731,7 @@ mod tests { } #[test] - fn field_names() { + fn nested_fields() { // Test nested struct field name extraction let schema = Schema::new(vec![ Field::new("id", DataType::Int32, false), @@ -757,7 +754,7 @@ mod tests { ), ]); - let digester = ArrowDigester::::new(schema); + let mut digester = ArrowDigester::::new(schema.clone()); let field_names: Vec<&String> = digester.fields_digest_buffer.keys().collect(); assert_eq!(field_names.len(), 3); @@ -765,10 +762,43 @@ mod tests { assert!(field_names.contains(&&"nested__name".to_owned())); assert!(field_names.contains(&&"nested__deep__value".to_owned())); + // Test the nested field update by creating record_batch and using the update method + let id_array = Arc::new(Int32Array::from(vec![Some(1), Some(2)])) as ArrayRef; + let name_array = Arc::new(StringArray::from(vec![Some("Alice"), Some("Bob")])) as ArrayRef; + let value_array = Arc::new(Int64Array::from(vec![Some(100), Some(200)])) as ArrayRef; + + let schema_ref = Arc::new(schema); + + let nested_struct = StructArray::from(vec![ + ( + Arc::new(Field::new("name", DataType::Utf8, true)), + name_array, + ), + ( + Arc::new(Field::new( + "deep", + DataType::Struct(vec![Field::new("value", DataType::Int64, false)].into()), + false, + )), + Arc::new(StructArray::from(vec![( + Arc::new(Field::new("value", DataType::Int64, false)), + value_array, + )])) as ArrayRef, + ), + ]); + + let record_batch = RecordBatch::try_new( + Arc::clone(&schema_ref), + vec![id_array, Arc::new(nested_struct)], + ) + .unwrap(); + + digester.update(&record_batch); + // Check the digest assert_eq!( hex::encode(digester.finalize()), - "9c5861a91a66e9e5e4dc16b12b6c9e23acaa8fc6a62519fe8e388ce39daa4fd5" + "3dad089e89d2d971b6f35781f670deec28b6d0201044110000e9a7cf96f74395" ); } } From 115d80e54a6a4477c159bc929bb6c4f962aa556f Mon Sep 17 00:00:00 2001 From: synicix Date: Thu, 4 Dec 2025 04:59:57 +0000 Subject: [PATCH 29/53] Remove casting scale because arrow allow for negative scale for rounding --- src/arrow_digester.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/arrow_digester.rs b/src/arrow_digester.rs index 973a97c..368215c 100644 --- a/src/arrow_digester.rs +++ b/src/arrow_digester.rs @@ -424,11 +424,10 @@ impl ArrowDigester { } } - #[expect(clippy::cast_sign_loss, reason = "Scale should always be non-negative")] fn hash_decimal(precision: u8, scale: i8, array: &dyn Array, digest: &mut D) { // Include the precision and scale in the hash digest.update([precision]); - digest.update([scale as u8]); + digest.update(scale.to_le_bytes()); // Hash the 
underlying fixed size array based on precision match precision { From 48cd2cb9ee9c79d7d14cdb6c368603c095069e69 Mon Sep 17 00:00:00 2001 From: synicix Date: Thu, 4 Dec 2025 05:02:36 +0000 Subject: [PATCH 30/53] Update version with bug fixes --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index d88f3ae..ec5daf2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "starfix" -version = "0.0.1" +version = "0.0.2" edition = '2024' description = "Package for hashing Arrow's data structures uniquely for identifying and comparing data efficiently." authors = ["synicix "] From 5bb6414ddffb8f245085b41346129fca0695e7fe Mon Sep 17 00:00:00 2001 From: synicix Date: Thu, 4 Dec 2025 06:44:25 +0000 Subject: [PATCH 31/53] Move some dependenices into dev --- Cargo.toml | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index ec5daf2..fd79294 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "starfix" version = "0.0.2" -edition = '2024' +edition = "2021" description = "Package for hashing Arrow's data structures uniquely for identifying and comparing data efficiently." authors = ["synicix "] readme = "README.md" @@ -14,15 +14,19 @@ categories = ["algorithms"] arrow = { version = "57.0.0", features = ["ffi"] } arrow-schema = { version = "57.0.0", features = ["serde"] } digest = "0.10.7" -hex = "0.4.3" + postcard = "1.1.3" -pretty_assertions = "1.4.1" + serde = "1.0.228" -serde_json = "1.0" sha2 = "0.10.9" # automated CFFI + bindings in other languages uniffi = { version = "0.29.4", features = ["cli", "tokio"] } +[dev-dependencies] +hex = "0.4.3" +pretty_assertions = "1.4.1" + + [[bin]] name = "uniffi-bindgen" From ca3426c37f7e1086a03fe2e2a9c3c338add4b753 Mon Sep 17 00:00:00 2001 From: synicix Date: Thu, 4 Dec 2025 06:44:40 +0000 Subject: [PATCH 32/53] Fix decimal and possible string array collision --- src/arrow_digester.rs | 45 +++++++++++++++++++++++++------------------ 1 file changed, 26 insertions(+), 19 deletions(-) diff --git a/src/arrow_digester.rs b/src/arrow_digester.rs index 368215c..79bd0a8 100644 --- a/src/arrow_digester.rs +++ b/src/arrow_digester.rs @@ -287,11 +287,21 @@ impl ArrowDigester { DataType::Struct(_) => todo!(), DataType::Union(_, _) => todo!(), DataType::Dictionary(_, _) => todo!(), - DataType::Decimal32(precision, scale) - | DataType::Decimal64(precision, scale) - | DataType::Decimal128(precision, scale) - | DataType::Decimal256(precision, scale) => { - Self::hash_decimal(*precision, *scale, array, digest); + DataType::Decimal32(precision, scale) => { + Self::hash_decimal_metadata(*precision, *scale, digest); + Self::hash_fixed_size_array(array, digest, 4); + } + DataType::Decimal64(precision, scale) => { + Self::hash_decimal_metadata(*precision, *scale, digest); + Self::hash_fixed_size_array(array, digest, 8); + } + DataType::Decimal128(precision, scale) => { + Self::hash_decimal_metadata(*precision, *scale, digest); + Self::hash_fixed_size_array(array, digest, 16); + } + DataType::Decimal256(precision, scale) => { + Self::hash_decimal_metadata(*precision, *scale, digest); + Self::hash_fixed_size_array(array, digest, 32); } DataType::Map(_, _) => todo!(), DataType::RunEndEncoded(_, _) => todo!(), @@ -380,12 +390,17 @@ impl ArrowDigester { Self::hash_fixed_size_array(array, digest, element_size); } + #[expect( + clippy::cast_possible_truncation, + reason = "String lengths from Arrow offsets are bounded" + )] fn 
hash_string_array(array: &GenericStringArray, digest: &mut D) { match array.nulls() { Some(null_buf) => { for i in 0..array.len() { if null_buf.is_valid(i) { let value = array.value(i); + digest.update((value.len() as u32).to_le_bytes()); digest.update(value.as_bytes()); } else { digest.update(NULL_BYTES); @@ -395,6 +410,7 @@ impl ArrowDigester { None => { for i in 0..array.len() { let value = array.value(i); + digest.update((value.len() as u32).to_le_bytes()); digest.update(value.as_bytes()); } } @@ -424,19 +440,10 @@ impl ArrowDigester { } } - fn hash_decimal(precision: u8, scale: i8, array: &dyn Array, digest: &mut D) { + fn hash_decimal_metadata(precision: u8, scale: i8, digest: &mut D) { // Include the precision and scale in the hash digest.update([precision]); digest.update(scale.to_le_bytes()); - - // Hash the underlying fixed size array based on precision - match precision { - 1..=9 => Self::hash_fixed_size_array(array, digest, 4), - 10..=18 => Self::hash_fixed_size_array(array, digest, 8), - 19..=38 => Self::hash_fixed_size_array(array, digest, 16), - 39..=76 => Self::hash_fixed_size_array(array, digest, 32), - _ => panic!("Unsupported decimal precision: {precision}"), - } } /// Internal recursive function to extract field names from nested structs effectively flattening the schema @@ -584,7 +591,7 @@ mod tests { let hash = hex::encode(ArrowDigester::::hash_array(&string_array)); assert_eq!( hash, - "078347d3063fb5bbe0bdbd3315cf8e5e140733ea34e6b73cbc0838b60a9c8012" + "bde5c268d835b1ea9ea8b2a058f35f978d1c95a071b71fc5051a8a21f717e77e" ); // Test large string array with same data to ensure consistency @@ -625,7 +632,7 @@ mod tests { assert_eq!( hex::encode(ArrowDigester::::hash_array(&decimal32_array)), - "bd639e8df756f0bd194f18572e89ea180307e6d46e88d96ade52b61e196c3268" + "9bafa8b4e342aa48ed6d25f3e7ca62ec849108395a93739252bdb329d72ec58a" ); // Test Decimal64 (precision 10-18) @@ -639,7 +646,7 @@ mod tests { .unwrap(); assert_eq!( hex::encode(ArrowDigester::::hash_array(&decimal64_array)), - "ca1f8a6fb179ddafad1e02738ad2d869da187c72a9b815d8e12a85692525d231" + "d2730c9222bd211d5c7cfae9fbe604728bb6e75aa0a96383daec511b20b63796" ); // Test Decimal128 (precision 19-38) @@ -797,7 +804,7 @@ mod tests { // Check the digest assert_eq!( hex::encode(digester.finalize()), - "3dad089e89d2d971b6f35781f670deec28b6d0201044110000e9a7cf96f74395" + "5ac26748e626fbac963be995ad91fcd90b441e9e27f2e8b93e39aeb8b5f60ca6" ); } } From 08bc1f1d45ce6276b83c522a0d5ad7dc4326e71f Mon Sep 17 00:00:00 2001 From: synicix Date: Thu, 4 Dec 2025 07:00:31 +0000 Subject: [PATCH 33/53] Remove unused panic --- src/arrow_digester.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/arrow_digester.rs b/src/arrow_digester.rs index 79bd0a8..1b4a67a 100644 --- a/src/arrow_digester.rs +++ b/src/arrow_digester.rs @@ -1,7 +1,6 @@ #![expect( clippy::expect_used, clippy::todo, - clippy::panic, reason = "First iteration of code, will add proper error handling later. 
Allow for unsupported data types for now" )] use std::collections::BTreeMap; @@ -198,6 +197,10 @@ impl ArrowDigester { } } + #[expect( + clippy::too_many_lines, + reason = "Comprehensive match on all data types" + )] fn array_digest_update(data_type: &DataType, array: &dyn Array, digest: &mut D) { match data_type { DataType::Null => todo!(), From cb9cf80e88e7c72af241e7503e52e4fa5122c138 Mon Sep 17 00:00:00 2001 From: synicix Date: Thu, 4 Dec 2025 07:03:07 +0000 Subject: [PATCH 34/53] Fix cargo fmt error --- src/pyarrow.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pyarrow.rs b/src/pyarrow.rs index b77f9cc..f75d393 100644 --- a/src/pyarrow.rs +++ b/src/pyarrow.rs @@ -1,5 +1,5 @@ use arrow::array::{RecordBatch, StructArray}; -use arrow::ffi::{FFI_ArrowArray, FFI_ArrowSchema, from_ffi}; +use arrow::ffi::{from_ffi, FFI_ArrowArray, FFI_ArrowSchema}; use sha2::Sha256; use crate::arrow_digester::ArrowDigester; From 9ce2993bd3605bbbfbe2c441813e0039e89351bb Mon Sep 17 00:00:00 2001 From: synicix Date: Mon, 8 Dec 2025 23:19:07 +0000 Subject: [PATCH 35/53] Change postcard hashing to json hashing --- Cargo.toml | 1 + src/arrow_digester.rs | 49 ++++++++++++++++++++++++++++--------------- 2 files changed, 33 insertions(+), 17 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index fd79294..c0ea45f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -18,6 +18,7 @@ digest = "0.10.7" postcard = "1.1.3" serde = "1.0.228" +serde_json = "1.0.145" sha2 = "0.10.9" # automated CFFI + bindings in other languages uniffi = { version = "0.29.4", features = ["cli", "tokio"] } diff --git a/src/arrow_digester.rs b/src/arrow_digester.rs index 1b4a67a..31578f9 100644 --- a/src/arrow_digester.rs +++ b/src/arrow_digester.rs @@ -15,7 +15,6 @@ use arrow::{ }; use arrow_schema::{Field, TimeUnit}; use digest::Digest; -use postcard::to_vec; const NULL_BYTES: &[u8] = b"NULL"; @@ -84,18 +83,13 @@ impl ArrowDigester { fn hash_schema(schema: &Schema) -> Vec { let fields_digest = schema .fields - .into_iter() - .map(|field| { - ( - field.name(), - to_vec::<_, 256>(field).expect("Failed to serialize field of schema"), - ) - }) + .iter() + .map(|field| (field.name(), field.to_string())) .collect::>(); // Hash the entire thing to the digest D::digest( - to_vec::<_, 1024>(&fields_digest).expect("Failed to serialize field_digest to bytes"), + serde_json::to_vec(&fields_digest).expect("Failed to serialize field_digest to bytes"), ) .to_vec() } @@ -498,12 +492,29 @@ mod tests { datatypes::Int32Type, }; use arrow_schema::{DataType, Field, Schema}; + use hex::encode; use pretty_assertions::assert_eq; use sha2::Sha256; use crate::arrow_digester::ArrowDigester; use arrow::array::Decimal128Array; + #[test] + fn schema_only() { + let schema = Schema::new(vec![ + Field::new("col1", DataType::Int32, false), + Field::new("col2", DataType::Utf8, true), + ]); + + let digester = ArrowDigester::::new(schema); + let hash = digester.finalize(); + + assert_eq!( + encode(hash), + "95eb6c962dd0b61704bc0a29347ff91d3024e52adc31e23b33d843c539715abc" + ); + } + #[test] fn boolean_array_hashing() { let bool_array = BooleanArray::from(vec![Some(true), None, Some(false), Some(true)]); @@ -634,7 +645,7 @@ mod tests { .unwrap(); assert_eq!( - hex::encode(ArrowDigester::::hash_array(&decimal32_array)), + encode(ArrowDigester::::hash_array(&decimal32_array)), "9bafa8b4e342aa48ed6d25f3e7ca62ec849108395a93739252bdb329d72ec58a" ); @@ -648,7 +659,7 @@ mod tests { .with_precision_and_scale(15, 3) .unwrap(); assert_eq!( - 
hex::encode(ArrowDigester::::hash_array(&decimal64_array)), + encode(ArrowDigester::::hash_array(&decimal64_array)), "d2730c9222bd211d5c7cfae9fbe604728bb6e75aa0a96383daec511b20b63796" ); @@ -696,8 +707,12 @@ mod tests { // Hash both record batches assert_eq!( - ArrowDigester::::hash_record_batch(batch1.as_ref().unwrap()), - ArrowDigester::::hash_record_batch(batch2.as_ref().unwrap()) + encode(ArrowDigester::::hash_record_batch( + batch1.as_ref().unwrap() + )), + encode(ArrowDigester::::hash_record_batch( + batch2.as_ref().unwrap() + )) ); } @@ -734,8 +749,8 @@ mod tests { digester.update(&batch1); digester.update(&batch2); assert_eq!( - hex::encode(digester.finalize()), - "9ba289655f0c7dd359ababc5a6f6188b352e45483623fbbf8b967723e2b798f8" + encode(digester.finalize()), + "6042a984d52c98d660832763a997eb8d79694005aa46ca89bac15a2240ee46e7" ); } @@ -806,8 +821,8 @@ mod tests { // Check the digest assert_eq!( - hex::encode(digester.finalize()), - "5ac26748e626fbac963be995ad91fcd90b441e9e27f2e8b93e39aeb8b5f60ca6" + encode(digester.finalize()), + "28a12e93525ceb84554fb0ce564d14b14c537b9d0d6b2bf13a583170d18f41fb" ); } } From 81726f5c81d0085b9171edc1276d3bee2bfe9404 Mon Sep 17 00:00:00 2001 From: synicix Date: Mon, 8 Dec 2025 23:24:23 +0000 Subject: [PATCH 36/53] Change delimiter to / --- src/arrow_digester.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/arrow_digester.rs b/src/arrow_digester.rs index 31578f9..d265509 100644 --- a/src/arrow_digester.rs +++ b/src/arrow_digester.rs @@ -18,7 +18,7 @@ use digest::Digest; const NULL_BYTES: &[u8] = b"NULL"; -const DELIMITER_FOR_NESTED_FIELD: &str = "__"; +const DELIMITER_FOR_NESTED_FIELD: &str = "/"; pub struct ArrowDigester { schema: Schema, @@ -783,8 +783,8 @@ mod tests { assert_eq!(field_names.len(), 3); assert!(field_names.contains(&&"id".to_owned())); - assert!(field_names.contains(&&"nested__name".to_owned())); - assert!(field_names.contains(&&"nested__deep__value".to_owned())); + assert!(field_names.contains(&&"nested/name".to_owned())); + assert!(field_names.contains(&&"nested/deep/value".to_owned())); // Test the nested field update by creating record_batch and using the update method let id_array = Arc::new(Int32Array::from(vec![Some(1), Some(2)])) as ArrayRef; From 932abe1a751453a1bf9da0b10eb84717f3e481ab Mon Sep 17 00:00:00 2001 From: synicix Date: Tue, 9 Dec 2025 00:12:14 +0000 Subject: [PATCH 37/53] Add binary array len hashing to resolve hash collision problem & tests --- src/arrow_digester.rs | 44 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 43 insertions(+), 1 deletion(-) diff --git a/src/arrow_digester.rs b/src/arrow_digester.rs index d265509..4ff0f49 100644 --- a/src/arrow_digester.rs +++ b/src/arrow_digester.rs @@ -359,6 +359,7 @@ impl ArrowDigester { for i in 0..array.len() { if null_buf.is_valid(i) { let value = array.value(i); + digest.update(value.len().to_le_bytes()); digest.update(value); } else { digest.update(NULL_BYTES); @@ -368,6 +369,7 @@ impl ArrowDigester { None => { for i in 0..array.len() { let value = array.value(i); + digest.update(value.len().to_le_bytes()); digest.update(value); } } @@ -581,7 +583,7 @@ mod tests { let hash = hex::encode(ArrowDigester::::hash_array(&binary_array)); assert_eq!( hash, - "078347d3063fb5bbe0bdbd3315cf8e5e140733ea34e6b73cbc0838b60a9c8012" + "2dadcaf793c1878ffa22eb2d5d746e27b648d638e4e344e565a23840a957b660" ); // Test large binary array with same data to ensure consistency @@ -598,6 +600,46 @@ mod tests { ); } + // Test binary array collision 
vulnerability - different partitions should produce different hashes + #[test] + fn binary_array_length_prefix_prevents_collisions() { + // Array 1: [[0x01, 0x02], [0x03]] + let array1 = BinaryArray::from(vec![Some(&[0x01_u8, 0x02_u8][..]), Some(&[0x03_u8][..])]); + + // Array 2: [[0x01], [0x02, 0x03]] + let array2 = BinaryArray::from(vec![Some(&[0x01_u8][..]), Some(&[0x02_u8, 0x03_u8][..])]); + + let hash1 = hex::encode(ArrowDigester::::hash_array(&array1)); + let hash2 = hex::encode(ArrowDigester::::hash_array(&array2)); + + // Without length prefix, these would collide (both hash to 0x01 0x02 0x03) + // With length prefix, they should produce different hashes + assert_ne!( + hash1, hash2, + "Binary arrays with different partitions should produce different hashes" + ); + } + + // Test string array collision vulnerability - different partitions should produce different hashes + #[test] + fn string_array_length_prefix_prevents_collisions() { + // Array 1: ["ab", "c"] + let array1 = StringArray::from(vec![Some("ab"), Some("c")]); + + // Array 2: ["a", "bc"] + let array2 = StringArray::from(vec![Some("a"), Some("bc")]); + + let hash1 = hex::encode(ArrowDigester::::hash_array(&array1)); + let hash2 = hex::encode(ArrowDigester::::hash_array(&array2)); + + // Without length prefix, these would collide (both hash to "abc") + // With length prefix, they should produce different hashes + assert_ne!( + hash1, hash2, + "String arrays with different partitions should produce different hashes" + ); + } + // Test String hashing #[test] fn string_array_hashing() { From 58ac7013d592c2994f511c3eaff6ea61869cee39 Mon Sep 17 00:00:00 2001 From: synicix Date: Tue, 9 Dec 2025 09:21:21 +0000 Subject: [PATCH 38/53] Save progress on redesigning null handling --- Cargo.toml | 1 + cspell.json | 2 +- src/arrow_digester.rs | 359 ++++++++++++++++++++++++++++++------------ 3 files changed, 262 insertions(+), 100 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index c0ea45f..f0bbac1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,6 +13,7 @@ categories = ["algorithms"] [dependencies] arrow = { version = "57.0.0", features = ["ffi"] } arrow-schema = { version = "57.0.0", features = ["serde"] } +bitvec = "1.0.1" digest = "0.10.7" postcard = "1.1.3" diff --git a/cspell.json b/cspell.json index 48de8d0..9ba9890 100644 --- a/cspell.json +++ b/cspell.json @@ -4,7 +4,7 @@ "pyarrow", "pythonapi", "uniffi", - "uids" + "uids""bitvec" ], "ignoreWords": [], "useGitignore": false, diff --git a/src/arrow_digester.rs b/src/arrow_digester.rs index 4ff0f49..1a30a6e 100644 --- a/src/arrow_digester.rs +++ b/src/arrow_digester.rs @@ -3,7 +3,7 @@ clippy::todo, reason = "First iteration of code, will add proper error handling later. 
Allow for unsupported data types for now" )] -use std::collections::BTreeMap; +use std::{collections::BTreeMap, iter::repeat_n}; use arrow::{ array::{ @@ -14,16 +14,22 @@ use arrow::{ datatypes::{DataType, Schema}, }; use arrow_schema::{Field, TimeUnit}; +use bitvec::prelude::*; use digest::Digest; const NULL_BYTES: &[u8] = b"NULL"; const DELIMITER_FOR_NESTED_FIELD: &str = "/"; +enum DigestBufferType { + NonNullable(D), + Nullable(BitVec, D), // Where first digest is for the bull bits, while the second is for the actual data +} + pub struct ArrowDigester { schema: Schema, schema_digest: Vec, - fields_digest_buffer: BTreeMap, + fields_digest_buffer: BTreeMap>, } impl ArrowDigester { @@ -47,9 +53,17 @@ impl ArrowDigester { /// Hash an array directly without needing to create an `ArrowDigester` instance on the user side pub fn hash_array(array: &dyn Array) -> Vec { - let mut digest = D::new(); - Self::array_digest_update(array.data_type(), array, &mut digest); - digest.finalize().to_vec() + let mut digest_buffer = if array.is_nullable() { + DigestBufferType::Nullable(BitVec::new(), D::new()) + } else { + DigestBufferType::NonNullable(D::new()) + }; + Self::array_digest_update(array.data_type(), array, &mut digest_buffer); + + // Finalize all the sub digest and combine them into a single digest + let mut final_digest = D::new(); + Self::finalize_digest(&mut final_digest, digest_buffer); + final_digest.finalize().to_vec() } /// Hash record batch directly without needing to create an `ArrowDigester` instance on the user side @@ -71,14 +85,32 @@ impl ArrowDigester { // Then digest each field digest in order self.fields_digest_buffer .into_iter() - .for_each(|(_, digest)| { - let field_hash = digest.finalize(); - final_digest.update(&field_hash); - }); + .for_each(|(_, digest)| Self::finalize_digest(&mut final_digest, digest)); final_digest.finalize().to_vec() } + #[expect( + clippy::big_endian_bytes, + reason = "Use for bit packing the null_bit_values" + )] + /// Finalize a single field digest into the final digest + /// Helpers to reduce code duplication + fn finalize_digest(final_digest: &mut D, digest: DigestBufferType) { + match digest { + DigestBufferType::NonNullable(data_digest) => { + final_digest.update(data_digest.finalize()); + } + DigestBufferType::Nullable(null_bit_digest, data_digest) => { + final_digest.update(null_bit_digest.len().to_le_bytes()); + for &word in null_bit_digest.as_raw_slice() { + final_digest.update(word.to_be_bytes()); + } + final_digest.update(data_digest.finalize()); + } + } + } + /// Serialize the schema into a `BTreeMap` for field name and its digest fn hash_schema(schema: &Schema) -> Vec { let fields_digest = schema @@ -148,7 +180,7 @@ impl ArrowDigester { field_name_hierarchy: &[&str], current_level: usize, array: &StructArray, - digest: &mut D, + digest: &mut DigestBufferType, ) { let current_level_plus_one = current_level .checked_add(1) @@ -195,7 +227,11 @@ impl ArrowDigester { clippy::too_many_lines, reason = "Comprehensive match on all data types" )] - fn array_digest_update(data_type: &DataType, array: &dyn Array, digest: &mut D) { + fn array_digest_update( + data_type: &DataType, + array: &dyn Array, + digest: &mut DigestBufferType, + ) { match data_type { DataType::Null => todo!(), DataType::Boolean => { @@ -205,24 +241,53 @@ impl ArrowDigester { .downcast_ref::() .expect("Failed to downcast to BooleanArray"); - bool_array.into_iter().for_each(|value| match value { - Some(b) => digest.update([u8::from(b)]), - None => digest.update(NULL_BYTES), - }); 
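The shape of the null handling introduced in this patch can be sketched without Arrow at all: validity goes into a length-prefixed bit vector, values go into a separate data digest, and the two are folded together at the end. Illustrative only, assuming the `bitvec` and `sha2` crates from Cargo.toml; the real `finalize_digest` additionally feeds the raw words through `to_be_bytes`, which this sketch simplifies to the raw byte slice:

```rust
use bitvec::prelude::*;
use sha2::{Digest, Sha256};

fn hash_nullable_i32(values: &[Option<i32>]) -> Vec<u8> {
    let mut null_bits = BitVec::<u8, Lsb0>::with_capacity(values.len());
    let mut data = Sha256::new();
    for v in values {
        null_bits.push(v.is_some());
        if let Some(x) = v {
            data.update(x.to_le_bytes());
        }
    }
    let mut final_digest = Sha256::new();
    // Length prefix plus packed validity bits, then the value digest.
    final_digest.update(null_bits.len().to_le_bytes());
    final_digest.update(null_bits.as_raw_slice());
    final_digest.update(data.finalize());
    final_digest.finalize().to_vec()
}

fn main() {
    // A null slot and a zero value can no longer collide.
    assert_ne!(
        hash_nullable_i32(&[Some(1), None]),
        hash_nullable_i32(&[Some(1), Some(0)])
    );
}
```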
+ match digest { + DigestBufferType::NonNullable(data_digest) => { + // We want to bit pack the boolean values into bytes for hashing + let mut bit_vec = BitVec::::with_capacity(bool_array.len()); + for i in 0..bool_array.len() { + bit_vec.push(bool_array.value(i)); + } + + data_digest.update(bit_vec.as_raw_slice()); + } + DigestBufferType::Nullable(null_bit_vec, data_digest) => { + // Handle null bits first + Self::handle_null_bits(bool_array, null_bit_vec); + + // Handle the data + let mut bit_vec = BitVec::::with_capacity(bool_array.len()); + for i in 0..bool_array.len() { + // We only want the valid bits, for null we will discard from the hash since that is already capture by null_bits + if bool_array.is_valid(i) { + bit_vec.push(bool_array.value(i)); + } + } + data_digest.update(bit_vec.as_raw_slice()); + } + } } DataType::Int8 | DataType::UInt8 => Self::hash_fixed_size_array(array, digest, 1), DataType::Int16 | DataType::UInt16 | DataType::Float16 => { Self::hash_fixed_size_array(array, digest, 2); } - DataType::Int32 | DataType::UInt32 | DataType::Float32 | DataType::Date32 => { + DataType::Int32 + | DataType::UInt32 + | DataType::Float32 + | DataType::Date32 + | DataType::Decimal32(_, _) => { Self::hash_fixed_size_array(array, digest, 4); } - DataType::Int64 | DataType::UInt64 | DataType::Float64 | DataType::Date64 => { + DataType::Int64 + | DataType::UInt64 + | DataType::Float64 + | DataType::Date64 + | DataType::Decimal64(_, _) => { Self::hash_fixed_size_array(array, digest, 8); } DataType::Timestamp(_, _) => todo!(), - DataType::Time32(time_unit) => Self::hash_time_array(array, *time_unit, digest, 4), - DataType::Time64(time_unit) => Self::hash_time_array(array, *time_unit, digest, 8), + DataType::Time32(_) => Self::hash_fixed_size_array(array, digest, 4), + DataType::Time64(_) => Self::hash_fixed_size_array(array, digest, 8), DataType::Duration(_) => todo!(), DataType::Interval(_) => todo!(), DataType::Binary => Self::hash_binary_array( @@ -284,20 +349,10 @@ impl ArrowDigester { DataType::Struct(_) => todo!(), DataType::Union(_, _) => todo!(), DataType::Dictionary(_, _) => todo!(), - DataType::Decimal32(precision, scale) => { - Self::hash_decimal_metadata(*precision, *scale, digest); - Self::hash_fixed_size_array(array, digest, 4); - } - DataType::Decimal64(precision, scale) => { - Self::hash_decimal_metadata(*precision, *scale, digest); - Self::hash_fixed_size_array(array, digest, 8); - } - DataType::Decimal128(precision, scale) => { - Self::hash_decimal_metadata(*precision, *scale, digest); + DataType::Decimal128(_, _) => { Self::hash_fixed_size_array(array, digest, 16); } - DataType::Decimal256(precision, scale) => { - Self::hash_decimal_metadata(*precision, *scale, digest); + DataType::Decimal256(_, _) => { Self::hash_fixed_size_array(array, digest, 32); } DataType::Map(_, _) => todo!(), @@ -306,7 +361,11 @@ impl ArrowDigester { } #[expect(clippy::cast_sign_loss, reason = "element_size is always positive")] - fn hash_fixed_size_array(array: &dyn Array, digest: &mut D, element_size: i32) { + fn hash_fixed_size_array( + array: &dyn Array, + digest_buffer: &mut DigestBufferType, + element_size: i32, + ) { let array_data = array.to_data(); let element_size_usize = element_size as usize; @@ -324,61 +383,103 @@ impl ArrowDigester { ) .expect("Failed to get buffer slice for FixedSizeBinaryArray"); - // Deal with null - match array_data.nulls() { - Some(null_buffer) => { - // There are nulls, so we need to incrementally hash each value - for i in 0..array_data.len() { - if 
null_buffer.is_valid(i) { - let data_pos = i - .checked_mul(element_size_usize) - .expect("Data position multiplication overflow"); - let end_pos = data_pos - .checked_add(element_size_usize) - .expect("End position addition overflow"); - if let Some(data_slice) = slice.get(data_pos..end_pos) { - digest.update(data_slice); - } else { - digest.update(NULL_BYTES); + match digest_buffer { + DigestBufferType::NonNullable(data_digest) => { + // No nulls, we can hash the entire buffer directly + data_digest.update(slice); + } + DigestBufferType::Nullable(null_bits, data_digest) => { + // Handle null bits first + Self::handle_null_bits(array, null_bits); + + match array_data.nulls() { + Some(null_buffer) => { + // There are nulls, so we need to incrementally hash each value + for i in 0..array_data.len() { + if null_buffer.is_valid(i) { + let data_pos = i + .checked_mul(element_size_usize) + .expect("Data position multiplication overflow"); + let end_pos = data_pos + .checked_add(element_size_usize) + .expect("End position addition overflow"); + + data_digest.update( + slice + .get(data_pos..end_pos) + .expect("Failed to get data_slice"), + ); + } } - } else { - digest.update(NULL_BYTES); + } + None => { + // No nulls, we can hash the entire buffer directly + data_digest.update(slice); } } } - None => { - // No nulls, we can hash the entire buffer directly - digest.update(slice); - } } } - fn hash_binary_array(array: &GenericBinaryArray, digest: &mut D) { - match array.nulls() { - Some(null_buf) => { + fn hash_binary_array( + array: &GenericBinaryArray, + digest: &mut DigestBufferType, + ) { + match digest { + DigestBufferType::NonNullable(data_digest) => { for i in 0..array.len() { - if null_buf.is_valid(i) { - let value = array.value(i); - digest.update(value.len().to_le_bytes()); - digest.update(value); - } else { - digest.update(NULL_BYTES); - } + let value = array.value(i); + data_digest.update(value.len().to_le_bytes()); + data_digest.update(value); } } - None => { - for i in 0..array.len() { - let value = array.value(i); - digest.update(value.len().to_le_bytes()); - digest.update(value); + DigestBufferType::Nullable(null_bit_vec, data_digest) => { + // Deal with the null bits first + if let Some(null_buf) = array.nulls() { + // We would need to iterate through the null buffer and push it into the null_bit_vec + for i in 0..array.len() { + null_bit_vec.push(null_buf.is_valid(i)); + } + + for i in 0..array.len() { + if null_buf.is_valid(i) { + let value = array.value(i); + data_digest.update(value.len().to_le_bytes()); + data_digest.update(value); + } else { + data_digest.update(NULL_BYTES); + } + } + } else { + // All valid, therefore we can extend the bit vector with all true values + let len = array.len().checked_sub(1).expect("Array length underflow"); + null_bit_vec.extend(repeat_n(true, len)); + + // Deal with the data + for i in 0..array.len() { + let value = array.value(i); + data_digest.update(value.len().to_le_bytes()); + data_digest.update(value); + } } } } } - fn hash_time_array(array: &dyn Array, time_unit: TimeUnit, digest: &mut D, element_size: i32) { + fn hash_time_array( + array: &dyn Array, + time_unit: TimeUnit, + digest: &mut DigestBufferType, + element_size: i32, + ) { // We need to update the digest with the time unit first to ensure different time units produce different hashes - digest.update([match time_unit { + + let data_digest = match digest { + DigestBufferType::NonNullable(data_digest) + | DigestBufferType::Nullable(_, data_digest) => data_digest, + }; + + 
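For the variable-length paths above, the `value.len()` prefix is what keeps differently partitioned arrays apart: without it, `["ab", "c"]` and `["a", "bc"]` would feed identical bytes to the digest. A minimal illustration using only `sha2`, not tied to Arrow arrays:

```rust
use sha2::{Digest, Sha256};

fn hash_strings_with_len_prefix(values: &[&str]) -> Vec<u8> {
    let mut d = Sha256::new();
    for v in values {
        // Length prefix acts as a separator between variable-length values.
        d.update((v.len() as u64).to_le_bytes());
        d.update(v.as_bytes());
    }
    d.finalize().to_vec()
}

fn main() {
    assert_ne!(
        hash_strings_with_len_prefix(&["ab", "c"]),
        hash_strings_with_len_prefix(&["a", "bc"])
    );
}
```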
data_digest.update([match time_unit { TimeUnit::Second => 0_u8, TimeUnit::Millisecond => 1_u8, TimeUnit::Microsecond => 2_u8, @@ -393,24 +494,41 @@ impl ArrowDigester { clippy::cast_possible_truncation, reason = "String lengths from Arrow offsets are bounded" )] - fn hash_string_array(array: &GenericStringArray, digest: &mut D) { - match array.nulls() { - Some(null_buf) => { + fn hash_string_array( + array: &GenericStringArray, + digest: &mut DigestBufferType, + ) { + match digest { + DigestBufferType::NonNullable(data_digest) => { for i in 0..array.len() { - if null_buf.is_valid(i) { - let value = array.value(i); - digest.update((value.len() as u32).to_le_bytes()); - digest.update(value.as_bytes()); - } else { - digest.update(NULL_BYTES); - } + let value = array.value(i); + data_digest.update((value.len() as u64).to_le_bytes()); + data_digest.update(value.as_bytes()); } } - None => { - for i in 0..array.len() { - let value = array.value(i); - digest.update((value.len() as u32).to_le_bytes()); - digest.update(value.as_bytes()); + DigestBufferType::Nullable(null_bit_vec, data_digest) => { + // Deal with the null bits first + Self::handle_null_bits(array, null_bit_vec); + + match array.nulls() { + Some(null_buf) => { + for i in 0..array.len() { + if null_buf.is_valid(i) { + let value = array.value(i); + data_digest.update((value.len() as u32).to_le_bytes()); + data_digest.update(value.as_bytes()); + } else { + data_digest.update(NULL_BYTES); + } + } + } + None => { + for i in 0..array.len() { + let value = array.value(i); + data_digest.update((value.len() as u32).to_le_bytes()); + data_digest.update(value.as_bytes()); + } + } } } } @@ -419,21 +537,42 @@ impl ArrowDigester { fn hash_list_array( array: &GenericListArray, field_data_type: &DataType, - digest: &mut D, + digest: &mut DigestBufferType, ) { - match array.nulls() { - Some(null_buf) => { + todo!(); + match digest { + DigestBufferType::NonNullable(data_digest) => { for i in 0..array.len() { - if null_buf.is_valid(i) { - Self::array_digest_update(field_data_type, array.value(i).as_ref(), digest); - } else { - digest.update(NULL_BYTES); - } + Self::array_digest_update(field_data_type, array.value(i).as_ref(), digest); } } - None => { - for i in 0..array.len() { - Self::array_digest_update(field_data_type, array.value(i).as_ref(), digest); + DigestBufferType::Nullable(bit_vec, data_digest) => { + // Deal with null bits first + Self::handle_null_bits(array, bit_vec); + + match array.nulls() { + Some(null_buf) => { + for i in 0..array.len() { + if null_buf.is_valid(i) { + Self::array_digest_update( + field_data_type, + array.value(i).as_ref(), + digest, + ); + } else { + data_digest.update(NULL_BYTES); + } + } + } + None => { + for i in 0..array.len() { + Self::array_digest_update( + field_data_type, + array.value(i).as_ref(), + digest, + ); + } + } } } } @@ -450,7 +589,7 @@ impl ArrowDigester { fn extract_fields_name( field: &Field, parent_field_name: &str, - fields_digest_buffer: &mut BTreeMap, + fields_digest_buffer: &mut BTreeMap>, ) { // Check if field is a nested type of struct if let DataType::Struct(fields) = field.data_type() { @@ -466,7 +605,11 @@ impl ArrowDigester { // Base case, just add the the combine field name to the map fields_digest_buffer.insert( Self::construct_field_name_hierarchy(parent_field_name, field.name()), - D::new(), + if field.is_nullable() { + DigestBufferType::Nullable(BitVec::new(), D::new()) + } else { + DigestBufferType::NonNullable(D::new()) + }, ); } } @@ -478,6 +621,24 @@ impl ArrowDigester { 
format!("{parent_field_name}{DELIMITER_FOR_NESTED_FIELD}{field_name}") } } + + fn handle_null_bits(array: &dyn Array, null_bit_vec: &mut BitVec) { + match array.nulls() { + Some(null_buf) => { + // We would need to iterate through the null buffer and push it into the null_bit_vec + for i in 0..array.len() { + null_bit_vec.push(null_buf.is_valid(i)); + } + } + None => { + // All valid, therefore we can extend the bit vector with all true values + null_bit_vec.extend(repeat_n( + true, + array.len().checked_sub(1).expect("Array length underflow"), + )); + } + } + } } #[cfg(test)] From 7728bfe80909873a894372fdf6d6ab4490ac44cb Mon Sep 17 00:00:00 2001 From: synicix Date: Wed, 10 Dec 2025 22:36:54 +0000 Subject: [PATCH 39/53] Patch nullbits handling and included datatypes into schema definition --- Cargo.toml | 1 + cspell.json | 4 +- src/arrow_digester.rs | 384 ++++++++++++++++++++++++++++++++++-------- 3 files changed, 318 insertions(+), 71 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index f0bbac1..13ba115 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -15,6 +15,7 @@ arrow = { version = "57.0.0", features = ["ffi"] } arrow-schema = { version = "57.0.0", features = ["serde"] } bitvec = "1.0.1" digest = "0.10.7" +indoc = "2.0.7" postcard = "1.1.3" diff --git a/cspell.json b/cspell.json index 9ba9890..ea8ac1f 100644 --- a/cspell.json +++ b/cspell.json @@ -4,7 +4,9 @@ "pyarrow", "pythonapi", "uniffi", - "uids""bitvec" + "uids", + "bitvec", + "indoc" ], "ignoreWords": [], "useGitignore": false, diff --git a/src/arrow_digester.rs b/src/arrow_digester.rs index 1a30a6e..e45b2ce 100644 --- a/src/arrow_digester.rs +++ b/src/arrow_digester.rs @@ -13,7 +13,7 @@ use arrow::{ }, datatypes::{DataType, Schema}, }; -use arrow_schema::{Field, TimeUnit}; +use arrow_schema::Field; use bitvec::prelude::*; use digest::Digest; @@ -52,17 +52,32 @@ impl ArrowDigester { } /// Hash an array directly without needing to create an `ArrowDigester` instance on the user side + /// For hash array, we don't have a schema to hash, however we do have field data type. + /// So similar to schema, we will hash based on datatype to encode the metadata information into the digest + /// + /// # Panics + /// + /// This function will panic if JSON serialization of the data type fails. + /// pub fn hash_array(array: &dyn Array) -> Vec { + let mut final_digest = D::new(); + + let data_type_serialized = serde_json::to_string(&array.data_type()) + .expect("Failed to serialize data type to string"); + + // Update the digest buffer with the array metadata and field data + final_digest.update(data_type_serialized); + + // Now we update it with the actual array data let mut digest_buffer = if array.is_nullable() { DigestBufferType::Nullable(BitVec::new(), D::new()) } else { DigestBufferType::NonNullable(D::new()) }; Self::array_digest_update(array.data_type(), array, &mut digest_buffer); - - // Finalize all the sub digest and combine them into a single digest - let mut final_digest = D::new(); Self::finalize_digest(&mut final_digest, digest_buffer); + + // Finalize and return the digest final_digest.finalize().to_vec() } @@ -112,18 +127,23 @@ impl ArrowDigester { } /// Serialize the schema into a `BTreeMap` for field name and its digest - fn hash_schema(schema: &Schema) -> Vec { + /// + /// # Panics + /// This function will panic if JSON serialization of the schema fails. 
+ fn serialized_schema(schema: &Schema) -> String { let fields_digest = schema .fields .iter() - .map(|field| (field.name(), field.to_string())) + .map(|field| (field.name(), (field.to_string(), field.data_type()))) .collect::>(); + serde_json::to_string(&fields_digest).expect("Failed to serialize field_digest to bytes") + } + + /// Serialize the schema into a `BTreeMap` for field name and its digest + pub fn hash_schema(schema: &Schema) -> Vec { // Hash the entire thing to the digest - D::digest( - serde_json::to_vec(&fields_digest).expect("Failed to serialize field_digest to bytes"), - ) - .to_vec() + D::digest(Self::serialized_schema(schema)).to_vec() } /// Hash a record batch and update the internal digests @@ -466,30 +486,6 @@ impl ArrowDigester { } } - fn hash_time_array( - array: &dyn Array, - time_unit: TimeUnit, - digest: &mut DigestBufferType, - element_size: i32, - ) { - // We need to update the digest with the time unit first to ensure different time units produce different hashes - - let data_digest = match digest { - DigestBufferType::NonNullable(data_digest) - | DigestBufferType::Nullable(_, data_digest) => data_digest, - }; - - data_digest.update([match time_unit { - TimeUnit::Second => 0_u8, - TimeUnit::Millisecond => 1_u8, - TimeUnit::Microsecond => 2_u8, - TimeUnit::Nanosecond => 3_u8, - }]); - - // Now hash the underlying fixed size array based on time unit - Self::hash_fixed_size_array(array, digest, element_size); - } - #[expect( clippy::cast_possible_truncation, reason = "String lengths from Arrow offsets are bounded" @@ -539,14 +535,13 @@ impl ArrowDigester { field_data_type: &DataType, digest: &mut DigestBufferType, ) { - todo!(); match digest { - DigestBufferType::NonNullable(data_digest) => { + DigestBufferType::NonNullable(_) => { for i in 0..array.len() { Self::array_digest_update(field_data_type, array.value(i).as_ref(), digest); } } - DigestBufferType::Nullable(bit_vec, data_digest) => { + DigestBufferType::Nullable(bit_vec, _) => { // Deal with null bits first Self::handle_null_bits(array, bit_vec); @@ -559,8 +554,6 @@ impl ArrowDigester { array.value(i).as_ref(), digest, ); - } else { - data_digest.update(NULL_BYTES); } } } @@ -578,12 +571,6 @@ impl ArrowDigester { } } - fn hash_decimal_metadata(precision: u8, scale: i8, digest: &mut D) { - // Include the precision and scale in the hash - digest.update([precision]); - digest.update(scale.to_le_bytes()); - } - /// Internal recursive function to extract field names from nested structs effectively flattening the schema /// The format is `parent__child__grandchild__etc`... 
for nested fields and will be stored in `fields_digest_buffer` fn extract_fields_name( @@ -648,33 +635,290 @@ mod tests { use arrow::{ array::{ - ArrayRef, BinaryArray, BooleanArray, Int32Array, Int64Array, LargeBinaryArray, - LargeStringArray, ListArray, RecordBatch, StringArray, StructArray, - Time32MillisecondArray, Time32SecondArray, Time64MicrosecondArray, + ArrayRef, BinaryArray, BooleanArray, Date32Array, Date64Array, Decimal32Array, + Decimal64Array, Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, + Int8Array, LargeBinaryArray, LargeListArray, LargeStringArray, ListArray, RecordBatch, + StringArray, StructArray, Time32MillisecondArray, Time32SecondArray, + Time64MicrosecondArray, Time64NanosecondArray, UInt16Array, UInt32Array, UInt64Array, + UInt8Array, }, datatypes::Int32Type, }; - use arrow_schema::{DataType, Field, Schema}; + use arrow_schema::{DataType, Field, Schema, TimeUnit}; use hex::encode; + use indoc::indoc; use pretty_assertions::assert_eq; use sha2::Sha256; use crate::arrow_digester::ArrowDigester; use arrow::array::Decimal128Array; + #[expect(clippy::too_many_lines, reason = "Comprehensive schema test")] #[test] - fn schema_only() { + fn schema() { let schema = Schema::new(vec![ - Field::new("col1", DataType::Int32, false), - Field::new("col2", DataType::Utf8, true), + Field::new("bool", DataType::Boolean, true), + Field::new("int8", DataType::Int8, false), + Field::new("uint8", DataType::UInt8, false), + Field::new("int16", DataType::Int16, false), + Field::new("uint16", DataType::UInt16, false), + Field::new("int32", DataType::Int32, false), + Field::new("uint32", DataType::UInt32, false), + Field::new("int64", DataType::Int64, false), + Field::new("uint64", DataType::UInt64, false), + Field::new("float32", DataType::Float32, false), + Field::new("float64", DataType::Float64, false), + Field::new("date32", DataType::Date32, false), + Field::new("date64", DataType::Date64, false), + Field::new("time32_second", DataType::Time32(TimeUnit::Second), false), + Field::new( + "time32_millis", + DataType::Time32(TimeUnit::Millisecond), + false, + ), + Field::new( + "time64_micro", + DataType::Time64(TimeUnit::Microsecond), + false, + ), + Field::new("time64_nano", DataType::Time64(TimeUnit::Nanosecond), false), + Field::new("binary", DataType::Binary, true), + Field::new("large_binary", DataType::LargeBinary, true), + Field::new("utf8", DataType::Utf8, true), + Field::new("large_utf8", DataType::LargeUtf8, true), + Field::new( + "list", + DataType::List(Box::new(Field::new("item", DataType::Int32, true)).into()), + true, + ), + Field::new( + "large_list", + DataType::LargeList(Box::new(Field::new("item", DataType::Int32, true)).into()), + true, + ), + Field::new("decimal32", DataType::Decimal32(9, 2), true), + Field::new("decimal64", DataType::Decimal64(18, 3), true), + Field::new("decimal128", DataType::Decimal128(38, 5), true), ]); - let digester = ArrowDigester::::new(schema); - let hash = digester.finalize(); + // Serialize the schema and covert it over to pretty json for comparison + let compact_json: serde_json::Value = + serde_json::from_str(&ArrowDigester::::serialized_schema(&schema)).unwrap(); + let pretty_json = serde_json::to_string_pretty(&compact_json).unwrap(); assert_eq!( - encode(hash), - "95eb6c962dd0b61704bc0a29347ff91d3024e52adc31e23b33d843c539715abc" + pretty_json, + indoc! 
{r#" +{ + "binary": [ + "Field { \"binary\": nullable Binary }", + "Binary" + ], + "bool": [ + "Field { \"bool\": nullable Boolean }", + "Boolean" + ], + "date32": [ + "Field { \"date32\": Date32 }", + "Date32" + ], + "date64": [ + "Field { \"date64\": Date64 }", + "Date64" + ], + "decimal128": [ + "Field { \"decimal128\": nullable Decimal128(38, 5) }", + { + "Decimal128": [ + 38, + 5 + ] + } + ], + "decimal32": [ + "Field { \"decimal32\": nullable Decimal32(9, 2) }", + { + "Decimal32": [ + 9, + 2 + ] + } + ], + "decimal64": [ + "Field { \"decimal64\": nullable Decimal64(18, 3) }", + { + "Decimal64": [ + 18, + 3 + ] + } + ], + "float32": [ + "Field { \"float32\": Float32 }", + "Float32" + ], + "float64": [ + "Field { \"float64\": Float64 }", + "Float64" + ], + "int16": [ + "Field { \"int16\": Int16 }", + "Int16" + ], + "int32": [ + "Field { \"int32\": Int32 }", + "Int32" + ], + "int64": [ + "Field { \"int64\": Int64 }", + "Int64" + ], + "int8": [ + "Field { \"int8\": Int8 }", + "Int8" + ], + "large_binary": [ + "Field { \"large_binary\": nullable LargeBinary }", + "LargeBinary" + ], + "large_list": [ + "Field { \"large_list\": nullable LargeList(nullable Int32) }", + { + "LargeList": { + "data_type": "Int32", + "dict_id": 0, + "dict_is_ordered": false, + "metadata": {}, + "name": "item", + "nullable": true + } + } + ], + "large_utf8": [ + "Field { \"large_utf8\": nullable LargeUtf8 }", + "LargeUtf8" + ], + "list": [ + "Field { \"list\": nullable List(nullable Int32) }", + { + "List": { + "data_type": "Int32", + "dict_id": 0, + "dict_is_ordered": false, + "metadata": {}, + "name": "item", + "nullable": true + } + } + ], + "time32_millis": [ + "Field { \"time32_millis\": Time32(ms) }", + { + "Time32": "Millisecond" + } + ], + "time32_second": [ + "Field { \"time32_second\": Time32(s) }", + { + "Time32": "Second" + } + ], + "time64_micro": [ + "Field { \"time64_micro\": Time64(µs) }", + { + "Time64": "Microsecond" + } + ], + "time64_nano": [ + "Field { \"time64_nano\": Time64(ns) }", + { + "Time64": "Nanosecond" + } + ], + "uint16": [ + "Field { \"uint16\": UInt16 }", + "UInt16" + ], + "uint32": [ + "Field { \"uint32\": UInt32 }", + "UInt32" + ], + "uint64": [ + "Field { \"uint64\": UInt64 }", + "UInt64" + ], + "uint8": [ + "Field { \"uint8\": UInt8 }", + "UInt8" + ], + "utf8": [ + "Field { \"utf8\": nullable Utf8 }", + "Utf8" + ] +}"#} + ); + + // Empty Table Hashing Check + + assert_eq!( + encode(ArrowDigester::::new(schema.clone()).finalize()), + "a42e35f6623d86b72350bf0bb74b97781946df45423f192c397d435c254bc71e" + ); + + let batch = RecordBatch::try_new( + Arc::new(schema), + vec![ + Arc::new(BooleanArray::from(vec![Some(true)])), + Arc::new(Int8Array::from(vec![1_i8])), + Arc::new(UInt8Array::from(vec![1_u8])), + Arc::new(Int16Array::from(vec![100_i16])), + Arc::new(UInt16Array::from(vec![100_u16])), + Arc::new(Int32Array::from(vec![1000_i32])), + Arc::new(UInt32Array::from(vec![1000_u32])), + Arc::new(Int64Array::from(vec![100_000_i64])), + Arc::new(UInt64Array::from(vec![100_000_u64])), + Arc::new(Float32Array::from(vec![1.5_f32])), + Arc::new(Float64Array::from(vec![1.5_f64])), + Arc::new(Date32Array::from(vec![18993_i32])), + Arc::new(Date64Array::from(vec![1_640_995_200_000_i64])), + Arc::new(Time32SecondArray::from(vec![3600_i32])), + Arc::new(Time32MillisecondArray::from(vec![3_600_000_i32])), + Arc::new(Time64MicrosecondArray::from(vec![3_600_000_000_i64])), + Arc::new(Time64NanosecondArray::from(vec![3_600_000_000_000_i64])), + 
Arc::new(BinaryArray::from(vec![Some(b"data1".as_ref())])), + Arc::new(LargeBinaryArray::from(vec![Some(b"large1".as_ref())])), + Arc::new(StringArray::from(vec![Some("text1")])), + Arc::new(LargeStringArray::from(vec![Some("large_text1")])), + Arc::new(ListArray::from_iter_primitive::(vec![ + Some(vec![Some(1), Some(2)]), + ])), + Arc::new(LargeListArray::from_iter_primitive::( + vec![Some(vec![Some(5), Some(6)])], + )), + Arc::new( + Decimal32Array::from_iter(vec![Some(12345)]) + .with_precision_and_scale(9, 2) + .unwrap(), + ), + Arc::new( + Decimal64Array::from_iter(vec![Some(123_456_789_012)]) + .with_precision_and_scale(18, 3) + .unwrap(), + ), + Arc::new( + Decimal128Array::from_iter(vec![Some( + 123_456_789_012_345_678_901_234_567_890_i128, + )]) + .with_precision_and_scale(38, 5) + .unwrap(), + ), + ], + ) + .unwrap(); + // Hash the record batch + assert_eq!( + encode(ArrowDigester::::hash_record_batch(&batch)), + "da0d7d3d76a47e88648e3a1160a5d2432647f0769e08b42315533163c36b3eb0" ); } @@ -684,7 +928,7 @@ mod tests { let hash = hex::encode(ArrowDigester::::hash_array(&bool_array)); assert_eq!( hash, - "d7b7a73916d3f0c693ebcfa94fe2eee163d31a38ba8fe44ef81c5ffbff50c9be" + "f9abeb37d9395f359b48a379f0a8467c572b19ecc6cae9fa85e1bf627a52a8f3" ); } @@ -695,7 +939,7 @@ mod tests { let hash = hex::encode(ArrowDigester::::hash_array(&int_array)); assert_eq!( hash, - "bb36e54f5e2d937a05bb716a8d595f1c8da67fda48feeb7ab5b071a69e63d648" + "27f2411e6839eb1e3fe706ac3f01e704c7b46357360fb2ddb8a08ec98e8ba4fa" ); } @@ -706,7 +950,7 @@ mod tests { let hash = hex::encode(ArrowDigester::::hash_array(&time_array)); assert_eq!( hash, - "b5d70eca0650399a9b00440e3cd9985e58b0f033d446bdd5947f96a62397002a" + "9000b74aa80f685103a8cafc7e113aa8f33ccc0c94ea3713318d2cc2f3436baa" ); } @@ -717,7 +961,7 @@ mod tests { let hash = hex::encode(ArrowDigester::::hash_array(&time_array)); assert_eq!( hash, - "1f0847660ea421c266f226293d2f0c54ea5de0c168ac7e4bebfabf6d348a6d18" + "95f12143d789f364a3ed52f7300f8f91dc21fbe00c34aed798ca8fd54182dea3" ); } @@ -744,7 +988,7 @@ mod tests { let hash = hex::encode(ArrowDigester::::hash_array(&binary_array)); assert_eq!( hash, - "2dadcaf793c1878ffa22eb2d5d746e27b648d638e4e344e565a23840a957b660" + "466801efd880d2acecd6c78915b5c2a51476870f9116912834d79de43a000071" ); // Test large binary array with same data to ensure consistency @@ -755,7 +999,7 @@ mod tests { Some(b"".as_ref()), ]); - assert_eq!( + assert_ne!( hex::encode(ArrowDigester::::hash_array(&large_binary_array)), hash ); @@ -808,14 +1052,14 @@ mod tests { let hash = hex::encode(ArrowDigester::::hash_array(&string_array)); assert_eq!( hash, - "bde5c268d835b1ea9ea8b2a058f35f978d1c95a071b71fc5051a8a21f717e77e" + "14a2d2eaf535b6e78fbf1d58ae93accce424eafd20fa449eff8acefc47903d3d" ); // Test large string array with same data to ensure consistency let large_string_array = LargeStringArray::from(vec![Some("hello"), None, Some("world"), Some("")]); - assert_eq!( + assert_ne!( hex::encode(ArrowDigester::::hash_array(&large_string_array)), hash ); @@ -834,7 +1078,7 @@ mod tests { let hash = hex::encode(ArrowDigester::::hash_array(&list_array)); assert_eq!( hash, - "d30c8845c58f71bcec4910c65a91328af2cc86d26001662270da3a3d5222dd36" + "1a8d06635dec40079b979ce439f662c1fb6456bb7e02bbf7d8e8048c61498faf" ); } @@ -849,7 +1093,7 @@ mod tests { assert_eq!( encode(ArrowDigester::::hash_array(&decimal32_array)), - "9bafa8b4e342aa48ed6d25f3e7ca62ec849108395a93739252bdb329d72ec58a" + "ef29250615f9d6ab34672c3b11dfa2dcda6e8e6164bc55899c13887f17705f5d" ); // 
Test Decimal64 (precision 10-18) @@ -863,7 +1107,7 @@ mod tests { .unwrap(); assert_eq!( encode(ArrowDigester::::hash_array(&decimal64_array)), - "d2730c9222bd211d5c7cfae9fbe604728bb6e75aa0a96383daec511b20b63796" + "efa4ed72641051233889c07775366cbf2e56eb4b0fcfd46653f5741e81786f08" ); // Test Decimal128 (precision 19-38) @@ -877,7 +1121,7 @@ mod tests { .unwrap(); assert_eq!( hex::encode(ArrowDigester::::hash_array(&decimal128_array)), - "d2a1a2d8c87193032d46a541405e1bf60124d08a7c431ce3fe55f26508b400f3" + "55cc4d81a048dbca001ca8581673a5a6c93efd870d358df211a545c2af9b658d" ); } @@ -953,7 +1197,7 @@ mod tests { digester.update(&batch2); assert_eq!( encode(digester.finalize()), - "6042a984d52c98d660832763a997eb8d79694005aa46ca89bac15a2240ee46e7" + "37954b3edd169c7a9e65604c191caf6a307940357305d182a5d2168047e9cc51" ); } @@ -1025,7 +1269,7 @@ mod tests { // Check the digest assert_eq!( encode(digester.finalize()), - "28a12e93525ceb84554fb0ce564d14b14c537b9d0d6b2bf13a583170d18f41fb" + "b7faf50f1328ec80b575e018c121eed9d0e7e84ad72645499ebc8667e64199a7" ); } } From c2e2564b16ffa5f336511ce430f29e347decc611 Mon Sep 17 00:00:00 2001 From: synicix Date: Thu, 11 Dec 2025 05:50:11 +0000 Subject: [PATCH 40/53] Move actual arrow_digester logic to core and private it, while making a sha256 as public --- cspell.json | 3 +- src/arrow_digester.rs | 1278 +----------------------------------- src/arrow_digester_core.rs | 925 ++++++++++++++++++++++++++ src/lib.rs | 44 +- src/pyarrow.rs | 8 +- tests/arrow_digester.rs | 409 ++++++++++++ 6 files changed, 1402 insertions(+), 1265 deletions(-) create mode 100644 src/arrow_digester_core.rs create mode 100644 tests/arrow_digester.rs diff --git a/cspell.json b/cspell.json index ea8ac1f..5fe20e7 100644 --- a/cspell.json +++ b/cspell.json @@ -6,7 +6,8 @@ "uniffi", "uids", "bitvec", - "indoc" + "indoc", + "starfix" ], "ignoreWords": [], "useGitignore": false, diff --git a/src/arrow_digester.rs b/src/arrow_digester.rs index e45b2ce..6942c34 100644 --- a/src/arrow_digester.rs +++ b/src/arrow_digester.rs @@ -1,1275 +1,39 @@ -#![expect( - clippy::expect_used, - clippy::todo, - reason = "First iteration of code, will add proper error handling later. 
Allow for unsupported data types for now" -)] -use std::{collections::BTreeMap, iter::repeat_n}; +use arrow::array::{Array, RecordBatch}; +use arrow_schema::Schema; +use sha2::Sha256; -use arrow::{ - array::{ - Array, BinaryArray, BooleanArray, GenericBinaryArray, GenericListArray, GenericStringArray, - LargeBinaryArray, LargeListArray, LargeStringArray, ListArray, OffsetSizeTrait, - RecordBatch, StringArray, StructArray, - }, - datatypes::{DataType, Schema}, -}; -use arrow_schema::Field; -use bitvec::prelude::*; -use digest::Digest; +use crate::arrow_digester_core::ArrowDigesterCore; -const NULL_BYTES: &[u8] = b"NULL"; - -const DELIMITER_FOR_NESTED_FIELD: &str = "/"; - -enum DigestBufferType { - NonNullable(D), - Nullable(BitVec, D), // Where first digest is for the bull bits, while the second is for the actual data +/// Maps `arrow_digester_core` function to a `sha_256` digester + versioning +pub struct ArrowDigester { + digester: ArrowDigesterCore, } -pub struct ArrowDigester { - schema: Schema, - schema_digest: Vec, - fields_digest_buffer: BTreeMap>, -} - -impl ArrowDigester { +impl ArrowDigester { + /// Create a new instance of `ArrowDigester` with SHA256 as the digester with the schema which will be enforce through each update pub fn new(schema: Schema) -> Self { - // Hash the schema first - let schema_digest = Self::hash_schema(&schema); - - // Flatten all nested fields into a single map, this allows us to hash each field individually and efficiently - let mut fields_digest_buffer = BTreeMap::new(); - schema.fields.into_iter().for_each(|field| { - Self::extract_fields_name(field, "", &mut fields_digest_buffer); - }); - - // Store it in the new struct for now Self { - schema, - schema_digest, - fields_digest_buffer, + digester: ArrowDigesterCore::::new(schema), } } - /// Hash an array directly without needing to create an `ArrowDigester` instance on the user side - /// For hash array, we don't have a schema to hash, however we do have field data type. - /// So similar to schema, we will hash based on datatype to encode the metadata information into the digest - /// - /// # Panics - /// - /// This function will panic if JSON serialization of the data type fails. 
- /// - pub fn hash_array(array: &dyn Array) -> Vec { - let mut final_digest = D::new(); - - let data_type_serialized = serde_json::to_string(&array.data_type()) - .expect("Failed to serialize data type to string"); - - // Update the digest buffer with the array metadata and field data - final_digest.update(data_type_serialized); - - // Now we update it with the actual array data - let mut digest_buffer = if array.is_nullable() { - DigestBufferType::Nullable(BitVec::new(), D::new()) - } else { - DigestBufferType::NonNullable(D::new()) - }; - Self::array_digest_update(array.data_type(), array, &mut digest_buffer); - Self::finalize_digest(&mut final_digest, digest_buffer); - - // Finalize and return the digest - final_digest.finalize().to_vec() + /// Update the digester with a new `RecordBatch` + pub fn update(&mut self, record_batch: &RecordBatch) { + self.digester.update(record_batch); } - /// Hash record batch directly without needing to create an `ArrowDigester` instance on the user side - pub fn hash_record_batch(record_batch: &RecordBatch) -> Vec { - let mut digester = Self::new(record_batch.schema().as_ref().clone()); - digester.update(record_batch); - digester.finalize() - } - - /// This will consume the `ArrowDigester` and produce the final combined digest where the schema - /// digest is fed in first, followed by each field digest in alphabetical order of field names + /// Consume the digester and finalize the hash computation pub fn finalize(self) -> Vec { - // Finalize all the sub digest and combine them into a single digest - let mut final_digest = D::new(); - - // digest the schema first - final_digest.update(&self.schema_digest); - - // Then digest each field digest in order - self.fields_digest_buffer - .into_iter() - .for_each(|(_, digest)| Self::finalize_digest(&mut final_digest, digest)); - - final_digest.finalize().to_vec() - } - - #[expect( - clippy::big_endian_bytes, - reason = "Use for bit packing the null_bit_values" - )] - /// Finalize a single field digest into the final digest - /// Helpers to reduce code duplication - fn finalize_digest(final_digest: &mut D, digest: DigestBufferType) { - match digest { - DigestBufferType::NonNullable(data_digest) => { - final_digest.update(data_digest.finalize()); - } - DigestBufferType::Nullable(null_bit_digest, data_digest) => { - final_digest.update(null_bit_digest.len().to_le_bytes()); - for &word in null_bit_digest.as_raw_slice() { - final_digest.update(word.to_be_bytes()); - } - final_digest.update(data_digest.finalize()); - } - } - } - - /// Serialize the schema into a `BTreeMap` for field name and its digest - /// - /// # Panics - /// This function will panic if JSON serialization of the schema fails. 
- fn serialized_schema(schema: &Schema) -> String { - let fields_digest = schema - .fields - .iter() - .map(|field| (field.name(), (field.to_string(), field.data_type()))) - .collect::>(); - - serde_json::to_string(&fields_digest).expect("Failed to serialize field_digest to bytes") - } - - /// Serialize the schema into a `BTreeMap` for field name and its digest - pub fn hash_schema(schema: &Schema) -> Vec { - // Hash the entire thing to the digest - D::digest(Self::serialized_schema(schema)).to_vec() - } - - /// Hash a record batch and update the internal digests - fn update(&mut self, record_batch: &RecordBatch) { - // Verify schema matches - assert!( - *record_batch.schema() == self.schema, - "Record batch schema does not match ArrowDigester schema" - ); - - // Iterate through each field and update its digest - self.fields_digest_buffer - .iter_mut() - .for_each(|(field_name, digest)| { - // Determine if field name is nested - let field_name_hierarchy = field_name - .split(DELIMITER_FOR_NESTED_FIELD) - .collect::>(); - - if field_name_hierarchy.len() == 1 { - Self::array_digest_update( - record_batch - .schema() - .field_with_name(field_name) - .expect("Failed to get field with name") - .data_type(), - record_batch - .column_by_name(field_name) - .expect("Failed to get column by name"), - digest, - ); - } else { - Self::update_nested_field( - &field_name_hierarchy, - 0, - record_batch - .column_by_name( - field_name_hierarchy - .first() - .expect("Failed to get field name at idx 0, list is empty!"), - ) - .expect("Failed to get column by name") - .as_any() - .downcast_ref::() - .expect("Failed to downcast to StructArray"), - digest, - ); - } - }); - } - - /// Recursive function to update nested field digests (structs within structs) - fn update_nested_field( - field_name_hierarchy: &[&str], - current_level: usize, - array: &StructArray, - digest: &mut DigestBufferType, - ) { - let current_level_plus_one = current_level - .checked_add(1) - .expect("Field nesting level overflow"); - - if field_name_hierarchy - .len() - .checked_sub(1) - .expect("field_name_hierarchy underflow") - == current_level_plus_one - { - let array_data = array - .column_by_name( - field_name_hierarchy - .last() - .expect("Failed to get field name at idx 0, list is empty!"), - ) - .expect("Failed to get column by name"); - // Base case, it should be a non-struct field - Self::array_digest_update(array_data.data_type(), array_data.as_ref(), digest); - } else { - // Recursive case, it should be a struct field - let next_array = array - .column_by_name( - field_name_hierarchy - .get(current_level_plus_one) - .expect("Failed to get field name at current level"), - ) - .expect("Failed to get column by name") - .as_any() - .downcast_ref::() - .expect("Failed to downcast to StructArray"); - - Self::update_nested_field( - field_name_hierarchy, - current_level_plus_one, - next_array, - digest, - ); - } - } - - #[expect( - clippy::too_many_lines, - reason = "Comprehensive match on all data types" - )] - fn array_digest_update( - data_type: &DataType, - array: &dyn Array, - digest: &mut DigestBufferType, - ) { - match data_type { - DataType::Null => todo!(), - DataType::Boolean => { - // Bool Array is stored a bit differently, so we can't use the standard fixed buffer approach - let bool_array = array - .as_any() - .downcast_ref::() - .expect("Failed to downcast to BooleanArray"); - - match digest { - DigestBufferType::NonNullable(data_digest) => { - // We want to bit pack the boolean values into bytes for hashing - let mut 
bit_vec = BitVec::::with_capacity(bool_array.len()); - for i in 0..bool_array.len() { - bit_vec.push(bool_array.value(i)); - } - - data_digest.update(bit_vec.as_raw_slice()); - } - DigestBufferType::Nullable(null_bit_vec, data_digest) => { - // Handle null bits first - Self::handle_null_bits(bool_array, null_bit_vec); - - // Handle the data - let mut bit_vec = BitVec::::with_capacity(bool_array.len()); - for i in 0..bool_array.len() { - // We only want the valid bits, for null we will discard from the hash since that is already capture by null_bits - if bool_array.is_valid(i) { - bit_vec.push(bool_array.value(i)); - } - } - data_digest.update(bit_vec.as_raw_slice()); - } - } - } - DataType::Int8 | DataType::UInt8 => Self::hash_fixed_size_array(array, digest, 1), - DataType::Int16 | DataType::UInt16 | DataType::Float16 => { - Self::hash_fixed_size_array(array, digest, 2); - } - DataType::Int32 - | DataType::UInt32 - | DataType::Float32 - | DataType::Date32 - | DataType::Decimal32(_, _) => { - Self::hash_fixed_size_array(array, digest, 4); - } - DataType::Int64 - | DataType::UInt64 - | DataType::Float64 - | DataType::Date64 - | DataType::Decimal64(_, _) => { - Self::hash_fixed_size_array(array, digest, 8); - } - DataType::Timestamp(_, _) => todo!(), - DataType::Time32(_) => Self::hash_fixed_size_array(array, digest, 4), - DataType::Time64(_) => Self::hash_fixed_size_array(array, digest, 8), - DataType::Duration(_) => todo!(), - DataType::Interval(_) => todo!(), - DataType::Binary => Self::hash_binary_array( - array - .as_any() - .downcast_ref::() - .expect("Failed to downcast to BinaryArray"), - digest, - ), - DataType::FixedSizeBinary(element_size) => { - Self::hash_fixed_size_array(array, digest, *element_size); - } - DataType::LargeBinary => Self::hash_binary_array( - array - .as_any() - .downcast_ref::() - .expect("Failed to downcast to LargeBinaryArray"), - digest, - ), - DataType::BinaryView => todo!(), - DataType::Utf8 => Self::hash_string_array( - array - .as_any() - .downcast_ref::() - .expect("Failed to downcast to StringArray"), - digest, - ), - DataType::LargeUtf8 => Self::hash_string_array( - array - .as_any() - .downcast_ref::() - .expect("Failed to downcast to LargeStringArray"), - digest, - ), - DataType::Utf8View => todo!(), - DataType::List(field) => { - Self::hash_list_array( - array - .as_any() - .downcast_ref::() - .expect("Failed to downcast to ListArray"), - field.data_type(), - digest, - ); - } - DataType::ListView(_) => todo!(), - DataType::FixedSizeList(_, _) => todo!(), - DataType::LargeList(field) => { - Self::hash_list_array( - array - .as_any() - .downcast_ref::() - .expect("Failed to downcast to LargeListArray"), - field.data_type(), - digest, - ); - } - DataType::LargeListView(_) => todo!(), - DataType::Struct(_) => todo!(), - DataType::Union(_, _) => todo!(), - DataType::Dictionary(_, _) => todo!(), - DataType::Decimal128(_, _) => { - Self::hash_fixed_size_array(array, digest, 16); - } - DataType::Decimal256(_, _) => { - Self::hash_fixed_size_array(array, digest, 32); - } - DataType::Map(_, _) => todo!(), - DataType::RunEndEncoded(_, _) => todo!(), - } - } - - #[expect(clippy::cast_sign_loss, reason = "element_size is always positive")] - fn hash_fixed_size_array( - array: &dyn Array, - digest_buffer: &mut DigestBufferType, - element_size: i32, - ) { - let array_data = array.to_data(); - let element_size_usize = element_size as usize; - - // Get the slice with offset accounted for if there is any - let slice = array_data - .buffers() - .first() - 
.expect("Unable to get first buffer to determine offset") - .as_slice() - .get( - array_data - .offset() - .checked_mul(element_size_usize) - .expect("Offset multiplication overflow").., - ) - .expect("Failed to get buffer slice for FixedSizeBinaryArray"); - - match digest_buffer { - DigestBufferType::NonNullable(data_digest) => { - // No nulls, we can hash the entire buffer directly - data_digest.update(slice); - } - DigestBufferType::Nullable(null_bits, data_digest) => { - // Handle null bits first - Self::handle_null_bits(array, null_bits); - - match array_data.nulls() { - Some(null_buffer) => { - // There are nulls, so we need to incrementally hash each value - for i in 0..array_data.len() { - if null_buffer.is_valid(i) { - let data_pos = i - .checked_mul(element_size_usize) - .expect("Data position multiplication overflow"); - let end_pos = data_pos - .checked_add(element_size_usize) - .expect("End position addition overflow"); - - data_digest.update( - slice - .get(data_pos..end_pos) - .expect("Failed to get data_slice"), - ); - } - } - } - None => { - // No nulls, we can hash the entire buffer directly - data_digest.update(slice); - } - } - } - } - } - - fn hash_binary_array( - array: &GenericBinaryArray, - digest: &mut DigestBufferType, - ) { - match digest { - DigestBufferType::NonNullable(data_digest) => { - for i in 0..array.len() { - let value = array.value(i); - data_digest.update(value.len().to_le_bytes()); - data_digest.update(value); - } - } - DigestBufferType::Nullable(null_bit_vec, data_digest) => { - // Deal with the null bits first - if let Some(null_buf) = array.nulls() { - // We would need to iterate through the null buffer and push it into the null_bit_vec - for i in 0..array.len() { - null_bit_vec.push(null_buf.is_valid(i)); - } - - for i in 0..array.len() { - if null_buf.is_valid(i) { - let value = array.value(i); - data_digest.update(value.len().to_le_bytes()); - data_digest.update(value); - } else { - data_digest.update(NULL_BYTES); - } - } - } else { - // All valid, therefore we can extend the bit vector with all true values - let len = array.len().checked_sub(1).expect("Array length underflow"); - null_bit_vec.extend(repeat_n(true, len)); - - // Deal with the data - for i in 0..array.len() { - let value = array.value(i); - data_digest.update(value.len().to_le_bytes()); - data_digest.update(value); - } - } - } - } - } - - #[expect( - clippy::cast_possible_truncation, - reason = "String lengths from Arrow offsets are bounded" - )] - fn hash_string_array( - array: &GenericStringArray, - digest: &mut DigestBufferType, - ) { - match digest { - DigestBufferType::NonNullable(data_digest) => { - for i in 0..array.len() { - let value = array.value(i); - data_digest.update((value.len() as u64).to_le_bytes()); - data_digest.update(value.as_bytes()); - } - } - DigestBufferType::Nullable(null_bit_vec, data_digest) => { - // Deal with the null bits first - Self::handle_null_bits(array, null_bit_vec); - - match array.nulls() { - Some(null_buf) => { - for i in 0..array.len() { - if null_buf.is_valid(i) { - let value = array.value(i); - data_digest.update((value.len() as u32).to_le_bytes()); - data_digest.update(value.as_bytes()); - } else { - data_digest.update(NULL_BYTES); - } - } - } - None => { - for i in 0..array.len() { - let value = array.value(i); - data_digest.update((value.len() as u32).to_le_bytes()); - data_digest.update(value.as_bytes()); - } - } - } - } - } - } - - fn hash_list_array( - array: &GenericListArray, - field_data_type: &DataType, - digest: &mut 
DigestBufferType, - ) { - match digest { - DigestBufferType::NonNullable(_) => { - for i in 0..array.len() { - Self::array_digest_update(field_data_type, array.value(i).as_ref(), digest); - } - } - DigestBufferType::Nullable(bit_vec, _) => { - // Deal with null bits first - Self::handle_null_bits(array, bit_vec); - - match array.nulls() { - Some(null_buf) => { - for i in 0..array.len() { - if null_buf.is_valid(i) { - Self::array_digest_update( - field_data_type, - array.value(i).as_ref(), - digest, - ); - } - } - } - None => { - for i in 0..array.len() { - Self::array_digest_update( - field_data_type, - array.value(i).as_ref(), - digest, - ); - } - } - } - } - } - } - - /// Internal recursive function to extract field names from nested structs effectively flattening the schema - /// The format is `parent__child__grandchild__etc`... for nested fields and will be stored in `fields_digest_buffer` - fn extract_fields_name( - field: &Field, - parent_field_name: &str, - fields_digest_buffer: &mut BTreeMap>, - ) { - // Check if field is a nested type of struct - if let DataType::Struct(fields) = field.data_type() { - // We will add fields in alphabetical order - fields.into_iter().for_each(|field_inner| { - Self::extract_fields_name( - field_inner, - Self::construct_field_name_hierarchy(parent_field_name, field.name()).as_str(), - fields_digest_buffer, - ); - }); - } else { - // Base case, just add the the combine field name to the map - fields_digest_buffer.insert( - Self::construct_field_name_hierarchy(parent_field_name, field.name()), - if field.is_nullable() { - DigestBufferType::Nullable(BitVec::new(), D::new()) - } else { - DigestBufferType::NonNullable(D::new()) - }, - ); - } - } - - fn construct_field_name_hierarchy(parent_field_name: &str, field_name: &str) -> String { - if parent_field_name.is_empty() { - field_name.to_owned() - } else { - format!("{parent_field_name}{DELIMITER_FOR_NESTED_FIELD}{field_name}") - } - } - - fn handle_null_bits(array: &dyn Array, null_bit_vec: &mut BitVec) { - match array.nulls() { - Some(null_buf) => { - // We would need to iterate through the null buffer and push it into the null_bit_vec - for i in 0..array.len() { - null_bit_vec.push(null_buf.is_valid(i)); - } - } - None => { - // All valid, therefore we can extend the bit vector with all true values - null_bit_vec.extend(repeat_n( - true, - array.len().checked_sub(1).expect("Array length underflow"), - )); - } - } - } -} - -#[cfg(test)] -mod tests { - #![expect(clippy::unwrap_used, reason = "Okay in test")] - use std::sync::Arc; - - use arrow::{ - array::{ - ArrayRef, BinaryArray, BooleanArray, Date32Array, Date64Array, Decimal32Array, - Decimal64Array, Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, - Int8Array, LargeBinaryArray, LargeListArray, LargeStringArray, ListArray, RecordBatch, - StringArray, StructArray, Time32MillisecondArray, Time32SecondArray, - Time64MicrosecondArray, Time64NanosecondArray, UInt16Array, UInt32Array, UInt64Array, - UInt8Array, - }, - datatypes::Int32Type, - }; - use arrow_schema::{DataType, Field, Schema, TimeUnit}; - use hex::encode; - use indoc::indoc; - use pretty_assertions::assert_eq; - use sha2::Sha256; - - use crate::arrow_digester::ArrowDigester; - use arrow::array::Decimal128Array; - - #[expect(clippy::too_many_lines, reason = "Comprehensive schema test")] - #[test] - fn schema() { - let schema = Schema::new(vec![ - Field::new("bool", DataType::Boolean, true), - Field::new("int8", DataType::Int8, false), - Field::new("uint8", DataType::UInt8, 
false), - Field::new("int16", DataType::Int16, false), - Field::new("uint16", DataType::UInt16, false), - Field::new("int32", DataType::Int32, false), - Field::new("uint32", DataType::UInt32, false), - Field::new("int64", DataType::Int64, false), - Field::new("uint64", DataType::UInt64, false), - Field::new("float32", DataType::Float32, false), - Field::new("float64", DataType::Float64, false), - Field::new("date32", DataType::Date32, false), - Field::new("date64", DataType::Date64, false), - Field::new("time32_second", DataType::Time32(TimeUnit::Second), false), - Field::new( - "time32_millis", - DataType::Time32(TimeUnit::Millisecond), - false, - ), - Field::new( - "time64_micro", - DataType::Time64(TimeUnit::Microsecond), - false, - ), - Field::new("time64_nano", DataType::Time64(TimeUnit::Nanosecond), false), - Field::new("binary", DataType::Binary, true), - Field::new("large_binary", DataType::LargeBinary, true), - Field::new("utf8", DataType::Utf8, true), - Field::new("large_utf8", DataType::LargeUtf8, true), - Field::new( - "list", - DataType::List(Box::new(Field::new("item", DataType::Int32, true)).into()), - true, - ), - Field::new( - "large_list", - DataType::LargeList(Box::new(Field::new("item", DataType::Int32, true)).into()), - true, - ), - Field::new("decimal32", DataType::Decimal32(9, 2), true), - Field::new("decimal64", DataType::Decimal64(18, 3), true), - Field::new("decimal128", DataType::Decimal128(38, 5), true), - ]); - - // Serialize the schema and covert it over to pretty json for comparison - let compact_json: serde_json::Value = - serde_json::from_str(&ArrowDigester::::serialized_schema(&schema)).unwrap(); - let pretty_json = serde_json::to_string_pretty(&compact_json).unwrap(); - - assert_eq!( - pretty_json, - indoc! {r#" -{ - "binary": [ - "Field { \"binary\": nullable Binary }", - "Binary" - ], - "bool": [ - "Field { \"bool\": nullable Boolean }", - "Boolean" - ], - "date32": [ - "Field { \"date32\": Date32 }", - "Date32" - ], - "date64": [ - "Field { \"date64\": Date64 }", - "Date64" - ], - "decimal128": [ - "Field { \"decimal128\": nullable Decimal128(38, 5) }", - { - "Decimal128": [ - 38, - 5 - ] - } - ], - "decimal32": [ - "Field { \"decimal32\": nullable Decimal32(9, 2) }", - { - "Decimal32": [ - 9, - 2 - ] - } - ], - "decimal64": [ - "Field { \"decimal64\": nullable Decimal64(18, 3) }", - { - "Decimal64": [ - 18, - 3 - ] - } - ], - "float32": [ - "Field { \"float32\": Float32 }", - "Float32" - ], - "float64": [ - "Field { \"float64\": Float64 }", - "Float64" - ], - "int16": [ - "Field { \"int16\": Int16 }", - "Int16" - ], - "int32": [ - "Field { \"int32\": Int32 }", - "Int32" - ], - "int64": [ - "Field { \"int64\": Int64 }", - "Int64" - ], - "int8": [ - "Field { \"int8\": Int8 }", - "Int8" - ], - "large_binary": [ - "Field { \"large_binary\": nullable LargeBinary }", - "LargeBinary" - ], - "large_list": [ - "Field { \"large_list\": nullable LargeList(nullable Int32) }", - { - "LargeList": { - "data_type": "Int32", - "dict_id": 0, - "dict_is_ordered": false, - "metadata": {}, - "name": "item", - "nullable": true - } - } - ], - "large_utf8": [ - "Field { \"large_utf8\": nullable LargeUtf8 }", - "LargeUtf8" - ], - "list": [ - "Field { \"list\": nullable List(nullable Int32) }", - { - "List": { - "data_type": "Int32", - "dict_id": 0, - "dict_is_ordered": false, - "metadata": {}, - "name": "item", - "nullable": true - } - } - ], - "time32_millis": [ - "Field { \"time32_millis\": Time32(ms) }", - { - "Time32": "Millisecond" - } - ], - "time32_second": [ - "Field 
{ \"time32_second\": Time32(s) }", - { - "Time32": "Second" - } - ], - "time64_micro": [ - "Field { \"time64_micro\": Time64(µs) }", - { - "Time64": "Microsecond" - } - ], - "time64_nano": [ - "Field { \"time64_nano\": Time64(ns) }", - { - "Time64": "Nanosecond" - } - ], - "uint16": [ - "Field { \"uint16\": UInt16 }", - "UInt16" - ], - "uint32": [ - "Field { \"uint32\": UInt32 }", - "UInt32" - ], - "uint64": [ - "Field { \"uint64\": UInt64 }", - "UInt64" - ], - "uint8": [ - "Field { \"uint8\": UInt8 }", - "UInt8" - ], - "utf8": [ - "Field { \"utf8\": nullable Utf8 }", - "Utf8" - ] -}"#} - ); - - // Empty Table Hashing Check - - assert_eq!( - encode(ArrowDigester::::new(schema.clone()).finalize()), - "a42e35f6623d86b72350bf0bb74b97781946df45423f192c397d435c254bc71e" - ); - - let batch = RecordBatch::try_new( - Arc::new(schema), - vec![ - Arc::new(BooleanArray::from(vec![Some(true)])), - Arc::new(Int8Array::from(vec![1_i8])), - Arc::new(UInt8Array::from(vec![1_u8])), - Arc::new(Int16Array::from(vec![100_i16])), - Arc::new(UInt16Array::from(vec![100_u16])), - Arc::new(Int32Array::from(vec![1000_i32])), - Arc::new(UInt32Array::from(vec![1000_u32])), - Arc::new(Int64Array::from(vec![100_000_i64])), - Arc::new(UInt64Array::from(vec![100_000_u64])), - Arc::new(Float32Array::from(vec![1.5_f32])), - Arc::new(Float64Array::from(vec![1.5_f64])), - Arc::new(Date32Array::from(vec![18993_i32])), - Arc::new(Date64Array::from(vec![1_640_995_200_000_i64])), - Arc::new(Time32SecondArray::from(vec![3600_i32])), - Arc::new(Time32MillisecondArray::from(vec![3_600_000_i32])), - Arc::new(Time64MicrosecondArray::from(vec![3_600_000_000_i64])), - Arc::new(Time64NanosecondArray::from(vec![3_600_000_000_000_i64])), - Arc::new(BinaryArray::from(vec![Some(b"data1".as_ref())])), - Arc::new(LargeBinaryArray::from(vec![Some(b"large1".as_ref())])), - Arc::new(StringArray::from(vec![Some("text1")])), - Arc::new(LargeStringArray::from(vec![Some("large_text1")])), - Arc::new(ListArray::from_iter_primitive::(vec![ - Some(vec![Some(1), Some(2)]), - ])), - Arc::new(LargeListArray::from_iter_primitive::( - vec![Some(vec![Some(5), Some(6)])], - )), - Arc::new( - Decimal32Array::from_iter(vec![Some(12345)]) - .with_precision_and_scale(9, 2) - .unwrap(), - ), - Arc::new( - Decimal64Array::from_iter(vec![Some(123_456_789_012)]) - .with_precision_and_scale(18, 3) - .unwrap(), - ), - Arc::new( - Decimal128Array::from_iter(vec![Some( - 123_456_789_012_345_678_901_234_567_890_i128, - )]) - .with_precision_and_scale(38, 5) - .unwrap(), - ), - ], - ) - .unwrap(); - // Hash the record batch - assert_eq!( - encode(ArrowDigester::::hash_record_batch(&batch)), - "da0d7d3d76a47e88648e3a1160a5d2432647f0769e08b42315533163c36b3eb0" - ); - } - - #[test] - fn boolean_array_hashing() { - let bool_array = BooleanArray::from(vec![Some(true), None, Some(false), Some(true)]); - let hash = hex::encode(ArrowDigester::::hash_array(&bool_array)); - assert_eq!( - hash, - "f9abeb37d9395f359b48a379f0a8467c572b19ecc6cae9fa85e1bf627a52a8f3" - ); - } - - /// Test int32 array hashing which is really meant to test fixed size element array hashing - #[test] - fn int32_array_hashing() { - let int_array = Int32Array::from(vec![Some(42), None, Some(-7), Some(0)]); - let hash = hex::encode(ArrowDigester::::hash_array(&int_array)); - assert_eq!( - hash, - "27f2411e6839eb1e3fe706ac3f01e704c7b46357360fb2ddb8a08ec98e8ba4fa" - ); - } - - /// Test time array hashing - #[test] - fn time32_array_hashing() { - let time_array = Time32SecondArray::from(vec![Some(1000), None, 
Some(5000), Some(0)]); - let hash = hex::encode(ArrowDigester::::hash_array(&time_array)); - assert_eq!( - hash, - "9000b74aa80f685103a8cafc7e113aa8f33ccc0c94ea3713318d2cc2f3436baa" - ); - } - - #[test] - fn time64_array_hashing() { - let time_array = - Time64MicrosecondArray::from(vec![Some(1_000_000), None, Some(5_000_000), Some(0)]); - let hash = hex::encode(ArrowDigester::::hash_array(&time_array)); - assert_eq!( - hash, - "95f12143d789f364a3ed52f7300f8f91dc21fbe00c34aed798ca8fd54182dea3" - ); - } - - #[test] - fn time_array_different_units_produce_different_hashes() { - let time32_second = Time32SecondArray::from(vec![Some(1000), Some(2000)]); - let time32_millis = Time32MillisecondArray::from(vec![Some(1000), Some(2000)]); - - let hash_second = hex::encode(ArrowDigester::::hash_array(&time32_second)); - let hash_millis = hex::encode(ArrowDigester::::hash_array(&time32_millis)); - - assert_ne!(hash_second, hash_millis); - } - - /// Test binary array hashing - #[test] - fn binary_array_hashing() { - let binary_array = BinaryArray::from(vec![ - Some(b"hello".as_ref()), - None, - Some(b"world".as_ref()), - Some(b"".as_ref()), - ]); - let hash = hex::encode(ArrowDigester::::hash_array(&binary_array)); - assert_eq!( - hash, - "466801efd880d2acecd6c78915b5c2a51476870f9116912834d79de43a000071" - ); - - // Test large binary array with same data to ensure consistency - let large_binary_array = LargeBinaryArray::from(vec![ - Some(b"hello".as_ref()), - None, - Some(b"world".as_ref()), - Some(b"".as_ref()), - ]); - - assert_ne!( - hex::encode(ArrowDigester::::hash_array(&large_binary_array)), - hash - ); - } - - // Test binary array collision vulnerability - different partitions should produce different hashes - #[test] - fn binary_array_length_prefix_prevents_collisions() { - // Array 1: [[0x01, 0x02], [0x03]] - let array1 = BinaryArray::from(vec![Some(&[0x01_u8, 0x02_u8][..]), Some(&[0x03_u8][..])]); - - // Array 2: [[0x01], [0x02, 0x03]] - let array2 = BinaryArray::from(vec![Some(&[0x01_u8][..]), Some(&[0x02_u8, 0x03_u8][..])]); - - let hash1 = hex::encode(ArrowDigester::::hash_array(&array1)); - let hash2 = hex::encode(ArrowDigester::::hash_array(&array2)); - - // Without length prefix, these would collide (both hash to 0x01 0x02 0x03) - // With length prefix, they should produce different hashes - assert_ne!( - hash1, hash2, - "Binary arrays with different partitions should produce different hashes" - ); + self.digester.finalize() } - // Test string array collision vulnerability - different partitions should produce different hashes - #[test] - fn string_array_length_prefix_prevents_collisions() { - // Array 1: ["ab", "c"] - let array1 = StringArray::from(vec![Some("ab"), Some("c")]); - - // Array 2: ["a", "bc"] - let array2 = StringArray::from(vec![Some("a"), Some("bc")]); - - let hash1 = hex::encode(ArrowDigester::::hash_array(&array1)); - let hash2 = hex::encode(ArrowDigester::::hash_array(&array2)); - - // Without length prefix, these would collide (both hash to "abc") - // With length prefix, they should produce different hashes - assert_ne!( - hash1, hash2, - "String arrays with different partitions should produce different hashes" - ); - } - - // Test String hashing - #[test] - fn string_array_hashing() { - let string_array = StringArray::from(vec![Some("hello"), None, Some("world"), Some("")]); - let hash = hex::encode(ArrowDigester::::hash_array(&string_array)); - assert_eq!( - hash, - "14a2d2eaf535b6e78fbf1d58ae93accce424eafd20fa449eff8acefc47903d3d" - ); - - // Test large string 
array with same data to ensure consistency - let large_string_array = - LargeStringArray::from(vec![Some("hello"), None, Some("world"), Some("")]); - - assert_ne!( - hex::encode(ArrowDigester::::hash_array(&large_string_array)), - hash - ); - } - - // List array hashing test - #[test] - fn list_array_hashing() { - let list_array = ListArray::from_iter_primitive::(vec![ - Some(vec![Some(1), Some(2), Some(3)]), - None, - Some(vec![Some(4), Some(5)]), - Some(vec![Some(6)]), - ]); - - let hash = hex::encode(ArrowDigester::::hash_array(&list_array)); - assert_eq!( - hash, - "1a8d06635dec40079b979ce439f662c1fb6456bb7e02bbf7d8e8048c61498faf" - ); - } - - // Test all types of decimal hashing - #[test] - fn decimal_array_hashing() { - // Test Decimal32 (precision 1-9) - let decimal32_array = - Decimal128Array::from_iter(vec![Some(123), None, Some(-456), Some(0)]) - .with_precision_and_scale(9, 2) - .unwrap(); - - assert_eq!( - encode(ArrowDigester::::hash_array(&decimal32_array)), - "ef29250615f9d6ab34672c3b11dfa2dcda6e8e6164bc55899c13887f17705f5d" - ); - - // Test Decimal64 (precision 10-18) - let decimal64_array = Decimal128Array::from_iter(vec![ - Some(1_234_567_890_123), - None, - Some(-9_876_543_210), - Some(0), - ]) - .with_precision_and_scale(15, 3) - .unwrap(); - assert_eq!( - encode(ArrowDigester::::hash_array(&decimal64_array)), - "efa4ed72641051233889c07775366cbf2e56eb4b0fcfd46653f5741e81786f08" - ); - - // Test Decimal128 (precision 19-38) - let decimal128_array = Decimal128Array::from_iter(vec![ - Some(123_456_789_012_345_678_901_234_567), - None, - Some(-987_654_321_098_765_432_109_876_543), - Some(0), - ]) - .with_precision_and_scale(38, 5) - .unwrap(); - assert_eq!( - hex::encode(ArrowDigester::::hash_array(&decimal128_array)), - "55cc4d81a048dbca001ca8581673a5a6c93efd870d358df211a545c2af9b658d" - ); - } - - #[test] - fn commutative_tables() { - let uids = Arc::new(Int32Array::from(vec![Some(1), Some(2), Some(3), Some(4)])) as ArrayRef; - let fake_data = Arc::new(BooleanArray::from(vec![ - Some(true), - Some(false), - None, - Some(true), - ])) as ArrayRef; - - // Create two record batches with same data but different order - let batch1 = RecordBatch::try_new( - Arc::new(Schema::new(vec![ - Field::new("uids", DataType::Int32, false), - Field::new("flags", DataType::Boolean, true), - ])), - vec![Arc::clone(&uids), Arc::clone(&fake_data)], - ); - - let batch2 = RecordBatch::try_new( - Arc::new(Schema::new(vec![ - Field::new("flags", DataType::Boolean, true), - Field::new("uids", DataType::Int32, false), - ])), - vec![fake_data, uids], - ); - - // Hash both record batches - assert_eq!( - encode(ArrowDigester::::hash_record_batch( - batch1.as_ref().unwrap() - )), - encode(ArrowDigester::::hash_record_batch( - batch2.as_ref().unwrap() - )) - ); - } - - #[test] - fn record_batch_hashing() { - let schema = Arc::new(Schema::new(vec![ - Field::new("uids", DataType::Int32, false), - Field::new("flags", DataType::Boolean, true), - ])); - - // Create two record batches with different data to simulate loading at different times - let uids = Arc::new(Int32Array::from(vec![Some(1), Some(2), Some(3), Some(4)])) as ArrayRef; - let fake_data = Arc::new(BooleanArray::from(vec![ - Some(true), - Some(false), - None, - Some(true), - ])); - - let batch1 = RecordBatch::try_new(Arc::clone(&schema), vec![uids, fake_data]).unwrap(); - - let uids2 = - Arc::new(Int32Array::from(vec![Some(5), Some(6), Some(7), Some(8)])) as ArrayRef; - let fake_data2 = Arc::new(BooleanArray::from(vec![ - Some(false), - 
Some(true), - Some(true), - None, - ])); - - let batch2 = RecordBatch::try_new(Arc::clone(&schema), vec![uids2, fake_data2]).unwrap(); - // Hash both record batches - let mut digester = ArrowDigester::::new((*schema).clone()); - digester.update(&batch1); - digester.update(&batch2); - assert_eq!( - encode(digester.finalize()), - "37954b3edd169c7a9e65604c191caf6a307940357305d182a5d2168047e9cc51" - ); + /// Function to hash an Array in one go + pub fn hash_array(array: &dyn Array) -> Vec { + ArrowDigesterCore::::hash_array(array) } - #[test] - fn nested_fields() { - // Test nested struct field name extraction - let schema = Schema::new(vec![ - Field::new("id", DataType::Int32, false), - Field::new( - "nested", - DataType::Struct( - vec![ - Field::new("name", DataType::Utf8, true), - Field::new( - "deep", - DataType::Struct( - vec![Field::new("value", DataType::Int64, false)].into(), - ), - false, - ), - ] - .into(), - ), - false, - ), - ]); - - let mut digester = ArrowDigester::::new(schema.clone()); - let field_names: Vec<&String> = digester.fields_digest_buffer.keys().collect(); - - assert_eq!(field_names.len(), 3); - assert!(field_names.contains(&&"id".to_owned())); - assert!(field_names.contains(&&"nested/name".to_owned())); - assert!(field_names.contains(&&"nested/deep/value".to_owned())); - - // Test the nested field update by creating record_batch and using the update method - let id_array = Arc::new(Int32Array::from(vec![Some(1), Some(2)])) as ArrayRef; - let name_array = Arc::new(StringArray::from(vec![Some("Alice"), Some("Bob")])) as ArrayRef; - let value_array = Arc::new(Int64Array::from(vec![Some(100), Some(200)])) as ArrayRef; - - let schema_ref = Arc::new(schema); - - let nested_struct = StructArray::from(vec![ - ( - Arc::new(Field::new("name", DataType::Utf8, true)), - name_array, - ), - ( - Arc::new(Field::new( - "deep", - DataType::Struct(vec![Field::new("value", DataType::Int64, false)].into()), - false, - )), - Arc::new(StructArray::from(vec![( - Arc::new(Field::new("value", DataType::Int64, false)), - value_array, - )])) as ArrayRef, - ), - ]); - - let record_batch = RecordBatch::try_new( - Arc::clone(&schema_ref), - vec![id_array, Arc::new(nested_struct)], - ) - .unwrap(); - - digester.update(&record_batch); - - // Check the digest - assert_eq!( - encode(digester.finalize()), - "b7faf50f1328ec80b575e018c121eed9d0e7e84ad72645499ebc8667e64199a7" - ); + /// Function to hash a complete `RecordBatch` in one go + pub fn hash_record_batch(record_batch: &RecordBatch) -> Vec { + ArrowDigesterCore::::hash_record_batch(record_batch) } } diff --git a/src/arrow_digester_core.rs b/src/arrow_digester_core.rs new file mode 100644 index 0000000..3af7456 --- /dev/null +++ b/src/arrow_digester_core.rs @@ -0,0 +1,925 @@ +#![expect( + clippy::expect_used, + clippy::todo, + reason = "First iteration of code, will add proper error handling later. 
Allow for unsupported data types for now" +)] +use std::{collections::BTreeMap, iter::repeat_n}; + +use arrow::{ + array::{ + Array, BinaryArray, BooleanArray, GenericBinaryArray, GenericListArray, GenericStringArray, + LargeBinaryArray, LargeListArray, LargeStringArray, ListArray, OffsetSizeTrait, + RecordBatch, StringArray, StructArray, + }, + datatypes::{DataType, Schema}, +}; +use arrow_schema::Field; +use bitvec::prelude::*; +use digest::Digest; + +const NULL_BYTES: &[u8] = b"NULL"; + +const DELIMITER_FOR_NESTED_FIELD: &str = "/"; + +enum DigestBufferType { + NonNullable(D), + Nullable(BitVec, D), // Where first digest is for the bull bits, while the second is for the actual data +} + +pub struct ArrowDigesterCore { + schema: Schema, + schema_digest: Vec, + fields_digest_buffer: BTreeMap>, +} + +impl ArrowDigesterCore { + /// Create a new instance of `ArrowDigesterCore` with the schema which will be enforce through each update + pub fn new(schema: Schema) -> Self { + // Hash the schema first + let schema_digest = Self::hash_schema(&schema); + + // Flatten all nested fields into a single map, this allows us to hash each field individually and efficiently + let mut fields_digest_buffer = BTreeMap::new(); + schema.fields.into_iter().for_each(|field| { + Self::extract_fields_name(field, "", &mut fields_digest_buffer); + }); + + // Store it in the new struct for now + Self { + schema, + schema_digest, + fields_digest_buffer, + } + } + + /// Hash a record batch and update the internal digests + pub fn update(&mut self, record_batch: &RecordBatch) { + // Verify schema matches + assert!( + *record_batch.schema() == self.schema, + "Record batch schema does not match ArrowDigester schema" + ); + + // Iterate through each field and update its digest + self.fields_digest_buffer + .iter_mut() + .for_each(|(field_name, digest)| { + // Determine if field name is nested + let field_name_hierarchy = field_name + .split(DELIMITER_FOR_NESTED_FIELD) + .collect::>(); + + if field_name_hierarchy.len() == 1 { + Self::array_digest_update( + record_batch + .schema() + .field_with_name(field_name) + .expect("Failed to get field with name") + .data_type(), + record_batch + .column_by_name(field_name) + .expect("Failed to get column by name"), + digest, + ); + } else { + Self::update_nested_field( + &field_name_hierarchy, + 0, + record_batch + .column_by_name( + field_name_hierarchy + .first() + .expect("Failed to get field name at idx 0, list is empty!"), + ) + .expect("Failed to get column by name") + .as_any() + .downcast_ref::() + .expect("Failed to downcast to StructArray"), + digest, + ); + } + }); + } + + /// Hash an array directly without needing to create an `ArrowDigester` instance on the user side + /// For hash array, we don't have a schema to hash, however we do have field data type. + /// So similar to schema, we will hash based on datatype to encode the metadata information into the digest + /// + /// # Panics + /// + /// This function will panic if JSON serialization of the data type fails. 
+ /// + pub fn hash_array(array: &dyn Array) -> Vec { + let mut final_digest = D::new(); + + let data_type_serialized = serde_json::to_string(&array.data_type()) + .expect("Failed to serialize data type to string"); + + // Update the digest buffer with the array metadata and field data + final_digest.update(data_type_serialized); + + // Now we update it with the actual array data + let mut digest_buffer = if array.is_nullable() { + DigestBufferType::Nullable(BitVec::new(), D::new()) + } else { + DigestBufferType::NonNullable(D::new()) + }; + Self::array_digest_update(array.data_type(), array, &mut digest_buffer); + Self::finalize_digest(&mut final_digest, digest_buffer); + + // Finalize and return the digest + final_digest.finalize().to_vec() + } + + /// Hash record batch directly without needing to create an `ArrowDigester` instance on the user side + pub fn hash_record_batch(record_batch: &RecordBatch) -> Vec { + let mut digester = Self::new(record_batch.schema().as_ref().clone()); + digester.update(record_batch); + digester.finalize() + } + + /// This will consume the `ArrowDigester` and produce the final combined digest where the schema + /// digest is fed in first, followed by each field digest in alphabetical order of field names + pub fn finalize(self) -> Vec { + // Finalize all the sub digest and combine them into a single digest + let mut final_digest = D::new(); + + // digest the schema first + final_digest.update(&self.schema_digest); + + // Then digest each field digest in order + self.fields_digest_buffer + .into_iter() + .for_each(|(_, digest)| Self::finalize_digest(&mut final_digest, digest)); + + final_digest.finalize().to_vec() + } + + #[expect( + clippy::big_endian_bytes, + reason = "Use for bit packing the null_bit_values" + )] + /// Finalize a single field digest into the final digest + /// Helpers to reduce code duplication + fn finalize_digest(final_digest: &mut D, digest: DigestBufferType) { + match digest { + DigestBufferType::NonNullable(data_digest) => { + final_digest.update(data_digest.finalize()); + } + DigestBufferType::Nullable(null_bit_digest, data_digest) => { + final_digest.update(null_bit_digest.len().to_le_bytes()); + for &word in null_bit_digest.as_raw_slice() { + final_digest.update(word.to_be_bytes()); + } + final_digest.update(data_digest.finalize()); + } + } + } + + /// Serialize the schema into a `BTreeMap` for field name and its digest + /// + /// # Panics + /// This function will panic if JSON serialization of the schema fails. 
+ fn serialized_schema(schema: &Schema) -> String { + let fields_digest = schema + .fields + .iter() + .map(|field| (field.name(), (field.to_string(), field.data_type()))) + .collect::>(); + + serde_json::to_string(&fields_digest).expect("Failed to serialize field_digest to bytes") + } + + /// Serialize the schema into a `BTreeMap` for field name and its digest + pub fn hash_schema(schema: &Schema) -> Vec { + // Hash the entire thing to the digest + D::digest(Self::serialized_schema(schema)).to_vec() + } + + /// Recursive function to update nested field digests (structs within structs) + fn update_nested_field( + field_name_hierarchy: &[&str], + current_level: usize, + array: &StructArray, + digest: &mut DigestBufferType, + ) { + let current_level_plus_one = current_level + .checked_add(1) + .expect("Field nesting level overflow"); + + if field_name_hierarchy + .len() + .checked_sub(1) + .expect("field_name_hierarchy underflow") + == current_level_plus_one + { + let array_data = array + .column_by_name( + field_name_hierarchy + .last() + .expect("Failed to get field name at idx 0, list is empty!"), + ) + .expect("Failed to get column by name"); + // Base case, it should be a non-struct field + Self::array_digest_update(array_data.data_type(), array_data.as_ref(), digest); + } else { + // Recursive case, it should be a struct field + let next_array = array + .column_by_name( + field_name_hierarchy + .get(current_level_plus_one) + .expect("Failed to get field name at current level"), + ) + .expect("Failed to get column by name") + .as_any() + .downcast_ref::() + .expect("Failed to downcast to StructArray"); + + Self::update_nested_field( + field_name_hierarchy, + current_level_plus_one, + next_array, + digest, + ); + } + } + + #[expect( + clippy::too_many_lines, + reason = "Comprehensive match on all data types" + )] + fn array_digest_update( + data_type: &DataType, + array: &dyn Array, + digest: &mut DigestBufferType, + ) { + match data_type { + DataType::Null => todo!(), + DataType::Boolean => { + // Bool Array is stored a bit differently, so we can't use the standard fixed buffer approach + let bool_array = array + .as_any() + .downcast_ref::() + .expect("Failed to downcast to BooleanArray"); + + match digest { + DigestBufferType::NonNullable(data_digest) => { + // We want to bit pack the boolean values into bytes for hashing + let mut bit_vec = BitVec::::with_capacity(bool_array.len()); + for i in 0..bool_array.len() { + bit_vec.push(bool_array.value(i)); + } + + data_digest.update(bit_vec.as_raw_slice()); + } + DigestBufferType::Nullable(null_bit_vec, data_digest) => { + // Handle null bits first + Self::handle_null_bits(bool_array, null_bit_vec); + + // Handle the data + let mut bit_vec = BitVec::::with_capacity(bool_array.len()); + for i in 0..bool_array.len() { + // We only want the valid bits, for null we will discard from the hash since that is already capture by null_bits + if bool_array.is_valid(i) { + bit_vec.push(bool_array.value(i)); + } + } + data_digest.update(bit_vec.as_raw_slice()); + } + } + } + DataType::Int8 | DataType::UInt8 => Self::hash_fixed_size_array(array, digest, 1), + DataType::Int16 | DataType::UInt16 | DataType::Float16 => { + Self::hash_fixed_size_array(array, digest, 2); + } + DataType::Int32 + | DataType::UInt32 + | DataType::Float32 + | DataType::Date32 + | DataType::Decimal32(_, _) => { + Self::hash_fixed_size_array(array, digest, 4); + } + DataType::Int64 + | DataType::UInt64 + | DataType::Float64 + | DataType::Date64 + | DataType::Decimal64(_, _) => { 
+ Self::hash_fixed_size_array(array, digest, 8); + } + DataType::Timestamp(_, _) => todo!(), + DataType::Time32(_) => Self::hash_fixed_size_array(array, digest, 4), + DataType::Time64(_) => Self::hash_fixed_size_array(array, digest, 8), + DataType::Duration(_) => todo!(), + DataType::Interval(_) => todo!(), + DataType::Binary => Self::hash_binary_array( + array + .as_any() + .downcast_ref::() + .expect("Failed to downcast to BinaryArray"), + digest, + ), + DataType::FixedSizeBinary(element_size) => { + Self::hash_fixed_size_array(array, digest, *element_size); + } + DataType::LargeBinary => Self::hash_binary_array( + array + .as_any() + .downcast_ref::() + .expect("Failed to downcast to LargeBinaryArray"), + digest, + ), + DataType::BinaryView => todo!(), + DataType::Utf8 => Self::hash_string_array( + array + .as_any() + .downcast_ref::() + .expect("Failed to downcast to StringArray"), + digest, + ), + DataType::LargeUtf8 => Self::hash_string_array( + array + .as_any() + .downcast_ref::() + .expect("Failed to downcast to LargeStringArray"), + digest, + ), + DataType::Utf8View => todo!(), + DataType::List(field) => { + Self::hash_list_array( + array + .as_any() + .downcast_ref::() + .expect("Failed to downcast to ListArray"), + field.data_type(), + digest, + ); + } + DataType::ListView(_) => todo!(), + DataType::FixedSizeList(_, _) => todo!(), + DataType::LargeList(field) => { + Self::hash_list_array( + array + .as_any() + .downcast_ref::() + .expect("Failed to downcast to LargeListArray"), + field.data_type(), + digest, + ); + } + DataType::LargeListView(_) => todo!(), + DataType::Struct(_) => todo!(), + DataType::Union(_, _) => todo!(), + DataType::Dictionary(_, _) => todo!(), + DataType::Decimal128(_, _) => { + Self::hash_fixed_size_array(array, digest, 16); + } + DataType::Decimal256(_, _) => { + Self::hash_fixed_size_array(array, digest, 32); + } + DataType::Map(_, _) => todo!(), + DataType::RunEndEncoded(_, _) => todo!(), + } + } + + #[expect(clippy::cast_sign_loss, reason = "element_size is always positive")] + fn hash_fixed_size_array( + array: &dyn Array, + digest_buffer: &mut DigestBufferType, + element_size: i32, + ) { + let array_data = array.to_data(); + let element_size_usize = element_size as usize; + + // Get the slice with offset accounted for if there is any + let slice = array_data + .buffers() + .first() + .expect("Unable to get first buffer to determine offset") + .as_slice() + .get( + array_data + .offset() + .checked_mul(element_size_usize) + .expect("Offset multiplication overflow").., + ) + .expect("Failed to get buffer slice for FixedSizeBinaryArray"); + + match digest_buffer { + DigestBufferType::NonNullable(data_digest) => { + // No nulls, we can hash the entire buffer directly + data_digest.update(slice); + } + DigestBufferType::Nullable(null_bits, data_digest) => { + // Handle null bits first + Self::handle_null_bits(array, null_bits); + + match array_data.nulls() { + Some(null_buffer) => { + // There are nulls, so we need to incrementally hash each value + for i in 0..array_data.len() { + if null_buffer.is_valid(i) { + let data_pos = i + .checked_mul(element_size_usize) + .expect("Data position multiplication overflow"); + let end_pos = data_pos + .checked_add(element_size_usize) + .expect("End position addition overflow"); + + data_digest.update( + slice + .get(data_pos..end_pos) + .expect("Failed to get data_slice"), + ); + } + } + } + None => { + // No nulls, we can hash the entire buffer directly + data_digest.update(slice); + } + } + } + } + } + + fn 
hash_binary_array( + array: &GenericBinaryArray, + digest: &mut DigestBufferType, + ) { + match digest { + DigestBufferType::NonNullable(data_digest) => { + for i in 0..array.len() { + let value = array.value(i); + data_digest.update(value.len().to_le_bytes()); + data_digest.update(value); + } + } + DigestBufferType::Nullable(null_bit_vec, data_digest) => { + // Deal with the null bits first + if let Some(null_buf) = array.nulls() { + // We would need to iterate through the null buffer and push it into the null_bit_vec + for i in 0..array.len() { + null_bit_vec.push(null_buf.is_valid(i)); + } + + for i in 0..array.len() { + if null_buf.is_valid(i) { + let value = array.value(i); + data_digest.update(value.len().to_le_bytes()); + data_digest.update(value); + } else { + data_digest.update(NULL_BYTES); + } + } + } else { + // All valid, therefore we can extend the bit vector with all true values + let len = array.len().checked_sub(1).expect("Array length underflow"); + null_bit_vec.extend(repeat_n(true, len)); + + // Deal with the data + for i in 0..array.len() { + let value = array.value(i); + data_digest.update(value.len().to_le_bytes()); + data_digest.update(value); + } + } + } + } + } + + #[expect( + clippy::cast_possible_truncation, + reason = "String lengths from Arrow offsets are bounded" + )] + fn hash_string_array( + array: &GenericStringArray, + digest: &mut DigestBufferType, + ) { + match digest { + DigestBufferType::NonNullable(data_digest) => { + for i in 0..array.len() { + let value = array.value(i); + data_digest.update((value.len() as u64).to_le_bytes()); + data_digest.update(value.as_bytes()); + } + } + DigestBufferType::Nullable(null_bit_vec, data_digest) => { + // Deal with the null bits first + Self::handle_null_bits(array, null_bit_vec); + + match array.nulls() { + Some(null_buf) => { + for i in 0..array.len() { + if null_buf.is_valid(i) { + let value = array.value(i); + data_digest.update((value.len() as u32).to_le_bytes()); + data_digest.update(value.as_bytes()); + } else { + data_digest.update(NULL_BYTES); + } + } + } + None => { + for i in 0..array.len() { + let value = array.value(i); + data_digest.update((value.len() as u32).to_le_bytes()); + data_digest.update(value.as_bytes()); + } + } + } + } + } + } + + fn hash_list_array( + array: &GenericListArray, + field_data_type: &DataType, + digest: &mut DigestBufferType, + ) { + match digest { + DigestBufferType::NonNullable(_) => { + for i in 0..array.len() { + Self::array_digest_update(field_data_type, array.value(i).as_ref(), digest); + } + } + DigestBufferType::Nullable(bit_vec, _) => { + // Deal with null bits first + Self::handle_null_bits(array, bit_vec); + + match array.nulls() { + Some(null_buf) => { + for i in 0..array.len() { + if null_buf.is_valid(i) { + Self::array_digest_update( + field_data_type, + array.value(i).as_ref(), + digest, + ); + } + } + } + None => { + for i in 0..array.len() { + Self::array_digest_update( + field_data_type, + array.value(i).as_ref(), + digest, + ); + } + } + } + } + } + } + + /// Internal recursive function to extract field names from nested structs effectively flattening the schema + /// The format is `parent__child__grandchild__etc`... 
for nested fields and will be stored in `fields_digest_buffer` + fn extract_fields_name( + field: &Field, + parent_field_name: &str, + fields_digest_buffer: &mut BTreeMap>, + ) { + // Check if field is a nested type of struct + if let DataType::Struct(fields) = field.data_type() { + // We will add fields in alphabetical order + fields.into_iter().for_each(|field_inner| { + Self::extract_fields_name( + field_inner, + Self::construct_field_name_hierarchy(parent_field_name, field.name()).as_str(), + fields_digest_buffer, + ); + }); + } else { + // Base case, just add the the combine field name to the map + fields_digest_buffer.insert( + Self::construct_field_name_hierarchy(parent_field_name, field.name()), + if field.is_nullable() { + DigestBufferType::Nullable(BitVec::new(), D::new()) + } else { + DigestBufferType::NonNullable(D::new()) + }, + ); + } + } + + fn construct_field_name_hierarchy(parent_field_name: &str, field_name: &str) -> String { + if parent_field_name.is_empty() { + field_name.to_owned() + } else { + format!("{parent_field_name}{DELIMITER_FOR_NESTED_FIELD}{field_name}") + } + } + + fn handle_null_bits(array: &dyn Array, null_bit_vec: &mut BitVec) { + match array.nulls() { + Some(null_buf) => { + // We would need to iterate through the null buffer and push it into the null_bit_vec + for i in 0..array.len() { + null_bit_vec.push(null_buf.is_valid(i)); + } + } + None => { + // All valid, therefore we can extend the bit vector with all true values + null_bit_vec.extend(repeat_n( + true, + array.len().checked_sub(1).expect("Array length underflow"), + )); + } + } + } +} + +#[cfg(test)] +mod tests { + #![expect(clippy::unwrap_used, reason = "Okay in test")] + + use std::sync::Arc; + + use arrow::array::{ArrayRef, Int32Array, Int64Array, RecordBatch, StringArray, StructArray}; + use arrow_schema::{DataType, Field, Schema, TimeUnit}; + + use hex::encode; + use indoc::indoc; + use pretty_assertions::assert_eq; + use sha2::Sha256; + + use crate::arrow_digester_core::ArrowDigesterCore; + + #[expect(clippy::too_many_lines, reason = "Comprehensive schema test")] + #[test] + fn schema() { + let schema = Schema::new(vec![ + Field::new("bool", DataType::Boolean, true), + Field::new("int8", DataType::Int8, false), + Field::new("uint8", DataType::UInt8, false), + Field::new("int16", DataType::Int16, false), + Field::new("uint16", DataType::UInt16, false), + Field::new("int32", DataType::Int32, false), + Field::new("uint32", DataType::UInt32, false), + Field::new("int64", DataType::Int64, false), + Field::new("uint64", DataType::UInt64, false), + Field::new("float32", DataType::Float32, false), + Field::new("float64", DataType::Float64, false), + Field::new("date32", DataType::Date32, false), + Field::new("date64", DataType::Date64, false), + Field::new("time32_second", DataType::Time32(TimeUnit::Second), false), + Field::new( + "time32_millis", + DataType::Time32(TimeUnit::Millisecond), + false, + ), + Field::new( + "time64_micro", + DataType::Time64(TimeUnit::Microsecond), + false, + ), + Field::new("time64_nano", DataType::Time64(TimeUnit::Nanosecond), false), + Field::new("binary", DataType::Binary, true), + Field::new("large_binary", DataType::LargeBinary, true), + Field::new("utf8", DataType::Utf8, true), + Field::new("large_utf8", DataType::LargeUtf8, true), + Field::new( + "list", + DataType::List(Box::new(Field::new("item", DataType::Int32, true)).into()), + true, + ), + Field::new( + "large_list", + DataType::LargeList(Box::new(Field::new("item", DataType::Int32, true)).into()), + 
true, + ), + Field::new("decimal32", DataType::Decimal32(9, 2), true), + Field::new("decimal64", DataType::Decimal64(18, 3), true), + Field::new("decimal128", DataType::Decimal128(38, 5), true), + ]); + + // Serialize the schema and covert it over to pretty json for comparison + let compact_json: serde_json::Value = + serde_json::from_str(&ArrowDigesterCore::::serialized_schema(&schema)).unwrap(); + let pretty_json = serde_json::to_string_pretty(&compact_json).unwrap(); + + assert_eq!( + pretty_json, + indoc! {r#" +{ + "binary": [ + "Field { \"binary\": nullable Binary }", + "Binary" + ], + "bool": [ + "Field { \"bool\": nullable Boolean }", + "Boolean" + ], + "date32": [ + "Field { \"date32\": Date32 }", + "Date32" + ], + "date64": [ + "Field { \"date64\": Date64 }", + "Date64" + ], + "decimal128": [ + "Field { \"decimal128\": nullable Decimal128(38, 5) }", + { + "Decimal128": [ + 38, + 5 + ] + } + ], + "decimal32": [ + "Field { \"decimal32\": nullable Decimal32(9, 2) }", + { + "Decimal32": [ + 9, + 2 + ] + } + ], + "decimal64": [ + "Field { \"decimal64\": nullable Decimal64(18, 3) }", + { + "Decimal64": [ + 18, + 3 + ] + } + ], + "float32": [ + "Field { \"float32\": Float32 }", + "Float32" + ], + "float64": [ + "Field { \"float64\": Float64 }", + "Float64" + ], + "int16": [ + "Field { \"int16\": Int16 }", + "Int16" + ], + "int32": [ + "Field { \"int32\": Int32 }", + "Int32" + ], + "int64": [ + "Field { \"int64\": Int64 }", + "Int64" + ], + "int8": [ + "Field { \"int8\": Int8 }", + "Int8" + ], + "large_binary": [ + "Field { \"large_binary\": nullable LargeBinary }", + "LargeBinary" + ], + "large_list": [ + "Field { \"large_list\": nullable LargeList(nullable Int32) }", + { + "LargeList": { + "data_type": "Int32", + "dict_id": 0, + "dict_is_ordered": false, + "metadata": {}, + "name": "item", + "nullable": true + } + } + ], + "large_utf8": [ + "Field { \"large_utf8\": nullable LargeUtf8 }", + "LargeUtf8" + ], + "list": [ + "Field { \"list\": nullable List(nullable Int32) }", + { + "List": { + "data_type": "Int32", + "dict_id": 0, + "dict_is_ordered": false, + "metadata": {}, + "name": "item", + "nullable": true + } + } + ], + "time32_millis": [ + "Field { \"time32_millis\": Time32(ms) }", + { + "Time32": "Millisecond" + } + ], + "time32_second": [ + "Field { \"time32_second\": Time32(s) }", + { + "Time32": "Second" + } + ], + "time64_micro": [ + "Field { \"time64_micro\": Time64(µs) }", + { + "Time64": "Microsecond" + } + ], + "time64_nano": [ + "Field { \"time64_nano\": Time64(ns) }", + { + "Time64": "Nanosecond" + } + ], + "uint16": [ + "Field { \"uint16\": UInt16 }", + "UInt16" + ], + "uint32": [ + "Field { \"uint32\": UInt32 }", + "UInt32" + ], + "uint64": [ + "Field { \"uint64\": UInt64 }", + "UInt64" + ], + "uint8": [ + "Field { \"uint8\": UInt8 }", + "UInt8" + ], + "utf8": [ + "Field { \"utf8\": nullable Utf8 }", + "Utf8" + ] +}"#} + ); + } + + #[test] + fn nested_fields() { + // Test nested struct field name extraction + let schema = Schema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new( + "nested", + DataType::Struct( + vec![ + Field::new("name", DataType::Utf8, true), + Field::new( + "deep", + DataType::Struct( + vec![Field::new("value", DataType::Int64, false)].into(), + ), + false, + ), + ] + .into(), + ), + false, + ), + ]); + + let mut digester = ArrowDigesterCore::::new(schema.clone()); + let field_names: Vec<&String> = digester.fields_digest_buffer.keys().collect(); + + assert_eq!(field_names.len(), 3); + 
assert!(field_names.contains(&&"id".to_owned())); + assert!(field_names.contains(&&"nested/name".to_owned())); + assert!(field_names.contains(&&"nested/deep/value".to_owned())); + + // Test the nested field update by creating record_batch and using the update method + let id_array = Arc::new(Int32Array::from(vec![Some(1), Some(2)])) as ArrayRef; + let name_array = Arc::new(StringArray::from(vec![Some("Alice"), Some("Bob")])) as ArrayRef; + let value_array = Arc::new(Int64Array::from(vec![Some(100), Some(200)])) as ArrayRef; + + let schema_ref = Arc::new(schema); + + let nested_struct = StructArray::from(vec![ + ( + Arc::new(Field::new("name", DataType::Utf8, true)), + name_array, + ), + ( + Arc::new(Field::new( + "deep", + DataType::Struct(vec![Field::new("value", DataType::Int64, false)].into()), + false, + )), + Arc::new(StructArray::from(vec![( + Arc::new(Field::new("value", DataType::Int64, false)), + value_array, + )])) as ArrayRef, + ), + ]); + + let record_batch = RecordBatch::try_new( + Arc::clone(&schema_ref), + vec![id_array, Arc::new(nested_struct)], + ) + .unwrap(); + + digester.update(&record_batch); + + // Check the digest + assert_eq!( + encode(digester.finalize()), + "b7faf50f1328ec80b575e018c121eed9d0e7e84ad72645499ebc8667e64199a7" + ); + } +} diff --git a/src/lib.rs b/src/lib.rs index a713335..caad34c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -3,5 +3,45 @@ extern crate uniffi as uniffi_external; uniffi_external::setup_scaffolding!(); -pub mod arrow_digester; -mod pyarrow; +use arrow::array::{Array, RecordBatch}; +use arrow_schema::Schema; +use sha2::Sha256; + +use crate::arrow_digester_core::ArrowDigesterCore; + +/// Maps `arrow_digester_core` function to a `sha_256` digester + versioning +pub struct ArrowDigester { + digester: ArrowDigesterCore, +} + +impl ArrowDigester { + /// Create a new instance of `ArrowDigester` with SHA256 as the digester with the schema which will be enforce through each update + pub fn new(schema: Schema) -> Self { + Self { + digester: ArrowDigesterCore::::new(schema), + } + } + + /// Update the digester with a new `RecordBatch` + pub fn update(&mut self, record_batch: &RecordBatch) { + self.digester.update(record_batch); + } + + /// Consume the digester and finalize the hash computation + pub fn finalize(self) -> Vec { + self.digester.finalize() + } + + /// Function to hash an Array in one go + pub fn hash_array(array: &dyn Array) -> Vec { + ArrowDigesterCore::::hash_array(array) + } + + /// Function to hash a complete `RecordBatch` in one go + pub fn hash_record_batch(record_batch: &RecordBatch) -> Vec { + ArrowDigesterCore::::hash_record_batch(record_batch) + } +} + +pub(crate) mod arrow_digester_core; +pub mod pyarrow; diff --git a/src/pyarrow.rs b/src/pyarrow.rs index f75d393..d193bef 100644 --- a/src/pyarrow.rs +++ b/src/pyarrow.rs @@ -1,12 +1,10 @@ +use crate::ArrowDigester; use arrow::array::{RecordBatch, StructArray}; use arrow::ffi::{from_ffi, FFI_ArrowArray, FFI_ArrowSchema}; -use sha2::Sha256; - -use crate::arrow_digester::ArrowDigester; /// Process an Arrow table via C Data Interface /// -/// # Safety +/// # Panics /// The pointers must be valid Arrow C Data Interface structs from Python's pyarrow #[uniffi::export] @@ -30,5 +28,5 @@ pub fn process_arrow_table(array_ptr: u64, schema_ptr: u64) -> Vec { }; // Hash the table - ArrowDigester::::hash_record_batch(&RecordBatch::from(StructArray::from(array_data))) + ArrowDigester::hash_record_batch(&RecordBatch::from(StructArray::from(array_data))) } diff --git a/tests/arrow_digester.rs 
b/tests/arrow_digester.rs new file mode 100644 index 0000000..ef0c392 --- /dev/null +++ b/tests/arrow_digester.rs @@ -0,0 +1,409 @@ +#[cfg(test)] +mod tests { + #![expect(clippy::unwrap_used, reason = "Okay in test")] + use std::sync::Arc; + + use arrow::{ + array::{ + ArrayRef, BinaryArray, BooleanArray, Date32Array, Date64Array, Decimal32Array, + Decimal64Array, Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, + Int8Array, LargeBinaryArray, LargeListArray, LargeStringArray, ListArray, RecordBatch, + StringArray, Time32MillisecondArray, Time32SecondArray, Time64MicrosecondArray, + Time64NanosecondArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array, + }, + datatypes::Int32Type, + }; + use arrow_schema::{DataType, Field, Schema, TimeUnit}; + use hex::encode; + use pretty_assertions::assert_eq; + + use arrow::array::Decimal128Array; + use starfix::ArrowDigester; + + #[expect(clippy::too_many_lines, reason = "Comprehensive schema test")] + #[test] + fn schema() { + let schema = Schema::new(vec![ + Field::new("bool", DataType::Boolean, true), + Field::new("int8", DataType::Int8, false), + Field::new("uint8", DataType::UInt8, false), + Field::new("int16", DataType::Int16, false), + Field::new("uint16", DataType::UInt16, false), + Field::new("int32", DataType::Int32, false), + Field::new("uint32", DataType::UInt32, false), + Field::new("int64", DataType::Int64, false), + Field::new("uint64", DataType::UInt64, false), + Field::new("float32", DataType::Float32, false), + Field::new("float64", DataType::Float64, false), + Field::new("date32", DataType::Date32, false), + Field::new("date64", DataType::Date64, false), + Field::new("time32_second", DataType::Time32(TimeUnit::Second), false), + Field::new( + "time32_millis", + DataType::Time32(TimeUnit::Millisecond), + false, + ), + Field::new( + "time64_micro", + DataType::Time64(TimeUnit::Microsecond), + false, + ), + Field::new("time64_nano", DataType::Time64(TimeUnit::Nanosecond), false), + Field::new("binary", DataType::Binary, true), + Field::new("large_binary", DataType::LargeBinary, true), + Field::new("utf8", DataType::Utf8, true), + Field::new("large_utf8", DataType::LargeUtf8, true), + Field::new( + "list", + DataType::List(Box::new(Field::new("item", DataType::Int32, true)).into()), + true, + ), + Field::new( + "large_list", + DataType::LargeList(Box::new(Field::new("item", DataType::Int32, true)).into()), + true, + ), + Field::new("decimal32", DataType::Decimal32(9, 2), true), + Field::new("decimal64", DataType::Decimal64(18, 3), true), + Field::new("decimal128", DataType::Decimal128(38, 5), true), + ]); + + // Empty Table Hashing Check + + assert_eq!( + encode(ArrowDigester::new(schema.clone()).finalize()), + "a42e35f6623d86b72350bf0bb74b97781946df45423f192c397d435c254bc71e" + ); + + let batch = RecordBatch::try_new( + Arc::new(schema), + vec![ + Arc::new(BooleanArray::from(vec![Some(true)])), + Arc::new(Int8Array::from(vec![1_i8])), + Arc::new(UInt8Array::from(vec![1_u8])), + Arc::new(Int16Array::from(vec![100_i16])), + Arc::new(UInt16Array::from(vec![100_u16])), + Arc::new(Int32Array::from(vec![1000_i32])), + Arc::new(UInt32Array::from(vec![1000_u32])), + Arc::new(Int64Array::from(vec![100_000_i64])), + Arc::new(UInt64Array::from(vec![100_000_u64])), + Arc::new(Float32Array::from(vec![1.5_f32])), + Arc::new(Float64Array::from(vec![1.5_f64])), + Arc::new(Date32Array::from(vec![18993_i32])), + Arc::new(Date64Array::from(vec![1_640_995_200_000_i64])), + Arc::new(Time32SecondArray::from(vec![3600_i32])), + 
Arc::new(Time32MillisecondArray::from(vec![3_600_000_i32])), + Arc::new(Time64MicrosecondArray::from(vec![3_600_000_000_i64])), + Arc::new(Time64NanosecondArray::from(vec![3_600_000_000_000_i64])), + Arc::new(BinaryArray::from(vec![Some(b"data1".as_ref())])), + Arc::new(LargeBinaryArray::from(vec![Some(b"large1".as_ref())])), + Arc::new(StringArray::from(vec![Some("text1")])), + Arc::new(LargeStringArray::from(vec![Some("large_text1")])), + Arc::new(ListArray::from_iter_primitive::(vec![ + Some(vec![Some(1), Some(2)]), + ])), + Arc::new(LargeListArray::from_iter_primitive::( + vec![Some(vec![Some(5), Some(6)])], + )), + Arc::new( + Decimal32Array::from_iter(vec![Some(12345)]) + .with_precision_and_scale(9, 2) + .unwrap(), + ), + Arc::new( + Decimal64Array::from_iter(vec![Some(123_456_789_012)]) + .with_precision_and_scale(18, 3) + .unwrap(), + ), + Arc::new( + Decimal128Array::from_iter(vec![Some( + 123_456_789_012_345_678_901_234_567_890_i128, + )]) + .with_precision_and_scale(38, 5) + .unwrap(), + ), + ], + ) + .unwrap(); + // Hash the record batch + assert_eq!( + encode(ArrowDigester::hash_record_batch(&batch)), + "da0d7d3d76a47e88648e3a1160a5d2432647f0769e08b42315533163c36b3eb0" + ); + } + + #[test] + fn boolean_array_hashing() { + let bool_array = BooleanArray::from(vec![Some(true), None, Some(false), Some(true)]); + let hash = hex::encode(ArrowDigester::hash_array(&bool_array)); + assert_eq!( + hash, + "f9abeb37d9395f359b48a379f0a8467c572b19ecc6cae9fa85e1bf627a52a8f3" + ); + } + + /// Test int32 array hashing which is really meant to test fixed size element array hashing + #[test] + fn int32_array_hashing() { + let int_array = Int32Array::from(vec![Some(42), None, Some(-7), Some(0)]); + let hash = hex::encode(ArrowDigester::hash_array(&int_array)); + assert_eq!( + hash, + "27f2411e6839eb1e3fe706ac3f01e704c7b46357360fb2ddb8a08ec98e8ba4fa" + ); + } + + /// Test time array hashing + #[test] + fn time32_array_hashing() { + let time_array = Time32SecondArray::from(vec![Some(1000), None, Some(5000), Some(0)]); + let hash = hex::encode(ArrowDigester::hash_array(&time_array)); + assert_eq!( + hash, + "9000b74aa80f685103a8cafc7e113aa8f33ccc0c94ea3713318d2cc2f3436baa" + ); + } + + #[test] + fn time64_array_hashing() { + let time_array = + Time64MicrosecondArray::from(vec![Some(1_000_000), None, Some(5_000_000), Some(0)]); + let hash = hex::encode(ArrowDigester::hash_array(&time_array)); + assert_eq!( + hash, + "95f12143d789f364a3ed52f7300f8f91dc21fbe00c34aed798ca8fd54182dea3" + ); + } + + #[test] + fn time_array_different_units_produce_different_hashes() { + let time32_second = Time32SecondArray::from(vec![Some(1000), Some(2000)]); + let time32_millis = Time32MillisecondArray::from(vec![Some(1000), Some(2000)]); + + let hash_second = hex::encode(ArrowDigester::hash_array(&time32_second)); + let hash_millis = hex::encode(ArrowDigester::hash_array(&time32_millis)); + + assert_ne!(hash_second, hash_millis); + } + + /// Test binary array hashing + #[test] + fn binary_array_hashing() { + let binary_array = BinaryArray::from(vec![ + Some(b"hello".as_ref()), + None, + Some(b"world".as_ref()), + Some(b"".as_ref()), + ]); + let hash = hex::encode(ArrowDigester::hash_array(&binary_array)); + assert_eq!( + hash, + "466801efd880d2acecd6c78915b5c2a51476870f9116912834d79de43a000071" + ); + + // Test large binary array with same data to ensure consistency + let large_binary_array = LargeBinaryArray::from(vec![ + Some(b"hello".as_ref()), + None, + Some(b"world".as_ref()), + Some(b"".as_ref()), + ]); + + 
assert_ne!( + hex::encode(ArrowDigester::hash_array(&large_binary_array)), + hash + ); + } + + // Test binary array collision vulnerability - different partitions should produce different hashes + #[test] + fn binary_array_length_prefix_prevents_collisions() { + // Array 1: [[0x01, 0x02], [0x03]] + let array1 = BinaryArray::from(vec![Some(&[0x01_u8, 0x02_u8][..]), Some(&[0x03_u8][..])]); + + // Array 2: [[0x01], [0x02, 0x03]] + let array2 = BinaryArray::from(vec![Some(&[0x01_u8][..]), Some(&[0x02_u8, 0x03_u8][..])]); + + let hash1 = hex::encode(ArrowDigester::hash_array(&array1)); + let hash2 = hex::encode(ArrowDigester::hash_array(&array2)); + + // Without length prefix, these would collide (both hash to 0x01 0x02 0x03) + // With length prefix, they should produce different hashes + assert_ne!( + hash1, hash2, + "Binary arrays with different partitions should produce different hashes" + ); + } + + // Test string array collision vulnerability - different partitions should produce different hashes + #[test] + fn string_array_length_prefix_prevents_collisions() { + // Array 1: ["ab", "c"] + let array1 = StringArray::from(vec![Some("ab"), Some("c")]); + + // Array 2: ["a", "bc"] + let array2 = StringArray::from(vec![Some("a"), Some("bc")]); + + let hash1 = hex::encode(ArrowDigester::hash_array(&array1)); + let hash2 = hex::encode(ArrowDigester::hash_array(&array2)); + + // Without length prefix, these would collide (both hash to "abc") + // With length prefix, they should produce different hashes + assert_ne!( + hash1, hash2, + "String arrays with different partitions should produce different hashes" + ); + } + + // Test String hashing + #[test] + fn string_array_hashing() { + let string_array = StringArray::from(vec![Some("hello"), None, Some("world"), Some("")]); + let hash = hex::encode(ArrowDigester::hash_array(&string_array)); + assert_eq!( + hash, + "14a2d2eaf535b6e78fbf1d58ae93accce424eafd20fa449eff8acefc47903d3d" + ); + + // Test large string array with same data to ensure consistency + let large_string_array = + LargeStringArray::from(vec![Some("hello"), None, Some("world"), Some("")]); + + assert_ne!( + hex::encode(ArrowDigester::hash_array(&large_string_array)), + hash + ); + } + + // List array hashing test + #[test] + fn list_array_hashing() { + let list_array = ListArray::from_iter_primitive::(vec![ + Some(vec![Some(1), Some(2), Some(3)]), + None, + Some(vec![Some(4), Some(5)]), + Some(vec![Some(6)]), + ]); + + let hash = hex::encode(ArrowDigester::hash_array(&list_array)); + assert_eq!( + hash, + "1a8d06635dec40079b979ce439f662c1fb6456bb7e02bbf7d8e8048c61498faf" + ); + } + + // Test all types of decimal hashing + #[test] + fn decimal_array_hashing() { + // Test Decimal32 (precision 1-9) + let decimal32_array = + Decimal128Array::from_iter(vec![Some(123), None, Some(-456), Some(0)]) + .with_precision_and_scale(9, 2) + .unwrap(); + + assert_eq!( + encode(ArrowDigester::hash_array(&decimal32_array)), + "ef29250615f9d6ab34672c3b11dfa2dcda6e8e6164bc55899c13887f17705f5d" + ); + + // Test Decimal64 (precision 10-18) + let decimal64_array = Decimal128Array::from_iter(vec![ + Some(1_234_567_890_123), + None, + Some(-9_876_543_210), + Some(0), + ]) + .with_precision_and_scale(15, 3) + .unwrap(); + assert_eq!( + encode(ArrowDigester::hash_array(&decimal64_array)), + "efa4ed72641051233889c07775366cbf2e56eb4b0fcfd46653f5741e81786f08" + ); + + // Test Decimal128 (precision 19-38) + let decimal128_array = Decimal128Array::from_iter(vec![ + Some(123_456_789_012_345_678_901_234_567), + None, + 
Some(-987_654_321_098_765_432_109_876_543), + Some(0), + ]) + .with_precision_and_scale(38, 5) + .unwrap(); + assert_eq!( + hex::encode(ArrowDigester::hash_array(&decimal128_array)), + "55cc4d81a048dbca001ca8581673a5a6c93efd870d358df211a545c2af9b658d" + ); + } + + #[test] + fn commutative_tables() { + let uids = Arc::new(Int32Array::from(vec![Some(1), Some(2), Some(3), Some(4)])) as ArrayRef; + let fake_data = Arc::new(BooleanArray::from(vec![ + Some(true), + Some(false), + None, + Some(true), + ])) as ArrayRef; + + // Create two record batches with same data but different order + let batch1 = RecordBatch::try_new( + Arc::new(Schema::new(vec![ + Field::new("uids", DataType::Int32, false), + Field::new("flags", DataType::Boolean, true), + ])), + vec![Arc::clone(&uids), Arc::clone(&fake_data)], + ); + + let batch2 = RecordBatch::try_new( + Arc::new(Schema::new(vec![ + Field::new("flags", DataType::Boolean, true), + Field::new("uids", DataType::Int32, false), + ])), + vec![fake_data, uids], + ); + + // Hash both record batches + assert_eq!( + encode(ArrowDigester::hash_record_batch(batch1.as_ref().unwrap())), + encode(ArrowDigester::hash_record_batch(batch2.as_ref().unwrap())) + ); + } + + #[test] + fn record_batch_hashing() { + let schema = Arc::new(Schema::new(vec![ + Field::new("uids", DataType::Int32, false), + Field::new("flags", DataType::Boolean, true), + ])); + + // Create two record batches with different data to simulate loading at different times + let uids = Arc::new(Int32Array::from(vec![Some(1), Some(2), Some(3), Some(4)])) as ArrayRef; + let fake_data = Arc::new(BooleanArray::from(vec![ + Some(true), + Some(false), + None, + Some(true), + ])); + + let batch1 = RecordBatch::try_new(Arc::clone(&schema), vec![uids, fake_data]).unwrap(); + + let uids2 = + Arc::new(Int32Array::from(vec![Some(5), Some(6), Some(7), Some(8)])) as ArrayRef; + let fake_data2 = Arc::new(BooleanArray::from(vec![ + Some(false), + Some(true), + Some(true), + None, + ])); + + let batch2 = RecordBatch::try_new(Arc::clone(&schema), vec![uids2, fake_data2]).unwrap(); + // Hash both record batches + let mut digester = ArrowDigester::new((*schema).clone()); + digester.update(&batch1); + digester.update(&batch2); + assert_eq!( + encode(digester.finalize()), + "37954b3edd169c7a9e65604c191caf6a307940357305d182a5d2168047e9cc51" + ); + } +} From 63fb32a32813de267680f7f126638e50473008dc Mon Sep 17 00:00:00 2001 From: synicix Date: Thu, 11 Dec 2025 05:54:09 +0000 Subject: [PATCH 41/53] Up clippy version --- .github/workflows/clippy.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/clippy.yml b/.github/workflows/clippy.yml index ff98561..e693886 100644 --- a/.github/workflows/clippy.yml +++ b/.github/workflows/clippy.yml @@ -11,7 +11,7 @@ jobs: - name: Install Rust + components uses: actions-rust-lang/setup-rust-toolchain@v1 with: - toolchain: 1.90.0 + toolchain: 1.91.1 components: rustfmt,clippy - name: Run syntax and style tests run: cargo clippy --all-targets -- -D warnings From d4a233ebfcf7a89d2fd645ec91bf8d88620d1037 Mon Sep 17 00:00:00 2001 From: synicix Date: Thu, 11 Dec 2025 06:10:40 +0000 Subject: [PATCH 42/53] Update hashing to meet new arrow format --- Cargo.toml | 2 +- src/arrow_digester_core.rs | 6 +++--- tests/arrow_digester.rs | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 13ba115..4e9e61a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -23,7 +23,7 @@ serde = "1.0.228" serde_json = "1.0.145" sha2 = "0.10.9" # automated 
CFFI + bindings in other languages -uniffi = { version = "0.29.4", features = ["cli", "tokio"] } +uniffi = { version = "0.30.0", features = ["cli", "tokio"] } [dev-dependencies] hex = "0.4.3" diff --git a/src/arrow_digester_core.rs b/src/arrow_digester_core.rs index 3af7456..4a9c787 100644 --- a/src/arrow_digester_core.rs +++ b/src/arrow_digester_core.rs @@ -774,7 +774,7 @@ mod tests { "LargeBinary" ], "large_list": [ - "Field { \"large_list\": nullable LargeList(nullable Int32) }", + "Field { \"large_list\": nullable LargeList(Int32) }", { "LargeList": { "data_type": "Int32", @@ -791,7 +791,7 @@ mod tests { "LargeUtf8" ], "list": [ - "Field { \"list\": nullable List(nullable Int32) }", + "Field { \"list\": nullable List(Int32) }", { "List": { "data_type": "Int32", @@ -919,7 +919,7 @@ mod tests { // Check the digest assert_eq!( encode(digester.finalize()), - "b7faf50f1328ec80b575e018c121eed9d0e7e84ad72645499ebc8667e64199a7" + "e32c1f0981ee262622e0e91a5ea99210a085b62d6025d70eb1ee074096a151dd" ); } } diff --git a/tests/arrow_digester.rs b/tests/arrow_digester.rs index ef0c392..b88f977 100644 --- a/tests/arrow_digester.rs +++ b/tests/arrow_digester.rs @@ -72,7 +72,7 @@ mod tests { assert_eq!( encode(ArrowDigester::new(schema.clone()).finalize()), - "a42e35f6623d86b72350bf0bb74b97781946df45423f192c397d435c254bc71e" + "c7bc0a0c84aca684adbec21f8cb481781332fc91a205165a6c74c3a63a80e9b2" ); let batch = RecordBatch::try_new( @@ -128,7 +128,7 @@ mod tests { // Hash the record batch assert_eq!( encode(ArrowDigester::hash_record_batch(&batch)), - "da0d7d3d76a47e88648e3a1160a5d2432647f0769e08b42315533163c36b3eb0" + "9972058c784f11f63a1d49998a79c00616b0f0a34b9774bbc7e2a3247df709ca" ); } From 5a86fbcbc5c8bab989e8c1da8b08ef99f103f267 Mon Sep 17 00:00:00 2001 From: synicix Date: Thu, 11 Dec 2025 06:12:27 +0000 Subject: [PATCH 43/53] Remove stale file that was already move to the lib module --- src/arrow_digester.rs | 39 --------------------------------------- 1 file changed, 39 deletions(-) delete mode 100644 src/arrow_digester.rs diff --git a/src/arrow_digester.rs b/src/arrow_digester.rs deleted file mode 100644 index 6942c34..0000000 --- a/src/arrow_digester.rs +++ /dev/null @@ -1,39 +0,0 @@ -use arrow::array::{Array, RecordBatch}; -use arrow_schema::Schema; -use sha2::Sha256; - -use crate::arrow_digester_core::ArrowDigesterCore; - -/// Maps `arrow_digester_core` function to a `sha_256` digester + versioning -pub struct ArrowDigester { - digester: ArrowDigesterCore, -} - -impl ArrowDigester { - /// Create a new instance of `ArrowDigester` with SHA256 as the digester with the schema which will be enforce through each update - pub fn new(schema: Schema) -> Self { - Self { - digester: ArrowDigesterCore::::new(schema), - } - } - - /// Update the digester with a new `RecordBatch` - pub fn update(&mut self, record_batch: &RecordBatch) { - self.digester.update(record_batch); - } - - /// Consume the digester and finalize the hash computation - pub fn finalize(self) -> Vec { - self.digester.finalize() - } - - /// Function to hash an Array in one go - pub fn hash_array(array: &dyn Array) -> Vec { - ArrowDigesterCore::::hash_array(array) - } - - /// Function to hash a complete `RecordBatch` in one go - pub fn hash_record_batch(record_batch: &RecordBatch) -> Vec { - ArrowDigesterCore::::hash_record_batch(record_batch) - } -} From b9b6384e1d97279949b0902e7eb36330861d0ed3 Mon Sep 17 00:00:00 2001 From: synicix Date: Thu, 11 Dec 2025 06:29:19 +0000 Subject: [PATCH 44/53] Add nullable and non-nullable tests --- 
tests/arrow_digester.rs | 47 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/tests/arrow_digester.rs b/tests/arrow_digester.rs index b88f977..32dcb96 100644 --- a/tests/arrow_digester.rs +++ b/tests/arrow_digester.rs @@ -406,4 +406,51 @@ mod tests { "37954b3edd169c7a9e65604c191caf6a307940357305d182a5d2168047e9cc51" ); } + + #[test] + fn nullable_vs_non_nullable_array_produces_same_hash() { + let nullable_array = Int32Array::from(vec![Some(1), Some(2), Some(3)]); + let non_nullable_array = Int32Array::from(vec![1, 2, 3]); + + let hash_nullable = hex::encode(ArrowDigester::hash_array(&nullable_array)); + let hash_non_nullable = hex::encode(ArrowDigester::hash_array(&non_nullable_array)); + + assert_eq!( + hash_nullable, hash_non_nullable, + "Nullable and non-nullable arrays with same data should produce same hashes" + ); + } + + #[test] + fn empty_nullable_vs_non_nullable_array_produces_different_hash() { + let empty_nullable_array: Int32Array = Int32Array::from(vec![] as Vec>); + let empty_non_nullable_array: Int32Array = Int32Array::from(vec![] as Vec); + + let hash_nullable = hex::encode(ArrowDigester::hash_array(&empty_nullable_array)); + let hash_non_nullable = hex::encode(ArrowDigester::hash_array(&empty_non_nullable_array)); + + // Both are empty, but their nullability metadata may differ + // This test documents the expected behavior + assert_eq!(hash_nullable, hash_non_nullable); + } + + #[test] + fn nullable_vs_non_nullable_schema_produces_different_hash() { + let nullable_schema = Schema::new(vec![ + Field::new("col1", DataType::Int32, true), + Field::new("col2", DataType::Boolean, true), + ]); + let non_nullable_schema = Schema::new(vec![ + Field::new("col1", DataType::Int32, false), + Field::new("col2", DataType::Boolean, false), + ]); + + let hash_nullable = hex::encode(ArrowDigester::new(nullable_schema).finalize()); + let hash_non_nullable = hex::encode(ArrowDigester::new(non_nullable_schema).finalize()); + + assert_ne!( + hash_nullable, hash_non_nullable, + "Nullable and non-nullable schemas with same data types should produce different hashes" + ); + } } From 45cb028f35fe9991cf9097ec724feb0ce35d9af0 Mon Sep 17 00:00:00 2001 From: synicix Date: Thu, 11 Dec 2025 08:24:14 +0000 Subject: [PATCH 45/53] Add 3 bytes at the start for versioning --- src/lib.rs | 16 +++++++++++----- tests/arrow_digester.rs | 35 ++++++++++++++++++++--------------- 2 files changed, 31 insertions(+), 20 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index caad34c..1c99028 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,5 +1,4 @@ -//! Intuitive compute pipeline orchestration with reproducibility, performance, and scalability in -//! mind. 
+/// Crate for extern crate uniffi as uniffi_external; uniffi_external::setup_scaffolding!(); @@ -9,6 +8,7 @@ use sha2::Sha256; use crate::arrow_digester_core::ArrowDigesterCore; +const VERSION_BYTES: [u8; 3] = [0_u8, 0_u8, 1_u8]; // Version 1.0 /// Maps `arrow_digester_core` function to a `sha_256` digester + versioning pub struct ArrowDigester { digester: ArrowDigesterCore, @@ -29,17 +29,23 @@ impl ArrowDigester { /// Consume the digester and finalize the hash computation pub fn finalize(self) -> Vec { - self.digester.finalize() + Self::prepend_version_bytes(self.digester.finalize()) } /// Function to hash an Array in one go pub fn hash_array(array: &dyn Array) -> Vec { - ArrowDigesterCore::::hash_array(array) + Self::prepend_version_bytes(ArrowDigesterCore::::hash_array(array)) } /// Function to hash a complete `RecordBatch` in one go pub fn hash_record_batch(record_batch: &RecordBatch) -> Vec { - ArrowDigesterCore::::hash_record_batch(record_batch) + Self::prepend_version_bytes(ArrowDigesterCore::::hash_record_batch(record_batch)) + } + + fn prepend_version_bytes(digest: Vec) -> Vec { + let mut complete_hash = VERSION_BYTES.clone().to_vec(); + complete_hash.extend(digest); + complete_hash } } diff --git a/tests/arrow_digester.rs b/tests/arrow_digester.rs index 32dcb96..4ef9626 100644 --- a/tests/arrow_digester.rs +++ b/tests/arrow_digester.rs @@ -72,7 +72,7 @@ mod tests { assert_eq!( encode(ArrowDigester::new(schema.clone()).finalize()), - "c7bc0a0c84aca684adbec21f8cb481781332fc91a205165a6c74c3a63a80e9b2" + "000001c7bc0a0c84aca684adbec21f8cb481781332fc91a205165a6c74c3a63a80e9b2" ); let batch = RecordBatch::try_new( @@ -128,7 +128,7 @@ mod tests { // Hash the record batch assert_eq!( encode(ArrowDigester::hash_record_batch(&batch)), - "9972058c784f11f63a1d49998a79c00616b0f0a34b9774bbc7e2a3247df709ca" + "0000019972058c784f11f63a1d49998a79c00616b0f0a34b9774bbc7e2a3247df709ca" ); } @@ -138,7 +138,7 @@ mod tests { let hash = hex::encode(ArrowDigester::hash_array(&bool_array)); assert_eq!( hash, - "f9abeb37d9395f359b48a379f0a8467c572b19ecc6cae9fa85e1bf627a52a8f3" + "000001f9abeb37d9395f359b48a379f0a8467c572b19ecc6cae9fa85e1bf627a52a8f3" ); } @@ -149,7 +149,7 @@ mod tests { let hash = hex::encode(ArrowDigester::hash_array(&int_array)); assert_eq!( hash, - "27f2411e6839eb1e3fe706ac3f01e704c7b46357360fb2ddb8a08ec98e8ba4fa" + "00000127f2411e6839eb1e3fe706ac3f01e704c7b46357360fb2ddb8a08ec98e8ba4fa" ); } @@ -160,7 +160,7 @@ mod tests { let hash = hex::encode(ArrowDigester::hash_array(&time_array)); assert_eq!( hash, - "9000b74aa80f685103a8cafc7e113aa8f33ccc0c94ea3713318d2cc2f3436baa" + "0000019000b74aa80f685103a8cafc7e113aa8f33ccc0c94ea3713318d2cc2f3436baa" ); } @@ -171,7 +171,7 @@ mod tests { let hash = hex::encode(ArrowDigester::hash_array(&time_array)); assert_eq!( hash, - "95f12143d789f364a3ed52f7300f8f91dc21fbe00c34aed798ca8fd54182dea3" + "00000195f12143d789f364a3ed52f7300f8f91dc21fbe00c34aed798ca8fd54182dea3" ); } @@ -198,7 +198,7 @@ mod tests { let hash = hex::encode(ArrowDigester::hash_array(&binary_array)); assert_eq!( hash, - "466801efd880d2acecd6c78915b5c2a51476870f9116912834d79de43a000071" + "000001466801efd880d2acecd6c78915b5c2a51476870f9116912834d79de43a000071" ); // Test large binary array with same data to ensure consistency @@ -262,7 +262,7 @@ mod tests { let hash = hex::encode(ArrowDigester::hash_array(&string_array)); assert_eq!( hash, - "14a2d2eaf535b6e78fbf1d58ae93accce424eafd20fa449eff8acefc47903d3d" + "00000114a2d2eaf535b6e78fbf1d58ae93accce424eafd20fa449eff8acefc47903d3d" 
); // Test large string array with same data to ensure consistency @@ -288,7 +288,7 @@ mod tests { let hash = hex::encode(ArrowDigester::hash_array(&list_array)); assert_eq!( hash, - "1a8d06635dec40079b979ce439f662c1fb6456bb7e02bbf7d8e8048c61498faf" + "0000011a8d06635dec40079b979ce439f662c1fb6456bb7e02bbf7d8e8048c61498faf" ); } @@ -303,7 +303,7 @@ mod tests { assert_eq!( encode(ArrowDigester::hash_array(&decimal32_array)), - "ef29250615f9d6ab34672c3b11dfa2dcda6e8e6164bc55899c13887f17705f5d" + "000001ef29250615f9d6ab34672c3b11dfa2dcda6e8e6164bc55899c13887f17705f5d" ); // Test Decimal64 (precision 10-18) @@ -317,7 +317,7 @@ mod tests { .unwrap(); assert_eq!( encode(ArrowDigester::hash_array(&decimal64_array)), - "efa4ed72641051233889c07775366cbf2e56eb4b0fcfd46653f5741e81786f08" + "000001efa4ed72641051233889c07775366cbf2e56eb4b0fcfd46653f5741e81786f08" ); // Test Decimal128 (precision 19-38) @@ -331,7 +331,7 @@ mod tests { .unwrap(); assert_eq!( hex::encode(ArrowDigester::hash_array(&decimal128_array)), - "55cc4d81a048dbca001ca8581673a5a6c93efd870d358df211a545c2af9b658d" + "00000155cc4d81a048dbca001ca8581673a5a6c93efd870d358df211a545c2af9b658d" ); } @@ -363,10 +363,15 @@ mod tests { ); // Hash both record batches - assert_eq!( - encode(ArrowDigester::hash_record_batch(batch1.as_ref().unwrap())), + let hash1 = format!( + "000001{}", + encode(ArrowDigester::hash_record_batch(batch1.as_ref().unwrap())) + ); + let hash2 = format!( + "000001{}", encode(ArrowDigester::hash_record_batch(batch2.as_ref().unwrap())) ); + assert_eq!(hash1, hash2); } #[test] @@ -403,7 +408,7 @@ mod tests { digester.update(&batch2); assert_eq!( encode(digester.finalize()), - "37954b3edd169c7a9e65604c191caf6a307940357305d182a5d2168047e9cc51" + "00000137954b3edd169c7a9e65604c191caf6a307940357305d182a5d2168047e9cc51" ); } From 2f866e44534b7032c34c3389870efe1b7d9a9581 Mon Sep 17 00:00:00 2001 From: synicix Date: Thu, 11 Dec 2025 09:19:48 +0000 Subject: [PATCH 46/53] Add documentation about hashing --- README.md | 493 ++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 481 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 60da8ae..e7a9541 100644 --- a/README.md +++ b/README.md @@ -1,19 +1,488 @@ -# Overview -Hashing Lib for Arrow Data Tables using C_Pointers with Arrow_Digest from: https://github.com/kamu-data/arrow-digest +# StarFix: Arrow Data Deterministic Hashing -# Usage -The repo is setup to use dev containers of VSCode. After starting up the container and connecting to it the process to install the rust lib and a python package is: -```maturin develop --uv``` +## Overview -NOTE: After every code edit in rust code, you will need to rerun the command to rebuild it then restart the kernel in the Jupyter Notebook side +StarFix is a cryptographic hashing library for Apache Arrow data tables. It provides a deterministic way to compute unique digests for Arrow data structures, enabling efficient identification and comparison of data regardless of storage order or location. +The hashing system is built on top of SHA-256 (configurable to other digest algorithms via the `Digest` trait) and uses a hierarchical approach to hash different components of an Arrow table: schema metadata and field values. 
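+
+A minimal end-to-end sketch of the public API (crate name `starfix` as used in the integration tests; the `arrow` and `hex` crates are assumed to be available; printed digests are illustrative only, not reference values):
+
+```rust
+use std::sync::Arc;
+
+use arrow::array::{ArrayRef, Int32Array, RecordBatch};
+use arrow_schema::{DataType, Field, Schema};
+use starfix::ArrowDigester;
+
+fn main() {
+    let schema = Schema::new(vec![Field::new("uids", DataType::Int32, false)]);
+    let batch = RecordBatch::try_new(
+        Arc::new(schema.clone()),
+        vec![Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef],
+    )
+    .unwrap();
+
+    // One-shot hashing of a complete record batch
+    // (the returned Vec<u8> is 3 version bytes followed by the SHA-256 digest).
+    println!("{}", hex::encode(ArrowDigester::hash_record_batch(&batch)));
+
+    // Incremental hashing across multiple batches that share the same schema.
+    let mut digester = ArrowDigester::new(schema);
+    digester.update(&batch);
+    println!("{}", hex::encode(digester.finalize()));
+}
+```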
-# Hashing System Overview -ArrowDigester stores the digest for multiple components of the arrow data table before combining them +## Core Architecture -- schema: Each field name is serialized via PostCard: https://docs.rs/postcard/latest/postcard/ - - Was chosen since I was originally using JSON but wanted something even faster, hence postcard. It is design to be very resource efficient +### Main Components -- fields_digest_buffer: Flattens all nested schema with the '__' delimiter between the parent and sub level in this format parent_field_name__child_field_name +The hashing system consists of three main hashing levels: -- Upon finalization of the hash, the instance consume itself due to digest.finalize consuming self under ``field_digest_buffer``. Following that it adds it to a final digest in this order: schema + field_digest_buffer (lexical order of the field name) +1. **Schema Digest** - Hash of the table schema (field names, types, and nullability) +2. **Field Digests** - Individual hashes for each field's data +3. **Final Digest** - Combined hash from schema + all field digests + +### DigestBufferType Enum + +The codebase uses a `DigestBufferType` enum to differentiate between nullable and non-nullable fields: + +```rust +enum DigestBufferType { + NonNullable(D), // Just the data digest + Nullable(BitVec, D), // Null bits vector + data digest +} +``` + +This separation is crucial because nullable and non-nullable fields must be hashed differently to ensure data integrity and distinguish between actual nulls and missing data. + +## Hashing Flow + +### Record Batch Hashing + +When hashing a complete `RecordBatch`, the process follows these steps: + +``` +1. Create ArrowDigester with schema + ├─ Hash the schema (JSON serialized) + └─ Initialize field digest buffers + └─ Flatten nested struct fields with "/" delimiter + └─ Mark each field as Nullable or NonNullable + +2. Update with record batch data + ├─ For each field: + │ └─ Match on data type and call appropriate hashing function + │ └─ Update both null bits (if nullable) and data digest + └─ Accumulate all digests + +3. Finalize + ├─ Combine schema digest + ├─ Process each field digest in alphabetical order + │ ├─ If nullable: hash (null_bits.len + raw_null_bits + data) + │ └─ If non-nullable: hash data only + └─ Return final digest +``` + +### Direct Array Hashing + +Arrays can also be hashed independently without schema context: + +``` +1. Hash the data type metadata (JSON serialized) +2. Initialize digest buffer based on array nullability +3. Call array_digest_update with appropriate handler +4. Finalize and combine digests +``` + +## Null Bits Handling + +### Why Null Bits Matter + +Null bits are essential to the hashing algorithm because: +- They distinguish between actual null values and valid data +- They enable reliable hashing of nullable vs non-nullable fields +- They preserve data integrity across different representations + +### Null Bits Processing + +For nullable fields, the system maintains a `BitVec` (bitvector) where each bit represents whether a value at that index is valid (`true`) or null (`false`). + +#### Processing Steps: + +1. **If null buffer exists:** + ``` + - Iterate through each element + - Set bit to true if value is valid + - Set bit to false if value is null + - For data digest: only hash valid values + - For null values: hash the NULL_BYTES constant (b"NULL") + ``` + +2. 
**If no null buffer (all values valid):** + ``` + - Extend bitvector with all true values (one per element) + - Hash all data normally + ``` + +### Finalization of Nullable Fields + +When finalizing a nullable field digest: + +```rust +final_digest.update(null_bits.len().to_le_bytes()); // Size of bitvector +for &word in null_bits.as_raw_slice() { + final_digest.update(word.to_be_bytes()); // Actual null bits +} +final_digest.update(data_digest.finalize()); // Data values +``` + +This ensures the null bit pattern is part of the final hash, making nullable arrays with actual nulls hash differently from arrays without nulls. + + +### Nullable Array with No Null Values + +As demonstrated in the `nullable_vs_non_nullable_array_produces_same_hash` test in `/tests/arrow_digester.rs`: + +When an Arrow array is created with a nullable type but contains no actual null values, Arrow optimizes the internal representation by removing the null buffer. This means the **hasher treats the array identically to a non-nullable array, producing the same hash result.** + + +## Supported Data Types + +### Fixed-Size Types + +These types have consistent byte widths and can be hashed directly: + +| Data Type | Size | Handling | +|-----------|------|----------| +| Boolean | Variable | Bit-packed into bytes | +| Int8, UInt8 | 1 byte | Direct buffer hashing | +| Int16, UInt16, Float16 | 2 bytes | Direct buffer hashing | +| Int32, UInt32, Float32, Date32 | 4 bytes | Direct buffer hashing | +| Int64, UInt64, Float64, Date64 | 8 bytes | Direct buffer hashing | +| Decimal32 | 4 bytes | Direct buffer hashing | +| Decimal64 | 8 bytes | Direct buffer hashing | +| Decimal128 | 16 bytes | Direct buffer hashing | +| Decimal256 | 32 bytes | Direct buffer hashing | +| Time32 | 4 bytes | Direct buffer hashing | +| Time64 | 8 bytes | Direct buffer hashing | + +**Hashing Strategy:** +- Get the data buffer from Arrow array +- Account for array offset +- For non-nullable: hash the entire slice directly +- For nullable: iterate element by element, skipping null values + +### Boolean Type + +Booleans receive special handling because Arrow stores them as bit-packed values (1 bit per value): + +```rust +// For non-nullable: +- Extract each boolean value +- Pack into BitVec using MSB0 ordering +- Hash the raw bytes + +// For nullable: +- Handle null bits (as described above) +- Pack only valid boolean values +- Hash the packed bytes +``` + +### Variable-Length Types + +#### Binary Arrays + +Binary data (raw byte sequences) must include length prefixes to prevent collisions: + +``` +For each element: + - Hash: value.len().to_le_bytes() // Length prefix + - Hash: value.as_slice() // Actual data +``` + +**Example collision prevention:** +- Without prefix: `[0x01, 0x02]` + `[0x03]` = `[0x01, 0x02, 0x03]` +- With prefix: `len=2, 0x01, 0x02, len=1, 0x03` (different!) 
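+
+The same rule can be reproduced outside the library with the `sha2` crate (SHA-256 is already the default digest here, so the dependency is assumed to be available). The sketch below is illustrative only: the exact prefix width is an implementation detail, so a `u64` little-endian length is assumed purely to make the boundary effect visible.
+
+```rust
+use sha2::{Digest, Sha256};
+
+// Concatenate the raw bytes with no element boundaries
+fn hash_without_prefix(values: &[Vec<u8>]) -> Vec<u8> {
+    let mut hasher = Sha256::new();
+    for v in values {
+        hasher.update(v);
+    }
+    hasher.finalize().to_vec()
+}
+
+// Prepend each element's length (little-endian) before its bytes
+fn hash_with_prefix(values: &[Vec<u8>]) -> Vec<u8> {
+    let mut hasher = Sha256::new();
+    for v in values {
+        hasher.update((v.len() as u64).to_le_bytes());
+        hasher.update(v);
+    }
+    hasher.finalize().to_vec()
+}
+
+fn main() {
+    let a = vec![b"ab".to_vec(), b"c".to_vec()];
+    let b = vec![b"a".to_vec(), b"bc".to_vec()];
+
+    // Both arrays flatten to the bytes "abc", so the unprefixed digests collide
+    assert_eq!(hash_without_prefix(&a), hash_without_prefix(&b));
+    // The length prefixes keep the element boundaries, so the digests differ
+    assert_ne!(hash_with_prefix(&a), hash_with_prefix(&b));
+}
+```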
+ +#### String Arrays + +Strings are similar to binary but UTF-8 encoded: + +``` +For each element: + - Hash: (value.len() as u32).to_le_bytes() // Length as u32 + - Hash: value.as_bytes() // UTF-8 data +``` + +#### List Arrays + +Lists/Array types recursively hash their nested values: + +``` +For each list element: + - Recursively call array_digest_update + - Use the inner field's data type + - Skip null list entries +``` + +## Schema Handling + +### Schema Flattening + +Nested struct fields are flattened into a single-level map using the `/` delimiter: + +``` +Original schema: + person (struct) + ├─ name (string) + └─ address (struct) + ├─ street (string) + └─ zip (int32) + +Flattened: + person/name + person/address/street + person/address/zip +``` + +### Schema Serialization + +The schema is serialized as a JSON string containing: +- Field names +- Field types (as DataType serialization) +- Nullability flags + +```rust +{ + "address/street": ("string", Utf8), + "address/zip": ("int32", Int32), + "name": ("string", Utf8) +} +``` + +Fields are stored in a `BTreeMap` to ensure **consistent alphabetical ordering**, which is critical for deterministic hashing. + +### Schema Hash Inclusion + +The schema digest is always the first component hashed into the final digest. This ensures that changes to schema structure produce different hashes, preventing false collisions. + +## Collision Prevention + +The hashing algorithm includes multiple safeguards against collisions: + +### 1. Length Prefixes (Variable-Length Types) + +Binary and string arrays include length prefixes to prevent merging boundaries: + +``` +Array1: ["ab", "c"] → len=2, "ab", len=1, "c" +Array2: ["a", "bc"] → len=1, "a", len=2, "bc" +Result: Different hashes! ✓ +``` + +### 2. Null Bit Vectors (Nullable Fields) + +Distinguishes between actual nulls and non-nullable fields: + +``` +NonNullable [1, 2, 3] → Only data hash +Nullable [1, 2, 3] → Null bits [true, true, true] + data hash +Result: Different hashes! ✓ +``` + +### 3. Schema Digests + +Encodes all metadata (type information, field names, nullability) into the hash: + +``` +Field "col1" Int32 (non-nullable) ≠ Field "col1" Int32 (nullable) +Result: Different hashes! ✓ +``` + +### 4. Recursive Data Type Hashing + +Complex types like lists recursively hash their components using the full schema information. + +## Data Type Conversion Details + +### Fixed-Size Array Processing + +When hashing fixed-size types, the algorithm: + +1. **Gets the data buffer** - Contains raw bytes for all elements +2. **Accounts for offset** - Arrow arrays can have offsets; these are applied +3. **Handles nullability:** + - **NonNullable**: Hash entire buffer slice directly + - **Nullable with nulls**: Iterate element-by-element, only hashing valid entries + - **Nullable without nulls**: Hash entire buffer slice (simpler path) + +**Example: Int32Array([1, 2, 3])** +``` +Size per element: 4 bytes +Buffer: [0x01, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00] +Hash entire 12 bytes +``` + +**Example: Int32Array([1, null, 3])** +``` +Size per element: 4 bytes +Buffer: [0x01, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00] +Null bits: [true, false, true] + +Process: + 1. Hash null bits [true, false, true] + 2. Hash bytes 0-3 (index 0, valid) + 3. Skip bytes 4-7 (index 1, null) + 4. Hash bytes 8-11 (index 2, valid) +``` + +## Determinism Guarantees + +The hashing algorithm ensures deterministic output because: + +1. 
**Schema fields are sorted** - BTreeMap maintains alphabetical order +2. **Field order is deterministic** - Always process in alphabetical field name order +3. **Data types are consistent** - Each type uses the same hashing strategy +4. **Byte order is consistent** - Uses little-endian for length prefixes and big-endian for bitvectors +5. **Null handling is predictable** - Same rules applied consistently + +**Implication:** The same data in different storage order or location will always produce the same hash. + +## Performance Considerations + +### Efficient Schema Hashing + +- Schema is hashed only once during initialization +- Uses JSON serialization (fast) rather than alternative formats +- Schema digest is reused for all record batches + +### Incremental Updates + +- Each record batch update accumulates into the same digest buffers +- No need to re-hash previous batches +- Final digest combines all incremental updates + +### Memory Efficiency + +- Null bits use bit-packing (1 bit per value, not 1 byte) +- Streaming approach avoids loading entire dataset into memory +- Field flattening enables hierarchical processing + +### Buffer Slicing + +- Fixed-size arrays hash the raw buffer directly when possible +- Avoids element-by-element iteration for non-nullable arrays +- Significant speedup for large datasets + +## Known Limitations + +The current implementation marks the following data types as `todo!()`: + +- `Null` - Null data type itself +- `Timestamp` - Timestamp variants +- `Duration` - Duration types +- `Interval` - Interval types +- `BinaryView` - Binary view type +- `Utf8View` - UTF-8 view type +- `ListView` - List view type +- `FixedSizeList` - Fixed-size lists +- `LargeListView` - Large list view type +- `Struct` - Struct types (partial support for nested fields) +- `Union` - Union types +- `Dictionary` - Dictionary-encoded types +- `Map` - Map types +- `RunEndEncoded` - Run-end encoded types + +These types will panic if encountered during hashing and should be implemented in future versions. +## SHA-256 Hashing Implementation + +### Overview + +ArrowDigester uses SHA-256 as its default cryptographic hash function, providing a 256-bit (32-byte) digest. The digest algorithm is configurable through the `Digest` trait, allowing alternative implementations, but SHA-256 is the standard choice for production use. + +### Versioning Header + +Every hash produced by ArrowDigester is prefixed with a 3-byte version identifier: + +``` +[Version Byte 0] [Version Byte 1] [Version Byte 2] [SHA-256 Digest (32 bytes)] +``` + +This 3-byte header ensures forward compatibility and enables detection of incompatible hash formats across different library versions. If the hashing algorithm or data format changes in future versions, the version bytes allow consumers to: +- Reject hashes from incompatible versions +- Implement migration or conversion logic +- Maintain a stable hash contract with external systems + +### SHA-256 Digest Process + +The hashing workflow follows this structure: + +``` +1. Initialize SHA-256 digester with version header + └─ Write 3 version bytes + +2. Hash schema component + └─ Update digester with schema JSON + +3. Hash field digests (alphabetical order) + ├─ For each field: + │ ├─ Hash null bits (if nullable) + │ └─ Hash data digest + └─ Accumulate into SHA-256 state + +4. 
Finalize + └─ Return 35-byte result: [3 version bytes] + [32-byte SHA-256 hash] +``` + +### Implementation Details + +- **Hash Algorithm**: SHA-256 (256-bit output) +- **Version Prefix**: 3 bytes (allows 16.7 million versions) +- **Total Output**: 35 bytes (3 version + 32 digest) +- **State Management**: SHA-256 maintains running state across multiple `update()` calls +- **Finalization**: Single call to `finalize()` produces immutable digest + +## Example Usage + +### Hashing a Single Array + +```rust +use arrow::array::Int32Array; +use starfix::ArrowDigester; + +let array = Int32Array::from(vec![Some(1), Some(2), Some(3)]); +let hash = ArrowDigester::hash_array(&array); +println!("Hash: {}", hex::encode(hash)); +``` + +### Hashing a Record Batch + +```rust +use arrow::record_batch::RecordBatch; +use starfix::ArrowDigester; + +let batch = RecordBatch::try_new(...)?; +let hash = ArrowDigester::hash_record_batch(&batch); +println!("Hash: {}", hex::encode(hash)); +``` + +### Streaming Multiple Batches + +```rust +use starfix::ArrowDigester; + +let mut digester = ArrowDigester::new(schema); +digester.update(&batch1); +digester.update(&batch2); +digester.update(&batch3); + +let final_hash = digester.finalize(); +println!("Combined hash: {}", hex::encode(final_hash)); +``` + +## Testing Strategy + +The codebase includes comprehensive tests covering: + +- **Data type coverage** - Tests for each supported data type +- **Nullable handling** - Arrays with and without null values +- **Collision prevention** - Length prefix verification +- **Determinism** - Same data produces same hash +- **Schema metadata** - Different schemas produce different hashes +- **Field ordering** - Different field orders produce same hash (commutative) + +## Implementation Notes + +### About the Delimiter + +The code uses `/` as the delimiter for nested field hierarchies. This was chosen to be URL-safe and visually clear while avoiding common naming conflicts. 
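+
+For reference, this is the kind of nested schema the flattening applies to. The sketch below only constructs the Arrow schema; the flattened keys in the comment follow the `parent/child` convention and alphabetical (BTreeMap) ordering described earlier, and are not produced by any public API shown here.
+
+```rust
+use arrow::datatypes::{DataType, Field, Fields, Schema};
+
+fn main() {
+    let address = Field::new(
+        "address",
+        DataType::Struct(Fields::from(vec![
+            Field::new("street", DataType::Utf8, true),
+            Field::new("zip", DataType::Int32, true),
+        ])),
+        true,
+    );
+    let person = Field::new(
+        "person",
+        DataType::Struct(Fields::from(vec![
+            Field::new("name", DataType::Utf8, false),
+            address,
+        ])),
+        false,
+    );
+    let schema = Schema::new(vec![person]);
+
+    // Internally the digester would track these leaves as:
+    //   person/address/street
+    //   person/address/zip
+    //   person/name
+    println!("{schema:#?}");
+}
+```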
+ +### About Byte Order + +- **Length prefixes**: Little-endian (`to_le_bytes()`) - standard for Arrow +- **Bitvector words**: Big-endian (`to_be_bytes()`) - matches bitvector convention +- **Size fields**: Little-endian - consistent with Arrow buffers + +### About Bitpacking + +Boolean values and null indicators use `BitVec` (Most Significant Bit ordering): +- Compresses 8 boolean values into 1 byte +- Reduces hash input size by 8x for boolean arrays +- Uses MSB0 for consistent bit ordering + +--- + +**For more information, see the main README.md and examine test cases in `tests/arrow_digester.rs`** From 4f4b57747f5776815bec85342f01e3412f81560c Mon Sep 17 00:00:00 2001 From: synicix Date: Thu, 11 Dec 2025 09:35:11 +0000 Subject: [PATCH 47/53] Add test to confirm update in batches and hashing all at once results in the same hash, and fix bug related to it --- src/arrow_digester_core.rs | 10 ++----- tests/arrow_digester.rs | 56 ++++++++++++++++++++++++++++++++++++-- 2 files changed, 57 insertions(+), 9 deletions(-) diff --git a/src/arrow_digester_core.rs b/src/arrow_digester_core.rs index 4a9c787..9edf356 100644 --- a/src/arrow_digester_core.rs +++ b/src/arrow_digester_core.rs @@ -473,8 +473,7 @@ impl ArrowDigesterCore { } } else { // All valid, therefore we can extend the bit vector with all true values - let len = array.len().checked_sub(1).expect("Array length underflow"); - null_bit_vec.extend(repeat_n(true, len)); + null_bit_vec.extend(repeat_n(true, array.len())); // Deal with the data for i in 0..array.len() { @@ -620,10 +619,7 @@ impl ArrowDigesterCore { } None => { // All valid, therefore we can extend the bit vector with all true values - null_bit_vec.extend(repeat_n( - true, - array.len().checked_sub(1).expect("Array length underflow"), - )); + null_bit_vec.extend(repeat_n(true, array.len())); } } } @@ -919,7 +915,7 @@ mod tests { // Check the digest assert_eq!( encode(digester.finalize()), - "e32c1f0981ee262622e0e91a5ea99210a085b62d6025d70eb1ee074096a151dd" + "36ffc4d4c072ac0d2470dfa12a9dab10eaecd932a25872aca8de173bf51baa15" ); } } diff --git a/tests/arrow_digester.rs b/tests/arrow_digester.rs index 4ef9626..8421824 100644 --- a/tests/arrow_digester.rs +++ b/tests/arrow_digester.rs @@ -128,7 +128,7 @@ mod tests { // Hash the record batch assert_eq!( encode(ArrowDigester::hash_record_batch(&batch)), - "0000019972058c784f11f63a1d49998a79c00616b0f0a34b9774bbc7e2a3247df709ca" + "000001ac720bed7fb1d696d5626705dc7602d14cfe974a3297cc28c3cb8b8e9a62601a" ); } @@ -288,7 +288,7 @@ mod tests { let hash = hex::encode(ArrowDigester::hash_array(&list_array)); assert_eq!( hash, - "0000011a8d06635dec40079b979ce439f662c1fb6456bb7e02bbf7d8e8048c61498faf" + "000001f654be5f0ef89807feba9483072190b7d26964e535cd7c522706218df9c3c015" ); } @@ -458,4 +458,56 @@ mod tests { "Nullable and non-nullable schemas with same data types should produce different hashes" ); } + + #[test] + fn batches_vs_single_hash_produces_same_result() { + let schema = Arc::new(Schema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new("value", DataType::Float64, true), + ])); + + // Create two batches with data + let batch1 = RecordBatch::try_new( + Arc::clone(&schema), + vec![ + Arc::new(Int32Array::from(vec![1, 2, 3])), + Arc::new(Float64Array::from(vec![1.1, 2.2, 3.3])), + ], + ) + .unwrap(); + + let batch2 = RecordBatch::try_new( + Arc::clone(&schema), + vec![ + Arc::new(Int32Array::from(vec![4, 5, 6])), + Arc::new(Float64Array::from(vec![4.4, 5.5, 6.6])), + ], + ) + .unwrap(); + + // Hash batches 
incrementally + let mut digester_batches = ArrowDigester::new((*schema).clone()); + digester_batches.update(&batch1); + digester_batches.update(&batch2); + let hash_batches = encode(digester_batches.finalize()); + + // Hash combined batch all at once + let combined_batch = RecordBatch::try_new( + Arc::clone(&schema), + vec![ + Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5, 6])), + Arc::new(Float64Array::from(vec![1.1, 2.2, 3.3, 4.4, 5.5, 6.6])), + ], + ) + .unwrap(); + + let mut digester_single = ArrowDigester::new((*schema).clone()); + digester_single.update(&combined_batch); + let hash_single = encode(digester_single.finalize()); + + assert_eq!( + hash_batches, hash_single, + "Hashing multiple batches incrementally should produce the same result as hashing one combined batch" + ); + } } From 23fc9829922613b4699ae391d99a94e93e71cba3 Mon Sep 17 00:00:00 2001 From: synicix Date: Fri, 12 Dec 2025 03:30:23 +0000 Subject: [PATCH 48/53] Add test to check for consistent hashing when one batch is null but the next is not. --- tests/arrow_digester.rs | 59 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) diff --git a/tests/arrow_digester.rs b/tests/arrow_digester.rs index 8421824..3623220 100644 --- a/tests/arrow_digester.rs +++ b/tests/arrow_digester.rs @@ -510,4 +510,63 @@ mod tests { "Hashing multiple batches incrementally should produce the same result as hashing one combined batch" ); } + + #[test] + fn batches_with_nulls_vs_single_hash_produces_same_result() { + let schema = Arc::new(Schema::new(vec![ + Field::new("id", DataType::Int32, true), + Field::new("value", DataType::Float64, true), + ])); + + // Create two batches: first all nulls, second with values + let batch1 = RecordBatch::try_new( + Arc::clone(&schema), + vec![ + Arc::new(Int32Array::from(vec![None, None, None])), + Arc::new(Float64Array::from(vec![None, None, None])), + ], + ) + .unwrap(); + + let batch2 = RecordBatch::try_new( + Arc::clone(&schema), + vec![ + Arc::new(Int32Array::from(vec![Some(1), Some(2), Some(3)])), + Arc::new(Float64Array::from(vec![Some(1.1), Some(2.2), Some(3.3)])), + ], + ) + .unwrap(); + + // Hash batches incrementally + let mut digester_batches = ArrowDigester::new((*schema).clone()); + digester_batches.update(&batch1); + digester_batches.update(&batch2); + let hash_batches = encode(digester_batches.finalize()); + + // Hash combined batch all at once + let combined_batch = RecordBatch::try_new( + Arc::clone(&schema), + vec![ + Arc::new(Int32Array::from(vec![None, None, None, Some(1), Some(2), Some(3)])), + Arc::new(Float64Array::from(vec![ + None, + None, + None, + Some(1.1), + Some(2.2), + Some(3.3), + ])), + ], + ) + .unwrap(); + + let mut digester_single = ArrowDigester::new((*schema).clone()); + digester_single.update(&combined_batch); + let hash_single = encode(digester_single.finalize()); + + assert_eq!( + hash_batches, hash_single, + "Hashing batches where first is all nulls should produce same result as combined batch" + ); + } } From bfa2b17222cface29ab41eb723ae40b4a4c5da49 Mon Sep 17 00:00:00 2001 From: synicix Date: Tue, 6 Jan 2026 20:23:15 +0000 Subject: [PATCH 49/53] feat: Remove some python interp settings --- .vscode/settings.json | 5 ----- 1 file changed, 5 deletions(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index ae1d1ab..56f3d71 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -20,12 +20,7 @@ "--config", "max_width=100" ], - "jupyter.kernels.excludePythonEnvironments": [ - "/bin/python3", - "/usr/bin/python3" - ], 
"notebook.formatOnSave.enabled": true, "notebook.output.scrolling": true, - "python.defaultInterpreterPath": "~/.local/share/base/bin/python3", "python.terminal.activateEnvironment": false } From 8b9db232441cba4189922e9da77e4e158bfddcd6 Mon Sep 17 00:00:00 2001 From: synicix Date: Tue, 6 Jan 2026 20:23:42 +0000 Subject: [PATCH 50/53] feat: update some stale comments --- src/lib.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/lib.rs b/src/lib.rs index 1c99028..4c91e74 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -8,7 +8,7 @@ use sha2::Sha256; use crate::arrow_digester_core::ArrowDigesterCore; -const VERSION_BYTES: [u8; 3] = [0_u8, 0_u8, 1_u8]; // Version 1.0 +const VERSION_BYTES: [u8; 3] = [0_u8, 0_u8, 1_u8]; // Version 0.0.1 /// Maps `arrow_digester_core` function to a `sha_256` digester + versioning pub struct ArrowDigester { digester: ArrowDigesterCore, @@ -51,3 +51,5 @@ impl ArrowDigester { pub(crate) mod arrow_digester_core; pub mod pyarrow; + +// Write a test to check that int32 digest is consistent From 70effd54f921a921a8227538ac16b8a9f396d112 Mon Sep 17 00:00:00 2001 From: synicix Date: Wed, 7 Jan 2026 00:31:03 +0000 Subject: [PATCH 51/53] feat: Expose new functions to python side --- ENDIANNESS.md | 352 +++++++++++++++++++++++++++ notebooks/Example Python Usage.ipynb | 42 +--- python/starfix/__init__.py | 49 +++- src/arrow_digester_core.rs | 2 + src/lib.rs | 7 + src/pyarrow.rs | 98 +++++++- 6 files changed, 516 insertions(+), 34 deletions(-) create mode 100644 ENDIANNESS.md diff --git a/ENDIANNESS.md b/ENDIANNESS.md new file mode 100644 index 0000000..5c6b6be --- /dev/null +++ b/ENDIANNESS.md @@ -0,0 +1,352 @@ +# Endianness in Arrow and StarFix + +## Overview + +Endianness refers to the byte order in which multi-byte values are stored in memory. This document explains how Arrow and StarFix handle endianness and why it matters for data hashing. 
+ +## Endianness Basics + +### Little-Endian vs Big-Endian + +**Little-Endian** (LE): +- Least significant byte first +- Example: `0x12345678` stored as `[0x78, 0x56, 0x34, 0x12]` +- Used by: x86, x64, ARM (most common modern systems) + +**Big-Endian** (BE): +- Most significant byte first +- Example: `0x12345678` stored as `[0x12, 0x34, 0x56, 0x78]` +- Used by: PowerPC, SPARC, network protocols (legacy systems) + +### Rust Byte Order Methods + +```rust +// Native byte order (system-dependent) +value.to_ne_bytes() // native-endian +i32::from_ne_bytes() // native-endian + +// Explicit little-endian +value.to_le_bytes() // always little-endian +i32::from_le_bytes() // always little-endian + +// Explicit big-endian +value.to_be_bytes() // always big-endian +i32::from_be_bytes() // always big-endian +``` + +### Compile-time Endianness Detection + +```rust +#[cfg(target_endian = "little")] +const IS_LITTLE_ENDIAN: bool = true; + +#[cfg(target_endian = "big")] +const IS_LITTLE_ENDIAN: bool = false; + +// Or use at runtime: +#[inline] +fn is_little_endian() -> bool { + u32::from_ne_bytes([1, 0, 0, 0]) == 1 +} +``` + +## Arrow's Approach to Endianness + +### Arrow's Design Philosophy + +**Arrow stores all data in the system's native byte order.** + +- On little-endian systems (x86, ARM, modern CPUs): data is little-endian +- On big-endian systems (legacy): data is big-endian +- No conversion needed for local operations +- Minimal performance overhead + +### Arrow Buffers + +Arrow stores data in columnar buffers with the following structure: + +``` +Arrow Array (e.g., Int32Array with [1, 2, 3]) +├─ Metadata +│ ├─ Data type +│ ├─ Length (3) +│ ├─ Null count +│ └─ Byte order (from Arrow metadata) +├─ Data Buffer +│ └─ Raw bytes in NATIVE order +│ [01 00 00 00 | 02 00 00 00 | 03 00 00 00] (on little-endian) +└─ Null Buffer (optional) + └─ Bitmap [1, 1, 1] (all valid) +``` + +### Inter-Process Communication (IPC) + +Arrow's IPC format (used for serialization) includes endianness metadata: + +``` +Arrow IPC Message +├─ Version +├─ Body size +├─ Endianness flag ← Specifies byte order for interpretation +└─ Actual data (may need conversion on deserialization) +``` + +**If you receive an Arrow message from a big-endian system and your system is little-endian, Arrow handles the conversion automatically.** + +## StarFix's Use of Byte Order + +### Current Implementation + +StarFix uses **mixed byte orders intentionally** for different purposes: + +#### 1. Little-Endian for Length Prefixes + +```rust +// In hash_binary_array and hash_string_array +data_digest.update(value.len().to_le_bytes()); // Little-endian +data_digest.update(value); +``` + +**Why little-endian?** +- Arrow uses little-endian natively on most systems +- Consistent with Arrow's buffer layout +- Deterministic across platforms when considering IPC + +**Example:** +``` +String "hello" (length 5): +Hash input: [05 00 00 00] + "hello" + ↑↑↑↑ + little-endian length +``` + +#### 2. Big-Endian for Null Bits + +```rust +// In finalize_digest +for &word in null_bit_digest.as_raw_slice() { + final_digest.update(word.to_be_bytes()); // Big-endian! +} +``` + +**Why big-endian?** +- BitVec convention for consistency +- Ensures null bits are interpreted canonically +- Makes null bit patterns deterministic regardless of system endianness + +**Example:** +``` +Null bits: [true, true, false, true, ...] 
(packed into bytes) +Raw word: 0x0D (1101 in binary) +Hashed as: [0x0D] in big-endian representation +``` + +### Determinism Guarantee + +The mixed approach ensures **deterministic hashing**: + +1. **Data buffers**: Hashed in native byte order (as Arrow stores them) +2. **Length prefixes**: Converted to little-endian (Arrow standard) +3. **Null bits**: Converted to big-endian (BitVec standard) +4. **Result**: Same data always produces the same hash, regardless of which system ran the code + +## Platform Considerations + +### x86/x64 and ARM (Little-Endian) + +``` +System Endianness: Little-Endian (native) +↓ +Arrow Buffers: Little-Endian (native, no conversion) +↓ +StarFix Hashing: + - Data: Little-Endian (as-is from Arrow) + - Lengths: Little-Endian (explicit) + - Null bits: Big-Endian (converted) +↓ +Hash produced deterministically +``` + +### PowerPC/SPARC (Big-Endian) - Hypothetical + +``` +System Endianness: Big-Endian (native) +↓ +Arrow Buffers: Big-Endian (native, no conversion) +↓ +StarFix Hashing: + - Data: Big-Endian (as-is from Arrow) + - Lengths: Little-Endian (converted from native) + - Null bits: Big-Endian (as-is, no conversion needed) +↓ +Hash produced deterministically +``` + +**Important:** Even though intermediate representations differ, the final hash should be identical because both use the same explicit byte order for lengths and null bits. + +## Cross-Platform Hashing + +### Challenge + +Two systems with different native endianness processing the same data could produce different hashes if not handled carefully. + +### StarFix's Solution + +1. **Data buffers**: Use Arrow's native representation (system-dependent but consistent) +2. **Explicit conversions**: All metadata uses explicit byte orders +3. **Schema metadata**: Hashed separately, includes nullability info +4. **Result**: Deterministic hashing within a system; comparable across systems + +### Recommendation for Cross-Platform Use + +If you need hashes to match across little-endian and big-endian systems: + +```rust +// Current: May differ between systems +let hash = ArrowDigester::hash_array(&array); + +// Better: Use record batch with explicit schema +let batch = RecordBatch::try_new(schema, arrays)?; +let hash = ArrowDigester::hash_record_batch(&batch); // Schema-aware hashing +``` + +The schema digest is computed from serialized field information, which includes nullability flags and can be made platform-agnostic. 
+ +## Code Examples + +### Checking System Endianness + +```rust +// Compile-time check (preferred) +#[cfg(target_endian = "little")] +fn byte_order_name() -> &'static str { + "little-endian" +} + +#[cfg(target_endian = "big")] +fn byte_order_name() -> &'static str { + "big-endian" +} + +// Runtime check +fn is_little_endian() -> bool { + u32::from_ne_bytes([1, 0, 0, 0]) == 1 +} + +// More explicit +fn is_little_endian_v2() -> bool { + (1u16).to_le_bytes()[0] == 1 +} +``` + +### Getting Arrow's Byte Order + +```rust +use arrow::array::Array; + +fn check_arrow_native_order(array: &dyn Array) { + // Arrow stores in native byte order + // No explicit API to check - it's always native + + #[cfg(target_endian = "little")] + println!("Arrow on this system: little-endian buffers"); + + #[cfg(target_endian = "big")] + println!("Arrow on this system: big-endian buffers"); +} +``` + +### Safe Cross-Platform Hashing + +```rust +use arrow::record_batch::RecordBatch; +use starfix::ArrowDigester; +use std::sync::Arc; + +fn hash_with_platform_info(batch: &RecordBatch) -> (Vec, &'static str) { + let hash = ArrowDigester::hash_record_batch(batch); + + #[cfg(target_endian = "little")] + return (hash, "little-endian"); + + #[cfg(target_endian = "big")] + return (hash, "big-endian"); +} +``` + +## Testing Considerations + +When testing StarFix hashing: + +1. **Same-system tests**: Will pass regardless of implementation details +2. **Cross-platform tests**: Require explicit endianness handling +3. **Integration tests**: Should verify determinism on target platform + +```rust +#[test] +fn deterministic_hashing() { + // Same data → same hash (guaranteed) + let array1 = Int32Array::from(vec![1, 2, 3]); + let array2 = Int32Array::from(vec![1, 2, 3]); + + assert_eq!( + ArrowDigester::hash_array(&array1), + ArrowDigester::hash_array(&array2) + ); +} + +#[test] +fn endianness_consistency() { + // Different byte orders of same value should hash differently + let value_a = 0x12345678u32; + let array_a = UInt32Array::from(vec![value_a]); + + let value_b = 0x78563412u32; // Byte-reversed + let array_b = UInt32Array::from(vec![value_b]); + + // These should hash differently (different semantic values) + assert_ne!( + ArrowDigester::hash_array(&array_a), + ArrowDigester::hash_array(&array_b) + ); +} +``` + +## Current Known Limitations + +The current StarFix implementation: + +✓ Ensures deterministic hashing on the same platform +✓ Uses Arrow's native byte order for efficiency +⚠️ May produce different hashes on different platforms for the same logical data +⚠️ No explicit API to query or control endianness + +## Future Improvements + +Potential enhancements for cross-platform hashing: + +1. **Normalize byte order**: Convert all data to a canonical byte order before hashing +2. **Endianness parameter**: Allow users to specify target byte order +3. **Platform-agnostic mode**: Flag for cross-platform hash compatibility +4. 
**Schema versioning**: Include endianness info in hashed schema + +Example future API: + +```rust +pub enum HashEndianness { + Native, // Use system native (current behavior) + Little, // Always little-endian + Big, // Always big-endian +} + +pub fn hash_array_with_endianness( + array: &dyn Array, + endianness: HashEndianness, +) -> Vec { + // Implementation +} +``` + +--- + +**For more information about Arrow's byte order handling, see the [Apache Arrow documentation](https://arrow.apache.org/docs/format/Columnar.html).** diff --git a/notebooks/Example Python Usage.ipynb b/notebooks/Example Python Usage.ipynb index d08c3ce..19d7794 100644 --- a/notebooks/Example Python Usage.ipynb +++ b/notebooks/Example Python Usage.ipynb @@ -10,41 +10,15 @@ "name": "stdout", "output_type": "stream", "text": [ - "Received table with 5 rows and 3 columns\n", - "Table SHA-256 hash: e474f034a7d25abfac4941a3239a3d7c56405c84edb866e474056cbe033a9476\n" + "000001db154d744ff41a27ec6af4e205842cdf5356be83d39ac0b57e0a7d138774e5ab\n", + "000001aedb11d4fb4cabb4d4028e69cf912a0c392227c2a06ac2a2b4bd92cf122f9208\n" ] - }, - { - "data": { - "text/plain": [ - "'e474f034a7d25abfac4941a3239a3d7c56405c84edb866e474056cbe033a9476'" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" } ], "source": [ "import pyarrow as pa\n", "import ctypes\n", - "import arrow_hasher as ah\n", - "\n", - "\n", - "def hash_arrow_table(table: pa.Table):\n", - " # Covert table to record batch first (so we can extract the pointers), since the default behavior is 1 batch, we can just get the first element\n", - " # After that we can extract the PyCapsules\n", - " schema_capsule, array_capsule = table.to_batches()[0].__arrow_c_array__()\n", - "\n", - " # Extract raw pointers from capsules due to uniffi limitations\n", - " PyCapsule_GetPointer = ctypes.pythonapi.PyCapsule_GetPointer\n", - " PyCapsule_GetPointer.argtypes = [ctypes.py_object, ctypes.c_char_p]\n", - " PyCapsule_GetPointer.restype = ctypes.c_void_p\n", - "\n", - " return ah.process_arrow_table(\n", - " PyCapsule_GetPointer(array_capsule, b\"arrow_array\"),\n", - " PyCapsule_GetPointer(schema_capsule, b\"arrow_schema\"),\n", - " )\n", + "import starfix as sf\n", "\n", "\n", "# Create a simple Arrow table\n", @@ -55,13 +29,19 @@ "}\n", "table = pa.table(data)\n", "\n", - "hash_arrow_table(table)" + "# Hash the entire Arrow table\n", + "(table)\n", + "print(sf.hash_record_batch(table).hex())\n", + "\n", + "# Hash the schema of the Arrow table\n", + "table.schema\n", + "print(sf.hash_schema(table.schema).hex())" ] }, { "cell_type": "code", "execution_count": null, - "id": "9352fee1", + "id": "0c10e8c3", "metadata": {}, "outputs": [], "source": [] diff --git a/python/starfix/__init__.py b/python/starfix/__init__.py index dca6f2d..b638059 100644 --- a/python/starfix/__init__.py +++ b/python/starfix/__init__.py @@ -6,7 +6,7 @@ import pyarrow as pa -def hash_arrow_table(table: "pa.Table") -> bytes: +def hash_record_batch(table: "pa.Table") -> bytes: # Covert table to record batch first (so we can extract the pointers), since the default behavior is 1 batch, we can just get the first element # After that we can extract the PyCapsules schema_capsule, array_capsule = table.to_batches()[0].__arrow_c_array__() @@ -16,7 +16,52 @@ def hash_arrow_table(table: "pa.Table") -> bytes: PyCapsule_GetPointer.argtypes = [ctypes.py_object, ctypes.c_char_p] PyCapsule_GetPointer.restype = ctypes.c_void_p - return sfr.process_arrow_table( + return sfr.hash_record_batch( 
PyCapsule_GetPointer(array_capsule, b"arrow_array"), PyCapsule_GetPointer(schema_capsule, b"arrow_schema"), ) + + +def hash_schema(schema: "pa.Schema") -> bytes: + schema_capsule = schema.__arrow_c_schema__() + + # Extract raw pointers from capsules due to uniffi limitations + PyCapsule_GetPointer = ctypes.pythonapi.PyCapsule_GetPointer + PyCapsule_GetPointer.argtypes = [ctypes.py_object, ctypes.c_char_p] + PyCapsule_GetPointer.restype = ctypes.c_void_p + + return sfr.hash_schema( + PyCapsule_GetPointer(schema_capsule, b"arrow_schema"), + ) + + +class PyArrowDigester: + def __init__(self, schema: "pa.Schema") -> None: + + schema_capsule = schema.__arrow_c_schema__() + + PyCapsule_GetPointer = ctypes.pythonapi.PyCapsule_GetPointer + PyCapsule_GetPointer.argtypes = [ctypes.py_object, ctypes.c_char_p] + PyCapsule_GetPointer.restype = ctypes.c_void_p + + schema_ptr = PyCapsule_GetPointer(schema_capsule, b"arrow_schema") + + self._internal = sfr.InternalPyArrowDigester(schema_ptr) + + def update(self, table: "pa.Table") -> None: + # Covert table to record batch first (so we can extract the pointers), since the default behavior is 1 batch, we can just get the first element + # After that we can extract the PyCapsules + schema_capsule, array_capsule = table.to_batches()[0].__arrow_c_array__() + + # Extract raw pointers from capsules due to uniffi limitations + PyCapsule_GetPointer = ctypes.pythonapi.PyCapsule_GetPointer + PyCapsule_GetPointer.argtypes = [ctypes.py_object, ctypes.c_char_p] + PyCapsule_GetPointer.restype = ctypes.c_void_p + + self._internal.update( + PyCapsule_GetPointer(array_capsule, b"arrow_array"), + PyCapsule_GetPointer(schema_capsule, b"arrow_schema"), + ) + + def finalize(self) -> bytes: + return self._internal.finalize() diff --git a/src/arrow_digester_core.rs b/src/arrow_digester_core.rs index 9edf356..556ee3d 100644 --- a/src/arrow_digester_core.rs +++ b/src/arrow_digester_core.rs @@ -21,11 +21,13 @@ const NULL_BYTES: &[u8] = b"NULL"; const DELIMITER_FOR_NESTED_FIELD: &str = "/"; +#[derive(Clone)] enum DigestBufferType { NonNullable(D), Nullable(BitVec, D), // Where first digest is for the bull bits, while the second is for the actual data } +#[derive(Clone)] pub struct ArrowDigesterCore { schema: Schema, schema_digest: Vec, diff --git a/src/lib.rs b/src/lib.rs index 4c91e74..b041d3b 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -9,7 +9,9 @@ use sha2::Sha256; use crate::arrow_digester_core::ArrowDigesterCore; const VERSION_BYTES: [u8; 3] = [0_u8, 0_u8, 1_u8]; // Version 0.0.1 + /// Maps `arrow_digester_core` function to a `sha_256` digester + versioning +#[derive(Clone)] pub struct ArrowDigester { digester: ArrowDigesterCore, } @@ -42,6 +44,11 @@ impl ArrowDigester { Self::prepend_version_bytes(ArrowDigesterCore::::hash_record_batch(record_batch)) } + /// Function to hash schema only + pub fn hash_schema(schema: &Schema) -> Vec { + Self::prepend_version_bytes(ArrowDigesterCore::::hash_schema(schema)) + } + fn prepend_version_bytes(digest: Vec) -> Vec { let mut complete_hash = VERSION_BYTES.clone().to_vec(); complete_hash.extend(digest); diff --git a/src/pyarrow.rs b/src/pyarrow.rs index d193bef..03277ba 100644 --- a/src/pyarrow.rs +++ b/src/pyarrow.rs @@ -1,6 +1,14 @@ +#![expect( + unsafe_code, + clippy::expect_used, + reason = "Converting raw pointers to Arrow structures" +)] +use std::sync::{Arc, Mutex}; + use crate::ArrowDigester; use arrow::array::{RecordBatch, StructArray}; use arrow::ffi::{from_ffi, FFI_ArrowArray, FFI_ArrowSchema}; +use arrow_schema::Schema; /// 
Process an Arrow table via C Data Interface /// @@ -8,7 +16,7 @@ use arrow::ffi::{from_ffi, FFI_ArrowArray, FFI_ArrowSchema}; /// The pointers must be valid Arrow C Data Interface structs from Python's pyarrow #[uniffi::export] -pub fn process_arrow_table(array_ptr: u64, schema_ptr: u64) -> Vec { +pub fn hash_record_batch(array_ptr: u64, schema_ptr: u64) -> Vec { #[expect( unsafe_code, reason = "Need to convert raw pointers to Arrow data structures" @@ -30,3 +38,91 @@ pub fn process_arrow_table(array_ptr: u64, schema_ptr: u64) -> Vec { // Hash the table ArrowDigester::hash_record_batch(&RecordBatch::from(StructArray::from(array_data))) } + +/// Process an Arrow schema via C Data Interface +/// +/// # Panics +/// The pointer must be a valid Arrow schema from Python's pyarrow +#[uniffi::export] +pub fn hash_schema(schema_ptr: u64) -> Vec { + #[expect( + unsafe_code, + reason = "Need to convert raw pointers to Arrow data structures" + )] + // SAFETY: + // Need to conduct unsafe operations to convert raw pointers to Arrow data structures + let schema = unsafe { + let ffi_schema = FFI_ArrowSchema::from_raw(schema_ptr as *mut FFI_ArrowSchema); + Schema::try_from(&ffi_schema).expect("Failed to convert FFI schema to Arrow schema") + }; + + // Hash the schema + ArrowDigester::hash_schema(&schema) +} + +#[derive(uniffi::Object)] +pub struct InternalPyArrowDigester { + digester: Arc>, +} + +#[uniffi::export] +impl InternalPyArrowDigester { + /// Create a new instance of `PyArrowDigester` with SHA256 as the digester with the schema which will be enforce through each update + /// + /// # Panics + /// The pointer must be a valid Arrow schema from Python's pyarrow, if failed to convert, it will panic + + #[uniffi::constructor] + pub fn new(schema_ptr: u64) -> Self { + // SAFETY: + // Need to conduct unsafe operations to convert raw pointers to Arrow data structures + let schema = unsafe { + let ffi_schema = FFI_ArrowSchema::from_raw(schema_ptr as *mut FFI_ArrowSchema); + Schema::try_from(&ffi_schema).expect("Failed to convert FFI schema to Arrow schema") + }; + Self { + digester: Arc::new(Mutex::new(ArrowDigester::new(schema))), + } + } + + /// Update the digester with a new `RecordBatch` + /// + /// # Panics + /// The pointers must be valid Arrow C Data Interface structs from Python's pyarrow + pub fn update(&self, array_ptr: u64, schema_ptr: u64) { + #[expect( + unsafe_code, + reason = "Need to convert raw pointers to Arrow data structures" + )] + #[expect( + clippy::multiple_unsafe_ops_per_block, + clippy::expect_used, + reason = "Okay since we are doing the same operation of dereferencing pointers, Will add proper errors later" + )] + // SAFETY: + // Need to conduct unsafe operations to convert raw pointers to Arrow data structures + let array_data = unsafe { + // Construct ArrayData from FFI structures + let ffi_array = FFI_ArrowArray::from_raw(array_ptr as *mut FFI_ArrowArray); + let ffi_schema = FFI_ArrowSchema::from_raw(schema_ptr as *mut FFI_ArrowSchema); + from_ffi(ffi_array, &ffi_schema).expect("Failed to import Arrow array data") + }; + + self.digester + .lock() + .expect("Failed to acquire lock on digester") + .update(&RecordBatch::from(StructArray::from(array_data))); + } + + /// Consume the digester and finalize the hash computation + /// + /// # Panics + /// If failed to acquire lock on digester + pub fn finalize(&self) -> Vec { + self.digester + .lock() + .expect("Failed to acquire lock on digester") + .clone() + .finalize() + } +} From 3591940a51c109f63036a49f59cef158a5cf0772 Mon Sep 17 
00:00:00 2001 From: synicix Date: Wed, 7 Jan 2026 00:31:39 +0000 Subject: [PATCH 52/53] feat: remove eadianness file --- ENDIANNESS.md | 352 -------------------------------------------------- 1 file changed, 352 deletions(-) delete mode 100644 ENDIANNESS.md diff --git a/ENDIANNESS.md b/ENDIANNESS.md deleted file mode 100644 index 5c6b6be..0000000 --- a/ENDIANNESS.md +++ /dev/null @@ -1,352 +0,0 @@ -# Endianness in Arrow and StarFix - -## Overview - -Endianness refers to the byte order in which multi-byte values are stored in memory. This document explains how Arrow and StarFix handle endianness and why it matters for data hashing. - -## Endianness Basics - -### Little-Endian vs Big-Endian - -**Little-Endian** (LE): -- Least significant byte first -- Example: `0x12345678` stored as `[0x78, 0x56, 0x34, 0x12]` -- Used by: x86, x64, ARM (most common modern systems) - -**Big-Endian** (BE): -- Most significant byte first -- Example: `0x12345678` stored as `[0x12, 0x34, 0x56, 0x78]` -- Used by: PowerPC, SPARC, network protocols (legacy systems) - -### Rust Byte Order Methods - -```rust -// Native byte order (system-dependent) -value.to_ne_bytes() // native-endian -i32::from_ne_bytes() // native-endian - -// Explicit little-endian -value.to_le_bytes() // always little-endian -i32::from_le_bytes() // always little-endian - -// Explicit big-endian -value.to_be_bytes() // always big-endian -i32::from_be_bytes() // always big-endian -``` - -### Compile-time Endianness Detection - -```rust -#[cfg(target_endian = "little")] -const IS_LITTLE_ENDIAN: bool = true; - -#[cfg(target_endian = "big")] -const IS_LITTLE_ENDIAN: bool = false; - -// Or use at runtime: -#[inline] -fn is_little_endian() -> bool { - u32::from_ne_bytes([1, 0, 0, 0]) == 1 -} -``` - -## Arrow's Approach to Endianness - -### Arrow's Design Philosophy - -**Arrow stores all data in the system's native byte order.** - -- On little-endian systems (x86, ARM, modern CPUs): data is little-endian -- On big-endian systems (legacy): data is big-endian -- No conversion needed for local operations -- Minimal performance overhead - -### Arrow Buffers - -Arrow stores data in columnar buffers with the following structure: - -``` -Arrow Array (e.g., Int32Array with [1, 2, 3]) -├─ Metadata -│ ├─ Data type -│ ├─ Length (3) -│ ├─ Null count -│ └─ Byte order (from Arrow metadata) -├─ Data Buffer -│ └─ Raw bytes in NATIVE order -│ [01 00 00 00 | 02 00 00 00 | 03 00 00 00] (on little-endian) -└─ Null Buffer (optional) - └─ Bitmap [1, 1, 1] (all valid) -``` - -### Inter-Process Communication (IPC) - -Arrow's IPC format (used for serialization) includes endianness metadata: - -``` -Arrow IPC Message -├─ Version -├─ Body size -├─ Endianness flag ← Specifies byte order for interpretation -└─ Actual data (may need conversion on deserialization) -``` - -**If you receive an Arrow message from a big-endian system and your system is little-endian, Arrow handles the conversion automatically.** - -## StarFix's Use of Byte Order - -### Current Implementation - -StarFix uses **mixed byte orders intentionally** for different purposes: - -#### 1. 
Little-Endian for Length Prefixes - -```rust -// In hash_binary_array and hash_string_array -data_digest.update(value.len().to_le_bytes()); // Little-endian -data_digest.update(value); -``` - -**Why little-endian?** -- Arrow uses little-endian natively on most systems -- Consistent with Arrow's buffer layout -- Deterministic across platforms when considering IPC - -**Example:** -``` -String "hello" (length 5): -Hash input: [05 00 00 00] + "hello" - ↑↑↑↑ - little-endian length -``` - -#### 2. Big-Endian for Null Bits - -```rust -// In finalize_digest -for &word in null_bit_digest.as_raw_slice() { - final_digest.update(word.to_be_bytes()); // Big-endian! -} -``` - -**Why big-endian?** -- BitVec convention for consistency -- Ensures null bits are interpreted canonically -- Makes null bit patterns deterministic regardless of system endianness - -**Example:** -``` -Null bits: [true, true, false, true, ...] (packed into bytes) -Raw word: 0x0D (1101 in binary) -Hashed as: [0x0D] in big-endian representation -``` - -### Determinism Guarantee - -The mixed approach ensures **deterministic hashing**: - -1. **Data buffers**: Hashed in native byte order (as Arrow stores them) -2. **Length prefixes**: Converted to little-endian (Arrow standard) -3. **Null bits**: Converted to big-endian (BitVec standard) -4. **Result**: Same data always produces the same hash, regardless of which system ran the code - -## Platform Considerations - -### x86/x64 and ARM (Little-Endian) - -``` -System Endianness: Little-Endian (native) -↓ -Arrow Buffers: Little-Endian (native, no conversion) -↓ -StarFix Hashing: - - Data: Little-Endian (as-is from Arrow) - - Lengths: Little-Endian (explicit) - - Null bits: Big-Endian (converted) -↓ -Hash produced deterministically -``` - -### PowerPC/SPARC (Big-Endian) - Hypothetical - -``` -System Endianness: Big-Endian (native) -↓ -Arrow Buffers: Big-Endian (native, no conversion) -↓ -StarFix Hashing: - - Data: Big-Endian (as-is from Arrow) - - Lengths: Little-Endian (converted from native) - - Null bits: Big-Endian (as-is, no conversion needed) -↓ -Hash produced deterministically -``` - -**Important:** Even though intermediate representations differ, the final hash should be identical because both use the same explicit byte order for lengths and null bits. - -## Cross-Platform Hashing - -### Challenge - -Two systems with different native endianness processing the same data could produce different hashes if not handled carefully. - -### StarFix's Solution - -1. **Data buffers**: Use Arrow's native representation (system-dependent but consistent) -2. **Explicit conversions**: All metadata uses explicit byte orders -3. **Schema metadata**: Hashed separately, includes nullability info -4. **Result**: Deterministic hashing within a system; comparable across systems - -### Recommendation for Cross-Platform Use - -If you need hashes to match across little-endian and big-endian systems: - -```rust -// Current: May differ between systems -let hash = ArrowDigester::hash_array(&array); - -// Better: Use record batch with explicit schema -let batch = RecordBatch::try_new(schema, arrays)?; -let hash = ArrowDigester::hash_record_batch(&batch); // Schema-aware hashing -``` - -The schema digest is computed from serialized field information, which includes nullability flags and can be made platform-agnostic. 
- -## Code Examples - -### Checking System Endianness - -```rust -// Compile-time check (preferred) -#[cfg(target_endian = "little")] -fn byte_order_name() -> &'static str { - "little-endian" -} - -#[cfg(target_endian = "big")] -fn byte_order_name() -> &'static str { - "big-endian" -} - -// Runtime check -fn is_little_endian() -> bool { - u32::from_ne_bytes([1, 0, 0, 0]) == 1 -} - -// More explicit -fn is_little_endian_v2() -> bool { - (1u16).to_le_bytes()[0] == 1 -} -``` - -### Getting Arrow's Byte Order - -```rust -use arrow::array::Array; - -fn check_arrow_native_order(array: &dyn Array) { - // Arrow stores in native byte order - // No explicit API to check - it's always native - - #[cfg(target_endian = "little")] - println!("Arrow on this system: little-endian buffers"); - - #[cfg(target_endian = "big")] - println!("Arrow on this system: big-endian buffers"); -} -``` - -### Safe Cross-Platform Hashing - -```rust -use arrow::record_batch::RecordBatch; -use starfix::ArrowDigester; -use std::sync::Arc; - -fn hash_with_platform_info(batch: &RecordBatch) -> (Vec, &'static str) { - let hash = ArrowDigester::hash_record_batch(batch); - - #[cfg(target_endian = "little")] - return (hash, "little-endian"); - - #[cfg(target_endian = "big")] - return (hash, "big-endian"); -} -``` - -## Testing Considerations - -When testing StarFix hashing: - -1. **Same-system tests**: Will pass regardless of implementation details -2. **Cross-platform tests**: Require explicit endianness handling -3. **Integration tests**: Should verify determinism on target platform - -```rust -#[test] -fn deterministic_hashing() { - // Same data → same hash (guaranteed) - let array1 = Int32Array::from(vec![1, 2, 3]); - let array2 = Int32Array::from(vec![1, 2, 3]); - - assert_eq!( - ArrowDigester::hash_array(&array1), - ArrowDigester::hash_array(&array2) - ); -} - -#[test] -fn endianness_consistency() { - // Different byte orders of same value should hash differently - let value_a = 0x12345678u32; - let array_a = UInt32Array::from(vec![value_a]); - - let value_b = 0x78563412u32; // Byte-reversed - let array_b = UInt32Array::from(vec![value_b]); - - // These should hash differently (different semantic values) - assert_ne!( - ArrowDigester::hash_array(&array_a), - ArrowDigester::hash_array(&array_b) - ); -} -``` - -## Current Known Limitations - -The current StarFix implementation: - -✓ Ensures deterministic hashing on the same platform -✓ Uses Arrow's native byte order for efficiency -⚠️ May produce different hashes on different platforms for the same logical data -⚠️ No explicit API to query or control endianness - -## Future Improvements - -Potential enhancements for cross-platform hashing: - -1. **Normalize byte order**: Convert all data to a canonical byte order before hashing -2. **Endianness parameter**: Allow users to specify target byte order -3. **Platform-agnostic mode**: Flag for cross-platform hash compatibility -4. 
**Schema versioning**: Include endianness info in hashed schema - -Example future API: - -```rust -pub enum HashEndianness { - Native, // Use system native (current behavior) - Little, // Always little-endian - Big, // Always big-endian -} - -pub fn hash_array_with_endianness( - array: &dyn Array, - endianness: HashEndianness, -) -> Vec { - // Implementation -} -``` - ---- - -**For more information about Arrow's byte order handling, see the [Apache Arrow documentation](https://arrow.apache.org/docs/format/Columnar.html).** From 26990d822a3b19e8683badf073231e331455f188 Mon Sep 17 00:00:00 2001 From: synicix Date: Wed, 7 Jan 2026 01:20:15 +0000 Subject: [PATCH 53/53] Fix fmt error --- tests/arrow_digester.rs | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tests/arrow_digester.rs b/tests/arrow_digester.rs index 3623220..dcc07de 100644 --- a/tests/arrow_digester.rs +++ b/tests/arrow_digester.rs @@ -547,7 +547,14 @@ mod tests { let combined_batch = RecordBatch::try_new( Arc::clone(&schema), vec![ - Arc::new(Int32Array::from(vec![None, None, None, Some(1), Some(2), Some(3)])), + Arc::new(Int32Array::from(vec![ + None, + None, + None, + Some(1), + Some(2), + Some(3), + ])), Arc::new(Float64Array::from(vec![ None, None,