From d30bc854f33999122a48ce4e0dfd724dbff6ad68 Mon Sep 17 00:00:00 2001 From: Alistair Israel Date: Sun, 15 Mar 2026 10:37:38 -0400 Subject: [PATCH 01/18] Rewording --- features/cli/convert.feature | 2 +- features/repl/conversion.feature | 6 +++--- features/repl/head.feature | 2 +- features/repl/select.feature | 2 +- features/repl/tail.feature | 2 +- tests/repl.rs | 2 +- 6 files changed, 8 insertions(+), 8 deletions(-) diff --git a/features/cli/convert.feature b/features/cli/convert.feature index 9a8b956..d47f7a7 100644 --- a/features/cli/convert.feature +++ b/features/cli/convert.feature @@ -70,7 +70,7 @@ Feature: Convert Then the command should succeed And the output should contain "Converted fixtures/no_header.csv to $TEMPDIR/no_header.parquet" And the file "$TEMPDIR/no_header.parquet" should exist - And that file should be valid Parquet + And that file should be a valid Parquet file And that file should have 3 records Scenario: Avro to CSV diff --git a/features/repl/conversion.feature b/features/repl/conversion.feature index 20e2461..fcb4643 100644 --- a/features/repl/conversion.feature +++ b/features/repl/conversion.feature @@ -79,7 +79,7 @@ Feature: Conversion read("fixtures/userdata5.avro") |> write("$TEMPDIR/userdata5.parquet") ``` Then the file "$TEMPDIR/userdata5.parquet" should exist - And that file should be valid Parquet + And that file should be a valid Parquet file Scenario: Avro to ORC When the REPL is ran and the user types: @@ -103,7 +103,7 @@ Feature: Conversion read("fixtures/table.csv") |> write("$TEMPDIR/table_from_csv.parquet") ``` Then the file "$TEMPDIR/table_from_csv.parquet" should exist - And that file should be valid Parquet + And that file should be a valid Parquet file Scenario: CSV to JSON When the REPL is ran and the user types: @@ -155,7 +155,7 @@ Feature: Conversion read("fixtures/userdata.orc") |> write("$TEMPDIR/userdata_orc.parquet") ``` Then the file "$TEMPDIR/userdata_orc.parquet" should exist - And that file should be valid 
Parquet + And that file should be a valid Parquet file Scenario: ORC to Avro When the REPL is ran and the user types: diff --git a/features/repl/head.feature b/features/repl/head.feature index da889e7..c213502 100644 --- a/features/repl/head.feature +++ b/features/repl/head.feature @@ -100,7 +100,7 @@ Feature: Head read("fixtures/userdata5.avro") |> head(10) |> write("$TEMPDIR/head.parquet") ``` Then the file "$TEMPDIR/head.parquet" should exist - And that file should be valid Parquet + And that file should be a valid Parquet file And that file should have 10 records Scenario: Head to Avro diff --git a/features/repl/select.feature b/features/repl/select.feature index f7657ab..2e115b0 100644 --- a/features/repl/select.feature +++ b/features/repl/select.feature @@ -121,4 +121,4 @@ Feature: Select read("fixtures/userdata5.avro") |> select(:id, :first_name, :email) |> write("$TEMPDIR/select.parquet") ``` Then the file "$TEMPDIR/select.parquet" should exist - And that file should be valid Parquet + And that file should be a valid Parquet file diff --git a/features/repl/tail.feature b/features/repl/tail.feature index e2b641b..ef3128a 100644 --- a/features/repl/tail.feature +++ b/features/repl/tail.feature @@ -99,7 +99,7 @@ Feature: Tail read("fixtures/userdata5.avro") |> tail(10) |> write("$TEMPDIR/tail.parquet") ``` Then the file "$TEMPDIR/tail.parquet" should exist - And that file should be valid Parquet + And that file should be a valid Parquet file Scenario: Tail to Avro When the REPL is ran and the user types: diff --git a/tests/repl.rs b/tests/repl.rs index 76383a7..ab98175 100644 --- a/tests/repl.rs +++ b/tests/repl.rs @@ -221,7 +221,7 @@ fn that_file_should_be_valid_avro(world: &mut ReplWorld) { ); } -#[then(regex = r#"^that file should be valid Parquet$"#)] +#[then(regex = r#"^that file should be a valid Parquet file$"#)] fn that_file_should_be_valid_parquet(world: &mut ReplWorld) { let path = world .last_file From 7ab5def1cff30963bb0fdf6ed697dcb85e64f166 Mon 
Sep 17 00:00:00 2001 From: Alistair Israel Date: Sun, 15 Mar 2026 11:02:19 -0400 Subject: [PATCH 02/18] actions-rust-lang/setup-rust-toolchain@v1 --- .github/workflows/ci.yml | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 59e2e62..2c76422 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -1,4 +1,4 @@ -name: CI +cname: CI on: push: @@ -11,6 +11,11 @@ on: - "README.md" - ".github/workflows/ci.yml" +env: + # We use environment variables to specify the Rust version and other settings once + RUST_TOOLCHAIN: 1.91.1 + CARGO_TERM_COLOR: always + jobs: check: name: Lint & Test @@ -19,17 +24,10 @@ jobs: - name: Checkout uses: actions/checkout@v4 - - name: Install Rust toolchain - uses: dtolnay/rust-toolchain@stable + - uses: actions-rust-lang/setup-rust-toolchain@v1 with: - toolchain: nightly - components: clippy - - - name: Install rustfmt - run: rustup component add rustfmt - - - name: Cache cargo registry and target - uses: Swatinem/rust-cache@v2 + toolchain: ${{ env.RUST_TOOLCHAIN }} + components: rustfmt, clippy - name: Check formatting run: cargo +nightly fmt -- --check From 69352cb63b2ed28e2302ad76f74ad7e4bcce5f72 Mon Sep 17 00:00:00 2001 From: Alistair Israel Date: Sun, 15 Mar 2026 11:02:56 -0400 Subject: [PATCH 03/18] Use 1.94.0 --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 2c76422..ecc2ce1 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -13,7 +13,7 @@ on: env: # We use environment variables to specify the Rust version and other settings once - RUST_TOOLCHAIN: 1.91.1 + RUST_TOOLCHAIN: 1.94.0 CARGO_TERM_COLOR: always jobs: From 547cfe66f575dfa9be2598ea027f89217df4c40f Mon Sep 17 00:00:00 2001 From: Alistair Israel Date: Sun, 15 Mar 2026 11:04:16 -0400 Subject: [PATCH 04/18] typo --- .github/workflows/ci.yml | 2 +- 1 
file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index ecc2ce1..8ed289e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -1,4 +1,4 @@ -cname: CI +name: CI on: push: From fe05ee19c6afbfc91b0b5bc45a7316274f26c6b2 Mon Sep 17 00:00:00 2001 From: Alistair Israel Date: Sun, 15 Mar 2026 11:12:36 -0400 Subject: [PATCH 05/18] Don't use nightly --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 8ed289e..de798f6 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -30,7 +30,7 @@ jobs: components: rustfmt, clippy - name: Check formatting - run: cargo +nightly fmt -- --check + run: cargo fmt -- --check - name: Run clippy run: cargo clippy --all-targets -- -D warnings From 5a1226f6cb3fed2b2337a81ebc52feca9ba287e1 Mon Sep 17 00:00:00 2001 From: Alistair Israel Date: Sun, 15 Mar 2026 12:00:22 -0400 Subject: [PATCH 06/18] Implement DataFrame API support in convert command for streamlined file processing - Added support for reading and writing JSON files using DataFusion's DataFrame API. - Introduced a new function `is_datafusion_native` to determine if the input format is compatible with DataFusion. - Refactored the `convert` function to utilize either the DataFrame API or a fallback method based on the input and output file types. - Enhanced error handling for unsupported file types in the DataFrameReader. - Updated tests to reflect the new JSON support and error messages. 
--- src/bin/datu/commands/convert.rs | 177 +++++++++++++++++++++++++------ src/pipeline/dataframe.rs | 16 ++- 2 files changed, 157 insertions(+), 36 deletions(-) diff --git a/src/bin/datu/commands/convert.rs b/src/bin/datu/commands/convert.rs index cc3390f..c784915 100644 --- a/src/bin/datu/commands/convert.rs +++ b/src/bin/datu/commands/convert.rs @@ -5,6 +5,12 @@ use arrow::array::RecordBatchReader; use arrow::datatypes::Schema; use arrow::record_batch::RecordBatch; use clap::Args; +use datafusion::dataframe::DataFrameWriteOptions; +use datafusion::execution::context::SessionContext; +use datafusion::prelude::AvroReadOptions; +use datafusion::prelude::CsvReadOptions; +use datafusion::prelude::NdJsonReadOptions; +use datafusion::prelude::ParquetReadOptions; use datu::Error; use datu::FileType; use datu::pipeline::Source; @@ -13,6 +19,7 @@ use datu::pipeline::VecRecordBatchReader; use datu::pipeline::dataframe::DataFrameReader; use datu::pipeline::dataframe::write_record_batches_from_reader; use datu::resolve_input_file_type; +use datu::utils::parse_select_columns; use indicatif::ProgressBar; use indicatif::ProgressStyle; @@ -66,6 +73,14 @@ pub struct ConvertArgs { pub input_headers: Option, } +/// Returns true if the format is supported by DataFusion's DataFrame read/write API. +fn is_datafusion_native(file_type: FileType) -> bool { + matches!( + file_type, + FileType::Parquet | FileType::Avro | FileType::Csv | FileType::Json + ) +} + /// Returns the total number of rows from file metadata, if available. 
fn get_total_rows(path: &str, file_type: FileType) -> Option { datu::get_total_rows_result(path, file_type) @@ -103,7 +118,14 @@ pub async fn convert(args: ConvertArgs) -> anyhow::Result<()> { let input_file_type = resolve_input_file_type(args.input, &args.input_path)?; let output_file_type = resolve_input_file_type(args.output, &args.output_path)?; - let total_rows = get_total_rows(&args.input_path, input_file_type); + let use_dataframe_api = is_datafusion_native(input_file_type) + && (output_file_type == FileType::Parquet || output_file_type == FileType::Csv); + + let total_rows = if !use_dataframe_api || input_file_type == FileType::Parquet { + get_total_rows(&args.input_path, input_file_type) + } else { + None + }; let progress = match total_rows { Some(total) => { @@ -133,36 +155,11 @@ pub async fn convert(args: ConvertArgs) -> anyhow::Result<()> { } }; - let reader_step = DataFrameReader::new( - &args.input_path, - input_file_type, - args.select, - args.limit, - args.input_headers, - ); - - let result: Result<(), datu::Error> = async { - let mut source = reader_step.execute(()).await?; - let df = source.get()?; - - let handle = tokio::runtime::Handle::current(); - let batches = tokio::task::block_in_place(|| handle.block_on(df.collect())) - .map_err(|e| Error::GenericError(e.to_string()))?; - - let mut reader = ProgressRecordBatchReader { - inner: VecRecordBatchReader::new(batches), - progress: progress.clone(), - }; - - write_record_batches_from_reader( - &mut reader, - &args.output_path, - output_file_type, - args.sparse, - args.json_pretty, - ) - } - .await; + let result: Result<(), datu::Error> = if use_dataframe_api { + convert_via_dataframe_api(&args, input_file_type, output_file_type, &progress).await + } else { + convert_via_fallback(&args, input_file_type, output_file_type, &progress).await + }; match result { Ok(()) => { @@ -181,6 +178,124 @@ pub async fn convert(args: ConvertArgs) -> anyhow::Result<()> { } } +/// Streamlined path: DataFusion 
DataFrame read → select/limit → write. No collect or RecordBatchReader. +async fn convert_via_dataframe_api( + args: &ConvertArgs, + input_file_type: FileType, + output_file_type: FileType, + _progress: &ProgressBar, +) -> Result<(), datu::Error> { + let ctx = SessionContext::new(); + + let mut df = match input_file_type { + FileType::Parquet => ctx + .read_parquet(&args.input_path, ParquetReadOptions::default()) + .await + .map_err(|e| Error::GenericError(e.to_string()))?, + FileType::Avro => ctx + .read_avro(&args.input_path, AvroReadOptions::default()) + .await + .map_err(|e| Error::GenericError(e.to_string()))?, + FileType::Csv => { + let has_header = args.input_headers.unwrap_or(true); + ctx.read_csv( + &args.input_path, + CsvReadOptions::new().has_header(has_header), + ) + .await + .map_err(|e| Error::GenericError(e.to_string()))? + } + FileType::Json => ctx + .read_json(&args.input_path, NdJsonReadOptions::default()) + .await + .map_err(|e| Error::GenericError(e.to_string()))?, + _ => { + return Err(Error::GenericError( + "Streamlined path only supports Parquet, Avro, CSV, JSON input".to_string(), + )); + } + }; + + if let Some(columns) = &args.select { + let parsed = parse_select_columns(columns); + if !parsed.is_empty() { + let col_refs: Vec<&str> = parsed.iter().map(String::as_str).collect(); + df = df + .select_columns(&col_refs) + .map_err(|e| Error::GenericError(e.to_string()))?; + } + } + + if let Some(n) = args.limit { + df = df + .limit(0, Some(n)) + .map_err(|e| Error::GenericError(e.to_string()))?; + } + + let write_opts = DataFrameWriteOptions::new(); + + match output_file_type { + FileType::Parquet => { + df.write_parquet(&args.output_path, write_opts, None) + .await + .map_err(|e| Error::GenericError(e.to_string()))?; + } + FileType::Csv => { + df.write_csv(&args.output_path, write_opts, None) + .await + .map_err(|e| Error::GenericError(e.to_string()))?; + } + FileType::Json => { + df.write_json(&args.output_path, write_opts, None) + .await + 
.map_err(|e| Error::GenericError(e.to_string()))?; + } + _ => { + return Err(Error::GenericError( + "Streamlined path only supports Parquet, Avro, CSV, JSON output".to_string(), + )); + } + } + + Ok(()) +} + +/// Fallback path: DataFrameReader → collect → write_record_batches_from_reader. +async fn convert_via_fallback( + args: &ConvertArgs, + input_file_type: FileType, + output_file_type: FileType, + progress: &ProgressBar, +) -> Result<(), datu::Error> { + let reader_step = DataFrameReader::new( + &args.input_path, + input_file_type, + args.select.clone(), + args.limit, + args.input_headers, + ); + + let mut source = reader_step.execute(()).await?; + let df = source.get()?; + + let handle = tokio::runtime::Handle::current(); + let batches = tokio::task::block_in_place(|| handle.block_on(df.collect())) + .map_err(|e| Error::GenericError(e.to_string()))?; + + let mut reader = ProgressRecordBatchReader { + inner: VecRecordBatchReader::new(batches), + progress: progress.clone(), + }; + + write_record_batches_from_reader( + &mut reader, + &args.output_path, + output_file_type, + args.sparse, + args.json_pretty, + ) +} + #[cfg(test)] mod tests { use arrow::array::RecordBatchReader; diff --git a/src/pipeline/dataframe.rs b/src/pipeline/dataframe.rs index d054778..20699e2 100644 --- a/src/pipeline/dataframe.rs +++ b/src/pipeline/dataframe.rs @@ -3,6 +3,7 @@ use async_trait::async_trait; use datafusion::execution::context::SessionContext; use datafusion::prelude::AvroReadOptions; use datafusion::prelude::CsvReadOptions; +use datafusion::prelude::NdJsonReadOptions; use datafusion::prelude::ParquetReadOptions; use crate::Error; @@ -170,6 +171,10 @@ impl DataFrameReader { .await .map_err(|e| Error::GenericError(e.to_string()))? 
} + FileType::Json => ctx + .read_json(&self.input_path, NdJsonReadOptions::default()) + .await + .map_err(|e| Error::GenericError(e.to_string()))?, FileType::Orc => { let batches = read_orc_to_batches(&self.input_path)?; if batches.is_empty() { @@ -182,7 +187,7 @@ impl DataFrameReader { } _ => { return Err(Error::GenericError( - "Only Parquet, Avro, CSV, and ORC are supported as input file types" + "Only Parquet, Avro, CSV, JSON, and ORC are supported as input file types" .to_string(), )); } @@ -383,15 +388,16 @@ mod tests { #[tokio::test(flavor = "multi_thread")] async fn test_read_dataframe_unsupported_type() { - let result = DataFrameReader::new("fixtures/data.json", FileType::Json, None, None, None) - .execute(()) - .await; + let result = + DataFrameReader::new("fixtures/table.parquet", FileType::Xlsx, None, None, None) + .execute(()) + .await; assert!(result.is_err()); assert!( result .unwrap_err() .to_string() - .contains("Only Parquet, Avro, CSV, and ORC") + .contains("Only Parquet, Avro, CSV, JSON, and ORC") ); } From 5a319054d930904f580657ed0740125b4c8b375d Mon Sep 17 00:00:00 2001 From: Alistair Israel Date: Sun, 15 Mar 2026 12:01:15 -0400 Subject: [PATCH 07/18] resolve_input_file_type -> resolve_file_type --- src/bin/datu/commands/convert.rs | 6 +++--- src/bin/datu/commands/count.rs | 4 ++-- src/bin/datu/commands/head.rs | 4 ++-- src/bin/datu/commands/sample.rs | 4 ++-- src/bin/datu/commands/schema.rs | 4 ++-- src/bin/datu/commands/tail.rs | 4 ++-- src/lib.rs | 2 +- src/utils.rs | 2 +- 8 files changed, 15 insertions(+), 15 deletions(-) diff --git a/src/bin/datu/commands/convert.rs b/src/bin/datu/commands/convert.rs index c784915..60bc839 100644 --- a/src/bin/datu/commands/convert.rs +++ b/src/bin/datu/commands/convert.rs @@ -18,7 +18,7 @@ use datu::pipeline::Step; use datu::pipeline::VecRecordBatchReader; use datu::pipeline::dataframe::DataFrameReader; use datu::pipeline::dataframe::write_record_batches_from_reader; -use datu::resolve_input_file_type; 
+use datu::resolve_file_type; use datu::utils::parse_select_columns; use indicatif::ProgressBar; use indicatif::ProgressStyle; @@ -115,8 +115,8 @@ impl RecordBatchReader for ProgressRecordBatchReader { /// Converts between file formats; reads from input and writes to output, optionally selecting columns. pub async fn convert(args: ConvertArgs) -> anyhow::Result<()> { - let input_file_type = resolve_input_file_type(args.input, &args.input_path)?; - let output_file_type = resolve_input_file_type(args.output, &args.output_path)?; + let input_file_type = resolve_file_type(args.input, &args.input_path)?; + let output_file_type = resolve_file_type(args.output, &args.output_path)?; let use_dataframe_api = is_datafusion_native(input_file_type) && (output_file_type == FileType::Parquet || output_file_type == FileType::Csv); diff --git a/src/bin/datu/commands/count.rs b/src/bin/datu/commands/count.rs index 137542f..48a8a64 100644 --- a/src/bin/datu/commands/count.rs +++ b/src/bin/datu/commands/count.rs @@ -2,12 +2,12 @@ use datu::cli::CountArgs; use datu::pipeline::count_rows; -use datu::resolve_input_file_type; +use datu::resolve_file_type; /// The `datu count` command. Uses metadata for Parquet and ORC (no data read); /// streams batches for Avro and CSV. 
pub async fn count(args: CountArgs) -> anyhow::Result<()> { - let file_type = resolve_input_file_type(args.input, &args.input_path)?; + let file_type = resolve_file_type(args.input, &args.input_path)?; let total = count_rows(&args.input_path, file_type, args.input_headers)?; println!("{total}"); Ok(()) diff --git a/src/bin/datu/commands/head.rs b/src/bin/datu/commands/head.rs index e00f8b2..445ad5b 100644 --- a/src/bin/datu/commands/head.rs +++ b/src/bin/datu/commands/head.rs @@ -5,11 +5,11 @@ use datu::cli::HeadsOrTails; use datu::pipeline::build_reader; use datu::pipeline::display::apply_select_and_display; use datu::pipeline::record_batch_filter::parse_select_step; -use datu::resolve_input_file_type; +use datu::resolve_file_type; /// head command implementation: print the first N lines of an Avro, CSV, Parquet, or ORC file. pub async fn head(args: HeadsOrTails) -> Result<()> { - let input_file_type = resolve_input_file_type(args.input, &args.input_path)?; + let input_file_type = resolve_file_type(args.input, &args.input_path)?; match input_file_type { FileType::Parquet | FileType::Avro | FileType::Csv | FileType::Orc => {} _ => bail!("Only Parquet, Avro, CSV, and ORC are supported for head"), diff --git a/src/bin/datu/commands/sample.rs b/src/bin/datu/commands/sample.rs index 8a5222f..292e04e 100644 --- a/src/bin/datu/commands/sample.rs +++ b/src/bin/datu/commands/sample.rs @@ -13,11 +13,11 @@ use datu::pipeline::read_to_batches; use datu::pipeline::record_batch_filter::parse_select_step; use datu::pipeline::reservoir_sample_from_reader; use datu::pipeline::sample_from_reader; -use datu::resolve_input_file_type; +use datu::resolve_file_type; /// sample command implementation: print N random rows from an Avro, CSV, Parquet, or ORC file. 
pub async fn sample(args: HeadsOrTails) -> Result<()> { - let input_file_type = resolve_input_file_type(args.input, &args.input_path)?; + let input_file_type = resolve_file_type(args.input, &args.input_path)?; match input_file_type { FileType::Parquet => sample_seekable_format(args, FileType::Parquet).await, FileType::Avro => sample_avro(args).await, diff --git a/src/bin/datu/commands/schema.rs b/src/bin/datu/commands/schema.rs index 8c8eb4c..17a7375 100644 --- a/src/bin/datu/commands/schema.rs +++ b/src/bin/datu/commands/schema.rs @@ -4,11 +4,11 @@ use anyhow::Result; use datu::cli::SchemaArgs; use datu::pipeline::schema::get_schema_fields; use datu::pipeline::schema::print_schema_fields; -use datu::resolve_input_file_type; +use datu::resolve_file_type; /// The `datu schema` command pub async fn schema(args: SchemaArgs) -> Result<()> { - let file_type = resolve_input_file_type(args.input, &args.input_path)?; + let file_type = resolve_file_type(args.input, &args.input_path)?; let fields = get_schema_fields(&args.input_path, file_type, args.input_headers)?; print_schema_fields(&fields, args.output, args.sparse) } diff --git a/src/bin/datu/commands/tail.rs b/src/bin/datu/commands/tail.rs index a4c9dfb..4ceea8e 100644 --- a/src/bin/datu/commands/tail.rs +++ b/src/bin/datu/commands/tail.rs @@ -12,11 +12,11 @@ use datu::pipeline::display::apply_select_and_display; use datu::pipeline::read_to_batches; use datu::pipeline::record_batch_filter::parse_select_step; use datu::pipeline::tail_batches; -use datu::resolve_input_file_type; +use datu::resolve_file_type; /// tail command implementation: print the last N lines of an Avro, CSV, Parquet, or ORC file. 
pub async fn tail(args: HeadsOrTails) -> Result<()> { - let input_file_type = resolve_input_file_type(args.input, &args.input_path)?; + let input_file_type = resolve_file_type(args.input, &args.input_path)?; match input_file_type { FileType::Parquet => tail_seekable_format(args, FileType::Parquet).await, FileType::Avro => tail_avro(args).await, diff --git a/src/lib.rs b/src/lib.rs index 2e98673..262538d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -11,7 +11,7 @@ pub mod utils; pub use errors::Error; pub use utils::FileType; pub use utils::get_total_rows_result; -pub use utils::resolve_input_file_type; +pub use utils::resolve_file_type; /// Result type alias for datu operations. pub type Result = std::result::Result; diff --git a/src/utils.rs b/src/utils.rs index cd6be6e..ad88dff 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -113,7 +113,7 @@ impl TryFrom<&str> for FileType { } /// Resolve the input file type: use the explicit override if provided, otherwise infer from the file path extension. -pub fn resolve_input_file_type( +pub fn resolve_file_type( input_override: Option, path: &str, ) -> crate::Result { From 96bf0d831931469e6264f01add283ca17774d47f Mon Sep 17 00:00:00 2001 From: Alistair Israel Date: Sun, 15 Mar 2026 12:05:57 -0400 Subject: [PATCH 08/18] cargo fmt --- src/utils.rs | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/utils.rs b/src/utils.rs index ad88dff..851bf28 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -113,10 +113,7 @@ impl TryFrom<&str> for FileType { } /// Resolve the input file type: use the explicit override if provided, otherwise infer from the file path extension. 
-pub fn resolve_file_type( - input_override: Option, - path: &str, -) -> crate::Result { +pub fn resolve_file_type(input_override: Option, path: &str) -> crate::Result { match input_override { Some(ft) => Ok(ft), None => FileType::try_from(path), From b7bb128a6b9b97ebef26423c63e8c4be91440f8d Mon Sep 17 00:00:00 2001 From: Alistair Israel Date: Sun, 15 Mar 2026 12:10:27 -0400 Subject: [PATCH 09/18] Make this more explicit --- src/pipeline.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/pipeline.rs b/src/pipeline.rs index 63969c4..405b688 100644 --- a/src/pipeline.rs +++ b/src/pipeline.rs @@ -162,8 +162,8 @@ pub fn build_reader( /// Counts rows in a file. Uses metadata for Parquet and ORC (no data read); /// streams batches for Avro and CSV. pub fn count_rows(path: &str, file_type: FileType, csv_has_header: Option) -> Result { - if let Ok(total) = crate::get_total_rows_result(path, file_type) { - return Ok(total); + if matches!(file_type, FileType::Parquet | FileType::Orc) { + return crate::get_total_rows_result(path, file_type); } let mut reader_step = build_reader(path, file_type, None, None, csv_has_header)?; let reader = reader_step.get()?; From 59510c9002da54af4c89e5729f8bfecf759eb738 Mon Sep 17 00:00:00 2001 From: Alistair Israel Date: Sun, 15 Mar 2026 12:12:41 -0400 Subject: [PATCH 10/18] Update README with JSON support --- README.md | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 53ff28c..11c0cd6 100644 --- a/README.md +++ b/README.md @@ -27,8 +27,8 @@ cargo install --git https://github.com/aisrael/datu | Avro (`.avro`) | ✓ | ✓ | — | | ORC (`.orc`) | ✓ | ✓ | — | | CSV (`.csv`) | ✓ | ✓ | ✓ | +| JSON (`.json`) | ✓ | ✓ | ✓ | | XLSX (`.xlsx`) | — | ✓ | — | -| JSON (`.json`) | — | ✓ | ✓ | | JSON (pretty) | — | — | ✓ | | YAML | — | — | ✓ | @@ -148,7 +148,7 @@ datu count data.csv --input-headers=false Convert data between supported formats. 
Input and output formats are inferred from file extensions, or can be specified explicitly with `--input` and `--output`. -**Supported input formats:** Parquet (`.parquet`, `.parq`), Avro (`.avro`), CSV (`.csv`), ORC (`.orc`). +**Supported input formats:** Parquet (`.parquet`, `.parq`), Avro (`.avro`), CSV (`.csv`), JSON (`.json`), ORC (`.orc`). **Supported output formats:** CSV (`.csv`), JSON (`.json`), Parquet (`.parquet`, `.parq`), Avro (`.avro`), ORC (`.orc`), XLSX (`.xlsx`). @@ -191,13 +191,17 @@ datu convert data.csv output.json --input-headers=false # Parquet to Parquet with column subset datu convert input.parq output.parquet --select one,two,three -# Parquet, Avro, CSV, or ORC to Excel (.xlsx) +# JSON to CSV or Parquet +datu convert data.json data.csv +datu convert data.json data.parquet + +# Parquet, Avro, CSV, JSON, or ORC to Excel (.xlsx) datu convert data.parquet report.xlsx -# Parquet or Avro to ORC +# Parquet, Avro, or JSON to ORC datu convert data.parquet data.orc -# Parquet or Avro to JSON +# Parquet, Avro, or JSON to JSON datu convert data.parquet data.json ``` @@ -361,7 +365,7 @@ read("input") |> ... |> write("output") #### `read(path)` -Read a data file. Supported formats: Parquet (`.parquet`, `.parq`), Avro (`.avro`), CSV (`.csv`), ORC (`.orc`). CSV files are assumed to have a header row by default. +Read a data file. Supported formats: Parquet (`.parquet`, `.parq`), Avro (`.avro`), CSV (`.csv`), JSON (`.json`), ORC (`.orc`). CSV files are assumed to have a header row by default. 
```text > read("data.parquet") |> write("data.csv") From 0a3364c1af3d2133524d850784b71cf38e1134ad Mon Sep 17 00:00:00 2001 From: Alistair Israel Date: Sun, 15 Mar 2026 12:16:33 -0400 Subject: [PATCH 11/18] Updated with REPL equivalents --- README.md | 56 ++++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 49 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 11c0cd6..70d75c9 100644 --- a/README.md +++ b/README.md @@ -68,12 +68,18 @@ Display the schema of a Parquet, Avro, CSV, or ORC file (column names, types, an **Supported input formats:** Parquet (`.parquet`, `.parq`), Avro (`.avro`), CSV (`.csv`), ORC (`.orc`). -**Usage:** +**Usage (CLI):** ```sh datu schema [OPTIONS] ``` +**Usage (REPL):** + +> `read("file") |> schema()` +> +> Use `schema()` after a read to print the schema of the data in the pipeline. For a single file, `read("data.parquet") |> schema()` is equivalent. + **Options:** | Option | Description | @@ -114,12 +120,18 @@ Return the number of rows in a Parquet, Avro, CSV, or ORC file. **Supported input formats:** Parquet (`.parquet`, `.parq`), Avro (`.avro`), CSV (`.csv`), ORC (`.orc`). -**Usage:** +**Usage (CLI):** ```sh datu count [OPTIONS] ``` +**Usage (REPL):** + +> `count("file")` +> +> Count rows in a file directly. Or use `read("file") |> count()` after other steps, e.g. `read("data.parquet") |> select(:id) |> count()`. + **Options:** | Option | Description | @@ -152,12 +164,18 @@ Convert data between supported formats. Input and output formats are inferred fr **Supported output formats:** CSV (`.csv`), JSON (`.json`), Parquet (`.parquet`, `.parq`), Avro (`.avro`), ORC (`.orc`), XLSX (`.xlsx`). -**Usage:** +**Usage (CLI):** ```sh datu convert [OPTIONS] ``` +**Usage (REPL):** + +> `read("input") |> ... |> write("output")` +> +> Example: `read("table.parquet") |> select(:id, :email) |> write("table.csv")`. Add `select(...)`, `head(n)`, `tail(n)`, or `sample(n)` as needed. 
+ **Options:** | Option | Description | @@ -213,12 +231,18 @@ Print N randomly sampled rows from a Parquet, Avro, CSV, or ORC file to stdout ( **Supported input formats:** Parquet (`.parquet`, `.parq`), Avro (`.avro`), CSV (`.csv`), ORC (`.orc`). -**Usage:** +**Usage (CLI):** ```sh datu sample [OPTIONS] ``` +**Usage (REPL):** + +> `read("file") |> sample(n)` +> +> Prints _n_ random rows to stdout. Chain `|> write("out.csv")` to write to a file, e.g. `read("data.parquet") |> sample(5) |> write("sample.csv")`. + **Options:** | Option | Description | @@ -254,12 +278,18 @@ Print the first N rows of a Parquet, Avro, CSV, or ORC file to stdout (default C **Supported input formats:** Parquet (`.parquet`, `.parq`), Avro (`.avro`), CSV (`.csv`), ORC (`.orc`). -**Usage:** +**Usage (CLI):** ```sh datu head [OPTIONS] ``` +**Usage (REPL):** + +> `read("file") |> head(n)` +> +> Prints the first _n_ rows to stdout. Chain `|> write("out.csv")` to write to a file, e.g. `read("data.parquet") |> head(10) |> write("first10.csv")`. + **Options:** | Option | Description | @@ -300,12 +330,18 @@ Print the last N rows of a Parquet, Avro, CSV, or ORC file to stdout (default CS > **Note:** For Avro and CSV files, `tail` requires a full file scan since these formats do not support random access to the end of the file. -**Usage:** +**Usage (CLI):** ```sh datu tail [OPTIONS] ``` +**Usage (REPL):** + +> `read("file") |> tail(n)` +> +> Prints the last _n_ rows to stdout. Chain `|> write("out.csv")` to write to a file, e.g. `read("data.parquet") |> tail(10) |> write("last10.csv")`. + **Options:** | Option | Description | @@ -340,12 +376,18 @@ datu tail data.parquet -n 1000 > last1000.csv ### Version -Print the installed `datu` version: +Print the installed `datu` version. + +**Usage (CLI):** ```sh datu version ``` +**Usage (REPL):** + +> No equivalent; run `datu version` from the shell. 
+ ## Interactive Mode (REPL) Running `datu` without any command starts an interactive REPL (Read-Eval-Print Loop): From c9d17c22db6757fc01bca1d695fb1b1cf11de672 Mon Sep 17 00:00:00 2001 From: Alistair Israel Date: Sun, 15 Mar 2026 13:01:59 -0400 Subject: [PATCH 12/18] Refactor REPL evaluation methods to use exec_* naming convention and improve test structure - Removed async evaluation methods from ReplPipelineBuilder and replaced them with exec_* methods. - Updated corresponding tests to reflect the new method names and improved error handling. - Consolidated path extraction tests for read operations. --- src/cli/repl.rs | 383 ++++++++++++------------------------------------ 1 file changed, 93 insertions(+), 290 deletions(-) diff --git a/src/cli/repl.rs b/src/cli/repl.rs index 58418ff..77348a1 100644 --- a/src/cli/repl.rs +++ b/src/cli/repl.rs @@ -366,57 +366,6 @@ fn extract_column_specs(args: &[Expr]) -> crate::Result> { .collect() } -#[cfg(test)] -impl ReplPipelineBuilder { - async fn eval_stage(&mut self, expr: Expr) -> crate::Result<()> { - let stage = plan_stage(expr)?; - self.execute_stage(stage).await - } - - async fn eval_read(&mut self, args: Vec) -> crate::Result<()> { - let path = extract_path_from_args("read", &args)?; - self.exec_read(&path).await - } - - async fn eval_select(&mut self, args: Vec) -> crate::Result<()> { - let columns = extract_column_specs(&args)?; - if columns.is_empty() { - return Err(Error::UnsupportedFunctionCall( - "select expects at least one column name".to_string(), - )); - } - self.exec_select(&columns).await - } - - fn eval_head(&mut self, args: Vec) -> crate::Result<()> { - let n = extract_head_n(&args)?; - self.exec_head(n) - } - - fn eval_tail(&mut self, args: Vec) -> crate::Result<()> { - let n = extract_tail_n(&args)?; - self.exec_tail(n) - } - - fn eval_sample(&mut self, args: Vec) -> crate::Result<()> { - let n = extract_sample_n(&args)?; - self.exec_sample(n) - } - - async fn eval_write(&mut self, args: Vec) -> 
crate::Result<()> { - let path = extract_path_from_args("write", &args)?; - self.exec_write(&path).await - } - - fn eval_count(&mut self) -> crate::Result<()> { - self.exec_count() - } - - fn eval_schema(&mut self) -> crate::Result<()> { - self.exec_schema() - } -} - #[cfg(test)] fn is_head_call(expr: Option<&Expr>) -> bool { if let Some(Expr::FunctionCall(name, _)) = expr { @@ -957,50 +906,22 @@ mod tests { assert!(ctx.pending_exprs.is_empty()); } - // ── eval_stage: error paths ───────────────────────────────────── - - #[tokio::test(flavor = "multi_thread")] - async fn test_eval_stage_unsupported_function() { - let mut ctx = new_context(); - let expr = Expr::FunctionCall(Identifier("unknown".into()), vec![]); - let result = ctx.eval_stage(expr).await; - assert!(result.is_err()); - assert!(matches!( - result.unwrap_err(), - Error::UnsupportedFunctionCall(_) - )); - } - - #[tokio::test(flavor = "multi_thread")] - async fn test_eval_stage_non_function_expr() { - let mut ctx = new_context(); - let expr = Expr::Ident("x".into()); - let result = ctx.eval_stage(expr).await; - assert!(result.is_err()); - assert!(matches!( - result.unwrap_err(), - Error::UnsupportedExpression(_) - )); - } - - // ── eval_read ─────────────────────────────────────────────────── + // ── exec_read / extract_path_from_args ────────────────────────── #[tokio::test(flavor = "multi_thread")] - async fn test_eval_read_success() { + async fn test_exec_read_success() { let mut ctx = new_context(); - let args = vec![Expr::Literal(Literal::String( - "fixtures/table.parquet".into(), - ))]; - ctx.eval_read(args).await.expect("eval_read"); + ctx.exec_read("fixtures/table.parquet") + .await + .expect("exec_read"); assert!(ctx.batches.is_some()); assert!(!ctx.batches.as_ref().unwrap().is_empty()); } - #[tokio::test(flavor = "multi_thread")] - async fn test_eval_read_bad_args() { - let mut ctx = new_context(); + #[test] + fn test_extract_path_from_args_read_bad_args() { let args = 
vec![Expr::Ident("not_a_string".into())]; - let result = ctx.eval_read(args).await; + let result = extract_path_from_args("read", &args); assert!(result.is_err()); assert!(matches!( result.unwrap_err(), @@ -1008,40 +929,34 @@ mod tests { )); } - #[tokio::test(flavor = "multi_thread")] - async fn test_eval_read_no_args() { - let mut ctx = new_context(); - let result = ctx.eval_read(vec![]).await; + #[test] + fn test_extract_path_from_args_read_no_args() { + let result = extract_path_from_args("read", &[]); assert!(result.is_err()); } - #[tokio::test(flavor = "multi_thread")] - async fn test_eval_read_too_many_args() { - let mut ctx = new_context(); + #[test] + fn test_extract_path_from_args_read_too_many_args() { let args = vec![ Expr::Literal(Literal::String("a.parquet".into())), Expr::Literal(Literal::String("b.parquet".into())), ]; - let result = ctx.eval_read(args).await; + let result = extract_path_from_args("read", &args); assert!(result.is_err()); } - // ── eval_select ───────────────────────────────────────────────── + // ── exec_select ───────────────────────────────────────────────── #[tokio::test(flavor = "multi_thread")] - async fn test_eval_select_success() { + async fn test_exec_select_success() { let mut ctx = new_context(); - ctx.eval_read(vec![Expr::Literal(Literal::String( - "fixtures/table.parquet".into(), - ))]) - .await - .expect("read"); + ctx.exec_read("fixtures/table.parquet").await.expect("read"); - let args = vec![ - Expr::Literal(Literal::Symbol("one".into())), - Expr::Literal(Literal::Symbol("two".into())), + let columns = vec![ + ColumnSpec::CaseInsensitive("one".into()), + ColumnSpec::CaseInsensitive("two".into()), ]; - ctx.eval_select(args).await.expect("select"); + ctx.exec_select(&columns).await.expect("select"); let batches = ctx.batches.as_ref().expect("batches after select"); let schema = batches[0].schema(); let col_names: Vec<&str> = schema.fields().iter().map(|f| f.name().as_str()).collect(); @@ -1049,24 +964,18 @@ mod tests { 
} #[tokio::test(flavor = "multi_thread")] - async fn test_eval_select_no_preceding_read() { + async fn test_exec_select_no_preceding_read() { let mut ctx = new_context(); - let args = vec![Expr::Literal(Literal::Symbol("one".into()))]; - let result = ctx.eval_select(args).await; + let columns = vec![ColumnSpec::CaseInsensitive("one".into())]; + let result = ctx.exec_select(&columns).await; assert!(result.is_err()); assert!(matches!(result.unwrap_err(), Error::GenericError(_))); } - #[tokio::test(flavor = "multi_thread")] - async fn test_eval_select_empty_columns() { - let mut ctx = new_context(); - ctx.eval_read(vec![Expr::Literal(Literal::String( - "fixtures/table.parquet".into(), - ))]) - .await - .expect("read"); - - let result = ctx.eval_select(vec![]).await; + #[test] + fn test_plan_stage_select_empty_columns_rejected() { + let expr = parse("select()"); + let result = plan_stage(expr); assert!(result.is_err()); assert!(matches!( result.unwrap_err(), @@ -1074,42 +983,35 @@ mod tests { )); } - // ── eval_write ────────────────────────────────────────────────── + // ── exec_write ────────────────────────────────────────────────── #[tokio::test(flavor = "multi_thread")] - async fn test_eval_write_success() { + async fn test_exec_write_success() { let temp_dir = tempfile::tempdir().expect("tempdir"); let output_path = temp_dir.path().join("output.parquet"); let output_str = output_path.to_str().unwrap().to_string(); let mut ctx = new_context(); - ctx.eval_read(vec![Expr::Literal(Literal::String( - "fixtures/table.parquet".into(), - ))]) - .await - .expect("read"); + ctx.exec_read("fixtures/table.parquet").await.expect("read"); - let args = vec![Expr::Literal(Literal::String(output_str.clone()))]; - ctx.eval_write(args).await.expect("write"); + ctx.exec_write(&output_str).await.expect("write"); assert!(output_path.exists()); assert_eq!(ctx.writer.as_deref(), Some(output_str.as_str())); } #[tokio::test(flavor = "multi_thread")] - async fn 
test_eval_write_no_preceding_read() { + async fn test_exec_write_no_preceding_read() { let mut ctx = new_context(); - let args = vec![Expr::Literal(Literal::String("out.csv".into()))]; - let result = ctx.eval_write(args).await; + let result = ctx.exec_write("out.csv").await; assert!(result.is_err()); assert!(matches!(result.unwrap_err(), Error::GenericError(_))); } - #[tokio::test(flavor = "multi_thread")] - async fn test_eval_write_bad_args() { - let mut ctx = new_context(); + #[test] + fn test_extract_path_from_args_write_bad_args() { let args = vec![Expr::Ident("not_a_string".into())]; - let result = ctx.eval_write(args).await; + let result = extract_path_from_args("write", &args); assert!(result.is_err()); assert!(matches!( result.unwrap_err(), @@ -1188,17 +1090,13 @@ mod tests { #[tokio::test(flavor = "multi_thread")] async fn test_repl_pipeline_select_symbol_case_insensitive() { let mut ctx = new_context(); - ctx.eval_read(vec![Expr::Literal(Literal::String( - "fixtures/table.parquet".into(), - ))]) - .await - .expect("read"); + ctx.exec_read("fixtures/table.parquet").await.expect("read"); - let args = vec![ - Expr::Literal(Literal::Symbol("ONE".into())), - Expr::Literal(Literal::Symbol("TWO".into())), + let columns = vec![ + ColumnSpec::CaseInsensitive("ONE".into()), + ColumnSpec::CaseInsensitive("TWO".into()), ]; - ctx.eval_select(args).await.expect("select"); + ctx.exec_select(&columns).await.expect("select"); let batches = ctx.batches.as_ref().expect("batches after select"); let schema = batches[0].schema(); let col_names: Vec<&str> = schema.fields().iter().map(|f| f.name().as_str()).collect(); @@ -1208,14 +1106,10 @@ mod tests { #[tokio::test(flavor = "multi_thread")] async fn test_repl_pipeline_select_string_exact_match_fails_on_wrong_case() { let mut ctx = new_context(); - ctx.eval_read(vec![Expr::Literal(Literal::String( - "fixtures/table.parquet".into(), - ))]) - .await - .expect("read"); - - let args = 
vec![Expr::Literal(Literal::String("ONE".into()))]; - let result = ctx.eval_select(args).await; + ctx.exec_read("fixtures/table.parquet").await.expect("read"); + + let columns = vec![ColumnSpec::Exact("ONE".into())]; + let result = ctx.exec_select(&columns).await; assert!(result.is_err()); } @@ -1254,7 +1148,7 @@ mod tests { assert!(!is_head_call(None)); } - // ── eval_head ────────────────────────────────────────────────── + // ── exec_head ────────────────────────────────────────────────── fn parse_fn_args(input: &str) -> Vec { match parse(input) { @@ -1264,16 +1158,11 @@ mod tests { } #[tokio::test(flavor = "multi_thread")] - async fn test_eval_head_success() { + async fn test_exec_head_success() { let mut ctx = new_context(); - ctx.eval_read(vec![Expr::Literal(Literal::String( - "fixtures/table.parquet".into(), - ))]) - .await - .expect("read"); + ctx.exec_read("fixtures/table.parquet").await.expect("read"); - let args = parse_fn_args("head(2)"); - ctx.eval_head(args).expect("head"); + ctx.exec_head(2).expect("head"); let batches = ctx.batches.as_ref().expect("batches after head"); let total_rows: usize = batches.iter().map(|b| b.num_rows()).sum(); @@ -1281,55 +1170,26 @@ mod tests { } #[tokio::test(flavor = "multi_thread")] - async fn test_eval_head_preserves_schema() { + async fn test_exec_head_preserves_schema() { let mut ctx = new_context(); - ctx.eval_read(vec![Expr::Literal(Literal::String( - "fixtures/table.parquet".into(), - ))]) - .await - .expect("read"); + ctx.exec_read("fixtures/table.parquet").await.expect("read"); let original_schema = ctx.batches.as_ref().unwrap()[0].schema(); - let args = parse_fn_args("head(1)"); - ctx.eval_head(args).expect("head"); + ctx.exec_head(1).expect("head"); let batches = ctx.batches.as_ref().expect("batches"); assert_eq!(batches[0].schema(), original_schema); } #[test] - fn test_eval_head_no_preceding_read() { + fn test_exec_head_no_preceding_read() { let mut ctx = new_context(); - let args = 
parse_fn_args("head(5)"); - let result = ctx.eval_head(args); + let result = ctx.exec_head(5); assert!(result.is_err()); assert!(matches!(result.unwrap_err(), Error::GenericError(_))); } - #[test] - fn test_eval_head_bad_args_string() { - let mut ctx = new_context(); - let args = vec![Expr::Literal(Literal::String("not_a_number".into()))]; - let result = ctx.eval_head(args); - assert!(result.is_err()); - assert!(matches!( - result.unwrap_err(), - Error::UnsupportedFunctionCall(_) - )); - } - - #[test] - fn test_eval_head_no_args() { - let mut ctx = new_context(); - let result = ctx.eval_head(vec![]); - assert!(result.is_err()); - assert!(matches!( - result.unwrap_err(), - Error::UnsupportedFunctionCall(_) - )); - } - - // ── eval_head: pipeline integration ──────────────────────────── + // ── exec_head: pipeline integration ────────────────────────────── #[tokio::test(flavor = "multi_thread")] async fn test_repl_pipeline_read_head_write() { @@ -1484,19 +1344,14 @@ mod tests { assert!(extract_tail_n(&[]).is_err()); } - // ── eval_tail ─────────────────────────────────────────────── + // ── exec_tail ─────────────────────────────────────────────── #[tokio::test(flavor = "multi_thread")] - async fn test_eval_tail_success() { + async fn test_exec_tail_success() { let mut ctx = new_context(); - ctx.eval_read(vec![Expr::Literal(Literal::String( - "fixtures/table.parquet".into(), - ))]) - .await - .expect("read"); + ctx.exec_read("fixtures/table.parquet").await.expect("read"); - let args = parse_fn_args("tail(2)"); - ctx.eval_tail(args).expect("tail"); + ctx.exec_tail(2).expect("tail"); let batches = ctx.batches.as_ref().expect("batches after tail"); let total_rows: usize = batches.iter().map(|b| b.num_rows()).sum(); @@ -1504,55 +1359,26 @@ mod tests { } #[tokio::test(flavor = "multi_thread")] - async fn test_eval_tail_preserves_schema() { + async fn test_exec_tail_preserves_schema() { let mut ctx = new_context(); - ctx.eval_read(vec![Expr::Literal(Literal::String( - 
"fixtures/table.parquet".into(), - ))]) - .await - .expect("read"); + ctx.exec_read("fixtures/table.parquet").await.expect("read"); let original_schema = ctx.batches.as_ref().unwrap()[0].schema(); - let args = parse_fn_args("tail(1)"); - ctx.eval_tail(args).expect("tail"); + ctx.exec_tail(1).expect("tail"); let batches = ctx.batches.as_ref().expect("batches"); assert_eq!(batches[0].schema(), original_schema); } #[test] - fn test_eval_tail_no_preceding_read() { + fn test_exec_tail_no_preceding_read() { let mut ctx = new_context(); - let args = parse_fn_args("tail(5)"); - let result = ctx.eval_tail(args); + let result = ctx.exec_tail(5); assert!(result.is_err()); assert!(matches!(result.unwrap_err(), Error::GenericError(_))); } - #[test] - fn test_eval_tail_bad_args_string() { - let mut ctx = new_context(); - let args = vec![Expr::Literal(Literal::String("not_a_number".into()))]; - let result = ctx.eval_tail(args); - assert!(result.is_err()); - assert!(matches!( - result.unwrap_err(), - Error::UnsupportedFunctionCall(_) - )); - } - - #[test] - fn test_eval_tail_no_args() { - let mut ctx = new_context(); - let result = ctx.eval_tail(vec![]); - assert!(result.is_err()); - assert!(matches!( - result.unwrap_err(), - Error::UnsupportedFunctionCall(_) - )); - } - - // ── eval_tail: pipeline integration ───────────────────────── + // ── exec_tail: pipeline integration ────────────────────────── #[tokio::test(flavor = "multi_thread")] async fn test_repl_pipeline_read_tail_write() { @@ -1650,30 +1476,26 @@ mod tests { assert_eq!(pipeline[2], PipelineStage::Count { path: None }); } - // ── eval_count ───────────────────────────────────────────── + // ── exec_count ───────────────────────────────────────────── #[tokio::test(flavor = "multi_thread")] - async fn test_eval_count_success() { + async fn test_exec_count_success() { let mut ctx = new_context(); - ctx.eval_read(vec![Expr::Literal(Literal::String( - "fixtures/table.parquet".into(), - ))]) - .await - .expect("read"); + 
ctx.exec_read("fixtures/table.parquet").await.expect("read"); - ctx.eval_count().expect("count"); + ctx.exec_count().expect("count"); assert!(ctx.batches.is_none(), "batches consumed by count"); } #[test] - fn test_eval_count_no_preceding_read() { + fn test_exec_count_no_preceding_read() { let mut ctx = new_context(); - let result = ctx.eval_count(); + let result = ctx.exec_count(); assert!(result.is_err()); assert!(matches!(result.unwrap_err(), Error::GenericError(_))); } - // ── eval_count: pipeline integration ──────────────────────── + // ── exec_count: pipeline integration ──────────────────────── #[tokio::test(flavor = "multi_thread")] async fn test_repl_pipeline_read_count() { @@ -1775,30 +1597,26 @@ mod tests { assert_eq!(pipeline[1], PipelineStage::Schema); } - // ── eval_schema ───────────────────────────────────────────── + // ── exec_schema ───────────────────────────────────────────── #[tokio::test(flavor = "multi_thread")] - async fn test_eval_schema_success() { + async fn test_exec_schema_success() { let mut ctx = new_context(); - ctx.eval_read(vec![Expr::Literal(Literal::String( - "fixtures/table.parquet".into(), - ))]) - .await - .expect("read"); + ctx.exec_read("fixtures/table.parquet").await.expect("read"); - ctx.eval_schema().expect("schema"); + ctx.exec_schema().expect("schema"); assert!(ctx.batches.is_none(), "batches consumed by schema"); } #[test] - fn test_eval_schema_no_preceding_read() { + fn test_exec_schema_no_preceding_read() { let mut ctx = new_context(); - let result = ctx.eval_schema(); + let result = ctx.exec_schema(); assert!(result.is_err()); assert!(matches!(result.unwrap_err(), Error::GenericError(_))); } - // ── eval_schema: pipeline integration ──────────────────────── + // ── exec_schema: pipeline integration ──────────────────────── #[tokio::test(flavor = "multi_thread")] async fn test_repl_pipeline_read_schema() { @@ -1920,19 +1738,14 @@ mod tests { assert!(extract_sample_n(&args).is_err()); } - // ── eval_sample 
───────────────────────────────────────────── + // ── exec_sample ───────────────────────────────────────────── #[tokio::test(flavor = "multi_thread")] - async fn test_eval_sample_success() { + async fn test_exec_sample_success() { let mut ctx = new_context(); - ctx.eval_read(vec![Expr::Literal(Literal::String( - "fixtures/table.parquet".into(), - ))]) - .await - .expect("read"); + ctx.exec_read("fixtures/table.parquet").await.expect("read"); - let args = parse_fn_args("sample(2)"); - ctx.eval_sample(args).expect("sample"); + ctx.exec_sample(2).expect("sample"); let batches = ctx.batches.as_ref().expect("batches after sample"); let total_rows: usize = batches.iter().map(|b| b.num_rows()).sum(); @@ -1940,15 +1753,11 @@ mod tests { } #[tokio::test(flavor = "multi_thread")] - async fn test_eval_sample_default_n() { + async fn test_exec_sample_default_n() { let mut ctx = new_context(); - ctx.eval_read(vec![Expr::Literal(Literal::String( - "fixtures/table.parquet".into(), - ))]) - .await - .expect("read"); + ctx.exec_read("fixtures/table.parquet").await.expect("read"); - ctx.eval_sample(vec![]).expect("sample with default n"); + ctx.exec_sample(10).expect("sample with n=10"); let batches = ctx.batches.as_ref().expect("batches after sample"); let total_rows: usize = batches.iter().map(|b| b.num_rows()).sum(); @@ -1956,32 +1765,26 @@ mod tests { } #[tokio::test(flavor = "multi_thread")] - async fn test_eval_sample_preserves_schema() { + async fn test_exec_sample_preserves_schema() { let mut ctx = new_context(); - ctx.eval_read(vec![Expr::Literal(Literal::String( - "fixtures/table.parquet".into(), - ))]) - .await - .expect("read"); + ctx.exec_read("fixtures/table.parquet").await.expect("read"); let original_schema = ctx.batches.as_ref().unwrap()[0].schema(); - let args = parse_fn_args("sample(1)"); - ctx.eval_sample(args).expect("sample"); + ctx.exec_sample(1).expect("sample"); let batches = ctx.batches.as_ref().expect("batches"); assert_eq!(batches[0].schema(), 
original_schema); } #[test] - fn test_eval_sample_no_preceding_read() { + fn test_exec_sample_no_preceding_read() { let mut ctx = new_context(); - let args = parse_fn_args("sample(5)"); - let result = ctx.eval_sample(args); + let result = ctx.exec_sample(5); assert!(result.is_err()); assert!(matches!(result.unwrap_err(), Error::GenericError(_))); } - // ── eval_sample: pipeline integration ─────────────────────── + // ── exec_sample: pipeline integration ──────────────────────── #[tokio::test(flavor = "multi_thread")] async fn test_repl_pipeline_read_sample_write() { From d70e1e911e4142a152c2305a7a3b6e114919ac6b Mon Sep 17 00:00:00 2001 From: Alistair Israel Date: Sun, 15 Mar 2026 13:07:01 -0400 Subject: [PATCH 13/18] Just updating the comment --- src/cli.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cli.rs b/src/cli.rs index 4294eab..7d88b86 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -111,7 +111,7 @@ pub struct CountArgs { pub input_headers: Option, } -/// Arguments for the `datu head` and `datu tail` commands. +/// Arguments for the `head`, `sample`, and `tail` commands. #[derive(Args)] pub struct HeadsOrTails { /// Path to the Parquet, Avro, ORC, or CSV file From 68297930aa6294f9ec73f89d607470f215a18735 Mon Sep 17 00:00:00 2001 From: Alistair Israel Date: Sun, 15 Mar 2026 13:33:36 -0400 Subject: [PATCH 14/18] Update dependencies and versions in Cargo files - Updated various dependencies in Cargo.toml, including `anyhow`, `clap`, `datafusion`, and `tokio`, to their latest compatible versions. - Adjusted version specifications for `anstyle`, `console`, and other packages in Cargo.lock to reflect new releases. - Improved dependency management by ensuring consistent versioning across related packages. 
--- Cargo.lock | 389 +++++++++++++++++++++++++++-------------------------- Cargo.toml | 34 ++--- 2 files changed, 216 insertions(+), 207 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c69f88b..376f122 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -74,7 +74,22 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "43d5b281e737544384e969a5ccad3f1cdd24b48086a0fc1b2a5262a26b8f4f4a" dependencies = [ "anstyle", - "anstyle-parse", + "anstyle-parse 0.2.7", + "anstyle-query", + "anstyle-wincon", + "colorchoice", + "is_terminal_polyfill", + "utf8parse", +] + +[[package]] +name = "anstream" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "824a212faf96e9acacdbd09febd34438f8f711fb84e09a8916013cd7815ca28d" +dependencies = [ + "anstyle", + "anstyle-parse 1.0.0", "anstyle-query", "anstyle-wincon", "colorchoice", @@ -84,9 +99,9 @@ dependencies = [ [[package]] name = "anstyle" -version = "1.0.13" +version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5192cca8006f1fd4f7237516f40fa183bb07f8fbdfedaa0036de5ea9b0b45e78" +checksum = "940b3a0ca603d1eade50a4846a2afffd5ef57a9feac2c0e2ec2e14f9ead76000" [[package]] name = "anstyle-parse" @@ -97,6 +112,15 @@ dependencies = [ "utf8parse", ] +[[package]] +name = "anstyle-parse" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52ce7f38b242319f7cabaa6813055467063ecdc9d355bbb4ce0c68908cd8130e" +dependencies = [ + "utf8parse", +] + [[package]] name = "anstyle-query" version = "1.1.5" @@ -330,7 +354,7 @@ dependencies = [ "arrow-schema", "arrow-select", "flatbuffers", - "lz4_flex 0.12.0", + "lz4_flex 0.12.1", "zstd", ] @@ -427,9 +451,9 @@ dependencies = [ [[package]] name = "async-compression" -version = "0.4.40" +version = "0.4.41" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7d67d43201f4d20c78bcda740c142ca52482d81da80681533d33bf3f0596c8e2" 
+checksum = "d0f9ee0f6e02ffd7ad5816e9464499fba7b3effd01123b515c41d1697c43dad1" dependencies = [ "compression-codecs", "compression-core", @@ -529,9 +553,9 @@ dependencies = [ [[package]] name = "bon" -version = "3.9.0" +version = "3.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2d13a61f2963b88eef9c1be03df65d42f6996dfeac1054870d950fcf66686f83" +checksum = "f47dbe92550676ee653353c310dfb9cf6ba17ee70396e1f7cf0a2020ad49b2fe" dependencies = [ "bon-macros", "rustversion", @@ -539,9 +563,9 @@ dependencies = [ [[package]] name = "bon-macros" -version = "3.9.0" +version = "3.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d314cc62af2b6b0c65780555abb4d02a03dd3b799cd42419044f0c38d99738c0" +checksum = "519bd3116aeeb42d5372c29d982d16d0170d3d4a5ed85fc7dd91642ffff3c67c" dependencies = [ "darling", "ident_case", @@ -630,9 +654,9 @@ checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" [[package]] name = "cc" -version = "1.2.56" +version = "1.2.57" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aebf35691d1bfb0ac386a69bac2fde4dd276fb618cf8bf4f5318fe285e821bb2" +checksum = "7a0dd1ca384932ff3641c8718a02769f1698e7563dc6974ffd03346116310423" dependencies = [ "find-msvc-tools", "jobserver", @@ -705,9 +729,9 @@ dependencies = [ [[package]] name = "clap" -version = "4.5.60" +version = "4.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2797f34da339ce31042b27d23607e051786132987f595b02ba4f6a6dffb7030a" +checksum = "b193af5b67834b676abd72466a96c1024e6a6ad978a1f484bd90b85c94041351" dependencies = [ "clap_builder", "clap_derive", @@ -715,11 +739,11 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.5.60" +version = "4.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "24a241312cea5059b13574bb9b3861cabf758b879c15190b37b6d6fd63ab6876" +checksum = 
"714a53001bf66416adb0e2ef5ac857140e7dc3a0c48fb28b2f10762fc4b5069f" dependencies = [ - "anstream", + "anstream 1.0.0", "anstyle", "clap_lex", "strsim", @@ -728,9 +752,9 @@ dependencies = [ [[package]] name = "clap_derive" -version = "4.5.55" +version = "4.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a92793da1a46a5f2a02a6f4c46c6496b28c43638adea8306fcb0caa1634f24e5" +checksum = "1110bd8a634a1ab8cb04345d8d878267d57c3cf1b38d91b71af6686408bbca6a" dependencies = [ "heck", "proc-macro2", @@ -740,9 +764,9 @@ dependencies = [ [[package]] name = "clap_lex" -version = "1.0.0" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3a822ea5bc7590f9d40f1ba12c0dc3c2760f3482c6984db1573ad11031420831" +checksum = "c8d4a3bb8b1e0c1050499d1815f5ab16d04f0959b233085fb31653fbfc9d98f9" [[package]] name = "clipboard-win" @@ -755,9 +779,9 @@ dependencies = [ [[package]] name = "colorchoice" -version = "1.0.4" +version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75" +checksum = "1d07550c9036bf2ae0c684c4297d503f838287c83c53686d05370d0e139ae570" [[package]] name = "comfy-table" @@ -801,26 +825,12 @@ dependencies = [ [[package]] name = "console" -version = "0.15.11" +version = "0.16.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "054ccb5b10f9f2cbf51eb355ca1d05c2d279ce1804688d0db74b4733a5aeafd8" +checksum = "d64e8af5551369d19cf50138de61f1c42074ab970f74e99be916646777f8fc87" dependencies = [ "encode_unicode", "libc", - "once_cell", - "unicode-width", - "windows-sys 0.59.0", -] - -[[package]] -name = "console" -version = "0.16.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "03e45a4a8926227e4197636ba97a9fc9b00477e9f4bd711395687c5f0734bec4" -dependencies = [ - "encode_unicode", - "libc", - "once_cell", "unicode-width", "windows-sys 0.61.2", ] @@ -1005,7 
+1015,7 @@ checksum = "16cbb27bc2064274afa3a3d8bc9a0e71333589850573aa632ec4520e4af14d94" dependencies = [ "anyhow", "clap", - "console 0.16.2", + "console", "cucumber-codegen", "cucumber-expressions", "derive_more", @@ -1056,9 +1066,9 @@ dependencies = [ [[package]] name = "darling" -version = "0.20.11" +version = "0.23.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fc7f46116c46ff9ab3eb1597a45688b6715c6e628b5c133e288e709a29bcb4ee" +checksum = "25ae13da2f202d56bd7f91c25fba009e7717a1e4a1cc98a76d844b65ae912e9d" dependencies = [ "darling_core", "darling_macro", @@ -1066,11 +1076,10 @@ dependencies = [ [[package]] name = "darling_core" -version = "0.20.11" +version = "0.23.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0d00b9596d185e565c2207a0b01f8bd1a135483d02d9b7b0a54b11da8d53412e" +checksum = "9865a50f7c335f53564bb694ef660825eb8610e0a53d3e11bf1b0d3df31e03b0" dependencies = [ - "fnv", "ident_case", "proc-macro2", "quote", @@ -1080,9 +1089,9 @@ dependencies = [ [[package]] name = "darling_macro" -version = "0.20.11" +version = "0.23.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fc34b93ccb385b40dc71c6fceac4b2ad23662c7eeb248cf10d529b7e055b6ead" +checksum = "ac3984ec7bd6cfa798e62b4a642426a5be0e68f9401cfc2a01e3fa9ea2fcdb8d" dependencies = [ "darling_core", "quote", @@ -1105,9 +1114,9 @@ dependencies = [ [[package]] name = "datafusion" -version = "52.1.0" +version = "52.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d12ee9fdc6cdb5898c7691bb994f0ba606c4acc93a2258d78bb9f26ff8158bb3" +checksum = "ea28305c211e3541c9cfcf06a23d0d8c7c824b4502ed1fdf0a6ff4ad24ee531c" dependencies = [ "arrow", "arrow-schema", @@ -1161,9 +1170,9 @@ dependencies = [ [[package]] name = "datafusion-catalog" -version = "52.1.0" +version = "52.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"462dc9ef45e5d688aeaae49a7e310587e81b6016b9d03bace5626ad0043e5a9e" +checksum = "78ab99b6df5f60a6ddbc515e4c05caee1192d395cf3cb67ce5d1c17e3c9b9b74" dependencies = [ "arrow", "async-trait", @@ -1186,9 +1195,9 @@ dependencies = [ [[package]] name = "datafusion-catalog-listing" -version = "52.1.0" +version = "52.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b96dbf1d728fc321817b744eb5080cdd75312faa6980b338817f68f3caa4208" +checksum = "77ae3d14912c0d779ada98d30dc60f3244f3c26c2446b87394629ea5c076a31c" dependencies = [ "arrow", "async-trait", @@ -1209,9 +1218,9 @@ dependencies = [ [[package]] name = "datafusion-common" -version = "52.1.0" +version = "52.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3237a6ff0d2149af4631290074289cae548c9863c885d821315d54c6673a074a" +checksum = "ea2df29b9592a5d55b8238eaf67d2f21963d5a08cd1a8b7670134405206caabd" dependencies = [ "ahash", "apache-avro", @@ -1234,9 +1243,9 @@ dependencies = [ [[package]] name = "datafusion-common-runtime" -version = "52.1.0" +version = "52.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "70b5e34026af55a1bfccb1ef0a763cf1f64e77c696ffcf5a128a278c31236528" +checksum = "42639baa0049d5fffd7e283504b9b5e7b9b2e7a2dea476eed60ab0d40d999b85" dependencies = [ "futures", "log", @@ -1245,9 +1254,9 @@ dependencies = [ [[package]] name = "datafusion-datasource" -version = "52.1.0" +version = "52.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b2a6be734cc3785e18bbf2a7f2b22537f6b9fb960d79617775a51568c281842" +checksum = "25951b617bb22a9619e1520450590cb2004bfcad10bcb396b961f4a1a10dcec5" dependencies = [ "arrow", "async-compression", @@ -1280,9 +1289,9 @@ dependencies = [ [[package]] name = "datafusion-datasource-arrow" -version = "52.1.0" +version = "52.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"1739b9b07c9236389e09c74f770e88aff7055250774e9def7d3f4f56b3dcc7be" +checksum = "dc0b28226960ba99c50d78ac6f736ebe09eb5cb3bb9bb58194266278000ca41f" dependencies = [ "arrow", "arrow-ipc", @@ -1304,9 +1313,9 @@ dependencies = [ [[package]] name = "datafusion-datasource-avro" -version = "52.1.0" +version = "52.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "828088c2fb681cc0e06fb42f541f76c82a0c10278f9fd6334e22c8d1e3574ee7" +checksum = "18de2e914c2c9ed4b31a4920940b181b0957bc164eec4fc04c294533219bf0a7" dependencies = [ "apache-avro", "arrow", @@ -1324,9 +1333,9 @@ dependencies = [ [[package]] name = "datafusion-datasource-csv" -version = "52.1.0" +version = "52.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61c73bc54b518bbba7c7650299d07d58730293cfba4356f6f428cc94c20b7600" +checksum = "f538b57b052a678b1ce860181c65d3ace5a8486312dc50b41c01dd585a773a51" dependencies = [ "arrow", "async-trait", @@ -1347,9 +1356,9 @@ dependencies = [ [[package]] name = "datafusion-datasource-json" -version = "52.1.0" +version = "52.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "37812c8494c698c4d889374ecfabbff780f1f26d9ec095dd1bddfc2a8ca12559" +checksum = "89fbc1d32b1b03c9734e27c0c5f041232b68621c8455f22769838634750a196c" dependencies = [ "arrow", "async-trait", @@ -1369,9 +1378,9 @@ dependencies = [ [[package]] name = "datafusion-datasource-parquet" -version = "52.1.0" +version = "52.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2210937ecd9f0e824c397e73f4b5385c97cd1aff43ab2b5836fcfd2d321523fb" +checksum = "203271d31fe5613a5943181db70ec98162121d1de94a9a300d5e5f19f9500a32" dependencies = [ "arrow", "async-trait", @@ -1399,15 +1408,15 @@ dependencies = [ [[package]] name = "datafusion-doc" -version = "52.1.0" +version = "52.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"2c825f969126bc2ef6a6a02d94b3c07abff871acf4d6dd759ce1255edb7923ce" +checksum = "5b6450dc702b3d39e8ced54c3356abb453bd2f3cea86d90d555a4b92f7a38462" [[package]] name = "datafusion-execution" -version = "52.1.0" +version = "52.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa03ef05a2c2f90dd6c743e3e111078e322f4b395d20d4b4d431a245d79521ae" +checksum = "e66a02fa601de49da5181dbdcf904a18b16a184db2b31f5e5534552ea2d5e660" dependencies = [ "arrow", "async-trait", @@ -1426,9 +1435,9 @@ dependencies = [ [[package]] name = "datafusion-expr" -version = "52.1.0" +version = "52.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ef33934c1f98ee695cc51192cc5f9ed3a8febee84fdbcd9131bf9d3a9a78276f" +checksum = "cdf59a9b308a1a07dc2eb2f85e6366bc0226dc390b40f3aa0a72d79f1cfe2465" dependencies = [ "arrow", "async-trait", @@ -1449,9 +1458,9 @@ dependencies = [ [[package]] name = "datafusion-expr-common" -version = "52.1.0" +version = "52.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "000c98206e3dd47d2939a94b6c67af4bfa6732dd668ac4fafdbde408fd9134ea" +checksum = "bd99eac4c6538c708638db43e7a3bd88e0e57955ddb722d420fb9a6d38dfc28f" dependencies = [ "arrow", "datafusion-common", @@ -1462,9 +1471,9 @@ dependencies = [ [[package]] name = "datafusion-functions" -version = "52.1.0" +version = "52.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "379b01418ab95ca947014066248c22139fe9af9289354de10b445bd000d5d276" +checksum = "11aa2c492ac046397b36d57c62a72982aad306495bbcbcdbcabd424d4a2fe245" dependencies = [ "arrow", "arrow-buffer", @@ -1493,9 +1502,9 @@ dependencies = [ [[package]] name = "datafusion-functions-aggregate" -version = "52.1.0" +version = "52.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fd00d5454ba4c3f8ebbd04bd6a6a9dc7ced7c56d883f70f2076c188be8459e4c" +checksum = 
"325a00081898945d48d6194d9ca26120e523c993be3bb7c084061a5a2a72e787" dependencies = [ "ahash", "arrow", @@ -1514,9 +1523,9 @@ dependencies = [ [[package]] name = "datafusion-functions-aggregate-common" -version = "52.1.0" +version = "52.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aec06b380729a87210a4e11f555ec2d729a328142253f8d557b87593622ecc9f" +checksum = "809bbcb1e0dbec5d0ce30d493d135aea7564f1ba4550395f7f94321223df2dae" dependencies = [ "ahash", "arrow", @@ -1527,9 +1536,9 @@ dependencies = [ [[package]] name = "datafusion-functions-nested" -version = "52.1.0" +version = "52.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "904f48d45e0f1eb7d0eb5c0f80f2b5c6046a85454364a6b16a2e0b46f62e7dff" +checksum = "29ebaa5d7024ef45973e0a7db1e9aeaa647936496f4d4061c0448f23d77d6320" dependencies = [ "arrow", "arrow-ord", @@ -1550,9 +1559,9 @@ dependencies = [ [[package]] name = "datafusion-functions-table" -version = "52.1.0" +version = "52.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e9a0d20e2b887e11bee24f7734d780a2588b925796ac741c3118dd06d5aa77f0" +checksum = "60eab6f39df9ee49a2c7fa38eddc01fa0086ee31b29c7d19f38e72f479609752" dependencies = [ "arrow", "async-trait", @@ -1566,9 +1575,9 @@ dependencies = [ [[package]] name = "datafusion-functions-window" -version = "52.1.0" +version = "52.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d3414b0a07e39b6979fe3a69c7aa79a9f1369f1d5c8e52146e66058be1b285ee" +checksum = "e00b2c15e342a90e65a846199c9e49293dd09fe1bcd63d8be2544604892f7eb8" dependencies = [ "arrow", "datafusion-common", @@ -1584,9 +1593,9 @@ dependencies = [ [[package]] name = "datafusion-functions-window-common" -version = "52.1.0" +version = "52.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5bf2feae63cd4754e31add64ce75cae07d015bce4bb41cd09872f93add32523a" +checksum = 
"493e2e1d1f4753dfc139a5213f1b5d0b97eea46a82d9bda3c7908aa96981b74b" dependencies = [ "datafusion-common", "datafusion-physical-expr-common", @@ -1594,9 +1603,9 @@ dependencies = [ [[package]] name = "datafusion-macros" -version = "52.1.0" +version = "52.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c4fe888aeb6a095c4bcbe8ac1874c4b9a4c7ffa2ba849db7922683ba20875aaf" +checksum = "ba01c55ade8278a791b429f7bf5cb1de64de587a342d084b18245edfae7096e2" dependencies = [ "datafusion-doc", "quote", @@ -1605,9 +1614,9 @@ dependencies = [ [[package]] name = "datafusion-optimizer" -version = "52.1.0" +version = "52.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a6527c063ae305c11be397a86d8193936f4b84d137fe40bd706dfc178cf733c" +checksum = "a80c6dfbba6a2163a9507f6353ac78c69d8deb26232c9e419160e58ff7c3e047" dependencies = [ "arrow", "chrono", @@ -1625,9 +1634,9 @@ dependencies = [ [[package]] name = "datafusion-physical-expr" -version = "52.1.0" +version = "52.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0bb028323dd4efd049dd8a78d78fe81b2b969447b39c51424167f973ac5811d9" +checksum = "5d3a86264bb9163e7360b6622e789bc7fcbb43672e78a8493f0bc369a41a57c6" dependencies = [ "ahash", "arrow", @@ -1649,9 +1658,9 @@ dependencies = [ [[package]] name = "datafusion-physical-expr-adapter" -version = "52.1.0" +version = "52.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "78fe0826aef7eab6b4b61533d811234a7a9e5e458331ebbf94152a51fc8ab433" +checksum = "3f5e00e524ac33500be6c5eeac940bd3f6b984ba9b7df0cd5f6c34a8a2cc4d6b" dependencies = [ "arrow", "datafusion-common", @@ -1664,9 +1673,9 @@ dependencies = [ [[package]] name = "datafusion-physical-expr-common" -version = "52.1.0" +version = "52.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cfccd388620734c661bd8b7ca93c44cdd59fecc9b550eea416a78ffcbb29475f" +checksum = 
"2ae769ea5d688b4e74e9be5cad6f9d9f295b540825355868a3ab942380dd97ce" dependencies = [ "ahash", "arrow", @@ -1681,9 +1690,9 @@ dependencies = [ [[package]] name = "datafusion-physical-optimizer" -version = "52.1.0" +version = "52.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bde5fa10e73259a03b705d5fddc136516814ab5f441b939525618a4070f5a059" +checksum = "f3588753ab2b47b0e43cd823fe5e7944df6734dabd6dafb72e2cc1c2a22f1944" dependencies = [ "arrow", "datafusion-common", @@ -1700,9 +1709,9 @@ dependencies = [ [[package]] name = "datafusion-physical-plan" -version = "52.1.0" +version = "52.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0e1098760fb29127c24cc9ade3277051dc73c9ed0ac0131bd7bcd742e0ad7470" +checksum = "79949cbb109c2a45c527bfe0d956b9f2916807c05d4d2e66f3fd0af827ac2b61" dependencies = [ "ahash", "arrow", @@ -1731,9 +1740,9 @@ dependencies = [ [[package]] name = "datafusion-pruning" -version = "52.1.0" +version = "52.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "64d0fef4201777b52951edec086c21a5b246f3c82621569ddb4a26f488bc38a9" +checksum = "6434e2ee8a39d04b95fed688ff34dc251af6e4a0c2e1714716b6e3846690d589" dependencies = [ "arrow", "datafusion-common", @@ -1748,9 +1757,9 @@ dependencies = [ [[package]] name = "datafusion-session" -version = "52.1.0" +version = "52.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f71f1e39e8f2acbf1c63b0e93756c2e970a64729dab70ac789587d6237c4fde0" +checksum = "c91efb8302b4877d499c37e9a71886b90236ab27d9cc42fd51112febf341abd6" dependencies = [ "async-trait", "datafusion-common", @@ -1762,9 +1771,9 @@ dependencies = [ [[package]] name = "datafusion-sql" -version = "52.1.0" +version = "52.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f44693cfcaeb7a9f12d71d1c576c3a6dc025a12cef209375fa2d16fb3b5670ee" +checksum = 
"3f01eef7bcf4d00e87305b55f1b75792384e130fe0258bac02cd48378ae5ff87" dependencies = [ "arrow", "bigdecimal", @@ -1926,7 +1935,7 @@ version = "0.11.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b2daee4ea451f429a58296525ddf28b45a3b64f1acf6587e2067437bb11e218d" dependencies = [ - "anstream", + "anstream 0.6.21", "anstyle", "env_filter", "jiff", @@ -2037,12 +2046,6 @@ dependencies = [ "thiserror 2.0.18", ] -[[package]] -name = "fnv" -version = "1.0.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" - [[package]] name = "foldhash" version = "0.1.5" @@ -2183,19 +2186,19 @@ checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd" dependencies = [ "cfg-if", "libc", - "r-efi", + "r-efi 5.3.0", "wasip2", ] [[package]] name = "getrandom" -version = "0.4.1" +version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "139ef39800118c7683f2fd3c98c1b23c09ae076556b435f8e9064ae108aaeeec" +checksum = "0de51e6874e94e7bf76d726fc5d13ba782deca734ff60d5bb2fb2607c7406555" dependencies = [ "cfg-if", "libc", - "r-efi", + "r-efi 6.0.0", "wasip2", "wasip3", ] @@ -2505,14 +2508,14 @@ dependencies = [ [[package]] name = "indicatif" -version = "0.17.11" +version = "0.18.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "183b3088984b400f4cfac3620d5e076c84da5364016b4f49473de574b2586235" +checksum = "25470f23803092da7d239834776d653104d551bc4d7eacaf31e6837854b8e9eb" dependencies = [ - "console 0.15.11", - "number_prefix", + "console", "portable-atomic", "unicode-width", + "unit-prefix", "web-time", ] @@ -2580,9 +2583,9 @@ checksum = "92ecc6618181def0457392ccd0ee51198e065e016d1d527a7ac1b6dc7c1f09d2" [[package]] name = "jiff" -version = "0.2.21" +version = "0.2.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"b3e3d65f018c6ae946ab16e80944b97096ed73c35b221d1c478a6c81d8f57940" +checksum = "1a3546dc96b6d42c5f24902af9e2538e82e39ad350b0c766eb3fbf2d8f3d8359" dependencies = [ "jiff-static", "log", @@ -2593,9 +2596,9 @@ dependencies = [ [[package]] name = "jiff-static" -version = "0.2.21" +version = "0.2.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a17c2b211d863c7fde02cbea8a3c1a439b98e109286554f2860bdded7ff83818" +checksum = "2a8c8b344124222efd714b73bb41f8b5120b27a7cc1c75593a6ff768d9d05aa4" dependencies = [ "proc-macro2", "quote", @@ -2614,9 +2617,9 @@ dependencies = [ [[package]] name = "js-sys" -version = "0.3.88" +version = "0.3.91" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c7e709f3e3d22866f9c25b3aff01af289b18422cc8b4262fb19103ee80fe513d" +checksum = "b49715b7073f385ba4bc528e5747d02e66cb39c6146efb66b781f131f0fb399c" dependencies = [ "once_cell", "wasm-bindgen", @@ -2693,9 +2696,9 @@ checksum = "2c4a545a15244c7d945065b5d392b2d2d7f21526fba56ce51467b06ed445e8f7" [[package]] name = "libc" -version = "0.2.182" +version = "0.2.183" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6800badb6cb2082ffd7b6a67e6125bb39f18782f793520caee8cb8846be06112" +checksum = "b5b646652bf6661599e1da8901b3b9522896f01e736bad5f723fe7a3a27f899d" [[package]] name = "liblzma" @@ -2767,18 +2770,18 @@ checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897" [[package]] name = "lz4_flex" -version = "0.11.5" +version = "0.11.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08ab2867e3eeeca90e844d1940eab391c9dc5228783db2ed999acbc0a9ed375a" +checksum = "373f5eceeeab7925e0c1098212f2fbc4d416adec9d35051a6ab251e824c1854a" dependencies = [ "twox-hash", ] [[package]] name = "lz4_flex" -version = "0.12.0" +version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ab6473172471198271ff72e9379150e9dfd70d8e533e0752a27e515b48dd375e" 
+checksum = "98c23545df7ecf1b16c303910a69b079e8e251d60f7dd2cc9b4177f2afaf1746" dependencies = [ "twox-hash", ] @@ -2973,12 +2976,6 @@ dependencies = [ "libm", ] -[[package]] -name = "number_prefix" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3" - [[package]] name = "object" version = "0.37.3" @@ -3014,9 +3011,9 @@ dependencies = [ [[package]] name = "once_cell" -version = "1.21.3" +version = "1.21.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" +checksum = "9f7c3e4beb33f85d45ae3e3a1792185706c8e16d043238c593331cc7cd313b50" [[package]] name = "once_cell_polyfill" @@ -3046,7 +3043,7 @@ dependencies = [ "flate2", "futures", "futures-util", - "lz4_flex 0.11.5", + "lz4_flex 0.11.6", "lzokay-native", "num", "prost", @@ -3119,7 +3116,7 @@ dependencies = [ "futures", "half", "hashbrown 0.16.1", - "lz4_flex 0.12.0", + "lz4_flex 0.12.1", "num-bigint", "num-integer", "num-traits", @@ -3205,18 +3202,18 @@ dependencies = [ [[package]] name = "pin-project" -version = "1.1.10" +version = "1.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "677f1add503faace112b9f1373e43e9e054bfdd22ff1a63c1bc485eaec6a6a8a" +checksum = "f1749c7ed4bcaf4c3d0a3efc28538844fb29bcdd7d2b67b2be7e20ba861ff517" dependencies = [ "pin-project-internal", ] [[package]] name = "pin-project-internal" -version = "1.1.10" +version = "1.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e918e4ff8c4549eb882f14b3a4bc8c8bc93de829416eacf579f1207a8fbf861" +checksum = "d9b20ed30f105399776b9c883e68e536ef602a16ae6f596d2c473591d6ad64c6" dependencies = [ "proc-macro2", "quote", @@ -3225,9 +3222,9 @@ dependencies = [ [[package]] name = "pin-project-lite" -version = "0.2.16" +version = "0.2.17" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "3b3cff922bd51709b605d9ead9aa71031d81447142d828eb4a6eba76fe619f9b" +checksum = "a89322df9ebe1c1578d689c92318e070967d1042b512afbe49518723f4e6d5cd" [[package]] name = "pin-utils" @@ -3277,9 +3274,9 @@ checksum = "c33a9471896f1c69cecef8d20cbe2f7accd12527ce60845ff44c153bb2a21b49" [[package]] name = "portable-atomic-util" -version = "0.2.5" +version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a9db96d7fa8782dd8c15ce32ffe8680bbd1e978a43bf51a34d39483540495f5" +checksum = "091397be61a01d4be58e7841595bd4bfedb15f1cd54977d79b8271e94ed799a3" dependencies = [ "portable-atomic", ] @@ -3371,9 +3368,9 @@ checksum = "5a651516ddc9168ebd67b24afd085a718be02f8858fe406591b013d101ce2f40" [[package]] name = "quote" -version = "1.0.44" +version = "1.0.45" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "21b2ebcf727b7760c461f091f9f0f539b77b8e87f2fd88131e7f1b433b3cece4" +checksum = "41f2619966050689382d2b44f664f4bc593e129785a36d6ee376ddf37259b924" dependencies = [ "proc-macro2", ] @@ -3384,6 +3381,12 @@ version = "5.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" +[[package]] +name = "r-efi" +version = "6.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8dcc9c7d52a811697d2151c701e0d08956f92b0e24136cf4cf27b57a6a0d9bf" + [[package]] name = "radix_trie" version = "0.2.1" @@ -3564,9 +3567,9 @@ checksum = "cab834c73d247e67f4fae452806d17d3c7501756d98c8808d7c9c7aa7d18f973" [[package]] name = "regex-syntax" -version = "0.8.9" +version = "0.8.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a96887878f22d7bad8a3b6dc5b7440e0ada9a245242924394987b21cf2210a4c" +checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a" [[package]] name = "reservoir-sampling" @@ -3581,9 +3584,9 @@ 
dependencies = [ [[package]] name = "rust_xlsxwriter" -version = "0.93.0" +version = "0.94.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "84205e093885c6b4e2d93e225a148df2e7fe67a668c27e3322e56e0d93721848" +checksum = "efc4a0f1f7b425669996977016152b2939be9be44d40df252a5051c9c6b3b859" dependencies = [ "chrono", "zip", @@ -3997,12 +4000,12 @@ dependencies = [ [[package]] name = "tempfile" -version = "3.25.0" +version = "3.27.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0136791f7c95b1f6dd99f9cc786b91bb81c3800b639b3478e561ddb7be95e5f1" +checksum = "32497e9a4c7b38532efcdebeef879707aa9f794296a4f0244f6f69e9bc8574bd" dependencies = [ "fastrand", - "getrandom 0.4.1", + "getrandom 0.4.2", "once_cell", "rustix", "windows-sys 0.61.2", @@ -4111,9 +4114,9 @@ dependencies = [ [[package]] name = "tokio" -version = "1.49.0" +version = "1.50.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72a2903cd7736441aac9df9d7688bd0ce48edccaadf181c3b90be801e81d3d86" +checksum = "27ad5e34374e03cfffefc301becb44e9dc3c17584f414349ebe29ed26661822d" dependencies = [ "bytes", "pin-project-lite", @@ -4122,9 +4125,9 @@ dependencies = [ [[package]] name = "tokio-macros" -version = "2.6.0" +version = "2.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "af407857209536a95c8e56f8231ef2c2e2aff839b22e07a1ffcbc617e9db9fa5" +checksum = "5c55a2eff8b69ce66c84f85e1da1c233edc36ceb85a2058d11b0d6a3c7e7569c" dependencies = [ "proc-macro2", "quote", @@ -4243,6 +4246,12 @@ version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853" +[[package]] +name = "unit-prefix" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "81e544489bf3d8ef66c953931f56617f423cd4b5494be343d9b9d3dda037b9a3" + [[package]] name = "unsafe-libyaml" version = "0.2.11" @@ 
-4275,11 +4284,11 @@ checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" [[package]] name = "uuid" -version = "1.21.0" +version = "1.22.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b672338555252d43fd2240c714dc444b8c6fb0a5c5335e65a07bba7742735ddb" +checksum = "a68d3c8f01c0cfa54a75291d83601161799e4a89a39e0929f4b0354d88757a37" dependencies = [ - "getrandom 0.4.1", + "getrandom 0.4.2", "js-sys", "serde_core", "wasm-bindgen", @@ -4327,9 +4336,9 @@ dependencies = [ [[package]] name = "wasm-bindgen" -version = "0.2.111" +version = "0.2.114" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec1adf1535672f5b7824f817792b1afd731d7e843d2d04ec8f27e8cb51edd8ac" +checksum = "6532f9a5c1ece3798cb1c2cfdba640b9b3ba884f5db45973a6f442510a87d38e" dependencies = [ "cfg-if", "once_cell", @@ -4340,9 +4349,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-futures" -version = "0.4.61" +version = "0.4.64" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fe88540d1c934c4ec8e6db0afa536876c5441289d7f9f9123d4f065ac1250a6b" +checksum = "e9c5522b3a28661442748e09d40924dfb9ca614b21c00d3fd135720e48b67db8" dependencies = [ "cfg-if", "futures-util", @@ -4354,9 +4363,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.111" +version = "0.2.114" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "19e638317c08b21663aed4d2b9a2091450548954695ff4efa75bff5fa546b3b1" +checksum = "18a2d50fcf105fb33bb15f00e7a77b772945a2ee45dcf454961fd843e74c18e6" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -4364,9 +4373,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.111" +version = "0.2.114" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c64760850114d03d5f65457e96fc988f11f01d38fbaa51b254e4ab5809102af" +checksum = 
"03ce4caeaac547cdf713d280eda22a730824dd11e6b8c3ca9e42247b25c631e3" dependencies = [ "bumpalo", "proc-macro2", @@ -4377,9 +4386,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-shared" -version = "0.2.111" +version = "0.2.114" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "60eecd4fe26177cfa3339eb00b4a36445889ba3ad37080c2429879718e20ca41" +checksum = "75a326b8c223ee17883a4251907455a2431acc2791c98c26279376490c378c16" dependencies = [ "unicode-ident", ] @@ -4420,9 +4429,9 @@ dependencies = [ [[package]] name = "web-sys" -version = "0.3.88" +version = "0.3.91" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9d6bb20ed2d9572df8584f6dc81d68a41a625cadc6f15999d649a70ce7e3597a" +checksum = "854ba17bb104abfb26ba36da9729addc7ce7f06f5c0f90f3c391f8461cca21f9" dependencies = [ "js-sys", "wasm-bindgen", @@ -4869,18 +4878,18 @@ dependencies = [ [[package]] name = "zerocopy" -version = "0.8.39" +version = "0.8.42" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "db6d35d663eadb6c932438e763b262fe1a70987f9ae936e60158176d710cae4a" +checksum = "f2578b716f8a7a858b7f02d5bd870c14bf4ddbbcf3a4c05414ba6503640505e3" dependencies = [ "zerocopy-derive", ] [[package]] name = "zerocopy-derive" -version = "0.8.39" +version = "0.8.42" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4122cd3169e94605190e77839c9a40d40ed048d305bfdc146e7df40ab0f3e517" +checksum = "7e6cc098ea4d3bd6246687de65af3f920c430e236bee1e3bf2e441463f08a02f" dependencies = [ "proc-macro2", "quote", @@ -4957,9 +4966,9 @@ dependencies = [ [[package]] name = "zlib-rs" -version = "0.6.2" +version = "0.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c745c48e1007337ed136dc99df34128b9faa6ed542d80a1c673cf55a6d7236c8" +checksum = "3be3d40e40a133f9c916ee3f9f4fa2d9d63435b5fbe1bfc6d9dae0aa0ada1513" [[package]] name = "zmij" diff --git a/Cargo.toml b/Cargo.toml index f0305ac..ecd6387 100644 
--- a/Cargo.toml +++ b/Cargo.toml @@ -11,33 +11,33 @@ exclude = [ ] [dependencies] -anyhow = "1.0.100" +anyhow = "1.0" async-trait = "0.1" -arrow = "57.2.0" +arrow = "57.2" log = "0.4" -arrow-avro = "57.2.0" -arrow-json = "57.2.0" -clap = { version = "4.5.54", features = ["cargo", "derive"] } +arrow-avro = "57.2" +arrow-json = "57.2" +clap = { version = "4.6", features = ["cargo", "derive"] } chrono = "0.4" -datafusion = { version = "52.1.0", features = ["avro"] } -parquet = "57.2.0" -orc-rust = "0.7" +datafusion = { version = "52.3", features = ["avro"] } +parquet = "57.2" +orc-rust = "0.7.1" futures = "0.3" -tokio = { version = "1.48", features = ["macros", "rt", "sync"] } -thiserror = "2.0.18" -csv = "1.4.0" +tokio = { version = "1.50", features = ["macros", "rt", "sync"] } +thiserror = "2.0" +csv = "1.4" hashlink = "0.10" saphyr = "0.0.6" serde = { version = "1.0", features = ["derive"] } serde_json = "1.0" -rust_xlsxwriter = { version = "0.93", features = ["chrono"] } +rust_xlsxwriter = { version = "0.94", features = ["chrono"] } rustc-literal-escaper = "0.0.7" -rustyline = "17.0" -flt = { version = "0.0.2" } -dirs = { version = "2.0.0", package = "dirs-next" } +rustyline = "17" +flt = "0.0.2" +dirs = { version = "2.0", package = "dirs-next" } rand = "0.8" -reservoir-sampling = "0.5" -indicatif = "0.17" +reservoir-sampling = "0.5.1" +indicatif = "0.18" [dev-dependencies] criterion = "0.5" From b499ea4f91b1ac9f6711540c6881b40c944a9d19 Mon Sep 17 00:00:00 2001 From: Alistair Israel Date: Sun, 15 Mar 2026 13:44:40 -0400 Subject: [PATCH 15/18] Add Avro compatibility for Int16 fields in record batches - Introduced functions to check for Int16 fields in schemas and convert them to Int32 for Avro compatibility. - Implemented a wrapper for record batch readers that casts Int16 columns to Int32. - Updated the `write_record_batches` function to handle schema adjustments before writing to Avro files. 
- Added a test to verify that Int16 values are correctly upcast to Int32 in the written Avro output. --- benches/parquet_to_avro.rs | 5 +- src/pipeline/avro.rs | 129 ++++++++++++++++++++++++++++++++++++- 2 files changed, 132 insertions(+), 2 deletions(-) diff --git a/benches/parquet_to_avro.rs b/benches/parquet_to_avro.rs index b6a4bad..2b88d91 100644 --- a/benches/parquet_to_avro.rs +++ b/benches/parquet_to_avro.rs @@ -12,10 +12,13 @@ fn parquet_to_avro_benchmark(c: &mut Criterion) { let output_path = temp_dir.path().join("output.avro"); let output = output_path.to_str().expect("Path to string").to_string(); + let input_path = + std::env::var("DATU_BENCH_PARQUET_PATH").unwrap_or("fixtures/userdata.parquet".to_string()); + c.bench_function("parquet_to_avro_userdata", |b| { b.iter(|| { let result = Command::new(&datu_path) - .args(["convert", "fixtures/userdata.parquet", black_box(&output)]) + .args(["convert", &input_path, black_box(&output)]) .output() .expect("Failed to execute datu"); assert!( diff --git a/src/pipeline/avro.rs b/src/pipeline/avro.rs index 046f660..67b34a0 100644 --- a/src/pipeline/avro.rs +++ b/src/pipeline/avro.rs @@ -1,6 +1,12 @@ use std::io::BufReader; +use std::sync::Arc; use arrow::array::RecordBatchReader; +use arrow::compute; +use arrow::datatypes::DataType; +use arrow::datatypes::Field; +use arrow::datatypes::Schema; +use arrow::datatypes::SchemaRef; use arrow::record_batch::RecordBatch; use arrow_avro::reader::ReaderBuilder; use arrow_avro::writer::AvroWriter; @@ -16,6 +22,71 @@ use crate::pipeline::WriteArgs; use crate::pipeline::batch_write::BatchWriteSink; use crate::pipeline::batch_write::write_record_batches_with_sink; +/// Returns true if the schema has any Int16 field (avro writer does not support Int16). +fn schema_has_int16(schema: &Schema) -> bool { + schema + .fields() + .iter() + .any(|f| f.data_type() == &DataType::Int16) +} + +/// Builds a schema suitable for the Avro writer: Int16 fields are replaced with Int32. 
+fn schema_for_avro_writer(schema: &Schema) -> SchemaRef { + let fields: Vec = schema + .fields() + .iter() + .map(|f| { + let dt = if f.data_type() == &DataType::Int16 { + DataType::Int32 + } else { + f.data_type().clone() + }; + Field::new(f.name(), dt, f.is_nullable()) + }) + .collect(); + Arc::new(Schema::new(fields)) +} + +/// Casts Int16 columns in the batch to Int32 so the batch matches the Avro-compat schema. +fn cast_record_batch_for_avro( + batch: &RecordBatch, + compat_schema: &SchemaRef, +) -> arrow::error::Result { + let mut columns: Vec> = Vec::with_capacity(batch.num_columns()); + for (i, field) in compat_schema.fields().iter().enumerate() { + let col = batch.column(i); + let target_type = field.data_type(); + let cast_col = if col.data_type() == &DataType::Int16 && target_type == &DataType::Int32 { + compute::cast(col.as_ref(), target_type)? + } else { + col.clone() + }; + columns.push(cast_col); + } + RecordBatch::try_new((*compat_schema).clone(), columns) +} + +/// Wraps a record batch reader and exposes an Avro-compat schema (Int16 → Int32), casting each batch. +struct AvroCompatRecordBatchReader<'a> { + reader: &'a mut dyn RecordBatchReader, + compat_schema: SchemaRef, +} + +impl RecordBatchReader for AvroCompatRecordBatchReader<'_> { + fn schema(&self) -> SchemaRef { + self.compat_schema.clone() + } +} + +impl Iterator for AvroCompatRecordBatchReader<'_> { + type Item = arrow::error::Result; + + fn next(&mut self) -> Option { + let batch = self.reader.next()?; + Some(batch.and_then(|b| cast_record_batch_for_avro(&b, &self.compat_schema))) + } +} + /// Pipeline step that reads an Avro file and produces a record batch reader. pub struct ReadAvroStep { pub args: ReadArgs, @@ -145,8 +216,19 @@ pub struct WriteAvroStep { pub struct WriteAvroResult {} /// Write record batches from a reader to an Avro file. +/// Int16 columns are upcast to Int32 so the arrow-avro writer can write them. 
pub fn write_record_batches(path: &str, reader: &mut dyn RecordBatchReader) -> Result<()> { - write_record_batches_with_sink(path, reader, AvroSink::new) + let schema = reader.schema(); + if schema_has_int16(schema.as_ref()) { + let compat_schema = schema_for_avro_writer(schema.as_ref()); + let mut wrapper = AvroCompatRecordBatchReader { + reader, + compat_schema, + }; + write_record_batches_with_sink(path, &mut wrapper, AvroSink::new) + } else { + write_record_batches_with_sink(path, reader, AvroSink::new) + } } struct AvroSink { @@ -186,9 +268,54 @@ impl Step for WriteAvroStep { #[cfg(test)] mod tests { + use std::io::BufReader; + use std::sync::Arc; + + use arrow::array::Int16Array; + use arrow::datatypes::DataType; + use arrow::datatypes::Field; + use arrow::datatypes::Schema; + use super::*; use crate::Error; use crate::pipeline::ReadArgs; + use crate::pipeline::VecRecordBatchReader; + + #[test] + fn test_write_avro_int16_upcast_to_int32() { + let schema = Schema::new(vec![Field::new("x", DataType::Int16, false)]); + let batch = RecordBatch::try_new( + Arc::new(schema), + vec![Arc::new(Int16Array::from(vec![1_i16, 2, 3]))], + ) + .unwrap(); + let temp_dir = tempfile::tempdir().unwrap(); + let path = temp_dir.path().join("out.avro"); + let mut reader = VecRecordBatchReader::new(vec![batch]); + write_record_batches(path.to_str().unwrap(), &mut reader).unwrap(); + let file = std::fs::File::open(&path).unwrap(); + let avro_reader = ReaderBuilder::new() + .build(BufReader::new(file)) + .expect("written file should be valid Avro"); + let batches: Vec<_> = avro_reader.collect(); + assert_eq!(batches.len(), 1, "expected one batch"); + let batch = batches.into_iter().next().unwrap().unwrap(); + assert_eq!(batch.num_rows(), 3); + assert_eq!(batch.num_columns(), 1); + let col = batch.column(0); + assert_eq!( + col.data_type(), + &DataType::Int32, + "Avro int is read as Int32" + ); + let ints = col + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(ints.value(0), 
1); + assert_eq!(ints.value(1), 2); + assert_eq!(ints.value(2), 3); + } #[test] fn test_read_avro() { From b4abc0c674ab29109a997f42fe437ad5ed0ada00 Mon Sep 17 00:00:00 2001 From: Alistair Israel Date: Sun, 15 Mar 2026 14:02:27 -0400 Subject: [PATCH 16/18] Added .clocignore --- .clocignore | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 .clocignore diff --git a/.clocignore b/.clocignore new file mode 100644 index 0000000..9eed7b0 --- /dev/null +++ b/.clocignore @@ -0,0 +1,2 @@ +fixtures/ +target/ From 2810db232ab5f0cc54b0fb62286adffd2adc4171 Mon Sep 17 00:00:00 2001 From: Alistair Israel Date: Sun, 15 Mar 2026 14:03:09 -0400 Subject: [PATCH 17/18] Use eyre instead of anyhow --- Cargo.lock | 18 +++++++++++++++++- Cargo.toml | 2 +- benches/parquet_to_avro.rs | 32 ++++++++++++++++++++++++++++---- src/bin/datu/commands/convert.rs | 2 +- src/bin/datu/commands/count.rs | 2 +- src/bin/datu/commands/head.rs | 4 ++-- src/bin/datu/commands/sample.rs | 4 ++-- src/bin/datu/commands/schema.rs | 2 +- src/bin/datu/commands/tail.rs | 4 ++-- src/bin/datu/main.rs | 2 +- src/bin/datu/repl.rs | 2 +- src/pipeline.rs | 10 ++++------ src/pipeline/schema.rs | 6 +++--- 13 files changed, 64 insertions(+), 26 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 376f122..5fd2eb1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1791,7 +1791,6 @@ dependencies = [ name = "datu" version = "0.3.1" dependencies = [ - "anyhow", "arrow", "arrow-avro", "arrow-json", @@ -1804,6 +1803,7 @@ dependencies = [ "datafusion", "dirs-next", "expectrl", + "eyre", "flt", "futures", "gherkin", @@ -1976,6 +1976,16 @@ dependencies = [ "regex", ] +[[package]] +name = "eyre" +version = "0.6.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7cd915d99f24784cdc19fd37ef22b97e3ff0ae756c7e492e9fbfe897d61e2aec" +dependencies = [ + "indenter", + "once_cell", +] + [[package]] name = "fallible-streaming-iterator" version = "0.1.9" @@ -2494,6 +2504,12 @@ dependencies = [ "winapi-util", ] 
+[[package]] +name = "indenter" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "964de6e86d545b246d84badc0fef527924ace5134f30641c203ef52ba83f58d5" + [[package]] name = "indexmap" version = "2.13.0" diff --git a/Cargo.toml b/Cargo.toml index ecd6387..c85ccc1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -11,7 +11,7 @@ exclude = [ ] [dependencies] -anyhow = "1.0" +eyre = "0.6" async-trait = "0.1" arrow = "57.2" log = "0.4" diff --git a/benches/parquet_to_avro.rs b/benches/parquet_to_avro.rs index 2b88d91..4ce0ea8 100644 --- a/benches/parquet_to_avro.rs +++ b/benches/parquet_to_avro.rs @@ -4,6 +4,20 @@ use criterion::Criterion; use criterion::black_box; use criterion::criterion_group; use criterion::criterion_main; +use dirs::home_dir; + +/// Expand ~ in the path to the home directory +fn expand_path(path: &str) -> eyre::Result { + if path.starts_with("~") { + let home = home_dir().ok_or(eyre::eyre!("Failed to get home directory"))?; + Ok(home + .join(path.trim_start_matches("~")) + .to_string_lossy() + .to_string()) + } else { + Ok(path.to_string()) + } +} fn parquet_to_avro_benchmark(c: &mut Criterion) { let datu_path = std::env::var("CARGO_BIN_EXE_datu") @@ -12,10 +26,20 @@ fn parquet_to_avro_benchmark(c: &mut Criterion) { let output_path = temp_dir.path().join("output.avro"); let output = output_path.to_str().expect("Path to string").to_string(); - let input_path = - std::env::var("DATU_BENCH_PARQUET_PATH").unwrap_or("fixtures/userdata.parquet".to_string()); - - c.bench_function("parquet_to_avro_userdata", |b| { + let mut id = String::from("parquet_to_avro"); + let input_path = if let Ok(value) = std::env::var("DATU_BENCH_PARQUET_PATH") { + let path = expand_path(&value).unwrap_or_else(|e| { + panic!("Failed to expand path: {}", e); + }); + if !std::path::Path::new(&path).exists() { + panic!("DATU_BENCH_PARQUET_PATH does not exist: {}", value); + } + id = format!("parquet_to_avro ({})", value); + path + } else { + 
"fixtures/userdata.parquet".to_string() + }; + c.bench_function(id.as_str(), |b| { b.iter(|| { let result = Command::new(&datu_path) .args(["convert", &input_path, black_box(&output)]) diff --git a/src/bin/datu/commands/convert.rs b/src/bin/datu/commands/convert.rs index 60bc839..e28feaa 100644 --- a/src/bin/datu/commands/convert.rs +++ b/src/bin/datu/commands/convert.rs @@ -114,7 +114,7 @@ impl RecordBatchReader for ProgressRecordBatchReader { } /// Converts between file formats; reads from input and writes to output, optionally selecting columns. -pub async fn convert(args: ConvertArgs) -> anyhow::Result<()> { +pub async fn convert(args: ConvertArgs) -> eyre::Result<()> { let input_file_type = resolve_file_type(args.input, &args.input_path)?; let output_file_type = resolve_file_type(args.output, &args.output_path)?; diff --git a/src/bin/datu/commands/count.rs b/src/bin/datu/commands/count.rs index 48a8a64..55da118 100644 --- a/src/bin/datu/commands/count.rs +++ b/src/bin/datu/commands/count.rs @@ -6,7 +6,7 @@ use datu::resolve_file_type; /// The `datu count` command. Uses metadata for Parquet and ORC (no data read); /// streams batches for Avro and CSV. 
-pub async fn count(args: CountArgs) -> anyhow::Result<()> { +pub async fn count(args: CountArgs) -> eyre::Result<()> { let file_type = resolve_file_type(args.input, &args.input_path)?; let total = count_rows(&args.input_path, file_type, args.input_headers)?; println!("{total}"); diff --git a/src/bin/datu/commands/head.rs b/src/bin/datu/commands/head.rs index 445ad5b..a520f1d 100644 --- a/src/bin/datu/commands/head.rs +++ b/src/bin/datu/commands/head.rs @@ -1,11 +1,11 @@ -use anyhow::Result; -use anyhow::bail; use datu::FileType; use datu::cli::HeadsOrTails; use datu::pipeline::build_reader; use datu::pipeline::display::apply_select_and_display; use datu::pipeline::record_batch_filter::parse_select_step; use datu::resolve_file_type; +use eyre::Result; +use eyre::bail; /// head command implementation: print the first N lines of an Avro, CSV, Parquet, or ORC file. pub async fn head(args: HeadsOrTails) -> Result<()> { diff --git a/src/bin/datu/commands/sample.rs b/src/bin/datu/commands/sample.rs index 292e04e..6f37ee2 100644 --- a/src/bin/datu/commands/sample.rs +++ b/src/bin/datu/commands/sample.rs @@ -1,5 +1,3 @@ -use anyhow::Result; -use anyhow::bail; use datu::FileType; use datu::cli::HeadsOrTails; use datu::get_total_rows_result; @@ -14,6 +12,8 @@ use datu::pipeline::record_batch_filter::parse_select_step; use datu::pipeline::reservoir_sample_from_reader; use datu::pipeline::sample_from_reader; use datu::resolve_file_type; +use eyre::Result; +use eyre::bail; /// sample command implementation: print N random rows from an Avro, CSV, Parquet, or ORC file. pub async fn sample(args: HeadsOrTails) -> Result<()> { diff --git a/src/bin/datu/commands/schema.rs b/src/bin/datu/commands/schema.rs index 17a7375..3edbd72 100644 --- a/src/bin/datu/commands/schema.rs +++ b/src/bin/datu/commands/schema.rs @@ -1,10 +1,10 @@ //! 
`datu schema` - display the schema of a Parquet, Avro, or ORC file -use anyhow::Result; use datu::cli::SchemaArgs; use datu::pipeline::schema::get_schema_fields; use datu::pipeline::schema::print_schema_fields; use datu::resolve_file_type; +use eyre::Result; /// The `datu schema` command pub async fn schema(args: SchemaArgs) -> Result<()> { diff --git a/src/bin/datu/commands/tail.rs b/src/bin/datu/commands/tail.rs index 4ceea8e..9f92d02 100644 --- a/src/bin/datu/commands/tail.rs +++ b/src/bin/datu/commands/tail.rs @@ -1,5 +1,3 @@ -use anyhow::Result; -use anyhow::bail; use datu::Error; use datu::FileType; use datu::cli::HeadsOrTails; @@ -13,6 +11,8 @@ use datu::pipeline::read_to_batches; use datu::pipeline::record_batch_filter::parse_select_step; use datu::pipeline::tail_batches; use datu::resolve_file_type; +use eyre::Result; +use eyre::bail; /// tail command implementation: print the last N lines of an Avro, CSV, Parquet, or ORC file. pub async fn tail(args: HeadsOrTails) -> Result<()> { diff --git a/src/bin/datu/main.rs b/src/bin/datu/main.rs index e436f9b..55da0ea 100644 --- a/src/bin/datu/main.rs +++ b/src/bin/datu/main.rs @@ -45,7 +45,7 @@ pub enum Command { /// Application entry point; parses CLI args and dispatches to the appropriate command. 
#[tokio::main] -async fn main() -> anyhow::Result<()> { +async fn main() -> eyre::Result<()> { let cli = Cli::parse(); match cli.command { None => repl::run().await, diff --git a/src/bin/datu/repl.rs b/src/bin/datu/repl.rs index 495251b..6cf1874 100644 --- a/src/bin/datu/repl.rs +++ b/src/bin/datu/repl.rs @@ -2,8 +2,8 @@ use std::path::PathBuf; -use anyhow::Result; use datu::cli::repl::ReplPipelineBuilder; +use eyre::Result; use flt::parser::parse_expr; use rustyline::DefaultEditor; use rustyline::error::ReadlineError; diff --git a/src/pipeline.rs b/src/pipeline.rs index 405b688..7e4104f 100644 --- a/src/pipeline.rs +++ b/src/pipeline.rs @@ -228,7 +228,7 @@ pub async fn read_to_batches( select: &Option<Vec<String>>, limit: Option<usize>, csv_has_header: Option<bool>, -) -> anyhow::Result<Vec<RecordBatch>> { +) -> eyre::Result<Vec<RecordBatch>> { let source = { let select = select.clone(); DataFrameReader::new(input_path, input_file_type, select, limit, csv_has_header) @@ -237,7 +237,7 @@ pub async fn read_to_batches( .await?; let reader = DataFrameToBatchReader::try_new(source) .await - .map_err(|e| anyhow::anyhow!("{e}"))?; + .map_err(|e| eyre::eyre!("{e}"))?; Ok(reader.into_batches()) } @@ -390,11 +390,9 @@ pub async fn write_batches( output_file_type: FileType, sparse: bool, json_pretty: bool, -) -> anyhow::Result<()> { +) -> eyre::Result<()> { let ctx = datafusion::execution::context::SessionContext::new(); - let df = ctx - .read_batches(batches) - .map_err(|e| anyhow::anyhow!("{e}"))?; + let df = ctx.read_batches(batches).map_err(|e| eyre::eyre!("{e}"))?; let source = crate::pipeline::dataframe::DataFrameSource::new(df); let writer_step = crate::pipeline::dataframe::DataFrameWriter::new( diff --git a/src/pipeline/schema.rs b/src/pipeline/schema.rs index 00cbb6c..31ff698 100644 --- a/src/pipeline/schema.rs +++ b/src/pipeline/schema.rs @@ -7,13 +7,13 @@ use std::fs::File; use std::io::BufReader; use std::sync::Arc; -use anyhow::Result; -use anyhow::bail; use arrow::array::RecordBatchReader; use arrow::datatypes::Schema
as ArrowSchema; use arrow_avro::reader::ReaderBuilder; use datafusion::execution::context::SessionContext; use datafusion::prelude::CsvReadOptions; +use eyre::Result; +use eyre::bail; use orc_rust::arrow_reader::ArrowReaderBuilder; use parquet::basic::ConvertedType; use parquet::file::metadata::ParquetMetaDataReader; @@ -202,7 +202,7 @@ fn get_schema_fields_csv(path: &str, has_header: bool) -> Result Date: Sun, 15 Mar 2026 14:14:26 -0400 Subject: [PATCH 18/18] 0.3.2 --- CHANGELOG.md | 39 +++++++++++++++++++++++++++++++++++---- Cargo.lock | 2 +- Cargo.toml | 2 +- features/cli/cli.feature | 2 +- 4 files changed, 38 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 604acb7..4c2c2b9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,8 +1,41 @@ # datu Version Notes -## v0.3.1 +## v0.3.2 + +### Highlights + +- **Convert command** now uses the DataFrame API for streamlined file processing; internal `resolve_input_file_type` renamed to `resolve_file_type`. +- **Avro**: Added compatibility for Int16 fields in record batches. +- **Error handling**: Replaced `anyhow` with `eyre`. +- **REPL**: Refactored evaluation to use `exec_*` naming and improved test structure; feature files updated with REPL equivalents. +- **Docs**: README updates for JSON support and REPL usage. +- **CI**: Switched to `actions-rust-lang/setup-rust-toolchain@v1`, Rust 1.94.0, no nightly. + +### Improvements + +- **Convert** + - Implemented DataFrame API support for file processing in the convert command. + +- **Avro pipeline** + - Int16 fields in record batches are now handled correctly when writing Avro. -Compare range: `0.3.0...0.3.1` +- **REPL** + - Evaluation methods refactored to `exec_*` naming convention. + - Test structure and REPL feature scenarios updated. + +- **Project** + - Added `.clocignore`. + - Dependency and version updates in Cargo files. + - CI workflow and toolchain adjustments. 
+ +### Changelog Stats + +- 16 commits +- 28 files changed +- 754 insertions +- 610 deletions + +## v0.3.1 ### Highlights @@ -42,8 +75,6 @@ Compare range: `0.3.0...0.3.1` ## v0.3.0 -Compare range: `0.2.4...0.3.0` - ### Highlights - Added an interactive REPL mode: running `datu` without a subcommand now starts an interactive pipeline shell. diff --git a/Cargo.lock b/Cargo.lock index 5fd2eb1..2ea77e3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1789,7 +1789,7 @@ dependencies = [ [[package]] name = "datu" -version = "0.3.1" +version = "0.3.2" dependencies = [ "arrow", "arrow-avro", diff --git a/Cargo.toml b/Cargo.toml index c85ccc1..5e154fa 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "datu" -version = "0.3.1" +version = "0.3.2" edition = "2024" description = "datu - a data file utility" license = "MIT" diff --git a/features/cli/cli.feature b/features/cli/cli.feature index a76f54e..153205a 100644 --- a/features/cli/cli.feature +++ b/features/cli/cli.feature @@ -2,7 +2,7 @@ Feature: CLI Scenario: Print version When I run `datu --version` - Then the first line of the output should be: datu 0.3.1 + Then the first line of the output should be: datu 0.3.2 Scenario: Print help with help subcommand When I run `datu help`