From 086e8bf41494f12634b604cdbdc96ef7247e9c82 Mon Sep 17 00:00:00 2001 From: hypsakata <46911464+hypsakata@users.noreply.github.com> Date: Mon, 29 Dec 2025 18:47:02 +0900 Subject: [PATCH 1/3] Move Integer builder creation into create_builder --- ruby/red-arrow/lib/arrow/array-builder.rb | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/ruby/red-arrow/lib/arrow/array-builder.rb b/ruby/red-arrow/lib/arrow/array-builder.rb index 2ccf50f3c1b..12a90b07224 100644 --- a/ruby/red-arrow/lib/arrow/array-builder.rb +++ b/ruby/red-arrow/lib/arrow/array-builder.rb @@ -76,12 +76,12 @@ def detect_builder_info(value, builder_info) when Integer if value < 0 { - builder: IntArrayBuilder.new, + builder_type: :int, detected: true, } else { - builder: UIntArrayBuilder.new, + builder_type: :uint, } end when Time @@ -156,7 +156,7 @@ def detect_builder_info(value, builder_info) break if sub_builder_info and sub_builder_info[:detected] end if sub_builder_info - sub_builder = sub_builder_info[:builder] + sub_builder = sub_builder_info[:builder] || create_builder(sub_builder_info) return builder_info unless sub_builder sub_value_data_type = sub_builder.value_data_type field = Field.new("item", sub_value_data_type) @@ -186,6 +186,10 @@ def create_builder(builder_info) data_type = Decimal256DataType.new(builder_info[:precision], builder_info[:scale]) Decimal256ArrayBuilder.new(data_type) + when :int + Int8ArrayBuilder.new + when :uint + UInt8ArrayBuilder.new else nil end From 9cbe2b611b79eb01e533347f24799f72d7580203 Mon Sep 17 00:00:00 2001 From: hypsakata <46911464+hypsakata@users.noreply.github.com> Date: Wed, 31 Dec 2025 13:08:11 +0900 Subject: [PATCH 2/3] Improve nested integer list width inference --- ruby/red-arrow/lib/arrow/array-builder.rb | 49 ++++++++++++++++++++--- 1 file changed, 43 insertions(+), 6 deletions(-) diff --git a/ruby/red-arrow/lib/arrow/array-builder.rb b/ruby/red-arrow/lib/arrow/array-builder.rb index 12a90b07224..a7210c32f4e 100644 --- 
a/ruby/red-arrow/lib/arrow/array-builder.rb +++ b/ruby/red-arrow/lib/arrow/array-builder.rb @@ -74,14 +74,26 @@ def detect_builder_info(value, builder_info) detected: true, } when Integer - if value < 0 + builder_info ||= {} + min = builder_info[:min] || value + max = builder_info[:max] || value + min = value if value < min + max = value if value > max + bit_length = value.bit_length + + if builder_info[:builder_type] == :int || value < 0 { builder_type: :int, - detected: true, + min: min, + max: max, + bit_length: [builder_info[:bit_length] || 0, bit_length].max } else { builder_type: :uint, + min: min, + max: max, + bit_length: [builder_info[:bit_length] || 0, bit_length].max } end when Time @@ -150,18 +162,19 @@ def detect_builder_info(value, builder_info) end end when ::Array - sub_builder_info = nil + sub_builder_info = builder_info && builder_info[:value_builder_info] value.each do |sub_value| sub_builder_info = detect_builder_info(sub_value, sub_builder_info) break if sub_builder_info and sub_builder_info[:detected] end if sub_builder_info sub_builder = sub_builder_info[:builder] || create_builder(sub_builder_info) - return builder_info unless sub_builder + return sub_builder_info unless sub_builder sub_value_data_type = sub_builder.value_data_type field = Field.new("item", sub_value_data_type) { builder: ListArrayBuilder.new(ListDataType.new(field)), + value_builder_info: sub_builder_info, detected: sub_builder_info[:detected], } else @@ -187,9 +200,33 @@ def create_builder(builder_info) builder_info[:scale]) Decimal256ArrayBuilder.new(data_type) when :int - Int8ArrayBuilder.new + required_bit_length = builder_info[:bit_length] + 1 + + if required_bit_length <= 8 + Int8ArrayBuilder.new + elsif required_bit_length <= 16 + Int16ArrayBuilder.new + elsif required_bit_length <= 32 + Int32ArrayBuilder.new + elsif required_bit_length <= 64 + Int64ArrayBuilder.new + else + StringArrayBuilder.new + end when :uint - UInt8ArrayBuilder.new + required_bit_length = 
builder_info[:bit_length] + + if required_bit_length <= 8 + UInt8ArrayBuilder.new + elsif required_bit_length <= 16 + UInt16ArrayBuilder.new + elsif required_bit_length <= 32 + UInt32ArrayBuilder.new + elsif required_bit_length <= 64 + UInt64ArrayBuilder.new + else + StringArrayBuilder.new + end else nil end From f2ff1f1adcb1691c5348beab2eb803bacf6283ba Mon Sep 17 00:00:00 2001 From: hypsakata <46911464+hypsakata@users.noreply.github.com> Date: Wed, 31 Dec 2025 13:08:19 +0900 Subject: [PATCH 3/3] Add nested integer list inference tests --- ruby/red-arrow/test/test-array-builder.rb | 465 ++++++++++++++++++++-- 1 file changed, 428 insertions(+), 37 deletions(-) diff --git a/ruby/red-arrow/test/test-array-builder.rb b/ruby/red-arrow/test/test-array-builder.rb index 7a2d42e54b3..2d0563486bd 100644 --- a/ruby/red-arrow/test/test-array-builder.rb +++ b/ruby/red-arrow/test/test-array-builder.rb @@ -147,44 +147,435 @@ def assert_build(builder_class, raw_array) ]) end - test("lists") do - values = [ - [0, 1, 2], - [3, 4], - ] - array = Arrow::Array.new(values) - data_type = Arrow::ListDataType.new(Arrow::UInt8DataType.new) - assert_equal({ - data_type: data_type, - values: [ - [0, 1, 2], - [3, 4], - ], - }, - { - data_type: array.value_data_type, - values: array.to_a, - }) - end + sub_test_case("nested integer list") do + test("lists") do + values = [ + [0, 1, 2], + [3, 4], + ] + array = Arrow::Array.new(values) + data_type = Arrow::ListDataType.new(Arrow::UInt8DataType.new) + assert_equal({ + data_type: data_type, + values: [ + [0, 1, 2], + [3, 4], + ], + }, + { + data_type: array.value_data_type, + values: array.to_a, + }) + end - test("lists") do - values = [ - [0, -1, 2], - [3, 4], - ] - array = Arrow::Array.new(values) - data_type = Arrow::ListDataType.new(Arrow::Int8DataType.new) - assert_equal({ - data_type: data_type, - values: [ - [0, -1, 2], - [3, 4], - ], - }, - { - data_type: array.value_data_type, - values: array.to_a, - }) + test("lists") do + values = [ + [0, 
-1, 2], + [3, 4], + ] + array = Arrow::Array.new(values) + data_type = Arrow::ListDataType.new(Arrow::Int8DataType.new) + assert_equal({ + data_type: data_type, + values: [ + [0, -1, 2], + [3, 4], + ], + }, + { + data_type: array.value_data_type, + values: array.to_a, + }) + end + + test("lists boundary") do + # Int8 can hold values from -128 to 127. + values = [ + [0, -2**7], + [2**7 - 1], + ] + array = Arrow::Array.new(values) + data_type = Arrow::ListDataType.new(Arrow::Int8DataType.new) + + assert_equal({ + data_type: data_type, + values: [ + [0, -128], + [127], + ], + }, + { + data_type: array.value_data_type, + values: array.to_a, + }) + end + + test("lists inferred from int8 underflow") do + values = [ + [0, -2**7 - 1], + [2**7 - 1], + ] + array = Arrow::Array.new(values) + data_type = Arrow::ListDataType.new(Arrow::Int16DataType.new) + + # Int8 lower bound is -128 + assert_equal({ + data_type: data_type, + values: [ + [0, -129], + [127], + ], + }, + { + data_type: array.value_data_type, + values: array.to_a, + }) + end + + test("lists inferred from int8 overflow") do + values = [ + [0, 2**7], + [-2**7], + ] + array = Arrow::Array.new(values) + data_type = Arrow::ListDataType.new(Arrow::Int16DataType.new) + + # Int8 upper bound is 127 + assert_equal({ + data_type: data_type, + values: [ + [0, 128], + [-128], + ], + }, + { + data_type: array.value_data_type, + values: array.to_a, + }) + end + + test("lists boundary") do + values = [ + [0, -2**15], + [2**15 - 1], + ] + array = Arrow::Array.new(values) + data_type = Arrow::ListDataType.new(Arrow::Int16DataType.new) + + assert_equal({ + data_type: data_type, + values: [ + [0, -32768], + [32767], + ], + }, + { + data_type: array.value_data_type, + values: array.to_a, + }) + end + + test("lists inferred from int16 underflow") do + values = [ + [0, -2**15 - 1], + [2**15 - 1], + ] + array = Arrow::Array.new(values) + data_type = Arrow::ListDataType.new(Arrow::Int32DataType.new) + + # Int16 lower bound is -32768 + 
assert_equal({ + data_type: data_type, + values: [ + [0, -32769], + [32767], + ], + }, + { + data_type: array.value_data_type, + values: array.to_a, + }) + end + + test("lists inferred from int16 overflow") do + values = [ + [0, 2**15], + [-2**15], + ] + array = Arrow::Array.new(values) + data_type = Arrow::ListDataType.new(Arrow::Int32DataType.new) + + # Int16 upper bound is 32767 + assert_equal({ + data_type: data_type, + values: [ + [0, 32768], + [-32768], + ], + }, + { + data_type: array.value_data_type, + values: array.to_a, + }) + end + + test("lists boundary") do + values = [ + [0, -2**31], + [2**31 - 1], + ] + array = Arrow::Array.new(values) + data_type = Arrow::ListDataType.new(Arrow::Int32DataType.new) + + assert_equal({ + data_type: data_type, + values: [ + [0, -2147483648], + [2147483647], + ], + }, + { + data_type: array.value_data_type, + values: array.to_a, + }) + end + + test("lists inferred from int32 underflow") do + values = [ + [0, -2**31 - 1], + [2**31 - 1], + ] + array = Arrow::Array.new(values) + data_type = Arrow::ListDataType.new(Arrow::Int64DataType.new) + + # Int32 lower bound is -2147483648 + assert_equal({ + data_type: data_type, + values: [ + [0, -2147483649], + [2147483647], + ], + }, + { + data_type: array.value_data_type, + values: array.to_a, + }) + end + + test("lists inferred from int32 overflow") do + values = [ + [0, 2**31], + [-2**31], + ] + array = Arrow::Array.new(values) + data_type = Arrow::ListDataType.new(Arrow::Int64DataType.new) + + # Int32 upper bound is 2147483647 + assert_equal({ + data_type: data_type, + values: [ + [0, 2147483648], + [-2147483648], + ], + }, + { + data_type: array.value_data_type, + values: array.to_a, + }) + end + + test("string fallback from nested int64 array overflow") do + values = [ + [0, 2**63], + [-2**63], + ] + array = Arrow::Array.new(values) + data_type = Arrow::ListDataType.new(Arrow::StringDataType.new) + + assert_equal({ + data_type: data_type, + values: [ + ["0", 
"9223372036854775808"], + ["-9223372036854775808"], + ], + }, + { + data_type: array.value_data_type, + values: array.to_a, + }) + end + + test("string fallback from nested int64 array underflow") do + values = [ + [0, -2**63 - 1], + [2**63 - 1], + ] + array = Arrow::Array.new(values) + data_type = Arrow::ListDataType.new(Arrow::StringDataType.new) + + assert_equal({ + data_type: data_type, + values: [ + ["0", "-9223372036854775809"], + ["9223372036854775807"], + ], + }, + { + data_type: array.value_data_type, + values: array.to_a, + }) + end + + test("lists boundary") do + # UInt8 can hold values up to 255. + values = [ + [0, 2**8 - 1], + [2**8 - 1], + ] + array = Arrow::Array.new(values) + data_type = Arrow::ListDataType.new(Arrow::UInt8DataType.new) + + assert_equal({ + data_type: data_type, + values: [ + [0, 255], + [255], + ], + }, + { + data_type: array.value_data_type, + values: array.to_a, + }) + end + + test("lists") do + values = [ + [0, 2**8], + [2**8 - 1], + ] + array = Arrow::Array.new(values) + data_type = Arrow::ListDataType.new(Arrow::UInt16DataType.new) + + # UInt8 can hold values up to 255 + assert_equal({ + data_type: data_type, + values: [ + [0, 256], + [255], + ], + }, + { + data_type: array.value_data_type, + values: array.to_a, + }) + end + + test("lists boundary") do + values = [ + [0, 2**16 - 1], + [2**16 - 1], + ] + array = Arrow::Array.new(values) + data_type = Arrow::ListDataType.new(Arrow::UInt16DataType.new) + + assert_equal({ + data_type: data_type, + values: [ + [0, 65535], + [65535], + ], + }, + { + data_type: array.value_data_type, + values: array.to_a, + }) + end + + test("lists") do + values = [ + [0, 2**16], + [2**16 - 1], + ] + array = Arrow::Array.new(values) + data_type = Arrow::ListDataType.new(Arrow::UInt32DataType.new) + + # UInt16 can hold values up to 65535 + assert_equal({ + data_type: data_type, + values: [ + [0, 65536], + [65535], + ], + }, + { + data_type: array.value_data_type, + values: array.to_a, + }) + end + + 
test("lists boundary") do + values = [ + [0, 2**32 - 1], + [2**32 - 1], + ] + array = Arrow::Array.new(values) + data_type = Arrow::ListDataType.new(Arrow::UInt32DataType.new) + + assert_equal({ + data_type: data_type, + values: [ + [0, 4294967295], + [4294967295], + ], + }, + { + data_type: array.value_data_type, + values: array.to_a, + }) + end + + test("lists") do + values = [ + [0, 2**32], + [2**32 - 1], + ] + array = Arrow::Array.new(values) + data_type = Arrow::ListDataType.new(Arrow::UInt64DataType.new) + + # UInt32 can hold values up to 4294967295 + assert_equal({ + data_type: data_type, + values: [ + [0, 4294967296], + [4294967295], + ], + }, + { + data_type: array.value_data_type, + values: array.to_a, + }) + end + + test("string fallback from nested uint64 array overflow") do + values = [ + [0, 2**64], + [2**64 - 1], + ] + array = Arrow::Array.new(values) + data_type = Arrow::ListDataType.new(Arrow::StringDataType.new) + + assert_equal({ + data_type: data_type, + values: [ + ["0", "18446744073709551616"], + ["18446744073709551615"], + ], + }, + { + data_type: array.value_data_type, + values: array.to_a, + }) + end end end