Skip to content

Commit 649cfcf

Browse files
committed
Merge branch 'main' into issue-8082-variant-get-list
2 parents 1a74168 + a9d6e92 commit 649cfcf

File tree

26 files changed

+1020
-250
lines changed

26 files changed

+1020
-250
lines changed

.github/workflows/integration.yml

Lines changed: 60 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -78,58 +78,112 @@ jobs:
7878
run:
7979
shell: bash
8080
steps:
81+
- name: Monitor disk usage - Initial
82+
run: |
83+
echo "=== Initial Disk Usage ==="
84+
df -h /
85+
echo ""
86+
87+
- name: Remove unnecessary preinstalled software
88+
run: |
89+
echo "=== Cleaning up host disk space ==="
90+
echo "Disk space before cleanup:"
91+
df -h /
92+
93+
# Clean apt cache
94+
apt-get clean || true
95+
96+
# Remove GitHub Actions tool cache
97+
rm -rf /__t/* || true
98+
99+
# Remove large packages from host filesystem (mounted at /host/)
100+
rm -rf /host/usr/share/dotnet || true
101+
rm -rf /host/usr/local/lib/android || true
102+
rm -rf /host/usr/local/.ghcup || true
103+
rm -rf /host/opt/hostedtoolcache/CodeQL || true
104+
105+
echo ""
106+
echo "Disk space after cleanup:"
107+
df -h /
108+
echo ""
109+
81110
# This is necessary so that actions/checkout can find git
82111
- name: Export conda path
83112
run: echo "/opt/conda/envs/arrow/bin" >> $GITHUB_PATH
84113
# This is necessary so that Rust can find cargo
85114
- name: Export cargo path
86115
run: echo "/root/.cargo/bin" >> $GITHUB_PATH
87-
- name: Check rustup
88-
run: which rustup
89-
- name: Check cmake
90-
run: which cmake
116+
117+
# Checkout repos (using shallow clones with fetch-depth: 1)
91118
- name: Checkout Arrow
92119
uses: actions/checkout@v6
93120
with:
94121
repository: apache/arrow
95122
submodules: true
96-
fetch-depth: 0
123+
fetch-depth: 1
97124
- name: Checkout Arrow Rust
98125
uses: actions/checkout@v6
99126
with:
100127
path: rust
101128
submodules: true
102-
fetch-depth: 0
129+
fetch-depth: 1
103130
- name: Checkout Arrow .NET
104131
uses: actions/checkout@v6
105132
with:
106133
repository: apache/arrow-dotnet
107134
path: dotnet
135+
fetch-depth: 1
108136
- name: Checkout Arrow Go
109137
uses: actions/checkout@v6
110138
with:
111139
repository: apache/arrow-go
112140
path: go
141+
fetch-depth: 1
113142
- name: Checkout Arrow Java
114143
uses: actions/checkout@v6
115144
with:
116145
repository: apache/arrow-java
117146
path: java
147+
fetch-depth: 1
118148
- name: Checkout Arrow JavaScript
119149
uses: actions/checkout@v6
120150
with:
121151
repository: apache/arrow-js
122152
path: js
153+
fetch-depth: 1
123154
- name: Checkout Arrow nanoarrow
124155
uses: actions/checkout@v6
125156
with:
126157
repository: apache/arrow-nanoarrow
127158
path: nanoarrow
159+
fetch-depth: 1
160+
161+
- name: Monitor disk usage - After checkouts
162+
run: |
163+
echo "=== After Checkouts ==="
164+
df -h /
165+
echo ""
166+
128167
- name: Build
129168
run: conda run --no-capture-output ci/scripts/integration_arrow_build.sh $PWD /build
169+
170+
- name: Monitor disk usage - After build
171+
if: always()
172+
run: |
173+
echo "=== After Build ==="
174+
df -h /
175+
echo ""
176+
130177
- name: Run
131178
run: conda run --no-capture-output ci/scripts/integration_arrow.sh $PWD /build
132179

180+
- name: Monitor disk usage - After tests
181+
if: always()
182+
run: |
183+
echo "=== After Tests ==="
184+
df -h /
185+
echo ""
186+
133187
# test FFI against the C-Data interface exposed by pyarrow
134188
pyarrow-integration-test:
135189
name: Pyarrow C Data Interface

arrow-arith/src/aggregate.rs

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -332,10 +332,10 @@ fn aggregate<T: ArrowNativeTypeOp, P: ArrowPrimitiveType<Native = T>, A: Numeric
332332

333333
/// Returns the minimum value in the boolean array.
334334
///
335+
/// # Example
335336
/// ```
336337
/// # use arrow_array::BooleanArray;
337338
/// # use arrow_arith::aggregate::min_boolean;
338-
///
339339
/// let a = BooleanArray::from(vec![Some(true), None, Some(false)]);
340340
/// assert_eq!(min_boolean(&a), Some(false))
341341
/// ```
@@ -390,10 +390,10 @@ pub fn min_boolean(array: &BooleanArray) -> Option<bool> {
390390

391391
/// Returns the maximum value in the boolean array.
392392
///
393+
/// # Example
393394
/// ```
394395
/// # use arrow_array::BooleanArray;
395396
/// # use arrow_arith::aggregate::max_boolean;
396-
///
397397
/// let a = BooleanArray::from(vec![Some(true), None, Some(false)]);
398398
/// assert_eq!(max_boolean(&a), Some(true))
399399
/// ```
@@ -809,6 +809,15 @@ where
809809

810810
/// Returns the minimum value in the array, according to the natural order.
811811
/// For floating point arrays, any NaN values are considered to be greater than any other non-null value.
812+
///
813+
/// # Example
814+
/// ```rust
815+
/// # use arrow_array::Int32Array;
816+
/// # use arrow_arith::aggregate::min;
817+
/// let array = Int32Array::from(vec![8, 2, 4]);
818+
/// let result = min(&array);
819+
/// assert_eq!(result, Some(2));
820+
/// ```
812821
pub fn min<T: ArrowNumericType>(array: &PrimitiveArray<T>) -> Option<T::Native>
813822
where
814823
T::Native: PartialOrd,
@@ -818,6 +827,15 @@ where
818827

819828
/// Returns the maximum value in the array, according to the natural order.
820829
/// For floating point arrays, any NaN values are considered to be greater than any other non-null value.
830+
///
831+
/// # Example
832+
/// ```rust
833+
/// # use arrow_array::Int32Array;
834+
/// # use arrow_arith::aggregate::max;
835+
/// let array = Int32Array::from(vec![4, 8, 2]);
836+
/// let result = max(&array);
837+
/// assert_eq!(result, Some(8));
838+
/// ```
821839
pub fn max<T: ArrowNumericType>(array: &PrimitiveArray<T>) -> Option<T::Native>
822840
where
823841
T::Native: PartialOrd,

arrow-buffer/src/buffer/boolean.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,7 @@ impl BooleanBuffer {
165165
/// * `op` must only apply bitwise operations
166166
/// on the relevant bits; the input `u64` may contain irrelevant bits
167167
/// and may be processed differently on different endian architectures.
168+
/// * `op` may be called with input bits outside the requested range
168169
/// * The output always has zero offset
169170
///
170171
/// # See Also

arrow-buffer/src/buffer/ops.rs

Lines changed: 40 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,12 @@ use crate::BooleanBuffer;
2020
use crate::util::bit_util::ceil;
2121

2222
/// Apply a bitwise operation `op` to four inputs and return the result as a Buffer.
23-
/// The inputs are treated as bitmaps, meaning that offsets and length are specified in number of bits.
23+
///
24+
/// The inputs are treated as bitmaps, meaning that offsets and length are
25+
/// specified in number of bits.
26+
///
27+
/// NOTE: The operation `op` is applied to chunks of 64 bits (u64) and any bits
28+
/// outside the offsets and len are set to zero before calling `op`.
2429
pub fn bitwise_quaternary_op_helper<F>(
2530
buffers: [&Buffer; 4],
2631
offsets: [usize; 4],
@@ -60,7 +65,12 @@ where
6065
}
6166

6267
/// Apply a bitwise operation `op` to two inputs and return the result as a Buffer.
63-
/// The inputs are treated as bitmaps, meaning that offsets and length are specified in number of bits.
68+
///
69+
/// The inputs are treated as bitmaps, meaning that offsets and length are
70+
/// specified in number of bits.
71+
///
72+
/// NOTE: The operation `op` is applied to chunks of 64 bits (u64) and any bits
73+
/// outside the offsets and len are set to zero before calling `op`.
6474
pub fn bitwise_bin_op_helper<F>(
6575
left: &Buffer,
6676
left_offset_in_bits: usize,
@@ -93,21 +103,42 @@ where
93103
}
94104

95105
/// Apply a bitwise operation `op` to one input and return the result as a Buffer.
96-
/// The input is treated as a bitmap, meaning that offset and length are specified in number of bits.
97-
#[deprecated(
98-
since = "57.2.0",
99-
note = "use BooleanBuffer::from_bitwise_unary_op instead"
100-
)]
106+
///
107+
/// The input is treated as a bitmap, meaning that offset and length are
108+
/// specified in number of bits.
109+
///
110+
/// NOTE: The operation `op` is applied to chunks of 64 bits (u64) and any bits
111+
/// outside the offsets and len are set to zero before calling `op`.
101112
pub fn bitwise_unary_op_helper<F>(
102113
left: &Buffer,
103114
offset_in_bits: usize,
104115
len_in_bits: usize,
105-
op: F,
116+
mut op: F,
106117
) -> Buffer
107118
where
108119
F: FnMut(u64) -> u64,
109120
{
110-
BooleanBuffer::from_bitwise_unary_op(left, offset_in_bits, len_in_bits, op).into_inner()
121+
// reserve capacity and set length so we can get a typed view of u64 chunks
122+
let mut result =
123+
MutableBuffer::new(ceil(len_in_bits, 8)).with_bitset(len_in_bits / 64 * 8, false);
124+
125+
let left_chunks = left.bit_chunks(offset_in_bits, len_in_bits);
126+
127+
let result_chunks = result.typed_data_mut::<u64>().iter_mut();
128+
129+
result_chunks
130+
.zip(left_chunks.iter())
131+
.for_each(|(res, left)| {
132+
*res = op(left);
133+
});
134+
135+
let remainder_bytes = ceil(left_chunks.remainder_len(), 8);
136+
let rem = op(left_chunks.remainder_bits());
137+
// we are counting bits starting from the least significant bit, so to_le_bytes should be correct
138+
let rem = &rem.to_le_bytes()[0..remainder_bytes];
139+
result.extend_from_slice(rem);
140+
141+
result.into()
111142
}
112143

113144
/// Apply a bitwise and to two inputs and return the result as a Buffer.

arrow-buffer/src/builder/boolean.rs

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -140,7 +140,6 @@ impl BooleanBufferBuilder {
140140

141141
/// Reserve space to at least `additional` new bits.
142142
/// Capacity will be `>= self.len() + additional`.
143-
/// New bytes are uninitialized and reading them is undefined behavior.
144143
#[inline]
145144
pub fn reserve(&mut self, additional: usize) {
146145
let capacity = self.len + additional;

arrow-ord/src/sort.rs

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -846,7 +846,7 @@ pub struct SortColumn {
846846
/// Returns an `ArrowError::ComputeError(String)` if any of the array type is either unsupported by
847847
/// `lexsort_to_indices` or `take`.
848848
///
849-
/// Example:
849+
/// # Example
850850
///
851851
/// ```
852852
/// # use std::convert::From;
@@ -855,7 +855,6 @@ pub struct SortColumn {
855855
/// # use arrow_array::types::Int64Type;
856856
/// # use arrow_array::cast::AsArray;
857857
/// # use arrow_ord::sort::{SortColumn, SortOptions, lexsort};
858-
///
859858
/// let sorted_columns = lexsort(&vec![
860859
/// SortColumn {
861860
/// values: Arc::new(PrimitiveArray::<Int64Type>::from(vec![

arrow-row/src/lib.rs

Lines changed: 52 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -892,7 +892,7 @@ impl RowConverter {
892892
// and therefore must be valid
893893
let result = unsafe { self.convert_raw(&mut rows, validate_utf8) }?;
894894

895-
if cfg!(test) {
895+
if cfg!(debug_assertions) {
896896
for (i, row) in rows.iter().enumerate() {
897897
if !row.is_empty() {
898898
return Err(ArrowError::InvalidArgumentError(format!(
@@ -1131,8 +1131,8 @@ impl Rows {
11311131
pub fn size(&self) -> usize {
11321132
// Size of fields is accounted for as part of RowConverter
11331133
std::mem::size_of::<Self>()
1134-
+ self.buffer.len()
1135-
+ self.offsets.len() * std::mem::size_of::<usize>()
1134+
+ self.buffer.capacity()
1135+
+ self.offsets.capacity() * std::mem::size_of::<usize>()
11361136
}
11371137

11381138
/// Create a [BinaryArray] from the [Rows] data without reallocating the
@@ -1644,24 +1644,22 @@ fn encode_column(
16441644
}
16451645
}
16461646
DataType::Binary => {
1647-
variable::encode(data, offsets, as_generic_binary_array::<i32>(column).iter(), opts)
1647+
variable::encode_generic_byte_array(data, offsets, as_generic_binary_array::<i32>(column), opts)
16481648
}
16491649
DataType::BinaryView => {
16501650
variable::encode(data, offsets, column.as_binary_view().iter(), opts)
16511651
}
16521652
DataType::LargeBinary => {
1653-
variable::encode(data, offsets, as_generic_binary_array::<i64>(column).iter(), opts)
1653+
variable::encode_generic_byte_array(data, offsets, as_generic_binary_array::<i64>(column), opts)
16541654
}
1655-
DataType::Utf8 => variable::encode(
1655+
DataType::Utf8 => variable::encode_generic_byte_array(
16561656
data, offsets,
1657-
column.as_string::<i32>().iter().map(|x| x.map(|x| x.as_bytes())),
1657+
column.as_string::<i32>(),
16581658
opts,
16591659
),
1660-
DataType::LargeUtf8 => variable::encode(
1660+
DataType::LargeUtf8 => variable::encode_generic_byte_array(
16611661
data, offsets,
1662-
column.as_string::<i64>()
1663-
.iter()
1664-
.map(|x| x.map(|x| x.as_bytes())),
1662+
column.as_string::<i64>(),
16651663
opts,
16661664
),
16671665
DataType::Utf8View => variable::encode(
@@ -4050,4 +4048,47 @@ mod tests {
40504048
// "a" < "z"
40514049
assert!(rows.row(3) < rows.row(1));
40524050
}
4051+
4052+
#[test]
4053+
fn rows_size_should_count_for_capacity() {
4054+
let row_converter = RowConverter::new(vec![SortField::new(DataType::UInt8)]).unwrap();
4055+
4056+
let empty_rows_size_with_preallocate_rows_and_data = {
4057+
let rows = row_converter.empty_rows(1000, 1000);
4058+
4059+
rows.size()
4060+
};
4061+
let empty_rows_size_with_preallocate_rows = {
4062+
let rows = row_converter.empty_rows(1000, 0);
4063+
4064+
rows.size()
4065+
};
4066+
let empty_rows_size_with_preallocate_data = {
4067+
let rows = row_converter.empty_rows(0, 1000);
4068+
4069+
rows.size()
4070+
};
4071+
let empty_rows_size_without_preallocate = {
4072+
let rows = row_converter.empty_rows(0, 0);
4073+
4074+
rows.size()
4075+
};
4076+
4077+
assert!(
4078+
empty_rows_size_with_preallocate_rows_and_data > empty_rows_size_with_preallocate_rows,
4079+
"{empty_rows_size_with_preallocate_rows_and_data} should be larger than {empty_rows_size_with_preallocate_rows}"
4080+
);
4081+
assert!(
4082+
empty_rows_size_with_preallocate_rows_and_data > empty_rows_size_with_preallocate_data,
4083+
"{empty_rows_size_with_preallocate_rows_and_data} should be larger than {empty_rows_size_with_preallocate_data}"
4084+
);
4085+
assert!(
4086+
empty_rows_size_with_preallocate_rows > empty_rows_size_without_preallocate,
4087+
"{empty_rows_size_with_preallocate_rows} should be larger than {empty_rows_size_without_preallocate}"
4088+
);
4089+
assert!(
4090+
empty_rows_size_with_preallocate_data > empty_rows_size_without_preallocate,
4091+
"{empty_rows_size_with_preallocate_data} should be larger than {empty_rows_size_without_preallocate}"
4092+
);
4093+
}
40534094
}

arrow-row/src/run.rs

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -134,7 +134,11 @@ pub unsafe fn decode<R: RunEndIndexType>(
134134
run_ends.push(R::Native::usize_as(idx));
135135
}
136136
unique_row_indices.push(decoded_values.len());
137-
decoded_values.push(decoded_data.clone());
137+
let capacity = decoded_data.capacity();
138+
decoded_values.push(std::mem::replace(
139+
&mut decoded_data,
140+
Vec::with_capacity(capacity),
141+
));
138142
}
139143
}
140144
// Add the final run end

0 commit comments

Comments
 (0)