Skip to content

Commit d8f20f8

Browse files
Merge branch 'main' into friendlymatthew/track-bytes-during-union-row-decoding
2 parents 07ed163 + a9d6e92 commit d8f20f8

29 files changed

Lines changed: 1478 additions & 546 deletions

File tree

.github/workflows/integration.yml

Lines changed: 60 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -78,58 +78,112 @@ jobs:
7878
run:
7979
shell: bash
8080
steps:
81+
- name: Monitor disk usage - Initial
82+
run: |
83+
echo "=== Initial Disk Usage ==="
84+
df -h /
85+
echo ""
86+
87+
- name: Remove unnecessary preinstalled software
88+
run: |
89+
echo "=== Cleaning up host disk space ==="
90+
echo "Disk space before cleanup:"
91+
df -h /
92+
93+
# Clean apt cache
94+
apt-get clean || true
95+
96+
# Remove GitHub Actions tool cache
97+
rm -rf /__t/* || true
98+
99+
# Remove large packages from host filesystem (mounted at /host/)
100+
rm -rf /host/usr/share/dotnet || true
101+
rm -rf /host/usr/local/lib/android || true
102+
rm -rf /host/usr/local/.ghcup || true
103+
rm -rf /host/opt/hostedtoolcache/CodeQL || true
104+
105+
echo ""
106+
echo "Disk space after cleanup:"
107+
df -h /
108+
echo ""
109+
81110
# This is necessary so that actions/checkout can find git
82111
- name: Export conda path
83112
run: echo "/opt/conda/envs/arrow/bin" >> $GITHUB_PATH
84113
# This is necessary so that Rust can find cargo
85114
- name: Export cargo path
86115
run: echo "/root/.cargo/bin" >> $GITHUB_PATH
87-
- name: Check rustup
88-
run: which rustup
89-
- name: Check cmake
90-
run: which cmake
116+
117+
# Checkout repos (using shallow clones with fetch-depth: 1)
91118
- name: Checkout Arrow
92119
uses: actions/checkout@v6
93120
with:
94121
repository: apache/arrow
95122
submodules: true
96-
fetch-depth: 0
123+
fetch-depth: 1
97124
- name: Checkout Arrow Rust
98125
uses: actions/checkout@v6
99126
with:
100127
path: rust
101128
submodules: true
102-
fetch-depth: 0
129+
fetch-depth: 1
103130
- name: Checkout Arrow .NET
104131
uses: actions/checkout@v6
105132
with:
106133
repository: apache/arrow-dotnet
107134
path: dotnet
135+
fetch-depth: 1
108136
- name: Checkout Arrow Go
109137
uses: actions/checkout@v6
110138
with:
111139
repository: apache/arrow-go
112140
path: go
141+
fetch-depth: 1
113142
- name: Checkout Arrow Java
114143
uses: actions/checkout@v6
115144
with:
116145
repository: apache/arrow-java
117146
path: java
147+
fetch-depth: 1
118148
- name: Checkout Arrow JavaScript
119149
uses: actions/checkout@v6
120150
with:
121151
repository: apache/arrow-js
122152
path: js
153+
fetch-depth: 1
123154
- name: Checkout Arrow nanoarrow
124155
uses: actions/checkout@v6
125156
with:
126157
repository: apache/arrow-nanoarrow
127158
path: nanoarrow
159+
fetch-depth: 1
160+
161+
- name: Monitor disk usage - After checkouts
162+
run: |
163+
echo "=== After Checkouts ==="
164+
df -h /
165+
echo ""
166+
128167
- name: Build
129168
run: conda run --no-capture-output ci/scripts/integration_arrow_build.sh $PWD /build
169+
170+
- name: Monitor disk usage - After build
171+
if: always()
172+
run: |
173+
echo "=== After Build ==="
174+
df -h /
175+
echo ""
176+
130177
- name: Run
131178
run: conda run --no-capture-output ci/scripts/integration_arrow.sh $PWD /build
132179

180+
- name: Monitor disk usage - After tests
181+
if: always()
182+
run: |
183+
echo "=== After Tests ==="
184+
df -h /
185+
echo ""
186+
133187
# test FFI against the C-Data interface exposed by pyarrow
134188
pyarrow-integration-test:
135189
name: Pyarrow C Data Interface

arrow-arith/src/aggregate.rs

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -332,10 +332,10 @@ fn aggregate<T: ArrowNativeTypeOp, P: ArrowPrimitiveType<Native = T>, A: Numeric
332332

333333
/// Returns the minimum value in the boolean array.
334334
///
335+
/// # Example
335336
/// ```
336337
/// # use arrow_array::BooleanArray;
337338
/// # use arrow_arith::aggregate::min_boolean;
338-
///
339339
/// let a = BooleanArray::from(vec![Some(true), None, Some(false)]);
340340
/// assert_eq!(min_boolean(&a), Some(false))
341341
/// ```
@@ -390,10 +390,10 @@ pub fn min_boolean(array: &BooleanArray) -> Option<bool> {
390390

391391
/// Returns the maximum value in the boolean array.
392392
///
393+
/// # Example
393394
/// ```
394395
/// # use arrow_array::BooleanArray;
395396
/// # use arrow_arith::aggregate::max_boolean;
396-
///
397397
/// let a = BooleanArray::from(vec![Some(true), None, Some(false)]);
398398
/// assert_eq!(max_boolean(&a), Some(true))
399399
/// ```
@@ -809,6 +809,15 @@ where
809809

810810
/// Returns the minimum value in the array, according to the natural order.
811811
/// For floating point arrays, any NaN values are considered to be greater than any other non-null value.
812+
///
813+
/// # Example
814+
/// ```rust
815+
/// # use arrow_array::Int32Array;
816+
/// # use arrow_arith::aggregate::min;
817+
/// let array = Int32Array::from(vec![8, 2, 4]);
818+
/// let result = min(&array);
819+
/// assert_eq!(result, Some(2));
820+
/// ```
812821
pub fn min<T: ArrowNumericType>(array: &PrimitiveArray<T>) -> Option<T::Native>
813822
where
814823
T::Native: PartialOrd,
@@ -818,6 +827,15 @@ where
818827

819828
/// Returns the maximum value in the array, according to the natural order.
820829
/// For floating point arrays, any NaN values are considered to be greater than any other non-null value.
830+
///
831+
/// # Example
832+
/// ```rust
833+
/// # use arrow_array::Int32Array;
834+
/// # use arrow_arith::aggregate::max;
835+
/// let array = Int32Array::from(vec![4, 8, 2]);
836+
/// let result = max(&array);
837+
/// assert_eq!(result, Some(8));
838+
/// ```
821839
pub fn max<T: ArrowNumericType>(array: &PrimitiveArray<T>) -> Option<T::Native>
822840
where
823841
T::Native: PartialOrd,

arrow-array/src/builder/generic_bytes_view_builder.rs

Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -430,6 +430,53 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
430430
};
431431
}
432432

433+
/// Append the same value `n` times into the builder
434+
///
435+
/// This is more efficient than calling [`Self::try_append_value`] `n` times,
436+
/// especially when deduplication is enabled, as it only hashes the value once.
437+
///
438+
/// # Errors
439+
///
440+
/// Returns an error if
441+
/// - String buffer count exceeds `u32::MAX`
442+
/// - String length exceeds `u32::MAX`
443+
///
444+
/// # Example
445+
/// ```
446+
/// # use arrow_array::builder::StringViewBuilder;
447+
/// # use arrow_array::Array;
448+
/// let mut builder = StringViewBuilder::new().with_deduplicate_strings();
449+
///
450+
/// // Append "hello" 1000 times efficiently
451+
/// builder.try_append_value_n("hello", 1000)?;
452+
///
453+
/// let array = builder.finish();
454+
/// assert_eq!(array.len(), 1000);
455+
///
456+
/// // All values are "hello"
457+
/// for value in array.iter() {
458+
/// assert_eq!(value, Some("hello"));
459+
/// }
460+
/// # Ok::<(), arrow_schema::ArrowError>(())
461+
/// ```
462+
#[inline]
463+
pub fn try_append_value_n(
464+
&mut self,
465+
value: impl AsRef<T::Native>,
466+
n: usize,
467+
) -> Result<(), ArrowError> {
468+
if n == 0 {
469+
return Ok(());
470+
}
471+
// Process value once (handles deduplication, buffer management, view creation)
472+
self.try_append_value(value)?;
473+
// Reuse the view (n-1) times
474+
let view = *self.views_buffer.last().unwrap();
475+
self.views_buffer.extend(std::iter::repeat_n(view, n - 1));
476+
self.null_buffer_builder.append_n_non_nulls(n - 1);
477+
Ok(())
478+
}
479+
433480
/// Append a null value into the builder
434481
#[inline]
435482
pub fn append_null(&mut self) {
@@ -884,4 +931,76 @@ mod tests {
884931
MAX_BLOCK_SIZE as usize
885932
);
886933
}
934+
935+
/// Repeated appends of short (inline, <=12 byte) strings interleave correctly
/// with a regular `append_value` and allocate no data buffers.
#[test]
fn test_append_value_n() {
    let mut builder = StringViewBuilder::new();

    builder.try_append_value_n("hello", 100).unwrap();
    builder.append_value("world");
    builder.try_append_value_n("foo", 50).unwrap();

    let array = builder.finish();
    assert_eq!(array.len(), 151);
    assert_eq!(array.null_count(), 0);

    // Expected contents, slot by slot: 100 x "hello", 1 x "world", 50 x "foo".
    let expected = std::iter::repeat_n("hello", 100)
        .chain(std::iter::once("world"))
        .chain(std::iter::repeat_n("foo", 50));
    for (index, want) in expected.enumerate() {
        assert_eq!(array.value(index), want);
    }

    // All inline strings should have no data buffers
    assert_eq!(array.data_buffers().len(), 0);
}
960+
961+
/// With deduplication enabled, a long string appended once and then repeated
/// via `try_append_value_n` must end up stored a single time, with every view
/// identical to the first.
#[test]
fn test_append_value_n_with_deduplication() {
    let long_string = "This is a very long string that exceeds the inline length";

    let mut builder = StringViewBuilder::new().with_deduplicate_strings();

    // Seed the builder with one occurrence, then repeat the same string so the
    // repeated append has an existing entry to deduplicate against.
    builder.append_value(long_string);
    builder.try_append_value_n(long_string, 999).unwrap();

    let array = builder.finish();
    assert_eq!(array.len(), 1000);
    assert_eq!(array.null_count(), 0);

    // Every one of the 1000 slots holds the same string.
    (0..1000).for_each(|index| assert_eq!(array.value(index), long_string));

    // With deduplication, should only have 1 data buffer containing the string once
    assert_eq!(array.data_buffers().len(), 1);

    // All views should be identical to the first one.
    let views = array.views();
    let reference_view = views[0];
    views
        .iter()
        .for_each(|view| assert_eq!(*view, reference_view));
}
992+
993+
/// Appending a value zero times must not add any element to the builder.
#[test]
fn test_append_value_n_zero() {
    let mut builder = StringViewBuilder::new();

    builder.append_value("first");
    builder.try_append_value_n("should not appear", 0).unwrap();
    builder.append_value("second");

    let array = builder.finish();

    // Only the two regular appends are present, in insertion order.
    let values: Vec<&str> = (0..array.len()).map(|index| array.value(index)).collect();
    assert_eq!(values, ["first", "second"]);
}
8871006
}

0 commit comments

Comments
 (0)