Skip to content

Commit 52fd59c

Browse files
codephage2020 and alamb authored
[Variant] Use simdutf8 for UTF-8 validation (#7908)
# Which issue does this PR close?

We generally require a GitHub issue to be filed for all bug fixes and enhancements and this helps us generate change logs for our releases. You can link an issue to this PR using the GitHub syntax.

- Closes #7902.

# Rationale for this change

# What changes are included in this PR?

- add `simdutf8` dependency for parquet-variant
- add a fn `extract_and_validate_utf8_slice`

# Are these changes tested?

We typically require tests for all PRs in order to:

1. Prevent the code from being accidentally broken by subsequent changes
2. Serve as another way to document the expected behavior of the code

If tests are not included in your PR, please explain why (for example, are they covered by existing tests)?

# Are there any user-facing changes?

If there are user-facing changes then we may require documentation to be updated before approving the PR.

If there are any breaking changes to public APIs, please call them out.

---------

Signed-off-by: codephage2020 <tingwangyan2020@163.com>
Co-authored-by: Andrew Lamb <andrew@nerdnetworks.org>
1 parent 5555d30 commit 52fd59c

File tree

5 files changed

+26
-4
lines changed

5 files changed

+26
-4
lines changed

Cargo.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,8 @@ parquet-variant-compute = { version = "0.1.0", path = "./parquet-variant-json" }
108108

109109
chrono = { version = "0.4.40", default-features = false, features = ["clock"] }
110110

111+
simdutf8 = { version = "0.1.5", default-features = false }
112+
111113
# release inherited profile keeping debug information and symbols
112114
# for mem/cpu profiling
113115
[profile.profiling]

arrow-json/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ serde_json = { version = "1.0", default-features = false, features = ["std"] }
4949
chrono = { workspace = true }
5050
lexical-core = { version = "1.0", default-features = false}
5151
memchr = "2.7.4"
52-
simdutf8 = "0.1.5"
52+
simdutf8 = { workspace = true }
5353

5454
[dev-dependencies]
5555
flate2 = { version = "1", default-features = false, features = ["rust_backend"] }

parquet-variant/Cargo.toml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ arrow-schema = { workspace = true }
3737
chrono = { workspace = true }
3838
indexmap = "2.10.0"
3939

40+
simdutf8 = { workspace = true , optional = true }
4041

4142
[lib]
4243
name = "parquet_variant"
@@ -51,6 +52,10 @@ rand = { version = "0.9", default-features = false, features = [
5152
"thread_rng",
5253
] }
5354

55+
[features]
56+
default = ["simdutf8"]
57+
# Enable SIMD UTF-8 validation
58+
simdutf8 = ["dep:simdutf8"]
5459

5560
[[bench]]
5661
name = "variant_builder"

parquet-variant/src/utils.rs

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -74,13 +74,28 @@ pub(crate) fn first_byte_from_slice(slice: &[u8]) -> Result<u8, ArrowError> {
7474
.ok_or_else(|| ArrowError::InvalidArgumentError("Received empty bytes".to_string()))
7575
}
7676

77-
/// Helper to get a &str from a slice at the given offset and range, or an error if invalid.
77+
/// Helper to get a &str from a slice at the given offset and range, or an error if it contains invalid UTF-8 data.
78+
#[inline]
7879
pub(crate) fn string_from_slice(
7980
slice: &[u8],
8081
offset: usize,
8182
range: Range<usize>,
8283
) -> Result<&str, ArrowError> {
83-
str::from_utf8(slice_from_slice_at_offset(slice, offset, range)?)
84+
let offset_buffer = slice_from_slice_at_offset(slice, offset, range)?;
85+
86+
// Use simdutf8 by default
87+
#[cfg(feature = "simdutf8")]
88+
{
89+
simdutf8::basic::from_utf8(offset_buffer).map_err(|_| {
90+
// Use simdutf8::compat to return details about the decoding error
91+
let e = simdutf8::compat::from_utf8(offset_buffer).unwrap_err();
92+
ArrowError::InvalidArgumentError(format!("encountered non UTF-8 data: {e}"))
93+
})
94+
}
95+
96+
// Use std::str if simdutf8 is not enabled
97+
#[cfg(not(feature = "simdutf8"))]
98+
str::from_utf8(offset_buffer)
8499
.map_err(|_| ArrowError::InvalidArgumentError("invalid UTF-8 string".to_string()))
85100
}
86101

parquet/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ twox-hash = { version = "2.0", default-features = false, features = ["xxhash64"]
7070
paste = { version = "1.0" }
7171
half = { version = "2.1", default-features = false, features = ["num-traits"] }
7272
crc32fast = { version = "1.4.2", optional = true, default-features = false }
73-
simdutf8 = { version = "0.1.5", optional = true, default-features = false }
73+
simdutf8 = { workspace = true , optional = true }
7474
ring = { version = "0.17", default-features = false, features = ["std"], optional = true }
7575

7676
[dev-dependencies]

0 commit comments

Comments
 (0)