[Arrow]Configure max deduplication length for `StringView` (#8990)
Configure max deduplication length when deduplicating strings while
building the array
# Which issue does this PR close?
Configure max deduplication length when deduplicating strings while
building the array
- Closes #7187.
# Rationale for this change
Why are you proposing this change? If this is already explained clearly
in the issue then this section is not needed.
Explaining clearly why changes are proposed helps reviewers understand
your changes and offer better suggestions for fixes.
# What changes are included in this PR?
There is no need to duplicate the description in the issue here but it
is sometimes worth providing a summary of the individual changes in this
PR.
# Are these changes tested?
We typically require tests for all PRs in order to:
1. Prevent the code from being accidentally broken by subsequent changes
2. Serve as another way to document the expected behavior of the code
If tests are not included in your PR, please explain why (for example,
are they covered by existing tests)?
# Are there any user-facing changes?
If there are user-facing changes then we may require documentation to be
updated before approving the PR.
If there are any breaking changes to public APIs, please call them out.
diff --git a/arrow-array/src/builder/generic_bytes_view_builder.rs b/arrow-array/src/builder/generic_bytes_view_builder.rs
index 35f5684..d7ff7ce 100644
--- a/arrow-array/src/builder/generic_bytes_view_builder.rs
+++ b/arrow-array/src/builder/generic_bytes_view_builder.rs
@@ -87,6 +87,7 @@
/// Some if deduplicating strings
/// map `<string hash> -> <index to the views>`
string_tracker: Option<(HashTable<usize>, ahash::RandomState)>,
+ max_deduplication_len: Option<u32>,
phantom: PhantomData<T>,
}
@@ -107,10 +108,28 @@
current_size: STARTING_BLOCK_SIZE,
},
string_tracker: None,
+ max_deduplication_len: None,
phantom: Default::default(),
}
}
+ /// Configure max deduplication length when deduplicating strings while building the array.
+ /// Default is None.
+ ///
+ /// When [`Self::with_deduplicate_strings`] is enabled, the builder attempts to deduplicate
+ /// any strings longer than 12 bytes. However, since it takes time proportional to the length
+ /// of the string to deduplicate, setting this option limits the CPU overhead for this option.
+ pub fn with_max_deduplication_len(self, max_deduplication_len: u32) -> Self {
+ debug_assert!(
+ max_deduplication_len > 0,
+ "max_deduplication_len must be greater than 0"
+ );
+ Self {
+ max_deduplication_len: Some(max_deduplication_len),
+ ..self
+ }
+ }
+
/// Set a fixed buffer size for variable length strings
///
/// The block size is the size of the buffer used to store values greater
@@ -334,35 +353,42 @@
// Deduplication if:
// (1) deduplication is enabled.
- // (2) len > 12
- if let Some((mut ht, hasher)) = self.string_tracker.take() {
- let hash_val = hasher.hash_one(v);
- let hasher_fn = |v: &_| hasher.hash_one(v);
+ // (2) len > `MAX_INLINE_VIEW_LEN` and len <= `max_deduplication_len`
+ let can_deduplicate = self.string_tracker.is_some()
+ && self
+ .max_deduplication_len
+ .map(|max_length| length <= max_length)
+ .unwrap_or(true);
+ if can_deduplicate {
+ if let Some((mut ht, hasher)) = self.string_tracker.take() {
+ let hash_val = hasher.hash_one(v);
+ let hasher_fn = |v: &_| hasher.hash_one(v);
- let entry = ht.entry(
- hash_val,
- |idx| {
- let stored_value = self.get_value(*idx);
- v == stored_value
- },
- hasher_fn,
- );
- match entry {
- Entry::Occupied(occupied) => {
- // If the string already exists, we will directly use the view
- let idx = occupied.get();
- self.views_buffer.push(self.views_buffer[*idx]);
- self.null_buffer_builder.append_non_null();
- self.string_tracker = Some((ht, hasher));
- return Ok(());
+ let entry = ht.entry(
+ hash_val,
+ |idx| {
+ let stored_value = self.get_value(*idx);
+ v == stored_value
+ },
+ hasher_fn,
+ );
+ match entry {
+ Entry::Occupied(occupied) => {
+ // If the string already exists, we will directly use the view
+ let idx = occupied.get();
+ self.views_buffer.push(self.views_buffer[*idx]);
+ self.null_buffer_builder.append_non_null();
+ self.string_tracker = Some((ht, hasher));
+ return Ok(());
+ }
+ Entry::Vacant(vacant) => {
+ // o.w. we insert the (string hash -> view index)
+ // the idx is current length of views_builder, as we are inserting a new view
+ vacant.insert(self.views_buffer.len());
+ }
}
- Entry::Vacant(vacant) => {
- // o.w. we insert the (string hash -> view index)
- // the idx is current length of views_builder, as we are inserting a new view
- vacant.insert(self.views_buffer.len());
- }
+ self.string_tracker = Some((ht, hasher));
}
- self.string_tracker = Some((ht, hasher));
}
let required_cap = self.in_progress.len() + v.len();
@@ -636,9 +662,54 @@
mod tests {
use core::str;
+ use arrow_buffer::ArrowNativeType;
+
use super::*;
#[test]
+ fn test_string_max_deduplication_len() {
+ let value_1 = "short";
+ let value_2 = "not so similar string but long";
+ let value_3 = "1234567890123";
+
+ let max_deduplication_len = MAX_INLINE_VIEW_LEN * 2;
+
+ let mut builder = StringViewBuilder::new()
+ .with_deduplicate_strings()
+ .with_max_deduplication_len(max_deduplication_len);
+
+ assert!(value_1.len() < MAX_INLINE_VIEW_LEN.as_usize());
+ assert!(value_2.len() > max_deduplication_len.as_usize());
+ assert!(
+ value_3.len() > MAX_INLINE_VIEW_LEN.as_usize()
+ && value_3.len() < max_deduplication_len.as_usize()
+ );
+
+ // append value1 (short), expect it is inlined and not deduplicated
+ builder.append_value(value_1); // view 0
+ builder.append_value(value_1); // view 1
+ // append value2, expect second copy is not deduplicated as it exceeds max_deduplication_len
+ builder.append_value(value_2); // view 2
+ builder.append_value(value_2); // view 3
+ // append value3, expect second copy is deduplicated
+ builder.append_value(value_3); // view 4
+ builder.append_value(value_3); // view 5
+
+ let array = builder.finish();
+
+ // verify
+ let v2 = ByteView::from(array.views()[2]);
+ let v3 = ByteView::from(array.views()[3]);
+ assert_eq!(v2.buffer_index, v3.buffer_index); // stored in same buffer
+ assert_ne!(v2.offset, v3.offset); // different offsets --> not deduplicated
+
+ let v4 = ByteView::from(array.views()[4]);
+ let v5 = ByteView::from(array.views()[5]);
+ assert_eq!(v4.buffer_index, v5.buffer_index); // stored in same buffer
+ assert_eq!(v4.offset, v5.offset); // same offsets --> deduplicated
+ }
+
+ #[test]
fn test_string_view_deduplicate() {
let value_1 = "long string to test string view";
let value_2 = "not so similar string but long";