Fix out of bounds read in bit chunk iterator (#416)
* Fix out of bounds read in bit chunk iterator
* Add comment why reading one additional byte is enough
diff --git a/arrow/benches/boolean_kernels.rs b/arrow/benches/boolean_kernels.rs
index 6559c4e..e9394f7 100644
--- a/arrow/benches/boolean_kernels.rs
+++ b/arrow/benches/boolean_kernels.rs
@@ -45,6 +45,25 @@
c.bench_function("and", |b| b.iter(|| bench_and(&array1, &array2)));
c.bench_function("or", |b| b.iter(|| bench_or(&array1, &array2)));
c.bench_function("not", |b| b.iter(|| bench_not(&array1)));
+
+ let array1_slice = array1.slice(1, size - 1);
+ let array1_slice = array1_slice
+ .as_any()
+ .downcast_ref::<BooleanArray>()
+ .unwrap();
+ let array2_slice = array2.slice(1, size - 1);
+ let array2_slice = array2_slice
+ .as_any()
+ .downcast_ref::<BooleanArray>()
+ .unwrap();
+
+ c.bench_function("and_sliced", |b| {
+ b.iter(|| bench_and(&array1_slice, &array2_slice))
+ });
+ c.bench_function("or_sliced", |b| {
+ b.iter(|| bench_or(&array1_slice, &array2_slice))
+ });
+ c.bench_function("not_sliced", |b| b.iter(|| bench_not(&array1_slice)));
}
criterion_group!(benches, add_benchmark);
diff --git a/arrow/src/util/bit_chunk_iterator.rs b/arrow/src/util/bit_chunk_iterator.rs
index c57d021..ea9280c 100644
--- a/arrow/src/util/bit_chunk_iterator.rs
+++ b/arrow/src/util/bit_chunk_iterator.rs
@@ -35,10 +35,10 @@
let byte_offset = offset / 8;
let bit_offset = offset % 8;
- let chunk_bits = 8 * std::mem::size_of::<u64>();
-
- let chunk_len = len / chunk_bits;
- let remainder_len = len & (chunk_bits - 1);
+ // number of complete u64 chunks
+ let chunk_len = len / 64;
+ // number of remaining bits
+ let remainder_len = len % 64;
BitChunks::<'a> {
buffer: &buffer[byte_offset..],
@@ -137,14 +137,18 @@
// so when reading as u64 on a big-endian machine, the bytes need to be swapped
let current = unsafe { std::ptr::read_unaligned(raw_data.add(index)).to_le() };
- let combined = if self.bit_offset == 0 {
+ let bit_offset = self.bit_offset;
+
+ let combined = if bit_offset == 0 {
current
} else {
- let next =
- unsafe { std::ptr::read_unaligned(raw_data.add(index + 1)).to_le() };
+ // the constructor ensures that bit_offset is in 0..8
+ // that means we need to read at most one additional byte to fill in the high bits
+ let next = unsafe {
+ std::ptr::read_unaligned(raw_data.add(index + 1) as *const u8) as u64
+ };
- current >> self.bit_offset
- | (next & ((1 << self.bit_offset) - 1)) << (64 - self.bit_offset)
+ (current >> bit_offset) | (next << (64 - bit_offset))
};
self.index = index + 1;
@@ -184,7 +188,6 @@
}
#[test]
- #[cfg_attr(miri, ignore)] // error: Undefined Behavior: using uninitialized data, but this operation requires initialized memory
fn test_iter_unaligned() {
let input: &[u8] = &[
0b00000000, 0b00000001, 0b00000010, 0b00000100, 0b00001000, 0b00010000,
@@ -206,7 +209,6 @@
}
#[test]
- #[cfg_attr(miri, ignore)] // error: Undefined Behavior: using uninitialized data, but this operation requires initialized memory
fn test_iter_unaligned_remainder_1_byte() {
let input: &[u8] = &[
0b00000000, 0b00000001, 0b00000010, 0b00000100, 0b00001000, 0b00010000,
@@ -228,7 +230,6 @@
}
#[test]
- #[cfg_attr(miri, ignore)] // error: Undefined Behavior: using uninitialized data, but this operation requires initialized memory
fn test_iter_unaligned_remainder_bits_across_bytes() {
let input: &[u8] = &[0b00111111, 0b11111100];
let buffer: Buffer = Buffer::from(input);
@@ -242,7 +243,6 @@
}
#[test]
- #[cfg_attr(miri, ignore)] // error: Undefined Behavior: using uninitialized data, but this operation requires initialized memory
fn test_iter_unaligned_remainder_bits_large() {
let input: &[u8] = &[
0b11111111, 0b00000000, 0b11111111, 0b00000000, 0b11111111, 0b00000000,
@@ -258,4 +258,18 @@
bitchunks.remainder_bits()
);
}
+
+ #[test]
+ fn test_iter_remainder_out_of_bounds() {
+ // allocating a full page should trigger a fault when reading out of bounds
+ const ALLOC_SIZE: usize = 4 * 1024;
+ let input = vec![0xFF_u8; ALLOC_SIZE];
+
+ let buffer: Buffer = Buffer::from(input);
+
+ let bitchunks = buffer.bit_chunks(57, ALLOC_SIZE * 8 - 57);
+
+ assert_eq!(u64::MAX, bitchunks.iter().last().unwrap());
+ assert_eq!(0x7F, bitchunks.remainder_bits());
+ }
}