Fix out of bounds read in bit chunk iterator (#416)

* Fix out of bounds read in bit chunk iterator

* Add comment why reading one additional byte is enough
diff --git a/arrow/benches/boolean_kernels.rs b/arrow/benches/boolean_kernels.rs
index 6559c4e..e9394f7 100644
--- a/arrow/benches/boolean_kernels.rs
+++ b/arrow/benches/boolean_kernels.rs
@@ -45,6 +45,25 @@
     c.bench_function("and", |b| b.iter(|| bench_and(&array1, &array2)));
     c.bench_function("or", |b| b.iter(|| bench_or(&array1, &array2)));
     c.bench_function("not", |b| b.iter(|| bench_not(&array1)));
+
+    let array1_slice = array1.slice(1, size - 1);
+    let array1_slice = array1_slice
+        .as_any()
+        .downcast_ref::<BooleanArray>()
+        .unwrap();
+    let array2_slice = array2.slice(1, size - 1);
+    let array2_slice = array2_slice
+        .as_any()
+        .downcast_ref::<BooleanArray>()
+        .unwrap();
+
+    c.bench_function("and_sliced", |b| {
+        b.iter(|| bench_and(&array1_slice, &array2_slice))
+    });
+    c.bench_function("or_sliced", |b| {
+        b.iter(|| bench_or(&array1_slice, &array2_slice))
+    });
+    c.bench_function("not_sliced", |b| b.iter(|| bench_not(&array1_slice)));
 }
 
 criterion_group!(benches, add_benchmark);
diff --git a/arrow/src/util/bit_chunk_iterator.rs b/arrow/src/util/bit_chunk_iterator.rs
index c57d021..ea9280c 100644
--- a/arrow/src/util/bit_chunk_iterator.rs
+++ b/arrow/src/util/bit_chunk_iterator.rs
@@ -35,10 +35,10 @@
         let byte_offset = offset / 8;
         let bit_offset = offset % 8;
 
-        let chunk_bits = 8 * std::mem::size_of::<u64>();
-
-        let chunk_len = len / chunk_bits;
-        let remainder_len = len & (chunk_bits - 1);
+        // number of complete u64 chunks
+        let chunk_len = len / 64;
+        // number of remaining bits
+        let remainder_len = len % 64;
 
         BitChunks::<'a> {
             buffer: &buffer[byte_offset..],
@@ -137,14 +137,18 @@
         // so when reading as u64 on a big-endian machine, the bytes need to be swapped
         let current = unsafe { std::ptr::read_unaligned(raw_data.add(index)).to_le() };
 
-        let combined = if self.bit_offset == 0 {
+        let bit_offset = self.bit_offset;
+
+        let combined = if bit_offset == 0 {
             current
         } else {
-            let next =
-                unsafe { std::ptr::read_unaligned(raw_data.add(index + 1)).to_le() };
+            // the constructor ensures that bit_offset is in 0..8
+            // that means we need to read at most one additional byte to fill in the high bits
+            let next = unsafe {
+                std::ptr::read_unaligned(raw_data.add(index + 1) as *const u8) as u64
+            };
 
-            current >> self.bit_offset
-                | (next & ((1 << self.bit_offset) - 1)) << (64 - self.bit_offset)
+            (current >> bit_offset) | (next << (64 - bit_offset))
         };
 
         self.index = index + 1;
@@ -184,7 +188,6 @@
     }
 
     #[test]
-    #[cfg_attr(miri, ignore)] // error: Undefined Behavior: using uninitialized data, but this operation requires initialized memory
     fn test_iter_unaligned() {
         let input: &[u8] = &[
             0b00000000, 0b00000001, 0b00000010, 0b00000100, 0b00001000, 0b00010000,
@@ -206,7 +209,6 @@
     }
 
     #[test]
-    #[cfg_attr(miri, ignore)] // error: Undefined Behavior: using uninitialized data, but this operation requires initialized memory
     fn test_iter_unaligned_remainder_1_byte() {
         let input: &[u8] = &[
             0b00000000, 0b00000001, 0b00000010, 0b00000100, 0b00001000, 0b00010000,
@@ -228,7 +230,6 @@
     }
 
     #[test]
-    #[cfg_attr(miri, ignore)] // error: Undefined Behavior: using uninitialized data, but this operation requires initialized memory
     fn test_iter_unaligned_remainder_bits_across_bytes() {
         let input: &[u8] = &[0b00111111, 0b11111100];
         let buffer: Buffer = Buffer::from(input);
@@ -242,7 +243,6 @@
     }
 
     #[test]
-    #[cfg_attr(miri, ignore)] // error: Undefined Behavior: using uninitialized data, but this operation requires initialized memory
     fn test_iter_unaligned_remainder_bits_large() {
         let input: &[u8] = &[
             0b11111111, 0b00000000, 0b11111111, 0b00000000, 0b11111111, 0b00000000,
@@ -258,4 +258,18 @@
             bitchunks.remainder_bits()
         );
     }
+
+    #[test]
+    fn test_iter_remainder_out_of_bounds() {
+        // allocating a full page should trigger a fault when reading out of bounds
+        const ALLOC_SIZE: usize = 4 * 1024;
+        let input = vec![0xFF_u8; ALLOC_SIZE];
+
+        let buffer: Buffer = Buffer::from(input);
+
+        let bitchunks = buffer.bit_chunks(57, ALLOC_SIZE * 8 - 57);
+
+        assert_eq!(u64::MAX, bitchunks.iter().last().unwrap());
+        assert_eq!(0x7F, bitchunks.remainder_bits());
+    }
 }