blob: 508508bada8a258e51fb8dc4c212fe379843efdb [file] [log] [blame]
use std::io;
use std::io::Write;
use common::{intersect_bitsets, BitSet, ReadOnlyBitSet};
use ownedbytes::OwnedBytes;
use crate::space_usage::ByteCount;
use crate::DocId;
/// Write an alive `BitSet`
///
/// where `alive_bitset` is the set of alive `DocId`.
/// Warning: this function does not call terminate. The caller is in charge of
/// closing the writer properly.
pub fn write_alive_bitset<T: Write>(alive_bitset: &BitSet, writer: &mut T) -> io::Result<()> {
alive_bitset.serialize(writer)?;
Ok(())
}
/// Set of alive `DocId`s.
#[derive(Clone)]
pub struct AliveBitSet {
num_alive_docs: usize,
bitset: ReadOnlyBitSet,
}
/// Intersects two AliveBitSets in a new one.
/// The two bitsets need to have the same max_value.
pub fn intersect_alive_bitsets(left: AliveBitSet, right: AliveBitSet) -> AliveBitSet {
assert_eq!(left.bitset().max_value(), right.bitset().max_value());
let bitset = intersect_bitsets(left.bitset(), right.bitset());
let num_alive_docs = bitset.len();
AliveBitSet {
num_alive_docs,
bitset,
}
}
impl AliveBitSet {
#[cfg(test)]
pub(crate) fn for_test_from_deleted_docs(deleted_docs: &[DocId], max_doc: u32) -> AliveBitSet {
assert!(deleted_docs.iter().all(|&doc| doc < max_doc));
let mut bitset = BitSet::with_max_value_and_full(max_doc);
for &doc in deleted_docs {
bitset.remove(doc);
}
let mut alive_bitset_buffer = Vec::new();
write_alive_bitset(&bitset, &mut alive_bitset_buffer).unwrap();
let alive_bitset_bytes = OwnedBytes::new(alive_bitset_buffer);
Self::open(alive_bitset_bytes)
}
pub(crate) fn from_bitset(bitset: &BitSet) -> AliveBitSet {
let readonly_bitset = ReadOnlyBitSet::from(bitset);
AliveBitSet::from(readonly_bitset)
}
/// Opens an alive bitset given its file.
pub fn open(bytes: OwnedBytes) -> AliveBitSet {
let bitset = ReadOnlyBitSet::open(bytes);
AliveBitSet::from(bitset)
}
/// Returns true if the document is still "alive". In other words, if it has not been deleted.
#[inline]
pub fn is_alive(&self, doc: DocId) -> bool {
self.bitset.contains(doc)
}
/// Returns true if the document has been marked as deleted.
#[inline]
pub fn is_deleted(&self, doc: DocId) -> bool {
!self.is_alive(doc)
}
/// Iterate over the alive doc_ids.
#[inline]
pub fn iter_alive(&self) -> impl Iterator<Item = DocId> + '_ {
self.bitset.iter()
}
/// Get underlying bitset.
#[inline]
pub fn bitset(&self) -> &ReadOnlyBitSet {
&self.bitset
}
/// The number of alive documents.
pub fn num_alive_docs(&self) -> usize {
self.num_alive_docs
}
/// Summarize total space usage of this bitset.
pub fn space_usage(&self) -> ByteCount {
self.bitset().num_bytes()
}
}
impl From<ReadOnlyBitSet> for AliveBitSet {
fn from(bitset: ReadOnlyBitSet) -> AliveBitSet {
let num_alive_docs = bitset.len();
AliveBitSet {
num_alive_docs,
bitset,
}
}
}
#[cfg(test)]
mod tests {
use super::AliveBitSet;
#[test]
fn test_alive_bitset_empty() {
let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&[], 10);
for doc in 0..10 {
assert_eq!(alive_bitset.is_deleted(doc), !alive_bitset.is_alive(doc));
assert!(!alive_bitset.is_deleted(doc));
}
assert_eq!(alive_bitset.num_alive_docs(), 10);
}
#[test]
fn test_alive_bitset() {
let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&[1, 9], 10);
assert!(alive_bitset.is_alive(0));
assert!(alive_bitset.is_deleted(1));
assert!(alive_bitset.is_alive(2));
assert!(alive_bitset.is_alive(3));
assert!(alive_bitset.is_alive(4));
assert!(alive_bitset.is_alive(5));
assert!(alive_bitset.is_alive(6));
assert!(alive_bitset.is_alive(6));
assert!(alive_bitset.is_alive(7));
assert!(alive_bitset.is_alive(8));
assert!(alive_bitset.is_deleted(9));
for doc in 0..10 {
assert_eq!(alive_bitset.is_deleted(doc), !alive_bitset.is_alive(doc));
}
assert_eq!(alive_bitset.num_alive_docs(), 8);
}
#[test]
fn test_alive_bitset_iter_minimal() {
let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&[7], 8);
let data: Vec<_> = alive_bitset.iter_alive().collect();
assert_eq!(data, vec![0, 1, 2, 3, 4, 5, 6]);
}
#[test]
fn test_alive_bitset_iter_small() {
let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&[0, 2, 3, 6], 7);
let data: Vec<_> = alive_bitset.iter_alive().collect();
assert_eq!(data, vec![1, 4, 5]);
}
#[test]
fn test_alive_bitset_iter() {
let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&[0, 1, 1000], 1001);
let data: Vec<_> = alive_bitset.iter_alive().collect();
assert_eq!(data, (2..=999).collect::<Vec<_>>());
}
}
#[cfg(all(test, feature = "unstable"))]
mod bench {
use rand::prelude::IteratorRandom;
use rand::thread_rng;
use test::Bencher;
use super::AliveBitSet;
fn get_alive() -> Vec<u32> {
let mut data = (0..1_000_000_u32).collect::<Vec<u32>>();
for _ in 0..(1_000_000) * 1 / 8 {
remove_rand(&mut data);
}
data
}
fn remove_rand(raw: &mut Vec<u32>) {
let i = (0..raw.len()).choose(&mut thread_rng()).unwrap();
raw.remove(i);
}
#[bench]
fn bench_alive_bitset_iter_deser_on_fly(bench: &mut Bencher) {
let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&[0, 1, 1000, 10000], 1_000_000);
bench.iter(|| alive_bitset.iter_alive().collect::<Vec<_>>());
}
#[bench]
fn bench_alive_bitset_access(bench: &mut Bencher) {
let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&[0, 1, 1000, 10000], 1_000_000);
bench.iter(|| {
(0..1_000_000_u32)
.filter(|doc| alive_bitset.is_alive(*doc))
.collect::<Vec<_>>()
});
}
#[bench]
fn bench_alive_bitset_iter_deser_on_fly_1_8_alive(bench: &mut Bencher) {
let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&get_alive(), 1_000_000);
bench.iter(|| alive_bitset.iter_alive().collect::<Vec<_>>());
}
#[bench]
fn bench_alive_bitset_access_1_8_alive(bench: &mut Bencher) {
let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&get_alive(), 1_000_000);
bench.iter(|| {
(0..1_000_000_u32)
.filter(|doc| alive_bitset.is_alive(*doc))
.collect::<Vec<_>>()
});
}
}