blob: 45f56c8dbb7caa6ce396a52339fed781d697a6cd [file] [log] [blame]
use std::collections::HashMap;
use std::io;
use common;
use fastfield_codecs::{Column, MonotonicallyMappableToU128, MonotonicallyMappableToU64};
use rustc_hash::FxHashMap;
use tantivy_bitpacker::BlockedBitpacker;
use super::multivalued::{MultiValueU128FastFieldWriter, MultiValuedFastFieldWriter};
use super::FastFieldType;
use crate::fastfield::{BytesFastFieldWriter, CompositeFastFieldSerializer};
use crate::indexer::doc_id_mapping::DocIdMapping;
use crate::postings::UnorderedTermId;
use crate::schema::{Cardinality, Document, Field, FieldEntry, FieldType, Schema, Value};
use crate::termdict::TermOrdinal;
use crate::DatePrecision;
/// The `FastFieldsWriter` groups all of the fast field writers.
pub struct FastFieldsWriter {
term_id_writers: Vec<MultiValuedFastFieldWriter>,
single_value_writers: Vec<IntFastFieldWriter>,
u128_value_writers: Vec<U128FastFieldWriter>,
u128_multi_value_writers: Vec<MultiValueU128FastFieldWriter>,
multi_values_writers: Vec<MultiValuedFastFieldWriter>,
bytes_value_writers: Vec<BytesFastFieldWriter>,
}
pub(crate) fn unexpected_value(expected: &str, actual: &Value) -> crate::TantivyError {
crate::TantivyError::SchemaError(format!(
"Expected a {:?} in fast field, but got {:?}",
expected, actual
))
}
fn fast_field_default_value(field_entry: &FieldEntry) -> u64 {
match *field_entry.field_type() {
FieldType::I64(_) | FieldType::Date(_) => common::i64_to_u64(0i64),
FieldType::F64(_) => common::f64_to_u64(0.0f64),
_ => 0u64,
}
}
impl FastFieldsWriter {
/// Create all `FastFieldWriter` required by the schema.
pub fn from_schema(schema: &Schema) -> FastFieldsWriter {
let mut u128_value_writers = Vec::new();
let mut u128_multi_value_writers = Vec::new();
let mut single_value_writers = Vec::new();
let mut term_id_writers = Vec::new();
let mut multi_values_writers = Vec::new();
let mut bytes_value_writers = Vec::new();
for (field, field_entry) in schema.fields() {
match field_entry.field_type() {
FieldType::I64(ref int_options)
| FieldType::U64(ref int_options)
| FieldType::F64(ref int_options)
| FieldType::Bool(ref int_options) => {
match int_options.get_fastfield_cardinality() {
Some(Cardinality::SingleValue) => {
let mut fast_field_writer = IntFastFieldWriter::new(field, None);
let default_value = fast_field_default_value(field_entry);
fast_field_writer.set_val_if_missing(default_value);
single_value_writers.push(fast_field_writer);
}
Some(Cardinality::MultiValues) => {
let fast_field_writer = MultiValuedFastFieldWriter::new(
field,
FastFieldType::Numeric,
None,
);
multi_values_writers.push(fast_field_writer);
}
None => {}
}
}
FieldType::Date(ref options) => match options.get_fastfield_cardinality() {
Some(Cardinality::SingleValue) => {
let mut fast_field_writer =
IntFastFieldWriter::new(field, Some(options.get_precision()));
let default_value = fast_field_default_value(field_entry);
fast_field_writer.set_val_if_missing(default_value);
single_value_writers.push(fast_field_writer);
}
Some(Cardinality::MultiValues) => {
let fast_field_writer = MultiValuedFastFieldWriter::new(
field,
FastFieldType::Numeric,
Some(options.get_precision()),
);
multi_values_writers.push(fast_field_writer);
}
None => {}
},
FieldType::Facet(_) => {
let fast_field_writer =
MultiValuedFastFieldWriter::new(field, FastFieldType::Facet, None);
term_id_writers.push(fast_field_writer);
}
FieldType::Str(_) if field_entry.is_fast() => {
let fast_field_writer =
MultiValuedFastFieldWriter::new(field, FastFieldType::String, None);
term_id_writers.push(fast_field_writer);
}
FieldType::Bytes(bytes_option) => {
if bytes_option.is_fast() {
let fast_field_writer = BytesFastFieldWriter::new(field);
bytes_value_writers.push(fast_field_writer);
}
}
FieldType::IpAddr(opt) => {
if opt.is_fast() {
match opt.get_fastfield_cardinality() {
Some(Cardinality::SingleValue) => {
let fast_field_writer = U128FastFieldWriter::new(field);
u128_value_writers.push(fast_field_writer);
}
Some(Cardinality::MultiValues) => {
let fast_field_writer = MultiValueU128FastFieldWriter::new(field);
u128_multi_value_writers.push(fast_field_writer);
}
None => {}
}
}
}
FieldType::Str(_) | FieldType::JsonObject(_) => {}
}
}
FastFieldsWriter {
u128_value_writers,
u128_multi_value_writers,
term_id_writers,
single_value_writers,
multi_values_writers,
bytes_value_writers,
}
}
/// The memory used (inclusive childs)
pub fn mem_usage(&self) -> usize {
self.term_id_writers
.iter()
.map(|w| w.mem_usage())
.sum::<usize>()
+ self
.single_value_writers
.iter()
.map(|w| w.mem_usage())
.sum::<usize>()
+ self
.multi_values_writers
.iter()
.map(|w| w.mem_usage())
.sum::<usize>()
+ self
.bytes_value_writers
.iter()
.map(|w| w.mem_usage())
.sum::<usize>()
+ self
.u128_value_writers
.iter()
.map(|w| w.mem_usage())
.sum::<usize>()
+ self
.u128_multi_value_writers
.iter()
.map(|w| w.mem_usage())
.sum::<usize>()
}
/// Get the `FastFieldWriter` associated with a field.
pub fn get_term_id_writer(&self, field: Field) -> Option<&MultiValuedFastFieldWriter> {
// TODO optimize
self.term_id_writers
.iter()
.find(|field_writer| field_writer.field() == field)
}
/// Get the `FastFieldWriter` associated with a field.
pub fn get_field_writer(&self, field: Field) -> Option<&IntFastFieldWriter> {
// TODO optimize
self.single_value_writers
.iter()
.find(|field_writer| field_writer.field() == field)
}
/// Get the `FastFieldWriter` associated with a field.
pub fn get_field_writer_mut(&mut self, field: Field) -> Option<&mut IntFastFieldWriter> {
// TODO optimize
self.single_value_writers
.iter_mut()
.find(|field_writer| field_writer.field() == field)
}
/// Get the `FastFieldWriter` associated with a field.
pub fn get_term_id_writer_mut(
&mut self,
field: Field,
) -> Option<&mut MultiValuedFastFieldWriter> {
// TODO optimize
self.term_id_writers
.iter_mut()
.find(|field_writer| field_writer.field() == field)
}
/// Returns the fast field multi-value writer for the given field.
///
/// Returns `None` if the field does not exist, or is not
/// configured as a multivalued fastfield in the schema.
pub fn get_multivalue_writer_mut(
&mut self,
field: Field,
) -> Option<&mut MultiValuedFastFieldWriter> {
// TODO optimize
self.multi_values_writers
.iter_mut()
.find(|multivalue_writer| multivalue_writer.field() == field)
}
/// Returns the bytes fast field writer for the given field.
///
/// Returns `None` if the field does not exist, or is not
/// configured as a bytes fastfield in the schema.
pub fn get_bytes_writer_mut(&mut self, field: Field) -> Option<&mut BytesFastFieldWriter> {
// TODO optimize
self.bytes_value_writers
.iter_mut()
.find(|field_writer| field_writer.field() == field)
}
/// Indexes all of the fastfields of a new document.
pub fn add_document(&mut self, doc: &Document) -> crate::Result<()> {
for field_writer in &mut self.term_id_writers {
field_writer.add_document(doc)?;
}
for field_writer in &mut self.single_value_writers {
field_writer.add_document(doc)?;
}
for field_writer in &mut self.multi_values_writers {
field_writer.add_document(doc)?;
}
for field_writer in &mut self.bytes_value_writers {
field_writer.add_document(doc)?;
}
for field_writer in &mut self.u128_value_writers {
field_writer.add_document(doc)?;
}
for field_writer in &mut self.u128_multi_value_writers {
field_writer.add_document(doc)?;
}
Ok(())
}
/// Serializes all of the `FastFieldWriter`s by pushing them in
/// order to the fast field serializer.
pub fn serialize(
self,
serializer: &mut CompositeFastFieldSerializer,
mapping: &HashMap<Field, FxHashMap<UnorderedTermId, TermOrdinal>>,
doc_id_map: Option<&DocIdMapping>,
) -> io::Result<()> {
for field_writer in self.term_id_writers {
let field = field_writer.field();
field_writer.serialize(serializer, mapping.get(&field), doc_id_map)?;
}
for field_writer in &self.single_value_writers {
field_writer.serialize(serializer, doc_id_map)?;
}
for field_writer in self.multi_values_writers {
let field = field_writer.field();
field_writer.serialize(serializer, mapping.get(&field), doc_id_map)?;
}
for field_writer in self.bytes_value_writers {
field_writer.serialize(serializer, doc_id_map)?;
}
for field_writer in self.u128_value_writers {
field_writer.serialize(serializer, doc_id_map)?;
}
for field_writer in self.u128_multi_value_writers {
field_writer.serialize(serializer, doc_id_map)?;
}
Ok(())
}
}
/// Fast field writer for u128 values.
/// The fast field writer just keeps the values in memory.
///
/// Only when the segment writer can be closed and
/// persisted on disk, the fast field writer is
/// sent to a `FastFieldSerializer` via the `.serialize(...)`
/// method.
///
/// We cannot serialize earlier as the values are
/// compressed to a compact number space and the number of
/// bits required for bitpacking can only been known once
/// we have seen all of the values.
pub struct U128FastFieldWriter {
field: Field,
vals: Vec<u128>,
val_count: u32,
}
impl U128FastFieldWriter {
/// Creates a new `IntFastFieldWriter`
pub fn new(field: Field) -> Self {
Self {
field,
vals: vec![],
val_count: 0,
}
}
/// The memory used (inclusive childs)
pub fn mem_usage(&self) -> usize {
self.vals.len() * 16
}
/// Records a new value.
///
/// The n-th value being recorded is implicitely
/// associated to the document with the `DocId` n.
/// (Well, `n-1` actually because of 0-indexing)
pub fn add_val(&mut self, val: u128) {
self.vals.push(val);
}
/// Extract the fast field value from the document
/// (or use the default value) and records it.
///
/// Extract the value associated to the fast field for
/// this document.
pub fn add_document(&mut self, doc: &Document) -> crate::Result<()> {
match doc.get_first(self.field) {
Some(v) => {
let ip_addr = v.as_ip_addr().ok_or_else(|| unexpected_value("ip", v))?;
let value = ip_addr.to_u128();
self.add_val(value);
}
None => {
self.add_val(0); // TODO fix null handling
}
};
self.val_count += 1;
Ok(())
}
/// Push the fast fields value to the `FastFieldWriter`.
pub fn serialize(
&self,
serializer: &mut CompositeFastFieldSerializer,
doc_id_map: Option<&DocIdMapping>,
) -> io::Result<()> {
if let Some(doc_id_map) = doc_id_map {
let iter_gen = || {
doc_id_map
.iter_old_doc_ids()
.map(|idx| self.vals[idx as usize])
};
serializer.create_u128_fast_field_with_idx(
self.field,
iter_gen,
self.val_count as u32,
0,
)?;
} else {
let iter_gen = || self.vals.iter().cloned();
serializer.create_u128_fast_field_with_idx(
self.field,
iter_gen,
self.val_count as u32,
0,
)?;
}
Ok(())
}
}
/// Fast field writer for ints.
/// The fast field writer just keeps the values in memory.
///
/// Only when the segment writer can be closed and
/// persisted on disk, the fast field writer is
/// sent to a `FastFieldSerializer` via the `.serialize(...)`
/// method.
///
/// We cannot serialize earlier as the values are
/// bitpacked and the number of bits required for bitpacking
/// can only been known once we have seen all of the values.
///
/// Both u64, i64 and f64 use the same writer.
/// i64 and f64 are just remapped to the `0..2^64 - 1`
/// using `common::i64_to_u64` and `common::f64_to_u64`.
pub struct IntFastFieldWriter {
field: Field,
precision_opt: Option<DatePrecision>,
vals: BlockedBitpacker,
val_count: usize,
val_if_missing: u64,
val_min: u64,
val_max: u64,
}
impl IntFastFieldWriter {
/// Creates a new `IntFastFieldWriter`
pub fn new(field: Field, precision_opt: Option<DatePrecision>) -> IntFastFieldWriter {
IntFastFieldWriter {
field,
precision_opt,
vals: BlockedBitpacker::new(),
val_count: 0,
val_if_missing: 0u64,
val_min: u64::MAX,
val_max: 0,
}
}
/// The memory used (inclusive childs)
pub fn mem_usage(&self) -> usize {
self.vals.mem_usage()
}
/// Returns the field that this writer is targeting.
pub fn field(&self) -> Field {
self.field
}
/// Sets the default value.
///
/// This default value is recorded for documents if
/// a document does not have any value.
fn set_val_if_missing(&mut self, val_if_missing: u64) {
self.val_if_missing = val_if_missing;
}
/// Records a new value.
///
/// The n-th value being recorded is implicitly
/// associated with the document with the `DocId` n.
/// (Well, `n-1` actually because of 0-indexing)
pub fn add_val(&mut self, val: u64) {
self.vals.add(val);
if val > self.val_max {
self.val_max = val;
}
if val < self.val_min {
self.val_min = val;
}
self.val_count += 1;
}
/// Extract the fast field value from the document
/// (or use the default value) and records it.
///
///
/// Extract the value associated with the fast field for
/// this document.
///
/// i64 and f64 are remapped to u64 using the logic
/// in `common::i64_to_u64` and `common::f64_to_u64`.
///
/// If the value is missing, then the default value is used
/// instead.
/// If the document has more than one value for the given field,
/// only the first one is taken in account.
///
/// Values on text fast fields are skipped.
pub fn add_document(&mut self, doc: &Document) -> crate::Result<()> {
match doc.get_first(self.field) {
Some(v) => {
let value = match (self.precision_opt, v) {
(Some(precision), Value::Date(date_val)) => {
date_val.truncate(precision).to_u64()
}
_ => super::value_to_u64(v)?,
};
self.add_val(value);
}
None => {
self.add_val(self.val_if_missing);
}
};
Ok(())
}
/// get iterator over the data
pub(crate) fn iter(&self) -> impl Iterator<Item = u64> + '_ {
self.vals.iter()
}
/// Push the fast fields value to the `FastFieldWriter`.
pub fn serialize(
&self,
serializer: &mut CompositeFastFieldSerializer,
doc_id_map: Option<&DocIdMapping>,
) -> io::Result<()> {
let (min, max) = if self.val_min > self.val_max {
(0, 0)
} else {
(self.val_min, self.val_max)
};
let fastfield_accessor = WriterFastFieldAccessProvider {
doc_id_map,
vals: &self.vals,
min_value: min,
max_value: max,
num_vals: self.val_count as u32,
};
serializer.create_auto_detect_u64_fast_field(self.field, fastfield_accessor)?;
Ok(())
}
}
#[derive(Clone)]
struct WriterFastFieldAccessProvider<'map, 'bitp> {
doc_id_map: Option<&'map DocIdMapping>,
vals: &'bitp BlockedBitpacker,
min_value: u64,
max_value: u64,
num_vals: u32,
}
impl<'map, 'bitp> Column for WriterFastFieldAccessProvider<'map, 'bitp> {
/// Return the value associated with the given doc.
///
/// Whenever possible use the Iterator passed to the fastfield creation instead, for performance
/// reasons.
///
/// # Panics
///
/// May panic if `doc` is greater than the index.
fn get_val(&self, _doc: u32) -> u64 {
unimplemented!()
}
fn iter(&self) -> Box<dyn Iterator<Item = u64> + '_> {
if let Some(doc_id_map) = self.doc_id_map {
Box::new(
doc_id_map
.iter_old_doc_ids()
.map(|doc_id| self.vals.get(doc_id as usize)),
)
} else {
Box::new(self.vals.iter())
}
}
fn min_value(&self) -> u64 {
self.min_value
}
fn max_value(&self) -> u64 {
self.max_value
}
fn num_vals(&self) -> u32 {
self.num_vals
}
}