// blob: f60214a34fbb902d4539e05c53068a404dfb08d5 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.uninverting;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.function.Function;
import org.apache.lucene.codecs.DocValuesProducer;
import org.apache.lucene.document.BinaryDocValuesField;
import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.document.SortedDocValuesField;
import org.apache.lucene.document.SortedSetDocValuesField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.CodecReader;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.DocValuesType;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.FilterDirectoryReader;
import org.apache.lucene.index.FilterLeafReader;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.SortedNumericDocValues;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.solr.uninverting.FieldCache.CacheEntry;
/**
* A FilterReader that exposes <i>indexed</i> values as if they also had
* docvalues.
* <p>
* This is accomplished by "inverting the inverted index" or "uninversion".
* <p>
* The uninversion process happens lazily: upon the first request for the
* field's docvalues (e.g. via {@link org.apache.lucene.index.LeafReader#getNumericDocValues(String)}
* or similar), it will create the docvalues on-the-fly if needed and cache it,
* based on the core cache key of the wrapped LeafReader.
*/
public class UninvertingReader extends FilterLeafReader {
/**
* Specifies the type of uninversion to apply for the field.
* <p>
* The {@code *_POINT} and {@code LEGACY_*} constants expose the field as numeric docvalues,
* {@link #BINARY} as binary docvalues, {@link #SORTED} as sorted docvalues and the
* {@code SORTED_SET_*} constants as sorted-set docvalues.
*/
public static enum Type {
/**
* Single-valued Integer, (e.g. indexed with {@link org.apache.lucene.document.IntPoint})
* <p>
* Fields with this type act as if they were indexed with
* {@link NumericDocValuesField}.
*/
INTEGER_POINT,
/**
* Single-valued Long, (e.g. indexed with {@link org.apache.lucene.document.LongPoint})
* <p>
* Fields with this type act as if they were indexed with
* {@link NumericDocValuesField}.
*/
LONG_POINT,
/**
* Single-valued Float, (e.g. indexed with {@link org.apache.lucene.document.FloatPoint})
* <p>
* Fields with this type act as if they were indexed with
* {@link NumericDocValuesField}.
*/
FLOAT_POINT,
/**
* Single-valued Double, (e.g. indexed with {@link org.apache.lucene.document.DoublePoint})
* <p>
* Fields with this type act as if they were indexed with
* {@link NumericDocValuesField}.
*/
DOUBLE_POINT,
/**
* Single-valued Integer, (e.g. indexed with {@link org.apache.solr.legacy.LegacyIntField})
* <p>
* Fields with this type act as if they were indexed with
* {@link NumericDocValuesField}.
* @deprecated Index with points and use {@link #INTEGER_POINT} instead.
*/
@Deprecated
LEGACY_INTEGER,
/**
* Single-valued Long, (e.g. indexed with {@link org.apache.solr.legacy.LegacyLongField})
* <p>
* Fields with this type act as if they were indexed with
* {@link NumericDocValuesField}.
* @deprecated Index with points and use {@link #LONG_POINT} instead.
*/
@Deprecated
LEGACY_LONG,
/**
* Single-valued Float, (e.g. indexed with {@link org.apache.solr.legacy.LegacyFloatField})
* <p>
* Fields with this type act as if they were indexed with
* {@link NumericDocValuesField}.
* @deprecated Index with points and use {@link #FLOAT_POINT} instead.
*/
@Deprecated
LEGACY_FLOAT,
/**
* Single-valued Double, (e.g. indexed with {@link org.apache.solr.legacy.LegacyDoubleField})
* <p>
* Fields with this type act as if they were indexed with
* {@link NumericDocValuesField}.
* @deprecated Index with points and use {@link #DOUBLE_POINT} instead.
*/
@Deprecated
LEGACY_DOUBLE,
/**
* Single-valued Binary, (e.g. indexed with {@link StringField})
* <p>
* Fields with this type act as if they were indexed with
* {@link BinaryDocValuesField}.
*/
BINARY,
/**
* Single-valued Binary, (e.g. indexed with {@link StringField})
* <p>
* Fields with this type act as if they were indexed with
* {@link SortedDocValuesField}.
*/
SORTED,
/**
* Multi-valued Binary, (e.g. indexed with {@link StringField})
* <p>
* Fields with this type act as if they were indexed with
* {@link SortedSetDocValuesField}.
*/
SORTED_SET_BINARY,
/**
* Multi-valued Integer, (e.g. indexed with {@link org.apache.solr.legacy.LegacyIntField})
* <p>
* Fields with this type act as if they were indexed with
* {@link SortedSetDocValuesField}.
*/
SORTED_SET_INTEGER,
/**
* Multi-valued Float, (e.g. indexed with {@link org.apache.solr.legacy.LegacyFloatField})
* <p>
* Fields with this type act as if they were indexed with
* {@link SortedSetDocValuesField}.
*/
SORTED_SET_FLOAT,
/**
* Multi-valued Long, (e.g. indexed with {@link org.apache.solr.legacy.LegacyLongField})
* <p>
* Fields with this type act as if they were indexed with
* {@link SortedSetDocValuesField}.
*/
SORTED_SET_LONG,
/**
* Multi-valued Double, (e.g. indexed with {@link org.apache.solr.legacy.LegacyDoubleField})
* <p>
* Fields with this type act as if they were indexed with
* {@link SortedSetDocValuesField}.
*/
SORTED_SET_DOUBLE
}
/**
 * Convenience overload that resolves each field's uninversion type through a map lookup.
 * A field for which the map yields {@code null} is not uninverted.
 *
 * @param reader input directory reader
 * @param mapping field name to uninversion type
 * @return a wrapped directory reader
 * @see #wrap(DirectoryReader, Function)
 */
public static DirectoryReader wrap(DirectoryReader reader, Map<String, Type> mapping) throws IOException {
  // Bound method reference: evaluated here, so a null map fails immediately.
  final Function<String, Type> lookup = mapping::get;
  return wrap(reader, lookup);
}
/**
 * Wraps the given {@link DirectoryReader} so that its leaves expose uninverted docvalues.
 * The returned reader can be used like any other directory reader, e.g. it can be handed to
 * {@link DirectoryReader#openIfChanged(DirectoryReader)}.
 *
 * @param in input directory reader
 * @param mapper function to map a field name to an uninversion type. A Null result means to not uninvert.
 * @return a wrapped directory reader
 */
public static DirectoryReader wrap(DirectoryReader in, Function<String, Type> mapper) throws IOException {
  final DirectoryReader uninverting = new UninvertingDirectoryReader(in, mapper);
  return uninverting;
}
/**
* Directory-level wrapper: every leaf of the wrapped {@link DirectoryReader} is passed through
* {@link UninvertingReader#wrap(LeafReader, Function)} with the same field-to-type mapper.
*/
static class UninvertingDirectoryReader extends FilterDirectoryReader {
/** Maps a field name to the uninversion type to apply to it. */
final Function<String, Type> mapper;
/**
* @param in the directory reader to wrap
* @param mapper function mapping a field name to its uninversion type
*/
public UninvertingDirectoryReader(DirectoryReader in, final Function<String, Type> mapper) throws IOException {
// The anonymous wrapper captures the constructor parameter, not the field:
// the field is only assigned after the super constructor call has returned.
super(in, new FilterDirectoryReader.SubReaderWrapper() {
@Override
public LeafReader wrap(LeafReader reader) {
return UninvertingReader.wrap(reader, mapper);
}
});
this.mapper = mapper;
}
/** Re-wraps the given directory reader with the same mapper. */
@Override
protected DirectoryReader doWrapDirectoryReader(DirectoryReader in) throws IOException {
return new UninvertingDirectoryReader(in, mapper);
}
// NOTE: delegating the cache helpers is wrong since this wrapper alters the
// content of the reader; it is only fine to do that because Solr ALWAYS
// consumes index readers through this wrapper
@Override
public CacheHelper getReaderCacheHelper() {
return in.getReaderCacheHelper();
}
}
/**
 * Create a new UninvertingReader with the specified mapping, wrapped around the input. It may be deemed that there
 * is no mapping to do, in which case the input is returned.
 * <p>
 * A field counts as "mapped" only when {@link #shouldWrap(FieldInfo, Function)} reports a
 * DocValuesType that differs from the one the field already has. Fields for which it returns
 * {@code null} or the unchanged type are carried over as-is.
 * <p>
 * Expert: This should almost never be used. Use {@link #wrap(DirectoryReader, Function)} instead.
 *
 * @param in the leaf reader to wrap
 * @param mapping function mapping a field name to its uninversion type, or null for none
 * @return {@code in} itself when no field changes, otherwise an UninvertingReader around it
 * @lucene.internal
 */
public static LeafReader wrap(LeafReader in, Function<String, Type> mapping) {
  boolean wrap = false;

  // Calculate a new FieldInfos that has DocValuesType where we didn't before
  ArrayList<FieldInfo> newFieldInfos = new ArrayList<>(in.getFieldInfos().size());
  for (FieldInfo fi : in.getFieldInfos()) {
    DocValuesType type = shouldWrap(fi, mapping);
    // shouldWrap hands back the existing type when nothing changes, so a plain null check
    // would flag every field; only a type that really differs requires wrapping.
    if (type != null && type != fi.getDocValuesType()) {
      wrap = true;
      newFieldInfos.add(new FieldInfo(fi.name, fi.number, fi.hasVectors(), fi.omitsNorms(),
          fi.hasPayloads(), fi.getIndexOptions(), type, fi.getDocValuesGen(), fi.attributes(),
          fi.getPointDataDimensionCount(), fi.getPointIndexDimensionCount(), fi.getPointNumBytes(), fi.isSoftDeletesField()));
    } else {
      newFieldInfos.add(fi);
    }
  }

  if (!wrap) {
    return in;
  }
  FieldInfos fieldInfos = new FieldInfos(newFieldInfos.toArray(new FieldInfo[0]));
  return new UninvertingReader(in, mapping, fieldInfos);
}
/**
 * Works out which DocValuesType a field should be exposed with.
 * <p>
 * The mapping is consulted only for fields that currently have no docvalues and are either
 * indexed or carry single-dimension point data.
 *
 * @param fi the field to inspect
 * @param mapping function mapping a field name to its uninversion type, or null for none
 * @return the field's existing DocValuesType when nothing changes; the synthesized type when
 *         the field can be uninverted; or {@code null} when a type is requested that the
 *         field's index data cannot serve (a point type without points, or a term-based
 *         type on a field that is not indexed)
 */
public static DocValuesType shouldWrap(FieldInfo fi, Function<String, Type> mapping) {
  final DocValuesType existing = fi.getDocValuesType();
  if (existing != DocValuesType.NONE) {
    // already has docvalues, leave untouched
    return existing;
  }

  final boolean indexed = fi.getIndexOptions() != IndexOptions.NONE;
  final boolean uninvertable =
      indexed || (fi.getPointNumBytes() > 0 && fi.getPointDataDimensionCount() == 1);
  if (!uninvertable) {
    return existing;
  }

  final Type requested = mapping.apply(fi.name); // may well be null: nothing to uninvert then
  if (requested == null) {
    return existing;
  }

  switch (requested) {
    // types backed by points
    case INTEGER_POINT:
    case LONG_POINT:
    case FLOAT_POINT:
    case DOUBLE_POINT:
      return fi.getPointDataDimensionCount() == 0 ? null : DocValuesType.NUMERIC;

    // types backed by the inverted index
    case LEGACY_INTEGER:
    case LEGACY_LONG:
    case LEGACY_FLOAT:
    case LEGACY_DOUBLE:
      return indexed ? DocValuesType.NUMERIC : null;
    case BINARY:
      return indexed ? DocValuesType.BINARY : null;
    case SORTED:
      return indexed ? DocValuesType.SORTED : null;
    case SORTED_SET_BINARY:
    case SORTED_SET_INTEGER:
    case SORTED_SET_FLOAT:
    case SORTED_SET_LONG:
    case SORTED_SET_DOUBLE:
      return indexed ? DocValuesType.SORTED_SET : null;

    default:
      if (!indexed) {
        return null;
      }
      throw new AssertionError();
  }
}
/** Maps a field name to the uninversion type to apply; a null result means no uninversion. */
final Function<String, Type> mapping;
/** The field infos this reader reports from {@link #getFieldInfos()}. */
final FieldInfos fieldInfos;
/**
* Private: instances are created through {@link #wrap(LeafReader, Function)}.
*
* @param in the leaf reader to wrap
* @param mapping function mapping a field name to its uninversion type
* @param fieldInfos the field infos this reader should expose
*/
private UninvertingReader(LeafReader in, Function<String, Type> mapping, FieldInfos fieldInfos) {
super(in);
this.mapping = mapping;
this.fieldInfos = fieldInfos;
}
/** Returns the field infos supplied at construction time rather than those of the wrapped reader. */
@Override
public FieldInfos getFieldInfos() {
return fieldInfos;
}
/**
 * Returns numeric docvalues for the field. Fields mapped to a point or legacy numeric type
 * are served through {@code FieldCache.DEFAULT} with the matching parser; every other field
 * (unmapped, or mapped to a non-numeric type) is answered by the superclass.
 */
@Override
public NumericDocValues getNumericDocValues(String field) throws IOException {
  final Type type = getType(field);
  if (type == null) {
    return super.getNumericDocValues(field);
  }
  switch (type) {
    case INTEGER_POINT:
      return FieldCache.DEFAULT.getNumerics(in, field, FieldCache.INT_POINT_PARSER);
    case LONG_POINT:
      return FieldCache.DEFAULT.getNumerics(in, field, FieldCache.LONG_POINT_PARSER);
    case FLOAT_POINT:
      return FieldCache.DEFAULT.getNumerics(in, field, FieldCache.FLOAT_POINT_PARSER);
    case DOUBLE_POINT:
      return FieldCache.DEFAULT.getNumerics(in, field, FieldCache.DOUBLE_POINT_PARSER);
    case LEGACY_INTEGER:
      return FieldCache.DEFAULT.getNumerics(in, field, FieldCache.LEGACY_INT_PARSER);
    case LEGACY_LONG:
      return FieldCache.DEFAULT.getNumerics(in, field, FieldCache.LEGACY_LONG_PARSER);
    case LEGACY_FLOAT:
      return FieldCache.DEFAULT.getNumerics(in, field, FieldCache.LEGACY_FLOAT_PARSER);
    case LEGACY_DOUBLE:
      return FieldCache.DEFAULT.getNumerics(in, field, FieldCache.LEGACY_DOUBLE_PARSER);
    default:
      // BINARY, SORTED and the SORTED_SET_* types are not numeric
      return super.getNumericDocValues(field);
  }
}
/**
 * Returns binary docvalues for the field: through {@code FieldCache.DEFAULT} when the field
 * is mapped to {@link Type#BINARY}, otherwise straight from the wrapped reader.
 */
@Override
public BinaryDocValues getBinaryDocValues(String field) throws IOException {
  if (getType(field) != Type.BINARY) {
    return in.getBinaryDocValues(field);
  }
  return FieldCache.DEFAULT.getTerms(in, field);
}
/**
 * Returns sorted docvalues for the field: through {@code FieldCache.DEFAULT} when the field
 * is mapped to {@link Type#SORTED}, otherwise straight from the wrapped reader.
 */
@Override
public SortedDocValues getSortedDocValues(String field) throws IOException {
  if (getType(field) != Type.SORTED) {
    return in.getSortedDocValues(field);
  }
  return FieldCache.DEFAULT.getTermsIndex(in, field);
}
/**
 * Returns sorted-set docvalues for the field. Fields mapped to a {@code SORTED_SET_*} type
 * are served through {@code FieldCache.DEFAULT}: the integer/float types with the 32-bit
 * term prefix, the long/double types with the 64-bit term prefix, the binary type with no
 * prefix. Every other field is answered by the wrapped reader.
 */
@Override
public SortedSetDocValues getSortedSetDocValues(String field) throws IOException {
  final Type type = getType(field);
  if (type == null) {
    return in.getSortedSetDocValues(field);
  }
  switch (type) {
    case SORTED_SET_BINARY:
      return FieldCache.DEFAULT.getDocTermOrds(in, field, null);
    case SORTED_SET_INTEGER:
    case SORTED_SET_FLOAT:
      return FieldCache.DEFAULT.getDocTermOrds(in, field, FieldCache.INT32_TERM_PREFIX);
    case SORTED_SET_LONG:
    case SORTED_SET_DOUBLE:
      return FieldCache.DEFAULT.getDocTermOrds(in, field, FieldCache.INT64_TERM_PREFIX);
    default:
      // single-valued types are not sorted-set
      return in.getSortedSetDocValues(field);
  }
}
/**
 * Looks up the uninversion type for a field by applying the mapping function to its name.
 *
 * @return whatever the mapping yields; {@code null} means the field has no mapping
 */
private Type getType(String field) {
  final Type uninversionType = mapping.apply(field);
  return uninversionType;
}
// NOTE: delegating the cache helpers is wrong since this wrapper alters the
// content of the reader; it is only fine to do that because Solr ALWAYS
// consumes index readers through this wrapper
/** Returns the core cache helper of the wrapped reader. */
@Override
public CacheHelper getCoreCacheHelper() {
return in.getCoreCacheHelper();
}
/**
* Returns the reader cache helper of the wrapped reader. This wrapper alters the reader's
* content, so sharing the delegate's helper relies on readers always being consumed through
* this wrapper.
*/
@Override
public CacheHelper getReaderCacheHelper() {
return in.getReaderCacheHelper();
}
/** Renders as {@code Uninverting(<wrapped reader>)}. */
@Override
public String toString() {
  final StringBuilder text = new StringBuilder("Uninverting(");
  text.append(in.toString());
  text.append(")");
  return text.toString();
}
/**
 * Summarizes the entries currently held by {@code FieldCache.DEFAULT}: one description
 * string per entry plus the human-readable sum of their reported RAM usage.
 * @lucene.internal
 */
public static FieldCacheStats getUninvertedStats() {
  final CacheEntry[] cacheEntries = FieldCache.DEFAULT.getCacheEntries();
  final String[] descriptions = new String[cacheEntries.length];
  long bytes = 0L;
  int slot = 0;
  for (CacheEntry entry : cacheEntries) {
    descriptions[slot++] = entry.toString();
    bytes += entry.getValue().ramBytesUsed();
  }
  return new FieldCacheStats(RamUsageEstimator.humanReadableUnits(bytes), descriptions);
}
/**
 * Collects simple counters about one field's docvalues in a segment by iterating them.
 * <p>
 * The result map holds:
 * <ul>
 *   <li>{@code numDocs} and {@code expected}: {@code reader.numDocs()}</li>
 *   <li>{@code present}: documents counted as having a value</li>
 *   <li>{@code nullOrZero}: documents whose value is zero, null or empty</li>
 *   <li>{@code delPresent}: documents that have a value but are marked deleted</li>
 * </ul>
 * If the reader has no docvalues of the field's type for this field (it returns null), all
 * counters stay at zero instead of failing. If iteration throws an {@link IOException}, a
 * single-entry map {@code error -> message} is returned instead.
 *
 * @param reader the segment reader to inspect
 * @param fi the field whose docvalues are examined; its DocValuesType selects the iteration
 * @return the counters described above, or an {@code error} map
 */
public static Map<String, Object> getDVStats(CodecReader reader, FieldInfo fi) throws IOException {
  DocValuesType type = fi.getDocValuesType();
  try {
    int present = 0;
    int zeroOrNull = 0;
    int deletedButPresent = 0;
    Bits liveDocs = reader.getLiveDocs();
    int expected = reader.numDocs();

    switch (type) {
      case NUMERIC: {
        NumericDocValues ndv = reader.getNumericDocValues(fi.name);
        if (ndv == null) {
          break; // reader has no numeric docvalues for this field
        }
        while (ndv.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
          if (isDeleted(liveDocs, ndv.docID())) {
            deletedButPresent++;
          }
          if (ndv.longValue() == 0) {
            zeroOrNull++;
          }
          present++;
        }
        break;
      }
      case BINARY: {
        BinaryDocValues bdv = reader.getBinaryDocValues(fi.name);
        if (bdv == null) {
          break;
        }
        while (bdv.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
          if (isDeleted(liveDocs, bdv.docID())) {
            deletedButPresent++;
          }
          BytesRef bytes = bdv.binaryValue();
          if (bytes == null || bytes.length == 0) {
            zeroOrNull++;
          }
          present++;
        }
        break;
      }
      case SORTED: {
        SortedDocValues sdv = reader.getSortedDocValues(fi.name);
        if (sdv == null) {
          break;
        }
        while (sdv.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
          if (isDeleted(liveDocs, sdv.docID())) {
            deletedButPresent++;
          }
          BytesRef bytes = sdv.binaryValue();
          if (bytes == null || bytes.length == 0) {
            zeroOrNull++;
          }
          present++;
        }
        break;
      }
      case SORTED_NUMERIC: {
        SortedNumericDocValues sndv = reader.getSortedNumericDocValues(fi.name);
        if (sndv == null) {
          break;
        }
        while (sndv.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
          if (isDeleted(liveDocs, sndv.docID())) {
            deletedButPresent++;
          }
          int count = sndv.docValueCount();
          if (count > 0) {
            // read every value so that a read failure surfaces as an error result
            for (int j = 0; j < count; j++) {
              sndv.nextValue();
            }
            present++;
          } else {
            zeroOrNull++;
          }
        }
        break;
      }
      case SORTED_SET: {
        SortedSetDocValues ssdv = reader.getSortedSetDocValues(fi.name);
        if (ssdv == null) {
          break;
        }
        while (ssdv.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
          if (isDeleted(liveDocs, ssdv.docID())) {
            deletedButPresent++;
          }
          // NOTE(review): getValueCount() looks like a per-field count of distinct values,
          // not a per-document count -- confirm this check does what was intended.
          if (ssdv.getValueCount() > 0) {
            long ord;
            boolean allPresent = true;
            while ((ord = ssdv.nextOrd()) != SortedSetDocValues.NO_MORE_ORDS) {
              BytesRef term = ssdv.lookupOrd(ord);
              if (term == null || term.length == 0) {
                allPresent = false;
              }
            }
            if (!allPresent) {
              zeroOrNull++;
            }
            present++;
          } else {
            zeroOrNull++;
          }
        }
        break;
      }
      default:
        // no docvalues to iterate: all counters stay at zero
        break;
    }

    Map<String, Object> result = new HashMap<>();
    result.put("numDocs", reader.numDocs());
    result.put("expected", expected);
    result.put("present", present);
    result.put("nullOrZero", zeroOrNull);
    result.put("delPresent", deletedButPresent);
    return result;
  } catch (IOException e) {
    // best effort: report the failure in the result instead of propagating it
    return Collections.singletonMap("error", e.getMessage());
  }
}

/** Tells whether the document is marked deleted; a null liveDocs means nothing is deleted. */
private static boolean isDeleted(Bits liveDocs, int docId) {
  return liveDocs != null && !liveDocs.get(docId);
}
/** Returns the number of entries currently reported by {@code FieldCache.DEFAULT}. */
public static int getUninvertedStatsSize() {
  final CacheEntry[] cacheEntries = FieldCache.DEFAULT.getCacheEntries();
  return cacheEntries.length;
}
/**
* Return information about the backing cache: a plain holder for a total-size string
* and one description string per cache entry.
* @lucene.internal
*/
public static class FieldCacheStats {
/** Total size of the cache entries, as a display string. */
public String totalSize;
/** One description string per cache entry. */
public String[] info;
/**
* @param totalSize total size of the cache entries, as a display string
* @param info one description string per cache entry; stored as-is, not copied
*/
public FieldCacheStats(String totalSize, String[] info) {
this.totalSize = totalSize;
this.info = info;
}
}
}