/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.uninverting;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.WeakHashMap;
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.DocValuesType;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.PointValues;
import org.apache.lucene.index.PointValues.IntersectVisitor;
import org.apache.lucene.index.PointValues.Relation;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.Accountables;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.PagedBytes;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.packed.GrowableWriter;
import org.apache.lucene.util.packed.PackedInts;
import org.apache.lucene.util.packed.PackedLongValues;
/**
* Expert: The default cache implementation, storing all values in memory.
* A {@link WeakHashMap}, keyed by each reader's core cache key, is used for storage.
*
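* <p>A minimal usage sketch (hypothetical caller: {@code leafReader} is any
* {@link LeafReader}, and the field name is an assumption, not part of this class):
*
* <pre>{@code
* FieldCache cache = FieldCache.DEFAULT;
* // Uninvert the postings of an indexed numeric field into per-document longs:
* NumericDocValues values =
*     cache.getNumerics(leafReader, "price", FieldCache.LONG_POINT_PARSER);
* while (values.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
*   long v = values.longValue();
* }
* }</pre>
*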
* @lucene.internal
*/
public class FieldCacheImpl implements FieldCache {
private Map<Class<?>,Cache> caches;
FieldCacheImpl() {
init();
}
private synchronized void init() {
caches = new HashMap<>(6);
caches.put(Long.TYPE, new LongCache(this));
caches.put(BinaryDocValues.class, new BinaryDocValuesCache(this));
caches.put(SortedDocValues.class, new SortedDocValuesCache(this));
caches.put(DocTermOrds.class, new DocTermOrdsCache(this));
caches.put(DocsWithFieldCache.class, new DocsWithFieldCache(this));
}
@Override
public synchronized void purgeAllCaches() {
init();
}
@Override
public synchronized void purgeByCacheKey(IndexReader.CacheKey coreCacheKey) {
for(Cache c : caches.values()) {
c.purgeByCacheKey(coreCacheKey);
}
}
@Override
public synchronized CacheEntry[] getCacheEntries() {
List<CacheEntry> result = new ArrayList<>(17);
for(final Map.Entry<Class<?>,Cache> cacheEntry: caches.entrySet()) {
final Cache cache = cacheEntry.getValue();
final Class<?> cacheType = cacheEntry.getKey();
synchronized(cache.readerCache) {
for (final Map.Entry<IndexReader.CacheKey,Map<CacheKey, Accountable>> readerCacheEntry : cache.readerCache.entrySet()) {
final IndexReader.CacheKey readerKey = readerCacheEntry.getKey();
if (readerKey == null) continue;
final Map<CacheKey, Accountable> innerCache = readerCacheEntry.getValue();
for (final Map.Entry<CacheKey, Accountable> mapEntry : innerCache.entrySet()) {
CacheKey entry = mapEntry.getKey();
result.add(new CacheEntry(readerKey, entry.field,
cacheType, entry.custom,
mapEntry.getValue()));
}
}
}
}
return result.toArray(new CacheEntry[result.size()]);
}
// per-segment fieldcaches don't purge until the shared core closes.
final IndexReader.ClosedListener purgeCore = FieldCacheImpl.this::purgeByCacheKey;
private void initReader(LeafReader reader) {
IndexReader.CacheHelper cacheHelper = reader.getCoreCacheHelper();
if (cacheHelper == null) {
throw new IllegalStateException("Cannot cache on " + reader);
}
cacheHelper.addClosedListener(purgeCore);
}
/** Expert: Internal cache. Maps each reader's core cache key to a per-reader
* map of composite (field, custom) keys to cached values; a reader's entries
* are purged when its core closes (see {@code purgeCore}). */
abstract static class Cache {
Cache(FieldCacheImpl wrapper) {
this.wrapper = wrapper;
}
final FieldCacheImpl wrapper;
final Map<IndexReader.CacheKey,Map<CacheKey,Accountable>> readerCache = new WeakHashMap<>();
protected abstract Accountable createValue(LeafReader reader, CacheKey key)
throws IOException;
/** Remove this reader from the cache, if present. */
public void purgeByCacheKey(IndexReader.CacheKey coreCacheKey) {
synchronized(readerCache) {
readerCache.remove(coreCacheKey);
}
}
/** Sets the key to the value for the provided reader;
* if the key is already set then this doesn't change it. */
public void put(LeafReader reader, CacheKey key, Accountable value) {
IndexReader.CacheHelper cacheHelper = reader.getCoreCacheHelper();
if (cacheHelper == null) {
throw new IllegalStateException("Cannot cache on " + reader);
}
final IndexReader.CacheKey readerKey = cacheHelper.getKey();
synchronized (readerCache) {
Map<CacheKey,Accountable> innerCache = readerCache.get(readerKey);
if (innerCache == null) {
// First time this reader is using FieldCache
innerCache = new HashMap<>();
readerCache.put(readerKey, innerCache);
wrapper.initReader(reader);
}
if (innerCache.get(key) == null) {
innerCache.put(key, value);
} else {
// Another thread beat us to it; leave the current
// value
}
}
}
public Object get(LeafReader reader, CacheKey key) throws IOException {
Map<CacheKey,Accountable> innerCache;
Accountable value;
IndexReader.CacheHelper cacheHelper = reader.getCoreCacheHelper();
if (cacheHelper == null) {
throw new IllegalStateException("Cannot cache on " + reader);
}
final IndexReader.CacheKey readerKey = cacheHelper.getKey();
synchronized (readerCache) {
innerCache = readerCache.get(readerKey);
if (innerCache == null) {
// First time this reader is using FieldCache
innerCache = new HashMap<>();
readerCache.put(readerKey, innerCache);
wrapper.initReader(reader);
value = null;
} else {
value = innerCache.get(key);
}
if (value == null) {
value = new CreationPlaceholder();
innerCache.put(key, value);
}
}
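// Creation protocol: the first thread to miss on a key installs a
// CreationPlaceholder under readerCache's lock, then builds the value while
// holding only the placeholder's lock. Concurrent callers for the same key
// block on that placeholder; callers for other keys or readers are unaffected.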
if (value instanceof CreationPlaceholder) {
synchronized (value) {
CreationPlaceholder progress = (CreationPlaceholder) value;
if (progress.value == null) {
progress.value = createValue(reader, key);
synchronized (readerCache) {
innerCache.put(key, progress.value);
}
}
return progress.value;
}
}
return value;
}
}
/** Expert: Every composite-key in the internal cache is of this type. */
static class CacheKey {
final String field; // which Field
final Object custom; // which custom comparator or parser
/** Creates one of these objects for a custom comparator/parser. */
CacheKey(String field, Object custom) {
this.field = field;
this.custom = custom;
}
/** Two of these are equal iff they reference the same field and custom comparator/parser. */
@Override
public boolean equals(Object o) {
if (o instanceof CacheKey) {
CacheKey other = (CacheKey) o;
if (other.field.equals(field)) {
if (other.custom == null) {
if (custom == null) return true;
} else if (other.custom.equals(custom)) {
return true;
}
}
}
return false;
}
/** Composes a hashcode based on the field and the custom comparator/parser. */
@Override
public int hashCode() {
return field.hashCode() ^ (custom == null ? 0 : custom.hashCode());
}
}
private abstract static class Uninvert {
public Bits docsWithField;
final boolean points;
// pass true to pull from points, otherwise postings.
Uninvert(boolean points) {
this.points = points;
}
final void uninvert(LeafReader reader, String field) throws IOException {
if (points) {
uninvertPoints(reader, field);
} else {
uninvertPostings(reader, field);
}
}
final void uninvertPoints(LeafReader reader, String field) throws IOException {
final int maxDoc = reader.maxDoc();
PointValues values = reader.getPointValues(field);
assert values != null;
assert values.size() > 0;
final boolean setDocsWithField;
final int docCount = values.getDocCount();
assert docCount <= maxDoc;
if (docCount == maxDoc) {
// Fast case: all docs have this field:
this.docsWithField = new Bits.MatchAllBits(maxDoc);
setDocsWithField = false;
} else {
setDocsWithField = true;
}
BytesRef scratch = new BytesRef();
values.intersect(new IntersectVisitor() {
@Override
public void visit(int docID) throws IOException {
throw new AssertionError();
}
@Override
public void visit(int docID, byte[] packedValue) throws IOException {
scratch.bytes = packedValue;
scratch.length = packedValue.length;
visitTerm(scratch);
visitDoc(docID);
if (setDocsWithField) {
if (docsWithField == null) {
// Lazy init
docsWithField = new FixedBitSet(maxDoc);
}
((FixedBitSet)docsWithField).set(docID);
}
}
@Override
public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
return Relation.CELL_CROSSES_QUERY; // inspect all byte-docid pairs
}
});
}
final void uninvertPostings(LeafReader reader, String field) throws IOException {
final int maxDoc = reader.maxDoc();
Terms terms = reader.terms(field);
if (terms != null) {
final boolean setDocsWithField;
final int termsDocCount = terms.getDocCount();
assert termsDocCount <= maxDoc;
if (termsDocCount == maxDoc) {
// Fast case: all docs have this field:
this.docsWithField = new Bits.MatchAllBits(maxDoc);
setDocsWithField = false;
} else {
setDocsWithField = true;
}
final TermsEnum termsEnum = termsEnum(terms);
PostingsEnum docs = null;
FixedBitSet docsWithField = null;
while (true) {
final BytesRef term = termsEnum.next();
if (term == null) {
break;
}
visitTerm(term);
docs = termsEnum.postings(docs, PostingsEnum.NONE);
while (true) {
final int docID = docs.nextDoc();
if (docID == DocIdSetIterator.NO_MORE_DOCS) {
break;
}
visitDoc(docID);
if (setDocsWithField) {
if (docsWithField == null) {
// Lazy init
this.docsWithField = docsWithField = new FixedBitSet(maxDoc);
}
docsWithField.set(docID);
}
}
}
}
}
/** Returns the TermsEnum to walk for this field; implementations may filter the terms. */
protected abstract TermsEnum termsEnum(Terms terms) throws IOException;
/** Called once per term, before that term's documents are visited. */
protected abstract void visitTerm(BytesRef term);
/** Called once per document containing the current term. */
protected abstract void visitDoc(int docID);
}
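// A minimal sketch of the Uninvert callback contract (hypothetical subclass;
// the field name "myField" is an assumption):
//
//   new Uninvert(false) { // false: walk postings rather than points
//     @Override protected TermsEnum termsEnum(Terms terms) throws IOException {
//       return terms.iterator(); // visit every term of the field
//     }
//     @Override protected void visitTerm(BytesRef term) { ... decode term ... }
//     @Override protected void visitDoc(int docID) { ... record value for docID ... }
//   }.uninvert(reader, "myField");
//
// visitTerm is called once per term, and visitDoc once per document containing
// the current term; LongCache below is the real consumer of this contract.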
// null Bits means no docs matched
void setDocsWithField(LeafReader reader, String field, Bits docsWithField, Parser parser) {
final int maxDoc = reader.maxDoc();
final Bits bits;
if (docsWithField == null) {
bits = new Bits.MatchNoBits(maxDoc);
} else if (docsWithField instanceof FixedBitSet) {
final int numSet = ((FixedBitSet) docsWithField).cardinality();
if (numSet >= maxDoc) {
// The cardinality of the BitSet is maxDoc if all documents have a value.
assert numSet == maxDoc;
bits = new Bits.MatchAllBits(maxDoc);
} else {
bits = docsWithField;
}
} else {
bits = docsWithField;
}
caches.get(DocsWithFieldCache.class).put(reader, new CacheKey(field, parser), new BitsEntry(bits));
}
private static class HoldsOneThing<T> {
private T it;
public void set(T it) {
this.it = it;
}
public T get() {
return it;
}
}
private static class GrowableWriterAndMinValue {
GrowableWriterAndMinValue(GrowableWriter array, long minValue) {
this.writer = array;
this.minValue = minValue;
}
public GrowableWriter writer;
public long minValue;
}
@Override
public Bits getDocsWithField(LeafReader reader, String field, Parser parser) throws IOException {
final FieldInfo fieldInfo = reader.getFieldInfos().fieldInfo(field);
if (fieldInfo == null) {
// field does not exist or has no value
return new Bits.MatchNoBits(reader.maxDoc());
}
if (fieldInfo.getDocValuesType() != DocValuesType.NONE) {
// doc values case
} else if (parser instanceof PointParser) {
// points case
} else {
// postings case
if (fieldInfo.getIndexOptions() == IndexOptions.NONE) {
return new Bits.MatchNoBits(reader.maxDoc());
}
}
BitsEntry bitsEntry = (BitsEntry) caches.get(DocsWithFieldCache.class).get(reader, new CacheKey(field, parser));
return bitsEntry.bits;
}
static class BitsEntry implements Accountable {
final Bits bits;
BitsEntry(Bits bits) {
this.bits = bits;
}
@Override
public long ramBytesUsed() {
long base = RamUsageEstimator.NUM_BYTES_OBJECT_REF;
if (bits instanceof Bits.MatchAllBits || bits instanceof Bits.MatchNoBits) {
return base;
} else {
return base + (bits.length() >>> 3);
}
}
}
static final class DocsWithFieldCache extends Cache {
DocsWithFieldCache(FieldCacheImpl wrapper) {
super(wrapper);
}
@Override
protected BitsEntry createValue(LeafReader reader, CacheKey key) throws IOException {
final String field = key.field;
final Parser parser = (Parser) key.custom;
FieldInfo fieldInfo = reader.getFieldInfos().fieldInfo(field);
if (fieldInfo.getDocValuesType() != DocValuesType.NONE) {
return createValueDocValues(reader, field);
} else if (parser instanceof PointParser) {
return createValuePoints(reader, field);
} else {
return createValuePostings(reader, field);
}
}
private BitsEntry createValueDocValues(LeafReader reader, String field) throws IOException {
FieldInfo fieldInfo = reader.getFieldInfos().fieldInfo(field);
DocValuesType dvType = fieldInfo.getDocValuesType();
DocIdSetIterator iterator;
switch(dvType) {
case NUMERIC:
iterator = reader.getNumericDocValues(field);
break;
case BINARY:
iterator = reader.getBinaryDocValues(field);
break;
case SORTED:
iterator = reader.getSortedDocValues(field);
break;
case SORTED_NUMERIC:
iterator = reader.getSortedNumericDocValues(field);
break;
case SORTED_SET:
iterator = reader.getSortedSetDocValues(field);
break;
default:
throw new AssertionError();
}
FixedBitSet bits = new FixedBitSet(reader.maxDoc());
while (true) {
int docID = iterator.nextDoc();
if (docID == DocIdSetIterator.NO_MORE_DOCS) {
break;
}
bits.set(docID);
}
return new BitsEntry(bits);
}
private BitsEntry createValuePoints(LeafReader reader, String field) throws IOException {
final int maxDoc = reader.maxDoc();
PointValues values = reader.getPointValues(field);
assert values != null;
assert values.size() > 0;
final int docCount = values.getDocCount();
assert docCount <= maxDoc;
if (docCount == maxDoc) {
// Fast case: all docs have this field:
return new BitsEntry(new Bits.MatchAllBits(maxDoc));
}
// otherwise, uninvert with no-op visitors just to collect docsWithField:
Uninvert u = new Uninvert(true) {
@Override
protected TermsEnum termsEnum(Terms terms) throws IOException {
throw new AssertionError();
}
@Override
protected void visitTerm(BytesRef term) {}
@Override
protected void visitDoc(int docID) {}
};
u.uninvert(reader, field);
return new BitsEntry(u.docsWithField);
}
// TODO: it is dumb that uninverting code is duplicated here in this method!!
private BitsEntry createValuePostings(LeafReader reader, String field) throws IOException {
final int maxDoc = reader.maxDoc();
// Visit all docs that have terms for this field
FixedBitSet res = null;
Terms terms = reader.terms(field);
if (terms != null) {
final int termsDocCount = terms.getDocCount();
assert termsDocCount <= maxDoc;
if (termsDocCount == maxDoc) {
// Fast case: all docs have this field:
return new BitsEntry(new Bits.MatchAllBits(maxDoc));
}
final TermsEnum termsEnum = terms.iterator();
PostingsEnum docs = null;
while (true) {
final BytesRef term = termsEnum.next();
if (term == null) {
break;
}
if (res == null) {
// lazy init
res = new FixedBitSet(maxDoc);
}
docs = termsEnum.postings(docs, PostingsEnum.NONE);
// TODO: use bulk API
while (true) {
final int docID = docs.nextDoc();
if (docID == DocIdSetIterator.NO_MORE_DOCS) {
break;
}
res.set(docID);
}
}
}
if (res == null) {
return new BitsEntry(new Bits.MatchNoBits(maxDoc));
}
final int numSet = res.cardinality();
if (numSet >= maxDoc) {
// The cardinality of the BitSet is maxDoc if all documents have a value.
assert numSet == maxDoc;
return new BitsEntry(new Bits.MatchAllBits(maxDoc));
}
return new BitsEntry(res);
}
}
@Override
public NumericDocValues getNumerics(LeafReader reader, String field, Parser parser) throws IOException {
if (parser == null) {
throw new NullPointerException();
}
final NumericDocValues valuesIn = reader.getNumericDocValues(field);
if (valuesIn != null) {
return valuesIn;
} else {
final FieldInfo info = reader.getFieldInfos().fieldInfo(field);
if (info == null) {
return DocValues.emptyNumeric();
} else if (info.getDocValuesType() != DocValuesType.NONE) {
throw new IllegalStateException("Type mismatch: " + field + " was indexed as " + info.getDocValuesType());
}
if (parser instanceof PointParser) {
// points case
// no points in this segment
if (info.getPointDimensionCount() == 0) {
return DocValues.emptyNumeric();
}
if (info.getPointDimensionCount() != 1) {
throw new IllegalStateException("Type mismatch: " + field + " was indexed with dimensions=" + info.getPointDimensionCount());
}
PointValues values = reader.getPointValues(field);
// no actual points for this field (e.g. all points deleted)
if (values == null || values.size() == 0) {
return DocValues.emptyNumeric();
}
// not single-valued
if (values.size() != values.getDocCount()) {
throw new IllegalStateException("Type mismatch: " + field + " was indexed with multiple values, numValues=" + values.size() + ",numDocs=" + values.getDocCount());
}
} else {
// postings case
// not indexed
if (info.getIndexOptions() == IndexOptions.NONE) {
return DocValues.emptyNumeric();
}
}
return ((LongsFromArray) caches.get(Long.TYPE).get(reader, new CacheKey(field, parser))).iterator();
}
}
public static class LongsFromArray implements Accountable {
private final PackedInts.Reader values;
private final long minValue;
private final Bits docsWithField;
private final String field;
public LongsFromArray(String field, PackedInts.Reader values, long minValue, Bits docsWithField) { // TODO: accept null docsWithField?
this.field = field;
this.values = values;
this.minValue = minValue;
this.docsWithField = docsWithField;
}
@Override
public long ramBytesUsed() {
return values.ramBytesUsed() + RamUsageEstimator.NUM_BYTES_OBJECT_REF + Long.BYTES;
}
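/** Returns an iterator view: stored values are deltas from {@code minValue},
* and {@code docsWithField} masks documents that have no value. */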
public NumericDocValues iterator() {
return new NumericDocValues() {
int docID = -1;
@Override
public int docID() {
return docID;
}
@Override
public int nextDoc() {
while (true) {
docID++;
if (docID >= values.size()) {
docID = NO_MORE_DOCS;
return docID;
}
if (docsWithField.get(docID)) {
return docID;
}
}
}
@Override
public int advance(int target) {
if (target < values.size()) {
docID = target;
if (docsWithField.get(docID)) {
return docID;
} else {
return nextDoc();
}
} else {
docID = NO_MORE_DOCS;
return docID;
}
}
@Override
public boolean advanceExact(int target) throws IOException {
docID = target;
return docsWithField.get(docID);
}
@Override
public long cost() {
return values.size();
}
@Override
public long longValue() {
return minValue + values.get(docID);
}
};
}
}
static final class LongCache extends Cache {
LongCache(FieldCacheImpl wrapper) {
super(wrapper);
}
@Override
protected Accountable createValue(final LeafReader reader, CacheKey key)
throws IOException {
final Parser parser = (Parser) key.custom;
final HoldsOneThing<GrowableWriterAndMinValue> valuesRef = new HoldsOneThing<>();
Uninvert u = new Uninvert(parser instanceof PointParser) {
private long minValue;
private long currentValue;
private GrowableWriter values;
@Override
public void visitTerm(BytesRef term) {
currentValue = parser.parseValue(term);
if (values == null) {
// Lazy alloc: in the numeric-field case (which hits a
// NumberFormatException when we first try the
// DEFAULT_INT_PARSER), we avoid allocating twice:
int startBitsPerValue;
// Make sure that missing values (0) can be stored without resizing
if (currentValue < 0) {
minValue = currentValue;
startBitsPerValue = minValue == Long.MIN_VALUE ? 64 : PackedInts.bitsRequired(-minValue);
} else {
minValue = 0;
startBitsPerValue = PackedInts.bitsRequired(currentValue);
}
values = new GrowableWriter(startBitsPerValue, reader.maxDoc(), PackedInts.FAST);
if (minValue != 0) {
values.fill(0, values.size(), -minValue); // default value must be 0
}
valuesRef.set(new GrowableWriterAndMinValue(values, minValue));
}
}
@Override
public void visitDoc(int docID) {
values.set(docID, currentValue - minValue);
}
@Override
protected TermsEnum termsEnum(Terms terms) throws IOException {
return parser.termsEnum(terms);
}
};
u.uninvert(reader, key.field);
wrapper.setDocsWithField(reader, key.field, u.docsWithField, parser);
GrowableWriterAndMinValue values = valuesRef.get();
Bits docsWithField = u.docsWithField == null ? new Bits.MatchNoBits(reader.maxDoc()) : u.docsWithField;
if (values == null) {
return new LongsFromArray(key.field, new PackedInts.NullReader(reader.maxDoc()), 0L, docsWithField);
}
return new LongsFromArray(key.field, values.writer.getMutable(), values.minValue, docsWithField);
}
}
public static class SortedDocValuesImpl implements Accountable {
private final PagedBytes.Reader bytes;
private final PackedLongValues termOrdToBytesOffset;
private final PackedInts.Reader docToTermOrd;
private final int numOrd;
public SortedDocValuesImpl(PagedBytes.Reader bytes, PackedLongValues termOrdToBytesOffset, PackedInts.Reader docToTermOrd, int numOrd) {
this.bytes = bytes;
this.docToTermOrd = docToTermOrd;
this.termOrdToBytesOffset = termOrdToBytesOffset;
this.numOrd = numOrd;
}
public SortedDocValues iterator() {
return new Iter();
}
public class Iter extends SortedDocValues {
private int docID = -1;
private final BytesRef term = new BytesRef();
/** Specific to this implementation and subject to change. For internal optimization only.
* @lucene.internal */
public int getOrd(int docID) {
// Subtract 1, matching the 1+ord we did when
// storing, so that missing values, which are 0 in the
// packed ints, are returned as -1 ord:
return (int) docToTermOrd.get(docID)-1;
}
@Override
public int docID() {
return docID;
}
@Override
public int nextDoc() {
while (true) {
docID++;
if (docID >= docToTermOrd.size()) {
docID = NO_MORE_DOCS;
return docID;
}
if (docToTermOrd.get(docID) != 0) {
return docID;
}
}
}
@Override
public int advance(int target) {
if (target < docToTermOrd.size()) {
docID = target;
if (docToTermOrd.get(docID) != 0) {
return docID;
} else {
return nextDoc();
}
} else {
docID = NO_MORE_DOCS;
return docID;
}
}
@Override
public boolean advanceExact(int target) throws IOException {
docID = target;
return docToTermOrd.get(docID) != 0;
}
@Override
public long cost() {
return 0;
}
@Override
public int ordValue() {
// Subtract 1, matching the 1+ord we did when
// storing, so that missing values, which are 0 in the
// packed ints, are returned as -1 ord:
return (int) docToTermOrd.get(docID)-1;
}
@Override
public int getValueCount() {
return numOrd;
}
@Override
public BytesRef lookupOrd(int ord) {
if (ord < 0) {
throw new IllegalArgumentException("ord must be >=0 (got ord=" + ord + ")");
}
bytes.fill(term, termOrdToBytesOffset.get(ord));
return term;
}
}
@Override
public long ramBytesUsed() {
return bytes.ramBytesUsed() +
termOrdToBytesOffset.ramBytesUsed() +
docToTermOrd.ramBytesUsed() +
3*RamUsageEstimator.NUM_BYTES_OBJECT_REF +
Integer.BYTES;
}
@Override
public Collection<Accountable> getChildResources() {
List<Accountable> resources = new ArrayList<>(3);
resources.add(Accountables.namedAccountable("term bytes", bytes));
resources.add(Accountables.namedAccountable("ord -> term", termOrdToBytesOffset));
resources.add(Accountables.namedAccountable("doc -> ord", docToTermOrd));
return Collections.unmodifiableList(resources);
}
}
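/** Returns a sorted-ordinals view of {@code field}: the segment's native
* SORTED doc values when present, otherwise an ordinal index uninverted from
* postings and cached. Equivalent to
* {@code getTermsIndex(reader, field, PackedInts.FAST)}. */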
public SortedDocValues getTermsIndex(LeafReader reader, String field) throws IOException {
return getTermsIndex(reader, field, PackedInts.FAST);
}
public SortedDocValues getTermsIndex(LeafReader reader, String field, float acceptableOverheadRatio) throws IOException {
SortedDocValues valuesIn = reader.getSortedDocValues(field);
if (valuesIn != null) {
// Not cached here by FieldCacheImpl (cached instead
// per-thread by SegmentReader):
return valuesIn;
} else {
final FieldInfo info = reader.getFieldInfos().fieldInfo(field);
if (info == null) {
return DocValues.emptySorted();
} else if (info.getDocValuesType() != DocValuesType.NONE) {
// we don't try to build a sorted instance from numeric/binary doc
// values because dedup can be very costly
throw new IllegalStateException("Type mismatch: " + field + " was indexed as " + info.getDocValuesType());
} else if (info.getIndexOptions() == IndexOptions.NONE) {
return DocValues.emptySorted();
}
SortedDocValuesImpl impl = (SortedDocValuesImpl) caches.get(SortedDocValues.class).get(reader, new CacheKey(field, acceptableOverheadRatio));
return impl.iterator();
}
}
static class SortedDocValuesCache extends Cache {
SortedDocValuesCache(FieldCacheImpl wrapper) {
super(wrapper);
}
@Override
protected Accountable createValue(LeafReader reader, CacheKey key)
throws IOException {
final int maxDoc = reader.maxDoc();
Terms terms = reader.terms(key.field);
final float acceptableOverheadRatio = ((Float) key.custom).floatValue();
final PagedBytes bytes = new PagedBytes(15);
int startTermsBPV;
// TODO: use Uninvert?
if (terms != null) {
// Try for coarse estimate for number of bits; this
// should be an underestimate most of the time, which
// is fine -- GrowableWriter will reallocate as needed
long numUniqueTerms = terms.size();
if (numUniqueTerms != -1L) {
if (numUniqueTerms > maxDoc) {
throw new IllegalStateException("Type mismatch: " + key.field + " was indexed with multiple values per document, use SORTED_SET instead");
}
startTermsBPV = PackedInts.bitsRequired(numUniqueTerms);
} else {
startTermsBPV = 1;
}
} else {
startTermsBPV = 1;
}
PackedLongValues.Builder termOrdToBytesOffset = PackedLongValues.monotonicBuilder(PackedInts.COMPACT);
final GrowableWriter docToTermOrd = new GrowableWriter(startTermsBPV, maxDoc, acceptableOverheadRatio);
int termOrd = 0;
// TODO: use Uninvert?
if (terms != null) {
final TermsEnum termsEnum = terms.iterator();
PostingsEnum docs = null;
while (true) {
final BytesRef term = termsEnum.next();
if (term == null) {
break;
}
if (termOrd >= maxDoc) {
throw new IllegalStateException("Type mismatch: " + key.field + " was indexed with multiple values per document, use SORTED_SET instead");
}
termOrdToBytesOffset.add(bytes.copyUsingLengthPrefix(term));
docs = termsEnum.postings(docs, PostingsEnum.NONE);
while (true) {
final int docID = docs.nextDoc();
if (docID == DocIdSetIterator.NO_MORE_DOCS) {
break;
}
// Store 1+ ord into packed bits
docToTermOrd.set(docID, 1+termOrd);
}
termOrd++;
}
}
// maybe an int-only impl?
return new SortedDocValuesImpl(bytes.freeze(true), termOrdToBytesOffset.build(), docToTermOrd.getMutable(), termOrd);
}
}
public static class BinaryDocValuesImpl implements Accountable {
private final PagedBytes.Reader bytes;
private final PackedInts.Reader docToOffset;
private final Bits docsWithField;
public BinaryDocValuesImpl(PagedBytes.Reader bytes, PackedInts.Reader docToOffset, Bits docsWithField) {
this.bytes = bytes;
this.docToOffset = docToOffset;
this.docsWithField = docsWithField;
}
public BinaryDocValues iterator() {
return new BinaryDocValues() {
final BytesRef term = new BytesRef();
int docID = -1;
@Override
public int docID() {
return docID;
}
@Override
public int nextDoc() {
while (true) {
docID++;
if (docID >= docToOffset.size()) {
docID = NO_MORE_DOCS;
return docID;
}
if (docsWithField.get(docID)) {
return docID;
}
}
}
@Override
public int advance(int target) {
if (target < docToOffset.size()) {
docID = target;
if (docsWithField.get(docID)) {
return docID;
} else {
return nextDoc();
}
} else {
docID = NO_MORE_DOCS;
return docID;
}
}
@Override
public boolean advanceExact(int target) throws IOException {
docID = target;
return docsWithField.get(docID);
}
@Override
public long cost() {
return 0;
}
@Override
public BytesRef binaryValue() {
final long pointer = docToOffset.get(docID);
if (pointer == 0) {
term.length = 0;
} else {
bytes.fill(term, pointer);
}
return term;
}
};
}
@Override
public long ramBytesUsed() {
return bytes.ramBytesUsed() + docToOffset.ramBytesUsed() + 2*RamUsageEstimator.NUM_BYTES_OBJECT_REF;
}
@Override
public Collection<Accountable> getChildResources() {
List<Accountable> resources = new ArrayList<>(2);
resources.add(Accountables.namedAccountable("term bytes", bytes));
resources.add(Accountables.namedAccountable("addresses", docToOffset));
return Collections.unmodifiableList(resources);
}
}
// TODO: if a DocTermsIndex was already created for this field, we
// should share it...
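/** Returns a {@link BinaryDocValues} view of {@code field}: native binary or
* sorted doc values when present, otherwise term bytes uninverted from
* postings and cached. Equivalent to
* {@code getTerms(reader, field, PackedInts.FAST)}. */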
public BinaryDocValues getTerms(LeafReader reader, String field) throws IOException {
return getTerms(reader, field, PackedInts.FAST);
}
public BinaryDocValues getTerms(LeafReader reader, String field, float acceptableOverheadRatio) throws IOException {
BinaryDocValues valuesIn = reader.getBinaryDocValues(field);
if (valuesIn == null) {
valuesIn = reader.getSortedDocValues(field);
}
if (valuesIn != null) {
// Not cached here by FieldCacheImpl (cached instead
// per-thread by SegmentReader):
return valuesIn;
}
final FieldInfo info = reader.getFieldInfos().fieldInfo(field);
if (info == null) {
return DocValues.emptyBinary();
} else if (info.getDocValuesType() != DocValuesType.NONE) {
throw new IllegalStateException("Type mismatch: " + field + " was indexed as " + info.getDocValuesType());
} else if (info.getIndexOptions() == IndexOptions.NONE) {
return DocValues.emptyBinary();
}
BinaryDocValuesImpl impl = (BinaryDocValuesImpl) caches.get(BinaryDocValues.class).get(reader, new CacheKey(field, acceptableOverheadRatio));
return impl.iterator();
}
static final class BinaryDocValuesCache extends Cache {
BinaryDocValuesCache(FieldCacheImpl wrapper) {
super(wrapper);
}
@Override
protected Accountable createValue(LeafReader reader, CacheKey key)
throws IOException {
// TODO: would be nice to first check if DocTermsIndex
// was already cached for this field and then return
// that instead, to avoid insanity
final int maxDoc = reader.maxDoc();
Terms terms = reader.terms(key.field);
final float acceptableOverheadRatio = ((Float) key.custom).floatValue();
final int termCountHardLimit = maxDoc;
// Holds the actual term data, expanded.
final PagedBytes bytes = new PagedBytes(15);
int startBPV;
if (terms != null) {
// Try for coarse estimate for number of bits; this
// should be an underestimate most of the time, which
// is fine -- GrowableWriter will reallocate as needed
long numUniqueTerms = terms.size();
if (numUniqueTerms != -1L) {
if (numUniqueTerms > termCountHardLimit) {
numUniqueTerms = termCountHardLimit;
}
startBPV = PackedInts.bitsRequired(numUniqueTerms*4);
} else {
startBPV = 1;
}
} else {
startBPV = 1;
}
final GrowableWriter docToOffset = new GrowableWriter(startBPV, maxDoc, acceptableOverheadRatio);
// pointer==0 means not set
bytes.copyUsingLengthPrefix(new BytesRef());
if (terms != null) {
int termCount = 0;
final TermsEnum termsEnum = terms.iterator();
PostingsEnum docs = null;
while (true) {
if (termCount++ == termCountHardLimit) {
// app is misusing the API (there is more than
// one term per doc); in this case we make best
// effort to load what we can (see LUCENE-2142)
break;
}
final BytesRef term = termsEnum.next();
if (term == null) {
break;
}
final long pointer = bytes.copyUsingLengthPrefix(term);
docs = termsEnum.postings(docs, PostingsEnum.NONE);
while (true) {
final int docID = docs.nextDoc();
if (docID == DocIdSetIterator.NO_MORE_DOCS) {
break;
}
docToOffset.set(docID, pointer);
}
}
}
final PackedInts.Reader offsetReader = docToOffset.getMutable();
Bits docsWithField = new Bits() {
@Override
public boolean get(int index) {
return offsetReader.get(index) != 0;
}
@Override
public int length() {
return maxDoc;
}
};
wrapper.setDocsWithField(reader, key.field, docsWithField, null);
// maybe an int-only impl?
return new BinaryDocValuesImpl(bytes.freeze(true), offsetReader, docsWithField);
}
}
// TODO: if a DocTermsIndex was already created for this field, we
// should share it...
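/** Returns a multi-valued view of {@code field}: native SORTED_SET or SORTED
* doc values when present, otherwise ordinals uninverted from postings into a
* {@link DocTermOrds}. The optional {@code prefix} is restricted to the
* numeric INT32/INT64 term prefixes and filters which terms are uninverted. */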
public SortedSetDocValues getDocTermOrds(LeafReader reader, String field, BytesRef prefix) throws IOException {
// not a general purpose filtering mechanism...
assert prefix == null || prefix == INT32_TERM_PREFIX || prefix == INT64_TERM_PREFIX;
SortedSetDocValues dv = reader.getSortedSetDocValues(field);
if (dv != null) {
return dv;
}
SortedDocValues sdv = reader.getSortedDocValues(field);
if (sdv != null) {
return DocValues.singleton(sdv);
}
final FieldInfo info = reader.getFieldInfos().fieldInfo(field);
if (info == null) {
return DocValues.emptySortedSet();
} else if (info.getDocValuesType() != DocValuesType.NONE) {
throw new IllegalStateException("Type mismatch: " + field + " was indexed as " + info.getDocValuesType());
} else if (info.getIndexOptions() == IndexOptions.NONE) {
return DocValues.emptySortedSet();
}
// ok we need to uninvert. check if we can optimize a bit.
Terms terms = reader.terms(field);
if (terms == null) {
return DocValues.emptySortedSet();
} else {
// if #postings = #docswithfield we know that the field is "single valued enough".
// it's possible the same term might appear twice in the same document, but SORTED_SET discards frequency.
// it's still ok with filtering (which we limit to numerics), it just means precisionStep = Inf
long numPostings = terms.getSumDocFreq();
if (numPostings != -1 && numPostings == terms.getDocCount()) {
return DocValues.singleton(getTermsIndex(reader, field));
}
}
DocTermOrds dto = (DocTermOrds) caches.get(DocTermOrds.class).get(reader, new CacheKey(field, prefix));
return dto.iterator(reader);
}
static final class DocTermOrdsCache extends Cache {
DocTermOrdsCache(FieldCacheImpl wrapper) {
super(wrapper);
}
@Override
protected Accountable createValue(LeafReader reader, CacheKey key)
throws IOException {
BytesRef prefix = (BytesRef) key.custom;
return new DocTermOrds(reader, null, key.field, prefix);
}
}
}