/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.search.facet;
import java.io.Closeable;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicLong;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRefBuilder;
import org.apache.lucene.util.FixedBitSet;
import org.apache.solr.common.SolrException;
import org.apache.solr.index.SlowCompositeReaderWrapper;
import org.apache.solr.schema.FieldType;
import org.apache.solr.schema.TrieField;
import org.apache.solr.search.BitDocSet;
import org.apache.solr.search.DocSet;
import org.apache.solr.search.SolrCache;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.search.facet.SweepCountAware.SegCountGlobal;
import org.apache.solr.search.facet.SlotAcc.CountSlotAcc;
import org.apache.solr.search.facet.SlotAcc.SlotContext;
import org.apache.solr.search.facet.SlotAcc.SweepCountAccStruct;
import org.apache.solr.search.facet.SlotAcc.SweepingCountSlotAcc;
import org.apache.solr.search.facet.SweepDocIterator.SweepIteratorAndCounts;
import org.apache.solr.uninverting.DocTermOrds;
import org.apache.solr.util.TestInjection;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
*
* Final form of the un-inverted field:
* Each document points to a list of term numbers that are contained in that document.
*
* Term numbers are in sorted order, and are encoded as variable-length deltas from the
* previous term number. Real term numbers start at 2 since 0 and 1 are reserved. A
* term number of 0 signals the end of the termNumber list.
*
* There is a single int[maxDoc()] which either contains a pointer into a byte[] for
* the termNumber lists, or directly contains the termNumber list if it fits in the 4
* bytes of an integer. If the first byte in the integer is 1, the next 3 bytes
* are a pointer into a byte[] where the termNumber list starts.
*
* There are actually 256 byte arrays, to compensate for the fact that the pointers
* into the byte arrays are only 3 bytes long. The correct byte array for a document
* is a function of its id.
*
* To save space and speed up faceting, any term that matches enough documents will
* not be un-inverted... it will be skipped while building the un-inverted field structure,
* and will use a set intersection method during faceting.
*
* To further save memory, the terms (the actual string values) are not all stored in
* memory, but a TermIndex is used to convert term numbers to term values only
* for the terms needed after faceting has completed. Only every 128th term value
* is stored, along with its corresponding term number, and this is used as an
* index to find the closest term and iterate until the desired number is hit (very
* much like Lucene's own internal term index).
*
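* As an illustrative sketch (hypothetical values, consistent with the decode loops below):
* a document containing term numbers 3 and 10 stores each gap plus {@code TNUM_OFFSET} (2),
* one 7-bit group per byte with the high bit as a continuation flag, ending with a 0:
* <pre>
*   stored bytes: 5, 9, 0      i.e. (3-0)+2, (10-3)+2, terminator
*   decoding:     tnum += delta - TNUM_OFFSET   yields 3, 10
* </pre>
*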
*/
public class UnInvertedField extends DocTermOrds {
private static final int TNUM_OFFSET = 2;
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
static class TopTerm {
Query termQuery;
BytesRef term;
int termNum;
long memSize() {
return 8 + // obj header
8 + 8 +term.length + //term
4; // int
}
}
long memsz;
final AtomicLong use = new AtomicLong(); // number of uses
/** The number of documents containing each term: {@code maxTermCounts[termNum]} is the doc count for term {@code termNum}. */
int[] maxTermCounts = new int[1024];
/** termNum -> {@link TopTerm} for "big" terms whose docFreq exceeds the uninvert threshold. */
final Map<Integer,TopTerm> bigTerms = new LinkedHashMap<>();
private SolrIndexSearcher.DocsEnumState deState;
private final SolrIndexSearcher searcher;
private static final UnInvertedField uifPlaceholder = new UnInvertedField();
private UnInvertedField() { // Dummy for synchronization.
super("fake", 0, 0); // cheapest initialization I can find.
searcher = null;
}
/**
* Called for each term in the field being uninverted.
* Collects the document count into {@link #maxTermCounts} for each big term, and stores those terms in {@link #bigTerms}.
* @param te positioned at the current term.
* @param termNum the ID/pointer/ordinal of the current term. Monotonically increasing between calls.
*/
@Override
protected void visitTerm(TermsEnum te, int termNum) throws IOException {
if (termNum >= maxTermCounts.length) {
// resize by doubling - for a very large number of unique terms, expanding by 4K at a time
// (and the resulting GC) would dominate uninvert times. The array is trimmed at the end if the waste is material.
int[] newMaxTermCounts = new int[ Math.min(Integer.MAX_VALUE-16, maxTermCounts.length*2) ];
System.arraycopy(maxTermCounts, 0, newMaxTermCounts, 0, termNum);
maxTermCounts = newMaxTermCounts;
}
final BytesRef term = te.term();
if (te.docFreq() > maxTermDocFreq) {
Term t = new Term(field, term); // this makes a deep copy of the term bytes
TopTerm topTerm = new TopTerm();
topTerm.term = t.bytes();
topTerm.termNum = termNum;
topTerm.termQuery = new TermQuery(t);
bigTerms.put(topTerm.termNum, topTerm);
if (deState == null) {
deState = new SolrIndexSearcher.DocsEnumState();
deState.fieldName = field;
deState.liveDocs = searcher.getLiveDocsBits();
deState.termsEnum = te; // TODO: check for MultiTermsEnum in SolrIndexSearcher could now fail?
deState.postingsEnum = postingsEnum;
deState.minSetSizeCached = maxTermDocFreq;
}
postingsEnum = deState.postingsEnum;
DocSet set = searcher.getDocSet(deState);
maxTermCounts[termNum] = set.size();
}
}
@Override
protected void setActualDocFreq(int termNum, int docFreq) {
maxTermCounts[termNum] = docFreq;
}
public long memSize() {
// can cache the mem size since it shouldn't change
if (memsz!=0) return memsz;
long sz = super.ramBytesUsed();
sz += 8*8 + 32; // local fields
sz += bigTerms.size() * 64;
for (TopTerm tt : bigTerms.values()) {
sz += tt.memSize();
}
if (maxTermCounts != null)
sz += maxTermCounts.length * 4;
memsz = sz;
return sz;
}
public UnInvertedField(String field, SolrIndexSearcher searcher) throws IOException {
super(field,
// threshold, over which we use set intersections instead of counting
// to (1) save memory, and (2) speed up faceting.
// Add 2 for testing purposes so that there will always be some terms under
// the threshold even when the index is very small.
searcher.maxDoc()/20 + 2,
DEFAULT_INDEX_INTERVAL_BITS);
assert TestInjection.injectUIFOutOfMemoryError();
final String prefix = TrieField.getMainValuePrefix(searcher.getSchema().getFieldType(field));
this.searcher = searcher;
try {
// TODO: it's wasteful to create one of these each time
// but DocTermOrds will throw an exception if it thinks the field has doc values (which is faked by UnInvertingReader)
LeafReader r = SlowCompositeReaderWrapper.wrap(searcher.getRawReader());
uninvert(r, r.getLiveDocs(), prefix == null ? null : new BytesRef(prefix));
} catch (IllegalStateException ise) {
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, ise);
}
if (tnums != null) {
for(byte[] target : tnums) {
if (target != null && target.length > (1<<24)*.9) {
log.warn("Approaching too many values for UnInvertedField faceting on field '{}' : bucket size={}", field, target.length);
}
}
}
// free space if outrageously wasteful (tradeoff memory/cpu)
if ((maxTermCounts.length - numTermsInField) > 1024) { // too much waste!
int[] newMaxTermCounts = new int[numTermsInField];
System.arraycopy(maxTermCounts, 0, newMaxTermCounts, 0, numTermsInField);
maxTermCounts = newMaxTermCounts;
}
log.info("UnInverted multi-valued field {}", this);
//System.out.println("CREATED: " + toString() + " ti.index=" + ti.index);
}
public int getNumTerms() {
return numTermsInField;
}
public class DocToTerm implements Closeable {
private final DocSet[] bigTermSets;
private final int[] bigTermNums;
private TermsEnum te;
public DocToTerm() throws IOException {
bigTermSets = new DocSet[bigTerms.size()];
bigTermNums = new int[bigTerms.size()];
int i=0;
for (TopTerm tt : bigTerms.values()) {
bigTermSets[i] = searcher.getDocSet(tt.termQuery);
bigTermNums[i] = tt.termNum;
i++;
}
}
public BytesRef lookupOrd(int ord) throws IOException {
return getTermValue( getTermsEnum() , ord );
}
public TermsEnum getTermsEnum() throws IOException {
if (te == null) {
te = getOrdTermsEnum(searcher.getSlowAtomicReader());
}
return te;
}
public void getBigTerms(int doc, Callback target) throws IOException {
if (bigTermSets != null) {
for (int i=0; i<bigTermSets.length; i++) {
if (bigTermSets[i].exists(doc)) {
target.call( bigTermNums[i] );
}
}
}
}
public void getSmallTerms(int doc, Callback target) {
if (termInstances > 0) {
int code = index[doc];
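// If the high bit is set, the low 31 bits are a byte offset into one of the tnums
// arrays (selected by the doc id); otherwise the delta-encoded term-number list is
// packed directly into the int, one 7-bit group per byte.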
if ((code & 0x80000000)!=0) {
int pos = code & 0x7fffffff;
int whichArray = (doc >>> 16) & 0xff;
byte[] arr = tnums[whichArray];
int tnum = 0;
for(;;) {
int delta = 0;
for(;;) {
byte b = arr[pos++];
delta = (delta << 7) | (b & 0x7f);
if ((b & 0x80) == 0) break;
}
if (delta == 0) break;
tnum += delta - TNUM_OFFSET;
target.call(tnum);
}
} else {
int tnum = 0;
int delta = 0;
for (;;) {
delta = (delta << 7) | (code & 0x7f);
if ((code & 0x80)==0) {
if (delta==0) break;
tnum += delta - TNUM_OFFSET;
target.call(tnum);
delta = 0;
}
code >>>= 8;
}
}
}
}
@Override
public void close() throws IOException {
for (DocSet set : bigTermSets) {
// set.decref(); // OFF-HEAP
}
}
}
public interface Callback {
public void call(int termNum);
}
private void getCounts(FacetFieldProcessorByArrayUIF processor) throws IOException {
DocSet docs = processor.fcontext.base;
int baseSize = docs.size();
int maxDoc = searcher.maxDoc();
// what about allBuckets?
if (baseSize < processor.effectiveMincount) {
return;
}
SweepCountAccStruct baseCountAccStruct = SweepingCountSlotAcc.baseStructOf(processor);
final List<SweepCountAccStruct> others = SweepingCountSlotAcc.otherStructsOf(processor);
final int[] index = this.index;
boolean doNegative = baseSize > maxDoc >> 1 && termInstances > 0 && docs instanceof BitDocSet && baseCountAccStruct != null;
if (doNegative) {
FixedBitSet bs = ((BitDocSet) docs).getBits().clone();
bs.flip(0, maxDoc);
// TODO: when iterator across negative elements is available, use that
// instead of creating a new bitset and inverting.
docs = new BitDocSet(bs, maxDoc - baseSize);
// simply negating will mean that we have deleted docs in the set.
// that should be OK, as their entries in our table should be empty.
baseCountAccStruct = new SweepCountAccStruct(baseCountAccStruct, docs);
}
// For the biggest terms, do straight set intersections
for (TopTerm tt : bigTerms.values()) {
// TODO: counts could be deferred if sorting by index order
final int termOrd = tt.termNum;
Iterator<SweepCountAccStruct> othersIter = others.iterator();
SweepCountAccStruct entry = baseCountAccStruct != null ? baseCountAccStruct : othersIter.next();
for (;;) {
entry.countAcc.incrementCount(termOrd, searcher.numDocs(tt.termQuery, entry.docSet));
if (!othersIter.hasNext()) {
break;
}
entry = othersIter.next();
}
}
// TODO: we could short-circuit counting altogether for sorted faceting
// where we already have enough terms from the bigTerms
if (termInstances > 0) {
final SweepIteratorAndCounts iterAndCounts = SweepDocIterator.newInstance(baseCountAccStruct, others);
final SweepDocIterator iter = iterAndCounts.iter;
final SegCountGlobal counts = new SegCountGlobal(iterAndCounts.countAccs);
while (iter.hasNext()) {
int doc = iter.nextDoc();
int maxIdx = iter.registerCounts(counts);
int code = index[doc];
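// pointer-vs-inline decode of the per-doc term-number list (see getSmallTerms for details)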
if ((code & 0x80000000)!=0) {
int pos = code & 0x7fffffff;
int whichArray = (doc >>> 16) & 0xff;
byte[] arr = tnums[whichArray];
int tnum = 0;
for (; ; ) {
int delta = 0;
for (; ; ) {
byte b = arr[pos++];
delta = (delta << 7) | (b & 0x7f);
if ((b & 0x80) == 0) break;
}
if (delta == 0) break;
tnum += delta - TNUM_OFFSET;
counts.incrementCount(tnum, 1, maxIdx);
}
} else {
int tnum = 0;
int delta = 0;
for (; ; ) {
delta = (delta << 7) | (code & 0x7f);
if ((code & 0x80) == 0) {
if (delta == 0) break;
tnum += delta - TNUM_OFFSET;
counts.incrementCount(tnum, 1, maxIdx);
delta = 0;
}
code >>>= 8;
}
}
}
}
if (doNegative) {
final CountSlotAcc baseCounts = processor.countAcc;
for (int i=0; i<numTermsInField; i++) {
// counts[i] = maxTermCounts[i] - counts[i];
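// baseCounts currently holds counts computed over the *negated* doc set; adding
// (maxTermCounts[i] - 2*count) converts each slot in place to maxTermCounts[i] - count,
// i.e. the count over the original base set.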
baseCounts.incrementCount(i, maxTermCounts[i] - (int) baseCounts.getCount(i)*2);
}
}
/*** TODO - future optimization to handle allBuckets
if (processor.allBucketsSlot >= 0) {
int all = 0; // overflow potential
for (int i=0; i<numTermsInField; i++) {
all += counts.getCount(i);
}
counts.incrementCount(processor.allBucketsSlot, all);
}
***/
}
public void collectDocs(FacetFieldProcessorByArrayUIF processor) throws IOException {
if (processor.collectAcc==null && processor.allBucketsAcc == null && processor.startTermIndex == 0 && processor.endTermIndex >= numTermsInField) {
getCounts(processor);
return;
}
collectDocsGeneric(processor);
}
// called from FieldFacetProcessor
// TODO: do a callback version that can be specialized!
public void collectDocsGeneric(FacetFieldProcessorByArrayUIF processor) throws IOException {
use.incrementAndGet();
int startTermIndex = processor.startTermIndex;
int endTermIndex = processor.endTermIndex;
int nTerms = processor.nTerms;
DocSet docs = processor.fcontext.base;
int uniqueTerms = 0;
final CountSlotAcc countAcc = processor.countAcc;
final SweepCountAccStruct baseCountAccStruct = SweepingCountSlotAcc.baseStructOf(processor);
final List<SweepCountAccStruct> others = SweepingCountSlotAcc.otherStructsOf(processor);
for (TopTerm tt : bigTerms.values()) {
if (tt.termNum >= startTermIndex && tt.termNum < endTermIndex) {
// handle the biggest terms
DocSet termSet = searcher.getDocSet(tt.termQuery);
DocSet intersection = termSet.intersection(docs);
int collected = processor.collectFirstPhase(intersection, tt.termNum - startTermIndex,
slotNum -> { return new SlotContext(tt.termQuery); });
final int termOrd = tt.termNum - startTermIndex;
countAcc.incrementCount(termOrd, collected);
for (SweepCountAccStruct entry : others) {
entry.countAcc.incrementCount(termOrd, termSet.intersectionSize(entry.docSet));
}
if (collected > 0) {
uniqueTerms++;
}
}
}
if (termInstances > 0) {
final List<LeafReaderContext> leaves = searcher.getIndexReader().leaves();
final Iterator<LeafReaderContext> ctxIt = leaves.iterator();
LeafReaderContext ctx = null;
int segBase = 0;
int segMax;
int adjustedMax = 0;
// TODO: handle facet.prefix here!!!
SweepIteratorAndCounts sweepIterAndCounts = SweepDocIterator.newInstance(baseCountAccStruct, others);
final SweepDocIterator iter = sweepIterAndCounts.iter;
final CountSlotAcc[] countAccs = sweepIterAndCounts.countAccs;
final SegCountGlobal counts = new SegCountGlobal(countAccs);
while (iter.hasNext()) {
int doc = iter.nextDoc();
int maxIdx = iter.registerCounts(counts);
boolean collectBase = iter.collectBase();
if (doc >= adjustedMax) {
do {
ctx = ctxIt.next();
if (ctx == null) {
// should be impossible
throw new RuntimeException("INTERNAL FACET ERROR");
}
segBase = ctx.docBase;
segMax = ctx.reader().maxDoc();
adjustedMax = segBase + segMax;
} while (doc >= adjustedMax);
assert doc >= ctx.docBase;
processor.setNextReaderFirstPhase(ctx);
}
int segDoc = doc - segBase;
int code = index[doc];
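// Same term-number decode as in getSmallTerms()/getCounts(), but restricted to the
// [startTermIndex, endTermIndex) window and also collecting per-slot accumulators.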
if ((code & 0x80000000)!=0) {
int pos = code & 0x7fffffff;
int whichArray = (doc >>> 16) & 0xff;
byte[] arr = tnums[whichArray];
int tnum = 0;
for(;;) {
int delta = 0;
for(;;) {
byte b = arr[pos++];
delta = (delta << 7) | (b & 0x7f);
if ((b & 0x80) == 0) break;
}
if (delta == 0) break;
tnum += delta - TNUM_OFFSET;
int arrIdx = tnum - startTermIndex;
if (arrIdx < 0) continue;
if (arrIdx >= nTerms) break;
counts.incrementCount(arrIdx, 1, maxIdx);
if (collectBase) {
processor.collectFirstPhase(segDoc, arrIdx, processor.slotContext);
}
}
} else {
int tnum = 0;
int delta = 0;
for (;;) {
delta = (delta << 7) | (code & 0x7f);
if ((code & 0x80)==0) {
if (delta==0) break;
tnum += delta - TNUM_OFFSET;
int arrIdx = tnum - startTermIndex;
if (arrIdx >= 0) {
if (arrIdx >= nTerms) break;
counts.incrementCount(arrIdx, 1, maxIdx);
if (collectBase) {
processor.collectFirstPhase(segDoc, arrIdx, processor.slotContext);
}
}
delta = 0;
}
code >>>= 8;
}
}
}
}
}
String getReadableValue(BytesRef termval, FieldType ft, CharsRefBuilder charsRef) {
return ft.indexedToReadable(termval, charsRef).toString();
}
/** may return a reused BytesRef */
BytesRef getTermValue(TermsEnum te, int termNum) throws IOException {
//System.out.println("getTermValue termNum=" + termNum + " this=" + this + " numTerms=" + numTermsInField);
if (bigTerms.size() > 0) {
// see if the term is one of our big terms.
TopTerm tt = bigTerms.get(termNum);
if (tt != null) {
//System.out.println(" return big " + tt.term);
return tt.term;
}
}
return lookupTerm(te, termNum);
}
@Override
public String toString() {
final long indexSize = indexedTermsArray == null ? 0 : (8+8+8+8+(indexedTermsArray.length<<3)+sizeOfIndexedStrings); // assume 8 byte references?
return "{field=" + field
+ ",memSize="+memSize()
+ ",tindexSize="+indexSize
+ ",time="+total_time
+ ",phase1="+phase1_time
+ ",nTerms="+numTermsInField
+ ",bigTerms="+bigTerms.size()
+ ",termInstances="+termInstances
+ ",uses="+use.get()
+ "}";
}
//////////////////////////////////////////////////////////////////
//////////////////////////// caching /////////////////////////////
//////////////////////////////////////////////////////////////////
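/**
 * Returns the {@link UnInvertedField} for {@code field}, creating it on demand. When the
 * searcher has a field value cache, instances are cached there and shared across requests;
 * otherwise a new instance is built for each call.
 */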
@SuppressWarnings("unchecked")
public static UnInvertedField getUnInvertedField(String field, SolrIndexSearcher searcher) throws IOException {
SolrCache<String, UnInvertedField> cache = searcher.getFieldValueCache();
if (cache == null) {
return new UnInvertedField(field, searcher);
}
return cache.computeIfAbsent(field, f -> new UnInvertedField(f, searcher));
}
// Returns null if not already populated
@SuppressWarnings({"rawtypes", "unchecked"})
public static UnInvertedField checkUnInvertedField(String field, SolrIndexSearcher searcher) throws IOException {
SolrCache cache = searcher.getFieldValueCache();
if (cache == null) {
return null;
}
Object uif = cache.get(field); // cache is already synchronized, so no extra sync needed
// placeholder is an implementation detail, keep it hidden and return null if that is what we got
return uif == uifPlaceholder || !(uif instanceof UnInvertedField) ? null : (UnInvertedField) uif;
}
}