/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.search.facet;

import java.io.Closeable;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicLong;

import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRefBuilder;
import org.apache.lucene.util.FixedBitSet;
import org.apache.solr.common.SolrException;
import org.apache.solr.index.SlowCompositeReaderWrapper;
import org.apache.solr.schema.FieldType;
import org.apache.solr.schema.TrieField;
import org.apache.solr.search.BitDocSet;
import org.apache.solr.search.DocSet;
import org.apache.solr.search.SolrCache;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.search.facet.SlotAcc.CountSlotAcc;
import org.apache.solr.search.facet.SlotAcc.SlotContext;
import org.apache.solr.search.facet.SlotAcc.SweepCountAccStruct;
import org.apache.solr.search.facet.SlotAcc.SweepingCountSlotAcc;
import org.apache.solr.search.facet.SweepCountAware.SegCountGlobal;
import org.apache.solr.search.facet.SweepDocIterator.SweepIteratorAndCounts;
import org.apache.solr.uninverting.DocTermOrds;
import org.apache.solr.util.TestInjection;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Final form of the un-inverted field:
 * each document points to a list of term numbers that are contained in that document.
 *
 * <p>
 * Term numbers are in sorted order, and are encoded as variable-length deltas from the
 * previous term number.  Encoded deltas are offset by {@code TNUM_OFFSET} (2), so the
 * encoded values 0 and 1 are reserved; an encoded 0 signals the end of the termNumber list.
 *
 * <p>
 * There is a single int[maxDoc()] which either contains a pointer into a byte[] for
 * the termNumber lists, or directly contains the termNumber list if it fits in the 4
 * bytes of an integer.  If the high bit of the integer is set, the remaining bits
 * are a pointer into a byte[] where the termNumber list starts.
 *
 * <p>
 * There are actually 256 byte arrays, to compensate for the fact that the pointers
 * into the byte arrays are limited in size.  The correct byte array for a document
 * is a function of its id.
 *
 * <p>
 * To save space and speed up faceting, any term that matches enough documents will
 * not be un-inverted: it will be skipped while building the un-inverted field structure,
 * and will use a set intersection method during faceting.
 *
 * <p>
 * To further save memory, the terms (the actual string values) are not all stored in
 * memory, but a TermIndex is used to convert term numbers to term values only
 * for the terms needed after faceting has completed.  Only every 128th term value
 * is stored, along with its corresponding term number, and this is used as an
 * index to find the closest term and iterate until the desired number is hit (very
 * much like Lucene's own internal term index).
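 *
 * <p>
 * For illustration (values hypothetical): a document containing term numbers 5, 10, and 11
 * would store the deltas 5, 5, and 1, each offset by {@code TNUM_OFFSET} to give the encoded
 * values 7, 7, and 3, followed by the 0 terminator.  Each encoded value here fits in a single
 * vInt byte, and the four bytes together fit in the document's int slot (with the high bit
 * clear), so no byte[] indirection is needed for this document.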
 */
public class UnInvertedField extends DocTermOrds {
  private static final int TNUM_OFFSET = 2;

  private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

  static class TopTerm {
    Query termQuery;
    BytesRef term;
    int termNum;

    long memSize() {
      return 8 +                // object header
          8 + 8 + term.length + // term
          4;                    // int termNum
    }
  }

  long memsz;
  final AtomicLong use = new AtomicLong(); // number of uses

  /** The number of documents holding each term: {@code maxTermCounts[termNum]} is the doc count for term {@code termNum}. */
  int[] maxTermCounts = new int[1024];

  /** termNum -> docIDs for big terms. */
  final Map<Integer,TopTerm> bigTerms = new LinkedHashMap<>();

  private SolrIndexSearcher.DocsEnumState deState;
  private final SolrIndexSearcher searcher;

  private static final UnInvertedField uifPlaceholder = new UnInvertedField();

  private UnInvertedField() { // Dummy for synchronization.
    super("fake", 0, 0); // cheapest initialization I can find.
    searcher = null;
  }

  /**
   * Called for each term in the field being uninverted.
   * For each term whose docFreq exceeds {@code maxTermDocFreq}, records its document count
   * in {@link #maxTermCounts} and stores the term in {@link #bigTerms}.
   * @param te positioned at the current term.
   * @param termNum the ID/pointer/ordinal of the current term. Monotonically increasing between calls.
   */
  @Override
  protected void visitTerm(TermsEnum te, int termNum) throws IOException {

    if (termNum >= maxTermCounts.length) {
      // Resize by doubling: for a very large number of unique terms, expanding
      // by 4K at a time would make the resulting GC dominate uninvert times.
      // The array is shrunk at the end of construction if the waste is material.
      int[] newMaxTermCounts = new int[ Math.min(Integer.MAX_VALUE-16, maxTermCounts.length*2) ];
      System.arraycopy(maxTermCounts, 0, newMaxTermCounts, 0, termNum);
      maxTermCounts = newMaxTermCounts;
    }

    final BytesRef term = te.term();

    if (te.docFreq() > maxTermDocFreq) {
      Term t = new Term(field, term); // this makes a deep copy of the term bytes
      TopTerm topTerm = new TopTerm();
      topTerm.term = t.bytes();
      topTerm.termNum = termNum;
      topTerm.termQuery = new TermQuery(t);

      bigTerms.put(topTerm.termNum, topTerm);

      if (deState == null) {
        deState = new SolrIndexSearcher.DocsEnumState();
        deState.fieldName = field;
        deState.liveDocs = searcher.getLiveDocsBits();
        deState.termsEnum = te; // TODO: check for MultiTermsEnum in SolrIndexSearcher could now fail?
        deState.postingsEnum = postingsEnum;
        deState.minSetSizeCached = maxTermDocFreq;
      }

      postingsEnum = deState.postingsEnum;
      DocSet set = searcher.getDocSet(deState);
      maxTermCounts[termNum] = set.size();
    }
  }

  @Override
  protected void setActualDocFreq(int termNum, int docFreq) {
    maxTermCounts[termNum] = docFreq;
  }

  public long memSize() {
    // can cache the mem size since it shouldn't change
    if (memsz != 0) return memsz;
    long sz = super.ramBytesUsed();
    sz += 8*8 + 32; // local fields
    sz += bigTerms.size() * 64;
    for (TopTerm tt : bigTerms.values()) {
      sz += tt.memSize();
    }
    if (maxTermCounts != null) {
      sz += maxTermCounts.length * 4;
    }
    memsz = sz;
    return sz;
  }

  public UnInvertedField(String field, SolrIndexSearcher searcher) throws IOException {
    super(field,
        // threshold, over which we use set intersections instead of counting
        // to (1) save memory, and (2) speed up faceting.
        // Add 2 for testing purposes so that there will always be some terms under
        // the threshold even when the index is very small.
        searcher.maxDoc()/20 + 2,
        DEFAULT_INDEX_INTERVAL_BITS);

    assert TestInjection.injectUIFOutOfMemoryError();

    final String prefix = TrieField.getMainValuePrefix(searcher.getSchema().getFieldType(field));
    this.searcher = searcher;
    try {
      // TODO: it's wasteful to create one of these each time,
      // but DocTermOrds will throw an exception if it thinks the field has doc values (which is faked by UnInvertingReader)
      LeafReader r = SlowCompositeReaderWrapper.wrap(searcher.getRawReader());
      uninvert(r, r.getLiveDocs(), prefix == null ? null : new BytesRef(prefix));
    } catch (IllegalStateException ise) {
      throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, ise);
    }
    if (tnums != null) {
      for (byte[] target : tnums) {
        if (target != null && target.length > (1<<24)*.9) {
          log.warn("Approaching too many values for UnInvertedField faceting on field '{}' : bucket size={}", field, target.length);
        }
      }
    }

    // free space if outrageously wasteful (tradeoff memory/cpu)
    if ((maxTermCounts.length - numTermsInField) > 1024) { // too much waste!
      int[] newMaxTermCounts = new int[numTermsInField];
      System.arraycopy(maxTermCounts, 0, newMaxTermCounts, 0, numTermsInField);
      maxTermCounts = newMaxTermCounts;
    }

    log.info("UnInverted multi-valued field {}", this);
    //System.out.println("CREATED: " + toString() + " ti.index=" + ti.index);
  }

  public int getNumTerms() {
    return numTermsInField;
  }

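  /**
   * Resolves the term numbers recorded for a document back to terms, using set membership
   * for {@link #bigTerms} and the packed term-number lists for everything else.
   *
   * <p>
   * A minimal sketch (hypothetical usage, from a method that may throw IOException, where
   * {@code uif} was obtained via {@link #getUnInvertedField} and {@code docId} is a
   * top-level document id):
   * <pre>{@code
   * try (UnInvertedField.DocToTerm d2t = uif.new DocToTerm()) {
   *   d2t.getBigTerms(docId, termNum -> log.info("big term #{}", termNum));
   *   d2t.getSmallTerms(docId, termNum -> log.info("small term #{}", termNum));
   * }
   * }</pre>
   */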
  public class DocToTerm implements Closeable {
    private final DocSet[] bigTermSets;
    private final int[] bigTermNums;
    private TermsEnum te;

    public DocToTerm() throws IOException {
      bigTermSets = new DocSet[bigTerms.size()];
      bigTermNums = new int[bigTerms.size()];
      int i = 0;
      for (TopTerm tt : bigTerms.values()) {
        bigTermSets[i] = searcher.getDocSet(tt.termQuery);
        bigTermNums[i] = tt.termNum;
        i++;
      }
    }

    public BytesRef lookupOrd(int ord) throws IOException {
      return getTermValue(getTermsEnum(), ord);
    }

    public TermsEnum getTermsEnum() throws IOException {
      if (te == null) {
        te = getOrdTermsEnum(searcher.getSlowAtomicReader());
      }
      return te;
    }

    public void getBigTerms(int doc, Callback target) throws IOException {
      if (bigTermSets != null) {
        for (int i=0; i<bigTermSets.length; i++) {
          if (bigTermSets[i].exists(doc)) {
            target.call(bigTermNums[i]);
          }
        }
      }
    }

    public void getSmallTerms(int doc, Callback target) {
      if (termInstances > 0) {
        int code = index[doc];

        if ((code & 0x80000000) != 0) {
          // high bit set: the lower bits are a position in one of the tnums byte arrays
          int pos = code & 0x7fffffff;
          int whichArray = (doc >>> 16) & 0xff;
          byte[] arr = tnums[whichArray];
          int tnum = 0;
          for (;;) {
            // decode one variable-length delta: 7 bits per byte, high bit set means "more bytes"
            int delta = 0;
            for (;;) {
              byte b = arr[pos++];
              delta = (delta << 7) | (b & 0x7f);
              if ((b & 0x80) == 0) break;
            }
            if (delta == 0) break; // an encoded 0 terminates the list
            tnum += delta - TNUM_OFFSET;
            target.call(tnum);
          }
        } else {
          // the term-number list is packed directly into the int, low byte first
          int tnum = 0;
          int delta = 0;
          for (;;) {
            delta = (delta << 7) | (code & 0x7f);
            if ((code & 0x80) == 0) {
              if (delta == 0) break;
              tnum += delta - TNUM_OFFSET;
              target.call(tnum);
              delta = 0;
            }
            code >>>= 8;
          }
        }
      }
    }

    @Override
    public void close() throws IOException {
      for (DocSet set : bigTermSets) {
        // set.decref(); // OFF-HEAP
      }
    }
  }

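  /** Receives each term number found for a document; see {@link DocToTerm}. */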
  public interface Callback {
    void call(int termNum);
  }
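  /**
   * Optimized full-range counting: computes a count for every term in the field with a
   * single sweep over the base doc set (and any other sweep accumulators), using set
   * intersections for {@link #bigTerms} and the packed term-number lists for the rest.
   */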
  private void getCounts(FacetFieldProcessorByArrayUIF processor) throws IOException {
    DocSet docs = processor.fcontext.base;
    int baseSize = docs.size();
    int maxDoc = searcher.maxDoc();

    // what about allBuckets?
    if (baseSize < processor.effectiveMincount) {
      return;
    }

    SweepCountAccStruct baseCountAccStruct = SweepingCountSlotAcc.baseStructOf(processor);
    final List<SweepCountAccStruct> others = SweepingCountSlotAcc.otherStructsOf(processor);

    final int[] index = this.index;

    boolean doNegative = baseSize > maxDoc >> 1 && termInstances > 0 && docs instanceof BitDocSet && baseCountAccStruct != null;

    if (doNegative) {
      FixedBitSet bs = ((BitDocSet) docs).getBits().clone();
      bs.flip(0, maxDoc);
      // TODO: when iterator across negative elements is available, use that
      // instead of creating a new bitset and inverting.
      docs = new BitDocSet(bs, maxDoc - baseSize);
      // simply negating will mean that we have deleted docs in the set.
      // that should be OK, as their entries in our table should be empty.
      baseCountAccStruct = new SweepCountAccStruct(baseCountAccStruct, docs);
    }
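
    // Illustration of the trick above (numbers hypothetical): with maxDoc = 1,000,000 and a
    // base set matching 900,000 docs, sweeping the 100,000-doc complement and flipping the
    // counts afterwards (see the doNegative block near the end of this method) visits roughly
    // a ninth of the term instances that a direct sweep of the base set would.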

    // For the biggest terms, do straight set intersections
    for (TopTerm tt : bigTerms.values()) {
      // TODO: counts could be deferred if sorting by index order
      final int termOrd = tt.termNum;
      Iterator<SweepCountAccStruct> othersIter = others.iterator();
      SweepCountAccStruct entry = baseCountAccStruct != null ? baseCountAccStruct : othersIter.next();
      for (;;) {
        entry.countAcc.incrementCount(termOrd, searcher.numDocs(tt.termQuery, entry.docSet));
        if (!othersIter.hasNext()) {
          break;
        }
        entry = othersIter.next();
      }
    }

    // TODO: we could short-circuit counting altogether for sorted faceting
    // where we already have enough terms from the bigTerms

    if (termInstances > 0) {
      final SweepIteratorAndCounts iterAndCounts = SweepDocIterator.newInstance(baseCountAccStruct, others);
      final SweepDocIterator iter = iterAndCounts.iter;
      final SegCountGlobal counts = new SegCountGlobal(iterAndCounts.countAccs);
      while (iter.hasNext()) {
        int doc = iter.nextDoc();
        int maxIdx = iter.registerCounts(counts);
        int code = index[doc];

        if ((code & 0x80000000) != 0) {
          // high bit set: decode the term-number list from the tnums byte array
          int pos = code & 0x7fffffff;
          int whichArray = (doc >>> 16) & 0xff;
          byte[] arr = tnums[whichArray];
          int tnum = 0;
          for (;;) {
            int delta = 0;
            for (;;) {
              byte b = arr[pos++];
              delta = (delta << 7) | (b & 0x7f);
              if ((b & 0x80) == 0) break;
            }
            if (delta == 0) break;
            tnum += delta - TNUM_OFFSET;
            counts.incrementCount(tnum, 1, maxIdx);
          }
        } else {
          // term-number list is packed directly in the int
          int tnum = 0;
          int delta = 0;
          for (;;) {
            delta = (delta << 7) | (code & 0x7f);
            if ((code & 0x80) == 0) {
              if (delta == 0) break;
              tnum += delta - TNUM_OFFSET;
              counts.incrementCount(tnum, 1, maxIdx);
              delta = 0;
            }
            code >>>= 8;
          }
        }
      }
    }

    if (doNegative) {
      final CountSlotAcc baseCounts = processor.countAcc;
      for (int i=0; i<numTermsInField; i++) {
        // we counted the complement, so flip: newCount = maxTermCounts[i] - oldCount,
        // i.e. increment by (maxTermCounts[i] - 2*oldCount)
        baseCounts.incrementCount(i, maxTermCounts[i] - (int) baseCounts.getCount(i)*2);
      }
    }

    /*** TODO - future optimization to handle allBuckets
    if (processor.allBucketsSlot >= 0) {
      int all = 0;  // overflow potential
      for (int i=0; i<numTermsInField; i++) {
        all += counts.getCount(i);
      }
      counts.incrementCount(processor.allBucketsSlot, all);
    }
    ***/
  }
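  /**
   * Entry point for facet collection over this field.  Delegates to the optimized
   * {@link #getCounts} sweep when no per-doc accumulators are attached and the full term
   * range is being counted; otherwise falls back to {@link #collectDocsGeneric}.
   */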
  public void collectDocs(FacetFieldProcessorByArrayUIF processor) throws IOException {
    if (processor.collectAcc == null && processor.allBucketsAcc == null
        && processor.startTermIndex == 0 && processor.endTermIndex >= numTermsInField) {
      getCounts(processor);
      return;
    }

    collectDocsGeneric(processor);
  }

  // called from FacetFieldProcessorByArrayUIF
  // TODO: do a callback version that can be specialized!
  public void collectDocsGeneric(FacetFieldProcessorByArrayUIF processor) throws IOException {
    use.incrementAndGet();

    int startTermIndex = processor.startTermIndex;
    int endTermIndex = processor.endTermIndex;
    int nTerms = processor.nTerms;
    DocSet docs = processor.fcontext.base;

    int uniqueTerms = 0;
    final CountSlotAcc countAcc = processor.countAcc;
    final SweepCountAccStruct baseCountAccStruct = SweepingCountSlotAcc.baseStructOf(processor);
    final List<SweepCountAccStruct> others = SweepingCountSlotAcc.otherStructsOf(processor);

    for (TopTerm tt : bigTerms.values()) {
      if (tt.termNum >= startTermIndex && tt.termNum < endTermIndex) {
        // handle the biggest terms
        DocSet termSet = searcher.getDocSet(tt.termQuery);
        DocSet intersection = termSet.intersection(docs);
        int collected = processor.collectFirstPhase(intersection, tt.termNum - startTermIndex,
            slotNum -> new SlotContext(tt.termQuery));
        final int termOrd = tt.termNum - startTermIndex;
        countAcc.incrementCount(termOrd, collected);
        for (SweepCountAccStruct entry : others) {
          entry.countAcc.incrementCount(termOrd, termSet.intersectionSize(entry.docSet));
        }
        if (collected > 0) {
          uniqueTerms++;
        }
      }
    }

    if (termInstances > 0) {

      final List<LeafReaderContext> leaves = searcher.getIndexReader().leaves();
      final Iterator<LeafReaderContext> ctxIt = leaves.iterator();
      LeafReaderContext ctx = null;
      int segBase = 0;
      int segMax;
      int adjustedMax = 0;

      // TODO: handle facet.prefix here!!!

      SweepIteratorAndCounts sweepIterAndCounts = SweepDocIterator.newInstance(baseCountAccStruct, others);
      final SweepDocIterator iter = sweepIterAndCounts.iter;
      final CountSlotAcc[] countAccs = sweepIterAndCounts.countAccs;
      final SegCountGlobal counts = new SegCountGlobal(countAccs);
      while (iter.hasNext()) {
        int doc = iter.nextDoc();
        int maxIdx = iter.registerCounts(counts);
        boolean collectBase = iter.collectBase();

        if (doc >= adjustedMax) {
          // advance to the segment containing this doc
          do {
            ctx = ctxIt.next();
            if (ctx == null) {
              // should be impossible
              throw new RuntimeException("INTERNAL FACET ERROR");
            }
            segBase = ctx.docBase;
            segMax = ctx.reader().maxDoc();
            adjustedMax = segBase + segMax;
          } while (doc >= adjustedMax);
          assert doc >= ctx.docBase;
          processor.setNextReaderFirstPhase(ctx);
        }
        int segDoc = doc - segBase;

        int code = index[doc];

        if ((code & 0x80000000) != 0) {
          // high bit set: decode the term-number list from the tnums byte array
          int pos = code & 0x7fffffff;
          int whichArray = (doc >>> 16) & 0xff;
          byte[] arr = tnums[whichArray];
          int tnum = 0;
          for (;;) {
            int delta = 0;
            for (;;) {
              byte b = arr[pos++];
              delta = (delta << 7) | (b & 0x7f);
              if ((b & 0x80) == 0) break;
            }
            if (delta == 0) break;
            tnum += delta - TNUM_OFFSET;
            int arrIdx = tnum - startTermIndex;
            if (arrIdx < 0) continue;
            if (arrIdx >= nTerms) break;
            counts.incrementCount(arrIdx, 1, maxIdx);
            if (collectBase) {
              processor.collectFirstPhase(segDoc, arrIdx, processor.slotContext);
            }
          }
        } else {
          // term-number list is packed directly in the int
          int tnum = 0;
          int delta = 0;
          for (;;) {
            delta = (delta << 7) | (code & 0x7f);
            if ((code & 0x80) == 0) {
              if (delta == 0) break;
              tnum += delta - TNUM_OFFSET;
              int arrIdx = tnum - startTermIndex;
              if (arrIdx >= 0) {
                if (arrIdx >= nTerms) break;
                counts.incrementCount(arrIdx, 1, maxIdx);
                if (collectBase) {
                  processor.collectFirstPhase(segDoc, arrIdx, processor.slotContext);
                }
              }
              delta = 0;
            }
            code >>>= 8;
          }
        }
      }
    }
  }

  String getReadableValue(BytesRef termval, FieldType ft, CharsRefBuilder charsRef) {
    return ft.indexedToReadable(termval, charsRef).toString();
  }

  /** may return a reused BytesRef */
  BytesRef getTermValue(TermsEnum te, int termNum) throws IOException {
    //System.out.println("getTermValue termNum=" + termNum + " this=" + this + " numTerms=" + numTermsInField);
    if (bigTerms.size() > 0) {
      // see if the term is one of our big terms.
      TopTerm tt = bigTerms.get(termNum);
      if (tt != null) {
        //System.out.println("  return big " + tt.term);
        return tt.term;
      }
    }

    return lookupTerm(te, termNum);
  }

  @Override
  public String toString() {
    final long indexSize = indexedTermsArray == null ? 0 : (8+8+8+8+(indexedTermsArray.length<<3)+sizeOfIndexedStrings); // assume 8 byte references?
    return "{field=" + field
        + ",memSize=" + memSize()
        + ",tindexSize=" + indexSize
        + ",time=" + total_time
        + ",phase1=" + phase1_time
        + ",nTerms=" + numTermsInField
        + ",bigTerms=" + bigTerms.size()
        + ",termInstances=" + termInstances
        + ",uses=" + use.get()
        + "}";
  }

  //////////////////////////////////////////////////////////////////
  //////////////////////////// caching /////////////////////////////
  //////////////////////////////////////////////////////////////////

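  /**
   * Returns the (possibly cached) UnInvertedField for the given field, creating it on
   * demand.  When the searcher has a field-value cache configured, instances are cached
   * and shared per field.
   *
   * <p>A minimal sketch (hypothetical caller, from code that already holds a
   * {@code SolrIndexSearcher}):
   * <pre>{@code
   * UnInvertedField uif = UnInvertedField.getUnInvertedField("category", searcher);
   * int nTerms = uif.getNumTerms();
   * }</pre>
   */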
  @SuppressWarnings("unchecked")
  public static UnInvertedField getUnInvertedField(String field, SolrIndexSearcher searcher) throws IOException {
    SolrCache<String, UnInvertedField> cache = searcher.getFieldValueCache();
    if (cache == null) {
      return new UnInvertedField(field, searcher);
    }
    return cache.computeIfAbsent(field, f -> new UnInvertedField(f, searcher));
  }

  /** Returns null if not already populated. */
  @SuppressWarnings({"rawtypes", "unchecked"})
  public static UnInvertedField checkUnInvertedField(String field, SolrIndexSearcher searcher) throws IOException {
    SolrCache cache = searcher.getFieldValueCache();
    if (cache == null) {
      return null;
    }
    Object uif = cache.get(field); // cache is already synchronized, so no extra sync needed
    // the placeholder is an implementation detail: keep it hidden and return null if that's what we got
    return uif == uifPlaceholder || !(uif instanceof UnInvertedField) ? null : (UnInvertedField) uif;
  }

}