/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.search.facet;
import java.io.Closeable;
import java.io.IOException;
import java.util.Iterator;
import java.util.List;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.function.IntFunction;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.MultiPostingsEnum;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.StringHelper;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.schema.TrieField;
import org.apache.solr.search.DocSet;
import org.apache.solr.search.HashDocSet;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.search.SortedIntDocSet;
import org.apache.solr.search.facet.SlotAcc.SlotContext;

/**
 * Enumerates indexed terms in order, in a streaming fashion.
 * Streaming is possible because nothing needs to be accumulated: buckets are produced one at a
 * time in index (term) order. Typically requested via the {@code method:"stream"} facet option.
 */
class FacetFieldProcessorByEnumTermsStream extends FacetFieldProcessor implements Closeable {
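// Offset/limit bookkeeping: buckets still to be skipped (the facet offset) and buckets already returned.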
long bucketsToSkip;
long bucketsReturned;
boolean closed;
boolean countOnly;
boolean hasSubFacets; // true if there are subfacets
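// Minimum docFreq a term must have before the filterCache is used for it (computed in setup() from freq.cacheDf).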
int minDfFilterCache;
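// docs is the base domain being faceted; fastForRandomSet is a lazily built random-access view of it.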
DocSet docs;
DocSet fastForRandomSet;
TermsEnum termsEnum = null;
SolrIndexSearcher.DocsEnumState deState = null;
PostingsEnum postingsEnum;
BytesRef startTermBytes;
BytesRef term;
AtomicBoolean shardHasMoreBuckets = new AtomicBoolean(false); // set after streaming has finished
// at any point in processing where we need a SlotContext, all we care about is the current 'term'
private IntFunction<SlotContext> slotContext = (slotNum) -> {
assert null != term;
return new SlotAcc.SlotContext(new TermQuery(new Term(sf.getName(), term)));
};
LeafReaderContext[] leaves;
FacetFieldProcessorByEnumTermsStream(FacetContext fcontext, FacetField freq, SchemaField sf) {
super(fcontext, freq, sf);
}
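
// Registered as a close hook on the query context in process(), so cleanup happens even if the
// client disconnects mid-stream; the closed flag makes repeated calls harmless.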
@Override
public void close() throws IOException {
if (!closed) {
closed = true;
// fcontext.base.decref(); // OFF-HEAP
}
}
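
// Builds the streaming response: registers this processor as a close hook, runs setup(),
// and installs a lazy bucket iterator (plus a "more" flag when this is a shard request).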
@Override
@SuppressWarnings({"rawtypes"})
public void process() throws IOException {
super.process();
// We need to keep the fcontext open after processing is done (since we will be streaming in the response writer).
// But if the connection is broken, we want to clean up.
// fcontext.base.incref(); // OFF-HEAP
fcontext.qcontext.addCloseHook(this);
setup();
response = new SimpleOrderedMap<>();
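// Buckets are exposed as a lazy Iterator; the response writer pulls them one at a time via
// nextBucket(), so nothing is accumulated and the facet output is streamed.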
response.add("buckets", new Iterator() {
boolean retrieveNext = true;
Object val;
@Override
public boolean hasNext() {
if (retrieveNext) {
val = nextBucket();
}
retrieveNext = false;
return val != null;
}
@Override
public Object next() {
if (retrieveNext) {
val = nextBucket();
}
retrieveNext = true;
if (val == null) {
// Last value, so clean up. In the case that we are doing streaming facets within streaming facets,
// the number of close hooks could grow very large, so we want to remove ourselves.
boolean removed = fcontext.qcontext.removeCloseHook(FacetFieldProcessorByEnumTermsStream.this);
assert removed;
try {
close();
} catch (IOException e) {
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Error during facet streaming close", e);
}
}
return val;
}
@Override
public void remove() {
throw new UnsupportedOperationException();
}
});
if (fcontext.isShard()) {
response.add("more", shardHasMoreBuckets); // lazily evaluated
}
}
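
// One-time initialization: decide count-only vs. full stats, compute the filterCache docFreq
// threshold, position the TermsEnum on the first candidate term, and capture the leaf readers.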
private void setup() throws IOException {
countOnly = freq.facetStats.size() == 0 || freq.facetStats.values().iterator().next() instanceof CountAgg;
hasSubFacets = freq.subFacets.size() > 0;
bucketsToSkip = freq.offset;
createAccs(-1, 1);
// Minimum term docFreq in order to use the filterCache for that term.
if (freq.cacheDf == -1) { // -1 means never cache
minDfFilterCache = Integer.MAX_VALUE;
} else if (freq.cacheDf == 0) { // default; compute as fraction of maxDoc
minDfFilterCache = Math.max(fcontext.searcher.maxDoc() >> 4, 3); // (minimum of 3 is for test coverage purposes)
} else {
minDfFilterCache = freq.cacheDf;
}
docs = fcontext.base;
fastForRandomSet = null;
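// Determine where the TermsEnum should start: at the requested prefix if one was given, or, for
// Trie-encoded numeric fields, at the prefix of the full-precision values so that
// lower-precision terms are skipped.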
if (freq.prefix != null) {
String indexedPrefix = sf.getType().toInternal(freq.prefix);
startTermBytes = new BytesRef(indexedPrefix);
} else if (sf.getType().getNumberType() != null) {
String triePrefix = TrieField.getMainValuePrefix(sf.getType());
if (triePrefix != null) {
startTermBytes = new BytesRef(triePrefix);
}
}
Terms terms = fcontext.searcher.getSlowAtomicReader().terms(sf.getName());
termsEnum = null;
deState = null;
term = null;
if (terms != null) {
termsEnum = terms.iterator();
// TODO: OPT: if seek(ord) is supported for this termsEnum, then we could use it for
// facet.offset when sorting by index order.
if (startTermBytes != null) {
if (termsEnum.seekCeil(startTermBytes) == TermsEnum.SeekStatus.END) {
termsEnum = null;
} else {
term = termsEnum.term();
}
} else {
// position termsEnum on first term
term = termsEnum.next();
}
}
List<LeafReaderContext> leafList = fcontext.searcher.getTopReaderContext().leaves();
leaves = leafList.toArray(new LeafReaderContext[leafList.size()]);
}
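
// Thin wrapper that surfaces any failure from the streaming loop as a SolrException, since
// buckets are produced lazily while the response is being written.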
private SimpleOrderedMap<Object> nextBucket() {
try {
return _nextBucket();
} catch (Exception e) {
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Error during facet streaming", e);
}
}
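
// Main streaming loop: advances the TermsEnum until a term satisfies the prefix, mincount and
// offset constraints, then builds and returns its bucket; returns null when the limit is hit
// or the terms are exhausted.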
private SimpleOrderedMap<Object> _nextBucket() throws IOException {
DocSet termSet = null;
try {
while (term != null) {
if (startTermBytes != null && !StringHelper.startsWith(term, startTermBytes)) {
break;
}
int df = termsEnum.docFreq();
if (df < effectiveMincount) {
term = termsEnum.next();
continue;
}
if (termSet != null) {
// termSet.decref(); // OFF-HEAP
termSet = null;
}
int c = 0;
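// Two ways to count this term's docs: the filterCache/DocSet path (required when sub-facets
// need the DocSet, and worthwhile for high-docFreq terms), or walking the postings directly.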
if (hasSubFacets || df >= minDfFilterCache) {
// use the filter cache
if (deState == null) {
deState = new SolrIndexSearcher.DocsEnumState();
deState.fieldName = sf.getName();
deState.liveDocs = fcontext.searcher.getLiveDocsBits();
deState.termsEnum = termsEnum;
deState.postingsEnum = postingsEnum;
deState.minSetSizeCached = minDfFilterCache;
}
if (hasSubFacets || !countOnly) {
DocSet termsAll = fcontext.searcher.getDocSet(deState);
termSet = docs.intersection(termsAll);
// termsAll.decref(); // OFF-HEAP
c = termSet.size();
} else {
c = fcontext.searcher.numDocs(docs, deState);
}
postingsEnum = deState.postingsEnum;
resetStats();
if (!countOnly) {
collect(termSet, 0, slotContext);
}
} else {
// We don't need the docset here (meaning no sub-facets).
// If we're not countOnly, the other stats are still collected per matching doc below.
resetStats();
// lazy convert to fastForRandomSet
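// (postings are walked in doc-id order, but membership tests against the base set need random
// access, so a SortedIntDocSet is converted to a HashDocSet once and then reused)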
if (fastForRandomSet == null) {
fastForRandomSet = docs;
if (docs instanceof SortedIntDocSet) { // OFF-HEAP todo: also check for native version
SortedIntDocSet sset = (SortedIntDocSet) docs;
fastForRandomSet = new HashDocSet(sset.getDocs(), 0, sset.size());
}
}
// iterate over TermDocs to calculate the intersection
postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE);
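// A MultiPostingsEnum spans multiple segments; each sub-enum yields segment-local doc ids, so
// sub.slice.start is added as the doc base to map them to top-level ids.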
if (postingsEnum instanceof MultiPostingsEnum) {
MultiPostingsEnum.EnumWithSlice[] subs = ((MultiPostingsEnum) postingsEnum).getSubs();
int numSubs = ((MultiPostingsEnum) postingsEnum).getNumSubs();
for (int subindex = 0; subindex < numSubs; subindex++) {
MultiPostingsEnum.EnumWithSlice sub = subs[subindex];
if (sub.postingsEnum == null) continue;
int base = sub.slice.start;
int docid;
if (countOnly) {
while ((docid = sub.postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
if (fastForRandomSet.exists(docid + base)) c++;
}
} else {
setNextReader(leaves[sub.slice.readerIndex]);
while ((docid = sub.postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
if (fastForRandomSet.exists(docid + base)) {
c++;
collect(docid, 0, slotContext);
}
}
}
}
} else {
int docid;
if (countOnly) {
while ((docid = postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
if (fastForRandomSet.exists(docid)) c++;
}
} else {
setNextReader(leaves[0]);
while ((docid = postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
if (fastForRandomSet.exists(docid)) {
c++;
collect(docid, 0, slotContext);
}
}
}
}
}
if (c < effectiveMincount) {
term = termsEnum.next();
continue;
}
// handle offset and limit
if (bucketsToSkip > 0) {
bucketsToSkip--;
term = termsEnum.next();
continue;
}
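// Once the limit is exceeded, stop streaming; on a shard request, the lazily evaluated "more"
// entry added in process() records that this shard still has buckets left.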
if (freq.limit >= 0 && ++bucketsReturned > freq.limit) {
shardHasMoreBuckets.set(true);
return null;
}
// set count in case other stats depend on it
countAcc.incrementCount(0, c);
// OK, we have a good bucket to return... first get bucket value before moving to next term
Object bucketVal = sf.getType().toObject(sf, term);
TermQuery bucketQuery = hasSubFacets ? new TermQuery(new Term(freq.field, term)) : null;
term = termsEnum.next();
SimpleOrderedMap<Object> bucket = new SimpleOrderedMap<>();
bucket.add("val", bucketVal);
addStats(bucket, 0);
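// Sub-facets are computed over the documents matching this term (the termSet built above).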
if (hasSubFacets) {
processSubs(bucket, bucketQuery, termSet, false, null);
}
// TODO... termSet needs to stick around for streaming sub-facets?
return bucket;
}
} finally {
if (termSet != null) {
// termSet.decref(); // OFF-HEAP
termSet = null;
}
}
// end of the iteration
return null;
}
}