blob: ed6fd9b93cf6d0e3bef7f7e658afcd138a127293 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.search.facet;
import java.io.IOException;
import java.text.ParseException;
import java.util.function.IntFunction;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.DocValuesType;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.MultiDocValues;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.SortedNumericDocValues;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.SimpleCollector;
import org.apache.lucene.util.BitUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LongValues;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.search.DocSetUtil;
import org.apache.solr.search.facet.SlotAcc.SlotContext;
/**
* Facets numbers into a hash table. The number is either a raw numeric DocValues value, or
* a term global ordinal integer.
* Limitations:
* <ul>
* <li>doesn't handle prefix, but could easily be added</li>
* <li>doesn't handle mincount==0 -- you're better off with an array alg</li>
* </ul>
*/
class FacetFieldProcessorByHashDV extends FacetFieldProcessor {
static int MAXIMUM_STARTING_TABLE_SIZE=1024; // must be a power of two, non-final to support setting by tests
/** a hash table with long keys (what we're counting) and integer values (counts) */
private static class LongCounts {
static final float LOAD_FACTOR = 0.7f;
long[] vals;
int[] counts; // maintain the counts here since we need them to tell if there was actually a value anyway
int[] oldToNewMapping;
int cardinality;
int threshold;
/** sz must be a power of two */
LongCounts(int sz) {
vals = new long[sz];
counts = new int[sz];
threshold = (int) (sz * LOAD_FACTOR);
}
/** Current number of slots in the hash table */
int numSlots() {
return vals.length;
}
private int hash(long val) {
// For floats: exponent bits start at bit 23 for single precision,
// and bit 52 for double precision.
// Many values will only have significant bits just to the right of that,
// and the leftmost bits will all be zero.
// For now, lets just settle to get first 8 significant mantissa bits of double or float in the lowest bits of our hash
// The upper bits of our hash will be irrelevant.
int h = (int) (val + (val >>> 44) + (val >>> 15));
return h;
}
/** returns the slot */
int add(long val) {
if (cardinality >= threshold) {
rehash();
}
int h = hash(val);
for (int slot = h & (vals.length-1); ;slot = (slot + ((h>>7)|1)) & (vals.length-1)) {
int count = counts[slot];
if (count == 0) {
counts[slot] = 1;
vals[slot] = val;
cardinality++;
return slot;
} else if (vals[slot] == val) {
// val is already in the set
counts[slot] = count + 1;
return slot;
}
}
}
protected void rehash() {
long[] oldVals = vals;
int[] oldCounts = counts; // after retrieving the count, this array is reused as a mapping to new array
int newCapacity = vals.length << 1;
vals = new long[newCapacity];
counts = new int[newCapacity];
threshold = (int) (newCapacity * LOAD_FACTOR);
for (int i=0; i<oldVals.length; i++) {
int count = oldCounts[i];
if (count == 0) {
oldCounts[i] = -1;
continue;
}
long val = oldVals[i];
int h = hash(val);
int slot = h & (vals.length-1);
while (counts[slot] != 0) {
slot = (slot + ((h>>7)|1)) & (vals.length-1);
}
counts[slot] = count;
vals[slot] = val;
oldCounts[i] = slot;
}
oldToNewMapping = oldCounts;
}
int cardinality() {
return cardinality;
}
}
/** A hack instance of Calc for Term ordinals in DocValues. */
// TODO consider making FacetRangeProcessor.Calc facet top level; then less of a hack?
private class TermOrdCalc extends FacetRangeProcessor.Calc {
IntFunction<BytesRef> lookupOrdFunction; // set in collectDocs()!
TermOrdCalc() throws IOException {
super(sf);
}
@Override
public long bitsToSortableBits(long globalOrd) {
return globalOrd;
}
/** To be returned in "buckets"/"val" */
@Override
@SuppressWarnings({"rawtypes"})
public Comparable bitsToValue(long globalOrd) {
BytesRef bytesRef = lookupOrdFunction.apply((int) globalOrd);
// note FacetFieldProcessorByArray.findTopSlots also calls SchemaFieldType.toObject
return sf.getType().toObject(sf, bytesRef).toString();
}
@Override
@SuppressWarnings({"rawtypes"})
public String formatValue(Comparable val) {
return (String) val;
}
@Override
@SuppressWarnings({"rawtypes"})
protected Comparable parseStr(String rawval) throws ParseException {
throw new UnsupportedOperationException();
}
@Override
@SuppressWarnings({"rawtypes"})
protected Comparable parseAndAddGap(Comparable value, String gap) throws ParseException {
throw new UnsupportedOperationException();
}
}
FacetRangeProcessor.Calc calc;
LongCounts table;
int allBucketsSlot = -1;
FacetFieldProcessorByHashDV(FacetContext fcontext, FacetField freq, SchemaField sf) {
super(fcontext, freq, sf);
if (freq.mincount == 0) {
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
getClass()+" doesn't support mincount=0");
}
if (freq.prefix != null) {
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
getClass()+" doesn't support prefix"); // yet, but it could
}
FieldInfo fieldInfo = fcontext.searcher.getFieldInfos().fieldInfo(sf.getName());
if (fieldInfo != null &&
fieldInfo.getDocValuesType() != DocValuesType.NUMERIC &&
fieldInfo.getDocValuesType() != DocValuesType.SORTED &&
fieldInfo.getDocValuesType() != DocValuesType.SORTED_NUMERIC) {
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
getClass()+" only support single valued number/string with docValues");
}
}
@Override
public void process() throws IOException {
super.process();
response = calcFacets();
table = null;//gc
}
private SimpleOrderedMap<Object> calcFacets() throws IOException {
if (sf.getType().getNumberType() != null) {
calc = FacetRangeProcessor.getNumericCalc(sf);
} else {
calc = new TermOrdCalc(); // kind of a hack
}
// TODO: Use the number of indexed terms, if present, as an estimate!
// Even for NumericDocValues, we could check for a terms index for an estimate.
// Our estimation should aim high to avoid expensive rehashes.
int possibleValues = fcontext.base.size();
// size smaller tables so that no resize will be necessary
int currHashSize = BitUtil.nextHighestPowerOfTwo((int) (possibleValues * (1 / LongCounts.LOAD_FACTOR) + 1));
currHashSize = Math.min(currHashSize, MAXIMUM_STARTING_TABLE_SIZE);
table = new LongCounts(currHashSize) {
@Override
protected void rehash() {
super.rehash();
doRehash(this);
oldToNewMapping = null; // allow for gc
}
};
// note: these methods/phases align with FacetFieldProcessorByArray's
createCollectAcc();
collectDocs();
return super.findTopSlots(table.numSlots(), table.cardinality(),
slotNum -> calc.bitsToValue(table.vals[slotNum]), // getBucketValFromSlotNum
val -> calc.formatValue(val)); // getFieldQueryVal
}
private void createCollectAcc() throws IOException {
int numSlots = table.numSlots();
if (freq.allBuckets) {
allBucketsSlot = numSlots++;
}
indexOrderAcc = new SlotAcc(fcontext) {
@Override
public void collect(int doc, int slot, IntFunction<SlotContext> slotContext) throws IOException {
}
@Override
public int compare(int slotA, int slotB) {
long s1 = calc.bitsToSortableBits(table.vals[slotA]);
long s2 = calc.bitsToSortableBits(table.vals[slotB]);
return Long.compare(s1, s2);
}
@Override
public Object getValue(int slotNum) throws IOException {
return null;
}
@Override
public void reset() {
}
@Override
public void resize(Resizer resizer) {
}
};
countAcc = new SlotAcc.CountSlotAcc(fcontext) {
@Override
public void incrementCount(int slot, int count) {
throw new UnsupportedOperationException();
}
@Override
public int getCount(int slot) {
return table.counts[slot];
}
@Override
public Object getValue(int slotNum) {
return getCount(slotNum);
}
@Override
public void reset() {
throw new UnsupportedOperationException();
}
@Override
public void collect(int doc, int slot, IntFunction<SlotContext> slotContext) throws IOException {
throw new UnsupportedOperationException();
}
@Override
public int compare(int slotA, int slotB) {
return Integer.compare( table.counts[slotA], table.counts[slotB] );
}
@Override
public void resize(Resizer resizer) {
throw new UnsupportedOperationException();
}
};
// we set the countAcc & indexAcc first so generic ones won't be created for us.
super.createCollectAcc(fcontext.base.size(), numSlots);
if (freq.allBuckets) {
allBucketsAcc = new SpecialSlotAcc(fcontext, collectAcc, allBucketsSlot, otherAccs, 0);
}
}
private void collectDocs() throws IOException {
if (calc instanceof TermOrdCalc) { // Strings
// TODO support SortedSetDocValues
SortedDocValues globalDocValues = FieldUtil.getSortedDocValues(fcontext.qcontext, sf, null);
((TermOrdCalc)calc).lookupOrdFunction = ord -> {
try {
return globalDocValues.lookupOrd(ord);
} catch (IOException e) {
throw new RuntimeException(e);
}
};
DocSetUtil.collectSortedDocSet(fcontext.base, fcontext.searcher.getIndexReader(), new SimpleCollector() {
SortedDocValues docValues = globalDocValues; // this segment/leaf. NN
LongValues toGlobal = LongValues.IDENTITY; // this segment to global ordinal. NN
@Override public ScoreMode scoreMode() { return ScoreMode.COMPLETE_NO_SCORES; }
@Override
protected void doSetNextReader(LeafReaderContext ctx) throws IOException {
setNextReaderFirstPhase(ctx);
if (globalDocValues instanceof MultiDocValues.MultiSortedDocValues) {
MultiDocValues.MultiSortedDocValues multiDocValues = (MultiDocValues.MultiSortedDocValues) globalDocValues;
docValues = multiDocValues.values[ctx.ord];
toGlobal = multiDocValues.mapping.getGlobalOrds(ctx.ord);
}
}
@Override
public void collect(int segDoc) throws IOException {
if (docValues.advanceExact(segDoc)) {
long val = toGlobal.get(docValues.ordValue());
collectValFirstPhase(segDoc, val);
}
}
});
} else { // Numeric:
if (sf.multiValued()) {
DocSetUtil.collectSortedDocSet(fcontext.base, fcontext.searcher.getIndexReader(), new SimpleCollector() {
SortedNumericDocValues values = null; //NN
@Override public ScoreMode scoreMode() { return ScoreMode.COMPLETE_NO_SCORES; }
@Override
protected void doSetNextReader(LeafReaderContext ctx) throws IOException {
setNextReaderFirstPhase(ctx);
values = DocValues.getSortedNumeric(ctx.reader(), sf.getName());
}
@Override
public void collect(int segDoc) throws IOException {
if (values.advanceExact(segDoc)) {
long l = values.nextValue(); // This document must have at least one value
collectValFirstPhase(segDoc, l);
for (int i = 1, count = values.docValueCount(); i < count; i++) {
long lnew = values.nextValue();
if (lnew != l) { // Skip the value if it's equal to the last one, we don't want to double-count it
collectValFirstPhase(segDoc, lnew);
}
l = lnew;
}
}
}
});
} else {
DocSetUtil.collectSortedDocSet(fcontext.base, fcontext.searcher.getIndexReader(), new SimpleCollector() {
NumericDocValues values = null; //NN
@Override public ScoreMode scoreMode() { return ScoreMode.COMPLETE_NO_SCORES; }
@Override
protected void doSetNextReader(LeafReaderContext ctx) throws IOException {
setNextReaderFirstPhase(ctx);
values = DocValues.getNumeric(ctx.reader(), sf.getName());
}
@Override
public void collect(int segDoc) throws IOException {
if (values.advanceExact(segDoc)) {
collectValFirstPhase(segDoc, values.longValue());
}
}
});
}
}
}
private void collectValFirstPhase(int segDoc, long val) throws IOException {
int slot = table.add(val); // this can trigger a rehash
// Our countAcc is virtual, so this is not needed:
// countAcc.incrementCount(slot, 1);
super.collectFirstPhase(segDoc, slot, slotContext);
}
/**
* SlotContext to use during all {@link SlotAcc} collection.
*
* This avoids a memory allocation for each invocation of collectValFirstPhase.
*/
private IntFunction<SlotContext> slotContext = (slotNum) -> {
long val = table.vals[slotNum];
@SuppressWarnings({"rawtypes"})
Comparable value = calc.bitsToValue(val);
return new SlotContext(sf.getType().getFieldQuery(null, sf, calc.formatValue(value)));
};
private void doRehash(LongCounts table) {
if (collectAcc == null && allBucketsAcc == null) return;
// Our "count" acc is backed by the hash table and will already be rehashed
// otherAccs don't need to be rehashed
int newTableSize = table.numSlots();
int numSlots = newTableSize;
final int oldAllBucketsSlot = allBucketsSlot;
if (oldAllBucketsSlot >= 0) {
allBucketsSlot = numSlots++;
}
final int finalNumSlots = numSlots;
final int[] mapping = table.oldToNewMapping;
SlotAcc.Resizer resizer = new SlotAcc.Resizer() {
@Override
public int getNewSize() {
return finalNumSlots;
}
@Override
public int getNewSlot(int oldSlot) {
if (oldSlot < mapping.length) {
return mapping[oldSlot];
}
if (oldSlot == oldAllBucketsSlot) {
return allBucketsSlot;
}
return -1;
}
};
// NOTE: resizing isn't strictly necessary for missing/allBuckets... we could just set the new slot directly
if (collectAcc != null) {
collectAcc.resize(resizer);
}
if (allBucketsAcc != null) {
allBucketsAcc.resize(resizer);
}
}
}