| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.search; |
| |
| import java.io.IOException; |
| import java.util.AbstractList; |
| import java.util.Arrays; |
| import java.util.Collection; |
| import java.util.Objects; |
| |
| import org.apache.lucene.index.DocValues; |
| import org.apache.lucene.index.IndexReader; |
| import org.apache.lucene.index.LeafReaderContext; |
| import org.apache.lucene.index.PrefixCodedTerms; |
| import org.apache.lucene.index.PrefixCodedTerms.TermIterator; |
| import org.apache.lucene.index.SortedSetDocValues; |
| import org.apache.lucene.index.Term; |
| import org.apache.lucene.util.Accountable; |
| import org.apache.lucene.util.ArrayUtil; |
| import org.apache.lucene.util.BytesRef; |
| import org.apache.lucene.util.FixedBitSet; |
| import org.apache.lucene.util.LongBitSet; |
| import org.apache.lucene.util.RamUsageEstimator; |
| |
| /** |
| * A {@link Query} that only accepts documents whose |
| * term value in the specified field is contained in the |
| * provided set of allowed terms. |
| * |
| * <p> |
| * This is the same functionality as TermsQuery (from |
| * queries/), but because of drastically different |
| * implementations, they also have different performance |
| * characteristics, as described below. |
| * |
| * <p> |
| * <b>NOTE</b>: be very careful using this query: it is |
| * typically much slower than using {@code TermsQuery}, |
| * but in certain specialized cases may be faster. |
| * |
| * <p> |
| * With each search, this query translates the specified |
| * set of Terms into a private {@link LongBitSet} keyed by |
| * term number per unique {@link IndexReader} (normally one |
| * reader per segment). Then, during matching, the term |
| * number for each docID is retrieved from the cache and |
| * then checked for inclusion using the {@link LongBitSet}. |
| * Since all testing is done using RAM resident data |
| * structures, performance should be very fast, most likely |
| * fast enough to not require further caching of the |
| * DocIdSet for each possible combination of terms. |
| * However, because docIDs are simply scanned linearly, an |
| * index with a great many small documents may find this |
| * linear scan too costly. |
| * |
| * <p> |
| * In contrast, TermsQuery builds up an {@link FixedBitSet}, |
| * keyed by docID, every time it's created, by enumerating |
| * through all matching docs using {@link org.apache.lucene.index.PostingsEnum} to seek |
| * and scan through each term's docID list. While there is |
| * no linear scan of all docIDs, besides the allocation of |
| * the underlying array in the {@link FixedBitSet}, this |
| * approach requires a number of "disk seeks" in proportion |
| * to the number of terms, which can be exceptionally costly |
| * when there are cache misses in the OS's IO cache. |
| * |
| * <p> |
| * Generally, this filter will be slower on the first |
| * invocation for a given field, but subsequent invocations, |
| * even if you change the allowed set of Terms, should be |
| * faster than TermsQuery, especially as the number of |
| * Terms being matched increases. If you are matching only |
| * a very small number of terms, and those terms in turn |
| * match a very small number of documents, TermsQuery may |
| * perform faster. |
| * |
| * <p> |
| * Which query is best is very application dependent. |
| * |
| * @lucene.experimental |
| */ |
| public class DocValuesTermsQuery extends Query implements Accountable { |
| private static final long BASE_RAM_BYTES = RamUsageEstimator.shallowSizeOfInstance(DocValuesTermsQuery.class); |
| |
| private final String field; |
| private final PrefixCodedTerms termData; |
| private final int termDataHashCode; // cached hashcode of termData |
| |
| public DocValuesTermsQuery(String field, Collection<BytesRef> terms) { |
| this.field = Objects.requireNonNull(field); |
| Objects.requireNonNull(terms, "Collection of terms must not be null"); |
| BytesRef[] sortedTerms = terms.toArray(new BytesRef[terms.size()]); |
| ArrayUtil.timSort(sortedTerms); |
| PrefixCodedTerms.Builder builder = new PrefixCodedTerms.Builder(); |
| BytesRef previous = null; |
| for (BytesRef term : sortedTerms) { |
| if (term.equals(previous) == false) { |
| builder.add(field, term); |
| } |
| previous = term; |
| } |
| termData = builder.finish(); |
| termDataHashCode = termData.hashCode(); |
| } |
| |
| public DocValuesTermsQuery(String field, BytesRef... terms) { |
| this(field, Arrays.asList(terms)); |
| } |
| |
| public DocValuesTermsQuery(String field, String... terms) { |
| this(field, new AbstractList<BytesRef>() { |
| @Override |
| public BytesRef get(int index) { |
| return new BytesRef(terms[index]); |
| } |
| @Override |
| public int size() { |
| return terms.length; |
| } |
| }); |
| } |
| |
| @Override |
| public boolean equals(Object other) { |
| return sameClassAs(other) && |
| equalsTo(getClass().cast(other)); |
| } |
| |
| private boolean equalsTo(DocValuesTermsQuery other) { |
| // termData might be heavy to compare so check the hash code first |
| return termDataHashCode == other.termDataHashCode && |
| termData.equals(other.termData); |
| } |
| |
| @Override |
| public int hashCode() { |
| return 31 * classHash() + termDataHashCode; |
| } |
| |
| @Override |
| public String toString(String defaultField) { |
| StringBuilder builder = new StringBuilder(); |
| boolean first = true; |
| TermIterator iterator = termData.iterator(); |
| for (BytesRef term = iterator.next(); term != null; term = iterator.next()) { |
| if (!first) { |
| builder.append(' '); |
| } |
| first = false; |
| builder.append(new Term(iterator.field(), term).toString()); |
| } |
| |
| return builder.toString(); |
| } |
| |
| /** |
| * @return the name of the field searched by this query. |
| */ |
| public String getField() { |
| return field; |
| } |
| |
| /** |
| * @return the terms looked up by this query, prefix-encoded. |
| */ |
| public PrefixCodedTerms getTerms() { |
| return termData; |
| } |
| |
| @Override |
| public long ramBytesUsed() { |
| return BASE_RAM_BYTES + |
| RamUsageEstimator.sizeOfObject(field) + |
| RamUsageEstimator.sizeOfObject(termData); |
| } |
| |
| @Override |
| public void visit(QueryVisitor visitor) { |
| if (visitor.acceptField(field)) { |
| visitor.visitLeaf(this); |
| } |
| } |
| |
| @Override |
| public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException { |
| return new ConstantScoreWeight(this, boost) { |
| |
| @Override |
| public Scorer scorer(LeafReaderContext context) throws IOException { |
| final SortedSetDocValues values = DocValues.getSortedSet(context.reader(), field); |
| final LongBitSet bits = new LongBitSet(values.getValueCount()); |
| boolean matchesAtLeastOneTerm = false; |
| TermIterator iterator = termData.iterator(); |
| for (BytesRef term = iterator.next(); term != null; term = iterator.next()) { |
| final long ord = values.lookupTerm(term); |
| if (ord >= 0) { |
| matchesAtLeastOneTerm = true; |
| bits.set(ord); |
| } |
| } |
| if (matchesAtLeastOneTerm == false) { |
| return null; |
| } |
| return new ConstantScoreScorer(this, score(), scoreMode, new TwoPhaseIterator(values) { |
| |
| @Override |
| public boolean matches() throws IOException { |
| for (long ord = values.nextOrd(); ord != SortedSetDocValues.NO_MORE_ORDS; ord = values.nextOrd()) { |
| if (bits.get(ord)) { |
| return true; |
| } |
| } |
| return false; |
| } |
| |
| @Override |
| public float matchCost() { |
| return 3; // lookup in a bitset |
| } |
| |
| }); |
| } |
| |
| @Override |
| public boolean isCacheable(LeafReaderContext ctx) { |
| return DocValues.isCacheable(ctx, field); |
| } |
| |
| }; |
| } |
| |
| } |