lucene/sandbox/src/java/org/apache/lucene/search/DocValuesTermsQuery.java - lucene-solr - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.lucene.search;

 import java.io.IOException;
 import java.util.AbstractList;
 import java.util.Arrays;
 import java.util.Collection;
 import java.util.Objects;

 import org.apache.lucene.index.DocValues;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.index.PrefixCodedTerms;
 import org.apache.lucene.index.PrefixCodedTerms.TermIterator;
 import org.apache.lucene.index.SortedSetDocValues;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.util.Accountable;
 import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.FixedBitSet;
 import org.apache.lucene.util.LongBitSet;
 import org.apache.lucene.util.RamUsageEstimator;

 /**
  * A {@link Query} that only accepts documents whose
  * term value in the specified field is contained in the
  * provided set of allowed terms.
  *
  * <p>
  * This is the same functionality as TermsQuery (from
  * queries/), but because of drastically different
  * implementations, they also have different performance
  * characteristics, as described below.
  *
  * <p>
  * <b>NOTE</b>: be very careful using this query: it is
  * typically much slower than using {@code TermsQuery},
  * but in certain specialized cases may be faster.
  *
  * <p>
  * With each search, this query translates the specified
  * set of Terms into a private {@link LongBitSet} keyed by
  * term number per unique {@link IndexReader} (normally one
  * reader per segment).  Then, during matching, the term
  * number for each docID is retrieved from the cache and
  * then checked for inclusion using the {@link LongBitSet}.
  * Since all testing is done using RAM resident data
  * structures, performance should be very fast, most likely
  * fast enough to not require further caching of the
  * DocIdSet for each possible combination of terms.
  * However, because docIDs are simply scanned linearly, an
  * index with a great many small documents may find this
  * linear scan too costly.
  *
  * <p>
  * In contrast, TermsQuery builds up an {@link FixedBitSet},
  * keyed by docID, every time it's created, by enumerating
  * through all matching docs using {@link org.apache.lucene.index.PostingsEnum} to seek
  * and scan through each term's docID list.  While there is
  * no linear scan of all docIDs, besides the allocation of
  * the underlying array in the {@link FixedBitSet}, this
  * approach requires a number of "disk seeks" in proportion
  * to the number of terms, which can be exceptionally costly
  * when there are cache misses in the OS's IO cache.
  *
  * <p>
  * Generally, this filter will be slower on the first
  * invocation for a given field, but subsequent invocations,
  * even if you change the allowed set of Terms, should be
  * faster than TermsQuery, especially as the number of
  * Terms being matched increases.  If you are matching only
  * a very small number of terms, and those terms in turn
  * match a very small number of documents, TermsQuery may
  * perform faster.
  *
  * <p>
  * Which query is best is very application dependent.
  *
  * @lucene.experimental
  */
 public class DocValuesTermsQuery extends Query implements Accountable {
   private static final long BASE_RAM_BYTES = RamUsageEstimator.shallowSizeOfInstance(DocValuesTermsQuery.class);

   private final String field;
   private final PrefixCodedTerms termData;
   private final int termDataHashCode; // cached hashcode of termData

   public DocValuesTermsQuery(String field, Collection<BytesRef> terms) {
     this.field = Objects.requireNonNull(field);
     Objects.requireNonNull(terms, "Collection of terms must not be null");
     BytesRef[] sortedTerms = terms.toArray(new BytesRef[terms.size()]);
     ArrayUtil.timSort(sortedTerms);
     PrefixCodedTerms.Builder builder = new PrefixCodedTerms.Builder();
     BytesRef previous = null;
     for (BytesRef term : sortedTerms) {
       if (term.equals(previous) == false) {
         builder.add(field, term);
       }
       previous = term;
     }
     termData = builder.finish();
     termDataHashCode = termData.hashCode();
   }

   public DocValuesTermsQuery(String field, BytesRef... terms) {
     this(field, Arrays.asList(terms));
   }

   public DocValuesTermsQuery(String field, String... terms) {
     this(field, new AbstractList<BytesRef>() {
       @Override
       public BytesRef get(int index) {
         return new BytesRef(terms[index]);
       }
       @Override
       public int size() {
         return terms.length;
       }
     });
   }

   @Override
   public boolean equals(Object other) {
     return sameClassAs(other) &&
            equalsTo(getClass().cast(other));
   }

   private boolean equalsTo(DocValuesTermsQuery other) {
     // termData might be heavy to compare so check the hash code first
     return termDataHashCode == other.termDataHashCode &&
            termData.equals(other.termData);
   }

   @Override
   public int hashCode() {
     return 31 * classHash() + termDataHashCode;
   }

   @Override
   public String toString(String defaultField) {
     StringBuilder builder = new StringBuilder();
     boolean first = true;
     TermIterator iterator = termData.iterator();
     for (BytesRef term = iterator.next(); term != null; term = iterator.next()) {
       if (!first) {
         builder.append(' ');
       }
       first = false;
       builder.append(new Term(iterator.field(), term).toString());
     }

     return builder.toString();
   }

   /**
    * @return the name of the field searched by this query.
    */
   public String getField() {
     return field;
   }

   /**
    * @return the terms looked up by this query, prefix-encoded.
    */
   public PrefixCodedTerms getTerms() {
     return termData;
   }

   @Override
   public long ramBytesUsed() {
     return BASE_RAM_BYTES +
         RamUsageEstimator.sizeOfObject(field) +
         RamUsageEstimator.sizeOfObject(termData);
   }

   @Override
   public void visit(QueryVisitor visitor) {
     if (visitor.acceptField(field)) {
       visitor.visitLeaf(this);
     }
   }

   @Override
   public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException {
     return new ConstantScoreWeight(this, boost) {

       @Override
       public Scorer scorer(LeafReaderContext context) throws IOException {
         final SortedSetDocValues values = DocValues.getSortedSet(context.reader(), field);
         final LongBitSet bits = new LongBitSet(values.getValueCount());
         boolean matchesAtLeastOneTerm = false;
         TermIterator iterator = termData.iterator();
         for (BytesRef term = iterator.next(); term != null; term = iterator.next()) {
           final long ord = values.lookupTerm(term);
           if (ord >= 0) {
             matchesAtLeastOneTerm = true;
             bits.set(ord);
           }
         }
         if (matchesAtLeastOneTerm == false) {
           return null;
         }
         return new ConstantScoreScorer(this, score(), scoreMode, new TwoPhaseIterator(values) {

           @Override
           public boolean matches() throws IOException {
             for (long ord = values.nextOrd(); ord != SortedSetDocValues.NO_MORE_ORDS; ord = values.nextOrd()) {
               if (bits.get(ord)) {
                 return true;
               }
             }
             return false;
           }

           @Override
           public float matchCost() {
             return 3; // lookup in a bitset
           }

         });
       }

       @Override
       public boolean isCacheable(LeafReaderContext ctx) {
         return DocValues.isCacheable(ctx, field);
       }

     };
   }

 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.lucene.search;

	import java.io.IOException;
	import java.util.AbstractList;
	import java.util.Arrays;
	import java.util.Collection;
	import java.util.Objects;

	import org.apache.lucene.index.DocValues;
	import org.apache.lucene.index.IndexReader;
	import org.apache.lucene.index.LeafReaderContext;
	import org.apache.lucene.index.PrefixCodedTerms;
	import org.apache.lucene.index.PrefixCodedTerms.TermIterator;
	import org.apache.lucene.index.SortedSetDocValues;
	import org.apache.lucene.index.Term;
	import org.apache.lucene.util.Accountable;
	import org.apache.lucene.util.ArrayUtil;
	import org.apache.lucene.util.BytesRef;
	import org.apache.lucene.util.FixedBitSet;
	import org.apache.lucene.util.LongBitSet;
	import org.apache.lucene.util.RamUsageEstimator;

	/**
	* A {@link Query} that only accepts documents whose
	* term value in the specified field is contained in the
	* provided set of allowed terms.
	*
	* <p>
	* This is the same functionality as TermsQuery (from
	* queries/), but because of drastically different
	* implementations, they also have different performance
	* characteristics, as described below.
	*
	* <p>
	* <b>NOTE</b>: be very careful using this query: it is
	* typically much slower than using {@code TermsQuery},
	* but in certain specialized cases may be faster.
	*
	* <p>
	* With each search, this query translates the specified
	* set of Terms into a private {@link LongBitSet} keyed by
	* term number per unique {@link IndexReader} (normally one
	* reader per segment). Then, during matching, the term
	* number for each docID is retrieved from the cache and
	* then checked for inclusion using the {@link LongBitSet}.
	* Since all testing is done using RAM resident data
	* structures, performance should be very fast, most likely
	* fast enough to not require further caching of the
	* DocIdSet for each possible combination of terms.
	* However, because docIDs are simply scanned linearly, an
	* index with a great many small documents may find this
	* linear scan too costly.
	*
	* <p>
	* In contrast, TermsQuery builds up an {@link FixedBitSet},
	* keyed by docID, every time it's created, by enumerating
	* through all matching docs using {@link org.apache.lucene.index.PostingsEnum} to seek
	* and scan through each term's docID list. While there is
	* no linear scan of all docIDs, besides the allocation of
	* the underlying array in the {@link FixedBitSet}, this
	* approach requires a number of "disk seeks" in proportion
	* to the number of terms, which can be exceptionally costly
	* when there are cache misses in the OS's IO cache.
	*
	* <p>
	* Generally, this filter will be slower on the first
	* invocation for a given field, but subsequent invocations,
	* even if you change the allowed set of Terms, should be
	* faster than TermsQuery, especially as the number of
	* Terms being matched increases. If you are matching only
	* a very small number of terms, and those terms in turn
	* match a very small number of documents, TermsQuery may
	* perform faster.
	*
	* <p>
	* Which query is best is very application dependent.
	*
	* @lucene.experimental
	*/
	public class DocValuesTermsQuery extends Query implements Accountable {
	private static final long BASE_RAM_BYTES = RamUsageEstimator.shallowSizeOfInstance(DocValuesTermsQuery.class);

	private final String field;
	private final PrefixCodedTerms termData;
	private final int termDataHashCode; // cached hashcode of termData

	public DocValuesTermsQuery(String field, Collection<BytesRef> terms) {
	this.field = Objects.requireNonNull(field);
	Objects.requireNonNull(terms, "Collection of terms must not be null");
	BytesRef[] sortedTerms = terms.toArray(new BytesRef[terms.size()]);
	ArrayUtil.timSort(sortedTerms);
	PrefixCodedTerms.Builder builder = new PrefixCodedTerms.Builder();
	BytesRef previous = null;
	for (BytesRef term : sortedTerms) {
	if (term.equals(previous) == false) {
	builder.add(field, term);
	}
	previous = term;
	}
	termData = builder.finish();
	termDataHashCode = termData.hashCode();
	}

	public DocValuesTermsQuery(String field, BytesRef... terms) {
	this(field, Arrays.asList(terms));
	}

	public DocValuesTermsQuery(String field, String... terms) {
	this(field, new AbstractList<BytesRef>() {
	@Override
	public BytesRef get(int index) {
	return new BytesRef(terms[index]);
	}
	@Override
	public int size() {
	return terms.length;
	}
	});
	}

	@Override
	public boolean equals(Object other) {
	return sameClassAs(other) &&
	equalsTo(getClass().cast(other));
	}

	private boolean equalsTo(DocValuesTermsQuery other) {
	// termData might be heavy to compare so check the hash code first
	return termDataHashCode == other.termDataHashCode &&
	termData.equals(other.termData);
	}

	@Override
	public int hashCode() {
	return 31 * classHash() + termDataHashCode;
	}

	@Override
	public String toString(String defaultField) {
	StringBuilder builder = new StringBuilder();
	boolean first = true;
	TermIterator iterator = termData.iterator();
	for (BytesRef term = iterator.next(); term != null; term = iterator.next()) {
	if (!first) {
	builder.append(' ');
	}
	first = false;
	builder.append(new Term(iterator.field(), term).toString());
	}

	return builder.toString();
	}

	/**
	* @return the name of the field searched by this query.
	*/
	public String getField() {
	return field;
	}

	/**
	* @return the terms looked up by this query, prefix-encoded.
	*/
	public PrefixCodedTerms getTerms() {
	return termData;
	}

	@Override
	public long ramBytesUsed() {
	return BASE_RAM_BYTES +
	RamUsageEstimator.sizeOfObject(field) +
	RamUsageEstimator.sizeOfObject(termData);
	}

	@Override
	public void visit(QueryVisitor visitor) {
	if (visitor.acceptField(field)) {
	visitor.visitLeaf(this);
	}
	}

	@Override
	public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException {
	return new ConstantScoreWeight(this, boost) {

	@Override
	public Scorer scorer(LeafReaderContext context) throws IOException {
	final SortedSetDocValues values = DocValues.getSortedSet(context.reader(), field);
	final LongBitSet bits = new LongBitSet(values.getValueCount());
	boolean matchesAtLeastOneTerm = false;
	TermIterator iterator = termData.iterator();
	for (BytesRef term = iterator.next(); term != null; term = iterator.next()) {
	final long ord = values.lookupTerm(term);
	if (ord >= 0) {
	matchesAtLeastOneTerm = true;
	bits.set(ord);
	}
	}
	if (matchesAtLeastOneTerm == false) {
	return null;
	}
	return new ConstantScoreScorer(this, score(), scoreMode, new TwoPhaseIterator(values) {

	@Override
	public boolean matches() throws IOException {
	for (long ord = values.nextOrd(); ord != SortedSetDocValues.NO_MORE_ORDS; ord = values.nextOrd()) {
	if (bits.get(ord)) {
	return true;
	}
	}
	return false;
	}

	@Override
	public float matchCost() {
	return 3; // lookup in a bitset
	}

	});
	}

	@Override
	public boolean isCacheable(LeafReaderContext ctx) {
	return DocValues.isCacheable(ctx, field);
	}

	};
	}

	}