lucene/queries/src/java/org/apache/lucene/queries/intervals/TermIntervalsSource.java - lucene-solr - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 package org.apache.lucene.queries.intervals;

 import java.io.IOException;
 import java.util.Collection;
 import java.util.Collections;
 import java.util.Objects;

 import org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat;
 import org.apache.lucene.codecs.lucene84.Lucene84PostingsReader;
 import org.apache.lucene.index.LeafReader;
 import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.index.PostingsEnum;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.index.Terms;
 import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.search.MatchesIterator;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.search.QueryVisitor;
 import org.apache.lucene.search.TermQuery;
 import org.apache.lucene.search.TwoPhaseIterator;
 import org.apache.lucene.util.BytesRef;

 /**
  * An {@link IntervalsSource} over a single term. Each occurrence of the term
  * in a document is reported as an interval whose start and end are both the
  * position of that occurrence.
  */
 class TermIntervalsSource extends IntervalsSource {

   /** The term whose positions are reported as intervals. */
   final BytesRef term;

   TermIntervalsSource(BytesRef term) {
     this.term = term;
   }

   /**
    * Positions a fresh {@link TermsEnum} for {@code field} on {@link #term}.
    *
    * @return the positioned enum, or {@code null} if the reader has no terms
    *         for the field or the term is not found in it
    * @throws IllegalArgumentException if the field's terms have no positions
    */
   private TermsEnum seekToTerm(String field, LeafReaderContext ctx) throws IOException {
     final Terms fieldTerms = ctx.reader().terms(field);
     if (fieldTerms == null) {
       return null;
     }
     if (!fieldTerms.hasPositions()) {
       throw new IllegalArgumentException("Cannot create an IntervalIterator over field " + field + " because it has no indexed positions");
     }
     final TermsEnum termsEnum = fieldTerms.iterator();
     return termsEnum.seekExact(term) ? termsEnum : null;
   }

   /**
    * Returns an iterator over the intervals of this term in the given segment,
    * or {@code null} when the field or the term is absent from it.
    */
   @Override
   public IntervalIterator intervals(String field, LeafReaderContext ctx) throws IOException {
     final TermsEnum termsEnum = seekToTerm(field, ctx);
     return termsEnum == null ? null : intervals(term, termsEnum);
   }

   /**
    * Builds an {@link IntervalIterator} on top of the positions postings of
    * the term that {@code te} is currently positioned on.
    *
    * @param term only used to label the iterator in {@code toString()}
    * @param te   enum from which the postings and the match cost are taken
    */
   static IntervalIterator intervals(BytesRef term, TermsEnum te) throws IOException {
     final PostingsEnum postings = te.postings(null, PostingsEnum.POSITIONS);
     final float positionsCost = termPositionsCost(te);
     return new IntervalIterator() {

       /** Position of the current interval; -1 until the first interval of a document is read. */
       private int position = -1;

       /** Positions of the current document that have not been read yet. */
       private int remaining;

       @Override
       public int docID() {
         return postings.docID();
       }

       @Override
       public int nextDoc() throws IOException {
         final int docId = postings.nextDoc();
         reset();
         return docId;
       }

       @Override
       public int advance(int target) throws IOException {
         final int docId = postings.advance(target);
         reset();
         return docId;
       }

       @Override
       public long cost() {
         return postings.cost();
       }

       @Override
       public int start() {
         return position;
       }

       @Override
       public int end() {
         return position;
       }

       @Override
       public int gaps() {
         return 0;
       }

       @Override
       public int nextInterval() throws IOException {
         if (remaining > 0) {
           remaining--;
           position = postings.nextPosition();
         } else {
           position = NO_MORE_INTERVALS;
         }
         return position;
       }

       @Override
       public float matchCost() {
         return positionsCost;
       }

       /** Re-initialises the per-document state after the postings moved to another document. */
       private void reset() throws IOException {
         final boolean exhausted = postings.docID() == NO_MORE_DOCS;
         // freq() is only consulted while the postings are on a real document
         remaining = exhausted ? -1 : postings.freq();
         position = exhausted ? NO_MORE_INTERVALS : -1;
       }

       @Override
       public String toString() {
         return term.utf8ToString() + ":" + super.toString();
       }
     };
   }

   /**
    * Returns the matches of this term in document {@code doc} of the given
    * segment, or {@code null} when the field, the term or the document does
    * not match.
    */
   @Override
   public IntervalMatchesIterator matches(String field, LeafReaderContext ctx, int doc) throws IOException {
     final TermsEnum termsEnum = seekToTerm(field, ctx);
     if (termsEnum == null) {
       return null;
     }
     return matches(termsEnum, doc, field);
   }

   /**
    * Builds a matches iterator for document {@code doc} from the postings of
    * the term that {@code te} is positioned on.
    *
    * @return the iterator, or {@code null} if advancing the postings to
    *         {@code doc} does not land exactly on {@code doc}
    */
   static IntervalMatchesIterator matches(TermsEnum te, int doc, String field) throws IOException {
     final TermQuery termQuery = new TermQuery(new Term(field, te.term()));
     final PostingsEnum postings = te.postings(null, PostingsEnum.OFFSETS);
     if (postings.advance(doc) != doc) {
       return null;
     }
     final int occurrences = postings.freq();
     return new IntervalMatchesIterator() {

       /** Positions of the document that have not been read yet. */
       private int remaining = occurrences;

       /** Position of the current match; -1 until {@link #next()} is called. */
       private int position = -1;

       @Override
       public int gaps() {
         return 0;
       }

       @Override
       public int width() {
         return 1;
       }

       @Override
       public boolean next() throws IOException {
         final boolean hasMore = remaining > 0;
         if (hasMore) {
           remaining--;
           position = postings.nextPosition();
         } else {
           position = IntervalIterator.NO_MORE_INTERVALS;
         }
         return hasMore;
       }

       @Override
       public int startPosition() {
         return position;
       }

       @Override
       public int endPosition() {
         return position;
       }

       @Override
       public int startOffset() throws IOException {
         return postings.startOffset();
       }

       @Override
       public int endOffset() throws IOException {
         return postings.endOffset();
       }

       @Override
       public MatchesIterator getSubMatches() {
         return null;
       }

       @Override
       public Query getQuery() {
         return termQuery;
       }
     };
   }

   @Override
   public int minExtent() {
     return 1;
   }

   @Override
   public Collection<IntervalsSource> pullUpDisjunctions() {
     return Collections.singleton(this);
   }

   @Override
   public int hashCode() {
     return Objects.hash(term);
   }

   @Override
   public boolean equals(Object o) {
     if (this == o) {
       return true;
     }
     if (o == null || o.getClass() != getClass()) {
       return false;
     }
     return Objects.equals(term, ((TermIntervalsSource) o).term);
   }

   @Override
   public String toString() {
     return term.utf8ToString();
   }

   @Override
   public void visit(String field, QueryVisitor visitor) {
     final IntervalQuery owner = new IntervalQuery(field, this);
     final Term indexedTerm = new Term(field, term);
     visitor.consumeTerms(owner, indexedTerm);
   }

   /**
    * Estimated average number of simple operations spent per document on the
    * initial seek and buffer refill when reading the positions of a term.
    * See also {@link Lucene84PostingsReader.EverythingEnum#nextPosition()}.
    * <p>
    * Aside: rather than a constant, this could depend among others on
    * {@link Lucene84PostingsFormat#BLOCK_SIZE},
    * {@link TermsEnum#docFreq()},
    * {@link TermsEnum#totalTermFreq()},
    * {@link DocIdSetIterator#cost()} (expected number of matching docs),
    * {@link LeafReader#maxDoc()} (total number of docs in the segment),
    * and the seek time and block size of the device storing the index.
    */
   private static final int TERM_POSNS_SEEK_OPS_PER_DOC = 128;

   /**
    * Number of simple operations in
    * {@link Lucene84PostingsReader.EverythingEnum#nextPosition()}
    * when no seek or buffer refill is done.
    */
   private static final int TERM_OPS_PER_POS = 7;

   /**
    * Estimates the cost, in simple operations, of processing the occurrences
    * of a term in a document that contains it: the fixed per-document seek
    * cost plus the per-position cost times the average number of occurrences
    * per matching document.
    * This is for use by {@link TwoPhaseIterator#matchCost} implementations.
    *
    * @param termsEnum The term is the term at which this TermsEnum is positioned.
    */
   static float termPositionsCost(TermsEnum termsEnum) throws IOException {
     // TODO: When intervals move to core, refactor to use the copy of this in PhraseQuery
     final int matchingDocs = termsEnum.docFreq();
     assert matchingDocs > 0;
     final float avgOccurrencesPerDoc = termsEnum.totalTermFreq() / (float) matchingDocs;
     return TERM_POSNS_SEEK_OPS_PER_DOC + avgOccurrencesPerDoc * TERM_OPS_PER_POS;
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	package org.apache.lucene.queries.intervals;

	import java.io.IOException;
	import java.util.Collection;
	import java.util.Collections;
	import java.util.Objects;

	import org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat;
	import org.apache.lucene.codecs.lucene84.Lucene84PostingsReader;
	import org.apache.lucene.index.LeafReader;
	import org.apache.lucene.index.LeafReaderContext;
	import org.apache.lucene.index.PostingsEnum;
	import org.apache.lucene.index.Term;
	import org.apache.lucene.index.Terms;
	import org.apache.lucene.index.TermsEnum;
	import org.apache.lucene.search.DocIdSetIterator;
	import org.apache.lucene.search.MatchesIterator;
	import org.apache.lucene.search.Query;
	import org.apache.lucene.search.QueryVisitor;
	import org.apache.lucene.search.TermQuery;
	import org.apache.lucene.search.TwoPhaseIterator;
	import org.apache.lucene.util.BytesRef;

	class TermIntervalsSource extends IntervalsSource {

	final BytesRef term;

	TermIntervalsSource(BytesRef term) {
	this.term = term;
	}

	@Override
	public IntervalIterator intervals(String field, LeafReaderContext ctx) throws IOException {
	Terms terms = ctx.reader().terms(field);
	if (terms == null)
	return null;
	if (terms.hasPositions() == false) {
	throw new IllegalArgumentException("Cannot create an IntervalIterator over field " + field + " because it has no indexed positions");
	}
	TermsEnum te = terms.iterator();
	if (te.seekExact(term) == false) {
	return null;
	}
	return intervals(term, te);
	}

	static IntervalIterator intervals(BytesRef term, TermsEnum te) throws IOException {
	PostingsEnum pe = te.postings(null, PostingsEnum.POSITIONS);
	float cost = termPositionsCost(te);
	return new IntervalIterator() {

	@Override
	public int docID() {
	return pe.docID();
	}

	@Override
	public int nextDoc() throws IOException {
	int doc = pe.nextDoc();
	reset();
	return doc;
	}

	@Override
	public int advance(int target) throws IOException {
	int doc = pe.advance(target);
	reset();
	return doc;
	}

	@Override
	public long cost() {
	return pe.cost();
	}

	int pos = -1, upto;

	@Override
	public int start() {
	return pos;
	}

	@Override
	public int end() {
	return pos;
	}

	@Override
	public int gaps() {
	return 0;
	}

	@Override
	public int nextInterval() throws IOException {
	if (upto <= 0)
	return pos = NO_MORE_INTERVALS;
	upto--;
	return pos = pe.nextPosition();
	}

	@Override
	public float matchCost() {
	return cost;
	}

	private void reset() throws IOException {
	if (pe.docID() == NO_MORE_DOCS) {
	upto = -1;
	pos = NO_MORE_INTERVALS;
	}
	else {
	upto = pe.freq();
	pos = -1;
	}
	}

	@Override
	public String toString() {
	return term.utf8ToString() + ":" + super.toString();
	}
	};
	}

	@Override
	public IntervalMatchesIterator matches(String field, LeafReaderContext ctx, int doc) throws IOException {
	Terms terms = ctx.reader().terms(field);
	if (terms == null)
	return null;
	if (terms.hasPositions() == false) {
	throw new IllegalArgumentException("Cannot create an IntervalIterator over field " + field + " because it has no indexed positions");
	}
	TermsEnum te = terms.iterator();
	if (te.seekExact(term) == false) {
	return null;
	}
	return matches(te, doc, field);
	}

	static IntervalMatchesIterator matches(TermsEnum te, int doc, String field) throws IOException {
	TermQuery query = new TermQuery(new Term(field, te.term()));
	PostingsEnum pe = te.postings(null, PostingsEnum.OFFSETS);
	if (pe.advance(doc) != doc) {
	return null;
	}
	return new IntervalMatchesIterator() {

	@Override
	public int gaps() {
	return 0;
	}

	@Override
	public int width() {
	return 1;
	}

	int upto = pe.freq();
	int pos = -1;

	@Override
	public boolean next() throws IOException {
	if (upto <= 0) {
	pos = IntervalIterator.NO_MORE_INTERVALS;
	return false;
	}
	upto--;
	pos = pe.nextPosition();
	return true;
	}

	@Override
	public int startPosition() {
	return pos;
	}

	@Override
	public int endPosition() {
	return pos;
	}

	@Override
	public int startOffset() throws IOException {
	return pe.startOffset();
	}

	@Override
	public int endOffset() throws IOException {
	return pe.endOffset();
	}

	@Override
	public MatchesIterator getSubMatches() {
	return null;
	}

	@Override
	public Query getQuery() {
	return query;
	}
	};
	}

	@Override
	public int minExtent() {
	return 1;
	}

	@Override
	public Collection<IntervalsSource> pullUpDisjunctions() {
	return Collections.singleton(this);
	}

	@Override
	public int hashCode() {
	return Objects.hash(term);
	}

	@Override
	public boolean equals(Object o) {
	if (this == o) return true;
	if (o == null \|\| getClass() != o.getClass()) return false;
	TermIntervalsSource that = (TermIntervalsSource) o;
	return Objects.equals(term, that.term);
	}

	@Override
	public String toString() {
	return term.utf8ToString();
	}

	@Override
	public void visit(String field, QueryVisitor visitor) {
	visitor.consumeTerms(new IntervalQuery(field, this), new Term(field, term));
	}

	/** A guess of
	* the average number of simple operations for the initial seek and buffer refill
	* per document for the positions of a term.
	* See also {@link Lucene84PostingsReader.EverythingEnum#nextPosition()}.
	* <p>
	* Aside: Instead of being constant this could depend among others on
	* {@link Lucene84PostingsFormat#BLOCK_SIZE},
	* {@link TermsEnum#docFreq()},
	* {@link TermsEnum#totalTermFreq()},
	* {@link DocIdSetIterator#cost()} (expected number of matching docs),
	* {@link LeafReader#maxDoc()} (total number of docs in the segment),
	* and the seek time and block size of the device storing the index.
	*/
	private static final int TERM_POSNS_SEEK_OPS_PER_DOC = 128;

	/** Number of simple operations in {@link Lucene84PostingsReader.EverythingEnum#nextPosition()}
	* when no seek or buffer refill is done.
	*/
	private static final int TERM_OPS_PER_POS = 7;

	/** Returns an expected cost in simple operations
	* of processing the occurrences of a term
	* in a document that contains the term.
	* This is for use by {@link TwoPhaseIterator#matchCost} implementations.
	* @param termsEnum The term is the term at which this TermsEnum is positioned.
	*/
	static float termPositionsCost(TermsEnum termsEnum) throws IOException {
	// TODO: When intervals move to core, refactor to use the copy of this in PhraseQuery
	int docFreq = termsEnum.docFreq();
	assert docFreq > 0;
	long totalTermFreq = termsEnum.totalTermFreq();
	float expOccurrencesInMatchingDoc = totalTermFreq / (float) docFreq;
	return TERM_POSNS_SEEK_OPS_PER_DOC + expOccurrencesInMatchingDoc * TERM_OPS_PER_POS;
	}
	}