lucene/core/src/java/org/apache/lucene/search/ExactPhraseScorer.java - lucene-solr - Git at Google

 package org.apache.lucene.search;

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 import java.io.IOException;
 import java.util.Arrays;

 import org.apache.lucene.index.*;
 import org.apache.lucene.search.similarities.Similarity;

 final class ExactPhraseScorer extends Scorer {
   private final int endMinus1;

   private final static int CHUNK = 4096;

   private int gen;
   private final int[] counts = new int[CHUNK];
   private final int[] gens = new int[CHUNK];

   boolean noDocs;
   private final long cost;

   private final static class ChunkState {
     final DocsAndPositionsEnum posEnum;
     final int offset;
     final boolean useAdvance;
     int posUpto;
     int posLimit;
     int pos;
     int lastPos;

     public ChunkState(DocsAndPositionsEnum posEnum, int offset, boolean useAdvance) {
       this.posEnum = posEnum;
       this.offset = offset;
       this.useAdvance = useAdvance;
     }
   }

   private final ChunkState[] chunkStates;

   private int docID = -1;
   private int freq;

   private final Similarity.SimScorer docScorer;

   ExactPhraseScorer(Weight weight, PhraseQuery.PostingsAndFreq[] postings,
                     Similarity.SimScorer docScorer) throws IOException {
     super(weight);
     this.docScorer = docScorer;

     chunkStates = new ChunkState[postings.length];

     endMinus1 = postings.length-1;

     // min(cost)
     cost = postings[0].postings.cost();

     for(int i=0;i<postings.length;i++) {

       // Coarse optimization: advance(target) is fairly
       // costly, so, if the relative freq of the 2nd
       // rarest term is not that much (> 1/5th) rarer than
       // the first term, then we just use .nextDoc() when
       // ANDing.  This buys ~15% gain for phrases where
       // freq of rarest 2 terms is close:
       final boolean useAdvance = postings[i].docFreq > 5*postings[0].docFreq;
       chunkStates[i] = new ChunkState(postings[i].postings, -postings[i].position, useAdvance);
       if (i > 0 && postings[i].postings.nextDoc() == DocIdSetIterator.NO_MORE_DOCS) {
         noDocs = true;
         return;
       }
     }
   }

   @Override
   public int nextDoc() throws IOException {
     while(true) {

       // first (rarest) term
       final int doc = chunkStates[0].posEnum.nextDoc();
       if (doc == DocIdSetIterator.NO_MORE_DOCS) {
         docID = doc;
         return doc;
       }

       // not-first terms
       int i = 1;
       while(i < chunkStates.length) {
         final ChunkState cs = chunkStates[i];
         int doc2 = cs.posEnum.docID();
         if (cs.useAdvance) {
           if (doc2 < doc) {
             doc2 = cs.posEnum.advance(doc);
           }
         } else {
           int iter = 0;
           while(doc2 < doc) {
             // safety net -- fallback to .advance if we've
             // done too many .nextDocs
             if (++iter == 50) {
               doc2 = cs.posEnum.advance(doc);
               break;
             } else {
               doc2 = cs.posEnum.nextDoc();
             }
           }
         }
         if (doc2 > doc) {
           break;
         }
         i++;
       }

       if (i == chunkStates.length) {
         // this doc has all the terms -- now test whether
         // phrase occurs
         docID = doc;

         freq = phraseFreq();
         if (freq != 0) {
           return docID;
         }
       }
     }
   }

   @Override
   public int advance(int target) throws IOException {

     // first term
     int doc = chunkStates[0].posEnum.advance(target);
     if (doc == DocIdSetIterator.NO_MORE_DOCS) {
       docID = DocIdSetIterator.NO_MORE_DOCS;
       return doc;
     }

     while(true) {

       // not-first terms
       int i = 1;
       while(i < chunkStates.length) {
         int doc2 = chunkStates[i].posEnum.docID();
         if (doc2 < doc) {
           doc2 = chunkStates[i].posEnum.advance(doc);
         }
         if (doc2 > doc) {
           break;
         }
         i++;
       }

       if (i == chunkStates.length) {
         // this doc has all the terms -- now test whether
         // phrase occurs
         docID = doc;
         freq = phraseFreq();
         if (freq != 0) {
           return docID;
         }
       }

       doc = chunkStates[0].posEnum.nextDoc();
       if (doc == DocIdSetIterator.NO_MORE_DOCS) {
         docID = doc;
         return doc;
       }
     }
   }

   @Override
   public String toString() {
     return "ExactPhraseScorer(" + weight + ")";
   }

   @Override
   public int freq() {
     return freq;
   }

   @Override
   public int docID() {
     return docID;
   }

   @Override
   public float score() {
     return docScorer.score(docID, freq);
   }

   private int phraseFreq() throws IOException {

     freq = 0;

     // init chunks
     for(int i=0;i<chunkStates.length;i++) {
       final ChunkState cs = chunkStates[i];
       cs.posLimit = cs.posEnum.freq();
       cs.pos = cs.offset + cs.posEnum.nextPosition();
       cs.posUpto = 1;
       cs.lastPos = -1;
     }

     int chunkStart = 0;
     int chunkEnd = CHUNK;

     // process chunk by chunk
     boolean end = false;

     // TODO: we could fold in chunkStart into offset and
     // save one subtract per pos incr

     while(!end) {

       gen++;

       if (gen == 0) {
         // wraparound
         Arrays.fill(gens, 0);
         gen++;
       }

       // first term
       {
         final ChunkState cs = chunkStates[0];
         while(cs.pos < chunkEnd) {
           if (cs.pos > cs.lastPos) {
             cs.lastPos = cs.pos;
             final int posIndex = cs.pos - chunkStart;
             counts[posIndex] = 1;
             assert gens[posIndex] != gen;
             gens[posIndex] = gen;
           }

           if (cs.posUpto == cs.posLimit) {
             end = true;
             break;
           }
           cs.posUpto++;
           cs.pos = cs.offset + cs.posEnum.nextPosition();
         }
       }

       // middle terms
       boolean any = true;
       for(int t=1;t<endMinus1;t++) {
         final ChunkState cs = chunkStates[t];
         any = false;
         while(cs.pos < chunkEnd) {
           if (cs.pos > cs.lastPos) {
             cs.lastPos = cs.pos;
             final int posIndex = cs.pos - chunkStart;
             if (posIndex >= 0 && gens[posIndex] == gen && counts[posIndex] == t) {
               // viable
               counts[posIndex]++;
               any = true;
             }
           }

           if (cs.posUpto == cs.posLimit) {
             end = true;
             break;
           }
           cs.posUpto++;
           cs.pos = cs.offset + cs.posEnum.nextPosition();
         }

         if (!any) {
           break;
         }
       }

       if (!any) {
         // petered out for this chunk
         chunkStart += CHUNK;
         chunkEnd += CHUNK;
         continue;
       }

       // last term

       {
         final ChunkState cs = chunkStates[endMinus1];
         while(cs.pos < chunkEnd) {
           if (cs.pos > cs.lastPos) {
             cs.lastPos = cs.pos;
             final int posIndex = cs.pos - chunkStart;
             if (posIndex >= 0 && gens[posIndex] == gen && counts[posIndex] == endMinus1) {
               freq++;
             }
           }

           if (cs.posUpto == cs.posLimit) {
             end = true;
             break;
           }
           cs.posUpto++;
           cs.pos = cs.offset + cs.posEnum.nextPosition();
         }
       }

       chunkStart += CHUNK;
       chunkEnd += CHUNK;
     }

     return freq;
   }

   @Override
   public long cost() {
     return cost;
   }
 }
	package org.apache.lucene.search;

	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	import java.io.IOException;
	import java.util.Arrays;

	import org.apache.lucene.index.*;
	import org.apache.lucene.search.similarities.Similarity;

	final class ExactPhraseScorer extends Scorer {
	private final int endMinus1;

	private final static int CHUNK = 4096;

	private int gen;
	private final int[] counts = new int[CHUNK];
	private final int[] gens = new int[CHUNK];

	boolean noDocs;
	private final long cost;

	private final static class ChunkState {
	final DocsAndPositionsEnum posEnum;
	final int offset;
	final boolean useAdvance;
	int posUpto;
	int posLimit;
	int pos;
	int lastPos;

	public ChunkState(DocsAndPositionsEnum posEnum, int offset, boolean useAdvance) {
	this.posEnum = posEnum;
	this.offset = offset;
	this.useAdvance = useAdvance;
	}
	}

	private final ChunkState[] chunkStates;

	private int docID = -1;
	private int freq;

	private final Similarity.SimScorer docScorer;

	ExactPhraseScorer(Weight weight, PhraseQuery.PostingsAndFreq[] postings,
	Similarity.SimScorer docScorer) throws IOException {
	super(weight);
	this.docScorer = docScorer;

	chunkStates = new ChunkState[postings.length];

	endMinus1 = postings.length-1;

	// min(cost)
	cost = postings[0].postings.cost();

	for(int i=0;i<postings.length;i++) {

	// Coarse optimization: advance(target) is fairly
	// costly, so, if the relative freq of the 2nd
	// rarest term is not that much (> 1/5th) rarer than
	// the first term, then we just use .nextDoc() when
	// ANDing. This buys ~15% gain for phrases where
	// freq of rarest 2 terms is close:
	final boolean useAdvance = postings[i].docFreq > 5*postings[0].docFreq;
	chunkStates[i] = new ChunkState(postings[i].postings, -postings[i].position, useAdvance);
	if (i > 0 && postings[i].postings.nextDoc() == DocIdSetIterator.NO_MORE_DOCS) {
	noDocs = true;
	return;
	}
	}
	}

	@Override
	public int nextDoc() throws IOException {
	while(true) {

	// first (rarest) term
	final int doc = chunkStates[0].posEnum.nextDoc();
	if (doc == DocIdSetIterator.NO_MORE_DOCS) {
	docID = doc;
	return doc;
	}

	// not-first terms
	int i = 1;
	while(i < chunkStates.length) {
	final ChunkState cs = chunkStates[i];
	int doc2 = cs.posEnum.docID();
	if (cs.useAdvance) {
	if (doc2 < doc) {
	doc2 = cs.posEnum.advance(doc);
	}
	} else {
	int iter = 0;
	while(doc2 < doc) {
	// safety net -- fallback to .advance if we've
	// done too many .nextDocs
	if (++iter == 50) {
	doc2 = cs.posEnum.advance(doc);
	break;
	} else {
	doc2 = cs.posEnum.nextDoc();
	}
	}
	}
	if (doc2 > doc) {
	break;
	}
	i++;
	}

	if (i == chunkStates.length) {
	// this doc has all the terms -- now test whether
	// phrase occurs
	docID = doc;

	freq = phraseFreq();
	if (freq != 0) {
	return docID;
	}
	}
	}
	}

	@Override
	public int advance(int target) throws IOException {

	// first term
	int doc = chunkStates[0].posEnum.advance(target);
	if (doc == DocIdSetIterator.NO_MORE_DOCS) {
	docID = DocIdSetIterator.NO_MORE_DOCS;
	return doc;
	}

	while(true) {

	// not-first terms
	int i = 1;
	while(i < chunkStates.length) {
	int doc2 = chunkStates[i].posEnum.docID();
	if (doc2 < doc) {
	doc2 = chunkStates[i].posEnum.advance(doc);
	}
	if (doc2 > doc) {
	break;
	}
	i++;
	}

	if (i == chunkStates.length) {
	// this doc has all the terms -- now test whether
	// phrase occurs
	docID = doc;
	freq = phraseFreq();
	if (freq != 0) {
	return docID;
	}
	}

	doc = chunkStates[0].posEnum.nextDoc();
	if (doc == DocIdSetIterator.NO_MORE_DOCS) {
	docID = doc;
	return doc;
	}
	}
	}

	@Override
	public String toString() {
	return "ExactPhraseScorer(" + weight + ")";
	}

	@Override
	public int freq() {
	return freq;
	}

	@Override
	public int docID() {
	return docID;
	}

	@Override
	public float score() {
	return docScorer.score(docID, freq);
	}

	private int phraseFreq() throws IOException {

	freq = 0;

	// init chunks
	for(int i=0;i<chunkStates.length;i++) {
	final ChunkState cs = chunkStates[i];
	cs.posLimit = cs.posEnum.freq();
	cs.pos = cs.offset + cs.posEnum.nextPosition();
	cs.posUpto = 1;
	cs.lastPos = -1;
	}

	int chunkStart = 0;
	int chunkEnd = CHUNK;

	// process chunk by chunk
	boolean end = false;

	// TODO: we could fold in chunkStart into offset and
	// save one subtract per pos incr

	while(!end) {

	gen++;

	if (gen == 0) {
	// wraparound
	Arrays.fill(gens, 0);
	gen++;
	}

	// first term
	{
	final ChunkState cs = chunkStates[0];
	while(cs.pos < chunkEnd) {
	if (cs.pos > cs.lastPos) {
	cs.lastPos = cs.pos;
	final int posIndex = cs.pos - chunkStart;
	counts[posIndex] = 1;
	assert gens[posIndex] != gen;
	gens[posIndex] = gen;
	}

	if (cs.posUpto == cs.posLimit) {
	end = true;
	break;
	}
	cs.posUpto++;
	cs.pos = cs.offset + cs.posEnum.nextPosition();
	}
	}

	// middle terms
	boolean any = true;
	for(int t=1;t<endMinus1;t++) {
	final ChunkState cs = chunkStates[t];
	any = false;
	while(cs.pos < chunkEnd) {
	if (cs.pos > cs.lastPos) {
	cs.lastPos = cs.pos;
	final int posIndex = cs.pos - chunkStart;
	if (posIndex >= 0 && gens[posIndex] == gen && counts[posIndex] == t) {
	// viable
	counts[posIndex]++;
	any = true;
	}
	}

	if (cs.posUpto == cs.posLimit) {
	end = true;
	break;
	}
	cs.posUpto++;
	cs.pos = cs.offset + cs.posEnum.nextPosition();
	}

	if (!any) {
	break;
	}
	}

	if (!any) {
	// petered out for this chunk
	chunkStart += CHUNK;
	chunkEnd += CHUNK;
	continue;
	}

	// last term

	{
	final ChunkState cs = chunkStates[endMinus1];
	while(cs.pos < chunkEnd) {
	if (cs.pos > cs.lastPos) {
	cs.lastPos = cs.pos;
	final int posIndex = cs.pos - chunkStart;
	if (posIndex >= 0 && gens[posIndex] == gen && counts[posIndex] == endMinus1) {
	freq++;
	}
	}

	if (cs.posUpto == cs.posLimit) {
	end = true;
	break;
	}
	cs.posUpto++;
	cs.pos = cs.offset + cs.posEnum.nextPosition();
	}
	}

	chunkStart += CHUNK;
	chunkEnd += CHUNK;
	}

	return freq;
	}

	@Override
	public long cost() {
	return cost;
	}
	}