lucene/sandbox/src/java/org/apache/lucene/search/TermAutomatonScorer.java - lucene-solr - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.lucene.search;

 import java.io.IOException;
 import java.util.Map;

 import org.apache.lucene.search.TermAutomatonQuery.EnumAndScorer;
 import org.apache.lucene.search.TermAutomatonQuery.TermAutomatonWeight;
 import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.PriorityQueue;
 import org.apache.lucene.util.RamUsageEstimator;
 import org.apache.lucene.util.automaton.Automaton;
 import org.apache.lucene.util.automaton.RunAutomaton;

 // TODO: add two-phase and needsScores support. maybe use conjunctionDISI internally?
 class TermAutomatonScorer extends Scorer {
   private final EnumAndScorer[] subs;
   private final EnumAndScorer[] subsOnDoc;
   private final PriorityQueue<EnumAndScorer> docIDQueue;
   private final PriorityQueue<EnumAndScorer> posQueue;
   private final RunAutomaton runAutomaton;
   private final Map<Integer,BytesRef> idToTerm;

   // We reuse this array to check for matches starting from an initial
   // position; we increase posShift every time we move to a new possible
   // start:
   private PosState[] positions;
   int posShift;

   // This is -1 if wildcard (null) terms were not used, else it's the id
   // of the wildcard term:
   private final int anyTermID;
   private final LeafSimScorer docScorer;

   private int numSubsOnDoc;

   private final long cost;

   private int docID = -1;
   private int freq;

   public TermAutomatonScorer(TermAutomatonWeight weight, EnumAndScorer[] subs, int anyTermID, Map<Integer,BytesRef> idToTerm, LeafSimScorer docScorer) throws IOException {
     super(weight);
     //System.out.println("  automaton:\n" + weight.automaton.toDot());
     this.runAutomaton = new TermRunAutomaton(weight.automaton, subs.length);
     this.docScorer = docScorer;
     this.idToTerm = idToTerm;
     this.subs = subs;
     this.docIDQueue = new DocIDQueue(subs.length);
     this.posQueue = new PositionQueue(subs.length);
     this.anyTermID = anyTermID;
     this.subsOnDoc = new EnumAndScorer[subs.length];
     this.positions = new PosState[4];
     for(int i=0;i<this.positions.length;i++) {
       this.positions[i] = new PosState();
     }
     long cost = 0;

     // Init docIDQueue:
     for(EnumAndScorer sub : subs) {
       if (sub != null) {
         cost += sub.posEnum.cost();
         subsOnDoc[numSubsOnDoc++] = sub;
       }
     }
     this.cost = cost;
   }

   /** Sorts by docID so we can quickly pull out all scorers that are on
    *  the same (lowest) docID. */
   private static class DocIDQueue extends PriorityQueue<EnumAndScorer> {
     public DocIDQueue(int maxSize) {
       super(maxSize);
     }

     @Override
     protected boolean lessThan(EnumAndScorer a, EnumAndScorer b) {
       return a.posEnum.docID() < b.posEnum.docID();
     }
   }

   /** Sorts by position so we can visit all scorers on one doc, by
    *  position. */
   private static class PositionQueue extends PriorityQueue<EnumAndScorer> {
     public PositionQueue(int maxSize) {
       super(maxSize);
     }

     @Override
     protected boolean lessThan(EnumAndScorer a, EnumAndScorer b) {
       return a.pos < b.pos;
     }
   }

   /** Pops all enums positioned on the current (minimum) doc */
   private void popCurrentDoc() {
     assert numSubsOnDoc == 0;
     assert docIDQueue.size() > 0;
     subsOnDoc[numSubsOnDoc++] = docIDQueue.pop();
     docID = subsOnDoc[0].posEnum.docID();
     while (docIDQueue.size() > 0 && docIDQueue.top().posEnum.docID() == docID) {
       subsOnDoc[numSubsOnDoc++] = docIDQueue.pop();
     }
   }

   /** Pushes all previously pop'd enums back into the docIDQueue */
   private void pushCurrentDoc() {
     for(int i=0;i<numSubsOnDoc;i++) {
       docIDQueue.add(subsOnDoc[i]);
     }
     numSubsOnDoc = 0;
   }

   @Override
   public DocIdSetIterator iterator() {
     return new DocIdSetIterator() {
       @Override
       public int docID() {
         return docID;
       }

       @Override
       public long cost() {
         return cost;
       }

       @Override
       public int nextDoc() throws IOException {
         // we only need to advance docs that are positioned since all docs in the
         // pq are guaranteed to be beyond the current doc already
         for(int i=0;i<numSubsOnDoc;i++) {
           EnumAndScorer sub = subsOnDoc[i];
           if (sub.posEnum.nextDoc() != NO_MORE_DOCS) {
             sub.posLeft = sub.posEnum.freq()-1;
             sub.pos = sub.posEnum.nextPosition();
           }
         }
         pushCurrentDoc();
         return doNext();
       }

       @Override
       public int advance(int target) throws IOException {
         // Both positioned docs and docs in the pq might be behind target

         // 1. Advance the PQ
         if (docIDQueue.size() > 0) {
           EnumAndScorer top = docIDQueue.top();
           while (top.posEnum.docID() < target) {
             if (top.posEnum.advance(target) != NO_MORE_DOCS) {
               top.posLeft = top.posEnum.freq()-1;
               top.pos = top.posEnum.nextPosition();
             }
             top = docIDQueue.updateTop();
           }
         }

         // 2. Advance subsOnDoc
         for(int i=0;i<numSubsOnDoc;i++) {
           EnumAndScorer sub = subsOnDoc[i];
           if (sub.posEnum.advance(target) != NO_MORE_DOCS) {
             sub.posLeft = sub.posEnum.freq()-1;
             sub.pos = sub.posEnum.nextPosition();
           }
         }
         pushCurrentDoc();
         return doNext();
       }

       private int doNext() throws IOException {
         assert numSubsOnDoc == 0;
         assert docIDQueue.top().posEnum.docID() > docID;
         while (true) {
           //System.out.println("  doNext: cycle");
           popCurrentDoc();
           //System.out.println("    docID=" + docID);
           if (docID == NO_MORE_DOCS) {
             return docID;
           }
           countMatches();
           if (freq > 0) {
             return docID;
           }
           for(int i=0;i<numSubsOnDoc;i++) {
             EnumAndScorer sub = subsOnDoc[i];
             if (sub.posEnum.nextDoc() != NO_MORE_DOCS) {
               sub.posLeft = sub.posEnum.freq()-1;
               sub.pos = sub.posEnum.nextPosition();
             }
           }
           pushCurrentDoc();
         }
       }
     };
   }

   private PosState getPosition(int pos) {
     return positions[pos-posShift];
   }

   private void shift(int pos) {
     int limit = pos-posShift;
     for(int i=0;i<limit;i++) {
       positions[i].count = 0;
     }
     posShift = pos;
   }

   private void countMatches() throws IOException {
     freq = 0;
     for(int i=0;i<numSubsOnDoc;i++) {
       posQueue.add(subsOnDoc[i]);
     }
     // System.out.println("\ncountMatches: " + numSubsOnDoc + " terms in doc=" + docID + " anyTermID=" + anyTermID + " id=" + reader.document(docID).get("id"));
     // System.out.println("\ncountMatches: " + numSubsOnDoc + " terms in doc=" + docID + " anyTermID=" + anyTermID);

     int lastPos = -1;

     posShift = -1;

     while (posQueue.size() != 0) {
       EnumAndScorer sub = posQueue.pop();

       // This is a graph intersection, and pos is the state this token
       // leaves from.  Until index stores posLength (which we could
       // stuff into a payload using a simple TokenFilter), this token
       // always transitions from state=pos to state=pos+1:
       final int pos = sub.pos;

       if (posShift == -1) {
         posShift = pos;
       }

       if (pos+1-posShift >= positions.length) {
         PosState[] newPositions = new PosState[ArrayUtil.oversize(pos+1-posShift, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
         System.arraycopy(positions, 0, newPositions, 0, positions.length);
         for(int i=positions.length;i<newPositions.length;i++) {
           newPositions[i] = new PosState();
         }
         positions = newPositions;
       }

       // System.out.println("  term=" + idToTerm.get(sub.termID).utf8ToString() + " pos=" + pos + " (count=" + getPosition(pos).count + " lastPos=" + lastPos + ") posQueue.size=" + posQueue.size() + " posShift=" + posShift);

       PosState posState;
       PosState nextPosState;

       // Maybe advance ANY matches:
       if (lastPos != -1) {
         if (anyTermID != -1) {
           int startLastPos = lastPos;
           while (lastPos < pos) {
             posState = getPosition(lastPos);
             if (posState.count == 0 && lastPos > startLastPos) {
               // Petered out...
               lastPos = pos;
               break;
             }
             // System.out.println("  iter lastPos=" + lastPos + " count=" + posState.count);

             nextPosState = getPosition(lastPos+1);

             // Advance all states from lastPos -> pos, if they had an any arc:
             for(int i=0;i<posState.count;i++) {
               int state = runAutomaton.step(posState.states[i], anyTermID);
               if (state != -1) {
                 // System.out.println("    add pos=" + (lastPos+1) + " state=" + state);
                 nextPosState.add(state);
               }
             }

             lastPos++;
           }
         }
       }

       posState = getPosition(pos);
       nextPosState = getPosition(pos+1);

       // If there are no pending matches at neither this position or the
       // next position, then it's safe to shift back to positions[0]:
       if (posState.count == 0 && nextPosState.count == 0) {
         shift(pos);
         posState = getPosition(pos);
         nextPosState = getPosition(pos+1);
       }

       // Match current token:
       for(int i=0;i<posState.count;i++) {
         // System.out.println("    check cur state=" + posState.states[i]);
         int state = runAutomaton.step(posState.states[i], sub.termID);
         if (state != -1) {
           // System.out.println("      --> " + state);
           nextPosState.add(state);
           if (runAutomaton.isAccept(state)) {
             // System.out.println("      *** (1)");
             freq++;
           }
         }
       }

       // Also consider starting a new match from this position:
       int state = runAutomaton.step(0, sub.termID);
       if (state != -1) {
         // System.out.println("  add init state=" + state);
         nextPosState.add(state);
         if (runAutomaton.isAccept(state)) {
           // System.out.println("      *** (2)");
           freq++;
         }
       }

       if (sub.posLeft > 0) {
         // Put this sub back into the posQueue:
         sub.pos = sub.posEnum.nextPosition();
         sub.posLeft--;
         posQueue.add(sub);
       }

       lastPos = pos;
     }

     int limit = lastPos+1-posShift;
     // reset
     for(int i=0;i<=limit;i++) {
       positions[i].count = 0;
     }
   }

   @Override
   public String toString() {
     return "TermAutomatonScorer(" + weight + ")";
   }

   @Override
   public int docID() {
     return docID;
   }

   @Override
   public float score() throws IOException {
     // TODO: we could probably do better here, e.g. look @ freqs of actual terms involved in this doc and score differently
     return docScorer.score(docID, freq);
   }

   @Override
   public float getMaxScore(int upTo) throws IOException {
     return docScorer.getSimScorer().score(Float.MAX_VALUE, 1L);
   }

   static class TermRunAutomaton extends RunAutomaton {
     public TermRunAutomaton(Automaton a, int termCount) {
       super(a, termCount);
     }
   }

   private static class PosState {
     // Which automaton states we are in at this position
     int[] states = new int[2];

     // How many states
     int count;

     public void add(int state) {
       if (states.length == count) {
         states = ArrayUtil.grow(states);
       }
       states[count++] = state;
     }
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.lucene.search;

	import java.io.IOException;
	import java.util.Map;

	import org.apache.lucene.search.TermAutomatonQuery.EnumAndScorer;
	import org.apache.lucene.search.TermAutomatonQuery.TermAutomatonWeight;
	import org.apache.lucene.util.ArrayUtil;
	import org.apache.lucene.util.BytesRef;
	import org.apache.lucene.util.PriorityQueue;
	import org.apache.lucene.util.RamUsageEstimator;
	import org.apache.lucene.util.automaton.Automaton;
	import org.apache.lucene.util.automaton.RunAutomaton;

	// TODO: add two-phase and needsScores support. maybe use conjunctionDISI internally?
	class TermAutomatonScorer extends Scorer {
	private final EnumAndScorer[] subs;
	private final EnumAndScorer[] subsOnDoc;
	private final PriorityQueue<EnumAndScorer> docIDQueue;
	private final PriorityQueue<EnumAndScorer> posQueue;
	private final RunAutomaton runAutomaton;
	private final Map<Integer,BytesRef> idToTerm;

	// We reuse this array to check for matches starting from an initial
	// position; we increase posShift every time we move to a new possible
	// start:
	private PosState[] positions;
	int posShift;

	// This is -1 if wildcard (null) terms were not used, else it's the id
	// of the wildcard term:
	private final int anyTermID;
	private final LeafSimScorer docScorer;

	private int numSubsOnDoc;

	private final long cost;

	private int docID = -1;
	private int freq;

	public TermAutomatonScorer(TermAutomatonWeight weight, EnumAndScorer[] subs, int anyTermID, Map<Integer,BytesRef> idToTerm, LeafSimScorer docScorer) throws IOException {
	super(weight);
	//System.out.println(" automaton:\n" + weight.automaton.toDot());
	this.runAutomaton = new TermRunAutomaton(weight.automaton, subs.length);
	this.docScorer = docScorer;
	this.idToTerm = idToTerm;
	this.subs = subs;
	this.docIDQueue = new DocIDQueue(subs.length);
	this.posQueue = new PositionQueue(subs.length);
	this.anyTermID = anyTermID;
	this.subsOnDoc = new EnumAndScorer[subs.length];
	this.positions = new PosState[4];
	for(int i=0;i<this.positions.length;i++) {
	this.positions[i] = new PosState();
	}
	long cost = 0;

	// Init docIDQueue:
	for(EnumAndScorer sub : subs) {
	if (sub != null) {
	cost += sub.posEnum.cost();
	subsOnDoc[numSubsOnDoc++] = sub;
	}
	}
	this.cost = cost;
	}

	/** Sorts by docID so we can quickly pull out all scorers that are on
	* the same (lowest) docID. */
	private static class DocIDQueue extends PriorityQueue<EnumAndScorer> {
	public DocIDQueue(int maxSize) {
	super(maxSize);
	}

	@Override
	protected boolean lessThan(EnumAndScorer a, EnumAndScorer b) {
	return a.posEnum.docID() < b.posEnum.docID();
	}
	}

	/** Sorts by position so we can visit all scorers on one doc, by
	* position. */
	private static class PositionQueue extends PriorityQueue<EnumAndScorer> {
	public PositionQueue(int maxSize) {
	super(maxSize);
	}

	@Override
	protected boolean lessThan(EnumAndScorer a, EnumAndScorer b) {
	return a.pos < b.pos;
	}
	}

	/** Pops all enums positioned on the current (minimum) doc */
	private void popCurrentDoc() {
	assert numSubsOnDoc == 0;
	assert docIDQueue.size() > 0;
	subsOnDoc[numSubsOnDoc++] = docIDQueue.pop();
	docID = subsOnDoc[0].posEnum.docID();
	while (docIDQueue.size() > 0 && docIDQueue.top().posEnum.docID() == docID) {
	subsOnDoc[numSubsOnDoc++] = docIDQueue.pop();
	}
	}

	/** Pushes all previously pop'd enums back into the docIDQueue */
	private void pushCurrentDoc() {
	for(int i=0;i<numSubsOnDoc;i++) {
	docIDQueue.add(subsOnDoc[i]);
	}
	numSubsOnDoc = 0;
	}

	@Override
	public DocIdSetIterator iterator() {
	return new DocIdSetIterator() {
	@Override
	public int docID() {
	return docID;
	}

	@Override
	public long cost() {
	return cost;
	}

	@Override
	public int nextDoc() throws IOException {
	// we only need to advance docs that are positioned since all docs in the
	// pq are guaranteed to be beyond the current doc already
	for(int i=0;i<numSubsOnDoc;i++) {
	EnumAndScorer sub = subsOnDoc[i];
	if (sub.posEnum.nextDoc() != NO_MORE_DOCS) {
	sub.posLeft = sub.posEnum.freq()-1;
	sub.pos = sub.posEnum.nextPosition();
	}
	}
	pushCurrentDoc();
	return doNext();
	}

	@Override
	public int advance(int target) throws IOException {
	// Both positioned docs and docs in the pq might be behind target

	// 1. Advance the PQ
	if (docIDQueue.size() > 0) {
	EnumAndScorer top = docIDQueue.top();
	while (top.posEnum.docID() < target) {
	if (top.posEnum.advance(target) != NO_MORE_DOCS) {
	top.posLeft = top.posEnum.freq()-1;
	top.pos = top.posEnum.nextPosition();
	}
	top = docIDQueue.updateTop();
	}
	}

	// 2. Advance subsOnDoc
	for(int i=0;i<numSubsOnDoc;i++) {
	EnumAndScorer sub = subsOnDoc[i];
	if (sub.posEnum.advance(target) != NO_MORE_DOCS) {
	sub.posLeft = sub.posEnum.freq()-1;
	sub.pos = sub.posEnum.nextPosition();
	}
	}
	pushCurrentDoc();
	return doNext();
	}

	private int doNext() throws IOException {
	assert numSubsOnDoc == 0;
	assert docIDQueue.top().posEnum.docID() > docID;
	while (true) {
	//System.out.println(" doNext: cycle");
	popCurrentDoc();
	//System.out.println(" docID=" + docID);
	if (docID == NO_MORE_DOCS) {
	return docID;
	}
	countMatches();
	if (freq > 0) {
	return docID;
	}
	for(int i=0;i<numSubsOnDoc;i++) {
	EnumAndScorer sub = subsOnDoc[i];
	if (sub.posEnum.nextDoc() != NO_MORE_DOCS) {
	sub.posLeft = sub.posEnum.freq()-1;
	sub.pos = sub.posEnum.nextPosition();
	}
	}
	pushCurrentDoc();
	}
	}
	};
	}

	private PosState getPosition(int pos) {
	return positions[pos-posShift];
	}

	private void shift(int pos) {
	int limit = pos-posShift;
	for(int i=0;i<limit;i++) {
	positions[i].count = 0;
	}
	posShift = pos;
	}

	private void countMatches() throws IOException {
	freq = 0;
	for(int i=0;i<numSubsOnDoc;i++) {
	posQueue.add(subsOnDoc[i]);
	}
	// System.out.println("\ncountMatches: " + numSubsOnDoc + " terms in doc=" + docID + " anyTermID=" + anyTermID + " id=" + reader.document(docID).get("id"));
	// System.out.println("\ncountMatches: " + numSubsOnDoc + " terms in doc=" + docID + " anyTermID=" + anyTermID);

	int lastPos = -1;

	posShift = -1;

	while (posQueue.size() != 0) {
	EnumAndScorer sub = posQueue.pop();

	// This is a graph intersection, and pos is the state this token
	// leaves from. Until index stores posLength (which we could
	// stuff into a payload using a simple TokenFilter), this token
	// always transitions from state=pos to state=pos+1:
	final int pos = sub.pos;

	if (posShift == -1) {
	posShift = pos;
	}

	if (pos+1-posShift >= positions.length) {
	PosState[] newPositions = new PosState[ArrayUtil.oversize(pos+1-posShift, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
	System.arraycopy(positions, 0, newPositions, 0, positions.length);
	for(int i=positions.length;i<newPositions.length;i++) {
	newPositions[i] = new PosState();
	}
	positions = newPositions;
	}

	// System.out.println(" term=" + idToTerm.get(sub.termID).utf8ToString() + " pos=" + pos + " (count=" + getPosition(pos).count + " lastPos=" + lastPos + ") posQueue.size=" + posQueue.size() + " posShift=" + posShift);

	PosState posState;
	PosState nextPosState;

	// Maybe advance ANY matches:
	if (lastPos != -1) {
	if (anyTermID != -1) {
	int startLastPos = lastPos;
	while (lastPos < pos) {
	posState = getPosition(lastPos);
	if (posState.count == 0 && lastPos > startLastPos) {
	// Petered out...
	lastPos = pos;
	break;
	}
	// System.out.println(" iter lastPos=" + lastPos + " count=" + posState.count);

	nextPosState = getPosition(lastPos+1);

	// Advance all states from lastPos -> pos, if they had an any arc:
	for(int i=0;i<posState.count;i++) {
	int state = runAutomaton.step(posState.states[i], anyTermID);
	if (state != -1) {
	// System.out.println(" add pos=" + (lastPos+1) + " state=" + state);
	nextPosState.add(state);
	}
	}

	lastPos++;
	}
	}
	}

	posState = getPosition(pos);
	nextPosState = getPosition(pos+1);

	// If there are no pending matches at neither this position or the
	// next position, then it's safe to shift back to positions[0]:
	if (posState.count == 0 && nextPosState.count == 0) {
	shift(pos);
	posState = getPosition(pos);
	nextPosState = getPosition(pos+1);
	}

	// Match current token:
	for(int i=0;i<posState.count;i++) {
	// System.out.println(" check cur state=" + posState.states[i]);
	int state = runAutomaton.step(posState.states[i], sub.termID);
	if (state != -1) {
	// System.out.println(" --> " + state);
	nextPosState.add(state);
	if (runAutomaton.isAccept(state)) {
	// System.out.println(" *** (1)");
	freq++;
	}
	}
	}

	// Also consider starting a new match from this position:
	int state = runAutomaton.step(0, sub.termID);
	if (state != -1) {
	// System.out.println(" add init state=" + state);
	nextPosState.add(state);
	if (runAutomaton.isAccept(state)) {
	// System.out.println(" *** (2)");
	freq++;
	}
	}

	if (sub.posLeft > 0) {
	// Put this sub back into the posQueue:
	sub.pos = sub.posEnum.nextPosition();
	sub.posLeft--;
	posQueue.add(sub);
	}

	lastPos = pos;
	}

	int limit = lastPos+1-posShift;
	// reset
	for(int i=0;i<=limit;i++) {
	positions[i].count = 0;
	}
	}

	@Override
	public String toString() {
	return "TermAutomatonScorer(" + weight + ")";
	}

	@Override
	public int docID() {
	return docID;
	}

	@Override
	public float score() throws IOException {
	// TODO: we could probably do better here, e.g. look @ freqs of actual terms involved in this doc and score differently
	return docScorer.score(docID, freq);
	}

	@Override
	public float getMaxScore(int upTo) throws IOException {
	return docScorer.getSimScorer().score(Float.MAX_VALUE, 1L);
	}

	static class TermRunAutomaton extends RunAutomaton {
	public TermRunAutomaton(Automaton a, int termCount) {
	super(a, termCount);
	}
	}

	private static class PosState {
	// Which automaton states we are in at this position
	int[] states = new int[2];

	// How many states
	int count;

	public void add(int state) {
	if (states.length == count) {
	states = ArrayUtil.grow(states);
	}
	states[count++] = state;
	}
	}
	}