lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldTermStack.java - lucene-solr - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.lucene.search.vectorhighlight;
 import java.io.IOException;
 import java.util.Collections;
 import java.util.Iterator;
 import java.util.LinkedList;
 import java.util.Set;

 import org.apache.lucene.index.PostingsEnum;
 import org.apache.lucene.index.Fields;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.index.Terms;
 import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.CharsRefBuilder;

 /**
  * <code>FieldTermStack</code> is a stack that keeps query terms in the specified field
  * of the document to be highlighted.
  */
 public class FieldTermStack {

   private final String fieldName;

   // Matched terms, sorted by position once construction completes. Terms that share a
   // position are collapsed onto the first one and stay reachable via TermInfo#getNext().
   LinkedList<TermInfo> termList = new LinkedList<>();

   //public static void main( String[] args ) throws Exception {
   //  Analyzer analyzer = new WhitespaceAnalyzer(Version.LATEST);
   //  QueryParser parser = new QueryParser(Version.LATEST,  "f", analyzer );
   //  Query query = parser.parse( "a x:b" );
   //  FieldQuery fieldQuery = new FieldQuery( query, true, false );

   //  Directory dir = new RAMDirectory();
   //  IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(Version.LATEST, analyzer));
   //  Document doc = new Document();
   //  FieldType ft = new FieldType(TextField.TYPE_STORED);
   //  ft.setStoreTermVectors(true);
   //  ft.setStoreTermVectorOffsets(true);
   //  ft.setStoreTermVectorPositions(true);
   //  doc.add( new Field( "f", ft, "a a a b b c a b b c d e f" ) );
   //  doc.add( new Field( "f", ft, "b a b a f" ) );
   //  writer.addDocument( doc );
   //  writer.close();

   //  IndexReader reader = IndexReader.open(dir);
   //  new FieldTermStack( reader, 0, "f", fieldQuery );
   //  reader.close();
   //}

   /**
    * Builds the stack from the term vector of one field of one document.
    * <p>
    * Every term of the vector that is also contained in the query's term set for
    * <code>fieldName</code> contributes one {@link TermInfo} per occurrence. The stack is left
    * empty (a "null snippet") when the query has no term set for the field, when the document
    * or field has no term vector, when the vector has no positions, or when it has no offsets.
    *
    * @param reader IndexReader of the index
    * @param docId document id to be highlighted
    * @param fieldName field of the document to be highlighted
    * @param fieldQuery FieldQuery object
    * @throws IOException If there is a low-level I/O error
    */
   public FieldTermStack( IndexReader reader, int docId, String fieldName, final FieldQuery fieldQuery ) throws IOException {
     this.fieldName = fieldName;

     final Set<String> termSet = fieldQuery.getTermSet( fieldName );
     // just return to make null snippet if un-matched fieldName specified when fieldMatch == true
     if (termSet == null) {
       return;
     }

     final Fields vectors = reader.getTermVectors(docId);
     if (vectors == null) {
       // null snippet
       return;
     }

     final Terms vector = vectors.terms(fieldName);
     if (vector == null || vector.hasPositions() == false) {
       // null snippet
       return;
     }

     final CharsRefBuilder spare = new CharsRefBuilder();
     final TermsEnum termsEnum = vector.iterator();
     final int numDocs = reader.maxDoc();
     PostingsEnum dpEnum = null;
     BytesRef text;

     while ((text = termsEnum.next()) != null) {
       spare.copyUTF8Bytes(text);
       final String term = spare.toString();
       if (!termSet.contains(term)) {
         continue;
       }

       // Offsets are read below, so request them explicitly: per the PostingsEnum flag
       // contract OFFSETS is the flag for offsets (it also covers positions).
       dpEnum = termsEnum.postings(dpEnum, PostingsEnum.OFFSETS);
       dpEnum.nextDoc();

       // For weight look here: http://lucene.apache.org/core/3_6_0/api/core/org/apache/lucene/search/DefaultSimilarity.html
       final float weight = ( float ) ( Math.log( numDocs / ( double ) ( reader.docFreq( new Term(fieldName, text) ) + 1 ) ) + 1.0 );

       final int freq = dpEnum.freq();
       for (int i = 0; i < freq; i++) {
         final int pos = dpEnum.nextPosition();
         final int startOffset = dpEnum.startOffset();
         if (startOffset < 0) {
           return; // no offsets, null snippet
         }
         termList.add( new TermInfo( term, startOffset, dpEnum.endOffset(), pos, weight ) );
       }
     }

     // sort by position
     Collections.sort(termList);

     linkTermsAtSamePosition();
   }

   /**
    * Collapses entries of the position-sorted {@link #termList} that share a position: only the
    * first entry of each position stays in the list, the others are removed and chained to it
    * through {@link TermInfo#getNext()} as a circular list.
    */
   private void linkTermsAtSamePosition() {
     int currentPos = -1;
     TermInfo previous = null;
     TermInfo first = null;
     final Iterator<TermInfo> iterator = termList.iterator();
     while (iterator.hasNext()) {
       final TermInfo current = iterator.next();
       if (current.position == currentPos) {
         // duplicate position: append to the chain and drop it from the list
         assert previous != null;
         previous.setNext(current);
         previous = current;
         iterator.remove();
       } else {
         // new position: close the circle of the previous group before starting a new one
         if (previous != null) {
           previous.setNext(first);
         }
         previous = first = current;
         currentPos = current.position;
       }
     }
     // close the circle of the last group
     if (previous != null) {
       previous.setNext(first);
     }
   }

   /**
    * @return field name
    */
   public String getFieldName(){
     return fieldName;
   }

   /**
    * Removes and returns the head of the stack.
    *
    * @return the top TermInfo object of the stack, or <code>null</code> if the stack is empty
    */
   public TermInfo pop(){
     return termList.poll();
   }

   /**
    * @param termInfo the TermInfo object to be put on the top of the stack
    */
   public void push( TermInfo termInfo ){
     termList.push( termInfo );
   }

   /**
    * to know whether the stack is empty
    *
    * @return true if the stack is empty, false if not
    */
   public boolean isEmpty(){
     // termList is a non-final package-private field, so the null guard is kept
     return termList == null || termList.isEmpty();
   }

   /**
    * Single term with its position/offsets in the document and IDF weight.
    * It is Comparable but considers only position; <code>equals</code> and
    * <code>hashCode</code> likewise look at the position only.
    */
   public static class TermInfo implements Comparable<TermInfo>{

     private final String text;
     private final int startOffset;
     private final int endOffset;
     private final int position;

     // IDF-weight of this term
     private final float weight;

     // pointer to other TermInfo's at the same position.
     // this is a circular list, so with no syns, just points to itself
     private TermInfo next;

     /**
      * Creates a term occurrence that initially forms a one-element circular list.
      *
      * @param text the term text
      * @param startOffset start offset of this occurrence
      * @param endOffset end offset of this occurrence
      * @param position position of this occurrence
      * @param weight IDF-weight of the term
      */
     TermInfo(String text, int startOffset, int endOffset, int position, float weight){
       this.text = text;
       this.startOffset = startOffset;
       this.endOffset = endOffset;
       this.position = position;
       this.weight = weight;
       this.next = this;
     }

     void setNext(TermInfo next) { this.next = next; }

     /**
      * Returns the next TermInfo at this same position.
      * This is a circular list!
      */
     public TermInfo getNext() { return next; }

     /** @return the term text */
     public String getText(){ return text; }

     /** @return start offset of this occurrence */
     public int getStartOffset(){ return startOffset; }

     /** @return end offset of this occurrence */
     public int getEndOffset(){ return endOffset; }

     /** @return position of this occurrence */
     public int getPosition(){ return position; }

     /** @return IDF-weight of the term */
     public float getWeight(){ return weight; }

     @Override
     public String toString(){
       return text + '(' + startOffset + ',' + endOffset + ',' + position + ')';
     }

     /** Orders by position only. */
     @Override
     public int compareTo( TermInfo o ){
       // Integer.compare instead of subtraction: cannot overflow
       return Integer.compare( this.position, o.position );
     }

     @Override
     public int hashCode() {
       // same value as the original prime * 1 + position
       return 31 + position;
     }

     @Override
     public boolean equals(Object obj) {
       if (this == obj) {
         return true;
       }
       if (obj == null || getClass() != obj.getClass()) {
         return false;
       }
       return position == ((TermInfo) obj).position;
     }
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.lucene.search.vectorhighlight;
	import java.io.IOException;
	import java.util.Collections;
	import java.util.Iterator;
	import java.util.LinkedList;
	import java.util.Set;

	import org.apache.lucene.index.PostingsEnum;
	import org.apache.lucene.index.Fields;
	import org.apache.lucene.index.IndexReader;
	import org.apache.lucene.index.Term;
	import org.apache.lucene.index.Terms;
	import org.apache.lucene.index.TermsEnum;
	import org.apache.lucene.util.BytesRef;
	import org.apache.lucene.util.CharsRefBuilder;

	/**
	* <code>FieldTermStack</code> is a stack that keeps query terms in the specified field
	* of the document to be highlighted.
	*/
	public class FieldTermStack {

	private final String fieldName;
	LinkedList<TermInfo> termList = new LinkedList<>();

	//public static void main( String[] args ) throws Exception {
	// Analyzer analyzer = new WhitespaceAnalyzer(Version.LATEST);
	// QueryParser parser = new QueryParser(Version.LATEST, "f", analyzer );
	// Query query = parser.parse( "a x:b" );
	// FieldQuery fieldQuery = new FieldQuery( query, true, false );

	// Directory dir = new RAMDirectory();
	// IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(Version.LATEST, analyzer));
	// Document doc = new Document();
	// FieldType ft = new FieldType(TextField.TYPE_STORED);
	// ft.setStoreTermVectors(true);
	// ft.setStoreTermVectorOffsets(true);
	// ft.setStoreTermVectorPositions(true);
	// doc.add( new Field( "f", ft, "a a a b b c a b b c d e f" ) );
	// doc.add( new Field( "f", ft, "b a b a f" ) );
	// writer.addDocument( doc );
	// writer.close();

	// IndexReader reader = IndexReader.open(dir1);
	// new FieldTermStack( reader, 0, "f", fieldQuery );
	// reader.close();
	//}

	/**
	* a constructor.
	*
	* @param reader IndexReader of the index
	* @param docId document id to be highlighted
	* @param fieldName field of the document to be highlighted
	* @param fieldQuery FieldQuery object
	* @throws IOException If there is a low-level I/O error
	*/
	public FieldTermStack( IndexReader reader, int docId, String fieldName, final FieldQuery fieldQuery ) throws IOException {
	this.fieldName = fieldName;

	Set<String> termSet = fieldQuery.getTermSet( fieldName );
	// just return to make null snippet if un-matched fieldName specified when fieldMatch == true
	if( termSet == null ) return;

	final Fields vectors = reader.getTermVectors(docId);
	if (vectors == null) {
	// null snippet
	return;
	}

	final Terms vector = vectors.terms(fieldName);
	if (vector == null \|\| vector.hasPositions() == false) {
	// null snippet
	return;
	}

	final CharsRefBuilder spare = new CharsRefBuilder();
	final TermsEnum termsEnum = vector.iterator();
	PostingsEnum dpEnum = null;
	BytesRef text;

	int numDocs = reader.maxDoc();

	while ((text = termsEnum.next()) != null) {
	spare.copyUTF8Bytes(text);
	final String term = spare.toString();
	if (!termSet.contains(term)) {
	continue;
	}
	dpEnum = termsEnum.postings(dpEnum, PostingsEnum.POSITIONS);
	dpEnum.nextDoc();

	// For weight look here: http://lucene.apache.org/core/3_6_0/api/core/org/apache/lucene/search/DefaultSimilarity.html
	final float weight = ( float ) ( Math.log( numDocs / ( double ) ( reader.docFreq( new Term(fieldName, text) ) + 1 ) ) + 1.0 );

	final int freq = dpEnum.freq();

	for(int i = 0;i < freq;i++) {
	int pos = dpEnum.nextPosition();
	if (dpEnum.startOffset() < 0) {
	return; // no offsets, null snippet
	}
	termList.add( new TermInfo( term, dpEnum.startOffset(), dpEnum.endOffset(), pos, weight ) );
	}
	}

	// sort by position
	Collections.sort(termList);

	// now look for dups at the same position, linking them together
	int currentPos = -1;
	TermInfo previous = null;
	TermInfo first = null;
	Iterator<TermInfo> iterator = termList.iterator();
	while (iterator.hasNext()) {
	TermInfo current = iterator.next();
	if (current.position == currentPos) {
	assert previous != null;
	previous.setNext(current);
	previous = current;
	iterator.remove();
	} else {
	if (previous != null) {
	previous.setNext(first);
	}
	previous = first = current;
	currentPos = current.position;
	}
	}
	if (previous != null) {
	previous.setNext(first);
	}
	}

	/**
	* @return field name
	*/
	public String getFieldName(){
	return fieldName;
	}

	/**
	* @return the top TermInfo object of the stack
	*/
	public TermInfo pop(){
	return termList.poll();
	}

	/**
	* @param termInfo the TermInfo object to be put on the top of the stack
	*/
	public void push( TermInfo termInfo ){
	termList.push( termInfo );
	}

	/**
	* to know whether the stack is empty
	*
	* @return true if the stack is empty, false if not
	*/
	public boolean isEmpty(){
	return termList == null \|\| termList.size() == 0;
	}

	/**
	* Single term with its position/offsets in the document and IDF weight.
	* It is Comparable but considers only position.
	*/
	public static class TermInfo implements Comparable<TermInfo>{

	private final String text;
	private final int startOffset;
	private final int endOffset;
	private final int position;

	// IDF-weight of this term
	private final float weight;

	// pointer to other TermInfo's at the same position.
	// this is a circular list, so with no syns, just points to itself
	private TermInfo next;

	TermInfo(String text, int startOffset, int endOffset, int position, float weight){
	this.text = text;
	this.startOffset = startOffset;
	this.endOffset = endOffset;
	this.position = position;
	this.weight = weight;
	this.next = this;
	}

	void setNext(TermInfo next) { this.next = next; }
	/**
	* Returns the next TermInfo at this same position.
	* This is a circular list!
	*/
	public TermInfo getNext() { return next; }
	public String getText(){ return text; }
	public int getStartOffset(){ return startOffset; }
	public int getEndOffset(){ return endOffset; }
	public int getPosition(){ return position; }
	public float getWeight(){ return weight; }

	@Override
	public String toString(){
	return text + '(' + startOffset + ',' + endOffset + ',' + position + ')';
	}

	@Override
	public int compareTo( TermInfo o ){
	return ( this.position - o.position );
	}
	@Override
	public int hashCode() {
	final int prime = 31;
	int result = 1;
	result = prime * result + position;
	return result;
	}

	@Override
	public boolean equals(Object obj) {
	if (this == obj) {
	return true;
	}
	if (obj == null) {
	return false;
	}
	if (getClass() != obj.getClass()) {
	return false;
	}
	TermInfo other = (TermInfo) obj;
	return position == other.position;
	}
	}
	}