using Lucene.Net.Diagnostics;
using Lucene.Net.Index;
using Lucene.Net.Util;
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Text;
namespace Lucene.Net.Search.VectorHighlight
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/// <summary>
/// <see cref="FieldTermStack"/> is a stack that keeps query terms in the specified field
/// of the document to be highlighted.
/// </summary>
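    /// <remarks>
    /// Illustrative usage sketch (not part of the original source). It assumes the
    /// <see cref="FieldQuery"/> is obtained from <c>FastVectorHighlighter.GetFieldQuery(query)</c>
    /// and that the field was indexed with term vectors including positions and offsets:
    /// <code>
    /// FieldQuery fieldQuery = new FastVectorHighlighter().GetFieldQuery(query);
    /// FieldTermStack stack = new FieldTermStack(reader, docId, "f", fieldQuery);
    /// while (!stack.IsEmpty)
    /// {
    ///     FieldTermStack.TermInfo termInfo = stack.Pop();
    ///     // use termInfo.Text, termInfo.StartOffset, termInfo.EndOffset, termInfo.Position
    /// }
    /// </code>
    /// </remarks>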
public class FieldTermStack
{
private readonly string fieldName;
internal List<TermInfo> termList = new List<TermInfo>();
        //public static void Main(string[] args)
        //{
        //    Analyzer analyzer = new WhitespaceAnalyzer(LuceneVersion.LUCENE_CURRENT);
        //    QueryParser parser = new QueryParser(LuceneVersion.LUCENE_CURRENT, "f", analyzer);
        //    Query query = parser.Parse("a x:b");
        //    FieldQuery fieldQuery = new FieldQuery(query, true, false);
        //    Directory dir = new RAMDirectory();
        //    IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(LuceneVersion.LUCENE_CURRENT, analyzer));
        //    Document doc = new Document();
        //    FieldType ft = new FieldType(TextField.TYPE_STORED);
        //    ft.StoreTermVectors = true;
        //    ft.StoreTermVectorOffsets = true;
        //    ft.StoreTermVectorPositions = true;
        //    doc.Add(new Field("f", "a a a b b c a b b c d e f", ft));
        //    doc.Add(new Field("f", "b a b a f", ft));
        //    writer.AddDocument(doc);
        //    writer.Dispose();
        //    IndexReader reader = DirectoryReader.Open(dir);
        //    new FieldTermStack(reader, 0, "f", fieldQuery);
        //    reader.Dispose();
        //}
/// <summary>
        /// A constructor.
/// </summary>
/// <param name="reader"><see cref="IndexReader"/> of the index</param>
/// <param name="docId">document id to be highlighted</param>
/// <param name="fieldName">field of the document to be highlighted</param>
/// <param name="fieldQuery"><see cref="FieldQuery"/> object</param>
/// <exception cref="IOException">If there is a low-level I/O error</exception>
public FieldTermStack(IndexReader reader, int docId, string fieldName, FieldQuery fieldQuery)
{
this.fieldName = fieldName;
ISet<string> termSet = fieldQuery.GetTermSet(fieldName);
            // just return (producing a null snippet) if an unmatched fieldName is specified when fieldMatch == true
if (termSet == null) return;
Fields vectors = reader.GetTermVectors(docId);
if (vectors == null)
{
// null snippet
return;
}
Terms vector = vectors.GetTerms(fieldName);
if (vector == null)
{
// null snippet
return;
}
CharsRef spare = new CharsRef();
TermsEnum termsEnum = vector.GetEnumerator();
DocsAndPositionsEnum dpEnum = null;
BytesRef text;
int numDocs = reader.MaxDoc;
while (termsEnum.MoveNext())
{
text = termsEnum.Term;
UnicodeUtil.UTF8toUTF16(text, spare);
string term = spare.ToString();
if (!termSet.Contains(term))
{
continue;
}
dpEnum = termsEnum.DocsAndPositions(null, dpEnum);
if (dpEnum == null)
{
// null snippet
return;
}
dpEnum.NextDoc();
// For weight look here: http://lucene.apache.org/core/3_6_0/api/core/org/apache/lucene/search/DefaultSimilarity.html
float weight = (float)(Math.Log(numDocs / (double)(reader.DocFreq(new Term(fieldName, text)) + 1)) + 1.0);
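                // Illustrative arithmetic for the IDF-style weight above (figures not from the
                // original source): numDocs = 10 and docFreq(term) = 4 give
                // weight = ln(10 / (4 + 1)) + 1 = ln(2) + 1 ≈ 1.69; rarer terms receive a higher weight.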
int freq = dpEnum.Freq;
for (int i = 0; i < freq; i++)
{
int pos = dpEnum.NextPosition();
if (dpEnum.StartOffset < 0)
{
return; // no offsets, null snippet
}
termList.Add(new TermInfo(term, dpEnum.StartOffset, dpEnum.EndOffset, pos, weight));
}
}
// sort by position
CollectionUtil.TimSort(termList);
// now look for dups at the same position, linking them together
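            // Illustrative example (not from the original source): terms a@pos0, b@pos0, c@pos1
            // collapse to termList = [a, c], where a.Next -> b -> a forms a ring at position 0
            // and c.Next -> c at position 1.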
int currentPos = -1;
TermInfo previous = null;
TermInfo first = null;
for (int i = 0; i < termList.Count; )
{
TermInfo current = termList[i];
if (current.Position == currentPos)
{
if (Debugging.AssertsEnabled) Debugging.Assert(previous != null);
previous.SetNext(current);
previous = current;
//iterator.Remove();
// LUCENENET NOTE: Remove, but don't advance the i position (since removing will advance to the next item)
termList.RemoveAt(i);
}
else
{
if (previous != null)
{
previous.SetNext(first);
}
previous = first = current;
currentPos = current.Position;
// LUCENENET NOTE: Only increment the position if we don't do a delete.
i++;
}
}
if (previous != null)
{
previous.SetNext(first);
}
}
/// <summary>
        /// Gets the field name.
/// </summary>
public virtual string FieldName => fieldName;
/// <summary>
/// Returns the top <see cref="TermInfo"/> object of the stack
/// </summary>
/// <returns>the top <see cref="TermInfo"/> object of the stack</returns>
public virtual TermInfo Pop()
{
if (termList.Count == 0)
{
return null;
}
TermInfo first = termList[0];
termList.Remove(first);
return first;
}
/// <summary>
/// Puts a <see cref="TermInfo"/> onto the top of the stack
/// </summary>
/// <param name="termInfo">the <see cref="TermInfo"/> object to be put on the top of the stack</param>
public virtual void Push(TermInfo termInfo)
{
termList.Insert(0, termInfo);
}
/// <summary>
        /// Returns <c>true</c> if the stack is empty, <c>false</c> otherwise.
/// </summary>
public virtual bool IsEmpty => termList == null || termList.Count == 0;
/// <summary>
/// Single term with its position/offsets in the document and IDF weight.
/// It is <see cref="IComparable{TermInfo}"/> but considers only position.
/// </summary>
public class TermInfo : IComparable<TermInfo>
{
private readonly string text;
private readonly int startOffset;
private readonly int endOffset;
private readonly int position;
// IDF-weight of this term
private readonly float weight;
// pointer to other TermInfo's at the same position.
// this is a circular list, so with no syns, just points to itself
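            // Illustrative traversal of the ring (not part of the original source; 'info' is a hypothetical starting TermInfo):
            //   TermInfo t = info;
            //   do { /* use t */ t = t.Next; } while (t != info);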
private TermInfo next;
public TermInfo(string text, int startOffset, int endOffset, int position, float weight)
{
this.text = text;
this.startOffset = startOffset;
this.endOffset = endOffset;
this.position = position;
this.weight = weight;
this.next = this;
}
internal void SetNext(TermInfo next) { this.next = next; }
/// <summary>
/// Returns the next TermInfo at this same position. This is a circular list!
/// </summary>
public virtual TermInfo Next => next;
public virtual string Text => text;
public virtual int StartOffset => startOffset;
public virtual int EndOffset => endOffset;
public virtual int Position => position;
public virtual float Weight => weight;
public override string ToString()
{
StringBuilder sb = new StringBuilder();
sb.Append(text).Append('(').Append(startOffset).Append(',').Append(endOffset).Append(',').Append(position).Append(')');
return sb.ToString();
}
public virtual int CompareTo(TermInfo o)
{
return (this.position - o.position);
}
public override int GetHashCode()
{
int prime = 31;
int result = 1;
result = prime * result + position;
return result;
}
public override bool Equals(object obj)
{
if (this == obj)
{
return true;
}
if (obj == null)
{
return false;
}
if (GetType() != obj.GetType())
{
return false;
}
TermInfo other = (TermInfo)obj;
if (position != other.position)
{
return false;
}
return true;
}
}
}
}