using Lucene.Net.Analysis;
using Lucene.Net.Analysis.TokenAttributes;
using Lucene.Net.Documents;
using Lucene.Net.Index;
using Lucene.Net.Util;
using System;
using System.Collections.Generic;
using System.IO;
namespace Lucene.Net.Search.Highlight
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/// <summary>
/// Hides implementation issues associated with obtaining a <see cref="TokenStream"/> for use with
/// the <see cref="Highlighter"/> - it can obtain a stream from term vectors with offsets and
/// positions, or from an <see cref="Analyzer"/> re-parsing the stored content.
/// See also <see cref="TokenStreamFromTermPositionVector"/>.
/// </summary>
public static class TokenSources // LUCENENET specific: CA1052 Static holder types should be Static or NotInheritable
{
/// <summary>
/// Orders <see cref="Token"/>s by start offset, breaking ties by end offset.
/// </summary>
private class TokenComparer : IComparer<Token>
{
public int Compare(Token t1, Token t2)
{
if (t1.StartOffset == t2.StartOffset)
{
return t1.EndOffset - t2.EndOffset;
}
else
{
return t1.StartOffset - t2.StartOffset;
}
}
}
/// <summary>
/// A <see cref="TokenStream"/> that replays a pre-built array of <see cref="Token"/>s.
/// </summary>
internal sealed class StoredTokenStream : TokenStream
{
internal Token[] tokens;
internal int currentToken = 0;
internal ICharTermAttribute termAtt;
internal IOffsetAttribute offsetAtt;
internal IPositionIncrementAttribute posincAtt;
internal IPayloadAttribute payloadAtt;
internal StoredTokenStream(Token[] tokens)
{
this.tokens = tokens;
termAtt = AddAttribute<ICharTermAttribute>();
offsetAtt = AddAttribute<IOffsetAttribute>();
posincAtt = AddAttribute<IPositionIncrementAttribute>();
payloadAtt = AddAttribute<IPayloadAttribute>();
}
public override bool IncrementToken()
{
if (currentToken >= tokens.Length)
{
return false;
}
Token token = tokens[currentToken++];
ClearAttributes();
termAtt.SetEmpty().Append(token);
offsetAtt.SetOffset(token.StartOffset, token.EndOffset);
BytesRef payload = token.Payload;
if (payload != null)
{
payloadAtt.Payload = payload;
}
posincAtt.PositionIncrement =
(currentToken <= 1 ||
tokens[currentToken - 1].StartOffset > tokens[currentToken - 2].StartOffset
? 1 : 0);
return true;
}
}
/// <summary>
/// A convenience method that first tries to get a term vector for the specified docId and then falls
/// back to using the passed-in <see cref="Document"/> to retrieve the <see cref="TokenStream"/>. This
/// is useful when you already have the document, but would prefer to use the vector first.
/// </summary>
/// <param name="reader">The <see cref="IndexReader"/> to use to try to get the vector from</param>
/// <param name="docId">The docId to retrieve.</param>
/// <param name="field">The field to retrieve on the document</param>
/// <param name="doc">The document to fall back on</param>
/// <param name="analyzer">The analyzer to use for creating the TokenStream if the vector doesn't exist</param>
/// <returns>The <see cref="TokenStream"/> for the <see cref="IIndexableField"/> on the <see cref="Document"/></returns>
/// <exception cref="IOException">if there was an error loading</exception>
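/// <example>
/// A minimal usage sketch - the field name "content" and the pre-built <c>reader</c>, <c>docId</c>,
/// <c>analyzer</c> and <c>highlighter</c> are assumptions, not part of this API:
/// <code>
/// Document doc = reader.Document(docId);
/// TokenStream tokenStream = TokenSources.GetAnyTokenStream(reader, docId, "content", doc, analyzer);
/// string fragment = highlighter.GetBestFragment(tokenStream, doc.Get("content"));
/// </code>
/// </example>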
public static TokenStream GetAnyTokenStream(IndexReader reader, int docId,
string field, Document doc, Analyzer analyzer)
{
TokenStream ts = null;
Fields vectors = reader.GetTermVectors(docId);
Terms vector = vectors?.GetTerms(field);
if (vector != null)
{
ts = GetTokenStream(vector);
}
// No token info stored so fall back to analyzing raw content
ts = ts ?? GetTokenStream(doc, field, analyzer);
return ts;
}
/// <summary>
/// A convenience method that tries a number of approaches to get a token stream.
/// The cost of finding that there are no term vectors in the index is minimal (1000 invocations still
/// register 0 ms), so this "lazy" (flexible?) approach to coding is probably acceptable.
/// </summary>
/// <returns>null if field not stored correctly</returns>
/// <exception cref="IOException">If there is a low-level I/O error</exception>
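/// <example>
/// A minimal sketch of this overload, which loads the stored document itself when no term vector is
/// available (the field name "content" and the surrounding <c>reader</c>, <c>docId</c> and
/// <c>analyzer</c> are assumptions):
/// <code>
/// TokenStream tokenStream = TokenSources.GetAnyTokenStream(reader, docId, "content", analyzer);
/// </code>
/// </example>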
public static TokenStream GetAnyTokenStream(IndexReader reader, int docId, string field, Analyzer analyzer)
{
TokenStream ts = null;
Fields vectors = reader.GetTermVectors(docId);
Terms vector = vectors?.GetTerms(field);
if (vector != null)
{
ts = GetTokenStream(vector);
}
// No token info stored so fall back to analyzing raw content
ts = ts ?? GetTokenStream(reader, docId, field, analyzer);
return ts;
}
/// <summary>
/// Returns a <see cref="TokenStream"/> built from the given term vector; equivalent to calling
/// <see cref="GetTokenStream(Terms, bool)"/> with <c>tokenPositionsGuaranteedContiguous</c> set to false.
/// </summary>
public static TokenStream GetTokenStream(Terms vector)
{
// Assumes the worst and makes no assumptions about token position sequences.
return GetTokenStream(vector, false);
}
/// <summary>
/// Low level api. Returns a token stream generated from a <see cref="Terms"/>. This
/// can be used to feed the highlighter with a pre-parsed token
/// stream. The <see cref="Terms"/> must have offsets available.
/// <para/>
/// In my tests the speeds to recreate 1000 token streams using this method are:
/// <list type="bullet">
/// <item><description>
/// with TermVector offset only data stored - 420 milliseconds
/// </description></item>
/// <item><description>
/// with TermVector offset AND position data stored - 271 milliseconds
/// (NB: timings for TermVector with position data are based on a tokenizer with contiguous
/// positions - no overlaps or gaps)
/// </description></item>
/// <item><description>
/// The cost of not using a TermPositionVector to store pre-parsed content, and instead
/// re-analyzing the original content with an analyzer - 980 milliseconds
/// </description></item>
/// </list>
///
/// The re-analyze timings will typically vary depending on -
/// <list type="number">
/// <item><description>
/// The complexity of the analyzer code (timings above were using a
/// stemmer/lowercaser/stopword combo)
/// </description></item>
/// <item><description>
/// The number of other fields (Lucene reads ALL fields off the disk
/// when accessing just one document field - can cost dear!)
/// </description></item>
/// <item><description>
/// Use of compression on field storage - could be faster due to compression (less disk IO)
/// or slower (more CPU burn) depending on the content.
/// </description></item>
/// </list>
/// </summary>
/// <param name="tpv">the <see cref="Terms"/> instance (term vector) to build the token stream from</param>
/// <param name="tokenPositionsGuaranteedContiguous">true if the token position numbers have no overlaps or gaps. If looking
/// to eke out the last drops of performance, set to true. If in doubt, set to false.</param>
/// <exception cref="ArgumentException">if no offsets are available</exception>
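/// <example>
/// A minimal sketch of feeding the highlighter from a term vector; it assumes the hypothetical field
/// "content" was indexed with term vectors that include offsets, that <c>storedText</c> holds the
/// stored field content, and that <c>reader</c>, <c>docId</c> and <c>highlighter</c> already exist:
/// <code>
/// Terms tpv = reader.GetTermVectors(docId).GetTerms("content");
/// // false: make no assumptions about the position numbering of the stored tokens
/// TokenStream tokenStream = TokenSources.GetTokenStream(tpv, false);
/// string fragment = highlighter.GetBestFragment(tokenStream, storedText);
/// </code>
/// </example>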
public static TokenStream GetTokenStream(Terms tpv,
bool tokenPositionsGuaranteedContiguous)
{
if (!tpv.HasOffsets)
{
throw new ArgumentException("Cannot create TokenStream from Terms without offsets");
}
if (!tokenPositionsGuaranteedContiguous && tpv.HasPositions)
{
return new TokenStreamFromTermPositionVector(tpv);
}
bool hasPayloads = tpv.HasPayloads;
// code to reconstruct the original sequence of Tokens
TermsEnum termsEnum = tpv.GetEnumerator();
int totalTokens = 0;
while (termsEnum.MoveNext())
{
totalTokens += (int)termsEnum.TotalTermFreq;
}
Token[] tokensInOriginalOrder = new Token[totalTokens];
List<Token> unsortedTokens = null;
termsEnum = tpv.GetEnumerator();
DocsAndPositionsEnum dpEnum = null;
while (termsEnum.MoveNext())
{
dpEnum = termsEnum.DocsAndPositions(null, dpEnum);
if (dpEnum == null)
{
throw new ArgumentException("Required TermVector Offset information was not found");
}
string term = termsEnum.Term.Utf8ToString();
dpEnum.NextDoc();
int freq = dpEnum.Freq;
for (int posUpto = 0; posUpto < freq; posUpto++)
{
int pos = dpEnum.NextPosition();
if (dpEnum.StartOffset < 0)
{
throw new ArgumentException("Required TermVector Offset information was not found");
}
Token token = new Token(term, dpEnum.StartOffset, dpEnum.EndOffset);
if (hasPayloads)
{
// Must make a deep copy of the returned payload,
// since D&PEnum API is allowed to re-use on every
// call:
token.Payload = BytesRef.DeepCopyOf(dpEnum.GetPayload());
}
if (tokenPositionsGuaranteedContiguous && pos != -1)
{
// We have positions stored and a guarantee that the token position
// information is contiguous.
// This may be fast BUT won't work if Tokenizers are used which create more than one
// token in the same position, or which create jumps in the position numbers - this
// code would fail under those circumstances.
// Tokens are stored with positions, so we can use the position to index straight into
// the sorted array.
tokensInOriginalOrder[pos] = token;
}
else
{
// tokens NOT stored with positions or not guaranteed contiguous - must
// add to list and sort later
if (unsortedTokens == null)
{
unsortedTokens = new List<Token>();
}
unsortedTokens.Add(token);
}
}
}
// If the field has been stored without position data we must perform a sort
if (unsortedTokens != null)
{
tokensInOriginalOrder = unsortedTokens.ToArray();
ArrayUtil.TimSort(tokensInOriginalOrder, new TokenComparer());
//tokensInOriginalOrder = tokensInOriginalOrder
// .OrderBy(t => t, new TokenComparer() )
// .ToArray();
}
return new StoredTokenStream(tokensInOriginalOrder);
}
///<summary>
/// Returns a <see cref="TokenStream"/> with positions and offsets constructed from
/// field term vectors. If the field has no term vectors, or offsets
/// are not included in the term vector, null is returned. See
/// <see cref="GetTokenStream(Terms)"/>
/// for an explanation of what happens when positions aren't present.
/// </summary>
/// <param name="reader">the <see cref="IndexReader"/> to retrieve term vectors from</param>
/// <param name="docId">the document to retrieve term vectors for </param>
/// <param name="field">the field to retrieve term vectors for</param>
/// <returns>a <see cref="TokenStream"/>, or null if offsets are not available</returns>
/// <exception cref="IOException"> If there is a low-level I/O error</exception>
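/// <example>
/// A minimal sketch of handling the null fall-back; the field name "content" and the surrounding
/// <c>reader</c>, <c>docId</c> and <c>analyzer</c> are assumptions:
/// <code>
/// TokenStream ts = TokenSources.GetTokenStreamWithOffsets(reader, docId, "content");
/// if (ts == null)
/// {
/// // no term vector with positions and offsets - re-analyze the stored content instead
/// ts = TokenSources.GetTokenStream(reader, docId, "content", analyzer);
/// }
/// </code>
/// </example>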
public static TokenStream GetTokenStreamWithOffsets(IndexReader reader, int docId, string field)
{
Fields vectors = reader.GetTermVectors(docId);
if (vectors == null)
{
return null;
}
Terms vector = vectors.GetTerms(field);
if (vector == null)
{
return null;
}
if (!vector.HasPositions || !vector.HasOffsets)
{
return null;
}
return GetTokenStream(vector);
}
/// <summary>
/// Convenience method: loads the stored document for <paramref name="docId"/> and re-analyzes the
/// content of <paramref name="field"/> with the given <see cref="Analyzer"/>.
/// </summary>
public static TokenStream GetTokenStream(IndexReader reader, int docId,
string field, Analyzer analyzer)
{
Document doc = reader.Document(docId);
return GetTokenStream(doc, field, analyzer);
}
/// <summary>
/// Returns a <see cref="TokenStream"/> produced by re-analyzing the stored content of
/// <paramref name="field"/> on the given <see cref="Document"/>.
/// </summary>
/// <exception cref="ArgumentException">if the field is not stored on the document</exception>
public static TokenStream GetTokenStream(Document doc, string field,
Analyzer analyzer)
{
string contents = doc.Get(field);
if (contents == null)
{
throw new ArgumentException("Field " + field
+ " in document is not stored and cannot be analyzed");
}
return GetTokenStream(field, contents, analyzer);
}
/// <summary>
/// Convenience method: analyzes the given text for <paramref name="field"/> with the supplied
/// <see cref="Analyzer"/> and returns the resulting <see cref="TokenStream"/>.
/// </summary>
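/// <example>
/// A minimal sketch of consuming the returned stream directly; the field name "body", the input
/// text and the <c>analyzer</c> are assumptions:
/// <code>
/// TokenStream ts = TokenSources.GetTokenStream("body", "the quick brown fox", analyzer);
/// ICharTermAttribute termAtt = ts.AddAttribute&lt;ICharTermAttribute&gt;();
/// ts.Reset();
/// while (ts.IncrementToken())
/// {
/// Console.WriteLine(termAtt.ToString());
/// }
/// ts.End();
/// ts.Dispose();
/// </code>
/// </example>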
public static TokenStream GetTokenStream(string field, string contents,
Analyzer analyzer)
{
try
{
return analyzer.GetTokenStream(field, contents);
}
catch (IOException ex)
{
throw new Exception(ex.ToString(), ex);
}
}
}
}