using Lucene.Net.Analysis;
using Lucene.Net.Analysis.TokenAttributes;
using Lucene.Net.Documents;
using Lucene.Net.Index;
using Lucene.Net.Util;
using System;
using System.Collections.Generic;
using System.IO;
namespace Lucene.Net.Search.Highlight
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/// <summary>
/// Hides implementation issues associated with obtaining a <see cref="TokenStream"/> for use with
/// the <see cref="Highlighter"/> - it can obtain a stream from term vectors with offsets and
/// positions, or from an <see cref="Analyzer"/> re-parsing the stored content.
/// See also <see cref="TokenStreamFromTermPositionVector"/>.
/// </summary>
public static class TokenSources // LUCENENET specific: CA1052 Static holder types should be Static or NotInheritable
{
/// <summary>
/// Orders <see cref="Token"/>s by start offset, breaking ties by end offset.
/// </summary>
private class TokenComparer : IComparer<Token>
{
public int Compare(Token t1, Token t2)
{
if (t1.StartOffset == t2.StartOffset)
{
return t1.EndOffset - t2.EndOffset;
}
else
{
return t1.StartOffset - t2.StartOffset;
}
}
}
/// <summary>
/// A <see cref="TokenStream"/> that replays a pre-built array of <see cref="Token"/>s.
/// </summary>
internal sealed class StoredTokenStream : TokenStream
{
internal Token[] tokens;
internal int currentToken = 0;
internal ICharTermAttribute termAtt;
internal IOffsetAttribute offsetAtt;
internal IPositionIncrementAttribute posincAtt;
internal IPayloadAttribute payloadAtt;
internal StoredTokenStream(Token[] tokens)
{
this.tokens = tokens;
termAtt = AddAttribute<ICharTermAttribute>();
offsetAtt = AddAttribute<IOffsetAttribute>();
posincAtt = AddAttribute<IPositionIncrementAttribute>();
payloadAtt = AddAttribute<IPayloadAttribute>();
}
public override bool IncrementToken()
{
if (currentToken >= tokens.Length)
{
return false;
}
Token token = tokens[currentToken++];
ClearAttributes();
termAtt.SetEmpty().Append(token);
offsetAtt.SetOffset(token.StartOffset, token.EndOffset);
BytesRef payload = token.Payload;
if (payload != null)
{
payloadAtt.Payload = payload;
}
posincAtt.PositionIncrement =
(currentToken <= 1 ||
tokens[currentToken - 1].StartOffset > tokens[currentToken - 2].StartOffset
? 1 : 0);
return true;
}
}
/// <summary>
/// A convenience method that first tries to get a term vector for the specified docId and then falls
/// back to using the passed-in <see cref="Document"/> to retrieve the <see cref="TokenStream"/>. This
/// is useful when you already have the document, but would prefer to use the vector first.
/// </summary>
/// <param name="reader">The <see cref="IndexReader"/> to use to try to get the vector from</param>
/// <param name="docId">The docId to retrieve.</param>
/// <param name="field">The field to retrieve on the document</param>
/// <param name="doc">The document to fall back on</param>
/// <param name="analyzer">The analyzer to use for creating the TokenStream if the vector doesn't exist</param>
/// <returns>The <see cref="TokenStream"/> for the <see cref="IIndexableField"/> on the <see cref="Document"/></returns>
/// <exception cref="IOException">if there was an error loading</exception>
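/// <example>
/// A minimal usage sketch - the field name "content" and the pre-built <c>reader</c>, <c>docId</c>,
/// <c>analyzer</c> and <c>highlighter</c> are assumptions, not part of this API:
/// <code>
/// Document doc = reader.Document(docId);
/// TokenStream tokenStream = TokenSources.GetAnyTokenStream(reader, docId, "content", doc, analyzer);
/// string fragment = highlighter.GetBestFragment(tokenStream, doc.Get("content"));
/// </code>
/// </example>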
public static TokenStream GetAnyTokenStream(IndexReader reader, int docId,
string field, Document doc, Analyzer analyzer)
{
TokenStream ts = null;
Fields vectors = reader.GetTermVectors(docId);
Terms vector = vectors?.GetTerms(field);
if (vector != null)
{
ts = GetTokenStream(vector);
}
// No token info stored so fall back to analyzing raw content
ts = ts ?? GetTokenStream(doc, field, analyzer);
return ts;
}
/// <summary>
/// A convenience method that tries a number of approaches to get a token stream.
/// The cost of finding that there are no term vectors in the index is minimal (1000 invocations still
/// register 0 ms), so this "lazy" (flexible?) approach to coding is probably acceptable.
/// </summary>
/// <returns>null if field not stored correctly</returns>
/// <exception cref="IOException">If there is a low-level I/O error</exception>
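/// <example>
/// A minimal sketch of this overload, which loads the stored document itself when no term vector is
/// available (the field name "content" and the surrounding <c>reader</c>, <c>docId</c> and
/// <c>analyzer</c> are assumptions):
/// <code>
/// TokenStream tokenStream = TokenSources.GetAnyTokenStream(reader, docId, "content", analyzer);
/// </code>
/// </example>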
public static TokenStream GetAnyTokenStream(IndexReader reader, int docId, string field, Analyzer analyzer)
{
TokenStream ts = null;
Fields vectors = reader.GetTermVectors(docId);
Terms vector = vectors?.GetTerms(field);
if (vector != null)
{
ts = GetTokenStream(vector);
}
// No token info stored so fall back to analyzing raw content
ts = ts ?? GetTokenStream(reader, docId, field, analyzer);
return ts;
}
/// <summary>
/// Returns a <see cref="TokenStream"/> built from the given term vector; equivalent to calling
/// <see cref="GetTokenStream(Terms, bool)"/> with <c>tokenPositionsGuaranteedContiguous</c> set to false.
/// </summary>
public static TokenStream GetTokenStream(Terms vector)
{
// Assumes the worst and makes no assumptions about token position sequences.
return GetTokenStream(vector, false);
}
/// <summary>
/// Low level api. Returns a token stream generated from a <see cref="Terms"/>. This
/// can be used to feed the highlighter with a pre-parsed token
/// stream. The <see cref="Terms"/> must have offsets available.
/// <para/>
/// In my tests the speeds to recreate 1000 token streams using this method are:
/// <list type="bullet">
/// <item><description>
/// with TermVector offset only data stored - 420 milliseconds
/// </description></item>
/// <item><description>
/// with TermVector offset AND position data stored - 271 milliseconds
/// (NB: timings for TermVector with position data are based on a tokenizer with contiguous
/// positions - no overlaps or gaps)
/// </description></item>
/// <item><description>
/// The cost of not using a TermPositionVector to store pre-parsed content, and instead
/// re-analyzing the original content with an analyzer - 980 milliseconds
/// </description></item>
/// </list>
///
/// The re-analyze timings will typically vary depending on -
/// <list type="number">
/// <item><description>
/// The complexity of the analyzer code (timings above were using a
/// stemmer/lowercaser/stopword combo)
/// </description></item>
/// <item><description>
/// The number of other fields (Lucene reads ALL fields off the disk
/// when accessing just one document field - can cost dear!)
/// </description></item>
/// <item><description>
/// Use of compression on field storage - could be faster due to compression (less disk IO)
/// or slower (more CPU burn) depending on the content.
/// </description></item>
/// </list>
/// </summary>
/// <param name="tpv">the <see cref="Terms"/> instance (term vector) to build the token stream from</param>
/// <param name="tokenPositionsGuaranteedContiguous">true if the token position numbers have no overlaps or gaps. If looking
/// to eke out the last drops of performance, set to true. If in doubt, set to false.</param>
/// <exception cref="ArgumentException">if no offsets are available</exception>
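/// <example>
/// A minimal sketch of feeding the highlighter from a term vector; it assumes the hypothetical field
/// "content" was indexed with term vectors that include offsets, that <c>storedText</c> holds the
/// stored field content, and that <c>reader</c>, <c>docId</c> and <c>highlighter</c> already exist:
/// <code>
/// Terms tpv = reader.GetTermVectors(docId).GetTerms("content");
/// // false: make no assumptions about the position numbering of the stored tokens
/// TokenStream tokenStream = TokenSources.GetTokenStream(tpv, false);
/// string fragment = highlighter.GetBestFragment(tokenStream, storedText);
/// </code>
/// </example>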
public static TokenStream GetTokenStream(Terms tpv,
bool tokenPositionsGuaranteedContiguous)
{
if (!tpv.HasOffsets)
{
throw new ArgumentException("Cannot create TokenStream from Terms without offsets");
}
if (!tokenPositionsGuaranteedContiguous && tpv.HasPositions)
{
return new TokenStreamFromTermPositionVector(tpv);
}
bool hasPayloads = tpv.HasPayloads;
// code to reconstruct the original sequence of Tokens
TermsEnum termsEnum = tpv.GetEnumerator();
int totalTokens = 0;
while (termsEnum.MoveNext())
{
totalTokens += (int)termsEnum.TotalTermFreq;
}
Token[] tokensInOriginalOrder = new Token[totalTokens];
List<Token> unsortedTokens = null;
termsEnum = tpv.GetEnumerator();
DocsAndPositionsEnum dpEnum = null;
while (termsEnum.MoveNext())
{
dpEnum = termsEnum.DocsAndPositions(null, dpEnum);
if (dpEnum == null)
{
throw new ArgumentException("Required TermVector Offset information was not found");
}
string term = termsEnum.Term.Utf8ToString();
dpEnum.NextDoc();
int freq = dpEnum.Freq;
for (int posUpto = 0; posUpto < freq; posUpto++)
{
int pos = dpEnum.NextPosition();
if (dpEnum.StartOffset < 0)
{
throw new ArgumentException("Required TermVector Offset information was not found");
}
Token token = new Token(term, dpEnum.StartOffset, dpEnum.EndOffset);
if (hasPayloads)
{
// Must make a deep copy of the returned payload,
// since D&PEnum API is allowed to re-use on every
// call:
token.Payload = BytesRef.DeepCopyOf(dpEnum.GetPayload());
}
if (tokenPositionsGuaranteedContiguous && pos != -1)
{
// We have positions stored and a guarantee that the token position
// information is contiguous.
// This may be fast BUT won't work if Tokenizers are used which create more than one
// token in the same position, or which create jumps in the position numbers - this
// code would fail under those circumstances.
// Tokens are stored with positions, so we can use the position to index straight into
// the sorted array.
tokensInOriginalOrder[pos] = token;
}
else
{
// tokens NOT stored with positions or not guaranteed contiguous - must
// add to list and sort later
if (unsortedTokens == null)
{
unsortedTokens = new List<Token>();
}
unsortedTokens.Add(token);
}
}
}
// If the field has been stored without position data we must perform a sort
if (unsortedTokens != null)
{
tokensInOriginalOrder = unsortedTokens.ToArray();
ArrayUtil.TimSort(tokensInOriginalOrder, new TokenComparer());
//tokensInOriginalOrder = tokensInOriginalOrder
// .OrderBy(t => t, new TokenComparer() )
// .ToArray();
}
return new StoredTokenStream(tokensInOriginalOrder);
}
///<summary>
/// Returns a <see cref="TokenStream"/> with positions and offsets constructed from
/// field term vectors. If the field has no term vectors, or offsets
/// are not included in the term vector, null is returned. See
/// <see cref="GetTokenStream(Terms)"/>
/// for an explanation of what happens when positions aren't present.
/// </summary>
/// <param name="reader">the <see cref="IndexReader"/> to retrieve term vectors from</param>
/// <param name="docId">the document to retrieve term vectors for </param>
/// <param name="field">the field to retrieve term vectors for</param>
/// <returns>a <see cref="TokenStream"/>, or null if offsets are not available</returns>
/// <exception cref="IOException"> If there is a low-level I/O error</exception>
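/// <example>
/// A minimal sketch of handling the null fall-back; the field name "content" and the surrounding
/// <c>reader</c>, <c>docId</c> and <c>analyzer</c> are assumptions:
/// <code>
/// TokenStream ts = TokenSources.GetTokenStreamWithOffsets(reader, docId, "content");
/// if (ts == null)
/// {
/// // no term vector with positions and offsets - re-analyze the stored content instead
/// ts = TokenSources.GetTokenStream(reader, docId, "content", analyzer);
/// }
/// </code>
/// </example>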
public static TokenStream GetTokenStreamWithOffsets(IndexReader reader, int docId, string field)
{
Fields vectors = reader.GetTermVectors(docId);
if (vectors == null)
{
return null;
}
Terms vector = vectors.GetTerms(field);
if (vector == null)
{
return null;
}
if (!vector.HasPositions || !vector.HasOffsets)
{
return null;
}
return GetTokenStream(vector);
}
/// <summary>
/// Convenience method: loads the stored document for <paramref name="docId"/> and re-analyzes the
/// content of <paramref name="field"/> with the given <see cref="Analyzer"/>.
/// </summary>
public static TokenStream GetTokenStream(IndexReader reader, int docId,
string field, Analyzer analyzer)
{
Document doc = reader.Document(docId);
return GetTokenStream(doc, field, analyzer);
}
/// <summary>
/// Returns a <see cref="TokenStream"/> produced by re-analyzing the stored content of
/// <paramref name="field"/> on the given <see cref="Document"/>.
/// </summary>
/// <exception cref="ArgumentException">if the field is not stored on the document</exception>
public static TokenStream GetTokenStream(Document doc, string field,
Analyzer analyzer)
{
string contents = doc.Get(field);
if (contents == null)
{
throw new ArgumentException("Field " + field
+ " in document is not stored and cannot be analyzed");
}
return GetTokenStream(field, contents, analyzer);
}
/// <summary>
/// Convenience method: analyzes the given text for <paramref name="field"/> with the supplied
/// <see cref="Analyzer"/> and returns the resulting <see cref="TokenStream"/>.
/// </summary>
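/// <example>
/// A minimal sketch of consuming the returned stream directly; the field name "body", the input
/// text and the <c>analyzer</c> are assumptions:
/// <code>
/// TokenStream ts = TokenSources.GetTokenStream("body", "the quick brown fox", analyzer);
/// ICharTermAttribute termAtt = ts.AddAttribute&lt;ICharTermAttribute&gt;();
/// ts.Reset();
/// while (ts.IncrementToken())
/// {
/// Console.WriteLine(termAtt.ToString());
/// }
/// ts.End();
/// ts.Dispose();
/// </code>
/// </example>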
public static TokenStream GetTokenStream(string field, string contents,
Analyzer analyzer)
{
try
{
return analyzer.GetTokenStream(field, contents);
}
catch (IOException ex)
{
throw new Exception(ex.ToString(), ex);
}
}
}
}