// blob: d5d0fb75828a95f678339e889370e3f358da68ed [file] [log] [blame]
using Lucene.Net.Analysis.TokenAttributes;
using Lucene.Net.Diagnostics;
using Lucene.Net.Util;
using System.Collections.Generic;
using Console = Lucene.Net.Util.SystemConsole;
namespace Lucene.Net.Analysis
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
using AttributeSource = Lucene.Net.Util.AttributeSource;
//using RollingBuffer = Lucene.Net.Util.RollingBuffer;
// TODO: cut SynFilter over to this
// TODO: somehow add "nuke this input token" capability...
/// <summary>
/// LUCENENET specific non-generic base type. It exists so that
/// <see cref="LookaheadTokenFilter.Position"/> can be referenced without
/// having to name a closed generic type.
/// </summary>
public abstract class LookaheadTokenFilter : TokenFilter
{
    // Not for end users to use directly
    internal LookaheadTokenFilter(TokenStream input)
        : base(input)
    {
    }

    public abstract override bool IncrementToken();

    /// <summary>
    /// All the state kept for one token position. Derive from this class
    /// when additional per-position state has to be recorded.
    /// </summary>
    // LUCENENET NOTE: This class was originally marked protected, but was made public because of
    // inconsistent accessibility issues with using it as a generic constraint.
    public class Position : RollingBuffer.IResettable
    {
        /// <summary>Input tokens buffered at this position.</summary>
        public IList<AttributeSource.State> InputTokens { get; private set; } = new List<AttributeSource.State>();

        /// <summary>Index into <see cref="InputTokens"/> of the next buffered token to hand to the consumer.</summary>
        public int NextRead { get; set; }

        /// <summary>Start offset every token leaving this position should have; <c>-1</c> while unset.</summary>
        public int StartOffset { get; set; } = -1;

        /// <summary>End offset every token arriving at this position should have; <c>-1</c> while unset.</summary>
        public int EndOffset { get; set; } = -1;

        /// <summary>
        /// Returns this instance to its initial state: no buffered tokens,
        /// read index 0 and both offsets unset (<c>-1</c>).
        /// </summary>
        public void Reset()
        {
            EndOffset = -1;
            StartOffset = -1;
            NextRead = 0;
            InputTokens.Clear();
        }

        /// <summary>Appends a captured token state to <see cref="InputTokens"/>.</summary>
        public virtual void Add(AttributeSource.State state) => InputTokens.Add(state);

        /// <summary>
        /// Returns the buffered state at <see cref="NextRead"/> and advances
        /// <see cref="NextRead"/> by one.
        /// </summary>
        public virtual AttributeSource.State NextState()
        {
            if (Debugging.AssertsEnabled)
            {
                Debugging.Assert(NextRead < InputTokens.Count);
            }

            int readIndex = NextRead++;
            return InputTokens[readIndex];
        }
    }
}
/// <summary>
/// An abstract <see cref="TokenFilter"/> to make it easier to build graph
/// token filters requiring some lookahead. This class handles
/// the details of buffering up tokens, recording them by
/// position, restoring them, providing access to them, etc.
/// </summary>
public abstract class LookaheadTokenFilter<T> : LookaheadTokenFilter
where T : LookaheadTokenFilter.Position
{
/// <summary>
/// Turns on the <c>Console.WriteLine</c> tracing in this class. It is <c>true</c> only
/// when the <c>VERBOSE_TEST_LOGGING</c> compilation symbol is defined.
/// </summary>
protected static readonly bool DEBUG =
#if VERBOSE_TEST_LOGGING
    true
#else
    false
#endif
    ;

/// <summary>Position increment attribute, registered in the constructor.</summary>
protected readonly IPositionIncrementAttribute m_posIncAtt;

/// <summary>Position length attribute, registered in the constructor.</summary>
protected readonly IPositionLengthAttribute m_posLenAtt;

/// <summary>Offset attribute, registered in the constructor.</summary>
protected readonly IOffsetAttribute m_offsetAtt;

/// <summary>
/// Position of last read input token. <see cref="Reset()"/> sets it to <c>-1</c>;
/// <see cref="PeekToken()"/> advances it by each token's position increment.
/// </summary>
protected int m_inputPos;

/// <summary>
/// Position of next possible output token to return. <see cref="Reset()"/> sets it to
/// <c>0</c>; <see cref="NextToken()"/> increments it when it is done with a position.
/// </summary>
protected int m_outputPos;

/// <summary>
/// True if we hit end from our input, i.e. the wrapped stream's
/// <c>IncrementToken()</c> returned <c>false</c> in <see cref="PeekToken()"/>.
/// </summary>
protected bool m_end;

// Set by PeekToken() after it read a token from the input; cleared once that token
// has either been captured into m_positions or been returned directly by NextToken().
private bool tokenPending;

// Set by InsertToken(); cleared by NextToken() when it returns the inserted token.
private bool insertPending;
// LUCENENET specific - moved Position class to a non-generic class named LookaheadTokenFilter so we can refer to
// it without referring to the generic closing type.
/// <summary>
/// Creates a lookahead filter over <paramref name="input"/>. Allocates the rolling buffer of
/// per-position state, then registers the position increment, position length and offset
/// attributes (in that order).
/// </summary>
/// <param name="input">The token stream to wrap; it is passed to the base constructor.</param>
protected internal LookaheadTokenFilter(TokenStream input)
: base(input)
{
// RollingBufferAnonymousClass hands this.NewPosition to its base constructor as a delegate.
// NOTE(review): if RollingBuffer invokes that factory while it is being constructed, the virtual
// NewPosition() runs before any subclass constructor body has executed - confirm, and keep
// NewPosition() overrides independent of subclass fields.
m_positions = new RollingBufferAnonymousClass(this);
m_posIncAtt = AddAttribute<IPositionIncrementAttribute>();
m_posLenAtt = AddAttribute<IPositionLengthAttribute>();
m_offsetAtt = AddAttribute<IOffsetAttribute>();
}
/// <summary>
/// Call this only from within <see cref="AfterPosition()"/>, to insert a new
/// token. After calling this, set whichever token attributes the inserted
/// token needs.
/// </summary>
protected virtual void InsertToken()
{
    // A token read by PeekToken() may so far exist only in the attributes. Buffer it
    // at its input position before marking an insert as pending.
    if (tokenPending)
    {
        T inputPosition = m_positions.Get(m_inputPos);
        inputPosition.Add(CaptureState());
        tokenPending = false;
    }

    // Only one inserted token may be outstanding at a time.
    if (Debugging.AssertsEnabled)
    {
        Debugging.Assert(!insertPending);
    }

    insertPending = true;
}
/// <summary>
/// Hook called by <see cref="NextToken()"/> when all input tokens leaving a given
/// position have been returned, and also when the end of the input has been reached.
/// The default implementation does nothing. Override this and call
/// <see cref="InsertToken()"/>, then set whichever token attributes you want, if you
/// want to inject a token starting from this position.
/// </summary>
protected virtual void AfterPosition()
{
}
/// <summary>
/// Factory for per-position state objects. The rolling buffer behind
/// <see cref="m_positions"/> calls this whenever it needs a new instance.
/// </summary>
/// <returns>A new position instance of type <typeparamref name="T"/>.</returns>
protected abstract T NewPosition();
/// <summary>
/// Per-position state, looked up by token position (see the <c>Get</c> calls that use
/// <see cref="m_inputPos"/> and <see cref="m_outputPos"/>). Assigned once in the constructor.
/// </summary>
protected readonly RollingBuffer<T> m_positions;
/// <summary>
/// Rolling buffer whose elements are produced by the enclosing filter's
/// <see cref="NewPosition()"/> factory.
/// </summary>
private class RollingBufferAnonymousClass : RollingBuffer<T>
{
    private readonly LookaheadTokenFilter<T> owner;

    // The factory is also passed to the base constructor as a delegate: the owner field is
    // only assigned in this constructor's body, which runs after the base constructor.
    public RollingBufferAnonymousClass(LookaheadTokenFilter<T> outerInstance)
        : base(outerInstance.NewPosition)
    {
        owner = outerInstance;
    }

    /// <summary>Delegates to the enclosing filter's <see cref="NewPosition()"/>.</summary>
    protected override T NewInstance() => owner.NewPosition();
}
/// <summary>
/// Reads one more token from the wrapped input. On success the input position is advanced
/// by the token's position increment, its start offset is recorded at (or checked against)
/// the position it leaves from, and its end offset at the position it arrives at. At end of
/// input, <see cref="m_end"/> is set instead.
/// </summary>
/// <returns><c>true</c> if there is a new token.</returns>
protected virtual bool PeekToken()
{
    if (DEBUG)
    {
        Console.WriteLine("LTF.peekToken inputPos=" + m_inputPos + " outputPos=" + m_outputPos + " tokenPending=" + tokenPending);
    }

    if (Debugging.AssertsEnabled)
    {
        Debugging.Assert(!m_end);
        Debugging.Assert(m_inputPos == -1 || m_outputPos <= m_inputPos);
    }

    // The attributes still hold the previously read token; buffer it at its position
    // before the input overwrites them.
    if (tokenPending)
    {
        T pendingPosition = m_positions.Get(m_inputPos);
        pendingPosition.Add(CaptureState());
        tokenPending = false;
    }

    bool hasToken = m_input.IncrementToken();
    if (DEBUG)
    {
        Console.WriteLine(" input.incrToken() returned " + hasToken);
    }

    if (!hasToken)
    {
        m_end = true;
        return false;
    }

    m_inputPos += m_posIncAtt.PositionIncrement;
    if (Debugging.AssertsEnabled)
    {
        Debugging.Assert(m_inputPos >= 0);
    }
    if (DEBUG)
    {
        Console.WriteLine(" now inputPos=" + m_inputPos);
    }

    Position leavingFrom = m_positions.Get(m_inputPos);
    Position arrivingAt = m_positions.Get(m_inputPos + m_posLenAtt.PositionLength);

    int tokenStartOffset = m_offsetAtt.StartOffset;
    if (leavingFrom.StartOffset != -1)
    {
        // Make sure our input isn't messing up offsets:
        if (Debugging.AssertsEnabled)
        {
            Debugging.Assert(leavingFrom.StartOffset == tokenStartOffset, "prev startOffset={0} vs new startOffset={1} inputPos={2}", leavingFrom.StartOffset, tokenStartOffset, m_inputPos);
        }
    }
    else
    {
        leavingFrom.StartOffset = tokenStartOffset;
    }

    int tokenEndOffset = m_offsetAtt.EndOffset;
    if (arrivingAt.EndOffset != -1)
    {
        // Make sure our input isn't messing up offsets:
        if (Debugging.AssertsEnabled)
        {
            Debugging.Assert(arrivingAt.EndOffset == tokenEndOffset, "prev endOffset={0} vs new endOffset={1} inputPos={2}", arrivingAt.EndOffset, tokenEndOffset, m_inputPos);
        }
    }
    else
    {
        arrivingAt.EndOffset = tokenEndOffset;
    }

    // The token now lives only in the attributes until somebody captures or returns it.
    tokenPending = true;
    return true;
}
/// <summary>
/// Call this when you are done looking ahead; it will set
/// the next token to return. Return the boolean back to
/// the caller.
/// </summary>
/// <returns>
/// <c>true</c> if the attributes now hold a token to return (a buffered token, the
/// token just read from the input, or a token inserted by <see cref="AfterPosition()"/>);
/// <c>false</c> once the input is exhausted and nothing was inserted.
/// </returns>
protected virtual bool NextToken()
{
    if (DEBUG)
    {
        Console.WriteLine("LTF.nextToken inputPos=" + m_inputPos + " outputPos=" + m_outputPos + " tokenPending=" + tokenPending);
    }

    Position current = m_positions.Get(m_outputPos);

    // Loop, because holes in the input may have to be skipped over.
    for (;;)
    {
        // Case 1: this position still has previously buffered tokens to serve up.
        bool hasBufferedToken = current.NextRead < current.InputTokens.Count;
        if (hasBufferedToken)
        {
            if (DEBUG)
            {
                Console.WriteLine(" return previously buffered token");
            }

            // Save the not-yet-captured input token before the attributes are overwritten.
            if (tokenPending)
            {
                T pendingPosition = m_positions.Get(m_inputPos);
                pendingPosition.Add(CaptureState());
                tokenPending = false;
            }

            RestoreState(m_positions.Get(m_outputPos).NextState());
            return true;
        }

        // Case 2: the output position is behind the input position, so this
        // position is finished and we move on to the next one.
        bool caughtUpWithInput = m_inputPos == -1 || m_outputPos == m_inputPos;
        if (!caughtUpWithInput)
        {
            if (current.StartOffset != -1)
            {
                // this position had at least one token leaving
                if (DEBUG)
                {
                    Console.WriteLine(" call afterPosition");
                }

                if (ReturnInsertedTokenAfterPosition())
                {
                    return true;
                }
            }

            // Done with this position; move on:
            m_outputPos++;
            if (DEBUG)
            {
                Console.WriteLine(" next position: outputPos=" + m_outputPos);
            }
            m_positions.FreeBefore(m_outputPos);
            current = m_positions.Get(m_outputPos);
            continue;
        }

        // Case 3: no more buffered tokens, but we may still get input tokens at this position.
        if (tokenPending)
        {
            // Fast path: just return token we had just incr'd,
            // without having captured/restored its state:
            if (DEBUG)
            {
                Console.WriteLine(" pass-through: return pending token");
            }
            tokenPending = false;
            return true;
        }

        if (m_end || !PeekToken())
        {
            if (DEBUG)
            {
                Console.WriteLine(" END");
            }
            return ReturnInsertedTokenAfterPosition();
        }

        // PeekToken() read a new token; go around again to decide how to return it.
    }
}

// Runs the AfterPosition() hook and reports whether the subclass inserted a token
// at this same position; if so, the pending-insert flag is consumed.
private bool ReturnInsertedTokenAfterPosition()
{
    AfterPosition();
    if (!insertPending)
    {
        return false;
    }

    if (DEBUG)
    {
        Console.WriteLine(" return inserted token");
    }
    if (Debugging.AssertsEnabled)
    {
        Debugging.Assert(InsertedTokenConsistent());
    }
    insertPending = false;
    return true;
}
// Sanity check for a token inserted by a subclass: the position the token arrives at
// (output position + position length) must already have an end offset recorded, i.e. the
// subclass looked ahead far enough, and the token's end offset must match it.
// Always returns true so that it can itself be wrapped in an assert.
private bool InsertedTokenConsistent()
{
    Position arrivingAt = m_positions.Get(m_outputPos + m_posLenAtt.PositionLength);

    if (Debugging.AssertsEnabled)
    {
        Debugging.Assert(arrivingAt.EndOffset != -1);
        Debugging.Assert(m_offsetAtt.EndOffset == arrivingAt.EndOffset, "offsetAtt.endOffset={0} vs expected={1}", m_offsetAtt.EndOffset, arrivingAt.EndOffset);
    }

    return true;
}
// TODO: end()?
// TODO: close()?
/// <summary>
/// Resets this filter so it can be consumed again: resets the base stream state and the
/// position buffer, puts the input position back to <c>-1</c> and the output position
/// back to <c>0</c>, and clears the pending-token, pending-insert and end-of-input flags.
/// </summary>
public override void Reset()
{
    base.Reset();
    m_positions.Reset();
    m_inputPos = -1;
    m_outputPos = 0;
    tokenPending = false;
    // An insert can be left pending if consumption was aborted between InsertToken() and
    // the point where NextToken() returns the inserted token (for example by an exception
    // or a failed assertion). Clear it so that a reused stream does not emit a spurious
    // "inserted" token after its next AfterPosition() call.
    insertPending = false;
    m_end = false;
}
}
}