| using Lucene.Net.Analysis.TokenAttributes; |
| using Lucene.Net.Diagnostics; |
| using Lucene.Net.Util; |
| using System.Collections.Generic; |
| using Console = Lucene.Net.Util.SystemConsole; |
| |
| namespace Lucene.Net.Analysis |
| { |
| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| using AttributeSource = Lucene.Net.Util.AttributeSource; |
| |
| //using RollingBuffer = Lucene.Net.Util.RollingBuffer; |
| |
| // TODO: cut SynFilter over to this |
| // TODO: somehow add "nuke this input token" capability... |
| |
/// <summary>
/// LUCENENET specific abstraction so we can reference <see cref="LookaheadTokenFilter.Position"/> without
/// specifying a generic closing type.
/// </summary>
public abstract class LookaheadTokenFilter : TokenFilter
{
    /// <summary>
    /// Forwards <paramref name="input"/> to the <see cref="TokenFilter"/> base constructor.
    /// Internal: not for end users to use directly.
    /// </summary>
    /// <param name="input">The token stream this filter wraps.</param>
    internal LookaheadTokenFilter(TokenStream input) // Not for end users to use directly
        : base(input)
    { }

    /// <summary>
    /// Re-declared as abstract so that every concrete filter must provide its own implementation.
    /// </summary>
    public abstract override bool IncrementToken();

    /// <summary>
    /// Holds all state for a single position; subclass this
    /// to record other state at each position.
    /// </summary>
    // LUCENENET NOTE: This class was originally marked protected, but was made public because of
    // inconsistent accessibility issues with using it as a generic constraint.
    public class Position : RollingBuffer.IResettable
    {
        /// <summary>Buffered input tokens at this position.</summary>
        public IList<AttributeSource.State> InputTokens { get; private set; } = new List<AttributeSource.State>();

        /// <summary>Index into <see cref="InputTokens"/> of the next buffered token to be returned to the consumer.</summary>
        public int NextRead { get; set; }

        /// <summary>Any token leaving from this position should have this start offset; <c>-1</c> while unset.</summary>
        public int StartOffset { get; set; } = -1;

        /// <summary>Any token arriving to this position should have this end offset; <c>-1</c> while unset.</summary>
        public int EndOffset { get; set; } = -1;

        /// <summary>
        /// Returns this position to its initial state: no buffered tokens, <see cref="NextRead"/> at 0
        /// and both offsets at <c>-1</c>.
        /// </summary>
        /// <remarks>
        /// Virtual (like <see cref="Add"/> and <see cref="NextState"/>) so that a subclass which records
        /// additional per-position state can clear that state as well. Overrides should call
        /// <c>base.Reset()</c>.
        /// </remarks>
        public virtual void Reset()
        {
            InputTokens.Clear();
            NextRead = 0;
            StartOffset = -1;
            EndOffset = -1;
        }

        /// <summary>Appends <paramref name="state"/> to the tokens buffered at this position.</summary>
        /// <param name="state">The captured attribute state to buffer.</param>
        public virtual void Add(AttributeSource.State state)
        {
            InputTokens.Add(state);
        }

        /// <summary>
        /// Returns the buffered state at index <see cref="NextRead"/> and then advances
        /// <see cref="NextRead"/> by one.
        /// </summary>
        /// <returns>The next unread buffered state.</returns>
        public virtual AttributeSource.State NextState()
        {
            // Only checked when assertions are enabled; otherwise an out-of-range read surfaces
            // from the list indexer below.
            if (Debugging.AssertsEnabled) Debugging.Assert(NextRead < InputTokens.Count);
            return InputTokens[NextRead++];
        }
    }
}
| |
| /// <summary> |
| /// An abstract <see cref="TokenFilter"/> to make it easier to build graph |
| /// token filters requiring some lookahead. This class handles |
| /// the details of buffering up tokens, recording them by |
| /// position, restoring them, providing access to them, etc. |
| /// </summary> |
| public abstract class LookaheadTokenFilter<T> : LookaheadTokenFilter |
| where T : LookaheadTokenFilter.Position |
| { |
/// <summary>
/// Switches on the diagnostic console tracing in this class. It is <c>true</c> only when the
/// <c>VERBOSE_TEST_LOGGING</c> symbol is defined at compile time.
/// </summary>
#if VERBOSE_TEST_LOGGING
protected static readonly bool DEBUG = true;
#else
protected static readonly bool DEBUG = false;
#endif

/// <summary>Position-increment attribute registered on this stream.</summary>
protected readonly IPositionIncrementAttribute m_posIncAtt;

/// <summary>Position-length attribute registered on this stream.</summary>
protected readonly IPositionLengthAttribute m_posLenAtt;

/// <summary>Offset attribute registered on this stream.</summary>
protected readonly IOffsetAttribute m_offsetAtt;

/// <summary>Position of the last read input token.</summary>
protected int m_inputPos;

/// <summary>Position of the next possible output token to return.</summary>
protected int m_outputPos;

/// <summary>True once the input has reported that it has no more tokens.</summary>
protected bool m_end;

// True while the most recently read input token is still live in the attributes
// and has not yet been captured into a position's buffer.
private bool tokenPending;

// True between a call to InsertToken() and the moment NextToken() returns the inserted token.
private bool insertPending;
| |
| // LUCENENET specific - moved Position class to a non-generic class named LookaheadTokenFilter so we can refer to |
| // it without referring to the generic closing type. |
| |
/// <summary>
/// Wraps <paramref name="input"/>, creates the rolling buffer of positions and registers the
/// position-increment, position-length and offset attributes on this stream.
/// </summary>
/// <param name="input">The token stream to read from; forwarded to the base constructor.</param>
protected internal LookaheadTokenFilter(TokenStream input)
    : base(input)
{
    // NOTE(review): the buffer is handed this instance's NewPosition; if RollingBuffer calls the
    // factory while being constructed, NewPosition runs before a derived constructor body does -- confirm.
    this.m_positions = new RollingBufferAnonymousClass(this);

    // Registration order is deliberately left untouched.
    this.m_posIncAtt = this.AddAttribute<IPositionIncrementAttribute>();
    this.m_posLenAtt = this.AddAttribute<IPositionLengthAttribute>();
    this.m_offsetAtt = this.AddAttribute<IOffsetAttribute>();
}
| |
/// <summary>
/// Begins the insertion of a new token. Call this only from within <see cref="AfterPosition()"/>;
/// after it returns, set whichever token attributes the inserted token needs.
/// </summary>
/// <remarks>
/// If an input token is still pending, its state is captured into the position at
/// <see cref="m_inputPos"/> first. When assertions are enabled, a second call before the
/// previous insertion has been returned fails an assertion.
/// </remarks>
protected virtual void InsertToken()
{
    if (tokenPending)
    {
        // Buffer the live input token before the caller starts changing the attributes.
        T pendingPosition = m_positions.Get(m_inputPos);
        pendingPosition.Add(CaptureState());
        tokenPending = false;
    }

    if (Debugging.AssertsEnabled)
    {
        Debugging.Assert(!insertPending);
    }

    insertPending = true;
}
| |
/// <summary>
/// Hook invoked once all input tokens leaving a given position have been returned.
/// To inject a token starting from that position, override this method, call
/// <see cref="InsertToken()"/> and then set whichever token attributes you want.
/// </summary>
protected virtual void AfterPosition()
{
    // Default implementation injects nothing.
}
| |
/// <summary>Creates a new position instance; implemented by concrete filters.</summary>
/// <returns>A new <typeparamref name="T"/>.</returns>
protected abstract T NewPosition();

/// <summary>Rolling buffer of per-position state, addressed by position number.</summary>
protected readonly RollingBuffer<T> m_positions;

/// <summary>
/// <see cref="RollingBuffer{T}"/> whose elements come from the owning filter's
/// <see cref="NewPosition()"/>.
/// </summary>
private class RollingBufferAnonymousClass : RollingBuffer<T>
{
    // The filter that supplies new position instances.
    private readonly LookaheadTokenFilter<T> owner;

    /// <summary>
    /// Passes the filter's <see cref="NewPosition()"/> to the base class as its factory and
    /// remembers the filter for <see cref="NewInstance()"/>.
    /// </summary>
    /// <param name="outerInstance">The filter that owns this buffer.</param>
    public RollingBufferAnonymousClass(LookaheadTokenFilter<T> outerInstance)
        : base(outerInstance.NewPosition)
    {
        owner = outerInstance;
    }

    /// <summary>Delegates to the owning filter's <see cref="NewPosition()"/>.</summary>
    protected override T NewInstance() => owner.NewPosition();
}
| |
/// <summary>
/// Reads one more token from the input in order to look ahead.
/// </summary>
/// <remarks>
/// A token that was still pending is first captured into the position at <see cref="m_inputPos"/>.
/// When the input yields a token, <see cref="m_inputPos"/> is advanced by its position increment,
/// the start offset is recorded on the position the token leaves from and the end offset on the
/// position it arrives at (or, if already recorded, checked against it when assertions are
/// enabled), and the token becomes the pending token. When the input is exhausted,
/// <see cref="m_end"/> is set.
/// </remarks>
/// <returns><c>true</c> if there is a new token; otherwise <c>false</c>.</returns>
protected virtual bool PeekToken()
{
    if (DEBUG)
    {
        Console.WriteLine("LTF.peekToken inputPos=" + m_inputPos + " outputPos=" + m_outputPos + " tokenPending=" + tokenPending);
    }

    if (Debugging.AssertsEnabled)
    {
        Debugging.Assert(!m_end);
        Debugging.Assert(m_inputPos == -1 || m_outputPos <= m_inputPos);
    }

    if (tokenPending)
    {
        // The attributes are about to be overwritten by the input; buffer the live token first.
        T pendingPosition = m_positions.Get(m_inputPos);
        pendingPosition.Add(CaptureState());
        tokenPending = false;
    }

    bool hasNext = m_input.IncrementToken();
    if (DEBUG)
    {
        Console.WriteLine(" input.incrToken() returned " + hasNext);
    }

    if (!hasNext)
    {
        m_end = true;
        return false;
    }

    m_inputPos += m_posIncAtt.PositionIncrement;
    if (Debugging.AssertsEnabled)
    {
        Debugging.Assert(m_inputPos >= 0);
    }
    if (DEBUG)
    {
        Console.WriteLine(" now inputPos=" + m_inputPos);
    }

    Position departure = m_positions.Get(m_inputPos);
    Position arrival = m_positions.Get(m_inputPos + m_posLenAtt.PositionLength);

    int tokenStart = m_offsetAtt.StartOffset;
    if (departure.StartOffset != -1)
    {
        // Already recorded: make sure our input isn't messing up offsets.
        if (Debugging.AssertsEnabled)
        {
            Debugging.Assert(departure.StartOffset == tokenStart, "prev startOffset={0} vs new startOffset={1} inputPos={2}", departure.StartOffset, tokenStart, m_inputPos);
        }
    }
    else
    {
        departure.StartOffset = tokenStart;
    }

    int tokenEnd = m_offsetAtt.EndOffset;
    if (arrival.EndOffset != -1)
    {
        // Already recorded: make sure our input isn't messing up offsets.
        if (Debugging.AssertsEnabled)
        {
            Debugging.Assert(arrival.EndOffset == tokenEnd, "prev endOffset={0} vs new endOffset={1} inputPos={2}", arrival.EndOffset, tokenEnd, m_inputPos);
        }
    }
    else
    {
        arrival.EndOffset = tokenEnd;
    }

    tokenPending = true;
    return true;
}
| |
/// <summary>
/// Call this when you are done looking ahead; it sets up the next token to return.
/// Hand the returned boolean back to your caller.
/// </summary>
/// <remarks>
/// Starting at <see cref="m_outputPos"/>, each pass of the loop tries the following in order:
/// <list type="number">
/// <item><description>return the next token buffered at the current output position;</description></item>
/// <item><description>if output has not caught up with input, give <see cref="AfterPosition()"/> a chance to
/// insert a token (only for positions that had a token leaving), then advance to the next position;</description></item>
/// <item><description>if output has caught up, return the pending token directly, or read one more token from
/// the input, or -- at end of input -- give <see cref="AfterPosition()"/> a final chance to insert.</description></item>
/// </list>
/// </remarks>
/// <returns><c>true</c> if a token was set up to be returned; <c>false</c> if there are no more tokens.</returns>
protected virtual bool NextToken()
{
    if (DEBUG)
    {
        Console.WriteLine("LTF.nextToken inputPos=" + m_inputPos + " outputPos=" + m_outputPos + " tokenPending=" + tokenPending);
    }

    Position current = m_positions.Get(m_outputPos);

    // Loops because holes in the input may have to be skipped over.
    for (;;)
    {
        // 1) A previously buffered token at the current position?
        bool hasBuffered = current.NextRead < current.InputTokens.Count;
        if (hasBuffered)
        {
            if (DEBUG)
            {
                Console.WriteLine(" return previously buffered token");
            }

            if (tokenPending)
            {
                // Save the live input token before restoring a buffered one over it.
                T pendingPosition = m_positions.Get(m_inputPos);
                pendingPosition.Add(CaptureState());
                tokenPending = false;
            }

            T outputPosition = m_positions.Get(m_outputPos);
            RestoreState(outputPosition.NextState());
            return true;
        }

        // 2) Output is still behind input: finish this position and move on.
        bool caughtUp = m_inputPos == -1 || m_outputPos == m_inputPos;
        if (!caughtUp)
        {
            if (current.StartOffset != -1)
            {
                // This position had at least one token leaving.
                if (DEBUG)
                {
                    Console.WriteLine(" call afterPosition");
                }

                AfterPosition();
                if (insertPending)
                {
                    // Subclass inserted a token at this same position.
                    if (DEBUG)
                    {
                        Console.WriteLine(" return inserted token");
                    }
                    if (Debugging.AssertsEnabled)
                    {
                        Debugging.Assert(InsertedTokenConsistent());
                    }
                    insertPending = false;
                    return true;
                }
            }

            // Done with this position; move on.
            m_outputPos++;
            if (DEBUG)
            {
                Console.WriteLine(" next position: outputPos=" + m_outputPos);
            }
            m_positions.FreeBefore(m_outputPos);
            current = m_positions.Get(m_outputPos);
            continue;
        }

        // 3) No more buffered tokens, but input may still deliver tokens at this position.
        if (tokenPending)
        {
            // Fast path: hand back the token that was just read, without capture/restore.
            if (DEBUG)
            {
                Console.WriteLine(" pass-through: return pending token");
            }
            tokenPending = false;
            return true;
        }

        if (!m_end && PeekToken())
        {
            // A new input token arrived; go around again to decide how to return it.
            continue;
        }

        // Input is exhausted.
        if (DEBUG)
        {
            Console.WriteLine(" END");
        }

        AfterPosition();
        if (insertPending)
        {
            // Subclass inserted a token at this same position.
            if (DEBUG)
            {
                Console.WriteLine(" return inserted token");
            }
            if (Debugging.AssertsEnabled)
            {
                Debugging.Assert(InsertedTokenConsistent());
            }
            insertPending = false;
            return true;
        }

        return false;
    }
}
| |
// Sanity check for a token inserted by a subclass: the position it arrives at
// (output position + position length) must already have an end offset recorded -- i.e. the
// subclass looked ahead far enough -- and that offset must match the token's end offset.
// The checks run only when assertions are enabled; the method itself always returns true
// so that it can be wrapped in an assert.
private bool InsertedTokenConsistent()
{
    Position arrival = m_positions.Get(m_outputPos + m_posLenAtt.PositionLength);

    if (Debugging.AssertsEnabled)
    {
        Debugging.Assert(arrival.EndOffset != -1);
        Debugging.Assert(m_offsetAtt.EndOffset == arrival.EndOffset, "offsetAtt.endOffset={0} vs expected={1}", m_offsetAtt.EndOffset, arrival.EndOffset);
    }

    return true;
}
| |
| // TODO: end()? |
| // TODO: close()? |
| |
/// <summary>
/// Resets this filter so the stream can be consumed again: resets the base class and the
/// rolling buffer of positions, rewinds the input position to <c>-1</c> and the output
/// position to <c>0</c>, and clears the pending-token, pending-insert and end-of-input flags.
/// </summary>
public override void Reset()
{
    base.Reset();
    m_positions.Reset();
    m_inputPos = -1;
    m_outputPos = 0;
    tokenPending = false;
    // Also drop a half-finished insertion: if a previous run threw after InsertToken()
    // but before NextToken() cleared the flag, a stale 'true' would trip the
    // Assert(!insertPending) on the first insertion of the next run.
    insertPending = false;
    m_end = false;
}
| } |
| } |