blob: 78da520725e7e5183af3644f73a89559331f612a [file] [log] [blame]
using System;
using System.Collections.Generic;
using System.Diagnostics;
namespace Lucene.Net.Index
{
using ByteRunAutomaton = Lucene.Net.Util.Automaton.ByteRunAutomaton;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
using BytesRef = Lucene.Net.Util.BytesRef;
using CompiledAutomaton = Lucene.Net.Util.Automaton.CompiledAutomaton;
using IntsRef = Lucene.Net.Util.IntsRef;
using StringHelper = Lucene.Net.Util.StringHelper;
using Transition = Lucene.Net.Util.Automaton.Transition;
/// <summary>
/// A FilteredTermsEnum that enumerates terms based upon what is accepted by a
/// DFA.
/// <p>
/// The algorithm is such:
/// <ol>
/// <li>As long as matches are successful, keep reading sequentially.
/// <li>When a match fails, skip to the next string in lexicographic order that
/// does not enter a reject state.
/// </ol>
/// <p>
/// The algorithm does not attempt to actually skip to the next string that is
/// completely accepted. this is not possible when the language accepted by the
/// FSM is not finite (i.e. * operator).
/// </p>
/// @lucene.experimental
/// </summary>
internal class AutomatonTermsEnum : FilteredTermsEnum
{
// a tableized array-based form of the DFA
private readonly ByteRunAutomaton RunAutomaton;
// common suffix of the automaton
private readonly BytesRef CommonSuffixRef;
// true if the automaton accepts a finite language
private readonly bool? Finite;
// array of sorted transitions for each state, indexed by state number
private readonly Transition[][] AllTransitions;
// for path tracking: each long records gen when we last
// visited the state; we use gens to avoid having to clear
private readonly long[] Visited;
private long CurGen;
// the reference used for seeking forwards through the term dictionary
private readonly BytesRef SeekBytesRef = new BytesRef(10);
// true if we are enumerating an infinite portion of the DFA.
// in this case it is faster to drive the query based on the terms dictionary.
// when this is true, linearUpperBound indicate the end of range
// of terms where we should simply do sequential reads instead.
private bool Linear_Renamed = false;
private readonly BytesRef LinearUpperBound = new BytesRef(10);
private readonly IComparer<BytesRef> TermComp;
/// <summary>
/// Construct an enumerator based upon an automaton, enumerating the specified
/// field, working on a supplied TermsEnum
/// <p>
/// @lucene.experimental
/// <p> </summary>
/// <param name="compiled"> CompiledAutomaton </param>
public AutomatonTermsEnum(TermsEnum tenum, CompiledAutomaton compiled)
: base(tenum)
{
this.Finite = compiled.Finite;
this.RunAutomaton = compiled.RunAutomaton;
Debug.Assert(this.RunAutomaton != null);
this.CommonSuffixRef = compiled.CommonSuffixRef;
this.AllTransitions = compiled.SortedTransitions;
// used for path tracking, where each bit is a numbered state.
Visited = new long[RunAutomaton.Size];
TermComp = Comparator;
}
/// <summary>
/// Returns true if the term matches the automaton. Also stashes away the term
/// to assist with smart enumeration.
/// </summary>
protected internal override AcceptStatus Accept(BytesRef term)
{
if (CommonSuffixRef == null || StringHelper.EndsWith(term, CommonSuffixRef))
{
if (RunAutomaton.Run(term.Bytes, term.Offset, term.Length))
{
return Linear_Renamed ? AcceptStatus.YES : AcceptStatus.YES_AND_SEEK;
}
else
{
return (Linear_Renamed && TermComp.Compare(term, LinearUpperBound) < 0) ? AcceptStatus.NO : AcceptStatus.NO_AND_SEEK;
}
}
else
{
return (Linear_Renamed && TermComp.Compare(term, LinearUpperBound) < 0) ? AcceptStatus.NO : AcceptStatus.NO_AND_SEEK;
}
}
protected internal override BytesRef NextSeekTerm(BytesRef term)
{
//System.out.println("ATE.nextSeekTerm term=" + term);
if (term == null)
{
Debug.Assert(SeekBytesRef.Length == 0);
// return the empty term, as its valid
if (RunAutomaton.IsAccept(RunAutomaton.InitialState))
{
return SeekBytesRef;
}
}
else
{
SeekBytesRef.CopyBytes(term);
}
// seek to the next possible string;
if (NextString())
{
return SeekBytesRef; // reposition
}
else
{
return null; // no more possible strings can match
}
}
/// <summary>
/// Sets the enum to operate in linear fashion, as we have found
/// a looping transition at position: we set an upper bound and
/// act like a TermRangeQuery for this portion of the term space.
/// </summary>
private int Linear
{
set
{
Debug.Assert(Linear_Renamed == false);
int state = RunAutomaton.InitialState;
int maxInterval = 0xff;
for (int i = 0; i < value; i++)
{
state = RunAutomaton.Step(state, SeekBytesRef.Bytes[i] & 0xff);
Debug.Assert(state >= 0, "state=" + state);
}
for (int i = 0; i < AllTransitions[state].Length; i++)
{
Transition t = AllTransitions[state][i];
if (t.Min <= (SeekBytesRef.Bytes[value] & 0xff) && (SeekBytesRef.Bytes[value] & 0xff) <= t.Max)
{
maxInterval = t.Max;
break;
}
}
// 0xff terms don't get the optimization... not worth the trouble.
if (maxInterval != 0xff)
{
maxInterval++;
}
int length = value + 1; // value + maxTransition
if (LinearUpperBound.Bytes.Length < length)
{
LinearUpperBound.Bytes = new byte[length];
}
Array.Copy(SeekBytesRef.Bytes, 0, LinearUpperBound.Bytes, 0, value);
LinearUpperBound.Bytes[value] = (byte)maxInterval;
LinearUpperBound.Length = length;
Linear_Renamed = true;
}
}
private readonly IntsRef SavedStates = new IntsRef(10);
/// <summary>
/// Increments the byte buffer to the next String in binary order after s that will not put
/// the machine into a reject state. If such a string does not exist, returns
/// false.
///
/// The correctness of this method depends upon the automaton being deterministic,
/// and having no transitions to dead states.
/// </summary>
/// <returns> true if more possible solutions exist for the DFA </returns>
private bool NextString()
{
int state;
int pos = 0;
SavedStates.Grow(SeekBytesRef.Length + 1);
int[] states = SavedStates.Ints;
states[0] = RunAutomaton.InitialState;
while (true)
{
CurGen++;
Linear_Renamed = false;
// walk the automaton until a character is rejected.
for (state = states[pos]; pos < SeekBytesRef.Length; pos++)
{
Visited[state] = CurGen;
int nextState = RunAutomaton.Step(state, SeekBytesRef.Bytes[pos] & 0xff);
if (nextState == -1)
{
break;
}
states[pos + 1] = nextState;
// we found a loop, record it for faster enumeration
if ((Finite == false) && !Linear_Renamed && Visited[nextState] == CurGen)
{
Linear = pos;
}
state = nextState;
}
// take the useful portion, and the last non-reject state, and attempt to
// append characters that will match.
if (NextString(state, pos))
{
return true;
} // no more solutions exist from this useful portion, backtrack
else
{
if ((pos = Backtrack(pos)) < 0) // no more solutions at all
{
return false;
}
int newState = RunAutomaton.Step(states[pos], SeekBytesRef.Bytes[pos] & 0xff);
if (newState >= 0 && RunAutomaton.IsAccept(newState))
/* String is good to go as-is */
{
return true;
}
/* else advance further */
// TODO: paranoia? if we backtrack thru an infinite DFA, the loop detection is important!
// for now, restart from scratch for all infinite DFAs
if (Finite == false)
{
pos = 0;
}
}
}
}
/// <summary>
/// Returns the next String in lexicographic order that will not put
/// the machine into a reject state.
///
/// this method traverses the DFA from the given position in the String,
/// starting at the given state.
///
/// If this cannot satisfy the machine, returns false. this method will
/// walk the minimal path, in lexicographic order, as long as possible.
///
/// If this method returns false, then there might still be more solutions,
/// it is necessary to backtrack to find out.
/// </summary>
/// <param name="state"> current non-reject state </param>
/// <param name="position"> useful portion of the string </param>
/// <returns> true if more possible solutions exist for the DFA from this
/// position </returns>
private bool NextString(int state, int position)
{
/*
* the next lexicographic character must be greater than the existing
* character, if it exists.
*/
int c = 0;
if (position < SeekBytesRef.Length)
{
c = SeekBytesRef.Bytes[position] & 0xff;
// if the next byte is 0xff and is not part of the useful portion,
// then by definition it puts us in a reject state, and therefore this
// path is dead. there cannot be any higher transitions. backtrack.
if (c++ == 0xff)
{
return false;
}
}
SeekBytesRef.Length = position;
Visited[state] = CurGen;
Transition[] transitions = AllTransitions[state];
// find the minimal path (lexicographic order) that is >= c
for (int i = 0; i < transitions.Length; i++)
{
Transition transition = transitions[i];
if (transition.Max >= c)
{
int nextChar = Math.Max(c, transition.Min);
// append either the next sequential char, or the minimum transition
SeekBytesRef.Grow(SeekBytesRef.Length + 1);
SeekBytesRef.Length++;
SeekBytesRef.Bytes[SeekBytesRef.Length - 1] = (byte)nextChar;
state = transition.Dest.Number;
/*
* as long as is possible, continue down the minimal path in
* lexicographic order. if a loop or accept state is encountered, stop.
*/
while (Visited[state] != CurGen && !RunAutomaton.IsAccept(state))
{
Visited[state] = CurGen;
/*
* Note: we work with a DFA with no transitions to dead states.
* so the below is ok, if it is not an accept state,
* then there MUST be at least one transition.
*/
transition = AllTransitions[state][0];
state = transition.Dest.Number;
// append the minimum transition
SeekBytesRef.Grow(SeekBytesRef.Length + 1);
SeekBytesRef.Length++;
SeekBytesRef.Bytes[SeekBytesRef.Length - 1] = (byte)transition.Min;
// we found a loop, record it for faster enumeration
if ((Finite == false) && !Linear_Renamed && Visited[state] == CurGen)
{
Linear = SeekBytesRef.Length - 1;
}
}
return true;
}
}
return false;
}
/// <summary>
/// Attempts to backtrack thru the string after encountering a dead end
/// at some given position. Returns false if no more possible strings
/// can match.
/// </summary>
/// <param name="position"> current position in the input String </param>
/// <returns> position >=0 if more possible solutions exist for the DFA </returns>
private int Backtrack(int position)
{
while (position-- > 0)
{
int nextChar = SeekBytesRef.Bytes[position] & 0xff;
// if a character is 0xff its a dead-end too,
// because there is no higher character in binary sort order.
if (nextChar++ != 0xff)
{
SeekBytesRef.Bytes[position] = (byte)nextChar;
SeekBytesRef.Length = position + 1;
return position;
}
}
return -1; // all solutions exhausted
}
}
}