| using Lucene.Net.Diagnostics; |
| using System; |
| using System.Collections.Generic; |
| using System.Diagnostics; |
| |
| namespace Lucene.Net.Index |
| { |
| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| using ByteRunAutomaton = Lucene.Net.Util.Automaton.ByteRunAutomaton; |
| using BytesRef = Lucene.Net.Util.BytesRef; |
| using CompiledAutomaton = Lucene.Net.Util.Automaton.CompiledAutomaton; |
| using Int32sRef = Lucene.Net.Util.Int32sRef; |
| using StringHelper = Lucene.Net.Util.StringHelper; |
| using Transition = Lucene.Net.Util.Automaton.Transition; |
| |
| /// <summary> |
| /// A <see cref="FilteredTermsEnum"/> that enumerates terms based upon what is accepted by a |
| /// DFA. |
| /// <para/> |
| /// The algorithm is such: |
| /// <list type="number"> |
| /// <item><description>As long as matches are successful, keep reading sequentially.</description></item> |
| /// <item><description>When a match fails, skip to the next string in lexicographic order that |
| /// does not enter a reject state.</description></item> |
| /// </list> |
| /// <para> |
| /// The algorithm does not attempt to actually skip to the next string that is |
| /// completely accepted. this is not possible when the language accepted by the |
| /// FSM is not finite (i.e. * operator). |
| /// </para> |
| /// @lucene.experimental |
| /// </summary> |
| internal class AutomatonTermsEnum : FilteredTermsEnum |
| { |
| // a tableized array-based form of the DFA |
| private readonly ByteRunAutomaton runAutomaton; |
| |
| // common suffix of the automaton |
| private readonly BytesRef commonSuffixRef; |
| |
| // true if the automaton accepts a finite language |
| private readonly bool? finite; |
| |
| // array of sorted transitions for each state, indexed by state number |
| private readonly Transition[][] allTransitions; |
| |
| // for path tracking: each long records gen when we last |
| // visited the state; we use gens to avoid having to clear |
| private readonly long[] visited; |
| |
| private long curGen; |
| |
| // the reference used for seeking forwards through the term dictionary |
| private readonly BytesRef seekBytesRef = new BytesRef(10); |
| |
| // true if we are enumerating an infinite portion of the DFA. |
| // in this case it is faster to drive the query based on the terms dictionary. |
| // when this is true, linearUpperBound indicate the end of range |
| // of terms where we should simply do sequential reads instead. |
| private bool linear = false; |
| |
| private readonly BytesRef linearUpperBound = new BytesRef(10); |
| private readonly IComparer<BytesRef> termComp; |
| |
| /// <summary> |
| /// Construct an enumerator based upon an automaton, enumerating the specified |
| /// field, working on a supplied <see cref="TermsEnum"/> |
| /// <para/> |
| /// @lucene.experimental |
| /// </summary> |
| /// <param name="tenum"> TermsEnum </param> |
| /// <param name="compiled"> CompiledAutomaton </param> |
| public AutomatonTermsEnum(TermsEnum tenum, CompiledAutomaton compiled) |
| : base(tenum) |
| { |
| this.finite = compiled.Finite; |
| this.runAutomaton = compiled.RunAutomaton; |
| if (Debugging.AssertsEnabled) Debugging.Assert(this.runAutomaton != null); |
| this.commonSuffixRef = compiled.CommonSuffixRef; |
| this.allTransitions = compiled.SortedTransitions; |
| |
| // used for path tracking, where each bit is a numbered state. |
| visited = new long[runAutomaton.Count]; |
| |
| termComp = Comparer; |
| } |
| |
| /// <summary> |
| /// Returns <c>true</c> if the term matches the automaton. Also stashes away the term |
| /// to assist with smart enumeration. |
| /// </summary> |
| protected override AcceptStatus Accept(BytesRef term) |
| { |
| if (commonSuffixRef == null || StringHelper.EndsWith(term, commonSuffixRef)) |
| { |
| if (runAutomaton.Run(term.Bytes, term.Offset, term.Length)) |
| { |
| return linear ? AcceptStatus.YES : AcceptStatus.YES_AND_SEEK; |
| } |
| else |
| { |
| return (linear && termComp.Compare(term, linearUpperBound) < 0) ? AcceptStatus.NO : AcceptStatus.NO_AND_SEEK; |
| } |
| } |
| else |
| { |
| return (linear && termComp.Compare(term, linearUpperBound) < 0) ? AcceptStatus.NO : AcceptStatus.NO_AND_SEEK; |
| } |
| } |
| |
| protected override BytesRef NextSeekTerm(BytesRef term) |
| { |
| //System.out.println("ATE.nextSeekTerm term=" + term); |
| if (term == null) |
| { |
| if (Debugging.AssertsEnabled) Debugging.Assert(seekBytesRef.Length == 0); |
| // return the empty term, as its valid |
| if (runAutomaton.IsAccept(runAutomaton.InitialState)) |
| { |
| return seekBytesRef; |
| } |
| } |
| else |
| { |
| seekBytesRef.CopyBytes(term); |
| } |
| |
| // seek to the next possible string; |
| if (NextString()) |
| { |
| return seekBytesRef; // reposition |
| } |
| else |
| { |
| return null; // no more possible strings can match |
| } |
| } |
| |
| /// <summary> |
| /// Sets the enum to operate in linear fashion, as we have found |
| /// a looping transition at position: we set an upper bound and |
| /// act like a <see cref="Search.TermRangeQuery"/> for this portion of the term space. |
| /// </summary> |
| private void SetLinear(int position) |
| { |
| if (Debugging.AssertsEnabled) Debugging.Assert(linear == false); |
| |
| int state = runAutomaton.InitialState; |
| int maxInterval = 0xff; |
| for (int i = 0; i < position; i++) |
| { |
| state = runAutomaton.Step(state, seekBytesRef.Bytes[i] & 0xff); |
| if (Debugging.AssertsEnabled) Debugging.Assert(state >= 0, () => "state=" + state); |
| } |
| for (int i = 0; i < allTransitions[state].Length; i++) |
| { |
| Transition t = allTransitions[state][i]; |
| if (t.Min <= (seekBytesRef.Bytes[position] & 0xff) && (seekBytesRef.Bytes[position] & 0xff) <= t.Max) |
| { |
| maxInterval = t.Max; |
| break; |
| } |
| } |
| // 0xff terms don't get the optimization... not worth the trouble. |
| if (maxInterval != 0xff) |
| { |
| maxInterval++; |
| } |
| int length = position + 1; // value + maxTransition |
| if (linearUpperBound.Bytes.Length < length) |
| { |
| linearUpperBound.Bytes = new byte[length]; |
| } |
| Array.Copy(seekBytesRef.Bytes, 0, linearUpperBound.Bytes, 0, position); |
| linearUpperBound.Bytes[position] = (byte)maxInterval; |
| linearUpperBound.Length = length; |
| |
| linear = true; |
| } |
| |
| private readonly Int32sRef savedStates = new Int32sRef(10); |
| |
| /// <summary> |
| /// Increments the byte buffer to the next string in binary order after s that will not put |
| /// the machine into a reject state. If such a string does not exist, returns |
| /// <c>false</c>. |
| /// <para/> |
| /// The correctness of this method depends upon the automaton being deterministic, |
| /// and having no transitions to dead states. |
| /// </summary> |
| /// <returns> <c>true</c> if more possible solutions exist for the DFA </returns> |
| private bool NextString() |
| { |
| int state; |
| int pos = 0; |
| savedStates.Grow(seekBytesRef.Length + 1); |
| int[] states = savedStates.Int32s; |
| states[0] = runAutomaton.InitialState; |
| |
| while (true) |
| { |
| curGen++; |
| linear = false; |
| // walk the automaton until a character is rejected. |
| for (state = states[pos]; pos < seekBytesRef.Length; pos++) |
| { |
| visited[state] = curGen; |
| int nextState = runAutomaton.Step(state, seekBytesRef.Bytes[pos] & 0xff); |
| if (nextState == -1) |
| { |
| break; |
| } |
| states[pos + 1] = nextState; |
| // we found a loop, record it for faster enumeration |
| if ((finite == false) && !linear && visited[nextState] == curGen) |
| { |
| SetLinear(pos); |
| } |
| state = nextState; |
| } |
| |
| // take the useful portion, and the last non-reject state, and attempt to |
| // append characters that will match. |
| if (NextString(state, pos)) |
| { |
| return true; |
| } // no more solutions exist from this useful portion, backtrack |
| else |
| { |
| if ((pos = Backtrack(pos)) < 0) // no more solutions at all |
| { |
| return false; |
| } |
| int newState = runAutomaton.Step(states[pos], seekBytesRef.Bytes[pos] & 0xff); |
| if (newState >= 0 && runAutomaton.IsAccept(newState)) |
| /* String is good to go as-is */ |
| { |
| return true; |
| } |
| /* else advance further */ |
| // TODO: paranoia? if we backtrack thru an infinite DFA, the loop detection is important! |
| // for now, restart from scratch for all infinite DFAs |
| if (finite == false) |
| { |
| pos = 0; |
| } |
| } |
| } |
| } |
| |
| /// <summary> |
| /// Returns the next string in lexicographic order that will not put |
| /// the machine into a reject state. |
| /// <para/> |
| /// This method traverses the DFA from the given position in the string, |
| /// starting at the given state. |
| /// <para/> |
| /// If this cannot satisfy the machine, returns <c>false</c>. This method will |
| /// walk the minimal path, in lexicographic order, as long as possible. |
| /// <para/> |
| /// If this method returns <c>false</c>, then there might still be more solutions, |
| /// it is necessary to backtrack to find out. |
| /// </summary> |
| /// <param name="state"> current non-reject state </param> |
| /// <param name="position"> useful portion of the string </param> |
| /// <returns> <c>true</c> if more possible solutions exist for the DFA from this |
| /// position </returns> |
| private bool NextString(int state, int position) |
| { |
| /* |
| * the next lexicographic character must be greater than the existing |
| * character, if it exists. |
| */ |
| int c = 0; |
| if (position < seekBytesRef.Length) |
| { |
| c = seekBytesRef.Bytes[position] & 0xff; |
| // if the next byte is 0xff and is not part of the useful portion, |
| // then by definition it puts us in a reject state, and therefore this |
| // path is dead. there cannot be any higher transitions. backtrack. |
| if (c++ == 0xff) |
| { |
| return false; |
| } |
| } |
| |
| seekBytesRef.Length = position; |
| visited[state] = curGen; |
| |
| Transition[] transitions = allTransitions[state]; |
| |
| // find the minimal path (lexicographic order) that is >= c |
| |
| for (int i = 0; i < transitions.Length; i++) |
| { |
| Transition transition = transitions[i]; |
| if (transition.Max >= c) |
| { |
| int nextChar = Math.Max(c, transition.Min); |
| // append either the next sequential char, or the minimum transition |
| seekBytesRef.Grow(seekBytesRef.Length + 1); |
| seekBytesRef.Length++; |
| seekBytesRef.Bytes[seekBytesRef.Length - 1] = (byte)nextChar; |
| state = transition.Dest.Number; |
| /* |
| * as long as is possible, continue down the minimal path in |
| * lexicographic order. if a loop or accept state is encountered, stop. |
| */ |
| while (visited[state] != curGen && !runAutomaton.IsAccept(state)) |
| { |
| visited[state] = curGen; |
| /* |
| * Note: we work with a DFA with no transitions to dead states. |
| * so the below is ok, if it is not an accept state, |
| * then there MUST be at least one transition. |
| */ |
| transition = allTransitions[state][0]; |
| state = transition.Dest.Number; |
| |
| // append the minimum transition |
| seekBytesRef.Grow(seekBytesRef.Length + 1); |
| seekBytesRef.Length++; |
| seekBytesRef.Bytes[seekBytesRef.Length - 1] = (byte)transition.Min; |
| |
| // we found a loop, record it for faster enumeration |
| if ((finite == false) && !linear && visited[state] == curGen) |
| { |
| SetLinear(seekBytesRef.Length - 1); |
| } |
| } |
| return true; |
| } |
| } |
| return false; |
| } |
| |
| /// <summary> |
| /// Attempts to backtrack thru the string after encountering a dead end |
| /// at some given position. Returns <c>false</c> if no more possible strings |
| /// can match. |
| /// </summary> |
| /// <param name="position"> current position in the input string </param> |
| /// <returns> position >=0 if more possible solutions exist for the DFA </returns> |
| private int Backtrack(int position) |
| { |
| while (position-- > 0) |
| { |
| int nextChar = seekBytesRef.Bytes[position] & 0xff; |
| // if a character is 0xff its a dead-end too, |
| // because there is no higher character in binary sort order. |
| if (nextChar++ != 0xff) |
| { |
| seekBytesRef.Bytes[position] = (byte)nextChar; |
| seekBytesRef.Length = position + 1; |
| return position; |
| } |
| } |
| return -1; // all solutions exhausted |
| } |
| } |
| } |