lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestFlattenGraphFilter.java - lucene-solr - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 package org.apache.lucene.analysis.core;

 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Comparator;
 import java.util.LinkedList;
 import java.util.List;
 import java.util.Random;
 import java.util.stream.Collectors;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.AutomatonToTokenStream;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.CannedTokenStream;
 import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.StopFilter;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.TokenStreamToAutomaton;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.synonym.SynonymGraphFilter;
 import org.apache.lucene.analysis.synonym.SynonymMap;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.CharsRef;
 import org.apache.lucene.util.CharsRefBuilder;
 import org.apache.lucene.util.automaton.Automaton;
 import org.apache.lucene.util.automaton.DaciukMihovAutomatonBuilder;
 import org.apache.lucene.util.automaton.Operations;
 import org.apache.lucene.util.automaton.Transition;

 public class TestFlattenGraphFilter extends BaseTokenStreamTestCase {

   private static Token token(String term, int posInc, int posLength, int startOffset, int endOffset) {
     final Token t = new Token(term, startOffset, endOffset);
     t.setPositionIncrement(posInc);
     t.setPositionLength(posLength);
     return t;
   }

   public void testSimpleMock() throws Exception {
     Analyzer a = new Analyzer() {
         @Override
         protected TokenStreamComponents createComponents(String fieldName) {
           Tokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE, true);
           TokenStream ts = new FlattenGraphFilter(tokenizer);
           return new TokenStreamComponents(tokenizer, ts);
         }
       };

     assertAnalyzesTo(a, "wtf happened",
                      new String[] {"wtf", "happened"},
                      new int[]    {    0,          4},
                      new int[]    {    3,         12},
                      null,
                      new int[]    {    1,          1},
                      new int[]    {    1,          1},
                      true);
   }

   // Make sure graph is unchanged if it's already flat
   public void testAlreadyFlatten() throws Exception {
     TokenStream in = new CannedTokenStream(0, 12, new Token[] {
         token("wtf", 1, 1, 0, 3),
         token("what", 0, 1, 0, 3),
         token("wow", 0, 1, 0, 3),
         token("the", 1, 1, 0, 3),
         token("that's", 0, 1, 0, 3),
         token("fudge", 1, 1, 0, 3),
         token("funny", 0, 1, 0, 3),
         token("happened", 1, 1, 4, 12)
       });

     TokenStream out = new FlattenGraphFilter(in);

     // ... but on output, it's flattened to wtf/what/wow that's/the fudge/funny happened:
     assertTokenStreamContents(out,
                               new String[] {"wtf", "what", "wow", "the", "that's", "fudge", "funny", "happened"},
                               new int[] {0, 0, 0, 0, 0, 0, 0, 4},
                               new int[] {3, 3, 3, 3, 3, 3, 3, 12},
                               new int[] {1, 0, 0, 1, 0, 1, 0, 1},
                               new int[] {1, 1, 1, 1, 1, 1, 1, 1},
                               12);
   }

   public void testWTF1() throws Exception {

     // "wow that's funny" and "what the fudge" are separate side paths, in parallel with "wtf", on input:
     TokenStream in = new CannedTokenStream(0, 12, new Token[] {
         token("wtf", 1, 5, 0, 3),
         token("what", 0, 1, 0, 3),
         token("wow", 0, 3, 0, 3),
         token("the", 1, 1, 0, 3),
         token("fudge", 1, 3, 0, 3),
         token("that's", 1, 1, 0, 3),
         token("funny", 1, 1, 0, 3),
         token("happened", 1, 1, 4, 12)
       });


     TokenStream out = new FlattenGraphFilter(in);

     // ... but on output, it's flattened to wtf/what/wow that's/the fudge/funny happened:
     assertTokenStreamContents(out,
                               new String[] {"wtf", "what", "wow", "the", "that's", "fudge", "funny", "happened"},
                               new int[] {0, 0, 0, 0, 0, 0, 0, 4},
                               new int[] {3, 3, 3, 3, 3, 3, 3, 12},
                               new int[] {1, 0, 0, 1, 0, 1, 0, 1},
                               new int[] {3, 1, 1, 1, 1, 1, 1, 1},
                               12);

   }

   /** Same as testWTF1 except the "wtf" token comes out later */
   public void testWTF2() throws Exception {

     // "wow that's funny" and "what the fudge" are separate side paths, in parallel with "wtf", on input:
     TokenStream in = new CannedTokenStream(0, 12, new Token[] {
         token("what", 1, 1, 0, 3),
         token("wow", 0, 3, 0, 3),
         token("wtf", 0, 5, 0, 3),
         token("the", 1, 1, 0, 3),
         token("fudge", 1, 3, 0, 3),
         token("that's", 1, 1, 0, 3),
         token("funny", 1, 1, 0, 3),
         token("happened", 1, 1, 4, 12)
       });


     TokenStream out = new FlattenGraphFilter(in);

     // ... but on output, it's flattened to wtf/what/wow that's/the fudge/funny happened:
     assertTokenStreamContents(out,
                               new String[] {"what", "wow", "wtf", "the", "that's", "fudge", "funny", "happened"},
                               new int[] {0, 0, 0, 0, 0, 0, 0, 4},
                               new int[] {3, 3, 3, 3, 3, 3, 3, 12},
                               new int[] {1, 0, 0, 1, 0, 1, 0, 1},
                               new int[] {1, 1, 3, 1, 1, 1, 1, 1},
                               12);

   }

   public void testNonGreedySynonyms() throws Exception {
     // This is just "hypothetical" for Lucene today, because SynFilter is
     // greedy: when two syn rules match on overlapping tokens, only one
     // (greedily) wins.  This test pretends all syn matches could match:

     TokenStream in = new CannedTokenStream(0, 20, new Token[] {
         token("wizard", 1, 1, 0, 6),
         token("wizard_of_oz", 0, 3, 0, 12),
         token("of", 1, 1, 7, 9),
         token("oz", 1, 1, 10, 12),
         token("oz_screams", 0, 2, 10, 20),
         token("screams", 1, 1, 13, 20),
       });


     TokenStream out = new FlattenGraphFilter(in);

     // ... but on output, it's flattened to wtf/what/wow that's/the fudge/funny happened:
     assertTokenStreamContents(out,
                               new String[] {"wizard", "wizard_of_oz", "of", "oz", "oz_screams", "screams"},
                               new int[] {0, 0, 7, 10, 10, 13},
                               new int[] {6, 12, 9, 12, 20, 20},
                               new int[] {1, 0, 1, 1, 0, 1},
                               new int[] {1, 3, 1, 1, 2, 1},
                               20);

   }

   public void testNonGraph() throws Exception {
     TokenStream in = new CannedTokenStream(0, 22, new Token[] {
         token("hello", 1, 1, 0, 5),
         token("pseudo", 1, 1, 6, 12),
         token("world", 1, 1, 13, 18),
         token("fun", 1, 1, 19, 22),
       });


     TokenStream out = new FlattenGraphFilter(in);

     // ... but on output, it's flattened to wtf/what/wow that's/the fudge/funny happened:
     assertTokenStreamContents(out,
                               new String[] {"hello", "pseudo", "world", "fun"},
                               new int[] {0, 6, 13, 19},
                               new int[] {5, 12, 18, 22},
                               new int[] {1, 1, 1, 1},
                               new int[] {1, 1, 1, 1},
                               22);
   }

   public void testSimpleHole() throws Exception {
     TokenStream in = new CannedTokenStream(0, 13, new Token[] {
         token("hello", 1, 1, 0, 5),
         token("hole", 2, 1, 6, 10),
         token("fun", 1, 1, 11, 13),
       });


     TokenStream out = new FlattenGraphFilter(in);

     // ... but on output, it's flattened to wtf/what/wow that's/the fudge/funny happened:
     assertTokenStreamContents(out,
                               new String[] {"hello", "hole", "fun"},
                               new int[] {0, 6, 11},
                               new int[] {5, 10, 13},
                               new int[] {1, 2, 1},
                               new int[] {1, 1, 1},
                               13);
   }

   public void testHoleUnderSyn() throws Exception {
     // Tests a StopFilter after SynFilter where a stopword in a syn is removed
     //
     //   wizard of oz -> woz syn, but then "of" becomes a hole

     TokenStream in = new CannedTokenStream(0, 12, new Token[] {
         token("wizard", 1, 1, 0, 6),
         token("woz", 0, 3, 0, 12),
         token("oz", 2, 1, 10, 12),
       });


     TokenStream out = new FlattenGraphFilter(in);

     assertTokenStreamContents(out,
                               new String[] {"wizard", "woz", "oz"},
                               new int[] {0, 0, 10},
                               new int[] {6, 12, 12},
                               new int[] {1, 0, 2},
                               new int[] {1, 3, 1},
                               12);
   }

   public void testStrangelyNumberedNodes() throws Exception {

     // Uses only nodes 0, 2, 3, i.e. 1 is just never used (it is not a hole!!)
     TokenStream in = new CannedTokenStream(0, 27, new Token[] {
         token("dog", 1, 3, 0, 5),
         token("puppy", 0, 3, 0, 5),
         token("flies", 3, 1, 6, 11),
       });

     TokenStream out = new FlattenGraphFilter(in);

     assertTokenStreamContents(out,
                               new String[] {"dog", "puppy", "flies"},
                               new int[] {0, 0, 6},
                               new int[] {5, 5, 11},
                               new int[] {1, 0, 1},
                               new int[] {1, 1, 1},
                               27);
   }

   public void testTwoLongParallelPaths() throws Exception {

     // "a a a a a a" in parallel with "b b b b b b"
     TokenStream in = new CannedTokenStream(0, 11, new Token[] {
         token("a", 1, 1, 0, 1),
         token("b", 0, 2, 0, 1),
         token("a", 1, 2, 2, 3),
         token("b", 1, 2, 2, 3),
         token("a", 1, 2, 4, 5),
         token("b", 1, 2, 4, 5),
         token("a", 1, 2, 6, 7),
         token("b", 1, 2, 6, 7),
         token("a", 1, 2, 8, 9),
         token("b", 1, 2, 8, 9),
         token("a", 1, 2, 10, 11),
         token("b", 1, 2, 10, 11),
       });


     TokenStream out = new FlattenGraphFilter(in);

     // ... becomes flattened to a single path with overlapping a/b token between each node:
     assertTokenStreamContents(out,
                               new String[] {"a", "b", "a", "b", "a", "b", "a", "b", "a", "b", "a", "b"},
                               new int[] {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10},
                               new int[] {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11},
                               new int[] {1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0},
                               new int[] {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
                               11);

   }

   // b has a posInc of 1, which is correct, but no edge ever visited that node.
   // After hole recovery 'b' and 'c' should still be under 'abc'
   // assert disabled = pos length of abc = 4
   // assert enabled = AssertionError: outputEndNode=3 vs inputTo=2
   public void testAltPathFirstStepHole() throws Exception {
     TokenStream in =
         new CannedTokenStream(
             0,
             3,
             new Token[] {token("abc", 1, 3, 0, 3), token("b", 1, 1, 1, 2), token("c", 1, 1, 2, 3)});

     TokenStream out = new FlattenGraphFilter(in);

     assertTokenStreamContents(
         out,
         new String[] {"abc", "b", "c"},
         new int[] {0, 1, 2},
         new int[] {3, 2, 3},
         new int[] {1, 1, 1},
         new int[] {3, 1, 1},
         3);
   }

   // Last node in an alt path fixes outputnode of long path. In this graph the follow up node fixes
   // that.
   // incorrect pos length of abc = 1
   public void testAltPathLastStepHole() throws Exception {
     TokenStream in =
         new CannedTokenStream(
             0,
             4,
             new Token[] {
               token("abc", 1, 3, 0, 3),
               token("a", 0, 1, 0, 1),
               token("b", 1, 1, 1, 2),
               token("d", 2, 1, 3, 4)
             });

     TokenStream out = new FlattenGraphFilter(in);

     assertTokenStreamContents(
         out,
         new String[] {"abc", "a", "b", "d"},
         new int[] {0, 0, 1, 3},
         new int[] {1, 1, 2, 4},
         new int[] {1, 0, 1, 2},
         new int[] {3, 1, 1, 1},
         4);
   }

   // Check to see how multiple holes in a row are preserved.
   public void testLongHole() throws Exception {
     TokenStream in =
         new CannedTokenStream(
             0,
             28,
             new Token[] {
               token("hello", 1, 1, 0, 5), token("hole", 5, 1, 20, 24), token("fun", 1, 1, 25, 28),
             });

     TokenStream out = new FlattenGraphFilter(in);

     assertTokenStreamContents(
         out,
         new String[] {"hello", "hole", "fun"},
         new int[] {0, 20, 25},
         new int[] {5, 24, 28},
         new int[] {1, 2, 1},
         new int[] {1, 1, 1},
         28);
   }

   // multiple nodes missing in the alt path.
   // assert disabled = nothing
   // assert enabled = AssertionError
   public void testAltPathLastStepLongHole() throws Exception {
     TokenStream in =
         new CannedTokenStream(
             0,
             4,
             new Token[] {token("abc", 1, 3, 0, 3), token("a", 0, 1, 0, 1), token("d", 3, 1, 3, 4)});

     TokenStream out = new FlattenGraphFilter(in);

     assertTokenStreamContents(
         out,
         new String[] {"abc", "a", "d"},
         new int[] {0, 0, 3},
         new int[] {1, 1, 4},
         new int[] {1, 0, 2},
         new int[] {2, 1, 1},
         4);
   }

   // LUCENE-8723
   // Token stream ends without any edge to fix the long edge's output node
   // assert disabled = dropped token
   // assert enabled = AssertionError: 2
   public void testAltPathLastStepHoleWithoutEndToken() throws Exception {
     TokenStream in =
         new CannedTokenStream(
             0,
             2,
             new Token[] {token("abc", 1, 3, 0, 3), token("a", 0, 1, 0, 1), token("b", 1, 1, 1, 2)});

     TokenStream out = new FlattenGraphFilter(in);

     assertTokenStreamContents(
         out,
         new String[] {"abc", "a", "b"},
         new int[] {0, 0, 1},
         new int[] {1, 1, 2},
         new int[] {1, 0, 1},
         new int[] {1, 1, 1},
         2);
   }

   // similar to AltPathLastStepHoleWithoutEndToken, but instead of no token to trigger long path
   // resolution,
   // the next token has no way to reference to the long path so we have to resolve as if that last
   // token wasn't present.
   public void testAltPathLastStepHoleFollowedByHole() throws Exception {
     TokenStream in =
         new CannedTokenStream(
             0,
             5,
             new Token[] {token("abc", 1, 3, 0, 3), token("b", 1, 1, 1, 2), token("e", 3, 1, 4, 5)});

     TokenStream out = new FlattenGraphFilter(in);

     assertTokenStreamContents(
         out,
         new String[] {"abc", "b", "e"},
         new int[] {0, 1, 4},
         new int[] {3, 2, 5},
         new int[] {1, 1, 2},
         new int[] {1, 1, 1},
         5);
   }

   // Two Shingled long paths pass each other which gives a flattened graph with tokens backing up a
   // lot.
   public void testShingledGap() throws Exception {
     TokenStream in =
         new CannedTokenStream(
             0,
             5,
             new Token[] {
               token("abc", 1, 3, 0, 3),
               token("a", 0, 1, 0, 1),
               token("b", 1, 1, 1, 2),
               token("cde", 1, 3, 2, 5),
               token("d", 1, 1, 3, 4),
               token("e", 1, 1, 4, 5)
             });

     TokenStream out = new FlattenGraphFilter(in);

     assertTokenStreamContents(
         out,
         new String[] {"abc", "a", "d", "b", "cde", "e"},
         new int[] {0, 0, 3, 3, 4, 4},
         new int[] {1, 1, 3, 3, 5, 5},
         new int[] {1, 0, 1, 0, 1, 0},
         new int[] {1, 1, 1, 1, 1, 1},
         5);
   }

   // With shingles, token order may change during flattening.
   // We need to be careful not to free input nodes if they still have unreleased edges.
   // with/without exceptions ArrayIndexOutOfBoundsException
   public void testShingledGapWithHoles() throws Exception {
     TokenStream in =
         new CannedTokenStream(
             0,
             5,
             new Token[] {
               token("abc", 1, 3, 0, 3),
               token("b", 1, 1, 1, 2),
               token("cde", 1, 3, 2, 5),
               token("d", 1, 1, 3, 4),
               token("e", 1, 1, 4, 5)
             });

     TokenStream out = new FlattenGraphFilter(in);

     assertTokenStreamContents(
         out,
         new String[] {"abc", "d", "b", "cde", "e"},
         new int[] {0, 3, 3, 4, 4},
         new int[] {3, 3, 3, 5, 5},
         new int[] {1, 1, 0, 1, 0},
         new int[] {1, 1, 1, 1, 1},
         5);
   }

   // When the first token is a hole there is no original token to offset from.
   public void testFirstTokenHole() throws Exception {
     TokenStream in = new CannedTokenStream(0, 9, new Token[] {token("start", 2, 1, 0, 5)});
     TokenStream out = new FlattenGraphFilter(in);

     assertTokenStreamContents(
         out, new String[] {"start"}, new int[] {0}, new int[] {5}, new int[] {2}, new int[] {1}, 9);
   }

   // The singled token starts from a hole.
   // Hole recovery will cause the shingled token to start later in the output than its alternate
   // paths.
   // This will result in it being released too early.
   public void testShingleFromGap() throws Exception {
     TokenStream in =
         new CannedTokenStream(
             0,
             9,
             new Token[] {
               token("a", 1, 1, 4, 8),
               token("abc", 0, 3, 4, 7),
               token("cd", 2, 2, 6, 8),
               token("d", 1, 1, 7, 8),
               token("e", 1, 1, 8, 9)
             });
     TokenStream out = new FlattenGraphFilter(in);
     assertTokenStreamContents(
         out,
         new String[] {"a", "abc", "d", "cd", "e"},
         new int[] {4, 4, 7, 7, 8},
         new int[] {7, 7, 8, 8, 9},
         new int[] {1, 0, 1, 1, 1},
         new int[] {1, 1, 2, 1, 1},
         9);
   }

   public void testShingledGapAltPath() throws Exception {
     TokenStream in =
         new CannedTokenStream(
             0,
             4,
             new Token[] {
               token("abc", 1, 3, 0, 3), token("abcd", 0, 4, 0, 4), token("cd", 2, 2, 2, 4),
             });
     TokenStream out = new FlattenGraphFilter(in);
     assertTokenStreamContents(
         out,
         new String[] {"abc", "abcd", "cd"},
         new int[] {0, 0, 2},
         new int[] {3, 4, 4},
         new int[] {1, 0, 1},
         new int[] {1, 2, 1},
         4);
   }

   // Lots of shingles and alternate paths connecting to each other. One edge 'c' missing between
   // 'ab' and 'def'
   public void testHeavilyConnectedGraphWithGap() throws IOException {
     TokenStream in =
         new CannedTokenStream(
             0,
             7,
             new Token[] {
               token("a", 1, 1, 0, 1),
               token("ab", 0, 2, 0, 2),
               token("abcdef", 0, 6, 0, 6),
               token("abcd", 0, 4, 0, 4),
               token("bcdef", 1, 5, 1, 7),
               token("def", 2, 3, 4, 7),
               token("e", 1, 1, 5, 6),
               token("f", 1, 1, 6, 7)
             });
     TokenStream out = new FlattenGraphFilter(in);
     assertTokenStreamContents(
         out,
         new String[] {"a", "ab", "abcdef", "abcd", "bcdef", "e", "def", "f"},
         new int[] {0, 0, 0, 0, 5, 5, 6, 6},
         new int[] {1, 1, 7, 1, 7, 6, 7, 7},
         new int[] {1, 0, 0, 0, 1, 0, 1, 0},
         new int[] {1, 1, 3, 1, 2, 1, 1, 1},
         7);
   }
   // This graph can create a disconnected input node that is farther ahead in the output than its
   // subsequent input node.
   // Exceptions: Free too early or dropped tokens.
   public void testShingleWithLargeLeadingGap() throws IOException {
     TokenStream in =
         new CannedTokenStream(
             0,
             6,
             new Token[] {
               token("abcde", 1, 5, 0, 5), token("ef", 4, 2, 4, 6), token("f", 1, 1, 5, 6),
             });
     TokenStream out = new FlattenGraphFilter(in);
     assertTokenStreamContents(
         out,
         new String[] {"abcde", "f", "ef"},
         new int[] {0, 5, 5},
         new int[] {5, 6, 6},
         new int[] {1, 1, 0},
         new int[] {1, 1, 1},
         6);
   }

   /**
    * build CharsRef containing 2-4 tokens
    *
    * @param tokens vocabulary of tokens
    * @param charsRefBuilder CharsRefBuilder
    * @param random Random for selecting tokens
    * @return Charsref containing 2-4 tokens.
    */
   private CharsRef buildMultiTokenCharsRef(
       String[] tokens, CharsRefBuilder charsRefBuilder, Random random) {
     int srcLen = random.nextInt(2) + 2;
     String[] srcTokens = new String[srcLen];
     for (int pos = 0; pos < srcLen; pos++) {
       srcTokens[pos] = tokens[random().nextInt(tokens.length)];
     }
     SynonymMap.Builder.join(srcTokens, charsRefBuilder);
     return charsRefBuilder.toCharsRef();
   }

   // Create a random graph then delete some edges to see if we can trip up FlattenGraphFilter
   public void testRandomGraphs() throws Exception {
     String[] baseTokens = new String[] {"t1", "t2", "t3", "t4"};
     String[] synTokens = new String[] {"s1", "s2", "s3", "s4"};

     SynonymMap.Builder mapBuilder = new SynonymMap.Builder();
     CharsRefBuilder charRefBuilder = new CharsRefBuilder();
     Random random = random();

     // between 10 and 20 synonym entries
     int synCount = random.nextInt(10) + 10;
     for (int i = 0; i < synCount; i++) {
       int type = random.nextInt(4);
       CharsRef src;
       CharsRef dest;
       switch (type) {
         case 0:
           // 1:1
           src = charRefBuilder.append(baseTokens[random.nextInt(baseTokens.length)]).toCharsRef();
           charRefBuilder.clear();
           dest = charRefBuilder.append(synTokens[random.nextInt(synTokens.length)]).toCharsRef();
           charRefBuilder.clear();
           break;
         case 1:
           // many:1
           src = buildMultiTokenCharsRef(baseTokens, charRefBuilder, random);
           charRefBuilder.clear();
           dest = charRefBuilder.append(synTokens[random.nextInt(synTokens.length)]).toCharsRef();
           charRefBuilder.clear();
           break;
         case 2:
           // 1:many
           src = charRefBuilder.append(baseTokens[random.nextInt(baseTokens.length)]).toCharsRef();
           charRefBuilder.clear();
           dest = buildMultiTokenCharsRef(synTokens, charRefBuilder, random);
           charRefBuilder.clear();
           break;
         default:
           // many:many
           src = buildMultiTokenCharsRef(baseTokens, charRefBuilder, random);
           charRefBuilder.clear();
           dest = buildMultiTokenCharsRef(synTokens, charRefBuilder, random);
           charRefBuilder.clear();
       }
       mapBuilder.add(src, dest, true);
     }

     SynonymMap synMap = mapBuilder.build();

     int stopWordCount = random.nextInt(4) + 1;
     CharArraySet stopWords = new CharArraySet(stopWordCount, true);
     while (stopWords.size() < stopWordCount) {
       int index = random.nextInt(baseTokens.length + synTokens.length);
       String[] tokenArray = baseTokens;
       if (index >= baseTokens.length) {
         index -= baseTokens.length;
         tokenArray = synTokens;
       }
       stopWords.add(tokenArray[index]);
     }

     Analyzer withFlattenGraph =
         new Analyzer() {
           @Override
           protected TokenStreamComponents createComponents(String fieldName) {
             Tokenizer in = new WhitespaceTokenizer();
             TokenStream result = new SynonymGraphFilter(in, synMap, true);
             result = new StopFilter(result, stopWords);
             result = new FlattenGraphFilter(result);
             return new TokenStreamComponents(in, result);
           }
         };

     int tokenCount = random.nextInt(20) + 20;
     List<String> stringTokens = new ArrayList<>();
     while (stringTokens.size() < tokenCount) {
       stringTokens.add(baseTokens[random.nextInt(baseTokens.length)]);
     }

     String text = String.join(" ", stringTokens);
     // FlattenGraphFilter can create inconsistent offsets.
     // If that is resolved we can check offsets
     // Until then converting to automaton will pull text through and check if we hit asserts.
     // checkAnalysisConsistency(random, withFlattenGraph, false, text);
     TokenStreamToAutomaton tsta = new TokenStreamToAutomaton();
     TokenStream flattenedTokenStream = withFlattenGraph.tokenStream("field", text);
     assertFalse(Operations.hasDeadStates(tsta.toAutomaton(flattenedTokenStream)));
     flattenedTokenStream.close();

     /*
        CheckGeneralization can get VERY slow as matching holes to tokens or other holes generates a lot of potentially valid paths.
        Analyzer withoutFlattenGraph =
            new Analyzer() {
              @Override
              protected TokenStreamComponents createComponents(String fieldName) {
                Tokenizer in = new WhitespaceTokenizer();
                TokenStream result = new SynonymGraphFilter(in, synMap, true);
                result = new StopFilter(result, stopWords);
                return new TokenStreamComponents(in, result);
              }
            };
        checkGeneralization(
            withFlattenGraph.tokenStream("field", text),
            withoutFlattenGraph.tokenStream("field", text));

     */
   }

   /*
    * Make some strings, make an automaton that accepts those strings, convert that automaton into a TokenStream,
    * flatten it, back to an automaton, and see if the original strings are still accepted.
    */
   public void testPathsNotLost() throws IOException {
     int wordCount = random().nextInt(5) + 5;
     List<BytesRef> acceptStrings = new LinkedList<>();
     for (int i = 0; i < wordCount; i++) {
       int wordLen = random().nextInt(5) + 5;
       BytesRef ref = new BytesRef(wordLen);
       ref.length = wordLen;
       ref.offset = 0;
       for (int j = 0; j < wordLen; j++) {
         ref.bytes[j] = (byte) (random().nextInt(5) + 65);
       }
       acceptStrings.add(ref);
     }
     acceptStrings.sort(Comparator.naturalOrder());

     acceptStrings = acceptStrings.stream().limit(wordCount).collect(Collectors.toList());
     Automaton nonFlattenedAutomaton = DaciukMihovAutomatonBuilder.build(acceptStrings);

     TokenStream ts = AutomatonToTokenStream.toTokenStream(nonFlattenedAutomaton);
     TokenStream flattenedTokenStream = new FlattenGraphFilter(ts);
     TokenStreamToAutomaton tsta = new TokenStreamToAutomaton();
     Automaton flattenedAutomaton = tsta.toAutomaton(flattenedTokenStream);

     // TokenStreamToAutomaton adds position increment transitions into the automaton.
     List<BytesRef> acceptStringsWithPosSep = createAcceptStringsWithPosSep(acceptStrings);

     for (BytesRef acceptString : acceptStringsWithPosSep) {
       assertTrue(
           "string not accepted " + acceptString.utf8ToString(),
           recursivelyValidate(acceptString, 0, 0, flattenedAutomaton));
     }
   }

   /**
    * adds POS_SEP bytes between bytes to match TokenStreamToAutomaton format.
    *
    * @param acceptStrings Byte refs of accepted strings. Each byte is a transition
    * @return List of ByteRefs where each byte is separated by a POS_SEP byte.
    */
   private List<BytesRef> createAcceptStringsWithPosSep(List<BytesRef> acceptStrings) {
     List<BytesRef> acceptStringsWithPosSep = new ArrayList<>();
     for (BytesRef acceptString : acceptStrings) {
       BytesRef withPosSep = new BytesRef(acceptString.length * 2 - 1);
       withPosSep.length = acceptString.length * 2 - 1;
       withPosSep.offset = 0;
       for (int i = 0; i < acceptString.length; i++) {
         withPosSep.bytes[i * 2] = acceptString.bytes[i];
         if (i * 2 + 1 < withPosSep.length) {
           withPosSep.bytes[i * 2 + 1] = TokenStreamToAutomaton.POS_SEP;
         }
       }
       acceptStringsWithPosSep.add(withPosSep);
     }
     return acceptStringsWithPosSep;
   }

   /**
    * Checks if acceptString is accepted by the automaton. Automaton may be an NFA.
    *
    * @param acceptString String to test
    * @param acceptStringIndex current index into acceptString, initial value should be 0
    * @param state state to transition from. initial value should be 0
    * @param automaton Automaton to test
    * @return true if acceptString is accepted by the automaton. otherwise false.
    */
   public boolean recursivelyValidate(
       BytesRef acceptString, int acceptStringIndex, int state, Automaton automaton) {
     if (acceptStringIndex == acceptString.length) {
       return automaton.isAccept(state);
     }

     Transition transition = new Transition();
     automaton.initTransition(state, transition);
     int numTransitions = automaton.getNumTransitions(state);
     boolean accept = false;
     // Automaton can be NFA, so we need to check all matching transitions
     for (int i = 0; i < numTransitions; i++) {
       automaton.getTransition(state, i, transition);
       if (transition.min <= acceptString.bytes[acceptStringIndex]
           && transition.max >= acceptString.bytes[acceptStringIndex]) {
         accept =
             recursivelyValidate(acceptString, acceptStringIndex + 1, transition.dest, automaton);
       }
       if (accept == true) {
         break;
       }
     }
     return accept;
   }

   /**
    * This method checks if strings that lead to the accept state of the not flattened TokenStream
    * also lead to the accept state in the flattened TokenStream. This gets complicated when you
    * factor in holes. The FlattenGraphFilter will remove alternate paths that are made entirely of
    * holes. An alternate path of Holes is indistinguishable from a path that just has long
    * lengths(ex: testStrangelyNumberedNodes). Also alternate paths that end in multiple holes could
    * be interpreted as sequential holes after the branching has converged during flattening. This
    * leads to a lot of weird logic about navigating around holes that may compromise the accuracy of
    * this test.
    *
    * @param flattened flattened TokenStream
    * @param notFlattened not flattened TokenStream
    * @throws IOException on error creating Automata
    */
   /* private void checkGeneralization(TokenStream flattened, TokenStream notFlattened)
       throws IOException {
     TokenStreamToAutomaton tsta = new TokenStreamToAutomaton();

     List<LinkedList<Integer>> acceptStrings = getAcceptStrings(tsta.toAutomaton(notFlattened));
     checkAcceptStrings(acceptStrings, tsta.toAutomaton(flattened));
     flattened.close();
     notFlattened.close();
   }*/

   /**
    * gets up to 10000 strings that lead to accept state in the given automaton.
    *
    * @param automaton automaton
    * @return list of accept sequences
    */
   /* private List<LinkedList<Integer>> getAcceptStrings(Automaton automaton) {
     List<LinkedList<Integer>> acceptedSequences = new LinkedList<>();
     LinkedList<Integer> prefix = new LinkedList<>();
     // state 0 is always the start node
     // Particularly branching automatons can create lots of possible acceptable strings. limit to
     // the first 10K
     buildAcceptStringRecursive(automaton, 0, prefix, acceptedSequences, 10000);
     return acceptedSequences;
   }*/

   /**
    * @param automaton automaton to generate strings from
    * @param state state to start at
    * @param prefix string prefix
    * @param acceptedSequences List of strings build so far.
    * @param limit maximum number of acceptedSequences.
    */
   /*private void buildAcceptStringRecursive(
       Automaton automaton,
       int state,
       LinkedList<Integer> prefix,
       List<LinkedList<Integer>> acceptedSequences,
       int limit) {
     if (acceptedSequences.size() == limit) {
       return;
     }
     if (automaton.isAccept(state)) {
       acceptedSequences.add(new LinkedList<>(prefix));
       return;
     }
     int numTransitions = automaton.getNumTransitions(state);
     Transition transition = new Transition();
     for (int i = 0; i < numTransitions; i++) {
       automaton.getTransition(state, i, transition);
       // min and max are the same transitions made from TokenStreamToAutomaton
       prefix.addLast(transition.min);
       buildAcceptStringRecursive(automaton, transition.dest, prefix, acceptedSequences, limit);
       prefix.removeLast();
     }
   }

   private void checkAcceptStrings(List<LinkedList<Integer>> acceptSequence, Automaton automaton) {
     for (LinkedList<Integer> acceptString : acceptSequence) {
       assertTrue(
           "String did not lead to accept state " + acceptString,
           recursivelyValidateWithHoles(acceptString, 0, automaton));
     }
   }

   private boolean recursivelyValidateWithHoles(
       LinkedList<Integer> acceptSequence, int state, Automaton automaton) {
     if (acceptSequence.isEmpty()) {
       return automaton.isAccept(state);
     }

     Integer curr = acceptSequence.pop();
     int numTransitions = automaton.getNumTransitions(state);
     Transition transition = new Transition();

     boolean accept = false;
     // Automaton can be NFA, so we need to check all matching transitions
     for (int i = 0; i < numTransitions; i++) {
       automaton.getTransition(state, i, transition);
       if (transition.min <= curr && transition.max >= curr) {
         accept = recursivelyValidateWithHoles(acceptSequence, transition.dest, automaton);
         // Factoring in flattened graphs the space covered by a hole may be bigger in the flattened
         // graph.
         // Try consuming more steps with holes.
         if (accept == false
             && transition.min == TokenStreamToAutomaton.HOLE
             && transition.max == TokenStreamToAutomaton.HOLE) {
           acceptSequence.push(TokenStreamToAutomaton.HOLE);
           acceptSequence.push(TokenStreamToAutomaton.POS_SEP);
           accept = recursivelyValidateWithHoles(acceptSequence, transition.dest, automaton);
           acceptSequence.pop();
           acceptSequence.pop();
         }
       } else if (transition.min == TokenStreamToAutomaton.HOLE
           && transition.max == TokenStreamToAutomaton.HOLE
           && automaton.getNumTransitions(transition.dest) > 0) {
         //consume multiple holes in the automaton
         // clear POS_INC
         automaton.getTransition(transition.dest, 0, transition);
         acceptSequence.push(curr);
         accept = recursivelyValidateWithHoles(acceptSequence, transition.dest, automaton);
         acceptSequence.pop();
       } else if(curr == TokenStreamToAutomaton.HOLE) {
         //consume non-holes in the automaton with holes
         while (transition.min != TokenStreamToAutomaton.POS_SEP
                 && automaton.getNumTransitions(transition.dest) > 0) {
           automaton.getTransition(transition.dest, 0, transition);
         }
         acceptSequence.push(curr);
         accept = recursivelyValidateWithHoles(acceptSequence, transition.dest, automaton);
         acceptSequence.pop();
       }
       if (accept) {
         break;
       }
     }
     // Flatten graph filter will remove side paths that are only Holes. Gaps may also change size as
     // graph is flattened.
     // Traverse over them if curr is a hole to make sure the gap is kept
     if (accept == false && curr == TokenStreamToAutomaton.HOLE && acceptSequence.size() > 0) {
       // get rid of the separator
       acceptSequence.pop();

       for (int i = 0; i < numTransitions; i++) {
         automaton.getTransition(state, i, transition);
         //advance to the next POS_SEP in automaton
         while (transition.min != TokenStreamToAutomaton.POS_SEP
             && automaton.getNumTransitions(transition.dest) > 0) {
           automaton.getTransition(transition.dest, 0, transition);
         }
         accept = recursivelyValidateWithHoles(acceptSequence, transition.dest, automaton);
         if (accept) {
           break;
         }
       }

       // might be multiple holes squashed under a one step path. Try burning remaining holes
       if (accept == false) {
         accept = recursivelyValidateWithHoles(acceptSequence, state, automaton);
       }

       acceptSequence.push(TokenStreamToAutomaton.POS_SEP);
     }
     acceptSequence.push(curr);
     return accept;
   } */

   // NOTE: TestSynonymGraphFilter's testRandomSyns also tests FlattenGraphFilter
 }