| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.analysis; |
| |
| import java.io.IOException; |
| import java.io.PrintWriter; |
| import java.io.StringWriter; |
| import java.util.ArrayList; |
| import java.util.Arrays; |
| import java.util.HashSet; |
| import java.util.List; |
| import java.util.Random; |
| import java.util.Set; |
| |
| import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; |
| import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; |
| import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; |
| import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; |
| import org.apache.lucene.util.BytesRefBuilder; |
| import org.apache.lucene.util.IntsRef; |
| import org.apache.lucene.util.automaton.Automata; |
| import org.apache.lucene.util.automaton.Automaton; |
| import org.apache.lucene.util.automaton.AutomatonTestUtil; |
| import org.apache.lucene.util.automaton.Operations; |
| import org.apache.lucene.util.fst.Util; |
| |
| import static org.apache.lucene.util.automaton.Operations.DEFAULT_DETERMINIZE_WORK_LIMIT; |
| |
| public class TestGraphTokenizers extends BaseTokenStreamTestCase { |
| |
| // Makes a graph TokenStream from the string: separate |
| // positions with a single space, stack multiple tokens at the |
| // same position with /, and append an optional position length |
| // with :.  E.g. "a b c" is a simple chain, "a/x b c" adds 'x' |
| // over 'a' at position 0 with posLen=1, and "a/x:3 b c" adds |
| // 'x' over 'a' with posLen=3.  Offsets are assigned as if the |
| // input were the normalized chain (one single-character token |
| // plus a space per position), using the first token at each |
| // position.  See the illustrative check right after the |
| // GraphTokenizer class below. |
| |
| // NOTE: every input token must be a single character!  This |
| // means you cannot turn on MockCharFilter when random |
| // testing... |
| |
| public static final class GraphTokenizer extends Tokenizer { |
| private List<Token> tokens; |
| private int upto; |
| private int inputLength; |
| |
| private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); |
| private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); |
| private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class); |
| private final PositionLengthAttribute posLengthAtt = addAttribute(PositionLengthAttribute.class); |
| |
| @Override |
| public void reset() throws IOException { |
| super.reset(); |
| tokens = null; |
| upto = 0; |
| } |
| |
| @Override |
| public boolean incrementToken() throws IOException { |
| if (tokens == null) { |
| fillTokens(); |
| } |
| //System.out.println("graphTokenizer: incr upto=" + upto + " vs " + tokens.size()); |
| if (upto == tokens.size()) { |
| //System.out.println(" END @ " + tokens.size()); |
| return false; |
| } |
| final Token t = tokens.get(upto++); |
| //System.out.println(" return token=" + t); |
| clearAttributes(); |
| termAtt.append(t.toString()); |
| offsetAtt.setOffset(t.startOffset(), t.endOffset()); |
| posIncrAtt.setPositionIncrement(t.getPositionIncrement()); |
| posLengthAtt.setPositionLength(t.getPositionLength()); |
| return true; |
| } |
| |
| @Override |
| public void end() throws IOException { |
| super.end(); |
| // NOTE: somewhat hackish, but we need this to satisfy |
| // BaseTokenStreamTestCase's end-of-stream offset checks: |
| final int lastOffset; |
| if (tokens != null && !tokens.isEmpty()) { |
| lastOffset = tokens.get(tokens.size()-1).endOffset(); |
| } else { |
| lastOffset = 0; |
| } |
| offsetAtt.setOffset(correctOffset(lastOffset), |
| correctOffset(inputLength)); |
| } |
| |
| private void fillTokens() throws IOException { |
| final StringBuilder sb = new StringBuilder(); |
| final char[] buffer = new char[256]; |
| while (true) { |
| final int count = input.read(buffer); |
| if (count == -1) { |
| break; |
| } |
| sb.append(buffer, 0, count); |
| //System.out.println("got count=" + count); |
| } |
| //System.out.println("fillTokens: " + sb); |
| |
| inputLength = sb.length(); |
| |
| final String[] parts = sb.toString().split(" "); |
| |
| tokens = new ArrayList<>(); |
| int pos = 0; |
| int maxPos = -1; |
| int offset = 0; |
| //System.out.println("again"); |
| for(String part : parts) { |
| final String[] overlapped = part.split("/"); |
| boolean firstAtPos = true; |
| int minPosLength = Integer.MAX_VALUE; |
| for(String part2 : overlapped) { |
| final int colonIndex = part2.indexOf(':'); |
| final String token; |
| final int posLength; |
| if (colonIndex != -1) { |
| token = part2.substring(0, colonIndex); |
| posLength = Integer.parseInt(part2.substring(1+colonIndex)); |
| } else { |
| token = part2; |
| posLength = 1; |
| } |
| maxPos = Math.max(maxPos, pos + posLength); |
| minPosLength = Math.min(minPosLength, posLength); |
| // In the normalized chain each position is one char plus a space, so a |
| // token spanning posLength positions covers 2*posLength-1 characters: |
| final Token t = new Token(token, offset, offset + 2*posLength - 1); |
| t.setPositionLength(posLength); |
| t.setPositionIncrement(firstAtPos ? 1:0); |
| firstAtPos = false; |
| //System.out.println(" add token=" + t + " startOff=" + t.startOffset() + " endOff=" + t.endOffset()); |
| tokens.add(t); |
| } |
| // Advance by the shortest token at this position; in the normalized |
| // chain each position occupies two characters (token + space): |
| pos += minPosLength; |
| offset = 2 * pos; |
| } |
| assert maxPos <= pos: "input string malformed: posLength>1 tokens hang over the end"; |
| } |
| } |
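| |
| // Illustrative check added as documentation for the syntax described |
| // above; not one of the original assertions.  "a/x:2 b" stacks 'x' over |
| // 'a' with posLength=2, so the resulting graph should accept exactly |
| // the two paths "a b" and "x".  (StringReader is fully qualified here |
| // only to keep this added sketch self-contained.) |
| public void testGraphTokenizerSyntaxExample() throws Exception { |
| final Tokenizer t = new GraphTokenizer(); |
| t.setReader(new java.io.StringReader("a/x:2 b")); |
| assertSameLanguage(Operations.union(join("a", "b"), s2a("x")), t); |
| } |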
| |
| public void testMockGraphTokenFilterBasic() throws Exception { |
| |
| for(int iter=0;iter<10*RANDOM_MULTIPLIER;iter++) { |
| |
| if (VERBOSE) { |
| System.out.println("\nTEST: iter=" + iter); |
| } |
| |
| // Make a new analyzer each time, because MockGraphTokenFilter (MGTF) |
| // has a fixed seed once constructed: |
| final Analyzer a = new Analyzer() { |
| @Override |
| protected TokenStreamComponents createComponents(String fieldName) { |
| final Tokenizer t = new MockTokenizer(MockTokenizer.WHITESPACE, false); |
| final TokenStream t2 = new MockGraphTokenFilter(random(), t); |
| return new TokenStreamComponents(t, t2); |
| } |
| }; |
| |
| checkAnalysisConsistency(random(), a, false, "a b c d e f g h i j k"); |
| } |
| } |
| |
| public void testMockGraphTokenFilterOnGraphInput() throws Exception { |
| for(int iter=0;iter<100*RANDOM_MULTIPLIER;iter++) { |
| |
| if (VERBOSE) { |
| System.out.println("\nTEST: iter=" + iter); |
| } |
| |
| // Make new analyzer each time, because MGTF has fixed |
| // seed: |
| final Analyzer a = new Analyzer() { |
| @Override |
| protected TokenStreamComponents createComponents(String fieldName) { |
| final Tokenizer t = new GraphTokenizer(); |
| final TokenStream t2 = new MockGraphTokenFilter(random(), t); |
| return new TokenStreamComponents(t, t2); |
| } |
| }; |
| |
| checkAnalysisConsistency(random(), a, false, "a/x:3 c/y:2 d e f/z:4 g h i j k"); |
| } |
| } |
| |
| // Deletes every 'a' token, leaving a hole (position gap) in its place; |
| // see the illustrative check after this class: |
| private final static class RemoveATokens extends TokenFilter { |
| private int pendingPosInc; |
| |
| private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); |
| private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class); |
| |
| public RemoveATokens(TokenStream in) { |
| super(in); |
| } |
| |
| @Override |
| public void reset() throws IOException { |
| super.reset(); |
| pendingPosInc = 0; |
| } |
| |
| @Override |
| public void end() throws IOException { |
| super.end(); |
| posIncAtt.setPositionIncrement(pendingPosInc + posIncAtt.getPositionIncrement()); |
| } |
| |
| @Override |
| public boolean incrementToken() throws IOException { |
| while (true) { |
| final boolean gotOne = input.incrementToken(); |
| if (!gotOne) { |
| return false; |
| } else if (termAtt.toString().equals("a")) { |
| pendingPosInc += posIncAtt.getPositionIncrement(); |
| } else { |
| posIncAtt.setPositionIncrement(pendingPosInc + posIncAtt.getPositionIncrement()); |
| pendingPosInc = 0; |
| return true; |
| } |
| } |
| } |
| } |
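| |
| // Illustrative check added as documentation; not one of the original |
| // assertions.  Removing the 'a' from "x a b" leaves a one-position |
| // hole, which TokenStreamToAutomaton models as a HOLE transition |
| // between 'x' and 'b': |
| public void testRemoveATokensLeavesHoleExample() throws Exception { |
| final TokenStream ts = new RemoveATokens( |
| new CannedTokenStream(token("x", 1, 1), token("a", 1, 1), token("b", 1, 1))); |
| assertSameLanguage(join(s2a("x"), SEP_A, HOLE_A, SEP_A, s2a("b")), ts); |
| } |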
| |
| public void testMockGraphTokenFilterBeforeHoles() throws Exception { |
| for(int iter=0;iter<100*RANDOM_MULTIPLIER;iter++) { |
| |
| if (VERBOSE) { |
| System.out.println("\nTEST: iter=" + iter); |
| } |
| |
| // Make new analyzer each time, because MGTF has fixed |
| // seed: |
| final Analyzer a = new Analyzer() { |
| @Override |
| protected TokenStreamComponents createComponents(String fieldName) { |
| final Tokenizer t = new MockTokenizer(MockTokenizer.WHITESPACE, false); |
| final TokenStream t2 = new MockGraphTokenFilter(random(), t); |
| final TokenStream t3 = new RemoveATokens(t2); |
| return new TokenStreamComponents(t, t3); |
| } |
| }; |
| |
| Random random = random(); |
| checkAnalysisConsistency(random, a, false, "a b c d e f g h i j k"); |
| checkAnalysisConsistency(random, a, false, "x y a b c d e f g h i j k"); |
| checkAnalysisConsistency(random, a, false, "a b c d e f g h i j k a"); |
| checkAnalysisConsistency(random, a, false, "a b c d e f g h i j k a x y"); |
| } |
| } |
| |
| public void testMockGraphTokenFilterAfterHoles() throws Exception { |
| for(int iter=0;iter<100*RANDOM_MULTIPLIER;iter++) { |
| |
| if (VERBOSE) { |
| System.out.println("\nTEST: iter=" + iter); |
| } |
| |
| // Make new analyzer each time, because MGTF has fixed |
| // seed: |
| final Analyzer a = new Analyzer() { |
| @Override |
| protected TokenStreamComponents createComponents(String fieldName) { |
| final Tokenizer t = new MockTokenizer(MockTokenizer.WHITESPACE, false); |
| final TokenStream t2 = new RemoveATokens(t); |
| final TokenStream t3 = new MockGraphTokenFilter(random(), t2); |
| return new TokenStreamComponents(t, t3); |
| } |
| }; |
| |
| Random random = random(); |
| checkAnalysisConsistency(random, a, false, "a b c d e f g h i j k"); |
| checkAnalysisConsistency(random, a, false, "x y a b c d e f g h i j k"); |
| checkAnalysisConsistency(random, a, false, "a b c d e f g h i j k a"); |
| checkAnalysisConsistency(random, a, false, "a b c d e f g h i j k a x y"); |
| } |
| } |
| |
| public void testMockGraphTokenFilterRandom() throws Exception { |
| for(int iter=0;iter<3*RANDOM_MULTIPLIER;iter++) { |
| |
| if (VERBOSE) { |
| System.out.println("\nTEST: iter=" + iter); |
| } |
| |
| // Make new analyzer each time, because MGTF has fixed |
| // seed: |
| final Analyzer a = new Analyzer() { |
| @Override |
| protected TokenStreamComponents createComponents(String fieldName) { |
| final Tokenizer t = new MockTokenizer(MockTokenizer.WHITESPACE, false); |
| final TokenStream t2 = new MockGraphTokenFilter(random(), t); |
| return new TokenStreamComponents(t, t2); |
| } |
| }; |
| |
| Random random = random(); |
| checkRandomData(random, a, 5, atLeast(100)); |
| } |
| } |
| |
| // Two MockGraphTokenFilters |
| public void testDoubleMockGraphTokenFilterRandom() throws Exception { |
| for(int iter=0;iter<3*RANDOM_MULTIPLIER;iter++) { |
| |
| if (VERBOSE) { |
| System.out.println("\nTEST: iter=" + iter); |
| } |
| |
| // Make new analyzer each time, because MGTF has fixed |
| // seed: |
| final Analyzer a = new Analyzer() { |
| @Override |
| protected TokenStreamComponents createComponents(String fieldName) { |
| final Tokenizer t = new MockTokenizer(MockTokenizer.WHITESPACE, false); |
| final TokenStream t1 = new MockGraphTokenFilter(random(), t); |
| final TokenStream t2 = new MockGraphTokenFilter(random(), t1); |
| return new TokenStreamComponents(t, t2); |
| } |
| }; |
| |
| Random random = random(); |
| checkRandomData(random, a, 5, atLeast(100)); |
| } |
| } |
| |
| public void testMockGraphTokenFilterBeforeHolesRandom() throws Exception { |
| for(int iter=0;iter<3*RANDOM_MULTIPLIER;iter++) { |
| |
| if (VERBOSE) { |
| System.out.println("\nTEST: iter=" + iter); |
| } |
| |
| // Make new analyzer each time, because MGTF has fixed |
| // seed: |
| final Analyzer a = new Analyzer() { |
| @Override |
| protected TokenStreamComponents createComponents(String fieldName) { |
| final Tokenizer t = new MockTokenizer(MockTokenizer.WHITESPACE, false); |
| final TokenStream t1 = new MockGraphTokenFilter(random(), t); |
| final TokenStream t2 = new MockHoleInjectingTokenFilter(random(), t1); |
| return new TokenStreamComponents(t, t2); |
| } |
| }; |
| |
| Random random = random(); |
| checkRandomData(random, a, 5, atLeast(100)); |
| } |
| } |
| |
| public void testMockGraphTokenFilterAfterHolesRandom() throws Exception { |
| for(int iter=0;iter<3*RANDOM_MULTIPLIER;iter++) { |
| |
| if (VERBOSE) { |
| System.out.println("\nTEST: iter=" + iter); |
| } |
| |
| // Make new analyzer each time, because MGTF has fixed |
| // seed: |
| final Analyzer a = new Analyzer() { |
| @Override |
| protected TokenStreamComponents createComponents(String fieldName) { |
| final Tokenizer t = new MockTokenizer(MockTokenizer.WHITESPACE, false); |
| final TokenStream t1 = new MockHoleInjectingTokenFilter(random(), t); |
| final TokenStream t2 = new MockGraphTokenFilter(random(), t1); |
| return new TokenStreamComponents(t, t2); |
| } |
| }; |
| |
| Random random = random(); |
| checkRandomData(random, a, 5, atLeast(100)); |
| } |
| } |
| |
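| // Token factories for the CannedTokenStream tests below; the first |
| // variant leaves offsets at 0,0: |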
| private static Token token(String term, int posInc, int posLength) { |
| final Token t = new Token(term, 0, 0); |
| t.setPositionIncrement(posInc); |
| t.setPositionLength(posLength); |
| return t; |
| } |
| |
| private static Token token(String term, int posInc, int posLength, int startOffset, int endOffset) { |
| final Token t = new Token(term, startOffset, endOffset); |
| t.setPositionIncrement(posInc); |
| t.setPositionLength(posLength); |
| return t; |
| } |
| |
| public void testSingleToken() throws Exception { |
| final TokenStream ts = new CannedTokenStream( |
| new Token[] { |
| token("abc", 1, 1), |
| }); |
| assertSameLanguage(s2a("abc"), ts); |
| } |
| |
| public void testMultipleHoles() throws Exception { |
| final TokenStream ts = new CannedTokenStream( |
| new Token[] { |
| token("a", 1, 1), |
| token("b", 3, 1), |
| }); |
| assertSameLanguage(join(s2a("a"), SEP_A, HOLE_A, SEP_A, HOLE_A, SEP_A, s2a("b")), ts); |
| } |
| |
| public void testSynOverMultipleHoles() throws Exception { |
| final TokenStream ts = new CannedTokenStream( |
| new Token[] { |
| token("a", 1, 1), |
| token("x", 0, 3), |
| token("b", 3, 1), |
| }); |
| final Automaton a1 = join(s2a("a"), SEP_A, HOLE_A, SEP_A, HOLE_A, SEP_A, s2a("b")); |
| final Automaton a2 = join(s2a("x"), SEP_A, s2a("b")); |
| assertSameLanguage(Operations.union(a1, a2), ts); |
| } |
| |
| // for debugging! |
| /* |
| private static void toDot(Automaton a) throws IOException { |
| final String s = a.toDot(); |
| Writer w = new OutputStreamWriter(new FileOutputStream("/x/tmp/out.dot")); |
| w.write(s); |
| w.close(); |
| System.out.println("TEST: saved to /x/tmp/out.dot"); |
| } |
| */ |
| |
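| // Transition labels that TokenStreamToAutomaton uses for position |
| // separators and for holes (dropped tokens): |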
| private static final Automaton SEP_A = Automata.makeChar(TokenStreamToAutomaton.POS_SEP); |
| private static final Automaton HOLE_A = Automata.makeChar(TokenStreamToAutomaton.HOLE); |
| |
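| // Helpers for building expected automata: s2a makes a single-term |
| // automaton, and join concatenates terms/automata with POS_SEP between |
| // positions: |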
| private Automaton join(String ... strings) { |
| List<Automaton> as = new ArrayList<>(); |
| for(String s : strings) { |
| as.add(s2a(s)); |
| as.add(SEP_A); |
| } |
| as.remove(as.size()-1); |
| return Operations.concatenate(as); |
| } |
| |
| private Automaton join(Automaton ... as) { |
| return Operations.concatenate(Arrays.asList(as)); |
| } |
| |
| private Automaton s2a(String s) { |
| return Automata.makeString(s); |
| } |
| |
| public void testTwoTokens() throws Exception { |
| final TokenStream ts = new CannedTokenStream( |
| new Token[] { |
| token("abc", 1, 1), |
| token("def", 1, 1), |
| }); |
| assertSameLanguage(join("abc", "def"), ts); |
| } |
| |
| public void testHole() throws Exception { |
| |
| final TokenStream ts = new CannedTokenStream( |
| new Token[] { |
| token("abc", 1, 1), |
| token("def", 2, 1), |
| }); |
| assertSameLanguage(join(s2a("abc"), SEP_A, HOLE_A, SEP_A, s2a("def")), ts); |
| } |
| |
| public void testOverlappedTokensSausage() throws Exception { |
| |
| // Two tokens on top of each other (sausage): |
| final TokenStream ts = new CannedTokenStream( |
| new Token[] { |
| token("abc", 1, 1), |
| token("xyz", 0, 1) |
| }); |
| final Automaton a1 = s2a("abc"); |
| final Automaton a2 = s2a("xyz"); |
| assertSameLanguage(Operations.union(a1, a2), ts); |
| } |
| |
| public void testOverlappedTokensLattice() throws Exception { |
| |
| final TokenStream ts = new CannedTokenStream( |
| new Token[] { |
| token("abc", 1, 1), |
| token("xyz", 0, 2), |
| token("def", 1, 1), |
| }); |
| final Automaton a1 = s2a("xyz"); |
| final Automaton a2 = join("abc", "def"); |
| assertSameLanguage(Operations.union(a1, a2), ts); |
| } |
| |
| public void testSynOverHole() throws Exception { |
| |
| final TokenStream ts = new CannedTokenStream( |
| new Token[] { |
| token("a", 1, 1), |
| token("X", 0, 2), |
| token("b", 2, 1), |
| }); |
| final Automaton a1 = Operations.union(join(s2a("a"), SEP_A, HOLE_A), s2a("X")); |
| final Automaton expected = Operations.concatenate(a1, join(SEP_A, s2a("b"))); |
| assertSameLanguage(expected, ts); |
| } |
| |
| public void testSynOverHole2() throws Exception { |
| |
| final TokenStream ts = new CannedTokenStream( |
| new Token[] { |
| token("xyz", 1, 1), |
| token("abc", 0, 3), |
| token("def", 2, 1), |
| }); |
| final Automaton expected = Operations.union( |
| join(s2a("xyz"), SEP_A, HOLE_A, SEP_A, s2a("def")), s2a("abc")); |
| assertSameLanguage(expected, ts); |
| } |
| |
| public void testOverlappedTokensLattice2() throws Exception { |
| |
| final TokenStream ts = new CannedTokenStream( |
| new Token[] { |
| token("abc", 1, 1), |
| token("xyz", 0, 3), |
| token("def", 1, 1), |
| token("ghi", 1, 1), |
| }); |
| final Automaton a1 = s2a("xyz"); |
| final Automaton a2 = join("abc", "def", "ghi"); |
| assertSameLanguage(Operations.union(a1, a2), ts); |
| } |
| |
| public void testToDot() throws Exception { |
| final TokenStream ts = new CannedTokenStream(new Token[] {token("abc", 1, 1, 0, 4)}); |
| StringWriter w = new StringWriter(); |
| new TokenStreamToDot("abcd", ts, new PrintWriter(w)).toDot(); |
| assertTrue(w.toString().contains("abc / abcd")); |
| } |
| |
| public void testStartsWithHole() throws Exception { |
| final TokenStream ts = new CannedTokenStream( |
| new Token[] { |
| token("abc", 2, 1), |
| }); |
| assertSameLanguage(join(HOLE_A, SEP_A, s2a("abc")), ts); |
| } |
| |
| public void testEndsWithHole() throws Exception { |
| final TokenStream ts = new CannedTokenStream(1, 0, |
| new Token[] { |
| token("abc", 2, 1), |
| }); |
| assertSameLanguage(join(HOLE_A, SEP_A, s2a("abc"), SEP_A, HOLE_A), ts); |
| } |
| |
| public void testSynHangingOverEnd() throws Exception { |
| final TokenStream ts = new CannedTokenStream( |
| new Token[] { |
| token("a", 1, 1), |
| token("X", 0, 10), |
| }); |
| assertSameLanguage(Operations.union(s2a("a"), s2a("X")), ts); |
| } |
| |
| /** Returns all paths accepted by the automaton, with POS_SEP rendered as a space. */ |
| private Set<String> toPathStrings(Automaton a) { |
| BytesRefBuilder scratchBytesRefBuilder = new BytesRefBuilder(); |
| Set<String> paths = new HashSet<>(); |
| for (IntsRef ir: AutomatonTestUtil.getFiniteStringsRecursive(a, -1)) { |
| paths.add(Util.toBytesRef(ir, scratchBytesRefBuilder).utf8ToString().replace((char) TokenStreamToAutomaton.POS_SEP, ' ')); |
| } |
| return paths; |
| } |
| |
| private void assertSameLanguage(Automaton expected, TokenStream ts) throws IOException { |
| assertSameLanguage(expected, new TokenStreamToAutomaton().toAutomaton(ts)); |
| } |
| |
| private void assertSameLanguage(Automaton expected, Automaton actual) { |
| Automaton expectedDet = Operations.determinize(Operations.removeDeadStates(expected), DEFAULT_DETERMINIZE_WORK_LIMIT); |
| Automaton actualDet = Operations.determinize(Operations.removeDeadStates(actual), DEFAULT_DETERMINIZE_WORK_LIMIT); |
| if (Operations.sameLanguage(expectedDet, actualDet) == false) { |
| Set<String> expectedPaths = toPathStrings(expectedDet); |
| Set<String> actualPaths = toPathStrings(actualDet); |
| StringBuilder b = new StringBuilder(); |
| b.append("expected:\n"); |
| for(String path : expectedPaths) { |
| b.append(" "); |
| b.append(path); |
| if (actualPaths.contains(path) == false) { |
| b.append(" [missing!]"); |
| } |
| b.append('\n'); |
| } |
| b.append("actual:\n"); |
| for(String path : actualPaths) { |
| b.append(" "); |
| b.append(path); |
| if (expectedPaths.contains(path) == false) { |
| b.append(" [unexpected!]"); |
| } |
| b.append('\n'); |
| } |
| fail("accepted language is different:\n\n" + b.toString()); |
| } |
| } |
| |
| public void testTokenStreamGraphWithHoles() throws Exception { |
| final TokenStream ts = new CannedTokenStream( |
| new Token[] { |
| token("abc", 1, 1), |
| token("xyz", 1, 8), |
| token("def", 1, 1), |
| token("ghi", 1, 1), |
| }); |
| assertSameLanguage(Operations.union(join(s2a("abc"), SEP_A, s2a("xyz")), |
| join(s2a("abc"), SEP_A, HOLE_A, SEP_A, s2a("def"), SEP_A, s2a("ghi"))), ts); |
| } |
| } |