/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.miscellaneous;
import java.io.IOException;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.InPlaceMergeSorter;
import org.apache.lucene.util.RamUsageEstimator;
/**
* Splits words into subwords and performs optional transformations on subword
* groups, producing a correct token graph so that e.g. {@link PhraseQuery} can
* work correctly when this filter is used in the search-time analyzer. Unlike
* the deprecated {@link WordDelimiterFilter}, this token filter produces a
* correct token graph as output. However, it cannot consume an input token
* graph correctly. When the {@link #IGNORE_KEYWORDS} flag is set, processing is suppressed
* for tokens with {@link KeywordAttribute#isKeyword()}=true.
*
* <p>
* Words are split into subwords with the following rules:
* <ul>
* <li>split on intra-word delimiters (by default, all non-alphanumeric
* characters): <code>"Wi-Fi"</code> &#8594; <code>"Wi", "Fi"</code></li>
* <li>split on case transitions: <code>"PowerShot"</code> &#8594;
* <code>"Power", "Shot"</code></li>
* <li>split on letter-number transitions: <code>"SD500"</code> &#8594;
* <code>"SD", "500"</code></li>
* <li>leading and trailing intra-word delimiters on each subword are ignored:
* <code>"//hello---there, 'dude'"</code> &#8594;
* <code>"hello", "there", "dude"</code></li>
* <li>trailing "'s" are removed for each subword: <code>"O'Neil's"</code>
* &#8594; <code>"O", "Neil"</code>
* <ul>
* <li>Note: this step isn't performed in a separate filter because of possible
* subword combinations.</li>
* </ul>
* </li>
* </ul>
*
* The <b>GENERATE_...</b> options affect how incoming tokens are broken into parts, and the
* various <b>CATENATE_...</b> options affect how those parts are combined.
*
* <ul>
* <li>If no CATENATE option is set, then no subword combinations are generated:
* <code>"PowerShot"</code> &#8594; <code>0:"Power", 1:"Shot"</code> (0 and 1 are the token
* positions)</li>
* <li>CATENATE_WORDS means that in addition to the subwords, maximum runs of
* non-numeric subwords are catenated and produced at the same position of the
* last subword in the run:
* <ul>
* <li><code>"PowerShot"</code> &#8594;
* <code>0:"Power", 1:"Shot" 1:"PowerShot"</code></li>
* <li><code>"A's+B's&amp;C's"</code> &gt; <code>0:"A", 1:"B", 2:"C", 2:"ABC"</code>
* </li>
* <li><code>"Super-Duper-XL500-42-AutoCoder!"</code> &#8594;
* <code>0:"Super", 1:"Duper", 2:"XL", 2:"SuperDuperXL", 3:"500" 4:"42", 5:"Auto", 6:"Coder", 6:"AutoCoder"</code>
* </li>
* </ul>
* </li>
* <li>CATENATE_NUMBERS works like CATENATE_WORDS, but for adjacent digit sequences.</li>
* <li>CATENATE_ALL smushes together all the token parts without distinguishing numbers and words.</li>
* </ul>
* One use for {@link WordDelimiterGraphFilter} is to help match words with different
* subword delimiters. For example, if the source text contained "wi-fi", one may
* want the queries "wifi", "WiFi", "wi-fi", and "wi+fi" to all match. One way of doing so
* is to specify CATENATE options in the analyzer used for indexing, and not
* in the analyzer used for querying. Given that
* the current {@link StandardTokenizer} immediately removes many intra-word
* delimiters, it is recommended that this filter be used after a tokenizer that
* does not do this (such as {@link WhitespaceTokenizer}).
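* <p>
* For example, a minimal index-time analyzer might be assembled as in the sketch below
* (the anonymous {@code Analyzer} and the particular flag combination are illustrative only):
* <pre class="prettyprint">
* Analyzer analyzer = new Analyzer() {
*   &#64;Override
*   protected TokenStreamComponents createComponents(String fieldName) {
*     Tokenizer tokenizer = new WhitespaceTokenizer();
*     int flags = WordDelimiterGraphFilter.GENERATE_WORD_PARTS
*         | WordDelimiterGraphFilter.GENERATE_NUMBER_PARTS
*         | WordDelimiterGraphFilter.CATENATE_WORDS;
*     TokenStream result = new WordDelimiterGraphFilter(tokenizer, flags, null);
*     return new TokenStreamComponents(tokenizer, result);
*   }
* };
* </pre>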
*/
public final class WordDelimiterGraphFilter extends TokenFilter {
/**
* Causes parts of words to be generated:
* <p>
* "PowerShot" =&gt; "Power" "Shot"
*/
public static final int GENERATE_WORD_PARTS = 1;
/**
* Causes number subwords to be generated:
* <p>
* "500-42" =&gt; "500" "42"
*/
public static final int GENERATE_NUMBER_PARTS = 2;
/**
* Causes maximum runs of word parts to be catenated:
* <p>
* "wi-fi" =&gt; "wifi"
*/
public static final int CATENATE_WORDS = 4;
/**
* Causes maximum runs of number parts to be catenated:
* <p>
* "500-42" =&gt; "50042"
*/
public static final int CATENATE_NUMBERS = 8;
/**
* Causes all subword parts to be catenated:
* <p>
* "wi-fi-4000" =&gt; "wifi4000"
*/
public static final int CATENATE_ALL = 16;
/**
* Causes original words to be preserved and added to the subword list (defaults to false)
* <p>
* "500-42" =&gt; "500" "42" "500-42"
*/
public static final int PRESERVE_ORIGINAL = 32;
/**
* Causes lowercase -&gt; uppercase transition to start a new subword.
*/
public static final int SPLIT_ON_CASE_CHANGE = 64;
/**
* Causes letter-number transitions to start a new subword. If not set, numeric
* transitions are ignored and do not by themselves generate subwords.
*/
public static final int SPLIT_ON_NUMERICS = 128;
/**
* Causes trailing "'s" to be removed for each subword
* <p>
* "O'Neil's" =&gt; "O", "Neil"
*/
public static final int STEM_ENGLISH_POSSESSIVE = 256;
/**
* Suppresses processing terms with {@link KeywordAttribute#isKeyword()}=true.
*/
public static final int IGNORE_KEYWORDS = 512;
/**
* If not null, the set of tokens to protect from being delimited
*/
final CharArraySet protWords;
private final int flags;
// packs start pos, end pos, start part, end part (= slice of the term text) for each buffered part:
private int[] bufferedParts = new int[16];
private int bufferedLen;
private int bufferedPos;
// holds text for each buffered part, or null if it's a simple slice of the original term
private char[][] bufferedTermParts = new char[4][];
private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
private final KeywordAttribute keywordAttribute = addAttribute(KeywordAttribute.class);
private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
private final PositionIncrementAttribute posIncAttribute = addAttribute(PositionIncrementAttribute.class);
private final PositionLengthAttribute posLenAttribute = addAttribute(PositionLengthAttribute.class);
// used for iterating word delimiter breaks
private final WordDelimiterIterator iterator;
// used for concatenating runs of similarly-typed subwords (word, number)
private final WordDelimiterConcatenation concat = new WordDelimiterConcatenation();
private final boolean adjustInternalOffsets;
// number of subwords last output by concat.
private int lastConcatCount;
// used for catenate all
private final WordDelimiterConcatenation concatAll = new WordDelimiterConcatenation();
// used for accumulating position increment gaps so that we preserve incoming holes:
private int accumPosInc;
private char[] savedTermBuffer = new char[16];
private int savedTermLength;
private int savedStartOffset;
private int savedEndOffset;
private AttributeSource.State savedState;
private int lastStartOffset;
private boolean adjustingOffsets;
private int wordPos;
/**
* Creates a new WordDelimiterGraphFilter
*
* @param in TokenStream to be filtered
* @param adjustInternalOffsets if the offsets of partial terms should be adjusted
* @param charTypeTable table containing character types
* @param configurationFlags Flags configuring the filter
* @param protWords If not null, the set of tokens to protect from being delimited
*/
public WordDelimiterGraphFilter(TokenStream in, boolean adjustInternalOffsets, byte[] charTypeTable, int configurationFlags, CharArraySet protWords) {
super(in);
if ((configurationFlags &
~(GENERATE_WORD_PARTS |
GENERATE_NUMBER_PARTS |
CATENATE_WORDS |
CATENATE_NUMBERS |
CATENATE_ALL |
PRESERVE_ORIGINAL |
SPLIT_ON_CASE_CHANGE |
SPLIT_ON_NUMERICS |
STEM_ENGLISH_POSSESSIVE |
IGNORE_KEYWORDS)) != 0) {
throw new IllegalArgumentException("flags contains unrecognized flag: " + configurationFlags);
}
this.flags = configurationFlags;
this.protWords = protWords;
this.iterator = new WordDelimiterIterator(
charTypeTable, has(SPLIT_ON_CASE_CHANGE), has(SPLIT_ON_NUMERICS), has(STEM_ENGLISH_POSSESSIVE));
this.adjustInternalOffsets = adjustInternalOffsets;
}
/**
* Creates a new WordDelimiterGraphFilter using {@link WordDelimiterIterator#DEFAULT_WORD_DELIM_TABLE}
* as its charTypeTable
*
* @param in TokenStream to be filtered
* @param configurationFlags Flags configuring the filter
* @param protWords If not null, the set of tokens to protect from being delimited
*/
public WordDelimiterGraphFilter(TokenStream in, int configurationFlags, CharArraySet protWords) {
this(in, false, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, configurationFlags, protWords);
}
/** Iterates all word parts and concatenations, buffering up the term parts we should return. */
private void bufferWordParts() throws IOException {
saveState();
// if the length implied by the start and end offsets doesn't match the term's text length, then set offsets for all our word parts/concats to the incoming
// offsets. this can happen if WDGF is applied to an injected synonym, or to a stemmed form, etc:
adjustingOffsets = adjustInternalOffsets && savedEndOffset - savedStartOffset == savedTermLength;
bufferedLen = 0;
lastConcatCount = 0;
wordPos = 0;
if (has(PRESERVE_ORIGINAL)) {
// add the original token now so that it is always emitted first
// we will edit the term length after all other parts have been buffered
buffer(0, 1, 0, savedTermLength);
}
if (iterator.isSingleWord()) {
buffer(wordPos, wordPos+1, iterator.current, iterator.end);
wordPos++;
iterator.next();
} else {
// iterate all word parts, possibly buffering them, building up concatenations and possibly buffering them too:
while (iterator.end != WordDelimiterIterator.DONE) {
int wordType = iterator.type();
// do we already have queued up incompatible concatenations?
if (concat.isNotEmpty() && (concat.type & wordType) == 0) {
flushConcatenation(concat);
}
// add subwords depending upon options
if (shouldConcatenate(wordType)) {
concatenate(concat);
}
// add all subwords (catenateAll)
if (has(CATENATE_ALL)) {
concatenate(concatAll);
}
// if we should output the word or number part
if (shouldGenerateParts(wordType)) {
buffer(wordPos, wordPos+1, iterator.current, iterator.end);
wordPos++;
}
iterator.next();
}
if (concat.isNotEmpty()) {
// flush final concatenation
flushConcatenation(concat);
}
if (concatAll.isNotEmpty()) {
// only if we haven't output this same combo above, e.g. PowerShot with CATENATE_WORDS:
if (concatAll.subwordCount > lastConcatCount) {
if (wordPos == concatAll.startPos) {
// we are not generating parts, so we must advance wordPos now
wordPos++;
}
concatAll.write();
}
concatAll.clear();
}
}
if (has(PRESERVE_ORIGINAL)) {
// we now know how many tokens need to be injected, so we can set the original
// token's position length
if (wordPos == 0) {
// can happen w/ strange flag combos and inputs :)
wordPos++;
}
bufferedParts[1] = wordPos;
}
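// sort the buffered parts into emission order; when PRESERVE_ORIGINAL is set, slot 0
// already holds the original token and is excluded so that it is emitted first: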
sorter.sort(has(PRESERVE_ORIGINAL) ? 1 : 0, bufferedLen);
wordPos = 0;
// set back to 0 for iterating from the buffer
bufferedPos = 0;
}
@Override
public boolean incrementToken() throws IOException {
while (true) {
if (savedState == null) {
// process a new input token
if (input.incrementToken() == false) {
return false;
}
if (has(IGNORE_KEYWORDS) && keywordAttribute.isKeyword()) {
return true;
}
int termLength = termAttribute.length();
char[] termBuffer = termAttribute.buffer();
accumPosInc += posIncAttribute.getPositionIncrement();
// iterate & cache all word parts up front:
iterator.setText(termBuffer, termLength);
iterator.next();
// word with no delimiters, or protected word: just return it
if ((iterator.current == 0 && iterator.end == termLength) ||
(protWords != null && protWords.contains(termBuffer, 0, termLength))) {
posIncAttribute.setPositionIncrement(accumPosInc);
accumPosInc = 0;
return true;
}
// word consisting only of delimiters: swallow this token, creating a hole, and move on to the next token
if (iterator.end == WordDelimiterIterator.DONE) {
if (has(PRESERVE_ORIGINAL) == false) {
continue;
} else {
accumPosInc = 0;
return true;
}
}
// otherwise, we have delimiters, process & buffer all parts:
bufferWordParts();
}
if (bufferedPos < bufferedLen) {
clearAttributes();
restoreState(savedState);
char[] termPart = bufferedTermParts[bufferedPos];
int startPos = bufferedParts[4*bufferedPos];
int endPos = bufferedParts[4*bufferedPos+1];
int startPart = bufferedParts[4*bufferedPos+2];
int endPart = bufferedParts[4*bufferedPos+3];
bufferedPos++;
int startOffset;
int endOffset;
if (adjustingOffsets == false) {
startOffset = savedStartOffset;
endOffset = savedEndOffset;
} else {
startOffset = savedStartOffset + startPart;
endOffset = savedStartOffset + endPart;
}
// never let offsets go backwards:
startOffset = Math.max(startOffset, lastStartOffset);
endOffset = Math.max(endOffset, lastStartOffset);
offsetAttribute.setOffset(startOffset, endOffset);
lastStartOffset = startOffset;
if (termPart == null) {
termAttribute.copyBuffer(savedTermBuffer, startPart, endPart - startPart);
} else {
termAttribute.copyBuffer(termPart, 0, termPart.length);
}
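// the position increment is any holes accumulated from the input plus how far this part's
// start position advances past the previously emitted part's position (wordPos):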
posIncAttribute.setPositionIncrement(accumPosInc + startPos - wordPos);
accumPosInc = 0;
posLenAttribute.setPositionLength(endPos - startPos);
wordPos = startPos;
return true;
}
// no saved concatenations, on to the next input word
savedState = null;
}
}
@Override
public void reset() throws IOException {
super.reset();
accumPosInc = 0;
savedState = null;
lastStartOffset = 0;
concat.clear();
concatAll.clear();
}
// ================================================= Helper Methods ================================================
private class PositionSorter extends InPlaceMergeSorter {
@Override
protected int compare(int i, int j) {
// smaller start offset
int iOff = bufferedParts[4 * i + 2];
int jOff = bufferedParts[4 * j + 2];
int cmp = Integer.compare(iOff, jOff);
if (cmp != 0) {
return cmp;
}
// longer end offset
iOff = bufferedParts[4 * i + 3];
jOff = bufferedParts[4 * j + 3];
return Integer.compare(jOff, iOff);
}
@Override
protected void swap(int i, int j) {
int iOffset = 4*i;
int jOffset = 4*j;
for(int x=0;x<4;x++) {
int tmp = bufferedParts[iOffset+x];
bufferedParts[iOffset+x] = bufferedParts[jOffset+x];
bufferedParts[jOffset+x] = tmp;
}
char[] tmp2 = bufferedTermParts[i];
bufferedTermParts[i] = bufferedTermParts[j];
bufferedTermParts[j] = tmp2;
}
}
final PositionSorter sorter = new PositionSorter();
/**
* startPos, endPos -> graph start/end position
* startPart, endPart -> slice of the original term for this part
*/
void buffer(int startPos, int endPos, int startPart, int endPart) {
buffer(null, startPos, endPos, startPart, endPart);
}
/**
* a null termPart means it's a simple slice of the original term
*/
void buffer(char[] termPart, int startPos, int endPos, int startPart, int endPart) {
/*
System.out.println("buffer: pos=" + startPos + "-" + endPos + " part=" + startPart + "-" + endPart);
if (termPart != null) {
System.out.println(" termIn=" + new String(termPart));
} else {
System.out.println(" term=" + new String(savedTermBuffer, startPart, endPart-startPart));
}
*/
assert endPos > startPos: "startPos=" + startPos + " endPos=" + endPos;
assert endPart > startPart || (endPart == 0 && startPart == 0 && savedTermLength == 0): "startPart=" + startPart + " endPart=" + endPart;
if ((bufferedLen+1)*4 > bufferedParts.length) {
bufferedParts = ArrayUtil.grow(bufferedParts, (bufferedLen+1)*4);
}
if (bufferedTermParts.length == bufferedLen) {
int newSize = ArrayUtil.oversize(bufferedLen+1, RamUsageEstimator.NUM_BYTES_OBJECT_REF);
char[][] newArray = new char[newSize][];
System.arraycopy(bufferedTermParts, 0, newArray, 0, bufferedTermParts.length);
bufferedTermParts = newArray;
}
bufferedTermParts[bufferedLen] = termPart;
bufferedParts[bufferedLen*4] = startPos;
bufferedParts[bufferedLen*4+1] = endPos;
bufferedParts[bufferedLen*4+2] = startPart;
bufferedParts[bufferedLen*4+3] = endPart;
bufferedLen++;
}
/**
* Saves the existing attribute states
*/
private void saveState() {
savedTermLength = termAttribute.length();
savedStartOffset = offsetAttribute.startOffset();
savedEndOffset = offsetAttribute.endOffset();
savedState = captureState();
if (savedTermBuffer.length < savedTermLength) {
savedTermBuffer = new char[ArrayUtil.oversize(savedTermLength, Character.BYTES)];
}
System.arraycopy(termAttribute.buffer(), 0, savedTermBuffer, 0, savedTermLength);
}
/**
* Flushes the given WordDelimiterConcatenation by either writing its concatenation and then clearing, or just clearing.
*
* @param concat WordDelimiterConcatenation that will be flushed
*/
private void flushConcatenation(WordDelimiterConcatenation concat) {
if (wordPos == concat.startPos) {
// we are not generating parts, so we must advance wordPos now
wordPos++;
}
lastConcatCount = concat.subwordCount;
if (concat.subwordCount != 1 || shouldGenerateParts(concat.type) == false) {
concat.write();
}
concat.clear();
}
/**
* Determines whether to concatenate a word or number if the current word is the given type
*
* @param wordType Type of the current word used to determine if it should be concatenated
* @return {@code true} if concatenation should occur, {@code false} otherwise
*/
private boolean shouldConcatenate(int wordType) {
return (has(CATENATE_WORDS) && WordDelimiterIterator.isAlpha(wordType)) || (has(CATENATE_NUMBERS) && WordDelimiterIterator.isDigit(wordType));
}
/**
* Determines whether a word/number part should be generated for a word of the given type
*
* @param wordType Type of the word used to determine if a word/number part should be generated
* @return {@code true} if a word/number part should be generated, {@code false} otherwise
*/
private boolean shouldGenerateParts(int wordType) {
return (has(GENERATE_WORD_PARTS) && WordDelimiterIterator.isAlpha(wordType)) || (has(GENERATE_NUMBER_PARTS) && WordDelimiterIterator.isDigit(wordType));
}
/**
* Concatenates the saved buffer to the given WordDelimiterConcatenation
*
* @param concatenation WordDelimiterConcatenation to concatenate the buffer to
*/
private void concatenate(WordDelimiterConcatenation concatenation) {
if (concatenation.isEmpty()) {
concatenation.type = iterator.type();
concatenation.startPart = iterator.current;
concatenation.startPos = wordPos;
}
concatenation.append(savedTermBuffer, iterator.current, iterator.end - iterator.current);
concatenation.endPart = iterator.end;
}
/**
* Determines whether the given flag is set
*
* @param flag Flag to see if set
* @return {@code true} if flag is set
*/
private boolean has(int flag) {
return (flags & flag) != 0;
}
// ================================================= Inner Classes =================================================
/**
* A WDF concatenated 'run'
*/
final class WordDelimiterConcatenation {
final StringBuilder buffer = new StringBuilder();
int startPart;
int endPart;
int startPos;
int type;
int subwordCount;
/**
* Appends {@code length} characters of the given text, starting at {@code offset}, to the concatenation
*
* @param text Text to append
* @param offset Offset into the text at which appending starts
* @param length Number of characters to append
*/
void append(char text[], int offset, int length) {
buffer.append(text, offset, length);
subwordCount++;
}
/**
* Writes the concatenation to the part buffer
*/
void write() {
char[] termPart = new char[buffer.length()];
buffer.getChars(0, buffer.length(), termPart, 0);
buffer(termPart, startPos, wordPos, startPart, endPart);
}
/**
* Determines if the concatenation is empty
*
* @return {@code true} if the concatenation is empty, {@code false} otherwise
*/
boolean isEmpty() {
return buffer.length() == 0;
}
boolean isNotEmpty() {
return isEmpty() == false;
}
/**
* Clears the concatenation and resets its state
*/
void clear() {
buffer.setLength(0);
startPart = endPart = type = subwordCount = 0;
}
}
/** Returns string representation of configuration flags */
public static String flagsToString(int flags) {
StringBuilder b = new StringBuilder();
if ((flags & GENERATE_WORD_PARTS) != 0) {
b.append("GENERATE_WORD_PARTS");
}
if ((flags & GENERATE_NUMBER_PARTS) != 0) {
if (b.length() > 0) {
b.append(" | ");
}
b.append("GENERATE_NUMBER_PARTS");
}
if ((flags & CATENATE_WORDS) != 0) {
if (b.length() > 0) {
b.append(" | ");
}
b.append("CATENATE_WORDS");
}
if ((flags & CATENATE_NUMBERS) != 0) {
if (b.length() > 0) {
b.append(" | ");
}
b.append("CATENATE_NUMBERS");
}
if ((flags & CATENATE_ALL) != 0) {
if (b.length() > 0) {
b.append(" | ");
}
b.append("CATENATE_ALL");
}
if ((flags & PRESERVE_ORIGINAL) != 0) {
if (b.length() > 0) {
b.append(" | ");
}
b.append("PRESERVE_ORIGINAL");
}
if ((flags & SPLIT_ON_CASE_CHANGE) != 0) {
if (b.length() > 0) {
b.append(" | ");
}
b.append("SPLIT_ON_CASE_CHANGE");
}
if ((flags & SPLIT_ON_NUMERICS) != 0) {
if (b.length() > 0) {
b.append(" | ");
}
b.append("SPLIT_ON_NUMERICS");
}
if ((flags & STEM_ENGLISH_POSSESSIVE) != 0) {
if (b.length() > 0) {
b.append(" | ");
}
b.append("STEM_ENGLISH_POSSESSIVE");
}
if ((flags & IGNORE_KEYWORDS) != 0) {
if (b.length() > 0) {
b.append(" | ");
}
b.append("IGNORE_KEYWORDS");
}
return b.toString();
}
@Override
public String toString() {
StringBuilder b = new StringBuilder();
b.append("WordDelimiterGraphFilter(flags=");
b.append(flagsToString(flags));
b.append(')');
return b.toString();
}
// questions:
// negative numbers? -42 indexed as just 42?
// dollar sign? $42
// percent sign? 33%
// downsides: if source text is "powershot" then a query of "PowerShot" won't match!
}