| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.analysis.miscellaneous; |
| |
| import java.io.IOException; |
| |
| import org.apache.lucene.analysis.CharArraySet; |
| import org.apache.lucene.analysis.TokenFilter; |
| import org.apache.lucene.analysis.TokenStream; |
| import org.apache.lucene.analysis.core.WhitespaceTokenizer; |
| import org.apache.lucene.analysis.standard.StandardTokenizer; |
| import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; |
| import org.apache.lucene.analysis.tokenattributes.KeywordAttribute; |
| import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; |
| import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; |
| import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; |
| import org.apache.lucene.search.PhraseQuery; |
| import org.apache.lucene.util.ArrayUtil; |
| import org.apache.lucene.util.AttributeSource; |
| import org.apache.lucene.util.InPlaceMergeSorter; |
| import org.apache.lucene.util.RamUsageEstimator; |
| |
| /** |
| * Splits words into subwords and performs optional transformations on subword |
| * groups, producing a correct token graph so that e.g. {@link PhraseQuery} can |
| * work correctly when this filter is used in the search-time analyzer. Unlike |
| * the deprecated {@link WordDelimiterFilter}, this token filter produces a |
| * correct token graph as output. However, it cannot consume an input token |
| * graph correctly. Processing is suppressed by {@link KeywordAttribute#isKeyword()}=true. |
| * |
| * <p> |
| * Words are split into subwords with the following rules: |
| * <ul> |
| * <li>split on intra-word delimiters (by default, all non alpha-numeric |
| * characters): <code>"Wi-Fi"</code> → <code>"Wi", "Fi"</code></li> |
| * <li>split on case transitions: <code>"PowerShot"</code> → |
| * <code>"Power", "Shot"</code></li> |
| * <li>split on letter-number transitions: <code>"SD500"</code> → |
| * <code>"SD", "500"</code></li> |
| * <li>leading and trailing intra-word delimiters on each subword are ignored: |
| * <code>"//hello---there, 'dude'"</code> → |
| * <code>"hello", "there", "dude"</code></li> |
| * <li>trailing "'s" are removed for each subword: <code>"O'Neil's"</code> |
| * → <code>"O", "Neil"</code> |
| * <ul> |
| * <li>Note: this step isn't performed in a separate filter because of possible |
| * subword combinations.</li> |
| * </ul> |
| * </li> |
| * </ul> |
| * |
| * The <b>GENERATE...</b> options affect how incoming tokens are broken into parts, and the |
| * various <b>CATENATE_...</b> parameters affect how those parts are combined. |
| * |
| * <ul> |
| * <li>If no CATENATE option is set, then no subword combinations are generated: |
| * <code>"PowerShot"</code> → <code>0:"Power", 1:"Shot"</code> (0 and 1 are the token |
| * positions)</li> |
| * <li>CATENATE_WORDS means that in addition to the subwords, maximum runs of |
| * non-numeric subwords are catenated and produced at the same position of the |
| * last subword in the run: |
| * <ul> |
| * <li><code>"PowerShot"</code> → |
| * <code>0:"Power", 1:"Shot" 1:"PowerShot"</code></li> |
 * <li><code>"A's+B's&amp;C's"</code> → <code>0:"A", 1:"B", 2:"C", 2:"ABC"</code>
| * </li> |
| * <li><code>"Super-Duper-XL500-42-AutoCoder!"</code> → |
| * <code>0:"Super", 1:"Duper", 2:"XL", 2:"SuperDuperXL", 3:"500" 4:"42", 5:"Auto", 6:"Coder", 6:"AutoCoder"</code> |
| * </li> |
| * </ul> |
| * </li> |
| * <li>CATENATE_NUMBERS works like CATENATE_WORDS, but for adjacent digit sequences.</li> |
| * <li>CATENATE_ALL smushes together all the token parts without distinguishing numbers and words.</li> |
| * </ul> |
| * One use for {@link WordDelimiterGraphFilter} is to help match words with different |
| * subword delimiters. For example, if the source text contained "wi-fi" one may |
| * want "wifi" "WiFi" "wi-fi" "wi+fi" queries to all match. One way of doing so |
| * is to specify CATENATE options in the analyzer used for indexing, and not |
| * in the analyzer used for querying. Given that |
| * the current {@link StandardTokenizer} immediately removes many intra-word |
| * delimiters, it is recommended that this filter be used after a tokenizer that |
| * does not do this (such as {@link WhitespaceTokenizer}). |
| */ |
| |
| public final class WordDelimiterGraphFilter extends TokenFilter { |
| |
  /**
   * Causes parts of words to be generated:
   * <p>
   * "PowerShot" => "Power" "Shot"
   */
  public static final int GENERATE_WORD_PARTS = 1;

  /**
   * Causes number subwords to be generated:
   * <p>
   * "500-42" => "500" "42"
   */
  public static final int GENERATE_NUMBER_PARTS = 2;

  /**
   * Causes maximum runs of word parts to be catenated:
   * <p>
   * "wi-fi" => "wifi"
   */
  public static final int CATENATE_WORDS = 4;

  /**
   * Causes maximum runs of number parts to be catenated:
   * <p>
   * "500-42" => "50042"
   */
  public static final int CATENATE_NUMBERS = 8;

  /**
   * Causes all subword parts to be catenated:
   * <p>
   * "wi-fi-4000" => "wifi4000"
   */
  public static final int CATENATE_ALL = 16;

  /**
   * Causes the original token to be preserved and added to the subword list (not set by default):
   * <p>
   * "500-42" => "500" "42" "500-42"
   */
  public static final int PRESERVE_ORIGINAL = 32;

  /**
   * Causes lowercase -> uppercase transitions to start a new subword.
   */
  public static final int SPLIT_ON_CASE_CHANGE = 64;

  /**
   * Causes letter/number transitions to start a new subword. If not set, numeric
   * transitions are ignored (subwords will only be generated given SUBWORD_DELIM tokens).
   */
  public static final int SPLIT_ON_NUMERICS = 128;

  /**
   * Causes trailing "'s" to be removed for each subword:
   * <p>
   * "O'Neil's" => "O", "Neil"
   */
  public static final int STEM_ENGLISH_POSSESSIVE = 256;

  /**
   * Suppresses processing of terms with {@link KeywordAttribute#isKeyword()}=true.
   */
  public static final int IGNORE_KEYWORDS = 512;

  /** If not null, the set of tokens to protect from being delimited. */
  final CharArraySet protWords;
| |
| private final int flags; |
| |
| // packs start pos, end pos, start part, end part (= slice of the term text) for each buffered part: |
| private int[] bufferedParts = new int[16]; |
| private int bufferedLen; |
| private int bufferedPos; |
| |
| // holds text for each buffered part, or null if it's a simple slice of the original term |
| private char[][] bufferedTermParts = new char[4][]; |
| |
| private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class); |
| private final KeywordAttribute keywordAttribute = addAttribute(KeywordAttribute.class);; |
| private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class); |
| private final PositionIncrementAttribute posIncAttribute = addAttribute(PositionIncrementAttribute.class); |
| private final PositionLengthAttribute posLenAttribute = addAttribute(PositionLengthAttribute.class); |
| |
| // used for iterating word delimiter breaks |
| private final WordDelimiterIterator iterator; |
| |
| // used for concatenating runs of similar typed subwords (word,number) |
| private final WordDelimiterConcatenation concat = new WordDelimiterConcatenation(); |
| |
| private final boolean adjustInternalOffsets; |
| |
| // number of subwords last output by concat. |
| private int lastConcatCount; |
| |
| // used for catenate all |
| private final WordDelimiterConcatenation concatAll = new WordDelimiterConcatenation(); |
| |
| // used for accumulating position increment gaps so that we preserve incoming holes: |
| private int accumPosInc; |
| |
| private char[] savedTermBuffer = new char[16]; |
| private int savedTermLength; |
| private int savedStartOffset; |
| private int savedEndOffset; |
| private AttributeSource.State savedState; |
| private int lastStartOffset; |
| private boolean adjustingOffsets; |
| |
| private int wordPos; |
| |
| /** |
| * Creates a new WordDelimiterGraphFilter |
| * |
| * @param in TokenStream to be filtered |
| * @param adjustInternalOffsets if the offsets of partial terms should be adjusted |
| * @param charTypeTable table containing character types |
| * @param configurationFlags Flags configuring the filter |
| * @param protWords If not null is the set of tokens to protect from being delimited |
| */ |
| public WordDelimiterGraphFilter(TokenStream in, boolean adjustInternalOffsets, byte[] charTypeTable, int configurationFlags, CharArraySet protWords) { |
| super(in); |
| if ((configurationFlags & |
| ~(GENERATE_WORD_PARTS | |
| GENERATE_NUMBER_PARTS | |
| CATENATE_WORDS | |
| CATENATE_NUMBERS | |
| CATENATE_ALL | |
| PRESERVE_ORIGINAL | |
| SPLIT_ON_CASE_CHANGE | |
| SPLIT_ON_NUMERICS | |
| STEM_ENGLISH_POSSESSIVE | |
| IGNORE_KEYWORDS)) != 0) { |
| throw new IllegalArgumentException("flags contains unrecognized flag: " + configurationFlags); |
| } |
| this.flags = configurationFlags; |
| this.protWords = protWords; |
| this.iterator = new WordDelimiterIterator( |
| charTypeTable, has(SPLIT_ON_CASE_CHANGE), has(SPLIT_ON_NUMERICS), has(STEM_ENGLISH_POSSESSIVE)); |
| this.adjustInternalOffsets = adjustInternalOffsets; |
| } |
| |
  /**
   * Creates a new WordDelimiterGraphFilter using {@link WordDelimiterIterator#DEFAULT_WORD_DELIM_TABLE}
   * as its charTypeTable and without adjusting internal offsets.
   *
   * @param in TokenStream to be filtered
   * @param configurationFlags Flags configuring the filter
   * @param protWords If not null is the set of tokens to protect from being delimited
   */
  public WordDelimiterGraphFilter(TokenStream in, int configurationFlags, CharArraySet protWords) {
    this(in, false, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, configurationFlags, protWords);
  }
| |
  /** Iterates all words parts and concatenations, buffering up the term parts we should return. */
  private void bufferWordParts() throws IOException {

    saveState();

    // only adjust offsets per-part when the incoming start/end offsets exactly span the term's
    // text; otherwise (e.g. WDGF applied to an injected synonym, or to a stem'd form) fall back
    // to the incoming offsets for all word parts/concats:
    adjustingOffsets = adjustInternalOffsets && savedEndOffset - savedStartOffset == savedTermLength;

    bufferedLen = 0;
    lastConcatCount = 0;
    wordPos = 0;

    if (has(PRESERVE_ORIGINAL)) {
      // add the original token now so that it is always emitted first
      // we will edit the term length (bufferedParts[1]) after all other parts have been buffered
      buffer(0, 1, 0, savedTermLength);
    }

    if (iterator.isSingleWord()) {
      // single part: no concatenations possible, just emit it
      buffer(wordPos, wordPos+1, iterator.current, iterator.end);
      wordPos++;
      iterator.next();
    } else {

      // iterate all words parts, possibly buffering them, building up concatenations and possibly buffering them too:
      while (iterator.end != WordDelimiterIterator.DONE) {
        int wordType = iterator.type();

        // do we already have queued up incompatible concatenations?
        // (concat.type and wordType are bit sets; zero overlap means the run must end here)
        if (concat.isNotEmpty() && (concat.type & wordType) == 0) {
          flushConcatenation(concat);
        }

        // add subwords depending upon options
        if (shouldConcatenate(wordType)) {
          concatenate(concat);
        }

        // add all subwords (catenateAll)
        if (has(CATENATE_ALL)) {
          concatenate(concatAll);
        }

        // if we should output the word or number part
        if (shouldGenerateParts(wordType)) {
          buffer(wordPos, wordPos+1, iterator.current, iterator.end);
          wordPos++;
        }
        iterator.next();
      }

      if (concat.isNotEmpty()) {
        // flush final concatenation
        flushConcatenation(concat);
      }

      if (concatAll.isNotEmpty()) {
        // only if we haven't output this same combo above, e.g. PowerShot with CATENATE_WORDS:
        if (concatAll.subwordCount > lastConcatCount) {
          if (wordPos == concatAll.startPos) {
            // we are not generating parts, so we must advance wordPos now
            wordPos++;
          }
          concatAll.write();
        }
        concatAll.clear();
      }
    }

    if (has(PRESERVE_ORIGINAL)) {
      // we now know how many tokens need to be injected, so we can set the original
      // token's position length
      if (wordPos == 0) {
        // can happen w/ strange flag combos and inputs :)
        wordPos++;
      }
      bufferedParts[1] = wordPos;
    }

    // sort buffered parts by (startPart asc, endPart desc); the PRESERVE_ORIGINAL entry at
    // index 0, if present, is excluded so it always stays first:
    sorter.sort(has(PRESERVE_ORIGINAL) ? 1 : 0, bufferedLen);
    wordPos = 0;

    // set back to 0 for iterating from the buffer
    bufferedPos = 0;
  }
| |
  /**
   * Emits either the next buffered part of the previous input token, or — when the buffer is
   * exhausted — pulls, analyzes and buffers the next input token. Returns false at end of input.
   */
  @Override
  public boolean incrementToken() throws IOException {
    while (true) {
      if (savedState == null) {

        // process a new input token
        if (input.incrementToken() == false) {
          return false;
        }
        // keyword tokens pass through untouched when IGNORE_KEYWORDS is set
        if (has(IGNORE_KEYWORDS) && keywordAttribute.isKeyword()) {
          return true;
        }
        int termLength = termAttribute.length();
        char[] termBuffer = termAttribute.buffer();

        accumPosInc += posIncAttribute.getPositionIncrement();

        // iterate & cache all word parts up front:
        iterator.setText(termBuffer, termLength);
        iterator.next();

        // word of no delimiters, or protected word: just return it
        if ((iterator.current == 0 && iterator.end == termLength) ||
            (protWords != null && protWords.contains(termBuffer, 0, termLength))) {
          posIncAttribute.setPositionIncrement(accumPosInc);
          accumPosInc = 0;
          return true;
        }

        // word of simply delimiters: swallow this token, creating a hole, and move on to next token
        if (iterator.end == WordDelimiterIterator.DONE) {
          if (has(PRESERVE_ORIGINAL) == false) {
            continue;
          } else {
            accumPosInc = 0;
            return true;
          }
        }

        // otherwise, we have delimiters, process & buffer all parts:
        bufferWordParts();
      }

      if (bufferedPos < bufferedLen) {
        clearAttributes();
        restoreState(savedState);

        // unpack the next buffered part (layout: startPos, endPos, startPart, endPart)
        char[] termPart = bufferedTermParts[bufferedPos];
        int startPos = bufferedParts[4*bufferedPos];
        int endPos = bufferedParts[4*bufferedPos+1];
        int startPart = bufferedParts[4*bufferedPos+2];
        int endPart = bufferedParts[4*bufferedPos+3];
        bufferedPos++;

        int startOffset;
        int endOffset;

        if (adjustingOffsets == false) {
          // incoming offsets don't span the term text; reuse them unchanged for every part
          startOffset = savedStartOffset;
          endOffset = savedEndOffset;
        } else {
          startOffset = savedStartOffset + startPart;
          endOffset = savedStartOffset + endPart;
        }

        // never let offsets go backwards:
        startOffset = Math.max(startOffset, lastStartOffset);
        endOffset = Math.max(endOffset, lastStartOffset);

        offsetAttribute.setOffset(startOffset, endOffset);
        lastStartOffset = startOffset;

        if (termPart == null) {
          // simple slice of the original term
          termAttribute.copyBuffer(savedTermBuffer, startPart, endPart - startPart);
        } else {
          // catenated text computed when the parts were buffered
          termAttribute.copyBuffer(termPart, 0, termPart.length);
        }

        posIncAttribute.setPositionIncrement(accumPosInc + startPos - wordPos);
        accumPosInc = 0;
        posLenAttribute.setPositionLength(endPos - startPos);
        wordPos = startPos;
        return true;
      }

      // no saved concatenations, on to the next input word
      savedState = null;
    }
  }
| |
| @Override |
| public void reset() throws IOException { |
| super.reset(); |
| accumPosInc = 0; |
| savedState = null; |
| lastStartOffset = 0; |
| concat.clear(); |
| concatAll.clear(); |
| } |
| |
| // ================================================= Helper Methods ================================================ |
| |
| private class PositionSorter extends InPlaceMergeSorter { |
| @Override |
| protected int compare(int i, int j) { |
| // smaller start offset |
| int iOff = bufferedParts[4 * i + 2]; |
| int jOff = bufferedParts[4 * j + 2]; |
| int cmp = Integer.compare(iOff, jOff); |
| if (cmp != 0) { |
| return cmp; |
| } |
| |
| // longer end offset |
| iOff = bufferedParts[4 * i + 3]; |
| jOff = bufferedParts[4 * j + 3]; |
| return Integer.compare(jOff, iOff); |
| } |
| |
| @Override |
| protected void swap(int i, int j) { |
| int iOffset = 4*i; |
| int jOffset = 4*j; |
| for(int x=0;x<4;x++) { |
| int tmp = bufferedParts[iOffset+x]; |
| bufferedParts[iOffset+x] = bufferedParts[jOffset+x]; |
| bufferedParts[jOffset+x] = tmp; |
| } |
| |
| char[] tmp2 = bufferedTermParts[i]; |
| bufferedTermParts[i] = bufferedTermParts[j]; |
| bufferedTermParts[j] = tmp2; |
| } |
| } |
| |
| final PositionSorter sorter = new PositionSorter(); |
| |
  /**
   * Buffers a part that is a simple slice of the original term.
   *
   * @param startPos graph start position
   * @param endPos graph end position
   * @param startPart start index of the slice within the original term
   * @param endPart end index of the slice within the original term
   */
  void buffer(int startPos, int endPos, int startPart, int endPart) {
    buffer(null, startPos, endPos, startPart, endPart);
  }
| |
| /** |
| * a null termPart means it's a simple slice of the original term |
| */ |
| void buffer(char[] termPart, int startPos, int endPos, int startPart, int endPart) { |
| /* |
| System.out.println("buffer: pos=" + startPos + "-" + endPos + " part=" + startPart + "-" + endPart); |
| if (termPart != null) { |
| System.out.println(" termIn=" + new String(termPart)); |
| } else { |
| System.out.println(" term=" + new String(savedTermBuffer, startPart, endPart-startPart)); |
| } |
| */ |
| assert endPos > startPos: "startPos=" + startPos + " endPos=" + endPos; |
| assert endPart > startPart || (endPart == 0 && startPart == 0 && savedTermLength == 0): "startPart=" + startPart + " endPart=" + endPart; |
| if ((bufferedLen+1)*4 > bufferedParts.length) { |
| bufferedParts = ArrayUtil.grow(bufferedParts, (bufferedLen+1)*4); |
| } |
| if (bufferedTermParts.length == bufferedLen) { |
| int newSize = ArrayUtil.oversize(bufferedLen+1, RamUsageEstimator.NUM_BYTES_OBJECT_REF); |
| char[][] newArray = new char[newSize][]; |
| System.arraycopy(bufferedTermParts, 0, newArray, 0, bufferedTermParts.length); |
| bufferedTermParts = newArray; |
| } |
| bufferedTermParts[bufferedLen] = termPart; |
| bufferedParts[bufferedLen*4] = startPos; |
| bufferedParts[bufferedLen*4+1] = endPos; |
| bufferedParts[bufferedLen*4+2] = startPart; |
| bufferedParts[bufferedLen*4+3] = endPart; |
| bufferedLen++; |
| } |
| |
| /** |
| * Saves the existing attribute states |
| */ |
| private void saveState() { |
| savedTermLength = termAttribute.length(); |
| savedStartOffset = offsetAttribute.startOffset(); |
| savedEndOffset = offsetAttribute.endOffset(); |
| savedState = captureState(); |
| |
| if (savedTermBuffer.length < savedTermLength) { |
| savedTermBuffer = new char[ArrayUtil.oversize(savedTermLength, Character.BYTES)]; |
| } |
| |
| System.arraycopy(termAttribute.buffer(), 0, savedTermBuffer, 0, savedTermLength); |
| } |
| |
| /** |
| * Flushes the given WordDelimiterConcatenation by either writing its concat and then clearing, or just clearing. |
| * |
| * @param concat WordDelimiterConcatenation that will be flushed |
| */ |
| private void flushConcatenation(WordDelimiterConcatenation concat) { |
| if (wordPos == concat.startPos) { |
| // we are not generating parts, so we must advance wordPos now |
| wordPos++; |
| } |
| lastConcatCount = concat.subwordCount; |
| if (concat.subwordCount != 1 || shouldGenerateParts(concat.type) == false) { |
| concat.write(); |
| } |
| concat.clear(); |
| } |
| |
| /** |
| * Determines whether to concatenate a word or number if the current word is the given type |
| * |
| * @param wordType Type of the current word used to determine if it should be concatenated |
| * @return {@code true} if concatenation should occur, {@code false} otherwise |
| */ |
| private boolean shouldConcatenate(int wordType) { |
| return (has(CATENATE_WORDS) && WordDelimiterIterator.isAlpha(wordType)) || (has(CATENATE_NUMBERS) && WordDelimiterIterator.isDigit(wordType)); |
| } |
| |
| /** |
| * Determines whether a word/number part should be generated for a word of the given type |
| * |
| * @param wordType Type of the word used to determine if a word/number part should be generated |
| * @return {@code true} if a word/number part should be generated, {@code false} otherwise |
| */ |
| private boolean shouldGenerateParts(int wordType) { |
| return (has(GENERATE_WORD_PARTS) && WordDelimiterIterator.isAlpha(wordType)) || (has(GENERATE_NUMBER_PARTS) && WordDelimiterIterator.isDigit(wordType)); |
| } |
| |
| /** |
| * Concatenates the saved buffer to the given WordDelimiterConcatenation |
| * |
| * @param concatenation WordDelimiterConcatenation to concatenate the buffer to |
| */ |
| private void concatenate(WordDelimiterConcatenation concatenation) { |
| if (concatenation.isEmpty()) { |
| concatenation.type = iterator.type(); |
| concatenation.startPart = iterator.current; |
| concatenation.startPos = wordPos; |
| } |
| concatenation.append(savedTermBuffer, iterator.current, iterator.end - iterator.current); |
| concatenation.endPart = iterator.end; |
| } |
| |
| /** |
| * Determines whether the given flag is set |
| * |
| * @param flag Flag to see if set |
| * @return {@code true} if flag is set |
| */ |
| private boolean has(int flag) { |
| return (flags & flag) != 0; |
| } |
| |
| // ================================================= Inner Classes ================================================= |
| |
| /** |
| * A WDF concatenated 'run' |
| */ |
| final class WordDelimiterConcatenation { |
| final StringBuilder buffer = new StringBuilder(); |
| int startPart; |
| int endPart; |
| int startPos; |
| int type; |
| int subwordCount; |
| |
| /** |
| * Appends the given text of the given length, to the concetenation at the given offset |
| * |
| * @param text Text to append |
| * @param offset Offset in the concetenation to add the text |
| * @param length Length of the text to append |
| */ |
| void append(char text[], int offset, int length) { |
| buffer.append(text, offset, length); |
| subwordCount++; |
| } |
| |
| /** |
| * Writes the concatenation to part buffer |
| */ |
| void write() { |
| char[] termPart = new char[buffer.length()]; |
| buffer.getChars(0, buffer.length(), termPart, 0); |
| buffer(termPart, startPos, wordPos, startPart, endPart); |
| } |
| |
| /** |
| * Determines if the concatenation is empty |
| * |
| * @return {@code true} if the concatenation is empty, {@code false} otherwise |
| */ |
| boolean isEmpty() { |
| return buffer.length() == 0; |
| } |
| |
| boolean isNotEmpty() { |
| return isEmpty() == false; |
| } |
| |
| /** |
| * Clears the concatenation and resets its state |
| */ |
| void clear() { |
| buffer.setLength(0); |
| startPart = endPart = type = subwordCount = 0; |
| } |
| } |
| |
| /** Returns string representation of configuration flags */ |
| public static String flagsToString(int flags) { |
| StringBuilder b = new StringBuilder(); |
| if ((flags & GENERATE_WORD_PARTS) != 0) { |
| b.append("GENERATE_WORD_PARTS"); |
| } |
| if ((flags & GENERATE_NUMBER_PARTS) != 0) { |
| if (b.length() > 0) { |
| b.append(" | "); |
| } |
| b.append("GENERATE_NUMBER_PARTS"); |
| } |
| if ((flags & CATENATE_WORDS) != 0) { |
| if (b.length() > 0) { |
| b.append(" | "); |
| } |
| b.append("CATENATE_WORDS"); |
| } |
| if ((flags & CATENATE_NUMBERS) != 0) { |
| if (b.length() > 0) { |
| b.append(" | "); |
| } |
| b.append("CATENATE_NUMBERS"); |
| } |
| if ((flags & CATENATE_ALL) != 0) { |
| if (b.length() > 0) { |
| b.append(" | "); |
| } |
| b.append("CATENATE_ALL"); |
| } |
| if ((flags & PRESERVE_ORIGINAL) != 0) { |
| if (b.length() > 0) { |
| b.append(" | "); |
| } |
| b.append("PRESERVE_ORIGINAL"); |
| } |
| if ((flags & SPLIT_ON_CASE_CHANGE) != 0) { |
| if (b.length() > 0) { |
| b.append(" | "); |
| } |
| b.append("SPLIT_ON_CASE_CHANGE"); |
| } |
| if ((flags & SPLIT_ON_NUMERICS) != 0) { |
| if (b.length() > 0) { |
| b.append(" | "); |
| } |
| b.append("SPLIT_ON_NUMERICS"); |
| } |
| if ((flags & STEM_ENGLISH_POSSESSIVE) != 0) { |
| if (b.length() > 0) { |
| b.append(" | "); |
| } |
| b.append("STEM_ENGLISH_POSSESSIVE"); |
| } |
| |
| return b.toString(); |
| } |
| |
| @Override |
| public String toString() { |
| StringBuilder b = new StringBuilder(); |
| b.append("WordDelimiterGraphFilter(flags="); |
| b.append(flagsToString(flags)); |
| b.append(')'); |
| return b.toString(); |
| } |
| |
| // questions: |
| // negative numbers? -42 indexed as just 42? |
| // dollar sign? $42 |
| // percent sign? 33% |
| // downsides: if source text is "powershot" then a query of "PowerShot" won't match! |
| } |