/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
| package org.apache.lucene.analysis.miscellaneous; |
| |
| import java.util.Locale; |
| |
/**
 * A BreakIterator-like API for iterating over subwords in text, according to WordDelimiterGraphFilter rules.
 * <p>
 * Usage: call {@code setText(char[], int)}, then call {@code next()} repeatedly until it returns
 * {@link #DONE}. After each successful call the current subword occupies the half-open range
 * {@code [current, end)} of the text array.
 *
 * @lucene.internal
 */
public final class WordDelimiterIterator {

  // Character type bits. A character's type is one or more of these OR-ed together.
  static final int LOWER = 0x01;
  static final int UPPER = 0x02;
  static final int DIGIT = 0x04;
  static final int SUBWORD_DELIM = 0x08;

  // combinations: for testing, not for setting bits
  public static final int ALPHA = 0x03;
  public static final int ALPHANUM = 0x07;

  /** Indicates the end of iteration */
  public static final int DONE = -1;

  /**
   * Default character type table covering code points 0-255: lower-case, upper-case and digit
   * characters get their respective type; everything else is a {@link #SUBWORD_DELIM}.
   */
  public static final byte[] DEFAULT_WORD_DELIM_TABLE;

  /** the text being iterated over; {@code null} until {@code setText} has been called */
  char[] text;
  /** number of chars of {@link #text} to consider */
  int length;

  /** start position of text, excluding leading delimiters */
  int startBounds;
  /** end position of text, excluding trailing delimiters */
  int endBounds;

  /** Beginning of subword */
  int current;
  /** End of subword */
  int end;

  /** does this string end with a possessive such as 's */
  private boolean hasFinalPossessive = false;

  /**
   * If false, causes case changes to be ignored (subwords will only be generated
   * given SUBWORD_DELIM tokens). (Defaults to true)
   */
  final boolean splitOnCaseChange;

  /**
   * If false, causes numeric changes to be ignored (subwords will only be generated
   * given SUBWORD_DELIM tokens). (Defaults to true)
   */
  final boolean splitOnNumerics;

  /**
   * If true, causes trailing "'s" to be removed for each subword. (Defaults to true)
   * <p>
   * "O'Neil's" =&gt; "O", "Neil"
   */
  final boolean stemEnglishPossessive;

  /** character types for the code points below its length; others are classified by {@link #getType(int)} */
  private final byte[] charTypeTable;

  /** if true, need to skip over a possessive found in the last call to next() */
  private boolean skipPossessive = false;

  // TODO: should there be a WORD_DELIM category for chars that only separate words (no catenation of subwords will be
  // done if separated by these chars?) "," would be an obvious candidate...
  static {
    byte[] tab = new byte[256];
    for (int i = 0; i < 256; i++) {
      byte code = 0;
      if (Character.isLowerCase(i)) {
        code |= LOWER;
      } else if (Character.isUpperCase(i)) {
        code |= UPPER;
      } else if (Character.isDigit(i)) {
        code |= DIGIT;
      }
      // anything that is not a letter or digit separates subwords
      if (code == 0) {
        code = SUBWORD_DELIM;
      }
      tab[i] = code;
    }
    DEFAULT_WORD_DELIM_TABLE = tab;
  }

  /**
   * Create a new WordDelimiterIterator operating with the supplied rules.
   *
   * @param charTypeTable table containing character types
   * @param splitOnCaseChange if true, causes "PowerShot" to be two tokens; ("Power-Shot" remains two parts regardless)
   * @param splitOnNumerics if true, causes "j2se" to be three tokens; "j" "2" "se"
   * @param stemEnglishPossessive if true, causes trailing "'s" to be removed for each subword: "O'Neil's" =&gt; "O", "Neil"
   */
  WordDelimiterIterator(byte[] charTypeTable, boolean splitOnCaseChange, boolean splitOnNumerics, boolean stemEnglishPossessive) {
    this.charTypeTable = charTypeTable;
    this.splitOnCaseChange = splitOnCaseChange;
    this.splitOnNumerics = splitOnNumerics;
    this.stemEnglishPossessive = stemEnglishPossessive;
  }

  /**
   * Describes the current subword as {@code "<subword> [<current>-<end>] type=<hex type>"}.
   * Returns {@code "DONE"} once iteration has finished, and {@code "<no subword>"} while no
   * subword is selected (no text set yet, or {@code next()} not yet called on the current text).
   *
   * @return debugging description of the iterator position; never throws
   */
  @Override
  public String toString() {
    if (end == DONE) {
      return "DONE";
    }
    // Before the first successful next() there is no subword: text may still be null, and after
    // setText() 'current' sits at startBounds while 'end' is 0, so (end - current) can be zero or
    // negative. Building the String / reading text[current] in that state would throw.
    if (text == null || end <= current) {
      return "<no subword>";
    }
    return new String(text, current, end - current)
        + " [" + current + "-" + end + "]"
        + " type=" + String.format(Locale.ROOT, "%#02x", type());
  }

  /**
   * Advance to the next subword in the string.
   *
   * @return index of the next subword, or {@link #DONE} if all subwords have been returned
   */
  int next() {
    current = end;
    if (current == DONE) {
      return DONE;
    }

    // step over the "'s" that followed the previous subword
    if (skipPossessive) {
      current += 2;
      skipPossessive = false;
    }

    int lastType = 0;

    // skip delimiters; on exit lastType holds the type of the first non-delimiter char (if any)
    while (current < endBounds && (isSubwordDelim(lastType = charType(text[current])))) {
      current++;
    }

    if (current >= endBounds) {
      return end = DONE;
    }

    // extend the subword until a type transition that counts as a break
    for (end = current + 1; end < endBounds; end++) {
      int type = charType(text[end]);
      if (isBreak(lastType, type)) {
        break;
      }
      lastType = type;
    }

    // if the two chars after this subword form a possessive, remember to skip them next time
    if (end < endBounds - 1 && endsWithPossessive(end + 2)) {
      skipPossessive = true;
    }

    return end;
  }

  /**
   * Return the type of the current subword.
   * This currently uses the type of the first character in the subword.
   *
   * @return type of the current word, or 0 if iteration is {@link #DONE}
   */
  int type() {
    if (end == DONE) {
      return 0;
    }

    int type = charType(text[current]);
    switch (type) {
      // return ALPHA word type for both lower and upper
      case LOWER:
      case UPPER:
        return ALPHA;
      default:
        return type;
    }
  }

  /**
   * Reset the text to a new value, and reset all state
   *
   * @param text New text
   * @param length length of the text
   */
  void setText(char[] text, int length) {
    this.text = text;
    this.length = this.endBounds = length;
    current = startBounds = end = 0;
    skipPossessive = hasFinalPossessive = false;
    setBounds();
  }

  // ================================================= Helper Methods ================================================

  /**
   * Determines whether the transition from lastType to type indicates a break
   *
   * @param lastType Last subword type
   * @param type Current subword type
   * @return {@code true} if the transition indicates a break, {@code false} otherwise
   */
  private boolean isBreak(int lastType, int type) {
    // the two types share at least one bit: same kind of character, no break
    if ((type & lastType) != 0) {
      return false;
    }

    if (!splitOnCaseChange && isAlpha(lastType) && isAlpha(type)) {
      // ALPHA->ALPHA: always ignore if case isn't considered.
      return false;
    } else if (isUpper(lastType) && isAlpha(type)) {
      // UPPER->letter: Don't split
      return false;
    } else if (!splitOnNumerics && ((isAlpha(lastType) && isDigit(type)) || (isDigit(lastType) && isAlpha(type)))) {
      // ALPHA->NUMERIC, NUMERIC->ALPHA :Don't split
      return false;
    }

    return true;
  }

  /**
   * Determines if the current word contains only one subword.  Note, it could be potentially surrounded by delimiters
   *
   * @return {@code true} if the current word contains only one subword, {@code false} otherwise
   */
  boolean isSingleWord() {
    // a final possessive ("'s") occupies the last two chars inside the bounds but is not part of the subword
    int expectedEnd = hasFinalPossessive ? endBounds - 2 : endBounds;
    return current == startBounds && end == expectedEnd;
  }

  /**
   * Set the internal word bounds (remove leading and trailing delimiters). Note, if a possessive is found, don't remove
   * it yet, simply note it.
   */
  private void setBounds() {
    while (startBounds < length && (isSubwordDelim(charType(text[startBounds])))) {
      startBounds++;
    }

    while (endBounds > startBounds && (isSubwordDelim(charType(text[endBounds - 1])))) {
      endBounds--;
    }
    if (endsWithPossessive(endBounds)) {
      hasFinalPossessive = true;
    }
    current = startBounds;
  }

  /**
   * Determines if the text at the given position indicates an English possessive which should be removed.
   * That is the case when possessive stemming is enabled and the two chars before {@code pos} are an
   * apostrophe followed by 's' or 'S', preceded by an alphabetic char, and {@code pos} is either the
   * end bound or a subword delimiter.
   *
   * @param pos Position in the text to check if it indicates an English possessive
   * @return {@code true} if the text at the position indicates an English possessive, {@code false} otherwise
   */
  private boolean endsWithPossessive(int pos) {
    return (stemEnglishPossessive &&
            pos > 2 &&
            text[pos - 2] == '\'' &&
            (text[pos - 1] == 's' || text[pos - 1] == 'S') &&
            isAlpha(charType(text[pos - 3])) &&
            (pos == endBounds || isSubwordDelim(charType(text[pos]))));
  }

  /**
   * Determines the type of the given character: looked up in the configured table when the
   * character is within the table's range, otherwise computed by {@link #getType(int)}.
   *
   * @param ch Character whose type is to be determined
   * @return Type of the character
   */
  private int charType(int ch) {
    if (ch < charTypeTable.length) {
      return charTypeTable[ch];
    }
    return getType(ch);
  }

  /**
   * Computes the type of the given character from its Unicode general category
   *
   * @param ch Character whose type is to be determined
   * @return Type of the character
   */
  public static byte getType(int ch) {
    switch (Character.getType(ch)) {
      case Character.UPPERCASE_LETTER: return UPPER;
      case Character.LOWERCASE_LETTER: return LOWER;

      case Character.TITLECASE_LETTER:
      case Character.MODIFIER_LETTER:
      case Character.OTHER_LETTER:
      case Character.NON_SPACING_MARK:
      case Character.ENCLOSING_MARK:  // depends what it encloses?
      case Character.COMBINING_SPACING_MARK:
        return ALPHA;

      case Character.DECIMAL_DIGIT_NUMBER:
      case Character.LETTER_NUMBER:
      case Character.OTHER_NUMBER:
        return DIGIT;

      case Character.SURROGATE:  // prevent splitting
        return ALPHA|DIGIT;

      // Every remaining category is a delimiter: separators (space, line, paragraph), CONTROL,
      // FORMAT, PRIVATE_USE, all punctuation categories (dash, start, end, connector, other,
      // initial/final quote) and all symbol categories (math, currency, modifier, other).
      default: return SUBWORD_DELIM;
    }
  }

  /**
   * Checks if the given word type includes {@link #ALPHA}
   *
   * @param type Word type to check
   * @return {@code true} if the type contains ALPHA, {@code false} otherwise
   */
  static boolean isAlpha(int type) {
    return (type & ALPHA) != 0;
  }

  /**
   * Checks if the given word type includes {@link #DIGIT}
   *
   * @param type Word type to check
   * @return {@code true} if the type contains DIGIT, {@code false} otherwise
   */
  static boolean isDigit(int type) {
    return (type & DIGIT) != 0;
  }

  /**
   * Checks if the given word type includes {@link #SUBWORD_DELIM}
   *
   * @param type Word type to check
   * @return {@code true} if the type contains SUBWORD_DELIM, {@code false} otherwise
   */
  static boolean isSubwordDelim(int type) {
    return (type & SUBWORD_DELIM) != 0;
  }

  /**
   * Checks if the given word type includes {@link #UPPER}
   *
   * @param type Word type to check
   * @return {@code true} if the type contains UPPER, {@code false} otherwise
   */
  static boolean isUpper(int type) {
    return (type & UPPER) != 0;
  }

}