lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterIterator.java - lucene-solr - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.lucene.analysis.miscellaneous;

 import java.util.Locale;

 /**
  * A BreakIterator-like API for iterating over subwords in text, according to WordDelimiterGraphFilter rules.
  *
  * <p>Usage, as implied by the methods below: load a buffer with {@link #setText(char[], int)}, then call
  * {@link #next()} until it returns {@link #DONE}. After each successful call the current subword spans
  * {@code text[current]} (inclusive) to {@code text[end]} (exclusive).
  *
  * @lucene.internal
  */
 public final class WordDelimiterIterator {

   /** Character-type bit: lower-case letter. */
   static final int LOWER = 0x01;
   /** Character-type bit: upper-case letter. */
   static final int UPPER = 0x02;
   /** Character-type bit: digit. */
   static final int DIGIT = 0x04;
   /** Character-type bit: subword delimiter. */
   static final int SUBWORD_DELIM = 0x08;

   // combinations: for testing, not for setting bits
   /** Mask matching either letter bit ({@code LOWER | UPPER}). */
   public static final int ALPHA = 0x03;
   /** Mask matching any letter or digit bit ({@code LOWER | UPPER | DIGIT}). */
   public static final int ALPHANUM = 0x07;

   /** Indicates the end of iteration */
   public static final int DONE = -1;

   /**
    * Default character-type table covering code points 0-255. Each entry is LOWER, UPPER or DIGIT according to
    * {@link Character#isLowerCase(int)}, {@link Character#isUpperCase(int)} and {@link Character#isDigit(int)}
    * (checked in that order); every other code point is SUBWORD_DELIM.
    */
   public static final byte[] DEFAULT_WORD_DELIM_TABLE;

   /** buffer being iterated; null until {@link #setText(char[], int)} is called */
   char[] text;
   /** number of valid chars in {@link #text}, as passed to {@link #setText(char[], int)} */
   int length;

   /** start position of text, excluding leading delimiters */
   int startBounds;
   /** end position of text, excluding trailing delimiters */
   int endBounds;

   /** Beginning of subword */
   int current;
   /** End of subword */
   int end;

   /** does this string end with a possessive such as 's */
   private boolean hasFinalPossessive = false;

   /**
    * If false, causes case changes to be ignored (subwords will only be generated
    * given SUBWORD_DELIM tokens).
    */
   final boolean splitOnCaseChange;

   /**
    * If false, causes numeric changes to be ignored (subwords will only be generated
    * given SUBWORD_DELIM tokens).
    */
   final boolean splitOnNumerics;

   /**
    * If true, causes trailing "'s" to be removed for each subword.
    * <p>
    * "O'Neil's" =&gt; "O", "Neil"
    */
   final boolean stemEnglishPossessive;

   /** lookup table consulted by {@link #charType(int)} for chars below its length */
   private final byte[] charTypeTable;

   /** if true, need to skip over a possessive found in the last call to next() */
   private boolean skipPossessive = false;

   // TODO: should there be a WORD_DELIM category for chars that only separate words (no catenation of subwords will be
   // done if separated by these chars?) "," would be an obvious candidate...
   static {
     byte[] tab = new byte[256];
     for (int i = 0; i < 256; i++) {
       byte code = 0;
       if (Character.isLowerCase(i)) {
         code |= LOWER;
       }
       else if (Character.isUpperCase(i)) {
         code |= UPPER;
       }
       else if (Character.isDigit(i)) {
         code |= DIGIT;
       }
       // anything that is neither a letter nor a digit acts as a delimiter
       if (code == 0) {
         code = SUBWORD_DELIM;
       }
       tab[i] = code;
     }
     DEFAULT_WORD_DELIM_TABLE = tab;
   }

   /**
    * Create a new WordDelimiterIterator operating with the supplied rules.
    *
    * @param charTypeTable table containing character types
    * @param splitOnCaseChange if true, causes "PowerShot" to be two tokens; ("Power-Shot" remains two parts regardless)
    * @param splitOnNumerics if true, causes "j2se" to be three tokens; "j" "2" "se"
    * @param stemEnglishPossessive if true, causes trailing "'s" to be removed for each subword: "O'Neil's" =&gt; "O", "Neil"
    */
   WordDelimiterIterator(byte[] charTypeTable, boolean splitOnCaseChange, boolean splitOnNumerics, boolean stemEnglishPossessive) {
     this.charTypeTable = charTypeTable;
     this.splitOnCaseChange = splitOnCaseChange;
     this.splitOnNumerics = splitOnNumerics;
     this.stemEnglishPossessive = stemEnglishPossessive;
   }

   /**
    * Debugging representation of the iterator state.
    *
    * @return {@code "DONE"} once iteration has finished; a placeholder with the raw offsets when no subword can be
    *         rendered (no text set yet, or {@link #next()} not yet called on text with leading delimiters);
    *         otherwise the current subword, its offsets and its type in hex
    */
   @Override
   public String toString() {
     if (end == DONE) {
       return "DONE";
     }
     // Before setText() the buffer is null, and between setText() and the first next() 'current' may already
     // sit past 'end' (current == startBounds, end == 0). Rendering a substring in those states would throw,
     // which is unhelpful for a method mainly used by debuggers and log statements.
     if (text == null || end < current || current >= text.length) {
       return "<no subword> [" + current + "-" + end + "]";
     }
     return new String(text, current, end - current)
         + " [" + current + "-" + end + "]"
         + " type=" + String.format(Locale.ROOT, "%#02x", type());
   }

   /**
    * Advance to the next subword in the string.
    *
    * @return index of the next subword, or {@link #DONE} if all subwords have been returned
    */
   int next() {
     // resume right after the previous subword
     current = end;
     if (current == DONE) {
       return DONE;
     }

     // step over the "'s" that followed the previous subword
     if (skipPossessive) {
       current += 2;
       skipPossessive = false;
     }

     int lastType = 0;

     // skip delimiters; on exit lastType holds the type of the first subword char (if any)
     while (current < endBounds && (isSubwordDelim(lastType = charType(text[current])))) {
       current++;
     }

     if (current >= endBounds) {
       return end = DONE;
     }

     // extend the subword until the type transition counts as a break
     for (end = current + 1; end < endBounds; end++) {
       int type = charType(text[end]);
       if (isBreak(lastType, type)) {
         break;
       }
       lastType = type;
     }

     // if the two chars following this subword form a possessive, remember to skip them next time
     if (end < endBounds - 1 && endsWithPossessive(end + 2)) {
       skipPossessive = true;
     }

     return end;
   }


   /**
    * Return the type of the current subword.
    * This currently uses the type of the first character in the subword.
    *
    * @return type of the current word, or 0 once iteration is {@link #DONE}
    */
   int type() {
     if (end == DONE) {
       return 0;
     }

     int type = charType(text[current]);
     switch (type) {
       // return ALPHA word type for both lower and upper
       case LOWER:
       case UPPER:
         return ALPHA;
       default:
         return type;
     }
   }

   /**
    * Reset the text to a new value, and reset all state
    *
    * @param text New text
    * @param length length of the text
    */
   void setText(char[] text, int length) {
     this.text = text;
     this.length = this.endBounds = length;
     current = startBounds = end = 0;
     skipPossessive = hasFinalPossessive = false;
     setBounds();
   }

   // ================================================= Helper Methods ================================================

   /**
    * Determines whether the transition from lastType to type indicates a break
    *
    * @param lastType Last subword type
    * @param type Current subword type
    * @return {@code true} if the transition indicates a break, {@code false} otherwise
    */
   private boolean isBreak(int lastType, int type) {
     // the two types share at least one bit: same kind of character, never a break
     if ((type & lastType) != 0) {
       return false;
     }

     if (!splitOnCaseChange && isAlpha(lastType) && isAlpha(type)) {
       // ALPHA->ALPHA: always ignore if case isn't considered.
       return false;
     } else if (isUpper(lastType) && isAlpha(type)) {
       // UPPER->letter: Don't split
       return false;
     } else if (!splitOnNumerics && ((isAlpha(lastType) && isDigit(type)) || (isDigit(lastType) && isAlpha(type)))) {
       // ALPHA->NUMERIC, NUMERIC->ALPHA :Don't split
       return false;
     }

     return true;
   }

   /**
    * Determines if the current word contains only one subword.  Note, it could be potentially surrounded by delimiters
    *
    * @return {@code true} if the current word contains only one subword, {@code false} otherwise
    */
   boolean isSingleWord() {
     if (hasFinalPossessive) {
       // the trailing "'s" (2 chars) is not part of the subword
       return current == startBounds && end == endBounds - 2;
     }
     else {
       return current == startBounds && end == endBounds;
     }
   }

   /**
    * Set the internal word bounds (remove leading and trailing delimiters). Note, if a possessive is found, don't remove
    * it yet, simply note it.
    */
   private void setBounds() {
     while (startBounds < length && (isSubwordDelim(charType(text[startBounds])))) {
       startBounds++;
     }

     while (endBounds > startBounds && (isSubwordDelim(charType(text[endBounds - 1])))) {
       endBounds--;
     }
     if (endsWithPossessive(endBounds)) {
       hasFinalPossessive = true;
     }
     current = startBounds;
   }

   /**
    * Determines if the text at the given position indicates an English possessive which should be removed
    *
    * @param pos Position in the text to check if it indicates an English possessive; the two chars before it
    *            are tested for "'s" / "'S", the char before those must be a letter, and {@code pos} itself must
    *            be {@code endBounds} or a delimiter
    * @return {@code true} if the text at the position indicates an English possessive, {@code false} otherwise
    */
   private boolean endsWithPossessive(int pos) {
     return (stemEnglishPossessive &&
             pos > 2 &&
             text[pos - 2] == '\'' &&
             (text[pos - 1] == 's' || text[pos - 1] == 'S') &&
             isAlpha(charType(text[pos - 3])) &&
             (pos == endBounds || isSubwordDelim(charType(text[pos]))));
   }

   /**
    * Determines the type of the given character
    *
    * @param ch Character whose type is to be determined
    * @return Type of the character: the table entry when {@code ch} is below the table length,
    *         otherwise the result of {@link #getType(int)}
    */
   private int charType(int ch) {
     if (ch < charTypeTable.length) {
       return charTypeTable[ch];
     }
     return getType(ch);
   }

   /**
    * Computes the type of the given character
    *
    * @param ch Character whose type is to be determined
    * @return Type of the character
    */
   public static byte getType(int ch) {
     switch (Character.getType(ch)) {
       case Character.UPPERCASE_LETTER: return UPPER;
       case Character.LOWERCASE_LETTER: return LOWER;

       case Character.TITLECASE_LETTER:
       case Character.MODIFIER_LETTER:
       case Character.OTHER_LETTER:
       case Character.NON_SPACING_MARK:
       case Character.ENCLOSING_MARK:  // depends what it encloses?
       case Character.COMBINING_SPACING_MARK:
         return ALPHA;

       case Character.DECIMAL_DIGIT_NUMBER:
       case Character.LETTER_NUMBER:
       case Character.OTHER_NUMBER:
         return DIGIT;

       // case Character.SPACE_SEPARATOR:
       // case Character.LINE_SEPARATOR:
       // case Character.PARAGRAPH_SEPARATOR:
       // case Character.CONTROL:
       // case Character.FORMAT:
       // case Character.PRIVATE_USE:

       case Character.SURROGATE:  // prevent splitting
         return ALPHA|DIGIT;

       // case Character.DASH_PUNCTUATION:
       // case Character.START_PUNCTUATION:
       // case Character.END_PUNCTUATION:
       // case Character.CONNECTOR_PUNCTUATION:
       // case Character.OTHER_PUNCTUATION:
       // case Character.MATH_SYMBOL:
       // case Character.CURRENCY_SYMBOL:
       // case Character.MODIFIER_SYMBOL:
       // case Character.OTHER_SYMBOL:
       // case Character.INITIAL_QUOTE_PUNCTUATION:
       // case Character.FINAL_QUOTE_PUNCTUATION:

       // every category not listed above (including the commented-out ones) is a delimiter
       default: return SUBWORD_DELIM;
     }
   }

   /**
    * Checks if the given word type includes {@link #ALPHA}
    *
    * @param type Word type to check
    * @return {@code true} if the type contains ALPHA, {@code false} otherwise
    */
   static boolean isAlpha(int type) {
     return (type & ALPHA) != 0;
   }

   /**
    * Checks if the given word type includes {@link #DIGIT}
    *
    * @param type Word type to check
    * @return {@code true} if the type contains DIGIT, {@code false} otherwise
    */
   static boolean isDigit(int type) {
     return (type & DIGIT) != 0;
   }

   /**
    * Checks if the given word type includes {@link #SUBWORD_DELIM}
    *
    * @param type Word type to check
    * @return {@code true} if the type contains SUBWORD_DELIM, {@code false} otherwise
    */
   static boolean isSubwordDelim(int type) {
     return (type & SUBWORD_DELIM) != 0;
   }

   /**
    * Checks if the given word type includes {@link #UPPER}
    *
    * @param type Word type to check
    * @return {@code true} if the type contains UPPER, {@code false} otherwise
    */
   static boolean isUpper(int type) {
     return (type & UPPER) != 0;
   }

 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.lucene.analysis.miscellaneous;

	import java.util.Locale;

	/**
	 * A BreakIterator-like API for iterating over subwords in text, according to WordDelimiterGraphFilter rules.
	 *
	 * <p>Usage, as implied by the methods below: load a buffer with {@link #setText(char[], int)}, then call
	 * {@link #next()} until it returns {@link #DONE}. After each successful call the current subword spans
	 * {@code text[current]} (inclusive) to {@code text[end]} (exclusive).
	 *
	 * @lucene.internal
	 */
	public final class WordDelimiterIterator {

	  /** Character-type bit: lower-case letter. */
	  static final int LOWER = 0x01;
	  /** Character-type bit: upper-case letter. */
	  static final int UPPER = 0x02;
	  /** Character-type bit: digit. */
	  static final int DIGIT = 0x04;
	  /** Character-type bit: subword delimiter. */
	  static final int SUBWORD_DELIM = 0x08;

	  // combinations: for testing, not for setting bits
	  /** Mask matching either letter bit ({@code LOWER | UPPER}). */
	  public static final int ALPHA = 0x03;
	  /** Mask matching any letter or digit bit ({@code LOWER | UPPER | DIGIT}). */
	  public static final int ALPHANUM = 0x07;

	  /** Indicates the end of iteration */
	  public static final int DONE = -1;

	  /**
	   * Default character-type table covering code points 0-255. Each entry is LOWER, UPPER or DIGIT according to
	   * {@link Character#isLowerCase(int)}, {@link Character#isUpperCase(int)} and {@link Character#isDigit(int)}
	   * (checked in that order); every other code point is SUBWORD_DELIM.
	   */
	  public static final byte[] DEFAULT_WORD_DELIM_TABLE;

	  /** buffer being iterated; null until {@link #setText(char[], int)} is called */
	  char[] text;
	  /** number of valid chars in {@link #text}, as passed to {@link #setText(char[], int)} */
	  int length;

	  /** start position of text, excluding leading delimiters */
	  int startBounds;
	  /** end position of text, excluding trailing delimiters */
	  int endBounds;

	  /** Beginning of subword */
	  int current;
	  /** End of subword */
	  int end;

	  /** does this string end with a possessive such as 's */
	  private boolean hasFinalPossessive = false;

	  /**
	   * If false, causes case changes to be ignored (subwords will only be generated
	   * given SUBWORD_DELIM tokens).
	   */
	  final boolean splitOnCaseChange;

	  /**
	   * If false, causes numeric changes to be ignored (subwords will only be generated
	   * given SUBWORD_DELIM tokens).
	   */
	  final boolean splitOnNumerics;

	  /**
	   * If true, causes trailing "'s" to be removed for each subword.
	   * <p>
	   * "O'Neil's" =&gt; "O", "Neil"
	   */
	  final boolean stemEnglishPossessive;

	  /** lookup table consulted by {@link #charType(int)} for chars below its length */
	  private final byte[] charTypeTable;

	  /** if true, need to skip over a possessive found in the last call to next() */
	  private boolean skipPossessive = false;

	  // TODO: should there be a WORD_DELIM category for chars that only separate words (no catenation of subwords will be
	  // done if separated by these chars?) "," would be an obvious candidate...
	  static {
	    byte[] tab = new byte[256];
	    for (int i = 0; i < 256; i++) {
	      byte code = 0;
	      if (Character.isLowerCase(i)) {
	        code |= LOWER;
	      }
	      else if (Character.isUpperCase(i)) {
	        code |= UPPER;
	      }
	      else if (Character.isDigit(i)) {
	        code |= DIGIT;
	      }
	      // anything that is neither a letter nor a digit acts as a delimiter
	      if (code == 0) {
	        code = SUBWORD_DELIM;
	      }
	      tab[i] = code;
	    }
	    DEFAULT_WORD_DELIM_TABLE = tab;
	  }

	  /**
	   * Create a new WordDelimiterIterator operating with the supplied rules.
	   *
	   * @param charTypeTable table containing character types
	   * @param splitOnCaseChange if true, causes "PowerShot" to be two tokens; ("Power-Shot" remains two parts regardless)
	   * @param splitOnNumerics if true, causes "j2se" to be three tokens; "j" "2" "se"
	   * @param stemEnglishPossessive if true, causes trailing "'s" to be removed for each subword: "O'Neil's" =&gt; "O", "Neil"
	   */
	  WordDelimiterIterator(byte[] charTypeTable, boolean splitOnCaseChange, boolean splitOnNumerics, boolean stemEnglishPossessive) {
	    this.charTypeTable = charTypeTable;
	    this.splitOnCaseChange = splitOnCaseChange;
	    this.splitOnNumerics = splitOnNumerics;
	    this.stemEnglishPossessive = stemEnglishPossessive;
	  }

	  /**
	   * Debugging representation of the iterator state.
	   *
	   * @return {@code "DONE"} once iteration has finished; a placeholder with the raw offsets when no subword can be
	   *         rendered (no text set yet, or {@link #next()} not yet called on text with leading delimiters);
	   *         otherwise the current subword, its offsets and its type in hex
	   */
	  @Override
	  public String toString() {
	    if (end == DONE) {
	      return "DONE";
	    }
	    // Before setText() the buffer is null, and between setText() and the first next() 'current' may already
	    // sit past 'end' (current == startBounds, end == 0). Rendering a substring in those states would throw,
	    // which is unhelpful for a method mainly used by debuggers and log statements.
	    if (text == null || end < current || current >= text.length) {
	      return "<no subword> [" + current + "-" + end + "]";
	    }
	    return new String(text, current, end - current)
	        + " [" + current + "-" + end + "]"
	        + " type=" + String.format(Locale.ROOT, "%#02x", type());
	  }

	  /**
	   * Advance to the next subword in the string.
	   *
	   * @return index of the next subword, or {@link #DONE} if all subwords have been returned
	   */
	  int next() {
	    // resume right after the previous subword
	    current = end;
	    if (current == DONE) {
	      return DONE;
	    }

	    // step over the "'s" that followed the previous subword
	    if (skipPossessive) {
	      current += 2;
	      skipPossessive = false;
	    }

	    int lastType = 0;

	    // skip delimiters; on exit lastType holds the type of the first subword char (if any)
	    while (current < endBounds && (isSubwordDelim(lastType = charType(text[current])))) {
	      current++;
	    }

	    if (current >= endBounds) {
	      return end = DONE;
	    }

	    // extend the subword until the type transition counts as a break
	    for (end = current + 1; end < endBounds; end++) {
	      int type = charType(text[end]);
	      if (isBreak(lastType, type)) {
	        break;
	      }
	      lastType = type;
	    }

	    // if the two chars following this subword form a possessive, remember to skip them next time
	    if (end < endBounds - 1 && endsWithPossessive(end + 2)) {
	      skipPossessive = true;
	    }

	    return end;
	  }


	  /**
	   * Return the type of the current subword.
	   * This currently uses the type of the first character in the subword.
	   *
	   * @return type of the current word, or 0 once iteration is {@link #DONE}
	   */
	  int type() {
	    if (end == DONE) {
	      return 0;
	    }

	    int type = charType(text[current]);
	    switch (type) {
	      // return ALPHA word type for both lower and upper
	      case LOWER:
	      case UPPER:
	        return ALPHA;
	      default:
	        return type;
	    }
	  }

	  /**
	   * Reset the text to a new value, and reset all state
	   *
	   * @param text New text
	   * @param length length of the text
	   */
	  void setText(char[] text, int length) {
	    this.text = text;
	    this.length = this.endBounds = length;
	    current = startBounds = end = 0;
	    skipPossessive = hasFinalPossessive = false;
	    setBounds();
	  }

	  // ================================================= Helper Methods ================================================

	  /**
	   * Determines whether the transition from lastType to type indicates a break
	   *
	   * @param lastType Last subword type
	   * @param type Current subword type
	   * @return {@code true} if the transition indicates a break, {@code false} otherwise
	   */
	  private boolean isBreak(int lastType, int type) {
	    // the two types share at least one bit: same kind of character, never a break
	    if ((type & lastType) != 0) {
	      return false;
	    }

	    if (!splitOnCaseChange && isAlpha(lastType) && isAlpha(type)) {
	      // ALPHA->ALPHA: always ignore if case isn't considered.
	      return false;
	    } else if (isUpper(lastType) && isAlpha(type)) {
	      // UPPER->letter: Don't split
	      return false;
	    } else if (!splitOnNumerics && ((isAlpha(lastType) && isDigit(type)) || (isDigit(lastType) && isAlpha(type)))) {
	      // ALPHA->NUMERIC, NUMERIC->ALPHA :Don't split
	      return false;
	    }

	    return true;
	  }

	  /**
	   * Determines if the current word contains only one subword. Note, it could be potentially surrounded by delimiters
	   *
	   * @return {@code true} if the current word contains only one subword, {@code false} otherwise
	   */
	  boolean isSingleWord() {
	    if (hasFinalPossessive) {
	      // the trailing "'s" (2 chars) is not part of the subword
	      return current == startBounds && end == endBounds - 2;
	    }
	    else {
	      return current == startBounds && end == endBounds;
	    }
	  }

	  /**
	   * Set the internal word bounds (remove leading and trailing delimiters). Note, if a possessive is found, don't remove
	   * it yet, simply note it.
	   */
	  private void setBounds() {
	    while (startBounds < length && (isSubwordDelim(charType(text[startBounds])))) {
	      startBounds++;
	    }

	    while (endBounds > startBounds && (isSubwordDelim(charType(text[endBounds - 1])))) {
	      endBounds--;
	    }
	    if (endsWithPossessive(endBounds)) {
	      hasFinalPossessive = true;
	    }
	    current = startBounds;
	  }

	  /**
	   * Determines if the text at the given position indicates an English possessive which should be removed
	   *
	   * @param pos Position in the text to check if it indicates an English possessive; the two chars before it
	   *            are tested for "'s" / "'S", the char before those must be a letter, and {@code pos} itself must
	   *            be {@code endBounds} or a delimiter
	   * @return {@code true} if the text at the position indicates an English possessive, {@code false} otherwise
	   */
	  private boolean endsWithPossessive(int pos) {
	    return (stemEnglishPossessive &&
	            pos > 2 &&
	            text[pos - 2] == '\'' &&
	            (text[pos - 1] == 's' || text[pos - 1] == 'S') &&
	            isAlpha(charType(text[pos - 3])) &&
	            (pos == endBounds || isSubwordDelim(charType(text[pos]))));
	  }

	  /**
	   * Determines the type of the given character
	   *
	   * @param ch Character whose type is to be determined
	   * @return Type of the character: the table entry when {@code ch} is below the table length,
	   *         otherwise the result of {@link #getType(int)}
	   */
	  private int charType(int ch) {
	    if (ch < charTypeTable.length) {
	      return charTypeTable[ch];
	    }
	    return getType(ch);
	  }

	  /**
	   * Computes the type of the given character
	   *
	   * @param ch Character whose type is to be determined
	   * @return Type of the character
	   */
	  public static byte getType(int ch) {
	    switch (Character.getType(ch)) {
	      case Character.UPPERCASE_LETTER: return UPPER;
	      case Character.LOWERCASE_LETTER: return LOWER;

	      case Character.TITLECASE_LETTER:
	      case Character.MODIFIER_LETTER:
	      case Character.OTHER_LETTER:
	      case Character.NON_SPACING_MARK:
	      case Character.ENCLOSING_MARK: // depends what it encloses?
	      case Character.COMBINING_SPACING_MARK:
	        return ALPHA;

	      case Character.DECIMAL_DIGIT_NUMBER:
	      case Character.LETTER_NUMBER:
	      case Character.OTHER_NUMBER:
	        return DIGIT;

	      // case Character.SPACE_SEPARATOR:
	      // case Character.LINE_SEPARATOR:
	      // case Character.PARAGRAPH_SEPARATOR:
	      // case Character.CONTROL:
	      // case Character.FORMAT:
	      // case Character.PRIVATE_USE:

	      case Character.SURROGATE: // prevent splitting
	        return ALPHA|DIGIT;

	      // case Character.DASH_PUNCTUATION:
	      // case Character.START_PUNCTUATION:
	      // case Character.END_PUNCTUATION:
	      // case Character.CONNECTOR_PUNCTUATION:
	      // case Character.OTHER_PUNCTUATION:
	      // case Character.MATH_SYMBOL:
	      // case Character.CURRENCY_SYMBOL:
	      // case Character.MODIFIER_SYMBOL:
	      // case Character.OTHER_SYMBOL:
	      // case Character.INITIAL_QUOTE_PUNCTUATION:
	      // case Character.FINAL_QUOTE_PUNCTUATION:

	      // every category not listed above (including the commented-out ones) is a delimiter
	      default: return SUBWORD_DELIM;
	    }
	  }

	  /**
	   * Checks if the given word type includes {@link #ALPHA}
	   *
	   * @param type Word type to check
	   * @return {@code true} if the type contains ALPHA, {@code false} otherwise
	   */
	  static boolean isAlpha(int type) {
	    return (type & ALPHA) != 0;
	  }

	  /**
	   * Checks if the given word type includes {@link #DIGIT}
	   *
	   * @param type Word type to check
	   * @return {@code true} if the type contains DIGIT, {@code false} otherwise
	   */
	  static boolean isDigit(int type) {
	    return (type & DIGIT) != 0;
	  }

	  /**
	   * Checks if the given word type includes {@link #SUBWORD_DELIM}
	   *
	   * @param type Word type to check
	   * @return {@code true} if the type contains SUBWORD_DELIM, {@code false} otherwise
	   */
	  static boolean isSubwordDelim(int type) {
	    return (type & SUBWORD_DELIM) != 0;
	  }

	  /**
	   * Checks if the given word type includes {@link #UPPER}
	   *
	   * @param type Word type to check
	   * @return {@code true} if the type contains UPPER, {@code false} otherwise
	   */
	  static boolean isUpper(int type) {
	    return (type & UPPER) != 0;
	  }

	}