lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ScriptIterator.java - lucene-solr - Git at Google

 /*
  * Copyright (C) 1999-2010, International Business Machines
  * Corporation and others.  All Rights Reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
  * in the Software without restriction, including without limitation the rights
  * to use, copy, modify, merge, publish, distribute, and/or sell copies of the
  * Software, and to permit persons to whom the Software is furnished to do so,
  * provided that the above copyright notice(s) and this permission notice appear
  * in all copies of the Software and that both the above copyright notice(s) and
  * this permission notice appear in supporting documentation.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
  * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE
  * LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR
  * ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
  * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
  * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  *
  * Except as contained in this notice, the name of a copyright holder shall not
  * be used in advertising or otherwise to promote the sale, use or other
  * dealings in this Software without prior written authorization of the
  * copyright holder.
  */
 package org.apache.lucene.analysis.icu.segmentation;

 import com.ibm.icu.lang.UCharacter;
 import com.ibm.icu.lang.UCharacterEnums.ECharacterCategory;
 import com.ibm.icu.lang.UScript;
 import com.ibm.icu.text.UTF16;

 /**
  * An iterator that locates ISO 15924 script boundaries in text.
  * <p>
  * This is not the same as simply looking at the Unicode block, or even the
  * Script property. Some characters are 'common' across multiple scripts, and
  * some 'inherit' the script value of text surrounding them.
  * <p>
  * This is similar to ICU (internal-only) UScriptRun, with the following
  * differences:
  * <ul>
  *  <li>Doesn't attempt to match paired punctuation. For tokenization purposes, this
  * is not necessary. It's also quite expensive.
  *  <li>Non-spacing marks inherit the script of their base character, following
  *  recommendations from UTR #24.
  * </ul>
  * @lucene.experimental
  */
 final class ScriptIterator {
   private char text[];
   private int start;
   private int limit;
   private int index;

   private int scriptStart;
   private int scriptLimit;
   private int scriptCode;

   private final boolean combineCJ;

   /**
    * @param combineCJ if true: Han,Hiragana,Katakana will all return as {@link UScript#JAPANESE}
    */
   ScriptIterator(boolean combineCJ) {
     this.combineCJ = combineCJ;
   }

   /**
    * Get the start of this script run
    *
    * @return start position of script run
    */
   int getScriptStart() {
     return scriptStart;
   }

   /**
    * Get the index of the first character after the end of this script run
    *
    * @return position of the first character after this script run
    */
   int getScriptLimit() {
     return scriptLimit;
   }

   /**
    * Get the UScript script code for this script run
    *
    * @return code for the script of the current run
    */
   int getScriptCode() {
     return scriptCode;
   }

   /**
    * Iterates to the next script run, returning true if one exists.
    *
    * @return true if there is another script run, false otherwise.
    */
   boolean next() {
     if (scriptLimit >= limit)
       return false;

     scriptCode = UScript.COMMON;
     scriptStart = scriptLimit;

     while (index < limit) {
       final int ch = UTF16.charAt(text, start, limit, index - start);
       final int sc = getScript(ch);

       /*
        * From UTR #24: Implementations that determine the boundaries between
        * characters of given scripts should never break between a non-spacing
        * mark and its base character. Thus for boundary determinations and
        * similar sorts of processing, a non-spacing mark — whatever its script
        * value — should inherit the script value of its base character.
        */
       if (isSameScript(scriptCode, sc)
           || UCharacter.getType(ch) == ECharacterCategory.NON_SPACING_MARK) {
         index += UTF16.getCharCount(ch);

         /*
          * Inherited or Common becomes the script code of the surrounding text.
          */
         if (scriptCode <= UScript.INHERITED && sc > UScript.INHERITED) {
           scriptCode = sc;
         }

       } else {
         break;
       }
     }

     scriptLimit = index;
     return true;
   }

   /** Determine if two scripts are compatible. */
   private static boolean isSameScript(int scriptOne, int scriptTwo) {
     return scriptOne <= UScript.INHERITED || scriptTwo <= UScript.INHERITED
         || scriptOne == scriptTwo;
   }

   /**
    * Set a new region of text to be examined by this iterator
    *
    * @param text text buffer to examine
    * @param start offset into buffer
    * @param length maximum length to examine
    */
   void setText(char text[], int start, int length) {
     this.text = text;
     this.start = start;
     this.index = start;
     this.limit = start + length;
     this.scriptStart = start;
     this.scriptLimit = start;
     this.scriptCode = UScript.INVALID_CODE;
   }

   /** linear fast-path for basic latin case */
   private static final int basicLatin[] = new int[128];

   static {
     for (int i = 0; i < basicLatin.length; i++)
       basicLatin[i] = UScript.getScript(i);
   }

   /** fast version of UScript.getScript(). Basic Latin is an array lookup */
   private int getScript(int codepoint) {
     if (0 <= codepoint && codepoint < basicLatin.length) {
       return basicLatin[codepoint];
     } else {
       int script = UScript.getScript(codepoint);
       if (combineCJ) {
         if (script == UScript.HAN || script == UScript.HIRAGANA || script == UScript.KATAKANA) {
           return UScript.JAPANESE;
         } else if (codepoint >= 0xFF10 && codepoint <= 0xFF19) {
           // when using CJK dictionary breaking, don't let full width numbers go to it, otherwise
           // they are treated as punctuation. we currently have no cleaner way to fix this!
           return UScript.LATIN;
         } else {
           return script;
         }
       } else {
         return script;
       }
     }
   }
 }
	/*
	* Copyright (C) 1999-2010, International Business Machines
	* Corporation and others. All Rights Reserved.
	*
	* Permission is hereby granted, free of charge, to any person obtaining a copy
	* of this software and associated documentation files (the "Software"), to deal
	* in the Software without restriction, including without limitation the rights
	* to use, copy, modify, merge, publish, distribute, and/or sell copies of the
	* Software, and to permit persons to whom the Software is furnished to do so,
	* provided that the above copyright notice(s) and this permission notice appear
	* in all copies of the Software and that both the above copyright notice(s) and
	* this permission notice appear in supporting documentation.
	*
	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
	* IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE
	* LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR
	* ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
	* IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
	* OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
	*
	* Except as contained in this notice, the name of a copyright holder shall not
	* be used in advertising or otherwise to promote the sale, use or other
	* dealings in this Software without prior written authorization of the
	* copyright holder.
	*/
	package org.apache.lucene.analysis.icu.segmentation;

	import com.ibm.icu.lang.UCharacter;
	import com.ibm.icu.lang.UCharacterEnums.ECharacterCategory;
	import com.ibm.icu.lang.UScript;
	import com.ibm.icu.text.UTF16;

	/**
	* An iterator that locates ISO 15924 script boundaries in text.
	* <p>
	* This is not the same as simply looking at the Unicode block, or even the
	* Script property. Some characters are 'common' across multiple scripts, and
	* some 'inherit' the script value of text surrounding them.
	* <p>
	* This is similar to ICU (internal-only) UScriptRun, with the following
	* differences:
	* <ul>
	* <li>Doesn't attempt to match paired punctuation. For tokenization purposes, this
	* is not necessary. It's also quite expensive.
	* <li>Non-spacing marks inherit the script of their base character, following
	* recommendations from UTR #24.
	* </ul>
	* @lucene.experimental
	*/
	final class ScriptIterator {
	private char text[];
	private int start;
	private int limit;
	private int index;

	private int scriptStart;
	private int scriptLimit;
	private int scriptCode;

	private final boolean combineCJ;

	/**
	* @param combineCJ if true: Han,Hiragana,Katakana will all return as {@link UScript#JAPANESE}
	*/
	ScriptIterator(boolean combineCJ) {
	this.combineCJ = combineCJ;
	}

	/**
	* Get the start of this script run
	*
	* @return start position of script run
	*/
	int getScriptStart() {
	return scriptStart;
	}

	/**
	* Get the index of the first character after the end of this script run
	*
	* @return position of the first character after this script run
	*/
	int getScriptLimit() {
	return scriptLimit;
	}

	/**
	* Get the UScript script code for this script run
	*
	* @return code for the script of the current run
	*/
	int getScriptCode() {
	return scriptCode;
	}

	/**
	* Iterates to the next script run, returning true if one exists.
	*
	* @return true if there is another script run, false otherwise.
	*/
	boolean next() {
	if (scriptLimit >= limit)
	return false;

	scriptCode = UScript.COMMON;
	scriptStart = scriptLimit;

	while (index < limit) {
	final int ch = UTF16.charAt(text, start, limit, index - start);
	final int sc = getScript(ch);

	/*
	* From UTR #24: Implementations that determine the boundaries between
	* characters of given scripts should never break between a non-spacing
	* mark and its base character. Thus for boundary determinations and
	* similar sorts of processing, a non-spacing mark — whatever its script
	* value — should inherit the script value of its base character.
	*/
	if (isSameScript(scriptCode, sc)
	\|\| UCharacter.getType(ch) == ECharacterCategory.NON_SPACING_MARK) {
	index += UTF16.getCharCount(ch);

	/*
	* Inherited or Common becomes the script code of the surrounding text.
	*/
	if (scriptCode <= UScript.INHERITED && sc > UScript.INHERITED) {
	scriptCode = sc;
	}

	} else {
	break;
	}
	}

	scriptLimit = index;
	return true;
	}

	/** Determine if two scripts are compatible. */
	private static boolean isSameScript(int scriptOne, int scriptTwo) {
	return scriptOne <= UScript.INHERITED \|\| scriptTwo <= UScript.INHERITED
	\|\| scriptOne == scriptTwo;
	}

	/**
	* Set a new region of text to be examined by this iterator
	*
	* @param text text buffer to examine
	* @param start offset into buffer
	* @param length maximum length to examine
	*/
	void setText(char text[], int start, int length) {
	this.text = text;
	this.start = start;
	this.index = start;
	this.limit = start + length;
	this.scriptStart = start;
	this.scriptLimit = start;
	this.scriptCode = UScript.INVALID_CODE;
	}

	/** linear fast-path for basic latin case */
	private static final int basicLatin[] = new int[128];

	static {
	for (int i = 0; i < basicLatin.length; i++)
	basicLatin[i] = UScript.getScript(i);
	}

	/** fast version of UScript.getScript(). Basic Latin is an array lookup */
	private int getScript(int codepoint) {
	if (0 <= codepoint && codepoint < basicLatin.length) {
	return basicLatin[codepoint];
	} else {
	int script = UScript.getScript(codepoint);
	if (combineCJ) {
	if (script == UScript.HAN \|\| script == UScript.HIRAGANA \|\| script == UScript.KATAKANA) {
	return UScript.JAPANESE;
	} else if (codepoint >= 0xFF10 && codepoint <= 0xFF19) {
	// when using CJK dictionary breaking, don't let full width numbers go to it, otherwise
	// they are treated as punctuation. we currently have no cleaner way to fix this!
	return UScript.LATIN;
	} else {
	return script;
	}
	} else {
	return script;
	}
	}
	}
	}