// blob: 4dd723e287a6f44e62db50db9fe4fab37f3c9209 [file] [log] [blame]
/*
* Copyright (C) 1999-2010, International Business Machines
* Corporation and others. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, and/or sell copies of the
* Software, and to permit persons to whom the Software is furnished to do so,
* provided that the above copyright notice(s) and this permission notice appear
* in all copies of the Software and that both the above copyright notice(s) and
* this permission notice appear in supporting documentation.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
* IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE
* LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR
* ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
* IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
* OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*
* Except as contained in this notice, the name of a copyright holder shall not
* be used in advertising or otherwise to promote the sale, use or other
* dealings in this Software without prior written authorization of the
* copyright holder.
*/
package org.apache.lucene.analysis.icu.segmentation;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UCharacterEnums.ECharacterCategory;
import com.ibm.icu.lang.UScript;
import com.ibm.icu.text.UTF16;
/**
* An iterator that locates ISO 15924 script boundaries in text.
* <p>
* This is not the same as simply looking at the Unicode block, or even the
* Script property. Some characters are 'common' across multiple scripts, and
* some 'inherit' the script value of text surrounding them.
* <p>
* This is similar to ICU (internal-only) UScriptRun, with the following
* differences:
* <ul>
* <li>Doesn't attempt to match paired punctuation. For tokenization purposes, this
* is not necessary. It's also quite expensive.
* <li>Non-spacing marks inherit the script of their base character, following
* recommendations from UTR #24.
* </ul>
* @lucene.experimental
*/
final class ScriptIterator {
  /**
   * Script codes for the first 128 code points, computed once at class load so that
   * Basic Latin lookups are a plain array read instead of a call into ICU.
   */
  private static final int[] BASIC_LATIN_SCRIPTS = new int[128];

  static {
    for (int cp = 0; cp < BASIC_LATIN_SCRIPTS.length; cp++) {
      BASIC_LATIN_SCRIPTS[cp] = UScript.getScript(cp);
    }
  }

  /** Buffer currently being scanned; assigned by {@link #setText(char[], int, int)}. */
  private char[] text;

  /** Offset into {@link #text} at which the examined region begins. */
  private int start;

  /** Offset one past the last examined char ({@code start + length}). */
  private int limit;

  /** Current scan position, expressed as an offset into {@link #text}. */
  private int index;

  /** Start offset of the most recently located run. */
  private int scriptStart;

  /** Offset of the first char after the most recently located run. */
  private int scriptLimit;

  /** Script code of the most recently located run. */
  private int scriptCode;

  /** Whether Han, Hiragana and Katakana are all reported as {@link UScript#JAPANESE}. */
  private final boolean combineCJ;

  /**
   * Creates an iterator with no text attached; call {@link #setText(char[], int, int)}
   * before iterating.
   *
   * @param combineCJ if true: Han,Hiragana,Katakana will all return as {@link UScript#JAPANESE}
   */
  ScriptIterator(boolean combineCJ) {
    this.combineCJ = combineCJ;
  }

  /**
   * Returns where the current script run begins.
   *
   * @return start position of script run
   */
  int getScriptStart() {
    return scriptStart;
  }

  /**
   * Returns the exclusive end of the current script run.
   *
   * @return position of the first character after this script run
   */
  int getScriptLimit() {
    return scriptLimit;
  }

  /**
   * Returns the UScript script code assigned to the current script run.
   *
   * @return code for the script of the current run
   */
  int getScriptCode() {
    return scriptCode;
  }

  /**
   * Advances to the next script run. On success the run is described by
   * {@link #getScriptStart()}, {@link #getScriptLimit()} and {@link #getScriptCode()}.
   *
   * @return true if there is another script run, false otherwise.
   */
  boolean next() {
    if (scriptLimit >= limit) {
      return false;
    }

    // The new run begins where the previous one stopped and starts out as COMMON
    // until a code point with a more specific script is consumed.
    scriptStart = scriptLimit;
    scriptCode = UScript.COMMON;

    while (index < limit) {
      // UTF16.charAt takes an offset relative to 'start', hence index - start.
      final int codePoint = UTF16.charAt(text, start, limit, index - start);
      final int codePointScript = getScript(codePoint);

      /*
       * UTR #24: boundary detection must never split a non-spacing mark from its
       * base character, so a non-spacing mark joins the current run regardless of
       * its own script value.
       */
      final boolean belongsToRun =
          isSameScript(scriptCode, codePointScript)
              || UCharacter.getType(codePoint) == ECharacterCategory.NON_SPACING_MARK;
      if (!belongsToRun) {
        break;
      }

      index += UTF16.getCharCount(codePoint);

      // A run that is still Common/Inherited adopts the first specific script it meets.
      if (scriptCode <= UScript.INHERITED && codePointScript > UScript.INHERITED) {
        scriptCode = codePointScript;
      }
    }

    scriptLimit = index;
    return true;
  }

  /**
   * Decides whether two script codes may share a run: any code at or below
   * {@link UScript#INHERITED} is compatible with everything, otherwise the codes
   * must be equal.
   */
  private static boolean isSameScript(int scriptOne, int scriptTwo) {
    if (scriptOne <= UScript.INHERITED) {
      return true;
    }
    if (scriptTwo <= UScript.INHERITED) {
      return true;
    }
    return scriptOne == scriptTwo;
  }

  /**
   * Points this iterator at a new region of text and resets all run state.
   *
   * @param text text buffer to examine
   * @param start offset into buffer
   * @param length maximum length to examine
   */
  void setText(char[] text, int start, int length) {
    this.text = text;
    this.start = start;
    this.limit = start + length;
    this.index = start;
    this.scriptStart = start;
    this.scriptLimit = start;
    this.scriptCode = UScript.INVALID_CODE;
  }

  /**
   * Script lookup with a table fast-path for Basic Latin; everything else goes
   * through {@link UScript#getScript(int)} and, when {@code combineCJ} is set,
   * the CJ adjustments below.
   */
  private int getScript(int codepoint) {
    if (codepoint >= 0 && codepoint < BASIC_LATIN_SCRIPTS.length) {
      return BASIC_LATIN_SCRIPTS[codepoint];
    }

    final int script = UScript.getScript(codepoint);
    if (!combineCJ) {
      return script;
    }

    final boolean isCjScript =
        script == UScript.HAN || script == UScript.HIRAGANA || script == UScript.KATAKANA;
    if (isCjScript) {
      return UScript.JAPANESE;
    }

    // when using CJK dictionary breaking, don't let full width numbers go to it, otherwise
    // they are treated as punctuation. we currently have no cleaner way to fix this!
    final boolean isFullWidthDigit = codepoint >= 0xFF10 && codepoint <= 0xFF19;
    return isFullWidthDigit ? UScript.LATIN : script;
  }
}