| /* |
| * Copyright (C) 1999-2010, International Business Machines |
| * Corporation and others. All Rights Reserved. |
| * |
| * Permission is hereby granted, free of charge, to any person obtaining a copy |
| * of this software and associated documentation files (the "Software"), to deal |
| * in the Software without restriction, including without limitation the rights |
| * to use, copy, modify, merge, publish, distribute, and/or sell copies of the |
| * Software, and to permit persons to whom the Software is furnished to do so, |
| * provided that the above copyright notice(s) and this permission notice appear |
| * in all copies of the Software and that both the above copyright notice(s) and |
| * this permission notice appear in supporting documentation. |
| * |
| * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. |
| * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE |
| * LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR |
| * ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER |
| * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT |
| * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. |
| * |
| * Except as contained in this notice, the name of a copyright holder shall not |
| * be used in advertising or otherwise to promote the sale, use or other |
| * dealings in this Software without prior written authorization of the |
| * copyright holder. |
| */ |
| package org.apache.lucene.analysis.icu.segmentation; |
| |
| import com.ibm.icu.lang.UCharacter; |
| import com.ibm.icu.lang.UCharacterEnums.ECharacterCategory; |
| import com.ibm.icu.lang.UScript; |
| import com.ibm.icu.text.UTF16; |
| |
| /** |
| * An iterator that locates ISO 15924 script boundaries in text. |
| * <p> |
| * This is not the same as simply looking at the Unicode block, or even the |
| * Script property. Some characters are 'common' across multiple scripts, and |
| * some 'inherit' the script value of text surrounding them. |
| * <p> |
| * This is similar to ICU (internal-only) UScriptRun, with the following |
| * differences: |
| * <ul> |
| * <li>Doesn't attempt to match paired punctuation. For tokenization purposes, this |
| * is not necessary. It's also quite expensive. |
| * <li>Non-spacing marks inherit the script of their base character, following |
| * recommendations from UTR #24. |
| * </ul> |
| * @lucene.experimental |
| */ |
| final class ScriptIterator { |
| private char text[]; |
| private int start; |
| private int limit; |
| private int index; |
| |
| private int scriptStart; |
| private int scriptLimit; |
| private int scriptCode; |
| |
| private final boolean combineCJ; |
| |
| /** |
| * @param combineCJ if true: Han,Hiragana,Katakana will all return as {@link UScript#JAPANESE} |
| */ |
| ScriptIterator(boolean combineCJ) { |
| this.combineCJ = combineCJ; |
| } |
| |
| /** |
| * Get the start of this script run |
| * |
| * @return start position of script run |
| */ |
| int getScriptStart() { |
| return scriptStart; |
| } |
| |
| /** |
| * Get the index of the first character after the end of this script run |
| * |
| * @return position of the first character after this script run |
| */ |
| int getScriptLimit() { |
| return scriptLimit; |
| } |
| |
| /** |
| * Get the UScript script code for this script run |
| * |
| * @return code for the script of the current run |
| */ |
| int getScriptCode() { |
| return scriptCode; |
| } |
| |
| /** |
| * Iterates to the next script run, returning true if one exists. |
| * |
| * @return true if there is another script run, false otherwise. |
| */ |
| boolean next() { |
| if (scriptLimit >= limit) |
| return false; |
| |
| scriptCode = UScript.COMMON; |
| scriptStart = scriptLimit; |
| |
| while (index < limit) { |
| final int ch = UTF16.charAt(text, start, limit, index - start); |
| final int sc = getScript(ch); |
| |
| /* |
| * From UTR #24: Implementations that determine the boundaries between |
| * characters of given scripts should never break between a non-spacing |
| * mark and its base character. Thus for boundary determinations and |
| * similar sorts of processing, a non-spacing mark — whatever its script |
| * value — should inherit the script value of its base character. |
| */ |
| if (isSameScript(scriptCode, sc) |
| || UCharacter.getType(ch) == ECharacterCategory.NON_SPACING_MARK) { |
| index += UTF16.getCharCount(ch); |
| |
| /* |
| * Inherited or Common becomes the script code of the surrounding text. |
| */ |
| if (scriptCode <= UScript.INHERITED && sc > UScript.INHERITED) { |
| scriptCode = sc; |
| } |
| |
| } else { |
| break; |
| } |
| } |
| |
| scriptLimit = index; |
| return true; |
| } |
| |
| /** Determine if two scripts are compatible. */ |
| private static boolean isSameScript(int scriptOne, int scriptTwo) { |
| return scriptOne <= UScript.INHERITED || scriptTwo <= UScript.INHERITED |
| || scriptOne == scriptTwo; |
| } |
| |
| /** |
| * Set a new region of text to be examined by this iterator |
| * |
| * @param text text buffer to examine |
| * @param start offset into buffer |
| * @param length maximum length to examine |
| */ |
| void setText(char text[], int start, int length) { |
| this.text = text; |
| this.start = start; |
| this.index = start; |
| this.limit = start + length; |
| this.scriptStart = start; |
| this.scriptLimit = start; |
| this.scriptCode = UScript.INVALID_CODE; |
| } |
| |
| /** linear fast-path for basic latin case */ |
| private static final int basicLatin[] = new int[128]; |
| |
| static { |
| for (int i = 0; i < basicLatin.length; i++) |
| basicLatin[i] = UScript.getScript(i); |
| } |
| |
| /** fast version of UScript.getScript(). Basic Latin is an array lookup */ |
| private int getScript(int codepoint) { |
| if (0 <= codepoint && codepoint < basicLatin.length) { |
| return basicLatin[codepoint]; |
| } else { |
| int script = UScript.getScript(codepoint); |
| if (combineCJ) { |
| if (script == UScript.HAN || script == UScript.HIRAGANA || script == UScript.KATAKANA) { |
| return UScript.JAPANESE; |
| } else if (codepoint >= 0xFF10 && codepoint <= 0xFF19) { |
| // when using CJK dictionary breaking, don't let full width numbers go to it, otherwise |
| // they are treated as punctuation. we currently have no cleaner way to fix this! |
| return UScript.LATIN; |
| } else { |
| return script; |
| } |
| } else { |
| return script; |
| } |
| } |
| } |
| } |