blob: 8a55cb1327c04aef2f3608a34ffbbea3af496437 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* $Id$ */
package org.apache.fop.util;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
// CSOFF: AvoidNestedBlocksCheck
// CSOFF: InnerAssignmentCheck
// CSOFF: WhitespaceAfterCheck
// CSOFF: SimplifyBooleanReturnCheck
/**
* This class provides utilities to distinguish various kinds of Unicode
* whitespace and to get character widths in a given FontState.
*/
public class CharUtilities {
/**
* Character code used to signal a character boundary in
* inline content, such as an inline with borders and padding
* or a nested block object. Has the same numeric value (0) as
* {@link #NULL_CHAR}.
*/
public static final char CODE_EOT = 0;
/**
* Character class: Unicode white space. Returned by <code>classOf</code> for
* characters accepted by <code>isAnySpace</code> that are not XML whitespace.
*/
public static final int UCWHITESPACE = 0;
/**
* Character class: Line feed. Returned by <code>classOf</code> for
* {@link #LINEFEED_CHAR}.
*/
public static final int LINEFEED = 1;
/**
* Character class: Boundary between text runs. Returned by
* <code>classOf</code> for {@link #CODE_EOT}.
*/
public static final int EOT = 2;
/**
* Character class: non-whitespace. Returned by <code>classOf</code> for
* every character that falls into none of the other classes.
*/
public static final int NONWHITESPACE = 3;
/**
* Character class: XML whitespace. Returned by <code>classOf</code> for
* space, carriage return and tab.
*/
public static final int XMLWHITESPACE = 4;
/** null char (U+0000) */
public static final char NULL_CHAR = '\u0000';
/** linefeed character (U+000A) */
public static final char LINEFEED_CHAR = '\n';
/** carriage return (U+000D) */
public static final char CARRIAGE_RETURN = '\r';
/** normal tab (U+0009) */
public static final char TAB = '\t';
/** normal space (U+0020) */
public static final char SPACE = '\u0020';
/** non-breaking space (U+00A0) */
public static final char NBSPACE = '\u00A0';
/** next line control character (U+0085) */
public static final char NEXT_LINE = '\u0085';
/** zero-width space (U+200B) */
public static final char ZERO_WIDTH_SPACE = '\u200B';
/** word joiner (U+2060) */
public static final char WORD_JOINER = '\u2060';
/** zero-width joiner (U+200D) */
public static final char ZERO_WIDTH_JOINER = '\u200D';
/** left-to-right mark (U+200E) */
public static final char LRM = '\u200E';
/** right-to-left mark (U+200F); note that U+202F is the narrow no-break space, not RLM */
public static final char RLM = '\u200F';
/** left-to-right embedding (U+202A) */
public static final char LRE = '\u202A';
/** right-to-left embedding (U+202B) */
public static final char RLE = '\u202B';
/** pop directional formatting (U+202C) */
public static final char PDF = '\u202C';
/** left-to-right override (U+202D) */
public static final char LRO = '\u202D';
/** right-to-left override (U+202E) */
public static final char RLO = '\u202E';
/** zero-width no-break space (= byte order mark) (U+FEFF) */
public static final char ZERO_WIDTH_NOBREAK_SPACE = '\uFEFF';
/** soft hyphen (U+00AD) */
public static final char SOFT_HYPHEN = '\u00AD';
/** line-separator (U+2028) */
public static final char LINE_SEPARATOR = '\u2028';
/** paragraph-separator (U+2029) */
public static final char PARAGRAPH_SEPARATOR = '\u2029';
/** missing ideograph (U+25A1) */
public static final char MISSING_IDEOGRAPH = '\u25A1';
/** Ideographic space (U+3000) */
public static final char IDEOGRAPHIC_SPACE = '\u3000';
/** Object replacement character (U+FFFC) */
public static final char OBJECT_REPLACEMENT_CHARACTER = '\uFFFC';
/** Unicode value indicating that the character is "not a character" (U+FFFF). */
public static final char NOT_A_CHARACTER = '\uFFFF';
/**
* A static (class) parameter indicating whether V2 indic shaping
* rules apply or not, with default being <code>true</code>.
* Being a compile-time constant, it cannot be changed at run time.
*/
private static final boolean useV2Indic = true; // CSOK: ConstantNameCheck
/**
* Utility class: Constructor prevents instantiating when subclassed.
* It unconditionally throws, so neither this class nor any subclass
* can ever be instantiated.
* @throws UnsupportedOperationException always
*/
protected CharUtilities() {
throw new UnsupportedOperationException();
}
/**
 * Classify a character into one of the character class constants
 * (EOT, LINEFEED, XMLWHITESPACE, UCWHITESPACE or NONWHITESPACE).
 * @param c character to inspect
 * @return the determined character class
 */
public static int classOf(int c) {
    if (c == CODE_EOT) {
        return EOT;
    }
    if (c == LINEFEED_CHAR) {
        return LINEFEED;
    }
    if (c == SPACE || c == CARRIAGE_RETURN || c == TAB) {
        return XMLWHITESPACE;
    }
    // Anything else is either some other Unicode space or not a space at all.
    if (isAnySpace(c)) {
        return UCWHITESPACE;
    }
    return NONWHITESPACE;
}
/**
 * Tell whether the character is a space with normal (breakable)
 * behavior, i.e. the plain space or one of the fixed-width spaces.
 * @param c character to inspect
 * @return True if the character is a normal space
 */
public static boolean isBreakableSpace(int c) {
    if (c == SPACE) {
        return true;
    }
    return isFixedWidthSpace(c);
}
/**
 * Tell whether the character is one of the zero-width spaces.
 * @param c the character to check
 * @return true if the character is a zero-width space
 */
public static boolean isZeroWidthSpace(int c) {
    switch (c) {
    case ZERO_WIDTH_SPACE:          // 200Bh
    case WORD_JOINER:               // 2060h
    case ZERO_WIDTH_NOBREAK_SPACE:  // FEFFh (also used as BOM)
        return true;
    default:
        return false;
    }
}
/**
 * Tell whether the character is a (breakable) fixed-width space.
 * The accepted characters are:
 * <ul>
 * <li>U+2000 en quad, U+2001 em quad, U+2002 en space, U+2003 em space</li>
 * <li>U+2004 three-per-em space, U+2005 four-per-em space, U+2006 six-per-em space</li>
 * <li>U+2007 figure space, U+2008 punctuation space, U+2009 thin space</li>
 * <li>U+200A hair space, U+200B zero width space</li>
 * <li>U+3000 ideographic space</li>
 * </ul>
 * @param c the character to check
 * @return true if the character has a fixed-width
 */
public static boolean isFixedWidthSpace(int c) {
    if (c == 0x3000) {
        return true;
    }
    return c >= 0x2000 && c <= 0x200B;
}
/**
 * Tell whether the character is one of the non-breaking spaces.
 * @param c character to check
 * @return True if the character is a nbsp
 */
public static boolean isNonBreakableSpace(int c) {
    switch (c) {
    case NBSPACE:                   // no-break space
    case 0x202F:                    // narrow no-break space
    case 0x3000:                    // ideographic space
    case WORD_JOINER:               // word joiner
    case ZERO_WIDTH_NOBREAK_SPACE:  // zero width no-break space
        return true;
    default:
        return false;
    }
}
/**
 * Tell whether the character is an adjustable space, which currently
 * means the normal space or the no-break space.
 * @param c character to check
 * @return True if the character is adjustable
 */
public static boolean isAdjustableSpace(int c) {
    //TODO: are there other kinds of adjustable spaces?
    if (c == 0x0020) { // normal space
        return true;
    }
    return c == NBSPACE; // no-break space
}
/**
 * Tell whether the character is any kind of space, breakable or not.
 * @param c character to check
 * @return True if the character represents any kind of space
 */
public static boolean isAnySpace(int c) {
    if (isBreakableSpace(c)) {
        return true;
    }
    return isNonBreakableSpace(c);
}
/**
 * Indicates whether a character is classified as "Alphabetic" by the Unicode standard.
 * The decision is based on the general category of the full code point, so
 * supplementary characters (above U+FFFF) are classified correctly instead of
 * being truncated to 16 bits.
 * @param c the character, as a unicode code point
 * @return true if the character is "Alphabetic"
 */
public static boolean isAlphabetic(int c) {
    //http://www.unicode.org/Public/UNIDATA/UCD.html#Alphabetic
    //Generated from: Other_Alphabetic + Lu + Ll + Lt + Lm + Lo + Nl
    // Use the int overload: casting to char would drop the upper bits of
    // supplementary code points and classify an unrelated BMP character.
    int generalCategory = Character.getType(c);
    switch (generalCategory) {
    case Character.UPPERCASE_LETTER: //Lu
    case Character.LOWERCASE_LETTER: //Ll
    case Character.TITLECASE_LETTER: //Lt
    case Character.MODIFIER_LETTER:  //Lm
    case Character.OTHER_LETTER:     //Lo
    case Character.LETTER_NUMBER:    //Nl
        return true;
    default:
        //TODO if (ch in Other_Alphabetic) return true; (Probably need ICU4J for that)
        //Other_Alphabetic contains mostly more exotic characters
        return false;
    }
}
/**
 * Tell whether the character forces an explicit break: line feed,
 * carriage return, next line, line separator or paragraph separator.
 * @param c the character to check
 * @return true if the character represents an explicit break
 */
public static boolean isExplicitBreak(int c) {
    switch (c) {
    case LINEFEED_CHAR:
    case CARRIAGE_RETURN:
    case NEXT_LINE:
    case LINE_SEPARATOR:
    case PARAGRAPH_SEPARATOR:
        return true;
    default:
        return false;
    }
}
//
// The following script codes are based on ISO 15924. Codes less than 1000 are
// official assignments from 15924; those equal to or greater than 1000 are FOP
// implementation specific.
// Each FOP specific "_2" code declared below equals the corresponding
// base script code plus 1000.
//
// CSOFF: LineLengthCheck
/** hebrew script constant */
public static final int SCRIPT_HEBREW = 125; // 'hebr'
/** mongolian script constant */
public static final int SCRIPT_MONGOLIAN = 145; // 'mong'
/** arabic script constant */
public static final int SCRIPT_ARABIC = 160; // 'arab'
/** greek script constant */
public static final int SCRIPT_GREEK = 200; // 'grek'
/** latin script constant */
public static final int SCRIPT_LATIN = 215; // 'latn'
/** cyrillic script constant */
public static final int SCRIPT_CYRILLIC = 220; // 'cyrl'
/** georgian script constant */
public static final int SCRIPT_GEORGIAN = 240; // 'geor'
/** bopomofo script constant */
public static final int SCRIPT_BOPOMOFO = 285; // 'bopo'
/** hangul script constant */
public static final int SCRIPT_HANGUL = 286; // 'hang'
/** gurmukhi script constant */
public static final int SCRIPT_GURMUKHI = 310; // 'guru'
/** gurmukhi 2 script constant */
public static final int SCRIPT_GURMUKHI_2 = 1310; // 'gur2' -- MSFT (pseudo) script tag for variant shaping semantics
/** devanagari script constant */
public static final int SCRIPT_DEVANAGARI = 315; // 'deva'
/** devanagari 2 script constant */
public static final int SCRIPT_DEVANAGARI_2 = 1315; // 'dev2' -- MSFT (pseudo) script tag for variant shaping semantics
/** gujarati script constant */
public static final int SCRIPT_GUJARATI = 320; // 'gujr'
/** gujarati 2 script constant */
public static final int SCRIPT_GUJARATI_2 = 1320; // 'gjr2' -- MSFT (pseudo) script tag for variant shaping semantics
// NOTE(review): the ISO 15924 code list appears to assign 325 to 'beng'
// (326 being another script); confirm whether 326/1326 is intentional.
/** bengali script constant */
public static final int SCRIPT_BENGALI = 326; // 'beng'
/** bengali 2 script constant */
public static final int SCRIPT_BENGALI_2 = 1326; // 'bng2' -- MSFT (pseudo) script tag for variant shaping semantics
/** oriya script constant */
public static final int SCRIPT_ORIYA = 327; // 'orya'
/** oriya 2 script constant */
public static final int SCRIPT_ORIYA_2 = 1327; // 'ory2' -- MSFT (pseudo) script tag for variant shaping semantics
/** tibetan script constant */
public static final int SCRIPT_TIBETAN = 330; // 'tibt'
/** telugu script constant */
public static final int SCRIPT_TELUGU = 340; // 'telu'
/** telugu 2 script constant */
public static final int SCRIPT_TELUGU_2 = 1340; // 'tel2' -- MSFT (pseudo) script tag for variant shaping semantics
/** kannada script constant */
public static final int SCRIPT_KANNADA = 345; // 'knda'
/** kannada 2 script constant */
public static final int SCRIPT_KANNADA_2 = 1345; // 'knd2' -- MSFT (pseudo) script tag for variant shaping semantics
/** tamil script constant */
public static final int SCRIPT_TAMIL = 346; // 'taml'
/** tamil 2 script constant */
public static final int SCRIPT_TAMIL_2 = 1346; // 'tml2' -- MSFT (pseudo) script tag for variant shaping semantics
/** malayalam script constant */
public static final int SCRIPT_MALAYALAM = 347; // 'mlym'
/** malayalam 2 script constant */
public static final int SCRIPT_MALAYALAM_2 = 1347; // 'mlm2' -- MSFT (pseudo) script tag for variant shaping semantics
/** sinhalese script constant */
public static final int SCRIPT_SINHALESE = 348; // 'sinh'
/** burmese script constant */
public static final int SCRIPT_BURMESE = 350; // 'mymr'
/** thai script constant */
public static final int SCRIPT_THAI = 352; // 'thai'
/** khmer script constant */
public static final int SCRIPT_KHMER = 355; // 'khmr'
/** lao script constant */
public static final int SCRIPT_LAO = 356; // 'laoo'
/** hiragana script constant */
public static final int SCRIPT_HIRAGANA = 410; // 'hira'
/** ethiopic script constant */
public static final int SCRIPT_ETHIOPIC = 430; // 'ethi'
/** han script constant */
public static final int SCRIPT_HAN = 500; // 'hani'
/** katakana script constant (ISO 15924 numeric code 411; 410 is hiragana) */
public static final int SCRIPT_KATAKANA = 411; // 'kana'
/** math script constant */
public static final int SCRIPT_MATH = 995; // 'zmth'
/** symbol script constant */
public static final int SCRIPT_SYMBOL = 996; // 'zsym'
/** undetermined script constant; used when a script is not or cannot be determined */
public static final int SCRIPT_UNDETERMINED = 998; // 'zyyy'
/** uncoded script constant */
public static final int SCRIPT_UNCODED = 999; // 'zzzz'
// CSON: LineLengthCheck
/**
 * Determine if character c is punctuation. The test is range based and
 * not complete: only parts of the basic latin and latin supplement blocks
 * and the whole general punctuation block (U+2000 to U+206F) are covered.
 * @param c a character represented as a unicode scalar value
 * @return true if character is punctuation
 */
public static boolean isPunctuation(int c) {
    // NOTE(review): U+005B..U+005E and U+007B..U+007D are not treated as
    // punctuation; the duplicated (unreachable) U+007E test that used to be
    // here may have been meant to cover one of those ranges - confirm.
    return (c >= 0x0021 && c <= 0x002F)     // basic latin punctuation
        || (c >= 0x003A && c <= 0x0040)     // basic latin punctuation
        || (c >= 0x005F && c <= 0x0060)     // basic latin punctuation
        || c == 0x007E                      // basic latin punctuation
        || (c >= 0x00A1 && c <= 0x00BF)     // latin supplement punctuation
        || c == 0x00D7                      // latin supplement punctuation
        || c == 0x00F7                      // latin supplement punctuation
        || (c >= 0x2000 && c <= 0x206F);    // general punctuation
    // [TBD] - not complete
}
/**
 * Determine if character c is a digit. Only the basic latin digits
 * '0' to '9' are recognized at present.
 * @param c a character represented as a unicode scalar value
 * @return true if character is a digit
 */
public static boolean isDigit(int c) {
    // [TBD] - not complete
    return c >= 0x0030 && c <= 0x0039; // basic latin digits
}
/**
 * Determine if character c belongs to the hebrew script, based on
 * the code point ranges U+0590..U+05FF and U+FB00..U+FB4F.
 * @param c a character represented as a unicode scalar value
 * @return true if character belongs to hebrew script
 */
public static boolean isHebrew(int c) {
    boolean inHebrewBlock = c >= 0x0590 && c <= 0x05FF;
    boolean inPresentationForms = c >= 0xFB00 && c <= 0xFB4F;
    return inHebrewBlock || inPresentationForms;
}
/**
 * Determine if character c belongs to the mongolian script, i.e. lies
 * in the range U+1800..U+18AF.
 * @param c a character represented as a unicode scalar value
 * @return true if character belongs to mongolian script
 */
public static boolean isMongolian(int c) {
    return c >= 0x1800 && c <= 0x18AF; // mongolian block
}
/**
 * Determine if character c belongs to the arabic script, based on the
 * arabic, arabic supplement and arabic presentation forms A and B ranges.
 * @param c a character represented as a unicode scalar value
 * @return true if character belongs to arabic script
 */
public static boolean isArabic(int c) {
    return (c >= 0x0600 && c <= 0x06FF)     // arabic block
        || (c >= 0x0750 && c <= 0x077F)     // arabic supplement block
        || (c >= 0xFB50 && c <= 0xFDFF)     // arabic presentation forms a block
        || (c >= 0xFE70 && c <= 0xFEFF);    // arabic presentation forms b block
}
/**
 * Determine if character c belongs to the greek script, based on the
 * ranges U+0370..U+03FF and U+1F00..U+1FFF.
 * @param c a character represented as a unicode scalar value
 * @return true if character belongs to greek script
 */
public static boolean isGreek(int c) {
    boolean inGreekAndCoptic = c >= 0x0370 && c <= 0x03FF;
    boolean inGreekExtended = c >= 0x1F00 && c <= 0x1FFF;
    return inGreekAndCoptic || inGreekExtended;
}
/**
 * Determine if character c belongs to the latin script. Covers the basic
 * latin letters, the latin supplement letters, latin extended A through D,
 * latin extended additional and the latin ligatures.
 * @param c a character represented as a unicode scalar value
 * @return true if character belongs to latin script
 */
public static boolean isLatin(int c) {
    if ((c >= 0x0041 && c <= 0x005A) || (c >= 0x0061 && c <= 0x007A)) {
        return true;    // basic latin upper and lower case
    }
    if (c >= 0x00C0 && c <= 0x024F) {
        // latin supplement letters plus latin extended a and b form one
        // contiguous run, except for the two operators U+00D7 and U+00F7
        return c != 0x00D7 && c != 0x00F7;
    }
    return (c >= 0x1E00 && c <= 0x1EFF)     // latin extended additional
        || (c >= 0x2C60 && c <= 0x2C7F)     // latin extended c
        || (c >= 0xA720 && c <= 0xA7FF)     // latin extended d
        || (c >= 0xFB00 && c <= 0xFB0F);    // latin ligatures
}
/**
 * Determine if character c belongs to the cyrillic script, based on the
 * cyrillic, cyrillic supplement and cyrillic extended A and B ranges.
 * @param c a character represented as a unicode scalar value
 * @return true if character belongs to cyrillic script
 */
public static boolean isCyrillic(int c) {
    // the cyrillic block (U+0400..U+04FF) and the cyrillic supplement
    // block (U+0500..U+052F) are adjacent and tested as a single range
    return (c >= 0x0400 && c <= 0x052F)
        || (c >= 0x2DE0 && c <= 0x2DFF)     // cyrillic extended-a block
        || (c >= 0xA640 && c <= 0xA69F);    // cyrillic extended-b block
}
/**
 * Determine if character c belongs to the georgian script, based on the
 * ranges U+10A0..U+10FF and U+2D00..U+2D2F.
 * @param c a character represented as a unicode scalar value
 * @return true if character belongs to georgian script
 */
public static boolean isGeorgian(int c) {
    boolean inGeorgianBlock = c >= 0x10A0 && c <= 0x10FF;
    boolean inGeorgianSupplement = c >= 0x2D00 && c <= 0x2D2F;
    return inGeorgianBlock || inGeorgianSupplement;
}
/**
 * Determine if character c belongs to the hangul script, based on the
 * jamo, compatibility jamo, jamo extended and syllables ranges.
 * @param c a character represented as a unicode scalar value
 * @return true if character belongs to hangul script
 */
public static boolean isHangul(int c) {
    return (c >= 0x1100 && c <= 0x11FF)     // hangul jamo
        || (c >= 0x3130 && c <= 0x318F)     // hangul compatibility jamo
        || (c >= 0xA960 && c <= 0xA97F)     // hangul jamo extended a
        || (c >= 0xAC00 && c <= 0xD7A3)     // hangul syllables
        || (c >= 0xD7B0 && c <= 0xD7FF);    // hangul jamo extended b
}
/**
 * Determine if character c belongs to the gurmukhi script, i.e. lies
 * in the range U+0A00..U+0A7F.
 * @param c a character represented as a unicode scalar value
 * @return true if character belongs to gurmukhi script
 */
public static boolean isGurmukhi(int c) {
    return c >= 0x0A00 && c <= 0x0A7F; // gurmukhi block
}
/**
 * Determine if character c belongs to the devanagari script, based on
 * the ranges U+0900..U+097F and U+A8E0..U+A8FF.
 * @param c a character represented as a unicode scalar value
 * @return true if character belongs to devanagari script
 */
public static boolean isDevanagari(int c) {
    boolean inDevanagariBlock = c >= 0x0900 && c <= 0x097F;
    boolean inDevanagariExtended = c >= 0xA8E0 && c <= 0xA8FF;
    return inDevanagariBlock || inDevanagariExtended;
}
/**
 * Determine if character c belongs to the gujarati script, i.e. lies
 * in the range U+0A80..U+0AFF.
 * @param c a character represented as a unicode scalar value
 * @return true if character belongs to gujarati script
 */
public static boolean isGujarati(int c) {
    return c >= 0x0A80 && c <= 0x0AFF; // gujarati block
}
/**
 * Determine if character c belongs to the bengali script, i.e. lies
 * in the range U+0980..U+09FF.
 * @param c a character represented as a unicode scalar value
 * @return true if character belongs to bengali script
 */
public static boolean isBengali(int c) {
    return c >= 0x0980 && c <= 0x09FF; // bengali block
}
/**
 * Determine if character c belongs to the oriya script, i.e. lies
 * in the range U+0B00..U+0B7F.
 * @param c a character represented as a unicode scalar value
 * @return true if character belongs to oriya script
 */
public static boolean isOriya(int c) {
    return c >= 0x0B00 && c <= 0x0B7F; // oriya block
}
/**
 * Determine if character c belongs to the tibetan script, i.e. lies
 * in the range U+0F00..U+0FFF.
 * @param c a character represented as a unicode scalar value
 * @return true if character belongs to tibetan script
 */
public static boolean isTibetan(int c) {
    return c >= 0x0F00 && c <= 0x0FFF; // tibetan block
}
/**
 * Determine if character c belongs to the telugu script, i.e. lies
 * in the range U+0C00..U+0C7F.
 * @param c a character represented as a unicode scalar value
 * @return true if character belongs to telugu script
 */
public static boolean isTelugu(int c) {
    return c >= 0x0C00 && c <= 0x0C7F; // telugu block
}
/**
 * Determine if character c belongs to the kannada script, i.e. lies in
 * the kannada block U+0C80..U+0CFF. (U+0C00..U+0C7F is the telugu block
 * and is deliberately not accepted here.)
 * @param c a character represented as a unicode scalar value
 * @return true if character belongs to kannada script
 */
public static boolean isKannada(int c) {
    return c >= 0x0C80 && c <= 0x0CFF; // kannada block
}
/**
 * Determine if character c belongs to the tamil script, i.e. lies
 * in the range U+0B80..U+0BFF.
 * @param c a character represented as a unicode scalar value
 * @return true if character belongs to tamil script
 */
public static boolean isTamil(int c) {
    return c >= 0x0B80 && c <= 0x0BFF; // tamil block
}
/**
 * Determine if character c belongs to the malayalam script, i.e. lies
 * in the range U+0D00..U+0D7F.
 * @param c a character represented as a unicode scalar value
 * @return true if character belongs to malayalam script
 */
public static boolean isMalayalam(int c) {
    return c >= 0x0D00 && c <= 0x0D7F; // malayalam block
}
/**
 * Determine if character c belongs to the sinhalese script, i.e. lies
 * in the range U+0D80..U+0DFF.
 * @param c a character represented as a unicode scalar value
 * @return true if character belongs to sinhalese script
 */
public static boolean isSinhalese(int c) {
    return c >= 0x0D80 && c <= 0x0DFF; // sinhala block
}
/**
 * Determine if character c belongs to the burmese (myanmar) script,
 * based on the ranges U+1000..U+109F and U+AA60..U+AA7F.
 * @param c a character represented as a unicode scalar value
 * @return true if character belongs to burmese script
 */
public static boolean isBurmese(int c) {
    boolean inMyanmarBlock = c >= 0x1000 && c <= 0x109F;
    boolean inMyanmarExtended = c >= 0xAA60 && c <= 0xAA7F;
    return inMyanmarBlock || inMyanmarExtended;
}
/**
 * Determine if character c belongs to the thai script, i.e. lies
 * in the range U+0E00..U+0E7F.
 * @param c a character represented as a unicode scalar value
 * @return true if character belongs to thai script
 */
public static boolean isThai(int c) {
    return c >= 0x0E00 && c <= 0x0E7F; // thai block
}
/**
 * Determine if character c belongs to the khmer script, based on the
 * ranges U+1780..U+17FF and U+19E0..U+19FF.
 * @param c a character represented as a unicode scalar value
 * @return true if character belongs to khmer script
 */
public static boolean isKhmer(int c) {
    boolean inKhmerBlock = c >= 0x1780 && c <= 0x17FF;
    boolean inKhmerSymbols = c >= 0x19E0 && c <= 0x19FF;
    return inKhmerBlock || inKhmerSymbols;
}
/**
 * Determine if character c belongs to the lao script, i.e. lies
 * in the range U+0E80..U+0EFF.
 * @param c a character represented as a unicode scalar value
 * @return true if character belongs to lao script
 */
public static boolean isLao(int c) {
    return c >= 0x0E80 && c <= 0x0EFF; // lao block
}
/**
 * Determine if character c belongs to the ethiopic (amharic) script,
 * based on the ethiopic, ethiopic supplement, ethiopic extended and
 * ethiopic extended-A ranges.
 * @param c a character represented as a unicode scalar value
 * @return true if character belongs to ethiopic (amharic) script
 */
public static boolean isEthiopic(int c) {
    // the ethiopic block (U+1200..U+137F) and the ethiopic supplement
    // block (U+1380..U+139F) are adjacent and tested as a single range
    return (c >= 0x1200 && c <= 0x139F)
        || (c >= 0x2D80 && c <= 0x2DDF)     // ethiopic extended block
        || (c >= 0xAB00 && c <= 0xAB2F);    // ethiopic extended-a block
}
/**
 * Determine if character c belongs to the han (unified cjk) script,
 * including the supplementary plane ranges tested below.
 * @param c a character represented as a unicode scalar value
 * @return true if character belongs to han (unified cjk) script
 */
public static boolean isHan(int c) {
    return (c >= 0x3400 && c <= 0x4DBF)         // cjk unified ideographs extension a
        || (c >= 0x4E00 && c <= 0x9FFF)         // cjk unified ideographs
        || (c >= 0xF900 && c <= 0xFAFF)         // cjk compatibility ideographs
        || (c >= 0x20000 && c <= 0x2A6DF)       // cjk unified ideographs extension b
        || (c >= 0x2A700 && c <= 0x2B73F)       // cjk unified ideographs extension c
        || (c >= 0x2F800 && c <= 0x2FA1F);      // cjk compatibility ideographs supplement
}
/**
 * Determine if character c belongs to the bopomofo script, i.e. lies
 * in the range U+3100..U+312F.
 * @param c a character represented as a unicode scalar value
 * @return true if character belongs to bopomofo script
 */
public static boolean isBopomofo(int c) {
    return c >= 0x3100 && c <= 0x312F;
}
/**
 * Determine if character c belongs to the hiragana script, i.e. lies
 * in the range U+3040..U+309F.
 * @param c a character represented as a unicode scalar value
 * @return true if character belongs to hiragana script
 */
public static boolean isHiragana(int c) {
    return c >= 0x3040 && c <= 0x309F;
}
/**
 * Determine if character c belongs to the katakana script, based on
 * the ranges U+30A0..U+30FF and U+31F0..U+31FF.
 * @param c a character represented as a unicode scalar value
 * @return true if character belongs to katakana script
 */
public static boolean isKatakana(int c) {
    boolean inKatakanaBlock = c >= 0x30A0 && c <= 0x30FF;
    boolean inPhoneticExtensions = c >= 0x31F0 && c <= 0x31FF;
    return inKatakanaBlock || inPhoneticExtensions;
}
/**
 * Obtain ISO15924 numeric script code of character. Spaces, punctuation and
 * digits, as well as any character matching none of the script predicates,
 * yield the script code 998 ('zyyy'). The predicates are consulted in a fixed
 * order and the first match wins. For the indic scripts handled through
 * {@link #useV2IndicRules(int)} the returned code may be the V2 variant.
 * @param c the character to obtain script
 * @return an ISO15924 script code
 */
public static int scriptOf(int c) { // [TBD] - needs optimization!!!
    // characters that carry no script of their own
    if (isAnySpace(c) || isPunctuation(c) || isDigit(c)) {
        return SCRIPT_UNDETERMINED;
    }
    if (isLatin(c)) {
        return SCRIPT_LATIN;
    }
    if (isCyrillic(c)) {
        return SCRIPT_CYRILLIC;
    }
    if (isGreek(c)) {
        return SCRIPT_GREEK;
    }
    if (isHan(c)) {
        return SCRIPT_HAN;
    }
    if (isBopomofo(c)) {
        return SCRIPT_BOPOMOFO;
    }
    if (isKatakana(c)) {
        return SCRIPT_KATAKANA;
    }
    if (isHiragana(c)) {
        return SCRIPT_HIRAGANA;
    }
    if (isHangul(c)) {
        return SCRIPT_HANGUL;
    }
    if (isArabic(c)) {
        return SCRIPT_ARABIC;
    }
    if (isHebrew(c)) {
        return SCRIPT_HEBREW;
    }
    if (isMongolian(c)) {
        return SCRIPT_MONGOLIAN;
    }
    if (isGeorgian(c)) {
        return SCRIPT_GEORGIAN;
    }
    if (isGurmukhi(c)) {
        return useV2IndicRules(SCRIPT_GURMUKHI);
    }
    if (isDevanagari(c)) {
        return useV2IndicRules(SCRIPT_DEVANAGARI);
    }
    if (isGujarati(c)) {
        return useV2IndicRules(SCRIPT_GUJARATI);
    }
    if (isBengali(c)) {
        return useV2IndicRules(SCRIPT_BENGALI);
    }
    if (isOriya(c)) {
        return useV2IndicRules(SCRIPT_ORIYA);
    }
    if (isTibetan(c)) {
        return SCRIPT_TIBETAN;
    }
    if (isTelugu(c)) {
        return useV2IndicRules(SCRIPT_TELUGU);
    }
    if (isKannada(c)) {
        return useV2IndicRules(SCRIPT_KANNADA);
    }
    if (isTamil(c)) {
        return useV2IndicRules(SCRIPT_TAMIL);
    }
    if (isMalayalam(c)) {
        return useV2IndicRules(SCRIPT_MALAYALAM);
    }
    if (isSinhalese(c)) {
        return SCRIPT_SINHALESE;
    }
    if (isBurmese(c)) {
        return SCRIPT_BURMESE;
    }
    if (isThai(c)) {
        return SCRIPT_THAI;
    }
    if (isKhmer(c)) {
        return SCRIPT_KHMER;
    }
    if (isLao(c)) {
        return SCRIPT_LAO;
    }
    if (isEthiopic(c)) {
        return SCRIPT_ETHIOPIC;
    }
    return SCRIPT_UNDETERMINED;
}
/**
 * Obtain the V2 indic script code corresponding to V1 indic script code SC if
 * and only if V2 indic rules apply; otherwise return SC. A code that is
 * already 1000 or greater is returned unchanged.
 * @param sc a V1 indic script code
 * @return either SC or the V2 flavor of SC if V2 indic rules apply
 */
public static int useV2IndicRules(int sc) {
    if (!useV2Indic || sc >= 1000) {
        return sc;
    }
    return sc + 1000;
}
/**
 * Obtain the script codes of each character in a character sequence. If script
 * is not or cannot be determined for some character, then the script code 998
 * ('zyyy') is returned for it. The sequence is traversed by unicode code point,
 * so a surrogate pair is classified as the single supplementary character it
 * encodes rather than as two unrelated code units.
 * @param cs the character sequence
 * @return a (possibly empty) array of the distinct script codes found,
 * sorted in ascending order
 */
public static int[] scriptsOf(CharSequence cs) {
    Set<Integer> scripts = new HashSet<Integer>();
    for (int i = 0, n = cs.length(); i < n;) {
        int cp = Character.codePointAt(cs, i);
        scripts.add(Integer.valueOf(scriptOf(cp)));
        // advance by one or two code units depending on the code point
        i += Character.charCount(cp);
    }
    int[] sa = new int[scripts.size()];
    int ns = 0;
    for (Integer script : scripts) {
        sa[ns++] = script.intValue();
    }
    Arrays.sort(sa);
    return sa;
}
/**
 * Determine the dominant script of a character sequence, i.e. the script
 * that occurs most often, ignoring characters whose script is undetermined
 * or uncoded. The sequence is traversed by unicode code point, so a surrogate
 * pair counts as the single supplementary character it encodes. When several
 * scripts share the highest count, the one met first while iterating the
 * internal count map is returned.
 * @param cs the character sequence
 * @return the dominant script or SCRIPT_UNDETERMINED
 */
public static int dominantScript(CharSequence cs) {
    // number of occurrences per script code
    Map<Integer, Integer> counts = new HashMap<Integer, Integer>();
    for (int i = 0, n = cs.length(); i < n;) {
        int cp = Character.codePointAt(cs, i);
        Integer key = Integer.valueOf(scriptOf(cp));
        Integer seen = counts.get(key);
        counts.put(key, Integer.valueOf((seen == null) ? 1 : seen.intValue() + 1));
        i += Character.charCount(cp);
    }
    int sMax = SCRIPT_UNDETERMINED;
    int cMax = 0;
    for (Map.Entry<Integer, Integer> e : counts.entrySet()) {
        int s = e.getKey().intValue();
        if (s == SCRIPT_UNDETERMINED || s == SCRIPT_UNCODED) {
            continue;   // never reported as dominant
        }
        int c = e.getValue().intValue();
        if (c > cMax) {
            cMax = c;
            sMax = s;
        }
    }
    return sMax;
}
/**
 * Determine if script tag denotes an 'Indic' script, where a
 * script is an 'Indic' script if it is intended to be processed by
 * the generic 'Indic' Script Processor. The tag is first converted to
 * its internal script code, which is then matched against the list of
 * designated scripts (both V1 and V2 variants where they exist).
 * @param script a script tag
 * @return true if script tag is a designated 'Indic' script
 */
public static boolean isIndicScript(String script) {
    final int code = scriptCodeFromTag(script);
    switch (code) {
    case SCRIPT_GURMUKHI:
    case SCRIPT_GURMUKHI_2:
    case SCRIPT_DEVANAGARI:
    case SCRIPT_DEVANAGARI_2:
    case SCRIPT_GUJARATI:
    case SCRIPT_GUJARATI_2:
    case SCRIPT_BENGALI:
    case SCRIPT_BENGALI_2:
    case SCRIPT_ORIYA:
    case SCRIPT_ORIYA_2:
    case SCRIPT_TELUGU:
    case SCRIPT_TELUGU_2:
    case SCRIPT_KANNADA:
    case SCRIPT_KANNADA_2:
    case SCRIPT_TAMIL:
    case SCRIPT_TAMIL_2:
    case SCRIPT_MALAYALAM:
    case SCRIPT_MALAYALAM_2:
    case SCRIPT_BURMESE:
        return true;
    default:
        return false;
    }
}
/**
 * Determine the script tag associated with an internal script code.
 * An empty string is returned when the tag map is unavailable or holds
 * no entry for the code.
 * @param code the script code
 * @return a script tag, or the empty string if none is known
 */
public static String scriptTagFromCode(int code) {
    Map<Integer, String> tags = getScriptTagsMap();
    if (tags == null) {
        return "";
    }
    String tag = tags.get(Integer.valueOf(code));
    return (tag != null) ? tag : "";
}
/**
* Determine the internal script code associated with a script tag.
* @param tag the script tag
* @return a script code
*/
public static int scriptCodeFromTag ( String tag ) {
    final Map<String,Integer> codes = getScriptCodeMap();
    if ( codes == null ) {
        // No lookup table available: fall back to the undetermined script.
        return SCRIPT_UNDETERMINED;
    }
    // Tags that are not registered also map to the undetermined script.
    final Integer code = codes.get ( tag );
    return ( code == null ) ? SCRIPT_UNDETERMINED : code.intValue();
}
/**
* Convert a single unicode scalar value to an XML numeric character
* reference. If in the BMP, four digits are used, otherwise 6 digits are used.
* @param c a unicode scalar value
* @return a string representing a numeric character reference
*/
public static String charToNCRef ( int c ) {
    // Four hex digits cover the BMP; anything above 0xFFFF is written with six.
    final int nDigits = ( c > 0xFFFF ) ? 6 : 4;
    final char[] digits = new char [ nDigits ];
    int rest = c;
    // Fill from the least significant digit (rightmost slot) towards the left.
    for ( int pos = nDigits - 1; pos >= 0; pos--, rest >>= 4 ) {
        digits [ pos ] = "0123456789ABCDEF".charAt ( rest & 0xF );
    }
    return "&#x" + new String ( digits ) + ";";
}
/**
* Convert a string to a sequence of ASCII or XML numeric character references.
* @param s a java string (encoded in UTF-16)
* @return a string representing a sequence of numeric character reference or
* ASCII characters
*/
public static String toNCRefs ( String s ) {
    StringBuilder sb = new StringBuilder();
    if ( s != null ) {
        // Walk the string by code point rather than by UTF-16 code unit, so that
        // a well-formed surrogate pair is emitted as ONE reference to the non-BMP
        // scalar value instead of two references to surrogate code units (which
        // are not legal XML character references). An unpaired surrogate is still
        // returned as-is by codePointAt and therefore referenced individually.
        for ( int i = 0, n = s.length(); i < n; ) {
            int c = s.codePointAt ( i );
            i += Character.charCount ( c );
            if ( ( c >= 32 ) && ( c < 127 ) ) {
                // Printable ASCII is copied through, except XML markup characters.
                if ( c == '<' ) {
                    sb.append ( "&lt;" );
                } else if ( c == '>' ) {
                    sb.append ( "&gt;" );
                } else if ( c == '&' ) {
                    sb.append ( "&amp;" );
                } else {
                    sb.append ( (char) c );
                }
            } else {
                // Control characters and everything outside ASCII become NCRs.
                sb.append ( charToNCRef ( c ) );
            }
        }
    }
    return sb.toString();
}
/**
* Pad a string S on left out to width W using padding character PAD.
* @param s string to pad
* @param width width of field to add padding
* @param pad character to use for padding
* @return padded string
*/
public static String padLeft ( String s, int width, char pad ) {
    final int length = s.length();
    final StringBuilder padded = new StringBuilder();
    // Count down from the requested width to the current length; nothing is
    // added when the string already fills (or exceeds) the field.
    for ( int missing = width; missing > length; missing-- ) {
        padded.append ( pad );
    }
    return padded.append ( s ).toString();
}
/**
* Format a character code for debugging output: the result is prefixed with "0x" and padded on the
* left with '0' to either 4 or 6 hex digits, according to whether the code is in the BMP or not.
* @param c character code
* @return formatted character string
*/
public static String format ( int c ) {
    // Only 0..0x10FFFF are Unicode code points; negative values used to slip
    // through the upper-bound-only check and were rendered as e.g. "0x00-1".
    if ( ( c < 0 ) || ( c > 0x10FFFF ) ) {
        return "!NOT A CHARACTER!";
    }
    // Lower-case hex, zero padded to 4 digits in the BMP and 6 digits above it.
    return String.format ( ( c < 0x10000 ) ? "0x%04x" : "0x%06x", Integer.valueOf ( c ) );
}
// Lookup from internal script code to script tag; remains null until
// makeScriptMaps() has been run (see getScriptTagsMap()).
private static Map<Integer,String> scriptTagsMap = null;
// Reverse lookup from script tag to internal script code; remains null until
// makeScriptMaps() has been run (see getScriptCodeMap()).
private static Map<String,Integer> scriptCodeMap = null;
/**
 * Register one script in both lookup tables.
 * @param tm map from script code to script tag, receives (code, tag)
 * @param cm map from script tag to script code, receives (tag, code)
 * @param code the internal script code, expected in the range [0,2000)
 * @param tag the non-empty script tag
 */
private static void putScriptTag ( Map<Integer,String> tm, Map<String,Integer> cm, int code, String tag ) {
    // Sanity checks only; they are active when assertions are enabled.
    assert tag != null;
    assert tag.length() != 0;
    assert code >= 0;
    assert code < 2000;
    // Box the code once and use the same key object for both directions.
    Integer key = Integer.valueOf ( code );
    tm.put ( key, tag );
    cm.put ( tag, key );
}
private static void makeScriptMaps() {
    // Both tables are filled locally and only stored in the static fields
    // at the very end, once every entry has been registered.
    Map<Integer,String> tagsByCode = new HashMap<Integer,String>();
    Map<String,Integer> codesByTag = new HashMap<String,Integer>();
    putScriptTag ( tagsByCode, codesByTag, SCRIPT_HEBREW, "hebr" );
    putScriptTag ( tagsByCode, codesByTag, SCRIPT_MONGOLIAN, "mong" );
    putScriptTag ( tagsByCode, codesByTag, SCRIPT_ARABIC, "arab" );
    putScriptTag ( tagsByCode, codesByTag, SCRIPT_GREEK, "grek" );
    putScriptTag ( tagsByCode, codesByTag, SCRIPT_LATIN, "latn" );
    putScriptTag ( tagsByCode, codesByTag, SCRIPT_CYRILLIC, "cyrl" );
    putScriptTag ( tagsByCode, codesByTag, SCRIPT_GEORGIAN, "geor" );
    putScriptTag ( tagsByCode, codesByTag, SCRIPT_BOPOMOFO, "bopo" );
    putScriptTag ( tagsByCode, codesByTag, SCRIPT_HANGUL, "hang" );
    putScriptTag ( tagsByCode, codesByTag, SCRIPT_GURMUKHI, "guru" );
    putScriptTag ( tagsByCode, codesByTag, SCRIPT_GURMUKHI_2, "gur2" );
    putScriptTag ( tagsByCode, codesByTag, SCRIPT_DEVANAGARI, "deva" );
    putScriptTag ( tagsByCode, codesByTag, SCRIPT_DEVANAGARI_2, "dev2" );
    putScriptTag ( tagsByCode, codesByTag, SCRIPT_GUJARATI, "gujr" );
    putScriptTag ( tagsByCode, codesByTag, SCRIPT_GUJARATI_2, "gjr2" );
    putScriptTag ( tagsByCode, codesByTag, SCRIPT_BENGALI, "beng" );
    putScriptTag ( tagsByCode, codesByTag, SCRIPT_BENGALI_2, "bng2" );
    putScriptTag ( tagsByCode, codesByTag, SCRIPT_ORIYA, "orya" );
    putScriptTag ( tagsByCode, codesByTag, SCRIPT_ORIYA_2, "ory2" );
    putScriptTag ( tagsByCode, codesByTag, SCRIPT_TIBETAN, "tibt" );
    putScriptTag ( tagsByCode, codesByTag, SCRIPT_TELUGU, "telu" );
    putScriptTag ( tagsByCode, codesByTag, SCRIPT_TELUGU_2, "tel2" );
    putScriptTag ( tagsByCode, codesByTag, SCRIPT_KANNADA, "knda" );
    putScriptTag ( tagsByCode, codesByTag, SCRIPT_KANNADA_2, "knd2" );
    putScriptTag ( tagsByCode, codesByTag, SCRIPT_TAMIL, "taml" );
    putScriptTag ( tagsByCode, codesByTag, SCRIPT_TAMIL_2, "tml2" );
    putScriptTag ( tagsByCode, codesByTag, SCRIPT_MALAYALAM, "mlym" );
    putScriptTag ( tagsByCode, codesByTag, SCRIPT_MALAYALAM_2, "mlm2" );
    putScriptTag ( tagsByCode, codesByTag, SCRIPT_SINHALESE, "sinh" );
    putScriptTag ( tagsByCode, codesByTag, SCRIPT_BURMESE, "mymr" );
    putScriptTag ( tagsByCode, codesByTag, SCRIPT_THAI, "thai" );
    putScriptTag ( tagsByCode, codesByTag, SCRIPT_KHMER, "khmr" );
    putScriptTag ( tagsByCode, codesByTag, SCRIPT_LAO, "laoo" );
    putScriptTag ( tagsByCode, codesByTag, SCRIPT_HIRAGANA, "hira" );
    putScriptTag ( tagsByCode, codesByTag, SCRIPT_ETHIOPIC, "ethi" );
    putScriptTag ( tagsByCode, codesByTag, SCRIPT_HAN, "hani" );
    putScriptTag ( tagsByCode, codesByTag, SCRIPT_KATAKANA, "kana" );
    putScriptTag ( tagsByCode, codesByTag, SCRIPT_MATH, "zmth" );
    putScriptTag ( tagsByCode, codesByTag, SCRIPT_SYMBOL, "zsym" );
    putScriptTag ( tagsByCode, codesByTag, SCRIPT_UNDETERMINED, "zyyy" );
    putScriptTag ( tagsByCode, codesByTag, SCRIPT_UNCODED, "zzzz" );
    scriptTagsMap = tagsByCode;
    scriptCodeMap = codesByTag;
}
private static Map<Integer,String> getScriptTagsMap() {
    Map<Integer,String> tags = scriptTagsMap;
    if ( tags == null ) {
        // First use: build both lookup tables, then re-read the field.
        makeScriptMaps();
        tags = scriptTagsMap;
    }
    return tags;
}
private static Map<String,Integer> getScriptCodeMap() {
    Map<String,Integer> codes = scriptCodeMap;
    if ( codes == null ) {
        // First use: build both lookup tables, then re-read the field.
        makeScriptMaps();
        codes = scriptCodeMap;
    }
    return codes;
}
/**
* Mirror characters that are designated as having the bidi mirrored property.
* @param s a string whose characters are to be mirrored
* @return the resulting string
*/
public static String mirror ( String s ) {
    // Work on a private copy of the UTF-16 code units and replace each one
    // by its mirrored counterpart (or itself when it has none).
    final char[] units = s.toCharArray();
    for ( int i = 0; i < units.length; i++ ) {
        units [ i ] = (char) mirror ( (int) units [ i ] );
    }
    return new String ( units );
}
// Code points that have a mirrored counterpart. mirror(int) looks characters
// up here with Arrays.binarySearch, so the entries must stay in ascending
// order, and entry i corresponds to mirroredCharactersMapping[i].
private static int[] mirroredCharacters = {
0x0028,
0x0029,
0x003C,
0x003E,
0x005B,
0x005D,
0x007B,
0x007D,
0x00AB,
0x00BB,
0x0F3A,
0x0F3B,
0x0F3C,
0x0F3D,
0x169B,
0x169C,
0x2039,
0x203A,
0x2045,
0x2046,
0x207D,
0x207E,
0x208D,
0x208E,
0x2208,
0x2209,
0x220A,
0x220B,
0x220C,
0x220D,
0x2215,
0x223C,
0x223D,
0x2243,
0x2252,
0x2253,
0x2254,
0x2255,
0x2264,
0x2265,
0x2266,
0x2267,
0x2268,
0x2269,
0x226A,
0x226B,
0x226E,
0x226F,
0x2270,
0x2271,
0x2272,
0x2273,
0x2274,
0x2275,
0x2276,
0x2277,
0x2278,
0x2279,
0x227A,
0x227B,
0x227C,
0x227D,
0x227E,
0x227F,
0x2280,
0x2281,
0x2282,
0x2283,
0x2284,
0x2285,
0x2286,
0x2287,
0x2288,
0x2289,
0x228A,
0x228B,
0x228F,
0x2290,
0x2291,
0x2292,
0x2298,
0x22A2,
0x22A3,
0x22A6,
0x22A8,
0x22A9,
0x22AB,
0x22B0,
0x22B1,
0x22B2,
0x22B3,
0x22B4,
0x22B5,
0x22B6,
0x22B7,
0x22C9,
0x22CA,
0x22CB,
0x22CC,
0x22CD,
0x22D0,
0x22D1,
0x22D6,
0x22D7,
0x22D8,
0x22D9,
0x22DA,
0x22DB,
0x22DC,
0x22DD,
0x22DE,
0x22DF,
0x22E0,
0x22E1,
0x22E2,
0x22E3,
0x22E4,
0x22E5,
0x22E6,
0x22E7,
0x22E8,
0x22E9,
0x22EA,
0x22EB,
0x22EC,
0x22ED,
0x22F0,
0x22F1,
0x22F2,
0x22F3,
0x22F4,
0x22F6,
0x22F7,
0x22FA,
0x22FB,
0x22FC,
0x22FD,
0x22FE,
0x2308,
0x2309,
0x230A,
0x230B,
0x2329,
0x232A,
0x2768,
0x2769,
0x276A,
0x276B,
0x276C,
0x276D,
0x276E,
0x276F,
0x2770,
0x2771,
0x2772,
0x2773,
0x2774,
0x2775,
0x27C3,
0x27C4,
0x27C5,
0x27C6,
0x27C8,
0x27C9,
0x27D5,
0x27D6,
0x27DD,
0x27DE,
0x27E2,
0x27E3,
0x27E4,
0x27E5,
0x27E6,
0x27E7,
0x27E8,
0x27E9,
0x27EA,
0x27EB,
0x27EC,
0x27ED,
0x27EE,
0x27EF,
0x2983,
0x2984,
0x2985,
0x2986,
0x2987,
0x2988,
0x2989,
0x298A,
0x298B,
0x298C,
0x298D,
0x298E,
0x298F,
0x2990,
0x2991,
0x2992,
0x2993,
0x2994,
0x2995,
0x2996,
0x2997,
0x2998,
0x29B8,
0x29C0,
0x29C1,
0x29C4,
0x29C5,
0x29CF,
0x29D0,
0x29D1,
0x29D2,
0x29D4,
0x29D5,
0x29D8,
0x29D9,
0x29DA,
0x29DB,
0x29F5,
0x29F8,
0x29F9,
0x29FC,
0x29FD,
0x2A2B,
0x2A2C,
0x2A2D,
0x2A2E,
0x2A34,
0x2A35,
0x2A3C,
0x2A3D,
0x2A64,
0x2A65,
0x2A79,
0x2A7A,
0x2A7D,
0x2A7E,
0x2A7F,
0x2A80,
0x2A81,
0x2A82,
0x2A83,
0x2A84,
0x2A8B,
0x2A8C,
0x2A91,
0x2A92,
0x2A93,
0x2A94,
0x2A95,
0x2A96,
0x2A97,
0x2A98,
0x2A99,
0x2A9A,
0x2A9B,
0x2A9C,
0x2AA1,
0x2AA2,
0x2AA6,
0x2AA7,
0x2AA8,
0x2AA9,
0x2AAA,
0x2AAB,
0x2AAC,
0x2AAD,
0x2AAF,
0x2AB0,
0x2AB3,
0x2AB4,
0x2AC3,
0x2AC4,
0x2AC5,
0x2AC6,
0x2ACD,
0x2ACE,
0x2ACF,
0x2AD0,
0x2AD1,
0x2AD2,
0x2AD3,
0x2AD4,
0x2AD5,
0x2AD6,
0x2ADE,
0x2AE3,
0x2E02,
0x2E03,
0x2E04,
0x2E05,
0x2E09,
0x2E0A,
0x2E0C,
0x2E0D,
0x2E1C,
0x2E1D,
0x2E20,
0x2E21,
0x2E22,
0x2E23,
0x2E24,
0x2E25,
0x2E26,
0x300E,
0x300F,
0x3010,
0x3011,
0x3014,
0x3015,
0x3016,
0x3017,
0x3018,
0x3019,
0x301A,
0x301B,
0xFE59,
0xFE5A,
0xFF3B,
0xFF3D,
0xFF5B,
0xFF5D,
0xFF5F,
0xFF60,
0xFF62,
0xFF63
};
// Mirrored counterparts, index-parallel to mirroredCharacters: entry i is the
// code point returned by mirror(int) for mirroredCharacters[i].
// NOTE(review): a few targets (0x2AE4, 0x2AE5, 0x2E27) do not appear as keys
// in mirroredCharacters, so mirroring is not symmetric for them - confirm
// whether that is intended.
private static int[] mirroredCharactersMapping = {
0x0029,
0x0028,
0x003E,
0x003C,
0x005D,
0x005B,
0x007D,
0x007B,
0x00BB,
0x00AB,
0x0F3B,
0x0F3A,
0x0F3D,
0x0F3C,
0x169C,
0x169B,
0x203A,
0x2039,
0x2046,
0x2045,
0x207E,
0x207D,
0x208E,
0x208D,
0x220B,
0x220C,
0x220D,
0x2208,
0x2209,
0x220A,
0x29F5,
0x223D,
0x223C,
0x22CD,
0x2253,
0x2252,
0x2255,
0x2254,
0x2265,
0x2264,
0x2267,
0x2266,
0x2269,
0x2268,
0x226B,
0x226A,
0x226F,
0x226E,
0x2271,
0x2270,
0x2273,
0x2272,
0x2275,
0x2274,
0x2277,
0x2276,
0x2279,
0x2278,
0x227B,
0x227A,
0x227D,
0x227C,
0x227F,
0x227E,
0x2281,
0x2280,
0x2283,
0x2282,
0x2285,
0x2284,
0x2287,
0x2286,
0x2289,
0x2288,
0x228B,
0x228A,
0x2290,
0x228F,
0x2292,
0x2291,
0x29B8,
0x22A3,
0x22A2,
0x2ADE,
0x2AE4,
0x2AE3,
0x2AE5,
0x22B1,
0x22B0,
0x22B3,
0x22B2,
0x22B5,
0x22B4,
0x22B7,
0x22B6,
0x22CA,
0x22C9,
0x22CC,
0x22CB,
0x2243,
0x22D1,
0x22D0,
0x22D7,
0x22D6,
0x22D9,
0x22D8,
0x22DB,
0x22DA,
0x22DD,
0x22DC,
0x22DF,
0x22DE,
0x22E1,
0x22E0,
0x22E3,
0x22E2,
0x22E5,
0x22E4,
0x22E7,
0x22E6,
0x22E9,
0x22E8,
0x22EB,
0x22EA,
0x22ED,
0x22EC,
0x22F1,
0x22F0,
0x22FA,
0x22FB,
0x22FC,
0x22FD,
0x22FE,
0x22F2,
0x22F3,
0x22F4,
0x22F6,
0x22F7,
0x2309,
0x2308,
0x230B,
0x230A,
0x232A,
0x2329,
0x2769,
0x2768,
0x276B,
0x276A,
0x276D,
0x276C,
0x276F,
0x276E,
0x2771,
0x2770,
0x2773,
0x2772,
0x2775,
0x2774,
0x27C4,
0x27C3,
0x27C6,
0x27C5,
0x27C9,
0x27C8,
0x27D6,
0x27D5,
0x27DE,
0x27DD,
0x27E3,
0x27E2,
0x27E5,
0x27E4,
0x27E7,
0x27E6,
0x27E9,
0x27E8,
0x27EB,
0x27EA,
0x27ED,
0x27EC,
0x27EF,
0x27EE,
0x2984,
0x2983,
0x2986,
0x2985,
0x2988,
0x2987,
0x298A,
0x2989,
0x298C,
0x298B,
0x2990,
0x298F,
0x298E,
0x298D,
0x2992,
0x2991,
0x2994,
0x2993,
0x2996,
0x2995,
0x2998,
0x2997,
0x2298,
0x29C1,
0x29C0,
0x29C5,
0x29C4,
0x29D0,
0x29CF,
0x29D2,
0x29D1,
0x29D5,
0x29D4,
0x29D9,
0x29D8,
0x29DB,
0x29DA,
0x2215,
0x29F9,
0x29F8,
0x29FD,
0x29FC,
0x2A2C,
0x2A2B,
0x2A2E,
0x2A2D,
0x2A35,
0x2A34,
0x2A3D,
0x2A3C,
0x2A65,
0x2A64,
0x2A7A,
0x2A79,
0x2A7E,
0x2A7D,
0x2A80,
0x2A7F,
0x2A82,
0x2A81,
0x2A84,
0x2A83,
0x2A8C,
0x2A8B,
0x2A92,
0x2A91,
0x2A94,
0x2A93,
0x2A96,
0x2A95,
0x2A98,
0x2A97,
0x2A9A,
0x2A99,
0x2A9C,
0x2A9B,
0x2AA2,
0x2AA1,
0x2AA7,
0x2AA6,
0x2AA9,
0x2AA8,
0x2AAB,
0x2AAA,
0x2AAD,
0x2AAC,
0x2AB0,
0x2AAF,
0x2AB4,
0x2AB3,
0x2AC4,
0x2AC3,
0x2AC6,
0x2AC5,
0x2ACE,
0x2ACD,
0x2AD0,
0x2ACF,
0x2AD2,
0x2AD1,
0x2AD4,
0x2AD3,
0x2AD6,
0x2AD5,
0x22A6,
0x22A9,
0x2E03,
0x2E02,
0x2E05,
0x2E04,
0x2E0A,
0x2E09,
0x2E0D,
0x2E0C,
0x2E1D,
0x2E1C,
0x2E21,
0x2E20,
0x2E23,
0x2E22,
0x2E25,
0x2E24,
0x2E27,
0x300F,
0x300E,
0x3011,
0x3010,
0x3015,
0x3014,
0x3017,
0x3016,
0x3019,
0x3018,
0x301B,
0x301A,
0xFE5A,
0xFE59,
0xFF3D,
0xFF3B,
0xFF5D,
0xFF5B,
0xFF60,
0xFF5F,
0xFF63,
0xFF62
};
private static int mirror ( int c ) {
    // A non-negative index means c is listed as a mirrored character; its
    // counterpart sits at the same index of the parallel mapping table.
    final int index = Arrays.binarySearch ( mirroredCharacters, c );
    return ( index >= 0 ) ? mirroredCharactersMapping [ index ] : c;
}
/**
* Determine if two character sequences contain the same characters.
* @param cs1 first character sequence
* @param cs2 second character sequence
* @return true if both sequences have same length and same character sequence
*/
public static boolean isSameSequence ( CharSequence cs1, CharSequence cs2 ) {
    assert cs1 != null;
    assert cs2 != null;
    final int length = cs1.length();
    if ( length != cs2.length() ) {
        // Sequences of different length can never match.
        return false;
    }
    // Advance while the characters agree; reaching the end means no mismatch.
    int i = 0;
    while ( ( i < length ) && ( cs1.charAt ( i ) == cs2.charAt ( i ) ) ) {
        i++;
    }
    return i == length;
}
/**
* Convert Java string (UTF-16) to a Unicode scalar array (UTF-32).
* Note that if there are any non-BMP encoded characters present in the
* input, then the number of entries in the output array will be less
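* than the number of elements in the input string. Any ill-formed
* (isolated) surrogate is either replaced by the substitution value or
* reported by an exception, as selected by errorOnSubstitution.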
* @param s input string
* @param substitution value to substitute for ill-formed surrogate
* @param errorOnSubstitution throw runtime exception (IllegalArgumentException) in
* case this argument is true and a substitution would be attempted
* @return output scalar array
* @throws IllegalArgumentException if substitution required and errorOnSubstitution
* is not false
*/
public static Integer[] toUTF32 ( String s, int substitution, boolean errorOnSubstitution )
    throws IllegalArgumentException {
    final int length = s.length();
    // Worst case is one scalar per code unit; trimmed below if pairs were merged.
    final Integer[] scalars = new Integer [ length ];
    int count = 0;
    int index = 0;
    while ( index < length ) {
        final char unit = s.charAt ( index++ );
        int scalar = unit;
        if ( Character.isHighSurrogate ( unit ) ) {
            if ( ( index < length ) && Character.isLowSurrogate ( s.charAt ( index ) ) ) {
                // Well-formed pair: merge both code units into one scalar value.
                scalar = Character.toCodePoint ( unit, s.charAt ( index++ ) );
            } else if ( errorOnSubstitution ) {
                throw new IllegalArgumentException
                    ( "isolated high (leading) surrogate" );
            } else {
                scalar = substitution;
            }
        } else if ( Character.isLowSurrogate ( unit ) ) {
            // A trailing surrogate that was not consumed as part of a pair.
            if ( errorOnSubstitution ) {
                throw new IllegalArgumentException
                    ( "isolated low (trailing) surrogate" );
            }
            scalar = substitution;
        }
        scalars [ count++ ] = Integer.valueOf ( scalar );
    }
    if ( count == length ) {
        return scalars;
    }
    // Surrogate pairs were merged, so fewer scalars than code units were produced.
    final Integer[] trimmed = new Integer [ count ];
    System.arraycopy ( scalars, 0, trimmed, 0, count );
    return trimmed;
}
/**
* Convert a Unicode scalar array (UTF-32) to a Java string (UTF-16).
* @param sa input scalar array
* @return output (UTF-16) string
* @throws IllegalArgumentException if an input scalar value is illegal,
* e.g., a surrogate or out of range
*/
public static String fromUTF32 ( Integer[] sa ) throws IllegalArgumentException {
    StringBuilder sb = new StringBuilder ( sa.length );
    for ( int s : sa ) {
        if ( ( s < 0 ) || ( s > 0x10FFFF ) ) {
            // Outside the Unicode code space. Negative values used to be
            // truncated silently to a single UTF-16 code unit.
            throw new IllegalArgumentException
                ( "illegal scalar value 0x" + Integer.toHexString ( s ).toUpperCase()
                  + "; out of range for UTF-16" );
        } else if ( ( s >= 0xD800 ) && ( s <= 0xDFFF ) ) {
            // Surrogate code points are not scalar values.
            throw new IllegalArgumentException
                ( "illegal scalar value 0x" + Integer.toHexString ( s ).toUpperCase()
                  + "; cannot be UTF-16 surrogate" );
        } else {
            // Emits one code unit for BMP values (including U+FFFF, which was
            // previously mis-encoded as a surrogate pair) and a surrogate pair
            // for supplementary values.
            sb.appendCodePoint ( s );
        }
    }
    return sb.toString();
}
}