| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| /* $Id$ */ |
| |
| package org.apache.fop.util; |
| |
| import java.util.Arrays; |
| import java.util.HashMap; |
| import java.util.HashSet; |
| import java.util.Iterator; |
| import java.util.Map; |
| import java.util.Set; |
| |
| // CSOFF: AvoidNestedBlocksCheck |
| // CSOFF: InnerAssignmentCheck |
| // CSOFF: WhitespaceAfterCheck |
| // CSOFF: SimplifyBooleanReturnCheck |
| |
| /** |
| * This class provides utilities to distinguish various kinds of Unicode |
| * whitespace and to get character widths in a given FontState. |
| */ |
| public class CharUtilities { |
| |
| /** |
| * Character code used to signal a character boundary in |
| * inline content, such as an inline with borders and padding |
| * or a nested block object. |
| */ |
| public static final char CODE_EOT = 0; |
| |
| /** |
| * Character class: Unicode white space |
| */ |
| public static final int UCWHITESPACE = 0; |
| /** |
| * Character class: Line feed |
| */ |
| public static final int LINEFEED = 1; |
| /** |
| * Character class: Boundary between text runs |
| */ |
| public static final int EOT = 2; |
| /** |
| * Character class: non-whitespace |
| */ |
| public static final int NONWHITESPACE = 3; |
| /** |
| * Character class: XML whitespace |
| */ |
| public static final int XMLWHITESPACE = 4; |
| |
| |
| /** null char */ |
| public static final char NULL_CHAR = '\u0000'; |
| /** linefeed character */ |
| public static final char LINEFEED_CHAR = '\n'; |
| /** carriage return */ |
| public static final char CARRIAGE_RETURN = '\r'; |
| /** normal tab */ |
| public static final char TAB = '\t'; |
| /** normal space */ |
| public static final char SPACE = '\u0020'; |
| /** non-breaking space */ |
| public static final char NBSPACE = '\u00A0'; |
| /** next line control character */ |
| public static final char NEXT_LINE = '\u0085'; |
| /** zero-width space */ |
| public static final char ZERO_WIDTH_SPACE = '\u200B'; |
| /** word joiner */ |
| public static final char WORD_JOINER = '\u2060'; |
| /** zero-width joiner */ |
| public static final char ZERO_WIDTH_JOINER = '\u200D'; |
| /** left-to-right mark */ |
| public static final char LRM = '\u200E'; |
| /** right-to-left mark */ |
| public static final char RLM = '\u202F'; |
| /** left-to-right embedding */ |
| public static final char LRE = '\u202A'; |
| /** right-to-left embedding */ |
| public static final char RLE = '\u202B'; |
| /** pop directional formatting */ |
| public static final char PDF = '\u202C'; |
| /** left-to-right override */ |
| public static final char LRO = '\u202D'; |
| /** right-to-left override */ |
| public static final char RLO = '\u202E'; |
| /** zero-width no-break space (= byte order mark) */ |
| public static final char ZERO_WIDTH_NOBREAK_SPACE = '\uFEFF'; |
| /** soft hyphen */ |
| public static final char SOFT_HYPHEN = '\u00AD'; |
| /** line-separator */ |
| public static final char LINE_SEPARATOR = '\u2028'; |
| /** paragraph-separator */ |
| public static final char PARAGRAPH_SEPARATOR = '\u2029'; |
| /** missing ideograph */ |
| public static final char MISSING_IDEOGRAPH = '\u25A1'; |
| /** Ideogreaphic space */ |
| public static final char IDEOGRAPHIC_SPACE = '\u3000'; |
| /** Object replacement character */ |
| public static final char OBJECT_REPLACEMENT_CHARACTER = '\uFFFC'; |
| /** Unicode value indicating the the character is "not a character". */ |
| public static final char NOT_A_CHARACTER = '\uFFFF'; |
| |
| /** |
| * A static (class) parameter indicating whether V2 indic shaping |
| * rules apply or not, with default being <code>true</code>. |
| */ |
| private static final boolean useV2Indic = true; // CSOK: ConstantNameCheck |
| |
/**
 * Protected constructor of this utility class. It always throws, so
 * an instance cannot be created through it, including from a subclass.
 *
 * @throws UnsupportedOperationException always
 */
protected CharUtilities() {
    throw new UnsupportedOperationException();
}
| |
| /** |
| * Return the appropriate CharClass constant for the type |
| * of the passed character. |
| * @param c character to inspect |
| * @return the determined character class |
| */ |
| public static int classOf ( int c ) { |
| switch (c) { |
| case CODE_EOT: |
| return EOT; |
| case LINEFEED_CHAR: |
| return LINEFEED; |
| case SPACE: |
| case CARRIAGE_RETURN: |
| case TAB: |
| return XMLWHITESPACE; |
| default: |
| return isAnySpace(c) ? UCWHITESPACE : NONWHITESPACE; |
| } |
| } |
| |
| |
| /** |
| * Helper method to determine if the character is a |
| * space with normal behavior. Normal behavior means that |
| * it's not non-breaking. |
| * @param c character to inspect |
| * @return True if the character is a normal space |
| */ |
| public static boolean isBreakableSpace ( int c ) { |
| return (c == SPACE || isFixedWidthSpace(c)); |
| } |
| |
| /** |
| * Method to determine if the character is a zero-width space. |
| * @param c the character to check |
| * @return true if the character is a zero-width space |
| */ |
| public static boolean isZeroWidthSpace ( int c ) { |
| return c == ZERO_WIDTH_SPACE // 200Bh |
| || c == WORD_JOINER // 2060h |
| || c == ZERO_WIDTH_NOBREAK_SPACE; // FEFFh (also used as BOM) |
| } |
| |
| /** |
| * Method to determine if the character is a (breakable) fixed-width space. |
| * @param c the character to check |
| * @return true if the character has a fixed-width |
| */ |
| public static boolean isFixedWidthSpace ( int c ) { |
| return (c >= '\u2000' && c <= '\u200B') |
| || c == '\u3000'; |
| // c == '\u2000' // en quad |
| // c == '\u2001' // em quad |
| // c == '\u2002' // en space |
| // c == '\u2003' // em space |
| // c == '\u2004' // three-per-em space |
| // c == '\u2005' // four-per-em space |
| // c == '\u2006' // six-per-em space |
| // c == '\u2007' // figure space |
| // c == '\u2008' // punctuation space |
| // c == '\u2009' // thin space |
| // c == '\u200A' // hair space |
| // c == '\u200B' // zero width space |
| // c == '\u3000' // ideographic space |
| } |
| |
| /** |
| * Method to determine if the character is a nonbreaking |
| * space. |
| * @param c character to check |
| * @return True if the character is a nbsp |
| */ |
| public static boolean isNonBreakableSpace ( int c ) { |
| return |
| (c == NBSPACE // no-break space |
| || c == '\u202F' // narrow no-break space |
| || c == '\u3000' // ideographic space |
| || c == WORD_JOINER // word joiner |
| || c == ZERO_WIDTH_NOBREAK_SPACE); // zero width no-break space |
| } |
| |
| /** |
| * Method to determine if the character is an adjustable |
| * space. |
| * @param c character to check |
| * @return True if the character is adjustable |
| */ |
| public static boolean isAdjustableSpace ( int c ) { |
| //TODO: are there other kinds of adjustable spaces? |
| return |
| (c == '\u0020' // normal space |
| || c == NBSPACE); // no-break space |
| } |
| |
| /** |
| * Determines if the character represents any kind of space. |
| * @param c character to check |
| * @return True if the character represents any kind of space |
| */ |
| public static boolean isAnySpace ( int c ) { |
| return (isBreakableSpace(c) || isNonBreakableSpace(c)); |
| } |
| |
| /** |
| * Indicates whether a character is classified as "Alphabetic" by the Unicode standard. |
| * @param c the character |
| * @return true if the character is "Alphabetic" |
| */ |
| public static boolean isAlphabetic ( int c ) { |
| //http://www.unicode.org/Public/UNIDATA/UCD.html#Alphabetic |
| //Generated from: Other_Alphabetic + Lu + Ll + Lt + Lm + Lo + Nl |
| int generalCategory = Character.getType((char)c); |
| switch (generalCategory) { |
| case Character.UPPERCASE_LETTER: //Lu |
| case Character.LOWERCASE_LETTER: //Ll |
| case Character.TITLECASE_LETTER: //Lt |
| case Character.MODIFIER_LETTER: //Lm |
| case Character.OTHER_LETTER: //Lo |
| case Character.LETTER_NUMBER: //Nl |
| return true; |
| default: |
| //TODO if (ch in Other_Alphabetic) return true; (Probably need ICU4J for that) |
| //Other_Alphabetic contains mostly more exotic characters |
| return false; |
| } |
| } |
| |
| /** |
| * Indicates whether the given character is an explicit break-character |
| * @param c the character to check |
| * @return true if the character represents an explicit break |
| */ |
| public static boolean isExplicitBreak ( int c ) { |
| return (c == LINEFEED_CHAR |
| || c == CARRIAGE_RETURN |
| || c == NEXT_LINE |
| || c == LINE_SEPARATOR |
| || c == PARAGRAPH_SEPARATOR); |
| } |
| |
| |
| // |
| // The following script codes are based on ISO 15924. Codes less than 1000 are |
| // official assignments from 15924; those equal to or greater than 1000 are FOP |
| // implementation specific. |
| // |
| // CSOFF: LineLengthCheck |
| /** hebrew script constant */ |
| public static final int SCRIPT_HEBREW = 125; // 'hebr' |
| /** mongolian script constant */ |
| public static final int SCRIPT_MONGOLIAN = 145; // 'mong' |
| /** arabic script constant */ |
| public static final int SCRIPT_ARABIC = 160; // 'arab' |
| /** greek script constant */ |
| public static final int SCRIPT_GREEK = 200; // 'grek' |
| /** latin script constant */ |
| public static final int SCRIPT_LATIN = 215; // 'latn' |
| /** cyrillic script constant */ |
| public static final int SCRIPT_CYRILLIC = 220; // 'cyrl' |
| /** georgian script constant */ |
| public static final int SCRIPT_GEORGIAN = 240; // 'geor' |
| /** bopomofo script constant */ |
| public static final int SCRIPT_BOPOMOFO = 285; // 'bopo' |
| /** hangul script constant */ |
| public static final int SCRIPT_HANGUL = 286; // 'hang' |
| /** gurmukhi script constant */ |
| public static final int SCRIPT_GURMUKHI = 310; // 'guru' |
| /** gurmukhi 2 script constant */ |
| public static final int SCRIPT_GURMUKHI_2 = 1310; // 'gur2' -- MSFT (pseudo) script tag for variant shaping semantics |
| /** devanagari script constant */ |
| public static final int SCRIPT_DEVANAGARI = 315; // 'deva' |
| /** devanagari 2 script constant */ |
| public static final int SCRIPT_DEVANAGARI_2 = 1315; // 'dev2' -- MSFT (pseudo) script tag for variant shaping semantics |
| /** gujarati script constant */ |
| public static final int SCRIPT_GUJARATI = 320; // 'gujr' |
| /** gujarati 2 script constant */ |
| public static final int SCRIPT_GUJARATI_2 = 1320; // 'gjr2' -- MSFT (pseudo) script tag for variant shaping semantics |
| /** bengali script constant */ |
| public static final int SCRIPT_BENGALI = 326; // 'beng' |
| /** bengali 2 script constant */ |
| public static final int SCRIPT_BENGALI_2 = 1326; // 'bng2' -- MSFT (pseudo) script tag for variant shaping semantics |
| /** oriya script constant */ |
| public static final int SCRIPT_ORIYA = 327; // 'orya' |
| /** oriya 2 script constant */ |
| public static final int SCRIPT_ORIYA_2 = 1327; // 'ory2' -- MSFT (pseudo) script tag for variant shaping semantics |
| /** tibetan script constant */ |
| public static final int SCRIPT_TIBETAN = 330; // 'tibt' |
| /** telugu script constant */ |
| public static final int SCRIPT_TELUGU = 340; // 'telu' |
| /** telugu 2 script constant */ |
| public static final int SCRIPT_TELUGU_2 = 1340; // 'tel2' -- MSFT (pseudo) script tag for variant shaping semantics |
| /** kannada script constant */ |
| public static final int SCRIPT_KANNADA = 345; // 'knda' |
| /** kannada 2 script constant */ |
| public static final int SCRIPT_KANNADA_2 = 1345; // 'knd2' -- MSFT (pseudo) script tag for variant shaping semantics |
| /** tamil script constant */ |
| public static final int SCRIPT_TAMIL = 346; // 'taml' |
| /** tamil 2 script constant */ |
| public static final int SCRIPT_TAMIL_2 = 1346; // 'tml2' -- MSFT (pseudo) script tag for variant shaping semantics |
| /** malayalam script constant */ |
| public static final int SCRIPT_MALAYALAM = 347; // 'mlym' |
| /** malayalam 2 script constant */ |
| public static final int SCRIPT_MALAYALAM_2 = 1347; // 'mlm2' -- MSFT (pseudo) script tag for variant shaping semantics |
| /** sinhalese script constant */ |
| public static final int SCRIPT_SINHALESE = 348; // 'sinh' |
| /** burmese script constant */ |
| public static final int SCRIPT_BURMESE = 350; // 'mymr' |
| /** thai script constant */ |
| public static final int SCRIPT_THAI = 352; // 'thai' |
| /** khmer script constant */ |
| public static final int SCRIPT_KHMER = 355; // 'khmr' |
| /** lao script constant */ |
| public static final int SCRIPT_LAO = 356; // 'laoo' |
| /** hiragana script constant */ |
| public static final int SCRIPT_HIRAGANA = 410; // 'hira' |
| /** ethiopic script constant */ |
| public static final int SCRIPT_ETHIOPIC = 430; // 'ethi' |
| /** han script constant */ |
| public static final int SCRIPT_HAN = 500; // 'hani' |
| /** katakana script constant */ |
| public static final int SCRIPT_KATAKANA = 410; // 'kana' |
| /** math script constant */ |
| public static final int SCRIPT_MATH = 995; // 'zmth' |
| /** symbol script constant */ |
| public static final int SCRIPT_SYMBOL = 996; // 'zsym' |
| /** undetermined script constant */ |
| public static final int SCRIPT_UNDETERMINED = 998; // 'zyyy' |
| /** uncoded script constant */ |
| public static final int SCRIPT_UNCODED = 999; // 'zzzz' |
| // CSON: LineLengthCheck |
| |
| /** |
| * Determine if character c is punctuation. |
| * @param c a character represented as a unicode scalar value |
| * @return true if character is punctuation |
| */ |
| public static boolean isPunctuation ( int c ) { |
| if ( ( c >= 0x0021 ) && ( c <= 0x002F ) ) { // basic latin punctuation |
| return true; |
| } else if ( ( c >= 0x003A ) && ( c <= 0x0040 ) ) { // basic latin punctuation |
| return true; |
| } else if ( ( c >= 0x005F ) && ( c <= 0x0060 ) ) { // basic latin punctuation |
| return true; |
| } else if ( ( c >= 0x007E ) && ( c <= 0x007E ) ) { // basic latin punctuation |
| return true; |
| } else if ( ( c >= 0x007E ) && ( c <= 0x007E ) ) { // basic latin punctuation |
| return true; |
| } else if ( ( c >= 0x00A1 ) && ( c <= 0x00BF ) ) { // latin supplement punctuation |
| return true; |
| } else if ( ( c >= 0x00D7 ) && ( c <= 0x00D7 ) ) { // latin supplement punctuation |
| return true; |
| } else if ( ( c >= 0x00F7 ) && ( c <= 0x00F7 ) ) { // latin supplement punctuation |
| return true; |
| } else if ( ( c >= 0x2000 ) && ( c <= 0x206F ) ) { // general punctuation |
| return true; |
| } else { // [TBD] - not complete |
| return false; |
| } |
| } |
| |
| /** |
| * Determine if character c is a digit. |
| * @param c a character represented as a unicode scalar value |
| * @return true if character is a digit |
| */ |
| public static boolean isDigit ( int c ) { |
| if ( ( c >= 0x0030 ) && ( c <= 0x0039 ) ) { // basic latin digits |
| return true; |
| } else { // [TBD] - not complete |
| return false; |
| } |
| } |
| |
| /** |
| * Determine if character c belong to the hebrew script. |
| * @param c a character represented as a unicode scalar value |
| * @return true if character belongs to hebrew script |
| */ |
| public static boolean isHebrew ( int c ) { |
| if ( ( c >= 0x0590 ) && ( c <= 0x05FF ) ) { // hebrew block |
| return true; |
| } else if ( ( c >= 0xFB00 ) && ( c <= 0xFB4F ) ) { // hebrew presentation forms block |
| return true; |
| } else { |
| return false; |
| } |
| } |
| |
| /** |
| * Determine if character c belong to the mongolian script. |
| * @param c a character represented as a unicode scalar value |
| * @return true if character belongs to mongolian script |
| */ |
| public static boolean isMongolian ( int c ) { |
| if ( ( c >= 0x1800 ) && ( c <= 0x18AF ) ) { // mongolian block |
| return true; |
| } else { |
| return false; |
| } |
| } |
| |
| /** |
| * Determine if character c belong to the arabic script. |
| * @param c a character represented as a unicode scalar value |
| * @return true if character belongs to arabic script |
| */ |
| public static boolean isArabic ( int c ) { |
| if ( ( c >= 0x0600 ) && ( c <= 0x06FF ) ) { // arabic block |
| return true; |
| } else if ( ( c >= 0x0750 ) && ( c <= 0x077F ) ) { // arabic supplement block |
| return true; |
| } else if ( ( c >= 0xFB50 ) && ( c <= 0xFDFF ) ) { // arabic presentation forms a block |
| return true; |
| } else if ( ( c >= 0xFE70 ) && ( c <= 0xFEFF ) ) { // arabic presentation forms b block |
| return true; |
| } else { |
| return false; |
| } |
| } |
| |
| /** |
| * Determine if character c belong to the greek script. |
| * @param c a character represented as a unicode scalar value |
| * @return true if character belongs to greek script |
| */ |
| public static boolean isGreek ( int c ) { |
| if ( ( c >= 0x0370 ) && ( c <= 0x03FF ) ) { // greek (and coptic) block |
| return true; |
| } else if ( ( c >= 0x1F00 ) && ( c <= 0x1FFF ) ) { // greek extended block |
| return true; |
| } else { |
| return false; |
| } |
| } |
| |
| /** |
| * Determine if character c belong to the latin script. |
| * @param c a character represented as a unicode scalar value |
| * @return true if character belongs to latin script |
| */ |
| public static boolean isLatin ( int c ) { |
| if ( ( c >= 0x0041 ) && ( c <= 0x005A ) ) { // basic latin upper case |
| return true; |
| } else if ( ( c >= 0x0061 ) && ( c <= 0x007A ) ) { // basic latin lower case |
| return true; |
| } else if ( ( c >= 0x00C0 ) && ( c <= 0x00D6 ) ) { // latin supplement upper case |
| return true; |
| } else if ( ( c >= 0x00D8 ) && ( c <= 0x00DF ) ) { // latin supplement upper case |
| return true; |
| } else if ( ( c >= 0x00E0 ) && ( c <= 0x00F6 ) ) { // latin supplement lower case |
| return true; |
| } else if ( ( c >= 0x00F8 ) && ( c <= 0x00FF ) ) { // latin supplement lower case |
| return true; |
| } else if ( ( c >= 0x0100 ) && ( c <= 0x017F ) ) { // latin extended a |
| return true; |
| } else if ( ( c >= 0x0180 ) && ( c <= 0x024F ) ) { // latin extended b |
| return true; |
| } else if ( ( c >= 0x1E00 ) && ( c <= 0x1EFF ) ) { // latin extended additional |
| return true; |
| } else if ( ( c >= 0x2C60 ) && ( c <= 0x2C7F ) ) { // latin extended c |
| return true; |
| } else if ( ( c >= 0xA720 ) && ( c <= 0xA7FF ) ) { // latin extended d |
| return true; |
| } else if ( ( c >= 0xFB00 ) && ( c <= 0xFB0F ) ) { // latin ligatures |
| return true; |
| } else { |
| return false; |
| } |
| } |
| |
| /** |
| * Determine if character c belong to the cyrillic script. |
| * @param c a character represented as a unicode scalar value |
| * @return true if character belongs to cyrillic script |
| */ |
| public static boolean isCyrillic ( int c ) { |
| if ( ( c >= 0x0400 ) && ( c <= 0x04FF ) ) { // cyrillic block |
| return true; |
| } else if ( ( c >= 0x0500 ) && ( c <= 0x052F ) ) { // cyrillic supplement block |
| return true; |
| } else if ( ( c >= 0x2DE0 ) && ( c <= 0x2DFF ) ) { // cyrillic extended-a block |
| return true; |
| } else if ( ( c >= 0xA640 ) && ( c <= 0xA69F ) ) { // cyrillic extended-b block |
| return true; |
| } else { |
| return false; |
| } |
| } |
| |
| /** |
| * Determine if character c belong to the georgian script. |
| * @param c a character represented as a unicode scalar value |
| * @return true if character belongs to georgian script |
| */ |
| public static boolean isGeorgian ( int c ) { |
| if ( ( c >= 0x10A0 ) && ( c <= 0x10FF ) ) { // georgian block |
| return true; |
| } else if ( ( c >= 0x2D00 ) && ( c <= 0x2D2F ) ) { // georgian supplement block |
| return true; |
| } else { |
| return false; |
| } |
| } |
| |
| /** |
| * Determine if character c belong to the hangul script. |
| * @param c a character represented as a unicode scalar value |
| * @return true if character belongs to hangul script |
| */ |
| public static boolean isHangul ( int c ) { |
| if ( ( c >= 0x1100 ) && ( c <= 0x11FF ) ) { // hangul jamo |
| return true; |
| } else if ( ( c >= 0x3130 ) && ( c <= 0x318F ) ) { // hangul compatibility jamo |
| return true; |
| } else if ( ( c >= 0xA960 ) && ( c <= 0xA97F ) ) { // hangul jamo extended a |
| return true; |
| } else if ( ( c >= 0xAC00 ) && ( c <= 0xD7A3 ) ) { // hangul syllables |
| return true; |
| } else if ( ( c >= 0xD7B0 ) && ( c <= 0xD7FF ) ) { // hangul jamo extended a |
| return true; |
| } else { |
| return false; |
| } |
| } |
| |
| /** |
| * Determine if character c belong to the gurmukhi script. |
| * @param c a character represented as a unicode scalar value |
| * @return true if character belongs to gurmukhi script |
| */ |
| public static boolean isGurmukhi ( int c ) { |
| if ( ( c >= 0x0A00 ) && ( c <= 0x0A7F ) ) { // gurmukhi block |
| return true; |
| } else { |
| return false; |
| } |
| } |
| |
| /** |
| * Determine if character c belong to the devanagari script. |
| * @param c a character represented as a unicode scalar value |
| * @return true if character belongs to devanagari script |
| */ |
| public static boolean isDevanagari ( int c ) { |
| if ( ( c >= 0x0900 ) && ( c <= 0x097F ) ) { // devangari block |
| return true; |
| } else if ( ( c >= 0xA8E0 ) && ( c <= 0xA8FF ) ) { // devangari extended block |
| return true; |
| } else { |
| return false; |
| } |
| } |
| |
| /** |
| * Determine if character c belong to the gujarati script. |
| * @param c a character represented as a unicode scalar value |
| * @return true if character belongs to gujarati script |
| */ |
| public static boolean isGujarati ( int c ) { |
| if ( ( c >= 0x0A80 ) && ( c <= 0x0AFF ) ) { // gujarati block |
| return true; |
| } else { |
| return false; |
| } |
| } |
| |
| /** |
| * Determine if character c belong to the bengali script. |
| * @param c a character represented as a unicode scalar value |
| * @return true if character belongs to bengali script |
| */ |
| public static boolean isBengali ( int c ) { |
| if ( ( c >= 0x0980 ) && ( c <= 0x09FF ) ) { // bengali block |
| return true; |
| } else { |
| return false; |
| } |
| } |
| |
| /** |
| * Determine if character c belong to the oriya script. |
| * @param c a character represented as a unicode scalar value |
| * @return true if character belongs to oriya script |
| */ |
| public static boolean isOriya ( int c ) { |
| if ( ( c >= 0x0B00 ) && ( c <= 0x0B7F ) ) { // oriya block |
| return true; |
| } else { |
| return false; |
| } |
| } |
| |
| /** |
| * Determine if character c belong to the tibetan script. |
| * @param c a character represented as a unicode scalar value |
| * @return true if character belongs to tibetan script |
| */ |
| public static boolean isTibetan ( int c ) { |
| if ( ( c >= 0x0F00 ) && ( c <= 0x0FFF ) ) { // tibetan block |
| return true; |
| } else { |
| return false; |
| } |
| } |
| |
| /** |
| * Determine if character c belong to the telugu script. |
| * @param c a character represented as a unicode scalar value |
| * @return true if character belongs to telugu script |
| */ |
| public static boolean isTelugu ( int c ) { |
| if ( ( c >= 0x0C00 ) && ( c <= 0x0C7F ) ) { // telugu block |
| return true; |
| } else { |
| return false; |
| } |
| } |
| |
| /** |
| * Determine if character c belong to the kannada script. |
| * @param c a character represented as a unicode scalar value |
| * @return true if character belongs to kannada script |
| */ |
| public static boolean isKannada ( int c ) { |
| if ( ( c >= 0x0C00 ) && ( c <= 0x0C7F ) ) { // kannada block |
| return true; |
| } else { |
| return false; |
| } |
| } |
| |
| /** |
| * Determine if character c belong to the tamil script. |
| * @param c a character represented as a unicode scalar value |
| * @return true if character belongs to tamil script |
| */ |
| public static boolean isTamil ( int c ) { |
| if ( ( c >= 0x0B80 ) && ( c <= 0x0BFF ) ) { // tamil block |
| return true; |
| } else { |
| return false; |
| } |
| } |
| |
| /** |
| * Determine if character c belong to the malayalam script. |
| * @param c a character represented as a unicode scalar value |
| * @return true if character belongs to malayalam script |
| */ |
| public static boolean isMalayalam ( int c ) { |
| if ( ( c >= 0x0D00 ) && ( c <= 0x0D7F ) ) { // malayalam block |
| return true; |
| } else { |
| return false; |
| } |
| } |
| |
| /** |
| * Determine if character c belong to the sinhalese script. |
| * @param c a character represented as a unicode scalar value |
| * @return true if character belongs to sinhalese script |
| */ |
| public static boolean isSinhalese ( int c ) { |
| if ( ( c >= 0x0D80 ) && ( c <= 0x0DFF ) ) { // sinhala block |
| return true; |
| } else { |
| return false; |
| } |
| } |
| |
| /** |
| * Determine if character c belong to the burmese script. |
| * @param c a character represented as a unicode scalar value |
| * @return true if character belongs to burmese script |
| */ |
| public static boolean isBurmese ( int c ) { |
| if ( ( c >= 0x1000 ) && ( c <= 0x109F ) ) { // burmese (myanmar) block |
| return true; |
| } else if ( ( c >= 0xAA60 ) && ( c <= 0xAA7F ) ) { // burmese (myanmar) extended block |
| return true; |
| } else { |
| return false; |
| } |
| } |
| |
| /** |
| * Determine if character c belong to the thai script. |
| * @param c a character represented as a unicode scalar value |
| * @return true if character belongs to thai script |
| */ |
| public static boolean isThai ( int c ) { |
| if ( ( c >= 0x0E00 ) && ( c <= 0x0E7F ) ) { // thai block |
| return true; |
| } else { |
| return false; |
| } |
| } |
| |
| /** |
| * Determine if character c belong to the khmer script. |
| * @param c a character represented as a unicode scalar value |
| * @return true if character belongs to khmer script |
| */ |
| public static boolean isKhmer ( int c ) { |
| if ( ( c >= 0x1780 ) && ( c <= 0x17FF ) ) { // khmer block |
| return true; |
| } else if ( ( c >= 0x19E0 ) && ( c <= 0x19FF ) ) { // khmer symbols block |
| return true; |
| } else { |
| return false; |
| } |
| } |
| |
| /** |
| * Determine if character c belong to the lao script. |
| * @param c a character represented as a unicode scalar value |
| * @return true if character belongs to lao script |
| */ |
| public static boolean isLao ( int c ) { |
| if ( ( c >= 0x0E80 ) && ( c <= 0x0EFF ) ) { // lao block |
| return true; |
| } else { |
| return false; |
| } |
| } |
| |
| /** |
| * Determine if character c belong to the ethiopic (amharic) script. |
| * @param c a character represented as a unicode scalar value |
| * @return true if character belongs to ethiopic (amharic) script |
| */ |
| public static boolean isEthiopic ( int c ) { |
| if ( ( c >= 0x1200 ) && ( c <= 0x137F ) ) { // ethiopic block |
| return true; |
| } else if ( ( c >= 0x1380 ) && ( c <= 0x139F ) ) { // ethoipic supplement block |
| return true; |
| } else if ( ( c >= 0x2D80 ) && ( c <= 0x2DDF ) ) { // ethoipic extended block |
| return true; |
| } else if ( ( c >= 0xAB00 ) && ( c <= 0xAB2F ) ) { // ethoipic extended-a block |
| return true; |
| } else { |
| return false; |
| } |
| } |
| |
| /** |
| * Determine if character c belong to the han (unified cjk) script. |
| * @param c a character represented as a unicode scalar value |
| * @return true if character belongs to han (unified cjk) script |
| */ |
| public static boolean isHan ( int c ) { |
| if ( ( c >= 0x3400 ) && ( c <= 0x4DBF ) ) { |
| return true; // cjk unified ideographs extension a |
| } else if ( ( c >= 0x4E00 ) && ( c <= 0x9FFF ) ) { |
| return true; // cjk unified ideographs |
| } else if ( ( c >= 0xF900 ) && ( c <= 0xFAFF ) ) { |
| return true; // cjk compatibility ideographs |
| } else if ( ( c >= 0x20000 ) && ( c <= 0x2A6DF ) ) { |
| return true; // cjk unified ideographs extension b |
| } else if ( ( c >= 0x2A700 ) && ( c <= 0x2B73F ) ) { |
| return true; // cjk unified ideographs extension c |
| } else if ( ( c >= 0x2F800 ) && ( c <= 0x2FA1F ) ) { |
| return true; // cjk compatibility ideographs supplement |
| } else { |
| return false; |
| } |
| } |
| |
| /** |
| * Determine if character c belong to the bopomofo script. |
| * @param c a character represented as a unicode scalar value |
| * @return true if character belongs to bopomofo script |
| */ |
| public static boolean isBopomofo ( int c ) { |
| if ( ( c >= 0x3100 ) && ( c <= 0x312F ) ) { |
| return true; |
| } else { |
| return false; |
| } |
| } |
| |
| /** |
| * Determine if character c belong to the hiragana script. |
| * @param c a character represented as a unicode scalar value |
| * @return true if character belongs to hiragana script |
| */ |
| public static boolean isHiragana ( int c ) { |
| if ( ( c >= 0x3040 ) && ( c <= 0x309F ) ) { |
| return true; |
| } else { |
| return false; |
| } |
| } |
| |
| /** |
| * Determine if character c belong to the katakana script. |
| * @param c a character represented as a unicode scalar value |
| * @return true if character belongs to katakana script |
| */ |
| public static boolean isKatakana ( int c ) { |
| if ( ( c >= 0x30A0 ) && ( c <= 0x30FF ) ) { |
| return true; |
| } else if ( ( c >= 0x31F0 ) && ( c <= 0x31FF ) ) { |
| return true; |
| } else { |
| return false; |
| } |
| } |
| |
| /** |
| * Obtain ISO15924 numeric script code of character. If script is not or cannot be determined, |
| * then the script code 998 ('zyyy') is returned. |
| * @param c the character to obtain script |
| * @return an ISO15924 script code |
| */ |
| public static int scriptOf ( int c ) { // [TBD] - needs optimization!!! |
| if ( isAnySpace ( c ) ) { |
| return SCRIPT_UNDETERMINED; |
| } else if ( isPunctuation ( c ) ) { |
| return SCRIPT_UNDETERMINED; |
| } else if ( isDigit ( c ) ) { |
| return SCRIPT_UNDETERMINED; |
| } else if ( isLatin ( c ) ) { |
| return SCRIPT_LATIN; |
| } else if ( isCyrillic ( c ) ) { |
| return SCRIPT_CYRILLIC; |
| } else if ( isGreek ( c ) ) { |
| return SCRIPT_GREEK; |
| } else if ( isHan ( c ) ) { |
| return SCRIPT_HAN; |
| } else if ( isBopomofo ( c ) ) { |
| return SCRIPT_BOPOMOFO; |
| } else if ( isKatakana ( c ) ) { |
| return SCRIPT_KATAKANA; |
| } else if ( isHiragana ( c ) ) { |
| return SCRIPT_HIRAGANA; |
| } else if ( isHangul ( c ) ) { |
| return SCRIPT_HANGUL; |
| } else if ( isArabic ( c ) ) { |
| return SCRIPT_ARABIC; |
| } else if ( isHebrew ( c ) ) { |
| return SCRIPT_HEBREW; |
| } else if ( isMongolian ( c ) ) { |
| return SCRIPT_MONGOLIAN; |
| } else if ( isGeorgian ( c ) ) { |
| return SCRIPT_GEORGIAN; |
| } else if ( isGurmukhi ( c ) ) { |
| return useV2IndicRules ( SCRIPT_GURMUKHI ); |
| } else if ( isDevanagari ( c ) ) { |
| return useV2IndicRules ( SCRIPT_DEVANAGARI ); |
| } else if ( isGujarati ( c ) ) { |
| return useV2IndicRules ( SCRIPT_GUJARATI ); |
| } else if ( isBengali ( c ) ) { |
| return useV2IndicRules ( SCRIPT_BENGALI ); |
| } else if ( isOriya ( c ) ) { |
| return useV2IndicRules ( SCRIPT_ORIYA ); |
| } else if ( isTibetan ( c ) ) { |
| return SCRIPT_TIBETAN; |
| } else if ( isTelugu ( c ) ) { |
| return useV2IndicRules ( SCRIPT_TELUGU ); |
| } else if ( isKannada ( c ) ) { |
| return useV2IndicRules ( SCRIPT_KANNADA ); |
| } else if ( isTamil ( c ) ) { |
| return useV2IndicRules ( SCRIPT_TAMIL ); |
| } else if ( isMalayalam ( c ) ) { |
| return useV2IndicRules ( SCRIPT_MALAYALAM ); |
| } else if ( isSinhalese ( c ) ) { |
| return SCRIPT_SINHALESE; |
| } else if ( isBurmese ( c ) ) { |
| return SCRIPT_BURMESE; |
| } else if ( isThai ( c ) ) { |
| return SCRIPT_THAI; |
| } else if ( isKhmer ( c ) ) { |
| return SCRIPT_KHMER; |
| } else if ( isLao ( c ) ) { |
| return SCRIPT_LAO; |
| } else if ( isEthiopic ( c ) ) { |
| return SCRIPT_ETHIOPIC; |
| } else { |
| return SCRIPT_UNDETERMINED; |
| } |
| } |
| |
| /** |
| * Obtain the V2 indic script code corresponding to V1 indic script code SC if |
| * and only iff V2 indic rules apply; otherwise return SC. |
| * @param sc a V1 indic script code |
| * @return either SC or the V2 flavor of SC if V2 indic rules apply |
| */ |
| public static int useV2IndicRules ( int sc ) { |
| if ( useV2Indic ) { |
| return ( sc < 1000 ) ? ( sc + 1000 ) : sc; |
| } else { |
| return sc; |
| } |
| } |
| |
| /** |
| * Obtain the script codes of each character in a character sequence. If script |
| * is not or cannot be determined for some character, then the script code 998 |
| * ('zyyy') is returned. |
| * @param cs the character sequence |
| * @return a (possibly empty) array of script codes |
| */ |
| public static int[] scriptsOf ( CharSequence cs ) { |
| Set s = new HashSet(); |
| for ( int i = 0, n = cs.length(); i < n; i++ ) { |
| s.add ( Integer.valueOf ( scriptOf ( cs.charAt ( i ) ) ) ); |
| } |
| int[] sa = new int [ s.size() ]; |
| int ns = 0; |
| for ( Iterator it = s.iterator(); it.hasNext();) { |
| sa [ ns++ ] = ( (Integer) it.next() ) .intValue(); |
| } |
| Arrays.sort ( sa ); |
| return sa; |
| } |
| |
| /** |
| * Determine the dominant script of a character sequence. |
| * @param cs the character sequence |
| * @return the dominant script or SCRIPT_UNDETERMINED |
| */ |
| public static int dominantScript ( CharSequence cs ) { |
| Map m = new HashMap(); |
| for ( int i = 0, n = cs.length(); i < n; i++ ) { |
| int c = cs.charAt ( i ); |
| int s = scriptOf ( c ); |
| Integer k = Integer.valueOf ( s ); |
| Integer v = (Integer) m.get ( k ); |
| if ( v != null ) { |
| m.put ( k, Integer.valueOf ( v.intValue() + 1 ) ); |
| } else { |
| m.put ( k, Integer.valueOf ( 0 ) ); |
| } |
| } |
| int sMax = -1; |
| int cMax = -1; |
| for ( Iterator it = m.entrySet().iterator(); it.hasNext();) { |
| Map.Entry e = (Map.Entry) it.next(); |
| Integer k = (Integer) e.getKey(); |
| int s = k.intValue(); |
| switch ( s ) { |
| case SCRIPT_UNDETERMINED: |
| case SCRIPT_UNCODED: |
| break; |
| default: |
| { |
| Integer v = (Integer) e.getValue(); |
| assert v != null; |
| int c = v.intValue(); |
| if ( c > cMax ) { |
| cMax = c; sMax = s; |
| } |
| break; |
| } |
| } |
| } |
| if ( sMax < 0 ) { |
| sMax = SCRIPT_UNDETERMINED; |
| } |
| return sMax; |
| } |
| |
| /** |
| * Determine if script tag denotes an 'Indic' script, where a |
| * script is an 'Indic' script if it is intended to be processed by |
| * the generic 'Indic' Script Processor. |
| * @param script a script tag |
| * @return true if script tag is a designated 'Indic' script |
| */ |
| public static boolean isIndicScript ( String script ) { |
| switch ( scriptCodeFromTag ( script ) ) { |
| case SCRIPT_BENGALI: |
| case SCRIPT_BENGALI_2: |
| case SCRIPT_BURMESE: |
| case SCRIPT_DEVANAGARI: |
| case SCRIPT_DEVANAGARI_2: |
| case SCRIPT_GUJARATI: |
| case SCRIPT_GUJARATI_2: |
| case SCRIPT_GURMUKHI: |
| case SCRIPT_GURMUKHI_2: |
| case SCRIPT_KANNADA: |
| case SCRIPT_KANNADA_2: |
| case SCRIPT_MALAYALAM: |
| case SCRIPT_MALAYALAM_2: |
| case SCRIPT_ORIYA: |
| case SCRIPT_ORIYA_2: |
| case SCRIPT_TAMIL: |
| case SCRIPT_TAMIL_2: |
| case SCRIPT_TELUGU: |
| case SCRIPT_TELUGU_2: |
| return true; |
| default: |
| return false; |
| } |
| } |
| |
| /** |
| * Determine the script tag associated with an internal script code. |
| * @param code the script code |
| * @return a script tag |
| */ |
| public static String scriptTagFromCode ( int code ) { |
| Map<Integer,String> m = getScriptTagsMap(); |
| if ( m != null ) { |
| String tag; |
| if ( ( tag = m.get ( Integer.valueOf ( code ) ) ) != null ) { |
| return tag; |
| } else { |
| return ""; |
| } |
| } else { |
| return ""; |
| } |
| } |
| |
| /** |
| * Determine the internal script code associated with a script tag. |
| * @param tag the script tag |
| * @return a script code |
| */ |
| public static int scriptCodeFromTag ( String tag ) { |
| Map<String,Integer> m = getScriptCodeMap(); |
| if ( m != null ) { |
| Integer c; |
| if ( ( c = m.get ( tag ) ) != null ) { |
| return (int) c; |
| } else { |
| return SCRIPT_UNDETERMINED; |
| } |
| } else { |
| return SCRIPT_UNDETERMINED; |
| } |
| } |
| |
| /** |
| * Convert a single unicode scalar value to an XML numeric character |
| * reference. If in the BMP, four digits are used, otherwise 6 digits are used. |
| * @param c a unicode scalar value |
| * @return a string representing a numeric character reference |
| */ |
| public static String charToNCRef ( int c ) { |
| StringBuffer sb = new StringBuffer(); |
| for ( int i = 0, nDigits = ( c > 0xFFFF ) ? 6 : 4; i < nDigits; i++, c >>= 4 ) { |
| int d = c & 0xF; |
| char hd; |
| if ( d < 10 ) { |
| hd = (char) ( (int) '0' + d ); |
| } else { |
| hd = (char) ( (int) 'A' + ( d - 10 ) ); |
| } |
| sb.append ( hd ); |
| } |
| return "&#x" + sb.reverse() + ";"; |
| } |
| |
| /** |
| * Convert a string to a sequence of ASCII or XML numeric character references. |
| * @param s a java string (encoded in UTF-16) |
| * @return a string representing a sequence of numeric character reference or |
| * ASCII characters |
| */ |
| public static String toNCRefs ( String s ) { |
| StringBuffer sb = new StringBuffer(); |
| if ( s != null ) { |
| for ( int i = 0; i < s.length(); i++ ) { |
| char c = s.charAt(i); |
| if ( ( c >= 32 ) && ( c < 127 ) ) { |
| if ( c == '<' ) { |
| sb.append ( "<" ); |
| } else if ( c == '>' ) { |
| sb.append ( ">" ); |
| } else if ( c == '&' ) { |
| sb.append ( "&" ); |
| } else { |
| sb.append ( c ); |
| } |
| } else { |
| sb.append ( charToNCRef ( c ) ); |
| } |
| } |
| } |
| return sb.toString(); |
| } |
| |
| /** |
| * Pad a string S on left out to width W using padding character PAD. |
| * @param s string to pad |
| * @param width width of field to add padding |
| * @param pad character to use for padding |
| * @return padded string |
| */ |
| public static String padLeft ( String s, int width, char pad ) { |
| StringBuffer sb = new StringBuffer(); |
| for ( int i = s.length(); i < width; i++ ) { |
| sb.append(pad); |
| } |
| sb.append ( s ); |
| return sb.toString(); |
| } |
| |
| /** |
| * Format character for debugging output, which it is prefixed with "0x", padded left with '0' |
| * and either 4 or 6 hex characters in width according to whether it is in the BMP or not. |
| * @param c character code |
| * @return formatted character string |
| */ |
| public static String format ( int c ) { |
| if ( c < 1114112 ) { |
| return "0x" + padLeft ( Integer.toString ( c, 16 ), ( c < 65536 ) ? 4 : 6, '0' ); |
| } else { |
| return "!NOT A CHARACTER!"; |
| } |
| } |
| |
| private static Map<Integer,String> scriptTagsMap = null; |
| private static Map<String,Integer> scriptCodeMap = null; |
| |
| private static void putScriptTag ( Map tm, Map cm, int code, String tag ) { |
| assert tag != null; |
| assert tag.length() != 0; |
| assert code >= 0; |
| assert code < 2000; |
| tm.put ( Integer.valueOf ( code ), tag ); |
| cm.put ( tag, Integer.valueOf ( code ) ); |
| } |
| |
| private static void makeScriptMaps() { |
| HashMap<Integer,String> tm = new HashMap<Integer,String>(); |
| HashMap<String,Integer> cm = new HashMap<String,Integer>(); |
| putScriptTag ( tm, cm, SCRIPT_HEBREW, "hebr" ); |
| putScriptTag ( tm, cm, SCRIPT_MONGOLIAN, "mong" ); |
| putScriptTag ( tm, cm, SCRIPT_ARABIC, "arab" ); |
| putScriptTag ( tm, cm, SCRIPT_GREEK, "grek" ); |
| putScriptTag ( tm, cm, SCRIPT_LATIN, "latn" ); |
| putScriptTag ( tm, cm, SCRIPT_CYRILLIC, "cyrl" ); |
| putScriptTag ( tm, cm, SCRIPT_GEORGIAN, "geor" ); |
| putScriptTag ( tm, cm, SCRIPT_BOPOMOFO, "bopo" ); |
| putScriptTag ( tm, cm, SCRIPT_HANGUL, "hang" ); |
| putScriptTag ( tm, cm, SCRIPT_GURMUKHI, "guru" ); |
| putScriptTag ( tm, cm, SCRIPT_GURMUKHI_2, "gur2" ); |
| putScriptTag ( tm, cm, SCRIPT_DEVANAGARI, "deva" ); |
| putScriptTag ( tm, cm, SCRIPT_DEVANAGARI_2, "dev2" ); |
| putScriptTag ( tm, cm, SCRIPT_GUJARATI, "gujr" ); |
| putScriptTag ( tm, cm, SCRIPT_GUJARATI_2, "gjr2" ); |
| putScriptTag ( tm, cm, SCRIPT_BENGALI, "beng" ); |
| putScriptTag ( tm, cm, SCRIPT_BENGALI_2, "bng2" ); |
| putScriptTag ( tm, cm, SCRIPT_ORIYA, "orya" ); |
| putScriptTag ( tm, cm, SCRIPT_ORIYA_2, "ory2" ); |
| putScriptTag ( tm, cm, SCRIPT_TIBETAN, "tibt" ); |
| putScriptTag ( tm, cm, SCRIPT_TELUGU, "telu" ); |
| putScriptTag ( tm, cm, SCRIPT_TELUGU_2, "tel2" ); |
| putScriptTag ( tm, cm, SCRIPT_KANNADA, "knda" ); |
| putScriptTag ( tm, cm, SCRIPT_KANNADA_2, "knd2" ); |
| putScriptTag ( tm, cm, SCRIPT_TAMIL, "taml" ); |
| putScriptTag ( tm, cm, SCRIPT_TAMIL_2, "tml2" ); |
| putScriptTag ( tm, cm, SCRIPT_MALAYALAM, "mlym" ); |
| putScriptTag ( tm, cm, SCRIPT_MALAYALAM_2, "mlm2" ); |
| putScriptTag ( tm, cm, SCRIPT_SINHALESE, "sinh" ); |
| putScriptTag ( tm, cm, SCRIPT_BURMESE, "mymr" ); |
| putScriptTag ( tm, cm, SCRIPT_THAI, "thai" ); |
| putScriptTag ( tm, cm, SCRIPT_KHMER, "khmr" ); |
| putScriptTag ( tm, cm, SCRIPT_LAO, "laoo" ); |
| putScriptTag ( tm, cm, SCRIPT_HIRAGANA, "hira" ); |
| putScriptTag ( tm, cm, SCRIPT_ETHIOPIC, "ethi" ); |
| putScriptTag ( tm, cm, SCRIPT_HAN, "hani" ); |
| putScriptTag ( tm, cm, SCRIPT_KATAKANA, "kana" ); |
| putScriptTag ( tm, cm, SCRIPT_MATH, "zmth" ); |
| putScriptTag ( tm, cm, SCRIPT_SYMBOL, "zsym" ); |
| putScriptTag ( tm, cm, SCRIPT_UNDETERMINED, "zyyy" ); |
| putScriptTag ( tm, cm, SCRIPT_UNCODED, "zzzz" ); |
| scriptTagsMap = tm; |
| scriptCodeMap = cm; |
| } |
| |
| private static Map<Integer,String> getScriptTagsMap() { |
| if ( scriptTagsMap == null ) { |
| makeScriptMaps(); |
| } |
| return scriptTagsMap; |
| } |
| |
| private static Map<String,Integer> getScriptCodeMap() { |
| if ( scriptCodeMap == null ) { |
| makeScriptMaps(); |
| } |
| return scriptCodeMap; |
| } |
| |
| /** |
| * Mirror characters that are designated as having the bidi mirrorred property. |
| * @param s a string whose characters are to be mirrored |
| * @return the resulting string |
| */ |
| public static String mirror ( String s ) { |
| StringBuffer sb = new StringBuffer ( s ); |
| for ( int i = 0, n = sb.length(); i < n; i++ ) { |
| sb.setCharAt ( i, (char) mirror ( sb.charAt ( i ) ) ); |
| } |
| return sb.toString(); |
| } |
| |
| private static int[] mirroredCharacters = { |
| 0x0028, |
| 0x0029, |
| 0x003C, |
| 0x003E, |
| 0x005B, |
| 0x005D, |
| 0x007B, |
| 0x007D, |
| 0x00AB, |
| 0x00BB, |
| 0x0F3A, |
| 0x0F3B, |
| 0x0F3C, |
| 0x0F3D, |
| 0x169B, |
| 0x169C, |
| 0x2039, |
| 0x203A, |
| 0x2045, |
| 0x2046, |
| 0x207D, |
| 0x207E, |
| 0x208D, |
| 0x208E, |
| 0x2208, |
| 0x2209, |
| 0x220A, |
| 0x220B, |
| 0x220C, |
| 0x220D, |
| 0x2215, |
| 0x223C, |
| 0x223D, |
| 0x2243, |
| 0x2252, |
| 0x2253, |
| 0x2254, |
| 0x2255, |
| 0x2264, |
| 0x2265, |
| 0x2266, |
| 0x2267, |
| 0x2268, |
| 0x2269, |
| 0x226A, |
| 0x226B, |
| 0x226E, |
| 0x226F, |
| 0x2270, |
| 0x2271, |
| 0x2272, |
| 0x2273, |
| 0x2274, |
| 0x2275, |
| 0x2276, |
| 0x2277, |
| 0x2278, |
| 0x2279, |
| 0x227A, |
| 0x227B, |
| 0x227C, |
| 0x227D, |
| 0x227E, |
| 0x227F, |
| 0x2280, |
| 0x2281, |
| 0x2282, |
| 0x2283, |
| 0x2284, |
| 0x2285, |
| 0x2286, |
| 0x2287, |
| 0x2288, |
| 0x2289, |
| 0x228A, |
| 0x228B, |
| 0x228F, |
| 0x2290, |
| 0x2291, |
| 0x2292, |
| 0x2298, |
| 0x22A2, |
| 0x22A3, |
| 0x22A6, |
| 0x22A8, |
| 0x22A9, |
| 0x22AB, |
| 0x22B0, |
| 0x22B1, |
| 0x22B2, |
| 0x22B3, |
| 0x22B4, |
| 0x22B5, |
| 0x22B6, |
| 0x22B7, |
| 0x22C9, |
| 0x22CA, |
| 0x22CB, |
| 0x22CC, |
| 0x22CD, |
| 0x22D0, |
| 0x22D1, |
| 0x22D6, |
| 0x22D7, |
| 0x22D8, |
| 0x22D9, |
| 0x22DA, |
| 0x22DB, |
| 0x22DC, |
| 0x22DD, |
| 0x22DE, |
| 0x22DF, |
| 0x22E0, |
| 0x22E1, |
| 0x22E2, |
| 0x22E3, |
| 0x22E4, |
| 0x22E5, |
| 0x22E6, |
| 0x22E7, |
| 0x22E8, |
| 0x22E9, |
| 0x22EA, |
| 0x22EB, |
| 0x22EC, |
| 0x22ED, |
| 0x22F0, |
| 0x22F1, |
| 0x22F2, |
| 0x22F3, |
| 0x22F4, |
| 0x22F6, |
| 0x22F7, |
| 0x22FA, |
| 0x22FB, |
| 0x22FC, |
| 0x22FD, |
| 0x22FE, |
| 0x2308, |
| 0x2309, |
| 0x230A, |
| 0x230B, |
| 0x2329, |
| 0x232A, |
| 0x2768, |
| 0x2769, |
| 0x276A, |
| 0x276B, |
| 0x276C, |
| 0x276D, |
| 0x276E, |
| 0x276F, |
| 0x2770, |
| 0x2771, |
| 0x2772, |
| 0x2773, |
| 0x2774, |
| 0x2775, |
| 0x27C3, |
| 0x27C4, |
| 0x27C5, |
| 0x27C6, |
| 0x27C8, |
| 0x27C9, |
| 0x27D5, |
| 0x27D6, |
| 0x27DD, |
| 0x27DE, |
| 0x27E2, |
| 0x27E3, |
| 0x27E4, |
| 0x27E5, |
| 0x27E6, |
| 0x27E7, |
| 0x27E8, |
| 0x27E9, |
| 0x27EA, |
| 0x27EB, |
| 0x27EC, |
| 0x27ED, |
| 0x27EE, |
| 0x27EF, |
| 0x2983, |
| 0x2984, |
| 0x2985, |
| 0x2986, |
| 0x2987, |
| 0x2988, |
| 0x2989, |
| 0x298A, |
| 0x298B, |
| 0x298C, |
| 0x298D, |
| 0x298E, |
| 0x298F, |
| 0x2990, |
| 0x2991, |
| 0x2992, |
| 0x2993, |
| 0x2994, |
| 0x2995, |
| 0x2996, |
| 0x2997, |
| 0x2998, |
| 0x29B8, |
| 0x29C0, |
| 0x29C1, |
| 0x29C4, |
| 0x29C5, |
| 0x29CF, |
| 0x29D0, |
| 0x29D1, |
| 0x29D2, |
| 0x29D4, |
| 0x29D5, |
| 0x29D8, |
| 0x29D9, |
| 0x29DA, |
| 0x29DB, |
| 0x29F5, |
| 0x29F8, |
| 0x29F9, |
| 0x29FC, |
| 0x29FD, |
| 0x2A2B, |
| 0x2A2C, |
| 0x2A2D, |
| 0x2A2E, |
| 0x2A34, |
| 0x2A35, |
| 0x2A3C, |
| 0x2A3D, |
| 0x2A64, |
| 0x2A65, |
| 0x2A79, |
| 0x2A7A, |
| 0x2A7D, |
| 0x2A7E, |
| 0x2A7F, |
| 0x2A80, |
| 0x2A81, |
| 0x2A82, |
| 0x2A83, |
| 0x2A84, |
| 0x2A8B, |
| 0x2A8C, |
| 0x2A91, |
| 0x2A92, |
| 0x2A93, |
| 0x2A94, |
| 0x2A95, |
| 0x2A96, |
| 0x2A97, |
| 0x2A98, |
| 0x2A99, |
| 0x2A9A, |
| 0x2A9B, |
| 0x2A9C, |
| 0x2AA1, |
| 0x2AA2, |
| 0x2AA6, |
| 0x2AA7, |
| 0x2AA8, |
| 0x2AA9, |
| 0x2AAA, |
| 0x2AAB, |
| 0x2AAC, |
| 0x2AAD, |
| 0x2AAF, |
| 0x2AB0, |
| 0x2AB3, |
| 0x2AB4, |
| 0x2AC3, |
| 0x2AC4, |
| 0x2AC5, |
| 0x2AC6, |
| 0x2ACD, |
| 0x2ACE, |
| 0x2ACF, |
| 0x2AD0, |
| 0x2AD1, |
| 0x2AD2, |
| 0x2AD3, |
| 0x2AD4, |
| 0x2AD5, |
| 0x2AD6, |
| 0x2ADE, |
| 0x2AE3, |
| 0x2E02, |
| 0x2E03, |
| 0x2E04, |
| 0x2E05, |
| 0x2E09, |
| 0x2E0A, |
| 0x2E0C, |
| 0x2E0D, |
| 0x2E1C, |
| 0x2E1D, |
| 0x2E20, |
| 0x2E21, |
| 0x2E22, |
| 0x2E23, |
| 0x2E24, |
| 0x2E25, |
| 0x2E26, |
| 0x300E, |
| 0x300F, |
| 0x3010, |
| 0x3011, |
| 0x3014, |
| 0x3015, |
| 0x3016, |
| 0x3017, |
| 0x3018, |
| 0x3019, |
| 0x301A, |
| 0x301B, |
| 0xFE59, |
| 0xFE5A, |
| 0xFF3B, |
| 0xFF3D, |
| 0xFF5B, |
| 0xFF5D, |
| 0xFF5F, |
| 0xFF60, |
| 0xFF62, |
| 0xFF63 |
| }; |
| |
| private static int[] mirroredCharactersMapping = { |
| 0x0029, |
| 0x0028, |
| 0x003E, |
| 0x003C, |
| 0x005D, |
| 0x005B, |
| 0x007D, |
| 0x007B, |
| 0x00BB, |
| 0x00AB, |
| 0x0F3B, |
| 0x0F3A, |
| 0x0F3D, |
| 0x0F3C, |
| 0x169C, |
| 0x169B, |
| 0x203A, |
| 0x2039, |
| 0x2046, |
| 0x2045, |
| 0x207E, |
| 0x207D, |
| 0x208E, |
| 0x208D, |
| 0x220B, |
| 0x220C, |
| 0x220D, |
| 0x2208, |
| 0x2209, |
| 0x220A, |
| 0x29F5, |
| 0x223D, |
| 0x223C, |
| 0x22CD, |
| 0x2253, |
| 0x2252, |
| 0x2255, |
| 0x2254, |
| 0x2265, |
| 0x2264, |
| 0x2267, |
| 0x2266, |
| 0x2269, |
| 0x2268, |
| 0x226B, |
| 0x226A, |
| 0x226F, |
| 0x226E, |
| 0x2271, |
| 0x2270, |
| 0x2273, |
| 0x2272, |
| 0x2275, |
| 0x2274, |
| 0x2277, |
| 0x2276, |
| 0x2279, |
| 0x2278, |
| 0x227B, |
| 0x227A, |
| 0x227D, |
| 0x227C, |
| 0x227F, |
| 0x227E, |
| 0x2281, |
| 0x2280, |
| 0x2283, |
| 0x2282, |
| 0x2285, |
| 0x2284, |
| 0x2287, |
| 0x2286, |
| 0x2289, |
| 0x2288, |
| 0x228B, |
| 0x228A, |
| 0x2290, |
| 0x228F, |
| 0x2292, |
| 0x2291, |
| 0x29B8, |
| 0x22A3, |
| 0x22A2, |
| 0x2ADE, |
| 0x2AE4, |
| 0x2AE3, |
| 0x2AE5, |
| 0x22B1, |
| 0x22B0, |
| 0x22B3, |
| 0x22B2, |
| 0x22B5, |
| 0x22B4, |
| 0x22B7, |
| 0x22B6, |
| 0x22CA, |
| 0x22C9, |
| 0x22CC, |
| 0x22CB, |
| 0x2243, |
| 0x22D1, |
| 0x22D0, |
| 0x22D7, |
| 0x22D6, |
| 0x22D9, |
| 0x22D8, |
| 0x22DB, |
| 0x22DA, |
| 0x22DD, |
| 0x22DC, |
| 0x22DF, |
| 0x22DE, |
| 0x22E1, |
| 0x22E0, |
| 0x22E3, |
| 0x22E2, |
| 0x22E5, |
| 0x22E4, |
| 0x22E7, |
| 0x22E6, |
| 0x22E9, |
| 0x22E8, |
| 0x22EB, |
| 0x22EA, |
| 0x22ED, |
| 0x22EC, |
| 0x22F1, |
| 0x22F0, |
| 0x22FA, |
| 0x22FB, |
| 0x22FC, |
| 0x22FD, |
| 0x22FE, |
| 0x22F2, |
| 0x22F3, |
| 0x22F4, |
| 0x22F6, |
| 0x22F7, |
| 0x2309, |
| 0x2308, |
| 0x230B, |
| 0x230A, |
| 0x232A, |
| 0x2329, |
| 0x2769, |
| 0x2768, |
| 0x276B, |
| 0x276A, |
| 0x276D, |
| 0x276C, |
| 0x276F, |
| 0x276E, |
| 0x2771, |
| 0x2770, |
| 0x2773, |
| 0x2772, |
| 0x2775, |
| 0x2774, |
| 0x27C4, |
| 0x27C3, |
| 0x27C6, |
| 0x27C5, |
| 0x27C9, |
| 0x27C8, |
| 0x27D6, |
| 0x27D5, |
| 0x27DE, |
| 0x27DD, |
| 0x27E3, |
| 0x27E2, |
| 0x27E5, |
| 0x27E4, |
| 0x27E7, |
| 0x27E6, |
| 0x27E9, |
| 0x27E8, |
| 0x27EB, |
| 0x27EA, |
| 0x27ED, |
| 0x27EC, |
| 0x27EF, |
| 0x27EE, |
| 0x2984, |
| 0x2983, |
| 0x2986, |
| 0x2985, |
| 0x2988, |
| 0x2987, |
| 0x298A, |
| 0x2989, |
| 0x298C, |
| 0x298B, |
| 0x2990, |
| 0x298F, |
| 0x298E, |
| 0x298D, |
| 0x2992, |
| 0x2991, |
| 0x2994, |
| 0x2993, |
| 0x2996, |
| 0x2995, |
| 0x2998, |
| 0x2997, |
| 0x2298, |
| 0x29C1, |
| 0x29C0, |
| 0x29C5, |
| 0x29C4, |
| 0x29D0, |
| 0x29CF, |
| 0x29D2, |
| 0x29D1, |
| 0x29D5, |
| 0x29D4, |
| 0x29D9, |
| 0x29D8, |
| 0x29DB, |
| 0x29DA, |
| 0x2215, |
| 0x29F9, |
| 0x29F8, |
| 0x29FD, |
| 0x29FC, |
| 0x2A2C, |
| 0x2A2B, |
| 0x2A2E, |
| 0x2A2D, |
| 0x2A35, |
| 0x2A34, |
| 0x2A3D, |
| 0x2A3C, |
| 0x2A65, |
| 0x2A64, |
| 0x2A7A, |
| 0x2A79, |
| 0x2A7E, |
| 0x2A7D, |
| 0x2A80, |
| 0x2A7F, |
| 0x2A82, |
| 0x2A81, |
| 0x2A84, |
| 0x2A83, |
| 0x2A8C, |
| 0x2A8B, |
| 0x2A92, |
| 0x2A91, |
| 0x2A94, |
| 0x2A93, |
| 0x2A96, |
| 0x2A95, |
| 0x2A98, |
| 0x2A97, |
| 0x2A9A, |
| 0x2A99, |
| 0x2A9C, |
| 0x2A9B, |
| 0x2AA2, |
| 0x2AA1, |
| 0x2AA7, |
| 0x2AA6, |
| 0x2AA9, |
| 0x2AA8, |
| 0x2AAB, |
| 0x2AAA, |
| 0x2AAD, |
| 0x2AAC, |
| 0x2AB0, |
| 0x2AAF, |
| 0x2AB4, |
| 0x2AB3, |
| 0x2AC4, |
| 0x2AC3, |
| 0x2AC6, |
| 0x2AC5, |
| 0x2ACE, |
| 0x2ACD, |
| 0x2AD0, |
| 0x2ACF, |
| 0x2AD2, |
| 0x2AD1, |
| 0x2AD4, |
| 0x2AD3, |
| 0x2AD6, |
| 0x2AD5, |
| 0x22A6, |
| 0x22A9, |
| 0x2E03, |
| 0x2E02, |
| 0x2E05, |
| 0x2E04, |
| 0x2E0A, |
| 0x2E09, |
| 0x2E0D, |
| 0x2E0C, |
| 0x2E1D, |
| 0x2E1C, |
| 0x2E21, |
| 0x2E20, |
| 0x2E23, |
| 0x2E22, |
| 0x2E25, |
| 0x2E24, |
| 0x2E27, |
| 0x300F, |
| 0x300E, |
| 0x3011, |
| 0x3010, |
| 0x3015, |
| 0x3014, |
| 0x3017, |
| 0x3016, |
| 0x3019, |
| 0x3018, |
| 0x301B, |
| 0x301A, |
| 0xFE5A, |
| 0xFE59, |
| 0xFF3D, |
| 0xFF3B, |
| 0xFF5D, |
| 0xFF5B, |
| 0xFF60, |
| 0xFF5F, |
| 0xFF63, |
| 0xFF62 |
| }; |
| |
| private static int mirror ( int c ) { |
| int i = Arrays.binarySearch ( mirroredCharacters, c ); |
| if ( i < 0 ) { |
| return c; |
| } else { |
| return mirroredCharactersMapping [ i ]; |
| } |
| } |
| |
| /** |
| * Determine if two character sequences contain the same characters. |
| * @param cs1 first character sequence |
| * @param cs2 second character sequence |
| * @return true if both sequences have same length and same character sequence |
| */ |
| public static boolean isSameSequence ( CharSequence cs1, CharSequence cs2 ) { |
| assert cs1 != null; |
| assert cs2 != null; |
| if ( cs1.length() != cs2.length() ) { |
| return false; |
| } else { |
| for ( int i = 0, n = cs1.length(); i < n; i++ ) { |
| if ( cs1.charAt(i) != cs2.charAt(i) ) { |
| return false; |
| } |
| } |
| return true; |
| } |
| } |
| |
| /** |
| * Convert Java string (UTF-16) to a Unicode scalar array (UTF-32). |
| * Note that if there are any non-BMP encoded characters present in the |
| * input, then the number of entries in the output array will be less |
| * than the number of elements in the input string. Any |
| * @param s input string |
| * @param substitution value to substitute for ill-formed surrogate |
| * @param errorOnSubstitution throw runtime exception (IllegalArgumentException) in |
| * case this argument is true and a substitution would be attempted |
| * @return output scalar array |
| * @throws IllegalArgumentException if substitution required and errorOnSubstitution |
| * is not false |
| */ |
| public static Integer[] toUTF32 ( String s, int substitution, boolean errorOnSubstitution ) |
| throws IllegalArgumentException { |
| int n; |
| if ( ( n = s.length() ) == 0 ) { |
| return new Integer[0]; |
| } else { |
| Integer[] sa = new Integer [ n ]; |
| int k = 0; |
| for ( int i = 0; i < n; i++ ) { |
| int c = (int) s.charAt(i); |
| if ( ( c >= 0xD800 ) && ( c < 0xE000 ) ) { |
| int s1 = c; |
| int s2 = ( ( i + 1 ) < n ) ? (int) s.charAt ( i + 1 ) : 0; |
| if ( s1 < 0xDC00 ) { |
| if ( ( s2 >= 0xDC00 ) && ( s2 < 0xE000 ) ) { |
| c = ( ( s1 - 0xD800 ) << 10 ) + ( s2 - 0xDC00 ) + 65536; |
| i++; |
| } else { |
| if ( errorOnSubstitution ) { |
| throw new IllegalArgumentException |
| ( "isolated high (leading) surrogate" ); |
| } else { |
| c = substitution; |
| } |
| } |
| } else { |
| if ( errorOnSubstitution ) { |
| throw new IllegalArgumentException |
| ( "isolated low (trailing) surrogate" ); |
| } else { |
| c = substitution; |
| } |
| } |
| } |
| sa[k++] = c; |
| } |
| if ( k == n ) { |
| return sa; |
| } else { |
| Integer[] na = new Integer [ k ]; |
| System.arraycopy ( sa, 0, na, 0, k ); |
| return na; |
| } |
| } |
| } |
| |
| /** |
| * Convert a Unicode scalar array (UTF-32) a Java string (UTF-16). |
| * @param sa input scalar array |
| * @return output (UTF-16) string |
| * @throws IllegalArgumentException if an input scalar value is illegal, |
| * e.g., a surrogate or out of range |
| */ |
| public static String fromUTF32 ( Integer[] sa ) throws IllegalArgumentException { |
| StringBuffer sb = new StringBuffer(); |
| for ( int s : sa ) { |
| if ( s < 65535 ) { |
| if ( ( s < 0xD800 ) || ( s > 0xDFFF ) ) { |
| sb.append ( (char) s ); |
| } else { |
| String ncr = charToNCRef(s); |
| throw new IllegalArgumentException |
| ( "illegal scalar value 0x" + ncr.substring(2,ncr.length() - 1) |
| + "; cannot be UTF-16 surrogate" ); |
| } |
| } else if ( s < 1114112 ) { |
| int s1 = ( ( ( s - 65536 ) >> 10 ) & 0x3FF ) + 0xD800; |
| int s2 = ( ( ( s - 65536 ) >> 0 ) & 0x3FF ) + 0xDC00; |
| sb.append ( (char) s1 ); |
| sb.append ( (char) s2 ); |
| } else { |
| String ncr = charToNCRef(s); |
| throw new IllegalArgumentException |
| ( "illegal scalar value 0x" + ncr.substring(2,ncr.length() - 1) |
| + "; out of range for UTF-16" ); |
| } |
| } |
| return sb.toString(); |
| } |
| } |