| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| /* $Id$ */ |
| |
| package org.apache.fop.util; |
| |
/**
 * This class provides utilities to distinguish various kinds of Unicode
 * whitespace, to classify characters, and to render characters as XML
 * numeric character references or as hexadecimal debugging strings.
 */
public class CharUtilities {

    /**
     * Character code used to signal a character boundary in
     * inline content, such as an inline with borders and padding
     * or a nested block object.
     */
    public static final char CODE_EOT = 0;

    /**
     * Character class: Unicode white space
     */
    public static final int UCWHITESPACE = 0;
    /**
     * Character class: Line feed
     */
    public static final int LINEFEED = 1;
    /**
     * Character class: Boundary between text runs
     */
    public static final int EOT = 2;
    /**
     * Character class: non-whitespace
     */
    public static final int NONWHITESPACE = 3;
    /**
     * Character class: XML whitespace
     */
    public static final int XMLWHITESPACE = 4;


    /** null char */
    public static final char NULL_CHAR = '\u0000';
    /** linefeed character */
    public static final char LINEFEED_CHAR = '\n';
    /** carriage return */
    public static final char CARRIAGE_RETURN = '\r';
    /** normal tab */
    public static final char TAB = '\t';
    /** normal space */
    public static final char SPACE = '\u0020';
    /** non-breaking space */
    public static final char NBSPACE = '\u00A0';
    /** next line control character */
    public static final char NEXT_LINE = '\u0085';
    /** zero-width space */
    public static final char ZERO_WIDTH_SPACE = '\u200B';
    /** word joiner */
    public static final char WORD_JOINER = '\u2060';
    /** zero-width joiner */
    public static final char ZERO_WIDTH_JOINER = '\u200D';
    /** left-to-right mark */
    public static final char LRM = '\u200E';
    /** right-to-left mark (U+200F; U+202F is the narrow no-break space) */
    public static final char RLM = '\u200F';
    /** left-to-right embedding */
    public static final char LRE = '\u202A';
    /** right-to-left embedding */
    public static final char RLE = '\u202B';
    /** pop directional formatting */
    public static final char PDF = '\u202C';
    /** left-to-right override */
    public static final char LRO = '\u202D';
    /** right-to-left override */
    public static final char RLO = '\u202E';
    /** zero-width no-break space (= byte order mark) */
    public static final char ZERO_WIDTH_NOBREAK_SPACE = '\uFEFF';
    /** soft hyphen */
    public static final char SOFT_HYPHEN = '\u00AD';
    /** line-separator */
    public static final char LINE_SEPARATOR = '\u2028';
    /** paragraph-separator */
    public static final char PARAGRAPH_SEPARATOR = '\u2029';
    /** missing ideograph */
    public static final char MISSING_IDEOGRAPH = '\u25A1';
    /** Ideographic space */
    public static final char IDEOGRAPHIC_SPACE = '\u3000';
    /** Object replacement character */
    public static final char OBJECT_REPLACEMENT_CHARACTER = '\uFFFC';
    /** Unicode value indicating that the character is "not a character". */
    public static final char NOT_A_CHARACTER = '\uFFFF';

    /** Upper-case hexadecimal digits, indexed by nibble value. */
    private static final String HEX_DIGITS = "0123456789ABCDEF";

    /**
     * Utility class: the constructor always throws, so the class cannot be
     * instantiated even through a subclass.
     */
    protected CharUtilities() {
        throw new UnsupportedOperationException();
    }

    /**
     * Return the appropriate character class constant for the type
     * of the passed character.
     * @param c character to inspect
     * @return {@link #EOT} for {@link #CODE_EOT}, {@link #LINEFEED} for a line feed,
     *         {@link #XMLWHITESPACE} for space, carriage return and tab,
     *         {@link #UCWHITESPACE} for any other character accepted by
     *         {@link #isAnySpace(int)}, and {@link #NONWHITESPACE} otherwise
     */
    public static int classOf(int c) {
        switch (c) {
            case CODE_EOT:
                return EOT;
            case LINEFEED_CHAR:
                return LINEFEED;
            case SPACE:
            case CARRIAGE_RETURN:
            case TAB:
                return XMLWHITESPACE;
            default:
                return isAnySpace(c) ? UCWHITESPACE : NONWHITESPACE;
        }
    }


    /**
     * Helper method to determine if the character is a
     * space with normal behavior. Normal behavior means that
     * it's not non-breaking.
     * @param c character to inspect
     * @return True if the character is the normal space or a fixed-width space
     */
    public static boolean isBreakableSpace(int c) {
        return (c == SPACE || isFixedWidthSpace(c));
    }

    /**
     * Method to determine if the character is a zero-width space.
     * @param c the character to check
     * @return true if the character is a zero-width space, a word joiner
     *         or a zero-width no-break space
     */
    public static boolean isZeroWidthSpace(int c) {
        return c == ZERO_WIDTH_SPACE            // 200Bh
            || c == WORD_JOINER                 // 2060h
            || c == ZERO_WIDTH_NOBREAK_SPACE;   // FEFFh (also used as BOM)
    }

    /**
     * Method to determine if the character is a (breakable) fixed-width space.
     * The accepted characters are:
     * <ul>
     * <li>U+2000 en quad</li>
     * <li>U+2001 em quad</li>
     * <li>U+2002 en space</li>
     * <li>U+2003 em space</li>
     * <li>U+2004 three-per-em space</li>
     * <li>U+2005 four-per-em space</li>
     * <li>U+2006 six-per-em space</li>
     * <li>U+2007 figure space</li>
     * <li>U+2008 punctuation space</li>
     * <li>U+2009 thin space</li>
     * <li>U+200A hair space</li>
     * <li>U+200B zero width space</li>
     * <li>U+3000 ideographic space</li>
     * </ul>
     * @param c the character to check
     * @return true if the character has a fixed-width
     */
    public static boolean isFixedWidthSpace(int c) {
        return (c >= '\u2000' && c <= '\u200B')
            || c == IDEOGRAPHIC_SPACE;
    }

    /**
     * Method to determine if the character is a nonbreaking
     * space. Note that the ideographic space (U+3000) is accepted here
     * as well as by {@link #isFixedWidthSpace(int)}.
     * @param c character to check
     * @return True if the character is a nbsp
     */
    public static boolean isNonBreakableSpace(int c) {
        return
            (c == NBSPACE                       // no-break space
            || c == '\u202F'                    // narrow no-break space
            || c == IDEOGRAPHIC_SPACE           // ideographic space
            || c == WORD_JOINER                 // word joiner
            || c == ZERO_WIDTH_NOBREAK_SPACE);  // zero width no-break space
    }

    /**
     * Method to determine if the character is an adjustable
     * space.
     * @param c character to check
     * @return True if the character is adjustable
     */
    public static boolean isAdjustableSpace(int c) {
        //TODO: are there other kinds of adjustable spaces?
        return
            (c == SPACE         // normal space
            || c == NBSPACE);   // no-break space
    }

    /**
     * Determines if the character represents any kind of space.
     * @param c character to check
     * @return True if the character is a breakable or a non-breakable space
     */
    public static boolean isAnySpace(int c) {
        return (isBreakableSpace(c) || isNonBreakableSpace(c));
    }

    /**
     * Indicates whether a character is classified as "Alphabetic" by the Unicode standard.
     * The check is based on the general category only (Lu, Ll, Lt, Lm, Lo, Nl);
     * characters that are alphabetic solely through Other_Alphabetic are not recognized.
     * @param c the character (a Unicode code point, which may lie outside the BMP)
     * @return true if the character is "Alphabetic"
     */
    public static boolean isAlphabetic(int c) {
        //http://www.unicode.org/Public/UNIDATA/UCD.html#Alphabetic
        //Generated from: Other_Alphabetic + Lu + Ll + Lt + Lm + Lo + Nl
        // Use the int overload: narrowing to char would truncate supplementary
        // code points and classify an unrelated BMP character instead.
        int generalCategory = Character.getType(c);
        switch (generalCategory) {
            case Character.UPPERCASE_LETTER: //Lu
            case Character.LOWERCASE_LETTER: //Ll
            case Character.TITLECASE_LETTER: //Lt
            case Character.MODIFIER_LETTER: //Lm
            case Character.OTHER_LETTER: //Lo
            case Character.LETTER_NUMBER: //Nl
                return true;
            default:
                //TODO if (ch in Other_Alphabetic) return true; (Probably need ICU4J for that)
                //Other_Alphabetic contains mostly more exotic characters
                return false;
        }
    }

    /**
     * Indicates whether the given character is an explicit break-character
     * @param c the character to check
     * @return true if the character is a line feed, carriage return, next line,
     *         line separator or paragraph separator
     */
    public static boolean isExplicitBreak(int c) {
        return (c == LINEFEED_CHAR
            || c == CARRIAGE_RETURN
            || c == NEXT_LINE
            || c == LINE_SEPARATOR
            || c == PARAGRAPH_SEPARATOR);
    }

    /**
     * Convert a single unicode scalar value to an XML numeric character
     * reference of the form {@code &#xHHHH;}. If in the BMP, four upper-case
     * hex digits are used, otherwise 6 digits are used. Only the low-order
     * 16 (respectively 24) bits of the value contribute to the digits.
     * @param c a unicode scalar value
     * @return a string representing a numeric character reference
     */
    public static String charToNCRef(int c) {
        int nDigits = (c > 0xFFFF) ? 6 : 4;
        StringBuilder sb = new StringBuilder(nDigits + 4);
        sb.append("&#x");
        // Emit nibbles from most to least significant.
        for (int shift = (nDigits - 1) * 4; shift >= 0; shift -= 4) {
            sb.append(HEX_DIGITS.charAt((c >> shift) & 0xF));
        }
        sb.append(';');
        return sb.toString();
    }

    /**
     * Convert a string to a sequence of ASCII or XML numeric character references.
     * Printable ASCII characters (32 to 126) are copied as is, except that the
     * markup characters less-than, greater-than and ampersand are replaced by
     * their predefined XML entities. Every other character is replaced by the
     * numeric character reference produced by {@link #charToNCRef(int)}; a
     * surrogate pair yields a single reference to the supplementary code point.
     * @param s a java string (encoded in UTF-16), may be null
     * @return a string representing a sequence of numeric character reference or
     * ASCII characters; the empty string if <code>s</code> is null
     */
    public static String toNCRefs(String s) {
        StringBuilder sb = new StringBuilder();
        if (s != null) {
            int n = s.length();
            int i = 0;
            while (i < n) {
                // Iterate by code point so that a surrogate pair is not split
                // into two (invalid) surrogate character references.
                int c = s.codePointAt(i);
                i += Character.charCount(c);
                if ((c >= 32) && (c < 127)) {
                    // The entity names are assembled from parts so that this source
                    // stays correct even if it passes through an entity-decoding tool.
                    switch (c) {
                        case '<':
                            sb.append('&').append("lt;");
                            break;
                        case '>':
                            sb.append('&').append("gt;");
                            break;
                        case '&':
                            sb.append('&').append("amp;");
                            break;
                        default:
                            sb.append((char) c);
                            break;
                    }
                } else {
                    sb.append(charToNCRef(c));
                }
            }
        }
        return sb.toString();
    }

    /**
     * Pad a string S on left out to width W using padding character PAD.
     * A string that is already at least <code>width</code> characters long
     * is returned unchanged.
     * @param s string to pad (must not be null)
     * @param width width of field to add padding
     * @param pad character to use for padding
     * @return padded string
     */
    public static String padLeft(String s, int width, char pad) {
        StringBuilder sb = new StringBuilder();
        for (int i = s.length(); i < width; i++) {
            sb.append(pad);
        }
        sb.append(s);
        return sb.toString();
    }

    /**
     * Format character for debugging output, which it is prefixed with "0x", padded left with '0'
     * and either 4 or 6 (lower-case) hex characters in width according to whether it is in the
     * BMP or not.
     * @param c character code
     * @return formatted character string, or "!NOT A CHARACTER!" if <code>c</code> is
     *         negative or greater than U+10FFFF
     */
    public static String format(int c) {
        if (c >= 0 && c <= Character.MAX_CODE_POINT) {
            return "0x" + padLeft(Integer.toString(c, 16), (c <= 0xFFFF) ? 4 : 6, '0');
        } else {
            return "!NOT A CHARACTER!";
        }
    }

    /**
     * Determine if two character sequences contain the same characters.
     * @param cs1 first character sequence (must not be null)
     * @param cs2 second character sequence (must not be null)
     * @return true if both sequences have same length and same character sequence
     */
    public static boolean isSameSequence(CharSequence cs1, CharSequence cs2) {
        assert cs1 != null;
        assert cs2 != null;
        int n = cs1.length();
        if (n != cs2.length()) {
            return false;
        }
        for (int i = 0; i < n; i++) {
            if (cs1.charAt(i) != cs2.charAt(i)) {
                return false;
            }
        }
        return true;
    }

}