blob: e0f5e1911abf16b0b7347412da667e6d898f101b [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* $Id$ */
package org.apache.fop.util;
/**
 * Static helpers for classifying Unicode characters (in particular the
 * various kinds of white space and explicit line breaks) and for rendering
 * character codes as text for XML or debugging output.
 */
public class CharUtilities {

    /**
     * Character code used to signal a character boundary in
     * inline content, such as an inline with borders and padding
     * or a nested block object.
     */
    public static final char CODE_EOT = 0;

    /** Character class: Unicode white space */
    public static final int UCWHITESPACE = 0;
    /** Character class: Line feed */
    public static final int LINEFEED = 1;
    /** Character class: Boundary between text runs */
    public static final int EOT = 2;
    /** Character class: non-whitespace */
    public static final int NONWHITESPACE = 3;
    /** Character class: XML whitespace */
    public static final int XMLWHITESPACE = 4;

    /** null char */
    public static final char NULL_CHAR = '\u0000';
    /** linefeed character */
    public static final char LINEFEED_CHAR = '\n';
    /** carriage return */
    public static final char CARRIAGE_RETURN = '\r';
    /** normal tab */
    public static final char TAB = '\t';
    /** normal space */
    public static final char SPACE = '\u0020';
    /** non-breaking space */
    public static final char NBSPACE = '\u00A0';
    /** next line control character */
    public static final char NEXT_LINE = '\u0085';
    /** zero-width space */
    public static final char ZERO_WIDTH_SPACE = '\u200B';
    /** word joiner */
    public static final char WORD_JOINER = '\u2060';
    /** zero-width joiner */
    public static final char ZERO_WIDTH_JOINER = '\u200D';
    /** left-to-right mark */
    public static final char LRM = '\u200E';
    /** right-to-left mark (U+200F; U+202F is the narrow no-break space) */
    public static final char RLM = '\u200F';
    /** left-to-right embedding */
    public static final char LRE = '\u202A';
    /** right-to-left embedding */
    public static final char RLE = '\u202B';
    /** pop directional formatting */
    public static final char PDF = '\u202C';
    /** left-to-right override */
    public static final char LRO = '\u202D';
    /** right-to-left override */
    public static final char RLO = '\u202E';
    /** zero-width no-break space (= byte order mark) */
    public static final char ZERO_WIDTH_NOBREAK_SPACE = '\uFEFF';
    /** soft hyphen */
    public static final char SOFT_HYPHEN = '\u00AD';
    /** line-separator */
    public static final char LINE_SEPARATOR = '\u2028';
    /** paragraph-separator */
    public static final char PARAGRAPH_SEPARATOR = '\u2029';
    /** missing ideograph */
    public static final char MISSING_IDEOGRAPH = '\u25A1';
    /** Ideographic space */
    public static final char IDEOGRAPHIC_SPACE = '\u3000';
    /** Object replacement character */
    public static final char OBJECT_REPLACEMENT_CHARACTER = '\uFFFC';
    /** Unicode value indicating that the character is "not a character". */
    public static final char NOT_A_CHARACTER = '\uFFFF';

    /** Upper-case hexadecimal digits, indexed by nibble value. */
    private static final String HEX_DIGITS = "0123456789ABCDEF";

    /**
     * Utility class: Constructor prevents instantiating when subclassed.
     * @throws UnsupportedOperationException always
     */
    protected CharUtilities() {
        throw new UnsupportedOperationException();
    }

    /**
     * Return the appropriate CharClass constant for the type
     * of the passed character.
     * @param c character to inspect
     * @return one of {@link #EOT}, {@link #LINEFEED}, {@link #XMLWHITESPACE},
     *         {@link #UCWHITESPACE} or {@link #NONWHITESPACE}
     */
    public static int classOf(int c) {
        switch (c) {
        case CODE_EOT:
            return EOT;
        case LINEFEED_CHAR:
            return LINEFEED;
        case SPACE:
        case CARRIAGE_RETURN:
        case TAB:
            return XMLWHITESPACE;
        default:
            // Anything else is either some other Unicode space or not white space at all.
            return isAnySpace(c) ? UCWHITESPACE : NONWHITESPACE;
        }
    }

    /**
     * Helper method to determine if the character is a
     * space with normal behavior. Normal behavior means that
     * it's not non-breaking.
     * @param c character to inspect
     * @return True if the character is a normal space or a fixed-width space
     */
    public static boolean isBreakableSpace(int c) {
        return c == SPACE || isFixedWidthSpace(c);
    }

    /**
     * Method to determine if the character is a zero-width space.
     * @param c the character to check
     * @return true if the character is a zero-width space
     */
    public static boolean isZeroWidthSpace(int c) {
        return c == ZERO_WIDTH_SPACE              // U+200B
            || c == WORD_JOINER                   // U+2060
            || c == ZERO_WIDTH_NOBREAK_SPACE;     // U+FEFF (also used as BOM)
    }

    /**
     * Method to determine if the character is a (breakable) fixed-width space.
     * The accepted characters are the contiguous range U+2000 (en quad) through
     * U+200B (zero width space), i.e. en quad, em quad, en space, em space,
     * three-per-em, four-per-em, six-per-em, figure, punctuation, thin, hair and
     * zero width space, plus U+3000 (ideographic space).
     * @param c the character to check
     * @return true if the character has a fixed-width
     */
    public static boolean isFixedWidthSpace(int c) {
        return (c >= '\u2000' && c <= ZERO_WIDTH_SPACE)
            || c == IDEOGRAPHIC_SPACE;
    }

    /**
     * Method to determine if the character is a nonbreaking
     * space.
     * <p>NOTE(review): U+3000 (ideographic space) is accepted here and also by
     * {@link #isFixedWidthSpace(int)}, so it is reported as both breakable and
     * non-breakable; confirm which is intended before relying on either.</p>
     * @param c character to check
     * @return True if the character is a nbsp
     */
    public static boolean isNonBreakableSpace(int c) {
        return c == NBSPACE                       // no-break space
            || c == '\u202F'                      // narrow no-break space
            || c == IDEOGRAPHIC_SPACE             // ideographic space
            || c == WORD_JOINER                   // word joiner
            || c == ZERO_WIDTH_NOBREAK_SPACE;     // zero width no-break space
    }

    /**
     * Method to determine if the character is an adjustable
     * space.
     * @param c character to check
     * @return True if the character is adjustable
     */
    public static boolean isAdjustableSpace(int c) {
        //TODO: are there other kinds of adjustable spaces?
        return c == SPACE        // normal space
            || c == NBSPACE;     // no-break space
    }

    /**
     * Determines if the character represents any kind of space.
     * @param c character to check
     * @return True if the character is a breakable or a non-breakable space
     */
    public static boolean isAnySpace(int c) {
        return isBreakableSpace(c) || isNonBreakableSpace(c);
    }

    /**
     * Indicates whether a character is classified as "Alphabetic" by the Unicode standard.
     * Only the general categories Lu, Ll, Lt, Lm, Lo and Nl are considered; the
     * Other_Alphabetic property is not consulted.
     * @param c the character (a Unicode code point, may lie outside the BMP)
     * @return true if the character is "Alphabetic"
     */
    public static boolean isAlphabetic(int c) {
        //http://www.unicode.org/Public/UNIDATA/UCD.html#Alphabetic
        //Generated from: Other_Alphabetic + Lu + Ll + Lt + Lm + Lo + Nl
        // Use the int overload: narrowing to char would truncate supplementary
        // code points and classify an unrelated BMP character instead.
        int generalCategory = Character.getType(c);
        switch (generalCategory) {
        case Character.UPPERCASE_LETTER: //Lu
        case Character.LOWERCASE_LETTER: //Ll
        case Character.TITLECASE_LETTER: //Lt
        case Character.MODIFIER_LETTER:  //Lm
        case Character.OTHER_LETTER:     //Lo
        case Character.LETTER_NUMBER:    //Nl
            return true;
        default:
            //TODO if (ch in Other_Alphabetic) return true; (Probably need ICU4J for that)
            //Other_Alphabetic contains mostly more exotic characters
            return false;
        }
    }

    /**
     * Indicates whether the given character is an explicit break-character
     * @param c the character to check
     * @return true if the character represents an explicit break
     */
    public static boolean isExplicitBreak(int c) {
        return c == LINEFEED_CHAR
            || c == CARRIAGE_RETURN
            || c == NEXT_LINE
            || c == LINE_SEPARATOR
            || c == PARAGRAPH_SEPARATOR;
    }

    /**
     * Convert a single unicode scalar value to an XML numeric character
     * reference. If in the BMP, four digits are used, otherwise 6 digits are used.
     * @param c a unicode scalar value
     * @return a string representing a numeric character reference
     */
    public static String charToNCRef(int c) {
        int nDigits = (c > 0xFFFF) ? 6 : 4;
        StringBuilder sb = new StringBuilder(nDigits);
        // Digits are produced least-significant first and reversed afterwards.
        for (int i = 0; i < nDigits; i++, c >>= 4) {
            sb.append(HEX_DIGITS.charAt(c & 0xF));
        }
        return "&#x" + sb.reverse() + ";";
    }

    /**
     * Convert a string to a sequence of ASCII or XML numeric character references.
     * Printable ASCII (32..126) is copied through, except that '&lt;', '&gt;' and
     * '&amp;' are replaced by their predefined entities; every other code point is
     * written as a numeric character reference. The string is walked by code
     * point, so a surrogate pair yields a single six-digit reference rather than
     * two references to the individual surrogates.
     * @param s a java string (encoded in UTF-16), may be null
     * @return a string representing a sequence of numeric character reference or
     * ASCII characters; empty if {@code s} is null
     */
    public static String toNCRefs(String s) {
        StringBuilder sb = new StringBuilder();
        if (s != null) {
            int n = s.length();
            int i = 0;
            while (i < n) {
                int cp = s.codePointAt(i);
                i += Character.charCount(cp);
                if ((cp >= 32) && (cp < 127)) {
                    switch (cp) {
                    case '<':
                        sb.append("&lt;");
                        break;
                    case '>':
                        sb.append("&gt;");
                        break;
                    case '&':
                        sb.append("&amp;");
                        break;
                    default:
                        sb.append((char) cp);
                        break;
                    }
                } else {
                    sb.append(charToNCRef(cp));
                }
            }
        }
        return sb.toString();
    }

    /**
     * Pad a string S on left out to width W using padding character PAD.
     * A string that is already at least {@code width} long is returned unpadded.
     * @param s string to pad (must not be null)
     * @param width width of field to add padding
     * @param pad character to use for padding
     * @return padded string
     */
    public static String padLeft(String s, int width, char pad) {
        StringBuilder sb = new StringBuilder();
        for (int i = s.length(); i < width; i++) {
            sb.append(pad);
        }
        sb.append(s);
        return sb.toString();
    }

    /**
     * Format character for debugging output, which it is prefixed with "0x", padded left with '0'
     * and either 4 or 6 hex characters in width according to whether it is in the BMP or not.
     * @param c character code
     * @return formatted character string, or "!NOT A CHARACTER!" if {@code c} is
     *         negative or above U+10FFFF
     */
    public static String format(int c) {
        // Negative values are rejected too; otherwise they would render as e.g. "0x00-1".
        if (c >= 0 && c < 1114112) {
            return "0x" + padLeft(Integer.toString(c, 16), (c < 65536) ? 4 : 6, '0');
        } else {
            return "!NOT A CHARACTER!";
        }
    }

    /**
     * Determine if two character sequences contain the same characters.
     * @param cs1 first character sequence (must not be null)
     * @param cs2 second character sequence (must not be null)
     * @return true if both sequences have same length and same character sequence
     */
    public static boolean isSameSequence(CharSequence cs1, CharSequence cs2) {
        assert cs1 != null;
        assert cs2 != null;
        int n = cs1.length();
        if (n != cs2.length()) {
            return false;
        }
        for (int i = 0; i < n; i++) {
            if (cs1.charAt(i) != cs2.charAt(i)) {
                return false;
            }
        }
        return true;
    }
}