blob: 7786552ff280d796897601aeec0facf4b9dc9395 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* $Id$ */
package org.apache.fop.util;
/**
* This class provides utilities to distinguish various kinds of Unicode
* whitespace and to get character widths in a given FontState.
*/
public class CharUtilities {
/**
* Character code used to signal a character boundary in
* inline content, such as an inline with borders and padding
* or a nested block object.
*/
public static final char CODE_EOT = 0;
/**
* Character class: Unicode white space
*/
public static final int UCWHITESPACE = 0;
/**
* Character class: Line feed
*/
public static final int LINEFEED = 1;
/**
* Character class: Boundary between text runs
*/
public static final int EOT = 2;
/**
* Character class: non-whitespace
*/
public static final int NONWHITESPACE = 3;
/**
* Character class: XML whitespace
*/
public static final int XMLWHITESPACE = 4;
/** null char */
public static final char NULL_CHAR = '\u0000';
/** linefeed character */
public static final char LINEFEED_CHAR = '\n';
/** carriage return */
public static final char CARRIAGE_RETURN = '\r';
/** normal tab */
public static final char TAB = '\t';
/** normal space */
public static final char SPACE = '\u0020';
/** non-breaking space */
public static final char NBSPACE = '\u00A0';
/** next line control character */
public static final char NEXT_LINE = '\u0085';
/** zero-width space */
public static final char ZERO_WIDTH_SPACE = '\u200B';
/** word joiner */
public static final char WORD_JOINER = '\u2060';
/** zero-width joiner */
public static final char ZERO_WIDTH_JOINER = '\u200D';
/** zero-width no-break space (= byte order mark) */
public static final char ZERO_WIDTH_NOBREAK_SPACE = '\uFEFF';
/** soft hyphen */
public static final char SOFT_HYPHEN = '\u00AD';
/** line-separator */
public static final char LINE_SEPARATOR = '\u2028';
/** paragraph-separator */
public static final char PARAGRAPH_SEPARATOR = '\u2029';
/** missing ideograph */
public static final char MISSING_IDEOGRAPH = '\u25A1';
/** Ideogreaphic space */
public static final char IDEOGRAPHIC_SPACE = '\u3000';
/** Unicode value indicating the the character is "not a character". */
public static final char NOT_A_CHARACTER = '\uFFFF';
/**
* Utility class: Constructor prevents instantiating when subclassed.
*/
protected CharUtilities() {
throw new UnsupportedOperationException();
}
/**
* Return the appropriate CharClass constant for the type
* of the passed character.
* @param c character to inspect
* @return the determined character class
*/
public static int classOf(char c) {
switch (c) {
case CODE_EOT:
return EOT;
case LINEFEED_CHAR:
return LINEFEED;
case SPACE:
case CARRIAGE_RETURN:
case TAB:
return XMLWHITESPACE;
default:
return isAnySpace(c) ? UCWHITESPACE : NONWHITESPACE;
}
}
/**
* Helper method to determine if the character is a
* space with normal behavior. Normal behavior means that
* it's not non-breaking.
* @param c character to inspect
* @return True if the character is a normal space
*/
public static boolean isBreakableSpace(char c) {
return (c == SPACE || isFixedWidthSpace(c));
}
/**
* Method to determine if the character is a zero-width space.
* @param c the character to check
* @return true if the character is a zero-width space
*/
public static boolean isZeroWidthSpace(char c) {
return c == ZERO_WIDTH_SPACE // 200Bh
|| c == WORD_JOINER // 2060h
|| c == ZERO_WIDTH_NOBREAK_SPACE; // FEFFh (also used as BOM)
}
/**
* Method to determine if the character is a (breakable) fixed-width space.
* @param c the character to check
* @return true if the character has a fixed-width
*/
public static boolean isFixedWidthSpace(char c) {
return (c >= '\u2000' && c <= '\u200B')
|| c == '\u3000';
// c == '\u2000' // en quad
// c == '\u2001' // em quad
// c == '\u2002' // en space
// c == '\u2003' // em space
// c == '\u2004' // three-per-em space
// c == '\u2005' // four-per-em space
// c == '\u2006' // six-per-em space
// c == '\u2007' // figure space
// c == '\u2008' // punctuation space
// c == '\u2009' // thin space
// c == '\u200A' // hair space
// c == '\u200B' // zero width space
// c == '\u3000' // ideographic space
}
/**
* Method to determine if the character is a nonbreaking
* space.
* @param c character to check
* @return True if the character is a nbsp
*/
public static boolean isNonBreakableSpace(char c) {
return
(c == NBSPACE // no-break space
|| c == '\u202F' // narrow no-break space
|| c == '\u3000' // ideographic space
|| c == WORD_JOINER // word joiner
|| c == ZERO_WIDTH_NOBREAK_SPACE); // zero width no-break space
}
/**
* Method to determine if the character is an adjustable
* space.
* @param c character to check
* @return True if the character is adjustable
*/
public static boolean isAdjustableSpace(char c) {
//TODO: are there other kinds of adjustable spaces?
return
(c == '\u0020' // normal space
|| c == NBSPACE); // no-break space
}
/**
* Determines if the character represents any kind of space.
* @param c character to check
* @return True if the character represents any kind of space
*/
public static boolean isAnySpace(char c) {
return (isBreakableSpace(c) || isNonBreakableSpace(c));
}
/**
* Indicates whether a character is classified as "Alphabetic" by the Unicode standard.
* @param ch the character
* @return true if the character is "Alphabetic"
*/
public static boolean isAlphabetic(char ch) {
//http://www.unicode.org/Public/UNIDATA/UCD.html#Alphabetic
//Generated from: Other_Alphabetic + Lu + Ll + Lt + Lm + Lo + Nl
int generalCategory = Character.getType(ch);
switch (generalCategory) {
case Character.UPPERCASE_LETTER: //Lu
case Character.LOWERCASE_LETTER: //Ll
case Character.TITLECASE_LETTER: //Lt
case Character.MODIFIER_LETTER: //Lm
case Character.OTHER_LETTER: //Lo
case Character.LETTER_NUMBER: //Nl
return true;
default:
//TODO if (ch in Other_Alphabetic) return true; (Probably need ICU4J for that)
//Other_Alphabetic contains mostly more exotic characters
return false;
}
}
/**
* Indicates whether the given character is an explicit break-character
* @param ch the character to check
* @return true if the character represents an explicit break
*/
public static boolean isExplicitBreak(char ch) {
return (ch == LINEFEED_CHAR
|| ch == CARRIAGE_RETURN
|| ch == NEXT_LINE
|| ch == LINE_SEPARATOR
|| ch == PARAGRAPH_SEPARATOR);
}
}