src/java/org/apache/fop/util/CharUtilities.java - xmlgraphics-fop - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 /* $Id$ */

 package org.apache.fop.util;

 /**
  * Utilities for classifying Unicode characters (in particular the various
  * kinds of whitespace and explicit line breaks) and for rendering character
  * codes as XML numeric character references or debug strings.
  * <p>
  * All methods are static; character arguments are passed as {@code int} so
  * that code points outside the Basic Multilingual Plane can be represented.
  */
 public class CharUtilities {

     /**
      * Character code used to signal a character boundary in
      * inline content, such as an inline with borders and padding
      * or a nested block object.
      */
     public static final char CODE_EOT = 0;

     /** Character class: Unicode white space. */
     public static final int UCWHITESPACE = 0;
     /** Character class: line feed. */
     public static final int LINEFEED = 1;
     /** Character class: boundary between text runs. */
     public static final int EOT = 2;
     /** Character class: non-whitespace. */
     public static final int NONWHITESPACE = 3;
     /** Character class: XML whitespace. */
     public static final int XMLWHITESPACE = 4;

     /** null char */
     public static final char NULL_CHAR = '\u0000';
     /** linefeed character */
     public static final char LINEFEED_CHAR = '\n';
     /** carriage return */
     public static final char CARRIAGE_RETURN = '\r';
     /** normal tab */
     public static final char TAB = '\t';
     /** normal space */
     public static final char SPACE = '\u0020';
     /** non-breaking space */
     public static final char NBSPACE = '\u00A0';
     /** next line control character */
     public static final char NEXT_LINE = '\u0085';
     /** zero-width space */
     public static final char ZERO_WIDTH_SPACE = '\u200B';
     /** word joiner */
     public static final char WORD_JOINER = '\u2060';
     /** zero-width joiner */
     public static final char ZERO_WIDTH_JOINER = '\u200D';
     /** left-to-right mark */
     public static final char LRM = '\u200E';
     /**
      * right-to-left mark (U+200F). This was previously declared as U+202F,
      * which is NARROW NO-BREAK SPACE and not a directional mark.
      */
     public static final char RLM = '\u200F';
     /** left-to-right embedding */
     public static final char LRE = '\u202A';
     /** right-to-left embedding */
     public static final char RLE = '\u202B';
     /** pop directional formatting */
     public static final char PDF = '\u202C';
     /** left-to-right override */
     public static final char LRO = '\u202D';
     /** right-to-left override */
     public static final char RLO = '\u202E';
     /** zero-width no-break space (= byte order mark) */
     public static final char ZERO_WIDTH_NOBREAK_SPACE = '\uFEFF';
     /** soft hyphen */
     public static final char SOFT_HYPHEN = '\u00AD';
     /** line-separator */
     public static final char LINE_SEPARATOR = '\u2028';
     /** paragraph-separator */
     public static final char PARAGRAPH_SEPARATOR = '\u2029';
     /** missing ideograph */
     public static final char MISSING_IDEOGRAPH = '\u25A1';
     /** Ideographic space */
     public static final char IDEOGRAPHIC_SPACE = '\u3000';
     /** Object replacement character */
     public static final char OBJECT_REPLACEMENT_CHARACTER = '\uFFFC';
     /** Unicode value indicating that the character is "not a character". */
     public static final char NOT_A_CHARACTER = '\uFFFF';

     /**
      * Utility class: the constructor always throws, so neither this class
      * nor a subclass can be instantiated.
      * @throws UnsupportedOperationException always
      */
     protected CharUtilities() {
         throw new UnsupportedOperationException();
     }

     /**
      * Return the appropriate character class constant for the type
      * of the passed character.
      * @param c character to inspect
      * @return one of {@link #EOT}, {@link #LINEFEED}, {@link #XMLWHITESPACE},
      *         {@link #UCWHITESPACE} or {@link #NONWHITESPACE}
      */
     public static int classOf(int c) {
         switch (c) {
             case CODE_EOT:
                 return EOT;
             case LINEFEED_CHAR:
                 return LINEFEED;
             case SPACE:
             case CARRIAGE_RETURN:
             case TAB:
                 return XMLWHITESPACE;
             default:
                 return isAnySpace(c) ? UCWHITESPACE : NONWHITESPACE;
         }
     }

     /**
      * Helper method to determine if the character is a
      * space with normal behavior. Normal behavior means that
      * it's not non-breaking.
      * @param c character to inspect
      * @return true if the character is a normal space or a fixed-width space
      */
     public static boolean isBreakableSpace(int c) {
         return c == SPACE || isFixedWidthSpace(c);
     }

     /**
      * Method to determine if the character is a zero-width space.
      * @param c the character to check
      * @return true if the character is a zero-width space
      */
     public static boolean isZeroWidthSpace(int c) {
         return c == ZERO_WIDTH_SPACE           // 200Bh
             || c == WORD_JOINER                // 2060h
             || c == ZERO_WIDTH_NOBREAK_SPACE;  // FEFFh (also used as BOM)
     }

     /**
      * Method to determine if the character is a (breakable) fixed-width space.
      * @param c the character to check
      * @return true if the character has a fixed-width
      */
     public static boolean isFixedWidthSpace(int c) {
         // The contiguous range U+2000..U+200B covers:
         //   U+2000 en quad             U+2001 em quad
         //   U+2002 en space            U+2003 em space
         //   U+2004 three-per-em space  U+2005 four-per-em space
         //   U+2006 six-per-em space    U+2007 figure space
         //   U+2008 punctuation space   U+2009 thin space
         //   U+200A hair space          U+200B zero width space
         // plus, separately, U+3000 ideographic space.
         return (c >= '\u2000' && c <= ZERO_WIDTH_SPACE)
                 || c == IDEOGRAPHIC_SPACE;
     }

     /**
      * Method to determine if the character is a nonbreaking
      * space.
      * @param c character to check
      * @return true if the character is a non-breaking space
      */
     public static boolean isNonBreakableSpace(int c) {
         // NOTE(review): U+3000 is also accepted by isFixedWidthSpace(), so it
         // is reported as both breakable and non-breakable; kept as-is because
         // callers may depend on it.
         return c == NBSPACE                    // no-break space
             || c == '\u202F'                   // narrow no-break space
             || c == IDEOGRAPHIC_SPACE          // ideographic space
             || c == WORD_JOINER                // word joiner
             || c == ZERO_WIDTH_NOBREAK_SPACE;  // zero width no-break space
     }

     /**
      * Method to determine if the character is an adjustable
      * space.
      * @param c character to check
      * @return true if the character is adjustable
      */
     public static boolean isAdjustableSpace(int c) {
         //TODO: are there other kinds of adjustable spaces?
         return c == SPACE       // normal space
             || c == NBSPACE;    // no-break space
     }

     /**
      * Determines if the character represents any kind of space.
      * @param c character to check
      * @return true if the character is a breakable or a non-breakable space
      */
     public static boolean isAnySpace(int c) {
         return isBreakableSpace(c) || isNonBreakableSpace(c);
     }

     /**
      * Indicates whether a character is classified as "Alphabetic" by the Unicode standard.
      * Only the general categories Lu, Ll, Lt, Lm, Lo and Nl are considered; the
      * Other_Alphabetic property is not consulted.
      * @param c the character (a Unicode code point, may be supplementary)
      * @return true if the character is "Alphabetic"
      */
     public static boolean isAlphabetic(int c) {
         //http://www.unicode.org/Public/UNIDATA/UCD.html#Alphabetic
         //Generated from: Other_Alphabetic + Lu + Ll + Lt + Lm + Lo + Nl
         // Use the int overload: casting to char would truncate supplementary
         // code points and classify an unrelated BMP character instead.
         int generalCategory = Character.getType(c);
         switch (generalCategory) {
             case Character.UPPERCASE_LETTER: //Lu
             case Character.LOWERCASE_LETTER: //Ll
             case Character.TITLECASE_LETTER: //Lt
             case Character.MODIFIER_LETTER: //Lm
             case Character.OTHER_LETTER: //Lo
             case Character.LETTER_NUMBER: //Nl
                 return true;
             default:
                 //TODO if (ch in Other_Alphabetic) return true; (Probably need ICU4J for that)
                 //Other_Alphabetic contains mostly more exotic characters
                 return false;
         }
     }

     /**
      * Indicates whether the given character is an explicit break-character
      * @param c    the character to check
      * @return  true if the character represents an explicit break
      */
     public static boolean isExplicitBreak(int c) {
         return c == LINEFEED_CHAR
             || c == CARRIAGE_RETURN
             || c == NEXT_LINE
             || c == LINE_SEPARATOR
             || c == PARAGRAPH_SEPARATOR;
     }

     /**
      * Convert a single unicode scalar value to an XML numeric character
      * reference. If in the BMP, four digits are used, otherwise 6 digits are used.
      * @param c a unicode scalar value
      * @return a string representing a numeric character reference
      */
     public static String charToNCRef(int c) {
         final int nDigits = (c > 0xFFFF) ? 6 : 4;
         final char[] hex = new char[nDigits];
         // Fill from the least significant nibble backwards.
         for (int i = nDigits - 1; i >= 0; i--, c >>= 4) {
             hex[i] = "0123456789ABCDEF".charAt(c & 0xF);
         }
         return "&#x" + new String(hex) + ";";
     }

     /**
      * Convert a string to a sequence of ASCII or XML numeric character references.
      * Printable ASCII characters are copied, except that {@code <}, {@code >}
      * and {@code &} are replaced by their predefined entities. Every other
      * character is written as a numeric character reference; a surrogate pair
      * yields a single reference to the supplementary code point.
      * @param s a java string (encoded in UTF-16), may be null
      * @return a string representing a sequence of numeric character reference or
      * ASCII characters; empty if {@code s} is null
      */
     public static String toNCRefs(String s) {
         if (s == null) {
             return "";
         }
         final int n = s.length();
         StringBuilder sb = new StringBuilder(n);
         int i = 0;
         while (i < n) {
             // Decode by code point so surrogate pairs are not split into two
             // (XML-illegal) surrogate references.
             int cp = s.codePointAt(i);
             i += Character.charCount(cp);
             if (cp < 32 || cp >= 127) {
                 sb.append(charToNCRef(cp));
             } else if (cp == '<') {
                 sb.append("&lt;");
             } else if (cp == '>') {
                 sb.append("&gt;");
             } else if (cp == '&') {
                 sb.append("&amp;");
             } else {
                 sb.append((char) cp);
             }
         }
         return sb.toString();
     }

     /**
      * Pad a string S on left out to width W using padding character PAD.
      * A string already at least {@code width} long is returned unchanged.
      * @param s string to pad
      * @param width width of field to add padding
      * @param pad character to use for padding
      * @return padded string
      */
     public static String padLeft(String s, int width, char pad) {
         StringBuilder sb = new StringBuilder();
         for (int i = s.length(); i < width; i++) {
             sb.append(pad);
         }
         return sb.append(s).toString();
     }

     /**
      * Format character for debugging output, which is prefixed with "0x", padded left with '0'
      * and either 4 or 6 hex characters in width according to whether it is in the BMP or not.
      * @param c character code
      * @return formatted character string, or "!NOT A CHARACTER!" if {@code c}
      *         is outside the range 0..0x10FFFF
      */
     public static String format(int c) {
         // Negative values are rejected as well; formatting them would
         // produce strings such as "0x00-1".
         if (c < 0 || c > Character.MAX_CODE_POINT) {
             return "!NOT A CHARACTER!";
         }
         return "0x" + padLeft(Integer.toString(c, 16), (c < 65536) ? 4 : 6, '0');
     }

     /**
      * Determine if two character sequences contain the same characters.
      * @param cs1 first character sequence (must not be null)
      * @param cs2 second character sequence (must not be null)
      * @return true if both sequences have same length and same character sequence
      */
     public static boolean isSameSequence(CharSequence cs1, CharSequence cs2) {
         assert cs1 != null;
         assert cs2 != null;
         final int n = cs1.length();
         if (n != cs2.length()) {
             return false;
         }
         for (int i = 0; i < n; i++) {
             if (cs1.charAt(i) != cs2.charAt(i)) {
                 return false;
             }
         }
         return true;
     }

 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	/* $Id$ */

	package org.apache.fop.util;

	/**
	 * Utilities for classifying Unicode characters (in particular the various
	 * kinds of whitespace and explicit line breaks) and for rendering character
	 * codes as XML numeric character references or debug strings.
	 * <p>
	 * All methods are static; character arguments are passed as {@code int} so
	 * that code points outside the Basic Multilingual Plane can be represented.
	 */
	public class CharUtilities {

	    /**
	     * Character code used to signal a character boundary in
	     * inline content, such as an inline with borders and padding
	     * or a nested block object.
	     */
	    public static final char CODE_EOT = 0;

	    /** Character class: Unicode white space. */
	    public static final int UCWHITESPACE = 0;
	    /** Character class: line feed. */
	    public static final int LINEFEED = 1;
	    /** Character class: boundary between text runs. */
	    public static final int EOT = 2;
	    /** Character class: non-whitespace. */
	    public static final int NONWHITESPACE = 3;
	    /** Character class: XML whitespace. */
	    public static final int XMLWHITESPACE = 4;

	    /** null char */
	    public static final char NULL_CHAR = '\u0000';
	    /** linefeed character */
	    public static final char LINEFEED_CHAR = '\n';
	    /** carriage return */
	    public static final char CARRIAGE_RETURN = '\r';
	    /** normal tab */
	    public static final char TAB = '\t';
	    /** normal space */
	    public static final char SPACE = '\u0020';
	    /** non-breaking space */
	    public static final char NBSPACE = '\u00A0';
	    /** next line control character */
	    public static final char NEXT_LINE = '\u0085';
	    /** zero-width space */
	    public static final char ZERO_WIDTH_SPACE = '\u200B';
	    /** word joiner */
	    public static final char WORD_JOINER = '\u2060';
	    /** zero-width joiner */
	    public static final char ZERO_WIDTH_JOINER = '\u200D';
	    /** left-to-right mark */
	    public static final char LRM = '\u200E';
	    /**
	     * right-to-left mark (U+200F). This was previously declared as U+202F,
	     * which is NARROW NO-BREAK SPACE and not a directional mark.
	     */
	    public static final char RLM = '\u200F';
	    /** left-to-right embedding */
	    public static final char LRE = '\u202A';
	    /** right-to-left embedding */
	    public static final char RLE = '\u202B';
	    /** pop directional formatting */
	    public static final char PDF = '\u202C';
	    /** left-to-right override */
	    public static final char LRO = '\u202D';
	    /** right-to-left override */
	    public static final char RLO = '\u202E';
	    /** zero-width no-break space (= byte order mark) */
	    public static final char ZERO_WIDTH_NOBREAK_SPACE = '\uFEFF';
	    /** soft hyphen */
	    public static final char SOFT_HYPHEN = '\u00AD';
	    /** line-separator */
	    public static final char LINE_SEPARATOR = '\u2028';
	    /** paragraph-separator */
	    public static final char PARAGRAPH_SEPARATOR = '\u2029';
	    /** missing ideograph */
	    public static final char MISSING_IDEOGRAPH = '\u25A1';
	    /** Ideographic space */
	    public static final char IDEOGRAPHIC_SPACE = '\u3000';
	    /** Object replacement character */
	    public static final char OBJECT_REPLACEMENT_CHARACTER = '\uFFFC';
	    /** Unicode value indicating that the character is "not a character". */
	    public static final char NOT_A_CHARACTER = '\uFFFF';

	    /**
	     * Utility class: the constructor always throws, so neither this class
	     * nor a subclass can be instantiated.
	     * @throws UnsupportedOperationException always
	     */
	    protected CharUtilities() {
	        throw new UnsupportedOperationException();
	    }

	    /**
	     * Return the appropriate character class constant for the type
	     * of the passed character.
	     * @param c character to inspect
	     * @return one of {@link #EOT}, {@link #LINEFEED}, {@link #XMLWHITESPACE},
	     *         {@link #UCWHITESPACE} or {@link #NONWHITESPACE}
	     */
	    public static int classOf(int c) {
	        switch (c) {
	            case CODE_EOT:
	                return EOT;
	            case LINEFEED_CHAR:
	                return LINEFEED;
	            case SPACE:
	            case CARRIAGE_RETURN:
	            case TAB:
	                return XMLWHITESPACE;
	            default:
	                return isAnySpace(c) ? UCWHITESPACE : NONWHITESPACE;
	        }
	    }

	    /**
	     * Helper method to determine if the character is a
	     * space with normal behavior. Normal behavior means that
	     * it's not non-breaking.
	     * @param c character to inspect
	     * @return true if the character is a normal space or a fixed-width space
	     */
	    public static boolean isBreakableSpace(int c) {
	        return c == SPACE || isFixedWidthSpace(c);
	    }

	    /**
	     * Method to determine if the character is a zero-width space.
	     * @param c the character to check
	     * @return true if the character is a zero-width space
	     */
	    public static boolean isZeroWidthSpace(int c) {
	        return c == ZERO_WIDTH_SPACE           // 200Bh
	            || c == WORD_JOINER                // 2060h
	            || c == ZERO_WIDTH_NOBREAK_SPACE;  // FEFFh (also used as BOM)
	    }

	    /**
	     * Method to determine if the character is a (breakable) fixed-width space.
	     * @param c the character to check
	     * @return true if the character has a fixed-width
	     */
	    public static boolean isFixedWidthSpace(int c) {
	        // The contiguous range U+2000..U+200B covers:
	        //   U+2000 en quad             U+2001 em quad
	        //   U+2002 en space            U+2003 em space
	        //   U+2004 three-per-em space  U+2005 four-per-em space
	        //   U+2006 six-per-em space    U+2007 figure space
	        //   U+2008 punctuation space   U+2009 thin space
	        //   U+200A hair space          U+200B zero width space
	        // plus, separately, U+3000 ideographic space.
	        return (c >= '\u2000' && c <= ZERO_WIDTH_SPACE)
	                || c == IDEOGRAPHIC_SPACE;
	    }

	    /**
	     * Method to determine if the character is a nonbreaking
	     * space.
	     * @param c character to check
	     * @return true if the character is a non-breaking space
	     */
	    public static boolean isNonBreakableSpace(int c) {
	        // NOTE(review): U+3000 is also accepted by isFixedWidthSpace(), so it
	        // is reported as both breakable and non-breakable; kept as-is because
	        // callers may depend on it.
	        return c == NBSPACE                    // no-break space
	            || c == '\u202F'                   // narrow no-break space
	            || c == IDEOGRAPHIC_SPACE          // ideographic space
	            || c == WORD_JOINER                // word joiner
	            || c == ZERO_WIDTH_NOBREAK_SPACE;  // zero width no-break space
	    }

	    /**
	     * Method to determine if the character is an adjustable
	     * space.
	     * @param c character to check
	     * @return true if the character is adjustable
	     */
	    public static boolean isAdjustableSpace(int c) {
	        //TODO: are there other kinds of adjustable spaces?
	        return c == SPACE       // normal space
	            || c == NBSPACE;    // no-break space
	    }

	    /**
	     * Determines if the character represents any kind of space.
	     * @param c character to check
	     * @return true if the character is a breakable or a non-breakable space
	     */
	    public static boolean isAnySpace(int c) {
	        return isBreakableSpace(c) || isNonBreakableSpace(c);
	    }

	    /**
	     * Indicates whether a character is classified as "Alphabetic" by the Unicode standard.
	     * Only the general categories Lu, Ll, Lt, Lm, Lo and Nl are considered; the
	     * Other_Alphabetic property is not consulted.
	     * @param c the character (a Unicode code point, may be supplementary)
	     * @return true if the character is "Alphabetic"
	     */
	    public static boolean isAlphabetic(int c) {
	        //http://www.unicode.org/Public/UNIDATA/UCD.html#Alphabetic
	        //Generated from: Other_Alphabetic + Lu + Ll + Lt + Lm + Lo + Nl
	        // Use the int overload: casting to char would truncate supplementary
	        // code points and classify an unrelated BMP character instead.
	        int generalCategory = Character.getType(c);
	        switch (generalCategory) {
	            case Character.UPPERCASE_LETTER: //Lu
	            case Character.LOWERCASE_LETTER: //Ll
	            case Character.TITLECASE_LETTER: //Lt
	            case Character.MODIFIER_LETTER: //Lm
	            case Character.OTHER_LETTER: //Lo
	            case Character.LETTER_NUMBER: //Nl
	                return true;
	            default:
	                //TODO if (ch in Other_Alphabetic) return true; (Probably need ICU4J for that)
	                //Other_Alphabetic contains mostly more exotic characters
	                return false;
	        }
	    }

	    /**
	     * Indicates whether the given character is an explicit break-character
	     * @param c the character to check
	     * @return true if the character represents an explicit break
	     */
	    public static boolean isExplicitBreak(int c) {
	        return c == LINEFEED_CHAR
	            || c == CARRIAGE_RETURN
	            || c == NEXT_LINE
	            || c == LINE_SEPARATOR
	            || c == PARAGRAPH_SEPARATOR;
	    }

	    /**
	     * Convert a single unicode scalar value to an XML numeric character
	     * reference. If in the BMP, four digits are used, otherwise 6 digits are used.
	     * @param c a unicode scalar value
	     * @return a string representing a numeric character reference
	     */
	    public static String charToNCRef(int c) {
	        final int nDigits = (c > 0xFFFF) ? 6 : 4;
	        final char[] hex = new char[nDigits];
	        // Fill from the least significant nibble backwards.
	        for (int i = nDigits - 1; i >= 0; i--, c >>= 4) {
	            hex[i] = "0123456789ABCDEF".charAt(c & 0xF);
	        }
	        return "&#x" + new String(hex) + ";";
	    }

	    /**
	     * Convert a string to a sequence of ASCII or XML numeric character references.
	     * Printable ASCII characters are copied, except that {@code <}, {@code >}
	     * and {@code &} are replaced by their predefined entities. Every other
	     * character is written as a numeric character reference; a surrogate pair
	     * yields a single reference to the supplementary code point.
	     * @param s a java string (encoded in UTF-16), may be null
	     * @return a string representing a sequence of numeric character reference or
	     * ASCII characters; empty if {@code s} is null
	     */
	    public static String toNCRefs(String s) {
	        if (s == null) {
	            return "";
	        }
	        final int n = s.length();
	        StringBuilder sb = new StringBuilder(n);
	        int i = 0;
	        while (i < n) {
	            // Decode by code point so surrogate pairs are not split into two
	            // (XML-illegal) surrogate references.
	            int cp = s.codePointAt(i);
	            i += Character.charCount(cp);
	            if (cp < 32 || cp >= 127) {
	                sb.append(charToNCRef(cp));
	            } else if (cp == '<') {
	                sb.append("&lt;");
	            } else if (cp == '>') {
	                sb.append("&gt;");
	            } else if (cp == '&') {
	                sb.append("&amp;");
	            } else {
	                sb.append((char) cp);
	            }
	        }
	        return sb.toString();
	    }

	    /**
	     * Pad a string S on left out to width W using padding character PAD.
	     * A string already at least {@code width} long is returned unchanged.
	     * @param s string to pad
	     * @param width width of field to add padding
	     * @param pad character to use for padding
	     * @return padded string
	     */
	    public static String padLeft(String s, int width, char pad) {
	        StringBuilder sb = new StringBuilder();
	        for (int i = s.length(); i < width; i++) {
	            sb.append(pad);
	        }
	        return sb.append(s).toString();
	    }

	    /**
	     * Format character for debugging output, which is prefixed with "0x", padded left with '0'
	     * and either 4 or 6 hex characters in width according to whether it is in the BMP or not.
	     * @param c character code
	     * @return formatted character string, or "!NOT A CHARACTER!" if {@code c}
	     *         is outside the range 0..0x10FFFF
	     */
	    public static String format(int c) {
	        // Negative values are rejected as well; formatting them would
	        // produce strings such as "0x00-1".
	        if (c < 0 || c > Character.MAX_CODE_POINT) {
	            return "!NOT A CHARACTER!";
	        }
	        return "0x" + padLeft(Integer.toString(c, 16), (c < 65536) ? 4 : 6, '0');
	    }

	    /**
	     * Determine if two character sequences contain the same characters.
	     * @param cs1 first character sequence (must not be null)
	     * @param cs2 second character sequence (must not be null)
	     * @return true if both sequences have same length and same character sequence
	     */
	    public static boolean isSameSequence(CharSequence cs1, CharSequence cs2) {
	        assert cs1 != null;
	        assert cs2 != null;
	        final int n = cs1.length();
	        if (n != cs2.length()) {
	            return false;
	        }
	        for (int i = 0; i < n; i++) {
	            if (cs1.charAt(i) != cs2.charAt(i)) {
	                return false;
	            }
	        }
	        return true;
	    }

	}