src/java/org/apache/fop/util/CharUtilities.java - xmlgraphics-fop - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 /* $Id$ */

 package org.apache.fop.util;

 /**
  * Static helpers that classify single characters: the different kinds of
  * Unicode white space, explicit line breaks and alphabetic characters.
  */
 public class CharUtilities {

     /**
      * Character code that marks a character boundary in inline content,
      * for example an inline with borders and padding or a nested block
      * object.
      */
     public static final char CODE_EOT = 0;

     /** Character class reported by {@link #classOf(char)}: Unicode white space. */
     public static final int UCWHITESPACE = 0;
     /** Character class reported by {@link #classOf(char)}: line feed. */
     public static final int LINEFEED = 1;
     /** Character class reported by {@link #classOf(char)}: boundary between text runs. */
     public static final int EOT = 2;
     /** Character class reported by {@link #classOf(char)}: anything that is not white space. */
     public static final int NONWHITESPACE = 3;
     /** Character class reported by {@link #classOf(char)}: XML white space. */
     public static final int XMLWHITESPACE = 4;


     /** The null character (U+0000). */
     public static final char NULL_CHAR = '\u0000';
     /** The line feed character. */
     public static final char LINEFEED_CHAR = '\n';
     /** The carriage return character. */
     public static final char CARRIAGE_RETURN = '\r';
     /** The ordinary (horizontal) tab character. */
     public static final char TAB = '\t';
     /** The ordinary space (U+0020). */
     public static final char SPACE = '\u0020';
     /** The no-break space (U+00A0). */
     public static final char NBSPACE = '\u00A0';
     /** The "next line" control character (U+0085). */
     public static final char NEXT_LINE = '\u0085';
     /** The zero-width space (U+200B). */
     public static final char ZERO_WIDTH_SPACE = '\u200B';
     /** The word joiner (U+2060). */
     public static final char WORD_JOINER = '\u2060';
     /** The zero-width joiner (U+200D). */
     public static final char ZERO_WIDTH_JOINER = '\u200D';
     /** The zero-width no-break space (U+FEFF), also used as byte order mark. */
     public static final char ZERO_WIDTH_NOBREAK_SPACE = '\uFEFF';
     /** The soft hyphen (U+00AD). */
     public static final char SOFT_HYPHEN = '\u00AD';
     /** The line separator (U+2028). */
     public static final char LINE_SEPARATOR = '\u2028';
     /** The paragraph separator (U+2029). */
     public static final char PARAGRAPH_SEPARATOR = '\u2029';
     /** The character used for a missing ideograph (U+25A1). */
     public static final char MISSING_IDEOGRAPH = '\u25A1';
     /** The ideographic space (U+3000). */
     public static final char IDEOGRAPHIC_SPACE = '\u3000';
     /** Unicode value indicating that the character is "not a character" (U+FFFF). */
     public static final char NOT_A_CHARACTER = '\uFFFF';


     /**
      * Utility class: the constructor always throws, so neither this class
      * nor a subclass can be instantiated.
      */
     protected CharUtilities() {
         throw new UnsupportedOperationException();
     }

     /**
      * Maps a character onto one of the character class constants
      * ({@link #EOT}, {@link #LINEFEED}, {@link #XMLWHITESPACE},
      * {@link #UCWHITESPACE} or {@link #NONWHITESPACE}).
      * @param c character to inspect
      * @return the determined character class
      */
     public static int classOf(char c) {
         if (c == CODE_EOT) {
             return EOT;
         }
         if (c == LINEFEED_CHAR) {
             return LINEFEED;
         }
         if (c == SPACE || c == CARRIAGE_RETURN || c == TAB) {
             return XMLWHITESPACE;
         }
         // Everything else is either some other Unicode space or no space at all.
         if (isAnySpace(c)) {
             return UCWHITESPACE;
         }
         return NONWHITESPACE;
     }


     /**
      * Tells whether the character is a space with normal behavior, i.e.
      * the ordinary space or one of the fixed-width spaces.
      * @param c character to inspect
      * @return True if the character is a normal space
      */
     public static boolean isBreakableSpace(char c) {
         if (c == SPACE) {
             return true;
         }
         return isFixedWidthSpace(c);
     }

     /**
      * Tells whether the character is a zero-width space: the zero-width
      * space itself, the word joiner or the zero-width no-break space.
      * @param c the character to check
      * @return true if the character is a zero-width space
      */
     public static boolean isZeroWidthSpace(char c) {
         switch (c) {
             case ZERO_WIDTH_SPACE:          // 200Bh
             case WORD_JOINER:               // 2060h
             case ZERO_WIDTH_NOBREAK_SPACE:  // FEFFh (also used as BOM)
                 return true;
             default:
                 return false;
         }
     }

     /**
      * Tells whether the character is a (breakable) fixed-width space.
      * Accepted are the contiguous range U+2000 (en quad) through U+200B
      * (zero width space) and U+3000 (ideographic space).
      * @param c the character to check
      * @return true if the character has a fixed-width
      */
     public static boolean isFixedWidthSpace(char c) {
         if (c == IDEOGRAPHIC_SPACE) {
             return true;
         }
         // en quad, em quad, en space, em space, three-/four-/six-per-em space,
         // figure space, punctuation space, thin space, hair space, zero width space
         return c >= '\u2000' && c <= ZERO_WIDTH_SPACE;
     }

     /**
      * Tells whether the character is treated as a non-breaking space.
      * @param c character to check
      * @return True if the character is a nbsp
      */
     public static boolean isNonBreakableSpace(char c) {
         switch (c) {
             case NBSPACE:                   // no-break space
             case '\u202F':                  // narrow no-break space
             case IDEOGRAPHIC_SPACE:         // ideographic space
             case WORD_JOINER:               // word joiner
             case ZERO_WIDTH_NOBREAK_SPACE:  // zero width no-break space
                 return true;
             default:
                 return false;
         }
     }

     /**
      * Tells whether the character is an adjustable space; only the
      * ordinary space and the no-break space qualify.
      * @param c character to check
      * @return True if the character is adjustable
      */
     public static boolean isAdjustableSpace(char c) {
         //TODO: are there other kinds of adjustable spaces?
         return c == SPACE || c == NBSPACE;
     }

     /**
      * Tells whether the character is any kind of space, breakable or not.
      * @param c character to check
      * @return True if the character represents any kind of space
      */
     public static boolean isAnySpace(char c) {
         return isNonBreakableSpace(c) || isBreakableSpace(c);
     }

     /**
      * Approximates the Unicode "Alphabetic" property using the general
      * category of the character (Lu, Ll, Lt, Lm, Lo and Nl). Characters
      * that are alphabetic only through Other_Alphabetic are not recognized.
      * @param ch the character
      * @return true if the character is "Alphabetic"
      */
     public static boolean isAlphabetic(char ch) {
         //http://www.unicode.org/Public/UNIDATA/UCD.html#Alphabetic
         //Generated from: Other_Alphabetic + Lu + Ll + Lt + Lm + Lo + Nl
         //TODO if (ch in Other_Alphabetic) return true; (Probably need ICU4J for that)
         //Other_Alphabetic contains mostly more exotic characters
         final int category = Character.getType(ch);
         return category == Character.UPPERCASE_LETTER      //Lu
                 || category == Character.LOWERCASE_LETTER  //Ll
                 || category == Character.TITLECASE_LETTER  //Lt
                 || category == Character.MODIFIER_LETTER   //Lm
                 || category == Character.OTHER_LETTER      //Lo
                 || category == Character.LETTER_NUMBER;    //Nl
     }

     /**
      * Tells whether the character is an explicit break character: line
      * feed, carriage return, next line, line separator or paragraph
      * separator.
      * @param ch    the character to check
      * @return  true if the character represents an explicit break
      */
     public static boolean isExplicitBreak(char ch) {
         switch (ch) {
             case LINEFEED_CHAR:
             case CARRIAGE_RETURN:
             case NEXT_LINE:
             case LINE_SEPARATOR:
             case PARAGRAPH_SEPARATOR:
                 return true;
             default:
                 return false;
         }
     }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	/* $Id$ */

	package org.apache.fop.util;

	/**
	 * This class provides utilities to distinguish various kinds of Unicode
	 * whitespace, explicit line breaks and alphabetic characters.
	 */
	public class CharUtilities {

	    /**
	     * Character code used to signal a character boundary in
	     * inline content, such as an inline with borders and padding
	     * or a nested block object.
	     */
	    public static final char CODE_EOT = 0;

	    /**
	     * Character class: Unicode white space
	     */
	    public static final int UCWHITESPACE = 0;
	    /**
	     * Character class: Line feed
	     */
	    public static final int LINEFEED = 1;
	    /**
	     * Character class: Boundary between text runs
	     */
	    public static final int EOT = 2;
	    /**
	     * Character class: non-whitespace
	     */
	    public static final int NONWHITESPACE = 3;
	    /**
	     * Character class: XML whitespace
	     */
	    public static final int XMLWHITESPACE = 4;


	    /** null char */
	    public static final char NULL_CHAR = '\u0000';
	    /** linefeed character */
	    public static final char LINEFEED_CHAR = '\n';
	    /** carriage return */
	    public static final char CARRIAGE_RETURN = '\r';
	    /** normal tab */
	    public static final char TAB = '\t';
	    /** normal space */
	    public static final char SPACE = '\u0020';
	    /** non-breaking space */
	    public static final char NBSPACE = '\u00A0';
	    /** next line control character */
	    public static final char NEXT_LINE = '\u0085';
	    /** zero-width space */
	    public static final char ZERO_WIDTH_SPACE = '\u200B';
	    /** word joiner */
	    public static final char WORD_JOINER = '\u2060';
	    /** zero-width joiner */
	    public static final char ZERO_WIDTH_JOINER = '\u200D';
	    /** zero-width no-break space (= byte order mark) */
	    public static final char ZERO_WIDTH_NOBREAK_SPACE = '\uFEFF';
	    /** soft hyphen */
	    public static final char SOFT_HYPHEN = '\u00AD';
	    /** line-separator */
	    public static final char LINE_SEPARATOR = '\u2028';
	    /** paragraph-separator */
	    public static final char PARAGRAPH_SEPARATOR = '\u2029';
	    /** missing ideograph */
	    public static final char MISSING_IDEOGRAPH = '\u25A1';
	    /** Ideographic space */
	    public static final char IDEOGRAPHIC_SPACE = '\u3000';
	    /** Unicode value indicating that the character is "not a character". */
	    public static final char NOT_A_CHARACTER = '\uFFFF';


	    /**
	     * Utility class: the constructor always throws, which prevents
	     * instantiation even when the class is subclassed.
	     */
	    protected CharUtilities() {
	        throw new UnsupportedOperationException();
	    }

	    /**
	     * Return the appropriate character class constant for the type
	     * of the passed character.
	     * @param c character to inspect
	     * @return the determined character class: {@link #EOT},
	     *         {@link #LINEFEED}, {@link #XMLWHITESPACE},
	     *         {@link #UCWHITESPACE} or {@link #NONWHITESPACE}
	     */
	    public static int classOf(char c) {
	        switch (c) {
	            case CODE_EOT:
	                return EOT;
	            case LINEFEED_CHAR:
	                return LINEFEED;
	            case SPACE:
	            case CARRIAGE_RETURN:
	            case TAB:
	                return XMLWHITESPACE;
	            default:
	                // Remaining characters: some other Unicode space, or no space at all.
	                return isAnySpace(c) ? UCWHITESPACE : NONWHITESPACE;
	        }
	    }


	    /**
	     * Helper method to determine if the character is a
	     * space with normal behavior. Normal behavior means that
	     * it's not non-breaking.
	     * @param c character to inspect
	     * @return True if the character is a normal space
	     */
	    public static boolean isBreakableSpace(char c) {
	        return (c == SPACE || isFixedWidthSpace(c));
	    }

	    /**
	     * Method to determine if the character is a zero-width space.
	     * @param c the character to check
	     * @return true if the character is a zero-width space
	     */
	    public static boolean isZeroWidthSpace(char c) {
	        return c == ZERO_WIDTH_SPACE           // 200Bh
	            || c == WORD_JOINER                // 2060h
	            || c == ZERO_WIDTH_NOBREAK_SPACE;  // FEFFh (also used as BOM)
	    }

	    /**
	     * Method to determine if the character is a (breakable) fixed-width space.
	     * The accepted characters are U+2000 (en quad), U+2001 (em quad),
	     * U+2002 (en space), U+2003 (em space), U+2004 (three-per-em space),
	     * U+2005 (four-per-em space), U+2006 (six-per-em space),
	     * U+2007 (figure space), U+2008 (punctuation space), U+2009 (thin space),
	     * U+200A (hair space), U+200B (zero width space) and
	     * U+3000 (ideographic space).
	     * @param c the character to check
	     * @return true if the character has a fixed-width
	     */
	    public static boolean isFixedWidthSpace(char c) {
	        return (c >= '\u2000' && c <= '\u200B')
	                || c == '\u3000';
	    }

	    /**
	     * Method to determine if the character is a nonbreaking
	     * space.
	     * @param c character to check
	     * @return True if the character is a nbsp
	     */
	    public static boolean isNonBreakableSpace(char c) {
	        return
	            (c == NBSPACE       // no-break space
	            || c == '\u202F'    // narrow no-break space
	            || c == '\u3000'    // ideographic space
	            || c == WORD_JOINER // word joiner
	            || c == ZERO_WIDTH_NOBREAK_SPACE);  // zero width no-break space
	    }

	    /**
	     * Method to determine if the character is an adjustable
	     * space.
	     * @param c character to check
	     * @return True if the character is adjustable
	     */
	    public static boolean isAdjustableSpace(char c) {
	        //TODO: are there other kinds of adjustable spaces?
	        return
	            (c == '\u0020'    // normal space
	            || c == NBSPACE); // no-break space
	    }

	    /**
	     * Determines if the character represents any kind of space.
	     * @param c character to check
	     * @return True if the character represents any kind of space
	     */
	    public static boolean isAnySpace(char c) {
	        return (isBreakableSpace(c) || isNonBreakableSpace(c));
	    }

	    /**
	     * Indicates whether a character is classified as "Alphabetic" by the Unicode standard.
	     * The check is based on the general category only (Lu, Ll, Lt, Lm, Lo, Nl);
	     * characters that are alphabetic solely through Other_Alphabetic are reported
	     * as not alphabetic.
	     * @param ch the character
	     * @return true if the character is "Alphabetic"
	     */
	    public static boolean isAlphabetic(char ch) {
	        //http://www.unicode.org/Public/UNIDATA/UCD.html#Alphabetic
	        //Generated from: Other_Alphabetic + Lu + Ll + Lt + Lm + Lo + Nl
	        int generalCategory = Character.getType(ch);
	        switch (generalCategory) {
	            case Character.UPPERCASE_LETTER: //Lu
	            case Character.LOWERCASE_LETTER: //Ll
	            case Character.TITLECASE_LETTER: //Lt
	            case Character.MODIFIER_LETTER: //Lm
	            case Character.OTHER_LETTER: //Lo
	            case Character.LETTER_NUMBER: //Nl
	                return true;
	            default:
	                //TODO if (ch in Other_Alphabetic) return true; (Probably need ICU4J for that)
	                //Other_Alphabetic contains mostly more exotic characters
	                return false;
	        }
	    }

	    /**
	     * Indicates whether the given character is an explicit break-character
	     * @param ch    the character to check
	     * @return  true if the character represents an explicit break
	     */
	    public static boolean isExplicitBreak(char ch) {
	        return (ch == LINEFEED_CHAR
	            || ch == CARRIAGE_RETURN
	            || ch == NEXT_LINE
	            || ch == LINE_SEPARATOR
	            || ch == PARAGRAPH_SEPARATOR);
	    }
	}