| /* |
| * $Id$ |
| * |
| * The Apache Software License, Version 1.1 |
| * |
| * |
| * Copyright (c) 2000 The Apache Software Foundation. All rights |
| * reserved. |
| * |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions |
| * are met: |
| * |
| * 1. Redistributions of source code must retain the above copyright |
| * notice, this list of conditions and the following disclaimer. |
| * |
| * 2. Redistributions in binary form must reproduce the above copyright |
| * notice, this list of conditions and the following disclaimer in |
| * the documentation and/or other materials provided with the |
| * distribution. |
| * |
| * 3. The end-user documentation included with the redistribution, |
| * if any, must include the following acknowledgment: |
| * "This product includes software developed by the |
| * Apache Software Foundation (http://www.apache.org/)." |
| * Alternately, this acknowledgment may appear in the software itself, |
| * if and wherever such third-party acknowledgments normally appear. |
| * |
| * 4. The names "Crimson" and "Apache Software Foundation" must |
| * not be used to endorse or promote products derived from this |
| * software without prior written permission. For written |
| * permission, please contact apache@apache.org. |
| * |
| * 5. Products derived from this software may not be called "Apache", |
| * nor may "Apache" appear in their name, without prior written |
| * permission of the Apache Software Foundation. |
| * |
| * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED |
| * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES |
| * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
| * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR |
| * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
| * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
| * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF |
| * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND |
| * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, |
| * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT |
| * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
| * SUCH DAMAGE. |
| * ==================================================================== |
| * |
| * This software consists of voluntary contributions made by many |
| * individuals on behalf of the Apache Software Foundation and was |
| * originally based on software copyright (c) 1999, Sun Microsystems, Inc., |
| * http://www.sun.com. For more information on the Apache Software |
| * Foundation, please see <http://www.apache.org/>. |
| */ |
| |
| package org.apache.xerces.tree; |
| |
| |
/**
 * Methods in this class are used to determine whether characters may
 * appear in certain roles in XML documents.  Such methods are used
 * both to parse and to create such documents.
 *
 * <P> All members are static; the class cannot be instantiated.
 *
 * @version 1.8
 * @author David Brownell
 */
public final class XmlChars {

    // can't construct instances
    private XmlChars() { }

    /**
     * Returns true if the argument, a UCS-4 character code, is valid in
     * XML documents.  Unicode characters fit into the low sixteen
     * bits of a UCS-4 character, and pairs of Unicode <em>surrogate
     * characters</em> can be combined to encode UCS-4 characters in
     * documents containing only Unicode.  (The <code>char</code> datatype
     * in the Java Programming Language represents Unicode characters,
     * including unpaired surrogates.)
     *
     * <P> In XML, UCS-4 characters can also be encoded by the use of
     * <em>character references</em> such as <b>&amp;#x12345678;</b>, which
     * happens to refer to a character that is disallowed in XML documents.
     * UCS-4 characters allowed in XML documents can be expressed with
     * one or two Unicode characters.
     *
     * @param ucs4char The 32-bit UCS-4 character being tested.
     * @return true if the code is tab, line feed, carriage return, or lies
     *  in one of the ranges 0x20-0xD7FF, 0xE000-0xFFFD, 0x10000-0x10FFFF;
     *  false otherwise (including for negative values).
     */
    public static boolean isChar(int ucs4char) {
        // [2] Char ::= #x0009 | #x000A | #x000D
        //            | [#x0020-#xD7FF]
        //              ... surrogates excluded!
        //            | [#xE000-#xFFFD]
        //            | [#x10000-#x10ffff]
        return (ucs4char >= 0x0020 && ucs4char <= 0xD7FF)
            || ucs4char == 0x000A || ucs4char == 0x0009
            || ucs4char == 0x000D
            || (ucs4char >= 0xE000 && ucs4char <= 0xFFFD)
            || (ucs4char >= 0x10000 && ucs4char <= 0x10ffff);
    }

    /**
     * Returns true if the character is allowed to be a non-initial
     * character in names according to the XML recommendation.
     *
     * @param c the character being tested.
     * @return true if <em>c</em> passes the letter/digit/combining-mark
     *  test, is one of <code>. - _ :</code>, or is an extender.
     * @see #isNCNameChar
     * @see #isLetter
     */
    public static boolean isNameChar(char c) {
        // [4] NameChar ::= Letter | Digit | '.' | '-' | '_' | ':'
        //                | CombiningChar | Extender

        if (isLetter2(c)) {
            return true;
        }
        // isLetter2 has already dealt with the common terminator '>',
        // which matches none of the tests below either.
        return c == '.' || c == '-' || c == '_' || c == ':'
            || isExtender(c);
    }

    /**
     * Returns true if the character is allowed to be a non-initial
     * character in unscoped names according to the rules of the XML
     * Namespaces proposed recommendation.  Except for precluding
     * the colon (used to separate names from their scopes) these
     * characters are just as allowed by the XML recommendation.
     *
     * @param c the character being tested.
     * @return true if <em>c</em> is not a colon and satisfies
     *  {@link #isNameChar}.
     * @see #isNameChar
     * @see #isLetter
     */
    public static boolean isNCNameChar(char c) {
        // [NC 5] NCNameChar ::= Letter | Digit | '.' | '-' | '_'
        //                     | CombiningChar | Extender
        return c != ':' && isNameChar(c);
    }

    /**
     * Returns true if the character is allowed where XML supports
     * whitespace characters, false otherwise.
     *
     * @param c the character being tested.
     * @return true for space, tab, line feed and carriage return only.
     */
    public static boolean isSpace(char c) {
        return c == ' ' || c == '\t' || c == '\n' || c == '\r';
    }


    /*
     * NOTE:  java.lang.Character.getType() values are:
     *
     * UNASSIGNED                    = 0,
     *
     * UPPERCASE_LETTER            = 1,    // Lu
     * LOWERCASE_LETTER            = 2,    // Ll
     * TITLECASE_LETTER            = 3,    // Lt
     * MODIFIER_LETTER             = 4,    // Lm
     * OTHER_LETTER                = 5,    // Lo
     * NON_SPACING_MARK            = 6,    // Mn
     * ENCLOSING_MARK              = 7,    // Me
     * COMBINING_SPACING_MARK      = 8,    // Mc
     * DECIMAL_DIGIT_NUMBER        = 9,    // Nd
     * LETTER_NUMBER               = 10,   // Nl
     * OTHER_NUMBER                = 11,   // No
     * SPACE_SEPARATOR             = 12,   // Zs
     * LINE_SEPARATOR              = 13,   // Zl
     * PARAGRAPH_SEPARATOR         = 14,   // Zp
     * CONTROL                     = 15,   // Cc
     * FORMAT                      = 16,   // Cf
     *                         // 17 reserved for proposed Ci category
     * PRIVATE_USE                 = 18,   // Co
     * SURROGATE                   = 19,   // Cs
     * DASH_PUNCTUATION            = 20,   // Pd
     * START_PUNCTUATION           = 21,   // Ps
     * END_PUNCTUATION             = 22,   // Pe
     * CONNECTOR_PUNCTUATION       = 23,   // Pc
     * OTHER_PUNCTUATION           = 24,   // Po
     * MATH_SYMBOL                 = 25,   // Sm
     * CURRENCY_SYMBOL             = 26,   // Sc
     * MODIFIER_SYMBOL             = 27,   // Sk
     * OTHER_SYMBOL                = 28;   // So
     */

    /**
     * Returns true if the character is an XML "letter".  XML Names must
     * start with Letters or a few other characters, but other characters
     * in names must only satisfy the <em>isNameChar</em> predicate.
     *
     * @param c the character being tested.
     * @return true if <em>c</em> is treated as a letter by this class.
     * @see #isNameChar
     * @see #isNCNameChar
     */
    public static boolean isLetter(char c) {
        // [84] Letter ::= BaseChar | Ideographic
        // [85] BaseChar ::= ... too much to repeat
        // [86] Ideographic ::= ... too much to repeat

        //
        // Optimize the typical case:  ASCII letters, and the '/' that
        // is rejected early between the two range tests.
        //
        if (isAsciiLetter(c)) {
            return true;
        }
        if (c == '/') {
            return false;
        }

        //
        // Since the tables are too ridiculous to use in code,
        // we're using the footnotes here to drive this test.
        //
        switch (Character.getType(c)) {
            // app. B footnote says these are 'name start'
            // chars' ...
            case Character.LOWERCASE_LETTER:        // Ll
            case Character.UPPERCASE_LETTER:        // Lu
            case Character.OTHER_LETTER:            // Lo
            case Character.TITLECASE_LETTER:        // Lt
            case Character.LETTER_NUMBER:           // Nl
                // OK, here we just have some exceptions to check...
                return passesCategoryExceptions(c);

            default:
                // check for some exceptions:  these are "alphabetic"
                return (c >= 0x02bb && c <= 0x02c1)
                    || c == 0x0559 || c == 0x06e5 || c == 0x06e6;
        }
    }

    // True for the unaccented ASCII letters 'a'-'z' and 'A'-'Z'; this is
    // the fast path shared by isLetter and isLetter2.
    private static boolean isAsciiLetter(char c) {
        return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
    }

    // Exception test applied by isLetter and isLetter2 once the Unicode
    // category has been accepted:  the character must not be a
    // "compatibility" character and, per "5.14 of Unicode", must not be
    // one of the combiners 0x20dd-0x20e0.
    private static boolean passesCategoryExceptions(char c) {
        return !isCompatibilityChar(c)
            && !(c >= 0x20dd && c <= 0x20e0);
    }

    //
    // XML 1.0 discourages "compatibility" characters in names; these
    // were defined to permit passing through some information stored in
    // older non-Unicode character sets.  These always have alternative
    // representations in Unicode, e.g. using combining chars.
    //
    private static boolean isCompatibilityChar(char c) {
        // the numerous comparisons here seem unavoidable,
        // but switching on the high byte of the character reduces
        // the number which must actually be executed.

        switch ((c >> 8) & 0x0ff) {
            case 0x00:
                // ISO Latin/1 has a few compatibility characters
                return c == 0x00aa || c == 0x00b5 || c == 0x00ba;

            case 0x01:
                // as do Latin Extended A and (parts of) B
                return (c >= 0x0132 && c <= 0x0133)
                    || (c >= 0x013f && c <= 0x0140)
                    || c == 0x0149
                    || c == 0x017f
                    || (c >= 0x01c4 && c <= 0x01cc)
                    || (c >= 0x01f1 && c <= 0x01f3);

            case 0x02:
                // some spacing modifiers
                return (c >= 0x02b0 && c <= 0x02b8)
                    || (c >= 0x02e0 && c <= 0x02e4);

            case 0x03:
                return c == 0x037a;                 // Greek

            case 0x05:
                return c == 0x0587;                 // Armenian

            case 0x0e:
                return c >= 0x0edc && c <= 0x0edd;  // Laotian

            case 0x11:
                // big chunks of Hangul Jamo are all "compatibility"
                return c == 0x1101
                    || c == 0x1104
                    || c == 0x1108
                    || c == 0x110a
                    || c == 0x110d
                    || (c >= 0x1113 && c <= 0x113b)
                    || c == 0x113d
                    || c == 0x113f
                    || (c >= 0x1141 && c <= 0x114b)
                    || c == 0x114d
                    || c == 0x114f
                    || (c >= 0x1151 && c <= 0x1153)
                    || (c >= 0x1156 && c <= 0x1158)
                    || c == 0x1162
                    || c == 0x1164
                    || c == 0x1166
                    || c == 0x1168
                    || (c >= 0x116a && c <= 0x116c)
                    || (c >= 0x116f && c <= 0x1171)
                    || c == 0x1174
                    || (c >= 0x1176 && c <= 0x119d)
                    || (c >= 0x119f && c <= 0x11a2)
                    || (c >= 0x11a9 && c <= 0x11aa)
                    || (c >= 0x11ac && c <= 0x11ad)
                    || (c >= 0x11b0 && c <= 0x11b6)
                    || c == 0x11b9
                    || c == 0x11bb
                    || (c >= 0x11c3 && c <= 0x11ea)
                    || (c >= 0x11ec && c <= 0x11ef)
                    || (c >= 0x11f1 && c <= 0x11f8);

            case 0x20:
                return c == 0x207f;                 // superscript

            case 0x21:
                return
                    // various letterlike symbols
                    c == 0x2102
                    || c == 0x2107
                    || (c >= 0x210a && c <= 0x2113)
                    || c == 0x2115
                    || (c >= 0x2118 && c <= 0x211d)
                    || c == 0x2124
                    || c == 0x2128
                    || (c >= 0x212c && c <= 0x212d)
                    || (c >= 0x212f && c <= 0x2138)

                    // most Roman numerals (less 1K, 5K, 10K)
                    || (c >= 0x2160 && c <= 0x217f);

            case 0x30:
                // some Hiragana
                return c >= 0x309b && c <= 0x309c;

            case 0x31:
                // all Hangul Compatibility Jamo
                return c >= 0x3131 && c <= 0x318e;

            case 0xf9:
            case 0xfa:
            case 0xfb:
            case 0xfc:
            case 0xfd:
            case 0xfe:
            case 0xff:
                // the whole "compatibility" area is for that purpose!
                return true;

            default:
                // most of Unicode isn't flagged as being for compatibility
                return false;
        }
    }

    // guts of isNameChar/isNCNameChar:  accepts the categories isLetter
    // accepts plus combining marks, modifier letters and decimal digits.
    private static boolean isLetter2(char c) {
        // [84] Letter ::= BaseChar | Ideographic
        // [85] BaseChar ::= ... too much to repeat
        // [86] Ideographic ::= ... too much to repeat
        // [87] CombiningChar ::= ... too much to repeat

        //
        // Optimize the typical case:  ASCII letters, and the '>' that
        // is rejected early between the two range tests.
        //
        if (isAsciiLetter(c)) {
            return true;
        }
        if (c == '>') {
            return false;
        }

        //
        // Since the tables are too ridiculous to use in code,
        // we're using the footnotes here to drive this test.
        //
        switch (Character.getType(c)) {
            // app. B footnote says these are 'name start'
            // chars' ...
            case Character.LOWERCASE_LETTER:        // Ll
            case Character.UPPERCASE_LETTER:        // Lu
            case Character.OTHER_LETTER:            // Lo
            case Character.TITLECASE_LETTER:        // Lt
            case Character.LETTER_NUMBER:           // Nl
            // ... and these are name characters 'other
            // than name start characters'
            case Character.COMBINING_SPACING_MARK:  // Mc
            case Character.ENCLOSING_MARK:          // Me
            case Character.NON_SPACING_MARK:        // Mn
            case Character.MODIFIER_LETTER:         // Lm
            case Character.DECIMAL_DIGIT_NUMBER:    // Nd
                // OK, here we just have some exceptions to check...
                return passesCategoryExceptions(c);

            default:
                // added a character ...
                return c == 0x0387;
        }
    }

    // NOTE: no other method in this class calls isDigit; decimal digits
    // reach isNameChar through the Nd case in isLetter2 instead.
    private static boolean isDigit(char c) {
        // [88] Digit ::= ...

        //
        // java.lang.Character.isDigit is correct from the XML point
        // of view except that it allows "fullwidth" digits.
        //
        return Character.isDigit(c)
            && !(c >= 0xff10 && c <= 0xff19);
    }

    // True if the character is one of the extenders enumerated below.
    private static boolean isExtender(char c) {
        // [89] Extender ::= ...
        return c == 0x00b7 || c == 0x02d0 || c == 0x02d1 || c == 0x0387
            || c == 0x0640 || c == 0x0e46 || c == 0x0ec6
            || c == 0x3005 || (c >= 0x3031 && c <= 0x3035)
            || (c >= 0x309d && c <= 0x309e)
            || (c >= 0x30fc && c <= 0x30fe);
    }
}