// core/sis-utility/src/main/java/org/apache/sis/util/Characters.java - sis - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.sis.util;

 import org.opengis.metadata.citation.Citation;  // For javadoc.
 import org.apache.sis.internal.util.Numerics;
 import org.apache.sis.internal.util.Strings;
 import org.apache.sis.util.resources.Errors;


 /**
  * Static methods working on {@code char} values, and some character constants.
  * Apache SIS uses Unicode symbols directly in the source code for easier reading,
  * except for some symbols that are difficult to differentiate from other similar
  * symbols. For those symbols, constants are declared in this class.
  *
  * @author  Martin Desruisseaux (Geomatys)
  * @version 1.1
  * @since   0.3
  * @module
  */
 public final class Characters extends Static {
     /**
      * Hyphen character ('\u2010', Unicode {@code 2010}).
      * This code tells to {@link org.apache.sis.io.LineAppender}
      * that a line break is allowed to be inserted after this character.
      *
      * <p>For non-breaking hyphen, use the Unicode {@code 2011} character.</p>
      */
     public static final char HYPHEN = '\u2010';

     /**
      * Hyphen character to be visible only if there is a line break to insert after it
      * (Unicode {@code 00AD}, HTML {@code &shy;}).
      * Otherwise this character is invisible. When visible, the graphical symbol is similar
      * to the {@link #HYPHEN} character.
      */
     public static final char SOFT_HYPHEN = '\u00AD';

     /**
      * The <cite>no-break space</cite> (Unicode {@code 00A0}, HTML {@code &nbsp;}).
      * Apache SIS uses Unicode symbols directly in the source code for easier reading,
      * except for no-break spaces since they can not be visually distinguished from the
      * ordinary space (Unicode {@code 0020}).
      */
     public static final char NO_BREAK_SPACE = '\u00A0';

     /**
      * The Unicode line separator (Unicode {@code 2028}, HTML {@code <br>}).
      *
      * @see Character#LINE_SEPARATOR
      */
     public static final char LINE_SEPARATOR = '\u2028';

     /**
      * The Unicode paragraph separator (Unicode {@code 2029}, HTML {@code <p>…</p>}).
      *
      * @see Character#PARAGRAPH_SEPARATOR
      */
     public static final char PARAGRAPH_SEPARATOR = '\u2029';

     /**
      * Hidden constructor: prevents instantiation from outside this class.
      */
     private Characters() {
         // Intentionally empty.
     }

     /**
      * Tests whether a code point is allowed in <cite>Well Known Text</cite> (WKT).
      * The accepted characters are:
      *
      * <blockquote><pre>{@literal A-Z a-z 0-9 _ [ ] ( ) { } < = > . , : ; + - (space) % & ' " * ^ / \ ? | °}</pre></blockquote>
      *
      * In other words, the ASCII codes from 32 to 125 inclusive, minus ! (33), # (35), $ (36), @ (64)
      * and ` (96). The degree sign ° (176) is accepted in addition, even though it is formally outside
      * the ASCII character set.
      *
      * @param  c  the code point to test.
      * @return {@code true} if the given code point is a valid WKT character.
      *
      * @see org.apache.sis.io.wkt.Transliterator
      *
      * @since 0.6
      */
     public static boolean isValidWKT(final int c) {
         // The degree sign is the only accepted character outside the ASCII range.
         if (c == '°') {
             return true;
         }
         // Everything else must be in the [space … right curly bracket] range.
         if (c < ' ' || c > '}') {
             return false;
         }
         // Inside that range, only those five characters are rejected.
         return c != '!' && c != '#' && c != '$' && c != '@' && c != '`';
     }

     /**
      * Tests whether a code point marks the end of a line or of a paragraph.
      * This is the case for the {@code '\r'} and {@code '\n'} control characters and for
      * any code point whose general category is {@linkplain Character#LINE_SEPARATOR line separator}
      * or {@linkplain Character#PARAGRAPH_SEPARATOR paragraph separator}.
      *
      * @param  c  the code point to test.
      * @return {@code true} if the given code point is a line or paragraph separator.
      *
      * @see #LINE_SEPARATOR
      * @see #PARAGRAPH_SEPARATOR
      */
     public static boolean isLineOrParagraphSeparator(final int c) {
         // The only two control characters treated as separators.
         if (c == '\r' || c == '\n') {
             return true;
         }
         final int category = Character.getType(c);
         return category == Character.LINE_SEPARATOR
             || category == Character.PARAGRAPH_SEPARATOR;
     }

     /**
      * Tests whether a character is a hexadecimal digit, i.e. whether {@code c} is in one of the
      * {@code '0'} … {@code '9'}, {@code 'A'} … {@code 'F'} or {@code 'a'} … {@code 'f'} ranges
      * (all bounds inclusive).
      *
      * @param  c  the character to test.
      * @return {@code true} if the given character is an hexadecimal digit.
      *
      * @since 0.5
      */
     public static boolean isHexadecimal(int c) {
         if (c >= '0' && c <= '9') {
             return true;
         }
         if (c >= 'A' && c <= 'F') {
             return true;
         }
         return c >= 'a' && c <= 'f';
     }

     /**
      * Tests whether a character is one of the superscripts listed below. Most of them (but not all)
      * are in the [2070 … 207F] Unicode range:
      *
      * {@preformat text
      *   ⁰ ¹ ² ³ ⁴ ⁵ ⁶ ⁷ ⁸ ⁹ ⁺ ⁻ ⁼ ⁽ ⁾ ⁿ
      * }
      *
      * @param  c  the character to test.
      * @return {@code true} if the given character is a superscript.
      */
     public static boolean isSuperScript(final int c) {
         // Legacy "Latin-1 supplement" superscripts (00B9, 00B2, 00B3), outside the usual range.
         if (c == '¹' || c == '²' || c == '³') {
             return true;
         }
         if (c < '⁰' || c > 'ⁿ') {
             return false;
         }
         // 2071 … 2073 are where ¹ ² ³ would be if they were declared in the usual range; they are excluded.
         return c < '\u2071' || c > '\u2073';
     }

     /**
      * Tests whether a character is one of the subscripts listed below.
      * They are all in the [2080 … 208E] Unicode range:
      *
      * {@preformat text
      *   ₀ ₁ ₂ ₃ ₄ ₅ ₆ ₇ ₈ ₉ ₊ ₋ ₌ ₍ ₎
      * }
      *
      * @param  c  the character to test.
      * @return {@code true} if the given character is a subscript.
      */
     public static boolean isSubScript(final int c) {
         if (c < '₀') {
             return false;
         }
         return c <= '₎';
     }

     /**
      * Returns the superscript form of the given character.
      * A conversion exists only for the following characters; any other character is returned unchanged:
      *
      * {@preformat text
      *     0 1 2 3 4 5 6 7 8 9 + - = ( ) n
      * }
      *
      * @param  c  the character to convert.
      * @return the given character as a superscript, or {@code c} if the given character can not be converted.
      */
     public static char toSuperScript(char c) {
         /*
          * The two strings are parallel: the character at a given index in 'raised'
          * is the superscript of the character at the same index in 'plain'.
          * Note that ¹ ² ³ are the Latin-1 values (00B9, 00B2, 00B3).
          */
         final String plain  = "0123456789+-=()n";
         final String raised = "⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ⁿ";
         final int index = plain.indexOf(c);
         return (index >= 0) ? raised.charAt(index) : c;
     }

     /**
      * Returns the subscript form of the given character.
      * A conversion exists only for the following characters; any other character is returned unchanged:
      *
      * {@preformat text
      *     0 1 2 3 4 5 6 7 8 9 + - = ( )
      * }
      *
      * @param  c  the character to convert.
      * @return the given character as a subscript, or {@code c} if the given character can not be converted.
      */
     public static char toSubScript(char c) {
         /*
          * The subscripts are contiguous (2080 … 208E) and in the same order as the characters
          * below, so the position in that string is also the offset from the '₀' character.
          */
         final int offset = "0123456789+-=()".indexOf(c);
         if (offset < 0) {
             return c;
         }
         return (char) ('₀' + offset);
     }

     /**
      * Returns the normal script form of the given superscript or subscript character.
      *
      * @param  c  the character to convert.
      * @return the given character as a normal script, or {@code c} if the
      *         given character was not a superscript or a subscript.
      */
     public static char toNormalScript(char c) {
         final int codePoint = c;
         // Narrowing is safe: every value returned for a BMP input is itself in the BMP.
         return (char) toNormalScript(codePoint);
     }

     /**
      * Returns the normal script form of the given superscript or subscript code point.
      *
      * @param  c  the character to convert.
      * @return the given character as a normal script, or {@code c} if the
      *         given character was not a superscript or a subscript.
      *
      * @since 1.0
      */
     public static int toNormalScript(int c) {
         // Normal characters, ordered like the subscripts (2080 … 208E) and the superscripts (2070 … 207E).
         final String normal = "0123456789+-=()";
         if (c >= '₀' && c <= '₎') {
             return normal.charAt(c - '₀');
         }
         if (c >= '⁰' && c <= '⁾') {
             final int offset = c - '⁰';
             // 2071 … 2073 are not the ¹ ² ³ superscripts (those are in the Latin-1 range): leave them as-is.
             return (offset >= 1 && offset <= 3) ? c : normal.charAt(offset);
         }
         switch (c) {
             case '¹': return '1';
             case '²': return '2';
             case '³': return '3';
             case 'ⁿ': return 'n';
             default:  return c;
         }
     }


     /**
      * A subset of Unicode characters defined by a set of general categories.
      * Categories are the constants declared in the {@link Character} class, for example
      * {@link Character#LOWERCASE_LETTER     LOWERCASE_LETTER},
      * {@link Character#UPPERCASE_LETTER     UPPERCASE_LETTER},
      * {@link Character#DECIMAL_DIGIT_NUMBER DECIMAL_DIGIT_NUMBER} and
      * {@link Character#SPACE_SEPARATOR      SPACE_SEPARATOR}.
      *
      * <p>Instances are obtained either from a list of character types given to the
      * {@link #forTypes(byte[])} method, or from one of the constants predefined in this class.
      * The {@link #contains(int)} method then tells whether a Unicode character belongs to the subset.</p>
      *
      * <h2>Relationship with international standards</h2>
      * ISO 19162:2015 §B.5.2 recommends to ignore spaces, case and the following characters when comparing two
      * {@linkplain org.apache.sis.referencing.AbstractIdentifiedObject#getName() identified object names}:
      * “{@code _}” (underscore), “{@code -}” (minus sign), “{@code /}” (solidus),
      * “{@code (}” (left parenthesis) and “{@code )}” (right parenthesis).
      * The same specification also limits the set of valid characters in a name to the following (§6.3.1):
      *
      * <blockquote>{@code A-Z a-z 0-9 _ [ ] ( ) { } < = > . , : ; + - (space) % & ' " * ^ / \ ? | °}</blockquote>
      * <div class="note"><b>Note:</b> SIS does not enforce this restriction in its programmatic API,
      * but may perform some character substitutions at <cite>Well Known Text</cite> (WKT) formatting time.</div>
      *
      * Starting from the above list, keeping only the characters that are valid in a
      * {@linkplain #UNICODE_IDENTIFIER Unicode identifier} and removing the characters that ISO 19162
      * recommends to ignore leaves only {@linkplain #LETTERS_AND_DIGITS letters and digits}.
      *
      * @author  Martin Desruisseaux (Geomatys)
      * @version 1.1
      *
      * @see java.lang.Character.Subset
      * @see Character#getType(int)
      * @see <a href="http://docs.opengeospatial.org/is/12-063r5/12-063r5.html#139">WKT 2 specification §B.5</a>
      *
      * @since 0.3
      * @module
      */
     public static class Filter extends Character.Subset {
         /*
          * Not Serializable for now: the parent class is not Serializable and declares no
          * no-argument constructor. A writeReplace method could work around that limitation
          * if a real need appears.
          */

         /**
          * The characters for which {@link Character#isLetterOrDigit(int)} returns {@code true}.
          * The general categories in this subset are:
          *
          * <blockquote>
          * {@link Character#LOWERCASE_LETTER},
          * {@link Character#UPPERCASE_LETTER     UPPERCASE_LETTER},
          * {@link Character#TITLECASE_LETTER     TITLECASE_LETTER},
          * {@link Character#MODIFIER_LETTER      MODIFIER_LETTER},
          * {@link Character#OTHER_LETTER         OTHER_LETTER} and
          * {@link Character#DECIMAL_DIGIT_NUMBER DECIMAL_DIGIT_NUMBER}.
          * </blockquote>
          *
          * This is the filter used by SIS for comparing two
          * {@linkplain org.apache.sis.referencing.AbstractIdentifiedObject#getName() identified object names}.
          * The <cite>Relationship with international standards</cite> section in this class javadoc
          * gives more information.
          *
          * @see org.apache.sis.referencing.AbstractIdentifiedObject#isHeuristicMatchForName(String)
          * @see org.apache.sis.metadata.iso.citation.Citations#identifierMatches(Citation, String)
          */
         public static final Filter LETTERS_AND_DIGITS = new LettersAndDigits();

         /**
          * The characters for which {@link Character#isUnicodeIdentifierPart(int)} returns {@code true},
          * except the {@linkplain Character#isIdentifierIgnorable(int) ignorable} ones.
          * The general categories in this subset are those of {@link #LETTERS_AND_DIGITS} plus:
          *
          * <blockquote>
          * {@link Character#LETTER_NUMBER},
          * {@link Character#CONNECTOR_PUNCTUATION CONNECTOR_PUNCTUATION},
          * {@link Character#NON_SPACING_MARK NON_SPACING_MARK} and
          * {@link Character#COMBINING_SPACING_MARK COMBINING_SPACING_MARK}.
          * </blockquote>
          */
         public static final Filter UNICODE_IDENTIFIER = new UnicodeIdentifier();

         /**
          * The character types in this subset, as a bitmask with bit <var>n</var> set
          * when the {@link Character} type of numerical value <var>n</var> is included.
          */
         private final long types;

         /**
          * Creates a new subset with the given name and character types.
          *
          * @param  name   the subset name.
          * @param  types  a bitmask of character types.
          */
         Filter(final String name, final long types) {
             super(name);
             this.types = types;
         }

         /**
          * Tests whether the given Unicode character belongs to this subset.
          *
          * @param  codePoint  the Unicode character, as a code point value.
          * @return {@code true} if this subset contains the given character.
          */
         public boolean contains(final int codePoint) {
             final int category = Character.getType(codePoint);
             return containsType(category);
         }

         /**
          * Tests whether the characters of the given type belong to this subset.
          * The type shall be one of the {@link Character} constants like
          * {@link Character#LOWERCASE_LETTER     LOWERCASE_LETTER},
          * {@link Character#UPPERCASE_LETTER     UPPERCASE_LETTER},
          * {@link Character#DECIMAL_DIGIT_NUMBER DECIMAL_DIGIT_NUMBER} or
          * {@link Character#SPACE_SEPARATOR      SPACE_SEPARATOR}.
          *
          * @param  type  one of the {@link Character} constants.
          * @return {@code true} if this subset contains the characters of the given type.
          *
          * @see Character#getType(int)
          *
          * @deprecated to be removed because not used (only {@link #contains(int)} is used in practice)
          *             and consistency with {@link #contains(int)} is not guaranteed between different
          *             Java versions.
          */
         @Deprecated
         public final boolean containsType(final int type) {
             final long bit = Numerics.bitmask(type);
             return (types & bit) != 0;
         }

         /**
          * Returns the subset made of all Unicode characters of any of the given types.
          * One of the predefined constants is returned when it has exactly the requested types.
          *
          * @param  types  the character types, as {@link Character} constants.
          * @return the subset of Unicode characters of the given type.
          * @throws IllegalArgumentException if a type is negative or not less than {@link Long#SIZE}.
          *
          * @see Character#LOWERCASE_LETTER
          * @see Character#UPPERCASE_LETTER
          * @see Character#DECIMAL_DIGIT_NUMBER
          * @see Character#SPACE_SEPARATOR
          */
         public static Filter forTypes(final byte... types) {
             long requested = 0;
             for (int index = 0; index < types.length; index++) {
                 final int type = types[index];
                 // A type is used as a bit position in a long, so it must be in the [0 … 63] range.
                 if (type < 0 || type >= Long.SIZE) {
                     throw new IllegalArgumentException(Errors.format(
                             Errors.Keys.IllegalArgumentValue_2, Strings.toIndexed("types", index), type));
                 }
                 requested |= (1L << type);
             }
             // Prefer a shared predefined instance over a new one.
             if (requested == LETTERS_AND_DIGITS.types) {
                 return LETTERS_AND_DIGITS;
             }
             if (requested == UNICODE_IDENTIFIER.types) {
                 return UNICODE_IDENTIFIER;
             }
             return new Filter("Filter", requested);
         }
     }

     /**
      * The class of the {@link Filter#LETTERS_AND_DIGITS} constant.
      */
     private static final class LettersAndDigits extends Filter {
         /**
          * Bitmask of the general categories declared for this subset.
          */
         private static final long TYPES =
                   (1L << Character.LOWERCASE_LETTER)
                 | (1L << Character.UPPERCASE_LETTER)
                 | (1L << Character.TITLECASE_LETTER)
                 | (1L << Character.MODIFIER_LETTER)
                 | (1L << Character.OTHER_LETTER)
                 | (1L << Character.DECIMAL_DIGIT_NUMBER);

         /**
          * Creates the {@link Filter#LETTERS_AND_DIGITS} singleton instance.
          */
         LettersAndDigits() {
             super("LETTERS_AND_DIGITS", TYPES);
         }

         /**
          * Tests whether the given Unicode character is a letter or a digit,
          * as determined by {@link Character#isLetterOrDigit(int)}.
          */
         @Override
         public boolean contains(final int codePoint) {
             return Character.isLetterOrDigit(codePoint);
         }
     }

     /**
      * The class of the {@link Filter#UNICODE_IDENTIFIER} constant.
      */
     private static final class UnicodeIdentifier extends Filter {
         /**
          * Bitmask of the general categories declared for this subset.
          */
         private static final long TYPES =
                   (1L << Character.LOWERCASE_LETTER)
                 | (1L << Character.UPPERCASE_LETTER)
                 | (1L << Character.TITLECASE_LETTER)
                 | (1L << Character.MODIFIER_LETTER)
                 | (1L << Character.OTHER_LETTER)
                 | (1L << Character.DECIMAL_DIGIT_NUMBER)
                 | (1L << Character.LETTER_NUMBER)
                 | (1L << Character.CONNECTOR_PUNCTUATION)
                 | (1L << Character.NON_SPACING_MARK)
                 | (1L << Character.COMBINING_SPACING_MARK);

         /**
          * Creates the {@link Filter#UNICODE_IDENTIFIER} singleton instance.
          */
         UnicodeIdentifier() {
             super("UNICODE_IDENTIFIER", TYPES);
         }

         /**
          * Tests whether the given Unicode character can be part of a Unicode identifier,
          * as determined by {@link Character#isUnicodeIdentifierPart(int)}, and is not an
          * {@linkplain Character#isIdentifierIgnorable(int) ignorable} character.
          */
         @Override
         public boolean contains(final int codePoint) {
             if (!Character.isUnicodeIdentifierPart(codePoint)) {
                 return false;
             }
             return !Character.isIdentifierIgnorable(codePoint);
         }
     }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.sis.util;

	import org.opengis.metadata.citation.Citation; // For javadoc.
	import org.apache.sis.internal.util.Numerics;
	import org.apache.sis.internal.util.Strings;
	import org.apache.sis.util.resources.Errors;


	/**
	* Static methods working on {@code char} values, and some character constants.
	* Apache SIS uses Unicode symbols directly in the source code for easier reading,
	* except for some symbols that are difficult to differentiate from other similar
	* symbols. For those symbols, constants are declared in this class.
	*
	* @author Martin Desruisseaux (Geomatys)
	* @version 1.1
	* @since 0.3
	* @module
	*/
	public final class Characters extends Static {
	/**
	* Hyphen character ('\u2010', Unicode {@code 2010}).
	* This code tells to {@link org.apache.sis.io.LineAppender}
	* that a line break is allowed to be inserted after this character.
	*
	* <p>For non-breaking hyphen, use the Unicode {@code 2011} character.</p>
	*/
	public static final char HYPHEN = '\u2010';

	/**
	* Hyphen character to be visible only if there is a line break to insert after it
	* (Unicode {@code 00AD}, HTML {@code &shy;}).
	* Otherwise this character is invisible. When visible, the graphical symbol is similar
	* to the {@link #HYPHEN} character.
	*/
	public static final char SOFT_HYPHEN = '\u00AD';

	/**
	* The <cite>no-break space</cite> (Unicode {@code 00A0}, HTML {@code &nbsp;}).
	* Apache SIS uses Unicode symbols directly in the source code for easier reading,
	* except for no-break spaces since they can not be visually distinguished from the
	* ordinary space (Unicode {@code 0020}).
	*/
	public static final char NO_BREAK_SPACE = '\u00A0';

	/**
	* The Unicode line separator (Unicode {@code 2028}, HTML {@code <br>}).
	*
	* @see Character#LINE_SEPARATOR
	*/
	public static final char LINE_SEPARATOR = '\u2028';

	/**
	* The Unicode paragraph separator (Unicode {@code 2029}, HTML {@code <p>…</p>}).
	*
	* @see Character#PARAGRAPH_SEPARATOR
	*/
	public static final char PARAGRAPH_SEPARATOR = '\u2029';

	/**
	 * Hidden constructor: prevents instantiation from outside this class.
	 */
	private Characters() {
	    // Intentionally empty.
	}

	/**
	 * Tests whether a code point is allowed in <cite>Well Known Text</cite> (WKT).
	 * The accepted characters are:
	 *
	 * <blockquote><pre>{@literal A-Z a-z 0-9 _ [ ] ( ) { } < = > . , : ; + - (space) % & ' " * ^ / \ ? | °}</pre></blockquote>
	 *
	 * In other words, the ASCII codes from 32 to 125 inclusive, minus ! (33), # (35), $ (36), @ (64)
	 * and ` (96). The degree sign ° (176) is accepted in addition, even though it is formally outside
	 * the ASCII character set.
	 *
	 * @param  c  the code point to test.
	 * @return {@code true} if the given code point is a valid WKT character.
	 *
	 * @see org.apache.sis.io.wkt.Transliterator
	 *
	 * @since 0.6
	 */
	public static boolean isValidWKT(final int c) {
	    // The degree sign is the only accepted character outside the ASCII range.
	    if (c == '°') {
	        return true;
	    }
	    // Everything else must be in the [space … right curly bracket] range.
	    if (c < ' ' || c > '}') {
	        return false;
	    }
	    // Inside that range, only those five characters are rejected.
	    return c != '!' && c != '#' && c != '$' && c != '@' && c != '`';
	}

	/**
	* Returns {@code true} if the given code point is a {@linkplain Character#LINE_SEPARATOR
	* line separator}, a {@linkplain Character#PARAGRAPH_SEPARATOR paragraph separator} or one
	* of the {@code '\r'} or {@code '\n'} control characters.
	*
	* @param c the code point to test.
	* @return {@code true} if the given code point is a line or paragraph separator.
	*
	* @see #LINE_SEPARATOR
	* @see #PARAGRAPH_SEPARATOR
	*/
	public static boolean isLineOrParagraphSeparator(final int c) {
	switch (Character.getType(c)) {
	default: return false;
	case Character.LINE_SEPARATOR:
	case Character.PARAGRAPH_SEPARATOR: return true;
	case Character.CONTROL: return (c == '\r') \|\| (c == '\n');
	}
	}

	/**
	* Returns {@code true} if the given character is an hexadecimal digit.
	* This method returns {@code true} if {@code c} is between {@code '0'} and {@code '9'} inclusive,
	* or between {@code 'A'} and {@code 'F'} inclusive, or between {@code 'a'} and {@code 'f'} inclusive.
	*
	* @param c the character to test.
	* @return {@code true} if the given character is an hexadecimal digit.
	*
	* @since 0.5
	*/
	public static boolean isHexadecimal(int c) {
	/*
	* The &= ~32 is a cheap conversion of lower-case letters to upper-case letters.
	* It is not a rigorous conversion since it does not check if 'c' is a letter,
	* but for the purpose of this method it is okay.
	*/
	return (c >= '0' && c <= '9') \|\| ((c &= ~32) >= 'A' && c <= 'F');
	}

	/**
	 * Tests whether a character is one of the superscripts listed below. Most of them (but not all)
	 * are in the [2070 … 207F] Unicode range:
	 *
	 * {@preformat text
	 *   ⁰ ¹ ² ³ ⁴ ⁵ ⁶ ⁷ ⁸ ⁹ ⁺ ⁻ ⁼ ⁽ ⁾ ⁿ
	 * }
	 *
	 * @param  c  the character to test.
	 * @return {@code true} if the given character is a superscript.
	 */
	public static boolean isSuperScript(final int c) {
	    // Legacy "Latin-1 supplement" superscripts (00B9, 00B2, 00B3), outside the usual range.
	    if (c == '¹' || c == '²' || c == '³') {
	        return true;
	    }
	    if (c < '⁰' || c > 'ⁿ') {
	        return false;
	    }
	    // 2071 … 2073 are where ¹ ² ³ would be if they were declared in the usual range; they are excluded.
	    return c < '\u2071' || c > '\u2073';
	}

	/**
	 * Tests whether a character is one of the subscripts listed below.
	 * They are all in the [2080 … 208E] Unicode range:
	 *
	 * {@preformat text
	 *   ₀ ₁ ₂ ₃ ₄ ₅ ₆ ₇ ₈ ₉ ₊ ₋ ₌ ₍ ₎
	 * }
	 *
	 * @param  c  the character to test.
	 * @return {@code true} if the given character is a subscript.
	 */
	public static boolean isSubScript(final int c) {
	    if (c < '₀') {
	        return false;
	    }
	    return c <= '₎';
	}

	/**
	 * Returns the superscript form of the given character.
	 * A conversion exists only for the following characters; any other character is returned unchanged:
	 *
	 * {@preformat text
	 *     0 1 2 3 4 5 6 7 8 9 + - = ( ) n
	 * }
	 *
	 * @param  c  the character to convert.
	 * @return the given character as a superscript, or {@code c} if the given character can not be converted.
	 */
	public static char toSuperScript(char c) {
	    /*
	     * The two strings are parallel: the character at a given index in 'raised'
	     * is the superscript of the character at the same index in 'plain'.
	     * Note that ¹ ² ³ are the Latin-1 values (00B9, 00B2, 00B3).
	     */
	    final String plain  = "0123456789+-=()n";
	    final String raised = "⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ⁿ";
	    final int index = plain.indexOf(c);
	    return (index >= 0) ? raised.charAt(index) : c;
	}

	/**
	 * Returns the subscript form of the given character.
	 * A conversion exists only for the following characters; any other character is returned unchanged:
	 *
	 * {@preformat text
	 *     0 1 2 3 4 5 6 7 8 9 + - = ( )
	 * }
	 *
	 * @param  c  the character to convert.
	 * @return the given character as a subscript, or {@code c} if the given character can not be converted.
	 */
	public static char toSubScript(char c) {
	    /*
	     * The subscripts are contiguous (2080 … 208E) and in the same order as the characters
	     * below, so the position in that string is also the offset from the '₀' character.
	     */
	    final int offset = "0123456789+-=()".indexOf(c);
	    if (offset < 0) {
	        return c;
	    }
	    return (char) ('₀' + offset);
	}

	/**
	 * Returns the normal script form of the given superscript or subscript character.
	 *
	 * @param  c  the character to convert.
	 * @return the given character as a normal script, or {@code c} if the
	 *         given character was not a superscript or a subscript.
	 */
	public static char toNormalScript(char c) {
	    final int codePoint = c;
	    // Narrowing is safe: every value returned for a BMP input is itself in the BMP.
	    return (char) toNormalScript(codePoint);
	}

	/**
	 * Returns the normal script form of the given superscript or subscript code point.
	 *
	 * @param  c  the character to convert.
	 * @return the given character as a normal script, or {@code c} if the
	 *         given character was not a superscript or a subscript.
	 *
	 * @since 1.0
	 */
	public static int toNormalScript(int c) {
	    // Normal characters, ordered like the subscripts (2080 … 208E) and the superscripts (2070 … 207E).
	    final String normal = "0123456789+-=()";
	    if (c >= '₀' && c <= '₎') {
	        return normal.charAt(c - '₀');
	    }
	    if (c >= '⁰' && c <= '⁾') {
	        final int offset = c - '⁰';
	        // 2071 … 2073 are not the ¹ ² ³ superscripts (those are in the Latin-1 range): leave them as-is.
	        return (offset >= 1 && offset <= 3) ? c : normal.charAt(offset);
	    }
	    switch (c) {
	        case '¹': return '1';
	        case '²': return '2';
	        case '³': return '3';
	        case 'ⁿ': return 'n';
	        default:  return c;
	    }
	}




	/**
	* Subsets of Unicode characters identified by their general category.
	* The categories are identified by constants defined in the {@link Character} class, like
	* {@link Character#LOWERCASE_LETTER LOWERCASE_LETTER},
	* {@link Character#UPPERCASE_LETTER UPPERCASE_LETTER},
	* {@link Character#DECIMAL_DIGIT_NUMBER DECIMAL_DIGIT_NUMBER} and
	* {@link Character#SPACE_SEPARATOR SPACE_SEPARATOR}.
	*
	* <p>An instance of this class can be obtained from an enumeration of character types
	* using the {@link #forTypes(byte[])} method, or using one of the constants predefined
	* in this class. Then, Unicode characters can be tested for inclusion in the subset by
	* calling the {@link #contains(int)} method.</p>
	*
	* <h2>Relationship with international standards</h2>
	* ISO 19162:2015 §B.5.2 recommends to ignore spaces, case and the following characters when comparing two
	* {@linkplain org.apache.sis.referencing.AbstractIdentifiedObject#getName() identified object names}:
	* “{@code _}” (underscore), “{@code -}” (minus sign), “{@code /}” (solidus),
	* “{@code (}” (left parenthesis) and “{@code )}” (right parenthesis).
	* The same specification also limits the set of valid characters in a name to the following (§6.3.1):
	*
	* <blockquote>{@code A-Z a-z 0-9 _ [ ] ( ) { } < = > . , : ; + - (space) % & ' " * ^ / \ ? \| °}</blockquote>
	* <div class="note"><b>Note:</b> SIS does not enforce this restriction in its programmatic API,
	* but may perform some character substitutions at <cite>Well Known Text</cite> (WKT) formatting time.</div>
	*
	* If we take only the characters in the above list which are valid in a {@linkplain #UNICODE_IDENTIFIER
	* Unicode identifier} and remove the characters that ISO 19162 recommends to ignore, the only characters
	* left are {@linkplain #LETTERS_AND_DIGITS letters and digits}.
	*
	* @author Martin Desruisseaux (Geomatys)
	* @version 1.1
	*
	* @see java.lang.Character.Subset
	* @see Character#getType(int)
	* @see <a href="http://docs.opengeospatial.org/is/12-063r5/12-063r5.html#139">WKT 2 specification §B.5</a>
	*
	* @since 0.3
	* @module
	*/
	public static class Filter extends Character.Subset {
	/*
	* This class can not easily be Serializable, because the parent class is not Serializable
	* and does not define a no-argument constructor. We could workaround with a writeReplace
	* method - waiting to see if there is a real need for that.
	*/

	/**
	* The subset of all characters for which {@link Character#isLetterOrDigit(int)}
	* returns {@code true}. This subset includes the following general categories:
	*
	* <blockquote>
	* {@link Character#LOWERCASE_LETTER},
	* {@link Character#UPPERCASE_LETTER UPPERCASE_LETTER},
	* {@link Character#TITLECASE_LETTER TITLECASE_LETTER},
	* {@link Character#MODIFIER_LETTER MODIFIER_LETTER},
	* {@link Character#OTHER_LETTER OTHER_LETTER} and
	* {@link Character#DECIMAL_DIGIT_NUMBER DECIMAL_DIGIT_NUMBER}.
	* </blockquote>
	*
	* SIS uses this filter when comparing two
	* {@linkplain org.apache.sis.referencing.AbstractIdentifiedObject#getName() identified object names}.
	* See the <cite>Relationship with international standards</cite> section in this class javadoc
	* for more information.
	*
	* @see org.apache.sis.referencing.AbstractIdentifiedObject#isHeuristicMatchForName(String)
	* @see org.apache.sis.metadata.iso.citation.Citations#identifierMatches(Citation, String)
	*/
	public static final Filter LETTERS_AND_DIGITS = new LettersAndDigits();

	/**
	* The subset of all characters for which {@link Character#isUnicodeIdentifierPart(int)}
	* returns {@code true}, excluding {@linkplain Character#isIdentifierIgnorable(int) ignorable} characters.
	* This subset includes all the {@link #LETTERS_AND_DIGITS} categories with the addition of the following
	* ones:
	*
	* <blockquote>
	* {@link Character#LETTER_NUMBER},
	* {@link Character#CONNECTOR_PUNCTUATION CONNECTOR_PUNCTUATION},
	* {@link Character#NON_SPACING_MARK NON_SPACING_MARK} and
	* {@link Character#COMBINING_SPACING_MARK COMBINING_SPACING_MARK}.
	* </blockquote>
	*/
	public static final Filter UNICODE_IDENTIFIER = new UnicodeIdentifier();

	/**
	* A bitmask of character types in this subset.
	*/
	private final long types;

	/**
	* Creates a new subset of the given name.
	*
	* @param name the subset name.
	* @param types a bitmask of character types.
	*/
	Filter(final String name, final long types) {
	super(name);
	this.types = types;
	}

	/**
	* Returns {@code true} if this subset contains the given Unicode character.
	*
	* @param codePoint the Unicode character, as a code point value.
	* @return {@code true} if this subset contains the given character.
	*/
	public boolean contains(final int codePoint) {
	return containsType(Character.getType(codePoint));
	}

	/**
	* Returns {@code true} if this subset contains the characters of the given type.
	* The given type shall be one of the {@link Character} constants like
	* {@link Character#LOWERCASE_LETTER LOWERCASE_LETTER},
	* {@link Character#UPPERCASE_LETTER UPPERCASE_LETTER},
	* {@link Character#DECIMAL_DIGIT_NUMBER DECIMAL_DIGIT_NUMBER} or
	* {@link Character#SPACE_SEPARATOR SPACE_SEPARATOR}.
	*
	* @param type one of the {@link Character} constants.
	* @return {@code true} if this subset contains the characters of the given type.
	*
	* @see Character#getType(int)
	*
	* @deprecated to be removed because not used (only {@link #contains(int)} is used in practice)
	* and consistency with {@link #contains(int)} is not guaranteed between different
	* Java versions.
	*/
	@Deprecated
	public final boolean containsType(final int type) {
	return (types & Numerics.bitmask(type)) != 0;
	}

	/**
	* Returns a subset representing the union of all Unicode characters of the given types.
	*
	* @param types the character types, as {@link Character} constants.
	* @return the subset of Unicode characters of the given type.
	*
	* @see Character#LOWERCASE_LETTER
	* @see Character#UPPERCASE_LETTER
	* @see Character#DECIMAL_DIGIT_NUMBER
	* @see Character#SPACE_SEPARATOR
	*/
	public static Filter forTypes(final byte... types) {
	long mask = 0;
	for (int i=0; i<types.length; i++) {
	final int type = types[i];
	if (type < 0 \|\| type >= Long.SIZE) {
	throw new IllegalArgumentException(Errors.format(
	Errors.Keys.IllegalArgumentValue_2, Strings.toIndexed("types", i), type));
	}
	mask \|= (1L << type);
	}
	predefined: for (int i=0; ; i++) {
	final Filter candidate;
	switch (i) {
	case 0: candidate = LETTERS_AND_DIGITS; break;
	case 1: candidate = UNICODE_IDENTIFIER; break;
	default: break predefined;
	}
	if (mask == candidate.types) {
	return candidate;
	}
	}
	return new Filter("Filter", mask);
	}
	}

	/**
	* Implementation of the {@link Filter#LETTERS_AND_DIGITS} constant.
	*/
	private static final class LettersAndDigits extends Filter {
	/**
	* Creates the {@link Filter#LETTERS_AND_DIGITS} singleton instance.
	*/
	LettersAndDigits() {
	super("LETTERS_AND_DIGITS",
	(1L << Character.LOWERCASE_LETTER)
	\| (1L << Character.UPPERCASE_LETTER)
	\| (1L << Character.TITLECASE_LETTER)
	\| (1L << Character.MODIFIER_LETTER)
	\| (1L << Character.OTHER_LETTER)
	\| (1L << Character.DECIMAL_DIGIT_NUMBER));
	}

	/**
	* Returns {@code true} if this subset contains the given Unicode character.
	*/
	@Override
	public boolean contains(final int codePoint) {
	return Character.isLetterOrDigit(codePoint);
	}
	}

	/**
	* Implementation of the {@link Filter#UNICODE_IDENTIFIER} constant.
	*/
	private static final class UnicodeIdentifier extends Filter {
	/**
	* Creates the {@link Filter#LETTERS_AND_DIGITS} singleton instance.
	*/
	UnicodeIdentifier() {
	super("UNICODE_IDENTIFIER",
	(1L << Character.LOWERCASE_LETTER)
	\| (1L << Character.UPPERCASE_LETTER)
	\| (1L << Character.TITLECASE_LETTER)
	\| (1L << Character.MODIFIER_LETTER)
	\| (1L << Character.OTHER_LETTER)
	\| (1L << Character.DECIMAL_DIGIT_NUMBER)
	\| (1L << Character.LETTER_NUMBER)
	\| (1L << Character.CONNECTOR_PUNCTUATION)
	\| (1L << Character.NON_SPACING_MARK)
	\| (1L << Character.COMBINING_SPACING_MARK));
	}

	/**
	* Returns {@code true} if this subset contains the given Unicode character.
	*/
	@Override
	public boolean contains(final int codePoint) {
	return Character.isUnicodeIdentifierPart(codePoint) &&
	!Character.isIdentifierIgnorable(codePoint);
	}
	}
	}