| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.sis.util; |
| |
| import java.util.Arrays; |
| import java.nio.CharBuffer; |
| import org.opengis.metadata.citation.Citation; // For javadoc |
| import org.opengis.referencing.IdentifiedObject; // For javadoc |
| |
| import static java.lang.Character.*; |
| |
| |
| /** |
| * Static methods working with {@link CharSequence} instances. Some methods defined in this |
| * class duplicate the functionalities already provided in the standard {@link String} class, |
| * but works on a generic {@code CharSequence} instance instead than {@code String}. |
| * |
| * <div class="section">Unicode support</div> |
| * Every methods defined in this class work on <cite>code points</cite> instead than characters |
| * when appropriate. Consequently those methods should behave correctly with characters outside |
| * the <cite>Basic Multilingual Plane</cite> (BMP). |
| * |
| * <div class="section">Policy on space characters</div> |
| * Java defines two methods for testing if a character is a white space: |
| * {@link Character#isWhitespace(int)} and {@link Character#isSpaceChar(int)}. |
| * Those two methods differ in the way they handle {@linkplain Characters#NO_BREAK_SPACE |
| * no-break spaces}, tabulations and line feeds. The general policy in the SIS library is: |
| * |
| * <ul> |
| * <li>Use {@code isWhitespace(…)} when separating entities (words, numbers, tokens, <i>etc.</i>) |
| * in a list. Using that method, characters separated by a no-break space are considered as |
| * part of the same entity.</li> |
| * <li>Use {@code isSpaceChar(…)} when parsing a single entity, for example a single word. |
| * Using this method, no-break spaces are considered as part of the entity while line |
| * feeds or tabulations are entity boundaries.</li> |
| * </ul> |
| * |
| * <div class="note"><b>Example:</b> |
| * Numbers formatted in the French locale use no-break spaces as group separators. When parsing a list of numbers, |
| * ordinary spaces around the numbers may need to be ignored, but no-break spaces shall be considered as part of the |
| * numbers. Consequently {@code isWhitespace(…)} is appropriate for skipping spaces <em>between</em> the numbers. |
| * But if there is spaces to skip <em>inside</em> a single number, then {@code isSpaceChar(…)} is a good choice |
| * for accepting no-break spaces and for stopping the parse operation at tabulations or line feed character. |
| * A tabulation or line feed between two characters is very likely to separate two distinct values.</div> |
| * |
| * In practice, the {@link java.text.Format} implementations in the SIS library typically use |
| * {@code isSpaceChar(…)} while most of the rest of the SIS library, including this |
| * {@code CharSequences} class, consistently uses {@code isWhitespace(…)}. |
| * |
| * <p>Note that the {@link String#trim()} method doesn't follow any of those policies and should |
| * generally be avoided. That {@code trim()} method removes every ISO control characters without |
| * distinction about whether the characters are space or not, and ignore all Unicode spaces. |
| * The {@link #trimWhitespaces(String)} method defined in this class can be used as an alternative.</p> |
| * |
| * <div class="section">Handling of null values</div> |
| * Most methods in this class accept a {@code null} {@code CharSequence} argument. In such cases |
| * the method return value is either a {@code null} {@code CharSequence}, an empty array, or a |
| * {@code 0} or {@code false} primitive type calculated as if the input was an empty string. |
| * |
| * @author Martin Desruisseaux (Geomatys) |
| * @version 1.0 |
| * |
| * @see StringBuilders |
| * |
| * @since 0.3 |
| * @module |
| */ |
| public final class CharSequences extends Static { |
| /** |
| * An array of zero-length. This constant play a role equivalents to |
| * {@link java.util.Collections#EMPTY_LIST}. |
| */ |
| public static final String[] EMPTY_ARRAY = new String[0]; |
| |
| /** |
| * An array of strings containing only white spaces. String lengths are equal to their |
| * index in the {@code spaces} array. For example, {@code spaces[4]} contains a string |
| * of length 4. Strings are constructed only when first needed. |
| */ |
| private static final String[] SPACES = new String[10]; |
| |
| /** |
| * Do not allow instantiation of this class. |
| */ |
| private CharSequences() { |
| } |
| |
| /** |
| * Returns the code point after the given index. This method completes |
| * {@link Character#codePointBefore(CharSequence, int)} but is rarely used because slightly |
| * inefficient (in most cases, the code point at {@code index} is known together with the |
| * corresponding {@code charCount(int)} value, so the method calls should be unnecessary). |
| */ |
| private static int codePointAfter(final CharSequence text, final int index) { |
| return codePointAt(text, index + charCount(codePointAt(text, index))); |
| } |
| |
| /** |
| * Returns a character sequence of the specified length filled with white spaces. |
| * |
| * <div class="section">Use case</div> |
| * This method is typically invoked for performing right-alignment of text on the |
| * {@linkplain java.io.Console console} or other device using monospaced font. |
| * Callers compute a value for the {@code length} argument by (<var>desired width</var> - <var>used width</var>). |
| * Since the <var>used width</var> value may be greater than expected, this method handle negative {@code length} |
| * values as if the value was zero. |
| * |
| * @param length the string length. Negative values are clamped to 0. |
| * @return a string of length {@code length} filled with white spaces. |
| */ |
| public static CharSequence spaces(final int length) { |
| /* |
| * No need to synchronize. In the unlikely event of two threads calling this method |
| * at the same time and the two calls creating a new string, the String.intern() call |
| * will take care of canonicalizing the strings. |
| */ |
| if (length <= 0) { |
| return ""; |
| } |
| if (length < SPACES.length) { |
| String s = SPACES[length - 1]; |
| if (s == null) { |
| final char[] spaces = new char[length]; |
| Arrays.fill(spaces, ' '); |
| s = new String(spaces).intern(); |
| SPACES[length - 1] = s; |
| } |
| return s; |
| } |
| return new CharSequence() { |
| @Override public int length() { |
| return length; |
| } |
| |
| @Override public char charAt(int index) { |
| ArgumentChecks.ensureValidIndex(length, index); |
| return ' '; |
| } |
| |
| @Override public CharSequence subSequence(final int start, final int end) { |
| ArgumentChecks.ensureValidIndexRange(length, start, end); |
| final int n = end - start; |
| return (n == length) ? this : spaces(n); |
| } |
| |
| @Override public String toString() { |
| final char[] array = new char[length]; |
| Arrays.fill(array, ' '); |
| return new String(array); |
| } |
| }; |
| } |
| |
| /** |
| * Returns the {@linkplain CharSequence#length() length} of the given characters sequence, |
| * or 0 if {@code null}. |
| * |
| * @param text the character sequence from which to get the length, or {@code null}. |
| * @return the length of the character sequence, or 0 if the argument is {@code null}. |
| */ |
| public static int length(final CharSequence text) { |
| return (text != null) ? text.length() : 0; |
| } |
| |
| /** |
| * Returns the number of Unicode code points in the given characters sequence, |
| * or 0 if {@code null}. Unpaired surrogates within the text count as one code |
| * point each. |
| * |
| * @param text the character sequence from which to get the count, or {@code null}. |
| * @return the number of Unicode code points, or 0 if the argument is {@code null}. |
| * |
| * @see #codePointCount(CharSequence, int, int) |
| */ |
| public static int codePointCount(final CharSequence text) { |
| return (text != null) ? codePointCount(text, 0, text.length()) : 0; |
| } |
| |
| /** |
| * Returns the number of Unicode code points in the given characters sub-sequence, |
| * or 0 if {@code null}. Unpaired surrogates within the text count as one code |
| * point each. |
| * |
| * <p>This method performs the same work than the standard |
| * {@link Character#codePointCount(CharSequence, int, int)} method, except that it tries |
| * to delegate to the optimized methods from the {@link String}, {@link StringBuilder}, |
| * {@link StringBuffer} or {@link CharBuffer} classes if possible.</p> |
| * |
| * @param text the character sequence from which to get the count, or {@code null}. |
| * @param fromIndex the index from which to start the computation. |
| * @param toIndex the index after the last character to take in account. |
| * @return the number of Unicode code points, or 0 if the argument is {@code null}. |
| * |
| * @see Character#codePointCount(CharSequence, int, int) |
| * @see String#codePointCount(int, int) |
| * @see StringBuilder#codePointCount(int, int) |
| */ |
| public static int codePointCount(final CharSequence text, final int fromIndex, final int toIndex) { |
| if (text == null) return 0; |
| if (text instanceof String) return ((String) text).codePointCount(fromIndex, toIndex); |
| if (text instanceof StringBuilder) return ((StringBuilder) text).codePointCount(fromIndex, toIndex); |
| if (text instanceof StringBuffer) return ((StringBuffer) text).codePointCount(fromIndex, toIndex); |
| if (text instanceof CharBuffer) { |
| final CharBuffer buffer = (CharBuffer) text; |
| if (buffer.hasArray() && !buffer.isReadOnly()) { |
| final int position = buffer.position(); |
| return Character.codePointCount(buffer.array(), position + fromIndex, position + toIndex); |
| } |
| } |
| return Character.codePointCount(text, fromIndex, toIndex); |
| } |
| |
| /** |
| * Returns the number of occurrences of the {@code toSearch} string in the given {@code text}. |
| * The search is case-sensitive. |
| * |
| * @param text the character sequence to count occurrences, or {@code null}. |
| * @param toSearch the string to search in the given {@code text}. |
| * It shall contain at least one character. |
| * @return the number of occurrences of {@code toSearch} in {@code text}, |
| * or 0 if {@code text} was null or empty. |
| * @throws NullArgumentException if the {@code toSearch} argument is null. |
| * @throws IllegalArgumentException if the {@code toSearch} argument is empty. |
| */ |
| public static int count(final CharSequence text, final String toSearch) { |
| ArgumentChecks.ensureNonEmpty("toSearch", toSearch); |
| final int length = toSearch.length(); |
| if (length == 1) { |
| // Implementation working on a single character is faster. |
| return count(text, toSearch.charAt(0)); |
| } |
| int n = 0; |
| if (text != null) { |
| int i = 0; |
| while ((i = indexOf(text, toSearch, i, text.length())) >= 0) { |
| n++; |
| i += length; |
| } |
| } |
| return n; |
| } |
| |
| /** |
| * Counts the number of occurrence of the given character in the given character sequence. |
| * |
| * @param text the character sequence to count occurrences, or {@code null}. |
| * @param toSearch the character to count. |
| * @return the number of occurrences of the given character, or 0 if the {@code text} is null. |
| */ |
| public static int count(final CharSequence text, final char toSearch) { |
| int n = 0; |
| if (text != null) { |
| if (text instanceof String) { |
| final String s = (String) text; |
| for (int i=s.indexOf(toSearch); ++i != 0; i=s.indexOf(toSearch, i)) { |
| n++; |
| } |
| } else { |
| // No need to use the code point API here, since we are looking for exact matches. |
| for (int i=text.length(); --i>=0;) { |
| if (text.charAt(i) == toSearch) { |
| n++; |
| } |
| } |
| } |
| } |
| return n; |
| } |
| |
| /** |
| * Returns the index within the given strings of the first occurrence of the specified part, |
| * starting at the specified index. This method is equivalent to the following method call, |
| * except that this method works on arbitrary {@link CharSequence} objects instead than |
| * {@link String}s only, and that the upper limit can be specified: |
| * |
| * {@preformat java |
| * return text.indexOf(part, fromIndex); |
| * } |
| * |
| * There is no restriction on the value of {@code fromIndex}. If negative or greater |
| * than {@code toIndex}, then the behavior of this method is as if the search started |
| * from 0 or {@code toIndex} respectively. This is consistent with the |
| * {@link String#indexOf(String, int)} behavior. |
| * |
| * @param text the string in which to perform the search. |
| * @param toSearch the substring for which to search. |
| * @param fromIndex the index from which to start the search. |
| * @param toIndex the index after the last character where to perform the search. |
| * @return the index within the text of the first occurrence of the specified part, starting at the specified index, |
| * or -1 if no occurrence has been found or if the {@code text} argument is null. |
| * @throws NullArgumentException if the {@code toSearch} argument is null. |
| * @throws IllegalArgumentException if the {@code toSearch} argument is empty. |
| * |
| * @see String#indexOf(String, int) |
| * @see StringBuilder#indexOf(String, int) |
| * @see StringBuffer#indexOf(String, int) |
| */ |
| public static int indexOf(final CharSequence text, final CharSequence toSearch, int fromIndex, int toIndex) { |
| ArgumentChecks.ensureNonEmpty("toSearch", toSearch); |
| if (text != null) { |
| int length = text.length(); |
| if (toIndex > length) { |
| toIndex = length; |
| } |
| if (toSearch instanceof String && toIndex == length) { |
| if (text instanceof String) { |
| return ((String) text).indexOf((String) toSearch, fromIndex); |
| } |
| if (text instanceof StringBuilder) { |
| return ((StringBuilder) text).indexOf((String) toSearch, fromIndex); |
| } |
| if (text instanceof StringBuffer) { |
| return ((StringBuffer) text).indexOf((String) toSearch, fromIndex); |
| } |
| } |
| if (fromIndex < 0) { |
| fromIndex = 0; |
| } |
| length = toSearch.length(); |
| toIndex -= length; |
| search: for (; fromIndex <= toIndex; fromIndex++) { |
| for (int i=0; i<length; i++) { |
| // No need to use the codePointAt API here, since we are looking for exact matches. |
| if (text.charAt(fromIndex + i) != toSearch.charAt(i)) { |
| continue search; |
| } |
| } |
| return fromIndex; |
| } |
| } |
| return -1; |
| } |
| |
| /** |
| * Returns the index within the given character sequence of the first occurrence of the |
| * specified character, starting the search at the specified index. If the character is |
| * not found, then this method returns -1. |
| * |
| * <p>There is no restriction on the value of {@code fromIndex}. If negative or greater |
| * than {@code toIndex}, then the behavior of this method is as if the search started |
| * from 0 or {@code toIndex} respectively. This is consistent with the behavior documented |
| * in {@link String#indexOf(int, int)}.</p> |
| * |
| * @param text the character sequence in which to perform the search, or {@code null}. |
| * @param toSearch the Unicode code point of the character to search. |
| * @param fromIndex the index to start the search from. |
| * @param toIndex the index after the last character where to perform the search. |
| * @return the index of the first occurrence of the given character in the specified sub-sequence, |
| * or -1 if no occurrence has been found or if the {@code text} argument is null. |
| * |
| * @see String#indexOf(int, int) |
| */ |
| public static int indexOf(final CharSequence text, final int toSearch, int fromIndex, int toIndex) { |
| if (text != null) { |
| final int length = text.length(); |
| if (toIndex >= length) { |
| if (text instanceof String) { |
| // String provides a faster implementation. |
| return ((String) text).indexOf(toSearch, fromIndex); |
| } |
| toIndex = length; |
| } |
| if (fromIndex < 0) { |
| fromIndex = 0; |
| } |
| char head = (char) toSearch; |
| char tail = (char) 0; |
| if (head != toSearch) { // Outside BMP plane? |
| head = highSurrogate(toSearch); |
| tail = lowSurrogate (toSearch); |
| toIndex--; |
| } |
| while (fromIndex < toIndex) { |
| if (text.charAt(fromIndex) == head) { |
| if (tail == 0 || text.charAt(fromIndex+1) == tail) { |
| return fromIndex; |
| } |
| } |
| fromIndex++; |
| } |
| } |
| return -1; |
| } |
| |
| /** |
| * Returns the index within the given character sequence of the last occurrence of the |
| * specified character, searching backward in the given index range. |
| * If the character is not found, then this method returns -1. |
| * |
| * <p>There is no restriction on the value of {@code toIndex}. If greater than the text length |
| * or less than {@code fromIndex}, then the behavior of this method is as if the search started |
| * from {@code length} or {@code fromIndex} respectively. This is consistent with the behavior |
| * documented in {@link String#lastIndexOf(int, int)}.</p> |
| * |
| * @param text the character sequence in which to perform the search, or {@code null}. |
| * @param toSearch the Unicode code point of the character to search. |
| * @param fromIndex the index of the first character in the range where to perform the search. |
| * @param toIndex the index after the last character in the range where to perform the search. |
| * @return the index of the last occurrence of the given character in the specified sub-sequence, |
| * or -1 if no occurrence has been found or if the {@code text} argument is null. |
| * |
| * @see String#lastIndexOf(int, int) |
| */ |
| public static int lastIndexOf(final CharSequence text, final int toSearch, int fromIndex, int toIndex) { |
| if (text != null) { |
| if (fromIndex <= 0) { |
| if (text instanceof String) { |
| // String provides a faster implementation. |
| return ((String) text).lastIndexOf(toSearch, toIndex - 1); |
| } |
| fromIndex = 0; |
| } |
| final int length = text.length(); |
| if (toIndex > length) { |
| toIndex = length; |
| } |
| char tail = (char) toSearch; |
| char head = (char) 0; |
| if (tail != toSearch) { // Outside BMP plane? |
| tail = lowSurrogate (toSearch); |
| head = highSurrogate(toSearch); |
| fromIndex++; |
| } |
| while (toIndex > fromIndex) { |
| if (text.charAt(--toIndex) == tail) { |
| if (head == 0 || text.charAt(--toIndex) == head) { |
| return toIndex; |
| } |
| } |
| } |
| } |
| return -1; |
| } |
| |
| /** |
| * Returns the index of the first character after the given number of lines. |
| * This method counts the number of occurrence of {@code '\n'}, {@code '\r'} |
| * or {@code "\r\n"} starting from the given position. When {@code numLines} |
| * occurrences have been found, the index of the first character after the last |
| * occurrence is returned. |
| * |
| * <p>If the {@code numLines} argument is positive, this method searches forward. |
| * If negative, this method searches backward. If 0, this method returns the |
| * beginning of the current line.</p> |
| * |
| * <p>If this method reaches the end of {@code text} while searching forward, then |
| * {@code text.length()} is returned. If this method reaches the beginning of |
| * {@code text} while searching backward, then 0 is returned.</p> |
| * |
| * @param text the string in which to skip a determined amount of lines. |
| * @param numLines the number of lines to skip. Can be positive, zero or negative. |
| * @param fromIndex index at which to start the search, from 0 to {@code text.length()} inclusive. |
| * @return index of the first character after the last skipped line. |
| * @throws NullPointerException if the {@code text} argument is null. |
| * @throws IndexOutOfBoundsException if {@code fromIndex} is out of bounds. |
| */ |
| public static int indexOfLineStart(final CharSequence text, int numLines, int fromIndex) { |
| final int length = text.length(); |
| /* |
| * Go backward if the number of lines is negative. |
| * No need to use the codePoint API because we are |
| * looking only for characters in the BMP plane. |
| */ |
| if (numLines <= 0) { |
| do { |
| char c; |
| do { |
| if (fromIndex == 0) { |
| return fromIndex; |
| } |
| c = text.charAt(--fromIndex); |
| if (c == '\n') { |
| if (fromIndex != 0 && text.charAt(fromIndex - 1) == '\r') { |
| --fromIndex; |
| } |
| break; |
| } |
| } while (c != '\r'); |
| } while (++numLines != 1); |
| // Execute the forward code below for skipping the "end of line" characters. |
| } |
| /* |
| * Skips forward the given amount of lines. |
| */ |
| while (--numLines >= 0) { |
| char c; |
| do { |
| if (fromIndex == length) { |
| return fromIndex; |
| } |
| c = text.charAt(fromIndex++); |
| if (c == '\r') { |
| if (fromIndex != length && text.charAt(fromIndex) == '\n') { |
| fromIndex++; |
| } |
| break; |
| } |
| } while (c != '\n'); |
| } |
| return fromIndex; |
| } |
| |
| /** |
| * Returns the index of the first non-white character in the given range. |
| * If the given range contains only space characters, then this method returns the index of the |
| * first character after the given range, which is always equals or greater than {@code toIndex}. |
| * Note that this character may not exist if {@code toIndex} is equals to the text length. |
| * |
| * <p>Special cases:</p> |
| * <ul> |
| * <li>If {@code fromIndex} is greater than {@code toIndex}, |
| * then this method unconditionally returns {@code fromIndex}.</li> |
| * <li>If the given range contains only space characters and the character at {@code toIndex-1} |
| * is the high surrogate of a valid supplementary code point, then this method returns |
| * {@code toIndex+1}, which is the index of the next code point.</li> |
| * <li>If {@code fromIndex} is negative or {@code toIndex} is greater than the text length, |
| * then the behavior of this method is undefined.</li> |
| * </ul> |
| * |
| * Space characters are identified by the {@link Character#isWhitespace(int)} method. |
| * |
| * @param text the string in which to perform the search (can not be null). |
| * @param fromIndex the index from which to start the search (can not be negative). |
| * @param toIndex the index after the last character where to perform the search. |
| * @return the index within the text of the first occurrence of a non-space character, starting |
| * at the specified index, or a value equals or greater than {@code toIndex} if none. |
| * @throws NullPointerException if the {@code text} argument is null. |
| * |
| * @see #skipTrailingWhitespaces(CharSequence, int, int) |
| * @see #trimWhitespaces(CharSequence) |
| */ |
| public static int skipLeadingWhitespaces(final CharSequence text, int fromIndex, final int toIndex) { |
| while (fromIndex < toIndex) { |
| final int c = Character.codePointAt(text, fromIndex); |
| if (!Character.isWhitespace(c)) break; |
| fromIndex += Character.charCount(c); |
| } |
| return fromIndex; |
| } |
| |
| /** |
| * Returns the index <em>after</em> the last non-white character in the given range. |
| * If the given range contains only space characters, then this method returns the index of the |
| * first character in the given range, which is always equals or lower than {@code fromIndex}. |
| * |
| * <p>Special cases:</p> |
| * <ul> |
| * <li>If {@code fromIndex} is lower than {@code toIndex}, |
| * then this method unconditionally returns {@code toIndex}.</li> |
| * <li>If the given range contains only space characters and the character at {@code fromIndex} |
| * is the low surrogate of a valid supplementary code point, then this method returns |
| * {@code fromIndex-1}, which is the index of the code point.</li> |
| * <li>If {@code fromIndex} is negative or {@code toIndex} is greater than the text length, |
| * then the behavior of this method is undefined.</li> |
| * </ul> |
| * |
| * Space characters are identified by the {@link Character#isWhitespace(int)} method. |
| * |
| * @param text the string in which to perform the search (can not be null). |
| * @param fromIndex the index from which to start the search (can not be negative). |
| * @param toIndex the index after the last character where to perform the search. |
| * @return the index within the text of the last occurrence of a non-space character, starting |
| * at the specified index, or a value equals or lower than {@code fromIndex} if none. |
| * @throws NullPointerException if the {@code text} argument is null. |
| * |
| * @see #skipLeadingWhitespaces(CharSequence, int, int) |
| * @see #trimWhitespaces(CharSequence) |
| */ |
| public static int skipTrailingWhitespaces(final CharSequence text, final int fromIndex, int toIndex) { |
| while (toIndex > fromIndex) { |
| final int c = Character.codePointBefore(text, toIndex); |
| if (!Character.isWhitespace(c)) break; |
| toIndex -= Character.charCount(c); |
| } |
| return toIndex; |
| } |
| |
| /** |
| * Allocates the array to be returned by the {@code split(…)} methods. If the given {@code text} argument is |
| * an instance of {@link String}, {@link StringBuilder} or {@link StringBuffer}, then this method returns a |
| * {@code String[]} array instead than {@code CharSequence[]}. This is possible because the specification of |
| * their {@link CharSequence#subSequence(int, int)} method guarantees to return {@code String} instances. |
| * Some Apache SIS code will cast the {@code split(…)} return value based on this knowledge. |
| * |
| * <p>Note that this is a undocumented SIS features. There is currently no commitment that this implementation |
| * details will not change in future version.</p> |
| * |
| * @param text the text to be splitted. |
| * @return an array where to store the result of splitting the given {@code text}. |
| */ |
| private static CharSequence[] createSplitArray(final CharSequence text) { |
| return (text instanceof String || |
| text instanceof StringBuilder || |
| text instanceof StringBuffer) ? new String[8] : new CharSequence[8]; |
| } |
| |
| /** |
| * Splits a text around the given character. The array returned by this method contains all |
| * subsequences of the given text that is terminated by the given character or is terminated |
| * by the end of the text. The subsequences in the array are in the order in which they occur |
| * in the given text. If the character is not found in the input, then the resulting array has |
| * just one element, which is the whole given text. |
| * |
| * <p>This method is similar to the standard {@link String#split(String)} method except for the |
| * following:</p> |
| * |
| * <ul> |
| * <li>It accepts generic character sequences.</li> |
| * <li>It accepts {@code null} argument, in which case an empty array is returned.</li> |
| * <li>The separator is a simple character instead than a regular expression.</li> |
| * <li>If the {@code separator} argument is {@code '\n'} or {@code '\r'}, then this method |
| * splits around any of {@code "\r"}, {@code "\n"} or {@code "\r\n"} characters sequences. |
| * <li>The leading and trailing spaces of each subsequences are trimmed.</li> |
| * </ul> |
| * |
| * @param text the text to split, or {@code null}. |
| * @param separator the delimiting character (typically the coma). |
| * @return the array of subsequences computed by splitting the given text around the given |
| * character, or an empty array if {@code toSplit} was null. |
| * |
| * @see String#split(String) |
| */ |
| @SuppressWarnings("ReturnOfCollectionOrArrayField") |
| public static CharSequence[] split(final CharSequence text, final char separator) { |
| if (text == null) { |
| return EMPTY_ARRAY; |
| } |
| if (separator == '\n' || separator == '\r') { |
| final CharSequence[] splitted = splitOnEOL(text); |
| for (int i=0; i < splitted.length; i++) { |
| // For consistency with the rest of this method. |
| splitted[i] = trimWhitespaces(splitted[i]); |
| } |
| return splitted; |
| } |
| // 'excludeEmpty' must use the same criterion than trimWhitespaces(…). |
| final boolean excludeEmpty = isWhitespace(separator); |
| CharSequence[] splitted = createSplitArray(text); |
| final int length = text.length(); |
| int count = 0, last = 0, i = 0; |
| while ((i = indexOf(text, separator, i, length)) >= 0) { |
| final CharSequence item = trimWhitespaces(text, last, i); |
| if (!excludeEmpty || item.length() != 0) { |
| if (count == splitted.length) { |
| splitted = Arrays.copyOf(splitted, count << 1); |
| } |
| splitted[count++] = item; |
| } |
| last = ++i; |
| } |
| // Add the last element. |
| final CharSequence item = trimWhitespaces(text, last, length); |
| if (!excludeEmpty || item.length() != 0) { |
| if (count == splitted.length) { |
| splitted = Arrays.copyOf(splitted, count + 1); |
| } |
| splitted[count++] = item; |
| } |
| return ArraysExt.resize(splitted, count); |
| } |
| |
| /** |
| * Splits a text around the <cite>End Of Line</cite> (EOL) characters. |
| * EOL characters can be any of {@code "\r"}, {@code "\n"} or {@code "\r\n"} sequences. |
| * Each element in the returned array will be a single line. If the given text is already |
| * a single line, then this method returns a singleton containing only the given text. |
| * |
| * <p>Notes:</p> |
| * <ul> |
| * <li>At the difference of <code>{@linkplain #split split}(toSplit, '\n’)</code>, |
| * this method does not remove whitespaces.</li> |
| * <li>This method does not check for Unicode |
| * {@linkplain Characters#LINE_SEPARATOR line separator} and |
| * {@linkplain Characters#PARAGRAPH_SEPARATOR paragraph separator}.</li> |
| * </ul> |
| * |
| * <div class="note"><b>Performance note:</b> |
| * Prior JDK8 this method was usually cheap because all string instances created by |
| * {@link String#substring(int,int)} shared the same {@code char[]} internal array. |
| * However since JDK8, the new {@code String} implementation copies the data in new arrays. |
| * Consequently it is better to use index rather than this method for splitting large {@code String}s. |
| * However this method still useful for other {@link CharSequence} implementations providing an efficient |
| * {@code subSequence(int,int)} method.</div> |
| * |
| * @param text the multi-line text from which to get the individual lines, or {@code null}. |
| * @return the lines in the text, or an empty array if the given text was null. |
| * |
| * @see #indexOfLineStart(CharSequence, int, int) |
| */ |
| @SuppressWarnings("ReturnOfCollectionOrArrayField") |
| public static CharSequence[] splitOnEOL(final CharSequence text) { |
| if (text == null) { |
| return EMPTY_ARRAY; |
| } |
| /* |
| * This method is implemented on top of String.indexOf(int,int), |
| * assuming that it will be faster for String and StringBuilder. |
| */ |
| final int length = text.length(); |
| int lf = indexOf(text, '\n', 0, length); |
| int cr = indexOf(text, '\r', 0, length); |
| if (lf < 0 && cr < 0) { |
| return new CharSequence[] { |
| text |
| }; |
| } |
| int count = 0; |
| CharSequence[] splitted = createSplitArray(text); |
| int last = 0; |
| boolean hasMore; |
| do { |
| int skip = 1; |
| final int splitAt; |
| if (cr < 0) { |
| // There is no "\r" character in the whole text, only "\n". |
| splitAt = lf; |
| hasMore = (lf = indexOf(text, '\n', lf+1, length)) >= 0; |
| } else if (lf < 0) { |
| // There is no "\n" character in the whole text, only "\r". |
| splitAt = cr; |
| hasMore = (cr = indexOf(text, '\r', cr+1, length)) >= 0; |
| } else if (lf < cr) { |
| // There is both "\n" and "\r" characters with "\n" first. |
| splitAt = lf; |
| hasMore = true; |
| lf = indexOf(text, '\n', lf+1, length); |
| } else { |
| // There is both "\r" and "\n" characters with "\r" first. |
| // We need special care for the "\r\n" sequence. |
| splitAt = cr; |
| if (lf == ++cr) { |
| cr = indexOf(text, '\r', cr+1, length); |
| lf = indexOf(text, '\n', lf+1, length); |
| hasMore = (cr >= 0 || lf >= 0); |
| skip = 2; |
| } else { |
| cr = indexOf(text, '\r', cr+1, length); |
| hasMore = true; // Because there is lf. |
| } |
| } |
| if (count >= splitted.length) { |
| splitted = Arrays.copyOf(splitted, count*2); |
| } |
| splitted[count++] = text.subSequence(last, splitAt); |
| last = splitAt + skip; |
| } while (hasMore); |
| /* |
| * Add the remaining string and we are done. |
| */ |
| if (count >= splitted.length) { |
| splitted = Arrays.copyOf(splitted, count+1); |
| } |
| splitted[count++] = text.subSequence(last, text.length()); |
| return ArraysExt.resize(splitted, count); |
| } |
| |
| /** |
| * Returns {@code true} if {@link #split(CharSequence, char)} parsed an empty string. |
| */ |
| private static boolean isEmpty(final CharSequence[] tokens) { |
| switch (tokens.length) { |
| case 0: return true; |
| case 1: return tokens[0].length() == 0; |
| default: return false; |
| } |
| } |
| |
| /** |
| * {@linkplain #split(CharSequence, char) Splits} the given text around the given character, |
| * then {@linkplain Double#parseDouble(String) parses} each item as a {@code double}. |
| * Empty sub-sequences are parsed as {@link Double#NaN}. |
| * |
| * @param values the text containing the values to parse, or {@code null}. |
| * @param separator the delimiting character (typically the coma). |
| * @return the array of numbers parsed from the given text, |
| * or an empty array if {@code values} was null. |
| * @throws NumberFormatException if at least one number can not be parsed. |
| */ |
| public static double[] parseDoubles(final CharSequence values, final char separator) |
| throws NumberFormatException |
| { |
| final CharSequence[] tokens = split(values, separator); |
| if (isEmpty(tokens)) return ArraysExt.EMPTY_DOUBLE; |
| final double[] parsed = new double[tokens.length]; |
| for (int i=0; i<tokens.length; i++) { |
| final String token = trimWhitespaces(tokens[i]).toString(); |
| parsed[i] = token.isEmpty() ? Double.NaN : Double.parseDouble(token); |
| } |
| return parsed; |
| } |
| |
| /** |
| * {@linkplain #split(CharSequence, char) Splits} the given text around the given character, |
| * then {@linkplain Float#parseFloat(String) parses} each item as a {@code float}. |
| * Empty sub-sequences are parsed as {@link Float#NaN}. |
| * |
| * @param values the text containing the values to parse, or {@code null}. |
| * @param separator the delimiting character (typically the coma). |
| * @return the array of numbers parsed from the given text, |
| * or an empty array if {@code values} was null. |
| * @throws NumberFormatException if at least one number can not be parsed. |
| */ |
| public static float[] parseFloats(final CharSequence values, final char separator) |
| throws NumberFormatException |
| { |
| final CharSequence[] tokens = split(values, separator); |
| if (isEmpty(tokens)) return ArraysExt.EMPTY_FLOAT; |
| final float[] parsed = new float[tokens.length]; |
| for (int i=0; i<tokens.length; i++) { |
| final String token = trimWhitespaces(tokens[i]).toString(); |
| parsed[i] = token.isEmpty() ? Float.NaN : Float.parseFloat(token); |
| } |
| return parsed; |
| } |
| |
| /** |
| * {@linkplain #split(CharSequence, char) Splits} the given text around the given character, |
| * then {@linkplain Long#parseLong(String) parses} each item as a {@code long}. |
| * |
| * @param values the text containing the values to parse, or {@code null}. |
| * @param separator the delimiting character (typically the coma). |
| * @param radix the radix to be used for parsing. This is usually 10. |
| * @return the array of numbers parsed from the given text, |
| * or an empty array if {@code values} was null. |
| * @throws NumberFormatException if at least one number can not be parsed. |
| */ |
| public static long[] parseLongs(final CharSequence values, final char separator, final int radix) |
| throws NumberFormatException |
| { |
| final CharSequence[] tokens = split(values, separator); |
| if (isEmpty(tokens)) return ArraysExt.EMPTY_LONG; |
| final long[] parsed = new long[tokens.length]; |
| for (int i=0; i<tokens.length; i++) { |
| parsed[i] = Long.parseLong(trimWhitespaces(tokens[i]).toString(), radix); |
| } |
| return parsed; |
| } |
| |
| /** |
| * {@linkplain #split(CharSequence, char) Splits} the given text around the given character, |
| * then {@linkplain Integer#parseInt(String) parses} each item as an {@code int}. |
| * |
| * @param values the text containing the values to parse, or {@code null}. |
| * @param separator the delimiting character (typically the coma). |
| * @param radix the radix to be used for parsing. This is usually 10. |
| * @return the array of numbers parsed from the given text, |
| * or an empty array if {@code values} was null. |
| * @throws NumberFormatException if at least one number can not be parsed. |
| */ |
| public static int[] parseInts(final CharSequence values, final char separator, final int radix) |
| throws NumberFormatException |
| { |
| final CharSequence[] tokens = split(values, separator); |
| if (isEmpty(tokens)) return ArraysExt.EMPTY_INT; |
| final int[] parsed = new int[tokens.length]; |
| for (int i=0; i<tokens.length; i++) { |
| parsed[i] = Integer.parseInt(trimWhitespaces(tokens[i]).toString(), radix); |
| } |
| return parsed; |
| } |
| |
| /** |
| * {@linkplain #split(CharSequence, char) Splits} the given text around the given character, |
| * then {@linkplain Short#parseShort(String) parses} each item as a {@code short}. |
| * |
| * @param values the text containing the values to parse, or {@code null}. |
| * @param separator the delimiting character (typically the coma). |
| * @param radix the radix to be used for parsing. This is usually 10. |
| * @return the array of numbers parsed from the given text, |
| * or an empty array if {@code values} was null. |
| * @throws NumberFormatException if at least one number can not be parsed. |
| */ |
| public static short[] parseShorts(final CharSequence values, final char separator, final int radix) |
| throws NumberFormatException |
| { |
| final CharSequence[] tokens = split(values, separator); |
| if (isEmpty(tokens)) return ArraysExt.EMPTY_SHORT; |
| final short[] parsed = new short[tokens.length]; |
| for (int i=0; i<tokens.length; i++) { |
| parsed[i] = Short.parseShort(trimWhitespaces(tokens[i]).toString(), radix); |
| } |
| return parsed; |
| } |
| |
| /** |
| * {@linkplain #split(CharSequence, char) Splits} the given text around the given character, |
| * then {@linkplain Byte#parseByte(String) parses} each item as a {@code byte}. |
| * |
| * @param values the text containing the values to parse, or {@code null}. |
| * @param separator the delimiting character (typically the coma). |
| * @param radix the radix to be used for parsing. This is usually 10. |
| * @return the array of numbers parsed from the given text, |
| * or an empty array if {@code values} was null. |
| * @throws NumberFormatException if at least one number can not be parsed. |
| */ |
| public static byte[] parseBytes(final CharSequence values, final char separator, final int radix) |
| throws NumberFormatException |
| { |
| final CharSequence[] tokens = split(values, separator); |
| if (isEmpty(tokens)) return ArraysExt.EMPTY_BYTE; |
| final byte[] parsed = new byte[tokens.length]; |
| for (int i=0; i<tokens.length; i++) { |
| parsed[i] = Byte.parseByte(trimWhitespaces(tokens[i]).toString(), radix); |
| } |
| return parsed; |
| } |
| |
| /** |
| * Replaces some Unicode characters by ASCII characters on a "best effort basis". |
| * For example the “ é ” character is replaced by “ e ” (without accent), |
| * the “ ″ ” symbol for minutes of angle is replaced by straight double quotes “ " ”, |
| * and combined characters like ㎏, ㎎, ㎝, ㎞, ㎢, ㎦, ㎖, ㎧, ㎩, ㎐, <i>etc.</i> are replaced |
| * by the corresponding sequences of characters. |
| * |
| * <div class="note"><b>Note:</b> |
| * the replacement of Greek letters is a more complex task than what this method can do, |
| * since it depends on the context. For example if the Greek letters are abbreviations |
| * for coordinate system axes like φ and λ, then the replacements depend on the enclosing |
| * coordinate system. See {@link org.apache.sis.io.wkt.Transliterator} for more information.</div> |
| * |
| * @param text the text to scan for Unicode characters to replace by ASCII characters, or {@code null}. |
| * @return the given text with substitutions applied, or {@code text} if no replacement |
| * has been applied, or {@code null} if the given text was null. |
| * |
| * @see StringBuilders#toASCII(StringBuilder) |
| * @see org.apache.sis.io.wkt.Transliterator#filter(String) |
| * @see java.text.Normalizer |
| */ |
| public static CharSequence toASCII(final CharSequence text) { |
| return StringBuilders.toASCII(text, null); |
| } |
| |
| /** |
| * Returns a string with leading and trailing whitespace characters omitted. |
| * This method is similar in purpose to {@link String#trim()}, except that the later considers |
| * every {@linkplain Character#isISOControl(int) ISO control codes} below 32 to be a whitespace. |
| * That {@code String.trim()} behavior has the side effect of removing the heading of ANSI escape |
| * sequences (a.k.a. X3.64), and to ignore Unicode spaces. This {@code trimWhitespaces(…)} method |
| * is built on the more accurate {@link Character#isWhitespace(int)} method instead. |
| * |
| * <p>This method performs the same work than {@link #trimWhitespaces(CharSequence)}, |
| * but is overloaded for the {@code String} type because of its frequent use.</p> |
| * |
| * @param text the text from which to remove leading and trailing whitespaces, or {@code null}. |
| * @return a string with leading and trailing whitespaces removed, or {@code null} is the given |
| * text was null. |
| */ |
| public static String trimWhitespaces(String text) { |
| if (text != null) { |
| final int length = text.length(); |
| final int lower = skipLeadingWhitespaces(text, 0, length); |
| text = text.substring(lower, skipTrailingWhitespaces(text, lower, length)); |
| } |
| return text; |
| } |
| |
| /** |
| * Returns a text with leading and trailing whitespace characters omitted. |
| * Space characters are identified by the {@link Character#isWhitespace(int)} method. |
| * |
| * <p>This method is the generic version of {@link #trimWhitespaces(String)}.</p> |
| * |
| * @param text the text from which to remove leading and trailing whitespaces, or {@code null}. |
| * @return a characters sequence with leading and trailing whitespaces removed, |
| * or {@code null} is the given text was null. |
| * |
| * @see #skipLeadingWhitespaces(CharSequence, int, int) |
| * @see #skipTrailingWhitespaces(CharSequence, int, int) |
| */ |
| public static CharSequence trimWhitespaces(CharSequence text) { |
| if (text != null) { |
| text = trimWhitespaces(text, 0, text.length()); |
| } |
| return text; |
| } |
| |
| /** |
| * Returns a sub-sequence with leading and trailing whitespace characters omitted. |
| * Space characters are identified by the {@link Character#isWhitespace(int)} method. |
| * |
| * <p>Invoking this method is functionally equivalent to the following code snippet, |
| * except that the {@link CharSequence#subSequence(int, int) subSequence} method is |
| * invoked only once instead of two times:</p> |
| * |
| * {@preformat java |
| * text = trimWhitespaces(text.subSequence(lower, upper)); |
| * } |
| * |
| * @param text the text from which to remove leading and trailing white spaces. |
| * @param lower index of the first character to consider for inclusion in the sub-sequence. |
| * @param upper index after the last character to consider for inclusion in the sub-sequence. |
| * @return a characters sequence with leading and trailing white spaces removed, or {@code null} |
| * if the {@code text} argument is null. |
| * @throws IndexOutOfBoundsException if {@code lower} or {@code upper} is out of bounds. |
| */ |
| public static CharSequence trimWhitespaces(CharSequence text, int lower, int upper) { |
| final int length = length(text); |
| ArgumentChecks.ensureValidIndexRange(length, lower, upper); |
| if (text != null) { |
| lower = skipLeadingWhitespaces (text, lower, upper); |
| upper = skipTrailingWhitespaces(text, lower, upper); |
| if (lower != 0 || upper != length) { // Safety in case subSequence doesn't make the check. |
| text = text.subSequence(lower, upper); |
| } |
| } |
| return text; |
| } |
| |
| /** |
| * Trims the fractional part of the given formatted number, provided that it doesn't change |
| * the value. This method assumes that the number is formatted in the US locale, typically |
| * by the {@link Double#toString(double)} method. |
| * |
| * <p>More specifically if the given value ends with a {@code '.'} character followed by a |
| * sequence of {@code '0'} characters, then those characters are omitted. Otherwise this |
| * method returns the text unchanged. This is a <cite>"all or nothing"</cite> method: |
| * either the fractional part is completely removed, or either it is left unchanged.</p> |
| * |
| * <div class="section">Examples</div> |
| * This method returns {@code "4"} if the given value is {@code "4."}, {@code "4.0"} or |
| * {@code "4.00"}, but returns {@code "4.10"} unchanged (including the trailing {@code '0'} |
| * character) if the input is {@code "4.10"}. |
| * |
| * <div class="section">Use case</div> |
| * This method is useful before to {@linkplain Integer#parseInt(String) parse a number} |
| * if that number should preferably be parsed as an integer before attempting to parse |
| * it as a floating point number. |
| * |
| * @param value the value to trim if possible, or {@code null}. |
| * @return the value without the trailing {@code ".0"} part (if any), |
| * or {@code null} if the given text was null. |
| * |
| * @see StringBuilders#trimFractionalPart(StringBuilder) |
| */ |
| public static CharSequence trimFractionalPart(final CharSequence value) { |
| if (value != null) { |
| for (int i=value.length(); i>0;) { |
| final int c = codePointBefore(value, i); |
| i -= charCount(c); |
| switch (c) { |
| case '0': continue; |
| case '.': return value.subSequence(0, i); |
| default : return value; |
| } |
| } |
| } |
| return value; |
| } |
| |
| /** |
| * Makes sure that the {@code text} string is not longer than {@code maxLength} characters. |
| * If {@code text} is not longer, then it is returned unchanged. Otherwise this method returns |
| * a copy of {@code text} with some characters substituted by the {@code "(…)"} string. |
| * |
| * <p>If the text needs to be shortened, then this method tries to apply the above-cited |
| * substitution between two words. For example, the following text:</p> |
| * |
| * <blockquote> |
| * "This sentence given as an example is way too long to be included in a short name." |
| * </blockquote> |
| * |
| * May be shortened to something like this: |
| * |
| * <blockquote> |
| * "This sentence given (…) in a short name." |
| * </blockquote> |
| * |
| * @param text the sentence to reduce if it is too long, or {@code null}. |
| * @param maxLength the maximum length allowed for {@code text}. |
| * @return a sentence not longer than {@code maxLength}, or {@code null} if the given text was null. |
| */ |
| public static CharSequence shortSentence(CharSequence text, final int maxLength) { |
| ArgumentChecks.ensureStrictlyPositive("maxLength", maxLength); |
| if (text != null) { |
| final int length = text.length(); |
| int toRemove = length - maxLength; |
| if (toRemove > 0) { |
| toRemove += 5; // Space needed for the " (…) " string. |
| /* |
| * We will remove characters from 'lower' to 'upper' both exclusive. We try to |
| * adjust 'lower' and 'upper' in such a way that the first and last characters |
| * to be removed will be spaces or punctuation characters. |
| */ |
| int lower = length >>> 1; |
| if (lower != 0 && isLowSurrogate(text.charAt(lower))) { |
| lower--; |
| } |
| int upper = lower; |
| boolean forward = false; |
| do { // To be run as long as we need to remove more characters. |
| int nc=0, type=UNASSIGNED; |
| forward = !forward; |
| searchWordBreak: while (true) { |
| final int c; |
| if (forward) { |
| if ((upper += nc) == length) break; |
| c = codePointAt(text, upper); |
| } else { |
| if ((lower -= nc) == 0) break; |
| c = codePointBefore(text, lower); |
| } |
| nc = charCount(c); |
| if (isWhitespace(c)) { |
| if (type != UNASSIGNED) { |
| type = SPACE_SEPARATOR; |
| } |
| } else switch (type) { |
| // After we skipped white, then non-white, then white characters, stop. |
| case SPACE_SEPARATOR: { |
| break searchWordBreak; |
| } |
| // For the first non-white character, just remember its type. |
| // Arbitrarily use UPPERCASE_LETTER for any kind of identifier |
| // part (which include UPPERCASE_LETTER anyway). |
| case UNASSIGNED: { |
| type = isUnicodeIdentifierPart(c) ? UPPERCASE_LETTER : getType(c); |
| break; |
| } |
| // If we expected an identifier, stop at the first other char. |
| case UPPERCASE_LETTER: { |
| if (!isUnicodeIdentifierPart(c)) { |
| break searchWordBreak; |
| } |
| break; |
| } |
| // For all other kind of character, break when the type change. |
| default: { |
| if (getType(c) != type) { |
| break searchWordBreak; |
| } |
| break; |
| } |
| } |
| toRemove -= nc; |
| } |
| } while (toRemove > 0); |
| text = new StringBuilder(lower + (length-upper) + 5) // 5 is the length of " (…) " |
| .append(text, 0, lower).append(" (…) ").append(text, upper, length); |
| } |
| } |
| return text; |
| } |
| |
| /** |
| * Given a string in upper cases (typically a Java constant), returns a string formatted |
| * like an English sentence. This heuristic method performs the following steps: |
| * |
| * <ol> |
| * <li>Replace all occurrences of {@code '_'} by spaces.</li> |
| * <li>Converts all letters except the first one to lower case letters using |
| * {@link Character#toLowerCase(int)}. Note that this method does not use |
| * the {@link String#toLowerCase()} method. Consequently the system locale |
| * is ignored. This method behaves as if the conversion were done in the |
| * {@linkplain java.util.Locale#ROOT root} locale.</li> |
| * </ol> |
| * |
| * <p>Note that those heuristic rules may be modified in future SIS versions, |
| * depending on the practical experience gained.</p> |
| * |
| * @param identifier the name of a Java constant, or {@code null}. |
| * @return the identifier like an English sentence, or {@code null} |
| * if the given {@code identifier} argument was null. |
| */ |
| public static CharSequence upperCaseToSentence(final CharSequence identifier) { |
| if (identifier == null) { |
| return null; |
| } |
| final StringBuilder buffer = new StringBuilder(identifier.length()); |
| final int length = identifier.length(); |
| for (int i=0; i<length;) { |
| int c = Character.codePointAt(identifier, i); |
| if (i != 0) { |
| if (c == '_') { |
| c = ' '; |
| } else { |
| c = Character.toLowerCase(c); |
| } |
| } |
| buffer.appendCodePoint(c); |
| i += Character.charCount(c); |
| } |
| return buffer; |
| } |
| |
| /** |
| * Given a string in camel cases (typically an identifier), returns a string formatted |
| * like an English sentence. This heuristic method performs the following steps: |
| * |
| * <ol> |
| * <li>Invoke {@link #camelCaseToWords(CharSequence, boolean)}, which separate the words |
| * on the basis of character case. For example {@code "transferFunctionType"} become |
| * <cite>"transfer function type"</cite>. This works fine for ISO 19115 identifiers.</li> |
| * |
| * <li>Next replace all occurrence of {@code '_'} by spaces in order to take in account |
| * an other common naming convention, which uses {@code '_'} as a word separator. This |
| * convention is used by netCDF attributes like {@code "project_name"}.</li> |
| * |
| * <li>Finally ensure that the first character is upper-case.</li> |
| * </ol> |
| * |
| * <div class="section">Exception to the above rules</div> |
| * If the given identifier contains only upper-case letters, digits and the {@code '_'} character, |
| * then the identifier is returned "as is" except for the {@code '_'} characters which are replaced by {@code '-'}. |
| * This work well for identifiers like {@code "UTF-8"} or {@code "ISO-LATIN-1"} for instance. |
| * |
| * <p>Note that those heuristic rules may be modified in future SIS versions, |
| * depending on the practical experience gained.</p> |
| * |
| * @param identifier an identifier with no space, words begin with an upper-case character, or {@code null}. |
| * @return the identifier with spaces inserted after what looks like words, or {@code null} |
| * if the given {@code identifier} argument was null. |
| */ |
| public static CharSequence camelCaseToSentence(final CharSequence identifier) { |
| if (identifier == null) { |
| return null; |
| } |
| final StringBuilder buffer; |
| if (isCode(identifier)) { |
| if (identifier instanceof String) { |
| return ((String) identifier).replace('_', '-'); |
| } |
| buffer = new StringBuilder(identifier); |
| StringBuilders.replace(buffer, '_', '-'); |
| } else { |
| buffer = (StringBuilder) camelCaseToWords(identifier, true); |
| final int length = buffer.length(); |
| if (length != 0) { |
| StringBuilders.replace(buffer, '_', ' '); |
| final int c = buffer.codePointAt(0); |
| final int up = toUpperCase(c); |
| if (c != up) { |
| StringBuilders.replace(buffer, 0, charCount(c), toChars(up)); |
| } |
| } |
| } |
| return buffer; |
| } |
| |
| /** |
| * Given a string in camel cases, returns a string with the same words separated by spaces. |
| * A word begins with a upper-case character following a lower-case character. For example |
| * if the given string is {@code "PixelInterleavedSampleModel"}, then this method returns |
| * <cite>"Pixel Interleaved Sample Model"</cite> or <cite>"Pixel interleaved sample model"</cite> |
| * depending on the value of the {@code toLowerCase} argument. |
| * |
| * <p>If {@code toLowerCase} is {@code false}, then this method inserts spaces but does not change |
| * the case of characters. If {@code toLowerCase} is {@code true}, then this method changes |
| * {@linkplain Character#toLowerCase(int) to lower case} the first character after each spaces |
| * inserted by this method (note that this intentionally exclude the very first character in |
| * the given string), except if the second character {@linkplain Character#isUpperCase(int) |
| * is upper case}, in which case the word is assumed an acronym.</p> |
| * |
| * <p>The given string is usually a programmatic identifier like a class name or a method name.</p> |
| * |
| * @param identifier an identifier with no space, words begin with an upper-case character. |
| * @param toLowerCase {@code true} for changing the first character of words to lower case, |
| * except for the first word and acronyms. |
| * @return the identifier with spaces inserted after what looks like words, or {@code null} |
| * if the given {@code identifier} argument was null. |
| */ |
| public static CharSequence camelCaseToWords(final CharSequence identifier, final boolean toLowerCase) { |
| if (identifier == null) { |
| return null; |
| } |
| /* |
| * Implementation note: the 'camelCaseToSentence' method needs |
| * this method to unconditionally returns a new StringBuilder. |
| */ |
| final int length = identifier.length(); |
| final StringBuilder buffer = new StringBuilder(length + 8); |
| final int lastIndex = (length != 0) ? length - charCount(codePointBefore(identifier, length)) : 0; |
| int last = 0; |
| for (int i=1; i<=length;) { |
| final int cp; |
| final boolean doAppend; |
| if (i == length) { |
| cp = 0; |
| doAppend = true; |
| } else { |
| cp = codePointAt(identifier, i); |
| doAppend = Character.isUpperCase(cp) && isLowerCase(codePointBefore(identifier, i)); |
| } |
| if (doAppend) { |
| final int pos = buffer.length(); |
| buffer.append(identifier, last, i).append(' '); |
| if (toLowerCase && pos!=0 && last<lastIndex && isLowerCase(codePointAfter(identifier, last))) { |
| final int c = buffer.codePointAt(pos); |
| final int low = toLowerCase(c); |
| if (c != low) { |
| StringBuilders.replace(buffer, pos, pos + charCount(c), toChars(low)); |
| } |
| } |
| last = i; |
| } |
| i += charCount(cp); |
| } |
| /* |
| * Removes the trailing space, if any. |
| */ |
| final int lg = buffer.length(); |
| if (lg != 0) { |
| final int cp = buffer.codePointBefore(lg); |
| if (isWhitespace(cp)) { |
| buffer.setLength(lg - charCount(cp)); |
| } |
| } |
| return buffer; |
| } |
| |
| /** |
| * Creates an acronym from the given text. This method returns a string containing the first character of each word, |
| * where the words are separated by the camel case convention, the {@code '_'} character, or any character which is |
| * not a {@linkplain Character#isUnicodeIdentifierPart(int) Unicode identifier part} (including spaces). |
| * |
| * <p>An exception to the above rule happens if the given text is a Unicode identifier without the {@code '_'} |
| * character, and every characters are upper case. In such case the text is returned unchanged on the assumption |
| * that it is already an acronym.</p> |
| * |
| * <p><b>Examples:</b> given {@code "northEast"}, this method returns {@code "NE"}. |
| * Given {@code "Open Geospatial Consortium"}, this method returns {@code "OGC"}.</p> |
| * |
| * @param text the text for which to create an acronym, or {@code null}. |
| * @return the acronym, or {@code null} if the given text was null. |
| */ |
| public static CharSequence camelCaseToAcronym(CharSequence text) { |
| text = trimWhitespaces(text); |
| if (text != null && !isAcronym(text)) { |
| final int length = text.length(); |
| final StringBuilder buffer = new StringBuilder(8); // Acronyms are usually short. |
| boolean wantChar = true; |
| for (int i=0; i<length;) { |
| final int c = codePointAt(text, i); |
| if (wantChar) { |
| if (isUnicodeIdentifierStart(c)) { |
| buffer.appendCodePoint(c); |
| wantChar = false; |
| } |
| } else if (!isUnicodeIdentifierPart(c) || c == '_') { |
| wantChar = true; |
| } else if (Character.isUpperCase(c)) { |
| // Test for mixed-case (e.g. "northEast"). |
| // Note that i is guaranteed to be greater than 0 here. |
| if (!Character.isUpperCase(Character.codePointBefore(text, i))) { |
| buffer.appendCodePoint(c); |
| } |
| } |
| i += charCount(c); |
| } |
| final int acrlg = buffer.length(); |
| if (acrlg != 0) { |
| /* |
| * If every characters except the first one are upper-case, ensure that the |
| * first one is upper-case as well. This is for handling the identifiers which |
| * are compliant to Java-Beans convention (e.g. "northEast"). |
| */ |
| if (isUpperCase(buffer, 1, acrlg, true)) { |
| final int c = buffer.codePointAt(0); |
| final int up = toUpperCase(c); |
| if (c != up) { |
| StringBuilders.replace(buffer, 0, charCount(c), toChars(up)); |
| } |
| } |
| if (!equals(text, buffer)) { |
| text = buffer; |
| } |
| } |
| } |
| return text; |
| } |
| |
| /** |
| * Returns {@code true} if the first string is likely to be an acronym of the second string. |
| * An acronym is a sequence of {@linkplain Character#isLetterOrDigit(int) letters or digits} |
| * built from at least one character of each word in the {@code words} string. More than |
| * one character from the same word may appear in the acronym, but they must always |
| * be the first consecutive characters. The comparison is case-insensitive. |
| * |
| * <div class="note"><b>Example:</b> |
| * Given the {@code "Open Geospatial Consortium"} words, the following strings are recognized as acronyms: |
| * {@code "OGC"}, {@code "ogc"}, {@code "O.G.C."}, {@code "OpGeoCon"}.</div> |
| * |
| * If any of the given arguments is {@code null}, this method returns {@code false}. |
| * |
| * @param acronym a possible acronym of the sequence of words, or {@code null}. |
| * @param words the sequence of words, or {@code null}. |
| * @return {@code true} if the first string is an acronym of the second one. |
| */ |
| public static boolean isAcronymForWords(final CharSequence acronym, final CharSequence words) { |
| final int lga = length(acronym); |
| int ia=0, ca; |
| do { |
| if (ia >= lga) return false; |
| ca = codePointAt(acronym, ia); |
| ia += charCount(ca); |
| } while (!isLetterOrDigit(ca)); |
| final int lgc = length(words); |
| int ic=0, cc; |
| do { |
| if (ic >= lgc) return false; |
| cc = codePointAt(words, ic); |
| ic += charCount(cc); |
| } |
| while (!isLetterOrDigit(cc)); |
| if (toUpperCase(ca) != toUpperCase(cc)) { |
| // The first letter must match. |
| return false; |
| } |
| cmp: while (ia < lga) { |
| if (ic >= lgc) { |
| // There is more letters in the acronym than in the complete name. |
| return false; |
| } |
| ca = codePointAt(acronym, ia); ia += charCount(ca); |
| cc = codePointAt(words, ic); ic += charCount(cc); |
| if (isLetterOrDigit(ca)) { |
| if (toUpperCase(ca) == toUpperCase(cc)) { |
| // Acronym letter matches the letter from the complete name. |
| // Continue the comparison with next letter of both strings. |
| continue; |
| } |
| // Will search for the next word after the 'else' block. |
| } else do { |
| if (ia >= lga) break cmp; |
| ca = codePointAt(acronym, ia); |
| ia += charCount(ca); |
| } while (!isLetterOrDigit(ca)); |
| /* |
| * At this point, 'ca' is the next acronym letter to compare and we |
| * need to search for the next word in the complete name. We first |
| * skip remaining letters, then we skip non-letter characters. |
| */ |
| boolean skipLetters = true; |
| do while (isLetterOrDigit(cc) == skipLetters) { |
| if (ic >= lgc) { |
| return false; |
| } |
| cc = codePointAt(words, ic); |
| ic += charCount(cc); |
| } while ((skipLetters = !skipLetters) == false); |
| // Now that we are aligned on a new word, the first letter must match. |
| if (toUpperCase(ca) != toUpperCase(cc)) { |
| return false; |
| } |
| } |
| /* |
| * Now that we have processed all acronym letters, the complete name can not have |
| * any additional word. We can only finish the current word and skip trailing non- |
| * letter characters. |
| */ |
| boolean skipLetters = true; |
| do { |
| do { |
| if (ic >= lgc) return true; |
| cc = codePointAt(words, ic); |
| ic += charCount(cc); |
| } while (isLetterOrDigit(cc) == skipLetters); |
| } while ((skipLetters = !skipLetters) == false); |
| return false; |
| } |
| |
| /** |
| * Returns {@code true} if the given string contains only upper case letters or digits. |
| * A few punctuation characters like {@code '_'} and {@code '.'} are also accepted. |
| * |
| * <p>This method is used for identifying character strings that are likely to be code |
| * like {@code "UTF-8"} or {@code "ISO-LATIN-1"}.</p> |
| * |
| * @see #isUnicodeIdentifier(CharSequence) |
| */ |
| private static boolean isCode(final CharSequence identifier) { |
| for (int i=identifier.length(); --i>=0;) { |
| final char c = identifier.charAt(i); |
| // No need to use the code point API here, since the conditions |
| // below are requiring the characters to be in the basic plane. |
| if (!((c >= 'A' && c <= 'Z') || (c >= '-' && c <= ':') || c == '_')) { |
| return false; |
| } |
| } |
| return true; |
| } |
| |
| /** |
| * Returns {@code true} if the given text is presumed to be an acronym. Acronyms are presumed |
| * to be valid Unicode identifiers in all upper-case letters and without the {@code '_'} character. |
| * |
| * @see #camelCaseToAcronym(CharSequence) |
| */ |
| private static boolean isAcronym(final CharSequence text) { |
| return isUpperCase(text) && indexOf(text, '_', 0, text.length()) < 0 && isUnicodeIdentifier(text); |
| } |
| |
| /** |
| * Returns {@code true} if the given identifier is a legal Unicode identifier. |
| * This method returns {@code true} if the identifier length is greater than zero, |
| * the first character is a {@linkplain Character#isUnicodeIdentifierStart(int) |
| * Unicode identifier start} and all remaining characters (if any) are |
| * {@linkplain Character#isUnicodeIdentifierPart(int) Unicode identifier parts}. |
| * |
| * <div class="section">Relationship with legal XML identifiers</div> |
| * Most legal Unicode identifiers are also legal XML identifiers, but the converse is not true. |
| * The most noticeable differences are the ‘{@code :}’, ‘{@code -}’ and ‘{@code .}’ characters, |
| * which are legal in XML identifiers but not in Unicode. |
| * |
| * <table class="sis"> |
| * <caption>Characters legal in one set but not in the other</caption> |
| * <tr><th colspan="2">Not legal in Unicode</th> <th class="sep" colspan="2">Not legal in XML</th></tr> |
| * <tr><td>{@code :}</td><td>(colon)</td> <td class="sep">{@code µ}</td><td>(micro sign)</td></tr> |
| * <tr><td>{@code -}</td><td>(hyphen or minus)</td> <td class="sep">{@code ª}</td><td>(feminine ordinal indicator)</td></tr> |
| * <tr><td>{@code .}</td><td>(dot)</td> <td class="sep">{@code º}</td><td>(masculine ordinal indicator)</td></tr> |
| * <tr><td>{@code ·}</td><td>(middle dot)</td> <td class="sep">{@code ⁔}</td><td>(inverted undertie)</td></tr> |
| * <tr> |
| * <td colspan="2">Many punctuation, symbols, <i>etc</i>.</td> |
| * <td colspan="2" class="sep">{@linkplain Character#isIdentifierIgnorable(int) Identifier ignorable} characters.</td> |
| * </tr> |
| * </table> |
| * |
| * Note that the ‘{@code _}’ (underscore) character is legal according both Unicode and XML, while spaces, |
| * ‘{@code !}’, ‘{@code #}’, ‘{@code *}’, ‘{@code /}’, ‘{@code ?}’ and most other punctuation characters are not. |
| * |
| * <div class="section">Usage in Apache SIS</div> |
| * In its handling of {@linkplain org.apache.sis.referencing.ImmutableIdentifier identifiers}, Apache SIS favors |
| * Unicode identifiers without {@linkplain Character#isIdentifierIgnorable(int) ignorable} characters since those |
| * identifiers are legal XML identifiers except for the above-cited rarely used characters. As a side effect, |
| * this policy excludes ‘{@code :}’, ‘{@code -}’ and ‘{@code .}’ which would normally be legal XML identifiers. |
| * But since those characters could easily be confused with |
| * {@linkplain org.apache.sis.util.iso.DefaultNameSpace#DEFAULT_SEPARATOR namespace separators}, |
| * this exclusion is considered desirable. |
| * |
| * @param identifier the character sequence to test, or {@code null}. |
| * @return {@code true} if the given character sequence is a legal Unicode identifier. |
| * |
| * @see org.apache.sis.referencing.ImmutableIdentifier |
| * @see org.apache.sis.metadata.iso.citation.Citations#toCodeSpace(Citation) |
| * @see org.apache.sis.referencing.IdentifiedObjects#getSimpleNameOrIdentifier(IdentifiedObject) |
| */ |
| public static boolean isUnicodeIdentifier(final CharSequence identifier) { |
| final int length = length(identifier); |
| if (length == 0) { |
| return false; |
| } |
| int c = codePointAt(identifier, 0); |
| if (!isUnicodeIdentifierStart(c)) { |
| return false; |
| } |
| for (int i=0; (i += charCount(c)) < length;) { |
| c = codePointAt(identifier, i); |
| if (!isUnicodeIdentifierPart(c)) { |
| return false; |
| } |
| } |
| return true; |
| } |
| |
| /** |
| * Returns {@code true} if the given text is non-null, contains at least one upper-case character and |
| * no lower-case character. Space and punctuation are ignored. |
| * |
| * @param text the character sequence to test (may be {@code null}). |
| * @return {@code true} if non-null, contains at least one upper-case character and no lower-case character. |
| * |
| * @see String#toUpperCase() |
| * |
| * @since 0.7 |
| */ |
| public static boolean isUpperCase(final CharSequence text) { |
| return isUpperCase(text, 0, length(text), false); |
| } |
| |
| /** |
| * Returns {@code true} if the given sub-sequence is non-null, contains at least one upper-case character and |
| * no lower-case character. Space and punctuation are ignored. |
| * |
| * @param text the character sequence to test. |
| * @param lower index of the first character to check, inclusive. |
| * @param upper index of the last character to check, exclusive. |
| * @param hasUpperCase {@code true} if this method should behave as if the given text already had |
| * at least one upper-case character (not necessarily in the portion given by the indices). |
| * @return {@code true} if contains at least one upper-case character and no lower-case character. |
| */ |
| private static boolean isUpperCase(final CharSequence text, int lower, final int upper, boolean hasUpperCase) { |
| while (lower < upper) { |
| final int c = codePointAt(text, lower); |
| if (Character.isLowerCase(c)) { |
| return false; |
| } |
| if (!hasUpperCase) { |
| hasUpperCase = Character.isUpperCase(c); |
| } |
| lower += charCount(c); |
| } |
| return hasUpperCase; |
| } |
| |
| /** |
| * Returns {@code true} if the given texts are equal, optionally ignoring case and filtered-out characters. |
| * This method is sometime used for comparing identifiers in a lenient way. |
| * |
| * <p><b>Example:</b> the following call compares the two strings ignoring case and any |
| * characters which are not {@linkplain Character#isLetterOrDigit(int) letter or digit}. |
| * In particular, spaces and punctuation characters like {@code '_'} and {@code '-'} are |
| * ignored:</p> |
| * |
| * {@preformat java |
| * assert equalsFiltered("WGS84", "WGS_84", Characters.Filter.LETTERS_AND_DIGITS, true) == true; |
| * } |
| * |
| * @param s1 the first characters sequence to compare, or {@code null}. |
| * @param s2 the second characters sequence to compare, or {@code null}. |
| * @param filter the subset of characters to compare, or {@code null} for comparing all characters. |
| * @param ignoreCase {@code true} for ignoring cases, or {@code false} for requiring exact match. |
| * @return {@code true} if both arguments are {@code null} or if the two given texts are equal, |
| * optionally ignoring case and filtered-out characters. |
| */ |
| public static boolean equalsFiltered(final CharSequence s1, final CharSequence s2, |
| final Characters.Filter filter, final boolean ignoreCase) |
| { |
| if (s1 == s2) { |
| return true; |
| } |
| if (s1 == null || s2 == null) { |
| return false; |
| } |
| if (filter == null) { |
| return ignoreCase ? equalsIgnoreCase(s1, s2) : equals(s1, s2); |
| } |
| final int lg1 = s1.length(); |
| final int lg2 = s2.length(); |
| int i1 = 0, i2 = 0; |
| while (i1 < lg1) { |
| int c1 = codePointAt(s1, i1); |
| final int n = charCount(c1); |
| if (filter.contains(c1)) { |
| int c2; // Fetch the next significant character from the second string. |
| do { |
| if (i2 >= lg2) { |
| return false; // The first string has more significant characters than expected. |
| } |
| c2 = codePointAt(s2, i2); |
| i2 += charCount(c2); |
| } while (!filter.contains(c2)); |
| |
| // Compare the characters in the same way than String.equalsIgnoreCase(String). |
| if (c1 != c2 && !(ignoreCase && equalsIgnoreCase(c1, c2))) { |
| return false; |
| } |
| } |
| i1 += n; |
| } |
| while (i2 < lg2) { |
| final int s = codePointAt(s2, i2); |
| if (filter.contains(s)) { |
| return false; // The first string has less significant characters than expected. |
| } |
| i2 += charCount(s); |
| } |
| return true; |
| } |
| |
| /** |
| * Returns {@code true} if the given code points are equal, ignoring case. |
| * This method implements the same comparison algorithm than String#equalsIgnoreCase(String). |
| * |
| * <p>This method does not verify if {@code c1 == c2}. This check should have been done |
| * by the caller, since the caller code is a more optimal place for this check.</p> |
| */ |
| private static boolean equalsIgnoreCase(int c1, int c2) { |
| c1 = toUpperCase(c1); |
| c2 = toUpperCase(c2); |
| if (c1 == c2) { |
| return true; |
| } |
| // Need this check for Georgian alphabet. |
| return toLowerCase(c1) == toLowerCase(c2); |
| } |
| |
| /** |
| * Returns {@code true} if the two given texts are equal, ignoring case. |
| * This method is similar to {@link String#equalsIgnoreCase(String)}, except |
| * it works on arbitrary character sequences and compares <cite>code points</cite> |
| * instead than characters. |
| * |
| * @param s1 the first string to compare, or {@code null}. |
| * @param s2 the second string to compare, or {@code null}. |
| * @return {@code true} if the two given texts are equal, ignoring case, |
| * or if both arguments are {@code null}. |
| * |
| * @see String#equalsIgnoreCase(String) |
| */ |
| public static boolean equalsIgnoreCase(final CharSequence s1, final CharSequence s2) { |
| if (s1 == s2) { |
| return true; |
| } |
| if (s1 == null || s2 == null) { |
| return false; |
| } |
| // Do not check for String cases. We do not want to delegate to String.equalsIgnoreCase |
| // because we compare code points while String.equalsIgnoreCase compares characters. |
| final int lg1 = s1.length(); |
| final int lg2 = s2.length(); |
| int i1 = 0, i2 = 0; |
| while (i1<lg1 && i2<lg2) { |
| final int c1 = codePointAt(s1, i1); |
| final int c2 = codePointAt(s2, i2); |
| if (c1 != c2 && !equalsIgnoreCase(c1, c2)) { |
| return false; |
| } |
| i1 += charCount(c1); |
| i2 += charCount(c2); |
| } |
| return i1 == i2; |
| } |
| |
| /** |
| * Returns {@code true} if the two given texts are equal. This method delegates to |
| * {@link String#contentEquals(CharSequence)} if possible. This method never invoke |
| * {@link CharSequence#toString()} in order to avoid a potentially large copy of data. |
| * |
| * @param s1 the first string to compare, or {@code null}. |
| * @param s2 the second string to compare, or {@code null}. |
| * @return {@code true} if the two given texts are equal, or if both arguments are {@code null}. |
| * |
| * @see String#contentEquals(CharSequence) |
| */ |
| public static boolean equals(final CharSequence s1, final CharSequence s2) { |
| if (s1 == s2) { |
| return true; |
| } |
| if (s1 != null && s2 != null) { |
| if (s1 instanceof String) return ((String) s1).contentEquals(s2); |
| if (s2 instanceof String) return ((String) s2).contentEquals(s1); |
| final int length = s1.length(); |
| if (s2.length() == length) { |
| for (int i=0; i<length; i++) { |
| if (s1.charAt(i) != s2.charAt(i)) { |
| return false; |
| } |
| } |
| return true; |
| } |
| } |
| return false; |
| } |
| |
| /** |
| * Returns {@code true} if the given text at the given offset contains the given part, |
| * in a case-sensitive comparison. This method is equivalent to the following code, |
| * except that this method works on arbitrary {@link CharSequence} objects instead than |
| * {@link String}s only: |
| * |
| * {@preformat java |
| * return text.regionMatches(offset, part, 0, part.length()); |
| * } |
| * |
| * This method does not thrown {@code IndexOutOfBoundsException}. Instead if |
| * {@code fromIndex < 0} or {@code fromIndex + part.length() > text.length()}, |
| * then this method returns {@code false}. |
| * |
| * @param text the character sequence for which to tests for the presence of {@code part}. |
| * @param fromIndex the offset in {@code text} where to test for the presence of {@code part}. |
| * @param part the part which may be present in {@code text}. |
| * @return {@code true} if {@code text} contains {@code part} at the given {@code offset}. |
| * @throws NullPointerException if any of the arguments is null. |
| * |
| * @see String#regionMatches(int, String, int, int) |
| */ |
| public static boolean regionMatches(final CharSequence text, final int fromIndex, final CharSequence part) { |
| if (text instanceof String && part instanceof String) { |
| // It is okay to delegate to String implementation since we do not ignore cases. |
| return ((String) text).startsWith((String) part, fromIndex); |
| } |
| final int length; |
| if (fromIndex < 0 || fromIndex + (length = part.length()) > text.length()) { |
| return false; |
| } |
| for (int i=0; i<length; i++) { |
| // No need to use the code point API here, since we are looking for exact matches. |
| if (text.charAt(fromIndex + i) != part.charAt(i)) { |
| return false; |
| } |
| } |
| return true; |
| } |
| |
| /** |
| * Returns {@code true} if the given text at the given offset contains the given part, |
| * optionally in a case-insensitive way. This method is equivalent to the following code, |
| * except that this method works on arbitrary {@link CharSequence} objects instead than |
| * {@link String}s only: |
| * |
| * {@preformat java |
| * return text.regionMatches(ignoreCase, offset, part, 0, part.length()); |
| * } |
| * |
| * This method does not thrown {@code IndexOutOfBoundsException}. Instead if |
| * {@code fromIndex < 0} or {@code fromIndex + part.length() > text.length()}, |
| * then this method returns {@code false}. |
| * |
| * @param text the character sequence for which to tests for the presence of {@code part}. |
| * @param fromIndex the offset in {@code text} where to test for the presence of {@code part}. |
| * @param part the part which may be present in {@code text}. |
| * @param ignoreCase {@code true} if the case should be ignored. |
| * @return {@code true} if {@code text} contains {@code part} at the given {@code offset}. |
| * @throws NullPointerException if any of the arguments is null. |
| * |
| * @see String#regionMatches(boolean, int, String, int, int) |
| * |
| * @since 0.4 |
| */ |
| public static boolean regionMatches(final CharSequence text, int fromIndex, final CharSequence part, final boolean ignoreCase) { |
| if (!ignoreCase) { |
| return regionMatches(text, fromIndex, part); |
| } |
| // Do not check for String cases. We do not want to delegate to String.regionMatches |
| // because we compare code points while String.regionMatches(…) compares characters. |
| final int limit = text.length(); |
| final int length = part.length(); |
| if (fromIndex < 0) { // Not checked before because we want NullPointerException if an argument is null. |
| return false; |
| } |
| for (int i=0; i<length;) { |
| if (fromIndex >= limit) { |
| return false; |
| } |
| final int c1 = codePointAt(part, i); |
| final int c2 = codePointAt(text, fromIndex); |
| if (c1 != c2 && !equalsIgnoreCase(c1, c2)) { |
| return false; |
| } |
| fromIndex += charCount(c2); |
| i += charCount(c1); |
| } |
| return true; |
| } |
| |
| /** |
| * Returns {@code true} if the given character sequence starts with the given prefix. |
| * |
| * @param text the characters sequence to test. |
| * @param prefix the expected prefix. |
| * @param ignoreCase {@code true} if the case should be ignored. |
| * @return {@code true} if the given sequence starts with the given prefix. |
| * @throws NullPointerException if any of the arguments is null. |
| */ |
| public static boolean startsWith(final CharSequence text, final CharSequence prefix, final boolean ignoreCase) { |
| return regionMatches(text, 0, prefix, ignoreCase); |
| } |
| |
| /** |
| * Returns {@code true} if the given character sequence ends with the given suffix. |
| * |
| * @param text the characters sequence to test. |
| * @param suffix the expected suffix. |
| * @param ignoreCase {@code true} if the case should be ignored. |
| * @return {@code true} if the given sequence ends with the given suffix. |
| * @throws NullPointerException if any of the arguments is null. |
| */ |
| public static boolean endsWith(final CharSequence text, final CharSequence suffix, final boolean ignoreCase) { |
| int is = text.length(); |
| int ip = suffix.length(); |
| while (ip > 0) { |
| if (is <= 0) { |
| return false; |
| } |
| final int cs = codePointBefore(text, is); |
| final int cp = codePointBefore(suffix, ip); |
| if (cs != cp && (!ignoreCase || !equalsIgnoreCase(cs, cp))) { |
| return false; |
| } |
| is -= charCount(cs); |
| ip -= charCount(cp); |
| } |
| return true; |
| } |
| |
| /** |
| * Returns the longest sequence of characters which is found at the beginning of the two |
| * given texts. If one of those texts is {@code null}, then the other text is returned. |
| * |
| * @param s1 the first text, or {@code null}. |
| * @param s2 the second text, or {@code null}. |
| * @return the common prefix of both texts, or {@code null} if both texts are null. |
| */ |
| public static CharSequence commonPrefix(final CharSequence s1, final CharSequence s2) { |
| if (s1 == null) return s2; |
| if (s2 == null) return s1; |
| final CharSequence shortest; |
| final int lg1 = s1.length(); |
| final int lg2 = s2.length(); |
| final int length; |
| if (lg1 <= lg2) { |
| shortest = s1; |
| length = lg1; |
| } else { |
| shortest = s2; |
| length = lg2; |
| } |
| int i = 0; |
| while (i < length) { |
| // No need to use the codePointAt API here, since we are looking for exact matches. |
| if (s1.charAt(i) != s2.charAt(i)) { |
| break; |
| } |
| i++; |
| } |
| return shortest.subSequence(0, i); |
| } |
| |
| /** |
| * Returns the longest sequence of characters which is found at the end of the two given texts. |
| * If one of those texts is {@code null}, then the other text is returned. |
| * |
| * @param s1 the first text, or {@code null}. |
| * @param s2 the second text, or {@code null}. |
| * @return the common suffix of both texts, or {@code null} if both texts are null. |
| */ |
| public static CharSequence commonSuffix(final CharSequence s1, final CharSequence s2) { |
| if (s1 == null) return s2; |
| if (s2 == null) return s1; |
| final CharSequence shortest; |
| final int lg1 = s1.length(); |
| final int lg2 = s2.length(); |
| final int length; |
| if (lg1 <= lg2) { |
| shortest = s1; |
| length = lg1; |
| } else { |
| shortest = s2; |
| length = lg2; |
| } |
| int i = 0; |
| while (++i <= length) { |
| // No need to use the codePointAt API here, since we are looking for exact matches. |
| if (s1.charAt(lg1 - i) != s2.charAt(lg2 - i)) { |
| break; |
| } |
| } |
| i--; |
| return shortest.subSequence(length - i, shortest.length()); |
| } |
| |
| /** |
| * Returns the token starting at the given offset in the given text. For the purpose of this |
| * method, a "token" is any sequence of consecutive characters of the same type, as defined |
| * below. |
| * |
| * <p>Let define <var>c</var> as the first non-blank character located at an index equals or |
| * greater than the given offset. Then the characters that are considered of the same type |
| * are:</p> |
| * |
| * <ul> |
| * <li>If <var>c</var> is a |
| * {@linkplain Character#isUnicodeIdentifierStart(int) Unicode identifier start}, |
| * then any following characters that are |
| * {@linkplain Character#isUnicodeIdentifierPart(int) Unicode identifier part}.</li> |
| * <li>Otherwise any character for which {@link Character#getType(int)} returns |
| * the same value than for <var>c</var>.</li> |
| * </ul> |
| * |
| * @param text the text for which to get the token. |
| * @param fromIndex index of the fist character to consider in the given text. |
| * @return a sub-sequence of {@code text} starting at the given offset, or an empty string |
| * if there is no non-blank character at or after the given offset. |
| * @throws NullPointerException if the {@code text} argument is null. |
| */ |
| public static CharSequence token(final CharSequence text, int fromIndex) { |
| final int length = text.length(); |
| int upper = fromIndex; |
| /* |
| * Skip whitespaces. At the end of this loop, |
| * 'c' will be the first non-blank character. |
| */ |
| int c; |
| do { |
| if (upper >= length) return ""; |
| c = codePointAt(text, upper); |
| fromIndex = upper; |
| upper += charCount(c); |
| } |
| while (isWhitespace(c)); |
| /* |
| * Advance over all characters "of the same type". |
| */ |
| if (isUnicodeIdentifierStart(c)) { |
| while (upper<length && isUnicodeIdentifierPart(c = codePointAt(text, upper))) { |
| upper += charCount(c); |
| } |
| } else { |
| final int type = getType(codePointAt(text, fromIndex)); |
| while (upper<length && getType(c = codePointAt(text, upper)) == type) { |
| upper += charCount(c); |
| } |
| } |
| return text.subSequence(fromIndex, upper); |
| } |
| |
| /** |
| * Replaces all occurrences of a given string in the given character sequence. If no occurrence of |
| * {@code toSearch} is found in the given text or if {@code toSearch} is equal to {@code replaceBy}, |
| * then this method returns the {@code text} unchanged. |
| * Otherwise this method returns a new character sequence with all occurrences replaced by {@code replaceBy}. |
| * |
| * <p>This method is similar to {@link String#replace(CharSequence, CharSequence)} except that is accepts |
| * arbitrary {@code CharSequence} objects. As of Java 10, another difference is that this method does not |
| * create a new {@code String} if {@code toSearch} is equals to {@code replaceBy}.</p> |
| * |
| * @param text the character sequence in which to perform the replacements, or {@code null}. |
| * @param toSearch the string to replace. |
| * @param replaceBy the replacement for the searched string. |
| * @return the given text with replacements applied, or {@code text} if no replacement has been applied, |
| * or {@code null} if the given text was null |
| * |
| * @see String#replace(char, char) |
| * @see StringBuilders#replace(StringBuilder, String, String) |
| * @see String#replace(CharSequence, CharSequence) |
| * |
| * @since 0.4 |
| */ |
| public static CharSequence replace(final CharSequence text, final CharSequence toSearch, final CharSequence replaceBy) { |
| ArgumentChecks.ensureNonEmpty("toSearch", toSearch); |
| ArgumentChecks.ensureNonNull ("replaceBy", replaceBy); |
| if (text != null && !toSearch.equals(replaceBy)) { |
| if (text instanceof String) { |
| return ((String) text).replace(toSearch, replaceBy); |
| } |
| final int length = text.length(); |
| int i = indexOf(text, toSearch, 0, length); |
| if (i >= 0) { |
| int p = 0; |
| final int sl = toSearch.length(); |
| final StringBuilder buffer = new StringBuilder(length + (replaceBy.length() - sl)); |
| do { |
| buffer.append(text, p, i).append(replaceBy); |
| i = indexOf(text, toSearch, p = i + sl, length); |
| } while (i >= 0); |
| return buffer.append(text, p, length); |
| } |
| } |
| return text; |
| } |
| |
| /** |
| * Copies a sequence of characters in the given {@code char[]} array. |
| * |
| * @param src the characters sequence from which to copy characters. |
| * @param srcOffset index of the first character from {@code src} to copy. |
| * @param dst the array where to copy the characters. |
| * @param dstOffset index where to write the first character in {@code dst}. |
| * @param length number of characters to copy. |
| * |
| * @see String#getChars(int, int, char[], int) |
| * @see StringBuilder#getChars(int, int, char[], int) |
| * @see StringBuffer#getChars(int, int, char[], int) |
| * @see CharBuffer#get(char[], int, int) |
| * @see javax.swing.text.Segment#array |
| */ |
| public static void copyChars(final CharSequence src, int srcOffset, |
| final char[] dst, int dstOffset, int length) |
| { |
| ArgumentChecks.ensurePositive("length", length); |
| if (src instanceof String) { |
| ((String) src).getChars(srcOffset, srcOffset + length, dst, dstOffset); |
| } else if (src instanceof StringBuilder) { |
| ((StringBuilder) src).getChars(srcOffset, srcOffset + length, dst, dstOffset); |
| } else if (src instanceof StringBuffer) { |
| ((StringBuffer) src).getChars(srcOffset, srcOffset + length, dst, dstOffset); |
| } else if (src instanceof CharBuffer) { |
| ((CharBuffer) src).subSequence(srcOffset, srcOffset + length).get(dst, dstOffset, length); |
| } else { |
| // An other candidate could be javax.swing.text.Segment, but it |
| // is probably not worth to introduce a Swing dependency for it. |
| while (length != 0) { |
| dst[dstOffset++] = src.charAt(srcOffset++); |
| length--; |
| } |
| } |
| } |
| } |