endorsed/src/org.apache.sis.util/main/org/apache/sis/io/LineAppender.java - sis - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.sis.io;

 import java.util.Objects;
 import java.io.Flushable;
 import java.io.IOException;
 import org.apache.sis.util.Characters;
 import org.apache.sis.util.CharSequences;
 import org.apache.sis.util.ArgumentChecks;
 import org.apache.sis.util.privy.X364;


 /**
  * An {@link Appendable} which can apply different kinds of reformatting that depend on the
  * <i>End Of Line</i> (EOL) occurrences. Available reformatting includes inserting
  * a margin before each line, wrapping to a maximal line length and replacing tabulations or
  * EOL characters. The actual work to be done can be enabled by invoking one or many of the
  * following methods:
  *
  * <ul>
  *   <li>{@link #setMaximalLineLength(int)} for wrapping the lines to some maximal line length,
  *       typically 80 Unicode characters (code points).</li>
  *   <li>{@link #setTabulationExpanded(boolean)} for replacing tabulation characters by spaces.</li>
  *   <li>{@link #setLineSeparator(String)} for replacing all occurrences of
  *       {@linkplain Characters#isLineOrParagraphSeparator(int) line separators} by the given string.</li>
  * </ul>
  *
  * In addition this class removes trailing {@linkplain Character#isWhitespace(int) whitespaces}
  * before end of lines.
  *
  * <h2>How line lengths are calculated</h2>
  * Line lengths are measured in units of Unicode <i>code points</i>. This is usually the same
  * as the number of {@code char} primitive values, but not always. Combining characters are not
  * yet recognized by this class, but future versions may improve on that.
  *
  * <p>For proper line length calculation in presence of tabulation characters ({@code '\t'}),
  * this class needs to know the tabulation width. The default value is 8, but this can be changed
  * by a call to {@link #setTabulationWidth(int)}. Note that invoking that method affects only line
  * length calculation; it does not replace tabulations by spaces. For tabulation expansion, see
  * {@link #setTabulationExpanded(boolean)}.</p>
  *
  * @author  Martin Desruisseaux (Geomatys)
  * @version 1.5
  * @since   0.3
  */
 public class LineAppender extends Appender implements Flushable {
     /**
      * The line separator, or {@code null} if not yet determined. If {@code null}, then the
      * {@link #append(CharSequence, int, int)} method will try to infer it from the submitted text.
      *
      * <p>If {@link #isEndOfLineReplaced} is {@code false} (the default), then this line separator
      * will be used only when this class inserts new line separators as a consequence of line wraps.
      * Line separators found in the texts given by the user will be passed "as is".
      * If {@code true}, then all line separators are replaced.</p>
      */
     private String lineSeparator;

     /**
      * The maximal line length, in units of <em>code points</em> (not {@code char}).
      * Can be set to {@link Integer#MAX_VALUE} if there is no limit.
      *
      * @see #setMaximalLineLength(int)
      */
     private int maximalLineLength;

     /**
      * The length of the current line, in units of <em>code points</em> (not {@code char}).
      * It may be greater than the length of {@link #buffer} because the latter contains only
      * the last word.
      *
      * @see #getCurrentLineLength()
      */
     private int codePointCount;

     /**
      * The tabulation width, in number of code points.
      *
      * @see #setTabulationWidth(int)
      */
     private short tabulationWidth = 8;

     /**
      * {@code true} if this formatter shall expand tabulations into spaces.
      *
      * @see #setTabulationExpanded(boolean)
      */
     private boolean isTabulationExpanded;

     /**
      * {@code true} if all occurrences of EOL sequences shall be replaced by
      * the {@link #lineSeparator}, or {@code false} for keeping EOL unchanged.
      */
     private boolean isEndOfLineReplaced;

     /**
      * {@code true} if the next character needs to be skipped if equals to {@code '\n'}.
      * This field is used in order to avoid writing two EOL in place of {@code "\r\n"}.
      */
     private boolean skipLF;

     /**
      * {@code true} if the next character will be at the beginning of a new line.
      * This flag is set to {@code true} only for "real" new lines, as a result of
      * line separator found in the input given to this formatter. The "generated"
      * new lines (resulting from line wrap) will invoke {@link #onLineBegin(boolean)}
      * directly without the help of this temporary variable.
      *
      * @see #transfer(int)
      */
     private boolean isNewLine = true;

     /**
      * {@code true} if an escape sequence is in progress. The escape sequence will stop
      * after the first non-digit character other than {@link X364#BRACKET}.
      */
     private boolean isEscapeSequence;

     /**
      * The buffer for the last word being written.
      * This buffer will also contain trailing whitespace characters. If whitespaces are followed
      * by at least one non-white character, then the whitespaces are written to the underlying
      * stream before the non-ignorable one. Otherwise if whitespaces are followed by a line
      * separator, then they are discarded.
      */
     private final StringBuilder buffer = new StringBuilder();

     /**
      * The number of Java characters (not Unicode code points) in {@link #buffer},
      * ignoring trailing whitespaces.
      */
     private int printableLength;

     /**
      * Creates a formatter with no line length limit. Tabulation expansion and line separator
      * replacement are left disabled. To get useful work done, callers should configure the
      * new instance with at least one of the following methods:
      *
      * <ul>
      *   <li>{@link #setMaximalLineLength(int)}</li>
      *   <li>{@link #setTabulationExpanded(boolean)}</li>
      *   <li>{@link #setLineSeparator(String)}</li>
      * </ul>
      *
      * @param out  the underlying stream or buffer to write to.
      */
     public LineAppender(final Appendable out) {
         super(out);
         this.maximalLineLength = Integer.MAX_VALUE;         // No line wrapping by default.
     }

     /**
      * Creates a formatter which replaces the line separators by the given string.
      * No limit is set on the line length.
      *
      * @param out                   the underlying stream or buffer to write to.
      * @param lineSeparator         the line separator to send to {@code out}, or {@code null}
      *                              for forwarding the EOL sequences unchanged.
      * @param isTabulationExpanded  {@code true} for expanding tabulations into spaces,
      *                              or {@code false} for sending {@code '\t'} characters as-is.
      */
     public LineAppender(final Appendable out, final String lineSeparator, final boolean isTabulationExpanded) {
         super(out);
         this.isTabulationExpanded = isTabulationExpanded;
         this.isEndOfLineReplaced  = (lineSeparator != null);    // Replace EOL only if a separator was given.
         this.lineSeparator        = lineSeparator;
         this.maximalLineLength    = Integer.MAX_VALUE;          // No line wrapping.
     }

     /**
      * Creates a formatter which wraps the lines at the given maximal length.
      * Line separators found in the appended texts are forwarded unchanged.
      *
      * @param out                   the underlying stream or buffer to write to.
      * @param maximalLineLength     the maximal number of Unicode characters per line,
      *                              or {@link Integer#MAX_VALUE} if there is no limit.
      * @param isTabulationExpanded  {@code true} for expanding tabulations into spaces,
      *                              or {@code false} for forwarding {@code '\t'} characters as-is.
      */
     public LineAppender(final Appendable out, final int maximalLineLength, final boolean isTabulationExpanded) {
         super(out);
         // Validate before storing anything: a zero or negative length is rejected.
         ArgumentChecks.ensureStrictlyPositive("maximalLineLength", maximalLineLength);
         this.isTabulationExpanded = isTabulationExpanded;
         this.maximalLineLength    = maximalLineLength;
     }

     /**
      * Returns the maximal number of Unicode characters (code points) allowed on a line.
      * If this property has never been set, there is no limit.
      *
      * @return the current maximal number of Unicode characters per line,
      *         or {@link Integer#MAX_VALUE} if there is no limit.
      */
     public int getMaximalLineLength() {
         return this.maximalLineLength;
     }

     /**
      * Sets the maximal number of Unicode characters (code points) allowed on a line.
      * The given value is validated as strictly positive before being stored.
      *
      * @param  length  the new maximal number of Unicode characters per line,
      *                 or {@link Integer#MAX_VALUE} if there is no limit.
      */
     public void setMaximalLineLength(final int length) {
         ArgumentChecks.ensureStrictlyPositive("length", length);
         this.maximalLineLength = length;
     }

     /**
      * Returns the length of the current line, in units of Unicode code points.
      *
      * @return the length of the current line, in units of Unicode code points.
      *
      * @since 1.5
      */
     public int getCurrentLineLength() {
         return this.codePointCount;
     }

     /**
      * Sets the length of the current line. This method usually does not need to be invoked,
      * because this property is adjusted automatically when texts are appended through this
      * {@code LineAppender}. An explicit value may nevertheless be useful when the output
      * given to the constructor was not initially empty, or when the output content is
      * modified outside this {@code LineAppender} instance.
      *
      * @param  length  the new length of the current line, in units of Unicode code points.
      *
      * @since 1.5
      */
     public void setCurrentLineLength(final int length) {
         ArgumentChecks.ensurePositive("length", length);
         this.codePointCount = length;
     }

     /**
      * Returns the tabulation width used for line length calculation,
      * in units of Unicode characters (code point count). The default value is 8.
      *
      * @return the current tabulation width in number of Unicode characters.
      */
     public int getTabulationWidth() {
         return this.tabulationWidth;
     }

     /**
      * Sets the tabulation width, in unit of Unicode characters (code point count).
      * The width is stored as a {@code short}, so the accepted range is 1 to
      * {@link Short#MAX_VALUE} inclusive.
      *
      * @param  width  the new tabulation width. Must be greater than 0
      *                and not greater than {@link Short#MAX_VALUE}.
      * @throws IllegalArgumentException if {@code width} is not greater than 0
      *         or is unreasonably high.
      */
     public void setTabulationWidth(final int width) {
         ArgumentChecks.ensureStrictlyPositive("width", width);
         /*
          * The upper bound must be Short.MAX_VALUE, not Integer.MAX_VALUE: the cast below
          * silently truncates larger values. For example 65536 would become 0 and cause a
          * division by zero when the next tabulation is written.
          */
         ArgumentChecks.ensureBetween("width", 1, Short.MAX_VALUE, width);
         tabulationWidth = (short) width;
     }

     /**
      * Tells whether this formatter replaces tabulations by spaces.
      * The default value is {@code false}, in which case {@code '\t'} characters
      * are sent to the underlying appendable <i>as-is</i>.
      *
      * @return {@code true} if this formatter expands tabulations into spaces,
      *         or {@code false} if {@code '\t'} characters are forwarded <i>as-is</i>.
      */
     public boolean isTabulationExpanded() {
         return this.isTabulationExpanded;
     }

     /**
      * Sets whether this formatter replaces tabulations by spaces.
      *
      * @param expanded {@code true} if this class shall expand tabulations into spaces,
      *                 or {@code false} for forwarding {@code '\t'} characters as-is.
      */
     public void setTabulationExpanded(final boolean expanded) {
         this.isTabulationExpanded = expanded;
     }

     /**
      * Returns the line separator sent to the underlying appendable in replacement of the
      * EOL sequences found in the appended texts. If EOL replacement is disabled, this
      * method returns {@code null} even if a line separator is known internally.
      *
      * @return the current line separator, or {@code null} if EOL are forwarded <i>as-is</i>.
      */
     public String getLineSeparator() {
         if (!isEndOfLineReplaced) {
             return null;
         }
         return lineSeparator;
     }

     /**
      * Changes the line separator to be sent to the underlying appendable.
      * The given string is inserted in place of every occurrence of {@code "\r"}, {@code "\n"},
      * {@code "\r\n"} or other {@linkplain Characters#isLineOrParagraphSeparator(int) line separators}.
      * A {@code null} value (the default) disables the replacement: the line separators
      * given to the {@code append} methods are then forwarded unchanged.
      *
      * @param  lineSeparator  the new line separator, or {@code null} for forwarding EOL <i>as-is</i>.
      *
      * @see Characters#isLineOrParagraphSeparator(int)
      */
     public void setLineSeparator(final String lineSeparator) {
         final boolean replace = (lineSeparator != null);
         this.isEndOfLineReplaced = replace;
         this.lineSeparator       = lineSeparator;
     }

     /**
      * Writes a line separator to {@link #out}. This method is invoked for the line separators
      * generated by this class (line wraps). It is invoked for the line separators found in
      * the texts supplied by the user only if {@link #isEndOfLineReplaced} is {@code true}.
      *
      * <p>The {@link #append(CharSequence,int,int)} method tries to detect the line separator
      * used in the text. If none has been found so far, the system line separator is used as
      * a fallback and remembered for next invocations.</p>
      *
      * @throws IOException if an error occurred while writing to {@link #out}.
      */
     private void writeLineSeparator() throws IOException {
         String eol = lineSeparator;
         if (eol == null) {
             eol = System.lineSeparator();
             lineSeparator = eol;                // Remember the fallback for consistency.
         }
         out.append(eol);
     }

     /**
      * Terminates the current line: pending non-white characters are written, trailing
      * whitespaces are discarded and the column position is reset to zero.
      * The line separator itself is <strong>not</strong> written by this method, and the
      * {@link #skipLF} flag is left untouched. Those tasks are caller's responsibility.
      *
      * @throws IOException if an error occurred while writing the pending characters.
      */
     private void endOfLine() throws IOException {
         // Drop trailing whitespaces first; it also reduces the work of StringBuilder.deleteCharAt(int).
         buffer.setLength(printableLength);
         // Remove unused soft hyphens, except one that may be the last character of the line.
         deleteSoftHyphen(printableLength - 1);
         transfer(printableLength);
         // Reset the state for the next line.
         isNewLine        = true;
         isEscapeSequence = false;               // A line break is handled as "end of escape sequence".
         codePointCount   = 0;
         printableLength  = 0;
     }

     /**
      * Removes the soft hyphen characters from the {@linkplain #buffer}, decrementing
      * {@link #printableLength} for each removed character. This is invoked when the
      * buffer is about to be written without being split on two lines.
      *
      * @param i index after the last character to check. This is either {@link #printableLength}
      *          for checking all characters, or {@code printableLength-1} for preserving the last
      *          soft hyphen on the line (while removing all others).
      */
     private void deleteSoftHyphen(final int i) {
         // Iterate backward so that deletions do not shift the indices still to be visited.
         for (int index = i - 1; index >= 0; index--) {
             if (buffer.charAt(index) == Characters.SOFT_HYPHEN) {
                 buffer.deleteCharAt(index);
                 printableLength--;
             }
         }
     }

     /**
      * Sends the given number of leading characters of the {@linkplain #buffer} to
      * {@link #out}, then removes those characters from the buffer. If a new line is
      * beginning, {@link #onLineBegin(boolean)} is notified before the characters are written.
      * This method does not adjust {@link #printableLength}; it is caller responsibility to do so.
      *
      * @param  length  number of {@code char} values to move from the buffer to the output.
      * @throws IOException if an error occurred while writing to {@link #out}.
      */
     private void transfer(final int length) throws IOException {
         final boolean atLineStart = isNewLine;
         isNewLine = false;                      // Cleared before the notification, not after.
         if (atLineStart) {
             onLineBegin(false);
         }
         out.append(buffer, 0, length);
         buffer.delete(0, length);
     }

     /**
      * Writes the specified code point, applying the reformatting configured in this appender.
      * The code point is handled in one of the following ways:
      *
      * <ul>
      *   <li>Line or paragraph separators terminate the current line: trailing whitespaces are
      *       discarded, pending characters are flushed and the separator is either forwarded
      *       as-is or replaced, depending on {@link #isEndOfLineReplaced}.</li>
      *   <li>Whitespaces cause the pending word to be sent to {@link #out}. The whitespace itself
      *       is kept in the {@linkplain #buffer} (tabulations being optionally expanded) until
      *       we know whether it is followed by a printable character or by an end of line.</li>
      *   <li>Other characters are appended to the buffer. If the line then <em>exceeds</em> the
      *       maximal line length, a split point is searched backward in the buffer and a line
      *       separator is inserted there.</li>
      * </ul>
      *
      * @param  c  the Unicode code point to write.
      * @throws IOException if an I/O error occurs.
      */
     private void write(final int c) throws IOException {
         /*
          * If the character to write is a EOL sequence, then:
          *
          *   1) Trim trailing whitespaces in the buffer.
          *   2) Remove unused soft-hyphens (otherwise some consoles display them).
          *   3) Flush the buffer to the underlying appendable.
          *   4) Write the line separator.
          */
         if (Characters.isLineOrParagraphSeparator(c)) {
             final boolean skip;
             switch (c) {
                 case '\r': skip = false;  skipLF = true;  break;
                 case '\n': skip = skipLF; skipLF = false; break;    // Skip the LF of a "\r\n" pair.
                 default:   skip = false;  skipLF = false; break;
             }
             if (!skip) {
                 endOfLine();
             }
             if (!isEndOfLineReplaced) {
                 appendCodePoint(c);         // Forward EOL sequences "as-is".
             } else if (!skip) {
                 writeLineSeparator();       // Replace EOL sequences by the unique line separator.
             }
             return;
         }
         skipLF = false;
         /*
          * If the character to write is a whitespace, then write any pending characters from
          * the buffer to the underlying appendable since we know that those characters didn't
          * exceed the line length limit.
          *
          * We use `Character.isWhitespace(…)` instead of `Character.isSpaceChar(…)` because
          * the former returns `true` for tabulations (which we want), and returns `false`
          * for non-breaking spaces (which we also want).
          */
         if (Character.isWhitespace(c)) {
             if (printableLength != 0) {
                 deleteSoftHyphen(printableLength);
                 transfer(printableLength);
                 printableLength = 0;
             }
             if (c != '\t') {
                 codePointCount++;
             } else {
                 // Number of columns up to the next tabulation stop.
                 final int width = tabulationWidth - (codePointCount % tabulationWidth);
                 codePointCount += width;
                 if (isTabulationExpanded) {
                     buffer.append(CharSequences.spaces(width));
                     return;
                 }
             }
             buffer.appendCodePoint(c);
             return;
         }
         buffer.appendCodePoint(c);
         printableLength = buffer.length();
         /*
          * Special handling of ANSI X3.64 escape sequences. Since they are not visible
          * characters (they are used for controlling the colors), do not count them in
          * `codePointCount` (but still count them as "printable" characters, since we
          * don't want to trim them). The sequence pattern is "CSI <digits> <command>"
          * where <command> is a single letter.
          */
         if (c == X364.ESCAPE) {
             isEscapeSequence = true;
             return;
         } else if (isEscapeSequence) {
             final char previous = buffer.charAt(printableLength - 2);
             if (previous != X364.ESCAPE) {
                 isEscapeSequence = (c >= '0' && c <= '9');
                 return;         // The letter after the digits will be the last character to skip.
             } else if (c == X364.BRACKET) {
                 return;         // Found the second part of the Control Sequence Introducer (CSI).
             }
             // [ESC] was not followed by '['. Proceed as a normal character.
             isEscapeSequence = false;
         }
         /*
          * The remaining of this method is executed only if we exceeded the maximal line length.
          * A line containing exactly `maximalLineLength` code points is still valid, hence the
          * inclusive comparison below. When the limit is exceeded, first search for a dash
          * character (hyphen) for splitting the line after it. If we do not find a dash character,
          * as a fallback split on any non-letter or digit characters except the punctuation starts.
          */
         if (++codePointCount <= maximalLineLength) {
             return;
         }
         int splitAt = buffer.length();          // Where to separate the line as two lines.
         int fallback = splitAt;                 // Fallback to use if we could not find a value for `splitAt`.
         boolean hasFallback = false;            // Whether the `fallback` value has been defined.
 split:  for (;;) {
             if (splitAt <= 0) {
                 splitAt = fallback;
                 break;
             }
             int b = buffer.codePointBefore(splitAt);
             int n = Character.charCount(b);
             switch (Character.getType(b)) {
                 case Character.UPPERCASE_LETTER:
                 case Character.LOWERCASE_LETTER:
                 case Character.TITLECASE_LETTER:
                 case Character.MODIFIER_LETTER:
                 case Character.OTHER_LETTER:
                 case Character.DECIMAL_DIGIT_NUMBER:
                 case Character.INITIAL_QUOTE_PUNCTUATION:
                 case Character.START_PUNCTUATION: break;            // Do nothing (search another character).
                 case Character.PARAGRAPH_SEPARATOR:
                 case Character.SPACE_SEPARATOR:
                 case Character.LINE_SEPARATOR:
                 case Character.CONTROL: {
                     /*
                      * Split the line before a space (except no-break space) and discard trailing spaces.
                      * The `isWhitespace(b)` check is necessary for excluding the no-break spaces.
                      */
                     final int end = splitAt;
                     while (Character.isWhitespace(b)) {
                         if ((splitAt -= n) <= 0) break;
                         b = buffer.codePointBefore(splitAt);
                         n = Character.charCount(b);
                     }
                     if (splitAt == end) break;                      // No-break space. Search another character.
                     buffer.delete(splitAt, end);
                     break split;                                    // Split here (before the space character).
                 }
                 /*
                  * Split the line after a dash character.
                  * The "letter before" condition is a way to avoid splitting at the minus sign
                  * of negative numbers, assuming that the minus sign is preceded by a space.
                  * We cannot look at the character after because it may not be in the buffer yet.
                  */
                 case Character.DASH_PUNCTUATION: {
                     if (b == '-') {
                         b = splitAt - n;            // From now on, `b` is an index instead of a code point.
                         if (b > 0 && !Character.isLetter(buffer.codePointBefore(b))) {
                             break;      // Continue the search in previous characters.
                         }
                     }
                     break split;        // Split here (after the dash character).
                 }
                 /*
                  * Soft hyphen are not in the dash category, so they need to be checked here.
                  * Replace soft-hyphen by ordinary (visible) hyphen since the hyphen is used.
                  */
                 case Character.FORMAT: {
                     if (b == Characters.SOFT_HYPHEN) {
                         buffer.setCharAt(splitAt - n, Characters.HYPHEN);
                         break split;    // Split here (after the dash character).
                     }
                     break;              // Do nothing (search another character).
                 }
                 /*
                  * All other categories (e.g. punctuations) may be used as a split point
                  * if no better location is found.
                  */
                 default: {
                     if (!hasFallback && b != '<') {
                         hasFallback = true;
                         fallback = splitAt;
                     }
                     break;
                 }
             }
             splitAt -= n;
         }
         transfer(splitAt);
         writeLineSeparator();
         printableLength = buffer.length();          // Remaining characters will be on next line.
         codePointCount  = buffer.codePointCount(0, printableLength);
         onLineBegin(true);
     }

     /**
      * Writes a single character. The character is first converted to a code point by
      * {@code toCodePoint(char)}; nothing is written when that conversion gives a negative
      * value (presumably because a surrogate pair is not yet complete).
      *
      * @param  c  the character to append.
      * @return a reference to this {@code Appendable}.
      * @throws IOException if an I/O error occurs.
      */
     @Override
     public Appendable append(final char c) throws IOException {
         final int codePoint = toCodePoint(c);
         if (codePoint < 0) {
             return this;                        // No code point available yet.
         }
         write(codePoint);
         return this;
     }

     /**
      * Writes a portion of a character sequence. If no line separator is known yet,
      * this method first tries to infer it from the given text.
      *
      * @param  sequence  the character sequence to be written.
      * @param  start     index from which to start reading characters.
      * @param  end       index of the character following the last character to read.
      * @return a reference to this {@code Appendable}.
      * @throws IOException if an I/O error occurs.
      */
     @Override
     public Appendable append(final CharSequence sequence, int start, final int end) throws IOException {
         Objects.checkFromToIndex(start, end, sequence.length());
         if (lineSeparator == null) {
             /*
              * Prefer the line separator used in the submitted document. If the text
              * contains none, the field stays null: the default value will be assigned
              * later, only if a line separator really needs to be written.
              */
             lineSeparator = lineSeparator(sequence, start, end);
         }
         // `appendSurrogate` gives the index where the reading shall begin.
         for (int i = appendSurrogate(sequence, start, end); i < end; i++) {
             final int codePoint = toCodePoint(sequence.charAt(i));
             if (codePoint >= 0) {
                 write(codePoint);
             }
         }
         return this;
     }

     /**
      * Resets the internal state of this {@code LineAppender} as if a new line was beginning.
      * Pending non-white characters are sent to the {@linkplain #out underlying appendable},
      * trailing whitespaces not yet sent are discarded, and the column position (used for
      * tabulation expansion calculation) is reset to 0. No line separator is written.
      *
      * @throws IOException if an error occurred while sending the trailing non-white
      *         characters to the underlying stream.
      */
     public void clear() throws IOException {
         endOfLine();
         this.skipLF = false;                    // A following '\n' shall not be taken as part of "\r\n".
     }

     /**
      * Sends all pending characters to the underlying appendable, trailing whitespaces included.
      * This method should preferably be invoked at the end of a word, sentence or line:
      * if the current position is in the middle of a word, flushing may prevent
      * {@code LineAppender} from wrapping the current line properly.
      *
      * <p>Invoking this method also flushes the underlying stream, if {@linkplain Flushable flushable}.
      * A cheaper way to send pending characters is to make sure that the last character is a
      * {@linkplain Characters#isLineOrParagraphSeparator(int) line or paragraph terminator},
      * or to invoke {@link #clear()}.</p>
      *
      * @throws IOException if an I/O error occurs.
      */
     @Override
     public void flush() throws IOException {
         // Write the whole buffer content, then empty the buffer.
         out.append(buffer);
         buffer.setLength(0);
         this.printableLength = 0;
         IO.flush(out);
     }

     /**
      * Callback invoked when a new line is beginning. The default implementation does nothing.
      * Subclasses can override this method, for example in order to insert a margin on the
      * left side before each line.
      *
      * <p>Implementations that write characters shall write them directly to {@link #out},
      * <strong>not</strong> through the {@code append} methods of this class.</p>
      *
      * @param  isContinuation {@code true} if the new line is the continuation of the previous
      *         line after a "line wrap", or {@code false} if a line or paragraph separator has
      *         been explicitly sent to this formatter.
      * @throws IOException if an error occurred while writing to {@link #out}.
      */
     protected void onLineBegin(boolean isContinuation) throws IOException {
         // Nothing to do by default.
     }
 }