| package edu.illinois.ncsa.daffodil.processors |
| |
| /* Copyright (c) 2012-2013 Tresys Technology, LLC. All rights reserved. |
| * |
| * Developed by: Tresys Technology, LLC |
| * http://www.tresys.com |
| * |
| * Permission is hereby granted, free of charge, to any person obtaining a copy of |
| * this software and associated documentation files (the "Software"), to deal with |
| * the Software without restriction, including without limitation the rights to |
| * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies |
| * of the Software, and to permit persons to whom the Software is furnished to do |
| * so, subject to the following conditions: |
| * |
| * 1. Redistributions of source code must retain the above copyright notice, |
| * this list of conditions and the following disclaimers. |
| * |
| * 2. Redistributions in binary form must reproduce the above copyright |
| * notice, this list of conditions and the following disclaimers in the |
| * documentation and/or other materials provided with the distribution. |
| * |
| * 3. Neither the names of Tresys Technology, nor the names of its contributors |
| * may be used to endorse or promote products derived from this Software |
| * without specific prior written permission. |
| * |
| * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
| * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
| * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE |
| * SOFTWARE. |
| */ |
| |
| import java.nio.CharBuffer |
| import java.io.FileInputStream |
| import java.nio.charset.Charset |
| import sun.nio.cs.HistoricallyNamedCharset |
| import java.nio.channels.ReadableByteChannel |
| import java.nio.channels.Channels |
| import java.io.IOException |
| import java.nio.channels.FileChannel |
| import java.nio.charset.CharsetDecoder |
| import edu.illinois.ncsa.daffodil.exceptions.Assert |
| import java.nio.charset.CodingErrorAction |
| import java.nio.ByteBuffer |
| import java.io.InputStream |
| import scala.util.control.Breaks._ |
| import java.nio.charset.CoderResult |
| import edu.illinois.ncsa.daffodil.processors.charset.USASCII7BitPackedCharset |
| import edu.illinois.ncsa.daffodil.processors.charset.CharsetUtils |
| import edu.illinois.ncsa.daffodil.processors.charset.SupportsInitialBitOffset |
| import edu.illinois.ncsa.daffodil.compiler.Compiler |
| import java.nio.charset.MalformedInputException |
| import edu.illinois.ncsa.daffodil.processors.charset.CharacterSetAlignmentError |
| |
/**
 * Companion factory for the DFDL stream decoder.
 *
 * The decoder class re-implements Java's StreamDecoder so that DFDL can
 * control what happens on malformed input: DFDL wants a malformed-input
 * error to behave like end of data, whereas Java's StreamDecoder can only
 * ignore it, replace it, or raise an error.
 *
 * The implementation stays close to the Java code it was derived from so
 * as to preserve future inter-operation potential, although the changes
 * made for bit-level positioning make that a lot less likely.
 */
object DFDLJavaIOStreamDecoder {

  // Capacity, in bytes, of the buffer that sits between the channel and the decoder.
  private val DEFAULT_BYTE_BUFFER_SIZE: Int = Compiler.readerByteBufferSize.toInt

  /**
   * Builds a decoder that reads characters from `in`.
   *
   * @param in            stream supplying the bytes to decode
   * @param charset       character set used to create the underlying decoder
   * @param bitOffset0to7 bit offset within the first byte at which decoding starts
   * @param bitLimit      upper bound handed to the decoder as its `bitLimit`
   * @return a new decoder wired to a channel over `in`
   */
  def forInputStreamReader(in: InputStream, charset: Charset, bitOffset0to7: Int, bitLimit: Long): DFDLJavaIOStreamDecoder = {

    // A file stream already owns a channel; any other stream gets wrapped in one.
    val channel = in match {
      case fileStream: FileInputStream => fileStream.getChannel()
      case other => Channels.newChannel(other)
    }

    // Flipping the freshly allocated buffer leaves it with nothing remaining,
    // so the first decode attempt underflows and triggers a channel read.
    val byteBuffer = ByteBuffer.allocateDirect(DEFAULT_BYTE_BUFFER_SIZE)
    byteBuffer.flip()

    new DFDLJavaIOStreamDecoder(bitOffset0to7, bitLimit, charset, byteBuffer, in, channel)
  }

}
| |
/**
 * A `java.io.Reader` that decodes characters from a byte channel.
 *
 * The purpose of re-implementing this class is to gain control over
 * how the StreamDecoder handles malformed input. In DFDL we want the
 * malformed input error to be treated as the end of data. Java's
 * StreamDecoder only ignores, replaces or treats it as an error.
 *
 * Forces the decoder to REPORT on malformed input.
 *
 * We also have to implement the upper-bound aka bitLimit, and not
 * allow consumption of data past it.
 * NOTE(review): bitLimit is stored, but nothing in this class reads it yet.
 *
 * @param bitOffsetWithinAByte starting bit offset within the first byte; must be in 0..7
 * @param bitLimit             upper bound on the data (see the note above)
 * @param cs                   character set from which the decoder is created
 * @param bb                   buffer of not-yet-decoded bytes; it is compacted, refilled from
 *                             `ch`, and flipped back each time more bytes are needed
 * @param in                   underlying input stream; only consulted (when non-null) to ask
 *                             whether bytes are available
 * @param ch                   channel the bytes are read from; must not be null
 */
class DFDLJavaIOStreamDecoder private (bitOffsetWithinAByte: Int, val bitLimit: Long, var cs: Charset, var bb: ByteBuffer,
  var in: InputStream, var ch: ReadableByteChannel)
  extends java.io.Reader {

  Assert.usage(ch != null)
  Assert.usage(bitOffsetWithinAByte >= 0)
  Assert.usage(bitOffsetWithinAByte <= 7)

  val decoder = cs.newDecoder().onMalformedInput(CodingErrorAction.REPORT)

  //
  // Now we deal with bit positioning. If the decoder is capable of dealing with a starting bit offset
  // then we configure it that way. If not we insist we are byte aligned and fail if not.
  //
  decoder match {
    case decoderWithBits: SupportsInitialBitOffset =>
      decoderWithBits.setInitialBitOffset(bitOffsetWithinAByte)
    case _ if (bitOffsetWithinAByte == 0) => // ok. Do nothing. We are, luckily, aligned to a byte.
    case _ =>
      // We're counting on the parser surrounding us to catch this error and turn it into
      // a ParseError.
      throw new CharacterSetAlignmentError(cs.name, 8, bitOffsetWithinAByte)
  }

  @volatile
  private var isOpen: Boolean = true

  /** Throws an IOException if this reader has already been closed. */
  def ensureOpen: Unit = { if (!isOpen) throw new java.io.IOException("Stream closed") }

  // One decoded-but-not-yet-delivered character, kept by read0 (see below).
  private var haveLeftoverChar: Boolean = false
  private var leftoverChar: Char = 0

  /** Name of the encoding in use, or null once the reader has been closed. */
  def getEncoding: String = {
    if (isOpen) encodingName else null
  }

  /** Reads a single character; returns it as an Int, or -1 at end of data. */
  override def read: Int = {
    read0
  }

  // Leftover char is needed because of utf-16 and the surrogate pair stuff.
  // In that encoding, we need to read two codepoints, and if they are a surrogate pair
  // synthesize one character from them.
  //
  // Presumably this happens in the decoder, but in the case where it isn't a surrogate
  // pair, we save work by returning two decoded characters, not one since we've already
  // decoded the second one. (This little optimization seems very not worth it to me, but
  // is necessary if you can't peek forward 2 bytes into the underlying bytes without
  // consuming them.)
  //
  // Issue is that UTF-16 is really a variable-width 2-byte or 4-byte character encoding. But
  // in the early days of unicode and java, it was treated as a fixed width 2-byte.
  //
  // Hence, in DFDL we provide control.
  // This depends on dfdl:utf16Width="variable". If dfdl:utf16Width="fixed", then
  // we do NOT process surrogate pairs, and return them each as a separate codepoint.
  //
  // So at some point our decoder must implement utf16Width="fixed" which means it must
  // not do this surrogate pair processing. That means either we find a feature for that
  // in CharsetDecoder (it may be there as this is a common need), or we'll have to clone
  // and reimplement that class too.
  //
  // TBD: implement utf16Width="variable". For now we can SDE on utf16Width="variable".
  //
  // This code assumes implicitly that it is ok if the byte stream has been advanced
  // further (by one character decoding) than it would have to produce one character only.
  //
  // So long as we're never asking this stream of characters for a byte position we're ok.
  //
  // Note: we're not sharing this object across threads. We're assuming one instance per, for
  // any sort of parallel execution; hence, we don't do lock.synchronized (in Java this was
  // a synchronized method).
  //
  private def read0: Int = {
    if (haveLeftoverChar) {
      // Return the leftover char, if there is one
      haveLeftoverChar = false
      leftoverChar
    } else {
      // Convert more bytes. Two characters are always requested; a second
      // decoded character is parked as the leftover for the next call.
      val cb: Array[Char] = new Array[Char](2)
      read(cb, 0, 2) match {
        case -1 => -1
        case 2 =>
          leftoverChar = cb(1)
          haveLeftoverChar = true
          cb(0)
        case 1 => cb(0)
        case _ =>
          // TODO: assert false : n
          -1
      }
    }
  }

  /**
   * Reads up to `length` characters into `cbuf` starting at index `offset`.
   * All lengths and offsets are in character units.
   *
   * NOTE(review): the single-character path goes through read0, which calls
   * back into this method and counts what it decodes, and a leftover char is
   * counted again when it is delivered here - so DFDLCharCounter may count some
   * characters more than once. Confirm the intended meaning of that counter.
   *
   * @return the number of characters stored, 0 when `length` is 0, or -1 when
   *         no character could be produced because the data has ended
   * @throws IndexOutOfBoundsException if `offset`/`length` do not describe a
   *         valid region of `cbuf`
   * @throws java.io.IOException if the reader has been closed
   */
  def read(cbuf: Array[Char], offset: Int, length: Int): Int = {
    var off: Int = offset
    var len: Int = length

    // In Java this was a synchronized method; see the threading note above read0.
    ensureOpen
    if ((off < 0) || (off > cbuf.length) || (len < 0)
      || ((off + len) > cbuf.length) || ((off + len) < 0)) {
      throw new IndexOutOfBoundsException()
    }

    if (len == 0) { return 0 }
    var n: Int = 0

    if (haveLeftoverChar) {
      // Copy the leftover char into the buffer
      cbuf(off) = leftoverChar
      off += 1
      len -= 1
      haveLeftoverChar = false
      n = 1
      if ((len == 0) || !implReady) {
        // Return now if this is all we can produce w/o blocking
        DFDLCharCounter.incr(n)
        return n
      }
    }

    // If length is 1, then we recursively end up back here with length 2,
    // and that's why it's not a stack overflow.
    if (len == 1) {
      // Treat single-character array reads just like read()
      val c: Int = read0
      if (c == -1) {
        return if (n == 0) -1 else {
          DFDLCharCounter.incr(n)
          n
        }
      }
      cbuf(off) = c.asInstanceOf[Char]
      DFDLCharCounter.incr(n + 1)
      return n + 1
    }

    // implRead reports -1 at end of data. If a leftover char was already
    // delivered above (n == 1), adding -1 would yield 0 and the caller would
    // never learn about that char, so end of data then counts as "nothing more".
    val decoded: Int = implRead(cbuf, off, off + len)
    val res: Int =
      if (decoded < 0) { if (n == 0) -1 else n }
      else n + decoded
    if (res > 0) DFDLCharCounter.incr(res)
    res
  }

  /** True if a character can be produced without first asking the source for more bytes. */
  override def ready: Boolean = {
    ensureOpen
    haveLeftoverChar || implReady
  }

  /** Closes the underlying source; calling it again after a successful close does nothing. */
  def close: Unit = {
    if (!isOpen) { return }
    implClose
    isOpen = false
  }

  /**
   * Pulls more bytes from the channel into `bb`, keeping any undecoded bytes.
   *
   * The stream-based read path of the Java original is not carried over: the
   * channel is asserted non-null at construction, so it is always the source.
   *
   * @return the channel's negative result when it reports end-of-stream,
   *         otherwise the number of bytes now available in `bb`
   */
  private def readBytes: Int = {
    bb.compact()
    try {
      // Read from the channel
      val n: Int = ch.read(bb)
      if (n < 0) return n
    } finally {
      // Flip even when an IOException is thrown,
      // otherwise the stream will stutter
      bb.flip()
    }
    val rem: Int = bb.remaining()
    assert(rem != 0)
    rem
  }

  /**
   * Decodes bytes from `bb` into `cbuf[off, end)`.
   *
   * Contains the change that allows us to treat malformed data as
   * end of data. However, this requires that the decoder is set
   * to REPORT the issue rather than IGNORE or REPLACE it.
   *
   * In order to handle surrogate pairs, the invoker must ask for at least two
   * characters; saving the extra character, if any, is done at a higher level.
   *
   * @return the number of characters decoded, or -1 when none were decoded and
   *         the data has ended (real end-of-stream or malformed input)
   */
  def implRead(cbuf: Array[Char], off: Int, end: Int): Int = {
    assert(end - off > 1)
    var cb: CharBuffer = CharBuffer.wrap(cbuf, off, end - off)
    if (cb.position() != 0) {
      // Ensure that cb[0] == cbuf[off]
      cb = cb.slice()
    }

    var eof: Boolean = false

    breakable {
      while (true) {
        val cr: CoderResult = decoder.decode(bb, cb, eof) // decode bb to cb
        if (cr.isUnderflow) { // no more data, or not enough data to complete a character.
          if (eof) { break } // eof was set last time around, so this underflow just reinforces it.
          if (!cb.hasRemaining()) { break } // no more room in cb.
          if ((cb.position() > 0) && !inReady) { break } // have at least one character and the source isn't ready.
          val n: Int = readBytes // try to add more bytes to the bb
          if (n < 0) { // the channel reported end-of-stream
            eof = true
            if ((cb.position() == 0) && (!bb.hasRemaining())) { break } // nothing decoded and no bytes left.
            decoder.reset() // TODO ??? why reset. Can we actually discard any decoder state (such as mid-character)?
          }
          // Otherwise go around again and decode with the updated buffer / eof flag.
        } else {
          if (cr.isOverflow) {
            assert(cb.position() > 0)
            break
          }
          // NOTE(review): only malformed input is expected here; an
          // unmappable-character result would trip this invariant - confirm
          // whether that case should also be treated as end of data.
          Assert.invariant(cr.isMalformed)
          // DFDL Implementors:
          // The whole reason we reimplemented this code in scala is
          // to change this behavior here.
          //
          // DFDL needs decode errors to behave as if end of data was reached.
          // So instead of throwing (cr.throwException), just set eof true.
          eof = true
          //
          // Setting eof to true isn't enough on its own: we may have successfully
          // decoded some characters already, and those are returned as a count.
          // When nothing was decoded, the code after the loop returns -1 so the
          // calling context (the addMore member of Page[T] in PagedSeq.scala)
          // knows we are done and no more can be provided.
          //
          // Doc for the CharsetDecoder class says
          // "The malformed bytes begin at the input buffer's (possibly incremented)
          // position"
          //
          // So the byte buffer position is left at the end of the correctly decoded
          // characters, and no repositioning by cr.length() is attempted:
          // bb.position() can be zero while cr.length() is 1 or more (malformed data
          // first thing), or smaller than cr.length() in general, so
          // bb.position() >= cr.length() is not an invariant.
          break
        }
      } // end while loop
    }

    if (eof) {
      // ## Need to flush decoder
      decoder.reset()
    }

    if (cb.position() == 0) {
      if (eof) return -1 // note: we have to return -1, having decoded zero characters successfully.
      assert(false)
    }

    cb.position()
  }

  /** The charset's historical name when it has one, otherwise its canonical name. */
  def encodingName: String = {
    cs match {
      case historical: HistoricallyNamedCharset => historical.historicalName
      case _ => cs.name()
    }
  }

  // True when the stream reports available bytes, or when the channel is a
  // FileChannel. An IOException while asking counts as "not ready".
  private def inReady: Boolean = {
    try {
      ((in != null) && (in.available() > 0)) || (ch.isInstanceOf[FileChannel]) // ## RBC.available()?
    } catch {
      case e: IOException => false
    }
  }

  /** True if undecoded bytes are buffered or the source reports more are ready. */
  def implReady: Boolean = { bb.hasRemaining() || inReady }

  /** Closes the channel (or the stream, should the channel ever be null). */
  def implClose: Unit = {
    if (ch != null) ch.close()
    else in.close()
  }

}