| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package org.apache.daffodil.processors.unparsers |
| |
| import org.apache.daffodil.exceptions.Assert |
| import org.apache.daffodil.processors.ElementRuntimeData |
| import org.apache.daffodil.processors.TextTruncationType |
| import org.apache.daffodil.processors.CharsetEv |
| import org.apache.daffodil.processors.UnparseTargetLengthInBitsEv |
| import org.apache.daffodil.util.MaybeJULong |
| import org.apache.daffodil.processors.LengthEv |
| import org.apache.daffodil.processors.Evaluatable |
| |
| import java.nio.charset.MalformedInputException |
| import java.nio.charset.UnmappableCharacterException |
| |
/**
 * Common base for the specified-length string unparsers in this file.
 *
 * Holds the element runtime data, exposes it as the unparser context,
 * and supplies the string that is to be written.
 */
sealed abstract class StringSpecifiedLengthUnparserBase(val erd: ElementRuntimeData)
  extends TextPrimUnparser {

  override def context = erd

  /**
   * The text to unparse: the string form of the data value of the current
   * (simple) infoset node.
   *
   * Override in nil specified length unparsers.
   */
  protected def contentString(state: UState) = {
    val simpleNode = state.currentInfosetNode.asSimple
    simpleNode.dataValueAsString
  }

}
| |
/**
 * Writes the content string to the data output stream as-is, with no
 * truncation. Character encoding failures are reported as unparse errors.
 */
class StringNoTruncateUnparser(erd: ElementRuntimeData)
  extends StringSpecifiedLengthUnparserBase(erd) {

  override def runtimeDependencies: Vector[Evaluatable[AnyRef]] = Vector.empty

  override def unparse(state: UState): Unit = {
    val dos = state.dataOutputStream
    val valueToWrite = contentString(state)
    val nCharsWritten =
      try dos.putString(valueToWrite, state)
      catch {
        // Both encoding failures become unparse errors that name this unparser.
        case m: MalformedInputException =>
          UE(state, "%s - MalformedInputException: \n%s", nom, m.getMessage())
        case u: UnmappableCharacterException =>
          UE(state, "%s - UnmappableCharacterException: \n%s", nom, u.getMessage())
      }
    // The whole string must have been written.
    Assert.invariant(nCharsWritten == valueToWrite.length)
  }

}
| |
/**
 * Base for the specified-length string unparsers that may have to truncate.
 *
 * Must not be constructed with TextTruncationType.None.
 */
sealed abstract class StringSpecifiedLengthUnparserTruncateBase(
  stringTruncationType: TextTruncationType.Type,
  erd: ElementRuntimeData)
  extends StringSpecifiedLengthUnparserBase(erd) {

  Assert.usage(stringTruncationType ne TextTruncationType.None)

  /**
   * Shortens `str` to `nChars` characters according to the truncation type.
   *
   * We only truncate strings, and only if textStringJustification is left or
   * right, and only if truncateSpecifiedLengthString is yes.
   *
   * @param ustate unparser state, used for reporting an unparse error
   * @param str    the string to shorten
   * @param nChars the number of characters to keep
   * @return the trailing `nChars` characters for TextTruncationType.Right,
   *         the leading `nChars` characters for TextTruncationType.Left
   */
  protected final def truncateByJustification(ustate: UState, str: String, nChars: Long): String = {
    Assert.invariant(erd.optTruncateSpecifiedLengthString.isDefined)
    val keepCount = nChars.toInt
    val dropCount = str.length - keepCount
    stringTruncationType match {
      case TextTruncationType.Right =>
        // drop characters from the front, keeping the tail
        str.substring(dropCount)
      case TextTruncationType.Left =>
        // drop characters from the end, keeping the head
        str.substring(0, keepCount)
      case TextTruncationType.ErrorIfNeeded =>
        // justification type was "center", which cannot be truncated, so
        // this is an error
        UE(ustate, "Truncation required but disallowed when dfdl:truncateSpecifiedLengthString=\"yes\" and dfdl:textStringJustification=\"center\"")
      case TextTruncationType.None =>
        Assert.invariantFailed("cannot be TextTruncationType.None")
    }
  }
}
/**
 * Truncates strings to the right length measured in bits.
 *
 * LengthUnits is Bits, but we still don't know whether the encoding
 * is fixed width or variable width.
 */
class StringMaybeTruncateBitsUnparser(
  targetLengthInBitsEv: UnparseTargetLengthInBitsEv,
  stringTruncationType: TextTruncationType.Type,
  erd: ElementRuntimeData,
  charsetEv: CharsetEv)
  extends StringSpecifiedLengthUnparserTruncateBase(
    stringTruncationType,
    erd) {

  override lazy val runtimeDependencies = Vector(targetLengthInBitsEv, charsetEv)

  /**
   * Measures how long `str` will be once encoded.
   *
   * @param str   the string to measure
   * @param state the unparser state, used to evaluate the charset and, for
   *              variable-width encodings, to encode into a temporary stream
   * @return a pair of (length in bits, length in characters)
   */
  private def getLengthInBits(str: String, state: UState): (Long, Long) = {
    val cs = charsetEv.evaluate(state)
    val mfw = cs.maybeFixedWidth
    val sl = str.length.toLong
    val res =
      if (mfw.isDefined) {
        // fixed width encoding so we can get the length in bits by calculation
        //
        // DFDL workgroup discussed whether one must scan the characters here so as
        // to detect encoding errors, and it was decided one does not have
        // to do so for the fixed length case. This encoding error will get
        // found when the string is *actually* encoded later.
        //
        (sl * mfw.get, sl)
      } else {
        //
        // variable width encoding. (most important example is utf-8 encoding)
        // We have to measure.
        // It would be nice to save the result of this, if the string is
        // very long (which is certainly possible), but the overhead of
        // doing so for short strings is probably not worth it.
        //
        // Almost certainly the right thing is to add complexity only if
        // profiling shows this to be a bottleneck.
        //
        // We do this measurement by reusing the I/O system. This has the advantage
        // that whatever the I/O system's encoding error behavior is, this will
        // reuse that. However, we're not, per-se, required to detect that here,
        // but we do need to get the exact same number of bits here as will
        // be output to the actual data output stream later, so reusing the
        // DOS ensures that we're using the exact same encoder initialized
        // the exact same way.
        //
        // TODO: PERFORMANCE:
        // We also do have the option of recycling everything here from pools,
        // or, since each unparser is single threaded and has its own state,
        // we could put temp space for this in the UState and just directly
        // reuse it.
        //
        state.withByteArrayOutputStream {
          case (_, dos) =>
            val nChars = dos.putString(str, state)
            val nBits = dos.relBitPos0b.toLong
            (nBits, nChars)
        }

      }
    res
  }

  /**
   * Writes the content string, first truncating it (per the truncation type)
   * when its encoded length exceeds the target length in bits.
   */
  override def unparse(state: UState): Unit = {

    //
    // We have to stage the bits of the value just so as to be able to count them
    // Then we can figure out the number of padChars to add because a padChar must
    // be a minimum-width character.
    //
    val dos = state.dataOutputStream
    val valueString = contentString(state)
    val valueToWrite = {
      //
      // Historic note about lessons learned.
      //
      // We used to think unparsing would be symmetric to parsing, and there
      // would be this ability to bind the length limit, and then use it
      // to know how much to unparse, depending on the length limit to detect
      // overruns (too much data), and to tell us the size for the truncation
      // case.
      //
      // This is depending on some outer unparser binding the length limit
      // so that we know how long to truncate to.
      // And that we're not even being called until that target length is known
      // and has been bound.
      //
      // But data output streams don't do binding of length limits the
      // same way as the parser, because in many situations the data output
      // stream before and after aren't the same object.
      //
      // So this concept doesn't work.
      //
      // Really we want to just get the target length in bits, knowing that
      // by the time we get here it DOES have a value.
      //
      val maybeTargetLengthInBits: MaybeJULong = targetLengthInBitsEv.evaluate(state)
      Assert.invariant(maybeTargetLengthInBits.isDefined)
      val targetLengthInBits = maybeTargetLengthInBits.get
      val (nBits, nChars) = getLengthInBits(valueString, state)
      val targetLengthDiff = nBits - targetLengthInBits
      if (targetLengthDiff <= 0) {
        // truncation is not needed, just write the original string
        valueString
      } else {
        val nBitsToTrim = targetLengthDiff
        val dcs = erd.encInfo.getDFDLCharset(state)
        val minBitsPerChar = erd.encodingInfo.encodingMinimumCodePointWidthInBits(dcs)
        // padChar must be a minimum-width char
        //
        // Round UP to whole characters. If the excess is not an exact multiple
        // of the character width (e.g., 7-bit characters in a byte-sized field),
        // rounding down would trim too few characters and the string written
        // would still be longer than the target length.
        //
        // NOTE(review): for variable-width encodings, dividing by the minimum
        // code point width can ask to trim more characters than strictly
        // necessary (or trip the invariant below) when the string contains
        // wider characters - confirm whether that is acceptable.
        //
        val nCharsToTrim = (nBitsToTrim + minBitsPerChar - 1) / minBitsPerChar // positive, since we need to truncate.
        Assert.invariant(nCharsToTrim <= nChars)
        val truncatedValue = truncateByJustification(state, valueString, nChars - nCharsToTrim.toInt)
        Assert.invariant(truncatedValue.length <= valueString.length)
        truncatedValue
      }
    }

    val nCharsWritten = dos.putString(valueToWrite, state)
    Assert.invariant(nCharsWritten == valueToWrite.length) // assertion because we figured this out above based on available space.
    //
    // Filling of unused bits is done elsewhere now
    //
  }
}
| |
/**
 * Truncates strings to the right length measured in characters.
 *
 * LengthUnits is 'characters'. The encoding may be fixed or variable
 * width, but in this case that does not matter: the length is counted
 * in characters either way.
 *
 * This is more efficient for variable-width encodings than
 * lengthUnits 'bytes' (or bits), because no pass to measure
 * the number of bits is needed.
 *
 * So, for utf-8, we should recommend lengthUnits 'characters' ? Maybe so.
 */
class StringMaybeTruncateCharactersUnparser(
  lengthInCharactersEv: LengthEv,
  stringTruncationType: TextTruncationType.Type,
  erd: ElementRuntimeData)
  extends StringSpecifiedLengthUnparserTruncateBase(stringTruncationType, erd) {

  override lazy val runtimeDependencies = Vector(lengthInCharactersEv)

  /**
   * Produces the string that will actually be written: the content string
   * itself when it fits in the target length, otherwise the content string
   * truncated per the truncation type.
   */
  private def stringToWrite(state: UState): String = {
    val valueString = contentString(state)
    val targetLengthInCharacters =
      lengthInCharactersEv.evaluate(state)
    if (targetLengthInCharacters >= valueString.length) valueString
    else {
      // the content is longer than the target, so it must be shortened
      val originalLength = valueString.length.toLong
      val excessChars = originalLength - targetLengthInCharacters
      Assert.invariant(excessChars <= originalLength)
      val shortened = truncateByJustification(state, valueString, originalLength - excessChars.toInt)
      Assert.invariant(shortened.length <= valueString.length)
      shortened
    }
  }

  /**
   * Writes the (possibly truncated) content string to the data output stream.
   */
  override def unparse(state: UState): Unit = {
    val dos = state.dataOutputStream
    val outputString = stringToWrite(state)
    val nCharsWritten = dos.putString(outputString, state)
    // everything handed to the stream must have been written
    Assert.invariant(nCharsWritten == outputString.length)
  }
}