// blob: ec1ee36b5bc4f057fa6740747b950c5ba1fe6b16 [file] [log] [blame]
package edu.illinois.ncsa.daffodil.dsom
/* Copyright (c) 2012-2013 Tresys Technology, LLC. All rights reserved.
*
* Developed by: Tresys Technology, LLC
* http://www.tresys.com
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of
* this software and associated documentation files (the "Software"), to deal with
* the Software without restriction, including without limitation the rights to
* use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
* of the Software, and to permit persons to whom the Software is furnished to do
* so, subject to the following conditions:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimers.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimers in the
* documentation and/or other materials provided with the distribution.
*
* 3. Neither the names of Tresys Technology, nor the names of its contributors
* may be used to endorse or promote products derived from this Software
* without specific prior written permission.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
* SOFTWARE.
*/
import edu.illinois.ncsa.daffodil.processors.charset.CharsetUtils
import edu.illinois.ncsa.daffodil.exceptions.Assert
import edu.illinois.ncsa.daffodil.schema.annotation.props.gen.UTF16Width
/**
* Split this out of AnnotatedMixin for separation of
* concerns reasons.
*
* TODO: move to GrammarMixins.scala, or another file
* of these sorts of traits that are mixed onto the
* schema components.
*/
trait EncodingMixin { self: AnnotatedSchemaComponent =>

  /**
   * Character encoding common attributes
   *
   * Note that since encoding can be computed at runtime, we
   * create values to tell us if the encoding is known or not
   * so that we can decide things at compile time when possible.
   *
   * The value is `encoding.isConstant`. When the encoding is constant and
   * its upper-cased name starts with "UTF-16", the first evaluation also
   * checks (through schemaDefinitionUnless) that utf16Width is 'fixed'.
   */
  lazy val isKnownEncoding: Boolean = {
    val isKnown = encoding.isConstant
    if (isKnown) {
      // Locale.ROOT keeps the upper-casing independent of the JVM default
      // locale (e.g. under a Turkish locale "i" would otherwise become a
      // dotted capital I and the name would no longer match).
      val encName = encoding.constantAsString.toUpperCase(java.util.Locale.ROOT)
      if (encName.startsWith("UTF-16")) {
        schemaDefinitionUnless(utf16Width == UTF16Width.Fixed, "Property utf16Width='variable' not supported.")
        //
        // TODO: when runtime encoding is supported, must also check for utf16Width
        // (and error if unsupported then, or just implement it!)
      }
    }
    isKnown
  }

  /**
   * The constant encoding name in canonical form.
   *
   * Note that the canonical form for encoding names is all upper case.
   * Upper-casing is done with Locale.ROOT so that the result does not
   * depend on the default locale of the JVM.
   *
   * Asserts (Assert.invariant) that the encoding is known.
   */
  lazy val knownEncodingName: String = {
    Assert.invariant(isKnownEncoding)
    encoding.constantAsString.toUpperCase(java.util.Locale.ROOT)
  }

  /**
   * The charset that CharsetUtils.getCharset returns for knownEncodingName.
   */
  lazy val knownEncodingCharset = {
    CharsetUtils.getCharset(knownEncodingName)
  }

  // Really bad idea. Don't save these. Decoders and Encoders are stateful
  // so they can't be precomputed here and reused without all sorts of
  // thread issues and reset protocols.
  //  lazy val knownEncodingDecoder = {
  //    val decoder = knownEncodingCharset.newDecoder()
  //    decoder
  //  }
  //
  //  lazy val knownEncodingEncoder = {
  //    val encoder = knownEncodingCharset.newEncoder()
  //    encoder
  //  }

  /**
   * Alignment, in bits, for the known encoding: 1 for the 7-bit packed
   * ASCII encoding, 8 for everything else.
   */
  lazy val knownEncodingAlignmentInBits: Int = {
    knownEncodingName match {
      case "US-ASCII-7-BIT-PACKED" => 1 // canonical form of encoding names is all upper case
      case _ => 8
    }
  }

  /**
   * enables optimizations and random-access
   *
   * variable-width character sets require scanning to determine
   * their end.
   *
   * Derived from knownEncodingWidthInBits so that the table of supported
   * encodings lives in exactly one place: that table yields -1 for
   * variable-width encodings and a positive width otherwise. An unsupported
   * encoding goes through the same schemaDefinitionError call as in
   * knownEncodingWidthInBits.
   */
  lazy val knownEncodingIsFixedWidth: Boolean = knownEncodingWidthInBits > 0

  /**
   * Negation of knownEncodingIsFixedWidth.
   */
  lazy val couldBeVariableWidthEncoding: Boolean = !knownEncodingIsFixedWidth

  /**
   * Width of one character of the known encoding, in bits, or -1 when the
   * encoding is variable width (UTF-8, or UTF-16 when utf16Width is not
   * 'fixed').
   *
   * Encoding names not listed here are passed to schemaDefinitionError.
   */
  lazy val knownEncodingWidthInBits: Int = {
    // knownEncodingCharset.width()
    val variableWidth = -1
    knownEncodingName match {
      case "US-ASCII" | "ASCII" => 8
      case "US-ASCII-7-BIT-PACKED" => 7 // NOTE! 7-bit characters dense packed. 8th bit is NOT unused.
      case "UTF-8" => variableWidth
      case "UTF-16" | "UTF-16LE" | "UTF-16BE" =>
        if (utf16Width == UTF16Width.Fixed) 16 else variableWidth
      case "UTF-32" | "UTF-32BE" | "UTF-32LE" => 32
      case "ISO-8859-1" => 8
      case _ => schemaDefinitionError("Text encoding '%s' is not supported.", knownEncodingName)
    }
  }

  /**
   * A function giving the length, in bits, of a string when represented in
   * the known encoding.
   *
   * This will be called at runtime, so we decide what we can up front and
   * return a function that has the characteristics of the encoding wired
   * down: the width (or the encoding name) is read once here and captured
   * by the returned function.
   */
  lazy val knownEncodingStringBitLengthFunction: String => Int = {
    if (knownEncodingIsFixedWidth) {
      val widthInBits = knownEncodingWidthInBits
      // NOTE(review): str.length counts UTF-16 code units, so a character
      // outside the BMP counts twice. That presumably over-counts for
      // UTF-32 - TODO confirm whether supplementary characters matter here.
      (str: String) => str.length * widthInBits
    } else {
      val encodingName = knownEncodingName
      // variable width encoding, so we have to convert each character
      // We assume here that it will be a multiple of bytes
      // that is, that variable-width encodings are all some number
      // of bytes.
      (str: String) => str.getBytes(encodingName).length * 8
    }
  }
}