/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.daffodil.xml
import java.io.File
import scala.xml._
import org.apache.daffodil.exceptions._
import scala.collection.mutable.ArrayBuilder
import org.apache.commons.io.IOUtils
import scala.xml.NamespaceBinding
import org.apache.daffodil.schema.annotation.props.LookupLocation
/**
* Utilities for handling XML
*
* @version 1
* @author Alejandro Rodriguez
*/
object XMLUtils {
/**
* We must have xsi prefix bound to the right namespace.
* That gets enforced elsewhere.
*/
val xmlNilAttribute = new PrefixedAttribute("xsi", "nil", "true", scala.xml.Null)
val PositiveInfinity = Double.PositiveInfinity
val NegativeInfinity = Double.NegativeInfinity
val NaN = Double.NaN
val PositiveInfinityString = "INF"
val NegativeInfinityString = "-INF"
val NaNString = "NaN"
/**
* Legal XML v1.0 chars are #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
*/
def remapXMLIllegalCharToPUA(checkForExistingPUA: Boolean = true, replaceCRWithLF: Boolean = true)(c: Char): Char = {
val cInt = c.toInt
val res = cInt match {
case 0x9 => c
case 0xA => c
case 0xD =>
if (replaceCRWithLF) 0xA.toChar // Map CR to LF. That's what XML does.
else 0xE00D.toChar // or remap it to PUA so it is non-whitespace, and preserved.
case _ if (cInt < 0x20) => (cInt + 0xE000).toChar
case _ if (cInt > 0xD7FF && cInt < 0xE000) => (cInt + 0x1000).toChar
case _ if (cInt >= 0xE000 && cInt <= 0xF8FF) => {
if (checkForExistingPUA)
Assert.usageError("Pre-existing Private Use Area (PUA) character found in data: '%s'".format(c))
else c
}
case 0xFFFE => 0xF0FE.toChar
case 0xFFFF => 0xF0FF.toChar
case _ if (cInt > 0x10FFFF) => {
Assert.invariantFailed("Character code beyond U+10FFFF found in data. Codepoint: %s".format(cInt))
}
case _ => c
}
res
}
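// A quick usage sketch (behavior follows directly from the mapping above):
//
//   remapXMLIllegalCharToPUA()('\u0001') // yields '\uE001' - control char moved into the PUA
//   remapXMLIllegalCharToPUA()('\r')     // yields '\n' - CR becomes LF since replaceCRWithLF defaults to true
//   remapXMLIllegalCharToPUA()('A')      // yields 'A' - legal XML chars pass through unchanged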
/**
* Reverse of the above method
*/
def remapPUAToXMLIllegalChar(checkForExistingPUA: Boolean = true)(c: Char): Char = {
val cInt = c.toInt
val res = cInt match {
case _ if (c >= 0xE000 && c < 0xE020) => (c - 0xE000).toChar
case _ if (c > 0xE7FF && c < 0xF000) => (c - 0x1000).toChar
case 0xF0FE => 0xFFFE.toChar
case 0xF0FF => 0xFFFF.toChar
case _ if (c > 0x10FFFF) => {
Assert.invariantFailed("Character code beyond U+10FFFF found in data. Codepoint: %s".format(c.toInt))
}
case _ => c
}
res
}
def isLeadingSurrogate(c: Char) = {
c >= 0xD800 && c <= 0xDBFF
}
def isTrailingSurrogate(c: Char) = {
c >= 0xDC00 && c <= 0xDFFF
}
/**
* Length where a surrogate pair counts as 1 character, not two.
*/
def uncodeLength(s: String) = {
// performance note: this might get called a lot. So needs to be fast.
// it needs to scan the string once, examine each character.
// using getBytes utf-32 isn't necessarily slow. It might be fine.
val res = s.getBytes("UTF-32BE").length / 4
res
}
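// For example (a surrogate pair counts once):
//
//   "\uD83D\uDE00".length        // 2 - UTF-16 code units
//   uncodeLength("\uD83D\uDE00") // 1 - one Unicode character, U+1F600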
/**
* Because of surrogate pairs, and the difference between 16-bit string codepoints
* and real character codes, lots of things that traverse strings need
* to consider either the codepoint after (if current is a leading surrogate)
* or codepoint before (if current is a trailing surrogate).
*
* This calls a body function with prev, current, next bound to those.
* For first codepoint prev will be 0. For last codepoint next will be 0.
*
* NOTE: This function contains the same traversal algorithm as
* remapXMLCharacters, but is more general and is a bit slower.
* Any changes made to this function probably need to be incorporated into
* the other.
*/
def walkUnicodeString[T](str: String)(bodyFunc: (Char, Char, Char) => T): Seq[T] = {
val len = str.length
if (len == 0) return Nil
val list = new scala.collection.mutable.ListBuffer[T]
var pos = 0;
var prev = 0.toChar
var curr = str(0)
var next = 0.toChar
while (pos < len) {
next = if (pos + 1 < len) str(pos + 1) else 0.toChar
list += bodyFunc(prev, curr, next)
prev = curr
curr = next
pos += 1
}
list
}
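// A small sketch of the triples the body function sees:
//
//   walkUnicodeString("abc") { (prev, curr, next) => (prev, curr, next) }
//   // yields Seq(('\u0000','a','b'), ('a','b','c'), ('b','c','\u0000'))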
/*
* This function contains the same string traversal algorithm as
* walkUnicodeString. The only difference is that it uses a StringBuilder
* rather than a ListBuffer[T] that would be used in walkUnicodeString. Note
* that since StringBuilder is not synchronized it is noticeably faster than
* StringBuffer, and since the StringBuilder is local to the function, we
* don't have to worry about any threading issues. This specialization makes for
* a noticeable speed increase, so much so that the code duplication is worth
* it. Any changes made to this function probably need to be incorporated
* into the other.
*/
def remapXMLCharacters(dfdlString: String, remapFunc: (Char) => Char): String = {
// we want to remap XML-illegal characters
// but leave legal surrogate-pair character pairs alone.
def remapOneChar(previous: Char, current: Char, next: Char): Char = {
if (isLeadingSurrogate(current) && isTrailingSurrogate(next)) return current
if (isTrailingSurrogate(current) && isLeadingSurrogate(previous)) return current
remapFunc(current)
}
val len = dfdlString.length
if (len == 0) return dfdlString
val sb = new StringBuilder()
var pos = 0;
var prev = 0.toChar
var curr = dfdlString(0)
var next = 0.toChar
while (pos < len) {
next = if (pos + 1 < len) dfdlString(pos + 1) else 0.toChar
if (curr == 0xD) {
if (next != 0xA) {
// This is a lone CR (i.e. not a CRLF), so convert the CR to a LF
sb.append(0xA.toChar)
} else {
// This is a CRLF. Skip the CR, essentially converting the CRLF to
// just LF. Do nothing.
}
} else {
sb.append(remapOneChar(prev, curr, next))
}
prev = curr
curr = next
pos += 1
}
sb.toString
}
def remapXMLIllegalCharactersToPUA(dfdlString: String): String = {
remapXMLCharacters(dfdlString, remapXMLIllegalCharToPUA(false))
}
def remapPUAToXMLIllegalCharacters(dfdlString: String): String = {
remapXMLCharacters(dfdlString, remapPUAToXMLIllegalChar(false))
}
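// Round-trip sketch using the two wrappers above:
//
//   remapXMLIllegalCharactersToPUA("a\u0000b") // yields "a\uE000b"
//   remapPUAToXMLIllegalCharacters("a\uE000b") // yields "a\u0000b"
//   remapXMLIllegalCharactersToPUA("x\r\ny")   // yields "x\ny" - the CR of a CRLF is dropped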
/*
* This is needed for equality comparison of XML.
*
* Ex: "foo&#x221;bar" is 3 nodes, not one string node.
* But appears to be one string when serialized as XML.
*
* Once the XML has been read into XML objects, the 3 objects
* are just 3 adjacent text nodes, so adjacent text nodes
* can be coalesced for use in the DFDL Infoset, or for comparing
* trees of XML that may have been created in different ways.
*/
def coalesceAdjacentTextNodes(seq: Seq[Node]): Seq[Node] = {
if (seq.length == 0) return seq
if (seq.length == 1) {
seq(0) match {
case p: PCData => return seq
case Text(data) =>
if (data.matches("""\s*""")) return Nil
else return seq
case u: Unparsed => return seq // TODO: are these needed or possible?
case _ => // fall through to code below. (We need to process children)
}
}
val ab = ArrayBuilder.make[Node]
var i = 0
//
// invariant: either the tn node is null
// or the stringbuilder is null or empty
//
// They never both have content.
//
var tn: Node = null
var sb: StringBuilder = null
def processText = {
if (tn == null) {
if (sb != null && sb.length > 0) {
// we have accumulated text
// let's output a text node
// Note that a Text constructor
// will escapify the text again.
// We unescaped it
// when we used .text to get data
// out of the nodes.
ab += new Text(sb.toString)
sb.clear()
}
} else {
// tn not null
Assert.invariant(sb == null || sb.length == 0)
ab += tn
tn = null
}
}
while (i < seq.length) {
val current = seq(i)
i = i + 1
if ((current.isInstanceOf[Text] || current.isInstanceOf[Unparsed])) {
if (tn == null) {
if (sb == null || sb.length == 0) {
// hold onto this text node. It might be isolated
tn = current
} else {
// accumulate this text
sb.append(current.text)
}
} else {
if (sb == null) sb = new StringBuilder
// accumulate both the pending tn text node
// and this new one we just encountered.
//
// Note we use .text here - that unescapifies
// Which is important since we're putting together
// things that might be PCData (aka <![CDATA[...]]>
// We want that stuff gone.
//
sb.append(tn.text)
sb.append(current.text)
//
// set tn to null to indicate we're accumulating
// into the string buffer
//
tn = null
}
} else {
// not an atom
processText // if there is pending text output that first
ab += current // then the current non-atom node.
}
}
// we fell out of the loop. So
processText // in case there is text left pending when we hit the end
ab.result
}
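// Coalescing sketch:
//
//   coalesceAdjacentTextNodes(Seq(Text("ab"), Text("cd"), <e/>, Text("f")))
//   // yields Seq(Text("abcd"), <e/>, Text("f"))
//   coalesceAdjacentTextNodes(Seq(Text("   ")))
//   // yields Nil - an isolated whitespace-only text node is dropped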
val XSD_NAMESPACE = NS("http://www.w3.org/2001/XMLSchema") // removed trailing slash (namespaces care)
val XSI_NAMESPACE = NS("http://www.w3.org/2001/XMLSchema-instance")
val XPATH_FUNCTION_NAMESPACE = NS("http://www.w3.org/2005/xpath-functions")
val XPATH_MATH_NAMESPACE = NS("http://www.w3.org/2005/xpath-functions/math")
val DFDL_NAMESPACE = NS("http://www.ogf.org/dfdl/dfdl-1.0/") // dfdl ns does have a trailing slash
val TDML_NAMESPACE = NS("http://www.ibm.com/xmlns/dfdl/testData")
val EXAMPLE_NAMESPACE = NS("http://example.com")
val XHTML_NAMESPACE = NS("http://www.w3.org/1999/xhtml")
/**
* Returns an Elem with local name "element" and the scope provided,
* with the prefix of the Elem set up to match the scope's binding
* for the XSD_NAMESPACE.
*
* If the XSD_NAMESPACE is the default namespace, then the prefix will
* be null. If the XSD_NAMESPACE is bound to a prefix, the first such
* prefix will be used for the returned Elem.
*/
def getXSDElement(scope: NamespaceBinding): Elem = {
val xsdPre = scope.getPrefix(XSD_NAMESPACE.toString)
val isXSDTheDefaultNS = XSD_NAMESPACE.toString() == scope.getURI(null)
val xsdPrefix =
if (xsdPre ne null) xsdPre
else if (isXSDTheDefaultNS) null
else Assert.usageError("Scope argument must have a binding for the XSD namespace.")
val res =
Elem(xsdPrefix, "element", Null, scope, true)
res
}
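// Sketch, assuming a scope that binds the "xs" prefix to the XSD namespace:
//
//   val scope = NamespaceBinding("xs", XSD_NAMESPACE.toString, TopScope)
//   getXSDElement(scope) // an Elem with prefix "xs" and label "element", carrying that scope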
/**
* Added to support extensions and proposed future features as part of Daffodil.
*
* The DFDL standard requires us to keep these out of the primary DFDL namespace, and
* we really should be using URN-style notation, not http URLs for these.
* (for why http URLs are a bad idea for these, see:
* http://www.w3.org/blog/systeam/2008/02/08/w3c_s_excessive_dtd_traffic/ )
*
* These definitions must match their XSD counterparts in dafint.xsd and dafext.xsd
*/
private val DAFFODIL_EXTENSIONS_NAMESPACE_ROOT_NCSA = "urn:ogf:dfdl:2013:imp:opensource.ncsa.illinois.edu:2012"
private val DAFFODIL_EXTENSION_NAMESPACE_NCSA = NS(DAFFODIL_EXTENSIONS_NAMESPACE_ROOT_NCSA + ":ext")
val EXT_PREFIX_NCSA = "daf"
val EXT_NS_NCSA = NS(DAFFODIL_EXTENSION_NAMESPACE_NCSA.uri)
private val DAFFODIL_EXTENSIONS_NAMESPACE_ROOT_APACHE = "urn:ogf:dfdl:2013:imp:daffodil.apache.org:2018"
private val DAFFODIL_EXTENSION_NAMESPACE_APACHE = NS(DAFFODIL_EXTENSIONS_NAMESPACE_ROOT_APACHE + ":ext")
val EXT_PREFIX_APACHE = "daf"
val EXT_NS_APACHE = NS(DAFFODIL_EXTENSION_NAMESPACE_APACHE.uri)
private val DAFFODIL_INTERNAL_NAMESPACE = NS(DAFFODIL_EXTENSIONS_NAMESPACE_ROOT_APACHE + ":int")
val INT_PREFIX = "dafint"
val INT_NS = NS(DAFFODIL_INTERNAL_NAMESPACE.uri)
val FILE_ATTRIBUTE_NAME = "file"
val LINE_ATTRIBUTE_NAME = "line"
val COLUMN_ATTRIBUTE_NAME = "col"
// shorter forms, to make the lines shorter when constructing XML literals.
val xsdURI = XSD_NAMESPACE
val dfdlURI = DFDL_NAMESPACE
val dfdlAppinfoSource = NS("http://www.ogf.org/dfdl/")
val targetNS = EXAMPLE_NAMESPACE // we use this for tests.
val xsiURI = XSI_NAMESPACE
val fnURI = XPATH_FUNCTION_NAMESPACE
val mathURI = XPATH_MATH_NAMESPACE
val dafintURI = DAFFODIL_INTERNAL_NAMESPACE
val DFDL_SIMPLE_BUILT_IN_TYPES =
List("string",
"float",
"double",
"decimal",
"integer",
"long",
"int",
"short",
"byte",
"unsignedLong",
"unsignedInt",
"nonNegativeInteger",
"unsignedShort",
"unsignedByte",
"boolean",
"date",
"time",
"dateTime",
"hexBinary")
def slashify(s: String): String = if (s == "" || s.endsWith("/")) s else s + "/"
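// For example:
//
//   slashify("http://www.ogf.org/dfdl") // yields "http://www.ogf.org/dfdl/"
//   slashify("")                        // yields "" - the empty string is left alone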
/**
* Annoyingly, namespace bindings are not a collection you can process like a normal collection.
* Instead they are chained together via these parent links.
*/
def namespaceBindings(nsBinding: NamespaceBinding): Seq[NamespaceBinding] = {
if (nsBinding == null) Nil
else {
val thisOne =
if (nsBinding.uri != null) List(nsBinding)
else Nil
val others = namespaceBindings(nsBinding.parent)
thisOne ++ others
}
}
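// Sketch, flattening a two-deep chain with placeholder URIs (TopScope, which has a null uri, is dropped):
//
//   val scope = NamespaceBinding("a", "urn:a", NamespaceBinding("b", "urn:b", TopScope))
//   namespaceBindings(scope).map(_.prefix) // yields Seq("a", "b") - innermost binding first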
/**
* We don't want to be sensitive to which prefix people bind these namespaces to.
*/
def dfdlAttributes(n: Node) = {
n.attributes filter {
_.getNamespace(n) == DFDL_NAMESPACE.toString
}
}
def dafAttributes(n: Node) = {
n.attributes.filter { a =>
a.getNamespace(n) == XMLUtils.EXT_NS_NCSA.toString ||
a.getNamespace(n) == XMLUtils.EXT_NS_APACHE.toString
}
}
/**
* Used to collapse the excessive xmlns proliferation.
*
* If a local scope has bindings in it that are not in the outer scope
* then a new local scope is created which extends the outer scope.
*
* This algorithm is n^2 (or worse) in the length of the outer binding chain (worst case).
*/
def combineScopes(local: NamespaceBinding, outer: NamespaceBinding): NamespaceBinding = {
if (local == TopScope) outer
else {
val NamespaceBinding(pre, uri, moreBindings) = local
val outerURI = outer.getURI(pre)
if (outerURI == uri) {
// same binding for this prefix in the outer, so we don't need
// this binding from the local scope.
combineScopes(moreBindings, outer)
} else if (outerURI == null) {
// outer lacks a binding for this prefix
NamespaceBinding(pre, uri, combineScopes(moreBindings, outer))
} else {
// outer has a different binding for this prefix.
// one would hope that we can just put our superseding binding on the
// front, but you end up with two bindings for the same prefix
// in the chain ... and things fail
//
// The problem this creates is that it un-shares all the sub-structure
// of the scopes, and so we no longer have contained elements
// that share scopes with enclosing parents. That may mean that
// lots of xmlns:pre="ns" proliferate again even though they're
// unnecessary.
//
val outerWithoutDuplicate = removeBindings(NamespaceBinding(pre, uri, TopScope), outer)
val moreBindingsWithoutConflict = removeBindings(NamespaceBinding(pre, uri, TopScope), moreBindings)
NamespaceBinding(pre, uri, combineScopes(moreBindingsWithoutConflict, outerWithoutDuplicate))
}
}
}
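// Sketch of the two easy cases, using placeholder URIs:
//
//   val outer = NamespaceBinding("xs", "urn:xs", TopScope)
//   combineScopes(NamespaceBinding("xs", "urn:xs", TopScope), outer)     // yields outer - duplicate binding dropped
//   combineScopes(NamespaceBinding("dfdl", "urn:dfdl", TopScope), outer) // prepends the dfdl binding onto outer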
/**
* Removes from scope all bindings whose prefixes also have a binding in nb.
*/
def removeBindings(nb: NamespaceBinding, scope: NamespaceBinding): NamespaceBinding = {
if (nb == TopScope) scope
else if (scope == TopScope) scope
else {
val NamespaceBinding(pre, _, more) = scope
if (nb.getURI(pre) != null) {
// the scope has a binding for this prefix
// so irrespective of the uri, we remove it.
removeBindings(nb, more)
} else {
// no binding, so keep it
scope.copy(parent = removeBindings(nb, more))
}
}
}
def combineScopes(prefix: String, ns: NS, outer: NamespaceBinding): NamespaceBinding = {
if (ns.optURI.isEmpty) {
outer
} else {
val uri = ns.optURI.get.toString
val inner = NamespaceBinding(prefix, uri, TopScope)
combineScopes(inner, outer)
}
}
def collapseScopes(x: Node, outer: NamespaceBinding): Node = {
x match {
case Elem(pre, lab, md, scp, child @ _*) => {
val newScope = combineScopes(scp, outer)
Elem(pre, lab, md, newScope, true, (child flatMap { ch => collapseScopes(ch, newScope) }): _*)
}
case _ => x
}
}
/**
* Removes NamespaceBindings from a scope containing specified namespaces
*/
def filterScope(nsb: NamespaceBinding, nss: Seq[NS]): NamespaceBinding = {
val newHead =
if (nsb == xml.TopScope) {
xml.TopScope
} else {
val parentCopy = filterScope(nsb.parent, nss)
if (nss.contains(NS(nsb.uri))) {
parentCopy
} else {
nsb.copy(parent = parentCopy)
}
}
newHead
}
/**
* Determines if a prefix is defined inside a scope
*/
def prefixInScope(prefix: String, scope: NamespaceBinding): Boolean = {
val ret =
if (scope == null) {
false
} else if (prefix == scope.prefix) {
true
} else {
prefixInScope(prefix, scope.parent)
}
ret
}
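// For example, with a placeholder URI:
//
//   prefixInScope("xs", NamespaceBinding("xs", "urn:xs", TopScope))  // true
//   prefixInScope("tns", NamespaceBinding("xs", "urn:xs", TopScope)) // false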
/**
* Remove Comments
*/
def removeComments(e: Node): Node = {
e match {
case Elem(prefix, label, attribs, scope, child @ _*) => {
val newChildren = child.filterNot { _.isInstanceOf[Comment] }.map { removeComments(_) }
Elem(prefix, label, attribs, scope, true, newChildren: _*)
}
case x => x
}
}
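// For example:
//
//   removeComments(<a><!-- note --><b/></a>) // yields <a><b/></a>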
/**
* Removes attributes that we want to ignore when comparing
* infosets.
*
* Removes dafint namespace attributes such as dafint:line and dafint:col.
*
* If a sequence of namespaces is given, only those attributes and scopes in
* those namespaces are removed. Otherwise, all attributes and scopes (aside
* from special ones like xsi:nil) are removed. Additionally, if a scope is
* filtered, the prefixes of elements prefixed with filtered scopes are also
* removed.
*
* If a scope is given, it will be used for a child element if the
* child's filtered scope is the same as the scope.
*
* Also strips out comments and mixed whitespace nodes, and coalesces
* adjacent text nodes.
*
* Throws an exception if it contains mixed non-whitespace nodes.
*/
def removeAttributes(n: Node, ns: Seq[NS] = Seq[NS](), parentScope: Option[NamespaceBinding] = None): Node = {
val res1 = removeAttributes1(n, ns, parentScope).asInstanceOf[scala.xml.Node]
val res2 = removeMixedWhitespace(res1)
val res = res2(0) // .asInstanceOf[scala.xml.Node]
res
}
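// Sketch: dafint attributes carry schema-compilation line/column info and are stripped.
// The URN below is the DAFFODIL_INTERNAL_NAMESPACE defined later in this object.
//
//   val n = <a xmlns:dafint="urn:ogf:dfdl:2013:imp:daffodil.apache.org:2018:int" dafint:line="3">text</a>
//   removeAttributes(n) // yields <a>text</a>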
/**
* removes insignificant whitespace from between elements
*/
private def removeMixedWhitespace(ns: Node): Node = {
if (!ns.isInstanceOf[Elem]) return ns
val e = ns.asInstanceOf[Elem]
val children = e.child
val noMixedChildren =
if (children.exists(_.isInstanceOf[Elem])) {
children.filter {
case Text(data) if data.matches("""\s*""") => false
case Text(data) => throw new Exception("Element %s contains mixed data: %s".format(e.label, data))
case _ => true
}.map(removeMixedWhitespace)
} else {
children.filter {
//
// So this is a bit strange, but we're dropping nodes that are the empty string.
//
// In XML we cannot tell <foo></foo> with a Text("") child apart from <foo></foo> with Nil children.
//
case Text("") => false // drop empty strings
case _ => true
}
}
val res =
if (noMixedChildren eq children) e
else e.copy(child = noMixedChildren)
res
}
def convertPCDataToText(n: Node): Node = {
val res = n match {
case PCData(data) => {
val t = Text(n.text)
t
}
case Elem(prefix, label, attributes, scope, children @ _*) => {
val newChildren = children.map { convertPCDataToText(_) }
Elem(prefix, label, attributes, scope, true, newChildren: _*)
}
case _ => n
}
res
}
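// For example:
//
//   convertPCDataToText(<a>{PCData("x<y")}</a>)
//   // yields <a>x&lt;y</a> - the CDATA section becomes an ordinary (escaped) Text node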
private def removeAttributes1(n: Node, ns: Seq[NS], parentScope: Option[NamespaceBinding]): NodeSeq = {
val res = n match {
case e @ Elem(prefix, label, attributes, scope, children @ _*) => {
val filteredScope = if (ns.length > 0) filterScope(scope, ns) else xml.TopScope
// If the filtered scope is logically the same as the parent scope, use
// the parent scope. Scala uses references to determine if scopes are
// the same during pretty printing. However, scopes are immutable, so
// the filter algorithm creates new scopes. Because of this, we need to
// ignore the newly filtered scope if it is logically the same as the
// parent so that the scala pretty printer doesn't see them as
// different scopes.
val newScope = parentScope match {
case Some(ps) => if (ps == filteredScope) ps else filteredScope
case None => filteredScope
}
val newChildren: NodeSeq = children.flatMap { removeAttributes1(_, ns, Some(newScope)) }
// Important to merge adjacent text. Otherwise when comparing
// two structures that print out the same, they might not be equal
// because they have different length lists of text nodes
//
// Ex: <foo>A&#xE000;</foo> creates an element containing TWO
// text nodes. But coming from the Daffodil Infoset, a string like
// that would be just one text node.
// Similarly <foo>abc<![CDATA[def]]>ghi</foo> has 3 child nodes.
// The middle one is PCData. The two around it are Text.
// Both Text and PCData are Atom[String].
val textMergedChildren = coalesceAdjacentTextNodes(newChildren)
val newPrefix = if (prefixInScope(prefix, newScope)) prefix else null
val newAttributes = attributes.filter { m =>
m match {
case xsiNilAttr @ PrefixedAttribute(_, "nil", Text("true"), _) if (NS(xsiNilAttr.getNamespace(e)) == XMLUtils.XSI_NAMESPACE) => {
true
}
//
// This tolerates xsi:nil='true' when xsi has no definition at all.
case xsiNilAttr @ PrefixedAttribute("xsi", "nil", Text("true"), _) if (xsiNilAttr.getNamespace(e) == null) => {
true
}
case dafIntAttr @ PrefixedAttribute(pre, _, _, _) if (pre ne null) && (dafIntAttr.getNamespace(e) == XMLUtils.DAFFODIL_INTERNAL_NAMESPACE.toString) => {
Assert.invariant(pre != "")
false // drop dafint attributes.
}
case xsiTypeAttr @ PrefixedAttribute(_, "type", _, _) if (NS(xsiTypeAttr.getNamespace(e)) == XMLUtils.XSI_NAMESPACE) =>
false // drop xsi:type attributes for now. Such time as we add
// support for them, we would need to not remove them.
// TODO: actually check xsi:type attributes are correct - but this
// requires schema-aware comparison.
case xsiTypeAttr @ PrefixedAttribute("xsi", "type", _, _) =>
false // drop xsi:type attributes for now. Even if prefix xsi is not defined.
// This just avoids having to edit many tests to add in the xmlns:xsi=....
// namespace declaration.
case attr =>
true // keep all other attributes
}
}
Elem(newPrefix, label, newAttributes, newScope, true, textMergedChildren: _*)
}
case c: scala.xml.Comment => NodeSeq.Empty // remove comments
case other => other
}
res
}
def compareAndReport(trimmedExpected: Node, actualNoAttrs: Node, ignoreProcInstr: Boolean = true) = {
if (trimmedExpected != actualNoAttrs) {
val expString = trimmedExpected.toString
val actString = actualNoAttrs.toString
if (expString != actString) {
val diffs = XMLUtils.computeDiff(trimmedExpected, actualNoAttrs, ignoreProcInstr)
if (diffs.length > 0) {
throw new Exception("""
Comparison failed.
Expected
%s
Actual
%s
Differences were (path, expected, actual):
%s""".format(
trimmedExpected.toString, actualNoAttrs.toString, diffs.map { _.toString }.mkString("\n")))
}
}
}
}
/**
* computes a precise difference list which is a sequence of triples.
* Each triple is the path (an x-path-like string), followed by expected, and actual values.
*/
def computeDiff(a: Node, b: Node, ignoreProcInstr: Boolean = true) = {
computeDiffOne(Seq(a), Seq(b), Map.empty, Nil, ignoreProcInstr)
}
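// Sketch of a single-difference comparison:
//
//   computeDiff(<a><b>1</b></a>, <a><b>2</b></a>)
//   // yields one (path, expected, actual) triple whose path is "a/b.charAt(1)"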
def childArrayCounters(e: Elem) = {
val Elem(_, _, _, _, children @ _*) = e
val labels = children.map { _.label }
val groups = labels.groupBy { x => x }
val counts = groups.map { case (label, labelList) => (label, labelList.length) }
val arrayCounts = counts.filter { case (label, 1) => false; case _ => true } // remove counters for scalars
val arrayCounters = arrayCounts.map { case (label, _) => (label, 1.toLong) } // 1 based like XPath!
arrayCounters
}
def computeDiffOne(as: Seq[Node], bs: Seq[Node],
aCounters: Map[String, Long],
path: Seq[String],
ignoreProcInstr: Boolean = true): Seq[(String, String, String)] = {
lazy val zPath = path.reverse.mkString("/")
(as, bs) match {
case (a1 :: ars, b1 :: brs) if (a1.isInstanceOf[Elem] && b1.isInstanceOf[Elem]) => {
val (a: Elem, b: Elem) = (a1, b1)
val Elem(_, labelA, attribsA, _, childrenA @ _*) = a
val Elem(_, labelB, attribsB, _, childrenB @ _*) = b
if (labelA != labelB)
List((zPath, a.toString, b.toString))
else if (attribsA != attribsB
&& !((attribsA == null && (attribsB == null || attribsB.length == 0))
|| (attribsB == null) && attribsA.length == 0)) {
// println("attributes are different")
val aA = if (attribsA == null || attribsA == "") "null" else attribsA.toString
val aB = if (attribsB == null || attribsB == "") "null" else attribsB.toString
List((zPath, aA, aB))
} else {
val aIndex = aCounters.get(labelA)
val aIndexExpr = aIndex.map { n => labelA + "[" + n + "]" }
val newAIndex = aIndex.map { n => (labelA, n + 1) }
val newACounters = aCounters ++ newAIndex.toList
val pathStep = aIndexExpr.getOrElse(labelA)
val aChildArrayCounters = childArrayCounters(a)
//
// Tricky induction here. For the rest of our peers, we must use newACounters
// But as we move across our children, we're using a new map, aChildArrayCounters.
//
val newPath = pathStep +: path
val childrenAList = childrenA.toList
val childrenBList = childrenB.toList
val childrenDiffs =
computeDiffOne(childrenAList, childrenBList, aChildArrayCounters, newPath, ignoreProcInstr)
val subsequentPeerDiffs = computeDiffOne(ars, brs, newACounters, path, ignoreProcInstr)
val res = childrenDiffs ++ subsequentPeerDiffs
res
}
}
case (tA1 :: ars, tB1 :: brs) if (tA1.isInstanceOf[Text] && tB1.isInstanceOf[Text]) => {
val (tA: Text, tB: Text) = (tA1, tB1)
val thisDiff = computeTextDiff(zPath, tA, tB)
val restDiffs = computeDiffOne(ars, brs, aCounters, path, ignoreProcInstr)
val res = thisDiff ++ restDiffs
res
}
case (tA1 :: ars, brs) if (ignoreProcInstr && tA1.isInstanceOf[scala.xml.ProcInstr]) =>
computeDiffOne(ars, brs, aCounters, path, ignoreProcInstr)
case (ars, tB1 :: brs) if (ignoreProcInstr && tB1.isInstanceOf[scala.xml.ProcInstr]) =>
computeDiffOne(ars, brs, aCounters, path, ignoreProcInstr)
case (scala.xml.ProcInstr(tA1label, tA1content) :: ars,
scala.xml.ProcInstr(tB1label, tB1content) :: brs) => {
val labelDiff = computeTextDiff(zPath, tA1label, tB1label)
//
// The content of a ProcInstr is technically just a big string.
// But in our usage the content is XML-like, so it could be loaded and then
// compared as XML, if the label is in fact an indicator that this is our
// special PI with format info.
//
// Much of that XML-ish content is attributes, however, so we need to be sure
// we're comparing those too.
//
// TODO: implement XML-comparison for our data format info PIs.
//
val contentDiff = computeTextDiff(zPath, tA1content, tB1content)
val restDiffs = computeDiffOne(ars, brs, aCounters, path, ignoreProcInstr)
val res = labelDiff ++ contentDiff ++ restDiffs
res
}
case (Nil, Nil) => Nil
//
// special case.
//
// when we read in an infoset for comparison we might have <foo></foo> which
// loads as an Elem with Nil for child.
//
// But the actual might be Elem with child that is an array of exactly one Text node
// with value "" (empty string). Visually this is the same! <foo></foo>
//
// Something in scala's libraries removes the isolated empty-string Text nodes
// (whitespace removal), so this comparison works.
case _ => {
List((zPath, as.toString, bs.toString))
}
}
}
def computeTextDiff(zPath: String, tA: Text, tB: Text): Seq[(String, String, String)] = {
val dataA = tA.toString
val dataB = tB.toString
computeTextDiff(zPath, dataA, dataB)
}
def computeTextDiff(zPath: String, dataA: String, dataB: String): Seq[(String, String, String)] = {
def quoteIt(str: String) = "'" + str + "'"
if (dataA == dataB) Nil
else if (dataA.length != dataB.length) {
List((zPath, quoteIt(dataA), quoteIt(dataB)))
} else {
val ints = Stream.from(1).map { _.toString }
val z = dataA zip dataB zip ints
val res = z.flatMap {
case ((a1, b1), index) =>
if (a1 == b1) Nil
else {
val indexPath = zPath + ".charAt(" + index + ")"
List((indexPath, a1.toString + "(%%#x%04X;)".format(a1.toInt), b1.toString + "(%%#x%04X;)".format(b1.toInt)))
}
}
res
}
}
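// For example:
//
//   computeTextDiff("/root/x", "abc", "abd") // one triple at path "/root/x.charAt(3)" - indices are 1-based
//   computeTextDiff("/root/x", "abc", "ab")  // lengths differ: one triple ("/root/x", "'abc'", "'ab'")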
/**
* For quick tests, we use literal Scala XML nodes. However, the underlying
* infrastructure wants to be entirely file-centric for diagnostic-message
* reasons (line numbers for errors).
*/
def convertNodeToTempFile(xml: Node, tmpDir: File, nameHint: String = "daffodil_tmp_") = {
// Create temp file
// note that the prefix has a minimum length of 3.
val prefix = nameHint.length match {
case 0 => "daffodil_tmp_"
case 1 => nameHint + "__"
case 2 => nameHint + "_"
case _ => nameHint
}
val tmpSchemaFile = File.createTempFile(prefix, ".dfdl.xsd", tmpDir)
// Delete temp file when program exits
tmpSchemaFile.deleteOnExit
//
// Note: we use our own pretty printer here because
// Scala library one doesn't preserve/print CDATA properly.
//
val pp = new org.apache.daffodil.xml.PrettyPrinter(2)
val xmlString = pp.format(xml)
val fos = new java.io.FileOutputStream(tmpSchemaFile)
val fw = new java.io.OutputStreamWriter(fos, "utf-8")
fw.write(xmlString)
fw.close()
tmpSchemaFile
}
def convertInputStreamToTempFile(is: java.io.InputStream,
tmpDir: File,
nameHint: String,
suffix: String) = {
// Create temp file
// note that the prefix has a minimum length of 3.
val prefix = nameHint.length match {
case 0 => "daffodil_tmp_"
case 1 => nameHint + "__"
case 2 => nameHint + "_"
case _ => nameHint
}
val tmpSchemaFile = File.createTempFile(prefix, suffix, tmpDir)
// Delete temp file when program exits
tmpSchemaFile.deleteOnExit
val fos = new java.io.FileOutputStream(tmpSchemaFile)
IOUtils.copy(is, fos)
fos.close()
tmpSchemaFile
}
/**
* Strong escaping that never loses information, handles apos and CR right.
*
* Escapes apostrophe (single quote) as well as the other XML escaped chars.
* Remaps CR and any other XML-illegals into PUA. Replaces whitespace with
* numeric character entities for additional safety.
*
* This is needed since XML may be using single quotes to surround a string which
* might contain single quotes.
*
* The reason basic scala.xml.Utility.escape doesn't escape single-quotes is
* HTML compatibility. HTML doesn't define an "&apos;" entity.
*
* Furthermore, since some potentially illegal XML characters may be used here, we
* are going to remap all the illegal XML characters to their corresponding PUA characters.
*
* Lastly, all whitespace chars are replaced by numeric character entities, and
* anything above 0xFF that is not considered letter or digit, is also replaced
* by a numeric character entity.
*
* The result is a string which can be displayed as an XML attribute value, and is
* invertible back to the original string.
*
* Finally, CRLF will come through as &#xE00D;&#xA; (and a lone CR as just &#xE00D;); that's because
* if we used &#xD; for the CR, it might be converted to a LF by XML readers.
* We have to use our own PUA remapping trick if we want to be sure to preserve
* CR in XML.
*/
def escape(str: String, sb: StringBuilder = new StringBuilder()): StringBuilder = {
var i = 0
while (i < str.length) {
val x = str(i)
val c = escapeMapper(x)
i += 1
c match {
case '\'' => sb.append("&#x27;") // don't use "&apos;" because it's not universally accepted (HTML doesn't have it in early versions)
case '"' => sb.append("&quot;")
case '&' => sb.append("&amp;")
case '<' => sb.append("&lt;")
case '>' => sb.append("&gt;")
case _ if (c.isLetterOrDigit) => sb.append(c)
case _ if (c.isWhitespace || c.isControl) => toNumericCharacterEntity(c, sb)
// A0 is the NBSP character - not considered whitespace, but no glyph, so we need it numeric
case _ if (c.toInt == 0xA0) => toNumericCharacterEntity(c, sb)
// Any other char below 0xFF is punctuation or some other glyph char
case _ if (c.toInt < 0xFF) => sb.append(c)
case _ => toNumericCharacterEntity(c, sb)
}
}
sb
}
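// Escaping sketch:
//
//   escape("it's <b>").toString // yields "it&#x27;s&#x20;&lt;b&gt;"
//   escape("a\rb").toString     // yields "a&#xE00D;b" - CR preserved via the PUA remapping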
private val escapeMapper =
remapXMLIllegalCharToPUA(
checkForExistingPUA = false,
replaceCRWithLF = false) _
def toNumericCharacterEntity(c: Char, sb: StringBuilder) = {
val i = c.toInt
Assert.usage(i > 0) // NUL cannot be represented at all in XML.
val s = Integer.toHexString(i).toUpperCase()
sb.append("&#x")
sb.append(s)
sb.append(";")
}
}
trait GetAttributesMixin extends ThrowsSDE {
def xml: Node
/**
* Use to retrieve things that are not format properties.
*/
def getAttributeRequired(name: String) = {
getAttributeOption(name) match {
case None => schemaDefinitionError("The attribute '" + name + "' is required.")
case Some(s) => s
}
}
/**
* Use to retrieve things that are not format properties.
*/
def getAttributeOption(name: String): Option[String] = {
val attrString = xml.attribute(name).map { _.text }
attrString
}
def getAttributeOption(ns: NS, name: String): Option[String] = {
//
// Most annoying, but this doesn't work....
// val res = xml.attribute(ns.toString, name).map{ _.text }
val attr = (xml \ ("@{" + ns.toString + "}" + name))
if (attr.length == 0) None
else Some(attr.text)
}
/**
* For picking off the short-form annotations.
*/
def dfdlAttributes(n: Node) = XMLUtils.dfdlAttributes(n)
def dafAttributes(n: Node) = XMLUtils.dafAttributes(n)
}
class QNamePrefixNotInScopeException(pre: String, loc: LookupLocation)
extends Exception("Prefix " + pre + " not found in scope. Location: " + loc.toString)
// Commented out for now, but we may reactivate this to
// do more validation stuff in the TDMLRunner. So keeping in the
// source like this.
//
//object XMLSchemaUtils {
// /**
// * validate a DFDL schema.
// *
// * This validates the XML Schema language subset that DFDL uses, and also all the annotations
// * hung off of it.
// */
// def validateDFDLSchema(doc: Node) = {
// // TODO: should this do something other than throw an exception on a validation error?
// //
// // Users will write DFDL Schemas, using the xs or xsd prefix (usually) bound to the XML Schema namespace,
// // and the dfdl prefix (usually) bound to the DFDL namespace.
// //
// // However, we don't want to validate using the XML Schema for XML Schema (which would be the usual interpretation
// // of validating an XML Schema), instead we want to use the schema for the DFDL Subset of XML Schema.
// //
// // So, the hack here, is we're going to textually substitute the URIs, so that the validator doesn't have to be
// // modified to do this switch, and we don't have to lie in the DFDL Subset schema, and claim it is realizing the
// // XML Schema URI.
// //
// // However, we should consider whether there is a better way to do this involving either (a) lying and having the
// // DFDL Subset Schema pretend it is the XSD schema, or we can play some catalog tricks perhaps.
// //
// // Also, the way this whole thing finds the necessary schemas is a bit daft. It should look in the jar or files,
// // but it should be using an XML Catalog.
// //
// val docstring = doc.toString()
// val xmlnsURI = "http://www.w3.org/2001/XMLSchema";
// val xsdSubsetURI = "http://www.w3.org/2001/XMLSchema";
// val docReplaced = docstring.replaceAll(xmlnsURI, xsdSubsetURI)
// val docReader = new StringReader(docReplaced)
// val schemaResource = Misc.getRequiredResource(Validator.dfdlSchemaFileName()).toURI()
// val res =
// try {
// Validator.validateXMLStream(schemaResource, docReader)
// } catch {
// case e: ... => {
// val exc = e
// // System.err.println(exc.getMessage())
// // Really useful place for a breakpoint.
// throw e
// }
// }
// res
// }
//}