/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.daffodil.xml
import java.io.File
import scala.xml._
import org.apache.daffodil.exceptions._
import scala.collection.mutable.ArrayBuilder
import org.apache.commons.io.IOUtils
import scala.xml.NamespaceBinding
import org.apache.daffodil.schema.annotation.props.LookupLocation
/**
* Utilities for handling XML
*
* @version 1
* @author Alejandro Rodriguez
*/
object XMLUtils {
/**
* We must have xsi prefix bound to the right namespace.
* That gets enforced elsewhere.
*/
val xmlNilAttribute = new PrefixedAttribute("xsi", "nil", "true", scala.xml.Null)
val PositiveInfinity = Double.PositiveInfinity
val NegativeInfinity = Double.NegativeInfinity
val NaN = Double.NaN
val PositiveInfinityString = "INF"
val NegativeInfinityString = "-INF"
val NaNString = "NaN"
/**
* Legal XML v1.0 chars are #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
*/
def remapXMLIllegalCharToPUA(checkForExistingPUA: Boolean = true, replaceCRWithLF: Boolean = true)(c: Char): Char = {
val cInt = c.toInt
val res = cInt match {
case 0x9 => c
case 0xA => c
case 0xD =>
if (replaceCRWithLF) 0xA.toChar // Map CR to LF. That's what XML does.
else 0xE00D.toChar // or remap it to PUA so it is non-whitespace, and preserved.
case _ if (cInt < 0x20) => (cInt + 0xE000).toChar
case _ if (cInt > 0xD7FF && cInt < 0xE000) => (cInt + 0x1000).toChar
case _ if (cInt >= 0xE000 && cInt <= 0xF8FF) => {
if (checkForExistingPUA)
Assert.usageError("Pre-existing Private Use Area (PUA) character found in data: '%s'".format(c))
else c
}
case 0xFFFE => 0xF0FE.toChar
case 0xFFFF => 0xF0FF.toChar
case _ if (cInt > 0x10FFFF) => {
Assert.invariantFailed("Character code beyond U+10FFFF found in data. Codepoint: %s".format(cInt))
}
case _ => c
}
res
}
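// A quick usage sketch (behavior follows directly from the mapping above):
//
//   remapXMLIllegalCharToPUA()('\u0001') // yields '\uE001' - control char moved into the PUA
//   remapXMLIllegalCharToPUA()('\r')     // yields '\n' - CR becomes LF since replaceCRWithLF defaults to true
//   remapXMLIllegalCharToPUA()('A')      // yields 'A' - legal XML chars pass through unchanged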
/**
* Reverse of the above method
*/
def remapPUAToXMLIllegalChar(checkForExistingPUA: Boolean = true)(c: Char): Char = {
val cInt = c.toInt
val res = cInt match {
case _ if (c >= 0xE000 && c < 0xE020) => (c - 0xE000).toChar
case _ if (c > 0xE7FF && c < 0xF000) => (c - 0x1000).toChar
case 0xF0FE => 0xFFFE.toChar
case 0xF0FF => 0xFFFF.toChar
case _ if (c > 0x10FFFF) => {
Assert.invariantFailed("Character code beyond U+10FFFF found in data. Codepoint: %s".format(c.toInt))
}
case _ => c
}
res
}
def isLeadingSurrogate(c: Char) = {
c >= 0xD800 && c <= 0xDBFF
}
def isTrailingSurrogate(c: Char) = {
c >= 0xDC00 && c <= 0xDFFF
}
/**
* Length where a surrogate pair counts as 1 character, not two.
*/
def uncodeLength(s: String) = {
// performance note: this might get called a lot. So needs to be fast.
// it needs to scan the string once, examine each character.
// using getBytes utf-32 isn't necessarily slow. It might be fine.
val res = s.getBytes("UTF-32BE").length / 4
res
}
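// For example (a surrogate pair counts once):
//
//   "\uD83D\uDE00".length        // 2 - UTF-16 code units
//   uncodeLength("\uD83D\uDE00") // 1 - one Unicode character, U+1F600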
/**
* Because of surrogate pairs, and the difference between 16-bit string codepoints
* and real character codes, lots of things that traverse strings need
* to consider either the codepoint after (if current is a leading surrogate)
* or codepoint before (if current is a trailing surrogate).
*
* This calls a body function with prev, current, next bound to those.
* For first codepoint prev will be 0. For last codepoint next will be 0.
*
* NOTE: This function contains the same traversal algorithm as
* remapXMLCharacters, but is more general and is a bit slower.
* Any changes made to this function probably need to be incorporated into
* the other.
*/
def walkUnicodeString[T](str: String)(bodyFunc: (Char, Char, Char) => T): Seq[T] = {
val len = str.length
if (len == 0) return Nil
val list = new scala.collection.mutable.ListBuffer[T]
var pos = 0;
var prev = 0.toChar
var curr = str(0)
var next = 0.toChar
while (pos < len) {
next = if (pos + 1 < len) str(pos + 1) else 0.toChar
list += bodyFunc(prev, curr, next)
prev = curr
curr = next
pos += 1
}
list
}
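// A small sketch of the triples the body function sees:
//
//   walkUnicodeString("abc") { (prev, curr, next) => (prev, curr, next) }
//   // yields Seq(('\u0000','a','b'), ('a','b','c'), ('b','c','\u0000'))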
/*
* This function contains the same string traversal algorithm as
* walkUnicodeString. The only difference is that it uses a StringBuilder
* rather than a ListBuffer[T] that would be used in walkUnicodeString. Note
* that since StringBuilder is not synchronized it is noticeably faster than
* StringBuffer, and since the StringBuilder is local to the function, we
* don't have to worry about any threading issues. This specialization makes for
* a noticeable speed increase, so much so that the code duplication is worth
* it. Any changes made to this function probably need to be incorporated
* into the other.
*/
def remapXMLCharacters(dfdlString: String, remapFunc: (Char) => Char): String = {
// we want to remap XML-illegal characters
// but leave legal surrogate-pair character pairs alone.
def remapOneChar(previous: Char, current: Char, next: Char): Char = {
if (isLeadingSurrogate(current) && isTrailingSurrogate(next)) return current
if (isTrailingSurrogate(current) && isLeadingSurrogate(previous)) return current
remapFunc(current)
}
val len = dfdlString.length
if (len == 0) return dfdlString
val sb = new StringBuilder()
var pos = 0;
var prev = 0.toChar
var curr = dfdlString(0)
var next = 0.toChar
while (pos < len) {
next = if (pos + 1 < len) dfdlString(pos + 1) else 0.toChar
if (curr == 0xD) {
if (next != 0xA) {
// This is a lone CR (i.e. not a CRLF), so convert the CR to a LF
sb.append(0xA.toChar)
} else {
// This is a CRLF. Skip the CR, essentially converting the CRLF to
// just LF. Do nothing.
}
} else {
sb.append(remapOneChar(prev, curr, next))
}
prev = curr
curr = next
pos += 1
}
sb.toString
}
def remapXMLIllegalCharactersToPUA(dfdlString: String): String = {
remapXMLCharacters(dfdlString, remapXMLIllegalCharToPUA(false))
}
def remapPUAToXMLIllegalCharacters(dfdlString: String): String = {
remapXMLCharacters(dfdlString, remapPUAToXMLIllegalChar(false))
}
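// Round-trip sketch using the two wrappers above:
//
//   remapXMLIllegalCharactersToPUA("a\u0000b") // yields "a\uE000b"
//   remapPUAToXMLIllegalCharacters("a\uE000b") // yields "a\u0000b"
//   remapXMLIllegalCharactersToPUA("x\r\ny")   // yields "x\ny" - the CR of a CRLF is dropped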
/*
* This is needed for equality comparison of XML.
*
* Ex: "foo&#x221;bar" is 3 nodes, not one string node.
* But appears to be one string when serialized as XML.
*
* Once the XML has been read into XML objects, the 3 objects
* are just 3 adjacent text nodes, so adjacent text nodes
* can be coalesced for use in the DFDL Infoset, or for comparing
* trees of XML that may have been created in different ways.
*/
def coalesceAdjacentTextNodes(seq: Seq[Node]): Seq[Node] = {
if (seq.length == 0) return seq
if (seq.length == 1) {
seq(0) match {
case p: PCData => return seq
case Text(data) =>
if (data.matches("""\s*""")) return Nil
else return seq
case u: Unparsed => return seq // TODO: are these needed or possible?
case _ => // fall through to code below. (We need to process children)
}
}
val ab = ArrayBuilder.make[Node]
var i = 0
//
// invariant: either the tn node is null
// or the stringbuilder is null or empty
//
// They never both have content.
//
var tn: Node = null
var sb: StringBuilder = null
def processText = {
if (tn == null) {
if (sb != null && sb.length > 0) {
// we have accumulated text
// let's output a text node
// Note that a Text constructor
// will escapify the text again.
// We unescaped it
// when we used .text to get data
// out of the nodes.
ab += new Text(sb.toString)
sb.clear()
}
} else {
// tn not null
Assert.invariant(sb == null || sb.length == 0)
ab += tn
tn = null
}
}
while (i < seq.length) {
val current = seq(i)
i = i + 1
if ((current.isInstanceOf[Text] || current.isInstanceOf[Unparsed])) {
if (tn == null) {
if (sb == null || sb.length == 0) {
// hold onto this text node. It might be isolated
tn = current
} else {
// accumulate this text
sb.append(current.text)
}
} else {
if (sb == null) sb = new StringBuilder
// accumulate both the pending tn text node
// and this new one we just encountered.
//
// Note we use .text here - that unescapifies
// Which is important since we're putting together
// things that might be PCData (aka <![CDATA[...]]>
// We want that stuff gone.
//
sb.append(tn.text)
sb.append(current.text)
//
// set tn to null to indicate we're accumulating
// into the string buffer
//
tn = null
}
} else {
// not an atom
processText // if there is pending text output that first
ab += current // then the current non-atom node.
}
}
// we fell out of the loop. So
processText // in case there is text left pending when we hit the end
ab.result
}
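// Coalescing sketch:
//
//   coalesceAdjacentTextNodes(Seq(Text("ab"), Text("cd"), <e/>, Text("f")))
//   // yields Seq(Text("abcd"), <e/>, Text("f"))
//   coalesceAdjacentTextNodes(Seq(Text("   ")))
//   // yields Nil - an isolated whitespace-only text node is dropped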
val XSD_NAMESPACE = NS("http://www.w3.org/2001/XMLSchema") // removed trailing slash (namespaces care)
val XSI_NAMESPACE = NS("http://www.w3.org/2001/XMLSchema-instance")
val XPATH_FUNCTION_NAMESPACE = NS("http://www.w3.org/2005/xpath-functions")
val XPATH_MATH_NAMESPACE = NS("http://www.w3.org/2005/xpath-functions/math")
val DFDL_NAMESPACE = NS("http://www.ogf.org/dfdl/dfdl-1.0/") // dfdl ns does have a trailing slash
val TDML_NAMESPACE = NS("http://www.ibm.com/xmlns/dfdl/testData")
val EXAMPLE_NAMESPACE = NS("http://example.com")
val XHTML_NAMESPACE = NS("http://www.w3.org/1999/xhtml")
/**
* Returns an Elem with local name "element" and the scope provided,
* with the prefix of the Elem set up to match the scope's binding
* for the XSD_NAMESPACE.
*
* If the XSD_NAMESPACE is the default namespace, then the prefix will
* be null. If the XSD_NAMESPACE is bound to a prefix, the first such
* prefix will be used for the returned Elem.
*/
def getXSDElement(scope: NamespaceBinding): Elem = {
val xsdPre = scope.getPrefix(XSD_NAMESPACE.toString)
val isXSDTheDefaultNS = XSD_NAMESPACE.toString() == scope.getURI(null)
val xsdPrefix =
if (xsdPre ne null) xsdPre
else if (isXSDTheDefaultNS) null
else Assert.usageError("Scope argument must have a binding for the XSD namespace.")
val res =
Elem(xsdPrefix, "element", Null, scope, true)
res
}
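// Sketch, assuming a scope that binds the "xs" prefix to the XSD namespace:
//
//   val scope = NamespaceBinding("xs", XSD_NAMESPACE.toString, TopScope)
//   getXSDElement(scope) // an Elem with prefix "xs" and label "element", carrying that scope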
/**
* Added to support extensions and proposed future features as part of Daffodil.
*
* The DFDL standard requires us to keep these out of the primary DFDL namespace, and
* we really should be using URN-style notation, not http URLs for these.
* (for why http URLs are a bad idea for these, see:
* http://www.w3.org/blog/systeam/2008/02/08/w3c_s_excessive_dtd_traffic/ )
*
* These definitions must match their XSD counterparts in dafint.xsd and dafext.xsd
*/
private val DAFFODIL_EXTENSIONS_NAMESPACE_ROOT_NCSA = "urn:ogf:dfdl:2013:imp:opensource.ncsa.illinois.edu:2012"
private val DAFFODIL_EXTENSION_NAMESPACE_NCSA = NS(DAFFODIL_EXTENSIONS_NAMESPACE_ROOT_NCSA + ":ext")
val EXT_PREFIX_NCSA = "daf"
val EXT_NS_NCSA = NS(DAFFODIL_EXTENSION_NAMESPACE_NCSA.uri)
private val DAFFODIL_EXTENSIONS_NAMESPACE_ROOT_APACHE = "urn:ogf:dfdl:2013:imp:daffodil.apache.org:2018"
private val DAFFODIL_EXTENSION_NAMESPACE_APACHE = NS(DAFFODIL_EXTENSIONS_NAMESPACE_ROOT_APACHE + ":ext")
val EXT_PREFIX_APACHE = "daf"
val EXT_NS_APACHE = NS(DAFFODIL_EXTENSION_NAMESPACE_APACHE.uri)
private val DAFFODIL_INTERNAL_NAMESPACE = NS(DAFFODIL_EXTENSIONS_NAMESPACE_ROOT_APACHE + ":int")
val INT_PREFIX = "dafint"
val INT_NS = NS(DAFFODIL_INTERNAL_NAMESPACE.uri)
val FILE_ATTRIBUTE_NAME = "file"
val LINE_ATTRIBUTE_NAME = "line"
val COLUMN_ATTRIBUTE_NAME = "col"
// shorter forms, to make the lines shorter when constructing XML literals.
val xsdURI = XSD_NAMESPACE
val dfdlURI = DFDL_NAMESPACE
val dfdlAppinfoSource = NS("http://www.ogf.org/dfdl/")
val targetNS = EXAMPLE_NAMESPACE // we use this for tests.
val xsiURI = XSI_NAMESPACE
val fnURI = XPATH_FUNCTION_NAMESPACE
val mathURI = XPATH_MATH_NAMESPACE
val dafintURI = DAFFODIL_INTERNAL_NAMESPACE
val DFDL_SIMPLE_BUILT_IN_TYPES =
List("string",
"float",
"double",
"decimal",
"integer",
"long",
"int",
"short",
"byte",
"unsignedLong",
"unsignedInt",
"nonNegativeInteger",
"unsignedShort",
"unsignedByte",
"boolean",
"date",
"time",
"dateTime",
"hexBinary")
def slashify(s: String): String = if (s == "" || s.endsWith("/")) s else s + "/"
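// For example:
//
//   slashify("http://www.ogf.org/dfdl") // yields "http://www.ogf.org/dfdl/"
//   slashify("")                        // yields "" - the empty string is left alone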
/**
* Annoyingly, namespace bindings are not a collection you can process like a normal collection.
* Instead they are chained together via these parent links.
*/
def namespaceBindings(nsBinding: NamespaceBinding): Seq[NamespaceBinding] = {
if (nsBinding == null) Nil
else {
val thisOne =
if (nsBinding.uri != null) List(nsBinding)
else Nil
val others = namespaceBindings(nsBinding.parent)
thisOne ++ others
}
}
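// Sketch, flattening a two-deep chain with placeholder URIs (TopScope, which has a null uri, is dropped):
//
//   val scope = NamespaceBinding("a", "urn:a", NamespaceBinding("b", "urn:b", TopScope))
//   namespaceBindings(scope).map(_.prefix) // yields Seq("a", "b") - innermost binding first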
/**
* We don't want to be sensitive to which prefix people bind these namespaces to.
*/
def dfdlAttributes(n: Node) = {
n.attributes filter {
_.getNamespace(n) == DFDL_NAMESPACE.toString
}
}
def dafAttributes(n: Node) = {
n.attributes.filter { a =>
a.getNamespace(n) == XMLUtils.EXT_NS_NCSA.toString ||
a.getNamespace(n) == XMLUtils.EXT_NS_APACHE.toString
}
}
/**
* Used to collapse the excessive xmlns proliferation.
*
* If a local scope has bindings in it that are not in the outer scope
* then a new local scope is created which extends the outer scope.
*
* This algorithm is n^2 (or worse) in the length of the outer binding chain (worst case).
*/
def combineScopes(local: NamespaceBinding, outer: NamespaceBinding): NamespaceBinding = {
if (local == TopScope) outer
else {
val NamespaceBinding(pre, uri, moreBindings) = local
val outerURI = outer.getURI(pre)
if (outerURI == uri) {
// same binding for this prefix in the outer, so we don't need
// this binding from the local scope.
combineScopes(moreBindings, outer)
} else if (outerURI == null) {
// outer lacks a binding for this prefix
NamespaceBinding(pre, uri, combineScopes(moreBindings, outer))
} else {
// outer has a different binding for this prefix.
// one would hope that we can just put our superseding binding on the
// front, but you end up with two bindings for the same prefix
// in the chain ... and things fail
//
// The problem this creates is that it un-shares all the sub-structure
// of the scopes, and so we no longer have contained elements
// that share scopes with enclosing parents. That may mean that
// lots of xmlns:pre="ns" proliferate again even though they're
// unnecessary.
//
val outerWithoutDuplicate = removeBindings(NamespaceBinding(pre, uri, TopScope), outer)
val moreBindingsWithoutConflict = removeBindings(NamespaceBinding(pre, uri, TopScope), moreBindings)
NamespaceBinding(pre, uri, combineScopes(moreBindingsWithoutConflict, outerWithoutDuplicate))
}
}
}
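// Sketch of the two easy cases, using placeholder URIs:
//
//   val outer = NamespaceBinding("xs", "urn:xs", TopScope)
//   combineScopes(NamespaceBinding("xs", "urn:xs", TopScope), outer)     // yields outer - duplicate binding dropped
//   combineScopes(NamespaceBinding("dfdl", "urn:dfdl", TopScope), outer) // prepends the dfdl binding onto outer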
/**
* Removes from scope all bindings whose prefixes also have a binding in nb.
*/
def removeBindings(nb: NamespaceBinding, scope: NamespaceBinding): NamespaceBinding = {
if (nb == TopScope) scope
else if (scope == TopScope) scope
else {
val NamespaceBinding(pre, _, more) = scope
if (nb.getURI(pre) != null) {
// the scope has a binding for this prefix
// so irrespective of the uri, we remove it.
removeBindings(nb, more)
} else {
// no binding, so keep it
scope.copy(parent = removeBindings(nb, more))
}
}
}
def combineScopes(prefix: String, ns: NS, outer: NamespaceBinding): NamespaceBinding = {
if (ns.optURI.isEmpty) {
outer
} else {
val uri = ns.optURI.get.toString
val inner = NamespaceBinding(prefix, uri, TopScope)
combineScopes(inner, outer)
}
}
def collapseScopes(x: Node, outer: NamespaceBinding): Node = {
x match {
case Elem(pre, lab, md, scp, child @ _*) => {
val newScope = combineScopes(scp, outer)
Elem(pre, lab, md, newScope, true, (child flatMap { ch => collapseScopes(ch, newScope) }): _*)
}
case _ => x
}
}
/**
* Removes NamespaceBindings from a scope containing specified namespaces
*/
def filterScope(nsb: NamespaceBinding, nss: Seq[NS]): NamespaceBinding = {
val newHead =
if (nsb == xml.TopScope) {
xml.TopScope
} else {
val parentCopy = filterScope(nsb.parent, nss)
if (nss.contains(NS(nsb.uri))) {
parentCopy
} else {
nsb.copy(parent = parentCopy)
}
}
newHead
}
/**
* Determines if a prefix is defined inside a scope
*/
def prefixInScope(prefix: String, scope: NamespaceBinding): Boolean = {
val ret =
if (scope == null) {
false
} else if (prefix == scope.prefix) {
true
} else {
prefixInScope(prefix, scope.parent)
}
ret
}
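// For example, with a placeholder URI:
//
//   prefixInScope("xs", NamespaceBinding("xs", "urn:xs", TopScope))  // true
//   prefixInScope("tns", NamespaceBinding("xs", "urn:xs", TopScope)) // false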
/**
* Remove Comments
*/
def removeComments(e: Node): Node = {
e match {
case Elem(prefix, label, attribs, scope, child @ _*) => {
val newChildren = child.filterNot { _.isInstanceOf[Comment] }.map { removeComments(_) }
Elem(prefix, label, attribs, scope, true, newChildren: _*)
}
case x => x
}
}
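// For example:
//
//   removeComments(<a><!-- note --><b/></a>) // yields <a><b/></a>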
/**
* Removes attributes that we want to ignore when comparing
* infosets.
*
* Removes dafint namespace attributes such as dafint:line and dafint:col.
*
* If a sequence of namespaces is given, only those attributes and scopes in
* those namespaces are removed. Otherwise, all attributes and scopes (aside
* from special ones like xsi:nil) are removed. Additionally, if a scope is
* filtered, the prefixes of elements prefixed with filtered scopes are also
* removed.
*
* If a scope is given, it will be used for a child element if the
* child's filtered scope is the same as the scope.
*
* Also strips out comments and mixed whitespace nodes, and coalesces
* adjacent text nodes.
*
* Throws an exception if it contains mixed non-whitespace nodes.
*/
def removeAttributes(n: Node, ns: Seq[NS] = Seq[NS](), parentScope: Option[NamespaceBinding] = None): Node = {
val res1 = removeAttributes1(n, ns, parentScope).asInstanceOf[scala.xml.Node]
val res2 = removeMixedWhitespace(res1)
val res = res2(0) // .asInstanceOf[scala.xml.Node]
res
}
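// Sketch: dafint attributes carry schema-compilation line/column info and are stripped.
// The URN below is the DAFFODIL_INTERNAL_NAMESPACE defined later in this object.
//
//   val n = <a xmlns:dafint="urn:ogf:dfdl:2013:imp:daffodil.apache.org:2018:int" dafint:line="3">text</a>
//   removeAttributes(n) // yields <a>text</a>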
/**
* removes insignificant whitespace from between elements
*/
private def removeMixedWhitespace(ns: Node): Node = {
if (!ns.isInstanceOf[Elem]) return ns
val e = ns.asInstanceOf[Elem]
val children = e.child
val noMixedChildren =
if (children.exists(_.isInstanceOf[Elem])) {
children.filter {
case Text(data) if data.matches("""\s*""") => false
case Text(data) => throw new Exception("Element %s contains mixed data: %s".format(e.label, data))
case _ => true
}.map(removeMixedWhitespace)
} else {
children.filter {
//
// So this is a bit strange, but we're dropping nodes that are the empty string.
//
// In XML we cannot tell <foo></foo> with a Text("") child apart from <foo></foo> with Nil children.
//
case Text("") => false // drop empty strings
case _ => true
}
}
val res =
if (noMixedChildren eq children) e
else e.copy(child = noMixedChildren)
res
}
def convertPCDataToText(n: Node): Node = {
val res = n match {
case PCData(data) => {
val t = Text(n.text)
t
}
case Elem(prefix, label, attributes, scope, children @ _*) => {
val newChildren = children.map { convertPCDataToText(_) }
Elem(prefix, label, attributes, scope, true, newChildren: _*)
}
case _ => n
}
res
}
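// For example:
//
//   convertPCDataToText(<a>{PCData("x<y")}</a>)
//   // yields <a>x&lt;y</a> - the CDATA section becomes an ordinary (escaped) Text node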
private def removeAttributes1(n: Node, ns: Seq[NS], parentScope: Option[NamespaceBinding]): NodeSeq = {
val res = n match {
case e @ Elem(prefix, label, attributes, scope, children @ _*) => {
val filteredScope = if (ns.length > 0) filterScope(scope, ns) else xml.TopScope
// If the filtered scope is logically the same as the parent scope, use
// the parent scope. Scala uses references to determine if scopes are
// the same during pretty printing. However, scopes are immutable, so
// the filter algorithm creates new scopes. Because of this, we need to
// ignore the newly filtered scope if it is logically the same as the
// parent so that the scala pretty printer doesn't see them as
// different scopes.
val newScope = parentScope match {
case Some(ps) => if (ps == filteredScope) ps else filteredScope
case None => filteredScope
}
val newChildren: NodeSeq = children.flatMap { removeAttributes1(_, ns, Some(newScope)) }
// Important to merge adjacent text. Otherwise when comparing
// two structures that print out the same, they might not be equal
// because they have different length lists of text nodes
//
// Ex: <foo>A&#xE000;</foo> creates an element containing TWO
// text nodes. But coming from the Daffodil Infoset, a string like
// that would be just one text node.
// Similarly <foo>abc<![CDATA[def]]>ghi</foo> has 3 child nodes.
// The middle one is PCData. The two around it are Text.
// Both Text and PCData are Atom[String].
val textMergedChildren = coalesceAdjacentTextNodes(newChildren)
val newPrefix = if (prefixInScope(prefix, newScope)) prefix else null
val newAttributes = attributes.filter { m =>
m match {
case xsiNilAttr @ PrefixedAttribute(_, "nil", Text("true"), _) if (NS(xsiNilAttr.getNamespace(e)) == XMLUtils.XSI_NAMESPACE) => {
true
}
//
// This tolerates xsi:nil='true' when xsi has no definition at all.
case xsiNilAttr @ PrefixedAttribute("xsi", "nil", Text("true"), _) if (xsiNilAttr.getNamespace(e) == null) => {
true
}
case dafIntAttr @ PrefixedAttribute(pre, _, _, _) if (pre ne null) && (dafIntAttr.getNamespace(e) == XMLUtils.DAFFODIL_INTERNAL_NAMESPACE.toString) => {
Assert.invariant(pre != "")
false // drop dafint attributes.
}
case xsiTypeAttr @ PrefixedAttribute(_, "type", _, _) if (NS(xsiTypeAttr.getNamespace(e)) == XMLUtils.XSI_NAMESPACE) =>
false // drop xsi:type attributes for now. Such time as we add
// support for them, we would need to not remove them.
// TODO: actually check xsi:type attributes are correct - but this
// requires schema-aware comparison.
case xsiTypeAttr @ PrefixedAttribute("xsi", "type", _, _) =>
false // drop xsi:type attributes for now. Even if prefix xsi is not defined.
// This just avoids having to edit many tests to add in the xmlns:xsi=....
// namespace declaration.
case attr =>
true // keep all other attributes
}
}
Elem(newPrefix, label, newAttributes, newScope, true, textMergedChildren: _*)
}
case c: scala.xml.Comment => NodeSeq.Empty // remove comments
case other => other
}
res
}
def compareAndReport(trimmedExpected: Node, actualNoAttrs: Node, ignoreProcInstr: Boolean = true) = {
if (trimmedExpected != actualNoAttrs) {
val expString = trimmedExpected.toString
val actString = actualNoAttrs.toString
if (expString != actString) {
val diffs = XMLUtils.computeDiff(trimmedExpected, actualNoAttrs, ignoreProcInstr)
if (diffs.length > 0) {
throw new Exception("""
Comparison failed.
Expected
%s
Actual
%s
Differences were (path, expected, actual):
%s""".format(
trimmedExpected.toString, actualNoAttrs.toString, diffs.map { _.toString }.mkString("\n")))
}
}
}
}
/**
* computes a precise difference list which is a sequence of triples.
* Each triple is the path (an x-path-like string), followed by expected, and actual values.
*/
def computeDiff(a: Node, b: Node, ignoreProcInstr: Boolean = true) = {
computeDiffOne(Seq(a), Seq(b), Map.empty, Nil, ignoreProcInstr)
}
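// Sketch of a single-difference comparison:
//
//   computeDiff(<a><b>1</b></a>, <a><b>2</b></a>)
//   // yields one (path, expected, actual) triple whose path is "a/b.charAt(1)"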
def childArrayCounters(e: Elem) = {
val Elem(_, _, _, _, children @ _*) = e
val labels = children.map { _.label }
val groups = labels.groupBy { x => x }
val counts = groups.map { case (label, labelList) => (label, labelList.length) }
val arrayCounts = counts.filter { case (label, 1) => false; case _ => true } // remove counters for scalars
val arrayCounters = arrayCounts.map { case (label, _) => (label, 1.toLong) } // 1 based like XPath!
arrayCounters
}
def computeDiffOne(as: Seq[Node], bs: Seq[Node],
aCounters: Map[String, Long],
path: Seq[String],
ignoreProcInstr: Boolean = true): Seq[(String, String, String)] = {
lazy val zPath = path.reverse.mkString("/")
(as, bs) match {
case (a1 :: ars, b1 :: brs) if (a1.isInstanceOf[Elem] && b1.isInstanceOf[Elem]) => {
val (a: Elem, b: Elem) = (a1, b1)
val Elem(_, labelA, attribsA, _, childrenA @ _*) = a
val Elem(_, labelB, attribsB, _, childrenB @ _*) = b
if (labelA != labelB)
List((zPath, a.toString, b.toString))
else if (attribsA != attribsB
&& !((attribsA == null && (attribsB == null || attribsB.length == 0))
|| (attribsB == null) && attribsA.length == 0)) {
// println("attributes are different")
val aA = if (attribsA == null || attribsA == "") "null" else attribsA.toString
val aB = if (attribsB == null || attribsB == "") "null" else attribsB.toString
List((zPath, aA, aB))
} else {
val aIndex = aCounters.get(labelA)
val aIndexExpr = aIndex.map { n => labelA + "[" + n + "]" }
val newAIndex = aIndex.map { n => (labelA, n + 1) }
val newACounters = aCounters ++ newAIndex.toList
val pathStep = aIndexExpr.getOrElse(labelA)
val aChildArrayCounters = childArrayCounters(a)
//
// Tricky induction here. For the rest of our peers, we must use newACounters
// But as we move across our children, we're using a new map, aChildArrayCounters.
//
val newPath = pathStep +: path
val childrenAList = childrenA.toList
val childrenBList = childrenB.toList
val childrenDiffs =
computeDiffOne(childrenAList, childrenBList, aChildArrayCounters, newPath, ignoreProcInstr)
val subsequentPeerDiffs = computeDiffOne(ars, brs, newACounters, path, ignoreProcInstr)
val res = childrenDiffs ++ subsequentPeerDiffs
res
}
}
case (tA1 :: ars, tB1 :: brs) if (tA1.isInstanceOf[Text] && tB1.isInstanceOf[Text]) => {
val (tA: Text, tB: Text) = (tA1, tB1)
val thisDiff = computeTextDiff(zPath, tA, tB)
val restDiffs = computeDiffOne(ars, brs, aCounters, path, ignoreProcInstr)
val res = thisDiff ++ restDiffs
res
}
case (tA1 :: ars, brs) if (ignoreProcInstr && tA1.isInstanceOf[scala.xml.ProcInstr]) =>
computeDiffOne(ars, brs, aCounters, path, ignoreProcInstr)
case (ars, tB1 :: brs) if (ignoreProcInstr && tB1.isInstanceOf[scala.xml.ProcInstr]) =>
computeDiffOne(ars, brs, aCounters, path, ignoreProcInstr)
case (scala.xml.ProcInstr(tA1label, tA1content) :: ars,
scala.xml.ProcInstr(tB1label, tB1content) :: brs) => {
val labelDiff = computeTextDiff(zPath, tA1label, tB1label)
//
// The content of a ProcInstr is technically just a big string.
// But in our usage the content is XML-like, so it could be loaded and then
// compared as XML, if the label is in fact an indicator that this is our
// special PI with format info.
//
// Much of that XML-ish content is attributes, however, so we need to be sure
// we're comparing those too.
//
// TODO: implement XML-comparison for our data format info PIs.
//
val contentDiff = computeTextDiff(zPath, tA1content, tB1content)
val restDiffs = computeDiffOne(ars, brs, aCounters, path, ignoreProcInstr)
val res = labelDiff ++ contentDiff ++ restDiffs
res
}
case (Nil, Nil) => Nil
//
// special case.
//
// when we read in an infoset for comparison we might have <foo></foo> which
// loads as an Elem with Nil for child.
//
// But the actual might be Elem with child that is an array of exactly one Text node
// with value "" (empty string). Visually this is the same! <foo></foo>
//
// Something in scala's libraries removes the isolated empty-string Text nodes
// (whitespace removal), so this comparison works.
case _ => {
List((zPath, as.toString, bs.toString))
}
}
}
def computeTextDiff(zPath: String, tA: Text, tB: Text): Seq[(String, String, String)] = {
val dataA = tA.toString
val dataB = tB.toString
computeTextDiff(zPath, dataA, dataB)
}
def computeTextDiff(zPath: String, dataA: String, dataB: String): Seq[(String, String, String)] = {
def quoteIt(str: String) = "'" + str + "'"
if (dataA == dataB) Nil
else if (dataA.length != dataB.length) {
List((zPath, quoteIt(dataA), quoteIt(dataB)))
} else {
val ints = Stream.from(1).map { _.toString }
val z = dataA zip dataB zip ints
val res = z.flatMap {
case ((a1, b1), index) =>
if (a1 == b1) Nil
else {
val indexPath = zPath + ".charAt(" + index + ")"
List((indexPath, a1.toString + "(%%#x%04X;)".format(a1.toInt), b1.toString + "(%%#x%04X;)".format(b1.toInt)))
}
}
res
}
}
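// For example:
//
//   computeTextDiff("/root/x", "abc", "abd") // one triple at path "/root/x.charAt(3)" - indices are 1-based
//   computeTextDiff("/root/x", "abc", "ab")  // lengths differ: one triple ("/root/x", "'abc'", "'ab'")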
/**
* For quick tests, we use literal Scala XML nodes. However, the underlying
* infrastructure wants to be entirely file-centric for diagnostic-message
* reasons (line numbers for errors).
*/
def convertNodeToTempFile(xml: Node, tmpDir: File, nameHint: String = "daffodil_tmp_") = {
// Create temp file
// note that the prefix has a minimum length of 3.
val prefix = nameHint.length match {
case 0 => "daffodil_tmp_"
case 1 => nameHint + "__"
case 2 => nameHint + "_"
case _ => nameHint
}
val tmpSchemaFile = File.createTempFile(prefix, ".dfdl.xsd", tmpDir)
// Delete temp file when program exits
tmpSchemaFile.deleteOnExit
//
// Note: we use our own pretty printer here because
// Scala library one doesn't preserve/print CDATA properly.
//
val pp = new org.apache.daffodil.xml.PrettyPrinter(2)
val xmlString = pp.format(xml)
val fos = new java.io.FileOutputStream(tmpSchemaFile)
val fw = new java.io.OutputStreamWriter(fos, "utf-8")
fw.write(xmlString)
fw.close()
tmpSchemaFile
}
def convertInputStreamToTempFile(is: java.io.InputStream,
tmpDir: File,
nameHint: String,
suffix: String) = {
// Create temp file
// note that the prefix has a minimum length of 3.
val prefix = nameHint.length match {
case 0 => "daffodil_tmp_"
case 1 => nameHint + "__"
case 2 => nameHint + "_"
case _ => nameHint
}
val tmpSchemaFile = File.createTempFile(prefix, suffix, tmpDir)
// Delete temp file when program exits
tmpSchemaFile.deleteOnExit
val fos = new java.io.FileOutputStream(tmpSchemaFile)
IOUtils.copy(is, fos)
fos.close()
tmpSchemaFile
}
/**
* Strong escaping that never loses information, handles apos and CR right.
*
* Escapes apostrophe (single quote) as well as the other XML escaped chars.
* Remaps CR and any other XML-illegals into PUA. Replaces whitespace with
* numeric character entities for additional safety.
*
* This is needed since XML may be using single quotes to surround a string which
* might contain single quotes.
*
* The reason basic scala.xml.Utility.escape doesn't escape single-quotes is
* HTML compatibility. HTML doesn't define an "&apos;" entity.
*
* Furthermore, since some potentially illegal XML characters may be used here, we
* are going to remap all the illegal XML characters to their corresponding PUA characters.
*
* Lastly, all whitespace chars are replaced by numeric character entities, and
* anything above 0xFF that is not considered letter or digit, is also replaced
* by a numeric character entity.
*
* The result is a string which can be displayed as an XML attribute value, and is
* invertible back to the original string.
*
* Finally, CRLF will come through as &#xE00D;&#xA; (and a lone CR as just &#xE00D;); that's because
* if we used &#xD; for the CR, it might be converted to a LF by XML readers.
* We have to use our own PUA remapping trick if we want to be sure to preserve
* CR in XML.
*/
def escape(str: String, sb: StringBuilder = new StringBuilder()): StringBuilder = {
var i = 0
while (i < str.length) {
val x = str(i)
val c = escapeMapper(x)
i += 1
c match {
case '\'' => sb.append("&#x27;") // don't use "&apos;" because it's not universally accepted (HTML doesn't have it in early versions)
case '"' => sb.append("&quot;")
case '&' => sb.append("&amp;")
case '<' => sb.append("&lt;")
case '>' => sb.append("&gt;")
case _ if (c.isLetterOrDigit) => sb.append(c)
case _ if (c.isWhitespace || c.isControl) => toNumericCharacterEntity(c, sb)
// A0 is the NBSP character - not considered whitespace, but no glyph, so we need it numeric
case _ if (c.toInt == 0xA0) => toNumericCharacterEntity(c, sb)
// Any other char below 0xFF is punctuation or some other glyph char
case _ if (c.toInt < 0xFF) => sb.append(c)
case _ => toNumericCharacterEntity(c, sb)
}
}
sb
}
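// Escaping sketch:
//
//   escape("it's <b>").toString // yields "it&#x27;s&#x20;&lt;b&gt;"
//   escape("a\rb").toString     // yields "a&#xE00D;b" - CR preserved via the PUA remapping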
private val escapeMapper =
remapXMLIllegalCharToPUA(
checkForExistingPUA = false,
replaceCRWithLF = false) _
def toNumericCharacterEntity(c: Char, sb: StringBuilder) = {
val i = c.toInt
Assert.usage(i > 0) // NUL cannot be represented at all in XML.
val s = Integer.toHexString(i).toUpperCase()
sb.append("&#x")
sb.append(s)
sb.append(";")
}
}
trait GetAttributesMixin extends ThrowsSDE {
def xml: Node
/**
* Use to retrieve things that are not format properties.
*/
def getAttributeRequired(name: String) = {
getAttributeOption(name) match {
case None => schemaDefinitionError("The attribute '" + name + "' is required.")
case Some(s) => s
}
}
/**
* Use to retrieve things that are not format properties.
*/
def getAttributeOption(name: String): Option[String] = {
val attrString = xml.attribute(name).map { _.text }
attrString
}
def getAttributeOption(ns: NS, name: String): Option[String] = {
//
// Most annoying, but this doesn't work....
// val res = xml.attribute(ns.toString, name).map{ _.text }
val attr = (xml \ ("@{" + ns.toString + "}" + name))
if (attr.length == 0) None
else Some(attr.text)
}
/**
* For picking off the short-form annotations.
*/
def dfdlAttributes(n: Node) = XMLUtils.dfdlAttributes(n)
def dafAttributes(n: Node) = XMLUtils.dafAttributes(n)
}
class QNamePrefixNotInScopeException(pre: String, loc: LookupLocation)
extends Exception("Prefix " + pre + " not found in scope. Location: " + loc.toString)
// Commented out for now, but we may reactivate this to
// do more validation stuff in the TDMLRunner. So keeping in the
// source like this.
//
//object XMLSchemaUtils {
// /**
// * validate a DFDL schema.
// *
// * This validates the XML Schema language subset that DFDL uses, and also all the annotations
// * hung off of it.
// */
// def validateDFDLSchema(doc: Node) = {
// // TODO: should this do something other than throw an exception on a validation error?
// //
// // Users will write DFDL Schemas, using the xs or xsd prefix (usually) bound to the XML Schema namespace,
// // and the dfdl prefix (usually) bound to the DFDL namespace.
// //
// // However, we don't want to validate using the XML Schema for XML Schema (which would be the usual interpretation
// // of validating an XML Schema), instead we want to use the schema for the DFDL Subset of XML Schema.
// //
// // So, the hack here, is we're going to textually substitute the URIs, so that the validator doesn't have to be
// // modified to do this switch, and we don't have to lie in the DFDL Subset schema, and claim it is realizing the
// // XML Schema URI.
// //
// // However, we should consider whether there is a better way to do this involving either (a) lying and having the
// // DFDL Subset Schema pretend it is the XSD schema, or we can play some catalog tricks perhaps.
// //
// // Also, the way this whole thing finds the necessary schemas is a bit daft. It should look in the jar or files,
// // but it should be using an XML Catalog.
// //
// val docstring = doc.toString()
// val xmlnsURI = "http://www.w3.org/2001/XMLSchema";
// val xsdSubsetURI = "http://www.w3.org/2001/XMLSchema";
// val docReplaced = docstring.replaceAll(xmlnsURI, xsdSubsetURI)
// val docReader = new StringReader(docReplaced)
// val schemaResource = Misc.getRequiredResource(Validator.dfdlSchemaFileName()).toURI()
// val res =
// try {
// Validator.validateXMLStream(schemaResource, docReader)
// } catch {
// case e: ... => {
// val exc = e
// // System.err.println(exc.getMessage())
// // Really useful place for a breakpoint.
// throw e
// }
// }
// res
// }
//}