daffodil-lib/src/main/scala/org/apache/daffodil/xml/DaffodilConstructingLoader.scala - daffodil - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 package org.apache.daffodil.xml

 import java.io.BufferedInputStream
 import java.net.URI

 import scala.io.Source
 import scala.xml._
 import scala.xml.include.sax.EncodingHeuristics
 import scala.xml.parsing.ConstructingParser

 import org.apache.daffodil.exceptions.Assert

 /**
  * Scala 2.11 deprecated the Position object so it is no longer public.
  * However, we still need a way to decode the integer positions that contain
  * both line/col information. Scala provided no other way to decode this, so
  * this copies the relevant bits from:
  *
  * https://github.com/scala/scala/blob/2.11.x/src/library/scala/io/Position.scala
  *
  * Note that if scala ever changes these values, line/column numbers will be off
  */
 object Position {
   /** How many bits of an encoded position hold the line number. */
   final val LINE_BITS = 20
   /** How many bits of an encoded position hold the column number (31 usable bits, sign bit excluded). */
   final val COLUMN_BITS = 31 - LINE_BITS
   /** Bit mask selecting a line number once it has been shifted down. */
   final val LINE_MASK = ~(-1 << LINE_BITS)
   /** Bit mask selecting the column number from the low bits. */
   final val COLUMN_MASK = ~(-1 << COLUMN_BITS)

   /** Extracts the line number stored in the bits above the column field of `pos`. */
   final def line(pos: Int): Int = {
     val upperBits = pos >> COLUMN_BITS
     upperBits & LINE_MASK
   }

   /** Extracts the column number stored in the low `COLUMN_BITS` bits of `pos`. */
   final def column(pos: Int): Int = COLUMN_MASK & pos
 }

 /**
  * Loads XML using the Scala ConstructingParser for XML.
  *
  * Necessary as this xml loading technique handles the <![CDATA[...]]>
  * properly, creating PCData nodes for the contents of these, and not otherwise
  * messing with the contents.
  *
  * This code is effectively our fork of the Scala ConstructingParser. This
  * works around some bugs in it.
  *
  * Xerces, unfortunately, messes with the contents of these CDATA regions,
  * normalizes whitespace inside them, and generally makes it impossible to do things
  * in XML that depend on line-structure of element content being preserved.
  *
  * We have places where line structure matters. Specifically regular expressions
  * have a free-form syntax with comments that extend to end-of-line. If we always
  * wrap these with CDATA, and use this loader, not Xerces, then these will be
  * preserved properly.
  *
  * Also, enhanced so that when addPositionAttributes is true, it will capture
  * file/line/column info for every element and add it
  * as attributes onto each XML element.
  *
  * The way the constructing loader (aka ConstructingParser (for XML))
  * gets positions is different. It is given just an offset into the document file/stream,
  * and it therefore must synthesize line number/col number info itself.
  *
  * This primary constructor is package private as the normalizeCRLFtoLF feature
  * is only for test/exploratory usage, or if a future need arises to preserve the
  * non-normalizing behavior.
  *
  * @param uri URI for the XML to be loaded.
  * @param errorHandler Called back on load errors.
  * @param addPositionAttributes Use true if you want dafint:file,
  *                              dafint:col, and dafint:line attributes.
  *                              Defaults to false.
  * @param normalizeCRLFtoLF Use true to emulate the scala XML load
  *                          behavior of normalizing CRLF to LF, and solitary CR to LF.
  *                          Defaults to true. Should only be changed in special circumstances
  *                          as not normalizing CRLFs is non-standard for XML.
  *
  */
 class DaffodilConstructingLoader private[xml] (uri: URI,
   errorHandler: org.xml.sax.ErrorHandler,
   addPositionAttributes: Boolean,
   normalizeCRLFtoLF: Boolean)
   extends ConstructingParser({
     // Note: we must open the XML carefully since it might be in some non
     // default encoding (we have tests that have UTF-16 for example)

     // must be buffered to support mark(), needed by heuristics
     val is = new BufferedInputStream(uri.toURL.openStream())
     try {
       val enc = EncodingHeuristics.readEncodingFromStream(is)
       Source.fromInputStream(is, enc)
     } catch {
       case e: Exception =>
         // Encoding detection or charset lookup failed. Nothing else holds a
         // reference to the stream yet, so release it before propagating.
         is.close()
         throw e
     }
   }, true) {

   /**
    * Public constructor insists on normalizingCRLFtoLF behavior.
    */
   def this (uri: URI,
     errorHandler: org.xml.sax.ErrorHandler,
     addPositionAttributes: Boolean = false) =
     this(uri, errorHandler, addPositionAttributes, normalizeCRLFtoLF = true)

   /**
    * Ensures that DOCTYPES aka DTDs, if encountered, are rejected.
    *
    * Coverage is off, because this should never be hit, because
    * the loader always has loaded the data with xerces prior to
    * this loader (for validation purposes), and that will have caught
    * the doctype being in the XML.
    *
    * However, under code maintenance, suppose someone turned that off
    * or made that pass optional (for performance reasons perhaps). Then this
    * provides a last-gasp attempt to protect from DOCTYPE-related
    * insecurity.
    */
   // $COVERAGE-OFF$
   override def parseDTD(): Unit = {
     throw makeSAXParseException(pos, "DOCTYPE is disallowed.")
   }
   // $COVERAGE-ON$

   // This one line is a bit of a hack to get consistent line numbers. The
   // scala-xml library reads XML from a scala.io.Source which maintains private
   // line/col information about where in the Source we are reading from (i.e.
   // scala.io.Source.pos). The problem is that when CDATA or a processing
   // instruction is encountered, the library switches to a custom
   // "WithLookAhead" scala.io.Source that buffers the original Source. This
   // lookahead Source allows it to peek ahead a few characters, which is used
   // to find the end of CDATA and processing instructions. The problem is that
   // when it switches to this new Source, we lose position information since
   // that information is private to each Source. This causes line information
   // to reset to zero when the first CDATA or processing instruction is found.
   // And there is no good way to copy position information from one source to
   // another. So, what we can do is call this lookahead() function before any
   // XML is parsed. This causes the ConstructingLoader to immediately switch to
   // the buffering source. There may be some slight overhead for buffering, but
   // at least our line numbers are correct.
   lookahead()

   /**
    * Builds a SAXParseException for this loader's URI, decoding the packed
    * scala.io position into separate line and column numbers.
    *
    * @param pos packed line/column position as used by scala.io.Source
    * @param msg the message for the exception
    */
   private def makeSAXParseException(pos: Int, msg: String) = {
     val line = Position.line(pos)
     val col = Position.column(pos)
     new org.xml.sax.SAXParseException(msg, null, uri.toString, line, col)
   }

   /**
    * Routes syntax errors detected by the parser to the supplied error
    * handler (as a non-fatal error) rather than using the superclass reporting.
    */
   override def reportSyntaxError(pos: Int, msg: String): Unit = {
     val exc = makeSAXParseException(pos, msg)
     errorHandler.error(exc)
   }

   /**
    * Callback method invoked by MarkupParser after parsing an element, between
    * the elemStart and elemEnd callbacks. This adds daffodil file/line/column
    * information as attributes to the existing input attrs, modifying the scope
    * if necessary, then creates an element using the super def elem function.
    *
    * Position attributes are only added when addPositionAttributes is true and
    * the element does not already carry a dafint line or column attribute. The
    * file attribute is only added to the file's root node (an XSD schema
    * element or a TDML testSuite element).
    *
    *  @param pos      the position in the source file
    *  @param pre      the prefix
    *  @param local    the local name
    *  @param attrs    the attributes (metadata)
    *  @param scope    the namespace binding scope
    *  @param empty    `true` if the element was previously empty; `false` otherwise.
    *  @param nodes    the children of this element
    */
   override def elem(
     pos: Int,
     pre: String,
     local: String,
     attrs: MetaData,
     scope: NamespaceBinding,
     empty: Boolean,
     nodes: NodeSeq): NodeSeq = {

     val nsURI = NS(scope.getURI(pre))
     val isFileRootNode = (local.equalsIgnoreCase("schema") && nsURI == XMLUtils.XSD_NAMESPACE) ||
       (local.equalsIgnoreCase("testSuite") && nsURI == XMLUtils.TDML_NAMESPACE)
     val alreadyHasLineCol = attrs.exists {
       case PrefixedAttribute(XMLUtils.INT_PREFIX, attr, _, _) => {
         attr.equalsIgnoreCase(XMLUtils.COLUMN_ATTRIBUTE_NAME) ||
           attr.equalsIgnoreCase(XMLUtils.LINE_ATTRIBUTE_NAME)
       }
       case _ => false
     }

     val newAttrs: MetaData = {
       if (addPositionAttributes && !alreadyHasLineCol) {
         val withFile: MetaData =
           if (isFileRootNode) {
             new PrefixedAttribute(XMLUtils.INT_PREFIX, XMLUtils.FILE_ATTRIBUTE_NAME, uri.toString, attrs)
           } else {
             attrs
           }
         val withCol: MetaData = new PrefixedAttribute(XMLUtils.INT_PREFIX, XMLUtils.COLUMN_ATTRIBUTE_NAME, Position.column(pos).toString, withFile)
         val withLine: MetaData = new PrefixedAttribute(XMLUtils.INT_PREFIX, XMLUtils.LINE_ATTRIBUTE_NAME, Position.line(pos).toString, withCol)
         withLine
       } else {
         attrs
       }
     }

     // add the dafint prefix if it doesn't already exist
     val intPrefix = scope.getPrefix(XMLUtils.INT_NS)
     val newScope = if (addPositionAttributes && intPrefix == null) {
       NamespaceBinding(XMLUtils.INT_PREFIX, XMLUtils.INT_NS, scope)
     } else {
       Assert.usage(intPrefix == null || intPrefix == XMLUtils.INT_PREFIX) // can't deal with some other binding for dafint
       scope
     }

     super.elem(pos, pre, local, newAttrs, newScope, empty, nodes)
   }

   /**
    * To emulate the behavior of Xerces loader (standard scala loader)
    * we have to normalize CRLF to LF, and solitary CR to LF.
    *
    * This is optional controlled by a constructor parameter.
    */
   override def text(pos: Int, txt: String): Text = {
     val newText: String = {
       if (normalizeCRLFtoLF && txt.contains("\r")) {
         // Literal (non-regex) replacement. CRLF pairs must be collapsed first
         // so that a CRLF becomes one LF rather than two.
         txt.
           replace("\r\n", "\n").
           replace("\r", "\n")
       } else {
         txt
       }
     }
     //
     // On MS-Windows the TDML Runner previously would load XML
     // files and due to git autoCRLF=true, they would
     // have CRLFs in them. The loader the TDML Runner WAS
     // using (not any more) was preserving these CRLFs
     // in the XML infoset data, and so tests could come
     // to depend on this and be non-portable between
     // unix (LF only) and windows (CRLF only).
     //
     // Furthermore, the TDML file itself used to be loaded with this
     // CRLF-preserving loader.
     //
     // The TDML Runner now always normalizes CRLF or
     // isolated CR to LF like regular XML loaders do,
     // for both the TDML file itself, and any files it
     // loads. So this is no longer an issue.
     //
     super.text(pos, newText)
   }

   /**
    * We override this to force the ConstructingParser to process CDATA regions
    * specially with an override-able method named cdata.
    *
    * Strangely, if you look at the implementation of this in the MarkupParser
    * trait, it calls the handler for text, but then it ignores the result of that
    * and constructs a PCDATA node from the original text.
    *
    * It's possible this is a bug fix.
    */
   override def xCharData: NodeSeq = {
     xToken("[CDATA[")
     def mkResult(pos: Int, s: String): NodeSeq = {
       val s1 = cdata(pos, s).text
       PCData(s1)
     }
     xTakeUntil(mkResult, () => pos, "]]>")
   }

   /**
    * Same CRLF/CR => LF processing as text gets.
    */
   def cdata(pos: Int, s: String): NodeSeq = {
     text(pos, s)
   }

   /**
    * Same CRLF/CR => LF processing as text gets.
    */
   override def comment(pos: Int, s: String): Comment = {
     Comment(text(pos, s).text)
   }

   /**
    * Same CRLF/CR => LF processing as text gets.
    */
   override def procInstr(pos: Int, target: String, txt: String): ProcInstr =
     ProcInstr(target, text(pos, txt).text)

   /**
    * Interprets the attributes of an `<?xml ...?>` prolog.
    *
    * Reports a syntax error (via reportSyntaxError) for a version other than
    * "1.0", for an encoding rejected by isValidIANAEncoding, for a standalone
    * value other than "yes"/"no", and for any attribute that was not accepted
    * as one of those three.
    *
    * @param m the attributes parsed from the prolog
    * @return the (version, encoding, standalone) values that were accepted;
    *         each is None when absent or rejected
    */
   private def parseXMLPrologAttributes(m: MetaData): (Option[String], Option[String], Option[Boolean]) = {

     var info_ver: Option[String] = None
     var info_enc: Option[String] = None
     var info_stdl: Option[Boolean] = None

     // n counts the attributes that were recognized and accepted, so that
     // leftovers can be detected below.
     var n = 0
     m("version") match {
       case null =>
       case Text("1.0") =>
         info_ver = Some("1.0"); n += 1
       case _ => reportSyntaxError("cannot deal with versions != 1.0")
     }

     m("encoding") match {
       case null =>
       case Text(enc) =>
         if (!isValidIANAEncoding(enc))
           reportSyntaxError("\"" + enc + "\" is not a valid encoding")
         else {
           info_enc = Some(enc)
           n += 1
         }
     }

     m("standalone") match {
       case null =>
       case Text("yes") =>
         info_stdl = Some(true); n += 1
       case Text("no") =>
         info_stdl = Some(false); n += 1
       case _ => reportSyntaxError("either 'yes' or 'no' expected")
     }

     if (m.length - n != 0) {
       reportSyntaxError(
         "only 'version', 'encoding', and 'standalone' attributes are expected in xml prolog. Found: " + m)
     }

     (info_ver, info_enc, info_stdl)
   }

   /**
    * Override of document to make it tolerant of the start of the file
    * being whitespace instead of a "<" character
    *
    * This does not handle DOCTYPEs (aka DTDs) at all. Hence, is not
    * a true replacement (bug fix) on the original ConstructingParser method
    * that it overrides.
    *
    * @return the parsed Document, or null if any syntax error was reported
    *         while checking the top-level content
    */
   override def document(): Document = {
     doc = new Document()
     this.dtd = null

     val children: NodeSeq =
       if ('<' == ch) {
         nextch()
         if ('?' == ch) {
           nextch()
           // It's probably an XML prolog, but
           // there are cases where there is no XML Prolog, but a starting
           // PI of <?xml-model href="...."?>
           // So we have to recognize as a general PI, then look and see if
           // it is a prolog.
           val name = xName
           xSpace()
           val (md, scp) = xAttributes(TopScope)
           if (scp != TopScope)
             reportSyntaxError("no xmlns definitions allowed.")
           xToken('?')
           xToken('>')
           // Any PI other than the xml prolog is simply skipped.
           if (name == "xml") {
             val info_prolog = parseXMLPrologAttributes(md)
             doc.version = info_prolog._1
             doc.encoding = info_prolog._2
             doc.standAlone = info_prolog._3
           }
           content(TopScope)
         } else {
           val ts = new NodeBuffer()
           content1(TopScope, ts) // the 1 suffix means "without the first < character"
           ts &+ content(TopScope)
           NodeSeq.fromSeq(ts)
         }
       } else {
         content(TopScope)
       }

     // Validate the top-level nodes: exactly one element, and otherwise only
     // PIs, comments, and whitespace-only text.
     var isErr = false
     var elemCount = 0
     var theNode: Node = null
     children.foreach {
       case _: ProcInstr => // skip
       case _: Comment => // skip
       // $COVERAGE-OFF$ // constructing parser never creates these - probably due to a bug
       case _: EntityRef => {
         reportSyntaxError("no entity references allowed here")
         isErr = true
       }
       // $COVERAGE-ON$
       case s: SpecialNode => {
         val txt = s.toString.trim()
         if (txt.length > 0) {
           reportSyntaxError("non-empty text nodes not allowed: '" + txt + "'.")
           isErr = true
         }
       }
       case m: Elem =>
         elemCount += 1
         theNode = m
     }
     if (1 != elemCount) {
       reportSyntaxError("document must contain exactly one element")
       isErr = true
     }

     if (!isErr) {
       doc.children = children
       doc.docElem = theNode
       doc
     } else {
       null
     }
   }

   /**
    * Parses the XML at the URI and returns its root element.
    *
    * Any exception thrown while parsing is converted to a SAXParseException
    * at the current input position and passed to the error handler's
    * fatalError. The underlying input is always closed before returning, so
    * a loader instance can only be used for a single load.
    *
    * @return the document element, or null if the document could not be
    *         loaded (and the error handler did not itself throw)
    */
   def load(): Node = {
     try {
       this.initialize
       val doc = this.document()
       if (doc == null) null
       else doc.docElem
     } catch {
       case e: Exception => {
         val exc = makeSAXParseException(curInput.pos, e.toString)
         errorHandler.fatalError(exc) // good place for a breakpoint
         null
       }
     } finally {
       // Closing the Source closes the stream opened in the constructor,
       // which would otherwise leak.
       input.close()
     }
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	package org.apache.daffodil.xml

	import java.io.BufferedInputStream
	import java.net.URI

	import scala.io.Source
	import scala.xml._
	import scala.xml.include.sax.EncodingHeuristics
	import scala.xml.parsing.ConstructingParser

	import org.apache.daffodil.exceptions.Assert

	/**
	* Scala 2.11 deprecated the Position object so it is no longer public.
	* However, we still need a way to decode the integer positions that contain
	* both line/col information. Scala provided no other way to decode this, so
	* this copies the relevant bits from:
	*
	* https://github.com/scala/scala/blob/2.11.x/src/library/scala/io/Position.scala
	*
	* Note that if scala ever changes these values, line/column numbers will be off
	*/
	object Position {
	  /** How many bits of an encoded position hold the line number. */
	  final val LINE_BITS = 20
	  /** How many bits of an encoded position hold the column number (31 usable bits, sign bit excluded). */
	  final val COLUMN_BITS = 31 - LINE_BITS
	  /** Bit mask selecting a line number once it has been shifted down. */
	  final val LINE_MASK = ~(-1 << LINE_BITS)
	  /** Bit mask selecting the column number from the low bits. */
	  final val COLUMN_MASK = ~(-1 << COLUMN_BITS)

	  /** Extracts the line number stored in the bits above the column field of `pos`. */
	  final def line(pos: Int): Int = {
	    val upperBits = pos >> COLUMN_BITS
	    upperBits & LINE_MASK
	  }

	  /** Extracts the column number stored in the low `COLUMN_BITS` bits of `pos`. */
	  final def column(pos: Int): Int = COLUMN_MASK & pos
	}

	/**
	* Loads XML using the Scala ConstructingParser for XML.
	*
	* Necessary as this xml loading technique handles the <![CDATA[...]]>
	* properly, creating PCData nodes for the contents of these, and not otherwise
	* messing with the contents.
	*
	* This code is effectively our fork of the Scala ConstructingParser. This
	* works around some bugs in it.
	*
	* Xerces, unfortunately, messes with the contents of these CDATA regions,
	* normalizes whitespace inside them, and generally makes it impossible to do things
	* in XML that depend on line-structure of element content being preserved.
	*
	* We have places where line structure matters. Specifically regular expressions
	* have a free-form syntax with comments that extend to end-of-line. If we always
	* wrap these with CDATA, and use this loader, not Xerces, then these will be
	* preserved properly.
	*
	* Also, enhanced so that when addPositionAttributes is true, it will capture
	* file/line/column info for every element and add it
	* as attributes onto each XML element.
	*
	* The way the constructing loader (aka ConstructingParser (for XML))
	* gets positions is different. It is given just an offset into the document file/stream,
	* and it therefore must synthesize line number/col number info itself.
	*
	* This primary constructor is package private as the normalizeCRLFtoLF feature
	* is only for test/exploratory usage, or if a future need arises to preserve the
	* non-normalizing behavior.
	*
	* @param uri URI for the XML to be loaded.
	* @param errorHandler Called back on load errors.
	* @param addPositionAttributes Use true if you want dafint:file,
	* dafint:col, and dafint:line attributes.
	* Defaults to false.
	* @param normalizeCRLFtoLF Use true to emulate the scala XML load
	* behavior of normalizing CRLF to LF, and solitary CR to LF.
	* Defaults to true. Should only be changed in special circumstances
	* as not normalizing CRLFs is non-standard for XML.
	*
	*/
	class DaffodilConstructingLoader private[xml] (uri: URI,
	  errorHandler: org.xml.sax.ErrorHandler,
	  addPositionAttributes: Boolean,
	  normalizeCRLFtoLF: Boolean)
	  extends ConstructingParser({
	    // Note: we must open the XML carefully since it might be in some non
	    // default encoding (we have tests that have UTF-16 for example)

	    // must be buffered to support mark(), needed by heuristics
	    val is = new BufferedInputStream(uri.toURL.openStream())
	    try {
	      val enc = EncodingHeuristics.readEncodingFromStream(is)
	      Source.fromInputStream(is, enc)
	    } catch {
	      case e: Exception =>
	        // Encoding detection or charset lookup failed. Nothing else holds a
	        // reference to the stream yet, so release it before propagating.
	        is.close()
	        throw e
	    }
	  }, true) {

	  /**
	   * Public constructor insists on normalizingCRLFtoLF behavior.
	   */
	  def this (uri: URI,
	    errorHandler: org.xml.sax.ErrorHandler,
	    addPositionAttributes: Boolean = false) =
	    this(uri, errorHandler, addPositionAttributes, normalizeCRLFtoLF = true)

	  /**
	   * Ensures that DOCTYPES aka DTDs, if encountered, are rejected.
	   *
	   * Coverage is off, because this should never be hit, because
	   * the loader always has loaded the data with xerces prior to
	   * this loader (for validation purposes), and that will have caught
	   * the doctype being in the XML.
	   *
	   * However, under code maintenance, suppose someone turned that off
	   * or made that pass optional (for performance reasons perhaps). Then this
	   * provides a last-gasp attempt to protect from DOCTYPE-related
	   * insecurity.
	   */
	  // $COVERAGE-OFF$
	  override def parseDTD(): Unit = {
	    throw makeSAXParseException(pos, "DOCTYPE is disallowed.")
	  }
	  // $COVERAGE-ON$

	  // This one line is a bit of a hack to get consistent line numbers. The
	  // scala-xml library reads XML from a scala.io.Source which maintains private
	  // line/col information about where in the Source we are reading from (i.e.
	  // scala.io.Source.pos). The problem is that when CDATA or a processing
	  // instruction is encountered, the library switches to a custom
	  // "WithLookAhead" scala.io.Source that buffers the original Source. This
	  // lookahead Source allows it to peek ahead a few characters, which is used
	  // to find the end of CDATA and processing instructions. The problem is that
	  // when it switches to this new Source, we lose position information since
	  // that information is private to each Source. This causes line information
	  // to reset to zero when the first CDATA or processing instruction is found.
	  // And there is no good way to copy position information from one source to
	  // another. So, what we can do is call this lookahead() function before any
	  // XML is parsed. This causes the ConstructingLoader to immediately switch to
	  // the buffering source. There may be some slight overhead for buffering, but
	  // at least our line numbers are correct.
	  lookahead()

	  /**
	   * Builds a SAXParseException for this loader's URI, decoding the packed
	   * scala.io position into separate line and column numbers.
	   *
	   * @param pos packed line/column position as used by scala.io.Source
	   * @param msg the message for the exception
	   */
	  private def makeSAXParseException(pos: Int, msg: String) = {
	    val line = Position.line(pos)
	    val col = Position.column(pos)
	    new org.xml.sax.SAXParseException(msg, null, uri.toString, line, col)
	  }

	  /**
	   * Routes syntax errors detected by the parser to the supplied error
	   * handler (as a non-fatal error) rather than using the superclass reporting.
	   */
	  override def reportSyntaxError(pos: Int, msg: String): Unit = {
	    val exc = makeSAXParseException(pos, msg)
	    errorHandler.error(exc)
	  }

	  /**
	   * Callback method invoked by MarkupParser after parsing an element, between
	   * the elemStart and elemEnd callbacks. This adds daffodil file/line/column
	   * information as attributes to the existing input attrs, modifying the scope
	   * if necessary, then creates an element using the super def elem function.
	   *
	   * Position attributes are only added when addPositionAttributes is true and
	   * the element does not already carry a dafint line or column attribute. The
	   * file attribute is only added to the file's root node (an XSD schema
	   * element or a TDML testSuite element).
	   *
	   *  @param pos      the position in the source file
	   *  @param pre      the prefix
	   *  @param local    the local name
	   *  @param attrs    the attributes (metadata)
	   *  @param scope    the namespace binding scope
	   *  @param empty    `true` if the element was previously empty; `false` otherwise.
	   *  @param nodes    the children of this element
	   */
	  override def elem(
	    pos: Int,
	    pre: String,
	    local: String,
	    attrs: MetaData,
	    scope: NamespaceBinding,
	    empty: Boolean,
	    nodes: NodeSeq): NodeSeq = {

	    val nsURI = NS(scope.getURI(pre))
	    val isFileRootNode = (local.equalsIgnoreCase("schema") && nsURI == XMLUtils.XSD_NAMESPACE) ||
	      (local.equalsIgnoreCase("testSuite") && nsURI == XMLUtils.TDML_NAMESPACE)
	    val alreadyHasLineCol = attrs.exists {
	      case PrefixedAttribute(XMLUtils.INT_PREFIX, attr, _, _) => {
	        attr.equalsIgnoreCase(XMLUtils.COLUMN_ATTRIBUTE_NAME) ||
	          attr.equalsIgnoreCase(XMLUtils.LINE_ATTRIBUTE_NAME)
	      }
	      case _ => false
	    }

	    val newAttrs: MetaData = {
	      if (addPositionAttributes && !alreadyHasLineCol) {
	        val withFile: MetaData =
	          if (isFileRootNode) {
	            new PrefixedAttribute(XMLUtils.INT_PREFIX, XMLUtils.FILE_ATTRIBUTE_NAME, uri.toString, attrs)
	          } else {
	            attrs
	          }
	        val withCol: MetaData = new PrefixedAttribute(XMLUtils.INT_PREFIX, XMLUtils.COLUMN_ATTRIBUTE_NAME, Position.column(pos).toString, withFile)
	        val withLine: MetaData = new PrefixedAttribute(XMLUtils.INT_PREFIX, XMLUtils.LINE_ATTRIBUTE_NAME, Position.line(pos).toString, withCol)
	        withLine
	      } else {
	        attrs
	      }
	    }

	    // add the dafint prefix if it doesn't already exist
	    val intPrefix = scope.getPrefix(XMLUtils.INT_NS)
	    val newScope = if (addPositionAttributes && intPrefix == null) {
	      NamespaceBinding(XMLUtils.INT_PREFIX, XMLUtils.INT_NS, scope)
	    } else {
	      Assert.usage(intPrefix == null || intPrefix == XMLUtils.INT_PREFIX) // can't deal with some other binding for dafint
	      scope
	    }

	    super.elem(pos, pre, local, newAttrs, newScope, empty, nodes)
	  }

	  /**
	   * To emulate the behavior of Xerces loader (standard scala loader)
	   * we have to normalize CRLF to LF, and solitary CR to LF.
	   *
	   * This is optional controlled by a constructor parameter.
	   */
	  override def text(pos: Int, txt: String): Text = {
	    val newText: String = {
	      if (normalizeCRLFtoLF && txt.contains("\r")) {
	        // Literal (non-regex) replacement. CRLF pairs must be collapsed first
	        // so that a CRLF becomes one LF rather than two.
	        txt.
	          replace("\r\n", "\n").
	          replace("\r", "\n")
	      } else {
	        txt
	      }
	    }
	    //
	    // On MS-Windows the TDML Runner previously would load XML
	    // files and due to git autoCRLF=true, they would
	    // have CRLFs in them. The loader the TDML Runner WAS
	    // using (not any more) was preserving these CRLFs
	    // in the XML infoset data, and so tests could come
	    // to depend on this and be non-portable between
	    // unix (LF only) and windows (CRLF only).
	    //
	    // Furthermore, the TDML file itself used to be loaded with this
	    // CRLF-preserving loader.
	    //
	    // The TDML Runner now always normalizes CRLF or
	    // isolated CR to LF like regular XML loaders do,
	    // for both the TDML file itself, and any files it
	    // loads. So this is no longer an issue.
	    //
	    super.text(pos, newText)
	  }

	  /**
	   * We override this to force the ConstructingParser to process CDATA regions
	   * specially with an override-able method named cdata.
	   *
	   * Strangely, if you look at the implementation of this in the MarkupParser
	   * trait, it calls the handler for text, but then it ignores the result of that
	   * and constructs a PCDATA node from the original text.
	   *
	   * It's possible this is a bug fix.
	   */
	  override def xCharData: NodeSeq = {
	    xToken("[CDATA[")
	    def mkResult(pos: Int, s: String): NodeSeq = {
	      val s1 = cdata(pos, s).text
	      PCData(s1)
	    }
	    xTakeUntil(mkResult, () => pos, "]]>")
	  }

	  /**
	   * Same CRLF/CR => LF processing as text gets.
	   */
	  def cdata(pos: Int, s: String): NodeSeq = {
	    text(pos, s)
	  }

	  /**
	   * Same CRLF/CR => LF processing as text gets.
	   */
	  override def comment(pos: Int, s: String): Comment = {
	    Comment(text(pos, s).text)
	  }

	  /**
	   * Same CRLF/CR => LF processing as text gets.
	   */
	  override def procInstr(pos: Int, target: String, txt: String): ProcInstr =
	    ProcInstr(target, text(pos, txt).text)

	  /**
	   * Interprets the attributes of an `<?xml ...?>` prolog.
	   *
	   * Reports a syntax error (via reportSyntaxError) for a version other than
	   * "1.0", for an encoding rejected by isValidIANAEncoding, for a standalone
	   * value other than "yes"/"no", and for any attribute that was not accepted
	   * as one of those three.
	   *
	   * @param m the attributes parsed from the prolog
	   * @return the (version, encoding, standalone) values that were accepted;
	   *         each is None when absent or rejected
	   */
	  private def parseXMLPrologAttributes(m: MetaData): (Option[String], Option[String], Option[Boolean]) = {

	    var info_ver: Option[String] = None
	    var info_enc: Option[String] = None
	    var info_stdl: Option[Boolean] = None

	    // n counts the attributes that were recognized and accepted, so that
	    // leftovers can be detected below.
	    var n = 0
	    m("version") match {
	      case null =>
	      case Text("1.0") =>
	        info_ver = Some("1.0"); n += 1
	      case _ => reportSyntaxError("cannot deal with versions != 1.0")
	    }

	    m("encoding") match {
	      case null =>
	      case Text(enc) =>
	        if (!isValidIANAEncoding(enc))
	          reportSyntaxError("\"" + enc + "\" is not a valid encoding")
	        else {
	          info_enc = Some(enc)
	          n += 1
	        }
	    }

	    m("standalone") match {
	      case null =>
	      case Text("yes") =>
	        info_stdl = Some(true); n += 1
	      case Text("no") =>
	        info_stdl = Some(false); n += 1
	      case _ => reportSyntaxError("either 'yes' or 'no' expected")
	    }

	    if (m.length - n != 0) {
	      reportSyntaxError(
	        "only 'version', 'encoding', and 'standalone' attributes are expected in xml prolog. Found: " + m)
	    }

	    (info_ver, info_enc, info_stdl)
	  }

	  /**
	   * Override of document to make it tolerant of the start of the file
	   * being whitespace instead of a "<" character
	   *
	   * This does not handle DOCTYPEs (aka DTDs) at all. Hence, is not
	   * a true replacement (bug fix) on the original ConstructingParser method
	   * that it overrides.
	   *
	   * @return the parsed Document, or null if any syntax error was reported
	   *         while checking the top-level content
	   */
	  override def document(): Document = {
	    doc = new Document()
	    this.dtd = null

	    val children: NodeSeq =
	      if ('<' == ch) {
	        nextch()
	        if ('?' == ch) {
	          nextch()
	          // It's probably an XML prolog, but
	          // there are cases where there is no XML Prolog, but a starting
	          // PI of <?xml-model href="...."?>
	          // So we have to recognize as a general PI, then look and see if
	          // it is a prolog.
	          val name = xName
	          xSpace()
	          val (md, scp) = xAttributes(TopScope)
	          if (scp != TopScope)
	            reportSyntaxError("no xmlns definitions allowed.")
	          xToken('?')
	          xToken('>')
	          // Any PI other than the xml prolog is simply skipped.
	          if (name == "xml") {
	            val info_prolog = parseXMLPrologAttributes(md)
	            doc.version = info_prolog._1
	            doc.encoding = info_prolog._2
	            doc.standAlone = info_prolog._3
	          }
	          content(TopScope)
	        } else {
	          val ts = new NodeBuffer()
	          content1(TopScope, ts) // the 1 suffix means "without the first < character"
	          ts &+ content(TopScope)
	          NodeSeq.fromSeq(ts)
	        }
	      } else {
	        content(TopScope)
	      }

	    // Validate the top-level nodes: exactly one element, and otherwise only
	    // PIs, comments, and whitespace-only text.
	    var isErr = false
	    var elemCount = 0
	    var theNode: Node = null
	    children.foreach {
	      case _: ProcInstr => // skip
	      case _: Comment => // skip
	      // $COVERAGE-OFF$ // constructing parser never creates these - probably due to a bug
	      case _: EntityRef => {
	        reportSyntaxError("no entity references allowed here")
	        isErr = true
	      }
	      // $COVERAGE-ON$
	      case s: SpecialNode => {
	        val txt = s.toString.trim()
	        if (txt.length > 0) {
	          reportSyntaxError("non-empty text nodes not allowed: '" + txt + "'.")
	          isErr = true
	        }
	      }
	      case m: Elem =>
	        elemCount += 1
	        theNode = m
	    }
	    if (1 != elemCount) {
	      reportSyntaxError("document must contain exactly one element")
	      isErr = true
	    }

	    if (!isErr) {
	      doc.children = children
	      doc.docElem = theNode
	      doc
	    } else {
	      null
	    }
	  }

	  /**
	   * Parses the XML at the URI and returns its root element.
	   *
	   * Any exception thrown while parsing is converted to a SAXParseException
	   * at the current input position and passed to the error handler's
	   * fatalError. The underlying input is always closed before returning, so
	   * a loader instance can only be used for a single load.
	   *
	   * @return the document element, or null if the document could not be
	   *         loaded (and the error handler did not itself throw)
	   */
	  def load(): Node = {
	    try {
	      this.initialize
	      val doc = this.document()
	      if (doc == null) null
	      else doc.docElem
	    } catch {
	      case e: Exception => {
	        val exc = makeSAXParseException(curInput.pos, e.toString)
	        errorHandler.fatalError(exc) // good place for a breakpoint
	        null
	      }
	    } finally {
	      // Closing the Source closes the stream opened in the constructor,
	      // which would otherwise leak.
	      input.close()
	    }
	  }
	}