blob: eadb27f19091bc38aa27c978cf902a396cefd54d [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.daffodil.xml
import java.io.BufferedInputStream
import java.net.URI
import scala.io.Source
import scala.xml._
import scala.xml.include.sax.EncodingHeuristics
import scala.xml.parsing.ConstructingParser
import org.apache.daffodil.exceptions.Assert
/**
* Scala 2.11 deprecated the Position object so it is no longer public.
* However, we still need a way to decode the integer positions that contain
* both line/col information. Scala provided no other way to decode this, so
* this copies the relevant bits from:
*
* https://github.com/scala/scala/blob/2.11.x/src/library/scala/io/Position.scala
*
* Note that if scala ever changes these values, line/column numbers will be off
*/
object Position {
  /** Width, in bits, of the line-number field of an encoded position. */
  final val LINE_BITS = 20

  /** Width, in bits, of the column field: what remains of the 31 non-sign bits. */
  final val COLUMN_BITS = 31 - LINE_BITS // no negatives => 31

  /** All-ones mask exactly as wide as the line-number field. */
  final val LINE_MASK = (1 << LINE_BITS) - 1

  /** All-ones mask exactly as wide as the column field. */
  final val COLUMN_MASK = (1 << COLUMN_BITS) - 1

  /** Extracts the line number, which is stored in the bits above the column field. */
  final def line(pos: Int): Int = {
    val withoutColumn = pos >> COLUMN_BITS
    withoutColumn & LINE_MASK
  }

  /** Extracts the column number, which is stored in the low-order bits. */
  final def column(pos: Int): Int = COLUMN_MASK & pos
}
/**
* Loads XML using the Scala ConstructingParser for XML.
*
* Necessary as this xml loading technique handles the <![CDATA[...]]>
* properly, creating PCData nodes for the contents of these, and not otherwise
* messing with the contents.
*
* This code is effectively our fork of the Scala ConstructingParser. This
* works around some bugs in it.
*
* Xerces, unfortunately, messes with the contents of these CDATA regions,
* normalizes whitespace inside them, and generally makes it impossible to do things
* in XML that depend on the line-structure of element content being preserved.
*
* We have places where line structure matters. Specifically regular expressions
* have a free-form syntax with comments that extend to end-of-line. If we always
* wrap these with CDATA, and use this loader, not Xerces, then these will be
* preserved properly.
*
* Also, enhanced so that when addPositionAttributes is true, it will capture
* file/line/column info for every element and add it
* as attributes onto each XML element.
*
* The way the constructing loader (aka ConstructingParser (for XML))
* gets positions is different. It is given just an offset into the document file/stream,
* and it therefore must synthesize line number/col number info itself.
*
* This primary constructor is package private as the normalizeCRLFtoLF feature
* is only for test/exploratory usage, or if a future need arises to preserve the
* non-normalizing behavior.
*
* @param uri URI for the XML to be loaded.
* @param errorHandler Called back on load errors.
* @param addPositionAttributes Use true if you want dafint:file,
* dafint:col, and dafint:line attributes.
* Defaults to false.
* @param normalizeCRLFtoLF Use true to emulate the scala XML load
* behavior of normalizing CRLF to LF, and solitary CR to LF.
* Defaults to true. Should only be changed in special circumstances
* as not normalizing CRLFs is non-standard for XML.
*
*/
// Primary constructor (package private). The block passed as the first
// argument of ConstructingParser builds the character Source the parser reads.
class DaffodilConstructingLoader private[xml] (uri: URI,
errorHandler: org.xml.sax.ErrorHandler,
addPositionAttributes: Boolean,
normalizeCRLFtoLF: Boolean)
extends ConstructingParser({
// Note: we must open the XML carefully since it might be in some
// non-default encoding (we have tests that use UTF-16, for example).
// The stream must be buffered to support mark(), which the encoding
// heuristics need in order to peek at the start of the stream.
//
// The stream is not closed in this block: the Source returned below wraps
// it, so it has to stay open for as long as the parser is reading.
val is = new BufferedInputStream(uri.toURL.openStream())
val enc = EncodingHeuristics.readEncodingFromStream(is)
Source.fromInputStream(is, enc)
// The second argument is ConstructingParser's preserveWS flag.
}, true) {
/**
* Public constructor insists on normalizeCRLFtoLF behavior.
*/
def this(
  uri: URI,
  errorHandler: org.xml.sax.ErrorHandler,
  addPositionAttributes: Boolean = false) = {
  // Delegate to the primary constructor with CRLF/CR => LF normalization always on.
  this(uri, errorHandler, addPositionAttributes, normalizeCRLFtoLF = true)
}
/**
* Ensures that DOCTYPES aka DTDs, if encountered, are rejected.
*
* Coverage is off, because this should never be hit, because
* the loader always has loaded the data with xerces prior to
* this loader (for validation purposes), and that will have caught
* the doctype being in the XML.
*
* However, under code maintenance, suppose someone turned that off
* or made that pass optional (for performance reasons perhaps). Then this
* provides a last-gasp attempt to protect from DOCTYPE-related
* insecurity.
*/
// $COVERAGE-OFF$
override def parseDTD(): Unit =
  // Abort immediately, reporting the parser's current position.
  throw makeSAXParseException(pos, "DOCTYPE is disallowed.")
// $COVERAGE-ON$
// This one line is a bit of a hack to get consistent line numbers. The
// scala-xml library reads XML from a scala.io.Source which maintains private
// line/col information about where in the Source we are reading from (i.e.
// scala.io.Source.pos). The problem is that when CDATA or a processing
// instruction is encountered, the library switches to a custom
// "WithLookAhead" scala.io.Source that buffers the original Source. This
// lookahead Source allows it to peek ahead a few characters, which is used
// to find the end of CDATA and processing instructions. The problem is that
// when it switches to this new Source, we lose position information since
// that information is private to each Source. This causes line information
// to reset to zero when the first CDATA or processing instruction is found.
// And there is no good way to copy position information from one source to
// another. So, what we can do is call this lookahead() function before any
// XML is parsed. This causes the ConstructingLoader to immediately switch to
// the buffering source. There may be some slight overhead for buffering, but
// at least our line numbers are correct.
lookahead()
/**
 * Builds (but does not throw) a SAXParseException for this loader's URI.
 *
 * @param pos encoded position; decoded into line and column via Position
 * @param msg the message carried by the exception
 */
private def makeSAXParseException(pos: Int, msg: String) =
  new org.xml.sax.SAXParseException(
    msg,
    null, // publicId: none is supplied
    uri.toString, // systemId: the URI being loaded
    Position.line(pos),
    Position.column(pos))
/**
 * Routes a syntax error to the error handler's `error` callback as a
 * SAXParseException located at `pos`.
 *
 * @param pos encoded position of the error
 * @param msg description of the error
 */
override def reportSyntaxError(pos: Int, msg: String): Unit =
  errorHandler.error(makeSAXParseException(pos, msg))
/*
* Callback method invoked by MarkupParser after parsing an element, between
* the elemStart and elemEnd callbacks. This adds daffodil file/line/column
* information as attributes to the existing input attrs, modifying the scope
* if necessary, then creates an element using the super def elem function.
*
* @param pos the position in the source file
* @param pre the prefix
* @param local the local name
* @param attrs the attributes (metadata)
* @param scope the namespace binding scope
* @param empty `true` if the element was previously empty; `false` otherwise.
* @param nodes the children of this element
*/
override def elem(
  pos: Int,
  pre: String,
  local: String,
  attrs: MetaData,
  scope: NamespaceBinding,
  empty: Boolean,
  nodes: NodeSeq): NodeSeq = {
  // Namespace that the element's own prefix resolves to in the current scope.
  val nsURI = NS(scope.getURI(pre))

  // Only the root element of a schema file or of a TDML file gets the file attribute.
  val isSchemaRoot =
    local.equalsIgnoreCase("schema") && nsURI == XMLUtils.XSD_NAMESPACE
  val isFileRootNode =
    isSchemaRoot ||
      (local.equalsIgnoreCase("testSuite") && nsURI == XMLUtils.TDML_NAMESPACE)

  // True when the incoming attributes already include a dafint line or column.
  val alreadyHasLineCol = attrs.exists {
    case PrefixedAttribute(XMLUtils.INT_PREFIX, key, _, _) =>
      key.equalsIgnoreCase(XMLUtils.COLUMN_ATTRIBUTE_NAME) ||
        key.equalsIgnoreCase(XMLUtils.LINE_ATTRIBUTE_NAME)
    case _ => false
  }

  val newAttrs: MetaData =
    if (!addPositionAttributes || alreadyHasLineCol) {
      // Position capture is off, or positions are already present: leave attrs alone.
      attrs
    } else {
      // Chain the new attributes in front of the existing ones: file (root
      // elements only), then column, then line.
      val fileAttrs: MetaData =
        if (isFileRootNode) {
          new PrefixedAttribute(XMLUtils.INT_PREFIX, XMLUtils.FILE_ATTRIBUTE_NAME, uri.toString, attrs)
        } else {
          attrs
        }
      val colAttrs: MetaData = new PrefixedAttribute(
        XMLUtils.INT_PREFIX,
        XMLUtils.COLUMN_ATTRIBUTE_NAME,
        Position.column(pos).toString,
        fileAttrs)
      new PrefixedAttribute(
        XMLUtils.INT_PREFIX,
        XMLUtils.LINE_ATTRIBUTE_NAME,
        Position.line(pos).toString,
        colAttrs)
    }

  // add the dafint prefix if it doesn't already exist
  val intPrefix = scope.getPrefix(XMLUtils.INT_NS)
  val mustBindIntPrefix = addPositionAttributes && intPrefix == null
  val newScope =
    if (mustBindIntPrefix) {
      NamespaceBinding(XMLUtils.INT_PREFIX, XMLUtils.INT_NS, scope)
    } else {
      Assert.usage(intPrefix == null || intPrefix == XMLUtils.INT_PREFIX) // can't deal with some other binding for dafint
      scope
    }

  super.elem(pos, pre, local, newAttrs, newScope, empty, nodes)
}
/**
* To emulate the behavior of Xerces loader (standard scala loader)
* we have to normalize CRLF to LF, and solitary CR to LF.
*
* This is optional, controlled by the normalizeCRLFtoLF constructor parameter.
*/
override def text(pos: Int, txt: String): Text = {
  // Only rewrite the text when normalization is enabled and a CR is present.
  val needsNormalizing = normalizeCRLFtoLF && txt.indexOf('\r') >= 0
  val newText: String =
    if (!needsNormalizing) {
      txt
    } else {
      // Collapse each CRLF pair to one LF first, then turn every remaining
      // solitary CR into LF.
      txt.replace("\r\n", "\n").replace('\r', '\n')
    }
  //
  // On MS-Windows the TDML Runner previously would load XML
  // files and due to git autoCRLF=true, they would
  // have CRLFs in them. The loader the TDML Runner WAS
  // using (not any more) was preserving these CRLFs
  // in the XML infoset data, and so tests could come
  // to depend on this and be non-portable between
  // unix (LF only) and windows (CRLF only).
  //
  // Furthermore, the TDML file itself used to be loaded with this
  // CRLF-preserving loader.
  //
  // The TDML Runner now always normalizes CRLF or
  // isolated CR to LF like regular XML loaders do,
  // for both the TDML file itself, and any files it
  // loads. So this is no longer an issue.
  //
  super.text(pos, newText)
}
/**
* We override this to force the ConstructingParser to process CDATA regions
* specially with an override-able method named cdata.
*
* Strangely, if you look at the implementation of this in the MarkupParser
* trait, it calls the handler for text, but then it ignores the result of that
* and constructs a PCDATA node from the original text.
*
* It's possible this is a bug fix.
*/
override def xCharData: NodeSeq = {
  // Consume the opening marker of the CDATA section.
  xToken("[CDATA[")
  // Pass the raw section body through cdata(), then wrap the resulting text
  // as a PCData node.
  val toPCData: (Int, String) => NodeSeq =
    (at, raw) => PCData(cdata(at, raw).text)
  // Take everything up to the closing marker; positions come from the
  // parser's current `pos`.
  xTakeUntil(toPCData, () => pos, "]]>")
}
/**
* Same CRLF/CR => LF processing as text gets.
*/
def cdata(pos: Int, s: String): NodeSeq =
  // Delegates to text() so CDATA content is handled exactly like ordinary text.
  text(pos, s)
/**
* Same CRLF/CR => LF processing as text gets.
*/
override def comment(pos: Int, s: String): Comment = {
  // Run the comment body through text() and build the node from the result.
  val processed = text(pos, s).text
  Comment(processed)
}
/**
* Same CRLF/CR => LF processing as text gets.
*/
override def procInstr(pos: Int, target: String, txt: String) = {
  // Run the instruction's text through text(); the target is used as given.
  val processed = text(pos, txt).text
  ProcInstr(target, processed)
}
/**
 * Interprets the attributes of an XML prolog (`<?xml ... ?>`).
 *
 * Each of `version`, `encoding` and `standalone` is optional. A value that
 * is present but unacceptable is reported through reportSyntaxError and
 * yields None, as does any attribute other than these three.
 *
 * @param m the attributes parsed from the prolog
 * @return (version, encoding, standalone), each None when absent or rejected
 */
private def parseXMLPrologAttributes(m: MetaData): (Option[String], Option[String], Option[Boolean]) = {
  val version: Option[String] = m("version") match {
    case null => None
    case Text("1.0") => Some("1.0")
    case _ =>
      reportSyntaxError("cannot deal with versions != 1.0")
      None
  }

  val encoding: Option[String] = m("encoding") match {
    case null => None
    case Text(enc) =>
      if (isValidIANAEncoding(enc)) {
        Some(enc)
      } else {
        reportSyntaxError("\"" + enc + "\" is not a valid encoding")
        None
      }
  }

  val standalone: Option[Boolean] = m("standalone") match {
    case null => None
    case Text("yes") => Some(true)
    case Text("no") => Some(false)
    case _ =>
      reportSyntaxError("either 'yes' or 'no' expected")
      None
  }

  // Every attribute must have been accepted above; anything left over is
  // either an unknown attribute or one whose value was rejected.
  val accepted = Seq(version, encoding, standalone).count(_.isDefined)
  if (m.length != accepted) {
    reportSyntaxError(
      "only 'version', 'encoding', and 'standalone' attributes are expected in xml prolog. Found: " + m)
  }

  (version, encoding, standalone)
}
/**
* Override of document to make it tolerant of the start of the file
* being whitespace instead of a "<" character
*
* This does not handle DOCTYPEs (aka DTDs) at all. Hence, is not
* a true replacement (bug fix) on the original ConstructingParser method
* that it overrides.
*/
override def document(): Document = {
// Start from a fresh Document and clear the dtd member.
doc = new Document()
this.dtd = null
var children: NodeSeq = null
// Three entry cases, chosen by the first character(s) of the input:
//   "<?"   a leading processing instruction (the XML prolog, or some other PI)
//   "<"    markup that is not a processing instruction
//   other  anything else (e.g. leading whitespace): handed straight to content()
if ('<' == ch) {
nextch()
if ('?' == ch) {
nextch()
// It's probably an XML prolog, but
// there are cases where there is no XML Prolog, but a starting
// PI of <?xml-model href="...."?>
// So we have to recognize as a general PI, then look and see if
// it is a prolog.
val name = xName
xSpace()
val (md, scp) = xAttributes(TopScope)
// A scope other than TopScope means the PI carried xmlns declarations.
if (scp != TopScope)
reportSyntaxError("no xmlns definitions allowed.")
xToken('?')
xToken('>')
if (name == "xml") {
// A real prolog: record version/encoding/standalone on the document.
val info_prolog = parseXMLPrologAttributes(md)
doc.version = info_prolog._1
doc.encoding = info_prolog._2
doc.standAlone = info_prolog._3
} else {
// not an xml prolog. It's some other PI
// do nothing. We're just skipping those PIs
// (so this leading PI does not appear among the document's children)
}
children = content(TopScope)
} else {
// The '<' has already been consumed, so parse the first item with content1
// and then append whatever content follows it.
val ts = new NodeBuffer()
content1(TopScope, ts) // the 1 suffix means "without the first < character"
ts &+ content(TopScope)
children = NodeSeq.fromSeq(ts)
}
} else {
children = content(TopScope)
}
// Validate the top-level nodes: exactly one element is required, and the only
// other things tolerated are PIs, comments and whitespace-only special nodes.
var isErr = false
var elemCount = 0
var theNode: Node = null
// NOTE(review): this match has no default case, so a top-level node that is
// neither a SpecialNode nor an Elem would raise a MatchError - confirm that
// content() cannot produce one here.
children.foreach { c =>
c match {
case _: ProcInstr => // skip
case _: Comment => // skip
// $COVERAGE-OFF$ // constructing parser never creates these - probably due to a bug
case _: EntityRef => {
reportSyntaxError("no entity references allowed here")
isErr = true
}
// $COVERAGE-ON$
// Any remaining special node is acceptable only if it is blank after trimming.
case s: SpecialNode => {
val txt = s.toString.trim()
if (txt.length > 0) {
reportSyntaxError("non-empty text nodes not allowed: '" + txt + "'.")
isErr = true
}
}
// Count elements; theNode ends up as the last element seen.
case m: Elem =>
elemCount += 1
theNode = m
}
}
if (1 != elemCount) {
reportSyntaxError("document must contain exactly one element")
isErr = true
}
// On success return the populated Document; on any reported error return null.
if (!isErr) {
doc.children = children
doc.docElem = theNode
doc
} else {
null
}
}
/**
 * Parses the XML at this loader's URI and returns its root element.
 *
 * Any Exception thrown while parsing is converted to a SAXParseException
 * located at the current input position and passed to the error handler's
 * fatalError callback.
 *
 * The parser's input Source is closed before this method returns, whether
 * parsing succeeded or failed, so the stream opened on the URI when this
 * loader was constructed is not leaked. A loader can therefore be loaded
 * only once.
 *
 * @return the document element, or null when document() returned null or a
 *         fatal error was reported to the error handler.
 */
def load(): Node = {
  try {
    this.initialize
    val doc = this.document()
    if (doc == null) null
    else doc.docElem
  } catch {
    case e: Exception => {
      val exc = makeSAXParseException(curInput.pos, e.toString)
      errorHandler.fatalError(exc) // good place for a breakpoint
      null
    }
  } finally {
    // `input` is the Source built from the URI's input stream in the class
    // header; closing the Source closes that stream. Without this the stream
    // stays open until garbage collection.
    input.close()
  }
}
}