blob: fadef22c8bf94789ba4202a7d7b07a0125f7e7ee [file] [log] [blame]
/*
* The Apache Software License, Version 1.1
*
* Copyright (c) 1999-2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Xerces" and "Apache Software Foundation" must
* not be used to endorse or promote products derived from this
* software without prior written permission. For written
* permission, please contact apache\@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* nor may "Apache" appear in their name, without prior written
* permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation, and was
* originally based on software copyright (c) 1999, International
* Business Machines, Inc., http://www.ibm.com . For more information
* on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
/*
* $Id$
*/
// ---------------------------------------------------------------------------
// This file holds some of the grunt work methods of XMLScanner.cpp to keep
// it a little more readable.
// ---------------------------------------------------------------------------
// ---------------------------------------------------------------------------
// Includes
// ---------------------------------------------------------------------------
#include <xercesc/util/BinMemInputStream.hpp>
#include <xercesc/util/Janitor.hpp>
#include <xercesc/util/PlatformUtils.hpp>
#include <xercesc/util/TransService.hpp>
#include <xercesc/util/UnexpectedEOFException.hpp>
#include <xercesc/util/XMLUniDefs.hpp>
#include <xercesc/util/XMLUni.hpp>
#include <xercesc/util/XMLURL.hpp>
#include <xercesc/sax/InputSource.hpp>
#include <xercesc/framework/LocalFileInputSource.hpp>
#include <xercesc/framework/URLInputSource.hpp>
#include <xercesc/framework/XMLErrorReporter.hpp>
#include <xercesc/framework/XMLDocumentHandler.hpp>
#include <xercesc/framework/XMLEntityHandler.hpp>
#include <xercesc/framework/XMLPScanToken.hpp>
#include <xercesc/framework/XMLRefInfo.hpp>
#include <xercesc/framework/XMLValidator.hpp>
#include <xercesc/internal/XMLScanner.hpp>
#include <xercesc/internal/EndOfEntityException.hpp>
#include <xercesc/internal/XMLInternalErrorHandler.hpp>
#include <xercesc/parsers/IDOMParser.hpp>
#include <xercesc/dom/DOM_DOMException.hpp>
#include <xercesc/sax/EntityResolver.hpp>
#include <xercesc/validators/common/ContentLeafNameTypeVector.hpp>
#include <xercesc/validators/datatype/DatatypeValidator.hpp>
#include <xercesc/validators/schema/SchemaSymbols.hpp>
#include <xercesc/validators/schema/SchemaGrammar.hpp>
#include <xercesc/validators/schema/TraverseSchema.hpp>
#include <xercesc/validators/schema/SubstitutionGroupComparator.hpp>
#include <xercesc/validators/schema/identity/XPathMatcherStack.hpp>
// ---------------------------------------------------------------------------
// XMLScanner: Private helper methods
// ---------------------------------------------------------------------------
//
//  This method is called from scanStartTagNS() to build up the list of
//  XMLAttr objects that will be passed out in the start tag callout. The
//  key/value pairs come from the raw scan of the explicitly provided
//  attributes, which have not been normalized yet. The element declaration
//  supplies any defaulted or fixed attribute defs, which are added as well.
//
//  providedAttrs   Raw name/value pairs scanned from the start tag
//  attCount        Number of entries of providedAttrs to process
//  elemDecl        Declaration of the element that owns the attributes
//  toFill          Output list. Its existing XMLAttr objects are reused in
//                  place; new ones are appended once they are used up.
//
//  Returns the number of attributes stored in toFill (explicitly provided
//  ones first, followed by any faulted-in default/fixed ones).
//
unsigned int
XMLScanner::buildAttList(const  RefVectorOf<KVStringPair>&  providedAttrs
                        , const unsigned int                attCount
                        ,       XMLElementDecl*             elemDecl
                        ,       RefVectorOf<XMLAttr>&       toFill)
{
    //
    //  Ask the element to clear the 'provided' flag on all of the att defs
    //  that it owns, and to return us a boolean indicating whether it has
    //  any defs.
    //
    const bool hasDefs = elemDecl->resetDefs();

    //
    //  If there are no explicitly provided attributes and there are no
    //  defined attributes for the element, then we don't have anything to
    //  do. So just return zero in this case.
    //
    if (!hasDefs && !attCount)
        return 0;

    // Keep up with how many attrs we end up with total
    unsigned int retCount = 0;

    //
    //  And get the current size of the output vector. This lets us use
    //  existing elements until we fill it, then start adding new ones.
    //
    const unsigned int curAttListSize = toFill.size();

    //
    //  We need a buffer into which raw scanned attribute values will be
    //  normalized.
    //
    XMLBufBid bbNormal(&fBufMgr);
    XMLBuffer& normBuf = bbNormal.getBuffer();

    //
    //  Loop through our explicitly provided attributes, which are in the raw
    //  scanned form, and build up XMLAttr objects.
    //
    unsigned int index;
    for (index = 0; index < attCount; index++)
    {
        const KVStringPair* curPair = providedAttrs.elementAt(index);

        //
        //  We have to split the name into its prefix and name parts. Then
        //  we map the prefix to its URI.
        //
        const XMLCh* const namePtr = curPair->getKey();
        ArrayJanitor<XMLCh> janName(0);

        // Use a stack-based buffer when possible.
        XMLCh tempBuffer[100];

        const int colonInd = XMLString::indexOf(namePtr, chColon);
        const XMLCh* prefPtr = XMLUni::fgZeroLenString;
        const XMLCh* suffPtr = XMLUni::fgZeroLenString;
        if (colonInd != -1)
        {
            //
            //  We have to split the string, so make a copy. The colon in
            //  the copy is overwritten with a null so that the copy holds
            //  the prefix and the local part back to back.
            //
            if (XMLString::stringLen(namePtr) < sizeof(tempBuffer) / sizeof(tempBuffer[0]))
            {
                XMLString::copyString(tempBuffer, namePtr);
                tempBuffer[colonInd] = chNull;
                prefPtr = tempBuffer;
            }
            else
            {
                // Too long for the stack buffer; the janitor owns the copy
                janName.reset(XMLString::replicate(namePtr));
                janName[colonInd] = chNull;
                prefPtr = janName.get();
            }
            suffPtr = prefPtr + colonInd + 1;
        }
        else
        {
            // No colon, so we just have a name with no prefix
            suffPtr = namePtr;
        }

        //
        //  Map the prefix to a URI id. We tell him that we are mapping an
        //  attr prefix, so any xmlns attrs at this level will not affect it.
        //
        const unsigned int uriId = resolvePrefix(prefPtr, ElemStack::Mode_Attribute);

        //
        //  If the uri comes back as the xmlns or xml URI, or its just a name
        //  and that name is 'xmlns', or the URI is the schema instance (xsi)
        //  URI, then we handle it specially. So set a boolean flag that lets
        //  us quickly know below which we are dealing with.
        //
        const bool isNSAttr = (uriId == fXMLNSNamespaceId)
                              || (uriId == fXMLNamespaceId)
                              || !XMLString::compareString(suffPtr, XMLUni::fgXMLNSString)
                              || !XMLString::compareString(getURIText(uriId), SchemaSymbols::fgURI_XSI);

        //
        //  If its not a special case namespace attr of some sort (or we are
        //  working against a DTD grammar), then we do normal checking and
        //  processing.
        //
        XMLAttDef::AttTypes attType;
        if (!isNSAttr || fGrammarType == Grammar::DTDGrammarType)
        {
            // Some checking for attribute wild card first (for schema)
            bool laxThisOne = false;
            bool skipThisOne = false;

            XMLAttDef* attDefForWildCard = 0;
            XMLAttDef* attDef = 0;

            if (fGrammarType == Grammar::SchemaGrammarType)
            {
                SchemaAttDef* attWildCard = ((SchemaElementDecl*)elemDecl)->getAttWildCard();
                if (attWildCard)
                {
                    // If schema, see if we should lax or skip the validation of this attribute
                    if (anyAttributeValidation(attWildCard, uriId, skipThisOne, laxThisOne))
                    {
                        SchemaGrammar* sGrammar = (SchemaGrammar*) fGrammarResolver->getGrammar(getURIText(uriId));
                        if (sGrammar && sGrammar->getGrammarType() == Grammar::SchemaGrammarType)
                        {
                            RefHashTableOf<XMLAttDef>* attRegistry = sGrammar->getAttributeDeclRegistry();
                            if (attRegistry)
                            {
                                attDefForWildCard = attRegistry->get(suffPtr);
                            }
                        }
                    }
                }

                // Retrieve the att def
                attDef = ((SchemaElementDecl*)elemDecl)->getAttDef(suffPtr, uriId);
                if (!attDef)
                {
                    // Not found, see if the attDef should be qualified or not
                    if (uriId == fEmptyNamespaceId)
                    {
                        attDef = ((SchemaElementDecl*)elemDecl)->getAttDef(suffPtr, fURIStringPool->getId(fGrammar->getTargetNamespace()));
                        if (fValidate
                            && attDef
                            && !attDefForWildCard
                            && !skipThisOne
                            && !laxThisOne
                            && attDef->getCreateReason() != XMLAttDef::JustFaultIn)
                        {
                            // The attribute should be qualified
                            fValidator->emitError
                            (
                                XMLValid::AttributeNotQualified
                                , attDef->getFullName()
                            );
                        }
                    }
                    else
                    {
                        attDef = ((SchemaElementDecl*)elemDecl)->getAttDef(suffPtr, fEmptyNamespaceId);
                        if (fValidate
                            && attDef
                            && !attDefForWildCard
                            && !skipThisOne
                            && !laxThisOne
                            && attDef->getCreateReason() != XMLAttDef::JustFaultIn)
                        {
                            // The attribute should be unqualified
                            fValidator->emitError
                            (
                                XMLValid::AttributeNotUnQualified
                                , attDef->getFullName()
                            );
                        }
                    }
                }
            }

            //
            //  Find this attribute within the parent element. We pass both
            //  the uriID/name and the raw QName buffer, since we don't know
            //  how the derived validator and its elements store attributes.
            //
            bool wasAdded = false;
            if (!attDef)
            {
                attDef = elemDecl->findAttr
                (
                    curPair->getKey()
                    , uriId
                    , suffPtr
                    , prefPtr
                    , XMLElementDecl::AddIfNotFound
                    , wasAdded
                );
            }

            if (wasAdded)
            {
                // This is to tell the Validator that this attribute was
                // faulted-in, was not an attribute in the attdef originally
                attDef->setCreateReason(XMLAttDef::JustFaultIn);
            }

            //
            //  The attribute is not declared for this element if it was
            //  either faulted in just now, or was faulted in earlier and
            //  this is its first occurrence in this tag. Report that when
            //  validating, unless a wildcard declaration covers it or the
            //  wildcard says to skip/lax it. This must run before the
            //  'provided' flag is updated below.
            //
            if (!attDefForWildCard
                && fValidate
                && !skipThisOne
                && !laxThisOne
                && (wasAdded
                    || (attDef->getCreateReason() == XMLAttDef::JustFaultIn
                        && !attDef->getProvided())))
            {
                // Build the "{uri}localname" text used in the message
                XMLBufBid bbURI(&fBufMgr);
                XMLBuffer& bufURI = bbURI.getBuffer();
                getURIText(uriId, bufURI);

                XMLBufBid bbMsg(&fBufMgr);
                XMLBuffer& bufMsg = bbMsg.getBuffer();
                bufMsg.append(chOpenCurly);
                bufMsg.append(bufURI.getRawBuffer());
                bufMsg.append(chCloseCurly);
                bufMsg.append(suffPtr);

                fValidator->emitError
                (
                    XMLValid::AttNotDefinedForElement
                    , bufMsg.getRawBuffer()
                    , elemDecl->getFullName()
                );
            }

            //
            //  If its already provided, then there are more than one of
            //  this attribute in this start tag, so emit an error.
            //
            if (attDef->getProvided())
            {
                emitError
                (
                    XMLErrs::AttrAlreadyUsedInSTag
                    , attDef->getFullName()
                    , elemDecl->getFullName()
                );
            }
            else
            {
                attDef->setProvided(true);
            }

            //
            //  Now normalize the raw value since we have the attribute type.
            //  We don't care about the return status here. If it failed, an
            //  error was issued, which is all we care about.
            //
            //  A faulted-in attribute that is covered by a wildcard is
            //  normalized and validated against the wildcard's declaration
            //  rather than the faulted-in def.
            //
            if (attDefForWildCard
                && (wasAdded || attDef->getCreateReason() == XMLAttDef::JustFaultIn))
            {
                normalizeAttValue
                (
                    attDefForWildCard
                    , curPair->getValue()
                    , normBuf
                );

                // Validate against the wildcard-resolved declaration
                if (fValidate && !skipThisOne)
                {
                    // Normalize the attribute according to schema whitespace facet
                    XMLBufBid bbtemp(&fBufMgr);
                    XMLBuffer& tempBuf = bbtemp.getBuffer();

                    DatatypeValidator* tempDV = ((SchemaAttDef*) attDefForWildCard)->getDatatypeValidator();
                    ((SchemaValidator*) fValidator)->normalizeWhiteSpace(tempDV, normBuf.getRawBuffer(), tempBuf);
                    normBuf.set(tempBuf.getRawBuffer());

                    fValidator->validateAttrValue
                    (
                        attDefForWildCard
                        , normBuf.getRawBuffer()
                    );
                }

                // Save the type for later use
                attType = attDefForWildCard->getType();
            }
            else
            {
                normalizeAttValue
                (
                    attDef
                    , curPair->getValue()
                    , normBuf
                );

                //
                //  If we found a real (not faulted-in) attdef for this one,
                //  then lets validate it.
                //
                if (!wasAdded && attDef->getCreateReason() != XMLAttDef::JustFaultIn)
                {
                    if (fValidate && !skipThisOne)
                    {
                        if (fGrammarType == Grammar::SchemaGrammarType)
                        {
                            // Normalize the attribute according to schema whitespace facet
                            XMLBufBid bbtemp(&fBufMgr);
                            XMLBuffer& tempBuf = bbtemp.getBuffer();

                            DatatypeValidator* tempDV = ((SchemaAttDef*) attDef)->getDatatypeValidator();
                            ((SchemaValidator*) fValidator)->normalizeWhiteSpace(tempDV, normBuf.getRawBuffer(), tempBuf);
                            normBuf.set(tempBuf.getRawBuffer());
                        }

                        fValidator->validateAttrValue
                        (
                            attDef
                            , normBuf.getRawBuffer()
                        );
                    }
                }

                // Save the type for later use
                attType = attDef->getType();
            }
        }
        else
        {
            // Just normalize as CDATA
            attType = XMLAttDef::CData;
            normalizeAttRawValue
            (
                curPair->getKey()
                , curPair->getValue()
                , normBuf
            );
        }

        //
        //  Add this attribute to the attribute list that we use to pass them
        //  to the handler. We reuse its existing elements but expand it as
        //  required.
        //
        XMLAttr* curAttr;
        if (retCount >= curAttListSize)
        {
            curAttr = new XMLAttr
            (
                uriId
                , suffPtr
                , prefPtr
                , normBuf.getRawBuffer()
                , attType
                , true
            );
            toFill.addElement(curAttr);
        }
        else
        {
            curAttr = toFill.elementAt(retCount);
            curAttr->set
            (
                uriId
                , suffPtr
                , prefPtr
                , normBuf.getRawBuffer()
                , attType
            );
            curAttr->setSpecified(true);
        }

        // Bump the count of attrs in the list
        retCount++;
    }

    //
    //  Now, if there are any attributes declared by this element, let's
    //  go through them and make sure that any required ones are provided,
    //  and fault in any fixed ones and defaulted ones that are not provided
    //  literally.
    //
    if (hasDefs)
    {
        //
        //  Check after all specified attrs are scanned
        //  (1) report error for REQUIRED attrs that are missing (V_TAGc)
        //  (2) add default attrs if missing (FIXED and NOT_FIXED)
        //
        XMLAttDefList& attDefList = elemDecl->getAttDefList();
        while (attDefList.hasMoreElements())
        {
            // Get the current att def, for convenience and its def type
            const XMLAttDef& curDef = attDefList.nextElement();
            const XMLAttDef::DefAttTypes defType = curDef.getDefaultType();

            if (!curDef.getProvided())
            {
                // The attribute is not provided
                if (fValidate)
                {
                    // If we are validating and its required, then an error
                    if ((defType == XMLAttDef::Required) ||
                        (defType == XMLAttDef::Required_And_Fixed)  )
                    {
                        fValidator->emitError
                        (
                            XMLValid::RequiredAttrNotProvided
                            , curDef.getFullName()
                        );
                    }
                    else if ((defType == XMLAttDef::Default) ||
                             (defType == XMLAttDef::Fixed)  )
                    {
                        if (fStandalone && curDef.isExternal())
                        {
                            //
                            //  XML 1.0 Section 2.9
                            //  Document is standalone, so attributes must not be defaulted.
                            //
                            fValidator->emitError(XMLValid::NoDefAttForStandalone, curDef.getFullName(), elemDecl->getFullName());
                        }
                    }
                }

                //
                //  Fault in the value if needed, and bump the att count.
                //  The defaulted attribute goes into the same output list,
                //  and is counted against the same size/count, as the
                //  explicitly provided ones above.
                //
                if ((defType == XMLAttDef::Default)
                ||  (defType == XMLAttDef::Fixed))
                {
                    XMLAttr* curAtt;
                    if (retCount >= curAttListSize)
                    {
                        curAtt = new XMLAttr;
                        fValidator->faultInAttr(*curAtt, curDef);
                        toFill.addElement(curAtt);
                    }
                    else
                    {
                        curAtt = toFill.elementAt(retCount);
                        fValidator->faultInAttr(*curAtt, curDef);
                    }

                    if (fGrammarType == Grammar::DTDGrammarType)
                    {
                        //
                        //  Map the new attribute's prefix to a URI id and store
                        //  that in the attribute object.
                        //
                        curAtt->setURIId
                        (
                            resolvePrefix(curAtt->getPrefix(), ElemStack::Mode_Attribute)
                        );
                    }

                    // Indicate it was not explicitly specified and bump count
                    curAtt->setSpecified(false);
                    retCount++;
                }
            }
            else
            {
                // Attribute is provided
                // (schema) report error for PROHIBITED attrs that are present (V_TAGc)
                if (defType == XMLAttDef::Prohibited && fValidate)
                    fValidator->emitError
                    (
                        XMLValid::ProhibitedAttributePresent
                        , curDef.getFullName()
                    );
            }
        }
    }
    return retCount;
}
//
// This method is called after the content scan to insure that all the
// ID/IDREF attributes match up (i.e. that all IDREFs refer to IDs.) This is
// an XML 1.0 rule, so we can do here in the core.
//
void XMLScanner::checkIDRefs()
{
//
//
// Iterate the id ref list. If we find any entries here which are used
// but not declared, then that's an error.
//
RefHashTableOfEnumerator<XMLRefInfo> refEnum(fIDRefList);
while (refEnum.hasMoreElements())
{
// Get a ref to the current element
const XMLRefInfo& curRef = refEnum.nextElement();
// If its used but not declared, then its an error
if (!curRef.getDeclared() && curRef.getUsed() && fValidate)
fValidator->emitError(XMLValid::IDNotDeclared, curRef.getRefName());
}
}
//
//  Simple sanity check that the passed progressive scan token is legal for
//  this scanner: both the scanner id and the sequence id stored in the
//  token must equal this scanner's current values.
//
bool XMLScanner::isLegalToken(const XMLPScanToken& toCheck)
{
    // A mismatching scanner id rules the token out right away
    if (fScannerId != toCheck.fScannerId)
        return false;

    // Same scanner id, so it all comes down to the sequence id
    return (fSequenceId == toCheck.fSequenceId);
}
//
// This method will take a raw attribute value and normalize it according to
// the rules of the attribute type. It will put the resulting value into the
// passed buffer, which is emptied first.
//
// This code assumes that escaped characters in the original value (via char
// refs) are prefixed by a 0xFFFF character. This is because some characters
// are legal if escaped only. And some escape chars are not subject to
// normalization rules.
//
// attDef   Declaration supplying the attribute type, its full name (used in
//          error messages) and whether it was declared externally
// value    Null terminated raw value to normalize
// toFill   Receives the normalized value
//
// Returns false if an unescaped '<' was found in the value (an error is
// emitted for it), true otherwise.
//
bool XMLScanner::normalizeAttValue( const XMLAttDef* const attDef
, const XMLCh* const value
, XMLBuffer& toFill)
{
// A simple state value for a whitespace processing state machine. It is
// only consulted for the non-CDATA style types below.
enum States
{
InWhitespace
, InContent
};
// Get the type and name
const XMLAttDef::AttTypes type = attDef->getType();
const XMLCh* const attrName = attDef->getFullName();
// Assume its going to go fine, and empty the target buffer in preparation
bool retVal = true;
toFill.reset();
//
// Find out whether the attribute was declared externally. That matters
// for the standalone validity checks done below.
//
bool isAttExternal = attDef->isExternal();
//
// Loop through the chars of the source value and normalize it according
// to the type.
//
// firstNonWS records that at least one non-whitespace char has been seen,
// so a separating space is only emitted between words and never before
// the first one.
//
States curState = InContent;
bool escaped;
bool firstNonWS = false;
XMLCh nextCh;
const XMLCh* srcPtr = value;
while (*srcPtr)
{
//
// Get the next character from the source. We have to watch for
// escaped characters (which are indicated by a 0xFFFF value followed
// by the char that was escaped.)
//
// NOTE(review): a 0xFFFF marker at the very end of the value would make
// the increment at the bottom of the loop step past the terminator.
// Presumably the scanner never produces that - confirm.
//
nextCh = *srcPtr;
escaped = (nextCh == 0xFFFF);
if (escaped)
nextCh = *++srcPtr;
//
// If its not escaped, then make sure its not a < character, which is
// not allowed in attribute values. Processing continues after the error
// so the char still ends up in the output.
//
if (!escaped && (*srcPtr == chOpenAngle))
{
emitError(XMLErrs::BracketInAttrValue, attrName);
retVal = false;
}
//
// CDATA (and any type value beyond Notation - presumably types that are
// not DTD tokenized types, TODO confirm) only gets unescaped tab, LF and
// CR replaced by a space. Nothing is collapsed or trimmed.
//
if (type == XMLAttDef::CData || type > XMLAttDef::Notation)
{
if (!escaped)
{
if ((nextCh == 0x09) || (nextCh == 0x0A) || (nextCh == 0x0D))
{
//
// Check Validity Constraint for Standalone document declaration
// XML 1.0, Section 2.9
//
if (fStandalone && fValidate && isAttExternal)
{
//
// Can't have a standalone document declaration of "yes" if attribute
// values are subject to normalisation
//
fValidator->emitError(XMLValid::NoAttNormForStandalone, attrName);
}
nextCh = chSpace;
}
}
}
else
{
//
// All other types: leading and trailing whitespace is dropped and each
// inner run of whitespace becomes a single space. Note that the
// whitespace test here does not look at the escaped flag.
//
if (curState == InWhitespace)
{
if (!XMLReader::isWhitespace(nextCh))
{
// End of a whitespace run. Emit the one separating space, but only
// if some content came before the run (i.e. it was not leading).
if (firstNonWS)
toFill.append(chSpace);
curState = InContent;
firstNonWS = true;
}
else
{
// Still inside the whitespace run, so just swallow the char
srcPtr++;
continue;
}
}
else if (curState == InContent)
{
if (XMLReader::isWhitespace(nextCh))
{
// Start of a whitespace run. Nothing is output for it yet; the
// pointer is advanced here so the check below can look at the
// char that follows.
curState = InWhitespace;
srcPtr++;
//
// Check Validity Constraint for Standalone document declaration
// XML 1.0, Section 2.9
//
if (fStandalone && fValidate && isAttExternal)
{
// Normalization changes the value if this is leading whitespace,
// is not a plain space, is trailing, or is followed by more
// whitespace.
if (!firstNonWS || (nextCh != chSpace) || (!*srcPtr) || XMLReader::isWhitespace(*srcPtr))
{
//
// Can't have a standalone document declaration of "yes" if attribute
// values are subject to normalisation
//
fValidator->emitError(XMLValid::NoAttNormForStandalone, attrName);
}
}
continue;
}
firstNonWS = true;
}
}
// Add this char to the target buffer
toFill.append(nextCh);
// And move up to the next character in the source
srcPtr++;
}
return retVal;
}
//
//  This method will just normalize the input value as CDATA without any
//  standalone checking: every unescaped whitespace character is replaced
//  by a single space, and nothing is collapsed or trimmed.
//
//  Escaped characters in the raw value (those that came from char refs) are
//  expected to be prefixed by a 0xFFFF marker. The marker itself is dropped
//  and the character following it is copied through untouched.
//
//  attrName    Attribute name, only used in error messages
//  value       Null terminated raw value to normalize
//  toFill      Receives the normalized value; it is emptied first
//
//  Returns false if an unescaped '<' was found in the value (an error is
//  emitted for it), true otherwise.
//
bool XMLScanner::normalizeAttRawValue(  const   XMLCh* const        attrName
                                        , const XMLCh* const        value
                                        ,       XMLBuffer&          toFill)
{
    // Assume its going to go fine, and empty the target buffer in preparation
    bool retVal = true;
    toFill.reset();

    // Loop through the chars of the source value and normalize each one
    const XMLCh* srcPtr = value;
    while (*srcPtr)
    {
        //
        //  Get the next character from the source. We have to watch for
        //  escaped characters (which are indicated by a 0xFFFF value
        //  followed by the char that was escaped.)
        //
        XMLCh nextCh = *srcPtr;
        const bool escaped = (nextCh == 0xFFFF);
        if (escaped)
        {
            nextCh = *++srcPtr;

            //
            //  A marker with nothing after it means we are already sitting
            //  on the terminator. Stop here rather than advancing past the
            //  end of the string below.
            //
            if (!nextCh)
                break;
        }

        //
        //  If its not escaped, then make sure its not a < character, which
        //  is not allowed in attribute values. We keep going after the
        //  error, so the char still ends up in the output.
        //
        if (!escaped && (nextCh == chOpenAngle))
        {
            emitError(XMLErrs::BracketInAttrValue, attrName);
            retVal = false;
        }

        //
        //  NOTE: Yes this is a little redundant in that a 0x20 is replaced
        //  with an 0x20. But its faster to do this (I think) than checking
        //  for 9, A, and D separately.
        //
        if (!escaped && XMLReader::isWhitespace(nextCh))
            nextCh = chSpace;

        // Add this char to the target buffer and move on to the next one
        toFill.append(nextCh);
        srcPtr++;
    }
    return retVal;
}
//
//  Maps a namespace prefix to the id of the URI it is bound to.
//
//  prefix  The prefix to look up
//  mode    Handed through to the element stack's prefix mapping
//
//  Returns the URI id. An UnknownPrefix error is emitted when the element
//  stack reports the prefix as unknown; the id it handed back is still
//  returned in that case.
//
unsigned int
XMLScanner::resolvePrefix(  const   XMLCh* const        prefix
                            , const ElemStack::MapModes mode)
{
    //
    //  The special namespace prefixes never go through the element stack.
    //  'xmlns' maps to a place holder URI that we define (so that it maps
    //  to something checkable), and 'xml' maps to the official URI that the
    //  NS spec defines for it.
    //
    if (XMLString::compareString(prefix, XMLUni::fgXMLNSString) == 0)
        return fXMLNSNamespaceId;

    if (XMLString::compareString(prefix, XMLUni::fgXMLString) == 0)
        return fXMLNamespaceId;

    // Anything else is searched for up the element stack
    bool wasUnknown;
    const unsigned int mappedId = fElemStack.mapPrefixToURI(prefix, mode, wasUnknown);

    // An unknown prefix gets a faked in URI, but it is still an error
    if (wasUnknown)
        emitError(XMLErrs::UnknownPrefix, prefix);

    return mappedId;
}
//
//  Maps a namespace prefix to the id of the URI it is bound to, and also
//  fetches the text of that URI into a caller supplied buffer.
//
//  prefix      The prefix to look up
//  bufToFill   Receives the URI text via getURIText(). It is left untouched
//              when the prefix is one of the special 'xmlns' / 'xml'
//              prefixes, since those return before the text is fetched.
//  mode        Handed through to the element stack's prefix mapping
//
//  Returns the URI id. An UnknownPrefix error is emitted when the element
//  stack reports the prefix as unknown; the id it handed back is still
//  returned in that case.
//
unsigned int
XMLScanner::resolvePrefix(  const   XMLCh* const        prefix
                            ,       XMLBuffer&          bufToFill
                            , const ElemStack::MapModes mode)
{
    //
    //  The special namespace prefixes never go through the element stack.
    //  'xmlns' maps to a place holder URI that we define (so that it maps
    //  to something checkable), and 'xml' maps to the official URI that the
    //  NS spec defines for it.
    //
    if (XMLString::compareString(prefix, XMLUni::fgXMLNSString) == 0)
        return fXMLNSNamespaceId;

    if (XMLString::compareString(prefix, XMLUni::fgXMLString) == 0)
        return fXMLNamespaceId;

    // Anything else is searched for up the element stack
    bool wasUnknown;
    const unsigned int mappedId = fElemStack.mapPrefixToURI(prefix, mode, wasUnknown);

    // An unknown prefix gets a faked in URI, but it is still an error
    if (wasUnknown)
        emitError(XMLErrs::UnknownPrefix, prefix);

    // Hand the URI text back as well
    getURIText(mappedId, bufToFill);
    return mappedId;
}
//
// This method will reset the scanner data structures, and related plugged
// in stuff, for a new scan session. We get the input source for the primary
// XML entity, create the reader for it, and push it on the stack so that
// upon successful return from here we are ready to go.
//
void XMLScanner::scanReset(const InputSource& src)
{
//
// This call implicitly tells us that we are going to reuse the scanner
// if it was previously used. So tell the validator to reset itself.
//
// But, if the fReuseGrammar flag is set, then don't reset it.
//
// NOTE: The ReaderMgr is flushed on the way out, because that is
// required to insure that files are closed.
//
if (!fReuseGrammar) {
fGrammarResolver->reset();
resetEntityDeclPool();
if (fDoNamespaces)
resetURIStringPool();
// create a default grammar first
fGrammar = new DTDGrammar();
//
if (fValidatorFromUser) {
if (fValidator->handlesDTD())
fValidator->setGrammar(fGrammar);
}
else {
// set fValidator as fDTDValidator
fValidator = fDTDValidator;
fValidator->setGrammar(fGrammar);
}
fGrammarType = fGrammar->getGrammarType();
fGrammarResolver->putGrammar(XMLUni::fgZeroLenString, fGrammar);
if (fValScheme == Val_Auto) {
fValidate = false;
}
}
else {
// reusing grammar, thus the fGrammar must pre-exist already
// make sure the validator handles this reuse grammar type
if (fGrammarType == Grammar::SchemaGrammarType && !fValidator->handlesSchema()) {
if (fValidatorFromUser)
ThrowXML(RuntimeException, XMLExcepts::Gen_NoSchemaValidator);
else {
fValidator = fSchemaValidator;
}
}
else if (fGrammarType == Grammar::DTDGrammarType && !fValidator->handlesDTD()) {
if (fValidatorFromUser)
ThrowXML(RuntimeException, XMLExcepts::Gen_NoDTDValidator);
else {
fValidator = fDTDValidator;
}
}
if (!fValidator->getGrammar())
fValidator->setGrammar(fGrammar);
}
//
// And for all installed handlers, send reset events. This gives them
// a chance to flush any cached data.
//
if (fDocHandler)
fDocHandler->resetDocument();
if (fEntityHandler)
fEntityHandler->resetEntities();
if (fErrorReporter)
fErrorReporter->resetErrors();
// Clear out the id reference list
fIDRefList->removeAll();
// Reset the Root Element Name
delete [] fRootElemName;
fRootElemName = 0;
// Reset IdentityConstraints
fMatcherStack->clear();
//
// Reset the element stack, and give it the latest ids for the special
// URIs it has to know about.
//
fElemStack.reset
(
fEmptyNamespaceId
, fUnknownNamespaceId
, fXMLNamespaceId
, fXMLNSNamespaceId
);
// Reset some status flags
fInException = false;
fStandalone = false;
fErrorCount = 0;
fHasNoDTD = true;
fSeeXsi = false;
// Reset the validators
fDTDValidator->reset();
fSchemaValidator->reset();
if (fValidatorFromUser)
fValidator->reset();
//
// Handle the creation of the XML reader object for this input source.
// This will provide us with transcoding and basic lexing services.
//
XMLReader* newReader = fReaderMgr.createReader
(
src
, true
, XMLReader::RefFrom_NonLiteral
, XMLReader::Type_General
, XMLReader::Source_External
);
if (!newReader) {
if (src.getIssueFatalErrorIfNotFound())
ThrowXML1(RuntimeException, XMLExcepts::Scan_CouldNotOpenSource, src.getSystemId());
else
ThrowXML1(RuntimeException, XMLExcepts::Scan_CouldNotOpenSource_Warning, src.getSystemId());
}
// Push this read onto the reader manager
fReaderMgr.pushReader(newReader, 0);
}
//
//  Delivers the character data accumulated in the passed buffer and then
//  empties the buffer. An empty buffer is a no-op.
//
//  When not validating, the data always goes out as plain characters. When
//  validating, what happens depends on the character data options of the
//  element on top of the element stack: the data is sent as characters,
//  sent as ignorable whitespace, or reported as an error.
//
//  For schema grammars the active identity constraint matchers are fed the
//  characters as well, whenever they are sent out as character data.
//
//  toSend  The buffered character data; it is reset before returning
//
void XMLScanner::sendCharData(XMLBuffer& toSend)
{
    // If no data in the buffer, then nothing to do
    if (toSend.isEmpty())
        return;

    if (!fValidate)
    {
        // Call all active identity constraints
        if (fGrammarType == Grammar::SchemaGrammarType)
        {
            const unsigned int matcherCount = fMatcherStack->getMatcherCount();
            for (unsigned int m = 0; m < matcherCount; m++)
                fMatcherStack->getMatcherAt(m)->docCharacters(toSend.getRawBuffer(), toSend.getLen());
        }

        // Always assume its just char data if not validating
        if (fDocHandler)
            fDocHandler->docCharacters(toSend.getRawBuffer(), toSend.getLen(), false);
    }
    else
    {
        // Get the raw data we need for the callback
        const XMLCh* const rawBuf = toSend.getRawBuffer();
        const unsigned int len = toSend.getLen();

        // Get the character data opts for the current (top) element
        const ElemStack::StackElem* topElem = fElemStack.topElement();
        XMLElementDecl::CharDataOpts charOpts = topElem->fThisElement->getCharDataOpts();

        if (charOpts == XMLElementDecl::NoCharData)
        {
            // They definitely cannot handle any type of char data
            fValidator->emitError(XMLValid::NoCharDataInCM);
        }
        else
        {
            const bool onlySpaces = XMLReader::isAllSpaces(rawBuf, len);

            if (onlySpaces && (charOpts == XMLElementDecl::SpacesOk))
            {
                // All spaces and they can take spaces: ignorable whitespace
                if (fDocHandler)
                    fDocHandler->ignorableWhitespace(rawBuf, len, false);
            }
            else if (charOpts == XMLElementDecl::AllCharData)
            {
                // They can take any char data, whitespace only or not
                if (fGrammarType != Grammar::SchemaGrammarType)
                {
                    if (fDocHandler)
                        fDocHandler->docCharacters(rawBuf, len, false);
                }
                else
                {
                    // The normalized data can only be as large as the
                    // original size, so this will avoid allocating way
                    // too much or too little memory.
                    XMLBuffer toFill(len+1);

                    // Normalize the characters according to schema whitespace facet
                    DatatypeValidator* tempDV = ((SchemaElementDecl*) topElem->fThisElement)->getDatatypeValidator();
                    ((SchemaValidator*) fValidator)->normalizeWhiteSpace(tempDV, rawBuf, toFill);

                    // Call all active identity constraints
                    const unsigned int matcherCount = fMatcherStack->getMatcherCount();
                    for (unsigned int m = 0; m < matcherCount; m++)
                        fMatcherStack->getMatcherAt(m)->docCharacters(toFill.getRawBuffer(), toFill.getLen());

                    if (fDocHandler)
                        fDocHandler->docCharacters(toFill.getRawBuffer(), toFill.getLen(), false);
                }
            }
            else if (!onlySpaces)
            {
                // Real character data, but they can only handle whitespace
                fValidator->emitError(XMLValid::NoCharDataInCM);
            }
        }
    }

    // Reset buffer
    toSend.reset();
}
//
//  Figures out what the next top level token in the input stream is, and
//  returns an enumerated value saying what it believes the next XML level
//  token must be. It eats as many chars as are required to decide.
//
//  orgReader   Set to the number of the reader in which the opening '<' was
//              seen. It is only written when the token is some form of
//              markup, i.e. not for Token_EOF and Token_CharData.
//
XMLScanner::XMLTokens XMLScanner::senseNextToken(unsigned int& orgReader)
{
    //
    //  Peek at the next character and use it to guesstimate what the next
    //  token is going to be. End of entity exceptions are turned on while
    //  we do this, in order to catch the scenario where the current entity
    //  ended at the > of some markup.
    //
    XMLCh peeked;
    {
        ThrowEOEJanitor eoeGuard(&fReaderMgr, true);
        peeked = fReaderMgr.peekNextChar();
    }

    // The most obvious one is end of file, which is legal here at top level
    if (!peeked)
        return Token_EOF;

    //
    //  Anything other than a '<' means we must be in content. That includes
    //  entity references ('&' of some sort), which must be character data
    //  because that's the only place a reference can occur in content.
    //
    if (peeked != chOpenAngle)
        return Token_CharData;

    //
    //  So it was a '<'. Get it out of the reader and store the reader
    //  number where we saw it, passing it back to the caller.
    //
    fReaderMgr.getNextChar();
    orgReader = fReaderMgr.getCurrentReaderNum();

    // What follows the '<' tells us which form of markup this is
    peeked = fReaderMgr.peekNextChar();

    if (peeked == chForwardSlash)
    {
        fReaderMgr.getNextChar();
        return Token_EndTag;
    }

    if (peeked == chQuestion)
    {
        // It must be a PI
        fReaderMgr.getNextChar();
        return Token_PI;
    }

    //
    //  Without a '!' we assume its an element name, so return with a start
    //  tag token. If it turns out not to be, then it will fail when it
    //  cannot get a valid tag.
    //
    if (peeked != chBang)
        return Token_StartTag;

    // A '!' has to introduce either a CDATA section or a comment
    static const XMLCh cdataIntro[] =
    {
        chBang, chOpenSquare, chLatin_C, chLatin_D, chLatin_A
        , chLatin_T, chLatin_A, chNull
    };

    static const XMLCh commentIntro[] =
    {
        chBang, chDash, chDash, chNull
    };

    if (fReaderMgr.skippedString(cdataIntro))
        return Token_CData;

    if (fReaderMgr.skippedString(commentIntro))
        return Token_Comment;

    emitError(XMLErrs::ExpectedCommentOrCDATA);
    return Token_Unknown;
}
//
//  This method is called with a key/value string pair that represents an
//  xmlns="xxx" or xmlns:xxx="yyy" attribute, and adds the corresponding
//  prefix to URI id mapping to the element stack. The caller guarantees
//  that the attribute is one of those two forms, so it is not re-checked.
//
//  attrName    The raw attribute name ("xmlns" or "xmlns:prefix").
//  attrValue   The raw (not yet normalized) attribute value, i.e. the URI.
//
void XMLScanner::updateNSMap(const  XMLCh* const    attrName
                            , const XMLCh* const    attrValue)
{
    // We need a buffer to normalize the attribute value into
    XMLBufBid bbNormal(&fBufMgr);
    XMLBuffer& normalBuf = bbNormal.getBuffer();

    //
    //  We either have the default prefix (""), or we point into the attr
    //  name parameter just past the colon. Note that "xmlns" is not the
    //  prefix we care about here; to us, the 'prefix' is the local part of
    //  the attrName parameter.
    //
    //  The offset is kept as a signed int so that the "not found" result
    //  of -1 is compared without a signed/unsigned conversion.
    //
    const XMLCh* prefPtr = XMLUni::fgZeroLenString;
    const int colonOfs = XMLString::indexOf(attrName, chColon);
    if (colonOfs != -1)
        prefPtr = &attrName[colonOfs + 1];

    //
    //  Normalize the value into the buffer. The return value is ignored on
    //  purpose: an error has already been issued if something was wrong,
    //  which is all we care about here.
    //
    normalizeAttRawValue(attrName, attrValue, normalBuf);

    //
    //  Find or add the id of the normalized URI in the URI string pool,
    //  then ask the element stack to add this prefix to URI id mapping.
    //
    fElemStack.addPrefix
    (
        prefPtr
        , fURIStringPool->addOrFind(normalBuf.getRawBuffer())
    );
}
void XMLScanner::scanRawAttrListforNameSpaces(const RefVectorOf<KVStringPair>* theRawAttrList, int attCount) {
// Schema Xsi Type yyyy (e.g. xsi:type="yyyyy")
XMLBufBid bbXsi(&fBufMgr);
XMLBuffer& fXsiType = bbXsi.getBuffer();
//
// Make an initial pass through the list and find any xmlns attributes or
// schema attributes.
// When we find one, send it off to be used to update the element stack's
// namespace mappings.
//
int index = 0;
for (index = 0; index < attCount; index++)
{
// each attribute has the prefix:suffix="value"
const KVStringPair* curPair = fRawAttrList->elementAt(index);
const XMLCh* valuePtr = curPair->getValue();
const XMLCh* rawPtr = curPair->getKey();
QName attName(rawPtr, fEmptyNamespaceId);
const XMLCh* suffPtr = attName.getLocalPart();
// If either the key begins with "xmlns:" or its just plain
// "xmlns", then use it to update the map.
//
if (!XMLString::compareNString(rawPtr, XMLUni::fgXMLNSColonString, 6)
|| !XMLString::compareString(rawPtr, XMLUni::fgXMLNSString))
{
updateNSMap(rawPtr, valuePtr);
// if the schema URI is seen in the the valuePtr, set the boolean seeXsi
if (!XMLString::compareString(valuePtr, SchemaSymbols::fgURI_XSI)) {
fSeeXsi = true;
}
}
}
// walk through the list again to deal with "xsi:...."
if (fDoSchema && fSeeXsi)
{
for (index = 0; index < attCount; index++)
{
// each attribute has the prefix:suffix="value"
const KVStringPair* curPair = fRawAttrList->elementAt(index);
const XMLCh* valuePtr = curPair->getValue();
const XMLCh* rawPtr = curPair->getKey();
QName attName(rawPtr, fEmptyNamespaceId);
const XMLCh* prefPtr = attName.getPrefix();
const XMLCh* suffPtr = attName.getLocalPart();
// if schema URI has been seen, scan for the schema location and uri
// and resolve the schema grammar; or scan for schema type
if (resolvePrefix(prefPtr, ElemStack::Mode_Attribute) == fSchemaNamespaceId) {
if (!fReuseGrammar) {
if (!XMLString::compareString(suffPtr, SchemaSymbols::fgXSI_SCHEMALOCACTION))
parseSchemaLocation(valuePtr);
else if (!XMLString::compareString(suffPtr, SchemaSymbols::fgXSI_NONAMESPACESCHEMALOCACTION))
resolveSchemaGrammar(valuePtr, XMLUni::fgZeroLenString);
}
if (!XMLString::compareString(suffPtr, SchemaSymbols::fgXSI_TYPE)) {
fXsiType.set(valuePtr);
}
else if (!XMLString::compareString(suffPtr, SchemaSymbols::fgATT_NILL)
&& fValidator && fValidator->handlesSchema()
&& !XMLString::compareString(valuePtr, SchemaSymbols::fgATTVAL_TRUE)) {
((SchemaValidator*)fValidator)->setNillable(true);
}
}
}
if (fValidator && fValidator->handlesSchema()) {
if (!fXsiType.isEmpty()) {
unsigned int uriId = resolveQName (
fXsiType.getRawBuffer()
, fNameBuf
, fPrefixBuf
, ElemStack::Mode_Element
);
((SchemaValidator*)fValidator)->setXsiType(fPrefixBuf.getRawBuffer(), fNameBuf.getRawBuffer(), uriId);
}
}
}
}
void XMLScanner::parseSchemaLocation(const XMLCh* const schemaLocationStr)
{
RefVectorOf<XMLCh>* schemaLocation = XMLString::tokenizeString(schemaLocationStr);
unsigned int size = schemaLocation->size();
if (size % 2 != 0 ) {
emitError(XMLErrs::BadSchemaLocation);
} else {
for(unsigned int i=0; i<size; i=i+2) {
resolveSchemaGrammar(schemaLocation->elementAt(i+1), schemaLocation->elementAt(i));
}
}
delete schemaLocation;
}
//
// Makes the scanner ready to use a schema grammar for the namespace 'uri'.
//
// If the grammar resolver has no grammar for 'uri' (or only a DTD grammar),
// the schema document at 'loc' is parsed into a DOM tree and, if it has a
// root element, a new SchemaGrammar is built from it via TraverseSchema.
// If a non-DTD grammar is already known, nothing is parsed.
//
// In both cases, once a schema grammar is in play: validation is switched on
// when the validation scheme is Val_Auto, fValidator is switched over to
// fSchemaValidator if the current validator does not handle schema, and, if
// the scanner was still in DTD grammar mode, the grammar becomes the current
// one.
//
// loc The location (system id) of the schema document.
// uri The namespace the schema is expected to describe.
//
// Throws a RuntimeException (Gen_NoSchemaValidator) if a user supplied
// validator is installed that does not handle schema.
//
void XMLScanner::resolveSchemaGrammar(const XMLCh* const loc, const XMLCh* const uri) {
// See what the grammar resolver already knows about this namespace
Grammar* grammar = fGrammarResolver->getGrammar(uri);
if (!grammar || grammar->getGrammarType() == Grammar::DTDGrammarType) {
// No schema grammar yet, so parse the schema document into a DOM tree
// with validation off and namespaces on. Errors are routed through an
// internal handler constructed from fErrorHandler, which is queried
// for fatal errors after the parse.
IDOMParser parser;
XMLInternalErrorHandler internalErrorHandler(fErrorHandler);
parser.setValidationScheme(IDOMParser::Val_Never);
parser.setDoNamespaces(true);
parser.setErrorHandler((ErrorHandler*) &internalErrorHandler);
parser.setEntityResolver(fEntityResolver);
// Create a buffer for expanding the system id
XMLBufBid bbSys(&fBufMgr);
XMLBuffer& expSysId = bbSys.getBuffer();
//
// Allow the entity handler to expand the system id if they choose
// to do so. If there is no handler, or it declines, use loc as is.
//
if (fEntityHandler)
{
if (!fEntityHandler->expandSystemId(loc, expSysId))
expSysId.set(loc);
}
else
{
expSysId.set(loc);
}
// Call the entity resolver interface to get an input source
InputSource* srcToFill = 0;
if (fEntityResolver)
{
srcToFill = fEntityResolver->resolveEntity
(
XMLUni::fgZeroLenString
, expSysId.getRawBuffer()
);
}
//
// If they didn't create a source via the entity resolver, then we
// have to create one on our own. The system id of the last external
// entity is passed along with the expanded system id.
//
if (!srcToFill)
{
ReaderMgr::LastExtEntityInfo lastInfo;
fReaderMgr.getLastExtEntityInfo(lastInfo);
try
{
XMLURL urlTmp(lastInfo.systemId, expSysId.getRawBuffer());
// A URL that is still relative is treated like a malformed one, so
// that the handler below falls back to a local file source.
if (urlTmp.isRelative())
{
ThrowXML
(
MalformedURLException
, XMLExcepts::URL_NoProtocolPresent
);
}
srcToFill = new URLInputSource(urlTmp);
}
catch(const MalformedURLException&)
{
// Its not a URL, so lets assume its a local file name.
srcToFill = new LocalFileInputSource
(
lastInfo.systemId
, expSysId.getRawBuffer()
);
}
}
// Put a janitor on the input source
Janitor<InputSource> janSrc(srcToFill);
// Should just issue warning if the schema is not found, so turn off the
// source's fatal-if-not-found flag for the duration of the parse.
const bool flag = srcToFill->getIssueFatalErrorIfNotFound();
srcToFill->setIssueFatalErrorIfNotFound(false);
parser.parse(*srcToFill) ;
// Reset the InputSource
srcToFill->setIssueFatalErrorIfNotFound(flag);
// Report a fatal error seen while parsing the schema document, but only
// when the scanner is set to exit on the first fatal error.
if (internalErrorHandler.getSawFatal() && fExitOnFirstFatal)
emitError(XMLErrs::SchemaScanFatalError);
IDOM_Document* document = parser.getDocument(); //Our Grammar
if (document != 0) {
IDOM_Element* root = document->getDocumentElement();// This is what we pass to TraverserSchema
if (root != 0)
{
// The schema's own targetNamespace wins over the uri we were given. If
// they differ, complain (when validating) and look the grammar up again
// under the namespace the schema actually declares.
const XMLCh* newUri = root->getAttribute(SchemaSymbols::fgATT_TARGETNAMESPACE);
if (XMLString::compareString(newUri, uri)) {
if (fValidate)
fValidator->emitError(XMLValid::WrongTargetNamespace, loc, uri);
grammar = fGrammarResolver->getGrammar(newUri);
}
if (!grammar || grammar->getGrammarType() == Grammar::DTDGrammarType) {
//
// Since we have seen a grammar, set our validation flag
// at this point if the validation scheme is auto
//
if (fValScheme == Val_Auto && !fValidate) {
fValidate = true;
fElemStack.setValidationFlag(fValidate);
}
// we have seen a schema, so set up the fValidator as fSchemaValidator
if (!fValidator->handlesSchema())
{
if (fValidatorFromUser) {
// the fValidator is from user
ThrowXML(RuntimeException, XMLExcepts::Gen_NoSchemaValidator);
}
else {
fValidator = fSchemaValidator;
}
}
// Build the new schema grammar from the DOM tree.
// NOTE(review): nothing in this function deletes the new grammar;
// presumably it is registered with fGrammarResolver during the
// traversal -- confirm against TraverseSchema.
grammar = new SchemaGrammar();
TraverseSchema traverseSchema(root, fURIStringPool, (SchemaGrammar*) grammar, fGrammarResolver, this, fValidator, srcToFill->getSystemId(), fEntityResolver, fErrorHandler);
// If we were in DTD mode up to now, this grammar becomes the current one
if (fGrammarType == Grammar::DTDGrammarType) {
fGrammar = grammar;
fGrammarType = Grammar::SchemaGrammarType;
fValidator->setGrammar(fGrammar);
}
if (!fReuseGrammar && fValidate) {
// validate the Schema scan so far
fValidator->preContentValidation(fReuseGrammar);
}
}
}
}
}
else {
// A non-DTD grammar is already known for this namespace, so nothing is
// parsed. Only the scanner state is brought in line with it.
//
// Since we have seen a grammar, set our validation flag
// at this point if the validation scheme is auto
//
if (fValScheme == Val_Auto && !fValidate) {
fValidate = true;
fElemStack.setValidationFlag(fValidate);
}
// we have seen a schema, so set up the fValidator as fSchemaValidator
if (!fValidator->handlesSchema())
{
if (fValidatorFromUser) {
// the fValidator is from user
ThrowXML(RuntimeException, XMLExcepts::Gen_NoSchemaValidator);
}
else {
fValidator = fSchemaValidator;
}
}
// If we were in DTD mode up to now, this grammar becomes the current one
if (fGrammarType == Grammar::DTDGrammarType) {
fGrammar = grammar;
fGrammarType = Grammar::SchemaGrammarType;
fValidator->setGrammar(fGrammar);
}
}
}
// ---------------------------------------------------------------------------
// XMLScanner: Private parsing methods
// ---------------------------------------------------------------------------
//
//  Scans a single or double quoted string and places the characters between
//  the quotes into toFill (which is reset first). No judgement is passed on
//  the contents, other than that a quote of the kind that opened the string
//  ends it.
//
//  Returns false if the next character is not a quote, or if a null (end of
//  file) character shows up before the closing quote; the higher level code
//  that called us deals with that more gracefully than we could here.
//
//  NOTE: This is for simple stuff like the strings in the XMLDecl, which
//  cannot have any entities inside them. So no end of entity handling is
//  done here.
//
bool XMLScanner::getQuotedString(XMLBuffer& toFill)
{
    // Start from an empty target buffer
    toFill.reset();

    // The string has to open with a single or double quote
    XMLCh openQuote;
    if (!fReaderMgr.skipIfQuote(openQuote))
        return false;

    // Collect characters until the matching quote shows up
    for (XMLCh curCh = fReaderMgr.getNextChar();
         curCh != openQuote;
         curCh = fReaderMgr.getNextChar())
    {
        // A null char means we ran out of input inside the string
        if (!curCh)
            return false;

        toFill.append(curCh);
    }
    return true;
}
//
// This method is called to do a raw scan of an attribute value. It does not
// do normalization (since we don't know their types yet.) It just scans the
// value and does entity expansion.
//
// End of entity's must be dealt with here. During DTD scan, they can come
// from external entities. During content, they can come from any entity.
// We just eat the end of entity and continue with our scan until we come
// to the closing quote. If an unterminated value causes us to go through
// subsequent entities, that will cause errors back in the calling code,
// but there's little we can do about it here.
//
// attrName The attribute name; only used in the invalid character error.
// toFill Reset on entry, then filled with the scanned value. A character
// that came in via an escape is preceded by a 0xFFFF marker.
//
// Returns true when the closing quote is found in the reader the value
// started in. Returns false if there is no opening quote, or if a closing
// quote shows up in an earlier reader (PartialMarkupInEntity is issued).
// Throws UnexpectedEOFException if a null char is read.
//
bool XMLScanner::basicAttrValueScan(const XMLCh* const attrName, XMLBuffer& toFill)
{
// Reset the target buffer
toFill.reset();
// Get the next char which must be a single or double quote
XMLCh quoteCh;
if (!fReaderMgr.skipIfQuote(quoteCh))
return false;
//
// We have to get the current reader because we have to ignore closing
// quotes until we hit the same reader again.
//
const unsigned int curReader = fReaderMgr.getCurrentReaderNum();
//
// Loop until we get the attribute value. Note that we use a double
// loop here to avoid the setup/teardown overhead of the exception
// handler on every round.
//
XMLCh nextCh;
// Second half of a surrogate pair handed back by scanEntityRef, if any
XMLCh secondCh = 0;
bool gotLeadingSurrogate = false;
bool escaped;
while (true)
{
try
{
while(true)
{
// Get another char. Use second char if one is waiting
if (secondCh)
{
nextCh = secondCh;
secondCh = 0;
}
else
{
nextCh = fReaderMgr.getNextChar();
}
if (!nextCh)
ThrowXML(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF);
//
// Check for our ending quote. It has to be in the same entity
// as where we started. Quotes in nested entities are ignored.
//
if (nextCh == quoteCh)
{
if (curReader == fReaderMgr.getCurrentReaderNum())
return true;
// Watch for spillover into a previous entity
if (curReader > fReaderMgr.getCurrentReaderNum())
{
emitError(XMLErrs::PartialMarkupInEntity);
return false;
}
}
//
// Check for an entity ref. We ignore the empty flag in
// this one.
//
escaped = false;
if (nextCh == chAmpersand)
{
// If it was not returned directly, then jump back up
if (scanEntityRef(true, nextCh, secondCh, escaped) != EntityExp_Returned)
{
gotLeadingSurrogate = false;
continue;
}
}
// Deal with surrogate pairs
if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF))
{
//
// Its a leading surrogate. If we already got one, then
// issue an error, else set leading flag to make sure that
// we look for a trailing next time.
//
if (gotLeadingSurrogate)
{
emitError(XMLErrs::Expected2ndSurrogateChar);
}
else
gotLeadingSurrogate = true;
}
else
{
//
// If its a trailing surrogate, make sure that we are
// prepared for that. Else, its just a regular char so make
// sure that we were not expecting a trailing surrogate.
//
if ((nextCh >= 0xDC00) && (nextCh <= 0xDFFF))
{
// Its trailing, so make sure we were expecting it
if (!gotLeadingSurrogate)
emitError(XMLErrs::Unexpected2ndSurrogateChar);
}
else
{
//
// Its just a char, so make sure we were not expecting a
// trailing surrogate.
//
if (gotLeadingSurrogate) {
emitError(XMLErrs::Expected2ndSurrogateChar);
}
// Its got to at least be a valid XML character. Note that
// this check is skipped when the surrogate error above was
// issued for this char.
else if (!XMLReader::isXMLChar(nextCh))
{
// Format the offending char's value in radix 16 for the message
XMLCh tmpBuf[9];
XMLString::binToText
(
nextCh
, tmpBuf
, 8
, 16
);
emitError(XMLErrs::InvalidCharacterInAttrValue, attrName, tmpBuf);
}
}
gotLeadingSurrogate = false;
}
//
// If it was escaped, then put in a 0xFFFF value. This will
// be used later during validation and normalization of the
// value to know that the following character was via an
// escape char.
//
if (escaped)
toFill.append(0xFFFF);
// Else add it to the buffer
toFill.append(nextCh);
}
}
catch(const EndOfEntityException&)
{
// Just eat it and continue.
gotLeadingSurrogate = false;
escaped = false;
}
}
// Not reached: the loop above is only left via a return or an exception
return true;
}
//
// Scans a quoted attribute value for an attribute whose definition is
// known, normalizing it on the fly according to the attribute's type:
//
// - CDATA type: an unescaped tab, line feed or carriage return is replaced
// by a space.
// - Any other type: runs of whitespace are compressed to a single space,
// and leading and trailing whitespace is dropped.
//
// Entity references are expanded via scanEntityRef. End of entity
// exceptions are eaten and the scan continues.
//
// attDef The attribute definition; supplies the type, the full name
// (for error messages) and whether it was declared externally.
// toFill Reset on entry, then filled with the normalized value.
//
// Returns true when the closing quote is found in the reader the value
// started in. Returns false if there is no opening quote, or if a closing
// quote shows up in an earlier reader (PartialMarkupInEntity is issued).
// Throws UnexpectedEOFException if a null char is read.
//
bool XMLScanner::scanAttValue( const XMLAttDef* const attDef
, XMLBuffer& toFill)
{
// States of the whitespace normalization done for non-CDATA types
enum States
{
InWhitespace
, InContent
};
// Get the type and name
const XMLAttDef::AttTypes type = attDef->getType();
const XMLCh* const attrName = attDef->getFullName();
// Reset the target buffer
toFill.reset();
// Get the next char which must be a single or double quote
XMLCh quoteCh;
if (!fReaderMgr.skipIfQuote(quoteCh))
return false;
//
// We have to get the current reader because we have to ignore closing
// quotes until we hit the same reader again.
//
const unsigned int curReader = fReaderMgr.getCurrentReaderNum();
//
// Get attribute def - to check to see if it's declared externally or not
//
bool isAttExternal = attDef->isExternal();
//
// Loop until we get the attribute value. Note that we use a double
// loop here to avoid the setup/teardown overhead of the exception
// handler on every round.
//
XMLCh nextCh;
// Second half of a surrogate pair handed back by scanEntityRef, if any
XMLCh secondCh = 0;
States curState = InContent;
// Set once the first content (non-whitespace) char of a non-CDATA value
// has been seen; until then whitespace is leading and just dropped.
bool firstNonWS = false;
bool gotLeadingSurrogate = false;
bool escaped;
while (true)
{
try
{
while(true)
{
// Get another char. Use second char if one is waiting
if (secondCh)
{
nextCh = secondCh;
secondCh = 0;
}
else
{
nextCh = fReaderMgr.getNextChar();
}
if (!nextCh)
ThrowXML(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF);
// Check for our ending quote in the same entity
if (nextCh == quoteCh)
{
if (curReader == fReaderMgr.getCurrentReaderNum())
return true;
// Watch for spillover into a previous entity
if (curReader > fReaderMgr.getCurrentReaderNum())
{
emitError(XMLErrs::PartialMarkupInEntity);
return false;
}
}
//
// Check for an entity ref now, before we let it affect our
// whitespace normalization logic below. We ignore the empty flag
// in this one.
//
escaped = false;
if (nextCh == chAmpersand)
{
// If it was not returned directly, go around for the next char
if (scanEntityRef(true, nextCh, secondCh, escaped) != EntityExp_Returned)
{
gotLeadingSurrogate = false;
continue;
}
}
// Deal with surrogate pairs
if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF))
{
//
// Its a leading surrogate. If we already got one, then
// issue an error, else set leading flag to make sure that
// we look for a trailing next time.
//
if (gotLeadingSurrogate)
emitError(XMLErrs::Expected2ndSurrogateChar);
else
gotLeadingSurrogate = true;
}
else
{
//
// If its a trailing surrogate, make sure that we are
// prepared for that. Else, its just a regular char so make
// sure that we were not expecting a trailing surrogate.
//
if ((nextCh >= 0xDC00) && (nextCh <= 0xDFFF))
{
// Its trailing, so make sure we were expecting it
if (!gotLeadingSurrogate)
emitError(XMLErrs::Unexpected2ndSurrogateChar);
}
else
{
//
// Its just a char, so make sure we were not expecting a
// trailing surrogate.
//
if (gotLeadingSurrogate)
emitError(XMLErrs::Expected2ndSurrogateChar);
// Its got to at least be a valid XML character
if (!XMLReader::isXMLChar(nextCh))
{
// Format the offending char's value in radix 16 for the message
XMLCh tmpBuf[9];
XMLString::binToText
(
nextCh
, tmpBuf
, 8
, 16
);
emitError(XMLErrs::InvalidCharacterInAttrValue, attrName, tmpBuf);
}
}
gotLeadingSurrogate = false;
}
//
// If its not escaped, then make sure its not a < character, which
// is not allowed in attribute values.
//
if (!escaped && (nextCh == chOpenAngle))
emitError(XMLErrs::BracketInAttrValue, attrName);
//
// If the attribute is a CDATA type we do simple replacement of
// tabs and new lines with spaces, if the character is not escaped
// by way of a char ref.
//
// Otherwise, we do the standard non-CDATA normalization of
// compressing whitespace to single spaces and getting rid of leading
// and trailing whitespace.
//
if (type == XMLAttDef::CData)
{
if (!escaped)
{
if ((nextCh == 0x09) || (nextCh == 0x0A) || (nextCh == 0x0D))
{
//
// Check Validity Constraint for Standalone document declaration
// XML 1.0, Section 2.9
//
if (fStandalone && fValidate && isAttExternal)
{
//
// Can't have a standalone document declaration of "yes" if attribute
// values are subject to normalisation
//
fValidator->emitError(XMLValid::NoAttNormForStandalone, attrName);
}
nextCh = chSpace;
}
}
}
else
{
if (curState == InWhitespace)
{
// Whitespace run ends on a non-whitespace char, or on an escaped
// char other than a space.
if ((escaped && nextCh != chSpace) || !XMLReader::isWhitespace(nextCh))
{
// The run is replaced by one space, unless it was leading
if (firstNonWS)
toFill.append(chSpace);
curState = InContent;
firstNonWS = true;
}
else
{
// Still in the whitespace run, so drop this char
continue;
}
}
else if (curState == InContent)
{
// A space (escaped or not), or any other unescaped whitespace,
// starts a whitespace run.
if ((nextCh == chSpace) ||
(XMLReader::isWhitespace(nextCh) && !escaped))
{
curState = InWhitespace;
//
// Check Validity Constraint for Standalone document declaration
// XML 1.0, Section 2.9
//
if (fStandalone && fValidate && isAttExternal)
{
// Normalization changes the value if this is leading whitespace,
// is not a plain space, or is followed by another space.
if (!firstNonWS || (nextCh != chSpace) || (fReaderMgr.lookingAtSpace()))
{
//
// Can't have a standalone document declaration of "yes" if attribute
// values are subject to normalisation
//
fValidator->emitError(XMLValid::NoAttNormForStandalone, attrName);
}
}
continue;
}
firstNonWS = true;
}
}
// Else add it to the buffer
toFill.append(nextCh);
}
}
catch(const EndOfEntityException&)
{
// Just eat it and continue.
gotLeadingSurrogate = false;
escaped = false;
}
}
// Not reached: the loop above is only left via a return or an exception
return true;
}
//
//  This method scans a CDATA section. It collects the characters into one
//  of the temp buffers and, once the closing sequence has been seen, passes
//  them to the active identity constraint matchers (schema grammar only)
//  and to the document handler, if any. It assumes that the <![CDATA string
//  has been scanned before this call, so the next char should be the second
//  opening square bracket.
//
//  Throws UnexpectedEOFException (after issuing UnterminatedCDATASection)
//  if the input ends before the closing sequence.
//
void XMLScanner::scanCDSection()
{
    //
    //  The closing sequence minus its first ']'. It is checked for each
    //  time a ']' has just been read.
    //
    static const XMLCh CDataClose[] =
    {
        chCloseSquare, chCloseAngle, chNull
    };

    //
    //  The next character should be the opening square bracket. If not
    //  issue an error, but then try to recover by skipping any whitespace
    //  and checking again.
    //
    if (!fReaderMgr.skippedChar(chOpenSquare))
    {
        emitError(XMLErrs::ExpectedOpenSquareBracket);
        fReaderMgr.skipPastSpaces();

        // If we still don't find it, then give up, else keep going
        if (!fReaderMgr.skippedChar(chOpenSquare))
            return;
    }

    // Get a buffer for this
    XMLBufBid bbCData(&fBufMgr);

    //
    //  We just scan forward until we hit the end of CDATA section sequence.
    //  CDATA is effectively a big escape mechanism so we don't treat markup
    //  characters specially here.
    //
    bool emittedError = false;
    while (true)
    {
        const XMLCh nextCh = fReaderMgr.getNextChar();

        // Watch for unexpected end of file
        if (!nextCh)
        {
            emitError(XMLErrs::UnterminatedCDATASection);
            ThrowXML(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF);
        }

        if (fValidate && fStandalone && (XMLReader::isWhitespace(nextCh)))
        {
            //
            //  This document is standalone; this ignorable CDATA whitespace
            //  is forbidden. XML 1.0, Section 2.9
            //
            //  See if the current element was declared externally and has a
            //  'Children' style content model.
            //
            const ElemStack::StackElem* topElem = fElemStack.topElement();
            if (topElem->fThisElement->isExternal())
            {
                // Get the character data opts for the current element
                XMLElementDecl::CharDataOpts charOpts = topElem->fThisElement->getCharDataOpts();

                if (charOpts == XMLElementDecl::SpacesOk) // Element Content
                {
                    //
                    //  Error - standalone should have a value of "no" as
                    //  whitespace was detected in an element type with
                    //  element content whose declaration was external.
                    //
                    fValidator->emitError(XMLValid::NoWSForStandalone);
                }
            }
        }

        //
        //  If this is a close square bracket it could be our closing
        //  sequence. The rest of the sequence is only eaten if it is all
        //  there; otherwise the ']' is treated as ordinary content below.
        //
        if (nextCh == chCloseSquare && fReaderMgr.skippedString(CDataClose))
        {
            if (fGrammarType == Grammar::SchemaGrammarType)
            {
                // call all active identity constraints
                unsigned int count = fMatcherStack->getMatcherCount();
                for (unsigned int i = 0; i < count; i++)
                {
                    fMatcherStack->getMatcherAt(i)->docCharacters(bbCData.getRawBuffer(), bbCData.getLen());
                }
            }

            // If we have a doc handler, call it
            if (fDocHandler)
            {
                fDocHandler->docCharacters
                (
                    bbCData.getRawBuffer()
                    , bbCData.getLen()
                    , true
                );
            }

            // And we are done
            break;
        }

        //
        //  Make sure its a valid character. But if we've emitted an error
        //  already, don't bother with the overhead since we've already told
        //  them about it.
        //
        if (!emittedError)
        {
            if (!XMLReader::isXMLChar(nextCh))
            {
                // Format the offending char's value in radix 16 for the message
                XMLCh tmpBuf[9];
                XMLString::binToText
                (
                    nextCh
                    , tmpBuf
                    , 8
                    , 16
                );
                emitError(XMLErrs::InvalidCharacter, tmpBuf);
                emittedError = true;
            }
        }

        // Add it to the buffer
        bbCData.append(nextCh);
    }
}
//
// Scans character data (content up to, but not including, the next '<')
// into toUse and sends it on via sendCharData().
//
// Along the way it expands entity references, flushing the text collected
// so far before each one; it reports the ]]> sequence, which is illegal in
// character data; it checks surrogate pairing and XML character validity;
// and when some entity ends, it flushes the collected text and tells the
// document handler (if any) that the entity reference ended.
//
// Before the final send, the standalone validity constraint of XML 1.0
// Section 2.9 is checked against whatever is in the buffer at that point.
//
// toUse The buffer to collect into. It is reset on entry.
//
void XMLScanner::scanCharData(XMLBuffer& toUse)
{
//
// We have to watch for the stupid ]]> sequence, which is illegal in
// character data. So this is a little state machine that handles that.
//
enum States
{
State_Waiting
, State_GotOne
, State_GotTwo
};
// Reset the buffer before we start
toUse.reset();
// Turn on the 'throw at end' flag of the reader manager
ThrowEOEJanitor jan(&fReaderMgr, true);
//
// In order to be more efficient we have to use kind of a deeply nested
// set of blocks here. The outer block puts on a try and catches end of
// entity exceptions. The inner loop is the per-character loop. If we
// put the try inside the inner loop, it would work but would require
// the exception handling code setup/teardown code to be invoked for
// each character.
//
XMLCh nextCh;
// Second half of a surrogate pair handed back by scanEntityRef, if any
XMLCh secondCh = 0;
States curState = State_Waiting;
bool escaped = false;
bool gotLeadingSurrogate = false;
bool notDone = true;
while (notDone)
{
try
{
while (true)
{
if (secondCh)
{
nextCh = secondCh;
secondCh = 0;
}
else
{
// Eat through as many plain content characters as possible without
// needing special handling. Moving most content characters here,
// in this one call, rather than running the overall loop once
// per content character, is a speed optimization.
//
// This is only done when we are not in the middle of a possible
// ]]> sequence or surrogate pair.
//
if (curState == State_Waiting && !gotLeadingSurrogate)
{
fReaderMgr.movePlainContentChars(toUse);
}
// Try to get another char from the source
// The code from here on down covers all contingencies.
//
// If the next char is a '<' it is left in place and we are done.
//
if (!fReaderMgr.getNextCharIfNot(chOpenAngle, nextCh))
{
// If we were waiting for a trailing surrogate, its an error
if (gotLeadingSurrogate)
emitError(XMLErrs::Expected2ndSurrogateChar);
notDone = false;
break;
}
}
//
// Watch for a reference. Note that the escapement mechanism
// is ignored in this content.
//
if (nextCh == chAmpersand)
{
// Flush what we have so far before the reference is expanded
sendCharData(toUse);
// Turn off the throwing at the end of entity during this. Note that
// this janitor shadows the outer 'jan' and only lives for this block.
ThrowEOEJanitor jan(&fReaderMgr, false);
if (scanEntityRef(false, nextCh, secondCh, escaped) != EntityExp_Returned)
{
gotLeadingSurrogate = false;
continue;
}
}
else
{
escaped = false;
}
// Keep the state machine up to date. Escaped chars never take part
// in a ]]> sequence.
if (!escaped)
{
if (nextCh == chCloseSquare)
{
// A further ']' while in State_GotTwo leaves us in State_GotTwo
if (curState == State_Waiting)
curState = State_GotOne;
else if (curState == State_GotOne)
curState = State_GotTwo;
}
else if (nextCh == chCloseAngle)
{
if (curState == State_GotTwo)
emitError(XMLErrs::BadSequenceInCharData);
curState = State_Waiting;
}
else
{
curState = State_Waiting;
}
}
else
{
curState = State_Waiting;
}
// Deal with surrogate pairs
if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF))
{
//
// Its a leading surrogate. If we already got one, then
// issue an error, else set leading flag to make sure that
// we look for a trailing next time.
//
if (gotLeadingSurrogate)
emitError(XMLErrs::Expected2ndSurrogateChar);
else
gotLeadingSurrogate = true;
}
else
{
//
// If its a trailing surrogate, make sure that we are
// prepared for that. Else, its just a regular char so make
// sure that we were not expecting a trailing surrogate.
//
if ((nextCh >= 0xDC00) && (nextCh <= 0xDFFF))
{
// Its trailing, so make sure we were expecting it
if (!gotLeadingSurrogate)
emitError(XMLErrs::Unexpected2ndSurrogateChar);
}
else
{
//
// Its just a char, so make sure we were not expecting a
// trailing surrogate.
//
if (gotLeadingSurrogate)
emitError(XMLErrs::Expected2ndSurrogateChar);
// Make sure the returned char is a valid XML char
if (!XMLReader::isXMLChar(nextCh))
{
// Format the offending char's value in radix 16 for the message
XMLCh tmpBuf[9];
XMLString::binToText
(
nextCh
, tmpBuf
, 8
, 16
);
emitError(XMLErrs::InvalidCharacter, tmpBuf);
}
}
gotLeadingSurrogate = false;
}
// Add this char to the buffer
toUse.append(nextCh);
}
}
catch(const EndOfEntityException& toCatch)
{
//
// Some entity ended, so we have to send any accumulated
// chars and send an end of entity event.
//
sendCharData(toUse);
gotLeadingSurrogate = false;
if (fDocHandler)
fDocHandler->endEntityReference(toCatch.getEntity());
}
}
//
// Check the validity constraints as per XML 1.0 Section 2.9
//
const XMLCh* rawBuf = toUse.getRawBuffer();
const unsigned int len = toUse.getLen();
if (fValidate && fStandalone)
{
// See if the text contains whitespace
// Get the raw data we need for the callback
const bool isSpaces = XMLReader::containsWhiteSpace(rawBuf, len);
if (isSpaces)
{
// And see if the current element is a 'Children' style content model
const ElemStack::StackElem* topElem = fElemStack.topElement();
if (topElem->fThisElement->isExternal()) {
// Get the character data opts for the current element
XMLElementDecl::CharDataOpts charOpts = topElem->fThisElement->getCharDataOpts();
if (charOpts == XMLElementDecl::SpacesOk) // => Element Content
{
// Error - standalone should have a value of "no" as whitespace detected in an
// element type with element content whose element declaration was external
//
fValidator->emitError(XMLValid::NoWSForStandalone);
}
}
}
}
// Send any char data that we accumulated into the buffer
sendCharData(toUse);
}
//
//  This method scans a character reference and returns the character that
//  was referred to. It assumes that we've already scanned the &# characters
//  that prefix the numeric code.
//
//  toFill  Receives the character, or the leading surrogate if the
//          referenced code point is above 0xFFFF. It is set to zero when
//          the reference is malformed (a non-digit before the ';').
//  second  Receives the trailing surrogate for a code point above 0xFFFF,
//          else zero. Only written on success.
//
//  Returns true if a legal character was referenced. Returns false, after
//  issuing an error, if the reference is malformed or names a code point
//  that is not a legal XML character. Throws UnexpectedEOFException if the
//  input ends inside the reference.
//
bool XMLScanner::scanCharRef(XMLCh& toFill, XMLCh& second)
{
    bool gotOne = false;
    unsigned int value = 0;

    //
    //  Set the radix. Its supposed to be a lower case x if hex. But, in
    //  order to recover well, we check for an upper and put out an error
    //  for that.
    //
    unsigned int radix = 10;
    if (fReaderMgr.skippedChar(chLatin_x))
    {
        radix = 16;
    }
     else if (fReaderMgr.skippedChar(chLatin_X))
    {
        emitError(XMLErrs::HexRadixMustBeLowerCase);
        radix = 16;
    }

    while (true)
    {
        const XMLCh nextCh = fReaderMgr.peekNextChar();

        // Watch for EOF
        if (!nextCh)
            ThrowXML(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF);

        // Break out on the terminating semicolon
        if (nextCh == chSemiColon)
        {
            fReaderMgr.getNextChar();
            break;
        }

        //
        //  Convert this char to a binary value, or bail out if its not
        //  one.
        //
        unsigned int nextVal;
        if ((nextCh >= chDigit_0) && (nextCh <= chDigit_9))
            nextVal = (unsigned int)(nextCh - chDigit_0);
        else if ((nextCh >= chLatin_A) && (nextCh <= chLatin_F))
            nextVal= (unsigned int)(10 + (nextCh - chLatin_A));
        else if ((nextCh >= chLatin_a) && (nextCh <= chLatin_f))
            nextVal = (unsigned int)(10 + (nextCh - chLatin_a));
        else
        {
            // Return a zero
            toFill = 0;

            //
            //  If we got at least a digit, then do an unterminated ref
            //  error. Else, do an expected a numerical ref thing.
            //
            if (gotOne)
                emitError(XMLErrs::UnterminatedCharRef);
            else
                emitError(XMLErrs::ExpectedNumericalCharRef);

            // Return failure
            return false;
        }

        //
        //  Make sure its valid for the radix. If not, then just eat the
        //  digit and go on after issuing an error. Else, update the
        //  running value with this new digit.
        //
        if (nextVal >= radix)
        {
            XMLCh tmpStr[2];
            tmpStr[0] = nextCh;
            tmpStr[1] = chNull;
            emitError(XMLErrs::BadDigitForRadix, tmpStr);
        }
         else if (value <= 0x10FFFF)
        {
            //
            //  Only accumulate while the value is still within the Unicode
            //  range. Once it is beyond 0x10FFFF it is left alone, so that
            //  a long run of digits cannot wrap the unsigned value around
            //  into the legal range; the range check below then rejects it.
            //  The remaining digits are still eaten as before.
            //
            value = (value * radix) + nextVal;
        }

        // Indicate that we got at least one digit
        gotOne = true;

        // And eat the last char
        fReaderMgr.getNextChar();
    }

    //
    //  [2] Char ::= #x9 | #xA | #xD | [#x20-#xD7FF]        // any Unicode character, excluding the
    //               | [#xE000-#xFFFD] | [#x10000-#x10FFFF] // surrogate blocks, FFFE, and FFFF.
    //
    bool validChar = false;
    if (value < 0x20)
    {
        if (value == 0x09 || value == 0x0A || value == 0x0D)
        {
            validChar = true;
        }
    }
     else if (value <= 0xD7FF || (value >= 0xE000 && (value <= 0xFFFD || (value >= 0x10000 && value <= 0x10FFFF))))
    {
        validChar = true;
    }

    if (!validChar)
    {
        // Character reference was not in the valid range
        emitError(XMLErrs::InvalidCharacterRef);
        return false;
    }

    // Return the char (or chars, as a surrogate pair)
    if (value >= 0x10000)
    {
        value -= 0x10000;
        toFill = XMLCh((value >> 10) + 0xD800);
        second = XMLCh((value & 0x3FF) + 0xDC00);
    }
     else
    {
        toFill = XMLCh(value);
        second = 0;
    }
    return true;
}
//
// We get here after the '<!--' part of the comment. We scan up to and past
// the terminating '-->' and then call the document handler with the comment
// text, if one is provided. Note that only the document handler is notified
// by this method; it does not consult any fInDocument flag to choose
// between a document handler and a DTD handler.
//
void XMLScanner::scanComment()
{
enum States
{
InText
, OneDash
, TwoDashes
};
// Get a buffer for this
XMLBufBid bbComment(&fBufMgr);
//
// Get the comment text into a temp buffer. Be sure to use temp buffer
// two here, since its to be used for stuff that is potentially longer
// than just a name.
//
States curState = InText;
bool gotLeadingSurrogate = false;
while (true)
{
// Get the next character
const XMLCh nextCh = fReaderMgr.getNextChar();
// Watch for an end of file
if (!nextCh)
{
emitError(XMLErrs::UnterminatedComment);
ThrowXML(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF);
}
// Check for correct surrogate pairs
if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF))
{
if (gotLeadingSurrogate)
emitError(XMLErrs::Expected2ndSurrogateChar);
else
gotLeadingSurrogate = true;
}
else
{
if (gotLeadingSurrogate)
{
if ((nextCh < 0xDC00) || (nextCh > 0xDFFF))
emitError(XMLErrs::Expected2ndSurrogateChar);
}
// Its got to at least be a valid XML character
else if (!XMLReader::isXMLChar(nextCh)) {
XMLCh tmpBuf[9];
XMLString::binToText
(
nextCh
, tmpBuf
, 8
, 16
);
emitError(XMLErrs::InvalidCharacter, tmpBuf);
}
gotLeadingSurrogate = false;
}
if (curState == InText)
{
// If its a dash, go to OneDash state. Otherwise take as text
if (nextCh == chDash)
curState = OneDash;
else
bbComment.append(nextCh);
}
else if (curState == OneDash)
{
//
// If its another dash, then we change to the two dashes states.
// Otherwise, we have to put in the deficit dash and the new
// character and go back to InText.
//
if (nextCh == chDash)
{
curState = TwoDashes;
}
else
{
bbComment.append(chDash);
bbComment.append(nextCh);
curState = InText;
}
}
else if (curState == TwoDashes)
{
// The next character must be the closing bracket
if (nextCh != chCloseAngle)
{
emitError(XMLErrs::IllegalSequenceInComment);
fReaderMgr.skipPastChar(chCloseAngle);
return;
}
break;
}
}
// If we have an available handler, call back with the comment.
if (fDocHandler)
{
fDocHandler->docComment
(
bbComment.getRawBuffer()
);
}
}
//
// Most equal signs can have white space around them, so this little guy
// just makes the calling code cleaner by eating whitespace.
//
bool XMLScanner::scanEq()
{
    // Leading whitespace is consumed whether or not an '=' follows
    fReaderMgr.skipPastSpaces();

    // Only eat the trailing whitespace if the '=' was actually there
    const bool sawEquals = fReaderMgr.skippedChar(chEqual);
    if (sawEquals)
        fReaderMgr.skipPastSpaces();

    return sawEquals;
}
//
// This method will scan a general/character entity ref. It will either
// expand a char ref and return it directly, or push a reader for a general
// entity.
//
// The return value indicates whether the char parameters hold the value
// or whether the value was pushed as a reader, or that it failed.
//
// The escaped flag tells the caller whether the returned parameter resulted
// from a character reference, which escapes the character in some cases. It
// only makes any difference if the return value indicates the value was
// returned directly.
//
XMLScanner::EntityExpRes
XMLScanner::scanEntityRef(  const   bool    inAttVal
                            ,       XMLCh&  firstCh
                            ,       XMLCh&  secondCh
                            ,       bool&   escaped)
{
    //
    //  Parameters:
    //      inAttVal    True if the reference is being scanned inside an
    //                  attribute value. External entities are reported as
    //                  an error there, and start entity reference events
    //                  are suppressed.
    //      firstCh     Receives the character when the result is returned
    //                  directly (char ref or special char entity).
    //      secondCh    Set to zero on entry; scanCharRef may fill it in
    //                  with a second character.
    //      escaped     Set to false on entry; set to true when the value
    //                  came from a char ref or a special char entity.
    //
    //  Returns:
    //      EntityExp_Returned  The value is in firstCh/secondCh.
    //      EntityExp_Pushed    A reader for the entity was handed to the
    //                          reader manager. For internal entities this
    //                          is returned even if the push reported a
    //                          recursive expansion (an error is emitted).
    //      EntityExp_Failed    The reference could not be expanded.
    //
    //  Throws a RuntimeException if the reader for an external entity
    //  could not be created.
    //

    // Assume no escape
    secondCh = 0;
    escaped = false;

    // We have to ensure that the reference is all in one entity
    const unsigned int curReader = fReaderMgr.getCurrentReaderNum();

    //
    //  If the next char is a pound, then it's a character reference and we
    //  need to expand it always.
    //
    if (fReaderMgr.skippedChar(chPound))
    {
        //
        //  It's a character reference, so scan it and get back the numeric
        //  value it represents.
        //
        if (!scanCharRef(firstCh, secondCh))
            return EntityExp_Failed;

        escaped = true;

        if (curReader != fReaderMgr.getCurrentReaderNum())
            emitError(XMLErrs::PartialMarkupInEntity);

        return EntityExp_Returned;
    }

    // Expand it since it's a normal entity ref
    XMLBufBid bbName(&fBufMgr);
    if (!fReaderMgr.getName(bbName.getBuffer()))
    {
        emitError(XMLErrs::ExpectedEntityRefName);
        return EntityExp_Failed;
    }

    //
    //  Next char must be a semi-colon. But if it's not, just emit
    //  an error and try to continue.
    //
    if (!fReaderMgr.skippedChar(chSemiColon))
        emitError(XMLErrs::UnterminatedEntityRef, bbName.getRawBuffer());

    // Make sure we ended up on the same entity reader as the & char
    if (curReader != fReaderMgr.getCurrentReaderNum())
        emitError(XMLErrs::PartialMarkupInEntity);

    // Look up the name in the general entity pool
    XMLEntityDecl* decl = fEntityDeclPool->getByKey(bbName.getRawBuffer());

    // If it does not exist, then obviously an error
    if (!decl)
    {
        // XML 1.0 Section 4.1
        // Well-formedness Constraint for entity not found:
        //   In a document without any DTD, a document with only an internal DTD subset which contains no parameter entity references,
        //      or a document with "standalone='yes'", for an entity reference that does not occur within the external subset
        //      or a parameter entity
        //
        // Else it's Validity Constraint
        //
        if (fStandalone || fHasNoDTD)
            emitError(XMLErrs::EntityNotFound, bbName.getRawBuffer());
        else {
            if (fValidate)
                fValidator->emitError(XMLValid::VC_EntityNotFound, bbName.getRawBuffer());
        }

        return EntityExp_Failed;
    }

    //
    // XML 1.0 Section 2.9
    //  If we are a standalone document, then it has to have been declared
    //  in the internal subset. Keep going though.
    //
    if (fStandalone && !decl->getDeclaredInIntSubset() && fValidate)
        fValidator->emitError(XMLValid::IllegalRefInStandalone, bbName.getRawBuffer());

    if (decl->isExternal())
    {
        // If it's unparsed, then it's not valid here
        if (decl->isUnparsed())
        {
            emitError(XMLErrs::NoUnparsedEntityRefs, bbName.getRawBuffer());
            return EntityExp_Failed;
        }

        // If we are in an attribute value, then not valid but keep going
        if (inAttVal)
            emitError(XMLErrs::NoExtRefsInAttValue);

        //
        //  And now create a reader to read this entity. The source pointer
        //  is zeroed first so that it holds a well defined value even if
        //  createReader() does not fill it in.
        //
        InputSource* srcUsed = 0;
        XMLReader* reader = fReaderMgr.createReader
        (
            decl->getSystemId()
            , decl->getPublicId()
            , false
            , XMLReader::RefFrom_NonLiteral
            , XMLReader::Type_General
            , XMLReader::Source_External
            , srcUsed
        );

        // Put a janitor on the source so it gets cleaned up on exit
        Janitor<InputSource> janSrc(srcUsed);

        //
        //  If the creation failed, then throw. Use the system id of the
        //  source that was tried if we got one, else fall back to the
        //  system id from the entity declaration so that a null source
        //  is never dereferenced.
        //
        if (!reader)
        {
            const XMLCh* const failedSysId = srcUsed ? srcUsed->getSystemId()
                                                     : decl->getSystemId();
            ThrowXML1(RuntimeException, XMLExcepts::Gen_CouldNotOpenExtEntity, failedSysId);
        }

        //
        //  Push the reader. If it's a recursive expansion, then emit an error
        //  and return a failure.
        //
        if (!fReaderMgr.pushReader(reader, decl))
        {
            emitError(XMLErrs::RecursiveEntity, decl->getName());
            return EntityExp_Failed;
        }

        //
        //  Do a start entity reference event.
        //
        //  <TBD> For now, we suppress them in att values. Later, when
        //  the stuff is in place to correctly allow DOM to handle them
        //  we'll turn this back on.
        //
        if (fDocHandler && !inAttVal)
            fDocHandler->startEntityReference(*decl);

        // If it starts with the XML string, then parse a text decl
        if (checkXMLDecl(true))
            scanXMLDecl(Decl_Text);
    }
    else
    {
        //
        //  If it's one of the special char references, then we can return
        //  it as a character, and it's considered escaped.
        //
        if (decl->getIsSpecialChar())
        {
            firstCh = decl->getValue()[0];
            escaped = true;
            return EntityExp_Returned;
        }

        // Create a reader over the in-memory value of the entity
        XMLReader* valueReader = fReaderMgr.createIntEntReader
        (
            decl->getName()
            , XMLReader::RefFrom_NonLiteral
            , XMLReader::Type_General
            , decl->getValue()
            , decl->getValueLen()
            , false
        );

        //
        //  Try to push the entity reader onto the reader manager stack,
        //  where it will become the subsequent input. If it fails, that
        //  means the entity is recursive, so issue an error. We
        //  deliberately keep going in that case.
        //
        if (!fReaderMgr.pushReader(valueReader, decl))
            emitError(XMLErrs::RecursiveEntity, decl->getName());

        //
        //  Do a start entity reference event.
        //
        //  <TBD> For now, we suppress them in att values. Later, when
        //  the stuff is in place to correctly allow DOM to handle them
        //  we'll turn this back on.
        //
        if (fDocHandler && !inAttVal)
            fDocHandler->startEntityReference(*decl);

        // If it starts with the XML string, then it's an error
        if (checkXMLDecl(true)) {
            emitError(XMLErrs::TextDeclNotLegalHere);
            fReaderMgr.skipPastChar(chCloseAngle);
        }
    }
    return EntityExp_Pushed;
}
//
//  Asks the reader manager to fill the passed buffer via getUpToCharOrWS(),
//  using chEndChar as the end character, and returns the length of the
//  buffer afterwards.
//
unsigned int
XMLScanner::scanUpToWSOr(XMLBuffer& toFill, const XMLCh chEndChar)
{
    fReaderMgr.getUpToCharOrWS(toFill, chEndChar);

    const unsigned int filledLen = toFill.getLen();
    return filledLen;
}
bool XMLScanner::switchGrammar(unsigned int newGrammarNameSpaceIndex)
{
XMLBufBid bbURI(&fBufMgr);
XMLBuffer& bufURI = bbURI.getBuffer();
getURIText(newGrammarNameSpaceIndex, bufURI);
Grammar* tempGrammar = fGrammarResolver->getGrammar(bufURI.getRawBuffer());
if (!tempGrammar) {
// This is a case where namespaces is on with a DTD grammar.
tempGrammar = fGrammarResolver->getGrammar(XMLUni::fgZeroLenString);
}
if (!tempGrammar)
return false;
else {
fGrammar = tempGrammar;
fGrammarType = fGrammar->getGrammarType();
if (fGrammarType == Grammar::SchemaGrammarType && !fValidator->handlesSchema()) {
if (fValidatorFromUser)
ThrowXML(RuntimeException, XMLExcepts::Gen_NoSchemaValidator);
else {
fValidator = fSchemaValidator;
}
}
else if (fGrammarType == Grammar::DTDGrammarType && !fValidator->handlesDTD()) {
if (fValidatorFromUser)
ThrowXML(RuntimeException, XMLExcepts::Gen_NoDTDValidator);
else {
fValidator = fDTDValidator;
}
}
fValidator->setGrammar(fGrammar);
return true;
}
}
bool XMLScanner::switchGrammar(const XMLCh* const newGrammarNameSpace)
{
Grammar* tempGrammar = fGrammarResolver->getGrammar(newGrammarNameSpace);
if (!tempGrammar) {
// This is a case where namespaces is on with a DTD grammar.
tempGrammar = fGrammarResolver->getGrammar(XMLUni::fgZeroLenString);
}
if (!tempGrammar)
return false;
else {
fGrammar = tempGrammar;
fGrammarType = fGrammar->getGrammarType();
if (fGrammarType == Grammar::SchemaGrammarType && !fValidator->handlesSchema()) {
if (fValidatorFromUser)
ThrowXML(RuntimeException, XMLExcepts::Gen_NoSchemaValidator);
else {
fValidator = fSchemaValidator;
}
}
else if (fGrammarType == Grammar::DTDGrammarType && !fValidator->handlesDTD()) {
if (fValidatorFromUser)
ThrowXML(RuntimeException, XMLExcepts::Gen_NoDTDValidator);
else {
fValidator = fDTDValidator;
}
}
fValidator->setGrammar(fGrammar);
return true;
}
}
// Check if we should skip or lax the validation of the element
// if skip - no validation
// if lax - validate only if the element is found
bool XMLScanner::laxElementValidation(QName* element, ContentLeafNameTypeVector* cv,
                                      const XMLContentModel* const cm,
                                      const unsigned int parentElemDepth)
{
    //
    //  Tries to advance the content model state stored for the parent at
    //  parentElemDepth using the passed element. The leaves in cv are
    //  tried in order, and the first one that matches the element and
    //  yields a valid transition in cm wins. If that leaf is a 'skip'
    //  wildcard, validation is turned off; if it is a 'lax' wildcard, true
    //  is returned. If no leaf gives a valid transition, the stored state
    //  is set to the invalid transition value and false is returned.
    //
    const unsigned int elemURI = element->getURI();
    const unsigned int startState = fElemState[parentElemDepth];

    // The parent is already in an invalid state, so there's nothing to do
    if (startState == XMLContentModel::gInvalidTrans)
        return false;

    SubstitutionGroupComparator comparator(fGrammarResolver, fURIStringPool);

    bool skipThisOne = false;
    bool laxThisOne = false;

    if (cv)
    {
        const unsigned int leafCount = cv->getLeafCount();
        bool transitioned = false;

        for (unsigned int leafIdx = 0; (leafIdx < leafCount) && !transitioned; leafIdx++)
        {
            QName* const leafName = cv->getLeafNameAt(leafIdx);
            const unsigned int leafURI = leafName->getURI();
            const ContentSpecNode::NodeTypes leafType = cv->getLeafTypeAt(leafIdx);

            // Work out whether this leaf could accept the element at all
            bool candidate = false;
            if (leafType == ContentSpecNode::Leaf)
            {
                // Same URI and local part, or a substitution group match
                candidate = ((leafURI == elemURI)
                             && !XMLString::compareString(leafName->getLocalPart(), element->getLocalPart()))
                            || comparator.isEquivalentTo(element, leafName);
            }
            else
            {
                // The low four bits hold the basic kind of wildcard
                const int wildcardKind = leafType & 0x0f;
                if (wildcardKind == ContentSpecNode::Any)
                    candidate = true;
                else if (wildcardKind == ContentSpecNode::Any_Other)
                    candidate = (leafURI != elemURI);
                else if (wildcardKind == ContentSpecNode::Any_NS)
                    candidate = (leafURI == elemURI);
            }

            if (!candidate)
                continue;

            // The leaf matches, but the transition has to be legal as well
            const unsigned int nextState = cm->getNextState(startState, leafIdx);
            if (nextState == XMLContentModel::gInvalidTrans)
                continue;

            fElemState[parentElemDepth] = nextState;
            transitioned = true;

            // Only the wildcard types can ask for skip or lax processing
            if (leafType == ContentSpecNode::Any_Skip ||
                leafType == ContentSpecNode::Any_NS_Skip ||
                leafType == ContentSpecNode::Any_Other_Skip)
            {
                skipThisOne = true;
            }
            else if (leafType == ContentSpecNode::Any_Lax ||
                     leafType == ContentSpecNode::Any_NS_Lax ||
                     leafType == ContentSpecNode::Any_Other_Lax)
            {
                laxThisOne = true;
            }
        }

        // No leaf gave us a valid transition
        if (!transitioned)
        {
            fElemState[parentElemDepth] = XMLContentModel::gInvalidTrans;
            return false;
        }
    }

    // A skip wildcard turns validation off, here and on the element stack
    if (skipThisOne)
    {
        fValidate = false;
        fElemStack.setValidationFlag(fValidate);
    }

    return laxThisOne;
}
// Check if there is an AnyAttribute, and if so, see if we should lax or skip
// if skip - no validation
// if lax - validate only if the attribute is found
bool XMLScanner::anyAttributeValidation(SchemaAttDef* attWildCard, unsigned int uriId, bool& skipThisOne, bool& laxThisOne)
{
    //
    //  Checks whether the attribute wildcard covers the passed URI id and
    //  returns true if so. In that case skipThisOne or laxThisOne is set
    //  according to the default type of the wildcard (skip or lax process
    //  contents). Both flags are always reset to false first.
    //
    const XMLAttDef::AttTypes wildCardType = attWildCard->getType();

    skipThisOne = false;
    laxThisOne = false;

    bool matched = false;
    switch (wildCardType)
    {
        case XMLAttDef::Any_Any :
            // Matches any URI
            matched = true;
            break;

        case XMLAttDef::Any_Other :
            // Matches any URI except the one on the wildcard itself
            matched = (attWildCard->getAttName()->getURI() != uriId);
            break;

        case XMLAttDef::Any_List :
        {
            // Matches if the URI is in the namespace list of the wildcard
            ValueVectorOf<unsigned int>* const nsList = attWildCard->getNamespaceList();
            const unsigned int nsCount = nsList ? nsList->size() : 0;
            for (unsigned int nsIdx = 0; nsIdx < nsCount; nsIdx++)
            {
                if (nsList->elementAt(nsIdx) == uriId)
                    matched = true;
            }
            break;
        }

        default :
            break;
    }

    if (matched)
    {
        const XMLAttDef::DefAttTypes processContents = attWildCard->getDefaultType();

        if (processContents == XMLAttDef::ProcessContents_Skip)
        {
            // The attribute should just be bypassed
            skipThisOne = true;
        }
        else if (processContents == XMLAttDef::ProcessContents_Lax)
        {
            laxThisOne = true;
        }
    }

    return matched;
}