blob: 277c973423835a42d4db92b2ebf237b541051e43 [file] [log] [blame]
/*
* The Apache Software License, Version 1.1
*
* Copyright (c) 1999 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Xerces" and "Apache Software Foundation" must
* not be used to endorse or promote products derived from this
* software without prior written permission. For written
* permission, please contact apache\@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* nor may "Apache" appear in their name, without prior written
* permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation, and was
* originally based on software copyright (c) 1999, International
* Business Machines, Inc., http://www.ibm.com . For more information
* on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
/**
* $Log$
* Revision 1.1 1999/11/09 01:03:38 twl
* Initial revision
*
* Revision 1.4 1999/11/08 20:45:42 rahul
* Swat for adding in Product name and CVS comment log variable.
*
*/
// ---------------------------------------------------------------------------
// Includes
// ---------------------------------------------------------------------------
#include <util/BinMemInputStream.hpp>
#include <util/FlagJanitor.hpp>
#include <util/Janitor.hpp>
#include <util/PlatformUtils.hpp>
#include <util/UnexpectedEOFException.hpp>
#include <util/URL.hpp>
#include <util/XMLUni.hpp>
#include <framework/XMLBufferMgr.hpp>
#include <framework/XMLDocumentHandler.hpp>
#include <framework/XMLElementDecl.hpp>
#include <framework/XMLEntityHandler.hpp>
#include <framework/XMLNotationDecl.hpp>
#include <internal/URLInputSource.hpp>
#include <internal/EndOfEntityException.hpp>
#include <internal/ReaderMgr.hpp>
#include <internal/XMLScanner.hpp>
#include <validators/DTD/ContentSpecNode.hpp>
#include <validators/DTD/DTDEntityDecl.hpp>
#include <validators/DTD/DTDValidator.hpp>
#include <validators/DTD/DocTypeHandler.hpp>
#include <validators/DTD/MixedContentModel.hpp>
// ---------------------------------------------------------------------------
// Local methods
// ---------------------------------------------------------------------------
//
//  Local helper that checks whether testCh is one of the repetition suffix
//  characters (chQuestion, chPlus or chAsterisk). If it is, a new
//  ContentSpecNode of the matching repetition type (ZeroOrOne, OneOrMore or
//  ZeroOrMore) is allocated with prevNode as its first child and a null
//  second child, and that new node is returned. For any other character
//  prevNode itself is returned, so a caller can compare the result against
//  prevNode to learn whether a suffix was recognized.
//
static ContentSpecNode*
makeRepNode(const XMLCh testCh, ContentSpecNode* const prevNode)
{
    // Work out which repetition type, if any, the character selects
    ContentSpecNode::NodeTypes repType;
    if (testCh == chQuestion)
        repType = ContentSpecNode::ZeroOrOne;
    else if (testCh == chPlus)
        repType = ContentSpecNode::OneOrMore;
    else if (testCh == chAsterisk)
        repType = ContentSpecNode::ZeroOrMore;
    else
        return prevNode;    // Not a suffix, hand back the incoming node

    // Wrap the incoming node in the repetition node
    return new ContentSpecNode(repType, prevNode, 0);
}
// ---------------------------------------------------------------------------
// DTDValidator: Private scanning methods
// ---------------------------------------------------------------------------
//
//  Skips optional whitespace, expands a parameter entity reference if the
//  next char is a percent sign, and then skips any whitespace that follows
//  the expansion point. The return value is true if whitespace was skipped
//  either before or after the (possible) reference.
//
//  Note that spaceRequired is not consulted in this body; callers that
//  require whitespace test the return value themselves. The inLiteral,
//  inMarkup and throwAtEndExt flags are passed straight to expandPERef().
//
bool DTDValidator::checkForPERef(const  bool    spaceRequired
                                , const bool    inLiteral
                                , const bool    inMarkup
                                , const bool    throwAtEndExt)
{
    // Leading whitespace: remember whether we saw any, and eat all of it
    bool sawSpace = getReaderMgr()->skippedSpace();
    if (sawSpace)
        getReaderMgr()->skipPastSpaces();

    // A percent sign here introduces a PE reference, so expand it
    if (getReaderMgr()->skippedChar(chPercent))
        expandPERef(false, inLiteral, inMarkup, throwAtEndExt);

    // Whitespace at the start of whatever input follows counts as well
    if (getReaderMgr()->skippedSpace())
    {
        getReaderMgr()->skipPastSpaces();
        sawSpace = true;
    }

    return sawSpace;
}
//
// Expands a parameter entity reference. The callers visible in this file
// consume the leading '%' before calling, so this method starts at the
// entity name. The entity is looked up and a reader over its content is
// pushed onto the reader manager, so that it becomes the subsequent input.
//
// scanExternal - If true and the entity is external, the entity is
// scanned immediately via scanExtSubsetDecl(), bracketed
// by start/endInputSource() callbacks on the entity
// handler (if one is installed).
// inLiteral - Selects RefFrom_Literal vs. RefFrom_NonLiteral for the
// new reader.
// inMarkup - If true while in the internal subset, an error is
// emitted (but expansion continues).
// throwEndOfExt - Passed to setThrowAtEnd() on an external entity's
// reader.
//
// Returns false if no name could be scanned, if the entity is not
// declared, or if pushing an external entity's reader fails (recursive
// expansion). Returns true otherwise; note that a recursive internal
// entity only emits an error and still returns true. Throws a
// RuntimeException (via ThrowXML1) if the reader for an external entity
// cannot be created.
//
bool DTDValidator::expandPERef( const bool scanExternal
, const bool inLiteral
, const bool inMarkup
, const bool throwEndOfExt)
{
XMLBufBid bbName(getBufMgr());
//
// If we are in the internal subset and in markup, then this is
// an error but we go ahead and do it anyway.
//
if (fInternalSubset && inMarkup)
getScanner()->emitError(XML4CErrs::PERefInMarkupInIntSubset);
if (!getReaderMgr()->getName(bbName.getBuffer()))
{
getScanner()->emitError(XML4CErrs::ExpectedPEName);
// Skip the semicolon if that's what we ended up on
getReaderMgr()->skippedChar(chSemiColon);
return false;
}
// If no terminating semicolon, emit an error but try to keep going
if (!getReaderMgr()->skippedChar(chSemiColon))
getScanner()->emitError(XML4CErrs::UnterminatedEntityRef);
//
// Look it up in the PE decl pool and see if it exists. If not, just
// emit an error and return failure.
//
XMLEntityDecl* decl = findEntityDecl(bbName.getRawBuffer(), true);
if (!decl)
{
getScanner()->emitError(XML4CErrs::EntityNotFound, bbName.getRawBuffer());
return false;
}
//
// If we are a standalone document, then it has to have been declared
// in the internal subset. Keep going though.
//
if (getScanner()->getStandalone() && !decl->getDeclaredInIntSubset())
getScanner()->emitError(XML4CErrs::IllegalRefInStandalone, bbName.getRawBuffer());
//
// Okee dokee, we found it. So create either a memory stream with
// the entity value contents, or a file stream if its an external
// entity.
//
if (decl->isExternal())
{
// And now create a reader to read this entity
InputSource* srcUsed;
XMLReader* reader = getReaderMgr()->createReader
(
decl->getSystemId()
, decl->getPublicId()
, false
, inLiteral ? XMLReader::RefFrom_Literal : XMLReader::RefFrom_NonLiteral
, XMLReader::Type_PE
, XMLReader::Source_External
, srcUsed
);
//
// Put a janitor on the source so its cleaned up on exit. This has to
// be in place before the failure check below, which can throw.
//
Janitor<InputSource> janSrc(srcUsed);
// If the creation failed then throw an exception
if (!reader)
ThrowXML1(RuntimeException, XML4CExcepts::Gen_CouldNotOpenExtEntity, srcUsed->getSystemId());
// Set the 'throw at end' flag, to the one we were given
reader->setThrowAtEnd(throwEndOfExt);
//
// Push the reader. If its a recursive expansion, then emit an error
// and return a failure.
//
if (!getReaderMgr()->pushReader(reader, decl))
{
getScanner()->emitError(XML4CErrs::RecursiveEntity, decl->getName());
return false;
}
//
// If the caller wants us to scan the external entity, then lets
// do that now.
//
if (scanExternal)
{
XMLEntityHandler* entHandler = getScanner()->getEntityHandler();
// If we have an entity handler, tell it we are starting this entity
if (entHandler)
entHandler->startInputSource(*srcUsed);
//
// Scan the external entity now. The parameter tells it that
// it is not in an include section. Get the current reader
// level so we can catch partial markup errors and be sure
// to get back to here if we get an exception out of the
// ext subset scan.
//
const unsigned int readerNum = getReaderMgr()->getCurrentReaderNum();
try
{
scanExtSubsetDecl(false);
}
catch(...)
{
// Pop the reader back to the original level
getReaderMgr()->cleanStackBackTo(readerNum);
// End the input source, even though its not happy
if (entHandler)
entHandler->endInputSource(*srcUsed);
// Let the original exception continue on up to our caller
throw;
}
// If we have an entity handler, tell it we are ending this entity
if (entHandler)
entHandler->endInputSource(*srcUsed);
}
}
else
{
// Create a reader over a memory stream over the entity value
XMLReader* valueReader = getReaderMgr()->createIntEntReader
(
decl->getName()
, inLiteral ? XMLReader::RefFrom_Literal : XMLReader::RefFrom_NonLiteral
, XMLReader::Type_PE
, decl->getValue()
, decl->getValueLen()
, false
);
//
// Try to push the entity reader onto the reader manager stack,
// where it will become the subsequent input. If it fails, that
// means the entity is recursive, so issue an error. The reader
// will have just been discarded, but we just keep going.
//
if (!getReaderMgr()->pushReader(valueReader, decl))
getScanner()->emitError(XML4CErrs::RecursiveEntity, decl->getName());
}
return true;
}
//
//  Reads a quoted string from the current input into toFill, without the
//  quotes. The next char must be a single or double quote; if it is not,
//  false is returned with toFill left empty. Characters are then collected
//  until the same quote char is seen again, which gives a true return. If a
//  null char is returned by the reader first, false is returned and it is
//  left to the higher level code that called us to deal with it.
//
bool DTDValidator::getQuotedString(XMLBuffer& toFill)
{
    // Start from an empty target
    toFill.reset();

    // We have to be sitting on an opening quote, which gets eaten here
    XMLCh quoteCh;
    if (!getReaderMgr()->skipIfQuote(quoteCh))
        return false;

    // Pull chars until we see the matching closing quote
    for (XMLCh curCh = getReaderMgr()->getNextChar();
         curCh != quoteCh;
         curCh = getReaderMgr()->getNextChar())
    {
        // A null char means the input ended inside the literal
        if (!curCh)
            return false;

        toFill.append(curCh);
    }
    return true;
}
//
//  Scans a single attribute definition inside an attribute list declaration
//  for parentElem, i.e. the attribute name, its type (with any enumeration)
//  and its default declaration. bufToUse is a scratch buffer supplied by the
//  caller; it is used for the name and for the enumeration text.
//
//  If the attribute is not yet defined for the element, a new DTDAttDef is
//  created, filled in and added to parentElem. If it already exists, an
//  error is emitted and the definition is parsed into the shared dummy decl
//  (fDumAttDef), which is created on first use, reused for every later
//  duplicate, and never added to the element.
//
//  Returns the decl that was parsed into (the dummy one for a duplicate),
//  or zero if the definition could not be scanned. On failure, a decl that
//  was newly allocated here is deleted, since nothing else refers to it yet.
//  The dummy decl is never deleted here because the fDumAttDef member still
//  points to it.
//
XMLAttDef*
DTDValidator::scanAttDef(DTDElementDecl& parentElem, XMLBuffer& bufToUse)
{
    // Check for PE ref or optional whitespace
    checkForPERef(false, false, true);

    // Get the name of the attribute
    if (!getReaderMgr()->getName(bufToUse))
    {
        getScanner()->emitError(XML4CErrs::ExpectedAttrName);
        return 0;
    }

    //
    //  Look up this attribute in the parent element's attribute list. If
    //  it already exists, then use the dummy.
    //
    DTDAttDef* decl = parentElem.getAttDef(bufToUse.getRawBuffer());
    if (decl)
    {
        // It already exists, so put out a warning
        getScanner()->emitError(XML4CErrs::AttListAlreadyExists, bufToUse.getRawBuffer());

        // Use the dummy decl to parse into, faulting it in if needed
        if (!fDumAttDef)
            fDumAttDef = new DTDAttDef;
        fDumAttDef->setName(bufToUse.getRawBuffer());
        decl = fDumAttDef;
    }
    else
    {
        // It does not already exist so create a new one
        decl = new DTDAttDef(bufToUse.getRawBuffer());

        // Give it the next available unique id
        decl->setId(fNextAttrId++);
    }

    // Set a flag to indicate whether we are doing a dummy parse
    const bool isIgnored = (decl == fDumAttDef);

    // Space is required here, so check for PE ref, and require space
    if (!checkForPERef(true, false, true))
        getScanner()->emitError(XML4CErrs::ExpectedWhitespace);

    //
    //  Next has to be one of the attribute type strings. This tells us what
    //  is to follow.
    //
    if (getReaderMgr()->skippedString(XMLUni::fgCDATAString))
    {
        decl->setType(XMLAttDef::CData);
    }
    else if (getReaderMgr()->skippedString(XMLUni::fgIDString))
    {
        // Could be ID, IDREF or IDREFS, so look at what follows the ID part
        if (!getReaderMgr()->skippedString(XMLUni::fgRefString))
            decl->setType(XMLAttDef::ID);
        else if (!getReaderMgr()->skippedChar(chLatin_S))
            decl->setType(XMLAttDef::IDRef);
        else
            decl->setType(XMLAttDef::IDRefs);
    }
    else if (getReaderMgr()->skippedString(XMLUni::fgEntitString))
    {
        // Shared prefix of ENTITY and ENTITIES, so check the suffix
        if (getReaderMgr()->skippedChar(chLatin_Y))
        {
            decl->setType(XMLAttDef::Entity);
        }
        else if (getReaderMgr()->skippedString(XMLUni::fgIESString))
        {
            decl->setType(XMLAttDef::Entities);
        }
        else
        {
            getScanner()->emitError(XML4CErrs::ExpectedAttributeType);

            //
            //  Only a decl we allocated above is ours to destroy. The dummy
            //  is owned by the fDumAttDef member and gets reused later.
            //
            if (!isIgnored)
                delete decl;
            return 0;
        }
    }
    else if (getReaderMgr()->skippedString(XMLUni::fgNmTokenString))
    {
        if (getReaderMgr()->skippedChar(chLatin_S))
            decl->setType(XMLAttDef::NmTokens);
        else
            decl->setType(XMLAttDef::NmToken);
    }
    else if (getReaderMgr()->skippedString(XMLUni::fgNotationString))
    {
        // Check for PE ref and require space
        if (!checkForPERef(true, false, true))
            getScanner()->emitError(XML4CErrs::ExpectedWhitespace);

        decl->setType(XMLAttDef::Notation);
        if (!scanEnumeration(bufToUse, true))
        {
            // Clean up our own allocation, never the shared dummy
            if (!isIgnored)
                delete decl;
            return 0;
        }

        // Set the value as the enumeration for this decl
        decl->setEnumeration(bufToUse.getRawBuffer());
    }
    else if (getReaderMgr()->skippedChar(chOpenParen))
    {
        decl->setType(XMLAttDef::Enumeration);
        if (!scanEnumeration(bufToUse, false))
        {
            // Clean up our own allocation, never the shared dummy
            if (!isIgnored)
                delete decl;
            return 0;
        }

        // Set the value as the enumeration for this decl
        decl->setEnumeration(bufToUse.getRawBuffer());
    }
    else
    {
        getScanner()->emitError(XML4CErrs::ExpectedAttributeType);

        // Clean up our own allocation, never the shared dummy
        if (!isIgnored)
            delete decl;
        return 0;
    }

    // Space is required here, so check for PE ref, and require space
    if (!checkForPERef(true, false, true))
        getScanner()->emitError(XML4CErrs::ExpectedWhitespace);

    // And then scan for the optional default value declaration
    scanDefaultDecl(*decl);

    // Add this guy to element's attr list (if not ignoring.)
    if (!isIgnored)
        parentElem.addAttDef(decl);

    // If validating, then do a couple of validation constraints
    if (getScanner()->getDoValidation())
    {
        // An ID attribute must have an implied or required default type
        if (decl->getType() == XMLAttDef::ID)
        {
            if ((decl->getDefaultType() != XMLAttDef::Implied)
            &&  (decl->getDefaultType() != XMLAttDef::Required))
            {
                emitError(XML4CValid::BadIDAttrDefType, decl->getFullName());
            }
        }
    }

    // If we have a doc type handler, tell it about this attdef.
    if (fDocTypeHandler)
        fDocTypeHandler->attDef(parentElem, *decl, isIgnored);

    return decl;
}
//
//  Scans an attribute list declaration, starting at the required whitespace
//  (or PE ref) that precedes the element name and running through the
//  closing angle bracket. Each attribute definition is scanned by
//  scanAttDef(). If the named element has no declaration yet, one is
//  faulted into the element decl pool and marked as created because of an
//  attribute list.
//
//  The doc type handler, if there is one, is told about the start and end
//  of the list and about whitespace between definitions. On a malformed
//  declaration an error is emitted and input is skipped past the next
//  closing angle bracket. An UnexpectedEOFException is thrown if the input
//  ends inside the list.
//
void DTDValidator::scanAttListDecl()
{
    // Space is required here, so check for a PE ref
    if (!checkForPERef(true, false, true))
    {
        getScanner()->emitError(XML4CErrs::ExpectedWhitespace);
        getReaderMgr()->skipPastChar(chCloseAngle);
        return;
    }

    //
    //  Next should be the name of the element it belongs to, so get a buffer
    //  and get the name into it.
    //
    XMLBufBid bbName(getBufMgr());
    if (!getReaderMgr()->getName(bbName.getBuffer()))
    {
        getScanner()->emitError(XML4CErrs::ExpectedElementName);
        getReaderMgr()->skipPastChar(chCloseAngle);
        return;
    }

    //
    //  Find this element's declaration. If it has not been declared yet,
    //  we will force one into the list, but not mark it as declared.
    //
    DTDElementDecl* elemDecl = fElemDeclPool->getByKey(bbName.getRawBuffer());
    if (!elemDecl)
    {
        //
        //  Lets fault in a declaration and add it to the pool. We mark
        //  it having been created because of an attlist. Later, if its
        //  declared, this will be updated.
        //
        elemDecl = new DTDElementDecl(bbName.getRawBuffer());
        elemDecl->setCreateReason(XMLElementDecl::AttList);
        fElemDeclPool->put(elemDecl);
    }

    // If we have a doc type handler, tell it the att list is starting
    if (fDocTypeHandler)
        fDocTypeHandler->startAttList(*elemDecl);

    //
    //  Now we loop until we are done with all of the attributes in this
    //  list. We need a buffer to use for local processing.
    //
    XMLBufBid   bbTmp(getBufMgr());
    XMLBuffer&  tmpBuf = bbTmp.getBuffer();
    bool        seenAnId = false;
    while (true)
    {
        // Get the next char out and see what it tells us to do
        const XMLCh nextCh = getReaderMgr()->peekNextChar();

        // Watch for EOF
        if (!nextCh)
            ThrowXML(UnexpectedEOFException, XML4CExcepts::Gen_UnexpectedEOF);

        if (nextCh == chCloseAngle)
        {
            // We are done with this attribute list
            getReaderMgr()->getNextChar();
            break;
        }
        else if (XMLReader::isWhitespace(nextCh))
        {
            //
            //  If we have a doc type handler, then gather up the white
            //  space and call back on the doctype handler. Otherwise, just
            //  skip whitespace.
            //
            if (fDocTypeHandler)
            {
                getReaderMgr()->getSpaces(tmpBuf);
                fDocTypeHandler->doctypeWhitespace
                (
                    tmpBuf.getRawBuffer()
                    , tmpBuf.getLen()
                );
            }
            else
            {
                getReaderMgr()->skipPastSpaces();
            }
        }
        else if (nextCh == chPercent)
        {
            // Eat the percent and expand the ref
            getReaderMgr()->getNextChar();
            expandPERef(false, false, true);
        }
        else
        {
            //
            //  It must be an attribute name, so scan it. We let
            //  it use our local buffer for its name scanning.
            //
            XMLAttDef* attDef = scanAttDef(*elemDecl, tmpBuf);
            if (!attDef)
            {
                // Could not scan it, so give up on the rest of this list
                getReaderMgr()->skipPastChar(chCloseAngle);
                break;
            }

            //
            //  If its an ID type, remember that we've seen one. A second
            //  one breaks a validity constraint, so that is only reported
            //  when we are validating, as is done for the other validity
            //  checks in this file.
            //
            if (attDef->getType() == XMLAttDef::ID)
            {
                if (seenAnId && getScanner()->getDoValidation())
                    emitError(XML4CValid::MultipleIdAttrs, elemDecl->getFullName());
                seenAnId = true;
            }
        }
    }

    // If we have a doc type handler, tell it the att list is ending
    if (fDocTypeHandler)
        fDocTypeHandler->endAttList(*elemDecl);
}
//
//  This method is called to scan the value of an attribute in content. This
//  involves some normalization and replacement of general entity and
//  character references. The normalized value is put into toFill, and type
//  selects between the CDATA and the non-CDATA normalization rules.
//
//  End of entity's must be dealt with here. During DTD scan, they can come
//  from external entities. During content, they can come from any entity.
//  We just eat the end of entity and continue with our scan until we come
//  to the closing quote. If an unterminated value causes us to go through
//  subsequent entities, that will cause errors back in the calling code,
//  but there's little we can do about it here.
//
//  Returns true when the closing quote is found in the entity that the
//  opening quote came from. Returns false if the next char is not a quote
//  at all, or if a matching quote shows up in an entity below the one we
//  started in (partial markup). An UnexpectedEOFException is thrown if the
//  input ends first.
//
bool DTDValidator::scanAttValue(        XMLBuffer&              toFill
                                , const XMLAttDef::AttTypes     type)
{
    enum States
    {
        InWhitespace
        , InContent
    };

    // Reset the target buffer
    toFill.reset();

    // Get the next char which must be a single or double quote
    XMLCh quoteCh;
    if (!getReaderMgr()->skipIfQuote(quoteCh))
        return false;

    //
    //  We have to get the current reader because we have to ignore closing
    //  quotes until we hit the same reader again.
    //
    const unsigned int curReader = getReaderMgr()->getCurrentReaderNum();

    //
    //  Loop until we get the attribute value. Note that we use a double
    //  loop here to avoid the setup/teardown overhead of the exception
    //  handler on every round.
    //
    XMLCh   nextCh;
    XMLCh   secondCh = 0;
    States  curState = InContent;
    bool    firstNonWS = false;
    bool    gotLeadingSurrogate = false;
    bool    escaped;
    while (true)
    {
    try
    {
        while(true)
        {
            // Get another char. Use the second char from the previous
            // round if there is one
            if (secondCh)
            {
                nextCh = secondCh;
                secondCh = 0;
            }
            else
            {
                nextCh = getReaderMgr()->getNextChar();
            }

            if (!nextCh)
                ThrowXML(UnexpectedEOFException, XML4CExcepts::Gen_UnexpectedEOF);

            // Check for our ending quote in the same entity
            if (nextCh == quoteCh)
            {
                if (curReader == getReaderMgr()->getCurrentReaderNum())
                    return true;

                // Watch for spillover into a previous entity
                if (curReader > getReaderMgr()->getCurrentReaderNum())
                {
                    getScanner()->emitError(XML4CErrs::PartialMarkupInEntity);
                    return false;
                }

                // Else its a quote inside a nested entity, so just content
            }

            //
            //  Check for an entity ref now, before we let it affect our
            //  whitespace normalization logic below. We ignore the empty flag
            //  in this one.
            //
            escaped = false;
            if (nextCh == chAmpersand)
            {
                if (scanEntityRef(nextCh, secondCh, escaped) != EntityExp_Returned)
                {
                    gotLeadingSurrogate = false;
                    continue;
                }
            }

            // Check for correct surrogate pairs
            if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF))
            {
                // A leading surrogate; two in a row is an error
                if (gotLeadingSurrogate)
                    getScanner()->emitError(XML4CErrs::Expected2ndSurrogateChar);
                else
                    gotLeadingSurrogate = true;
            }
            else
            {
                //
                //  If the previous char was a leading surrogate, then this
                //  one has to be in the trailing range 0xDC00 to 0xDFFF. So
                //  its an error if its below or above that range; these two
                //  tests have to be or'd since no char can fail both.
                //
                if (gotLeadingSurrogate)
                {
                    if ((nextCh < 0xDC00) || (nextCh > 0xDFFF))
                        getScanner()->emitError(XML4CErrs::Expected2ndSurrogateChar);
                }
                gotLeadingSurrogate = false;

                // Its got to at least be a valid XML character
                if (!XMLReader::isXMLChar(nextCh))
                    getScanner()->emitError(XML4CErrs::InvalidCharacter);
            }

            //
            //  If its not escaped, then make sure its not a < character, which
            //  is not allowed in attribute values.
            //
            if (!escaped && (nextCh == chOpenAngle))
                getScanner()->emitError(XML4CErrs::BracketInAttrValue);

            //
            //  If the attribute is a CDATA type we do simple replacement of
            //  tabs and new lines with spaces, if the character is not escaped
            //  by way of a char ref.
            //
            //  Otherwise, we do the standard non-CDATA normalization of
            //  compressing whitespace to single spaces and getting rid of
            //  leading and trailing whitespace.
            //
            if (type == XMLAttDef::CData)
            {
                if (!escaped)
                {
                    if ((nextCh == 0x09) || (nextCh == 0x0A) || (nextCh == 0x0D))
                        nextCh = chSpace;
                }
            }
            else
            {
                if (curState == InWhitespace)
                {
                    if (!XMLReader::isWhitespace(nextCh))
                    {
                        //
                        //  Back into content. Put out the single space that
                        //  stands for the run of whitespace, unless the run
                        //  was leading whitespace.
                        //
                        if (firstNonWS)
                            toFill.append(chSpace);
                        curState = InContent;
                        firstNonWS = true;
                    }
                    else
                    {
                        continue;
                    }
                }
                else if (curState == InContent)
                {
                    if (XMLReader::isWhitespace(nextCh))
                    {
                        // Hold the space back until more content shows up
                        curState = InWhitespace;
                        continue;
                    }
                    firstNonWS = true;
                }
            }

            // Else add it to the buffer
            toFill.append(nextCh);
        }
    }

    catch(const EndOfEntityException&)
    {
        // Just eat it and continue.
        gotLeadingSurrogate = false;
        escaped = false;
    }
    }

    // Never reached, since the loop above is only left by return or throw
    return true;
}
//
//  Scans the numeric part of a character reference, i.e. the optional radix
//  char, the digits and the terminating semicolon, and converts it to the
//  char it represents. (Presumably the caller has already consumed the
//  leading '&#'; that is not visible in this method.)
//
//  On success true is returned. If the value is below 0x10000, first gets
//  the char and second is set to zero. Otherwise the value is returned as a
//  surrogate pair in first and second.
//
//  False is returned, after an error is emitted, if a char that cannot be a
//  digit is seen before the semicolon (that char is left in the input), or
//  if the value is larger than 0x10FFFF. In the latter case the whole
//  reference, including the semicolon, has been consumed. first and second
//  are not set when false is returned. An UnexpectedEOFException is thrown
//  if the input ends inside the reference.
//
bool DTDValidator::scanCharRef(XMLCh& first, XMLCh& second)
{
    // The largest value that can be returned, as a surrogate pair
    const unsigned int maxCodePoint = 0x10FFFF;

    bool            gotOne = false;
    bool            outOfRange = false;
    unsigned int    value = 0;

    //
    //  Set the radix. Its supposed to be a lower case x if hex. But, in
    //  order to recover well, we check for an upper and put out an error
    //  for that.
    //
    unsigned int radix = 10;
    if (getReaderMgr()->skippedChar(chLatin_x))
    {
        radix = 16;
    }
    else if (getReaderMgr()->skippedChar(chLatin_X))
    {
        getScanner()->emitError(XML4CErrs::HexRadixMustBeLowerCase);
        radix = 16;
    }

    while (true)
    {
        const XMLCh nextCh = getReaderMgr()->peekNextChar();

        // Watch for EOF
        if (!nextCh)
            ThrowXML(UnexpectedEOFException, XML4CExcepts::Gen_UnexpectedEOF);

        // Break out on the terminating semicolon
        if (nextCh == chSemiColon)
        {
            getReaderMgr()->getNextChar();
            break;
        }

        //
        //  Convert this char to a binary value, or bail out if its not
        //  one.
        //
        unsigned int nextVal;
        if ((nextCh >= chDigit_0) && (nextCh <= chDigit_9))
            nextVal = (unsigned int)(nextCh - chDigit_0);
        else if ((nextCh >= chLatin_A) && (nextCh <= chLatin_F))
            nextVal= (unsigned int)(10 + (nextCh - chLatin_A));
        else if ((nextCh >= chLatin_a) && (nextCh <= chLatin_f))
            nextVal = (unsigned int)(10 + (nextCh - chLatin_a));
        else
        {
            //
            //  If we got at least a digit, then do an unterminated ref
            //  error. Else, do an expected a numerical ref thing.
            //
            if (gotOne)
                getScanner()->emitError(XML4CErrs::UnterminatedCharRef);
            else
                getScanner()->emitError(XML4CErrs::ExpectedNumericalCharRef);
            return false;
        }

        //
        //  Make sure its valid for the radix. If not, then just eat the
        //  digit and go on after issuing an error. Else, update the
        //  running value with this new digit.
        //
        if (nextVal >= radix)
        {
            XMLCh tmpStr[2];
            tmpStr[0] = nextCh;
            tmpStr[1] = chNull;
            getScanner()->emitError(XML4CErrs::BadDigitForRadix, tmpStr);
        }
        else if (!outOfRange)
        {
            //
            //  Once the value goes past the maximum we stop accumulating,
            //  so that a long run of digits cannot wrap the value around
            //  to something that looks legal. We still eat the rest of
            //  the digits so the input ends up after the reference.
            //
            value = (value * radix) + nextVal;
            if (value > maxCodePoint)
                outOfRange = true;
        }

        // Indicate that we got at least one good digit
        gotOne = true;

        // Eat the char we just processed
        getReaderMgr()->getNextChar();
    }

    // A value past the maximum cannot be returned, even as a surrogate pair
    if (outOfRange)
    {
        getScanner()->emitError(XML4CErrs::InvalidCharacter);
        return false;
    }

    // Return the char (or chars)
    if (value >= 0x10000)
    {
        // Needs a surrogate pair, leading one first
        value -= 0x10000;
        first = XMLCh((value >> 10) + 0xD800);
        second = XMLCh((value & 0x3FF) + 0xDC00);
    }
    else
    {
        first = XMLCh(value);
        second = 0;
    }
    return true;
}
//
// Scans one parenthesized section of a children content model and returns
// the top of the content spec node tree built for it. The caller has
// already consumed the opening parenthesis; this method consumes input
// through the matching closing parenthesis and any repetition character
// that follows it, calling itself for nested parentheses. bufToUse is a
// scratch buffer used to scan element names.
//
// Elements named in the model that have no declaration yet get one faulted
// into the element decl pool, marked as created because of being in a
// content model.
//
// Returns zero, after emitting an error and deleting the nodes built so
// far for this section, if the section cannot be scanned.
//
ContentSpecNode* DTDValidator::scanChildren(XMLBuffer& bufToUse)
{
// Check for a PE ref here, but don't require spaces
checkForPERef(false, false, true);
// We have to check entity nesting here
unsigned int curReader;
//
// We know that the caller just saw an opening parenthesis, so we need
// to parse until we hit the end of it, recursing for other nested
// parentheses we see.
//
// We have to check for one up front, since it could be something like
// (((a)*)) etc...
//
ContentSpecNode* curNode = 0;
if (getReaderMgr()->skippedChar(chOpenParen))
{
curReader = getReaderMgr()->getCurrentReaderNum();
// Lets call ourself and get back the resulting node
curNode = scanChildren(bufToUse);
// If that failed, no need to go further, return failure
if (!curNode)
return 0;
// The nested section has to end in the same reader it started in
if (curReader != getReaderMgr()->getCurrentReaderNum())
getScanner()->emitError(XML4CErrs::PartialMarkupInEntity);
}
else
{
// Not a nested paren, so it must be a leaf node
if (!getReaderMgr()->getName(bufToUse))
{
getScanner()->emitError(XML4CErrs::ExpectedElementName);
return 0;
}
//
// Create a leaf node for it. If we can find the element id for
// this element, then use it. Else, we have to fault in an element
// decl, marked as created because of being in a content model.
//
unsigned int elemId = findElemId(bufToUse.getRawBuffer());
if (elemId == XMLElementDecl::fgInvalidElemId)
{
DTDElementDecl* decl = new DTDElementDecl(bufToUse.getRawBuffer());
decl->setCreateReason(XMLElementDecl::InContentModel);
fElemDeclPool->put(decl);
elemId = decl->getId();
}
curNode = new ContentSpecNode(elemId);
// Check for a PE ref here, but don't require spaces
const bool gotSpaces = checkForPERef(false, false, true);
// Check for a repetition character after the leaf
const XMLCh repCh = getReaderMgr()->peekNextChar();
ContentSpecNode* tmpNode = makeRepNode(repCh, curNode);
// A different node coming back means that a repetition char was seen
if (tmpNode != curNode)
{
// Whitespace between the name and its repetition char is an error
if (gotSpaces)
getScanner()->emitError(XML4CErrs::UnexpectedWhitespace);
getReaderMgr()->getNextChar();
curNode = tmpNode;
}
}
// Check for a PE ref here, but don't require spaces
checkForPERef(false, false, true);
//
// Ok, the next character tells us what kind of content model this
// particular parenthesized section is. Its either a sequence if we see
// ',', a choice if we see '|', or a single leaf node if we see a
// closing paren.
//
const XMLCh opCh = getReaderMgr()->peekNextChar();
if ((opCh != chComma)
&& (opCh != chPipe)
&& (opCh != chCloseParen))
{
// Not a legal char, so delete our node and return failure
getScanner()->emitError(XML4CErrs::ExpectedSeqChoiceLeaf);
delete curNode;
return 0;
}
//
// Create the head node of the correct type. We need this to remember
// the top of the local tree. If it was a single subexpr, then just
// set the head node to the current node. For the others, we'll build
// the tree off the second child as we move across.
//
ContentSpecNode* headNode = 0;
ContentSpecNode::NodeTypes curType;
if (opCh == chComma)
{
curType = ContentSpecNode::Sequence;
headNode = new ContentSpecNode(curType, curNode, 0);
curNode = headNode;
}
else if (opCh == chPipe)
{
curType = ContentSpecNode::Choice;
headNode = new ContentSpecNode(curType, curNode, 0);
curNode = headNode;
}
else
{
// Single subexpression, so just eat the closing paren
headNode = curNode;
getReaderMgr()->getNextChar();
}
//
// If it was a sequence or choice, we just loop until we get to the
// end of our section, adding each new leaf or sub expression to the
// right child of the current node, and making that new node the current
// node.
//
// Note that the operator char was only peeked at above, so the first
// round of the loop is the one that eats it.
//
if ((opCh == chComma) || (opCh == chPipe))
{
ContentSpecNode* lastNode = 0;
while (true)
{
// Check for a PE ref here, but don't require spaces
checkForPERef(false, false, true);
//
// The next thing must either be another | or , character followed
// by another leaf or subexpression, or a closing parenthesis.
//
if (getReaderMgr()->skippedChar(chCloseParen))
{
//
// We've hit the end of this section, so break out. But, we
// need to see if we left a partial sequence or choice node
// without a second node. If so, we have to undo that and
// put its left child into the right node of the previous
// node.
//
if ((curNode->getType() == ContentSpecNode::Choice)
|| (curNode->getType() == ContentSpecNode::Sequence))
{
if (!curNode->getSecond())
{
ContentSpecNode* saveFirst = curNode->orphanFirst();
lastNode->setSecond(saveFirst);
curNode = lastNode;
}
}
break;
}
else if (getReaderMgr()->skippedChar(opCh))
{
// Check for a PE ref here, but don't require spaces
checkForPERef(false, false, true);
if (getReaderMgr()->skippedChar(chOpenParen))
{
curReader = getReaderMgr()->getCurrentReaderNum();
// Recurse to handle this new guy
ContentSpecNode* subNode = scanChildren(bufToUse);
// If it failed, we are done, clean up here and return failure
if (!subNode)
{
delete headNode;
return 0;
}
// The nested section has to end in the same reader it started in
if (curReader != getReaderMgr()->getCurrentReaderNum())
getScanner()->emitError(XML4CErrs::PartialMarkupInEntity);
// Else patch it in and make it the new current
ContentSpecNode* newCur = new ContentSpecNode
(
curType
, subNode
, 0
);
curNode->setSecond(newCur);
lastNode = curNode;
curNode = newCur;
}
else
{
//
// Got to be a leaf node, so get a name. If we cannot get
// one, then clean up and get outa here.
//
if (!getReaderMgr()->getName(bufToUse))
{
delete headNode;
getScanner()->emitError(XML4CErrs::ExpectedElementName);
return 0;
}
//
// Create a leaf node for it. If we can find the element
// id for this element, then use it. Else, we have to
// fault in an element decl, marked as created because
// of being in a content model.
//
unsigned int elemId = findElemId(bufToUse.getRawBuffer());
if (elemId == XMLElementDecl::fgInvalidElemId)
{
DTDElementDecl* decl = new DTDElementDecl(bufToUse.getRawBuffer());
decl->setCreateReason(XMLElementDecl::InContentModel);
fElemDeclPool->put(decl);
elemId = decl->getId();
}
ContentSpecNode* tmpLeaf = new ContentSpecNode(elemId);
//
// Check for a repetition character after the leaf. Unlike for
// the first leaf above, no PE ref or whitespace is skipped
// before this check.
//
const XMLCh repCh = getReaderMgr()->peekNextChar();
ContentSpecNode* tmpLeaf2 = makeRepNode(repCh, tmpLeaf);
if (tmpLeaf != tmpLeaf2)
getReaderMgr()->getNextChar();
//
// Create a new sequence or choice node, with the leaf
// (or rep surrounding it) we just got as its first node.
// Make the new node the second node of the current node,
// and then make it the current node.
//
ContentSpecNode* newCur = new ContentSpecNode
(
curType
, tmpLeaf2
, 0
);
curNode->setSecond(newCur);
lastNode = curNode;
curNode = newCur;
}
}
else
{
// Cannot be valid
if (opCh == chComma)
getScanner()->emitError(XML4CErrs::ExpectedChoiceOrCloseParen);
else
getScanner()->emitError(XML4CErrs::ExpectedSeqOrCloseParen);
delete headNode;
return 0;
}
}
}
//
// We saw the terminating parenthesis so lets check for any repetition
// character, and create a node for that, making the head node the child
// of it.
//
XMLCh repCh = getReaderMgr()->peekNextChar();
ContentSpecNode* retNode = makeRepNode(repCh, headNode);
if (retNode != headNode)
getReaderMgr()->getNextChar();
return retNode;
}
//
//  We get here after the '<!--' part of the comment, and scan through the
//  terminating '-->'. The text in between is collected and, if there is a
//  doc type handler, passed to its doctypeComment() method.
//
//  If two dashes are followed by anything other than the closing angle
//  bracket, an error is emitted, the input is skipped past the next closing
//  angle bracket, and the handler is not called. If the input ends inside
//  the comment, an error is emitted and an UnexpectedEOFException is thrown.
//
void DTDValidator::scanComment()
{
    // How much of a possible '-->' terminator we have seen so far
    enum States
    {
        InText
        , OneDash
        , TwoDashes
    };

    // Get a buffer to collect the comment text into
    XMLBufBid bbComment(getBufMgr());

    States  curState = InText;
    bool    sawTerminator = false;
    while (!sawTerminator)
    {
        // Get the next character
        const XMLCh nextCh = getReaderMgr()->getNextChar();

        // Running out of input here means the comment never ended
        if (!nextCh)
        {
            getScanner()->emitError(XML4CErrs::UnterminatedComment);
            ThrowXML(UnexpectedEOFException, XML4CExcepts::Gen_UnexpectedEOF);
        }

        // Complain about invalid XML characters, but keep on going
        if (!XMLReader::isXMLChar(nextCh))
            getScanner()->emitError(XML4CErrs::InvalidCharacter);

        switch (curState)
        {
            case InText :
                // A dash might start the terminator, so hold it back
                if (nextCh == chDash)
                    curState = OneDash;
                else
                    bbComment.append(nextCh);
                break;

            case OneDash :
                if (nextCh == chDash)
                {
                    curState = TwoDashes;
                }
                else
                {
                    //
                    //  False alarm. Put out the dash we held back, plus
                    //  this char, and go back to plain text.
                    //
                    bbComment.append(chDash);
                    bbComment.append(nextCh);
                    curState = InText;
                }
                break;

            case TwoDashes :
                // Only the closing bracket is legal after two dashes
                if (nextCh != chCloseAngle)
                {
                    getScanner()->emitError(XML4CErrs::IllegalSequenceInComment);
                    getReaderMgr()->skipPastChar(chCloseAngle);
                    return;
                }
                sawTerminator = true;
                break;
        }
    }

    // If there is a doc type handler, then pass on the comment stuff
    if (fDocTypeHandler)
        fDocTypeHandler->doctypeComment(bbComment.getRawBuffer());
}
//
//  Scans the content specification of an element declaration and stores
//  the result in toFill.
//
//  The predefined EMPTY and ANY keywords are checked first. Anything else
//  must start with an open paren; after it, a #PCDATA keyword selects a
//  mixed model (scanned by scanMixed()), otherwise a children model is
//  scanned by scanChildren().
//
//  Returns true for EMPTY/ANY, false if no open paren is found, and
//  otherwise whether the mixed or children scan succeeded. If the scan
//  ends on a different reader than the one it started on,
//  PartialMarkupInEntity is emitted, but the return value is unaffected.
//
bool DTDValidator::scanContentSpec(DTDElementDecl& toFill)
{
    // EMPTY keyword
    if (getReaderMgr()->skippedString(XMLUni::fgEmptyString))
    {
        toFill.setModelType(DTDElementDecl::Empty);
        return true;
    }

    // ANY keyword
    if (getReaderMgr()->skippedString(XMLUni::fgAnyString))
    {
        toFill.setModelType(DTDElementDecl::Any);
        return true;
    }

    // Otherwise only a parenthesized expression is acceptable
    if (!getReaderMgr()->skippedChar(chOpenParen))
    {
        getScanner()->emitError(XML4CErrs::ExpectedContentSpecExpr);
        return false;
    }

    // Remember the reader we start on, to detect partial markup later
    const unsigned int startReader = getReaderMgr()->getCurrentReaderNum();

    // A PE ref is possible here, whitespace is optional
    checkForPERef(false, false, true);

    bool scannedOk = false;
    if (!getReaderMgr()->skippedString(XMLUni::fgPCDATAString))
    {
        //
        //  Not #PCDATA, so its a children model. Scan it recursively,
        //  using a scratch buffer, and adopt the resulting content spec
        //  tree if we got one.
        //
        toFill.setModelType(DTDElementDecl::Children);

        XMLBufBid bbScratch(getBufMgr());
        ContentSpecNode* const topNode = scanChildren(bbScratch.getBuffer());
        if (topNode != 0)
        {
            toFill.setContentSpec(topNode);
            scannedOk = true;
        }
    }
    else
    {
        // #PCDATA means a mixed model
        toFill.setModelType(DTDElementDecl::Mixed);
        scannedOk = scanMixed(toFill);

        // When validating, a child may not be listed more than once
        if (getScanner()->getDoValidation())
        {
            const MixedContentModel* const mixedModel =
                (const MixedContentModel*)toFill.getContentModel();

            if (mixedModel->hasDups())
                emitError(XML4CValid::RepElemInMixed);
        }
    }

    // The whole spec has to come from a single reader
    if (getReaderMgr()->getCurrentReaderNum() != startReader)
        getScanner()->emitError(XML4CErrs::PartialMarkupInEntity);

    return scannedOk;
}
//
//  Scans the default declaration part of an attribute definition and
//  stores the default type, and the value where there is one, in toFill.
//
//  #REQUIRED and #IMPLIED just set the default type and return. #FIXED
//  must be followed by whitespace (an error is emitted if not, but we keep
//  going) and then a value. With no keyword at all, the default type is
//  Default and a value is expected directly.
//
//  If the value cannot be scanned, ExpectedDefAttrDecl is emitted and
//  whatever ended up in the value buffer is still set on toFill.
//
void DTDValidator::scanDefaultDecl(DTDAttDef& toFill)
{
    // The two keyword forms that carry no value
    if (getReaderMgr()->skippedString(XMLUni::fgRequiredString))
    {
        toFill.setDefaultType(XMLAttDef::Required);
        return;
    }

    if (getReaderMgr()->skippedString(XMLUni::fgImpliedString))
    {
        toFill.setDefaultType(XMLAttDef::Implied);
        return;
    }

    const bool isFixed = getReaderMgr()->skippedString(XMLUni::fgFixedString);
    if (!isFixed)
    {
        toFill.setDefaultType(XMLAttDef::Default);
    }
    else
    {
        // Whitespace must separate #FIXED from the value
        if (getReaderMgr()->skippedSpace())
            getReaderMgr()->skipPastSpaces();
        else
            getScanner()->emitError(XML4CErrs::ExpectedWhitespace);

        toFill.setDefaultType(XMLAttDef::Fixed);
    }

    //
    //  Fixed or default, either way a value has to follow. If it cannot be
    //  scanned, report that but still store whatever is in the buffer and
    //  try to keep going.
    //
    XMLBufBid bbDefValue(getBufMgr());
    const bool gotValue = scanAttValue(bbDefValue.getBuffer(), toFill.getType());
    if (!gotValue)
        getScanner()->emitError(XML4CErrs::ExpectedDefAttrDecl);

    toFill.setValue(bbDefValue.getRawBuffer());
}
//
//  This method handles the high level logic of scanning the DOCTYPE
//  declaration. It kicks off the scanning of the internal subset and of
//  the external subset, if either is present.
//
//  When we get here the '<!DOCTYPE' part has already been scanned, which is
//  what told us that we had a doc type decl to parse.
//
//  Steps:
//    1. Scan the root element name and force a decl for it into the
//       element decl pool (create reason AsRootElem), storing its id in
//       fRootElemId.
//    2. Determine which subsets are present. A '[' means an internal
//       subset only. A '>' means the declaration has neither an external
//       id nor an internal subset, which is legal. Anything else must be
//       an external id, optionally followed by an internal subset.
//    3. Tell the doc type handler, if any, about the DOCTYPE.
//    4. Scan the internal subset, then the closing '>', then the external
//       subset.
//
//  reuseValidator
//      When true, an internal subset is not allowed (a RuntimeException
//      is thrown if one is present) and the external subset is not
//      scanned.
//
//  Syntax errors are reported through the scanner and generally cause the
//  rest of the declaration to be skipped up to the next '>'. A
//  RuntimeException is thrown if the external subset cannot be opened.
//
void DTDValidator::scanDocTypeDecl(const bool reuseValidator)
{
    // There must be some space after DOCTYPE
    if (!getReaderMgr()->skipPastSpaces())
    {
        getScanner()->emitError(XML4CErrs::ExpectedWhitespace);

        // Just skip the Doctype declaration and return
        getReaderMgr()->skipPastChar(chCloseAngle);
        return;
    }

    // Get a buffer for the root element
    XMLBufBid bbRootName(getBufMgr());

    //
    //  Get a name from the input, which should be the name of the root
    //  element of the upcoming content.
    //
    getReaderMgr()->getName(bbRootName.getBuffer());
    if (bbRootName.isEmpty())
    {
        getScanner()->emitError(XML4CErrs::NoRootElemInDOCTYPE);
        getReaderMgr()->skipPastChar(chCloseAngle);
        return;
    }

    //
    //  This element obviously is not going to exist in the element decl
    //  pool yet, but we need to store away an element id. So force it into
    //  the element decl pool, marked as being there because it was in
    //  the DOCTYPE. Later, when its declared, the status will be updated.
    //
    DTDElementDecl* rootDecl = new DTDElementDecl(bbRootName.getRawBuffer());
    rootDecl->setCreateReason(DTDElementDecl::AsRootElem);
    fRootElemId = fElemDeclPool->put(rootDecl);

    // Skip any spaces after the name
    getReaderMgr()->skipPastSpaces();

    bool    hasIntSubset = false;
    bool    hasExtSubset = false;
    XMLCh*  sysId = 0;
    XMLCh*  pubId = 0;

    //
    //  If the next character is '[' then we have no external subset cause
    //  there is no system id, just the opening character of the internal
    //  subset. If it is '>' then the declaration names the root element
    //  only, with neither an external id nor an internal subset; that is
    //  legal, so leave both flags off and let the closing bracket check
    //  below consume it. Else, it has to be an external id.
    //
    if (getReaderMgr()->skippedChar(chOpenSquare))
    {
        hasIntSubset = true;
    }
    else if (!getReaderMgr()->lookingAtChar(chCloseAngle))
    {
        // Indicate we have an external subset
        hasExtSubset = true;

        // Get buffers for the ids
        XMLBufBid bbPubId(getBufMgr());
        XMLBufBid bbSysId(getBufMgr());

        // Get the external subset id
        if (!scanId(bbPubId.getBuffer(), bbSysId.getBuffer(), IDType_External))
        {
            getScanner()->emitError(XML4CErrs::ExpectedSystemOrPublicId);
            getReaderMgr()->skipPastChar(chCloseAngle);
            return;
        }

        // Get copies of the ids we got
        pubId = XMLString::replicate(bbPubId.getRawBuffer());
        sysId = XMLString::replicate(bbSysId.getRawBuffer());

        // Skip spaces and check again for the opening of an internal subset
        getReaderMgr()->skipPastSpaces();
        if (getReaderMgr()->skippedChar(chOpenSquare))
            hasIntSubset = true;
    }

    //
    //  Insure that the ids get cleaned up, if they got allocated.
    //
    //  NOTE(review): the two reader calls just above run before these
    //  janitors exist, so if either of them throws, the copies would
    //  presumably leak. Confirm and tighten if so.
    //
    ArrayJanitor<XMLCh> janSysId(sysId);
    ArrayJanitor<XMLCh> janPubId(pubId);

    //
    //  If we have a doc type handler, call the doctype event. The ids are
    //  null pointers when no external id was present.
    //
    if (fDocTypeHandler)
        fDocTypeHandler->doctypeDecl(*rootDecl, pubId, sysId, hasIntSubset);

    //
    //  Ok, if we had an internal subset, we are just past the [ character
    //  and need to parse that first.
    //
    if (hasIntSubset)
    {
        // We can't have any internal subset if we are reusing the validator
        if (reuseValidator)
            ThrowXML(RuntimeException, XML4CExcepts::Val_CantHaveIntSS);

        //
        //  Indicate we are in the internal subset now, so that the
        //  scanning code knows which subset it is working on.
        //
        FlagJanitor<bool> janContentFlag(&fInternalSubset, true);

        if (!scanInternalSubset())
        {
            getReaderMgr()->skipPastChar(chCloseAngle);
            return;
        }

        //
        //  Do a sanity check that some expanded PE did not propagate out of
        //  the doctype. This could happen if it was terminated early by bad
        //  syntax.
        //
        if (getReaderMgr()->getCurrentReaderNum() > 1)
        {
            getScanner()->emitError(XML4CErrs::PEPropogated);

            // Ask the reader manager to pop back down to the main level
            getReaderMgr()->cleanStackBackTo(1);
        }

        getReaderMgr()->skipPastSpaces();
    }

    // And that should leave us at the closing > of the DOCTYPE line
    if (!getReaderMgr()->skippedChar(chCloseAngle))
    {
        //
        //  Do a special check for the common scenario of an extra ] char at
        //  the end. This is easy to recover from.
        //
        if (getReaderMgr()->skippedChar(chCloseSquare)
        &&  getReaderMgr()->skippedChar(chCloseAngle))
        {
            getScanner()->emitError(XML4CErrs::ExtraCloseSquare);
        }
        else
        {
            getScanner()->emitError(XML4CErrs::UnterminatedDOCTYPE);
            getReaderMgr()->skipPastChar(chCloseAngle);
        }
    }

    //
    //  If we had an external subset, then we need to deal with that one
    //  next. If we are reusing the validator, then don't scan it.
    //
    if (hasExtSubset && !reuseValidator)
    {
        // Indicate we are in the external subset now
        FlagJanitor<bool> janContentFlag(&fInternalSubset, false);

        // And now create a reader to read this entity
        InputSource* srcUsed;
        XMLReader* reader = getReaderMgr()->createReader
        (
            sysId
            , pubId
            , false
            , XMLReader::RefFrom_NonLiteral
            , XMLReader::Type_General
            , XMLReader::Source_External
            , srcUsed
        );

        // Put a janitor on the input source
        Janitor<InputSource> janSrc(srcUsed);

        //
        //  If it failed then throw an exception.
        //
        //  NOTE(review): this reads srcUsed on the failure path, so it
        //  relies on createReader() setting it even when it returns a
        //  null reader. Confirm.
        //
        if (!reader)
            ThrowXML1(RuntimeException, XML4CExcepts::Gen_CouldNotOpenDTD, srcUsed->getSystemId());

        //
        //  In order to make the processing work consistently, we have to
        //  make this look like an external entity. So create an entity
        //  decl and fill it in and push it with the reader, as happens
        //  with an external entity. Put a janitor on it to insure it gets
        //  cleaned up. The reader manager does not adopt them.
        //
        const XMLCh gDTDStr[] = { chLatin_D, chLatin_T, chLatin_D , chNull };
        DTDEntityDecl* declDTD = new DTDEntityDecl(gDTDStr);
        declDTD->setSystemId(sysId);
        Janitor<DTDEntityDecl> janDecl(declDTD);

        // Mark this one as a throw at end
        reader->setThrowAtEnd(true);

        // And push it onto the stack, with its pseudo name
        getReaderMgr()->pushReader(reader, declDTD);

        // Tell it its not in an include section
        scanExtSubsetDecl(false);
    }
}
//
//  This is called after seeing '<!ELEMENT' which indicates that an element
//  markup is starting. It scans the rest of the declaration and records it
//  in the element decl pool if the element has not already been declared.
//
//  Three cases, depending on what the pool holds for the scanned name:
//    - No entry: a new decl is created, put into the pool and filled in.
//    - An entry that is not yet marked declared (for example one forced
//      in by the DOCTYPE): the existing decl is filled in.
//    - An entry that is already declared: ElementAlreadyExists is emitted
//      when validating, and the content model is scanned into the
//      validator's dummy decl so that the first declaration is left
//      untouched. The doc type handler is told the decl is ignored.
//
//  On a syntax error the rest of the declaration is skipped up to the
//  next '>'.
//
void DTDValidator::scanElementDecl()
{
    //
    //  Space is legal (required actually) here so check for a PE ref. If
    //  we don't get our whitespace, then issue an error, but try to keep
    //  going.
    //
    if (!checkForPERef(true, false, true))
        getScanner()->emitError(XML4CErrs::ExpectedWhitespace);

    // Get a buffer for the element name and scan in the name
    XMLBufBid bbName(getBufMgr());
    if (!getReaderMgr()->getName(bbName.getBuffer()))
    {
        getScanner()->emitError(XML4CErrs::ExpectedElementName);
        getReaderMgr()->skipPastChar(chCloseAngle);
        return;
    }

    // Look this guy up in the element decl pool
    DTDElementDecl* decl = fElemDeclPool->getByKey(bbName.getRawBuffer());

    //
    //  If it does not exist, then we need to create it. If it does and
    //  its marked as declared, then that's an error, but we still need to
    //  scan over the content model so use the dummy declaration that the
    //  parsing code can fill in.
    //
    if (decl)
    {
        if (decl->isDeclared())
        {
            if (getScanner()->getDoValidation())
                emitError(XML4CValid::ElementAlreadyExists, bbName.getRawBuffer());

            if (!fDumElemDecl)
                fDumElemDecl = new DTDElementDecl(bbName.getRawBuffer());
            else
                fDumElemDecl->setName(bbName.getRawBuffer());

            //
            //  Redirect the scan to the dummy. Without this the content
            //  model below would overwrite the original declaration and
            //  the ignored flag could never become true.
            //
            decl = fDumElemDecl;
        }
    }
    else
    {
        // Create the new empty declaration and put it in the pool
        decl = new DTDElementDecl(bbName.getRawBuffer());
        fElemDeclPool->put(decl);
    }

    // Set a flag for whether we will ignore this one
    const bool isIgnored = (decl == fDumElemDecl);

    // Another check for a PE ref, with at least required whitespace
    if (!checkForPERef(true, false, true))
        getScanner()->emitError(XML4CErrs::ExpectedWhitespace);

    //
    //  And now scan the content model for this guy. If that fails, do not
    //  delete the decl: a newly created one is already held by the element
    //  decl pool, so deleting it here would leave the pool with a dangling
    //  pointer. It just stays in the pool without being marked declared.
    //
    if (!scanContentSpec(*decl))
    {
        getReaderMgr()->skipPastChar(chCloseAngle);
        return;
    }

    // Another check for a PE ref, but we don't require whitespace here
    checkForPERef(false, false, true);

    // If this is not one we are ignoring, then set it declared
    if (!isIgnored)
        decl->setCreateReason(XMLElementDecl::Declared);

    // And we should have the ending angle bracket
    if (!getReaderMgr()->skippedChar(chCloseAngle))
    {
        getScanner()->emitError(XML4CErrs::UnterminatedElementDecl, bbName.getRawBuffer());
        getReaderMgr()->skipPastChar(chCloseAngle);
    }

    //
    //  If we have a doc type handler tell it about the element decl. We
    //  tell it if its one that can be ignored, cause its an override of a
    //  previously existing decl.
    //
    if (fDocTypeHandler)
    {
        fDocTypeHandler->elementDecl
        (
            *decl
            , isIgnored
        );
    }
}
//
// This method will process a general or parameter entity reference. The
// entity name and entity text will be stored in the entity pool. The value
// of the entity will be scanned for any other parameter entity or char
// references which will be expanded. So the stored value can only have
// general entity references when done.
//
void DTDValidator::scanEntityDecl()
{
//
// Space is required here, but we cannot check for a PE Ref since
// there could be a legal (no-ref) percent sign here. Since any
// entity that ended here would be illegal, we just skip spaces
// and then check for a percent.
//
if (!getReaderMgr()->lookingAtSpace())
getScanner()->emitError(XML4CErrs::ExpectedWhitespace);
else
getReaderMgr()->skipPastSpaces();
const bool isPEDecl = getReaderMgr()->skippedChar(chPercent);
//
// If a PE decl, then eat the percent and check for spaces or a
// PE ref on the other side of it. At least spaces are required.
//
if (isPEDecl)
{
if (!checkForPERef(true, false, true))
getScanner()->emitError(XML4CErrs::ExpectedWhitespace);
}
//
// Now lets get a name, which should be the name of the entity. We
// have to get a buffer for this.
//
XMLBufBid bbName(getBufMgr());
if (!getReaderMgr()->getName(bbName.getBuffer()))
{
getScanner()->emitError(XML4CErrs::ExpectedPEName);
getReaderMgr()->skipPastChar(chCloseAngle);
return;
}
// If namespaces are enabled, then no colons allowed
if (getScanner()->getDoNamespaces())
{
if (XMLString::indexOf(bbName.getRawBuffer(), chColon) != -1)
getScanner()->emitError(XML4CErrs::ColonNotLegalWithNS);
}
//
// See if this entity already exists. If so, then the existing one
// takes precendence. So we use the local dummy decl to parse into
// and just ignore the results.
//
DTDEntityDecl* entityDecl = 0;
if (isPEDecl)
entityDecl = fPEntityDeclPool->getByKey(bbName.getRawBuffer());
else
entityDecl = fEntityDeclPool->getByKey(bbName.getRawBuffer());
if (entityDecl)
{
if (!fDumEntityDecl)
fDumEntityDecl = new DTDEntityDecl;
fDumEntityDecl->setName(bbName.getRawBuffer());
entityDecl = fDumEntityDecl;
}
else
{
// Its not in existence already, then create an entity decl for it
entityDecl = new DTDEntityDecl(bbName.getRawBuffer());
//
// Set the declaration location. The parameter indicates whether
// its declared in the content/internal subset, so we do
// whether its not in the external subset.
//
entityDecl->setDeclaredInIntSubset(fInternalSubset);
}
// Set a flag that indicates whether we are ignoring this one
const bool isIgnored = (entityDecl == fDumEntityDecl);
// Set the PE flag on it
entityDecl->setIsParameter(isPEDecl);
//
// Space is legal (required actually) here so check for a PE ref. If
// we don't get our whitespace, then issue an error, but try to keep
// going.
//
if (!checkForPERef(true, false, true))
getScanner()->emitError(XML4CErrs::ExpectedWhitespace);
// According to the type call the value scanning method
if (!scanEntityDef(*entityDecl, isPEDecl))
{
getReaderMgr()->skipPastChar(chCloseAngle);
if (!isIgnored)
delete entityDecl;
getScanner()->emitError(XML4CErrs::ExpectedEntityValue);
return;
}
// Space is legal (but not required) here so check for a PE ref
checkForPERef(false, false, true);
// And then we have to have the closing angle bracket
if (!getReaderMgr()->skippedChar(chCloseAngle))
{
getScanner()->emitError(XML4CErrs::UnterminatedEntityDecl);
getReaderMgr()->skipPastChar(chCloseAngle);
}
//
// And add this guy to the appropriate entity decl pool, if it was
// not an override of an existing entity that we just parsed into the
// dummy decl.
//
if (!isIgnored)
{
if (isPEDecl)
fPEntityDeclPool->put(entityDecl);
else
fEntityDeclPool->put(entityDecl);
}
//
// If we have a doc type handler, then call it. But only call it for
// ignored elements if advanced callbacks are enabled.
//
if (fDocTypeHandler)
{
fDocTypeHandler->entityDecl
(
*entityDecl
, isPEDecl
, (entityDecl == fDumEntityDecl)
);
}
}
//
// This method will scan a general/character entity ref. It will either
// expand a char ref and return the value directly, or it will expand
// a general entity and push a reader for it onto the reader stack.
//
// The first thing read here is an optional '#', so the leading '&' has
// presumably been consumed by the caller already.
//
// The return value indicates whether the value was returned directly
// (EntityExp_Returned), a reader was pushed (EntityExp_Pushed), or the
// scan failed (EntityExp_Failed).
//
// firstCh and secondCh receive the expanded character(s) when the value is
// returned directly. secondCh is zeroed on entry and is only filled in by
// the char ref scan.
//
// The escaped flag tells the caller whether the returned parameter resulted
// from a character reference (or a special char entity), which escapes the
// character in some cases. It only makes any difference if the return
// indicates the value was returned directly.
//
// NOTE: This is only called when scanning attribute values, so we always
// expand general entities.
//
DTDValidator::EntityExpRes
DTDValidator::scanEntityRef(XMLCh& firstCh, XMLCh& secondCh, bool& escaped)
{
// Assume no escape and no second char
escaped = false;
secondCh = 0;
// We have to insure its all done in a single entity
const unsigned int curReader = getReaderMgr()->getCurrentReaderNum();
//
// If the next char is a pound, then its a character reference and we
// need to expand it always.
//
if (getReaderMgr()->skippedChar(chPound))
{
//
// Its a character reference, so scan it and get back the numeric
// value it represents. If it fails, just return immediately.
//
if (!scanCharRef(firstCh, secondCh))
return EntityExp_Failed;
// The whole reference has to come from the reader we started on
if (curReader != getReaderMgr()->getCurrentReaderNum())
getScanner()->emitError(XML4CErrs::PartialMarkupInEntity);
// Its now escaped since it was a char ref
escaped = true;
return EntityExp_Returned;
}
// Get the name of the general entity
XMLBufBid bbName(getBufMgr());
if (!getReaderMgr()->getName(bbName.getBuffer()))
{
getScanner()->emitError(XML4CErrs::ExpectedEntityRefName);
return EntityExp_Failed;
}
//
// Next char must be a semi-colon. But if its not, just emit
// an error and try to continue.
//
if (!getReaderMgr()->skippedChar(chSemiColon))
getScanner()->emitError(XML4CErrs::UnterminatedEntityRef);
// Make sure it was all in one entity reader
if (curReader != getReaderMgr()->getCurrentReaderNum())
getScanner()->emitError(XML4CErrs::PartialMarkupInEntity);
// Look the name up as a general (not parameter) entity
XMLEntityDecl* decl = findEntityDecl(bbName.getRawBuffer(), false);
// If it does not exist, then obviously an error
if (!decl)
{
getScanner()->emitError(XML4CErrs::EntityNotFound, bbName.getRawBuffer());
return EntityExp_Failed;
}
//
// If we are a standalone document, then it has to have been declared
// in the internal subset. Keep going though.
//
if (getScanner()->getStandalone() && !decl->getDeclaredInIntSubset())
getScanner()->emitError(XML4CErrs::IllegalRefInStandalone, bbName.getRawBuffer());
//
// If its a special char reference, then its escaped and we can return
// it directly. Only the first character of the entity value is returned.
//
if (decl->getIsSpecialChar())
{
firstCh = decl->getValue()[0];
escaped = true;
return EntityExp_Returned;
}
if (decl->isExternal())
{
// If its unparsed, then its not valid here
if (decl->isUnparsed())
{
getScanner()->emitError(XML4CErrs::NoUnparsedEntityRefs, bbName.getRawBuffer());
return EntityExp_Failed;
}
// We are in an attribute value, so not valid. But keep going
getScanner()->emitError(XML4CErrs::NoExtRefsInAttValue);
// And now create a reader to read this entity
InputSource* srcUsed;
XMLReader* reader = getReaderMgr()->createReader
(
decl->getSystemId()
, decl->getPublicId()
, false
, XMLReader::RefFrom_NonLiteral
, XMLReader::Type_General
, XMLReader::Source_External
, srcUsed
);
// Put a janitor on the source so it gets cleaned up on exit
Janitor<InputSource> janSrc(srcUsed);
//
// If the creation failed then throw an exception.
//
// NOTE(review): this reads srcUsed on the failure path, so it relies on
// createReader() setting it even when it returns a null reader. Confirm.
//
if (!reader)
ThrowXML1(RuntimeException, XML4CExcepts::Gen_CouldNotOpenExtEntity, srcUsed->getSystemId());
//
// Push the reader. If its a recursive expansion, then emit an error
// and return a failure.
//
if (!getReaderMgr()->pushReader(reader, decl))
{
getScanner()->emitError(XML4CErrs::RecursiveEntity, decl->getName());
return EntityExp_Failed;
}
// Do a start entity reference event
if (getScanner()->getDocHandler())
getScanner()->getDocHandler()->startEntityReference(*decl);
// If it starts with the XML string, then parse a text decl
if (getReaderMgr()->skippedString(XMLUni::fgXMLDeclString))
scanTextDecl();
}
else
{
//
// Create a reader over a memory stream over the entity value.
// We force it to assume UTF-16 by passing in an encoding
// string. This way it won't bother trying to predecode the
// first line, looking for an XML/TextDecl.
//
XMLReader* valueReader = getReaderMgr()->createIntEntReader
(
decl->getName()
, XMLReader::RefFrom_NonLiteral
, XMLReader::Type_General
, decl->getValue()
, decl->getValueLen()
, false
);
//
// Try to push the entity reader onto the reader manager stack,
// where it will become the subsequent input. If it fails, that
// means the entity is recursive, so issue an error. The reader
// will have just been discarded, but we just keep going.
//
// Note that, unlike the external case above, a failed push here still
// falls through to the start entity event and the 'pushed' return.
//
if (!getReaderMgr()->pushReader(valueReader, decl))
getScanner()->emitError(XML4CErrs::RecursiveEntity, decl->getName());
// Do a start entity reference event
if (getScanner()->getDocHandler())
getScanner()->getDocHandler()->startEntityReference(*decl);
}
return EntityExp_Pushed;
}
//
//  This method will scan a quoted literal of an entity value. It has to
//  deal with replacement of PE references; however, since this is a DTD
//  scanner, all such entity literals are in entity decls and therefore
//  general entities are not expanded.
//
//  toFill
//      Reset on entry, then filled with the literal's text, without the
//      surrounding quotes. Numeric char refs are expanded. General entity
//      refs are copied through as '&name;'.
//  isPE
//      Not consulted by this implementation.
//
//  Returns false if the next char is not a quote, or if an invalid
//  character is found (in which case the input is skipped past the next
//  occurrence of the quote char). Returns true otherwise.
//
//  Hitting the end of input inside the literal emits
//  UnterminatedEntityLiteral and throws UnexpectedEOFException.
//
bool DTDValidator::scanEntityLiteral(XMLBuffer& toFill, const bool isPE)
{
    toFill.reset();

    // Get the next char which must be a single or double quote
    XMLCh quoteCh;
    if (!getReaderMgr()->skipIfQuote(quoteCh))
        return false;

    // Get a buffer for pulling in entity names when we see GE refs
    XMLBufBid bbName(getBufMgr());
    XMLBuffer& nameBuf = bbName.getBuffer();

    // Remember the current reader
    const unsigned int orgReader = getReaderMgr()->getCurrentReaderNum();

    //
    //  Loop until we see the ending quote character, handling any references
    //  in the process.
    //
    XMLCh   nextCh;
    XMLCh   secondCh = 0;
    bool    gotLeadingSurrogate = false;
    while (true)
    {
        //
        //  If a char ref left us a second char (the trailing half of a
        //  surrogate pair), process that one, else get another char.
        //
        if (secondCh)
        {
            nextCh = secondCh;
            secondCh = 0;
        }
        else
        {
            nextCh = getReaderMgr()->getNextChar();
        }

        //
        //  Watch specifically for EOF and issue a more meaningful error
        //  if that occurs (since an unterminated quoted char can cause
        //  this easily.)
        //
        if (!nextCh)
        {
            getScanner()->emitError(XML4CErrs::UnterminatedEntityLiteral);
            ThrowXML(UnexpectedEOFException, XML4CExcepts::Gen_UnexpectedEOF);
        }

        //
        //  Break out on our terminating quote char when we are back in the
        //  same reader. Otherwise, we might trigger on a nested quote char
        //  in an expanded entity.
        //
        if ((nextCh == quoteCh)
        &&  (getReaderMgr()->getCurrentReaderNum() == orgReader))
        {
            break;
        }

        if (nextCh == chPercent)
        {
            //
            //  Put the PE's value on the reader stack and then jump back
            //  to the top to start processing it. The parameter indicates
            //  that it should not scan the reference's content as an external
            //  subset.
            //
            expandPERef(false, true, true);
            continue;
        }

        //
        //  Ok, now that all the other special stuff is checked, we can
        //  look for a general entity. In here, we cannot have a naked &
        //  and will only expand numerical char refs. Named references are
        //  left alone.
        //
        if (nextCh == chAmpersand)
        {
            //
            //  Here, we only expand numeric char refs, but not any general
            //  entities. However, the XML spec requires that we check and
            //  make sure it does refer to a general entity if its not a
            //  char ref (i.e. no naked '&' chars.)
            //
            if (getReaderMgr()->skippedChar(chPound))
            {
                // If it failed, then just jump back to the top and try to pick up
                if (!scanCharRef(nextCh, secondCh))
                {
                    gotLeadingSurrogate = false;
                    continue;
                }
            }
            else
            {
                if (!getReaderMgr()->getName(nameBuf))
                {
                    getScanner()->emitError(XML4CErrs::ExpectedEntityRefName);
                }
                else
                {
                    //
                    //  Since we are not expanding any of this, we have to
                    //  put the amp and name into the target buffer as data.
                    //
                    toFill.append(chAmpersand);
                    toFill.append(nameBuf.getRawBuffer());

                    // Make sure we skipped a trailing semicolon
                    if (!getReaderMgr()->skippedChar(chSemiColon))
                        getScanner()->emitError(XML4CErrs::UnterminatedEntityRef);

                    // And make the new character the semicolon
                    nextCh = chSemiColon;
                }

                // Either way here we reset the surrogate flag
                gotLeadingSurrogate = false;
            }
        }

        if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF))
        {
            // A leading surrogate. Two in a row is an error
            if (gotLeadingSurrogate)
                getScanner()->emitError(XML4CErrs::Expected2ndSurrogateChar);
            else
                gotLeadingSurrogate = true;
        }
        else
        {
            if (gotLeadingSurrogate)
            {
                //
                //  A leading surrogate must be followed by a trailing one,
                //  i.e. a char in the range 0xDC00 to 0xDFFF. The test has
                //  to be an 'or' of the two bounds; requiring the char to
                //  be both below and above the range can never be true.
                //
                if ((nextCh < 0xDC00) || (nextCh > 0xDFFF))
                    getScanner()->emitError(XML4CErrs::Expected2ndSurrogateChar);
            }
            else if (!XMLReader::isXMLChar(nextCh))
            {
                getScanner()->emitError(XML4CErrs::InvalidCharacter);
                getReaderMgr()->skipPastChar(quoteCh);
                return false;
            }
            gotLeadingSurrogate = false;
        }

        // Looks ok, so add it to the literal
        toFill.append(nextCh);
    }

    //
    //  If we got here and did not get back to the original reader level,
    //  then we propagated some entity out of the literal, so issue an
    //  error, but don't fail.
    //
    if (getReaderMgr()->getCurrentReaderNum() != orgReader)
        getScanner()->emitError(XML4CErrs::PartialMarkupInEntity);

    return true;
}
//
//  Scans the definition part of an entity declaration, i.e. whatever
//  follows the entity name and the whitespace or PE ref after it, and
//  fills in the passed decl with what was found.
//
//  The definition is either a quoted entity literal, which becomes the
//  decl's value, or an external id, which sets the decl's public and
//  system ids. For general entities the external id may be followed by
//  NDATA and a notation name, which is then set on the decl as well.
//  NDATA on a parameter entity is reported with NDATANotValidForPE.
//
//  Returns false if the literal, the external id or the notation name
//  could not be scanned, else true.
//
bool DTDValidator::scanEntityDef(DTDEntityDecl& decl, const bool isPEDecl)
{
    // A quote of either kind means an internal entity with a literal value
    const bool atQuote = getReaderMgr()->lookingAtChar(chSingleQuote)
                         || getReaderMgr()->lookingAtChar(chDoubleQuote);
    if (atQuote)
    {
        XMLBufBid bbLiteral(getBufMgr());
        const bool gotLiteral = scanEntityLiteral(bbLiteral.getBuffer(), isPEDecl);

        // Only store the value if the literal scanned ok
        if (gotLiteral)
            decl.setValue(bbLiteral.getRawBuffer());
        return gotLiteral;
    }

    //
    //  No literal, so it has to be an external entity, which requires an
    //  external id. Scan the public and system ids into local buffers.
    //
    XMLBufBid bbPubId(getBufMgr());
    XMLBufBid bbSysId(getBufMgr());
    if (!scanId(bbPubId.getBuffer(), bbSysId.getBuffer(), IDType_External))
        return false;

    decl.setPublicId(bbPubId.getRawBuffer());
    decl.setSystemId(bbSysId.getRawBuffer());

    // Handle any PE ref here and remember whether we saw whitespace
    const bool sawSpace = checkForPERef(false, false, true);

    if (isPEDecl)
    {
        // For a PE decl with nothing after the id, we are done
        if (!sawSpace)
            return true;

        //
        //  Check for a common error here. NDATA is not allowed for PEs,
        //  so if the NDATA string follows, give a meaningful error and
        //  carry on below to eat the rest of the NDATA text.
        //
        if (getReaderMgr()->skippedString(XMLUni::fgNDATAString))
            getScanner()->emitError(XML4CErrs::NDATANotValidForPE);
    }

    // If the closing bracket is next, there is no notation part
    if (getReaderMgr()->lookingAtChar(chCloseAngle))
        return true;

    // Something follows the id, so whitespace must have separated them
    if (!sawSpace)
        getScanner()->emitError(XML4CErrs::ExpectedWhitespace);

    // The only thing that can legally follow is the NDATA keyword
    if (!getReaderMgr()->skippedString(XMLUni::fgNDATAString))
        getScanner()->emitError(XML4CErrs::ExpectedNDATA);

    // Whitespace is required after it, but try to go on if its missing
    if (!checkForPERef(false, false, true))
        getScanner()->emitError(XML4CErrs::ExpectedWhitespace);

    // And finally the notation name
    XMLBufBid bbNotation(getBufMgr());
    if (!getReaderMgr()->getName(bbNotation.getBuffer()))
    {
        getScanner()->emitError(XML4CErrs::ExpectedNotationName);
        return false;
    }

    decl.setNotationName(bbNotation.getRawBuffer());
    return true;
}
//
//  Scans the list of values of an enumerated attribute type or of a
//  notation type, and builds in toFill a single string holding the values
//  separated by single spaces.
//
//  For a plain enumeration the caller has already seen the open paren. For
//  a notation (notation == true) the open paren is scanned here, and an
//  error is emitted if it is missing. Notation values are scanned as names,
//  plain enumeration values as name tokens. Values are separated by '|'
//  and the terminating close paren ends the scan.
//
//  Returns false, after emitting an error, if a value or a separator is
//  missing. toFill then holds whatever was collected up to that point.
//
bool DTDValidator::scanEnumeration(XMLBuffer& toFill, const bool notation)
{
    // Start with a clean target buffer
    toFill.reset();

    // A PE ref is possible here, whitespace is optional
    checkForPERef(false, false, true);

    // A notation list still has its opening paren ahead of it
    if (notation && !getReaderMgr()->skippedChar(chOpenParen))
        getScanner()->emitError(XML4CErrs::ExpectedOpenParen);

    // Scratch buffer that receives each individual value
    XMLBufBid bbValue(getBufMgr());

    for (;;)
    {
        // Whitespace or a PE ref may precede the value
        checkForPERef(false, false, true);

        // Notations are names, plain enumerations are name tokens
        const bool gotValue = notation
                              ? getReaderMgr()->getName(bbValue.getBuffer())
                              : getReaderMgr()->getNameToken(bbValue.getBuffer());
        if (!gotValue)
        {
            getScanner()->emitError(XML4CErrs::ExpectedEnumValue);
            return false;
        }

        // Add this value to the overall result
        toFill.append(bbValue.getRawBuffer(), bbValue.getLen());

        // Whitespace or a PE ref may follow the value too
        checkForPERef(false, false, true);

        // The close paren finishes the list
        if (getReaderMgr()->skippedChar(chCloseParen))
            return true;

        // Otherwise another value must follow, so separate it by a space
        toFill.append(chSpace);

        // And the values themselves must be separated by a pipe
        if (!getReaderMgr()->skippedChar(chPipe))
        {
            getScanner()->emitError(XML4CErrs::ExpectedEnumSepOrParen);
            return false;
        }
    }
}
//
//  Scans an equal sign, with optional whitespace on either side of it.
//  Returns true if the equal sign was found. If it was not, any leading
//  whitespace has still been consumed.
//
bool DTDValidator::scanEq()
{
    // Leading whitespace is optional
    getReaderMgr()->skipPastSpaces();

    // No equal sign, no match
    if (!getReaderMgr()->skippedChar(chEqual))
        return false;

    // Trailing whitespace is optional as well
    getReaderMgr()->skipPastSpaces();
    return true;
}
//
// This method is called when an external entity reference is seen in the
// DTD or an external DTD subset is encountered, and their contents pushed
// onto the reader stack. This method will scan that contents.
//
// inIncludeSect is true when this is called to handle the content of a
// conditional include section. In that case the start of the external
// subset is not reported to the doc type handler, and the scan returns as
// soon as the closing ']]>' of the section has been processed.
//
// Otherwise the scan runs until an EndOfEntityException reports the end of
// the reader that was current when the main loop started.
//
void DTDValidator::scanExtSubsetDecl(const bool inIncludeSect)
{
// Passed on to scanMarkupDecl(). It is cleared after a text decl has been
// scanned and after each item processed by the loop below, so it is only
// true for the first markup of a non-include scan.
// NOTE(review): presumably this tells scanMarkupDecl() whether a text
// decl is still acceptable at this point - confirm.
bool bAcceptDecl = !inIncludeSect;
// Get a buffer for whitespace
XMLBufBid bbSpace(getBufMgr());
//
// If we have a doc type handler and we are not being called recursively
// to handle an include section, tell it the ext subset starts
//
if (fDocTypeHandler && !inIncludeSect)
fDocTypeHandler->startExtSubset();
//
// We have to play a trick here if the current entity we are parsing
// is a PE. Because the spooling code will put out a whitespace before
// and after an expanded PE if its being scanned outside the context of
// a literal entity, this will confuse this external subset code.
//
// So, we see if that is what is happening and, if so, eat the single
// space, and check for the <?xml string. If we find it, we parse that
// markup right now. Putting the space back is not implemented yet; see
// the TBD below.
//
if (getReaderMgr()->isScanningPERefOutOfLiteral())
{
if (getReaderMgr()->skippedSpace())
{
if (getReaderMgr()->skippedString(XMLUni::fgXMLDeclStringSpace))
{
scanTextDecl();
bAcceptDecl = false;
// <TBD> Figure out how to do this
// getReaderMgr()->unGet(chSpace);
}
}
}
// Get the current reader number. The catch block below compares against
// this to decide when the entity we started on has ended.
const unsigned int orgReader = getReaderMgr()->getCurrentReaderNum();
//
// Loop until we hit the end of the external subset entity. Note that
// we use a double loop here in order to avoid the overhead of doing
// the exception setup/teardown work on every loop.
//
// These two flags record what the inner loop was doing when an end of
// entity exception arrived.
bool inMarkup = false;
bool inCharData = false;
while (true)
{
try
{
while (true)
{
const XMLCh nextCh = getReaderMgr()->peekNextChar();
if (nextCh == chOpenAngle)
{
// Get the reader this piece of markup starts on. Note that this
// local shadows the outer orgReader within this block only.
const unsigned int orgReader = getReaderMgr()->getCurrentReaderNum();
//
// Now scan the markup. Set the flag so that we will know that
// we were in markup if an end of entity exception occurs.
//
getReaderMgr()->getNextChar();
inMarkup = true;
scanMarkupDecl(bAcceptDecl);
inMarkup = false;
//
// And see if we got back to the same level. If not, then its
// a partial markup error.
//
if (getReaderMgr()->getCurrentReaderNum() != orgReader)
getScanner()->emitError(XML4CErrs::PartialMarkupInEntity);
}
else if (XMLReader::isWhitespace(nextCh))
{
//
// If we have a doc type handler, then gather up whitespace and
// call back. Otherwise just skip whitespaces.
//
if (fDocTypeHandler)
{
// The flag lets the catch block report the buffered whitespace if
// the gathering is interrupted by an end of entity exception.
inCharData = true;
getReaderMgr()->getSpaces(bbSpace.getBuffer());
inCharData = false;
fDocTypeHandler->doctypeWhitespace
(
bbSpace.getRawBuffer()
, bbSpace.getLen()
);
}
else
{
//
// If we hit an end of entity in the middle of white
// space, that's fine. We'll just come back in here
// again on the next round and skip some more.
//
getReaderMgr()->skipPastSpaces();
}
}
else if (nextCh == chPercent)
{
//
// Expand (and scan if external) the reference value. Tell
// it to throw an end of entity exception at the end of the
// entity.
//
getReaderMgr()->getNextChar();
expandPERef(true, false, false, true);
}
else if (inIncludeSect && (nextCh == chCloseSquare))
{
//
// Its the end of a conditional include section. The section must
// be closed by ']]>'. Eat that, or report an error and skip past
// the next '>' if its malformed, and return either way.
//
getReaderMgr()->getNextChar();
if (!getReaderMgr()->skippedChar(chCloseSquare))
{
getScanner()->emitError(XML4CErrs::ExpectedEndOfConditional);
getReaderMgr()->skipPastChar(chCloseAngle);
}
else if (!getReaderMgr()->skippedChar(chCloseAngle))
{
getScanner()->emitError(XML4CErrs::ExpectedEndOfConditional);
getReaderMgr()->skipPastChar(chCloseAngle);
}
return;
}
else
{
// Anything else is not legal here. Eat the character and report
// it, either as an invalid char or as bad document structure.
getReaderMgr()->getNextChar();
if (!XMLReader::isXMLChar(nextCh))
getScanner()->emitError(XML4CErrs::InvalidCharacter);
else
getScanner()->emitError(XML4CErrs::InvalidDocumentStructure);
// Try to get realigned
static const XMLCh toSkip[] =
{
chPercent, chCloseSquare, chOpenAngle, chNull
};
getReaderMgr()->skipUntilInOrWS(toSkip);
}
// We have now processed at least one item
bAcceptDecl = false;
}
}
catch(const EndOfEntityException& toCatch)
{
//
// If the external entity ended while we were in markup, then that's
// a partial markup error.
//
if (inMarkup)
{
getScanner()->emitError(XML4CErrs::PartialMarkupInEntity);
inMarkup = false;
}
// If we were in char data, then send what we got
if (inCharData)
{
// Send what is in the whitespace buffer. The exception is not
// rethrown; we either leave the loop below or go around again.
if (fDocTypeHandler)
{
fDocTypeHandler->doctypeWhitespace
(
bbSpace.getRawBuffer()
, bbSpace.getLen()
);
}
inCharData = false;
}
//
// If the entity that just ended was the entity that we started
// on, then this is the end of the external subset. Otherwise go
// back into the inner loop and continue scanning.
//
if (orgReader == toCatch.getReaderNum())
break;
}
}
// If we have a doc type handler, tell it the ext subset ends
if (fDocTypeHandler)
fDocTypeHandler->endExtSubset();
}
//
// This method will scan for an id, either public or external.
//
bool DTDValidator::scanId(          XMLBuffer&  pubIdToFill
                            ,       XMLBuffer&  sysIdToFill
                            , const IDTypes     whatKind)
{
    //
    //  Scans an id introduced by either the system id keyword or the public
    //  id keyword. pubIdToFill and sysIdToFill receive the literals that
    //  were found; whatKind says which forms the caller accepts. Returns
    //  true when an acceptable id was scanned, false otherwise.
    //
    //  Both buffers are emptied up front, so nothing stale survives a
    //  failed or partial scan.
    //
    pubIdToFill.reset();
    sysIdToFill.reset();

    const bool publicOnly   = (whatKind == IDType_Public);
    const bool systemNeeded = (whatKind == IDType_External);

    //
    //  First possibility: the system id keyword. It must be followed by
    //  whitespace and then the system literal. A caller that only accepts
    //  a public id treats this as a failure.
    //
    if (getReaderMgr()->skippedString(XMLUni::fgSysIDString))
    {
        if (publicOnly)
        {
            getScanner()->emitError(XML4CErrs::ExpectedPublicId);
            return false;
        }

        if (getReaderMgr()->skipPastSpaces())
            return scanSystemLiteral(sysIdToFill);

        getScanner()->emitError(XML4CErrs::ExpectedWhitespace);
        return false;
    }

    // Second possibility: the public id keyword. Without it there is no id
    if (!getReaderMgr()->skippedString(XMLUni::fgPubIDString))
        return false;

    //
    //  Whitespace is expected before the public literal. If it is missing
    //  we always complain, but we carry on when a quote comes next since
    //  the author most likely just forgot the space.
    //
    if (!getReaderMgr()->skipPastSpaces())
    {
        getScanner()->emitError(XML4CErrs::ExpectedWhitespace);

        const XMLCh upcoming = getReaderMgr()->peekNextChar();
        const bool quoteAhead = (upcoming == chDoubleQuote)
                                || (upcoming == chSingleQuote);
        if (!quoteAhead)
            return false;
    }

    if (!scanPublicLiteral(pubIdToFill))
    {
        getScanner()->emitError(XML4CErrs::ExpectedPublicId);
        return false;
    }

    // A caller that only wants the public id is satisfied at this point
    if (publicOnly)
        return true;

    //
    //  Now for the system literal. When there is no separating whitespace:
    //
    //    - a quote next means the space was forgotten: report it and go
    //      on to scan the literal, whatever kind was requested;
    //    - no quote next means there is no system literal: that is an
    //      error (and a failure) for an external id, and a perfectly good
    //      stopping point otherwise.
    //
    if (!getReaderMgr()->skipPastSpaces())
    {
        const XMLCh upcoming = getReaderMgr()->peekNextChar();
        const bool quoteAhead = (upcoming == chDoubleQuote)
                                || (upcoming == chSingleQuote);

        if (systemNeeded || quoteAhead)
            getScanner()->emitError(XML4CErrs::ExpectedWhitespace);

        if (!quoteAhead)
            return !systemNeeded;
    }

    //
    //  A system literal that fails to scan is only held against the caller
    //  when an external id was demanded.
    //
    if (scanSystemLiteral(sysIdToFill) || !systemNeeded)
        return true;

    getScanner()->emitError(XML4CErrs::ExpectedSystemId);
    return false;
}
//
// This method will scan the contents of an ignored section. It assumes that
// we already are in the body, i.e. we've seen <![IGNORE[ at this point. So
// we have to just scan until we see a matching ]]> closing markup.
//
void DTDValidator::scanIgnoredSection()
{
//
// Depth starts at one because we are already in one section and want
// to parse until we hit its end.
//
unsigned long depth = 1;
while (true)
{
const XMLCh nextCh = getReaderMgr()->getNextChar();
if (!nextCh)
ThrowXML(UnexpectedEOFException, XML4CExcepts::Gen_UnexpectedEOF);
if (nextCh == chOpenAngle)
{
if (getReaderMgr()->skippedChar(chBang)
&& getReaderMgr()->skippedChar(chOpenSquare))
{
depth++;
}
}
else if (nextCh == chCloseSquare)
{
if (getReaderMgr()->skippedChar(chCloseSquare))
{
while (getReaderMgr()->skippedChar(chCloseSquare))
{
// Do nothing, just skip them
}
if (getReaderMgr()->skippedChar(chCloseAngle))
{
depth--;
if (!depth)
break;
}
}
}
else if (!XMLReader::isXMLChar(nextCh))
{
getScanner()->emitError(XML4CErrs::InvalidCharacter);
}
}
}
//
// This method scans the entire internal subset. All we can have here is
// decl markup, and PE references. The expanded PE references must contain
// whole markup, so we don't have to worry about their content at this
// level. We just scan them, expand them, push them, and parse their content
// right there, via the expandPERef() method.
//
bool DTDValidator::scanInternalSubset()
{
// If we have a doc type handler, tell it the internal subset starts
if (fDocTypeHandler)
fDocTypeHandler->startIntSubset();
// Get a buffer for whitespace
XMLBufBid bbSpace(getBufMgr());
bool noErrors = true;
while (true)
{
const XMLCh nextCh = getReaderMgr()->peekNextChar();
//
// If we get an end of file marker, just unget it and return a
// failure status. The caller will then see the end of file and
// faill out correctly.
//
if (!nextCh)
return false;
// Watch for the end of internal subset marker
if (nextCh == chCloseSquare)
{
getReaderMgr()->getNextChar();
break;
}
if (nextCh == chPercent)
{
//
// Expand (and scan if external) the reference value. Tell
// it to set the reader to cause an end of entity exception
// when this reader dies, which is what the scanExtSubset
// method wants (who is called to scan this.)
//
getReaderMgr()->getNextChar();
expandPERef(true, false, false, true);
}
else if (nextCh == chOpenAngle)
{
// Remember this reader before we start the scan
const unsigned int orgReader = getReaderMgr()->getCurrentReaderNum();
// And scan this markup
getReaderMgr()->getNextChar();
scanMarkupDecl(false);
// If we did not get back to entry level, then partial markup
if (getReaderMgr()->getCurrentReaderNum() != orgReader)
getScanner()->emitError(XML4CErrs::PartialMarkupInEntity);
}
else if (XMLReader::isWhitespace(nextCh))
{
//
// IF we are doing advanced callbacks and have a doc type
// handler, then get the whitespace and call the doc type
// handler with it. Otherwise, just skip whitespace.
//
if (fDocTypeHandler)
{
getReaderMgr()->getSpaces(bbSpace.getBuffer());
fDocTypeHandler->doctypeWhitespace
(
bbSpace.getRawBuffer()
, bbSpace.getLen()
);
}
else
{
getReaderMgr()->skipPastSpaces();
}
}
else
{
getReaderMgr()->getNextChar();
// Not valid, so emit an error
getScanner()->emitError(XML4CErrs::InvalidCharacterInIntSubset);
//
// If an '>', then probably an abnormally terminated
// internal subset so just return.
//
if (nextCh == chCloseAngle)
{
noErrors = false;
break;
}
//
// Otherwise, try to sync back up by scanning forward for
// a reasonable start character.
//
static const XMLCh toSkip[] =
{
chPercent, chCloseSquare, chOpenAngle, chNull
};
getReaderMgr()->skipUntilInOrWS(toSkip);
}
}
// If we have a doc type handler, tell it the internal subset ends
if (fDocTypeHandler)
fDocTypeHandler->endIntSubset();
return noErrors;
}
//
// This method is called once we see a < in the input of an int/ext subset,
// which indicates the start of some sort of markup.
//
void DTDValidator::scanMarkupDecl(const bool parseTextDecl)
{
//
// We only have two valid first characters here. One is a ! which opens
// some markup decl. The other is a ?, which could begin either a PI
// or a text decl. If parseTextDecl is false, we cannot accept a text
// decl.
//
const XMLCh nextCh = getReaderMgr()->getNextChar();
if (nextCh == chBang)
{
if (getReaderMgr()->skippedChar(chDash))
{
if (getReaderMgr()->skippedChar(chDash))
{
scanComment();
}
else
{
getScanner()->emitError(XML4CErrs::CommentsMustStartWith);
getReaderMgr()->skipPastChar(chCloseAngle);
}
}
else if (getReaderMgr()->skippedChar(chOpenSquare))
{
//
// Its a conditional section. This is only valid in the external
// subset, so issue an error if we aren't there.
//
if (fInternalSubset)
{
getScanner()->emitError(XML4CErrs::ConditionalSectInIntSubset);
getReaderMgr()->skipPastChar(chCloseAngle);
return;
}
// A PE ref can happen here, but space is not required
checkForPERef(false, false, true);
if (getReaderMgr()->skippedString(XMLUni::fgIncludeString))
{
checkForPERef(false, false, true);
// Check for the following open square bracket
if (!getReaderMgr()->skippedChar(chOpenSquare))
getScanner()->emitError(XML4CErrs::ExpectedINCLUDEBracket);
checkForPERef(false, false, true);
//
// Recurse back to the ext subset call again, telling it its
// in an include section.
//
scanExtSubsetDecl(true);
}
else if (getReaderMgr()->skippedString(XMLUni::fgIgnoreString))
{
checkForPERef(false, false, true);
// Check for the following open square bracket
if (!getReaderMgr()->skippedChar(chOpenSquare))
getScanner()->emitError(XML4CErrs::ExpectedINCLUDEBracket);
checkForPERef(false, false, true);
// And scan over the ignored part
scanIgnoredSection();
}
else
{
getScanner()->emitError(XML4CErrs::ExpectedIncOrIgn);
getReaderMgr()->skipPastChar(chCloseAngle);
}
}
else if (getReaderMgr()->skippedString(XMLUni::fgAttListString))
{
scanAttListDecl();
}
else if (getReaderMgr()->skippedString(XMLUni::fgElemString))
{
scanElementDecl();
}
else if (getReaderMgr()->skippedString(XMLUni::fgEntityString))
{
scanEntityDecl();
}
else if (getReaderMgr()->skippedString(XMLUni::fgNotationString))
{
scanNotationDecl();
}
else
{
getScanner()->emitError(XML4CErrs::ExpectedMarkupDecl);
getReaderMgr()->skipPastChar(chCloseAngle);
}
}
else if (nextCh == chQuestion)
{
// It could be a PI or the XML declaration. Check for Decl
bool gotDecl = getReaderMgr()->skippedString(XMLUni::fgXMLStringSpace);
//
// Just in case, check for upper case. If found, issue
// an error, but keep going.
//
if (!gotDecl)
{
gotDecl = getReaderMgr()->skippedString(XMLUni::fgXMLStringSpaceU);
if (gotDecl)
getScanner()->emitError(XML4CErrs::XMLDeclMustBeLowerCase);
}
if (gotDecl)
{
// If we are not accepting text decls, its an error
if (parseTextDecl)
{
scanTextDecl();
}
else
{
// Emit the error and skip past this markup
getScanner()->emitError(XML4CErrs::TextDeclNotLegalHere);
getReaderMgr()->skipPastChar(chCloseAngle);
}
}
else
{
// It has to be a PI
scanPI();
}
}
else
{
// Can't be valid so emit error and try to skip past end of this decl
getScanner()->emitError(XML4CErrs::ExpectedMarkupDecl);
getReaderMgr()->skipPastChar(chCloseAngle);
}
}
//
// This method is called for a mixed model element's content model. We've
// already scanned past the '(PCDATA' part by the time we get here. So
// everything else is element names separated by | characters until we
// hit the end. The passed element decl's content model is filled in with
// the information found.
//
bool DTDValidator::scanMixed(DTDElementDecl& toFill)
{
    //
    //  Scans the rest of a mixed content model and builds the matching
    //  content spec tree. On success the tree is installed as the content
    //  spec of toFill and true is returned. On a syntax error an error is
    //  emitted, the partially built tree is destroyed, toFill is left
    //  untouched and false is returned.
    //
    //  Until the tree is handed to toFill it is owned by this method. The
    //  scanning calls made below can let exceptions escape (the external
    //  subset scanner catches EndOfEntityException raised from inside
    //  markup scanning), so the scan runs inside a try block that destroys
    //  the tree before passing the exception on. Without that, the tree
    //  would be leaked whenever an exception crossed this frame.
    //

    //
    //  The terminating star is only required if there is something more
    //  than (PCDATA).
    //
    bool starRequired = false;

    // Get a buffer to be used below to get element names
    XMLBufBid bbName(getBufMgr());
    XMLBuffer& nameBuf = bbName.getBuffer();

    //
    //  Create an initial content spec node. Its just a leaf node with a
    //  PCDATA element id. This current node pointer will be pushed down the
    //  tree as we go.
    //
    ContentSpecNode* curNode = new ContentSpecNode(XMLElementDecl::fgPCDataElemId);

    //
    //  headNode is always the root of everything built so far, so deleting
    //  it releases the whole tree. orgNode remembers the initial leaf so
    //  that we can tell when the first choice node has to be created.
    //
    ContentSpecNode* headNode = curNode;
    ContentSpecNode* orgNode = curNode;

    try
    {
        //
        //  We just loop around, getting the | character at the top and
        //  then looking for the next element name. We keep up with the
        //  last node and add each new one to its right node.
        //
        while (true)
        {
            // Spaces are legal here, so check for a PE ref, but don't require space
            checkForPERef(false, false, true);

            //
            //  If its a star, then tell them they can't have reps in
            //  mixed model, but eat it and keep going.
            //
            if (getReaderMgr()->skippedChar(chAsterisk))
            {
                getScanner()->emitError(XML4CErrs::NoRepInMixed);
                continue;
            }

            // Check for the next choice indicator
            if (!getReaderMgr()->skippedChar(chPipe))
            {
                // Has to be the closing paren now.
                if (!getReaderMgr()->skippedChar(chCloseParen))
                {
                    getScanner()->emitError(XML4CErrs::UnterminatedContentModel);
                    delete headNode;
                    return false;
                }

                if (!getReaderMgr()->skippedChar(chAsterisk) && starRequired)
                    getScanner()->emitError(XML4CErrs::ExpectedAsterisk);

                //
                //  Create a zero or more node and make the original head
                //  node its first child. The new node becomes the root, so
                //  headNode still covers the complete tree.
                //
                headNode = new ContentSpecNode
                (
                    ContentSpecNode::ZeroOrMore
                    , headNode
                    , 0
                );
                break;
            }

            // Its more than just a PCDATA, so an ending star will be required now
            starRequired = true;

            // Space is legal here so check for a PE ref, but don't require space
            checkForPERef(false, false, true);

            // Get a name token
            if (!getReaderMgr()->getName(nameBuf))
            {
                getScanner()->emitError(XML4CErrs::ExpectedElementName);
                delete headNode;
                return false;
            }

            //
            //  Create a leaf node for it. If we can find the element id for
            //  this element, then use it. Else, we have to fault in an
            //  element decl, marked as created because of being in a
            //  content model.
            //
            unsigned int elemId = findElemId(nameBuf.getRawBuffer());
            if (elemId == XMLElementDecl::fgInvalidElemId)
            {
                DTDElementDecl* decl = new DTDElementDecl(nameBuf.getRawBuffer());
                decl->setCreateReason(XMLElementDecl::InContentModel);
                fElemDeclPool->put(decl);
                elemId = decl->getId();
            }

            //
            //  If the current node is the original node, this is the first
            //  choice node, so create an initial choice node with the
            //  current node and the new element id. Store this as the head
            //  node.
            //
            //  Otherwise, we have to steal the right node of the previous
            //  choice and weave in another choice node there, which has the
            //  old right node as its left and the new leaf as its right.
            //
            if (curNode == orgNode)
            {
                curNode = new ContentSpecNode
                (
                    ContentSpecNode::Choice
                    , curNode
                    , new ContentSpecNode(elemId)
                );

                // Remember the top node
                headNode = curNode;
            }
            else
            {
                ContentSpecNode* oldRight = curNode->orphanSecond();
                curNode->setSecond
                (
                    new ContentSpecNode
                    (
                        ContentSpecNode::Choice
                        , oldRight
                        , new ContentSpecNode(elemId)
                    )
                );

                // Make the new right node the current node
                curNode = curNode->getSecond();
            }
        }
    }
    catch(...)
    {
        //
        //  Something below us bailed out while we still owned the tree.
        //  Release it and let the exception continue on its way.
        //
        delete headNode;
        throw;
    }

    //
    //  Store the head node as the content spec of the element. This is done
    //  outside of the guarded region so that the tree is never deleted
    //  here once the element decl has been given it.
    //
    toFill.setContentSpec(headNode);
    return true;
}
//
// This method is called when we see a '<!NOTATION' string while scanning
// markup decl. It parses out the notation and its id and stores a new
// notation decl object in the notation decl pool.
//
void DTDValidator::scanNotationDecl()
{
// Space is required here so check for a PE ref, and require space
if (!checkForPERef(true, false, true))
{
getScanner()->emitError(XML4CErrs::ExpectedWhitespace);
getReaderMgr()->skipPastChar(chCloseAngle);
return;
}
//
// And now we get a name, which is the name of the notation. Get a
// buffer for the name.
//
XMLBufBid bbName(getBufMgr());
if (!getReaderMgr()->getName(bbName.getBuffer()))
{
getScanner()->emitError(XML4CErrs::ExpectedNotationName);
getReaderMgr()->skipPastChar(chCloseAngle);
return;
}
// If namespaces are enabled, then no colons allowed
if (getScanner()->getDoNamespaces())
{
if (XMLString::indexOf(bbName.getRawBuffer(), chColon) != -1)
getScanner()->emitError(XML4CErrs::ColonNotLegalWithNS);
}
// Space is required here so check for a PE ref, and require space
if (!checkForPERef(true, false, true))
{
getScanner()->emitError(XML4CErrs::ExpectedWhitespace);
getReaderMgr()->skipPastChar(chCloseAngle);
return;
}
//
// And scan an external or public id. We need buffers to use for both
// of these.
//
XMLBufBid bbPubId(getBufMgr());
XMLBufBid bbSysId(getBufMgr());
if (!scanId(bbPubId.getBuffer(), bbSysId.getBuffer(), IDType_Either))
{
getReaderMgr()->skipPastChar(chCloseAngle);
return;
}
// We can have an optional space or PE ref here
checkForPERef(false, false, true);
//
// See if it already exists. If so, add it to the notatino decl pool.
// Otherwise, if advanced callbacks are on, create a temp one and
// call out for that one.
//
XMLNotationDecl* decl = fNotationDeclPool->getByKey(bbName.getRawBuffer());
bool isIgnoring = (decl != 0);
if (isIgnoring)
{
getScanner()->emitError(XML4CErrs::NotationAlreadyExists, bbName.getRawBuffer());
}
else
{
// Fill in a new notation declaration and add it to the pool
decl = new XMLNotationDecl
(
bbName.getRawBuffer()
, bbPubId.getRawBuffer()
, bbSysId.getRawBuffer()
);
fNotationDeclPool->put(decl);
}
//
// If we have a document type handler, then tell it about this. If we
// are ignoring it, only call out if advanced callbacks are enabled.
//
if (fDocTypeHandler)
{
fDocTypeHandler->notationDecl
(
*decl
, isIgnoring
);
}
// And one more optional space or PE ref
checkForPERef(false, false, true);
// And skip the terminating bracket
if (!getReaderMgr()->skippedChar(chCloseAngle))
getScanner()->emitError(XML4CErrs::UnterminatedNotationDecl);
}
//
// Scans a PI and calls the appropriate callbacks. A PI can happen in either
// the document or the DTD, so it calls the appropriate handler according
// to the fInDocument flag.
//
// At entry we have just scanned the <? part, and need to now start on the
// PI target name.
//
void DTDValidator::scanPI()
{
const XMLCh* namePtr = 0;
const XMLCh* targetPtr = 0;
// And skip any subsequent spaces before the name
getReaderMgr()->skipPastSpaces();
// Get a buffer for the PI name and scan it in
XMLBufBid bbName(getBufMgr());
if (!getReaderMgr()->getNameToken(bbName.getBuffer()))
{
getScanner()->emitError(XML4CErrs::PINameExpected);
getReaderMgr()->skipPastChar(chCloseAngle);
return;
}
// Point the name pointer at the raw data
namePtr = bbName.getRawBuffer();
// See if it issome form of 'xml' and emit a warning
if (XMLString::compareIString(namePtr, XMLUni::fgXMLString))
getScanner()->emitError(XML4CErrs::NoPIStartsWithXML);
// If namespaces are enabled, then no colons allowed
if (getScanner()->getDoNamespaces())
{
if (XMLString::indexOf(namePtr, chColon) != -1)
getScanner()->emitError(XML4CErrs::ColonNotLegalWithNS);
}
//
// If we don't hit a space next, then the PI has no target. If we do
// then get out the target. Get a buffer for it as well
//
XMLBufBid bbTarget(getBufMgr());
if (getReaderMgr()->skippedSpace())
{
// Skip any leading spaces
getReaderMgr()->skipPastSpaces();
// It does have a target, so lets move on to deal with that.
while (1)
{
const XMLCh nextCh = getReaderMgr()->getNextChar();
// Watch for an end of file, which is always bad here
if (!nextCh)
{
getScanner()->emitError(XML4CErrs::UnterminatedPI);
ThrowXML(UnexpectedEOFException, XML4CExcepts::Gen_UnexpectedEOF);
}