blob: b36aea100e57d06515bf5f2f92736fd9d227f2af [file] [log] [blame]
/*
* The Apache Software License, Version 1.1
*
* Copyright (c) 1999 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Xerces" and "Apache Software Foundation" must
* not be used to endorse or promote products derived from this
* software without prior written permission. For written
* permission, please contact apache\@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* nor may "Apache" appear in their name, without prior written
* permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation, and was
* originally based on software copyright (c) 1999, International
* Business Machines, Inc., http://www.ibm.com . For more information
* on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
/**
* $Log$
* Revision 1.1 1999/11/09 01:08:24 twl
* Initial revision
*
* Revision 1.6 1999/11/08 20:56:55 droddey
* If the main xml entity does not exist, we need to get the error handling for that
* inside the main XMLScanner::scanDocument() try block so that it gets reported
* in the normal way. We have to add a little extra safety code because, when this
* happens, there is no reader on the reader stack to get position info from.
*
* Revision 1.5 1999/11/08 20:44:52 rahul
* Swat for adding in Product name and CVS comment log variable.
*
*/
// ---------------------------------------------------------------------------
// This file holds some of the grunt work methods of XMLScanner.cpp to keep
// it a little more readable.
// ---------------------------------------------------------------------------
// ---------------------------------------------------------------------------
// Includes
// ---------------------------------------------------------------------------
#include <util/BinMemInputStream.hpp>
#include <util/Janitor.hpp>
#include <util/PlatformUtils.hpp>
#include <util/UnexpectedEOFException.hpp>
#include <util/URL.hpp>
#include <util/XMLUni.hpp>
#include <sax/InputSource.hpp>
#include <framework/XMLErrorReporter.hpp>
#include <framework/XMLDocumentHandler.hpp>
#include <framework/XMLElementDecl.hpp>
#include <framework/XMLEntityDecl.hpp>
#include <framework/XMLEntityHandler.hpp>
#include <framework/XMLPScanToken.hpp>
#include <framework/XMLRefInfo.hpp>
#include <framework/XMLValidator.hpp>
#include <internal/XMLScanner.hpp>
#include <internal/EndOfEntityException.hpp>
#include <internal/URLInputSource.hpp>
// ---------------------------------------------------------------------------
// XMLScanner: Private helper methods
// ---------------------------------------------------------------------------
//
//  Builds the list of XMLAttr objects that is passed out in the start tag
//  callout. It is called from scanStartTag().
//
//  The explicitly provided attributes arrive as raw (not yet normalized)
//  key/value pairs. Each one is split into prefix and local name, its prefix
//  is mapped to a URI id, its value is normalized, and the result is stored
//  in the output vector. After that, every attribute which the element
//  declares with a Default or Fixed default type, and which was not provided
//  explicitly, is faulted in as well.
//
//  providedAttrs   The raw name/value pairs scanned from the start tag.
//  attCount        How many entries of providedAttrs to process.
//  elemDecl        The declaration of the element owning the attributes.
//  toFill          The attribute vector to fill. Entries already in it are
//                  reused; new ones are added only once those run out, so
//                  the vector may hold more entries than are valid.
//
//  Returns the number of valid attributes at the front of toFill.
//
unsigned int
XMLScanner::buildAttList(const  RefVectorOf<KVStringPair>&  providedAttrs
                        , const unsigned int                attCount
                        ,       XMLElementDecl&             elemDecl
                        ,       RefVectorOf<XMLAttr>&       toFill)
{
    //
    //  Ask the element to clear the 'provided' flag on all of the att defs
    //  that it owns, and to tell us whether it has any defs at all.
    //
    const bool hasDefs = elemDecl.resetDefs();

    //
    //  No explicitly provided attributes and no declared ones means there
    //  is nothing to do.
    //
    if (!hasDefs && !attCount)
        return 0;

    // Keep up with how many attrs we end up with in total
    unsigned int retCount = 0;

    //
    //  Remember the current size of the output vector. Slots below this
    //  index are reused; beyond it new XMLAttr objects have to be added.
    //
    const unsigned int curAttListSize = toFill.size();

    // A buffer into which raw scanned attribute values get normalized
    XMLBufBid bbNormal(&fBufMgr);
    XMLBuffer& normBuf = bbNormal.getBuffer();

    //
    //  Loop through the explicitly provided attributes, which are in the
    //  raw scanned form, and build up XMLAttr objects.
    //
    unsigned int index;
    for (index = 0; index < attCount; index++)
    {
        const KVStringPair* curPair = providedAttrs.elementAt(index);

        //
        //  Split the QName into prefix and local part. We work on a private
        //  copy of the name so that the colon can be overwritten with a
        //  null; the janitor releases the copy at the end of the iteration.
        //
        XMLCh* namePtr = XMLString::replicate(curPair->getKey());
        ArrayJanitor<XMLCh> janName(namePtr);
        const int colonInd = XMLString::indexOf(namePtr, chColon);
        const XMLCh* prefPtr = XMLUni::fgZeroLenString;
        const XMLCh* suffPtr = XMLUni::fgZeroLenString;
        if (colonInd != -1)
        {
            namePtr[colonInd] = chNull;
            prefPtr = namePtr;
            suffPtr = &namePtr[colonInd + 1];
        }
        else
        {
            // No colon, so we just have a name with no prefix
            suffPtr = namePtr;
        }

        //
        //  Map the prefix to a URI id, telling the resolver that this is an
        //  attribute prefix.
        //
        const unsigned int uriId = resolvePrefix(prefPtr, ElemStack::Mode_Attribute);

        //
        //  Attributes in the xmlns or xml namespace, and a plain attribute
        //  whose local name is 'xmlns', are handled specially below.
        //
        const bool isNSAttr = (uriId == fValidator->getXMLNSNamespaceId())
                              || (uriId == fValidator->getXMLNamespaceId())
                              || !XMLString::compareString(suffPtr, XMLUni::fgXMLNSString);

        XMLAttDef::AttTypes attType;
        if (!isNSAttr)
        {
            //
            //  Find this attribute within the parent element. Both the
            //  uriId/name pair and the raw QName are passed, since we don't
            //  know how the derived validator and its elements store
            //  attributes. An unknown attribute is added on the fly and
            //  wasAdded reports whether that happened.
            //
            bool wasAdded;
            XMLAttDef* attDef = elemDecl.findAttr
            (
                curPair->getKey()
                , uriId
                , suffPtr
                , XMLElementDecl::AddIfNotFound
                , wasAdded
            );

            //
            //  It was not declared for this element, so issue an error if
            //  we are validating. (We already know this is not a namespace
            //  attribute, so no further test for that is needed here.)
            //
            if (wasAdded && fDoValidation)
            {
                XMLBuffer bufURI;
                fValidator->getURIText(uriId, bufURI);
                fValidator->emitError
                (
                    XML4CValid::AttNotDefinedForElement
                    , bufURI.getRawBuffer()
                    , suffPtr
                    , elemDecl.getFullName()
                );
            }

            // Mark this one as provided (even if it was faulted in)
            attDef->setProvided(true);

            //
            //  Normalize the raw value now that the attribute type is
            //  known. The return status is ignored; on failure an error has
            //  already been issued, which is all we care about.
            //
            normalizeAttValue(curPair->getValue(), attDef->getType(), normBuf);

            // Only attributes that really were declared get validated
            if (!wasAdded)
            {
                fValidator->validateAttrValue
                (
                    *attDef
                    , normBuf.getRawBuffer()
                );
            }

            // Save the type for later use
            attType = attDef->getType();

            //
            //  Make sure it's not a dup of anything stored so far. A
            //  duplicate is reported but still added to the list below.
            //
            for (unsigned int inner = 0; inner < retCount; inner++)
            {
                // If they have the same URI, then compare names
                if (uriId == toFill.elementAt(inner)->getURIId())
                {
                    if (!XMLString::compareString(suffPtr, toFill.elementAt(inner)->getName()))
                    {
                        emitError
                        (
                            XML4CErrs::AttrAlreadyUsedInSTag
                            , attDef->getFullName()
                            , elemDecl.getFullName()
                        );
                    }
                }
            }
        }
        else
        {
            // Namespace related attributes are just normalized as CDATA
            attType = XMLAttDef::CData;
            normalizeAttValue(curPair->getValue(), XMLAttDef::CData, normBuf);
        }

        //
        //  Store the attribute in the output vector, reusing an existing
        //  element when there is one and expanding the vector otherwise.
        //
        XMLAttr* curAttr;
        if (retCount >= curAttListSize)
        {
            curAttr = new XMLAttr
            (
                uriId
                , suffPtr
                , prefPtr
                , normBuf.getRawBuffer()
                , attType
                , true
            );
            toFill.addElement(curAttr);
        }
        else
        {
            curAttr = toFill.elementAt(retCount);
            curAttr->set
            (
                uriId
                , suffPtr
                , prefPtr
                , normBuf.getRawBuffer()
                , attType
            );
            curAttr->setSpecified(true);
        }

        // Bump the count of attrs in the list
        retCount++;
    }

    //
    //  Now, if there are any attributes declared by this element, go
    //  through them to make sure that the required ones were provided, and
    //  fault in any fixed or defaulted ones that were not provided
    //  literally.
    //
    if (hasDefs)
    {
        XMLAttDefList& attDefList = elemDecl.getAttDefList();
        while (attDefList.hasMoreElements())
        {
            // Get the current att def and its default type
            const XMLAttDef& curDef = attDefList.nextElement();
            const XMLAttDef::DefAttTypes defType = curDef.getDefaultType();

            if (!curDef.getProvided())
            {
                // A missing required attribute is an error when validating
                if (fDoValidation)
                {
                    if (defType == XMLAttDef::Required)
                    {
                        fValidator->emitError
                        (
                            XML4CValid::RequiredAttrNotProvided
                            , curDef.getFullName()
                        );
                    }
                }

                // Fault in the value if needed, and bump the att count
                if ((defType == XMLAttDef::Default)
                ||  (defType == XMLAttDef::Fixed))
                {
                    //
                    //  Work on the vector we were handed, not on the
                    //  scanner's own attribute list member. The size
                    //  captured above and the entries written by the first
                    //  loop belong to toFill, so the faulted in attributes
                    //  have to land in the same container.
                    //
                    XMLAttr* curAtt;
                    if (retCount >= curAttListSize)
                    {
                        curAtt = new XMLAttr;
                        fValidator->faultInAttr(*curAtt, curDef);
                        toFill.addElement(curAtt);
                    }
                    else
                    {
                        curAtt = toFill.elementAt(retCount);
                        fValidator->faultInAttr(*curAtt, curDef);
                    }

                    //
                    //  Map the new attribute's prefix to a URI id and store
                    //  that in the attribute object.
                    //
                    curAtt->setURIId
                    (
                        resolvePrefix(curAtt->getPrefix(), ElemStack::Mode_Attribute)
                    );

                    // Indicate it was not explicitly specified and bump count
                    curAtt->setSpecified(false);
                    retCount++;
                }
            }
        }
    }
    return retCount;
}
//
// This method is called after the content scan to insure that all the
// ID/IDREF attributes match up (i.e. that all IDREFs refer to IDs.) This is
// an XML 1.0 rule, so we can do here in the core.
//
void XMLScanner::checkIDRefs()
{
//
//
// Iterate the id ref list. If we find any entries here which are used
// but not declared, then that's an error.
//
RefHashTableOfEnumerator<XMLRefInfo> refEnum(fIDRefList);
while (refEnum.hasMoreElements())
{
// Get a ref to the current element
const XMLRefInfo& curRef = refEnum.nextElement();
// If its used but not declared, then its an error
if (!curRef.getDeclared() && curRef.getUsed())
fValidator->emitError(XML4CValid::IDNotDeclared, curRef.getRefName());
}
}
//
//  Tells whether the passed progressive scan token is legal for this
//  scanner, i.e. whether both its scanner id and its sequence id match the
//  ones this scanner currently holds.
//
bool XMLScanner::isLegalToken(const XMLPScanToken& toCheck)
{
    if (fScannerId != toCheck.fScannerId)
        return false;

    return (fSequenceId == toCheck.fSequenceId);
}
//
//  Takes a raw attribute value and normalizes it according to the rules of
//  the attribute type, putting the result into the passed buffer.
//
//  Characters that came from an escape (a char ref) are expected to be
//  prefixed in the raw value by a 0xFFFF marker. The marker itself is never
//  copied to the output. An escaped character is exempt from the '<' check
//  and, for CDATA values, from the whitespace replacement.
//
//  For CDATA every unescaped whitespace character becomes a single space.
//  For all other types leading and trailing whitespace is dropped and each
//  inner run of whitespace collapses into one space; in this mode escaped
//  characters take part in the collapsing like any other character.
//
//  value   The null terminated raw value to normalize.
//  type    The attribute type that selects the normalization rules.
//  toFill  The buffer that receives the result; it is emptied first.
//
//  Returns false if an unescaped '<' was found (an error is emitted for
//  each one), true otherwise. The value is normalized in either case.
//
bool XMLScanner::normalizeAttValue( const   XMLCh* const        value
                                  , const   XMLAttDef::AttTypes type
                                  ,         XMLBuffer&          toFill)
{
    // A simple state value for a whitespace processing state machine
    enum States
    {
        InWhitespace
        , InContent
    };

    // Assume it will go fine, and empty the target buffer in preparation
    bool retVal = true;
    toFill.reset();

    //
    //  Loop through the chars of the source value and normalize it
    //  according to the type.
    //
    States curState = InContent;
    bool escaped;
    bool firstNonWS = false;
    XMLCh nextCh;
    const XMLCh* srcPtr = value;
    while (*srcPtr)
    {
        //
        //  Get the next character from the source, watching for the 0xFFFF
        //  marker that announces an escaped character.
        //
        nextCh = *srcPtr;
        escaped = (nextCh == 0xFFFF);
        if (escaped)
        {
            nextCh = *++srcPtr;

            //
            //  A marker sitting right in front of the terminator has
            //  nothing to escape. Stop here; otherwise the null would be
            //  appended and the pointer bumped past the end of the value.
            //
            if (!nextCh)
                break;
        }

        //
        //  If it's not escaped, then make sure it's not a < character,
        //  which is not allowed in attribute values.
        //
        if (!escaped && (nextCh == chOpenAngle))
        {
            emitError(XML4CErrs::BracketInAttrValue);
            retVal = false;
        }

        if (type == XMLAttDef::CData)
        {
            //
            //  Replacing a 0x20 with a 0x20 is redundant, but simpler than
            //  checking for 9, A and D separately.
            //
            if (!escaped && XMLReader::isWhitespace(nextCh))
                nextCh = chSpace;
        }
        else
        {
            if (curState == InWhitespace)
            {
                if (!XMLReader::isWhitespace(nextCh))
                {
                    //
                    //  End of a whitespace run. It is replaced by a single
                    //  space, unless it was leading whitespace.
                    //
                    if (firstNonWS)
                        toFill.append(chSpace);
                    curState = InContent;
                    firstNonWS = true;
                }
                else
                {
                    // Still inside the run, so just skip it
                    srcPtr++;
                    continue;
                }
            }
            else if (curState == InContent)
            {
                if (XMLReader::isWhitespace(nextCh))
                {
                    //
                    //  Start of a whitespace run. Nothing is emitted until
                    //  we know that more content follows, which is what
                    //  drops trailing whitespace.
                    //
                    curState = InWhitespace;
                    srcPtr++;
                    continue;
                }
                firstNonWS = true;
            }
        }

        // Add this char to the target buffer
        toFill.append(nextCh);

        // And move up to the next character in the source
        srcPtr++;
    }
    return retVal;
}
//
//  Maps a namespace prefix to a URI id.
//
//  The prefixes 'xmlns' and 'xml' (compared without regard to case) are
//  answered directly with the validator's special ids for them. Any other
//  prefix is looked up through the element stack, using the passed mapping
//  mode. If the element stack reports the prefix as unknown, an
//  UnknownPrefix error is emitted and the id it handed back is returned
//  anyway.
//
unsigned int
XMLScanner::resolvePrefix(  const   XMLCh* const        prefix
                          , const   ElemStack::MapModes mode)
{
    // The reserved prefixes never go through the element stack
    const bool isXMLNSPrefix =
        (XMLString::compareIString(prefix, XMLUni::fgXMLNSString) == 0);
    if (isXMLNSPrefix)
        return fValidator->getXMLNSNamespaceId();

    const bool isXMLPrefix =
        (XMLString::compareIString(prefix, XMLUni::fgXMLString) == 0);
    if (isXMLPrefix)
        return fValidator->getXMLNamespaceId();

    // Everything else is resolved by searching up the element stack
    bool notMapped;
    const unsigned int mappedId = fElemStack.mapPrefixToURI(prefix, mode, notMapped);

    // An id comes back even for an unknown prefix, but it is still an error
    if (notMapped)
        emitError(XML4CErrs::UnknownPrefix, prefix);

    return mappedId;
}
//
//  Maps a namespace prefix to a URI id and also fetches the text of that
//  URI into the passed buffer.
//
//  The prefixes 'xmlns' and 'xml' (compared without regard to case) are
//  answered directly with the validator's special ids for them. Note that
//  in those two cases the function returns before touching bufToFill, so
//  the buffer keeps whatever it held on entry.
//
//  Any other prefix is looked up through the element stack, using the
//  passed mapping mode. An unknown prefix causes an UnknownPrefix error,
//  but the URI text for the id that came back is still stored.
//
unsigned int
XMLScanner::resolvePrefix(  const   XMLCh* const        prefix
                          ,         XMLBuffer&          bufToFill
                          , const   ElemStack::MapModes mode)
{
    // The reserved prefixes never go through the element stack
    const bool isXMLNSPrefix =
        (XMLString::compareIString(prefix, XMLUni::fgXMLNSString) == 0);
    if (isXMLNSPrefix)
        return fValidator->getXMLNSNamespaceId();

    const bool isXMLPrefix =
        (XMLString::compareIString(prefix, XMLUni::fgXMLString) == 0);
    if (isXMLPrefix)
        return fValidator->getXMLNamespaceId();

    // Everything else is resolved by searching up the element stack
    bool notMapped;
    const unsigned int mappedId = fElemStack.mapPrefixToURI(prefix, mode, notMapped);

    // An id comes back even for an unknown prefix, but it is still an error
    if (notMapped)
        emitError(XML4CErrs::UnknownPrefix, prefix);

    // Hand back the URI text that goes with the id
    fValidator->getURIText(mappedId, bufToFill);
    return mappedId;
}
//
//  Splits a QName at its first colon into prefix and local name, stores the
//  two parts in the passed buffers, and returns the URI id that the prefix
//  maps to.
//
//  Without a colon the whole QName goes into nameBuf, prefixBuf stays
//  empty, and the empty prefix is mapped through the element stack. No
//  error is emitted on this path.
//
//  With a colon, the prefixes 'xmlns' and 'xml' (compared without regard to
//  case) get the validator's special ids. Any other prefix is mapped through
//  the element stack, and an UnknownPrefix error is emitted if it turns out
//  to be unknown.
//
unsigned int
XMLScanner::resolveQName(   const   XMLCh* const        qName
                          ,         XMLBuffer&          nameBuf
                          ,         XMLBuffer&          prefixBuf
                          , const   ElemStack::MapModes mode)
{
    // Start from empty buffers in case one of them gets nothing
    nameBuf.reset();
    prefixBuf.reset();

    const int colonPos = XMLString::indexOf(qName, chColon);

    if (colonPos == -1)
    {
        //
        //  It's all name and no prefix. The (empty) prefix buffer is what
        //  gets mapped, since the empty string stands for the default
        //  namespace.
        //
        nameBuf.append(qName);

        bool unknown;
        const unsigned int defaultId = fElemStack.mapPrefixToURI
        (
            prefixBuf.getRawBuffer()
            , mode
            , unknown
        );

        #if defined(XML4C_DEBUG)
        if (unknown)
        {
            // <TBD> This one should never be unknown
        }
        #endif

        return defaultId;
    }

    // Chars before the colon are the prefix, the ones after it the name
    prefixBuf.append(qName, colonPos);
    nameBuf.append(&qName[colonPos + 1]);

    const XMLCh* const rawPrefix = prefixBuf.getRawBuffer();

    // The reserved prefixes are always mapped to their special URI ids
    if (XMLString::compareIString(rawPrefix, XMLUni::fgXMLNSString) == 0)
        return fValidator->getXMLNSNamespaceId();

    if (XMLString::compareIString(rawPrefix, XMLUni::fgXMLString) == 0)
        return fValidator->getXMLNamespaceId();

    // A regular prefix, so let the element stack resolve it
    bool unknown;
    const unsigned int mappedId = fElemStack.mapPrefixToURI
    (
        prefixBuf.getRawBuffer()
        , mode
        , unknown
    );
    if (unknown)
        emitError(XML4CErrs::UnknownPrefix, prefixBuf.getRawBuffer());

    return mappedId;
}
//
// This method will reset the scanner data structures, and related plugged
// in stuff, for a new scan session. We get the input source for the primary
// XML entity, create the reader for it, and push it on the stack so that
// upon successful return from here we are ready to go.
//
void XMLScanner::scanReset(const InputSource& src)
{
//
// This call implicitly tells us that we are going to reuse the scanner
// if it was previously used. So tell the validator to reset itself.
//
// But, if the fReuseValidator flag is set, then don't reset it.
//
// NOTE: The ReaderMgr is flushed on the way out, because that is
// required to insure that files are closed.
//
if (!fReuseValidator)
fValidator->reset();
//
// And for all installed handlers, send reset events. This gives them
// a chance to flush any cached data.
//
if (fDocHandler)
fDocHandler->resetDocument();
if (fEntityHandler)
fEntityHandler->resetEntities();
if (fErrorReporter)
fErrorReporter->resetErrors();
// Clear out the id reference list
fIDRefList->removeAll();
//
// Reset the element stack, and give it the latest ids for the special
// URIs it has to know about.
//
fElemStack.reset
(
fValidator->getEmptyNamespaceId()
, fValidator->getGlobalNamespaceId()
, fValidator->getUnknownNamespaceId()
, fValidator->getXMLNamespaceId()
, fValidator->getXMLNSNamespaceId()
);
// Reset some status flags
fInException = false;
fStandalone = false;
//
// Handle the creation of the XML reader object for this input source.
// This will provide us with transcoding and basic lexing services.
//
XMLReader* newReader = fReaderMgr.createReader
(
src
, true
, XMLReader::RefFrom_NonLiteral
, XMLReader::Type_General
, XMLReader::Source_External
);
if (!newReader)
ThrowXML1(RuntimeException, XML4CExcepts::Scan_CouldNotOpenSource, src.getSystemId());
// Push this read onto the reader manager
fReaderMgr.pushReader(newReader, 0);
//
// We know that the file is legal now, so lets get the base directory
// off of it and store it. If there is no directory component on the
// path, then this returns a null.
//
// We have to assume it could be a URL so we create a temporary URL
// and ask it for the path part of itself. That will insure that if its
// relative we really see it as a relative path.
//
URL tmpURL;
try
{
tmpURL.setURL(src.getSystemId());
//
// Its a valid URL so its assumed to be fully qualified. Get the
// base part of the path part of the URL.
//
fReaderMgr.setBasePath(XMLPlatformUtils::getBasePath(tmpURL.getPath()));
}
catch(const MalformedURLException&)
{
//
// Its not a URL, so assume its just a plain file path and could
// be partial, so get the complete path.
//
fReaderMgr.setBasePath(XMLPlatformUtils::getBasePath(src.getSystemId()));
}
}
//
//  Sends the character data collected in the passed buffer to the document
//  handler and then empties the buffer. An empty buffer is left alone.
//
//  When not validating, the data always goes out as plain characters. When
//  validating, the character data options of the element on top of the
//  element stack decide what happens:
//
//      NoCharData      A NoCharDataInCM error is emitted.
//      AllCharData     The data goes out as plain characters.
//      anything else   All-whitespace data goes out as ignorable whitespace
//                      if the option is SpacesOk; data that is not all
//                      whitespace causes a NoCharDataInCM error.
//
//  Nothing is sent if no document handler is installed, but the errors are
//  still emitted and the buffer is still emptied.
//
void XMLScanner::sendCharData(XMLBuffer& toSend)
{
    // If no data in the buffer, then nothing to do
    if (toSend.isEmpty())
        return;

    // The raw data needed for the callbacks
    const XMLCh* const chars = toSend.getRawBuffer();
    const unsigned int charCount = toSend.getLen();

    // Without validation it is always just character data
    if (!fDoValidation)
    {
        if (fDocHandler)
            fDocHandler->docCharacters(chars, charCount, false);

        toSend.reset();
        return;
    }

    // Find out if it's all whitespace, and what the current element accepts
    const bool allSpaces = XMLReader::isAllSpaces(chars, charCount);
    const ElemStack::StackElem* const curElem = fElemStack.topElement();
    const XMLElementDecl::CharDataOpts dataOpts = curElem->fThisElement->getCharDataOpts();

    if (dataOpts == XMLElementDecl::NoCharData)
    {
        // They definitely cannot handle any type of char data
        fValidator->emitError(XML4CValid::NoCharDataInCM);
    }
    else if (dataOpts == XMLElementDecl::AllCharData)
    {
        // Whitespace or not, they take it all as regular characters
        if (fDocHandler)
            fDocHandler->docCharacters(chars, charCount, false);
    }
    else if (allSpaces)
    {
        // Only whitespace is acceptable here, and that is what we have
        if (fDocHandler && (dataOpts == XMLElementDecl::SpacesOk))
            fDocHandler->ignorableWhitespace(chars, charCount, false);
    }
    else
    {
        // Real character data where at most whitespace is allowed
        fValidator->emitError(XML4CValid::NoCharDataInCM);
    }

    // The data has been dealt with, so empty the buffer
    toSend.reset();
}
//
//  Figures out what the next top level token in the input stream is and
//  returns an enumerated value for it, eating as many characters as needed
//  to decide.
//
//  Character data, entity references (reported as character data) and end
//  of input are detected by peeking only, so nothing is consumed for them.
//  For markup, the opening character and whatever introducer follows it
//  ('!--', '![CDATA', '?' or '/') are consumed; for a start tag only the
//  opening character is.
//
//  orgReader is set to the current reader number right after the opening
//  character was consumed. It is left untouched for the tokens that are
//  detected by peeking only.
//
XMLScanner::XMLTokens XMLScanner::senseNextToken(unsigned int& orgReader)
{
    // Peek at the next character to guess what kind of token is coming
    XMLCh peekedCh = fReaderMgr.peekNextChar();

    // Anything that is not one of the special chars is char data
    if (!XMLReader::isSpecialTokenSenseChar(peekedCh))
        return Token_CharData;

    // A null means the end of the input
    if (!peekedCh)
        return Token_EOF;

    // References only occur within character data, so report it as such
    if (peekedCh == chAmpersand)
        return Token_CharData;

    //
    //  What is left is taken to be a '<' character. Get it out of the
    //  reader and tell the caller in which reader we saw it.
    //
    fReaderMgr.getNextChar();
    orgReader = fReaderMgr.getCurrentReaderNum();

    // The character after it selects the kind of markup
    peekedCh = fReaderMgr.peekNextChar();

    if (peekedCh == chQuestion)
    {
        // It must be a PI
        fReaderMgr.getNextChar();
        return Token_PI;
    }

    if (peekedCh == chForwardSlash)
    {
        fReaderMgr.getNextChar();
        return Token_EndTag;
    }

    //
    //  Without a bang, assume an element name and return a start tag token.
    //  If that turns out to be wrong, it will fail later when no valid tag
    //  can be scanned.
    //
    if (peekedCh != chBang)
        return Token_StartTag;

    // With a bang it has to be either a CDATA section or a comment
    static const XMLCh cdataLead[] =
    {
        chBang, chOpenSquare, chLatin_C, chLatin_D, chLatin_A
        , chLatin_T, chLatin_A, chNull
    };

    static const XMLCh commentLead[] =
    {
        chBang, chDash, chDash, chNull
    };

    if (fReaderMgr.skippedString(cdataLead))
        return Token_CData;

    if (fReaderMgr.skippedString(commentLead))
        return Token_Comment;

    emitError(XML4CErrs::ExpectedCommentOrCDATA);
    return Token_Unknown;
}
//
//  Updates the namespace mappings at the current top of the element stack
//  from an xmlns="xxx" or xmlns:xxx="yyy" attribute.
//
//  attrName    The attribute's QName. The caller guarantees that it has one
//              of the two forms above, so that is not checked here.
//  attrValue   The raw attribute value, i.e. the namespace URI. It is
//              normalized as CDATA before being used.
//
//  The part of the name after the colon is the prefix being declared; with
//  no colon the empty (default) prefix is declared. The validator is asked
//  to find or add the id of the normalized URI, and the element stack then
//  records the prefix to id mapping.
//
void XMLScanner::updateNSMap(const  XMLCh* const    attrName
                            , const XMLCh* const    attrValue)
{
    // We need a buffer to normalize the attribute value into
    XMLBufBid bbNormal(&fBufMgr);
    XMLBuffer& normalBuf = bbNormal.getBuffer();

    //
    //  We either have the default prefix (""), or we point into the attr
    //  name parameter. Note that 'xmlns' is not the prefix we care about
    //  here. To us, the prefix is the local part of the attribute name.
    //
    //  The offset is kept as a signed int, which is what the -1 'not found'
    //  test below needs.
    //
    const XMLCh* prefPtr = XMLUni::fgZeroLenString;
    const int colonOfs = XMLString::indexOf(attrName, chColon);
    if (colonOfs != -1)
        prefPtr = &attrName[colonOfs + 1];

    //
    //  Normalize the value. The return value is of no interest here; if
    //  something was wrong, an error was issued, which is all we care
    //  about.
    //
    normalizeAttValue(attrValue, XMLAttDef::CData, normalBuf);

    //
    //  Get the unique id for the attribute value, which is the URI that the
    //  prefix is to be mapped to, by asking the validator to find or add
    //  it. Then have the element stack add the prefix to URI id mapping.
    //
    fElemStack.addPrefix
    (
        prefPtr
        , fValidator->addOrFindNSId(normalBuf.getRawBuffer())
    );
}
// ---------------------------------------------------------------------------
// XMLScanner: Private parsing methods
// ---------------------------------------------------------------------------
//
//  Scans a single or double quoted string into the passed buffer, without
//  the quotes. The contents are not judged in any way; the string simply
//  ends at the next quote of the kind that opened it.
//
//  Returns false if the next character is not a quote, or if a null
//  character shows up before the closing quote. In the latter case the
//  buffer holds what was scanned up to that point. The higher level code
//  that called us is expected to deal with such a failure more gracefully.
//
//  NOTE: This is for simple stuff like the strings in the XMLDecl, which
//  cannot have any entities inside them. So no end of entity handling is
//  done here.
//
bool XMLScanner::getQuotedString(XMLBuffer& toFill)
{
    // Start out with an empty target buffer
    toFill.reset();

    // The next char must be a quote, which we remember
    XMLCh openingQuote;
    if (!fReaderMgr.skipIfQuote(openingQuote))
        return false;

    // Collect chars until the same kind of quote shows up again
    for (XMLCh curCh = fReaderMgr.getNextChar();
         curCh != openingQuote;
         curCh = fReaderMgr.getNextChar())
    {
        // A null char before the closing quote means failure
        if (!curCh)
            return false;

        toFill.append(curCh);
    }
    return true;
}
//
//  Does a raw scan of a quoted attribute value into the passed buffer. No
//  normalization is done (the attribute type is not known yet); the value
//  is just scanned, with entity references being expanded on the way.
//
//  A character that scanEntityRef() flags as escaped is stored with a
//  0xFFFF marker in front of it, so that later validation and
//  normalization can tell that it came from an escape.
//
//  The closing quote only counts when it is seen in the reader that the
//  opening quote came from. Quotes inside nested entities are ordinary
//  content. A matching quote seen in a reader with a lower number than the
//  starting one means the value spilled over into a previous entity, which
//  is reported as PartialMarkupInEntity.
//
//  End of entity exceptions are simply eaten and the scan goes on until the
//  closing quote. If an unterminated value makes us run through subsequent
//  entities, that will cause errors back in the calling code, but there is
//  little that can be done about it here.
//
//  Returns true once the closing quote was found. Returns false if the
//  value does not start with a quote, or on the spill over described above.
//  A null char from the reader manager raises an UnexpectedEOFException.
//
//  NOTE(review): the isXMLChar() check is only applied to leading
//  surrogates, which looks unintended - confirm before relying on it.
//
bool XMLScanner::basicAttrValueScan(XMLBuffer& toFill)
{
    // Start out with an empty target buffer
    toFill.reset();

    // The next char must be a single or double quote
    XMLCh quoteCh;
    if (!fReaderMgr.skipIfQuote(quoteCh))
        return false;

    // Only a closing quote from this same reader ends the value
    const unsigned int startReader = fReaderMgr.getCurrentReaderNum();

    //
    //  Loop until the value is complete. The loop is doubled up so that the
    //  exception handler is not set up and torn down on every character.
    //
    XMLCh   curCh;
    XMLCh   pendingCh = 0;
    bool    needTrailing = false;
    bool    escaped;
    for (;;)
    {
        try
        {
            for (;;)
            {
                // Use the char left over by an entity ref, if there is one
                if (pendingCh)
                {
                    curCh = pendingCh;
                    pendingCh = 0;
                }
                else
                {
                    curCh = fReaderMgr.getNextChar();
                }

                if (!curCh)
                    ThrowXML(UnexpectedEOFException, XML4CExcepts::Gen_UnexpectedEOF);

                //
                //  Check for the ending quote. It has to be in the entity
                //  where we started; quotes in nested entities fall through
                //  and are stored like any other character.
                //
                if (curCh == quoteCh)
                {
                    if (startReader == fReaderMgr.getCurrentReaderNum())
                        return true;

                    // Watch for spillover into a previous entity
                    if (startReader > fReaderMgr.getCurrentReaderNum())
                    {
                        emitError(XML4CErrs::PartialMarkupInEntity);
                        return false;
                    }
                }

                //
                //  Check for an entity ref. Unless the expansion came
                //  straight back to us, there is nothing to store on this
                //  round.
                //
                escaped = false;
                if (curCh == chAmpersand)
                {
                    if (scanEntityRef(true, curCh, pendingCh, escaped) != EntityExp_Returned)
                    {
                        needTrailing = false;
                        continue;
                    }
                }

                // Keep the surrogate pair bookkeeping up to date
                const bool isLeading = (curCh >= 0xD800) && (curCh <= 0xDBFF);
                const bool isTrailing = (curCh >= 0xDC00) && (curCh <= 0xDFFF);
                if (isLeading)
                {
                    //
                    //  Two leading surrogates in a row are an error, and we
                    //  go on waiting for a trailing one. Otherwise start
                    //  waiting for the trailing one now.
                    //
                    if (needTrailing)
                    {
                        emitError(XML4CErrs::Expected2ndSurrogateChar);
                    }
                    else
                    {
                        // It has to at least be a valid XML character
                        if (!XMLReader::isXMLChar(curCh))
                            emitError(XML4CErrs::InvalidCharacter);
                        needTrailing = true;
                    }
                }
                else if (isTrailing)
                {
                    // A trailing surrogate must follow a leading one
                    if (!needTrailing)
                        emitError(XML4CErrs::Unexpected2ndSurrogateChar);
                    needTrailing = false;
                }
                else
                {
                    // A regular char must not show up in place of one
                    if (needTrailing)
                        emitError(XML4CErrs::Expected2ndSurrogateChar);
                    needTrailing = false;
                }

                // Escaped chars get the marker in front of them
                if (escaped)
                    toFill.append(0xFFFF);

                toFill.append(curCh);
            }
        }

        catch(const EndOfEntityException&)
        {
            // Just eat it and carry on with the enclosing entity
            needTrailing = false;
            escaped = false;
        }
    }
    return true;
}
//
// Scans a quoted attribute value into toFill, normalizing whitespace
// according to the attribute type as it goes.
//
// toFill is reset first and then receives the normalized value, without
// the surrounding quotes. type selects the normalization: for CData,
// unescaped tab, LF and CR characters are each replaced by a space; for
// every other type, runs of whitespace are collapsed to a single space and
// leading and trailing whitespace is dropped.
//
// Returns false if skipIfQuote() does not find an opening quote, or if a
// matching quote turns up while the current reader number is lower than
// the one the value started in (PartialMarkupInEntity is emitted in that
// case). Returns true once the closing quote is found in the starting
// reader. Throws UnexpectedEOFException if a zero character is read first.
//
bool XMLScanner::scanAttValue( XMLBuffer& toFill
, const XMLAttDef::AttTypes type)
{
// States for the non-CDATA whitespace normalization below
enum States
{
InWhitespace
, InContent
};
// Reset the target buffer
toFill.reset();
// Get the next char which must be a single or double quote
XMLCh quoteCh;
if (!fReaderMgr.skipIfQuote(quoteCh))
return false;
//
// We have to get the current reader because we have to ignore closing
// quotes until we hit the same reader again.
//
const unsigned int curReader = fReaderMgr.getCurrentReaderNum();
//
// Loop until we get the attribute value. Note that we use a double
// loop here to avoid the setup/teardown overhead of the exception
// handler on every round.
//
XMLCh nextCh;
// Second character handed back by scanEntityRef(), consumed on the next round
XMLCh secondCh = 0;
States curState = InContent;
// Set once a non-whitespace char has been seen (non-CDATA normalization only)
bool firstNonWS = false;
bool gotLeadingSurrogate = false;
// Assigned on every round before it is read
bool escaped;
while (true)
{
try
{
while(true)
{
// Get another char. Use second char if one is waiting
if (secondCh)
{
nextCh = secondCh;
secondCh = 0;
}
else
{
nextCh = fReaderMgr.getNextChar();
}
// A zero character is treated as an unexpected end of input
if (!nextCh)
ThrowXML(UnexpectedEOFException, XML4CExcepts::Gen_UnexpectedEOF);
// It has to at least be a valid XML character
if (!XMLReader::isXMLChar(nextCh))
emitError(XML4CErrs::InvalidCharacter);
// Check for our ending quote in the same entity
if (nextCh == quoteCh)
{
if (curReader == fReaderMgr.getCurrentReaderNum())
return true;
// Watch for spillover into a previous entity
if (curReader > fReaderMgr.getCurrentReaderNum())
{
emitError(XML4CErrs::PartialMarkupInEntity);
return false;
}
// Otherwise the quote came from a later reader, so it is plain content
}
//
// Check for an entity ref now, before we let it affect our
// whitespace normalization logic below. We ignore the empty flag
// in this one.
//
escaped = false;
if (nextCh == chAmpersand)
{
//
// If nothing was returned directly (anything other than
// EntityExp_Returned), go around and read the next character.
// Otherwise nextCh, secondCh and escaped were filled in by the call.
//
if (scanEntityRef(true, nextCh, secondCh, escaped) != EntityExp_Returned)
{
gotLeadingSurrogate = false;
continue;
}
}
// Deal with surrogate pairs
if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF))
{
//
// It's a leading surrogate. If we already got one, then
// issue an error, else set leading flag to make sure that
// we look for a trailing next time.
//
if (gotLeadingSurrogate)
emitError(XML4CErrs::Expected2ndSurrogateChar);
else
gotLeadingSurrogate = true;
}
else
{
//
// If it's a trailing surrogate, make sure that we are
// prepared for that. Else, it's just a regular char so make
// sure that we were not expecting a trailing surrogate.
//
if ((nextCh >= 0xDC00) && (nextCh <= 0xDFFF))
{
// It's trailing, so make sure we were expecting it
if (!gotLeadingSurrogate)
emitError(XML4CErrs::Unexpected2ndSurrogateChar);
}
else
{
//
// It's just a char, so make sure we were not expecting a
// trailing surrogate.
//
if (gotLeadingSurrogate)
emitError(XML4CErrs::Expected2ndSurrogateChar);
}
gotLeadingSurrogate = false;
}
//
// If it's not escaped, then make sure it's not a < character, which
// is not allowed in attribute values.
//
if (!escaped && (nextCh == chOpenAngle))
emitError(XML4CErrs::BracketInAttrValue);
//
// If the attribute is a CDATA type we do simple replacement of
// tabs and new lines with spaces, if the character is not escaped
// by way of a char ref.
//
// Otherwise, we do the standard non-CDATA normalization of
// compressing whitespace to single spaces and getting rid of leading
// and trailing whitespace.
//
if (type == XMLAttDef::CData)
{
if (!escaped)
{
if ((nextCh == 0x09) || (nextCh == 0x0A) || (nextCh == 0x0D))
nextCh = chSpace;
}
}
else
{
if (curState == InWhitespace)
{
if (!XMLReader::isWhitespace(nextCh))
{
//
// End of a whitespace run. Emit the single separating space
// only if some content came before it, which is what drops
// leading whitespace.
//
if (firstNonWS)
toFill.append(chSpace);
curState = InContent;
firstNonWS = true;
}
else
{
// Still inside the whitespace run, so drop this char
continue;
}
}
else if (curState == InContent)
{
if (XMLReader::isWhitespace(nextCh))
{
//
// Start of a whitespace run. Nothing is appended yet; the
// space is only added if more content follows, which is
// what drops trailing whitespace.
//
curState = InWhitespace;
continue;
}
firstNonWS = true;
}
}
// Else add it to the buffer
toFill.append(nextCh);
}
}
catch(const EndOfEntityException&)
{
//
// Just eat it and continue. The surrogate and escape state are
// cleared and scanning resumes via the outer loop.
//
gotLeadingSurrogate = false;
escaped = false;
}
}
// Not reached: the loops above are only left by a return or an exception
return true;
}
//
// This method scans a CDATA section. It collects the character into one
// of the temp buffers and calls the document handler, if any, with the
// characters. It assumes that the <![CDATA string has been scanned before
// this call.
//
void XMLScanner::scanCDSection()
{
//
// This is the CDATA section opening sequence, minus the '<' character.
// We use this to watch for nested CDATA sections, which are illegal.
//
static const XMLCh CDataPrefix[] =
{
chBang, chOpenSquare, chLatin_C, chLatin_D, chLatin_A
, chLatin_T, chLatin_A, chOpenSquare, chNull
};
static const XMLCh CDataClose[] =
{
chCloseSquare, chCloseAngle, chNull
};
//
// The next character should be the opening square bracket. If not
// issue an error, but then try to recover by skipping any whitespace
// and checking again.
//
if (!fReaderMgr.skippedChar(chOpenSquare))
{
emitError(XML4CErrs::ExpectedOpenSquareBracket);
fReaderMgr.skipPastSpaces();
// If we still don't find it, then give up, else keep going
if (!fReaderMgr.skippedChar(chOpenSquare))
return;
}
// Get a buffer for this
XMLBufBid bbCData(&fBufMgr);
//
// We just scan forward until we hit the end of CDATA section sequence.
// CDATA is effectively a big escape mechanism so we don't treat markup
// characters specially here.
//
bool emittedError = false;
unsigned int nestCount = 0;
while (true)
{
const XMLCh nextCh = fReaderMgr.getNextChar();
// Watch for unexpected end of file
if (!nextCh)
{
emitError(XML4CErrs::UnterminatedCDATASection);
ThrowXML(UnexpectedEOFException, XML4CExcepts::Gen_UnexpectedEOF);
}
//
// If this is a close square bracket it could be our closing
// sequence. Be sure though to handle nested CDATA sections. They
// are illegal, but we've already issued the error and just need to
// be sure not to fall out early.
//
if (nextCh == chCloseSquare)
{
if (fReaderMgr.skippedString(CDataClose))
{
if (nestCount)
{
// We are nested, so just bump down the counter
nestCount--;
}
else
{
// If we have a doc handler, call it
if (fDocHandler)
{
fDocHandler->docCharacters
(
bbCData.getRawBuffer()
, bbCData.getLen()
, true
);
}
// And we are done
break;
}
}
}
else if (nextCh == chOpenAngle)
{
//
// Watch for nested CDATA sections. We got the '<' character,
// so now we check for the remainder of the ![CDATA[ sequence.
// If its there, we bump a counter and issue an error.
//
if (fReaderMgr.skippedString(CDataPrefix))
{
// Bump up the nesting count
nestCount++;
// And issue the nested CDATA error
emitError(XML4CErrs::NestedCDATA);
}
}
//
// Make sure its a valid character. But if we've emitted an error
// already, don't both with the overhead since we've already told
// them about it.
//
if (!emittedError)
{
if (!XMLReader::isXMLChar(nextCh))
{
emitError(XML4CErrs::InvalidCharacter);
emittedError = true;
}
}
// Add it to the buffer
bbCData.append(nextCh);
}
}
//
// Scans character data, accumulating it in toUse and passing it on via
// sendCharData(). Scanning stops when getCharData() returns an open angle
// bracket or a zero character; whatever is still in the buffer is then
// sent with a final sendCharData() call.
//
// Along the way this watches for the illegal ]]> sequence, checks surrogate
// pairing and character validity for the characters handled here, expands
// references via scanEntityRef(), and reports the end of an entity to the
// document handler (if any) when an EndOfEntityException is caught.
//
// toUse is reset on entry and serves as the accumulation buffer.
//
void XMLScanner::scanCharData(XMLBuffer& toUse)
{
//
// We have to watch for the ]]> sequence, which is illegal in
// character data. So this is a little state machine that handles that.
//
enum States
{
State_Waiting
, State_GotOne
, State_GotTwo
};
// Reset the buffer before we start
toUse.reset();
// Turn on the 'throw at end' flag of the reader manager
ThrowEOEJanitor jan(&fReaderMgr, true);
//
// In order to be more efficient we have to use kind of a deeply nested
// set of blocks here. The outer block puts on a try and catches end of
// entity exceptions. The inner loop is the per-character loop. If we
// put the try inside the inner loop, it would work but would require
// the exception handling code setup/teardown code to be invoked for
// each character.
//
XMLCh nextCh;
// Second character handed back by scanEntityRef(), consumed on the next round
XMLCh secondCh = 0;
States curState = State_Waiting;
bool escaped = false;
bool gotLeadingSurrogate = false;
bool notDone = true;
while (notDone)
{
try
{
while (true)
{
//
// Ok, let's get char data from the readers until we hit
// a special char. It returns the char that made us break out
// of the loop. Use any second char first.
//
if (secondCh)
{
nextCh = secondCh;
secondCh = 0;
}
else
{
//
// Get another block of chars. The surrogate flag is passed along
// to getCharData() rather than being reset here.
//
nextCh = fReaderMgr.getCharData(toUse, *this, gotLeadingSurrogate);
//
// If we get an EOF, that's not really expected here, but it
// could happen if there was effectively no content, so just
// return.
//
// Also, if we get the open angle of some markup, then break
// out.
//
if ((nextCh == chOpenAngle) || !nextCh)
{
// If we were waiting for a trailing surrogate, it's an error
if (gotLeadingSurrogate)
emitError(XML4CErrs::Expected2ndSurrogateChar);
// Clearing notDone ends the outer loop once we break out of this one
notDone = false;
break;
}
// Get the break char out of the input buffer now
fReaderMgr.getNextChar();
}
//
// Watch for a reference. Note that the escapement mechanism
// is ignored in this content.
//
if (nextCh == chAmpersand)
{
// Flush what we have so far before the reference is processed
sendCharData(toUse);
//
// Turn off the throwing at the end of entity during this. This
// janitor shadows the outer one and only lives to the end of
// this block; presumably it restores the prior setting when it
// goes out of scope.
//
ThrowEOEJanitor jan(&fReaderMgr, false);
if (scanEntityRef(false, nextCh, secondCh, escaped) != EntityExp_Returned)
{
// Nothing was returned directly, so go around for more input
gotLeadingSurrogate = false;
continue;
}
}
else
{
escaped = false;
}
// Keep the state machine up to date
if (!escaped)
{
if (nextCh == chCloseSquare)
{
// A third or later ']' leaves us in State_GotTwo
if (curState == State_Waiting)
curState = State_GotOne;
else if (curState == State_GotOne)
curState = State_GotTwo;
}
else if (nextCh == chCloseAngle)
{
// A '>' right after two or more ']' completes the illegal sequence
if (curState == State_GotTwo)
emitError(XML4CErrs::BadSequenceInCharData);
curState = State_Waiting;
}
else
{
curState = State_Waiting;
}
}
else
{
// An escaped character never counts toward the sequence
curState = State_Waiting;
}
// Deal with surrogate pairs
if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF))
{
//
// It's a leading surrogate. If we already got one, then
// issue an error, else set leading flag to make sure that
// we look for a trailing next time.
//
if (gotLeadingSurrogate)
emitError(XML4CErrs::Expected2ndSurrogateChar);
else
gotLeadingSurrogate = true;
}
else
{
//
// If it's a trailing surrogate, make sure that we are
// prepared for that. Else, it's just a regular char so make
// sure that we were not expecting a trailing surrogate.
//
if ((nextCh >= 0xDC00) && (nextCh <= 0xDFFF))
{
// It's trailing, so make sure we were expecting it
if (!gotLeadingSurrogate)
emitError(XML4CErrs::Unexpected2ndSurrogateChar);
}
else
{
//
// It's just a char, so make sure we were not expecting a
// trailing surrogate.
//
if (gotLeadingSurrogate)
emitError(XML4CErrs::Expected2ndSurrogateChar);
// Make sure the returned char is a valid XML char
if (!XMLReader::isXMLChar(nextCh))
emitError(XML4CErrs::InvalidCharacter);
}
gotLeadingSurrogate = false;
}
// Add this char to the buffer
toUse.append(nextCh);
}
}
catch(const EndOfEntityException& toCatch)
{
//
// Some entity ended, so we have to send any accumulated
// chars and send an end of entity event. notDone is still
// true here, so the outer loop resumes scanning afterwards.
//
sendCharData(toUse);
gotLeadingSurrogate = false;
if (fDocHandler)
fDocHandler->endEntityReference(toCatch.getEntity());
}
}
// Send any char data that we accumulated into the buffer
sendCharData(toUse);
}
//
//  Scans a numeric character reference and returns the character it names.
//  The caller must already have consumed the leading '&#'.
//
//  toFill   On success, the referenced character, or the leading surrogate
//           when the value is 0x10000 or above. Zero on failure.
//  second   On success, the trailing surrogate for values of 0x10000 or
//           above, otherwise zero. Zero on failure.
//
//  Returns true if a terminated reference with a value of at most 0x10FFFF
//  was scanned. Returns false, after emitting an error, if a character that
//  is not a digit turns up before the semicolon (that character is left
//  unread), or if the value is larger than 0x10FFFF. Throws
//  UnexpectedEOFException if a zero character is seen inside the reference.
//
bool XMLScanner::scanCharRef(XMLCh& toFill, XMLCh& second)
{
    //
    //  The largest value that can be split into a surrogate pair. Anything
    //  above it cannot be represented, so it must not reach the conversion
    //  at the bottom of this method.
    //
    const unsigned int maxCodePoint = 0x10FFFF;

    bool gotOne = false;
    bool outOfRange = false;
    unsigned int value = 0;

    //
    //  Set the radix. It is supposed to be a lower case x if hex. But, in
    //  order to recover well, we check for an upper case one and put out
    //  an error for that.
    //
    unsigned int radix = 10;
    if (fReaderMgr.skippedChar(chLatin_x))
    {
        radix = 16;
    }
    else if (fReaderMgr.skippedChar(chLatin_X))
    {
        emitError(XML4CErrs::HexRadixMustBeLowerCase);
        radix = 16;
    }

    while (true)
    {
        const XMLCh nextCh = fReaderMgr.peekNextChar();

        // Watch for EOF
        if (!nextCh)
            ThrowXML(UnexpectedEOFException, XML4CExcepts::Gen_UnexpectedEOF);

        // Break out on the terminating semicolon
        if (nextCh == chSemiColon)
        {
            fReaderMgr.getNextChar();
            break;
        }

        //
        //  Convert this char to a binary value, or bail out if it is not
        //  a digit in any radix we support.
        //
        unsigned int nextVal;
        if ((nextCh >= chDigit_0) && (nextCh <= chDigit_9))
            nextVal = (unsigned int)(nextCh - chDigit_0);
        else if ((nextCh >= chLatin_A) && (nextCh <= chLatin_F))
            nextVal= (unsigned int)(10 + (nextCh - chLatin_A));
        else if ((nextCh >= chLatin_a) && (nextCh <= chLatin_f))
            nextVal = (unsigned int)(10 + (nextCh - chLatin_a));
        else
        {
            // Return zeros
            toFill = 0;
            second = 0;

            //
            //  If we got at least a digit, then do an unterminated ref
            //  error. Else, do an expected a numerical ref thing.
            //
            if (gotOne)
                emitError(XML4CErrs::UnterminatedCharRef);
            else
                emitError(XML4CErrs::ExpectedNumericalCharRef);

            // Return failure
            return false;
        }

        //
        //  Make sure it is valid for the radix. If not, then just eat the
        //  digit and go on after issuing an error. Else, update the
        //  running value with this new digit.
        //
        if (nextVal >= radix)
        {
            XMLCh tmpStr[2];
            tmpStr[0] = nextCh;
            tmpStr[1] = chNull;
            emitError(XML4CErrs::BadDigitForRadix, tmpStr);
        }
        else if (!outOfRange)
        {
            //
            //  The running value is at most maxCodePoint here, so this
            //  cannot wrap around. Once it goes past the limit we stop
            //  accumulating but keep eating digits, so that the whole
            //  reference is consumed before the error is reported.
            //
            value = (value * radix) + nextVal;
            if (value > maxCodePoint)
                outOfRange = true;
        }

        // Indicate that we got at least one digit
        gotOne = true;

        // And eat the last char
        fReaderMgr.getNextChar();
    }

    //
    //  A value beyond the last code point names no character at all.
    //  Report it rather than returning a meaningless pair of code units.
    //
    if (outOfRange)
    {
        toFill = 0;
        second = 0;
        emitError(XML4CErrs::InvalidCharacter);
        return false;
    }

    // Return the char (or chars)
    if (value >= 0x10000)
    {
        value -= 0x10000;
        toFill = XMLCh((value >> 10) + 0xD800);
        second = XMLCh((value & 0x3FF) + 0xDC00);
    }
    else
    {
        toFill = XMLCh(value);
        second = 0;
    }
    return true;
}
//
// We get here after the '<!--' part of the comment. We scan past the
// terminating '-->' It will calls the appropriate handler with the comment
// text, if one is provided. A comment can be in either the document or
// the DTD, so the fInDocument flag is used to know which handler to send
// it to.
//
void XMLScanner::scanComment()
{
enum States
{
InText
, OneDash
, TwoDashes
};
// Get a buffer for this
XMLBufBid bbComment(&fBufMgr);
//
// Get the comment text into a temp buffer. Be sure to use temp buffer
// two here, since its to be used for stuff that is potentially longer
// than just a name.
//
States curState = InText;
while (true)
{
// Get the next character
const XMLCh nextCh = fReaderMgr.getNextChar();
// Watch for an end of file
if (!nextCh)
{
emitError(XML4CErrs::UnterminatedComment);
ThrowXML(UnexpectedEOFException, XML4CExcepts::Gen_UnexpectedEOF);
}
// Make sure its a valid XML character
if (!XMLReader::isXMLChar(nextCh))
emitError(XML4CErrs::InvalidCharacter);
if (curState == InText)
{
// If its a dash, go to OneDash state. Otherwise take as text
if (nextCh == chDash)
curState = OneDash;
else
bbComment.append(nextCh);
}
else if (curState == OneDash)
{
//
// If its another dash, then we change to the two dashes states.
// Otherwise, we have to put in the deficit dash and the new
// character and go back to InText.
//
if (nextCh == chDash)
{
curState = TwoDashes;
}
else
{
bbComment.append(chDash);
bbComment.append(nextCh);
curState = InText;
}
}
else if (curState == TwoDashes)
{
// The next character must be the closing bracket
if (nextCh != chCloseAngle)
{
emitError(XML4CErrs::IllegalSequenceInComment);
fReaderMgr.skipPastChar(chCloseAngle);
return;
}
break;
}
}
// If we have an available handler, call back with the comment.
if (fDocHandler)
{
fDocHandler->docComment
(
bbComment.getRawBuffer()
);
}
}
//
//  Scans an equal sign together with any whitespace on either side of it,
//  which keeps the calling code free of whitespace handling.
//
//  Returns true if the equal sign was found and consumed. Otherwise it
//  returns false, in which case only the leading whitespace was skipped.
//
bool XMLScanner::scanEq()
{
    fReaderMgr.skipPastSpaces();

    // Without the equal sign there is nothing more to do
    if (!fReaderMgr.skippedChar(chEqual))
        return false;

    fReaderMgr.skipPastSpaces();
    return true;
}
//
//  Scans a reference that follows an already consumed '&'. A character
//  reference, or a reference to one of the special character entities, is
//  expanded on the spot and returned through the character parameters.
//  Any other general entity gets a reader pushed onto the reader manager
//  so that its content becomes the following input.
//
//  inAttVal   True when called from within an attribute value, where
//             references to external entities are reported as errors.
//  firstCh    Receives the character when one is returned directly.
//  secondCh   Always zeroed on entry; set by scanCharRef() when a character
//             reference is scanned.
//  escaped    Set to true when the returned character came from a
//             character reference or a special character entity. It only
//             means something when EntityExp_Returned comes back.
//
//  Returns EntityExp_Returned when the character parameters hold the
//  value, EntityExp_Pushed when the entity was set up as a reader, and
//  EntityExp_Failed when the reference could not be processed.
//
XMLScanner::EntityExpRes
XMLScanner::scanEntityRef(  const   bool    inAttVal
                            ,       XMLCh&  firstCh
                            ,       XMLCh&  secondCh
                            ,       bool&   escaped)
{
    // Start out assuming nothing escaped and no second character
    secondCh = 0;
    escaped = false;

    // Remember where we started, since the reference may not span entities
    const unsigned int startReader = fReaderMgr.getCurrentReaderNum();

    //
    //  A pound sign means a character reference, which is always expanded
    //  directly into the numeric value it stands for.
    //
    if (fReaderMgr.skippedChar(chPound))
    {
        if (!scanCharRef(firstCh, secondCh))
            return EntityExp_Failed;

        escaped = true;
        if (startReader != fReaderMgr.getCurrentReaderNum())
            emitError(XML4CErrs::PartialMarkupInEntity);
        return EntityExp_Returned;
    }

    // Otherwise it is a named reference, so get the name
    XMLBufBid bbName(&fBufMgr);
    if (!fReaderMgr.getName(bbName.getBuffer()))
    {
        emitError(XML4CErrs::ExpectedEntityRefName);
        return EntityExp_Failed;
    }

    // A missing semicolon is reported, but we carry on regardless
    if (!fReaderMgr.skippedChar(chSemiColon))
        emitError(XML4CErrs::UnterminatedEntityRef);

    // The whole reference must sit in the reader that held the '&'
    if (startReader != fReaderMgr.getCurrentReaderNum())
        emitError(XML4CErrs::PartialMarkupInEntity);

    // Find the declaration in the general entity pool; it has to exist
    XMLEntityDecl* decl = fValidator->findEntityDecl(bbName.getRawBuffer(), false);
    if (!decl)
    {
        emitError(XML4CErrs::EntityNotFound, bbName.getRawBuffer());
        return EntityExp_Failed;
    }

    //
    //  In a standalone document the entity must have been declared in the
    //  internal subset. Report it if not, but keep going.
    //
    if (fStandalone && !decl->getDeclaredInIntSubset())
        emitError(XML4CErrs::IllegalRefInStandalone, bbName.getRawBuffer());

    // Internal entities are dealt with first
    if (!decl->isExternal())
    {
        //
        //  The special character entities are returned as a single
        //  character, which counts as escaped.
        //
        if (decl->getIsSpecialChar())
        {
            firstCh = decl->getValue()[0];
            escaped = true;
            return EntityExp_Returned;
        }

        //
        //  Create a reader over a memory stream over the entity value.
        //  We force it to assume UTF-16 by passing in an encoding
        //  string, so that it will not bother trying to predecode the
        //  first line looking for an XML/TextDecl.
        //
        XMLReader* valueReader = fReaderMgr.createIntEntReader
        (
            decl->getName()
            , XMLReader::RefFrom_NonLiteral
            , XMLReader::Type_General
            , decl->getValue()
            , decl->getValueLen()
            , false
        );

        //
        //  Push the reader so that it supplies the following input. A
        //  failed push means the entity is recursive. The reader will
        //  have just been discarded; we report it and still keep going.
        //
        if (!fReaderMgr.pushReader(valueReader, decl))
            emitError(XML4CErrs::RecursiveEntity, decl->getName());

        // Tell the handler an entity reference is starting
        if (fDocHandler)
            fDocHandler->startEntityReference(*decl);

        return EntityExp_Pushed;
    }

    // From here on it is an external entity. Unparsed ones are not valid here
    if (decl->isUnparsed())
    {
        emitError(XML4CErrs::NoUnparsedEntityRefs, bbName.getRawBuffer());
        return EntityExp_Failed;
    }

    // Not legal inside an attribute value either, but keep going
    if (inAttVal)
        emitError(XML4CErrs::NoExtRefsInAttValue);

    // Create a reader for the external entity
    InputSource* srcUsed;
    XMLReader* reader = fReaderMgr.createReader
    (
        decl->getSystemId()
        , decl->getPublicId()
        , false
        , XMLReader::RefFrom_NonLiteral
        , XMLReader::Type_General
        , XMLReader::Source_External
        , srcUsed
    );

    // The janitor makes sure the source gets cleaned up on the way out
    Janitor<InputSource> janSrc(srcUsed);

    // No reader means the entity could not be opened
    if (!reader)
    {
        ThrowXML1(RuntimeException, XML4CExcepts::Gen_CouldNotOpenExtEntity, srcUsed->getSystemId());
    }

    // A failed push means a recursive expansion, which is a failure here
    if (!fReaderMgr.pushReader(reader, decl))
    {
        emitError(XML4CErrs::RecursiveEntity, decl->getName());
        return EntityExp_Failed;
    }

    // Tell the handler an entity reference is starting
    if (fDocHandler)
        fDocHandler->startEntityReference(*decl);

    // If the entity begins with the XML decl string, scan it as a text decl
    if (fReaderMgr.skippedString(XMLUni::fgXMLDeclString))
        scanXMLDecl(Decl_Text);

    return EntityExp_Pushed;
}
//
//  Scans an id introduced by either the system id keyword
//  (XMLUni::fgSysIDString) or the public id keyword
//  (XMLUni::fgPubIDString).
//
//  pubIdToFill   Reset on entry; receives the public literal, if any.
//  sysIdToFill   Reset on entry; receives the system literal, if any.
//  whatKind      IDType_Public asks for just a public id. IDType_External
//                requires a system literal after a public one. For any
//                other kind the system literal after a public one is
//                optional.
//
//  Returns true if an id of the requested kind was scanned. Returns false
//  without emitting an error when neither keyword is present, and false
//  after emitting an error when the id is malformed.
//
bool XMLScanner::scanId(        XMLBuffer&  pubIdToFill
                        ,       XMLBuffer&  sysIdToFill
                        , const IDTypes     whatKind)
{
    // Both outputs start out empty
    pubIdToFill.reset();
    sysIdToFill.reset();

    const bool publicOnly = (whatKind == IDType_Public);

    // Try the system id keyword first
    if (fReaderMgr.skippedString(XMLUni::fgSysIDString))
    {
        // A system id is no good to someone who asked for a public one
        if (publicOnly)
        {
            emitError(XML4CErrs::ExpectedPublicId);
            return false;
        }

        // Whitespace is required before the literal
        if (!fReaderMgr.skipPastSpaces())
        {
            emitError(XML4CErrs::ExpectedWhitespace);
            return false;
        }

        // The outcome is whatever the system literal scan says
        return scanSystemLiteral(sysIdToFill);
    }

    // With no public id keyword either, there is simply no id here
    if (!fReaderMgr.skippedString(XMLUni::fgPubIDString))
        return false;

    //
    //  Now we expect whitespace, a public literal, whitespace and a system
    //  literal. If the first whitespace is missing, report it, but carry
    //  on when a quote is next, since then only the space was forgotten.
    //
    if (!fReaderMgr.skipPastSpaces())
    {
        emitError(XML4CErrs::ExpectedWhitespace);

        const XMLCh peeked = fReaderMgr.peekNextChar();
        const bool quoteFollows = (peeked == chDoubleQuote)
                                  || (peeked == chSingleQuote);
        if (!quoteFollows)
            return false;
    }

    if (!scanPublicLiteral(pubIdToFill))
    {
        emitError(XML4CErrs::ExpectedPublicId);
        return false;
    }

    // For a public id request, that is everything
    if (publicOnly)
        return true;

    // Otherwise move on to the system literal
    if (!fReaderMgr.skipPastSpaces())
    {
        //
        //  To recover as well as possible, find out whether the whitespace
        //  is missing in front of a quote or in front of something else.
        //
        const XMLCh peeked = fReaderMgr.peekNextChar();
        const bool quoteFollows = (peeked == chDoubleQuote)
                                  || (peeked == chSingleQuote);
        const bool sysIdRequired = (whatKind == IDType_External);

        //
        //  A required system id makes the missing whitespace an error in
        //  every case. An optional one makes it an error only if a quote
        //  follows, because then a system id was most likely intended.
        //
        if (sysIdRequired || quoteFollows)
            emitError(XML4CErrs::ExpectedWhitespace);

        //
        //  Without a quote there is no literal to scan. That is a failure
        //  if the system id was required and a legal ending otherwise.
        //
        if (!quoteFollows)
            return !sysIdRequired;
    }

    if (!scanSystemLiteral(sysIdToFill))
    {
        emitError(XML4CErrs::ExpectedSystemId);
        return false;
    }
    return true;
}
//
//  Scans a quoted public literal into toFill. The surrounding quotes are
//  discarded. Characters that are not valid public id characters are
//  reported but still stored.
//
//  Returns false if no opening quote is found, otherwise true once the
//  closing quote has been consumed. Throws UnexpectedEOFException if a
//  zero character is read before the closing quote.
//
bool XMLScanner::scanPublicLiteral(XMLBuffer& toFill)
{
    toFill.reset();

    // The literal has to open with a single or double quote
    XMLCh openQuote;
    if (!fReaderMgr.skipIfQuote(openQuote))
        return false;

    for (;;)
    {
        const XMLCh curCh = fReaderMgr.getNextChar();

        // Running out of input inside the literal is fatal
        if (!curCh)
            ThrowXML(UnexpectedEOFException, XML4CExcepts::Gen_UnexpectedEOF);

        // The matching quote ends the literal
        if (curCh == openQuote)
            return true;

        //
        //  Complain about a bad character but keep it anyway, since
        //  carrying on is the best way to recover.
        //
        if (!XMLReader::isPublicIdChar(curCh))
            emitError(XML4CErrs::InvalidPublicIdChar);

        toFill.append(curCh);
    }
}
//
//  Scans a quoted system literal into toFill. It expects to start on the
//  opening quote and returns after consuming the closing one. The content
//  is stored as it comes, with no checks on the characters.
//
//  Returns false if no opening quote is found, otherwise true. Throws
//  UnexpectedEOFException if a zero character is read before the closing
//  quote.
//
bool XMLScanner::scanSystemLiteral(XMLBuffer& toFill)
{
    toFill.reset();

    // The literal has to open with a single or double quote
    XMLCh openQuote;
    if (!fReaderMgr.skipIfQuote(openQuote))
        return false;

    for (;;)
    {
        const XMLCh curCh = fReaderMgr.getNextChar();

        // Running out of input inside the literal is fatal
        if (!curCh)
            ThrowXML(UnexpectedEOFException, XML4CExcepts::Gen_UnexpectedEOF);

        // The matching quote ends the literal
        if (curCh == openQuote)
            return true;

        toFill.append(curCh);
    }
}
//
//  Has the reader manager collect characters into toFill by way of
//  getUpToCharOrWS(), with chEndChar as the end character, and returns the
//  length of toFill afterwards.
//
unsigned int
XMLScanner::scanUpToWSOr(XMLBuffer& toFill, const XMLCh chEndChar)
{
    fReaderMgr.getUpToCharOrWS(toFill, chEndChar);

    const unsigned int filledLen = toFill.getLen();
    return filledLen;
}