blob: eb829e68f223758d316c56ecfe11cf196261e6f8 [file] [log] [blame]
/*
* The Apache Software License, Version 1.1
*
* Copyright (c) 1999 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Xerces" and "Apache Software Foundation" must
* not be used to endorse or promote products derived from this
* software without prior written permission. For written
* permission, please contact apache\@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* nor may "Apache" appear in their name, without prior written
* permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation, and was
* originally based on software copyright (c) 1999, International
* Business Machines, Inc., http://www.ibm.com . For more information
* on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
/**
* $Log$
* Revision 1.1 1999/11/09 01:08:23 twl
* Initial revision
*
* Revision 1.4 1999/11/08 20:44:52 rahul
* Swat for adding in Product name and CVS comment log variable.
*
*/
#if !defined(XMLSCANNER_HPP)
#define XMLSCANNER_HPP
#include <util/KVStringPair.hpp>
#include <util/RefVectorOf.hpp>
#include <util/XMLString.hpp>
#include <framework/XMLAttr.hpp>
#include <framework/XMLBufferMgr.hpp>
#include <framework/XMLErrorCodes.hpp>
#include <framework/XMLRefInfo.hpp>
#include <framework/XMLPScanToken.hpp>
#include <internal/ElemStack.hpp>
#include <internal/ReaderMgr.hpp>
class InputSource;
class XMLDocumentHandler;
class XMLDocumentTypeHandler;
class XMLElementDecl;
class XMLEntityHandler;
class XMLErrorReporter;
class XMLMsgLoader;
class XMLValidator;
//
// This is the mondo scanner class, which does the vast majority of the
// work of parsing. It handles reading in input and spitting out events
// to installed handlers.
//
class XMLPARSER_EXPORT XMLScanner
{
public :
// -----------------------------------------------------------------------
// Public class types
//
// NOTE: These should really be private, but some of the compilers we
// have to deal with are too stupid to understand this.
//
// DeclTypes
// Used by scanXMLDecl() to know what type of decl it should scan.
// Text decls have slightly different rules from XMLDecls.
//
// EntityExpRes
// These are the values returned from the entity expansion method,
// to indicate how it went.
//
// XMLTokens
// These represent the possible types of input we can get while
// scanning content.
// -----------------------------------------------------------------------
enum DeclTypes
{
Decl_Text
, Decl_XML
};
enum EntityExpRes
{
EntityExp_Pushed
, EntityExp_Returned
, EntityExp_Failed
};
enum XMLTokens
{
Token_CData
, Token_CharData
, Token_Comment
, Token_EndTag
, Token_EOF
, Token_PI
, Token_StartTag
, Token_Unknown
};
// -----------------------------------------------------------------------
// Constructors and Destructor
// -----------------------------------------------------------------------
XMLScanner
(
XMLValidator* const validator
);
XMLScanner
(
XMLDocumentHandler* const docHandler
, XMLEntityHandler* const entityHandler
, XMLErrorReporter* const errReporter
, XMLValidator* const validator
);
~XMLScanner();
// -----------------------------------------------------------------------
// Error emitter methods
// -----------------------------------------------------------------------
void emitError(const XML4CErrs::Codes toEmit);
void emitError
(
const XML4CErrs::Codes toEmit
, const XMLCh* const text1
, const XMLCh* const text2 = 0
, const XMLCh* const text3 = 0
, const XMLCh* const text4 = 0
);
void emitError
(
const XML4CErrs::Codes toEmit
, const char* const text1
, const char* const text2 = 0
, const char* const text3 = 0
, const char* const text4 = 0
);
// -----------------------------------------------------------------------
// Getter methods
// -----------------------------------------------------------------------
const XMLDocumentHandler* getDocHandler() const;
XMLDocumentHandler* getDocHandler();
bool getDoNamespaces() const;
bool getDoValidation() const;
const XMLEntityHandler* getEntityHandler() const;
XMLEntityHandler* getEntityHandler();
const XMLErrorReporter* getErrorReporter() const;
XMLErrorReporter* getErrorReporter();
bool getExitOnFirstFatal() const;
RefHashTableOf<XMLRefInfo>& getIDRefList();
bool getInException() const;
const RefHashTableOf<XMLRefInfo>& getIDRefList() const;
bool getLastExtLocation
(
XMLCh* const sysIdToFill
, const unsigned int maxSysIdChars
, XMLCh* const pubIdToFill
, const unsigned int maxPubIdChars
, unsigned int& lineToFill
, unsigned int& colToFill
);
bool getStandalone() const;
const XMLValidator* getValidator() const;
XMLValidator* getValidator();
// -----------------------------------------------------------------------
// Setter methods
// -----------------------------------------------------------------------
void setDocHandler(XMLDocumentHandler* const docHandler);
void setDoNamespaces(const bool doNamespaces);
void setDoValidation(const bool validate);
void setEntityHandler(XMLEntityHandler* const docTypeHandler);
void setErrorReporter(XMLErrorReporter* const errHandler);
void setExitOnFirstFatal(const bool newValue);
void setValidator(XMLValidator* const validator);
// -----------------------------------------------------------------------
// Document scanning methods
//
// scanDocument() does the entire source document. scanFirst() and
// scanNext() support a progressive parse.
// -----------------------------------------------------------------------
void scanDocument
(
const InputSource& src
, const bool reuseValidator = false
);
bool scanFirst
(
const InputSource& src
, XMLPScanToken& toFill
, const bool reuseValidator = false
);
bool scanNext(XMLPScanToken& toFill);
private :
// -----------------------------------------------------------------------
// Private class types
// -----------------------------------------------------------------------
enum IDTypes
{
IDType_Public
, IDType_External
, IDType_Either
};
enum DTDSubsets
{
Subset_Internal
, Subset_External
};
// -----------------------------------------------------------------------
// Unimplemented constructors and operators
// -----------------------------------------------------------------------
XMLScanner();
XMLScanner(const XMLScanner&);
void operator=(const XMLScanner&);
// -----------------------------------------------------------------------
// Private helper methods
// -----------------------------------------------------------------------
void commonInit();
// -----------------------------------------------------------------------
// Private helper methods
//
// These are implemented in XMLScanner2.cpp, to keep the main file from
// becoming so bloated. We can't have any bloated files.
// -----------------------------------------------------------------------
unsigned int buildAttList
(
const RefVectorOf<KVStringPair>& providedAttrs
, const unsigned int attCount
, XMLElementDecl& elemDecl
, RefVectorOf<XMLAttr>& toFill
);
void checkIDRefs();
bool isLegalToken(const XMLPScanToken& toCheck);
bool normalizeAttValue
(
const XMLCh* const value
, const XMLAttDef::AttTypes type
, XMLBuffer& toFill
);
unsigned int resolveQName
(
const XMLCh* const qName
, XMLBuffer& nameBufToFill
, XMLBuffer& prefixBufToFill
, const ElemStack::MapModes mode
);
unsigned int resolvePrefix
(
const XMLCh* const prefix
, const ElemStack::MapModes mode
);
unsigned int resolvePrefix
(
const XMLCh* const prefix
, XMLBuffer& uriBufToFill
, const ElemStack::MapModes mode
);
void scanReset(const InputSource& src);
void sendCharData(XMLBuffer& toSend);
XMLTokens senseNextToken(unsigned int& orgReader);
void updateNSMap
(
const XMLCh* const attrName
, const XMLCh* const attrValue
);
void validateAttrValue
(
const XMLCh* const valueText
, const XMLAttDef::AttTypes type
, const XMLAttDef::DefAttTypes defType
, const XMLCh* const defText
, const XMLCh* const fullName
, const XMLCh* const enumList
);
// -----------------------------------------------------------------------
// Private scanning methods
// -----------------------------------------------------------------------
bool basicAttrValueScan(XMLBuffer& toFill);
bool getQuotedString(XMLBuffer& toFill);
unsigned int rawAttrScan
(
RefVectorOf<KVStringPair>& toFill
, bool& isEmpty
);
bool scanAttValue(XMLBuffer& toFill, const XMLAttDef::AttTypes type);
void scanCDSection();
void scanCharData(XMLBuffer& toToUse);
bool scanCharRef(XMLCh& toFill, XMLCh& second);
void scanComment();
bool scanContent(const bool extEntity);
void scanDocTypeDecl();
void scanEndTag(bool& gotData);
EntityExpRes scanEntityRef
(
const bool inAttVal
, XMLCh& firstCh
, XMLCh& secondCh
, bool& escaped
);
bool scanEq();
bool scanId
(
XMLBuffer& pubIdToFill
, XMLBuffer& sysIdToFill
, const IDTypes whatKind
);
void scanIgnoredSection();
bool scanInternalSubset();
void scanMiscellaneous();
void scanPI();
void scanProlog();
bool scanPublicLiteral(XMLBuffer& toFill);
bool scanStartTag(bool& gotData);
bool scanStartTagNS(bool& gotData);
bool scanSystemLiteral(XMLBuffer& toFill);
void scanXMLDecl(const DeclTypes type);
unsigned int scanUpToWSOr
(
XMLBuffer& toFill
, const XMLCh chEndChar
);
// -----------------------------------------------------------------------
// Data members
//
// fAttrList
// Every time we get a new element start tag, we have to pass to
// the document handler the attributes found. To make it more
// efficient we keep this ref vector of XMLAttr objects around. We
// just reuse it over and over, allowing it to grow to meet the
// peek need.
//
// fBaseDir
// This is the base directory, from which the initial XML file
// was loaded. It is set after the file is successfully opened,
// so we know it to be valid. If the initial file had no path
// component, then this is left null. It is used to handle relative
// paths of DTD and external entity system ids.
//
// fBufMgr
// This is a manager for temporary buffers used during scanning.
// For efficiency we must use a set of static buffers, but we have
// to insure that they are not incorrectly reused. So this manager
// provides the smarts to hand out buffers as required.
//
// fDocHandler
// The client code's document handler. If zero, then no document
// handler callouts are done. We don't adopt it.
//
// fDoNamespaces
// This flag indicates whether the client code wants us to do
// namespaces or not. If the installed validator indicates that it
// has to do namespaces, then this is ignored.
//
// fDoValidation
// Indicates whether any validation should be done. A validator
// can still be installed and it will build up all the data
// structures for the DTD/Schema declarations, but the scanner will
// not ask it to validate anything.
//
// fElemStack
// This is the element stack that is used to track the elements that
// are currently being worked on.
//
// fEntityHandler
// The client code's entity handler. If zero, then no entity handler
// callouts are done. We don't adopt it.
//
// fErrorReporter
// The client code's error handler. If zero, then no error handler
// callouts are done. We don't adopt it.
//
// fExitOnFirstFatal
// This indicates whether we bail out on the first fatal XML error
// or not. It defaults to true, which is the strict XML way, but it
// can be changed.
//
// fIDRefList
// This is a list of XMLRefInfo objects. This member lets us do all
// needed ID-IDREF balancing checks.
//
// fInException
// To avoid a circular freakout when we catch an exception and emit
// it, which would normally throw again if the 'fail on first error'
// flag is one.
//
// fRawAttrList
// During the initial scan of the attributes we can only do a raw
// scan for key/value pairs. So this vector is used to store them
// until they can be processed (and put into fAttrList.)
//
// fReaderMgr
// This is the reader manager, from which we get characters. It
// manages the reader stack for us, and provides a lot of convenience
// methods to do specialized checking for chars, sequences of chars,
// skipping chars, etc...
//
// fReuseValidator
// This flag is set on a per-scan basis. So its provided in the
// scanDocument() and scanFirst() methods, and applies for that
// one pass. It indicates that the validator should not be reused
// and that any external structural description should be ignored.
// There cannot be any internal subset.
//
// fScannerId
// fSequenceId
// These are used for progressive parsing, to make sure that the
// client code does the right thing at the right time.
//
// fStandalone
// Indicates whether the document is standalone or not. Defaults to
// no, but can be overridden in the XMLDecl.
//
// fValidator
// The installed validator. We look at them via the abstract
// validator interface, and don't know what it actual is.
//
//
// fAttName
// fAttValue
// fCDataBuf
// fNameBuf
// fQNameBuf
// fPrefixBuf
// For the most part, buffers are obtained from the fBufMgr object
// on the fly. However, for the start tag scan, we have a set of
// fixed buffers for performance reasons. These are used a lot and
// there are a number of them, so asking the buffer manager each
// time for new buffers is a bit too much overhead.
// -----------------------------------------------------------------------
RefVectorOf<XMLAttr>* fAttrList;
XMLCh* fBaseDir;
XMLBufferMgr fBufMgr;
XMLDocumentHandler* fDocHandler;
bool fDoNamespaces;
bool fDoValidation;
ElemStack fElemStack;
XMLEntityHandler* fEntityHandler;
XMLErrorReporter* fErrorReporter;
bool fExitOnFirstFatal;
RefHashTableOf<XMLRefInfo>* fIDRefList;
bool fInException;
RefVectorOf<KVStringPair>* fRawAttrList;
ReaderMgr fReaderMgr;
bool fReuseValidator;
XMLUInt32 fScannerId;
XMLUInt32 fSequenceId;
bool fStandalone;
XMLValidator* fValidator;
XMLBuffer fAttNameBuf;
XMLBuffer fAttValueBuf;
XMLBuffer fCDataBuf;
XMLBuffer fNameBuf;
XMLBuffer fQNameBuf;
XMLBuffer fPrefixBuf;
XMLBuffer fURIBuf;
};
// ---------------------------------------------------------------------------
// XMLScanner: Getter methods
// ---------------------------------------------------------------------------
// Read-only access to the installed document handler. The stored pointer is
// returned unchanged, so the result is zero when no handler is installed.
inline const XMLDocumentHandler* XMLScanner::getDocHandler() const
{
    return this->fDocHandler;
}
// Mutable access to the installed document handler. The stored pointer is
// returned unchanged, so the result is zero when no handler is installed.
inline XMLDocumentHandler* XMLScanner::getDocHandler()
{
    return this->fDocHandler;
}
inline bool XMLScanner::getDoNamespaces() const
{
return fDoNamespaces;
}
inline bool XMLScanner::getDoValidation() const
{
return fDoValidation;
}
// Read-only access to the installed entity handler. The stored pointer is
// returned unchanged, so the result is zero when no handler is installed.
inline const XMLEntityHandler* XMLScanner::getEntityHandler() const
{
    return this->fEntityHandler;
}
// Mutable access to the installed entity handler. The stored pointer is
// returned unchanged, so the result is zero when no handler is installed.
inline XMLEntityHandler* XMLScanner::getEntityHandler()
{
    return this->fEntityHandler;
}
// Read-only access to the installed error reporter. The stored pointer is
// returned unchanged, so the result is zero when no reporter is installed.
inline const XMLErrorReporter* XMLScanner::getErrorReporter() const
{
    return this->fErrorReporter;
}
// Mutable access to the installed error reporter. The stored pointer is
// returned unchanged, so the result is zero when no reporter is installed.
inline XMLErrorReporter* XMLScanner::getErrorReporter()
{
    return this->fErrorReporter;
}
inline bool XMLScanner::getExitOnFirstFatal() const
{
return fExitOnFirstFatal;
}
// Mutable access to the ID/IDREF tracking table. fIDRefList is dereferenced
// here without a null check, so it must already point at a valid table.
inline RefHashTableOf<XMLRefInfo>& XMLScanner::getIDRefList()
{
    return *(this->fIDRefList);
}
inline bool XMLScanner::getInException() const
{
return fInException;
}
// Read-only access to the ID/IDREF tracking table. fIDRefList is
// dereferenced here without a null check, so it must already point at a
// valid table.
inline const RefHashTableOf<XMLRefInfo>& XMLScanner::getIDRefList() const
{
    return *(this->fIDRefList);
}
inline bool XMLScanner::getStandalone() const
{
return fStandalone;
}
// Read-only access to the installed validator; the stored pointer is
// returned unchanged.
inline const XMLValidator* XMLScanner::getValidator() const
{
    return this->fValidator;
}
// Mutable access to the installed validator; the stored pointer is returned
// unchanged.
inline XMLValidator* XMLScanner::getValidator()
{
    return this->fValidator;
}
// ---------------------------------------------------------------------------
// XMLScanner: Setter methods
// ---------------------------------------------------------------------------
inline void XMLScanner::setDoNamespaces(const bool doNamespaces)
{
fDoNamespaces = doNamespaces;
}
// Installs the document handler. Only the pointer is stored; nothing else
// is updated here.
inline void XMLScanner::setDocHandler(XMLDocumentHandler* const docHandler)
{
    this->fDocHandler = docHandler;
}
// Installs the error reporter. Only the pointer is stored; nothing else is
// updated here.
inline void XMLScanner::setErrorReporter(XMLErrorReporter* const errHandler)
{
    this->fErrorReporter = errHandler;
}
// Installs the entity handler. Unlike the other handler setters, the new
// pointer is also handed to the reader manager so that both objects refer
// to the same handler.
inline void XMLScanner::setEntityHandler(XMLEntityHandler* const entityHandler)
{
    // Remember it locally first...
    this->fEntityHandler = entityHandler;

    // ...then pass the same pointer on to the reader manager.
    this->fReaderMgr.setEntityHandler(entityHandler);
}
inline void XMLScanner::setExitOnFirstFatal(const bool newValue)
{
fExitOnFirstFatal = newValue;
}
#endif