blob: e7102c3808b70952364a1256b99c6d8aa0b23f96 [file] [log] [blame]
/*
* The Apache Software License, Version 1.1
*
* Copyright (c) 1999-2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Xerces" and "Apache Software Foundation" must
* not be used to endorse or promote products derived from this
* software without prior written permission. For written
* permission, please contact apache\@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* nor may "Apache" appear in their name, without prior written
* permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation, and was
* originally based on software copyright (c) 1999, International
* Business Machines, Inc., http://www.ibm.com . For more information
* on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
/*
* $Id$
*/
// ---------------------------------------------------------------------------
// This sample program which invokes the DOMParser to build a DOM tree for
// the specified input file. It then walks the tree, and prints out the data
// as an XML file.
//
// Limitations:
// 1. The encoding="xxx" clause in the XML header should reflect
// the system local code page, but does not.
// 2. Cases where the XML data contains characters that can not
// be represented in the system local code page are not handled.
// 3. Enabled namespace processing won't affect the output, since
// DOM doesn't do namespace yet. But it will confirm that all
// prefixes are correctly mapped, else you'll get errors.
// ---------------------------------------------------------------------------
// ---------------------------------------------------------------------------
// Includes
// ---------------------------------------------------------------------------
#include <xercesc/util/PlatformUtils.hpp>
#include <xercesc/util/XMLString.hpp>
#include <xercesc/util/XMLUniDefs.hpp>
#include <xercesc/framework/XMLFormatter.hpp>
#include <xercesc/util/TranscodingException.hpp>
#include <xercesc/dom/DOM_DOMException.hpp>
#include <xercesc/parsers/DOMParser.hpp>
#include <xercesc/dom/DOM.hpp>
#include "DOMTreeErrorReporter.hpp"
#include <string.h>
#include <stdlib.h>
// ---------------------------------------------------------------------------
// Local const data
//
// Note: This is the 'safe' way to do these strings. If you compiler supports
// L"" style strings, and portability is not a concern, you can use
// those types constants directly.
// ---------------------------------------------------------------------------
static const XMLCh gEndElement[] = { chOpenAngle, chForwardSlash, chNull };
static const XMLCh gEndPI[] = { chQuestion, chCloseAngle, chNull};
static const XMLCh gStartPI[] = { chOpenAngle, chQuestion, chNull };
static const XMLCh gXMLDecl1[] =
{
chOpenAngle, chQuestion, chLatin_x, chLatin_m, chLatin_l
, chSpace, chLatin_v, chLatin_e, chLatin_r, chLatin_s, chLatin_i
, chLatin_o, chLatin_n, chEqual, chDoubleQuote, chNull
};
static const XMLCh gXMLDecl2[] =
{
chDoubleQuote, chSpace, chLatin_e, chLatin_n, chLatin_c
, chLatin_o, chLatin_d, chLatin_i, chLatin_n, chLatin_g, chEqual
, chDoubleQuote, chNull
};
static const XMLCh gXMLDecl3[] =
{
chDoubleQuote, chSpace, chLatin_s, chLatin_t, chLatin_a
, chLatin_n, chLatin_d, chLatin_a, chLatin_l, chLatin_o
, chLatin_n, chLatin_e, chEqual, chDoubleQuote, chNull
};
static const XMLCh gXMLDecl4[] =
{
chDoubleQuote, chQuestion, chCloseAngle
, chLF, chNull
};
static const XMLCh gStartCDATA[] =
{
chOpenAngle, chBang, chOpenSquare, chLatin_C, chLatin_D,
chLatin_A, chLatin_T, chLatin_A, chOpenSquare, chNull
};
static const XMLCh gEndCDATA[] =
{
chCloseSquare, chCloseSquare, chCloseAngle, chNull
};
static const XMLCh gStartComment[] =
{
chOpenAngle, chBang, chDash, chDash, chNull
};
static const XMLCh gEndComment[] =
{
chDash, chDash, chCloseAngle, chNull
};
static const XMLCh gStartDoctype[] =
{
chOpenAngle, chBang, chLatin_D, chLatin_O, chLatin_C, chLatin_T,
chLatin_Y, chLatin_P, chLatin_E, chSpace, chNull
};
static const XMLCh gPublic[] =
{
chLatin_P, chLatin_U, chLatin_B, chLatin_L, chLatin_I,
chLatin_C, chSpace, chDoubleQuote, chNull
};
static const XMLCh gSystem[] =
{
chLatin_S, chLatin_Y, chLatin_S, chLatin_T, chLatin_E,
chLatin_M, chSpace, chDoubleQuote, chNull
};
static const XMLCh gStartEntity[] =
{
chOpenAngle, chBang, chLatin_E, chLatin_N, chLatin_T, chLatin_I,
chLatin_T, chLatin_Y, chSpace, chNull
};
static const XMLCh gNotation[] =
{
chLatin_N, chLatin_D, chLatin_A, chLatin_T, chLatin_A,
chSpace, chDoubleQuote, chNull
};
// ---------------------------------------------------------------------------
// Local classes
// ---------------------------------------------------------------------------
class DOMPrintFormatTarget : public XMLFormatTarget
{
public:
DOMPrintFormatTarget() {};
~DOMPrintFormatTarget() {};
// -----------------------------------------------------------------------
// Implementations of the format target interface
// -----------------------------------------------------------------------
void writeChars(const XMLByte* const toWrite,
const unsigned int count,
XMLFormatter * const formatter)
{
// Surprisingly, Solaris was the only platform on which
// required the char* cast to print out the string correctly.
// Without the cast, it was printing the pointer value in hex.
// Quite annoying, considering every other platform printed
// the string with the explicit cast to char* below.
cout.write((char *) toWrite, (int) count);
};
private:
// -----------------------------------------------------------------------
// Unimplemented methods.
// -----------------------------------------------------------------------
DOMPrintFormatTarget(const DOMPrintFormatTarget& other);
void operator=(const DOMPrintFormatTarget& rhs);
};
// ---------------------------------------------------------------------------
// Local data
//
// gXmlFile
// The path to the file to parser. Set via command line.
//
// gDoNamespaces
// Indicates whether namespace processing should be done.
//
// gDoSchema
// Indicates whether schema processing should be done.
//
// gSchemaFullChecking
// Indicates whether full schema constraint checking should be done.
//
// gDoCreate
// Indicates whether entity reference nodes needs to be created or not
// Defaults to false
//
// gEncodingName
// The encoding we are to output in. If not set on the command line,
// then it is defaults to the encoding of the input XML file.
//
// gValScheme
// Indicates what validation scheme to use. It defaults to 'auto', but
// can be set via the -v= command.
//
// ---------------------------------------------------------------------------
static char* gXmlFile = 0;
static bool gDoNamespaces = false;
static bool gDoSchema = false;
static bool gSchemaFullChecking = false;
static bool gDoCreate = false;
static XMLCh* gEncodingName = 0;
static XMLFormatter::UnRepFlags gUnRepFlags = XMLFormatter::UnRep_CharRef;
static DOMParser::ValSchemes gValScheme = DOMParser::Val_Auto;
static XMLFormatter* gFormatter = 0;
// ---------------------------------------------------------------------------
// Forward references
// ---------------------------------------------------------------------------
void usage();
ostream& operator<<(ostream& target, const DOMString& toWrite);
ostream& operator<<(ostream& target, DOM_Node& toWrite);
XMLFormatter& operator<< (XMLFormatter& strm, const DOMString& s);
// ---------------------------------------------------------------------------
//
// Usage()
//
// ---------------------------------------------------------------------------
void usage()
{
cout << "\nUsage:\n"
" DOMPrint [options] <XML file>\n\n"
"This program invokes the DOM parser, and builds the DOM tree.\n"
"It then traverses the DOM tree and prints the contents of the\n"
"tree for the specified XML file.\n\n"
"Options:\n"
" -e create entity reference nodes. Default is no expansion.\n"
" -u=xxx Handle unrepresentable chars [fail | rep | ref*].\n"
" -v=xxx Validation scheme [always | never | auto*].\n"
" -n Enable namespace processing. Default is off.\n"
" -s Enable schema processing. Default is off.\n"
" -f Enable full schema constraint checking. Defaults to off.\n"
" -x=XXX Use a particular encoding for output. Default is\n"
" the same encoding as the input XML file. UTF-8 if\n"
" input XML file has not XML declaration.\n"
" -? Show this help.\n\n"
" * = Default if not provided explicitly.\n\n"
"The parser has intrinsic support for the following encodings:\n"
" UTF-8, USASCII, ISO8859-1, UTF-16[BL]E, UCS-4[BL]E,\n"
" WINDOWS-1252, IBM1140, IBM037.\n"
<< endl;
}
// ---------------------------------------------------------------------------
//
// main
//
// ---------------------------------------------------------------------------
int main(int argC, char* argV[])
{
int retval = 0;
// Initialize the XML4C2 system
try
{
XMLPlatformUtils::Initialize();
}
catch(const XMLException& toCatch)
{
cerr << "Error during Xerces-c Initialization.\n"
<< " Exception message:"
<< DOMString(toCatch.getMessage()) << endl;
return 1;
}
// Check command line and extract arguments.
if (argC < 2)
{
usage();
XMLPlatformUtils::Terminate();
return 1;
}
// See if non validating dom parser configuration is requested.
int parmInd;
for (parmInd = 1; parmInd < argC; parmInd++)
{
// Break out on first parm not starting with a dash
if (argV[parmInd][0] != '-')
break;
// Watch for special case help request
if (!strcmp(argV[parmInd], "-?"))
{
usage();
XMLPlatformUtils::Terminate();
return 2;
}
else if (!strncmp(argV[parmInd], "-v=", 3)
|| !strncmp(argV[parmInd], "-V=", 3))
{
const char* const parm = &argV[parmInd][3];
if (!strcmp(parm, "never"))
gValScheme = DOMParser::Val_Never;
else if (!strcmp(parm, "auto"))
gValScheme = DOMParser::Val_Auto;
else if (!strcmp(parm, "always"))
gValScheme = DOMParser::Val_Always;
else
{
cerr << "Unknown -v= value: " << parm << endl;
XMLPlatformUtils::Terminate();
return 2;
}
}
else if (!strcmp(argV[parmInd], "-n")
|| !strcmp(argV[parmInd], "-N"))
{
gDoNamespaces = true;
}
else if (!strcmp(argV[parmInd], "-s")
|| !strcmp(argV[parmInd], "-S"))
{
gDoSchema = true;
}
else if (!strcmp(argV[parmInd], "-f")
|| !strcmp(argV[parmInd], "-F"))
{
gSchemaFullChecking = true;
}
else if (!strcmp(argV[parmInd], "-e")
|| !strcmp(argV[parmInd], "-E"))
{
gDoCreate = true;
}
else if (!strncmp(argV[parmInd], "-x=", 3)
|| !strncmp(argV[parmInd], "-X=", 3))
{
// Get out the encoding name
gEncodingName = XMLString::transcode( &(argV[parmInd][3]) );
}
else if (!strncmp(argV[parmInd], "-u=", 3)
|| !strncmp(argV[parmInd], "-U=", 3))
{
const char* const parm = &argV[parmInd][3];
if (!strcmp(parm, "fail"))
gUnRepFlags = XMLFormatter::UnRep_Fail;
else if (!strcmp(parm, "rep"))
gUnRepFlags = XMLFormatter::UnRep_Replace;
else if (!strcmp(parm, "ref"))
gUnRepFlags = XMLFormatter::UnRep_CharRef;
else
{
cerr << "Unknown -u= value: " << parm << endl;
XMLPlatformUtils::Terminate();
return 2;
}
}
// else if (!strcmp(argV[parmInd], "-NoEscape"))
// {
// gDoEscapes = false;
// }
else
{
cerr << "Unknown option '" << argV[parmInd]
<< "', ignoring it.\n" << endl;
}
}
//
// And now we have to have only one parameter left and it must be
// the file name.
//
if (parmInd + 1 != argC)
{
usage();
XMLPlatformUtils::Terminate();
return 1;
}
gXmlFile = argV[parmInd];
//
// Create our parser, then attach an error handler to the parser.
// The parser will call back to methods of the ErrorHandler if it
// discovers errors during the course of parsing the XML document.
//
DOMParser *parser = new DOMParser;
parser->setValidationScheme(gValScheme);
parser->setDoNamespaces(gDoNamespaces);
parser->setDoSchema(gDoSchema);
parser->setValidationSchemaFullChecking(gSchemaFullChecking);
DOMTreeErrorReporter *errReporter = new DOMTreeErrorReporter();
parser->setErrorHandler(errReporter);
parser->setCreateEntityReferenceNodes(gDoCreate);
parser->setToCreateXMLDeclTypeNode(true);
//
// Parse the XML file, catching any XML exceptions that might propogate
// out of it.
//
bool errorsOccured = false;
try
{
parser->parse(gXmlFile);
int errorCount = parser->getErrorCount();
if (errorCount > 0)
errorsOccured = true;
}
catch (const XMLException& e)
{
cerr << "An error occured during parsing\n Message: "
<< DOMString(e.getMessage()) << endl;
errorsOccured = true;
}
catch (const DOM_DOMException& e)
{
cerr << "A DOM error occured during parsing\n DOMException code: "
<< e.code << endl;
errorsOccured = true;
}
catch (...)
{
cerr << "An error occured during parsing\n " << endl;
errorsOccured = true;
}
// If the parse was successful, output the document data from the DOM tree
if (!errorsOccured && !errReporter->getSawErrors())
{
DOM_Node doc = parser->getDocument();
DOMPrintFormatTarget* formatTarget = new DOMPrintFormatTarget();
if (gEncodingName == 0)
{
DOMString encNameStr("UTF-8");
DOM_Node aNode = doc.getFirstChild();
if (aNode.getNodeType() == DOM_Node::XML_DECL_NODE)
{
DOMString aStr = ((DOM_XMLDecl &)aNode).getEncoding();
if (aStr != "")
{
encNameStr = aStr;
}
}
unsigned int lent = encNameStr.length();
gEncodingName = new XMLCh[lent + 1];
XMLString::copyNString(gEncodingName, encNameStr.rawBuffer(), lent);
gEncodingName[lent] = 0;
}
try
{
gFormatter = new XMLFormatter(gEncodingName, formatTarget,
XMLFormatter::NoEscapes, gUnRepFlags);
cout << doc;
*gFormatter << chLF; // add linefeed in requested output encoding
cout << flush;
}
catch (XMLException& e)
{
cerr << "An error occurred during creation of output transcoder. Msg is:"
<< endl
<< DOMString(e.getMessage()) << endl;
retval = 4;
}
delete formatTarget;
delete gFormatter;
}
else
retval = 4;
delete [] gEncodingName;
//
// Clean up the error handler. The parser does not adopt handlers
// since they could be many objects or one object installed for multiple
// handlers.
//
delete errReporter;
//
// Delete the parser itself. Must be done prior to calling Terminate, below.
//
delete parser;
// And call the termination method
XMLPlatformUtils::Terminate();
// DomMemDebug().print();
//
// The DOM document and its contents are reference counted, and need
// no explicit deletion.
//
return retval;
}
// ---------------------------------------------------------------------------
// ostream << DOM_Node
//
// Stream out a DOM node, and, recursively, all of its children. This
// function is the heart of writing a DOM tree out as XML source. Give it
// a document node and it will do the whole thing.
// ---------------------------------------------------------------------------
ostream& operator<<(ostream& target, DOM_Node& toWrite)
{
// Get the name and value out for convenience
DOMString nodeName = toWrite.getNodeName();
DOMString nodeValue = toWrite.getNodeValue();
unsigned long lent = nodeValue.length();
switch (toWrite.getNodeType())
{
case DOM_Node::TEXT_NODE:
{
gFormatter->formatBuf(nodeValue.rawBuffer(),
lent, XMLFormatter::CharEscapes);
break;
}
case DOM_Node::PROCESSING_INSTRUCTION_NODE :
{
*gFormatter << XMLFormatter::NoEscapes << gStartPI << nodeName;
if (lent > 0)
{
*gFormatter << chSpace << nodeValue;
}
*gFormatter << XMLFormatter::NoEscapes << gEndPI;
break;
}
case DOM_Node::DOCUMENT_NODE :
{
DOM_Node child = toWrite.getFirstChild();
while( child != 0)
{
target << child;
// add linefeed in requested output encoding
*gFormatter << chLF;
target << flush;
child = child.getNextSibling();
}
break;
}
case DOM_Node::ELEMENT_NODE :
{
// The name has to be representable without any escapes
*gFormatter << XMLFormatter::NoEscapes
<< chOpenAngle << nodeName;
// Output the element start tag.
// Output any attributes on this element
DOM_NamedNodeMap attributes = toWrite.getAttributes();
int attrCount = attributes.getLength();
for (int i = 0; i < attrCount; i++)
{
DOM_Node attribute = attributes.item(i);
//
// Again the name has to be completely representable. But the
// attribute can have refs and requires the attribute style
// escaping.
//
*gFormatter << XMLFormatter::NoEscapes
<< chSpace << attribute.getNodeName()
<< chEqual << chDoubleQuote
<< XMLFormatter::AttrEscapes
<< attribute.getNodeValue()
<< XMLFormatter::NoEscapes
<< chDoubleQuote;
}
//
// Test for the presence of children, which includes both
// text content and nested elements.
//
DOM_Node child = toWrite.getFirstChild();
if (child != 0)
{
// There are children. Close start-tag, and output children.
// No escapes are legal here
*gFormatter << XMLFormatter::NoEscapes << chCloseAngle;
while( child != 0)
{
target << child;
child = child.getNextSibling();
}
//
// Done with children. Output the end tag.
//
*gFormatter << XMLFormatter::NoEscapes << gEndElement
<< nodeName << chCloseAngle;
}
else
{
//
// There were no children. Output the short form close of
// the element start tag, making it an empty-element tag.
//
*gFormatter << XMLFormatter::NoEscapes << chForwardSlash << chCloseAngle;
}
break;
}
case DOM_Node::ENTITY_REFERENCE_NODE:
{
DOM_Node child;
#if 0
for (child = toWrite.getFirstChild();
child != 0;
child = child.getNextSibling())
{
target << child;
}
#else
//
// Instead of printing the refernece tree
// we'd output the actual text as it appeared in the xml file.
// This would be the case when -e option was chosen
//
*gFormatter << XMLFormatter::NoEscapes << chAmpersand
<< nodeName << chSemiColon;
#endif
break;
}
case DOM_Node::CDATA_SECTION_NODE:
{
*gFormatter << XMLFormatter::NoEscapes << gStartCDATA
<< nodeValue << gEndCDATA;
break;
}
case DOM_Node::COMMENT_NODE:
{
*gFormatter << XMLFormatter::NoEscapes << gStartComment
<< nodeValue << gEndComment;
break;
}
case DOM_Node::DOCUMENT_TYPE_NODE:
{
DOM_DocumentType doctype = (DOM_DocumentType &)toWrite;;
*gFormatter << XMLFormatter::NoEscapes << gStartDoctype
<< nodeName;
DOMString id = doctype.getPublicId();
if (id != 0)
{
*gFormatter << XMLFormatter::NoEscapes << chSpace << gPublic
<< id << chDoubleQuote;
id = doctype.getSystemId();
if (id != 0)
{
*gFormatter << XMLFormatter::NoEscapes << chSpace
<< chDoubleQuote << id << chDoubleQuote;
}
}
else
{
id = doctype.getSystemId();
if (id != 0)
{
*gFormatter << XMLFormatter::NoEscapes << chSpace << gSystem
<< id << chDoubleQuote;
}
}
id = doctype.getInternalSubset();
if (id !=0)
*gFormatter << XMLFormatter::NoEscapes << chOpenSquare
<< id << chCloseSquare;
*gFormatter << XMLFormatter::NoEscapes << chCloseAngle;
break;
}
case DOM_Node::ENTITY_NODE:
{
*gFormatter << XMLFormatter::NoEscapes << gStartEntity
<< nodeName;
DOMString id = ((DOM_Entity &)toWrite).getPublicId();
if (id != 0)
*gFormatter << XMLFormatter::NoEscapes << gPublic
<< id << chDoubleQuote;
id = ((DOM_Entity &)toWrite).getSystemId();
if (id != 0)
*gFormatter << XMLFormatter::NoEscapes << gSystem
<< id << chDoubleQuote;
id = ((DOM_Entity &)toWrite).getNotationName();
if (id != 0)
*gFormatter << XMLFormatter::NoEscapes << gNotation
<< id << chDoubleQuote;
*gFormatter << XMLFormatter::NoEscapes << chCloseAngle << chLF;
break;
}
case DOM_Node::XML_DECL_NODE:
{
DOMString str;
*gFormatter << gXMLDecl1 << ((DOM_XMLDecl &)toWrite).getVersion();
*gFormatter << gXMLDecl2 << gEncodingName;
str = ((DOM_XMLDecl &)toWrite).getStandalone();
if (str != 0)
*gFormatter << gXMLDecl3 << str;
*gFormatter << gXMLDecl4;
break;
}
default:
cerr << "Unrecognized node type = "
<< (long)toWrite.getNodeType() << endl;
}
return target;
}
// ---------------------------------------------------------------------------
// ostream << DOMString
//
// Stream out a DOM string. Doing this requires that we first transcode
// to char * form in the default code page for the system
// ---------------------------------------------------------------------------
ostream& operator<< (ostream& target, const DOMString& s)
{
char *p = s.transcode();
target << p;
delete [] p;
return target;
}
XMLFormatter& operator<< (XMLFormatter& strm, const DOMString& s)
{
unsigned int lent = s.length();
if (lent <= 0)
return strm;
XMLCh* buf = new XMLCh[lent + 1];
XMLString::copyNString(buf, s.rawBuffer(), lent);
buf[lent] = 0;
strm << buf;
delete [] buf;
return strm;
}