blob: 1ac62edbd1594e4d5779a36f86c9f86efccfd2d5 [file] [log] [blame]
/*
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Xerces" and "Apache Software Foundation" must
* not be used to endorse or promote products derived from this
* software without prior written permission. For written
* permission, please contact apache\@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* nor may "Apache" appear in their name, without prior written
* permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation, and was
* originally based on software copyright (c) 2001, International
* Business Machines, Inc., http://www.ibm.com . For more information
* on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
/*
* $Id$
*/
// ---------------------------------------------------------------------------
// This sample program invokes the IDOMParser to build a DOM tree for
// the specified input file. It then walks the tree, and prints out the data
// as an XML file.
//
// Limitations:
// 1. The encoding="xxx" clause in the XML header should reflect
// the system local code page, but does not.
// 2. Cases where the XML data contains characters that can not
// be represented in the system local code page are not handled.
// 3. Enabled namespace processing won't affect the output, since
// DOM doesn't do namespace yet. But it will confirm that all
// prefixes are correctly mapped, else you'll get errors.
// ---------------------------------------------------------------------------
// ---------------------------------------------------------------------------
// Includes
// ---------------------------------------------------------------------------
#include <xercesc/util/PlatformUtils.hpp>
#include <xercesc/util/XMLString.hpp>
#include <xercesc/util/XMLUniDefs.hpp>
#include <xercesc/framework/XMLFormatter.hpp>
#include <xercesc/util/TranscodingException.hpp>
#include <xercesc/idom/IDOM_DOMException.hpp>
#include <xercesc/parsers/IDOMParser.hpp>
#include <xercesc/idom/IDOM.hpp>
#include "IDOMTreeErrorReporter.hpp"
#include <string.h>
#include <stdlib.h>
// ---------------------------------------------------------------------------
// Local const data
//
// Note: This is the 'safe' way to do these strings. If your compiler supports
// L"" style strings, and portability is not a concern, you can use
// those types of constants directly.
// ---------------------------------------------------------------------------
// Each table below spells out one fixed piece of XML markup, one XMLCh per
// character, terminated by chNull. The text it represents is given above it.

// "</" : start of an element end tag
static const XMLCh gEndElement[] = { chOpenAngle, chForwardSlash, chNull };
// "?>" : end of a processing instruction
static const XMLCh gEndPI[] = { chQuestion, chCloseAngle, chNull};
// "<?" : start of a processing instruction
static const XMLCh gStartPI[] = { chOpenAngle, chQuestion, chNull };
// '<?xml version="1.0' : first part of the XML declaration (the closing
// quote of the version value is supplied by gXMLDecl2)
static const XMLCh gXMLDecl1[] =
{
chOpenAngle, chQuestion, chLatin_x, chLatin_m, chLatin_l
, chSpace, chLatin_v, chLatin_e, chLatin_r, chLatin_s, chLatin_i
, chLatin_o, chLatin_n, chEqual, chDoubleQuote, chDigit_1
, chPeriod, chDigit_0, chNull
};
// '" encoding="' : closes the version value and opens the encoding value
static const XMLCh gXMLDecl2[] =
{
chDoubleQuote, chSpace, chLatin_e, chLatin_n, chLatin_c
, chLatin_o, chLatin_d, chLatin_i, chLatin_n, chLatin_g, chEqual
, chDoubleQuote, chNull
};
// '"?>' followed by a line feed : closes the encoding value and the
// XML declaration
static const XMLCh gXMLDecl3[] =
{
chDoubleQuote, chQuestion, chCloseAngle
, chLF, chNull
};
// "<![CDATA[" : start of a CDATA section
static const XMLCh gStartCDATA[] =
{
chOpenAngle, chBang, chOpenSquare, chLatin_C, chLatin_D,
chLatin_A, chLatin_T, chLatin_A, chOpenSquare, chNull
};
// "]]>" : end of a CDATA section
static const XMLCh gEndCDATA[] =
{
chCloseSquare, chCloseSquare, chCloseAngle, chNull
};
// "<!--" : start of a comment
static const XMLCh gStartComment[] =
{
chOpenAngle, chBang, chDash, chDash, chNull
};
// "-->" : end of a comment
static const XMLCh gEndComment[] =
{
chDash, chDash, chCloseAngle, chNull
};
// "<!DOCTYPE " (with trailing space) : start of a document type declaration
static const XMLCh gStartDoctype[] =
{
chOpenAngle, chBang, chLatin_D, chLatin_O, chLatin_C, chLatin_T,
chLatin_Y, chLatin_P, chLatin_E, chSpace, chNull
};
// 'PUBLIC "' : keyword plus the opening quote of a public identifier
static const XMLCh gPublic[] =
{
chLatin_P, chLatin_U, chLatin_B, chLatin_L, chLatin_I,
chLatin_C, chSpace, chDoubleQuote, chNull
};
// 'SYSTEM "' : keyword plus the opening quote of a system identifier
static const XMLCh gSystem[] =
{
chLatin_S, chLatin_Y, chLatin_S, chLatin_T, chLatin_E,
chLatin_M, chSpace, chDoubleQuote, chNull
};
// "<!ENTITY " (with trailing space) : start of an entity declaration
static const XMLCh gStartEntity[] =
{
chOpenAngle, chBang, chLatin_E, chLatin_N, chLatin_T, chLatin_I,
chLatin_T, chLatin_Y, chSpace, chNull
};
// 'NDATA "' : keyword plus an opening quote, written before an entity's
// notation name
static const XMLCh gNotation[] =
{
chLatin_N, chLatin_D, chLatin_A, chLatin_T, chLatin_A,
chSpace, chDoubleQuote, chNull
};
// ---------------------------------------------------------------------------
//  Local classes
//
//  IDOMPrintFormatTarget
//      Format target that sends every byte sequence it is handed straight
//      to cout. Instances cannot be copied or assigned.
// ---------------------------------------------------------------------------
class IDOMPrintFormatTarget : public XMLFormatTarget
{
public:
    IDOMPrintFormatTarget() {}
    ~IDOMPrintFormatTarget() {}

    // -----------------------------------------------------------------------
    //  XMLFormatTarget interface
    //
    //  Writes 'count' bytes starting at 'toWrite' to cout. The formatter
    //  argument is not used.
    // -----------------------------------------------------------------------
    void writeChars(const XMLByte* const toWrite,
                    const unsigned int   count,
                    XMLFormatter* const  formatter)
    {
        // The bytes are passed on explicitly as char data. Without the
        // cast to a char pointer, Solaris printed the pointer value in hex
        // instead of the string itself.
        const char* const rawBytes = (const char*) toWrite;
        const int         rawCount = (int) count;
        cout.write(rawBytes, rawCount);
    }

private:
    // -----------------------------------------------------------------------
    //  Declared but never implemented, to prevent copying and assignment.
    // -----------------------------------------------------------------------
    IDOMPrintFormatTarget(const IDOMPrintFormatTarget& other);
    void operator=(const IDOMPrintFormatTarget& rhs);
};
// ---------------------------------------------------------------------------
// Local data
//
// gXmlFile
// The path to the file to parse. Set via command line.
//
// gDoNamespaces
// Indicates whether namespace processing should be done. Set via -n.
//
// gDoSchema
// Indicates whether schema processing should be done. Set via -s.
//
// gSchemaFullChecking
// Indicates whether full schema constraint checking should be done.
// Set via -f.
//
// gDoCreate
// Indicates whether entity reference nodes need to be created or not.
// Defaults to false. Set via -e.
//
// gEncodingName
// The encoding we are to output in. Set via the -x= option. If it is not
// set on the command line, main() falls back to UTF-8.
//
// gEncodingNameisOnHeap
// True once gEncodingName has been set from the -x= option, i.e. when it
// holds the result of XMLString::transcode() and main() has to release
// it during cleanup.
//
// gUnRepFlags
// Passed to the XMLFormatter to select how unrepresentable characters are
// handled. Defaults to character references, but can be set via the -u=
// option.
//
// gValScheme
// Indicates what validation scheme to use. It defaults to 'auto', but
// can be set via the -v= option.
//
// gFormatter
// The formatter that the document is written through. main() creates it
// just before output and deletes it afterwards; the node streaming
// operator uses it for all XML text it emits.
//
// ---------------------------------------------------------------------------
static char* gXmlFile = 0;
static bool gDoNamespaces = false;
static bool gDoSchema = false;
static bool gSchemaFullChecking = false;
static bool gDoCreate = false;
static const XMLCh* gEncodingName = 0;
static bool gEncodingNameisOnHeap = false;
static XMLFormatter::UnRepFlags gUnRepFlags = XMLFormatter::UnRep_CharRef;
static IDOMParser::ValSchemes gValScheme = IDOMParser::Val_Auto;
static XMLFormatter* gFormatter = 0;
// ---------------------------------------------------------------------------
// Forward references
// ---------------------------------------------------------------------------
// Prints the command line help text to cout.
void usage();
// Streams a Unicode string after transcoding it to a char* string.
ostream& operator<<(ostream& target, const XMLCh * toWrite);
// Streams a DOM node and, recursively, its children as XML text.
ostream& operator<<(ostream& target, IDOM_Node *toWrite);
// XMLFormatter& operator<< (XMLFormatter& strm, const XMLCh *s);
// ---------------------------------------------------------------------------
//
//  usage()
//
//  Prints the command line help for this sample to cout: the invocation
//  syntax, each supported option with its default, and the encodings the
//  parser supports intrinsically.
//
//  The -x entry states UTF-8 as the default output encoding because that is
//  what main() uses when no -x= option is given.
//
// ---------------------------------------------------------------------------
void usage()
{
    cout << "\nUsage:\n"
            " IDOMPrint [options] <XML file>\n\n"
            "This program invokes the IDOM parser, and builds the DOM tree.\n"
            "It then traverses the DOM tree and prints the contents of the\n"
            "tree for the specified XML file.\n\n"
            "Options:\n"
            " -e create entity reference nodes. Default is no expansion.\n"
            " -u=xxx Handle unrepresentable chars [fail | rep | ref*].\n"
            " -v=xxx Validation scheme [always | never | auto*].\n"
            " -n Enable namespace processing. Default is off.\n"
            " -s Enable schema processing. Default is off.\n"
            " -f Enable full schema constraint checking. Defaults is off.\n"
            " -x=XXX Use a particular encoding for output. Default is UTF-8.\n"
            " -? Show this help.\n\n"
            " * = Default if not provided explicitly.\n\n"
            "The parser has intrinsic support for the following encodings:\n"
            " UTF-8, USASCII, ISO8859-1, UTF-16[BL]E, UCS-4[BL]E,\n"
            " WINDOWS-1252, IBM1140, IBM037.\n"
         << endl;
}
// ---------------------------------------------------------------------------
//
// main
//
// ---------------------------------------------------------------------------
int main(int argC, char* argV[])
{
int retval = 0;
// Initialize the XML4C2 system
try
{
XMLPlatformUtils::Initialize();
}
catch(const XMLException &toCatch)
{
cerr << "Error during Xerces-c Initialization.\n"
<< " Exception message:"
<< StrX(toCatch.getMessage()) << endl;
return 1;
}
// Check command line and extract arguments.
if (argC < 2)
{
usage();
XMLPlatformUtils::Terminate();
return 1;
}
// See if non validating dom parser configuration is requested.
int parmInd;
for (parmInd = 1; parmInd < argC; parmInd++)
{
// Break out on first parm not starting with a dash
if (argV[parmInd][0] != '-')
break;
// Watch for special case help request
if (!strcmp(argV[parmInd], "-?"))
{
usage();
XMLPlatformUtils::Terminate();
return 2;
}
else if (!strncmp(argV[parmInd], "-v=", 3)
|| !strncmp(argV[parmInd], "-V=", 3))
{
const char* const parm = &argV[parmInd][3];
if (!strcmp(parm, "never"))
gValScheme = IDOMParser::Val_Never;
else if (!strcmp(parm, "auto"))
gValScheme = IDOMParser::Val_Auto;
else if (!strcmp(parm, "always"))
gValScheme = IDOMParser::Val_Always;
else
{
cerr << "Unknown -v= value: " << parm << endl;
XMLPlatformUtils::Terminate();
return 2;
}
}
else if (!strcmp(argV[parmInd], "-n")
|| !strcmp(argV[parmInd], "-N"))
{
gDoNamespaces = true;
}
else if (!strcmp(argV[parmInd], "-s")
|| !strcmp(argV[parmInd], "-S"))
{
gDoSchema = true;
}
else if (!strcmp(argV[parmInd], "-f")
|| !strcmp(argV[parmInd], "-F"))
{
gSchemaFullChecking = true;
}
else if (!strcmp(argV[parmInd], "-e")
|| !strcmp(argV[parmInd], "-E"))
{
gDoCreate = true;
}
else if (!strncmp(argV[parmInd], "-x=", 3)
|| !strncmp(argV[parmInd], "-X=", 3))
{
// Get out the encoding name
gEncodingName = XMLString::transcode( &(argV[parmInd][3]) );
gEncodingNameisOnHeap = true;
}
else if (!strncmp(argV[parmInd], "-u=", 3)
|| !strncmp(argV[parmInd], "-U=", 3))
{
const char* const parm = &argV[parmInd][3];
if (!strcmp(parm, "fail"))
gUnRepFlags = XMLFormatter::UnRep_Fail;
else if (!strcmp(parm, "rep"))
gUnRepFlags = XMLFormatter::UnRep_Replace;
else if (!strcmp(parm, "ref"))
gUnRepFlags = XMLFormatter::UnRep_CharRef;
else
{
cerr << "Unknown -u= value: " << parm << endl;
XMLPlatformUtils::Terminate();
return 2;
}
}
// else if (!strcmp(argV[parmInd], "-NoEscape"))
// {
// gDoEscapes = false;
// }
else
{
cerr << "Unknown option '" << argV[parmInd]
<< "', ignoring it.\n" << endl;
}
}
//
// And now we have to have only one parameter left and it must be
// the file name.
//
if (parmInd + 1 != argC)
{
usage();
XMLPlatformUtils::Terminate();
return 1;
}
gXmlFile = argV[parmInd];
//
// Create our parser, then attach an error handler to the parser.
// The parser will call back to methods of the ErrorHandler if it
// discovers errors during the course of parsing the XML document.
//
IDOMParser *parser = new IDOMParser;
parser->setValidationScheme(gValScheme);
parser->setDoNamespaces(gDoNamespaces);
parser->setDoSchema(gDoSchema);
parser->setValidationSchemaFullChecking(gSchemaFullChecking);
IDOMTreeErrorReporter *errReporter = new IDOMTreeErrorReporter();
parser->setErrorHandler(errReporter);
parser->setCreateEntityReferenceNodes(gDoCreate);
//
// Parse the XML file, catching any XML exceptions that might propogate
// out of it.
//
bool errorsOccured = false;
try
{
parser->parse(gXmlFile);
}
catch (const XMLException& e)
{
cerr << "An error occured during parsing\n Message: "
<< StrX(e.getMessage()) << endl;
errorsOccured = true;
}
catch (const IDOM_DOMException& e)
{
cerr << "A DOM error occured during parsing\n DOMException code: "
<< e.code << endl;
errorsOccured = true;
}
catch (...)
{
cerr << "An error occured during parsing\n " << endl;
errorsOccured = true;
}
// If the parse was successful, output the document data from the DOM tree
if (!errorsOccured && !errReporter->getSawErrors())
{
IDOM_Node *doc = parser->getDocument();
IDOMPrintFormatTarget* formatTarget = new IDOMPrintFormatTarget();
// Figure out the encoding that we want to use to write the document.
// If command line specified, use that,
// otherwise use utf-8.
//
if (gEncodingName == 0) // if no encoding specified on command line
{
static const XMLCh utf_8[] = {chLatin_U, chLatin_T, chLatin_F, chDash, chDigit_8, 0};
gEncodingName = utf_8;
}
//
// Create the output formatter and actually write the document HERE!
//
try
{
gFormatter = new XMLFormatter(gEncodingName, formatTarget,
XMLFormatter::NoEscapes, gUnRepFlags);
//print out the XML Decl node first
*gFormatter << gXMLDecl1 ;
*gFormatter << gXMLDecl2 << gEncodingName;
*gFormatter << gXMLDecl3;
cout << doc;
*gFormatter << chLF; // add linefeed in requested output encoding
cout << flush;
}
catch (XMLException& e)
{
cerr << "An error occurred during creation of output transcoder. Msg is:"
<< endl
<< StrX(e.getMessage()) << endl;
retval = 4;
}
delete formatTarget;
delete gFormatter;
}
else
retval = 4;
//
// Clean up the error handler. The parser does not adopt handlers
// since they could be many objects or one object installed for multiple
// handlers.
//
delete errReporter;
//
// Delete the parser itself. Must be done prior to calling Terminate, below.
//
delete parser;
// And call the termination method
XMLPlatformUtils::Terminate();
if (gEncodingNameisOnHeap)
delete (void *)gEncodingName; // const problems.
return retval;
}
// ---------------------------------------------------------------------------
//  ostream << IDOM_Node
//
//  Stream out a DOM node, and, recursively, all of its children. This
//  function is the heart of writing a DOM tree out as XML source. Give it
//  a document node and it will do the whole thing.
//
//  The XML text itself is written through the global gFormatter; 'target'
//  is passed down the recursion, flushed after each child of the document
//  node, and returned.
//
//  Parameters:
//      target   The stream that is handed back to the caller.
//      toWrite  The node to write. Must not be null.
//
//  gFormatter must have been created before this is called.
// ---------------------------------------------------------------------------
ostream& operator<<(ostream& target, IDOM_Node *toWrite)
{
    // Get the name and value out for convenience
    const XMLCh*  nodeName  = toWrite->getNodeName();
    const XMLCh*  nodeValue = toWrite->getNodeValue();
    unsigned long lent      = XMLString::stringLen(nodeValue);

    switch (toWrite->getNodeType())
    {
        case IDOM_Node::TEXT_NODE:
        {
            gFormatter->formatBuf(nodeValue,
                                  lent, XMLFormatter::CharEscapes);
            break;
        }

        case IDOM_Node::PROCESSING_INSTRUCTION_NODE :
        {
            *gFormatter << XMLFormatter::NoEscapes << gStartPI << nodeName;
            if (lent > 0)
            {
                *gFormatter << chSpace << nodeValue;
            }
            *gFormatter << XMLFormatter::NoEscapes << gEndPI;
            break;
        }

        case IDOM_Node::DOCUMENT_NODE :
        {
            IDOM_Node *child = toWrite->getFirstChild();
            while (child != 0)
            {
                target << child;
                // add linefeed in requested output encoding
                *gFormatter << chLF;
                target << flush;
                child = child->getNextSibling();
            }
            break;
        }

        case IDOM_Node::ELEMENT_NODE :
        {
            // The name has to be representable without any escapes
            *gFormatter << XMLFormatter::NoEscapes
                        << chOpenAngle << nodeName;

            // Output any attributes on this element
            IDOM_NamedNodeMap *attributes = toWrite->getAttributes();
            int attrCount = attributes->getLength();
            for (int i = 0; i < attrCount; i++)
            {
                IDOM_Node *attribute = attributes->item(i);

                //
                //  Again the name has to be completely representable. But
                //  the attribute can have refs and requires the attribute
                //  style escaping.
                //
                *gFormatter << XMLFormatter::NoEscapes
                            << chSpace << attribute->getNodeName()
                            << chEqual << chDoubleQuote
                            << XMLFormatter::AttrEscapes
                            << attribute->getNodeValue()
                            << XMLFormatter::NoEscapes
                            << chDoubleQuote;
            }

            //
            //  Test for the presence of children, which includes both
            //  text content and nested elements.
            //
            IDOM_Node *child = toWrite->getFirstChild();
            if (child != 0)
            {
                // There are children. Close start-tag, and output children.
                // No escapes are legal here
                *gFormatter << XMLFormatter::NoEscapes << chCloseAngle;

                while (child != 0)
                {
                    target << child;
                    child = child->getNextSibling();
                }

                //
                //  Done with children. Output the end tag.
                //
                *gFormatter << XMLFormatter::NoEscapes << gEndElement
                            << nodeName << chCloseAngle;
            }
            else
            {
                //
                //  There were no children. Output the short form close of
                //  the element start tag, making it an empty-element tag.
                //
                *gFormatter << XMLFormatter::NoEscapes << chForwardSlash << chCloseAngle;
            }
            break;
        }

        case IDOM_Node::ENTITY_REFERENCE_NODE:
        {
#if 0
            IDOM_Node *child;
            for (child = toWrite->getFirstChild();
                 child != 0;
                 child = child->getNextSibling())
            {
                target << child;
            }
#else
            //
            //  Instead of printing the reference tree
            //  we'd output the actual text as it appeared in the xml file.
            //  This would be the case when -e option was chosen
            //
            *gFormatter << XMLFormatter::NoEscapes << chAmpersand
                        << nodeName << chSemiColon;
#endif
            break;
        }

        case IDOM_Node::CDATA_SECTION_NODE:
        {
            *gFormatter << XMLFormatter::NoEscapes << gStartCDATA
                        << nodeValue << gEndCDATA;
            break;
        }

        case IDOM_Node::COMMENT_NODE:
        {
            *gFormatter << XMLFormatter::NoEscapes << gStartComment
                        << nodeValue << gEndComment;
            break;
        }

        case IDOM_Node::DOCUMENT_TYPE_NODE:
        {
            IDOM_DocumentType *doctype = (IDOM_DocumentType *)toWrite;

            *gFormatter << XMLFormatter::NoEscapes << gStartDoctype
                        << nodeName;

            //
            //  The external id is written in exactly one of two forms:
            //      PUBLIC "publicId" "systemId"
            //      SYSTEM "systemId"
            //  so the system id must be emitted once only, and how it is
            //  introduced depends on whether there is a public id.
            //
            const XMLCh *id = doctype->getPublicId();
            if (id != 0 && *id != 0)
            {
                *gFormatter << XMLFormatter::NoEscapes << chSpace << gPublic
                            << id << chDoubleQuote;

                id = doctype->getSystemId();
                if (id != 0 && *id != 0)
                {
                    *gFormatter << XMLFormatter::NoEscapes << chSpace
                                << chDoubleQuote << id << chDoubleQuote;
                }
            }
            else
            {
                id = doctype->getSystemId();
                if (id != 0 && *id != 0)
                {
                    *gFormatter << XMLFormatter::NoEscapes << chSpace << gSystem
                                << id << chDoubleQuote;
                }
            }

            id = doctype->getInternalSubset();
            if (id != 0 && *id != 0)
                *gFormatter << XMLFormatter::NoEscapes << chOpenSquare
                            << id << chCloseSquare;

            *gFormatter << XMLFormatter::NoEscapes << chCloseAngle;
            break;
        }

        case IDOM_Node::ENTITY_NODE:
        {
            // NOTE(review): this branch writes the keywords directly after
            // the name with no separating space, can write both PUBLIC and
            // SYSTEM, and quotes the NDATA name. That does not look like a
            // well-formed entity declaration - confirm whether entity nodes
            // are ever streamed, and fix if so.
            IDOM_Entity *entity = (IDOM_Entity *)toWrite;

            *gFormatter << XMLFormatter::NoEscapes << gStartEntity
                        << nodeName;

            const XMLCh *id = entity->getPublicId();
            if (id != 0)
                *gFormatter << XMLFormatter::NoEscapes << gPublic
                            << id << chDoubleQuote;

            id = entity->getSystemId();
            if (id != 0)
                *gFormatter << XMLFormatter::NoEscapes << gSystem
                            << id << chDoubleQuote;

            id = entity->getNotationName();
            if (id != 0)
                *gFormatter << XMLFormatter::NoEscapes << gNotation
                            << id << chDoubleQuote;

            *gFormatter << XMLFormatter::NoEscapes << chCloseAngle << chLF;
            break;
        }

        default:
            cerr << "Unrecognized node type = "
                 << (long)toWrite->getNodeType() << endl;
    }
    return target;
}
// ---------------------------------------------------------------------------
//  ostream << XMLCh *
//
//  Stream out a DOM string. Doing this requires that we first transcode
//  to char * form in the default code page for the system.
//
//  Parameters:
//      target  The stream to write to.
//      s       The string to write.
//
//  Returns 'target'. Nothing is written when transcoding yields no string.
//  NOTE(review): presumably that is what transcode() returns for a null
//  input - confirm against the XMLString documentation.
// ---------------------------------------------------------------------------
ostream& operator<< (ostream& target, const XMLCh *s)
{
    char *p = XMLString::transcode(s);

    // Inserting a null char pointer into a stream is not allowed, so only
    // write (and free) a string that is really there.
    if (p != 0)
    {
        target << p;
        delete [] p;
    }
    return target;
}