src/util/URL.cpp - xerces-c - Git at Google

 /*
  * The Apache Software License, Version 1.1
  *
  * Copyright (c) 1999 The Apache Software Foundation.  All rights
  * reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  *
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  *
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in
  *    the documentation and/or other materials provided with the
  *    distribution.
  *
  * 3. The end-user documentation included with the redistribution,
  *    if any, must include the following acknowledgment:
  *       "This product includes software developed by the
  *        Apache Software Foundation (http://www.apache.org/)."
  *    Alternately, this acknowledgment may appear in the software itself,
  *    if and wherever such third-party acknowledgments normally appear.
  *
  * 4. The names "Xerces" and "Apache Software Foundation" must
  *    not be used to endorse or promote products derived from this
  *    software without prior written permission. For written
  *    permission, please contact apache\@apache.org.
  *
  * 5. Products derived from this software may not be called "Apache",
  *    nor may "Apache" appear in their name, without prior written
  *    permission of the Apache Software Foundation.
  *
  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
  * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
  * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
  * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * ====================================================================
  *
  * This software consists of voluntary contributions made by many
  * individuals on behalf of the Apache Software Foundation, and was
  * originally based on software copyright (c) 1999, International
  * Business Machines, Inc., http://www.ibm.com .  For more information
  * on the Apache Software Foundation, please see
  * <http://www.apache.org/>.
  */

 /**
  * $Log$
  * Revision 1.1  1999/11/09 01:04:20  twl
  * Initial revision
  *
  * Revision 1.3  1999/11/08 20:45:16  rahul
  * Swat for adding in Product name and CVS comment log variable.
  *
  */


 // ---------------------------------------------------------------------------
 //  Includes
 // ---------------------------------------------------------------------------
 #include <util/BinFileInputStream.hpp>
 #include <util/Janitor.hpp>
 #include <util/PlatformUtils.hpp>
 #include <util/RuntimeException.hpp>
 #include <util/URL.hpp>
 #include <util/XMLString.hpp>
 #include <util/XMLUni.hpp>


 // ---------------------------------------------------------------------------
 //  Local types
 //
 //  TypeEntry
 //      This structure defines a single entry in the list of URL types. Each
 //      entry indicates the prefix for that type of URL, and the SourceTypes
 //      value it maps to.
 // ---------------------------------------------------------------------------
 struct TypeEntry
 {
     URL::Protocols  protocol;
     const XMLCh*    prefix;
     bool            supported;
 };


 // ---------------------------------------------------------------------------
 //  Local data
 //
 //  gXXXString
 //      These are the strings for our prefix types. They all have to be
 //      Unicode strings all the time, so we can't just do regular strings.
 //
 //  gTypeList
 //      The list of URL types that we support
 //
 //  gMaxProtoLen
 //  gMaxColonPos
 //      The length of the longest protocol string and from that the maximum
 //      index at which we must see the colon.
 //
 //      NOTE:!!! Be sure to keep this up to date!
 // ---------------------------------------------------------------------------
 static const XMLCh  gFileString[] =
 {
         chLatin_f, chLatin_i, chLatin_l, chLatin_e, chColon
     ,   chForwardSlash, chForwardSlash, chNull
 };

 static const XMLCh gFTPString[]  =
 {
         chLatin_f, chLatin_t, chLatin_p, chColon, chForwardSlash
     ,   chForwardSlash, chNull
 };

 static const XMLCh gGopherString[]  =
 {
         chLatin_g, chLatin_o, chLatin_p, chLatin_h, chLatin_e
     ,   chLatin_r, chColon, chForwardSlash, chForwardSlash, chNull
 };

 static const XMLCh gHTTPString[] =
 {
         chLatin_h, chLatin_t, chLatin_t, chLatin_p, chColon
     ,   chForwardSlash, chForwardSlash, chNull
 };

 static const XMLCh gLocalHostString[] =
 {
         chLatin_l, chLatin_o, chLatin_c, chLatin_a, chLatin_l
     ,   chLatin_h, chLatin_o, chLatin_s, chLatin_t, chNull
 };

 static const XMLCh gMailToString[]  =
 {
         chLatin_m, chLatin_a, chLatin_i, chLatin_l, chLatin_t
     ,   chLatin_o, chColon, chForwardSlash, chForwardSlash, chNull
 };

 static const XMLCh gNewsString[] =
 {
         chLatin_n, chLatin_e, chLatin_w, chLatin_s, chColon
     ,   chForwardSlash, chForwardSlash, chNull
 };

 static const XMLCh gNNTPString[] =
 {
         chLatin_n, chLatin_n, chLatin_t, chLatin_p, chColon
     ,   chForwardSlash, chForwardSlash, chNull
 };

 static const XMLCh gTelnetString[]  =
 {
         chLatin_t, chLatin_e, chLatin_l, chLatin_n, chLatin_e
     ,   chLatin_t, chColon, chForwardSlash, chForwardSlash, chNull
 };

 static const XMLCh gWaisString[]  =
 {
         chLatin_w, chLatin_a, chLatin_i, chLatin_s, chColon
     ,   chForwardSlash, chForwardSlash, chNull
 };

 static const XMLCh gProsperoString[]  =
 {
         chLatin_p, chLatin_r, chLatin_o, chLatin_s, chLatin_p
     ,   chLatin_e, chLatin_r, chLatin_o, chColon, chForwardSlash
     ,   chForwardSlash, chNull
 };

 static TypeEntry gTypeList[URL::Protocols_Count] =
 {
         { URL::File     , gFileString       , true }
     ,   { URL::HTTP     , gHTTPString       , false }
     ,   { URL::FTP      , gFTPString        , false }
     ,   { URL::Gopher   , gGopherString     , false }
     ,   { URL::MailTo   , gMailToString     , false }
     ,   { URL::News     , gNewsString       , false }
     ,   { URL::NNTP     , gNNTPString       , false }
     ,   { URL::Telnet   , gTelnetString     , false }
     ,   { URL::Wais     , gWaisString       , false }
     ,   { URL::Prospero , gProsperoString   , false }
 };

 // !!! Keep these up to date with list above!
 static const unsigned int gMaxProtoLen = 11;
 static const unsigned int gMaxColonPos = gMaxProtoLen - 3;


 // ---------------------------------------------------------------------------
 //  Local methods
 // ---------------------------------------------------------------------------
 static bool isHexDigit(const XMLCh toCheck)
 {
     if ((toCheck >= chDigit_0) && (toCheck <= chDigit_9)
     ||  (toCheck >= chLatin_A) && (toCheck <= chLatin_Z)
     ||  (toCheck >= chLatin_a) && (toCheck <= chLatin_z))
     {
         return true;
     }
     return false;
 }

 static unsigned int xlatHexDigit(const XMLCh toXlat)
 {
     if ((toXlat >= chDigit_0) && (toXlat <= chDigit_9))
         return (unsigned int)(toXlat - chDigit_0);

     if ((toXlat >= chLatin_A) && (toXlat <= chLatin_Z))
         return (unsigned int)(toXlat - chLatin_A) + 10;

     return (unsigned int)(toXlat - chLatin_a) + 10;
 }


 // ---------------------------------------------------------------------------
 //  URL: Constructors and Destructor
 // ---------------------------------------------------------------------------
 URL::URL() :

     fFullURL(0)
     , fHost(0)
     , fPath(0)
     , fProtocol(URL::File)
 {
 }

 URL::URL(const URL& toCopy) :

     fFullURL(XMLString::replicate(toCopy.fFullURL))
     , fHost(XMLString::replicate(toCopy.fHost))
     , fPath(XMLString::replicate(toCopy.fPath))
     , fProtocol(toCopy.fProtocol)
 {
 }

 URL::~URL()
 {
     cleanup();
 }


 // ---------------------------------------------------------------------------
 //  URL: Public operators
 // ---------------------------------------------------------------------------
 URL& URL::operator=(const URL& toAssign)
 {
     if (this == &toAssign)
         return *this;

     // Clean up our stuff
     cleanup();

     // And copy his stuff
     fFullURL = XMLString::replicate(toAssign.fFullURL);
     fHost = XMLString::replicate(toAssign.fHost);
     fPath = XMLString::replicate(toAssign.fPath);
     fProtocol = toAssign.fProtocol;

     return *this;
 }

 bool URL::operator==(const URL& toCompare) const
 {
     // Test the obvious one first
     if (fProtocol != toCompare.fProtocol)
         return false;

     //
     //  Oh well, we have to test the components. Don't test the original
     //  URLs, because normalization might have occured that would have made
     //  them equal even though actual text of the full URLs is not.
     //
     if (XMLString::compareString(fPath, toCompare.fPath))
         return false;

     if (XMLString::compareString(fHost, toCompare.fHost))
         return false;

     return true;
 }


 // ---------------------------------------------------------------------------
 //  URL: Getter methods
 // ---------------------------------------------------------------------------
 const XMLCh* URL::getProtocol() const
 {
     return gTypeList[fProtocol].prefix;
 }


 // ---------------------------------------------------------------------------
 //  URL: Setter methods
 // ---------------------------------------------------------------------------
 void URL::setURL(const XMLCh* const urlText)
 {
     fFullURL = XMLString::replicate(urlText);
     try
     {
         parse();
     }

     catch(...)
     {
         cleanup();
         throw;
     }
 }

 void URL::setURL(const char* const urlText)
 {
     // Transcode the passed string to Unicode
     fFullURL = XMLString::transcode(urlText);
     try
     {
         parse();
     }

     catch(...)
     {
         cleanup();
         throw;
     }
 }


 // ---------------------------------------------------------------------------
 //  URL: Miscellaneous methods
 // ---------------------------------------------------------------------------
 BinInputStream* URL::makeNewStream() const
 {
     switch(fProtocol)
     {
         case URL::File  :
         {
             BinFileInputStream* retStrm = new BinFileInputStream(getPath());
             if (!retStrm->getIsOpen())
             {
                 delete retStrm;
                 return 0;
             }
             return retStrm;
             break;
         }

         default :
             ThrowXML(MalformedURLException, XML4CExcepts::URL_UnsupportedProto);
             break;
     }
     return 0;
 }


 // ---------------------------------------------------------------------------
 //  URL: Private helper methods
 // ---------------------------------------------------------------------------

 //
 //  Just a central place to handle cleanup, since its done from a number
 //  of different spots.
 //
 void URL::cleanup()
 {
     delete [] fFullURL;
     fFullURL = 0;
     delete [] fHost;
     fHost = 0;
     delete [] fPath;
     fPath = 0;
 }


 //
 //  This method searches our list of protocols and sees if the passed text
 //  starts with one of them. The prefix is the whole thing up to the second
 //  forward slash. The length of the text is passed so that obvious failures
 //  can be found quickly.
 //
 URL::Protocols URL::findType(unsigned int& curPos) const
 {
     XMLCh tmpStr[gMaxProtoLen+1];

     //
     //  Remember the current position so we can do exploratory reads from
     //  the URL. Then look forward for a colon.
     //
     const unsigned int orgPos = curPos;
     unsigned int tmpPos = curPos;
     unsigned int tmpIndex = 0;
     while (true)
     {
         // Get another char from the source URL. Indicate end of text is ok
         const XMLCh nextCh = getNextCh(tmpPos, true);

         // If we hit the end, then no good
         if (!nextCh)
             ThrowXML(MalformedURLException, XML4CExcepts::URL_MalformedURL);

         // Store this new character
         tmpStr[tmpIndex++] = nextCh;

         // If we hit the colon, break out
         if (nextCh == chColon)
             break;

         // If we exceed the max colon pos without finding a colon, then no good
         if (tmpIndex > gMaxColonPos)
             ThrowXML(MalformedURLException, XML4CExcepts::URL_MalformedURL);
     }

     //
     //  See if the next two chars are forward slashes, If not, then undo
     //  our read and return local. Else store them and compare against
     //  the list. Indicate that end of input is ok here.
     //
     const bool gotSlashes = (getNextCh(tmpPos, true) == chForwardSlash)
                             && (getNextCh(tmpPos, true) == chForwardSlash);
     if (!gotSlashes)
         ThrowXML(MalformedURLException, XML4CExcepts::URL_MalformedURL);

     // Store the slashes in our temp string too
     tmpStr[tmpIndex++] = chForwardSlash;
     tmpStr[tmpIndex++] = chForwardSlash;

     // Update the caller's position and cap off our string
     curPos = tmpPos;
     tmpStr[tmpIndex] = chNull;

     //
     //  Ok, lets see if tmpStr matches any of the prefixes in our list of
     //  protocols.
     //
     for (unsigned int index = 0; index < Protocols_Count; index++)
     {
         if (!XMLString::compareString(tmpStr, gTypeList[index].prefix))
             return gTypeList[index].protocol;
     }

     // Cannot be a supported URL protocol
     ThrowXML(MalformedURLException, XML4CExcepts::URL_UnsupportedProto);
 }


 //
 //  This method is used during the parse. It gets the next character out of
 //  the source URL (in fFullURL, which is a copy of the original text) and
 //  returns it. It updates the passed position with the new position.
 //
 //  The primary job of this method is to handle escaped characters, by reading
 //  them in and converting them to a Unicode char.
 //
 XMLCh URL::getNextCh(unsigned int& pos, const bool endOk) const
 {
     //
     //  If we are at the end of the URL, then either return a zero or
     //  throw if end of URL is not legal here.
     //
     if (!fFullURL[pos])
     {
         if (!endOk)
             ThrowXML(MalformedURLException, XML4CExcepts::URL_MalformedURL);
         return chNull;
     }

     //
     //  See if the current character is a '%'. If so, then we need to
     //  deal with an escaped character.
     //
     if (fFullURL[pos] == chPercent)
     {
         XMLCh escapedChar = 0;

         // There must be at least two more characters
         if (!fFullURL[pos+1] || !fFullURL[pos+2])
             ThrowXML(MalformedURLException, XML4CExcepts::URL_MalformedURL);

         // Get them out and test them
         const XMLCh test1 = fFullURL[pos+1];
         const XMLCh test2 = fFullURL[pos+2];

         if (!isHexDigit(test1) || !isHexDigit(test2))
             ThrowXML(MalformedURLException, XML4CExcepts::URL_MalformedURL);

         // Convert these to a character
         escapedChar = XMLCh((xlatHexDigit(test1) << 4) + xlatHexDigit(test2));

         // Bump the position up
         pos += 3;

         return escapedChar;
     }

     // Else just return the current char and bump the position
     return fFullURL[pos++];
 }


 //
 //  This method is called to parse the text into its components and validate
 //  that the URL is legal. It uses getNextCh() to pull characters out of the
 //  URL.
 //
 void URL::parse()
 {
     // This is the current position that we track during the parse
     unsigned int curPos = 0;
     XMLCh nextCh;
     const unsigned int bufSize = 2047;
     XMLCh tmpBuf[bufSize + 1];
     unsigned int bufIndex;

     //
     //  Search the text for a prefix. We will get back the type of the prefix
     //  and the current position will be updated.
     //
     fProtocol = findType(curPos);

     //  In order to distinguish between a malformed URL and an unsupported
     //  URL, we watch here for the types that we support. If its not supported
     //  we throw a runtime exception.
     //
     if (!gTypeList[fProtocol].supported)
         ThrowXML(MalformedURLException, XML4CExcepts::URL_UnsupportedProto);

     //
     //  Check the next char. It must be either a forward slash or it will
     //  be the first char of the host name. We don't allow end of input
     //  here so it will cause an exception if we hit it.
     //
     nextCh = getNextCh(curPos);
     if (nextCh == chForwardSlash)
     {
         // There is no host so make an empty one
         fHost = new XMLCh[1];
         fHost[0] = chNull;
     }
      else
     {
         // Put in the lookahead char we did above before we enter the loop
         bufIndex = 0;
         tmpBuf[bufIndex++] = nextCh;

         // And now read up to the slash separator
         while (true)
         {
             // Get the next char, end of input is not valid here
             const XMLCh nextCh = getNextCh(curPos);

             // Break out on the forward slash
             if (nextCh == chForwardSlash)
                 break;

             // Otherwise, save it
             tmpBuf[bufIndex++] = nextCh;

             // If we max out on the temp buffer, definitely bad
             if (bufIndex >= bufSize)
                 ThrowXML(MalformedURLException, XML4CExcepts::URL_MalformedURL);
         }

         // Cap the temp buffer and replicate to our host member
         tmpBuf[bufIndex] = chNull;
         fHost = XMLString::replicate(tmpBuf);
     }

     //
     //  Now we need to get the path part of the URL. This should be the
     //  rest of the content. So we just go until we get a null char back
     //  from the character spooler. This gets rid of all escaped chars
     //  in the URL.
     //
     bufIndex = 0;
     while (true)
     {
         // Tell it that end of input is ok
         const XMLCh nextCh = getNextCh(curPos, true);
         tmpBuf[bufIndex++] = nextCh;

         // Break out at the end
         if (!nextCh)
             break;
     }

     //
     //  And get our own copy of the temp buffer as the path. If the path
     //  does not start with 'x:', then assume that its a Unix style path
     //  and put in a leading '/'.
     //
     tmpBuf[bufIndex] = chNull;
     if ((((tmpBuf[0] >= chLatin_A) && (tmpBuf[0] <= chLatin_Z))
     ||   ((tmpBuf[0] >= chLatin_a) && (tmpBuf[0] <= chLatin_z)))
     &&  (tmpBuf[1] == chColon))
     {
         fPath = XMLString::replicate(tmpBuf);
     }
      else
     {
         fPath = new XMLCh[XMLString::stringLen(tmpBuf) + 2];
         fPath[0] = chForwardSlash;
         XMLString::copyString(&fPath[1], tmpBuf);
     }

     //
     //  <TBD> When we have more complete support, get rid of this. But for
     //  now we only support file:// (which is checked above) and for files
     //  we only support an empty host or "localhost" which means the same
     //  thing.
     //
     if (fHost[0])
     {
         if (XMLString::compareString(fHost, gLocalHostString))
             ThrowXML(MalformedURLException, XML4CExcepts::URL_OnlyLocalHost);
     }
 }
	/*
	* The Apache Software License, Version 1.1
	*
	* Copyright (c) 1999 The Apache Software Foundation. All rights
	* reserved.
	*
	* Redistribution and use in source and binary forms, with or without
	* modification, are permitted provided that the following conditions
	* are met:
	*
	* 1. Redistributions of source code must retain the above copyright
	* notice, this list of conditions and the following disclaimer.
	*
	* 2. Redistributions in binary form must reproduce the above copyright
	* notice, this list of conditions and the following disclaimer in
	* the documentation and/or other materials provided with the
	* distribution.
	*
	* 3. The end-user documentation included with the redistribution,
	* if any, must include the following acknowledgment:
	* "This product includes software developed by the
	* Apache Software Foundation (http://www.apache.org/)."
	* Alternately, this acknowledgment may appear in the software itself,
	* if and wherever such third-party acknowledgments normally appear.
	*
	* 4. The names "Xerces" and "Apache Software Foundation" must
	* not be used to endorse or promote products derived from this
	* software without prior written permission. For written
	* permission, please contact apache\@apache.org.
	*
	* 5. Products derived from this software may not be called "Apache",
	* nor may "Apache" appear in their name, without prior written
	* permission of the Apache Software Foundation.
	*
	* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
	* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
	* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
	* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
	* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
	* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
	* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
	* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
	* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
	* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
	* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	* SUCH DAMAGE.
	* ====================================================================
	*
	* This software consists of voluntary contributions made by many
	* individuals on behalf of the Apache Software Foundation, and was
	* originally based on software copyright (c) 1999, International
	* Business Machines, Inc., http://www.ibm.com . For more information
	* on the Apache Software Foundation, please see
	* <http://www.apache.org/>.
	*/

	/**
	* $Log$
	* Revision 1.1 1999/11/09 01:04:20 twl
	* Initial revision
	*
	* Revision 1.3 1999/11/08 20:45:16 rahul
	* Swat for adding in Product name and CVS comment log variable.
	*
	*/


	// ---------------------------------------------------------------------------
	// Includes
	// ---------------------------------------------------------------------------
	#include <util/BinFileInputStream.hpp>
	#include <util/Janitor.hpp>
	#include <util/PlatformUtils.hpp>
	#include <util/RuntimeException.hpp>
	#include <util/URL.hpp>
	#include <util/XMLString.hpp>
	#include <util/XMLUni.hpp>


	// ---------------------------------------------------------------------------
	// Local types
	//
	// TypeEntry
	// This structure defines a single entry in the list of URL types. Each
	// entry indicates the prefix for that type of URL, and the SourceTypes
	// value it maps to.
	// ---------------------------------------------------------------------------
	struct TypeEntry
	{
	URL::Protocols protocol;
	const XMLCh* prefix;
	bool supported;
	};


	// ---------------------------------------------------------------------------
	// Local data
	//
	// gXXXString
	// These are the strings for our prefix types. They all have to be
	// Unicode strings all the time, so we can't just do regular strings.
	//
	// gTypeList
	// The list of URL types that we support
	//
	// gMaxProtoLen
	// gMaxColonPos
	// The length of the longest protocol string and from that the maximum
	// index at which we must see the colon.
	//
	// NOTE:!!! Be sure to keep this up to date!
	// ---------------------------------------------------------------------------
	static const XMLCh gFileString[] =
	{
	chLatin_f, chLatin_i, chLatin_l, chLatin_e, chColon
	, chForwardSlash, chForwardSlash, chNull
	};

	static const XMLCh gFTPString[] =
	{
	chLatin_f, chLatin_t, chLatin_p, chColon, chForwardSlash
	, chForwardSlash, chNull
	};

	static const XMLCh gGopherString[] =
	{
	chLatin_g, chLatin_o, chLatin_p, chLatin_h, chLatin_e
	, chLatin_r, chColon, chForwardSlash, chForwardSlash, chNull
	};

	static const XMLCh gHTTPString[] =
	{
	chLatin_h, chLatin_t, chLatin_t, chLatin_p, chColon
	, chForwardSlash, chForwardSlash, chNull
	};

	static const XMLCh gLocalHostString[] =
	{
	chLatin_l, chLatin_o, chLatin_c, chLatin_a, chLatin_l
	, chLatin_h, chLatin_o, chLatin_s, chLatin_t, chNull
	};

	static const XMLCh gMailToString[] =
	{
	chLatin_m, chLatin_a, chLatin_i, chLatin_l, chLatin_t
	, chLatin_o, chColon, chForwardSlash, chForwardSlash, chNull
	};

	static const XMLCh gNewsString[] =
	{
	chLatin_n, chLatin_e, chLatin_w, chLatin_s, chColon
	, chForwardSlash, chForwardSlash, chNull
	};

	static const XMLCh gNNTPString[] =
	{
	chLatin_n, chLatin_n, chLatin_t, chLatin_p, chColon
	, chForwardSlash, chForwardSlash, chNull
	};

	static const XMLCh gTelnetString[] =
	{
	chLatin_t, chLatin_e, chLatin_l, chLatin_n, chLatin_e
	, chLatin_t, chColon, chForwardSlash, chForwardSlash, chNull
	};

	static const XMLCh gWaisString[] =
	{
	chLatin_w, chLatin_a, chLatin_i, chLatin_s, chColon
	, chForwardSlash, chForwardSlash, chNull
	};

	static const XMLCh gProsperoString[] =
	{
	chLatin_p, chLatin_r, chLatin_o, chLatin_s, chLatin_p
	, chLatin_e, chLatin_r, chLatin_o, chColon, chForwardSlash
	, chForwardSlash, chNull
	};

	static TypeEntry gTypeList[URL::Protocols_Count] =
	{
	{ URL::File , gFileString , true }
	, { URL::HTTP , gHTTPString , false }
	, { URL::FTP , gFTPString , false }
	, { URL::Gopher , gGopherString , false }
	, { URL::MailTo , gMailToString , false }
	, { URL::News , gNewsString , false }
	, { URL::NNTP , gNNTPString , false }
	, { URL::Telnet , gTelnetString , false }
	, { URL::Wais , gWaisString , false }
	, { URL::Prospero , gProsperoString , false }
	};

	// !!! Keep these up to date with list above!
	static const unsigned int gMaxProtoLen = 11;
	static const unsigned int gMaxColonPos = gMaxProtoLen - 3;


	// ---------------------------------------------------------------------------
	// Local methods
	// ---------------------------------------------------------------------------
	static bool isHexDigit(const XMLCh toCheck)
	{
	if ((toCheck >= chDigit_0) && (toCheck <= chDigit_9)
	\|\| (toCheck >= chLatin_A) && (toCheck <= chLatin_Z)
	\|\| (toCheck >= chLatin_a) && (toCheck <= chLatin_z))
	{
	return true;
	}
	return false;
	}

	static unsigned int xlatHexDigit(const XMLCh toXlat)
	{
	if ((toXlat >= chDigit_0) && (toXlat <= chDigit_9))
	return (unsigned int)(toXlat - chDigit_0);

	if ((toXlat >= chLatin_A) && (toXlat <= chLatin_Z))
	return (unsigned int)(toXlat - chLatin_A) + 10;

	return (unsigned int)(toXlat - chLatin_a) + 10;
	}



	// ---------------------------------------------------------------------------
	// URL: Constructors and Destructor
	// ---------------------------------------------------------------------------
	URL::URL() :

	fFullURL(0)
	, fHost(0)
	, fPath(0)
	, fProtocol(URL::File)
	{
	}

	URL::URL(const URL& toCopy) :

	fFullURL(XMLString::replicate(toCopy.fFullURL))
	, fHost(XMLString::replicate(toCopy.fHost))
	, fPath(XMLString::replicate(toCopy.fPath))
	, fProtocol(toCopy.fProtocol)
	{
	}

	URL::~URL()
	{
	cleanup();
	}


	// ---------------------------------------------------------------------------
	// URL: Public operators
	// ---------------------------------------------------------------------------
	URL& URL::operator=(const URL& toAssign)
	{
	if (this == &toAssign)
	return *this;

	// Clean up our stuff
	cleanup();

	// And copy his stuff
	fFullURL = XMLString::replicate(toAssign.fFullURL);
	fHost = XMLString::replicate(toAssign.fHost);
	fPath = XMLString::replicate(toAssign.fPath);
	fProtocol = toAssign.fProtocol;

	return *this;
	}

	bool URL::operator==(const URL& toCompare) const
	{
	// Test the obvious one first
	if (fProtocol != toCompare.fProtocol)
	return false;

	//
	// Oh well, we have to test the components. Don't test the original
	// URLs, because normalization might have occured that would have made
	// them equal even though actual text of the full URLs is not.
	//
	if (XMLString::compareString(fPath, toCompare.fPath))
	return false;

	if (XMLString::compareString(fHost, toCompare.fHost))
	return false;

	return true;
	}



	// ---------------------------------------------------------------------------
	// URL: Getter methods
	// ---------------------------------------------------------------------------
	const XMLCh* URL::getProtocol() const
	{
	return gTypeList[fProtocol].prefix;
	}


	// ---------------------------------------------------------------------------
	// URL: Setter methods
	// ---------------------------------------------------------------------------
	void URL::setURL(const XMLCh* const urlText)
	{
	fFullURL = XMLString::replicate(urlText);
	try
	{
	parse();
	}

	catch(...)
	{
	cleanup();
	throw;
	}
	}

	void URL::setURL(const char* const urlText)
	{
	// Transcode the passed string to Unicode
	fFullURL = XMLString::transcode(urlText);
	try
	{
	parse();
	}

	catch(...)
	{
	cleanup();
	throw;
	}
	}



	// ---------------------------------------------------------------------------
	// URL: Miscellaneous methods
	// ---------------------------------------------------------------------------
	BinInputStream* URL::makeNewStream() const
	{
	switch(fProtocol)
	{
	case URL::File :
	{
	BinFileInputStream* retStrm = new BinFileInputStream(getPath());
	if (!retStrm->getIsOpen())
	{
	delete retStrm;
	return 0;
	}
	return retStrm;
	break;
	}

	default :
	ThrowXML(MalformedURLException, XML4CExcepts::URL_UnsupportedProto);
	break;
	}
	return 0;
	}


	// ---------------------------------------------------------------------------
	// URL: Private helper methods
	// ---------------------------------------------------------------------------

	//
	// Just a central place to handle cleanup, since its done from a number
	// of different spots.
	//
	void URL::cleanup()
	{
	delete [] fFullURL;
	fFullURL = 0;
	delete [] fHost;
	fHost = 0;
	delete [] fPath;
	fPath = 0;
	}


	//
	// This method searches our list of protocols and sees if the passed text
	// starts with one of them. The prefix is the whole thing up to the second
	// forward slash. The length of the text is passed so that obvious failures
	// can be found quickly.
	//
	URL::Protocols URL::findType(unsigned int& curPos) const
	{
	XMLCh tmpStr[gMaxProtoLen+1];

	//
	// Remember the current position so we can do exploratory reads from
	// the URL. Then look forward for a colon.
	//
	const unsigned int orgPos = curPos;
	unsigned int tmpPos = curPos;
	unsigned int tmpIndex = 0;
	while (true)
	{
	// Get another char from the source URL. Indicate end of text is ok
	const XMLCh nextCh = getNextCh(tmpPos, true);

	// If we hit the end, then no good
	if (!nextCh)
	ThrowXML(MalformedURLException, XML4CExcepts::URL_MalformedURL);

	// Store this new character
	tmpStr[tmpIndex++] = nextCh;

	// If we hit the colon, break out
	if (nextCh == chColon)
	break;

	// If we exceed the max colon pos without finding a colon, then no good
	if (tmpIndex > gMaxColonPos)
	ThrowXML(MalformedURLException, XML4CExcepts::URL_MalformedURL);
	}

	//
	// See if the next two chars are forward slashes, If not, then undo
	// our read and return local. Else store them and compare against
	// the list. Indicate that end of input is ok here.
	//
	const bool gotSlashes = (getNextCh(tmpPos, true) == chForwardSlash)
	&& (getNextCh(tmpPos, true) == chForwardSlash);
	if (!gotSlashes)
	ThrowXML(MalformedURLException, XML4CExcepts::URL_MalformedURL);

	// Store the slashes in our temp string too
	tmpStr[tmpIndex++] = chForwardSlash;
	tmpStr[tmpIndex++] = chForwardSlash;

	// Update the caller's position and cap off our string
	curPos = tmpPos;
	tmpStr[tmpIndex] = chNull;

	//
	// Ok, lets see if tmpStr matches any of the prefixes in our list of
	// protocols.
	//
	for (unsigned int index = 0; index < Protocols_Count; index++)
	{
	if (!XMLString::compareString(tmpStr, gTypeList[index].prefix))
	return gTypeList[index].protocol;
	}

	// Cannot be a supported URL protocol
	ThrowXML(MalformedURLException, XML4CExcepts::URL_UnsupportedProto);
	}


	//
	// This method is used during the parse. It gets the next character out of
	// the source URL (in fFullURL, which is a copy of the original text) and
	// returns it. It updates the passed position with the new position.
	//
	// The primary job of this method is to handle escaped characters, by reading
	// them in and converting them to a Unicode char.
	//
	XMLCh URL::getNextCh(unsigned int& pos, const bool endOk) const
	{
	//
	// If we are at the end of the URL, then either return a zero or
	// throw if end of URL is not legal here.
	//
	if (!fFullURL[pos])
	{
	if (!endOk)
	ThrowXML(MalformedURLException, XML4CExcepts::URL_MalformedURL);
	return chNull;
	}

	//
	// See if the current character is a '%'. If so, then we need to
	// deal with an escaped character.
	//
	if (fFullURL[pos] == chPercent)
	{
	XMLCh escapedChar = 0;

	// There must be at least two more characters
	if (!fFullURL[pos+1] \|\| !fFullURL[pos+2])
	ThrowXML(MalformedURLException, XML4CExcepts::URL_MalformedURL);

	// Get them out and test them
	const XMLCh test1 = fFullURL[pos+1];
	const XMLCh test2 = fFullURL[pos+2];

	if (!isHexDigit(test1) \|\| !isHexDigit(test2))
	ThrowXML(MalformedURLException, XML4CExcepts::URL_MalformedURL);

	// Convert these to a character
	escapedChar = XMLCh((xlatHexDigit(test1) << 4) + xlatHexDigit(test2));

	// Bump the position up
	pos += 3;

	return escapedChar;
	}

	// Else just return the current char and bump the position
	return fFullURL[pos++];
	}


	//
	// This method is called to parse the text into its components and validate
	// that the URL is legal. It uses getNextCh() to pull characters out of the
	// URL.
	//
	void URL::parse()
	{
	// This is the current position that we track during the parse
	unsigned int curPos = 0;
	XMLCh nextCh;
	const unsigned int bufSize = 2047;
	XMLCh tmpBuf[bufSize + 1];
	unsigned int bufIndex;

	//
	// Search the text for a prefix. We will get back the type of the prefix
	// and the current position will be updated.
	//
	fProtocol = findType(curPos);

	// In order to distinguish between a malformed URL and an unsupported
	// URL, we watch here for the types that we support. If its not supported
	// we throw a runtime exception.
	//
	if (!gTypeList[fProtocol].supported)
	ThrowXML(MalformedURLException, XML4CExcepts::URL_UnsupportedProto);

	//
	// Check the next char. It must be either a forward slash or it will
	// be the first char of the host name. We don't allow end of input
	// here so it will cause an exception if we hit it.
	//
	nextCh = getNextCh(curPos);
	if (nextCh == chForwardSlash)
	{
	// There is no host so make an empty one
	fHost = new XMLCh[1];
	fHost[0] = chNull;
	}
	else
	{
	// Put in the lookahead char we did above before we enter the loop
	bufIndex = 0;
	tmpBuf[bufIndex++] = nextCh;

	// And now read up to the slash separator
	while (true)
	{
	// Get the next char, end of input is not valid here
	const XMLCh nextCh = getNextCh(curPos);

	// Break out on the forward slash
	if (nextCh == chForwardSlash)
	break;

	// Otherwise, save it
	tmpBuf[bufIndex++] = nextCh;

	// If we max out on the temp buffer, definitely bad
	if (bufIndex >= bufSize)
	ThrowXML(MalformedURLException, XML4CExcepts::URL_MalformedURL);
	}

	// Cap the temp buffer and replicate to our host member
	tmpBuf[bufIndex] = chNull;
	fHost = XMLString::replicate(tmpBuf);
	}

	//
	// Now we need to get the path part of the URL. This should be the
	// rest of the content. So we just go until we get a null char back
	// from the character spooler. This gets rid of all escaped chars
	// in the URL.
	//
	bufIndex = 0;
	while (true)
	{
	// Tell it that end of input is ok
	const XMLCh nextCh = getNextCh(curPos, true);
	tmpBuf[bufIndex++] = nextCh;

	// Break out at the end
	if (!nextCh)
	break;
	}

	//
	// And get our own copy of the temp buffer as the path. If the path
	// does not start with 'x:', then assume that its a Unix style path
	// and put in a leading '/'.
	//
	tmpBuf[bufIndex] = chNull;
	if ((((tmpBuf[0] >= chLatin_A) && (tmpBuf[0] <= chLatin_Z))
	\|\| ((tmpBuf[0] >= chLatin_a) && (tmpBuf[0] <= chLatin_z)))
	&& (tmpBuf[1] == chColon))
	{
	fPath = XMLString::replicate(tmpBuf);
	}
	else
	{
	fPath = new XMLCh[XMLString::stringLen(tmpBuf) + 2];
	fPath[0] = chForwardSlash;
	XMLString::copyString(&fPath[1], tmpBuf);
	}

	//
	// <TBD> When we have more complete support, get rid of this. But for
	// now we only support file:// (which is checked above) and for files
	// we only support an empty host or "localhost" which means the same
	// thing.
	//
	if (fHost[0])
	{
	if (XMLString::compareString(fHost, gLocalHostString))
	ThrowXML(MalformedURLException, XML4CExcepts::URL_OnlyLocalHost);
	}
	}