blob: 90a2bbd17c3170abc3e8c0f9d7432c47ad065fc7 [file] [log] [blame]
/*
* The Apache Software License, Version 1.1
*
* Copyright (c) 1999 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Xerces" and "Apache Software Foundation" must
* not be used to endorse or promote products derived from this
* software without prior written permission. For written
* permission, please contact apache\@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* nor may "Apache" appear in their name, without prior written
* permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation, and was
* originally based on software copyright (c) 1999, International
* Business Machines, Inc., http://www.ibm.com . For more information
* on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
/**
* $Log$
* Revision 1.1 1999/11/09 01:04:20 twl
* Initial revision
*
* Revision 1.3 1999/11/08 20:45:16 rahul
* Swat for adding in Product name and CVS comment log variable.
*
*/
// ---------------------------------------------------------------------------
// Includes
// ---------------------------------------------------------------------------
#include <util/BinFileInputStream.hpp>
#include <util/Janitor.hpp>
#include <util/PlatformUtils.hpp>
#include <util/RuntimeException.hpp>
#include <util/URL.hpp>
#include <util/XMLString.hpp>
#include <util/XMLUni.hpp>
// ---------------------------------------------------------------------------
// Local types
//
// TypeEntry
// This structure defines a single entry in the list of URL types. Each
// entry indicates the prefix for that type of URL, the URL::Protocols
// value it maps to, and whether that protocol is currently supported.
// ---------------------------------------------------------------------------
// One row of the protocol table (gTypeList). The member order matters: the
// table below is brace-initialized positionally as { protocol, prefix, supported }.
struct TypeEntry
{
// The enumerated protocol this entry describes
URL::Protocols protocol;
// Null terminated Unicode prefix text that identifies the protocol
const XMLCh* prefix;
// True if URLs of this protocol are accepted by parse(); false means the
// protocol is recognized but rejected as unsupported
bool supported;
};
// ---------------------------------------------------------------------------
// Local data
//
// gXXXString
// These are the strings for our prefix types. They all have to be
// Unicode strings all the time, so we can't just do regular strings.
//
// gTypeList
// The list of URL types that we support
//
// gMaxProtoLen
// gMaxColonPos
// The length of the longest protocol string and from that the maximum
// index at which we must see the colon.
//
// NOTE:!!! Be sure to keep this up to date!
// ---------------------------------------------------------------------------
// "file://"
static const XMLCh gFileString[] =
{
chLatin_f, chLatin_i, chLatin_l, chLatin_e, chColon
, chForwardSlash, chForwardSlash, chNull
};
// "ftp://"
static const XMLCh gFTPString[] =
{
chLatin_f, chLatin_t, chLatin_p, chColon, chForwardSlash
, chForwardSlash, chNull
};
// "gopher://"
static const XMLCh gGopherString[] =
{
chLatin_g, chLatin_o, chLatin_p, chLatin_h, chLatin_e
, chLatin_r, chColon, chForwardSlash, chForwardSlash, chNull
};
// "http://"
static const XMLCh gHTTPString[] =
{
chLatin_h, chLatin_t, chLatin_t, chLatin_p, chColon
, chForwardSlash, chForwardSlash, chNull
};
// "localhost" - not a protocol prefix; this is the only non-empty host name
// that parse() accepts for file URLs
static const XMLCh gLocalHostString[] =
{
chLatin_l, chLatin_o, chLatin_c, chLatin_a, chLatin_l
, chLatin_h, chLatin_o, chLatin_s, chLatin_t, chNull
};
// "mailto://"
static const XMLCh gMailToString[] =
{
chLatin_m, chLatin_a, chLatin_i, chLatin_l, chLatin_t
, chLatin_o, chColon, chForwardSlash, chForwardSlash, chNull
};
// "news://"
static const XMLCh gNewsString[] =
{
chLatin_n, chLatin_e, chLatin_w, chLatin_s, chColon
, chForwardSlash, chForwardSlash, chNull
};
// "nntp://"
static const XMLCh gNNTPString[] =
{
chLatin_n, chLatin_n, chLatin_t, chLatin_p, chColon
, chForwardSlash, chForwardSlash, chNull
};
// "telnet://"
static const XMLCh gTelnetString[] =
{
chLatin_t, chLatin_e, chLatin_l, chLatin_n, chLatin_e
, chLatin_t, chColon, chForwardSlash, chForwardSlash, chNull
};
// "wais://"
static const XMLCh gWaisString[] =
{
chLatin_w, chLatin_a, chLatin_i, chLatin_s, chColon
, chForwardSlash, chForwardSlash, chNull
};
// "prospero://" - the longest prefix; gMaxProtoLen is derived from it
static const XMLCh gProsperoString[] =
{
chLatin_p, chLatin_r, chLatin_o, chLatin_s, chLatin_p
, chLatin_e, chLatin_r, chLatin_o, chColon, chForwardSlash
, chForwardSlash, chNull
};
//
//  The table of known protocols. It is only ever read, so it is const.
//
//  getProtocol() and parse() index this table directly with a
//  URL::Protocols value, so the entries must appear in the same order as
//  the enumerators (NOTE(review): the enum is declared in URL.hpp, which is
//  not visible here - confirm the order matches). findType() searches it
//  linearly by prefix.
//
//  Only file:// is currently marked as supported.
//
static const TypeEntry gTypeList[URL::Protocols_Count] =
{
        { URL::File     , gFileString     , true  }
    ,   { URL::HTTP     , gHTTPString     , false }
    ,   { URL::FTP      , gFTPString      , false }
    ,   { URL::Gopher   , gGopherString   , false }
    ,   { URL::MailTo   , gMailToString   , false }
    ,   { URL::News     , gNewsString     , false }
    ,   { URL::NNTP     , gNNTPString     , false }
    ,   { URL::Telnet   , gTelnetString   , false }
    ,   { URL::Wais     , gWaisString     , false }
    ,   { URL::Prospero , gProsperoString , false }
};
// !!! Keep these up to date with list above!
//
// gMaxProtoLen is the character count of the longest prefix string above
// ("prospero://", 11 characters, not counting the terminating null). Every
// prefix in the list ends in "://", so the colon of the longest prefix sits
// at zero-based index gMaxProtoLen - 3. findType() sizes its scratch buffer
// from gMaxProtoLen and gives up if no colon has been seen by gMaxColonPos.
static const unsigned int gMaxProtoLen = 11;
static const unsigned int gMaxColonPos = gMaxProtoLen - 3;
// ---------------------------------------------------------------------------
// Local methods
// ---------------------------------------------------------------------------
//
//  Returns true if the passed character is a hexadecimal digit, i.e. one of
//  '0'-'9', 'A'-'F' or 'a'-'f', and false for anything else.
//
//  This is used to validate the two characters following a '%' escape, so
//  letters beyond 'F'/'f' must be rejected; otherwise an escape such as
//  "%ZZ" would be decoded into a meaningless character instead of being
//  reported as malformed.
//
static bool isHexDigit(const XMLCh toCheck)
{
    // The letters A-F (and a-f) are contiguous code points, so the upper
    // bound of each letter range is simply five past its first letter.
    return ((toCheck >= chDigit_0) && (toCheck <= chDigit_9))
        || ((toCheck >= chLatin_A) && (toCheck <= chLatin_A + 5))
        || ((toCheck >= chLatin_a) && (toCheck <= chLatin_a + 5));
}
static unsigned int xlatHexDigit(const XMLCh toXlat)
{
if ((toXlat >= chDigit_0) && (toXlat <= chDigit_9))
return (unsigned int)(toXlat - chDigit_0);
if ((toXlat >= chLatin_A) && (toXlat <= chLatin_Z))
return (unsigned int)(toXlat - chLatin_A) + 10;
return (unsigned int)(toXlat - chLatin_a) + 10;
}
// ---------------------------------------------------------------------------
// URL: Constructors and Destructor
// ---------------------------------------------------------------------------
//
//  Default constructor. Creates an empty URL: no text, host or path, with
//  the protocol set to URL::File.
//
URL::URL()
    : fFullURL(0), fHost(0), fPath(0), fProtocol(URL::File)
{
}
//
//  Copy constructor. Each string member is duplicated with
//  XMLString::replicate() so that the two objects never share storage.
//  NOTE(review): copying a default constructed URL passes null pointers to
//  replicate(); confirm that replicate() tolerates a null source.
//
URL::URL(const URL& toCopy)
    : fFullURL(XMLString::replicate(toCopy.fFullURL))
    , fHost(XMLString::replicate(toCopy.fHost))
    , fPath(XMLString::replicate(toCopy.fPath))
    , fProtocol(toCopy.fProtocol)
{
}
//
//  Destructor. Releases the string members via the shared cleanup helper.
//
URL::~URL()
{
    this->cleanup();
}
// ---------------------------------------------------------------------------
// URL: Public operators
// ---------------------------------------------------------------------------
//
//  Assignment. Self assignment is a no-op; otherwise our current strings
//  are released and replaced with private copies of the source's strings.
//
URL& URL::operator=(const URL& toAssign)
{
    if (this != &toAssign)
    {
        // Drop whatever we currently hold
        cleanup();

        // Then take our own copies of the source's members
        fFullURL  = XMLString::replicate(toAssign.fFullURL);
        fHost     = XMLString::replicate(toAssign.fHost);
        fPath     = XMLString::replicate(toAssign.fPath);
        fProtocol = toAssign.fProtocol;
    }
    return *this;
}
//
//  Equality. Two URLs are equal when their protocol, path and host all
//  match. The original full URL text is deliberately not compared, because
//  normalization done during parsing can make two URLs equivalent even
//  though their original text differs.
//
bool URL::operator==(const URL& toCompare) const
{
    // The protocol is the cheapest test, so it goes first. compareString()
    // returns non-zero when the strings differ.
    const bool sameProtocol = (fProtocol == toCompare.fProtocol);

    return sameProtocol
        && !XMLString::compareString(fPath, toCompare.fPath)
        && !XMLString::compareString(fHost, toCompare.fHost);
}
// ---------------------------------------------------------------------------
// URL: Getter methods
// ---------------------------------------------------------------------------
//
//  Returns the prefix text (e.g. "file://") for this URL's protocol, by
//  using the protocol value as an index into the protocol table. The
//  returned string belongs to the table and must not be deleted.
//
const XMLCh* URL::getProtocol() const
{
    const TypeEntry& entry = gTypeList[fProtocol];
    return entry.prefix;
}
// ---------------------------------------------------------------------------
// URL: Setter methods
// ---------------------------------------------------------------------------
void URL::setURL(const XMLCh* const urlText)
{
fFullURL = XMLString::replicate(urlText);
try
{
parse();
}
catch(...)
{
cleanup();
throw;
}
}
void URL::setURL(const char* const urlText)
{
// Transcode the passed string to Unicode
fFullURL = XMLString::transcode(urlText);
try
{
parse();
}
catch(...)
{
cleanup();
throw;
}
}
// ---------------------------------------------------------------------------
// URL: Miscellaneous methods
// ---------------------------------------------------------------------------
//
//  Creates a new binary input stream over the resource this URL refers to.
//  Only file URLs can be streamed; any other protocol causes a
//  MalformedURLException (URL_UnsupportedProto).
//
//  Returns the new stream, which the caller then owns, or zero if the file
//  could not be opened.
//
BinInputStream* URL::makeNewStream() const
{
    // Anything other than a file URL cannot be streamed
    if (fProtocol != URL::File)
        ThrowXML(MalformedURLException, XML4CExcepts::URL_UnsupportedProto);

    BinFileInputStream* fileStrm = new BinFileInputStream(getPath());
    if (fileStrm->getIsOpen())
        return fileStrm;

    // The open failed, so discard the stream and report that with a null
    delete fileStrm;
    return 0;
}
// ---------------------------------------------------------------------------
// URL: Private helper methods
// ---------------------------------------------------------------------------
//
//  A central place to release the string members, since it's done from a
//  number of different spots. The pointers are zeroed afterwards so that
//  calling this more than once is harmless.
//
void URL::cleanup()
{
    // Free all of the buffers...
    delete [] fFullURL;
    delete [] fHost;
    delete [] fPath;

    // ...and forget about them
    fFullURL = 0;
    fHost    = 0;
    fPath    = 0;
}
//
//  This method reads the protocol prefix from the URL text, starting at
//  curPos, and looks it up in our list of protocols. The prefix is
//  everything up to and including the second forward slash that follows
//  the colon, e.g. "file://".
//
//  Characters are pulled through getNextCh(), so escaped characters in the
//  prefix are decoded before being compared.
//
//  On success the matching protocol is returned and curPos is left just
//  past the prefix. A MalformedURLException is thrown if the text ends
//  before a colon is seen, if no colon shows up within the length of the
//  longest known prefix, or if the colon is not followed by two forward
//  slashes (URL_MalformedURL), or if the prefix is not in our list
//  (URL_UnsupportedProto).
//
URL::Protocols URL::findType(unsigned int& curPos) const
{
    // Room for the longest known prefix plus the terminating null
    XMLCh prefixBuf[gMaxProtoLen + 1];
    unsigned int prefixLen = 0;

    //
    //  Scan with a private position, so that the caller's position is only
    //  moved once we know we have a complete prefix.
    //
    unsigned int scanPos = curPos;

    // Collect characters up to and including the colon
    XMLCh curCh;
    do
    {
        // End of text is reported to us as a null rather than a throw
        curCh = getNextCh(scanPos, true);
        if (!curCh)
            ThrowXML(MalformedURLException, XML4CExcepts::URL_MalformedURL);

        prefixBuf[prefixLen++] = curCh;

        // No known prefix has its colon any later than this, so give up
        if ((curCh != chColon) && (prefixLen > gMaxColonPos))
            ThrowXML(MalformedURLException, XML4CExcepts::URL_MalformedURL);
    }   while (curCh != chColon);

    //
    //  The colon must be followed by two forward slashes. If the first one
    //  is missing, the second character is never read.
    //
    if (getNextCh(scanPos, true) != chForwardSlash)
        ThrowXML(MalformedURLException, XML4CExcepts::URL_MalformedURL);
    if (getNextCh(scanPos, true) != chForwardSlash)
        ThrowXML(MalformedURLException, XML4CExcepts::URL_MalformedURL);

    // They are part of the prefix, so add them and cap off the string
    prefixBuf[prefixLen++] = chForwardSlash;
    prefixBuf[prefixLen++] = chForwardSlash;
    prefixBuf[prefixLen] = chNull;

    // The prefix is well formed, so move the caller past it
    curPos = scanPos;

    // And see if it matches any of the prefixes in our list of protocols
    for (unsigned int entryInd = 0; entryInd < Protocols_Count; entryInd++)
    {
        const bool matches = !XMLString::compareString
        (
            prefixBuf
            , gTypeList[entryInd].prefix
        );
        if (matches)
            return gTypeList[entryInd].protocol;
    }

    // Cannot be a supported URL protocol
    ThrowXML(MalformedURLException, XML4CExcepts::URL_UnsupportedProto);
}
//
//  This method is used during the parse. It returns the next character of
//  the source URL (in fFullURL, which is our copy of the original text)
//  and moves the passed position past whatever it consumed.
//
//  Its main job is to deal with escaped characters. A '%' followed by two
//  hex digits is consumed as a unit and returned as the single character
//  the digits encode.
//
//  At the end of the text it returns a null if endOk is true, and throws a
//  MalformedURLException otherwise. An incomplete or non-hex escape also
//  throws. Note that an escape that decodes to zero ("%00") looks the same
//  to the caller as the end of the text.
//
XMLCh URL::getNextCh(unsigned int& pos, const bool endOk) const
{
    const XMLCh* const cursor = &fFullURL[pos];

    // At the end of the URL, either hand back a null or complain
    if (!cursor[0])
    {
        if (!endOk)
            ThrowXML(MalformedURLException, XML4CExcepts::URL_MalformedURL);
        return chNull;
    }

    // The common case is an ordinary character, which is returned as is
    if (cursor[0] != chPercent)
    {
        pos++;
        return cursor[0];
    }

    //
    //  Otherwise it's an escape, so two more characters must follow. The
    //  first is tested before the second is looked at, so we never read
    //  beyond the terminating null.
    //
    if (!cursor[1] || !cursor[2])
        ThrowXML(MalformedURLException, XML4CExcepts::URL_MalformedURL);

    // And they both have to be hex digits
    const XMLCh hiDigit = cursor[1];
    const XMLCh loDigit = cursor[2];
    if (!isHexDigit(hiDigit) || !isHexDigit(loDigit))
        ThrowXML(MalformedURLException, XML4CExcepts::URL_MalformedURL);

    // Skip over the percent sign and both digits
    pos += 3;

    // And combine the two digits into the character to return
    return XMLCh((xlatHexDigit(hiDigit) << 4) + xlatHexDigit(loDigit));
}
//
// This method is called to parse the text into its components and validate
// that the URL is legal. It uses getNextCh() to pull characters out of the
// URL.
//
void URL::parse()
{
// This is the current position that we track during the parse
unsigned int curPos = 0;
XMLCh nextCh;
const unsigned int bufSize = 2047;
XMLCh tmpBuf[bufSize + 1];
unsigned int bufIndex;
//
// Search the text for a prefix. We will get back the type of the prefix
// and the current position will be updated.
//
fProtocol = findType(curPos);
// In order to distinguish between a malformed URL and an unsupported
// URL, we watch here for the types that we support. If its not supported
// we throw a runtime exception.
//
if (!gTypeList[fProtocol].supported)
ThrowXML(MalformedURLException, XML4CExcepts::URL_UnsupportedProto);
//
// Check the next char. It must be either a forward slash or it will
// be the first char of the host name. We don't allow end of input
// here so it will cause an exception if we hit it.
//
nextCh = getNextCh(curPos);
if (nextCh == chForwardSlash)
{
// There is no host so make an empty one
fHost = new XMLCh[1];
fHost[0] = chNull;
}
else
{
// Put in the lookahead char we did above before we enter the loop
bufIndex = 0;
tmpBuf[bufIndex++] = nextCh;
// And now read up to the slash separator
while (true)
{
// Get the next char, end of input is not valid here
const XMLCh nextCh = getNextCh(curPos);
// Break out on the forward slash
if (nextCh == chForwardSlash)
break;
// Otherwise, save it
tmpBuf[bufIndex++] = nextCh;
// If we max out on the temp buffer, definitely bad
if (bufIndex >= bufSize)
ThrowXML(MalformedURLException, XML4CExcepts::URL_MalformedURL);
}
// Cap the temp buffer and replicate to our host member
tmpBuf[bufIndex] = chNull;
fHost = XMLString::replicate(tmpBuf);
}
//
// Now we need to get the path part of the URL. This should be the
// rest of the content. So we just go until we get a null char back
// from the character spooler. This gets rid of all escaped chars
// in the URL.
//
bufIndex = 0;
while (true)
{
// Tell it that end of input is ok
const XMLCh nextCh = getNextCh(curPos, true);
tmpBuf[bufIndex++] = nextCh;
// Break out at the end
if (!nextCh)
break;
}
//
// And get our own copy of the temp buffer as the path. If the path
// does not start with 'x:', then assume that its a Unix style path
// and put in a leading '/'.
//
tmpBuf[bufIndex] = chNull;
if ((((tmpBuf[0] >= chLatin_A) && (tmpBuf[0] <= chLatin_Z))
|| ((tmpBuf[0] >= chLatin_a) && (tmpBuf[0] <= chLatin_z)))
&& (tmpBuf[1] == chColon))
{
fPath = XMLString::replicate(tmpBuf);
}
else
{
fPath = new XMLCh[XMLString::stringLen(tmpBuf) + 2];
fPath[0] = chForwardSlash;
XMLString::copyString(&fPath[1], tmpBuf);
}
//
// <TBD> When we have more complete support, get rid of this. But for
// now we only support file:// (which is checked above) and for files
// we only support an empty host or "localhost" which means the same
// thing.
//
if (fHost[0])
{
if (XMLString::compareString(fHost, gLocalHostString))
ThrowXML(MalformedURLException, XML4CExcepts::URL_OnlyLocalHost);
}
}