blob: 1facb53ba0bae2a83cae6d17e00f1f9e4c37c7eb [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Id$
*/
// ---------------------------------------------------------------------------
// Includes
// ---------------------------------------------------------------------------
#include <xercesc/internal/XMLReader.hpp>
#include <xercesc/util/BitOps.hpp>
#include <xercesc/util/BinInputStream.hpp>
#include <xercesc/util/PlatformUtils.hpp>
#include <xercesc/util/RuntimeException.hpp>
#include <xercesc/util/TransService.hpp>
#include <xercesc/util/XMLEBCDICTranscoder.hpp>
#include <xercesc/util/XMLString.hpp>
#include <xercesc/util/Janitor.hpp>
namespace XERCES_CPP_NAMESPACE {
// ---------------------------------------------------------------------------
// XMLReader: Query Methods
// ---------------------------------------------------------------------------
// Checks whether all of the chars in the passed buffer are whitespace or
// not. Breaks out on the first non-whitespace.
//
bool XMLReader::isAllSpaces(const XMLCh* const toCheck
, const XMLSize_t count) const
{
const XMLCh* curCh = toCheck;
const XMLCh* endPtr = toCheck + count;
while (curCh < endPtr)
{
if (!(fgCharCharsTable[*curCh++] & gWhitespaceCharMask))
return false;
}
return true;
}
//
// Checks whether at least one of the chars in the passed buffer are whitespace or
// not.
//
bool XMLReader::containsWhiteSpace(const XMLCh* const toCheck
, const XMLSize_t count) const
{
const XMLCh* curCh = toCheck;
const XMLCh* endPtr = toCheck + count;
while (curCh < endPtr)
{
if (fgCharCharsTable[*curCh++] & gWhitespaceCharMask)
return true;
}
return false;
}
//
// Tests whether 'toCheck' is a public id character. This is not called very
// often, so rather than using a local table it simply forwards to the
// XMLChar utility class selected by this reader's XML version: XMLChar1_1
// when the version is XMLV1_1, XMLChar1_0 otherwise.
//
bool XMLReader::isPublicIdChar(const XMLCh toCheck) const
{
    return (fXMLVersion == XMLV1_1)
         ? XMLChar1_1::isPublicIdChar(toCheck)
         : XMLChar1_0::isPublicIdChar(toCheck);
}
// ---------------------------------------------------------------------------
// XMLReader: Constructors and Destructor
// ---------------------------------------------------------------------------
//
// Constructs a reader whose encoding is NOT forced by the caller
// (fForcedEncoding is false). The encoding family is guessed from the first
// raw bytes of the stream, and no transcoder is created here (fTranscoder
// stays 0).
//
// pubId, sysId    - replicated with 'manager' into fPublicId / fSystemId
// streamToAdopt   - stored in fStream (the destructor deletes fStream)
// from            - stored in fRefFrom
// type            - stored in fType
// source          - stored in fSource
// throwAtEnd      - stored in fThrowAtEnd
// calculateSrcOfs - stored in fCalculateSrcOfs
// lowWaterMark    - stored in fLowWaterMark
// version         - handed to setXMLVersion()
// manager         - stored in fMemoryManager and used for the allocations
//
// In XERCES_DEBUG builds a RuntimeException (Reader_BadAutoEncoding) is
// thrown if the probed encoding is outside the known enum range.
//
XMLReader::XMLReader(const XMLCh* const pubId
, const XMLCh* const sysId
, BinInputStream* const streamToAdopt
, const RefFrom from
, const Types type
, const Sources source
, const bool throwAtEnd
, const bool calculateSrcOfs
, XMLSize_t lowWaterMark
, const XMLVersion version
, MemoryManager* const manager) :
fCharIndex(0)
, fCharsAvail(0)
, fCurCol(1)
, fCurLine(1)
, fEncodingStr(0)
, fForcedEncoding(false)
, fNoMore(false)
, fPublicId(XMLString::replicate(pubId, manager))
, fRawBufIndex(0)
, fRawBytesAvail(0)
, fLowWaterMark (lowWaterMark)
, fReaderNum(0xFFFFFFFF)
, fRefFrom(from)
, fSentTrailingSpace(false)
, fSource(source)
, fSrcOfsBase(0)
, fSrcOfsSupported(false)
, fCalculateSrcOfs(calculateSrcOfs)
, fSystemId(XMLString::replicate(sysId, manager))
, fStream(streamToAdopt)
, fSwapped(false)
, fThrowAtEnd(throwAtEnd)
, fTranscoder(0)
, fType(type)
, fMemoryManager(manager)
{
// NOTE: fEncoding is not in the initializer list of this constructor; it
// is first assigned below from the result of the encoding probe.
setXMLVersion(version);
// Do an initial load of raw bytes
refreshRawBuffer();
// Ask the transcoding service if it supports src offset info
fSrcOfsSupported = XMLPlatformUtils::fgTransService->supportsSrcOfs();
//
// Use the recognizer class to get a basic sense of what family of
// encodings this file is in. We'll start off with a reader of that
// type, and update it later if needed when we read the XMLDecl line.
//
fEncoding = XMLRecognizer::basicEncodingProbe(fRawByteBuf, fRawBytesAvail);
#if defined(XERCES_DEBUG)
// Debug-only sanity check that the probe returned a value within the enum
if ((fEncoding < XMLRecognizer::Encodings_Min)
|| (fEncoding > XMLRecognizer::Encodings_Max))
{
ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Reader_BadAutoEncoding, fMemoryManager);
}
#endif
// Keep our own copy of the name that corresponds to the probed encoding
fEncodingStr = XMLString::replicate(XMLRecognizer::nameForEncoding(fEncoding, fMemoryManager), fMemoryManager);
// Check whether the fSwapped flag should be set or not
checkForSwapped();
//
// This will check to see if the first line is an XMLDecl and, if
// so, decode that first line manually one character at a time. This
// leaves enough characters in the buffer that the high level code
// can get through the Decl and call us back with the real encoding.
//
doInitDecode();
//
// NOTE: We won't create a transcoder until we either get a call to
// setEncoding() or we get a call to refreshCharBuffer() and no
// transcoder has been set yet.
//
}
//
// Constructs a reader whose encoding is forced by name (fForcedEncoding is
// true). The name is upper-cased and mapped to one of the recognizer's
// encoding enums; a leading byte order mark matching that encoding is
// stepped over, and a transcoder is created right away.
//
// pubId, sysId    - replicated with 'manager' into fPublicId / fSystemId
// streamToAdopt   - stored in fStream (the destructor deletes fStream)
// encodingStr     - encoding name; replicated into fEncodingStr and
//                   upper-cased with XMLString::upperCaseASCII()
// from            - stored in fRefFrom
// type            - stored in fType
// source          - stored in fSource
// throwAtEnd      - stored in fThrowAtEnd
// calculateSrcOfs - stored in fCalculateSrcOfs
// lowWaterMark    - stored in fLowWaterMark
// version         - handed to setXMLVersion()
// manager         - stored in fMemoryManager and used for the allocations
//
// Throws TranscodingException (Trans_CantCreateCvtrFor) if the transcoding
// service returns no transcoder for the requested encoding.
//
XMLReader::XMLReader(const XMLCh* const pubId
, const XMLCh* const sysId
, BinInputStream* const streamToAdopt
, const XMLCh* const encodingStr
, const RefFrom from
, const Types type
, const Sources source
, const bool throwAtEnd
, const bool calculateSrcOfs
, XMLSize_t lowWaterMark
, const XMLVersion version
, MemoryManager* const manager) :
fCharIndex(0)
, fCharsAvail(0)
, fCurCol(1)
, fCurLine(1)
, fEncoding(XMLRecognizer::UTF_8)
, fEncodingStr(0)
, fForcedEncoding(true)
, fNoMore(false)
, fPublicId(XMLString::replicate(pubId, manager))
, fRawBufIndex(0)
, fRawBytesAvail(0)
, fLowWaterMark (lowWaterMark)
, fReaderNum(0xFFFFFFFF)
, fRefFrom(from)
, fSentTrailingSpace(false)
, fSource(source)
, fSrcOfsBase(0)
, fSrcOfsSupported(false)
, fCalculateSrcOfs(calculateSrcOfs)
, fSystemId(XMLString::replicate(sysId, manager))
, fStream(streamToAdopt)
, fSwapped(false)
, fThrowAtEnd(throwAtEnd)
, fTranscoder(0)
, fType(type)
, fMemoryManager(manager)
{
setXMLVersion(version);
// Do an initial load of raw bytes
refreshRawBuffer();
// Copy the encoding string to our member
fEncodingStr = XMLString::replicate(encodingStr, fMemoryManager);
XMLString::upperCaseASCII(fEncodingStr);
// Ask the transcoding service if it supports src offset info
fSrcOfsSupported = XMLPlatformUtils::fgTransService->supportsSrcOfs();
//
// Map the passed encoding name to one of our enums. If it does not
// match one of the intrinsic encodings, it will come back 'other',
// which tells us to create a transcoder based reader.
//
fEncoding = XMLRecognizer::encodingForName(fEncodingStr);
// test the presence of the BOM and remove it from the source
switch(fEncoding)
{
case XMLRecognizer::UCS_4B :
case XMLRecognizer::UCS_4L :
{
// Accept either byte order of the 4 byte mark (00 00 FE FF or FF FE 00 00)
// NOTE(review): the strict '>' means a buffer that holds exactly the four
// BOM bytes and nothing else is left untouched - confirm this is intended.
if (fRawBytesAvail > 4 &&
(((fRawByteBuf[0] == 0x00) && (fRawByteBuf[1] == 0x00) && (fRawByteBuf[2] == 0xFE) && (fRawByteBuf[3] == 0xFF)) ||
((fRawByteBuf[0] == 0xFF) && (fRawByteBuf[1] == 0xFE) && (fRawByteBuf[2] == 0x00) && (fRawByteBuf[3] == 0x00))) )
{
fRawBufIndex += 4;
}
break;
}
case XMLRecognizer::UTF_8 :
{
// Look at the raw buffer as short chars
const char* asChars = (const char*)fRawByteBuf;
// NOTE(review): same strict '>' as the UCS-4 case above; a buffer holding
// only the UTF-8 BOM is not stepped over - confirm this is intended.
if (fRawBytesAvail > XMLRecognizer::fgUTF8BOMLen &&
XMLString::compareNString( asChars
, XMLRecognizer::fgUTF8BOM
, XMLRecognizer::fgUTF8BOMLen) == 0)
{
fRawBufIndex += XMLRecognizer::fgUTF8BOMLen;
}
break;
}
case XMLRecognizer::UTF_16B :
case XMLRecognizer::UTF_16L :
{
// Need at least one full UTF-16 code unit to look for a mark
if (fRawBytesAvail < 2)
break;
// View the current raw position as a UTF-16 unit and accept the marker in
// either byte order
const UTF16Ch* asUTF16 = reinterpret_cast<const UTF16Ch*>(&fRawByteBuf[fRawBufIndex]);
if ((*asUTF16 == chUnicodeMarker) || (*asUTF16 == chSwappedUnicodeMarker))
{
fRawBufIndex += sizeof(UTF16Ch);
}
break;
}
case XMLRecognizer::EBCDIC:
case XMLRecognizer::US_ASCII:
case XMLRecognizer::XERCES_XMLCH:
case XMLRecognizer::OtherEncoding:
case XMLRecognizer::Encodings_Count:
{
// silence warning about enumeration not being used
break;
}
}
// Check whether the fSwapped flag should be set or not
checkForSwapped();
//
// Create a transcoder for the encoding. Since the encoding has been
// forced, this will be the one we will use, period.
//
XMLTransService::Codes failReason;
if (fEncoding == XMLRecognizer::OtherEncoding)
{
//
// fEncodingStr not pre-recognized, use it
// directly for transcoder
//
fTranscoder = XMLPlatformUtils::fgTransService->makeNewTranscoderFor
(
fEncodingStr
, failReason
, kCharBufSize
, fMemoryManager
);
}
else
{
//
// Use the recognized fEncoding to create the transcoder
//
fTranscoder = XMLPlatformUtils::fgTransService->makeNewTranscoderFor
(
fEncoding
, failReason
, kCharBufSize
, fMemoryManager
);
}
if (!fTranscoder)
{
// We are about to throw which means the d-tor won't be called.
// Clean up some memory.
//
fMemoryManager->deallocate(fPublicId);
fMemoryManager->deallocate(fSystemId);
// fEncodingStr is still needed as the exception's text parameter, so it is
// put in a janitor instead of being deallocated before the throw.
ArrayJanitor<XMLCh> jan (fEncodingStr, fMemoryManager);
ThrowXMLwithMemMgr1
(
TranscodingException
, XMLExcepts::Trans_CantCreateCvtrFor
, fEncodingStr
, fMemoryManager
);
}
//
// Note that, unlike above, we do not do an initial decode of the
// first line. We take the caller's word that the encoding is correct
// and just assume that the first bulk decode (kicked off by the first
// get of a character) will work.
//
// So we do here the slipping in of the leading space if required.
//
if ((fType == Type_PE) && (fRefFrom == RefFrom_NonLiteral))
{
// This represents no data from the source
fCharSizeBuf[fCharsAvail] = 0;
fCharOfsBuf[fCharsAvail] = 0;
fCharBuf[fCharsAvail++] = chSpace;
}
}
//
// Constructs a reader whose encoding is forced through one of the
// recognizer's encoding enums (fForcedEncoding is true). The matching
// encoding name is looked up and stored, and a transcoder is created right
// away. Unlike the constructor that takes an encoding name, this one does
// not look for a byte order mark.
//
// pubId, sysId    - replicated with 'manager' into fPublicId / fSystemId
// streamToAdopt   - stored in fStream (the destructor deletes fStream)
// encodingEnum    - stored in fEncoding
// from            - stored in fRefFrom
// type            - stored in fType
// source          - stored in fSource
// throwAtEnd      - stored in fThrowAtEnd
// calculateSrcOfs - stored in fCalculateSrcOfs
// lowWaterMark    - stored in fLowWaterMark
// version         - handed to setXMLVersion()
// manager         - stored in fMemoryManager and used for the allocations
//
// Throws TranscodingException (Trans_CantCreateCvtrFor) if the transcoding
// service returns no transcoder for the requested encoding.
//
XMLReader::XMLReader(const XMLCh* const pubId
, const XMLCh* const sysId
, BinInputStream* const streamToAdopt
, XMLRecognizer::Encodings encodingEnum
, const RefFrom from
, const Types type
, const Sources source
, const bool throwAtEnd
, const bool calculateSrcOfs
, XMLSize_t lowWaterMark
, const XMLVersion version
, MemoryManager* const manager) :
fCharIndex(0)
, fCharsAvail(0)
, fCurCol(1)
, fCurLine(1)
, fEncoding(XMLRecognizer::UTF_8)
, fEncodingStr(0)
, fForcedEncoding(true)
, fNoMore(false)
, fPublicId(XMLString::replicate(pubId, manager))
, fRawBufIndex(0)
, fRawBytesAvail(0)
, fLowWaterMark (lowWaterMark)
, fReaderNum(0xFFFFFFFF)
, fRefFrom(from)
, fSentTrailingSpace(false)
, fSource(source)
, fSrcOfsBase(0)
, fSrcOfsSupported(false)
, fCalculateSrcOfs(calculateSrcOfs)
, fSystemId(XMLString::replicate(sysId, manager))
, fStream(streamToAdopt)
, fSwapped(false)
, fThrowAtEnd(throwAtEnd)
, fTranscoder(0)
, fType(type)
, fMemoryManager(manager)
{
setXMLVersion(version);
// Do an initial load of raw bytes
refreshRawBuffer();
// Ask the transcoding service if it supports src offset info
fSrcOfsSupported = XMLPlatformUtils::fgTransService->supportsSrcOfs();
//
// Use the passed encoding code
//
fEncoding = encodingEnum;
// Keep our own copy of the name that corresponds to that encoding
fEncodingStr = XMLString::replicate(XMLRecognizer::nameForEncoding(fEncoding, fMemoryManager), fMemoryManager);
// Check whether the fSwapped flag should be set or not
checkForSwapped();
//
// Create a transcoder for the encoding. Since the encoding has been
// forced, this will be the one we will use, period.
//
XMLTransService::Codes failReason;
fTranscoder = XMLPlatformUtils::fgTransService->makeNewTranscoderFor
(
fEncoding
, failReason
, kCharBufSize
, fMemoryManager
);
if (!fTranscoder)
{
// We are about to throw which means the d-tor won't be called.
// Clean up some memory.
//
fMemoryManager->deallocate(fPublicId);
fMemoryManager->deallocate(fSystemId);
// fEncodingStr is still needed as the exception's text parameter, so it is
// put in a janitor instead of being deallocated before the throw.
ArrayJanitor<XMLCh> jan (fEncodingStr, fMemoryManager);
ThrowXMLwithMemMgr1
(
TranscodingException
, XMLExcepts::Trans_CantCreateCvtrFor
, fEncodingStr
, fMemoryManager
);
}
//
// Note that, unlike above, we do not do an initial decode of the
// first line. We take the caller's word that the encoding is correct
// and just assume that the first bulk decode (kicked off by the first
// get of a character) will work.
//
// So we do here the slipping in of the leading space if required.
//
if ((fType == Type_PE) && (fRefFrom == RefFrom_NonLiteral))
{
// This represents no data from the source
fCharSizeBuf[fCharsAvail] = 0;
fCharOfsBuf[fCharsAvail] = 0;
fCharBuf[fCharsAvail++] = chSpace;
}
}
//
//  Releases everything the reader holds: the three strings that were
//  allocated through the memory manager, then the input stream and the
//  transcoder.
//
XMLReader::~XMLReader()
{
    // Strings obtained from the memory manager go back to it
    fMemoryManager->deallocate(fSystemId);
    fMemoryManager->deallocate(fPublicId);
    fMemoryManager->deallocate(fEncodingStr);

    // The stream and the transcoder are plain heap objects
    delete fStream;
    delete fTranscoder;
}
// ---------------------------------------------------------------------------
// XMLReader: Character buffer management methods
// ---------------------------------------------------------------------------
//
//  Returns the source offset that corresponds to the current position in
//  the character buffer: the base offset of the buffer plus whatever has
//  been eaten from it so far.
//
//  Throws RuntimeException (Reader_SrcOfsNotSupported) unless source offsets
//  are both supported (fSrcOfsSupported) and requested (fCalculateSrcOfs).
//
XMLFilePos XMLReader::getSrcOffset() const
{
    const bool offsetsAvailable = fSrcOfsSupported && fCalculateSrcOfs;
    if (!offsetsAvailable)
    {
        ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Reader_SrcOfsNotSupported, fMemoryManager);
    }

    // Nothing eaten from this buffer yet, so the base is the answer
    if (fCharIndex == 0)
        return fSrcOfsBase;

    // Past the last available char: take the offset of the previous char
    // and add that char's size to step beyond it
    if (fCharIndex >= fCharsAvail)
    {
        const XMLSize_t prevIndex = fCharIndex - 1;
        return (fSrcOfsBase + fCharOfsBuf[prevIndex] + fCharSizeBuf[prevIndex]);
    }

    // Somewhere inside the buffer: the per-char offset table has the value
    return (fSrcOfsBase + fCharOfsBuf[fCharIndex]);
}
//
//  Tops up the character buffer. Any chars that have not been eaten yet are
//  moved down to the bottom of the buffer and newly transcoded chars are
//  placed after them; fCharIndex is reset to zero.
//
//  If no transcoder exists yet, one is created here from fEncodingStr.
//
//  Returns true if the buffer holds at least one char on return, false if
//  the reader is exhausted (fNoMore is then set so that later calls return
//  immediately).
//
//  Throws RuntimeException (Reader_EncodingStrRequired) if no transcoder
//  exists and the encoding is EBCDIC, and TranscodingException
//  (Trans_CantCreateCvtrFor) if the transcoder cannot be created.
//
bool XMLReader::refreshCharBuffer()
{
    // If the no more flag is set, then don't bother doing anything.
    if (fNoMore)
        return false;

    XMLSize_t startInd;

    // See if we have any existing chars.
    const XMLSize_t spareChars = fCharsAvail - fCharIndex;

    // If we are full, then don't do anything.
    if (spareChars == kCharBufSize)
        return true;

    //
    //  If no transcoder has been created yet, then we never saw any
    //  encoding="" string and the encoding was not forced, so lets create
    //  one now. We know that it won't change now.
    //
    //  However, note that if we autosensed EBCDIC, then we have to
    //  consider it an error if we never got an encoding since we don't
    //  know what variant of EBCDIC it is.
    //
    if (!fTranscoder)
    {
        if (fEncoding == XMLRecognizer::EBCDIC)
            ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Reader_EncodingStrRequired, fMemoryManager);

        // Ask the transcoding service to make us a transcoder
        XMLTransService::Codes failReason;
        fTranscoder = XMLPlatformUtils::fgTransService->makeNewTranscoderFor
        (
            fEncodingStr
            , failReason
            , kCharBufSize
            , fMemoryManager
        );

        if (!fTranscoder)
        {
            ThrowXMLwithMemMgr1
            (
                TranscodingException
                , XMLExcepts::Trans_CantCreateCvtrFor
                , fEncodingStr
                , fMemoryManager
            );
        }
    }

    //
    //  Add the number of source bytes eaten so far to the base src
    //  offset member.
    //
    if (fCalculateSrcOfs) {
        for (startInd = 0; startInd < fCharIndex; startInd++)
            fSrcOfsBase += fCharSizeBuf[startInd];
    }

    //
    //  If there are spare chars, then move them down to the bottom. We
    //  have to move the char sizes down also.
    //
    startInd = 0;
    if (spareChars)
    {
        for (XMLSize_t index = fCharIndex; index < fCharsAvail; index++)
        {
            fCharBuf[startInd] = fCharBuf[index];
            fCharSizeBuf[startInd] = fCharSizeBuf[index];
            startInd++;
        }
    }

    //
    //  And then get more chars, starting after any spare chars that were
    //  left over from the last time.
    //
    fCharsAvail = xcodeMoreChars
    (
        &fCharBuf[startInd]
        , &fCharSizeBuf[startInd]
        , kCharBufSize - spareChars
    );

    // Add back in the spare chars
    fCharsAvail += spareChars;

    // Reset the buffer index to zero, so we start from the 0th char again
    fCharIndex = 0;

    //
    //  If no chars available, then we have to check for one last thing. If
    //  this is reader for a PE and its not being expanded inside a literal,
    //  then unget a trailing space. We use a boolean to avoid triggering
    //  this more than once.
    //
    if (!fCharsAvail
    &&  (fType == Type_PE)
    &&  (fRefFrom == RefFrom_NonLiteral)
    &&  !fSentTrailingSpace)
    {
        fCharBuf[0] = chSpace;

        //
        //  The injected space represents no data from the source, exactly
        //  like the leading space slipped in by the constructors. Without
        //  this, slot 0 keeps the byte size of whatever char lived there
        //  before and that stale size gets added into the source offset.
        //
        fCharSizeBuf[0] = 0;

        fCharsAvail = 1;
        fSentTrailingSpace = true;
    }

    //
    //  If we get here with no more chars, then set the fNoMore flag which
    //  lets us optimize and know without checking that no more chars are
    //  available.
    //
    if (!fCharsAvail)
        fNoMore = true;

    // Calculate fCharOfsBuf using the elements from fCharSizeBuf
    if (fCalculateSrcOfs)
    {
        unsigned int last = 0;
        fCharOfsBuf[0] = 0;
        for (XMLSize_t index = 1; index < fCharsAvail; ++index) {
            fCharOfsBuf[index] = last+fCharSizeBuf[index-1];
            last = fCharOfsBuf[index];
            // code was:
            // fCharOfsBuf[index] = fCharOfsBuf[index-1]+fCharSizeBuf[index-1];
            // but on Solaris 64 bit with sun studio 11 this didn't work as
            // every value of fCharOfsBuf[] was 1.
        }
    }

    return (fCharsAvail != 0);
}
// ---------------------------------------------------------------------------
// XMLReader: Scanning methods
// ---------------------------------------------------------------------------
//
//  Scans a name (or, when 'token' is true, a name token) from the current
//  position and appends the accepted chars to 'toFill', advancing the
//  column as it goes.
//
//  toFill - buffer that receives the accepted chars; it is only appended to
//  token  - when false the first char must pass isFirstNameChar() (or be a
//           surrogate pair); when true that first char check is skipped
//
//  Returns false if the reader is exhausted or the first char is not
//  acceptable; otherwise returns whether 'toFill' is non-empty.
//
//  A char in the range 0xD800-0xDB7F is accepted only together with a
//  following char in the range 0xDC00-0xDFFF, and the two are eaten as a
//  unit.
//
bool XMLReader::getName(XMLBuffer& toFill, const bool token)
{
    // Ok, first lets see if we have chars in the buffer. If not, then lets
    // reload.
    if (fCharIndex == fCharsAvail)
    {
        if (!refreshCharBuffer())
            return false;
    }

    XMLSize_t charIndex_start = fCharIndex;

    // Lets check the first char for being a first name char. If not, then
    // give up now. We only do this if its a name and not a name token that
    // they want.
    if (!token)
    {
        if ((fCharBuf[fCharIndex] >= 0xD800) && (fCharBuf[fCharIndex] <= 0xDB7F)) {
            // if there isn't one more char in the buffer, read more data
            if (fCharIndex+1 == fCharsAvail)
            {
                if (!refreshCharBuffer())
                    return false;

                // reset the start buffer to the new location of the cursor
                charIndex_start = fCharIndex;

                //
                //  The refresh reports success as long as the spare leading
                //  surrogate is still in the buffer, even when nothing new
                //  arrived. In that case there is no second char to look
                //  at, and reading fCharBuf[fCharIndex+1] would pick up
                //  stale data from the previous fill.
                //
                if (fCharIndex+1 >= fCharsAvail)
                    return false;
            }
            if ((fCharBuf[fCharIndex+1] < 0xDC00) || (fCharBuf[fCharIndex+1] > 0xDFFF))
                return false;

            // Looks ok, so lets eat it
            fCharIndex += 2;
        }
        else {
            if (!isFirstNameChar(fCharBuf[fCharIndex]))
                return false;

            // Looks ok, so lets eat it
            fCharIndex++;
        }
    }

    // And now we loop until we run out of data in this reader or we hit
    // a non-name char.
    while (true)
    {
        while (fCharIndex < fCharsAvail)
        {
            // Check the current char and take it if its a name char. Else
            // break out.
            if ( (fCharBuf[fCharIndex] >= 0xD800) && (fCharBuf[fCharIndex] <= 0xDB7F) )
            {
                // if there isn't one more char in the buffer, read more data
                if (fCharIndex+1 == fCharsAvail)
                {
                    // but first copy the accepted character(s), and update column
                    if (fCharIndex != charIndex_start)
                    {
                        fCurCol += (XMLFileLoc)(fCharIndex - charIndex_start);
                        toFill.append(&fCharBuf[charIndex_start], fCharIndex - charIndex_start);
                    }

                    const bool gotMore = refreshCharBuffer();

                    //
                    //  Everything before the cursor has been copied out
                    //  above, so restart the pending range at the cursor
                    //  whether or not the refresh moved it. This keeps the
                    //  copy after the inner loop from appending the same
                    //  chars a second time.
                    //
                    charIndex_start = fCharIndex;

                    //
                    //  Stop if nothing came in behind the leading surrogate;
                    //  there is no trailing surrogate to examine and the
                    //  slot after it only holds stale data.
                    //
                    if (!gotMore || (fCharIndex+1 >= fCharsAvail))
                        break;
                }
                if ( (fCharBuf[fCharIndex+1] < 0xDC00) ||
                     (fCharBuf[fCharIndex+1] > 0xDFFF)  )
                    break;
                fCharIndex += 2;
            }
            else
            {
                if (!isNameChar(fCharBuf[fCharIndex]))
                    break;
                fCharIndex++;
            }
        }

        // we have to copy the accepted character(s), and update column
        if (fCharIndex != charIndex_start)
        {
            fCurCol += (XMLFileLoc)(fCharIndex - charIndex_start);
            toFill.append(&fCharBuf[charIndex_start], fCharIndex - charIndex_start);
        }

        // If something is still left in the buffer then we stopped on a
        // char that is not part of the name; if the buffer is empty and we
        // cannot get any more, we are done as well.
        if ((fCharIndex < fCharsAvail) ||
             !refreshCharBuffer())
            break;

        charIndex_start = fCharIndex;
    }

    return !toFill.isEmpty();
}
//
//  Scans an NCName from the current position and appends the accepted chars
//  to 'toFill', advancing the column as it goes.
//
//  toFill - buffer that receives the accepted chars; it is only appended to
//
//  Returns false if the reader is exhausted or the first char does not pass
//  isFirstNCNameChar() (and is not a surrogate pair); returns true once at
//  least that first char has been accepted.
//
//  A char in the range 0xD800-0xDB7F is accepted only together with a
//  following char in the range 0xDC00-0xDFFF, and the two are eaten as a
//  unit.
//
bool XMLReader::getNCName(XMLBuffer& toFill)
{
    if (fCharIndex == fCharsAvail && !refreshCharBuffer())
        return false;

    XMLSize_t charIndex_start = fCharIndex, count;

    // Lets check the first char for being a first name char. If not, then
    // give up now.
    if ((fCharBuf[fCharIndex] >= 0xD800) && (fCharBuf[fCharIndex] <= 0xDB7F)) {
        // if there isn't one more char in the buffer, read more data
        if (fCharIndex+1 == fCharsAvail)
        {
            if (!refreshCharBuffer())
                return false;

            // reset the start buffer to the new location of the cursor
            charIndex_start = fCharIndex;

            //
            //  The refresh reports success as long as the spare leading
            //  surrogate is still in the buffer, even when nothing new
            //  arrived. In that case there is no second char to look at,
            //  and reading fCharBuf[fCharIndex+1] would pick up stale data
            //  from the previous fill.
            //
            if (fCharIndex+1 >= fCharsAvail)
                return false;
        }
        if ((fCharBuf[fCharIndex+1] < 0xDC00) || (fCharBuf[fCharIndex+1] > 0xDFFF))
            return false;

        // Looks ok, so lets eat it
        fCharIndex += 2;
    }
    else {
        if (!isFirstNCNameChar(fCharBuf[fCharIndex])) {
            return false;
        }

        // Looks ok, so lets eat it
        fCharIndex++;
    }

    do
    {
        if (fCharIndex == fCharsAvail)
        {
            // we have to copy the accepted character(s), and update the column number,
            // before getting new data and losing the value of fCharIndex
            if((count = fCharIndex - charIndex_start)!=0)
            {
                fCurCol += (XMLFileLoc)count;
                toFill.append(&fCharBuf[charIndex_start], count);
            }
            if(!refreshCharBuffer())
                return true;
            charIndex_start = fCharIndex;
        }

        // Check the current char and take it if it's a name char
        while(fCharIndex < fCharsAvail)
        {
            if((fCharBuf[fCharIndex] >= 0xD800) && (fCharBuf[fCharIndex] <= 0xDB7F))
            {
                // if there isn't one more char in the buffer, read more data
                if (fCharIndex+1 == fCharsAvail)
                {
                    // but first copy the accepted character(s), and update column
                    if (fCharIndex != charIndex_start)
                    {
                        fCurCol += (XMLFileLoc)(fCharIndex - charIndex_start);
                        toFill.append(&fCharBuf[charIndex_start], fCharIndex - charIndex_start);
                    }

                    const bool gotMore = refreshCharBuffer();

                    //
                    //  Everything before the cursor has been copied out
                    //  above, so restart the pending range at the cursor
                    //  whether or not the refresh moved it. This keeps the
                    //  final copy from appending the same chars twice.
                    //
                    charIndex_start = fCharIndex;

                    //
                    //  Stop if nothing came in behind the leading surrogate;
                    //  there is no trailing surrogate to examine and the
                    //  slot after it only holds stale data.
                    //
                    if (!gotMore || (fCharIndex+1 >= fCharsAvail))
                        break;
                }
                if ( (fCharBuf[fCharIndex+1] < 0xDC00) ||
                     (fCharBuf[fCharIndex+1] > 0xDFFF)  )
                    break;
                fCharIndex += 2;
            }
            else if(isNCNameChar(fCharBuf[fCharIndex])) fCharIndex++;
            else break;
        }
    // if we didn't consume the entire buffer, we are done
    } while(fCharIndex == fCharsAvail);

    // we have to copy the accepted character(s), and update column
    if((count = fCharIndex - charIndex_start)!=0)
    {
        fCurCol += (XMLFileLoc)count;
        toFill.append(&fCharBuf[charIndex_start], count);
    }
    return true;
}
//
//  Scans a QName, i.e. an NCName optionally followed by a colon and a
//  second NCName, appending what it accepts to 'toFill'.
//
//  toFill        - buffer that receives the scanned chars
//  colonPosition - set to the length 'toFill' had when the colon was met,
//                  or to -1 if no colon was consumed
//
//  Returns false if the first NCName cannot be scanned. If there is no
//  colon after it, returns true. Otherwise the colon is eaten and the
//  result of scanning the second NCName is returned.
//
bool XMLReader::getQName(XMLBuffer& toFill, int* colonPosition)
{
    const bool gotFirstPart = getNCName(toFill);

    // Only look for the colon if there was a first part, reloading the
    // buffer first when it has been used up
    bool colonFollows = false;
    if (gotFirstPart)
    {
        if ((fCharIndex != fCharsAvail) || refreshCharBuffer())
            colonFollows = (fCharBuf[fCharIndex] == chColon);
    }

    if (!colonFollows)
    {
        *colonPosition = -1;
        return gotFirstPart;
    }

    // Remember where the colon lands in the target, then eat it
    *colonPosition = (int)toFill.getLen();
    toFill.append(chColon);
    fCharIndex++;
    fCurCol++;

    // Whatever the local part scan says is the final answer
    return getNCName(toFill);
}
//
//  Collects consecutive whitespace chars into 'toFill', reloading the
//  buffer as needed and keeping the line/column info up to date.
//
//  Returns true if scanning stopped because a non-whitespace char was
//  found (that char is left in the buffer), false if the reader ran out of
//  chars first.
//
bool XMLReader::getSpaces(XMLBuffer& toFill)
{
    do
    {
        // Work through whatever is currently in the buffer
        while (fCharIndex < fCharsAvail)
        {
            XMLCh spaceCh = fCharBuf[fCharIndex];

            // A non-space ends the run; leave it for the caller
            if (!isWhitespace(spaceCh))
                return true;

            // It is whitespace, so eat it
            fCharIndex++;

            //
            //  Whitespace is one of x20, x9, xA (chLF) or xD (chCR), and
            //  only the last two can start an end-of-line sequence.
            //
            //      100000  x20
            //      001001  x9
            //      001010  chLF
            //      001101  chCR
            //      -----------
            //      000110  == (chCR|chLF) & ~(0x9|0x20)
            //
            //  A non-zero result of the mask below therefore means chLF or
            //  chCR, a zero result means x20 or x9.
            //
            if ( ( spaceCh & (chCR|chLF) & ~(0x9|0x20) ) != 0 )
                handleEOL(spaceCh, false);
            else
                fCurCol++;

            // Store the char as it stands after the end-of-line handling
            toFill.append(spaceCh);
        }

        // The buffer is used up; keep going only if a reload brings more
    } while (refreshCharBuffer());

    // Ran out of chars without meeting a non-space
    return false;
}
//
//  Collects chars into 'toFill' until either a whitespace char or the char
//  'toCheck' is met, reloading the buffer as needed and keeping the
//  line/column info up to date.
//
//  Returns true if scanning stopped on a whitespace char or on 'toCheck'
//  (that char is left in the buffer), false if the reader ran out of chars
//  first.
//
bool XMLReader::getUpToCharOrWS(XMLBuffer& toFill, const XMLCh toCheck)
{
    do
    {
        // Work through whatever is currently in the buffer
        while (fCharIndex < fCharsAvail)
        {
            XMLCh nextCh = fCharBuf[fCharIndex];

            // Whitespace or the target char ends the run
            if (isWhitespace(nextCh) || (nextCh == toCheck))
                return true;

            // Otherwise eat it
            fCharIndex++;

            //
            //  The char is not one of x20, x9, xD or xA, so the only
            //  end-of-line chars it could still be are chNEL (x85) and
            //  chLineSeparator (x2028).
            //
            //      0010000000101000  chLineSeparator
            //      0000000010000101  chNEL
            //      ---------------------
            //      1101111101010010  == ~(chNEL|chLineSeparator)
            //
            //  Any bit set outside of those two patterns rules both of
            //  them out, and the char simply advances the column. With no
            //  such bit set it may be one of them, so handleEOL() decides.
            //
            if ( nextCh & (XMLCh) ~(chNEL|chLineSeparator) )
                fCurCol++;
            else
                handleEOL(nextCh, false);

            // Store the char as it stands after the end-of-line handling
            toFill.append(nextCh);
        }

        // The buffer is used up; keep going only if a reload brings more
    } while (refreshCharBuffer());

    // Ate up the whole reader without meeting whitespace or the target
    return false;
}
//
//  Eats the next char if it is a single or double quote.
//
//  chGotten - receives the next char of the reader, whether or not it was
//             a quote; left untouched if the reader has no more chars
//
//  Returns true if a quote was eaten, false otherwise.
//
bool XMLReader::skipIfQuote(XMLCh& chGotten)
{
    // Reload when the buffer has been used up; give up if nothing comes
    if (fCharIndex == fCharsAvail)
    {
        if (!refreshCharBuffer())
            return false;
    }

    // Report the char we are looking at back to the caller
    chGotten = fCharBuf[fCharIndex];

    const bool isQuote = (chGotten == chDoubleQuote) || (chGotten == chSingleQuote);
    if (!isQuote)
        return false;

    // It is a quote, so step over it
    fCharIndex++;
    fCurCol++;
    return true;
}
bool XMLReader::skipSpaces(bool& skippedSomething, bool inDecl)
{
// DO NOT set the skippedSomething to 'false', but change it to be 'true' only
// We enter a loop where we skip over spaces until we hit the end of
// this reader or a non-space value. The return indicates whether we
// hit the non-space (true) or the end (false).
do
{
// Loop through the current chars in the buffer
while (fCharIndex < fCharsAvail)
{
// See if its a white space char. If so, then process it. Else
// we've hit a non-space and need to return.
if (isWhitespace(fCharBuf[fCharIndex]))
{
// Get the current char out of the buffer and eat it
XMLCh curCh = fCharBuf[fCharIndex++];
skippedSomething = true;
//
// 'curCh' is a whitespace(x20|x9|xD|xA), so we only can have
// end-of-line combinations with a leading chCR(xD) or chLF(xA)
//
// 100000 x20
// 001001 x9
// 001010 chLF
// 001101 chCR
// -----------
// 000110 == (chCR|chLF) & ~(0x9|0x20)
//
// if the result of the logical-& operation is
// true : 'curCh' must be xA or xD
// false : 'curCh' must be x20 or x9
//
if ( ( curCh & (chCR|chLF) & ~(0x9|0x20) ) == 0 )
{
fCurCol++;
} else
{
handleEOL(curCh, inDecl);
}
}
else
return true;
}
// We've eaten up the current buffer, so lets try to reload it. If
// we don't get anything new, then break out. If we do, then we go
// back to the top to keep getting spaces.
} while(refreshCharBuffer());
// We never hit any non-space and ate up the whole reader
return false;
}
//
//  Eats the next char if, and only if, it is 'toSkip'.
//
//  Returns true if the char was eaten, false if the next char is something
//  else or the reader has no more chars.
//
bool XMLReader::skippedChar(const XMLCh toSkip)
{
    // An empty buffer needs a reload first; nothing to match if that fails
    if ((fCharIndex == fCharsAvail) && !refreshCharBuffer())
        return false;

    // Not the char we were asked for: leave it alone
    if (fCharBuf[fCharIndex] != toSkip)
        return false;

    // It is the one, so step over it
    fCharIndex++;
    fCurCol++;
    return true;
}
//
//  Eats the next char if it is a whitespace char, keeping the line/column
//  info up to date.
//
//  Returns true if a whitespace char was eaten, false if the next char is
//  not whitespace or the reader has no more chars.
//
bool XMLReader::skippedSpace()
{
    //
    //  If the buffer is empty, then try to reload it. If we still get
    //  nothing, then return false.
    //
    if (fCharIndex == fCharsAvail)
    {
        if (!refreshCharBuffer())
            return false;
    }

    //
    //  See if the current char is a whitespace. If so, then we need to eat
    //  it and return true.
    //
    //  This is a plain (non-const) local on purpose: it is handed to
    //  handleEOL() by reference, the same way getSpaces() and skipSpaces()
    //  do it. Declaring it const and casting the const away for that call
    //  would be undefined behaviour as soon as handleEOL() writes to it.
    //
    XMLCh curCh = fCharBuf[fCharIndex];
    if (isWhitespace(curCh))
    {
        // Eat the character
        fCharIndex++;

        //
        //  'curCh' is a whitespace(x20|x9|xD|xA), so we only can have
        //  end-of-line combinations with a leading chCR(xD) or chLF(xA)
        //
        //      100000  x20
        //      001001  x9
        //      001010  chLF
        //      001101  chCR
        //      -----------
        //      000110  == (chCR|chLF) & ~(0x9|0x20)
        //
        //  if the result of the logical-& operation is
        //  true  : 'curCh' must be xA or xD
        //  false : 'curCh' must be x20 or x9
        //
        if ( ( curCh & (chCR|chLF) & ~(0x9|0x20) ) == 0 )
        {
            fCurCol++;
        } else
        {
            handleEOL(curCh, false);
        }

        return true;
    }
    return false;
}
bool XMLReader::skippedString(const XMLCh* const toSkip)
{
// This function works on strings that are smaller than kCharBufSize.
// This function guarantees that in case the comparison is unsuccessful
// the fCharIndex will point to the original data.
//
// Get the length of the string to skip.
//
const XMLSize_t srcLen = XMLString::stringLen(toSkip);
XMLSize_t charsLeft = charsLeftInBuffer();
// See if the current reader has enough chars to test against this
// string. If not, then ask it to reload its buffer. If that does not
// get us enough, then it cannot match.
//
// NOTE: This works because strings never have to cross a reader! And
// a string to skip will never have a new line in it, so we will never
// miss adjusting the current line.
//
while (charsLeft < srcLen)
{
if (!refreshCharBuffer())
return false;
XMLSize_t tmp = charsLeftInBuffer();
if (tmp == charsLeft) // if the refreshCharBuf() did not add anything new
return false; // give up and return.
charsLeft = tmp;
}
// Ok, now we now that the current reader has enough chars in its
// buffer and that its index is back at zero. So we can do a quick and
// dirty comparison straight to its buffer with no requirement to unget
// if it fails.
//
if (memcmp(&fCharBuf[fCharIndex], toSkip, srcLen * sizeof(XMLCh)))
return false;
// Add the source length to the current column to get it back right.
//
fCurCol += (XMLFileLoc)srcLen;
// And get the character buffer index back right by just adding the
// source len to it.
//
fCharIndex += srcLen;
return true;
}
bool XMLReader::skippedStringLong(const XMLCh* toSkip)
{
// This function works on strings that are potentially longer than
// kCharBufSize (e.g., end tag). This function does not guarantee
// that in case the comparison is unsuccessful the fCharIndex will
// point to the original data.
//
XMLSize_t srcLen = XMLString::stringLen(toSkip);
XMLSize_t charsLeft = charsLeftInBuffer();
while (srcLen != 0)
{
// Fill up the buffer with as much data as possible.
//
while (charsLeft < srcLen && charsLeft != kCharBufSize)
{
if (!refreshCharBuffer())
return false;
XMLSize_t tmp = charsLeftInBuffer();
if (tmp == charsLeft) // if the refreshCharBuf() did not add anything
return false; // new give up and return.
charsLeft = tmp;
}
XMLSize_t n = charsLeft < srcLen ? charsLeft : srcLen;
if (memcmp(&fCharBuf[fCharIndex], toSkip, n * sizeof(XMLCh)))
return false;
toSkip += n;
srcLen -= n;
fCharIndex += n;
fCurCol += (XMLFileLoc)n;
charsLeft -= n;
}
return true;
}
//
// This is just to peek if the next coming buffer
// matches the string toPeek.
// Similar to skippedString, but just the fCharIndex and fCurCol are not updated
//
bool XMLReader::peekString(const XMLCh* const toPeek)
{
// Get the length of the string to skip
const XMLSize_t srcLen = XMLString::stringLen(toPeek);
//
// See if the current reader has enough chars to test against this
// string. If not, then ask it to reload its buffer. If that does not
// get us enough, then it cannot match.
//
// NOTE: This works because strings never have to cross a reader! And
// a string to skip will never have a new line in it, so we will never
// miss adjusting the current line.
//
XMLSize_t charsLeft = charsLeftInBuffer();
while (charsLeft < srcLen)
{
refreshCharBuffer();
XMLSize_t t = charsLeftInBuffer();
if (t == charsLeft) // if the refreshCharBuf() did not add anything new
return false; // give up and return.
charsLeft = t;
}
//
// Ok, now we now that the current reader has enough chars in its
// buffer and that its index is back at zero. So we can do a quick and
// dirty comparison straight to its buffer with no requirement to unget
// if it fails.
//
if (memcmp(&fCharBuf[fCharIndex], toPeek, srcLen*sizeof(XMLCh)))
return false;
return true;
}
// ---------------------------------------------------------------------------
// XMLReader: Setter methods (most are inlined)
// ---------------------------------------------------------------------------
bool XMLReader::setEncoding(const XMLCh* const newEncoding)
{
//
// If the encoding was forced, then we ignore the new value and just
// return with success. If it was forced, then we are to use that
// encoding without question. Note that, if we are forced, we created
// a transcoder up front so there is no need to do one here in that
// case.
//
if (fForcedEncoding)
return true;
//
// upperCase the newEncoding first for better performance
//
XMLCh* inputEncoding = XMLString::replicate(newEncoding, fMemoryManager);
XMLString::upperCaseASCII(inputEncoding);
XMLRecognizer::Encodings newBaseEncoding;
//
// Check for non-endian specific UTF-16 or UCS-4. If so, and if we
// are already in one of the endian versions of those encodings,
// then just keep it and go on. Otherwise, its not valid.
//
if (XMLString::equals(inputEncoding, XMLUni::fgUTF16EncodingString)
|| XMLString::equals(inputEncoding, XMLUni::fgUTF16EncodingString2)
|| XMLString::equals(inputEncoding, XMLUni::fgUTF16EncodingString3)
|| XMLString::equals(inputEncoding, XMLUni::fgUTF16EncodingString4)
|| XMLString::equals(inputEncoding, XMLUni::fgUTF16EncodingString5)
|| XMLString::equals(inputEncoding, XMLUni::fgUTF16EncodingString6)
|| XMLString::equals(inputEncoding, XMLUni::fgUTF16EncodingString7))
{
fMemoryManager->deallocate(inputEncoding);
if ((fEncoding != XMLRecognizer::UTF_16L)
&& (fEncoding != XMLRecognizer::UTF_16B))
{
return false;
}
// Override with the original endian specific encoding
newBaseEncoding = fEncoding;
if (fEncoding == XMLRecognizer::UTF_16L) {
fMemoryManager->deallocate(fEncodingStr);
fEncodingStr = 0;
fEncodingStr = XMLString::replicate(XMLUni::fgUTF16LEncodingString, fMemoryManager);
}
else {
fMemoryManager->deallocate(fEncodingStr);
fEncodingStr = 0;
fEncodingStr = XMLString::replicate(XMLUni::fgUTF16BEncodingString, fMemoryManager);
}
}
else if (XMLString::equals(inputEncoding, XMLUni::fgUCS4EncodingString)
|| XMLString::equals(inputEncoding, XMLUni::fgUCS4EncodingString2)
|| XMLString::equals(inputEncoding, XMLUni::fgUCS4EncodingString3)
|| XMLString::equals(inputEncoding, XMLUni::fgUCS4EncodingString4)
|| XMLString::equals(inputEncoding, XMLUni::fgUCS4EncodingString5))
{
fMemoryManager->deallocate(inputEncoding);
if ((fEncoding != XMLRecognizer::UCS_4L)
&& (fEncoding != XMLRecognizer::UCS_4B))
{
return false;
}
// Override with the original endian specific encoding
newBaseEncoding = fEncoding;
if (fEncoding == XMLRecognizer::UCS_4L) {
fMemoryManager->deallocate(fEncodingStr);
fEncodingStr = 0;
fEncodingStr = XMLString::replicate(XMLUni::fgUCS4LEncodingString, fMemoryManager);
}
else {
fMemoryManager->deallocate(fEncodingStr);
fEncodingStr = 0;
fEncodingStr = XMLString::replicate(XMLUni::fgUCS4BEncodingString, fMemoryManager);
}
}
else
{
//
// Try to map the string to one of our standard encodings. If its not
// one of them, then it has to be one of the non-intrinsic encodings,
// in which case we have to delete our intrinsic encoder and create a
// new one.
//
newBaseEncoding = XMLRecognizer::encodingForName(inputEncoding);
//
// If it does not come back as one of the auto-sensed encodings, then we
// have to possibly replace it and at least check a few things.
//
if (newBaseEncoding == XMLRecognizer::OtherEncoding)
{
//
// We already know it's none of those non-endian special cases,
// so just replicate the new name and use it directly to create the transcoder
//
fMemoryManager->deallocate(fEncodingStr);
fEncodingStr = inputEncoding;
// Check for a pre-created transcoder to delete.
if (fTranscoder) {
delete fTranscoder;
fTranscoder = 0;
}
XMLTransService::Codes failReason;
fTranscoder = XMLPlatformUtils::fgTransService->makeNewTranscoderFor
(
fEncodingStr
, failReason
, kCharBufSize
, fMemoryManager
);
if (!fTranscoder)
ThrowXMLwithMemMgr1(TranscodingException, XMLExcepts::Trans_CantCreateCvtrFor, fEncodingStr, fMemoryManager);
}
else
{
// Store the new encoding string since it is just an intrinsic
fMemoryManager->deallocate(fEncodingStr);
fEncodingStr = inputEncoding;
}
}
if (!fTranscoder) {
//
// Now we can create a transcoder using the recognized fEncoding. We
// might get back a transcoder for an intrinsically supported encoding,
// or we might get one from the underlying transcoding service.
//
XMLTransService::Codes failReason;
fTranscoder = XMLPlatformUtils::fgTransService->makeNewTranscoderFor
(
newBaseEncoding
, failReason
, kCharBufSize
, fMemoryManager
);
if (!fTranscoder)
ThrowXMLwithMemMgr1(TranscodingException, XMLExcepts::Trans_CantCreateCvtrFor, fEncodingStr, fMemoryManager);
}
// Update the base encoding member with the new base encoding found
fEncoding = newBaseEncoding;
// Looks ok to us
return true;
}
// ---------------------------------------------------------------------------
// XMLReader: Private helper methods
// ---------------------------------------------------------------------------
//
//  Sets the fSwapped flag from the current encoding. Swapping is needed
//  only for the endian-specific UTF-16 and UCS-4 encodings, and only when
//  their byte order is the opposite of what fgXMLChBigEndian reports; for
//  every other encoding the flag ends up false.
//
void XMLReader::checkForSwapped()
{
    const bool srcIsLittleEndian = (fEncoding == XMLRecognizer::UTF_16L)
                                || (fEncoding == XMLRecognizer::UCS_4L);

    const bool srcIsBigEndian = (fEncoding == XMLRecognizer::UTF_16B)
                             || (fEncoding == XMLRecognizer::UCS_4B);

    // Swap exactly when the source byte order is the opposite one
    fSwapped = XMLPlatformUtils::fgXMLChBigEndian ? srcIsLittleEndian
                                                  : srcIsBigEndian;
}
//
//  This is called from the constructor when the encoding is not forced.
//  We assume that the encoding has been auto-sensed at this point and that
//  fSwapped is set correctly.
//
//  It hand-decodes the XML/Text declaration (everything up to and including
//  the first '>') from the raw byte buffer into fCharBuf, recording the
//  number of source bytes of each character in fCharSizeBuf. For UTF-8 and
//  UTF-16 nothing is decoded when the input does not start with the
//  declaration prefix.
//
//  In the case of UCS-4 and EBCDIC, we don't have to check for a decl.
//  The fact that we got here, means that there is one, because that's the
//  only way we can autosense those.
//
//  On every failure path the decoded count and raw index are reset to zero,
//  the public id and encoding name are deallocated, the system id is handed
//  to an ArrayJanitor and a TranscodingException is thrown. NOTE(review):
//  the manual cleanup is presumably needed because this runs inside the
//  constructor - confirm against the constructor.
//
void XMLReader::doInitDecode()
{
    switch(fEncoding)
    {
        case XMLRecognizer::UCS_4B :
        case XMLRecognizer::UCS_4L :
        {
            //
            //  Remove bom if any. The BOM is only looked for when at least
            //  four bytes are available, and the remaining bytes are moved
            //  down without reading past the valid data (only
            //  fRawBytesAvail - 4 bytes follow the BOM).
            //
            if ((fRawBytesAvail >= 4) &&
                (((fRawByteBuf[0] == 0x00) && (fRawByteBuf[1] == 0x00) && (fRawByteBuf[2] == 0xFE) && (fRawByteBuf[3] == 0xFF)) ||
                 ((fRawByteBuf[0] == 0xFF) && (fRawByteBuf[1] == 0xFE) && (fRawByteBuf[2] == 0x00) && (fRawByteBuf[3] == 0x00))))
            {
                for (XMLSize_t i = 0; i + 4 < fRawBytesAvail; i++)
                    fRawByteBuf[i] = fRawByteBuf[i+4];
                fRawBytesAvail -= 4;
            }

            // Look at the raw buffer as UCS4 chars
            const UCS4Ch* asUCS = reinterpret_cast<const UCS4Ch*>(fRawByteBuf);

            while (fRawBufIndex < fRawBytesAvail)
            {
                // Make sure there are at least sizeof(UCS4Ch) bytes to consume.
                if (fRawBufIndex + sizeof(UCS4Ch) > fRawBytesAvail) {
                    fCharsAvail = 0;
                    fRawBufIndex = 0;
                    fMemoryManager->deallocate(fPublicId);
                    fMemoryManager->deallocate(fEncodingStr);
                    ArrayJanitor<XMLCh> janValue(fSystemId, fMemoryManager);
                    ThrowXMLwithMemMgr1
                    (
                        TranscodingException
                        , XMLExcepts::Reader_CouldNotDecodeFirstLine
                        , fSystemId
                        , fMemoryManager
                    );
                }

                // Make sure we don't exhaust the limited prolog buffer size.
                // Leave room for a space added at the end of this function.
                if (fCharsAvail == kCharBufSize - 1) {
                    fCharsAvail = 0;
                    fRawBufIndex = 0;
                    fMemoryManager->deallocate(fPublicId);
                    fMemoryManager->deallocate(fEncodingStr);
                    ArrayJanitor<XMLCh> janValue(fSystemId, fMemoryManager);
                    ThrowXMLwithMemMgr1
                    (
                        TranscodingException
                        , XMLExcepts::Reader_CouldNotDecodeFirstLine
                        , fSystemId
                        , fMemoryManager
                    );
                }

                // Get out the current 4 byte value and inc our raw buf index
                UCS4Ch curVal = *asUCS++;
                fRawBufIndex += sizeof(UCS4Ch);

                // Swap if that is required for this machine
                if (fSwapped)
                    curVal = BitOps::swapBytes(curVal);

                // Make sure its at least semi legal. If not, undo and throw
                if (curVal > 0xFFFF)
                {
                    fCharsAvail = 0;
                    fRawBufIndex = 0;
                    fMemoryManager->deallocate(fPublicId);
                    fMemoryManager->deallocate(fEncodingStr);
                    ArrayJanitor<XMLCh> janValue(fSystemId, fMemoryManager);
                    ThrowXMLwithMemMgr1
                    (
                        TranscodingException
                        , XMLExcepts::Reader_CouldNotDecodeFirstLine
                        , fSystemId
                        , fMemoryManager
                    );
                }

                // Convert the value to an XML char and store it
                fCharSizeBuf[fCharsAvail] = 4;
                fCharBuf[fCharsAvail++] = XMLCh(curVal);

                // Break out on the > character
                if (curVal == chCloseAngle)
                    break;
            }
            break;
        }

        case XMLRecognizer::UTF_8 :
        {
            // If there's a utf-8 BOM  (0xEF 0xBB 0xBF), skip past it.
            //   Don't move to char buf - no one wants to see it.
            //   Note: this causes any encoding= declaration to override
            //         the BOM's attempt to say that the encoding is utf-8.

            // Look at the raw buffer as short chars
            const char* asChars = (const char*)fRawByteBuf;

            if (fRawBytesAvail > XMLRecognizer::fgUTF8BOMLen &&
                XMLString::compareNString(  asChars
                                            , XMLRecognizer::fgUTF8BOM
                                            , XMLRecognizer::fgUTF8BOMLen) == 0)
            {
                fRawBufIndex += XMLRecognizer::fgUTF8BOMLen;
                asChars      += XMLRecognizer::fgUTF8BOMLen;
            }

            //
            //  First check that there are enough bytes left, after any BOM
            //  that was skipped, to even see the decl identifier. If not,
            //  get out now with no action since there is no decl. Counting
            //  only the remaining bytes keeps the prefix comparison below
            //  from looking past the valid data.
            //
            if (fRawBytesAvail - fRawBufIndex < XMLRecognizer::fgASCIIPreLen)
                break;

            // Check for the opening sequence. If not, then no decl
            if (XMLString::compareNString(  asChars
                                            , XMLRecognizer::fgASCIIPre
                                            , XMLRecognizer::fgASCIIPreLen))
            {
                break;
            }

            while (fRawBufIndex < fRawBytesAvail)
            {
                const char curCh = *asChars++;
                fRawBufIndex++;

                // Make sure we don't exhaust the limited prolog buffer size.
                // Leave room for a space added at the end of this function.
                if (fCharsAvail == kCharBufSize - 1) {
                    fCharsAvail = 0;
                    fRawBufIndex = 0;
                    fMemoryManager->deallocate(fPublicId);
                    fMemoryManager->deallocate(fEncodingStr);
                    ArrayJanitor<XMLCh> janValue(fSystemId, fMemoryManager);
                    ThrowXMLwithMemMgr1
                    (
                        TranscodingException
                        , XMLExcepts::Reader_CouldNotDecodeFirstLine
                        , fSystemId
                        , fMemoryManager
                    );
                }

                // Looks ok, so store it
                fCharSizeBuf[fCharsAvail] = 1;
                fCharBuf[fCharsAvail++] = XMLCh(curCh);

                // Break out on a > character
                if (curCh == chCloseAngle)
                    break;

                //
                //  A char greater than 0x7F is not allowed in this case. If
                //  so, undo and throw.
                //
                if (curCh & 0x80)
                {
                    fCharsAvail = 0;
                    fRawBufIndex = 0;
                    fMemoryManager->deallocate(fPublicId);
                    fMemoryManager->deallocate(fEncodingStr);
                    ArrayJanitor<XMLCh> janValue(fSystemId, fMemoryManager);
                    ThrowXMLwithMemMgr1
                    (
                        TranscodingException
                        , XMLExcepts::Reader_CouldNotDecodeFirstLine
                        , fSystemId
                        , fMemoryManager
                    );
                }
            }
            break;
        }

        case XMLRecognizer::UTF_16B :
        case XMLRecognizer::UTF_16L :
        {
            //
            //  If there is a decl here, we just truncate back the characters
            //  as we go. No surrogate creation would be allowed here in legal
            //  XML, so we consider it a transcoding error if we find one.
            //
            if (fRawBytesAvail < 2)
                break;

            // Where to rewind the raw index to when there is no decl: just
            // past the BOM when one is present, otherwise the very start.
            XMLSize_t postBOMIndex = 0;
            const UTF16Ch* asUTF16 = reinterpret_cast<const UTF16Ch*>(&fRawByteBuf[fRawBufIndex]);
            if ((*asUTF16 == chUnicodeMarker) || (*asUTF16 == chSwappedUnicodeMarker))
            {
                fRawBufIndex += sizeof(UTF16Ch);
                asUTF16++;
                postBOMIndex = fRawBufIndex;
            }

            //  First check that there are enough raw bytes for there to even
            //  be a decl identifier. If not, then nothing to do.
            //
            if (fRawBytesAvail - fRawBufIndex < XMLRecognizer::fgUTF16PreLen)
            {
                fRawBufIndex = postBOMIndex;
                break;
            }

            //
            //  See we get a match on the prefix. If not, then reset and
            //  break out.
            //
            if (fEncoding == XMLRecognizer::UTF_16B)
            {
                if (memcmp(asUTF16, XMLRecognizer::fgUTF16BPre, XMLRecognizer::fgUTF16PreLen))
                {
                    fRawBufIndex = postBOMIndex;
                    break;
                }
            }
            else
            {
                if (memcmp(asUTF16, XMLRecognizer::fgUTF16LPre, XMLRecognizer::fgUTF16PreLen))
                {
                    fRawBufIndex = postBOMIndex;
                    break;
                }
            }

            while (fRawBufIndex < fRawBytesAvail)
            {
                // Make sure there are at least sizeof(UTF16Ch) bytes to consume.
                if (fRawBufIndex + sizeof(UTF16Ch) > fRawBytesAvail) {
                    fCharsAvail = 0;
                    fRawBufIndex = 0;
                    fMemoryManager->deallocate(fPublicId);
                    fMemoryManager->deallocate(fEncodingStr);
                    ArrayJanitor<XMLCh> janValue(fSystemId, fMemoryManager);
                    ThrowXMLwithMemMgr1
                    (
                        TranscodingException
                        , XMLExcepts::Reader_CouldNotDecodeFirstLine
                        , fSystemId
                        , fMemoryManager
                    );
                }

                // Make sure we don't exhaust the limited prolog buffer size.
                // Leave room for a space added at the end of this function.
                if (fCharsAvail == kCharBufSize - 1) {
                    fCharsAvail = 0;
                    fRawBufIndex = 0;
                    fMemoryManager->deallocate(fPublicId);
                    fMemoryManager->deallocate(fEncodingStr);
                    ArrayJanitor<XMLCh> janValue(fSystemId, fMemoryManager);
                    ThrowXMLwithMemMgr1
                    (
                        TranscodingException
                        , XMLExcepts::Reader_CouldNotDecodeFirstLine
                        , fSystemId
                        , fMemoryManager
                    );
                }

                // Get out the current 2 byte value
                UTF16Ch curVal = *asUTF16++;
                fRawBufIndex += sizeof(UTF16Ch);

                // Swap if that is required for this machine
                if (fSwapped)
                    curVal = BitOps::swapBytes(curVal);

                //
                //  Store it and bump the target index, implicitly converting
                //  if UTF16Ch and XMLCh are not the same size.
                //
                fCharSizeBuf[fCharsAvail] = 2;
                fCharBuf[fCharsAvail++] = curVal;

                // Break out on a > char
                if (curVal == chCloseAngle)
                    break;
            }
            break;
        }

        case XMLRecognizer::EBCDIC :
        {
            //
            //  We use special support in the intrinsic EBCDIC-US transcoder
            //  to go through one char at a time.
            //
            const XMLByte* srcPtr = fRawByteBuf;
            while (1)
            {
                // Transcode one char from the source
                const XMLCh chCur = XMLEBCDICTranscoder::xlatThisOne(*srcPtr++);
                fRawBufIndex++;

                // Make sure we don't exhaust the limited prolog buffer size.
                // Leave room for a space added at the end of this function.
                if (fCharsAvail == kCharBufSize - 1) {
                    fCharsAvail = 0;
                    fRawBufIndex = 0;
                    fMemoryManager->deallocate(fPublicId);
                    fMemoryManager->deallocate(fEncodingStr);
                    ArrayJanitor<XMLCh> janValue(fSystemId, fMemoryManager);
                    ThrowXMLwithMemMgr1
                    (
                        TranscodingException
                        , XMLExcepts::Reader_CouldNotDecodeFirstLine
                        , fSystemId
                        , fMemoryManager
                    );
                }

                //
                //  And put it into the character buffer. This stuff has to
                //  look like it was normally transcoded.
                //
                fCharSizeBuf[fCharsAvail] = 1;
                fCharBuf[fCharsAvail++] = chCur;

                // If its a > char, then break out
                if (chCur == chCloseAngle)
                    break;

                // Watch for using up all input and get out
                if (fRawBufIndex == fRawBytesAvail)
                    break;
            }
            break;
        }

        default :
            // It should never be anything else here
            fMemoryManager->deallocate(fPublicId);
            fMemoryManager->deallocate(fEncodingStr);
            fMemoryManager->deallocate(fSystemId);
            ThrowXMLwithMemMgr(TranscodingException, XMLExcepts::Reader_BadAutoEncoding, fMemoryManager);
            break;
    }

    //
    //  Ok, by the time we get here, if its a legal XML file we have eaten
    //  the XML/TextDecl. So, if we are a PE and are being referenced from
    //  outside a literal, then we need to throw in an arbitrary space that
    //  is required by XML.
    //
    if ((fType == Type_PE) && (fRefFrom == RefFrom_NonLiteral))
        fCharBuf[fCharsAvail++] = chSpace;

    //
    //  Calculate the fCharOfsBuf entries from fCharSizeBuf: each offset is
    //  the previous offset plus the byte size of the previous character.
    //
    if (fCalculateSrcOfs)
    {
        fCharOfsBuf[0] = 0;
        for (XMLSize_t index = 1; index < fCharsAvail; ++index) {
            fCharOfsBuf[index] = fCharOfsBuf[index-1]+fCharSizeBuf[index-1];
        }
    }
}
//
// This method is called internally when we run out of bytes in the raw
// buffer. We just read as many bytes as we can into the raw buffer again
// and store the number of bytes we got.
//
void XMLReader::refreshRawBuffer()
{
// Make sure we don't underflow on the subtraction.
if (fRawBufIndex > fRawBytesAvail) {
ThrowXMLwithMemMgr1
(
RuntimeException
, XMLExcepts::Str_StartIndexPastEnd
, fSystemId
, fMemoryManager
);
}
//
// If there are any bytes left, move them down to the start. There
// should only ever be (max bytes per char - 1) at the most.
//
const XMLSize_t bytesLeft = fRawBytesAvail - fRawBufIndex;
// Move the existing ones down
for (XMLSize_t index = 0; index < bytesLeft; index++)
fRawByteBuf[index] = fRawByteBuf[fRawBufIndex + index];
//
// And then read into the buffer past the existing bytes. Add back in
// that many to the bytes read, and subtract that many from the bytes
// requested.
//
fRawBytesAvail = fStream->readBytes
(
&fRawByteBuf[bytesLeft], kRawBufSize - bytesLeft
) + bytesLeft;
//
// We need to reset the buffer index back to the start in all cases,
// since any trailing data was copied down to the start.
//
fRawBufIndex = 0;
}
//
//  This method is called internally when we run out of characters in the
//  transcoded character buffer. It transcodes up to maxChars further
//  characters from the raw byte buffer into bufToFill, storing the byte
//  size of each character in charSizes, and refills the raw buffer when it
//  runs low.
//
//  Returns the number of characters transcoded, or zero when no raw bytes
//  are available or when more bytes were needed but a refill added none.
//
XMLSize_t
XMLReader::xcodeMoreChars(          XMLCh* const            bufToFill
                            ,       unsigned char* const    charSizes
                            , const XMLSize_t               maxChars)
{
    XMLSize_t charsProduced = 0;
    XMLSize_t bytesConsumed = 0;
    bool needMore = false;

    do
    {
        //
        //  Top up the raw buffer when it is empty, below the low water
        //  mark, or when the previous pass could not make progress.
        //
        const XMLSize_t pending = fRawBytesAvail - fRawBufIndex;
        const bool runningLow = (pending == 0) || (pending < fLowWaterMark);

        if (needMore || runningLow)
        {
            refreshRawBuffer();

            // Nothing at all to work with
            if (fRawBytesAvail == 0)
                return 0;

            // More was needed, but the refill did not add anything
            if (needMore && (pending == fRawBytesAvail - fRawBufIndex))
                return 0;
        }

        //
        //  Ask the transcoder to internalize another batch of chars. There
        //  may be data in the raw buffer that the transcoder cannot use yet:
        //  a multi-byte encoding can leave a partial character at the end
        //  that is only usable once the next chunk has been read. In that
        //  case no bytes are eaten, so flag that more input is needed and
        //  go around again.
        //
        charsProduced = fTranscoder->transcodeFrom
        (
            &fRawByteBuf[fRawBufIndex]
            , fRawBytesAvail - fRawBufIndex
            , bufToFill
            , maxChars
            , bytesConsumed
            , charSizes
        );

        if (bytesConsumed == 0)
            needMore = true;
        else
            fRawBufIndex += bytesConsumed;
    }
    while (bytesConsumed == 0);

    return charsProduced;
}
/***
 *
 * XML1.1
 *
 * 2.11 End-of-Line Handling
 *
 *    XML parsed entities are often stored in computer files which, for editing
 *    convenience, are organized into lines. These lines are typically separated
 *    by some combination of the characters CARRIAGE RETURN (#xD) and LINE FEED (#xA).
 *
 *    To simplify the tasks of applications, the XML processor MUST behave as if
 *    it normalized all line breaks in external parsed entities (including the document
 *    entity) on input, before parsing, by translating all of the following to a single
 *    #xA character:
 *
 *  1. the two-character sequence #xD #xA
 *  2. the two-character sequence #xD #x85
 *  3. the single character #x85
 *  4. the single character #x2028
 *  5. any #xD character that is not immediately followed by #xA or #x85.
 *
 * This method applies those rules to the character just read (curCh, which
 * may be rewritten to a line feed) and keeps the current line and column
 * counters up to date. inDecl tells whether the character was read inside
 * an XML or text declaration.
 *
 ***/
void XMLReader::handleEOL(XMLCh& curCh, bool inDecl)
{
    //  Rules 1, 2 and 5: a carriage return, possibly followed by LF or NEL
    if (curCh == chCR)
    {
        fCurCol = 1;
        fCurLine++;

        //
        //  If not already internalized, swallow an immediately following
        //  LF (or NEL, when NEL handling is enabled) and report the whole
        //  thing as a single LF.
        //
        if (fSource == Source_External)
        {
            const bool haveNext = (fCharIndex < fCharsAvail) || refreshCharBuffer();
            if (haveNext)
            {
                const XMLCh following = fCharBuf[fCharIndex];
                if ((following == chLF) || ((following == chNEL) && fNEL))
                    fCharIndex++;
            }
            curCh = chLF;
        }
        return;
    }

    //  A plain line feed just starts a new line
    if (curCh == chLF)
    {
        fCurCol = 1;
        fCurLine++;
        return;
    }

    //  Rules 3 and 4: the single characters #x85 and #x2028
    if ((curCh == chNEL) || (curCh == chLineSeparator))
    {
        if (inDecl && fXMLVersion == XMLV1_1)
        {
            /***
             * XML1.1
             *
             * 2.11 End-of-Line Handling
             *  ...
             *   The characters #x85 and #x2028 cannot be reliably recognized and translated
             *   until an entity's encoding declaration (if present) has been read.
             *   Therefore, it is a fatal error to use them within the XML declaration or
             *   text declaration.
             *
             ***/
            ThrowXMLwithMemMgr1
            (
                TranscodingException
                , XMLExcepts::Reader_NelLsepinDecl
                , fSystemId
                , fMemoryManager
            );
        }

        // Only treated as a line end for external sources with NEL enabled
        if (fNEL && fSource == Source_External)
        {
            fCurCol = 1;
            fCurLine++;
            curCh = chLF;
        }
        return;
    }

    //  Anything else simply advances the column
    fCurCol++;
}
}