src/xercesc/internal/XMLReader.cpp - xerces-c - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 /*
  * $Id$
  */

 // ---------------------------------------------------------------------------
 //  Includes
 // ---------------------------------------------------------------------------
 #include <xercesc/internal/XMLReader.hpp>
 #include <xercesc/util/BitOps.hpp>
 #include <xercesc/util/BinInputStream.hpp>
 #include <xercesc/util/PlatformUtils.hpp>
 #include <xercesc/util/RuntimeException.hpp>
 #include <xercesc/util/TransService.hpp>
 #include <xercesc/util/XMLEBCDICTranscoder.hpp>
 #include <xercesc/util/XMLString.hpp>
 #include <xercesc/util/Janitor.hpp>

 namespace XERCES_CPP_NAMESPACE {

 // ---------------------------------------------------------------------------
 //  XMLReader: Query Methods
 // ---------------------------------------------------------------------------
 //  Checks whether all of the chars in the passed buffer are whitespace or
 //  not. Breaks out on the first non-whitespace.
 //
 bool XMLReader::isAllSpaces(const   XMLCh* const    toCheck
                             , const XMLSize_t       count) const
 {
     const XMLCh* curCh = toCheck;
     const XMLCh* endPtr = toCheck + count;
     while (curCh < endPtr)
     {
         if (!(fgCharCharsTable[*curCh++] & gWhitespaceCharMask))
             return false;
     }
     return true;
 }


 //
 //  Checks whether at least one of the chars in the passed buffer are whitespace or
 //  not.
 //
 bool XMLReader::containsWhiteSpace(const   XMLCh* const    toCheck
                             , const XMLSize_t     count) const
 {
     const XMLCh* curCh = toCheck;
     const XMLCh* endPtr = toCheck + count;
     while (curCh < endPtr)
     {
         if (fgCharCharsTable[*curCh++] & gWhitespaceCharMask)
             return true;
     }
     return false;
 }

 //
 //  This one is not called terribly often, so call the XMLChar utility
 //
 bool XMLReader::isPublicIdChar(const XMLCh toCheck) const
 {
     if (fXMLVersion == XMLV1_1)
         return XMLChar1_1::isPublicIdChar(toCheck);
     else
         return XMLChar1_0::isPublicIdChar(toCheck);
 }

 // ---------------------------------------------------------------------------
 //  XMLReader: Constructors and Destructor
 // ---------------------------------------------------------------------------
 XMLReader::XMLReader(const  XMLCh* const          pubId
                     , const XMLCh* const          sysId
                     ,       BinInputStream* const streamToAdopt
                     , const RefFrom               from
                     , const Types                 type
                     , const Sources               source
                     , const bool                  throwAtEnd
                     , const bool                  calculateSrcOfs
                     ,       XMLSize_t             lowWaterMark
                     , const XMLVersion            version
                     ,       MemoryManager* const  manager) :
     fCharIndex(0)
     , fCharsAvail(0)
     , fCurCol(1)
     , fCurLine(1)
     , fEncodingStr(0)
     , fForcedEncoding(false)
     , fNoMore(false)
     , fPublicId(XMLString::replicate(pubId, manager))
     , fRawBufIndex(0)
     , fRawBytesAvail(0)
     , fLowWaterMark (lowWaterMark)
     , fReaderNum(0xFFFFFFFF)
     , fRefFrom(from)
     , fSentTrailingSpace(false)
     , fSource(source)
     , fSrcOfsBase(0)
     , fSrcOfsSupported(false)
     , fCalculateSrcOfs(calculateSrcOfs)
     , fSystemId(XMLString::replicate(sysId, manager))
     , fStream(streamToAdopt)
     , fSwapped(false)
     , fThrowAtEnd(throwAtEnd)
     , fTranscoder(0)
     , fType(type)
     , fMemoryManager(manager)
 {
     setXMLVersion(version);

     // Do an initial load of raw bytes
     refreshRawBuffer();

     // Ask the transcoding service if it supports src offset info
     fSrcOfsSupported = XMLPlatformUtils::fgTransService->supportsSrcOfs();

     //
     //  Use the recognizer class to get a basic sense of what family of
     //  encodings this file is in. We'll start off with a reader of that
     //  type, and update it later if needed when we read the XMLDecl line.
     //
     fEncoding = XMLRecognizer::basicEncodingProbe(fRawByteBuf, fRawBytesAvail);

     #if defined(XERCES_DEBUG)
     if ((fEncoding < XMLRecognizer::Encodings_Min)
     ||  (fEncoding > XMLRecognizer::Encodings_Max))
     {
         ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Reader_BadAutoEncoding, fMemoryManager);
     }
     #endif

     fEncodingStr = XMLString::replicate(XMLRecognizer::nameForEncoding(fEncoding, fMemoryManager), fMemoryManager);

     // Check whether the fSwapped flag should be set or not
     checkForSwapped();

     //
     //  This will check to see if the first line is an XMLDecl and, if
     //  so, decode that first line manually one character at a time. This
     //  leaves enough characters in the buffer that the high level code
     //  can get through the Decl and call us back with the real encoding.
     //
     doInitDecode();

     //
     //  NOTE: We won't create a transcoder until we either get a call to
     //  setEncoding() or we get a call to refreshCharBuffer() and no
     //  transcoder has been set yet.
     //
 }


 XMLReader::XMLReader(const  XMLCh* const          pubId
                     , const XMLCh* const          sysId
                     ,       BinInputStream* const streamToAdopt
                     , const XMLCh* const          encodingStr
                     , const RefFrom               from
                     , const Types                 type
                     , const Sources               source
                     , const bool                  throwAtEnd
                     , const bool                  calculateSrcOfs
                     ,       XMLSize_t             lowWaterMark
                     , const XMLVersion            version
                     ,       MemoryManager* const  manager) :
     fCharIndex(0)
     , fCharsAvail(0)
     , fCurCol(1)
     , fCurLine(1)
     , fEncoding(XMLRecognizer::UTF_8)
     , fEncodingStr(0)
     , fForcedEncoding(true)
     , fNoMore(false)
     , fPublicId(XMLString::replicate(pubId, manager))
     , fRawBufIndex(0)
     , fRawBytesAvail(0)
     , fLowWaterMark (lowWaterMark)
     , fReaderNum(0xFFFFFFFF)
     , fRefFrom(from)
     , fSentTrailingSpace(false)
     , fSource(source)
     , fSrcOfsBase(0)
     , fSrcOfsSupported(false)
     , fCalculateSrcOfs(calculateSrcOfs)
     , fSystemId(XMLString::replicate(sysId, manager))
     , fStream(streamToAdopt)
     , fSwapped(false)
     , fThrowAtEnd(throwAtEnd)
     , fTranscoder(0)
     , fType(type)
     , fMemoryManager(manager)
 {
     setXMLVersion(version);

     // Do an initial load of raw bytes
     refreshRawBuffer();

     // Copy the encoding string to our member
     fEncodingStr = XMLString::replicate(encodingStr, fMemoryManager);
     XMLString::upperCaseASCII(fEncodingStr);

     // Ask the transcoding service if it supports src offset info
     fSrcOfsSupported = XMLPlatformUtils::fgTransService->supportsSrcOfs();

     //
     //  Map the passed encoding name to one of our enums. If it does not
     //  match one of the intrinsic encodings, it will come back 'other',
     //  which tells us to create a transcoder based reader.
     //
     fEncoding = XMLRecognizer::encodingForName(fEncodingStr);

     //  test the presence of the BOM and remove it from the source
     switch(fEncoding)
     {
         case XMLRecognizer::UCS_4B :
         case XMLRecognizer::UCS_4L :
         {
             if (fRawBytesAvail > 4 &&
                 (((fRawByteBuf[0] == 0x00) && (fRawByteBuf[1] == 0x00) && (fRawByteBuf[2] == 0xFE) && (fRawByteBuf[3] == 0xFF)) ||
                  ((fRawByteBuf[0] == 0xFF) && (fRawByteBuf[1] == 0xFE) && (fRawByteBuf[2] == 0x00) && (fRawByteBuf[3] == 0x00)))  )
             {
                 fRawBufIndex += 4;
             }
             break;
         }
         case XMLRecognizer::UTF_8 :
         {
             // Look at the raw buffer as short chars
             const char* asChars = (const char*)fRawByteBuf;

             if (fRawBytesAvail > XMLRecognizer::fgUTF8BOMLen &&
                 XMLString::compareNString(  asChars
                                             , XMLRecognizer::fgUTF8BOM
                                             , XMLRecognizer::fgUTF8BOMLen) == 0)
             {
                 fRawBufIndex += XMLRecognizer::fgUTF8BOMLen;
             }
             break;
         }
         case XMLRecognizer::UTF_16B :
         case XMLRecognizer::UTF_16L :
         {
             if (fRawBytesAvail < 2)
                 break;

             const UTF16Ch* asUTF16 = reinterpret_cast<const UTF16Ch*>(&fRawByteBuf[fRawBufIndex]);
             if ((*asUTF16 == chUnicodeMarker) || (*asUTF16 == chSwappedUnicodeMarker))
             {
                 fRawBufIndex += sizeof(UTF16Ch);
             }
             break;
         }
         case XMLRecognizer::EBCDIC:
         case XMLRecognizer::US_ASCII:
         case XMLRecognizer::XERCES_XMLCH:
         case XMLRecognizer::OtherEncoding:
         case XMLRecognizer::Encodings_Count:
         {
             // silence warning about enumeration not being used
             break;
         }
     }

     // Check whether the fSwapped flag should be set or not
     checkForSwapped();

     //
     //  Create a transcoder for the encoding. Since the encoding has been
     //  forced, this will be the one we will use, period.
     //
     XMLTransService::Codes failReason;
     if (fEncoding == XMLRecognizer::OtherEncoding)
     {
         //
         //  fEncodingStr not  pre-recognized, use it
         //  directly for transcoder
         //
         fTranscoder = XMLPlatformUtils::fgTransService->makeNewTranscoderFor
         (
             fEncodingStr
             , failReason
             , kCharBufSize
             , fMemoryManager
         );
     }
      else
     {
         //
         //  Use the recognized fEncoding to create the transcoder
         //
         fTranscoder = XMLPlatformUtils::fgTransService->makeNewTranscoderFor
         (
             fEncoding
             , failReason
             , kCharBufSize
             , fMemoryManager
         );

     }

     if (!fTranscoder)
     {
         // We are about to throw which means the d-tor won't be called.
         // Clean up some memory.
         //
         fMemoryManager->deallocate(fPublicId);
         fMemoryManager->deallocate(fSystemId);
         ArrayJanitor<XMLCh> jan (fEncodingStr, fMemoryManager);

         ThrowXMLwithMemMgr1
         (
             TranscodingException
             , XMLExcepts::Trans_CantCreateCvtrFor
             , fEncodingStr
             , fMemoryManager
         );
     }

     //
     //  Note that, unlike above, we do not do an initial decode of the
     //  first line. We take the caller's word that the encoding is correct
     //  and just assume that the first bulk decode (kicked off by the first
     //  get of a character) will work.
     //
     //  So we do here the slipping in of the leading space if required.
     //
     if ((fType == Type_PE) && (fRefFrom == RefFrom_NonLiteral))
     {
         // This represents no data from the source
         fCharSizeBuf[fCharsAvail] = 0;
         fCharOfsBuf[fCharsAvail] = 0;
         fCharBuf[fCharsAvail++] = chSpace;
     }
 }


 XMLReader::XMLReader(const  XMLCh* const          pubId
                     , const XMLCh* const          sysId
                     ,       BinInputStream* const streamToAdopt
                     , XMLRecognizer::Encodings    encodingEnum
                     , const RefFrom               from
                     , const Types                 type
                     , const Sources               source
                     , const bool                  throwAtEnd
                     , const bool                  calculateSrcOfs
                     ,       XMLSize_t             lowWaterMark
                     , const XMLVersion            version
                     ,       MemoryManager* const  manager) :
     fCharIndex(0)
     , fCharsAvail(0)
     , fCurCol(1)
     , fCurLine(1)
     , fEncoding(XMLRecognizer::UTF_8)
     , fEncodingStr(0)
     , fForcedEncoding(true)
     , fNoMore(false)
     , fPublicId(XMLString::replicate(pubId, manager))
     , fRawBufIndex(0)
     , fRawBytesAvail(0)
     , fLowWaterMark (lowWaterMark)
     , fReaderNum(0xFFFFFFFF)
     , fRefFrom(from)
     , fSentTrailingSpace(false)
     , fSource(source)
     , fSrcOfsBase(0)
     , fSrcOfsSupported(false)
     , fCalculateSrcOfs(calculateSrcOfs)
     , fSystemId(XMLString::replicate(sysId, manager))
     , fStream(streamToAdopt)
     , fSwapped(false)
     , fThrowAtEnd(throwAtEnd)
     , fTranscoder(0)
     , fType(type)
     , fMemoryManager(manager)
 {
     setXMLVersion(version);

     // Do an initial load of raw bytes
     refreshRawBuffer();

     // Ask the transcoding service if it supports src offset info
     fSrcOfsSupported = XMLPlatformUtils::fgTransService->supportsSrcOfs();

     //
     //  Use the passed encoding code
     //
     fEncoding = encodingEnum;
     fEncodingStr = XMLString::replicate(XMLRecognizer::nameForEncoding(fEncoding, fMemoryManager), fMemoryManager);

     // Check whether the fSwapped flag should be set or not
     checkForSwapped();

     //
     //  Create a transcoder for the encoding. Since the encoding has been
     //  forced, this will be the one we will use, period.
     //
     XMLTransService::Codes failReason;
     fTranscoder = XMLPlatformUtils::fgTransService->makeNewTranscoderFor
     (
         fEncoding
         , failReason
         , kCharBufSize
         , fMemoryManager
     );

     if (!fTranscoder)
     {
         // We are about to throw which means the d-tor won't be called.
         // Clean up some memory.
         //
         fMemoryManager->deallocate(fPublicId);
         fMemoryManager->deallocate(fSystemId);
         ArrayJanitor<XMLCh> jan (fEncodingStr, fMemoryManager);

         ThrowXMLwithMemMgr1
         (
             TranscodingException
             , XMLExcepts::Trans_CantCreateCvtrFor
             , fEncodingStr
             , fMemoryManager
         );
     }

     //
     //  Note that, unlike above, we do not do an initial decode of the
     //  first line. We take the caller's word that the encoding is correct
     //  and just assume that the first bulk decode (kicked off by the first
     //  get of a character) will work.
     //
     //  So we do here the slipping in of the leading space if required.
     //
     if ((fType == Type_PE) && (fRefFrom == RefFrom_NonLiteral))
     {
         // This represents no data from the source
         fCharSizeBuf[fCharsAvail] = 0;
         fCharOfsBuf[fCharsAvail] = 0;
         fCharBuf[fCharsAvail++] = chSpace;
     }
 }


 XMLReader::~XMLReader()
 {
     fMemoryManager->deallocate(fEncodingStr);
     fMemoryManager->deallocate(fPublicId);
     fMemoryManager->deallocate(fSystemId);
     delete fStream;
     delete fTranscoder;
 }


 // ---------------------------------------------------------------------------
 //  XMLReader: Character buffer management methods
 // ---------------------------------------------------------------------------
 XMLFilePos XMLReader::getSrcOffset() const
 {
     if (!fSrcOfsSupported || !fCalculateSrcOfs)
         ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Reader_SrcOfsNotSupported, fMemoryManager);

     //
     //  Take the current source offset and add in the sizes that we've
     //  eaten from the source so far.
     //
     if( fCharIndex == 0 ) {
         return fSrcOfsBase;
     }

     if( fCharIndex < fCharsAvail ) {

         return (fSrcOfsBase + fCharOfsBuf[fCharIndex]);
     }

     return (fSrcOfsBase + fCharOfsBuf[fCharIndex-1] + fCharSizeBuf[fCharIndex-1]);
 }


 bool XMLReader::refreshCharBuffer()
 {
     // If the no more flag is set, then don't bother doing anything.
     if (fNoMore)
         return false;

     XMLSize_t startInd;

     // See if we have any existing chars.
     const XMLSize_t spareChars = fCharsAvail - fCharIndex;

     // If we are full, then don't do anything.
     if (spareChars == kCharBufSize)
         return true;

     //
     //  If no transcoder has been created yet, then we never saw the
     //  any encoding="" string and the encoding was not forced, so lets
     //  create one now. We know that it won't change now.
     //
     //  However, note that if we autosensed EBCDIC, then we have to
     //  consider it an error if we never got an encoding since we don't
     //  know what variant of EBCDIC it is.
     //
     if (!fTranscoder)
     {
         if (fEncoding == XMLRecognizer::EBCDIC)
             ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Reader_EncodingStrRequired, fMemoryManager);

         // Ask the transcoding service to make use a transcoder
         XMLTransService::Codes failReason;
         fTranscoder = XMLPlatformUtils::fgTransService->makeNewTranscoderFor
         (
             fEncodingStr
             , failReason
             , kCharBufSize
             , fMemoryManager
         );

         if (!fTranscoder)
         {
             ThrowXMLwithMemMgr1
             (
                 TranscodingException
                 , XMLExcepts::Trans_CantCreateCvtrFor
                 , fEncodingStr
                 , fMemoryManager
             );
         }
     }

     //
     //  Add the number of source bytes eaten so far to the base src
     //  offset member.
     //
     if (fCalculateSrcOfs) {
         for (startInd = 0; startInd < fCharIndex; startInd++)
             fSrcOfsBase += fCharSizeBuf[startInd];
     }

     //
     //  If there are spare chars, then move then down to the bottom. We
     //  have to move the char sizes down also.
     //
     startInd = 0;
     if (spareChars)
     {
         for (XMLSize_t index = fCharIndex; index < fCharsAvail; index++)
         {
             fCharBuf[startInd] = fCharBuf[index];
             fCharSizeBuf[startInd] = fCharSizeBuf[index];
             startInd++;
         }
     }

     //
     //  And then get more chars, starting after any spare chars that were
     //  left over from the last time.
     //
     fCharsAvail = xcodeMoreChars
     (
         &fCharBuf[startInd]
         , &fCharSizeBuf[startInd]
         , kCharBufSize - spareChars
     );

     // Add back in the spare chars
     fCharsAvail += spareChars;

     // Reset the buffer index to zero, so we start from the 0th char again
     fCharIndex = 0;

     //
     //  If no chars available, then we have to check for one last thing. If
     //  this is reader for a PE and its not being expanded inside a literal,
     //  then unget a trailing space. We use a boolean to avoid triggering
     //  this more than once.
     //
     if (!fCharsAvail
     &&  (fType == Type_PE)
     &&  (fRefFrom == RefFrom_NonLiteral)
     &&  !fSentTrailingSpace)
     {
         fCharBuf[0] = chSpace;
         fCharsAvail = 1;
         fSentTrailingSpace = true;
     }

     //
     //  If we get here with no more chars, then set the fNoMore flag which
     //  lets us optimize and know without checking that no more chars are
     //  available.
     //
     if (!fCharsAvail)
         fNoMore = true;

     //  Calculate fCharOfsBuf using the elements from fCharBufSize
     if (fCalculateSrcOfs)
     {
         unsigned int last = 0;
         fCharOfsBuf[0] = 0;
         for (XMLSize_t index = 1; index < fCharsAvail; ++index) {
             fCharOfsBuf[index] = last+fCharSizeBuf[index-1];
             last = fCharOfsBuf[index];
             // code was:
             // fCharOfsBuf[index] = fCharOfsBuf[index-1]+fCharSizeBuf[index-1];
             // but on Solaris 64 bit with sun studio 11 this didn't work as
             // every value of fCharOfsBuf[] was 1.
         }
     }

     return (fCharsAvail != 0);
 }


 // ---------------------------------------------------------------------------
 //  XMLReader: Scanning methods
 // ---------------------------------------------------------------------------
 bool XMLReader::getName(XMLBuffer& toFill, const bool token)
 {
     //  Ok, first lets see if we have chars in the buffer. If not, then lets
     //  reload.
     if (fCharIndex == fCharsAvail)
     {
         if (!refreshCharBuffer())
             return false;
     }

     XMLSize_t charIndex_start = fCharIndex;

     //  Lets check the first char for being a first name char. If not, then
     //  what's the point in living mannnn? Just give up now. We only do this
     //  if its a name and not a name token that they want.
     if (!token)
     {
         if ((fCharBuf[fCharIndex] >= 0xD800) && (fCharBuf[fCharIndex] <= 0xDB7F)) {
             // if there isn't one more char in the buffer, read more data
             if (fCharIndex+1 == fCharsAvail)
             {
                 if (!refreshCharBuffer())
                     return false;
                 // reset the start buffer to the new location of the cursor
                 charIndex_start = fCharIndex;
             }
             if ((fCharBuf[fCharIndex+1] < 0xDC00) || (fCharBuf[fCharIndex+1] > 0xDFFF))
                 return false;

             // Looks ok, so lets eat it
             fCharIndex += 2;
         }
         else {
             if (!isFirstNameChar(fCharBuf[fCharIndex]))
                 return false;

             // Looks ok, so lets eat it
             fCharIndex ++;
         }

     }

     //  And now we loop until we run out of data in this reader or we hit
     //  a non-name char.
     while (true)
     {
         while (fCharIndex < fCharsAvail)
         {
             //  Check the current char and take it if its a name char. Else
             //  break out.
             if ( (fCharBuf[fCharIndex] >= 0xD800) && (fCharBuf[fCharIndex] <= 0xDB7F) )
             {
                 // if there isn't one more char in the buffer, read more data
                 if (fCharIndex+1 == fCharsAvail)
                 {
                     // but first copy the accepted character(s), and update column
                     if (fCharIndex != charIndex_start)
                     {
                         fCurCol += (XMLFileLoc)(fCharIndex - charIndex_start);
                         toFill.append(&fCharBuf[charIndex_start], fCharIndex - charIndex_start);
                     }

                     if (!refreshCharBuffer())
                         break;

                     charIndex_start = fCharIndex;
                 }
                 if ( (fCharBuf[fCharIndex+1] < 0xDC00) ||
                         (fCharBuf[fCharIndex+1] > 0xDFFF)  )
                     break;
                 fCharIndex += 2;

             }
             else
             {
                 if (!isNameChar(fCharBuf[fCharIndex]))
                     break;
                 fCharIndex++;
             }
         }

         // we have to copy the accepted character(s), and update column
         if (fCharIndex != charIndex_start)
         {
             fCurCol += (XMLFileLoc)(fCharIndex - charIndex_start);
             toFill.append(&fCharBuf[charIndex_start], fCharIndex - charIndex_start);
         }

         // something is wrong if there is still something in the buffer
         // or if we don't get no more, then break out.
         if ((fCharIndex < fCharsAvail) ||
              !refreshCharBuffer())
             break;

         charIndex_start = fCharIndex;
     }

     return !toFill.isEmpty();
 }

 bool XMLReader::getNCName(XMLBuffer& toFill)
 {
     if (fCharIndex == fCharsAvail && !refreshCharBuffer())
         return false;

     XMLSize_t charIndex_start = fCharIndex, count;
     //  Lets check the first char for being a first name char. If not, then
     //  what's the point in living mannnn? Just give up now. We only do this
     //  if its a name and not a name token that they want.
     if ((fCharBuf[fCharIndex] >= 0xD800) && (fCharBuf[fCharIndex] <= 0xDB7F)) {
         // if there isn't one more char in the buffer, read more data
         if (fCharIndex+1 == fCharsAvail)
         {
             if (!refreshCharBuffer())
                 return false;
             // reset the start buffer to the new location of the cursor
             charIndex_start = fCharIndex;
         }
         if ((fCharBuf[fCharIndex+1] < 0xDC00) || (fCharBuf[fCharIndex+1] > 0xDFFF))
             return false;

         // Looks ok, so lets eat it
         fCharIndex += 2;
     }
     else {
         if (!isFirstNCNameChar(fCharBuf[fCharIndex])) {
             return false;
         }

         // Looks ok, so lets eat it
         fCharIndex++;
     }

     do
     {
         if (fCharIndex == fCharsAvail)
         {
             // we have to copy the accepted character(s), and update the column number,
             // before getting new data and losing the value of fCharIndex
             if((count = fCharIndex - charIndex_start)!=0)
             {
                 fCurCol += (XMLFileLoc)count;
                 toFill.append(&fCharBuf[charIndex_start], count);
             }
             if(!refreshCharBuffer())
                 return true;
             charIndex_start = fCharIndex;
         }

         //  Check the current char and take it if it's a name char
         while(fCharIndex < fCharsAvail)
         {
             if((fCharBuf[fCharIndex] >= 0xD800) && (fCharBuf[fCharIndex] <= 0xDB7F))
             {
                 // if there isn't one more char in the buffer, read more data
                 if (fCharIndex+1 == fCharsAvail)
                 {
                     // but first copy the accepted character(s), and update column
                     if (fCharIndex != charIndex_start)
                     {
                         fCurCol += (XMLFileLoc)(fCharIndex - charIndex_start);
                         toFill.append(&fCharBuf[charIndex_start], fCharIndex - charIndex_start);
                     }

                     if (!refreshCharBuffer())
                         break;

                     charIndex_start = fCharIndex;
                 }
                 if ( (fCharBuf[fCharIndex+1] < 0xDC00) ||
                     (fCharBuf[fCharIndex+1] > 0xDFFF)  )
                     break;
                 fCharIndex += 2;
             }
             else if(isNCNameChar(fCharBuf[fCharIndex])) fCharIndex++;
             else break;
         }
         // if we didn't consume the entire buffer, we are done
     } while(fCharIndex == fCharsAvail);

     // we have to copy the accepted character(s), and update column
     if((count = fCharIndex - charIndex_start)!=0)
     {
         fCurCol += (XMLFileLoc)count;
         toFill.append(&fCharBuf[charIndex_start], count);
     }
     return true;
 }

 bool XMLReader::getQName(XMLBuffer& toFill, int* colonPosition)
 {
     // We are only looking for two iterations (i.e. 'NCNAME':'NCNAME').
     // We will stop when we finished scanning for a QName (i.e. either a second
     // colon or an invalid char).
     if(!getNCName(toFill))
     {
         *colonPosition = -1;
         return false;
     }
     if (fCharIndex == fCharsAvail && !refreshCharBuffer())
     {
         *colonPosition = -1;
         return true;
     }
     if (fCharBuf[fCharIndex] != chColon)
     {
         *colonPosition = -1;
         return true;
     }

     *colonPosition = (int)toFill.getLen();
     toFill.append(chColon);
     fCharIndex++;
     fCurCol++;
     return getNCName(toFill);
 }

 bool XMLReader::getSpaces(XMLBuffer& toFill)
 {
     //
     //  We just loop until we either hit a non-space or the end of this
     //  entity. We return true if we returned because of a non-space and
     //  false if because of end of entity.
     //
     //  NOTE:   We have to maintain line/col info here and we have to do
     //          whitespace normalization if we are not already internalized.
     //
     while (true)
     {
         // Loop through the current chars in the buffer
         while (fCharIndex < fCharsAvail)
         {
             // Get the current char out of the buffer
             XMLCh curCh = fCharBuf[fCharIndex];

             //
             //  See if its a white space char. If so, then process it. Else
             //  we've hit a non-space and need to return.
             //
             if (isWhitespace(curCh))
             {
                 // Eat this char
                 fCharIndex++;

                 //
                 //  'curCh' is a whitespace(x20|x9|xD|xA), so we only can have
                 //  end-of-line combinations with a leading chCR(xD) or chLF(xA)
                 //
                 //  100000 x20
                 //  001001 x9
                 //  001010 chLF
                 //  001101 chCR
                 //  -----------
                 //  000110 == (chCR|chLF) & ~(0x9|0x20)
                 //
                 //  if the result of thelogical-& operation is
                 //  true  : 'curCh' must be xA  or xD
                 //  false : 'curCh' must be x20 or x9
                 //
                 if ( ( curCh & (chCR|chLF) & ~(0x9|0x20) ) == 0 )
                 {
                     fCurCol++;
                 } else
                 {
                     handleEOL(curCh, false);
                 }

                 // Ok we can add this guy to our buffer
                 toFill.append(curCh);
             }
              else
             {
                 // Return true to indicate we broke out due to a whitespace
                 return true;
             }
         }

         //
         //  We've eaten up the current buffer, so lets try to reload it. If
         //  we don't get anything new, then break out. If we do, then we go
         //  back to the top to keep getting spaces.
         //
         if (!refreshCharBuffer())
             break;
     }
     return false;
 }


 bool XMLReader::getUpToCharOrWS(XMLBuffer& toFill, const XMLCh toCheck)
 {
     while (true)
     {
         // Loop through the current chars in the buffer
         while (fCharIndex < fCharsAvail)
         {
             // Get the current char out of the buffer
             XMLCh curCh = fCharBuf[fCharIndex];

             //
             //  See if its not a white space or our target char, then process
             //  it. Else, we need to return.
             //
             if (!isWhitespace(curCh) && (curCh != toCheck))
             {
                 // Eat this char
                 fCharIndex++;

                 //
                 //  'curCh' is not a whitespace(x20|x9|xD|xA), so we only can
                 //  have end-of-line combinations with a leading chNEL(x85) or
                 //  chLineSeparator(x2028)
                 //
                 //  0010000000101000 chLineSeparator
                 //  0000000010000101 chNEL
                 //  ---------------------
                 //  1101111101010010 == ~(chNEL|chLineSeparator)
                 //
                 //  if the result of the logical-& operation is
                 //  true  : 'curCh' can not be chNEL or chLineSeparator
                 //  false : 'curCh' can be chNEL or chLineSeparator
                 //
                 if ( curCh & (XMLCh) ~(chNEL|chLineSeparator) )
                 {
                     fCurCol++;
                 } else
                 {
                     handleEOL(curCh, false);
                 }

                 // Add it to our buffer
                 toFill.append(curCh);
             }
              else
             {
                 return true;
             }
         }

         //
         //  We've eaten up the current buffer, so lets try to reload it. If
         //  we don't get anything new, then break out. If we do, then we go
         //  back to the top to keep getting spaces.
         //
         if (!refreshCharBuffer())
             break;
     }

     // We never hit any non-space and ate up the whole reader
     return false;

 }

 bool XMLReader::skipIfQuote(XMLCh& chGotten)
 {
     if (fCharIndex == fCharsAvail && !refreshCharBuffer())
         return false;

     chGotten = fCharBuf[fCharIndex];
     if ((chGotten == chDoubleQuote) || (chGotten == chSingleQuote))
     {
         fCharIndex++;
         fCurCol++;
         return true;
     }
     return false;
 }


 bool XMLReader::skipSpaces(bool& skippedSomething, bool inDecl)
 {
     //  DO NOT set the skippedSomething to 'false', but change it to be 'true' only

     //  We enter a loop where we skip over spaces until we hit the end of
     //  this reader or a non-space value. The return indicates whether we
     //  hit the non-space (true) or the end (false).
     do
     {
         // Loop through the current chars in the buffer
         while (fCharIndex < fCharsAvail)
         {
             //  See if its a white space char. If so, then process it. Else
             //  we've hit a non-space and need to return.
             if (isWhitespace(fCharBuf[fCharIndex]))
             {
                 // Get the current char out of the buffer and eat it
                 XMLCh curCh = fCharBuf[fCharIndex++];
                 skippedSomething = true;
                 //
                 //  'curCh' is a whitespace(x20|x9|xD|xA), so we only can have
                 //  end-of-line combinations with a leading chCR(xD) or chLF(xA)
                 //
                 //  100000 x20
                 //  001001 x9
                 //  001010 chLF
                 //  001101 chCR
                 //  -----------
                 //  000110 == (chCR|chLF) & ~(0x9|0x20)
                 //
                 //  if the result of the logical-& operation is
                 //  true  : 'curCh' must be xA  or xD
                 //  false : 'curCh' must be x20 or x9
                 //
                 if ( ( curCh & (chCR|chLF) & ~(0x9|0x20) ) == 0 )
                 {
                     fCurCol++;
                 } else
                 {
                     handleEOL(curCh, inDecl);
                 }
             }
             else
                 return true;
         }

         //  We've eaten up the current buffer, so lets try to reload it. If
         //  we don't get anything new, then break out. If we do, then we go
         //  back to the top to keep getting spaces.
     } while(refreshCharBuffer());

     // We never hit any non-space and ate up the whole reader
     return false;
 }

 bool XMLReader::skippedChar(const XMLCh toSkip)
 {
     //
     //  If the buffer is empty, then try to reload it. If we still get
     //  nothing, then return false.
     //
     if (fCharIndex == fCharsAvail)
     {
         if (!refreshCharBuffer())
             return false;
     }

     //
     //  See if the current char is the one we want. If so, then we need
     //  to eat it and return true.
     //
     if (fCharBuf[fCharIndex] == toSkip)
     {
         fCharIndex++;
         fCurCol++;
         return true;
     }
     return false;
 }


 bool XMLReader::skippedSpace()
 {
     //
     //  If the buffer is empty, then try to reload it. If we still get
     //  nothing, then return false.
     //
     if (fCharIndex == fCharsAvail)
     {
         if (!refreshCharBuffer())
             return false;
     }

     //
     //  See if the current char is a whitespace. If so, then we need to eat
     //  it and return true.
     //
     const XMLCh curCh = fCharBuf[fCharIndex];
     if (isWhitespace(curCh))
     {
         // Eat the character
         fCharIndex++;

         //
         //  'curCh' is a whitespace(x20|x9|xD|xA), so we only can have
         //  end-of-line combinations with a leading chCR(xD) or chLF(xA)
         //
         //  100000 x20
         //  001001 x9
         //  001010 chLF
         //  001101 chCR
         //  -----------
         //  000110 == (chCR|chLF) & ~(0x9|0x20)
         //
         //  if the result of the logical-& operation is
         //  true  : 'curCh' must be xA  or xD
         //  false : 'curCh' must be x20 or x9
         //
         if ( ( curCh & (chCR|chLF) & ~(0x9|0x20) ) == 0 )
         {
             fCurCol++;
         } else
         {
             handleEOL((XMLCh&)curCh, false);
         }

         return true;
     }
     return false;
 }

 bool XMLReader::skippedString(const XMLCh* const toSkip)
 {
     // This function works on strings that are smaller than kCharBufSize.
     // This function guarantees that in case the comparison is unsuccessful
     // the fCharIndex will point to the original data.
     //

     // Get the length of the string to skip.
     //
     const XMLSize_t srcLen = XMLString::stringLen(toSkip);
     XMLSize_t charsLeft = charsLeftInBuffer();

     //  See if the current reader has enough chars to test against this
     //  string. If not, then ask it to reload its buffer. If that does not
     //  get us enough, then it cannot match.
     //
     //  NOTE: This works because strings never have to cross a reader! And
     //  a string to skip will never have a new line in it, so we will never
     //  miss adjusting the current line.
     //
     while (charsLeft < srcLen)
     {
       if (!refreshCharBuffer())
         return false;

       XMLSize_t tmp = charsLeftInBuffer();
       if (tmp == charsLeft) // if the refreshCharBuf() did not add anything new
         return false;     // give up and return.

       charsLeft = tmp;
     }

     //  Ok, now we now that the current reader has enough chars in its
     //  buffer and that its index is back at zero. So we can do a quick and
     //  dirty comparison straight to its buffer with no requirement to unget
     //  if it fails.
     //
     if (memcmp(&fCharBuf[fCharIndex], toSkip, srcLen * sizeof(XMLCh)))
       return false;

     // Add the source length to the current column to get it back right.
     //
     fCurCol += (XMLFileLoc)srcLen;

     //  And get the character buffer index back right by just adding the
     //  source len to it.
     //
     fCharIndex += srcLen;

     return true;
 }

 bool XMLReader::skippedStringLong(const XMLCh* toSkip)
 {
     // This function works on strings that are potentially longer than
     // kCharBufSize (e.g., end tag). This function does not guarantee
     // that in case the comparison is unsuccessful the fCharIndex will
     // point to the original data.
     //

     XMLSize_t srcLen = XMLString::stringLen(toSkip);
     XMLSize_t charsLeft = charsLeftInBuffer();

     while (srcLen != 0)
     {
       // Fill up the buffer with as much data as possible.
       //
       while (charsLeft < srcLen && charsLeft != kCharBufSize)
       {
         if (!refreshCharBuffer())
           return false;

         XMLSize_t tmp = charsLeftInBuffer();
         if (tmp == charsLeft) // if the refreshCharBuf() did not add anything
           return false;       // new give up and return.

         charsLeft = tmp;
       }

       XMLSize_t n = charsLeft < srcLen ? charsLeft : srcLen;

       if (memcmp(&fCharBuf[fCharIndex], toSkip, n * sizeof(XMLCh)))
         return false;

       toSkip += n;
       srcLen -= n;

       fCharIndex += n;
       fCurCol += (XMLFileLoc)n;
       charsLeft -= n;
     }

     return true;
 }

 //
 // This is just to peek if the next coming buffer
 // matches the string toPeek.
 // Similar to skippedString, but just the fCharIndex and fCurCol are not updated
 //
 bool XMLReader::peekString(const XMLCh* const toPeek)
 {
     // Get the length of the string to skip
     const XMLSize_t srcLen = XMLString::stringLen(toPeek);

     //
     //  See if the current reader has enough chars to test against this
     //  string. If not, then ask it to reload its buffer. If that does not
     //  get us enough, then it cannot match.
     //
     //  NOTE: This works because strings never have to cross a reader! And
     //  a string to skip will never have a new line in it, so we will never
     //  miss adjusting the current line.
     //
     XMLSize_t charsLeft = charsLeftInBuffer();
     while (charsLeft < srcLen)
     {
          refreshCharBuffer();
          XMLSize_t t = charsLeftInBuffer();
          if (t == charsLeft)   // if the refreshCharBuf() did not add anything new
              return false;     //   give up and return.
          charsLeft = t;
 	}


     //
     //  Ok, now we now that the current reader has enough chars in its
     //  buffer and that its index is back at zero. So we can do a quick and
     //  dirty comparison straight to its buffer with no requirement to unget
     //  if it fails.
     //
     if (memcmp(&fCharBuf[fCharIndex], toPeek, srcLen*sizeof(XMLCh)))
         return false;

     return true;
 }


 // ---------------------------------------------------------------------------
 //  XMLReader: Setter methods (most are inlined)
 // ---------------------------------------------------------------------------
 bool XMLReader::setEncoding(const XMLCh* const newEncoding)
 {
     //
     //  If the encoding was forced, then we ignore the new value and just
     //  return with success. If it was forced, then we are to use that
     //  encoding without question. Note that, if we are forced, we created
     //  a transcoder up front so there is no need to do one here in that
     //  case.
     //
     if (fForcedEncoding)
         return true;

     //
     // upperCase the newEncoding first for better performance
     //
     XMLCh* inputEncoding = XMLString::replicate(newEncoding, fMemoryManager);
     XMLString::upperCaseASCII(inputEncoding);

     XMLRecognizer::Encodings newBaseEncoding;
     //
     //  Check for non-endian specific UTF-16 or UCS-4. If so, and if we
     //  are already in one of the endian versions of those encodings,
     //  then just keep it and go on. Otherwise, its not valid.
     //
     if (XMLString::equals(inputEncoding, XMLUni::fgUTF16EncodingString)
     ||  XMLString::equals(inputEncoding, XMLUni::fgUTF16EncodingString2)
     ||  XMLString::equals(inputEncoding, XMLUni::fgUTF16EncodingString3)
     ||  XMLString::equals(inputEncoding, XMLUni::fgUTF16EncodingString4)
     ||  XMLString::equals(inputEncoding, XMLUni::fgUTF16EncodingString5)
     ||  XMLString::equals(inputEncoding, XMLUni::fgUTF16EncodingString6)
     ||  XMLString::equals(inputEncoding, XMLUni::fgUTF16EncodingString7))
     {
         fMemoryManager->deallocate(inputEncoding);

         if ((fEncoding != XMLRecognizer::UTF_16L)
         &&  (fEncoding != XMLRecognizer::UTF_16B))
         {
             return false;
         }

         // Override with the original endian specific encoding
         newBaseEncoding = fEncoding;

         if (fEncoding == XMLRecognizer::UTF_16L) {
             fMemoryManager->deallocate(fEncodingStr);
             fEncodingStr = 0;
             fEncodingStr = XMLString::replicate(XMLUni::fgUTF16LEncodingString, fMemoryManager);
         }
         else {
             fMemoryManager->deallocate(fEncodingStr);
             fEncodingStr = 0;
             fEncodingStr = XMLString::replicate(XMLUni::fgUTF16BEncodingString, fMemoryManager);
         }
     }
     else if (XMLString::equals(inputEncoding, XMLUni::fgUCS4EncodingString)
          ||  XMLString::equals(inputEncoding, XMLUni::fgUCS4EncodingString2)
          ||  XMLString::equals(inputEncoding, XMLUni::fgUCS4EncodingString3)
          ||  XMLString::equals(inputEncoding, XMLUni::fgUCS4EncodingString4)
          ||  XMLString::equals(inputEncoding, XMLUni::fgUCS4EncodingString5))
     {
         fMemoryManager->deallocate(inputEncoding);

         if ((fEncoding != XMLRecognizer::UCS_4L)
         &&  (fEncoding != XMLRecognizer::UCS_4B))
         {
             return false;
         }

         // Override with the original endian specific encoding
         newBaseEncoding = fEncoding;

         if (fEncoding == XMLRecognizer::UCS_4L) {

             fMemoryManager->deallocate(fEncodingStr);
             fEncodingStr = 0;
             fEncodingStr = XMLString::replicate(XMLUni::fgUCS4LEncodingString, fMemoryManager);
         }
         else {

             fMemoryManager->deallocate(fEncodingStr);
             fEncodingStr = 0;
             fEncodingStr = XMLString::replicate(XMLUni::fgUCS4BEncodingString, fMemoryManager);
         }
     }
      else
     {
         //
         //  Try to map the string to one of our standard encodings. If its not
         //  one of them, then it has to be one of the non-intrinsic encodings,
         //  in which case we have to delete our intrinsic encoder and create a
         //  new one.
         //
         newBaseEncoding = XMLRecognizer::encodingForName(inputEncoding);

         //
         //  If it does not come back as one of the auto-sensed encodings, then we
         //  have to possibly replace it and at least check a few things.
         //
         if (newBaseEncoding == XMLRecognizer::OtherEncoding)
         {
             //
             // We already know it's none of those non-endian special cases,
             // so just replicate the new name and use it directly to create the transcoder
             //
             fMemoryManager->deallocate(fEncodingStr);
             fEncodingStr = inputEncoding;

             // Check for a pre-created transcoder to delete.
             if (fTranscoder) {
                 delete fTranscoder;
                 fTranscoder = 0;
             }

             XMLTransService::Codes failReason;
             fTranscoder = XMLPlatformUtils::fgTransService->makeNewTranscoderFor
             (
                 fEncodingStr
                 , failReason
                 , kCharBufSize
                 , fMemoryManager
             );

             if (!fTranscoder)
                 ThrowXMLwithMemMgr1(TranscodingException, XMLExcepts::Trans_CantCreateCvtrFor, fEncodingStr, fMemoryManager);
         }
         else
         {
             // Store the new encoding string since it is just an intrinsic
             fMemoryManager->deallocate(fEncodingStr);
             fEncodingStr = inputEncoding;
         }
     }

     if (!fTranscoder) {
         //
         //  Now we can create a transcoder using the recognized fEncoding.  We
         //  might get back a transcoder for an intrinsically supported encoding,
         //  or we might get one from the underlying transcoding service.
         //
         XMLTransService::Codes failReason;
         fTranscoder = XMLPlatformUtils::fgTransService->makeNewTranscoderFor
         (
             newBaseEncoding
             , failReason
             , kCharBufSize
             , fMemoryManager
         );

         if (!fTranscoder)
             ThrowXMLwithMemMgr1(TranscodingException, XMLExcepts::Trans_CantCreateCvtrFor, fEncodingStr, fMemoryManager);
     }

     // Update the base encoding member with the new base encoding found
     fEncoding = newBaseEncoding;

     // Looks ok to us
     return true;
 }


 // ---------------------------------------------------------------------------
 //  XMLReader: Private helper methods
 // ---------------------------------------------------------------------------

 //
 //  This is called when the encoding flag is set and just sets the fSwapped
 //  flag appropriately.
 //
 void XMLReader::checkForSwapped()
 {
     // Assume not swapped
     fSwapped = false;

 	if (XMLPlatformUtils::fgXMLChBigEndian)
 	{
         if ((fEncoding == XMLRecognizer::UTF_16L)
         ||  (fEncoding == XMLRecognizer::UCS_4L))
         {
             fSwapped = true;
         }
     }
     else
     {
         if ((fEncoding == XMLRecognizer::UTF_16B)
         ||  (fEncoding == XMLRecognizer::UCS_4B))
         {
             fSwapped = true;
         }
     }
 }


 //
 //  This is called from the constructor when the encoding is not forced.
 //  We assume that the encoding has been auto-sensed at this point and that
 //  fSwapped is set correctly.
 //
 //  In the case of UCS-4 and EBCDIC, we don't have to check for a decl.
 //  The fact that we got here, means that there is one, because that's the
 //  only way we can autosense those.
 //
 void XMLReader::doInitDecode()
 {
     switch(fEncoding)
     {
         case XMLRecognizer::UCS_4B :
         case XMLRecognizer::UCS_4L :
         {
             // Remove bom if any
             if (((fRawByteBuf[0] == 0x00) && (fRawByteBuf[1] == 0x00) && (fRawByteBuf[2] == 0xFE) && (fRawByteBuf[3] == 0xFF)) ||
                 ((fRawByteBuf[0] == 0xFF) && (fRawByteBuf[1] == 0xFE) && (fRawByteBuf[2] == 0x00) && (fRawByteBuf[3] == 0x00))  )
             {
                 for (XMLSize_t i = 0; i < fRawBytesAvail; i++)
                     fRawByteBuf[i] = fRawByteBuf[i+4];

                 fRawBytesAvail -=4;
             }

             // Look at the raw buffer as UCS4 chars
             const UCS4Ch* asUCS = reinterpret_cast<const UCS4Ch*>(fRawByteBuf);

             while (fRawBufIndex < fRawBytesAvail)
             {
                 // Make sure there are at least sizeof(UCS4Ch) bytes to consume.
                 if (fRawBufIndex + sizeof(UCS4Ch) > fRawBytesAvail) {
                     fCharsAvail = 0;
                     fRawBufIndex = 0;
                     fMemoryManager->deallocate(fPublicId);
                     fMemoryManager->deallocate(fEncodingStr);
                     ArrayJanitor<XMLCh> janValue(fSystemId, fMemoryManager);
                     ThrowXMLwithMemMgr1
                     (
                         TranscodingException
                         , XMLExcepts::Reader_CouldNotDecodeFirstLine
                         , fSystemId
                         , fMemoryManager
                     );
                 }

                 // Make sure we don't exhaust the limited prolog buffer size.
                 // Leave room for a space added at the end of this function.
                 if (fCharsAvail == kCharBufSize - 1) {
                     fCharsAvail = 0;
                     fRawBufIndex = 0;
                     fMemoryManager->deallocate(fPublicId);
                     fMemoryManager->deallocate(fEncodingStr);
                     ArrayJanitor<XMLCh> janValue(fSystemId, fMemoryManager);
                     ThrowXMLwithMemMgr1
                     (
                         TranscodingException
                         , XMLExcepts::Reader_CouldNotDecodeFirstLine
                         , fSystemId
                         , fMemoryManager
                     );
                 }

                 // Get out the current 4 byte value and inc our raw buf index
                 UCS4Ch curVal = *asUCS++;
                 fRawBufIndex += sizeof(UCS4Ch);

                 // Swap if that is required for this machine
                 if (fSwapped)
                     curVal = BitOps::swapBytes(curVal);

                 // Make sure its at least semi legal. If not, undo and throw
                 if (curVal > 0xFFFF)
                 {
                     fCharsAvail = 0;
                     fRawBufIndex = 0;
                     fMemoryManager->deallocate(fPublicId);
                     fMemoryManager->deallocate(fEncodingStr);
                     ArrayJanitor<XMLCh> janValue(fSystemId, fMemoryManager);
                     ThrowXMLwithMemMgr1
                     (
                         TranscodingException
                         , XMLExcepts::Reader_CouldNotDecodeFirstLine
                         , fSystemId
                         , fMemoryManager
                     );
                 }

                 // Convert the value to an XML char and store it
                 fCharSizeBuf[fCharsAvail] = 4;
                 fCharBuf[fCharsAvail++] = XMLCh(curVal);

                 // Break out on the > character
                 if (curVal == chCloseAngle)
                     break;
             }
             break;
         }

         case XMLRecognizer::UTF_8 :
         {
             // If there's a utf-8 BOM  (0xEF 0xBB 0xBF), skip past it.
             //   Don't move to char buf - no one wants to see it.
             //   Note: this causes any encoding= declaration to override
             //         the BOM's attempt to say that the encoding is utf-8.

             // Look at the raw buffer as short chars
             const char* asChars = (const char*)fRawByteBuf;

             if (fRawBytesAvail > XMLRecognizer::fgUTF8BOMLen &&
                 XMLString::compareNString(  asChars
                                             , XMLRecognizer::fgUTF8BOM
                                             , XMLRecognizer::fgUTF8BOMLen) == 0)
             {
                 fRawBufIndex += XMLRecognizer::fgUTF8BOMLen;
                 asChars      += XMLRecognizer::fgUTF8BOMLen;
             }

             //
             //  First check that there are enough bytes to even see the
             //  decl indentifier. If not, get out now with no action since
             //  there is no decl.
             //
             if (fRawBytesAvail < XMLRecognizer::fgASCIIPreLen)
                 break;

             // Check for the opening sequence. If not, then no decl
             if (XMLString::compareNString(  asChars
                                             , XMLRecognizer::fgASCIIPre
                                             , XMLRecognizer::fgASCIIPreLen))
             {
                 break;
             }

             while (fRawBufIndex < fRawBytesAvail)
             {
                 const char curCh = *asChars++;
                 fRawBufIndex++;

                 // Make sure we don't exhaust the limited prolog buffer size.
                 // Leave room for a space added at the end of this function.
                 if (fCharsAvail == kCharBufSize - 1) {
                     fCharsAvail = 0;
                     fRawBufIndex = 0;
                     fMemoryManager->deallocate(fPublicId);
                     fMemoryManager->deallocate(fEncodingStr);
                     ArrayJanitor<XMLCh> janValue(fSystemId, fMemoryManager);
                     ThrowXMLwithMemMgr1
                     (
                         TranscodingException
                         , XMLExcepts::Reader_CouldNotDecodeFirstLine
                         , fSystemId
                         , fMemoryManager
                     );
                 }

                 // Looks ok, so store it
                 fCharSizeBuf[fCharsAvail] = 1;
                 fCharBuf[fCharsAvail++] = XMLCh(curCh);

                 // Break out on a > character
                 if (curCh == chCloseAngle)
                     break;

                 //
                 //  A char greater than 0x7F is not allowed in this case. If
                 //  so, undo and throw.
                 //
                 if (curCh & 0x80)
                 {
                     fCharsAvail = 0;
                     fRawBufIndex = 0;
                     fMemoryManager->deallocate(fPublicId);
                     fMemoryManager->deallocate(fEncodingStr);
                     ArrayJanitor<XMLCh> janValue(fSystemId, fMemoryManager);
                     ThrowXMLwithMemMgr1
                     (
                         TranscodingException
                         , XMLExcepts::Reader_CouldNotDecodeFirstLine
                         , fSystemId
                         , fMemoryManager
                     );
                 }
             }
             break;
         }

         case XMLRecognizer::UTF_16B :
         case XMLRecognizer::UTF_16L :
         {
             //
             //  If there is a decl here, we just truncate back the characters
             //  as we go. No surrogate creation would be allowed here in legal
             //  XML, so we consider it a transoding error if we find one.
             //
             if (fRawBytesAvail < 2)
                 break;

             XMLSize_t postBOMIndex = 0;
             const UTF16Ch* asUTF16 = reinterpret_cast<const UTF16Ch*>(&fRawByteBuf[fRawBufIndex]);
             if ((*asUTF16 == chUnicodeMarker) || (*asUTF16 == chSwappedUnicodeMarker))
             {
                 fRawBufIndex += sizeof(UTF16Ch);
                 asUTF16++;
                 postBOMIndex = fRawBufIndex;
             }

             //  First check that there are enough raw bytes for there to even
             //  be a decl indentifier. If not, then nothing to do.
             //
             if (fRawBytesAvail - fRawBufIndex < XMLRecognizer::fgUTF16PreLen)
             {
                 fRawBufIndex = postBOMIndex;
                 break;
             }

             //
             //  See we get a match on the prefix. If not, then reset and
             //  break out.
             //
             if (fEncoding == XMLRecognizer::UTF_16B)
             {
                 if (memcmp(asUTF16, XMLRecognizer::fgUTF16BPre, XMLRecognizer::fgUTF16PreLen))
                 {
                     fRawBufIndex = postBOMIndex;
                     break;
                 }
             }
              else
             {
                 if (memcmp(asUTF16, XMLRecognizer::fgUTF16LPre, XMLRecognizer::fgUTF16PreLen))
                 {
                     fRawBufIndex = postBOMIndex;
                     break;
                 }
             }

             while (fRawBufIndex < fRawBytesAvail)
             {
                 // Make sure there are at least sizeof(UTF16Ch) bytes to consume.
                 if (fRawBufIndex + sizeof(UTF16Ch) > fRawBytesAvail) {
                     fCharsAvail = 0;
                     fRawBufIndex = 0;
                     fMemoryManager->deallocate(fPublicId);
                     fMemoryManager->deallocate(fEncodingStr);
                     ArrayJanitor<XMLCh> janValue(fSystemId, fMemoryManager);
                     ThrowXMLwithMemMgr1
                     (
                         TranscodingException
                         , XMLExcepts::Reader_CouldNotDecodeFirstLine
                         , fSystemId
                         , fMemoryManager
                     );
                 }

                 // Make sure we don't exhaust the limited prolog buffer size.
                 // Leave room for a space added at the end of this function.
                 if (fCharsAvail == kCharBufSize - 1) {
                     fCharsAvail = 0;
                     fRawBufIndex = 0;
                     fMemoryManager->deallocate(fPublicId);
                     fMemoryManager->deallocate(fEncodingStr);
                     ArrayJanitor<XMLCh> janValue(fSystemId, fMemoryManager);
                     ThrowXMLwithMemMgr1
                     (
                         TranscodingException
                         , XMLExcepts::Reader_CouldNotDecodeFirstLine
                         , fSystemId
                         , fMemoryManager
                     );
                 }

                 // Get out the current 2 byte value
                 UTF16Ch curVal = *asUTF16++;
                 fRawBufIndex += sizeof(UTF16Ch);

                 // Swap if that is required for this machine
                 if (fSwapped)
                     curVal = BitOps::swapBytes(curVal);

                 //
                 //  Store it and bump the target index, implicitly converting
                 //  if UTF16Ch and XMLCh are not the same size.
                 //
                 fCharSizeBuf[fCharsAvail] = 2;
                 fCharBuf[fCharsAvail++] = curVal;

                 // Break out on a > char
                 if (curVal == chCloseAngle)
                     break;
             }
             break;
         }

         case XMLRecognizer::EBCDIC :
         {
             //
             //  We use special support in the intrinsic EBCDIC-US transcoder
             //  to go through one char at a time.
             //
             const XMLByte* srcPtr = fRawByteBuf;
             while (1)
             {
                 // Transcode one char from the source
                 const XMLCh chCur = XMLEBCDICTranscoder::xlatThisOne(*srcPtr++);
                 fRawBufIndex++;

                 // Make sure we don't exhaust the limited prolog buffer size.
                 // Leave room for a space added at the end of this function.
                 if (fCharsAvail == kCharBufSize - 1) {
                     fCharsAvail = 0;
                     fRawBufIndex = 0;
                     fMemoryManager->deallocate(fPublicId);
                     fMemoryManager->deallocate(fEncodingStr);
                     ArrayJanitor<XMLCh> janValue(fSystemId, fMemoryManager);
                     ThrowXMLwithMemMgr1
                     (
                         TranscodingException
                         , XMLExcepts::Reader_CouldNotDecodeFirstLine
                         , fSystemId
                         , fMemoryManager
                     );
                 }


                 //
                 //  And put it into the character buffer. This stuff has to
                 //  look like it was normally transcoded.
                 //
                 fCharSizeBuf[fCharsAvail] = 1;
                 fCharBuf[fCharsAvail++] = chCur;

                 // If its a > char, then break out
                 if (chCur == chCloseAngle)
                     break;

                 // Watch for using up all input and get out
                 if (fRawBufIndex == fRawBytesAvail)
                     break;
             }
             break;
         }

         default :
             // It should never be anything else here
             fMemoryManager->deallocate(fPublicId);
             fMemoryManager->deallocate(fEncodingStr);
             fMemoryManager->deallocate(fSystemId);
             ThrowXMLwithMemMgr(TranscodingException, XMLExcepts::Reader_BadAutoEncoding, fMemoryManager);
             break;
     }

     //
     //  Ok, by the time we get here, if its a legal XML file we have eaten
     //  the XML/TextDecl. So, if we are a PE and are being referenced from
     //  outside a literal, then we need to throw in an arbitrary space that
     //  is required by XML.
     //
     if ((fType == Type_PE) && (fRefFrom == RefFrom_NonLiteral))
         fCharBuf[fCharsAvail++] = chSpace;

     //  Calculate fCharOfsBuf buffer using the elements from fCharBufSize
     if (fCalculateSrcOfs)
     {
         fCharOfsBuf[0] = 0;
         for (XMLSize_t index = 1; index < fCharsAvail; ++index) {
             fCharOfsBuf[index] = fCharOfsBuf[index-1]+fCharSizeBuf[index-1];
         }
     }
 }


 //
 //  This method is called internally when we run out of bytes in the raw
 //  buffer. We just read as many bytes as we can into the raw buffer again
 //  and store the number of bytes we got.
 //
 void XMLReader::refreshRawBuffer()
 {
     // Make sure we don't underflow on the subtraction.
     if (fRawBufIndex > fRawBytesAvail) {
         ThrowXMLwithMemMgr1
         (
             RuntimeException
             , XMLExcepts::Str_StartIndexPastEnd
             , fSystemId
             , fMemoryManager
         );
     }

     //
     //  If there are any bytes left, move them down to the start. There
     //  should only ever be (max bytes per char - 1) at the most.
     //
     const XMLSize_t bytesLeft = fRawBytesAvail - fRawBufIndex;

     // Move the existing ones down
     for (XMLSize_t index = 0; index < bytesLeft; index++)
         fRawByteBuf[index] = fRawByteBuf[fRawBufIndex + index];

     //
     //  And then read into the buffer past the existing bytes. Add back in
     //  that many to the bytes read, and subtract that many from the bytes
     //  requested.
     //
     fRawBytesAvail = fStream->readBytes
     (
         &fRawByteBuf[bytesLeft], kRawBufSize - bytesLeft
     ) + bytesLeft;

     //
     //  We need to reset the buffer index back to the start in all cases,
     //  since any trailing data was copied down to the start.
     //
     fRawBufIndex = 0;
 }


 //
 //  This method is called internally when we run out of characters in the
 //  trancoded character buffer. We transcode up to another maxChars chars
 //  from the
 //
 XMLSize_t
 XMLReader::xcodeMoreChars(          XMLCh* const            bufToFill
                             ,       unsigned char* const    charSizes
                             , const XMLSize_t               maxChars)
 {
     XMLSize_t charsDone = 0;
     XMLSize_t bytesEaten = 0;
     bool needMode = false;

     while (!bytesEaten)
     {
         // If our raw buffer is low, then lets load up another batch of
         // raw bytes now.
         //
         XMLSize_t bytesLeft = fRawBytesAvail - fRawBufIndex;
         if (needMode || bytesLeft == 0 || bytesLeft < fLowWaterMark)
         {
             refreshRawBuffer();

             // If there are no characters or if we need more but didn't get
             // any, return zero now.
             //
             if (fRawBytesAvail == 0 ||
                 (needMode && (bytesLeft == fRawBytesAvail - fRawBufIndex)))
                 return 0;
         }

         // Ask the transcoder to internalize another batch of chars. It is
         // possible that there is data in the raw buffer but the transcoder
         // is unable to produce anything because transcoding of multi-byte
         // encodings may have left a few bytes representing a partial
         // character in the buffer that can't be used until the next chunk
         // (and the rest of the character) is read. In this case set the
         // needMore flag and try again.
         //

         charsDone = fTranscoder->transcodeFrom
           (
             &fRawByteBuf[fRawBufIndex]
             , fRawBytesAvail - fRawBufIndex
             , bufToFill
             , maxChars
             , bytesEaten
             , charSizes
           );

         if (bytesEaten == 0)
             needMode = true;
         else
             fRawBufIndex += bytesEaten;
     }

     return charsDone;
 }

 /***
  *
  * XML1.1
  *
  * 2.11 End-of-Line Handling
  *
  *    XML parsed entities are often stored in computer files which, for editing
  *    convenience, are organized into lines. These lines are typically separated
  *    by some combination of the characters CARRIAGE RETURN (#xD) and LINE FEED (#xA).
  *
  *    To simplify the tasks of applications, the XML processor MUST behave as if
  *    it normalized all line breaks in external parsed entities (including the document
  *    entity) on input, before parsing, by translating all of the following to a single
  *    #xA character:
  *
  *  1. the two-character sequence #xD #xA
  *  2. the two-character sequence #xD #x85
  *  3. the single character #x85
  *  4. the single character #x2028
  *  5. any #xD character that is not immediately followed by #xA or #x85.
  *
  *
  ***/
 void XMLReader::handleEOL(XMLCh& curCh, bool inDecl)
 {
     // 1. the two-character sequence #xD #xA
     // 2. the two-character sequence #xD #x85
     // 5. any #xD character that is not immediately followed by #xA or #x85.
     switch(curCh)
     {
     case chCR:
         fCurCol = 1;
         fCurLine++;

         //
         //  If not already internalized, then convert it to an
         //  LF and eat any following LF.
         //
         if (fSource == Source_External)
         {
             if ((fCharIndex < fCharsAvail) || refreshCharBuffer())
             {
                 if ( fCharBuf[fCharIndex] == chLF              ||
                     ((fCharBuf[fCharIndex] == chNEL) && fNEL)  )
                 {
                     fCharIndex++;
                 }
             }
             curCh = chLF;
         }
         break;

     case chLF:
         fCurCol = 1;
         fCurLine++;
         break;

     // 3. the single character #x85
     // 4. the single character #x2028
     case chNEL:
     case chLineSeparator:
         if (inDecl && fXMLVersion == XMLV1_1)
         {

         /***
          * XML1.1
          *
          * 2.11 End-of-Line Handling
          *  ...
          *   The characters #x85 and #x2028 cannot be reliably recognized and translated
          *   until an entity's encoding declaration (if present) has been read.
          *   Therefore, it is a fatal error to use them within the XML declaration or
          *   text declaration.
          *
          ***/
             ThrowXMLwithMemMgr1
                 (
                 TranscodingException
                 , XMLExcepts::Reader_NelLsepinDecl
                 , fSystemId
                 , fMemoryManager
                 );
         }

         if (fNEL && fSource == Source_External)
         {
             fCurCol = 1;
             fCurLine++;
             curCh = chLF;
         }
         break;
     default:
         fCurCol++;
     }
 }

 }