AOO410/main/sax/source/expatwrap/xml2utf.cxx - openoffice - Git at Google

 /**************************************************************
  *
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *   http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
  *
  *************************************************************/


 #include <string.h>

 #include <sal/types.h>

 #include <rtl/textenc.h>
 #include <rtl/tencinfo.h>


 #include <com/sun/star/io/XInputStream.hpp>

 using namespace rtl;
 using namespace ::com::sun::star::uno;
 using namespace ::com::sun::star::io;

 #include "xml2utf.hxx"

 namespace sax_expatwrap {

 sal_Int32 XMLFile2UTFConverter::readAndConvert( Sequence<sal_Int8> &seq , sal_Int32 nMaxToRead )
 	throw ( IOException, NotConnectedException , BufferSizeExceededException , RuntimeException )
 {

 	Sequence<sal_Int8> seqIn;

 	if( ! m_in.is() ) {
 		throw NotConnectedException();
 	}
 	if( ! m_bStarted ) {
 		nMaxToRead = Max( 512 , nMaxToRead );  	// it should be possible to find the encoding attribute
 						     					// within the first 512 bytes == 128 chars in UCS-4
 	}

 	sal_Int32 nRead;
 	Sequence< sal_Int8 > seqStart;
 	while( sal_True )
 	{
 		nRead = m_in->readSomeBytes( seq , nMaxToRead );

 		if( nRead + seqStart.getLength())
 		{
 			// if nRead is 0, the file is already eof.
 			if( ! m_bStarted && nRead )
 			{
 				// ensure that enough data is available to parse encoding
 				if( seqStart.getLength() )
 				{
 				  // prefix with what we had so far.
 				  sal_Int32 nLength = seq.getLength();
 				  seq.realloc( seqStart.getLength() + nLength );

 				  memmove (seq.getArray() + seqStart.getLength(),
 					   seq.getConstArray(),
 					   nLength);
 				  memcpy  (seq.getArray(),
 					   seqStart.getConstArray(),
 					   seqStart.getLength());
 				}

 				// autodetection with the first bytes
 				if( ! isEncodingRecognizable( seq ) )
 				{
 				  // remember what we have so far.
 				  seqStart = seq;

 				  // read more !
 				  continue;
 				}
 				if( scanForEncoding( seq ) || m_sEncoding.getLength() ) {
 					// initialize decoding
 					initializeDecoding();
 				}
 				nRead = seq.getLength();
 				seqStart = Sequence < sal_Int8 > ();
 			}

 			// do the encoding
 			if( m_pText2Unicode && m_pUnicode2Text &&
 				m_pText2Unicode->canContinue() && m_pUnicode2Text->canContinue() ) {

 				Sequence<sal_Unicode> seqUnicode = m_pText2Unicode->convert( seq );
 				seq = m_pUnicode2Text->convert(	seqUnicode.getConstArray(),	seqUnicode.getLength() );
 			}

 			if( ! m_bStarted )
 			{
 				// it must now be ensured, that no encoding attribute exist anymore
 				// ( otherwise the expat-Parser will crash )
 				// This must be done after decoding !
 				// ( e.g. Files decoded in ucs-4 cannot be read properly )
 				m_bStarted = sal_True;
 				removeEncoding( seq );
 			}
 			nRead = seq.getLength();
 		}

 		break;
 	}
 	return nRead;
 }


 XMLFile2UTFConverter::~XMLFile2UTFConverter()
 {
 	if( m_pText2Unicode )
 		delete m_pText2Unicode;
 	if( m_pUnicode2Text )
 		delete m_pUnicode2Text;
 }


 void XMLFile2UTFConverter::removeEncoding( Sequence<sal_Int8> &seq )
 {
 	const sal_Int8 *pSource = seq.getArray();
 	if( ! strncmp( (const char * ) pSource , "<?xml" , 4) )
 	{

 		// scan for encoding
 		OString str( (sal_Char * ) pSource , seq.getLength() );

 		// cut sequence to first line break
 		// find first line break;
 		int nMax = str.indexOf( 10 );
 		if( nMax >= 0 )
 		{
 			str = str.copy( 0 , nMax );
 		}

 		int nFound = str.indexOf( " encoding" );
 		if( nFound >= 0 ) {
 			int nStop;
 			int nStart = str.indexOf( "\"" , nFound );
 			if( nStart < 0 || str.indexOf( "'" , nFound ) < nStart )
 			{
 				nStart = str.indexOf( "'" , nFound );
 				nStop  = str.indexOf( "'" , nStart +1 );
 			}
 			else
 			{
 				nStop  = str.indexOf( "\"" , nStart +1);
 			}

 			if( nStart >= 0 && nStop >= 0 && nStart+1 < nStop )
 			{
 				// remove encoding tag from file
 				memmove(        &( seq.getArray()[nFound] ) ,
 								&( seq.getArray()[nStop+1]) ,
 								seq.getLength() - nStop -1);
 				seq.realloc( seq.getLength() - ( nStop+1 - nFound ) );
 //				str = String( (char * ) seq.getArray() , seq.getLen() );
 			}
 		}
 	}
 }

 // Checks, if enough data has been accumulated to recognize the encoding
 sal_Bool XMLFile2UTFConverter::isEncodingRecognizable( const Sequence< sal_Int8 > &seq)
 {
 	const sal_Int8 *pSource = seq.getConstArray();
 	sal_Bool bCheckIfFirstClosingBracketExsists = sal_False;

 	if( seq.getLength() < 8 ) {
 		// no recognition possible, when less than 8 bytes are available
 		return sal_False;
 	}

 	if( ! strncmp( (const char * ) pSource , "<?xml" , 4 ) ) {
 		// scan if the <?xml tag finishes within this buffer
 		bCheckIfFirstClosingBracketExsists = sal_True;
 	}
 	else if( ('<' == pSource[0] || '<' == pSource[2] ) &&
 			 ( ('?' == pSource[4] || '?' == pSource[6] ) ) )
 	{
 		// check for utf-16
 		bCheckIfFirstClosingBracketExsists = sal_True;
 	}
 	else if( ( '<' == pSource[1] || '<' == pSource[3] ) &&
 		     ( '?' == pSource[5] || '?' == pSource[7] ) )
 	{
 		// check for
 		bCheckIfFirstClosingBracketExsists = sal_True;
 	}

 	if( bCheckIfFirstClosingBracketExsists )
 	{
 		for( sal_Int32 i = 0; i < seq.getLength() ; i ++ )
 		{
 			// whole <?xml tag is valid
 			if( '>' == pSource[ i ] )
 			{
 				return sal_True;
 			}
 		}
 		return sal_False;
 	}

 	// No <? tag in front, no need for a bigger buffer
 	return sal_True;
 }

 sal_Bool XMLFile2UTFConverter::scanForEncoding( Sequence< sal_Int8 > &seq )
 {
 	const sal_uInt8 *pSource = reinterpret_cast<const sal_uInt8*>( seq.getConstArray() );
 	sal_Bool bReturn = sal_True;

 	if( seq.getLength() < 4 ) {
 		// no recognition possible, when less than 4 bytes are available
 		return sal_False;
 	}

 	// first level : detect possible file formats
 	if( ! strncmp( (const char * ) pSource , "<?xml" , 4 ) ) {

 		// scan for encoding
 		OString str( (const sal_Char *) pSource , seq.getLength() );

 		// cut sequence to first line break
 		//find first line break;
 		int nMax = str.indexOf( 10 );
 		if( nMax >= 0 )
 		{
 			str = str.copy( 0 , nMax );
 		}

 		int nFound = str.indexOf( " encoding" );
 		if( nFound < str.getLength() ) {
 			int nStop;
 			int nStart = str.indexOf( "\"" , nFound );
 			if( nStart < 0 || str.indexOf( "'" , nFound ) < nStart )
 			{
 				nStart = str.indexOf( "'" , nFound );
 				nStop  = str.indexOf( "'" , nStart +1 );
 			}
 			else
 			{
 				nStop  = str.indexOf( "\"" , nStart +1);
 			}
 			if( nStart >= 0 && nStop >= 0 && nStart+1 < nStop )
 			{
 				// encoding found finally
 				m_sEncoding = str.copy( nStart+1 , nStop - nStart - 1 );
 			}
 		}
 	}
 	else if( 0xFE == pSource[0] &&
 	         0xFF == pSource[1] ) {
 		// UTF-16 big endian
 		// conversion is done so that encoding information can be easily extracted
 		m_sEncoding = "utf-16";
 	}
 	else if( 0xFF == pSource[0] &&
 	         0xFE == pSource[1] ) {
 		// UTF-16 little endian
 		// conversion is done so that encoding information can be easily extracted
 		m_sEncoding = "utf-16";
 	}
 	else if( 0x00 == pSource[0] && 0x3c == pSource[1]  && 0x00 == pSource[2] && 0x3f == pSource[3] ) {
 		// UTF-16 big endian without byte order mark (this is (strictly speaking) an error.)
 		// The byte order mark is simply added

 		// simply add the byte order mark !
 		seq.realloc( seq.getLength() + 2 );
 		memmove( &( seq.getArray()[2] ) , seq.getArray() , seq.getLength() - 2 );
 		((sal_uInt8*)seq.getArray())[0] = 0xFE;
 		((sal_uInt8*)seq.getArray())[1] = 0xFF;

 		m_sEncoding = "utf-16";
 	}
 	else if( 0x3c == pSource[0] && 0x00 == pSource[1]  && 0x3f == pSource[2] && 0x00 == pSource[3] ) {
 		// UTF-16 little endian without byte order mark (this is (strictly speaking) an error.)
 		// The byte order mark is simply added

 		seq.realloc( seq.getLength() + 2 );
 		memmove( &( seq.getArray()[2] ) , seq.getArray() , seq.getLength() - 2 );
 		((sal_uInt8*)seq.getArray())[0] = 0xFF;
 		((sal_uInt8*)seq.getArray())[1] = 0xFE;

 		m_sEncoding = "utf-16";
 	}
     else if( 0xEF == pSource[0] &&
              0xBB == pSource[1] &&
              0xBF == pSource[2] )
     {
         // UTF-8 BOM (byte order mark); signifies utf-8, and not byte order
         // The BOM is removed.
         memmove( seq.getArray(), &( seq.getArray()[3] ), seq.getLength()-3 );
         seq.realloc( seq.getLength() - 3 );
         m_sEncoding = "utf-8";
     }
 	else if( 0x00 == pSource[0] && 0x00 == pSource[1]  && 0x00 == pSource[2] && 0x3c == pSource[3] ) {
 		// UCS-4 big endian
 		m_sEncoding = "ucs-4";
 	}
 	else if( 0x3c == pSource[0] && 0x00 == pSource[1]  && 0x00 == pSource[2] && 0x00 == pSource[3] ) {
 		// UCS-4 little endian
 		m_sEncoding = "ucs-4";
 	}
 	else if( 0x4c == pSource[0] && 0x6f == pSource[1]  &&
 	         0xa7 == static_cast<unsigned char> (pSource[2]) &&
 	         0x94 == static_cast<unsigned char> (pSource[3]) ) {
 		// EBCDIC
 		bReturn = sal_False;   // must be extended
 	}
 	else {
 		// other
 		// UTF8 is directly recognized by the parser.
 		bReturn = sal_False;
 	}

 	return bReturn;
 }

 void XMLFile2UTFConverter::initializeDecoding()
 {

 	if( m_sEncoding.getLength() )
 	{
 		rtl_TextEncoding encoding = rtl_getTextEncodingFromMimeCharset( m_sEncoding.getStr() );
 		if( encoding != RTL_TEXTENCODING_UTF8 )
 		{
 			m_pText2Unicode = new Text2UnicodeConverter( m_sEncoding );
 			m_pUnicode2Text = new Unicode2TextConverter( RTL_TEXTENCODING_UTF8 );
 		}
 	}
 }


 //----------------------------------------------
 //
 // Text2UnicodeConverter
 //
 //----------------------------------------------
 Text2UnicodeConverter::Text2UnicodeConverter( const OString &sEncoding )
 {
 	rtl_TextEncoding encoding = rtl_getTextEncodingFromMimeCharset( sEncoding.getStr() );
 	if( RTL_TEXTENCODING_DONTKNOW == encoding )
 	{
 		m_bCanContinue = sal_False;
 		m_bInitialized = sal_False;
 	}
 	else
 	{
 		init( encoding );
 	}
 }

 Text2UnicodeConverter::~Text2UnicodeConverter()
 {
 	if( m_bInitialized )
 	{
 		rtl_destroyTextToUnicodeContext( m_convText2Unicode , m_contextText2Unicode );
 		rtl_destroyUnicodeToTextConverter( m_convText2Unicode );
 	}
 }

 void Text2UnicodeConverter::init( rtl_TextEncoding encoding )
 {
 	m_bCanContinue = sal_True;
 	m_bInitialized = sal_True;

 	m_convText2Unicode 	= rtl_createTextToUnicodeConverter(encoding);
 	m_contextText2Unicode = rtl_createTextToUnicodeContext( m_convText2Unicode );
 	m_rtlEncoding = encoding;
 }


 Sequence<sal_Unicode> Text2UnicodeConverter::convert( const Sequence<sal_Int8> &seqText )
 {
 	sal_uInt32 uiInfo;
 	sal_Size nSrcCvtBytes 	= 0;
 	sal_Size nTargetCount 	= 0;
 	sal_Size nSourceCount   = 0;

 	// the whole source size
 	sal_Int32 	nSourceSize = seqText.getLength() + m_seqSource.getLength();
 	Sequence<sal_Unicode> 	seqUnicode ( nSourceSize );

 	const sal_Int8 *pbSource = seqText.getConstArray();
 	sal_Int8 *pbTempMem = 0;

 	if( m_seqSource.getLength() ) {
 		// put old rest and new byte sequence into one array
 		pbTempMem = new sal_Int8[ nSourceSize ];
 		memcpy( pbTempMem , m_seqSource.getConstArray() , m_seqSource.getLength() );
 		memcpy( &(pbTempMem[ m_seqSource.getLength() ]) , seqText.getConstArray() , seqText.getLength() );
 		pbSource = pbTempMem;

 		// set to zero again
 		m_seqSource = Sequence< sal_Int8 >();
 	}

 	while( sal_True ) {

 		/* All invalid characters are transformed to the unicode undefined char */
 		nTargetCount += 	rtl_convertTextToUnicode(
 									m_convText2Unicode,
 									m_contextText2Unicode,
 									( const sal_Char * ) &( pbSource[nSourceCount] ),
 									nSourceSize - nSourceCount ,
 									&( seqUnicode.getArray()[ nTargetCount ] ),
 									seqUnicode.getLength() - nTargetCount,
 									RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_DEFAULT   |
 									RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_DEFAULT |
 									RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT,
 									&uiInfo,
 									&nSrcCvtBytes );
 		nSourceCount += nSrcCvtBytes;

 		if( uiInfo & RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL ) {
 			// save necessary bytes for next conversion
 			seqUnicode.realloc( seqUnicode.getLength() * 2 );
 			continue;
 		}
 		break;
 	}
 	if( uiInfo & RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL ) {
 		m_seqSource.realloc( nSourceSize - nSourceCount );
 		memcpy( m_seqSource.getArray() , &(pbSource[nSourceCount]) , nSourceSize-nSourceCount );
 	}


 	if( pbTempMem ) {
 		delete [] pbTempMem;
 	}

 	// set to correct unicode size
 	seqUnicode.realloc( nTargetCount );

 	return seqUnicode;
 }


 //----------------------------------------------
 //
 // Unicode2TextConverter
 //
 //----------------------------------------------
 Unicode2TextConverter::Unicode2TextConverter( rtl_TextEncoding encoding )
 {
 	init( encoding );
 }


 Unicode2TextConverter::~Unicode2TextConverter()
 {
 	if( m_bInitialized ) {
 		rtl_destroyUnicodeToTextContext( m_convUnicode2Text , m_contextUnicode2Text );
 		rtl_destroyUnicodeToTextConverter( m_convUnicode2Text );
 	}
 }


 Sequence<sal_Int8> Unicode2TextConverter::convert(const sal_Unicode *puSource , sal_Int32 nSourceSize)
 {
 	sal_Unicode *puTempMem = 0;

 	if( m_seqSource.getLength() ) {
 		// For surrogates !
 		// put old rest and new byte sequence into one array
 		// In general when surrogates are used, they should be rarely
 		// cut off between two convert()-calls. So this code is used
 		// rarely and the extra copy is acceptable.
 		puTempMem = new sal_Unicode[ nSourceSize + m_seqSource.getLength()];
 		memcpy( puTempMem ,
 				m_seqSource.getConstArray() ,
 				m_seqSource.getLength() * sizeof( sal_Unicode ) );
 		memcpy(
 			&(puTempMem[ m_seqSource.getLength() ]) ,
 			puSource ,
 			nSourceSize*sizeof( sal_Unicode ) );
 		puSource = puTempMem;
 		nSourceSize += m_seqSource.getLength();

 		m_seqSource = Sequence< sal_Unicode > ();
 	}


 	sal_Size nTargetCount = 0;
 	sal_Size nSourceCount = 0;

 	sal_uInt32 uiInfo;
 	sal_Size nSrcCvtChars;

 	// take nSourceSize * 3 as preference
 	// this is an upper boundary for converting to utf8,
 	// which most often used as the target.
 	sal_Int32 nSeqSize =  nSourceSize * 3;

 	Sequence<sal_Int8> 	seqText( nSeqSize );
 	sal_Char *pTarget = (sal_Char *) seqText.getArray();
 	while( sal_True ) {

 		nTargetCount += rtl_convertUnicodeToText(
 									m_convUnicode2Text,
 									m_contextUnicode2Text,
 									&( puSource[nSourceCount] ),
 									nSourceSize - nSourceCount ,
 									&( pTarget[nTargetCount] ),
 									nSeqSize - nTargetCount,
 									RTL_UNICODETOTEXT_FLAGS_UNDEFINED_DEFAULT |
 									RTL_UNICODETOTEXT_FLAGS_INVALID_DEFAULT ,
 									&uiInfo,
 									&nSrcCvtChars);
 		nSourceCount += nSrcCvtChars;

 		if( uiInfo & RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL ) {
 			nSeqSize = nSeqSize *2;
 			seqText.realloc( nSeqSize );  // double array size
 			pTarget = ( sal_Char * ) seqText.getArray();
 			continue;
 		}
 		break;
 	}

 	// for surrogates
 	if( uiInfo & RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL ) {
 		m_seqSource.realloc( nSourceSize - nSourceCount );
 		memcpy( m_seqSource.getArray() ,
 				&(puSource[nSourceCount]),
 				(nSourceSize - nSourceCount) * sizeof( sal_Unicode ) );
 	}

 	if( puTempMem ) {
 		delete [] puTempMem;
 	}

 	// reduce the size of the buffer (fast, no copy necessary)
 	seqText.realloc( nTargetCount );

 	return seqText;
 }

 void Unicode2TextConverter::init( rtl_TextEncoding encoding )
 {
 	m_bCanContinue = sal_True;
 	m_bInitialized = sal_True;

 	m_convUnicode2Text 	= rtl_createUnicodeToTextConverter( encoding );
 	m_contextUnicode2Text = rtl_createUnicodeToTextContext( m_convUnicode2Text );
 	m_rtlEncoding = encoding;
 };


 }
	/**************************************************************
	*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing,
	* software distributed under the License is distributed on an
	* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	* KIND, either express or implied. See the License for the
	* specific language governing permissions and limitations
	* under the License.
	*
	*************************************************************/


	#include <string.h>

	#include <sal/types.h>

	#include <rtl/textenc.h>
	#include <rtl/tencinfo.h>


	#include <com/sun/star/io/XInputStream.hpp>

	using namespace rtl;
	using namespace ::com::sun::star::uno;
	using namespace ::com::sun::star::io;

	#include "xml2utf.hxx"

	namespace sax_expatwrap {

	sal_Int32 XMLFile2UTFConverter::readAndConvert( Sequence<sal_Int8> &seq , sal_Int32 nMaxToRead )
	throw ( IOException, NotConnectedException , BufferSizeExceededException , RuntimeException )
	{

	Sequence<sal_Int8> seqIn;

	if( ! m_in.is() ) {
	throw NotConnectedException();
	}
	if( ! m_bStarted ) {
	nMaxToRead = Max( 512 , nMaxToRead ); // it should be possible to find the encoding attribute
	// within the first 512 bytes == 128 chars in UCS-4
	}

	sal_Int32 nRead;
	Sequence< sal_Int8 > seqStart;
	while( sal_True )
	{
	nRead = m_in->readSomeBytes( seq , nMaxToRead );

	if( nRead + seqStart.getLength())
	{
	// if nRead is 0, the file is already eof.
	if( ! m_bStarted && nRead )
	{
	// ensure that enough data is available to parse encoding
	if( seqStart.getLength() )
	{
	// prefix with what we had so far.
	sal_Int32 nLength = seq.getLength();
	seq.realloc( seqStart.getLength() + nLength );

	memmove (seq.getArray() + seqStart.getLength(),
	seq.getConstArray(),
	nLength);
	memcpy (seq.getArray(),
	seqStart.getConstArray(),
	seqStart.getLength());
	}

	// autodetection with the first bytes
	if( ! isEncodingRecognizable( seq ) )
	{
	// remember what we have so far.
	seqStart = seq;

	// read more !
	continue;
	}
	if( scanForEncoding( seq ) \|\| m_sEncoding.getLength() ) {
	// initialize decoding
	initializeDecoding();
	}
	nRead = seq.getLength();
	seqStart = Sequence < sal_Int8 > ();
	}

	// do the encoding
	if( m_pText2Unicode && m_pUnicode2Text &&
	m_pText2Unicode->canContinue() && m_pUnicode2Text->canContinue() ) {

	Sequence<sal_Unicode> seqUnicode = m_pText2Unicode->convert( seq );
	seq = m_pUnicode2Text->convert( seqUnicode.getConstArray(), seqUnicode.getLength() );
	}

	if( ! m_bStarted )
	{
	// it must now be ensured, that no encoding attribute exist anymore
	// ( otherwise the expat-Parser will crash )
	// This must be done after decoding !
	// ( e.g. Files decoded in ucs-4 cannot be read properly )
	m_bStarted = sal_True;
	removeEncoding( seq );
	}
	nRead = seq.getLength();
	}

	break;
	}
	return nRead;
	}


	XMLFile2UTFConverter::~XMLFile2UTFConverter()
	{
	if( m_pText2Unicode )
	delete m_pText2Unicode;
	if( m_pUnicode2Text )
	delete m_pUnicode2Text;
	}


	void XMLFile2UTFConverter::removeEncoding( Sequence<sal_Int8> &seq )
	{
	const sal_Int8 *pSource = seq.getArray();
	if( ! strncmp( (const char * ) pSource , "<?xml" , 4) )
	{

	// scan for encoding
	OString str( (sal_Char * ) pSource , seq.getLength() );

	// cut sequence to first line break
	// find first line break;
	int nMax = str.indexOf( 10 );
	if( nMax >= 0 )
	{
	str = str.copy( 0 , nMax );
	}

	int nFound = str.indexOf( " encoding" );
	if( nFound >= 0 ) {
	int nStop;
	int nStart = str.indexOf( "\"" , nFound );
	if( nStart < 0 \|\| str.indexOf( "'" , nFound ) < nStart )
	{
	nStart = str.indexOf( "'" , nFound );
	nStop = str.indexOf( "'" , nStart +1 );
	}
	else
	{
	nStop = str.indexOf( "\"" , nStart +1);
	}

	if( nStart >= 0 && nStop >= 0 && nStart+1 < nStop )
	{
	// remove encoding tag from file
	memmove( &( seq.getArray()[nFound] ) ,
	&( seq.getArray()[nStop+1]) ,
	seq.getLength() - nStop -1);
	seq.realloc( seq.getLength() - ( nStop+1 - nFound ) );
	// str = String( (char * ) seq.getArray() , seq.getLen() );
	}
	}
	}
	}

	// Checks, if enough data has been accumulated to recognize the encoding
	sal_Bool XMLFile2UTFConverter::isEncodingRecognizable( const Sequence< sal_Int8 > &seq)
	{
	const sal_Int8 *pSource = seq.getConstArray();
	sal_Bool bCheckIfFirstClosingBracketExsists = sal_False;

	if( seq.getLength() < 8 ) {
	// no recognition possible, when less than 8 bytes are available
	return sal_False;
	}

	if( ! strncmp( (const char * ) pSource , "<?xml" , 4 ) ) {
	// scan if the <?xml tag finishes within this buffer
	bCheckIfFirstClosingBracketExsists = sal_True;
	}
	else if( ('<' == pSource[0] \|\| '<' == pSource[2] ) &&
	( ('?' == pSource[4] \|\| '?' == pSource[6] ) ) )
	{
	// check for utf-16
	bCheckIfFirstClosingBracketExsists = sal_True;
	}
	else if( ( '<' == pSource[1] \|\| '<' == pSource[3] ) &&
	( '?' == pSource[5] \|\| '?' == pSource[7] ) )
	{
	// check for
	bCheckIfFirstClosingBracketExsists = sal_True;
	}

	if( bCheckIfFirstClosingBracketExsists )
	{
	for( sal_Int32 i = 0; i < seq.getLength() ; i ++ )
	{
	// whole <?xml tag is valid
	if( '>' == pSource[ i ] )
	{
	return sal_True;
	}
	}
	return sal_False;
	}

	// No <? tag in front, no need for a bigger buffer
	return sal_True;
	}

	sal_Bool XMLFile2UTFConverter::scanForEncoding( Sequence< sal_Int8 > &seq )
	{
	const sal_uInt8 pSource = reinterpret_cast<const sal_uInt8>( seq.getConstArray() );
	sal_Bool bReturn = sal_True;

	if( seq.getLength() < 4 ) {
	// no recognition possible, when less than 4 bytes are available
	return sal_False;
	}

	// first level : detect possible file formats
	if( ! strncmp( (const char * ) pSource , "<?xml" , 4 ) ) {

	// scan for encoding
	OString str( (const sal_Char *) pSource , seq.getLength() );

	// cut sequence to first line break
	//find first line break;
	int nMax = str.indexOf( 10 );
	if( nMax >= 0 )
	{
	str = str.copy( 0 , nMax );
	}

	int nFound = str.indexOf( " encoding" );
	if( nFound < str.getLength() ) {
	int nStop;
	int nStart = str.indexOf( "\"" , nFound );
	if( nStart < 0 \|\| str.indexOf( "'" , nFound ) < nStart )
	{
	nStart = str.indexOf( "'" , nFound );
	nStop = str.indexOf( "'" , nStart +1 );
	}
	else
	{
	nStop = str.indexOf( "\"" , nStart +1);
	}
	if( nStart >= 0 && nStop >= 0 && nStart+1 < nStop )
	{
	// encoding found finally
	m_sEncoding = str.copy( nStart+1 , nStop - nStart - 1 );
	}
	}
	}
	else if( 0xFE == pSource[0] &&
	0xFF == pSource[1] ) {
	// UTF-16 big endian
	// conversion is done so that encoding information can be easily extracted
	m_sEncoding = "utf-16";
	}
	else if( 0xFF == pSource[0] &&
	0xFE == pSource[1] ) {
	// UTF-16 little endian
	// conversion is done so that encoding information can be easily extracted
	m_sEncoding = "utf-16";
	}
	else if( 0x00 == pSource[0] && 0x3c == pSource[1] && 0x00 == pSource[2] && 0x3f == pSource[3] ) {
	// UTF-16 big endian without byte order mark (this is (strictly speaking) an error.)
	// The byte order mark is simply added

	// simply add the byte order mark !
	seq.realloc( seq.getLength() + 2 );
	memmove( &( seq.getArray()[2] ) , seq.getArray() , seq.getLength() - 2 );
	((sal_uInt8*)seq.getArray())[0] = 0xFE;
	((sal_uInt8*)seq.getArray())[1] = 0xFF;

	m_sEncoding = "utf-16";
	}
	else if( 0x3c == pSource[0] && 0x00 == pSource[1] && 0x3f == pSource[2] && 0x00 == pSource[3] ) {
	// UTF-16 little endian without byte order mark (this is (strictly speaking) an error.)
	// The byte order mark is simply added

	seq.realloc( seq.getLength() + 2 );
	memmove( &( seq.getArray()[2] ) , seq.getArray() , seq.getLength() - 2 );
	((sal_uInt8*)seq.getArray())[0] = 0xFF;
	((sal_uInt8*)seq.getArray())[1] = 0xFE;

	m_sEncoding = "utf-16";
	}
	else if( 0xEF == pSource[0] &&
	0xBB == pSource[1] &&
	0xBF == pSource[2] )
	{
	// UTF-8 BOM (byte order mark); signifies utf-8, and not byte order
	// The BOM is removed.
	memmove( seq.getArray(), &( seq.getArray()[3] ), seq.getLength()-3 );
	seq.realloc( seq.getLength() - 3 );
	m_sEncoding = "utf-8";
	}
	else if( 0x00 == pSource[0] && 0x00 == pSource[1] && 0x00 == pSource[2] && 0x3c == pSource[3] ) {
	// UCS-4 big endian
	m_sEncoding = "ucs-4";
	}
	else if( 0x3c == pSource[0] && 0x00 == pSource[1] && 0x00 == pSource[2] && 0x00 == pSource[3] ) {
	// UCS-4 little endian
	m_sEncoding = "ucs-4";
	}
	else if( 0x4c == pSource[0] && 0x6f == pSource[1] &&
	0xa7 == static_cast<unsigned char> (pSource[2]) &&
	0x94 == static_cast<unsigned char> (pSource[3]) ) {
	// EBCDIC
	bReturn = sal_False; // must be extended
	}
	else {
	// other
	// UTF8 is directly recognized by the parser.
	bReturn = sal_False;
	}

	return bReturn;
	}

	void XMLFile2UTFConverter::initializeDecoding()
	{

	if( m_sEncoding.getLength() )
	{
	rtl_TextEncoding encoding = rtl_getTextEncodingFromMimeCharset( m_sEncoding.getStr() );
	if( encoding != RTL_TEXTENCODING_UTF8 )
	{
	m_pText2Unicode = new Text2UnicodeConverter( m_sEncoding );
	m_pUnicode2Text = new Unicode2TextConverter( RTL_TEXTENCODING_UTF8 );
	}
	}
	}


	//----------------------------------------------
	//
	// Text2UnicodeConverter
	//
	//----------------------------------------------
	Text2UnicodeConverter::Text2UnicodeConverter( const OString &sEncoding )
	{
	rtl_TextEncoding encoding = rtl_getTextEncodingFromMimeCharset( sEncoding.getStr() );
	if( RTL_TEXTENCODING_DONTKNOW == encoding )
	{
	m_bCanContinue = sal_False;
	m_bInitialized = sal_False;
	}
	else
	{
	init( encoding );
	}
	}

	Text2UnicodeConverter::~Text2UnicodeConverter()
	{
	if( m_bInitialized )
	{
	rtl_destroyTextToUnicodeContext( m_convText2Unicode , m_contextText2Unicode );
	rtl_destroyUnicodeToTextConverter( m_convText2Unicode );
	}
	}

	void Text2UnicodeConverter::init( rtl_TextEncoding encoding )
	{
	m_bCanContinue = sal_True;
	m_bInitialized = sal_True;

	m_convText2Unicode = rtl_createTextToUnicodeConverter(encoding);
	m_contextText2Unicode = rtl_createTextToUnicodeContext( m_convText2Unicode );
	m_rtlEncoding = encoding;
	}


	Sequence<sal_Unicode> Text2UnicodeConverter::convert( const Sequence<sal_Int8> &seqText )
	{
	sal_uInt32 uiInfo;
	sal_Size nSrcCvtBytes = 0;
	sal_Size nTargetCount = 0;
	sal_Size nSourceCount = 0;

	// the whole source size
	sal_Int32 nSourceSize = seqText.getLength() + m_seqSource.getLength();
	Sequence<sal_Unicode> seqUnicode ( nSourceSize );

	const sal_Int8 *pbSource = seqText.getConstArray();
	sal_Int8 *pbTempMem = 0;

	if( m_seqSource.getLength() ) {
	// put old rest and new byte sequence into one array
	pbTempMem = new sal_Int8[ nSourceSize ];
	memcpy( pbTempMem , m_seqSource.getConstArray() , m_seqSource.getLength() );
	memcpy( &(pbTempMem[ m_seqSource.getLength() ]) , seqText.getConstArray() , seqText.getLength() );
	pbSource = pbTempMem;

	// set to zero again
	m_seqSource = Sequence< sal_Int8 >();
	}

	while( sal_True ) {

	/* All invalid characters are transformed to the unicode undefined char */
	nTargetCount += rtl_convertTextToUnicode(
	m_convText2Unicode,
	m_contextText2Unicode,
	( const sal_Char * ) &( pbSource[nSourceCount] ),
	nSourceSize - nSourceCount ,
	&( seqUnicode.getArray()[ nTargetCount ] ),
	seqUnicode.getLength() - nTargetCount,
	RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_DEFAULT \|
	RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_DEFAULT \|
	RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT,
	&uiInfo,
	&nSrcCvtBytes );
	nSourceCount += nSrcCvtBytes;

	if( uiInfo & RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL ) {
	// save necessary bytes for next conversion
	seqUnicode.realloc( seqUnicode.getLength() * 2 );
	continue;
	}
	break;
	}
	if( uiInfo & RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL ) {
	m_seqSource.realloc( nSourceSize - nSourceCount );
	memcpy( m_seqSource.getArray() , &(pbSource[nSourceCount]) , nSourceSize-nSourceCount );
	}


	if( pbTempMem ) {
	delete [] pbTempMem;
	}

	// set to correct unicode size
	seqUnicode.realloc( nTargetCount );

	return seqUnicode;
	}



	//----------------------------------------------
	//
	// Unicode2TextConverter
	//
	//----------------------------------------------
	Unicode2TextConverter::Unicode2TextConverter( rtl_TextEncoding encoding )
	{
	init( encoding );
	}


	Unicode2TextConverter::~Unicode2TextConverter()
	{
	if( m_bInitialized ) {
	rtl_destroyUnicodeToTextContext( m_convUnicode2Text , m_contextUnicode2Text );
	rtl_destroyUnicodeToTextConverter( m_convUnicode2Text );
	}
	}


	Sequence<sal_Int8> Unicode2TextConverter::convert(const sal_Unicode *puSource , sal_Int32 nSourceSize)
	{
	sal_Unicode *puTempMem = 0;

	if( m_seqSource.getLength() ) {
	// For surrogates !
	// put old rest and new byte sequence into one array
	// In general when surrogates are used, they should be rarely
	// cut off between two convert()-calls. So this code is used
	// rarely and the extra copy is acceptable.
	puTempMem = new sal_Unicode[ nSourceSize + m_seqSource.getLength()];
	memcpy( puTempMem ,
	m_seqSource.getConstArray() ,
	m_seqSource.getLength() * sizeof( sal_Unicode ) );
	memcpy(
	&(puTempMem[ m_seqSource.getLength() ]) ,
	puSource ,
	nSourceSize*sizeof( sal_Unicode ) );
	puSource = puTempMem;
	nSourceSize += m_seqSource.getLength();

	m_seqSource = Sequence< sal_Unicode > ();
	}


	sal_Size nTargetCount = 0;
	sal_Size nSourceCount = 0;

	sal_uInt32 uiInfo;
	sal_Size nSrcCvtChars;

	// take nSourceSize * 3 as preference
	// this is an upper boundary for converting to utf8,
	// which most often used as the target.
	sal_Int32 nSeqSize = nSourceSize * 3;

	Sequence<sal_Int8> seqText( nSeqSize );
	sal_Char pTarget = (sal_Char ) seqText.getArray();
	while( sal_True ) {

	nTargetCount += rtl_convertUnicodeToText(
	m_convUnicode2Text,
	m_contextUnicode2Text,
	&( puSource[nSourceCount] ),
	nSourceSize - nSourceCount ,
	&( pTarget[nTargetCount] ),
	nSeqSize - nTargetCount,
	RTL_UNICODETOTEXT_FLAGS_UNDEFINED_DEFAULT \|
	RTL_UNICODETOTEXT_FLAGS_INVALID_DEFAULT ,
	&uiInfo,
	&nSrcCvtChars);
	nSourceCount += nSrcCvtChars;

	if( uiInfo & RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL ) {
	nSeqSize = nSeqSize *2;
	seqText.realloc( nSeqSize ); // double array size
	pTarget = ( sal_Char * ) seqText.getArray();
	continue;
	}
	break;
	}

	// for surrogates
	if( uiInfo & RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL ) {
	m_seqSource.realloc( nSourceSize - nSourceCount );
	memcpy( m_seqSource.getArray() ,
	&(puSource[nSourceCount]),
	(nSourceSize - nSourceCount) * sizeof( sal_Unicode ) );
	}

	if( puTempMem ) {
	delete [] puTempMem;
	}

	// reduce the size of the buffer (fast, no copy necessary)
	seqText.realloc( nTargetCount );

	return seqText;
	}

	void Unicode2TextConverter::init( rtl_TextEncoding encoding )
	{
	m_bCanContinue = sal_True;
	m_bInitialized = sal_True;

	m_convUnicode2Text = rtl_createUnicodeToTextConverter( encoding );
	m_contextUnicode2Text = rtl_createUnicodeToTextContext( m_convUnicode2Text );
	m_rtlEncoding = encoding;
	};


	}