| /************************************************************** |
| * |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| * |
| *************************************************************/ |
| |
| |
| |
| // MARKER(update_precomp.py): autogen include statement, do not remove |
| #include "precompiled_svtools.hxx" |
| |
| /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil -*- */ |
| |
| #include <stdio.h> // for EOF |
| #include <rtl/tencinfo.h> |
| #include <tools/stream.hxx> |
| #include <tools/debug.hxx> |
| #include <svtools/rtftoken.h> |
| #include <svtools/rtfkeywd.hxx> |
| #include <svtools/parrtf.hxx> |
| |
// Upper bound for the number of characters ScanText() collects in one run.
const int MAX_STRING_LEN = 1024;
// Chunk size used while collecting the letters of a control word.
const int MAX_TOKEN_LEN = 128;

// ASCII-only character classification.
// The argument is parenthesized so that expressions such as `c | 0` are
// classified as a whole. It is still evaluated more than once, so do not
// pass expressions with side effects.
#define RTF_ISDIGIT( c ) ( (c) >= '0' && (c) <= '9' )
#define RTF_ISALPHA( c ) ( ((c) >= 'A' && (c) <= 'Z') || ((c) >= 'a' && (c) <= 'z') )
| |
| SvRTFParser::SvRTFParser( SvStream& rIn, sal_uInt8 nStackSize ) |
| : SvParser( rIn, nStackSize ), |
| eUNICodeSet( RTL_TEXTENCODING_MS_1252 ), // default ist ANSI-CodeSet |
| nUCharOverread( 1 ) |
| { |
| // default ist ANSI-CodeSet |
| SetSrcEncoding( RTL_TEXTENCODING_MS_1252 ); |
| bRTF_InTextRead = false; |
| } |
| |
| SvRTFParser::~SvRTFParser() |
| { |
| } |
| |
| |
| |
| |
| int SvRTFParser::_GetNextToken() |
| { |
| int nRet = 0; |
| do { |
| int bNextCh = true; |
| switch( nNextCh ) |
| { |
| case '\\': |
| { |
| // Steuerzeichen |
| switch( nNextCh = GetNextChar() ) |
| { |
| case '{': |
| case '}': |
| case '\\': |
| case '+': // habe ich in einem RTF-File gefunden |
| case '~': // nonbreaking space |
| case '-': // optional hyphen |
| case '_': // nonbreaking hyphen |
| case '\'': // HexValue |
| nNextCh = '\\'; |
| rInput.SeekRel( -1 ); |
| ScanText(); |
| nRet = RTF_TEXTTOKEN; |
| bNextCh = 0 == nNextCh; |
| break; |
| |
| case '*': // ignoreflag |
| nRet = RTF_IGNOREFLAG; |
| break; |
| case ':': // subentry in an index entry |
| nRet = RTF_SUBENTRYINDEX; |
| break; |
| case '|': // formula-charakter |
| nRet = RTF_FORMULA; |
| break; |
| |
| case 0x0a: |
| case 0x0d: |
| nRet = RTF_PAR; |
| break; |
| |
| default: |
| if( RTF_ISALPHA( nNextCh ) ) |
| { |
| aToken = '\\'; |
| { |
| String aStrBuffer; |
| sal_Unicode* pStr = aStrBuffer.AllocBuffer( |
| MAX_TOKEN_LEN ); |
| xub_StrLen nStrLen = 0; |
| do { |
| *(pStr + nStrLen++) = nNextCh; |
| if( MAX_TOKEN_LEN == nStrLen ) |
| { |
| aToken += aStrBuffer; |
| aToken.GetBufferAccess(); // make unique string! |
| nStrLen = 0; |
| } |
| nNextCh = GetNextChar(); |
| } while( RTF_ISALPHA( nNextCh ) ); |
| if( nStrLen ) |
| { |
| aStrBuffer.ReleaseBufferAccess( nStrLen ); |
| aToken += aStrBuffer; |
| } |
| } |
| |
| // Minus fuer numerischen Parameter |
| int bNegValue = false; |
| if( '-' == nNextCh ) |
| { |
| bNegValue = true; |
| nNextCh = GetNextChar(); |
| } |
| |
| // evt. Numerischer Parameter |
| if( RTF_ISDIGIT( nNextCh ) ) |
| { |
| nTokenValue = 0; |
| do { |
| nTokenValue *= 10; |
| nTokenValue += nNextCh - '0'; |
| nNextCh = GetNextChar(); |
| } while( RTF_ISDIGIT( nNextCh ) ); |
| if( bNegValue ) |
| nTokenValue = -nTokenValue; |
| bTokenHasValue=true; |
| } |
| else if( bNegValue ) // das Minus wieder zurueck |
| { |
| nNextCh = '-'; |
| rInput.SeekRel( -1 ); |
| } |
| if( ' ' == nNextCh ) // Blank gehoert zum Token! |
| nNextCh = GetNextChar(); |
| |
| // suche das Token in der Tabelle: |
| if( 0 == (nRet = GetRTFToken( aToken )) ) |
| // Unknown Control |
| nRet = RTF_UNKNOWNCONTROL; |
| |
| // bug 76812 - unicode token handled as normal text |
| bNextCh = false; |
| switch( nRet ) |
| { |
| case RTF_UC: |
| if( 0 <= nTokenValue ) |
| { |
| nUCharOverread = (sal_uInt8)nTokenValue; |
| #if 1 |
| //cmc: other ifdef breaks #i3584 |
| aParserStates.top(). |
| nUCharOverread = nUCharOverread; |
| #else |
| if( !nUCharOverread ) |
| nUCharOverread = aParserStates.top().nUCharOverread; |
| else |
| aParserStates.top(). |
| nUCharOverread = nUCharOverread; |
| #endif |
| } |
| aToken.Erase(); // #i47831# erase token to prevent the token from beeing treated as text |
| // read next token |
| nRet = 0; |
| break; |
| |
| case RTF_UPR: |
| if (!_inSkipGroup) { |
| // UPR - overread the group with the ansi |
| // informations |
| while( '{' != _GetNextToken() ) |
| ; |
| SkipGroup(); |
| _GetNextToken(); // overread the last bracket |
| nRet = 0; |
| } |
| break; |
| |
| case RTF_U: |
| if( !bRTF_InTextRead ) |
| { |
| nRet = RTF_TEXTTOKEN; |
| aToken = (sal_Unicode)nTokenValue; |
| |
| // overread the next n "RTF" characters. This |
| // can be also \{, \}, \'88 |
| for( sal_uInt8 m = 0; m < nUCharOverread; ++m ) |
| { |
| sal_Unicode cAnsi = nNextCh; |
| while( 0xD == cAnsi ) |
| cAnsi = GetNextChar(); |
| while( 0xA == cAnsi ) |
| cAnsi = GetNextChar(); |
| |
| if( '\\' == cAnsi && |
| '\'' == ( cAnsi = GetNextChar() )) |
| // HexValue ueberlesen |
| cAnsi = GetHexValue(); |
| nNextCh = GetNextChar(); |
| } |
| ScanText(); |
| bNextCh = 0 == nNextCh; |
| } |
| break; |
| } |
| } |
| else if( SVPAR_PENDING != eState ) |
| { |
| // Bug 34631 - "\ " ueberlesen - Blank als Zeichen |
| // eState = SVPAR_ERROR; |
| bNextCh = false; |
| } |
| break; |
| } |
| } |
| break; |
| |
| case sal_Unicode(EOF): |
| eState = SVPAR_ACCEPTED; |
| nRet = nNextCh; |
| break; |
| |
| case '{': |
| { |
| if( 0 <= nOpenBrakets ) |
| { |
| RtfParserState_Impl aState( nUCharOverread, GetSrcEncoding() ); |
| aParserStates.push( aState ); |
| } |
| ++nOpenBrakets; |
| DBG_ASSERT( |
| static_cast<size_t>(nOpenBrakets) == aParserStates.size(), |
| "ParserStateStack unequal to bracket count" ); |
| nRet = nNextCh; |
| } |
| break; |
| |
| case '}': |
| --nOpenBrakets; |
| if( 0 <= nOpenBrakets ) |
| { |
| aParserStates.pop(); |
| if( !aParserStates.empty() ) |
| { |
| const RtfParserState_Impl& rRPS = |
| aParserStates.top(); |
| nUCharOverread = rRPS.nUCharOverread; |
| SetSrcEncoding( rRPS.eCodeSet ); |
| } |
| else |
| { |
| nUCharOverread = 1; |
| SetSrcEncoding( GetCodeSet() ); |
| } |
| } |
| DBG_ASSERT( |
| static_cast<size_t>(nOpenBrakets) == aParserStates.size(), |
| "ParserStateStack unequal to bracket count" ); |
| nRet = nNextCh; |
| break; |
| |
| case 0x0d: |
| case 0x0a: |
| break; |
| |
| default: |
| // es folgt normaler Text |
| ScanText(); |
| nRet = RTF_TEXTTOKEN; |
| bNextCh = 0 == nNextCh; |
| break; |
| } |
| |
| if( bNextCh ) |
| nNextCh = GetNextChar(); |
| |
| } while( !nRet && SVPAR_WORKING == eState ); |
| return nRet; |
| } |
| |
| |
| sal_Unicode SvRTFParser::GetHexValue() |
| { |
| // Hex-Wert sammeln |
| register int n; |
| register sal_Unicode nHexVal = 0; |
| |
| for( n = 0; n < 2; ++n ) |
| { |
| nHexVal *= 16; |
| nNextCh = GetNextChar(); |
| if( nNextCh >= '0' && nNextCh <= '9' ) |
| nHexVal += (nNextCh - 48); |
| else if( nNextCh >= 'a' && nNextCh <= 'f' ) |
| nHexVal += (nNextCh - 87); |
| else if( nNextCh >= 'A' && nNextCh <= 'F' ) |
| nHexVal += (nNextCh - 55); |
| } |
| return nHexVal; |
| } |
| |
| void SvRTFParser::ScanText( const sal_Unicode cBreak ) |
| { |
| String aStrBuffer; |
| int bWeiter = true; |
| while( bWeiter && IsParserWorking() && aStrBuffer.Len() < MAX_STRING_LEN) |
| { |
| int bNextCh = true; |
| switch( nNextCh ) |
| { |
| case '\\': |
| { |
| switch (nNextCh = GetNextChar()) |
| { |
| case '\'': |
| { |
| |
| #if 0 |
| // #i35653 patch from cmc |
| ByteString aByteString(static_cast<char>(GetHexValue())); |
| if (aByteString.Len()) |
| aStrBuffer.Append(String(aByteString, GetSrcEncoding())); |
| #else |
| ByteString aByteString; |
| while (1) |
| { |
| aByteString.Append((char)GetHexValue()); |
| |
| bool bBreak = false; |
| sal_Char nSlash = '\\'; |
| while (!bBreak) |
| { |
| wchar_t __next=GetNextChar(); |
| if (__next>0xFF) // fix for #i43933# and #i35653# |
| { |
| if (aByteString.Len()) |
| aStrBuffer.Append(String(aByteString, GetSrcEncoding())); |
| aStrBuffer.Append((sal_Unicode)__next); |
| |
| aByteString.Erase(); |
| continue; |
| } |
| nSlash = (sal_Char)__next; |
| while (nSlash == 0xD || nSlash == 0xA) |
| nSlash = (sal_Char)GetNextChar(); |
| |
| switch (nSlash) |
| { |
| case '{': |
| case '}': |
| case '\\': |
| bBreak = true; |
| break; |
| default: |
| aByteString.Append(nSlash); |
| break; |
| } |
| } |
| |
| nNextCh = GetNextChar(); |
| |
| if (nSlash != '\\' || nNextCh != '\'') |
| { |
| rInput.SeekRel(-1); |
| nNextCh = nSlash; |
| break; |
| } |
| } |
| |
| bNextCh = false; |
| |
| if (aByteString.Len()) |
| aStrBuffer.Append(String(aByteString, GetSrcEncoding())); |
| #endif |
| } |
| break; |
| case '\\': |
| case '}': |
| case '{': |
| case '+': // habe ich in einem RTF-File gefunden |
| aStrBuffer.Append(nNextCh); |
| break; |
| case '~': // nonbreaking space |
| aStrBuffer.Append(static_cast< sal_Unicode >(0xA0)); |
| break; |
| case '-': // optional hyphen |
| aStrBuffer.Append(static_cast< sal_Unicode >(0xAD)); |
| break; |
| case '_': // nonbreaking hyphen |
| aStrBuffer.Append(static_cast< sal_Unicode >(0x2011)); |
| break; |
| |
| case 'u': |
| // UNI-Code Zeichen lesen |
| { |
| nNextCh = GetNextChar(); |
| rInput.SeekRel( -2 ); |
| |
| if( '-' == nNextCh || RTF_ISDIGIT( nNextCh ) ) |
| { |
| bRTF_InTextRead = true; |
| |
| String sSave( aToken ); |
| nNextCh = '\\'; |
| #ifdef DBG_UTIL |
| int nToken = |
| #endif |
| _GetNextToken(); |
| DBG_ASSERT( RTF_U == nToken, "doch kein UNI-Code Zeichen" ); |
| // dont convert symbol chars |
| aStrBuffer.Append( |
| static_cast< sal_Unicode >(nTokenValue)); |
| |
| // overread the next n "RTF" characters. This |
| // can be also \{, \}, \'88 |
| for( sal_uInt8 m = 0; m < nUCharOverread; ++m ) |
| { |
| sal_Unicode cAnsi = nNextCh; |
| while( 0xD == cAnsi ) |
| cAnsi = GetNextChar(); |
| while( 0xA == cAnsi ) |
| cAnsi = GetNextChar(); |
| |
| if( '\\' == cAnsi && |
| '\'' == ( cAnsi = GetNextChar() )) |
| // HexValue ueberlesen |
| cAnsi = GetHexValue(); |
| nNextCh = GetNextChar(); |
| } |
| bNextCh = false; |
| aToken = sSave; |
| bRTF_InTextRead = false; |
| } |
| else |
| { |
| nNextCh = '\\'; |
| bWeiter = false; // Abbrechen, String zusammen |
| } |
| } |
| break; |
| |
| default: |
| rInput.SeekRel( -1 ); |
| nNextCh = '\\'; |
| bWeiter = false; // Abbrechen, String zusammen |
| break; |
| } |
| } |
| break; |
| |
| case sal_Unicode(EOF): |
| eState = SVPAR_ERROR; |
| // weiter |
| case '{': |
| case '}': |
| bWeiter = false; |
| break; |
| |
| case 0x0a: |
| case 0x0d: |
| break; |
| |
| default: |
| if( nNextCh == cBreak || aStrBuffer.Len() >= MAX_STRING_LEN) |
| bWeiter = false; |
| else |
| { |
| do { |
| // alle anderen Zeichen kommen in den Text |
| aStrBuffer.Append(nNextCh); |
| |
| if (sal_Unicode(EOF) == (nNextCh = GetNextChar())) |
| { |
| if (aStrBuffer.Len()) |
| aToken += aStrBuffer; |
| return; |
| } |
| } while |
| ( |
| (RTF_ISALPHA(nNextCh) || RTF_ISDIGIT(nNextCh)) && |
| (aStrBuffer.Len() < MAX_STRING_LEN) |
| ); |
| bNextCh = false; |
| } |
| } |
| |
| if( bWeiter && bNextCh ) |
| nNextCh = GetNextChar(); |
| } |
| |
| if (aStrBuffer.Len()) |
| aToken += aStrBuffer; |
| } |
| |
| |
| short SvRTFParser::_inSkipGroup=0; |
| |
| void SvRTFParser::SkipGroup() |
| { |
| short nBrackets=1; |
| if (_inSkipGroup>0) |
| return; |
| _inSkipGroup++; |
| #if 1 //#i16185# fecking \bin keyword |
| do |
| { |
| switch (nNextCh) |
| { |
| case '{': |
| ++nBrackets; |
| break; |
| case '}': |
| if (!--nBrackets) { |
| _inSkipGroup--; |
| return; |
| } |
| break; |
| } |
| int nToken = _GetNextToken(); |
| if (nToken == RTF_BIN) |
| { |
| rInput.SeekRel(-1); |
| rInput.SeekRel(nTokenValue); |
| nNextCh = GetNextChar(); |
| } |
| while (nNextCh==0xa || nNextCh==0xd) |
| { |
| nNextCh = GetNextChar(); |
| } |
| } while (sal_Unicode(EOF) != nNextCh && IsParserWorking()); |
| #else |
| sal_Unicode cPrev = 0; |
| do { |
| switch( nNextCh ) |
| { |
| case '{': |
| if( '\\' != cPrev ) |
| ++nBrackets; |
| break; |
| |
| case '}': |
| if( '\\' != cPrev && !--nBrackets ) |
| return; |
| break; |
| |
| case '\\': |
| if( '\\' == cPrev ) |
| nNextCh = 0; |
| break; |
| } |
| cPrev = nNextCh; |
| nNextCh = GetNextChar(); |
| } while( sal_Unicode(EOF) != nNextCh && IsParserWorking() ); |
| #endif |
| |
| if( SVPAR_PENDING != eState && '}' != nNextCh ) |
| eState = SVPAR_ERROR; |
| _inSkipGroup--; |
| } |
| |
| void SvRTFParser::ReadUnknownData() { SkipGroup(); } |
| void SvRTFParser::ReadBitmapData() { SkipGroup(); } |
| void SvRTFParser::ReadOLEData() { SkipGroup(); } |
| |
| |
| SvParserState SvRTFParser::CallParser() |
| { |
| sal_Char cFirstCh; |
| nNextChPos = rInput.Tell(); |
| rInput >> cFirstCh; nNextCh = cFirstCh; |
| eState = SVPAR_WORKING; |
| nOpenBrakets = 0; |
| SetSrcEncoding( eCodeSet = RTL_TEXTENCODING_MS_1252 ); |
| eUNICodeSet = RTL_TEXTENCODING_MS_1252; // default ist ANSI-CodeSet |
| |
| // die 1. beiden Token muessen '{' und \\rtf sein !! |
| if( '{' == GetNextToken() && RTF_RTF == GetNextToken() ) |
| { |
| AddRef(); |
| Continue( 0 ); |
| if( SVPAR_PENDING != eState ) |
| ReleaseRef(); // dann brauchen wir den Parser nicht mehr! |
| } |
| else |
| eState = SVPAR_ERROR; |
| |
| return eState; |
| } |
| |
| void SvRTFParser::Continue( int nToken ) |
| { |
| // DBG_ASSERT( SVPAR_CS_DONTKNOW == GetCharSet(), |
| // "Zeichensatz wurde geaendert." ); |
| |
| if( !nToken ) |
| nToken = GetNextToken(); |
| |
| while( IsParserWorking() ) |
| { |
| SaveState( nToken ); |
| switch( nToken ) |
| { |
| case '}': |
| if( nOpenBrakets ) |
| goto NEXTTOKEN; |
| eState = SVPAR_ACCEPTED; |
| break; |
| |
| case '{': |
| // eine unbekannte Gruppe ? |
| { |
| if( RTF_IGNOREFLAG != GetNextToken() ) |
| nToken = SkipToken( -1 ); |
| else if( RTF_UNKNOWNCONTROL != GetNextToken() ) |
| nToken = SkipToken( -2 ); |
| else |
| { |
| // gleich herausfiltern |
| ReadUnknownData(); |
| nToken = GetNextToken(); |
| if( '}' != nToken ) |
| eState = SVPAR_ERROR; |
| break; // auf zum naechsten Token!! |
| } |
| } |
| goto NEXTTOKEN; |
| |
| case RTF_UNKNOWNCONTROL: |
| break; // unbekannte Token ueberspringen |
| case RTF_NEXTTYPE: |
| case RTF_ANSITYPE: |
| SetEncoding( eCodeSet = RTL_TEXTENCODING_MS_1252 ); |
| break; |
| case RTF_MACTYPE: |
| SetEncoding( eCodeSet = RTL_TEXTENCODING_APPLE_ROMAN ); |
| break; |
| case RTF_PCTYPE: |
| SetEncoding( eCodeSet = RTL_TEXTENCODING_IBM_437 ); |
| break; |
| case RTF_PCATYPE: |
| SetEncoding( eCodeSet = RTL_TEXTENCODING_IBM_850 ); |
| break; |
| case RTF_ANSICPG: |
| eCodeSet = rtl_getTextEncodingFromWindowsCodePage(nTokenValue); |
| SetEncoding(eCodeSet); |
| break; |
| default: |
| NEXTTOKEN: |
| NextToken( nToken ); |
| break; |
| } |
| if( IsParserWorking() ) |
| SaveState( 0 ); // bis hierhin abgearbeitet, |
| // weiter mit neuem Token! |
| nToken = GetNextToken(); |
| } |
| if( SVPAR_ACCEPTED == eState && 0 < nOpenBrakets ) |
| eState = SVPAR_ERROR; |
| } |
| |
| void SvRTFParser::SetEncoding( rtl_TextEncoding eEnc ) |
| { |
| if (eEnc == RTL_TEXTENCODING_DONTKNOW) |
| eEnc = GetCodeSet(); |
| |
| if (!aParserStates.empty()) |
| aParserStates.top().eCodeSet = eEnc; |
| SetSrcEncoding(eEnc); |
| } |
| |
#ifdef USED
// Only compiled when USED is defined: both functions simply forward to the
// SvParser implementation.
void SvRTFParser::SaveState( int nToken ) { SvParser::SaveState( nToken ); }

void SvRTFParser::RestoreState() { SvParser::RestoreState(); }
#endif
| |
| /* vi:set tabstop=4 shiftwidth=4 expandtab: */ |