| /************************************************************** |
| * |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| * |
| *************************************************************/ |
| |
| |
| |
| // MARKER(update_precomp.py): autogen include statement, do not remove |
| #include "precompiled_svtools.hxx" |
| |
| #include <ctype.h> |
| #include <stdio.h> |
| #include <tools/stream.hxx> |
| #include <tools/debug.hxx> |
| #include <tools/color.hxx> |
| #include <rtl/ustrbuf.hxx> |
| #include <rtl/strbuf.hxx> |
| #ifndef _SVSTDARR_HXX |
| #define _SVSTDARR_ULONGS |
| #include <svl/svstdarr.hxx> |
| #endif |
| |
| #include <tools/tenccvt.hxx> |
| #include <tools/datetime.hxx> |
| #include <svl/inettype.hxx> |
| #include <comphelper/string.hxx> |
| #include <com/sun/star/beans/PropertyAttribute.hpp> |
| #include <com/sun/star/document/XDocumentProperties.hpp> |
| |
| #include <svtools/parhtml.hxx> |
| #include <svtools/htmltokn.h> |
| #include <svtools/htmlkywd.hxx> |
| |
| |
| using namespace ::com::sun::star; |
| |
| |
| // Fill level at which the local scan buffers are flushed into aToken. |
| const sal_Int32 MAX_LEN = 1024; |
| // (a former file-static buffer; the scan functions use local OUStringBuffers instead) |
| //static sal_Unicode sTmpBuffer[ MAX_LEN+1 ]; |
| // NOTE(review): not referenced in the visible part of this file - confirm it is still used. |
| const sal_Int32 MAX_MACRO_LEN = 1024; |
| |
| // Maximum number of characters collected for a named entity following '&'. |
| const sal_Int32 MAX_ENTITY_LEN = 8; |
| |
| /* */ |
| |
| // Tables for mapping option value strings to their enum values |
| |
| // <INPUT TYPE=xxx>: value names of the TYPE option, used by HTMLOption::GetInputType() |
| static HTMLOptionEnum __READONLY_DATA aInputTypeOptEnums[] = |
| { |
| { OOO_STRING_SVTOOLS_HTML_IT_text, HTML_IT_TEXT }, |
| { OOO_STRING_SVTOOLS_HTML_IT_password, HTML_IT_PASSWORD }, |
| { OOO_STRING_SVTOOLS_HTML_IT_checkbox, HTML_IT_CHECKBOX }, |
| { OOO_STRING_SVTOOLS_HTML_IT_radio, HTML_IT_RADIO }, |
| { OOO_STRING_SVTOOLS_HTML_IT_range, HTML_IT_RANGE }, |
| { OOO_STRING_SVTOOLS_HTML_IT_scribble, HTML_IT_SCRIBBLE }, |
| { OOO_STRING_SVTOOLS_HTML_IT_file, HTML_IT_FILE }, |
| { OOO_STRING_SVTOOLS_HTML_IT_hidden, HTML_IT_HIDDEN }, |
| { OOO_STRING_SVTOOLS_HTML_IT_submit, HTML_IT_SUBMIT }, |
| { OOO_STRING_SVTOOLS_HTML_IT_image, HTML_IT_IMAGE }, |
| { OOO_STRING_SVTOOLS_HTML_IT_reset, HTML_IT_RESET }, |
| { OOO_STRING_SVTOOLS_HTML_IT_button, HTML_IT_BUTTON }, |
| { 0, 0 } /* end marker: HTMLOption::GetEnum() walks the table until pName is null */ |
| }; |
| |
| // <TABLE FRAME=xxx>: value names of the FRAME option, used by HTMLOption::GetTableFrame() |
| static HTMLOptionEnum __READONLY_DATA aTableFrameOptEnums[] = |
| { |
| { OOO_STRING_SVTOOLS_HTML_TF_void, HTML_TF_VOID }, |
| { OOO_STRING_SVTOOLS_HTML_TF_above, HTML_TF_ABOVE }, |
| { OOO_STRING_SVTOOLS_HTML_TF_below, HTML_TF_BELOW }, |
| { OOO_STRING_SVTOOLS_HTML_TF_hsides, HTML_TF_HSIDES }, |
| { OOO_STRING_SVTOOLS_HTML_TF_lhs, HTML_TF_LHS }, |
| { OOO_STRING_SVTOOLS_HTML_TF_rhs, HTML_TF_RHS }, |
| { OOO_STRING_SVTOOLS_HTML_TF_vsides, HTML_TF_VSIDES }, |
| { OOO_STRING_SVTOOLS_HTML_TF_box, HTML_TF_BOX }, |
| { OOO_STRING_SVTOOLS_HTML_TF_border, HTML_TF_BOX }, /* TF_border maps to the same value as TF_box */ |
| { 0, 0 } /* end marker: HTMLOption::GetEnum() walks the table until pName is null */ |
| }; |
| |
| // <TABLE RULES=xxx>: value names of the RULES option, used by HTMLOption::GetTableRules() |
| static HTMLOptionEnum __READONLY_DATA aTableRulesOptEnums[] = |
| { |
| { OOO_STRING_SVTOOLS_HTML_TR_none, HTML_TR_NONE }, |
| { OOO_STRING_SVTOOLS_HTML_TR_groups, HTML_TR_GROUPS }, |
| { OOO_STRING_SVTOOLS_HTML_TR_rows, HTML_TR_ROWS }, |
| { OOO_STRING_SVTOOLS_HTML_TR_cols, HTML_TR_COLS }, |
| { OOO_STRING_SVTOOLS_HTML_TR_all, HTML_TR_ALL }, |
| { 0, 0 } /* end marker: HTMLOption::GetEnum() walks the table until pName is null */ |
| }; |
| |
| |
| /* NOTE(review): presumably expands to the out-of-line implementation of the |
|  * HTMLOptions pointer array (element type HTMLOptionPtr); the macro is |
|  * defined outside this file - confirm. HTMLParser owns one such array |
|  * (pOptions) and empties it with DeleteAndDestroy(). |
|  */ SV_IMPL_PTRARR(HTMLOptions,HTMLOptionPtr) |
| |
| /* */ |
| |
| // Looks up this option's value in a name/value table using |
| // EqualsIgnoreCaseAscii(). Returns the nValue of the first matching entry, |
| // or nDflt when the end marker (entry with a null pName) is reached first. |
| sal_uInt16 HTMLOption::GetEnum( const HTMLOptionEnum *pOptEnums, sal_uInt16 nDflt ) const |
| { |
|     for( const HTMLOptionEnum *pEntry = pOptEnums; pEntry->pName; ++pEntry ) |
|     { |
|         if( aValue.EqualsIgnoreCaseAscii( pEntry->pName ) ) |
|             return pEntry->nValue; |
|     } |
| |
|     // ran into the end marker: no name matched |
|     return nDflt; |
| } |
| |
| // Variant of the table lookup that reports success: on a match rEnum receives |
| // the entry's nValue and a true value is returned; otherwise rEnum is left |
| // untouched and a false value is returned. |
| sal_Bool HTMLOption::GetEnum( sal_uInt16 &rEnum, const HTMLOptionEnum *pOptEnums ) const |
| { |
|     const HTMLOptionEnum *pEntry = pOptEnums; |
| |
|     // advance until a name matches or the end marker (null pName) is hit |
|     while( pEntry->pName && !aValue.EqualsIgnoreCaseAscii( pEntry->pName ) ) |
|         ++pEntry; |
| |
|     if( !pEntry->pName ) |
|         return sal_False; |
| |
|     rEnum = pEntry->nValue; |
|     return sal_True; |
| } |
| |
| // Stores the option's token id together with its token string and value. |
| HTMLOption::HTMLOption( sal_uInt16 nTok, const String& rToken, |
|                         const String& rValue ) |
|     : aValue( rValue ) |
|     , aToken( rToken ) |
|     , nToken( nTok ) |
| { |
|     // debug check: the id has to lie inside [HTML_OPTION_START, HTML_OPTION_END) |
|     DBG_ASSERT( HTML_OPTION_START<=nToken && HTML_OPTION_END>nToken, |
|                 "HTMLOption: unbekanntes Token" ); |
| } |
| |
| // Returns the option value converted with ToInt32() after EraseLeadingChars(); |
| // negative results are reported as 0. |
| sal_uInt32 HTMLOption::GetNumber() const |
| { |
|     DBG_ASSERT( (HTML_OPTION_NUMBER_START<=nToken && |
|                  HTML_OPTION_NUMBER_END>nToken) || |
|                 (HTML_OPTION_CONTEXT_START<=nToken && |
|                  HTML_OPTION_CONTEXT_END>nToken) || |
|                 HTML_O_VALUE==nToken, |
|                 "GetNumber: Option ist nicht numerisch" ); |
| |
|     String aStripped( aValue ); |
|     aStripped.EraseLeadingChars(); |
| |
|     const sal_Int32 nParsed = aStripped.ToInt32(); |
|     if( nParsed < 0 ) |
|         return 0; |
|     return (sal_uInt32)nParsed; |
| } |
| |
| // Signed counterpart of GetNumber(): returns the ToInt32() result of the |
| // value (after EraseLeadingChars()) without clamping negative numbers. |
| sal_Int32 HTMLOption::GetSNumber() const |
| { |
|     DBG_ASSERT( (HTML_OPTION_NUMBER_START<=nToken && HTML_OPTION_NUMBER_END>nToken) || |
|                 (HTML_OPTION_CONTEXT_START<=nToken && HTML_OPTION_CONTEXT_END>nToken), |
|                 "GetSNumber: Option ist nicht numerisch" ); |
| |
|     String aStripped( aValue ); |
|     aStripped.EraseLeadingChars(); |
| |
|     const sal_Int32 nResult = aStripped.ToInt32(); |
|     return nResult; |
| } |
| |
| // Replaces the contents of rLongs with the numbers found in the option value. |
| // |
| // bSpaceDelim set:   every run of decimal digits becomes one entry; all other |
| //                    characters only act as separators. |
| // bSpaceDelim unset: the value is split at ','. Each piece (after skipping |
| //                    leading blank/tab/CR/LF) is converted with ToInt32(); |
| //                    negative results are stored as 0, and a 0 is also stored |
| //                    when only white space remains up to the end of the value. |
| void HTMLOption::GetNumbers( SvULongs &rLongs, sal_Bool bSpaceDelim ) const |
| { |
|     // start from an empty result |
|     if( rLongs.Count() ) |
|         rLongs.Remove( 0, rLongs.Count() ); |
| |
|     const xub_StrLen nLen = aValue.Len(); |
| |
|     if( bSpaceDelim ) |
|     { |
|         // a deliberately simple scanner: it just picks all digits out of |
|         // the string |
|         sal_Bool bPending = sal_False;   // digits seen but not stored yet |
|         sal_uLong nAccum = 0; |
|         for( xub_StrLen nIdx = 0; nIdx < nLen; ++nIdx ) |
|         { |
|             const sal_Unicode c = aValue.GetChar( nIdx ); |
|             if( c>='0' && c<='9' ) |
|             { |
|                 nAccum = nAccum * 10 + (c - '0'); |
|                 bPending = sal_True; |
|                 continue; |
|             } |
| |
|             if( bPending ) |
|             { |
|                 rLongs.Insert( nAccum, rLongs.Count() ); |
|                 bPending = sal_False; |
|                 nAccum = 0; |
|             } |
|         } |
| |
|         // a number that runs up to the end of the string |
|         if( bPending ) |
|             rLongs.Insert( nAccum, rLongs.Count() ); |
| |
|         return; |
|     } |
| |
|     // here the numbers have to be separated properly by ',' and now and |
|     // then a 0 is inserted |
|     xub_StrLen nPos = 0; |
|     while( nPos < nLen ) |
|     { |
|         // skip white space in front of the next piece |
|         for( ; nPos < nLen; ++nPos ) |
|         { |
|             const sal_Unicode c = aValue.GetChar( nPos ); |
|             if( c != ' ' && c != '\t' && c != '\n' && c != '\r' ) |
|                 break; |
|         } |
| |
|         if( nPos == nLen ) |
|         { |
|             // nothing but white space was left: store a 0, the loop ends |
|             rLongs.Insert( sal_uLong(0), rLongs.Count() ); |
|             continue; |
|         } |
| |
|         const xub_StrLen nEnd = aValue.Search( (sal_Unicode)',', nPos ); |
|         const sal_Bool bLastPiece = ( STRING_NOTFOUND == nEnd ); |
| |
|         const sal_Int32 nTmp = bLastPiece |
|                 ? aValue.Copy( nPos ).ToInt32() |
|                 : aValue.Copy( nPos, nEnd-nPos ).ToInt32(); |
|         rLongs.Insert( nTmp >= 0 ? (sal_uInt32)nTmp : 0, |
|                        rLongs.Count() ); |
| |
|         // continue behind the comma, or stop after the last piece |
|         nPos = bLastPiece ? nLen : nEnd+1; |
|     } |
| } |
| |
| // Converts the option value into an RGB colour and stores it in rColor. |
| // Values that do not start with '#' are first passed (upper-cased) to |
| // GetHTMLColor(); when that yields ULONG_MAX, or the value starts with '#', |
| // the value is read as six hexadecimal digits. Missing or non-hex digits |
| // count as 0. |
| void HTMLOption::GetColor( Color& rColor ) const |
| { |
|     DBG_ASSERT( (nToken>=HTML_OPTION_COLOR_START && nToken<HTML_OPTION_COLOR_END) || nToken==HTML_O_SIZE, |
|                 "GetColor: Option spezifiziert keine Farbe" ); |
| |
|     String aUpper( aValue ); |
|     aUpper.ToUpperAscii(); |
| |
|     sal_uLong nRGB = ULONG_MAX; |
|     if( aUpper.GetChar( 0 ) != '#' ) |
|         nRGB = GetHTMLColor( aUpper ); |
| |
|     if( nRGB == ULONG_MAX ) |
|     { |
|         nRGB = 0; |
|         xub_StrLen nIdx = 0; |
|         for( sal_uInt32 nDigit = 0; nDigit < 6; ++nDigit ) |
|         { |
|             // MIB 26.06.97: modelled on Netscape - characters below '0' |
|             // (this includes a leading '#') are skipped, at most two of |
|             // them in front of each digit. Matches bug #40901#. |
|             sal_Unicode c = nIdx<aUpper.Len() ? aUpper.GetChar( nIdx++ ) |
|                                               : '0'; |
|             for( int nSkipped = 0; nSkipped < 2 && c < '0'; ++nSkipped ) |
|                 c = nIdx<aUpper.Len() ? aUpper.GetChar( nIdx++ ) : '0'; |
| |
|             nRGB *= 16; |
|             if( c >= '0' && c <= '9' ) |
|                 nRGB += (c - '0'); |
|             else if( c >= 'A' && c <= 'F' ) |
|                 nRGB += (c - 'A' + 10); |
|         } |
|     } |
| |
|     // split 0x00RRGGBB into its three channels |
|     rColor.SetRed(   (sal_uInt8)((nRGB >> 16) & 0xff) ); |
|     rColor.SetGreen( (sal_uInt8)((nRGB >> 8) & 0xff) ); |
|     rColor.SetBlue(  (sal_uInt8)(nRGB & 0xff) ); |
| } |
| |
| // Maps the value of a TYPE option to HTMLInputType via aInputTypeOptEnums; |
| // values not found in the table yield HTML_IT_TEXT. |
| HTMLInputType HTMLOption::GetInputType() const |
| { |
|     DBG_ASSERT( HTML_O_TYPE==nToken, "GetInputType: Option nicht TYPE" ); |
| |
|     const sal_uInt16 nType = GetEnum( aInputTypeOptEnums, HTML_IT_TEXT ); |
|     return (HTMLInputType)nType; |
| } |
| |
| // Maps the value of a FRAME option to HTMLTableFrame via aTableFrameOptEnums; |
| // values not found in the table yield HTML_TF_VOID. |
| HTMLTableFrame HTMLOption::GetTableFrame() const |
| { |
|     DBG_ASSERT( HTML_O_FRAME==nToken, "GetTableFrame: Option nicht FRAME" ); |
| |
|     const sal_uInt16 nFrame = GetEnum( aTableFrameOptEnums, HTML_TF_VOID ); |
|     return (HTMLTableFrame)nFrame; |
| } |
| |
| // Maps the value of a RULES option to HTMLTableRules via aTableRulesOptEnums; |
| // values not found in the table yield HTML_TR_NONE. |
| HTMLTableRules HTMLOption::GetTableRules() const |
| { |
|     DBG_ASSERT( HTML_O_RULES==nToken, "GetTableRules: Option nicht RULES" ); |
| |
|     const sal_uInt16 nRules = GetEnum( aTableRulesOptEnums, HTML_TR_NONE ); |
|     return (HTMLTableRules)nRules; |
| } |
| |
| /* */ |
| |
| // Sets up a parser reading from rIn: all special read modes are off, the |
| // parser starts out "in the header", and an empty option array is created. |
| HTMLParser::HTMLParser( SvStream& rIn, int bReadNewDoc ) |
|     : SvParser( rIn ) |
| { |
|     bNewDoc = bReadNewDoc; |
| |
|     // no raw / preformatted mode is active at the start |
|     bReadListing   = sal_False; |
|     bReadXMP       = sal_False; |
|     bReadPRE       = sal_False; |
|     bReadTextArea  = sal_False; |
|     bReadScript    = sal_False; |
|     bReadStyle     = sal_False; |
|     bEndTokenFound = sal_False; |
|     bIsInBody      = sal_False; |
|     bReadNextChar  = sal_False; |
|     bReadComment   = sal_False; |
| |
|     bIsInHeader = sal_True; |
|     pOptions = new HTMLOptions; |
| |
|     //#i76649, default to UTF-8 for HTML unless we know differently |
|     SetSrcEncoding( RTL_TEXTENCODING_UTF8 ); |
| } |
| |
| // Destroys any options still held in the array, then the array itself. |
| HTMLParser::~HTMLParser() |
| { |
|     if( pOptions ) |
|     { |
|         if( pOptions->Count() ) |
|             pOptions->DeleteAndDestroy( 0, pOptions->Count() ); |
|     } |
| |
|     delete pOptions; |
| } |
| |
| // Entry point of a parse run: switches to SVPAR_WORKING, reads the first |
| // character, resets the <PRE> bookkeeping and runs Continue(). |
| // |
| // The parser takes a reference on itself for the duration of the run and |
| // gives it back unless the run ended in SVPAR_PENDING. |
| // |
| // Returns the parser state reached at the end of the run. |
| SvParserState __EXPORT HTMLParser::CallParser() |
| { |
|     eState = SVPAR_WORKING; |
|     nNextCh = GetNextChar(); |
|     SaveState( 0 ); |
| |
|     nPre_LinePos = 0; |
|     bPre_IgnoreNewPara = sal_False; |
| |
|     AddRef(); |
|     Continue( 0 ); |
| |
|     // Take the result before releasing our own reference. |
|     // NOTE(review): ReleaseRef() presumably destroys the object when this was |
|     // the last reference; reading the member eState afterwards (as the code |
|     // did before) would then touch freed memory - confirm against SvRefBase. |
|     const SvParserState eResult = eState; |
|     if( SVPAR_PENDING != eResult ) |
|         ReleaseRef();       // then we do not need the parser any more! |
| |
|     return eResult; |
| } |
| |
| // Token loop of the parser. While the parser is working, each token is |
| // saved, run through FilterToken() and - if the filter leaves a non-zero |
| // token - handed to NextToken(); afterwards the next token is fetched. |
| // A zero nToken on entry means the first token still has to be read. |
| void HTMLParser::Continue( int nToken ) |
| { |
|     if( 0 == nToken ) |
|         nToken = GetNextToken(); |
| |
|     for( ; IsParserWorking(); nToken = GetNextToken() ) |
|     { |
|         SaveState( nToken ); |
| |
|         const int nFiltered = FilterToken( nToken ); |
|         if( nFiltered ) |
|             NextToken( nFiltered ); |
| |
|         // processed up to this point, carry on with a new token |
|         if( IsParserWorking() ) |
|             SaveState( 0 ); |
|     } |
| } |
| |
| // Updates the parser's document state (header/body flags, PRE/LISTING/XMP |
| // modes) for nToken and returns the token to pass on. A return value of 0 |
| // means the token is not passed on. |
| int HTMLParser::FilterToken( int nToken ) |
| { |
|     switch( nToken ) |
|     { |
|     case sal_Unicode(EOF): |
|         // do not pass on |
|         nToken = 0; |
|         break; |
| |
|     case HTML_HEAD_OFF: |
|         bIsInBody = sal_True; |
|         bIsInHeader = sal_False; |
|         break; |
| |
|     case HTML_HEAD_ON: |
|         bIsInHeader = sal_True; |
|         break; |
| |
|     case HTML_BODY_ON: |
|         bIsInHeader = sal_False; |
|         bIsInBody = sal_True; |
|         break; |
| |
|     case HTML_FRAMESET_ON: |
|         bIsInHeader = sal_False; |
|         bIsInBody = sal_False; |
|         break; |
| |
|     case HTML_BODY_OFF: |
|         bIsInBody = sal_False; |
|         bReadPRE = sal_False; |
|         bReadListing = sal_False; |
|         bReadXMP = sal_False; |
|         break; |
| |
|     case HTML_HTML_OFF: |
|         // HTML_ON was not passed on either! |
|         nToken = 0; |
|         bReadPRE = sal_False; |
|         bReadListing = sal_False; |
|         bReadXMP = sal_False; |
|         break; |
| |
|     case HTML_PREFORMTXT_ON: |
|         StartPRE(); |
|         break; |
| |
|     case HTML_PREFORMTXT_OFF: |
|         FinishPRE(); |
|         break; |
| |
|     case HTML_LISTING_ON: |
|         StartListing(); |
|         break; |
| |
|     case HTML_LISTING_OFF: |
|         FinishListing(); |
|         break; |
| |
|     case HTML_XMP_ON: |
|         StartXMP(); |
|         break; |
| |
|     case HTML_XMP_OFF: |
|         FinishXMP(); |
|         break; |
| |
|     default: |
|         // inside one of the preformatted modes its own filter decides |
|         if( bReadPRE ) |
|             nToken = FilterPRE( nToken ); |
|         else if( bReadListing ) |
|             nToken = FilterListing( nToken ); |
|         else if( bReadXMP ) |
|             nToken = FilterXMP( nToken ); |
|         break; |
|     } |
| |
|     return nToken; |
| } |
| |
| // ASCII-only character classification helpers for the scanner. |
| // Every use of the parameter is parenthesised so that an expression argument |
| // (e.g. "a ? b : c") is classified as a whole. The argument is still |
| // evaluated more than once, so it must not have side effects. |
| #define HTML_ISDIGIT( c ) ( (c) >= '0' && (c) <= '9' ) |
| #define HTML_ISALPHA( c ) ( ((c) >= 'A' && (c) <= 'Z') || ((c) >= 'a' && (c) <= 'z') ) |
| #define HTML_ISALNUM( c ) ( HTML_ISALPHA(c) || HTML_ISDIGIT(c) ) |
| #define HTML_ISSPACE( c ) ( ' ' == (c) || ((c) >= 0x09 && (c) <= 0x0d) ) |
| #define HTML_ISPRINTABLE( c ) ( (c) >= 32 && (c) != 127 ) |
| // --> OD 2006-07-26 #138464# |
| #define HTML_ISHEXDIGIT( c ) ( HTML_ISDIGIT(c) || ((c) >= 'A' && (c) <= 'F') || ((c) >= 'a' && (c) <= 'f') ) |
| // <-- |
| |
| /* Scans text starting at the current character nNextCh and appends it to |
|  * aToken. Characters are collected in a local buffer that is flushed into |
|  * aToken whenever it holds MAX_LEN characters and once more at the end. |
|  * |
|  * cBreak: character that ends the scan when it is met outside of quotes. |
|  *         The value '>' additionally selects "inside a tag" handling: |
|  *         quotes are tracked after '=', a backslash is doubled, and |
|  *         backslash, quote or space characters produced by a character |
|  *         reference are prefixed with a backslash. |
|  * |
|  * '&' starts a character reference (decimal, hexadecimal or named via |
|  * GetHTMLCharName()); this is skipped while bReadXMP is set. Outside of |
|  * tags and of the LISTING/XMP/PRE/TEXTAREA modes, runs of white space are |
|  * reduced to a single blank. |
|  * |
|  * Returns HTML_TEXTTOKEN in the normal case, HTML_NONBREAKSPACE or |
|  * HTML_SOFTHYPH when a named reference yields the codes 1 or 2 outside a |
|  * tag and no text has been collected yet, and 0 when nothing but blanks was |
|  * read before the end of the input. |
|  */ int HTMLParser::ScanText( const sal_Unicode cBreak ) |
| { |
| ::rtl::OUStringBuffer sTmpBuffer( MAX_LEN ); |
| int bWeiter = sal_True; /* "weiter" = continue: cleared to leave the scan loop */ |
| int bEqSignFound = sal_False; |
| sal_Unicode cQuote = 0U; |
| |
| while( bWeiter && IsParserWorking() ) |
| { |
| int bNextCh = sal_True; /* whether the next character is fetched at the end of this round */ |
| switch( nNextCh ) |
| { |
| case '&': |
| bEqSignFound = sal_False; |
| if( bReadXMP ) |
| sTmpBuffer.append( (sal_Unicode)'&' ); |
| else |
| { |
| sal_uLong nStreamPos = rInput.Tell(); |
| sal_uLong nLinePos = GetLinePos(); |
| |
| sal_Unicode cChar = 0U; |
| if( '#' == (nNextCh = GetNextChar()) ) |
| { |
| nNextCh = GetNextChar(); |
| // --> OD 2006-07-26 #138464# |
| // consider hexadecimal digits |
| const sal_Bool bIsHex( 'x' == nNextCh ); |
| const sal_Bool bIsDecOrHex( bIsHex || HTML_ISDIGIT(nNextCh) ); |
| if ( bIsDecOrHex ) |
| { |
| if ( bIsHex ) |
| { |
| nNextCh = GetNextChar(); |
| while ( HTML_ISHEXDIGIT(nNextCh) ) |
| { |
| cChar = cChar * 16U + |
| ( nNextCh <= '9' |
| ? sal_Unicode( nNextCh - '0' ) |
| : ( nNextCh <= 'F' |
| ? sal_Unicode( nNextCh - 'A' + 10 ) |
| : sal_Unicode( nNextCh - 'a' + 10 ) ) ); |
| nNextCh = GetNextChar(); |
| } |
| } |
| else |
| { |
| do |
| { |
| cChar = cChar * 10U + sal_Unicode( nNextCh - '0'); |
| nNextCh = GetNextChar(); |
| } |
| while( HTML_ISDIGIT(nNextCh) ); |
| } |
| |
| if( RTL_TEXTENCODING_DONTKNOW != eSrcEnc && |
| RTL_TEXTENCODING_UCS2 != eSrcEnc && |
| RTL_TEXTENCODING_UTF8 != eSrcEnc && |
| cChar < 256 ) |
| { |
| sal_Unicode cOrig = cChar; |
| cChar = ByteString::ConvertToUnicode( |
| (sal_Char)cChar, eSrcEnc ); |
| if( 0U == cChar ) |
| { |
| // #73398#: If the character could not be |
| // converted, because a conversion is not |
| // available, do no conversion at all. |
| cChar = cOrig; |
| } |
| } |
| } |
| // <-- |
| else |
| nNextCh = 0U; |
| } |
| else if( HTML_ISALPHA( nNextCh ) ) |
| { |
| ::rtl::OUStringBuffer sEntityBuffer( MAX_ENTITY_LEN ); |
| xub_StrLen nPos = 0L; |
| do |
| { |
| sEntityBuffer.append( nNextCh ); |
| nPos++; |
| nNextCh = GetNextChar(); |
| } |
| while( nPos < MAX_ENTITY_LEN && HTML_ISALNUM( nNextCh ) && |
| !rInput.IsEof() ); |
| |
| if( IsParserWorking() && !rInput.IsEof() ) |
| { |
| String sEntity( sEntityBuffer.getStr(), nPos ); |
| cChar = GetHTMLCharName( sEntity ); |
| |
| // not found ( == 0 ), then plain text |
| // or a character that is inserted as an |
| // attribute |
| if( 0U == cChar && ';' != nNextCh ) |
| { |
| DBG_ASSERT( rInput.Tell() - nStreamPos == |
| (sal_uLong)(nPos+1L)*GetCharSize(), |
| "UTF-8 geht hier schief" ); |
| for( xub_StrLen i=nPos-1L; i>1L; i-- ) |
| { |
| nNextCh = sEntityBuffer[i]; |
| sEntityBuffer.setLength( i ); |
| sEntity.Assign( sEntityBuffer.getStr(), i ); |
| cChar = GetHTMLCharName( sEntity ); |
| if( cChar ) |
| { |
| rInput.SeekRel( -(long) |
| ((nPos-i)*GetCharSize()) ); |
| nlLinePos -= sal_uInt32(nPos-i); |
| nPos = i; |
| ClearTxtConvContext(); |
| break; |
| } |
| } |
| } |
| |
| if( !cChar ) // unknown character? |
| { |
| // then go back in the stream, insert the '&' as a |
| // character and resume with the character that |
| // follows it |
| sTmpBuffer.append( (sal_Unicode)'&' ); |
| |
| // rInput.SeekRel( -(long)(++nPos*GetCharSize()) ); |
| // nlLinePos -= nPos; |
| DBG_ASSERT( rInput.Tell()-nStreamPos == |
| (sal_uLong)(nPos+1)*GetCharSize(), |
| "Falsche Stream-Position" ); |
| DBG_ASSERT( nlLinePos-nLinePos == |
| (sal_uLong)(nPos+1), |
| "Falsche Zeilen-Position" ); |
| rInput.Seek( nStreamPos ); |
| nlLinePos = nLinePos; |
| ClearTxtConvContext(); |
| break; |
| } |
| |
| // 1 == Non Breaking Space |
| // 2 == SoftHyphen |
| |
| if( cChar < 3U ) |
| { |
| if( '>' == cBreak ) |
| { |
| // When the content of a tag is being read, we |
| // have to turn it into a space or a '-' |
| switch( cChar ) |
| { |
| case 1U: cChar = ' '; break; |
| case 2U: cChar = '-'; break; |
| default: |
| DBG_ASSERT( cChar==1U, |
| "\0x00 sollte doch schon laengt abgefangen sein!" ); |
| break; |
| } |
| } |
| else |
| { |
| // When no tag is being scanned, return the |
| // corresponding token |
| aToken += |
| String( sTmpBuffer.makeStringAndClear() ); |
| if( cChar ) |
| { |
| if( aToken.Len() ) |
| { |
| // resume with this character |
| nNextCh = '&'; |
| // rInput.SeekRel( -(long)(++nPos*GetCharSize()) ); |
| // nlLinePos -= nPos; |
| DBG_ASSERT( rInput.Tell()-nStreamPos == |
| (sal_uLong)(nPos+1)*GetCharSize(), |
| "Falsche Stream-Position" ); |
| DBG_ASSERT( nlLinePos-nLinePos == |
| (sal_uLong)(nPos+1), |
| "Falsche Zeilen-Position" ); |
| rInput.Seek( nStreamPos ); |
| nlLinePos = nLinePos; |
| ClearTxtConvContext(); |
| return HTML_TEXTTOKEN; |
| } |
| |
| // Hack: _GetNextChar should not read the |
| // next character |
| if( ';' != nNextCh ) |
| aToken += ' '; |
| if( 1U == cChar ) |
| return HTML_NONBREAKSPACE; |
| if( 2U == cChar ) |
| return HTML_SOFTHYPH; |
| } |
| aToken += (sal_Unicode)'&'; |
| aToken += |
| String(sEntityBuffer.makeStringAndClear()); |
| break; |
| } |
| } |
| } |
| else |
| nNextCh = 0U; |
| } |
| // MIB 03/02/2000: &{...};-JavaScript-Macros are not |
| // supported any longer. |
| else if( IsParserWorking() ) |
| { |
| sTmpBuffer.append( (sal_Unicode)'&' ); |
| bNextCh = sal_False; |
| break; |
| } |
| |
| bNextCh = (';' == nNextCh); |
| if( cBreak=='>' && (cChar=='\\' || cChar=='\'' || |
| cChar=='\"' || cChar==' ') ) |
| { |
| // Inside tags, ' and " have to be marked with a backslash |
| // to tell them apart from ' and " used as brackets |
| // around options. Consequently a backslash itself has to |
| // be marked as well. In addition we protect a space, |
| // because here it is not a separator |
| // between options. |
| sTmpBuffer.append( (sal_Unicode)'\\' ); |
| if( MAX_LEN == sTmpBuffer.getLength() ) |
| aToken += String(sTmpBuffer.makeStringAndClear()); |
| } |
| if( IsParserWorking() ) |
| { |
| if( cChar ) |
| sTmpBuffer.append( cChar ); |
| } |
| else if( SVPAR_PENDING==eState && '>'!=cBreak ) |
| { |
| // Resume with the '&' character; the rest is |
| // returned as a text token. |
| if( aToken.Len() || sTmpBuffer.getLength() ) |
| { |
| // The text read so far is returned by _GetNextChar() |
| // and on the next call a new character is |
| // read. So we still have to position ourselves |
| // in front of the &. |
| nNextCh = 0U; |
| rInput.Seek( nStreamPos-(sal_uInt32)GetCharSize() ); |
| nlLinePos = nLinePos-1; |
| ClearTxtConvContext(); |
| bReadNextChar = sal_True; |
| } |
| bNextCh = sal_False; |
| } |
| } |
| break; |
| case '=': |
| if( '>'==cBreak && !cQuote ) |
| bEqSignFound = sal_True; |
| sTmpBuffer.append( nNextCh ); |
| break; |
| |
| case '\\': |
| if( '>'==cBreak ) |
| { |
| // mark it inside tags |
| sTmpBuffer.append( (sal_Unicode)'\\' ); |
| if( MAX_LEN == sTmpBuffer.getLength() ) |
| aToken += String(sTmpBuffer.makeStringAndClear()); |
| } |
| sTmpBuffer.append( (sal_Unicode)'\\' ); |
| break; |
| |
| case '\"': |
| case '\'': |
| if( '>'==cBreak ) |
| { |
| if( bEqSignFound ) |
| cQuote = nNextCh; |
| else if( cQuote && (cQuote==nNextCh ) ) |
| cQuote = 0U; |
| } |
| sTmpBuffer.append( nNextCh ); |
| bEqSignFound = sal_False; |
| break; |
| |
| case sal_Unicode(EOF): |
| if( rInput.IsEof() ) |
| { |
| // MIB 20.11.98: This makes no sense here, or does it: at least |
| // abcä<EOF> is not displayed, so we leave this out from now on. |
| // if( '>' != cBreak ) |
| // eState = SVPAR_ACCEPTED; |
| bWeiter = sal_False; |
| } |
| else |
| { |
| sTmpBuffer.append( nNextCh ); |
| } |
| break; |
| |
| case '<': |
| bEqSignFound = sal_False; |
| if( '>'==cBreak ) |
| sTmpBuffer.append( nNextCh ); |
| else |
| bWeiter = sal_False; // stop, the string is complete |
| break; |
| |
| case '\f': |
| if( '>' == cBreak ) |
| { |
| // When scanning options, treat it like a space |
| sTmpBuffer.append( (sal_Unicode)' ' ); |
| } |
| else |
| { |
| // otherwise it becomes a token of its own |
| bWeiter = sal_False; |
| } |
| break; |
| |
| case '\r': |
| case '\n': |
| if( '>'==cBreak ) |
| { |
| // #26979# cr/lf inside a tag is handled in _GetNextToken() |
| sTmpBuffer.append( nNextCh ); |
| break; |
| } |
| else if( bReadListing || bReadXMP || bReadPRE || bReadTextArea ) |
| { |
| bWeiter = sal_False; |
| break; |
| } |
| // Bug 18984: CR-LF -> blank |
| // Convert a sequence of CR/LF/BLANK/TAB into a single blank only |
| // no break!! |
| case '\t': |
| if( '\t'==nNextCh && bReadPRE && '>'!=cBreak ) |
| { |
| // In <PRE>: pass tabs up to the caller |
| bWeiter = sal_False; |
| break; |
| } |
| // no break |
| case '\x0b': |
| if( '\x0b'==nNextCh && (bReadPRE || bReadXMP ||bReadListing) && |
| '>'!=cBreak ) |
| { |
| break; |
| } |
| nNextCh = ' '; |
| // no break; |
| case ' ': |
| sTmpBuffer.append( nNextCh ); |
| if( '>'!=cBreak && (!bReadListing && !bReadXMP && |
| !bReadPRE && !bReadTextArea) ) |
| { |
| // convert all sequences of blanks/tabs/CR/LF into one blank |
| do { |
| if( sal_Unicode(EOF) == (nNextCh = GetNextChar()) && |
| rInput.IsEof() ) |
| { |
| if( aToken.Len() || sTmpBuffer.getLength() > 1L ) |
| { |
| // something besides the blanks has been read |
| aToken += String(sTmpBuffer.makeStringAndClear()); |
| return HTML_TEXTTOKEN; |
| } |
| else |
| // only blanks read: then no more text may be |
| // returned and _GetNextToken has to |
| // run into EOF |
| return 0; |
| } |
| } while ( ' ' == nNextCh || '\t' == nNextCh || |
| '\r' == nNextCh || '\n' == nNextCh || |
| '\x0b' == nNextCh ); |
| bNextCh = sal_False; |
| } |
| break; |
| |
| default: |
| bEqSignFound = sal_False; |
| if( (nNextCh==cBreak && !cQuote) || |
| (sal_uLong(aToken.Len()) + MAX_LEN) > sal_uLong(STRING_MAXLEN & ~1 )) |
| bWeiter = sal_False; |
| else |
| { |
| do { |
| // all other characters go into the text |
| sTmpBuffer.append( nNextCh ); |
| if( MAX_LEN == sTmpBuffer.getLength() ) |
| { |
| aToken += String(sTmpBuffer.makeStringAndClear()); |
| if( (sal_uLong(aToken.Len()) + MAX_LEN) > |
| sal_uLong(STRING_MAXLEN & ~1 ) ) |
| { |
| nNextCh = GetNextChar(); |
| return HTML_TEXTTOKEN; |
| } |
| } |
| if( ( sal_Unicode(EOF) == (nNextCh = GetNextChar()) && |
| rInput.IsEof() ) || |
| !IsParserWorking() ) |
| { |
| if( sTmpBuffer.getLength() ) |
| aToken += String(sTmpBuffer.makeStringAndClear()); |
| return HTML_TEXTTOKEN; |
| } |
| } while( HTML_ISALPHA( nNextCh ) || HTML_ISDIGIT( nNextCh ) ); |
| bNextCh = sal_False; |
| } |
| } |
| |
| if( MAX_LEN == sTmpBuffer.getLength() ) |
| aToken += String(sTmpBuffer.makeStringAndClear()); |
| |
| if( bWeiter && bNextCh ) |
| nNextCh = GetNextChar(); |
| } |
| |
| if( sTmpBuffer.getLength() ) |
| aToken += String(sTmpBuffer.makeStringAndClear()); |
| |
| return HTML_TEXTTOKEN; |
| } |
| |
| /* Reads raw data without interpreting it; _GetNextToken() calls this while |
|  * bReadScript, bReadStyle or a non-empty aEndToken is active. |
|  * |
|  * Text is appended to aToken until a line break, the end of the input or |
|  * the terminating tag is met. When the terminating tag is recognised, the |
|  * stream is set back to its '<'. If text is pending at that point, |
|  * bEndTokenFound is set so that the raw mode is only left on the next |
|  * call. |
|  * |
|  * Returns HTML_RAWDATA by default, and 0 when the raw mode was left without |
|  * pending text or when the parser is no longer working. |
|  */ int HTMLParser::_GetNextRawToken() |
| { |
| ::rtl::OUStringBuffer sTmpBuffer( MAX_LEN ); |
| |
| if( bEndTokenFound ) |
| { |
| // on the last call we already found the end token, |
| // so we do not have to search for it again |
| bReadScript = sal_False; |
| bReadStyle = sal_False; |
| aEndToken.Erase(); |
| bEndTokenFound = sal_False; |
| |
| return 0; |
| } |
| |
| // by default we return HTML_RAWDATA |
| int bWeiter = sal_True; |
| int nToken = HTML_RAWDATA; |
| SaveState( 0 ); |
| while( bWeiter && IsParserWorking() ) |
| { |
| int bNextCh = sal_True; |
| switch( nNextCh ) |
| { |
| case '<': |
| { |
| // Maybe we have reached the end |
| |
| // first save what has been read so far |
| aToken += String(sTmpBuffer.makeStringAndClear()); |
| |
| // and remember the position in the stream |
| sal_uLong nStreamPos = rInput.Tell(); |
| sal_uLong nLineNr = GetLineNr(); |
| sal_uLong nLinePos = GetLinePos(); |
| |
| // Start of an end token? |
| int bOffState = sal_False; |
| if( '/' == (nNextCh = GetNextChar()) ) |
| { |
| bOffState = sal_True; |
| nNextCh = GetNextChar(); |
| } |
| else if( '!' == nNextCh ) |
| { |
| sTmpBuffer.append( nNextCh ); |
| nNextCh = GetNextChar(); |
| } |
| |
| // now read the letters that follow |
| while( (HTML_ISALPHA(nNextCh) || '-'==nNextCh) && |
| IsParserWorking() && sTmpBuffer.getLength() < MAX_LEN ) |
| { |
| sTmpBuffer.append( nNextCh ); |
| nNextCh = GetNextChar(); |
| } |
| |
| String aTok( sTmpBuffer.getStr(), |
| sal::static_int_cast< xub_StrLen >( |
| sTmpBuffer.getLength()) ); |
| aTok.ToUpperAscii(); |
| sal_Bool bDone = sal_False; |
| if( bReadScript || aEndToken.Len() ) |
| { |
| if( !bReadComment ) |
| { |
| if( aTok.CompareToAscii( OOO_STRING_SVTOOLS_HTML_comment, 3 ) |
| == COMPARE_EQUAL ) |
| { |
| bReadComment = sal_True; |
| } |
| else |
| { |
| // a script has to end with "</SCRIPT>", although |
| // for safety reasons we are not that strict |
| // about the ">" for now |
| bDone = bOffState && // '>'==nNextCh && |
| COMPARE_EQUAL == ( bReadScript |
| ? aTok.CompareToAscii(OOO_STRING_SVTOOLS_HTML_script) |
| : aTok.CompareTo(aEndToken) ); |
| } |
| } |
| if( bReadComment && '>'==nNextCh && aTok.Len() >= 2 && |
| aTok.Copy( aTok.Len()-2 ).EqualsAscii( "--" ) ) |
| { |
| // a comment of the form <!-----> ends here |
| bReadComment = sal_False; |
| } |
| } |
| else |
| { |
| // a style sheet can end with </STYLE>, </HEAD> or |
| // <BODY> |
| if( bOffState ) |
| bDone = aTok.CompareToAscii(OOO_STRING_SVTOOLS_HTML_style) |
| == COMPARE_EQUAL || |
| aTok.CompareToAscii(OOO_STRING_SVTOOLS_HTML_head) |
| == COMPARE_EQUAL; |
| else |
| bDone = |
| aTok.CompareToAscii(OOO_STRING_SVTOOLS_HTML_body) == COMPARE_EQUAL; |
| } |
| |
| if( bDone ) |
| { |
| // that's it; now we may have to return the string |
| // read so far and then continue |
| // normally |
| |
| bWeiter = sal_False; |
| |
| // nToken==0 means that _GetNextToken continues reading right away |
| if( !aToken.Len() && (bReadStyle || bReadScript) ) |
| { |
| // we can end the environment immediately and |
| // parse the end token |
| bReadScript = sal_False; |
| bReadStyle = sal_False; |
| aEndToken.Erase(); |
| nToken = 0; |
| } |
| else |
| { |
| // we have to keep bReadScript/bReadStyle alive |
| // for now and can parse the end token only |
| // on the next call |
| bEndTokenFound = sal_True; |
| } |
| |
| // now we go back in the stream to the '<' |
| rInput.Seek( nStreamPos ); |
| SetLineNr( nLineNr ); |
| SetLinePos( nLinePos ); |
| ClearTxtConvContext(); |
| nNextCh = '<'; |
| |
| // we do not want to append the string to the token |
| sTmpBuffer.setLength( 0L ); |
| } |
| else |
| { |
| // remember "</"; everything else is still in the buffer |
| aToken += (sal_Unicode)'<'; |
| if( bOffState ) |
| aToken += (sal_Unicode)'/'; |
| |
| bNextCh = sal_False; |
| } |
| } |
| break; |
| case '-': |
| sTmpBuffer.append( nNextCh ); |
| if( bReadComment ) |
| { |
| sal_Bool bTwoMinus = sal_False; |
| nNextCh = GetNextChar(); |
| while( '-' == nNextCh && IsParserWorking() ) |
| { |
| bTwoMinus = sal_True; |
| |
| if( MAX_LEN == sTmpBuffer.getLength() ) |
| aToken += String(sTmpBuffer.makeStringAndClear()); |
| sTmpBuffer.append( nNextCh ); |
| nNextCh = GetNextChar(); |
| } |
| |
| if( '>' == nNextCh && IsParserWorking() && bTwoMinus ) |
| bReadComment = sal_False; |
| |
| bNextCh = sal_False; |
| } |
| break; |
| |
| case '\r': |
| // \r\n? ends the current text token (even if it is empty) |
| nNextCh = GetNextChar(); |
| if( nNextCh=='\n' ) |
| nNextCh = GetNextChar(); |
| bWeiter = sal_False; |
| break; |
| case '\n': |
| // \n ends the current text token (even if it is empty) |
| nNextCh = GetNextChar(); |
| bWeiter = sal_False; |
| break; |
| case sal_Unicode(EOF): |
| // eof ends the current text token and pretends that |
| // an end token has been read |
| if( rInput.IsEof() ) |
| { |
| bWeiter = sal_False; |
| if( aToken.Len() || sTmpBuffer.getLength() ) |
| { |
| bEndTokenFound = sal_True; |
| } |
| else |
| { |
| bReadScript = sal_False; |
| bReadStyle = sal_False; |
| aEndToken.Erase(); |
| nToken = 0; |
| } |
| break; |
| } |
| // no break |
| default: |
| // all other characters end up in the buffer |
| sTmpBuffer.append( nNextCh ); |
| break; |
| } |
| |
| if( (!bWeiter && sTmpBuffer.getLength() > 0L) || |
| MAX_LEN == sTmpBuffer.getLength() ) |
| aToken += String(sTmpBuffer.makeStringAndClear()); |
| |
| if( bWeiter && bNextCh ) |
| nNextCh = GetNextChar(); |
| } |
| |
| if( IsParserWorking() ) |
| SaveState( 0 ); |
| else |
| nToken = 0; |
| |
| return nToken; |
| } |
| |
| // scan the next token |
| int __EXPORT HTMLParser::_GetNextToken() |
| { |
| int nRet = 0; |
| sSaveToken.Erase(); |
| |
| // die Optionen loeschen |
| if( pOptions->Count() ) |
| pOptions->DeleteAndDestroy( 0, pOptions->Count() ); |
| |
| if( !IsParserWorking() ) // wenn schon Fehler, dann nicht weiter! |
| return 0; |
| |
| sal_Bool bReadNextCharSave = bReadNextChar; |
| if( bReadNextChar ) |
| { |
| DBG_ASSERT( !bEndTokenFound, |
| "</SCRIPT> gelesen und trotzdem noch ein Zeichen lesen?" ); |
| nNextCh = GetNextChar(); |
| if( !IsParserWorking() ) // wenn schon Fehler, dann nicht weiter! |
| return 0; |
| bReadNextChar = sal_False; |
| } |
| |
| if( bReadScript || bReadStyle || aEndToken.Len() ) |
| { |
| nRet = _GetNextRawToken(); |
| if( nRet || !IsParserWorking() ) |
| return nRet; |
| } |
| |
| do { |
| int bNextCh = sal_True; |
| switch( nNextCh ) |
| { |
| case '<': |
| { |
| sal_uLong nStreamPos = rInput.Tell(); |
| sal_uLong nLineNr = GetLineNr(); |
| sal_uLong nLinePos = GetLinePos(); |
| |
| int bOffState = sal_False; |
| if( '/' == (nNextCh = GetNextChar()) ) |
| { |
| bOffState = sal_True; |
| nNextCh = GetNextChar(); |
| } |
| if( HTML_ISALPHA( nNextCh ) || '!'==nNextCh ) // fix #26984# |
| { |
| ::rtl::OUStringBuffer sTmpBuffer; |
| do { |
| sTmpBuffer.append( nNextCh ); |
| if( MAX_LEN == sTmpBuffer.getLength() ) |
| aToken += String(sTmpBuffer.makeStringAndClear()); |
| nNextCh = GetNextChar(); |
| } while( '>' != nNextCh && !HTML_ISSPACE( nNextCh ) && |
| IsParserWorking() && !rInput.IsEof() ); |
| |
| if( sTmpBuffer.getLength() ) |
| aToken += String(sTmpBuffer.makeStringAndClear()); |
| |
| // Blanks ueberlesen |
| while( HTML_ISSPACE( nNextCh ) && IsParserWorking() ) |
| nNextCh = GetNextChar(); |
| |
| if( !IsParserWorking() ) |
| { |
| if( SVPAR_PENDING == eState ) |
| bReadNextChar = bReadNextCharSave; |
| break; |
| } |
| |
| // suche das Token in der Tabelle: |
| sSaveToken = aToken; |
| aToken.ToUpperAscii(); |
| if( 0 == (nRet = GetHTMLToken( aToken )) ) |
| // Unknown Control |
| nRet = HTML_UNKNOWNCONTROL_ON; |
| |
| // Wenn es ein Token zum ausschalten ist ... |
| if( bOffState ) |
| { |
| if( HTML_TOKEN_ONOFF & nRet ) |
| { |
| // und es ein Off-Token gibt, das daraus machen |
| ++nRet; |
| } |
| else if( HTML_LINEBREAK!=nRet ) |
| { |
| // und es kein Off-Token gibt, ein unbekanntes |
| // Token daraus machen (ausser </BR>, das wird |
| // wie <BR> behandelt |
| nRet = HTML_UNKNOWNCONTROL_OFF; |
| } |
| } |
| |
| if( nRet == HTML_COMMENT ) |
| { |
| // fix: sSaveToken wegen Gross-/Kleinschreibung |
| // als Anfang des Kommentars benutzen und ein |
| // Space anhaengen. |
| aToken = sSaveToken; |
| if( '>'!=nNextCh ) |
| aToken += (sal_Unicode)' '; |
| sal_uLong nCStreamPos = 0; |
| sal_uLong nCLineNr = 0; |
| sal_uLong nCLinePos = 0; |
| xub_StrLen nCStrLen = 0; |
| |
| sal_Bool bDone = sal_False; |
| // bis zum schliessenden --> lesen. wenn keins gefunden |
| // wurde beim der ersten > wieder aufsetzen |
| while( !bDone && !rInput.IsEof() && IsParserWorking() ) |
| { |
| if( '>'==nNextCh ) |
| { |
| if( !nCStreamPos ) |
| { |
| nCStreamPos = rInput.Tell(); |
| nCStrLen = aToken.Len(); |
| nCLineNr = GetLineNr(); |
| nCLinePos = GetLinePos(); |
| } |
| bDone = aToken.Len() >= 2 && |
| aToken.Copy(aToken.Len()-2,2). |
| EqualsAscii( "--" ); |
| if( !bDone ) |
| aToken += nNextCh; |
| } |
| else |
| aToken += nNextCh; |
| if( !bDone ) |
| nNextCh = GetNextChar(); |
| } |
| if( !bDone && IsParserWorking() && nCStreamPos ) |
| { |
| rInput.Seek( nCStreamPos ); |
| SetLineNr( nCLineNr ); |
| SetLinePos( nCLinePos ); |
| ClearTxtConvContext(); |
| aToken.Erase( nCStrLen ); |
| nNextCh = '>'; |
| } |
| } |
| else |
| { |
| // den TokenString koennen wir jetzt verwerfen |
| aToken.Erase(); |
| } |
| |
| // dann lesen wir mal alles bis zur schliessenden '>' |
| if( '>' != nNextCh && IsParserWorking() ) |
| { |
| ScanText( '>' ); |
| if( sal_Unicode(EOF) == nNextCh && rInput.IsEof() ) |
| { |
| // zurueck hinter die < gehen und dort neu |
| // aufsetzen, das < als Text zurueckgeben |
| rInput.Seek( nStreamPos ); |
| SetLineNr( nLineNr ); |
| SetLinePos( nLinePos ); |
| ClearTxtConvContext(); |
| |
| aToken = '<'; |
| nRet = HTML_TEXTTOKEN; |
| nNextCh = GetNextChar(); |
| bNextCh = sal_False; |
| break; |
| } |
| } |
| if( SVPAR_PENDING == eState ) |
| bReadNextChar = bReadNextCharSave; |
| } |
| else |
| { |
| if( bOffState ) |
| { |
| // einfach alles wegschmeissen |
| ScanText( '>' ); |
| if( sal_Unicode(EOF) == nNextCh && rInput.IsEof() ) |
| { |
| // zurueck hinter die < gehen und dort neu |
| // aufsetzen, das < als Text zurueckgeben |
| rInput.Seek( nStreamPos ); |
| SetLineNr( nLineNr ); |
| SetLinePos( nLinePos ); |
| ClearTxtConvContext(); |
| |
| aToken = '<'; |
| nRet = HTML_TEXTTOKEN; |
| nNextCh = GetNextChar(); |
| bNextCh = sal_False; |
| break; |
| } |
| if( SVPAR_PENDING == eState ) |
| bReadNextChar = bReadNextCharSave; |
| aToken.Erase(); |
| } |
| else if( '%' == nNextCh ) |
| { |
| nRet = HTML_UNKNOWNCONTROL_ON; |
| |
| sal_uLong nCStreamPos = rInput.Tell(); |
| sal_uLong nCLineNr = GetLineNr(), nCLinePos = GetLinePos(); |
| |
| sal_Bool bDone = sal_False; |
| // bis zum schliessenden %> lesen. wenn keins gefunden |
| // wurde beim der ersten > wieder aufsetzen |
| while( !bDone && !rInput.IsEof() && IsParserWorking() ) |
| { |
| bDone = '>'==nNextCh && aToken.Len() >= 1 && |
| '%' == aToken.GetChar( aToken.Len()-1 ); |
| if( !bDone ) |
| { |
| aToken += nNextCh; |
| nNextCh = GetNextChar(); |
| } |
| } |
| if( !bDone && IsParserWorking() ) |
| { |
| rInput.Seek( nCStreamPos ); |
| SetLineNr( nCLineNr ); |
| SetLinePos( nCLinePos ); |
| ClearTxtConvContext(); |
| aToken.AssignAscii( "<%", 2 ); |
| nRet = HTML_TEXTTOKEN; |
| break; |
| } |
| if( IsParserWorking() ) |
| { |
| sSaveToken = aToken; |
| aToken.Erase(); |
| } |
| } |
| else |
| { |
| aToken = '<'; |
| nRet = HTML_TEXTTOKEN; |
| bNextCh = sal_False; |
| break; |
| } |
| } |
| |
| if( IsParserWorking() ) |
| { |
| bNextCh = '>' == nNextCh; |
| switch( nRet ) |
| { |
| case HTML_TEXTAREA_ON: |
| bReadTextArea = sal_True; |
| break; |
| case HTML_TEXTAREA_OFF: |
| bReadTextArea = sal_False; |
| break; |
| case HTML_SCRIPT_ON: |
| if( !bReadTextArea ) |
| bReadScript = sal_True; |
| break; |
| case HTML_SCRIPT_OFF: |
| if( !bReadTextArea ) |
| { |
| bReadScript = sal_False; |
| // JavaScript kann den Stream veraendern |
| // also muss das letzte Zeichen nochmals |
| // gelesen werden |
| bReadNextChar = sal_True; |
| bNextCh = sal_False; |
| } |
| break; |
| |
| case HTML_STYLE_ON: |
| bReadStyle = sal_True; |
| break; |
| case HTML_STYLE_OFF: |
| bReadStyle = sal_False; |
| break; |
| } |
| |
| } |
| } |
| break; |
| |
| case sal_Unicode(EOF): |
| if( rInput.IsEof() ) |
| { |
| eState = SVPAR_ACCEPTED; |
| nRet = nNextCh; |
| } |
| else |
| { |
| // normalen Text lesen |
| goto scan_text; |
| } |
| break; |
| |
| case '\f': |
| // Form-Feeds werden jetzt extra nach oben gereicht |
| nRet = HTML_LINEFEEDCHAR; // !!! eigentlich FORMFEEDCHAR |
| break; |
| |
| case '\n': |
| case '\r': |
| if( bReadListing || bReadXMP || bReadPRE || bReadTextArea ) |
| { |
| sal_Unicode c = GetNextChar(); |
| if( ( '\n' != nNextCh || '\r' != c ) && |
| ( '\r' != nNextCh || '\n' != c ) ) |
| { |
| bNextCh = sal_False; |
| nNextCh = c; |
| } |
| nRet = HTML_NEWPARA; |
| break; |
| } |
| // kein break ! |
| case '\t': |
| if( bReadPRE ) |
| { |
| nRet = HTML_TABCHAR; |
| break; |
| } |
| // kein break ! |
| case ' ': |
| // kein break ! |
| default: |
| |
| scan_text: |
| // es folgt "normaler" Text |
| nRet = ScanText(); |
| bNextCh = 0 == aToken.Len(); |
| |
| // der Text sollte noch verarbeitet werden |
| if( !bNextCh && eState == SVPAR_PENDING ) |
| { |
| eState = SVPAR_WORKING; |
| bReadNextChar = sal_True; |
| } |
| |
| break; |
| } |
| |
| if( bNextCh && SVPAR_WORKING == eState ) |
| { |
| nNextCh = GetNextChar(); |
| if( SVPAR_PENDING == eState && nRet && HTML_TEXTTOKEN != nRet ) |
| { |
| bReadNextChar = sal_True; |
| eState = SVPAR_WORKING; |
| } |
| } |
| |
| } while( !nRet && SVPAR_WORKING == eState ); |
| |
| if( SVPAR_PENDING == eState ) |
| nRet = -1; // irgendwas ungueltiges |
| |
| return nRet; |
| } |
| |
| void HTMLParser::UnescapeToken() |
| { |
| xub_StrLen nPos=0; |
| |
| sal_Bool bEscape = sal_False; |
| while( nPos < aToken.Len() ) |
| { |
| sal_Bool bOldEscape = bEscape; |
| bEscape = sal_False; |
| if( '\\'==aToken.GetChar(nPos) && !bOldEscape ) |
| { |
| aToken.Erase( nPos, 1 ); |
| bEscape = sal_True; |
| } |
| else |
| { |
| nPos++; |
| } |
| } |
| } |
| |
| // hole die Optionen |
| const HTMLOptions *HTMLParser::GetOptions( sal_uInt16 *pNoConvertToken ) const |
| { |
| // wenn die Option fuer das aktuelle Token schon einmal |
| // geholt wurden, geben wir sie noch einmal zurueck |
| if( pOptions->Count() ) |
| return pOptions; |
| |
| xub_StrLen nPos = 0; |
| while( nPos < aToken.Len() ) |
| { |
| // ein Zeichen ? Dann faengt hier eine Option an |
| if( HTML_ISALPHA( aToken.GetChar(nPos) ) ) |
| { |
| int nToken; |
| String aValue; |
| xub_StrLen nStt = nPos; |
| sal_Unicode cChar = 0; |
| |
| // Eigentlich sind hier nur ganz bestimmte Zeichen erlaubt. |
| // Netscape achtet aber nur auf "=" und Leerzeichen (siehe |
| // Mozilla: PA_FetchRequestedNameValues in |
| // lipparse/pa_mdl.c |
| // while( nPos < aToken.Len() && |
| // ( '-'==(c=aToken[nPos]) || isalnum(c) || '.'==c || '_'==c) ) |
| while( nPos < aToken.Len() && '=' != (cChar=aToken.GetChar(nPos)) && |
| HTML_ISPRINTABLE(cChar) && !HTML_ISSPACE(cChar) ) |
| nPos++; |
| |
| String sName( aToken.Copy( nStt, nPos-nStt ) ); |
| |
| //JP 23.03.97: die PlugIns wollen die TokenName im "Original" haben |
| // also nur fuers Suchen in UpperCase wandeln |
| String sNameUpperCase( sName ); |
| sNameUpperCase.ToUpperAscii(); |
| |
| nToken = GetHTMLOption( sNameUpperCase ); // der Name ist fertig |
| DBG_ASSERTWARNING( nToken!=HTML_O_UNKNOWN, |
| "GetOption: unbekannte HTML-Option" ); |
| sal_Bool bStripCRLF = (nToken < HTML_OPTION_SCRIPT_START || |
| nToken >= HTML_OPTION_SCRIPT_END) && |
| (!pNoConvertToken || nToken != *pNoConvertToken); |
| |
| while( nPos < aToken.Len() && |
| ( !HTML_ISPRINTABLE( (cChar=aToken.GetChar(nPos)) ) || |
| HTML_ISSPACE(cChar) ) ) |
| nPos++; |
| |
| // hat die Option auch einen Wert? |
| if( nPos!=aToken.Len() && '='==cChar ) |
| { |
| nPos++; |
| |
| while( nPos < aToken.Len() && |
| ( !HTML_ISPRINTABLE( (cChar=aToken.GetChar(nPos)) ) || |
| ' '==cChar || '\t'==cChar || '\r'==cChar || '\n'==cChar ) ) |
| nPos++; |
| |
| if( nPos != aToken.Len() ) |
| { |
| xub_StrLen nLen = 0; |
| nStt = nPos; |
| if( ('"'==cChar) || ('\'')==cChar ) |
| { |
| sal_Unicode cEnd = cChar; |
| nPos++; nStt++; |
| sal_Bool bDone = sal_False; |
| sal_Bool bEscape = sal_False; |
| while( nPos < aToken.Len() && !bDone ) |
| { |
| sal_Bool bOldEscape = bEscape; |
| bEscape = sal_False; |
| cChar = aToken.GetChar(nPos); |
| switch( cChar ) |
| { |
| case '\r': |
| case '\n': |
| if( bStripCRLF ) |
| ((String &)aToken).Erase( nPos, 1 ); |
| else |
| nPos++, nLen++; |
| break; |
| case '\\': |
| if( bOldEscape ) |
| { |
| nPos++, nLen++; |
| } |
| else |
| { |
| ((String &)aToken).Erase( nPos, 1 ); |
| bEscape = sal_True; |
| } |
| break; |
| case '"': |
| case '\'': |
| bDone = !bOldEscape && cChar==cEnd; |
| if( !bDone ) |
| nPos++, nLen++; |
| break; |
| default: |
| nPos++, nLen++; |
| break; |
| } |
| } |
| if( nPos!=aToken.Len() ) |
| nPos++; |
| } |
| else |
| { |
| // hier sind wir etwas laxer als der |
| // Standard und erlauben alles druckbare |
| sal_Bool bEscape = sal_False; |
| sal_Bool bDone = sal_False; |
| while( nPos < aToken.Len() && !bDone ) |
| { |
| sal_Bool bOldEscape = bEscape; |
| bEscape = sal_False; |
| sal_Unicode c = aToken.GetChar(nPos); |
| switch( c ) |
| { |
| case ' ': |
| bDone = !bOldEscape; |
| if( !bDone ) |
| nPos++, nLen++; |
| break; |
| |
| case '\t': |
| case '\r': |
| case '\n': |
| bDone = sal_True; |
| break; |
| |
| case '\\': |
| if( bOldEscape ) |
| { |
| nPos++, nLen++; |
| } |
| else |
| { |
| ((String &)aToken).Erase( nPos, 1 ); |
| bEscape = sal_True; |
| } |
| break; |
| |
| default: |
| if( HTML_ISPRINTABLE( c ) ) |
| nPos++, nLen++; |
| else |
| bDone = sal_True; |
| break; |
| } |
| } |
| } |
| |
| if( nLen ) |
| aValue = aToken.Copy( nStt, nLen ); |
| } |
| } |
| |
| // Wir kennen das Token und koennen es Speichern |
| HTMLOption *pOption = |
| new HTMLOption( |
| sal::static_int_cast< sal_uInt16 >(nToken), sName, aValue ); |
| |
| pOptions->Insert( pOption, pOptions->Count() ); |
| |
| } |
| else |
| // white space un unerwartete Zeichen ignorieren wie |
| nPos++; |
| } |
| |
| return pOptions; |
| } |
| |
| int HTMLParser::FilterPRE( int nToken ) |
| { |
| switch( nToken ) |
| { |
| #ifdef HTML_BEHAVIOUR |
| // diese werden laut Definition zu LFs |
| case HTML_PARABREAK_ON: |
| case HTML_LINEBREAK: |
| nToken = HTML_NEWPARA; |
| #else |
| // in Netscape zeigen sie aber nur in nicht-leeren Absaetzen Wirkung |
| case HTML_PARABREAK_ON: |
| nToken = HTML_LINEBREAK; |
| case HTML_LINEBREAK: |
| #endif |
| case HTML_NEWPARA: |
| nPre_LinePos = 0; |
| if( bPre_IgnoreNewPara ) |
| nToken = 0; |
| break; |
| |
| case HTML_TABCHAR: |
| { |
| xub_StrLen nSpaces = sal::static_int_cast< xub_StrLen >( |
| 8 - (nPre_LinePos % 8)); |
| DBG_ASSERT( !aToken.Len(), "Wieso ist das Token nicht leer?" ); |
| aToken.Expand( nSpaces, ' ' ); |
| nPre_LinePos += nSpaces; |
| nToken = HTML_TEXTTOKEN; |
| } |
| break; |
| // diese bleiben erhalten |
| case HTML_TEXTTOKEN: |
| nPre_LinePos += aToken.Len(); |
| break; |
| |
| case HTML_SELECT_ON: |
| case HTML_SELECT_OFF: |
| case HTML_BODY_ON: |
| case HTML_FORM_ON: |
| case HTML_FORM_OFF: |
| case HTML_INPUT: |
| case HTML_OPTION: |
| case HTML_TEXTAREA_ON: |
| case HTML_TEXTAREA_OFF: |
| |
| case HTML_IMAGE: |
| case HTML_APPLET_ON: |
| case HTML_APPLET_OFF: |
| case HTML_PARAM: |
| case HTML_EMBED: |
| |
| case HTML_HEAD1_ON: |
| case HTML_HEAD1_OFF: |
| case HTML_HEAD2_ON: |
| case HTML_HEAD2_OFF: |
| case HTML_HEAD3_ON: |
| case HTML_HEAD3_OFF: |
| case HTML_HEAD4_ON: |
| case HTML_HEAD4_OFF: |
| case HTML_HEAD5_ON: |
| case HTML_HEAD5_OFF: |
| case HTML_HEAD6_ON: |
| case HTML_HEAD6_OFF: |
| case HTML_BLOCKQUOTE_ON: |
| case HTML_BLOCKQUOTE_OFF: |
| case HTML_ADDRESS_ON: |
| case HTML_ADDRESS_OFF: |
| case HTML_HORZRULE: |
| |
| case HTML_CENTER_ON: |
| case HTML_CENTER_OFF: |
| case HTML_DIVISION_ON: |
| case HTML_DIVISION_OFF: |
| |
| case HTML_SCRIPT_ON: |
| case HTML_SCRIPT_OFF: |
| case HTML_RAWDATA: |
| |
| case HTML_TABLE_ON: |
| case HTML_TABLE_OFF: |
| case HTML_CAPTION_ON: |
| case HTML_CAPTION_OFF: |
| case HTML_COLGROUP_ON: |
| case HTML_COLGROUP_OFF: |
| case HTML_COL_ON: |
| case HTML_COL_OFF: |
| case HTML_THEAD_ON: |
| case HTML_THEAD_OFF: |
| case HTML_TFOOT_ON: |
| case HTML_TFOOT_OFF: |
| case HTML_TBODY_ON: |
| case HTML_TBODY_OFF: |
| case HTML_TABLEROW_ON: |
| case HTML_TABLEROW_OFF: |
| case HTML_TABLEDATA_ON: |
| case HTML_TABLEDATA_OFF: |
| case HTML_TABLEHEADER_ON: |
| case HTML_TABLEHEADER_OFF: |
| |
| case HTML_ANCHOR_ON: |
| case HTML_ANCHOR_OFF: |
| case HTML_BOLD_ON: |
| case HTML_BOLD_OFF: |
| case HTML_ITALIC_ON: |
| case HTML_ITALIC_OFF: |
| case HTML_STRIKE_ON: |
| case HTML_STRIKE_OFF: |
| case HTML_STRIKETHROUGH_ON: |
| case HTML_STRIKETHROUGH_OFF: |
| case HTML_UNDERLINE_ON: |
| case HTML_UNDERLINE_OFF: |
| case HTML_BASEFONT_ON: |
| case HTML_BASEFONT_OFF: |
| case HTML_FONT_ON: |
| case HTML_FONT_OFF: |
| case HTML_BLINK_ON: |
| case HTML_BLINK_OFF: |
| case HTML_SPAN_ON: |
| case HTML_SPAN_OFF: |
| case HTML_SUBSCRIPT_ON: |
| case HTML_SUBSCRIPT_OFF: |
| case HTML_SUPERSCRIPT_ON: |
| case HTML_SUPERSCRIPT_OFF: |
| case HTML_BIGPRINT_ON: |
| case HTML_BIGPRINT_OFF: |
| case HTML_SMALLPRINT_OFF: |
| case HTML_SMALLPRINT_ON: |
| |
| case HTML_EMPHASIS_ON: |
| case HTML_EMPHASIS_OFF: |
| case HTML_CITIATION_ON: |
| case HTML_CITIATION_OFF: |
| case HTML_STRONG_ON: |
| case HTML_STRONG_OFF: |
| case HTML_CODE_ON: |
| case HTML_CODE_OFF: |
| case HTML_SAMPLE_ON: |
| case HTML_SAMPLE_OFF: |
| case HTML_KEYBOARD_ON: |
| case HTML_KEYBOARD_OFF: |
| case HTML_VARIABLE_ON: |
| case HTML_VARIABLE_OFF: |
| case HTML_DEFINSTANCE_ON: |
| case HTML_DEFINSTANCE_OFF: |
| case HTML_SHORTQUOTE_ON: |
| case HTML_SHORTQUOTE_OFF: |
| case HTML_LANGUAGE_ON: |
| case HTML_LANGUAGE_OFF: |
| case HTML_AUTHOR_ON: |
| case HTML_AUTHOR_OFF: |
| case HTML_PERSON_ON: |
| case HTML_PERSON_OFF: |
| case HTML_ACRONYM_ON: |
| case HTML_ACRONYM_OFF: |
| case HTML_ABBREVIATION_ON: |
| case HTML_ABBREVIATION_OFF: |
| case HTML_INSERTEDTEXT_ON: |
| case HTML_INSERTEDTEXT_OFF: |
| case HTML_DELETEDTEXT_ON: |
| case HTML_DELETEDTEXT_OFF: |
| case HTML_TELETYPE_ON: |
| case HTML_TELETYPE_OFF: |
| |
| break; |
| |
| // der Rest wird als unbekanntes Token behandelt |
| default: |
| if( nToken ) |
| { |
| nToken = |
| ( ((HTML_TOKEN_ONOFF & nToken) && (1 & nToken)) |
| ? HTML_UNKNOWNCONTROL_OFF |
| : HTML_UNKNOWNCONTROL_ON ); |
| } |
| break; |
| } |
| |
| bPre_IgnoreNewPara = sal_False; |
| |
| return nToken; |
| } |
| |
| int HTMLParser::FilterXMP( int nToken ) |
| { |
| switch( nToken ) |
| { |
| case HTML_NEWPARA: |
| if( bPre_IgnoreNewPara ) |
| nToken = 0; |
| case HTML_TEXTTOKEN: |
| case HTML_NONBREAKSPACE: |
| case HTML_SOFTHYPH: |
| break; // bleiben erhalten |
| |
| default: |
| if( nToken ) |
| { |
| if( (HTML_TOKEN_ONOFF & nToken) && (1 & nToken) ) |
| { |
| sSaveToken.Insert( '<', 0 ); |
| sSaveToken.Insert( '/', 1 ); |
| } |
| else |
| sSaveToken.Insert( '<', 0 ); |
| if( aToken.Len() ) |
| { |
| UnescapeToken(); |
| sSaveToken += (sal_Unicode)' '; |
| aToken.Insert( sSaveToken, 0 ); |
| } |
| else |
| aToken = sSaveToken; |
| aToken += (sal_Unicode)'>'; |
| nToken = HTML_TEXTTOKEN; |
| } |
| break; |
| } |
| |
| bPre_IgnoreNewPara = sal_False; |
| |
| return nToken; |
| } |
| |
| int HTMLParser::FilterListing( int nToken ) |
| { |
| switch( nToken ) |
| { |
| case HTML_NEWPARA: |
| if( bPre_IgnoreNewPara ) |
| nToken = 0; |
| case HTML_TEXTTOKEN: |
| case HTML_NONBREAKSPACE: |
| case HTML_SOFTHYPH: |
| break; // bleiben erhalten |
| |
| default: |
| if( nToken ) |
| { |
| nToken = |
| ( ((HTML_TOKEN_ONOFF & nToken) && (1 & nToken)) |
| ? HTML_UNKNOWNCONTROL_OFF |
| : HTML_UNKNOWNCONTROL_ON ); |
| } |
| break; |
| } |
| |
| bPre_IgnoreNewPara = sal_False; |
| |
| return nToken; |
| } |
| |
| FASTBOOL HTMLParser::IsHTMLFormat( const sal_Char* pHeader, |
| sal_Bool bSwitchToUCS2, |
| rtl_TextEncoding eEnc ) |
| { |
| // Einer der folgenden regulaeren Ausdrucke muss sich auf den String |
| // anwenden lassen, damit das Dok ein HTML-Dokument ist. |
| // |
| // ^[^<]*<[^ \t]*[> \t] |
| // ------- |
| // ^<! |
| // |
| // wobei der unterstrichene Teilausdruck einem HTML-Token |
| // ensprechen muss |
| |
| ByteString sCmp; |
| sal_Bool bUCS2B = sal_False; |
| if( bSwitchToUCS2 ) |
| { |
| if( 0xfeU == (sal_uChar)pHeader[0] && |
| 0xffU == (sal_uChar)pHeader[1] ) |
| { |
| eEnc = RTL_TEXTENCODING_UCS2; |
| bUCS2B = sal_True; |
| } |
| else if( 0xffU == (sal_uChar)pHeader[0] && |
| 0xfeU == (sal_uChar)pHeader[1] ) |
| { |
| eEnc = RTL_TEXTENCODING_UCS2; |
| } |
| } |
| if |
| ( |
| RTL_TEXTENCODING_UCS2 == eEnc && |
| ( |
| (0xfe == (sal_uChar)pHeader[0] && 0xff == (sal_uChar)pHeader[1]) || |
| (0xff == (sal_uChar)pHeader[0] && 0xfe == (sal_uChar)pHeader[1]) |
| ) |
| ) |
| { |
| if( 0xfe == (sal_uChar)pHeader[0] ) |
| bUCS2B = sal_True; |
| |
| xub_StrLen nLen; |
| for( nLen = 2; |
| pHeader[nLen] != 0 || pHeader[nLen+1] != 0; |
| nLen+=2 ) |
| ; |
| |
| ::rtl::OStringBuffer sTmp( (nLen - 2)/2 ); |
| for( xub_StrLen nPos = 2; nPos < nLen; nPos += 2 ) |
| { |
| sal_Unicode cUC; |
| if( bUCS2B ) |
| cUC = (sal_Unicode(pHeader[nPos]) << 8) | pHeader[nPos+1]; |
| else |
| cUC = (sal_Unicode(pHeader[nPos+1]) << 8) | pHeader[nPos]; |
| if( 0U == cUC ) |
| break; |
| |
| sTmp.append( cUC < 256U ? (sal_Char)cUC : '.' ); |
| } |
| sCmp = ByteString( sTmp.makeStringAndClear() ); |
| } |
| else |
| { |
| sCmp = (sal_Char *)pHeader; |
| } |
| |
| sCmp.ToUpperAscii(); |
| |
| // Ein HTML-Dokument muss in der ersten Zeile ein '<' besitzen |
| xub_StrLen nStart = sCmp.Search( '<' ); |
| if( STRING_NOTFOUND == nStart ) |
| return sal_False; |
| nStart++; |
| |
| // danach duerfen beliebige andere Zeichen bis zu einem blank oder |
| // '>' kommen |
| sal_Char c; |
| xub_StrLen nPos; |
| for( nPos = nStart; nPos<sCmp.Len(); nPos++ ) |
| { |
| if( '>'==(c=sCmp.GetChar(nPos)) || HTML_ISSPACE(c) ) |
| break; |
| } |
| |
| // wenn das Dokeument hinter dem < aufhoert ist es wohl kein HTML |
| if( nPos==nStart ) |
| return sal_False; |
| |
| // die Zeichenkette nach dem '<' muss ausserdem ein bekanntes |
| // HTML Token sein. Damit die Ausgabe eines DOS-dir-Befehls nicht |
| // als HTML interpretiert wird, wird ein <DIR> jedoch nicht als HTML |
| // interpretiert. |
| String sTest( sCmp.Copy( nStart, nPos-nStart ), RTL_TEXTENCODING_ASCII_US ); |
| int nTok = GetHTMLToken( sTest ); |
| if( 0 != nTok && HTML_DIRLIST_ON != nTok ) |
| return sal_True; |
| |
| // oder es handelt sich um ein "<!" ganz am Anfang der Datei (fix #27092#) |
| if( nStart == 1 && '!' == sCmp.GetChar( 1 ) ) |
| return sal_True; |
| |
| // oder wir finden irgendwo ein <HTML> in den ersten 80 Zeichen |
| nStart = sCmp.Search( OOO_STRING_SVTOOLS_HTML_html ); |
| if( nStart!=STRING_NOTFOUND && |
| nStart>0 && '<'==sCmp.GetChar(nStart-1) && |
| nStart+4 < sCmp.Len() && '>'==sCmp.GetChar(nStart+4) ) |
| return sal_True; |
| |
| // sonst ist es wohl doch eher kein HTML-Dokument |
| return sal_False; |
| } |
| |
| sal_Bool HTMLParser::InternalImgToPrivateURL( String& rURL ) |
| { |
| if( rURL.Len() < 19 || 'i' != rURL.GetChar(0) || |
| rURL.CompareToAscii( OOO_STRING_SVTOOLS_HTML_internal_gopher, 9 ) != COMPARE_EQUAL ) |
| return sal_False; |
| |
| sal_Bool bFound = sal_False; |
| |
| if( rURL.CompareToAscii( OOO_STRING_SVTOOLS_HTML_internal_gopher,16) == COMPARE_EQUAL ) |
| { |
| String aName( rURL.Copy(16) ); |
| switch( aName.GetChar(0) ) |
| { |
| case 'b': |
| bFound = aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_GOPHER_binary ); |
| break; |
| case 'i': |
| bFound = aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_GOPHER_image ) || |
| aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_GOPHER_index ); |
| break; |
| case 'm': |
| bFound = aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_GOPHER_menu ) || |
| aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_GOPHER_movie ); |
| break; |
| case 's': |
| bFound = aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_GOPHER_sound ); |
| break; |
| case 't': |
| bFound = aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_GOPHER_telnet ) || |
| aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_GOPHER_text ); |
| break; |
| case 'u': |
| bFound = aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_GOPHER_unknown ); |
| break; |
| } |
| } |
| else if( rURL.CompareToAscii( OOO_STRING_SVTOOLS_HTML_internal_icon,14) == COMPARE_EQUAL ) |
| { |
| String aName( rURL.Copy(14) ); |
| switch( aName.GetChar(0) ) |
| { |
| case 'b': |
| bFound = aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_ICON_baddata ); |
| break; |
| case 'd': |
| bFound = aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_ICON_delayed ); |
| break; |
| case 'e': |
| bFound = aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_ICON_embed ); |
| break; |
| case 'i': |
| bFound = aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_ICON_insecure ); |
| break; |
| case 'n': |
| bFound = aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_ICON_notfound ); |
| break; |
| } |
| } |
| if( bFound ) |
| { |
| String sTmp ( rURL ); |
| rURL.AssignAscii( OOO_STRING_SVTOOLS_HTML_private_image ); |
| rURL.Append( sTmp ); |
| } |
| |
| return bFound; |
| } |
| |
#if defined( USED )
// Compiled only when USED is defined. Neither function adds anything of
// its own; both simply hand over to the SvParser implementation.
void HTMLParser::SaveState( int nToken )
{
    SvParser::SaveState( nToken );
}

void HTMLParser::RestoreState()
{
    SvParser::RestoreState();
}
#endif // USED
| |
| |
// Actions a <META> tag can trigger; the values are the second column of
// aHTMLMetaNameTable and are switched on in ParseMetaOptionsImpl.
enum eHtmlMetas
{
    HTML_META_NONE           = 0,   // no known meta name
    HTML_META_AUTHOR         = 1,
    HTML_META_DESCRIPTION    = 2,
    HTML_META_KEYWORDS       = 3,
    HTML_META_REFRESH        = 4,
    HTML_META_CLASSIFICATION = 5,
    HTML_META_CREATED        = 6,
    HTML_META_CHANGEDBY      = 7,
    HTML_META_CHANGED        = 8,
    HTML_META_GENERATOR      = 9,
    HTML_META_SDFOOTNOTE     = 10,
    HTML_META_SDENDNOTE      = 11,
    HTML_META_CONTENT_TYPE   = 12
};
| |
| // <META NAME=xxx> |
| static HTMLOptionEnum __READONLY_DATA aHTMLMetaNameTable[] = |
| { |
| { OOO_STRING_SVTOOLS_HTML_META_author, HTML_META_AUTHOR }, |
| { OOO_STRING_SVTOOLS_HTML_META_changed, HTML_META_CHANGED }, |
| { OOO_STRING_SVTOOLS_HTML_META_changedby, HTML_META_CHANGEDBY }, |
| { OOO_STRING_SVTOOLS_HTML_META_classification,HTML_META_CLASSIFICATION}, |
| { OOO_STRING_SVTOOLS_HTML_META_content_type, HTML_META_CONTENT_TYPE }, |
| { OOO_STRING_SVTOOLS_HTML_META_created, HTML_META_CREATED }, |
| { OOO_STRING_SVTOOLS_HTML_META_description, HTML_META_DESCRIPTION }, |
| { OOO_STRING_SVTOOLS_HTML_META_keywords, HTML_META_KEYWORDS }, |
| { OOO_STRING_SVTOOLS_HTML_META_generator, HTML_META_GENERATOR }, |
| { OOO_STRING_SVTOOLS_HTML_META_refresh, HTML_META_REFRESH }, |
| { OOO_STRING_SVTOOLS_HTML_META_sdendnote, HTML_META_SDENDNOTE }, |
| { OOO_STRING_SVTOOLS_HTML_META_sdfootnote, HTML_META_SDFOOTNOTE }, |
| { 0, 0 } |
| }; |
| |
| |
| void HTMLParser::AddMetaUserDefined( ::rtl::OUString const & ) |
| { |
| } |
| |
| bool HTMLParser::ParseMetaOptionsImpl( |
| const uno::Reference<document::XDocumentProperties> & i_xDocProps, |
| SvKeyValueIterator *i_pHTTPHeader, |
| const HTMLOptions *i_pOptions, |
| rtl_TextEncoding& o_rEnc ) |
| { |
| String aName, aContent; |
| sal_uInt16 nAction = HTML_META_NONE; |
| bool bHTTPEquiv = false, bChanged = false; |
| |
| for ( sal_uInt16 i = i_pOptions->Count(); i; ) |
| { |
| const HTMLOption *pOption = (*i_pOptions)[ --i ]; |
| switch ( pOption->GetToken() ) |
| { |
| case HTML_O_NAME: |
| aName = pOption->GetString(); |
| if ( HTML_META_NONE==nAction ) |
| { |
| pOption->GetEnum( nAction, aHTMLMetaNameTable ); |
| } |
| break; |
| case HTML_O_HTTPEQUIV: |
| aName = pOption->GetString(); |
| pOption->GetEnum( nAction, aHTMLMetaNameTable ); |
| bHTTPEquiv = true; |
| break; |
| case HTML_O_CONTENT: |
| aContent = pOption->GetString(); |
| break; |
| } |
| } |
| |
| if ( bHTTPEquiv || HTML_META_DESCRIPTION != nAction ) |
| { |
| // if it is not a Description, remove CRs and LFs from CONTENT |
| aContent.EraseAllChars( _CR ); |
| aContent.EraseAllChars( _LF ); |
| } |
| else |
| { |
| // convert line endings for Description |
| aContent.ConvertLineEnd(); |
| } |
| |
| |
| if ( bHTTPEquiv && i_pHTTPHeader ) |
| { |
| // #57232#: Netscape seems to just ignore a closing ", so we do too |
| if ( aContent.Len() && '"' == aContent.GetChar( aContent.Len()-1 ) ) |
| { |
| aContent.Erase( aContent.Len() - 1 ); |
| } |
| SvKeyValue aKeyValue( aName, aContent ); |
| i_pHTTPHeader->Append( aKeyValue ); |
| } |
| |
| switch ( nAction ) |
| { |
| case HTML_META_AUTHOR: |
| if (i_xDocProps.is()) { |
| i_xDocProps->setAuthor( aContent ); |
| bChanged = true; |
| } |
| break; |
| case HTML_META_DESCRIPTION: |
| if (i_xDocProps.is()) { |
| i_xDocProps->setDescription( aContent ); |
| bChanged = true; |
| } |
| break; |
| case HTML_META_KEYWORDS: |
| if (i_xDocProps.is()) { |
| i_xDocProps->setKeywords( |
| ::comphelper::string::convertCommaSeparated(aContent)); |
| bChanged = true; |
| } |
| break; |
| case HTML_META_CLASSIFICATION: |
| if (i_xDocProps.is()) { |
| i_xDocProps->setSubject( aContent ); |
| bChanged = true; |
| } |
| break; |
| |
| case HTML_META_CHANGEDBY: |
| if (i_xDocProps.is()) { |
| i_xDocProps->setModifiedBy( aContent ); |
| } |
| break; |
| |
| case HTML_META_CREATED: |
| case HTML_META_CHANGED: |
| if ( i_xDocProps.is() && aContent.Len() && |
| aContent.GetTokenCount() == 2 ) |
| { |
| Date aDate( (sal_uLong)aContent.GetToken(0).ToInt32() ); |
| Time aTime( (sal_uLong)aContent.GetToken(1).ToInt32() ); |
| DateTime aDateTime( aDate, aTime ); |
| ::util::DateTime uDT(aDateTime.Get100Sec(), |
| aDateTime.GetSec(), aDateTime.GetMin(), |
| aDateTime.GetHour(), aDateTime.GetDay(), |
| aDateTime.GetMonth(), aDateTime.GetYear()); |
| if ( HTML_META_CREATED==nAction ) |
| i_xDocProps->setCreationDate( uDT ); |
| else |
| i_xDocProps->setModificationDate( uDT ); |
| bChanged = true; |
| } |
| break; |
| |
| case HTML_META_REFRESH: |
| DBG_ASSERT( !bHTTPEquiv || i_pHTTPHeader, |
| "Reload-URL aufgrund unterlassener MUSS-Aenderung verlorengegangen" ); |
| break; |
| |
| case HTML_META_CONTENT_TYPE: |
| if ( aContent.Len() ) |
| { |
| o_rEnc = GetEncodingByMIME( aContent ); |
| } |
| break; |
| |
| case HTML_META_NONE: |
| if ( !bHTTPEquiv ) |
| { |
| if (i_xDocProps.is()) |
| { |
| uno::Reference<beans::XPropertyContainer> xUDProps |
| = i_xDocProps->getUserDefinedProperties(); |
| try { |
| xUDProps->addProperty(aName, |
| beans::PropertyAttribute::REMOVEABLE, |
| uno::makeAny(::rtl::OUString(aContent))); |
| AddMetaUserDefined(aName); |
| bChanged = true; |
| } catch (uno::Exception &) { |
| // ignore |
| } |
| } |
| } |
| break; |
| default: |
| break; |
| } |
| |
| return bChanged; |
| } |
| |
| bool HTMLParser::ParseMetaOptions( |
| const uno::Reference<document::XDocumentProperties> & i_xDocProps, |
| SvKeyValueIterator *i_pHeader ) |
| { |
| sal_uInt16 nContentOption = HTML_O_CONTENT; |
| rtl_TextEncoding eEnc = RTL_TEXTENCODING_DONTKNOW; |
| |
| bool bRet = ParseMetaOptionsImpl( i_xDocProps, i_pHeader, |
| GetOptions(&nContentOption), |
| eEnc ); |
| |
| // If the encoding is set by a META tag, it may only overwrite the |
| // current encoding if both, the current and the new encoding, are 1-sal_uInt8 |
| // encodings. Everything else cannot lead to reasonable results. |
| if (RTL_TEXTENCODING_DONTKNOW != eEnc && |
| rtl_isOctetTextEncoding( eEnc ) && |
| rtl_isOctetTextEncoding( GetSrcEncoding() ) ) |
| { |
| eEnc = GetExtendedCompatibilityTextEncoding( eEnc ); // #89973# |
| SetSrcEncoding( eEnc ); |
| } |
| |
| return bRet; |
| } |
| |
| rtl_TextEncoding HTMLParser::GetEncodingByMIME( const String& rMime ) |
| { |
| ByteString sType; |
| ByteString sSubType; |
| INetContentTypeParameterList aParameters; |
| ByteString sMime( rMime, RTL_TEXTENCODING_ASCII_US ); |
| if (INetContentTypes::parse(sMime, sType, sSubType, &aParameters)) |
| { |
| const INetContentTypeParameter * pCharset |
| = aParameters.find("charset"); |
| if (pCharset != 0) |
| { |
| ByteString sValue( pCharset->m_sValue, RTL_TEXTENCODING_ASCII_US ); |
| return GetExtendedCompatibilityTextEncoding( |
| rtl_getTextEncodingFromMimeCharset( sValue.GetBuffer() ) ); |
| } |
| } |
| return RTL_TEXTENCODING_DONTKNOW; |
| } |
| |
| rtl_TextEncoding HTMLParser::GetEncodingByHttpHeader( SvKeyValueIterator *pHTTPHeader ) |
| { |
| rtl_TextEncoding eRet = RTL_TEXTENCODING_DONTKNOW; |
| if( pHTTPHeader ) |
| { |
| SvKeyValue aKV; |
| for( sal_Bool bCont = pHTTPHeader->GetFirst( aKV ); bCont; |
| bCont = pHTTPHeader->GetNext( aKV ) ) |
| { |
| if( aKV.GetKey().EqualsIgnoreCaseAscii( OOO_STRING_SVTOOLS_HTML_META_content_type ) ) |
| { |
| if( aKV.GetValue().Len() ) |
| { |
| eRet = HTMLParser::GetEncodingByMIME( aKV.GetValue() ); |
| } |
| } |
| } |
| } |
| return eRet; |
| } |
| |
| sal_Bool HTMLParser::SetEncodingByHTTPHeader( |
| SvKeyValueIterator *pHTTPHeader ) |
| { |
| sal_Bool bRet = sal_False; |
| rtl_TextEncoding eEnc = HTMLParser::GetEncodingByHttpHeader( pHTTPHeader ); |
| if(RTL_TEXTENCODING_DONTKNOW != eEnc) |
| { |
| SetSrcEncoding( eEnc ); |
| bRet = sal_True; |
| } |
| return bRet; |
| } |
| |
| |