| /************************************************************** |
| * |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| * |
| *************************************************************/ |
| |
| #include "HtmlFmtFlt.hxx" |
| |
| #include <rtl/string.h> |
| |
| #include <string> |
| #include <sstream> |
| #include <vector> |
| #include <iomanip> |
| |
| #include <boost/assert.hpp> |
| |
| using namespace com::sun::star::uno; |
| |
| //------------------------------------------------------------------------------ |
| // converts the openoffice text/html clipboard format to the HTML Format |
| // well known under MS Windows |
| // the MS HTML Format has a header before the real html data |
| // |
| // Version:1.0 Version number of the clipboard. Starting version is 0.9 |
| // StartHTML: Byte count from the beginning of the clipboard to the start |
| // of the context, or -1 if no context |
| // EndHTML: Byte count from the beginning of the clipboard to the end |
| // of the context, or -1 if no context |
| // StartFragment: Byte count from the beginning of the clipboard to the |
| // start of the fragment |
| // EndFragment: Byte count from the beginning of the clipboard to the |
| // end of the fragment |
| // StartSelection: Byte count from the beginning of the clipboard to the |
| // start of the selection |
| // EndSelection: Byte count from the beginning of the clipboard to the |
| // end of the selection |
| // |
| // StartSelection and EndSelection are optional |
| // The fragment should be preceded and followed by the HTML comments |
| // <!--StartFragment--> and <!--EndFragment--> (no space between !-- and the |
| // text) |
| //------------------------------------------------------------------------------ |
| |
namespace // private
{
    // Writes a single header line of the form
    //   <fieldName><offset as 10 digit, zero padded decimal>\r\n
    // to the given stream.
    void AppendOffsetField(std::ostringstream& out, const char* fieldName, size_t offset)
    {
        out << fieldName
            << std::setw(10) << std::setfill('0') << std::dec << offset
            << "\r\n";
    }

    // Builds the textual header of the MS Windows "HTML Format" clipboard
    // format: a version line followed by the StartHTML, EndHTML,
    // StartFragment and EndFragment lines. Every offset is written with a
    // width of 10 digits, so the header length does not depend on the
    // values as long as they fit into 10 decimal digits.
    std::string GetHtmlFormatHeader(size_t startHtml, size_t endHtml, size_t startFragment, size_t endFragment)
    {
        std::ostringstream header;
        header << "Version:1.0" << "\r\n";
        AppendOffsetField(header, "StartHTML:", startHtml);
        AppendOffsetField(header, "EndHTML:", endHtml);
        AppendOffsetField(header, "StartFragment:", startFragment);
        AppendOffsetField(header, "EndFragment:", endFragment);
        return header.str();
    }

} // namespace private
| |
| |
// The office always writes the opening and closing html tag in upper case
// and without any spaces; neither of the two tags carries parameters.
const std::string TAG_HTML("<HTML>");
const std::string TAG_END_HTML("</HTML>");

// The body tag may carry parameters (e.g. <BODY param>, see #92840#), so
// only the beginning of the tag is stored here and the closing '>' has to
// be searched for separately.
const std::string TAG_BODY("<BODY");
const std::string TAG_END_BODY("</BODY");
| |
| Sequence<sal_Int8> SAL_CALL TextHtmlToHTMLFormat(Sequence<sal_Int8>& aTextHtml) |
| { |
| OSL_ASSERT(aTextHtml.getLength() > 0); |
| |
| if (!(aTextHtml.getLength() > 0)) |
| return Sequence<sal_Int8>(); |
| |
| // fill the buffer with dummy values to calc the exact length |
| std::string dummyHtmlHeader = GetHtmlFormatHeader(0, 0, 0, 0); |
| size_t lHtmlFormatHeader = dummyHtmlHeader.length(); |
| |
| std::string textHtml( |
| reinterpret_cast<const sal_Char*>(aTextHtml.getConstArray()), |
| reinterpret_cast<const sal_Char*>(aTextHtml.getConstArray()) + aTextHtml.getLength()); |
| |
| std::string::size_type nStartHtml = textHtml.find(TAG_HTML) + lHtmlFormatHeader - 1; // we start one before '<HTML>' Word 2000 does also so |
| std::string::size_type nEndHtml = textHtml.find(TAG_END_HTML) + lHtmlFormatHeader + TAG_END_HTML.length() + 1; // our SOffice 5.2 wants 2 behind </HTML>? |
| |
| // The body tag may have parameters so we need to search for the |
| // closing '>' manually e.g. <BODY param> #92840# |
| std::string::size_type nStartFragment = textHtml.find(">", textHtml.find(TAG_BODY)) + lHtmlFormatHeader + 1; |
| std::string::size_type nEndFragment = textHtml.find(TAG_END_BODY) + lHtmlFormatHeader; |
| |
| std::string htmlFormat = GetHtmlFormatHeader(nStartHtml, nEndHtml, nStartFragment, nEndFragment); |
| htmlFormat += textHtml; |
| |
| Sequence<sal_Int8> byteSequence(htmlFormat.length() + 1); // space the trailing '\0' |
| rtl_zeroMemory(byteSequence.getArray(), byteSequence.getLength()); |
| |
| rtl_copyMemory( |
| static_cast<void*>(byteSequence.getArray()), |
| static_cast<const void*>(htmlFormat.c_str()), |
| htmlFormat.length()); |
| |
| return byteSequence; |
| } |
| |
| const char* HtmlStartTag = "<html"; |
| |
| Sequence<sal_Int8> HTMLFormatToTextHtml(const Sequence<sal_Int8>& aHTMLFormat) |
| { |
| BOOST_ASSERT(isHTMLFormat(aHTMLFormat) && "No HTML Format provided"); |
| |
| Sequence<sal_Int8>& nonconstHTMLFormatRef = const_cast< Sequence<sal_Int8>& >(aHTMLFormat); |
| sal_Char* dataStart = reinterpret_cast<sal_Char*>(nonconstHTMLFormatRef.getArray()); |
| sal_Char* dataEnd = dataStart + nonconstHTMLFormatRef.getLength() - 1; |
| const sal_Char* htmlStartTag = strcasestr(dataStart, HtmlStartTag); |
| |
| BOOST_ASSERT(htmlStartTag && "Seems to be no HTML at all"); |
| |
| // It doesn't seem to be HTML? Well then simply return what has been |
| // provided in non-debug builds |
| if (htmlStartTag == NULL) |
| { |
| return aHTMLFormat; |
| } |
| |
| sal_Int32 len = dataEnd - htmlStartTag; |
| Sequence<sal_Int8> plainHtmlData(len); |
| |
| rtl_copyMemory(static_cast<void*>(plainHtmlData.getArray()), htmlStartTag, len); |
| |
| return plainHtmlData; |
| } |
| |
| /* A simple format detection. We are just comparing the first few bytes |
| of the provided byte sequence to see whether or not it is the MS |
| Office Html format. If it shows that this is not reliable enough we |
| can improve this |
| */ |
| const char HtmlFormatStart[] = "Version:"; |
| int HtmlFormatStartLen = (sizeof(HtmlFormatStart) - 1); |
| |
| bool isHTMLFormat(const Sequence<sal_Int8>& aHtmlSequence) |
| { |
| if (aHtmlSequence.getLength() < HtmlFormatStartLen) |
| return false; |
| |
| return rtl_str_compareIgnoreAsciiCase_WithLength(HtmlFormatStart, |
| HtmlFormatStartLen, |
| reinterpret_cast<const sal_Char*>(aHtmlSequence.getConstArray()), |
| HtmlFormatStartLen) == 0; |
| } |