| /************************************************************** |
| * |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| * |
| *************************************************************/ |
| |
| |
| |
| // MARKER(update_precomp.py): autogen include statement, do not remove |
| #include "precompiled_i18npool.hxx" |
| |
| #include <rtl/ustrbuf.hxx> |
| #include <i18nutil/casefolding.hxx> |
| #include <i18nutil/unicode.hxx> |
| |
| #include <comphelper/processfactory.hxx> |
| #include <osl/diagnose.h> |
| |
| #include <string.h> |
| |
| #include "characterclassificationImpl.hxx" |
| #include "breakiteratorImpl.hxx" |
| |
| #define TRANSLITERATION_ALL |
| #include "transliteration_body.hxx" |
| |
| using namespace ::com::sun::star::uno; |
| using namespace ::com::sun::star::lang; |
| using namespace ::rtl; |
| |
| #define A2OU(x) OUString::createFromAscii(x) |
| |
| namespace com { namespace sun { namespace star { namespace i18n { |
| |
| |
| Transliteration_body::Transliteration_body() |
| { |
| nMappingType = 0; |
| transliterationName = "Transliteration_body"; |
| implementationName = "com.sun.star.i18n.Transliteration.Transliteration_body"; |
| } |
| |
| sal_Int16 SAL_CALL Transliteration_body::getType() throw(RuntimeException) |
| { |
| return TransliterationType::ONE_TO_ONE; |
| } |
| |
| sal_Bool SAL_CALL Transliteration_body::equals( |
| const OUString& /*str1*/, sal_Int32 /*pos1*/, sal_Int32 /*nCount1*/, sal_Int32& /*nMatch1*/, |
| const OUString& /*str2*/, sal_Int32 /*pos2*/, sal_Int32 /*nCount2*/, sal_Int32& /*nMatch2*/) |
| throw(RuntimeException) |
| { |
| throw RuntimeException(); |
| } |
| |
| Sequence< OUString > SAL_CALL |
| Transliteration_body::transliterateRange( const OUString& str1, const OUString& str2 ) |
| throw( RuntimeException) |
| { |
| Sequence< OUString > ostr(2); |
| ostr[0] = str1; |
| ostr[1] = str2; |
| return ostr; |
| } |
| |
| |
| static sal_uInt8 lcl_getMappingTypeForToggleCase( sal_uInt8 nMappingType, sal_Unicode cChar ) |
| { |
| sal_uInt8 nRes = nMappingType; |
| |
| // take care of TOGGLE_CASE transliteration: |
| // nMappingType should not be a combination of flags, thuse we decide now |
| // which one to use. |
| if (nMappingType == (MappingTypeLowerToUpper | MappingTypeUpperToLower)) |
| { |
| const sal_Int16 nType = unicode::getUnicodeType( cChar ); |
| if (nType & 0x02 /* lower case*/) |
| nRes = MappingTypeLowerToUpper; |
| else |
| { |
| // should also work properly for non-upper characters like white spacs, numbers, ... |
| nRes = MappingTypeUpperToLower; |
| } |
| } |
| |
| return nRes; |
| } |
| |
| |
| OUString SAL_CALL |
| Transliteration_body::transliterate( |
| const OUString& inStr, sal_Int32 startPos, sal_Int32 nCount, |
| Sequence< sal_Int32 >& offset) |
| throw(RuntimeException) |
| { |
| #if 0 |
| /* Performance optimization: |
| * The two realloc() consume 48% (32% grow, 16% shrink) runtime of this method! |
| * getValue() needs about 15%, so there is equal balance if we trade the second |
| * (shrinking) realloc() for a getValue(). But if the caller initializes the |
| * sequence to nCount elements there isn't any change in size necessary in most |
| * cases (one-to-one mapping) and we gain 33%. |
| * |
| * Of that constellation the getValue() method takes 20% upon each call, so 40% |
| * for both. By remembering the first calls' results we could gain some extra |
| * percentage again, but unfortunately getValue() may return a reference to a |
| * static buffer, so we can't store the pointer directly but would have to |
| * copy-construct an array, which doesn't give us any advantage. |
| * |
| * Much more is accomplished by working directly on the sequence buffer |
| * returned by getArray() instead of using operator[] for each and every |
| * access. |
| * |
| * And while we're at it: now that we know the size in advance we don't need to |
| * copy the buffer anymore, just create the real string buffer and let the |
| * return value take ownership. |
| * |
| * All together these changes result in the new implementation needing only 62% |
| * of the time of the old implementation (in other words: that one was 1.61 |
| * times slower ...) |
| */ |
| |
| // Allocate the max possible buffer. Try to use stack instead of heap which |
| // would have to be reallocated most times anyway. |
| const sal_Int32 nLocalBuf = 512 * NMAPPINGMAX; |
| sal_Unicode aLocalBuf[nLocalBuf], *out = aLocalBuf, *aHeapBuf = NULL; |
| |
| const sal_Unicode *in = inStr.getStr() + startPos; |
| |
| if (nCount > 512) |
| out = aHeapBuf = (sal_Unicode*) malloc((nCount * NMAPPINGMAX) * sizeof(sal_Unicode)); |
| |
| if (useOffset) |
| offset.realloc(nCount * NMAPPINGMAX); |
| sal_Int32 j = 0; |
| for (sal_Int32 i = 0; i < nCount; i++) { |
| Mapping &map = casefolding::getValue(in, i, nCount, aLocale, nMappingType); |
| for (sal_Int32 k = 0; k < map.nmap; k++) { |
| if (useOffset) |
| offset[j] = i + startPos; |
| out[j++] = map.map[k]; |
| } |
| } |
| if (useOffset) |
| offset.realloc(j); |
| |
| OUString r(out, j); |
| |
| if (aHeapBuf) |
| free(aHeapBuf); |
| |
| return r; |
| #else |
| const sal_Unicode *in = inStr.getStr() + startPos; |
| |
| // Two different blocks to eliminate the if(useOffset) condition inside the |
| // inner k loop. Yes, on massive use even such small things do count. |
| if ( useOffset ) |
| { |
| sal_Int32 nOffCount = 0, i; |
| for (i = 0; i < nCount; i++) |
| { |
| // take care of TOGGLE_CASE transliteration: |
| sal_uInt8 nTmpMappingType = nMappingType; |
| if (nMappingType == (MappingTypeLowerToUpper | MappingTypeUpperToLower)) |
| nTmpMappingType = lcl_getMappingTypeForToggleCase( nMappingType, in[i] ); |
| |
| const Mapping &map = casefolding::getValue( in, i, nCount, aLocale, nTmpMappingType ); |
| nOffCount += map.nmap; |
| } |
| rtl_uString* pStr = x_rtl_uString_new_WithLength( nOffCount ); // our x_rtl_ustring.h |
| sal_Unicode* out = pStr->buffer; |
| |
| if ( nOffCount != offset.getLength() ) |
| offset.realloc( nOffCount ); |
| |
| sal_Int32 j = 0; |
| sal_Int32 * pArr = offset.getArray(); |
| for (i = 0; i < nCount; i++) |
| { |
| // take care of TOGGLE_CASE transliteration: |
| sal_uInt8 nTmpMappingType = nMappingType; |
| if (nMappingType == (MappingTypeLowerToUpper | MappingTypeUpperToLower)) |
| nTmpMappingType = lcl_getMappingTypeForToggleCase( nMappingType, in[i] ); |
| |
| const Mapping &map = casefolding::getValue( in, i, nCount, aLocale, nTmpMappingType ); |
| for (sal_Int32 k = 0; k < map.nmap; k++) |
| { |
| pArr[j] = i + startPos; |
| out[j++] = map.map[k]; |
| } |
| } |
| out[j] = 0; |
| |
| return OUString( pStr, SAL_NO_ACQUIRE ); // take over ownership of <pStr> |
| } |
| else |
| { |
| // In the simple case of no offset sequence used we can eliminate the |
| // first getValue() loop. We could also assume that most calls result |
| // in identical string lengths, thus using a preallocated |
| // OUStringBuffer could be an easy way to assemble the return string |
| // without too much hassle. However, for single characters the |
| // OUStringBuffer::append() method is quite expensive compared to a |
| // simple array operation, so it pays here to copy the final result |
| // instead. |
| |
| // Allocate the max possible buffer. Try to use stack instead of heap, |
| // which would have to be reallocated most times anyways. |
| const sal_Int32 nLocalBuf = 2048; |
| sal_Unicode aLocalBuf[ nLocalBuf * NMAPPINGMAX ], *out = aLocalBuf, *pHeapBuf = NULL; |
| if ( nCount > nLocalBuf ) |
| out = pHeapBuf = new sal_Unicode[ nCount * NMAPPINGMAX ]; |
| |
| sal_Int32 j = 0; |
| for ( sal_Int32 i = 0; i < nCount; i++) |
| { |
| // take care of TOGGLE_CASE transliteration: |
| sal_uInt8 nTmpMappingType = nMappingType; |
| if (nMappingType == (MappingTypeLowerToUpper | MappingTypeUpperToLower)) |
| nTmpMappingType = lcl_getMappingTypeForToggleCase( nMappingType, in[i] ); |
| |
| const Mapping &map = casefolding::getValue( in, i, nCount, aLocale, nTmpMappingType ); |
| for (sal_Int32 k = 0; k < map.nmap; k++) |
| { |
| out[j++] = map.map[k]; |
| } |
| } |
| |
| OUString aRet( out, j ); |
| if ( pHeapBuf ) |
| delete [] pHeapBuf; |
| return aRet; |
| } |
| #endif |
| } |
| |
| OUString SAL_CALL |
| Transliteration_body::transliterateChar2String( sal_Unicode inChar ) throw(RuntimeException) |
| { |
| const Mapping &map = casefolding::getValue(&inChar, 0, 1, aLocale, nMappingType); |
| rtl_uString* pStr = x_rtl_uString_new_WithLength( map.nmap ); // our x_rtl_ustring.h |
| sal_Unicode* out = pStr->buffer; |
| sal_Int32 i; |
| |
| for (i = 0; i < map.nmap; i++) |
| out[i] = map.map[i]; |
| out[i] = 0; |
| |
| return OUString( pStr, SAL_NO_ACQUIRE ); // take over ownership of <pStr> |
| } |
| |
| sal_Unicode SAL_CALL |
| Transliteration_body::transliterateChar2Char( sal_Unicode inChar ) throw(MultipleCharsOutputException, RuntimeException) |
| { |
| const Mapping &map = casefolding::getValue(&inChar, 0, 1, aLocale, nMappingType); |
| if (map.nmap > 1) |
| throw MultipleCharsOutputException(); |
| return map.map[0]; |
| } |
| |
| OUString SAL_CALL |
| Transliteration_body::folding( const OUString& inStr, sal_Int32 startPos, sal_Int32 nCount, |
| Sequence< sal_Int32 >& offset) throw(RuntimeException) |
| { |
| return this->transliterate(inStr, startPos, nCount, offset); |
| } |
| |
| Transliteration_casemapping::Transliteration_casemapping() |
| { |
| nMappingType = 0; |
| transliterationName = "casemapping(generic)"; |
| implementationName = "com.sun.star.i18n.Transliteration.Transliteration_casemapping"; |
| } |
| |
| void SAL_CALL |
| Transliteration_casemapping::setMappingType( const sal_uInt8 rMappingType, const Locale& rLocale ) |
| { |
| nMappingType = rMappingType; |
| aLocale = rLocale; |
| } |
| |
| Transliteration_u2l::Transliteration_u2l() |
| { |
| nMappingType = MappingTypeUpperToLower; |
| transliterationName = "upper_to_lower(generic)"; |
| implementationName = "com.sun.star.i18n.Transliteration.Transliteration_u2l"; |
| } |
| |
| Transliteration_l2u::Transliteration_l2u() |
| { |
| nMappingType = MappingTypeLowerToUpper; |
| transliterationName = "lower_to_upper(generic)"; |
| implementationName = "com.sun.star.i18n.Transliteration.Transliteration_l2u"; |
| } |
| |
| Transliteration_togglecase::Transliteration_togglecase() |
| { |
| // usually nMappingType must NOT be a combiantion of different flages here, |
| // but we take care of that problem in Transliteration_body::transliterate above |
| // before that value is used. There we will decide which of both is to be used on |
| // a per character basis. |
| nMappingType = MappingTypeLowerToUpper | MappingTypeUpperToLower; |
| transliterationName = "toggle(generic)"; |
| implementationName = "com.sun.star.i18n.Transliteration.Transliteration_togglecase"; |
| } |
| |
| Transliteration_titlecase::Transliteration_titlecase() |
| { |
| nMappingType = MappingTypeToTitle; |
| transliterationName = "title(generic)"; |
| implementationName = "com.sun.star.i18n.Transliteration.Transliteration_titlecase"; |
| } |
| |
#if 0
// Disabled: manual resolution of Unicode ligatures into their component
// letters. Kept for reference only; nothing in this file calls it.

// One table entry: a ligature code point and its expansion as UTF-8 text.
struct LigatureData
{
    sal_uInt32 cChar;
    const sal_Char * pUtf8Text; // points to a string literal, hence const
};

// available Unicode ligatures:
// http://www.unicode.org/charts
// http://www.unicode.org/charts/PDF/UFB00.pdf
// The table is terminated by an entry with cChar == 0.
static const LigatureData aLigatures[] =
{
    { 0x0FB00, "ff" },
    { 0x0FB01, "fi" },
    { 0x0FB02, "fl" },
    { 0x0FB03, "ffi" },
    { 0x0FB04, "ffl" },
    { 0x0FB05, "ft" },
    { 0x0FB06, "st" },

    { 0x0FB13, "\xD5\xB4\xD5\xB6" }, // Armenian small men now
    { 0x0FB14, "\xD5\xB4\xD5\xA5" }, // Armenian small men ech
    { 0x0FB15, "\xD5\xB4\xD5\xAB" }, // Armenian small men ini
    { 0x0FB16, "\xD5\xBE\xD5\xB6" }, // Armenian small vew now
    { 0x0FB17, "\xD5\xB4\xD5\xAD" }, // Armenian small men xeh
    { 0x00000, "" }
};

// True if cChar lies in one of the two code point ranges covered by aLigatures.
static inline bool lcl_IsLigature( sal_uInt32 cChar )
{
    return (0x0FB00 <= cChar && cChar <= 0x0FB06) || (0x0FB13 <= cChar && cChar <= 0x0FB17);
}

// Returns the expansion of a ligature, or the character itself as a string
// if it is not a ligature. A code point inside the ligature ranges that has
// no table entry yields an empty string.
static rtl::OUString lcl_ResolveLigature( sal_uInt32 cChar )
{
    if (!lcl_IsLigature( cChar ))
        return rtl::OUString( &cChar, 1 );

    for (const LigatureData *pData = aLigatures; pData->cChar != 0; ++pData)
    {
        if (pData->cChar == cChar)
            return rtl::OUString( pData->pUtf8Text, strlen( pData->pUtf8Text ), RTL_TEXTENCODING_UTF8 );
    }
    return rtl::OUString();
}
#endif // if 0
| |
| static rtl::OUString transliterate_titlecase_Impl( |
| const OUString& inStr, sal_Int32 startPos, sal_Int32 nCount, |
| const Locale &rLocale, |
| Sequence< sal_Int32 >& offset ) |
| throw(RuntimeException) |
| { |
| const OUString aText( inStr.copy( startPos, nCount ) ); |
| |
| OUString aRes; |
| if (aText.getLength() > 0) |
| { |
| Reference< XMultiServiceFactory > xMSF = ::comphelper::getProcessServiceFactory(); |
| CharacterClassificationImpl aCharClassImpl( xMSF ); |
| |
| // because aCharClassImpl.toTitle does not handle ligatures or ß but will raise |
| // an exception we need to handle the first chara manually... |
| |
| // we don't want to change surrogates by accident, thuse we use proper code point iteration |
| sal_Int32 nPos = 0; |
| sal_uInt32 cFirstChar = aText.iterateCodePoints( &nPos ); |
| OUString aResolvedLigature( &cFirstChar, 1 ); //lcl_ResolveLigature( cFirstChar ) ); |
| // toUpper can be used to properly resolve ligatures and characters like ß |
| aResolvedLigature = aCharClassImpl.toUpper( aResolvedLigature, 0, aResolvedLigature.getLength(), rLocale ); |
| // since toTitle will leave all-uppercase text unchanged we first need to |
| // use toLower to bring possible 2nd and following charas in lowercase |
| aResolvedLigature = aCharClassImpl.toLower( aResolvedLigature, 0, aResolvedLigature.getLength(), rLocale ); |
| sal_Int32 nResolvedLen = aResolvedLigature.getLength(); |
| |
| // now we can properly use toTitle to get the expected result for the resolved string. |
| // The rest of the text should just become lowercase. |
| aRes = aCharClassImpl.toTitle( aResolvedLigature, 0, nResolvedLen, rLocale ); |
| aRes += aCharClassImpl.toLower( aText, 1, aText.getLength() - 1, rLocale ); |
| offset.realloc( aRes.getLength() ); |
| |
| sal_Int32 *pOffset = offset.getArray(); |
| sal_Int32 nLen = offset.getLength(); |
| for (sal_Int32 i = 0; i < nLen; ++i) |
| { |
| sal_Int32 nIdx = 0; |
| if (i >= nResolvedLen) |
| nIdx = i - nResolvedLen + 1; |
| pOffset[i] = nIdx; |
| } |
| } |
| #if OSL_DEBUG_LEVEL > 1 |
| const sal_Int32 *pCOffset = offset.getConstArray(); |
| (void) pCOffset; |
| #endif |
| |
| return aRes; |
| } |
| |
| |
| // this function expects to be called on a word-by-word basis, |
| // namely that startPos points to the first char of the word |
| rtl::OUString SAL_CALL Transliteration_titlecase::transliterate( |
| const OUString& inStr, sal_Int32 startPos, sal_Int32 nCount, |
| Sequence< sal_Int32 >& offset ) |
| throw(RuntimeException) |
| { |
| return transliterate_titlecase_Impl( inStr, startPos, nCount, aLocale, offset ); |
| } |
| |
| |
| Transliteration_sentencecase::Transliteration_sentencecase() |
| { |
| nMappingType = MappingTypeToTitle; // though only to be applied to the first word... |
| transliterationName = "sentence(generic)"; |
| implementationName = "com.sun.star.i18n.Transliteration.Transliteration_sentencecase"; |
| } |
| |
| |
| // this function expects to be called on a sentence-by-sentence basis, |
| // namely that startPos points to the first word (NOT first char!) in the sentence |
| rtl::OUString SAL_CALL Transliteration_sentencecase::transliterate( |
| const OUString& inStr, sal_Int32 startPos, sal_Int32 nCount, |
| Sequence< sal_Int32 >& offset ) |
| throw(RuntimeException) |
| { |
| return transliterate_titlecase_Impl( inStr, startPos, nCount, aLocale, offset ); |
| } |
| |
| |
| } } } } |
| |