| /*************************************************************************** |
| * |
| * collate.cpp - specializations of collate facet |
| * |
| * $Id$ |
| * |
| *************************************************************************** |
| * |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed |
| * with this work for additional information regarding copyright |
| * ownership. The ASF licenses this file to you under the Apache |
| * License, Version 2.0 (the "License"); you may not use this file |
| * except in compliance with the License. You may obtain a copy of |
| * the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or |
| * implied. See the License for the specific language governing |
| * permissions and limitations under the License. |
| * |
| * Copyright 2001-2008 Rogue Wave Software, Inc. |
| * |
| **************************************************************************/ |
| |
| #define _RWSTD_LIB_SRC |
| |
// working around a gcc bug (PR #29570)
//
// The workaround is meant for gcc 3.0 up to but not including 3.3.1.
// Both the minor and the patchlevel tests must be qualified by the
// major version, otherwise unrelated releases whose minor version
// happens to be 3 and patchlevel 0 (e.g., 4.3.0) match as well.
#if    3 == __GNUG__ \
    && (   3 > __GNUC_MINOR__ \
        || (3 == __GNUC_MINOR__ && 1 > __GNUC_PATCHLEVEL__))

#  include <rw/_config.h>
#  ifndef _RWSTD_NO_EXTERN_TEMPLATE
#    define _RWSTD_NO_EXTERN_TEMPLATE
#  endif
#endif   // gcc >= 3.0 && gcc < 3.3.1
| |
| #include <rw/_defs.h> |
| |
| #include <limits> // for numeric_limits |
| |
| #include <limits.h> |
| #include <stdlib.h> // for wcstombs() |
| #include <string.h> // for memchr(), memcpy() |
| |
| #ifndef _RWSTD_NO_WCHAR_H |
| # include <wchar.h> // for wcscoll(), wcsxfrm(), wmemcmp() |
| #endif // _RWSTD_NO_WCHAR_H |
| |
| #include <loc/_collate.h> |
| #include <loc/_locale.h> |
| #include <loc/_localedef.h> |
| |
| #include "locale_body.h" |
| #include "podarray.h" // for __rw_pod_array |
| #include "setlocale.h" // for __rw_setlocale |
| |
// define _RWSTD_MB_MAX to the greater of MB_LEN_MAX and 8,
// where 8 is the maximum length necessary to encode a Unicode
// character in UTF-8
| #if _RWSTD_MB_LEN_MAX < 8 |
| # define _RWSTD_MB_MAX 8 |
| #else |
| # define _RWSTD_MB_MAX _RWSTD_MB_LEN_MAX |
| #endif // _RWSTD_MB_LEN_MAX |
| |
| |
| #if defined (_RWSTD_NO_WCSCOLL) && !defined (_RWSTD_NO_WCSCOLL_IN_LIBC) |
| |
| extern "C" { |
| |
| // declare if not declared in the system header(s) |
| int wcscoll (const wchar_t*, const wchar_t*) _LIBC_THROWS (); |
| |
| # undef _RWSTD_NO_WCSCOLL |
| |
| } // extern "C" |
| |
| #endif // _RWSTD_NO_WCSCOLL && !_RWSTD_NO_WCSCOLL_IN_LIBC |
| |
| |
| #ifdef _RWSTD_NO_WCSXFRM |
| # ifndef _RWSTD_NO_WCSXFRM_IN_LIBC |
| |
| extern "C" { |
| |
| // declare if not declared in the system header(s) |
| _RWSTD_SIZE_T wcsxfrm (wchar_t*, const wchar_t*, _RWSTD_SIZE_T) _LIBC_THROWS (); |
| |
| # define _RWSTD_WCSXFRM wcsxfrm |
| # undef _RWSTD_NO_WCSXFRM |
| |
| } // extern "C" |
| |
| # else |
| # define _RWSTD_WCSXFRM _RW::__rw_wcsxfrm |
| # endif // _RWSTD_NO_WCSXFRM_IN_LIBC |
| #else |
| # define _RWSTD_WCSXFRM wcsxfrm |
| #endif // _RWSTD_NO_WCSXFRM |
| |
| |
| #if defined (_RWSTD_NO_WCSTOMBS) && !defined (_RWSTD_NO_WCSTOMBS_IN_LIBC) |
| |
| extern "C" { |
| |
| // declare if not declared in the system header(s) |
| _RWSTD_DLLIMPORT _RWSTD_SIZE_T |
| wcstombs (char*, const wchar_t*, _RWSTD_SIZE_T) _LIBC_THROWS (); |
| |
| # undef _RWSTD_NO_WCSTOMBS |
| |
| } // extern "C" |
| |
| #endif // _RWSTD_NO_WCSTOMBS && !_RWSTD_NO_WCSTOMBS_IN_LIBC |
| |
| |
| // for convenience |
| typedef unsigned char UChar; |
| |
| |
| _RWSTD_NAMESPACE (__rw) { |
| |
| #undef min |
| #undef max |
| |
| |
| // computes LC_XXX category from a numeric facet id, returns the |
| // LC_XXX category for standard facets, LC_ALL for all others |
| int __rw_get_cat (int); |
| |
| |
| #ifndef _RWSTD_NO_WCHAR_T |
| |
| _RWSTD_INTERNAL void |
| __rw_append_weight (const _RW::__rw_collate_t *impl, |
| const unsigned *weights, |
| _STD::wstring &out) |
| { |
| // append the weight(s) to the out string |
| for (int i = 0; i < impl->longest_weight; ++i, ++weights) { |
| |
| const unsigned int wt = *weights; |
| |
| // if the weight is not an IGNORE weight |
| // then add it to the out string |
| if (wt && wt != _RWSTD_UINT_MAX) { |
| out += wchar_t (wt); |
| } |
| |
| } |
| } |
| |
| #endif // _RWSTD_NO_WCHAR_T |
| |
| |
| _RWSTD_INTERNAL void |
| __rw_append_weight (const _RW::__rw_collate_t *impl, |
| const unsigned int *weights, |
| _STD::string &out) |
| { |
| // append the weight(s) to the out string |
| for (int i = 0; i < impl->longest_weight; ++i, ++weights) { |
| |
| unsigned wt = *weights; |
| |
| // if the weight is not an IGNORE weight |
| // then add it to the out string |
| |
| if (wt && wt != _RWSTD_UINT_MAX) { |
| |
| while (_RWSTD_CHAR_MAX < wt) { |
| out += char (_RWSTD_CHAR_MAX); |
| wt -= _RWSTD_CHAR_MAX; |
| } |
| |
| // the weight will fit a signed char |
| // so add it to the out str |
| out += char (wt); |
| } |
| } |
| } |
| |
| |
| template <class _STR_T> |
| static void |
| __rw_process_offsets (_RW::__rw_collate_t *impl, |
| const unsigned **start, |
| const unsigned **end, |
| _STR_T &out) |
| { |
| |
| // process the offset list for each pass. |
| |
| for (unsigned int pass = 0; pass < impl->num_weights; pass++) { |
| // first get the order for the start element in this pass |
| unsigned int shift_value = (impl->num_weights - (pass + 1)) * 2; |
| |
| const unsigned int** cur_start = start; |
| const unsigned int** cur_end = start; |
| |
| if (*cur_start == *end) |
| return; |
| |
| do { |
| // calculate the order from the bitmask order in front of each |
| // character's weight information. Each character may have |
| // different orderings although most likely there will be groups |
| // of characters with the same ordering. |
| unsigned int cur_ord = (**cur_end >> shift_value) & 0x00000003; |
| while (cur_end != end && ((**cur_end >> shift_value) |
| & 0x00000003) == cur_ord) { |
| cur_end++; |
| } |
| |
| const unsigned int** cur; |
| switch (cur_ord) { |
| case 0: |
| // forward weight type |
| for (cur = cur_start; cur < cur_end; cur++) { |
| const unsigned int* weightp = *cur; |
| weightp = weightp + 1 + pass * impl->longest_weight; |
| |
| // qualify the call to make sure the function |
| // is found during lookup when declared static |
| _RW::__rw_append_weight (impl, weightp, out); |
| } |
| break; |
| |
| case 1: |
| // backward weight type |
| for (cur = cur_end - 1; cur >= cur_start; cur --) { |
| const unsigned int* weightp = *cur; |
| weightp = weightp + 1 + pass * impl->longest_weight; |
| |
| // qualify the call to make sure the function |
| // is found during lookup when declared static |
| _RW::__rw_append_weight (impl, weightp, out); |
| } |
| break; |
| |
| case 2: |
| // forward,position weight type. |
| // The string with a non-IGNOREd element after the |
| // fewest IGNORED elements should collate first |
| for (cur = cur_start; cur < cur_end; cur++) { |
| const unsigned int* weightp = *cur; |
| weightp = weightp + 1 + pass * impl->longest_weight; |
| // output the CHAR_MAX value for every IGNORE |
| // weight that we see. The first string that has a |
| // non-ignore value will collate first |
| |
| if (*weightp == 0) { |
| typedef typename _STR_T::value_type CharT; |
| |
| const CharT ign = |
| _STD::numeric_limits<CharT>::max (); |
| |
| out += ign; |
| } |
| else { |
| // qualify the call to make sure the function |
| // is found during lookup when declared static |
| _RW::__rw_append_weight (impl, weightp, out); |
| } |
| } |
| break; |
| |
| case 3: |
| // backward,position weight type |
| for (cur = cur_end - 1; cur >= cur_start; cur --) { |
| const unsigned int* weightp = *cur; |
| weightp = weightp + 1 + pass * impl->longest_weight; |
| // non-ignore value will collate first |
| if (*weightp == 0) { |
| typedef typename _STR_T::value_type CharT; |
| |
| const CharT ign = |
| _STD::numeric_limits<CharT>::max (); |
| |
| out += ign; |
| } |
| else |
| // qualify the call to make sure the function |
| // is found during lookup when declared static |
| _RW::__rw_append_weight (impl, weightp, out); |
| } |
| break; |
| } |
| |
| // append a 1 to designate the end of the pass |
| typedef typename _STR_T::value_type CharT; |
| out += CharT (1); |
| |
| cur_start = cur_end; |
| } while (cur_end != end); |
| } |
| } |
| |
| #ifndef _RWSTD_NO_WCHAR_T |
| |
| static int |
| __rw_get_w_ce_offset (const __rw_collate_t *impl, |
| const wchar_t** cur_char, |
| const wchar_t* end) |
| { |
| // obtain the offset of a wide collating element. If no collating |
| // elements can be made with the current position of the string |
| // then return -1 |
| |
| if (impl->largest_ce > 1) { |
| |
| unsigned int cur_tab = 0; |
| |
| char utf8_enc [_RWSTD_MB_MAX]; |
| |
| const unsigned int* tab; |
| while (*cur_char < end) { |
| |
| // convert the next wchar_t character to a utf8 encoded character |
| const _RWSTD_SIZE_T nbytes = |
| _RW::__rw_itoutf8 (**cur_char, utf8_enc); |
| |
| for (_RWSTD_SIZE_T i = 0; i < nbytes; i++) { |
| |
| const unsigned c1 = impl->get_first_char_in_w_ce_tab (cur_tab); |
| const unsigned c2 = impl->get_last_char_in_w_ce_tab (cur_tab); |
| |
| if (UChar (utf8_enc [i]) < c1 || UChar (utf8_enc [i]) > c2) |
| return -1; |
| |
| tab = impl->get_w_ce_tab (cur_tab); |
| |
| const unsigned int next_off = tab [UChar (utf8_enc [i]) - c1]; |
| if (next_off == _RWSTD_UINT_MAX) { |
| return -1; |
| } |
| |
| if (next_off & 0x80000000) { |
| cur_tab = next_off &~ 0x80000000; |
| continue; |
| } |
| |
| return next_off; |
| } |
| (*cur_char)++; |
| } |
| } |
| return -1; |
| } |
| |
| |
| static int |
| __rw_get_wchar_offset (const __rw_collate_t *impl, |
| const wchar_t** cur_char, |
| const wchar_t* end) |
| { |
| // get the offset of the weight information for the current wide character |
| |
| unsigned cur_tab = 0; |
| char utf8_enc [_RWSTD_MB_MAX]; |
| |
| while (*cur_char < end) { |
| // convert the next wchar_t character to a utf8 encoded character |
| const _RWSTD_SIZE_T nbytes = |
| _RW::__rw_itoutf8 (**cur_char, utf8_enc); |
| |
| for (_RWSTD_SIZE_T i = 0; i < nbytes; i++) { |
| const unsigned c1 = impl->get_first_char_in_w_tab (cur_tab); |
| if (UChar (utf8_enc [i]) < c1) |
| return -1; |
| |
| const unsigned* const tab = impl->get_w_tab (cur_tab); |
| |
| const unsigned int next_off = tab [UChar (utf8_enc [i]) - c1]; |
| if (next_off == _RWSTD_UINT_MAX) { |
| return -1; |
| } |
| |
| if (next_off & 0x80000000) { |
| cur_tab = next_off &~ 0x80000000; |
| continue; |
| } |
| |
| return next_off; |
| } |
| (*cur_char)++; |
| } |
| return -1; |
| } |
| |
| #endif // _RWSTD_NO_WCHAR_T |
| |
| |
| static int |
| __rw_get_n_ce_offset (const __rw_collate_t *impl, |
| const char **cur_char, |
| const char *end) |
| { |
| if (impl->largest_ce > 1) { |
| unsigned int cur_tab = 0; |
| const unsigned int* tab; |
| |
| while (*cur_char < end ) { |
| const unsigned c1 = impl->get_first_char_in_n_ce_tab (cur_tab); |
| const unsigned c2 = impl->get_last_char_in_n_ce_tab (cur_tab); |
| if (UChar (**cur_char) < c1 || UChar (**cur_char) > c2) |
| return -1; |
| |
| tab = impl->get_n_ce_tab (cur_tab); |
| |
| unsigned int next_off = tab [UChar (**cur_char) - c1]; |
| if (next_off == _RWSTD_UINT_MAX) { |
| return -1; |
| } |
| |
| if (next_off & 0x80000000) { |
| (*cur_char)++; |
| cur_tab = next_off &~ 0x80000000; |
| } |
| else { |
| return next_off; |
| } |
| } |
| } |
| return -1; |
| } |
| |
| |
| static int |
| __rw_get_char_offset (const __rw_collate_t *impl, |
| const char** cur_char, |
| const char* end) |
| { |
| unsigned int cur_tab = 0; |
| |
| while (*cur_char < end) { |
| unsigned c1 = impl->get_first_char_in_n_tab (cur_tab); |
| if (UChar (**cur_char) < c1) |
| return -1; |
| |
| const unsigned int* tab = impl->get_n_tab (cur_tab); |
| c1 = impl->get_first_char_in_n_tab (cur_tab); |
| |
| unsigned int next_off = tab [UChar (**cur_char) - c1]; |
| if (next_off == _RWSTD_UINT_MAX) { |
| return -1; |
| } |
| |
| if (next_off & 0x80000000) { |
| (*cur_char)++; |
| cur_tab = next_off &~ 0x80000000; |
| } |
| else { |
| return next_off; |
| } |
| } |
| |
| return -1; |
| } |
| |
| |
| // returns true if the character sequence starting at `from' |
| // does not form a valid complete multibyte character |
| static inline bool |
| __rw_is_invalid (const unsigned int* first_tab, const char *from) |
| { |
| return 0 != (__rw_mbtowco (first_tab, from, 0) & 0x80000000); |
| } |
| |
| |
| // same as strxfrm() except that it takes the number of characters |
| // in an array that may contain embedded NULs; these are inserted |
| // into the transformed string |
| static _STD::string |
| __rw_strnxfrm (const char *src, _RWSTD_SIZE_T nchars) |
| { |
| _STD::string res; |
| |
| char buf [256]; |
| char *pbuf = buf; |
| |
| _RWSTD_SIZE_T bufsize = sizeof buf; |
| char *psrc = buf; |
| |
| while (nchars) { |
| |
| // using a C-style cast instead of static_cast to avoid |
| // a gcc 2.95.2 bug causing an error on some platforms: |
| // static_cast from `void *' to `const char *' |
| const char* const last = (const char*)memchr (src, '\0', nchars); |
| |
| if (0 == last) { |
| |
| // no NUL found in the initial portion of the source string |
| // that fits into the local temporary buffer; copy as many |
| // characters as fit into the buffer |
| |
| if (bufsize <= nchars) { |
| if (pbuf != buf) |
| delete[] pbuf; |
| pbuf = new char [nchars + 1]; |
| } |
| |
| psrc = pbuf; |
| memcpy (psrc, src, nchars); |
| |
| // append a terminating NUL and decrement the number |
| // of characters that remain to be processed |
| psrc [nchars] = '\0'; |
| src += nchars; |
| nchars = 0; |
| } |
| else { |
| |
| // terminating NUL found in the source buffer |
| nchars -= (last - src) + 1; |
| psrc = _RWSTD_CONST_CAST (char*, src); |
| src += (last - src) + 1; |
| } |
| |
| // provide a destination buffer to strxfrm() in case |
| // it's buggy (such as MSVC's) and tries to write to |
| // the buffer even if it's 0 |
| char just_in_case_buf [8]; |
| const _RWSTD_SIZE_T dst_size = strxfrm (just_in_case_buf, psrc, 0); |
| |
| // check for strxfrm() errors |
| if (0 == (dst_size << 1)) |
| return _STD::string (); |
| |
| _RWSTD_SIZE_T res_size = res.size (); |
| |
| _TRY { |
| // resize the result string to fit itself plus the result |
| // of the transformation including the terminatin NUL |
| // appended by strxfrm() |
| res.resize (res_size + dst_size + 1); |
| } |
| _CATCH (...) { |
| if (pbuf != buf) |
| delete[] pbuf; |
| _RETHROW; |
| } |
| |
| // transfor the source string up to the terminating NUL |
| _RWSTD_SIZE_T xfrm_size = |
| strxfrm (&res [0] + res_size, psrc, dst_size + 1); |
| |
| #if defined _MSC_VER && _MSC_VER < 1400 |
| // compute the correct value that should have been returned from |
| // strxfrm() after the transformation has completed (MSVC strxfrm() |
| // returns a bogus result; see PR #29935) |
| xfrm_size = strlen (&res [0] + res_size); |
| #endif // MSVC < 8.0 |
| |
| // increment the size of the result string by the number |
| // of transformed characters excluding the terminating NUL |
| // if strxfrm() transforms the empty string into the empty |
| // string, keep the terminating NUL, otherwise drop it |
| res_size += xfrm_size + (last && !*psrc && !xfrm_size); |
| |
| _TRY { |
| res.resize (res_size); |
| } |
| _CATCH (...) { |
| if (pbuf != buf) |
| delete[] pbuf; |
| _RETHROW; |
| } |
| } |
| |
| if (pbuf != buf) |
| delete[] pbuf; |
| |
| return res; |
| } |
| |
| |
| #ifndef _RWSTD_NO_WCHAR_T |
| |
| # ifdef _RWSTD_NO_WCSXFRM |
| |
| // implements wcsxfrm() using wcstombs() and strxfrm() on platforms |
| // such as some versions of BSD where the function isn't defined in |
| // the C Standard Library |
| static _RWSTD_SIZE_T |
| __rw_wcsxfrm (wchar_t *dst, const wchar_t *src, _RWSTD_SIZE_T dstsize) |
| { |
| // src must be non-null |
| _RWSTD_ASSERT (0 != src); |
| |
| // dst is permitted to be null only when (dstsize == 0) |
| _RWSTD_ASSERT (0 == dstsize || dst); |
| |
| # ifndef _RWSTD_NO_WCSTOMBS |
| |
| // convert wide string to a multibyte string before tranforming it |
| // using strxfrm() and widening the result into the destination buffer |
| |
| const _RWSTD_SIZE_T srclen = _RWSTD_WCSLEN (src); |
| |
| // compute the size of the temporary nearrow buffer where to narrow |
| // the source wide string to |
| const _RWSTD_SIZE_T needbytes = |
| (dstsize ? dstsize : srclen) * MB_LEN_MAX; |
| |
| char narrow_buf [256]; |
| char* const nbuf = |
| sizeof narrow_buf < needbytes ? new char [needbytes + 1] : narrow_buf; |
| |
| _RWSTD_SIZE_T result; |
| |
| const _RWSTD_SIZE_T nmbchars = wcstombs (nbuf, src, needbytes); |
| |
| if (_RWSTD_SIZE_MAX == nmbchars) |
| result = _RWSTD_SIZE_MAX; |
| else { |
| // allocate a small buffer 8 times the size of the multibyte |
| // buffer (where 8 is a guess at the maximum number of bytes |
| // needed to transform the longest multibyte character) |
| char xfrm_buf [sizeof narrow_buf * 8]; |
| const _RWSTD_SIZE_T xbufsize = sizeof xfrm_buf; |
| const _RWSTD_SIZE_T xbufneed = needbytes * 8; |
| |
| // allocate a larger buffer if the small statically buffer |
| // isn't big enough |
| char* const xbuf = |
| xbufsize < xbufneed ? new char [xbufneed + 1] : xfrm_buf; |
| |
| // transform the multibyte character string into the narrow |
| // buffer, storing the returned value |
| result = strxfrm (xbuf, nbuf, xbufneed); |
| |
| if (_RWSTD_SIZE_MAX != result && dstsize) { |
| // widen the bytes (not characters) of the transformed string |
| // if the transformation was successful and the size of the |
| // destination buffer is non-zero |
| |
| if (result < dstsize) |
| dstsize = result; |
| |
| for (_RWSTD_SIZE_T i = 0; i != dstsize; ++i) |
| dst [i] = wchar_t (UChar (xbuf [i])); |
| } |
| |
| // free the transformation buffer if dynamically allocated |
| if (xbuf != xfrm_buf) |
| delete[] xbuf; |
| } |
| |
| // free the multibyte buffer if dynamically allocated |
| if (nbuf != narrow_buf) |
| delete[] nbuf; |
| |
| return result; |
| |
| # else // if defined (_RWSTD_NO_WCSTOMBS) |
| |
| _RWSTD_UNUSED (dst); |
| _RWSTD_UNUSED (src); |
| _RWSTD_UNUSED (dstsize); |
| |
| // fail when there is no way to convert a wchar_t array |
| // to a multibyte string |
| return _RWSTD_SIZE_MAX; |
| |
| # endif // _RWSTD_NO_WCSTOMBS |
| |
| } |
| |
| # endif // _RWSTD_NO_WCSXFRM |
| |
| |
| // same as wcsxfrm() except that it takes the number of characters |
| // in an array that may contain embedded NULs; these are inserted |
| // into the transformed string |
| static _STD::wstring |
| __rw_wcsnxfrm (const wchar_t *src, _RWSTD_SIZE_T nchars) |
| { |
| _STD::wstring res; |
| |
| wchar_t buf [256]; |
| wchar_t *pbuf = buf; |
| |
| _RWSTD_SIZE_T bufsize = sizeof buf / sizeof *buf; |
| wchar_t *psrc = buf; |
| |
| while (nchars) { |
| |
| typedef _STD::char_traits<wchar_t> Traits; |
| |
| const wchar_t* const last = Traits::find (src, nchars, L'\0'); |
| |
| if (0 == last) { |
| |
| // no NUL found in the initial portion of the source string |
| // that fits into the local temporary buffer; copy as many |
| // characters as fit into the buffer |
| |
| if (bufsize <= nchars) { |
| if (pbuf != buf) |
| delete[] pbuf; |
| pbuf = new wchar_t [nchars + 1]; |
| } |
| |
| psrc = pbuf; |
| memcpy (psrc, src, nchars * sizeof *psrc); |
| |
| // append a terminating NUL and decrement the number |
| // of characters that remain to be processed |
| psrc [nchars] = 0; |
| src += nchars; |
| nchars = 0; |
| } |
| else { |
| |
| // terminating NUL found in the source buffer |
| nchars -= (last - src) + 1; |
| psrc = _RWSTD_CONST_CAST (wchar_t*, src); |
| src += (last - src) + 1; |
| } |
| |
| // provide a destination buffer to strxfrm() in case |
| // it's buggy (such as MSVC's) and tries to write to |
| // the buffer even if it's 0 |
| wchar_t just_in_case_buf [8]; |
| |
| const _RWSTD_SIZE_T dst_size = |
| _RWSTD_WCSXFRM (just_in_case_buf, psrc, 0); |
| |
| // check for wcsxfrm() errors |
| if (_RWSTD_SIZE_MAX == dst_size) |
| return _STD::wstring (); |
| |
| _RWSTD_SIZE_T res_size = res.size (); |
| |
| _TRY { |
| // resize the result string to fit itself plus the result |
| // of the transformation including the terminatin NUL |
| // appended by strxfrm() |
| res.resize (res_size + dst_size + 1); |
| } |
| _CATCH (...) { |
| if (pbuf != buf) |
| delete[] pbuf; |
| _RETHROW; |
| } |
| |
| // transfor the source string up to the terminating NUL |
| _RWSTD_SIZE_T xfrm_size = |
| _RWSTD_WCSXFRM (&res [0] + res_size, psrc, dst_size + 1); |
| |
| # if defined _MSC_VER && _MSC_VER < 1400 |
| // compute the correct value that should have been returned from |
| // strxfrm() after the transformation has completed (MSVC strxfrm() |
| // returns a bogus result; see PR #29935) |
| xfrm_size = Traits::length (&res [0] + res_size); |
| # endif // MSVC < 8.0 |
| |
| // increment the size of the result string by the number |
| // of transformed characters excluding the terminating NUL |
| // if strxfrm() transforms the empty string into the empty |
| // string, keep the terminating NUL, otherwise drop it |
| res_size += xfrm_size + (last && !*psrc && !xfrm_size); |
| |
| _TRY { |
| res.resize (res_size); |
| } |
| _CATCH (...) { |
| if (pbuf != buf) |
| delete[] pbuf; |
| _RETHROW; |
| } |
| } |
| |
| if (pbuf != buf) |
| delete[] pbuf; |
| |
| return res; |
| } |
| |
| #endif // _RWSTD_NO_WCHAR_T |
| |
| |
| template <class _CharT> |
| long __rw_hash (const _CharT *lo, const _CharT *hi) _THROWS (()) |
| { |
| // Peter Weinberger's generic hashing algorithm, adapted by Andrew |
| // Binstock from a version by Allen Holub (see Andrew Binstock, |
| // "Hashing Revisited", Dr. Dobb's Journal, April 1996) |
| |
| const int long_bits = _RWSTD_CHAR_BIT * int (sizeof (long)); |
| const int one_eighth = long_bits / 8; |
| const int three_fourths = long_bits * 3 / 4; |
| // subexpression parenthesized to prevent MSVC 13.00 warning C4554 |
| const long hi_bits = ~0L << (long_bits - one_eighth); |
| |
| long res = 0; |
| |
| for ( ; lo != hi; ++lo) { |
| typedef _STD::char_traits<_CharT> _Traits; |
| |
| res = (res << one_eighth) + _Traits::to_int_type (*lo); |
| |
| const long tmp = res & hi_bits; |
| if (tmp) |
| res = (res ^ (tmp >> three_fourths)) & ~hi_bits; |
| } |
| |
| return res; |
| } |
| |
| #ifndef _RWSTD_NO_INSTANTIATE |
| # ifndef _RWSTD_NO_EXPLICIT_FUNC_INSTANTIATION |
| |
// explicitly instantiate for compilers with automatic template
// instantiation that don't emit symbols for implicitly instantiated
// templates even if they are completely defined in .cpp files
// (e.g., Compaq C++)
| |
| template long __rw_hash (const char*, const char*) _THROWS(()); |
| |
| # ifndef _RWSTD_NO_WCHAR_T |
| |
| template long __rw_hash (const wchar_t*, const wchar_t*) _THROWS(()); |
| |
| # endif // _RWSTD_NO_WCHAR_T |
| |
| # endif // _RWSTD_NO_EXPLICIT_FUNC_INSTANTIATION |
| #endif // _RWSTD_NO_INSTANTIATE |
| |
| |
| } // namespace __rw |
| |
| |
| _RWSTD_NAMESPACE (std) { |
| |
| |
| _RW::__rw_facet_id collate<char>::id; |
| |
| |
| // outlined to avoid generating a vtable in each translation unit |
| // that uses the class |
| /* virtual */ collate<char>:: |
| ~collate () /* nothrow */ |
| { |
| // no-op |
| } |
| |
| |
| int collate<char>:: |
| do_compare (const char_type *__lo1, const char_type *__hi1, |
| const char_type *__lo2, const char_type *__hi2) const |
| { |
| _RWSTD_ASSERT (__lo1 <= __hi1 && __lo2 <= __hi2); |
| |
| const _RWSTD_SIZE_T __len1 = __hi1 - __lo1; |
| const _RWSTD_SIZE_T __len2 = __hi2 - __lo2; |
| |
| const int cmp = memcmp (__lo1, __lo2, __len1 < __len2 ? __len1 : __len2); |
| |
| if (cmp) |
| return cmp < 0 ? -1 : 1; |
| |
| return __len1 < __len2 ? -1 : __len2 < __len1 ? +1 : 0; |
| } |
| |
| |
| collate<char>::string_type collate<char>:: |
| do_transform (const char_type *__lo, const char_type *__hi) const |
| { |
| _RWSTD_ASSERT (0 != __lo); |
| _RWSTD_ASSERT (__lo <= __hi); |
| |
| return string_type (__lo, size_t (__hi - __lo)); |
| } |
| |
| |
| long collate<char>:: |
| do_hash (const char_type *__lo, const char_type *__hi) const |
| { |
| // hash the result of do_transform, so that keys that transform equally |
| // will hash equally, as per 22.2.4.1.2, p3 |
| |
| const string_type __str = do_transform (__lo, __hi); |
| |
| __lo = __str.data (); |
| __hi = __lo + __str.length (); |
| |
| const long res = _RW::__rw_hash (__lo, __hi); |
| |
| return res; |
| } |
| |
| |
| // outlined to avoid generating a vtable in each translation unit |
| // that uses the class |
| /* virtual */ collate_byname<char>:: |
| ~collate_byname () /* nothrow */ |
| { |
| // no-op |
| } |
| |
| |
| int collate_byname<char>:: |
| do_compare (const char* low1, const char* high1, |
| const char* low2, const char* high2) const |
| { |
| const string_type s1 = do_transform (low1, high1); |
| const string_type s2 = do_transform (low2, high2); |
| |
| // FIXME: optimize: doing a full transformation of the two |
| // strings is not necessary, it might be quicker to only do |
| // a partial transformation |
| const int cmp = s1.compare (s2); |
| |
| // normalize |
| return cmp < 0 ? -1 : cmp ? 1 : 0; |
| } |
| |
| |
| collate_byname<char>::string_type |
| collate_byname<char>:: |
| do_transform (const char* low, const char* high) const |
| { |
| _RWSTD_ASSERT (low <= high); |
| |
| const int ccvt_cat = _RW::__rw_get_cat ((_C_wcodecvt_byname + 1) / 2); |
| |
| _RW::__rw_collate_t* const impl = |
| _RWSTD_CONST_CAST (_RW::__rw_collate_t*, |
| _RWSTD_STATIC_CAST (const _RW::__rw_collate_t*, _C_data ())); |
| |
| if (!impl || (this->_C_opts & this->_C_use_libc)) { |
| |
| // set the global libc locale in a thread-safe way |
| const _RW::__rw_setlocale clocale (_C_name, _RWSTD_LC_COLLATE); |
| |
| return _RW::__rw_strnxfrm (low, high - low); |
| } |
| |
| _RWSTD_ASSERT (0 != impl); |
| |
| // the maximum size that we could need to hold all the weight offsets |
| // is high - low |
| _RW::__rw_pod_array<const unsigned int*, 1024> indexes; |
| |
| // first go through the string getting a weight offset for |
| // each character, in the process check for collating elements. |
| const char* tmp_lo = low; |
| |
| for (; tmp_lo < high; tmp_lo++) { |
| const char* tmp_lo2 = tmp_lo; |
| int ret = _RW::__rw_get_n_ce_offset (impl, &tmp_lo2, high); |
| |
| if (ret == -1) { |
| tmp_lo2 = tmp_lo; |
| ret = _RW::__rw_get_char_offset (impl, &tmp_lo2, high); |
| if (ret == -1) { |
| // The character is not explicitely defined, but it |
| // may have been defined implicitly by UNDEFINED. |
| // because we cannot tell the difference between |
| // an UNDEFINED character and an invalid character |
| // in the collate database we must use the codecvt |
| // database to discover this information |
| if (impl->undefined_optimization) { |
| |
| _RWSTD_SIZE_T size; |
| |
| const _RW::__rw_codecvt_t *cvt = |
| _RWSTD_STATIC_CAST (const _RW::__rw_codecvt_t*, |
| _RW::__rw_get_facet_data ( |
| ccvt_cat, size, _C_name, |
| impl->codeset_name ())); |
| |
| if (_RW::__rw_is_invalid (cvt->n_to_w_tab(), tmp_lo2)) |
| return 0; |
| |
| const unsigned int *pwt = |
| impl->get_weight (impl->undefined_weight_idx); |
| indexes.append (&pwt, 1); |
| |
| tmp_lo = tmp_lo2; |
| } |
| } |
| else { |
| const unsigned int *pwt = impl->get_weight (ret); |
| indexes.append (&pwt, 1); |
| tmp_lo = tmp_lo2; |
| } |
| } |
| else { |
| // we found a collating element |
| const unsigned int *pwt = impl->get_weight (ret); |
| indexes.append (&pwt, 1); |
| tmp_lo = tmp_lo2; |
| } |
| } |
| // now process the weights |
| string_type out; |
| _RW::__rw_process_offsets (impl, indexes.data (), |
| indexes.data () + indexes.size (), out); |
| |
| return out; |
| } |
| |
| |
| #ifndef _RWSTD_NO_WCHAR_T |
| |
| _RW::__rw_facet_id collate<wchar_t>::id; |
| |
| |
| // outlined to avoid generating a vtable in each translation unit |
| // that uses the class |
| /* virtual */ collate<wchar_t>:: |
| ~collate () /* nothrow */ |
| { |
| // no-op |
| } |
| |
| |
| int collate<wchar_t>:: |
| do_compare (const char_type *__lo1, const char_type *__hi1, |
| const char_type *__lo2, const char_type *__hi2) const |
| { |
| _RWSTD_ASSERT (__lo1 <= __hi1 && __lo2 <= __hi2); |
| |
| const _RWSTD_SIZE_T __len1 = __hi1 - __lo1; |
| const _RWSTD_SIZE_T __len2 = __hi2 - __lo2; |
| |
| #ifndef _RWSTD_NO_WMEMCMP |
| |
| const int cmp = wmemcmp (__lo1, __lo2, __len1 < __len2 ? __len1 : __len2); |
| |
| if (cmp) |
| return cmp < 0 ? -1 : 1; |
| |
| #else // if defined (_RWSTD_NO_WMEMCMP) |
| |
| for (_RWSTD_SIZE_T __len = __len1 < __len2 ? __len1 : __len2; |
| __len--; ++__lo1, ++__lo2) { |
| |
| typedef string_type::traits_type _Traits; |
| typedef _Traits::int_type _Int; |
| |
| // avoid arithmetic on unknown char types |
| const _Int __i1 = _Traits::to_int_type (*__lo1); |
| const _Int __i2 = _Traits::to_int_type (*__lo2); |
| |
| // use int_type to prevent signed versus unsigned char comparison |
| if (!_Traits::eq_int_type (__i1, __i2)) |
| return __i1 < __i2 ? -1 : 1; |
| } |
| |
| #endif // _RWSTD_NO_WMEMCMP |
| |
| return __len1 < __len2 ? -1 : __len2 < __len1 ? +1 : 0; |
| } |
| |
| |
| collate<wchar_t>::string_type collate<wchar_t>:: |
| do_transform (const char_type *__lo, const char_type *__hi) const |
| { |
| _RWSTD_ASSERT (0 != __lo); |
| _RWSTD_ASSERT (__lo <= __hi); |
| |
| return string_type (__lo, size_t (__hi - __lo)); |
| } |
| |
| |
| long collate<wchar_t>:: |
| do_hash (const char_type *__lo, const char_type *__hi) const |
| { |
| // hash the result of do_transform, so that keys that transform equally |
| // will hash equally, as per 22.2.4.1.2, p3 |
| |
| const string_type __str = do_transform (__lo, __hi); |
| |
| __lo = __str.data (); |
| __hi = __lo + __str.length (); |
| |
| const long res = _RW::__rw_hash (__lo, __hi); |
| |
| return res; |
| } |
| |
| |
| // outlined to avoid generating a vtable in each translation unit |
| // that uses the class |
| /* virtual */ collate_byname<wchar_t>:: |
| ~collate_byname () /* nothrow */ |
| { |
| // no-op |
| } |
| |
| |
| int collate_byname<wchar_t>:: |
| do_compare (const wchar_t* low1, const wchar_t* high1, |
| const wchar_t* low2, const wchar_t* high2) const |
| { |
| if (this->_C_opts & this->_C_use_libstd) { |
| const string_type s1 = do_transform (low1, high1); |
| const string_type s2 = do_transform (low2, high2); |
| |
| // FIXME: optimize |
| return s1.compare (s2); |
| } |
| |
| #ifndef _RWSTD_NO_WCSCOLL |
| |
| // use the system C library to compare the strings |
| |
| _RW::__rw_setlocale clocale (this->_C_name, _RWSTD_LC_COLLATE); |
| |
| const _RWSTD_SIZE_T len1 = high1 - low1; |
| const _RWSTD_SIZE_T len2 = high2 - low2; |
| const _RWSTD_SIZE_T len = len1 + len2; |
| |
| // small local buffer |
| wchar_t local_buffer [256]; |
| const _RWSTD_SIZE_T bufsize = sizeof local_buffer / sizeof *local_buffer; |
| |
| // allocate only if local buffer is too small |
| wchar_t* const wbuf = |
| len + 2 >= bufsize ? new wchar_t [len + 2] : local_buffer; |
| |
| // copy and null-terminate first sequence |
| char_traits<wchar_t>::copy (wbuf, low1, len1); |
| wbuf [len1] = '\0'; |
| |
| // append and null-terminate first sequence |
| char_traits<wchar_t>::copy (wbuf + len1 + 1, low2, len2); |
| wbuf [len1 + 1 + len2] = '\0'; |
| |
| // compare sequences using wcscoll() |
| const int result = wcscoll (wbuf, wbuf + len1 + 1); |
| |
| // deallocate only if allocated |
| if (wbuf != local_buffer) |
| delete[] wbuf; |
| |
| return result ? result > 0 ? 1 : -1 : 0; |
| |
| #else // if defined (_RWSTD_NO_WCSCOLL) |
| |
| // transform strings first and compare the transformed results |
| const string_type s1 = do_transform (low1, high1); |
| const string_type s2 = do_transform (low2, high2); |
| |
| return s1.compare (s2); |
| |
| #endif // _RWSTD_NO_WCSCOLL |
| |
| } |
| |
| |
| collate_byname<wchar_t>::string_type |
| collate_byname<wchar_t>:: |
| do_transform (const wchar_t* low, const wchar_t* high) const |
| { |
| const int ccvt_cat = _RW::__rw_get_cat ((_C_wcodecvt_byname + 1) / 2); |
| |
| _RW::__rw_collate_t *impl = |
| _RWSTD_CONST_CAST (_RW::__rw_collate_t*, _RWSTD_STATIC_CAST ( |
| const _RW::__rw_collate_t*, _C_data ())); |
| |
| if (!impl || (this->_C_opts & this->_C_use_libc)) { |
| |
| // set the global libc locale in a thread-safe way |
| const _RW::__rw_setlocale clocale (_C_name, _RWSTD_LC_COLLATE); |
| |
| return _RW::__rw_wcsnxfrm (low, high - low); |
| } |
| else { |
| _RWSTD_ASSERT (0 != impl); |
| // the maximum size that we could need to hold all the weight offsets |
| // is high - low |
| _RW::__rw_pod_array<const unsigned int*, 1024> indexes; |
| |
| // first go through the string getting a weight offset for |
| // each character, in the process check for collating elements. |
| for (const wchar_t* tmp_lo =low; tmp_lo < high; tmp_lo++) { |
| const wchar_t* tmp_lo2 = tmp_lo; |
| int ret = _RW::__rw_get_w_ce_offset (impl, &tmp_lo2, high); |
| if (ret == -1) { |
| tmp_lo2 = tmp_lo; |
| ret = _RW::__rw_get_wchar_offset (impl, &tmp_lo2, high); |
| if (ret == -1) { |
| // The character is not explicitely defined, but it |
| // may have been defined implicitly by UNDEFINED. |
| // because we cannot tell the difference between |
| // an UNDEFINED character and an invalid character |
| // in the collate database we must use the codecvt |
| // database to discover this information |
| if (impl->undefined_optimization) { |
| _RWSTD_SIZE_T size; |
| const _RW::__rw_codecvt_t *cvt = |
| _RWSTD_STATIC_CAST (const _RW::__rw_codecvt_t*, |
| _RW::__rw_get_facet_data ( |
| ccvt_cat, size, _C_name, |
| impl->codeset_name ())); |
| |
| char tmp [_RWSTD_MB_MAX]; |
| |
| const _RWSTD_SIZE_T nbytes = |
| _RW::__rw_itoutf8 (*tmp_lo2, tmp); |
| |
| tmp [nbytes] = '\0'; |
| |
| if (_RW::__rw_is_invalid (cvt->w_to_n_tab (), tmp)) |
| return 0; |
| |
| const unsigned int *pwt = |
| impl->get_weight (impl->undefined_weight_idx); |
| indexes.append (&pwt, 1); |
| tmp_lo = tmp_lo2; |
| } |
| } |
| else { |
| const unsigned int *pwt = impl->get_weight (ret); |
| indexes.append (&pwt, 1); |
| tmp_lo = tmp_lo2; |
| } |
| } |
| else { |
| // we found a collating element |
| const unsigned int *pwt = impl->get_weight (ret); |
| indexes.append (&pwt, 1); |
| tmp_lo = tmp_lo2; |
| } |
| } |
| // now process the weights |
| string_type out; |
| _RW::__rw_process_offsets (impl, indexes.data (), |
| indexes.data () + indexes.size (), out); |
| |
| return out; |
| } |
| } |
| |
| #endif // _RWSTD_NO_WCHAR_T |
| |
| |
| |
| } // namespace std |