| /************************************************************************** |
| * |
| * mbsrtowcs.cpp |
| * |
| * Example program to demonstrate an implementation of the C Standard |
| * Library function mbsrtowcs() in terms of the C++ Standard Library |
| * codecvt facet. |
| * |
| * $Id$ |
| * |
| *************************************************************************** |
| * |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed |
| * with this work for additional information regarding copyright |
| * ownership. The ASF licenses this file to you under the Apache |
| * License, Version 2.0 (the "License"); you may not use this file |
| * except in compliance with the License. You may obtain a copy of |
| * the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or |
| * implied. See the License for the specific language governing |
| * permissions and limitations under the License. |
| * |
| **************************************************************************/ |
| |
| #include <cassert> // for assert() |
| #include <cerrno> // for EILSEQ, errno |
| #include <cstring> // for strlen() |
| #include <cwchar> // for mbstate_t |
| #include <ios> // for hex |
| #include <iostream> // for cout |
| #include <locale> // for codecvt, locale |
| |
| #include <examples.h> |
| |
| |
| // my_mbsrtowcs() behaves |
| std::size_t |
| my_mbsrtowcs (std::mbstate_t *pstate, |
| wchar_t *dst, |
| const char *src, |
| std::size_t size) |
| { |
| const std::locale global; |
| |
| typedef std::codecvt<wchar_t, char, std::mbstate_t> CodeCvt; |
| |
| // retrieve the codecvt facet from the global locale |
| const CodeCvt &cvt = std::use_facet<CodeCvt>(global); |
| |
| // use a small local buffer when dst is null and ignore size |
| wchar_t buf [4]; |
| if (0 == dst) { |
| dst = buf; |
| size = sizeof buf / sizeof *buf; |
| } |
| |
| // set up pointers into the source sequence |
| const char* from = src; |
| const char* const from_end = from + std::strlen (from); |
| const char* from_next = from; |
| |
| // set up pointers into the destination sequence |
| wchar_t* to = dst; |
| wchar_t* const to_end = to + size; |
| wchar_t* to_next; |
| |
| // number of non-NUL wide characters stored in destination buffer |
| std::size_t nconv = 0; |
| |
| // use a local state when pstate is null (i.e., emulate mbstowcs) |
| std::mbstate_t state = std::mbstate_t (); |
| if (0 == pstate) |
| pstate = &state; |
| |
| for ( ; from_next != from_end && to != to_end; |
| from = from_next, to = dst == buf ? dst : to_next) { |
| |
| // convert a (sub)sequence of the source buffer into |
| // the destination buffer |
| const std::codecvt_base::result res = |
| cvt.in (*pstate, |
| from, from_end, from_next, |
| to, to_end, to_next); |
| |
| // verify the consistency of the xxx_next pointers |
| assert (from <= from_next && from_next <= from_end); |
| assert (to <= to_next && to_next <= to_end); |
| |
| // process conversion result |
| switch (res) { |
| |
| case std::codecvt_base::error: |
| // conversion error |
| errno = EILSEQ; |
| return std::size_t (-1); |
| |
| case std::codecvt_base::noconv: |
| // only codecvt<T, T> (i.e., facets where intern_type and |
| // extern_type are identical) is allowed to return noconv |
| // treat this case as an error even though it indicates |
| // a bad (incorrectly implemented) codecvt facet |
| return std::size_t (-1); |
| |
| case std::codecvt_base::partial: |
| // partial conversion (incomplete character or not enough |
| // room in destination buffer to convert the entire source |
| // sequence) |
| if (dst != buf || std::size_t (to_next - to) < size) { |
| errno = EILSEQ; |
| return std::size_t (-1); |
| } |
| |
| nconv += to_next - to; |
| break; |
| |
| case std::codecvt_base::ok: |
| // complete conversion of an initial subsequence (but not |
| // necessarily all) of the source buffer |
| nconv += to_next - to; |
| |
| if (dst == buf && from_next == from_end) |
| return nconv; |
| |
| break; |
| } |
| } |
| |
| return nconv; |
| } |
| |
| |
| int main () |
| { |
| static const char* const mbs [] = { |
| "a", "abc", |
| // <U0391>: Greek letter Alpha |
| "\xce\x91", |
| // <U0391><U0392>: Greek letters Alpha Beta |
| "\xce\x91\xce\x92", |
| // <U0391><U0392><U0393>: Greek letters Alpha Beta Gamma |
| "\xce\x91\xce\x92\xce\x93", |
| // <U0966>: Devangari digit 0 |
| "\xe0\xa5\xa6", |
| // <U0967><U0966>: Devangari digits 10 |
| "\xe0\xa5\xa7\xe0\xa5\xa6", |
| // <U0968><U0967><U0966>: Devangari digits 210 |
| "\xe0\xa5\xa8\xe0\xa5\xa7\xe0\xa5\xa6" |
| }; |
| |
| typedef std::codecvt_byname<wchar_t, char, std::mbstate_t> CodeCvt; |
| |
| // create a UCS/UTF-8 codecvt facet and install it in a locale |
| const std::locale utf (std::locale (""), new CodeCvt ("UTF-8@UCS")); |
| |
| // set the global locale to use the UCS/UTF-8 codecvt facet |
| std::locale::global (utf); |
| |
| // iterate over examples of UTF-8 sequences and output the wide |
| // character sequence each converts to |
| for (std::size_t i = 0; i != sizeof mbs / sizeof *mbs; ++i) { |
| |
| wchar_t *dst = 0; |
| |
| // initialize state to the initial shift state |
| std::mbstate_t state = std::mbstate_t (); |
| |
| // obtain the length of the wide character sequence |
| // corresponding to the multibyte source sequence, |
| // not including the terminating NUL |
| const std::size_t length = |
| my_mbsrtowcs (&state, 0, mbs [i], std::size_t (-1)); |
| |
| if (std::size_t (-1) == length) { |
| std::cerr << "Error computing length of destination sequence.\n"; |
| continue; |
| } |
| |
| // allocate a wide character buffer large enough to hold |
| // the converted sequence including the terminating NUL |
| dst = new wchar_t [length + 1]; |
| |
| // reset state to the initial shift state |
| state = std::mbstate_t (); |
| |
| // convert the narrow character source sequence into |
| // the wide character buffer |
| const std::size_t nconv = |
| my_mbsrtowcs (&state, dst, mbs [i], length + 1); |
| |
| if (length != nconv) { |
| std::cerr << "Error converting source sequence.\n"; |
| continue; |
| } |
| |
| // NUL-terminate the converted string |
| dst [nconv] = L'\0'; |
| |
| // write out the wide and the narrow sequences |
| std::cout << "UCS-2 (" << std::dec << length << "): " << std::hex; |
| |
| for (const wchar_t *pwc = dst; *pwc != L'\0'; ++pwc) |
| std::cout << "U+" << unsigned (*pwc) << ' '; |
| |
| std::cout << " ==> UTF-8: "; |
| |
| typedef unsigned char UChar; |
| |
| for (const char *pc = mbs [i]; *pc; ++pc) |
| std::cout << "\\x" << int (UChar (*pc)); |
| |
| std::cout << "\"\n"; |
| |
| delete[] dst; |
| } |
| |
| return 0; |
| } |