| /*------------------------------------------------------------------------- |
| * |
| * Utility functions for conversion procs. |
| * |
| * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group |
| * Portions Copyright (c) 1994, Regents of the University of California |
| * |
| * IDENTIFICATION |
| * $PostgreSQL: pgsql/src/backend/utils/mb/conv.c,v 1.68 2010/01/02 16:57:56 momjian Exp $ |
| * |
| *------------------------------------------------------------------------- |
| */ |
| #include "postgres.h" |
| #include "mb/pg_wchar.h" |
| |
| |
/*
 * latin2mic: LATINn ---> MIC, for charsets whose local codes can be used
 * directly as the trailing byte of the MIC representation.
 *
 * l		source string, len bytes long
 * p		output area (the caller must make it large enough)
 * lc		mule character set id for the local encoding
 * encoding	PG identifier for the local encoding (used for error reports)
 *
 * The output is terminated with a NUL byte.
 */
void
latin2mic(const unsigned char *l, unsigned char *p, int len,
		  int lc, int encoding)
{
	for (; len > 0; l++, len--)
	{
		int			ch = *l;

		/* an embedded NUL byte is reported as invalid input */
		if (ch == 0)
			report_invalid_encoding(encoding, (const char *) l, len);

		/* high-bit characters are prefixed with the charset id byte */
		if (IS_HIGHBIT_SET(ch))
			*p++ = lc;
		*p++ = ch;
	}

	*p = '\0';
}
| |
| /* |
| * MIC ---> LATINn when the charset's local codes map directly to MIC |
| * |
| * mic points to the source string of length len |
| * p is the output area (must be large enough!) |
| * lc is the mule character set id for the local encoding |
| * encoding is the PG identifier for the local encoding |
| */ |
| void |
| mic2latin(const unsigned char *mic, unsigned char *p, int len, |
| int lc, int encoding) |
| { |
| int c1; |
| |
| while (len > 0) |
| { |
| c1 = *mic; |
| if (c1 == 0) |
| report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len); |
| if (!IS_HIGHBIT_SET(c1)) |
| { |
| /* easy for ASCII */ |
| *p++ = c1; |
| mic++; |
| len--; |
| } |
| else |
| { |
| int l = pg_mic_mblen(mic); |
| |
| if (len < l) |
| report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, |
| len); |
| if (l != 2 || c1 != lc || !IS_HIGHBIT_SET(mic[1])) |
| report_untranslatable_char(PG_MULE_INTERNAL, encoding, |
| (const char *) mic, len); |
| *p++ = mic[1]; |
| mic += 2; |
| len -= 2; |
| } |
| } |
| *p = '\0'; |
| } |
| |
| |
| /* |
| * ASCII ---> MIC |
| * |
| * While ordinarily SQL_ASCII encoding is forgiving of high-bit-set |
| * characters, here we must take a hard line because we don't know |
| * the appropriate MIC equivalent. |
| */ |
| void |
| pg_ascii2mic(const unsigned char *l, unsigned char *p, int len) |
| { |
| int c1; |
| |
| while (len > 0) |
| { |
| c1 = *l; |
| if (c1 == 0 || IS_HIGHBIT_SET(c1)) |
| report_invalid_encoding(PG_SQL_ASCII, (const char *) l, len); |
| *p++ = c1; |
| l++; |
| len--; |
| } |
| *p = '\0'; |
| } |
| |
| /* |
| * MIC ---> ASCII |
| */ |
| void |
| pg_mic2ascii(const unsigned char *mic, unsigned char *p, int len) |
| { |
| int c1; |
| |
| while (len > 0) |
| { |
| c1 = *mic; |
| if (c1 == 0 || IS_HIGHBIT_SET(c1)) |
| report_untranslatable_char(PG_MULE_INTERNAL, PG_SQL_ASCII, |
| (const char *) mic, len); |
| *p++ = c1; |
| mic++; |
| len--; |
| } |
| *p = '\0'; |
| } |
| |
| /* |
| * latin2mic_with_table: a generic single byte charset encoding |
| * conversion from a local charset to the mule internal code. |
| * |
| * l points to the source string of length len |
| * p is the output area (must be large enough!) |
| * lc is the mule character set id for the local encoding |
| * encoding is the PG identifier for the local encoding |
| * tab holds conversion entries for the local charset |
| * starting from 128 (0x80). each entry in the table |
| * holds the corresponding code point for the mule internal code. |
| */ |
| void |
| latin2mic_with_table(const unsigned char *l, |
| unsigned char *p, |
| int len, |
| int lc, |
| int encoding, |
| const unsigned char *tab) |
| { |
| unsigned char c1, |
| c2; |
| |
| while (len > 0) |
| { |
| c1 = *l; |
| if (c1 == 0) |
| report_invalid_encoding(encoding, (const char *) l, len); |
| if (!IS_HIGHBIT_SET(c1)) |
| *p++ = c1; |
| else |
| { |
| c2 = tab[c1 - HIGHBIT]; |
| if (c2) |
| { |
| *p++ = lc; |
| *p++ = c2; |
| } |
| else |
| report_untranslatable_char(encoding, PG_MULE_INTERNAL, |
| (const char *) l, len); |
| } |
| l++; |
| len--; |
| } |
| *p = '\0'; |
| } |
| |
| /* |
| * mic2latin_with_table: a generic single byte charset encoding |
| * conversion from the mule internal code to a local charset. |
| * |
| * mic points to the source string of length len |
| * p is the output area (must be large enough!) |
| * lc is the mule character set id for the local encoding |
| * encoding is the PG identifier for the local encoding |
| * tab holds conversion entries for the mule internal code's |
| * second byte, starting from 128 (0x80). each entry in the table |
| * holds the corresponding code point for the local charset. |
| */ |
| void |
| mic2latin_with_table(const unsigned char *mic, |
| unsigned char *p, |
| int len, |
| int lc, |
| int encoding, |
| const unsigned char *tab) |
| { |
| unsigned char c1, |
| c2; |
| |
| while (len > 0) |
| { |
| c1 = *mic; |
| if (c1 == 0) |
| report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len); |
| if (!IS_HIGHBIT_SET(c1)) |
| { |
| /* easy for ASCII */ |
| *p++ = c1; |
| mic++; |
| len--; |
| } |
| else |
| { |
| int l = pg_mic_mblen(mic); |
| |
| if (len < l) |
| report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, |
| len); |
| if (l != 2 || c1 != lc || !IS_HIGHBIT_SET(mic[1]) || |
| (c2 = tab[mic[1] - HIGHBIT]) == 0) |
| { |
| report_untranslatable_char(PG_MULE_INTERNAL, encoding, |
| (const char *) mic, len); |
| break; /* keep compiler quiet */ |
| } |
| *p++ = c2; |
| mic += 2; |
| len -= 2; |
| } |
| } |
| *p = '\0'; |
| } |
| |
| /* |
| * comparison routine for bsearch() |
| * this routine is intended for UTF8 -> local code |
| */ |
| static int |
| compare1(const void *p1, const void *p2) |
| { |
| uint32 v1, |
| v2; |
| |
| v1 = *(uint32 *) p1; |
| v2 = ((pg_utf_to_local *) p2)->utf; |
| return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1); |
| } |
| |
| /* |
| * comparison routine for bsearch() |
| * this routine is intended for local code -> UTF8 |
| */ |
| static int |
| compare2(const void *p1, const void *p2) |
| { |
| uint32 v1, |
| v2; |
| |
| v1 = *(uint32 *) p1; |
| v2 = ((pg_local_to_utf *) p2)->code; |
| return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1); |
| } |
| |
| /* |
| * comparison routine for bsearch() |
| * this routine is intended for combined UTF8 -> local code |
| */ |
| static int |
| compare3(const void *p1, const void *p2) |
| { |
| uint32 s1, |
| s2, |
| d1, |
| d2; |
| |
| s1 = *(uint32 *) p1; |
| s2 = *((uint32 *) p1 + 1); |
| d1 = ((pg_utf_to_local_combined *) p2)->utf1; |
| d2 = ((pg_utf_to_local_combined *) p2)->utf2; |
| return (s1 > d1 || (s1 == d1 && s2 > d2)) ? 1 : ((s1 == d1 && s2 == d2) ? 0 : -1); |
| } |
| |
| /* |
| * comparison routine for bsearch() |
| * this routine is intended for local code -> combined UTF8 |
| */ |
| static int |
| compare4(const void *p1, const void *p2) |
| { |
| uint32 v1, |
| v2; |
| |
| v1 = *(uint32 *) p1; |
| v2 = ((pg_local_to_utf_combined *) p2)->code; |
| return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1); |
| } |
| |
| /* |
| * convert 32bit wide character to mutibye stream pointed to by iso |
| */ |
| static unsigned char * |
| set_iso_code(unsigned char *iso, uint32 code) |
| { |
| if (code & 0xff000000) |
| *iso++ = code >> 24; |
| if (code & 0x00ff0000) |
| *iso++ = (code & 0x00ff0000) >> 16; |
| if (code & 0x0000ff00) |
| *iso++ = (code & 0x0000ff00) >> 8; |
| if (code & 0x000000ff) |
| *iso++ = code & 0x000000ff; |
| return iso; |
| } |
| |
| /* |
| * UTF8 ---> local code |
| * |
| * utf: input UTF8 string (need not be null-terminated). |
| * iso: pointer to the output area (must be large enough!) |
| * map: the conversion map. |
| * cmap: the conversion map for combined characters. |
| * (optional) |
| * size1: the size of the conversion map. |
| * size2: the size of the conversion map for combined characters |
| * (optional) |
| * encoding: the PG identifier for the local encoding. |
| * len: length of input string. |
| */ |
| void |
| UtfToLocal(const unsigned char *utf, unsigned char *iso, |
| const pg_utf_to_local *map, const pg_utf_to_local_combined *cmap, |
| int size1, int size2, int encoding, int len) |
| { |
| uint32 iutf; |
| uint32 cutf[2]; |
| uint32 code; |
| pg_utf_to_local *p; |
| pg_utf_to_local_combined *cp; |
| int l; |
| |
| for (; len > 0; len -= l) |
| { |
| /* "break" cases all represent errors */ |
| if (*utf == '\0') |
| break; |
| |
| l = pg_utf_mblen(utf); |
| |
| if (len < l) |
| break; |
| |
| if (!pg_utf8_islegal(utf, l)) |
| break; |
| |
| if (l == 1) |
| { |
| /* ASCII case is easy */ |
| *iso++ = *utf++; |
| continue; |
| } |
| else if (l == 2) |
| { |
| iutf = *utf++ << 8; |
| iutf |= *utf++; |
| } |
| else if (l == 3) |
| { |
| iutf = *utf++ << 16; |
| iutf |= *utf++ << 8; |
| iutf |= *utf++; |
| } |
| else if (l == 4) |
| { |
| iutf = *utf++ << 24; |
| iutf |= *utf++ << 16; |
| iutf |= *utf++ << 8; |
| iutf |= *utf++; |
| } |
| |
| /* |
| * first, try with combined map if possible |
| */ |
| if (cmap && len > l) |
| { |
| const unsigned char *utf_save = utf; |
| int len_save = len; |
| int l_save = l; |
| |
| len -= l; |
| |
| l = pg_utf_mblen(utf); |
| if (len < l) |
| break; |
| |
| if (!pg_utf8_islegal(utf, l)) |
| break; |
| |
| cutf[0] = iutf; |
| |
| if (l == 1) |
| { |
| if (len_save > 1) |
| { |
| p = bsearch(&cutf[0], map, size1, |
| sizeof(pg_utf_to_local), compare1); |
| if (p == NULL) |
| report_untranslatable_char(PG_UTF8, encoding, |
| (const char *) (utf_save - l_save), len_save); |
| iso = set_iso_code(iso, p->code); |
| } |
| |
| /* ASCII case is easy */ |
| *iso++ = *utf++; |
| continue; |
| } |
| else if (l == 2) |
| { |
| iutf = *utf++ << 8; |
| iutf |= *utf++; |
| } |
| else if (l == 3) |
| { |
| iutf = *utf++ << 16; |
| iutf |= *utf++ << 8; |
| iutf |= *utf++; |
| } |
| else if (l == 4) |
| { |
| iutf = *utf++ << 24; |
| iutf |= *utf++ << 16; |
| iutf |= *utf++ << 8; |
| iutf |= *utf++; |
| } |
| |
| cutf[1] = iutf; |
| cp = bsearch(cutf, cmap, size2, |
| sizeof(pg_utf_to_local_combined), compare3); |
| if (cp) |
| code = cp->code; |
| else |
| { |
| /* not found in combined map. try with ordinary map */ |
| p = bsearch(&cutf[0], map, size1, |
| sizeof(pg_utf_to_local), compare1); |
| if (p == NULL) |
| report_untranslatable_char(PG_UTF8, encoding, |
| (const char *) (utf_save - l_save), len_save); |
| iso = set_iso_code(iso, p->code); |
| |
| p = bsearch(&cutf[1], map, size1, |
| sizeof(pg_utf_to_local), compare1); |
| if (p == NULL) |
| report_untranslatable_char(PG_UTF8, encoding, |
| (const char *) (utf - l), len); |
| code = p->code; |
| } |
| } |
| else /* no cmap or no remaining data */ |
| { |
| p = bsearch(&iutf, map, size1, |
| sizeof(pg_utf_to_local), compare1); |
| if (p == NULL) |
| report_untranslatable_char(PG_UTF8, encoding, |
| (const char *) (utf - l), len); |
| code = p->code; |
| } |
| iso = set_iso_code(iso, code); |
| } |
| |
| if (len > 0) |
| report_invalid_encoding(PG_UTF8, (const char *) utf, len); |
| |
| *iso = '\0'; |
| } |
| |
| /* |
| * local code ---> UTF8 |
| * |
| * iso: input local string (need not be null-terminated). |
| * utf: pointer to the output area (must be large enough!) |
| * map: the conversion map. |
| * cmap: the conversion map for combined characters. |
| * (optional) |
| * size1: the size of the conversion map. |
| * size2: the size of the conversion map for combined characters |
| * (optional) |
| * encoding: the PG identifier for the local encoding. |
| * len: length of input string. |
| */ |
| void |
| LocalToUtf(const unsigned char *iso, unsigned char *utf, |
| const pg_local_to_utf *map, const pg_local_to_utf_combined *cmap, |
| int size1, int size2, int encoding, int len) |
| { |
| unsigned int iiso; |
| int l; |
| pg_local_to_utf *p; |
| pg_local_to_utf_combined *cp; |
| |
| if (!PG_VALID_ENCODING(encoding)) |
| ereport(ERROR, |
| (errcode(ERRCODE_INVALID_PARAMETER_VALUE), |
| errmsg("invalid encoding number: %d", encoding))); |
| |
| for (; len > 0; len -= l) |
| { |
| /* "break" cases all represent errors */ |
| if (*iso == '\0') |
| break; |
| |
| if (!IS_HIGHBIT_SET(*iso)) |
| { |
| /* ASCII case is easy */ |
| *utf++ = *iso++; |
| l = 1; |
| continue; |
| } |
| |
| l = pg_encoding_verifymb(encoding, (const char *) iso, len); |
| if (l < 0) |
| break; |
| |
| if (l == 1) |
| iiso = *iso++; |
| else if (l == 2) |
| { |
| iiso = *iso++ << 8; |
| iiso |= *iso++; |
| } |
| else if (l == 3) |
| { |
| iiso = *iso++ << 16; |
| iiso |= *iso++ << 8; |
| iiso |= *iso++; |
| } |
| else if (l == 4) |
| { |
| iiso = *iso++ << 24; |
| iiso |= *iso++ << 16; |
| iiso |= *iso++ << 8; |
| iiso |= *iso++; |
| } |
| |
| p = bsearch(&iiso, map, size1, |
| sizeof(pg_local_to_utf), compare2); |
| |
| if (p == NULL) |
| { |
| /* |
| * not found in the ordinary map. if there's a combined character |
| * map, try with it |
| */ |
| if (cmap) |
| { |
| cp = bsearch(&iiso, cmap, size2, |
| sizeof(pg_local_to_utf_combined), compare4); |
| |
| if (cp) |
| { |
| if (cp->utf1 & 0xff000000) |
| *utf++ = cp->utf1 >> 24; |
| if (cp->utf1 & 0x00ff0000) |
| *utf++ = (cp->utf1 & 0x00ff0000) >> 16; |
| if (cp->utf1 & 0x0000ff00) |
| *utf++ = (cp->utf1 & 0x0000ff00) >> 8; |
| if (cp->utf1 & 0x000000ff) |
| *utf++ = cp->utf1 & 0x000000ff; |
| |
| if (cp->utf2 & 0xff000000) |
| *utf++ = cp->utf2 >> 24; |
| if (cp->utf2 & 0x00ff0000) |
| *utf++ = (cp->utf2 & 0x00ff0000) >> 16; |
| if (cp->utf2 & 0x0000ff00) |
| *utf++ = (cp->utf2 & 0x0000ff00) >> 8; |
| if (cp->utf2 & 0x000000ff) |
| *utf++ = cp->utf2 & 0x000000ff; |
| |
| continue; |
| } |
| } |
| |
| report_untranslatable_char(encoding, PG_UTF8, |
| (const char *) (iso - l), len); |
| |
| } |
| else |
| { |
| if (p->utf & 0xff000000) |
| *utf++ = p->utf >> 24; |
| if (p->utf & 0x00ff0000) |
| *utf++ = (p->utf & 0x00ff0000) >> 16; |
| if (p->utf & 0x0000ff00) |
| *utf++ = (p->utf & 0x0000ff00) >> 8; |
| if (p->utf & 0x000000ff) |
| *utf++ = p->utf & 0x000000ff; |
| } |
| } |
| |
| if (len > 0) |
| report_invalid_encoding(encoding, (const char *) iso, len); |
| |
| *utf = '\0'; |
| } |