/*-------------------------------------------------------------------------
 *
 *    Utility functions for conversion procs.
 *
 * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *    src/backend/utils/mb/conv.c
 *
 *-------------------------------------------------------------------------
 */
| #include "postgres.h" |
| #include "mb/pg_wchar.h" |
| |
| |
| /* |
| * local2local: a generic single byte charset encoding |
| * conversion between two ASCII-superset encodings. |
| * |
| * l points to the source string of length len |
| * p is the output area (must be large enough!) |
| * src_encoding is the PG identifier for the source encoding |
| * dest_encoding is the PG identifier for the target encoding |
| * tab holds conversion entries for the source charset |
| * starting from 128 (0x80). each entry in the table holds the corresponding |
| * code point for the target charset, or 0 if there is no equivalent code. |
| * |
| * Returns the number of input bytes consumed. If noError is true, this can |
| * be less than 'len'. |
| */ |
| int |
| local2local(const unsigned char *l, |
| unsigned char *p, |
| int len, |
| int src_encoding, |
| int dest_encoding, |
| const unsigned char *tab, |
| bool noError) |
| { |
| const unsigned char *start = l; |
| unsigned char c1, |
| c2; |
| |
| while (len > 0) |
| { |
| c1 = *l; |
| if (c1 == 0) |
| { |
| if (noError) |
| break; |
| report_invalid_encoding(src_encoding, (const char *) l, len); |
| } |
| if (!IS_HIGHBIT_SET(c1)) |
| *p++ = c1; |
| else |
| { |
| c2 = tab[c1 - HIGHBIT]; |
| if (c2) |
| *p++ = c2; |
| else |
| { |
| if (noError) |
| break; |
| report_untranslatable_char(src_encoding, dest_encoding, |
| (const char *) l, len); |
| } |
| } |
| l++; |
| len--; |
| } |
| *p = '\0'; |
| |
| return l - start; |
| } |
| |
/*
 * LATINn ---> MIC, for charsets whose local codes are used unchanged as
 * the second byte of the MIC representation.
 *
 * l         source string, 'len' bytes long
 * p         output area (must be large enough!); it is NUL-terminated
 * len       number of source bytes
 * lc        mule character set id, emitted in front of every high-bit byte
 * encoding  PG identifier of the local encoding (used in error reports)
 * noError   if true, stop at the first problem byte instead of reporting
 *           an error
 *
 * A zero byte in the input is reported as invalid encoding.
 *
 * Returns the number of input bytes consumed.  If noError is true, this can
 * be less than 'len'.
 */
int
latin2mic(const unsigned char *l, unsigned char *p, int len,
          int lc, int encoding, bool noError)
{
    const unsigned char *const begin = l;

    for (; len > 0; l++, len--)
    {
        const int   ch = *l;

        if (ch == 0)
        {
            if (noError)
                break;
            report_invalid_encoding(encoding, (const char *) l, len);
        }

        /* non-ASCII bytes get the charset id as a prefix */
        if (IS_HIGHBIT_SET(ch))
            *p++ = lc;
        *p++ = ch;
    }
    *p = '\0';

    return l - begin;
}
| |
| /* |
| * MIC ---> LATINn when the charset's local codes map directly to MIC |
| * |
| * mic points to the source string of length len |
| * p is the output area (must be large enough!) |
| * lc is the mule character set id for the local encoding |
| * encoding is the PG identifier for the local encoding |
| * |
| * Returns the number of input bytes consumed. If noError is true, this can |
| * be less than 'len'. |
| */ |
| int |
| mic2latin(const unsigned char *mic, unsigned char *p, int len, |
| int lc, int encoding, bool noError) |
| { |
| const unsigned char *start = mic; |
| int c1; |
| |
| while (len > 0) |
| { |
| c1 = *mic; |
| if (c1 == 0) |
| { |
| if (noError) |
| break; |
| report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len); |
| } |
| if (!IS_HIGHBIT_SET(c1)) |
| { |
| /* easy for ASCII */ |
| *p++ = c1; |
| mic++; |
| len--; |
| } |
| else |
| { |
| int l = pg_mule_mblen(mic); |
| |
| if (len < l) |
| { |
| if (noError) |
| break; |
| report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, |
| len); |
| } |
| if (l != 2 || c1 != lc || !IS_HIGHBIT_SET(mic[1])) |
| { |
| if (noError) |
| break; |
| report_untranslatable_char(PG_MULE_INTERNAL, encoding, |
| (const char *) mic, len); |
| } |
| *p++ = mic[1]; |
| mic += 2; |
| len -= 2; |
| } |
| } |
| *p = '\0'; |
| |
| return mic - start; |
| } |
| |
| |
| /* |
| * latin2mic_with_table: a generic single byte charset encoding |
| * conversion from a local charset to the mule internal code. |
| * |
| * l points to the source string of length len |
| * p is the output area (must be large enough!) |
| * lc is the mule character set id for the local encoding |
| * encoding is the PG identifier for the local encoding |
| * tab holds conversion entries for the local charset |
| * starting from 128 (0x80). each entry in the table holds the corresponding |
| * code point for the mule encoding, or 0 if there is no equivalent code. |
| * |
| * Returns the number of input bytes consumed. If noError is true, this can |
| * be less than 'len'. |
| */ |
| int |
| latin2mic_with_table(const unsigned char *l, |
| unsigned char *p, |
| int len, |
| int lc, |
| int encoding, |
| const unsigned char *tab, |
| bool noError) |
| { |
| const unsigned char *start = l; |
| unsigned char c1, |
| c2; |
| |
| while (len > 0) |
| { |
| c1 = *l; |
| if (c1 == 0) |
| { |
| if (noError) |
| break; |
| report_invalid_encoding(encoding, (const char *) l, len); |
| } |
| if (!IS_HIGHBIT_SET(c1)) |
| *p++ = c1; |
| else |
| { |
| c2 = tab[c1 - HIGHBIT]; |
| if (c2) |
| { |
| *p++ = lc; |
| *p++ = c2; |
| } |
| else |
| { |
| if (noError) |
| break; |
| report_untranslatable_char(encoding, PG_MULE_INTERNAL, |
| (const char *) l, len); |
| } |
| } |
| l++; |
| len--; |
| } |
| *p = '\0'; |
| |
| return l - start; |
| } |
| |
| /* |
| * mic2latin_with_table: a generic single byte charset encoding |
| * conversion from the mule internal code to a local charset. |
| * |
| * mic points to the source string of length len |
| * p is the output area (must be large enough!) |
| * lc is the mule character set id for the local encoding |
| * encoding is the PG identifier for the local encoding |
| * tab holds conversion entries for the mule internal code's second byte, |
| * starting from 128 (0x80). each entry in the table holds the corresponding |
| * code point for the local charset, or 0 if there is no equivalent code. |
| * |
| * Returns the number of input bytes consumed. If noError is true, this can |
| * be less than 'len'. |
| */ |
| int |
| mic2latin_with_table(const unsigned char *mic, |
| unsigned char *p, |
| int len, |
| int lc, |
| int encoding, |
| const unsigned char *tab, |
| bool noError) |
| { |
| const unsigned char *start = mic; |
| unsigned char c1, |
| c2; |
| |
| while (len > 0) |
| { |
| c1 = *mic; |
| if (c1 == 0) |
| { |
| if (noError) |
| break; |
| report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len); |
| } |
| if (!IS_HIGHBIT_SET(c1)) |
| { |
| /* easy for ASCII */ |
| *p++ = c1; |
| mic++; |
| len--; |
| } |
| else |
| { |
| int l = pg_mule_mblen(mic); |
| |
| if (len < l) |
| { |
| if (noError) |
| break; |
| report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, |
| len); |
| } |
| if (l != 2 || c1 != lc || !IS_HIGHBIT_SET(mic[1]) || |
| (c2 = tab[mic[1] - HIGHBIT]) == 0) |
| { |
| if (noError) |
| break; |
| report_untranslatable_char(PG_MULE_INTERNAL, encoding, |
| (const char *) mic, len); |
| break; /* keep compiler quiet */ |
| } |
| *p++ = c2; |
| mic += 2; |
| len -= 2; |
| } |
| } |
| *p = '\0'; |
| |
| return mic - start; |
| } |
| |
| /* |
| * comparison routine for bsearch() |
| * this routine is intended for combined UTF8 -> local code |
| */ |
| static int |
| compare3(const void *p1, const void *p2) |
| { |
| uint32 s1, |
| s2, |
| d1, |
| d2; |
| |
| s1 = *(const uint32 *) p1; |
| s2 = *((const uint32 *) p1 + 1); |
| d1 = ((const pg_utf_to_local_combined *) p2)->utf1; |
| d2 = ((const pg_utf_to_local_combined *) p2)->utf2; |
| return (s1 > d1 || (s1 == d1 && s2 > d2)) ? 1 : ((s1 == d1 && s2 == d2) ? 0 : -1); |
| } |
| |
| /* |
| * comparison routine for bsearch() |
| * this routine is intended for local code -> combined UTF8 |
| */ |
| static int |
| compare4(const void *p1, const void *p2) |
| { |
| uint32 v1, |
| v2; |
| |
| v1 = *(const uint32 *) p1; |
| v2 = ((const pg_local_to_utf_combined *) p2)->code; |
| return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1); |
| } |
| |
| /* |
| * store 32bit character representation into multibyte stream |
| */ |
| static inline unsigned char * |
| store_coded_char(unsigned char *dest, uint32 code) |
| { |
| if (code & 0xff000000) |
| *dest++ = code >> 24; |
| if (code & 0x00ff0000) |
| *dest++ = code >> 16; |
| if (code & 0x0000ff00) |
| *dest++ = code >> 8; |
| if (code & 0x000000ff) |
| *dest++ = code; |
| return dest; |
| } |
| |
| /* |
| * Convert a character using a conversion radix tree. |
| * |
| * 'l' is the length of the input character in bytes, and b1-b4 are |
| * the input character's bytes. |
| */ |
| static inline uint32 |
| pg_mb_radix_conv(const pg_mb_radix_tree *rt, |
| int l, |
| unsigned char b1, |
| unsigned char b2, |
| unsigned char b3, |
| unsigned char b4) |
| { |
| if (l == 4) |
| { |
| /* 4-byte code */ |
| |
| /* check code validity */ |
| if (b1 < rt->b4_1_lower || b1 > rt->b4_1_upper || |
| b2 < rt->b4_2_lower || b2 > rt->b4_2_upper || |
| b3 < rt->b4_3_lower || b3 > rt->b4_3_upper || |
| b4 < rt->b4_4_lower || b4 > rt->b4_4_upper) |
| return 0; |
| |
| /* perform lookup */ |
| if (rt->chars32) |
| { |
| uint32 idx = rt->b4root; |
| |
| idx = rt->chars32[b1 + idx - rt->b4_1_lower]; |
| idx = rt->chars32[b2 + idx - rt->b4_2_lower]; |
| idx = rt->chars32[b3 + idx - rt->b4_3_lower]; |
| return rt->chars32[b4 + idx - rt->b4_4_lower]; |
| } |
| else |
| { |
| uint16 idx = rt->b4root; |
| |
| idx = rt->chars16[b1 + idx - rt->b4_1_lower]; |
| idx = rt->chars16[b2 + idx - rt->b4_2_lower]; |
| idx = rt->chars16[b3 + idx - rt->b4_3_lower]; |
| return rt->chars16[b4 + idx - rt->b4_4_lower]; |
| } |
| } |
| else if (l == 3) |
| { |
| /* 3-byte code */ |
| |
| /* check code validity */ |
| if (b2 < rt->b3_1_lower || b2 > rt->b3_1_upper || |
| b3 < rt->b3_2_lower || b3 > rt->b3_2_upper || |
| b4 < rt->b3_3_lower || b4 > rt->b3_3_upper) |
| return 0; |
| |
| /* perform lookup */ |
| if (rt->chars32) |
| { |
| uint32 idx = rt->b3root; |
| |
| idx = rt->chars32[b2 + idx - rt->b3_1_lower]; |
| idx = rt->chars32[b3 + idx - rt->b3_2_lower]; |
| return rt->chars32[b4 + idx - rt->b3_3_lower]; |
| } |
| else |
| { |
| uint16 idx = rt->b3root; |
| |
| idx = rt->chars16[b2 + idx - rt->b3_1_lower]; |
| idx = rt->chars16[b3 + idx - rt->b3_2_lower]; |
| return rt->chars16[b4 + idx - rt->b3_3_lower]; |
| } |
| } |
| else if (l == 2) |
| { |
| /* 2-byte code */ |
| |
| /* check code validity - first byte */ |
| if (b3 < rt->b2_1_lower || b3 > rt->b2_1_upper || |
| b4 < rt->b2_2_lower || b4 > rt->b2_2_upper) |
| return 0; |
| |
| /* perform lookup */ |
| if (rt->chars32) |
| { |
| uint32 idx = rt->b2root; |
| |
| idx = rt->chars32[b3 + idx - rt->b2_1_lower]; |
| return rt->chars32[b4 + idx - rt->b2_2_lower]; |
| } |
| else |
| { |
| uint16 idx = rt->b2root; |
| |
| idx = rt->chars16[b3 + idx - rt->b2_1_lower]; |
| return rt->chars16[b4 + idx - rt->b2_2_lower]; |
| } |
| } |
| else if (l == 1) |
| { |
| /* 1-byte code */ |
| |
| /* check code validity - first byte */ |
| if (b4 < rt->b1_lower || b4 > rt->b1_upper) |
| return 0; |
| |
| /* perform lookup */ |
| if (rt->chars32) |
| return rt->chars32[b4 + rt->b1root - rt->b1_lower]; |
| else |
| return rt->chars16[b4 + rt->b1root - rt->b1_lower]; |
| } |
| return 0; /* shouldn't happen */ |
| } |
| |
| /* |
| * UTF8 ---> local code |
| * |
| * utf: input string in UTF8 encoding (need not be null-terminated) |
| * len: length of input string (in bytes) |
| * iso: pointer to the output area (must be large enough!) |
| (output string will be null-terminated) |
| * map: conversion map for single characters |
| * cmap: conversion map for combined characters |
| * (optional, pass NULL if none) |
| * cmapsize: number of entries in the conversion map for combined characters |
| * (optional, pass 0 if none) |
| * conv_func: algorithmic encoding conversion function |
| * (optional, pass NULL if none) |
| * encoding: PG identifier for the local encoding |
| * |
| * For each character, the cmap (if provided) is consulted first; if no match, |
| * the map is consulted next; if still no match, the conv_func (if provided) |
| * is applied. An error is raised if no match is found. |
| * |
| * See pg_wchar.h for more details about the data structures used here. |
| * |
| * Returns the number of input bytes consumed. If noError is true, this can |
| * be less than 'len'. |
| */ |
| int |
| UtfToLocal(const unsigned char *utf, int len, |
| unsigned char *iso, |
| const pg_mb_radix_tree *map, |
| const pg_utf_to_local_combined *cmap, int cmapsize, |
| utf_local_conversion_func conv_func, |
| int encoding, bool noError) |
| { |
| uint32 iutf; |
| int l; |
| const pg_utf_to_local_combined *cp; |
| const unsigned char *start = utf; |
| |
| if (!PG_VALID_ENCODING(encoding)) |
| ereport(ERROR, |
| (errcode(ERRCODE_INVALID_PARAMETER_VALUE), |
| errmsg("invalid encoding number: %d", encoding))); |
| |
| for (; len > 0; len -= l) |
| { |
| unsigned char b1 = 0; |
| unsigned char b2 = 0; |
| unsigned char b3 = 0; |
| unsigned char b4 = 0; |
| |
| /* "break" cases all represent errors */ |
| if (*utf == '\0') |
| break; |
| |
| l = pg_utf_mblen(utf); |
| if (len < l) |
| break; |
| |
| if (!pg_utf8_islegal(utf, l)) |
| break; |
| |
| if (l == 1) |
| { |
| /* ASCII case is easy, assume it's one-to-one conversion */ |
| *iso++ = *utf++; |
| continue; |
| } |
| |
| /* collect coded char of length l */ |
| if (l == 2) |
| { |
| b3 = *utf++; |
| b4 = *utf++; |
| } |
| else if (l == 3) |
| { |
| b2 = *utf++; |
| b3 = *utf++; |
| b4 = *utf++; |
| } |
| else if (l == 4) |
| { |
| b1 = *utf++; |
| b2 = *utf++; |
| b3 = *utf++; |
| b4 = *utf++; |
| } |
| else |
| { |
| elog(ERROR, "unsupported character length %d", l); |
| iutf = 0; /* keep compiler quiet */ |
| } |
| iutf = (b1 << 24 | b2 << 16 | b3 << 8 | b4); |
| |
| /* First, try with combined map if possible */ |
| if (cmap && len > l) |
| { |
| const unsigned char *utf_save = utf; |
| int len_save = len; |
| int l_save = l; |
| |
| /* collect next character, same as above */ |
| len -= l; |
| |
| l = pg_utf_mblen(utf); |
| if (len < l) |
| { |
| /* need more data to decide if this is a combined char */ |
| utf -= l_save; |
| break; |
| } |
| |
| if (!pg_utf8_islegal(utf, l)) |
| { |
| if (!noError) |
| report_invalid_encoding(PG_UTF8, (const char *) utf, len); |
| utf -= l_save; |
| break; |
| } |
| |
| /* We assume ASCII character cannot be in combined map */ |
| if (l > 1) |
| { |
| uint32 iutf2; |
| uint32 cutf[2]; |
| |
| if (l == 2) |
| { |
| iutf2 = *utf++ << 8; |
| iutf2 |= *utf++; |
| } |
| else if (l == 3) |
| { |
| iutf2 = *utf++ << 16; |
| iutf2 |= *utf++ << 8; |
| iutf2 |= *utf++; |
| } |
| else if (l == 4) |
| { |
| iutf2 = *utf++ << 24; |
| iutf2 |= *utf++ << 16; |
| iutf2 |= *utf++ << 8; |
| iutf2 |= *utf++; |
| } |
| else |
| { |
| elog(ERROR, "unsupported character length %d", l); |
| iutf2 = 0; /* keep compiler quiet */ |
| } |
| |
| cutf[0] = iutf; |
| cutf[1] = iutf2; |
| |
| cp = bsearch(cutf, cmap, cmapsize, |
| sizeof(pg_utf_to_local_combined), compare3); |
| |
| if (cp) |
| { |
| iso = store_coded_char(iso, cp->code); |
| continue; |
| } |
| } |
| |
| /* fail, so back up to reprocess second character next time */ |
| utf = utf_save; |
| len = len_save; |
| l = l_save; |
| } |
| |
| /* Now check ordinary map */ |
| if (map) |
| { |
| uint32 converted = pg_mb_radix_conv(map, l, b1, b2, b3, b4); |
| |
| if (converted) |
| { |
| iso = store_coded_char(iso, converted); |
| continue; |
| } |
| } |
| |
| /* if there's a conversion function, try that */ |
| if (conv_func) |
| { |
| uint32 converted = (*conv_func) (iutf); |
| |
| if (converted) |
| { |
| iso = store_coded_char(iso, converted); |
| continue; |
| } |
| } |
| |
| /* failed to translate this character */ |
| utf -= l; |
| if (noError) |
| break; |
| report_untranslatable_char(PG_UTF8, encoding, |
| (const char *) utf, len); |
| } |
| |
| /* if we broke out of loop early, must be invalid input */ |
| if (len > 0 && !noError) |
| report_invalid_encoding(PG_UTF8, (const char *) utf, len); |
| |
| *iso = '\0'; |
| |
| return utf - start; |
| } |
| |
| /* |
| * local code ---> UTF8 |
| * |
| * iso: input string in local encoding (need not be null-terminated) |
| * len: length of input string (in bytes) |
| * utf: pointer to the output area (must be large enough!) |
| (output string will be null-terminated) |
| * map: conversion map for single characters |
| * cmap: conversion map for combined characters |
| * (optional, pass NULL if none) |
| * cmapsize: number of entries in the conversion map for combined characters |
| * (optional, pass 0 if none) |
| * conv_func: algorithmic encoding conversion function |
| * (optional, pass NULL if none) |
| * encoding: PG identifier for the local encoding |
| * |
| * For each character, the map is consulted first; if no match, the cmap |
| * (if provided) is consulted next; if still no match, the conv_func |
| * (if provided) is applied. An error is raised if no match is found. |
| * |
| * See pg_wchar.h for more details about the data structures used here. |
| * |
| * Returns the number of input bytes consumed. If noError is true, this can |
| * be less than 'len'. |
| */ |
| int |
| LocalToUtf(const unsigned char *iso, int len, |
| unsigned char *utf, |
| const pg_mb_radix_tree *map, |
| const pg_local_to_utf_combined *cmap, int cmapsize, |
| utf_local_conversion_func conv_func, |
| int encoding, |
| bool noError) |
| { |
| uint32 iiso; |
| int l; |
| const pg_local_to_utf_combined *cp; |
| const unsigned char *start = iso; |
| |
| if (!PG_VALID_ENCODING(encoding)) |
| ereport(ERROR, |
| (errcode(ERRCODE_INVALID_PARAMETER_VALUE), |
| errmsg("invalid encoding number: %d", encoding))); |
| |
| for (; len > 0; len -= l) |
| { |
| unsigned char b1 = 0; |
| unsigned char b2 = 0; |
| unsigned char b3 = 0; |
| unsigned char b4 = 0; |
| |
| /* "break" cases all represent errors */ |
| if (*iso == '\0') |
| break; |
| |
| if (!IS_HIGHBIT_SET(*iso)) |
| { |
| /* ASCII case is easy, assume it's one-to-one conversion */ |
| *utf++ = *iso++; |
| l = 1; |
| continue; |
| } |
| |
| l = pg_encoding_verifymbchar(encoding, (const char *) iso, len); |
| if (l < 0) |
| break; |
| |
| /* collect coded char of length l */ |
| if (l == 1) |
| b4 = *iso++; |
| else if (l == 2) |
| { |
| b3 = *iso++; |
| b4 = *iso++; |
| } |
| else if (l == 3) |
| { |
| b2 = *iso++; |
| b3 = *iso++; |
| b4 = *iso++; |
| } |
| else if (l == 4) |
| { |
| b1 = *iso++; |
| b2 = *iso++; |
| b3 = *iso++; |
| b4 = *iso++; |
| } |
| else |
| { |
| elog(ERROR, "unsupported character length %d", l); |
| iiso = 0; /* keep compiler quiet */ |
| } |
| iiso = (b1 << 24 | b2 << 16 | b3 << 8 | b4); |
| |
| if (map) |
| { |
| uint32 converted = pg_mb_radix_conv(map, l, b1, b2, b3, b4); |
| |
| if (converted) |
| { |
| utf = store_coded_char(utf, converted); |
| continue; |
| } |
| |
| /* If there's a combined character map, try that */ |
| if (cmap) |
| { |
| cp = bsearch(&iiso, cmap, cmapsize, |
| sizeof(pg_local_to_utf_combined), compare4); |
| |
| if (cp) |
| { |
| utf = store_coded_char(utf, cp->utf1); |
| utf = store_coded_char(utf, cp->utf2); |
| continue; |
| } |
| } |
| } |
| |
| /* if there's a conversion function, try that */ |
| if (conv_func) |
| { |
| uint32 converted = (*conv_func) (iiso); |
| |
| if (converted) |
| { |
| utf = store_coded_char(utf, converted); |
| continue; |
| } |
| } |
| |
| /* failed to translate this character */ |
| iso -= l; |
| if (noError) |
| break; |
| report_untranslatable_char(encoding, PG_UTF8, |
| (const char *) iso, len); |
| } |
| |
| /* if we broke out of loop early, must be invalid input */ |
| if (len > 0 && !noError) |
| report_invalid_encoding(encoding, (const char *) iso, len); |
| |
| *utf = '\0'; |
| |
| return iso - start; |
| } |