| /*------------------------------------------------------------------------- |
| * unicode_norm.c |
| * Normalize a Unicode string |
| * |
| * This implements Unicode normalization, per the documentation at |
| * https://www.unicode.org/reports/tr15/. |
| * |
| * Portions Copyright (c) 2017-2023, PostgreSQL Global Development Group |
| * |
| * IDENTIFICATION |
| * src/common/unicode_norm.c |
| * |
| *------------------------------------------------------------------------- |
| */ |
| #ifndef FRONTEND |
| #include "postgres.h" |
| #else |
| #include "postgres_fe.h" |
| #endif |
| |
| #include "common/unicode_norm.h" |
| #ifndef FRONTEND |
| #include "common/unicode_norm_hashfunc.h" |
| #include "common/unicode_normprops_table.h" |
| #include "port/pg_bswap.h" |
| #else |
| #include "common/unicode_norm_table.h" |
| #endif |
| |
| #ifndef FRONTEND |
| #define ALLOC(size) palloc(size) |
| #define FREE(size) pfree(size) |
| #else |
| #define ALLOC(size) malloc(size) |
| #define FREE(size) free(size) |
| #endif |
| |
| /* Constants for calculations with Hangul characters */ |
| #define SBASE 0xAC00 /* U+AC00 */ |
| #define LBASE 0x1100 /* U+1100 */ |
| #define VBASE 0x1161 /* U+1161 */ |
| #define TBASE 0x11A7 /* U+11A7 */ |
| #define LCOUNT 19 |
| #define VCOUNT 21 |
| #define TCOUNT 28 |
| #define NCOUNT VCOUNT * TCOUNT |
| #define SCOUNT LCOUNT * NCOUNT |
| |
#ifdef FRONTEND
/*
 * Comparison routine for bsearch() over the decomposition lookup table.
 * The search key is a plain codepoint; table members are
 * pg_unicode_decomposition entries ordered by codepoint.
 */
static int
conv_compare(const void *p1, const void *p2)
{
	uint32		key = *(const uint32 *) p1;
	uint32		member = ((const pg_unicode_decomposition *) p2)->codepoint;

	if (key < member)
		return -1;
	if (key > member)
		return 1;
	return 0;
}

#endif
| |
| /* |
| * get_code_entry |
| * |
| * Get the entry corresponding to code in the decomposition lookup table. |
| * The backend version of this code uses a perfect hash function for the |
| * lookup, while the frontend version uses a binary search. |
| */ |
| static const pg_unicode_decomposition * |
| get_code_entry(pg_wchar code) |
| { |
| #ifndef FRONTEND |
| int h; |
| uint32 hashkey; |
| pg_unicode_decompinfo decompinfo = UnicodeDecompInfo; |
| |
| /* |
| * Compute the hash function. The hash key is the codepoint with the bytes |
| * in network order. |
| */ |
| hashkey = pg_hton32(code); |
| h = decompinfo.hash(&hashkey); |
| |
| /* An out-of-range result implies no match */ |
| if (h < 0 || h >= decompinfo.num_decomps) |
| return NULL; |
| |
| /* |
| * Since it's a perfect hash, we need only match to the specific codepoint |
| * it identifies. |
| */ |
| if (code != decompinfo.decomps[h].codepoint) |
| return NULL; |
| |
| /* Success! */ |
| return &decompinfo.decomps[h]; |
| #else |
| return bsearch(&(code), |
| UnicodeDecompMain, |
| lengthof(UnicodeDecompMain), |
| sizeof(pg_unicode_decomposition), |
| conv_compare); |
| #endif |
| } |
| |
| /* |
| * Get the combining class of the given codepoint. |
| */ |
| static uint8 |
| get_canonical_class(pg_wchar code) |
| { |
| const pg_unicode_decomposition *entry = get_code_entry(code); |
| |
| /* |
| * If no entries are found, the character used is either an Hangul |
| * character or a character with a class of 0 and no decompositions. |
| */ |
| if (!entry) |
| return 0; |
| else |
| return entry->comb_class; |
| } |
| |
| /* |
| * Given a decomposition entry looked up earlier, get the decomposed |
| * characters. |
| * |
| * Note: the returned pointer can point to statically allocated buffer, and |
| * is only valid until next call to this function! |
| */ |
| static const pg_wchar * |
| get_code_decomposition(const pg_unicode_decomposition *entry, int *dec_size) |
| { |
| static pg_wchar x; |
| |
| if (DECOMPOSITION_IS_INLINE(entry)) |
| { |
| Assert(DECOMPOSITION_SIZE(entry) == 1); |
| x = (pg_wchar) entry->dec_index; |
| *dec_size = 1; |
| return &x; |
| } |
| else |
| { |
| *dec_size = DECOMPOSITION_SIZE(entry); |
| return &UnicodeDecomp_codepoints[entry->dec_index]; |
| } |
| } |
| |
| /* |
| * Calculate how many characters a given character will decompose to. |
| * |
| * This needs to recurse, if the character decomposes into characters that |
| * are, in turn, decomposable. |
| */ |
| static int |
| get_decomposed_size(pg_wchar code, bool compat) |
| { |
| const pg_unicode_decomposition *entry; |
| int size = 0; |
| int i; |
| const uint32 *decomp; |
| int dec_size; |
| |
| /* |
| * Fast path for Hangul characters not stored in tables to save memory as |
| * decomposition is algorithmic. See |
| * https://www.unicode.org/reports/tr15/tr15-18.html, annex 10 for details |
| * on the matter. |
| */ |
| if (code >= SBASE && code < SBASE + SCOUNT) |
| { |
| uint32 tindex, |
| sindex; |
| |
| sindex = code - SBASE; |
| tindex = sindex % TCOUNT; |
| |
| if (tindex != 0) |
| return 3; |
| return 2; |
| } |
| |
| entry = get_code_entry(code); |
| |
| /* |
| * Just count current code if no other decompositions. A NULL entry is |
| * equivalent to a character with class 0 and no decompositions. |
| */ |
| if (entry == NULL || DECOMPOSITION_SIZE(entry) == 0 || |
| (!compat && DECOMPOSITION_IS_COMPAT(entry))) |
| return 1; |
| |
| /* |
| * If this entry has other decomposition codes look at them as well. First |
| * get its decomposition in the list of tables available. |
| */ |
| decomp = get_code_decomposition(entry, &dec_size); |
| for (i = 0; i < dec_size; i++) |
| { |
| uint32 lcode = decomp[i]; |
| |
| size += get_decomposed_size(lcode, compat); |
| } |
| |
| return size; |
| } |
| |
| /* |
| * Recompose a set of characters. For hangul characters, the calculation |
| * is algorithmic. For others, an inverse lookup at the decomposition |
| * table is necessary. Returns true if a recomposition can be done, and |
| * false otherwise. |
| */ |
/*
 * Recompose a set of characters. For hangul characters, the calculation
 * is algorithmic. For others, an inverse lookup at the decomposition
 * table is necessary. Returns true if a recomposition can be done, and
 * false otherwise.
 *
 * On success, the composed codepoint is stored in *result; on failure,
 * *result is left untouched.
 */
static bool
recompose_code(uint32 start, uint32 code, uint32 *result)
{
	/*
	 * Handle Hangul characters algorithmically, per the Unicode spec.
	 *
	 * Check if two current characters are L and V.
	 */
	if (start >= LBASE && start < LBASE + LCOUNT &&
		code >= VBASE && code < VBASE + VCOUNT)
	{
		/* make syllable of form LV */
		uint32		lindex = start - LBASE;
		uint32		vindex = code - VBASE;

		*result = SBASE + (lindex * VCOUNT + vindex) * TCOUNT;
		return true;
	}
	/*
	 * Check if two current characters are LV and T.  An LV syllable is
	 * recognizable as a precomposed syllable whose offset from SBASE is a
	 * multiple of TCOUNT (i.e. it has no trailing consonant yet).
	 */
	else if (start >= SBASE && start < (SBASE + SCOUNT) &&
			 ((start - SBASE) % TCOUNT) == 0 &&
			 code >= TBASE && code < (TBASE + TCOUNT))
	{
		/* make syllable of form LVT */
		uint32		tindex = code - TBASE;

		*result = start + tindex;
		return true;
	}
	else
	{
		const pg_unicode_decomposition *entry;

		/*
		 * Do an inverse lookup of the decomposition tables to see if anything
		 * matches. The comparison just needs to be a perfect match on the
		 * sub-table of size two, because the start character has already been
		 * recomposed partially. This lookup uses a perfect hash function for
		 * the backend code.
		 */
#ifndef FRONTEND

		int			h,
					inv_lookup_index;
		uint64		hashkey;
		pg_unicode_recompinfo recompinfo = UnicodeRecompInfo;

		/*
		 * Compute the hash function. The hash key is formed by concatenating
		 * bytes of the two codepoints in network order. See also
		 * src/common/unicode/generate-unicode_norm_table.pl.
		 */
		hashkey = pg_hton64(((uint64) start << 32) | (uint64) code);
		h = recompinfo.hash(&hashkey);

		/* An out-of-range result implies no match */
		if (h < 0 || h >= recompinfo.num_recomps)
			return false;

		inv_lookup_index = recompinfo.inverse_lookup[h];
		entry = &UnicodeDecompMain[inv_lookup_index];

		/*
		 * The hash is perfect over composable pairs, so a single comparison
		 * of both source codepoints confirms or rejects the candidate.
		 */
		if (start == UnicodeDecomp_codepoints[entry->dec_index] &&
			code == UnicodeDecomp_codepoints[entry->dec_index + 1])
		{
			*result = entry->codepoint;
			return true;
		}

#else

		int			i;

		/*
		 * Frontend fallback: linear scan of the decomposition table looking
		 * for an entry whose two-codepoint decomposition is exactly (start,
		 * code).  Entries excluded from composition (e.g. singleton or
		 * non-starter decompositions) are skipped.
		 */
		for (i = 0; i < lengthof(UnicodeDecompMain); i++)
		{
			entry = &UnicodeDecompMain[i];

			if (DECOMPOSITION_SIZE(entry) != 2)
				continue;

			if (DECOMPOSITION_NO_COMPOSE(entry))
				continue;

			if (start == UnicodeDecomp_codepoints[entry->dec_index] &&
				code == UnicodeDecomp_codepoints[entry->dec_index + 1])
			{
				*result = entry->codepoint;
				return true;
			}
		}
#endif							/* !FRONTEND */
	}

	return false;
}
| |
| /* |
| * Decompose the given code into the array given by caller. The |
| * decomposition begins at the position given by caller, saving one |
| * lookup on the decomposition table. The current position needs to be |
| * updated here to let the caller know from where to continue filling |
| * in the array result. |
| */ |
| static void |
| decompose_code(pg_wchar code, bool compat, pg_wchar **result, int *current) |
| { |
| const pg_unicode_decomposition *entry; |
| int i; |
| const uint32 *decomp; |
| int dec_size; |
| |
| /* |
| * Fast path for Hangul characters not stored in tables to save memory as |
| * decomposition is algorithmic. See |
| * https://www.unicode.org/reports/tr15/tr15-18.html, annex 10 for details |
| * on the matter. |
| */ |
| if (code >= SBASE && code < SBASE + SCOUNT) |
| { |
| uint32 l, |
| v, |
| tindex, |
| sindex; |
| pg_wchar *res = *result; |
| |
| sindex = code - SBASE; |
| l = LBASE + sindex / (VCOUNT * TCOUNT); |
| v = VBASE + (sindex % (VCOUNT * TCOUNT)) / TCOUNT; |
| tindex = sindex % TCOUNT; |
| |
| res[*current] = l; |
| (*current)++; |
| res[*current] = v; |
| (*current)++; |
| |
| if (tindex != 0) |
| { |
| res[*current] = TBASE + tindex; |
| (*current)++; |
| } |
| |
| return; |
| } |
| |
| entry = get_code_entry(code); |
| |
| /* |
| * Just fill in with the current decomposition if there are no |
| * decomposition codes to recurse to. A NULL entry is equivalent to a |
| * character with class 0 and no decompositions, so just leave also in |
| * this case. |
| */ |
| if (entry == NULL || DECOMPOSITION_SIZE(entry) == 0 || |
| (!compat && DECOMPOSITION_IS_COMPAT(entry))) |
| { |
| pg_wchar *res = *result; |
| |
| res[*current] = code; |
| (*current)++; |
| return; |
| } |
| |
| /* |
| * If this entry has other decomposition codes look at them as well. |
| */ |
| decomp = get_code_decomposition(entry, &dec_size); |
| for (i = 0; i < dec_size; i++) |
| { |
| pg_wchar lcode = (pg_wchar) decomp[i]; |
| |
| /* Leave if no more decompositions */ |
| decompose_code(lcode, compat, result, current); |
| } |
| } |
| |
| /* |
| * unicode_normalize - Normalize a Unicode string to the specified form. |
| * |
| * The input is a 0-terminated array of codepoints. |
| * |
| * In frontend, returns a 0-terminated array of codepoints, allocated with |
| * malloc. Or NULL if we run out of memory. In backend, the returned |
| * string is palloc'd instead, and OOM is reported with ereport(). |
| */ |
/*
 * unicode_normalize - Normalize a Unicode string to the specified form.
 *
 * The input is a 0-terminated array of codepoints.
 *
 * In frontend, returns a 0-terminated array of codepoints, allocated with
 * malloc. Or NULL if we run out of memory. In backend, the returned
 * string is palloc'd instead, and OOM is reported with ereport().
 *
 * The algorithm is the standard three-phase one: (1) full canonical or
 * compatibility decomposition, (2) canonical reordering of combining
 * marks, (3) for NFC/NFKC only, canonical recomposition.
 */
pg_wchar *
unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input)
{
	bool		compat = (form == UNICODE_NFKC || form == UNICODE_NFKD);
	bool		recompose = (form == UNICODE_NFC || form == UNICODE_NFKC);
	pg_wchar   *decomp_chars;
	pg_wchar   *recomp_chars;
	int			decomp_size,
				current_size;
	int			count;
	const pg_wchar *p;

	/* variables for recomposition */
	int			last_class;		/* combining class of last char appended */
	int			starter_pos;	/* output index of the current starter */
	int			target_pos;		/* next free output index */
	uint32		starter_ch;		/* current starter codepoint */

	/* First, do character decomposition */

	/*
	 * Calculate how many characters long the decomposed version will be.
	 * This is a separate pre-pass so the output can be allocated exactly
	 * once.
	 */
	decomp_size = 0;
	for (p = input; *p; p++)
		decomp_size += get_decomposed_size(*p, compat);

	decomp_chars = (pg_wchar *) ALLOC((decomp_size + 1) * sizeof(pg_wchar));
	if (decomp_chars == NULL)
		return NULL;

	/*
	 * Now fill in each entry recursively. This needs a second pass on the
	 * decomposition table.
	 */
	current_size = 0;
	for (p = input; *p; p++)
		decompose_code(*p, compat, &decomp_chars, &current_size);
	decomp_chars[decomp_size] = '\0';
	Assert(decomp_size == current_size);

	/* Leave if there is nothing to decompose */
	if (decomp_size == 0)
		return decomp_chars;

	/*
	 * Now apply canonical ordering: a bubble sort over adjacent pairs of
	 * combining characters, with backtracking after each swap so that a
	 * character can percolate as far left as its class requires.
	 */
	for (count = 1; count < decomp_size; count++)
	{
		pg_wchar	prev = decomp_chars[count - 1];
		pg_wchar	next = decomp_chars[count];
		pg_wchar	tmp;
		const uint8 prevClass = get_canonical_class(prev);
		const uint8 nextClass = get_canonical_class(next);

		/*
		 * Per Unicode (https://www.unicode.org/reports/tr15/tr15-18.html)
		 * annex 4, a sequence of two adjacent characters in a string is an
		 * exchangeable pair if the combining class (from the Unicode
		 * Character Database) for the first character is greater than the
		 * combining class for the second, and the second is not a starter. A
		 * character is a starter if its combining class is 0.
		 */
		if (prevClass == 0 || nextClass == 0)
			continue;

		if (prevClass <= nextClass)
			continue;

		/* exchange can happen */
		tmp = decomp_chars[count - 1];
		decomp_chars[count - 1] = decomp_chars[count];
		decomp_chars[count] = tmp;

		/* backtrack to check again: after count++ we re-examine count-1 */
		if (count > 1)
			count -= 2;
	}

	if (!recompose)
		return decomp_chars;

	/*
	 * The last phase of NFC and NFKC is the recomposition of the reordered
	 * Unicode string using combining classes. The recomposed string cannot be
	 * longer than the decomposed one, so make the allocation of the output
	 * string based on that assumption.
	 */
	recomp_chars = (pg_wchar *) ALLOC((decomp_size + 1) * sizeof(pg_wchar));
	if (!recomp_chars)
	{
		FREE(decomp_chars);
		return NULL;
	}

	last_class = -1;			/* this eliminates a special check */
	starter_pos = 0;
	target_pos = 1;
	starter_ch = recomp_chars[0] = decomp_chars[0];

	for (count = 1; count < decomp_size; count++)
	{
		pg_wchar	ch = decomp_chars[count];
		int			ch_class = get_canonical_class(ch);
		pg_wchar	composite;

		/*
		 * A character combines with the current starter only if it is not
		 * "blocked", i.e. no character of equal or higher class has appeared
		 * since the starter, and the pair actually composes.
		 */
		if (last_class < ch_class &&
			recompose_code(starter_ch, ch, &composite))
		{
			/* replace the starter in place with the composed character */
			recomp_chars[starter_pos] = composite;
			starter_ch = composite;
		}
		else if (ch_class == 0)
		{
			/* new starter: reset the recomposition state */
			starter_pos = target_pos;
			starter_ch = ch;
			last_class = -1;
			recomp_chars[target_pos++] = ch;
		}
		else
		{
			/* uncombinable combining mark: copy it and record its class */
			last_class = ch_class;
			recomp_chars[target_pos++] = ch;
		}
	}
	recomp_chars[target_pos] = (pg_wchar) '\0';

	FREE(decomp_chars);

	return recomp_chars;
}
| |
| /* |
| * Normalization "quick check" algorithm; see |
| * <http://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms> |
| */ |
| |
| /* We only need this in the backend. */ |
| #ifndef FRONTEND |
| |
| static const pg_unicode_normprops * |
| qc_hash_lookup(pg_wchar ch, const pg_unicode_norminfo *norminfo) |
| { |
| int h; |
| uint32 hashkey; |
| |
| /* |
| * Compute the hash function. The hash key is the codepoint with the bytes |
| * in network order. |
| */ |
| hashkey = pg_hton32(ch); |
| h = norminfo->hash(&hashkey); |
| |
| /* An out-of-range result implies no match */ |
| if (h < 0 || h >= norminfo->num_normprops) |
| return NULL; |
| |
| /* |
| * Since it's a perfect hash, we need only match to the specific codepoint |
| * it identifies. |
| */ |
| if (ch != norminfo->normprops[h].codepoint) |
| return NULL; |
| |
| /* Success! */ |
| return &norminfo->normprops[h]; |
| } |
| |
| /* |
| * Look up the normalization quick check character property |
| */ |
| static UnicodeNormalizationQC |
| qc_is_allowed(UnicodeNormalizationForm form, pg_wchar ch) |
| { |
| const pg_unicode_normprops *found = NULL; |
| |
| switch (form) |
| { |
| case UNICODE_NFC: |
| found = qc_hash_lookup(ch, &UnicodeNormInfo_NFC_QC); |
| break; |
| case UNICODE_NFKC: |
| found = qc_hash_lookup(ch, &UnicodeNormInfo_NFKC_QC); |
| break; |
| default: |
| Assert(false); |
| break; |
| } |
| |
| if (found) |
| return found->quickcheck; |
| else |
| return UNICODE_NORM_QC_YES; |
| } |
| |
| UnicodeNormalizationQC |
| unicode_is_normalized_quickcheck(UnicodeNormalizationForm form, const pg_wchar *input) |
| { |
| uint8 lastCanonicalClass = 0; |
| UnicodeNormalizationQC result = UNICODE_NORM_QC_YES; |
| |
| /* |
| * For the "D" forms, we don't run the quickcheck. We don't include the |
| * lookup tables for those because they are huge, checking for these |
| * particular forms is less common, and running the slow path is faster |
| * for the "D" forms than the "C" forms because you don't need to |
| * recompose, which is slow. |
| */ |
| if (form == UNICODE_NFD || form == UNICODE_NFKD) |
| return UNICODE_NORM_QC_MAYBE; |
| |
| for (const pg_wchar *p = input; *p; p++) |
| { |
| pg_wchar ch = *p; |
| uint8 canonicalClass; |
| UnicodeNormalizationQC check; |
| |
| canonicalClass = get_canonical_class(ch); |
| if (lastCanonicalClass > canonicalClass && canonicalClass != 0) |
| return UNICODE_NORM_QC_NO; |
| |
| check = qc_is_allowed(form, ch); |
| if (check == UNICODE_NORM_QC_NO) |
| return UNICODE_NORM_QC_NO; |
| else if (check == UNICODE_NORM_QC_MAYBE) |
| result = UNICODE_NORM_QC_MAYBE; |
| |
| lastCanonicalClass = canonicalClass; |
| } |
| return result; |
| } |
| |
| #endif /* !FRONTEND */ |