| /* |
| * Daitch-Mokotoff Soundex |
| * |
| * Copyright (c) 2023, PostgreSQL Global Development Group |
| * |
| * This module was originally sponsored by Finance Norway / |
| * Trafikkforsikringsforeningen, and implemented by Dag Lem <dag@nimrod.no> |
| * |
| * The implementation of the Daitch-Mokotoff Soundex System aims at correctness |
| * and high performance, and can be summarized as follows: |
| * |
| * - The processing of each phoneme is initiated by an O(1) table lookup. |
| * - For phonemes containing more than one character, a coding tree is traversed |
| * to process the complete phoneme. |
| * - The (alternate) soundex codes are produced digit by digit in-place in |
| * another tree structure. |
| * |
| * References: |
| * |
| * https://www.avotaynu.com/soundex.htm |
| * https://www.jewishgen.org/InfoFiles/Soundex.html |
| * https://familypedia.fandom.com/wiki/Daitch-Mokotoff_Soundex |
| * https://stevemorse.org/census/soundex.html (dmlat.php, dmsoundex.php) |
| * https://github.com/apache/commons-codec/ (dmrules.txt, DaitchMokotoffSoundex.java) |
| * https://metacpan.org/pod/Text::Phonetic (DaitchMokotoff.pm) |
| * |
| * A few notes on other implementations: |
| * |
| * - All other known implementations have the same unofficial rules for "UE", |
| * these are also adapted by this implementation (0, 1, NC). |
| * - The only other known implementation which is capable of generating all |
| * correct soundex codes in all cases is the JOS Soundex Calculator at |
| * https://www.jewishgen.org/jos/jossound.htm |
| * - "J" is considered (only) a vowel in dmlat.php |
| * - The official rules for "RS" are commented out in dmlat.php |
| * - Identical code digits for adjacent letters are not collapsed correctly in |
| * dmsoundex.php when double digit codes are involved. E.g. "BESST" yields |
| * 744300 instead of 743000 as for "BEST". |
| * - "J" is considered (only) a consonant in DaitchMokotoffSoundex.java |
| * - "Y" is not considered a vowel in DaitchMokotoffSoundex.java |
| */ |
| |
| #include "postgres.h" |
| |
| #include "catalog/pg_type.h" |
| #include "mb/pg_wchar.h" |
| #include "utils/array.h" |
| #include "utils/builtins.h" |
| #include "utils/memutils.h" |
| |
| |
| /* |
| * The soundex coding chart table is adapted from |
| * https://www.jewishgen.org/InfoFiles/Soundex.html |
| * See daitch_mokotoff_header.pl for details. |
| */ |
| |
| /* Generated coding chart table */ |
| #include "daitch_mokotoff.h" |
| |
| #define DM_CODE_DIGITS 6 |
| |
| /* Node in soundex code tree */ |
| typedef struct dm_node |
| { |
| int soundex_length; /* Length of generated soundex code */ |
| char soundex[DM_CODE_DIGITS]; /* Soundex code */ |
| int is_leaf; /* Candidate for complete soundex code */ |
| int last_update; /* Letter number for last update of node */ |
| char code_digit; /* Last code digit, 0 - 9 */ |
| |
| /* |
| * One or two alternate code digits leading to this node. If there are two |
| * digits, one of them is always an 'X'. Repeated code digits and 'X' lead |
| * back to the same node. |
| */ |
| char prev_code_digits[2]; |
| /* One or two alternate code digits moving forward. */ |
| char next_code_digits[2]; |
| /* ORed together code index(es) used to reach current node. */ |
| int prev_code_index; |
| int next_code_index; |
| /* Possible nodes branching out from this node - digits 0-9. */ |
| struct dm_node *children[10]; |
| /* Next node in linked list. Alternating index for each iteration. */ |
| struct dm_node *next[2]; |
| } dm_node; |
| |
| /* Template for new node in soundex code tree. */ |
| static const dm_node start_node = { |
| .soundex_length = 0, |
| .soundex = "000000", /* Six digits */ |
| .is_leaf = 0, |
| .last_update = 0, |
| .code_digit = '\0', |
| .prev_code_digits = {'\0', '\0'}, |
| .next_code_digits = {'\0', '\0'}, |
| .prev_code_index = 0, |
| .next_code_index = 0, |
| .children = {NULL}, |
| .next = {NULL} |
| }; |
| |
| /* Dummy soundex codes at end of input. */ |
| static const dm_codes end_codes[2] = |
| { |
| { |
| "X", "X", "X" |
| } |
| }; |
| |
| /* Mapping from ISO8859-1 to upper-case ASCII, covering the range 0x60..0xFF. */ |
| static const char iso8859_1_to_ascii_upper[] = |
| "`ABCDEFGHIJKLMNOPQRSTUVWXYZ{|}~ ! ?AAAAAAECEEEEIIIIDNOOOOO*OUUUUYDSAAAAAAECEEEEIIIIDNOOOOO/OUUUUYDY"; |
| |
| /* Internal C implementation */ |
| static bool daitch_mokotoff_coding(const char *word, ArrayBuildState *soundex); |
| |
| |
| PG_FUNCTION_INFO_V1(daitch_mokotoff); |
| |
| Datum |
| daitch_mokotoff(PG_FUNCTION_ARGS) |
| { |
| text *arg = PG_GETARG_TEXT_PP(0); |
| Datum retval; |
| char *string; |
| ArrayBuildState *soundex; |
| MemoryContext old_ctx, |
| tmp_ctx; |
| |
| /* Work in a temporary context to simplify cleanup. */ |
| tmp_ctx = AllocSetContextCreate(CurrentMemoryContext, |
| "daitch_mokotoff temporary context", |
| ALLOCSET_DEFAULT_SIZES); |
| old_ctx = MemoryContextSwitchTo(tmp_ctx); |
| |
| /* We must convert the string to UTF-8 if it isn't already. */ |
| string = pg_server_to_any(text_to_cstring(arg), VARSIZE_ANY_EXHDR(arg), |
| PG_UTF8); |
| |
| /* The result is built in this ArrayBuildState. */ |
| soundex = initArrayResult(TEXTOID, tmp_ctx, false); |
| |
| if (!daitch_mokotoff_coding(string, soundex)) |
| { |
| /* No encodable characters in input */ |
| MemoryContextSwitchTo(old_ctx); |
| MemoryContextDelete(tmp_ctx); |
| PG_RETURN_NULL(); |
| } |
| |
| retval = makeArrayResult(soundex, old_ctx); |
| |
| MemoryContextSwitchTo(old_ctx); |
| MemoryContextDelete(tmp_ctx); |
| |
| PG_RETURN_DATUM(retval); |
| } |
| |
| |
| /* Initialize soundex code tree node for next code digit. */ |
| static void |
| initialize_node(dm_node *node, int last_update) |
| { |
| if (node->last_update < last_update) |
| { |
| node->prev_code_digits[0] = node->next_code_digits[0]; |
| node->prev_code_digits[1] = node->next_code_digits[1]; |
| node->next_code_digits[0] = '\0'; |
| node->next_code_digits[1] = '\0'; |
| node->prev_code_index = node->next_code_index; |
| node->next_code_index = 0; |
| node->is_leaf = 0; |
| node->last_update = last_update; |
| } |
| } |
| |
| |
| /* Update soundex code tree node with next code digit. */ |
| static void |
| add_next_code_digit(dm_node *node, int code_index, char code_digit) |
| { |
| /* OR in index 1 or 2. */ |
| node->next_code_index |= code_index; |
| |
| if (!node->next_code_digits[0]) |
| node->next_code_digits[0] = code_digit; |
| else if (node->next_code_digits[0] != code_digit) |
| node->next_code_digits[1] = code_digit; |
| } |
| |
| |
| /* Mark soundex code tree node as leaf. */ |
| static void |
| set_leaf(dm_node *first_node[2], dm_node *last_node[2], |
| dm_node *node, int ix_node) |
| { |
| if (!node->is_leaf) |
| { |
| node->is_leaf = 1; |
| |
| if (first_node[ix_node] == NULL) |
| first_node[ix_node] = node; |
| else |
| last_node[ix_node]->next[ix_node] = node; |
| |
| last_node[ix_node] = node; |
| node->next[ix_node] = NULL; |
| } |
| } |
| |
| |
| /* Find next node corresponding to code digit, or create a new node. */ |
| static dm_node * |
| find_or_create_child_node(dm_node *parent, char code_digit, |
| ArrayBuildState *soundex) |
| { |
| int i = code_digit - '0'; |
| dm_node **nodes = parent->children; |
| dm_node *node = nodes[i]; |
| |
| if (node) |
| { |
| /* Found existing child node. Skip completed nodes. */ |
| return node->soundex_length < DM_CODE_DIGITS ? node : NULL; |
| } |
| |
| /* Create new child node. */ |
| node = palloc_object(dm_node); |
| nodes[i] = node; |
| |
| *node = start_node; |
| memcpy(node->soundex, parent->soundex, sizeof(parent->soundex)); |
| node->soundex_length = parent->soundex_length; |
| node->soundex[node->soundex_length++] = code_digit; |
| node->code_digit = code_digit; |
| node->next_code_index = node->prev_code_index; |
| |
| if (node->soundex_length < DM_CODE_DIGITS) |
| { |
| return node; |
| } |
| else |
| { |
| /* Append completed soundex code to output array. */ |
| text *out = cstring_to_text_with_len(node->soundex, |
| DM_CODE_DIGITS); |
| |
| accumArrayResult(soundex, |
| PointerGetDatum(out), |
| false, |
| TEXTOID, |
| CurrentMemoryContext); |
| return NULL; |
| } |
| } |
| |
| |
| /* Update node for next code digit(s). */ |
| static void |
| update_node(dm_node *first_node[2], dm_node *last_node[2], |
| dm_node *node, int ix_node, |
| int letter_no, int prev_code_index, int next_code_index, |
| const char *next_code_digits, int digit_no, |
| ArrayBuildState *soundex) |
| { |
| int i; |
| char next_code_digit = next_code_digits[digit_no]; |
| int num_dirty_nodes = 0; |
| dm_node *dirty_nodes[2]; |
| |
| initialize_node(node, letter_no); |
| |
| if (node->prev_code_index && !(node->prev_code_index & prev_code_index)) |
| { |
| /* |
| * If the sound (vowel / consonant) of this letter encoding doesn't |
| * correspond to the coding index of the previous letter, we skip this |
| * letter encoding. Note that currently, only "J" can be either a |
| * vowel or a consonant. |
| */ |
| return; |
| } |
| |
| if (next_code_digit == 'X' || |
| (digit_no == 0 && |
| (node->prev_code_digits[0] == next_code_digit || |
| node->prev_code_digits[1] == next_code_digit))) |
| { |
| /* The code digit is the same as one of the previous (i.e. not added). */ |
| dirty_nodes[num_dirty_nodes++] = node; |
| } |
| |
| if (next_code_digit != 'X' && |
| (digit_no > 0 || |
| node->prev_code_digits[0] != next_code_digit || |
| node->prev_code_digits[1])) |
| { |
| /* The code digit is different from one of the previous (i.e. added). */ |
| node = find_or_create_child_node(node, next_code_digit, soundex); |
| if (node) |
| { |
| initialize_node(node, letter_no); |
| dirty_nodes[num_dirty_nodes++] = node; |
| } |
| } |
| |
| for (i = 0; i < num_dirty_nodes; i++) |
| { |
| /* Add code digit leading to the current node. */ |
| add_next_code_digit(dirty_nodes[i], next_code_index, next_code_digit); |
| |
| if (next_code_digits[++digit_no]) |
| { |
| update_node(first_node, last_node, dirty_nodes[i], ix_node, |
| letter_no, prev_code_index, next_code_index, |
| next_code_digits, digit_no, |
| soundex); |
| } |
| else |
| { |
| /* Add incomplete leaf node to linked list. */ |
| set_leaf(first_node, last_node, dirty_nodes[i], ix_node); |
| } |
| } |
| } |
| |
| |
| /* Update soundex tree leaf nodes. */ |
| static void |
| update_leaves(dm_node *first_node[2], int *ix_node, int letter_no, |
| const dm_codes *codes, const dm_codes *next_codes, |
| ArrayBuildState *soundex) |
| { |
| int i, |
| j, |
| code_index; |
| dm_node *node, |
| *last_node[2]; |
| const dm_code *code, |
| *next_code; |
| int ix_node_next = (*ix_node + 1) & 1; /* Alternating index: 0, 1 */ |
| |
| /* Initialize for new linked list of leaves. */ |
| first_node[ix_node_next] = NULL; |
| last_node[ix_node_next] = NULL; |
| |
| /* Process all nodes. */ |
| for (node = first_node[*ix_node]; node; node = node->next[*ix_node]) |
| { |
| /* One or two alternate code sequences. */ |
| for (i = 0; i < 2 && (code = codes[i]) && code[0][0]; i++) |
| { |
| /* Coding for previous letter - before vowel: 1, all other: 2 */ |
| int prev_code_index = (code[0][0] > '1') + 1; |
| |
| /* One or two alternate next code sequences. */ |
| for (j = 0; j < 2 && (next_code = next_codes[j]) && next_code[0][0]; j++) |
| { |
| /* Determine which code to use. */ |
| if (letter_no == 0) |
| { |
| /* This is the first letter. */ |
| code_index = 0; |
| } |
| else if (next_code[0][0] <= '1') |
| { |
| /* The next letter is a vowel. */ |
| code_index = 1; |
| } |
| else |
| { |
| /* All other cases. */ |
| code_index = 2; |
| } |
| |
| /* One or two sequential code digits. */ |
| update_node(first_node, last_node, node, ix_node_next, |
| letter_no, prev_code_index, code_index, |
| code[code_index], 0, |
| soundex); |
| } |
| } |
| } |
| |
| *ix_node = ix_node_next; |
| } |
| |
| |
| /* |
| * Return next character, converted from UTF-8 to uppercase ASCII. |
| * *ix is the current string index and is incremented by the character length. |
| */ |
| static char |
| read_char(const unsigned char *str, int *ix) |
| { |
| /* Substitute character for skipped code points. */ |
| const char na = '\x1a'; |
| pg_wchar c; |
| |
| /* Decode UTF-8 character to ISO 10646 code point. */ |
| str += *ix; |
| c = utf8_to_unicode(str); |
| |
| /* Advance *ix, but (for safety) not if we've reached end of string. */ |
| if (c) |
| *ix += pg_utf_mblen(str); |
| |
| /* Convert. */ |
| if (c >= (unsigned char) '[' && c <= (unsigned char) ']') |
| { |
| /* ASCII characters [, \, and ] are reserved for conversions below. */ |
| return na; |
| } |
| else if (c < 0x60) |
| { |
| /* Other non-lowercase ASCII characters can be used as-is. */ |
| return (char) c; |
| } |
| else if (c < 0x100) |
| { |
| /* ISO-8859-1 code point; convert to upper-case ASCII via table. */ |
| return iso8859_1_to_ascii_upper[c - 0x60]; |
| } |
| else |
| { |
| /* Conversion of non-ASCII characters in the coding chart. */ |
| switch (c) |
| { |
| case 0x0104: /* LATIN CAPITAL LETTER A WITH OGONEK */ |
| case 0x0105: /* LATIN SMALL LETTER A WITH OGONEK */ |
| return '['; |
| case 0x0118: /* LATIN CAPITAL LETTER E WITH OGONEK */ |
| case 0x0119: /* LATIN SMALL LETTER E WITH OGONEK */ |
| return '\\'; |
| case 0x0162: /* LATIN CAPITAL LETTER T WITH CEDILLA */ |
| case 0x0163: /* LATIN SMALL LETTER T WITH CEDILLA */ |
| case 0x021A: /* LATIN CAPITAL LETTER T WITH COMMA BELOW */ |
| case 0x021B: /* LATIN SMALL LETTER T WITH COMMA BELOW */ |
| return ']'; |
| default: |
| return na; |
| } |
| } |
| } |
| |
| |
| /* Read next ASCII character, skipping any characters not in [A-\]]. */ |
| static char |
| read_valid_char(const char *str, int *ix) |
| { |
| char c; |
| |
| while ((c = read_char((const unsigned char *) str, ix)) != '\0') |
| { |
| if (c >= 'A' && c <= ']') |
| break; |
| } |
| |
| return c; |
| } |
| |
| |
| /* Return sound coding for "letter" (letter sequence) */ |
| static const dm_codes * |
| read_letter(const char *str, int *ix) |
| { |
| char c, |
| cmp; |
| int i, |
| j; |
| const dm_letter *letters; |
| const dm_codes *codes; |
| |
| /* First letter in sequence. */ |
| if ((c = read_valid_char(str, ix)) == '\0') |
| return NULL; |
| |
| letters = &letter_[c - 'A']; |
| codes = letters->codes; |
| i = *ix; |
| |
| /* Any subsequent letters in sequence. */ |
| while ((letters = letters->letters) && (c = read_valid_char(str, &i))) |
| { |
| for (j = 0; (cmp = letters[j].letter); j++) |
| { |
| if (cmp == c) |
| { |
| /* Letter found. */ |
| letters = &letters[j]; |
| if (letters->codes) |
| { |
| /* Coding for letter sequence found. */ |
| codes = letters->codes; |
| *ix = i; |
| } |
| break; |
| } |
| } |
| if (!cmp) |
| { |
| /* The sequence of letters has no coding. */ |
| break; |
| } |
| } |
| |
| return codes; |
| } |
| |
| |
| /* |
| * Generate all Daitch-Mokotoff soundex codes for word, |
| * adding them to the "soundex" ArrayBuildState. |
| * Returns false if string has no encodable characters, else true. |
| */ |
| static bool |
| daitch_mokotoff_coding(const char *word, ArrayBuildState *soundex) |
| { |
| int i = 0; |
| int letter_no = 0; |
| int ix_node = 0; |
| const dm_codes *codes, |
| *next_codes; |
| dm_node *first_node[2], |
| *node; |
| |
| /* First letter. */ |
| if (!(codes = read_letter(word, &i))) |
| { |
| /* No encodable character in input. */ |
| return false; |
| } |
| |
| /* Starting point. */ |
| first_node[ix_node] = palloc_object(dm_node); |
| *first_node[ix_node] = start_node; |
| |
| /* |
| * Loop until either the word input is exhausted, or all generated soundex |
| * codes are completed to six digits. |
| */ |
| while (codes && first_node[ix_node]) |
| { |
| next_codes = read_letter(word, &i); |
| |
| /* Update leaf nodes. */ |
| update_leaves(first_node, &ix_node, letter_no, |
| codes, next_codes ? next_codes : end_codes, |
| soundex); |
| |
| codes = next_codes; |
| letter_no++; |
| } |
| |
| /* Append all remaining (incomplete) soundex codes to output array. */ |
| for (node = first_node[ix_node]; node; node = node->next[ix_node]) |
| { |
| text *out = cstring_to_text_with_len(node->soundex, |
| DM_CODE_DIGITS); |
| |
| accumArrayResult(soundex, |
| PointerGetDatum(out), |
| false, |
| TEXTOID, |
| CurrentMemoryContext); |
| } |
| |
| return true; |
| } |