| /* Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| #define C_LUCY_NORMALIZER |
| #define C_LUCY_TOKEN |
| #include <ctype.h> |
| #include "Lucy/Util/ToolSet.h" |
| |
| #include "Lucy/Analysis/Normalizer.h" |
| #include "Clownfish/Boolean.h" |
| #include "Lucy/Analysis/Token.h" |
| #include "Lucy/Analysis/Inversion.h" |
| #include "Lucy/Util/Json.h" |
| |
| #include "utf8proc.h" |
| |
| #define INITIAL_BUFSIZE 63 |
| |
| Normalizer* |
| Normalizer_new(String *form, bool case_fold, bool strip_accents) { |
| Normalizer *self = (Normalizer*)Class_Make_Obj(NORMALIZER); |
| return Normalizer_init(self, form, case_fold, strip_accents); |
| } |
| |
| Normalizer* |
| Normalizer_init(Normalizer *self, String *form, bool case_fold, |
| bool strip_accents) { |
| int options = UTF8PROC_STABLE; |
| NormalizerIVARS *const ivars = Normalizer_IVARS(self); |
| |
| if (form == NULL |
| || Str_Equals_Utf8(form, "NFKC", 4) || Str_Equals_Utf8(form, "nfkc", 4) |
| ) { |
| options |= UTF8PROC_COMPOSE | UTF8PROC_COMPAT; |
| } |
| else if (Str_Equals_Utf8(form, "NFC", 3) || Str_Equals_Utf8(form, "nfc", 3)) { |
| options |= UTF8PROC_COMPOSE; |
| } |
| else if (Str_Equals_Utf8(form, "NFKD", 4) || Str_Equals_Utf8(form, "nfkd", 4)) { |
| options |= UTF8PROC_DECOMPOSE | UTF8PROC_COMPAT; |
| } |
| else if (Str_Equals_Utf8(form, "NFD", 3) || Str_Equals_Utf8(form, "nfd", 3)) { |
| options |= UTF8PROC_DECOMPOSE; |
| } |
| else { |
| THROW(ERR, "Invalid normalization form %o", form); |
| } |
| |
| if (case_fold) { options |= UTF8PROC_CASEFOLD; } |
| if (strip_accents) { options |= UTF8PROC_STRIPMARK; } |
| |
| ivars->options = options; |
| |
| return self; |
| } |
| |
| Inversion* |
| Normalizer_Transform_IMP(Normalizer *self, Inversion *inversion) { |
| // allocate additional space because utf8proc_reencode adds a |
| // terminating null char |
| int32_t static_buffer[INITIAL_BUFSIZE + 1]; |
| int32_t *buffer = static_buffer; |
| ssize_t bufsize = INITIAL_BUFSIZE; |
| Token *token; |
| NormalizerIVARS *const ivars = Normalizer_IVARS(self); |
| |
| while (NULL != (token = Inversion_Next(inversion))) { |
| TokenIVARS *const token_ivars = Token_IVARS(token); |
| ssize_t len |
| = utf8proc_decompose((uint8_t*)token_ivars->text, |
| (ssize_t)token_ivars->len, buffer, bufsize, |
| ivars->options); |
| |
| if (len > bufsize) { |
| // buffer too small, (re)allocate |
| if (buffer != static_buffer) { |
| FREEMEM(buffer); |
| } |
| // allocate additional INITIAL_BUFSIZE items |
| bufsize = len + INITIAL_BUFSIZE; |
| if ((size_t)bufsize >= SIZE_MAX / sizeof(int32_t) - sizeof(int32_t)) { |
| THROW(ERR, "Requested bufsize too large: %u64", |
| (uint64_t)bufsize); |
| } |
| buffer = (int32_t*)MALLOCATE(((size_t)bufsize + 1) * sizeof(int32_t)); |
| len = utf8proc_decompose((uint8_t*)token_ivars->text, |
| (ssize_t)token_ivars->len, buffer, bufsize, |
| ivars->options); |
| } |
| if (len < 0) { |
| continue; |
| } |
| |
| len = utf8proc_reencode(buffer, len, ivars->options); |
| |
| if (len >= 0) { |
| if (len > (ssize_t)token_ivars->len) { |
| if (len >= INT32_MAX - 1) { |
| THROW(ERR, "Normalized result over 2 GB: %u64", |
| (uint64_t)len); |
| } |
| FREEMEM(token_ivars->text); |
| token_ivars->text = (char*)MALLOCATE((size_t)len + 1); |
| } |
| memcpy(token_ivars->text, buffer, len + 1); |
| token_ivars->len = (size_t)len; |
| } |
| } |
| |
| if (buffer != static_buffer) { |
| FREEMEM(buffer); |
| } |
| |
| Inversion_Reset(inversion); |
| return (Inversion*)INCREF(inversion); |
| } |
| |
| Hash* |
| Normalizer_Dump_IMP(Normalizer *self) { |
| Normalizer_Dump_t super_dump |
| = SUPER_METHOD_PTR(NORMALIZER, LUCY_Normalizer_Dump); |
| Hash *dump = super_dump(self); |
| int options = Normalizer_IVARS(self)->options; |
| |
| String *form = options & UTF8PROC_COMPOSE ? |
| options & UTF8PROC_COMPAT ? |
| Str_new_from_trusted_utf8("NFKC", 4) : |
| Str_new_from_trusted_utf8("NFC", 3) : |
| options & UTF8PROC_COMPAT ? |
| Str_new_from_trusted_utf8("NFKD", 4) : |
| Str_new_from_trusted_utf8("NFD", 3); |
| |
| Hash_Store_Utf8(dump, "normalization_form", 18, (Obj*)form); |
| |
| Boolean *case_fold = Bool_singleton(!!(options & UTF8PROC_CASEFOLD)); |
| Hash_Store_Utf8(dump, "case_fold", 9, (Obj*)case_fold); |
| |
| Boolean *strip_accents = Bool_singleton(!!(options & UTF8PROC_STRIPMARK)); |
| Hash_Store_Utf8(dump, "strip_accents", 13, (Obj*)strip_accents); |
| |
| return dump; |
| } |
| |
| Normalizer* |
| Normalizer_Load_IMP(Normalizer *self, Obj *dump) { |
| Normalizer_Load_t super_load |
| = SUPER_METHOD_PTR(NORMALIZER, LUCY_Normalizer_Load); |
| Normalizer *loaded = super_load(self, dump); |
| Hash *source = (Hash*)CERTIFY(dump, HASH); |
| |
| Obj *obj = Hash_Fetch_Utf8(source, "normalization_form", 18); |
| String *form = (String*)CERTIFY(obj, STRING); |
| obj = Hash_Fetch_Utf8(source, "case_fold", 9); |
| bool case_fold = Json_obj_to_bool(CERTIFY(obj, OBJ)); |
| obj = Hash_Fetch_Utf8(source, "strip_accents", 13); |
| bool strip_accents = Json_obj_to_bool(CERTIFY(obj, OBJ)); |
| |
| return Normalizer_init(loaded, form, case_fold, strip_accents); |
| } |
| |
| bool |
| Normalizer_Equals_IMP(Normalizer *self, Obj *other) { |
| if ((Normalizer*)other == self) { return true; } |
| if (!Obj_is_a(other, NORMALIZER)) { return false; } |
| NormalizerIVARS *const ivars = Normalizer_IVARS(self); |
| NormalizerIVARS *const ovars = Normalizer_IVARS((Normalizer*)other); |
| if (ovars->options != ivars->options) { return false; } |
| return true; |
| } |
| |
| |