blob: d5691417579faef5c4c67d20deb6f51f2f095443 [file] [log] [blame]
/* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#define C_LUCY_NORMALIZER
#define C_LUCY_TOKEN
#include <ctype.h>
#include "Lucy/Util/ToolSet.h"
#include "Lucy/Analysis/Normalizer.h"
#include "Clownfish/Boolean.h"
#include "Lucy/Analysis/Token.h"
#include "Lucy/Analysis/Inversion.h"
#include "Lucy/Util/Json.h"
#include "utf8proc.h"
#define INITIAL_BUFSIZE 63
/* Create a fresh Normalizer object and hand it straight to
 * Normalizer_init() together with the caller's arguments.
 */
Normalizer*
Normalizer_new(String *form, bool case_fold, bool strip_accents) {
    return Normalizer_init((Normalizer*)Class_Make_Obj(NORMALIZER),
                           form, case_fold, strip_accents);
}
/* Translate the constructor arguments into a utf8proc option mask and store
 * it in the object's `options` ivar.
 *
 * `form` selects the normalization form.  NULL is treated like "NFKC";
 * otherwise it must be one of "NFKC", "NFC", "NFKD" or "NFD", spelled either
 * all-uppercase or all-lowercase.  Any other value throws ERR.
 * `case_fold` adds UTF8PROC_CASEFOLD and `strip_accents` adds
 * UTF8PROC_STRIPMARK.
 *
 * Returns `self`.
 */
Normalizer*
Normalizer_init(Normalizer *self, String *form, bool case_fold,
                bool strip_accents) {
    // Flags contributed by the normalization form alone.
    int form_flags = 0;

    if (form == NULL) {
        // No form given: same flags as NFKC.
        form_flags = UTF8PROC_COMPOSE | UTF8PROC_COMPAT;
    }
    else if (Str_Equals_Utf8(form, "NFKC", 4)
             || Str_Equals_Utf8(form, "nfkc", 4)) {
        form_flags = UTF8PROC_COMPOSE | UTF8PROC_COMPAT;
    }
    else if (Str_Equals_Utf8(form, "NFC", 3)
             || Str_Equals_Utf8(form, "nfc", 3)) {
        form_flags = UTF8PROC_COMPOSE;
    }
    else if (Str_Equals_Utf8(form, "NFKD", 4)
             || Str_Equals_Utf8(form, "nfkd", 4)) {
        form_flags = UTF8PROC_DECOMPOSE | UTF8PROC_COMPAT;
    }
    else if (Str_Equals_Utf8(form, "NFD", 3)
             || Str_Equals_Utf8(form, "nfd", 3)) {
        form_flags = UTF8PROC_DECOMPOSE;
    }
    else {
        THROW(ERR, "Invalid normalization form %o", form);
    }

    const int fold_flag  = case_fold     ? UTF8PROC_CASEFOLD  : 0;
    const int strip_flag = strip_accents ? UTF8PROC_STRIPMARK : 0;

    // UTF8PROC_STABLE is always part of the mask.
    Normalizer_IVARS(self)->options
        = UTF8PROC_STABLE | form_flags | fold_flag | strip_flag;

    return self;
}
/* Normalize the text of every token in `inversion` in place, using the
 * utf8proc options stored on `self`.
 *
 * For each token, the text is decomposed into a scratch buffer of 32-bit
 * code points, re-encoded in that same buffer by utf8proc_reencode(), and
 * copied back into the token.  The token's text allocation is replaced only
 * when the result is longer than the current text.  A token for which
 * utf8proc returns a negative length is left unchanged.
 *
 * The scratch buffer starts on the stack and is swapped for a heap
 * allocation when a token needs more room; a heap buffer is kept for later
 * tokens and released before returning or throwing.
 *
 * Throws ERR if the scratch buffer or the normalized text would be too
 * large.  Returns `inversion` after Inversion_Reset(), passed through
 * INCREF.
 */
Inversion*
Normalizer_Transform_IMP(Normalizer *self, Inversion *inversion) {
    // allocate additional space because utf8proc_reencode adds a
    // terminating null char
    int32_t  static_buffer[INITIAL_BUFSIZE + 1];
    int32_t *buffer  = static_buffer;
    ssize_t  bufsize = INITIAL_BUFSIZE;
    Token   *token;
    NormalizerIVARS *const ivars = Normalizer_IVARS(self);

    while (NULL != (token = Inversion_Next(inversion))) {
        TokenIVARS *const token_ivars = Token_IVARS(token);
        ssize_t len
            = utf8proc_decompose((uint8_t*)token_ivars->text,
                                 (ssize_t)token_ivars->len, buffer, bufsize,
                                 ivars->options);

        if (len > bufsize) {
            // buffer too small, (re)allocate
            if (buffer != static_buffer) {
                FREEMEM(buffer);
            }

            // Validate `len` before adding the slack so that the signed
            // addition below cannot overflow.  The threshold is the same
            // as testing (len + INITIAL_BUFSIZE) against the limit.
            if ((size_t)len >= SIZE_MAX / sizeof(int32_t) - sizeof(int32_t)
                               - INITIAL_BUFSIZE) {
                THROW(ERR, "Requested bufsize too large: %u64",
                      (uint64_t)len + INITIAL_BUFSIZE);
            }

            // allocate additional INITIAL_BUFSIZE items
            bufsize = len + INITIAL_BUFSIZE;
            buffer = (int32_t*)MALLOCATE(((size_t)bufsize + 1)
                                         * sizeof(int32_t));
            len = utf8proc_decompose((uint8_t*)token_ivars->text,
                                     (ssize_t)token_ivars->len, buffer,
                                     bufsize, ivars->options);
        }
        if (len < 0) {
            // utf8proc reported an error; leave this token as it is.
            continue;
        }

        // Re-encode in place: `len` becomes the byte length of the result.
        len = utf8proc_reencode(buffer, len, ivars->options);

        if (len >= 0) {
            if (len > (ssize_t)token_ivars->len) {
                if (len >= INT32_MAX - 1) {
                    // THROW unwinds out of this function, so release the
                    // heap scratch buffer first to avoid leaking it.
                    if (buffer != static_buffer) {
                        FREEMEM(buffer);
                    }
                    THROW(ERR, "Normalized result over 2 GB: %u64",
                          (uint64_t)len);
                }
                // Result does not fit in the token's current text buffer.
                FREEMEM(token_ivars->text);
                token_ivars->text = (char*)MALLOCATE((size_t)len + 1);
            }
            // Copy the result including the terminating null char.
            memcpy(token_ivars->text, buffer, (size_t)len + 1);
            token_ivars->len = (size_t)len;
        }
    }

    if (buffer != static_buffer) {
        FREEMEM(buffer);
    }

    Inversion_Reset(inversion);
    return (Inversion*)INCREF(inversion);
}
/* Serialize this Normalizer into a Hash.
 *
 * Starts from the superclass dump and adds three entries derived from the
 * stored utf8proc option mask:
 *   "normalization_form" - "NFKC", "NFC", "NFKD" or "NFD"
 *   "case_fold"          - Boolean, set when UTF8PROC_CASEFOLD is present
 *   "strip_accents"      - Boolean, set when UTF8PROC_STRIPMARK is present
 */
Hash*
Normalizer_Dump_IMP(Normalizer *self) {
    Normalizer_Dump_t super_dump
        = SUPER_METHOD_PTR(NORMALIZER, LUCY_Normalizer_Dump);
    Hash *dump = super_dump(self);
    const int options = Normalizer_IVARS(self)->options;

    // Pick the form name from the COMPOSE and COMPAT bits.
    const char *form_name;
    size_t      form_size;
    if (options & UTF8PROC_COMPOSE) {
        if (options & UTF8PROC_COMPAT) {
            form_name = "NFKC";
            form_size = 4;
        }
        else {
            form_name = "NFC";
            form_size = 3;
        }
    }
    else {
        if (options & UTF8PROC_COMPAT) {
            form_name = "NFKD";
            form_size = 4;
        }
        else {
            form_name = "NFD";
            form_size = 3;
        }
    }
    String *form = Str_new_from_trusted_utf8(form_name, form_size);
    Hash_Store_Utf8(dump, "normalization_form", 18, (Obj*)form);

    Boolean *case_fold
        = Bool_singleton((options & UTF8PROC_CASEFOLD) != 0);
    Hash_Store_Utf8(dump, "case_fold", 9, (Obj*)case_fold);

    Boolean *strip_accents
        = Bool_singleton((options & UTF8PROC_STRIPMARK) != 0);
    Hash_Store_Utf8(dump, "strip_accents", 13, (Obj*)strip_accents);

    return dump;
}
/* Rebuild a Normalizer from a dump.
 *
 * The superclass Load produces the object; `dump` is then treated as a Hash
 * from which "normalization_form", "case_fold" and "strip_accents" are read
 * (each value is passed through CERTIFY before use) and fed to
 * Normalizer_init().
 */
Normalizer*
Normalizer_Load_IMP(Normalizer *self, Obj *dump) {
    Normalizer_Load_t super_load
        = SUPER_METHOD_PTR(NORMALIZER, LUCY_Normalizer_Load);
    Normalizer *loaded = super_load(self, dump);
    Hash *source = (Hash*)CERTIFY(dump, HASH);

    Obj *form_obj = Hash_Fetch_Utf8(source, "normalization_form", 18);
    String *form = (String*)CERTIFY(form_obj, STRING);

    Obj *fold_obj = Hash_Fetch_Utf8(source, "case_fold", 9);
    bool case_fold = Json_obj_to_bool(CERTIFY(fold_obj, OBJ));

    Obj *strip_obj = Hash_Fetch_Utf8(source, "strip_accents", 13);
    bool strip_accents = Json_obj_to_bool(CERTIFY(strip_obj, OBJ));

    return Normalizer_init(loaded, form, case_fold, strip_accents);
}
/* Two Normalizers are equal when `other` is the same object, or when it is
 * a NORMALIZER whose stored option mask matches this one's.
 */
bool
Normalizer_Equals_IMP(Normalizer *self, Obj *other) {
    if ((Normalizer*)other != self) {
        if (!Obj_is_a(other, NORMALIZER)) {
            return false;
        }
        const int mine   = Normalizer_IVARS(self)->options;
        const int theirs = Normalizer_IVARS((Normalizer*)other)->options;
        return theirs == mine;
    }
    return true;
}