blob: 6bb0fb71ada45d327fe5af80f13e8412cb89d24c [file] [log] [blame]
/* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <stdlib.h>
#include <string.h>
#define C_TESTLUCY_TESTNORMALIZER
#define C_LUCY_NORMALIZER
#define TESTLUCY_USE_SHORT_NAMES
#include "Lucy/Util/ToolSet.h"
#include "Clownfish/Boolean.h"
#include "Clownfish/TestHarness/TestBatchRunner.h"
#include "Clownfish/TestHarness/TestUtils.h"
#include "Lucy/Test.h"
#include "Lucy/Test/Analysis/TestNormalizer.h"
#include "Lucy/Analysis/Normalizer.h"
#include "Lucy/Store/FSFolder.h"
#include "Lucy/Test/TestUtils.h"
#include "Lucy/Util/Json.h"
#include "utf8proc.h"
/* Create a new TestNormalizer test batch object.
 *
 * Returns the object produced by Class_Make_Obj() for the TESTNORMALIZER
 * class, cast to TestNormalizer*.
 *
 * The parameter list is spelled `(void)` so that the definition is a real
 * prototype; an empty `()` leaves the parameters unspecified before C23.
 */
TestNormalizer*
TestNormalizer_new(void) {
    return (TestNormalizer*)Class_Make_Obj(TESTNORMALIZER);
}
/* Exercise Normalizer's Equals(), Dump() and Load().
 *
 * Builds a baseline Normalizer (NFKC, case_fold on, strip_accents off) and
 * three variants that each differ from the baseline in exactly one
 * constructor argument.  Equals() must report every variant as different
 * from the baseline, and every Normalizer must compare equal to the object
 * obtained by dumping it and loading the dump again.
 *
 * Emits 7 test results: 3 inequality checks plus 4 round trips.
 */
static void
test_Dump_Load_and_Equals(TestBatchRunner *runner) {
    String *nfc_form  = SSTR_WRAP_C("NFC");
    String *nfkc_form = SSTR_WRAP_C("NFKC");

    Normalizer *baseline     = Normalizer_new(nfkc_form, true, false);
    Normalizer *other_form   = Normalizer_new(nfc_form, true, false);
    Normalizer *no_case_fold = Normalizer_new(nfkc_form, false, false);
    Normalizer *stripping    = Normalizer_new(nfkc_form, true, true);

    Normalizer *subjects[] = {
        baseline, other_form, no_case_fold, stripping
    };
    const size_t num_subjects = sizeof(subjects) / sizeof(subjects[0]);

    TEST_FALSE(runner,
               Normalizer_Equals(baseline, (Obj*)other_form),
               "Equals() false with different normalization form");
    TEST_FALSE(runner,
               Normalizer_Equals(baseline, (Obj*)no_case_fold),
               "Equals() false with different case_fold flag");
    TEST_FALSE(runner,
               Normalizer_Equals(baseline, (Obj*)stripping),
               "Equals() false with different strip_accents flag");

    for (size_t tick = 0; tick < num_subjects; tick++) {
        Normalizer *original = subjects[tick];
        Obj        *dumped   = (Obj*)Normalizer_Dump(original);
        Normalizer *reloaded = (Normalizer*)Normalizer_Load(original, dumped);

        TEST_TRUE(runner,
                  Normalizer_Equals(original, (Obj*)reloaded),
                  "Dump => Load round trip");

        // Each Normalizer is released as soon as its round trip is checked.
        DECREF(original);
        DECREF(dumped);
        DECREF(reloaded);
    }
}
/* Run the data-driven normalization checks.
 *
 * Reads "unicode/utf8proc/tests.json" from the folder returned by
 * TestUtils_modules_folder().  Each entry supplies a normalization form,
 * the case_fold and strip_accents flags, a list of input "words" and the
 * matching list of expected "norms".  For every word, the first element of
 * the Vector returned by Normalizer_Split() must be a String equal to the
 * expected norm at the same index.
 *
 * If the modules folder cannot be located, 13 tests are reported as
 * skipped.  If the JSON file cannot be read, the stored error is rethrown.
 */
static void
test_normalization(TestBatchRunner *runner) {
    FSFolder *modules_folder = TestUtils_modules_folder();
    if (!modules_folder) {
        SKIP(runner, 13, "Can't locate test data");
        return;
    }

    String *json_path = Str_newf("unicode/utf8proc/tests.json");
    Vector *cases
        = (Vector*)Json_slurp_json((Folder*)modules_folder, json_path);
    if (cases == NULL) {
        RETHROW(Err_get_error());
    }

    const size_t num_cases = Vec_Get_Size(cases);
    for (size_t case_tick = 0; case_tick < num_cases; case_tick++) {
        Hash *spec = (Hash*)Vec_Fetch(cases, case_tick);

        // Constructor arguments for this test case.
        String *form
            = (String*)Hash_Fetch_Utf8(spec, "normalization_form", 18);
        Boolean *fold_flag
            = (Boolean*)Hash_Fetch_Utf8(spec, "case_fold", 9);
        bool case_fold = Bool_Get_Value(fold_flag);
        Boolean *strip_flag
            = (Boolean*)Hash_Fetch_Utf8(spec, "strip_accents", 13);
        bool strip_accents = Bool_Get_Value(strip_flag);
        Normalizer *normalizer
            = Normalizer_new(form, case_fold, strip_accents);

        // Parallel lists: words[k] should normalize to norms[k].
        Vector *words = (Vector*)Hash_Fetch_Utf8(spec, "words", 5);
        Vector *norms = (Vector*)Hash_Fetch_Utf8(spec, "norms", 5);

        const size_t num_words = Vec_Get_Size(words);
        for (size_t word_tick = 0; word_tick < num_words; word_tick++) {
            String *word   = (String*)Vec_Fetch(words, word_tick);
            Vector *pieces = Normalizer_Split(normalizer, word);
            String *norm   = (String*)Vec_Fetch(pieces, 0);

            // C strings for the test label; released right after use.
            char *form_utf8 = Str_To_Utf8(form);
            char *word_utf8 = Str_To_Utf8(word);

            // The expected norm is only fetched once `norm` is known to be
            // a String.
            bool matches = norm
                           && Str_is_a(norm, STRING)
                           && Str_Equals(norm, Vec_Fetch(norms, word_tick));
            TEST_TRUE(runner, matches,
                      "Normalize %s %d %d: %s", form_utf8,
                      case_fold, strip_accents, word_utf8);

            free(form_utf8);
            free(word_utf8);
            DECREF(pieces);
        }

        DECREF(normalizer);
    }

    DECREF(cases);
    DECREF(modules_folder);
    DECREF(json_path);
}
/* Check that utf8proc normalization is stable: normalizing text that has
 * already been normalized must reproduce it byte for byte.
 *
 * The check is currently disabled.  The function reports one skipped test
 * and returns immediately, so everything after the `return` is unreachable.
 * The dormant body is kept (presumably so the check can be re-enabled once
 * the limitation named in the skip message no longer applies) and works as
 * follows for 100 random strings:
 *
 *   1. Normalize the string with utf8proc_map().  On failure, report a
 *      failed test containing a JSON rendering of the input and stop.
 *   2. Normalize the result a second time.  A failure here is thrown as an
 *      error.
 *   3. Fail the test if the two results differ in length or content.
 *
 * Buffer lengths come from utf8proc_map()'s return value and the buffers
 * are compared with memcmp(), so a NUL byte inside the normalized text
 * cannot truncate the comparison.
 */
static void
test_utf8proc_normalization(TestBatchRunner *runner) {
    SKIP(runner, 1,
         "utf8proc can't handle control chars or Unicode non-chars");
    return;

    // Same option set for both passes.
    const int options = UTF8PROC_STABLE
                        | UTF8PROC_COMPOSE
                        | UTF8PROC_COMPAT
                        | UTF8PROC_CASEFOLD;

    for (int32_t i = 0; i < 100; i++) {
        String *source = TestUtils_random_string(rand() % 40);

        // Normalize once.
        uint8_t *normalized = NULL;
        ssize_t normalized_len
            = utf8proc_map((const uint8_t*)Str_Get_Ptr8(source),
                           (ssize_t)Str_Get_Size(source),
                           &normalized, options);
        if (normalized_len < 0) {
            // Tolerant mode is switched on before encoding the input for
            // the failure message.
            lucy_Json_set_tolerant(1);
            String *json = lucy_Json_to_json((Obj*)source);
            if (!json) {
                json = Str_newf("[failed to encode]");
            }
            char *str = Str_To_Utf8(json);
            FAIL(runner, "Failed to normalize: %s", str);
            free(str);
            DECREF(json);
            DECREF(source);
            return;
        }

        // Normalize again, feeding back exactly the bytes produced above.
        uint8_t *dupe = NULL;
        ssize_t dupe_len
            = utf8proc_map(normalized, normalized_len, &dupe, options);
        if (dupe_len < 0) {
            // Release what we hold before throwing.
            free(normalized);
            DECREF(source);
            THROW(ERR, "Unexpected normalization error: %i32",
                  (int32_t)dupe_len);
        }

        bool stable
            = dupe_len == normalized_len
              && memcmp(normalized, dupe, (size_t)normalized_len) == 0;
        free(dupe);
        free(normalized);
        DECREF(source);
        if (!stable) {
            FAIL(runner, "Not fully normalized");
            return;
        }
    }
    PASS(runner, "Normalization successful.");
}
/* Entry point for the TestNormalizer batch: declare the plan, then run
 * each group of tests in order.
 */
void
TestNormalizer_Run_IMP(TestNormalizer *self, TestBatchRunner *runner) {
    // Planned total of 21:
    //    7 from test_Dump_Load_and_Equals (3 inequality + 4 round trips),
    //   13 for test_normalization (the amount it skips without test data),
    //    1 for test_utf8proc_normalization (currently always skipped).
    TestBatchRunner_Plan(runner, (TestBatch*)self, 7 + 13 + 1);

    test_Dump_Load_and_Equals(runner);
    test_normalization(runner);
    test_utf8proc_normalization(runner);
}