test/Lucy/Test/Analysis/TestNormalizer.c - lucy - Git at Google

 /* Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 #include <stdlib.h>

 #define C_TESTLUCY_TESTNORMALIZER
 #define C_LUCY_NORMALIZER
 #define TESTLUCY_USE_SHORT_NAMES
 #include "Lucy/Util/ToolSet.h"

 #include "Clownfish/Boolean.h"
 #include "Clownfish/TestHarness/TestBatchRunner.h"
 #include "Clownfish/TestHarness/TestUtils.h"
 #include "Lucy/Test.h"
 #include "Lucy/Test/Analysis/TestNormalizer.h"
 #include "Lucy/Analysis/Normalizer.h"
 #include "Lucy/Store/FSFolder.h"
 #include "Lucy/Test/TestUtils.h"
 #include "Lucy/Util/Json.h"
 #include "utf8proc.h"

 /* Construct a TestNormalizer test batch by asking the TESTNORMALIZER class
  * to make a new object. */
 TestNormalizer*
 TestNormalizer_new() {
     TestNormalizer *batch = (TestNormalizer*)Class_Make_Obj(TESTNORMALIZER);
     return batch;
 }

 /* Exercise Normalizer_Equals() and the Dump/Load round trip.
  *
  * Builds four Normalizers: a baseline plus three that each differ from it
  * in exactly one constructor argument.  Emits three "not equal" results
  * followed by one round-trip result per Normalizer (seven results in
  * total) and releases every object it created.
  */
 static void
 test_Dump_Load_and_Equals(TestBatchRunner *runner) {
     String *nfc_form  = SSTR_WRAP_C("NFC");
     String *nfkc_form = SSTR_WRAP_C("NFKC");

     Normalizer *variants[4];
     variants[0] = Normalizer_new(nfkc_form, true,  false);  // baseline
     variants[1] = Normalizer_new(nfc_form,  true,  false);  // other form
     variants[2] = Normalizer_new(nfkc_form, false, false);  // case_fold off
     variants[3] = Normalizer_new(nfkc_form, true,  true);   // strip_accents on

     Normalizer *baseline = variants[0];
     bool        same;

     same = Normalizer_Equals(baseline, (Obj*)variants[1]);
     TEST_FALSE(runner, same,
                "Equals() false with different normalization form");

     same = Normalizer_Equals(baseline, (Obj*)variants[2]);
     TEST_FALSE(runner, same,
                "Equals() false with different case_fold flag");

     same = Normalizer_Equals(baseline, (Obj*)variants[3]);
     TEST_FALSE(runner, same,
                "Equals() false with different strip_accents flag");

     // Every variant must compare equal to the object rebuilt from its dump.
     for (int tick = 0; tick < 4; ++tick) {
         Normalizer *subject = variants[tick];
         Obj        *dumped  = (Obj*)Normalizer_Dump(subject);
         Normalizer *loaded  = (Normalizer*)Normalizer_Load(subject, dumped);
         bool round_trip_ok  = Normalizer_Equals(subject, (Obj*)loaded);

         TEST_TRUE(runner, round_trip_ok, "Dump => Load round trip");

         DECREF(subject);
         DECREF(dumped);
         DECREF(loaded);
     }
 }

 /* Run the data-driven normalization cases stored in
  * "unicode/utf8proc/tests.json" inside the modules folder.
  *
  * Each entry of the JSON array supplies the Normalizer settings
  * ("normalization_form", "case_fold", "strip_accents") together with a
  * "words" array and a "norms" array indexed in step with it; one result is
  * emitted per word.  When the modules folder cannot be located, 13 results
  * are reported as skipped instead.
  */
 static void
 test_normalization(TestBatchRunner *runner) {
     FSFolder *modules_folder = TestUtils_modules_folder();
     if (!modules_folder) {
         SKIP(runner, 13, "Can't locate test data");
         return;
     }

     String *json_path = Str_newf("unicode/utf8proc/tests.json");
     Vector *cases
         = (Vector*)Json_slurp_json((Folder*)modules_folder, json_path);
     if (cases == NULL) { RETHROW(Err_get_error()); }

     const size_t num_cases = Vec_Get_Size(cases);
     for (size_t case_tick = 0; case_tick < num_cases; case_tick++) {
         Hash *spec = (Hash*)Vec_Fetch(cases, case_tick);

         // Pull the Normalizer settings for this case out of the hash.
         String *form
             = (String*)Hash_Fetch_Utf8(spec, "normalization_form", 18);
         Boolean *fold_value
             = (Boolean*)Hash_Fetch_Utf8(spec, "case_fold", 9);
         bool case_fold = Bool_Get_Value(fold_value);
         Boolean *strip_value
             = (Boolean*)Hash_Fetch_Utf8(spec, "strip_accents", 13);
         bool strip_accents = Bool_Get_Value(strip_value);

         Normalizer *normalizer
             = Normalizer_new(form, case_fold, strip_accents);
         Vector *words = (Vector*)Hash_Fetch_Utf8(spec, "words", 5);
         Vector *norms = (Vector*)Hash_Fetch_Utf8(spec, "norms", 5);

         const size_t num_words = Vec_Get_Size(words);
         for (size_t word_tick = 0; word_tick < num_words; word_tick++) {
             String *word   = (String*)Vec_Fetch(words, word_tick);
             Vector *tokens = Normalizer_Split(normalizer, word);
             String *actual = (String*)Vec_Fetch(tokens, 0);

             // C copies of the form and the word, used only for the label.
             char *form_utf8 = Str_To_Utf8(form);
             char *word_utf8 = Str_To_Utf8(word);

             // The first element of the split must be a String equal to
             // the expected norm stored at the same index.
             bool matches = actual
                            && Str_is_a(actual, STRING)
                            && Str_Equals(actual, Vec_Fetch(norms, word_tick));
             TEST_TRUE(runner, matches,
                       "Normalize %s %d %d: %s", form_utf8,
                       case_fold, strip_accents, word_utf8);

             free(form_utf8);
             free(word_utf8);
             DECREF(tokens);
         }
         DECREF(normalizer);
     }

     DECREF(cases);
     DECREF(modules_folder);
     DECREF(json_path);
 }

 /* Check that running utf8proc_map() a second time over its own output
  * leaves the text unchanged, using random strings as input.
  *
  * The function currently reports one skipped result and returns
  * immediately, so everything after the early return is unreachable.
  */
 static void
 test_utf8proc_normalization(TestBatchRunner *runner) {
     SKIP(runner, 1,
          "utf8proc can't handle control chars or Unicode non-chars");
     return;

     // Same option set for both normalization passes.
     const int options = UTF8PROC_STABLE
                         | UTF8PROC_COMPOSE
                         | UTF8PROC_COMPAT
                         | UTF8PROC_CASEFOLD;

     for (int32_t round = 0; round < 100; round++) {
         String *source = TestUtils_random_string(rand() % 40);

         // First pass: normalize the random string.
         uint8_t *normalized;
         const uint8_t *source_bytes = (const uint8_t*)Str_Get_Ptr8(source);
         ssize_t        source_size  = (ssize_t)Str_Get_Size(source);
         int32_t check
             = utf8proc_map(source_bytes, source_size, &normalized, options);
         if (check < 0) {
             // Report the offending input as JSON, or a placeholder when
             // even the tolerant encoder cannot render it.
             lucy_Json_set_tolerant(1);
             String *json = lucy_Json_to_json((Obj*)source);
             if (json == NULL) {
                 json = Str_newf("[failed to encode]");
             }
             char *json_utf8 = Str_To_Utf8(json);
             FAIL(runner, "Failed to normalize: %s", json_utf8);
             free(json_utf8);
             DECREF(json);
             DECREF(source);
             return;
         }

         // Second pass: normalize the output of the first pass.
         size_t   normalized_len = strlen((char*)normalized);
         uint8_t *renormalized;
         int32_t  dupe_check
             = utf8proc_map(normalized, (ssize_t)normalized_len,
                            &renormalized, options);
         if (dupe_check < 0) {
             THROW(ERR, "Unexpected normalization error: %i32", dupe_check);
         }

         bool stable = strcmp((char*)normalized, (char*)renormalized) == 0;
         free(renormalized);
         free(normalized);
         DECREF(source);
         if (!stable) {
             FAIL(runner, "Not fully normalized");
             return;
         }
     }
     PASS(runner, "Normalization successful.");
 }

 /* Entry point for the batch: announce 21 planned results on `runner`, then
  * run the three test sections in order.  The 21 matches the counts visible
  * in the sections: 3 Equals() checks + 4 round trips + 13 (the skip count
  * used for the normalization cases) + 1 utf8proc result.
  */
 void
 TestNormalizer_Run_IMP(TestNormalizer *self, TestBatchRunner *runner) {
     void (*const sections[])(TestBatchRunner*) = {
         test_Dump_Load_and_Equals,
         test_normalization,
         test_utf8proc_normalization,
     };
     const size_t num_sections = sizeof(sections) / sizeof(sections[0]);

     TestBatchRunner_Plan(runner, (TestBatch*)self, 21);
     for (size_t tick = 0; tick < num_sections; tick++) {
         sections[tick](runner);
     }
 }
	/* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	#include <stdlib.h>

	#define C_TESTLUCY_TESTNORMALIZER
	#define C_LUCY_NORMALIZER
	#define TESTLUCY_USE_SHORT_NAMES
	#include "Lucy/Util/ToolSet.h"

	#include "Clownfish/Boolean.h"
	#include "Clownfish/TestHarness/TestBatchRunner.h"
	#include "Clownfish/TestHarness/TestUtils.h"
	#include "Lucy/Test.h"
	#include "Lucy/Test/Analysis/TestNormalizer.h"
	#include "Lucy/Analysis/Normalizer.h"
	#include "Lucy/Store/FSFolder.h"
	#include "Lucy/Test/TestUtils.h"
	#include "Lucy/Util/Json.h"
	#include "utf8proc.h"

	/* Create a new TestNormalizer batch object from the TESTNORMALIZER class. */
	TestNormalizer*
	TestNormalizer_new() {
	    TestNormalizer *self = (TestNormalizer*)Class_Make_Obj(TESTNORMALIZER);
	    return self;
	}

	/* Verify Normalizer_Equals() and the Dump/Load round trip.
	 *
	 * Four Normalizers are created: normalizer[0] is the baseline and each of
	 * the other three differs from it in one constructor argument
	 * (normalization form, case_fold, strip_accents).  Three results check
	 * that the baseline is not equal to the others; four more check that
	 * every Normalizer equals the object loaded back from its own dump.
	 * All objects created here are released before returning.
	 */
	static void
	test_Dump_Load_and_Equals(TestBatchRunner *runner) {
	    Normalizer *normalizer[4];

	    String *NFC  = SSTR_WRAP_C("NFC");
	    String *NFKC = SSTR_WRAP_C("NFKC");

	    normalizer[0] = Normalizer_new(NFKC, true,  false);
	    normalizer[1] = Normalizer_new(NFC,  true,  false);
	    normalizer[2] = Normalizer_new(NFKC, false, false);
	    normalizer[3] = Normalizer_new(NFKC, true,  true);

	    TEST_FALSE(runner,
	               Normalizer_Equals(normalizer[0], (Obj*)normalizer[1]),
	               "Equals() false with different normalization form");
	    TEST_FALSE(runner,
	               Normalizer_Equals(normalizer[0], (Obj*)normalizer[2]),
	               "Equals() false with different case_fold flag");
	    TEST_FALSE(runner,
	               Normalizer_Equals(normalizer[0], (Obj*)normalizer[3]),
	               "Equals() false with different strip_accents flag");

	    for (int i = 0; i < 4; ++i) {
	        // Dump and Load both traffic in object pointers, never in
	        // objects by value.
	        Obj *dump = (Obj*)Normalizer_Dump(normalizer[i]);
	        Normalizer *clone = (Normalizer*)Normalizer_Load(normalizer[i], dump);

	        TEST_TRUE(runner,
	                  Normalizer_Equals(normalizer[i], (Obj*)clone),
	                  "Dump => Load round trip");

	        DECREF(normalizer[i]);
	        DECREF(dump);
	        DECREF(clone);
	    }
	}

	/* Run the normalization cases read from "unicode/utf8proc/tests.json" in
	 * the modules folder.
	 *
	 * Every element of the JSON array is a hash holding the Normalizer
	 * settings ("normalization_form", "case_fold", "strip_accents") plus a
	 * "words" array and a "norms" array indexed in step with it.  For each
	 * word, the first element returned by Normalizer_Split() must be a String
	 * equal to the norm at the same index.  If the modules folder cannot be
	 * located, 13 results are reported as skipped.
	 */
	static void
	test_normalization(TestBatchRunner *runner) {
	    FSFolder *modules_folder = TestUtils_modules_folder();
	    if (modules_folder == NULL) {
	        SKIP(runner, 13, "Can't locate test data");
	        return;
	    }

	    String *path = Str_newf("unicode/utf8proc/tests.json");
	    Vector *tests = (Vector*)Json_slurp_json((Folder*)modules_folder, path);
	    if (!tests) { RETHROW(Err_get_error()); }

	    for (size_t i = 0, num_tests = Vec_Get_Size(tests); i < num_tests; i++) {
	        Hash *test = (Hash*)Vec_Fetch(tests, i);
	        String *form = (String*)Hash_Fetch_Utf8(
	                            test, "normalization_form", 18);
	        bool case_fold = Bool_Get_Value((Boolean*)Hash_Fetch_Utf8(
	                                              test, "case_fold", 9));
	        bool strip_accents = Bool_Get_Value((Boolean*)Hash_Fetch_Utf8(
	                                                  test, "strip_accents", 13));
	        Normalizer *normalizer = Normalizer_new(form, case_fold, strip_accents);
	        Vector *words = (Vector*)Hash_Fetch_Utf8(test, "words", 5);
	        Vector *norms = (Vector*)Hash_Fetch_Utf8(test, "norms", 5);
	        for (size_t j = 0, num_words = Vec_Get_Size(words); j < num_words; j++) {
	            String *word = (String*)Vec_Fetch(words, j);
	            Vector *got  = Normalizer_Split(normalizer, word);
	            String *norm = (String*)Vec_Fetch(got, 0);
	            // C copies of the form and the word, used only to label the
	            // result; freed right after the check.
	            char   *fstr = Str_To_Utf8(form);
	            char   *wstr = Str_To_Utf8(word);
	            TEST_TRUE(runner,
	                      norm
	                      && Str_is_a(norm, STRING)
	                      && Str_Equals(norm, Vec_Fetch(norms, j)),
	                      "Normalize %s %d %d: %s", fstr,
	                      case_fold, strip_accents, wstr
	                     );
	            free(fstr);
	            free(wstr);
	            DECREF(got);
	        }
	        DECREF(normalizer);
	    }

	    DECREF(tests);
	    DECREF(modules_folder);
	    DECREF(path);
	}

	/* Check that utf8proc_map() output is stable: normalizing an already
	 * normalized random string must yield an identical string.
	 *
	 * The function currently reports one skipped result and returns
	 * immediately, so the loop below the early return is unreachable.
	 */
	static void
	test_utf8proc_normalization(TestBatchRunner *runner) {
	    SKIP(runner, 1,
	         "utf8proc can't handle control chars or Unicode non-chars");
	    return;

	    for (int32_t i = 0; i < 100; i++) {
	        String *source = TestUtils_random_string(rand() % 40);

	        // Normalize once.
	        uint8_t *normalized;
	        int32_t check = utf8proc_map((const uint8_t*)Str_Get_Ptr8(source),
	                                     (ssize_t)Str_Get_Size(source),
	                                     &normalized,
	                                     UTF8PROC_STABLE  |
	                                     UTF8PROC_COMPOSE |
	                                     UTF8PROC_COMPAT  |
	                                     UTF8PROC_CASEFOLD);
	        if (check < 0) {
	            // Report the input that failed, encoded as JSON, or a
	            // placeholder when it cannot be encoded either.
	            lucy_Json_set_tolerant(1);
	            String *json = lucy_Json_to_json((Obj*)source);
	            if (!json) {
	                json = Str_newf("[failed to encode]");
	            }
	            char *str = Str_To_Utf8(json);
	            FAIL(runner, "Failed to normalize: %s", str);
	            free(str);
	            DECREF(json);
	            DECREF(source);
	            return;
	        }

	        // Normalize again.
	        size_t normalized_len = strlen((char*)normalized);
	        uint8_t *dupe;
	        int32_t dupe_check = utf8proc_map(normalized, (ssize_t)normalized_len, &dupe,
	                                          UTF8PROC_STABLE  |
	                                          UTF8PROC_COMPOSE |
	                                          UTF8PROC_COMPAT  |
	                                          UTF8PROC_CASEFOLD);
	        if (dupe_check < 0) {
	            THROW(ERR, "Unexpected normalization error: %i32", dupe_check);
	        }
	        // Compare the two buffers as C strings; the casts must be to
	        // char*, not char.
	        int comparison = strcmp((char*)normalized, (char*)dupe);
	        free(dupe);
	        free(normalized);
	        DECREF(source);
	        if (comparison != 0) {
	            FAIL(runner, "Not fully normalized");
	            return;
	        }
	    }
	    PASS(runner, "Normalization successful.");
	}

	/* Entry point for the batch: plan 21 results on `runner`, then run the
	 * Equals/Dump/Load checks, the data-driven normalization cases and the
	 * utf8proc stability check, in that order.
	 *
	 * Both arguments are object pointers: `self` is cast to TestBatch* for
	 * the plan call and `runner` is handed on to each test section.
	 */
	void
	TestNormalizer_Run_IMP(TestNormalizer *self, TestBatchRunner *runner) {
	    TestBatchRunner_Plan(runner, (TestBatch*)self, 21);
	    test_Dump_Load_and_Equals(runner);
	    test_normalization(runner);
	    test_utf8proc_normalization(runner);
	}