/* test/Lucy/Test/Analysis/TestStandardTokenizer.c - lucy - Git at Google */

 /* Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 #define C_TESTLUCY_TESTSTANDARDTOKENIZER
 #define TESTLUCY_USE_SHORT_NAMES
 #include "Lucy/Util/ToolSet.h"

 #include "Clownfish/TestHarness/TestBatchRunner.h"
 #include "Lucy/Test.h"
 #include "Lucy/Test/Analysis/TestStandardTokenizer.h"
 #include "Lucy/Analysis/StandardTokenizer.h"
 #include "Lucy/Store/FSFolder.h"
 #include "Lucy/Test/TestUtils.h"
 #include "Lucy/Util/Json.h"

 /* Constructor.  Builds a new TestStandardTokenizer by asking the
  * TESTSTANDARDTOKENIZER class to make an object, and returns it.
  */
 TestStandardTokenizer*
 TestStandardTokenizer_new() {
     TestStandardTokenizer *self
         = (TestStandardTokenizer*)Class_Make_Obj(TESTSTANDARDTOKENIZER);
     return self;
 }

 /* Round-trip check: Dump() a fresh StandardTokenizer, Load() the dump back,
  * and require that the result Equals() the tokenizer we started with.
  * Emits exactly one test result on `runner`.
  */
 static void
 test_Dump_Load_and_Equals(TestBatchRunner *runner) {
     StandardTokenizer *original = StandardTokenizer_new();
     Obj               *dumped   = StandardTokenizer_Dump(original);
     StandardTokenizer *restored
         = (StandardTokenizer*)StandardTokenizer_Load(original, dumped);

     TEST_TRUE(runner, StandardTokenizer_Equals(original, (Obj*)restored),
               "Dump => Load round trip");

     /* Drop our references in reverse order of acquisition. */
     DECREF(restored);
     DECREF(dumped);
     DECREF(original);
 }

 static void
 test_tokenizer(TestBatchRunner *runner) {
     StandardTokenizer *tokenizer = StandardTokenizer_new();

     String *word = SSTR_WRAP_C(
                               " ."
                               "tha\xCC\x82t's"
                               ":"
                               "1,02\xC2\xADZ4.38"
                               "\xE0\xB8\x81\xC2\xAD\xC2\xAD"
                               "\xF0\xA0\x80\x80"
                               "a"
                               "/");
     Vector *got = StandardTokenizer_Split(tokenizer, word);
     String *token = (String*)Vec_Fetch(got, 0);
     char   *token_str = Str_To_Utf8(token);
     TEST_TRUE(runner,
               token
               && Str_is_a(token, STRING)
               && Str_Equals_Utf8(token, "tha\xcc\x82t's", 8),
               "Token: %s", token_str);
     free(token_str);
     token = (String*)Vec_Fetch(got, 1);
     token_str = Str_To_Utf8(token);
     TEST_TRUE(runner,
               token
               && Str_is_a(token, STRING)
               && Str_Equals_Utf8(token, "1,02\xC2\xADZ4.38", 11),
               "Token: %s", token_str);
     free(token_str);
     token = (String*)Vec_Fetch(got, 2);
     token_str = Str_To_Utf8(token);
     TEST_TRUE(runner,
               token
               && Str_is_a(token, STRING)
               && Str_Equals_Utf8(token, "\xE0\xB8\x81\xC2\xAD\xC2\xAD", 7),
               "Token: %s", token_str);
     free(token_str);
     token = (String*)Vec_Fetch(got, 3);
     token_str = Str_To_Utf8(token);
     TEST_TRUE(runner,
               token
               && Str_is_a(token, STRING)
               && Str_Equals_Utf8(token, "\xF0\xA0\x80\x80", 4),
               "Token: %s", token_str);
     free(token_str);
     token = (String*)Vec_Fetch(got, 4);
     token_str = Str_To_Utf8(token);
     TEST_TRUE(runner,
               token
               && Str_is_a(token, STRING)
               && Str_Equals_Utf8(token, "a", 1),
               "Token: %s", token_str);
     free(token_str);
     DECREF(got);

     FSFolder *modules_folder = TestUtils_modules_folder();
     if (modules_folder == NULL) {
         SKIP(runner, 1372, "Can't locate test data");
     }
     else {
         String *path = Str_newf("unicode/ucd/WordBreakTest.json");
         Vector *tests = (Vector*)Json_slurp_json((Folder*)modules_folder, path);
         if (!tests) { RETHROW(Err_get_error()); }

         for (size_t i = 0, max = Vec_Get_Size(tests); i < max; i++) {
             Hash *test = (Hash*)Vec_Fetch(tests, i);
             String *text = (String*)Hash_Fetch_Utf8(test, "text", 4);
             Vector *wanted = (Vector*)Hash_Fetch_Utf8(test, "words", 5);
             Vector *got = StandardTokenizer_Split(tokenizer, text);
             TEST_TRUE(runner, Vec_Equals(wanted, (Obj*)got), "UCD test #%d",
                       (int)i + 1);
             DECREF(got);
         }

         DECREF(tests);
         DECREF(modules_folder);
         DECREF(path);
     }

     DECREF(tokenizer);
 }

 /* Entry point for this test batch: declares the plan on `runner`, then runs
  * the sub-tests in order.
  */
 void
 TestStandardTokenizer_Run_IMP(TestStandardTokenizer *self, TestBatchRunner *runner) {
     /* 1 Dump/Load round-trip test + 5 hand-written token checks + 1372
      * word-break cases (the number test_tokenizer skips when the data file
      * cannot be located). */
     TestBatchRunner_Plan(runner, (TestBatch*)self, 1 + 5 + 1372);
     test_Dump_Load_and_Equals(runner);
     test_tokenizer(runner);
 }
	/* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	#define C_TESTLUCY_TESTSTANDARDTOKENIZER
	#define TESTLUCY_USE_SHORT_NAMES
	#include "Lucy/Util/ToolSet.h"

	#include "Clownfish/TestHarness/TestBatchRunner.h"
	#include "Lucy/Test.h"
	#include "Lucy/Test/Analysis/TestStandardTokenizer.h"
	#include "Lucy/Analysis/StandardTokenizer.h"
	#include "Lucy/Store/FSFolder.h"
	#include "Lucy/Test/TestUtils.h"
	#include "Lucy/Util/Json.h"

	/* Constructor.  Asks the TESTSTANDARDTOKENIZER class to make an object and
	 * returns it typed as a TestStandardTokenizer.
	 */
	TestStandardTokenizer*
	TestStandardTokenizer_new() {
	    TestStandardTokenizer *self
	        = (TestStandardTokenizer*)Class_Make_Obj(TESTSTANDARDTOKENIZER);
	    return self;
	}

	/* Serialization round trip: the object produced by Load() from the output
	 * of Dump() must be Equals() to the original StandardTokenizer.
	 * Emits exactly one test result on `runner`.
	 */
	static void
	test_Dump_Load_and_Equals(TestBatchRunner *runner) {
	    StandardTokenizer *original = StandardTokenizer_new();
	    Obj               *dumped   = StandardTokenizer_Dump(original);
	    StandardTokenizer *restored
	        = (StandardTokenizer*)StandardTokenizer_Load(original, dumped);

	    TEST_TRUE(runner, StandardTokenizer_Equals(original, (Obj*)restored),
	              "Dump => Load round trip");

	    /* Drop our references in reverse order of acquisition. */
	    DECREF(restored);
	    DECREF(dumped);
	    DECREF(original);
	}

	/* Exercise StandardTokenizer_Split in two stages:
	 *
	 *   1. Split a hand-written string and compare the first five tokens
	 *      against a table of expected UTF-8 byte sequences (five test
	 *      results).
	 *   2. Load "unicode/ucd/WordBreakTest.json" from the modules folder and,
	 *      for every entry, require that splitting its "text" yields a Vector
	 *      equal to its "words" (one test result per entry).  If the modules
	 *      folder cannot be located, 1372 tests are skipped instead.
	 *
	 * All Clownfish objects are handled through pointers.
	 */
	static void
	test_tokenizer(TestBatchRunner *runner) {
	    StandardTokenizer *tokenizer = StandardTokenizer_new();

	    /* The pieces are kept as separate literals on purpose: merging them
	     * could let a hex escape swallow the following character. */
	    String *word = SSTR_WRAP_C(
	                       " ."
	                       "tha\xCC\x82t's"
	                       ":"
	                       "1,02\xC2\xADZ4.38"
	                       "\xE0\xB8\x81\xC2\xAD\xC2\xAD"
	                       "\xF0\xA0\x80\x80"
	                       "a"
	                       "/");

	    /* Tokens expected from `word`, in order, with their UTF-8 byte sizes. */
	    static const struct {
	        const char *utf8;
	        size_t      size;
	    } expected[] = {
	        { "tha\xcc\x82t's",               8  },
	        { "1,02\xC2\xADZ4.38",            11 },
	        { "\xE0\xB8\x81\xC2\xAD\xC2\xAD", 7  },
	        { "\xF0\xA0\x80\x80",             4  },
	        { "a",                            1  },
	    };
	    const size_t num_expected = sizeof(expected) / sizeof(expected[0]);

	    Vector *got = StandardTokenizer_Split(tokenizer, word);
	    for (size_t tick = 0; tick < num_expected; tick++) {
	        String *token = (String*)Vec_Fetch(got, tick);
	        /* Only stringify a token that exists; a missing one must fail the
	         * test below instead of crashing inside Str_To_Utf8. */
	        char *token_str = token ? Str_To_Utf8(token) : NULL;
	        TEST_TRUE(runner,
	                  token
	                  && Str_is_a(token, STRING)
	                  && Str_Equals_Utf8(token, expected[tick].utf8,
	                                     expected[tick].size),
	                  "Token: %s", token_str ? token_str : "[missing]");
	        free(token_str);
	    }
	    DECREF(got);

	    FSFolder *modules_folder = TestUtils_modules_folder();
	    if (modules_folder == NULL) {
	        /* 1372 is presumably the number of entries in WordBreakTest.json;
	         * it must stay in sync with the plan in Run_IMP. */
	        SKIP(runner, 1372, "Can't locate test data");
	    }
	    else {
	        String *path = Str_newf("unicode/ucd/WordBreakTest.json");
	        Vector *tests = (Vector*)Json_slurp_json((Folder*)modules_folder, path);
	        if (!tests) { RETHROW(Err_get_error()); }

	        for (size_t i = 0, max = Vec_Get_Size(tests); i < max; i++) {
	            Hash   *test   = (Hash*)Vec_Fetch(tests, i);
	            String *text   = (String*)Hash_Fetch_Utf8(test, "text", 4);
	            Vector *wanted = (Vector*)Hash_Fetch_Utf8(test, "words", 5);
	            /* Distinct name: avoids shadowing the outer `got`. */
	            Vector *words  = StandardTokenizer_Split(tokenizer, text);
	            TEST_TRUE(runner, Vec_Equals(wanted, (Obj*)words), "UCD test #%d",
	                      (int)i + 1);
	            DECREF(words);
	        }

	        DECREF(tests);
	        DECREF(modules_folder);
	        DECREF(path);
	    }

	    DECREF(tokenizer);
	}

	/* Entry point for this test batch: declares the plan on `runner`, then runs
	 * the sub-tests in order.
	 *
	 * Both parameters are pointers: `self` is cast to TestBatch* for the plan,
	 * and `runner` is handed on to sub-tests that take a TestBatchRunner*.
	 */
	void
	TestStandardTokenizer_Run_IMP(TestStandardTokenizer *self, TestBatchRunner *runner) {
	    /* 1378 = 1 Dump/Load round-trip test + 5 hand-written token checks
	     * + 1372 word-break cases (the number test_tokenizer skips when the
	     * data file cannot be located). */
	    TestBatchRunner_Plan(runner, (TestBatch*)self, 1378);
	    test_Dump_Load_and_Equals(runner);
	    test_tokenizer(runner);
	}