lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/dict/TokenInfoDictionaryTest.java - lucene-solr - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.lucene.analysis.ja.dict;

 import java.io.OutputStream;
 import java.io.OutputStreamWriter;
 import java.io.PrintWriter;
 import java.nio.charset.StandardCharsets;
 import java.nio.file.Files;
 import java.nio.file.Path;

 import org.apache.lucene.analysis.ja.util.DictionaryBuilder;
 import org.apache.lucene.analysis.ja.util.DictionaryBuilder.DictionaryFormat;
 import org.apache.lucene.analysis.ja.util.ToStringUtil;
 import org.apache.lucene.util.IntsRef;
 import org.apache.lucene.util.IntsRefBuilder;
 import org.apache.lucene.util.LuceneTestCase;
 import org.apache.lucene.util.UnicodeUtil;
 import org.apache.lucene.util.fst.FST;
 import org.apache.lucene.util.fst.IntsRefFSTEnum;

 import static org.apache.lucene.analysis.ja.dict.BinaryDictionary.ResourceScheme;

 /**
  * Tests of TokenInfoDictionary build tools; run using ant test-tools.
  *
  * <p>{@link #testPut()} and {@link #testPutException()} build tiny dictionaries from CSV entries
  * in a temporary directory, while {@link #testEnumerateAll()} walks the dictionary returned by
  * {@link TokenInfoDictionary#getInstance()}.
  */
 public class TokenInfoDictionaryTest extends LuceneTestCase {

   /**
    * Builds a two-entry dictionary and verifies that the left id, right id and word cost of both
    * entries can be read back, including an entry that uses a "large" id (5000).
    */
   public void testPut() throws Exception {
     TokenInfoDictionary dict = newDictionary("名詞,1,1,2,名詞,一般,*,*,*,*,*,*,*",
                                                // "large" id
                                                "一般,5000,5000,3,名詞,一般,*,*,*,*,*,*,*");
     IntsRef wordIdRef = new IntsRefBuilder().get();

     // Source id 0 is expected to resolve to the second CSV entry (ids 5000, cost 3).
     // NOTE(review): presumably because entries are ordered by surface form, not by CSV order.
     dict.lookupWordIds(0, wordIdRef);
     int wordId = wordIdRef.ints[wordIdRef.offset];
     assertEquals(5000, dict.getLeftId(wordId));
     assertEquals(5000, dict.getRightId(wordId));
     assertEquals(3, dict.getWordCost(wordId));

     // Source id 1 is expected to resolve to the first CSV entry (ids 1, cost 2).
     dict.lookupWordIds(1, wordIdRef);
     wordId = wordIdRef.ints[wordIdRef.offset];
     assertEquals(1, dict.getLeftId(wordId));
     assertEquals(1, dict.getRightId(wordId));
     assertEquals(2, dict.getWordCost(wordId));
   }

   /**
    * Builds a dictionary from the given CSV entries inside a fresh temporary directory and loads
    * it back from disk.
    *
    * <p>The directory receives {@code test.csv} (one entry per line, UTF-8), empty
    * {@code unk.def} and {@code char.def} files, and a {@code matrix.def} holding the single line
    * {@code "1 1"}. The same directory is passed to {@link DictionaryBuilder} as both input and
    * output location.
    *
    * @param entries dictionary lines written verbatim to {@code test.csv}
    * @return the dictionary loaded from the freshly built files
    * @throws Exception if writing the input files, building, or loading the dictionary fails
    */
   private TokenInfoDictionary newDictionary(String... entries) throws Exception {
     Path dir = createTempDir();
     try (OutputStream out = Files.newOutputStream(dir.resolve("test.csv"));
          PrintWriter printer = new PrintWriter(new OutputStreamWriter(out, StandardCharsets.UTF_8))) {
       for (String entry : entries) {
         printer.println(entry);
       }
     }
     Files.createFile(dir.resolve("unk.def"));
     Files.createFile(dir.resolve("char.def"));
     try (OutputStream out = Files.newOutputStream(dir.resolve("matrix.def"));
          PrintWriter printer = new PrintWriter(new OutputStreamWriter(out, StandardCharsets.UTF_8))) {
       printer.println("1 1");
     }
     DictionaryBuilder.build(DictionaryFormat.IPADIC, dir, dir, "utf-8", true);
     String dictionaryPath = TokenInfoDictionary.class.getName().replace('.', '/');
     // We must also load the other files (in BinaryDictionary) from the correct path
     return new TokenInfoDictionary(ResourceScheme.FILE, dir.resolve(dictionaryPath).toString());
   }

   /**
    * Verifies that malformed dictionary entries are rejected with an
    * {@link IllegalArgumentException} while the dictionary is being built.
    */
   public void testPutException() {
     // too few columns
     expectThrows(IllegalArgumentException.class, () -> newDictionary("KANJI,1,1,1,名詞,一般,*,*,*,*,*"));
     // left id != right id
     expectThrows(IllegalArgumentException.class, () -> newDictionary("KANJI,1285,1,1,名詞,一般,*,*,*,*,*,*,*"));
     // id too large
     expectThrows(IllegalArgumentException.class, () -> newDictionary("KANJI,8192,8192,1,名詞,一般,*,*,*,*,*,*,*"));
   }

   /** enumerates the entire FST/lookup data and just does basic sanity checks */
   public void testEnumerateAll() throws Exception {
     // the two counters are only reported when VERBOSE is set (just for debugging)
     int numTerms = 0;
     int numWords = 0;
     int lastWordId = -1;
     int lastSourceId = -1;
     TokenInfoDictionary tid = TokenInfoDictionary.getInstance();
     ConnectionCosts matrix = ConnectionCosts.getInstance();
     FST<Long> fst = tid.getFST().getInternalFST();
     IntsRefFSTEnum<Long> fstEnum = new IntsRefFSTEnum<>(fst);
     IntsRefFSTEnum.InputOutput<Long> mapping;
     IntsRef scratch = new IntsRef();
     while ((mapping = fstEnum.next()) != null) {
       numTerms++;
       // each FST input label is narrowed to one char to rebuild the term text
       IntsRef input = mapping.input;
       char[] chars = new char[input.length];
       for (int i = 0; i < chars.length; i++) {
         chars[i] = (char)input.ints[input.offset+i];
       }
       assertTrue("term is not a valid UTF-16 string", UnicodeUtil.validUTF16String(new String(chars)));

       Long output = mapping.output;
       int sourceId = output.intValue();
       // we walk in order, terms, sourceIds, and wordIds should always be increasing
       assertTrue("sourceIds must be strictly increasing", sourceId > lastSourceId);
       lastSourceId = sourceId;
       tid.lookupWordIds(sourceId, scratch);
       for (int i = 0; i < scratch.length; i++) {
         numWords++;
         int wordId = scratch.ints[scratch.offset+i];
         assertTrue("wordIds must be strictly increasing", wordId > lastWordId);
         lastWordId = wordId;

         String baseForm = tid.getBaseForm(wordId, chars, 0, chars.length);
         assertTrue("base form is not a valid UTF-16 string",
                    baseForm == null || UnicodeUtil.validUTF16String(baseForm));

         String inflectionForm = tid.getInflectionForm(wordId);
         assertTrue("inflection form is not a valid UTF-16 string",
                    inflectionForm == null || UnicodeUtil.validUTF16String(inflectionForm));
         if (inflectionForm != null) {
           // check that it's actually an ipadic inflection form
           assertNotNull("unknown inflection form",
                         ToStringUtil.getInflectedFormTranslation(inflectionForm));
         }

         String inflectionType = tid.getInflectionType(wordId);
         assertTrue("inflection type is not a valid UTF-16 string",
                    inflectionType == null || UnicodeUtil.validUTF16String(inflectionType));
         if (inflectionType != null) {
           // check that it's actually an ipadic inflection type
           assertNotNull("unknown inflection type",
                         ToStringUtil.getInflectionTypeTranslation(inflectionType));
         }

         int leftId = tid.getLeftId(wordId);
         int rightId = tid.getRightId(wordId);

         // Return values of the next two calls are deliberately ignored.
         // NOTE(review): presumably they only verify that the lookups do not throw.
         matrix.get(rightId, leftId);

         tid.getWordCost(wordId);

         String pos = tid.getPartOfSpeech(wordId);
         assertNotNull("part of speech must not be null", pos);
         assertTrue("part of speech is not a valid UTF-16 string", UnicodeUtil.validUTF16String(pos));
         // check that it's actually an ipadic pos tag
         assertNotNull("unknown part of speech tag", ToStringUtil.getPOSTranslation(pos));

         String pronunciation = tid.getPronunciation(wordId, chars, 0, chars.length);
         assertNotNull("pronunciation must not be null", pronunciation);
         assertTrue("pronunciation is not a valid UTF-16 string",
                    UnicodeUtil.validUTF16String(pronunciation));

         String reading = tid.getReading(wordId, chars, 0, chars.length);
         assertNotNull("reading must not be null", reading);
         assertTrue("reading is not a valid UTF-16 string", UnicodeUtil.validUTF16String(reading));
       }
     }
     if (VERBOSE) {
       System.out.println("checked " + numTerms + " terms, " + numWords + " words.");
     }
   }

 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.lucene.analysis.ja.dict;

	import java.io.OutputStream;
	import java.io.OutputStreamWriter;
	import java.io.PrintWriter;
	import java.nio.charset.StandardCharsets;
	import java.nio.file.Files;
	import java.nio.file.Path;

	import org.apache.lucene.analysis.ja.util.DictionaryBuilder;
	import org.apache.lucene.analysis.ja.util.DictionaryBuilder.DictionaryFormat;
	import org.apache.lucene.analysis.ja.util.ToStringUtil;
	import org.apache.lucene.util.IntsRef;
	import org.apache.lucene.util.IntsRefBuilder;
	import org.apache.lucene.util.LuceneTestCase;
	import org.apache.lucene.util.UnicodeUtil;
	import org.apache.lucene.util.fst.FST;
	import org.apache.lucene.util.fst.IntsRefFSTEnum;

	import static org.apache.lucene.analysis.ja.dict.BinaryDictionary.ResourceScheme;

	/**
	* Tests of TokenInfoDictionary build tools; run using ant test-tools
	*/
	public class TokenInfoDictionaryTest extends LuceneTestCase {

	public void testPut() throws Exception {
	TokenInfoDictionary dict = newDictionary("名詞,1,1,2,名詞,一般,,,,,,,*",
	// "large" id
	"一般,5000,5000,3,名詞,一般,,,,,,,*");
	IntsRef wordIdRef = new IntsRefBuilder().get();

	dict.lookupWordIds(0, wordIdRef);
	int wordId = wordIdRef.ints[wordIdRef.offset];
	assertEquals(5000, dict.getLeftId(wordId));
	assertEquals(5000, dict.getRightId(wordId));
	assertEquals(3, dict.getWordCost(wordId));

	dict.lookupWordIds(1, wordIdRef);
	wordId = wordIdRef.ints[wordIdRef.offset];
	assertEquals(1, dict.getLeftId(wordId));
	assertEquals(1, dict.getRightId(wordId));
	assertEquals(2, dict.getWordCost(wordId));
	}

	private TokenInfoDictionary newDictionary(String... entries) throws Exception {
	Path dir = createTempDir();
	try (OutputStream out = Files.newOutputStream(dir.resolve("test.csv"));
	PrintWriter printer = new PrintWriter(new OutputStreamWriter(out, StandardCharsets.UTF_8))) {
	for (String entry : entries) {
	printer.println(entry);
	}
	}
	Files.createFile(dir.resolve("unk.def"));
	Files.createFile(dir.resolve("char.def"));
	try (OutputStream out = Files.newOutputStream(dir.resolve("matrix.def"));
	PrintWriter printer = new PrintWriter(new OutputStreamWriter(out, StandardCharsets.UTF_8))) {
	printer.println("1 1");
	}
	DictionaryBuilder.build(DictionaryFormat.IPADIC, dir, dir, "utf-8", true);
	String dictionaryPath = TokenInfoDictionary.class.getName().replace('.', '/');
	// We must also load the other files (in BinaryDictionary) from the correct path
	return new TokenInfoDictionary(ResourceScheme.FILE, dir.resolve(dictionaryPath).toString());
	}

	public void testPutException() {
	// too few columns
	expectThrows(IllegalArgumentException.class, () -> newDictionary("KANJI,1,1,1,名詞,一般,,,,,*"));
	// left id != right id
	expectThrows(IllegalArgumentException.class, () -> newDictionary("KANJI,1285,1,1,名詞,一般,,,,,,,*"));
	// left id != right id
	expectThrows(IllegalArgumentException.class, () -> newDictionary("KANJI,1285,1,1,名詞,一般,,,,,,,*"));
	// id too large
	expectThrows(IllegalArgumentException.class, () -> newDictionary("KANJI,8192,8192,1,名詞,一般,,,,,,,*"));
	}

	/** enumerates the entire FST/lookup data and just does basic sanity checks */
	public void testEnumerateAll() throws Exception {
	// just for debugging
	int numTerms = 0;
	int numWords = 0;
	int lastWordId = -1;
	int lastSourceId = -1;
	TokenInfoDictionary tid = TokenInfoDictionary.getInstance();
	ConnectionCosts matrix = ConnectionCosts.getInstance();
	FST<Long> fst = tid.getFST().getInternalFST();
	IntsRefFSTEnum<Long> fstEnum = new IntsRefFSTEnum<>(fst);
	IntsRefFSTEnum.InputOutput<Long> mapping;
	IntsRef scratch = new IntsRef();
	while ((mapping = fstEnum.next()) != null) {
	numTerms++;
	IntsRef input = mapping.input;
	char[] chars = new char[input.length];
	for (int i = 0; i < chars.length; i++) {
	chars[i] = (char)input.ints[input.offset+i];
	}
	assertTrue(UnicodeUtil.validUTF16String(new String(chars)));

	Long output = mapping.output;
	int sourceId = output.intValue();
	// we walk in order, terms, sourceIds, and wordIds should always be increasing
	assertTrue(sourceId > lastSourceId);
	lastSourceId = sourceId;
	tid.lookupWordIds(sourceId, scratch);
	for (int i = 0; i < scratch.length; i++) {
	numWords++;
	int wordId = scratch.ints[scratch.offset+i];
	assertTrue(wordId > lastWordId);
	lastWordId = wordId;

	String baseForm = tid.getBaseForm(wordId, chars, 0, chars.length);
	assertTrue(baseForm == null \|\| UnicodeUtil.validUTF16String(baseForm));

	String inflectionForm = tid.getInflectionForm(wordId);
	assertTrue(inflectionForm == null \|\| UnicodeUtil.validUTF16String(inflectionForm));
	if (inflectionForm != null) {
	// check that it's actually an ipadic inflection form
	assertNotNull(ToStringUtil.getInflectedFormTranslation(inflectionForm));
	}

	String inflectionType = tid.getInflectionType(wordId);
	assertTrue(inflectionType == null \|\| UnicodeUtil.validUTF16String(inflectionType));
	if (inflectionType != null) {
	// check that it's actually an ipadic inflection type
	assertNotNull(ToStringUtil.getInflectionTypeTranslation(inflectionType));
	}

	int leftId = tid.getLeftId(wordId);
	int rightId = tid.getRightId(wordId);

	matrix.get(rightId, leftId);

	tid.getWordCost(wordId);

	String pos = tid.getPartOfSpeech(wordId);
	assertNotNull(pos);
	assertTrue(UnicodeUtil.validUTF16String(pos));
	// check that it's actually an ipadic pos tag
	assertNotNull(ToStringUtil.getPOSTranslation(pos));

	String pronunciation = tid.getPronunciation(wordId, chars, 0, chars.length);
	assertNotNull(pronunciation);
	assertTrue(UnicodeUtil.validUTF16String(pronunciation));

	String reading = tid.getReading(wordId, chars, 0, chars.length);
	assertNotNull(reading);
	assertTrue(UnicodeUtil.validUTF16String(reading));
	}
	}
	if (VERBOSE) {
	System.out.println("checked " + numTerms + " terms, " + numWords + " words.");
	}
	}

	}