lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/TokenInfoDictionaryBuilder.java - lucene-solr - Git at Google

 package org.apache.lucene.analysis.ja.util;

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 import java.io.BufferedReader;
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.FilenameFilter;
 import java.io.IOException;
 import java.io.InputStreamReader;
 import java.nio.charset.Charset;
 import java.nio.charset.CharsetDecoder;
 import java.nio.charset.CodingErrorAction;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collections;
 import java.util.Comparator;
 import java.util.List;

 import org.apache.lucene.analysis.ja.util.DictionaryBuilder.DictionaryFormat;
 import org.apache.lucene.util.IntsRef;
 import org.apache.lucene.util.fst.Builder;
 import org.apache.lucene.util.fst.FST;
 import org.apache.lucene.util.fst.PositiveIntOutputs;
 import org.apache.lucene.util.packed.PackedInts;

 import com.ibm.icu.text.Normalizer2;

 /**
  * Builds a {@link TokenInfoDictionaryWriter} and its term FST from the CSV source files of a
  * dictionary distribution.
  * <p>
  * Entries are decoded with the configured character encoding, optionally duplicated in
  * NFKC-normalized form, sorted by surface form and then written to the dictionary. Each run of
  * equal surface forms is added once to the FST with an incrementing ordinal.
  */
 public class TokenInfoDictionaryBuilder {

   /**
    * Internal word id - incrementally assigned as entries are read and added. This will be byte offset of dictionary file.
    * NOTE(review): this is instance state that {@link #buildDictionary(List)} never resets, so a
    * builder instance looks intended for a single build - confirm before reusing one.
    */
   private int offset = 0;

   /** Name of the charset used to decode the CSV files. */
   private final String encoding;

   /** Whether an NFKC-normalized copy is added for entries whose surface form is not normalized. */
   private final boolean normalizeEntries;

   /** NFKC normalizer; {@code null} when {@code normalizeEntries} is {@code false}. */
   private final Normalizer2 normalizer;

   /** Column layout of the CSV input. */
   private final DictionaryFormat format;

   /**
    * Creates a builder.
    *
    * @param format column layout of the CSV files; {@code DictionaryFormat.IPADIC} entries are used
    *        as is, entries of any other format are converted by {@link #formatEntry(String[])}
    * @param encoding name of the charset used to decode the CSV files
    * @param normalizeEntries if {@code true}, an NFKC-normalized copy is added for every entry whose
    *        surface form is not already normalized
    */
   public TokenInfoDictionaryBuilder(DictionaryFormat format, String encoding, boolean normalizeEntries) {
     this.format = format;
     this.encoding = encoding;
     this.normalizeEntries = normalizeEntries;
     this.normalizer = normalizeEntries ? Normalizer2.getInstance(null, "nfkc", Normalizer2.Mode.COMPOSE) : null;
   }

   /**
    * Builds the dictionary from every {@code *.csv} file found directly in the given directory.
    * The files are processed in sorted order.
    *
    * @param dirname path of the directory holding the CSV files
    * @return the populated dictionary writer
    * @throws IOException if the directory cannot be listed or a file cannot be read or decoded
    */
   public TokenInfoDictionaryWriter build(String dirname) throws IOException {
     FilenameFilter filter = new FilenameFilter() {
       @Override
       public boolean accept(File dir, String name) {
         return name.endsWith(".csv");
       }
     };
     File[] found = new File(dirname).listFiles(filter);
     if (found == null) {
       // listFiles returns null when the path is not a directory or an I/O error occurs
       throw new IOException("Cannot list dictionary directory: " + dirname);
     }
     List<File> csvFiles = new ArrayList<File>(Arrays.asList(found));
     Collections.sort(csvFiles);
     return buildDictionary(csvFiles);
   }

   /**
    * Builds the dictionary from the given CSV files.
    * <p>
    * Lines with fewer than 13 columns are reported on standard output and skipped. An entry for
    * which {@code dictionary.put} returns the current offset is reported as failed and skipped.
    *
    * @param csvFiles CSV files to read, in the order they should be processed
    * @return the populated dictionary writer, with its FST set
    * @throws IOException if a file cannot be read or decoded
    */
   public TokenInfoDictionaryWriter buildDictionary(List<File> csvFiles) throws IOException {
     TokenInfoDictionaryWriter dictionary = new TokenInfoDictionaryWriter(10 * 1024 * 1024);

     // all lines in the file
     System.out.println("  parse...");
     List<String[]> lines = new ArrayList<String[]>(400000);
     for (File file : csvFiles) {
       readEntries(file, lines);
     }

     System.out.println("  sort...");

     // sort by term: we sorted the files already and use a stable sort.
     Collections.sort(lines, new Comparator<String[]>() {
       @Override
       public int compare(String[] left, String[] right) {
         return left[0].compareTo(right[0]);
       }
     });

     System.out.println("  encode...");

     PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton();
     Builder<Long> fstBuilder = new Builder<Long>(FST.INPUT_TYPE.BYTE2, 0, 0, true, true, Integer.MAX_VALUE, fstOutput, null, true, PackedInts.DEFAULT, true, 15);
     IntsRef scratch = new IntsRef();
     long ord = -1; // first ord will be 0
     String lastValue = null;

     // build tokeninfo dictionary
     for (String[] entry : lines) {
       int next = dictionary.put(entry);

       // an unchanged offset means nothing was written for this entry
       if (next == offset) {
         System.out.println("Failed to process line: " + Arrays.toString(entry));
         continue;
       }

       String token = entry[0];
       if (!token.equals(lastValue)) {
         // new word to add to fst
         ord++;
         lastValue = token;
         scratch.grow(token.length());
         scratch.length = token.length();
         // FST input is one int per UTF-16 code unit of the surface form
         for (int i = 0; i < token.length(); i++) {
           scratch.ints[i] = (int) token.charAt(i);
         }
         fstBuilder.add(scratch, ord);
       }
       // associate the surface form's ordinal with the offset held before this put
       dictionary.addMapping((int) ord, offset);
       offset = next;
     }

     final FST<Long> fst = fstBuilder.finish();

     System.out.print("  " + fst.getNodeCount() + " nodes, " + fst.getArcCount() + " arcs, " + fst.sizeInBytes() + " bytes...  ");
     dictionary.setFST(fst);
     System.out.println(" done");

     return dictionary;
   }

   /**
    * Parses one CSV file and appends its formatted entries (plus NFKC-normalized copies, when
    * enabled) to {@code lines}. The file is always closed before returning.
    */
   private void readEntries(File file, List<String[]> lines) throws IOException {
     // Create the decoder before opening the file so a bad charset name cannot leak an open stream.
     // Malformed or unmappable input is reported as an error rather than silently replaced.
     CharsetDecoder decoder = Charset.forName(encoding).newDecoder()
         .onMalformedInput(CodingErrorAction.REPORT)
         .onUnmappableCharacter(CodingErrorAction.REPORT);
     BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(file), decoder));
     try {
       String line;
       while ((line = reader.readLine()) != null) {
         String[] entry = CSVUtil.parse(line);

         if (entry.length < 13) {
           System.out.println("Entry in CSV is not valid: " + line);
           continue;
         }

         lines.add(formatEntry(entry));

         // NFKC normalize dictionary entry: keep the original and add a normalized copy
         if (normalizeEntries && !normalizer.isNormalized(entry[0])) {
           String[] normalizedEntry = new String[entry.length];
           for (int i = 0; i < entry.length; i++) {
             normalizedEntry[i] = normalizer.normalize(entry[i]);
           }
           lines.add(formatEntry(normalizedEntry));
         }
       }
     } finally {
       reader.close();
     }
   }

   /*
    * IPADIC features
    *
    * 0   - surface
    * 1   - left cost
    * 2   - right cost
    * 3   - word cost
    * 4-9 - pos
    * 10  - base form
    * 11  - reading
    * 12  - pronunciation
    *
    * UniDic features
    *
    * 0   - surface
    * 1   - left cost
    * 2   - right cost
    * 3   - word cost
    * 4-9 - pos
    * 10  - base form reading
    * 11  - base form
    * 12  - surface form
    * 13  - surface reading
    */

   /**
    * Converts a parsed CSV entry to the 13-column IPADIC layout.
    *
    * @param features the parsed columns of one entry
    * @return {@code features} itself for the IPADIC format; otherwise a new 13-element array with
    *         columns 0-9 copied, the base form taken from column 11, and the surface reading
    *         (column 13) used for both reading and pronunciation
    */
   public String[] formatEntry(String[] features) {
     if (this.format == DictionaryFormat.IPADIC) {
       return features;
     }

     String[] features2 = new String[13];
     // surface, costs and part-of-speech columns sit at the same positions in both layouts
     System.arraycopy(features, 0, features2, 0, 10);
     features2[10] = features[11];

     // If the surface reading is non-existent, use surface form for reading and pronunciation.
     // This happens with punctuation in UniDic and there are possibly other cases as well.
     // A missing column 13 is treated the same way as an empty one instead of failing with an
     // ArrayIndexOutOfBoundsException, since callers only guarantee 13 columns.
     String surfaceReading = features.length > 13 ? features[13] : "";
     String reading = surfaceReading.length() == 0 ? features[0] : surfaceReading;
     features2[11] = reading;
     features2[12] = reading;
     return features2;
   }
 }
	package org.apache.lucene.analysis.ja.util;

	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	import java.io.BufferedReader;
	import java.io.File;
	import java.io.FileInputStream;
	import java.io.FilenameFilter;
	import java.io.IOException;
	import java.io.InputStreamReader;
	import java.nio.charset.Charset;
	import java.nio.charset.CharsetDecoder;
	import java.nio.charset.CodingErrorAction;
	import java.util.ArrayList;
	import java.util.Arrays;
	import java.util.Collections;
	import java.util.Comparator;
	import java.util.List;

	import org.apache.lucene.analysis.ja.util.DictionaryBuilder.DictionaryFormat;
	import org.apache.lucene.util.IntsRef;
	import org.apache.lucene.util.fst.Builder;
	import org.apache.lucene.util.fst.FST;
	import org.apache.lucene.util.fst.PositiveIntOutputs;
	import org.apache.lucene.util.packed.PackedInts;

	import com.ibm.icu.text.Normalizer2;

	/**
	 * Builds a {@link TokenInfoDictionaryWriter} and its term FST from the CSV source files of a
	 * dictionary distribution.
	 * <p>
	 * Entries are decoded with the configured character encoding, optionally duplicated in
	 * NFKC-normalized form, sorted by surface form and then written to the dictionary. Each run of
	 * equal surface forms is added once to the FST with an incrementing ordinal.
	 */
	public class TokenInfoDictionaryBuilder {

	  /**
	   * Internal word id - incrementally assigned as entries are read and added. This will be byte offset of dictionary file.
	   * NOTE(review): this is instance state that {@link #buildDictionary(List)} never resets, so a
	   * builder instance looks intended for a single build - confirm before reusing one.
	   */
	  private int offset = 0;

	  /** Name of the charset used to decode the CSV files. */
	  private final String encoding;

	  /** Whether an NFKC-normalized copy is added for entries whose surface form is not normalized. */
	  private final boolean normalizeEntries;

	  /** NFKC normalizer; {@code null} when {@code normalizeEntries} is {@code false}. */
	  private final Normalizer2 normalizer;

	  /** Column layout of the CSV input. */
	  private final DictionaryFormat format;

	  /**
	   * Creates a builder.
	   *
	   * @param format column layout of the CSV files; {@code DictionaryFormat.IPADIC} entries are used
	   *        as is, entries of any other format are converted by {@link #formatEntry(String[])}
	   * @param encoding name of the charset used to decode the CSV files
	   * @param normalizeEntries if {@code true}, an NFKC-normalized copy is added for every entry whose
	   *        surface form is not already normalized
	   */
	  public TokenInfoDictionaryBuilder(DictionaryFormat format, String encoding, boolean normalizeEntries) {
	    this.format = format;
	    this.encoding = encoding;
	    this.normalizeEntries = normalizeEntries;
	    this.normalizer = normalizeEntries ? Normalizer2.getInstance(null, "nfkc", Normalizer2.Mode.COMPOSE) : null;
	  }

	  /**
	   * Builds the dictionary from every {@code *.csv} file found directly in the given directory.
	   * The files are processed in sorted order.
	   *
	   * @param dirname path of the directory holding the CSV files
	   * @return the populated dictionary writer
	   * @throws IOException if the directory cannot be listed or a file cannot be read or decoded
	   */
	  public TokenInfoDictionaryWriter build(String dirname) throws IOException {
	    FilenameFilter filter = new FilenameFilter() {
	      @Override
	      public boolean accept(File dir, String name) {
	        return name.endsWith(".csv");
	      }
	    };
	    File[] found = new File(dirname).listFiles(filter);
	    if (found == null) {
	      // listFiles returns null when the path is not a directory or an I/O error occurs
	      throw new IOException("Cannot list dictionary directory: " + dirname);
	    }
	    List<File> csvFiles = new ArrayList<File>(Arrays.asList(found));
	    Collections.sort(csvFiles);
	    return buildDictionary(csvFiles);
	  }

	  /**
	   * Builds the dictionary from the given CSV files.
	   * <p>
	   * Lines with fewer than 13 columns are reported on standard output and skipped. An entry for
	   * which {@code dictionary.put} returns the current offset is reported as failed and skipped.
	   *
	   * @param csvFiles CSV files to read, in the order they should be processed
	   * @return the populated dictionary writer, with its FST set
	   * @throws IOException if a file cannot be read or decoded
	   */
	  public TokenInfoDictionaryWriter buildDictionary(List<File> csvFiles) throws IOException {
	    TokenInfoDictionaryWriter dictionary = new TokenInfoDictionaryWriter(10 * 1024 * 1024);

	    // all lines in the file
	    System.out.println(" parse...");
	    List<String[]> lines = new ArrayList<String[]>(400000);
	    for (File file : csvFiles) {
	      readEntries(file, lines);
	    }

	    System.out.println(" sort...");

	    // sort by term: we sorted the files already and use a stable sort.
	    Collections.sort(lines, new Comparator<String[]>() {
	      @Override
	      public int compare(String[] left, String[] right) {
	        return left[0].compareTo(right[0]);
	      }
	    });

	    System.out.println(" encode...");

	    PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton();
	    Builder<Long> fstBuilder = new Builder<Long>(FST.INPUT_TYPE.BYTE2, 0, 0, true, true, Integer.MAX_VALUE, fstOutput, null, true, PackedInts.DEFAULT, true, 15);
	    IntsRef scratch = new IntsRef();
	    long ord = -1; // first ord will be 0
	    String lastValue = null;

	    // build tokeninfo dictionary
	    for (String[] entry : lines) {
	      int next = dictionary.put(entry);

	      // an unchanged offset means nothing was written for this entry
	      if (next == offset) {
	        System.out.println("Failed to process line: " + Arrays.toString(entry));
	        continue;
	      }

	      String token = entry[0];
	      if (!token.equals(lastValue)) {
	        // new word to add to fst
	        ord++;
	        lastValue = token;
	        scratch.grow(token.length());
	        scratch.length = token.length();
	        // FST input is one int per UTF-16 code unit of the surface form
	        for (int i = 0; i < token.length(); i++) {
	          scratch.ints[i] = (int) token.charAt(i);
	        }
	        fstBuilder.add(scratch, ord);
	      }
	      // associate the surface form's ordinal with the offset held before this put
	      dictionary.addMapping((int) ord, offset);
	      offset = next;
	    }

	    final FST<Long> fst = fstBuilder.finish();

	    System.out.print(" " + fst.getNodeCount() + " nodes, " + fst.getArcCount() + " arcs, " + fst.sizeInBytes() + " bytes... ");
	    dictionary.setFST(fst);
	    System.out.println(" done");

	    return dictionary;
	  }

	  /**
	   * Parses one CSV file and appends its formatted entries (plus NFKC-normalized copies, when
	   * enabled) to {@code lines}. The file is always closed before returning.
	   */
	  private void readEntries(File file, List<String[]> lines) throws IOException {
	    // Create the decoder before opening the file so a bad charset name cannot leak an open stream.
	    // Malformed or unmappable input is reported as an error rather than silently replaced.
	    CharsetDecoder decoder = Charset.forName(encoding).newDecoder()
	        .onMalformedInput(CodingErrorAction.REPORT)
	        .onUnmappableCharacter(CodingErrorAction.REPORT);
	    BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(file), decoder));
	    try {
	      String line;
	      while ((line = reader.readLine()) != null) {
	        String[] entry = CSVUtil.parse(line);

	        if (entry.length < 13) {
	          System.out.println("Entry in CSV is not valid: " + line);
	          continue;
	        }

	        lines.add(formatEntry(entry));

	        // NFKC normalize dictionary entry: keep the original and add a normalized copy
	        if (normalizeEntries && !normalizer.isNormalized(entry[0])) {
	          String[] normalizedEntry = new String[entry.length];
	          for (int i = 0; i < entry.length; i++) {
	            normalizedEntry[i] = normalizer.normalize(entry[i]);
	          }
	          lines.add(formatEntry(normalizedEntry));
	        }
	      }
	    } finally {
	      reader.close();
	    }
	  }

	  /*
	   * IPADIC features
	   *
	   * 0 - surface
	   * 1 - left cost
	   * 2 - right cost
	   * 3 - word cost
	   * 4-9 - pos
	   * 10 - base form
	   * 11 - reading
	   * 12 - pronunciation
	   *
	   * UniDic features
	   *
	   * 0 - surface
	   * 1 - left cost
	   * 2 - right cost
	   * 3 - word cost
	   * 4-9 - pos
	   * 10 - base form reading
	   * 11 - base form
	   * 12 - surface form
	   * 13 - surface reading
	   */

	  /**
	   * Converts a parsed CSV entry to the 13-column IPADIC layout.
	   *
	   * @param features the parsed columns of one entry
	   * @return {@code features} itself for the IPADIC format; otherwise a new 13-element array with
	   *         columns 0-9 copied, the base form taken from column 11, and the surface reading
	   *         (column 13) used for both reading and pronunciation
	   */
	  public String[] formatEntry(String[] features) {
	    if (this.format == DictionaryFormat.IPADIC) {
	      return features;
	    }

	    String[] features2 = new String[13];
	    // surface, costs and part-of-speech columns sit at the same positions in both layouts
	    System.arraycopy(features, 0, features2, 0, 10);
	    features2[10] = features[11];

	    // If the surface reading is non-existent, use surface form for reading and pronunciation.
	    // This happens with punctuation in UniDic and there are possibly other cases as well.
	    // A missing column 13 is treated the same way as an empty one instead of failing with an
	    // ArrayIndexOutOfBoundsException, since callers only guarantee 13 columns.
	    String surfaceReading = features.length > 13 ? features[13] : "";
	    String reading = surfaceReading.length() == 0 ? features[0] : surfaceReading;
	    features2[11] = reading;
	    features2[12] = reading;
	    return features2;
	  }
	}