lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/TokenInfoDictionaryBuilder.java - lucene-solr - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.lucene.analysis.ko.util;

 import java.io.BufferedReader;
 import java.io.IOException;
 import java.nio.charset.Charset;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.text.Normalizer;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Comparator;
 import java.util.List;
 import java.util.stream.Collectors;
 import java.util.stream.Stream;

 import org.apache.lucene.util.IntsRefBuilder;
 import org.apache.lucene.util.fst.Builder;
 import org.apache.lucene.util.fst.FST;

 import org.apache.lucene.util.fst.PositiveIntOutputs;

 /**
  * Builds a {@link TokenInfoDictionaryWriter} from the {@code .csv} files found in a directory.
  *
  * <p>Every CSV line is parsed with {@code CSVUtil.parse}, optionally NFKC-normalized, sorted by
  * surface form (field 0) and then written to the dictionary while an FST mapping each distinct
  * surface form to an ordinal is built alongside.
  */
 class TokenInfoDictionaryBuilder {

   /** Name of the charset used to decode the CSV files. */
   private final String encoding;

   /** Normalization form applied to every field, or {@code null} when entries are kept as read. */
   private final Normalizer.Form normalForm;

   /**
    * @param encoding name of the charset the CSV files are decoded with
    * @param normalizeEntries if {@code true}, every field of every entry is NFKC-normalized
    */
   TokenInfoDictionaryBuilder(String encoding, boolean normalizeEntries) {
     this.encoding = encoding;
     this.normalForm = normalizeEntries ? Normalizer.Form.NFKC : null;
   }

   /**
    * Builds the dictionary from all files in {@code dir} whose name ends with {@code .csv}.
    * Files are processed in sorted path order.
    *
    * @param dir directory to list (not recursive)
    * @return the populated dictionary writer
    * @throws IOException if listing the directory or reading a file fails
    * @throws IllegalArgumentException if a CSV line has fewer than 12 fields
    * @throws IllegalStateException if the writer reports no progress for an entry
    */
   public TokenInfoDictionaryWriter build(Path dir) throws IOException {
     try (Stream<Path> files = Files.list(dir)) {
       List<Path> csvFiles = files
           .filter(path -> path.getFileName().toString().endsWith(".csv"))
           .sorted()
           .collect(Collectors.toList());
       return buildDictionary(csvFiles);
     }
   }

   /** Reads, sorts and writes all entries of {@code csvFiles}, building the surface-form FST. */
   private TokenInfoDictionaryWriter buildDictionary(List<Path> csvFiles) throws IOException {
     TokenInfoDictionaryWriter dictionary = new TokenInfoDictionaryWriter(10 * 1024 * 1024);
     List<String[]> lines = readEntries(csvFiles);

     // Sort by term: the files were sorted already and List.sort is stable, so entries sharing a
     // surface form keep their file order. The key is the trimmed surface form because that is
     // what is fed to the FST and compared against the previous term below; sorting on the raw
     // field could hand whitespace-padded terms to the FST builder out of order.
     lines.sort(Comparator.comparing(entry -> entry[0].trim()));

     PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton();
     Builder<Long> fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE2, 0, 0, true, true, Integer.MAX_VALUE, fstOutput, true, 15);
     IntsRefBuilder scratch = new IntsRefBuilder();
     long ord = -1; // first ord will be 0
     String lastValue = null;

     // Internal word id: the value returned by the previous put(), starting at 0 for the fresh
     // writer created above. Kept local so that a second build() on the same builder does not
     // start from the offset left behind by an earlier run.
     int offset = 0;

     // build token info dictionary
     for (String[] entry : lines) {
       String surfaceForm = entry[0].trim();
       if (surfaceForm.isEmpty()) {
         continue;
       }
       int next = dictionary.put(entry);

       // an unchanged value means the writer did not accept the entry
       if (next == offset) {
         throw new IllegalStateException("Failed to process line: " + Arrays.toString(entry));
       }

       if (!surfaceForm.equals(lastValue)) {
         // new word to add to fst: one int per UTF-16 code unit
         ord++;
         lastValue = surfaceForm;
         scratch.grow(surfaceForm.length());
         scratch.setLength(surfaceForm.length());
         for (int i = 0; i < surfaceForm.length(); i++) {
           scratch.setIntAt(i, surfaceForm.charAt(i));
         }
         fstBuilder.add(scratch.get(), ord);
       }
       // all entries with the same surface form share one ord, each with its own offset
       dictionary.addMapping((int) ord, offset);
       offset = next;
     }
     dictionary.setFST(fstBuilder.finish());
     return dictionary;
   }

   /**
    * Parses every line of every file into its CSV fields, normalizing them when requested.
    *
    * @throws IllegalArgumentException if a line yields fewer than 12 fields
    */
   private List<String[]> readEntries(List<Path> csvFiles) throws IOException {
     // all lines of all files
     List<String[]> lines = new ArrayList<>(400000);
     for (Path path : csvFiles) {
       try (BufferedReader reader = Files.newBufferedReader(path, Charset.forName(encoding))) {
         String line;
         while ((line = reader.readLine()) != null) {
           String[] entry = CSVUtil.parse(line);

           if (entry.length < 12) {
             throw new IllegalArgumentException("Entry in CSV is not valid (12 field values expected): " + line);
           }

           lines.add(normalize(entry));
         }
       }
     }
     return lines;
   }

   /** Returns a normalized copy of {@code entry}, or {@code entry} itself when normalization is off. */
   private String[] normalize(String[] entry) {
     if (normalForm == null) {
       return entry;
     }
     String[] normalizedEntry = new String[entry.length];
     for (int i = 0; i < entry.length; i++) {
       normalizedEntry[i] = Normalizer.normalize(entry[i], normalForm);
     }
     return normalizedEntry;
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.lucene.analysis.ko.util;

	import java.io.BufferedReader;
	import java.io.IOException;
	import java.nio.charset.Charset;
	import java.nio.file.Files;
	import java.nio.file.Path;
	import java.text.Normalizer;
	import java.util.ArrayList;
	import java.util.Arrays;
	import java.util.Comparator;
	import java.util.List;
	import java.util.stream.Collectors;
	import java.util.stream.Stream;

	import org.apache.lucene.util.IntsRefBuilder;
	import org.apache.lucene.util.fst.Builder;
	import org.apache.lucene.util.fst.FST;

	import org.apache.lucene.util.fst.PositiveIntOutputs;

	/**
	 * Builds a {@link TokenInfoDictionaryWriter} from the {@code .csv} files found in a directory.
	 *
	 * <p>Every CSV line is parsed with {@code CSVUtil.parse}, optionally NFKC-normalized, sorted by
	 * surface form (field 0) and then written to the dictionary while an FST mapping each distinct
	 * surface form to an ordinal is built alongside.
	 */
	class TokenInfoDictionaryBuilder {

	  /** Name of the charset used to decode the CSV files. */
	  private final String encoding;

	  /** Normalization form applied to every field, or {@code null} when entries are kept as read. */
	  private final Normalizer.Form normalForm;

	  /**
	   * @param encoding name of the charset the CSV files are decoded with
	   * @param normalizeEntries if {@code true}, every field of every entry is NFKC-normalized
	   */
	  TokenInfoDictionaryBuilder(String encoding, boolean normalizeEntries) {
	    this.encoding = encoding;
	    this.normalForm = normalizeEntries ? Normalizer.Form.NFKC : null;
	  }

	  /**
	   * Builds the dictionary from all files in {@code dir} whose name ends with {@code .csv}.
	   * Files are processed in sorted path order.
	   *
	   * @param dir directory to list (not recursive)
	   * @return the populated dictionary writer
	   * @throws IOException if listing the directory or reading a file fails
	   * @throws IllegalArgumentException if a CSV line has fewer than 12 fields
	   * @throws IllegalStateException if the writer reports no progress for an entry
	   */
	  public TokenInfoDictionaryWriter build(Path dir) throws IOException {
	    try (Stream<Path> files = Files.list(dir)) {
	      List<Path> csvFiles = files
	          .filter(path -> path.getFileName().toString().endsWith(".csv"))
	          .sorted()
	          .collect(Collectors.toList());
	      return buildDictionary(csvFiles);
	    }
	  }

	  /** Reads, sorts and writes all entries of {@code csvFiles}, building the surface-form FST. */
	  private TokenInfoDictionaryWriter buildDictionary(List<Path> csvFiles) throws IOException {
	    TokenInfoDictionaryWriter dictionary = new TokenInfoDictionaryWriter(10 * 1024 * 1024);
	    List<String[]> lines = readEntries(csvFiles);

	    // Sort by term: the files were sorted already and List.sort is stable, so entries sharing a
	    // surface form keep their file order. The key is the trimmed surface form because that is
	    // what is fed to the FST and compared against the previous term below; sorting on the raw
	    // field could hand whitespace-padded terms to the FST builder out of order.
	    lines.sort(Comparator.comparing(entry -> entry[0].trim()));

	    PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton();
	    Builder<Long> fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE2, 0, 0, true, true, Integer.MAX_VALUE, fstOutput, true, 15);
	    IntsRefBuilder scratch = new IntsRefBuilder();
	    long ord = -1; // first ord will be 0
	    String lastValue = null;

	    // Internal word id: the value returned by the previous put(), starting at 0 for the fresh
	    // writer created above. Kept local so that a second build() on the same builder does not
	    // start from the offset left behind by an earlier run.
	    int offset = 0;

	    // build token info dictionary
	    for (String[] entry : lines) {
	      String surfaceForm = entry[0].trim();
	      if (surfaceForm.isEmpty()) {
	        continue;
	      }
	      int next = dictionary.put(entry);

	      // an unchanged value means the writer did not accept the entry
	      if (next == offset) {
	        throw new IllegalStateException("Failed to process line: " + Arrays.toString(entry));
	      }

	      if (!surfaceForm.equals(lastValue)) {
	        // new word to add to fst: one int per UTF-16 code unit
	        ord++;
	        lastValue = surfaceForm;
	        scratch.grow(surfaceForm.length());
	        scratch.setLength(surfaceForm.length());
	        for (int i = 0; i < surfaceForm.length(); i++) {
	          scratch.setIntAt(i, surfaceForm.charAt(i));
	        }
	        fstBuilder.add(scratch.get(), ord);
	      }
	      // all entries with the same surface form share one ord, each with its own offset
	      dictionary.addMapping((int) ord, offset);
	      offset = next;
	    }
	    dictionary.setFST(fstBuilder.finish());
	    return dictionary;
	  }

	  /**
	   * Parses every line of every file into its CSV fields, normalizing them when requested.
	   *
	   * @throws IllegalArgumentException if a line yields fewer than 12 fields
	   */
	  private List<String[]> readEntries(List<Path> csvFiles) throws IOException {
	    // all lines of all files
	    List<String[]> lines = new ArrayList<>(400000);
	    for (Path path : csvFiles) {
	      try (BufferedReader reader = Files.newBufferedReader(path, Charset.forName(encoding))) {
	        String line;
	        while ((line = reader.readLine()) != null) {
	          String[] entry = CSVUtil.parse(line);

	          if (entry.length < 12) {
	            throw new IllegalArgumentException("Entry in CSV is not valid (12 field values expected): " + line);
	          }

	          lines.add(normalize(entry));
	        }
	      }
	    }
	    return lines;
	  }

	  /** Returns a normalized copy of {@code entry}, or {@code entry} itself when normalization is off. */
	  private String[] normalize(String[] entry) {
	    if (normalForm == null) {
	      return entry;
	    }
	    String[] normalizedEntry = new String[entry.length];
	    for (int i = 0; i < entry.length; i++) {
	      normalizedEntry[i] = Normalizer.normalize(entry[i], normalForm);
	    }
	    return normalizedEntry;
	  }
	}