/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
| package org.apache.lucene.analysis.ko.dict; |
| |
| import java.io.BufferedReader; |
| import java.io.IOException; |
| import java.io.Reader; |
| import java.util.ArrayList; |
| import java.util.Comparator; |
| import java.util.List; |
| import org.apache.lucene.analysis.ko.POS; |
| import org.apache.lucene.util.IntsRefBuilder; |
| import org.apache.lucene.util.fst.FST; |
| import org.apache.lucene.util.fst.FSTCompiler; |
| import org.apache.lucene.util.fst.PositiveIntOutputs; |
| |
| /** |
| * Class for building a User Dictionary. This class allows for adding custom nouns (세종) or compounds |
| * (세종시 세종 시). |
| */ |
| public final class UserDictionary implements Dictionary { |
| // text -> wordID |
| private final TokenInfoFST fst; |
| |
| private static final int WORD_COST = -100000; |
| |
| // NNG left |
| private static final short LEFT_ID = 1781; |
| |
| // NNG right |
| private static final short RIGHT_ID = 3533; |
| // NNG right with hangul and a coda on the last char |
| private static final short RIGHT_ID_T = 3535; |
| // NNG right with hangul and no coda on the last char |
| private static final short RIGHT_ID_F = 3534; |
| |
| // length, length... indexed by compound ID or null for simple noun |
| private final int[][] segmentations; |
| private final short[] rightIds; |
| |
| public static UserDictionary open(Reader reader) throws IOException { |
| |
| BufferedReader br = new BufferedReader(reader); |
| String line; |
| List<String> entries = new ArrayList<>(); |
| |
| // text + optional segmentations |
| while ((line = br.readLine()) != null) { |
| // Remove comments |
| line = line.replaceAll("#.*$", ""); |
| |
| // Skip empty lines or comment lines |
| if (line.trim().length() == 0) { |
| continue; |
| } |
| entries.add(line); |
| } |
| |
| if (entries.isEmpty()) { |
| return null; |
| } else { |
| return new UserDictionary(entries); |
| } |
| } |
| |
| private UserDictionary(List<String> entries) throws IOException { |
| final CharacterDefinition charDef = CharacterDefinition.getInstance(); |
| entries.sort(Comparator.comparing(e -> e.split("\\s+")[0])); |
| |
| PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton(); |
| FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, fstOutput); |
| IntsRefBuilder scratch = new IntsRefBuilder(); |
| |
| String lastToken = null; |
| List<int[]> segmentations = new ArrayList<>(entries.size()); |
| List<Short> rightIds = new ArrayList<>(entries.size()); |
| long ord = 0; |
| for (String entry : entries) { |
| String[] splits = entry.split("\\s+"); |
| String token = splits[0]; |
| if (token.equals(lastToken)) { |
| continue; |
| } |
| char lastChar = entry.charAt(entry.length() - 1); |
| if (charDef.isHangul(lastChar)) { |
| if (charDef.hasCoda(lastChar)) { |
| rightIds.add(RIGHT_ID_T); |
| } else { |
| rightIds.add(RIGHT_ID_F); |
| } |
| } else { |
| rightIds.add(RIGHT_ID); |
| } |
| |
| if (splits.length == 1) { |
| segmentations.add(null); |
| } else { |
| int[] length = new int[splits.length - 1]; |
| int offset = 0; |
| for (int i = 1; i < splits.length; i++) { |
| length[i - 1] = splits[i].length(); |
| offset += splits[i].length(); |
| } |
| if (offset > token.length()) { |
| throw new IllegalArgumentException( |
| "Illegal user dictionary entry " |
| + entry |
| + " - the segmentation is bigger than the surface form (" |
| + token |
| + ")"); |
| } |
| segmentations.add(length); |
| } |
| |
| // add mapping to FST |
| scratch.grow(token.length()); |
| scratch.setLength(token.length()); |
| for (int i = 0; i < token.length(); i++) { |
| scratch.setIntAt(i, token.charAt(i)); |
| } |
| fstCompiler.add(scratch.get(), ord); |
| lastToken = token; |
| ord++; |
| } |
| this.fst = new TokenInfoFST(fstCompiler.compile()); |
| this.segmentations = segmentations.toArray(new int[segmentations.size()][]); |
| this.rightIds = new short[rightIds.size()]; |
| for (int i = 0; i < rightIds.size(); i++) { |
| this.rightIds[i] = rightIds.get(i); |
| } |
| } |
| |
| public TokenInfoFST getFST() { |
| return fst; |
| } |
| |
| @Override |
| public int getLeftId(int wordId) { |
| return LEFT_ID; |
| } |
| |
| @Override |
| public int getRightId(int wordId) { |
| return rightIds[wordId]; |
| } |
| |
| @Override |
| public int getWordCost(int wordId) { |
| return WORD_COST; |
| } |
| |
| @Override |
| public POS.Type getPOSType(int wordId) { |
| if (segmentations[wordId] == null) { |
| return POS.Type.MORPHEME; |
| } else { |
| return POS.Type.COMPOUND; |
| } |
| } |
| |
| @Override |
| public POS.Tag getLeftPOS(int wordId) { |
| return POS.Tag.NNG; |
| } |
| |
| @Override |
| public POS.Tag getRightPOS(int wordId) { |
| return POS.Tag.NNG; |
| } |
| |
| @Override |
| public String getReading(int wordId) { |
| return null; |
| } |
| |
| @Override |
| public Morpheme[] getMorphemes(int wordId, char[] surfaceForm, int off, int len) { |
| int[] segs = segmentations[wordId]; |
| if (segs == null) { |
| return null; |
| } |
| int offset = 0; |
| Morpheme[] morphemes = new Morpheme[segs.length]; |
| for (int i = 0; i < segs.length; i++) { |
| morphemes[i] = new Morpheme(POS.Tag.NNG, new String(surfaceForm, off + offset, segs[i])); |
| offset += segs[i]; |
| } |
| return morphemes; |
| } |
| |
| /** |
| * Lookup words in text |
| * |
| * @param chars text |
| * @param off offset into text |
| * @param len length of text |
| * @return array of wordId |
| */ |
| public List<Integer> lookup(char[] chars, int off, int len) throws IOException { |
| List<Integer> result = new ArrayList<>(); |
| final FST.BytesReader fstReader = fst.getBytesReader(); |
| |
| FST.Arc<Long> arc = new FST.Arc<>(); |
| int end = off + len; |
| for (int startOffset = off; startOffset < end; startOffset++) { |
| arc = fst.getFirstArc(arc); |
| int output = 0; |
| int remaining = end - startOffset; |
| for (int i = 0; i < remaining; i++) { |
| int ch = chars[startOffset + i]; |
| if (fst.findTargetArc(ch, arc, arc, i == 0, fstReader) == null) { |
| break; // continue to next position |
| } |
| output += arc.output().intValue(); |
| if (arc.isFinal()) { |
| final int finalOutput = output + arc.nextFinalOutput().intValue(); |
| result.add(finalOutput); |
| } |
| } |
| } |
| return result; |
| } |
| } |