lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java - lucene-solr - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.lucene.analysis.ja.dict;


 import java.io.BufferedReader;
 import java.io.IOException;
 import java.io.Reader;
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.Comparator;
 import java.util.List;
 import java.util.Map;
 import java.util.TreeMap;

 import org.apache.lucene.analysis.ja.util.CSVUtil;
 import org.apache.lucene.util.IntsRefBuilder;
 import org.apache.lucene.util.fst.Builder;
 import org.apache.lucene.util.fst.FST;
 import org.apache.lucene.util.fst.PositiveIntOutputs;

 /**
  * Class for building a User Dictionary.
  * This class allows for custom segmentation of phrases.
  */
 public final class UserDictionary implements Dictionary {

   // phrase text -> phrase ID
   private final TokenInfoFST fst;

   // holds wordid, length, length... indexed by phrase ID
   private final int segmentations[][];

   // holds readings and POS, indexed by wordid
   private final String data[];

   private static final int CUSTOM_DICTIONARY_WORD_ID_OFFSET = 100000000;

   public static final int WORD_COST = -100000;

   public static final int LEFT_ID = 5;

   public static final int RIGHT_ID = 5;

   public static UserDictionary open(Reader reader) throws IOException {

     BufferedReader br = new BufferedReader(reader);
     String line = null;
     List<String[]> featureEntries = new ArrayList<>();

     // text, segmentation, readings, POS
     while ((line = br.readLine()) != null) {
       // Remove comments
       line = line.replaceAll("#.*$", "");

       // Skip empty lines or comment lines
       if (line.trim().length() == 0) {
         continue;
       }
       String[] values = CSVUtil.parse(line);
       featureEntries.add(values);
     }

     if (featureEntries.isEmpty()) {
       return null;
     } else {
       return new UserDictionary(featureEntries);
     }
   }

   private UserDictionary(List<String[]> featureEntries) throws IOException {

     int wordId = CUSTOM_DICTIONARY_WORD_ID_OFFSET;
     // TODO: should we allow multiple segmentations per input 'phrase'?
     // the old treemap didn't support this either, and i'm not sure if it's needed/useful?

     Collections.sort(featureEntries, new Comparator<String[]>() {
       @Override
       public int compare(String[] left, String[] right) {
         return left[0].compareTo(right[0]);
      }
     });

     List<String> data = new ArrayList<>(featureEntries.size());
     List<int[]> segmentations = new ArrayList<>(featureEntries.size());

     PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton();
     Builder<Long> fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE2, fstOutput);
     IntsRefBuilder scratch = new IntsRefBuilder();
     long ord = 0;

     for (String[] values : featureEntries) {
       String surface = values[0].replaceAll("\\s", "");
       String concatenatedSegment = values[1].replaceAll("\\s", "");
       String[] segmentation = values[1].replaceAll("  *", " ").split(" ");
       String[] readings = values[2].replaceAll("  *", " ").split(" ");
       String pos = values[3];

       if (segmentation.length != readings.length) {
         throw new RuntimeException("Illegal user dictionary entry " + values[0] +
                                    " - the number of segmentations (" + segmentation.length + ")" +
                                    " does not the match number of readings (" + readings.length + ")");
       }

       if (concatenatedSegment.length() > surface.length()) {
         throw new RuntimeException("Illegal user dictionary entry " + values[0] +
             " - the concatenated segmentation (" + concatenatedSegment + ")" +
             " is longer than the surface form (" + surface + ")");
       }

       int[] wordIdAndLength = new int[segmentation.length + 1]; // wordId offset, length, length....
       wordIdAndLength[0] = wordId;
       for (int i = 0; i < segmentation.length; i++) {
         wordIdAndLength[i + 1] = segmentation[i].length();
         data.add(readings[i] + INTERNAL_SEPARATOR + pos);
         wordId++;
       }
       // add mapping to FST
       String token = values[0];
       scratch.grow(token.length());
       scratch.setLength(token.length());
       for (int i = 0; i < token.length(); i++) {
         scratch.setIntAt(i, (int) token.charAt(i));
       }
       fstBuilder.add(scratch.get(), ord);
       segmentations.add(wordIdAndLength);
       ord++;
     }
     this.fst = new TokenInfoFST(fstBuilder.finish(), false);
     this.data = data.toArray(new String[data.size()]);
     this.segmentations = segmentations.toArray(new int[segmentations.size()][]);
   }

   /**
    * Lookup words in text
    * @param chars text
    * @param off offset into text
    * @param len length of text
    * @return array of {wordId, position, length}
    */
   public int[][] lookup(char[] chars, int off, int len) throws IOException {
     // TODO: can we avoid this treemap/toIndexArray?
     TreeMap<Integer, int[]> result = new TreeMap<>(); // index, [length, length...]
     boolean found = false; // true if we found any results

     final FST.BytesReader fstReader = fst.getBytesReader();

     FST.Arc<Long> arc = new FST.Arc<>();
     int end = off + len;
     for (int startOffset = off; startOffset < end; startOffset++) {
       arc = fst.getFirstArc(arc);
       int output = 0;
       int remaining = end - startOffset;
       for (int i = 0; i < remaining; i++) {
         int ch = chars[startOffset+i];
         if (fst.findTargetArc(ch, arc, arc, i == 0, fstReader) == null) {
           break; // continue to next position
         }
         output += arc.output().intValue();
         if (arc.isFinal()) {
           final int finalOutput = output + arc.nextFinalOutput().intValue();
           result.put(startOffset-off, segmentations[finalOutput]);
           found = true;
         }
       }
     }

     return found ? toIndexArray(result) : EMPTY_RESULT;
   }

   public TokenInfoFST getFST() {
     return fst;
   }

   private static final int[][] EMPTY_RESULT = new int[0][];

   /**
    * Convert Map of index and wordIdAndLength to array of {wordId, index, length}
    * @return array of {wordId, index, length}
    */
   private int[][] toIndexArray(Map<Integer, int[]> input) {
     ArrayList<int[]> result = new ArrayList<>();
     for (Map.Entry<Integer, int[]> entry : input.entrySet()) {
       int[] wordIdAndLength = entry.getValue();
       int wordId = wordIdAndLength[0];
       // convert length to index
       int current = entry.getKey();
       for (int j = 1; j < wordIdAndLength.length; j++) { // first entry is wordId offset
         int[] token = { wordId + j - 1, current, wordIdAndLength[j] };
         result.add(token);
         current += wordIdAndLength[j];
       }
     }
     return result.toArray(new int[result.size()][]);
   }

   public int[] lookupSegmentation(int phraseID) {
     return segmentations[phraseID];
   }

   @Override
   public int getLeftId(int wordId) {
     return LEFT_ID;
   }

   @Override
   public int getRightId(int wordId) {
     return RIGHT_ID;
   }

   @Override
   public int getWordCost(int wordId) {
     return WORD_COST;
   }

   @Override
   public String getReading(int wordId, char surface[], int off, int len) {
     return getFeature(wordId, 0);
   }

   @Override
   public String getPartOfSpeech(int wordId) {
     return getFeature(wordId, 1);
   }

   @Override
   public String getBaseForm(int wordId, char surface[], int off, int len) {
     return null; // TODO: add support?
   }

   @Override
   public String getPronunciation(int wordId, char surface[], int off, int len) {
     return null; // TODO: add support?
   }

   @Override
   public String getInflectionType(int wordId) {
     return null; // TODO: add support?
   }

   @Override
   public String getInflectionForm(int wordId) {
     return null; // TODO: add support?
   }

   private String[] getAllFeaturesArray(int wordId) {
     String allFeatures = data[wordId-CUSTOM_DICTIONARY_WORD_ID_OFFSET];
     if(allFeatures == null) {
       return null;
     }

     return allFeatures.split(INTERNAL_SEPARATOR);
   }


   private String getFeature(int wordId, int... fields) {
     String[] allFeatures = getAllFeaturesArray(wordId);
     if (allFeatures == null) {
       return null;
     }
     StringBuilder sb = new StringBuilder();
     if (fields.length == 0) { // All features
       for (String feature : allFeatures) {
         sb.append(CSVUtil.quoteEscape(feature)).append(",");
       }
     } else if (fields.length == 1) { // One feature doesn't need to escape value
       sb.append(allFeatures[fields[0]]).append(",");
     } else {
       for (int field : fields){
         sb.append(CSVUtil.quoteEscape(allFeatures[field])).append(",");
       }
     }
     return sb.deleteCharAt(sb.length() - 1).toString();
   }

 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.lucene.analysis.ja.dict;


	import java.io.BufferedReader;
	import java.io.IOException;
	import java.io.Reader;
	import java.util.ArrayList;
	import java.util.Collections;
	import java.util.Comparator;
	import java.util.List;
	import java.util.Map;
	import java.util.TreeMap;

	import org.apache.lucene.analysis.ja.util.CSVUtil;
	import org.apache.lucene.util.IntsRefBuilder;
	import org.apache.lucene.util.fst.Builder;
	import org.apache.lucene.util.fst.FST;
	import org.apache.lucene.util.fst.PositiveIntOutputs;

	/**
	* Class for building a User Dictionary.
	* This class allows for custom segmentation of phrases.
	*/
	public final class UserDictionary implements Dictionary {

	// phrase text -> phrase ID
	private final TokenInfoFST fst;

	// holds wordid, length, length... indexed by phrase ID
	private final int segmentations[][];

	// holds readings and POS, indexed by wordid
	private final String data[];

	private static final int CUSTOM_DICTIONARY_WORD_ID_OFFSET = 100000000;

	public static final int WORD_COST = -100000;

	public static final int LEFT_ID = 5;

	public static final int RIGHT_ID = 5;

	public static UserDictionary open(Reader reader) throws IOException {

	BufferedReader br = new BufferedReader(reader);
	String line = null;
	List<String[]> featureEntries = new ArrayList<>();

	// text, segmentation, readings, POS
	while ((line = br.readLine()) != null) {
	// Remove comments
	line = line.replaceAll("#.*$", "");

	// Skip empty lines or comment lines
	if (line.trim().length() == 0) {
	continue;
	}
	String[] values = CSVUtil.parse(line);
	featureEntries.add(values);
	}

	if (featureEntries.isEmpty()) {
	return null;
	} else {
	return new UserDictionary(featureEntries);
	}
	}

	private UserDictionary(List<String[]> featureEntries) throws IOException {

	int wordId = CUSTOM_DICTIONARY_WORD_ID_OFFSET;
	// TODO: should we allow multiple segmentations per input 'phrase'?
	// the old treemap didn't support this either, and i'm not sure if it's needed/useful?

	Collections.sort(featureEntries, new Comparator<String[]>() {
	@Override
	public int compare(String[] left, String[] right) {
	return left[0].compareTo(right[0]);
	}
	});

	List<String> data = new ArrayList<>(featureEntries.size());
	List<int[]> segmentations = new ArrayList<>(featureEntries.size());

	PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton();
	Builder<Long> fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE2, fstOutput);
	IntsRefBuilder scratch = new IntsRefBuilder();
	long ord = 0;

	for (String[] values : featureEntries) {
	String surface = values[0].replaceAll("\\s", "");
	String concatenatedSegment = values[1].replaceAll("\\s", "");
	String[] segmentation = values[1].replaceAll(" *", " ").split(" ");
	String[] readings = values[2].replaceAll(" *", " ").split(" ");
	String pos = values[3];

	if (segmentation.length != readings.length) {
	throw new RuntimeException("Illegal user dictionary entry " + values[0] +
	" - the number of segmentations (" + segmentation.length + ")" +
	" does not the match number of readings (" + readings.length + ")");
	}

	if (concatenatedSegment.length() > surface.length()) {
	throw new RuntimeException("Illegal user dictionary entry " + values[0] +
	" - the concatenated segmentation (" + concatenatedSegment + ")" +
	" is longer than the surface form (" + surface + ")");
	}

	int[] wordIdAndLength = new int[segmentation.length + 1]; // wordId offset, length, length....
	wordIdAndLength[0] = wordId;
	for (int i = 0; i < segmentation.length; i++) {
	wordIdAndLength[i + 1] = segmentation[i].length();
	data.add(readings[i] + INTERNAL_SEPARATOR + pos);
	wordId++;
	}
	// add mapping to FST
	String token = values[0];
	scratch.grow(token.length());
	scratch.setLength(token.length());
	for (int i = 0; i < token.length(); i++) {
	scratch.setIntAt(i, (int) token.charAt(i));
	}
	fstBuilder.add(scratch.get(), ord);
	segmentations.add(wordIdAndLength);
	ord++;
	}
	this.fst = new TokenInfoFST(fstBuilder.finish(), false);
	this.data = data.toArray(new String[data.size()]);
	this.segmentations = segmentations.toArray(new int[segmentations.size()][]);
	}

	/**
	* Lookup words in text
	* @param chars text
	* @param off offset into text
	* @param len length of text
	* @return array of {wordId, position, length}
	*/
	public int[][] lookup(char[] chars, int off, int len) throws IOException {
	// TODO: can we avoid this treemap/toIndexArray?
	TreeMap<Integer, int[]> result = new TreeMap<>(); // index, [length, length...]
	boolean found = false; // true if we found any results

	final FST.BytesReader fstReader = fst.getBytesReader();

	FST.Arc<Long> arc = new FST.Arc<>();
	int end = off + len;
	for (int startOffset = off; startOffset < end; startOffset++) {
	arc = fst.getFirstArc(arc);
	int output = 0;
	int remaining = end - startOffset;
	for (int i = 0; i < remaining; i++) {
	int ch = chars[startOffset+i];
	if (fst.findTargetArc(ch, arc, arc, i == 0, fstReader) == null) {
	break; // continue to next position
	}
	output += arc.output().intValue();
	if (arc.isFinal()) {
	final int finalOutput = output + arc.nextFinalOutput().intValue();
	result.put(startOffset-off, segmentations[finalOutput]);
	found = true;
	}
	}
	}

	return found ? toIndexArray(result) : EMPTY_RESULT;
	}

	public TokenInfoFST getFST() {
	return fst;
	}

	private static final int[][] EMPTY_RESULT = new int[0][];

	/**
	* Convert Map of index and wordIdAndLength to array of {wordId, index, length}
	* @return array of {wordId, index, length}
	*/
	private int[][] toIndexArray(Map<Integer, int[]> input) {
	ArrayList<int[]> result = new ArrayList<>();
	for (Map.Entry<Integer, int[]> entry : input.entrySet()) {
	int[] wordIdAndLength = entry.getValue();
	int wordId = wordIdAndLength[0];
	// convert length to index
	int current = entry.getKey();
	for (int j = 1; j < wordIdAndLength.length; j++) { // first entry is wordId offset
	int[] token = { wordId + j - 1, current, wordIdAndLength[j] };
	result.add(token);
	current += wordIdAndLength[j];
	}
	}
	return result.toArray(new int[result.size()][]);
	}

	public int[] lookupSegmentation(int phraseID) {
	return segmentations[phraseID];
	}

	@Override
	public int getLeftId(int wordId) {
	return LEFT_ID;
	}

	@Override
	public int getRightId(int wordId) {
	return RIGHT_ID;
	}

	@Override
	public int getWordCost(int wordId) {
	return WORD_COST;
	}

	@Override
	public String getReading(int wordId, char surface[], int off, int len) {
	return getFeature(wordId, 0);
	}

	@Override
	public String getPartOfSpeech(int wordId) {
	return getFeature(wordId, 1);
	}

	@Override
	public String getBaseForm(int wordId, char surface[], int off, int len) {
	return null; // TODO: add support?
	}

	@Override
	public String getPronunciation(int wordId, char surface[], int off, int len) {
	return null; // TODO: add support?
	}

	@Override
	public String getInflectionType(int wordId) {
	return null; // TODO: add support?
	}

	@Override
	public String getInflectionForm(int wordId) {
	return null; // TODO: add support?
	}

	private String[] getAllFeaturesArray(int wordId) {
	String allFeatures = data[wordId-CUSTOM_DICTIONARY_WORD_ID_OFFSET];
	if(allFeatures == null) {
	return null;
	}

	return allFeatures.split(INTERNAL_SEPARATOR);
	}


	private String getFeature(int wordId, int... fields) {
	String[] allFeatures = getAllFeaturesArray(wordId);
	if (allFeatures == null) {
	return null;
	}
	StringBuilder sb = new StringBuilder();
	if (fields.length == 0) { // All features
	for (String feature : allFeatures) {
	sb.append(CSVUtil.quoteEscape(feature)).append(",");
	}
	} else if (fields.length == 1) { // One feature doesn't need to escape value
	sb.append(allFeatures[fields[0]]).append(",");
	} else {
	for (int field : fields){
	sb.append(CSVUtil.quoteEscape(allFeatures[field])).append(",");
	}
	}
	return sb.deleteCharAt(sb.length() - 1).toString();
	}

	}