modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Segmenter.java - lucene-solr - Git at Google

 package org.apache.lucene.analysis.kuromoji;

 /**
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.EnumMap;
 import java.util.List;

 import org.apache.lucene.analysis.kuromoji.dict.ConnectionCosts;
 import org.apache.lucene.analysis.kuromoji.dict.Dictionary;
 import org.apache.lucene.analysis.kuromoji.dict.TokenInfoDictionary;
 import org.apache.lucene.analysis.kuromoji.dict.UnknownDictionary;
 import org.apache.lucene.analysis.kuromoji.dict.UserDictionary;
 import org.apache.lucene.analysis.kuromoji.viterbi.GraphvizFormatter;
 import org.apache.lucene.analysis.kuromoji.viterbi.Viterbi;
 import org.apache.lucene.analysis.kuromoji.viterbi.ViterbiNode.Type;
 import org.apache.lucene.analysis.kuromoji.viterbi.ViterbiNode;

 /**
  * Tokenizer main class.
  *
  * <p>A {@code Segmenter} hands the input to a {@link Viterbi} instance, takes the best
  * path through the lattice it builds, and turns the nodes on that path into
  * {@link Token}s, each carrying the dictionary that matches the node's type.
  *
  * <p>Documented by the original authors as thread safe. All fields are final and the
  * dictionary map is written only in the constructor. NOTE(review): whether concurrent
  * calls are safe also depends on {@link Viterbi}, which is not visible here - confirm.
  */
 public class Segmenter {
   /** Segmentation modes; the chosen mode is passed to {@link Viterbi} unchanged. */
   public static enum Mode {
     NORMAL, SEARCH, SEARCH_WITH_COMPOUNDS, EXTENDED
   }

   /** Mode used by the constructors that do not take an explicit mode. */
   public static final Mode DEFAULT_MODE = Mode.SEARCH;

   /** Builds the lattice for a sentence and searches it for the best path. */
   private final Viterbi viterbi;

   /** Dictionary handed to each emitted token, keyed by the type of the node it came from. */
   private final EnumMap<Type, Dictionary> dictionaryMap = new EnumMap<Type, Dictionary>(Type.class);

   /** Whether {@link #tokenize(String)} cuts the text at 。 and 、 before tokenizing. */
   private final boolean split;

   /** Creates a segmenter with no user dictionary, {@link #DEFAULT_MODE} and no splitting. */
   public Segmenter() {
     this(null, DEFAULT_MODE, false);
   }

   /**
    * Creates a segmenter with no user dictionary and no splitting.
    * @param mode segmentation mode passed to the Viterbi instance
    */
   public Segmenter(Mode mode) {
     this(null, mode, false);
   }

   /**
    * Creates a segmenter using {@link #DEFAULT_MODE} and no splitting.
    * @param userDictionary user dictionary, or null for none
    */
   public Segmenter(UserDictionary userDictionary) {
     this(userDictionary, DEFAULT_MODE, false);
   }

   /**
    * Creates a segmenter that does not split the text at punctuation.
    * @param userDictionary user dictionary, or null for none
    * @param mode segmentation mode passed to the Viterbi instance
    */
   public Segmenter(UserDictionary userDictionary, Mode mode) {
     this(userDictionary, mode, false);
   }

   /**
    * Creates a segmenter.
    * @param userDictionary user dictionary, or null for none (the shorter constructors pass null)
    * @param mode segmentation mode passed to the Viterbi instance
    * @param split if true, {@link #tokenize(String)} tokenizes the text piecewise,
    *        cutting after every 。 and 、
    */
   public Segmenter(UserDictionary userDictionary, Mode mode, boolean split) {
     final TokenInfoDictionary dict = TokenInfoDictionary.getInstance();
     final UnknownDictionary unknownDict = UnknownDictionary.getInstance();
     this.viterbi = new Viterbi(dict, unknownDict, ConnectionCosts.getInstance(), userDictionary, mode);
     this.split = split;

     dictionaryMap.put(Type.KNOWN, dict);
     dictionaryMap.put(Type.UNKNOWN, unknownDict);
     // May store null when no user dictionary was supplied; EnumMap permits null values.
     dictionaryMap.put(Type.USER, userDictionary);
   }

   /**
    * Tokenize input text.
    *
    * <p>When splitting is enabled and the text contains 。 or 、, each piece (ending with
    * its punctuation mark) is tokenized separately and the results are concatenated; the
    * start of each piece is passed along as the offset for its tokens.
    *
    * @param text text to tokenize
    * @return list of Token
    */
   public List<Token> tokenize(String text) {
     if (!split) {
       return doTokenize(0, text);
     }

     List<Integer> splitPositions = getSplitPositions(text);
     if (splitPositions.isEmpty()) {
       return doTokenize(0, text);
     }

     List<Token> result = new ArrayList<Token>();
     int offset = 0;
     for (int position : splitPositions) {
       // The punctuation mark stays at the end of its piece, hence position + 1.
       result.addAll(doTokenize(offset, text.substring(offset, position + 1)));
       offset = position + 1;
     }

     // Whatever follows the last punctuation mark forms the final piece.
     if (offset < text.length()) {
       result.addAll(doTokenize(offset, text.substring(offset)));
     }

     return result;
   }

   /**
    * Find the positions at which the input text is split: every occurrence of the
    * Japanese punctuation marks 。 (U+3002) and 、 (U+3001).
    *
    * <p>The text is scanned once, so the cost stays linear even when one of the two
    * marks is rare or absent.
    *
    * @param text text to scan
    * @return indexes of the punctuation marks, in ascending order
    */
   private List<Integer> getSplitPositions(String text) {
     List<Integer> splitPositions = new ArrayList<Integer>();

     for (int i = 0; i < text.length(); i++) {
       char ch = text.charAt(i);
       if (ch == '。' || ch == '、') {
         splitPositions.add(i);
       }
     }

     return splitPositions;
   }

   /**
    * Tokenize a whole string, keeping punctuation.
    * @param offset offset of sentence in original input text
    * @param sentence sentence to tokenize
    * @return list of Token
    */
   private List<Token> doTokenize(int offset, String sentence) {
     char[] text = sentence.toCharArray();
     return doTokenize(offset, text, 0, text.length, false);
   }

   /**
    * Tokenize input sentence.
    * @param offset offset of sentence in original input text; added to each node's
    *        start index to form the position given to the token
    * @param sentence buffer holding the sentence to tokenize
    * @param sentenceOffset start of the sentence within the buffer, as passed to the Viterbi builder
    * @param sentenceLength length of the sentence, as passed to the Viterbi builder
    * @param discardPunctuation if true, nodes whose first character satisfies
    *        {@link #isPunctuation(char)} are not emitted
    * @return list of Token
    * @throws RuntimeException wrapping the IOException if building the lattice fails
    */
   public List<Token> doTokenize(int offset, char[] sentence, int sentenceOffset, int sentenceLength, boolean discardPunctuation) {
     List<Token> result = new ArrayList<Token>();
     ViterbiNode[][][] lattice = buildLattice(sentence, sentenceOffset, sentenceLength);
     List<ViterbiNode> bestPath = viterbi.search(lattice);
     for (ViterbiNode node : bestPath) {
       int wordId = node.getWordId();
       if (node.getType() == Type.KNOWN && wordId == -1) {
         continue; // Do not include BOS/EOS
       }
       if (discardPunctuation && node.getLength() > 0 && isPunctuation(node.getSurfaceForm()[node.getOffset()])) {
         continue; // Do not emit punctuation
       }
       // Pass a different dictionary based on the type of node
       result.add(new Token(wordId, node.getSurfaceForm(), node.getOffset(), node.getLength(),
           node.getType(), offset + node.getStartIndex(), dictionaryMap.get(node.getType())));
     }
     return result;
   }

   /**
    * Returns a Graphviz String describing the lattice and best path for the text.
    * @param text text to analyze
    * @return output of the Graphviz formatter
    * @throws RuntimeException wrapping the IOException if building the lattice fails
    */
   public String debugTokenize(String text) {
     ViterbiNode[][][] lattice = buildLattice(text.toCharArray(), 0, text.length());
     List<ViterbiNode> bestPath = viterbi.search(lattice);

     return new GraphvizFormatter(ConnectionCosts.getInstance())
         .format(lattice[0], lattice[1], bestPath);
   }

   /**
    * Builds the Viterbi lattice, rethrowing the checked IOException unchecked with the
    * cause preserved. The original code names that exception "impossible", i.e. it is
    * not expected to occur here.
    */
   private ViterbiNode[][][] buildLattice(char[] chars, int start, int length) {
     try {
       return viterbi.build(chars, start, length);
     } catch (IOException impossible) {
       throw new RuntimeException(impossible);
     }
   }

   /**
    * Tells whether a character counts as punctuation for token filtering.
    * @param ch character to classify
    * @return true if the Unicode general category of ch is one of the separator,
    *         control, format, punctuation or symbol categories listed below
    */
   static final boolean isPunctuation(char ch) {
     switch (Character.getType(ch)) {
       case Character.SPACE_SEPARATOR:
       case Character.LINE_SEPARATOR:
       case Character.PARAGRAPH_SEPARATOR:
       case Character.CONTROL:
       case Character.FORMAT:
       case Character.DASH_PUNCTUATION:
       case Character.START_PUNCTUATION:
       case Character.END_PUNCTUATION:
       case Character.CONNECTOR_PUNCTUATION:
       case Character.OTHER_PUNCTUATION:
       case Character.MATH_SYMBOL:
       case Character.CURRENCY_SYMBOL:
       case Character.MODIFIER_SYMBOL:
       case Character.OTHER_SYMBOL:
       case Character.INITIAL_QUOTE_PUNCTUATION:
       case Character.FINAL_QUOTE_PUNCTUATION:
         return true;
       default:
         return false;
     }
   }
 }
	package org.apache.lucene.analysis.kuromoji;

	/**
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	import java.io.IOException;
	import java.util.ArrayList;
	import java.util.EnumMap;
	import java.util.List;

	import org.apache.lucene.analysis.kuromoji.dict.ConnectionCosts;
	import org.apache.lucene.analysis.kuromoji.dict.Dictionary;
	import org.apache.lucene.analysis.kuromoji.dict.TokenInfoDictionary;
	import org.apache.lucene.analysis.kuromoji.dict.UnknownDictionary;
	import org.apache.lucene.analysis.kuromoji.dict.UserDictionary;
	import org.apache.lucene.analysis.kuromoji.viterbi.GraphvizFormatter;
	import org.apache.lucene.analysis.kuromoji.viterbi.Viterbi;
	import org.apache.lucene.analysis.kuromoji.viterbi.ViterbiNode.Type;
	import org.apache.lucene.analysis.kuromoji.viterbi.ViterbiNode;

	/**
	* Tokenizer main class.
	* Thread safe.
	*/
	public class Segmenter {
	public static enum Mode {
	NORMAL, SEARCH, SEARCH_WITH_COMPOUNDS, EXTENDED
	}

	public static final Mode DEFAULT_MODE = Mode.SEARCH;

	private final Viterbi viterbi;

	private final EnumMap<Type, Dictionary> dictionaryMap = new EnumMap<Type, Dictionary>(Type.class);

	private final boolean split;

	public Segmenter() {
	this(null, DEFAULT_MODE, false);
	}

	public Segmenter(Mode mode) {
	this(null, mode, false);
	}

	public Segmenter(UserDictionary userDictionary) {
	this(userDictionary, DEFAULT_MODE, false);
	}

	public Segmenter(UserDictionary userDictionary, Mode mode) {
	this(userDictionary, mode, false);
	}

	public Segmenter(UserDictionary userDictionary, Mode mode, boolean split) {
	final TokenInfoDictionary dict = TokenInfoDictionary.getInstance();
	final UnknownDictionary unknownDict = UnknownDictionary.getInstance();
	this.viterbi = new Viterbi(dict, unknownDict, ConnectionCosts.getInstance(), userDictionary, mode);
	this.split = split;

	dictionaryMap.put(Type.KNOWN, dict);
	dictionaryMap.put(Type.UNKNOWN, unknownDict);
	dictionaryMap.put(Type.USER, userDictionary);
	}

	/**
	* Tokenize input text
	* @param text
	* @return list of Token
	*/
	public List<Token> tokenize(String text) {

	if (!split) {
	return doTokenize(0, text);
	}

	List<Integer> splitPositions = getSplitPositions(text);

	if(splitPositions.size() == 0) {
	return doTokenize(0, text);
	}

	ArrayList<Token> result = new ArrayList<Token>();
	int offset = 0;
	for(int position : splitPositions) {
	result.addAll(doTokenize(offset, text.substring(offset, position + 1)));
	offset = position + 1;
	}

	if(offset < text.length()) {
	result.addAll(doTokenize(offset, text.substring(offset)));
	}

	return result;
	}

	/**
	* Split input text at 句読点, which is 。 and 、
	* @param text
	* @return list of split position
	*/
	private List<Integer> getSplitPositions(String text) {
	ArrayList<Integer> splitPositions = new ArrayList<Integer>();

	int position = 0;
	int currentPosition = 0;

	while(true) {
	int indexOfMaru = text.indexOf("。", currentPosition);
	int indexOfTen = text.indexOf("、", currentPosition);

	if(indexOfMaru < 0 \|\| indexOfTen < 0) {
	position = Math.max(indexOfMaru, indexOfTen);;
	} else {
	position = Math.min(indexOfMaru, indexOfTen);
	}

	if(position >= 0) {
	splitPositions.add(position);
	currentPosition = position + 1;
	} else {
	break;
	}
	}

	return splitPositions;
	}

	private List<Token> doTokenize(int offset, String sentence) {
	char text[] = sentence.toCharArray();
	return doTokenize(offset, text, 0, text.length, false);
	}

	/**
	* Tokenize input sentence.
	* @param offset offset of sentence in original input text
	* @param sentence sentence to tokenize
	* @return list of Token
	*/
	public List<Token> doTokenize(int offset, char[] sentence, int sentenceOffset, int sentenceLength, boolean discardPunctuation) {
	ArrayList<Token> result = new ArrayList<Token>();
	ViterbiNode[][][] lattice;
	try {
	lattice = viterbi.build(sentence, sentenceOffset, sentenceLength);
	} catch (IOException impossible) {
	throw new RuntimeException(impossible);
	}
	List<ViterbiNode> bestPath = viterbi.search(lattice);
	for (ViterbiNode node : bestPath) {
	int wordId = node.getWordId();
	if (node.getType() == Type.KNOWN && wordId == -1){ // Do not include BOS/EOS
	continue;
	} else if (discardPunctuation && node.getLength() > 0 && isPunctuation(node.getSurfaceForm()[node.getOffset()])) {
	continue; // Do not emit punctuation
	}
	Token token = new Token(wordId, node.getSurfaceForm(), node.getOffset(), node.getLength(), node.getType(), offset + node.getStartIndex(), dictionaryMap.get(node.getType())); // Pass different dictionary based on the type of node
	result.add(token);
	}
	/*
	System.out.println("result:");
	for(Token token : result) {
	System.out.println(" " + token);
	}
	*/
	return result;
	}

	/** returns a Graphviz String */
	public String debugTokenize(String text) {
	ViterbiNode[][][] lattice;
	try {
	lattice = this.viterbi.build(text.toCharArray(), 0, text.length());
	} catch (IOException impossible) {
	throw new RuntimeException(impossible);
	}
	List<ViterbiNode> bestPath = this.viterbi.search(lattice);

	return new GraphvizFormatter(ConnectionCosts.getInstance())
	.format(lattice[0], lattice[1], bestPath);
	}

	static final boolean isPunctuation(char ch) {
	switch(Character.getType(ch)) {
	case Character.SPACE_SEPARATOR:
	case Character.LINE_SEPARATOR:
	case Character.PARAGRAPH_SEPARATOR:
	case Character.CONTROL:
	case Character.FORMAT:
	case Character.DASH_PUNCTUATION:
	case Character.START_PUNCTUATION:
	case Character.END_PUNCTUATION:
	case Character.CONNECTOR_PUNCTUATION:
	case Character.OTHER_PUNCTUATION:
	case Character.MATH_SYMBOL:
	case Character.CURRENCY_SYMBOL:
	case Character.MODIFIER_SYMBOL:
	case Character.OTHER_SYMBOL:
	case Character.INITIAL_QUOTE_PUNCTUATION:
	case Character.FINAL_QUOTE_PUNCTUATION:
	return true;
	default:
	return false;
	}
	}
	}