// blob: b93190ca8764b5cf4dba1d3ebc83e8c6c248f9c6 [file] [log] [blame]
package org.apache.lucene.analysis.kuromoji;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.ArrayList;
import java.util.EnumMap;
import java.util.List;
import org.apache.lucene.analysis.kuromoji.dict.ConnectionCosts;
import org.apache.lucene.analysis.kuromoji.dict.Dictionary;
import org.apache.lucene.analysis.kuromoji.dict.TokenInfoDictionary;
import org.apache.lucene.analysis.kuromoji.dict.UnknownDictionary;
import org.apache.lucene.analysis.kuromoji.dict.UserDictionary;
import org.apache.lucene.analysis.kuromoji.viterbi.GraphvizFormatter;
import org.apache.lucene.analysis.kuromoji.viterbi.Viterbi;
import org.apache.lucene.analysis.kuromoji.viterbi.ViterbiNode.Type;
import org.apache.lucene.analysis.kuromoji.viterbi.ViterbiNode;
/**
 * Tokenizer main class.
 * <p>
 * Builds a Viterbi lattice over the input text and converts the best path through it into a
 * list of {@link Token}s. All instance state is final and is written only by the constructor.
 * <p>
 * Documented as thread safe by the original author. NOTE(review): that also depends on
 * {@link Viterbi} and the dictionaries being safe for concurrent use, which cannot be
 * confirmed from this class alone.
 */
public class Segmenter {

  /**
   * Segmentation mode. The value is handed unchanged to {@link Viterbi}; this class does not
   * interpret the individual constants itself.
   */
  public enum Mode {
    NORMAL, SEARCH, SEARCH_WITH_COMPOUNDS, EXTENDED
  }

  /** Mode used by the constructors that do not take an explicit {@link Mode}. */
  public static final Mode DEFAULT_MODE = Mode.SEARCH;

  private final Viterbi viterbi;

  /** Maps a node type to the dictionary handed to the {@link Token}s created for that type. */
  private final EnumMap<Type, Dictionary> dictionaryMap = new EnumMap<Type, Dictionary>(Type.class);

  /** When true, {@link #tokenize(String)} cuts the text at 。 and 、 before tokenizing. */
  private final boolean split;

  /** Creates a segmenter with no user dictionary, {@link #DEFAULT_MODE} and no sentence splitting. */
  public Segmenter() {
    this(null, DEFAULT_MODE, false);
  }

  /**
   * Creates a segmenter with no user dictionary and no sentence splitting.
   * @param mode segmentation mode
   */
  public Segmenter(Mode mode) {
    this(null, mode, false);
  }

  /**
   * Creates a segmenter using {@link #DEFAULT_MODE} and no sentence splitting.
   * @param userDictionary user dictionary; the no-dictionary constructors pass {@code null} here
   */
  public Segmenter(UserDictionary userDictionary) {
    this(userDictionary, DEFAULT_MODE, false);
  }

  /**
   * Creates a segmenter with no sentence splitting.
   * @param userDictionary user dictionary; the no-dictionary constructors pass {@code null} here
   * @param mode segmentation mode
   */
  public Segmenter(UserDictionary userDictionary, Mode mode) {
    this(userDictionary, mode, false);
  }

  /**
   * Creates a segmenter backed by the shared {@link TokenInfoDictionary},
   * {@link UnknownDictionary} and {@link ConnectionCosts} instances.
   * @param userDictionary user dictionary; the no-dictionary constructors pass {@code null} here
   * @param mode segmentation mode
   * @param split if true, {@link #tokenize(String)} tokenizes the text piecewise, cutting it
   *        after every 。 and 、
   */
  public Segmenter(UserDictionary userDictionary, Mode mode, boolean split) {
    final TokenInfoDictionary dict = TokenInfoDictionary.getInstance();
    final UnknownDictionary unknownDict = UnknownDictionary.getInstance();
    this.viterbi = new Viterbi(dict, unknownDict, ConnectionCosts.getInstance(), userDictionary, mode);
    this.split = split;

    dictionaryMap.put(Type.KNOWN, dict);
    dictionaryMap.put(Type.UNKNOWN, unknownDict);
    // EnumMap accepts null values, so a missing user dictionary is simply stored as null.
    dictionaryMap.put(Type.USER, userDictionary);
  }

  /**
   * Tokenize input text.
   * <p>
   * Without splitting the whole text is tokenized in one go. With splitting enabled the text is
   * cut after every 。 and 、 (the delimiter stays at the end of its piece), each piece is
   * tokenized separately, and the results are concatenated in order.
   * @param text text to tokenize
   * @return list of Token
   */
  public List<Token> tokenize(String text) {
    if (!split) {
      return doTokenize(0, text);
    }

    List<Integer> splitPositions = getSplitPositions(text);
    if (splitPositions.isEmpty()) {
      return doTokenize(0, text);
    }

    List<Token> result = new ArrayList<Token>();
    int offset = 0;
    for (int position : splitPositions) {
      int end = position + 1; // exclusive end: the delimiter belongs to the current piece
      result.addAll(doTokenize(offset, text.substring(offset, end)));
      offset = end;
    }

    // Whatever follows the last delimiter forms the final piece.
    if (offset < text.length()) {
      result.addAll(doTokenize(offset, text.substring(offset)));
    }
    return result;
  }

  /**
   * Finds the positions at which the input text is split: every occurrence of the Japanese
   * punctuation marks 。 (ideographic full stop) and 、 (ideographic comma).
   * <p>
   * Implemented as a single pass over the text so the cost stays linear regardless of how the
   * two delimiters are distributed.
   * @param text text to scan
   * @return indexes of all delimiters, in ascending order; empty if there are none
   */
  private List<Integer> getSplitPositions(String text) {
    List<Integer> splitPositions = new ArrayList<Integer>();
    for (int i = 0, length = text.length(); i < length; i++) {
      char ch = text.charAt(i);
      if (ch == '。' || ch == '、') {
        splitPositions.add(i);
      }
    }
    return splitPositions;
  }

  /**
   * Tokenizes a whole string, keeping punctuation.
   * @param offset offset of sentence in original input text
   * @param sentence sentence to tokenize
   * @return list of Token
   */
  private List<Token> doTokenize(int offset, String sentence) {
    char text[] = sentence.toCharArray();
    return doTokenize(offset, text, 0, text.length, false);
  }

  /**
   * Tokenize input sentence.
   * @param offset offset of sentence in original input text; added to each node's start index
   *        to obtain the token position
   * @param sentence buffer holding the sentence to tokenize
   * @param sentenceOffset passed to {@link Viterbi} as the start of the sentence in {@code sentence}
   * @param sentenceLength passed to {@link Viterbi} as the length of the sentence
   * @param discardPunctuation if true, non-empty nodes whose first surface character satisfies
   *        {@link #isPunctuation(char)} are not emitted
   * @return list of Token
   * @throws RuntimeException wrapping an {@link IOException} raised while building the lattice
   */
  public List<Token> doTokenize(int offset, char[] sentence, int sentenceOffset, int sentenceLength, boolean discardPunctuation) {
    ViterbiNode[][][] lattice;
    try {
      lattice = viterbi.build(sentence, sentenceOffset, sentenceLength);
    } catch (IOException impossible) {
      throw new RuntimeException(impossible);
    }
    List<ViterbiNode> bestPath = viterbi.search(lattice);

    List<Token> result = new ArrayList<Token>();
    for (ViterbiNode node : bestPath) {
      int wordId = node.getWordId();
      Type type = node.getType();

      if (type == Type.KNOWN && wordId == -1) {
        continue; // Do not include BOS/EOS
      }
      if (discardPunctuation && node.getLength() > 0
          && isPunctuation(node.getSurfaceForm()[node.getOffset()])) {
        continue; // Do not emit punctuation
      }

      // Pass different dictionary based on the type of node
      Dictionary dictionary = dictionaryMap.get(type);
      result.add(new Token(wordId, node.getSurfaceForm(), node.getOffset(), node.getLength(),
          type, offset + node.getStartIndex(), dictionary));
    }
    return result;
  }

  /**
   * Builds the lattice for the text, searches the best path and renders both with
   * {@link GraphvizFormatter}.
   * @param text text to analyze
   * @return a Graphviz String
   * @throws RuntimeException wrapping an {@link IOException} raised while building the lattice
   */
  public String debugTokenize(String text) {
    ViterbiNode[][][] lattice;
    try {
      lattice = this.viterbi.build(text.toCharArray(), 0, text.length());
    } catch (IOException impossible) {
      throw new RuntimeException(impossible);
    }
    List<ViterbiNode> bestPath = this.viterbi.search(lattice);

    return new GraphvizFormatter(ConnectionCosts.getInstance())
        .format(lattice[0], lattice[1], bestPath);
  }

  /**
   * Tells whether a character is treated as punctuation when punctuation is discarded.
   * <p>
   * Despite the name this is broader than Unicode punctuation: it accepts every character whose
   * {@link Character#getType(char)} is a separator, control, format, punctuation or symbol
   * category.
   * @param ch character to classify
   * @return true if the character falls into one of those categories
   */
  static final boolean isPunctuation(char ch) {
    switch (Character.getType(ch)) {
      case Character.SPACE_SEPARATOR:
      case Character.LINE_SEPARATOR:
      case Character.PARAGRAPH_SEPARATOR:
      case Character.CONTROL:
      case Character.FORMAT:
      case Character.DASH_PUNCTUATION:
      case Character.START_PUNCTUATION:
      case Character.END_PUNCTUATION:
      case Character.CONNECTOR_PUNCTUATION:
      case Character.OTHER_PUNCTUATION:
      case Character.MATH_SYMBOL:
      case Character.CURRENCY_SYMBOL:
      case Character.MODIFIER_SYMBOL:
      case Character.OTHER_SYMBOL:
      case Character.INITIAL_QUOTE_PUNCTUATION:
      case Character.FINAL_QUOTE_PUNCTUATION:
        return true;
      default:
        return false;
    }
  }
}