/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
| package org.apache.lucene.analysis.cn.smart.hhmm; |
| |
| import java.util.List; |
| |
| import org.apache.lucene.analysis.cn.smart.CharType; |
| import org.apache.lucene.analysis.cn.smart.Utility; |
| import org.apache.lucene.analysis.cn.smart.WordType; |
| |
| /** |
| * Finds the optimal segmentation of a sentence into Chinese words |
| * @lucene.experimental |
| */ |
| public class HHMMSegmenter { |
| |
| private static WordDictionary wordDict = WordDictionary.getInstance(); |
| |
| /** |
| * Create the {@link SegGraph} for a sentence. |
| * |
| * @param sentence input sentence, without start and end markers |
| * @return {@link SegGraph} corresponding to the input sentence. |
| */ |
| @SuppressWarnings("fallthrough") |
| private SegGraph createSegGraph(String sentence) { |
| int i = 0, j; |
| int length = sentence.length(); |
| int foundIndex; |
| int[] charTypeArray = getCharTypes(sentence); |
| StringBuilder wordBuf = new StringBuilder(); |
| SegToken token; |
| int frequency = 0; // the number of times word appears. |
| boolean hasFullWidth; |
| int wordType; |
| char[] charArray; |
| |
| SegGraph segGraph = new SegGraph(); |
| while (i < length) { |
| hasFullWidth = false; |
| switch (charTypeArray[i]) { |
| case CharType.SPACE_LIKE: |
| i++; |
| break; |
| case CharType.SURROGATE: |
| int state = Character.codePointAt(sentence, i); |
| int count = Character.charCount(state); |
| charArray = new char[count]; |
| sentence.getChars(i, i + count, charArray, 0); |
| token = new SegToken(charArray, i, i + count, WordType.CHINESE_WORD, 0); |
| segGraph.addToken(token); |
| i += count; |
| break; |
| case CharType.HANZI: |
| j = i + 1; |
| wordBuf.delete(0, wordBuf.length()); |
| // It doesn't matter if a single Chinese character (Hanzi) can form a phrase or not, |
| // it will store that single Chinese character (Hanzi) in the SegGraph. Otherwise, it will |
| // cause word division. |
| wordBuf.append(sentence.charAt(i)); |
| charArray = new char[] { sentence.charAt(i) }; |
| frequency = wordDict.getFrequency(charArray); |
| token = new SegToken(charArray, i, j, WordType.CHINESE_WORD, |
| frequency); |
| segGraph.addToken(token); |
| |
| foundIndex = wordDict.getPrefixMatch(charArray); |
| while (j <= length && foundIndex != -1) { |
| if (wordDict.isEqual(charArray, foundIndex) && charArray.length > 1) { |
| // It is the phrase we are looking for; In other words, we have found a phrase SegToken |
| // from i to j. It is not a monosyllabic word (single word). |
| frequency = wordDict.getFrequency(charArray); |
| token = new SegToken(charArray, i, j, WordType.CHINESE_WORD, |
| frequency); |
| segGraph.addToken(token); |
| } |
| |
| while (j < length && charTypeArray[j] == CharType.SPACE_LIKE) |
| j++; |
| |
| if (j < length && charTypeArray[j] == CharType.HANZI) { |
| wordBuf.append(sentence.charAt(j)); |
| charArray = new char[wordBuf.length()]; |
| wordBuf.getChars(0, charArray.length, charArray, 0); |
| // idArray has been found (foundWordIndex!=-1) as a prefix before. |
| // Therefore, idArray after it has been lengthened can only appear after foundWordIndex. |
| // So start searching after foundWordIndex. |
| foundIndex = wordDict.getPrefixMatch(charArray, foundIndex); |
| j++; |
| } else { |
| break; |
| } |
| } |
| i++; |
| break; |
| case CharType.FULLWIDTH_LETTER: |
| hasFullWidth = true; /* intentional fallthrough */ |
| case CharType.LETTER: |
| j = i + 1; |
| while (j < length |
| && (charTypeArray[j] == CharType.LETTER || charTypeArray[j] == CharType.FULLWIDTH_LETTER)) { |
| if (charTypeArray[j] == CharType.FULLWIDTH_LETTER) |
| hasFullWidth = true; |
| j++; |
| } |
| // Found a Token from i to j. Type is LETTER char string. |
| charArray = Utility.STRING_CHAR_ARRAY; |
| frequency = wordDict.getFrequency(charArray); |
| wordType = hasFullWidth ? WordType.FULLWIDTH_STRING : WordType.STRING; |
| token = new SegToken(charArray, i, j, wordType, frequency); |
| segGraph.addToken(token); |
| i = j; |
| break; |
| case CharType.FULLWIDTH_DIGIT: |
| hasFullWidth = true; /* intentional fallthrough */ |
| case CharType.DIGIT: |
| j = i + 1; |
| while (j < length |
| && (charTypeArray[j] == CharType.DIGIT || charTypeArray[j] == CharType.FULLWIDTH_DIGIT)) { |
| if (charTypeArray[j] == CharType.FULLWIDTH_DIGIT) |
| hasFullWidth = true; |
| j++; |
| } |
| // Found a Token from i to j. Type is NUMBER char string. |
| charArray = Utility.NUMBER_CHAR_ARRAY; |
| frequency = wordDict.getFrequency(charArray); |
| wordType = hasFullWidth ? WordType.FULLWIDTH_NUMBER : WordType.NUMBER; |
| token = new SegToken(charArray, i, j, wordType, frequency); |
| segGraph.addToken(token); |
| i = j; |
| break; |
| case CharType.DELIMITER: |
| j = i + 1; |
| // No need to search the weight for the punctuation. Picking the highest frequency will work. |
| frequency = Utility.MAX_FREQUENCE; |
| charArray = new char[] { sentence.charAt(i) }; |
| token = new SegToken(charArray, i, j, WordType.DELIMITER, frequency); |
| segGraph.addToken(token); |
| i = j; |
| break; |
| default: |
| j = i + 1; |
| // Treat the unrecognized char symbol as unknown string. |
| // For example, any symbol not in GB2312 is treated as one of these. |
| charArray = Utility.STRING_CHAR_ARRAY; |
| frequency = wordDict.getFrequency(charArray); |
| token = new SegToken(charArray, i, j, WordType.STRING, frequency); |
| segGraph.addToken(token); |
| i = j; |
| break; |
| } |
| } |
| |
| // Add two more Tokens: "beginning xx beginning" |
| charArray = Utility.START_CHAR_ARRAY; |
| frequency = wordDict.getFrequency(charArray); |
| token = new SegToken(charArray, -1, 0, WordType.SENTENCE_BEGIN, frequency); |
| segGraph.addToken(token); |
| |
| // "end xx end" |
| charArray = Utility.END_CHAR_ARRAY; |
| frequency = wordDict.getFrequency(charArray); |
| token = new SegToken(charArray, length, length + 1, WordType.SENTENCE_END, |
| frequency); |
| segGraph.addToken(token); |
| |
| return segGraph; |
| } |
| |
| /** |
| * Get the character types for every character in a sentence. |
| * |
| * @see Utility#getCharType(char) |
| * @param sentence input sentence |
| * @return array of character types corresponding to character positions in the sentence |
| */ |
| private static int[] getCharTypes(String sentence) { |
| int length = sentence.length(); |
| int[] charTypeArray = new int[length]; |
| // the type of each character by position |
| for (int i = 0; i < length; i++) { |
| charTypeArray[i] = Utility.getCharType(sentence.charAt(i)); |
| } |
| |
| return charTypeArray; |
| } |
| |
| /** |
| * Return a list of {@link SegToken} representing the best segmentation of a sentence |
| * @param sentence input sentence |
| * @return best segmentation as a {@link List} |
| */ |
| public List<SegToken> process(String sentence) { |
| SegGraph segGraph = createSegGraph(sentence); |
| BiSegGraph biSegGraph = new BiSegGraph(segGraph); |
| List<SegToken> shortPath = biSegGraph.getShortPath(); |
| return shortPath; |
| } |
| } |