// blob: 4940dbaf3751db1092c717673c50e9ddfc6e74b3 [file] [log] [blame]
// lucene version compatibility level: 4.8.1
using System.Collections.Generic;
using System.Text;
namespace Lucene.Net.Analysis.Cn.Smart.Hhmm
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/// <summary>
/// Finds the optimal segmentation of a sentence into Chinese words
/// <para/>
/// @lucene.experimental
/// </summary>
public class HHMMSegmenter
{
    /// <summary>
    /// Dictionary used for frequency and prefix lookups. Obtained once from
    /// <c>WordDictionary.GetInstance()</c> and never reassigned.
    /// </summary>
    private static readonly WordDictionary wordDict = WordDictionary.GetInstance();

    /// <summary>
    /// Create the <see cref="SegGraph"/> for a sentence.
    /// <para/>
    /// Walks the sentence once, dispatching on the character type at the current
    /// position, and adds the candidate tokens starting there to the graph. After
    /// the walk, a sentence-begin token (offsets -1..0) and a sentence-end token
    /// (offsets length..length+1) are added.
    /// </summary>
    /// <param name="sentence">input sentence, without start and end markers; must not be <c>null</c></param>
    /// <returns><see cref="SegGraph"/> corresponding to the input sentence.</returns>
    private static SegGraph CreateSegGraph(string sentence)
    {
        int length = sentence.Length;
        CharType[] charTypeArray = GetCharTypes(sentence);
        // Scratch buffer shared by every Hanzi start position; cleared before each use.
        StringBuilder wordBuf = new StringBuilder();
        SegGraph segGraph = new SegGraph();
        char[] charArray;
        int frequency; // the number of times word appears.

        int i = 0;
        while (i < length)
        {
            switch (charTypeArray[i])
            {
                case CharType.SPACE_LIKE:
                    // Space-like characters never produce a token of their own.
                    i++;
                    break;

                case CharType.HANZI:
                    AddHanziTokens(segGraph, sentence, charTypeArray, i, wordBuf);
                    // Advance by a single character only: the following Hanzi may
                    // itself be the start of other dictionary phrases.
                    i++;
                    break;

                case CharType.FULLWIDTH_LETTER:
                case CharType.LETTER:
                    i = AddRunToken(segGraph, charTypeArray, i,
                        CharType.LETTER, CharType.FULLWIDTH_LETTER,
                        Utility.STRING_CHAR_ARRAY,
                        WordType.STRING, WordType.FULLWIDTH_STRING);
                    break;

                case CharType.FULLWIDTH_DIGIT:
                case CharType.DIGIT:
                    i = AddRunToken(segGraph, charTypeArray, i,
                        CharType.DIGIT, CharType.FULLWIDTH_DIGIT,
                        Utility.NUMBER_CHAR_ARRAY,
                        WordType.NUMBER, WordType.FULLWIDTH_NUMBER);
                    break;

                case CharType.DELIMITER:
                    // No need to search the weight for the punctuation. Picking the highest frequency will work.
                    frequency = Utility.MAX_FREQUENCE;
                    charArray = new char[] { sentence[i] };
                    segGraph.AddToken(new SegToken(charArray, i, i + 1, WordType.DELIMITER, frequency));
                    i++;
                    break;

                default:
                    // Treat the unrecognized char symbol as unknown string.
                    // For example, any symbol not in GB2312 is treated as one of these.
                    charArray = Utility.STRING_CHAR_ARRAY;
                    frequency = wordDict.GetFrequency(charArray);
                    segGraph.AddToken(new SegToken(charArray, i, i + 1, WordType.STRING, frequency));
                    i++;
                    break;
            }
        }

        // Add two more Tokens: "beginning xx beginning"
        charArray = Utility.START_CHAR_ARRAY;
        frequency = wordDict.GetFrequency(charArray);
        segGraph.AddToken(new SegToken(charArray, -1, 0, WordType.SENTENCE_BEGIN, frequency));

        // "end xx end"
        charArray = Utility.END_CHAR_ARRAY;
        frequency = wordDict.GetFrequency(charArray);
        segGraph.AddToken(new SegToken(charArray, length, length + 1, WordType.SENTENCE_END, frequency));

        return segGraph;
    }

    /// <summary>
    /// Adds every Chinese-word token that starts at <paramref name="start"/>: the single
    /// Hanzi itself, plus each longer phrase the dictionary reports as an exact match.
    /// </summary>
    /// <param name="segGraph">graph receiving the tokens</param>
    /// <param name="sentence">sentence being segmented</param>
    /// <param name="charTypeArray">character type of every position in <paramref name="sentence"/></param>
    /// <param name="start">index of the Hanzi at which the candidate words begin</param>
    /// <param name="wordBuf">scratch buffer; its previous content is discarded</param>
    private static void AddHanziTokens(SegGraph segGraph, string sentence, CharType[] charTypeArray,
        int start, StringBuilder wordBuf)
    {
        int length = sentence.Length;
        int end = start + 1;

        wordBuf.Clear();
        wordBuf.Append(sentence[start]);

        // The single Hanzi is always stored in the graph, whether or not it can
        // form a phrase; otherwise the segmentation would break at this character.
        char[] charArray = new char[] { sentence[start] };
        int frequency = wordDict.GetFrequency(charArray);
        segGraph.AddToken(new SegToken(charArray, start, end, WordType.CHINESE_WORD, frequency));

        int foundIndex = wordDict.GetPrefixMatch(charArray);
        while (end <= length && foundIndex != -1)
        {
            if (wordDict.IsEqual(charArray, foundIndex) && charArray.Length > 1)
            {
                // It is the phrase we are looking for: a multi-character word
                // spanning start..end (the single-character token was added above).
                frequency = wordDict.GetFrequency(charArray);
                segGraph.AddToken(new SegToken(charArray, start, end, WordType.CHINESE_WORD, frequency));
            }

            // Space-like characters between Hanzi are stepped over: they widen the
            // token's end offset but are not appended to the candidate word.
            while (end < length && charTypeArray[end] == CharType.SPACE_LIKE)
            {
                end++;
            }

            if (end < length && charTypeArray[end] == CharType.HANZI)
            {
                wordBuf.Append(sentence[end]);
                charArray = new char[wordBuf.Length];
                wordBuf.CopyTo(0, charArray, 0, charArray.Length);
                // charArray was already found as a prefix at foundIndex, so the
                // lengthened charArray can only appear at or after foundIndex.
                // Resume the search from there.
                foundIndex = wordDict.GetPrefixMatch(charArray, foundIndex);
                end++;
            }
            else
            {
                break;
            }
        }
    }

    /// <summary>
    /// Adds one token covering the maximal run of letters (or digits) that begins at
    /// <paramref name="start"/>, mixing half-width and full-width forms freely.
    /// The token text is the shared <paramref name="placeholder"/> array rather than the
    /// characters themselves.
    /// </summary>
    /// <param name="segGraph">graph receiving the token</param>
    /// <param name="charTypeArray">character type of every position in the sentence</param>
    /// <param name="start">index of the first character of the run</param>
    /// <param name="halfWidthType">half-width character type belonging to the run</param>
    /// <param name="fullWidthType">full-width character type belonging to the run</param>
    /// <param name="placeholder">character array stored in the token and used for the frequency lookup</param>
    /// <param name="halfWidthWordType">word type used when the run has no full-width character</param>
    /// <param name="fullWidthWordType">word type used when the run has at least one full-width character</param>
    /// <returns>index of the first character after the run</returns>
    private static int AddRunToken(SegGraph segGraph, CharType[] charTypeArray, int start,
        CharType halfWidthType, CharType fullWidthType, char[] placeholder,
        WordType halfWidthWordType, WordType fullWidthWordType)
    {
        int length = charTypeArray.Length;
        bool hasFullWidth = charTypeArray[start] == fullWidthType;

        int end = start + 1;
        while (end < length
            && (charTypeArray[end] == halfWidthType || charTypeArray[end] == fullWidthType))
        {
            if (charTypeArray[end] == fullWidthType)
            {
                hasFullWidth = true;
            }
            end++;
        }

        // Found a Token from start to end.
        int frequency = wordDict.GetFrequency(placeholder);
        WordType wordType = hasFullWidth ? fullWidthWordType : halfWidthWordType;
        segGraph.AddToken(new SegToken(placeholder, start, end, wordType, frequency));
        return end;
    }

    /// <summary>
    /// Get the character types for every character in a sentence.
    /// </summary>
    /// <param name="sentence">input sentence</param>
    /// <returns>array of character types corresponding to character positions in the sentence</returns>
    /// <seealso cref="Utility.GetCharType(char)"/>
    private static CharType[] GetCharTypes(string sentence)
    {
        int length = sentence.Length;
        CharType[] charTypeArray = new CharType[length];
        // the type of each character by position
        for (int i = 0; i < length; i++)
        {
            charTypeArray[i] = Utility.GetCharType(sentence[i]);
        }
        return charTypeArray;
    }

    /// <summary>
    /// Return a list of <see cref="SegToken"/> representing the best segmentation of a sentence
    /// <para/>
    /// Builds the <see cref="SegGraph"/> for the sentence, wraps it in a
    /// <see cref="BiSegGraph"/> and returns that graph's shortest path.
    /// </summary>
    /// <param name="sentence">input sentence; must not be <c>null</c></param>
    /// <returns>best segmentation as a <see cref="T:IList{SegToken}"/></returns>
    public virtual IList<SegToken> Process(string sentence)
    {
        SegGraph segGraph = CreateSegGraph(sentence);
        BiSegGraph biSegGraph = new BiSegGraph(segGraph);
        return biSegGraph.GetShortPath();
    }
}
}