// src/Lucene.Net.Analysis.SmartCn/Hhmm/HHMMSegmenter.cs - lucenenet - Git at Google

 // lucene version compatibility level: 4.8.1
 using System.Collections.Generic;
 using System.Text;

 namespace Lucene.Net.Analysis.Cn.Smart.Hhmm
 {
     /*
      * Licensed to the Apache Software Foundation (ASF) under one or more
      * contributor license agreements.  See the NOTICE file distributed with
      * this work for additional information regarding copyright ownership.
      * The ASF licenses this file to You under the Apache License, Version 2.0
      * (the "License"); you may not use this file except in compliance with
      * the License.  You may obtain a copy of the License at
      *
      *     http://www.apache.org/licenses/LICENSE-2.0
      *
      * Unless required by applicable law or agreed to in writing, software
      * distributed under the License is distributed on an "AS IS" BASIS,
      * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
      * See the License for the specific language governing permissions and
      * limitations under the License.
      */

     /// <summary>
     /// Finds the optimal segmentation of a sentence into Chinese words
     /// <para/>
     /// @lucene.experimental
     /// </summary>
     public class HHMMSegmenter
     {
         // Dictionary used for frequency lookups and prefix matching; assigned once and never replaced.
         private static readonly WordDictionary wordDict = WordDictionary.GetInstance();

         /// <summary>
         /// Create the <see cref="SegGraph"/> for a sentence.
         /// <para/>
         /// The sentence is scanned from left to right. Depending on the <see cref="CharType"/> found at
         /// the current position, zero or more <see cref="SegToken"/>s are added to the graph. After the
         /// scan, the sentence-begin and sentence-end marker tokens are added.
         /// </summary>
         /// <param name="sentence">input sentence, without start and end markers</param>
         /// <returns><see cref="SegGraph"/> corresponding to the input sentence.</returns>
         private SegGraph CreateSegGraph(string sentence)
         {
             int length = sentence.Length;
             CharType[] charTypeArray = GetCharTypes(sentence);
             StringBuilder wordBuf = new StringBuilder(); // scratch buffer shared by all Hanzi positions
             SegGraph segGraph = new SegGraph();

             int i = 0;
             while (i < length)
             {
                 switch (charTypeArray[i])
                 {
                     case CharType.SPACE_LIKE:
                         // Space-like characters produce no token of their own.
                         i++;
                         break;

                     case CharType.HANZI:
                         AddHanziTokens(segGraph, sentence, charTypeArray, i, wordBuf);
                         // Advance by a single character so that the next position is examined
                         // as a token start of its own.
                         i++;
                         break;

                     case CharType.FULLWIDTH_LETTER:
                     case CharType.LETTER:
                         // Found a Token from i to the end of the run. Type is LETTER char string.
                         i = AddRunToken(segGraph, charTypeArray, i,
                             CharType.LETTER, CharType.FULLWIDTH_LETTER,
                             Utility.STRING_CHAR_ARRAY, WordType.STRING, WordType.FULLWIDTH_STRING);
                         break;

                     case CharType.FULLWIDTH_DIGIT:
                     case CharType.DIGIT:
                         // Found a Token from i to the end of the run. Type is NUMBER char string.
                         i = AddRunToken(segGraph, charTypeArray, i,
                             CharType.DIGIT, CharType.FULLWIDTH_DIGIT,
                             Utility.NUMBER_CHAR_ARRAY, WordType.NUMBER, WordType.FULLWIDTH_NUMBER);
                         break;

                     case CharType.DELIMITER:
                         // No need to search the weight for the punctuation.
                         // Picking the highest frequency will work.
                         segGraph.AddToken(new SegToken(new char[] { sentence[i] }, i, i + 1,
                             WordType.DELIMITER, Utility.MAX_FREQUENCE));
                         i++;
                         break;

                     default:
                         // Treat the unrecognized char symbol as unknown string.
                         // For example, any symbol not in GB2312 is treated as one of these.
                         char[] unknownChars = Utility.STRING_CHAR_ARRAY;
                         segGraph.AddToken(new SegToken(unknownChars, i, i + 1,
                             WordType.STRING, wordDict.GetFrequency(unknownChars)));
                         i++;
                         break;
                 }
             }

             // Sentence-begin marker token, spanning positions -1 to 0.
             char[] startChars = Utility.START_CHAR_ARRAY;
             segGraph.AddToken(new SegToken(startChars, -1, 0,
                 WordType.SENTENCE_BEGIN, wordDict.GetFrequency(startChars)));

             // Sentence-end marker token, spanning positions length to length + 1.
             char[] endChars = Utility.END_CHAR_ARRAY;
             segGraph.AddToken(new SegToken(endChars, length, length + 1,
                 WordType.SENTENCE_END, wordDict.GetFrequency(endChars)));

             return segGraph;
         }

         /// <summary>
         /// Adds the tokens that start at the Hanzi character at <paramref name="start"/>: the single
         /// character itself, plus every longer character sequence starting there that
         /// <see cref="WordDictionary.IsEqual(char[], int)"/> reports as a match.
         /// Space-like characters between Hanzi are skipped and are not part of the looked-up sequence.
         /// </summary>
         /// <param name="segGraph">graph that receives the tokens</param>
         /// <param name="sentence">sentence being segmented</param>
         /// <param name="charTypeArray">character type for every position of <paramref name="sentence"/></param>
         /// <param name="start">index of the Hanzi character the tokens start at</param>
         /// <param name="wordBuf">scratch buffer; its previous content is discarded</param>
         private static void AddHanziTokens(SegGraph segGraph, string sentence, CharType[] charTypeArray,
             int start, StringBuilder wordBuf)
         {
             int length = sentence.Length;
             int end = start + 1;

             wordBuf.Clear();
             wordBuf.Append(sentence[start]);

             // The single character is always added as a token, whether or not it also
             // begins a longer dictionary word.
             char[] charArray = new char[] { sentence[start] };
             segGraph.AddToken(new SegToken(charArray, start, end,
                 WordType.CHINESE_WORD, wordDict.GetFrequency(charArray)));

             int foundIndex = wordDict.GetPrefixMatch(charArray);
             while (end <= length && foundIndex != -1)
             {
                 if (wordDict.IsEqual(charArray, foundIndex) && charArray.Length > 1)
                 {
                     // A multi-character word from start to end was found.
                     segGraph.AddToken(new SegToken(charArray, start, end,
                         WordType.CHINESE_WORD, wordDict.GetFrequency(charArray)));
                 }

                 // Skip space-like characters; they widen the token span but are not
                 // appended to the sequence that is looked up.
                 while (end < length && charTypeArray[end] == CharType.SPACE_LIKE)
                 {
                     end++;
                 }

                 if (end >= length || charTypeArray[end] != CharType.HANZI)
                 {
                     break; // the run of Hanzi is over
                 }

                 wordBuf.Append(sentence[end]);
                 charArray = new char[wordBuf.Length];
                 wordBuf.CopyTo(0, charArray, 0, charArray.Length);
                 // charArray was already matched as a prefix at foundIndex, so the lengthened
                 // sequence is searched for starting from foundIndex.
                 foundIndex = wordDict.GetPrefixMatch(charArray, foundIndex);
                 end++;
             }
         }

         /// <summary>
         /// Adds one token covering the run of letters (or digits) that starts at
         /// <paramref name="start"/>. The run continues while the character type is either
         /// <paramref name="halfWidthType"/> or <paramref name="fullWidthType"/>. The token carries
         /// <paramref name="placeholder"/> as its character array, not the characters of the run.
         /// </summary>
         /// <param name="segGraph">graph that receives the token</param>
         /// <param name="charTypeArray">character type for every position of the sentence</param>
         /// <param name="start">index of the first character of the run</param>
         /// <param name="halfWidthType">half-width character type belonging to the run</param>
         /// <param name="fullWidthType">full-width character type belonging to the run</param>
         /// <param name="placeholder">character array stored in the token and used for the frequency lookup</param>
         /// <param name="halfWidthWordType">word type used when the run has no full-width character</param>
         /// <param name="fullWidthWordType">word type used when the run has at least one full-width character</param>
         /// <returns>index of the first character after the run</returns>
         private static int AddRunToken(SegGraph segGraph, CharType[] charTypeArray, int start,
             CharType halfWidthType, CharType fullWidthType, char[] placeholder,
             WordType halfWidthWordType, WordType fullWidthWordType)
         {
             bool hasFullWidth = charTypeArray[start] == fullWidthType;
             int end = start + 1;
             while (end < charTypeArray.Length
                 && (charTypeArray[end] == halfWidthType || charTypeArray[end] == fullWidthType))
             {
                 if (charTypeArray[end] == fullWidthType)
                 {
                     hasFullWidth = true;
                 }
                 end++;
             }

             int frequency = wordDict.GetFrequency(placeholder);
             WordType wordType = hasFullWidth ? fullWidthWordType : halfWidthWordType;
             segGraph.AddToken(new SegToken(placeholder, start, end, wordType, frequency));
             return end;
         }

         /// <summary>
         /// Get the character types for every character in a sentence.
         /// </summary>
         /// <param name="sentence">input sentence</param>
         /// <returns>array of character types corresponding to character positions in the sentence</returns>
         /// <seealso cref="Utility.GetCharType(char)"/>
         private static CharType[] GetCharTypes(string sentence)
         {
             int length = sentence.Length;
             CharType[] charTypeArray = new CharType[length];
             // the type of each character by position
             for (int i = 0; i < length; i++)
             {
                 charTypeArray[i] = Utility.GetCharType(sentence[i]);
             }

             return charTypeArray;
         }

         /// <summary>
         /// Return a list of <see cref="SegToken"/> representing the best segmentation of a sentence
         /// </summary>
         /// <param name="sentence">input sentence</param>
         /// <returns>best segmentation as a <see cref="T:IList{SegToken}"/></returns>
         public virtual IList<SegToken> Process(string sentence)
         {
             // Build the token graph, derive the bigram graph from it, and take its shortest path.
             SegGraph segGraph = CreateSegGraph(sentence);
             BiSegGraph biSegGraph = new BiSegGraph(segGraph);
             return biSegGraph.GetShortPath();
         }
     }
 }
	// lucene version compatibility level: 4.8.1
	using System.Collections.Generic;
	using System.Text;

	namespace Lucene.Net.Analysis.Cn.Smart.Hhmm
	{
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	/// <summary>
	/// Finds the optimal segmentation of a sentence into Chinese words
	/// <para/>
	/// @lucene.experimental
	/// </summary>
	public class HHMMSegmenter
	{
		// Dictionary used for frequency lookups and prefix matching; assigned once and never replaced.
		private static readonly WordDictionary wordDict = WordDictionary.GetInstance();

		/// <summary>
		/// Create the <see cref="SegGraph"/> for a sentence.
		/// <para/>
		/// The sentence is scanned from left to right. Depending on the <see cref="CharType"/> found at
		/// the current position, zero or more <see cref="SegToken"/>s are added to the graph. After the
		/// scan, the sentence-begin and sentence-end marker tokens are added.
		/// </summary>
		/// <param name="sentence">input sentence, without start and end markers</param>
		/// <returns><see cref="SegGraph"/> corresponding to the input sentence.</returns>
		private SegGraph CreateSegGraph(string sentence)
		{
			int length = sentence.Length;
			CharType[] charTypeArray = GetCharTypes(sentence);
			StringBuilder wordBuf = new StringBuilder(); // scratch buffer shared by all Hanzi positions
			SegGraph segGraph = new SegGraph();

			int i = 0;
			while (i < length)
			{
				switch (charTypeArray[i])
				{
					case CharType.SPACE_LIKE:
						// Space-like characters produce no token of their own.
						i++;
						break;

					case CharType.HANZI:
						AddHanziTokens(segGraph, sentence, charTypeArray, i, wordBuf);
						// Advance by a single character so that the next position is examined
						// as a token start of its own.
						i++;
						break;

					case CharType.FULLWIDTH_LETTER:
					case CharType.LETTER:
						// Found a Token from i to the end of the run. Type is LETTER char string.
						i = AddRunToken(segGraph, charTypeArray, i,
							CharType.LETTER, CharType.FULLWIDTH_LETTER,
							Utility.STRING_CHAR_ARRAY, WordType.STRING, WordType.FULLWIDTH_STRING);
						break;

					case CharType.FULLWIDTH_DIGIT:
					case CharType.DIGIT:
						// Found a Token from i to the end of the run. Type is NUMBER char string.
						i = AddRunToken(segGraph, charTypeArray, i,
							CharType.DIGIT, CharType.FULLWIDTH_DIGIT,
							Utility.NUMBER_CHAR_ARRAY, WordType.NUMBER, WordType.FULLWIDTH_NUMBER);
						break;

					case CharType.DELIMITER:
						// No need to search the weight for the punctuation.
						// Picking the highest frequency will work.
						segGraph.AddToken(new SegToken(new char[] { sentence[i] }, i, i + 1,
							WordType.DELIMITER, Utility.MAX_FREQUENCE));
						i++;
						break;

					default:
						// Treat the unrecognized char symbol as unknown string.
						// For example, any symbol not in GB2312 is treated as one of these.
						char[] unknownChars = Utility.STRING_CHAR_ARRAY;
						segGraph.AddToken(new SegToken(unknownChars, i, i + 1,
							WordType.STRING, wordDict.GetFrequency(unknownChars)));
						i++;
						break;
				}
			}

			// Sentence-begin marker token, spanning positions -1 to 0.
			char[] startChars = Utility.START_CHAR_ARRAY;
			segGraph.AddToken(new SegToken(startChars, -1, 0,
				WordType.SENTENCE_BEGIN, wordDict.GetFrequency(startChars)));

			// Sentence-end marker token, spanning positions length to length + 1.
			char[] endChars = Utility.END_CHAR_ARRAY;
			segGraph.AddToken(new SegToken(endChars, length, length + 1,
				WordType.SENTENCE_END, wordDict.GetFrequency(endChars)));

			return segGraph;
		}

		/// <summary>
		/// Adds the tokens that start at the Hanzi character at <paramref name="start"/>: the single
		/// character itself, plus every longer character sequence starting there that
		/// <see cref="WordDictionary.IsEqual(char[], int)"/> reports as a match.
		/// Space-like characters between Hanzi are skipped and are not part of the looked-up sequence.
		/// </summary>
		/// <param name="segGraph">graph that receives the tokens</param>
		/// <param name="sentence">sentence being segmented</param>
		/// <param name="charTypeArray">character type for every position of <paramref name="sentence"/></param>
		/// <param name="start">index of the Hanzi character the tokens start at</param>
		/// <param name="wordBuf">scratch buffer; its previous content is discarded</param>
		private static void AddHanziTokens(SegGraph segGraph, string sentence, CharType[] charTypeArray,
			int start, StringBuilder wordBuf)
		{
			int length = sentence.Length;
			int end = start + 1;

			wordBuf.Clear();
			wordBuf.Append(sentence[start]);

			// The single character is always added as a token, whether or not it also
			// begins a longer dictionary word.
			char[] charArray = new char[] { sentence[start] };
			segGraph.AddToken(new SegToken(charArray, start, end,
				WordType.CHINESE_WORD, wordDict.GetFrequency(charArray)));

			int foundIndex = wordDict.GetPrefixMatch(charArray);
			while (end <= length && foundIndex != -1)
			{
				if (wordDict.IsEqual(charArray, foundIndex) && charArray.Length > 1)
				{
					// A multi-character word from start to end was found.
					segGraph.AddToken(new SegToken(charArray, start, end,
						WordType.CHINESE_WORD, wordDict.GetFrequency(charArray)));
				}

				// Skip space-like characters; they widen the token span but are not
				// appended to the sequence that is looked up.
				while (end < length && charTypeArray[end] == CharType.SPACE_LIKE)
				{
					end++;
				}

				if (end >= length || charTypeArray[end] != CharType.HANZI)
				{
					break; // the run of Hanzi is over
				}

				wordBuf.Append(sentence[end]);
				charArray = new char[wordBuf.Length];
				wordBuf.CopyTo(0, charArray, 0, charArray.Length);
				// charArray was already matched as a prefix at foundIndex, so the lengthened
				// sequence is searched for starting from foundIndex.
				foundIndex = wordDict.GetPrefixMatch(charArray, foundIndex);
				end++;
			}
		}

		/// <summary>
		/// Adds one token covering the run of letters (or digits) that starts at
		/// <paramref name="start"/>. The run continues while the character type is either
		/// <paramref name="halfWidthType"/> or <paramref name="fullWidthType"/>. The token carries
		/// <paramref name="placeholder"/> as its character array, not the characters of the run.
		/// </summary>
		/// <param name="segGraph">graph that receives the token</param>
		/// <param name="charTypeArray">character type for every position of the sentence</param>
		/// <param name="start">index of the first character of the run</param>
		/// <param name="halfWidthType">half-width character type belonging to the run</param>
		/// <param name="fullWidthType">full-width character type belonging to the run</param>
		/// <param name="placeholder">character array stored in the token and used for the frequency lookup</param>
		/// <param name="halfWidthWordType">word type used when the run has no full-width character</param>
		/// <param name="fullWidthWordType">word type used when the run has at least one full-width character</param>
		/// <returns>index of the first character after the run</returns>
		private static int AddRunToken(SegGraph segGraph, CharType[] charTypeArray, int start,
			CharType halfWidthType, CharType fullWidthType, char[] placeholder,
			WordType halfWidthWordType, WordType fullWidthWordType)
		{
			bool hasFullWidth = charTypeArray[start] == fullWidthType;
			int end = start + 1;
			while (end < charTypeArray.Length
				&& (charTypeArray[end] == halfWidthType || charTypeArray[end] == fullWidthType))
			{
				if (charTypeArray[end] == fullWidthType)
				{
					hasFullWidth = true;
				}
				end++;
			}

			int frequency = wordDict.GetFrequency(placeholder);
			WordType wordType = hasFullWidth ? fullWidthWordType : halfWidthWordType;
			segGraph.AddToken(new SegToken(placeholder, start, end, wordType, frequency));
			return end;
		}

		/// <summary>
		/// Get the character types for every character in a sentence.
		/// </summary>
		/// <param name="sentence">input sentence</param>
		/// <returns>array of character types corresponding to character positions in the sentence</returns>
		/// <seealso cref="Utility.GetCharType(char)"/>
		private static CharType[] GetCharTypes(string sentence)
		{
			int length = sentence.Length;
			CharType[] charTypeArray = new CharType[length];
			// the type of each character by position
			for (int i = 0; i < length; i++)
			{
				charTypeArray[i] = Utility.GetCharType(sentence[i]);
			}

			return charTypeArray;
		}

		/// <summary>
		/// Return a list of <see cref="SegToken"/> representing the best segmentation of a sentence
		/// </summary>
		/// <param name="sentence">input sentence</param>
		/// <returns>best segmentation as a <see cref="T:IList{SegToken}"/></returns>
		public virtual IList<SegToken> Process(string sentence)
		{
			// Build the token graph, derive the bigram graph from it, and take its shortest path.
			SegGraph segGraph = CreateSegGraph(sentence);
			BiSegGraph biSegGraph = new BiSegGraph(segGraph);
			return biSegGraph.GetShortPath();
		}
	}
	}