blob: 6ad831dee141a099a9945bcc768b5a7ffd50cc92 [file] [log] [blame]
// lucene version compatibility level: 4.8.1
using Lucene.Net.Analysis.Cn.Smart.Hhmm;
using Lucene.Net.Support;
using System.Collections.Generic;
namespace Lucene.Net.Analysis.Cn.Smart
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/// <summary>
/// Segment a sentence of Chinese text into words.
/// <para/>
/// @lucene.experimental
/// </summary>
internal class WordSegmenter
{
    // Both collaborators are reused across calls; they are assigned once and never replaced.
    private readonly HHMMSegmenter hhmmSegmenter = new HHMMSegmenter();
    private readonly SegTokenFilter tokenFilter = new SegTokenFilter();

    /// <summary>
    /// Segment a sentence into words with <see cref="HHMMSegmenter"/>.
    /// </summary>
    /// <param name="sentence">input sentence</param>
    /// <param name="startOffset">start offset of sentence within the larger text</param>
    /// <returns><see cref="IList{T}"/> of <see cref="SegToken"/> with offsets shifted by <paramref name="startOffset"/>.</returns>
    public virtual IList<SegToken> SegmentSentence(string sentence, int startOffset)
    {
        IList<SegToken> segTokenList = hhmmSegmenter.Process(sentence);

        // Tokens from sentence, excluding WordType.SENTENCE_BEGIN and WordType.SENTENCE_END:
        // SubList(1, Count - 1) drops the first and last (sentinel) tokens.
        IList<SegToken> result = Collections.EmptyList<SegToken>();
        if (segTokenList.Count > 2) // if it's not an empty sentence
            result = segTokenList.SubList(1, segTokenList.Count - 1);

        foreach (SegToken st in result)
        {
            ConvertSegToken(st, sentence, startOffset);
        }
        return result;
    }

    /// <summary>
    /// Process a <see cref="SegToken"/> so that it is ready for indexing.
    /// </summary>
    /// <param name="st">input <see cref="SegToken"/>; mutated in place and returned</param>
    /// <param name="sentence">associated sentence</param>
    /// <param name="sentenceStartOffset">offset of the sentence within the larger text</param>
    /// <returns>Lucene <see cref="SegToken"/></returns>
    public virtual SegToken ConvertSegToken(SegToken st, string sentence,
        int sentenceStartOffset)
    {
        switch (st.WordType)
        {
            case WordType.STRING:
            case WordType.NUMBER:
            case WordType.FULLWIDTH_NUMBER:
            case WordType.FULLWIDTH_STRING:
                // For these types, re-slice the characters directly from the original
                // sentence using the token's sentence-relative offsets
                // (presumably the segmenter normalized them internally — see HHMMSegmenter).
                st.CharArray = sentence.Substring(st.StartOffset, st.EndOffset - st.StartOffset)
                    .ToCharArray();
                break;
            default:
                break;
        }

        st = tokenFilter.Filter(st);
        // Convert sentence-relative offsets to offsets in the original text.
        st.StartOffset += sentenceStartOffset;
        st.EndOffset += sentenceStartOffset;
        return st;
    }
}
}