using Lucene.Net.Analysis.TokenAttributes;
using Lucene.Net.Analysis.Util;
using Lucene.Net.Util;
using System;
using System.Globalization;
namespace Lucene.Net.Analysis.Cn
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/// <summary>
/// A <see cref="TokenFilter"/> with a stop word table.
/// <list type="bullet">
/// <item><description>Numeric tokens are removed.</description></item>
/// <item><description>English tokens must be longer than one character.</description></item>
/// <item><description>Each Chinese character is treated as one Chinese word.</description></item>
/// </list>
/// TO DO:
/// <list type="number">
/// <item><description>Add Chinese stop words, such as \ue400</description></item>
/// <item><description>Dictionary based Chinese word extraction</description></item>
/// <item><description>Intelligent Chinese word extraction</description></item>
/// </list>
/// </summary>
/// @deprecated (3.1) Use <see cref="Core.StopFilter"/> instead, which has the same functionality.
/// This filter will be removed in Lucene 5.0
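/// <example>
/// A minimal consumption sketch; <see cref="ChineseTokenizer"/> and the
/// <see cref="System.IO.StringReader"/> input are illustrative assumptions, not part of this class.
/// <code>
/// TokenStream stream = new ChineseFilter(new ChineseTokenizer(new StringReader("this is a test")));
/// ICharTermAttribute term = stream.GetAttribute&lt;ICharTermAttribute&gt;();
/// stream.Reset();
/// while (stream.IncrementToken())
/// {
///     // Only "test" survives: "this" and "is" are stop words, "a" is a single character.
///     Console.WriteLine(term.ToString());
/// }
/// stream.End();
/// stream.Dispose();
/// </code>
/// </example>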
[Obsolete("(3.1) Use StopFilter instead, which has the same functionality.")]
public sealed class ChineseFilter : TokenFilter
{
// Only English stop words for now; Chinese stop words are to be added later.
public static readonly string[] STOP_WORDS = new string[] {
"and", "are", "as", "at", "be", "but", "by",
"for", "if", "in", "into", "is", "it",
"no", "not", "of", "on", "or", "such",
"that", "the", "their", "then", "there", "these",
"they", "this", "to", "was", "will", "with"
};
private readonly CharArraySet stopTable;
private readonly ICharTermAttribute termAtt;
public ChineseFilter(TokenStream @in)
: base(@in)
{
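// Build the stop set once per filter instance; ignoreCase: false keeps stop word matching case-sensitive.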
stopTable = new CharArraySet(LuceneVersion.LUCENE_CURRENT, STOP_WORDS, false);
termAtt = AddAttribute<ICharTermAttribute>();
}
public override bool IncrementToken()
{
while (m_input.IncrementToken())
{
char[] text = termAtt.Buffer;
int termLength = termAtt.Length;
// why not key off token type here assuming ChineseTokenizer comes first?
if (!stopTable.Contains(text, 0, termLength))
{
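// Classify the token by the Unicode category of its first character.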
switch (CharUnicodeInfo.GetUnicodeCategory(text[0]))
{
case UnicodeCategory.LowercaseLetter:
case UnicodeCategory.UppercaseLetter:
// English words/tokens must be longer than one character.
if (termLength > 1)
{
return true;
}
break;
case UnicodeCategory.OtherLetter:
// Each Chinese character is treated as one Chinese word.
// Chinese word extraction may be added here later.
return true;
}
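// Tokens of any other Unicode category (e.g. numeric tokens) fall through the switch and are discarded.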
}
}
return false;
}
}
}