src/Lucene.Net.Analysis.Common/Analysis/Miscellaneous/ScandinavianFoldingFilter.cs - lucenenet - Git at Google

 using Lucene.Net.Analysis.TokenAttributes;
 using Lucene.Net.Analysis.Util;

 namespace Lucene.Net.Analysis.Miscellaneous
 {
     /*
      * Licensed to the Apache Software Foundation (ASF) under one or more
      * contributor license agreements.  See the NOTICE file distributed with
      * this work for additional information regarding copyright ownership.
      * The ASF licenses this file to You under the Apache License, Version 2.0
      * (the "License"); you may not use this file except in compliance with
      * the License.  You may obtain a copy of the License at
      *
      *     http://www.apache.org/licenses/LICENSE-2.0
      *
      * Unless required by applicable law or agreed to in writing, software
      * distributed under the License is distributed on an "AS IS" BASIS,
      * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
      * See the License for the specific language governing permissions and
      * limitations under the License.
      */

     /// <summary>
     /// Folds the Scandinavian characters å, ä, æ (and Å, Ä, Æ) to a (A) and ö, ø (and Ö, Ø) to o (O);
     /// the case of the original character is kept.
     /// It also discriminates against the double vowels aa, ae, ao, oe and oo, keeping only the first vowel.
     /// <para/>
     /// This is a semantically more destructive solution than <see cref="ScandinavianNormalizationFilter"/>,
     /// but it can in addition help with matching raksmorgas as räksmörgås.
     /// <para/>
     /// blåbærsyltetøj == blåbärsyltetöj == blaabaarsyltetoej == blabarsyltetoj
     /// räksmörgås == ræksmørgås == ræksmörgaos == raeksmoergaas == raksmorgas
     /// <para/>
     /// Background:
     /// Swedish åäö are in fact the same letters as Norwegian and Danish åæø and are thus interchangeable
     /// when used between these languages. They are, however, folded differently when people type
     /// them on a keyboard lacking these characters.
     /// <para/>
     /// In that situation almost all Swedish people use a, a, o instead of å, ä, ö.
     /// <para/>
     /// Norwegians and Danes on the other hand usually type aa, ae and oe instead of å, æ and ø.
     /// Some do, however, use a, a, o, oo, ao and sometimes permutations of everything above.
     /// <para/>
     /// This filter solves that mismatch problem, but might also cause new ones.
     /// </summary>
     /// <seealso cref="ScandinavianNormalizationFilter"/>
     public sealed class ScandinavianFoldingFilter : TokenFilter
     {
         // Danish/Norwegian letters and their Swedish ("_se") counterparts.
         private const char AA = '\u00C5'; // Å
         private const char aa = '\u00E5'; // å
         private const char AE = '\u00C6'; // Æ
         private const char ae = '\u00E6'; // æ
         private const char AE_se = '\u00C4'; // Ä
         private const char ae_se = '\u00E4'; // ä
         private const char OE = '\u00D8'; // Ø
         private const char oe = '\u00F8'; // ø
         private const char OE_se = '\u00D6'; // Ö
         private const char oe_se = '\u00F6'; // ö

         private readonly ICharTermAttribute charTermAttribute;

         /// <summary>
         /// Creates a folding filter on top of <paramref name="input"/>.
         /// </summary>
         /// <param name="input">The token stream handed to the base <see cref="TokenFilter"/> constructor.</param>
         public ScandinavianFoldingFilter(TokenStream input)
             : base(input)
         {
             charTermAttribute = AddAttribute<ICharTermAttribute>();
         }

         /// <summary>
         /// Advances the wrapped stream and folds the term text of the new token in place.
         /// </summary>
         /// <returns>
         /// <c>false</c> when the wrapped stream's <c>IncrementToken()</c> returns <c>false</c>;
         /// otherwise <c>true</c>, after the term buffer and term length have been updated.
         /// </returns>
         public override bool IncrementToken()
         {
             if (!m_input.IncrementToken())
             {
                 return false;
             }

             char[] termBuffer = charTermAttribute.Buffer;
             int termLength = charTermAttribute.Length;

             for (int pos = 0; pos < termLength; pos++)
             {
                 switch (termBuffer[pos])
                 {
                     case aa:
                     case ae_se:
                     case ae:
                         termBuffer[pos] = 'a';
                         break;

                     case AA:
                     case AE_se:
                     case AE:
                         termBuffer[pos] = 'A';
                         break;

                     case oe:
                     case oe_se:
                         termBuffer[pos] = 'o';
                         break;

                     case OE:
                     case OE_se:
                         termBuffer[pos] = 'O';
                         break;

                     // Plain vowels are only inspected as the first half of a double vowel;
                     // a character that was folded above is not re-examined in this pass.
                     case 'a':
                     case 'A':
                         if (pos < termLength - 1 && IsDroppedAfterA(termBuffer[pos + 1]))
                         {
                             // The value returned by Delete becomes the new term length.
                             termLength = StemmerUtil.Delete(termBuffer, pos + 1, termLength);
                         }
                         break;

                     case 'o':
                     case 'O':
                         if (pos < termLength - 1 && IsDroppedAfterO(termBuffer[pos + 1]))
                         {
                             termLength = StemmerUtil.Delete(termBuffer, pos + 1, termLength);
                         }
                         break;
                 }
             }

             charTermAttribute.Length = termLength;
             return true;
         }

         /// <summary>
         /// Tells whether <paramref name="second"/> is removed when it directly follows an 'a' or 'A'
         /// (i.e. it is one of a, A, e, E, o, O).
         /// </summary>
         private static bool IsDroppedAfterA(char second)
         {
             switch (second)
             {
                 case 'a':
                 case 'A':
                 case 'e':
                 case 'E':
                 case 'o':
                 case 'O':
                     return true;
                 default:
                     return false;
             }
         }

         /// <summary>
         /// Tells whether <paramref name="second"/> is removed when it directly follows an 'o' or 'O'
         /// (i.e. it is one of e, E, o, O).
         /// </summary>
         private static bool IsDroppedAfterO(char second)
         {
             switch (second)
             {
                 case 'e':
                 case 'E':
                 case 'o':
                 case 'O':
                     return true;
                 default:
                     return false;
             }
         }
     }
 }
	using Lucene.Net.Analysis.TokenAttributes;
	using Lucene.Net.Analysis.Util;

	namespace Lucene.Net.Analysis.Miscellaneous
	{
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	/// <summary>
	/// This filter folds the Scandinavian characters åÅäæÄÆ to a/A and öÖøØ to o/O (case is preserved).
	/// It also discriminates against use of the double vowels aa, ae, ao, oe and oo, leaving just the first one.
	/// <para/>
	/// It is a semantically more destructive solution than <see cref="ScandinavianNormalizationFilter"/> but
	/// can in addition help with matching raksmorgas as räksmörgås.
	/// <para/>
	/// blåbærsyltetøj == blåbärsyltetöj == blaabaarsyltetoej == blabarsyltetoj
	/// räksmörgås == ræksmørgås == ræksmörgaos == raeksmoergaas == raksmorgas
	/// <para/>
	/// Background:
	/// Swedish åäö are in fact the same letters as Norwegian and Danish åæø and thus interchangeable
	/// when used between these languages. They are however folded differently when people type
	/// them on a keyboard lacking these characters.
	/// <para/>
	/// In that situation almost all Swedish people use a, a, o instead of å, ä, ö.
	/// <para/>
	/// Norwegians and Danes on the other hand usually type aa, ae and oe instead of å, æ and ø.
	/// Some do however use a, a, o, oo, ao and sometimes permutations of everything above.
	/// <para/>
	/// This filter solves that mismatch problem, but might also cause new ones.
	/// </summary>
	/// <seealso cref="ScandinavianNormalizationFilter"/>
	public sealed class ScandinavianFoldingFilter : TokenFilter
	{
	    /// <summary>
	    /// Wraps <paramref name="input"/> and registers the term attribute this filter rewrites.
	    /// </summary>
	    /// <param name="input">The token stream passed on to the base <see cref="TokenFilter"/> constructor.</param>
	    public ScandinavianFoldingFilter(TokenStream input)
	        : base(input)
	    {
	        charTermAttribute = AddAttribute<ICharTermAttribute>();
	    }

	    private readonly ICharTermAttribute charTermAttribute;

	    private const char AA = '\u00C5'; // Å
	    private const char aa = '\u00E5'; // å
	    private const char AE = '\u00C6'; // Æ
	    private const char ae = '\u00E6'; // æ
	    private const char AE_se = '\u00C4'; // Ä
	    private const char ae_se = '\u00E4'; // ä
	    private const char OE = '\u00D8'; // Ø
	    private const char oe = '\u00F8'; // ø
	    private const char OE_se = '\u00D6'; // Ö
	    private const char oe_se = '\u00F6'; // ö

	    /// <summary>
	    /// Moves the wrapped stream to its next token and folds that token's term text in place.
	    /// </summary>
	    /// <returns>
	    /// <c>false</c> if the wrapped stream's <c>IncrementToken()</c> returned <c>false</c>;
	    /// otherwise <c>true</c> once the term buffer and length have been rewritten.
	    /// </returns>
	    public override bool IncrementToken()
	    {
	        if (!m_input.IncrementToken())
	        {
	            return false;
	        }

	        char[] buffer = charTermAttribute.Buffer;
	        int length = charTermAttribute.Length;

	        for (int i = 0; i < length; i++)
	        {
	            if (buffer[i] == aa || buffer[i] == ae_se || buffer[i] == ae)
	            {
	                buffer[i] = 'a';
	            }
	            else if (buffer[i] == AA || buffer[i] == AE_se || buffer[i] == AE)
	            {
	                buffer[i] = 'A';
	            }
	            else if (buffer[i] == oe || buffer[i] == oe_se)
	            {
	                buffer[i] = 'o';
	            }
	            else if (buffer[i] == OE || buffer[i] == OE_se)
	            {
	                buffer[i] = 'O';
	            }
	            else if (length - 1 > i)
	            {
	                // Only reached for characters that were not folded above and that have a successor:
	                // drop the second character of aa/ae/ao and oe/oo (any mix of upper and lower case).
	                if ((buffer[i] == 'a' || buffer[i] == 'A')
	                    && (buffer[i + 1] == 'a' || buffer[i + 1] == 'A'
	                        || buffer[i + 1] == 'e' || buffer[i + 1] == 'E'
	                        || buffer[i + 1] == 'o' || buffer[i + 1] == 'O'))
	                {
	                    // The value returned by Delete is used as the new term length.
	                    length = StemmerUtil.Delete(buffer, i + 1, length);
	                }
	                else if ((buffer[i] == 'o' || buffer[i] == 'O')
	                    && (buffer[i + 1] == 'e' || buffer[i + 1] == 'E'
	                        || buffer[i + 1] == 'o' || buffer[i + 1] == 'O'))
	                {
	                    length = StemmerUtil.Delete(buffer, i + 1, length);
	                }
	            }
	        }

	        charTermAttribute.Length = length;

	        return true;
	    }
	}
	}