src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/DefaultICUTokenizerConfig.cs - lucenenet - Git at Google

 // Lucene version compatibility level 8.6.1
 using ICU4N.Globalization;
 using ICU4N.Text;
 using J2N;
 using Lucene.Net.Analysis.Standard;
 using System;
 using System.IO;

 namespace Lucene.Net.Analysis.Icu.Segmentation
 {
     /*
      * Licensed to the Apache Software Foundation (ASF) under one or more
      * contributor license agreements.  See the NOTICE file distributed with
      * this work for additional information regarding copyright ownership.
      * The ASF licenses this file to You under the Apache License, Version 2.0
      * (the "License"); you may not use this file except in compliance with
      * the License.  You may obtain a copy of the License at
      *
      *     http://www.apache.org/licenses/LICENSE-2.0
      *
      * Unless required by applicable law or agreed to in writing, software
      * distributed under the License is distributed on an "AS IS" BASIS,
      * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
      * See the License for the specific language governing permissions and
      * limitations under the License.
      */

     /// <summary>
     /// Default <see cref="ICUTokenizerConfig"/> that is generally applicable
     /// to many languages.
     /// </summary>
     /// <remarks>
     /// Generally tokenizes Unicode text according to UAX#29
     /// (<see cref="T:BreakIterator.GetWordInstance(ULocale.ROOT)"/>),
     /// but with the following tailorings:
     /// <list type="bullet">
     ///     <item><description>Thai, Lao, Myanmar, Khmer, and CJK text is broken into words with a dictionary.</description></item>
     /// </list>
     /// <para/>
     /// @lucene.experimental
     /// </remarks>
     public class DefaultICUTokenizerConfig : ICUTokenizerConfig
     {
         /// <summary>Token type for words containing ideographic characters</summary>
         public static readonly string WORD_IDEO = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.IDEOGRAPHIC];
         /// <summary>Token type for words containing Japanese hiragana</summary>
         public static readonly string WORD_HIRAGANA = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HIRAGANA];
         /// <summary>Token type for words containing Japanese katakana</summary>
         public static readonly string WORD_KATAKANA = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.KATAKANA];
         /// <summary>Token type for words containing Korean hangul</summary>
         public static readonly string WORD_HANGUL = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HANGUL];
         /// <summary>Token type for words that contain letters</summary>
         public static readonly string WORD_LETTER = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.ALPHANUM];
         /// <summary>Token type for words that appear to be numbers</summary>
         public static readonly string WORD_NUMBER = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.NUM];
         /// <summary>Token type for words that appear to be emoji sequences</summary>
         public static readonly string WORD_EMOJI = "<EMOJI>"; //StandardTokenizer.TOKEN_TYPES[StandardTokenizer.EMOJI]; // LUCENENET: 4.8.1 StandardTokenizer doesn't contain EMOJI

         /// <summary>
         /// the default breakiterators in use. these can be expensive to
         /// instantiate, cheap to clone.
         /// </summary>
         // we keep the cjk breaking separate, thats because it cannot be customized (because dictionary
         // is only triggered when kind = WORD, but kind = LINE by default and we have no non-evil way to change it)
         private static readonly BreakIterator cjkBreakIterator = BreakIterator.GetWordInstance(UCultureInfo.InvariantCulture);

         // TODO: if the wrong version of the ICU jar is used, loading these data files may give a strange error.
         // maybe add an explicit check? http://icu-project.org/apiref/icu4j/com/ibm/icu/util/VersionInfo.html

         // the same as ROOT, except no dictionary segmentation for cjk
         private static readonly BreakIterator defaultBreakIterator =
             ReadBreakIterator("Default.brk");
         private static readonly BreakIterator myanmarSyllableIterator =
             ReadBreakIterator("MyanmarSyllable.brk");

         // TODO: deprecate this boolean? you only care if you are doing super-expert stuff...
         private readonly bool cjkAsWords;
         private readonly bool myanmarAsWords;

         /// <summary>
         /// Creates a new config. This object is lightweight, but the first
         /// time the class is referenced, breakiterators will be initialized.
         /// </summary>
         /// <param name="cjkAsWords">true if cjk text should undergo dictionary-based segmentation,
         /// otherwise text will be segmented according to UAX#29 defaults.</param>
         /// <param name="myanmarAsWords">If this is true, all Han+Hiragana+Katakana words will be tagged as IDEOGRAPHIC.</param>
         public DefaultICUTokenizerConfig(bool cjkAsWords, bool myanmarAsWords)
         {
             this.cjkAsWords = cjkAsWords;
             this.myanmarAsWords = myanmarAsWords;
         }

         public override bool CombineCJ => cjkAsWords;

         public override RuleBasedBreakIterator GetBreakIterator(int script)
         {
             switch (script)
             {
                 case UScript.Japanese: return (RuleBasedBreakIterator)cjkBreakIterator.Clone();
                 case UScript.Myanmar:
                     if (myanmarAsWords)
                     {
                         return (RuleBasedBreakIterator)defaultBreakIterator.Clone();
                     }
                     else
                     {
                         return (RuleBasedBreakIterator)myanmarSyllableIterator.Clone();
                     }
                 default: return (RuleBasedBreakIterator)defaultBreakIterator.Clone();
             }
         }

         public override string GetType(int script, int ruleStatus)
         {
             switch (ruleStatus)
             {
                 case BreakIterator.WordIdeo:
                     return WORD_IDEO;
                 case BreakIterator.WordKana: //RuleBasedBreakIterator.WORD_KANA:
                     return script == UScript.Hiragana ? WORD_HIRAGANA : WORD_KATAKANA;
                 case BreakIterator.WordLetter: //RuleBasedBreakIterator.WORD_LETTER:
                     return script == UScript.Hangul ? WORD_HANGUL : WORD_LETTER;
                 case BreakIterator.WordNumber: //RuleBasedBreakIterator.WORD_NUMBER:
                     return WORD_NUMBER;
                 case EMOJI_SEQUENCE_STATUS:
                     return WORD_EMOJI;
                 default: /* some other custom code */
                     return "<OTHER>";
             }
         }

         private static RuleBasedBreakIterator ReadBreakIterator(string filename)
         {
             using (Stream @is = typeof(DefaultICUTokenizerConfig).FindAndGetManifestResourceStream(filename))
             {
                 try
                 {
                     RuleBasedBreakIterator bi =
                         RuleBasedBreakIterator.GetInstanceFromCompiledRules(@is);
                     return bi;
                 }
                 catch (IOException e)
                 {
                     throw new Exception(e.ToString(), e);
                 }
             }
         }
     }
 }
	// Lucene version compatibility level 8.6.1
	using ICU4N.Globalization;
	using ICU4N.Text;
	using J2N;
	using Lucene.Net.Analysis.Standard;
	using System;
	using System.IO;

	namespace Lucene.Net.Analysis.Icu.Segmentation
	{
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	/// <summary>
	/// Default <see cref="ICUTokenizerConfig"/> that is generally applicable
	/// to many languages.
	/// </summary>
	/// <remarks>
	/// Generally tokenizes Unicode text according to UAX#29
	/// (<see cref="T:BreakIterator.GetWordInstance(ULocale.ROOT)"/>),
	/// but with the following tailorings:
	/// <list type="bullet">
	/// <item><description>Thai, Lao, Myanmar, Khmer, and CJK text is broken into words with a dictionary.</description></item>
	/// </list>
	/// <para/>
	/// @lucene.experimental
	/// </remarks>
	public class DefaultICUTokenizerConfig : ICUTokenizerConfig
	{
	/// <summary>Token type for words containing ideographic characters</summary>
	public static readonly string WORD_IDEO = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.IDEOGRAPHIC];
	/// <summary>Token type for words containing Japanese hiragana</summary>
	public static readonly string WORD_HIRAGANA = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HIRAGANA];
	/// <summary>Token type for words containing Japanese katakana</summary>
	public static readonly string WORD_KATAKANA = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.KATAKANA];
	/// <summary>Token type for words containing Korean hangul</summary>
	public static readonly string WORD_HANGUL = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HANGUL];
	/// <summary>Token type for words that contain letters</summary>
	public static readonly string WORD_LETTER = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.ALPHANUM];
	/// <summary>Token type for words that appear to be numbers</summary>
	public static readonly string WORD_NUMBER = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.NUM];
	/// <summary>Token type for words that appear to be emoji sequences</summary>
	public static readonly string WORD_EMOJI = "<EMOJI>"; //StandardTokenizer.TOKEN_TYPES[StandardTokenizer.EMOJI]; // LUCENENET: 4.8.1 StandardTokenizer doesn't contain EMOJI

	/// <summary>
	/// the default breakiterators in use. these can be expensive to
	/// instantiate, cheap to clone.
	/// </summary>
	// we keep the cjk breaking separate, thats because it cannot be customized (because dictionary
	// is only triggered when kind = WORD, but kind = LINE by default and we have no non-evil way to change it)
	private static readonly BreakIterator cjkBreakIterator = BreakIterator.GetWordInstance(UCultureInfo.InvariantCulture);

	// TODO: if the wrong version of the ICU jar is used, loading these data files may give a strange error.
	// maybe add an explicit check? http://icu-project.org/apiref/icu4j/com/ibm/icu/util/VersionInfo.html

	// the same as ROOT, except no dictionary segmentation for cjk
	private static readonly BreakIterator defaultBreakIterator =
	ReadBreakIterator("Default.brk");
	private static readonly BreakIterator myanmarSyllableIterator =
	ReadBreakIterator("MyanmarSyllable.brk");

	// TODO: deprecate this boolean? you only care if you are doing super-expert stuff...
	private readonly bool cjkAsWords;
	private readonly bool myanmarAsWords;

	/// <summary>
	/// Creates a new config. This object is lightweight, but the first
	/// time the class is referenced, breakiterators will be initialized.
	/// </summary>
	/// <param name="cjkAsWords">true if cjk text should undergo dictionary-based segmentation,
	/// otherwise text will be segmented according to UAX#29 defaults.</param>
	/// <param name="myanmarAsWords">If this is true, all Han+Hiragana+Katakana words will be tagged as IDEOGRAPHIC.</param>
	public DefaultICUTokenizerConfig(bool cjkAsWords, bool myanmarAsWords)
	{
	this.cjkAsWords = cjkAsWords;
	this.myanmarAsWords = myanmarAsWords;
	}

	public override bool CombineCJ => cjkAsWords;

	public override RuleBasedBreakIterator GetBreakIterator(int script)
	{
	switch (script)
	{
	case UScript.Japanese: return (RuleBasedBreakIterator)cjkBreakIterator.Clone();
	case UScript.Myanmar:
	if (myanmarAsWords)
	{
	return (RuleBasedBreakIterator)defaultBreakIterator.Clone();
	}
	else
	{
	return (RuleBasedBreakIterator)myanmarSyllableIterator.Clone();
	}
	default: return (RuleBasedBreakIterator)defaultBreakIterator.Clone();
	}
	}

	public override string GetType(int script, int ruleStatus)
	{
	switch (ruleStatus)
	{
	case BreakIterator.WordIdeo:
	return WORD_IDEO;
	case BreakIterator.WordKana: //RuleBasedBreakIterator.WORD_KANA:
	return script == UScript.Hiragana ? WORD_HIRAGANA : WORD_KATAKANA;
	case BreakIterator.WordLetter: //RuleBasedBreakIterator.WORD_LETTER:
	return script == UScript.Hangul ? WORD_HANGUL : WORD_LETTER;
	case BreakIterator.WordNumber: //RuleBasedBreakIterator.WORD_NUMBER:
	return WORD_NUMBER;
	case EMOJI_SEQUENCE_STATUS:
	return WORD_EMOJI;
	default: /* some other custom code */
	return "<OTHER>";
	}
	}

	private static RuleBasedBreakIterator ReadBreakIterator(string filename)
	{
	using (Stream @is = typeof(DefaultICUTokenizerConfig).FindAndGetManifestResourceStream(filename))
	{
	try
	{
	RuleBasedBreakIterator bi =
	RuleBasedBreakIterator.GetInstanceFromCompiledRules(@is);
	return bi;
	}
	catch (IOException e)
	{
	throw new Exception(e.ToString(), e);
	}
	}
	}
	}
	}