blob: b6093cba53483d7b15d921bfb3b4d82eaf634036 [file] [log] [blame]
// Lucene version compatibility level 8.6.1
using ICU4N.Globalization;
using ICU4N.Text;
using J2N;
using Lucene.Net.Analysis.Standard;
using System;
using System.IO;
namespace Lucene.Net.Analysis.Icu.Segmentation
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/// <summary>
/// Default <see cref="ICUTokenizerConfig"/> that is generally applicable
/// to many languages.
/// </summary>
/// <remarks>
/// Generally tokenizes Unicode text according to UAX#29
/// (<see cref="T:BreakIterator.GetWordInstance(ULocale.ROOT)"/>),
/// but with the following tailorings:
/// <list type="bullet">
/// <item><description>Thai, Lao, Myanmar, Khmer, and CJK text is broken into words with a dictionary.</description></item>
/// </list>
/// <para/>
/// @lucene.experimental
/// </remarks>
public class DefaultICUTokenizerConfig : ICUTokenizerConfig
{
/// <summary>Token type for words containing ideographic characters</summary>
public static readonly string WORD_IDEO = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.IDEOGRAPHIC];
/// <summary>Token type for words containing Japanese hiragana</summary>
public static readonly string WORD_HIRAGANA = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HIRAGANA];
/// <summary>Token type for words containing Japanese katakana</summary>
public static readonly string WORD_KATAKANA = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.KATAKANA];
/// <summary>Token type for words containing Korean hangul</summary>
public static readonly string WORD_HANGUL = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HANGUL];
/// <summary>Token type for words that contain letters</summary>
public static readonly string WORD_LETTER = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.ALPHANUM];
/// <summary>Token type for words that appear to be numbers</summary>
public static readonly string WORD_NUMBER = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.NUM];
/// <summary>Token type for words that appear to be emoji sequences</summary>
public static readonly string WORD_EMOJI = "<EMOJI>"; //StandardTokenizer.TOKEN_TYPES[StandardTokenizer.EMOJI]; // LUCENENET: 4.8.1 StandardTokenizer doesn't contain EMOJI
/// <summary>
/// the default breakiterators in use. these can be expensive to
/// instantiate, cheap to clone.
/// </summary>
// we keep the cjk breaking separate, thats because it cannot be customized (because dictionary
// is only triggered when kind = WORD, but kind = LINE by default and we have no non-evil way to change it)
private static readonly BreakIterator cjkBreakIterator = BreakIterator.GetWordInstance(UCultureInfo.InvariantCulture);
// TODO: if the wrong version of the ICU jar is used, loading these data files may give a strange error.
// maybe add an explicit check? http://icu-project.org/apiref/icu4j/com/ibm/icu/util/VersionInfo.html
// the same as ROOT, except no dictionary segmentation for cjk
private static readonly BreakIterator defaultBreakIterator =
ReadBreakIterator("Default.brk");
private static readonly BreakIterator myanmarSyllableIterator =
ReadBreakIterator("MyanmarSyllable.brk");
// TODO: deprecate this boolean? you only care if you are doing super-expert stuff...
private readonly bool cjkAsWords;
private readonly bool myanmarAsWords;
/// <summary>
/// Creates a new config. This object is lightweight, but the first
/// time the class is referenced, breakiterators will be initialized.
/// </summary>
/// <param name="cjkAsWords">true if cjk text should undergo dictionary-based segmentation,
/// otherwise text will be segmented according to UAX#29 defaults.</param>
/// <param name="myanmarAsWords">If this is true, all Han+Hiragana+Katakana words will be tagged as IDEOGRAPHIC.</param>
public DefaultICUTokenizerConfig(bool cjkAsWords, bool myanmarAsWords)
{
this.cjkAsWords = cjkAsWords;
this.myanmarAsWords = myanmarAsWords;
}
public override bool CombineCJ => cjkAsWords;
public override RuleBasedBreakIterator GetBreakIterator(int script)
{
switch (script)
{
case UScript.Japanese: return (RuleBasedBreakIterator)cjkBreakIterator.Clone();
case UScript.Myanmar:
if (myanmarAsWords)
{
return (RuleBasedBreakIterator)defaultBreakIterator.Clone();
}
else
{
return (RuleBasedBreakIterator)myanmarSyllableIterator.Clone();
}
default: return (RuleBasedBreakIterator)defaultBreakIterator.Clone();
}
}
public override string GetType(int script, int ruleStatus)
{
switch (ruleStatus)
{
case BreakIterator.WordIdeo:
return WORD_IDEO;
case BreakIterator.WordKana: //RuleBasedBreakIterator.WORD_KANA:
return script == UScript.Hiragana ? WORD_HIRAGANA : WORD_KATAKANA;
case BreakIterator.WordLetter: //RuleBasedBreakIterator.WORD_LETTER:
return script == UScript.Hangul ? WORD_HANGUL : WORD_LETTER;
case BreakIterator.WordNumber: //RuleBasedBreakIterator.WORD_NUMBER:
return WORD_NUMBER;
case EMOJI_SEQUENCE_STATUS:
return WORD_EMOJI;
default: /* some other custom code */
return "<OTHER>";
}
}
private static RuleBasedBreakIterator ReadBreakIterator(string filename)
{
using (Stream @is = typeof(DefaultICUTokenizerConfig).FindAndGetManifestResourceStream(filename))
{
try
{
RuleBasedBreakIterator bi =
RuleBasedBreakIterator.GetInstanceFromCompiledRules(@is);
return bi;
}
catch (IOException e)
{
throw new Exception(e.ToString(), e);
}
}
}
}
}