| // Lucene version compatibility level 4.8.1 |
| using Lucene.Net.Analysis.Standard.Std31; |
| using Lucene.Net.Analysis.Standard.Std34; |
| using Lucene.Net.Analysis.Standard.Std36; |
| using Lucene.Net.Analysis.Standard.Std40; |
| using Lucene.Net.Analysis.TokenAttributes; |
| using Lucene.Net.Util; |
| using System; |
| using System.IO; |
| |
| namespace Lucene.Net.Analysis.Standard |
| { |
| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
/// <summary>
/// This class implements Word Break rules from the Unicode Text Segmentation
/// algorithm, as specified in
/// <a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a>.
/// URLs and email addresses are also tokenized according to the relevant RFCs.
/// <para/>
/// Tokens produced are of the following types:
/// <list type="bullet">
///     <item><description>&lt;ALPHANUM&gt;: A sequence of alphabetic and numeric characters</description></item>
///     <item><description>&lt;NUM&gt;: A number</description></item>
///     <item><description>&lt;URL&gt;: A URL</description></item>
///     <item><description>&lt;EMAIL&gt;: An email address</description></item>
///     <item><description>&lt;SOUTHEAST_ASIAN&gt;: A sequence of characters from South and Southeast
///         Asian languages, including Thai, Lao, Myanmar, and Khmer</description></item>
///     <item><description>&lt;IDEOGRAPHIC&gt;: A single CJKV ideographic character</description></item>
///     <item><description>&lt;HIRAGANA&gt;: A single hiragana character</description></item>
/// </list>
/// <para>You must specify the required <see cref="LuceneVersion"/>
/// compatibility when creating <see cref="UAX29URLEmailTokenizer"/>:
/// <list type="bullet">
///     <item><description> As of 3.4, Hiragana and Han characters are no longer wrongly split
///         from their combining characters. If you use a previous version number,
///         you get the exact broken behavior for backwards compatibility.</description></item>
/// </list>
/// </para>
/// </summary>
public sealed class UAX29URLEmailTokenizer : Tokenizer
{
    // Token type ids. Each value is the index of the matching string in TOKEN_TYPES.
    public const int ALPHANUM = 0;
    public const int NUM = 1;
    public const int SOUTHEAST_ASIAN = 2;
    public const int IDEOGRAPHIC = 3;
    public const int HIRAGANA = 4;
    public const int KATAKANA = 5;
    public const int HANGUL = 6;
    public const int URL = 7;
    public const int EMAIL = 8;

    /// <summary>
    /// String token types that correspond to token type int constants.
    /// The first seven entries are taken from <see cref="StandardTokenizer.TOKEN_TYPES"/>;
    /// the URL and EMAIL entries are defined here.
    /// </summary>
    public static readonly string[] TOKEN_TYPES =
    {
        StandardTokenizer.TOKEN_TYPES[StandardTokenizer.ALPHANUM],
        StandardTokenizer.TOKEN_TYPES[StandardTokenizer.NUM],
        StandardTokenizer.TOKEN_TYPES[StandardTokenizer.SOUTHEAST_ASIAN],
        StandardTokenizer.TOKEN_TYPES[StandardTokenizer.IDEOGRAPHIC],
        StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HIRAGANA],
        StandardTokenizer.TOKEN_TYPES[StandardTokenizer.KATAKANA],
        StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HANGUL],
        "<URL>",
        "<EMAIL>"
    };

    /// <summary>
    /// A private instance of the JFlex-constructed scanner </summary>
    private IStandardTokenizerInterface scanner;

    // Number of over-long tokens dropped since the last emitted token (or since Reset()).
    private int skippedPositions;

    private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;

    // This tokenizer populates four attributes:
    // term, offset, position increment and type.
    private ICharTermAttribute termAtt;
    private IOffsetAttribute offsetAtt;
    private IPositionIncrementAttribute posIncrAtt;
    private ITypeAttribute typeAtt;

    /// <summary>
    /// Set the max allowed token length. Any token longer
    /// than this is skipped.
    /// </summary>
    /// <exception cref="ArgumentOutOfRangeException">The assigned value is less than 1.</exception>
    public int MaxTokenLength
    {
        get => maxTokenLength;
        // LUCENENET specific - changed from IllegalArgumentException to ArgumentOutOfRangeException (.NET convention)
        set => maxTokenLength = value >= 1
            ? value
            : throw new ArgumentOutOfRangeException(nameof(MaxTokenLength), "maxTokenLength must be greater than zero");
    }

    /// <summary>
    /// Creates a new instance of the <see cref="UAX29URLEmailTokenizer"/>. Attaches
    /// the <paramref name="input"/> to the newly created JFlex scanner.
    /// </summary>
    /// <param name="matchVersion"> Lucene compatibility version </param>
    /// <param name="input"> The input reader </param>
    public UAX29URLEmailTokenizer(LuceneVersion matchVersion, TextReader input)
        : base(input)
    {
        SetUpScannerAndAttributes(matchVersion);
    }

    /// <summary>
    /// Creates a new <see cref="UAX29URLEmailTokenizer"/> with a given <see cref="AttributeFactory"/>
    /// </summary>
    /// <param name="matchVersion"> Lucene compatibility version </param>
    /// <param name="factory"> The attribute factory handed to the base tokenizer </param>
    /// <param name="input"> The input reader </param>
    public UAX29URLEmailTokenizer(LuceneVersion matchVersion, AttributeFactory factory, TextReader input)
        : base(factory, input)
    {
        SetUpScannerAndAttributes(matchVersion);
    }

    /// <summary>
    /// LUCENENET specific: shared constructor logic, added in .NET so the code does not
    /// have to be repeated in each constructor. Creates the scanner for the requested
    /// version and registers the term, offset, position increment and type attributes.
    /// </summary>
    /// <param name="matchVersion"> Lucene compatibility version </param>
    private void SetUpScannerAndAttributes(LuceneVersion matchVersion)
    {
        scanner = CreateScanner(matchVersion);
        termAtt = AddAttribute<ICharTermAttribute>();
        offsetAtt = AddAttribute<IOffsetAttribute>();
        posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
        typeAtt = AddAttribute<ITypeAttribute>();
    }

    /// <summary>
    /// Picks the scanner implementation matching <paramref name="matchVersion"/>:
    /// the current implementation for 4.7 and later, otherwise the newest legacy
    /// implementation (4.0, 3.6, 3.4) the version is on or after, falling back to 3.1.
    /// </summary>
    /// <param name="matchVersion"> Lucene compatibility version </param>
    /// <returns> A new scanner reading from the tokenizer's current input </returns>
    private IStandardTokenizerInterface CreateScanner(LuceneVersion matchVersion)
    {
        // best effort NPE if you dont call reset
#pragma warning disable 612, 618
        if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_47))
        {
            return new UAX29URLEmailTokenizerImpl(m_input);
        }

        if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_40))
        {
            return new UAX29URLEmailTokenizerImpl40(m_input);
        }

        if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_36))
        {
            return new UAX29URLEmailTokenizerImpl36(m_input);
        }

        if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_34))
        {
            return new UAX29URLEmailTokenizerImpl34(m_input);
        }

        return new UAX29URLEmailTokenizerImpl31(m_input);
#pragma warning restore 612, 618
    }

    /// <summary>
    /// Advances to the next token that is no longer than <see cref="MaxTokenLength"/>
    /// and fills in its term, offsets, position increment and type.
    /// </summary>
    /// <returns>
    /// <c>true</c> if a token was produced; <c>false</c> once the scanner reports end of input.
    /// </returns>
    public sealed override bool IncrementToken()
    {
        ClearAttributes();
        skippedPositions = 0;

        for (int tokenType = scanner.GetNextToken();
             tokenType != StandardTokenizerInterface.YYEOF;
             tokenType = scanner.GetNextToken())
        {
            if (scanner.YyLength > maxTokenLength)
            {
                // When we skip a too-long term, we still increment the
                // position increment
                skippedPositions++;
                continue;
            }

            posIncrAtt.PositionIncrement = skippedPositions + 1;
            scanner.GetText(termAtt);

            int startOffset = scanner.YyChar;
            int endOffset = startOffset + termAtt.Length;
            offsetAtt.SetOffset(CorrectOffset(startOffset), CorrectOffset(endOffset));

            typeAtt.Type = TOKEN_TYPES[tokenType];
            return true;
        }

        return false;
    }

    /// <summary>
    /// Performs end-of-stream bookkeeping: sets the final offset to the corrected
    /// end of the last scanned region and adds any skipped positions to the
    /// position increment.
    /// </summary>
    public sealed override void End()
    {
        base.End();

        // set final offset
        int endOfInput = CorrectOffset(scanner.YyChar + scanner.YyLength);
        offsetAtt.SetOffset(endOfInput, endOfInput);

        // adjust any skipped tokens
        posIncrAtt.PositionIncrement += skippedPositions;
    }

    /// <summary>
    /// Closes the underlying tokenizer, then resets the scanner on the current input.
    /// </summary>
    public override void Close()
    {
        base.Close();
        scanner.YyReset(m_input);
    }

    /// <summary>
    /// Resets the underlying tokenizer, resets the scanner on the current input
    /// and clears the skipped-position count.
    /// </summary>
    public override void Reset()
    {
        base.Reset();
        scanner.YyReset(m_input);
        skippedPositions = 0;
    }
}
| } |