using Lucene.Net.Analysis.TokenAttributes;
using Lucene.Net.Util;
using System;
using Reader = System.IO.TextReader;
namespace Lucene.Net.Analysis.Standard
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/// <summary>
/// A grammar-based tokenizer constructed with JFlex (and then ported to .NET)
///
/// <para> This should be a good tokenizer for most European-language documents:
///
/// <list type="bullet">
/// <item><description>Splits words at punctuation characters, removing punctuation. However, a
/// dot that's not followed by whitespace is considered part of a token.</description></item>
/// <item><description>Splits words at hyphens, unless there's a number in the token, in which case
/// the whole token is interpreted as a product number and is not split.</description></item>
/// <item><description>Recognizes email addresses and internet hostnames as one token.</description></item>
/// </list>
///
/// </para>
/// <para>Many applications have specific tokenizer needs. If this tokenizer does
/// not suit your application, please consider copying this source code
/// directory to your project and maintaining your own grammar-based tokenizer.
///
/// <see cref="ClassicTokenizer"/> was named <see cref="StandardTokenizer"/> in Lucene versions prior to 3.1.
/// As of 3.1, <see cref="StandardTokenizer"/> implements Unicode text segmentation,
/// as specified by UAX#29.
/// </para>
/// </summary>
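/// <example>
/// A minimal usage sketch (not part of the original sources; the sample text, the
/// 255-character limit, and the <c>LuceneVersion.LUCENE_48</c> compatibility version
/// are illustrative assumptions). It feeds a reader to the tokenizer and consumes
/// tokens through the attribute API:
/// <code>
/// TextReader reader = new StringReader("lucene.apache.org AB-1234 test@example.com");
/// var tokenizer = new ClassicTokenizer(LuceneVersion.LUCENE_48, reader)
/// {
/// MaxTokenLength = 255 // tokens longer than this are skipped entirely
/// };
/// var termAtt = tokenizer.AddAttribute&lt;ICharTermAttribute&gt;();
/// var typeAtt = tokenizer.AddAttribute&lt;ITypeAttribute&gt;();
/// tokenizer.Reset();
/// while (tokenizer.IncrementToken())
/// {
/// Console.WriteLine("{0} ({1})", termAtt.ToString(), typeAtt.Type);
/// }
/// tokenizer.End();
/// tokenizer.Dispose();
/// </code>
/// </example>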
public sealed class ClassicTokenizer : Tokenizer
{
/// <summary>
/// A private instance of the JFlex-constructed scanner </summary>
private IStandardTokenizerInterface scanner;
public const int ALPHANUM = 0;
public const int APOSTROPHE = 1;
public const int ACRONYM = 2;
public const int COMPANY = 3;
public const int EMAIL = 4;
public const int HOST = 5;
public const int NUM = 6;
public const int CJ = 7;
public const int ACRONYM_DEP = 8;
/// <summary>
/// String token types that correspond to token type int constants </summary>
public static readonly string[] TOKEN_TYPES = new string[] {
"<ALPHANUM>",
"<APOSTROPHE>",
"<ACRONYM>",
"<COMPANY>",
"<EMAIL>",
"<HOST>",
"<NUM>",
"<CJ>",
"<ACRONYM_DEP>"
};
private int skippedPositions;
private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;
/// <summary>
/// Gets or sets the maximum allowed token length. Any token longer
/// than this is skipped.
/// </summary>
public int MaxTokenLength
{
get => maxTokenLength;
set
{
if (value < 1)
throw new ArgumentException("maxTokenLength must be greater than zero");
this.maxTokenLength = value;
}
}
/// <summary>
/// Creates a new instance of the <see cref="ClassicTokenizer"/>. Attaches
/// the <paramref name="input"/> to the newly created JFlex scanner.
/// </summary>
/// <param name="matchVersion"> lucene compatibility version </param>
/// <param name="input"> The input reader
///
/// See http://issues.apache.org/jira/browse/LUCENE-1068 </param>
public ClassicTokenizer(LuceneVersion matchVersion, Reader input)
: base(input)
{
Init(matchVersion);
}
/// <summary>
/// Creates a new <see cref="ClassicTokenizer"/> with a given <see cref="AttributeSource.AttributeFactory"/>
/// </summary>
public ClassicTokenizer(LuceneVersion matchVersion, AttributeFactory factory, Reader input)
: base(factory, input)
{
Init(matchVersion);
}
private void Init(LuceneVersion matchVersion)
{
this.scanner = new ClassicTokenizerImpl(m_input);
this.termAtt = AddAttribute<ICharTermAttribute>();
this.offsetAtt = AddAttribute<IOffsetAttribute>();
this.posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
this.typeAtt = AddAttribute<ITypeAttribute>();
}
// this tokenizer generates four attributes:
// term, offset, positionIncrement and type
private ICharTermAttribute termAtt;
private IOffsetAttribute offsetAtt;
private IPositionIncrementAttribute posIncrAtt;
private ITypeAttribute typeAtt;
/// <summary>
/// Consumes the next token from the scanner and populates the term, offset,
/// position-increment, and type attributes.
/// </summary>
/// <seealso cref="TokenStream.IncrementToken()"/>
public override sealed bool IncrementToken()
{
ClearAttributes();
skippedPositions = 0;
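// Keep pulling tokens from the scanner until one fits within maxTokenLength
// (or the scanner reports end of input); over-length tokens are skipped but
// counted so the next emitted token carries the correct position increment.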
while (true)
{
int tokenType = scanner.GetNextToken();
if (tokenType == StandardTokenizerInterface.YYEOF)
{
return false;
}
if (scanner.YyLength <= maxTokenLength)
{
posIncrAtt.PositionIncrement = skippedPositions + 1;
scanner.GetText(termAtt);
int start = scanner.YyChar;
offsetAtt.SetOffset(CorrectOffset(start), CorrectOffset(start + termAtt.Length));
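// ACRONYM_DEP marks the deprecated acronym handling (see the LUCENE-1068 issue
// referenced in the constructor docs): the token is reported with the <HOST>
// type and the trailing '.' left by the old grammar is removed.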
if (tokenType == ClassicTokenizer.ACRONYM_DEP)
{
typeAtt.Type = ClassicTokenizer.TOKEN_TYPES[ClassicTokenizer.HOST];
termAtt.Length = termAtt.Length - 1; // remove extra '.'
}
else
{
typeAtt.Type = ClassicTokenizer.TOKEN_TYPES[tokenType];
}
return true;
}
else
{
// When we skip a too-long term, we still increment the
// position increment
skippedPositions++;
}
}
}
public override sealed void End()
{
base.End();
// set final offset
int finalOffset = CorrectOffset(scanner.YyChar + scanner.YyLength);
offsetAtt.SetOffset(finalOffset, finalOffset);
// adjust any skipped tokens
posIncrAtt.PositionIncrement = posIncrAtt.PositionIncrement + skippedPositions;
}
protected override void Dispose(bool disposing)
{
base.Dispose(disposing);
if (disposing)
{
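// Reset the scanner so any characters buffered from the previous input are discarded.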
scanner.YyReset(m_input);
}
}
public override void Reset()
{
base.Reset();
scanner.YyReset(m_input);
skippedPositions = 0;
}
}
}