blob: bf5ecabd2fac174d92034bb71178fde41c593a17 [file] [log] [blame]
using J2N.Text;
using Lucene.Net.Analysis.TokenAttributes;
using Lucene.Net.Analysis.Util;
using Lucene.Net.Diagnostics;
using Lucene.Net.Util;
using System;
using System.Collections.Generic;
namespace Lucene.Net.Analysis.Compound
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/// <summary>
/// Base class for decomposition token filters.
/// <para/>
/// You must specify the required <see cref="LuceneVersion"/> compatibility when creating
/// <see cref="CompoundWordTokenFilterBase"/>:
/// <list type="bullet">
/// <item><description>As of 3.1, CompoundWordTokenFilterBase correctly handles Unicode 4.0
/// supplementary characters in strings and char arrays provided as compound word
/// dictionaries.</description></item>
/// <item><description>As of 4.4, <see cref="CompoundWordTokenFilterBase"/> doesn't update offsets.</description></item>
/// </list>
/// </summary>
public abstract class CompoundWordTokenFilterBase : TokenFilter
/// <summary>
/// The default for minimal word length that gets decomposed
/// </summary>
public const int DEFAULT_MIN_WORD_SIZE = 5;
/// <summary>
/// The default for minimal length of subwords that get propagated to the output of this filter
/// </summary>
public const int DEFAULT_MIN_SUBWORD_SIZE = 2;
/// <summary>
/// The default for maximal length of subwords that get propagated to the output of this filter
/// </summary>
public const int DEFAULT_MAX_SUBWORD_SIZE = 15;
/// <summary>
/// Compatibility version supplied to the constructor. <see cref="CompoundToken"/> consults it
/// when deciding whether sub-token offsets are adjusted or copied from the original token.
/// </summary>
protected readonly LuceneVersion m_matchVersion;
/// <summary>
/// Dictionary supplied to the constructor and stored as-is. It is not read by this base class;
/// presumably subclasses consult it while decomposing (confirm against the concrete filters).
/// </summary>
protected readonly CharArraySet m_dictionary;
/// <summary>
/// Pending sub-tokens. <see cref="IncrementToken"/> dequeues one entry per call while the queue is non-empty.
/// </summary>
protected readonly Queue<CompoundToken> m_tokens;
/// <summary>
/// Minimum term length (inclusive) a token must have before it is considered for decomposition.
/// </summary>
protected readonly int m_minWordSize;
/// <summary>
/// Minimum subword length supplied to the constructor. Not read by this base class;
/// presumably enforced by subclasses when they produce sub-tokens.
/// </summary>
protected readonly int m_minSubwordSize;
/// <summary>
/// Maximum subword length supplied to the constructor. Not read by this base class;
/// presumably enforced by subclasses when they produce sub-tokens.
/// </summary>
protected readonly int m_maxSubwordSize;
/// <summary>
/// Flag supplied to the constructor. Not read by this base class; presumably tells subclasses
/// to keep only the longest dictionary match (confirm against the concrete filters).
/// </summary>
protected readonly bool m_onlyLongestMatch;
/// <summary>
/// Term text attribute of the token currently being processed (registered in the constructor).
/// </summary>
protected readonly ICharTermAttribute m_termAtt;
/// <summary>
/// Offset attribute of the token currently being processed (registered in the constructor).
/// </summary>
protected readonly IOffsetAttribute m_offsetAtt;
// Position increment attribute; set to 0 for every emitted sub-token.
private readonly IPositionIncrementAttribute posIncAtt;
// Attribute state captured from the original token; restored before each sub-token is emitted.
private AttributeSource.State current;
/// <summary>
/// Creates a filter that uses the default word and subword size limits
/// (<see cref="DEFAULT_MIN_WORD_SIZE"/>, <see cref="DEFAULT_MIN_SUBWORD_SIZE"/>,
/// <see cref="DEFAULT_MAX_SUBWORD_SIZE"/>).
/// </summary>
/// <param name="matchVersion">Lucene compatibility version.</param>
/// <param name="input">The token stream to wrap.</param>
/// <param name="dictionary">The dictionary stored for decomposition.</param>
/// <param name="onlyLongestMatch">Value stored in <see cref="m_onlyLongestMatch"/>.</param>
protected CompoundWordTokenFilterBase(LuceneVersion matchVersion, TokenStream input, CharArraySet dictionary, bool onlyLongestMatch)
    : this(
        matchVersion,
        input,
        dictionary,
        minWordSize: DEFAULT_MIN_WORD_SIZE,
        minSubwordSize: DEFAULT_MIN_SUBWORD_SIZE,
        maxSubwordSize: DEFAULT_MAX_SUBWORD_SIZE,
        onlyLongestMatch: onlyLongestMatch)
{
}
/// <summary>
/// Creates a filter that uses the default word and subword size limits and does not
/// restrict output to the longest match.
/// </summary>
/// <param name="matchVersion">Lucene compatibility version.</param>
/// <param name="input">The token stream to wrap.</param>
/// <param name="dictionary">The dictionary stored for decomposition.</param>
protected CompoundWordTokenFilterBase(LuceneVersion matchVersion, TokenStream input, CharArraySet dictionary)
    // Delegate so that the input is wrapped, attributes are registered and all
    // readonly fields are initialized by the single validating constructor.
    : this(matchVersion, input, dictionary, false)
{
}
/// <summary>
/// Creates a filter with explicit size limits. All other constructors delegate here.
/// </summary>
/// <param name="matchVersion">Lucene compatibility version.</param>
/// <param name="input">The token stream to wrap.</param>
/// <param name="dictionary">The dictionary stored for decomposition; stored without validation.</param>
/// <param name="minWordSize">Minimum term length for a token to be decomposed; must not be negative.</param>
/// <param name="minSubwordSize">Minimum subword length; must not be negative.</param>
/// <param name="maxSubwordSize">Maximum subword length; must not be negative.</param>
/// <param name="onlyLongestMatch">Value stored in <see cref="m_onlyLongestMatch"/>.</param>
/// <exception cref="ArgumentException">If any of the size arguments is negative.</exception>
protected CompoundWordTokenFilterBase(LuceneVersion matchVersion, TokenStream input, CharArraySet dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, bool onlyLongestMatch)
    : base(input)
{
    // Attribute registration stays ahead of validation, exactly as before:
    // it touches the attribute source even when an argument is rejected below.
    m_termAtt = AddAttribute<ICharTermAttribute>();
    m_offsetAtt = AddAttribute<IOffsetAttribute>();
    posIncAtt = AddAttribute<IPositionIncrementAttribute>();

    // Reject negative sizes, checked in declaration order.
    if (minWordSize < 0)
    {
        throw new ArgumentException("minWordSize cannot be negative");
    }
    if (minSubwordSize < 0)
    {
        throw new ArgumentException("minSubwordSize cannot be negative");
    }
    if (maxSubwordSize < 0)
    {
        throw new ArgumentException("maxSubwordSize cannot be negative");
    }

    m_matchVersion = matchVersion;
    m_dictionary = dictionary;
    m_tokens = new Queue<CompoundToken>();
    m_minWordSize = minWordSize;
    m_minSubwordSize = minSubwordSize;
    m_maxSubwordSize = maxSubwordSize;
    m_onlyLongestMatch = onlyLongestMatch;
}
/// <summary>
/// Advances the stream by one token.
/// <para/>
/// While sub-tokens are queued, each call emits the next one: the attribute state captured
/// from the original token is restored, then term text and offsets are replaced with the
/// sub-token's values and the position increment is set to 0. Otherwise the next token is
/// pulled from the input, decomposed if it is long enough, and returned unchanged.
/// </summary>
/// <returns><c>true</c> if a token was produced; <c>false</c> when the input is exhausted.</returns>
public override sealed bool IncrementToken()
{
    if (m_tokens.Count > 0)
    {
        if (Debugging.AssertsEnabled) Debugging.Assert(current != null);
        CompoundToken token = m_tokens.Dequeue();
        RestoreState(current); // keep all other attributes untouched
        // Replace the restored original term with the sub-token text; without this
        // every sub-token would be emitted with the original token's text.
        m_termAtt.SetEmpty().Append(token.Text);
        m_offsetAtt.SetOffset(token.StartOffset, token.EndOffset);
        posIncAtt.PositionIncrement = 0;
        return true;
    }

    current = null; // not really needed, but for safety
    if (!m_input.IncrementToken())
    {
        return false;
    }

    // Only words at least minWordSize long get processed
    if (m_termAtt.Length >= m_minWordSize)
    {
        // Let the subclass enqueue the sub-tokens for the current term.
        Decompose();
        // only capture the state if we really need it for producing new tokens
        if (m_tokens.Count > 0)
        {
            current = CaptureState();
        }
    }

    // return original token:
    return true;
}
/// <summary>
/// Decomposes the current <see cref="m_termAtt"/> and places <see cref="CompoundToken"/> instances in the <see cref="m_tokens"/> queue.
/// The original token must not be placed in the queue, as it is automatically passed through this filter.
/// </summary>
protected abstract void Decompose();
/// <summary>
/// Resets this filter so it can be reused: delegates to the base implementation,
/// discards any sub-tokens still queued and drops the captured attribute state.
/// </summary>
public override void Reset()
{
    // The base call is required so the reset is propagated down the filter chain.
    base.Reset();
    // Sub-tokens left over from a partially consumed stream must not leak into the next one.
    m_tokens.Clear();
    current = null;
}
/// <summary>
/// Helper class to hold decompounded token information
/// </summary>
protected class CompoundToken
{
    private readonly ICharSequence txt;
    private readonly int startOffset, endOffset;

    /// <summary>The sub-token text, a slice of the original term.</summary>
    public ICharSequence Text => txt; // LUCENENET specific: changed public field into property backed by private field

    /// <summary>Start offset reported for this sub-token.</summary>
    public int StartOffset => startOffset; // LUCENENET specific: changed public field into property backed by private field

    /// <summary>End offset reported for this sub-token.</summary>
    public int EndOffset => endOffset; // LUCENENET specific: changed public field into property backed by private field

    /// <summary>
    /// Construct the compound token based on a slice of the current <see cref="CompoundWordTokenFilterBase.m_termAtt"/>.
    /// </summary>
    /// <param name="compoundWordTokenFilterBase">The filter whose current term and offsets are read.</param>
    /// <param name="offset">Start index of the slice within the current term.</param>
    /// <param name="length">Number of characters in the slice.</param>
    public CompoundToken(CompoundWordTokenFilterBase compoundWordTokenFilterBase, int offset, int length)
    {
        this.txt = compoundWordTokenFilterBase.m_termAtt.Subsequence(offset, length); // LUCENENET: Corrected 2nd Subsequence parameter

        // offsets of the original word
        int startOff = compoundWordTokenFilterBase.m_offsetAtt.StartOffset;
        int endOff = compoundWordTokenFilterBase.m_offsetAtt.EndOffset;

#pragma warning disable 612, 618
        bool keepOriginalOffsets =
            compoundWordTokenFilterBase.m_matchVersion.OnOrAfter(LuceneVersion.LUCENE_44)
            || endOff - startOff != compoundWordTokenFilterBase.m_termAtt.Length;
#pragma warning restore 612, 618

        if (keepOriginalOffsets)
        {
            // if length by start + end offsets doesn't match the term text then assume
            // this is a synonym and don't adjust the offsets.
            this.startOffset = startOff;
            this.endOffset = endOff;
        }
        else
        {
            // Pre-4.4 behavior: narrow the offsets to the slice inside the original word.
            int newStart = startOff + offset;
            this.startOffset = newStart;
            this.endOffset = newStart + length;
        }
    }
}