blob: e6421a8d338046e4afd6ac4126659960427b9996 [file] [log] [blame]
using J2N;
using Lucene.Net.Analysis.TokenAttributes;
using Lucene.Net.Util;
using Lucene.Net.Util.Fst;
using System.Collections.Generic;
using System.Globalization;
namespace Lucene.Net.Analysis.Miscellaneous
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/// <summary>
/// Provides the ability to override any <see cref="KeywordAttribute"/> aware stemmer
/// with custom dictionary-based stemming.
/// <para>
/// Each incoming term that is not already flagged as a keyword is looked up in a
/// <see cref="StemmerOverrideMap"/>. On a hit the term text is replaced by the mapped
/// value and the token is flagged as a keyword.
/// </para>
/// </summary>
public sealed class StemmerOverrideFilter : TokenFilter
{
    private readonly StemmerOverrideMap stemmerOverrideMap;

    private readonly ICharTermAttribute termAtt;
    private readonly IKeywordAttribute keywordAtt;

    // null when the map has no FST; IncrementToken then passes tokens through untouched.
    private readonly FST.BytesReader fstReader;

    // Scratch objects reused for every token so that lookups do not allocate.
    private readonly FST.Arc<BytesRef> scratchArc = new FST.Arc<BytesRef>();
    private readonly CharsRef spare = new CharsRef();

    /// <summary>
    /// Create a new <see cref="StemmerOverrideFilter"/>, performing dictionary-based stemming
    /// with the provided dictionary (<paramref name="stemmerOverrideMap"/>).
    /// <para>
    /// Any dictionary-stemmed terms will be marked with <see cref="KeywordAttribute"/>
    /// so that they will not be stemmed with stemmers down the chain.
    /// </para>
    /// </summary>
    /// <param name="input"> the token stream to filter </param>
    /// <param name="stemmerOverrideMap"> the dictionary of stemmer overrides </param>
    public StemmerOverrideFilter(TokenStream input, StemmerOverrideMap stemmerOverrideMap)
        : base(input)
    {
        this.stemmerOverrideMap = stemmerOverrideMap;
        fstReader = stemmerOverrideMap.GetBytesReader();
        termAtt = AddAttribute<ICharTermAttribute>();
        keywordAtt = AddAttribute<IKeywordAttribute>();
    }

    /// <summary>
    /// Advances the wrapped stream by one token and, if the term has an entry in the
    /// override dictionary, replaces the term text with the mapped value and marks the
    /// token as a keyword.
    /// </summary>
    /// <returns> <c>true</c> if a token is available; <c>false</c> once the wrapped stream is exhausted. </returns>
    public override bool IncrementToken()
    {
        if (!m_input.IncrementToken())
        {
            return false;
        }

        // Nothing to do when there are no overrides, and don't muck with
        // already-keyworded terms.
        if (fstReader == null || keywordAtt.IsKeyword)
        {
            return true;
        }

        BytesRef stem = stemmerOverrideMap.Get(termAtt.Buffer, termAtt.Length, scratchArc, fstReader);
        if (stem == null)
        {
            return true;
        }

        // Decode the UTF-8 override into the term's own buffer where possible.
        char[] buffer = spare.Chars = termAtt.Buffer;
        UnicodeUtil.UTF8toUTF16(stem.Bytes, stem.Offset, stem.Length, spare);
        if (spare.Chars != buffer)
        {
            // The conversion swapped in a different array (presumably because the
            // override did not fit), so copy the result back into the term attribute.
            termAtt.CopyBuffer(spare.Chars, spare.Offset, spare.Length);
        }
        termAtt.Length = spare.Length;
        keywordAtt.IsKeyword = true;
        return true;
    }

    /// <summary>
    /// A read-only 4-byte FST backed map that allows fast, optionally case-insensitive,
    /// key value lookups for <see cref="StemmerOverrideFilter"/>
    /// </summary>
    // TODO maybe we can generalize this and reuse this map somehow?
    public sealed class StemmerOverrideMap
    {
        private readonly FST<BytesRef> fst;
        private readonly bool ignoreCase;

        /// <summary>
        /// Creates a new <see cref="StemmerOverrideMap"/> </summary>
        /// <param name="fst"> the fst to lookup the overrides </param>
        /// <param name="ignoreCase"> if the keys case should be ignored </param>
        public StemmerOverrideMap(FST<BytesRef> fst, bool ignoreCase)
        {
            this.fst = fst;
            this.ignoreCase = ignoreCase;
        }

        /// <summary>
        /// Returns a <see cref="FST.BytesReader"/> to pass to the <see cref="Get(char[], int, FST.Arc{BytesRef}, FST.BytesReader)"/> method.
        /// </summary>
        /// <returns> a reader over the underlying FST, or <c>null</c> if this map has no FST. </returns>
        public FST.BytesReader GetBytesReader()
        {
            if (fst == null)
            {
                return null;
            }
            return fst.GetBytesReader();
        }

        /// <summary>
        /// Returns the value mapped to the given key or <c>null</c> if the key is not in the FST dictionary.
        /// </summary>
        /// <param name="buffer"> the characters of the key </param>
        /// <param name="bufferLen"> the number of valid characters in <paramref name="buffer"/> </param>
        /// <param name="scratchArc"> a reusable arc; it is overwritten while walking the FST </param>
        /// <param name="fstReader"> a reader obtained from <see cref="GetBytesReader()"/> </param>
        /// <returns> the mapped value, or <c>null</c> if there is no mapping (including when this map has no FST). </returns>
        public BytesRef Get(char[] buffer, int bufferLen, FST.Arc<BytesRef> scratchArc, FST.BytesReader fstReader)
        {
            if (fst == null)
            {
                // A map without an FST contains no keys. GetBytesReader() already
                // treats this as a legal state, so report "not found" instead of failing.
                return null;
            }

            BytesRef pendingOutput = fst.Outputs.NoOutput;
            fst.GetFirstArc(scratchArc);

            // Walk the FST one code point (not one char) at a time, accumulating outputs.
            int bufUpto = 0;
            while (bufUpto < bufferLen)
            {
                int codePoint = Character.CodePointAt(buffer, bufUpto, bufferLen);

                // Fold case with the invariant culture so that lookups agree with the
                // keys folded by Builder.Add, whatever the current thread culture is.
                int label = ignoreCase
                    ? Character.ToLower(codePoint, CultureInfo.InvariantCulture)
                    : codePoint;

                if (fst.FindTargetArc(label, scratchArc, scratchArc, fstReader) == null)
                {
                    return null;
                }
                pendingOutput = fst.Outputs.Add(pendingOutput, scratchArc.Output);
                bufUpto += Character.CharCount(codePoint);
            }

            // The whole key was consumed; it is only a match if we stopped on a final arc.
            if (!scratchArc.IsFinal)
            {
                return null;
            }
            return fst.Outputs.Add(pendingOutput, scratchArc.NextFinalOutput);
        }
    }

    /// <summary>
    /// This builder builds an <see cref="FST"/> for the <see cref="StemmerOverrideFilter"/>
    /// </summary>
    public class Builder
    {
        // Keys are stored as UTF-8 bytes in the hash; outputValues is appended to
        // only when the hash accepts a new key.
        private readonly BytesRefHash hash = new BytesRefHash();
        private readonly BytesRef spare = new BytesRef();
        private readonly List<string> outputValues = new List<string>();
        private readonly bool ignoreCase;
        private readonly CharsRef charsSpare = new CharsRef();

        /// <summary>
        /// Creates a new <see cref="Builder"/> with <c>ignoreCase</c> set to <c>false</c>
        /// </summary>
        public Builder()
            : this(false)
        {
        }

        /// <summary>
        /// Creates a new <see cref="Builder"/> </summary>
        /// <param name="ignoreCase"> if the input case should be ignored. </param>
        public Builder(bool ignoreCase)
        {
            this.ignoreCase = ignoreCase;
        }

        /// <summary>
        /// Adds an input string and its stemmer override output to this builder.
        /// </summary>
        /// <param name="input"> the input char sequence </param>
        /// <param name="output"> the stemmer override output char sequence </param>
        /// <returns> <c>false</c> if the input has already been added to this builder otherwise <c>true</c>. </returns>
        public virtual bool Add(string input, string output)
        {
            int length = input.Length;
            if (ignoreCase)
            {
                // convert on the fly to lowercase
                charsSpare.Grow(length);
                char[] buffer = charsSpare.Chars;
                for (int i = 0; i < length;)
                {
                    // Fold case with the invariant culture so that the stored keys agree
                    // with StemmerOverrideMap.Get, whatever the current thread culture is.
                    int lowered = Character.ToLower(Character.CodePointAt(input, i), CultureInfo.InvariantCulture);
                    i += Character.ToChars(lowered, buffer, i);
                }
                UnicodeUtil.UTF16toUTF8(buffer, 0, length, spare);
            }
            else
            {
                UnicodeUtil.UTF16toUTF8(input.ToCharArray(), 0, length, spare);
            }

            // A negative result from the hash signals that the key was already present.
            if (hash.Add(spare) >= 0)
            {
                outputValues.Add(output);
                return true;
            }
            return false;
        }

        /// <summary>
        /// Returns a <see cref="StemmerOverrideMap"/> to be used with the <see cref="StemmerOverrideFilter"/> </summary>
        /// <returns> a <see cref="StemmerOverrideMap"/> to be used with the <see cref="StemmerOverrideFilter"/> </returns>
        /// <exception cref="System.IO.IOException"> if an <see cref="System.IO.IOException"/> occurs; </exception>
        public virtual StemmerOverrideMap Build()
        {
            ByteSequenceOutputs outputs = ByteSequenceOutputs.Singleton;
            Builder<BytesRef> builder = new Builder<BytesRef>(FST.INPUT_TYPE.BYTE4, outputs);

            // Entries are fed to the FST builder in sorted key order.
            int[] sort = hash.Sort(BytesRef.UTF8SortedAsUnicodeComparer);
            Int32sRef intsSpare = new Int32sRef();
            int size = hash.Count;
            for (int i = 0; i < size; i++)
            {
                int id = sort[i];
                BytesRef bytesRef = hash.Get(id, spare);

                // The FST is keyed on code points (BYTE4), so expand UTF-8 to UTF-32.
                UnicodeUtil.UTF8toUTF32(bytesRef, intsSpare);
                builder.Add(intsSpare, new BytesRef(outputValues[id]));
            }
            return new StemmerOverrideMap(builder.Finish(), ignoreCase);
        }
    }
}
}