| // Lucene version compatibility level 8.2.0 |
| using J2N; |
| using Lucene.Net.Analysis.Morfologik.TokenAttributes; |
| using Lucene.Net.Analysis.TokenAttributes; |
| using Lucene.Net.Support; |
| using Lucene.Net.Util; |
| using Morfologik.Stemming; |
| using Morfologik.Stemming.Polish; |
| using System; |
| using System.Collections.Generic; |
| using System.Globalization; |
| using System.Text; |
| using System.Text.RegularExpressions; |
| |
| namespace Lucene.Net.Analysis.Morfologik |
| { |
| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
/// <summary>
/// A <see cref="TokenFilter"/> that looks every input token up in a Morfologik dictionary and
/// replaces it with its lemma(s) plus morphosyntactic (POS) tags. Applies to Polish only.
/// <para/>
/// When a token has several lemmas, the first one is emitted in place of the token and the
/// remaining ones follow with a position increment of zero.
/// <para/>
/// The filter registers a <see cref="MorphosyntacticTagsAttribute"/> that carries the
/// morphosyntactic annotations of each produced lemma. See the Morfologik documentation for details.
/// </summary>
public class MorfologikFilter : TokenFilter
{
    /// <summary>
    /// Pattern used to cut a concatenated tag string into its parts (splits on <c>+</c> and <c>|</c>).
    /// </summary>
    private static readonly Regex lemmaSplitter = new Regex("\\+|\\|", RegexOptions.Compiled);

    // LUCENENET specific - do lowercasing in Polish culture
    private static readonly CultureInfo culture = new CultureInfo("pl");

    private readonly ICharTermAttribute termAtt;
    private readonly IMorphosyntacticTagsAttribute tagsAtt;
    private readonly IPositionIncrementAttribute posIncrAtt;
    private readonly IKeywordAttribute keywordAttr;

    // Reusable buffer that receives the lowercased form of a token.
    private readonly CharsRef scratch = new CharsRef();

    // Attribute state captured right after a successful lookup; restored before each extra lemma.
    private State current;
    private readonly TokenStream input;
    private readonly IStemmer stemmer;

    // Result of the most recent lookup and the position of the next lemma to emit from it.
    private IList<WordData> lemmaList;
    private int lemmaListIndex;

    // Pool of tag buffers reused from one lemma to the next.
    private readonly List<StringBuilder> tagsList = new List<StringBuilder>();

    /// <summary>
    /// Initializes the filter with the dictionary of a new <see cref="PolishStemmer"/>.
    /// </summary>
    /// <param name="input">Input token stream.</param>
    public MorfologikFilter(TokenStream input)
        : this(input, new PolishStemmer().Dictionary)
    {
    }

    /// <summary>
    /// Initializes the filter with an explicitly supplied dictionary.
    /// </summary>
    /// <param name="input">Input token stream.</param>
    /// <param name="dict"><see cref="Dictionary"/> to use for stemming.</param>
    public MorfologikFilter(TokenStream input, Dictionary dict)
        : base(input)
    {
        termAtt = AddAttribute<ICharTermAttribute>();
        tagsAtt = AddAttribute<IMorphosyntacticTagsAttribute>();
        posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
        keywordAttr = AddAttribute<IKeywordAttribute>();

        this.input = input;
        stemmer = new DictionaryLookup(dict);
        lemmaList = new List<WordData>();
    }

    /// <summary>
    /// Takes the next entry from <see cref="lemmaList"/>, writes its stem into the term attribute
    /// and publishes its split tags through the tags attribute.
    /// </summary>
    private void PopNextLemma()
    {
        // Each lemma carries a single concatenated tag string.
        WordData entry = lemmaList[lemmaListIndex++];
        termAtt.SetEmpty().Append(entry.GetStem().ToString());

        var rawTag = entry.GetTag();
        if (rawTag == null)
        {
            tagsAtt.Tags = Collections.EmptyList<StringBuilder>();
            return;
        }

        string[] parts = lemmaSplitter.Split(rawTag.ToString());

        // Make sure the pool holds one buffer per tag part before filling them.
        while (tagsList.Count < parts.Length)
        {
            tagsList.Add(new StringBuilder());
        }

        for (int slot = 0; slot < parts.Length; slot++)
        {
            StringBuilder target = tagsList[slot];
            target.Clear();
            target.Append(parts[slot]);
        }

        // Expose only the buffers that belong to this lemma.
        tagsAtt.Tags = tagsList.SubList(0, parts.Length);
    }

    /// <summary>
    /// Runs the stemmer on the given surface form, storing the outcome in
    /// <see cref="lemmaList"/> and rewinding <see cref="lemmaListIndex"/> to zero.
    /// </summary>
    /// <param name="token">Surface form to look up.</param>
    /// <returns><c>true</c> if the lookup produced at least one entry; otherwise <c>false</c>.</returns>
    private bool LookupSurfaceForm(string token)
    {
        lemmaListIndex = 0;
        lemmaList = stemmer.Lookup(token);
        return lemmaList.Count > 0;
    }

    /// <summary>
    /// Advances to the next output token: either a lemma still pending from the last lookup
    /// or the next token of the wrapped stream.
    /// </summary>
    /// <returns><c>true</c> if a token was produced; <c>false</c> once the input is exhausted.</returns>
    public override sealed bool IncrementToken()
    {
        // Lemmas left over from the previous input token are emitted first,
        // each one on top of the captured state with no position advance.
        if (lemmaListIndex < lemmaList.Count)
        {
            RestoreState(current);
            posIncrAtt.PositionIncrement = 0;
            PopNextLemma();
            return true;
        }

        if (!input.IncrementToken())
        {
            return false;
        }

        // Keyword tokens are never looked up; other tokens are tried verbatim first
        // and in lowercase only if the verbatim lookup finds nothing.
        bool foundLemmas = !keywordAttr.IsKeyword
            && (LookupSurfaceForm(termAtt.ToString())
                || LookupSurfaceForm(ToLowercase(termAtt.ToString())));

        if (foundLemmas)
        {
            current = CaptureState();
            PopNextLemma();
        }
        else
        {
            tagsAtt.Clear();
        }

        return true;
    }

    /// <summary>
    /// Builds a lowercase copy of the given text, one code point at a time, using the Polish
    /// culture and the shared <see cref="scratch"/> buffer.
    /// </summary>
    /// <param name="chs">Text to convert.</param>
    /// <returns>The lowercased text as a new string.</returns>
    private string ToLowercase(string chs)
    {
        int charCount = chs.Length;
        scratch.Length = charCount;
        scratch.Grow(charCount);

        char[] target = scratch.Chars;
        int pos = 0;
        while (pos < charCount)
        {
            // LUCENENET specific - need to use explicit culture to override current thread
            int lowered = Character.ToLower(Character.CodePointAt(chs, pos), culture);
            pos += Character.ToChars(lowered, target, pos);
        }

        return scratch.ToString();
    }

    /// <summary>
    /// Discards any pending lemmas and pooled tag buffers, then delegates to the base implementation.
    /// </summary>
    public override void Reset()
    {
        lemmaListIndex = 0;
        lemmaList = new List<WordData>();
        tagsList.Clear();
        base.Reset();
    }
}
| } |