| using Lucene.Net.Analysis.TokenAttributes; |
| using System; |
| using System.IO; |
| using System.Text; |
| using System.Text.RegularExpressions; |
| |
| namespace Lucene.Net.Analysis.Pattern |
| { |
| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| /// <summary> |
| /// This tokenizer uses regex pattern matching to construct distinct tokens |
| /// for the input stream. It takes two arguments: "pattern" and "group". |
| /// <para/> |
| /// <list type="bullet"> |
| /// <item><description>"pattern" is the regular expression.</description></item> |
| /// <item><description>"group" says which group to extract into tokens.</description></item> |
| /// </list> |
| /// <para> |
| /// group=-1 (the default) is equivalent to "split". In this case, the tokens will |
| /// be equivalent to the output from (without empty tokens): |
| /// <see cref="Regex.Replace(string, string)"/> |
| /// </para> |
| /// <para> |
| /// Using group >= 0 selects the matching group as the token. For example, if you have:<br/> |
| /// <code> |
| /// pattern = \'([^\']+)\' |
| /// group = 0 |
| /// input = aaa 'bbb' 'ccc' |
| /// </code> |
| /// the output will be two tokens: 'bbb' and 'ccc' (including the ' marks). With the same input |
| /// but using group=1, the output would be: bbb and ccc (no ' marks) |
| /// </para> |
| /// <para>NOTE: This <see cref="Tokenizer"/> does not output tokens that are of zero length.</para> |
| /// </summary> |
| /// <seealso cref="Regex"/> |
| public sealed class PatternTokenizer : Tokenizer |
| { |
| private readonly ICharTermAttribute termAtt; |
| private readonly IOffsetAttribute offsetAtt; |
| |
| private readonly StringBuilder str = new StringBuilder(); |
| private int index; |
| private bool isReset = false; |
| |
| private readonly int group; |
| private Match matcher; |
| private readonly Regex pattern; |
| |
| /// <summary> |
| /// creates a new <see cref="PatternTokenizer"/> returning tokens from group (-1 for split functionality) </summary> |
| public PatternTokenizer(TextReader input, Regex pattern, int group) |
| : this(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, input, pattern, group) |
| { |
| } |
| |
| /// <summary> |
| /// creates a new <see cref="PatternTokenizer"/> returning tokens from group (-1 for split functionality) </summary> |
| public PatternTokenizer(AttributeFactory factory, TextReader input, Regex pattern, int group) |
| : base(factory, input) |
| { |
| this.termAtt = AddAttribute<ICharTermAttribute>(); |
| this.offsetAtt = AddAttribute<IOffsetAttribute>(); |
| this.group = group; |
| |
| // Use "" instead of str so don't consume chars |
| // (fillBuffer) from the input on throwing IAE below: |
| this.matcher = pattern.Match(""); |
| this.pattern = pattern; |
| |
| // confusingly group count depends ENTIRELY on the pattern but is only accessible via matcher |
| var groupCount = pattern.GetGroupNumbers().Length; |
| if (group >= 0 && group > groupCount) |
| { |
| throw new ArgumentException("invalid group specified: pattern only has: " + groupCount + " capturing groups"); |
| } |
| |
| } |
| |
| public override bool IncrementToken() |
| { |
| if (index >= str.Length) |
| { |
| return false; |
| } |
| ClearAttributes(); |
| |
| if (group >= 0) |
| { |
| |
| // match a specific group |
| if (matcher.Success) |
| { |
| do |
| { |
| // We have alredy parsed from this index, go to the next token. |
| if (!isReset && matcher.Groups[group].Index == index) |
| { |
| continue; |
| } |
| isReset = false; |
| |
| index = matcher.Groups[group].Index; |
| int endIndex = matcher.Groups[group].Index + matcher.Groups[group].Length; |
| |
| if (index == endIndex) |
| { |
| continue; |
| } |
| |
| termAtt.SetEmpty().Append(str.ToString(), index, endIndex - index); // LUCENENET: Corrected 3rd parameter |
| offsetAtt.SetOffset(CorrectOffset(index), CorrectOffset(endIndex)); |
| return true; |
| |
| } while ((matcher = matcher.NextMatch()).Success); |
| } |
| |
| |
| index = int.MaxValue; // mark exhausted |
| return false; |
| |
| } |
| else |
| { |
| |
| // String.split() functionality |
| if (matcher.Success) |
| { |
| do |
| { |
| if (matcher.Index - index > 0) |
| { |
| // found a non-zero-length token |
| termAtt.SetEmpty().Append(str.ToString(), index, matcher.Index - index); // LUCENENET: Corrected 3rd parameter |
| offsetAtt.SetOffset(CorrectOffset(index), CorrectOffset(matcher.Index)); |
| index = matcher.Index + matcher.Length; |
| return true; |
| } |
| |
| isReset = false; |
| index = matcher.Index + matcher.Length; |
| } while ((matcher = matcher.NextMatch()).Success); |
| } |
| |
| if (str.Length - index == 0) |
| { |
| index = int.MaxValue; // mark exhausted |
| return false; |
| } |
| |
| termAtt.SetEmpty().Append(str.ToString(), index, str.Length - index); // LUCENENET: Corrected 3rd parameter |
| offsetAtt.SetOffset(CorrectOffset(index), CorrectOffset(str.Length)); |
| index = int.MaxValue; // mark exhausted |
| return true; |
| } |
| } |
| |
| public override void End() |
| { |
| base.End(); |
| int ofs = CorrectOffset(str.Length); |
| offsetAtt.SetOffset(ofs, ofs); |
| } |
| |
| public override void Reset() |
| { |
| base.Reset(); |
| FillBuffer(str, m_input); |
| |
| // LUCENENET: Since we need to "reset" the Match |
| // object, we also need an "isReset" flag to indicate |
| // whether we are at the head of the match and to |
| // take the appropriate measures to ensure we don't |
| // overwrite our matcher variable with |
| // matcher = matcher.NextMatch(); |
| // before it is time. A string could potentially |
| // match on index 0, so we need another variable to |
| // manage this state. |
| matcher = pattern.Match(str.ToString()); |
| isReset = true; |
| index = 0; |
| } |
| |
| // TODO: we should see if we can make this tokenizer work without reading |
| // the entire document into RAM, perhaps with Matcher.hitEnd/requireEnd ? |
| private readonly char[] buffer = new char[8192]; |
| |
| private void FillBuffer(StringBuilder sb, TextReader input) |
| { |
| int len; |
| sb.Length = 0; |
| while ((len = input.Read(buffer, 0, buffer.Length)) > 0) |
| { |
| sb.Append(buffer, 0, len); |
| } |
| } |
| } |
| } |