| using Lucene.Net.Analysis; |
| using Lucene.Net.Analysis.TokenAttributes; |
| using Lucene.Net.QueryParsers.Classic; |
| using Lucene.Net.Search; |
| using Lucene.Net.Util; |
| using System; |
| using System.IO; |
| using System.Text; |
| using System.Text.RegularExpressions; |
| |
| namespace Lucene.Net.QueryParsers.Analyzing |
| { |
| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| /// <summary> |
| /// Overrides Lucene's default <see cref="QueryParser"/> so that fuzzy, prefix, range, and wildcard queries |
| /// are also passed through the given analyzer, but wildcard characters <c>*</c> and |
| /// <c>?</c> don't get removed from the search terms. |
| /// <para/> |
| /// <b>Warning:</b> This class should only be used with analyzers that do not use stopwords |
| /// or that add tokens. Also, several stemming analyzers are inappropriate: for example, <see cref="Analysis.De.GermanAnalyzer"/> |
| /// will turn <c>Häuser</c> into <c>hau</c>, but <c>H?user</c> will |
| /// become <c>h?user</c> when using this parser and thus no match would be found (i.e. |
| /// using this parser will be no improvement over QueryParser in such cases). |
| /// </summary> |
| public class AnalyzingQueryParser : Classic.QueryParser |
| { |
| // gobble escaped chars or find a wildcard character |
| private readonly Regex wildcardPattern = new Regex(@"(\\.)|([?*]+)", RegexOptions.Compiled); |
| |
| public AnalyzingQueryParser(LuceneVersion matchVersion, string field, Analyzer analyzer) |
| : base(matchVersion, field, analyzer) |
| { |
| AnalyzeRangeTerms = true; |
| } |
| |
| /// <summary> |
| /// Called when parser parses an input term |
| /// that uses prefix notation; that is, contains a single '*' wildcard |
| /// character as its last character. Since this is a special case |
| /// of generic wildcard term, and such a query can be optimized easily, |
| /// this usually results in a different query object. |
| /// <para/> |
| /// Depending on analyzer and settings, a prefix term may (most probably will) |
| /// be lower-cased automatically. It <b>will</b> go through the default Analyzer. |
| /// <para/> |
| /// Overrides super class, by passing terms through analyzer. |
| /// </summary> |
| /// <param name="field">Name of the field query will use.</param> |
| /// <param name="termStr">Term to use for building term for the query |
| /// (<b>without</b> trailing '*' character!)</param> |
| /// <returns>Resulting <see cref="Query"/> built for the term</returns> |
| protected internal override Query GetWildcardQuery(string field, string termStr) |
| { |
| if (termStr == null) |
| { |
| //can't imagine this would ever happen |
| throw new ParseException("Passed null value as term to GetWildcardQuery"); |
| } |
| if (!AllowLeadingWildcard && (termStr.StartsWith("*", StringComparison.Ordinal) || termStr.StartsWith("?", StringComparison.Ordinal))) |
| { |
| throw new ParseException("'*' or '?' not allowed as first character in WildcardQuery" |
| + " unless AllowLeadingWildcard returns true"); |
| } |
| |
| Match wildcardMatcher = wildcardPattern.Match(termStr); |
| StringBuilder sb = new StringBuilder(); |
| int last = 0; |
| |
| while (wildcardMatcher.Success) |
| { |
| // continue if escaped char |
| if (wildcardMatcher.Groups[1].Success) |
| { |
| wildcardMatcher = wildcardMatcher.NextMatch(); |
| continue; |
| } |
| |
| if (wildcardMatcher.Index > last) |
| { |
| string chunk = termStr.Substring(last, wildcardMatcher.Index - last); |
| string analyzed = AnalyzeSingleChunk(field, termStr, chunk); |
| sb.Append(analyzed); |
| } |
| |
| //append the wildcard character |
| sb.Append(wildcardMatcher.Groups[2]); |
| |
| last = wildcardMatcher.Index + wildcardMatcher.Length; |
| wildcardMatcher = wildcardMatcher.NextMatch(); |
| } |
| if (last < termStr.Length) |
| { |
| sb.Append(AnalyzeSingleChunk(field, termStr, termStr.Substring(last))); |
| } |
| return base.GetWildcardQuery(field, sb.ToString()); |
| } |
| |
| /// <summary> |
| /// Called when parser parses an input term |
| /// that uses prefix notation; that is, contains a single '*' wildcard |
| /// character as its last character. Since this is a special case |
| /// of generic wildcard term, and such a query can be optimized easily, |
| /// this usually results in a different query object. |
| /// <para/> |
| /// Depending on analyzer and settings, a prefix term may (most probably will) |
| /// be lower-cased automatically. It <b>will</b> go through the default Analyzer. |
| /// <para/> |
| /// Overrides super class, by passing terms through analyzer. |
| /// </summary> |
| /// <param name="field">Name of the field query will use.</param> |
| /// <param name="termStr">Term to use for building term for the query (<b>without</b> trailing '*' character!)</param> |
| /// <returns>Resulting <see cref="Query"/> built for the term</returns> |
| protected internal override Query GetPrefixQuery(string field, string termStr) |
| { |
| string analyzed = AnalyzeSingleChunk(field, termStr, termStr); |
| return base.GetPrefixQuery(field, analyzed); |
| } |
| |
| /// <summary> |
| /// Called when parser parses an input term that has the fuzzy suffix (~) appended. |
| /// <para/> |
| /// Depending on analyzer and settings, a fuzzy term may (most probably will) |
| /// be lower-cased automatically. It <b>will</b> go through the default Analyzer. |
| /// <para/> |
| /// Overrides super class, by passing terms through analyzer. |
| /// </summary> |
| /// <param name="field">Name of the field query will use.</param> |
| /// <param name="termStr">Term to use for building term for the query</param> |
| /// <param name="minSimilarity"></param> |
| /// <returns>Resulting <see cref="Query"/> built for the term</returns> |
| protected internal override Query GetFuzzyQuery(string field, string termStr, float minSimilarity) |
| { |
| string analyzed = AnalyzeSingleChunk(field, termStr, termStr); |
| return base.GetFuzzyQuery(field, analyzed, minSimilarity); |
| } |
| |
| /// <summary> |
| /// Returns the analyzed form for the given chunk. |
| /// |
| /// If the analyzer produces more than one output token from the given chunk, |
| /// a ParseException is thrown. |
| /// </summary> |
| /// <param name="field">The target field</param> |
| /// <param name="termStr">The full term from which the given chunk is excerpted</param> |
| /// <param name="chunk">The portion of the given termStr to be analyzed</param> |
| /// <returns>The result of analyzing the given chunk</returns> |
| /// <exception cref="ParseException">ParseException when analysis returns other than one output token</exception> |
| protected internal virtual string AnalyzeSingleChunk(string field, string termStr, string chunk) |
| { |
| string analyzed = null; |
| TokenStream stream = null; |
| try |
| { |
| stream = Analyzer.GetTokenStream(field, chunk); |
| stream.Reset(); |
| ICharTermAttribute termAtt = stream.GetAttribute<ICharTermAttribute>(); |
| // get first and hopefully only output token |
| if (stream.IncrementToken()) |
| { |
| analyzed = termAtt.ToString(); |
| |
| // try to increment again, there should only be one output token |
| StringBuilder multipleOutputs = null; |
| while (stream.IncrementToken()) |
| { |
| if (null == multipleOutputs) |
| { |
| multipleOutputs = new StringBuilder(); |
| multipleOutputs.Append('"'); |
| multipleOutputs.Append(analyzed); |
| multipleOutputs.Append('"'); |
| } |
| multipleOutputs.Append(','); |
| multipleOutputs.Append('"'); |
| multipleOutputs.Append(termAtt.ToString()); |
| multipleOutputs.Append('"'); |
| } |
| stream.End(); |
| if (null != multipleOutputs) |
| { |
| throw new ParseException( |
| string.Format(@"Analyzer created multiple terms for ""{0}"": {1}", chunk, multipleOutputs.ToString())); |
| } |
| } |
| else |
| { |
| // nothing returned by analyzer. Was it a stop word and the user accidentally |
| // used an analyzer with stop words? |
| stream.End(); |
| throw new ParseException(string.Format(@"Analyzer returned nothing for ""{0}""", chunk)); |
| } |
| } |
| catch (IOException /*e*/) |
| { |
| throw new ParseException( |
| string.Format(@"IO error while trying to analyze single term: ""{0}""", termStr)); |
| } |
| finally |
| { |
| IOUtils.DisposeWhileHandlingException(stream); |
| } |
| return analyzed; |
| } |
| } |
| } |