// blob: 2f31efa51d66f842784258779dd1e20d3fdfe24d [file] [log] [blame]
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.TokenAttributes;
using Lucene.Net.QueryParsers.Classic;
using Lucene.Net.Search;
using Lucene.Net.Util;
using System;
using System.IO;
using System.Text;
using System.Text.RegularExpressions;
namespace Lucene.Net.QueryParsers.Analyzing
{
    /*
     * Licensed to the Apache Software Foundation (ASF) under one or more
     * contributor license agreements.  See the NOTICE file distributed with
     * this work for additional information regarding copyright ownership.
     * The ASF licenses this file to You under the Apache License, Version 2.0
     * (the "License"); you may not use this file except in compliance with
     * the License.  You may obtain a copy of the License at
     *
     *     http://www.apache.org/licenses/LICENSE-2.0
     *
     * Unless required by applicable law or agreed to in writing, software
     * distributed under the License is distributed on an "AS IS" BASIS,
     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     * See the License for the specific language governing permissions and
     * limitations under the License.
     */

    /// <summary>
    /// Overrides Lucene's default <see cref="QueryParser"/> so that Fuzzy-, Prefix-, Range-, and WildcardQuerys
    /// are also passed through the given analyzer, but wildcard characters <c>*</c> and
    /// <c>?</c> don't get removed from the search terms.
    /// <para/>
    /// <b>Warning:</b> This class should only be used with analyzers that do not use stopwords
    /// or that add tokens. Also, several stemming analyzers are inappropriate: for example, <see cref="Analysis.De.GermanAnalyzer"/>
    /// will turn <c>Häuser</c> into <c>hau</c>, but <c>H?user</c> will
    /// become <c>h?user</c> when using this parser and thus no match would be found (i.e.
    /// using this parser will be no improvement over QueryParser in such cases).
    /// </summary>
    public class AnalyzingQueryParser : Classic.QueryParser
    {
        // Group 1 gobbles an escaped character (backslash + any char) so that it is
        // never mistaken for a wildcard; group 2 captures a run of wildcard characters.
        // Static: the compiled pattern is immutable and can be shared by all parsers.
        private static readonly Regex wildcardPattern = new Regex(@"(\\.)|([?*]+)", RegexOptions.Compiled);

        /// <summary>
        /// Creates a parser for the given default field and analyzer and sets
        /// <c>AnalyzeRangeTerms</c> to <c>true</c>.
        /// </summary>
        /// <param name="matchVersion">Lucene compatibility version handed to the base parser.</param>
        /// <param name="field">Default field for query terms.</param>
        /// <param name="analyzer">Analyzer used for terms and for the chunks analyzed by this class.</param>
        public AnalyzingQueryParser(LuceneVersion matchVersion, string field, Analyzer analyzer)
            : base(matchVersion, field, analyzer)
        {
            AnalyzeRangeTerms = true;
        }

        /// <summary>
        /// Called when parser parses an input term that contains one or more wildcard
        /// characters (like <c>*</c> or <c>?</c>), but is not a simple prefix term.
        /// <para/>
        /// The term is split at every run of unescaped wildcard characters. Each piece of
        /// text between wildcards is passed through <see cref="AnalyzeSingleChunk"/>, while
        /// the wildcard runs themselves are copied to the result unchanged. Escaped
        /// characters (a backslash followed by any character) are not treated as wildcards
        /// and stay part of the surrounding chunk.
        /// <para/>
        /// Overrides super class, by passing terms through analyzer.
        /// </summary>
        /// <param name="field">Name of the field query will use.</param>
        /// <param name="termStr">Term that contains one or more wildcard
        /// characters (<c>?</c> or <c>*</c>), but is not simple prefix term</param>
        /// <returns>Resulting <see cref="Query"/> built for the term</returns>
        /// <exception cref="ParseException">If <paramref name="termStr"/> is <c>null</c>, if it starts
        /// with a wildcard while <c>AllowLeadingWildcard</c> is <c>false</c>, or if a chunk
        /// cannot be analyzed into exactly one token.</exception>
        protected internal override Query GetWildcardQuery(string field, string termStr)
        {
            if (termStr == null)
            {
                //can't imagine this would ever happen
                throw new ParseException("Passed null value as term to GetWildcardQuery");
            }
            if (!AllowLeadingWildcard && (termStr.StartsWith("*", StringComparison.Ordinal) || termStr.StartsWith("?", StringComparison.Ordinal)))
            {
                throw new ParseException("'*' or '?' not allowed as first character in WildcardQuery"
                                        + " unless AllowLeadingWildcard returns true");
            }

            Match wildcardMatcher = wildcardPattern.Match(termStr);
            StringBuilder sb = new StringBuilder();
            int last = 0; // index just past the previous wildcard run

            while (wildcardMatcher.Success)
            {
                // continue if escaped char; 'last' is deliberately not advanced so the
                // escaped sequence remains inside the next chunk handed to the analyzer
                if (wildcardMatcher.Groups[1].Success)
                {
                    wildcardMatcher = wildcardMatcher.NextMatch();
                    continue;
                }

                // analyze the text between the previous wildcard run and this one
                if (wildcardMatcher.Index > last)
                {
                    string chunk = termStr.Substring(last, wildcardMatcher.Index - last);
                    string analyzed = AnalyzeSingleChunk(field, termStr, chunk);
                    sb.Append(analyzed);
                }

                //append the wildcard character
                sb.Append(wildcardMatcher.Groups[2].Value);

                last = wildcardMatcher.Index + wildcardMatcher.Length;
                wildcardMatcher = wildcardMatcher.NextMatch();
            }

            // trailing text after the last wildcard run
            if (last < termStr.Length)
            {
                sb.Append(AnalyzeSingleChunk(field, termStr, termStr.Substring(last)));
            }
            return base.GetWildcardQuery(field, sb.ToString());
        }

        /// <summary>
        /// Called when parser parses an input term
        /// that uses prefix notation; that is, contains a single '*' wildcard
        /// character as its last character. Since this is a special case
        /// of generic wildcard term, and such a query can be optimized easily,
        /// this usually results in a different query object.
        /// <para/>
        /// Depending on analyzer and settings, a prefix term may (most probably will)
        /// be lower-cased automatically. It <b>will</b> go through the default Analyzer.
        /// <para/>
        /// Overrides super class, by passing terms through analyzer.
        /// </summary>
        /// <param name="field">Name of the field query will use.</param>
        /// <param name="termStr">Term to use for building term for the query (<b>without</b> trailing '*' character!)</param>
        /// <returns>Resulting <see cref="Query"/> built for the term</returns>
        /// <exception cref="ParseException">If the analyzer does not produce exactly one token for the term.</exception>
        protected internal override Query GetPrefixQuery(string field, string termStr)
        {
            string analyzed = AnalyzeSingleChunk(field, termStr, termStr);
            return base.GetPrefixQuery(field, analyzed);
        }

        /// <summary>
        /// Called when parser parses an input term that has the fuzzy suffix (~) appended.
        /// <para/>
        /// Depending on analyzer and settings, a fuzzy term may (most probably will)
        /// be lower-cased automatically. It <b>will</b> go through the default Analyzer.
        /// <para/>
        /// Overrides super class, by passing terms through analyzer.
        /// </summary>
        /// <param name="field">Name of the field query will use.</param>
        /// <param name="termStr">Term to use for building term for the query</param>
        /// <param name="minSimilarity">Minimum similarity, passed unchanged to the base implementation.</param>
        /// <returns>Resulting <see cref="Query"/> built for the term</returns>
        /// <exception cref="ParseException">If the analyzer does not produce exactly one token for the term.</exception>
        protected internal override Query GetFuzzyQuery(string field, string termStr, float minSimilarity)
        {
            string analyzed = AnalyzeSingleChunk(field, termStr, termStr);
            return base.GetFuzzyQuery(field, analyzed, minSimilarity);
        }

        /// <summary>
        /// Returns the analyzed form for the given chunk.
        /// If the analyzer produces more than one output token from the given chunk,
        /// a ParseException is thrown.
        /// </summary>
        /// <param name="field">The target field</param>
        /// <param name="termStr">The full term from which the given chunk is excerpted</param>
        /// <param name="chunk">The portion of the given termStr to be analyzed</param>
        /// <returns>The result of analyzing the given chunk</returns>
        /// <exception cref="ParseException">ParseException when analysis returns other than one output token</exception>
        protected internal virtual string AnalyzeSingleChunk(string field, string termStr, string chunk)
        {
            string analyzed = null;
            TokenStream stream = null;
            try
            {
                stream = Analyzer.GetTokenStream(field, chunk);
                // a token stream must be reset before the first IncrementToken() call
                stream.Reset();
                ICharTermAttribute termAtt = stream.GetAttribute<ICharTermAttribute>();

                // get first and hopefully only output token
                if (stream.IncrementToken())
                {
                    analyzed = termAtt.ToString();

                    // try to increment again, there should only be one output token
                    StringBuilder multipleOutputs = null;
                    while (stream.IncrementToken())
                    {
                        if (null == multipleOutputs)
                        {
                            // lazily start the report with the first token already read
                            multipleOutputs = new StringBuilder();
                            multipleOutputs.Append('"');
                            multipleOutputs.Append(analyzed);
                            multipleOutputs.Append('"');
                        }
                        multipleOutputs.Append(',');
                        multipleOutputs.Append('"');
                        multipleOutputs.Append(termAtt.ToString());
                        multipleOutputs.Append('"');
                    }
                    stream.End();

                    if (null != multipleOutputs)
                    {
                        throw new ParseException(
                            string.Format(@"Analyzer created multiple terms for ""{0}"": {1}", chunk, multipleOutputs.ToString()));
                    }
                }
                else
                {
                    // nothing returned by analyzer.  Was it a stop word and the user accidentally
                    // used an analyzer with stop words?
                    stream.End();
                    throw new ParseException(string.Format(@"Analyzer returned nothing for ""{0}""", chunk));
                }
            }
            catch (IOException /*e*/)
            {
                throw new ParseException(
                    string.Format(@"IO error while trying to analyze single term: ""{0}""", termStr));
            }
            finally
            {
                // always release the stream; suppress dispose failures so they cannot
                // mask a ParseException that is already propagating
                IOUtils.DisposeWhileHandlingException(stream);
            }
            return analyzed;
        }
    }
}