| using Lucene.Net.Analysis; |
| using Lucene.Net.Analysis.TokenAttributes; |
| using Lucene.Net.QueryParsers.Classic; |
| using Lucene.Net.Search; |
| using Lucene.Net.Util; |
| using System; |
| using System.IO; |
| using System.Text; |
| using System.Text.RegularExpressions; |
| |
| namespace Lucene.Net.QueryParsers.Analyzing |
| { |
| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| /// <summary> |
| /// Overrides Lucene's default <see cref="QueryParser"/> so that fuzzy, prefix, range, and wildcard queries |
| /// are also passed through the given analyzer, but wildcard characters <c>*</c> and |
| /// <c>?</c> don't get removed from the search terms. |
| /// <para/> |
| /// <b>Warning:</b> This class should only be used with analyzers that do not use stopwords |
| /// or that add tokens. Also, several stemming analyzers are inappropriate: for example, <see cref="Analysis.De.GermanAnalyzer"/> |
| /// will turn <c>Häuser</c> into <c>hau</c>, but <c>H?user</c> will |
| /// become <c>h?user</c> when using this parser and thus no match would be found (i.e. |
| /// using this parser will be no improvement over QueryParser in such cases). |
| /// </summary> |
| public class AnalyzingQueryParser : Classic.QueryParser |
| { |
| // gobble escaped chars or find a wildcard character |
| private readonly Regex wildcardPattern = new Regex(@"(\\.)|([?*]+)", RegexOptions.Compiled); |
| |
| public AnalyzingQueryParser(LuceneVersion matchVersion, string field, Analyzer analyzer) |
| : base(matchVersion, field, analyzer) |
| { |
| AnalyzeRangeTerms = true; |
| } |
| |
| /// <summary> |
| /// Called when parser parses an input term |
| /// that uses prefix notation; that is, contains a single '*' wildcard |
| /// character as its last character. Since this is a special case |
| /// of generic wildcard term, and such a query can be optimized easily, |
| /// this usually results in a different query object. |
| /// <para/> |
| /// Depending on analyzer and settings, a prefix term may (most probably will) |
| /// be lower-cased automatically. It <b>will</b> go through the default Analyzer. |
| /// <para/> |
| /// Overrides super class, by passing terms through analyzer. |
| /// </summary> |
| /// <param name="field">Name of the field query will use.</param> |
| /// <param name="termStr">Term to use for building term for the query |
| /// (<b>without</b> trailing '*' character!)</param> |
| /// <returns>Resulting <see cref="Query"/> built for the term</returns> |
| protected internal override Query GetWildcardQuery(string field, string termStr) |
| { |
| if (termStr == null) |
| { |
| //can't imagine this would ever happen |
| throw new ParseException("Passed null value as term to GetWildcardQuery"); |
| } |
| if (!AllowLeadingWildcard && (termStr.StartsWith("*", StringComparison.Ordinal) || termStr.StartsWith("?", StringComparison.Ordinal))) |
| { |
| throw new ParseException("'*' or '?' not allowed as first character in WildcardQuery" |
| + " unless AllowLeadingWildcard returns true"); |
| } |
| |
| Match wildcardMatcher = wildcardPattern.Match(termStr); |
| StringBuilder sb = new StringBuilder(); |
| int last = 0; |
| |
| while (wildcardMatcher.Success) |
| { |
| // continue if escaped char |
| if (wildcardMatcher.Groups[1].Success) |
| { |
| wildcardMatcher = wildcardMatcher.NextMatch(); |
| continue; |
| } |
| |
| if (wildcardMatcher.Index > last) |
| { |
| string chunk = termStr.Substring(last, wildcardMatcher.Index - last); |
| string analyzed = AnalyzeSingleChunk(field, termStr, chunk); |
| sb.Append(analyzed); |
| } |
| |
| //append the wildcard character |
| sb.Append(wildcardMatcher.Groups[2]); |
| |
| last = wildcardMatcher.Index + wildcardMatcher.Length; |
| wildcardMatcher = wildcardMatcher.NextMatch(); |
| } |
| if (last < termStr.Length) |
| { |
| sb.Append(AnalyzeSingleChunk(field, termStr, termStr.Substring(last))); |
| } |
| return base.GetWildcardQuery(field, sb.ToString()); |
| } |
| |
| /// <summary> |
| /// Called when parser parses an input term |
| /// that uses prefix notation; that is, contains a single '*' wildcard |
| /// character as its last character. Since this is a special case |
| /// of generic wildcard term, and such a query can be optimized easily, |
| /// this usually results in a different query object. |
| /// <para/> |
| /// Depending on analyzer and settings, a prefix term may (most probably will) |
| /// be lower-cased automatically. It <b>will</b> go through the default Analyzer. |
| /// <para/> |
| /// Overrides super class, by passing terms through analyzer. |
| /// </summary> |
| /// <param name="field">Name of the field query will use.</param> |
| /// <param name="termStr">Term to use for building term for the query (<b>without</b> trailing '*' character!)</param> |
| /// <returns>Resulting <see cref="Query"/> built for the term</returns> |
| protected internal override Query GetPrefixQuery(string field, string termStr) |
| { |
| string analyzed = AnalyzeSingleChunk(field, termStr, termStr); |
| return base.GetPrefixQuery(field, analyzed); |
| } |
| |
| /// <summary> |
| /// Called when parser parses an input term that has the fuzzy suffix (~) appended. |
| /// <para/> |
| /// Depending on analyzer and settings, a fuzzy term may (most probably will) |
| /// be lower-cased automatically. It <b>will</b> go through the default Analyzer. |
| /// <para/> |
| /// Overrides super class, by passing terms through analyzer. |
| /// </summary> |
| /// <param name="field">Name of the field query will use.</param> |
| /// <param name="termStr">Term to use for building term for the query</param> |
| /// <param name="minSimilarity"></param> |
| /// <returns>Resulting <see cref="Query"/> built for the term</returns> |
| protected internal override Query GetFuzzyQuery(string field, string termStr, float minSimilarity) |
| { |
| string analyzed = AnalyzeSingleChunk(field, termStr, termStr); |
| return base.GetFuzzyQuery(field, analyzed, minSimilarity); |
| } |
| |
| /// <summary> |
| /// Returns the analyzed form for the given chunk. |
| /// |
| /// If the analyzer produces more than one output token from the given chunk, |
| /// a ParseException is thrown. |
| /// </summary> |
| /// <param name="field">The target field</param> |
| /// <param name="termStr">The full term from which the given chunk is excerpted</param> |
| /// <param name="chunk">The portion of the given termStr to be analyzed</param> |
| /// <returns>The result of analyzing the given chunk</returns> |
| /// <exception cref="ParseException">ParseException when analysis returns other than one output token</exception> |
| protected internal virtual string AnalyzeSingleChunk(string field, string termStr, string chunk) |
| { |
| string analyzed = null; |
| TokenStream stream = null; |
| try |
| { |
| stream = Analyzer.GetTokenStream(field, chunk); |
| stream.Reset(); |
| ICharTermAttribute termAtt = stream.GetAttribute<ICharTermAttribute>(); |
| // get first and hopefully only output token |
| if (stream.IncrementToken()) |
| { |
| analyzed = termAtt.ToString(); |
| |
| // try to increment again, there should only be one output token |
| StringBuilder multipleOutputs = null; |
| while (stream.IncrementToken()) |
| { |
| if (null == multipleOutputs) |
| { |
| multipleOutputs = new StringBuilder(); |
| multipleOutputs.Append('"'); |
| multipleOutputs.Append(analyzed); |
| multipleOutputs.Append('"'); |
| } |
| multipleOutputs.Append(','); |
| multipleOutputs.Append('"'); |
| multipleOutputs.Append(termAtt.ToString()); |
| multipleOutputs.Append('"'); |
| } |
| stream.End(); |
| if (null != multipleOutputs) |
| { |
| throw new ParseException( |
| string.Format(@"Analyzer created multiple terms for ""{0}"": {1}", chunk, multipleOutputs.ToString())); |
| } |
| } |
| else |
| { |
| // nothing returned by analyzer. Was it a stop word and the user accidentally |
| // used an analyzer with stop words? |
| stream.End(); |
| throw new ParseException(string.Format(@"Analyzer returned nothing for ""{0}""", chunk)); |
| } |
| } |
| catch (IOException /*e*/) |
| { |
| throw new ParseException( |
| string.Format(@"IO error while trying to analyze single term: ""{0}""", termStr)); |
| } |
| finally |
| { |
| IOUtils.DisposeWhileHandlingException(stream); |
| } |
| return analyzed; |
| } |
| } |
| } |