| using J2N; |
| using J2N.Numerics; |
| using Lucene.Net.Analysis; |
| using Lucene.Net.Analysis.TokenAttributes; |
| using Lucene.Net.Documents; |
| using Lucene.Net.Index; |
| using Lucene.Net.QueryParsers.Flexible.Standard; |
| using Lucene.Net.Search; |
| using Lucene.Net.Util; |
| using System; |
| using System.Collections.Generic; |
| using System.Globalization; |
| using System.IO; |
| #if FEATURE_SERIALIZABLE_EXCEPTIONS |
| using System.Runtime.Serialization; |
| #endif |
| using System.Text; |
| |
| namespace Lucene.Net.QueryParsers.Classic |
| { |
| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| // LUCENENET specific: In Java, this was part of the QueryParser class, |
| // but it has been de-nested to make usage syntax shorter. |
| |
/// <summary>
/// Boolean operator the parser applies to terms that carry no explicit operator.
/// Use <see cref="QueryParserBase.DefaultOperator"/> to change it.
/// </summary>
public enum Operator
{
    /// <summary>Terms without any modifiers are considered optional.</summary>
    OR = 0,

    /// <summary>Terms without any modifiers are considered to be in conjunction.</summary>
    AND = 1
}
| |
| /// <summary> |
| /// This class is overridden by <see cref="QueryParser"/>. |
| /// </summary> |
| public abstract class QueryParserBase : QueryBuilder, ICommonQueryParserConfiguration |
| { |
/// <summary>
/// Do not catch this exception in your code, it means you are using methods that you should no longer use.
/// </summary>
// LUCENENET: It is no longer good practice to use binary serialization.
// See: https://github.com/dotnet/corefx/issues/23584#issuecomment-325724568
#if FEATURE_SERIALIZABLE_EXCEPTIONS
[Serializable]
#endif
public class MethodRemovedUseAnother : Exception
{
    /// <summary>
    /// Initializes a new instance of this class with the default exception message.
    /// </summary>
    public MethodRemovedUseAnother()
        : base()
    {
    }

#if FEATURE_SERIALIZABLE_EXCEPTIONS
    /// <summary>
    /// Initializes a new instance of this class with serialized data.
    /// </summary>
    /// <param name="info">The <see cref="SerializationInfo"/> that holds the serialized object data about the exception being thrown.</param>
    /// <param name="context">The <see cref="StreamingContext"/> that contains contextual information about the source or destination.</param>
    protected MethodRemovedUseAnother(SerializationInfo info, StreamingContext context)
        : base(info, context)
    {
    }
#endif
}
| |
// Conjunction codes; AddClause compares its "conj" argument against these.
protected const int CONJ_NONE = 0;
protected const int CONJ_AND = 1;
protected const int CONJ_OR = 2;

// Modifier codes; AddClause compares its "mods" argument against these.
protected const int MOD_NONE = 0;
protected const int MOD_NOT = 10;
protected const int MOD_REQ = 11;


// The two aliases below make the operator values reachable through the parser
// class itself, so callers can set the default operator without naming the
// Operator enum.

/// <summary>
/// Alternative form of <see cref="Operator.AND"/>
/// </summary>
public const Operator AND_OPERATOR = Operator.AND;

/// <summary>
/// Alternative form of <see cref="Operator.OR"/>
/// </summary>
public const Operator OR_OPERATOR = Operator.OR;
| |
| ///// <summary> |
| ///// The actual operator that parser uses to combine query terms |
| ///// </summary> |
| //Operator operator_Renamed = OR_OPERATOR; |
| |
| |
| //bool lowercaseExpandedTerms = true; |
| //MultiTermQuery.RewriteMethod multiTermRewriteMethod = MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT; |
| //bool allowLeadingWildcard = false; |
| |
/// <summary>
/// Default field for query terms; assigned in Init and exposed through the Field property.
/// </summary>
protected string m_field;
//int phraseSlop = 0;
//float fuzzyMinSim = FuzzyQuery.DefaultMinSimilarity;
//int fuzzyPrefixLength = FuzzyQuery.DefaultPrefixLength;

// LUCENENET NOTE: null indicates read CultureInfo.CurrentCulture on the fly
private CultureInfo locale;

// LUCENENET NOTE: null indicates read TimeZoneInfo.Local on the fly
private TimeZoneInfo timeZone;

// TODO: Work out what the default date resolution SHOULD be (was null in Java, which isn't valid for an enum type)

/// <summary>
/// Date resolution used for fields that have no field-specific resolution.
/// </summary>
private DateTools.Resolution dateResolution = DateTools.Resolution.DAY;

/// <summary>
/// Maps field names to date resolutions; stays <c>null</c> until the first
/// field-specific resolution is registered.
/// </summary>
private IDictionary<string, DateTools.Resolution> fieldToDateResolution;

/// <summary>
/// Whether or not to analyze range terms when constructing RangeQuerys
/// (For example, analyzing terms into collation keys for locale-sensitive RangeQuery)
/// </summary>
private bool analyzeRangeTerms;
| |
/// <summary>
/// Parameterless constructor, present so the generated QueryParser(CharStream) won't error out.
/// Hands a <c>null</c> analyzer to the base class and applies the property defaults.
/// </summary>
protected QueryParserBase()
    : base(null)
{
    // Operator and multi-term defaults.
    DefaultOperator = OR_OPERATOR;
    LowercaseExpandedTerms = true;
    MultiTermRewriteMethod = MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT;
    AllowLeadingWildcard = false;

    // Phrase and fuzzy defaults.
    PhraseSlop = 0;
#pragma warning disable 612, 618
    FuzzyMinSim = FuzzyQuery.DefaultMinSimilarity;
#pragma warning restore 612, 618
    FuzzyPrefixLength = FuzzyQuery.DefaultPrefixLength;
}
| |
/// <summary>
/// Initializes a query parser. Called by the QueryParser constructor.
/// Stores the analyzer and the default field, then derives
/// <see cref="AutoGeneratePhraseQueries"/> from <paramref name="matchVersion"/>.
/// </summary>
/// <param name="matchVersion">Lucene version to match.</param>
/// <param name="f">the default field for query terms.</param>
/// <param name="a">used to find terms in the query text.</param>
public virtual void Init(LuceneVersion matchVersion, string f, Analyzer a)
{
    Analyzer = a;
    m_field = f;

    // Phrase queries are auto-generated only when matching a version older than 3.1.
#pragma warning disable 612, 618
    bool isLucene31OrLater = matchVersion.OnOrAfter(LuceneVersion.LUCENE_31);
#pragma warning restore 612, 618
    AutoGeneratePhraseQueries = !isLucene31OrLater;
}
| |
// the generated parser will create these in QueryParser

/// <summary>
/// Re-initializes the parser to read from <paramref name="stream"/>.
/// <see cref="Parse(string)"/> calls this before it parses a query string.
/// </summary>
/// <param name="stream">character stream supplying the query text.</param>
public abstract void ReInit(ICharStream stream);

/// <summary>
/// Parses a query followed by the end-of-input (EOF).
/// <see cref="Parse(string)"/> calls this with the default field.
/// </summary>
/// <param name="field">field to use for the query terms.</param>
public abstract Query TopLevelQuery(string field);
| |
/// <summary>
/// Parses a query string, returning a <see cref="Query"/>.
/// An empty boolean query is returned when the parser produces no query at all.
/// </summary>
/// <param name="query">the query string to be parsed.</param>
/// <exception cref="ParseException">if the parsing fails</exception>
public virtual Query Parse(string query)
{
    // Every failure is rethrown as a ParseException that includes the original query text.
    ParseException Wrap(string reason, Exception cause)
    {
        return new ParseException("Cannot parse '" + query + "': " + reason, cause);
    }

    ReInit(new FastCharStream(new StringReader(query)));
    try
    {
        // TopLevelQuery is a Query followed by the end-of-input (EOF)
        Query parsed = TopLevelQuery(m_field);
        if (parsed != null)
        {
            return parsed;
        }
        return NewBooleanQuery(false);
    }
    catch (ParseException parseFailure)
    {
        throw Wrap(parseFailure.Message, parseFailure);
    }
    catch (TokenMgrError tokenFailure)
    {
        throw Wrap(tokenFailure.Message, tokenFailure);
    }
    catch (BooleanQuery.TooManyClausesException tooManyClauses)
    {
        throw Wrap("too many boolean clauses", tooManyClauses);
    }
}
| |
/// <summary>
/// Gets the default field for query terms.
/// </summary>
public virtual string Field => m_field;
| |
/// <summary>
/// When <c>true</c>, phrase queries are generated automatically whenever
/// the analyzer returns more than one term from whitespace delimited text.
/// NOTE: this behavior may not be suitable for all languages.
/// <para/>
/// When <c>false</c>, phrase queries are only generated for text
/// surrounded by double quotes.
/// </summary>
public bool AutoGeneratePhraseQueries { get; set; }

/// <summary>
/// Gets or sets the minimum similarity for fuzzy queries.
/// Default is 2f.
/// </summary>
public virtual float FuzzyMinSim { get; set; }

/// <summary>
/// Gets or sets the prefix length for fuzzy queries.
/// Default is 0.
/// </summary>
public virtual int FuzzyPrefixLength { get; set; }

/// <summary>
/// Gets or sets the default slop for phrases.
/// A value of zero requires exact phrase matches.
/// Default value is zero.
/// </summary>
public virtual int PhraseSlop { get; set; }

/// <summary>
/// Set to <c>true</c> to allow leading wildcard characters.
/// <para/>
/// When set, <c>*</c> or <c>?</c> are allowed as
/// the first character of a PrefixQuery and WildcardQuery.
/// Note that this can produce very slow
/// queries on big indexes.
/// <para/>
/// Default: false.
/// </summary>
public virtual bool AllowLeadingWildcard { get; set; }

/// <summary>
/// Gets or sets the boolean operator of the QueryParser.
/// In default mode (<see cref="OR_OPERATOR"/>) terms without any modifiers
/// are considered optional: for example <c>capital of Hungary</c> is equal to
/// <c>capital OR of OR Hungary</c>.
/// <para/>
/// In <see cref="AND_OPERATOR"/> mode terms are considered to be in conjunction: the
/// above mentioned query is parsed as <c>capital AND of AND Hungary</c>
/// </summary>
public virtual Operator DefaultOperator { get; set; }

/// <summary>
/// Whether terms of wildcard, prefix, fuzzy and range queries are to be automatically
/// lower-cased or not. Default is <c>true</c>.
/// </summary>
public virtual bool LowercaseExpandedTerms { get; set; }

/// <summary>
/// Gets or sets the rewrite method applied to the <see cref="PrefixQuery"/>,
/// <see cref="WildcardQuery"/>, <see cref="RegexpQuery"/> and <see cref="TermRangeQuery"/>
/// instances this parser creates.
/// By default QueryParser uses <see cref="MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT"/>.
/// This implementation is generally preferable because it
/// a) Runs faster b) Does not have the scarcity of terms unduly influence score
/// c) avoids any <see cref="BooleanQuery.TooManyClausesException"/> exception.
/// However, if your application really needs to use the
/// old-fashioned <see cref="BooleanQuery"/> expansion rewriting and the above
/// points are not relevant then use this to change
/// the rewrite method.
/// </summary>
public virtual MultiTermQuery.RewriteMethod MultiTermRewriteMethod { get; set; }
| |
/// <summary>
/// Gets or sets the locale used by date range parsing, lowercasing, and other
/// locale-sensitive operations.
/// <para/>
/// By default, the culture is <c>null</c>, which indicates to read the culture on the fly
/// from <see cref="CultureInfo.CurrentCulture"/>. This ensures if you change the culture on
/// the current thread, QueryParser will utilize it. You can also explicitly set a culture.
/// Setting the culture to <c>null</c> will restore the default behavior if you have explicitly set a culture.
/// </summary>
public virtual CultureInfo Locale // LUCENENET TODO: API - Rename Culture
{
    get => this.locale ?? CultureInfo.CurrentCulture;
    set => this.locale = value;
}
| |
/// <summary>
/// Gets or sets the current time zone for date and time parsing operations.
/// <para/>
/// By default, the time zone is <c>null</c>, which indicates to read the time zone on the fly
/// from <see cref="TimeZoneInfo.Local"/>. This ensures if you change the time zone on
/// the current system, QueryParser will utilize it. You can also explicitly set a time zone.
/// Setting the time zone to <c>null</c> will restore the default behavior if you have explicitly set a time zone.
/// </summary>
public virtual TimeZoneInfo TimeZone
{
    get => this.timeZone ?? TimeZoneInfo.Local;
    set => this.timeZone = value;
}
| |
/// <summary>
/// Sets the default date resolution used by RangeQueries for fields for which no
/// specific date resolution has been set. Field specific resolutions can be set
/// with <see cref="SetDateResolution(string,DateTools.Resolution)"/>.
/// </summary>
/// <param name="dateResolution">the default date resolution to set</param>
public virtual void SetDateResolution(DateTools.Resolution dateResolution)
    => this.dateResolution = dateResolution;
| |
/// <summary>
/// Sets the date resolution used by RangeQueries for a specific field.
/// </summary>
/// <param name="fieldName">field for which the date resolution is to be set</param>
/// <param name="dateResolution">date resolution to set</param>
/// <exception cref="ArgumentNullException"><paramref name="fieldName"/> is <c>null</c> or an empty string.</exception>
public virtual void SetDateResolution(string fieldName, DateTools.Resolution dateResolution)
{
    if (string.IsNullOrEmpty(fieldName))
    {
        // Use the (paramName, message) overload: the single-string constructor
        // interprets its argument as the parameter name, not as the message.
        throw new ArgumentNullException(nameof(fieldName), "fieldName cannot be null or empty string.");
    }

    if (fieldToDateResolution == null)
    {
        // lazily initialize Dictionary
        fieldToDateResolution = new Dictionary<string, DateTools.Resolution>();
    }

    fieldToDateResolution[fieldName] = dateResolution;
}
| |
/// <summary>
/// Returns the date resolution that is used by RangeQueries for the given field.
/// Falls back to the default date resolution when no field specific resolution
/// has been set for the given field.
/// </summary>
/// <param name="fieldName">field whose date resolution is requested</param>
/// <returns>the field specific resolution if one was registered; otherwise the default resolution</returns>
/// <exception cref="ArgumentNullException"><paramref name="fieldName"/> is <c>null</c> or an empty string.</exception>
public virtual DateTools.Resolution GetDateResolution(string fieldName)
{
    if (string.IsNullOrEmpty(fieldName))
    {
        // Use the (paramName, message) overload: the single-string constructor
        // interprets its argument as the parameter name, not as the message.
        throw new ArgumentNullException(nameof(fieldName), "fieldName cannot be null or empty string.");
    }

    if (fieldToDateResolution != null
        && fieldToDateResolution.TryGetValue(fieldName, out DateTools.Resolution resolution))
    {
        return resolution;
    }

    // either no field specific resolutions exist at all, or none for this field
    return this.dateResolution;
}
| |
/// <summary>
/// Gets or sets whether range terms are analyzed when constructing <see cref="TermRangeQuery"/>s.
/// For example, setting this to true can enable analyzing terms into
/// collation keys for locale-sensitive <see cref="TermRangeQuery"/>.
/// </summary>
public virtual bool AnalyzeRangeTerms
{
    get => analyzeRangeTerms;
    set => analyzeRangeTerms = value;
}
| |
/// <summary>
/// Appends <paramref name="q"/> to <paramref name="clauses"/> with the occurrence implied by
/// the conjunction, the modifier and <see cref="DefaultOperator"/>, adjusting the
/// occurrence of the preceding clause first when the conjunction calls for it.
/// </summary>
/// <param name="clauses">clauses collected so far; the last entry may be modified</param>
/// <param name="conj">conjunction code (compared against CONJ_AND and CONJ_OR)</param>
/// <param name="mods">modifier code (compared against MOD_NOT and MOD_REQ)</param>
/// <param name="q">query to add; when <c>null</c> no clause is added</param>
protected internal virtual void AddClause(IList<BooleanClause> clauses, int conj, int mods, Query q)
{
    // If this term is introduced by AND, make the preceding term required,
    // unless it's already prohibited
    if (clauses.Count > 0 && conj == CONJ_AND)
    {
        BooleanClause previous = clauses[clauses.Count - 1];
        if (!previous.IsProhibited)
        {
            previous.Occur = Occur.MUST;
        }
    }

    if (clauses.Count > 0 && DefaultOperator == AND_OPERATOR && conj == CONJ_OR)
    {
        // If this term is introduced by OR, make the preceding term optional,
        // unless it's prohibited (that means we leave -a OR b but +a OR b-->a OR b)
        // notice if the input is a OR b, first term is parsed as required; without
        // this modification a OR b would parsed as +a OR b
        BooleanClause previous = clauses[clauses.Count - 1];
        if (!previous.IsProhibited)
        {
            previous.Occur = Occur.SHOULD;
        }
    }

    // We might have been passed a null query; the term might have been
    // filtered away by the analyzer.
    if (q == null)
    {
        return;
    }

    // PROHIBITED whenever the term is introduced by NOT or -, in either operator mode.
    bool prohibited = mods == MOD_NOT;
    bool required;
    if (DefaultOperator == OR_OPERATOR)
    {
        // REQUIRED if introduced by + or by AND; never together with PROHIBITED.
        required = mods == MOD_REQ || (conj == CONJ_AND && !prohibited);
    }
    else
    {
        // REQUIRED if not PROHIBITED and not introduced by OR.
        required = !prohibited && conj != CONJ_OR;
    }

    Occur occur;
    if (prohibited)
    {
        if (required)
        {
            throw new Exception("Clause cannot be both required and prohibited");
        }
        occur = Occur.MUST_NOT;
    }
    else
    {
        occur = required ? Occur.MUST : Occur.SHOULD;
    }

    clauses.Add(NewBooleanClause(q, occur));
}
| |
/// <summary>
/// Builds the query for a term or phrase by delegating to
/// <see cref="NewFieldQuery(Analyzer,string,string,bool)"/> with this parser's analyzer.
/// </summary>
/// <exception cref="ParseException">throw in overridden method to disallow</exception>
protected internal virtual Query GetFieldQuery(string field, string queryText, bool quoted)
    => NewFieldQuery(Analyzer, field, queryText, quoted);
| |
/// <summary>
/// Creates a field query for <paramref name="queryText"/> using <paramref name="analyzer"/>.
/// The occurrence follows <see cref="DefaultOperator"/>, and the text is treated as
/// quoted when either <paramref name="quoted"/> or <see cref="AutoGeneratePhraseQueries"/> is set.
/// </summary>
/// <exception cref="ParseException">throw in overridden method to disallow</exception>
protected internal virtual Query NewFieldQuery(Analyzer analyzer, string field, string queryText, bool quoted)
{
    Occur occur;
    if (DefaultOperator == Operator.AND)
    {
        occur = Occur.MUST;
    }
    else
    {
        occur = Occur.SHOULD;
    }

    bool treatAsQuoted = quoted || AutoGeneratePhraseQueries;
    return CreateFieldQuery(analyzer, occur, field, queryText, treatAsQuoted, PhraseSlop);
}
| |
/// <summary>
/// Base implementation delegates to <see cref="GetFieldQuery(string,string,bool)"/>
/// with <c>quoted</c> set to <c>true</c> and then applies <paramref name="slop"/> to the
/// result if it is a <see cref="PhraseQuery"/> or a <see cref="MultiPhraseQuery"/>.
/// This method may be overridden, for example, to return
/// a <see cref="Search.Spans.SpanNearQuery"/> instead of a <see cref="PhraseQuery"/>.
/// </summary>
/// <exception cref="ParseException">throw in overridden method to disallow</exception>
protected internal virtual Query GetFieldQuery(string field, string queryText, int slop)
{
    Query result = GetFieldQuery(field, queryText, true);

    if (result is PhraseQuery phrase)
    {
        phrase.Slop = slop;
    }
    if (result is MultiPhraseQuery multiPhrase)
    {
        multiPhrase.Slop = slop;
    }

    return result;
}
| |
/// <summary>
/// Builds a range query for <paramref name="field"/>. Both bounds are lower-cased when
/// <see cref="LowercaseExpandedTerms"/> is set. A bound that parses as a date in the
/// short date pattern of <see cref="Locale"/> is replaced by its
/// <see cref="DateTools"/> string at the field's date resolution; any other bound is
/// passed through as text.
/// </summary>
/// <param name="field">Field</param>
/// <param name="part1">lower bound, may be <c>null</c></param>
/// <param name="part2">upper bound, may be <c>null</c></param>
/// <param name="startInclusive">true if the start of the range is inclusive</param>
/// <param name="endInclusive">true if the end of the range is inclusive</param>
protected internal virtual Query GetRangeQuery(string field,
    string part1,
    string part2,
    bool startInclusive,
    bool endInclusive)
{
    if (LowercaseExpandedTerms)
    {
        if (part1 != null)
        {
            part1 = Locale.TextInfo.ToLower(part1);
        }
        if (part2 != null)
        {
            part2 = Locale.TextInfo.ToLower(part2);
        }
    }

    string shortDateFormat = Locale.DateTimeFormat.ShortDatePattern;
    DateTools.Resolution resolution = GetDateResolution(field);

    // LUCENENET specific: This doesn't emulate java perfectly.
    // See LUCENENET-423 - DateRange differences with Java and .NET

    // Java parses the string up to the end of the pattern and ignores the rest,
    // whereas .NET requires an exact match, so bounds with trailing content are
    // not recognized as dates here. The code below is clear that users can only
    // specify the date, not the time.

    // To emulate Java more precisely, it is possible to make a custom format
    // by calling Locale.DateTimeFormat.SetAllDateTimePatterns(string[], char)
    // that contains all of the formats that you need to support and setting
    // the Locale.DateTimeFormat.ShortDatePattern to be the same as the second
    // parameter of SetAllDateTimePatterns.

    // LUCENENET TODO: Try to make setting custom formats easier by adding
    // another configuration setting (IList<string> of date formats).
    // Also consider making a IsStrictDateFormat setting which allows toggling
    // to DateTime.TryParse(part1, Locale, DateTimeStyles.None, out d1);
    // rather than TryParseExact

    if (DateTime.TryParseExact(part1, shortDateFormat, Locale, DateTimeStyles.None, out DateTime lowerDate))
    {
        part1 = DateTools.DateToString(lowerDate, resolution);
    }

    if (DateTime.TryParseExact(part2, shortDateFormat, Locale, DateTimeStyles.None, out DateTime upperDate))
    {
        if (endInclusive)
        {
            // The user can only specify the date, not the time, so make sure
            // the time is set to the latest possible time of that date to really
            // include all documents:

            upperDate = TimeZoneInfo.ConvertTime(upperDate, TimeZone);
            Calendar calendar = Locale.Calendar;
            upperDate = calendar.AddHours(upperDate, 23);
            upperDate = calendar.AddMinutes(upperDate, 59);
            upperDate = calendar.AddSeconds(upperDate, 59);
            upperDate = calendar.AddMilliseconds(upperDate, 999);
        }

        part2 = DateTools.DateToString(upperDate, resolution);
    }

    return NewRangeQuery(field, part1, part2, startInclusive, endInclusive);
}
| |
/// <summary>
/// Builds a new <see cref="BooleanClause"/> instance
/// </summary>
/// <param name="q">sub query</param>
/// <param name="occur">how this clause should occur when matching documents</param>
/// <returns>new <see cref="BooleanClause"/> instance</returns>
protected internal virtual BooleanClause NewBooleanClause(Query q, Occur occur)
    => new BooleanClause(q, occur);
| |
/// <summary>
/// Builds a new <see cref="PrefixQuery"/> instance that uses this parser's
/// <see cref="MultiTermRewriteMethod"/>.
/// </summary>
/// <param name="prefix">Prefix term</param>
/// <returns>new <see cref="PrefixQuery"/> instance</returns>
protected internal virtual Query NewPrefixQuery(Term prefix)
{
    return new PrefixQuery(prefix)
    {
        MultiTermRewriteMethod = this.MultiTermRewriteMethod
    };
}
| |
/// <summary>
/// Builds a new <see cref="RegexpQuery"/> instance that uses this parser's
/// <see cref="MultiTermRewriteMethod"/>.
/// </summary>
/// <param name="regexp">Regexp term</param>
/// <returns>new <see cref="RegexpQuery"/> instance</returns>
protected internal virtual Query NewRegexpQuery(Term regexp)
{
    return new RegexpQuery(regexp)
    {
        MultiTermRewriteMethod = this.MultiTermRewriteMethod
    };
}
| |
/// <summary>
/// Builds a new <see cref="FuzzyQuery"/> instance. The minimum similarity is converted
/// to a number of edits based on the code point count of the term text.
/// </summary>
/// <param name="term">Term</param>
/// <param name="minimumSimilarity">minimum similarity</param>
/// <param name="prefixLength">prefix length</param>
/// <returns>new <see cref="FuzzyQuery"/> Instance</returns>
protected internal virtual Query NewFuzzyQuery(Term term, float minimumSimilarity, int prefixLength)
{
    // FuzzyQuery doesn't yet allow constant score rewrite
    string termText = term.Text();
    int codePointCount = termText.CodePointCount(0, termText.Length);
#pragma warning disable 612, 618
    int numEdits = FuzzyQuery.SingleToEdits(minimumSimilarity, codePointCount);
#pragma warning restore 612, 618
    return new FuzzyQuery(term, numEdits, prefixLength);
}
| |
// LUCENETODO: Should this be protected instead?
// Convenience overload: analyzes the term with this parser's own analyzer.
private BytesRef AnalyzeMultitermTerm(string field, string part)
    => AnalyzeMultitermTerm(field, part, Analyzer);
| |
/// <summary>
/// Runs <paramref name="part"/> through an analyzer and returns a copy of the bytes of
/// the single term it produces.
/// </summary>
/// <param name="field">field name handed to the analyzer</param>
/// <param name="part">text to analyze</param>
/// <param name="analyzerIn">analyzer to use; when <c>null</c> this parser's analyzer is used</param>
/// <returns>a deep copy of the analyzed term bytes</returns>
/// <exception cref="ArgumentException">the analyzer produced no term or more than one term</exception>
/// <exception cref="Exception">an <see cref="IOException"/> occurred while analyzing</exception>
protected internal virtual BytesRef AnalyzeMultitermTerm(string field, string part, Analyzer analyzerIn)
{
    Analyzer effectiveAnalyzer = analyzerIn ?? Analyzer;

    TokenStream stream = null;
    try
    {
        stream = effectiveAnalyzer.GetTokenStream(field, part);
        stream.Reset();

        ITermToBytesRefAttribute termAttribute = stream.GetAttribute<ITermToBytesRefAttribute>();
        BytesRef termBytes = termAttribute.BytesRef;

        // Exactly one token is expected.
        bool hasFirstToken = stream.IncrementToken();
        if (!hasFirstToken)
        {
            throw new ArgumentException("analyzer returned no terms for multiTerm term: " + part);
        }
        termAttribute.FillBytesRef();

        bool hasSecondToken = stream.IncrementToken();
        if (hasSecondToken)
        {
            throw new ArgumentException("analyzer returned too many terms for multiTerm term: " + part);
        }
        stream.End();

        return BytesRef.DeepCopyOf(termBytes);
    }
    catch (IOException ioFailure)
    {
        throw new Exception("Error analyzing multiTerm term: " + part, ioFailure);
    }
    finally
    {
        IOUtils.DisposeWhileHandlingException(stream);
    }
}
| |
/// <summary>
/// Builds a new <see cref="TermRangeQuery"/> instance. Each non-null bound is either
/// analyzed (when range term analysis is enabled) or converted to bytes as is.
/// </summary>
/// <param name="field">Field</param>
/// <param name="part1">min</param>
/// <param name="part2">max</param>
/// <param name="startInclusive">true if the start of the range is inclusive</param>
/// <param name="endInclusive">true if the end of the range is inclusive</param>
/// <returns>new <see cref="TermRangeQuery"/> instance</returns>
protected internal virtual Query NewRangeQuery(string field, string part1, string part2, bool startInclusive, bool endInclusive)
{
    // Converts one bound to bytes; a null bound stays null.
    BytesRef ToBound(string part)
    {
        if (part == null)
        {
            return null;
        }
        return analyzeRangeTerms ? AnalyzeMultitermTerm(field, part) : new BytesRef(part);
    }

    BytesRef lower = ToBound(part1);
    BytesRef upper = ToBound(part2);

    return new TermRangeQuery(field, lower, upper, startInclusive, endInclusive)
    {
        MultiTermRewriteMethod = this.MultiTermRewriteMethod
    };
}
| |
/// <summary>
/// Builds a new <see cref="MatchAllDocsQuery"/> instance
/// </summary>
/// <returns>new <see cref="MatchAllDocsQuery"/> instance</returns>
protected internal virtual Query NewMatchAllDocsQuery()
    => new MatchAllDocsQuery();
| |
/// <summary>
/// Builds a new <see cref="WildcardQuery"/> instance that uses this parser's
/// <see cref="MultiTermRewriteMethod"/>.
/// </summary>
/// <param name="t">wildcard term</param>
/// <returns>new <see cref="WildcardQuery"/> instance</returns>
protected internal virtual Query NewWildcardQuery(Term t)
{
    return new WildcardQuery(t)
    {
        MultiTermRewriteMethod = this.MultiTermRewriteMethod
    };
}
| |
/// <summary>
/// Factory method for generating query, given a set of clauses.
/// By default creates a boolean query composed of clauses passed in,
/// with coord scoring left enabled.
/// <para/>
/// Can be overridden by extending classes, to modify query being
/// returned.
/// </summary>
/// <param name="clauses">List that contains <see cref="BooleanClause"/> instances
/// to join.</param>
/// <exception cref="ParseException">throw in overridden method to disallow</exception>
/// <returns>Resulting <see cref="Query"/> object.</returns>
protected internal virtual Query GetBooleanQuery(IList<BooleanClause> clauses)
    => GetBooleanQuery(clauses, false);
| |
/// <summary>
/// Factory method for generating query, given a set of clauses.
/// By default creates a boolean query composed of clauses passed in.
/// <para/>
/// Can be overridden by extending classes, to modify query being
/// returned.
/// </summary>
/// <param name="clauses">List that contains <see cref="BooleanClause"/> instances
/// to join.</param>
/// <param name="disableCoord">true if coord scoring should be disabled.</param>
/// <exception cref="ParseException">throw in overridden method to disallow</exception>
/// <returns>Resulting <see cref="Query"/> object, or <c>null</c> when
/// <paramref name="clauses"/> is empty.</returns>
protected internal virtual Query GetBooleanQuery(IList<BooleanClause> clauses, bool disableCoord)
{
    // all clause words were filtered away by the analyzer.
    if (clauses.Count == 0)
    {
        return null;
    }

    BooleanQuery combined = NewBooleanQuery(disableCoord);
    foreach (BooleanClause item in clauses)
    {
        combined.Add(item);
    }

    return combined;
}
| |
/// <summary>
/// Factory method for generating a query. Called when parser
/// parses an input term token that contains one or more wildcard
/// characters (? and *), but is not a prefix term token (one
/// that has just a single * character at the end)
/// <para/>
/// Depending on settings, prefix term may be lower-cased
/// automatically. It will not go through the default Analyzer,
/// however, since normal Analyzers are unlikely to work properly
/// with wildcard templates.
/// <para/>
/// When both the field and the term are <c>*</c>, a match-all-documents query is returned.
/// <para/>
/// Can be overridden by extending classes, to provide custom handling for
/// wildcard queries, which may be necessary due to missing analyzer calls.
/// </summary>
/// <param name="field">Name of the field query will use.</param>
/// <param name="termStr">Term token that contains one or more wild card
/// characters (? or *), but is not simple prefix term</param>
/// <exception cref="ParseException">throw in overridden method to disallow</exception>
/// <returns>Resulting <see cref="Query"/> built for the term</returns>
protected internal virtual Query GetWildcardQuery(string field, string termStr)
{
    bool isMatchAll = "*".Equals(field, StringComparison.Ordinal)
        && "*".Equals(termStr, StringComparison.Ordinal);
    if (isMatchAll)
    {
        return NewMatchAllDocsQuery();
    }

    if (!AllowLeadingWildcard)
    {
        bool startsWithWildcard = termStr.StartsWith("*", StringComparison.Ordinal)
            || termStr.StartsWith("?", StringComparison.Ordinal);
        if (startsWithWildcard)
        {
            throw new ParseException("'*' or '?' not allowed as first character in WildcardQuery");
        }
    }

    string text = LowercaseExpandedTerms ? Locale.TextInfo.ToLower(termStr) : termStr;
    return NewWildcardQuery(new Term(field, text));
}
| |
/// <summary>
/// Factory method for generating a query. Called when parser
/// parses an input term token that contains a regular expression
/// query.
/// <para/>
/// Depending on settings, pattern term may be lower-cased
/// automatically. It will not go through the default Analyzer,
/// however, since normal Analyzers are unlikely to work properly
/// with regular expression templates.
/// <para/>
/// Can be overridden by extending classes, to provide custom handling for
/// regular expression queries, which may be necessary due to missing analyzer
/// calls.
/// </summary>
/// <param name="field">Name of the field query will use.</param>
/// <param name="termStr">Term token that contains a regular expression</param>
/// <exception cref="ParseException">throw in overridden method to disallow</exception>
/// <returns>Resulting <see cref="Query"/> built for the term</returns>
protected internal virtual Query GetRegexpQuery(string field, string termStr)
{
    string pattern = LowercaseExpandedTerms ? Locale.TextInfo.ToLower(termStr) : termStr;
    return NewRegexpQuery(new Term(field, pattern));
}
| |
/// <summary>
/// Factory method for generating a query (similar to
/// <see cref="GetWildcardQuery(string, string)"/>). Called when parser parses an input term
/// token that uses prefix notation; that is, contains a single '*' wildcard
/// character as its last character. Since this is a special case
/// of generic wildcard term, and such a query can be optimized easily,
/// this usually results in a different query object.
/// <para/>
/// Depending on settings, a prefix term may be lower-cased
/// automatically. It will not go through the default Analyzer,
/// however, since normal Analyzers are unlikely to work properly
/// with wildcard templates.
/// <para/>
/// Can be overridden by extending classes, to provide custom handling for
/// wild card queries, which may be necessary due to missing analyzer calls.
/// </summary>
/// <param name="field">Name of the field query will use.</param>
/// <param name="termStr">Term token to use for building term for the query</param>
/// <exception cref="ParseException">throw in overridden method to disallow</exception>
/// <returns>Resulting <see cref="Query"/> built for the term</returns>
protected internal virtual Query GetPrefixQuery(string field, string termStr)
{
    if (!AllowLeadingWildcard)
    {
        if (termStr.StartsWith("*", StringComparison.Ordinal))
        {
            throw new ParseException("'*' not allowed as first character in PrefixQuery");
        }
    }

    string prefixText = LowercaseExpandedTerms ? Locale.TextInfo.ToLower(termStr) : termStr;
    return NewPrefixQuery(new Term(field, prefixText));
}
| |
/// <summary>
/// Factory method that builds the query for a fuzzy term (compare
/// <see cref="GetWildcardQuery(string, string)"/>). The parser calls it for an
/// input term token that carries the fuzzy suffix (~).
/// <para/>
/// The term is lower-cased when <see cref="LowercaseExpandedTerms"/> is set, and the
/// query is created with the parser's <see cref="FuzzyPrefixLength"/>.
/// </summary>
/// <param name="field">Name of the field the query will use.</param>
/// <param name="termStr">Term text used to build the term for the query.</param>
/// <param name="minSimilarity">minimum similarity</param>
/// <exception cref="ParseException">throw in overridden method to disallow</exception>
/// <returns>Resulting <see cref="Query"/> built for the term</returns>
protected internal virtual Query GetFuzzyQuery(string field, string termStr, float minSimilarity)
{
    string fuzzyText = LowercaseExpandedTerms
        ? Locale.TextInfo.ToLower(termStr)
        : termStr;

    return NewFuzzyQuery(new Term(field, fuzzyText), minSimilarity, FuzzyPrefixLength);
}
| |
// Extracted from the .jj grammar.
//
// Builds the query for a single unquoted term token. The flags are tested in the
// order wildcard, prefix, regexp, fuzzy; when none of them is set a plain field
// query is created from the unescaped term text.
internal virtual Query HandleBareTokenQuery(string qfield, Token term, Token fuzzySlop, bool prefix, bool wildcard, bool fuzzy, bool regexp)
{
    // Unescaped up front, before any flag is looked at, so DiscardEscapeChar gets to
    // reject a malformed escape sequence regardless of which branch is taken below.
    string unescapedImage = DiscardEscapeChar(term.Image);

    if (wildcard)
    {
        // The wildcard factory receives the raw (still escaped) image.
        return GetWildcardQuery(qfield, term.Image);
    }

    if (prefix)
    {
        // Drop the last character of the image (presumably the trailing '*') and
        // unescape what remains.
        string withoutLastChar = term.Image.Substring(0, term.Image.Length - 1);
        return GetPrefixQuery(qfield, DiscardEscapeChar(withoutLastChar));
    }

    if (regexp)
    {
        // Strip the first and last characters (presumably the enclosing delimiters);
        // the expression itself is passed on without unescaping.
        string expression = term.Image.Substring(1, term.Image.Length - 2);
        return GetRegexpQuery(qfield, expression);
    }

    if (fuzzy)
    {
        return HandleBareFuzzy(qfield, fuzzySlop, unescapedImage);
    }

    return GetFieldQuery(qfield, unescapedImage, false);
}
| |
// Builds the fuzzy query for a bare term. The minimum similarity is read from the
// text that follows the first character of fuzzySlop.Image; when that text is
// missing or is not a valid number, FuzzyMinSim is used instead.
//
// Throws ParseException when the resulting value is negative, or when it is
// >= 1.0 but not a whole number.
internal virtual Query HandleBareFuzzy(string qfield, Token fuzzySlop, string termImage)
{
    float fms = FuzzyMinSim;

    // LUCENENET NOTE: Apparently a "feature" of Lucene is to always
    // use "." as the decimal specifier for fuzzy slop, even if the culture uses
    // a different one, such as ",".

    // LUCENENET TODO: It would probably be more intuitive to use
    // the current Locale to specify the decimal identifier than
    // to hard code it to be ".", but this would differ from Java Lucene.
    // Perhaps just make it a non-default option?

    // TryParse replaces the former Parse-inside-catch-all: a slop token without a
    // number no longer costs a thrown exception, and unrelated exceptions are no
    // longer swallowed. The null/length guards keep the old "fall back to the
    // default" outcome for a missing token or image.
    string slopImage = fuzzySlop != null ? fuzzySlop.Image : null;
    float parsed;
    if (slopImage != null
        && slopImage.Length > 1
        // Same styles that float.Parse(string, IFormatProvider) applies by default.
        && float.TryParse(slopImage.Substring(1), NumberStyles.Float | NumberStyles.AllowThousands, CultureInfo.InvariantCulture, out parsed))
    {
        // Only overwrite on success: TryParse sets its out value to 0 on failure.
        fms = parsed;
    }

    if (fms < 0.0f)
    {
        throw new ParseException("Minimum similarity for a FuzzyQuery has to be between 0.0f and 1.0f !");
    }
    else if (fms >= 1.0f && fms != (int)fms)
    {
        throw new ParseException("Fractional edit distances are not allowed!");
    }

    return GetFuzzyQuery(qfield, termImage, fms);
}
| |
// Extracted from the .jj grammar.
//
// Builds the query for a quoted term. The first and last characters of term.Image
// are stripped and the remainder is unescaped. The slop defaults to PhraseSlop;
// when fuzzySlop is given, the number following the first character of its image
// is parsed and truncated to an int. An unparsable slop leaves the default in place.
internal virtual Query HandleQuotedTerm(string qfield, Token term, Token fuzzySlop)
{
    int s = PhraseSlop; // default
    if (fuzzySlop != null)
    {
        // LUCENENET NOTE: Apparently a "feature" of Lucene is to always
        // use "." as the decimal specifier for fuzzy slop, even if the culture uses
        // a different one, such as ",".

        // LUCENENET TODO: It would probably be more intuitive to use
        // the current Locale to specify the decimal identifier than
        // to hard code it to be ".", but this would differ from Java Lucene.
        // Perhaps just make it a non-default option?

        // TryParse replaces the former Parse-inside-catch-all, so an invalid slop
        // is handled without throwing and unrelated exceptions are not swallowed.
        string slopImage = fuzzySlop.Image;
        float parsed;
        if (slopImage != null
            && slopImage.Length > 1
            // Same styles that float.Parse(string, IFormatProvider) applies by default.
            && float.TryParse(slopImage.Substring(1), NumberStyles.Float | NumberStyles.AllowThousands, CultureInfo.InvariantCulture, out parsed))
        {
            // Only assign on success: TryParse sets its out value to 0 on failure.
            s = (int)parsed;
        }
    }
    return GetFieldQuery(qfield, DiscardEscapeChar(term.Image.Substring(1, term.Image.Length - 2)), s);
}
| |
// Extracted from the .jj grammar.
//
// Applies the boost token to the query and returns the query. Nothing happens when
// either argument is null (a null query can be caused by stop words). A boost
// image that is not a valid number results in a boost of 1.0, i.e. "no boost".
internal virtual Query HandleBoost(Query q, Token boost)
{
    // avoid boosting null queries, such as those caused by stop words
    if (boost == null || q == null)
    {
        return q;
    }

    // LUCENENET NOTE: Apparently a "feature" of Lucene is to always
    // use "." as the decimal specifier for boost, even if the culture uses
    // a different one, such as ",".

    // LUCENENET TODO: It would probably be more intuitive to use
    // the current Locale to specify the decimal identifier than
    // to hard code it to be ".", but this would differ from Java Lucene.
    // Perhaps just make it a non-default option?

    // TryParse replaces the former Parse-inside-catch-all: an invalid boost number
    // is handled without throwing and unrelated exceptions are not swallowed.
    float f = 1.0f;
    float parsed;
    // Same styles that float.Parse(string, IFormatProvider) applies by default.
    if (float.TryParse(boost.Image, NumberStyles.Float | NumberStyles.AllowThousands, CultureInfo.InvariantCulture, out parsed))
    {
        // Only assign on success: TryParse sets its out value to 0 on failure.
        f = parsed;
    }

    q.Boost = f;
    return q;
}
| |
/// <summary>
/// Removes the escape character (backslash) from <paramref name="input"/>. A character
/// that follows a backslash is copied literally, so a doubled backslash yields a
/// single one.
/// <para/>
/// Escaped unicode characters are supported: a backslash followed by <c>u</c> and
/// four hexadecimal digits is replaced by the character with that code, e. g.
/// <c>\\u0041</c> becomes <c>A</c>.
/// </summary>
/// <param name="input">Text that may contain escape sequences.</param>
/// <returns>The text with all escape sequences resolved.</returns>
/// <exception cref="ParseException">
/// Thrown when the text ends inside a unicode escape sequence, ends with a lone
/// escape character, or a unicode escape contains a non-hexadecimal character.
/// </exception>
internal virtual string DiscardEscapeChar(string input)
{
    // The result is never longer than the input, since escaping only removes characters.
    StringBuilder unescaped = new StringBuilder(input.Length);

    // True when the previous character was an unconsumed backslash.
    bool pendingEscape = false;

    // Place value of the next hex digit of a unicode escape (16^3, 16^2, 16, 1);
    // zero means we are not inside a unicode escape.
    int hexWeight = 0;

    // Character code accumulated so far for the current unicode escape.
    int hexValue = 0;

    foreach (char ch in input)
    {
        if (hexWeight > 0)
        {
            hexValue += HexToInt32(ch) * hexWeight;
            // hexWeight is never negative, so an arithmetic shift is sufficient here.
            hexWeight >>= 4;
            if (hexWeight == 0)
            {
                // All four digits consumed: emit the character and reset.
                unescaped.Append((char)hexValue);
                hexValue = 0;
            }
            continue;
        }

        if (pendingEscape)
        {
            pendingEscape = false;
            if (ch == 'u')
            {
                // Start of an escaped unicode character; four hex digits follow.
                hexWeight = 16 * 16 * 16;
            }
            else
            {
                // Any other escaped character is taken literally.
                unescaped.Append(ch);
            }
            continue;
        }

        if (ch == '\\')
        {
            pendingEscape = true;
        }
        else
        {
            unescaped.Append(ch);
        }
    }

    if (hexWeight > 0)
    {
        throw new ParseException("Truncated unicode escape sequence.");
    }

    if (pendingEscape)
    {
        throw new ParseException("Term can not end with escape character.");
    }

    return unescaped.ToString();
}
| |
/// <summary>
/// Converts a single hexadecimal digit (<c>0-9</c>, <c>a-f</c> or <c>A-F</c>)
/// to its numeric value.
/// <para/>
/// NOTE: This was hexToInt() in Lucene
/// </summary>
/// <param name="c">The character to convert.</param>
/// <returns>A value between 0 and 15.</returns>
/// <exception cref="ParseException">
/// Thrown when <paramref name="c"/> is not a hexadecimal digit.
/// </exception>
private static int HexToInt32(char c)
{
    if (c >= '0' && c <= '9')
    {
        return c - '0';
    }

    if (c >= 'a' && c <= 'f')
    {
        return 10 + (c - 'a');
    }

    if (c >= 'A' && c <= 'F')
    {
        return 10 + (c - 'A');
    }

    throw new ParseException("Non-hex character in Unicode escape sequence: " + c);
}
| |
/// <summary>
/// Escapes every character of <paramref name="s"/> that is part of the QueryParser
/// syntax by putting a <c>\</c> in front of it. All other characters are copied
/// unchanged.
/// </summary>
/// <param name="s">The text to escape.</param>
/// <returns>The escaped text.</returns>
public static string Escape(string s)
{
    StringBuilder escaped = new StringBuilder();
    foreach (char ch in s)
    {
        switch (ch)
        {
            // These characters are part of the query syntax and must be escaped
            case '\\':
            case '+':
            case '-':
            case '!':
            case '(':
            case ')':
            case ':':
            case '^':
            case '[':
            case ']':
            case '\"':
            case '{':
            case '}':
            case '~':
            case '*':
            case '?':
            case '|':
            case '&':
            case '/':
                escaped.Append('\\');
                break;
        }
        escaped.Append(ch);
    }
    return escaped.ToString();
}
| } |
| } |