| // lucene version compatibility level: 4.8.1 |
| using Lucene.Net.Analysis.Phonetic.Language; |
| using Lucene.Net.Analysis.Util; |
| using Lucene.Net.Support; |
| using System; |
| using System.Collections.Generic; |
| using System.Globalization; |
| using System.Reflection; |
| |
| namespace Lucene.Net.Analysis.Phonetic |
| { |
| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| /// <summary> |
| /// Factory for <see cref="PhoneticFilter"/>. |
| /// <para/> |
| /// Create tokens based on phonetic encoders from the Language namespace. |
| /// <para/> |
| /// This takes one required argument, "encoder", and the rest are optional: |
| /// <list type="bullet"> |
| /// <item> |
| /// <term>encoder</term> |
| /// <description> |
| /// required, one of "DoubleMetaphone", "Metaphone", "Soundex", "RefinedSoundex", "Caverphone" (v2.0), |
| /// or "ColognePhonetic" (case insensitive). If encoder isn't one of these, it'll be resolved as a class name either by |
| /// itself if it already contains a '.' or otherwise as in the same package as these others. |
| /// </description> |
| /// </item> |
| /// <item> |
| /// <term>inject</term> |
| /// <description> |
| /// (default=true) add tokens to the stream with the offset=0 |
| /// </description> |
| /// </item> |
| /// <item> |
| /// <term>maxCodeLength</term> |
| /// <description> |
| /// The maximum length of the phonetic codes, as defined by the encoder. If an encoder doesn't |
| /// support this then specifying this is an error. |
| /// </description> |
| /// </item> |
| /// </list> |
| /// |
| /// <code> |
| /// <fieldType name="text_phonetic" class="solr.TextField" positionIncrementGap="100"> |
| /// <analyzer> |
| /// <tokenizer class="solr.WhitespaceTokenizerFactory"/> |
| /// <filter class="solr.PhoneticFilterFactory" encoder="DoubleMetaphone" inject="true"/> |
| /// </analyzer> |
| /// </fieldType> |
| /// </code> |
| /// </summary> |
| /// <seealso cref="PhoneticFilter"/> |
| public class PhoneticFilterFactory : TokenFilterFactory, IResourceLoaderAware |
| { |
| /// <summary>parameter name: either a short name or a full class name</summary> |
| public static readonly string ENCODER = "encoder"; |
| /// <summary>parameter name: true if encoded tokens should be added as synonyms</summary> |
| public static readonly string INJECT = "inject"; // boolean |
| /** parameter name: restricts the length of the phonetic code */ |
| public static readonly string MAX_CODE_LENGTH = "maxCodeLength"; |
| private static readonly string PACKAGE_CONTAINING_ENCODERS = "Lucene.Net.Analysis.Phonetic.Language."; |
| |
| //Effectively constants; uppercase keys |
| private static readonly IDictionary<string, Type> registry = new Dictionary<string, Type> // LUCENENET: Avoid static constructors (see https://github.com/apache/lucenenet/pull/224#issuecomment-469284006) |
| { |
| { "DoubleMetaphone".ToUpperInvariant(), typeof(DoubleMetaphone) }, |
| { "Metaphone".ToUpperInvariant(), typeof(Metaphone) }, |
| { "Soundex".ToUpperInvariant(), typeof(Soundex) }, |
| { "RefinedSoundex".ToUpperInvariant(), typeof(RefinedSoundex) }, |
| { "Caverphone".ToUpperInvariant(), typeof(Caverphone2) }, |
| { "ColognePhonetic".ToUpperInvariant(), typeof(ColognePhonetic) }, |
| }; |
| |
| internal bool inject; //accessed by the test |
| private readonly string name; |
| private readonly int? maxCodeLength; |
| private Type clazz = null; |
| private MethodInfo setMaxCodeLenMethod = null; |
| |
| /// <summary>Creates a new <see cref="PhoneticFilterFactory"/>.</summary> |
| public PhoneticFilterFactory(IDictionary<string, string> args) |
| : base(args) |
| { |
| inject = GetBoolean(args, INJECT, true); |
| name = Require(args, ENCODER); |
| string v = Get(args, MAX_CODE_LENGTH); |
| if (v != null) |
| { |
| maxCodeLength = int.Parse(v, CultureInfo.InvariantCulture); |
| } |
| else |
| { |
| maxCodeLength = null; |
| } |
| if (!(args.Count == 0)) |
| { |
| throw new ArgumentException("Unknown parameters: " + args); |
| } |
| } |
| |
| |
| public virtual void Inform(IResourceLoader loader) |
| { |
| registry.TryGetValue(name.ToUpperInvariant(), out clazz); |
| if (clazz == null) |
| { |
| clazz = ResolveEncoder(name, loader); |
| } |
| |
| if (maxCodeLength != null) |
| { |
| try |
| { |
| setMaxCodeLenMethod = clazz.GetMethod("set_MaxCodeLen"); |
| } |
| catch (Exception e) |
| { |
| throw new ArgumentException("Encoder " + name + " / " + clazz + " does not support " + MAX_CODE_LENGTH, e); |
| } |
| } |
| |
| GetEncoder();//trigger initialization for potential problems to be thrown now |
| } |
| |
| private Type ResolveEncoder(string name, IResourceLoader loader) |
| { |
| string lookupName = name; |
| if (name.IndexOf('.') == -1) |
| { |
| lookupName = PACKAGE_CONTAINING_ENCODERS + name; |
| } |
| try |
| { |
| return loader.NewInstance<IStringEncoder>(lookupName).GetType(); |
| } |
| catch (Exception e) |
| { |
| throw new ArgumentException("Error loading encoder '" + name + "': must be full class name or one of " + Collections.ToString(registry.Keys), e); |
| } |
| } |
| |
| /// <summary>Must be thread-safe.</summary> |
| protected internal virtual IStringEncoder GetEncoder() |
| { |
| // Unfortunately, Commons-Codec doesn't offer any thread-safe guarantees so we must play it safe and instantiate |
| // every time. A simple benchmark showed this as negligible. |
| try |
| { |
| IStringEncoder encoder = (IStringEncoder)Activator.CreateInstance(clazz); |
| // Try to set the maxCodeLength |
| if (maxCodeLength != null && setMaxCodeLenMethod != null) |
| { |
| setMaxCodeLenMethod.Invoke(encoder, new object[] { maxCodeLength }); |
| } |
| return encoder; |
| } |
| catch (Exception e) |
| { |
| Exception t = (e is TargetInvocationException) ? e.InnerException : e; |
| throw new ArgumentException("Error initializing encoder: " + name + " / " + clazz, t); |
| } |
| } |
| |
| public override TokenStream Create(TokenStream input) |
| { |
| return new PhoneticFilter(input, GetEncoder(), inject); |
| } |
| } |
| } |