// blob: 571b67e2c0af376aa568a8495537052434a67267 [file] [log] [blame]
using J2N.Collections.Generic.Extensions;
using Lucene.Net.Analysis.Core;
using Lucene.Net.Support;
using Lucene.Net.Util;
using System;
using System.Collections.Generic;
using System.Globalization;
using System.IO;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using JCG = J2N.Collections.Generic;
namespace Lucene.Net.Analysis.Util
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/// <summary>
/// Abstract parent class for analysis factories <see cref="TokenizerFactory"/>,
/// <see cref="TokenFilterFactory"/> and <see cref="CharFilterFactory"/>.
/// <para>
/// The typical lifecycle for a factory consumer is:
/// <list type="bullet">
/// <item><description>Create factory via its constructor (or via XXXFactory.ForName)</description></item>
/// <item><description>(Optional) If the factory uses resources such as files,
/// <see cref="IResourceLoaderAware.Inform(IResourceLoader)"/> is called to initialize those resources.</description></item>
/// <item><description>Consumer calls create() to obtain instances.</description></item>
/// </list>
/// </para>
/// </summary>
public abstract class AbstractAnalysisFactory
{
/// <summary>
/// Name of the argument key that the constructor reads to obtain the Lucene match version.
/// </summary>
public const string LUCENE_MATCH_VERSION_PARAM = "luceneMatchVersion";
/// <summary>
/// The original args, as captured (read-only) by the constructor before any processing.
/// </summary>
private readonly IDictionary<string, string> originalArgs;
/// <summary>
/// The Lucene version parsed from the <c>luceneMatchVersion</c> arg by the constructor,
/// or <c>LUCENE_CURRENT</c> when that arg is not supplied.
/// </summary>
protected readonly LuceneVersion m_luceneMatchVersion;
/// <summary>
/// Initialize this factory via a set of key-value pairs.
/// <para/>
/// The <c>luceneMatchVersion</c> and <c>class</c> entries are consumed (removed) from
/// <paramref name="args"/>. A read-only snapshot of the args, taken before anything is
/// consumed, is exposed through <see cref="OriginalArgs"/>.
/// </summary>
/// <param name="args"> the factory arguments; must be non-null and mutable </param>
/// <exception cref="ArgumentNullException"> if <paramref name="args"/> is <c>null</c> </exception>
protected AbstractAnalysisFactory(IDictionary<string, string> args)
{
    if (args == null)
    {
        throw new ArgumentNullException("args");
    }
    IsExplicitLuceneMatchVersion = false;
    // Copy the args BEFORE consuming anything. Wrapping 'args' directly would only give a
    // read-only view of the same dictionary, so every later Remove (including the "class"
    // entry below) would also vanish from OriginalArgs.
    IDictionary<string, string> snapshot = new Dictionary<string, string>(args);
    originalArgs = snapshot.AsReadOnly();
    string version = Get(args, LUCENE_MATCH_VERSION_PARAM);
    // LUCENENET TODO: What should we do if the version is null?
    //luceneMatchVersion = version == null ? (LuceneVersion?)null : LuceneVersionHelpers.ParseLeniently(version);
    m_luceneMatchVersion = version == null ?
#pragma warning disable 612, 618
        LuceneVersion.LUCENE_CURRENT :
#pragma warning restore 612, 618
        LuceneVersionExtensions.ParseLeniently(version);
    args.Remove(CLASS_NAME); // consume the class arg
}
/// <summary>
/// Gets the read-only dictionary of args captured by the constructor.
/// </summary>
public IDictionary<string, string> OriginalArgs
{
    get
    {
        return this.originalArgs;
    }
}
/// <summary>
/// This method can be called in the <see cref="TokenizerFactory.Create(TextReader)"/>
/// or <see cref="TokenFilterFactory.Create(TokenStream)"/> methods
/// to inform the user that a <see cref="m_luceneMatchVersion"/> is required for this factory.
/// <para/>
/// In this port the body is empty, so calling it has no effect.
/// </summary>
protected void AssureMatchVersion() // LUCENENET TODO: Remove this method (not used anyway in .NET)
{
// LUCENENET NOTE: since luceneMatchVersion can never be null in .NET,
// this method effectively does nothing. However, leaving it in place because
// it is used throughout Lucene.
// Original check, kept for reference only:
//if (luceneMatchVersion == null)
//{
// throw new System.ArgumentException("Configuration Error: Factory '" + this.GetType().FullName + "' needs a 'luceneMatchVersion' parameter");
//}
}
/// <summary>
/// Gets the Lucene match version held by this factory.
/// </summary>
public LuceneVersion LuceneMatchVersion
{
    get
    {
        return m_luceneMatchVersion;
    }
}
/// <summary>
/// Gets the value of the required argument <paramref name="name"/> and consumes
/// (removes) it from <paramref name="args"/>.
/// </summary>
/// <param name="args"> the argument dictionary; the entry is removed on success </param>
/// <param name="name"> the argument key </param>
/// <returns> the non-null value stored under <paramref name="name"/> </returns>
/// <exception cref="ArgumentException"> if the key is absent or its value is <c>null</c> </exception>
public virtual string Require(IDictionary<string, string> args, string name)
{
    string value;
    // A null value is treated exactly like a missing key (as the allowed-values overload
    // already does); returning null here would only defer the failure to the caller.
    if (!args.TryGetValue(name, out value) || value == null)
    {
        throw new ArgumentException("Configuration Error: missing parameter '" + name + "'");
    }
    args.Remove(name);
    return value;
}
/// <summary>
/// Gets the value of the required argument <paramref name="name"/>, which must match one of
/// <paramref name="allowedValues"/> (case-sensitively), and consumes it from <paramref name="args"/>.
/// </summary>
/// <exception cref="ArgumentException"> if the argument is missing, <c>null</c>, or not an allowed value </exception>
public virtual string Require(IDictionary<string, string> args, string name, ICollection<string> allowedValues)
{
    return Require(args, name, allowedValues, true);
}

/// <summary>
/// Gets the value of the required argument <paramref name="name"/>, which must match one of
/// <paramref name="allowedValues"/>, and consumes it from <paramref name="args"/>.
/// </summary>
/// <param name="args"> the argument dictionary; the entry is removed once it has been read </param>
/// <param name="name"> the argument key </param>
/// <param name="allowedValues"> the values accepted for this argument </param>
/// <param name="caseSensitive"> <c>true</c> for ordinal comparison, <c>false</c> for ordinal ignore-case </param>
/// <returns> the value as supplied (not normalized to the matching allowed value) </returns>
/// <exception cref="ArgumentException"> if the argument is missing, <c>null</c>, or not an allowed value </exception>
public virtual string Require(IDictionary<string, string> args, string name, ICollection<string> allowedValues,
    bool caseSensitive)
{
    string value;
    if (!args.TryGetValue(name, out value) || value == null)
    {
        throw new ArgumentException("Configuration Error: missing parameter '" + name + "'");
    }
    args.Remove(name);
    StringComparison comparison = caseSensitive
        ? StringComparison.Ordinal
        : StringComparison.OrdinalIgnoreCase;
    foreach (string allowedValue in allowedValues)
    {
        if (value.Equals(allowedValue, comparison))
        {
            return value;
        }
    }
    // Concatenating the collection itself would only print its type name,
    // so spell out the permitted values explicitly.
    throw new ArgumentException("Configuration Error: '" + name + "' value must be one of [" +
        string.Join(", ", allowedValues) + "]");
}
/// <summary>
/// Gets the value of the optional argument <paramref name="name"/>, consuming (removing) the
/// entry from <paramref name="args"/> when the key is present.
/// </summary>
/// <param name="args"> the argument dictionary </param>
/// <param name="name"> the argument key </param>
/// <param name="defaultVal"> the value returned when the key is absent or its value is <c>null</c> </param>
/// <returns> the stored value, or <paramref name="defaultVal"/> </returns>
public virtual string Get(IDictionary<string, string> args, string name, string defaultVal = null)
{
    string value;
    bool present = args.TryGetValue(name, out value);
    if (present)
    {
        args.Remove(name);
    }
    if (value == null)
    {
        return defaultVal;
    }
    return value;
}
/// <summary>
/// Gets the value of the optional argument <paramref name="name"/>, which must match one of
/// <paramref name="allowedValues"/> (case-sensitively); returns <c>null</c> when it is not supplied.
/// </summary>
public virtual string Get(IDictionary<string, string> args, string name, ICollection<string> allowedValues)
{
    return Get(args, name, allowedValues, null); // defaultVal = null
}

/// <summary>
/// Gets the value of the optional argument <paramref name="name"/>, which must match one of
/// <paramref name="allowedValues"/> (case-sensitively); returns <paramref name="defaultVal"/>
/// when it is not supplied.
/// </summary>
public virtual string Get(IDictionary<string, string> args, string name, ICollection<string> allowedValues, string defaultVal)
{
    return Get(args, name, allowedValues, defaultVal, true);
}

/// <summary>
/// Gets the value of the optional argument <paramref name="name"/>, which must match one of
/// <paramref name="allowedValues"/>, consuming the entry from <paramref name="args"/> when present.
/// </summary>
/// <param name="args"> the argument dictionary </param>
/// <param name="name"> the argument key </param>
/// <param name="allowedValues"> the values accepted for this argument </param>
/// <param name="defaultVal"> returned (without validation) when the key is absent or its value is <c>null</c> </param>
/// <param name="caseSensitive"> <c>true</c> for ordinal comparison, <c>false</c> for ordinal ignore-case </param>
/// <returns> the value as supplied, or <paramref name="defaultVal"/> </returns>
/// <exception cref="ArgumentException"> if a value is supplied that is not an allowed value </exception>
public virtual string Get(IDictionary<string, string> args, string name, ICollection<string> allowedValues, string defaultVal, bool caseSensitive)
{
    string value;
    if (args.TryGetValue(name, out value))
    {
        // Consume the key even when its value is null, consistent with the plain Get overload.
        args.Remove(name);
    }
    if (value == null)
    {
        return defaultVal;
    }
    StringComparison comparison = caseSensitive
        ? StringComparison.Ordinal
        : StringComparison.OrdinalIgnoreCase;
    foreach (string allowedValue in allowedValues)
    {
        if (value.Equals(allowedValue, comparison))
        {
            return value;
        }
    }
    // Concatenating the collection itself would only print its type name,
    // so spell out the permitted values explicitly.
    throw new System.ArgumentException("Configuration Error: '" + name + "' value must be one of [" +
        string.Join(", ", allowedValues) + "]");
}
/// <summary>
/// Reads the required argument <paramref name="name"/> via <c>Require</c> and parses it as an
/// <see cref="int"/> using the invariant culture.
/// <para/>
/// NOTE: This was requireInt() in Lucene
/// </summary>
protected int RequireInt32(IDictionary<string, string> args, string name)
{
    string text = Require(args, name);
    return int.Parse(text, CultureInfo.InvariantCulture);
}
/// <summary>
/// Reads the optional argument <paramref name="name"/>, consuming it from <paramref name="args"/>,
/// and parses it as an <see cref="int"/> using the invariant culture.
/// <para/>
/// NOTE: This was getInt() in Lucene
/// </summary>
/// <param name="args"> the argument dictionary </param>
/// <param name="name"> the argument key </param>
/// <param name="defaultVal"> returned when the key is absent or its value is <c>null</c> </param>
/// <exception cref="FormatException"> if the value is not a valid integer </exception>
protected int GetInt32(IDictionary<string, string> args, string name, int defaultVal)
{
    string text;
    if (!args.TryGetValue(name, out text))
    {
        return defaultVal;
    }
    args.Remove(name);
    // A null value means "not configured"; int.Parse(null) would throw ArgumentNullException.
    return text == null ? defaultVal : int.Parse(text, CultureInfo.InvariantCulture);
}
/// <summary>
/// Reads the required argument <paramref name="name"/> via <c>Require</c> and parses it
/// with <see cref="bool.Parse(string)"/>.
/// </summary>
protected bool RequireBoolean(IDictionary<string, string> args, string name)
{
    string text = Require(args, name);
    return bool.Parse(text);
}
/// <summary>
/// Reads the optional argument <paramref name="name"/>, consuming it from <paramref name="args"/>,
/// and parses it with <see cref="bool.Parse(string)"/>.
/// </summary>
/// <param name="args"> the argument dictionary </param>
/// <param name="name"> the argument key </param>
/// <param name="defaultVal"> returned when the key is absent or its value is <c>null</c> </param>
/// <exception cref="FormatException"> if the value is not a valid boolean literal </exception>
protected bool GetBoolean(IDictionary<string, string> args, string name, bool defaultVal)
{
    string text;
    if (!args.TryGetValue(name, out text))
    {
        return defaultVal;
    }
    args.Remove(name);
    // A null value means "not configured"; bool.Parse(null) would throw ArgumentNullException.
    return text == null ? defaultVal : bool.Parse(text);
}
/// <summary>
/// Reads the required argument <paramref name="name"/> via <c>Require</c> and parses it as a
/// <see cref="float"/> using the invariant culture.
/// <para/>
/// NOTE: This was requireFloat() in Lucene
/// </summary>
protected float RequireSingle(IDictionary<string, string> args, string name)
{
    string text = Require(args, name);
    return float.Parse(text, CultureInfo.InvariantCulture);
}
/// <summary>
/// Reads the optional argument <paramref name="name"/>, consuming it from <paramref name="args"/>,
/// and parses it as a <see cref="float"/> using the invariant culture.
/// <para/>
/// NOTE: This was getFloat() in Lucene
/// </summary>
/// <param name="args"> the argument dictionary </param>
/// <param name="name"> the argument key </param>
/// <param name="defaultVal"> returned when the key is absent or its value is <c>null</c> </param>
/// <exception cref="FormatException"> if the value is not a valid number </exception>
protected float GetSingle(IDictionary<string, string> args, string name, float defaultVal)
{
    string text;
    if (!args.TryGetValue(name, out text))
    {
        return defaultVal;
    }
    args.Remove(name);
    // A null value means "not configured"; float.Parse(null) would throw ArgumentNullException.
    return text == null ? defaultVal : float.Parse(text, CultureInfo.InvariantCulture);
}
/// <summary>
/// Reads the required argument <paramref name="name"/> via <c>Require</c> and returns its
/// first character.
/// </summary>
/// <param name="args"> the argument dictionary </param>
/// <param name="name"> the argument key </param>
/// <returns> the first character of the value (any further characters are ignored) </returns>
/// <exception cref="ArgumentException"> if the argument is missing or its value is empty </exception>
public virtual char RequireChar(IDictionary<string, string> args, string name)
{
    string text = Require(args, name);
    // Indexing an empty (or null) value would surface as IndexOutOfRangeException /
    // NullReferenceException; report it as a configuration problem instead.
    if (string.IsNullOrEmpty(text))
    {
        throw new ArgumentException(name + " should be a char. \"" + text + "\" is invalid");
    }
    return text[0];
}
/// <summary>
/// Reads the optional single-character argument <paramref name="name"/>, consuming it from
/// <paramref name="args"/>.
/// </summary>
/// <param name="args"> the argument dictionary </param>
/// <param name="name"> the argument key </param>
/// <param name="defaultVal"> returned when the key is absent or its value is <c>null</c> </param>
/// <exception cref="ArgumentException"> if the value is not exactly one character long </exception>
public virtual char GetChar(IDictionary<string, string> args, string name, char defaultVal)
{
    string text;
    if (!args.TryGetValue(name, out text))
    {
        return defaultVal;
    }
    args.Remove(name);
    if (text == null)
    {
        // A null value means "not configured"; dereferencing it would throw NullReferenceException.
        return defaultVal;
    }
    if (text.Length != 1)
    {
        throw new System.ArgumentException(name + " should be a char. \"" + text + "\" is invalid");
    }
    return text[0];
}
/// <summary>
/// Matches one item: a maximal run of characters that are neither commas nor whitespace.
/// </summary>
private static readonly Regex ITEM_PATTERN = new Regex("[^,\\s]+", RegexOptions.Compiled);
/// <summary>
/// Returns whitespace- and/or comma-separated set of values, or null if none are found.
/// The entry is consumed (removed) from <paramref name="args"/> when the key is present.
/// </summary>
/// <param name="args"> the argument dictionary </param>
/// <param name="name"> the argument key </param>
/// <returns> the set of items, or <c>null</c> if the key is absent, its value is <c>null</c>,
/// or the value contains no items </returns>
public virtual ISet<string> GetSet(IDictionary<string, string> args, string name)
{
    string text;
    if (!args.TryGetValue(name, out text))
    {
        return null;
    }
    args.Remove(name);
    if (text == null)
    {
        // A null value means "not configured"; Regex.Match(null) would throw ArgumentNullException.
        return null;
    }
    ISet<string> items = null;
    for (Match match = ITEM_PATTERN.Match(text); match.Success; match = match.NextMatch())
    {
        if (items == null)
        {
            // Allocate lazily so that a value without any item still yields null.
            items = new JCG.HashSet<string>();
        }
        items.Add(match.Value);
    }
    return items;
}
/// <summary>
/// Compiles a pattern for the value of the specified argument key <paramref name="name"/>
/// </summary>
/// <param name="args"> the argument dictionary; the entry is consumed via <c>Require</c> </param>
/// <param name="name"> the argument key </param>
/// <returns> the compiled <see cref="Regex"/> </returns>
/// <exception cref="ArgumentException"> if the argument is missing, or if its value is not a
/// valid regular expression </exception>
protected Regex GetPattern(IDictionary<string, string> args, string name)
{
    // Fetch the value outside the try block so that a "missing parameter" error from
    // Require is reported as such instead of being relabeled as a parse failure.
    string pattern = Require(args, name);
    try
    {
        return new Regex(pattern, RegexOptions.Compiled);
    }
    catch (ArgumentException e)
    {
        // The Regex constructor signals an invalid (or null) pattern with ArgumentException.
        throw new System.ArgumentException("Configuration Error: '" + name + "' can not be parsed in " + this.GetType().Name, e);
    }
}
/// <summary>
/// Gets a <see cref="CultureInfo"/> value of the specified argument key <paramref name="name"/>.
/// The entry is consumed (removed) from <paramref name="args"/> when the key is present.
/// <para/>
/// To specify the invariant culture, pass the string <c>"invariant"</c>.
/// <para/>
/// LUCENENET specific
/// </summary>
/// <param name="args"> the argument dictionary </param>
/// <param name="name"> the argument key </param>
/// <param name="defaultVal"> returned when the key is absent or its value is <c>null</c> </param>
/// <exception cref="ArgumentException"> if the value cannot be turned into a <see cref="CultureInfo"/> </exception>
protected CultureInfo GetCulture(IDictionary<string, string> args, string name, CultureInfo defaultVal)
{
    string cultureName;
    if (!args.TryGetValue(name, out cultureName))
    {
        return defaultVal;
    }
    args.Remove(name);
    if (cultureName == null)
    {
        // A null value means "not configured"; it used to fail with a misleading
        // "can not be parsed" error caused by a NullReferenceException.
        return defaultVal;
    }
    if (cultureName.Equals("invariant", StringComparison.Ordinal))
    {
        return CultureInfo.InvariantCulture;
    }
    try
    {
        return new CultureInfo(cultureName);
    }
    catch (Exception e)
    {
        throw new System.ArgumentException("Configuration Error: '" + name + "' can not be parsed in " + this.GetType().Name, e);
    }
}
/// <summary>
/// Builds a <see cref="CharArraySet"/> from the word lists named in <paramref name="wordFiles"/>,
/// which can be a comma-separated list of filenames.
/// </summary>
/// <returns> the union of the words of all files, or <c>null</c> when
/// <paramref name="wordFiles"/> yields no file names </returns>
protected CharArraySet GetWordSet(IResourceLoader loader, string wordFiles, bool ignoreCase)
{
    AssureMatchVersion();
    IList<string> fileNames = SplitFileNames(wordFiles);
    if (fileNames.Count == 0)
    {
        return null;
    }
    // default stopwords list has 35 or so words, but maybe don't make it that
    // big to start
    CharArraySet result = new CharArraySet(m_luceneMatchVersion, fileNames.Count * 10, ignoreCase);
    foreach (string fileName in fileNames)
    {
        var lines = GetLines(loader, fileName.Trim());
        result.UnionWith(StopFilter.MakeStopSet(m_luceneMatchVersion, lines, ignoreCase));
    }
    return result;
}
/// <summary>
/// Opens <paramref name="resource"/> through <paramref name="loader"/> and returns its lines
/// as produced by <c>WordlistLoader.GetLines</c>, decoding the content as UTF-8.
/// </summary>
protected IList<string> GetLines(IResourceLoader loader, string resource)
{
    var input = loader.OpenResource(resource);
    return WordlistLoader.GetLines(input, Encoding.UTF8);
}
/// <summary>
/// Same as <see cref="GetWordSet(IResourceLoader, string, bool)"/>,
/// except the input is in snowball format.
/// </summary>
/// <returns> the words of all files, or <c>null</c> when
/// <paramref name="wordFiles"/> yields no file names </returns>
protected CharArraySet GetSnowballWordSet(IResourceLoader loader, string wordFiles, bool ignoreCase)
{
    AssureMatchVersion();
    IList<string> fileNames = SplitFileNames(wordFiles);
    if (fileNames.Count == 0)
    {
        return null;
    }
    // default stopwords list has 35 or so words, but maybe don't make it that
    // big to start
    CharArraySet result = new CharArraySet(m_luceneMatchVersion, fileNames.Count * 10, ignoreCase);
    foreach (string fileName in fileNames)
    {
        // Both the stream and the reader are disposed once the file has been loaded.
        using (Stream stream = loader.OpenResource(fileName.Trim()))
        using (TextReader reader = new StreamReader(stream, Encoding.UTF8))
        {
            WordlistLoader.GetSnowballWordSet(reader, result);
        }
    }
    return result;
}
/// <summary>
/// Splits file names separated by comma character.
/// File names can contain comma characters escaped by backslash '\'
/// </summary>
/// <param name="fileNames"> the string containing file names </param>
/// <returns> a list of file names with the escaping backslashes removed; an empty list when
/// <paramref name="fileNames"/> is <c>null</c> </returns>
protected IList<string> SplitFileNames(string fileNames)
{
    if (fileNames == null)
    {
        return Collections.EmptyList<string>();
    }
    // Split on commas that are not preceded by a backslash, then drop the
    // backslash from every escaped comma that remains inside a name.
    return Regex.Split(fileNames, "(?<!\\\\),")
        .Select(part => Regex.Replace(part, "\\\\(?=,)", ""))
        .ToList();
}
/// <summary>
/// Key of the argument that carries the concrete class name.
/// </summary>
private const string CLASS_NAME = "class";
/// <returns> the string used to specify the concrete class name in a serialized representation: the class arg.
/// If the concrete class name was not specified via a class arg, returns <c>GetType().Name</c>. </returns>
public virtual string GetClassArg()
{
    string className;
    // Use TryGetValue rather than the indexer: the dictionary indexer throws
    // KeyNotFoundException for an absent key, which would defeat the documented fallback.
    if (originalArgs != null
        && originalArgs.TryGetValue(CLASS_NAME, out className)
        && className != null)
    {
        return className;
    }
    return this.GetType().Name;
}
/// <summary>
/// Gets or sets the explicit-match-version flag; the constructor initializes it to <c>false</c>.
/// NOTE(review): presumably indicates whether the Lucene match version was explicitly
/// configured for this factory - confirm against the code that sets it.
/// </summary>
public virtual bool IsExplicitLuceneMatchVersion { get; set; }
}
}