blob: 90dc2336baf8b5a62d8b85281c67676680a35e0e [file] [log] [blame]
#if FEATURE_COLLATION
using Icu.Collation;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Core;
using Lucene.Net.Util;
using System;
using System.IO;
namespace Lucene.Net.Collation
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/// <summary>
/// <para>
/// Configures <see cref="KeywordTokenizer"/> with <see cref="CollationAttributeFactory"/>.
/// </para>
/// <para>
/// Converts the token into its <see cref="System.Globalization.SortKey"/>, and then
/// encodes the <see cref="System.Globalization.SortKey"/> either directly or with
/// <see cref="IndexableBinaryStringTools"/> (see version note below), to allow
/// it to be stored as an index term.
/// </para>
/// <para>
/// <strong>WARNING:</strong> Make sure you use exactly the same <see cref="Collator"/> at
/// index and query time -- <see cref="System.Globalization.SortKey"/> are only comparable when produced by
/// the same <see cref="Collator"/>. Since <c>java.text.RuleBasedCollators</c> are not
/// independently versioned, it is unsafe to search against stored
/// <see cref="System.Globalization.SortKey"/> unless the following are exactly the same (best practice is
/// to store this information with the index and check that they remain the
/// same at query time):
/// </para>
/// <list type="number">
/// <item><description>JVM vendor</description></item>
/// <item><description>JVM version, including patch version</description></item>
/// <item><description>
/// The language (and country and variant, if specified) of the Locale
/// used when constructing the collator via
/// <see cref="Collator.Create(System.Globalization.CultureInfo)"/>.
/// </description></item>
/// <item><description>
/// The collation strength used - see <see cref="Collator.Strength"/>
/// </description></item>
/// </list>
/// <para>
/// The <c>ICUCollationKeyAnalyzer</c> in the analysis-icu package
/// uses ICU4J's Collator, which makes
/// its version available, thus allowing collation to be versioned
/// independently from the JVM. ICUCollationKeyAnalyzer is also significantly
/// faster and generates significantly shorter keys than CollationKeyAnalyzer.
/// See <a href="http://site.icu-project.org/charts/collation-icu4j-sun"
/// >http://site.icu-project.org/charts/collation-icu4j-sun</a> for key
/// generation timing and key length comparisons between ICU4J and
/// <see cref="Collator"/> over several languages.
/// </para>
/// <para>
/// CollationKeys generated by <see cref="Collator"/> are not compatible
/// with those generated by ICU Collators. Specifically, if you use
/// CollationKeyAnalyzer to generate index terms, do not use
/// ICUCollationKeyAnalyzer on the query side, or vice versa.
/// </para>
/// <para>You must specify the required <see cref="LuceneVersion"/>
/// compatibility when creating <see cref="CollationKeyAnalyzer"/>:
/// <list type="bullet">
/// <item><description> As of 4.0, Collation Keys are directly encoded as bytes. Previous
/// versions will encode the bytes with <see cref="IndexableBinaryStringTools"/>.</description></item>
/// </list>
/// </para>
/// </summary>
public sealed class CollationKeyAnalyzer : Analyzer
{
    // Collator handed to CollationKeyFilter on the pre-4.0 code path.
    private readonly Collator collator;

    // Attribute factory built from the collator; handed to KeywordTokenizer on the 4.0+ code path.
    private readonly CollationAttributeFactory factory;

    // Compatibility version that selects which of the two code paths CreateComponents takes.
    private readonly LuceneVersion matchVersion;

    /// <summary>
    /// Create a new <see cref="CollationKeyAnalyzer"/>, using the specified collator.
    /// </summary>
    /// <param name="matchVersion"> See <see cref="CollationKeyAnalyzer"/> </param>
    /// <param name="collator"> <see cref="System.Globalization.SortKey"/> generator </param>
    public CollationKeyAnalyzer(LuceneVersion matchVersion, Collator collator)
    {
        this.matchVersion = matchVersion;
        this.collator = collator;
        this.factory = new CollationAttributeFactory(collator);
    }

    /// <summary>
    /// Create a new <see cref="CollationKeyAnalyzer"/>, using the specified collator and
    /// a match version of <c>LuceneVersion.LUCENE_31</c>.
    /// </summary>
    /// <param name="collator"> <see cref="System.Globalization.SortKey"/> generator </param>
    // The message is plain text on purpose: the compiler prints it verbatim in the
    // obsolete-usage diagnostic, so XML doc tags would show up as raw markup.
    [Obsolete("Use CollationKeyAnalyzer(LuceneVersion, Collator) and specify a version instead. This ctor will be removed in Lucene 5.0")]
    public CollationKeyAnalyzer(Collator collator)
        : this(LuceneVersion.LUCENE_31, collator)
    {
    }

    /// <summary>
    /// Builds the token stream components for a field from a single
    /// <see cref="KeywordTokenizer"/> reading <paramref name="reader"/>.
    /// </summary>
    /// <param name="fieldName"> Name of the field being analyzed; not used by this implementation. </param>
    /// <param name="reader"> Source of the text to tokenize. </param>
    /// <returns>
    /// When the match version is on or after <c>LUCENE_40</c>, components whose source and sink
    /// are both the tokenizer created with the collation attribute factory; otherwise components
    /// whose sink is a <c>CollationKeyFilter</c> wrapping a plain tokenizer.
    /// </returns>
    protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
    {
        // Obsolete-member warnings (CS0612/CS0618) are suppressed only around the version check.
#pragma warning disable 612, 618
        if (this.matchVersion.OnOrAfter(LuceneVersion.LUCENE_40))
#pragma warning restore 612, 618
        {
            // 4.0+ path: the tokenizer is built with the collation attribute factory,
            // so no further filter is chained after it.
            var tokenizer = new KeywordTokenizer(this.factory, reader, KeywordTokenizer.DEFAULT_BUFFER_SIZE);
            return new TokenStreamComponents(tokenizer, tokenizer);
        }
        else
        {
            // Pre-4.0 path: a plain tokenizer whose output goes through CollationKeyFilter.
            var tokenizer = new KeywordTokenizer(reader);
            return new TokenStreamComponents(tokenizer,
                // Obsolete-member warnings are suppressed for the CollationKeyFilter construction.
#pragma warning disable 612, 618
                new CollationKeyFilter(tokenizer, this.collator));
#pragma warning restore 612, 618
        }
    }
}
}
#endif