// blob: 2162dbb948d39813c9c3c849db40996d126d5531 [file] [log] [blame]
// lucene version compatibility level: 4.8.1
using ICU4N.Text;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Core;
using Lucene.Net.Support;
using Lucene.Net.Util;
using System;
using System.IO;
namespace Lucene.Net.Collation
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/// <summary>
/// Configures <see cref="KeywordTokenizer"/> with <see cref="ICUCollationAttributeFactory"/>.
/// </summary>
/// <remarks>
/// Converts the token into its <see cref="CollationKey"/>, and
/// then encodes the <see cref="CollationKey"/> either directly or with
/// <see cref="IndexableBinaryStringTools"/> (see <a href="#version">below</a>), to allow it to
/// be stored as an index term.
/// <para/>
/// <strong>WARNING:</strong> Make sure you use exactly the same <see cref="Collator"/> at
/// index and query time -- CollationKeys are only comparable when produced by
/// the same <see cref="Collator"/>. <see cref="RuleBasedCollator"/>s are
/// independently versioned, so it is safe to search against stored
/// <see cref="CollationKey"/>s if the following are exactly the same (best practice is
/// to store this information with the index and check that they remain the
/// same at query time):
/// <list type="number">
/// <item><description>Collator version - see <see cref="Collator"/> Version</description></item>
/// <item><description>The collation strength used - see <see cref="Collator.Strength"/></description></item>
/// </list>
/// <para/>
/// <see cref="CollationKey"/>s generated by ICU Collators are not compatible with those
/// generated by java.text.Collators. Specifically, if you use
/// <see cref="ICUCollationKeyAnalyzer"/> to generate index terms, do not use
/// CollationKeyAnalyzer on the query side, or vice versa.
/// <para/>
/// ICUCollationKeyAnalyzer is significantly faster and generates significantly
/// shorter keys than CollationKeyAnalyzer. See
/// <a href="http://site.icu-project.org/charts/collation-icu4j-sun"
/// >http://site.icu-project.org/charts/collation-icu4j-sun</a> for key
/// generation timing and key length comparisons between ICU4J and
/// java.text.Collator over several languages.
/// <para/>
/// <a name="version"/>
/// You must specify the required <see cref="LuceneVersion"/>
/// compatibility when creating <see cref="ICUCollationKeyAnalyzer"/>:
/// <list type="bullet">
/// <item><description>As of 4.0, <see cref="CollationKey"/>s are directly encoded as bytes. Previous
/// versions will encode the bytes with <see cref="IndexableBinaryStringTools"/>.</description></item>
/// </list>
/// </remarks>
[ExceptionToClassNameConvention]
public sealed class ICUCollationKeyAnalyzer : Analyzer
{
    // Collator passed to ICUCollationKeyFilter on the pre-4.0 compatibility path.
    private readonly Collator collator;

    // Attribute factory built from the collator; handed to KeywordTokenizer on the 4.0+ path.
    private readonly ICUCollationAttributeFactory factory;

    // Compatibility level that decides which of the two paths CreateComponents takes.
    private readonly LuceneVersion matchVersion;

    /// <summary>
    /// Create a new <see cref="ICUCollationKeyAnalyzer"/>, using the specified <paramref name="collator"/>.
    /// </summary>
    /// <param name="matchVersion">See <see cref="ICUCollationKeyAnalyzer"/>.</param>
    /// <param name="collator"><see cref="CollationKey"/> generator.</param>
    /// <exception cref="ArgumentNullException"><paramref name="collator"/> is <c>null</c>.</exception>
    public ICUCollationKeyAnalyzer(LuceneVersion matchVersion, Collator collator)
    {
        // Reject null up front so the failure is reported at construction with the
        // offending parameter name, rather than wherever the collator is first dereferenced.
        if (collator == null)
        {
            throw new ArgumentNullException(nameof(collator));
        }

        this.matchVersion = matchVersion;
        this.collator = collator;
        this.factory = new ICUCollationAttributeFactory(collator);
    }

    /// <summary>
    /// Create a new <see cref="ICUCollationKeyAnalyzer"/> with the compatibility level fixed to
    /// <see cref="LuceneVersion.LUCENE_31"/>.
    /// </summary>
    /// <param name="collator"><see cref="CollationKey"/> generator.</param>
    /// <exception cref="ArgumentNullException"><paramref name="collator"/> is <c>null</c>.</exception>
    [Obsolete("Use ICUCollationKeyAnalyzer.ICUCollationKeyAnalyzer(LuceneVersion, Collator) and specify a version instead. This ctor will be removed in Lucene 5.0")]
    public ICUCollationKeyAnalyzer(Collator collator)
        : this(LuceneVersion.LUCENE_31, collator)
    {
    }

    /// <summary>
    /// Builds the analysis chain for a field: a <see cref="KeywordTokenizer"/> reading from
    /// <paramref name="reader"/>.
    /// <para/>
    /// When the configured version is on or after <see cref="LuceneVersion.LUCENE_40"/>, the tokenizer
    /// is created with the <see cref="ICUCollationAttributeFactory"/> and is itself the end of the chain.
    /// Otherwise a default <see cref="KeywordTokenizer"/> is wrapped in an <see cref="ICUCollationKeyFilter"/>.
    /// </summary>
    /// <param name="fieldName">Name of the field being analyzed; not consulted by this analyzer.</param>
    /// <param name="reader">Source of the text to tokenize.</param>
    /// <returns>The <see cref="TokenStreamComponents"/> for the selected chain.</returns>
    protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
    {
        // LuceneVersion.LUCENE_40 is marked obsolete; the reference is intentional.
#pragma warning disable 612, 618
        bool useAttributeFactory = matchVersion.OnOrAfter(LuceneVersion.LUCENE_40);
#pragma warning restore 612, 618

        if (useAttributeFactory)
        {
            KeywordTokenizer collatingTokenizer = new KeywordTokenizer(factory, reader, KeywordTokenizer.DEFAULT_BUFFER_SIZE);
            return new TokenStreamComponents(collatingTokenizer, collatingTokenizer);
        }

        // Pre-4.0 compatibility: plain tokenizer followed by the (obsolete) collation key filter.
        KeywordTokenizer plainTokenizer = new KeywordTokenizer(reader);
#pragma warning disable 612, 618
        TokenStream keyFilter = new ICUCollationKeyFilter(plainTokenizer, collator);
#pragma warning restore 612, 618
        return new TokenStreamComponents(plainTokenizer, keyFilter);
    }
}
}