// blob: d7919fd390820a83e4ce3b8cee7166a04c5462fb [file] [log] [blame]
using Lucene.Net.Analysis.Standard;
using Lucene.Net.Analysis.TokenAttributes;
using Lucene.Net.Util;
using NUnit.Framework;
using System;
using System.IO;
namespace Lucene.Net.Analysis.Core
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/// <summary>
/// Exercises the basic analyzers (<see cref="SimpleAnalyzer"/>,
/// <see cref="WhitespaceAnalyzer"/>, <see cref="StopAnalyzer"/>) and the
/// case-folding filters/tokenizers, including payload copying and the
/// handling of supplementary characters and unpaired surrogates.
/// </summary>
public class TestAnalyzers : BaseTokenStreamTestCase
{
    [Test]
    public virtual void TestSimple()
    {
        // SimpleAnalyzer splits on non-letters and lowercases each token.
        Analyzer analyzer = new SimpleAnalyzer(TEST_VERSION_CURRENT);
        AssertAnalyzesTo(analyzer, "foo bar FOO BAR", new[] { "foo", "bar", "foo", "bar" });
        AssertAnalyzesTo(analyzer, "foo bar . FOO <> BAR", new[] { "foo", "bar", "foo", "bar" });
        AssertAnalyzesTo(analyzer, "foo.bar.FOO.BAR", new[] { "foo", "bar", "foo", "bar" });
        AssertAnalyzesTo(analyzer, "U.S.A.", new[] { "u", "s", "a" });
        AssertAnalyzesTo(analyzer, "C++", new[] { "c" });
        AssertAnalyzesTo(analyzer, "B2B", new[] { "b", "b" });
        AssertAnalyzesTo(analyzer, "2B", new[] { "b" });
        AssertAnalyzesTo(analyzer, "\"QUOTED\" word", new[] { "quoted", "word" });
    }

    [Test]
    public virtual void TestNull()
    {
        // WhitespaceAnalyzer splits only on whitespace and leaves case intact.
        Analyzer analyzer = new WhitespaceAnalyzer(TEST_VERSION_CURRENT);
        AssertAnalyzesTo(analyzer, "foo bar FOO BAR", new[] { "foo", "bar", "FOO", "BAR" });
        AssertAnalyzesTo(analyzer, "foo bar . FOO <> BAR", new[] { "foo", "bar", ".", "FOO", "<>", "BAR" });
        AssertAnalyzesTo(analyzer, "foo.bar.FOO.BAR", new[] { "foo.bar.FOO.BAR" });
        AssertAnalyzesTo(analyzer, "U.S.A.", new[] { "U.S.A." });
        AssertAnalyzesTo(analyzer, "C++", new[] { "C++" });
        AssertAnalyzesTo(analyzer, "B2B", new[] { "B2B" });
        AssertAnalyzesTo(analyzer, "2B", new[] { "2B" });
        AssertAnalyzesTo(analyzer, "\"QUOTED\" word", new[] { "\"QUOTED\"", "word" });
    }

    [Test]
    public virtual void TestStop()
    {
        // StopAnalyzer removes common English stop words ("a", "such", "these", ...).
        Analyzer analyzer = new StopAnalyzer(TEST_VERSION_CURRENT);
        AssertAnalyzesTo(analyzer, "foo bar FOO BAR", new[] { "foo", "bar", "foo", "bar" });
        AssertAnalyzesTo(analyzer, "foo a bar such FOO THESE BAR", new[] { "foo", "bar", "foo", "bar" });
    }

    /// <summary>
    /// Consumes <paramref name="ts"/> and checks that each token carries a
    /// one-byte payload whose value counts up from 1.
    /// </summary>
    internal virtual void VerifyPayload(TokenStream ts)
    {
        IPayloadAttribute payloadAtt = ts.GetAttribute<IPayloadAttribute>();
        ts.Reset();
        for (sbyte expected = 1; ; expected++)
        {
            if (!ts.IncrementToken())
            {
                break;
            }
            assertEquals(expected, payloadAtt.Payload.Bytes[0]);
        }
    }

    /// <summary>
    /// Make sure old style next() calls result in a new copy of payloads.
    /// </summary>
    [Test]
    public virtual void TestPayloadCopy()
    {
        const string text = "how now brown cow";
        // Run the same payload check twice over fresh tokenizer chains.
        TokenStream stream = new PayloadSetter(new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(text)));
        VerifyPayload(stream);
        stream = new PayloadSetter(new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(text)));
        VerifyPayload(stream);
    }

    // LUCENE-1150: compile-time-only check (hence no [Test] attribute and the
    // leading underscore) that the StandardTokenizer constants stay public.
    public virtual void _TestStandardConstants()
    {
#pragma warning disable 219, 612, 618
        int type = StandardTokenizer.ALPHANUM;
        type = StandardTokenizer.APOSTROPHE;
        type = StandardTokenizer.ACRONYM;
        type = StandardTokenizer.COMPANY;
        type = StandardTokenizer.EMAIL;
        type = StandardTokenizer.HOST;
        type = StandardTokenizer.NUM;
        type = StandardTokenizer.CJ;
        string[] allTypes = StandardTokenizer.TOKEN_TYPES;
#pragma warning restore 219, 612, 618
    }

    // Whitespace tokenization followed by lowercasing.
    private static readonly Analyzer LOWERCASE_WHITESPACE_ANALYZER = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
    {
        Tokenizer source = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
        return new TokenStreamComponents(source, new LowerCaseFilter(TEST_VERSION_CURRENT, source));
    });

    // Whitespace tokenization followed by uppercasing.
    private static readonly Analyzer UPPERCASE_WHITESPACE_ANALYZER = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
    {
        Tokenizer source = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
        return new TokenStreamComponents(source, new UpperCaseFilter(TEST_VERSION_CURRENT, source));
    });

    /// <summary>
    /// Test that LowercaseFilter handles entire unicode range correctly.
    /// </summary>
    [Test]
    public virtual void TestLowerCaseFilter()
    {
        Analyzer analyzer = LOWERCASE_WHITESPACE_ANALYZER;
        // BMP
        AssertAnalyzesTo(analyzer, "AbaCaDabA", new[] { "abacadaba" });
        // supplementary
        AssertAnalyzesTo(analyzer, "\ud801\udc16\ud801\udc16\ud801\udc16\ud801\udc16", new[] { "\ud801\udc3e\ud801\udc3e\ud801\udc3e\ud801\udc3e" });
        AssertAnalyzesTo(analyzer, "AbaCa\ud801\udc16DabA", new[] { "abaca\ud801\udc3edaba" });
        // unpaired lead surrogate
        AssertAnalyzesTo(analyzer, "AbaC\uD801AdaBa", new[] { "abac\uD801adaba" });
        // unpaired trail surrogate
        AssertAnalyzesTo(analyzer, "AbaC\uDC16AdaBa", new[] { "abac\uDC16adaba" });
    }

    /// <summary>
    /// Test that UpperCaseFilter handles entire unicode range correctly.
    /// </summary>
    [Test]
    public virtual void TestUpperCaseFilter()
    {
        Analyzer analyzer = UPPERCASE_WHITESPACE_ANALYZER;
        // BMP
        AssertAnalyzesTo(analyzer, "AbaCaDabA", new[] { "ABACADABA" });
        // supplementary
        AssertAnalyzesTo(analyzer, "\ud801\udc3e\ud801\udc3e\ud801\udc3e\ud801\udc3e", new[] { "\ud801\udc16\ud801\udc16\ud801\udc16\ud801\udc16" });
        AssertAnalyzesTo(analyzer, "AbaCa\ud801\udc3eDabA", new[] { "ABACA\ud801\udc16DABA" });
        // unpaired lead surrogate
        AssertAnalyzesTo(analyzer, "AbaC\uD801AdaBa", new[] { "ABAC\uD801ADABA" });
        // unpaired trail surrogate
        AssertAnalyzesTo(analyzer, "AbaC\uDC16AdaBa", new[] { "ABAC\uDC16ADABA" });
    }

    /// <summary>
    /// Test that LowercaseFilter handles the lowercasing correctly if the term
    /// buffer has a trailing surrogate character leftover and the current term in
    /// the buffer ends with a corresponding leading surrogate.
    /// </summary>
    [Test]
    public virtual void TestLowerCaseFilterLowSurrogateLeftover()
    {
        // First pass leaves an unpaired trail surrogate at the end of the term buffer.
        WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("BogustermBogusterm\udc16"));
        LowerCaseFilter filter = new LowerCaseFilter(TEST_VERSION_CURRENT, tokenizer);
        AssertTokenStreamContents(filter, new[] { "bogustermbogusterm\udc16" });

        // Second pass: term ends with a lead surrogate; the leftover trail
        // surrogate past the term's limit must not be treated as its pair.
        filter.Reset();
        string highSurEndingUpper = "BogustermBoguster\ud801";
        string highSurEndingLower = "bogustermboguster\ud801";
        tokenizer.SetReader(new StringReader(highSurEndingUpper));
        AssertTokenStreamContents(filter, new[] { highSurEndingLower });

        // The lead surrogate must survive unchanged at the end of the term buffer.
        assertTrue(filter.HasAttribute<ICharTermAttribute>());
        char[] termBuffer = filter.GetAttribute<ICharTermAttribute>().Buffer;
        int length = highSurEndingLower.Length;
        assertEquals('\ud801', termBuffer[length - 1]);
    }

    [Test]
    public virtual void TestLowerCaseTokenizer()
    {
        // Current behavior: supplementary characters are lowercased, not dropped.
        StringReader reader = new StringReader("Tokenizer \ud801\udc1ctest");
        LowerCaseTokenizer tokenizer = new LowerCaseTokenizer(TEST_VERSION_CURRENT, reader);
        AssertTokenStreamContents(tokenizer, new[] { "tokenizer", "\ud801\udc44test" });
    }

    [Test]
    [Obsolete("deprecated (3.1)")]
    public virtual void TestLowerCaseTokenizerBWCompat()
    {
        // Pre-3.1 behavior: the supplementary character is discarded.
        StringReader reader = new StringReader("Tokenizer \ud801\udc1ctest");
        LowerCaseTokenizer tokenizer = new LowerCaseTokenizer(LuceneVersion.LUCENE_30, reader);
        AssertTokenStreamContents(tokenizer, new[] { "tokenizer", "test" });
    }

    [Test]
    public virtual void TestWhitespaceTokenizer()
    {
        StringReader reader = new StringReader("Tokenizer \ud801\udc1ctest");
        WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
        AssertTokenStreamContents(tokenizer, new[] { "Tokenizer", "\ud801\udc1ctest" });
    }

    [Test]
    [Obsolete("deprecated (3.1)")]
    public virtual void TestWhitespaceTokenizerBWCompat()
    {
        StringReader reader = new StringReader("Tokenizer \ud801\udc1ctest");
        WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(LuceneVersion.LUCENE_30, reader);
        AssertTokenStreamContents(tokenizer, new[] { "Tokenizer", "\ud801\udc1ctest" });
    }

    /// <summary>
    /// blast some random strings through the analyzer </summary>
    [Test]
    public virtual void TestRandomStrings()
    {
        CheckRandomData(Random, new WhitespaceAnalyzer(TEST_VERSION_CURRENT), 1000 * RandomMultiplier);
        CheckRandomData(Random, new SimpleAnalyzer(TEST_VERSION_CURRENT), 1000 * RandomMultiplier);
        CheckRandomData(Random, new StopAnalyzer(TEST_VERSION_CURRENT), 1000 * RandomMultiplier);
    }

    /// <summary>
    /// blast some random large strings through the analyzer </summary>
    [Test]
    [Slow]
    public virtual void TestRandomHugeStrings()
    {
        Random random = Random;
        CheckRandomData(random, new WhitespaceAnalyzer(TEST_VERSION_CURRENT), 100 * RandomMultiplier, 8192);
        CheckRandomData(random, new SimpleAnalyzer(TEST_VERSION_CURRENT), 100 * RandomMultiplier, 8192);
        CheckRandomData(random, new StopAnalyzer(TEST_VERSION_CURRENT), 100 * RandomMultiplier, 8192);
    }
}
/// <summary>
/// Test filter that stamps every token with a one-byte payload whose value
/// increments per token. The same <see cref="BytesRef"/> (and underlying
/// byte[]) is deliberately reused across tokens so consumers must copy
/// payloads rather than hold references.
/// </summary>
internal sealed class PayloadSetter : TokenFilter
{
    internal IPayloadAttribute payloadAtt;
    internal byte[] data = new byte[1];
    internal BytesRef p;

    public PayloadSetter(TokenStream input) : base(input)
    {
        p = new BytesRef(data, 0, 1);
        payloadAtt = AddAttribute<IPayloadAttribute>();
    }

    public override bool IncrementToken()
    {
        if (!m_input.IncrementToken())
        {
            return false;
        }
        payloadAtt.Payload = p; // reuse the payload / byte[]
        data[0]++;              // next token will carry an incremented byte
        return true;
    }
}
}