src/Lucene.Net.Tests.Analysis.Common/Analysis/Core/TestClassicAnalyzer.cs - lucenenet - Git at Google

 using Lucene.Net.Analysis.Standard;
 using Lucene.Net.Documents;
 using Lucene.Net.Index;
 using Lucene.Net.Search;
 using Lucene.Net.Store;
 using Lucene.Net.Support;
 using Lucene.Net.Util;
 using NUnit.Framework;
 using System;

 namespace Lucene.Net.Analysis.Core
 {
     /*
      * Licensed to the Apache Software Foundation (ASF) under one or more
      * contributor license agreements.  See the NOTICE file distributed with
      * this work for additional information regarding copyright ownership.
      * The ASF licenses this file to You under the Apache License, Version 2.0
      * (the "License"); you may not use this file except in compliance with
      * the License.  You may obtain a copy of the License at
      *
      *     http://www.apache.org/licenses/LICENSE-2.0
      *
      * Unless required by applicable law or agreed to in writing, software
      * distributed under the License is distributed on an "AS IS" BASIS,
      * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
      * See the License for the specific language governing permissions and
      * limitations under the License.
      */

     public class TestClassicAnalyzer : BaseTokenStreamTestCase
     {

         private Analyzer a = new ClassicAnalyzer(TEST_VERSION_CURRENT);

         [Test]
         public virtual void TestMaxTermLength()
         {
             ClassicAnalyzer sa = new ClassicAnalyzer(TEST_VERSION_CURRENT);
             sa.MaxTokenLength = 5;
             AssertAnalyzesTo(sa, "ab cd toolong xy z", new string[] { "ab", "cd", "xy", "z" });
         }

         [Test]
         public virtual void TestMaxTermLength2()
         {
             ClassicAnalyzer sa = new ClassicAnalyzer(TEST_VERSION_CURRENT);
             AssertAnalyzesTo(sa, "ab cd toolong xy z", new string[] { "ab", "cd", "toolong", "xy", "z" });
             sa.MaxTokenLength = 5;

             AssertAnalyzesTo(sa, "ab cd toolong xy z", new string[] { "ab", "cd", "xy", "z" }, new int[] { 1, 1, 2, 1 });
         }

         [Test]
         public virtual void TestMaxTermLength3()
         {
             char[] chars = new char[255];
             for (int i = 0; i < 255; i++)
             {
                 chars[i] = 'a';
             }
             string longTerm = new string(chars, 0, 255);

             AssertAnalyzesTo(a, "ab cd " + longTerm + " xy z", new string[] { "ab", "cd", longTerm, "xy", "z" });
             AssertAnalyzesTo(a, "ab cd " + longTerm + "a xy z", new string[] { "ab", "cd", "xy", "z" });
         }

         [Test]
         public virtual void TestAlphanumeric()
         {
             // alphanumeric tokens
             AssertAnalyzesTo(a, "B2B", new string[] { "b2b" });
             AssertAnalyzesTo(a, "2B", new string[] { "2b" });
         }

         [Test]
         public virtual void TestUnderscores()
         {
             // underscores are delimiters, but not in email addresses (below)
             AssertAnalyzesTo(a, "word_having_underscore", new string[] { "word", "having", "underscore" });
             AssertAnalyzesTo(a, "word_with_underscore_and_stopwords", new string[] { "word", "underscore", "stopwords" });
         }

         [Test]
         public virtual void TestDelimiters()
         {
             // other delimiters: "-", "/", ","
             AssertAnalyzesTo(a, "some-dashed-phrase", new string[] { "some", "dashed", "phrase" });
             AssertAnalyzesTo(a, "dogs,chase,cats", new string[] { "dogs", "chase", "cats" });
             AssertAnalyzesTo(a, "ac/dc", new string[] { "ac", "dc" });
         }

         [Test]
         public virtual void TestApostrophes()
         {
             // internal apostrophes: O'Reilly, you're, O'Reilly's
             // possessives are actually removed by StardardFilter, not the tokenizer
             AssertAnalyzesTo(a, "O'Reilly", new string[] { "o'reilly" });
             AssertAnalyzesTo(a, "you're", new string[] { "you're" });
             AssertAnalyzesTo(a, "she's", new string[] { "she" });
             AssertAnalyzesTo(a, "Jim's", new string[] { "jim" });
             AssertAnalyzesTo(a, "don't", new string[] { "don't" });
             AssertAnalyzesTo(a, "O'Reilly's", new string[] { "o'reilly" });
         }

         [Test]
         public virtual void TestTSADash()
         {
             // t and s had been stopwords in Lucene <= 2.0, which made it impossible
             // to correctly search for these terms:
             AssertAnalyzesTo(a, "s-class", new string[] { "s", "class" });
             AssertAnalyzesTo(a, "t-com", new string[] { "t", "com" });
             // 'a' is still a stopword:
             AssertAnalyzesTo(a, "a-class", new string[] { "class" });
         }

         [Test]
         public virtual void TestCompanyNames()
         {
             // company names
             AssertAnalyzesTo(a, "AT&T", new string[] { "at&t" });
             AssertAnalyzesTo(a, "Excite@Home", new string[] { "excite@home" });
         }

         [Test]
         public virtual void TestLucene1140()
         {
             try
             {
                 ClassicAnalyzer analyzer = new ClassicAnalyzer(TEST_VERSION_CURRENT);
                 AssertAnalyzesTo(analyzer, "www.nutch.org.", new string[] { "www.nutch.org" }, new string[] { "<HOST>" });
             }
             catch (System.NullReferenceException)
             {
                 fail("Should not throw an NPE and it did");
             }

         }

         [Test]
         public virtual void TestDomainNames()
         {
             // Current lucene should not show the bug
             ClassicAnalyzer a2 = new ClassicAnalyzer(TEST_VERSION_CURRENT);

             // domain names
             AssertAnalyzesTo(a2, "www.nutch.org", new string[] { "www.nutch.org" });
             //Notice the trailing .  See https://issues.apache.org/jira/browse/LUCENE-1068.
             // the following should be recognized as HOST:
             AssertAnalyzesTo(a2, "www.nutch.org.", new string[] { "www.nutch.org" }, new string[] { "<HOST>" });

             // 2.3 should show the bug. But, alas, it's obsolete, we don't support it.
             // a2 = new ClassicAnalyzer(org.apache.lucene.util.Version.LUCENE_23);
             // AssertAnalyzesTo(a2, "www.nutch.org.", new String[]{ "wwwnutchorg" }, new String[] { "<ACRONYM>" });

             // 2.4 should not show the bug. But, alas, it's also obsolete,
             // so we check latest released (Robert's gonna break this on 4.0 soon :) )
 #pragma warning disable 612, 618
             a2 = new ClassicAnalyzer(LuceneVersion.LUCENE_31);
 #pragma warning restore 612, 618
             AssertAnalyzesTo(a2, "www.nutch.org.", new string[] { "www.nutch.org" }, new string[] { "<HOST>" });
         }

         [Test]
         public virtual void TestEMailAddresses()
         {
             // email addresses, possibly with underscores, periods, etc
             AssertAnalyzesTo(a, "test@example.com", new string[] { "test@example.com" });
             AssertAnalyzesTo(a, "first.lastname@example.com", new string[] { "first.lastname@example.com" });
             AssertAnalyzesTo(a, "first_lastname@example.com", new string[] { "first_lastname@example.com" });
         }

         [Test]
         public virtual void TestNumeric()
         {
             // floating point, serial, model numbers, ip addresses, etc.
             // every other segment must have at least one digit
             AssertAnalyzesTo(a, "21.35", new string[] { "21.35" });
             AssertAnalyzesTo(a, "R2D2 C3PO", new string[] { "r2d2", "c3po" });
             AssertAnalyzesTo(a, "216.239.63.104", new string[] { "216.239.63.104" });
             AssertAnalyzesTo(a, "1-2-3", new string[] { "1-2-3" });
             AssertAnalyzesTo(a, "a1-b2-c3", new string[] { "a1-b2-c3" });
             AssertAnalyzesTo(a, "a1-b-c3", new string[] { "a1-b-c3" });
         }

         [Test]
         public virtual void TestTextWithNumbers()
         {
             // numbers
             AssertAnalyzesTo(a, "David has 5000 bones", new string[] { "david", "has", "5000", "bones" });
         }

         [Test]
         public virtual void TestVariousText()
         {
             // various
             AssertAnalyzesTo(a, "C embedded developers wanted", new string[] { "c", "embedded", "developers", "wanted" });
             AssertAnalyzesTo(a, "foo bar FOO BAR", new string[] { "foo", "bar", "foo", "bar" });
             AssertAnalyzesTo(a, "foo      bar .  FOO <> BAR", new string[] { "foo", "bar", "foo", "bar" });
             AssertAnalyzesTo(a, "\"QUOTED\" word", new string[] { "quoted", "word" });
         }

         [Test]
         public virtual void TestAcronyms()
         {
             // acronyms have their dots stripped
             AssertAnalyzesTo(a, "U.S.A.", new string[] { "usa" });
         }

         [Test]
         public virtual void TestCPlusPlusHash()
         {
             // It would be nice to change the grammar in StandardTokenizer.jj to make "C#" and "C++" end up as tokens.
             AssertAnalyzesTo(a, "C++", new string[] { "c" });
             AssertAnalyzesTo(a, "C#", new string[] { "c" });
         }

         [Test]
         public virtual void TestKorean()
         {
             // Korean words
             AssertAnalyzesTo(a, "안녕하세요 한글입니다", new string[] { "안녕하세요", "한글입니다" });
         }

         // Compliance with the "old" JavaCC-based analyzer, see:
         // https://issues.apache.org/jira/browse/LUCENE-966#action_12516752

         [Test]
         public virtual void TestComplianceFileName()
         {
             AssertAnalyzesTo(a, "2004.jpg", new string[] { "2004.jpg" }, new string[] { "<HOST>" });
         }

         [Test]
         public virtual void TestComplianceNumericIncorrect()
         {
             AssertAnalyzesTo(a, "62.46", new string[] { "62.46" }, new string[] { "<HOST>" });
         }

         [Test]
         public virtual void TestComplianceNumericLong()
         {
             AssertAnalyzesTo(a, "978-0-94045043-1", new string[] { "978-0-94045043-1" }, new string[] { "<NUM>" });
         }

         [Test]
         public virtual void TestComplianceNumericFile()
         {
             AssertAnalyzesTo(a, "78academyawards/rules/rule02.html", new string[] { "78academyawards/rules/rule02.html" }, new string[] { "<NUM>" });
         }

         [Test]
         public virtual void TestComplianceNumericWithUnderscores()
         {
             AssertAnalyzesTo(a, "2006-03-11t082958z_01_ban130523_rtridst_0_ozabs", new string[] { "2006-03-11t082958z_01_ban130523_rtridst_0_ozabs" }, new string[] { "<NUM>" });
         }

         [Test]
         public virtual void TestComplianceNumericWithDash()
         {
             AssertAnalyzesTo(a, "mid-20th", new string[] { "mid-20th" }, new string[] { "<NUM>" });
         }

         [Test]
         public virtual void TestComplianceManyTokens()
         {
             AssertAnalyzesTo(a, "/money.cnn.com/magazines/fortune/fortune_archive/2007/03/19/8402357/index.htm " + "safari-0-sheikh-zayed-grand-mosque.jpg", new string[] { "money.cnn.com", "magazines", "fortune", "fortune", "archive/2007/03/19/8402357", "index.htm", "safari-0-sheikh", "zayed", "grand", "mosque.jpg" }, new string[] { "<HOST>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<NUM>", "<HOST>", "<NUM>", "<ALPHANUM>", "<ALPHANUM>", "<HOST>" });
         }

         [Test]
         public virtual void TestJava14BWCompatibility()
         {
 #pragma warning disable 612, 618
             ClassicAnalyzer sa = new ClassicAnalyzer(LuceneVersion.LUCENE_30);
 #pragma warning restore 612, 618
             AssertAnalyzesTo(sa, "test\u02C6test", new string[] { "test", "test" });
         }

         /// <summary>
         /// Make sure we skip wicked long terms.
         /// </summary>
         [Test]
         public virtual void TestWickedLongTerm()
         {
             using (RAMDirectory dir = new RAMDirectory())
             {

                 char[] chars = new char[IndexWriter.MAX_TERM_LENGTH];
                 Arrays.Fill(chars, 'x');

                 string bigTerm = new string(chars);
                 Document doc = new Document();

                 using (IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new ClassicAnalyzer(TEST_VERSION_CURRENT))))
                 {
                     // This produces a too-long term:
                     string contents = "abc xyz x" + bigTerm + " another term";
                     doc.Add(new TextField("content", contents, Field.Store.NO));
                     writer.AddDocument(doc);

                     // Make sure we can add another normal document
                     doc = new Document();
                     doc.Add(new TextField("content", "abc bbb ccc", Field.Store.NO));
                     writer.AddDocument(doc);
                 }
 #pragma warning disable 612, 618
                 using (IndexReader reader = IndexReader.Open(dir))
 #pragma warning restore 612, 618
                 {

                     // Make sure all terms < max size were indexed
                     assertEquals(2, reader.DocFreq(new Term("content", "abc")));
                     assertEquals(1, reader.DocFreq(new Term("content", "bbb")));
                     assertEquals(1, reader.DocFreq(new Term("content", "term")));
                     assertEquals(1, reader.DocFreq(new Term("content", "another")));

                     // Make sure position is still incremented when
                     // massive term is skipped:
                     DocsAndPositionsEnum tps = MultiFields.GetTermPositionsEnum(reader, MultiFields.GetLiveDocs(reader), "content", new BytesRef("another"));
                     assertTrue(tps.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
                     assertEquals(1, tps.Freq);
                     assertEquals(3, tps.NextPosition());

                     // Make sure the doc that has the massive term is in
                     // the index:
                     assertEquals("document with wicked long term should is not in the index!", 2, reader.NumDocs);

                 }

                 // Make sure we can add a document with exactly the
                 // maximum length term, and search on that term:
                 doc = new Document();
                 doc.Add(new TextField("content", bigTerm, Field.Store.NO));
                 ClassicAnalyzer sa = new ClassicAnalyzer(TEST_VERSION_CURRENT);
                 sa.MaxTokenLength = 100000;
                 using (var writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, sa)))
                 {
                     writer.AddDocument(doc);
                 }
 #pragma warning disable 612, 618
                 using (var reader = IndexReader.Open(dir))
 #pragma warning restore 612, 618
                 {
                     assertEquals(1, reader.DocFreq(new Term("content", bigTerm)));
                 }

             }
         }

         /// <summary>
         /// blast some random strings through the analyzer </summary>
         [Test]
         public virtual void TestRandomStrings()
         {
             CheckRandomData(Random, new ClassicAnalyzer(TEST_VERSION_CURRENT), 1000 * RANDOM_MULTIPLIER);
         }

         /// <summary>
         /// blast some random large strings through the analyzer </summary>
         [Test]
         public virtual void TestRandomHugeStrings()
         {
             Random random = Random;
             CheckRandomData(random, new ClassicAnalyzer(TEST_VERSION_CURRENT), 100 * RANDOM_MULTIPLIER, 8192);
         }
     }
 }
	using Lucene.Net.Analysis.Standard;
	using Lucene.Net.Documents;
	using Lucene.Net.Index;
	using Lucene.Net.Search;
	using Lucene.Net.Store;
	using Lucene.Net.Support;
	using Lucene.Net.Util;
	using NUnit.Framework;
	using System;

	namespace Lucene.Net.Analysis.Core
	{
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	public class TestClassicAnalyzer : BaseTokenStreamTestCase
	{

	private Analyzer a = new ClassicAnalyzer(TEST_VERSION_CURRENT);

	[Test]
	public virtual void TestMaxTermLength()
	{
	ClassicAnalyzer sa = new ClassicAnalyzer(TEST_VERSION_CURRENT);
	sa.MaxTokenLength = 5;
	AssertAnalyzesTo(sa, "ab cd toolong xy z", new string[] { "ab", "cd", "xy", "z" });
	}

	[Test]
	public virtual void TestMaxTermLength2()
	{
	ClassicAnalyzer sa = new ClassicAnalyzer(TEST_VERSION_CURRENT);
	AssertAnalyzesTo(sa, "ab cd toolong xy z", new string[] { "ab", "cd", "toolong", "xy", "z" });
	sa.MaxTokenLength = 5;

	AssertAnalyzesTo(sa, "ab cd toolong xy z", new string[] { "ab", "cd", "xy", "z" }, new int[] { 1, 1, 2, 1 });
	}

	[Test]
	public virtual void TestMaxTermLength3()
	{
	char[] chars = new char[255];
	for (int i = 0; i < 255; i++)
	{
	chars[i] = 'a';
	}
	string longTerm = new string(chars, 0, 255);

	AssertAnalyzesTo(a, "ab cd " + longTerm + " xy z", new string[] { "ab", "cd", longTerm, "xy", "z" });
	AssertAnalyzesTo(a, "ab cd " + longTerm + "a xy z", new string[] { "ab", "cd", "xy", "z" });
	}

	[Test]
	public virtual void TestAlphanumeric()
	{
	// alphanumeric tokens
	AssertAnalyzesTo(a, "B2B", new string[] { "b2b" });
	AssertAnalyzesTo(a, "2B", new string[] { "2b" });
	}

	[Test]
	public virtual void TestUnderscores()
	{
	// underscores are delimiters, but not in email addresses (below)
	AssertAnalyzesTo(a, "word_having_underscore", new string[] { "word", "having", "underscore" });
	AssertAnalyzesTo(a, "word_with_underscore_and_stopwords", new string[] { "word", "underscore", "stopwords" });
	}

	[Test]
	public virtual void TestDelimiters()
	{
	// other delimiters: "-", "/", ","
	AssertAnalyzesTo(a, "some-dashed-phrase", new string[] { "some", "dashed", "phrase" });
	AssertAnalyzesTo(a, "dogs,chase,cats", new string[] { "dogs", "chase", "cats" });
	AssertAnalyzesTo(a, "ac/dc", new string[] { "ac", "dc" });
	}

	[Test]
	public virtual void TestApostrophes()
	{
	// internal apostrophes: O'Reilly, you're, O'Reilly's
	// possessives are actually removed by StardardFilter, not the tokenizer
	AssertAnalyzesTo(a, "O'Reilly", new string[] { "o'reilly" });
	AssertAnalyzesTo(a, "you're", new string[] { "you're" });
	AssertAnalyzesTo(a, "she's", new string[] { "she" });
	AssertAnalyzesTo(a, "Jim's", new string[] { "jim" });
	AssertAnalyzesTo(a, "don't", new string[] { "don't" });
	AssertAnalyzesTo(a, "O'Reilly's", new string[] { "o'reilly" });
	}

	[Test]
	public virtual void TestTSADash()
	{
	// t and s had been stopwords in Lucene <= 2.0, which made it impossible
	// to correctly search for these terms:
	AssertAnalyzesTo(a, "s-class", new string[] { "s", "class" });
	AssertAnalyzesTo(a, "t-com", new string[] { "t", "com" });
	// 'a' is still a stopword:
	AssertAnalyzesTo(a, "a-class", new string[] { "class" });
	}

	[Test]
	public virtual void TestCompanyNames()
	{
	// company names
	AssertAnalyzesTo(a, "AT&T", new string[] { "at&t" });
	AssertAnalyzesTo(a, "Excite@Home", new string[] { "excite@home" });
	}

	[Test]
	public virtual void TestLucene1140()
	{
	try
	{
	ClassicAnalyzer analyzer = new ClassicAnalyzer(TEST_VERSION_CURRENT);
	AssertAnalyzesTo(analyzer, "www.nutch.org.", new string[] { "www.nutch.org" }, new string[] { "<HOST>" });
	}
	catch (System.NullReferenceException)
	{
	fail("Should not throw an NPE and it did");
	}

	}

	[Test]
	public virtual void TestDomainNames()
	{
	// Current lucene should not show the bug
	ClassicAnalyzer a2 = new ClassicAnalyzer(TEST_VERSION_CURRENT);

	// domain names
	AssertAnalyzesTo(a2, "www.nutch.org", new string[] { "www.nutch.org" });
	//Notice the trailing . See https://issues.apache.org/jira/browse/LUCENE-1068.
	// the following should be recognized as HOST:
	AssertAnalyzesTo(a2, "www.nutch.org.", new string[] { "www.nutch.org" }, new string[] { "<HOST>" });

	// 2.3 should show the bug. But, alas, it's obsolete, we don't support it.
	// a2 = new ClassicAnalyzer(org.apache.lucene.util.Version.LUCENE_23);
	// AssertAnalyzesTo(a2, "www.nutch.org.", new String[]{ "wwwnutchorg" }, new String[] { "<ACRONYM>" });

	// 2.4 should not show the bug. But, alas, it's also obsolete,
	// so we check latest released (Robert's gonna break this on 4.0 soon :) )
	#pragma warning disable 612, 618
	a2 = new ClassicAnalyzer(LuceneVersion.LUCENE_31);
	#pragma warning restore 612, 618
	AssertAnalyzesTo(a2, "www.nutch.org.", new string[] { "www.nutch.org" }, new string[] { "<HOST>" });
	}

	[Test]
	public virtual void TestEMailAddresses()
	{
	// email addresses, possibly with underscores, periods, etc
	AssertAnalyzesTo(a, "test@example.com", new string[] { "test@example.com" });
	AssertAnalyzesTo(a, "first.lastname@example.com", new string[] { "first.lastname@example.com" });
	AssertAnalyzesTo(a, "first_lastname@example.com", new string[] { "first_lastname@example.com" });
	}

	[Test]
	public virtual void TestNumeric()
	{
	// floating point, serial, model numbers, ip addresses, etc.
	// every other segment must have at least one digit
	AssertAnalyzesTo(a, "21.35", new string[] { "21.35" });
	AssertAnalyzesTo(a, "R2D2 C3PO", new string[] { "r2d2", "c3po" });
	AssertAnalyzesTo(a, "216.239.63.104", new string[] { "216.239.63.104" });
	AssertAnalyzesTo(a, "1-2-3", new string[] { "1-2-3" });
	AssertAnalyzesTo(a, "a1-b2-c3", new string[] { "a1-b2-c3" });
	AssertAnalyzesTo(a, "a1-b-c3", new string[] { "a1-b-c3" });
	}

	[Test]
	public virtual void TestTextWithNumbers()
	{
	// numbers
	AssertAnalyzesTo(a, "David has 5000 bones", new string[] { "david", "has", "5000", "bones" });
	}

	[Test]
	public virtual void TestVariousText()
	{
	// various
	AssertAnalyzesTo(a, "C embedded developers wanted", new string[] { "c", "embedded", "developers", "wanted" });
	AssertAnalyzesTo(a, "foo bar FOO BAR", new string[] { "foo", "bar", "foo", "bar" });
	AssertAnalyzesTo(a, "foo bar . FOO <> BAR", new string[] { "foo", "bar", "foo", "bar" });
	AssertAnalyzesTo(a, "\"QUOTED\" word", new string[] { "quoted", "word" });
	}

	[Test]
	public virtual void TestAcronyms()
	{
	// acronyms have their dots stripped
	AssertAnalyzesTo(a, "U.S.A.", new string[] { "usa" });
	}

	[Test]
	public virtual void TestCPlusPlusHash()
	{
	// It would be nice to change the grammar in StandardTokenizer.jj to make "C#" and "C++" end up as tokens.
	AssertAnalyzesTo(a, "C++", new string[] { "c" });
	AssertAnalyzesTo(a, "C#", new string[] { "c" });
	}

	[Test]
	public virtual void TestKorean()
	{
	// Korean words
	AssertAnalyzesTo(a, "안녕하세요 한글입니다", new string[] { "안녕하세요", "한글입니다" });
	}

	// Compliance with the "old" JavaCC-based analyzer, see:
	// https://issues.apache.org/jira/browse/LUCENE-966#action_12516752

	[Test]
	public virtual void TestComplianceFileName()
	{
	AssertAnalyzesTo(a, "2004.jpg", new string[] { "2004.jpg" }, new string[] { "<HOST>" });
	}

	[Test]
	public virtual void TestComplianceNumericIncorrect()
	{
	AssertAnalyzesTo(a, "62.46", new string[] { "62.46" }, new string[] { "<HOST>" });
	}

	[Test]
	public virtual void TestComplianceNumericLong()
	{
	AssertAnalyzesTo(a, "978-0-94045043-1", new string[] { "978-0-94045043-1" }, new string[] { "<NUM>" });
	}

	[Test]
	public virtual void TestComplianceNumericFile()
	{
	AssertAnalyzesTo(a, "78academyawards/rules/rule02.html", new string[] { "78academyawards/rules/rule02.html" }, new string[] { "<NUM>" });
	}

	[Test]
	public virtual void TestComplianceNumericWithUnderscores()
	{
	AssertAnalyzesTo(a, "2006-03-11t082958z_01_ban130523_rtridst_0_ozabs", new string[] { "2006-03-11t082958z_01_ban130523_rtridst_0_ozabs" }, new string[] { "<NUM>" });
	}

	[Test]
	public virtual void TestComplianceNumericWithDash()
	{
	AssertAnalyzesTo(a, "mid-20th", new string[] { "mid-20th" }, new string[] { "<NUM>" });
	}

	[Test]
	public virtual void TestComplianceManyTokens()
	{
	AssertAnalyzesTo(a, "/money.cnn.com/magazines/fortune/fortune_archive/2007/03/19/8402357/index.htm " + "safari-0-sheikh-zayed-grand-mosque.jpg", new string[] { "money.cnn.com", "magazines", "fortune", "fortune", "archive/2007/03/19/8402357", "index.htm", "safari-0-sheikh", "zayed", "grand", "mosque.jpg" }, new string[] { "<HOST>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<NUM>", "<HOST>", "<NUM>", "<ALPHANUM>", "<ALPHANUM>", "<HOST>" });
	}

	[Test]
	public virtual void TestJava14BWCompatibility()
	{
	#pragma warning disable 612, 618
	ClassicAnalyzer sa = new ClassicAnalyzer(LuceneVersion.LUCENE_30);
	#pragma warning restore 612, 618
	AssertAnalyzesTo(sa, "test\u02C6test", new string[] { "test", "test" });
	}

	/// <summary>
	/// Make sure we skip wicked long terms.
	/// </summary>
	[Test]
	public virtual void TestWickedLongTerm()
	{
	using (RAMDirectory dir = new RAMDirectory())
	{

	char[] chars = new char[IndexWriter.MAX_TERM_LENGTH];
	Arrays.Fill(chars, 'x');

	string bigTerm = new string(chars);
	Document doc = new Document();

	using (IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new ClassicAnalyzer(TEST_VERSION_CURRENT))))
	{
	// This produces a too-long term:
	string contents = "abc xyz x" + bigTerm + " another term";
	doc.Add(new TextField("content", contents, Field.Store.NO));
	writer.AddDocument(doc);

	// Make sure we can add another normal document
	doc = new Document();
	doc.Add(new TextField("content", "abc bbb ccc", Field.Store.NO));
	writer.AddDocument(doc);
	}
	#pragma warning disable 612, 618
	using (IndexReader reader = IndexReader.Open(dir))
	#pragma warning restore 612, 618
	{

	// Make sure all terms < max size were indexed
	assertEquals(2, reader.DocFreq(new Term("content", "abc")));
	assertEquals(1, reader.DocFreq(new Term("content", "bbb")));
	assertEquals(1, reader.DocFreq(new Term("content", "term")));
	assertEquals(1, reader.DocFreq(new Term("content", "another")));

	// Make sure position is still incremented when
	// massive term is skipped:
	DocsAndPositionsEnum tps = MultiFields.GetTermPositionsEnum(reader, MultiFields.GetLiveDocs(reader), "content", new BytesRef("another"));
	assertTrue(tps.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
	assertEquals(1, tps.Freq);
	assertEquals(3, tps.NextPosition());

	// Make sure the doc that has the massive term is in
	// the index:
	assertEquals("document with wicked long term should is not in the index!", 2, reader.NumDocs);

	}

	// Make sure we can add a document with exactly the
	// maximum length term, and search on that term:
	doc = new Document();
	doc.Add(new TextField("content", bigTerm, Field.Store.NO));
	ClassicAnalyzer sa = new ClassicAnalyzer(TEST_VERSION_CURRENT);
	sa.MaxTokenLength = 100000;
	using (var writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, sa)))
	{
	writer.AddDocument(doc);
	}
	#pragma warning disable 612, 618
	using (var reader = IndexReader.Open(dir))
	#pragma warning restore 612, 618
	{
	assertEquals(1, reader.DocFreq(new Term("content", bigTerm)));
	}

	}
	}

	/// <summary>
	/// blast some random strings through the analyzer </summary>
	[Test]
	public virtual void TestRandomStrings()
	{
	CheckRandomData(Random, new ClassicAnalyzer(TEST_VERSION_CURRENT), 1000 * RANDOM_MULTIPLIER);
	}

	/// <summary>
	/// blast some random large strings through the analyzer </summary>
	[Test]
	public virtual void TestRandomHugeStrings()
	{
	Random random = Random;
	CheckRandomData(random, new ClassicAnalyzer(TEST_VERSION_CURRENT), 100 * RANDOM_MULTIPLIER, 8192);
	}
	}
	}