/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.standard;
import java.io.IOException;
import java.util.Arrays;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.MultiTerms;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.BytesRef;
/** Tests for {@link ClassicAnalyzer}. */
public class TestClassicAnalyzer extends BaseTokenStreamTestCase {
private Analyzer a;
@Override
public void setUp() throws Exception {
super.setUp();
a = new ClassicAnalyzer();
}
@Override
public void tearDown() throws Exception {
a.close();
super.tearDown();
}
public void testMaxTermLength() throws Exception {
ClassicAnalyzer sa = new ClassicAnalyzer();
sa.setMaxTokenLength(5);
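// with the limit set to 5, the 7-char token "toolong" is expected to be dropped entirely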
assertAnalyzesTo(sa, "ab cd toolong xy z", new String[]{"ab", "cd", "xy", "z"});
sa.close();
}
public void testMaxTermLength2() throws Exception {
ClassicAnalyzer sa = new ClassicAnalyzer();
assertAnalyzesTo(sa, "ab cd toolong xy z", new String[]{"ab", "cd", "toolong", "xy", "z"});
sa.setMaxTokenLength(5);
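// the dropped over-length token should still count toward position increments,
// hence the expected increment of 2 on "xy"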
assertAnalyzesTo(sa, "ab cd toolong xy z", new String[]{"ab", "cd", "xy", "z"}, new int[]{1, 1, 2, 1});
sa.close();
}
public void testMaxTermLength3() throws Exception {
char[] chars = new char[255];
Arrays.fill(chars, 'a');
String longTerm = new String(chars, 0, 255);
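// 255 chars matches the analyzer's default max token length, so longTerm is kept
// below while the 256-char variant is dropped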
assertAnalyzesTo(a, "ab cd " + longTerm + " xy z", new String[]{"ab", "cd", longTerm, "xy", "z"});
assertAnalyzesTo(a, "ab cd " + longTerm + "a xy z", new String[]{"ab", "cd", "xy", "z"});
}
public void testAlphanumeric() throws Exception {
// alphanumeric tokens
assertAnalyzesTo(a, "B2B", new String[]{"b2b"});
assertAnalyzesTo(a, "2B", new String[]{"2b"});
}
public void testUnderscores() throws Exception {
// underscores are delimiters, but not in email addresses (below)
assertAnalyzesTo(a, "word_having_underscore", new String[]{"word", "having", "underscore"});
assertAnalyzesTo(a, "word_with_underscore_and_stopwords", new String[]{"word", "underscore", "stopwords"});
}
public void testDelimiters() throws Exception {
// other delimiters: "-", "/", ","
assertAnalyzesTo(a, "some-dashed-phrase", new String[]{"some", "dashed", "phrase"});
assertAnalyzesTo(a, "dogs,chase,cats", new String[]{"dogs", "chase", "cats"});
assertAnalyzesTo(a, "ac/dc", new String[]{"ac", "dc"});
}
public void testApostrophes() throws Exception {
// internal apostrophes: O'Reilly, you're, O'Reilly's
// possessives are actually removed by ClassicFilter, not the tokenizer
assertAnalyzesTo(a, "O'Reilly", new String[]{"o'reilly"});
assertAnalyzesTo(a, "you're", new String[]{"you're"});
assertAnalyzesTo(a, "she's", new String[]{"she"});
assertAnalyzesTo(a, "Jim's", new String[]{"jim"});
assertAnalyzesTo(a, "don't", new String[]{"don't"});
assertAnalyzesTo(a, "O'Reilly's", new String[]{"o'reilly"});
}
public void testTSADash() throws Exception {
// t and s had been stopwords in Lucene <= 2.0, which made it impossible
// to correctly search for these terms:
assertAnalyzesTo(a, "s-class", new String[]{"s", "class"});
assertAnalyzesTo(a, "t-com", new String[]{"t", "com"});
// 'a' is still a stopword:
assertAnalyzesTo(a, "a-class", new String[]{"class"});
}
public void testCompanyNames() throws Exception {
// company names
assertAnalyzesTo(a, "AT&T", new String[]{"at&t"});
assertAnalyzesTo(a, "Excite@Home", new String[]{"excite@home"});
}
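// LUCENE-1140: a host token with a trailing dot used to trigger an NPE in the
// tokenizer; this test guards against a regression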
public void testLucene1140() throws Exception {
try {
ClassicAnalyzer analyzer = new ClassicAnalyzer();
assertAnalyzesTo(analyzer, "www.nutch.org.", new String[]{ "www.nutch.org" }, new String[] { "<HOST>" });
analyzer.close();
} catch (NullPointerException e) {
fail("Should not throw an NPE and it did");
}
}
public void testDomainNames() throws Exception {
// Current Lucene should not show the bug
ClassicAnalyzer a2 = new ClassicAnalyzer();
// domain names
assertAnalyzesTo(a2, "www.nutch.org", new String[]{"www.nutch.org"});
// Note the trailing dot. See https://issues.apache.org/jira/browse/LUCENE-1068.
// the following should be recognized as HOST:
assertAnalyzesTo(a2, "www.nutch.org.", new String[]{ "www.nutch.org" }, new String[] { "<HOST>" });
// 2.3 should show the bug. But, alas, it's obsolete, we don't support it.
// a2 = new ClassicAnalyzer(org.apache.lucene.util.Version.LUCENE_23);
// assertAnalyzesTo(a2, "www.nutch.org.", new String[]{ "wwwnutchorg" }, new String[] { "<ACRONYM>" });
// 2.4 should not show the bug. But, alas, it's also obsolete,
// so we check latest released (Robert's gonna break this on 4.0 soon :) )
a2.close();
a2 = new ClassicAnalyzer();
assertAnalyzesTo(a2, "www.nutch.org.", new String[]{ "www.nutch.org" }, new String[] { "<HOST>" });
a2.close();
}
public void testEMailAddresses() throws Exception {
// email addresses, possibly with underscores, periods, etc
assertAnalyzesTo(a, "test@example.com", new String[]{"test@example.com"});
assertAnalyzesTo(a, "first.lastname@example.com", new String[]{"first.lastname@example.com"});
assertAnalyzesTo(a, "first_lastname@example.com", new String[]{"first_lastname@example.com"});
}
public void testNumeric() throws Exception {
// floating point, serial, model numbers, ip addresses, etc.
// every other segment must have at least one digit
assertAnalyzesTo(a, "21.35", new String[]{"21.35"});
assertAnalyzesTo(a, "R2D2 C3PO", new String[]{"r2d2", "c3po"});
assertAnalyzesTo(a, "216.239.63.104", new String[]{"216.239.63.104"});
assertAnalyzesTo(a, "1-2-3", new String[]{"1-2-3"});
assertAnalyzesTo(a, "a1-b2-c3", new String[]{"a1-b2-c3"});
assertAnalyzesTo(a, "a1-b-c3", new String[]{"a1-b-c3"});
}
public void testTextWithNumbers() throws Exception {
// numbers
assertAnalyzesTo(a, "David has 5000 bones", new String[]{"david", "has", "5000", "bones"});
}
public void testVariousText() throws Exception {
// various
assertAnalyzesTo(a, "C embedded developers wanted", new String[]{"c", "embedded", "developers", "wanted"});
assertAnalyzesTo(a, "foo bar FOO BAR", new String[]{"foo", "bar", "foo", "bar"});
assertAnalyzesTo(a, "foo bar . FOO <> BAR", new String[]{"foo", "bar", "foo", "bar"});
assertAnalyzesTo(a, "\"QUOTED\" word", new String[]{"quoted", "word"});
}
public void testAcronyms() throws Exception {
// acronyms have their dots stripped
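// (the dot removal happens in ClassicFilter on <ACRONYM>-typed tokens)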
assertAnalyzesTo(a, "U.S.A.", new String[]{"usa"});
}
public void testCPlusPlusHash() throws Exception {
// It would be nice to change the grammar in StandardTokenizer.jj to make "C#" and "C++" end up as tokens.
assertAnalyzesTo(a, "C++", new String[]{"c"});
assertAnalyzesTo(a, "C#", new String[]{"c"});
}
public void testKorean() throws Exception {
// Korean words
assertAnalyzesTo(a, "안녕하세요 한글입니다", new String[]{"안녕하세요", "한글입니다"});
}
// Compliance with the "old" JavaCC-based analyzer, see:
// https://issues.apache.org/jira/browse/LUCENE-966#action_12516752
public void testComplianceFileName() throws Exception {
assertAnalyzesTo(a, "2004.jpg",
new String[]{"2004.jpg"},
new String[]{"<HOST>"});
}
public void testComplianceNumericIncorrect() throws Exception {
assertAnalyzesTo(a, "62.46",
new String[]{"62.46"},
new String[]{"<HOST>"});
}
public void testComplianceNumericLong() throws Exception {
assertAnalyzesTo(a, "978-0-94045043-1",
new String[]{"978-0-94045043-1"},
new String[]{"<NUM>"});
}
public void testComplianceNumericFile() throws Exception {
assertAnalyzesTo(
a,
"78academyawards/rules/rule02.html",
new String[]{"78academyawards/rules/rule02.html"},
new String[]{"<NUM>"});
}
public void testComplianceNumericWithUnderscores() throws Exception {
assertAnalyzesTo(
a,
"2006-03-11t082958z_01_ban130523_rtridst_0_ozabs",
new String[]{"2006-03-11t082958z_01_ban130523_rtridst_0_ozabs"},
new String[]{"<NUM>"});
}
public void testComplianceNumericWithDash() throws Exception {
assertAnalyzesTo(a, "mid-20th", new String[]{"mid-20th"},
new String[]{"<NUM>"});
}
public void testComplianceManyTokens() throws Exception {
assertAnalyzesTo(
a,
"/money.cnn.com/magazines/fortune/fortune_archive/2007/03/19/8402357/index.htm "
+ "safari-0-sheikh-zayed-grand-mosque.jpg",
new String[]{"money.cnn.com", "magazines", "fortune",
"fortune", "archive/2007/03/19/8402357", "index.htm",
"safari-0-sheikh", "zayed", "grand", "mosque.jpg"},
new String[]{"<HOST>", "<ALPHANUM>", "<ALPHANUM>",
"<ALPHANUM>", "<NUM>", "<HOST>", "<NUM>", "<ALPHANUM>",
"<ALPHANUM>", "<HOST>"});
}
public void testJava14BWCompatibility() throws Exception {
ClassicAnalyzer sa = new ClassicAnalyzer();
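// \u02C6 (modifier letter circumflex) is expected to act as a delimiter here,
// splitting the input into two "test" tokens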
assertAnalyzesTo(sa, "test\u02C6test", new String[] { "test", "test" });
sa.close();
}
/**
* Make sure we skip wicked long terms.
*/
public void testWickedLongTerm() throws IOException {
RAMDirectory dir = new RAMDirectory();
Analyzer analyzer = new ClassicAnalyzer();
IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(analyzer));
char[] chars = new char[IndexWriter.MAX_TERM_LENGTH];
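// MAX_TERM_LENGTH is the hard per-term limit enforced by IndexWriter (in UTF-8
// bytes, which equals chars here since 'x' is ASCII)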
Arrays.fill(chars, 'x');
Document doc = new Document();
final String bigTerm = new String(chars);
// This produces a too-long term:
String contents = "abc xyz x" + bigTerm + " another term";
doc.add(new TextField("content", contents, Field.Store.NO));
writer.addDocument(doc);
// Make sure we can add another normal document
doc = new Document();
doc.add(new TextField("content", "abc bbb ccc", Field.Store.NO));
writer.addDocument(doc);
writer.close();
IndexReader reader = DirectoryReader.open(dir);
// Make sure all terms < max size were indexed
assertEquals(2, reader.docFreq(new Term("content", "abc")));
assertEquals(1, reader.docFreq(new Term("content", "bbb")));
assertEquals(1, reader.docFreq(new Term("content", "term")));
assertEquals(1, reader.docFreq(new Term("content", "another")));
// Make sure position is still incremented when
// massive term is skipped:
PostingsEnum tps = MultiTerms.getTermPostingsEnum(reader,
"content",
new BytesRef("another"));
assertTrue(tps.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
assertEquals(1, tps.freq());
assertEquals(3, tps.nextPosition());
// Make sure the doc that has the massive term is in
// the index:
assertEquals("document with wicked long term should is not in the index!", 2, reader.numDocs());
reader.close();
// Make sure we can add a document with exactly the
// maximum length term, and search on that term:
doc = new Document();
doc.add(new TextField("content", bigTerm, Field.Store.NO));
ClassicAnalyzer sa = new ClassicAnalyzer();
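// the default 255-char token length limit would drop bigTerm, so raise it well above that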
sa.setMaxTokenLength(100000);
writer = new IndexWriter(dir, new IndexWriterConfig(sa));
writer.addDocument(doc);
writer.close();
reader = DirectoryReader.open(dir);
assertEquals(1, reader.docFreq(new Term("content", bigTerm)));
reader.close();
dir.close();
analyzer.close();
sa.close();
}
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
Analyzer analyzer = new ClassicAnalyzer();
checkRandomData(random(), analyzer, 200 * RANDOM_MULTIPLIER);
analyzer.close();
}
/** blast some random large strings through the analyzer */
public void testRandomHugeStrings() throws Exception {
Analyzer analyzer = new ClassicAnalyzer();
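// the 8192 argument is the maxWordLength handed to checkRandomData, so much
// longer random strings get exercised than with the default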
checkRandomData(random(), analyzer, 10 * RANDOM_MULTIPLIER, 8192);
analyzer.close();
}
}