blob: ff6f9610ff7a7733ad8e91609a5a72344686850e [file] [log] [blame]
package org.apache.lucene.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.util.Version;
/**
* Testcase for {@link CharTokenizer} subclasses
*/
public class TestCharTokenizers extends BaseTokenStreamTestCase {
/*
* test to read surrogate pairs without losing the pairing
* if the surrogate pair is at the border of the internal IO buffer
*/
public void testReadSupplementaryChars() throws IOException {
  // randomized repetition count, scaled by the test multiplier
  int num = 1024 + random.nextInt(1024);
  num *= RANDOM_MULTIPLIER;
  final StringBuilder text = new StringBuilder();
  int count = 1;
  while (count < num) {
    text.append("\ud801\udc1cabc");
    // a blank after every tenth repetition separates the expected tokens
    if (count % 10 == 0) {
      text.append(" ");
    }
    count++;
  }
  // the internal buffer size is 1024: inserting a surrogate pair at offset 1023
  // puts its two halves on either side of the buffer border
  text.insert(1023, "\ud801\udc1c");
  final String input = text.toString();
  final String[] expected = input.toLowerCase().split(" ");
  assertTokenStreamContents(
      new LowerCaseTokenizer(TEST_VERSION_CURRENT, new StringReader(input)), expected);
}
/*
* test to extend the TermAttribute buffer internally. If the internal
* algorithm that extends the size of the char array only extends by 1 char and the
* next char to be filled in is a supplementary codepoint (using 2 chars) an
* index out of bounds exception is triggered.
*/
public void testExtendCharBuffer() throws IOException {
  // use prefixes of 1 to 40 'a' chars so that the supplementary code point
  // sits at a different offset of the input on every round
  for (int prefixLength = 1; prefixLength <= 40; prefixLength++) {
    final StringBuilder text = new StringBuilder();
    for (int filled = 0; filled < prefixLength; filled++) {
      text.append("a");
    }
    text.append("\ud801\udc1cabc");
    final String input = text.toString();
    // the whole input is expected back as one lower-cased token
    final String[] expected = { input.toLowerCase() };
    assertTokenStreamContents(
        new LowerCaseTokenizer(TEST_VERSION_CURRENT, new StringReader(input)), expected);
  }
}
/*
* tests the max word length of 255 - tokenizer will split at the 255th char no matter what happens
*/
public void testMaxWordLength() throws IOException {
  // build one run of exactly 255 upper-case letters
  final StringBuilder run = new StringBuilder();
  int remaining = 255;
  while (remaining-- > 0) {
    run.append("A");
  }
  final String word = run.toString();
  final String lowered = word.toLowerCase();
  // the run is fed in twice without a separator and is expected back
  // as two separate lower-cased tokens
  assertTokenStreamContents(
      new LowerCaseTokenizer(TEST_VERSION_CURRENT, new StringReader(word + word)),
      new String[] { lowered, lowered });
}
/*
* tests the max word length of 255 with a surrogate pair at position 255
*/
public void testMaxWordLengthWithSupplementary() throws IOException {
  // 254 upper-case letters followed by a single surrogate pair
  final StringBuilder run = new StringBuilder();
  int remaining = 254;
  while (remaining-- > 0) {
    run.append("A");
  }
  run.append("\ud801\udc1c");
  final String word = run.toString();
  final String lowered = word.toLowerCase();
  // the sequence is fed in twice without a separator and is expected back
  // as two separate lower-cased tokens
  assertTokenStreamContents(
      new LowerCaseTokenizer(TEST_VERSION_CURRENT, new StringReader(word + word)),
      new String[] { lowered, lowered });
}
public void testLowerCaseTokenizer() throws IOException {
  // with the current match version the supplementary code point is expected
  // to be kept and lower-cased (U+1041C becomes U+10444)
  final String[] expected = { "tokenizer", "\ud801\udc44test" };
  final LowerCaseTokenizer stream = new LowerCaseTokenizer(
      TEST_VERSION_CURRENT, new StringReader("Tokenizer \ud801\udc1ctest"));
  assertTokenStreamContents(stream, expected);
}
public void testLowerCaseTokenizerBWCompat() throws IOException {
  // with the 3.0 match version the supplementary code point is expected
  // to be missing from the second token
  final String[] expected = { "tokenizer", "test" };
  final LowerCaseTokenizer stream = new LowerCaseTokenizer(
      Version.LUCENE_30, new StringReader("Tokenizer \ud801\udc1ctest"));
  assertTokenStreamContents(stream, expected);
}
public void testWhitespaceTokenizer() throws IOException {
  // tokens are expected unchanged, split only at the blank
  final String[] expected = { "Tokenizer", "\ud801\udc1ctest" };
  final WhitespaceTokenizer stream = new WhitespaceTokenizer(
      TEST_VERSION_CURRENT, new StringReader("Tokenizer \ud801\udc1ctest"));
  assertTokenStreamContents(stream, expected);
}
public void testWhitespaceTokenizerBWCompat() throws IOException {
  // same expectation as with the current version: the 3.0 match version
  // must also keep the surrogate pair in the second token
  final String[] expected = { "Tokenizer", "\ud801\udc1ctest" };
  final WhitespaceTokenizer stream = new WhitespaceTokenizer(
      Version.LUCENE_30, new StringReader("Tokenizer \ud801\udc1ctest"));
  assertTokenStreamContents(stream, expected);
}
public void testIsTokenCharCharInSubclass() {
  // the 3.0 match version has to be accepted without an exception
  new TestingCharTokenizer(Version.LUCENE_30, new StringReader(""));
  // the current match version has to be refused with an IllegalArgumentException
  boolean rejected = false;
  try {
    new TestingCharTokenizer(TEST_VERSION_CURRENT, new StringReader(""));
  } catch (IllegalArgumentException expected) {
    rejected = true;
  }
  if (!rejected) {
    fail("version 3.1 is not permitted if char based method is implemented");
  }
}
public void testNormalizeCharInSubclass() {
  // the 3.0 match version has to be accepted without an exception
  new TestingCharTokenizerNormalize(Version.LUCENE_30, new StringReader(""));
  // the current match version has to be refused with an IllegalArgumentException
  boolean rejected = false;
  try {
    new TestingCharTokenizerNormalize(TEST_VERSION_CURRENT, new StringReader(""));
  } catch (IllegalArgumentException expected) {
    rejected = true;
  }
  if (!rejected) {
    fail("version 3.1 is not permitted if char based method is implemented");
  }
}
public void testNormalizeAndIsTokenCharCharInSubclass() {
  // the 3.0 match version has to be accepted without an exception
  new TestingCharTokenizerNormalizeIsTokenChar(Version.LUCENE_30, new StringReader(""));
  // the current match version has to be refused with an IllegalArgumentException
  boolean rejected = false;
  try {
    new TestingCharTokenizerNormalizeIsTokenChar(TEST_VERSION_CURRENT, new StringReader(""));
  } catch (IllegalArgumentException expected) {
    rejected = true;
  }
  if (!rejected) {
    fail("version 3.1 is not permitted if char based method is implemented");
  }
}
/**
 * Tokenizer fixture that overrides both the deprecated char based and the
 * int based {@code isTokenChar} method; only its constructor is exercised
 * by the tests of the enclosing class.
 */
static final class TestingCharTokenizer extends CharTokenizer {

  public TestingCharTokenizer(Version matchVersion, Reader input) {
    super(matchVersion, input);
  }

  /** Deprecated char based variant: letters are token chars. */
  @Deprecated
  @Override
  protected boolean isTokenChar(char c) {
    return Character.isLetter(c);
  }

  /** Code point based variant: letters are token chars. */
  @Override
  protected boolean isTokenChar(int c) {
    return Character.isLetter(c);
  }
}
/**
 * Tokenizer fixture that overrides both the deprecated char based and the
 * int based {@code normalize} method; only its constructor is exercised
 * by the tests of the enclosing class.
 */
static final class TestingCharTokenizerNormalize extends CharTokenizer {

  public TestingCharTokenizerNormalize(Version matchVersion, Reader input) {
    super(matchVersion, input);
  }

  /** Code point based variant: returns the code point unchanged. */
  @Override
  protected int normalize(int c) {
    return c;
  }

  /** Deprecated char based variant: returns the char unchanged. */
  @Deprecated
  @Override
  protected char normalize(char c) {
    return c;
  }
}
/**
 * Tokenizer fixture that overrides the deprecated char based and the int
 * based variants of both {@code normalize} and {@code isTokenChar}; only
 * its constructor is exercised by the tests of the enclosing class.
 */
static final class TestingCharTokenizerNormalizeIsTokenChar extends CharTokenizer {

  public TestingCharTokenizerNormalizeIsTokenChar(Version matchVersion, Reader input) {
    super(matchVersion, input);
  }

  /** Code point based variant: letters are token chars. */
  @Override
  protected boolean isTokenChar(int c) {
    return Character.isLetter(c);
  }

  /** Deprecated char based variant: letters are token chars. */
  @Deprecated
  @Override
  protected boolean isTokenChar(char c) {
    return Character.isLetter(c);
  }

  /** Code point based variant: returns the code point unchanged. */
  @Override
  protected int normalize(int c) {
    return c;
  }

  /** Deprecated char based variant: returns the char unchanged. */
  @Deprecated
  @Override
  protected char normalize(char c) {
    return c;
  }
}
}