| package org.apache.lucene.analysis; |
| |
| /** |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| import java.io.IOException; |
| import java.io.Reader; |
| import java.io.StringReader; |
| |
| import org.apache.lucene.util.Version; |
| |
| /** |
| * Testcase for {@link CharTokenizer} subclasses |
| */ |
| public class TestCharTokenizers extends BaseTokenStreamTestCase { |
| |
| /* |
| * test to read surrogate pairs without loosing the pairing |
| * if the surrogate pair is at the border of the internal IO buffer |
| */ |
| public void testReadSupplementaryChars() throws IOException { |
| StringBuilder builder = new StringBuilder(); |
| // create random input |
| int num = 1024 + random.nextInt(1024); |
| num *= RANDOM_MULTIPLIER; |
| for (int i = 1; i < num; i++) { |
| builder.append("\ud801\udc1cabc"); |
| if((i % 10) == 0) |
| builder.append(" "); |
| } |
| // internal buffer size is 1024 make sure we have a surrogate pair right at the border |
| builder.insert(1023, "\ud801\udc1c"); |
| LowerCaseTokenizer tokenizer = new LowerCaseTokenizer( |
| TEST_VERSION_CURRENT, new StringReader(builder.toString())); |
| assertTokenStreamContents(tokenizer, builder.toString().toLowerCase().split(" ")); |
| } |
| |
| /* |
| * test to extend the buffer TermAttribute buffer internally. If the internal |
| * alg that extends the size of the char array only extends by 1 char and the |
| * next char to be filled in is a supplementary codepoint (using 2 chars) an |
| * index out of bound exception is triggered. |
| */ |
| public void testExtendCharBuffer() throws IOException { |
| for (int i = 0; i < 40; i++) { |
| StringBuilder builder = new StringBuilder(); |
| for (int j = 0; j < 1+i; j++) { |
| builder.append("a"); |
| } |
| builder.append("\ud801\udc1cabc"); |
| LowerCaseTokenizer tokenizer = new LowerCaseTokenizer( |
| TEST_VERSION_CURRENT, new StringReader(builder.toString())); |
| assertTokenStreamContents(tokenizer, new String[] {builder.toString().toLowerCase()}); |
| } |
| } |
| |
| /* |
| * tests the max word length of 255 - tokenizer will split at the 255 char no matter what happens |
| */ |
| public void testMaxWordLength() throws IOException { |
| StringBuilder builder = new StringBuilder(); |
| |
| for (int i = 0; i < 255; i++) { |
| builder.append("A"); |
| } |
| LowerCaseTokenizer tokenizer = new LowerCaseTokenizer( |
| TEST_VERSION_CURRENT, new StringReader(builder.toString() + builder.toString())); |
| assertTokenStreamContents(tokenizer, new String[] {builder.toString().toLowerCase(), builder.toString().toLowerCase()}); |
| } |
| |
| /* |
| * tests the max word length of 255 with a surrogate pair at position 255 |
| */ |
| public void testMaxWordLengthWithSupplementary() throws IOException { |
| StringBuilder builder = new StringBuilder(); |
| |
| for (int i = 0; i < 254; i++) { |
| builder.append("A"); |
| } |
| builder.append("\ud801\udc1c"); |
| LowerCaseTokenizer tokenizer = new LowerCaseTokenizer( |
| TEST_VERSION_CURRENT, new StringReader(builder.toString() + builder.toString())); |
| assertTokenStreamContents(tokenizer, new String[] {builder.toString().toLowerCase(), builder.toString().toLowerCase()}); |
| } |
| |
| public void testLowerCaseTokenizer() throws IOException { |
| StringReader reader = new StringReader("Tokenizer \ud801\udc1ctest"); |
| LowerCaseTokenizer tokenizer = new LowerCaseTokenizer(TEST_VERSION_CURRENT, |
| reader); |
| assertTokenStreamContents(tokenizer, new String[] { "tokenizer", |
| "\ud801\udc44test" }); |
| } |
| |
| public void testLowerCaseTokenizerBWCompat() throws IOException { |
| StringReader reader = new StringReader("Tokenizer \ud801\udc1ctest"); |
| LowerCaseTokenizer tokenizer = new LowerCaseTokenizer(Version.LUCENE_30, |
| reader); |
| assertTokenStreamContents(tokenizer, new String[] { "tokenizer", "test" }); |
| } |
| |
| public void testWhitespaceTokenizer() throws IOException { |
| StringReader reader = new StringReader("Tokenizer \ud801\udc1ctest"); |
| WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, |
| reader); |
| assertTokenStreamContents(tokenizer, new String[] { "Tokenizer", |
| "\ud801\udc1ctest" }); |
| } |
| |
| public void testWhitespaceTokenizerBWCompat() throws IOException { |
| StringReader reader = new StringReader("Tokenizer \ud801\udc1ctest"); |
| WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_30, |
| reader); |
| assertTokenStreamContents(tokenizer, new String[] { "Tokenizer", |
| "\ud801\udc1ctest" }); |
| } |
| |
| public void testIsTokenCharCharInSubclass() { |
| new TestingCharTokenizer(Version.LUCENE_30, new StringReader("")); |
| try { |
| new TestingCharTokenizer(TEST_VERSION_CURRENT, new StringReader("")); |
| fail("version 3.1 is not permitted if char based method is implemented"); |
| } catch (IllegalArgumentException e) { |
| // expected |
| } |
| } |
| |
| public void testNormalizeCharInSubclass() { |
| new TestingCharTokenizerNormalize(Version.LUCENE_30, new StringReader("")); |
| try { |
| new TestingCharTokenizerNormalize(TEST_VERSION_CURRENT, |
| new StringReader("")); |
| fail("version 3.1 is not permitted if char based method is implemented"); |
| } catch (IllegalArgumentException e) { |
| // expected |
| } |
| } |
| |
| public void testNormalizeAndIsTokenCharCharInSubclass() { |
| new TestingCharTokenizerNormalizeIsTokenChar(Version.LUCENE_30, |
| new StringReader("")); |
| try { |
| new TestingCharTokenizerNormalizeIsTokenChar(TEST_VERSION_CURRENT, |
| new StringReader("")); |
| fail("version 3.1 is not permitted if char based method is implemented"); |
| } catch (IllegalArgumentException e) { |
| // expected |
| } |
| } |
| |
  /**
   * {@link CharTokenizer} subclass overriding both the int-based
   * {@code isTokenChar(int)} and the deprecated char-based
   * {@code isTokenChar(char)}. Used above to verify that the char-based
   * override is rejected for versions >= 3.1.
   */
  static final class TestingCharTokenizer extends CharTokenizer {
    public TestingCharTokenizer(Version matchVersion, Reader input) {
      super(matchVersion, input);
    }

    // current (codepoint-based) API
    @Override
    protected boolean isTokenChar(int c) {
      return Character.isLetter(c);
    }

    // deprecated pre-3.1 (char-based) API
    @Deprecated @Override
    protected boolean isTokenChar(char c) {
      return Character.isLetter(c);
    }
  }
| |
  /**
   * {@link CharTokenizer} subclass overriding both the int-based
   * {@code normalize(int)} and the deprecated char-based
   * {@code normalize(char)}. Used above to verify that the char-based
   * override is rejected for versions >= 3.1.
   */
  static final class TestingCharTokenizerNormalize extends CharTokenizer {
    public TestingCharTokenizerNormalize(Version matchVersion, Reader input) {
      super(matchVersion, input);
    }

    // deprecated pre-3.1 (char-based) API; identity normalization
    @Deprecated @Override
    protected char normalize(char c) {
      return c;
    }

    // current (codepoint-based) API; identity normalization
    @Override
    protected int normalize(int c) {
      return c;
    }
  }
| |
  /**
   * {@link CharTokenizer} subclass overriding both the int-based and the
   * deprecated char-based variants of {@code normalize} and
   * {@code isTokenChar}. Used above to verify that the char-based overrides
   * are rejected for versions >= 3.1.
   */
  static final class TestingCharTokenizerNormalizeIsTokenChar extends CharTokenizer {
    public TestingCharTokenizerNormalizeIsTokenChar(Version matchVersion,
        Reader input) {
      super(matchVersion, input);
    }

    // deprecated pre-3.1 (char-based) API; identity normalization
    @Deprecated @Override
    protected char normalize(char c) {
      return c;
    }

    // current (codepoint-based) API; identity normalization
    @Override
    protected int normalize(int c) {
      return c;
    }

    // current (codepoint-based) API
    @Override
    protected boolean isTokenChar(int c) {
      return Character.isLetter(c);
    }

    // deprecated pre-3.1 (char-based) API
    @Deprecated @Override
    protected boolean isTokenChar(char c) {
      return Character.isLetter(c);
    }
  }
| } |