| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.analysis.core; |
| |
| import java.util.Random; |
| import org.apache.lucene.analysis.Analyzer; |
| import org.apache.lucene.analysis.BaseTokenStreamTestCase; |
| import org.apache.lucene.analysis.MockTokenizer; |
| import org.apache.lucene.analysis.Tokenizer; |
| import org.apache.lucene.search.DocIdSetIterator; |
| import org.apache.lucene.util.SparseFixedBitSet; |
| import org.apache.lucene.util.TestUtil; |
| import org.junit.AfterClass; |
| import org.junit.BeforeClass; |
| |
| /** Tests for {@link DecimalDigitFilter} */ |
| public class TestDecimalDigitFilter extends BaseTokenStreamTestCase { |
| private Analyzer tokenized; |
| private Analyzer keyword; |
| |
| private static SparseFixedBitSet DECIMAL_DIGIT_CODEPOINTS; |
| |
| @BeforeClass |
| public static void init_DECIMAL_DIGIT_CODEPOINTS() { |
| DECIMAL_DIGIT_CODEPOINTS = new SparseFixedBitSet(Character.MAX_CODE_POINT); |
| for (int codepoint = Character.MIN_CODE_POINT; |
| codepoint < Character.MAX_CODE_POINT; |
| codepoint++) { |
| if (Character.isDigit(codepoint)) { |
| DECIMAL_DIGIT_CODEPOINTS.set(codepoint); |
| } |
| } |
| assert 0 < DECIMAL_DIGIT_CODEPOINTS.cardinality(); |
| } |
| |
| @AfterClass |
| public static void destroy_DECIMAL_DIGIT_CODEPOINTS() { |
| DECIMAL_DIGIT_CODEPOINTS = null; |
| } |
| |
| @Override |
| public void setUp() throws Exception { |
| super.setUp(); |
| tokenized = |
| new Analyzer() { |
| @Override |
| protected TokenStreamComponents createComponents(String fieldName) { |
| Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); |
| return new TokenStreamComponents(tokenizer, new DecimalDigitFilter(tokenizer)); |
| } |
| }; |
| keyword = |
| new Analyzer() { |
| @Override |
| protected TokenStreamComponents createComponents(String fieldName) { |
| Tokenizer tokenizer = new KeywordTokenizer(); |
| return new TokenStreamComponents(tokenizer, new DecimalDigitFilter(tokenizer)); |
| } |
| }; |
| } |
| |
| @Override |
| public void tearDown() throws Exception { |
| tokenized.close(); |
| keyword.close(); |
| super.tearDown(); |
| } |
| |
| /** test that digits are normalized */ |
| public void testSimple() throws Exception { |
| checkOneTerm(tokenized, "١٢٣٤", "1234"); |
| } |
| |
| /** test that double struck digits are normalized */ |
| public void testDoubleStruck() throws Exception { |
| // MATHEMATICAL DOUBLE-STRUCK DIGIT ... 1, 9, 8, 4 |
| final String input = "𝟙 𝟡 𝟠 𝟜"; |
| final String expected = "1 9 8 4"; |
| checkOneTerm(keyword, input, expected); |
| checkOneTerm(keyword, input.replaceAll("\\s", ""), expected.replaceAll("\\s", "")); |
| } |
| |
| /** test sequences of digits mixed with other random simple string data */ |
| public void testRandomSequences() throws Exception { |
| |
| // test numIters random strings containing a sequence of numDigits codepoints |
| final int numIters = atLeast(5); |
| for (int iter = 0; iter < numIters; iter++) { |
| final int numDigits = atLeast(20); |
| final StringBuilder expected = new StringBuilder(); |
| final StringBuilder actual = new StringBuilder(); |
| for (int digitCounter = 0; digitCounter < numDigits; digitCounter++) { |
| |
| // increased odds of 0 length random string prefix |
| final String prefix = random().nextBoolean() ? "" : TestUtil.randomSimpleString(random()); |
| expected.append(prefix); |
| actual.append(prefix); |
| |
| int codepoint = getRandomDecimalDigit(random()); |
| |
| int value = Character.getNumericValue(codepoint); |
| assert value >= 0 && value <= 9; |
| expected.append(Integer.toString(value)); |
| actual.appendCodePoint(codepoint); |
| } |
| // occasional suffix, increased odds of 0 length random string |
| final String suffix = random().nextBoolean() ? "" : TestUtil.randomSimpleString(random()); |
| expected.append(suffix); |
| actual.append(suffix); |
| |
| checkOneTerm(keyword, actual.toString(), expected.toString()); |
| } |
| } |
| |
| /** test each individual digit in different locations of strings. */ |
| public void testRandom() throws Exception { |
| int numCodePointsChecked = 0; // sanity check |
| for (int codepoint = DECIMAL_DIGIT_CODEPOINTS.nextSetBit(0); |
| codepoint != DocIdSetIterator.NO_MORE_DOCS; |
| codepoint = DECIMAL_DIGIT_CODEPOINTS.nextSetBit(codepoint + 1)) { |
| |
| assert Character.isDigit(codepoint); |
| |
| // add some a-z before/after the string |
| String prefix = TestUtil.randomSimpleString(random()); |
| String suffix = TestUtil.randomSimpleString(random()); |
| |
| StringBuilder expected = new StringBuilder(); |
| expected.append(prefix); |
| int value = Character.getNumericValue(codepoint); |
| assert value >= 0 && value <= 9; |
| expected.append(Integer.toString(value)); |
| expected.append(suffix); |
| |
| StringBuilder actual = new StringBuilder(); |
| actual.append(prefix); |
| actual.appendCodePoint(codepoint); |
| actual.append(suffix); |
| |
| checkOneTerm(keyword, actual.toString(), expected.toString()); |
| |
| numCodePointsChecked++; |
| } |
| assert DECIMAL_DIGIT_CODEPOINTS.cardinality() == numCodePointsChecked; |
| } |
| |
| /** check the filter is a no-op for the empty string term */ |
| public void testEmptyTerm() throws Exception { |
| checkOneTerm(keyword, "", ""); |
| } |
| |
| /** blast some random strings through the filter */ |
| public void testRandomStrings() throws Exception { |
| checkRandomData(random(), tokenized, 200 * RANDOM_MULTIPLIER); |
| } |
| |
| /** returns a psuedo-random codepoint which is a Decimal Digit */ |
| public static int getRandomDecimalDigit(Random r) { |
| final int aprox = TestUtil.nextInt(r, 0, DECIMAL_DIGIT_CODEPOINTS.length() - 1); |
| |
| if (DECIMAL_DIGIT_CODEPOINTS.get(aprox)) { // lucky guess |
| assert Character.isDigit(aprox); |
| return aprox; |
| } |
| |
| // seek up and down for closest set bit |
| final int lower = DECIMAL_DIGIT_CODEPOINTS.prevSetBit(aprox); |
| final int higher = DECIMAL_DIGIT_CODEPOINTS.nextSetBit(aprox); |
| |
| // sanity check edge cases |
| if (lower < 0) { |
| assert higher != DocIdSetIterator.NO_MORE_DOCS; |
| assert Character.isDigit(higher); |
| return higher; |
| } |
| if (higher == DocIdSetIterator.NO_MORE_DOCS) { |
| assert 0 <= lower; |
| assert Character.isDigit(lower); |
| return lower; |
| } |
| |
| // which is closer? |
| final int cmp = Integer.compare(aprox - lower, higher - aprox); |
| |
| if (0 == cmp) { |
| // dead even, flip a coin |
| final int result = random().nextBoolean() ? lower : higher; |
| assert Character.isDigit(result); |
| return result; |
| } |
| |
| final int result = (cmp < 0) ? lower : higher; |
| assert Character.isDigit(result); |
| return result; |
| } |
| } |