| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.analysis.pattern; |
| |
| import java.io.IOException; |
| import java.io.Reader; |
| import java.io.StringReader; |
| import java.util.Random; |
| import java.util.regex.Matcher; |
| import java.util.regex.Pattern; |
| import org.apache.lucene.analysis.Analyzer; |
| import org.apache.lucene.analysis.BaseTokenStreamTestCase; |
| import org.apache.lucene.analysis.CharFilter; |
| import org.apache.lucene.analysis.MockTokenizer; |
| import org.apache.lucene.analysis.TokenStream; |
| import org.apache.lucene.analysis.Tokenizer; |
| import org.apache.lucene.util.TestUtil; |
| import org.junit.Ignore; |
| |
| /** Tests {@link PatternReplaceCharFilter} */ |
| public class TestPatternReplaceCharFilter extends BaseTokenStreamTestCase { |
| public void testFailingDot() throws IOException { |
| checkOutput("A. .B.", "\\.[\\s]*", ".", "A..B.", "A..B."); |
| } |
| |
| public void testLongerReplacement() throws IOException { |
| checkOutput("XXabcZZabcYY", "abc", "abcde", "XXabcdeZZabcdeYY", "XXabcccZZabcccYY"); |
| checkOutput("XXabcabcYY", "abc", "abcde", "XXabcdeabcdeYY", "XXabcccabcccYY"); |
| checkOutput("abcabcYY", "abc", "abcde", "abcdeabcdeYY", "abcccabcccYY"); |
| checkOutput( |
| "YY", |
| "^", |
| "abcde", |
| "abcdeYY", |
| // Should be: "-----YY" but we're enforcing non-negative offsets. |
| "YYYYYYY"); |
| checkOutput("YY", "$", "abcde", "YYabcde", "YYYYYYY"); |
| checkOutput("XYZ", ".", "abc", "abcabcabc", "XXXYYYZZZ"); |
| checkOutput("XYZ", ".", "$0abc", "XabcYabcZabc", "XXXXYYYYZZZZ"); |
| } |
| |
| public void testShorterReplacement() throws IOException { |
| checkOutput("XXabcZZabcYY", "abc", "xy", "XXxyZZxyYY", "XXabZZabYY"); |
| checkOutput("XXabcabcYY", "abc", "xy", "XXxyxyYY", "XXababYY"); |
| checkOutput("abcabcYY", "abc", "xy", "xyxyYY", "ababYY"); |
| checkOutput("abcabcYY", "abc", "", "YY", "YY"); |
| checkOutput("YYabcabc", "abc", "", "YY", "YY"); |
| } |
| |
| private void checkOutput( |
| String input, |
| String pattern, |
| String replacement, |
| String expectedOutput, |
| String expectedIndexMatchedOutput) |
| throws IOException { |
| CharFilter cs = |
| new PatternReplaceCharFilter(pattern(pattern), replacement, new StringReader(input)); |
| |
| StringBuilder output = new StringBuilder(); |
| for (int chr = cs.read(); chr > 0; chr = cs.read()) { |
| output.append((char) chr); |
| } |
| |
| StringBuilder indexMatched = new StringBuilder(); |
| for (int i = 0; i < output.length(); i++) { |
| indexMatched.append((cs.correctOffset(i) < 0 ? "-" : input.charAt(cs.correctOffset(i)))); |
| } |
| |
| boolean outputGood = expectedOutput.equals(output.toString()); |
| boolean indexMatchedGood = expectedIndexMatchedOutput.equals(indexMatched.toString()); |
| |
| if (!outputGood || !indexMatchedGood || false) { |
| System.out.println("Pattern : " + pattern); |
| System.out.println("Replac. : " + replacement); |
| System.out.println("Input : " + input); |
| System.out.println("Output : " + output); |
| System.out.println("Expected: " + expectedOutput); |
| System.out.println("Output/i: " + indexMatched); |
| System.out.println("Expected: " + expectedIndexMatchedOutput); |
| System.out.println(); |
| } |
| |
| assertTrue("Output doesn't match.", outputGood); |
| assertTrue("Index-matched output doesn't match.", indexMatchedGood); |
| } |
| |
| // 1111 |
| // 01234567890123 |
| // this is test. |
| public void testNothingChange() throws IOException { |
| final String BLOCK = "this is test."; |
| CharFilter cs = |
| new PatternReplaceCharFilter( |
| pattern("(aa)\\s+(bb)\\s+(cc)"), "$1$2$3", new StringReader(BLOCK)); |
| TokenStream ts = whitespaceMockTokenizer(cs); |
| assertTokenStreamContents( |
| ts, |
| new String[] {"this", "is", "test."}, |
| new int[] {0, 5, 8}, |
| new int[] {4, 7, 13}, |
| BLOCK.length()); |
| } |
| |
| // 012345678 |
| // aa bb cc |
| public void testReplaceByEmpty() throws IOException { |
| final String BLOCK = "aa bb cc"; |
| CharFilter cs = |
| new PatternReplaceCharFilter(pattern("(aa)\\s+(bb)\\s+(cc)"), "", new StringReader(BLOCK)); |
| TokenStream ts = whitespaceMockTokenizer(cs); |
| assertTokenStreamContents(ts, new String[] {}); |
| } |
| |
| // 012345678 |
| // aa bb cc |
| // aa#bb#cc |
| public void test1block1matchSameLength() throws IOException { |
| final String BLOCK = "aa bb cc"; |
| CharFilter cs = |
| new PatternReplaceCharFilter( |
| pattern("(aa)\\s+(bb)\\s+(cc)"), "$1#$2#$3", new StringReader(BLOCK)); |
| TokenStream ts = whitespaceMockTokenizer(cs); |
| assertTokenStreamContents( |
| ts, new String[] {"aa#bb#cc"}, new int[] {0}, new int[] {8}, BLOCK.length()); |
| } |
| |
| // 11111 |
| // 012345678901234 |
| // aa bb cc dd |
| // aa##bb###cc dd |
| public void test1block1matchLonger() throws IOException { |
| final String BLOCK = "aa bb cc dd"; |
| CharFilter cs = |
| new PatternReplaceCharFilter( |
| pattern("(aa)\\s+(bb)\\s+(cc)"), "$1##$2###$3", new StringReader(BLOCK)); |
| TokenStream ts = whitespaceMockTokenizer(cs); |
| assertTokenStreamContents( |
| ts, |
| new String[] {"aa##bb###cc", "dd"}, |
| new int[] {0, 9}, |
| new int[] {8, 11}, |
| BLOCK.length()); |
| } |
| |
| // 01234567 |
| // a a |
| // aa aa |
| public void test1block2matchLonger() throws IOException { |
| final String BLOCK = " a a"; |
| CharFilter cs = new PatternReplaceCharFilter(pattern("a"), "aa", new StringReader(BLOCK)); |
| TokenStream ts = whitespaceMockTokenizer(cs); |
| assertTokenStreamContents( |
| ts, new String[] {"aa", "aa"}, new int[] {1, 4}, new int[] {2, 5}, BLOCK.length()); |
| } |
| |
| // 11111 |
| // 012345678901234 |
| // aa bb cc dd |
| // aa#bb dd |
| public void test1block1matchShorter() throws IOException { |
| final String BLOCK = "aa bb cc dd"; |
| CharFilter cs = |
| new PatternReplaceCharFilter( |
| pattern("(aa)\\s+(bb)\\s+(cc)"), "$1#$2", new StringReader(BLOCK)); |
| TokenStream ts = whitespaceMockTokenizer(cs); |
| assertTokenStreamContents( |
| ts, new String[] {"aa#bb", "dd"}, new int[] {0, 12}, new int[] {11, 14}, BLOCK.length()); |
| } |
| |
| // 111111111122222222223333 |
| // 0123456789012345678901234567890123 |
| // aa bb cc --- aa bb aa bb cc |
| // aa bb cc --- aa bb aa bb cc |
| public void test1blockMultiMatches() throws IOException { |
| final String BLOCK = " aa bb cc --- aa bb aa bb cc"; |
| CharFilter cs = |
| new PatternReplaceCharFilter( |
| pattern("(aa)\\s+(bb)\\s+(cc)"), "$1 $2 $3", new StringReader(BLOCK)); |
| TokenStream ts = whitespaceMockTokenizer(cs); |
| assertTokenStreamContents( |
| ts, |
| new String[] {"aa", "bb", "cc", "---", "aa", "bb", "aa", "bb", "cc"}, |
| new int[] {2, 6, 9, 11, 15, 18, 21, 25, 29}, |
| new int[] {4, 8, 10, 14, 17, 20, 23, 27, 33}, |
| BLOCK.length()); |
| } |
| |
| // 11111111112222222222333333333 |
| // 012345678901234567890123456789012345678 |
| // aa bb cc --- aa bb aa. bb aa bb cc |
| // aa##bb cc --- aa##bb aa. bb aa##bb cc |
| |
| // aa bb cc --- aa bbbaa. bb aa b cc |
| |
| public void test2blocksMultiMatches() throws IOException { |
| final String BLOCK = " aa bb cc --- aa bb aa. bb aa bb cc"; |
| |
| CharFilter cs = |
| new PatternReplaceCharFilter(pattern("(aa)\\s+(bb)"), "$1##$2", new StringReader(BLOCK)); |
| TokenStream ts = whitespaceMockTokenizer(cs); |
| assertTokenStreamContents( |
| ts, |
| new String[] {"aa##bb", "cc", "---", "aa##bb", "aa.", "bb", "aa##bb", "cc"}, |
| new int[] {2, 8, 11, 15, 21, 25, 28, 36}, |
| new int[] {7, 10, 14, 20, 24, 27, 35, 38}, |
| BLOCK.length()); |
| } |
| |
| // 11111111112222222222333333333 |
| // 012345678901234567890123456789012345678 |
| // a bb - ccc . --- bb a . ccc ccc bb |
| // aa b - c . --- b aa . c c b |
| public void testChain() throws IOException { |
| final String BLOCK = " a bb - ccc . --- bb a . ccc ccc bb"; |
| CharFilter cs = new PatternReplaceCharFilter(pattern("a"), "aa", new StringReader(BLOCK)); |
| cs = new PatternReplaceCharFilter(pattern("bb"), "b", cs); |
| cs = new PatternReplaceCharFilter(pattern("ccc"), "c", cs); |
| TokenStream ts = whitespaceMockTokenizer(cs); |
| assertTokenStreamContents( |
| ts, |
| new String[] {"aa", "b", "-", "c", ".", "---", "b", "aa", ".", "c", "c", "b"}, |
| new int[] {1, 3, 6, 8, 12, 14, 18, 21, 23, 25, 29, 33}, |
| new int[] {2, 5, 7, 11, 13, 17, 20, 22, 24, 28, 32, 35}, |
| BLOCK.length()); |
| } |
| |
| private Pattern pattern(String p) { |
| return Pattern.compile(p); |
| } |
| |
| /** |
| * A demonstration of how backtracking regular expressions can lead to relatively easy DoS |
| * attacks. |
| * |
| * @see <a |
| * href="http://swtch.com/~rsc/regexp/regexp1.html">"http://swtch.com/~rsc/regexp/regexp1.html"</a> |
| */ |
| @Ignore |
| public void testNastyPattern() throws Exception { |
| Pattern p = Pattern.compile("(c.+)*xy"); |
| String input = |
| "[;<!--aecbbaa-->< febcfdc fbb = \"fbeeebff\" fc = dd >\\';<eefceceaa e= babae\" eacbaff =\"fcfaccacd\" = bcced>>>< bccaafe edb = ecfccdff\" <?</script>< edbd ebbcd=\"faacfcc\" aeca= bedbc ceeaac =adeafde aadccdaf = \"afcc ffda=aafbe �\"1843785582']"; |
| for (int i = 0; i < input.length(); i++) { |
| Matcher matcher = p.matcher(input.substring(0, i)); |
| long t = System.currentTimeMillis(); |
| if (matcher.find()) { |
| System.out.println(matcher.group()); |
| } |
| System.out.println(i + " > " + (System.currentTimeMillis() - t) / 1000.0); |
| } |
| } |
| |
| /** blast some random strings through the analyzer */ |
| public void testRandomStrings() throws Exception { |
| int numPatterns = 5; |
| Random random = new Random(random().nextLong()); |
| for (int i = 0; i < numPatterns; i++) { |
| final Pattern p = TestUtil.randomPattern(random()); |
| |
| final String replacement = TestUtil.randomSimpleString(random); |
| Analyzer a = |
| new Analyzer() { |
| @Override |
| protected TokenStreamComponents createComponents(String fieldName) { |
| Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); |
| return new TokenStreamComponents(tokenizer, tokenizer); |
| } |
| |
| @Override |
| protected Reader initReader(String fieldName, Reader reader) { |
| return new PatternReplaceCharFilter(p, replacement, reader); |
| } |
| }; |
| |
| /* max input length. don't make it longer -- exponential processing |
| * time for certain patterns. */ |
| final int maxInputLength = 30; |
| /* ASCII only input?: */ |
| final boolean asciiOnly = true; |
| checkRandomData(random, a, 50 * RANDOM_MULTIPLIER, maxInputLength, asciiOnly); |
| a.close(); |
| } |
| } |
| } |