/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
| package org.apache.lucene.analysis.pattern; |
| |
| import java.io.IOException; |
| import java.io.Reader; |
| import java.io.StringReader; |
| import java.util.Random; |
| import java.util.regex.Matcher; |
| import java.util.regex.Pattern; |
| |
| import org.apache.lucene.analysis.Analyzer; |
| import org.apache.lucene.analysis.BaseTokenStreamTestCase; |
| import org.apache.lucene.analysis.CharFilter; |
| import org.apache.lucene.analysis.MockTokenizer; |
| import org.apache.lucene.analysis.TokenStream; |
| import org.apache.lucene.analysis.Tokenizer; |
| import org.apache.lucene.util.TestUtil; |
| import org.junit.Ignore; |
| |
| /** |
| * Tests {@link PatternReplaceCharFilter} |
| */ |
| public class TestPatternReplaceCharFilter extends BaseTokenStreamTestCase { |
| public void testFailingDot() throws IOException { |
| checkOutput( |
| "A. .B.", "\\.[\\s]*", ".", |
| "A..B.", |
| "A..B."); |
| } |
| |
| public void testLongerReplacement() throws IOException { |
| checkOutput( |
| "XXabcZZabcYY", "abc", "abcde", |
| "XXabcdeZZabcdeYY", |
| "XXabcccZZabcccYY"); |
| checkOutput( |
| "XXabcabcYY", "abc", "abcde", |
| "XXabcdeabcdeYY", |
| "XXabcccabcccYY"); |
| checkOutput( |
| "abcabcYY", "abc", "abcde", |
| "abcdeabcdeYY", |
| "abcccabcccYY"); |
| checkOutput( |
| "YY", "^", "abcde", |
| "abcdeYY", |
| // Should be: "-----YY" but we're enforcing non-negative offsets. |
| "YYYYYYY"); |
| checkOutput( |
| "YY", "$", "abcde", |
| "YYabcde", |
| "YYYYYYY"); |
| checkOutput( |
| "XYZ", ".", "abc", |
| "abcabcabc", |
| "XXXYYYZZZ"); |
| checkOutput( |
| "XYZ", ".", "$0abc", |
| "XabcYabcZabc", |
| "XXXXYYYYZZZZ"); |
| } |
| |
| public void testShorterReplacement() throws IOException { |
| checkOutput( |
| "XXabcZZabcYY", "abc", "xy", |
| "XXxyZZxyYY", |
| "XXabZZabYY"); |
| checkOutput( |
| "XXabcabcYY", "abc", "xy", |
| "XXxyxyYY", |
| "XXababYY"); |
| checkOutput( |
| "abcabcYY", "abc", "xy", |
| "xyxyYY", |
| "ababYY"); |
| checkOutput( |
| "abcabcYY", "abc", "", |
| "YY", |
| "YY"); |
| checkOutput( |
| "YYabcabc", "abc", "", |
| "YY", |
| "YY"); |
| } |
| |
| private void checkOutput(String input, String pattern, String replacement, |
| String expectedOutput, String expectedIndexMatchedOutput) throws IOException { |
| CharFilter cs = new PatternReplaceCharFilter(pattern(pattern), replacement, |
| new StringReader(input)); |
| |
| StringBuilder output = new StringBuilder(); |
| for (int chr = cs.read(); chr > 0; chr = cs.read()) { |
| output.append((char) chr); |
| } |
| |
| StringBuilder indexMatched = new StringBuilder(); |
| for (int i = 0; i < output.length(); i++) { |
| indexMatched.append((cs.correctOffset(i) < 0 ? "-" : input.charAt(cs.correctOffset(i)))); |
| } |
| |
| boolean outputGood = expectedOutput.equals(output.toString()); |
| boolean indexMatchedGood = expectedIndexMatchedOutput.equals(indexMatched.toString()); |
| |
| if (!outputGood || !indexMatchedGood || false) { |
| System.out.println("Pattern : " + pattern); |
| System.out.println("Replac. : " + replacement); |
| System.out.println("Input : " + input); |
| System.out.println("Output : " + output); |
| System.out.println("Expected: " + expectedOutput); |
| System.out.println("Output/i: " + indexMatched); |
| System.out.println("Expected: " + expectedIndexMatchedOutput); |
| System.out.println(); |
| } |
| |
| assertTrue("Output doesn't match.", outputGood); |
| assertTrue("Index-matched output doesn't match.", indexMatchedGood); |
| } |
| |
| // 1111 |
| // 01234567890123 |
| // this is test. |
| public void testNothingChange() throws IOException { |
| final String BLOCK = "this is test."; |
| CharFilter cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)\\s+(cc)"), "$1$2$3", |
| new StringReader( BLOCK ) ); |
| TokenStream ts = whitespaceMockTokenizer(cs); |
| assertTokenStreamContents(ts, |
| new String[] { "this", "is", "test." }, |
| new int[] { 0, 5, 8 }, |
| new int[] { 4, 7, 13 }, |
| BLOCK.length()); |
| } |
| |
| // 012345678 |
| // aa bb cc |
| public void testReplaceByEmpty() throws IOException { |
| final String BLOCK = "aa bb cc"; |
| CharFilter cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)\\s+(cc)"), "", |
| new StringReader( BLOCK ) ); |
| TokenStream ts = whitespaceMockTokenizer(cs); |
| assertTokenStreamContents(ts, new String[] {}); |
| } |
| |
| // 012345678 |
| // aa bb cc |
| // aa#bb#cc |
| public void test1block1matchSameLength() throws IOException { |
| final String BLOCK = "aa bb cc"; |
| CharFilter cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)\\s+(cc)"), "$1#$2#$3", |
| new StringReader( BLOCK ) ); |
| TokenStream ts = whitespaceMockTokenizer(cs); |
| assertTokenStreamContents(ts, |
| new String[] { "aa#bb#cc" }, |
| new int[] { 0 }, |
| new int[] { 8 }, |
| BLOCK.length()); |
| } |
| |
| // 11111 |
| // 012345678901234 |
| // aa bb cc dd |
| // aa##bb###cc dd |
| public void test1block1matchLonger() throws IOException { |
| final String BLOCK = "aa bb cc dd"; |
| CharFilter cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)\\s+(cc)"), "$1##$2###$3", |
| new StringReader( BLOCK ) ); |
| TokenStream ts = whitespaceMockTokenizer(cs); |
| assertTokenStreamContents(ts, |
| new String[] { "aa##bb###cc", "dd" }, |
| new int[] { 0, 9 }, |
| new int[] { 8, 11 }, |
| BLOCK.length()); |
| } |
| |
| // 01234567 |
| // a a |
| // aa aa |
| public void test1block2matchLonger() throws IOException { |
| final String BLOCK = " a a"; |
| CharFilter cs = new PatternReplaceCharFilter( pattern("a"), "aa", |
| new StringReader( BLOCK ) ); |
| TokenStream ts = whitespaceMockTokenizer(cs); |
| assertTokenStreamContents(ts, |
| new String[] { "aa", "aa" }, |
| new int[] { 1, 4 }, |
| new int[] { 2, 5 }, |
| BLOCK.length()); |
| } |
| |
| // 11111 |
| // 012345678901234 |
| // aa bb cc dd |
| // aa#bb dd |
| public void test1block1matchShorter() throws IOException { |
| final String BLOCK = "aa bb cc dd"; |
| CharFilter cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)\\s+(cc)"), "$1#$2", |
| new StringReader( BLOCK ) ); |
| TokenStream ts = whitespaceMockTokenizer(cs); |
| assertTokenStreamContents(ts, |
| new String[] { "aa#bb", "dd" }, |
| new int[] { 0, 12 }, |
| new int[] { 11, 14 }, |
| BLOCK.length()); |
| } |
| |
| // 111111111122222222223333 |
| // 0123456789012345678901234567890123 |
| // aa bb cc --- aa bb aa bb cc |
| // aa bb cc --- aa bb aa bb cc |
| public void test1blockMultiMatches() throws IOException { |
| final String BLOCK = " aa bb cc --- aa bb aa bb cc"; |
| CharFilter cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)\\s+(cc)"), "$1 $2 $3", |
| new StringReader( BLOCK ) ); |
| TokenStream ts = whitespaceMockTokenizer(cs); |
| assertTokenStreamContents(ts, |
| new String[] { "aa", "bb", "cc", "---", "aa", "bb", "aa", "bb", "cc" }, |
| new int[] { 2, 6, 9, 11, 15, 18, 21, 25, 29 }, |
| new int[] { 4, 8, 10, 14, 17, 20, 23, 27, 33 }, |
| BLOCK.length()); |
| } |
| |
| // 11111111112222222222333333333 |
| // 012345678901234567890123456789012345678 |
| // aa bb cc --- aa bb aa. bb aa bb cc |
| // aa##bb cc --- aa##bb aa. bb aa##bb cc |
| |
| // aa bb cc --- aa bbbaa. bb aa b cc |
| |
| public void test2blocksMultiMatches() throws IOException { |
| final String BLOCK = " aa bb cc --- aa bb aa. bb aa bb cc"; |
| |
| CharFilter cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)"), "$1##$2", |
| new StringReader( BLOCK ) ); |
| TokenStream ts = whitespaceMockTokenizer(cs); |
| assertTokenStreamContents(ts, |
| new String[] { "aa##bb", "cc", "---", "aa##bb", "aa.", "bb", "aa##bb", "cc" }, |
| new int[] { 2, 8, 11, 15, 21, 25, 28, 36 }, |
| new int[] { 7, 10, 14, 20, 24, 27, 35, 38 }, |
| BLOCK.length()); |
| } |
| |
| // 11111111112222222222333333333 |
| // 012345678901234567890123456789012345678 |
| // a bb - ccc . --- bb a . ccc ccc bb |
| // aa b - c . --- b aa . c c b |
| public void testChain() throws IOException { |
| final String BLOCK = " a bb - ccc . --- bb a . ccc ccc bb"; |
| CharFilter cs = new PatternReplaceCharFilter( pattern("a"), "aa", |
| new StringReader( BLOCK ) ); |
| cs = new PatternReplaceCharFilter( pattern("bb"), "b", cs ); |
| cs = new PatternReplaceCharFilter( pattern("ccc"), "c", cs ); |
| TokenStream ts = whitespaceMockTokenizer(cs); |
| assertTokenStreamContents(ts, |
| new String[] { "aa", "b", "-", "c", ".", "---", "b", "aa", ".", "c", "c", "b" }, |
| new int[] { 1, 3, 6, 8, 12, 14, 18, 21, 23, 25, 29, 33 }, |
| new int[] { 2, 5, 7, 11, 13, 17, 20, 22, 24, 28, 32, 35 }, |
| BLOCK.length()); |
| } |
| |
| private Pattern pattern( String p ){ |
| return Pattern.compile( p ); |
| } |
| |
| /** |
| * A demonstration of how backtracking regular expressions can lead to relatively |
| * easy DoS attacks. |
| * |
| * @see <a href="http://swtch.com/~rsc/regexp/regexp1.html">"http://swtch.com/~rsc/regexp/regexp1.html"</a> |
| */ |
| @Ignore |
| public void testNastyPattern() throws Exception { |
| Pattern p = Pattern.compile("(c.+)*xy"); |
| String input = "[;<!--aecbbaa-->< febcfdc fbb = \"fbeeebff\" fc = dd >\\';<eefceceaa e= babae\" eacbaff =\"fcfaccacd\" = bcced>>>< bccaafe edb = ecfccdff\" <?</script>< edbd ebbcd=\"faacfcc\" aeca= bedbc ceeaac =adeafde aadccdaf = \"afcc ffda=aafbe �\"1843785582']"; |
| for (int i = 0; i < input.length(); i++) { |
| Matcher matcher = p.matcher(input.substring(0, i)); |
| long t = System.currentTimeMillis(); |
| if (matcher.find()) { |
| System.out.println(matcher.group()); |
| } |
| System.out.println(i + " > " + (System.currentTimeMillis() - t) / 1000.0); |
| } |
| } |
| |
| /** blast some random strings through the analyzer */ |
| public void testRandomStrings() throws Exception { |
| int numPatterns = 5; |
| Random random = new Random(random().nextLong()); |
| for (int i = 0; i < numPatterns; i++) { |
| final Pattern p = TestUtil.randomPattern(random()); |
| |
| final String replacement = TestUtil.randomSimpleString(random); |
| Analyzer a = new Analyzer() { |
| @Override |
| protected TokenStreamComponents createComponents(String fieldName) { |
| Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); |
| return new TokenStreamComponents(tokenizer, tokenizer); |
| } |
| |
| @Override |
| protected Reader initReader(String fieldName, Reader reader) { |
| return new PatternReplaceCharFilter(p, replacement, reader); |
| } |
| }; |
| |
| /* max input length. don't make it longer -- exponential processing |
| * time for certain patterns. */ |
| final int maxInputLength = 30; |
| /* ASCII only input?: */ |
| final boolean asciiOnly = true; |
| checkRandomData(random, a, 50 * RANDOM_MULTIPLIER, maxInputLength, asciiOnly); |
| a.close(); |
| } |
| } |
| } |