| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.analysis.pattern; |
| |
| |
| import java.io.StringReader; |
| import java.util.regex.Pattern; |
| |
| import org.apache.lucene.analysis.Analyzer; |
| import org.apache.lucene.analysis.BaseTokenStreamTestCase; |
| import org.apache.lucene.analysis.MockTokenizer; |
| import org.apache.lucene.analysis.TokenStream; |
| import org.apache.lucene.analysis.Tokenizer; |
| |
| public class TestPatternCaptureGroupTokenFilter extends BaseTokenStreamTestCase { |
| |
| public void testNoPattern() throws Exception { |
| testPatterns( |
| "foobarbaz", |
| new String[] {}, |
| new String[] {"foobarbaz"}, |
| new int[] {0}, |
| new int[] {9}, |
| new int[] {1}, |
| false |
| ); |
| testPatterns( |
| "foobarbaz", |
| new String[] {}, |
| new String[] {"foobarbaz"}, |
| new int[] {0}, |
| new int[] {9}, |
| new int[] {1}, |
| true |
| ); |
| |
| testPatterns( |
| "foo bar baz", |
| new String[] {}, |
| new String[] {"foo","bar","baz"}, |
| new int[] {0,4,8}, |
| new int[] {3,7,11}, |
| new int[] {1,1,1}, |
| false |
| ); |
| |
| testPatterns( |
| "foo bar baz", |
| new String[] {}, |
| new String[] {"foo","bar","baz"}, |
| new int[] {0,4,8}, |
| new int[] {3,7,11}, |
| new int[] {1,1,1}, |
| true |
| ); |
| } |
| |
| public void testNoMatch() throws Exception { |
| testPatterns( |
| "foobarbaz", |
| new String[] {"xx"}, |
| new String[] {"foobarbaz"}, |
| new int[] {0}, |
| new int[] {9}, |
| new int[] {1}, |
| false |
| ); |
| testPatterns( |
| "foobarbaz", |
| new String[] {"xx"}, |
| new String[] {"foobarbaz"}, |
| new int[] {0}, |
| new int[] {9}, |
| new int[] {1}, |
| true |
| ); |
| |
| testPatterns( |
| "foo bar baz", |
| new String[] {"xx"}, |
| new String[] {"foo","bar","baz"}, |
| new int[] {0,4,8}, |
| new int[] {3,7,11}, |
| new int[] {1,1,1}, |
| false |
| ); |
| |
| testPatterns( |
| "foo bar baz", |
| new String[] {"xx"}, |
| new String[] {"foo","bar","baz"}, |
| new int[] {0,4,8}, |
| new int[] {3,7,11}, |
| new int[] {1,1,1}, |
| true |
| ); |
| } |
| |
| public void testNoCapture() throws Exception { |
| testPatterns( |
| "foobarbaz", |
| new String[] {".."}, |
| new String[] {"foobarbaz"}, |
| new int[] {0}, |
| new int[] {9}, |
| new int[] {1}, |
| false |
| ); |
| testPatterns( |
| "foobarbaz", |
| new String[] {".."}, |
| new String[] {"foobarbaz"}, |
| new int[] {0}, |
| new int[] {9}, |
| new int[] {1}, |
| true |
| ); |
| |
| testPatterns( |
| "foo bar baz", |
| new String[] {".."}, |
| new String[] {"foo","bar","baz"}, |
| new int[] {0,4,8}, |
| new int[] {3,7,11}, |
| new int[] {1,1,1}, |
| false |
| ); |
| |
| testPatterns( |
| "foo bar baz", |
| new String[] {".."}, |
| new String[] {"foo","bar","baz"}, |
| new int[] {0,4,8}, |
| new int[] {3,7,11}, |
| new int[] {1,1,1}, |
| true |
| ); |
| } |
| |
| public void testEmptyCapture() throws Exception { |
| testPatterns( |
| "foobarbaz", |
| new String[] {".(y*)"}, |
| new String[] {"foobarbaz"}, |
| new int[] {0}, |
| new int[] {9}, |
| new int[] {1}, |
| false |
| ); |
| testPatterns( |
| "foobarbaz", |
| new String[] {".(y*)"}, |
| new String[] {"foobarbaz"}, |
| new int[] {0}, |
| new int[] {9}, |
| new int[] {1}, |
| true |
| ); |
| |
| testPatterns( |
| "foo bar baz", |
| new String[] {".(y*)"}, |
| new String[] {"foo","bar","baz"}, |
| new int[] {0,4,8}, |
| new int[] {3,7,11}, |
| new int[] {1,1,1}, |
| false |
| ); |
| |
| testPatterns( |
| "foo bar baz", |
| new String[] {".(y*)"}, |
| new String[] {"foo","bar","baz"}, |
| new int[] {0,4,8}, |
| new int[] {3,7,11}, |
| new int[] {1,1,1}, |
| true |
| ); |
| } |
| |
| public void testCaptureAll() throws Exception { |
| testPatterns( |
| "foobarbaz", |
| new String[] {"(.+)"}, |
| new String[] {"foobarbaz"}, |
| new int[] {0}, |
| new int[] {9}, |
| new int[] {1}, |
| false |
| ); |
| testPatterns( |
| "foobarbaz", |
| new String[] {"(.+)"}, |
| new String[] {"foobarbaz"}, |
| new int[] {0}, |
| new int[] {9}, |
| new int[] {1}, |
| true |
| ); |
| |
| testPatterns( |
| "foo bar baz", |
| new String[] {"(.+)"}, |
| new String[] {"foo","bar","baz"}, |
| new int[] {0,4,8}, |
| new int[] {3,7,11}, |
| new int[] {1,1,1}, |
| false |
| ); |
| |
| testPatterns( |
| "foo bar baz", |
| new String[] {"(.+)"}, |
| new String[] {"foo","bar","baz"}, |
| new int[] {0,4,8}, |
| new int[] {3,7,11}, |
| new int[] {1,1,1}, |
| true |
| ); |
| } |
| |
| public void testCaptureStart() throws Exception { |
| testPatterns( |
| "foobarbaz", |
| new String[] {"^(.)"}, |
| new String[] {"f"}, |
| new int[] {0}, |
| new int[] {9}, |
| new int[] {1}, |
| false |
| ); |
| testPatterns( |
| "foobarbaz", |
| new String[] {"^(.)"}, |
| new String[] {"foobarbaz","f"}, |
| new int[] {0,0}, |
| new int[] {9,9}, |
| new int[] {1,0}, |
| true |
| ); |
| |
| testPatterns( |
| "foo bar baz", |
| new String[] {"^(.)"}, |
| new String[] {"f","b","b"}, |
| new int[] {0,4,8}, |
| new int[] {3,7,11}, |
| new int[] {1,1,1}, |
| false |
| ); |
| |
| testPatterns( |
| "foo bar baz", |
| new String[] {"^(.)"}, |
| new String[] {"foo","f","bar","b","baz","b"}, |
| new int[] {0,0,4,4,8,8}, |
| new int[] {3,3,7,7,11,11}, |
| new int[] {1,0,1,0,1,0}, |
| true |
| ); |
| } |
| |
| public void testCaptureMiddle() throws Exception { |
| testPatterns( |
| "foobarbaz", |
| new String[] {"^.(.)."}, |
| new String[] {"o"}, |
| new int[] {0}, |
| new int[] {9}, |
| new int[] {1}, |
| false |
| ); |
| testPatterns( |
| "foobarbaz", |
| new String[] {"^.(.)."}, |
| new String[] {"foobarbaz","o"}, |
| new int[] {0,0}, |
| new int[] {9,9}, |
| new int[] {1,0}, |
| true |
| ); |
| |
| testPatterns( |
| "foo bar baz", |
| new String[] {"^.(.)."}, |
| new String[] {"o","a","a"}, |
| new int[] {0,4,8}, |
| new int[] {3,7,11}, |
| new int[] {1,1,1}, |
| false |
| ); |
| |
| testPatterns( |
| "foo bar baz", |
| new String[] {"^.(.)."}, |
| new String[] {"foo","o","bar","a","baz","a"}, |
| new int[] {0,0,4,4,8,8}, |
| new int[] {3,3,7,7,11,11}, |
| new int[] {1,0,1,0,1,0}, |
| true |
| ); |
| } |
| |
| public void testCaptureEnd() throws Exception { |
| testPatterns( |
| "foobarbaz", |
| new String[] {"(.)$"}, |
| new String[] {"z"}, |
| new int[] {0}, |
| new int[] {9}, |
| new int[] {1}, |
| false |
| ); |
| testPatterns( |
| "foobarbaz", |
| new String[] {"(.)$"}, |
| new String[] {"foobarbaz","z"}, |
| new int[] {0,0}, |
| new int[] {9,9}, |
| new int[] {1,0}, |
| true |
| ); |
| |
| testPatterns( |
| "foo bar baz", |
| new String[] {"(.)$"}, |
| new String[] {"o","r","z"}, |
| new int[] {0,4,8}, |
| new int[] {3,7,11}, |
| new int[] {1,1,1}, |
| false |
| ); |
| |
| testPatterns( |
| "foo bar baz", |
| new String[] {"(.)$"}, |
| new String[] {"foo","o","bar","r","baz","z"}, |
| new int[] {0,0,4,4,8,8}, |
| new int[] {3,3,7,7,11,11}, |
| new int[] {1,0,1,0,1,0}, |
| true |
| ); |
| } |
| |
| public void testCaptureStartMiddle() throws Exception { |
| testPatterns( |
| "foobarbaz", |
| new String[] {"^(.)(.)"}, |
| new String[] {"f","o"}, |
| new int[] {0,0}, |
| new int[] {9,9}, |
| new int[] {1,0}, |
| false |
| ); |
| testPatterns( |
| "foobarbaz", |
| new String[] {"^(.)(.)"}, |
| new String[] {"foobarbaz","f","o"}, |
| new int[] {0,0,0}, |
| new int[] {9,9,9}, |
| new int[] {1,0,0}, |
| true |
| ); |
| |
| testPatterns( |
| "foo bar baz", |
| new String[] {"^(.)(.)"}, |
| new String[] {"f","o","b","a","b","a"}, |
| new int[] {0,0,4,4,8,8}, |
| new int[] {3,3,7,7,11,11}, |
| new int[] {1,0,1,0,1,0}, |
| false |
| ); |
| |
| testPatterns( |
| "foo bar baz", |
| new String[] {"^(.)(.)"}, |
| new String[] {"foo","f","o","bar","b","a","baz","b","a"}, |
| new int[] {0,0,0,4,4,4,8,8,8}, |
| new int[] {3,3,3,7,7,7,11,11,11}, |
| new int[] {1,0,0,1,0,0,1,0,0}, |
| true |
| ); |
| } |
| |
| public void testCaptureStartEnd() throws Exception { |
| testPatterns( |
| "foobarbaz", |
| new String[] {"^(.).+(.)$"}, |
| new String[] {"f","z"}, |
| new int[] {0,0}, |
| new int[] {9,9}, |
| new int[] {1,0}, |
| false |
| ); |
| testPatterns( |
| "foobarbaz", |
| new String[] {"^(.).+(.)$"}, |
| new String[] {"foobarbaz","f","z"}, |
| new int[] {0,0,0}, |
| new int[] {9,9,9}, |
| new int[] {1,0,0}, |
| true |
| ); |
| |
| testPatterns( |
| "foo bar baz", |
| new String[] {"^(.).+(.)$"}, |
| new String[] {"f","o","b","r","b","z"}, |
| new int[] {0,0,4,4,8,8}, |
| new int[] {3,3,7,7,11,11}, |
| new int[] {1,0,1,0,1,0}, |
| false |
| ); |
| |
| testPatterns( |
| "foo bar baz", |
| new String[] {"^(.).+(.)$"}, |
| new String[] {"foo","f","o","bar","b","r","baz","b","z"}, |
| new int[] {0,0,0,4,4,4,8,8,8}, |
| new int[] {3,3,3,7,7,7,11,11,11}, |
| new int[] {1,0,0,1,0,0,1,0,0}, |
| true |
| ); |
| } |
| |
| public void testCaptureMiddleEnd() throws Exception { |
| testPatterns( |
| "foobarbaz", |
| new String[] {"(.)(.)$"}, |
| new String[] {"a","z"}, |
| new int[] {0,0}, |
| new int[] {9,9}, |
| new int[] {1,0}, |
| false |
| ); |
| testPatterns( |
| "foobarbaz", |
| new String[] {"(.)(.)$"}, |
| new String[] {"foobarbaz","a","z"}, |
| new int[] {0,0,0}, |
| new int[] {9,9,9}, |
| new int[] {1,0,0}, |
| true |
| ); |
| |
| testPatterns( |
| "foo bar baz", |
| new String[] {"(.)(.)$"}, |
| new String[] {"o","o","a","r","a","z"}, |
| new int[] {0,0,4,4,8,8}, |
| new int[] {3,3,7,7,11,11}, |
| new int[] {1,0,1,0,1,0}, |
| false |
| ); |
| |
| testPatterns( |
| "foo bar baz", |
| new String[] {"(.)(.)$"}, |
| new String[] {"foo","o","o","bar","a","r","baz","a","z"}, |
| new int[] {0,0,0,4,4,4,8,8,8}, |
| new int[] {3,3,3,7,7,7,11,11,11}, |
| new int[] {1,0,0,1,0,0,1,0,0}, |
| true |
| ); |
| } |
| |
| public void testMultiCaptureOverlap() throws Exception { |
| testPatterns( |
| "foobarbaz", |
| new String[] {"(.(.(.)))"}, |
| new String[] {"foo","oo","o","bar","ar","r","baz","az","z"}, |
| new int[] {0,0,0,0,0,0,0,0,0}, |
| new int[] {9,9,9,9,9,9,9,9,9}, |
| new int[] {1,0,0,0,0,0,0,0,0}, |
| false |
| ); |
| testPatterns( |
| "foobarbaz", |
| new String[] {"(.(.(.)))"}, |
| new String[] {"foobarbaz","foo","oo","o","bar","ar","r","baz","az","z"}, |
| new int[] {0,0,0,0,0,0,0,0,0,0}, |
| new int[] {9,9,9,9,9,9,9,9,9,9}, |
| new int[] {1,0,0,0,0,0,0,0,0,0}, |
| true |
| ); |
| |
| testPatterns( |
| "foo bar baz", |
| new String[] {"(.(.(.)))"}, |
| new String[] {"foo","oo","o","bar","ar","r","baz","az","z"}, |
| new int[] {0,0,0,4,4,4,8,8,8}, |
| new int[] {3,3,3,7,7,7,11,11,11}, |
| new int[] {1,0,0,1,0,0,1,0,0}, |
| false |
| ); |
| |
| testPatterns( |
| "foo bar baz", |
| new String[] {"(.(.(.)))"}, |
| new String[] {"foo","oo","o","bar","ar","r","baz","az","z"}, |
| new int[] {0,0,0,4,4,4,8,8,8}, |
| new int[] {3,3,3,7,7,7,11,11,11}, |
| new int[] {1,0,0,1,0,0,1,0,0}, |
| true |
| ); |
| } |
| |
| public void testMultiPattern() throws Exception { |
| testPatterns( |
| "aaabbbaaa", |
| new String[] {"(aaa)","(bbb)","(ccc)"}, |
| new String[] {"aaa","bbb","aaa"}, |
| new int[] {0,0,0}, |
| new int[] {9,9,9}, |
| new int[] {1,0,0}, |
| false |
| ); |
| testPatterns( |
| "aaabbbaaa", |
| new String[] {"(aaa)","(bbb)","(ccc)"}, |
| new String[] {"aaabbbaaa","aaa","bbb","aaa"}, |
| new int[] {0,0,0,0}, |
| new int[] {9,9,9,9}, |
| new int[] {1,0,0,0}, |
| true |
| ); |
| |
| testPatterns( |
| "aaa bbb aaa", |
| new String[] {"(aaa)","(bbb)","(ccc)"}, |
| new String[] {"aaa","bbb","aaa"}, |
| new int[] {0,4,8}, |
| new int[] {3,7,11}, |
| new int[] {1,1,1}, |
| false |
| ); |
| |
| testPatterns( |
| "aaa bbb aaa", |
| new String[] {"(aaa)","(bbb)","(ccc)"}, |
| new String[] {"aaa","bbb","aaa"}, |
| new int[] {0,4,8}, |
| new int[] {3,7,11}, |
| new int[] {1,1,1}, |
| true |
| ); |
| } |
| |
| |
| public void testCamelCase() throws Exception { |
| testPatterns( |
| "letsPartyLIKEits1999_dude", |
| new String[] { |
| "([A-Z]{2,})", |
| "(?<![A-Z])([A-Z][a-z]+)", |
| "(?:^|\\b|(?<=[0-9_])|(?<=[A-Z]{2}))([a-z]+)", |
| "([0-9]+)" |
| }, |
| new String[] {"lets","Party","LIKE","its","1999","dude"}, |
| new int[] {0,0,0,0,0,0}, |
| new int[] {25,25,25,25,25,25}, |
| new int[] {1,0,0,0,0,0,0}, |
| false |
| ); |
| testPatterns( |
| "letsPartyLIKEits1999_dude", |
| new String[] { |
| "([A-Z]{2,})", |
| "(?<![A-Z])([A-Z][a-z]+)", |
| "(?:^|\\b|(?<=[0-9_])|(?<=[A-Z]{2}))([a-z]+)", |
| "([0-9]+)" |
| }, |
| new String[] {"letsPartyLIKEits1999_dude","lets","Party","LIKE","its","1999","dude"}, |
| new int[] {0,0,0,0,0,0,0}, |
| new int[] {25,25,25,25,25,25,25}, |
| new int[] {1,0,0,0,0,0,0,0}, |
| true |
| ); |
| } |
| |
| public void testRandomString() throws Exception { |
| Analyzer a = new Analyzer() { |
| |
| @Override |
| protected TokenStreamComponents createComponents(String fieldName) { |
| Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); |
| return new TokenStreamComponents(tokenizer, |
| new PatternCaptureGroupTokenFilter(tokenizer, false, |
| Pattern.compile("((..)(..))"))); |
| } |
| }; |
| |
| checkRandomData(random(), a, 1000 * RANDOM_MULTIPLIER); |
| a.close(); |
| } |
| |
| private void testPatterns(String input, String[] regexes, String[] tokens, |
| int[] startOffsets, int[] endOffsets, int[] positions, |
| boolean preserveOriginal) throws Exception { |
| Pattern[] patterns = new Pattern[regexes.length]; |
| for (int i = 0; i < regexes.length; i++) { |
| patterns[i] = Pattern.compile(regexes[i]); |
| } |
| |
| Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); |
| tokenizer.setReader( new StringReader(input)); |
| TokenStream ts = new PatternCaptureGroupTokenFilter(tokenizer, preserveOriginal, patterns); |
| assertTokenStreamContents(ts, tokens, startOffsets, endOffsets, positions); |
| } |
| |
| } |