| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.analysis.pattern; |
| |
| import java.io.IOException; |
| import java.io.StringReader; |
| import java.util.ArrayList; |
| import java.util.List; |
| import java.util.regex.Pattern; |
| |
| import org.apache.lucene.analysis.Analyzer; |
| import org.apache.lucene.analysis.BaseTokenStreamTestCase; |
| import org.apache.lucene.analysis.CharFilter; |
| import org.apache.lucene.analysis.TokenStream; |
| import org.apache.lucene.analysis.Tokenizer; |
| import org.apache.lucene.analysis.charfilter.MappingCharFilter; |
| import org.apache.lucene.analysis.charfilter.NormalizeCharMap; |
| import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; |
| |
| public class TestPatternTokenizer extends BaseTokenStreamTestCase |
| { |
| public void testSplitting() throws Exception |
| { |
| String qpattern = "\\'([^\\']+)\\'"; // get stuff between "'" |
| String[][] tests = { |
| // group pattern input output |
| { "-1", "--", "aaa--bbb--ccc", "aaa bbb ccc" }, |
| { "-1", ":", "aaa:bbb:ccc", "aaa bbb ccc" }, |
| { "-1", "\\p{Space}", "aaa bbb \t\tccc ", "aaa bbb ccc" }, |
| { "-1", ":", "boo:and:foo", "boo and foo" }, |
| { "-1", "o", "boo:and:foo", "b :and:f" }, |
| { "0", ":", "boo:and:foo", ": :" }, |
| { "0", qpattern, "aaa 'bbb' 'ccc'", "'bbb' 'ccc'" }, |
| { "1", qpattern, "aaa 'bbb' 'ccc'", "bbb ccc" } |
| }; |
| |
| for( String[] test : tests ) { |
| TokenStream stream = new PatternTokenizer(newAttributeFactory(), Pattern.compile(test[1]), Integer.parseInt(test[0])); |
| ((Tokenizer)stream).setReader(new StringReader(test[2])); |
| String out = tsToString( stream ); |
| // System.out.println( test[2] + " ==> " + out ); |
| |
| assertEquals("pattern: "+test[1]+" with input: "+test[2], test[3], out ); |
| |
| // Make sure it is the same as if we called 'split' |
| // test disabled, as we remove empty tokens |
| /*if( "-1".equals( test[0] ) ) { |
| String[] split = test[2].split( test[1] ); |
| stream = tokenizer.create( new StringReader( test[2] ) ); |
| int i=0; |
| for( Token t = stream.next(); null != t; t = stream.next() ) |
| { |
| assertEquals( "split: "+test[1] + " "+i, split[i++], new String(t.termBuffer(), 0, t.termLength()) ); |
| } |
| }*/ |
| } |
| } |
| |
| public void testOffsetCorrection() throws Exception { |
| final String INPUT = "Günther Günther is here"; |
| |
| // create MappingCharFilter |
| List<String> mappingRules = new ArrayList<>(); |
| mappingRules.add( "\"ü\" => \"ü\"" ); |
| NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder(); |
| builder.add("ü", "ü"); |
| NormalizeCharMap normMap = builder.build(); |
| CharFilter charStream = new MappingCharFilter( normMap, new StringReader( INPUT ) ); |
| |
| // create PatternTokenizer |
| Tokenizer stream = new PatternTokenizer(newAttributeFactory(), Pattern.compile("[,;/\\s]+"), -1); |
| stream.setReader(charStream); |
| assertTokenStreamContents(stream, |
| new String[] { "Günther", "Günther", "is", "here" }, |
| new int[] { 0, 13, 26, 29 }, |
| new int[] { 12, 25, 28, 33 }, |
| INPUT.length()); |
| |
| charStream = new MappingCharFilter( normMap, new StringReader( INPUT ) ); |
| stream = new PatternTokenizer(newAttributeFactory(), Pattern.compile("Günther"), 0); |
| stream.setReader(charStream); |
| assertTokenStreamContents(stream, |
| new String[] { "Günther", "Günther" }, |
| new int[] { 0, 13 }, |
| new int[] { 12, 25 }, |
| INPUT.length()); |
| } |
| |
| /** |
| * TODO: rewrite tests not to use string comparison. |
| */ |
| private static String tsToString(TokenStream in) throws IOException { |
| StringBuilder out = new StringBuilder(); |
| CharTermAttribute termAtt = in.addAttribute(CharTermAttribute.class); |
| // extra safety to enforce, that the state is not preserved and also |
| // assign bogus values |
| in.clearAttributes(); |
| termAtt.setEmpty().append("bogusTerm"); |
| in.reset(); |
| while (in.incrementToken()) { |
| if (out.length() > 0) |
| out.append(' '); |
| out.append(termAtt.toString()); |
| in.clearAttributes(); |
| termAtt.setEmpty().append("bogusTerm"); |
| } |
| |
| in.close(); |
| return out.toString(); |
| } |
| |
| /** blast some random strings through the analyzer */ |
| public void testRandomStrings() throws Exception { |
| Analyzer a = new Analyzer() { |
| @Override |
| protected TokenStreamComponents createComponents(String fieldName) { |
| Tokenizer tokenizer = new PatternTokenizer(newAttributeFactory(), Pattern.compile("a"), -1); |
| return new TokenStreamComponents(tokenizer); |
| } |
| }; |
| checkRandomData(random(), a, 200 * RANDOM_MULTIPLIER); |
| a.close(); |
| |
| Analyzer b = new Analyzer() { |
| @Override |
| protected TokenStreamComponents createComponents(String fieldName) { |
| Tokenizer tokenizer = new PatternTokenizer(newAttributeFactory(), Pattern.compile("a"), 0); |
| return new TokenStreamComponents(tokenizer); |
| } |
| }; |
| checkRandomData(random(), b, 200 * RANDOM_MULTIPLIER); |
| b.close(); |
| } |
| |
| // LUCENE-6814 |
| @Nightly |
| public void testHeapFreedAfterClose() throws Exception { |
| // TODO: can we move this to BaseTSTC to catch other "hangs onto heap"ers? |
| |
| // Build a 1MB string: |
| StringBuilder b = new StringBuilder(); |
| for(int i=0;i<1024;i++) { |
| // 1023 spaces, then an x |
| for(int j=0;j<1023;j++) { |
| b.append(' '); |
| } |
| b.append('x'); |
| } |
| |
| String big = b.toString(); |
| |
| Pattern x = Pattern.compile("x"); |
| |
| List<Tokenizer> tokenizers = new ArrayList<>(); |
| for(int i=0;i<512;i++) { |
| Tokenizer stream = new PatternTokenizer(x, -1); |
| tokenizers.add(stream); |
| stream.setReader(new StringReader(big)); |
| stream.reset(); |
| for(int j=0;j<1024;j++) { |
| assertTrue(stream.incrementToken()); |
| } |
| assertFalse(stream.incrementToken()); |
| stream.end(); |
| stream.close(); |
| } |
| } |
| } |