blob: cc8f01d7df21c02526f878f0d66b4a58de17b7c3 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.pattern;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CharFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.charfilter.MappingCharFilter;
import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
/**
 * Tests for {@code PatternTokenizer}: regex-driven splitting (group -1) and
 * matching (group >= 0), offset correction through a {@link MappingCharFilter},
 * random-data robustness, and heap release after {@code close()} (LUCENE-6814).
 */
public class TestPatternTokenizer extends BaseTokenStreamTestCase {

  /**
   * Exercises both tokenizer modes over a table of cases: group -1 splits on the
   * pattern (delimiter mode), group 0 emits whole matches, group 1 emits the
   * first capture group. Expected output is the space-joined token stream.
   */
  public void testSplitting() throws Exception {
    String qpattern = "\\'([^\\']+)\\'"; // get stuff between "'"
    String[][] tests = {
      // group   pattern          input                   output
      { "-1",    "--",            "aaa--bbb--ccc",        "aaa bbb ccc" },
      { "-1",    ":",             "aaa:bbb:ccc",          "aaa bbb ccc" },
      { "-1",    "\\p{Space}",    "aaa   bbb \t\tccc  ",  "aaa bbb ccc" },
      { "-1",    ":",             "boo:and:foo",          "boo and foo" },
      { "-1",    "o",             "boo:and:foo",          "b :and:f" },
      { "0",     ":",             "boo:and:foo",          ": :" },
      { "0",     qpattern,        "aaa 'bbb' 'ccc'",      "'bbb' 'ccc'" },
      { "1",     qpattern,        "aaa 'bbb' 'ccc'",      "bbb ccc" }
    };

    for (String[] test : tests) {
      // Declare as Tokenizer so setReader needs no cast.
      Tokenizer stream =
          new PatternTokenizer(newAttributeFactory(), Pattern.compile(test[1]), Integer.parseInt(test[0]));
      stream.setReader(new StringReader(test[2]));
      String out = tsToString(stream);
      assertEquals("pattern: " + test[1] + " with input: " + test[2], test[3], out);
      // NOTE: a comparison against String.split for group -1 is intentionally
      // not performed — PatternTokenizer drops empty tokens, String.split does not.
    }
  }

  /**
   * Verifies that token offsets produced after a {@link MappingCharFilter}
   * (which shrinks {@code &uuml;} to a single char) are corrected back to
   * positions in the ORIGINAL input, for both split (-1) and match (0) modes.
   */
  public void testOffsetCorrection() throws Exception {
    final String INPUT = "G&uuml;nther G&uuml;nther is here";

    // Char filter that maps the HTML entity to the single character it denotes.
    NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
    builder.add("&uuml;", "ü");
    NormalizeCharMap normMap = builder.build();
    CharFilter charStream = new MappingCharFilter(normMap, new StringReader(INPUT));

    // Split mode: tokens separated by whitespace/punctuation.
    Tokenizer stream = new PatternTokenizer(newAttributeFactory(), Pattern.compile("[,;/\\s]+"), -1);
    stream.setReader(charStream);
    assertTokenStreamContents(stream,
        new String[] { "Günther", "Günther", "is", "here" },
        new int[] { 0, 13, 26, 29 },   // start offsets in the original (pre-filter) input
        new int[] { 12, 25, 28, 33 },  // end offsets in the original input
        INPUT.length());

    // Match mode: group 0 emits each full pattern match.
    charStream = new MappingCharFilter(normMap, new StringReader(INPUT));
    stream = new PatternTokenizer(newAttributeFactory(), Pattern.compile("Günther"), 0);
    stream.setReader(charStream);
    assertTokenStreamContents(stream,
        new String[] { "Günther", "Günther" },
        new int[] { 0, 13 },
        new int[] { 12, 25 },
        INPUT.length());
  }

  /**
   * Drains a token stream and joins the terms with single spaces.
   * Between tokens the attribute state is cleared and seeded with a bogus
   * term to catch tokenizers that rely on stale attribute state.
   *
   * TODO: rewrite tests not to use string comparison.
   */
  private static String tsToString(TokenStream in) throws IOException {
    StringBuilder out = new StringBuilder();
    CharTermAttribute termAtt = in.addAttribute(CharTermAttribute.class);
    // extra safety to enforce, that the state is not preserved and also
    // assign bogus values
    in.clearAttributes();
    termAtt.setEmpty().append("bogusTerm");
    in.reset();
    while (in.incrementToken()) {
      if (out.length() > 0) {
        out.append(' ');
      }
      out.append(termAtt.toString());
      in.clearAttributes();
      termAtt.setEmpty().append("bogusTerm");
    }
    in.close();
    return out.toString();
  }

  /** blast some random strings through the analyzer */
  public void testRandomStrings() throws Exception {
    // Split mode (group -1).
    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer tokenizer = new PatternTokenizer(newAttributeFactory(), Pattern.compile("a"), -1);
        return new TokenStreamComponents(tokenizer);
      }
    };
    checkRandomData(random(), a, 200 * RANDOM_MULTIPLIER);
    a.close();

    // Match mode (group 0).
    Analyzer b = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer tokenizer = new PatternTokenizer(newAttributeFactory(), Pattern.compile("a"), 0);
        return new TokenStreamComponents(tokenizer);
      }
    };
    checkRandomData(random(), b, 200 * RANDOM_MULTIPLIER);
    b.close();
  }

  // LUCENE-6814: PatternTokenizer must not hang onto the last (large) input
  // buffer after close(); 512 retained tokenizers over a 1MB input would
  // otherwise exhaust the test heap.
  @Nightly
  public void testHeapFreedAfterClose() throws Exception {
    // TODO: can we move this to BaseTSTC to catch other "hangs onto heap"ers?

    // Build a 1MB string: 1024 chunks of 1023 spaces followed by an 'x'.
    StringBuilder b = new StringBuilder();
    for (int i = 0; i < 1024; i++) {
      for (int j = 0; j < 1023; j++) {
        b.append(' ');
      }
      b.append('x');
    }
    String big = b.toString();

    Pattern x = Pattern.compile("x");
    // Keep all tokenizers reachable so only close() can release their buffers.
    List<Tokenizer> tokenizers = new ArrayList<>();
    for (int i = 0; i < 512; i++) {
      Tokenizer stream = new PatternTokenizer(x, -1);
      tokenizers.add(stream);
      stream.setReader(new StringReader(big));
      stream.reset();
      for (int j = 0; j < 1024; j++) {
        assertTrue(stream.incrementToken());
      }
      assertFalse(stream.incrementToken());
      stream.end();
      stream.close();
    }
  }
}