| Index: solr/src/test/org/apache/solr/analysis/TestPatternTokenizerFactory.java
|
| ===================================================================
|
| --- solr/src/test/org/apache/solr/analysis/TestPatternTokenizerFactory.java (revision 940789)
|
| +++ solr/src/test/org/apache/solr/analysis/TestPatternTokenizerFactory.java (working copy)
|
| @@ -17,120 +17,25 @@
|
| |
| package org.apache.solr.analysis; |
| |
| -import java.io.IOException; |
| import java.io.StringReader; |
| -import java.util.ArrayList; |
| import java.util.HashMap; |
| -import java.util.List; |
| import java.util.Map; |
| |
| -import org.apache.lucene.analysis.CharReader; |
| -import org.apache.lucene.analysis.CharStream; |
| -import org.apache.lucene.analysis.charfilter.MappingCharFilter; |
| -import org.apache.lucene.analysis.charfilter.NormalizeCharMap; |
| import org.apache.lucene.analysis.TokenStream; |
| -import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; |
| |
| +/** Simple Tests to ensure this factory is working */ |
| public class TestPatternTokenizerFactory extends BaseTokenTestCase |
| { |
| - public void testSplitting() throws Exception |
| - { |
| - String qpattern = "\\'([^\\']+)\\'"; // get stuff between "'" |
| - String[][] tests = { |
| - // group pattern input output |
| - { "-1", "--", "aaa--bbb--ccc", "aaa bbb ccc" }, |
| - { "-1", ":", "aaa:bbb:ccc", "aaa bbb ccc" }, |
| - { "-1", "\\p{Space}", "aaa bbb \t\tccc ", "aaa bbb ccc" }, |
| - { "-1", ":", "boo:and:foo", "boo and foo" }, |
| - { "-1", "o", "boo:and:foo", "b :and:f" }, |
| - { "0", ":", "boo:and:foo", ": :" }, |
| - { "0", qpattern, "aaa 'bbb' 'ccc'", "'bbb' 'ccc'" }, |
| - { "1", qpattern, "aaa 'bbb' 'ccc'", "bbb ccc" } |
| - }; |
| - |
| - |
| - Map<String,String> args = new HashMap<String, String>(); |
| - for( String[] test : tests ) { |
| - args.put( PatternTokenizerFactory.GROUP, test[0] ); |
| - args.put( PatternTokenizerFactory.PATTERN, test[1] ); |
| + public void testFactory() throws Exception { |
| + final String INPUT = "G&uuml;nther G&uuml;nther is here"; |
| |
| - PatternTokenizerFactory tokenizer = new PatternTokenizerFactory(); |
| - tokenizer.init( args ); |
| - |
| - TokenStream stream = tokenizer.create( new StringReader( test[2] ) ); |
| - String out = tsToString( stream ); |
| - // System.out.println( test[2] + " ==> " + out ); |
| - |
| - assertEquals("pattern: "+test[1]+" with input: "+test[2], test[3], out ); |
| - |
| - // Make sure it is the same as if we called 'split' |
| - // test disabled, as we remove empty tokens |
| - /*if( "-1".equals( test[0] ) ) { |
| - String[] split = test[2].split( test[1] ); |
| - stream = tokenizer.create( new StringReader( test[2] ) ); |
| - int i=0; |
| - for( Token t = stream.next(); null != t; t = stream.next() ) |
| - { |
| - assertEquals( "split: "+test[1] + " "+i, split[i++], new String(t.termBuffer(), 0, t.termLength()) ); |
| - } |
| - }*/ |
| - } |
| - } |
| - |
| - public void testOffsetCorrection() throws Exception { |
| - final String INPUT = "G&uuml;nther G&uuml;nther is here"; |
| - |
| - // create MappingCharFilter |
| - MappingCharFilterFactory cfFactory = new MappingCharFilterFactory(); |
| - List<String> mappingRules = new ArrayList<String>(); |
| - mappingRules.add( "\"&uuml;\" => \"ü\"" ); |
| - NormalizeCharMap normMap = new NormalizeCharMap(); |
| - cfFactory.parseRules( mappingRules, normMap ); |
| - CharStream charStream = new MappingCharFilter( normMap, CharReader.get( new StringReader( INPUT ) ) ); |
| - |
| // create PatternTokenizer |
| Map<String,String> args = new HashMap<String, String>(); |
| args.put( PatternTokenizerFactory.PATTERN, "[,;/\\s]+" ); |
| PatternTokenizerFactory tokFactory = new PatternTokenizerFactory(); |
| tokFactory.init( args ); |
| - TokenStream stream = tokFactory.create( charStream ); |
| + TokenStream stream = tokFactory.create( new StringReader(INPUT) ); |
| assertTokenStreamContents(stream, |
| - new String[] { "Günther", "Günther", "is", "here" }, |
| - new int[] { 0, 13, 26, 29 }, |
| - new int[] { 12, 25, 28, 33 }); |
| - |
| - charStream = new MappingCharFilter( normMap, CharReader.get( new StringReader( INPUT ) ) ); |
| - args.put( PatternTokenizerFactory.PATTERN, "Günther" ); |
| - args.put( PatternTokenizerFactory.GROUP, "0" ); |
| - tokFactory = new PatternTokenizerFactory(); |
| - tokFactory.init( args ); |
| - stream = tokFactory.create( charStream ); |
| - assertTokenStreamContents(stream, |
| - new String[] { "Günther", "Günther" }, |
| - new int[] { 0, 13 }, |
| - new int[] { 12, 25 }); |
| + new String[] { "G&uuml;nther", "G&uuml;nther", "is", "here" }); |
| } |
| - |
| - /** |
| - * TODO: rewrite tests not to use string comparison. |
| - * @deprecated only tests TermAttribute! |
| - */ |
| - private static String tsToString(TokenStream in) throws IOException { |
| - StringBuilder out = new StringBuilder(); |
| - CharTermAttribute termAtt = in.addAttribute(CharTermAttribute.class); |
| - // extra safety to enforce, that the state is not preserved and also |
| - // assign bogus values |
| - in.clearAttributes(); |
| - termAtt.setEmpty().append("bogusTerm"); |
| - while (in.incrementToken()) { |
| - if (out.length() > 0) |
| - out.append(' '); |
| - out.append(termAtt.toString()); |
| - in.clearAttributes(); |
| - termAtt.setEmpty().append("bogusTerm"); |
| - } |
| - |
| - in.close(); |
| - return out.toString(); |
| - } |
| } |
| Index: solr/src/test/org/apache/solr/analysis/TestPatternReplaceCharFilterFactory.java
|
| ===================================================================
|
| --- solr/src/test/org/apache/solr/analysis/TestPatternReplaceCharFilterFactory.java (revision 940789)
|
| +++ solr/src/test/org/apache/solr/analysis/TestPatternReplaceCharFilterFactory.java (working copy)
|
| @@ -21,7 +21,6 @@
|
| import java.io.StringReader; |
| import java.util.HashMap; |
| import java.util.Map; |
| -import java.util.regex.Pattern; |
| |
| import org.apache.lucene.analysis.CharReader; |
| import org.apache.lucene.analysis.CharStream; |
| @@ -29,11 +28,9 @@
|
| import org.apache.lucene.analysis.WhitespaceTokenizer; |
| |
| /** |
| - * |
| - * @version $Id$ |
| - * |
| + * Simple tests to ensure this factory is working |
| */ |
| -public class TestPatternReplaceCharFilter extends BaseTokenTestCase { |
| +public class TestPatternReplaceCharFilterFactory extends BaseTokenTestCase { |
| |
| // 1111 |
| // 01234567890123 |
| @@ -86,99 +83,4 @@
|
| new int[] { 0 }, |
| new int[] { 8 }); |
| } |
| - |
| - // 11111 |
| - // 012345678901234 |
| - // aa bb cc dd |
| - // aa##bb###cc dd |
| - public void test1block1matchLonger() throws IOException { |
| - final String BLOCK = "aa bb cc dd"; |
| - CharStream cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)\\s+(cc)"), "$1##$2###$3", |
| - CharReader.get( new StringReader( BLOCK ) ) ); |
| - TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, cs ); |
| - assertTokenStreamContents(ts, |
| - new String[] { "aa##bb###cc", "dd" }, |
| - new int[] { 0, 9 }, |
| - new int[] { 8, 11 }); |
| - } |
| - |
| - // 01234567 |
| - // a a |
| - // aa aa |
| - public void test1block2matchLonger() throws IOException { |
| - final String BLOCK = " a a"; |
| - CharStream cs = new PatternReplaceCharFilter( pattern("a"), "aa", |
| - CharReader.get( new StringReader( BLOCK ) ) ); |
| - TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, cs ); |
| - assertTokenStreamContents(ts, |
| - new String[] { "aa", "aa" }, |
| - new int[] { 1, 4 }, |
| - new int[] { 2, 5 }); |
| - } |
| - |
| - // 11111 |
| - // 012345678901234 |
| - // aa bb cc dd |
| - // aa#bb dd |
| - public void test1block1matchShorter() throws IOException { |
| - final String BLOCK = "aa bb cc dd"; |
| - CharStream cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)\\s+(cc)"), "$1#$2", |
| - CharReader.get( new StringReader( BLOCK ) ) ); |
| - TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, cs ); |
| - assertTokenStreamContents(ts, |
| - new String[] { "aa#bb", "dd" }, |
| - new int[] { 0, 12 }, |
| - new int[] { 11, 14 }); |
| - } |
| - |
| - // 111111111122222222223333 |
| - // 0123456789012345678901234567890123 |
| - // aa bb cc --- aa bb aa bb cc |
| - // aa bb cc --- aa bb aa bb cc |
| - public void test1blockMultiMatches() throws IOException { |
| - final String BLOCK = " aa bb cc --- aa bb aa bb cc"; |
| - CharStream cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)\\s+(cc)"), "$1 $2 $3", |
| - CharReader.get( new StringReader( BLOCK ) ) ); |
| - TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, cs ); |
| - assertTokenStreamContents(ts, |
| - new String[] { "aa", "bb", "cc", "---", "aa", "bb", "aa", "bb", "cc" }, |
| - new int[] { 2, 6, 9, 11, 15, 18, 21, 25, 29 }, |
| - new int[] { 4, 8, 10, 14, 17, 20, 23, 27, 33 }); |
| - } |
| - |
| - // 11111111112222222222333333333 |
| - // 012345678901234567890123456789012345678 |
| - // aa bb cc --- aa bb aa. bb aa bb cc |
| - // aa##bb cc --- aa##bb aa. bb aa##bb cc |
| - public void test2blocksMultiMatches() throws IOException { |
| - final String BLOCK = " aa bb cc --- aa bb aa. bb aa bb cc"; |
| - CharStream cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)"), "$1##$2", ".", |
| - CharReader.get( new StringReader( BLOCK ) ) ); |
| - TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, cs ); |
| - assertTokenStreamContents(ts, |
| - new String[] { "aa##bb", "cc", "---", "aa##bb", "aa.", "bb", "aa##bb", "cc" }, |
| - new int[] { 2, 8, 11, 15, 21, 25, 28, 36 }, |
| - new int[] { 7, 10, 14, 20, 24, 27, 35, 38 }); |
| - } |
| - |
| - // 11111111112222222222333333333 |
| - // 012345678901234567890123456789012345678 |
| - // a bb - ccc . --- bb a . ccc ccc bb |
| - // aa b - c . --- b aa . c c b |
| - public void testChain() throws IOException { |
| - final String BLOCK = " a bb - ccc . --- bb a . ccc ccc bb"; |
| - CharStream cs = new PatternReplaceCharFilter( pattern("a"), "aa", ".", |
| - CharReader.get( new StringReader( BLOCK ) ) ); |
| - cs = new PatternReplaceCharFilter( pattern("bb"), "b", ".", cs ); |
| - cs = new PatternReplaceCharFilter( pattern("ccc"), "c", ".", cs ); |
| - TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, cs ); |
| - assertTokenStreamContents(ts, |
| - new String[] { "aa", "b", "-", "c", ".", "---", "b", "aa", ".", "c", "c", "b" }, |
| - new int[] { 1, 3, 6, 8, 12, 14, 18, 21, 23, 25, 29, 33 }, |
| - new int[] { 2, 5, 7, 11, 13, 17, 20, 22, 24, 28, 32, 35 }); |
| - } |
| - |
| - private Pattern pattern( String p ){ |
| - return Pattern.compile( p ); |
| - } |
| } |
| Index: solr/src/test/org/apache/solr/analysis/TestPatternReplaceFilterFactory.java
|
| ===================================================================
|
| --- solr/src/test/org/apache/solr/analysis/TestPatternReplaceFilterFactory.java (revision 940789)
|
| +++ solr/src/test/org/apache/solr/analysis/TestPatternReplaceFilterFactory.java (working copy)
|
| @@ -21,61 +21,25 @@
|
| import org.apache.lucene.analysis.WhitespaceTokenizer; |
| |
| import java.io.StringReader; |
| -import java.util.regex.Pattern; |
| +import java.util.HashMap; |
| +import java.util.Map; |
| |
| /** |
| - * @version $Id:$ |
| + * Simple tests to ensure this factory is working |
| */ |
| -public class TestPatternReplaceFilter extends BaseTokenTestCase { |
| +public class TestPatternReplaceFilterFactory extends BaseTokenTestCase { |
| |
| public void testReplaceAll() throws Exception { |
| String input = "aabfooaabfooabfoob ab caaaaaaaaab"; |
| - TokenStream ts = new PatternReplaceFilter |
| - (new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)), |
| - Pattern.compile("a*b"), |
| - "-", true); |
| + PatternReplaceFilterFactory factory = new PatternReplaceFilterFactory(); |
| + Map<String,String> args = new HashMap<String,String>(); |
| + args.put("pattern", "a*b"); |
| + args.put("replacement", "-"); |
| + factory.init(args); |
| + TokenStream ts = factory.create |
| + (new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input))); |
| + |
| assertTokenStreamContents(ts, |
| new String[] { "-foo-foo-foo-", "-", "c-" }); |
| } |
| - |
| - public void testReplaceFirst() throws Exception { |
| - String input = "aabfooaabfooabfoob ab caaaaaaaaab"; |
| - TokenStream ts = new PatternReplaceFilter |
| - (new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)), |
| - Pattern.compile("a*b"), |
| - "-", false); |
| - assertTokenStreamContents(ts, |
| - new String[] { "-fooaabfooabfoob", "-", "c-" }); |
| - } |
| - |
| - public void testStripFirst() throws Exception { |
| - String input = "aabfooaabfooabfoob ab caaaaaaaaab"; |
| - TokenStream ts = new PatternReplaceFilter |
| - (new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)), |
| - Pattern.compile("a*b"), |
| - null, false); |
| - assertTokenStreamContents(ts, |
| - new String[] { "fooaabfooabfoob", "", "c" }); |
| - } |
| - |
| - public void testStripAll() throws Exception { |
| - String input = "aabfooaabfooabfoob ab caaaaaaaaab"; |
| - TokenStream ts = new PatternReplaceFilter |
| - (new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)), |
| - Pattern.compile("a*b"), |
| - null, true); |
| - assertTokenStreamContents(ts, |
| - new String[] { "foofoofoo", "", "c" }); |
| - } |
| - |
| - public void testReplaceAllWithBackRef() throws Exception { |
| - String input = "aabfooaabfooabfoob ab caaaaaaaaab"; |
| - TokenStream ts = new PatternReplaceFilter |
| - (new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)), |
| - Pattern.compile("(a*)b"), |
| - "$1\\$", true); |
| - assertTokenStreamContents(ts, |
| - new String[] { "aa$fooaa$fooa$foo$", "a$", "caaaaaaaaa$" }); |
| - } |
| - |
| } |
| Index: solr/src/test/org/apache/solr/analysis/TestPatternReplaceCharFilter.java (deleted)
|
| ===================================================================
|
| Index: solr/src/test/org/apache/solr/analysis/TestPatternReplaceFilter.java (deleted)
|
| ===================================================================
|
| Index: solr/src/java/org/apache/solr/analysis/PatternTokenizerFactory.java
|
| ===================================================================
|
| --- solr/src/java/org/apache/solr/analysis/PatternTokenizerFactory.java (revision 940789)
|
| +++ solr/src/java/org/apache/solr/analysis/PatternTokenizerFactory.java (working copy)
|
| @@ -27,6 +27,7 @@
|
| |
| import org.apache.lucene.analysis.Token; |
| import org.apache.lucene.analysis.Tokenizer; |
| +import org.apache.lucene.analysis.pattern.PatternTokenizer; |
| import org.apache.solr.common.SolrException; |
| |
| |
| Index: solr/src/java/org/apache/solr/analysis/PatternReplaceFilterFactory.java
|
| ===================================================================
|
| --- solr/src/java/org/apache/solr/analysis/PatternReplaceFilterFactory.java (revision 940789)
|
| +++ solr/src/java/org/apache/solr/analysis/PatternReplaceFilterFactory.java (working copy)
|
| @@ -17,6 +17,7 @@
|
| |
| package org.apache.solr.analysis; |
| import org.apache.lucene.analysis.TokenStream; |
| +import org.apache.lucene.analysis.pattern.PatternReplaceFilter; |
| |
| import java.util.Map; |
| import java.util.regex.Pattern; |
| Index: solr/src/java/org/apache/solr/analysis/PatternReplaceCharFilter.java (deleted)
|
| ===================================================================
|
| Index: solr/src/java/org/apache/solr/analysis/PatternReplaceFilter.java (deleted)
|
| ===================================================================
|
| Index: solr/src/java/org/apache/solr/analysis/PatternTokenizer.java (deleted)
|
| ===================================================================
|
| Index: solr/src/java/org/apache/solr/analysis/PatternReplaceCharFilterFactory.java
|
| ===================================================================
|
| --- solr/src/java/org/apache/solr/analysis/PatternReplaceCharFilterFactory.java (revision 940789)
|
| +++ solr/src/java/org/apache/solr/analysis/PatternReplaceCharFilterFactory.java (working copy)
|
| @@ -22,6 +22,7 @@
|
| import java.util.regex.PatternSyntaxException; |
| |
| import org.apache.lucene.analysis.CharStream; |
| +import org.apache.lucene.analysis.pattern.PatternReplaceCharFilter; |
| |
| /** |
| * |
| Index: lucene/contrib/CHANGES.txt
|
| ===================================================================
|
| --- lucene/contrib/CHANGES.txt (revision 940789)
|
| +++ lucene/contrib/CHANGES.txt (working copy)
|
| @@ -92,6 +92,9 @@
|
| stemming. Add Turkish and Romanian stopwords lists to support this. |
| (Robert Muir, Uwe Schindler, Simon Willnauer) |
| |
| + * LUCENE-2413: Deprecated PatternAnalyzer in contrib/analyzers, in favor of the |
| + pattern package (CharFilter, Tokenizer, TokenFilter). (Robert Muir) |
| + |
| New features |
| |
| * LUCENE-2306: Add NumericRangeFilter and NumericRangeQuery support to XMLQueryParser. |
| @@ -165,6 +168,8 @@
|
| into subwords and performs optional transformations on subword groups. |
| - o.a.l.analysis.miscellaneous.RemoveDuplicatesTokenFilter: TokenFilter which |
| filters out Tokens at the same position and Term text as the previous token. |
| + - o.a.l.analysis.pattern: Package for pattern-based analysis, containing a |
| + CharFilter, Tokenizer, and TokenFilter for transforming text with regexes. |
| (... in progress) |
| |
| Build |
| Index: lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/pattern/TestPatternReplaceFilter.java
|
| ===================================================================
|
| --- lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/pattern/TestPatternReplaceFilter.java (revision 0)
|
| +++ lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/pattern/TestPatternReplaceFilter.java (working copy)
|
| @@ -15,8 +15,9 @@
|
| * limitations under the License. |
| */ |
| |
| -package org.apache.solr.analysis; |
| +package org.apache.lucene.analysis.pattern; |
| |
| +import org.apache.lucene.analysis.BaseTokenStreamTestCase; |
| import org.apache.lucene.analysis.TokenStream; |
| import org.apache.lucene.analysis.WhitespaceTokenizer; |
| |
| @@ -26,12 +27,12 @@
|
| /** |
| * @version $Id:$ |
| */ |
| -public class TestPatternReplaceFilter extends BaseTokenTestCase { |
| +public class TestPatternReplaceFilter extends BaseTokenStreamTestCase { |
| |
| public void testReplaceAll() throws Exception { |
| String input = "aabfooaabfooabfoob ab caaaaaaaaab"; |
| TokenStream ts = new PatternReplaceFilter |
| - (new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)), |
| + (new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input)), |
| Pattern.compile("a*b"), |
| "-", true); |
| assertTokenStreamContents(ts, |
| @@ -41,7 +42,7 @@
|
| public void testReplaceFirst() throws Exception { |
| String input = "aabfooaabfooabfoob ab caaaaaaaaab"; |
| TokenStream ts = new PatternReplaceFilter |
| - (new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)), |
| + (new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input)), |
| Pattern.compile("a*b"), |
| "-", false); |
| assertTokenStreamContents(ts, |
| @@ -51,7 +52,7 @@
|
| public void testStripFirst() throws Exception { |
| String input = "aabfooaabfooabfoob ab caaaaaaaaab"; |
| TokenStream ts = new PatternReplaceFilter |
| - (new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)), |
| + (new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input)), |
| Pattern.compile("a*b"), |
| null, false); |
| assertTokenStreamContents(ts, |
| @@ -61,7 +62,7 @@
|
| public void testStripAll() throws Exception { |
| String input = "aabfooaabfooabfoob ab caaaaaaaaab"; |
| TokenStream ts = new PatternReplaceFilter |
| - (new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)), |
| + (new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input)), |
| Pattern.compile("a*b"), |
| null, true); |
| assertTokenStreamContents(ts, |
| @@ -71,7 +72,7 @@
|
| public void testReplaceAllWithBackRef() throws Exception { |
| String input = "aabfooaabfooabfoob ab caaaaaaaaab"; |
| TokenStream ts = new PatternReplaceFilter |
| - (new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)), |
| + (new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input)), |
| Pattern.compile("(a*)b"), |
| "$1\\$", true); |
| assertTokenStreamContents(ts, |
| Index: lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/pattern/TestPatternTokenizer.java
|
| ===================================================================
|
| --- lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/pattern/TestPatternTokenizer.java (revision 0)
|
| +++ lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/pattern/TestPatternTokenizer.java (working copy)
|
| @@ -15,15 +15,15 @@
|
| * limitations under the License. |
| */ |
| |
| -package org.apache.solr.analysis; |
| +package org.apache.lucene.analysis.pattern; |
| |
| import java.io.IOException; |
| import java.io.StringReader; |
| import java.util.ArrayList; |
| -import java.util.HashMap; |
| import java.util.List; |
| -import java.util.Map; |
| +import java.util.regex.Pattern; |
| |
| +import org.apache.lucene.analysis.BaseTokenStreamTestCase; |
| import org.apache.lucene.analysis.CharReader; |
| import org.apache.lucene.analysis.CharStream; |
| import org.apache.lucene.analysis.charfilter.MappingCharFilter; |
| @@ -31,7 +31,7 @@
|
| import org.apache.lucene.analysis.TokenStream; |
| import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; |
| |
| -public class TestPatternTokenizerFactory extends BaseTokenTestCase |
| +public class TestPatternTokenizer extends BaseTokenStreamTestCase |
| { |
| public void testSplitting() throws Exception |
| { |
| @@ -48,16 +48,8 @@
|
| { "1", qpattern, "aaa 'bbb' 'ccc'", "bbb ccc" } |
| }; |
| |
| - |
| - Map<String,String> args = new HashMap<String, String>(); |
| - for( String[] test : tests ) { |
| - args.put( PatternTokenizerFactory.GROUP, test[0] ); |
| - args.put( PatternTokenizerFactory.PATTERN, test[1] ); |
| - |
| - PatternTokenizerFactory tokenizer = new PatternTokenizerFactory(); |
| - tokenizer.init( args ); |
| - |
| - TokenStream stream = tokenizer.create( new StringReader( test[2] ) ); |
| + for( String[] test : tests ) { |
| + TokenStream stream = new PatternTokenizer(new StringReader(test[2]), Pattern.compile(test[1]), Integer.parseInt(test[0])); |
| String out = tsToString( stream ); |
| // System.out.println( test[2] + " ==> " + out ); |
| |
| @@ -81,30 +73,21 @@
|
| final String INPUT = "G&uuml;nther G&uuml;nther is here"; |
| |
| // create MappingCharFilter |
| - MappingCharFilterFactory cfFactory = new MappingCharFilterFactory(); |
| List<String> mappingRules = new ArrayList<String>(); |
| mappingRules.add( "\"&uuml;\" => \"ü\"" ); |
| NormalizeCharMap normMap = new NormalizeCharMap(); |
| - cfFactory.parseRules( mappingRules, normMap ); |
| + normMap.add("&uuml;", "ü"); |
| CharStream charStream = new MappingCharFilter( normMap, CharReader.get( new StringReader( INPUT ) ) ); |
| |
| // create PatternTokenizer |
| - Map<String,String> args = new HashMap<String, String>(); |
| - args.put( PatternTokenizerFactory.PATTERN, "[,;/\\s]+" ); |
| - PatternTokenizerFactory tokFactory = new PatternTokenizerFactory(); |
| - tokFactory.init( args ); |
| - TokenStream stream = tokFactory.create( charStream ); |
| + TokenStream stream = new PatternTokenizer(charStream, Pattern.compile("[,;/\\s]+"), -1); |
| assertTokenStreamContents(stream, |
| new String[] { "Günther", "Günther", "is", "here" }, |
| new int[] { 0, 13, 26, 29 }, |
| new int[] { 12, 25, 28, 33 }); |
| |
| charStream = new MappingCharFilter( normMap, CharReader.get( new StringReader( INPUT ) ) ); |
| - args.put( PatternTokenizerFactory.PATTERN, "Günther" ); |
| - args.put( PatternTokenizerFactory.GROUP, "0" ); |
| - tokFactory = new PatternTokenizerFactory(); |
| - tokFactory.init( args ); |
| - stream = tokFactory.create( charStream ); |
| + stream = new PatternTokenizer(charStream, Pattern.compile("Günther"), 0); |
| assertTokenStreamContents(stream, |
| new String[] { "Günther", "Günther" }, |
| new int[] { 0, 13 }, |
| Index: lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/pattern/TestPatternReplaceCharFilter.java
|
| ===================================================================
|
| --- lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/pattern/TestPatternReplaceCharFilter.java (revision 0)
|
| +++ lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/pattern/TestPatternReplaceCharFilter.java (working copy)
|
| @@ -15,39 +15,31 @@
|
| * limitations under the License. |
| */ |
| |
| -package org.apache.solr.analysis; |
| +package org.apache.lucene.analysis.pattern; |
| |
| import java.io.IOException; |
| import java.io.StringReader; |
| -import java.util.HashMap; |
| -import java.util.Map; |
| import java.util.regex.Pattern; |
| |
| +import org.apache.lucene.analysis.BaseTokenStreamTestCase; |
| import org.apache.lucene.analysis.CharReader; |
| import org.apache.lucene.analysis.CharStream; |
| import org.apache.lucene.analysis.TokenStream; |
| import org.apache.lucene.analysis.WhitespaceTokenizer; |
| |
| /** |
| - * |
| - * @version $Id$ |
| - * |
| + * Tests {@link PatternReplaceCharFilter} |
| */ |
| -public class TestPatternReplaceCharFilter extends BaseTokenTestCase { |
| +public class TestPatternReplaceCharFilter extends BaseTokenStreamTestCase { |
| |
| // 1111 |
| // 01234567890123 |
| // this is test. |
| public void testNothingChange() throws IOException { |
| final String BLOCK = "this is test."; |
| - PatternReplaceCharFilterFactory factory = new PatternReplaceCharFilterFactory(); |
| - Map<String,String> args = new HashMap<String,String>(); |
| - args.put("pattern", "(aa)\\s+(bb)\\s+(cc)"); |
| - args.put("replacement", "$1$2$3"); |
| - factory.init(args); |
| - CharStream cs = factory.create( |
| + CharStream cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)\\s+(cc)"), "$1$2$3", |
| CharReader.get( new StringReader( BLOCK ) ) ); |
| - TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, cs ); |
| + TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, cs ); |
| assertTokenStreamContents(ts, |
| new String[] { "this", "is", "test." }, |
| new int[] { 0, 5, 8 }, |
| @@ -58,13 +50,9 @@
|
| // aa bb cc |
| public void testReplaceByEmpty() throws IOException { |
| final String BLOCK = "aa bb cc"; |
| - PatternReplaceCharFilterFactory factory = new PatternReplaceCharFilterFactory(); |
| - Map<String,String> args = new HashMap<String,String>(); |
| - args.put("pattern", "(aa)\\s+(bb)\\s+(cc)"); |
| - factory.init(args); |
| - CharStream cs = factory.create( |
| + CharStream cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)\\s+(cc)"), "", |
| CharReader.get( new StringReader( BLOCK ) ) ); |
| - TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, cs ); |
| + TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, cs ); |
| assertFalse(ts.incrementToken()); |
| } |
| |
| @@ -73,14 +61,9 @@
|
| // aa#bb#cc |
| public void test1block1matchSameLength() throws IOException { |
| final String BLOCK = "aa bb cc"; |
| - PatternReplaceCharFilterFactory factory = new PatternReplaceCharFilterFactory(); |
| - Map<String,String> args = new HashMap<String,String>(); |
| - args.put("pattern", "(aa)\\s+(bb)\\s+(cc)"); |
| - args.put("replacement", "$1#$2#$3"); |
| - factory.init(args); |
| - CharStream cs = factory.create( |
| + CharStream cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)\\s+(cc)"), "$1#$2#$3", |
| CharReader.get( new StringReader( BLOCK ) ) ); |
| - TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, cs ); |
| + TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, cs ); |
| assertTokenStreamContents(ts, |
| new String[] { "aa#bb#cc" }, |
| new int[] { 0 }, |
| @@ -95,7 +78,7 @@
|
| final String BLOCK = "aa bb cc dd"; |
| CharStream cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)\\s+(cc)"), "$1##$2###$3", |
| CharReader.get( new StringReader( BLOCK ) ) ); |
| - TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, cs ); |
| + TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, cs ); |
| assertTokenStreamContents(ts, |
| new String[] { "aa##bb###cc", "dd" }, |
| new int[] { 0, 9 }, |
| @@ -109,7 +92,7 @@
|
| final String BLOCK = " a a"; |
| CharStream cs = new PatternReplaceCharFilter( pattern("a"), "aa", |
| CharReader.get( new StringReader( BLOCK ) ) ); |
| - TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, cs ); |
| + TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, cs ); |
| assertTokenStreamContents(ts, |
| new String[] { "aa", "aa" }, |
| new int[] { 1, 4 }, |
| @@ -124,7 +107,7 @@
|
| final String BLOCK = "aa bb cc dd"; |
| CharStream cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)\\s+(cc)"), "$1#$2", |
| CharReader.get( new StringReader( BLOCK ) ) ); |
| - TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, cs ); |
| + TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, cs ); |
| assertTokenStreamContents(ts, |
| new String[] { "aa#bb", "dd" }, |
| new int[] { 0, 12 }, |
| @@ -139,7 +122,7 @@
|
| final String BLOCK = " aa bb cc --- aa bb aa bb cc"; |
| CharStream cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)\\s+(cc)"), "$1 $2 $3", |
| CharReader.get( new StringReader( BLOCK ) ) ); |
| - TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, cs ); |
| + TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, cs ); |
| assertTokenStreamContents(ts, |
| new String[] { "aa", "bb", "cc", "---", "aa", "bb", "aa", "bb", "cc" }, |
| new int[] { 2, 6, 9, 11, 15, 18, 21, 25, 29 }, |
| @@ -154,7 +137,7 @@
|
| final String BLOCK = " aa bb cc --- aa bb aa. bb aa bb cc"; |
| CharStream cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)"), "$1##$2", ".", |
| CharReader.get( new StringReader( BLOCK ) ) ); |
| - TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, cs ); |
| + TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, cs ); |
| assertTokenStreamContents(ts, |
| new String[] { "aa##bb", "cc", "---", "aa##bb", "aa.", "bb", "aa##bb", "cc" }, |
| new int[] { 2, 8, 11, 15, 21, 25, 28, 36 }, |
| @@ -171,7 +154,7 @@
|
| CharReader.get( new StringReader( BLOCK ) ) ); |
| cs = new PatternReplaceCharFilter( pattern("bb"), "b", ".", cs ); |
| cs = new PatternReplaceCharFilter( pattern("ccc"), "c", ".", cs ); |
| - TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, cs ); |
| + TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, cs ); |
| assertTokenStreamContents(ts, |
| new String[] { "aa", "b", "-", "c", ".", "---", "b", "aa", ".", "c", "c", "b" }, |
| new int[] { 1, 3, 6, 8, 12, 14, 18, 21, 23, 25, 29, 33 }, |
| Index: lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/PatternAnalyzer.java
|
| ===================================================================
|
| --- lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/PatternAnalyzer.java (revision 940789)
|
| +++ lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/PatternAnalyzer.java (working copy)
|
| @@ -62,8 +62,10 @@
|
| * pat.tokenStream("content", "James is running round in the woods"), |
| * "English")); |
| * </pre> |
| - * |
| + * @deprecated use the pattern-based analysis in the analysis/pattern package instead. |
| + * This analyzer will be removed in a future release (4.1) |
| */ |
| +@Deprecated |
| public final class PatternAnalyzer extends Analyzer { |
| |
| /** <code>"\\W+"</code>; Divides text at non-letters (NOT Character.isLetter(c)) */ |
| Index: lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/pattern/PatternReplaceFilter.java
|
| ===================================================================
|
| --- lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/pattern/PatternReplaceFilter.java (revision 0)
|
| +++ lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/pattern/PatternReplaceFilter.java (working copy)
|
| @@ -15,7 +15,7 @@
|
| * limitations under the License. |
| */ |
| |
| -package org.apache.solr.analysis; |
| +package org.apache.lucene.analysis.pattern; |
| |
| import org.apache.lucene.analysis.TokenFilter; |
| import org.apache.lucene.analysis.TokenStream; |
| @@ -35,7 +35,6 @@
|
| * string. |
| * </p> |
| * |
| - * @version $Id:$ |
| * @see Pattern |
| */ |
| public final class PatternReplaceFilter extends TokenFilter { |
| Index: lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/pattern/PatternTokenizer.java
|
| ===================================================================
|
| --- lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/pattern/PatternTokenizer.java (revision 0)
|
| +++ lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/pattern/PatternTokenizer.java (working copy)
|
| @@ -15,7 +15,7 @@
|
| * limitations under the License. |
| */ |
| |
| -package org.apache.solr.analysis; |
| +package org.apache.lucene.analysis.pattern; |
| |
| import java.io.IOException; |
| import java.io.Reader; |
| @@ -24,7 +24,6 @@
|
| import org.apache.lucene.analysis.Tokenizer; |
| import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; |
| import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; |
| -import org.apache.commons.io.IOUtils; |
| |
| /** |
| * This tokenizer uses regex pattern matching to construct distinct tokens |
| @@ -51,7 +50,6 @@
|
| * </p> |
| * <p>NOTE: This Tokenizer does not output tokens that are of zero length.</p> |
| * |
| - * @version $Id$ |
| * @see Pattern |
| */ |
| public final class PatternTokenizer extends Tokenizer { |
| @@ -59,7 +57,7 @@
|
| private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); |
| private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); |
| |
| - private String str; |
| + private final StringBuilder str = new StringBuilder(); |
| private int index; |
| |
| private final Pattern pattern; |
| @@ -71,7 +69,7 @@
|
| super(input); |
| this.pattern = pattern; |
| this.group = group; |
| - str = IOUtils.toString(input); |
| + fillBuffer(str, input); |
| matcher = pattern.matcher(str); |
| index = 0; |
| } |
| @@ -84,11 +82,11 @@
|
| |
| // match a specific group |
| while (matcher.find()) { |
| - final String match = matcher.group(group); |
| - if (match.length() == 0) continue; |
| - termAtt.setEmpty().append(match); |
| index = matcher.start(group); |
| - offsetAtt.setOffset(correctOffset(index), correctOffset(matcher.end(group))); |
| + final int endIndex = matcher.end(group); |
| + if (index == endIndex) continue; |
| + termAtt.setEmpty().append(str, index, endIndex); |
| + offsetAtt.setOffset(correctOffset(index), correctOffset(endIndex)); |
| return true; |
| } |
| |
| @@ -131,9 +129,19 @@
|
| @Override |
| public void reset(Reader input) throws IOException { |
| super.reset(input); |
| - str = IOUtils.toString(input); |
| + fillBuffer(str, input); |
| matcher.reset(str); |
| index = 0; |
| } |
| - |
| + |
| + // TODO: we should see if we can make this tokenizer work without reading |
| + // the entire document into RAM, perhaps with Matcher.hitEnd/requireEnd ? |
| + final char[] buffer = new char[8192]; |
| + private void fillBuffer(StringBuilder sb, Reader input) throws IOException { |
| + int len; |
| + sb.setLength(0); |
| + while ((len = input.read(buffer)) > 0) { |
| + sb.append(buffer, 0, len); |
| + } |
| + } |
| } |
| Index: lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/pattern/PatternReplaceCharFilter.java
|
| ===================================================================
|
| --- lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/pattern/PatternReplaceCharFilter.java (revision 0)
|
| +++ lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/pattern/PatternReplaceCharFilter.java (working copy)
|
| @@ -15,7 +15,7 @@
|
| * limitations under the License. |
| */ |
| |
| -package org.apache.solr.analysis; |
| +package org.apache.lucene.analysis.pattern; |
| |
| import java.io.IOException; |
| import java.util.LinkedList; |
| @@ -45,7 +45,6 @@
|
| * highlight snippet="aa1<em>23bb</em>" |
| * </p> |
| * |
| - * @version $Id$ |
| * @since Solr 1.5 |
| */ |
| public class PatternReplaceCharFilter extends BaseCharFilter { |
| Index: lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/pattern/package.html
|
| ===================================================================
|
| --- lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/pattern/package.html (revision 0)
|
| +++ lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/pattern/package.html (revision 0)
|
| @@ -0,0 +1,22 @@
|
| +<!doctype html public "-//w3c//dtd html 4.0 transitional//en"> |
| +<!-- |
| + Licensed to the Apache Software Foundation (ASF) under one or more |
| + contributor license agreements. See the NOTICE file distributed with |
| + this work for additional information regarding copyright ownership. |
| + The ASF licenses this file to You under the Apache License, Version 2.0 |
| + (the "License"); you may not use this file except in compliance with |
| + the License. You may obtain a copy of the License at |
| + |
| + http://www.apache.org/licenses/LICENSE-2.0 |
| + |
| + Unless required by applicable law or agreed to in writing, software |
| + distributed under the License is distributed on an "AS IS" BASIS, |
| + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + See the License for the specific language governing permissions and |
| + limitations under the License. |
| +--> |
| +<html><head></head> |
| +<body> |
| +Set of components for pattern-based (regex) analysis. |
| +</body> |
| +</html> |
|
|
| Property changes on: lucene\contrib\analyzers\common\src\java\org\apache\lucene\analysis\pattern\package.html
|
| ___________________________________________________________________
|
| Added: svn:eol-style
|
| + native
|
|
|