blob: a6b523820463750f5ccec4d69165d980e620ac73 [file] [log] [blame]
Index: solr/src/test/org/apache/solr/analysis/TestPatternTokenizerFactory.java
===================================================================
--- solr/src/test/org/apache/solr/analysis/TestPatternTokenizerFactory.java (revision 940789)
+++ solr/src/test/org/apache/solr/analysis/TestPatternTokenizerFactory.java (working copy)
@@ -17,120 +17,25 @@
package org.apache.solr.analysis;
-import java.io.IOException;
import java.io.StringReader;
-import java.util.ArrayList;
import java.util.HashMap;
-import java.util.List;
import java.util.Map;
-import org.apache.lucene.analysis.CharReader;
-import org.apache.lucene.analysis.CharStream;
-import org.apache.lucene.analysis.charfilter.MappingCharFilter;
-import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+/** Simple tests to ensure this factory is working. */
public class TestPatternTokenizerFactory extends BaseTokenTestCase
{
- public void testSplitting() throws Exception
- {
- String qpattern = "\\'([^\\']+)\\'"; // get stuff between "'"
- String[][] tests = {
- // group pattern input output
- { "-1", "--", "aaa--bbb--ccc", "aaa bbb ccc" },
- { "-1", ":", "aaa:bbb:ccc", "aaa bbb ccc" },
- { "-1", "\\p{Space}", "aaa bbb \t\tccc ", "aaa bbb ccc" },
- { "-1", ":", "boo:and:foo", "boo and foo" },
- { "-1", "o", "boo:and:foo", "b :and:f" },
- { "0", ":", "boo:and:foo", ": :" },
- { "0", qpattern, "aaa 'bbb' 'ccc'", "'bbb' 'ccc'" },
- { "1", qpattern, "aaa 'bbb' 'ccc'", "bbb ccc" }
- };
-
-
- Map<String,String> args = new HashMap<String, String>();
- for( String[] test : tests ) {
- args.put( PatternTokenizerFactory.GROUP, test[0] );
- args.put( PatternTokenizerFactory.PATTERN, test[1] );
+ public void testFactory() throws Exception {
+ final String INPUT = "Günther Günther is here";
- PatternTokenizerFactory tokenizer = new PatternTokenizerFactory();
- tokenizer.init( args );
-
- TokenStream stream = tokenizer.create( new StringReader( test[2] ) );
- String out = tsToString( stream );
- // System.out.println( test[2] + " ==> " + out );
-
- assertEquals("pattern: "+test[1]+" with input: "+test[2], test[3], out );
-
- // Make sure it is the same as if we called 'split'
- // test disabled, as we remove empty tokens
- /*if( "-1".equals( test[0] ) ) {
- String[] split = test[2].split( test[1] );
- stream = tokenizer.create( new StringReader( test[2] ) );
- int i=0;
- for( Token t = stream.next(); null != t; t = stream.next() )
- {
- assertEquals( "split: "+test[1] + " "+i, split[i++], new String(t.termBuffer(), 0, t.termLength()) );
- }
- }*/
- }
- }
-
- public void testOffsetCorrection() throws Exception {
- final String INPUT = "G&uuml;nther G&uuml;nther is here";
-
- // create MappingCharFilter
- MappingCharFilterFactory cfFactory = new MappingCharFilterFactory();
- List<String> mappingRules = new ArrayList<String>();
- mappingRules.add( "\"&uuml;\" => \"ü\"" );
- NormalizeCharMap normMap = new NormalizeCharMap();
- cfFactory.parseRules( mappingRules, normMap );
- CharStream charStream = new MappingCharFilter( normMap, CharReader.get( new StringReader( INPUT ) ) );
-
// create PatternTokenizer
Map<String,String> args = new HashMap<String, String>();
args.put( PatternTokenizerFactory.PATTERN, "[,;/\\s]+" );
PatternTokenizerFactory tokFactory = new PatternTokenizerFactory();
tokFactory.init( args );
- TokenStream stream = tokFactory.create( charStream );
+ TokenStream stream = tokFactory.create( new StringReader(INPUT) );
assertTokenStreamContents(stream,
- new String[] { "Günther", "Günther", "is", "here" },
- new int[] { 0, 13, 26, 29 },
- new int[] { 12, 25, 28, 33 });
-
- charStream = new MappingCharFilter( normMap, CharReader.get( new StringReader( INPUT ) ) );
- args.put( PatternTokenizerFactory.PATTERN, "Günther" );
- args.put( PatternTokenizerFactory.GROUP, "0" );
- tokFactory = new PatternTokenizerFactory();
- tokFactory.init( args );
- stream = tokFactory.create( charStream );
- assertTokenStreamContents(stream,
- new String[] { "Günther", "Günther" },
- new int[] { 0, 13 },
- new int[] { 12, 25 });
+ new String[] { "Günther", "Günther", "is", "here" });
}
-
- /**
- * TODO: rewrite tests not to use string comparison.
- * @deprecated only tests TermAttribute!
- */
- private static String tsToString(TokenStream in) throws IOException {
- StringBuilder out = new StringBuilder();
- CharTermAttribute termAtt = in.addAttribute(CharTermAttribute.class);
- // extra safety to enforce, that the state is not preserved and also
- // assign bogus values
- in.clearAttributes();
- termAtt.setEmpty().append("bogusTerm");
- while (in.incrementToken()) {
- if (out.length() > 0)
- out.append(' ');
- out.append(termAtt.toString());
- in.clearAttributes();
- termAtt.setEmpty().append("bogusTerm");
- }
-
- in.close();
- return out.toString();
- }
}
Index: solr/src/test/org/apache/solr/analysis/TestPatternReplaceCharFilterFactory.java
===================================================================
--- solr/src/test/org/apache/solr/analysis/TestPatternReplaceCharFilterFactory.java (revision 940789)
+++ solr/src/test/org/apache/solr/analysis/TestPatternReplaceCharFilterFactory.java (working copy)
@@ -21,7 +21,6 @@
import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;
-import java.util.regex.Pattern;
import org.apache.lucene.analysis.CharReader;
import org.apache.lucene.analysis.CharStream;
@@ -29,11 +28,9 @@
import org.apache.lucene.analysis.WhitespaceTokenizer;
/**
- *
- * @version $Id$
- *
+ * Simple tests to ensure this factory is working
*/
-public class TestPatternReplaceCharFilter extends BaseTokenTestCase {
+public class TestPatternReplaceCharFilterFactory extends BaseTokenTestCase {
// 1111
// 01234567890123
@@ -86,99 +83,4 @@
new int[] { 0 },
new int[] { 8 });
}
-
- // 11111
- // 012345678901234
- // aa bb cc dd
- // aa##bb###cc dd
- public void test1block1matchLonger() throws IOException {
- final String BLOCK = "aa bb cc dd";
- CharStream cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)\\s+(cc)"), "$1##$2###$3",
- CharReader.get( new StringReader( BLOCK ) ) );
- TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, cs );
- assertTokenStreamContents(ts,
- new String[] { "aa##bb###cc", "dd" },
- new int[] { 0, 9 },
- new int[] { 8, 11 });
- }
-
- // 01234567
- // a a
- // aa aa
- public void test1block2matchLonger() throws IOException {
- final String BLOCK = " a a";
- CharStream cs = new PatternReplaceCharFilter( pattern("a"), "aa",
- CharReader.get( new StringReader( BLOCK ) ) );
- TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, cs );
- assertTokenStreamContents(ts,
- new String[] { "aa", "aa" },
- new int[] { 1, 4 },
- new int[] { 2, 5 });
- }
-
- // 11111
- // 012345678901234
- // aa bb cc dd
- // aa#bb dd
- public void test1block1matchShorter() throws IOException {
- final String BLOCK = "aa bb cc dd";
- CharStream cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)\\s+(cc)"), "$1#$2",
- CharReader.get( new StringReader( BLOCK ) ) );
- TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, cs );
- assertTokenStreamContents(ts,
- new String[] { "aa#bb", "dd" },
- new int[] { 0, 12 },
- new int[] { 11, 14 });
- }
-
- // 111111111122222222223333
- // 0123456789012345678901234567890123
- // aa bb cc --- aa bb aa bb cc
- // aa bb cc --- aa bb aa bb cc
- public void test1blockMultiMatches() throws IOException {
- final String BLOCK = " aa bb cc --- aa bb aa bb cc";
- CharStream cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)\\s+(cc)"), "$1 $2 $3",
- CharReader.get( new StringReader( BLOCK ) ) );
- TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, cs );
- assertTokenStreamContents(ts,
- new String[] { "aa", "bb", "cc", "---", "aa", "bb", "aa", "bb", "cc" },
- new int[] { 2, 6, 9, 11, 15, 18, 21, 25, 29 },
- new int[] { 4, 8, 10, 14, 17, 20, 23, 27, 33 });
- }
-
- // 11111111112222222222333333333
- // 012345678901234567890123456789012345678
- // aa bb cc --- aa bb aa. bb aa bb cc
- // aa##bb cc --- aa##bb aa. bb aa##bb cc
- public void test2blocksMultiMatches() throws IOException {
- final String BLOCK = " aa bb cc --- aa bb aa. bb aa bb cc";
- CharStream cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)"), "$1##$2", ".",
- CharReader.get( new StringReader( BLOCK ) ) );
- TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, cs );
- assertTokenStreamContents(ts,
- new String[] { "aa##bb", "cc", "---", "aa##bb", "aa.", "bb", "aa##bb", "cc" },
- new int[] { 2, 8, 11, 15, 21, 25, 28, 36 },
- new int[] { 7, 10, 14, 20, 24, 27, 35, 38 });
- }
-
- // 11111111112222222222333333333
- // 012345678901234567890123456789012345678
- // a bb - ccc . --- bb a . ccc ccc bb
- // aa b - c . --- b aa . c c b
- public void testChain() throws IOException {
- final String BLOCK = " a bb - ccc . --- bb a . ccc ccc bb";
- CharStream cs = new PatternReplaceCharFilter( pattern("a"), "aa", ".",
- CharReader.get( new StringReader( BLOCK ) ) );
- cs = new PatternReplaceCharFilter( pattern("bb"), "b", ".", cs );
- cs = new PatternReplaceCharFilter( pattern("ccc"), "c", ".", cs );
- TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, cs );
- assertTokenStreamContents(ts,
- new String[] { "aa", "b", "-", "c", ".", "---", "b", "aa", ".", "c", "c", "b" },
- new int[] { 1, 3, 6, 8, 12, 14, 18, 21, 23, 25, 29, 33 },
- new int[] { 2, 5, 7, 11, 13, 17, 20, 22, 24, 28, 32, 35 });
- }
-
- private Pattern pattern( String p ){
- return Pattern.compile( p );
- }
}
Index: solr/src/test/org/apache/solr/analysis/TestPatternReplaceFilterFactory.java
===================================================================
--- solr/src/test/org/apache/solr/analysis/TestPatternReplaceFilterFactory.java (revision 940789)
+++ solr/src/test/org/apache/solr/analysis/TestPatternReplaceFilterFactory.java (working copy)
@@ -21,61 +21,25 @@
import org.apache.lucene.analysis.WhitespaceTokenizer;
import java.io.StringReader;
-import java.util.regex.Pattern;
+import java.util.HashMap;
+import java.util.Map;
/**
- * @version $Id:$
+ * Simple tests to ensure this factory is working
*/
-public class TestPatternReplaceFilter extends BaseTokenTestCase {
+public class TestPatternReplaceFilterFactory extends BaseTokenTestCase {
public void testReplaceAll() throws Exception {
String input = "aabfooaabfooabfoob ab caaaaaaaaab";
- TokenStream ts = new PatternReplaceFilter
- (new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)),
- Pattern.compile("a*b"),
- "-", true);
+ PatternReplaceFilterFactory factory = new PatternReplaceFilterFactory();
+ Map<String,String> args = new HashMap<String,String>();
+ args.put("pattern", "a*b");
+ args.put("replacement", "-");
+ factory.init(args);
+ TokenStream ts = factory.create
+ (new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)));
+
assertTokenStreamContents(ts,
new String[] { "-foo-foo-foo-", "-", "c-" });
}
-
- public void testReplaceFirst() throws Exception {
- String input = "aabfooaabfooabfoob ab caaaaaaaaab";
- TokenStream ts = new PatternReplaceFilter
- (new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)),
- Pattern.compile("a*b"),
- "-", false);
- assertTokenStreamContents(ts,
- new String[] { "-fooaabfooabfoob", "-", "c-" });
- }
-
- public void testStripFirst() throws Exception {
- String input = "aabfooaabfooabfoob ab caaaaaaaaab";
- TokenStream ts = new PatternReplaceFilter
- (new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)),
- Pattern.compile("a*b"),
- null, false);
- assertTokenStreamContents(ts,
- new String[] { "fooaabfooabfoob", "", "c" });
- }
-
- public void testStripAll() throws Exception {
- String input = "aabfooaabfooabfoob ab caaaaaaaaab";
- TokenStream ts = new PatternReplaceFilter
- (new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)),
- Pattern.compile("a*b"),
- null, true);
- assertTokenStreamContents(ts,
- new String[] { "foofoofoo", "", "c" });
- }
-
- public void testReplaceAllWithBackRef() throws Exception {
- String input = "aabfooaabfooabfoob ab caaaaaaaaab";
- TokenStream ts = new PatternReplaceFilter
- (new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)),
- Pattern.compile("(a*)b"),
- "$1\\$", true);
- assertTokenStreamContents(ts,
- new String[] { "aa$fooaa$fooa$foo$", "a$", "caaaaaaaaa$" });
- }
-
}
Index: solr/src/test/org/apache/solr/analysis/TestPatternReplaceCharFilter.java (deleted)
===================================================================
Index: solr/src/test/org/apache/solr/analysis/TestPatternReplaceFilter.java (deleted)
===================================================================
Index: solr/src/java/org/apache/solr/analysis/PatternTokenizerFactory.java
===================================================================
--- solr/src/java/org/apache/solr/analysis/PatternTokenizerFactory.java (revision 940789)
+++ solr/src/java/org/apache/solr/analysis/PatternTokenizerFactory.java (working copy)
@@ -27,6 +27,7 @@
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.pattern.PatternTokenizer;
import org.apache.solr.common.SolrException;
Index: solr/src/java/org/apache/solr/analysis/PatternReplaceFilterFactory.java
===================================================================
--- solr/src/java/org/apache/solr/analysis/PatternReplaceFilterFactory.java (revision 940789)
+++ solr/src/java/org/apache/solr/analysis/PatternReplaceFilterFactory.java (working copy)
@@ -17,6 +17,7 @@
package org.apache.solr.analysis;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.pattern.PatternReplaceFilter;
import java.util.Map;
import java.util.regex.Pattern;
Index: solr/src/java/org/apache/solr/analysis/PatternReplaceCharFilter.java (deleted)
===================================================================
Index: solr/src/java/org/apache/solr/analysis/PatternReplaceFilter.java (deleted)
===================================================================
Index: solr/src/java/org/apache/solr/analysis/PatternTokenizer.java (deleted)
===================================================================
Index: solr/src/java/org/apache/solr/analysis/PatternReplaceCharFilterFactory.java
===================================================================
--- solr/src/java/org/apache/solr/analysis/PatternReplaceCharFilterFactory.java (revision 940789)
+++ solr/src/java/org/apache/solr/analysis/PatternReplaceCharFilterFactory.java (working copy)
@@ -22,6 +22,7 @@
import java.util.regex.PatternSyntaxException;
import org.apache.lucene.analysis.CharStream;
+import org.apache.lucene.analysis.pattern.PatternReplaceCharFilter;
/**
*
Index: lucene/contrib/CHANGES.txt
===================================================================
--- lucene/contrib/CHANGES.txt (revision 940789)
+++ lucene/contrib/CHANGES.txt (working copy)
@@ -92,6 +92,9 @@
stemming. Add Turkish and Romanian stopwords lists to support this.
(Robert Muir, Uwe Schindler, Simon Willnauer)
+ * LUCENE-2413: Deprecated PatternAnalyzer in contrib/analyzers, in favor of the
+ pattern package (CharFilter, Tokenizer, TokenFilter). (Robert Muir)
+
New features
* LUCENE-2306: Add NumericRangeFilter and NumericRangeQuery support to XMLQueryParser.
@@ -165,6 +168,8 @@
into subwords and performs optional transformations on subword groups.
- o.a.l.analysis.miscellaneous.RemoveDuplicatesTokenFilter: TokenFilter which
filters out Tokens at the same position and Term text as the previous token.
+ - o.a.l.analysis.pattern: Package for pattern-based analysis, containing a
+ CharFilter, Tokenizer, and TokenFilter for transforming text with regexes.
(... in progress)
Build
Index: lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/pattern/TestPatternReplaceFilter.java
===================================================================
--- lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/pattern/TestPatternReplaceFilter.java (revision 0)
+++ lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/pattern/TestPatternReplaceFilter.java (working copy)
@@ -15,8 +15,9 @@
* limitations under the License.
*/
-package org.apache.solr.analysis;
+package org.apache.lucene.analysis.pattern;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
@@ -26,12 +27,12 @@
/**
* @version $Id:$
*/
-public class TestPatternReplaceFilter extends BaseTokenTestCase {
+public class TestPatternReplaceFilter extends BaseTokenStreamTestCase {
public void testReplaceAll() throws Exception {
String input = "aabfooaabfooabfoob ab caaaaaaaaab";
TokenStream ts = new PatternReplaceFilter
- (new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)),
+ (new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input)),
Pattern.compile("a*b"),
"-", true);
assertTokenStreamContents(ts,
@@ -41,7 +42,7 @@
public void testReplaceFirst() throws Exception {
String input = "aabfooaabfooabfoob ab caaaaaaaaab";
TokenStream ts = new PatternReplaceFilter
- (new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)),
+ (new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input)),
Pattern.compile("a*b"),
"-", false);
assertTokenStreamContents(ts,
@@ -51,7 +52,7 @@
public void testStripFirst() throws Exception {
String input = "aabfooaabfooabfoob ab caaaaaaaaab";
TokenStream ts = new PatternReplaceFilter
- (new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)),
+ (new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input)),
Pattern.compile("a*b"),
null, false);
assertTokenStreamContents(ts,
@@ -61,7 +62,7 @@
public void testStripAll() throws Exception {
String input = "aabfooaabfooabfoob ab caaaaaaaaab";
TokenStream ts = new PatternReplaceFilter
- (new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)),
+ (new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input)),
Pattern.compile("a*b"),
null, true);
assertTokenStreamContents(ts,
@@ -71,7 +72,7 @@
public void testReplaceAllWithBackRef() throws Exception {
String input = "aabfooaabfooabfoob ab caaaaaaaaab";
TokenStream ts = new PatternReplaceFilter
- (new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)),
+ (new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input)),
Pattern.compile("(a*)b"),
"$1\\$", true);
assertTokenStreamContents(ts,
Index: lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/pattern/TestPatternTokenizer.java
===================================================================
--- lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/pattern/TestPatternTokenizer.java (revision 0)
+++ lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/pattern/TestPatternTokenizer.java (working copy)
@@ -15,15 +15,15 @@
* limitations under the License.
*/
-package org.apache.solr.analysis;
+package org.apache.lucene.analysis.pattern;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
-import java.util.HashMap;
import java.util.List;
-import java.util.Map;
+import java.util.regex.Pattern;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CharReader;
import org.apache.lucene.analysis.CharStream;
import org.apache.lucene.analysis.charfilter.MappingCharFilter;
@@ -31,7 +31,7 @@
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-public class TestPatternTokenizerFactory extends BaseTokenTestCase
+public class TestPatternTokenizer extends BaseTokenStreamTestCase
{
public void testSplitting() throws Exception
{
@@ -48,16 +48,8 @@
{ "1", qpattern, "aaa 'bbb' 'ccc'", "bbb ccc" }
};
-
- Map<String,String> args = new HashMap<String, String>();
- for( String[] test : tests ) {
- args.put( PatternTokenizerFactory.GROUP, test[0] );
- args.put( PatternTokenizerFactory.PATTERN, test[1] );
-
- PatternTokenizerFactory tokenizer = new PatternTokenizerFactory();
- tokenizer.init( args );
-
- TokenStream stream = tokenizer.create( new StringReader( test[2] ) );
+ for( String[] test : tests ) {
+ TokenStream stream = new PatternTokenizer(new StringReader(test[2]), Pattern.compile(test[1]), Integer.parseInt(test[0]));
String out = tsToString( stream );
// System.out.println( test[2] + " ==> " + out );
@@ -81,30 +73,21 @@
final String INPUT = "G&uuml;nther G&uuml;nther is here";
// create MappingCharFilter
- MappingCharFilterFactory cfFactory = new MappingCharFilterFactory();
List<String> mappingRules = new ArrayList<String>();
mappingRules.add( "\"&uuml;\" => \"ü\"" );
NormalizeCharMap normMap = new NormalizeCharMap();
- cfFactory.parseRules( mappingRules, normMap );
+ normMap.add("&uuml;", "ü");
CharStream charStream = new MappingCharFilter( normMap, CharReader.get( new StringReader( INPUT ) ) );
// create PatternTokenizer
- Map<String,String> args = new HashMap<String, String>();
- args.put( PatternTokenizerFactory.PATTERN, "[,;/\\s]+" );
- PatternTokenizerFactory tokFactory = new PatternTokenizerFactory();
- tokFactory.init( args );
- TokenStream stream = tokFactory.create( charStream );
+ TokenStream stream = new PatternTokenizer(charStream, Pattern.compile("[,;/\\s]+"), -1);
assertTokenStreamContents(stream,
new String[] { "Günther", "Günther", "is", "here" },
new int[] { 0, 13, 26, 29 },
new int[] { 12, 25, 28, 33 });
charStream = new MappingCharFilter( normMap, CharReader.get( new StringReader( INPUT ) ) );
- args.put( PatternTokenizerFactory.PATTERN, "Günther" );
- args.put( PatternTokenizerFactory.GROUP, "0" );
- tokFactory = new PatternTokenizerFactory();
- tokFactory.init( args );
- stream = tokFactory.create( charStream );
+ stream = new PatternTokenizer(charStream, Pattern.compile("Günther"), 0);
assertTokenStreamContents(stream,
new String[] { "Günther", "Günther" },
new int[] { 0, 13 },
Index: lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/pattern/TestPatternReplaceCharFilter.java
===================================================================
--- lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/pattern/TestPatternReplaceCharFilter.java (revision 0)
+++ lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/pattern/TestPatternReplaceCharFilter.java (working copy)
@@ -15,39 +15,31 @@
* limitations under the License.
*/
-package org.apache.solr.analysis;
+package org.apache.lucene.analysis.pattern;
import java.io.IOException;
import java.io.StringReader;
-import java.util.HashMap;
-import java.util.Map;
import java.util.regex.Pattern;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CharReader;
import org.apache.lucene.analysis.CharStream;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
/**
- *
- * @version $Id$
- *
+ * Tests {@link PatternReplaceCharFilter}
*/
-public class TestPatternReplaceCharFilter extends BaseTokenTestCase {
+public class TestPatternReplaceCharFilter extends BaseTokenStreamTestCase {
// 1111
// 01234567890123
// this is test.
public void testNothingChange() throws IOException {
final String BLOCK = "this is test.";
- PatternReplaceCharFilterFactory factory = new PatternReplaceCharFilterFactory();
- Map<String,String> args = new HashMap<String,String>();
- args.put("pattern", "(aa)\\s+(bb)\\s+(cc)");
- args.put("replacement", "$1$2$3");
- factory.init(args);
- CharStream cs = factory.create(
+ CharStream cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)\\s+(cc)"), "$1$2$3",
CharReader.get( new StringReader( BLOCK ) ) );
- TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, cs );
+ TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, cs );
assertTokenStreamContents(ts,
new String[] { "this", "is", "test." },
new int[] { 0, 5, 8 },
@@ -58,13 +50,9 @@
// aa bb cc
public void testReplaceByEmpty() throws IOException {
final String BLOCK = "aa bb cc";
- PatternReplaceCharFilterFactory factory = new PatternReplaceCharFilterFactory();
- Map<String,String> args = new HashMap<String,String>();
- args.put("pattern", "(aa)\\s+(bb)\\s+(cc)");
- factory.init(args);
- CharStream cs = factory.create(
+ CharStream cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)\\s+(cc)"), "",
CharReader.get( new StringReader( BLOCK ) ) );
- TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, cs );
+ TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, cs );
assertFalse(ts.incrementToken());
}
@@ -73,14 +61,9 @@
// aa#bb#cc
public void test1block1matchSameLength() throws IOException {
final String BLOCK = "aa bb cc";
- PatternReplaceCharFilterFactory factory = new PatternReplaceCharFilterFactory();
- Map<String,String> args = new HashMap<String,String>();
- args.put("pattern", "(aa)\\s+(bb)\\s+(cc)");
- args.put("replacement", "$1#$2#$3");
- factory.init(args);
- CharStream cs = factory.create(
+ CharStream cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)\\s+(cc)"), "$1#$2#$3",
CharReader.get( new StringReader( BLOCK ) ) );
- TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, cs );
+ TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, cs );
assertTokenStreamContents(ts,
new String[] { "aa#bb#cc" },
new int[] { 0 },
@@ -95,7 +78,7 @@
final String BLOCK = "aa bb cc dd";
CharStream cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)\\s+(cc)"), "$1##$2###$3",
CharReader.get( new StringReader( BLOCK ) ) );
- TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, cs );
+ TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, cs );
assertTokenStreamContents(ts,
new String[] { "aa##bb###cc", "dd" },
new int[] { 0, 9 },
@@ -109,7 +92,7 @@
final String BLOCK = " a a";
CharStream cs = new PatternReplaceCharFilter( pattern("a"), "aa",
CharReader.get( new StringReader( BLOCK ) ) );
- TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, cs );
+ TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, cs );
assertTokenStreamContents(ts,
new String[] { "aa", "aa" },
new int[] { 1, 4 },
@@ -124,7 +107,7 @@
final String BLOCK = "aa bb cc dd";
CharStream cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)\\s+(cc)"), "$1#$2",
CharReader.get( new StringReader( BLOCK ) ) );
- TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, cs );
+ TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, cs );
assertTokenStreamContents(ts,
new String[] { "aa#bb", "dd" },
new int[] { 0, 12 },
@@ -139,7 +122,7 @@
final String BLOCK = " aa bb cc --- aa bb aa bb cc";
CharStream cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)\\s+(cc)"), "$1 $2 $3",
CharReader.get( new StringReader( BLOCK ) ) );
- TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, cs );
+ TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, cs );
assertTokenStreamContents(ts,
new String[] { "aa", "bb", "cc", "---", "aa", "bb", "aa", "bb", "cc" },
new int[] { 2, 6, 9, 11, 15, 18, 21, 25, 29 },
@@ -154,7 +137,7 @@
final String BLOCK = " aa bb cc --- aa bb aa. bb aa bb cc";
CharStream cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)"), "$1##$2", ".",
CharReader.get( new StringReader( BLOCK ) ) );
- TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, cs );
+ TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, cs );
assertTokenStreamContents(ts,
new String[] { "aa##bb", "cc", "---", "aa##bb", "aa.", "bb", "aa##bb", "cc" },
new int[] { 2, 8, 11, 15, 21, 25, 28, 36 },
@@ -171,7 +154,7 @@
CharReader.get( new StringReader( BLOCK ) ) );
cs = new PatternReplaceCharFilter( pattern("bb"), "b", ".", cs );
cs = new PatternReplaceCharFilter( pattern("ccc"), "c", ".", cs );
- TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, cs );
+ TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, cs );
assertTokenStreamContents(ts,
new String[] { "aa", "b", "-", "c", ".", "---", "b", "aa", ".", "c", "c", "b" },
new int[] { 1, 3, 6, 8, 12, 14, 18, 21, 23, 25, 29, 33 },
Index: lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/PatternAnalyzer.java
===================================================================
--- lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/PatternAnalyzer.java (revision 940789)
+++ lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/PatternAnalyzer.java (working copy)
@@ -62,8 +62,10 @@
* pat.tokenStream("content", "James is running round in the woods"),
* "English"));
* </pre>
- *
+ * @deprecated use the pattern-based analysis in the analysis/pattern package instead.
+ * This analyzer will be removed in a future release (4.1)
*/
+@Deprecated
public final class PatternAnalyzer extends Analyzer {
/** <code>"\\W+"</code>; Divides text at non-letters (NOT Character.isLetter(c)) */
Index: lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/pattern/PatternReplaceFilter.java
===================================================================
--- lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/pattern/PatternReplaceFilter.java (revision 0)
+++ lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/pattern/PatternReplaceFilter.java (working copy)
@@ -15,7 +15,7 @@
* limitations under the License.
*/
-package org.apache.solr.analysis;
+package org.apache.lucene.analysis.pattern;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
@@ -35,7 +35,6 @@
* string.
* </p>
*
- * @version $Id:$
* @see Pattern
*/
public final class PatternReplaceFilter extends TokenFilter {
Index: lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/pattern/PatternTokenizer.java
===================================================================
--- lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/pattern/PatternTokenizer.java (revision 0)
+++ lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/pattern/PatternTokenizer.java (working copy)
@@ -15,7 +15,7 @@
* limitations under the License.
*/
-package org.apache.solr.analysis;
+package org.apache.lucene.analysis.pattern;
import java.io.IOException;
import java.io.Reader;
@@ -24,7 +24,6 @@
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-import org.apache.commons.io.IOUtils;
/**
* This tokenizer uses regex pattern matching to construct distinct tokens
@@ -51,7 +50,6 @@
* </p>
* <p>NOTE: This Tokenizer does not output tokens that are of zero length.</p>
*
- * @version $Id$
* @see Pattern
*/
public final class PatternTokenizer extends Tokenizer {
@@ -59,7 +57,7 @@
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
- private String str;
+ private final StringBuilder str = new StringBuilder();
private int index;
private final Pattern pattern;
@@ -71,7 +69,7 @@
super(input);
this.pattern = pattern;
this.group = group;
- str = IOUtils.toString(input);
+ fillBuffer(str, input);
matcher = pattern.matcher(str);
index = 0;
}
@@ -84,11 +82,11 @@
// match a specific group
while (matcher.find()) {
- final String match = matcher.group(group);
- if (match.length() == 0) continue;
- termAtt.setEmpty().append(match);
index = matcher.start(group);
- offsetAtt.setOffset(correctOffset(index), correctOffset(matcher.end(group)));
+ final int endIndex = matcher.end(group);
+ if (index == endIndex) continue;
+ termAtt.setEmpty().append(str, index, endIndex);
+ offsetAtt.setOffset(correctOffset(index), correctOffset(endIndex));
return true;
}
@@ -131,9 +129,19 @@
@Override
public void reset(Reader input) throws IOException {
super.reset(input);
- str = IOUtils.toString(input);
+ fillBuffer(str, input);
matcher.reset(str);
index = 0;
}
-
+
+ // TODO: we should see if we can make this tokenizer work without reading
+ // the entire document into RAM, perhaps with Matcher.hitEnd/requireEnd ?
+ final char[] buffer = new char[8192];
+ private void fillBuffer(StringBuilder sb, Reader input) throws IOException {
+ int len;
+ sb.setLength(0);
+ while ((len = input.read(buffer)) > 0) {
+ sb.append(buffer, 0, len);
+ }
+ }
}
Index: lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/pattern/PatternReplaceCharFilter.java
===================================================================
--- lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/pattern/PatternReplaceCharFilter.java (revision 0)
+++ lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/pattern/PatternReplaceCharFilter.java (working copy)
@@ -15,7 +15,7 @@
* limitations under the License.
*/
-package org.apache.solr.analysis;
+package org.apache.lucene.analysis.pattern;
import java.io.IOException;
import java.util.LinkedList;
@@ -45,7 +45,6 @@
* highlight snippet="aa1&lt;em&gt;23bb&lt;/em&gt;"
* </p>
*
- * @version $Id$
* @since Solr 1.5
*/
public class PatternReplaceCharFilter extends BaseCharFilter {
Index: lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/pattern/package.html
===================================================================
--- lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/pattern/package.html (revision 0)
+++ lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/pattern/package.html (revision 0)
@@ -0,0 +1,22 @@
+<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<html><head></head>
+<body>
+Set of components for pattern-based (regex) analysis.
+</body>
+</html>
Property changes on: lucene\contrib\analyzers\common\src\java\org\apache\lucene\analysis\pattern\package.html
___________________________________________________________________
Added: svn:eol-style
+ native