blob: f2bbe052f3533f2f0afa2b8eb01ba7b66ee6b8c3 [file] [log] [blame]
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/custom/CustomAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/custom/CustomAnalyzer.java
index a697cce..c04f5a8 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/custom/CustomAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/custom/CustomAnalyzer.java
@@ -146,6 +146,12 @@ public final class CustomAnalyzer extends Analyzer {
@Override
protected TokenStream normalize(String fieldName, TokenStream in) {
TokenStream result = in;
+ // A tokenizer that does its own normalization can return a token filter from
+ // getMultiTermComponent(); apply it before the configured token filters, although this is admittedly an abstraction violation.
+ if (tokenizer instanceof MultiTermAwareComponent) {
+ TokenFilterFactory filter = (TokenFilterFactory) ((MultiTermAwareComponent) tokenizer).getMultiTermComponent();
+ result = filter.create(result);
+ }
for (TokenFilterFactory filter : tokenFilters) {
if (filter instanceof MultiTermAwareComponent) {
filter = (TokenFilterFactory) ((MultiTermAwareComponent) filter).getMultiTermComponent();
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/custom/TestCustomAnalyzer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/custom/TestCustomAnalyzer.java
index d9ea43c..8614ee1 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/custom/TestCustomAnalyzer.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/custom/TestCustomAnalyzer.java
@@ -31,9 +31,9 @@ import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.charfilter.HTMLStripCharFilterFactory;
import org.apache.lucene.analysis.charfilter.MappingCharFilterFactory;
-import org.apache.lucene.analysis.core.KeywordTokenizerFactory;
import org.apache.lucene.analysis.core.LowerCaseFilterFactory;
import org.apache.lucene.analysis.core.LowerCaseTokenizer;
+import org.apache.lucene.analysis.core.LowerCaseTokenizerFactory;
import org.apache.lucene.analysis.core.StopFilterFactory;
import org.apache.lucene.analysis.core.WhitespaceTokenizerFactory;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilterFactory;
@@ -431,7 +431,7 @@ public class TestCustomAnalyzer extends BaseTokenStreamTestCase {
@Override
public AbstractAnalysisFactory getMultiTermComponent() {
- return new KeywordTokenizerFactory(getOriginalArgs());
+ return new DummyTokenFilterFactory(Collections.emptyMap());
}
}
@@ -499,5 +499,13 @@ public class TestCustomAnalyzer extends BaseTokenStreamTestCase {
.build();
assertEquals(new BytesRef("e f c"), analyzer.normalize("dummy", "a b c"));
}
+
+ /** Tests normalize() when the TokenizerFactory returns a token filter to normalize the text. */
+ public void testNormalizationWithLowerCaseTokenizer() throws IOException {
+ CustomAnalyzer analyzer1 = CustomAnalyzer.builder()
+ .withTokenizer(LowerCaseTokenizerFactory.class, Collections.emptyMap())
+ .build();
+ assertEquals(new BytesRef("abc"), analyzer1.normalize("dummy", "ABC"));
+ }
}