| Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeywordRepeatFilter.java |
| =================================================================== |
| --- lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeywordRepeatFilter.java (revision 0) |
| +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeywordRepeatFilter.java (revision 0) |
| @@ -0,0 +1,46 @@ |
| +package org.apache.lucene.analysis.miscellaneous; |
| + |
| +/* |
| + * Licensed to the Apache Software Foundation (ASF) under one or more |
| + * contributor license agreements. See the NOTICE file distributed with |
| + * this work for additional information regarding copyright ownership. |
| + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| + * (the "License"); you may not use this file except in compliance with |
| + * the License. You may obtain a copy of the License at |
| + * |
| + * http://www.apache.org/licenses/LICENSE-2.0 |
| + * |
| + * Unless required by applicable law or agreed to in writing, software |
| + * distributed under the License is distributed on an "AS IS" BASIS, |
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + * See the License for the specific language governing permissions and |
| + * limitations under the License. |
| + */ |
| + |
| +import org.apache.lucene.analysis.BaseTokenStreamTestCase; |
| +import org.apache.lucene.analysis.MockTokenizer; |
| +import org.apache.lucene.analysis.TokenStream; |
| +import org.apache.lucene.analysis.snowball.SnowballFilter; |
| +import org.tartarus.snowball.SnowballProgram; |
| + |
| +import java.io.IOException; |
| +import java.io.StringReader; |
| + |
| +public class TestKeywordRepeatFilter extends BaseTokenStreamTestCase { |
| + /** Checks tokenizer -> KeywordRepeatFilter -> SnowballFilter("English") -> RemoveDuplicatesTokenFilter: each original token comes first, and a differing stem follows at position increment 0. */ |
| + public void testBasic() throws IOException { |
| + TokenStream ts = new RemoveDuplicatesTokenFilter(new SnowballFilter(new KeywordRepeatFilter( |
| + new MockTokenizer(new StringReader("the birds are flying"), MockTokenizer.WHITESPACE, false)), "English")); |
| + assertTokenStreamContents(ts, new String[] { "the", "birds", "bird", "are", "flying", "fli"}, new int[] {1,1,0,1,1,0}); |
| + } |
| + |
| + /** Checks that stacking two KeywordRepeatFilters yields the same expected tokens and position increments as testBasic once duplicates are removed. */ |
| + public void testComposition() throws IOException { |
| + TokenStream ts = new RemoveDuplicatesTokenFilter(new SnowballFilter(new KeywordRepeatFilter(new KeywordRepeatFilter( |
| + new MockTokenizer(new StringReader("the birds are flying"), MockTokenizer.WHITESPACE, false))), "English")); |
| + assertTokenStreamContents(ts, new String[] { "the", "birds", "bird", "are", "flying", "fli"}, new int[] {1,1,0,1,1,0}); |
| + } |
| + |
| + |
| + |
| +} |
| |
| Property changes on: lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeywordRepeatFilter.java |
| ___________________________________________________________________ |
| Added: svn:keywords |
| + Date Author Id Revision HeadURL |
| Added: svn:eol-style |
| + native |
| |
| Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeywordRepeatFilterFactory.java |
| =================================================================== |
| --- lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeywordRepeatFilterFactory.java (revision 0) |
| +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeywordRepeatFilterFactory.java (revision 0) |
| @@ -0,0 +1,38 @@ |
| +package org.apache.lucene.analysis.miscellaneous; |
| + |
| +/* |
| + * Licensed to the Apache Software Foundation (ASF) under one or more |
| + * contributor license agreements. See the NOTICE file distributed with |
| + * this work for additional information regarding copyright ownership. |
| + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| + * (the "License"); you may not use this file except in compliance with |
| + * the License. You may obtain a copy of the License at |
| + * |
| + * http://www.apache.org/licenses/LICENSE-2.0 |
| + * |
| + * Unless required by applicable law or agreed to in writing, software |
| + * distributed under the License is distributed on an "AS IS" BASIS, |
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + * See the License for the specific language governing permissions and |
| + * limitations under the License. |
| + */ |
| + |
| +import org.apache.lucene.analysis.TokenStream; |
| +import org.apache.lucene.analysis.util.TokenFilterFactory; |
| + |
| +/** |
| + * Factory for {@link KeywordRepeatFilter}. Example configuration: |
| + * <pre class="prettyprint" > |
| + * &lt;fieldType name="text_keyword" class="solr.TextField" positionIncrementGap="100"&gt; |
| + * &lt;analyzer&gt; |
| + * &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt; |
| + * &lt;filter class="solr.KeywordRepeatFilterFactory"/&gt; |
| + * &lt;/analyzer&gt; |
| + * &lt;/fieldType&gt;</pre> |
| + */ |
| +public final class KeywordRepeatFilterFactory extends TokenFilterFactory { |
| + @Override |
| + public TokenStream create(TokenStream input) { |
| + return new KeywordRepeatFilter(input); /* wraps the given stream in a new KeywordRepeatFilter */ |
| + } |
| +} |
| |
| Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeywordRepeatFilterFactory.java |
| ___________________________________________________________________ |
| Added: svn:keywords |
| + Date Author Id Revision HeadURL |
| Added: svn:eol-style |
| + native |
| |
| Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeywordRepeatFilter.java |
| =================================================================== |
| --- lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeywordRepeatFilter.java (revision 0) |
| +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeywordRepeatFilter.java (revision 0) |
| @@ -0,0 +1,68 @@ |
| +package org.apache.lucene.analysis.miscellaneous; |
| + |
| +/* |
| + * Licensed to the Apache Software Foundation (ASF) under one or more |
| + * contributor license agreements. See the NOTICE file distributed with |
| + * this work for additional information regarding copyright ownership. |
| + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| + * (the "License"); you may not use this file except in compliance with |
| + * the License. You may obtain a copy of the License at |
| + * |
| + * http://www.apache.org/licenses/LICENSE-2.0 |
| + * |
| + * Unless required by applicable law or agreed to in writing, software |
| + * distributed under the License is distributed on an "AS IS" BASIS, |
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + * See the License for the specific language governing permissions and |
| + * limitations under the License. |
| + */ |
| + |
| +import org.apache.lucene.analysis.TokenFilter; |
| +import org.apache.lucene.analysis.TokenStream; |
| +import org.apache.lucene.analysis.tokenattributes.KeywordAttribute; |
| +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; |
| + |
| +import java.io.IOException; |
| + |
| + |
| +/** |
| + * This TokenFilter emits each incoming token twice: once as a keyword and once as a non-keyword, in other words once with |
| + * {@link KeywordAttribute#setKeyword(boolean)} set to <code>true</code> and once set to <code>false</code>. |
| + * This is useful if used with a stem filter that respects the {@link KeywordAttribute} to index the stemmed and the |
| + * un-stemmed version of a term into the same field. The keyword copy is emitted first; the non-keyword copy follows with a position increment of 0. |
| + */ |
| +public final class KeywordRepeatFilter extends TokenFilter { |
| + |
| + private final KeywordAttribute keywordAttribute = addAttribute(KeywordAttribute.class); |
| + private final PositionIncrementAttribute posIncAttr = addAttribute(PositionIncrementAttribute.class); |
| + private State state; /* captured input token still awaiting its non-keyword repeat; null when nothing is pending */ |
| + /** |
| + * Construct a token stream filtering the given input. |
| + */ |
| + public KeywordRepeatFilter(TokenStream input) { |
| + super(input); |
| + } |
| + /** Emits the pending non-keyword repeat if one was captured; otherwise pulls the next input token and emits it as a keyword. Returns false once the input is exhausted. */ |
| + @Override |
| + public boolean incrementToken() throws IOException { |
| + if (state != null) { |
| + restoreState(state); /* replay the attributes captured on the previous call */ |
| + posIncAttr.setPositionIncrement(0); /* stack the repeat on the same position as the keyword copy */ |
| + keywordAttribute.setKeyword(false); |
| + state = null; /* the repeat has been emitted; the next call reads from the input again */ |
| + return true; |
| + } |
| + if (input.incrementToken()) { |
| + state = captureState(); /* snapshot taken before the keyword flag is forced to true below */ |
| + keywordAttribute.setKeyword(true); |
| + return true; |
| + } |
| + return false; |
| + } |
| + /** Resets the wrapped stream via super.reset() and drops any pending repeat. */ |
| + @Override |
| + public void reset() throws IOException { |
| + super.reset(); |
| + state = null; |
| + } |
| +} |
| |
| Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeywordRepeatFilter.java |
| ___________________________________________________________________ |
| Added: svn:keywords |
| + Date Author Id Revision HeadURL |
| Added: svn:eol-style |
| + native |
| |
| Index: lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory |
| =================================================================== |
| --- lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory (revision 1454297) |
| +++ lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory (working copy) |
| @@ -58,6 +58,7 @@ |
| org.apache.lucene.analysis.miscellaneous.HyphenatedWordsFilterFactory |
| org.apache.lucene.analysis.miscellaneous.KeepWordFilterFactory |
| org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilterFactory |
| +org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilterFactory |
| org.apache.lucene.analysis.miscellaneous.LengthFilterFactory |
| org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilterFactory |
| org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilterFactory |
| Index: lucene/CHANGES.txt |
| =================================================================== |
| --- lucene/CHANGES.txt (revision 1454297) |
| +++ lucene/CHANGES.txt (working copy) |
| @@ -32,6 +32,10 @@ |
| * LUCENE-4815: DrillSideways now allows more than one FacetRequest per |
| dimension (Mike McCandless) |
| |
| +* LUCENE-4817: Added KeywordRepeatFilter, which emits each token twice, |
| + once as a keyword and once as an ordinary token, allowing stemmers to emit |
| + a stemmed version along with the un-stemmed version. (Simon Willnauer) |
| + |
| ======================= Lucene 4.2.0 ======================= |
| |
| Changes in backwards compatibility policy |