| Index: solr/core/src/test/org/apache/solr/handler/component/TermVectorComponentTest.java |
| =================================================================== |
| --- solr/core/src/test/org/apache/solr/handler/component/TermVectorComponentTest.java (révision 1477238) |
| +++ solr/core/src/test/org/apache/solr/handler/component/TermVectorComponentTest.java (copie de travail) |
| @@ -201,12 +201,12 @@ |
| public void testOptions() throws Exception { |
| assertJQ(req("json.nl","map", "qt",tv, "q", "id:0", TermVectorComponent.COMPONENT_NAME, "true" |
| , TermVectorParams.TF, "true", TermVectorParams.DF, "true", TermVectorParams.OFFSETS, "true", TermVectorParams.POSITIONS, "true", TermVectorParams.TF_IDF, "true") |
| - ,"/termVectors/0/test_posofftv/anoth=={'tf':1, 'offsets':{'start':20, 'end':27}, 'positions':{'position':1}, 'df':2, 'tf-idf':0.5}" |
| + ,"/termVectors/0/test_posofftv/anoth=={'tf':1, 'offsets':{'start':20, 'end':27}, 'positions':{'position':5}, 'df':2, 'tf-idf':0.5}" |
| ); |
| |
| assertJQ(req("json.nl","map", "qt",tv, "q", "id:0", TermVectorComponent.COMPONENT_NAME, "true" |
| , TermVectorParams.ALL, "true") |
| - ,"/termVectors/0/test_posofftv/anoth=={'tf':1, 'offsets':{'start':20, 'end':27}, 'positions':{'position':1}, 'df':2, 'tf-idf':0.5}" |
| + ,"/termVectors/0/test_posofftv/anoth=={'tf':1, 'offsets':{'start':20, 'end':27}, 'positions':{'position':5}, 'df':2, 'tf-idf':0.5}" |
| ); |
| |
| // test each combination at random |
| @@ -214,7 +214,7 @@ |
| list.addAll(Arrays.asList("json.nl","map", "qt",tv, "q", "id:0", TermVectorComponent.COMPONENT_NAME, "true")); |
| String[][] options = new String[][] { { TermVectorParams.TF, "'tf':1" }, |
| { TermVectorParams.OFFSETS, "'offsets':{'start':20, 'end':27}" }, |
| - { TermVectorParams.POSITIONS, "'positions':{'position':1}" }, |
| + { TermVectorParams.POSITIONS, "'positions':{'position':5}" }, |
| { TermVectorParams.DF, "'df':2" }, |
| { TermVectorParams.TF_IDF, "'tf-idf':0.5" } }; |
| StringBuilder expected = new StringBuilder("/termVectors/0/test_posofftv/anoth=={"); |
| @@ -249,7 +249,7 @@ |
| ,"f.test_basictv." + TermVectorParams.TF_IDF, "false" |
| ) |
| ,"/termVectors/0/test_basictv=={'anoth':{},'titl':{}}" |
| - ,"/termVectors/0/test_postv/anoth=={'tf':1, 'positions':{'position':1}, 'df':2, 'tf-idf':0.5}" |
| + ,"/termVectors/0/test_postv/anoth=={'tf':1, 'positions':{'position':5}, 'df':2, 'tf-idf':0.5}" |
| ,"/termVectors/0/test_offtv/anoth=={'tf':1, 'df':2, 'tf-idf':0.5}" |
| ,"/termVectors/warnings=={ 'noTermVectors':['test_notv'], 'noPositions':['test_basictv', 'test_offtv'], 'noOffsets':['test_basictv', 'test_postv']}" |
| ); |
| Index: solr/core/src/test/org/apache/solr/handler/DocumentAnalysisRequestHandlerTest.java |
| =================================================================== |
| --- solr/core/src/test/org/apache/solr/handler/DocumentAnalysisRequestHandlerTest.java (révision 1477238) |
| +++ solr/core/src/test/org/apache/solr/handler/DocumentAnalysisRequestHandlerTest.java (copie de travail) |
| @@ -323,16 +323,16 @@ |
| tokenList = valueResult.get("org.apache.lucene.analysis.core.StopFilter"); |
| assertNotNull("Expecting the 'StopFilter' to be applied on the index for the 'text' field", tokenList); |
| assertEquals("Expecting 4 tokens after stop word removal", 4, tokenList.size()); |
| - assertToken(tokenList.get(0), new TokenInfo("fox", null, "<ALPHANUM>", 4, 7, 1, new int[]{2,2,2,1}, null, false)); |
| - assertToken(tokenList.get(1), new TokenInfo("jumped", null, "<ALPHANUM>", 8, 14, 2, new int[]{3,3,3,2}, null, false)); |
| - assertToken(tokenList.get(2), new TokenInfo("over", null, "<ALPHANUM>", 15, 19, 3, new int[]{4,4,4,3}, null, false)); |
| - assertToken(tokenList.get(3), new TokenInfo("dogs", null, "<ALPHANUM>", 24, 28, 4, new int[]{6,6,6,4}, null, false)); |
| + assertToken(tokenList.get(0), new TokenInfo("fox", null, "<ALPHANUM>", 4, 7, 2, new int[]{2,2,2,2}, null, false)); |
| + assertToken(tokenList.get(1), new TokenInfo("jumped", null, "<ALPHANUM>", 8, 14, 3, new int[]{3,3,3,3}, null, false)); |
| + assertToken(tokenList.get(2), new TokenInfo("over", null, "<ALPHANUM>", 15, 19, 4, new int[]{4,4,4,4}, null, false)); |
| + assertToken(tokenList.get(3), new TokenInfo("dogs", null, "<ALPHANUM>", 24, 28, 6, new int[]{6,6,6,6}, null, false)); |
| tokenList = valueResult.get("org.apache.lucene.analysis.en.PorterStemFilter"); |
| assertNotNull("Expecting the 'PorterStemFilter' to be applied on the index for the 'text' field", tokenList); |
| assertEquals("Expecting 4 tokens", 4, tokenList.size()); |
| - assertToken(tokenList.get(0), new TokenInfo("fox", null, "<ALPHANUM>", 4, 7, 1, new int[]{2,2,2,1,1}, null, false)); |
| - assertToken(tokenList.get(1), new TokenInfo("jump", null, "<ALPHANUM>", 8, 14, 2, new int[]{3,3,3,2,2}, null, true)); |
| - assertToken(tokenList.get(2), new TokenInfo("over", null, "<ALPHANUM>", 15, 19, 3, new int[]{4,4,4,3,3}, null, false)); |
| - assertToken(tokenList.get(3), new TokenInfo("dog", null, "<ALPHANUM>", 24, 28, 4, new int[]{6,6,6,4,4}, null, false)); |
| + assertToken(tokenList.get(0), new TokenInfo("fox", null, "<ALPHANUM>", 4, 7, 2, new int[]{2,2,2,2,2}, null, false)); |
| + assertToken(tokenList.get(1), new TokenInfo("jump", null, "<ALPHANUM>", 8, 14, 3, new int[]{3,3,3,3,3}, null, true)); |
| + assertToken(tokenList.get(2), new TokenInfo("over", null, "<ALPHANUM>", 15, 19, 4, new int[]{4,4,4,4,4}, null, false)); |
| + assertToken(tokenList.get(3), new TokenInfo("dog", null, "<ALPHANUM>", 24, 28, 6, new int[]{6,6,6,6,6}, null, false)); |
| } |
| } |
| Index: solr/core/src/test/org/apache/solr/handler/FieldAnalysisRequestHandlerTest.java |
| =================================================================== |
| --- solr/core/src/test/org/apache/solr/handler/FieldAnalysisRequestHandlerTest.java (révision 1477238) |
| +++ solr/core/src/test/org/apache/solr/handler/FieldAnalysisRequestHandlerTest.java (copie de travail) |
| @@ -178,25 +178,25 @@ |
| tokenList = indexPart.get("org.apache.lucene.analysis.core.StopFilter"); |
| assertNotNull("Expcting StopFilter analysis breakdown", tokenList); |
| assertEquals(tokenList.size(), 8); |
| - assertToken(tokenList.get(0), new TokenInfo("quick", null, "<ALPHANUM>", 4, 9, 1, new int[]{2,2,2,1}, null, false)); |
| - assertToken(tokenList.get(1), new TokenInfo("red", null, "<ALPHANUM>", 10, 13, 2, new int[]{3,3,3,2}, null, false)); |
| - assertToken(tokenList.get(2), new TokenInfo("fox", null, "<ALPHANUM>", 14, 17, 3, new int[]{4,4,4,3}, null, true)); |
| - assertToken(tokenList.get(3), new TokenInfo("jumped", null, "<ALPHANUM>", 18, 24, 4, new int[]{5,5,5,4}, null, false)); |
| - assertToken(tokenList.get(4), new TokenInfo("over", null, "<ALPHANUM>", 25, 29, 5, new int[]{6,6,6,5}, null, false)); |
| - assertToken(tokenList.get(5), new TokenInfo("lazy", null, "<ALPHANUM>", 34, 38, 6, new int[]{8,8,8,6}, null, false)); |
| - assertToken(tokenList.get(6), new TokenInfo("brown", null, "<ALPHANUM>", 39, 44, 7, new int[]{9,9,9,7}, null, true)); |
| - assertToken(tokenList.get(7), new TokenInfo("dogs", null, "<ALPHANUM>", 45, 49, 8, new int[]{10,10,10,8}, null, false)); |
| + assertToken(tokenList.get(0), new TokenInfo("quick", null, "<ALPHANUM>", 4, 9, 2, new int[]{2,2,2,2}, null, false)); |
| + assertToken(tokenList.get(1), new TokenInfo("red", null, "<ALPHANUM>", 10, 13, 3, new int[]{3,3,3,3}, null, false)); |
| + assertToken(tokenList.get(2), new TokenInfo("fox", null, "<ALPHANUM>", 14, 17, 4, new int[]{4,4,4,4}, null, true)); |
| + assertToken(tokenList.get(3), new TokenInfo("jumped", null, "<ALPHANUM>", 18, 24, 5, new int[]{5,5,5,5}, null, false)); |
| + assertToken(tokenList.get(4), new TokenInfo("over", null, "<ALPHANUM>", 25, 29, 6, new int[]{6,6,6,6}, null, false)); |
| + assertToken(tokenList.get(5), new TokenInfo("lazy", null, "<ALPHANUM>", 34, 38, 8, new int[]{8,8,8,8}, null, false)); |
| + assertToken(tokenList.get(6), new TokenInfo("brown", null, "<ALPHANUM>", 39, 44, 9, new int[]{9,9,9,9}, null, true)); |
| + assertToken(tokenList.get(7), new TokenInfo("dogs", null, "<ALPHANUM>", 45, 49, 10, new int[]{10,10,10,10}, null, false)); |
| tokenList = indexPart.get("org.apache.lucene.analysis.en.PorterStemFilter"); |
| assertNotNull("Expcting PorterStemFilter analysis breakdown", tokenList); |
| assertEquals(tokenList.size(), 8); |
| - assertToken(tokenList.get(0), new TokenInfo("quick", null, "<ALPHANUM>", 4, 9, 1, new int[]{2,2,2,1,1}, null, false)); |
| - assertToken(tokenList.get(1), new TokenInfo("red", null, "<ALPHANUM>", 10, 13, 2, new int[]{3,3,3,2,2}, null, false)); |
| - assertToken(tokenList.get(2), new TokenInfo("fox", null, "<ALPHANUM>", 14, 17, 3, new int[]{4,4,4,3,3}, null, true)); |
| - assertToken(tokenList.get(3), new TokenInfo("jump", null, "<ALPHANUM>", 18, 24, 4, new int[]{5,5,5,4,4}, null, false)); |
| - assertToken(tokenList.get(4), new TokenInfo("over", null, "<ALPHANUM>", 25, 29, 5, new int[]{6,6,6,5,5}, null, false)); |
| - assertToken(tokenList.get(5), new TokenInfo("lazi", null, "<ALPHANUM>", 34, 38, 6, new int[]{8,8,8,6,6}, null, false)); |
| - assertToken(tokenList.get(6), new TokenInfo("brown", null, "<ALPHANUM>", 39, 44, 7, new int[]{9,9,9,7,7}, null, true)); |
| - assertToken(tokenList.get(7), new TokenInfo("dog", null, "<ALPHANUM>", 45, 49, 8, new int[]{10,10,10,8,8}, null, false)); |
| + assertToken(tokenList.get(0), new TokenInfo("quick", null, "<ALPHANUM>", 4, 9, 2, new int[]{2,2,2,2,2}, null, false)); |
| + assertToken(tokenList.get(1), new TokenInfo("red", null, "<ALPHANUM>", 10, 13, 3, new int[]{3,3,3,3,3}, null, false)); |
| + assertToken(tokenList.get(2), new TokenInfo("fox", null, "<ALPHANUM>", 14, 17, 4, new int[]{4,4,4,4,4}, null, true)); |
| + assertToken(tokenList.get(3), new TokenInfo("jump", null, "<ALPHANUM>", 18, 24, 5, new int[]{5,5,5,5,5}, null, false)); |
| + assertToken(tokenList.get(4), new TokenInfo("over", null, "<ALPHANUM>", 25, 29, 6, new int[]{6,6,6,6,6}, null, false)); |
| + assertToken(tokenList.get(5), new TokenInfo("lazi", null, "<ALPHANUM>", 34, 38, 8, new int[]{8,8,8,8,8}, null, false)); |
| + assertToken(tokenList.get(6), new TokenInfo("brown", null, "<ALPHANUM>", 39, 44, 9, new int[]{9,9,9,9,9}, null, true)); |
| + assertToken(tokenList.get(7), new TokenInfo("dog", null, "<ALPHANUM>", 45, 49, 10, new int[]{10,10,10,10,10}, null, false)); |
| |
| NamedList<List<NamedList>> queryPart = textType.get("query"); |
| assertNotNull("expecting a query token analysis for field type 'text'", queryPart); |
| Index: solr/core/src/test/org/apache/solr/DisMaxRequestHandlerTest.java |
| =================================================================== |
| --- solr/core/src/test/org/apache/solr/DisMaxRequestHandlerTest.java (révision 1477238) |
| +++ solr/core/src/test/org/apache/solr/DisMaxRequestHandlerTest.java (copie de travail) |
| @@ -87,8 +87,8 @@ |
| req("cool stuff") |
| ,"//*[@numFound='3']" |
| ,"//result/doc[1]/int[@name='id'][.='42']" |
| - ,"//result/doc[2]/int[@name='id'][.='666']" |
| - ,"//result/doc[3]/int[@name='id'][.='8675309']" |
| + ,"//result/doc[2]/int[@name='id'][.='8675309']" |
| + ,"//result/doc[3]/int[@name='id'][.='666']" |
| ); |
| |
| assertQ("multi qf", |
| Index: solr/core/src/test/org/apache/solr/spelling/TestSuggestSpellingConverter.java |
| =================================================================== |
| --- solr/core/src/test/org/apache/solr/spelling/TestSuggestSpellingConverter.java (révision 1477238) |
| +++ solr/core/src/test/org/apache/solr/spelling/TestSuggestSpellingConverter.java (copie de travail) |
| @@ -53,7 +53,7 @@ |
| TokenStream filter = new PatternReplaceFilter(tokenizer, |
| Pattern.compile("([^\\p{L}\\p{M}\\p{N}\\p{Cs}]*[\\p{L}\\p{M}\\p{N}\\p{Cs}\\_]+:)|([^\\p{L}\\p{M}\\p{N}\\p{Cs}])+"), " ", true); |
| filter = new LowerCaseFilter(TEST_VERSION_CURRENT, filter); |
| - filter = new TrimFilter(filter, false); |
| + filter = new TrimFilter(TEST_VERSION_CURRENT, filter, false); |
| return new TokenStreamComponents(tokenizer, filter); |
| } |
| }); |
| Index: solr/example/example-DIH/solr/mail/conf/schema.xml |
| =================================================================== |
| --- solr/example/example-DIH/solr/mail/conf/schema.xml (révision 1477238) |
| +++ solr/example/example-DIH/solr/mail/conf/schema.xml (copie de travail) |
| @@ -202,13 +202,10 @@ |
| <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/> |
| --> |
| <!-- Case insensitive stop word removal. |
| - add enablePositionIncrements=true in both the index and query |
| - analyzers to leave a 'gap' for more accurate phrase queries. |
| --> |
| <filter class="solr.StopFilterFactory" |
| ignoreCase="true" |
| words="stopwords.txt" |
| - enablePositionIncrements="true" |
| /> |
| <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/> |
| <filter class="solr.LowerCaseFilterFactory"/> |
| @@ -222,7 +219,6 @@ |
| <filter class="solr.StopFilterFactory" |
| ignoreCase="true" |
| words="stopwords.txt" |
| - enablePositionIncrements="true" |
| /> |
| <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/> |
| <filter class="solr.LowerCaseFilterFactory"/> |
| Index: solr/example/solr/collection1/conf/schema.xml |
| =================================================================== |
| --- solr/example/solr/collection1/conf/schema.xml (révision 1477238) |
| +++ solr/example/solr/collection1/conf/schema.xml (copie de travail) |
| @@ -440,7 +440,7 @@ |
| <fieldType name="text_general" class="solr.TextField" positionIncrementGap="100"> |
| <analyzer type="index"> |
| <tokenizer class="solr.StandardTokenizerFactory"/> |
| - <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" /> |
| + <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" /> |
| <!-- in this example, we will only use synonyms at query time |
| <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/> |
| --> |
| @@ -448,7 +448,7 @@ |
| </analyzer> |
| <analyzer type="query"> |
| <tokenizer class="solr.StandardTokenizerFactory"/> |
| - <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" /> |
| + <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" /> |
| <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/> |
| <filter class="solr.LowerCaseFilterFactory"/> |
| </analyzer> |
| @@ -466,13 +466,10 @@ |
| <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/> |
| --> |
| <!-- Case insensitive stop word removal. |
| - add enablePositionIncrements=true in both the index and query |
| - analyzers to leave a 'gap' for more accurate phrase queries. |
| --> |
| <filter class="solr.StopFilterFactory" |
| ignoreCase="true" |
| words="lang/stopwords_en.txt" |
| - enablePositionIncrements="true" |
| /> |
| <filter class="solr.LowerCaseFilterFactory"/> |
| <filter class="solr.EnglishPossessiveFilterFactory"/> |
| @@ -488,7 +485,6 @@ |
| <filter class="solr.StopFilterFactory" |
| ignoreCase="true" |
| words="lang/stopwords_en.txt" |
| - enablePositionIncrements="true" |
| /> |
| <filter class="solr.LowerCaseFilterFactory"/> |
| <filter class="solr.EnglishPossessiveFilterFactory"/> |
| @@ -516,13 +512,10 @@ |
| <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/> |
| --> |
| <!-- Case insensitive stop word removal. |
| - add enablePositionIncrements=true in both the index and query |
| - analyzers to leave a 'gap' for more accurate phrase queries. |
| --> |
| <filter class="solr.StopFilterFactory" |
| ignoreCase="true" |
| words="lang/stopwords_en.txt" |
| - enablePositionIncrements="true" |
| /> |
| <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/> |
| <filter class="solr.LowerCaseFilterFactory"/> |
| @@ -535,7 +528,6 @@ |
| <filter class="solr.StopFilterFactory" |
| ignoreCase="true" |
| words="lang/stopwords_en.txt" |
| - enablePositionIncrements="true" |
| /> |
| <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/> |
| <filter class="solr.LowerCaseFilterFactory"/> |
| @@ -566,7 +558,7 @@ |
| <fieldType name="text_general_rev" class="solr.TextField" positionIncrementGap="100"> |
| <analyzer type="index"> |
| <tokenizer class="solr.StandardTokenizerFactory"/> |
| - <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" /> |
| + <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" /> |
| <filter class="solr.LowerCaseFilterFactory"/> |
| <filter class="solr.ReversedWildcardFilterFactory" withOriginal="true" |
| maxPosAsterisk="3" maxPosQuestion="2" maxFractionAsterisk="0.33"/> |
| @@ -574,7 +566,7 @@ |
| <analyzer type="query"> |
| <tokenizer class="solr.StandardTokenizerFactory"/> |
| <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/> |
| - <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" /> |
| + <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" /> |
| <filter class="solr.LowerCaseFilterFactory"/> |
| </analyzer> |
| </fieldType> |
| @@ -730,7 +722,7 @@ |
| <tokenizer class="solr.StandardTokenizerFactory"/> |
| <!-- for any non-arabic --> |
| <filter class="solr.LowerCaseFilterFactory"/> |
| - <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ar.txt" enablePositionIncrements="true"/> |
| + <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ar.txt" /> |
| <!-- normalizes ﻯ to ﻱ, etc --> |
| <filter class="solr.ArabicNormalizationFilterFactory"/> |
| <filter class="solr.ArabicStemFilterFactory"/> |
| @@ -742,7 +734,7 @@ |
| <analyzer> |
| <tokenizer class="solr.StandardTokenizerFactory"/> |
| <filter class="solr.LowerCaseFilterFactory"/> |
| - <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_bg.txt" enablePositionIncrements="true"/> |
| + <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_bg.txt" /> |
| <filter class="solr.BulgarianStemFilterFactory"/> |
| </analyzer> |
| </fieldType> |
| @@ -754,7 +746,7 @@ |
| <!-- removes l', etc --> |
| <filter class="solr.ElisionFilterFactory" ignoreCase="true" articles="lang/contractions_ca.txt"/> |
| <filter class="solr.LowerCaseFilterFactory"/> |
| - <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ca.txt" enablePositionIncrements="true"/> |
| + <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ca.txt" /> |
| <filter class="solr.SnowballPorterFilterFactory" language="Catalan"/> |
| </analyzer> |
| </fieldType> |
| @@ -776,7 +768,7 @@ |
| <analyzer> |
| <tokenizer class="solr.StandardTokenizerFactory"/> |
| <filter class="solr.LowerCaseFilterFactory"/> |
| - <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_cz.txt" enablePositionIncrements="true"/> |
| + <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_cz.txt" /> |
| <filter class="solr.CzechStemFilterFactory"/> |
| </analyzer> |
| </fieldType> |
| @@ -786,7 +778,7 @@ |
| <analyzer> |
| <tokenizer class="solr.StandardTokenizerFactory"/> |
| <filter class="solr.LowerCaseFilterFactory"/> |
| - <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_da.txt" format="snowball" enablePositionIncrements="true"/> |
| + <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_da.txt" format="snowball" /> |
| <filter class="solr.SnowballPorterFilterFactory" language="Danish"/> |
| </analyzer> |
| </fieldType> |
| @@ -796,7 +788,7 @@ |
| <analyzer> |
| <tokenizer class="solr.StandardTokenizerFactory"/> |
| <filter class="solr.LowerCaseFilterFactory"/> |
| - <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_de.txt" format="snowball" enablePositionIncrements="true"/> |
| + <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_de.txt" format="snowball" /> |
| <filter class="solr.GermanNormalizationFilterFactory"/> |
| <filter class="solr.GermanLightStemFilterFactory"/> |
| <!-- less aggressive: <filter class="solr.GermanMinimalStemFilterFactory"/> --> |
| @@ -810,7 +802,7 @@ |
| <tokenizer class="solr.StandardTokenizerFactory"/> |
| <!-- greek specific lowercase for sigma --> |
| <filter class="solr.GreekLowerCaseFilterFactory"/> |
| - <filter class="solr.StopFilterFactory" ignoreCase="false" words="lang/stopwords_el.txt" enablePositionIncrements="true"/> |
| + <filter class="solr.StopFilterFactory" ignoreCase="false" words="lang/stopwords_el.txt" /> |
| <filter class="solr.GreekStemFilterFactory"/> |
| </analyzer> |
| </fieldType> |
| @@ -820,7 +812,7 @@ |
| <analyzer> |
| <tokenizer class="solr.StandardTokenizerFactory"/> |
| <filter class="solr.LowerCaseFilterFactory"/> |
| - <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_es.txt" format="snowball" enablePositionIncrements="true"/> |
| + <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_es.txt" format="snowball" /> |
| <filter class="solr.SpanishLightStemFilterFactory"/> |
| <!-- more aggressive: <filter class="solr.SnowballPorterFilterFactory" language="Spanish"/> --> |
| </analyzer> |
| @@ -831,7 +823,7 @@ |
| <analyzer> |
| <tokenizer class="solr.StandardTokenizerFactory"/> |
| <filter class="solr.LowerCaseFilterFactory"/> |
| - <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_eu.txt" enablePositionIncrements="true"/> |
| + <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_eu.txt" /> |
| <filter class="solr.SnowballPorterFilterFactory" language="Basque"/> |
| </analyzer> |
| </fieldType> |
| @@ -845,7 +837,7 @@ |
| <filter class="solr.LowerCaseFilterFactory"/> |
| <filter class="solr.ArabicNormalizationFilterFactory"/> |
| <filter class="solr.PersianNormalizationFilterFactory"/> |
| - <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_fa.txt" enablePositionIncrements="true"/> |
| + <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_fa.txt" /> |
| </analyzer> |
| </fieldType> |
| |
| @@ -854,7 +846,7 @@ |
| <analyzer> |
| <tokenizer class="solr.StandardTokenizerFactory"/> |
| <filter class="solr.LowerCaseFilterFactory"/> |
| - <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_fi.txt" format="snowball" enablePositionIncrements="true"/> |
| + <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_fi.txt" format="snowball" /> |
| <filter class="solr.SnowballPorterFilterFactory" language="Finnish"/> |
| <!-- less aggressive: <filter class="solr.FinnishLightStemFilterFactory"/> --> |
| </analyzer> |
| @@ -867,7 +859,7 @@ |
| <!-- removes l', etc --> |
| <filter class="solr.ElisionFilterFactory" ignoreCase="true" articles="lang/contractions_fr.txt"/> |
| <filter class="solr.LowerCaseFilterFactory"/> |
| - <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_fr.txt" format="snowball" enablePositionIncrements="true"/> |
| + <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_fr.txt" format="snowball" /> |
| <filter class="solr.FrenchLightStemFilterFactory"/> |
| <!-- less aggressive: <filter class="solr.FrenchMinimalStemFilterFactory"/> --> |
| <!-- more aggressive: <filter class="solr.SnowballPorterFilterFactory" language="French"/> --> |
| @@ -881,9 +873,9 @@ |
| <!-- removes d', etc --> |
| <filter class="solr.ElisionFilterFactory" ignoreCase="true" articles="lang/contractions_ga.txt"/> |
| - <!-- removes n-, etc. position increments is intentionally false! --> |
| + <!-- removes n-, etc. --> |
| - <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/hyphenations_ga.txt" enablePositionIncrements="false"/> |
| + <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/hyphenations_ga.txt"/> |
| <filter class="solr.IrishLowerCaseFilterFactory"/> |
| - <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ga.txt" enablePositionIncrements="true"/> |
| + <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ga.txt"/> |
| <filter class="solr.SnowballPorterFilterFactory" language="Irish"/> |
| </analyzer> |
| </fieldType> |
| @@ -893,7 +885,7 @@ |
| <analyzer> |
| <tokenizer class="solr.StandardTokenizerFactory"/> |
| <filter class="solr.LowerCaseFilterFactory"/> |
| - <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_gl.txt" enablePositionIncrements="true"/> |
| + <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_gl.txt" /> |
| <filter class="solr.GalicianStemFilterFactory"/> |
| <!-- less aggressive: <filter class="solr.GalicianMinimalStemFilterFactory"/> --> |
| </analyzer> |
| @@ -908,7 +900,7 @@ |
| <filter class="solr.IndicNormalizationFilterFactory"/> |
| <!-- normalizes variation in spelling --> |
| <filter class="solr.HindiNormalizationFilterFactory"/> |
| - <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_hi.txt" enablePositionIncrements="true"/> |
| + <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_hi.txt" /> |
| <filter class="solr.HindiStemFilterFactory"/> |
| </analyzer> |
| </fieldType> |
| @@ -918,7 +910,7 @@ |
| <analyzer> |
| <tokenizer class="solr.StandardTokenizerFactory"/> |
| <filter class="solr.LowerCaseFilterFactory"/> |
| - <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_hu.txt" format="snowball" enablePositionIncrements="true"/> |
| + <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_hu.txt" format="snowball" /> |
| <filter class="solr.SnowballPorterFilterFactory" language="Hungarian"/> |
| <!-- less aggressive: <filter class="solr.HungarianLightStemFilterFactory"/> --> |
| </analyzer> |
| @@ -929,7 +921,7 @@ |
| <analyzer> |
| <tokenizer class="solr.StandardTokenizerFactory"/> |
| <filter class="solr.LowerCaseFilterFactory"/> |
| - <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_hy.txt" enablePositionIncrements="true"/> |
| + <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_hy.txt" /> |
| <filter class="solr.SnowballPorterFilterFactory" language="Armenian"/> |
| </analyzer> |
| </fieldType> |
| @@ -939,7 +931,7 @@ |
| <analyzer> |
| <tokenizer class="solr.StandardTokenizerFactory"/> |
| <filter class="solr.LowerCaseFilterFactory"/> |
| - <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_id.txt" enablePositionIncrements="true"/> |
| + <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_id.txt" /> |
| <!-- for a less aggressive approach (only inflectional suffixes), set stemDerivational to false --> |
| <filter class="solr.IndonesianStemFilterFactory" stemDerivational="true"/> |
| </analyzer> |
| @@ -952,7 +944,7 @@ |
| <!-- removes l', etc --> |
| <filter class="solr.ElisionFilterFactory" ignoreCase="true" articles="lang/contractions_it.txt"/> |
| <filter class="solr.LowerCaseFilterFactory"/> |
| - <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_it.txt" format="snowball" enablePositionIncrements="true"/> |
| + <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_it.txt" format="snowball" /> |
| <filter class="solr.ItalianLightStemFilterFactory"/> |
| <!-- more aggressive: <filter class="solr.SnowballPorterFilterFactory" language="Italian"/> --> |
| </analyzer> |
| @@ -999,11 +991,11 @@ |
| <!-- Reduces inflected verbs and adjectives to their base/dictionary forms (辞書形) --> |
| <filter class="solr.JapaneseBaseFormFilterFactory"/> |
| <!-- Removes tokens with certain part-of-speech tags --> |
| - <filter class="solr.JapanesePartOfSpeechStopFilterFactory" tags="lang/stoptags_ja.txt" enablePositionIncrements="true"/> |
| + <filter class="solr.JapanesePartOfSpeechStopFilterFactory" tags="lang/stoptags_ja.txt" /> |
| <!-- Normalizes full-width romaji to half-width and half-width kana to full-width (Unicode NFKC subset) --> |
| <filter class="solr.CJKWidthFilterFactory"/> |
| <!-- Removes common tokens typically not useful for search, but have a negative effect on ranking --> |
| - <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ja.txt" enablePositionIncrements="true" /> |
| + <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ja.txt" /> |
| <!-- Normalizes common katakana spelling variations by removing any last long sound character (U+30FC) --> |
| <filter class="solr.JapaneseKatakanaStemFilterFactory" minimumLength="4"/> |
| <!-- Lower-cases romaji characters --> |
| @@ -1016,7 +1008,7 @@ |
| <analyzer> |
| <tokenizer class="solr.StandardTokenizerFactory"/> |
| <filter class="solr.LowerCaseFilterFactory"/> |
| - <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_lv.txt" enablePositionIncrements="true"/> |
| + <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_lv.txt" /> |
| <filter class="solr.LatvianStemFilterFactory"/> |
| </analyzer> |
| </fieldType> |
| @@ -1026,7 +1018,7 @@ |
| <analyzer> |
| <tokenizer class="solr.StandardTokenizerFactory"/> |
| <filter class="solr.LowerCaseFilterFactory"/> |
| - <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_nl.txt" format="snowball" enablePositionIncrements="true"/> |
| + <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_nl.txt" format="snowball" /> |
| <filter class="solr.StemmerOverrideFilterFactory" dictionary="lang/stemdict_nl.txt" ignoreCase="false"/> |
| <filter class="solr.SnowballPorterFilterFactory" language="Dutch"/> |
| </analyzer> |
| @@ -1037,7 +1029,7 @@ |
| <analyzer> |
| <tokenizer class="solr.StandardTokenizerFactory"/> |
| <filter class="solr.LowerCaseFilterFactory"/> |
| - <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_no.txt" format="snowball" enablePositionIncrements="true"/> |
| + <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_no.txt" format="snowball" /> |
| <filter class="solr.SnowballPorterFilterFactory" language="Norwegian"/> |
| <!-- less aggressive: <filter class="solr.NorwegianLightStemFilterFactory"/> --> |
| <!-- singular/plural: <filter class="solr.NorwegianMinimalStemFilterFactory"/> --> |
| @@ -1049,7 +1041,7 @@ |
| <analyzer> |
| <tokenizer class="solr.StandardTokenizerFactory"/> |
| <filter class="solr.LowerCaseFilterFactory"/> |
| - <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_pt.txt" format="snowball" enablePositionIncrements="true"/> |
| + <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_pt.txt" format="snowball" /> |
| <filter class="solr.PortugueseLightStemFilterFactory"/> |
| <!-- less aggressive: <filter class="solr.PortugueseMinimalStemFilterFactory"/> --> |
| <!-- more aggressive: <filter class="solr.SnowballPorterFilterFactory" language="Portuguese"/> --> |
| @@ -1062,7 +1054,7 @@ |
| <analyzer> |
| <tokenizer class="solr.StandardTokenizerFactory"/> |
| <filter class="solr.LowerCaseFilterFactory"/> |
| - <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ro.txt" enablePositionIncrements="true"/> |
| + <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ro.txt" /> |
| <filter class="solr.SnowballPorterFilterFactory" language="Romanian"/> |
| </analyzer> |
| </fieldType> |
| @@ -1072,7 +1064,7 @@ |
| <analyzer> |
| <tokenizer class="solr.StandardTokenizerFactory"/> |
| <filter class="solr.LowerCaseFilterFactory"/> |
| - <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ru.txt" format="snowball" enablePositionIncrements="true"/> |
| + <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ru.txt" format="snowball" /> |
| <filter class="solr.SnowballPorterFilterFactory" language="Russian"/> |
| <!-- less aggressive: <filter class="solr.RussianLightStemFilterFactory"/> --> |
| </analyzer> |
| @@ -1083,7 +1075,7 @@ |
| <analyzer> |
| <tokenizer class="solr.StandardTokenizerFactory"/> |
| <filter class="solr.LowerCaseFilterFactory"/> |
| - <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_sv.txt" format="snowball" enablePositionIncrements="true"/> |
| + <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_sv.txt" format="snowball" /> |
| <filter class="solr.SnowballPorterFilterFactory" language="Swedish"/> |
| <!-- less aggressive: <filter class="solr.SwedishLightStemFilterFactory"/> --> |
| </analyzer> |
| @@ -1095,7 +1087,7 @@ |
| <tokenizer class="solr.StandardTokenizerFactory"/> |
| <filter class="solr.LowerCaseFilterFactory"/> |
| <filter class="solr.ThaiWordFilterFactory"/> |
| - <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_th.txt" enablePositionIncrements="true"/> |
| + <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_th.txt" /> |
| </analyzer> |
| </fieldType> |
| |
| @@ -1104,7 +1096,7 @@ |
| <analyzer> |
| <tokenizer class="solr.StandardTokenizerFactory"/> |
| <filter class="solr.TurkishLowerCaseFilterFactory"/> |
| - <filter class="solr.StopFilterFactory" ignoreCase="false" words="lang/stopwords_tr.txt" enablePositionIncrements="true"/> |
| + <filter class="solr.StopFilterFactory" ignoreCase="false" words="lang/stopwords_tr.txt" /> |
| <filter class="solr.SnowballPorterFilterFactory" language="Turkish"/> |
| </analyzer> |
| </fieldType> |
| Index: lucene/core/src/java/org/apache/lucene/analysis/package.html |
| =================================================================== |
| --- lucene/core/src/java/org/apache/lucene/analysis/package.html (révision 1477238) |
| +++ lucene/core/src/java/org/apache/lucene/analysis/package.html (copie de travail) |
| @@ -282,19 +282,19 @@ |
| <p> |
| If the selected analyzer filters the stop words "is" and "the", then for a document |
| containing the string "blue is the sky", only the tokens "blue", "sky" are indexed, |
| - with position("sky") = 1 + position("blue"). Now, a phrase query "blue is the sky" |
| + with position("sky") = 3 + position("blue"). Now, a phrase query "blue is the sky" |
| would find that document, because the same analyzer filters the same stop words from |
| - that query. But also the phrase query "blue sky" would find that document. |
| + that query. But the phrase query "blue sky" would not find that document because the |
| + position increment between "blue" and "sky" in that query is only 1. |
| </p> |
| <p> |
| - If this behavior does not fit the application needs, a modified analyzer can |
| - be used, that would increment further the positions of tokens following a |
| - removed stop word, using |
| - {@link org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute#setPositionIncrement(int)}. |
| - This can be done with something like the following (note, however, that |
| - StopFilter natively includes this capability by subclassing |
| - FilteringTokenFilter}: |
| + If this behavior does not fit the application needs, the query parser needs to be |
| + configured to not take position increments into account when generating phrase queries. |
| </p> |
| +<p> |
| + Note that a StopFilter MUST increment the position increment in order not to generate corrupt |
| + tokenstream graphs. Here is the logic used by StopFilter to increment positions when filtering out tokens: |
| +</p> |
| <PRE class="prettyprint"> |
| public TokenStream tokenStream(final String fieldName, Reader reader) { |
| final TokenStream ts = someAnalyzer.tokenStream(fieldName, reader); |
| @@ -308,7 +308,7 @@ |
| boolean hasNext = ts.incrementToken(); |
| if (hasNext) { |
| if (stopWords.contains(termAtt.toString())) { |
| - extraIncrement++; // filter this word |
| + extraIncrement += posIncrAtt.getPositionIncrement(); // filter this word |
| continue; |
| } |
| if (extraIncrement>0) { |
| @@ -323,11 +323,6 @@ |
| } |
| </PRE> |
| <p> |
| - Now, with this modified analyzer, the phrase query "blue sky" would find that document. |
| - But note that this is yet not a perfect solution, because any phrase query "blue w1 w2 sky" |
| - where both w1 and w2 are stop words would match that document. |
| -</p> |
| -<p> |
| A few more use cases for modifying position increments are: |
| </p> |
| <ol> |
| @@ -338,6 +333,72 @@ |
| As result, all synonyms of a token would be considered to appear in exactly the |
| same position as that token, and so would they be seen by phrase and proximity searches.</li> |
| </ol> |
| + |
| +<h3>Token Position Length</h3> |
| +<p> |
| + By default, all tokens created by Analyzers and Tokenizers have a |
| + {@link org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute#getPositionLength() position length} of one. |
| + This means that the token occupies a single position. This attribute is not indexed |
| + and thus not taken into account for positional queries, but is used by e.g. suggesters. |
| +</p> |
| +<p> |
| + The main use case for position lengths is multi-word synonyms. With single-word |
| + synonyms, setting the position increment to 0 is enough to denote the fact that two |
| + words are synonyms, for example: |
| +</p> |
| +<table> |
| +<tr><td>Term</td><td>red</td><td>magenta</td></tr> |
| +<tr><td>Position increment</td><td>1</td><td>0</td></tr> |
| +</table> |
| +<p> |
| + Given that position(magenta) = 0 + position(red), they are at the same position, so anything |
| + working with analyzers will return the exact same result if you replace "magenta" with "red" |
| + in the input. However, multi-word synonyms are more tricky. Let's say that you want to build |
| + a TokenStream where "IBM" is a synonym of "International Business Machines". Position increments |
| + are not enough anymore: |
| +</p> |
| +<table> |
| +<tr><td>Term</td><td>IBM</td><td>International</td><td>Business</td><td>Machines</td></tr> |
| +<tr><td>Position increment</td><td>1</td><td>0</td><td>1</td><td>1</td></tr> |
| +</table> |
| +<p> |
| + The problem with this token stream is that "IBM" is at the same position as "International" |
| + although it is a synonym of "International Business Machines" as a whole. Setting |
| + the position increment of "Business" and "Machines" to 0 wouldn't help as it would mean |
| + that "International" is a synonym of "Business". The only way to solve this issue is to |
| + make "IBM" span across 3 positions; this is where position lengths come to the rescue. |
| +</p> |
| +<table> |
| +<tr><td>Term</td><td>IBM</td><td>International</td><td>Business</td><td>Machines</td></tr> |
| +<tr><td>Position increment</td><td>1</td><td>0</td><td>1</td><td>1</td></tr> |
| +<tr><td>Position length</td><td>3</td><td>1</td><td>1</td><td>1</td></tr> |
| +</table> |
| +<p> |
| + This new attribute makes clear that "IBM" and "International Business Machines" start and end |
| + at the same positions. |
| +</p> |
| +<a name="corrupt" /> |
| +<h3>How to not write corrupt token streams</h3> |
| +<p> |
| + There are a few rules to observe when writing custom Tokenizers and TokenFilters: |
| +</p> |
| +<ul> |
| + <li>The first position increment must be > 0.</li> |
| + <li>Positions must not go backward.</li> |
| + <li>Tokens that have the same start position must have the same start offset.</li> |
| + <li>Tokens that have the same end position (taking into account the position length) must have the same end offset.</li> |
| +</ul> |
| +<p> |
| + Although these rules might seem easy to follow, problems can quickly happen when chaining |
| + badly implemented filters that play with positions and offsets, such as synonym or n-grams |
| + filters. Here are good practices for writing correct filters: |
| +</p> |
| +<ul> |
| + <li>Token filters should not modify offsets. If you feel that your filter would need to modify offsets, then it should probably be implemented as a tokenizer.</li> |
| + <li>Token filters should not insert positions. If a filter needs to add tokens, then they should all have a position increment of 0.</li> |
| + <li>When they remove tokens, token filters should increment the position increment of the following token.</li> |
| + <li>Token filters should preserve position lengths.</li> |
| +</ul> |
| <h2>TokenStream API</h2> |
| <p> |
| "Flexible Indexing" summarizes the effort of making the Lucene indexer |
| @@ -383,6 +444,10 @@ |
| <td>See above for detailed information about position increment.</td> |
| </tr> |
| <tr> |
| + <td>{@link org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute}</td> |
| + <td>The number of positions occupied by a token.</td> |
| + </tr> |
| + <tr> |
| <td>{@link org.apache.lucene.analysis.tokenattributes.PayloadAttribute}</td> |
| <td>The payload that a Token can optionally have.</td> |
| </tr> |
| @@ -532,20 +597,26 @@ |
| private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); |
| |
| /** |
| - * Build a filter that removes words that are too long or too |
| - * short from the text. |
| + * Create a new LengthFilter. This will filter out tokens whose |
| + * CharTermAttribute is either too short |
| + * (< min) or too long (> max). |
| + * @param version the Lucene match version |
| + * @param in the TokenStream to consume |
| + * @param min the minimum length |
| + * @param max the maximum length |
| */ |
| - public LengthFilter(boolean enablePositionIncrements, TokenStream in, int min, int max) { |
| - super(enablePositionIncrements, in); |
| + public LengthFilter(Version version, TokenStream in, int min, int max) { |
| + super(version, in); |
| this.min = min; |
| this.max = max; |
| } |
| - |
| + |
| {@literal @Override} |
| - public boolean accept() throws IOException { |
| + public boolean accept() { |
| final int len = termAtt.length(); |
| - return (len >= min && len <= max); |
| + return (len >= min && len <= max); |
| } |
| + |
| } |
| </pre> |
| <p> |
| @@ -573,66 +644,39 @@ |
| public abstract class FilteringTokenFilter extends TokenFilter { |
| |
| private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class); |
| - private boolean enablePositionIncrements; // no init needed, as ctor enforces setting value! |
| |
| - public FilteringTokenFilter(boolean enablePositionIncrements, TokenStream input){ |
| - super(input); |
| - this.enablePositionIncrements = enablePositionIncrements; |
| + /** |
| + * Create a new FilteringTokenFilter. |
| + * @param in the TokenStream to consume |
| + */ |
| + public FilteringTokenFilter(Version version, TokenStream in) { |
| + super(in); |
| } |
| |
| - /** Override this method and return if the current input token should be returned by {@literal {@link #incrementToken}}. */ |
| + /** Override this method and return if the current input token should be returned by incrementToken. */ |
| protected abstract boolean accept() throws IOException; |
| |
| {@literal @Override} |
| public final boolean incrementToken() throws IOException { |
| - if (enablePositionIncrements) { |
| - int skippedPositions = 0; |
| - while (input.incrementToken()) { |
| - if (accept()) { |
| - if (skippedPositions != 0) { |
| - posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions); |
| - } |
| - return true; |
| + int skippedPositions = 0; |
| + while (input.incrementToken()) { |
| + if (accept()) { |
| + if (skippedPositions != 0) { |
| + posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions); |
| } |
| - skippedPositions += posIncrAtt.getPositionIncrement(); |
| + return true; |
| } |
| - } else { |
| - while (input.incrementToken()) { |
| - if (accept()) { |
| - return true; |
| - } |
| - } |
| + skippedPositions += posIncrAtt.getPositionIncrement(); |
| } |
| // reached EOS -- return false |
| return false; |
| } |
| |
| - /** |
| - * {@literal @see #setEnablePositionIncrements(boolean)} |
| - */ |
| - public boolean getEnablePositionIncrements() { |
| - return enablePositionIncrements; |
| + {@literal @Override} |
| + public void reset() throws IOException { |
| + super.reset(); |
| } |
| |
| - /** |
| - * If <code>true</code>, this TokenFilter will preserve |
| - * positions of the incoming tokens (ie, accumulate and |
| - * set position increments of the removed tokens). |
| - * Generally, <code>true</code> is best as it does not |
| - * lose information (positions of the original tokens) |
| - * during indexing. |
| - * |
| - * <p> When set, when a token is stopped |
| - * (omitted), the position increment of the following |
| - * token is incremented. |
| - * |
| - * <p> <b>NOTE</b>: be sure to also |
| - * set org.apache.lucene.queryparser.classic.QueryParser#setEnablePositionIncrements if |
| - * you use QueryParser to create queries. |
| - */ |
| - public void setEnablePositionIncrements(boolean enable) { |
| - this.enablePositionIncrements = enable; |
| - } |
| } |
| </pre> |
| |
| Index: lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java |
| =================================================================== |
| --- lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java (révision 1477238) |
| +++ lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java (copie de travail) |
| @@ -17,12 +17,8 @@ |
| * limitations under the License. |
| */ |
| |
| -import java.io.FileOutputStream; |
| import java.io.IOException; |
| -import java.io.OutputStreamWriter; |
| -import java.io.Writer; |
| |
| -import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; |
| import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; |
| import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; |
| import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; |
| @@ -43,10 +39,18 @@ |
| * @lucene.experimental */ |
| public class TokenStreamToAutomaton { |
| |
| + private boolean preservePositionIncrements; |
| + |
| /** Sole constructor. */ |
| public TokenStreamToAutomaton() { |
| + this.preservePositionIncrements = true; |
| } |
| |
| + /** Whether to generate holes in the automaton for missing positions, <code>true</code> by default. */ |
| + public void setPreservePositionIncrements(boolean enablePositionIncrements) { |
| + this.preservePositionIncrements = enablePositionIncrements; |
| + } |
| + |
| private static class Position implements RollingBuffer.Resettable { |
| // Any tokens that ended at our position arrive to this state: |
| State arriving; |
| @@ -108,6 +112,9 @@ |
| int maxOffset = 0; |
| while (in.incrementToken()) { |
| int posInc = posIncAtt.getPositionIncrement(); |
| + if (!preservePositionIncrements && posInc > 1) { |
| + posInc = 1; |
| + } |
| assert pos > -1 || posInc > 0; |
| |
| if (posInc > 0) { |
| Index: lucene/core/src/test/org/apache/lucene/index/TestTermVectorsWriter.java |
| =================================================================== |
| --- lucene/core/src/test/org/apache/lucene/index/TestTermVectorsWriter.java (révision 1477238) |
| +++ lucene/core/src/test/org/apache/lucene/index/TestTermVectorsWriter.java (copie de travail) |
| @@ -213,7 +213,7 @@ |
| public void testEndOffsetPositionStopFilter() throws Exception { |
| Directory dir = newDirectory(); |
| IndexWriter w = new IndexWriter(dir, newIndexWriterConfig( |
| - TEST_VERSION_CURRENT, new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true))); |
| + TEST_VERSION_CURRENT, new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET))); |
| Document doc = new Document(); |
| FieldType customType = new FieldType(TextField.TYPE_NOT_STORED); |
| customType.setStoreTermVectors(true); |
| Index: lucene/core/src/test/org/apache/lucene/analysis/TestMockAnalyzer.java |
| =================================================================== |
| --- lucene/core/src/test/org/apache/lucene/analysis/TestMockAnalyzer.java (révision 1477238) |
| +++ lucene/core/src/test/org/apache/lucene/analysis/TestMockAnalyzer.java (copie de travail) |
| @@ -64,16 +64,10 @@ |
| |
| /** Test a configuration that behaves a lot like StopAnalyzer */ |
| public void testStop() throws Exception { |
| - Analyzer a = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true); |
| + Analyzer a = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET); |
| assertAnalyzesTo(a, "the quick brown a fox", |
| new String[] { "quick", "brown", "fox" }, |
| new int[] { 2, 1, 2 }); |
| - |
| - // disable positions |
| - a = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, false); |
| - assertAnalyzesTo(a, "the quick brown a fox", |
| - new String[] { "quick", "brown", "fox" }, |
| - new int[] { 1, 1, 1 }); |
| } |
| |
| /** Test a configuration that behaves a lot like KeepWordFilter */ |
| @@ -83,7 +77,7 @@ |
| BasicOperations.complement( |
| Automaton.union( |
| Arrays.asList(BasicAutomata.makeString("foo"), BasicAutomata.makeString("bar"))))); |
| - Analyzer a = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, keepWords, true); |
| + Analyzer a = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, keepWords); |
| assertAnalyzesTo(a, "quick foo brown bar bar fox foo", |
| new String[] { "foo", "bar", "bar", "foo" }, |
| new int[] { 2, 2, 1, 2 }); |
| @@ -92,7 +86,7 @@ |
| /** Test a configuration that behaves a lot like LengthFilter */ |
| public void testLength() throws Exception { |
| CharacterRunAutomaton length5 = new CharacterRunAutomaton(new RegExp(".{5,}").toAutomaton()); |
| - Analyzer a = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, length5, true); |
| + Analyzer a = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, length5); |
| assertAnalyzesTo(a, "ok toolong fine notfine", |
| new String[] { "ok", "fine" }, |
| new int[] { 1, 2 }); |
| Index: lucene/core/src/test/org/apache/lucene/search/TestPhraseQuery.java |
| =================================================================== |
| --- lucene/core/src/test/org/apache/lucene/search/TestPhraseQuery.java (révision 1477238) |
| +++ lucene/core/src/test/org/apache/lucene/search/TestPhraseQuery.java (copie de travail) |
| @@ -222,7 +222,7 @@ |
| |
| public void testPhraseQueryWithStopAnalyzer() throws Exception { |
| Directory directory = newDirectory(); |
| - Analyzer stopAnalyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, false); |
| + Analyzer stopAnalyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET); |
| RandomIndexWriter writer = new RandomIndexWriter(random(), directory, |
| newIndexWriterConfig( Version.LUCENE_40, stopAnalyzer)); |
| Document doc = new Document(); |
| @@ -241,16 +241,6 @@ |
| assertEquals(1, hits.length); |
| QueryUtils.check(random(), query,searcher); |
| |
| - |
| - // StopAnalyzer as of 2.4 does not leave "holes", so this matches. |
| - query = new PhraseQuery(); |
| - query.add(new Term("field", "words")); |
| - query.add(new Term("field", "here")); |
| - hits = searcher.search(query, null, 1000).scoreDocs; |
| - assertEquals(1, hits.length); |
| - QueryUtils.check(random(), query,searcher); |
| - |
| - |
| reader.close(); |
| directory.close(); |
| } |
| Index: lucene/core/src/test/org/apache/lucene/search/spans/TestSpansAdvanced2.java |
| =================================================================== |
| --- lucene/core/src/test/org/apache/lucene/search/spans/TestSpansAdvanced2.java (révision 1477238) |
| +++ lucene/core/src/test/org/apache/lucene/search/spans/TestSpansAdvanced2.java (copie de travail) |
| @@ -49,7 +49,7 @@ |
| // create test index |
| final RandomIndexWriter writer = new RandomIndexWriter(random(), mDirectory, |
| newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random(), |
| - MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true)) |
| + MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET)) |
| .setOpenMode(OpenMode.APPEND).setMergePolicy(newLogMergePolicy()) |
| .setSimilarity(new DefaultSimilarity())); |
| addDocument(writer, "A", "Should we, could we, would we?"); |
| Index: lucene/core/src/test/org/apache/lucene/search/spans/TestSpanFirstQuery.java |
| =================================================================== |
| --- lucene/core/src/test/org/apache/lucene/search/spans/TestSpanFirstQuery.java (révision 1477238) |
| +++ lucene/core/src/test/org/apache/lucene/search/spans/TestSpanFirstQuery.java (copie de travail) |
| @@ -37,7 +37,7 @@ |
| |
| // mimic StopAnalyzer |
| CharacterRunAutomaton stopSet = new CharacterRunAutomaton(new RegExp("the|a|of").toAutomaton()); |
| - Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopSet, true); |
| + Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopSet); |
| |
| RandomIndexWriter writer = new RandomIndexWriter(random(), dir, analyzer); |
| Document doc = new Document(); |
| Index: lucene/core/src/test/org/apache/lucene/search/spans/TestSpansAdvanced.java |
| =================================================================== |
| --- lucene/core/src/test/org/apache/lucene/search/spans/TestSpansAdvanced.java (révision 1477238) |
| +++ lucene/core/src/test/org/apache/lucene/search/spans/TestSpansAdvanced.java (copie de travail) |
| @@ -60,7 +60,7 @@ |
| mDirectory = newDirectory(); |
| final RandomIndexWriter writer = new RandomIndexWriter(random(), mDirectory, |
| newIndexWriterConfig(TEST_VERSION_CURRENT, |
| - new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true)) |
| + new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET)) |
| .setMergePolicy(newLogMergePolicy()).setSimilarity(new DefaultSimilarity())); |
| addDocument(writer, "1", "I think it should work."); |
| addDocument(writer, "2", "I think it should work."); |
| Index: lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FastVectorHighlighterTest.java |
| =================================================================== |
| --- lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FastVectorHighlighterTest.java (révision 1477238) |
| +++ lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FastVectorHighlighterTest.java (copie de travail) |
| @@ -247,7 +247,7 @@ |
| |
| public void testCommonTermsQueryHighlightTest() throws IOException { |
| Directory dir = newDirectory(); |
| - IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true))); |
| + IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET))); |
| FieldType type = new FieldType(TextField.TYPE_STORED); |
| type.setStoreTermVectorOffsets(true); |
| type.setStoreTermVectorPositions(true); |
| Index: lucene/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java |
| =================================================================== |
| --- lucene/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java (révision 1477238) |
| +++ lucene/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java (copie de travail) |
| @@ -247,7 +247,7 @@ |
| */ |
| private String highlightField(Query query, String fieldName, String text) |
| throws IOException, InvalidTokenOffsetsException { |
| - TokenStream tokenStream = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true) |
| + TokenStream tokenStream = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET) |
| .tokenStream(fieldName, new StringReader(text)); |
| // Assuming "<B>", "</B>" used to highlight |
| SimpleHTMLFormatter formatter = new SimpleHTMLFormatter(); |
| @@ -1308,7 +1308,7 @@ |
| } |
| |
| public void testMaxSizeHighlight() throws Exception { |
| - final MockAnalyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true); |
| + final MockAnalyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET); |
| // we disable MockTokenizer checks because we will forcefully limit the |
| // tokenstream and call end() before incrementToken() returns false. |
| analyzer.setEnableChecks(false); |
| @@ -1343,7 +1343,7 @@ |
| CharacterRunAutomaton stopWords = new CharacterRunAutomaton(BasicAutomata.makeString("stoppedtoken")); |
| // we disable MockTokenizer checks because we will forcefully limit the |
| // tokenstream and call end() before incrementToken() returns false. |
| - final MockAnalyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopWords, true); |
| + final MockAnalyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopWords); |
| analyzer.setEnableChecks(false); |
| TermQuery query = new TermQuery(new Term("data", goodWord)); |
| |
| @@ -1394,7 +1394,7 @@ |
| Highlighter hg = getHighlighter(query, "text", fm); |
| hg.setTextFragmenter(new NullFragmenter()); |
| hg.setMaxDocCharsToAnalyze(36); |
| - String match = hg.getBestFragment(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopWords, true), "text", text); |
| + String match = hg.getBestFragment(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopWords), "text", text); |
| assertTrue( |
| "Matched text should contain remainder of text after highlighted query ", |
| match.endsWith("in it")); |
| @@ -1411,7 +1411,7 @@ |
| numHighlights = 0; |
| // test to show how rewritten query can still be used |
| searcher = newSearcher(reader); |
| - Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true); |
| + Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET); |
| |
| BooleanQuery query = new BooleanQuery(); |
| query.add(new WildcardQuery(new Term(FIELD_NAME, "jf?")), Occur.SHOULD); |
| @@ -1875,11 +1875,11 @@ |
| super.setUp(); |
| |
| a = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false); |
| - analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true); |
| + analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET); |
| dir = newDirectory(); |
| ramDir = newDirectory(); |
| IndexWriter writer = new IndexWriter(ramDir, newIndexWriterConfig( |
| - TEST_VERSION_CURRENT, new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true))); |
| + TEST_VERSION_CURRENT, new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET))); |
| for (String text : texts) { |
| addDoc(writer, text); |
| } |
| Index: lucene/highlighter/src/test/org/apache/lucene/search/highlight/custom/HighlightCustomQueryTest.java |
| =================================================================== |
| --- lucene/highlighter/src/test/org/apache/lucene/search/highlight/custom/HighlightCustomQueryTest.java (révision 1477238) |
| +++ lucene/highlighter/src/test/org/apache/lucene/search/highlight/custom/HighlightCustomQueryTest.java (copie de travail) |
| @@ -89,7 +89,7 @@ |
| private String highlightField(Query query, String fieldName, |
| String text) throws IOException, InvalidTokenOffsetsException { |
| TokenStream tokenStream = new MockAnalyzer(random(), MockTokenizer.SIMPLE, |
| - true, MockTokenFilter.ENGLISH_STOPSET, true).tokenStream(fieldName, |
| + true, MockTokenFilter.ENGLISH_STOPSET).tokenStream(fieldName, |
| new StringReader(text)); |
| // Assuming "<B>", "</B>" used to highlight |
| SimpleHTMLFormatter formatter = new SimpleHTMLFormatter(); |
| Index: lucene/memory/src/test/org/apache/lucene/index/memory/MemoryIndexTest.java |
| =================================================================== |
| --- lucene/memory/src/test/org/apache/lucene/index/memory/MemoryIndexTest.java (révision 1477238) |
| +++ lucene/memory/src/test/org/apache/lucene/index/memory/MemoryIndexTest.java (copie de travail) |
| @@ -259,7 +259,7 @@ |
| private Analyzer randomAnalyzer() { |
| switch(random().nextInt(4)) { |
| case 0: return new MockAnalyzer(random(), MockTokenizer.SIMPLE, true); |
| - case 1: return new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true); |
| + case 1: return new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET); |
| case 2: return new Analyzer() { |
| @Override |
| protected TokenStreamComponents createComponents(String fieldName, Reader reader) { |
| Index: lucene/test-framework/src/java/org/apache/lucene/analysis/MockAnalyzer.java |
| =================================================================== |
| --- lucene/test-framework/src/java/org/apache/lucene/analysis/MockAnalyzer.java (révision 1477238) |
| +++ lucene/test-framework/src/java/org/apache/lucene/analysis/MockAnalyzer.java (copie de travail) |
| @@ -17,7 +17,6 @@ |
| * limitations under the License. |
| */ |
| |
| -import java.io.IOException; |
| import java.io.Reader; |
| import java.util.HashMap; |
| import java.util.Map; |
| @@ -46,7 +45,6 @@ |
| private final CharacterRunAutomaton runAutomaton; |
| private final boolean lowerCase; |
| private final CharacterRunAutomaton filter; |
| - private final boolean enablePositionIncrements; |
| private int positionIncrementGap; |
| private final Random random; |
| private Map<String,Integer> previousMappings = new HashMap<String,Integer>(); |
| @@ -60,30 +58,28 @@ |
| * @param runAutomaton DFA describing how tokenization should happen (e.g. [a-zA-Z]+) |
| * @param lowerCase true if the tokenizer should lowercase terms |
| * @param filter DFA describing how terms should be filtered (set of stopwords, etc) |
| - * @param enablePositionIncrements true if position increments should reflect filtered terms. |
| */ |
| - public MockAnalyzer(Random random, CharacterRunAutomaton runAutomaton, boolean lowerCase, CharacterRunAutomaton filter, boolean enablePositionIncrements) { |
| + public MockAnalyzer(Random random, CharacterRunAutomaton runAutomaton, boolean lowerCase, CharacterRunAutomaton filter) { |
| super(new PerFieldReuseStrategy()); |
| // TODO: this should be solved in a different way; Random should not be shared (!). |
| this.random = new Random(random.nextLong()); |
| this.runAutomaton = runAutomaton; |
| this.lowerCase = lowerCase; |
| this.filter = filter; |
| - this.enablePositionIncrements = enablePositionIncrements; |
| } |
| |
| /** |
| - * Calls {@link #MockAnalyzer(Random, CharacterRunAutomaton, boolean, CharacterRunAutomaton, boolean) |
| + * Calls {@link #MockAnalyzer(Random, CharacterRunAutomaton, boolean, CharacterRunAutomaton) |
| * MockAnalyzer(random, runAutomaton, lowerCase, MockTokenFilter.EMPTY_STOPSET, false}). |
| */ |
| public MockAnalyzer(Random random, CharacterRunAutomaton runAutomaton, boolean lowerCase) { |
| - this(random, runAutomaton, lowerCase, MockTokenFilter.EMPTY_STOPSET, true); |
| + this(random, runAutomaton, lowerCase, MockTokenFilter.EMPTY_STOPSET); |
| } |
| |
| /** |
| * Create a Whitespace-lowercasing analyzer with no stopwords removal. |
| * <p> |
| - * Calls {@link #MockAnalyzer(Random, CharacterRunAutomaton, boolean, CharacterRunAutomaton, boolean) |
| + * Calls {@link #MockAnalyzer(Random, CharacterRunAutomaton, boolean, CharacterRunAutomaton) |
| - * MockAnalyzer(random, MockTokenizer.WHITESPACE, true, MockTokenFilter.EMPTY_STOPSET, false}). |
| + * MockAnalyzer(random, MockTokenizer.WHITESPACE, true, MockTokenFilter.EMPTY_STOPSET}). |
| */ |
| public MockAnalyzer(Random random) { |
| @@ -95,7 +91,6 @@ |
| MockTokenizer tokenizer = new MockTokenizer(reader, runAutomaton, lowerCase, maxTokenLength); |
| tokenizer.setEnableChecks(enableChecks); |
| MockTokenFilter filt = new MockTokenFilter(tokenizer, filter); |
| - filt.setEnablePositionIncrements(enablePositionIncrements); |
| return new TokenStreamComponents(tokenizer, maybePayload(filt, fieldName)); |
| } |
| |
| Index: lucene/test-framework/src/java/org/apache/lucene/analysis/MockTokenFilter.java |
| =================================================================== |
| --- lucene/test-framework/src/java/org/apache/lucene/analysis/MockTokenFilter.java (révision 1477238) |
| +++ lucene/test-framework/src/java/org/apache/lucene/analysis/MockTokenFilter.java (copie de travail) |
| @@ -55,7 +55,6 @@ |
| makeString("with")))); |
| |
| private final CharacterRunAutomaton filter; |
| - private boolean enablePositionIncrements = true; |
| |
| private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); |
| private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class); |
| @@ -80,9 +79,7 @@ |
| int skippedPositions = 0; |
| while (input.incrementToken()) { |
| if (!filter.run(termAtt.buffer(), 0, termAtt.length())) { |
| - if (enablePositionIncrements) { |
| - posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions); |
| - } |
| + posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions); |
| return true; |
| } |
| skippedPositions += posIncrAtt.getPositionIncrement(); |
| @@ -90,20 +87,4 @@ |
| // reached EOS -- return false |
| return false; |
| } |
| - |
| - /** |
| - * @see #setEnablePositionIncrements(boolean) |
| - */ |
| - public boolean getEnablePositionIncrements() { |
| - return enablePositionIncrements; |
| - } |
| - |
| - /** |
| - * If <code>true</code>, this Filter will preserve |
| - * positions of the incoming tokens (ie, accumulate and |
| - * set position increments of the removed stop tokens). |
| - */ |
| - public void setEnablePositionIncrements(boolean enable) { |
| - this.enablePositionIncrements = enable; |
| - } |
| } |
| Index: lucene/test-framework/src/java/org/apache/lucene/search/SearchEquivalenceTestBase.java |
| =================================================================== |
| --- lucene/test-framework/src/java/org/apache/lucene/search/SearchEquivalenceTestBase.java (révision 1477238) |
| +++ lucene/test-framework/src/java/org/apache/lucene/search/SearchEquivalenceTestBase.java (copie de travail) |
| @@ -59,7 +59,7 @@ |
| directory = newDirectory(); |
| stopword = "" + randomChar(); |
| CharacterRunAutomaton stopset = new CharacterRunAutomaton(BasicAutomata.makeString(stopword)); |
| - analyzer = new MockAnalyzer(random, MockTokenizer.WHITESPACE, false, stopset, true); |
| + analyzer = new MockAnalyzer(random, MockTokenizer.WHITESPACE, false, stopset); |
| RandomIndexWriter iw = new RandomIndexWriter(random, directory, analyzer); |
| Document doc = new Document(); |
| Field id = new StringField("id", "", Field.Store.NO); |
| Index: lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggesterTest.java |
| =================================================================== |
| --- lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggesterTest.java (révision 1477238) |
| +++ lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggesterTest.java (copie de travail) |
| @@ -164,8 +164,9 @@ |
| new TermFreq("the ghost of christmas past", 50), |
| }; |
| |
| - Analyzer standard = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET, false); |
| + Analyzer standard = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET); |
| AnalyzingSuggester suggester = new AnalyzingSuggester(standard); |
| + suggester.setPreservePositionIncrements(false); |
| suggester.build(new TermFreqArrayIterator(keys)); |
| |
| List<LookupResult> results = suggester.lookup(_TestUtil.stringToCharSequence("the ghost of chris", random()), false, 1); |
| @@ -187,7 +188,7 @@ |
| } |
| |
| public void testEmpty() throws Exception { |
| - Analyzer standard = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET, false); |
| + Analyzer standard = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET); |
| AnalyzingSuggester suggester = new AnalyzingSuggester(standard); |
| suggester.build(new TermFreqArrayIterator(new TermFreq[0])); |
| |
| Index: lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/FuzzySuggesterTest.java |
| =================================================================== |
| --- lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/FuzzySuggesterTest.java (révision 1477238) |
| +++ lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/FuzzySuggesterTest.java (copie de travail) |
| @@ -153,8 +153,9 @@ |
| new TermFreq("the ghost of christmas past", 50), |
| }; |
| |
| - Analyzer standard = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET, false); |
| + Analyzer standard = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET); |
| FuzzySuggester suggester = new FuzzySuggester(standard); |
| + suggester.setPreservePositionIncrements(false); |
| suggester.build(new TermFreqArrayIterator(keys)); |
| |
| List<LookupResult> results = suggester.lookup(_TestUtil.stringToCharSequence("the ghost of chris", random()), false, 1); |
| Index: lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java |
| =================================================================== |
| --- lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java (révision 1477238) |
| +++ lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java (copie de travail) |
| @@ -75,9 +75,9 @@ |
| * example, if you use an analyzer removing stop words, |
| * then the partial text "ghost chr..." could see the |
| * suggestion "The Ghost of Christmas Past". Note that |
| - * your {@code StopFilter} instance must NOT preserve |
| - * position increments for this example to work, so you should call |
| - * {@code setEnablePositionIncrements(false)} on it. |
| + * position increments MUST NOT be preserved for this example |
| + * to work, so you should call |
| + * {@link #setPreservePositionIncrements(boolean) setPreservePositionIncrements(false)}. |
| * |
| * <p> |
| * If SynonymFilter is used to map wifi and wireless network to |
| @@ -185,6 +185,9 @@ |
| |
| private static final int PAYLOAD_SEP = '\u001f'; |
| |
| + /** Whether position holes should appear in the automaton. */ |
| + private boolean preservePositionIncrements; |
| + |
| /** |
| * Calls {@link #AnalyzingSuggester(Analyzer,Analyzer,int,int,int) |
| * AnalyzingSuggester(analyzer, analyzer, EXACT_FIRST | |
| @@ -241,8 +244,15 @@ |
| throw new IllegalArgumentException("maxGraphExpansions must -1 (no limit) or > 0 (got: " + maxGraphExpansions + ")"); |
| } |
| this.maxGraphExpansions = maxGraphExpansions; |
| + preservePositionIncrements = true; |
| } |
| |
| + /** Whether to take position holes (position increment &gt; 1) into account when |
| + * building the automaton, <code>true</code> by default. */ |
| + public void setPreservePositionIncrements(boolean preservePositionIncrements) { |
| + this.preservePositionIncrements = preservePositionIncrements; |
| + } |
| + |
| /** Returns byte size of the underlying FST. */ |
| public long sizeInBytes() { |
| return fst == null ? 0 : fst.sizeInBytes(); |
| @@ -327,13 +337,16 @@ |
| } |
| |
| TokenStreamToAutomaton getTokenStreamToAutomaton() { |
| + final TokenStreamToAutomaton tsta; |
| if (preserveSep) { |
| - return new EscapingTokenStreamToAutomaton(); |
| + tsta = new EscapingTokenStreamToAutomaton(); |
| } else { |
| // When we're not preserving sep, we don't steal 0xff |
| // byte, so we don't need to do any escaping: |
| - return new TokenStreamToAutomaton(); |
| + tsta = new TokenStreamToAutomaton(); |
| } |
| + tsta.setPreservePositionIncrements(preservePositionIncrements); |
| + return tsta; |
| } |
| |
| private static class AnalyzingComparator implements Comparator<BytesRef> { |
| Index: lucene/queryparser/src/test/org/apache/lucene/queryparser/util/QueryParserTestBase.java |
| =================================================================== |
| --- lucene/queryparser/src/test/org/apache/lucene/queryparser/util/QueryParserTestBase.java (révision 1477238) |
| +++ lucene/queryparser/src/test/org/apache/lucene/queryparser/util/QueryParserTestBase.java (copie de travail) |
| @@ -852,7 +852,7 @@ |
| public void testBoost() |
| throws Exception { |
| CharacterRunAutomaton stopWords = new CharacterRunAutomaton(BasicAutomata.makeString("on")); |
| - Analyzer oneStopAnalyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopWords, true); |
| + Analyzer oneStopAnalyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopWords); |
| CommonQueryParserConfiguration qp = getParserConfig(oneStopAnalyzer); |
| Query q = getQuery("on^1.0",qp); |
| assertNotNull(q); |
| @@ -865,7 +865,7 @@ |
| q = getQuery("\"on\"^1.0",qp); |
| assertNotNull(q); |
| |
| - Analyzer a2 = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true); |
| + Analyzer a2 = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET); |
| CommonQueryParserConfiguration qp2 = getParserConfig(a2); |
| q = getQuery("the^3", qp2); |
| // "the" is a stop word so the result is an empty query: |
| @@ -1007,7 +1007,7 @@ |
| |
| public void testStopwords() throws Exception { |
| CharacterRunAutomaton stopSet = new CharacterRunAutomaton(new RegExp("the|foo").toAutomaton()); |
| - CommonQueryParserConfiguration qp = getParserConfig(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopSet, true)); |
| + CommonQueryParserConfiguration qp = getParserConfig(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopSet)); |
| Query result = getQuery("field:the OR field:foo",qp); |
| assertNotNull("result is null and it shouldn't be", result); |
| assertTrue("result is not a BooleanQuery", result instanceof BooleanQuery); |
| @@ -1023,7 +1023,7 @@ |
| } |
| |
| public void testPositionIncrement() throws Exception { |
| - CommonQueryParserConfiguration qp = getParserConfig( new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true)); |
| + CommonQueryParserConfiguration qp = getParserConfig( new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET)); |
| qp.setEnablePositionIncrements(true); |
| String qtxt = "\"the words in poisitions pos02578 are stopped in this phrasequery\""; |
| // 0 2 5 7 8 |
| @@ -1070,7 +1070,7 @@ |
| // "match" |
| public void testPositionIncrements() throws Exception { |
| Directory dir = newDirectory(); |
| - Analyzer a = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true); |
| + Analyzer a = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET); |
| IndexWriter w = new IndexWriter(dir, newIndexWriterConfig( TEST_VERSION_CURRENT, a)); |
| Document doc = new Document(); |
| doc.add(newTextField("field", "the wizard of ozzy", Field.Store.NO)); |
| @@ -1185,7 +1185,7 @@ |
| } |
| |
| public void testPhraseQueryToString() throws Exception { |
| - Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true); |
| + Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET); |
| CommonQueryParserConfiguration qp = getParserConfig(analyzer); |
| qp.setEnablePositionIncrements(true); |
| PhraseQuery q = (PhraseQuery)getQuery("\"this hi this is a test is\"", qp); |
| @@ -1235,26 +1235,13 @@ |
| CharacterRunAutomaton stopStopList = |
| new CharacterRunAutomaton(new RegExp("[sS][tT][oO][pP]").toAutomaton()); |
| |
| - CommonQueryParserConfiguration qp = getParserConfig(new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false, stopStopList, false)); |
| + CommonQueryParserConfiguration qp = getParserConfig(new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false, stopStopList)); |
| |
| - PhraseQuery phraseQuery = new PhraseQuery(); |
| - phraseQuery.add(new Term("field", "1")); |
| - phraseQuery.add(new Term("field", "2")); |
| - |
| - assertEquals(phraseQuery, getQuery("\"1 2\"",qp)); |
| - assertEquals(phraseQuery, getQuery("\"1 stop 2\"",qp)); |
| - |
| - qp.setEnablePositionIncrements(true); |
| - assertEquals(phraseQuery, getQuery("\"1 stop 2\"",qp)); |
| - |
| - qp.setEnablePositionIncrements(false); |
| - assertEquals(phraseQuery, getQuery("\"1 stop 2\"",qp)); |
| - |
| qp = getParserConfig( |
| - new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false, stopStopList, true)); |
| + new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false, stopStopList)); |
| qp.setEnablePositionIncrements(true); |
| |
| - phraseQuery = new PhraseQuery(); |
| + PhraseQuery phraseQuery = new PhraseQuery(); |
| phraseQuery.add(new Term("field", "1")); |
| phraseQuery.add(new Term("field", "2"), 2); |
| assertEquals(phraseQuery, getQuery("\"1 stop 2\"",qp)); |
| Index: lucene/queryparser/src/test/org/apache/lucene/queryparser/flexible/standard/TestQPHelper.java |
| =================================================================== |
| --- lucene/queryparser/src/test/org/apache/lucene/queryparser/flexible/standard/TestQPHelper.java (révision 1477238) |
| +++ lucene/queryparser/src/test/org/apache/lucene/queryparser/flexible/standard/TestQPHelper.java (copie de travail) |
| @@ -946,7 +946,7 @@ |
| |
| public void testBoost() throws Exception { |
| CharacterRunAutomaton stopSet = new CharacterRunAutomaton(BasicAutomata.makeString("on")); |
| - Analyzer oneStopAnalyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopSet, true); |
| + Analyzer oneStopAnalyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopSet); |
| StandardQueryParser qp = new StandardQueryParser(); |
| qp.setAnalyzer(oneStopAnalyzer); |
| |
| @@ -962,7 +962,7 @@ |
| assertNotNull(q); |
| |
| StandardQueryParser qp2 = new StandardQueryParser(); |
| - qp2.setAnalyzer(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true)); |
| + qp2.setAnalyzer(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET)); |
| |
| q = qp2.parse("the^3", "field"); |
| // "the" is a stop word so the result is an empty query: |
| @@ -1179,7 +1179,7 @@ |
| public void testStopwords() throws Exception { |
| StandardQueryParser qp = new StandardQueryParser(); |
| CharacterRunAutomaton stopSet = new CharacterRunAutomaton(new RegExp("the|foo").toAutomaton()); |
| - qp.setAnalyzer(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopSet, true)); |
| + qp.setAnalyzer(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopSet)); |
| |
| Query result = qp.parse("a:the OR a:foo", "a"); |
| assertNotNull("result is null and it shouldn't be", result); |
| @@ -1203,7 +1203,7 @@ |
| public void testPositionIncrement() throws Exception { |
| StandardQueryParser qp = new StandardQueryParser(); |
| qp.setAnalyzer( |
| - new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true)); |
| + new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET)); |
| |
| qp.setEnablePositionIncrements(true); |
| |
| Index: lucene/queryparser/src/test/org/apache/lucene/queryparser/flexible/precedence/TestPrecedenceQueryParser.java |
| =================================================================== |
| --- lucene/queryparser/src/test/org/apache/lucene/queryparser/flexible/precedence/TestPrecedenceQueryParser.java (révision 1477238) |
| +++ lucene/queryparser/src/test/org/apache/lucene/queryparser/flexible/precedence/TestPrecedenceQueryParser.java (copie de travail) |
| @@ -546,7 +546,7 @@ |
| |
| public void testBoost() throws Exception { |
| CharacterRunAutomaton stopSet = new CharacterRunAutomaton(BasicAutomata.makeString("on")); |
| - Analyzer oneStopAnalyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopSet, true); |
| + Analyzer oneStopAnalyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopSet); |
| |
| PrecedenceQueryParser qp = new PrecedenceQueryParser(); |
| qp.setAnalyzer(oneStopAnalyzer); |
| @@ -561,7 +561,7 @@ |
| q = qp.parse("\"on\"^1.0", "field"); |
| assertNotNull(q); |
| |
| - q = getParser(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true)).parse("the^3", |
| + q = getParser(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET)).parse("the^3", |
| "field"); |
| assertNotNull(q); |
| } |
| Index: lucene/queryparser/src/test/org/apache/lucene/queryparser/xml/TestParser.java |
| =================================================================== |
| --- lucene/queryparser/src/test/org/apache/lucene/queryparser/xml/TestParser.java (révision 1477238) |
| +++ lucene/queryparser/src/test/org/apache/lucene/queryparser/xml/TestParser.java (copie de travail) |
| @@ -58,7 +58,7 @@ |
| @BeforeClass |
| public static void beforeClass() throws Exception { |
| // TODO: rewrite test (this needs to set QueryParser.enablePositionIncrements, too, for work with CURRENT): |
| - Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET, false); |
| + Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET); |
| //initialize the parser |
| builder = new CorePlusExtensionsParser("contents", analyzer); |
| |
| Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/ga/TestIrishAnalyzer.java |
| =================================================================== |
| --- lucene/analysis/common/src/test/org/apache/lucene/analysis/ga/TestIrishAnalyzer.java (révision 1477238) |
| +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/ga/TestIrishAnalyzer.java (copie de travail) |
| @@ -61,7 +61,7 @@ |
| Analyzer a = new IrishAnalyzer(TEST_VERSION_CURRENT); |
| assertAnalyzesTo(a, "n-athair", |
| new String[] { "athair" }, |
| - new int[] { 1 }); |
| + new int[] { 2 }); |
| } |
| |
| /** blast some random strings through the analyzer */ |
| Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestTypeTokenFilterFactory.java |
| =================================================================== |
| --- lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestTypeTokenFilterFactory.java (révision 1477238) |
| +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestTypeTokenFilterFactory.java (copie de travail) |
| @@ -50,7 +50,7 @@ |
| public void testCreationWithBlackList() throws Exception { |
| TokenFilterFactory factory = tokenFilterFactory("Type", |
| "types", "stoptypes-1.txt, stoptypes-2.txt", |
| - "enablePositionIncrements", "false"); |
| + "enablePositionIncrements", "true"); |
| NumericTokenStream input = new NumericTokenStream(); |
| input.setIntValue(123); |
| factory.create(input); |
| @@ -59,7 +59,7 @@ |
| public void testCreationWithWhiteList() throws Exception { |
| TokenFilterFactory factory = tokenFilterFactory("Type", |
| "types", "stoptypes-1.txt, stoptypes-2.txt", |
| - "enablePositionIncrements", "false", |
| + "enablePositionIncrements", "true", |
| "useWhitelist", "true"); |
| NumericTokenStream input = new NumericTokenStream(); |
| input.setIntValue(123); |
| Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestStopFilter.java |
| =================================================================== |
| --- lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestStopFilter.java (révision 1477238) |
| +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestStopFilter.java (copie de travail) |
| @@ -75,7 +75,7 @@ |
| doTestStopPositons(stpf,true); |
| // without increments |
| reader = new StringReader(sb.toString()); |
| - stpf = new StopFilter(TEST_VERSION_CURRENT, new MockTokenizer(reader, MockTokenizer.WHITESPACE, false), stopSet); |
| + stpf = new StopFilter(Version.LUCENE_43, new MockTokenizer(reader, MockTokenizer.WHITESPACE, false), stopSet); |
| doTestStopPositons(stpf,false); |
| // with increments, concatenating two stop filters |
| ArrayList<String> a0 = new ArrayList<String>(); |
| @@ -166,7 +166,7 @@ |
| protected TokenStreamComponents createComponents(String fieldName, Reader reader) { |
| Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); |
| TokenFilter filter = new MockSynonymFilter(tokenizer); |
| - StopFilter stopfilter = new StopFilter(TEST_VERSION_CURRENT, filter, StopAnalyzer.ENGLISH_STOP_WORDS_SET); |
| + StopFilter stopfilter = new StopFilter(Version.LUCENE_43, filter, StopAnalyzer.ENGLISH_STOP_WORDS_SET); |
| stopfilter.setEnablePositionIncrements(false); |
| return new TokenStreamComponents(tokenizer, stopfilter); |
| } |
| Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java |
| =================================================================== |
| --- lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java (révision 1477238) |
| +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java (copie de travail) |
| @@ -161,8 +161,6 @@ |
| // startOffset thats > its endOffset |
| // (see LUCENE-3738 for a list of other offenders here) |
| // broken! |
| - Lucene43NGramTokenizer.class, |
| - // broken! |
| EdgeNGramTokenizer.class, |
| // broken! |
| EdgeNGramTokenFilter.class, |
| @@ -182,55 +180,6 @@ |
| private static final Map<Constructor<?>,Predicate<Object[]>> brokenOffsetsConstructors = new HashMap<Constructor<?>, Predicate<Object[]>>(); |
| static { |
| try { |
| - brokenOffsetsConstructors.put( |
| - TrimFilter.class.getConstructor(TokenStream.class, boolean.class), |
| - new Predicate<Object[]>() { |
| - @Override |
| - public boolean apply(Object[] args) { |
| - assert args.length == 2; |
| - return (Boolean) args[1]; // args are broken if updateOffsets is true |
| - } |
| - }); |
| - brokenOffsetsConstructors.put( |
| - TypeTokenFilter.class.getConstructor(boolean.class, TokenStream.class, Set.class, boolean.class), |
| - new Predicate<Object[]>() { |
| - @Override |
| - public boolean apply(Object[] args) { |
| - assert args.length == 4; |
| - // LUCENE-4065: only if you pass 'false' to enablePositionIncrements! |
| - return !(Boolean) args[0]; |
| - } |
| - }); |
| - brokenOffsetsConstructors.put( |
| - TypeTokenFilter.class.getConstructor(boolean.class, TokenStream.class, Set.class), |
| - new Predicate<Object[]>() { |
| - @Override |
| - public boolean apply(Object[] args) { |
| - assert args.length == 3; |
| - // LUCENE-4065: only if you pass 'false' to enablePositionIncrements! |
| - return !(Boolean) args[0]; |
| - } |
| - }); |
| - brokenOffsetsConstructors.put( |
| - LengthFilter.class.getConstructor(boolean.class, TokenStream.class, int.class, int.class), |
| - new Predicate<Object[]>() { |
| - @Override |
| - public boolean apply(Object[] args) { |
| - assert args.length == 4; |
| - // LUCENE-4065: only if you pass 'false' to enablePositionIncrements! |
| - return !(Boolean) args[0]; |
| - } |
| - }); |
| - brokenOffsetsConstructors.put( |
| - KeepWordFilter.class.getConstructor(boolean.class, TokenStream.class, CharArraySet.class), |
| - new Predicate<Object[]>() { |
| - @Override |
| - public boolean apply(Object[] args) { |
| - assert args.length == 3; |
| - // LUCENE-4065: only if you pass 'false' to enablePositionIncrements! |
| - return !(Boolean) args[0]; |
| - } |
| - }); |
| for (Class<?> c : Arrays.<Class<?>>asList( |
| ReversePathHierarchyTokenizer.class, |
| PathHierarchyTokenizer.class, |
| Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestTypeTokenFilter.java |
| =================================================================== |
| --- lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestTypeTokenFilter.java (révision 1477238) |
| +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestTypeTokenFilter.java (copie de travail) |
| @@ -24,6 +24,7 @@ |
| import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; |
| import org.apache.lucene.analysis.tokenattributes.TypeAttribute; |
| import org.apache.lucene.util.English; |
| +import org.apache.lucene.util.Version; |
| |
| import java.io.IOException; |
| import java.io.StringReader; |
| @@ -36,7 +37,7 @@ |
| public void testTypeFilter() throws IOException { |
| StringReader reader = new StringReader("121 is palindrome, while 123 is not"); |
| Set<String> stopTypes = asSet("<NUM>"); |
| - TokenStream stream = new TypeTokenFilter(true, new StandardTokenizer(TEST_VERSION_CURRENT, reader), stopTypes); |
| + TokenStream stream = new TypeTokenFilter(TEST_VERSION_CURRENT, true, new StandardTokenizer(TEST_VERSION_CURRENT, reader), stopTypes); |
| assertTokenStreamContents(stream, new String[]{"is", "palindrome", "while", "is", "not"}); |
| } |
| |
| @@ -59,12 +60,12 @@ |
| |
| // with increments |
| StringReader reader = new StringReader(sb.toString()); |
| - TypeTokenFilter typeTokenFilter = new TypeTokenFilter(true, new StandardTokenizer(TEST_VERSION_CURRENT, reader), stopSet); |
| + TypeTokenFilter typeTokenFilter = new TypeTokenFilter(TEST_VERSION_CURRENT, new StandardTokenizer(TEST_VERSION_CURRENT, reader), stopSet); |
| testPositons(typeTokenFilter); |
| |
| // without increments |
| reader = new StringReader(sb.toString()); |
| - typeTokenFilter = new TypeTokenFilter(false, new StandardTokenizer(TEST_VERSION_CURRENT, reader), stopSet); |
| + typeTokenFilter = new TypeTokenFilter(Version.LUCENE_43, false, new StandardTokenizer(TEST_VERSION_CURRENT, reader), stopSet); |
| testPositons(typeTokenFilter); |
| |
| } |
| @@ -87,7 +88,7 @@ |
| public void testTypeFilterWhitelist() throws IOException { |
| StringReader reader = new StringReader("121 is palindrome, while 123 is not"); |
| Set<String> stopTypes = Collections.singleton("<NUM>"); |
| - TokenStream stream = new TypeTokenFilter(true, new StandardTokenizer(TEST_VERSION_CURRENT, reader), stopTypes, true); |
| + TokenStream stream = new TypeTokenFilter(TEST_VERSION_CURRENT, new StandardTokenizer(TEST_VERSION_CURRENT, reader), stopTypes, true); |
| assertTokenStreamContents(stream, new String[]{"121", "123"}); |
| } |
| |
| Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java |
| =================================================================== |
| --- lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java (révision 1477238) |
| +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java (copie de travail) |
| @@ -306,7 +306,6 @@ |
| Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); |
| StopFilter filter = new StopFilter(TEST_VERSION_CURRENT, |
| tokenizer, StandardAnalyzer.STOP_WORDS_SET); |
| - filter.setEnablePositionIncrements(true); |
| return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(filter, flags, protWords)); |
| } |
| }; |
| Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestTrimFilter.java |
| =================================================================== |
| --- lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestTrimFilter.java (révision 1477238) |
| +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestTrimFilter.java (copie de travail) |
| @@ -29,6 +29,7 @@ |
| import org.apache.lucene.analysis.Tokenizer; |
| import org.apache.lucene.analysis.core.KeywordTokenizer; |
| import org.apache.lucene.analysis.tokenattributes.*; |
| +import org.apache.lucene.util.Version; |
| |
| /** |
| */ |
| @@ -46,7 +47,7 @@ |
| new Token(ccc, 0, ccc.length, 11, 15), |
| new Token(whitespace, 0, whitespace.length, 16, 20), |
| new Token(empty, 0, empty.length, 21, 21)); |
| - ts = new TrimFilter(ts, false); |
| + ts = new TrimFilter(TEST_VERSION_CURRENT, ts, false); |
| |
| assertTokenStreamContents(ts, new String[] { "a", "b", "cCc", "", ""}); |
| |
| @@ -59,7 +60,7 @@ |
| new Token(b, 0, b.length, 0, 2), |
| new Token(ccc, 0, ccc.length, 0, 3), |
| new Token(whitespace, 0, whitespace.length, 0, 3)); |
| - ts = new TrimFilter(ts, true); |
| + ts = new TrimFilter(Version.LUCENE_43, ts, true); |
| |
| assertTokenStreamContents(ts, |
| new String[] { "a", "b", "c", "" }, |
| @@ -120,7 +121,7 @@ |
| @Override |
| protected TokenStreamComponents createComponents(String fieldName, Reader reader) { |
| Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.KEYWORD, false); |
| - return new TokenStreamComponents(tokenizer, new TrimFilter(tokenizer, false)); |
| + return new TokenStreamComponents(tokenizer, new TrimFilter(Version.LUCENE_43, tokenizer, true)); |
| } |
| }; |
| checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER); |
| @@ -130,7 +131,7 @@ |
| @Override |
| protected TokenStreamComponents createComponents(String fieldName, Reader reader) { |
| Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.KEYWORD, false); |
| - return new TokenStreamComponents(tokenizer, new TrimFilter(tokenizer, true)); |
| + return new TokenStreamComponents(tokenizer, new TrimFilter(TEST_VERSION_CURRENT, tokenizer, false)); |
| } |
| }; |
| checkRandomData(random(), b, 1000*RANDOM_MULTIPLIER); |
| @@ -141,7 +142,9 @@ |
| @Override |
| protected TokenStreamComponents createComponents(String fieldName, Reader reader) { |
| Tokenizer tokenizer = new KeywordTokenizer(reader); |
| - return new TokenStreamComponents(tokenizer, new TrimFilter(tokenizer, random().nextBoolean())); |
| + final boolean updateOffsets = random().nextBoolean(); |
| + final Version version = updateOffsets ? Version.LUCENE_43 : TEST_VERSION_CURRENT; |
| + return new TokenStreamComponents(tokenizer, new TrimFilter(version, tokenizer, updateOffsets)); |
| } |
| }; |
| checkOneTermReuse(a, "", ""); |
| Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestLengthFilterFactory.java |
| =================================================================== |
| --- lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestLengthFilterFactory.java (révision 1477238) |
| +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestLengthFilterFactory.java (copie de travail) |
| @@ -22,6 +22,8 @@ |
| import org.apache.lucene.analysis.MockTokenizer; |
| import org.apache.lucene.analysis.TokenStream; |
| import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase; |
| +import org.apache.lucene.analysis.util.ClasspathResourceLoader; |
| +import org.apache.lucene.util.Version; |
| |
| public class TestLengthFilterFactory extends BaseTokenStreamFactoryTestCase { |
| |
| @@ -29,8 +31,10 @@ |
| Reader reader = new StringReader("foo foobar super-duper-trooper"); |
| TokenStream stream = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); |
| stream = tokenFilterFactory("Length", |
| + Version.LUCENE_43, new ClasspathResourceLoader(getClass()), |
| "min", "4", |
| - "max", "10").create(stream); |
| + "max", "10", |
| + "enablePositionIncrements", "false").create(stream); |
| assertTokenStreamContents(stream, new String[] { "foobar" }, new int[] { 1 }); |
| } |
| |
| Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeepWordFilter.java |
| =================================================================== |
| --- lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeepWordFilter.java (révision 1477238) |
| +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeepWordFilter.java (copie de travail) |
| @@ -28,6 +28,7 @@ |
| import org.apache.lucene.analysis.TokenStream; |
| import org.apache.lucene.analysis.Tokenizer; |
| import org.apache.lucene.analysis.util.CharArraySet; |
| +import org.apache.lucene.util.Version; |
| |
| /** Test {@link KeepWordFilter} */ |
| public class TestKeepWordFilter extends BaseTokenStreamTestCase { |
| @@ -42,22 +43,22 @@ |
| |
| // Test Stopwords |
| TokenStream stream = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false); |
| - stream = new KeepWordFilter(true, stream, new CharArraySet(TEST_VERSION_CURRENT, words, true)); |
| + stream = new KeepWordFilter(TEST_VERSION_CURRENT, stream, new CharArraySet(TEST_VERSION_CURRENT, words, true)); |
| assertTokenStreamContents(stream, new String[] { "aaa", "BBB" }, new int[] { 3, 2 }); |
| |
| // Now force case |
| stream = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false); |
| - stream = new KeepWordFilter(true, stream, new CharArraySet(TEST_VERSION_CURRENT,words, false)); |
| + stream = new KeepWordFilter(TEST_VERSION_CURRENT, stream, new CharArraySet(TEST_VERSION_CURRENT,words, false)); |
| assertTokenStreamContents(stream, new String[] { "aaa" }, new int[] { 3 }); |
| |
| // Test Stopwords |
| stream = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false); |
| - stream = new KeepWordFilter(false, stream, new CharArraySet(TEST_VERSION_CURRENT, words, true)); |
| + stream = new KeepWordFilter(Version.LUCENE_43, false, stream, new CharArraySet(TEST_VERSION_CURRENT, words, true)); |
| assertTokenStreamContents(stream, new String[] { "aaa", "BBB" }, new int[] { 1, 1 }); |
| |
| // Now force case |
| stream = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false); |
| - stream = new KeepWordFilter(false, stream, new CharArraySet(TEST_VERSION_CURRENT,words, false)); |
| + stream = new KeepWordFilter(Version.LUCENE_43, false, stream, new CharArraySet(TEST_VERSION_CURRENT,words, false)); |
| assertTokenStreamContents(stream, new String[] { "aaa" }, new int[] { 1 }); |
| } |
| |
| @@ -72,7 +73,7 @@ |
| @Override |
| protected TokenStreamComponents createComponents(String fieldName, Reader reader) { |
| Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); |
| - TokenStream stream = new KeepWordFilter(true, tokenizer, new CharArraySet(TEST_VERSION_CURRENT, words, true)); |
| + TokenStream stream = new KeepWordFilter(TEST_VERSION_CURRENT, tokenizer, new CharArraySet(TEST_VERSION_CURRENT, words, true)); |
| return new TokenStreamComponents(tokenizer, stream); |
| } |
| }; |
| Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestLengthFilter.java |
| =================================================================== |
| --- lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestLengthFilter.java (révision 1477238) |
| +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestLengthFilter.java (copie de travail) |
| @@ -19,6 +19,7 @@ |
| |
| import org.apache.lucene.analysis.*; |
| import org.apache.lucene.analysis.core.KeywordTokenizer; |
| +import org.apache.lucene.util.Version; |
| |
| import java.io.IOException; |
| import java.io.Reader; |
| @@ -29,7 +30,7 @@ |
| public void testFilterNoPosIncr() throws Exception { |
| TokenStream stream = new MockTokenizer( |
| new StringReader("short toolong evenmuchlongertext a ab toolong foo"), MockTokenizer.WHITESPACE, false); |
| - LengthFilter filter = new LengthFilter(false, stream, 2, 6); |
| + LengthFilter filter = new LengthFilter(Version.LUCENE_43, false, stream, 2, 6); |
| assertTokenStreamContents(filter, |
| new String[]{"short", "ab", "foo"}, |
| new int[]{1, 1, 1} |
| @@ -39,7 +40,7 @@ |
| public void testFilterWithPosIncr() throws Exception { |
| TokenStream stream = new MockTokenizer( |
| new StringReader("short toolong evenmuchlongertext a ab toolong foo"), MockTokenizer.WHITESPACE, false); |
| - LengthFilter filter = new LengthFilter(true, stream, 2, 6); |
| + LengthFilter filter = new LengthFilter(TEST_VERSION_CURRENT, stream, 2, 6); |
| assertTokenStreamContents(filter, |
| new String[]{"short", "ab", "foo"}, |
| new int[]{1, 4, 2} |
| @@ -51,7 +52,7 @@ |
| @Override |
| protected TokenStreamComponents createComponents(String fieldName, Reader reader) { |
| Tokenizer tokenizer = new KeywordTokenizer(reader); |
| - return new TokenStreamComponents(tokenizer, new LengthFilter(true, tokenizer, 0, 5)); |
| + return new TokenStreamComponents(tokenizer, new LengthFilter(TEST_VERSION_CURRENT, tokenizer, 0, 5)); |
| } |
| }; |
| checkOneTermReuse(a, "", ""); |
| Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/ga/IrishAnalyzer.java |
| =================================================================== |
| --- lucene/analysis/common/src/java/org/apache/lucene/analysis/ga/IrishAnalyzer.java (révision 1477238) |
| +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/ga/IrishAnalyzer.java (copie de travail) |
| @@ -138,7 +138,9 @@ |
| final Tokenizer source = new StandardTokenizer(matchVersion, reader); |
| TokenStream result = new StandardFilter(matchVersion, source); |
| StopFilter s = new StopFilter(matchVersion, result, HYPHENATIONS); |
| - s.setEnablePositionIncrements(false); |
| + if (!matchVersion.onOrAfter(Version.LUCENE_44)) { |
| + s.setEnablePositionIncrements(false); |
| + } |
| result = s; |
| result = new ElisionFilter(result, DEFAULT_ARTICLES); |
| result = new IrishLowerCaseFilter(result); |
| Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/util/FilteringTokenFilter.java |
| =================================================================== |
| --- lucene/analysis/common/src/java/org/apache/lucene/analysis/util/FilteringTokenFilter.java (révision 1477238) |
| +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/util/FilteringTokenFilter.java (copie de travail) |
| @@ -22,24 +22,54 @@ |
| import org.apache.lucene.analysis.TokenFilter; |
| import org.apache.lucene.analysis.TokenStream; |
| import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; |
| +import org.apache.lucene.util.Version; |
| |
| /** |
| * Abstract base class for TokenFilters that may remove tokens. |
| * You have to implement {@link #accept} and return a boolean if the current |
| * token should be preserved. {@link #incrementToken} uses this method |
| * to decide if a token should be passed to the caller. |
| + * <p><a name="version"></a>As of Lucene 4.4, an {@link IllegalArgumentException} |
| + * is thrown when trying to disable position increments when filtering terms. |
| */ |
| public abstract class FilteringTokenFilter extends TokenFilter { |
| |
| + private static void checkPositionIncrement(Version version, boolean enablePositionIncrements) { |
| + if (!enablePositionIncrements && version.onOrAfter(Version.LUCENE_44)) { |
| + throw new IllegalArgumentException("enablePositionIncrements=false is not supported anymore as of Lucene 4.4 as it can create broken token streams"); |
| + } |
| + } |
| + |
| + protected final Version version; |
| private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class); |
| private boolean enablePositionIncrements; // no init needed, as ctor enforces setting value! |
| - private boolean first = true; // only used when not preserving gaps |
| + private boolean first = true; |
| |
| - public FilteringTokenFilter(boolean enablePositionIncrements, TokenStream input){ |
| - super(input); |
| + /** |
| + * Create a new {@link FilteringTokenFilter}. |
| + * @param version the Lucene match <a href="#version">version</a> |
| + * @param enablePositionIncrements whether to preserve the position gaps left by filtered-out terms |
| + * @param input the input to consume |
| + * @deprecated enablePositionIncrements=false is not supported anymore as of Lucene 4.4 |
| + */ |
| + @Deprecated |
| + public FilteringTokenFilter(Version version, boolean enablePositionIncrements, TokenStream input){ |
| + this(version, input); |
| + checkPositionIncrement(version, enablePositionIncrements); |
| this.enablePositionIncrements = enablePositionIncrements; |
| } |
| |
| + /** |
| + * Create a new {@link FilteringTokenFilter}. |
| + * @param version the Lucene match version |
| + * @param in the {@link TokenStream} to consume |
| + */ |
| + public FilteringTokenFilter(Version version, TokenStream in) { |
| + super(in); |
| + this.version = version; |
| + this.enablePositionIncrements = true; |
| + } |
| + |
| /** Override this method and return if the current input token should be returned by {@link #incrementToken}. */ |
| protected abstract boolean accept() throws IOException; |
| |
| @@ -102,8 +132,11 @@ |
| * <p> <b>NOTE</b>: be sure to also |
| * set org.apache.lucene.queryparser.classic.QueryParser#setEnablePositionIncrements if |
| * you use QueryParser to create queries. |
| + * @deprecated enablePositionIncrements=false is not supported anymore as of Lucene 4.4 |
| */ |
| + @Deprecated |
| public void setEnablePositionIncrements(boolean enable) { |
| + checkPositionIncrement(version, enable); |
| this.enablePositionIncrements = enable; |
| } |
| } |
| Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/core/TypeTokenFilterFactory.java |
| =================================================================== |
| --- lucene/analysis/common/src/java/org/apache/lucene/analysis/core/TypeTokenFilterFactory.java (révision 1477238) |
| +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/core/TypeTokenFilterFactory.java (copie de travail) |
| @@ -35,7 +35,7 @@ |
| * <analyzer> |
| * <tokenizer class="solr.StandardTokenizerFactory"/> |
| * <filter class="solr.TypeTokenFilterFactory" types="stoptypes.txt" |
| - * enablePositionIncrements="true" useWhitelist="false"/> |
| + * useWhitelist="false"/> |
| * </analyzer> |
| * </fieldType></pre> |
| */ |
| @@ -49,7 +49,7 @@ |
| public TypeTokenFilterFactory(Map<String,String> args) { |
| super(args); |
| stopTypesFiles = require(args, "types"); |
| - enablePositionIncrements = getBoolean(args, "enablePositionIncrements", false); |
| + enablePositionIncrements = getBoolean(args, "enablePositionIncrements", true); |
| useWhitelist = getBoolean(args, "useWhitelist", false); |
| if (!args.isEmpty()) { |
| throw new IllegalArgumentException("Unknown parameters: " + args); |
| @@ -78,6 +78,8 @@ |
| |
| @Override |
| public TokenStream create(TokenStream input) { |
| - return new TypeTokenFilter(enablePositionIncrements, input, stopTypes, useWhitelist); |
| + @SuppressWarnings("deprecation") |
| + final TokenStream filter = new TypeTokenFilter(luceneMatchVersion, enablePositionIncrements, input, stopTypes, useWhitelist); |
| + return filter; |
| } |
| } |
| Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/core/TypeTokenFilter.java |
| =================================================================== |
| --- lucene/analysis/common/src/java/org/apache/lucene/analysis/core/TypeTokenFilter.java (révision 1477238) |
| +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/core/TypeTokenFilter.java (copie de travail) |
| @@ -17,13 +17,13 @@ |
| * limitations under the License. |
| */ |
| |
| +import java.util.Set; |
| + |
| import org.apache.lucene.analysis.TokenStream; |
| import org.apache.lucene.analysis.tokenattributes.TypeAttribute; |
| import org.apache.lucene.analysis.util.FilteringTokenFilter; |
| +import org.apache.lucene.util.Version; |
| |
| -import java.io.IOException; |
| -import java.util.Set; |
| - |
| /** |
| * Removes tokens whose types appear in a set of blocked types from a token stream. |
| */ |
| @@ -33,17 +33,44 @@ |
| private final TypeAttribute typeAttribute = addAttribute(TypeAttribute.class); |
| private final boolean useWhiteList; |
| |
| - public TypeTokenFilter(boolean enablePositionIncrements, TokenStream input, Set<String> stopTypes, boolean useWhiteList) { |
| - super(enablePositionIncrements, input); |
| + /** @deprecated enablePositionIncrements=false is not supported anymore as of Lucene 4.4. */ |
| + @Deprecated |
| + public TypeTokenFilter(Version version, boolean enablePositionIncrements, TokenStream input, Set<String> stopTypes, boolean useWhiteList) { |
| + super(version, enablePositionIncrements, input); |
| this.stopTypes = stopTypes; |
| this.useWhiteList = useWhiteList; |
| } |
| |
| - public TypeTokenFilter(boolean enablePositionIncrements, TokenStream input, Set<String> stopTypes) { |
| - this(enablePositionIncrements, input, stopTypes, false); |
| + /** @deprecated enablePositionIncrements=false is not supported anymore as of Lucene 4.4. */ |
| + @Deprecated |
| + public TypeTokenFilter(Version version, boolean enablePositionIncrements, TokenStream input, Set<String> stopTypes) { |
| + this(version, enablePositionIncrements, input, stopTypes, false); |
| } |
| |
| /** |
| + * Create a new {@link TypeTokenFilter}. |
| + * @param version the Lucene match version |
| + * @param input the {@link TokenStream} to consume |
| + * @param stopTypes the types to filter |
| + * @param useWhiteList if true, then tokens whose type is in stopTypes will |
| + * be kept, otherwise they will be filtered out |
| + */ |
| + public TypeTokenFilter(Version version, TokenStream input, Set<String> stopTypes, boolean useWhiteList) { |
| + super(version, input); |
| + this.stopTypes = stopTypes; |
| + this.useWhiteList = useWhiteList; |
| + } |
| + |
| + /** |
| + * Create a new {@link TypeTokenFilter} that filters tokens out |
| + * (useWhiteList=false). |
| + * @see #TypeTokenFilter(Version, TokenStream, Set, boolean) |
| + */ |
| + public TypeTokenFilter(Version version, TokenStream input, Set<String> stopTypes) { |
| + this(version, input, stopTypes, false); |
| + } |
| + |
| + /** |
| * By default accept the token if its type is not a stop type. |
| * When the useWhiteList parameter is set to true then accept the token if its type is contained in the stopTypes |
| */ |
| Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/core/StopFilter.java |
| =================================================================== |
| --- lucene/analysis/common/src/java/org/apache/lucene/analysis/core/StopFilter.java (révision 1477238) |
| +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/core/StopFilter.java (copie de travail) |
| @@ -57,7 +57,7 @@ |
| * @see #makeStopSet(Version, java.lang.String...) |
| */ |
| public StopFilter(Version matchVersion, TokenStream in, CharArraySet stopWords) { |
| - super(true, in); |
| + super(matchVersion, in); |
| this.stopWords = stopWords; |
| } |
| |
| Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/core/StopFilterFactory.java |
| =================================================================== |
| --- lucene/analysis/common/src/java/org/apache/lucene/analysis/core/StopFilterFactory.java (révision 1477238) |
| +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/core/StopFilterFactory.java (copie de travail) |
| @@ -51,7 +51,7 @@ |
| stopWordFiles = get(args, "words"); |
| format = get(args, "format"); |
| ignoreCase = getBoolean(args, "ignoreCase", false); |
| - enablePositionIncrements = getBoolean(args, "enablePositionIncrements", false); |
| + enablePositionIncrements = getBoolean(args, "enablePositionIncrements", true); |
| if (!args.isEmpty()) { |
| throw new IllegalArgumentException("Unknown parameters: " + args); |
| } |
| Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LengthFilter.java |
| =================================================================== |
| --- lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LengthFilter.java (révision 1477238) |
| +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LengthFilter.java (copie de travail) |
| @@ -20,6 +20,7 @@ |
| import org.apache.lucene.analysis.TokenStream; |
| import org.apache.lucene.analysis.util.FilteringTokenFilter; |
| import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; |
| +import org.apache.lucene.util.Version; |
| |
| /** |
| * Removes words that are too long or too short from the stream. |
| @@ -34,16 +35,29 @@ |
| |
| private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); |
| |
| + /** @deprecated enablePositionIncrements=false is not supported anymore as of Lucene 4.4. */ |
| + @Deprecated |
| + public LengthFilter(Version version, boolean enablePositionIncrements, TokenStream in, int min, int max) { |
| + super(version, enablePositionIncrements, in); |
| + this.min = min; |
| + this.max = max; |
| + } |
| + |
| /** |
| - * Build a filter that removes words that are too long or too |
| - * short from the text. |
| + * Create a new {@link LengthFilter}. This will filter out tokens whose |
| + * {@link CharTermAttribute} is either too short ({@link CharTermAttribute#length()} |
| + * < min) or too long ({@link CharTermAttribute#length()} > max). |
| + * @param version the Lucene match version |
| + * @param in the {@link TokenStream} to consume |
| + * @param min the minimum length |
| + * @param max the maximum length |
| */ |
| - public LengthFilter(boolean enablePositionIncrements, TokenStream in, int min, int max) { |
| - super(enablePositionIncrements, in); |
| + public LengthFilter(Version version, TokenStream in, int min, int max) { |
| + super(version, in); |
| this.min = min; |
| this.max = max; |
| } |
| - |
| + |
| @Override |
| public boolean accept() { |
| final int len = termAtt.length(); |
| Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LengthFilterFactory.java |
| =================================================================== |
| --- lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LengthFilterFactory.java (révision 1477238) |
| +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LengthFilterFactory.java (copie de travail) |
| @@ -17,18 +17,18 @@ |
| * limitations under the License. |
| */ |
| |
| +import java.util.Map; |
| + |
| import org.apache.lucene.analysis.TokenStream; |
| import org.apache.lucene.analysis.util.TokenFilterFactory; |
| |
| -import java.util.Map; |
| - |
| /** |
| * Factory for {@link LengthFilter}. |
| * <pre class="prettyprint"> |
| * <fieldType name="text_lngth" class="solr.TextField" positionIncrementGap="100"> |
| * <analyzer> |
| * <tokenizer class="solr.WhitespaceTokenizerFactory"/> |
| - * <filter class="solr.LengthFilterFactory" min="0" max="1" enablePositionIncrements="false"/> |
| + * <filter class="solr.LengthFilterFactory" min="0" max="1" /> |
| * </analyzer> |
| * </fieldType></pre> |
| */ |
| @@ -44,7 +44,7 @@ |
| super(args); |
| min = requireInt(args, MIN_KEY); |
| max = requireInt(args, MAX_KEY); |
| - enablePositionIncrements = getBoolean(args, "enablePositionIncrements", false); |
| + enablePositionIncrements = getBoolean(args, "enablePositionIncrements", true); |
| if (!args.isEmpty()) { |
| throw new IllegalArgumentException("Unknown parameters: " + args); |
| } |
| @@ -52,6 +52,8 @@ |
| |
| @Override |
| public LengthFilter create(TokenStream input) { |
| - return new LengthFilter(enablePositionIncrements, input,min,max); |
| + @SuppressWarnings("deprecation") |
| + final LengthFilter filter = new LengthFilter(luceneMatchVersion, enablePositionIncrements, input,min,max); |
| + return filter; |
| } |
| } |
| Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/TrimFilterFactory.java |
| =================================================================== |
| --- lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/TrimFilterFactory.java (révision 1477238) |
| +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/TrimFilterFactory.java (copie de travail) |
| @@ -29,7 +29,7 @@ |
| * <fieldType name="text_trm" class="solr.TextField" positionIncrementGap="100"> |
| * <analyzer> |
| * <tokenizer class="solr.NGramTokenizerFactory"/> |
| - * <filter class="solr.TrimFilterFactory" updateOffsets="false"/> |
| + * <filter class="solr.TrimFilterFactory" /> |
| * </analyzer> |
| * </fieldType></pre> |
| * |
| @@ -50,6 +50,8 @@ |
| |
| @Override |
| public TrimFilter create(TokenStream input) { |
| - return new TrimFilter(input, updateOffsets); |
| + @SuppressWarnings("deprecation") |
| + final TrimFilter filter = new TrimFilter(luceneMatchVersion, input, updateOffsets); |
| + return filter; |
| } |
| } |
| Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeepWordFilter.java |
| =================================================================== |
| --- lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeepWordFilter.java (révision 1477238) |
| +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeepWordFilter.java (copie de travail) |
| @@ -21,6 +21,7 @@ |
| import org.apache.lucene.analysis.util.FilteringTokenFilter; |
| import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; |
| import org.apache.lucene.analysis.util.CharArraySet; |
| +import org.apache.lucene.util.Version; |
| |
| /** |
| * A TokenFilter that only keeps tokens with text contained in the |
| @@ -32,13 +33,26 @@ |
| private final CharArraySet words; |
| private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); |
| |
| - /** The words set passed to this constructor will be directly used by this filter |
| - * and should not be modified, */ |
| - public KeepWordFilter(boolean enablePositionIncrements, TokenStream in, CharArraySet words) { |
| - super(enablePositionIncrements, in); |
| + /** @deprecated enablePositionIncrements=false is not supported anymore as of Lucene 4.4. */ |
| + @Deprecated |
| + public KeepWordFilter(Version version, boolean enablePositionIncrements, TokenStream in, CharArraySet words) { |
| + super(version, enablePositionIncrements, in); |
| this.words = words; |
| } |
| |
| + /** |
| + * Create a new {@link KeepWordFilter}. |
| + * <p><b>NOTE</b>: The words set passed to this constructor will be directly |
| + * used by this filter and should not be modified. |
| + * @param version the Lucene match version |
| + * @param in the {@link TokenStream} to consume |
| + * @param words the words to keep |
| + */ |
| + public KeepWordFilter(Version version, TokenStream in, CharArraySet words) { |
| + super(version, in); |
| + this.words = words; |
| + } |
| + |
| @Override |
| public boolean accept() { |
| return words.contains(termAtt.buffer(), 0, termAtt.length()); |
| Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/TrimFilter.java |
| =================================================================== |
| --- lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/TrimFilter.java (révision 1477238) |
| +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/TrimFilter.java (copie de travail) |
| @@ -21,11 +21,14 @@ |
| import org.apache.lucene.analysis.TokenStream; |
| import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; |
| import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; |
| +import org.apache.lucene.util.Version; |
| |
| import java.io.IOException; |
| |
| /** |
| * Trims leading and trailing whitespace from Tokens in the stream. |
| + * <p>As of Lucene 4.4, this filter does not support updateOffsets=true anymore |
| + * as it can lead to broken token streams. |
| */ |
| public final class TrimFilter extends TokenFilter { |
| |
| @@ -33,12 +36,27 @@ |
| private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); |
| private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); |
| |
| - |
| - public TrimFilter(TokenStream in, boolean updateOffsets) { |
| + /** |
| + * Create a new {@link TrimFilter}. |
| + * @param version the Lucene match version |
| + * @param in the stream to consume |
| + * @param updateOffsets whether to update offsets |
| + * @deprecated Offset updates are not supported anymore as of Lucene 4.4. |
| + */ |
| + @Deprecated |
| + public TrimFilter(Version version, TokenStream in, boolean updateOffsets) { |
| super(in); |
| + if (updateOffsets && version.onOrAfter(Version.LUCENE_44)) { |
| + throw new IllegalArgumentException("updateOffsets=true is not supported anymore as of Lucene 4.4"); |
| + } |
| this.updateOffsets = updateOffsets; |
| } |
| |
| + /** Create a new {@link TrimFilter} on top of <code>in</code>. */ |
| + public TrimFilter(Version version, TokenStream in) { |
| + this(version, in, false); |
| + } |
| + |
| @Override |
| public boolean incrementToken() throws IOException { |
| if (!input.incrementToken()) return false; |
| @@ -55,11 +73,10 @@ |
| int endOff = 0; |
| |
| // eat the first characters |
| - //QUESTION: Should we use Character.isWhitespace() instead? |
| - for (start = 0; start < len && termBuffer[start] <= ' '; start++) { |
| + for (start = 0; start < len && Character.isWhitespace(termBuffer[start]); start++) { |
| } |
| // eat the end characters |
| - for (end = len; end >= start && termBuffer[end - 1] <= ' '; end--) { |
| + for (end = len; end >= start && Character.isWhitespace(termBuffer[end - 1]); end--) { |
| endOff++; |
| } |
| if (start > 0 || end < len) { |
| Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeepWordFilterFactory.java |
| =================================================================== |
| --- lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeepWordFilterFactory.java (révision 1477238) |
| +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeepWordFilterFactory.java (copie de travail) |
| @@ -32,7 +32,7 @@ |
| * <fieldType name="text_keepword" class="solr.TextField" positionIncrementGap="100"> |
| * <analyzer> |
| * <tokenizer class="solr.WhitespaceTokenizerFactory"/> |
| - * <filter class="solr.KeepWordFilterFactory" words="keepwords.txt" ignoreCase="false" enablePositionIncrements="false"/> |
| + * <filter class="solr.KeepWordFilterFactory" words="keepwords.txt" ignoreCase="false"/> |
| * </analyzer> |
| * </fieldType></pre> |
| */ |
| @@ -48,7 +48,7 @@ |
| assureMatchVersion(); |
| wordFiles = get(args, "words"); |
| ignoreCase = getBoolean(args, "ignoreCase", false); |
| - enablePositionIncrements = getBoolean(args, "enablePositionIncrements", false); |
| + enablePositionIncrements = getBoolean(args, "enablePositionIncrements", true); |
| if (!args.isEmpty()) { |
| throw new IllegalArgumentException("Unknown parameters: " + args); |
| } |
| @@ -76,6 +76,12 @@ |
| @Override |
| public TokenStream create(TokenStream input) { |
| // if the set is null, it means it was empty |
| - return words == null ? input : new KeepWordFilter(enablePositionIncrements, input, words); |
| + if (words == null) { |
| + return input; |
| + } else { |
| + @SuppressWarnings("deprecation") |
| + final TokenStream filter = new KeepWordFilter(luceneMatchVersion, enablePositionIncrements, input, words); |
| + return filter; |
| + } |
| } |
| } |
| Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java |
| =================================================================== |
| --- lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java (révision 1477238) |
| +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java (copie de travail) |
| @@ -73,7 +73,7 @@ |
| * @param maxGram the largest n-gram to generate |
| */ |
| public NGramTokenFilter(Version version, TokenStream input, int minGram, int maxGram) { |
| - super(new LengthFilter(true, input, minGram, Integer.MAX_VALUE)); |
| + super(new LengthFilter(version, input, minGram, Integer.MAX_VALUE)); |
| this.version = version; |
| if (minGram < 1) { |
| throw new IllegalArgumentException("minGram must be greater than zero"); |
| Index: lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseAnalyzer.java |
| =================================================================== |
| --- lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseAnalyzer.java (révision 1477238) |
| +++ lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseAnalyzer.java (copie de travail) |
| @@ -89,7 +89,7 @@ |
| protected TokenStreamComponents createComponents(String fieldName, Reader reader) { |
| Tokenizer tokenizer = new JapaneseTokenizer(reader, userDict, true, mode); |
| TokenStream stream = new JapaneseBaseFormFilter(tokenizer); |
| - stream = new JapanesePartOfSpeechStopFilter(true, stream, stoptags); |
| + stream = new JapanesePartOfSpeechStopFilter(matchVersion, stream, stoptags); |
| stream = new CJKWidthFilter(stream); |
| stream = new StopFilter(matchVersion, stream, stopwords); |
| stream = new JapaneseKatakanaStemFilter(stream); |
| Index: lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapanesePartOfSpeechStopFilterFactory.java |
| =================================================================== |
| --- lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapanesePartOfSpeechStopFilterFactory.java (révision 1477238) |
| +++ lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapanesePartOfSpeechStopFilterFactory.java (copie de travail) |
| @@ -50,7 +50,7 @@ |
| public JapanesePartOfSpeechStopFilterFactory(Map<String,String> args) { |
| super(args); |
| stopTagFiles = get(args, "tags"); |
| - enablePositionIncrements = getBoolean(args, "enablePositionIncrements", false); |
| + enablePositionIncrements = getBoolean(args, "enablePositionIncrements", true); |
| if (!args.isEmpty()) { |
| throw new IllegalArgumentException("Unknown parameters: " + args); |
| } |
| @@ -72,6 +72,12 @@ |
| @Override |
| public TokenStream create(TokenStream stream) { |
| // if stoptags is null, it means the file is empty |
| - return stopTags == null ? stream : new JapanesePartOfSpeechStopFilter(enablePositionIncrements, stream, stopTags); |
| + if (stopTags != null) { |
| + @SuppressWarnings("deprecation") |
| + final TokenStream filter = new JapanesePartOfSpeechStopFilter(luceneMatchVersion, enablePositionIncrements, stream, stopTags); |
| + return filter; |
| + } else { |
| + return stream; |
| + } |
| } |
| } |
| Index: lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapanesePartOfSpeechStopFilter.java |
| =================================================================== |
| --- lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapanesePartOfSpeechStopFilter.java (révision 1477238) |
| +++ lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapanesePartOfSpeechStopFilter.java (copie de travail) |
| @@ -22,6 +22,7 @@ |
| import org.apache.lucene.analysis.ja.tokenattributes.PartOfSpeechAttribute; |
| import org.apache.lucene.analysis.util.FilteringTokenFilter; |
| import org.apache.lucene.analysis.TokenStream; |
| +import org.apache.lucene.util.Version; |
| |
| /** |
| * Removes tokens that match a set of part-of-speech tags. |
| @@ -30,11 +31,24 @@ |
| private final Set<String> stopTags; |
| private final PartOfSpeechAttribute posAtt = addAttribute(PartOfSpeechAttribute.class); |
| |
| - public JapanesePartOfSpeechStopFilter(boolean enablePositionIncrements, TokenStream input, Set<String> stopTags) { |
| - super(enablePositionIncrements, input); |
| + /** @deprecated enablePositionIncrements=false is not supported anymore as of Lucene 4.4. */ |
| + @Deprecated |
| + public JapanesePartOfSpeechStopFilter(Version version, boolean enablePositionIncrements, TokenStream input, Set<String> stopTags) { |
| + super(version, enablePositionIncrements, input); |
| this.stopTags = stopTags; |
| } |
| |
| + /** |
| + * Create a new {@link JapanesePartOfSpeechStopFilter}. |
| + * @param version the Lucene match version |
| + * @param input the {@link TokenStream} to consume |
| + * @param stopTags the part-of-speech tags that should be removed |
| + */ |
| + public JapanesePartOfSpeechStopFilter(Version version, TokenStream input, Set<String> stopTags) { |
| + super(version, input); |
| + this.stopTags = stopTags; |
| + } |
| + |
| @Override |
| protected boolean accept() { |
| final String pos = posAtt.getPartOfSpeech(); |