Index: solr/core/src/test/org/apache/solr/handler/component/TermVectorComponentTest.java
===================================================================
--- solr/core/src/test/org/apache/solr/handler/component/TermVectorComponentTest.java (revision 1477238)
+++ solr/core/src/test/org/apache/solr/handler/component/TermVectorComponentTest.java (working copy)
@@ -201,12 +201,12 @@
public void testOptions() throws Exception {
assertJQ(req("json.nl","map", "qt",tv, "q", "id:0", TermVectorComponent.COMPONENT_NAME, "true"
, TermVectorParams.TF, "true", TermVectorParams.DF, "true", TermVectorParams.OFFSETS, "true", TermVectorParams.POSITIONS, "true", TermVectorParams.TF_IDF, "true")
- ,"/termVectors/0/test_posofftv/anoth=={'tf':1, 'offsets':{'start':20, 'end':27}, 'positions':{'position':1}, 'df':2, 'tf-idf':0.5}"
+ ,"/termVectors/0/test_posofftv/anoth=={'tf':1, 'offsets':{'start':20, 'end':27}, 'positions':{'position':5}, 'df':2, 'tf-idf':0.5}"
);
assertJQ(req("json.nl","map", "qt",tv, "q", "id:0", TermVectorComponent.COMPONENT_NAME, "true"
, TermVectorParams.ALL, "true")
- ,"/termVectors/0/test_posofftv/anoth=={'tf':1, 'offsets':{'start':20, 'end':27}, 'positions':{'position':1}, 'df':2, 'tf-idf':0.5}"
+ ,"/termVectors/0/test_posofftv/anoth=={'tf':1, 'offsets':{'start':20, 'end':27}, 'positions':{'position':5}, 'df':2, 'tf-idf':0.5}"
);
// test each combination at random
@@ -214,7 +214,7 @@
list.addAll(Arrays.asList("json.nl","map", "qt",tv, "q", "id:0", TermVectorComponent.COMPONENT_NAME, "true"));
String[][] options = new String[][] { { TermVectorParams.TF, "'tf':1" },
{ TermVectorParams.OFFSETS, "'offsets':{'start':20, 'end':27}" },
- { TermVectorParams.POSITIONS, "'positions':{'position':1}" },
+ { TermVectorParams.POSITIONS, "'positions':{'position':5}" },
{ TermVectorParams.DF, "'df':2" },
{ TermVectorParams.TF_IDF, "'tf-idf':0.5" } };
StringBuilder expected = new StringBuilder("/termVectors/0/test_posofftv/anoth=={");
@@ -249,7 +249,7 @@
,"f.test_basictv." + TermVectorParams.TF_IDF, "false"
)
,"/termVectors/0/test_basictv=={'anoth':{},'titl':{}}"
- ,"/termVectors/0/test_postv/anoth=={'tf':1, 'positions':{'position':1}, 'df':2, 'tf-idf':0.5}"
+ ,"/termVectors/0/test_postv/anoth=={'tf':1, 'positions':{'position':5}, 'df':2, 'tf-idf':0.5}"
,"/termVectors/0/test_offtv/anoth=={'tf':1, 'df':2, 'tf-idf':0.5}"
,"/termVectors/warnings=={ 'noTermVectors':['test_notv'], 'noPositions':['test_basictv', 'test_offtv'], 'noOffsets':['test_basictv', 'test_postv']}"
);
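Note: the expected position of "anoth" changes from 1 to 5 because StopFilter now always counts removed stop words in position increments, so stored term-vector positions include the gaps. A minimal sketch of reading such a position back through the Lucene 4.x term-vector API (document id, field and term taken from the test above):

    Terms vector = reader.getTermVector(0, "test_posofftv");
    TermsEnum termsEnum = vector.iterator(null);
    if (termsEnum.seekExact(new BytesRef("anoth"))) {
      DocsAndPositionsEnum positions = termsEnum.docsAndPositions(null, null);
      positions.nextDoc();
      int position = positions.nextPosition(); // now 5: removed stop words leave holes
    }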
Index: solr/core/src/test/org/apache/solr/handler/DocumentAnalysisRequestHandlerTest.java
===================================================================
--- solr/core/src/test/org/apache/solr/handler/DocumentAnalysisRequestHandlerTest.java (revision 1477238)
+++ solr/core/src/test/org/apache/solr/handler/DocumentAnalysisRequestHandlerTest.java (working copy)
@@ -323,16 +323,16 @@
tokenList = valueResult.get("org.apache.lucene.analysis.core.StopFilter");
assertNotNull("Expecting the 'StopFilter' to be applied on the index for the 'text' field", tokenList);
assertEquals("Expecting 4 tokens after stop word removal", 4, tokenList.size());
- assertToken(tokenList.get(0), new TokenInfo("fox", null, "<ALPHANUM>", 4, 7, 1, new int[]{2,2,2,1}, null, false));
- assertToken(tokenList.get(1), new TokenInfo("jumped", null, "<ALPHANUM>", 8, 14, 2, new int[]{3,3,3,2}, null, false));
- assertToken(tokenList.get(2), new TokenInfo("over", null, "<ALPHANUM>", 15, 19, 3, new int[]{4,4,4,3}, null, false));
- assertToken(tokenList.get(3), new TokenInfo("dogs", null, "<ALPHANUM>", 24, 28, 4, new int[]{6,6,6,4}, null, false));
+ assertToken(tokenList.get(0), new TokenInfo("fox", null, "<ALPHANUM>", 4, 7, 2, new int[]{2,2,2,2}, null, false));
+ assertToken(tokenList.get(1), new TokenInfo("jumped", null, "<ALPHANUM>", 8, 14, 3, new int[]{3,3,3,3}, null, false));
+ assertToken(tokenList.get(2), new TokenInfo("over", null, "<ALPHANUM>", 15, 19, 4, new int[]{4,4,4,4}, null, false));
+ assertToken(tokenList.get(3), new TokenInfo("dogs", null, "<ALPHANUM>", 24, 28, 6, new int[]{6,6,6,6}, null, false));
tokenList = valueResult.get("org.apache.lucene.analysis.en.PorterStemFilter");
assertNotNull("Expecting the 'PorterStemFilter' to be applied on the index for the 'text' field", tokenList);
assertEquals("Expecting 4 tokens", 4, tokenList.size());
- assertToken(tokenList.get(0), new TokenInfo("fox", null, "<ALPHANUM>", 4, 7, 1, new int[]{2,2,2,1,1}, null, false));
- assertToken(tokenList.get(1), new TokenInfo("jump", null, "<ALPHANUM>", 8, 14, 2, new int[]{3,3,3,2,2}, null, true));
- assertToken(tokenList.get(2), new TokenInfo("over", null, "<ALPHANUM>", 15, 19, 3, new int[]{4,4,4,3,3}, null, false));
- assertToken(tokenList.get(3), new TokenInfo("dog", null, "<ALPHANUM>", 24, 28, 4, new int[]{6,6,6,4,4}, null, false));
+ assertToken(tokenList.get(0), new TokenInfo("fox", null, "<ALPHANUM>", 4, 7, 2, new int[]{2,2,2,2,2}, null, false));
+ assertToken(tokenList.get(1), new TokenInfo("jump", null, "<ALPHANUM>", 8, 14, 3, new int[]{3,3,3,3,3}, null, true));
+ assertToken(tokenList.get(2), new TokenInfo("over", null, "<ALPHANUM>", 15, 19, 4, new int[]{4,4,4,4,4}, null, false));
+ assertToken(tokenList.get(3), new TokenInfo("dog", null, "<ALPHANUM>", 24, 28, 6, new int[]{6,6,6,6,6}, null, false));
}
}
Index: solr/core/src/test/org/apache/solr/handler/FieldAnalysisRequestHandlerTest.java
===================================================================
--- solr/core/src/test/org/apache/solr/handler/FieldAnalysisRequestHandlerTest.java (revision 1477238)
+++ solr/core/src/test/org/apache/solr/handler/FieldAnalysisRequestHandlerTest.java (working copy)
@@ -178,25 +178,25 @@
tokenList = indexPart.get("org.apache.lucene.analysis.core.StopFilter");
assertNotNull("Expcting StopFilter analysis breakdown", tokenList);
assertEquals(tokenList.size(), 8);
- assertToken(tokenList.get(0), new TokenInfo("quick", null, "<ALPHANUM>", 4, 9, 1, new int[]{2,2,2,1}, null, false));
- assertToken(tokenList.get(1), new TokenInfo("red", null, "<ALPHANUM>", 10, 13, 2, new int[]{3,3,3,2}, null, false));
- assertToken(tokenList.get(2), new TokenInfo("fox", null, "<ALPHANUM>", 14, 17, 3, new int[]{4,4,4,3}, null, true));
- assertToken(tokenList.get(3), new TokenInfo("jumped", null, "<ALPHANUM>", 18, 24, 4, new int[]{5,5,5,4}, null, false));
- assertToken(tokenList.get(4), new TokenInfo("over", null, "<ALPHANUM>", 25, 29, 5, new int[]{6,6,6,5}, null, false));
- assertToken(tokenList.get(5), new TokenInfo("lazy", null, "<ALPHANUM>", 34, 38, 6, new int[]{8,8,8,6}, null, false));
- assertToken(tokenList.get(6), new TokenInfo("brown", null, "<ALPHANUM>", 39, 44, 7, new int[]{9,9,9,7}, null, true));
- assertToken(tokenList.get(7), new TokenInfo("dogs", null, "<ALPHANUM>", 45, 49, 8, new int[]{10,10,10,8}, null, false));
+ assertToken(tokenList.get(0), new TokenInfo("quick", null, "<ALPHANUM>", 4, 9, 2, new int[]{2,2,2,2}, null, false));
+ assertToken(tokenList.get(1), new TokenInfo("red", null, "<ALPHANUM>", 10, 13, 3, new int[]{3,3,3,3}, null, false));
+ assertToken(tokenList.get(2), new TokenInfo("fox", null, "<ALPHANUM>", 14, 17, 4, new int[]{4,4,4,4}, null, true));
+ assertToken(tokenList.get(3), new TokenInfo("jumped", null, "<ALPHANUM>", 18, 24, 5, new int[]{5,5,5,5}, null, false));
+ assertToken(tokenList.get(4), new TokenInfo("over", null, "<ALPHANUM>", 25, 29, 6, new int[]{6,6,6,6}, null, false));
+ assertToken(tokenList.get(5), new TokenInfo("lazy", null, "<ALPHANUM>", 34, 38, 8, new int[]{8,8,8,8}, null, false));
+ assertToken(tokenList.get(6), new TokenInfo("brown", null, "<ALPHANUM>", 39, 44, 9, new int[]{9,9,9,9}, null, true));
+ assertToken(tokenList.get(7), new TokenInfo("dogs", null, "<ALPHANUM>", 45, 49, 10, new int[]{10,10,10,10}, null, false));
tokenList = indexPart.get("org.apache.lucene.analysis.en.PorterStemFilter");
assertNotNull("Expcting PorterStemFilter analysis breakdown", tokenList);
assertEquals(tokenList.size(), 8);
- assertToken(tokenList.get(0), new TokenInfo("quick", null, "<ALPHANUM>", 4, 9, 1, new int[]{2,2,2,1,1}, null, false));
- assertToken(tokenList.get(1), new TokenInfo("red", null, "<ALPHANUM>", 10, 13, 2, new int[]{3,3,3,2,2}, null, false));
- assertToken(tokenList.get(2), new TokenInfo("fox", null, "<ALPHANUM>", 14, 17, 3, new int[]{4,4,4,3,3}, null, true));
- assertToken(tokenList.get(3), new TokenInfo("jump", null, "<ALPHANUM>", 18, 24, 4, new int[]{5,5,5,4,4}, null, false));
- assertToken(tokenList.get(4), new TokenInfo("over", null, "<ALPHANUM>", 25, 29, 5, new int[]{6,6,6,5,5}, null, false));
- assertToken(tokenList.get(5), new TokenInfo("lazi", null, "<ALPHANUM>", 34, 38, 6, new int[]{8,8,8,6,6}, null, false));
- assertToken(tokenList.get(6), new TokenInfo("brown", null, "<ALPHANUM>", 39, 44, 7, new int[]{9,9,9,7,7}, null, true));
- assertToken(tokenList.get(7), new TokenInfo("dog", null, "<ALPHANUM>", 45, 49, 8, new int[]{10,10,10,8,8}, null, false));
+ assertToken(tokenList.get(0), new TokenInfo("quick", null, "<ALPHANUM>", 4, 9, 2, new int[]{2,2,2,2,2}, null, false));
+ assertToken(tokenList.get(1), new TokenInfo("red", null, "<ALPHANUM>", 10, 13, 3, new int[]{3,3,3,3,3}, null, false));
+ assertToken(tokenList.get(2), new TokenInfo("fox", null, "<ALPHANUM>", 14, 17, 4, new int[]{4,4,4,4,4}, null, true));
+ assertToken(tokenList.get(3), new TokenInfo("jump", null, "<ALPHANUM>", 18, 24, 5, new int[]{5,5,5,5,5}, null, false));
+ assertToken(tokenList.get(4), new TokenInfo("over", null, "<ALPHANUM>", 25, 29, 6, new int[]{6,6,6,6,6}, null, false));
+ assertToken(tokenList.get(5), new TokenInfo("lazi", null, "<ALPHANUM>", 34, 38, 8, new int[]{8,8,8,8,8}, null, false));
+ assertToken(tokenList.get(6), new TokenInfo("brown", null, "<ALPHANUM>", 39, 44, 9, new int[]{9,9,9,9,9}, null, true));
+ assertToken(tokenList.get(7), new TokenInfo("dog", null, "<ALPHANUM>", 45, 49, 10, new int[]{10,10,10,10,10}, null, false));
NamedList<List<NamedList>> queryPart = textType.get("query");
assertNotNull("expecting a query token analysis for field type 'text'", queryPart);
Index: solr/core/src/test/org/apache/solr/DisMaxRequestHandlerTest.java
===================================================================
--- solr/core/src/test/org/apache/solr/DisMaxRequestHandlerTest.java (revision 1477238)
+++ solr/core/src/test/org/apache/solr/DisMaxRequestHandlerTest.java (working copy)
@@ -87,8 +87,8 @@
req("cool stuff")
,"//*[@numFound='3']"
,"//result/doc[1]/int[@name='id'][.='42']"
- ,"//result/doc[2]/int[@name='id'][.='666']"
- ,"//result/doc[3]/int[@name='id'][.='8675309']"
+ ,"//result/doc[2]/int[@name='id'][.='8675309']"
+ ,"//result/doc[3]/int[@name='id'][.='666']"
);
assertQ("multi qf",
Index: solr/core/src/test/org/apache/solr/spelling/TestSuggestSpellingConverter.java
===================================================================
--- solr/core/src/test/org/apache/solr/spelling/TestSuggestSpellingConverter.java (revision 1477238)
+++ solr/core/src/test/org/apache/solr/spelling/TestSuggestSpellingConverter.java (working copy)
@@ -53,7 +53,7 @@
TokenStream filter = new PatternReplaceFilter(tokenizer,
Pattern.compile("([^\\p{L}\\p{M}\\p{N}\\p{Cs}]*[\\p{L}\\p{M}\\p{N}\\p{Cs}\\_]+:)|([^\\p{L}\\p{M}\\p{N}\\p{Cs}])+"), " ", true);
filter = new LowerCaseFilter(TEST_VERSION_CURRENT, filter);
- filter = new TrimFilter(filter, false);
+ filter = new TrimFilter(TEST_VERSION_CURRENT, filter, false);
return new TokenStreamComponents(tokenizer, filter);
}
});
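TrimFilter now requires a Lucene match version as its first argument, as the change above shows. The same pattern applies outside of tests; a minimal sketch (TEST_VERSION_CURRENT stands in for an explicit constant such as Version.LUCENE_44, and the trailing boolean is the existing updateOffsets flag):

    TokenStream filter = new TrimFilter(TEST_VERSION_CURRENT, input, false);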
Index: solr/example/example-DIH/solr/mail/conf/schema.xml
===================================================================
--- solr/example/example-DIH/solr/mail/conf/schema.xml (revision 1477238)
+++ solr/example/example-DIH/solr/mail/conf/schema.xml (working copy)
@@ -202,13 +202,10 @@
<filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
-->
<!-- Case insensitive stop word removal.
- add enablePositionIncrements=true in both the index and query
- analyzers to leave a 'gap' for more accurate phrase queries.
-->
<filter class="solr.StopFilterFactory"
ignoreCase="true"
words="stopwords.txt"
- enablePositionIncrements="true"
/>
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
<filter class="solr.LowerCaseFilterFactory"/>
@@ -222,7 +219,6 @@
<filter class="solr.StopFilterFactory"
ignoreCase="true"
words="stopwords.txt"
- enablePositionIncrements="true"
/>
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
<filter class="solr.LowerCaseFilterFactory"/>
Index: solr/example/solr/collection1/conf/schema.xml
===================================================================
--- solr/example/solr/collection1/conf/schema.xml (revision 1477238)
+++ solr/example/solr/collection1/conf/schema.xml (working copy)
@@ -440,7 +440,7 @@
<fieldType name="text_general" class="solr.TextField" positionIncrementGap="100">
<analyzer type="index">
<tokenizer class="solr.StandardTokenizerFactory"/>
- <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" />
<!-- in this example, we will only use synonyms at query time
<filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
-->
@@ -448,7 +448,7 @@
</analyzer>
<analyzer type="query">
<tokenizer class="solr.StandardTokenizerFactory"/>
- <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" />
<filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
@@ -466,13 +466,10 @@
<filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
-->
<!-- Case insensitive stop word removal.
- add enablePositionIncrements=true in both the index and query
- analyzers to leave a 'gap' for more accurate phrase queries.
-->
<filter class="solr.StopFilterFactory"
ignoreCase="true"
words="lang/stopwords_en.txt"
- enablePositionIncrements="true"
/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.EnglishPossessiveFilterFactory"/>
@@ -488,7 +485,6 @@
<filter class="solr.StopFilterFactory"
ignoreCase="true"
words="lang/stopwords_en.txt"
- enablePositionIncrements="true"
/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.EnglishPossessiveFilterFactory"/>
@@ -516,13 +512,10 @@
<filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
-->
<!-- Case insensitive stop word removal.
- add enablePositionIncrements=true in both the index and query
- analyzers to leave a 'gap' for more accurate phrase queries.
-->
<filter class="solr.StopFilterFactory"
ignoreCase="true"
words="lang/stopwords_en.txt"
- enablePositionIncrements="true"
/>
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
<filter class="solr.LowerCaseFilterFactory"/>
@@ -535,7 +528,6 @@
<filter class="solr.StopFilterFactory"
ignoreCase="true"
words="lang/stopwords_en.txt"
- enablePositionIncrements="true"
/>
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
<filter class="solr.LowerCaseFilterFactory"/>
@@ -566,7 +558,7 @@
<fieldType name="text_general_rev" class="solr.TextField" positionIncrementGap="100">
<analyzer type="index">
<tokenizer class="solr.StandardTokenizerFactory"/>
- <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" />
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.ReversedWildcardFilterFactory" withOriginal="true"
maxPosAsterisk="3" maxPosQuestion="2" maxFractionAsterisk="0.33"/>
@@ -574,7 +566,7 @@
<analyzer type="query">
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
- <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" />
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
</fieldType>
@@ -730,7 +722,7 @@
<tokenizer class="solr.StandardTokenizerFactory"/>
<!-- for any non-arabic -->
<filter class="solr.LowerCaseFilterFactory"/>
- <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ar.txt" enablePositionIncrements="true"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ar.txt" />
<!-- normalizes ﻯ to ﻱ, etc -->
<filter class="solr.ArabicNormalizationFilterFactory"/>
<filter class="solr.ArabicStemFilterFactory"/>
@@ -742,7 +734,7 @@
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
- <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_bg.txt" enablePositionIncrements="true"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_bg.txt" />
<filter class="solr.BulgarianStemFilterFactory"/>
</analyzer>
</fieldType>
@@ -754,7 +746,7 @@
<!-- removes l', etc -->
<filter class="solr.ElisionFilterFactory" ignoreCase="true" articles="lang/contractions_ca.txt"/>
<filter class="solr.LowerCaseFilterFactory"/>
- <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ca.txt" enablePositionIncrements="true"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ca.txt" />
<filter class="solr.SnowballPorterFilterFactory" language="Catalan"/>
</analyzer>
</fieldType>
@@ -776,7 +768,7 @@
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
- <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_cz.txt" enablePositionIncrements="true"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_cz.txt" />
<filter class="solr.CzechStemFilterFactory"/>
</analyzer>
</fieldType>
@@ -786,7 +778,7 @@
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
- <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_da.txt" format="snowball" enablePositionIncrements="true"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_da.txt" format="snowball" />
<filter class="solr.SnowballPorterFilterFactory" language="Danish"/>
</analyzer>
</fieldType>
@@ -796,7 +788,7 @@
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
- <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_de.txt" format="snowball" enablePositionIncrements="true"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_de.txt" format="snowball" />
<filter class="solr.GermanNormalizationFilterFactory"/>
<filter class="solr.GermanLightStemFilterFactory"/>
<!-- less aggressive: <filter class="solr.GermanMinimalStemFilterFactory"/> -->
@@ -810,7 +802,7 @@
<tokenizer class="solr.StandardTokenizerFactory"/>
<!-- greek specific lowercase for sigma -->
<filter class="solr.GreekLowerCaseFilterFactory"/>
- <filter class="solr.StopFilterFactory" ignoreCase="false" words="lang/stopwords_el.txt" enablePositionIncrements="true"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="false" words="lang/stopwords_el.txt" />
<filter class="solr.GreekStemFilterFactory"/>
</analyzer>
</fieldType>
@@ -820,7 +812,7 @@
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
- <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_es.txt" format="snowball" enablePositionIncrements="true"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_es.txt" format="snowball" />
<filter class="solr.SpanishLightStemFilterFactory"/>
<!-- more aggressive: <filter class="solr.SnowballPorterFilterFactory" language="Spanish"/> -->
</analyzer>
@@ -831,7 +823,7 @@
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
- <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_eu.txt" enablePositionIncrements="true"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_eu.txt" />
<filter class="solr.SnowballPorterFilterFactory" language="Basque"/>
</analyzer>
</fieldType>
@@ -845,7 +837,7 @@
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.ArabicNormalizationFilterFactory"/>
<filter class="solr.PersianNormalizationFilterFactory"/>
- <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_fa.txt" enablePositionIncrements="true"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_fa.txt" />
</analyzer>
</fieldType>
@@ -854,7 +846,7 @@
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
- <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_fi.txt" format="snowball" enablePositionIncrements="true"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_fi.txt" format="snowball" />
<filter class="solr.SnowballPorterFilterFactory" language="Finnish"/>
<!-- less aggressive: <filter class="solr.FinnishLightStemFilterFactory"/> -->
</analyzer>
@@ -867,7 +859,7 @@
<!-- removes l', etc -->
<filter class="solr.ElisionFilterFactory" ignoreCase="true" articles="lang/contractions_fr.txt"/>
<filter class="solr.LowerCaseFilterFactory"/>
- <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_fr.txt" format="snowball" enablePositionIncrements="true"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_fr.txt" format="snowball" />
<filter class="solr.FrenchLightStemFilterFactory"/>
<!-- less aggressive: <filter class="solr.FrenchMinimalStemFilterFactory"/> -->
<!-- more aggressive: <filter class="solr.SnowballPorterFilterFactory" language="French"/> -->
@@ -881,9 +873,9 @@
<!-- removes d', etc -->
<filter class="solr.ElisionFilterFactory" ignoreCase="true" articles="lang/contractions_ga.txt"/>
<!-- removes n-, etc. position increments is intentionally false! -->
- <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/hyphenations_ga.txt" enablePositionIncrements="false"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/hyphenations_ga.txt"/>
<filter class="solr.IrishLowerCaseFilterFactory"/>
- <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ga.txt" enablePositionIncrements="true"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ga.txt"/>
<filter class="solr.SnowballPorterFilterFactory" language="Irish"/>
</analyzer>
</fieldType>
@@ -893,7 +885,7 @@
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
- <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_gl.txt" enablePositionIncrements="true"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_gl.txt" />
<filter class="solr.GalicianStemFilterFactory"/>
<!-- less aggressive: <filter class="solr.GalicianMinimalStemFilterFactory"/> -->
</analyzer>
@@ -908,7 +900,7 @@
<filter class="solr.IndicNormalizationFilterFactory"/>
<!-- normalizes variation in spelling -->
<filter class="solr.HindiNormalizationFilterFactory"/>
- <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_hi.txt" enablePositionIncrements="true"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_hi.txt" />
<filter class="solr.HindiStemFilterFactory"/>
</analyzer>
</fieldType>
@@ -918,7 +910,7 @@
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
- <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_hu.txt" format="snowball" enablePositionIncrements="true"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_hu.txt" format="snowball" />
<filter class="solr.SnowballPorterFilterFactory" language="Hungarian"/>
<!-- less aggressive: <filter class="solr.HungarianLightStemFilterFactory"/> -->
</analyzer>
@@ -929,7 +921,7 @@
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
- <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_hy.txt" enablePositionIncrements="true"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_hy.txt" />
<filter class="solr.SnowballPorterFilterFactory" language="Armenian"/>
</analyzer>
</fieldType>
@@ -939,7 +931,7 @@
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
- <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_id.txt" enablePositionIncrements="true"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_id.txt" />
<!-- for a less aggressive approach (only inflectional suffixes), set stemDerivational to false -->
<filter class="solr.IndonesianStemFilterFactory" stemDerivational="true"/>
</analyzer>
@@ -952,7 +944,7 @@
<!-- removes l', etc -->
<filter class="solr.ElisionFilterFactory" ignoreCase="true" articles="lang/contractions_it.txt"/>
<filter class="solr.LowerCaseFilterFactory"/>
- <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_it.txt" format="snowball" enablePositionIncrements="true"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_it.txt" format="snowball" />
<filter class="solr.ItalianLightStemFilterFactory"/>
<!-- more aggressive: <filter class="solr.SnowballPorterFilterFactory" language="Italian"/> -->
</analyzer>
@@ -999,11 +991,11 @@
<!-- Reduces inflected verbs and adjectives to their base/dictionary forms (辞書形) -->
<filter class="solr.JapaneseBaseFormFilterFactory"/>
<!-- Removes tokens with certain part-of-speech tags -->
- <filter class="solr.JapanesePartOfSpeechStopFilterFactory" tags="lang/stoptags_ja.txt" enablePositionIncrements="true"/>
+ <filter class="solr.JapanesePartOfSpeechStopFilterFactory" tags="lang/stoptags_ja.txt" />
<!-- Normalizes full-width romaji to half-width and half-width kana to full-width (Unicode NFKC subset) -->
<filter class="solr.CJKWidthFilterFactory"/>
<!-- Removes common tokens typically not useful for search, but have a negative effect on ranking -->
- <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ja.txt" enablePositionIncrements="true" />
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ja.txt" />
<!-- Normalizes common katakana spelling variations by removing any last long sound character (U+30FC) -->
<filter class="solr.JapaneseKatakanaStemFilterFactory" minimumLength="4"/>
<!-- Lower-cases romaji characters -->
@@ -1016,7 +1008,7 @@
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
- <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_lv.txt" enablePositionIncrements="true"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_lv.txt" />
<filter class="solr.LatvianStemFilterFactory"/>
</analyzer>
</fieldType>
@@ -1026,7 +1018,7 @@
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
- <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_nl.txt" format="snowball" enablePositionIncrements="true"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_nl.txt" format="snowball" />
<filter class="solr.StemmerOverrideFilterFactory" dictionary="lang/stemdict_nl.txt" ignoreCase="false"/>
<filter class="solr.SnowballPorterFilterFactory" language="Dutch"/>
</analyzer>
@@ -1037,7 +1029,7 @@
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
- <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_no.txt" format="snowball" enablePositionIncrements="true"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_no.txt" format="snowball" />
<filter class="solr.SnowballPorterFilterFactory" language="Norwegian"/>
<!-- less aggressive: <filter class="solr.NorwegianLightStemFilterFactory"/> -->
<!-- singular/plural: <filter class="solr.NorwegianMinimalStemFilterFactory"/> -->
@@ -1049,7 +1041,7 @@
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
- <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_pt.txt" format="snowball" enablePositionIncrements="true"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_pt.txt" format="snowball" />
<filter class="solr.PortugueseLightStemFilterFactory"/>
<!-- less aggressive: <filter class="solr.PortugueseMinimalStemFilterFactory"/> -->
<!-- more aggressive: <filter class="solr.SnowballPorterFilterFactory" language="Portuguese"/> -->
@@ -1062,7 +1054,7 @@
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
- <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ro.txt" enablePositionIncrements="true"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ro.txt" />
<filter class="solr.SnowballPorterFilterFactory" language="Romanian"/>
</analyzer>
</fieldType>
@@ -1072,7 +1064,7 @@
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
- <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ru.txt" format="snowball" enablePositionIncrements="true"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ru.txt" format="snowball" />
<filter class="solr.SnowballPorterFilterFactory" language="Russian"/>
<!-- less aggressive: <filter class="solr.RussianLightStemFilterFactory"/> -->
</analyzer>
@@ -1083,7 +1075,7 @@
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
- <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_sv.txt" format="snowball" enablePositionIncrements="true"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_sv.txt" format="snowball" />
<filter class="solr.SnowballPorterFilterFactory" language="Swedish"/>
<!-- less aggressive: <filter class="solr.SwedishLightStemFilterFactory"/> -->
</analyzer>
@@ -1095,7 +1087,7 @@
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.ThaiWordFilterFactory"/>
- <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_th.txt" enablePositionIncrements="true"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_th.txt" />
</analyzer>
</fieldType>
@@ -1104,7 +1096,7 @@
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.TurkishLowerCaseFilterFactory"/>
- <filter class="solr.StopFilterFactory" ignoreCase="false" words="lang/stopwords_tr.txt" enablePositionIncrements="true"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="false" words="lang/stopwords_tr.txt" />
<filter class="solr.SnowballPorterFilterFactory" language="Turkish"/>
</analyzer>
</fieldType>
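With enablePositionIncrements gone from the example schemas, stop word removal always leaves position holes. The programmatic equivalent follows the same rule; a minimal Java sketch (the version constant and stop set are illustrative, not from this patch):

    Tokenizer tokens = new StandardTokenizer(Version.LUCENE_44, reader);
    TokenStream stopped = new StopFilter(Version.LUCENE_44, tokens, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
    // there is no setEnablePositionIncrements(false) escape hatch anymore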
Index: lucene/core/src/java/org/apache/lucene/analysis/package.html
===================================================================
--- lucene/core/src/java/org/apache/lucene/analysis/package.html (revision 1477238)
+++ lucene/core/src/java/org/apache/lucene/analysis/package.html (working copy)
@@ -282,19 +282,19 @@
<p>
If the selected analyzer filters the stop words "is" and "the", then for a document
containing the string "blue is the sky", only the tokens "blue", "sky" are indexed,
- with position("sky") = 1 + position("blue"). Now, a phrase query "blue is the sky"
+ with position("sky") = 3 + position("blue"). Now, a phrase query "blue is the sky"
would find that document, because the same analyzer filters the same stop words from
- that query. But also the phrase query "blue sky" would find that document.
+ that query. But the phrase query "blue sky" would not find that document because the
+ position increment between "blue" and "sky" is only 1.
</p>
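+<p>
+  (As a sketch, a phrase query can also account for the gap explicitly through
+  PhraseQuery's position-aware add method, so that "blue sky" with a two-position
+  hole matches again:)
+</p>
+<PRE class="prettyprint">
+    PhraseQuery query = new PhraseQuery();
+    query.add(new Term("field", "blue"), 0);
+    query.add(new Term("field", "sky"), 3); // two stop words were removed in between
+</PRE>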
<p>
- If this behavior does not fit the application needs, a modified analyzer can
- be used, that would increment further the positions of tokens following a
- removed stop word, using
- {@link org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute#setPositionIncrement(int)}.
- This can be done with something like the following (note, however, that
- StopFilter natively includes this capability by subclassing
- FilteringTokenFilter}:
+ If this behavior does not fit the application needs, the query parser needs to be
+ configured to not take position increments into account when generating phrase queries.
</p>
+<p>
+  Note that a StopFilter MUST increment positions when it removes tokens, in order not to
+  generate corrupt token stream graphs. Here is the logic used by StopFilter to increment
+  positions when filtering out tokens:
+</p>
<PRE class="prettyprint">
public TokenStream tokenStream(final String fieldName, Reader reader) {
final TokenStream ts = someAnalyzer.tokenStream(fieldName, reader);
@@ -308,7 +308,7 @@
boolean hasNext = ts.incrementToken();
if (hasNext) {
if (stopWords.contains(termAtt.toString())) {
- extraIncrement++; // filter this word
+ extraIncrement += posIncrAtt.getPositionIncrement(); // filter this word
continue;
}
if (extraIncrement>0) {
@@ -323,11 +323,6 @@
}
</PRE>
<p>
- Now, with this modified analyzer, the phrase query "blue sky" would find that document.
- But note that this is yet not a perfect solution, because any phrase query "blue w1 w2 sky"
- where both w1 and w2 are stop words would match that document.
-</p>
-<p>
A few more use cases for modifying position increments are:
</p>
<ol>
@@ -338,6 +333,72 @@
As result, all synonyms of a token would be considered to appear in exactly the
same position as that token, and so would they be seen by phrase and proximity searches.</li>
</ol>
+
+<h3>Token Position Length</h3>
+<p>
+ By default, all tokens created by Analyzers and Tokenizers have a
+ {@link org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute#getPositionLength() position length} of one.
+ This means that the token occupies a single position. This attribute is not indexed
+ and thus not taken into account for positional queries, but is used by suggesters, for example.
+</p>
+<p>
+ The main use case for position lengths is multi-word synonyms. With single-word
+ synonyms, setting the position increment to 0 is enough to denote the fact that two
+ words are synonyms, for example:
+</p>
+<table>
+<tr><td>Term</td><td>red</td><td>magenta</td></tr>
+<tr><td>Position increment</td><td>1</td><td>0</td></tr>
+</table>
+<p>
+ Given that position(magenta) = 0 + position(red), they are at the same position, so anything
+ working with analyzers will return the exact same result if you replace "magenta" with "red"
+ in the input. However, multi-word synonyms are more tricky. Let's say that you want to build
+ a TokenStream where "IBM" is a synonym of "International Business Machines". Position increments
+ are not enough anymore:
+</p>
+<table>
+<tr><td>Term</td><td>IBM</td><td>International</td><td>Business</td><td>Machines</td></tr>
+<tr><td>Position increment</td><td>1</td><td>0</td><td>1</td><td>1</td></tr>
+</table>
+<p>
+ The problem with this token stream is that "IBM" is at the same position as "International"
+ although it is a synonym with "International Business Machines" as a whole. Setting
+ the position increment of "Business" and "Machines" to 0 wouldn't help as it would mean
+ than "International" is a synonym of "Business". The only way to solve this issue is to
+ make "IBM" span across 3 positions, this is where position lengths come to rescue.
+</p>
+<table>
+<tr><td>Term</td><td>IBM</td><td>International</td><td>Business</td><td>Machines</td></tr>
+<tr><td>Position increment</td><td>1</td><td>0</td><td>1</td><td>1</td></tr>
+<tr><td>Position length</td><td>3</td><td>1</td><td>1</td><td>1</td></tr>
+</table>
+<p>
+ This new attribute makes clear that "IBM" and "International Business Machines" start and end
+ at the same positions.
+</p>
+<a name="corrupt"></a>
+<h3>How to not write corrupt token streams</h3>
+<p>
+ There are a few rules to observe when writing custom Tokenizers and TokenFilters:
+</p>
+<ul>
+ <li>The first position increment must be &gt; 0.</li>
+ <li>Positions must not go backward.</li>
+ <li>Tokens that have the same start position must have the same start offset.</li>
+ <li>Tokens that have the same end position (taking into account the position length) must have the same end offset.</li>
+</ul>
+<p>
+ Although these rules might seem easy to follow, problems can quickly happen when chaining
+ badly implemented filters that play with positions and offsets, such as synonym or n-gram
+ filters. Here are good practices for writing correct filters:
+</p>
+<ul>
+ <li>Token filters should not modify offsets. If you feel that your filter would need to modify offsets, then it should probably be implemented as a tokenizer.</li>
+ <li>Token filters should not insert positions. If a filter needs to add tokens, then they should all have a position increment of 0.</li>
+ <li>When they remove tokens, token filters should increment the position increment of the following token.</li>
+ <li>Token filters should preserve position lengths.</li>
+</ul>
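+<p>
+  Lucene's test framework can help catch violations of these rules; as a minimal
+  sketch, BaseTokenStreamTestCase.assertTokenStreamContents verifies terms, offsets
+  and position increments against expected values:
+</p>
+<PRE class="prettyprint">
+    assertTokenStreamContents(tokenStream,
+        new String[] { "quick", "red", "fox" }, // terms
+        new int[] { 4, 10, 14 },                // start offsets
+        new int[] { 9, 13, 17 },                // end offsets
+        new int[] { 2, 1, 1 });                 // position increments
+</PRE>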
<h2>TokenStream API</h2>
<p>
"Flexible Indexing" summarizes the effort of making the Lucene indexer
@@ -383,6 +444,10 @@
<td>See above for detailed information about position increment.</td>
</tr>
<tr>
+ <td>{@link org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute}</td>
+ <td>The number of positions occupied by a token.</td>
+ </tr>
+ <tr>
<td>{@link org.apache.lucene.analysis.tokenattributes.PayloadAttribute}</td>
<td>The payload that a Token can optionally have.</td>
</tr>
@@ -532,20 +597,26 @@
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
/**
- * Build a filter that removes words that are too long or too
- * short from the text.
+ * Create a new LengthFilter. This will filter out tokens whose
+ * CharTermAttribute is either too short
+ * (&lt; min) or too long (&gt; max).
+ * @param version the Lucene match version
+ * @param in the TokenStream to consume
+ * @param min the minimum length
+ * @param max the maximum length
*/
- public LengthFilter(boolean enablePositionIncrements, TokenStream in, int min, int max) {
- super(enablePositionIncrements, in);
+ public LengthFilter(Version version, TokenStream in, int min, int max) {
+ super(version, in);
this.min = min;
this.max = max;
}
-
+
{@literal @Override}
- public boolean accept() throws IOException {
+ public boolean accept() {
final int len = termAtt.length();
- return (len >= min && len <= max);
+ return (len &gt;= min &amp;&amp; len &lt;= max);
}
+
}
</pre>
<p>
@@ -573,66 +644,39 @@
public abstract class FilteringTokenFilter extends TokenFilter {
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
- private boolean enablePositionIncrements; // no init needed, as ctor enforces setting value!
- public FilteringTokenFilter(boolean enablePositionIncrements, TokenStream input){
- super(input);
- this.enablePositionIncrements = enablePositionIncrements;
+ /**
+ * Create a new FilteringTokenFilter.
+ * @param in the TokenStream to consume
+ */
+ public FilteringTokenFilter(Version version, TokenStream in) {
+ super(in);
}
- /** Override this method and return if the current input token should be returned by {@literal {@link #incrementToken}}. */
+ /** Override this method and return if the current input token should be returned by incrementToken. */
protected abstract boolean accept() throws IOException;
{@literal @Override}
public final boolean incrementToken() throws IOException {
- if (enablePositionIncrements) {
- int skippedPositions = 0;
- while (input.incrementToken()) {
- if (accept()) {
- if (skippedPositions != 0) {
- posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions);
- }
- return true;
+ int skippedPositions = 0;
+ while (input.incrementToken()) {
+ if (accept()) {
+ if (skippedPositions != 0) {
+ posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions);
}
- skippedPositions += posIncrAtt.getPositionIncrement();
+ return true;
}
- } else {
- while (input.incrementToken()) {
- if (accept()) {
- return true;
- }
- }
+ skippedPositions += posIncrAtt.getPositionIncrement();
}
// reached EOS -- return false
return false;
}
- /**
- * {@literal @see #setEnablePositionIncrements(boolean)}
- */
- public boolean getEnablePositionIncrements() {
- return enablePositionIncrements;
+ {@literal @Override}
+ public void reset() throws IOException {
+ super.reset();
}
- /**
- * If <code>true</code>, this TokenFilter will preserve
- * positions of the incoming tokens (ie, accumulate and
- * set position increments of the removed tokens).
- * Generally, <code>true</code> is best as it does not
- * lose information (positions of the original tokens)
- * during indexing.
- *
- * <p> When set, when a token is stopped
- * (omitted), the position increment of the following
- * token is incremented.
- *
- * <p> <b>NOTE</b>: be sure to also
- * set org.apache.lucene.queryparser.classic.QueryParser#setEnablePositionIncrements if
- * you use QueryParser to create queries.
- */
- public void setEnablePositionIncrements(boolean enable) {
- this.enablePositionIncrements = enable;
- }
}
</pre>
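Under the rewritten FilteringTokenFilter above, subclasses only implement accept(); skipped tokens always add to the next token's position increment. A sketch of a concrete subclass (the class name is hypothetical):

    public final class DropShortTokensFilter extends FilteringTokenFilter {
      private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

      public DropShortTokensFilter(Version version, TokenStream in) {
        super(version, in);
      }

      @Override
      protected boolean accept() {
        return termAtt.length() >= 3; // removed tokens automatically leave position holes
      }
    }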
Index: lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java
===================================================================
--- lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java (revision 1477238)
+++ lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java (working copy)
@@ -17,12 +17,8 @@
* limitations under the License.
*/
-import java.io.FileOutputStream;
import java.io.IOException;
-import java.io.OutputStreamWriter;
-import java.io.Writer;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
@@ -43,10 +39,18 @@
* @lucene.experimental */
public class TokenStreamToAutomaton {
+ private boolean preservePositionIncrements;
+
/** Sole constructor. */
public TokenStreamToAutomaton() {
+ this.preservePositionIncrements = true;
}
+ /** Whether to generate holes in the automaton for missing positions, <code>true</code> by default. */
+ public void setPreservePositionIncrements(boolean enablePositionIncrements) {
+ this.preservePositionIncrements = enablePositionIncrements;
+ }
+
private static class Position implements RollingBuffer.Resettable {
// Any tokens that ended at our position arrive to this state:
State arriving;
@@ -108,6 +112,9 @@
int maxOffset = 0;
while (in.incrementToken()) {
int posInc = posIncAtt.getPositionIncrement();
+ if (!preservePositionIncrements && posInc > 1) {
+ posInc = 1;
+ }
assert pos > -1 || posInc > 0;
if (posInc > 0) {
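The new setPreservePositionIncrements switch lets consumers such as suggesters collapse the holes that stop filters now always leave; a usage sketch (error handling omitted):

    TokenStreamToAutomaton ts2a = new TokenStreamToAutomaton();
    ts2a.setPreservePositionIncrements(false); // treat stop-word gaps as ordinary transitions
    Automaton automaton = ts2a.toAutomaton(tokenStream);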
Index: lucene/core/src/test/org/apache/lucene/index/TestTermVectorsWriter.java
===================================================================
--- lucene/core/src/test/org/apache/lucene/index/TestTermVectorsWriter.java (revision 1477238)
+++ lucene/core/src/test/org/apache/lucene/index/TestTermVectorsWriter.java (working copy)
@@ -213,7 +213,7 @@
public void testEndOffsetPositionStopFilter() throws Exception {
Directory dir = newDirectory();
IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(
- TEST_VERSION_CURRENT, new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true)));
+ TEST_VERSION_CURRENT, new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET)));
Document doc = new Document();
FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
customType.setStoreTermVectors(true);
Index: lucene/core/src/test/org/apache/lucene/analysis/TestMockAnalyzer.java
===================================================================
--- lucene/core/src/test/org/apache/lucene/analysis/TestMockAnalyzer.java (revision 1477238)
+++ lucene/core/src/test/org/apache/lucene/analysis/TestMockAnalyzer.java (working copy)
@@ -64,16 +64,10 @@
/** Test a configuration that behaves a lot like StopAnalyzer */
public void testStop() throws Exception {
- Analyzer a = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true);
+ Analyzer a = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET);
assertAnalyzesTo(a, "the quick brown a fox",
new String[] { "quick", "brown", "fox" },
new int[] { 2, 1, 2 });
-
- // disable positions
- a = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, false);
- assertAnalyzesTo(a, "the quick brown a fox",
- new String[] { "quick", "brown", "fox" },
- new int[] { 1, 1, 1 });
}
/** Test a configuration that behaves a lot like KeepWordFilter */
@@ -83,7 +77,7 @@
BasicOperations.complement(
Automaton.union(
Arrays.asList(BasicAutomata.makeString("foo"), BasicAutomata.makeString("bar")))));
- Analyzer a = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, keepWords, true);
+ Analyzer a = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, keepWords);
assertAnalyzesTo(a, "quick foo brown bar bar fox foo",
new String[] { "foo", "bar", "bar", "foo" },
new int[] { 2, 2, 1, 2 });
@@ -92,7 +86,7 @@
/** Test a configuration that behaves a lot like LengthFilter */
public void testLength() throws Exception {
CharacterRunAutomaton length5 = new CharacterRunAutomaton(new RegExp(".{5,}").toAutomaton());
- Analyzer a = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, length5, true);
+ Analyzer a = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, length5);
assertAnalyzesTo(a, "ok toolong fine notfine",
new String[] { "ok", "fine" },
new int[] { 1, 2 });
Index: lucene/core/src/test/org/apache/lucene/search/TestPhraseQuery.java
===================================================================
--- lucene/core/src/test/org/apache/lucene/search/TestPhraseQuery.java (revision 1477238)
+++ lucene/core/src/test/org/apache/lucene/search/TestPhraseQuery.java (working copy)
@@ -222,7 +222,7 @@
public void testPhraseQueryWithStopAnalyzer() throws Exception {
Directory directory = newDirectory();
- Analyzer stopAnalyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, false);
+ Analyzer stopAnalyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET);
RandomIndexWriter writer = new RandomIndexWriter(random(), directory,
newIndexWriterConfig( Version.LUCENE_40, stopAnalyzer));
Document doc = new Document();
@@ -241,16 +241,6 @@
assertEquals(1, hits.length);
QueryUtils.check(random(), query,searcher);
-
- // StopAnalyzer as of 2.4 does not leave "holes", so this matches.
- query = new PhraseQuery();
- query.add(new Term("field", "words"));
- query.add(new Term("field", "here"));
- hits = searcher.search(query, null, 1000).scoreDocs;
- assertEquals(1, hits.length);
- QueryUtils.check(random(), query,searcher);
-
-
reader.close();
directory.close();
}
Index: lucene/core/src/test/org/apache/lucene/search/spans/TestSpansAdvanced2.java
===================================================================
--- lucene/core/src/test/org/apache/lucene/search/spans/TestSpansAdvanced2.java (revision 1477238)
+++ lucene/core/src/test/org/apache/lucene/search/spans/TestSpansAdvanced2.java (working copy)
@@ -49,7 +49,7 @@
// create test index
final RandomIndexWriter writer = new RandomIndexWriter(random(), mDirectory,
newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random(),
- MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true))
+ MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET))
.setOpenMode(OpenMode.APPEND).setMergePolicy(newLogMergePolicy())
.setSimilarity(new DefaultSimilarity()));
addDocument(writer, "A", "Should we, could we, would we?");
Index: lucene/core/src/test/org/apache/lucene/search/spans/TestSpanFirstQuery.java
===================================================================
--- lucene/core/src/test/org/apache/lucene/search/spans/TestSpanFirstQuery.java (revision 1477238)
+++ lucene/core/src/test/org/apache/lucene/search/spans/TestSpanFirstQuery.java (working copy)
@@ -37,7 +37,7 @@
// mimic StopAnalyzer
CharacterRunAutomaton stopSet = new CharacterRunAutomaton(new RegExp("the|a|of").toAutomaton());
- Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopSet, true);
+ Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopSet);
RandomIndexWriter writer = new RandomIndexWriter(random(), dir, analyzer);
Document doc = new Document();
Index: lucene/core/src/test/org/apache/lucene/search/spans/TestSpansAdvanced.java
===================================================================
--- lucene/core/src/test/org/apache/lucene/search/spans/TestSpansAdvanced.java (revision 1477238)
+++ lucene/core/src/test/org/apache/lucene/search/spans/TestSpansAdvanced.java (working copy)
@@ -60,7 +60,7 @@
mDirectory = newDirectory();
final RandomIndexWriter writer = new RandomIndexWriter(random(), mDirectory,
newIndexWriterConfig(TEST_VERSION_CURRENT,
- new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true))
+ new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET))
.setMergePolicy(newLogMergePolicy()).setSimilarity(new DefaultSimilarity()));
addDocument(writer, "1", "I think it should work.");
addDocument(writer, "2", "I think it should work.");
Index: lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FastVectorHighlighterTest.java
===================================================================
--- lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FastVectorHighlighterTest.java (revision 1477238)
+++ lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FastVectorHighlighterTest.java (working copy)
@@ -247,7 +247,7 @@
public void testCommonTermsQueryHighlightTest() throws IOException {
Directory dir = newDirectory();
- IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true)));
+ IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET)));
FieldType type = new FieldType(TextField.TYPE_STORED);
type.setStoreTermVectorOffsets(true);
type.setStoreTermVectorPositions(true);
Index: lucene/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java
===================================================================
--- lucene/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java (revision 1477238)
+++ lucene/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java (working copy)
@@ -247,7 +247,7 @@
*/
private String highlightField(Query query, String fieldName, String text)
throws IOException, InvalidTokenOffsetsException {
- TokenStream tokenStream = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true)
+ TokenStream tokenStream = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET)
.tokenStream(fieldName, new StringReader(text));
// Assuming "<B>", "</B>" used to highlight
SimpleHTMLFormatter formatter = new SimpleHTMLFormatter();
@@ -1308,7 +1308,7 @@
}
public void testMaxSizeHighlight() throws Exception {
- final MockAnalyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true);
+ final MockAnalyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET);
// we disable MockTokenizer checks because we will forcefully limit the
// tokenstream and call end() before incrementToken() returns false.
analyzer.setEnableChecks(false);
@@ -1343,7 +1343,7 @@
CharacterRunAutomaton stopWords = new CharacterRunAutomaton(BasicAutomata.makeString("stoppedtoken"));
// we disable MockTokenizer checks because we will forcefully limit the
// tokenstream and call end() before incrementToken() returns false.
- final MockAnalyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopWords, true);
+ final MockAnalyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopWords);
analyzer.setEnableChecks(false);
TermQuery query = new TermQuery(new Term("data", goodWord));
@@ -1394,7 +1394,7 @@
Highlighter hg = getHighlighter(query, "text", fm);
hg.setTextFragmenter(new NullFragmenter());
hg.setMaxDocCharsToAnalyze(36);
- String match = hg.getBestFragment(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopWords, true), "text", text);
+ String match = hg.getBestFragment(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopWords), "text", text);
assertTrue(
"Matched text should contain remainder of text after highlighted query ",
match.endsWith("in it"));
@@ -1411,7 +1411,7 @@
numHighlights = 0;
// test to show how rewritten query can still be used
searcher = newSearcher(reader);
- Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true);
+ Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET);
BooleanQuery query = new BooleanQuery();
query.add(new WildcardQuery(new Term(FIELD_NAME, "jf?")), Occur.SHOULD);
@@ -1875,11 +1875,11 @@
super.setUp();
a = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false);
- analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true);
+ analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET);
dir = newDirectory();
ramDir = newDirectory();
IndexWriter writer = new IndexWriter(ramDir, newIndexWriterConfig(
- TEST_VERSION_CURRENT, new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true)));
+ TEST_VERSION_CURRENT, new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET)));
for (String text : texts) {
addDoc(writer, text);
}
Index: lucene/highlighter/src/test/org/apache/lucene/search/highlight/custom/HighlightCustomQueryTest.java
===================================================================
--- lucene/highlighter/src/test/org/apache/lucene/search/highlight/custom/HighlightCustomQueryTest.java (revision 1477238)
+++ lucene/highlighter/src/test/org/apache/lucene/search/highlight/custom/HighlightCustomQueryTest.java (working copy)
@@ -89,7 +89,7 @@
private String highlightField(Query query, String fieldName,
String text) throws IOException, InvalidTokenOffsetsException {
TokenStream tokenStream = new MockAnalyzer(random(), MockTokenizer.SIMPLE,
- true, MockTokenFilter.ENGLISH_STOPSET, true).tokenStream(fieldName,
+ true, MockTokenFilter.ENGLISH_STOPSET).tokenStream(fieldName,
new StringReader(text));
// Assuming "<B>", "</B>" used to highlight
SimpleHTMLFormatter formatter = new SimpleHTMLFormatter();
Index: lucene/memory/src/test/org/apache/lucene/index/memory/MemoryIndexTest.java
===================================================================
--- lucene/memory/src/test/org/apache/lucene/index/memory/MemoryIndexTest.java (revision 1477238)
+++ lucene/memory/src/test/org/apache/lucene/index/memory/MemoryIndexTest.java (working copy)
@@ -259,7 +259,7 @@
private Analyzer randomAnalyzer() {
switch(random().nextInt(4)) {
case 0: return new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);
- case 1: return new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true);
+ case 1: return new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET);
case 2: return new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Index: lucene/test-framework/src/java/org/apache/lucene/analysis/MockAnalyzer.java
===================================================================
--- lucene/test-framework/src/java/org/apache/lucene/analysis/MockAnalyzer.java (revision 1477238)
+++ lucene/test-framework/src/java/org/apache/lucene/analysis/MockAnalyzer.java (working copy)
@@ -17,7 +17,6 @@
* limitations under the License.
*/
-import java.io.IOException;
import java.io.Reader;
import java.util.HashMap;
import java.util.Map;
@@ -46,7 +45,6 @@
private final CharacterRunAutomaton runAutomaton;
private final boolean lowerCase;
private final CharacterRunAutomaton filter;
- private final boolean enablePositionIncrements;
private int positionIncrementGap;
private final Random random;
private Map<String,Integer> previousMappings = new HashMap<String,Integer>();
@@ -60,30 +58,28 @@
* @param runAutomaton DFA describing how tokenization should happen (e.g. [a-zA-Z]+)
* @param lowerCase true if the tokenizer should lowercase terms
* @param filter DFA describing how terms should be filtered (set of stopwords, etc)
- * @param enablePositionIncrements true if position increments should reflect filtered terms.
*/
- public MockAnalyzer(Random random, CharacterRunAutomaton runAutomaton, boolean lowerCase, CharacterRunAutomaton filter, boolean enablePositionIncrements) {
+ public MockAnalyzer(Random random, CharacterRunAutomaton runAutomaton, boolean lowerCase, CharacterRunAutomaton filter) {
super(new PerFieldReuseStrategy());
// TODO: this should be solved in a different way; Random should not be shared (!).
this.random = new Random(random.nextLong());
this.runAutomaton = runAutomaton;
this.lowerCase = lowerCase;
this.filter = filter;
- this.enablePositionIncrements = enablePositionIncrements;
}
/**
- * Calls {@link #MockAnalyzer(Random, CharacterRunAutomaton, boolean, CharacterRunAutomaton, boolean)
+ * Calls {@link #MockAnalyzer(Random, CharacterRunAutomaton, boolean, CharacterRunAutomaton)
* MockAnalyzer(random, runAutomaton, lowerCase, MockTokenFilter.EMPTY_STOPSET, false}).
*/
public MockAnalyzer(Random random, CharacterRunAutomaton runAutomaton, boolean lowerCase) {
- this(random, runAutomaton, lowerCase, MockTokenFilter.EMPTY_STOPSET, true);
+ this(random, runAutomaton, lowerCase, MockTokenFilter.EMPTY_STOPSET);
}
/**
* Create a Whitespace-lowercasing analyzer with no stopwords removal.
* <p>
- * Calls {@link #MockAnalyzer(Random, CharacterRunAutomaton, boolean, CharacterRunAutomaton, boolean)
+ * Calls {@link #MockAnalyzer(Random, CharacterRunAutomaton, boolean, CharacterRunAutomaton)
* MockAnalyzer(random, MockTokenizer.WHITESPACE, true, MockTokenFilter.EMPTY_STOPSET, false}).
*/
public MockAnalyzer(Random random) {
@@ -95,7 +91,6 @@
MockTokenizer tokenizer = new MockTokenizer(reader, runAutomaton, lowerCase, maxTokenLength);
tokenizer.setEnableChecks(enableChecks);
MockTokenFilter filt = new MockTokenFilter(tokenizer, filter);
- filt.setEnablePositionIncrements(enablePositionIncrements);
return new TokenStreamComponents(tokenizer, maybePayload(filt, fieldName));
}
Index: lucene/test-framework/src/java/org/apache/lucene/analysis/MockTokenFilter.java
===================================================================
--- lucene/test-framework/src/java/org/apache/lucene/analysis/MockTokenFilter.java (revision 1477238)
+++ lucene/test-framework/src/java/org/apache/lucene/analysis/MockTokenFilter.java (working copy)
@@ -55,7 +55,6 @@
makeString("with"))));
private final CharacterRunAutomaton filter;
- private boolean enablePositionIncrements = true;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
@@ -80,9 +79,7 @@
int skippedPositions = 0;
while (input.incrementToken()) {
if (!filter.run(termAtt.buffer(), 0, termAtt.length())) {
- if (enablePositionIncrements) {
- posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions);
- }
+ posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions);
return true;
}
skippedPositions += posIncrAtt.getPositionIncrement();
@@ -90,20 +87,4 @@
// reached EOS -- return false
return false;
}
-
- /**
- * @see #setEnablePositionIncrements(boolean)
- */
- public boolean getEnablePositionIncrements() {
- return enablePositionIncrements;
- }
-
- /**
- * If <code>true</code>, this Filter will preserve
- * positions of the incoming tokens (ie, accumulate and
- * set position increments of the removed stop tokens).
- */
- public void setEnablePositionIncrements(boolean enable) {
- this.enablePositionIncrements = enable;
- }
}
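
Taken together, the MockAnalyzer/MockTokenFilter change means stopword holes are always visible to consumers of the token stream. A standalone sketch of that behavior (class name and sample text are illustrative, not from the patch):

import java.io.StringReader;
import java.util.Random;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenFilter;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

public class StopsetPosIncSketch {
  public static void main(String[] args) throws Exception {
    MockAnalyzer analyzer = new MockAnalyzer(new Random(42L),
        MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET);
    TokenStream ts = analyzer.tokenStream("field", new StringReader("the quick fox"));
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posInc = ts.addAttribute(PositionIncrementAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      // prints quick=2 fox=1: the removed stopword "the" always leaves a hole now
      System.out.println(term + "=" + posInc.getPositionIncrement());
    }
    ts.end();
    ts.close();
  }
}
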
Index: lucene/test-framework/src/java/org/apache/lucene/search/SearchEquivalenceTestBase.java
===================================================================
--- lucene/test-framework/src/java/org/apache/lucene/search/SearchEquivalenceTestBase.java (revision 1477238)
+++ lucene/test-framework/src/java/org/apache/lucene/search/SearchEquivalenceTestBase.java (working copy)
@@ -59,7 +59,7 @@
directory = newDirectory();
stopword = "" + randomChar();
CharacterRunAutomaton stopset = new CharacterRunAutomaton(BasicAutomata.makeString(stopword));
- analyzer = new MockAnalyzer(random, MockTokenizer.WHITESPACE, false, stopset, true);
+ analyzer = new MockAnalyzer(random, MockTokenizer.WHITESPACE, false, stopset);
RandomIndexWriter iw = new RandomIndexWriter(random, directory, analyzer);
Document doc = new Document();
Field id = new StringField("id", "", Field.Store.NO);
Index: lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggesterTest.java
===================================================================
--- lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggesterTest.java (revision 1477238)
+++ lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggesterTest.java (working copy)
@@ -164,8 +164,9 @@
new TermFreq("the ghost of christmas past", 50),
};
- Analyzer standard = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET, false);
+ Analyzer standard = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET);
AnalyzingSuggester suggester = new AnalyzingSuggester(standard);
+ suggester.setPreservePositionIncrements(false);
suggester.build(new TermFreqArrayIterator(keys));
List<LookupResult> results = suggester.lookup(_TestUtil.stringToCharSequence("the ghost of chris", random()), false, 1);
@@ -187,7 +188,7 @@
}
public void testEmpty() throws Exception {
- Analyzer standard = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET, false);
+ Analyzer standard = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET);
AnalyzingSuggester suggester = new AnalyzingSuggester(standard);
suggester.build(new TermFreqArrayIterator(new TermFreq[0]));
Index: lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/FuzzySuggesterTest.java
===================================================================
--- lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/FuzzySuggesterTest.java (revision 1477238)
+++ lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/FuzzySuggesterTest.java (working copy)
@@ -153,8 +153,9 @@
new TermFreq("the ghost of christmas past", 50),
};
- Analyzer standard = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET, false);
+ Analyzer standard = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET);
FuzzySuggester suggester = new FuzzySuggester(standard);
+ suggester.setPreservePositionIncrements(false);
suggester.build(new TermFreqArrayIterator(keys));
List<LookupResult> results = suggester.lookup(_TestUtil.stringToCharSequence("the ghost of chris", random()), false, 1);
Index: lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java
===================================================================
--- lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java (revision 1477238)
+++ lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java (working copy)
@@ -75,9 +75,9 @@
* example, if you use an analyzer removing stop words,
* then the partial text "ghost chr..." could see the
* suggestion "The Ghost of Christmas Past". Note that
- * your {@code StopFilter} instance must NOT preserve
- * position increments for this example to work, so you should call
- * {@code setEnablePositionIncrements(false)} on it.
+ * position increments MUST NOT be preserved for this example
+ * to work, so you should call
+ * {@link #setPreservePositionIncrements(boolean) setPreservePositionIncrements(false)}.
*
* <p>
* If SynonymFilter is used to map wifi and wireless network to
@@ -185,6 +185,9 @@
private static final int PAYLOAD_SEP = '\u001f';
+ /** Whether position holes should appear in the automaton. */
+ private boolean preservePositionIncrements;
+
/**
* Calls {@link #AnalyzingSuggester(Analyzer,Analyzer,int,int,int)
* AnalyzingSuggester(analyzer, analyzer, EXACT_FIRST |
@@ -241,8 +244,15 @@
throw new IllegalArgumentException("maxGraphExpansions must -1 (no limit) or > 0 (got: " + maxGraphExpansions + ")");
}
this.maxGraphExpansions = maxGraphExpansions;
+ preservePositionIncrements = true;
}
+ /** Whether to take position holes (position increment > 1) into account when
+ * building the automaton, <code>true</code> by default. */
+ public void setPreservePositionIncrements(boolean preservePositionIncrements) {
+ this.preservePositionIncrements = preservePositionIncrements;
+ }
+
/** Returns byte size of the underlying FST. */
public long sizeInBytes() {
return fst == null ? 0 : fst.sizeInBytes();
@@ -327,13 +337,16 @@
}
TokenStreamToAutomaton getTokenStreamToAutomaton() {
+ final TokenStreamToAutomaton tsta;
if (preserveSep) {
- return new EscapingTokenStreamToAutomaton();
+ tsta = new EscapingTokenStreamToAutomaton();
} else {
// When we're not preserving sep, we don't steal 0xff
// byte, so we don't need to do any escaping:
- return new TokenStreamToAutomaton();
+ tsta = new TokenStreamToAutomaton();
}
+ tsta.setPreservePositionIncrements(preservePositionIncrements);
+ return tsta;
}
private static class AnalyzingComparator implements Comparator<BytesRef> {
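
Usage sketch for the new opt-out, mirroring the suggester tests above. TermFreq and TermFreqArrayIterator are the suggest-module helpers those tests use; their package in the imports below is an assumption:

import java.util.List;
import java.util.Random;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenFilter;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.search.suggest.Lookup.LookupResult;
import org.apache.lucene.search.suggest.TermFreq;              // package assumed
import org.apache.lucene.search.suggest.TermFreqArrayIterator; // package assumed
import org.apache.lucene.search.suggest.analyzing.AnalyzingSuggester;

public class PreservePosIncSketch {
  public static void main(String[] args) throws Exception {
    Analyzer standard = new MockAnalyzer(new Random(42L),
        MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET);
    AnalyzingSuggester suggester = new AnalyzingSuggester(standard);
    // Position holes are now preserved by default; opt out so the
    // stopword-removing analyzer can still reach the suggestion:
    suggester.setPreservePositionIncrements(false);
    suggester.build(new TermFreqArrayIterator(new TermFreq[] {
        new TermFreq("the ghost of christmas past", 50)}));
    List<LookupResult> results = suggester.lookup("the ghost of chris", false, 1);
    System.out.println(results.get(0).key); // the ghost of christmas past
  }
}
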
Index: lucene/queryparser/src/test/org/apache/lucene/queryparser/util/QueryParserTestBase.java
===================================================================
--- lucene/queryparser/src/test/org/apache/lucene/queryparser/util/QueryParserTestBase.java (revision 1477238)
+++ lucene/queryparser/src/test/org/apache/lucene/queryparser/util/QueryParserTestBase.java (working copy)
@@ -852,7 +852,7 @@
public void testBoost()
throws Exception {
CharacterRunAutomaton stopWords = new CharacterRunAutomaton(BasicAutomata.makeString("on"));
- Analyzer oneStopAnalyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopWords, true);
+ Analyzer oneStopAnalyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopWords);
CommonQueryParserConfiguration qp = getParserConfig(oneStopAnalyzer);
Query q = getQuery("on^1.0",qp);
assertNotNull(q);
@@ -865,7 +865,7 @@
q = getQuery("\"on\"^1.0",qp);
assertNotNull(q);
- Analyzer a2 = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true);
+ Analyzer a2 = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET);
CommonQueryParserConfiguration qp2 = getParserConfig(a2);
q = getQuery("the^3", qp2);
// "the" is a stop word so the result is an empty query:
@@ -1007,7 +1007,7 @@
public void testStopwords() throws Exception {
CharacterRunAutomaton stopSet = new CharacterRunAutomaton(new RegExp("the|foo").toAutomaton());
- CommonQueryParserConfiguration qp = getParserConfig(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopSet, true));
+ CommonQueryParserConfiguration qp = getParserConfig(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopSet));
Query result = getQuery("field:the OR field:foo",qp);
assertNotNull("result is null and it shouldn't be", result);
assertTrue("result is not a BooleanQuery", result instanceof BooleanQuery);
@@ -1023,7 +1023,7 @@
}
public void testPositionIncrement() throws Exception {
- CommonQueryParserConfiguration qp = getParserConfig( new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true));
+ CommonQueryParserConfiguration qp = getParserConfig( new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET));
qp.setEnablePositionIncrements(true);
String qtxt = "\"the words in poisitions pos02578 are stopped in this phrasequery\"";
// 0 2 5 7 8
@@ -1070,7 +1070,7 @@
// "match"
public void testPositionIncrements() throws Exception {
Directory dir = newDirectory();
- Analyzer a = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true);
+ Analyzer a = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET);
IndexWriter w = new IndexWriter(dir, newIndexWriterConfig( TEST_VERSION_CURRENT, a));
Document doc = new Document();
doc.add(newTextField("field", "the wizard of ozzy", Field.Store.NO));
@@ -1185,7 +1185,7 @@
}
public void testPhraseQueryToString() throws Exception {
- Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true);
+ Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET);
CommonQueryParserConfiguration qp = getParserConfig(analyzer);
qp.setEnablePositionIncrements(true);
PhraseQuery q = (PhraseQuery)getQuery("\"this hi this is a test is\"", qp);
@@ -1235,26 +1235,13 @@
CharacterRunAutomaton stopStopList =
new CharacterRunAutomaton(new RegExp("[sS][tT][oO][pP]").toAutomaton());
- CommonQueryParserConfiguration qp = getParserConfig(new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false, stopStopList, false));
+ CommonQueryParserConfiguration qp = getParserConfig(new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false, stopStopList));
- PhraseQuery phraseQuery = new PhraseQuery();
- phraseQuery.add(new Term("field", "1"));
- phraseQuery.add(new Term("field", "2"));
-
- assertEquals(phraseQuery, getQuery("\"1 2\"",qp));
- assertEquals(phraseQuery, getQuery("\"1 stop 2\"",qp));
-
- qp.setEnablePositionIncrements(true);
- assertEquals(phraseQuery, getQuery("\"1 stop 2\"",qp));
-
- qp.setEnablePositionIncrements(false);
- assertEquals(phraseQuery, getQuery("\"1 stop 2\"",qp));
-
qp = getParserConfig(
- new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false, stopStopList, true));
+ new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false, stopStopList));
qp.setEnablePositionIncrements(true);
- phraseQuery = new PhraseQuery();
+ PhraseQuery phraseQuery = new PhraseQuery();
phraseQuery.add(new Term("field", "1"));
phraseQuery.add(new Term("field", "2"), 2);
assertEquals(phraseQuery, getQuery("\"1 stop 2\"",qp));
Index: lucene/queryparser/src/test/org/apache/lucene/queryparser/flexible/standard/TestQPHelper.java
===================================================================
--- lucene/queryparser/src/test/org/apache/lucene/queryparser/flexible/standard/TestQPHelper.java (revision 1477238)
+++ lucene/queryparser/src/test/org/apache/lucene/queryparser/flexible/standard/TestQPHelper.java (working copy)
@@ -946,7 +946,7 @@
public void testBoost() throws Exception {
CharacterRunAutomaton stopSet = new CharacterRunAutomaton(BasicAutomata.makeString("on"));
- Analyzer oneStopAnalyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopSet, true);
+ Analyzer oneStopAnalyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopSet);
StandardQueryParser qp = new StandardQueryParser();
qp.setAnalyzer(oneStopAnalyzer);
@@ -962,7 +962,7 @@
assertNotNull(q);
StandardQueryParser qp2 = new StandardQueryParser();
- qp2.setAnalyzer(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true));
+ qp2.setAnalyzer(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET));
q = qp2.parse("the^3", "field");
// "the" is a stop word so the result is an empty query:
@@ -1179,7 +1179,7 @@
public void testStopwords() throws Exception {
StandardQueryParser qp = new StandardQueryParser();
CharacterRunAutomaton stopSet = new CharacterRunAutomaton(new RegExp("the|foo").toAutomaton());
- qp.setAnalyzer(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopSet, true));
+ qp.setAnalyzer(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopSet));
Query result = qp.parse("a:the OR a:foo", "a");
assertNotNull("result is null and it shouldn't be", result);
@@ -1203,7 +1203,7 @@
public void testPositionIncrement() throws Exception {
StandardQueryParser qp = new StandardQueryParser();
qp.setAnalyzer(
- new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true));
+ new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET));
qp.setEnablePositionIncrements(true);
Index: lucene/queryparser/src/test/org/apache/lucene/queryparser/flexible/precedence/TestPrecedenceQueryParser.java
===================================================================
--- lucene/queryparser/src/test/org/apache/lucene/queryparser/flexible/precedence/TestPrecedenceQueryParser.java (revision 1477238)
+++ lucene/queryparser/src/test/org/apache/lucene/queryparser/flexible/precedence/TestPrecedenceQueryParser.java (working copy)
@@ -546,7 +546,7 @@
public void testBoost() throws Exception {
CharacterRunAutomaton stopSet = new CharacterRunAutomaton(BasicAutomata.makeString("on"));
- Analyzer oneStopAnalyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopSet, true);
+ Analyzer oneStopAnalyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopSet);
PrecedenceQueryParser qp = new PrecedenceQueryParser();
qp.setAnalyzer(oneStopAnalyzer);
@@ -561,7 +561,7 @@
q = qp.parse("\"on\"^1.0", "field");
assertNotNull(q);
- q = getParser(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true)).parse("the^3",
+ q = getParser(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET)).parse("the^3",
"field");
assertNotNull(q);
}
Index: lucene/queryparser/src/test/org/apache/lucene/queryparser/xml/TestParser.java
===================================================================
--- lucene/queryparser/src/test/org/apache/lucene/queryparser/xml/TestParser.java (revision 1477238)
+++ lucene/queryparser/src/test/org/apache/lucene/queryparser/xml/TestParser.java (working copy)
@@ -58,7 +58,7 @@
@BeforeClass
public static void beforeClass() throws Exception {
// TODO: rewrite test (this needs to set QueryParser.enablePositionIncrements, too, for work with CURRENT):
- Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET, false);
+ Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET);
//initialize the parser
builder = new CorePlusExtensionsParser("contents", analyzer);
Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/ga/TestIrishAnalyzer.java
===================================================================
--- lucene/analysis/common/src/test/org/apache/lucene/analysis/ga/TestIrishAnalyzer.java (revision 1477238)
+++ lucene/analysis/common/src/test/org/apache/lucene/analysis/ga/TestIrishAnalyzer.java (working copy)
@@ -61,7 +61,7 @@
Analyzer a = new IrishAnalyzer(TEST_VERSION_CURRENT);
assertAnalyzesTo(a, "n-athair",
new String[] { "athair" },
- new int[] { 1 });
+ new int[] { 2 });
}
/** blast some random strings through the analyzer */
Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestTypeTokenFilterFactory.java
===================================================================
--- lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestTypeTokenFilterFactory.java (revision 1477238)
+++ lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestTypeTokenFilterFactory.java (working copy)
@@ -50,7 +50,7 @@
public void testCreationWithBlackList() throws Exception {
TokenFilterFactory factory = tokenFilterFactory("Type",
"types", "stoptypes-1.txt, stoptypes-2.txt",
- "enablePositionIncrements", "false");
+ "enablePositionIncrements", "true");
NumericTokenStream input = new NumericTokenStream();
input.setIntValue(123);
factory.create(input);
@@ -59,7 +59,7 @@
public void testCreationWithWhiteList() throws Exception {
TokenFilterFactory factory = tokenFilterFactory("Type",
"types", "stoptypes-1.txt, stoptypes-2.txt",
- "enablePositionIncrements", "false",
+ "enablePositionIncrements", "true",
"useWhitelist", "true");
NumericTokenStream input = new NumericTokenStream();
input.setIntValue(123);
Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestStopFilter.java
===================================================================
--- lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestStopFilter.java (revision 1477238)
+++ lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestStopFilter.java (working copy)
@@ -75,7 +75,7 @@
doTestStopPositons(stpf,true);
// without increments
reader = new StringReader(sb.toString());
- stpf = new StopFilter(TEST_VERSION_CURRENT, new MockTokenizer(reader, MockTokenizer.WHITESPACE, false), stopSet);
+ stpf = new StopFilter(Version.LUCENE_43, new MockTokenizer(reader, MockTokenizer.WHITESPACE, false), stopSet);
doTestStopPositons(stpf,false);
// with increments, concatenating two stop filters
ArrayList<String> a0 = new ArrayList<String>();
@@ -166,7 +166,7 @@
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
TokenFilter filter = new MockSynonymFilter(tokenizer);
- StopFilter stopfilter = new StopFilter(TEST_VERSION_CURRENT, filter, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
+ StopFilter stopfilter = new StopFilter(Version.LUCENE_43, filter, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
stopfilter.setEnablePositionIncrements(false);
return new TokenStreamComponents(tokenizer, stopfilter);
}
Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
===================================================================
--- lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java (revision 1477238)
+++ lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java (working copy)
@@ -161,8 +161,6 @@
// startOffset thats > its endOffset
// (see LUCENE-3738 for a list of other offenders here)
// broken!
- Lucene43NGramTokenizer.class,
- // broken!
EdgeNGramTokenizer.class,
// broken!
EdgeNGramTokenFilter.class,
@@ -182,55 +180,6 @@
private static final Map<Constructor<?>,Predicate<Object[]>> brokenOffsetsConstructors = new HashMap<Constructor<?>, Predicate<Object[]>>();
static {
try {
- brokenOffsetsConstructors.put(
- TrimFilter.class.getConstructor(TokenStream.class, boolean.class),
- new Predicate<Object[]>() {
- @Override
- public boolean apply(Object[] args) {
- assert args.length == 2;
- return (Boolean) args[1]; // args are broken if updateOffsets is true
- }
- });
- brokenOffsetsConstructors.put(
- TypeTokenFilter.class.getConstructor(boolean.class, TokenStream.class, Set.class, boolean.class),
- new Predicate<Object[]>() {
- @Override
- public boolean apply(Object[] args) {
- assert args.length == 4;
- // LUCENE-4065: only if you pass 'false' to enablePositionIncrements!
- return !(Boolean) args[0];
- }
- });
- brokenOffsetsConstructors.put(
- TypeTokenFilter.class.getConstructor(boolean.class, TokenStream.class, Set.class),
- new Predicate<Object[]>() {
- @Override
- public boolean apply(Object[] args) {
- assert args.length == 3;
- // LUCENE-4065: only if you pass 'false' to enablePositionIncrements!
- return !(Boolean) args[0];
- }
- });
- brokenOffsetsConstructors.put(
- LengthFilter.class.getConstructor(boolean.class, TokenStream.class, int.class, int.class),
- new Predicate<Object[]>() {
- @Override
- public boolean apply(Object[] args) {
- assert args.length == 4;
- // LUCENE-4065: only if you pass 'false' to enablePositionIncrements!
- return !(Boolean) args[0];
- }
- });
- brokenOffsetsConstructors.put(
- KeepWordFilter.class.getConstructor(boolean.class, TokenStream.class, CharArraySet.class),
- new Predicate<Object[]>() {
- @Override
- public boolean apply(Object[] args) {
- assert args.length == 3;
- // LUCENE-4065: only if you pass 'false' to enablePositionIncrements!
- return !(Boolean) args[0];
- }
- });
for (Class<?> c : Arrays.<Class<?>>asList(
ReversePathHierarchyTokenizer.class,
PathHierarchyTokenizer.class,
Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestTypeTokenFilter.java
===================================================================
--- lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestTypeTokenFilter.java (revision 1477238)
+++ lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestTypeTokenFilter.java (working copy)
@@ -24,6 +24,7 @@
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.English;
+import org.apache.lucene.util.Version;
import java.io.IOException;
import java.io.StringReader;
@@ -36,7 +37,7 @@
public void testTypeFilter() throws IOException {
StringReader reader = new StringReader("121 is palindrome, while 123 is not");
Set<String> stopTypes = asSet("<NUM>");
- TokenStream stream = new TypeTokenFilter(true, new StandardTokenizer(TEST_VERSION_CURRENT, reader), stopTypes);
+ TokenStream stream = new TypeTokenFilter(TEST_VERSION_CURRENT, new StandardTokenizer(TEST_VERSION_CURRENT, reader), stopTypes);
assertTokenStreamContents(stream, new String[]{"is", "palindrome", "while", "is", "not"});
}
@@ -59,12 +60,12 @@
// with increments
StringReader reader = new StringReader(sb.toString());
- TypeTokenFilter typeTokenFilter = new TypeTokenFilter(true, new StandardTokenizer(TEST_VERSION_CURRENT, reader), stopSet);
+ TypeTokenFilter typeTokenFilter = new TypeTokenFilter(TEST_VERSION_CURRENT, new StandardTokenizer(TEST_VERSION_CURRENT, reader), stopSet);
testPositons(typeTokenFilter);
// without increments
reader = new StringReader(sb.toString());
- typeTokenFilter = new TypeTokenFilter(false, new StandardTokenizer(TEST_VERSION_CURRENT, reader), stopSet);
+ typeTokenFilter = new TypeTokenFilter(Version.LUCENE_43, false, new StandardTokenizer(TEST_VERSION_CURRENT, reader), stopSet);
testPositons(typeTokenFilter);
}
@@ -87,7 +88,7 @@
public void testTypeFilterWhitelist() throws IOException {
StringReader reader = new StringReader("121 is palindrome, while 123 is not");
Set<String> stopTypes = Collections.singleton("<NUM>");
- TokenStream stream = new TypeTokenFilter(true, new StandardTokenizer(TEST_VERSION_CURRENT, reader), stopTypes, true);
+ TokenStream stream = new TypeTokenFilter(TEST_VERSION_CURRENT, new StandardTokenizer(TEST_VERSION_CURRENT, reader), stopTypes, true);
assertTokenStreamContents(stream, new String[]{"121", "123"});
}
Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java
===================================================================
--- lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java (revision 1477238)
+++ lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java (working copy)
@@ -306,7 +306,6 @@
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
StopFilter filter = new StopFilter(TEST_VERSION_CURRENT,
tokenizer, StandardAnalyzer.STOP_WORDS_SET);
- filter.setEnablePositionIncrements(true);
return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(filter, flags, protWords));
}
};
Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestTrimFilter.java
===================================================================
--- lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestTrimFilter.java (revision 1477238)
+++ lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestTrimFilter.java (working copy)
@@ -29,6 +29,7 @@
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.tokenattributes.*;
+import org.apache.lucene.util.Version;
/**
*/
@@ -46,7 +47,7 @@
new Token(ccc, 0, ccc.length, 11, 15),
new Token(whitespace, 0, whitespace.length, 16, 20),
new Token(empty, 0, empty.length, 21, 21));
- ts = new TrimFilter(ts, false);
+ ts = new TrimFilter(TEST_VERSION_CURRENT, ts, false);
assertTokenStreamContents(ts, new String[] { "a", "b", "cCc", "", ""});
@@ -59,7 +60,7 @@
new Token(b, 0, b.length, 0, 2),
new Token(ccc, 0, ccc.length, 0, 3),
new Token(whitespace, 0, whitespace.length, 0, 3));
- ts = new TrimFilter(ts, true);
+ ts = new TrimFilter(Version.LUCENE_43, ts, true);
assertTokenStreamContents(ts,
new String[] { "a", "b", "c", "" },
@@ -120,7 +121,7 @@
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.KEYWORD, false);
- return new TokenStreamComponents(tokenizer, new TrimFilter(tokenizer, false));
+ return new TokenStreamComponents(tokenizer, new TrimFilter(TEST_VERSION_CURRENT, tokenizer, false));
}
};
checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER);
@@ -130,7 +131,7 @@
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.KEYWORD, false);
- return new TokenStreamComponents(tokenizer, new TrimFilter(tokenizer, true));
+ return new TokenStreamComponents(tokenizer, new TrimFilter(Version.LUCENE_43, tokenizer, true));
}
};
checkRandomData(random(), b, 1000*RANDOM_MULTIPLIER);
@@ -141,7 +142,9 @@
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new KeywordTokenizer(reader);
- return new TokenStreamComponents(tokenizer, new TrimFilter(tokenizer, random().nextBoolean()));
+ final boolean updateOffsets = random().nextBoolean();
+ final Version version = updateOffsets ? Version.LUCENE_43 : TEST_VERSION_CURRENT;
+ return new TokenStreamComponents(tokenizer, new TrimFilter(version, tokenizer, updateOffsets));
}
};
checkOneTermReuse(a, "", "");
Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestLengthFilterFactory.java
===================================================================
--- lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestLengthFilterFactory.java (revision 1477238)
+++ lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestLengthFilterFactory.java (working copy)
@@ -22,6 +22,8 @@
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;
+import org.apache.lucene.analysis.util.ClasspathResourceLoader;
+import org.apache.lucene.util.Version;
public class TestLengthFilterFactory extends BaseTokenStreamFactoryTestCase {
@@ -29,8 +31,10 @@
Reader reader = new StringReader("foo foobar super-duper-trooper");
TokenStream stream = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
stream = tokenFilterFactory("Length",
+ Version.LUCENE_43, new ClasspathResourceLoader(getClass()),
"min", "4",
- "max", "10").create(stream);
+ "max", "10",
+ "enablePositionIncrements", "false").create(stream);
assertTokenStreamContents(stream, new String[] { "foobar" }, new int[] { 1 });
}
Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeepWordFilter.java
===================================================================
--- lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeepWordFilter.java (revision 1477238)
+++ lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeepWordFilter.java (working copy)
@@ -28,6 +28,7 @@
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.util.Version;
/** Test {@link KeepWordFilter} */
public class TestKeepWordFilter extends BaseTokenStreamTestCase {
@@ -42,22 +43,22 @@
// Test Stopwords
TokenStream stream = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
- stream = new KeepWordFilter(true, stream, new CharArraySet(TEST_VERSION_CURRENT, words, true));
+ stream = new KeepWordFilter(TEST_VERSION_CURRENT, stream, new CharArraySet(TEST_VERSION_CURRENT, words, true));
assertTokenStreamContents(stream, new String[] { "aaa", "BBB" }, new int[] { 3, 2 });
// Now force case
stream = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
- stream = new KeepWordFilter(true, stream, new CharArraySet(TEST_VERSION_CURRENT,words, false));
+ stream = new KeepWordFilter(TEST_VERSION_CURRENT, stream, new CharArraySet(TEST_VERSION_CURRENT,words, false));
assertTokenStreamContents(stream, new String[] { "aaa" }, new int[] { 3 });
// Test Stopwords
stream = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
- stream = new KeepWordFilter(false, stream, new CharArraySet(TEST_VERSION_CURRENT, words, true));
+ stream = new KeepWordFilter(Version.LUCENE_43, false, stream, new CharArraySet(TEST_VERSION_CURRENT, words, true));
assertTokenStreamContents(stream, new String[] { "aaa", "BBB" }, new int[] { 1, 1 });
// Now force case
stream = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
- stream = new KeepWordFilter(false, stream, new CharArraySet(TEST_VERSION_CURRENT,words, false));
+ stream = new KeepWordFilter(Version.LUCENE_43, false, stream, new CharArraySet(TEST_VERSION_CURRENT,words, false));
assertTokenStreamContents(stream, new String[] { "aaa" }, new int[] { 1 });
}
@@ -72,7 +73,7 @@
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
- TokenStream stream = new KeepWordFilter(true, tokenizer, new CharArraySet(TEST_VERSION_CURRENT, words, true));
+ TokenStream stream = new KeepWordFilter(TEST_VERSION_CURRENT, tokenizer, new CharArraySet(TEST_VERSION_CURRENT, words, true));
return new TokenStreamComponents(tokenizer, stream);
}
};
Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestLengthFilter.java
===================================================================
--- lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestLengthFilter.java (revision 1477238)
+++ lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestLengthFilter.java (working copy)
@@ -19,6 +19,7 @@
import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.util.Version;
import java.io.IOException;
import java.io.Reader;
@@ -29,7 +30,7 @@
public void testFilterNoPosIncr() throws Exception {
TokenStream stream = new MockTokenizer(
new StringReader("short toolong evenmuchlongertext a ab toolong foo"), MockTokenizer.WHITESPACE, false);
- LengthFilter filter = new LengthFilter(false, stream, 2, 6);
+ LengthFilter filter = new LengthFilter(Version.LUCENE_43, false, stream, 2, 6);
assertTokenStreamContents(filter,
new String[]{"short", "ab", "foo"},
new int[]{1, 1, 1}
@@ -39,7 +40,7 @@
public void testFilterWithPosIncr() throws Exception {
TokenStream stream = new MockTokenizer(
new StringReader("short toolong evenmuchlongertext a ab toolong foo"), MockTokenizer.WHITESPACE, false);
- LengthFilter filter = new LengthFilter(true, stream, 2, 6);
+ LengthFilter filter = new LengthFilter(TEST_VERSION_CURRENT, stream, 2, 6);
assertTokenStreamContents(filter,
new String[]{"short", "ab", "foo"},
new int[]{1, 4, 2}
@@ -51,7 +52,7 @@
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new KeywordTokenizer(reader);
- return new TokenStreamComponents(tokenizer, new LengthFilter(true, tokenizer, 0, 5));
+ return new TokenStreamComponents(tokenizer, new LengthFilter(TEST_VERSION_CURRENT, tokenizer, 0, 5));
}
};
checkOneTermReuse(a, "", "");
Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/ga/IrishAnalyzer.java
===================================================================
--- lucene/analysis/common/src/java/org/apache/lucene/analysis/ga/IrishAnalyzer.java (revision 1477238)
+++ lucene/analysis/common/src/java/org/apache/lucene/analysis/ga/IrishAnalyzer.java (working copy)
@@ -138,7 +138,9 @@
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
TokenStream result = new StandardFilter(matchVersion, source);
StopFilter s = new StopFilter(matchVersion, result, HYPHENATIONS);
- s.setEnablePositionIncrements(false);
+ if (!matchVersion.onOrAfter(Version.LUCENE_44)) {
+ s.setEnablePositionIncrements(false);
+ }
result = s;
result = new ElisionFilter(result, DEFAULT_ARTICLES);
result = new IrishLowerCaseFilter(result);
Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/util/FilteringTokenFilter.java
===================================================================
--- lucene/analysis/common/src/java/org/apache/lucene/analysis/util/FilteringTokenFilter.java (revision 1477238)
+++ lucene/analysis/common/src/java/org/apache/lucene/analysis/util/FilteringTokenFilter.java (working copy)
@@ -22,24 +22,54 @@
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.util.Version;
/**
* Abstract base class for TokenFilters that may remove tokens.
* You have to implement {@link #accept} and return a boolean if the current
* token should be preserved. {@link #incrementToken} uses this method
* to decide if a token should be passed to the caller.
+ * <p><a name="version" />As of Lucene 4.4, an {@link IllegalArgumentException}
+ * is thrown when trying to disable position increments when filtering terms.
*/
public abstract class FilteringTokenFilter extends TokenFilter {
+ private static void checkPositionIncrement(Version version, boolean enablePositionIncrements) {
+ if (!enablePositionIncrements && version.onOrAfter(Version.LUCENE_44)) {
+ throw new IllegalArgumentException("enablePositionIncrements=false is not supported anymore as of Lucene 4.4 as it can create broken token streams");
+ }
+ }
+
+ protected final Version version;
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
private boolean enablePositionIncrements; // no init needed, as ctor enforces setting value!
- private boolean first = true; // only used when not preserving gaps
+ private boolean first = true;
- public FilteringTokenFilter(boolean enablePositionIncrements, TokenStream input){
- super(input);
+ /**
+ * Create a new {@link FilteringTokenFilter}.
+ * @param version the Lucene match <a href="#version">version</a>
+ * @param enablePositionIncrements whether to increment position increments when filtering out terms
+ * @param input the input to consume
+ * @deprecated enablePositionIncrements=false is not supported anymore as of Lucene 4.4
+ */
+ @Deprecated
+ public FilteringTokenFilter(Version version, boolean enablePositionIncrements, TokenStream input){
+ this(version, input);
+ checkPositionIncrement(version, enablePositionIncrements);
this.enablePositionIncrements = enablePositionIncrements;
}
+ /**
+ * Create a new {@link FilteringTokenFilter}.
+ * @param version the Lucene match version
+ * @param in the {@link TokenStream} to consume
+ */
+ public FilteringTokenFilter(Version version, TokenStream in) {
+ super(in);
+ this.version = version;
+ this.enablePositionIncrements = true;
+ }
+
/** Override this method and return if the current input token should be returned by {@link #incrementToken}. */
protected abstract boolean accept() throws IOException;
@@ -102,8 +132,11 @@
* <p> <b>NOTE</b>: be sure to also
* set org.apache.lucene.queryparser.classic.QueryParser#setEnablePositionIncrements if
* you use QueryParser to create queries.
+ * @deprecated enablePositionIncrements=false is not supported anymore as of Lucene 4.4
*/
+ @Deprecated
public void setEnablePositionIncrements(boolean enable) {
+ checkPositionIncrement(version, enable);
this.enablePositionIncrements = enable;
}
}
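
A minimal subclass sketch against the new constructor (the filter name and length rule are illustrative, not from the patch). With Version.LUCENE_44 or later, a subsequent setEnablePositionIncrements(false) call on such a filter throws IllegalArgumentException, per checkPositionIncrement above:

import java.io.IOException;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.FilteringTokenFilter;
import org.apache.lucene.util.Version;

public final class ShortTokenFilter extends FilteringTokenFilter {
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

  public ShortTokenFilter(Version version, TokenStream in) {
    super(version, in); // increments are always enabled; no boolean to pass
  }

  @Override
  protected boolean accept() throws IOException {
    return termAtt.length() >= 3; // drop tokens shorter than three chars
  }
}
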
Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/core/TypeTokenFilterFactory.java
===================================================================
--- lucene/analysis/common/src/java/org/apache/lucene/analysis/core/TypeTokenFilterFactory.java (revision 1477238)
+++ lucene/analysis/common/src/java/org/apache/lucene/analysis/core/TypeTokenFilterFactory.java (working copy)
@@ -35,7 +35,7 @@
* &lt;analyzer&gt;
* &lt;tokenizer class="solr.StandardTokenizerFactory"/&gt;
* &lt;filter class="solr.TypeTokenFilterFactory" types="stoptypes.txt"
- * enablePositionIncrements="true" useWhitelist="false"/&gt;
+ * useWhitelist="false"/&gt;
* &lt;/analyzer&gt;
* &lt;/fieldType&gt;</pre>
*/
@@ -49,7 +49,7 @@
public TypeTokenFilterFactory(Map<String,String> args) {
super(args);
stopTypesFiles = require(args, "types");
- enablePositionIncrements = getBoolean(args, "enablePositionIncrements", false);
+ enablePositionIncrements = getBoolean(args, "enablePositionIncrements", true);
useWhitelist = getBoolean(args, "useWhitelist", false);
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
@@ -78,6 +78,8 @@
@Override
public TokenStream create(TokenStream input) {
- return new TypeTokenFilter(enablePositionIncrements, input, stopTypes, useWhitelist);
+ @SuppressWarnings("deprecation")
+ final TokenStream filter = new TypeTokenFilter(luceneMatchVersion, enablePositionIncrements, input, stopTypes, useWhitelist);
+ return filter;
}
}
Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/core/TypeTokenFilter.java
===================================================================
--- lucene/analysis/common/src/java/org/apache/lucene/analysis/core/TypeTokenFilter.java (revision 1477238)
+++ lucene/analysis/common/src/java/org/apache/lucene/analysis/core/TypeTokenFilter.java (working copy)
@@ -17,13 +17,13 @@
* limitations under the License.
*/
+import java.util.Set;
+
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.analysis.util.FilteringTokenFilter;
+import org.apache.lucene.util.Version;
-import java.io.IOException;
-import java.util.Set;
-
/**
* Removes tokens whose types appear in a set of blocked types from a token stream.
*/
@@ -33,17 +33,44 @@
private final TypeAttribute typeAttribute = addAttribute(TypeAttribute.class);
private final boolean useWhiteList;
- public TypeTokenFilter(boolean enablePositionIncrements, TokenStream input, Set<String> stopTypes, boolean useWhiteList) {
- super(enablePositionIncrements, input);
+ /** @deprecated enablePositionIncrements=false is not supported anymore as of Lucene 4.4. */
+ @Deprecated
+ public TypeTokenFilter(Version version, boolean enablePositionIncrements, TokenStream input, Set<String> stopTypes, boolean useWhiteList) {
+ super(version, enablePositionIncrements, input);
this.stopTypes = stopTypes;
this.useWhiteList = useWhiteList;
}
- public TypeTokenFilter(boolean enablePositionIncrements, TokenStream input, Set<String> stopTypes) {
- this(enablePositionIncrements, input, stopTypes, false);
+ /** @deprecated enablePositionIncrements=false is not supported anymore as of Lucene 4.4. */
+ @Deprecated
+ public TypeTokenFilter(Version version, boolean enablePositionIncrements, TokenStream input, Set<String> stopTypes) {
+ this(version, enablePositionIncrements, input, stopTypes, false);
}
/**
+ * Create a new {@link TypeTokenFilter}.
+ * @param version the Lucene match version
+ * @param input the {@link TokenStream} to consume
+ * @param stopTypes the types to filter
+ * @param useWhiteList if true, then tokens whose type is in stopTypes will
+ * be kept, otherwise they will be filtered out
+ */
+ public TypeTokenFilter(Version version, TokenStream input, Set<String> stopTypes, boolean useWhiteList) {
+ super(version, input);
+ this.stopTypes = stopTypes;
+ this.useWhiteList = useWhiteList;
+ }
+
+ /**
+ * Create a new {@link TypeTokenFilter} that filters tokens out
+ * (useWhiteList=false).
+ * @see #TypeTokenFilter(Version, TokenStream, Set, boolean)
+ */
+ public TypeTokenFilter(Version version, TokenStream input, Set<String> stopTypes) {
+ this(version, input, stopTypes, false);
+ }
+
+ /**
* By default accept the token if its type is not a stop type.
* When the useWhiteList parameter is set to true then accept the token if its type is contained in the stopTypes
*/
Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/core/StopFilter.java
===================================================================
--- lucene/analysis/common/src/java/org/apache/lucene/analysis/core/StopFilter.java (revision 1477238)
+++ lucene/analysis/common/src/java/org/apache/lucene/analysis/core/StopFilter.java (working copy)
@@ -57,7 +57,7 @@
* @see #makeStopSet(Version, java.lang.String...)
*/
public StopFilter(Version matchVersion, TokenStream in, CharArraySet stopWords) {
- super(true, in);
+ super(matchVersion, in);
this.stopWords = stopWords;
}
Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/core/StopFilterFactory.java
===================================================================
--- lucene/analysis/common/src/java/org/apache/lucene/analysis/core/StopFilterFactory.java (revision 1477238)
+++ lucene/analysis/common/src/java/org/apache/lucene/analysis/core/StopFilterFactory.java (working copy)
@@ -51,7 +51,7 @@
stopWordFiles = get(args, "words");
format = get(args, "format");
ignoreCase = getBoolean(args, "ignoreCase", false);
- enablePositionIncrements = getBoolean(args, "enablePositionIncrements", false);
+ enablePositionIncrements = getBoolean(args, "enablePositionIncrements", true);
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
}
Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LengthFilter.java
===================================================================
--- lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LengthFilter.java (revision 1477238)
+++ lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LengthFilter.java (working copy)
@@ -20,6 +20,7 @@
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.FilteringTokenFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.util.Version;
/**
* Removes words that are too long or too short from the stream.
@@ -34,16 +35,29 @@
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+ /** @deprecated enablePositionIncrements=false is not supported anymore as of Lucene 4.4. */
+ @Deprecated
+ public LengthFilter(Version version, boolean enablePositionIncrements, TokenStream in, int min, int max) {
+ super(version, enablePositionIncrements, in);
+ this.min = min;
+ this.max = max;
+ }
+
/**
- * Build a filter that removes words that are too long or too
- * short from the text.
+ * Create a new {@link LengthFilter}. This will filter out tokens whose
+ * {@link CharTermAttribute} is either too short ({@link CharTermAttribute#length()}
+ * &lt; min) or too long ({@link CharTermAttribute#length()} &gt; max).
+ * @param version the Lucene match version
+ * @param in the {@link TokenStream} to consume
+ * @param min the minimum length
+ * @param max the maximum length
*/
- public LengthFilter(boolean enablePositionIncrements, TokenStream in, int min, int max) {
- super(enablePositionIncrements, in);
+ public LengthFilter(Version version, TokenStream in, int min, int max) {
+ super(version, in);
this.min = min;
this.max = max;
}
-
+
@Override
public boolean accept() {
final int len = termAtt.length();
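
A hedged usage sketch of the new version-based constructor (sample text and class name are illustrative):

import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.LengthFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public class LengthFilterSketch {
  public static void main(String[] args) throws Exception {
    TokenStream ts = new WhitespaceTokenizer(Version.LUCENE_44,
        new StringReader("a keep this reallylongtoken bb"));
    ts = new LengthFilter(Version.LUCENE_44, ts, 2, 6); // keep terms of length 2..6
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      System.out.println(term); // keep, this, bb -- holes from dropped terms preserved
    }
    ts.end();
    ts.close();
  }
}
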
Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LengthFilterFactory.java
===================================================================
--- lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LengthFilterFactory.java (revision 1477238)
+++ lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LengthFilterFactory.java (working copy)
@@ -17,18 +17,18 @@
* limitations under the License.
*/
+import java.util.Map;
+
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.TokenFilterFactory;
-import java.util.Map;
-
/**
* Factory for {@link LengthFilter}.
* <pre class="prettyprint">
* &lt;fieldType name="text_lngth" class="solr.TextField" positionIncrementGap="100"&gt;
* &lt;analyzer&gt;
* &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
- * &lt;filter class="solr.LengthFilterFactory" min="0" max="1" enablePositionIncrements="false"/&gt;
+ * &lt;filter class="solr.LengthFilterFactory" min="0" max="1" /&gt;
* &lt;/analyzer&gt;
* &lt;/fieldType&gt;</pre>
*/
@@ -44,7 +44,7 @@
super(args);
min = requireInt(args, MIN_KEY);
max = requireInt(args, MAX_KEY);
- enablePositionIncrements = getBoolean(args, "enablePositionIncrements", false);
+ enablePositionIncrements = getBoolean(args, "enablePositionIncrements", true);
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
}
@@ -52,6 +52,8 @@
@Override
public LengthFilter create(TokenStream input) {
- return new LengthFilter(enablePositionIncrements, input,min,max);
+ @SuppressWarnings("deprecation")
+ final LengthFilter filter = new LengthFilter(luceneMatchVersion, enablePositionIncrements, input,min,max);
+ return filter;
}
}
Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/TrimFilterFactory.java
===================================================================
--- lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/TrimFilterFactory.java (revision 1477238)
+++ lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/TrimFilterFactory.java (working copy)
@@ -29,7 +29,7 @@
* &lt;fieldType name="text_trm" class="solr.TextField" positionIncrementGap="100"&gt;
* &lt;analyzer&gt;
* &lt;tokenizer class="solr.NGramTokenizerFactory"/&gt;
- * &lt;filter class="solr.TrimFilterFactory" updateOffsets="false"/&gt;
+ * &lt;filter class="solr.TrimFilterFactory" /&gt;
* &lt;/analyzer&gt;
* &lt;/fieldType&gt;</pre>
*
@@ -50,6 +50,8 @@
@Override
public TrimFilter create(TokenStream input) {
- return new TrimFilter(input, updateOffsets);
+ @SuppressWarnings("deprecation")
+ final TrimFilter filter = new TrimFilter(luceneMatchVersion, input, updateOffsets);
+ return filter;
}
}
Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeepWordFilter.java
===================================================================
--- lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeepWordFilter.java (revision 1477238)
+++ lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeepWordFilter.java (working copy)
@@ -21,6 +21,7 @@
import org.apache.lucene.analysis.util.FilteringTokenFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.util.Version;
/**
* A TokenFilter that only keeps tokens with text contained in the
@@ -32,13 +33,26 @@
private final CharArraySet words;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
- /** The words set passed to this constructor will be directly used by this filter
- * and should not be modified, */
- public KeepWordFilter(boolean enablePositionIncrements, TokenStream in, CharArraySet words) {
- super(enablePositionIncrements, in);
+ /** @deprecated enablePositionIncrements=false is not supported anymore as of Lucene 4.4. */
+ @Deprecated
+ public KeepWordFilter(Version version, boolean enablePositionIncrements, TokenStream in, CharArraySet words) {
+ super(version, enablePositionIncrements, in);
this.words = words;
}
+ /**
+ * Create a new {@link KeepWordFilter}.
+ * <p><b>NOTE</b>: The words set passed to this constructor will be directly
+ * used by this filter and should not be modified.
+ * @param version the Lucene match version
+ * @param in the {@link TokenStream} to consume
+ * @param words the words to keep
+ */
+ public KeepWordFilter(Version version, TokenStream in, CharArraySet words) {
+ super(version, in);
+ this.words = words;
+ }
+
@Override
public boolean accept() {
return words.contains(termAtt.buffer(), 0, termAtt.length());
Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/TrimFilter.java
===================================================================
--- lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/TrimFilter.java (revision 1477238)
+++ lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/TrimFilter.java (working copy)
@@ -21,11 +21,14 @@
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.util.Version;
import java.io.IOException;
/**
* Trims leading and trailing whitespace from Tokens in the stream.
+ * <p>As of Lucene 4.4, this filter does not support updateOffsets=true anymore
+ * as it can lead to broken token streams.
*/
public final class TrimFilter extends TokenFilter {
@@ -33,12 +36,27 @@
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
-
- public TrimFilter(TokenStream in, boolean updateOffsets) {
+ /**
+ * Create a new {@link TrimFilter}.
+ * @param version the Lucene match version
+ * @param in the stream to consume
+ * @param updateOffsets whether to update offsets
+ * @deprecated Offset updates are not supported anymore as of Lucene 4.4.
+ */
+ @Deprecated
+ public TrimFilter(Version version, TokenStream in, boolean updateOffsets) {
super(in);
+ if (updateOffsets && version.onOrAfter(Version.LUCENE_44)) {
+ throw new IllegalArgumentException("updateOffsets=true is not supported anymore as of Lucene 4.4");
+ }
this.updateOffsets = updateOffsets;
}
+ /** Create a new {@link TrimFilter} on top of <code>in</code>. */
+ public TrimFilter(Version version, TokenStream in) {
+ this(version, in, false);
+ }
+
@Override
public boolean incrementToken() throws IOException {
if (!input.incrementToken()) return false;
@@ -55,11 +73,10 @@
int endOff = 0;
// eat the first characters
- //QUESTION: Should we use Character.isWhitespace() instead?
- for (start = 0; start < len && termBuffer[start] <= ' '; start++) {
+ for (start = 0; start < len && Character.isWhitespace(termBuffer[start]); start++) {
}
// eat the end characters
- for (end = len; end >= start && termBuffer[end - 1] <= ' '; end--) {
+ for (end = len; end >= start && Character.isWhitespace(termBuffer[end - 1]); end--) {
endOff++;
}
if (start > 0 || end < len) {
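The predicate change above is not purely cosmetic: the old test (termBuffer[i] <= ' ') trimmed every code point up to U+0020, including non-whitespace control characters such as NUL, while Character.isWhitespace also recognizes most Unicode space separators. A standalone sketch of where the two disagree (illustration only, not from this patch):

    // NUL is <= ' ' but is not Java whitespace; EM SPACE (U+2003) is the reverse
    assert '\u0000' <= ' ' && !Character.isWhitespace('\u0000');
    assert '\u2003' > ' '  && Character.isWhitespace('\u2003');
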
Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeepWordFilterFactory.java
===================================================================
--- lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeepWordFilterFactory.java (revision 1477238)
+++ lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeepWordFilterFactory.java (working copy)
@@ -32,7 +32,7 @@
* &lt;fieldType name="text_keepword" class="solr.TextField" positionIncrementGap="100"&gt;
* &lt;analyzer&gt;
* &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
- * &lt;filter class="solr.KeepWordFilterFactory" words="keepwords.txt" ignoreCase="false" enablePositionIncrements="false"/&gt;
+ * &lt;filter class="solr.KeepWordFilterFactory" words="keepwords.txt" ignoreCase="false"/&gt;
* &lt;/analyzer&gt;
* &lt;/fieldType&gt;</pre>
*/
@@ -48,7 +48,7 @@
assureMatchVersion();
wordFiles = get(args, "words");
ignoreCase = getBoolean(args, "ignoreCase", false);
- enablePositionIncrements = getBoolean(args, "enablePositionIncrements", false);
+ enablePositionIncrements = getBoolean(args, "enablePositionIncrements", true);
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
}
@@ -76,6 +76,12 @@
@Override
public TokenStream create(TokenStream input) {
// if the set is null, it means it was empty
- return words == null ? input : new KeepWordFilter(enablePositionIncrements, input, words);
+ if (words == null) {
+ return input;
+ } else {
+ @SuppressWarnings("deprecation")
+ final TokenStream filter = new KeepWordFilter(luceneMatchVersion, enablePositionIncrements, input, words);
+ return filter;
+ }
}
}
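Because the default flips to true here, configurations that previously omitted enablePositionIncrements silently change behavior on upgrade. A hedged sketch of driving the factory directly (the args map mirrors the XML attributes shown in the javadoc above; actually loading the word file would additionally require calling inform() with a ResourceLoader):

    Map<String,String> args = new HashMap<String,String>();
    args.put("luceneMatchVersion", "4.4");
    args.put("words", "keepwords.txt");
    KeepWordFilterFactory factory = new KeepWordFilterFactory(args);
    // enablePositionIncrements now defaults to true when omitted
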
Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java
===================================================================
--- lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java (revision 1477238)
+++ lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java (working copy)
@@ -73,7 +73,7 @@
* @param maxGram the largest n-gram to generate
*/
public NGramTokenFilter(Version version, TokenStream input, int minGram, int maxGram) {
- super(new LengthFilter(true, input, minGram, Integer.MAX_VALUE));
+ super(new LengthFilter(version, input, minGram, Integer.MAX_VALUE));
this.version = version;
if (minGram < 1) {
throw new IllegalArgumentException("minGram must be greater than zero");
Index: lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseAnalyzer.java
===================================================================
--- lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseAnalyzer.java (revision 1477238)
+++ lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseAnalyzer.java (working copy)
@@ -89,7 +89,7 @@
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new JapaneseTokenizer(reader, userDict, true, mode);
TokenStream stream = new JapaneseBaseFormFilter(tokenizer);
- stream = new JapanesePartOfSpeechStopFilter(true, stream, stoptags);
+ stream = new JapanesePartOfSpeechStopFilter(matchVersion, stream, stoptags);
stream = new CJKWidthFilter(stream);
stream = new StopFilter(matchVersion, stream, stopwords);
stream = new JapaneseKatakanaStemFilter(stream);
Index: lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapanesePartOfSpeechStopFilterFactory.java
===================================================================
--- lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapanesePartOfSpeechStopFilterFactory.java (revision 1477238)
+++ lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapanesePartOfSpeechStopFilterFactory.java (working copy)
@@ -50,7 +50,7 @@
public JapanesePartOfSpeechStopFilterFactory(Map<String,String> args) {
super(args);
stopTagFiles = get(args, "tags");
- enablePositionIncrements = getBoolean(args, "enablePositionIncrements", false);
+ enablePositionIncrements = getBoolean(args, "enablePositionIncrements", true);
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
}
@@ -72,6 +72,12 @@
@Override
public TokenStream create(TokenStream stream) {
// if stoptags is null, it means the file is empty
- return stopTags == null ? stream : new JapanesePartOfSpeechStopFilter(enablePositionIncrements, stream, stopTags);
+ if (stopTags != null) {
+ @SuppressWarnings("deprecation")
+ final TokenStream filter = new JapanesePartOfSpeechStopFilter(luceneMatchVersion, enablePositionIncrements, stream, stopTags);
+ return filter;
+ } else {
+ return stream;
+ }
}
}
Index: lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapanesePartOfSpeechStopFilter.java
===================================================================
--- lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapanesePartOfSpeechStopFilter.java (revision 1477238)
+++ lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapanesePartOfSpeechStopFilter.java (working copy)
@@ -22,6 +22,7 @@
import org.apache.lucene.analysis.ja.tokenattributes.PartOfSpeechAttribute;
import org.apache.lucene.analysis.util.FilteringTokenFilter;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.util.Version;
/**
* Removes tokens that match a set of part-of-speech tags.
@@ -30,11 +31,24 @@
private final Set<String> stopTags;
private final PartOfSpeechAttribute posAtt = addAttribute(PartOfSpeechAttribute.class);
- public JapanesePartOfSpeechStopFilter(boolean enablePositionIncrements, TokenStream input, Set<String> stopTags) {
- super(enablePositionIncrements, input);
+ /** @deprecated enablePositionIncrements=false is no longer supported as of Lucene 4.4. */
+ @Deprecated
+ public JapanesePartOfSpeechStopFilter(Version version, boolean enablePositionIncrements, TokenStream input, Set<String> stopTags) {
+ super(version, enablePositionIncrements, input);
this.stopTags = stopTags;
}
+ /**
+ * Create a new {@link JapanesePartOfSpeechStopFilter}.
+ * @param version the Lucene match version
+ * @param input the {@link TokenStream} to consume
+ * @param stopTags the part-of-speech tags that should be removed
+ */
+ public JapanesePartOfSpeechStopFilter(Version version, TokenStream input, Set<String> stopTags) {
+ super(version, input);
+ this.stopTags = stopTags;
+ }
+
@Override
protected boolean accept() {
final String pos = posAtt.getPartOfSpeech();
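A minimal chain using the new constructor, mirroring the JapaneseAnalyzer hunk above; the sample stop tag and the SEARCH mode are assumptions for illustration, not taken from this patch:

    Set<String> stopTags = new HashSet<String>(Arrays.asList("助詞-格助詞-一般"));
    TokenStream ts = new JapaneseTokenizer(reader, null, true, JapaneseTokenizer.Mode.SEARCH);
    ts = new JapanesePartOfSpeechStopFilter(Version.LUCENE_44, ts, stopTags);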