blob: d406e593ed8c9f6eea818d363c87f020dca130da [file] [log] [blame]
Index: NOTICE.txt
===================================================================
--- NOTICE.txt (revision 906571)
+++ NOTICE.txt (working copy)
@@ -23,6 +23,11 @@
contrib/analyzers/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt.
See http://members.unine.ch/jacques.savoy/clef/index.html.
+The Romanian analyzer (contrib/analyzers) comes with a default
+stopword list that is BSD-licensed created by Jacques Savoy. The file resides in
+contrib/analyzers/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt.
+See http://members.unine.ch/jacques.savoy/clef/index.html.
+
The Bulgarian analyzer (contrib/analyzers) comes with a default
stopword list that is BSD-licensed created by Jacques Savoy. The file resides in
contrib/analyzers/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt.
Index: contrib/analyzers/common/src/test/org/apache/lucene/analysis/da/TestDanishAnalyzer.java
===================================================================
--- contrib/analyzers/common/src/test/org/apache/lucene/analysis/da/TestDanishAnalyzer.java (revision 0)
+++ contrib/analyzers/common/src/test/org/apache/lucene/analysis/da/TestDanishAnalyzer.java (revision 0)
@@ -0,0 +1,54 @@
+package org.apache.lucene.analysis.da;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.util.Version;
+
+public class TestDanishAnalyzer extends BaseTokenStreamTestCase {
+ /** This test fails with NPE when the
+ * stopwords file is missing in classpath */
+ public void testResourcesAvailable() {
+ new DanishAnalyzer(Version.LUCENE_CURRENT);
+ }
+
+ /** test stopwords and stemming */
+ public void testBasics() throws IOException {
+ Analyzer a = new DanishAnalyzer(Version.LUCENE_CURRENT);
+ // stemming
+ checkOneTermReuse(a, "undersøg", "undersøg");
+ checkOneTermReuse(a, "undersøgelse", "undersøg");
+ // stopword
+ assertAnalyzesTo(a, "på", new String[] {});
+ }
+
+ /** test use of exclusion set */
+ public void testExclude() throws IOException {
+ Set<String> exclusionSet = new HashSet<String>();
+ exclusionSet.add("undersøgelse");
+ Analyzer a = new DanishAnalyzer(Version.LUCENE_CURRENT,
+ DanishAnalyzer.getDefaultStopSet(), exclusionSet);
+ checkOneTermReuse(a, "undersøgelse", "undersøgelse");
+ checkOneTermReuse(a, "undersøg", "undersøg");
+ }
+}
Property changes on: contrib\analyzers\common\src\test\org\apache\lucene\analysis\da\TestDanishAnalyzer.java
___________________________________________________________________
Added: svn:eol-style
+ native
Index: contrib/analyzers/common/src/test/org/apache/lucene/analysis/de/TestGermanAnalyzer.java
===================================================================
--- contrib/analyzers/common/src/test/org/apache/lucene/analysis/de/TestGermanAnalyzer.java (revision 0)
+++ contrib/analyzers/common/src/test/org/apache/lucene/analysis/de/TestGermanAnalyzer.java (revision 0)
@@ -0,0 +1,93 @@
+package org.apache.lucene.analysis.de;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
+import org.apache.lucene.analysis.LowerCaseTokenizer;
+import org.apache.lucene.util.Version;
+
+public class TestGermanAnalyzer extends BaseTokenStreamTestCase {
+ public void testReusableTokenStream() throws Exception {
+ Analyzer a = new GermanAnalyzer(Version.LUCENE_CURRENT);
+ checkOneTermReuse(a, "Tisch", "tisch");
+ checkOneTermReuse(a, "Tische", "tisch");
+ checkOneTermReuse(a, "Tischen", "tisch");
+ }
+
+ public void testExclusionTableBWCompat() throws IOException {
+ GermanStemFilter filter = new GermanStemFilter(new LowerCaseTokenizer(Version.LUCENE_CURRENT,
+ new StringReader("Fischen Trinken")));
+ CharArraySet set = new CharArraySet(Version.LUCENE_CURRENT, 1, true);
+ set.add("fischen");
+ filter.setExclusionSet(set);
+ assertTokenStreamContents(filter, new String[] { "fischen", "trink" });
+ }
+
+ public void testWithKeywordAttribute() throws IOException {
+ CharArraySet set = new CharArraySet(Version.LUCENE_CURRENT, 1, true);
+ set.add("fischen");
+ GermanStemFilter filter = new GermanStemFilter(
+ new KeywordMarkerTokenFilter(new LowerCaseTokenizer(Version.LUCENE_CURRENT, new StringReader(
+ "Fischen Trinken")), set));
+ assertTokenStreamContents(filter, new String[] { "fischen", "trink" });
+ }
+
+ public void testWithKeywordAttributeAndExclusionTable() throws IOException {
+ CharArraySet set = new CharArraySet(Version.LUCENE_CURRENT, 1, true);
+ set.add("fischen");
+ CharArraySet set1 = new CharArraySet(Version.LUCENE_CURRENT, 1, true);
+ set1.add("trinken");
+ set1.add("fischen");
+ GermanStemFilter filter = new GermanStemFilter(
+ new KeywordMarkerTokenFilter(new LowerCaseTokenizer(Version.LUCENE_CURRENT, new StringReader(
+ "Fischen Trinken")), set));
+ filter.setExclusionSet(set1);
+ assertTokenStreamContents(filter, new String[] { "fischen", "trinken" });
+ }
+
+ /*
+ * Test that changes to the exclusion table are applied immediately
+ * when using reusable token streams.
+ */
+ public void testExclusionTableReuse() throws Exception {
+ GermanAnalyzer a = new GermanAnalyzer(Version.LUCENE_CURRENT);
+ checkOneTermReuse(a, "tischen", "tisch");
+ a.setStemExclusionTable(new String[] { "tischen" });
+ checkOneTermReuse(a, "tischen", "tischen");
+ }
+
+ /** test some features of the new snowball filter
+ * these only pass with LUCENE_CURRENT, not if you use o.a.l.a.de.GermanStemmer
+ */
+ public void testGermanSpecials() throws Exception {
+ GermanAnalyzer a = new GermanAnalyzer(Version.LUCENE_CURRENT);
+ // a/o/u + e is equivalent to the umlaut form
+ checkOneTermReuse(a, "Schaltflächen", "schaltflach");
+ checkOneTermReuse(a, "Schaltflaechen", "schaltflach");
+ // here they are with the old stemmer
+ a = new GermanAnalyzer(Version.LUCENE_30);
+ checkOneTermReuse(a, "Schaltflächen", "schaltflach");
+ checkOneTermReuse(a, "Schaltflaechen", "schaltflaech");
+ }
+}
Property changes on: contrib\analyzers\common\src\test\org\apache\lucene\analysis\de\TestGermanAnalyzer.java
___________________________________________________________________
Added: svn:eol-style
+ native
Index: contrib/analyzers/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java
===================================================================
--- contrib/analyzers/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java (revision 906571)
+++ contrib/analyzers/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java (working copy)
@@ -20,15 +20,14 @@
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
-import java.io.IOException;
import java.io.InputStreamReader;
import java.io.StringReader;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.CharArraySet;
-import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
-import org.apache.lucene.analysis.LowerCaseTokenizer;
+import org.apache.lucene.analysis.KeywordTokenizer;
+import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.util.Version;
/**
@@ -40,6 +39,8 @@
public class TestGermanStemFilter extends BaseTokenStreamTestCase {
public void testStemming() throws Exception {
+ Tokenizer tokenizer = new KeywordTokenizer(new StringReader(""));
+ TokenFilter filter = new GermanStemFilter(new LowerCaseFilter(Version.LUCENE_CURRENT, tokenizer));
// read test cases from external file:
File dataDir = new File(System.getProperty("dataDir", "./bin"));
File testFile = new File(dataDir, "org/apache/lucene/analysis/de/data.txt");
@@ -55,68 +56,12 @@
continue; // ignore comments and empty lines
String[] parts = line.split(";");
//System.out.println(parts[0] + " -- " + parts[1]);
- check(parts[0], parts[1]);
+ tokenizer.reset(new StringReader(parts[0]));
+ filter.reset();
+ assertTokenStreamContents(filter, new String[] { parts[1] });
}
breader.close();
isr.close();
fis.close();
}
-
- public void testReusableTokenStream() throws Exception {
- Analyzer a = new GermanAnalyzer(Version.LUCENE_CURRENT);
- checkReuse(a, "Tisch", "tisch");
- checkReuse(a, "Tische", "tisch");
- checkReuse(a, "Tischen", "tisch");
- }
-
- public void testExclusionTableBWCompat() throws IOException {
- GermanStemFilter filter = new GermanStemFilter(new LowerCaseTokenizer(Version.LUCENE_CURRENT,
- new StringReader("Fischen Trinken")));
- CharArraySet set = new CharArraySet(Version.LUCENE_CURRENT, 1, true);
- set.add("fischen");
- filter.setExclusionSet(set);
- assertTokenStreamContents(filter, new String[] { "fischen", "trink" });
- }
-
- public void testWithKeywordAttribute() throws IOException {
- CharArraySet set = new CharArraySet(Version.LUCENE_CURRENT, 1, true);
- set.add("fischen");
- GermanStemFilter filter = new GermanStemFilter(
- new KeywordMarkerTokenFilter(new LowerCaseTokenizer(Version.LUCENE_CURRENT, new StringReader(
- "Fischen Trinken")), set));
- assertTokenStreamContents(filter, new String[] { "fischen", "trink" });
- }
-
- public void testWithKeywordAttributeAndExclusionTable() throws IOException {
- CharArraySet set = new CharArraySet(Version.LUCENE_CURRENT, 1, true);
- set.add("fischen");
- CharArraySet set1 = new CharArraySet(Version.LUCENE_CURRENT, 1, true);
- set1.add("trinken");
- set1.add("fischen");
- GermanStemFilter filter = new GermanStemFilter(
- new KeywordMarkerTokenFilter(new LowerCaseTokenizer(Version.LUCENE_CURRENT, new StringReader(
- "Fischen Trinken")), set));
- filter.setExclusionSet(set1);
- assertTokenStreamContents(filter, new String[] { "fischen", "trinken" });
- }
-
- /*
- * Test that changes to the exclusion table are applied immediately
- * when using reusable token streams.
- */
- public void testExclusionTableReuse() throws Exception {
- GermanAnalyzer a = new GermanAnalyzer(Version.LUCENE_CURRENT);
- checkReuse(a, "tischen", "tisch");
- a.setStemExclusionTable(new String[] { "tischen" });
- checkReuse(a, "tischen", "tischen");
- }
-
-
- private void check(final String input, final String expected) throws Exception {
- checkOneTerm(new GermanAnalyzer(Version.LUCENE_CURRENT), input, expected);
- }
-
- private void checkReuse(Analyzer a, String input, String expected) throws Exception {
- checkOneTermReuse(a, input, expected);
- }
}
Index: contrib/analyzers/common/src/test/org/apache/lucene/analysis/sv/TestSwedishAnalyzer.java
===================================================================
--- contrib/analyzers/common/src/test/org/apache/lucene/analysis/sv/TestSwedishAnalyzer.java (revision 0)
+++ contrib/analyzers/common/src/test/org/apache/lucene/analysis/sv/TestSwedishAnalyzer.java (revision 0)
@@ -0,0 +1,54 @@
+package org.apache.lucene.analysis.sv;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.util.Version;
+
+public class TestSwedishAnalyzer extends BaseTokenStreamTestCase {
+ /** This test fails with NPE when the
+ * stopwords file is missing in classpath */
+ public void testResourcesAvailable() {
+ new SwedishAnalyzer(Version.LUCENE_CURRENT);
+ }
+
+ /** test stopwords and stemming */
+ public void testBasics() throws IOException {
+ Analyzer a = new SwedishAnalyzer(Version.LUCENE_CURRENT);
+ // stemming
+ checkOneTermReuse(a, "jaktkarlarne", "jaktkarl");
+ checkOneTermReuse(a, "jaktkarlens", "jaktkarl");
+ // stopword
+ assertAnalyzesTo(a, "och", new String[] {});
+ }
+
+ /** test use of exclusion set */
+ public void testExclude() throws IOException {
+ Set<String> exclusionSet = new HashSet<String>();
+ exclusionSet.add("jaktkarlarne");
+ Analyzer a = new SwedishAnalyzer(Version.LUCENE_CURRENT,
+ SwedishAnalyzer.getDefaultStopSet(), exclusionSet);
+ checkOneTermReuse(a, "jaktkarlarne", "jaktkarlarne");
+ checkOneTermReuse(a, "jaktkarlens", "jaktkarl");
+ }
+}
Property changes on: contrib\analyzers\common\src\test\org\apache\lucene\analysis\sv\TestSwedishAnalyzer.java
___________________________________________________________________
Added: svn:eol-style
+ native
Index: contrib/analyzers/common/src/test/org/apache/lucene/analysis/fi/TestFinnishAnalyzer.java
===================================================================
--- contrib/analyzers/common/src/test/org/apache/lucene/analysis/fi/TestFinnishAnalyzer.java (revision 0)
+++ contrib/analyzers/common/src/test/org/apache/lucene/analysis/fi/TestFinnishAnalyzer.java (revision 0)
@@ -0,0 +1,54 @@
+package org.apache.lucene.analysis.fi;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.util.Version;
+
+public class TestFinnishAnalyzer extends BaseTokenStreamTestCase {
+ /** This test fails with NPE when the
+ * stopwords file is missing in classpath */
+ public void testResourcesAvailable() {
+ new FinnishAnalyzer(Version.LUCENE_CURRENT);
+ }
+
+ /** test stopwords and stemming */
+ public void testBasics() throws IOException {
+ Analyzer a = new FinnishAnalyzer(Version.LUCENE_CURRENT);
+ // stemming
+ checkOneTermReuse(a, "edeltäjiinsä", "edeltäj");
+ checkOneTermReuse(a, "edeltäjistään", "edeltäj");
+ // stopword
+ assertAnalyzesTo(a, "olla", new String[] {});
+ }
+
+ /** test use of exclusion set */
+ public void testExclude() throws IOException {
+ Set<String> exclusionSet = new HashSet<String>();
+ exclusionSet.add("edeltäjistään");
+ Analyzer a = new FinnishAnalyzer(Version.LUCENE_CURRENT,
+ FinnishAnalyzer.getDefaultStopSet(), exclusionSet);
+ checkOneTermReuse(a, "edeltäjiinsä", "edeltäj");
+ checkOneTermReuse(a, "edeltäjistään", "edeltäjistään");
+ }
+}
Property changes on: contrib\analyzers\common\src\test\org\apache\lucene\analysis\fi\TestFinnishAnalyzer.java
___________________________________________________________________
Added: svn:eol-style
+ native
Index: contrib/analyzers/common/src/test/org/apache/lucene/analysis/miscellaneous/TestStemmerOverrideFilter.java
===================================================================
--- contrib/analyzers/common/src/test/org/apache/lucene/analysis/miscellaneous/TestStemmerOverrideFilter.java (revision 0)
+++ contrib/analyzers/common/src/test/org/apache/lucene/analysis/miscellaneous/TestStemmerOverrideFilter.java (revision 0)
@@ -0,0 +1,44 @@
+package org.apache.lucene.analysis.miscellaneous;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.KeywordTokenizer;
+import org.apache.lucene.analysis.PorterStemFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.util.Version;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+public class TestStemmerOverrideFilter extends BaseTokenStreamTestCase {
+ public void testOverride() throws IOException {
+ // lets make booked stem to books
+ // the override filter will convert "booked" to "books",
+ // but also mark it with KeywordAttribute so Porter will not change it.
+ Map<String,String> dictionary = new HashMap<String,String>();
+ dictionary.put("booked", "books");
+ Tokenizer tokenizer = new KeywordTokenizer(new StringReader("booked"));
+ TokenStream stream = new PorterStemFilter(
+ new StemmerOverrideFilter(Version.LUCENE_CURRENT, tokenizer, dictionary));
+ assertTokenStreamContents(stream, new String[] { "books" });
+ }
+}
Property changes on: contrib\analyzers\common\src\test\org\apache\lucene\analysis\miscellaneous\TestStemmerOverrideFilter.java
___________________________________________________________________
Added: svn:eol-style
+ native
Index: contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java
===================================================================
--- contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java (revision 906571)
+++ contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java (working copy)
@@ -18,10 +18,8 @@
*/
-import java.io.IOException;
import java.io.StringReader;
-import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
/**
Index: contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java
===================================================================
--- contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java (revision 906571)
+++ contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java (working copy)
@@ -22,7 +22,6 @@
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.util.Version;
-import java.io.IOException;
import java.io.StringReader;
/**
Index: contrib/analyzers/common/src/test/org/apache/lucene/analysis/fr/TestFrenchAnalyzer.java
===================================================================
--- contrib/analyzers/common/src/test/org/apache/lucene/analysis/fr/TestFrenchAnalyzer.java (revision 906571)
+++ contrib/analyzers/common/src/test/org/apache/lucene/analysis/fr/TestFrenchAnalyzer.java (working copy)
@@ -17,6 +17,8 @@
* limitations under the License.
*/
+import java.io.IOException;
+
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.util.Version;
@@ -113,6 +115,94 @@
}
+ /**
+ * @deprecated remove this test for Lucene 4.0
+ */
+ @Deprecated
+ public void testAnalyzer30() throws Exception {
+ FrenchAnalyzer fa = new FrenchAnalyzer(Version.LUCENE_30);
+
+ assertAnalyzesTo(fa, "", new String[] {
+ });
+
+ assertAnalyzesTo(
+ fa,
+ "chien chat cheval",
+ new String[] { "chien", "chat", "cheval" });
+
+ assertAnalyzesTo(
+ fa,
+ "chien CHAT CHEVAL",
+ new String[] { "chien", "chat", "cheval" });
+
+ assertAnalyzesTo(
+ fa,
+ " chien ,? + = - CHAT /: > CHEVAL",
+ new String[] { "chien", "chat", "cheval" });
+
+ assertAnalyzesTo(fa, "chien++", new String[] { "chien" });
+
+ assertAnalyzesTo(
+ fa,
+ "mot \"entreguillemet\"",
+ new String[] { "mot", "entreguillemet" });
+
+ // let's do some french specific tests now
+
+ /* 1. couldn't resist
+ I would expect this to stay one term as in French the minus
+ sign is often used for composing words */
+ assertAnalyzesTo(
+ fa,
+ "Jean-François",
+ new String[] { "jean", "françois" });
+
+ // 2. stopwords
+ assertAnalyzesTo(
+ fa,
+ "le la chien les aux chat du des à cheval",
+ new String[] { "chien", "chat", "cheval" });
+
+ // some nouns and adjectives
+ assertAnalyzesTo(
+ fa,
+ "lances chismes habitable chiste éléments captifs",
+ new String[] {
+ "lanc",
+ "chism",
+ "habit",
+ "chist",
+ "élément",
+ "captif" });
+
+ // some verbs
+ assertAnalyzesTo(
+ fa,
+ "finissions souffrirent rugissante",
+ new String[] { "fin", "souffr", "rug" });
+
+ // some everything else
+ // aujourd'hui stays one term which is OK
+ assertAnalyzesTo(
+ fa,
+ "C3PO aujourd'hui oeuf ïâöûàä anticonstitutionnellement Java++ ",
+ new String[] {
+ "c3po",
+ "aujourd'hui",
+ "oeuf",
+ "ïâöûàä",
+ "anticonstitutionnel",
+ "jav" });
+
+ // some more everything else
+ // here 1940-1945 stays as one term, 1940:1945 not ?
+ assertAnalyzesTo(
+ fa,
+ "33Bis 1940-1945 1940:1945 (---i+++)*",
+ new String[] { "33bis", "1940-1945", "1940", "1945", "i" });
+
+ }
+
public void testReusableTokenStream() throws Exception {
FrenchAnalyzer fa = new FrenchAnalyzer(Version.LUCENE_CURRENT);
// stopwords
@@ -157,4 +247,28 @@
assertAnalyzesTo(fa, "habitable chiste", new String[] { "habitable",
"chist" });
}
+
+ public void testElision() throws Exception {
+ FrenchAnalyzer fa = new FrenchAnalyzer(Version.LUCENE_CURRENT);
+ assertAnalyzesTo(fa, "voir l'embrouille", new String[] { "voir", "embrouill" });
+ }
+
+ /**
+ * Prior to 3.1, this analyzer had no lowercase filter.
+ * stopwords were case sensitive. Preserve this for back compat.
+ * @deprecated Remove this test in Lucene 4.0
+ */
+ @Deprecated
+ public void testBuggyStopwordsCasing() throws IOException {
+ FrenchAnalyzer a = new FrenchAnalyzer(Version.LUCENE_30);
+ assertAnalyzesTo(a, "Votre", new String[] { "votr" });
+ }
+
+ /**
+ * Test that stopwords are not case sensitive
+ */
+ public void testStopwordsCasing() throws IOException {
+ FrenchAnalyzer a = new FrenchAnalyzer(Version.LUCENE_31);
+ assertAnalyzesTo(a, "Votre", new String[] { });
+ }
}
Index: contrib/analyzers/common/src/test/org/apache/lucene/analysis/nl/TestDutchStemmer.java
===================================================================
--- contrib/analyzers/common/src/test/org/apache/lucene/analysis/nl/TestDutchStemmer.java (revision 906571)
+++ contrib/analyzers/common/src/test/org/apache/lucene/analysis/nl/TestDutchStemmer.java (working copy)
@@ -100,9 +100,6 @@
check("ophalend", "ophal");
check("ophalers", "ophaler");
check("ophef", "ophef");
- check("opheffen", "ophef"); // versus snowball 'opheff'
- check("opheffende", "ophef"); // versus snowball 'opheff'
- check("opheffing", "ophef"); // versus snowball 'opheff'
check("opheldering", "ophelder");
check("ophemelde", "ophemeld");
check("ophemelen", "ophemel");
@@ -118,6 +115,24 @@
check("ophouden", "ophoud");
}
+ /**
+ * @deprecated remove this test in Lucene 4.0
+ */
+ @Deprecated
+ public void testOldBuggyStemmer() throws Exception {
+ Analyzer a = new DutchAnalyzer(Version.LUCENE_30);
+ checkOneTermReuse(a, "opheffen", "ophef"); // versus snowball 'opheff'
+ checkOneTermReuse(a, "opheffende", "ophef"); // versus snowball 'opheff'
+ checkOneTermReuse(a, "opheffing", "ophef"); // versus snowball 'opheff'
+ }
+
+ public void testSnowballCorrectness() throws Exception {
+ Analyzer a = new DutchAnalyzer(Version.LUCENE_CURRENT);
+ checkOneTermReuse(a, "opheffen", "opheff");
+ checkOneTermReuse(a, "opheffende", "opheff");
+ checkOneTermReuse(a, "opheffing", "opheff");
+ }
+
public void testReusableTokenStream() throws Exception {
Analyzer a = new DutchAnalyzer(Version.LUCENE_CURRENT);
checkOneTermReuse(a, "lichaamsziek", "lichaamsziek");
@@ -161,6 +176,25 @@
checkOneTermReuse(a, "lichamelijk", "somethingentirelydifferent");
}
+ /**
+ * Prior to 3.1, this analyzer had no lowercase filter.
+ * stopwords were case sensitive. Preserve this for back compat.
+ * @deprecated Remove this test in Lucene 4.0
+ */
+ @Deprecated
+ public void testBuggyStopwordsCasing() throws IOException {
+ DutchAnalyzer a = new DutchAnalyzer(Version.LUCENE_30);
+ assertAnalyzesTo(a, "Zelf", new String[] { "zelf" });
+ }
+
+ /**
+ * Test that stopwords are not case sensitive
+ */
+ public void testStopwordsCasing() throws IOException {
+ DutchAnalyzer a = new DutchAnalyzer(Version.LUCENE_31);
+ assertAnalyzesTo(a, "Zelf", new String[] { });
+ }
+
private void check(final String input, final String expected) throws Exception {
checkOneTerm(new DutchAnalyzer(Version.LUCENE_CURRENT), input, expected);
}
Index: contrib/analyzers/common/src/test/org/apache/lucene/analysis/hu/TestHungarianAnalyzer.java
===================================================================
--- contrib/analyzers/common/src/test/org/apache/lucene/analysis/hu/TestHungarianAnalyzer.java (revision 0)
+++ contrib/analyzers/common/src/test/org/apache/lucene/analysis/hu/TestHungarianAnalyzer.java (revision 0)
@@ -0,0 +1,54 @@
+package org.apache.lucene.analysis.hu;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.util.Version;
+
+public class TestHungarianAnalyzer extends BaseTokenStreamTestCase {
+ /** This test fails with NPE when the
+ * stopwords file is missing in classpath */
+ public void testResourcesAvailable() {
+ new HungarianAnalyzer(Version.LUCENE_CURRENT);
+ }
+
+ /** test stopwords and stemming */
+ public void testBasics() throws IOException {
+ Analyzer a = new HungarianAnalyzer(Version.LUCENE_CURRENT);
+ // stemming
+ checkOneTermReuse(a, "babakocsi", "babakocs");
+ checkOneTermReuse(a, "babakocsijáért", "babakocs");
+ // stopword
+ assertAnalyzesTo(a, "által", new String[] {});
+ }
+
+ /** test use of exclusion set */
+ public void testExclude() throws IOException {
+ Set<String> exclusionSet = new HashSet<String>();
+ exclusionSet.add("babakocsi");
+ Analyzer a = new HungarianAnalyzer(Version.LUCENE_CURRENT,
+ HungarianAnalyzer.getDefaultStopSet(), exclusionSet);
+ checkOneTermReuse(a, "babakocsi", "babakocsi");
+ checkOneTermReuse(a, "babakocsijáért", "babakocs");
+ }
+}
Property changes on: contrib\analyzers\common\src\test\org\apache\lucene\analysis\hu\TestHungarianAnalyzer.java
___________________________________________________________________
Added: svn:eol-style
+ native
Index: contrib/analyzers/common/src/test/org/apache/lucene/analysis/no/TestNorwegianAnalyzer.java
===================================================================
--- contrib/analyzers/common/src/test/org/apache/lucene/analysis/no/TestNorwegianAnalyzer.java (revision 0)
+++ contrib/analyzers/common/src/test/org/apache/lucene/analysis/no/TestNorwegianAnalyzer.java (revision 0)
@@ -0,0 +1,54 @@
+package org.apache.lucene.analysis.no;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.util.Version;
+
+public class TestNorwegianAnalyzer extends BaseTokenStreamTestCase {
+ /** This test fails with NPE when the
+ * stopwords file is missing in classpath */
+ public void testResourcesAvailable() {
+ new NorwegianAnalyzer(Version.LUCENE_CURRENT);
+ }
+
+ /** test stopwords and stemming */
+ public void testBasics() throws IOException {
+ Analyzer a = new NorwegianAnalyzer(Version.LUCENE_CURRENT);
+ // stemming
+ checkOneTermReuse(a, "havnedistriktene", "havnedistrikt");
+ checkOneTermReuse(a, "havnedistrikter", "havnedistrikt");
+ // stopword
+ assertAnalyzesTo(a, "det", new String[] {});
+ }
+
+ /** test use of exclusion set */
+ public void testExclude() throws IOException {
+ Set<String> exclusionSet = new HashSet<String>();
+ exclusionSet.add("havnedistriktene");
+ Analyzer a = new NorwegianAnalyzer(Version.LUCENE_CURRENT,
+ NorwegianAnalyzer.getDefaultStopSet(), exclusionSet);
+ checkOneTermReuse(a, "havnedistriktene", "havnedistriktene");
+ checkOneTermReuse(a, "havnedistrikter", "havnedistrikt");
+ }
+}
Property changes on: contrib\analyzers\common\src\test\org\apache\lucene\analysis\no\TestNorwegianAnalyzer.java
___________________________________________________________________
Added: svn:eol-style
+ native
Index: contrib/analyzers/common/src/test/org/apache/lucene/analysis/ro/TestRomanianAnalyzer.java
===================================================================
--- contrib/analyzers/common/src/test/org/apache/lucene/analysis/ro/TestRomanianAnalyzer.java (revision 0)
+++ contrib/analyzers/common/src/test/org/apache/lucene/analysis/ro/TestRomanianAnalyzer.java (revision 0)
@@ -0,0 +1,54 @@
+package org.apache.lucene.analysis.ro;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.util.Version;
+
+public class TestRomanianAnalyzer extends BaseTokenStreamTestCase {
+ /** This test fails with NPE when the
+ * stopwords file is missing from the classpath */
+ public void testResourcesAvailable() {
+ new RomanianAnalyzer(Version.LUCENE_CURRENT);
+ }
+
+ /** test stopwords and stemming */
+ public void testBasics() throws IOException {
+ Analyzer a = new RomanianAnalyzer(Version.LUCENE_CURRENT);
+ // stemming
+ checkOneTermReuse(a, "absenţa", "absenţ");
+ checkOneTermReuse(a, "absenţi", "absenţ");
+ // stopword
+ assertAnalyzesTo(a, "îl", new String[] {});
+ }
+
+ /** test use of exclusion set */
+ public void testExclude() throws IOException {
+ Set<String> exclusionSet = new HashSet<String>();
+ exclusionSet.add("absenţa");
+ Analyzer a = new RomanianAnalyzer(Version.LUCENE_CURRENT,
+ RomanianAnalyzer.getDefaultStopSet(), exclusionSet);
+ checkOneTermReuse(a, "absenţa", "absenţa");
+ checkOneTermReuse(a, "absenţi", "absenţ");
+ }
+}
Property changes on: contrib\analyzers\common\src\test\org\apache\lucene\analysis\ro\TestRomanianAnalyzer.java
___________________________________________________________________
Added: svn:eol-style
+ native
Index: contrib/analyzers/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseAnalyzer.java
===================================================================
--- contrib/analyzers/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseAnalyzer.java (revision 0)
+++ contrib/analyzers/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseAnalyzer.java (revision 0)
@@ -0,0 +1,54 @@
+package org.apache.lucene.analysis.pt;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.util.Version;
+
+public class TestPortugueseAnalyzer extends BaseTokenStreamTestCase {
+ /** This test fails with NPE when the
+ * stopwords file is missing from the classpath */
+ public void testResourcesAvailable() {
+ new PortugueseAnalyzer(Version.LUCENE_CURRENT);
+ }
+
+ /** test stopwords and stemming */
+ public void testBasics() throws IOException {
+ Analyzer a = new PortugueseAnalyzer(Version.LUCENE_CURRENT);
+ // stemming
+ checkOneTermReuse(a, "quilométricas", "quilométr");
+ checkOneTermReuse(a, "quilométricos", "quilométr");
+ // stopword
+ assertAnalyzesTo(a, "não", new String[] {});
+ }
+
+ /** test use of exclusion set */
+ public void testExclude() throws IOException {
+ Set<String> exclusionSet = new HashSet<String>();
+ exclusionSet.add("quilométricas");
+ Analyzer a = new PortugueseAnalyzer(Version.LUCENE_CURRENT,
+ PortugueseAnalyzer.getDefaultStopSet(), exclusionSet);
+ checkOneTermReuse(a, "quilométricas", "quilométricas");
+ checkOneTermReuse(a, "quilométricos", "quilométr");
+ }
+}
Property changes on: contrib\analyzers\common\src\test\org\apache\lucene\analysis\pt\TestPortugueseAnalyzer.java
___________________________________________________________________
Added: svn:eol-style
+ native
Index: contrib/analyzers/common/src/test/org/apache/lucene/analysis/tr/TestTurkishAnalyzer.java
===================================================================
--- contrib/analyzers/common/src/test/org/apache/lucene/analysis/tr/TestTurkishAnalyzer.java (revision 0)
+++ contrib/analyzers/common/src/test/org/apache/lucene/analysis/tr/TestTurkishAnalyzer.java (revision 0)
@@ -0,0 +1,54 @@
+package org.apache.lucene.analysis.tr;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.util.Version;
+
+public class TestTurkishAnalyzer extends BaseTokenStreamTestCase {
+ /** This test fails with NPE when the
+ * stopwords file is missing from the classpath */
+ public void testResourcesAvailable() {
+ new TurkishAnalyzer(Version.LUCENE_CURRENT);
+ }
+
+ /** test stopwords and stemming */
+ public void testBasics() throws IOException {
+ Analyzer a = new TurkishAnalyzer(Version.LUCENE_CURRENT);
+ // stemming
+ checkOneTermReuse(a, "ağacı", "ağaç");
+ checkOneTermReuse(a, "ağaç", "ağaç");
+ // stopword
+ assertAnalyzesTo(a, "dolayı", new String[] {});
+ }
+
+ /** test use of exclusion set */
+ public void testExclude() throws IOException {
+ Set<String> exclusionSet = new HashSet<String>();
+ exclusionSet.add("ağacı");
+ Analyzer a = new TurkishAnalyzer(Version.LUCENE_CURRENT,
+ TurkishAnalyzer.getDefaultStopSet(), exclusionSet);
+ checkOneTermReuse(a, "ağacı", "ağacı");
+ checkOneTermReuse(a, "ağaç", "ağaç");
+ }
+}
Property changes on: contrib\analyzers\common\src\test\org\apache\lucene\analysis\tr\TestTurkishAnalyzer.java
___________________________________________________________________
Added: svn:eol-style
+ native
Index: contrib/analyzers/common/src/test/org/apache/lucene/analysis/ru/TestRussianLetterTokenizer.java
===================================================================
--- contrib/analyzers/common/src/test/org/apache/lucene/analysis/ru/TestRussianLetterTokenizer.java (revision 906571)
+++ contrib/analyzers/common/src/test/org/apache/lucene/analysis/ru/TestRussianLetterTokenizer.java (working copy)
@@ -25,7 +25,9 @@
/**
* Testcase for {@link RussianLetterTokenizer}
+ * @deprecated Remove this test class in Lucene 4.0
*/
+@Deprecated
public class TestRussianLetterTokenizer extends BaseTokenStreamTestCase {
public void testRussianLetterTokenizer() throws IOException {
Index: contrib/analyzers/common/src/test/org/apache/lucene/analysis/ru/TestRussianAnalyzer.java
===================================================================
--- contrib/analyzers/common/src/test/org/apache/lucene/analysis/ru/TestRussianAnalyzer.java (revision 906571)
+++ contrib/analyzers/common/src/test/org/apache/lucene/analysis/ru/TestRussianAnalyzer.java (working copy)
@@ -50,9 +50,14 @@
dataDir = new File(System.getProperty("dataDir", "./bin"));
}
- public void testUnicode() throws IOException
+ /**
+ * @deprecated remove this test and its datafiles in Lucene 4.0
+ * the Snowball version has its own data tests.
+ */
+ @Deprecated
+ public void testUnicode30() throws IOException
{
- RussianAnalyzer ra = new RussianAnalyzer(Version.LUCENE_CURRENT);
+ RussianAnalyzer ra = new RussianAnalyzer(Version.LUCENE_30);
inWords =
new InputStreamReader(
new FileInputStream(new File(dataDir, "/org/apache/lucene/analysis/ru/testUTF8.txt")),
@@ -110,12 +115,22 @@
}
}
+ /** @deprecated remove this test in Lucene 4.0: stopwords changed */
+ @Deprecated
+ public void testReusableTokenStream30() throws Exception {
+ Analyzer a = new RussianAnalyzer(Version.LUCENE_30);
+ assertAnalyzesToReuse(a, "Вместе с тем о силе электромагнитной энергии имели представление еще",
+ new String[] { "вмест", "сил", "электромагнитн", "энерг", "имел", "представлен" });
+ assertAnalyzesToReuse(a, "Но знание это хранилось в тайне",
+ new String[] { "знан", "хран", "тайн" });
+ }
+
public void testReusableTokenStream() throws Exception {
Analyzer a = new RussianAnalyzer(Version.LUCENE_CURRENT);
assertAnalyzesToReuse(a, "Вместе с тем о силе электромагнитной энергии имели представление еще",
new String[] { "вмест", "сил", "электромагнитн", "энерг", "имел", "представлен" });
assertAnalyzesToReuse(a, "Но знание это хранилось в тайне",
- new String[] { "знан", "хран", "тайн" });
+ new String[] { "знан", "эт", "хран", "тайн" });
}
Index: contrib/analyzers/common/src/test/org/apache/lucene/analysis/ru/TestRussianStem.java
===================================================================
--- contrib/analyzers/common/src/test/org/apache/lucene/analysis/ru/TestRussianStem.java (revision 906571)
+++ contrib/analyzers/common/src/test/org/apache/lucene/analysis/ru/TestRussianStem.java (working copy)
@@ -24,6 +24,10 @@
import java.io.FileInputStream;
import java.util.ArrayList;
+/**
+ * @deprecated Remove this test class (and its datafiles!) in Lucene 4.0
+ */
+@Deprecated
public class TestRussianStem extends LuceneTestCase
{
private ArrayList words = new ArrayList();
Index: contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/TestShingleMatrixFilter.java
===================================================================
--- contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/TestShingleMatrixFilter.java (revision 906571)
+++ contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/TestShingleMatrixFilter.java (working copy)
@@ -22,11 +22,8 @@
import java.util.Collection;
import java.util.Iterator;
import java.util.LinkedList;
-import java.util.HashSet;
-import java.util.Arrays;
import org.apache.lucene.analysis.*;
-import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.miscellaneous.EmptyTokenStream;
import org.apache.lucene.analysis.miscellaneous.PrefixAndSuffixAwareTokenFilter;
import org.apache.lucene.analysis.miscellaneous.SingleTokenTokenStream;
Index: contrib/analyzers/common/src/test/org/apache/lucene/analysis/el/GreekAnalyzerTest.java
===================================================================
--- contrib/analyzers/common/src/test/org/apache/lucene/analysis/el/GreekAnalyzerTest.java (revision 906571)
+++ contrib/analyzers/common/src/test/org/apache/lucene/analysis/el/GreekAnalyzerTest.java (working copy)
@@ -18,7 +18,6 @@
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.util.Version;
/**
@@ -63,4 +62,23 @@
assertAnalyzesToReuse(a, "\u03a0\u03a1\u039f\u03ab\u03a0\u039f\u0398\u0395\u03a3\u0395\u0399\u03a3 \u0386\u03c8\u03bf\u03b3\u03bf\u03c2, \u03bf \u03bc\u03b5\u03c3\u03c4\u03cc\u03c2 \u03ba\u03b1\u03b9 \u03bf\u03b9 \u03ac\u03bb\u03bb\u03bf\u03b9",
new String[] { "\u03c0\u03c1\u03bf\u03c5\u03c0\u03bf\u03b8\u03b5\u03c3\u03b5\u03b9\u03c3", "\u03b1\u03c8\u03bf\u03b3\u03bf\u03c3", "\u03bc\u03b5\u03c3\u03c4\u03bf\u03c3", "\u03b1\u03bb\u03bb\u03bf\u03b9" });
}
+
+ /**
+ * Greek Analyzer didn't call standardFilter, so no normalization of acronyms.
+ * Check that this behavior is preserved.
+ * @deprecated remove this test in Lucene 4.0
+ */
+ @Deprecated
+ public void testAcronymBWCompat() throws Exception {
+ Analyzer a = new GreekAnalyzer(Version.LUCENE_30);
+ assertAnalyzesTo(a, "Α.Π.Τ.", new String[] { "α.π.τ." });
+ }
+
+ /**
+ * test that acronym normalization works
+ */
+ public void testAcronym() throws Exception {
+ Analyzer a = new GreekAnalyzer(Version.LUCENE_31);
+ assertAnalyzesTo(a, "Α.Π.Τ.", new String[] { "απτ" });
+ }
}
Index: contrib/analyzers/common/src/test/org/apache/lucene/analysis/en/TestEnglishAnalyzer.java
===================================================================
--- contrib/analyzers/common/src/test/org/apache/lucene/analysis/en/TestEnglishAnalyzer.java (revision 0)
+++ contrib/analyzers/common/src/test/org/apache/lucene/analysis/en/TestEnglishAnalyzer.java (revision 0)
@@ -0,0 +1,54 @@
+package org.apache.lucene.analysis.en;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.util.Version;
+
+public class TestEnglishAnalyzer extends BaseTokenStreamTestCase {
+ /** This test fails with NPE when the
+ * stopwords file is missing from the classpath */
+ public void testResourcesAvailable() {
+ new EnglishAnalyzer(Version.LUCENE_CURRENT);
+ }
+
+ /** test stopwords and stemming */
+ public void testBasics() throws IOException {
+ Analyzer a = new EnglishAnalyzer(Version.LUCENE_CURRENT);
+ // stemming
+ checkOneTermReuse(a, "books", "book");
+ checkOneTermReuse(a, "book", "book");
+ // stopword
+ assertAnalyzesTo(a, "the", new String[] {});
+ }
+
+ /** test use of exclusion set */
+ public void testExclude() throws IOException {
+ Set<String> exclusionSet = new HashSet<String>();
+ exclusionSet.add("books");
+ Analyzer a = new EnglishAnalyzer(Version.LUCENE_CURRENT,
+ EnglishAnalyzer.getDefaultStopSet(), exclusionSet);
+ checkOneTermReuse(a, "books", "books");
+ checkOneTermReuse(a, "book", "book");
+ }
+}
Property changes on: contrib\analyzers\common\src\test\org\apache\lucene\analysis\en\TestEnglishAnalyzer.java
___________________________________________________________________
Added: svn:eol-style
+ native
Index: contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java
===================================================================
--- contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java (revision 906571)
+++ contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java (working copy)
@@ -22,8 +22,6 @@
import java.util.HashSet;
import java.util.Set;
-import javax.print.DocFlavor.CHAR_ARRAY;
-
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.util.Version;
Index: contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicNormalizationFilter.java
===================================================================
--- contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicNormalizationFilter.java (revision 906571)
+++ contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicNormalizationFilter.java (working copy)
@@ -21,7 +21,6 @@
import java.io.StringReader;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.Version;
/**
Index: contrib/analyzers/common/src/test/org/apache/lucene/analysis/es/TestSpanishAnalyzer.java
===================================================================
--- contrib/analyzers/common/src/test/org/apache/lucene/analysis/es/TestSpanishAnalyzer.java (revision 0)
+++ contrib/analyzers/common/src/test/org/apache/lucene/analysis/es/TestSpanishAnalyzer.java (revision 0)
@@ -0,0 +1,54 @@
+package org.apache.lucene.analysis.es;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.util.Version;
+
+public class TestSpanishAnalyzer extends BaseTokenStreamTestCase {
+ /** This test fails with NPE when the
+ * stopwords file is missing from the classpath */
+ public void testResourcesAvailable() {
+ new SpanishAnalyzer(Version.LUCENE_CURRENT);
+ }
+
+ /** test stopwords and stemming */
+ public void testBasics() throws IOException {
+ Analyzer a = new SpanishAnalyzer(Version.LUCENE_CURRENT);
+ // stemming
+ checkOneTermReuse(a, "chicana", "chican");
+ checkOneTermReuse(a, "chicano", "chican");
+ // stopword
+ assertAnalyzesTo(a, "los", new String[] {});
+ }
+
+ /** test use of exclusion set */
+ public void testExclude() throws IOException {
+ Set<String> exclusionSet = new HashSet<String>();
+ exclusionSet.add("chicano");
+ Analyzer a = new SpanishAnalyzer(Version.LUCENE_CURRENT,
+ SpanishAnalyzer.getDefaultStopSet(), exclusionSet);
+ checkOneTermReuse(a, "chicana", "chican");
+ checkOneTermReuse(a, "chicano", "chicano");
+ }
+}
Property changes on: contrib\analyzers\common\src\test\org\apache\lucene\analysis\es\TestSpanishAnalyzer.java
___________________________________________________________________
Added: svn:eol-style
+ native
Index: contrib/analyzers/common/src/test/org/apache/lucene/analysis/it/TestItalianAnalyzer.java
===================================================================
--- contrib/analyzers/common/src/test/org/apache/lucene/analysis/it/TestItalianAnalyzer.java (revision 0)
+++ contrib/analyzers/common/src/test/org/apache/lucene/analysis/it/TestItalianAnalyzer.java (revision 0)
@@ -0,0 +1,54 @@
+package org.apache.lucene.analysis.it;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.util.Version;
+
+public class TestItalianAnalyzer extends BaseTokenStreamTestCase {
+ /** This test fails with NPE when the
+ * stopwords file is missing from the classpath */
+ public void testResourcesAvailable() {
+ new ItalianAnalyzer(Version.LUCENE_CURRENT);
+ }
+
+ /** test stopwords and stemming */
+ public void testBasics() throws IOException {
+ Analyzer a = new ItalianAnalyzer(Version.LUCENE_CURRENT);
+ // stemming
+ checkOneTermReuse(a, "abbandonata", "abbandon");
+ checkOneTermReuse(a, "abbandonati", "abbandon");
+ // stopword
+ assertAnalyzesTo(a, "dallo", new String[] {});
+ }
+
+ /** test use of exclusion set */
+ public void testExclude() throws IOException {
+ Set<String> exclusionSet = new HashSet<String>();
+ exclusionSet.add("abbandonata");
+ Analyzer a = new ItalianAnalyzer(Version.LUCENE_CURRENT,
+ ItalianAnalyzer.getDefaultStopSet(), exclusionSet);
+ checkOneTermReuse(a, "abbandonata", "abbandonata");
+ checkOneTermReuse(a, "abbandonati", "abbandon");
+ }
+}
Property changes on: contrib\analyzers\common\src\test\org\apache\lucene\analysis\it\TestItalianAnalyzer.java
___________________________________________________________________
Added: svn:eol-style
+ native
Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/da/DanishAnalyzer.java
===================================================================
--- contrib/analyzers/common/src/java/org/apache/lucene/analysis/da/DanishAnalyzer.java (revision 0)
+++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/da/DanishAnalyzer.java (revision 0)
@@ -0,0 +1,129 @@
+package org.apache.lucene.analysis.da;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+import java.util.Set;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
+import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.WordlistLoader;
+import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
+import org.apache.lucene.analysis.snowball.SnowballFilter;
+import org.apache.lucene.analysis.standard.StandardFilter;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.util.Version;
+import org.tartarus.snowball.ext.DanishStemmer;
+
+/**
+ * {@link Analyzer} for Danish.
+ */
+public final class DanishAnalyzer extends StopwordAnalyzerBase {
+ private final Set<?> stemExclusionSet;
+
+ /** File containing default Danish stopwords. */
+ public final static String DEFAULT_STOPWORD_FILE = "danish_stop.txt";
+
+ /**
+ * Returns an unmodifiable instance of the default stop words set.
+ * @return default stop words set.
+ */
+ public static Set<?> getDefaultStopSet(){
+ return DefaultSetHolder.DEFAULT_STOP_SET;
+ }
+
+ /**
+ * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
+ * accesses the static final set the first time.
+ */
+ private static class DefaultSetHolder {
+ static final Set<?> DEFAULT_STOP_SET;
+
+ static {
+ try {
+ DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class,
+ DEFAULT_STOPWORD_FILE);
+ } catch (IOException ex) {
+ // default set should always be present as it is part of the
+ // distribution (JAR)
+ throw new RuntimeException("Unable to load default stopword set");
+ }
+ }
+ }
+
+ /**
+ * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
+ */
+ public DanishAnalyzer(Version matchVersion) {
+ this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
+ }
+
+ /**
+ * Builds an analyzer with the given stop words.
+ *
+ * @param matchVersion lucene compatibility version
+ * @param stopwords a stopword set
+ */
+ public DanishAnalyzer(Version matchVersion, Set<?> stopwords) {
+ this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
+ }
+
+ /**
+ * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
+ * provided this analyzer will add a {@link KeywordMarkerTokenFilter} before
+ * stemming.
+ *
+ * @param matchVersion lucene compatibility version
+ * @param stopwords a stopword set
+ * @param stemExclusionSet a set of terms not to be stemmed
+ */
+ public DanishAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
+ super(matchVersion, stopwords);
+ this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
+ matchVersion, stemExclusionSet));
+ }
+
+ /**
+ * Creates a {@link TokenStreamComponents} which tokenizes all the text in the provided
+ * {@link Reader}.
+ *
+ * @return A {@link TokenStreamComponents} built from a {@link StandardTokenizer}
+ * filtered with {@link StandardFilter}, {@link LowerCaseFilter},
+ * {@link StopFilter}, {@link KeywordMarkerTokenFilter} if a stem
+ * exclusion set is provided, and {@link SnowballFilter}.
+ */
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName,
+ Reader reader) {
+ final Tokenizer source = new StandardTokenizer(matchVersion, reader);
+ TokenStream result = new StandardFilter(source);
+ result = new LowerCaseFilter(matchVersion, result);
+ result = new StopFilter(matchVersion, result, stopwords);
+ if(!stemExclusionSet.isEmpty())
+ result = new KeywordMarkerTokenFilter(result, stemExclusionSet);
+ result = new SnowballFilter(result, new DanishStemmer());
+ return new TokenStreamComponents(source, result);
+ }
+}
Property changes on: contrib\analyzers\common\src\java\org\apache\lucene\analysis\da\DanishAnalyzer.java
___________________________________________________________________
Added: svn:eol-style
+ native
Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/da/package.html
===================================================================
--- contrib/analyzers/common/src/java/org/apache/lucene/analysis/da/package.html (revision 0)
+++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/da/package.html (revision 0)
@@ -0,0 +1,22 @@
+<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<html><head></head>
+<body>
+Analyzer for Danish.
+</body>
+</html>
Property changes on: contrib\analyzers\common\src\java\org\apache\lucene\analysis\da\package.html
___________________________________________________________________
Added: svn:eol-style
+ native
Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java
===================================================================
--- contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java (revision 906571)
+++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java (working copy)
@@ -36,10 +36,12 @@
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WordlistLoader;
+import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.util.Version;
+import org.tartarus.snowball.ext.German2Stemmer;
/**
* {@link Analyzer} for German language.
@@ -60,7 +62,7 @@
* List of typical german stopwords.
* @deprecated use {@link #getDefaultStopSet()} instead
*/
- //TODO make this private in 3.1
+ //TODO make this private in 3.1, remove in 4.0
@Deprecated
public final static String[] GERMAN_STOP_WORDS = {
"einer", "eine", "eines", "einem", "einen",
@@ -77,6 +79,9 @@
"durch", "wegen", "wird"
};
+ /** File containing default German stopwords. */
+ public final static String DEFAULT_STOPWORD_FILE = "german_stop.txt";
+
/**
* Returns a set of default German-stopwords
* @return a set of default German-stopwords
@@ -86,8 +91,21 @@
}
private static class DefaultSetHolder {
- private static final Set<?> DEFAULT_SET = CharArraySet.unmodifiableSet(new CharArraySet(
+ /** @deprecated remove in Lucene 4.0 */
+ @Deprecated
+ private static final Set<?> DEFAULT_SET_30 = CharArraySet.unmodifiableSet(new CharArraySet(
Version.LUCENE_CURRENT, Arrays.asList(GERMAN_STOP_WORDS), false));
+ private static final Set<?> DEFAULT_SET;
+ static {
+ try {
+ DEFAULT_SET =
+ WordlistLoader.getSnowballWordSet(SnowballFilter.class, DEFAULT_STOPWORD_FILE);
+ } catch (IOException ex) {
+ // default set should always be present as it is part of the
+ // distribution (JAR)
+ throw new RuntimeException("Unable to load default stopword set");
+ }
+ }
}
/**
@@ -105,7 +123,9 @@
* {@link #getDefaultStopSet()}.
*/
public GermanAnalyzer(Version matchVersion) {
- this(matchVersion, DefaultSetHolder.DEFAULT_SET);
+ this(matchVersion,
+ matchVersion.onOrAfter(Version.LUCENE_31) ? DefaultSetHolder.DEFAULT_SET
+ : DefaultSetHolder.DEFAULT_SET_30);
}
/**
@@ -199,8 +219,9 @@
*
* @return {@link TokenStreamComponents} built from a
* {@link StandardTokenizer} filtered with {@link StandardFilter},
- * {@link LowerCaseFilter}, {@link StopFilter}, and
- * {@link GermanStemFilter}
+ * {@link LowerCaseFilter}, {@link StopFilter},
+ * {@link KeywordMarkerTokenFilter} if a stem exclusion set is provided, and
+ * {@link SnowballFilter}
*/
@Override
protected TokenStreamComponents createComponents(String fieldName,
@@ -210,6 +231,10 @@
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter( matchVersion, result, stopwords);
result = new KeywordMarkerTokenFilter(result, exclusionSet);
- return new TokenStreamComponents(source, new GermanStemFilter(result));
+ if (matchVersion.onOrAfter(Version.LUCENE_31))
+ result = new SnowballFilter(result, new German2Stemmer());
+ else
+ result = new GermanStemFilter(result);
+ return new TokenStreamComponents(source, result);
}
}
Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/sv/SwedishAnalyzer.java
===================================================================
--- contrib/analyzers/common/src/java/org/apache/lucene/analysis/sv/SwedishAnalyzer.java (revision 0)
+++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/sv/SwedishAnalyzer.java (revision 0)
@@ -0,0 +1,129 @@
+package org.apache.lucene.analysis.sv;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+import java.util.Set;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
+import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.WordlistLoader;
+import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
+import org.apache.lucene.analysis.snowball.SnowballFilter;
+import org.apache.lucene.analysis.standard.StandardFilter;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.util.Version;
+import org.tartarus.snowball.ext.SwedishStemmer;
+
+/**
+ * {@link Analyzer} for Swedish.
+ */
+public final class SwedishAnalyzer extends StopwordAnalyzerBase {
+ private final Set<?> stemExclusionSet;
+
+ /** File containing default Swedish stopwords. */
+ public final static String DEFAULT_STOPWORD_FILE = "swedish_stop.txt";
+
+ /**
+ * Returns an unmodifiable instance of the default stop words set.
+ * @return default stop words set.
+ */
+ public static Set<?> getDefaultStopSet(){
+ return DefaultSetHolder.DEFAULT_STOP_SET;
+ }
+
+ /**
+ * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
+ * accesses the static final set the first time.
+ */
+ private static class DefaultSetHolder {
+ static final Set<?> DEFAULT_STOP_SET;
+
+ static {
+ try {
+ DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class,
+ DEFAULT_STOPWORD_FILE);
+ } catch (IOException ex) {
+ // default set should always be present as it is part of the
+ // distribution (JAR)
+ throw new RuntimeException("Unable to load default stopword set");
+ }
+ }
+ }
+
+ /**
+ * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
+ */
+ public SwedishAnalyzer(Version matchVersion) {
+ this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
+ }
+
+ /**
+ * Builds an analyzer with the given stop words.
+ *
+ * @param matchVersion lucene compatibility version
+ * @param stopwords a stopword set
+ */
+ public SwedishAnalyzer(Version matchVersion, Set<?> stopwords) {
+ this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
+ }
+
+ /**
+ * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
+ * provided this analyzer will add a {@link KeywordMarkerTokenFilter} before
+ * stemming.
+ *
+ * @param matchVersion lucene compatibility version
+ * @param stopwords a stopword set
+ * @param stemExclusionSet a set of terms not to be stemmed
+ */
+ public SwedishAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
+ super(matchVersion, stopwords);
+ this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
+ matchVersion, stemExclusionSet));
+ }
+
+ /**
+ * Creates a {@link TokenStreamComponents} which tokenizes all the text in the provided
+ * {@link Reader}.
+ *
+ * @return A {@link TokenStreamComponents} built from a {@link StandardTokenizer}
+ * filtered with {@link StandardFilter}, {@link LowerCaseFilter},
+ * {@link StopFilter}, {@link KeywordMarkerTokenFilter} if a stem
+ * exclusion set is provided and {@link SnowballFilter}.
+ */
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName,
+ Reader reader) {
+ final Tokenizer source = new StandardTokenizer(matchVersion, reader);
+ TokenStream result = new StandardFilter(source);
+ result = new LowerCaseFilter(matchVersion, result);
+ result = new StopFilter(matchVersion, result, stopwords);
+ if(!stemExclusionSet.isEmpty())
+ result = new KeywordMarkerTokenFilter(result, stemExclusionSet);
+ result = new SnowballFilter(result, new SwedishStemmer());
+ return new TokenStreamComponents(source, result);
+ }
+}
Property changes on: contrib\analyzers\common\src\java\org\apache\lucene\analysis\sv\SwedishAnalyzer.java
___________________________________________________________________
Added: svn:eol-style
+ native
Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/sv/package.html
===================================================================
--- contrib/analyzers/common/src/java/org/apache/lucene/analysis/sv/package.html (revision 0)
+++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/sv/package.html (revision 0)
@@ -0,0 +1,22 @@
+<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<html><head></head>
+<body>
+Analyzer for Swedish.
+</body>
+</html>
Property changes on: contrib\analyzers\common\src\java\org\apache\lucene\analysis\sv\package.html
___________________________________________________________________
Added: svn:eol-style
+ native
Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/fi/FinnishAnalyzer.java
===================================================================
--- contrib/analyzers/common/src/java/org/apache/lucene/analysis/fi/FinnishAnalyzer.java (revision 0)
+++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/fi/FinnishAnalyzer.java (revision 0)
@@ -0,0 +1,129 @@
+package org.apache.lucene.analysis.fi;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+import java.util.Set;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
+import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.WordlistLoader;
+import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
+import org.apache.lucene.analysis.snowball.SnowballFilter;
+import org.apache.lucene.analysis.standard.StandardFilter;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.util.Version;
+import org.tartarus.snowball.ext.FinnishStemmer;
+
+/**
+ * {@link Analyzer} for Finnish.
+ */
+public final class FinnishAnalyzer extends StopwordAnalyzerBase {
+ private final Set<?> stemExclusionSet;
+
+ /** File containing default Finnish stopwords. */
+ public final static String DEFAULT_STOPWORD_FILE = "finnish_stop.txt";
+
+ /**
+ * Returns an unmodifiable instance of the default stop words set.
+ * @return default stop words set.
+ */
+ public static Set<?> getDefaultStopSet(){
+ return DefaultSetHolder.DEFAULT_STOP_SET;
+ }
+
+ /**
+ * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
+ * accesses the static final set the first time.
+ */
+ private static class DefaultSetHolder {
+ static final Set<?> DEFAULT_STOP_SET;
+
+ static {
+ try {
+ DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class,
+ DEFAULT_STOPWORD_FILE);
+ } catch (IOException ex) {
+ // default set should always be present as it is part of the
+ // distribution (JAR)
+ throw new RuntimeException("Unable to load default stopword set");
+ }
+ }
+ }
+
+ /**
+ * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
+ */
+ public FinnishAnalyzer(Version matchVersion) {
+ this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
+ }
+
+ /**
+ * Builds an analyzer with the given stop words.
+ *
+ * @param matchVersion lucene compatibility version
+ * @param stopwords a stopword set
+ */
+ public FinnishAnalyzer(Version matchVersion, Set<?> stopwords) {
+ this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
+ }
+
+ /**
+ * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
+ * provided this analyzer will add a {@link KeywordMarkerTokenFilter} before
+ * stemming.
+ *
+ * @param matchVersion lucene compatibility version
+ * @param stopwords a stopword set
+ * @param stemExclusionSet a set of terms not to be stemmed
+ */
+ public FinnishAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
+ super(matchVersion, stopwords);
+ this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
+ matchVersion, stemExclusionSet));
+ }
+
+ /**
+ * Creates a {@link TokenStreamComponents} which tokenizes all the text in the provided
+ * {@link Reader}.
+ *
+ * @return A {@link TokenStreamComponents} built from a {@link StandardTokenizer}
+ * filtered with {@link StandardFilter}, {@link LowerCaseFilter},
+ * {@link StopFilter}, {@link KeywordMarkerTokenFilter} if a stem
+ * exclusion set is provided and {@link SnowballFilter}.
+ */
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName,
+ Reader reader) {
+ final Tokenizer source = new StandardTokenizer(matchVersion, reader);
+ TokenStream result = new StandardFilter(source);
+ result = new LowerCaseFilter(matchVersion, result);
+ result = new StopFilter(matchVersion, result, stopwords);
+ if(!stemExclusionSet.isEmpty())
+ result = new KeywordMarkerTokenFilter(result, stemExclusionSet);
+ result = new SnowballFilter(result, new FinnishStemmer());
+ return new TokenStreamComponents(source, result);
+ }
+}
Property changes on: contrib\analyzers\common\src\java\org\apache\lucene\analysis\fi\FinnishAnalyzer.java
___________________________________________________________________
Added: svn:eol-style
+ native
Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/fi/package.html
===================================================================
--- contrib/analyzers/common/src/java/org/apache/lucene/analysis/fi/package.html (revision 0)
+++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/fi/package.html (revision 0)
@@ -0,0 +1,22 @@
+<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<html><head></head>
+<body>
+Analyzer for Finnish.
+</body>
+</html>
Property changes on: contrib\analyzers\common\src\java\org\apache\lucene\analysis\fi\package.html
___________________________________________________________________
Added: svn:eol-style
+ native
Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/StemmerOverrideFilter.java
===================================================================
--- contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/StemmerOverrideFilter.java (revision 0)
+++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/StemmerOverrideFilter.java (revision 0)
@@ -0,0 +1,70 @@
+package org.apache.lucene.analysis.miscellaneous;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.Map;
+
+import org.apache.lucene.analysis.CharArrayMap;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.util.Version;
+
+/**
+ * Provides the ability to override any {@link KeywordAttribute} aware stemmer
+ * with custom dictionary-based stemming.
+ */
+public final class StemmerOverrideFilter extends TokenFilter {
+ private final CharArrayMap<String> dictionary;
+
+ private final TermAttribute termAtt = addAttribute(TermAttribute.class);
+ private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class);
+
+ /**
+ * Create a new StemmerOverrideFilter, performing dictionary-based stemming
+ * with the provided <code>dictionary</code>.
+ * <p>
+ * Any dictionary-stemmed terms will be marked with {@link KeywordAttribute}
+ * so that they will not be stemmed with stemmers down the chain.
+ * </p>
+ */
+ public StemmerOverrideFilter(Version matchVersion, TokenStream input,
+ Map<?,String> dictionary) {
+ super(input);
+ this.dictionary = dictionary instanceof CharArrayMap ?
+ (CharArrayMap<String>) dictionary : CharArrayMap.copy(matchVersion, dictionary);
+ }
+
+ @Override
+ public boolean incrementToken() throws IOException {
+ if (input.incrementToken()) {
+ if (!keywordAtt.isKeyword()) { // don't muck with already-keyworded terms
+ String stem = dictionary.get(termAtt.termBuffer(), 0, termAtt.termLength());
+ if (stem != null) {
+ termAtt.setTermBuffer(stem);
+ keywordAtt.setKeyword(true);
+ }
+ }
+ return true;
+ } else {
+ return false;
+ }
+ }
+}
Property changes on: contrib\analyzers\common\src\java\org\apache\lucene\analysis\miscellaneous\StemmerOverrideFilter.java
___________________________________________________________________
Added: svn:eol-style
+ native
Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/EmptyTokenStream.java
===================================================================
--- contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/EmptyTokenStream.java (revision 906571)
+++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/EmptyTokenStream.java (working copy)
@@ -18,7 +18,6 @@
*/
import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.Token;
import java.io.IOException;
Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java
===================================================================
--- contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java (revision 906571)
+++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java (working copy)
@@ -17,7 +17,6 @@
* limitations under the License.
*/
-import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java
===================================================================
--- contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java (revision 906571)
+++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java (working copy)
@@ -19,7 +19,6 @@
import java.io.IOException;
-import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchStemmer.java
===================================================================
--- contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchStemmer.java (revision 906571)
+++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchStemmer.java (working copy)
@@ -25,8 +25,10 @@
* refer to http://snowball.sourceforge.net/french/stemmer.html<br>
* (French stemming algorithm) for details
* </p>
+ * @deprecated Use {@link org.tartarus.snowball.ext.FrenchStemmer} instead,
+ * which has the same functionality. This filter will be removed in Lucene 4.0
*/
-
+@Deprecated
public class FrenchStemmer {
/**
Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchStemFilter.java
===================================================================
--- contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchStemFilter.java (revision 906571)
+++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchStemFilter.java (working copy)
@@ -20,6 +20,7 @@
import org.apache.lucene.analysis.KeywordMarkerTokenFilter;// for javadoc
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
@@ -40,7 +41,11 @@
* the {@link KeywordAttribute} before this {@link TokenStream}.
* </p>
* @see KeywordMarkerTokenFilter
+ * @deprecated Use {@link SnowballFilter} with
+ * {@link org.tartarus.snowball.ext.FrenchStemmer} instead, which has the
+ * same functionality. This filter will be removed in Lucene 4.0
*/
+@Deprecated
public final class FrenchStemFilter extends TokenFilter {
/**
Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/ElisionFilter.java
===================================================================
--- contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/ElisionFilter.java (revision 906571)
+++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/ElisionFilter.java (working copy)
@@ -68,7 +68,7 @@
/**
* Constructs an elision filter with standard stop words
*/
- protected ElisionFilter(Version matchVersion, TokenStream input) {
+ public ElisionFilter(Version matchVersion, TokenStream input) {
this(matchVersion, input, DEFAULT_ARTICLES);
}
@@ -77,7 +77,7 @@
* @deprecated use {@link #ElisionFilter(Version, TokenStream)} instead
*/
@Deprecated
- protected ElisionFilter(TokenStream input) {
+ public ElisionFilter(TokenStream input) {
this(Version.LUCENE_30, input);
}
Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java
===================================================================
--- contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java (revision 906571)
+++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java (working copy)
@@ -27,6 +27,7 @@
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WordlistLoader;
+import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.standard.StandardAnalyzer; // for javadoc
@@ -68,7 +69,7 @@
* Extended list of typical French stopwords.
* @deprecated use {@link #getDefaultStopSet()} instead
*/
- // TODO make this private in 3.1
+ // TODO make this private in 3.1, remove in 4.0
@Deprecated
public final static String[] FRENCH_STOP_WORDS = {
"a", "afin", "ai", "ainsi", "après", "attendu", "au", "aujourd", "auquel", "aussi",
@@ -95,6 +96,9 @@
"été", "être", "ô"
};
+ /** File containing default French stopwords. */
+ public final static String DEFAULT_STOPWORD_FILE = "french_stop.txt";
+
/**
* Contains words that should be indexed but not stemmed.
*/
@@ -110,16 +114,31 @@
}
private static class DefaultSetHolder {
- static final Set<?> DEFAULT_STOP_SET = CharArraySet
+ /** @deprecated remove this in Lucene 4.0 */
+ @Deprecated
+ static final Set<?> DEFAULT_STOP_SET_30 = CharArraySet
.unmodifiableSet(new CharArraySet(Version.LUCENE_CURRENT, Arrays.asList(FRENCH_STOP_WORDS),
false));
+ static final Set<?> DEFAULT_STOP_SET;
+ static {
+ try {
+ DEFAULT_STOP_SET =
+ WordlistLoader.getSnowballWordSet(SnowballFilter.class, DEFAULT_STOPWORD_FILE);
+ } catch (IOException ex) {
+ // default set should always be present as it is part of the
+ // distribution (JAR)
+ throw new RuntimeException("Unable to load default stopword set");
+ }
+ }
}
/**
- * Builds an analyzer with the default stop words ({@link #FRENCH_STOP_WORDS}).
+ * Builds an analyzer with the default stop words ({@link #getDefaultStopSet}).
*/
public FrenchAnalyzer(Version matchVersion) {
- this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
+ this(matchVersion,
+ matchVersion.onOrAfter(Version.LUCENE_31) ? DefaultSetHolder.DEFAULT_STOP_SET
+ : DefaultSetHolder.DEFAULT_STOP_SET_30);
}
/**
@@ -207,20 +226,34 @@
* {@link Reader}.
*
* @return {@link TokenStreamComponents} built from a {@link StandardTokenizer}
- * filtered with {@link StandardFilter}, {@link StopFilter},
- * {@link FrenchStemFilter} and {@link LowerCaseFilter}
+ * filtered with {@link StandardFilter}, {@link ElisionFilter},
+ * {@link LowerCaseFilter}, {@link StopFilter},
+ * {@link KeywordMarkerTokenFilter} if a stem exclusion set is provided,
+ * and {@link SnowballFilter}
*/
@Override
protected TokenStreamComponents createComponents(String fieldName,
Reader reader) {
- final Tokenizer source = new StandardTokenizer(matchVersion, reader);
- TokenStream result = new StandardFilter(source);
- result = new StopFilter(matchVersion, result, stopwords);
- if(!excltable.isEmpty())
- result = new KeywordMarkerTokenFilter(result, excltable);
- result = new FrenchStemFilter(result);
- // Convert to lowercase after stemming!
- return new TokenStreamComponents(source, new LowerCaseFilter(matchVersion, result));
+ if (matchVersion.onOrAfter(Version.LUCENE_31)) {
+ final Tokenizer source = new StandardTokenizer(matchVersion, reader);
+ TokenStream result = new StandardFilter(source);
+ result = new ElisionFilter(matchVersion, result);
+ result = new LowerCaseFilter(matchVersion, result);
+ result = new StopFilter(matchVersion, result, stopwords);
+ if(!excltable.isEmpty())
+ result = new KeywordMarkerTokenFilter(result, excltable);
+ result = new SnowballFilter(result, new org.tartarus.snowball.ext.FrenchStemmer());
+ return new TokenStreamComponents(source, result);
+ } else {
+ final Tokenizer source = new StandardTokenizer(matchVersion, reader);
+ TokenStream result = new StandardFilter(source);
+ result = new StopFilter(matchVersion, result, stopwords);
+ if(!excltable.isEmpty())
+ result = new KeywordMarkerTokenFilter(result, excltable);
+ result = new FrenchStemFilter(result);
+ // Convert to lowercase after stemming!
+ return new TokenStreamComponents(source, new LowerCaseFilter(matchVersion, result));
+ }
}
}
Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchStemmer.java
===================================================================
--- contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchStemmer.java (revision 906571)
+++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchStemmer.java (working copy)
@@ -26,8 +26,10 @@
* the <a href="http://snowball.tartarus.org/algorithms/dutch/stemmer.html">dutch stemming</a>
* algorithm in Martin Porter's snowball project.
* </p>
+ * @deprecated Use {@link org.tartarus.snowball.ext.DutchStemmer} instead,
+ * which has the same functionality. This filter will be removed in Lucene 4.0
*/
-
+@Deprecated
public class DutchStemmer {
/**
* Buffer for the terms while stemming them.
Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchStemFilter.java
===================================================================
--- contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchStemFilter.java (revision 906571)
+++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchStemFilter.java (working copy)
@@ -26,6 +26,7 @@
import org.apache.lucene.analysis.KeywordMarkerTokenFilter;// for javadoc
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
@@ -42,7 +43,11 @@
* the {@link KeywordAttribute} before this {@link TokenStream}.
* </p>
* @see KeywordMarkerTokenFilter
+ * @deprecated Use {@link SnowballFilter} with
+ * {@link org.tartarus.snowball.ext.DutchStemmer} instead, which has the
+ * same functionality. This filter will be removed in Lucene 4.0
*/
+@Deprecated
public final class DutchStemFilter extends TokenFilter {
/**
* The actual token in the input stream.
Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java
===================================================================
--- contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java (revision 906571)
+++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java (working copy)
@@ -20,11 +20,14 @@
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
+import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.ReusableAnalyzerBase;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WordlistLoader;
+import org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter;
+import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.standard.StandardAnalyzer; // for javadoc
@@ -33,7 +36,6 @@
import java.io.File;
import java.io.IOException;
import java.io.Reader;
-import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
@@ -60,19 +62,11 @@
* @deprecated use {@link #getDefaultStopSet()} instead
*/
@Deprecated
- public final static String[] DUTCH_STOP_WORDS =
- {
- "de", "en", "van", "ik", "te", "dat", "die", "in", "een",
- "hij", "het", "niet", "zijn", "is", "was", "op", "aan", "met", "als", "voor", "had",
- "er", "maar", "om", "hem", "dan", "zou", "of", "wat", "mijn", "men", "dit", "zo",
- "door", "over", "ze", "zich", "bij", "ook", "tot", "je", "mij", "uit", "der", "daar",
- "haar", "naar", "heb", "hoe", "heeft", "hebben", "deze", "u", "want", "nog", "zal",
- "me", "zij", "nu", "ge", "geen", "omdat", "iets", "worden", "toch", "al", "waren",
- "veel", "meer", "doen", "toen", "moet", "ben", "zonder", "kan", "hun", "dus",
- "alles", "onder", "ja", "eens", "hier", "wie", "werd", "altijd", "doch", "wordt",
- "wezen", "kunnen", "ons", "zelf", "tegen", "na", "reeds", "wil", "kon", "niets",
- "uw", "iemand", "geweest", "andere"
- };
+ public final static String[] DUTCH_STOP_WORDS = getDefaultStopSet().toArray(new String[0]);
+
+ /** File containing default Dutch stopwords. */
+ public final static String DEFAULT_STOPWORD_FILE = "dutch_stop.txt";
+
/**
* Returns an unmodifiable instance of the default stop-words set.
* @return an unmodifiable instance of the default stop-words set.
@@ -82,9 +76,18 @@
}
private static class DefaultSetHolder {
- static final Set<?> DEFAULT_STOP_SET = CharArraySet
- .unmodifiableSet(new CharArraySet(Version.LUCENE_CURRENT,
- Arrays.asList(DUTCH_STOP_WORDS), false));
+ static final Set<?> DEFAULT_STOP_SET;
+
+ static {
+ try {
+ DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class,
+ DEFAULT_STOPWORD_FILE);
+ } catch (IOException ex) {
+ // default set should always be present as it is part of the
+ // distribution (JAR)
+ throw new RuntimeException("Unable to load default stopword set");
+ }
+ }
}
@@ -223,18 +226,32 @@
* text in the provided {@link Reader}.
*
* @return A {@link TokenStream} built from a {@link StandardTokenizer}
- * filtered with {@link StandardFilter}, {@link StopFilter},
- * and {@link DutchStemFilter}
+ * filtered with {@link StandardFilter}, {@link LowerCaseFilter},
+ * {@link StopFilter}, {@link KeywordMarkerTokenFilter} if a stem exclusion set is provided,
+ * {@link StemmerOverrideFilter}, and {@link SnowballFilter}
*/
@Override
protected TokenStreamComponents createComponents(String fieldName,
Reader aReader) {
- final Tokenizer source = new StandardTokenizer(matchVersion, aReader);
- TokenStream result = new StandardFilter(source);
- result = new StopFilter(matchVersion, result, stoptable);
- if (!excltable.isEmpty())
- result = new KeywordMarkerTokenFilter(result, excltable);
- result = new DutchStemFilter(result, stemdict);
- return new TokenStreamComponents(source, result);
+ if (matchVersion.onOrAfter(Version.LUCENE_31)) {
+ final Tokenizer source = new StandardTokenizer(matchVersion, aReader);
+ TokenStream result = new StandardFilter(source);
+ result = new LowerCaseFilter(matchVersion, result);
+ result = new StopFilter(matchVersion, result, stoptable);
+ if (!excltable.isEmpty())
+ result = new KeywordMarkerTokenFilter(result, excltable);
+ if (!stemdict.isEmpty())
+ result = new StemmerOverrideFilter(matchVersion, result, stemdict);
+ result = new SnowballFilter(result, new org.tartarus.snowball.ext.DutchStemmer());
+ return new TokenStreamComponents(source, result);
+ } else {
+ final Tokenizer source = new StandardTokenizer(matchVersion, aReader);
+ TokenStream result = new StandardFilter(source);
+ result = new StopFilter(matchVersion, result, stoptable);
+ if (!excltable.isEmpty())
+ result = new KeywordMarkerTokenFilter(result, excltable);
+ result = new DutchStemFilter(result, stemdict);
+ return new TokenStreamComponents(source, result);
+ }
}
}
Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java
===================================================================
--- contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java (revision 906571)
+++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java (working copy)
@@ -19,7 +19,6 @@
import java.io.IOException;
import java.util.Locale;
import java.lang.Character.UnicodeBlock;
-import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/hu/HungarianAnalyzer.java
===================================================================
--- contrib/analyzers/common/src/java/org/apache/lucene/analysis/hu/HungarianAnalyzer.java (revision 0)
+++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/hu/HungarianAnalyzer.java (revision 0)
@@ -0,0 +1,129 @@
+package org.apache.lucene.analysis.hu;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+import java.util.Set;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
+import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.WordlistLoader;
+import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
+import org.apache.lucene.analysis.snowball.SnowballFilter;
+import org.apache.lucene.analysis.standard.StandardFilter;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.util.Version;
+import org.tartarus.snowball.ext.HungarianStemmer;
+
+/**
+ * {@link Analyzer} for Hungarian.
+ */
+public final class HungarianAnalyzer extends StopwordAnalyzerBase {
+ private final Set<?> stemExclusionSet;
+
+ /** File containing default Hungarian stopwords. */
+ public final static String DEFAULT_STOPWORD_FILE = "hungarian_stop.txt";
+
+ /**
+ * Returns an unmodifiable instance of the default stop words set.
+ * @return default stop words set.
+ */
+ public static Set<?> getDefaultStopSet(){
+ return DefaultSetHolder.DEFAULT_STOP_SET;
+ }
+
+ /**
+ * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
+ * accesses the static final set the first time.
+ */
+ private static class DefaultSetHolder {
+ static final Set<?> DEFAULT_STOP_SET;
+
+ static {
+ try {
+ DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class,
+ DEFAULT_STOPWORD_FILE);
+ } catch (IOException ex) {
+ // default set should always be present as it is part of the
+ // distribution (JAR)
+ throw new RuntimeException("Unable to load default stopword set");
+ }
+ }
+ }
+
+ /**
+ * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
+ */
+ public HungarianAnalyzer(Version matchVersion) {
+ this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
+ }
+
+ /**
+ * Builds an analyzer with the given stop words.
+ *
+ * @param matchVersion lucene compatibility version
+ * @param stopwords a stopword set
+ */
+ public HungarianAnalyzer(Version matchVersion, Set<?> stopwords) {
+ this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
+ }
+
+ /**
+ * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
+ * provided this analyzer will add a {@link KeywordMarkerTokenFilter} before
+ * stemming.
+ *
+ * @param matchVersion lucene compatibility version
+ * @param stopwords a stopword set
+ * @param stemExclusionSet a set of terms not to be stemmed
+ */
+ public HungarianAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
+ super(matchVersion, stopwords);
+ this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
+ matchVersion, stemExclusionSet));
+ }
+
+ /**
+ * Creates a {@link TokenStreamComponents} which tokenizes all the text in the provided
+ * {@link Reader}.
+ *
+ * @return A {@link TokenStreamComponents} built from a {@link StandardTokenizer}
+ * filtered with {@link StandardFilter}, {@link LowerCaseFilter},
+ * {@link StopFilter}, {@link KeywordMarkerTokenFilter} if a stem
+ * exclusion set is provided and {@link SnowballFilter}.
+ */
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName,
+ Reader reader) {
+ final Tokenizer source = new StandardTokenizer(matchVersion, reader);
+ TokenStream result = new StandardFilter(source);
+ result = new LowerCaseFilter(matchVersion, result);
+ result = new StopFilter(matchVersion, result, stopwords);
+ if(!stemExclusionSet.isEmpty())
+ result = new KeywordMarkerTokenFilter(result, stemExclusionSet);
+ result = new SnowballFilter(result, new HungarianStemmer());
+ return new TokenStreamComponents(source, result);
+ }
+}
Property changes on: contrib\analyzers\common\src\java\org\apache\lucene\analysis\hu\HungarianAnalyzer.java
___________________________________________________________________
Added: svn:eol-style
+ native
Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/hu/package.html
===================================================================
--- contrib/analyzers/common/src/java/org/apache/lucene/analysis/hu/package.html (revision 0)
+++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/hu/package.html (revision 0)
@@ -0,0 +1,22 @@
+<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<html><head></head>
+<body>
+Analyzer for Hungarian.
+</body>
+</html>
Property changes on: contrib\analyzers\common\src\java\org\apache\lucene\analysis\hu\package.html
___________________________________________________________________
Added: svn:eol-style
+ native
Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/no/NorwegianAnalyzer.java
===================================================================
--- contrib/analyzers/common/src/java/org/apache/lucene/analysis/no/NorwegianAnalyzer.java (revision 0)
+++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/no/NorwegianAnalyzer.java (revision 0)
@@ -0,0 +1,130 @@
+package org.apache.lucene.analysis.no;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+import java.util.Set;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
+import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.WordlistLoader;
+import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
+import org.apache.lucene.analysis.snowball.SnowballFilter;
+import org.apache.lucene.analysis.standard.StandardFilter;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.util.Version;
+import org.tartarus.snowball.ext.NorwegianStemmer;
+
+/**
+ * {@link Analyzer} for Norwegian.
+ */
+public final class NorwegianAnalyzer extends StopwordAnalyzerBase {
+ private final Set<?> stemExclusionSet;
+
+ /** File containing default Norwegian stopwords. */
+ public final static String DEFAULT_STOPWORD_FILE = "norwegian_stop.txt";
+
+ /**
+ * Returns an unmodifiable instance of the default stop words set.
+ * @return default stop words set.
+ */
+ public static Set<?> getDefaultStopSet(){
+ return DefaultSetHolder.DEFAULT_STOP_SET;
+ }
+
+ /**
+ * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
+ * accesses the static final set the first time.
+ */
+ private static class DefaultSetHolder {
+ static final Set<?> DEFAULT_STOP_SET;
+
+ static {
+ try {
+ DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class,
+ DEFAULT_STOPWORD_FILE);
+ } catch (IOException ex) {
+ // default set should always be present as it is part of the
+ // distribution (JAR)
+ throw new RuntimeException("Unable to load default stopword set");
+ }
+ }
+ }
+
+ /**
+ * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
+ */
+ public NorwegianAnalyzer(Version matchVersion) {
+ this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
+ }
+
+ /**
+ * Builds an analyzer with the given stop words.
+ *
+ * @param matchVersion lucene compatibility version
+ * @param stopwords a stopword set
+ */
+ public NorwegianAnalyzer(Version matchVersion, Set<?> stopwords) {
+ this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
+ }
+
+ /**
+ * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
+ * provided this analyzer will add a {@link KeywordMarkerTokenFilter} before
+ * stemming.
+ *
+ * @param matchVersion lucene compatibility version
+ * @param stopwords a stopword set
+ * @param stemExclusionSet a set of terms not to be stemmed
+ */
+ public NorwegianAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
+ super(matchVersion, stopwords);
+ this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
+ matchVersion, stemExclusionSet));
+ }
+
+ /**
+ * Creates a {@link TokenStreamComponents} which tokenizes all the text in the provided
+ * {@link Reader}.
+ *
+ * @return A {@link TokenStreamComponents} built from a {@link StandardTokenizer}
+ * filtered with {@link StandardFilter}, {@link LowerCaseFilter},
+ * {@link StopFilter}, {@link KeywordMarkerTokenFilter} if a stem
+ * exclusion set is provided and {@link SnowballFilter}.
+ */
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName,
+ Reader reader) {
+ final Tokenizer source = new StandardTokenizer(matchVersion, reader);
+ TokenStream result = new StandardFilter(source);
+ result = new LowerCaseFilter(matchVersion, result);
+ result = new StopFilter(matchVersion, result, stopwords);
+ if(!stemExclusionSet.isEmpty())
+ result = new KeywordMarkerTokenFilter(result, stemExclusionSet);
+ result = new SnowballFilter(result, new NorwegianStemmer());
+ return new TokenStreamComponents(source, result);
+ }
+}
+
Property changes on: contrib\analyzers\common\src\java\org\apache\lucene\analysis\no\NorwegianAnalyzer.java
___________________________________________________________________
Added: svn:eol-style
+ native
Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/no/package.html
===================================================================
--- contrib/analyzers/common/src/java/org/apache/lucene/analysis/no/package.html (revision 0)
+++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/no/package.html (revision 0)
@@ -0,0 +1,22 @@
+<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<html><head></head>
+<body>
+Analyzer for Norwegian.
+</body>
+</html>
Property changes on: contrib\analyzers\common\src\java\org\apache\lucene\analysis\no\package.html
___________________________________________________________________
Added: svn:eol-style
+ native
Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/NumericPayloadTokenFilter.java
===================================================================
--- contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/NumericPayloadTokenFilter.java (revision 906571)
+++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/NumericPayloadTokenFilter.java (working copy)
@@ -17,7 +17,6 @@
*/
-import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/TypeAsPayloadTokenFilter.java
===================================================================
--- contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/TypeAsPayloadTokenFilter.java (revision 906571)
+++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/TypeAsPayloadTokenFilter.java (working copy)
@@ -19,7 +19,6 @@
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.index.Payload;
Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/TokenOffsetPayloadTokenFilter.java
===================================================================
--- contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/TokenOffsetPayloadTokenFilter.java (revision 906571)
+++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/TokenOffsetPayloadTokenFilter.java (working copy)
@@ -19,7 +19,6 @@
import java.io.IOException;
-import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/ro/RomanianAnalyzer.java
===================================================================
--- contrib/analyzers/common/src/java/org/apache/lucene/analysis/ro/RomanianAnalyzer.java (revision 0)
+++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/ro/RomanianAnalyzer.java (revision 0)
@@ -0,0 +1,133 @@
+package org.apache.lucene.analysis.ro;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+import java.util.Set;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
+import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
+import org.apache.lucene.analysis.snowball.SnowballFilter;
+import org.apache.lucene.analysis.standard.StandardFilter;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.util.Version;
+import org.tartarus.snowball.ext.RomanianStemmer;
+
+/**
+ * {@link Analyzer} for Romanian.
+ */
+public final class RomanianAnalyzer extends StopwordAnalyzerBase {
+ private final Set<?> stemExclusionSet;
+
+ /** File containing default Romanian stopwords. */
+ public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
+ /**
+ * The comment character in the stopwords file.
+ * All lines prefixed with this will be ignored.
+ */
+ private static final String STOPWORDS_COMMENT = "#";
+
+ /**
+ * Returns an unmodifiable instance of the default stop words set.
+ * @return default stop words set.
+ */
+ public static Set<?> getDefaultStopSet(){
+ return DefaultSetHolder.DEFAULT_STOP_SET;
+ }
+
+ /**
+ * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
+ * accesses the static final set the first time.
+ */
+ private static class DefaultSetHolder {
+ static final Set<?> DEFAULT_STOP_SET;
+
+ static {
+ try {
+ DEFAULT_STOP_SET = loadStopwordSet(false, RomanianAnalyzer.class,
+ DEFAULT_STOPWORD_FILE, STOPWORDS_COMMENT);
+ } catch (IOException ex) {
+ // default set should always be present as it is part of the
+ // distribution (JAR)
+ throw new RuntimeException("Unable to load default stopword set");
+ }
+ }
+ }
+
+ /**
+ * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
+ */
+ public RomanianAnalyzer(Version matchVersion) {
+ this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
+ }
+
+ /**
+ * Builds an analyzer with the given stop words.
+ *
+ * @param matchVersion lucene compatibility version
+ * @param stopwords a stopword set
+ */
+ public RomanianAnalyzer(Version matchVersion, Set<?> stopwords) {
+ this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
+ }
+
+ /**
+ * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
+ * provided this analyzer will add a {@link KeywordMarkerTokenFilter} before
+ * stemming.
+ *
+ * @param matchVersion lucene compatibility version
+ * @param stopwords a stopword set
+ * @param stemExclusionSet a set of terms not to be stemmed
+ */
+ public RomanianAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
+ super(matchVersion, stopwords);
+ this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
+ matchVersion, stemExclusionSet));
+ }
+
+ /**
+ * Creates a {@link TokenStreamComponents} which tokenizes all the text in the provided
+ * {@link Reader}.
+ *
+ * @return A {@link TokenStreamComponents} built from a {@link StandardTokenizer}
+ * filtered with {@link StandardFilter}, {@link LowerCaseFilter},
+ * {@link StopFilter}, {@link KeywordMarkerTokenFilter} if a stem
+ * exclusion set is provided and {@link SnowballFilter}.
+ */
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName,
+ Reader reader) {
+ final Tokenizer source = new StandardTokenizer(matchVersion, reader);
+ TokenStream result = new StandardFilter(source);
+ result = new LowerCaseFilter(matchVersion, result);
+ result = new StopFilter(matchVersion, result, stopwords);
+ if(!stemExclusionSet.isEmpty())
+ result = new KeywordMarkerTokenFilter(result, stemExclusionSet);
+ result = new SnowballFilter(result, new RomanianStemmer());
+ return new TokenStreamComponents(source, result);
+ }
+}
Property changes on: contrib\analyzers\common\src\java\org\apache\lucene\analysis\ro\RomanianAnalyzer.java
___________________________________________________________________
Added: svn:eol-style
+ native
Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/ro/package.html
===================================================================
--- contrib/analyzers/common/src/java/org/apache/lucene/analysis/ro/package.html (revision 0)
+++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/ro/package.html (revision 0)
@@ -0,0 +1,22 @@
+<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<html><head></head>
+<body>
+Analyzer for Romanian.
+</body>
+</html>
Property changes on: contrib\analyzers\common\src\java\org\apache\lucene\analysis\ro\package.html
___________________________________________________________________
Added: svn:eol-style
+ native
Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/snowball/SnowballAnalyzer.java
===================================================================
--- contrib/analyzers/common/src/java/org/apache/lucene/analysis/snowball/SnowballAnalyzer.java (revision 906571)
+++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/snowball/SnowballAnalyzer.java (working copy)
@@ -39,7 +39,10 @@
* <li> As of 3.1, uses {@link TurkishLowerCaseFilter} for Turkish language.
* </ul>
* </p>
+ * @deprecated Use the language-specific analyzer in contrib/analyzers instead.
+ * This analyzer will be removed in Lucene 4.0
*/
+@Deprecated
public final class SnowballAnalyzer extends Analyzer {
private String name;
private Set<?> stopSet;
Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/snowball/SnowballFilter.java
===================================================================
--- contrib/analyzers/common/src/java/org/apache/lucene/analysis/snowball/SnowballFilter.java (revision 906571)
+++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/snowball/SnowballFilter.java (working copy)
@@ -21,6 +21,7 @@
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.tr.TurkishLowerCaseFilter; // javadoc @link
import org.apache.lucene.analysis.LowerCaseFilter; // javadoc @link
@@ -39,14 +40,14 @@
*/
public final class SnowballFilter extends TokenFilter {
- private SnowballProgram stemmer;
+ private final SnowballProgram stemmer;
- private TermAttribute termAtt;
+ private final TermAttribute termAtt = addAttribute(TermAttribute.class);
+ private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
public SnowballFilter(TokenStream input, SnowballProgram stemmer) {
super(input);
this.stemmer = stemmer;
- termAtt = addAttribute(TermAttribute.class);
}
/**
@@ -67,23 +68,24 @@
} catch (Exception e) {
throw new RuntimeException(e.toString());
}
- termAtt = addAttribute(TermAttribute.class);
}
/** Returns the next input Token, after being stemmed */
@Override
public final boolean incrementToken() throws IOException {
if (input.incrementToken()) {
- char termBuffer[] = termAtt.termBuffer();
- final int length = termAtt.termLength();
- stemmer.setCurrent(termBuffer, length);
- stemmer.stem();
- final char finalTerm[] = stemmer.getCurrentBuffer();
- final int newLength = stemmer.getCurrentBufferLength();
- if (finalTerm != termBuffer)
- termAtt.setTermBuffer(finalTerm, 0, newLength);
- else
- termAtt.setTermLength(newLength);
+ if (!keywordAttr.isKeyword()) {
+ char termBuffer[] = termAtt.termBuffer();
+ final int length = termAtt.termLength();
+ stemmer.setCurrent(termBuffer, length);
+ stemmer.stem();
+ final char finalTerm[] = stemmer.getCurrentBuffer();
+ final int newLength = stemmer.getCurrentBufferLength();
+ if (finalTerm != termBuffer)
+ termAtt.setTermBuffer(finalTerm, 0, newLength);
+ else
+ termAtt.setTermLength(newLength);
+ }
return true;
} else {
return false;
Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/pt/PortugueseAnalyzer.java
===================================================================
--- contrib/analyzers/common/src/java/org/apache/lucene/analysis/pt/PortugueseAnalyzer.java (revision 0)
+++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/pt/PortugueseAnalyzer.java (revision 0)
@@ -0,0 +1,129 @@
+package org.apache.lucene.analysis.pt;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+import java.util.Set;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
+import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.WordlistLoader;
+import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
+import org.apache.lucene.analysis.snowball.SnowballFilter;
+import org.apache.lucene.analysis.standard.StandardFilter;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.util.Version;
+import org.tartarus.snowball.ext.PortugueseStemmer;
+
+/**
+ * {@link Analyzer} for Portuguese.
+ */
+public final class PortugueseAnalyzer extends StopwordAnalyzerBase {
+ private final Set<?> stemExclusionSet;
+
+ /** File containing default Portuguese stopwords. */
+ public final static String DEFAULT_STOPWORD_FILE = "portuguese_stop.txt";
+
+ /**
+ * Returns an unmodifiable instance of the default stop words set.
+ * @return default stop words set.
+ */
+ public static Set<?> getDefaultStopSet(){
+ return DefaultSetHolder.DEFAULT_STOP_SET;
+ }
+
+ /**
+ * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
+ * accesses the static final set the first time.
+ */
+ private static class DefaultSetHolder {
+ static final Set<?> DEFAULT_STOP_SET;
+
+ static {
+ try {
+ DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class,
+ DEFAULT_STOPWORD_FILE);
+ } catch (IOException ex) {
+ // default set should always be present as it is part of the
+ // distribution (JAR)
+ throw new RuntimeException("Unable to load default stopword set");
+ }
+ }
+ }
+
+ /**
+ * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
+ */
+ public PortugueseAnalyzer(Version matchVersion) {
+ this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
+ }
+
+ /**
+ * Builds an analyzer with the given stop words.
+ *
+ * @param matchVersion lucene compatibility version
+ * @param stopwords a stopword set
+ */
+ public PortugueseAnalyzer(Version matchVersion, Set<?> stopwords) {
+ this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
+ }
+
+ /**
+ * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
+ * provided this analyzer will add a {@link KeywordMarkerTokenFilter} before
+ * stemming.
+ *
+ * @param matchVersion lucene compatibility version
+ * @param stopwords a stopword set
+ * @param stemExclusionSet a set of terms not to be stemmed
+ */
+ public PortugueseAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
+ super(matchVersion, stopwords);
+ this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
+ matchVersion, stemExclusionSet));
+ }
+
+ /**
+ * Creates a {@link TokenStreamComponents} which tokenizes all the text in the provided
+ * {@link Reader}.
+ *
+ * @return A {@link TokenStreamComponents} built from a {@link StandardTokenizer}
+ * filtered with {@link StandardFilter}, {@link LowerCaseFilter},
+ * {@link StopFilter}, {@link KeywordMarkerTokenFilter} if a stem
+ * exclusion set is provided and {@link SnowballFilter}.
+ */
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName,
+ Reader reader) {
+ final Tokenizer source = new StandardTokenizer(matchVersion, reader);
+ TokenStream result = new StandardFilter(source);
+ result = new LowerCaseFilter(matchVersion, result);
+ result = new StopFilter(matchVersion, result, stopwords);
+ if(!stemExclusionSet.isEmpty())
+ result = new KeywordMarkerTokenFilter(result, stemExclusionSet);
+ result = new SnowballFilter(result, new PortugueseStemmer());
+ return new TokenStreamComponents(source, result);
+ }
+}
Property changes on: contrib\analyzers\common\src\java\org\apache\lucene\analysis\pt\PortugueseAnalyzer.java
___________________________________________________________________
Added: svn:eol-style
+ native
Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/pt/package.html
===================================================================
--- contrib/analyzers/common/src/java/org/apache/lucene/analysis/pt/package.html (revision 0)
+++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/pt/package.html (revision 0)
@@ -0,0 +1,22 @@
+<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<html><head></head>
+<body>
+Analyzer for Portuguese.
+</body>
+</html>
Property changes on: contrib\analyzers\common\src\java\org\apache\lucene\analysis\pt\package.html
___________________________________________________________________
Added: svn:eol-style
+ native
Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/tr/TurkishAnalyzer.java
===================================================================
--- contrib/analyzers/common/src/java/org/apache/lucene/analysis/tr/TurkishAnalyzer.java (revision 0)
+++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/tr/TurkishAnalyzer.java (revision 0)
@@ -0,0 +1,132 @@
+package org.apache.lucene.analysis.tr;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+import java.util.Set;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
+import org.apache.lucene.analysis.snowball.SnowballFilter;
+import org.apache.lucene.analysis.standard.StandardFilter;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.util.Version;
+import org.tartarus.snowball.ext.TurkishStemmer;
+
+/**
+ * {@link Analyzer} for Turkish.
+ */
+public final class TurkishAnalyzer extends StopwordAnalyzerBase {
+ private final Set<?> stemExclusionSet;
+
+ /** File containing default Turkish stopwords. */
+ public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
+ /**
+ * The comment character in the stopwords file.
+ * All lines prefixed with this will be ignored.
+ */
+ private static final String STOPWORDS_COMMENT = "#";
+
+ /**
+ * Returns an unmodifiable instance of the default stop words set.
+ * @return default stop words set.
+ */
+ public static Set<?> getDefaultStopSet(){
+ return DefaultSetHolder.DEFAULT_STOP_SET;
+ }
+
+ /**
+ * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
+ * accesses the static final set the first time.
+ */
+ private static class DefaultSetHolder {
+ static final Set<?> DEFAULT_STOP_SET;
+
+ static {
+ try {
+ DEFAULT_STOP_SET = loadStopwordSet(false, TurkishAnalyzer.class,
+ DEFAULT_STOPWORD_FILE, STOPWORDS_COMMENT);
+ } catch (IOException ex) {
+ // default set should always be present as it is part of the
+ // distribution (JAR)
+ throw new RuntimeException("Unable to load default stopword set", ex);
+ }
+ }
+ }
+
+ /**
+ * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
+ */
+ public TurkishAnalyzer(Version matchVersion) {
+ this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
+ }
+
+ /**
+ * Builds an analyzer with the given stop words.
+ *
+ * @param matchVersion lucene compatibility version
+ * @param stopwords a stopword set
+ */
+ public TurkishAnalyzer(Version matchVersion, Set<?> stopwords) {
+ this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
+ }
+
+ /**
+ * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
+ * provided this analyzer will add a {@link KeywordMarkerTokenFilter} before
+ * stemming.
+ *
+ * @param matchVersion lucene compatibility version
+ * @param stopwords a stopword set
+ * @param stemExclusionSet a set of terms not to be stemmed
+ */
+ public TurkishAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
+ super(matchVersion, stopwords);
+ this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
+ matchVersion, stemExclusionSet));
+ }
+
+ /**
+ * Creates a {@link TokenStreamComponents} which tokenizes all the text in the provided
+ * {@link Reader}.
+ *
+ * @return A {@link TokenStreamComponents} built from a {@link StandardTokenizer}
+ * filtered with {@link StandardFilter}, {@link TurkishLowerCaseFilter},
+ * {@link StopFilter}, {@link KeywordMarkerTokenFilter} if a stem
+ * exclusion set is provided and {@link SnowballFilter}.
+ */
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName,
+ Reader reader) {
+ final Tokenizer source = new StandardTokenizer(matchVersion, reader);
+ TokenStream result = new StandardFilter(source);
+ result = new TurkishLowerCaseFilter(result);
+ result = new StopFilter(matchVersion, result, stopwords);
+ if(!stemExclusionSet.isEmpty())
+ result = new KeywordMarkerTokenFilter(result, stemExclusionSet);
+ result = new SnowballFilter(result, new TurkishStemmer());
+ return new TokenStreamComponents(source, result);
+ }
+}
Property changes on: contrib\analyzers\common\src\java\org\apache\lucene\analysis\tr\TurkishAnalyzer.java
___________________________________________________________________
Added: svn:eol-style
+ native
Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/tr/package.html
===================================================================
--- contrib/analyzers/common/src/java/org/apache/lucene/analysis/tr/package.html (revision 906571)
+++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/tr/package.html (working copy)
@@ -17,15 +17,6 @@
-->
<html><head></head>
<body>
-Support for Turkish.
-<p>
-This package contains just the TokenStream for handling turkish casing,
-for a stemmer please see the snowball package.
-</p>
-<p>
-WARNING: SnowballAnalyzer uses LowerCaseFilter by default, even when the
-language is set to Turkish, so you will need to construct your own
-analyzer that combines TurkishLowerCaseFilter and SnowballFilter.
-</p>
+Analyzer for Turkish.
</body>
-</html>
\ No newline at end of file
+</html>
Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianLetterTokenizer.java
===================================================================
--- contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianLetterTokenizer.java (revision 906571)
+++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianLetterTokenizer.java (working copy)
@@ -21,6 +21,7 @@
import org.apache.lucene.analysis.CharTokenizer;
import org.apache.lucene.analysis.Tokenizer; // for javadocs
import org.apache.lucene.analysis.LetterTokenizer; // for javadocs
+import org.apache.lucene.analysis.standard.StandardTokenizer; // for javadocs
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.Version;
@@ -35,8 +36,11 @@
* <li>As of 3.1, {@link CharTokenizer} uses an int based API to normalize and
* detect token characters. See {@link CharTokenizer#isTokenChar(int)} and
* {@link CharTokenizer#normalize(int)} for details.</li>
- * </ul>
+ * </ul>
+ * @deprecated Use {@link StandardTokenizer} instead, which has the same functionality.
+ * This filter will be removed in Lucene 4.0
*/
+@Deprecated
public class RussianLetterTokenizer extends CharTokenizer
{
private static final int DIGIT_0 = '0';
Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianStemmer.java
===================================================================
--- contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianStemmer.java (revision 906571)
+++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianStemmer.java (working copy)
@@ -19,7 +19,10 @@
/**
* Russian stemming algorithm implementation (see http://snowball.sourceforge.net for detailed description).
+ * @deprecated Use {@link org.tartarus.snowball.ext.RussianStemmer} instead,
+ * which has the same functionality. This filter will be removed in Lucene 4.0
*/
+@Deprecated
class RussianStemmer
{
// positions of RV, R1 and R2 respectively
Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianStemFilter.java
===================================================================
--- contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianStemFilter.java (revision 906571)
+++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianStemFilter.java (working copy)
@@ -24,6 +24,7 @@
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.ru.RussianStemmer;//javadoc @link
+import org.apache.lucene.analysis.snowball.SnowballFilter; // javadoc @link
import java.io.IOException;
@@ -40,7 +41,11 @@
* the {@link KeywordAttribute} before this {@link TokenStream}.
* </p>
* @see KeywordMarkerTokenFilter
+ * @deprecated Use {@link SnowballFilter} with
+ * {@link org.tartarus.snowball.ext.RussianStemmer} instead, which has the
+ * same functionality. This filter will be removed in Lucene 4.0
*/
+@Deprecated
public final class RussianStemFilter extends TokenFilter
{
/**
Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java
===================================================================
--- contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java (revision 906571)
+++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java (working copy)
@@ -17,6 +17,7 @@
* limitations under the License.
*/
+import java.io.IOException;
import java.io.Reader;
import java.util.Arrays;
import java.util.Map;
@@ -26,11 +27,15 @@
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
+import org.apache.lucene.analysis.snowball.SnowballFilter;
+import org.apache.lucene.analysis.standard.StandardFilter;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.WordlistLoader;
import org.apache.lucene.util.Version;
/**
@@ -44,9 +49,11 @@
public final class RussianAnalyzer extends StopwordAnalyzerBase
{
/**
- * List of typical Russian stopwords.
+ * List of typical Russian stopwords. (for backwards compatibility)
+ * @deprecated Remove this for LUCENE 4.0
*/
- private static final String[] RUSSIAN_STOP_WORDS = {
+ @Deprecated
+ private static final String[] RUSSIAN_STOP_WORDS_30 = {
"а", "без", "более", "бы", "был", "была", "были", "было", "быть", "в",
"вам", "вас", "весь", "во", "вот", "все", "всего", "всех", "вы", "где",
"да", "даже", "для", "до", "его", "ее", "ей", "ею", "если", "есть",
@@ -59,10 +66,27 @@
"чем", "что", "чтобы", "чье", "чья", "эта", "эти", "это", "я"
};
+ /** File containing default Russian stopwords. */
+ public final static String DEFAULT_STOPWORD_FILE = "russian_stop.txt";
+
private static class DefaultSetHolder {
- static final Set<?> DEFAULT_STOP_SET = CharArraySet
+ /** @deprecated remove this for Lucene 4.0 */
+ @Deprecated
+ static final Set<?> DEFAULT_STOP_SET_30 = CharArraySet
.unmodifiableSet(new CharArraySet(Version.LUCENE_CURRENT,
- Arrays.asList(RUSSIAN_STOP_WORDS), false));
+ Arrays.asList(RUSSIAN_STOP_WORDS_30), false));
+ static final Set<?> DEFAULT_STOP_SET;
+
+ static {
+ try {
+ DEFAULT_STOP_SET =
+ WordlistLoader.getSnowballWordSet(SnowballFilter.class, DEFAULT_STOPWORD_FILE);
+ } catch (IOException ex) {
+ // default set should always be present as it is part of the
+ // distribution (JAR)
+ throw new RuntimeException("Unable to load default stopword set", ex);
+ }
+ }
}
private final Set<?> stemExclusionSet;
@@ -77,7 +101,9 @@
}
public RussianAnalyzer(Version matchVersion) {
- this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
+ this(matchVersion,
+ matchVersion.onOrAfter(Version.LUCENE_31) ? DefaultSetHolder.DEFAULT_STOP_SET
+ : DefaultSetHolder.DEFAULT_STOP_SET_30);
}
/**
@@ -132,19 +158,30 @@
* provided {@link Reader}.
*
* @return {@link TokenStreamComponents} built from a
- * {@link RussianLetterTokenizer} filtered with
+ * {@link StandardTokenizer} filtered with {@link StandardFilter},
* {@link LowerCaseFilter}, {@link StopFilter},
- * and {@link RussianStemFilter}
+ * {@link KeywordMarkerTokenFilter} if a stem exclusion set is provided,
+ * and {@link SnowballFilter}
*/
@Override
protected TokenStreamComponents createComponents(String fieldName,
Reader reader) {
- final Tokenizer source = new RussianLetterTokenizer(matchVersion, reader);
- TokenStream result = new LowerCaseFilter(matchVersion, source);
- result = new StopFilter(matchVersion, result, stopwords);
- if(!stemExclusionSet.isEmpty())
- result = new KeywordMarkerTokenFilter(result, stemExclusionSet);
- return new TokenStreamComponents(source, new RussianStemFilter(result));
-
+ if (matchVersion.onOrAfter(Version.LUCENE_31)) {
+ final Tokenizer source = new StandardTokenizer(matchVersion, reader);
+ TokenStream result = new StandardFilter(source);
+ result = new LowerCaseFilter(matchVersion, result);
+ result = new StopFilter(matchVersion, result, stopwords);
+ if (!stemExclusionSet.isEmpty()) result = new KeywordMarkerTokenFilter(
+ result, stemExclusionSet);
+ result = new SnowballFilter(result, new org.tartarus.snowball.ext.RussianStemmer());
+ return new TokenStreamComponents(source, result);
+ } else {
+ final Tokenizer source = new RussianLetterTokenizer(matchVersion, reader);
+ TokenStream result = new LowerCaseFilter(matchVersion, source);
+ result = new StopFilter(matchVersion, result, stopwords);
+ if (!stemExclusionSet.isEmpty()) result = new KeywordMarkerTokenFilter(
+ result, stemExclusionSet);
+ return new TokenStreamComponents(source, new RussianStemFilter(result));
+ }
}
}
Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekAnalyzer.java
===================================================================
--- contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekAnalyzer.java (revision 906571)
+++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekAnalyzer.java (working copy)
@@ -24,6 +24,7 @@
import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.standard.StandardAnalyzer; // for javadoc
import org.apache.lucene.util.Version;
@@ -117,13 +118,15 @@
*
* @return {@link TokenStreamComponents} built from a
* {@link StandardTokenizer} filtered with
- * {@link GreekLowerCaseFilter} and {@link StopFilter}
+ * {@link GreekLowerCaseFilter}, {@link StandardFilter} and {@link StopFilter}
*/
@Override
protected TokenStreamComponents createComponents(String fieldName,
Reader reader) {
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
- final TokenStream result = new GreekLowerCaseFilter(source);
+ TokenStream result = new GreekLowerCaseFilter(source);
+ if (matchVersion.onOrAfter(Version.LUCENE_31))
+ result = new StandardFilter(result);
return new TokenStreamComponents(source, new StopFilter(matchVersion, result, stopwords));
}
}
Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/en/EnglishAnalyzer.java
===================================================================
--- contrib/analyzers/common/src/java/org/apache/lucene/analysis/en/EnglishAnalyzer.java (revision 0)
+++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/en/EnglishAnalyzer.java (revision 0)
@@ -0,0 +1,113 @@
+package org.apache.lucene.analysis.en;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Reader;
+import java.util.Set;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
+import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.PorterStemFilter;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.analysis.standard.StandardFilter;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.util.Version;
+
+/**
+ * {@link Analyzer} for English.
+ */
+public final class EnglishAnalyzer extends StopwordAnalyzerBase {
+ private final Set<?> stemExclusionSet;
+
+ /**
+ * Returns an unmodifiable instance of the default stop words set.
+ * @return default stop words set.
+ */
+ public static Set<?> getDefaultStopSet(){
+ return DefaultSetHolder.DEFAULT_STOP_SET;
+ }
+
+ /**
+ * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
+ * accesses the static final set the first time.
+ */
+ private static class DefaultSetHolder {
+ static final Set<?> DEFAULT_STOP_SET = StandardAnalyzer.STOP_WORDS_SET;
+ }
+
+ /**
+ * Builds an analyzer with the default stop words: {@link #getDefaultStopSet}.
+ */
+ public EnglishAnalyzer(Version matchVersion) {
+ this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
+ }
+
+ /**
+ * Builds an analyzer with the given stop words.
+ *
+ * @param matchVersion lucene compatibility version
+ * @param stopwords a stopword set
+ */
+ public EnglishAnalyzer(Version matchVersion, Set<?> stopwords) {
+ this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
+ }
+
+ /**
+ * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
+ * provided this analyzer will add a {@link KeywordMarkerTokenFilter} before
+ * stemming.
+ *
+ * @param matchVersion lucene compatibility version
+ * @param stopwords a stopword set
+ * @param stemExclusionSet a set of terms not to be stemmed
+ */
+ public EnglishAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
+ super(matchVersion, stopwords);
+ this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
+ matchVersion, stemExclusionSet));
+ }
+
+ /**
+ * Creates a {@link TokenStreamComponents} which tokenizes all the text in the provided
+ * {@link Reader}.
+ *
+ * @return A {@link TokenStreamComponents} built from a {@link StandardTokenizer}
+ * filtered with {@link StandardFilter}, {@link LowerCaseFilter},
+ * {@link StopFilter}, {@link KeywordMarkerTokenFilter} if a stem
+ * exclusion set is provided and {@link PorterStemFilter}.
+ */
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName,
+ Reader reader) {
+ final Tokenizer source = new StandardTokenizer(matchVersion, reader);
+ TokenStream result = new StandardFilter(source);
+ result = new LowerCaseFilter(matchVersion, result);
+ result = new StopFilter(matchVersion, result, stopwords);
+ if(!stemExclusionSet.isEmpty())
+ result = new KeywordMarkerTokenFilter(result, stemExclusionSet);
+ result = new PorterStemFilter(result);
+ return new TokenStreamComponents(source, result);
+ }
+}
Property changes on: contrib\analyzers\common\src\java\org\apache\lucene\analysis\en\EnglishAnalyzer.java
___________________________________________________________________
Added: svn:eol-style
+ native
Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/en/package.html
===================================================================
--- contrib/analyzers/common/src/java/org/apache/lucene/analysis/en/package.html (revision 0)
+++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/en/package.html (revision 0)
@@ -0,0 +1,22 @@
+<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<html><head></head>
+<body>
+Analyzer for English.
+</body>
+</html>
Property changes on: contrib\analyzers\common\src\java\org\apache\lucene\analysis\en\package.html
___________________________________________________________________
Added: svn:eol-style
+ native
Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/position/PositionFilter.java
===================================================================
--- contrib/analyzers/common/src/java/org/apache/lucene/analysis/position/PositionFilter.java (revision 906571)
+++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/position/PositionFilter.java (working copy)
@@ -21,7 +21,6 @@
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
/** Set the positionIncrement of all tokens to the "positionIncrement",
Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/es/SpanishAnalyzer.java
===================================================================
--- contrib/analyzers/common/src/java/org/apache/lucene/analysis/es/SpanishAnalyzer.java (revision 0)
+++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/es/SpanishAnalyzer.java (revision 0)
@@ -0,0 +1,129 @@
+package org.apache.lucene.analysis.es;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+import java.util.Set;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
+import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.WordlistLoader;
+import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
+import org.apache.lucene.analysis.snowball.SnowballFilter;
+import org.apache.lucene.analysis.standard.StandardFilter;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.util.Version;
+import org.tartarus.snowball.ext.SpanishStemmer;
+
+/**
+ * {@link Analyzer} for Spanish.
+ */
+public final class SpanishAnalyzer extends StopwordAnalyzerBase {
+ private final Set<?> stemExclusionSet;
+
+ /** File containing default Spanish stopwords. */
+ public final static String DEFAULT_STOPWORD_FILE = "spanish_stop.txt";
+
+ /**
+ * Returns an unmodifiable instance of the default stop words set.
+ * @return default stop words set.
+ */
+ public static Set<?> getDefaultStopSet(){
+ return DefaultSetHolder.DEFAULT_STOP_SET;
+ }
+
+ /**
+ * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
+ * accesses the static final set the first time.
+ */
+ private static class DefaultSetHolder {
+ static final Set<?> DEFAULT_STOP_SET;
+
+ static {
+ try {
+ DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class,
+ DEFAULT_STOPWORD_FILE);
+ } catch (IOException ex) {
+ // default set should always be present as it is part of the
+ // distribution (JAR)
+ throw new RuntimeException("Unable to load default stopword set", ex);
+ }
+ }
+ }
+
+ /**
+ * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
+ */
+ public SpanishAnalyzer(Version matchVersion) {
+ this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
+ }
+
+ /**
+ * Builds an analyzer with the given stop words.
+ *
+ * @param matchVersion lucene compatibility version
+ * @param stopwords a stopword set
+ */
+ public SpanishAnalyzer(Version matchVersion, Set<?> stopwords) {
+ this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
+ }
+
+ /**
+ * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
+ * provided this analyzer will add a {@link KeywordMarkerTokenFilter} before
+ * stemming.
+ *
+ * @param matchVersion lucene compatibility version
+ * @param stopwords a stopword set
+ * @param stemExclusionSet a set of terms not to be stemmed
+ */
+ public SpanishAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
+ super(matchVersion, stopwords);
+ this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
+ matchVersion, stemExclusionSet));
+ }
+
+ /**
+ * Creates a {@link TokenStreamComponents} which tokenizes all the text in the provided
+ * {@link Reader}.
+ *
+ * @return A {@link TokenStreamComponents} built from a {@link StandardTokenizer}
+ * filtered with {@link StandardFilter}, {@link LowerCaseFilter},
+ * {@link StopFilter}, {@link KeywordMarkerTokenFilter} if a stem
+ * exclusion set is provided and {@link SnowballFilter}.
+ */
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName,
+ Reader reader) {
+ final Tokenizer source = new StandardTokenizer(matchVersion, reader);
+ TokenStream result = new StandardFilter(source);
+ result = new LowerCaseFilter(matchVersion, result);
+ result = new StopFilter(matchVersion, result, stopwords);
+ if(!stemExclusionSet.isEmpty())
+ result = new KeywordMarkerTokenFilter(result, stemExclusionSet);
+ result = new SnowballFilter(result, new SpanishStemmer());
+ return new TokenStreamComponents(source, result);
+ }
+}
Property changes on: contrib\analyzers\common\src\java\org\apache\lucene\analysis\es\SpanishAnalyzer.java
___________________________________________________________________
Added: svn:eol-style
+ native
Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/es/package.html
===================================================================
--- contrib/analyzers/common/src/java/org/apache/lucene/analysis/es/package.html (revision 0)
+++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/es/package.html (revision 0)
@@ -0,0 +1,22 @@
+<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<html><head></head>
+<body>
+Analyzer for Spanish.
+</body>
+</html>
Property changes on: contrib\analyzers\common\src\java\org\apache\lucene\analysis\es\package.html
___________________________________________________________________
Added: svn:eol-style
+ native
Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/it/ItalianAnalyzer.java
===================================================================
--- contrib/analyzers/common/src/java/org/apache/lucene/analysis/it/ItalianAnalyzer.java (revision 0)
+++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/it/ItalianAnalyzer.java (revision 0)
@@ -0,0 +1,129 @@
+package org.apache.lucene.analysis.it;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+import java.util.Set;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
+import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.WordlistLoader;
+import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
+import org.apache.lucene.analysis.snowball.SnowballFilter;
+import org.apache.lucene.analysis.standard.StandardFilter;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.util.Version;
+import org.tartarus.snowball.ext.ItalianStemmer;
+
+/**
+ * {@link Analyzer} for Italian.
+ */
+public final class ItalianAnalyzer extends StopwordAnalyzerBase {
+ private final Set<?> stemExclusionSet;
+
+ /** File containing default Italian stopwords. */
+ public final static String DEFAULT_STOPWORD_FILE = "italian_stop.txt";
+
+ /**
+ * Returns an unmodifiable instance of the default stop words set.
+ * @return default stop words set.
+ */
+ public static Set<?> getDefaultStopSet(){
+ return DefaultSetHolder.DEFAULT_STOP_SET;
+ }
+
+ /**
+ * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
+ * accesses the static final set the first time.
+ */
+ private static class DefaultSetHolder {
+ static final Set<?> DEFAULT_STOP_SET;
+
+ static {
+ try {
+ DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class,
+ DEFAULT_STOPWORD_FILE);
+ } catch (IOException ex) {
+ // default set should always be present as it is part of the
+ // distribution (JAR)
+ throw new RuntimeException("Unable to load default stopword set");
+ }
+ }
+ }
+
+ /**
+ * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
+ */
+ public ItalianAnalyzer(Version matchVersion) {
+ this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
+ }
+
+ /**
+ * Builds an analyzer with the given stop words.
+ *
+ * @param matchVersion lucene compatibility version
+ * @param stopwords a stopword set
+ */
+ public ItalianAnalyzer(Version matchVersion, Set<?> stopwords) {
+ this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
+ }
+
+ /**
+ * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
+ * provided this analyzer will add a {@link KeywordMarkerTokenFilter} before
+ * stemming.
+ *
+ * @param matchVersion lucene compatibility version
+ * @param stopwords a stopword set
+ * @param stemExclusionSet a set of terms not to be stemmed
+ */
+ public ItalianAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
+ super(matchVersion, stopwords);
+ this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
+ matchVersion, stemExclusionSet));
+ }
+
+ /**
+ * Creates a {@link TokenStreamComponents} which tokenizes all the text in the provided
+ * {@link Reader}.
+ *
+ * @return A {@link TokenStreamComponents} built from a {@link StandardTokenizer}
+ * filtered with {@link StandardFilter}, {@link LowerCaseFilter},
+ * {@link StopFilter}, {@link KeywordMarkerTokenFilter} if a stem
+ * exclusion set is provided and {@link SnowballFilter}.
+ */
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName,
+ Reader reader) {
+ final Tokenizer source = new StandardTokenizer(matchVersion, reader);
+ TokenStream result = new StandardFilter(source);
+ result = new LowerCaseFilter(matchVersion, result);
+ result = new StopFilter(matchVersion, result, stopwords);
+ if(!stemExclusionSet.isEmpty())
+ result = new KeywordMarkerTokenFilter(result, stemExclusionSet);
+ result = new SnowballFilter(result, new ItalianStemmer());
+ return new TokenStreamComponents(source, result);
+ }
+}
Property changes on: contrib\analyzers\common\src\java\org\apache\lucene\analysis\it\ItalianAnalyzer.java
___________________________________________________________________
Added: svn:eol-style
+ native
Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/it/package.html
===================================================================
--- contrib/analyzers/common/src/java/org/apache/lucene/analysis/it/package.html (revision 0)
+++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/it/package.html (revision 0)
@@ -0,0 +1,22 @@
+<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<html><head></head>
+<body>
+Analyzer for Italian.
+</body>
+</html>
Property changes on: contrib\analyzers\common\src\java\org\apache\lucene\analysis\it\package.html
___________________________________________________________________
Added: svn:eol-style
+ native
Index: contrib/analyzers/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt
===================================================================
--- contrib/analyzers/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt (revision 0)
+++ contrib/analyzers/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt (revision 0)
@@ -0,0 +1,233 @@
+# This file was created by Jacques Savoy and is distributed under the BSD license.
+# See http://members.unine.ch/jacques.savoy/clef/index.html.
+# Also see http://www.opensource.org/licenses/bsd-license.html
+acea
+aceasta
+această
+aceea
+acei
+aceia
+acel
+acela
+acele
+acelea
+acest
+acesta
+aceste
+acestea
+aceşti
+aceştia
+acolo
+acum
+ai
+aia
+aibă
+aici
+al
+ăla
+ale
+alea
+ălea
+altceva
+altcineva
+am
+ar
+are
+aş
+aşadar
+asemenea
+asta
+ăsta
+astăzi
+astea
+ăstea
+ăştia
+asupra
+aţi
+au
+avea
+avem
+aveţi
+azi
+bine
+bucur
+bună
+ca
+că
+căci
+când
+care
+cărei
+căror
+cărui
+cât
+câte
+câţi
+către
+câtva
+ce
+cel
+ceva
+chiar
+cînd
+cine
+cineva
+cît
+cîte
+cîţi
+cîtva
+contra
+cu
+cum
+cumva
+curând
+curînd
+da
+dă
+dacă
+dar
+datorită
+de
+deci
+deja
+deoarece
+departe
+deşi
+din
+dinaintea
+dintr
+dintre
+drept
+după
+ea
+ei
+el
+ele
+eram
+este
+eşti
+eu
+face
+fără
+fi
+fie
+fiecare
+fii
+fim
+fiţi
+iar
+ieri
+îi
+îl
+îmi
+împotriva
+în
+înainte
+înaintea
+încât
+încît
+încotro
+între
+întrucât
+întrucît
+îţi
+la
+lângă
+le
+li
+lîngă
+lor
+lui
+mă
+mâine
+mea
+mei
+mele
+mereu
+meu
+mi
+mine
+mult
+multă
+mulţi
+ne
+nicăieri
+nici
+nimeni
+nişte
+noastră
+noastre
+noi
+noştri
+nostru
+nu
+ori
+oricând
+oricare
+oricât
+orice
+oricînd
+oricine
+oricît
+oricum
+oriunde
+până
+pe
+pentru
+peste
+pînă
+poate
+pot
+prea
+prima
+primul
+prin
+printr
+sa
+să
+săi
+sale
+sau
+său
+se
+şi
+sînt
+sîntem
+sînteţi
+spre
+sub
+sunt
+suntem
+sunteţi
+ta
+tăi
+tale
+tău
+te
+ţi
+ţie
+tine
+toată
+toate
+tot
+toţi
+totuşi
+tu
+un
+una
+unde
+undeva
+unei
+unele
+uneori
+unor
+vă
+vi
+voastră
+voastre
+voi
+voştri
+vostru
+vouă
+vreo
+vreun
Property changes on: contrib\analyzers\common\src\resources\org\apache\lucene\analysis\ro\stopwords.txt
___________________________________________________________________
Added: svn:eol-style
+ native
Index: contrib/analyzers/common/src/resources/org/apache/lucene/analysis/tr/stopwords.txt
===================================================================
--- contrib/analyzers/common/src/resources/org/apache/lucene/analysis/tr/stopwords.txt (revision 0)
+++ contrib/analyzers/common/src/resources/org/apache/lucene/analysis/tr/stopwords.txt (revision 0)
@@ -0,0 +1,212 @@
+# Turkish stopwords from LUCENE-559
+# merged with the list from "Information Retrieval on Turkish Texts"
+# (http://www.users.muohio.edu/canf/papers/JASIST2008offPrint.pdf)
+acaba
+altmış
+altı
+ama
+ancak
+arada
+aslında
+ayrıca
+bana
+bazı
+belki
+ben
+benden
+beni
+benim
+beri
+beş
+bile
+bin
+bir
+birçok
+biri
+birkaç
+birkez
+birşey
+birşeyi
+biz
+bize
+bizden
+bizi
+bizim
+böyle
+böylece
+bu
+buna
+bunda
+bundan
+bunlar
+bunları
+bunların
+bunu
+bunun
+burada
+çok
+çünkü
+da
+daha
+dahi
+de
+defa
+değil
+diğer
+diye
+doksan
+dokuz
+dolayı
+dolayısıyla
+dört
+edecek
+eden
+ederek
+edilecek
+ediliyor
+edilmesi
+ediyor
+eğer
+elli
+en
+etmesi
+etti
+ettiği
+ettiğini
+gibi
+göre
+halen
+hangi
+hatta
+hem
+henüz
+hep
+hepsi
+her
+herhangi
+herkesin
+hiç
+hiçbir
+için
+iki
+ile
+ilgili
+ise
+işte
+itibaren
+itibariyle
+kadar
+karşın
+katrilyon
+kendi
+kendilerine
+kendini
+kendisi
+kendisine
+kendisini
+kez
+ki
+kim
+kimden
+kime
+kimi
+kimse
+kırk
+milyar
+milyon
+mu
+mü
+mı
+nasıl
+ne
+neden
+nedenle
+nerde
+nerede
+nereye
+niye
+niçin
+o
+olan
+olarak
+oldu
+olduğu
+olduğunu
+olduklarını
+olmadı
+olmadığı
+olmak
+olması
+olmayan
+olmaz
+olsa
+olsun
+olup
+olur
+olursa
+oluyor
+on
+ona
+ondan
+onlar
+onlardan
+onları
+onların
+onu
+onun
+otuz
+oysa
+öyle
+pek
+rağmen
+sadece
+sanki
+sekiz
+seksen
+sen
+senden
+seni
+senin
+siz
+sizden
+sizi
+sizin
+şey
+şeyden
+şeyi
+şeyler
+şöyle
+şu
+şuna
+şunda
+şundan
+şunları
+şunu
+tarafından
+trilyon
+tüm
+üç
+üzere
+var
+vardı
+ve
+veya
+ya
+yani
+yapacak
+yapılan
+yapılması
+yapıyor
+yapmak
+yaptı
+yaptığı
+yaptığını
+yaptıkları
+yedi
+yerine
+yetmiş
+yine
+yirmi
+yoksa
+yüz
+zaten
Property changes on: contrib\analyzers\common\src\resources\org\apache\lucene\analysis\tr\stopwords.txt
___________________________________________________________________
Added: svn:eol-style
+ native