| Index: NOTICE.txt
|
| ===================================================================
|
| --- NOTICE.txt (revision 906571)
|
| +++ NOTICE.txt (working copy)
|
| @@ -23,6 +23,11 @@
|
| contrib/analyzers/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt. |
| See http://members.unine.ch/jacques.savoy/clef/index.html. |
| |
| +The Romanian analyzer (contrib/analyzers) comes with a default |
| +stopword list that is BSD-licensed created by Jacques Savoy. The file resides in |
| +contrib/analyzers/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt. |
| +See http://members.unine.ch/jacques.savoy/clef/index.html. |
| + |
| The Bulgarian analyzer (contrib/analyzers) comes with a default |
| stopword list that is BSD-licensed created by Jacques Savoy. The file resides in |
| contrib/analyzers/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt. |
| Index: contrib/analyzers/common/src/test/org/apache/lucene/analysis/da/TestDanishAnalyzer.java
|
| ===================================================================
|
| --- contrib/analyzers/common/src/test/org/apache/lucene/analysis/da/TestDanishAnalyzer.java (revision 0)
|
| +++ contrib/analyzers/common/src/test/org/apache/lucene/analysis/da/TestDanishAnalyzer.java (revision 0)
|
| @@ -0,0 +1,54 @@
|
| +package org.apache.lucene.analysis.da; |
| + |
| +/** |
| + * Licensed to the Apache Software Foundation (ASF) under one or more |
| + * contributor license agreements. See the NOTICE file distributed with |
| + * this work for additional information regarding copyright ownership. |
| + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| + * (the "License"); you may not use this file except in compliance with |
| + * the License. You may obtain a copy of the License at |
| + * |
| + * http://www.apache.org/licenses/LICENSE-2.0 |
| + * |
| + * Unless required by applicable law or agreed to in writing, software |
| + * distributed under the License is distributed on an "AS IS" BASIS, |
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + * See the License for the specific language governing permissions and |
| + * limitations under the License. |
| + */ |
| + |
| +import java.io.IOException; |
| +import java.util.HashSet; |
| +import java.util.Set; |
| + |
| +import org.apache.lucene.analysis.Analyzer; |
| +import org.apache.lucene.analysis.BaseTokenStreamTestCase; |
| +import org.apache.lucene.util.Version; |
| + |
| +public class TestDanishAnalyzer extends BaseTokenStreamTestCase { |
| + /** This test fails with NPE when the |
| + * stopwords file is missing in classpath */ |
| + public void testResourcesAvailable() { |
| + new DanishAnalyzer(Version.LUCENE_CURRENT); |
| + } |
| + |
| + /** test stopwords and stemming */ |
| + public void testBasics() throws IOException { |
| + Analyzer a = new DanishAnalyzer(Version.LUCENE_CURRENT); |
| + // stemming |
| + checkOneTermReuse(a, "undersøg", "undersøg"); |
| + checkOneTermReuse(a, "undersøgelse", "undersøg"); |
| + // stopword |
| + assertAnalyzesTo(a, "på", new String[] {}); |
| + } |
| + |
| + /** test use of exclusion set */ |
| + public void testExclude() throws IOException { |
| + Set<String> exclusionSet = new HashSet<String>(); |
| + exclusionSet.add("undersøgelse"); |
| + Analyzer a = new DanishAnalyzer(Version.LUCENE_CURRENT, |
| + DanishAnalyzer.getDefaultStopSet(), exclusionSet); |
| + checkOneTermReuse(a, "undersøgelse", "undersøgelse"); |
| + checkOneTermReuse(a, "undersøg", "undersøg"); |
| + } |
| +} |
|
|
| Property changes on: contrib\analyzers\common\src\test\org\apache\lucene\analysis\da\TestDanishAnalyzer.java
|
| ___________________________________________________________________
|
| Added: svn:eol-style
|
| + native
|
|
|
| Index: contrib/analyzers/common/src/test/org/apache/lucene/analysis/de/TestGermanAnalyzer.java
|
| ===================================================================
|
| --- contrib/analyzers/common/src/test/org/apache/lucene/analysis/de/TestGermanAnalyzer.java (revision 0)
|
| +++ contrib/analyzers/common/src/test/org/apache/lucene/analysis/de/TestGermanAnalyzer.java (revision 0)
|
| @@ -0,0 +1,93 @@
|
| +package org.apache.lucene.analysis.de; |
| + |
| +/** |
| + * Licensed to the Apache Software Foundation (ASF) under one or more |
| + * contributor license agreements. See the NOTICE file distributed with |
| + * this work for additional information regarding copyright ownership. |
| + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| + * (the "License"); you may not use this file except in compliance with |
| + * the License. You may obtain a copy of the License at |
| + * |
| + * http://www.apache.org/licenses/LICENSE-2.0 |
| + * |
| + * Unless required by applicable law or agreed to in writing, software |
| + * distributed under the License is distributed on an "AS IS" BASIS, |
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + * See the License for the specific language governing permissions and |
| + * limitations under the License. |
| + */ |
| + |
| +import java.io.IOException; |
| +import java.io.StringReader; |
| + |
| +import org.apache.lucene.analysis.Analyzer; |
| +import org.apache.lucene.analysis.BaseTokenStreamTestCase; |
| +import org.apache.lucene.analysis.CharArraySet; |
| +import org.apache.lucene.analysis.KeywordMarkerTokenFilter; |
| +import org.apache.lucene.analysis.LowerCaseTokenizer; |
| +import org.apache.lucene.util.Version; |
| + |
| +public class TestGermanAnalyzer extends BaseTokenStreamTestCase { |
| + public void testReusableTokenStream() throws Exception { |
| + Analyzer a = new GermanAnalyzer(Version.LUCENE_CURRENT); |
| + checkOneTermReuse(a, "Tisch", "tisch"); |
| + checkOneTermReuse(a, "Tische", "tisch"); |
| + checkOneTermReuse(a, "Tischen", "tisch"); |
| + } |
| + |
| + public void testExclusionTableBWCompat() throws IOException { |
| + GermanStemFilter filter = new GermanStemFilter(new LowerCaseTokenizer(Version.LUCENE_CURRENT, |
| + new StringReader("Fischen Trinken"))); |
| + CharArraySet set = new CharArraySet(Version.LUCENE_CURRENT, 1, true); |
| + set.add("fischen"); |
| + filter.setExclusionSet(set); |
| + assertTokenStreamContents(filter, new String[] { "fischen", "trink" }); |
| + } |
| + |
| + public void testWithKeywordAttribute() throws IOException { |
| + CharArraySet set = new CharArraySet(Version.LUCENE_CURRENT, 1, true); |
| + set.add("fischen"); |
| + GermanStemFilter filter = new GermanStemFilter( |
| + new KeywordMarkerTokenFilter(new LowerCaseTokenizer(Version.LUCENE_CURRENT, new StringReader( |
| + "Fischen Trinken")), set)); |
| + assertTokenStreamContents(filter, new String[] { "fischen", "trink" }); |
| + } |
| + |
| + public void testWithKeywordAttributeAndExclusionTable() throws IOException { |
| + CharArraySet set = new CharArraySet(Version.LUCENE_CURRENT, 1, true); |
| + set.add("fischen"); |
| + CharArraySet set1 = new CharArraySet(Version.LUCENE_CURRENT, 1, true); |
| + set1.add("trinken"); |
| + set1.add("fischen"); |
| + GermanStemFilter filter = new GermanStemFilter( |
| + new KeywordMarkerTokenFilter(new LowerCaseTokenizer(Version.LUCENE_CURRENT, new StringReader( |
| + "Fischen Trinken")), set)); |
| + filter.setExclusionSet(set1); |
| + assertTokenStreamContents(filter, new String[] { "fischen", "trinken" }); |
| + } |
| + |
| + /* |
| + * Test that changes to the exclusion table are applied immediately |
| + * when using reusable token streams. |
| + */ |
| + public void testExclusionTableReuse() throws Exception { |
| + GermanAnalyzer a = new GermanAnalyzer(Version.LUCENE_CURRENT); |
| + checkOneTermReuse(a, "tischen", "tisch"); |
| + a.setStemExclusionTable(new String[] { "tischen" }); |
| + checkOneTermReuse(a, "tischen", "tischen"); |
| + } |
| + |
| + /** test some features of the new snowball filter |
| + * these only pass with LUCENE_CURRENT, not if you use o.a.l.a.de.GermanStemmer |
| + */ |
| + public void testGermanSpecials() throws Exception { |
| + GermanAnalyzer a = new GermanAnalyzer(Version.LUCENE_CURRENT); |
| + // a/o/u + e is equivalent to the umlaut form |
| + checkOneTermReuse(a, "Schaltflächen", "schaltflach"); |
| + checkOneTermReuse(a, "Schaltflaechen", "schaltflach"); |
| + // here they are with the old stemmer |
| + a = new GermanAnalyzer(Version.LUCENE_30); |
| + checkOneTermReuse(a, "Schaltflächen", "schaltflach"); |
| + checkOneTermReuse(a, "Schaltflaechen", "schaltflaech"); |
| + } |
| +} |
|
|
| Property changes on: contrib\analyzers\common\src\test\org\apache\lucene\analysis\de\TestGermanAnalyzer.java
|
| ___________________________________________________________________
|
| Added: svn:eol-style
|
| + native
|
|
|
| Index: contrib/analyzers/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java
|
| ===================================================================
|
| --- contrib/analyzers/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java (revision 906571)
|
| +++ contrib/analyzers/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java (working copy)
|
| @@ -20,15 +20,14 @@
|
| import java.io.BufferedReader; |
| import java.io.File; |
| import java.io.FileInputStream; |
| -import java.io.IOException; |
| import java.io.InputStreamReader; |
| import java.io.StringReader; |
| |
| import org.apache.lucene.analysis.BaseTokenStreamTestCase; |
| -import org.apache.lucene.analysis.Analyzer; |
| -import org.apache.lucene.analysis.CharArraySet; |
| -import org.apache.lucene.analysis.KeywordMarkerTokenFilter; |
| -import org.apache.lucene.analysis.LowerCaseTokenizer; |
| +import org.apache.lucene.analysis.KeywordTokenizer; |
| +import org.apache.lucene.analysis.LowerCaseFilter; |
| +import org.apache.lucene.analysis.TokenFilter; |
| +import org.apache.lucene.analysis.Tokenizer; |
| import org.apache.lucene.util.Version; |
| |
| /** |
| @@ -40,6 +39,8 @@
|
| public class TestGermanStemFilter extends BaseTokenStreamTestCase { |
| |
| public void testStemming() throws Exception { |
| + Tokenizer tokenizer = new KeywordTokenizer(new StringReader("")); |
| + TokenFilter filter = new GermanStemFilter(new LowerCaseFilter(Version.LUCENE_CURRENT, tokenizer)); |
| // read test cases from external file: |
| File dataDir = new File(System.getProperty("dataDir", "./bin")); |
| File testFile = new File(dataDir, "org/apache/lucene/analysis/de/data.txt"); |
| @@ -55,68 +56,12 @@
|
| continue; // ignore comments and empty lines |
| String[] parts = line.split(";"); |
| //System.out.println(parts[0] + " -- " + parts[1]); |
| - check(parts[0], parts[1]); |
| + tokenizer.reset(new StringReader(parts[0])); |
| + filter.reset(); |
| + assertTokenStreamContents(filter, new String[] { parts[1] }); |
| } |
| breader.close(); |
| isr.close(); |
| fis.close(); |
| } |
| - |
| - public void testReusableTokenStream() throws Exception { |
| - Analyzer a = new GermanAnalyzer(Version.LUCENE_CURRENT); |
| - checkReuse(a, "Tisch", "tisch"); |
| - checkReuse(a, "Tische", "tisch"); |
| - checkReuse(a, "Tischen", "tisch"); |
| - } |
| - |
| - public void testExclusionTableBWCompat() throws IOException { |
| - GermanStemFilter filter = new GermanStemFilter(new LowerCaseTokenizer(Version.LUCENE_CURRENT, |
| - new StringReader("Fischen Trinken"))); |
| - CharArraySet set = new CharArraySet(Version.LUCENE_CURRENT, 1, true); |
| - set.add("fischen"); |
| - filter.setExclusionSet(set); |
| - assertTokenStreamContents(filter, new String[] { "fischen", "trink" }); |
| - } |
| - |
| - public void testWithKeywordAttribute() throws IOException { |
| - CharArraySet set = new CharArraySet(Version.LUCENE_CURRENT, 1, true); |
| - set.add("fischen"); |
| - GermanStemFilter filter = new GermanStemFilter( |
| - new KeywordMarkerTokenFilter(new LowerCaseTokenizer(Version.LUCENE_CURRENT, new StringReader( |
| - "Fischen Trinken")), set)); |
| - assertTokenStreamContents(filter, new String[] { "fischen", "trink" }); |
| - } |
| - |
| - public void testWithKeywordAttributeAndExclusionTable() throws IOException { |
| - CharArraySet set = new CharArraySet(Version.LUCENE_CURRENT, 1, true); |
| - set.add("fischen"); |
| - CharArraySet set1 = new CharArraySet(Version.LUCENE_CURRENT, 1, true); |
| - set1.add("trinken"); |
| - set1.add("fischen"); |
| - GermanStemFilter filter = new GermanStemFilter( |
| - new KeywordMarkerTokenFilter(new LowerCaseTokenizer(Version.LUCENE_CURRENT, new StringReader( |
| - "Fischen Trinken")), set)); |
| - filter.setExclusionSet(set1); |
| - assertTokenStreamContents(filter, new String[] { "fischen", "trinken" }); |
| - } |
| - |
| - /* |
| - * Test that changes to the exclusion table are applied immediately |
| - * when using reusable token streams. |
| - */ |
| - public void testExclusionTableReuse() throws Exception { |
| - GermanAnalyzer a = new GermanAnalyzer(Version.LUCENE_CURRENT); |
| - checkReuse(a, "tischen", "tisch"); |
| - a.setStemExclusionTable(new String[] { "tischen" }); |
| - checkReuse(a, "tischen", "tischen"); |
| - } |
| - |
| - |
| - private void check(final String input, final String expected) throws Exception { |
| - checkOneTerm(new GermanAnalyzer(Version.LUCENE_CURRENT), input, expected); |
| - } |
| - |
| - private void checkReuse(Analyzer a, String input, String expected) throws Exception { |
| - checkOneTermReuse(a, input, expected); |
| - } |
| } |
| Index: contrib/analyzers/common/src/test/org/apache/lucene/analysis/sv/TestSwedishAnalyzer.java
|
| ===================================================================
|
| --- contrib/analyzers/common/src/test/org/apache/lucene/analysis/sv/TestSwedishAnalyzer.java (revision 0)
|
| +++ contrib/analyzers/common/src/test/org/apache/lucene/analysis/sv/TestSwedishAnalyzer.java (revision 0)
|
| @@ -0,0 +1,54 @@
|
| +package org.apache.lucene.analysis.sv; |
| + |
| +/** |
| + * Licensed to the Apache Software Foundation (ASF) under one or more |
| + * contributor license agreements. See the NOTICE file distributed with |
| + * this work for additional information regarding copyright ownership. |
| + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| + * (the "License"); you may not use this file except in compliance with |
| + * the License. You may obtain a copy of the License at |
| + * |
| + * http://www.apache.org/licenses/LICENSE-2.0 |
| + * |
| + * Unless required by applicable law or agreed to in writing, software |
| + * distributed under the License is distributed on an "AS IS" BASIS, |
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + * See the License for the specific language governing permissions and |
| + * limitations under the License. |
| + */ |
| + |
| +import java.io.IOException; |
| +import java.util.HashSet; |
| +import java.util.Set; |
| + |
| +import org.apache.lucene.analysis.Analyzer; |
| +import org.apache.lucene.analysis.BaseTokenStreamTestCase; |
| +import org.apache.lucene.util.Version; |
| + |
| +public class TestSwedishAnalyzer extends BaseTokenStreamTestCase { |
| + /** This test fails with NPE when the |
| + * stopwords file is missing in classpath */ |
| + public void testResourcesAvailable() { |
| + new SwedishAnalyzer(Version.LUCENE_CURRENT); |
| + } |
| + |
| + /** test stopwords and stemming */ |
| + public void testBasics() throws IOException { |
| + Analyzer a = new SwedishAnalyzer(Version.LUCENE_CURRENT); |
| + // stemming |
| + checkOneTermReuse(a, "jaktkarlarne", "jaktkarl"); |
| + checkOneTermReuse(a, "jaktkarlens", "jaktkarl"); |
| + // stopword |
| + assertAnalyzesTo(a, "och", new String[] {}); |
| + } |
| + |
| + /** test use of exclusion set */ |
| + public void testExclude() throws IOException { |
| + Set<String> exclusionSet = new HashSet<String>(); |
| + exclusionSet.add("jaktkarlarne"); |
| + Analyzer a = new SwedishAnalyzer(Version.LUCENE_CURRENT, |
| + SwedishAnalyzer.getDefaultStopSet(), exclusionSet); |
| + checkOneTermReuse(a, "jaktkarlarne", "jaktkarlarne"); |
| + checkOneTermReuse(a, "jaktkarlens", "jaktkarl"); |
| + } |
| +} |
|
|
| Property changes on: contrib\analyzers\common\src\test\org\apache\lucene\analysis\sv\TestSwedishAnalyzer.java
|
| ___________________________________________________________________
|
| Added: svn:eol-style
|
| + native
|
|
|
| Index: contrib/analyzers/common/src/test/org/apache/lucene/analysis/fi/TestFinnishAnalyzer.java
|
| ===================================================================
|
| --- contrib/analyzers/common/src/test/org/apache/lucene/analysis/fi/TestFinnishAnalyzer.java (revision 0)
|
| +++ contrib/analyzers/common/src/test/org/apache/lucene/analysis/fi/TestFinnishAnalyzer.java (revision 0)
|
| @@ -0,0 +1,54 @@
|
| +package org.apache.lucene.analysis.fi; |
| + |
| +/** |
| + * Licensed to the Apache Software Foundation (ASF) under one or more |
| + * contributor license agreements. See the NOTICE file distributed with |
| + * this work for additional information regarding copyright ownership. |
| + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| + * (the "License"); you may not use this file except in compliance with |
| + * the License. You may obtain a copy of the License at |
| + * |
| + * http://www.apache.org/licenses/LICENSE-2.0 |
| + * |
| + * Unless required by applicable law or agreed to in writing, software |
| + * distributed under the License is distributed on an "AS IS" BASIS, |
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + * See the License for the specific language governing permissions and |
| + * limitations under the License. |
| + */ |
| + |
| +import java.io.IOException; |
| +import java.util.HashSet; |
| +import java.util.Set; |
| + |
| +import org.apache.lucene.analysis.Analyzer; |
| +import org.apache.lucene.analysis.BaseTokenStreamTestCase; |
| +import org.apache.lucene.util.Version; |
| + |
| +public class TestFinnishAnalyzer extends BaseTokenStreamTestCase { |
| + /** This test fails with NPE when the |
| + * stopwords file is missing in classpath */ |
| + public void testResourcesAvailable() { |
| + new FinnishAnalyzer(Version.LUCENE_CURRENT); |
| + } |
| + |
| + /** test stopwords and stemming */ |
| + public void testBasics() throws IOException { |
| + Analyzer a = new FinnishAnalyzer(Version.LUCENE_CURRENT); |
| + // stemming |
| + checkOneTermReuse(a, "edeltäjiinsä", "edeltäj"); |
| + checkOneTermReuse(a, "edeltäjistään", "edeltäj"); |
| + // stopword |
| + assertAnalyzesTo(a, "olla", new String[] {}); |
| + } |
| + |
| + /** test use of exclusion set */ |
| + public void testExclude() throws IOException { |
| + Set<String> exclusionSet = new HashSet<String>(); |
| + exclusionSet.add("edeltäjistään"); |
| + Analyzer a = new FinnishAnalyzer(Version.LUCENE_CURRENT, |
| + FinnishAnalyzer.getDefaultStopSet(), exclusionSet); |
| + checkOneTermReuse(a, "edeltäjiinsä", "edeltäj"); |
| + checkOneTermReuse(a, "edeltäjistään", "edeltäjistään"); |
| + } |
| +} |
|
|
| Property changes on: contrib\analyzers\common\src\test\org\apache\lucene\analysis\fi\TestFinnishAnalyzer.java
|
| ___________________________________________________________________
|
| Added: svn:eol-style
|
| + native
|
|
|
| Index: contrib/analyzers/common/src/test/org/apache/lucene/analysis/miscellaneous/TestStemmerOverrideFilter.java
|
| ===================================================================
|
| --- contrib/analyzers/common/src/test/org/apache/lucene/analysis/miscellaneous/TestStemmerOverrideFilter.java (revision 0)
|
| +++ contrib/analyzers/common/src/test/org/apache/lucene/analysis/miscellaneous/TestStemmerOverrideFilter.java (revision 0)
|
| @@ -0,0 +1,44 @@
|
| +package org.apache.lucene.analysis.miscellaneous; |
| + |
| +import java.io.IOException; |
| +import java.io.StringReader; |
| +import java.util.HashMap; |
| +import java.util.Map; |
| + |
| +import org.apache.lucene.analysis.BaseTokenStreamTestCase; |
| +import org.apache.lucene.analysis.KeywordTokenizer; |
| +import org.apache.lucene.analysis.PorterStemFilter; |
| +import org.apache.lucene.analysis.TokenStream; |
| +import org.apache.lucene.analysis.Tokenizer; |
| +import org.apache.lucene.util.Version; |
| + |
| +/** |
| + * Licensed to the Apache Software Foundation (ASF) under one or more |
| + * contributor license agreements. See the NOTICE file distributed with |
| + * this work for additional information regarding copyright ownership. |
| + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| + * (the "License"); you may not use this file except in compliance with |
| + * the License. You may obtain a copy of the License at |
| + * |
| + * http://www.apache.org/licenses/LICENSE-2.0 |
| + * |
| + * Unless required by applicable law or agreed to in writing, software |
| + * distributed under the License is distributed on an "AS IS" BASIS, |
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + * See the License for the specific language governing permissions and |
| + * limitations under the License. |
| + */ |
| + |
| +public class TestStemmerOverrideFilter extends BaseTokenStreamTestCase { |
| + public void testOverride() throws IOException { |
| + // lets make booked stem to books |
| + // the override filter will convert "booked" to "books", |
| + // but also mark it with KeywordAttribute so Porter will not change it. |
| + Map<String,String> dictionary = new HashMap<String,String>(); |
| + dictionary.put("booked", "books"); |
| + Tokenizer tokenizer = new KeywordTokenizer(new StringReader("booked")); |
| + TokenStream stream = new PorterStemFilter( |
| + new StemmerOverrideFilter(Version.LUCENE_CURRENT, tokenizer, dictionary)); |
| + assertTokenStreamContents(stream, new String[] { "books" }); |
| + } |
| +} |
|
|
| Property changes on: contrib\analyzers\common\src\test\org\apache\lucene\analysis\miscellaneous\TestStemmerOverrideFilter.java
|
| ___________________________________________________________________
|
| Added: svn:eol-style
|
| + native
|
|
|
| Index: contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java
|
| ===================================================================
|
| --- contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java (revision 906571)
|
| +++ contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java (working copy)
|
| @@ -18,10 +18,8 @@
|
| */ |
| |
| |
| -import java.io.IOException; |
| import java.io.StringReader; |
| |
| -import org.apache.lucene.analysis.TokenStream; |
| import org.apache.lucene.analysis.BaseTokenStreamTestCase; |
| |
| /** |
| Index: contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java
|
| ===================================================================
|
| --- contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java (revision 906571)
|
| +++ contrib/analyzers/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java (working copy)
|
| @@ -22,7 +22,6 @@
|
| import org.apache.lucene.analysis.BaseTokenStreamTestCase; |
| import org.apache.lucene.util.Version; |
| |
| -import java.io.IOException; |
| import java.io.StringReader; |
| |
| /** |
| Index: contrib/analyzers/common/src/test/org/apache/lucene/analysis/fr/TestFrenchAnalyzer.java
|
| ===================================================================
|
| --- contrib/analyzers/common/src/test/org/apache/lucene/analysis/fr/TestFrenchAnalyzer.java (revision 906571)
|
| +++ contrib/analyzers/common/src/test/org/apache/lucene/analysis/fr/TestFrenchAnalyzer.java (working copy)
|
| @@ -17,6 +17,8 @@
|
| * limitations under the License. |
| */ |
| |
| +import java.io.IOException; |
| + |
| import org.apache.lucene.analysis.BaseTokenStreamTestCase; |
| import org.apache.lucene.analysis.CharArraySet; |
| import org.apache.lucene.util.Version; |
| @@ -113,6 +115,94 @@
|
| |
| } |
| |
| + /** |
| + * @deprecated remove this test for Lucene 4.0 |
| + */ |
| + @Deprecated |
| + public void testAnalyzer30() throws Exception { |
| + FrenchAnalyzer fa = new FrenchAnalyzer(Version.LUCENE_30); |
| + |
| + assertAnalyzesTo(fa, "", new String[] { |
| + }); |
| + |
| + assertAnalyzesTo( |
| + fa, |
| + "chien chat cheval", |
| + new String[] { "chien", "chat", "cheval" }); |
| + |
| + assertAnalyzesTo( |
| + fa, |
| + "chien CHAT CHEVAL", |
| + new String[] { "chien", "chat", "cheval" }); |
| + |
| + assertAnalyzesTo( |
| + fa, |
| + " chien ,? + = - CHAT /: > CHEVAL", |
| + new String[] { "chien", "chat", "cheval" }); |
| + |
| + assertAnalyzesTo(fa, "chien++", new String[] { "chien" }); |
| + |
| + assertAnalyzesTo( |
| + fa, |
| + "mot \"entreguillemet\"", |
| + new String[] { "mot", "entreguillemet" }); |
| + |
| + // let's do some french specific tests now |
| + |
| + /* 1. couldn't resist |
| + I would expect this to stay one term as in French the minus |
| + sign is often used for composing words */ |
| + assertAnalyzesTo( |
| + fa, |
| + "Jean-François", |
| + new String[] { "jean", "françois" }); |
| + |
| + // 2. stopwords |
| + assertAnalyzesTo( |
| + fa, |
| + "le la chien les aux chat du des à cheval", |
| + new String[] { "chien", "chat", "cheval" }); |
| + |
| + // some nouns and adjectives |
| + assertAnalyzesTo( |
| + fa, |
| + "lances chismes habitable chiste éléments captifs", |
| + new String[] { |
| + "lanc", |
| + "chism", |
| + "habit", |
| + "chist", |
| + "élément", |
| + "captif" }); |
| + |
| + // some verbs |
| + assertAnalyzesTo( |
| + fa, |
| + "finissions souffrirent rugissante", |
| + new String[] { "fin", "souffr", "rug" }); |
| + |
| + // some everything else |
| + // aujourd'hui stays one term which is OK |
| + assertAnalyzesTo( |
| + fa, |
| + "C3PO aujourd'hui oeuf ïâöûàä anticonstitutionnellement Java++ ", |
| + new String[] { |
| + "c3po", |
| + "aujourd'hui", |
| + "oeuf", |
| + "ïâöûàä", |
| + "anticonstitutionnel", |
| + "jav" }); |
| + |
| + // some more everything else |
| + // here 1940-1945 stays as one term, 1940:1945 not ? |
| + assertAnalyzesTo( |
| + fa, |
| + "33Bis 1940-1945 1940:1945 (---i+++)*", |
| + new String[] { "33bis", "1940-1945", "1940", "1945", "i" }); |
| + |
| + } |
| + |
| public void testReusableTokenStream() throws Exception { |
| FrenchAnalyzer fa = new FrenchAnalyzer(Version.LUCENE_CURRENT); |
| // stopwords |
| @@ -157,4 +247,28 @@
|
| assertAnalyzesTo(fa, "habitable chiste", new String[] { "habitable", |
| "chist" }); |
| } |
| + |
| + public void testElision() throws Exception { |
| + FrenchAnalyzer fa = new FrenchAnalyzer(Version.LUCENE_CURRENT); |
| + assertAnalyzesTo(fa, "voir l'embrouille", new String[] { "voir", "embrouill" }); |
| + } |
| + |
| + /** |
| + * Prior to 3.1, this analyzer had no lowercase filter. |
| + * stopwords were case sensitive. Preserve this for back compat. |
| + * @deprecated Remove this test in Lucene 4.0 |
| + */ |
| + @Deprecated |
| + public void testBuggyStopwordsCasing() throws IOException { |
| + FrenchAnalyzer a = new FrenchAnalyzer(Version.LUCENE_30); |
| + assertAnalyzesTo(a, "Votre", new String[] { "votr" }); |
| + } |
| + |
| + /** |
| + * Test that stopwords are not case sensitive |
| + */ |
| + public void testStopwordsCasing() throws IOException { |
| + FrenchAnalyzer a = new FrenchAnalyzer(Version.LUCENE_31); |
| + assertAnalyzesTo(a, "Votre", new String[] { }); |
| + } |
| } |
| Index: contrib/analyzers/common/src/test/org/apache/lucene/analysis/nl/TestDutchStemmer.java
|
| ===================================================================
|
| --- contrib/analyzers/common/src/test/org/apache/lucene/analysis/nl/TestDutchStemmer.java (revision 906571)
|
| +++ contrib/analyzers/common/src/test/org/apache/lucene/analysis/nl/TestDutchStemmer.java (working copy)
|
| @@ -100,9 +100,6 @@
|
| check("ophalend", "ophal"); |
| check("ophalers", "ophaler"); |
| check("ophef", "ophef"); |
| - check("opheffen", "ophef"); // versus snowball 'opheff' |
| - check("opheffende", "ophef"); // versus snowball 'opheff' |
| - check("opheffing", "ophef"); // versus snowball 'opheff' |
| check("opheldering", "ophelder"); |
| check("ophemelde", "ophemeld"); |
| check("ophemelen", "ophemel"); |
| @@ -118,6 +115,24 @@
|
| check("ophouden", "ophoud"); |
| } |
| |
| + /** |
| + * @deprecated remove this test in Lucene 4.0 |
| + */ |
| + @Deprecated |
| + public void testOldBuggyStemmer() throws Exception { |
| + Analyzer a = new DutchAnalyzer(Version.LUCENE_30); |
| + checkOneTermReuse(a, "opheffen", "ophef"); // versus snowball 'opheff' |
| + checkOneTermReuse(a, "opheffende", "ophef"); // versus snowball 'opheff' |
| + checkOneTermReuse(a, "opheffing", "ophef"); // versus snowball 'opheff' |
| + } |
| + |
| + public void testSnowballCorrectness() throws Exception { |
| + Analyzer a = new DutchAnalyzer(Version.LUCENE_CURRENT); |
| + checkOneTermReuse(a, "opheffen", "opheff"); |
| + checkOneTermReuse(a, "opheffende", "opheff"); |
| + checkOneTermReuse(a, "opheffing", "opheff"); |
| + } |
| + |
| public void testReusableTokenStream() throws Exception { |
| Analyzer a = new DutchAnalyzer(Version.LUCENE_CURRENT); |
| checkOneTermReuse(a, "lichaamsziek", "lichaamsziek"); |
| @@ -161,6 +176,25 @@
|
| checkOneTermReuse(a, "lichamelijk", "somethingentirelydifferent"); |
| } |
| |
| + /** |
| + * Prior to 3.1, this analyzer had no lowercase filter. |
| + * stopwords were case sensitive. Preserve this for back compat. |
| + * @deprecated Remove this test in Lucene 4.0 |
| + */ |
| + @Deprecated |
| + public void testBuggyStopwordsCasing() throws IOException { |
| + DutchAnalyzer a = new DutchAnalyzer(Version.LUCENE_30); |
| + assertAnalyzesTo(a, "Zelf", new String[] { "zelf" }); |
| + } |
| + |
| + /** |
| + * Test that stopwords are not case sensitive |
| + */ |
| + public void testStopwordsCasing() throws IOException { |
| + DutchAnalyzer a = new DutchAnalyzer(Version.LUCENE_31); |
| + assertAnalyzesTo(a, "Zelf", new String[] { }); |
| + } |
| + |
| private void check(final String input, final String expected) throws Exception { |
| checkOneTerm(new DutchAnalyzer(Version.LUCENE_CURRENT), input, expected); |
| } |
| Index: contrib/analyzers/common/src/test/org/apache/lucene/analysis/hu/TestHungarianAnalyzer.java
|
| ===================================================================
|
| --- contrib/analyzers/common/src/test/org/apache/lucene/analysis/hu/TestHungarianAnalyzer.java (revision 0)
|
| +++ contrib/analyzers/common/src/test/org/apache/lucene/analysis/hu/TestHungarianAnalyzer.java (revision 0)
|
| @@ -0,0 +1,54 @@
|
| +package org.apache.lucene.analysis.hu; |
| + |
| +/** |
| + * Licensed to the Apache Software Foundation (ASF) under one or more |
| + * contributor license agreements. See the NOTICE file distributed with |
| + * this work for additional information regarding copyright ownership. |
| + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| + * (the "License"); you may not use this file except in compliance with |
| + * the License. You may obtain a copy of the License at |
| + * |
| + * http://www.apache.org/licenses/LICENSE-2.0 |
| + * |
| + * Unless required by applicable law or agreed to in writing, software |
| + * distributed under the License is distributed on an "AS IS" BASIS, |
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + * See the License for the specific language governing permissions and |
| + * limitations under the License. |
| + */ |
| + |
| +import java.io.IOException; |
| +import java.util.HashSet; |
| +import java.util.Set; |
| + |
| +import org.apache.lucene.analysis.Analyzer; |
| +import org.apache.lucene.analysis.BaseTokenStreamTestCase; |
| +import org.apache.lucene.util.Version; |
| + |
| +public class TestHungarianAnalyzer extends BaseTokenStreamTestCase { |
| + /** This test fails with NPE when the |
| + * stopwords file is missing in classpath */ |
| + public void testResourcesAvailable() { |
| + new HungarianAnalyzer(Version.LUCENE_CURRENT); |
| + } |
| + |
| + /** test stopwords and stemming */ |
| + public void testBasics() throws IOException { |
| + Analyzer a = new HungarianAnalyzer(Version.LUCENE_CURRENT); |
| + // stemming |
| + checkOneTermReuse(a, "babakocsi", "babakocs"); |
| + checkOneTermReuse(a, "babakocsijáért", "babakocs"); |
| + // stopword |
| + assertAnalyzesTo(a, "által", new String[] {}); |
| + } |
| + |
| + /** test use of exclusion set */ |
| + public void testExclude() throws IOException { |
| + Set<String> exclusionSet = new HashSet<String>(); |
| + exclusionSet.add("babakocsi"); |
| + Analyzer a = new HungarianAnalyzer(Version.LUCENE_CURRENT, |
| + HungarianAnalyzer.getDefaultStopSet(), exclusionSet); |
| + checkOneTermReuse(a, "babakocsi", "babakocsi"); |
| + checkOneTermReuse(a, "babakocsijáért", "babakocs"); |
| + } |
| +} |
|
|
| Property changes on: contrib\analyzers\common\src\test\org\apache\lucene\analysis\hu\TestHungarianAnalyzer.java
|
| ___________________________________________________________________
|
| Added: svn:eol-style
|
| + native
|
|
|
| Index: contrib/analyzers/common/src/test/org/apache/lucene/analysis/no/TestNorwegianAnalyzer.java
|
| ===================================================================
|
| --- contrib/analyzers/common/src/test/org/apache/lucene/analysis/no/TestNorwegianAnalyzer.java (revision 0)
|
| +++ contrib/analyzers/common/src/test/org/apache/lucene/analysis/no/TestNorwegianAnalyzer.java (revision 0)
|
| @@ -0,0 +1,54 @@
|
| +package org.apache.lucene.analysis.no; |
| + |
| +/** |
| + * Licensed to the Apache Software Foundation (ASF) under one or more |
| + * contributor license agreements. See the NOTICE file distributed with |
| + * this work for additional information regarding copyright ownership. |
| + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| + * (the "License"); you may not use this file except in compliance with |
| + * the License. You may obtain a copy of the License at |
| + * |
| + * http://www.apache.org/licenses/LICENSE-2.0 |
| + * |
| + * Unless required by applicable law or agreed to in writing, software |
| + * distributed under the License is distributed on an "AS IS" BASIS, |
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + * See the License for the specific language governing permissions and |
| + * limitations under the License. |
| + */ |
| + |
| +import java.io.IOException; |
| +import java.util.HashSet; |
| +import java.util.Set; |
| + |
| +import org.apache.lucene.analysis.Analyzer; |
| +import org.apache.lucene.analysis.BaseTokenStreamTestCase; |
| +import org.apache.lucene.util.Version; |
| + |
| +public class TestNorwegianAnalyzer extends BaseTokenStreamTestCase { |
| + /** This test fails with NPE when the |
| + * stopwords file is missing in classpath */ |
| + public void testResourcesAvailable() { |
| + new NorwegianAnalyzer(Version.LUCENE_CURRENT); |
| + } |
| + |
| + /** test stopwords and stemming */ |
| + public void testBasics() throws IOException { |
| + Analyzer a = new NorwegianAnalyzer(Version.LUCENE_CURRENT); |
| + // stemming |
| + checkOneTermReuse(a, "havnedistriktene", "havnedistrikt"); |
| + checkOneTermReuse(a, "havnedistrikter", "havnedistrikt"); |
| + // stopword |
| + assertAnalyzesTo(a, "det", new String[] {}); |
| + } |
| + |
| + /** test use of exclusion set */ |
| + public void testExclude() throws IOException { |
| + Set<String> exclusionSet = new HashSet<String>(); |
| + exclusionSet.add("havnedistriktene"); |
| + Analyzer a = new NorwegianAnalyzer(Version.LUCENE_CURRENT, |
| + NorwegianAnalyzer.getDefaultStopSet(), exclusionSet); |
| + checkOneTermReuse(a, "havnedistriktene", "havnedistriktene"); |
| + checkOneTermReuse(a, "havnedistrikter", "havnedistrikt"); |
| + } |
| +} |
|
|
| Property changes on: contrib\analyzers\common\src\test\org\apache\lucene\analysis\no\TestNorwegianAnalyzer.java
|
| ___________________________________________________________________
|
| Added: svn:eol-style
|
| + native
|
|
|
| Index: contrib/analyzers/common/src/test/org/apache/lucene/analysis/ro/TestRomanianAnalyzer.java
|
| ===================================================================
|
| --- contrib/analyzers/common/src/test/org/apache/lucene/analysis/ro/TestRomanianAnalyzer.java (revision 0)
|
| +++ contrib/analyzers/common/src/test/org/apache/lucene/analysis/ro/TestRomanianAnalyzer.java (revision 0)
|
| @@ -0,0 +1,54 @@
|
| +package org.apache.lucene.analysis.ro; |
| + |
| +/** |
| + * Licensed to the Apache Software Foundation (ASF) under one or more |
| + * contributor license agreements. See the NOTICE file distributed with |
| + * this work for additional information regarding copyright ownership. |
| + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| + * (the "License"); you may not use this file except in compliance with |
| + * the License. You may obtain a copy of the License at |
| + * |
| + * http://www.apache.org/licenses/LICENSE-2.0 |
| + * |
| + * Unless required by applicable law or agreed to in writing, software |
| + * distributed under the License is distributed on an "AS IS" BASIS, |
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + * See the License for the specific language governing permissions and |
| + * limitations under the License. |
| + */ |
| + |
| +import java.io.IOException; |
| +import java.util.HashSet; |
| +import java.util.Set; |
| + |
| +import org.apache.lucene.analysis.Analyzer; |
| +import org.apache.lucene.analysis.BaseTokenStreamTestCase; |
| +import org.apache.lucene.util.Version; |
| + |
| +public class TestRomanianAnalyzer extends BaseTokenStreamTestCase { |
| + /** This test fails with NPE when the |
| + * stopwords file is missing in classpath */ |
| + public void testResourcesAvailable() { |
| + new RomanianAnalyzer(Version.LUCENE_CURRENT); |
| + } |
| + |
| + /** test stopwords and stemming */ |
| + public void testBasics() throws IOException { |
| + Analyzer a = new RomanianAnalyzer(Version.LUCENE_CURRENT); |
| + // stemming |
| + checkOneTermReuse(a, "absenţa", "absenţ"); |
| + checkOneTermReuse(a, "absenţi", "absenţ"); |
| + // stopword |
| + assertAnalyzesTo(a, "îl", new String[] {}); |
| + } |
| + |
| + /** test use of exclusion set */ |
| + public void testExclude() throws IOException { |
| + Set<String> exclusionSet = new HashSet<String>(); |
| + exclusionSet.add("absenţa"); |
| + Analyzer a = new RomanianAnalyzer(Version.LUCENE_CURRENT, |
| + RomanianAnalyzer.getDefaultStopSet(), exclusionSet); |
| + checkOneTermReuse(a, "absenţa", "absenţa"); |
| + checkOneTermReuse(a, "absenţi", "absenţ"); |
| + } |
| +} |
|
|
| Property changes on: contrib\analyzers\common\src\test\org\apache\lucene\analysis\ro\TestRomanianAnalyzer.java
|
| ___________________________________________________________________
|
| Added: svn:eol-style
|
| + native
|
|
|
| Index: contrib/analyzers/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseAnalyzer.java
|
| ===================================================================
|
| --- contrib/analyzers/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseAnalyzer.java (revision 0)
|
| +++ contrib/analyzers/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseAnalyzer.java (revision 0)
|
| @@ -0,0 +1,54 @@
|
| +package org.apache.lucene.analysis.pt; |
| + |
| +/** |
| + * Licensed to the Apache Software Foundation (ASF) under one or more |
| + * contributor license agreements. See the NOTICE file distributed with |
| + * this work for additional information regarding copyright ownership. |
| + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| + * (the "License"); you may not use this file except in compliance with |
| + * the License. You may obtain a copy of the License at |
| + * |
| + * http://www.apache.org/licenses/LICENSE-2.0 |
| + * |
| + * Unless required by applicable law or agreed to in writing, software |
| + * distributed under the License is distributed on an "AS IS" BASIS, |
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + * See the License for the specific language governing permissions and |
| + * limitations under the License. |
| + */ |
| + |
| +import java.io.IOException; |
| +import java.util.HashSet; |
| +import java.util.Set; |
| + |
| +import org.apache.lucene.analysis.Analyzer; |
| +import org.apache.lucene.analysis.BaseTokenStreamTestCase; |
| +import org.apache.lucene.util.Version; |
| + |
| +public class TestPortugueseAnalyzer extends BaseTokenStreamTestCase { |
| + /** This test fails with NPE when the |
| + * stopwords file is missing in classpath */ |
| + public void testResourcesAvailable() { |
| + new PortugueseAnalyzer(Version.LUCENE_CURRENT); |
| + } |
| + |
| + /** test stopwords and stemming */ |
| + public void testBasics() throws IOException { |
| + Analyzer a = new PortugueseAnalyzer(Version.LUCENE_CURRENT); |
| + // stemming |
| + checkOneTermReuse(a, "quilométricas", "quilométr"); |
| + checkOneTermReuse(a, "quilométricos", "quilométr"); |
| + // stopword |
| + assertAnalyzesTo(a, "não", new String[] {}); |
| + } |
| + |
| + /** test use of exclusion set */ |
| + public void testExclude() throws IOException { |
| + Set<String> exclusionSet = new HashSet<String>(); |
| + exclusionSet.add("quilométricas"); |
| + Analyzer a = new PortugueseAnalyzer(Version.LUCENE_CURRENT, |
| + PortugueseAnalyzer.getDefaultStopSet(), exclusionSet); |
| + checkOneTermReuse(a, "quilométricas", "quilométricas"); |
| + checkOneTermReuse(a, "quilométricos", "quilométr"); |
| + } |
| +} |
|
|
| Property changes on: contrib\analyzers\common\src\test\org\apache\lucene\analysis\pt\TestPortugueseAnalyzer.java
|
| ___________________________________________________________________
|
| Added: svn:eol-style
|
| + native
|
|
|
| Index: contrib/analyzers/common/src/test/org/apache/lucene/analysis/tr/TestTurkishAnalyzer.java
|
| ===================================================================
|
| --- contrib/analyzers/common/src/test/org/apache/lucene/analysis/tr/TestTurkishAnalyzer.java (revision 0)
|
| +++ contrib/analyzers/common/src/test/org/apache/lucene/analysis/tr/TestTurkishAnalyzer.java (revision 0)
|
| @@ -0,0 +1,54 @@
|
| +package org.apache.lucene.analysis.tr; |
| + |
| +/** |
| + * Licensed to the Apache Software Foundation (ASF) under one or more |
| + * contributor license agreements. See the NOTICE file distributed with |
| + * this work for additional information regarding copyright ownership. |
| + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| + * (the "License"); you may not use this file except in compliance with |
| + * the License. You may obtain a copy of the License at |
| + * |
| + * http://www.apache.org/licenses/LICENSE-2.0 |
| + * |
| + * Unless required by applicable law or agreed to in writing, software |
| + * distributed under the License is distributed on an "AS IS" BASIS, |
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + * See the License for the specific language governing permissions and |
| + * limitations under the License. |
| + */ |
| + |
| +import java.io.IOException; |
| +import java.util.HashSet; |
| +import java.util.Set; |
| + |
| +import org.apache.lucene.analysis.Analyzer; |
| +import org.apache.lucene.analysis.BaseTokenStreamTestCase; |
| +import org.apache.lucene.util.Version; |
| + |
| +public class TestTurkishAnalyzer extends BaseTokenStreamTestCase { |
| + /** This test fails with NPE when the |
| + * stopwords file is missing in classpath */ |
| + public void testResourcesAvailable() { |
| + new TurkishAnalyzer(Version.LUCENE_CURRENT); |
| + } |
| + |
| + /** test stopwords and stemming */ |
| + public void testBasics() throws IOException { |
| + Analyzer a = new TurkishAnalyzer(Version.LUCENE_CURRENT); |
| + // stemming |
| + checkOneTermReuse(a, "ağacı", "ağaç"); |
| + checkOneTermReuse(a, "ağaç", "ağaç"); |
| + // stopword |
| + assertAnalyzesTo(a, "dolayı", new String[] {}); |
| + } |
| + |
| + /** test use of exclusion set */ |
| + public void testExclude() throws IOException { |
| + Set<String> exclusionSet = new HashSet<String>(); |
| + exclusionSet.add("ağacı"); |
| + Analyzer a = new TurkishAnalyzer(Version.LUCENE_CURRENT, |
| + TurkishAnalyzer.getDefaultStopSet(), exclusionSet); |
| + checkOneTermReuse(a, "ağacı", "ağacı"); |
| + checkOneTermReuse(a, "ağaç", "ağaç"); |
| + } |
| +} |
|
|
| Property changes on: contrib\analyzers\common\src\test\org\apache\lucene\analysis\tr\TestTurkishAnalyzer.java
|
| ___________________________________________________________________
|
| Added: svn:eol-style
|
| + native
|
|
|
| Index: contrib/analyzers/common/src/test/org/apache/lucene/analysis/ru/TestRussianLetterTokenizer.java
|
| ===================================================================
|
| --- contrib/analyzers/common/src/test/org/apache/lucene/analysis/ru/TestRussianLetterTokenizer.java (revision 906571)
|
| +++ contrib/analyzers/common/src/test/org/apache/lucene/analysis/ru/TestRussianLetterTokenizer.java (working copy)
|
| @@ -25,7 +25,9 @@
|
| |
| /** |
| * Testcase for {@link RussianLetterTokenizer} |
| + * @deprecated Remove this test class in Lucene 4.0 |
| */ |
| +@Deprecated |
| public class TestRussianLetterTokenizer extends BaseTokenStreamTestCase { |
| |
| public void testRussianLetterTokenizer() throws IOException { |
| Index: contrib/analyzers/common/src/test/org/apache/lucene/analysis/ru/TestRussianAnalyzer.java
|
| ===================================================================
|
| --- contrib/analyzers/common/src/test/org/apache/lucene/analysis/ru/TestRussianAnalyzer.java (revision 906571)
|
| +++ contrib/analyzers/common/src/test/org/apache/lucene/analysis/ru/TestRussianAnalyzer.java (working copy)
|
| @@ -50,9 +50,14 @@
|
| dataDir = new File(System.getProperty("dataDir", "./bin")); |
| } |
| |
| - public void testUnicode() throws IOException |
| + /** |
| + * @deprecated remove this test and its datafiles in Lucene 4.0 |
| + * the Snowball version has its own data tests. |
| + */ |
| + @Deprecated |
| + public void testUnicode30() throws IOException |
| { |
| - RussianAnalyzer ra = new RussianAnalyzer(Version.LUCENE_CURRENT); |
| + RussianAnalyzer ra = new RussianAnalyzer(Version.LUCENE_30); |
| inWords = |
| new InputStreamReader( |
| new FileInputStream(new File(dataDir, "/org/apache/lucene/analysis/ru/testUTF8.txt")), |
| @@ -110,12 +115,22 @@
|
| } |
| } |
| |
| + /** @deprecated remove this test in Lucene 4.0: stopwords changed */ |
| + @Deprecated |
| + public void testReusableTokenStream30() throws Exception { |
| + Analyzer a = new RussianAnalyzer(Version.LUCENE_30); |
| + assertAnalyzesToReuse(a, "Вместе с тем о силе электромагнитной энергии имели представление еще", |
| + new String[] { "вмест", "сил", "электромагнитн", "энерг", "имел", "представлен" }); |
| + assertAnalyzesToReuse(a, "Но знание это хранилось в тайне", |
| + new String[] { "знан", "хран", "тайн" }); |
| + } |
| + |
| public void testReusableTokenStream() throws Exception { |
| Analyzer a = new RussianAnalyzer(Version.LUCENE_CURRENT); |
| assertAnalyzesToReuse(a, "Вместе с тем о силе электромагнитной энергии имели представление еще", |
| new String[] { "вмест", "сил", "электромагнитн", "энерг", "имел", "представлен" }); |
| assertAnalyzesToReuse(a, "Но знание это хранилось в тайне", |
| - new String[] { "знан", "хран", "тайн" }); |
| + new String[] { "знан", "эт", "хран", "тайн" }); |
| } |
| |
| |
| Index: contrib/analyzers/common/src/test/org/apache/lucene/analysis/ru/TestRussianStem.java
|
| ===================================================================
|
| --- contrib/analyzers/common/src/test/org/apache/lucene/analysis/ru/TestRussianStem.java (revision 906571)
|
| +++ contrib/analyzers/common/src/test/org/apache/lucene/analysis/ru/TestRussianStem.java (working copy)
|
| @@ -24,6 +24,10 @@
|
| import java.io.FileInputStream; |
| import java.util.ArrayList; |
| |
| +/** |
| + * @deprecated Remove this test class (and its datafiles!) in Lucene 4.0 |
| + */ |
| +@Deprecated |
| public class TestRussianStem extends LuceneTestCase |
| { |
| private ArrayList words = new ArrayList(); |
| Index: contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/TestShingleMatrixFilter.java
|
| ===================================================================
|
| --- contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/TestShingleMatrixFilter.java (revision 906571)
|
| +++ contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/TestShingleMatrixFilter.java (working copy)
|
| @@ -22,11 +22,8 @@
|
| import java.util.Collection; |
| import java.util.Iterator; |
| import java.util.LinkedList; |
| -import java.util.HashSet; |
| -import java.util.Arrays; |
| |
| import org.apache.lucene.analysis.*; |
| -import org.apache.lucene.analysis.standard.StandardTokenizer; |
| import org.apache.lucene.analysis.miscellaneous.EmptyTokenStream; |
| import org.apache.lucene.analysis.miscellaneous.PrefixAndSuffixAwareTokenFilter; |
| import org.apache.lucene.analysis.miscellaneous.SingleTokenTokenStream; |
| Index: contrib/analyzers/common/src/test/org/apache/lucene/analysis/el/GreekAnalyzerTest.java
|
| ===================================================================
|
| --- contrib/analyzers/common/src/test/org/apache/lucene/analysis/el/GreekAnalyzerTest.java (revision 906571)
|
| +++ contrib/analyzers/common/src/test/org/apache/lucene/analysis/el/GreekAnalyzerTest.java (working copy)
|
| @@ -18,7 +18,6 @@
|
| |
| import org.apache.lucene.analysis.BaseTokenStreamTestCase; |
| import org.apache.lucene.analysis.Analyzer; |
| -import org.apache.lucene.analysis.TokenStream; |
| import org.apache.lucene.util.Version; |
| |
| /** |
| @@ -63,4 +62,23 @@
|
| assertAnalyzesToReuse(a, "\u03a0\u03a1\u039f\u03ab\u03a0\u039f\u0398\u0395\u03a3\u0395\u0399\u03a3 \u0386\u03c8\u03bf\u03b3\u03bf\u03c2, \u03bf \u03bc\u03b5\u03c3\u03c4\u03cc\u03c2 \u03ba\u03b1\u03b9 \u03bf\u03b9 \u03ac\u03bb\u03bb\u03bf\u03b9", |
| new String[] { "\u03c0\u03c1\u03bf\u03c5\u03c0\u03bf\u03b8\u03b5\u03c3\u03b5\u03b9\u03c3", "\u03b1\u03c8\u03bf\u03b3\u03bf\u03c3", "\u03bc\u03b5\u03c3\u03c4\u03bf\u03c3", "\u03b1\u03bb\u03bb\u03bf\u03b9" }); |
| } |
| + |
| + /** |
| + * Greek Analyzer didn't call standardFilter, so no normalization of acronyms. |
| + * check that this is preserved. |
| + * @deprecated remove this test in Lucene 4.0 |
| + */ |
| + @Deprecated |
| + public void testAcronymBWCompat() throws Exception { |
| + Analyzer a = new GreekAnalyzer(Version.LUCENE_30); |
| + assertAnalyzesTo(a, "Α.Π.Τ.", new String[] { "α.π.τ." }); |
| + } |
| + |
| + /** |
| + * test that acronym normalization works |
| + */ |
| + public void testAcronym() throws Exception { |
| + Analyzer a = new GreekAnalyzer(Version.LUCENE_31); |
| + assertAnalyzesTo(a, "Α.Π.Τ.", new String[] { "απτ" }); |
| + } |
| } |
| Index: contrib/analyzers/common/src/test/org/apache/lucene/analysis/en/TestEnglishAnalyzer.java
|
| ===================================================================
|
| --- contrib/analyzers/common/src/test/org/apache/lucene/analysis/en/TestEnglishAnalyzer.java (revision 0)
|
| +++ contrib/analyzers/common/src/test/org/apache/lucene/analysis/en/TestEnglishAnalyzer.java (revision 0)
|
| @@ -0,0 +1,54 @@
|
| +package org.apache.lucene.analysis.en; |
| + |
| +/** |
| + * Licensed to the Apache Software Foundation (ASF) under one or more |
| + * contributor license agreements. See the NOTICE file distributed with |
| + * this work for additional information regarding copyright ownership. |
| + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| + * (the "License"); you may not use this file except in compliance with |
| + * the License. You may obtain a copy of the License at |
| + * |
| + * http://www.apache.org/licenses/LICENSE-2.0 |
| + * |
| + * Unless required by applicable law or agreed to in writing, software |
| + * distributed under the License is distributed on an "AS IS" BASIS, |
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + * See the License for the specific language governing permissions and |
| + * limitations under the License. |
| + */ |
| + |
| +import java.io.IOException; |
| +import java.util.HashSet; |
| +import java.util.Set; |
| + |
| +import org.apache.lucene.analysis.Analyzer; |
| +import org.apache.lucene.analysis.BaseTokenStreamTestCase; |
| +import org.apache.lucene.util.Version; |
| + |
| +public class TestEnglishAnalyzer extends BaseTokenStreamTestCase { |
| + /** This test fails with NPE when the |
| + * stopwords file is missing in classpath */ |
| + public void testResourcesAvailable() { |
| + new EnglishAnalyzer(Version.LUCENE_CURRENT); |
| + } |
| + |
| + /** test stopwords and stemming */ |
| + public void testBasics() throws IOException { |
| + Analyzer a = new EnglishAnalyzer(Version.LUCENE_CURRENT); |
| + // stemming |
| + checkOneTermReuse(a, "books", "book"); |
| + checkOneTermReuse(a, "book", "book"); |
| + // stopword |
| + assertAnalyzesTo(a, "the", new String[] {}); |
| + } |
| + |
| + /** test use of exclusion set */ |
| + public void testExclude() throws IOException { |
| + Set<String> exclusionSet = new HashSet<String>(); |
| + exclusionSet.add("books"); |
| + Analyzer a = new EnglishAnalyzer(Version.LUCENE_CURRENT, |
| + EnglishAnalyzer.getDefaultStopSet(), exclusionSet); |
| + checkOneTermReuse(a, "books", "books"); |
| + checkOneTermReuse(a, "book", "book"); |
| + } |
| +} |
|
|
| Property changes on: contrib\analyzers\common\src\test\org\apache\lucene\analysis\en\TestEnglishAnalyzer.java
|
| ___________________________________________________________________
|
| Added: svn:eol-style
|
| + native
|
|
|
| Index: contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java
|
| ===================================================================
|
| --- contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java (revision 906571)
|
| +++ contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicAnalyzer.java (working copy)
|
| @@ -22,8 +22,6 @@
|
| import java.util.HashSet; |
| import java.util.Set; |
| |
| -import javax.print.DocFlavor.CHAR_ARRAY; |
| - |
| import org.apache.lucene.analysis.BaseTokenStreamTestCase; |
| import org.apache.lucene.analysis.CharArraySet; |
| import org.apache.lucene.util.Version; |
| Index: contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicNormalizationFilter.java
|
| ===================================================================
|
| --- contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicNormalizationFilter.java (revision 906571)
|
| +++ contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicNormalizationFilter.java (working copy)
|
| @@ -21,7 +21,6 @@
|
| import java.io.StringReader; |
| |
| import org.apache.lucene.analysis.BaseTokenStreamTestCase; |
| -import org.apache.lucene.analysis.tokenattributes.TermAttribute; |
| import org.apache.lucene.util.Version; |
| |
| /** |
| Index: contrib/analyzers/common/src/test/org/apache/lucene/analysis/es/TestSpanishAnalyzer.java
|
| ===================================================================
|
| --- contrib/analyzers/common/src/test/org/apache/lucene/analysis/es/TestSpanishAnalyzer.java (revision 0)
|
| +++ contrib/analyzers/common/src/test/org/apache/lucene/analysis/es/TestSpanishAnalyzer.java (revision 0)
|
| @@ -0,0 +1,54 @@
|
| +package org.apache.lucene.analysis.es; |
| + |
| +/** |
| + * Licensed to the Apache Software Foundation (ASF) under one or more |
| + * contributor license agreements. See the NOTICE file distributed with |
| + * this work for additional information regarding copyright ownership. |
| + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| + * (the "License"); you may not use this file except in compliance with |
| + * the License. You may obtain a copy of the License at |
| + * |
| + * http://www.apache.org/licenses/LICENSE-2.0 |
| + * |
| + * Unless required by applicable law or agreed to in writing, software |
| + * distributed under the License is distributed on an "AS IS" BASIS, |
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + * See the License for the specific language governing permissions and |
| + * limitations under the License. |
| + */ |
| + |
| +import java.io.IOException; |
| +import java.util.HashSet; |
| +import java.util.Set; |
| + |
| +import org.apache.lucene.analysis.Analyzer; |
| +import org.apache.lucene.analysis.BaseTokenStreamTestCase; |
| +import org.apache.lucene.util.Version; |
| + |
| +public class TestSpanishAnalyzer extends BaseTokenStreamTestCase { |
| + /** This test fails with NPE when the |
| + * stopwords file is missing in classpath */ |
| + public void testResourcesAvailable() { |
| + new SpanishAnalyzer(Version.LUCENE_CURRENT); |
| + } |
| + |
| + /** test stopwords and stemming */ |
| + public void testBasics() throws IOException { |
| + Analyzer a = new SpanishAnalyzer(Version.LUCENE_CURRENT); |
| + // stemming |
| + checkOneTermReuse(a, "chicana", "chican"); |
| + checkOneTermReuse(a, "chicano", "chican"); |
| + // stopword |
| + assertAnalyzesTo(a, "los", new String[] {}); |
| + } |
| + |
| + /** test use of exclusion set */ |
| + public void testExclude() throws IOException { |
| + Set<String> exclusionSet = new HashSet<String>(); |
| + exclusionSet.add("chicano"); |
| + Analyzer a = new SpanishAnalyzer(Version.LUCENE_CURRENT, |
| + SpanishAnalyzer.getDefaultStopSet(), exclusionSet); |
| + checkOneTermReuse(a, "chicana", "chican"); |
| + checkOneTermReuse(a, "chicano", "chicano"); |
| + } |
| +} |
|
|
| Property changes on: contrib\analyzers\common\src\test\org\apache\lucene\analysis\es\TestSpanishAnalyzer.java
|
| ___________________________________________________________________
|
| Added: svn:eol-style
|
| + native
|
|
|
| Index: contrib/analyzers/common/src/test/org/apache/lucene/analysis/it/TestItalianAnalyzer.java
|
| ===================================================================
|
| --- contrib/analyzers/common/src/test/org/apache/lucene/analysis/it/TestItalianAnalyzer.java (revision 0)
|
| +++ contrib/analyzers/common/src/test/org/apache/lucene/analysis/it/TestItalianAnalyzer.java (revision 0)
|
| @@ -0,0 +1,54 @@
|
| +package org.apache.lucene.analysis.it; |
| + |
| +/** |
| + * Licensed to the Apache Software Foundation (ASF) under one or more |
| + * contributor license agreements. See the NOTICE file distributed with |
| + * this work for additional information regarding copyright ownership. |
| + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| + * (the "License"); you may not use this file except in compliance with |
| + * the License. You may obtain a copy of the License at |
| + * |
| + * http://www.apache.org/licenses/LICENSE-2.0 |
| + * |
| + * Unless required by applicable law or agreed to in writing, software |
| + * distributed under the License is distributed on an "AS IS" BASIS, |
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + * See the License for the specific language governing permissions and |
| + * limitations under the License. |
| + */ |
| + |
| +import java.io.IOException; |
| +import java.util.HashSet; |
| +import java.util.Set; |
| + |
| +import org.apache.lucene.analysis.Analyzer; |
| +import org.apache.lucene.analysis.BaseTokenStreamTestCase; |
| +import org.apache.lucene.util.Version; |
| + |
| +public class TestItalianAnalyzer extends BaseTokenStreamTestCase { |
| + /** This test fails with NPE when the |
| + * stopwords file is missing in classpath */ |
| + public void testResourcesAvailable() { |
| + new ItalianAnalyzer(Version.LUCENE_CURRENT); |
| + } |
| + |
| + /** test stopwords and stemming */ |
| + public void testBasics() throws IOException { |
| + Analyzer a = new ItalianAnalyzer(Version.LUCENE_CURRENT); |
| + // stemming |
| + checkOneTermReuse(a, "abbandonata", "abbandon"); |
| + checkOneTermReuse(a, "abbandonati", "abbandon"); |
| + // stopword |
| + assertAnalyzesTo(a, "dallo", new String[] {}); |
| + } |
| + |
| + /** test use of exclusion set */ |
| + public void testExclude() throws IOException { |
| + Set<String> exclusionSet = new HashSet<String>(); |
| + exclusionSet.add("abbandonata"); |
| + Analyzer a = new ItalianAnalyzer(Version.LUCENE_CURRENT, |
| + ItalianAnalyzer.getDefaultStopSet(), exclusionSet); |
| + checkOneTermReuse(a, "abbandonata", "abbandonata"); |
| + checkOneTermReuse(a, "abbandonati", "abbandon"); |
| + } |
| +} |
|
|
| Property changes on: contrib\analyzers\common\src\test\org\apache\lucene\analysis\it\TestItalianAnalyzer.java
|
| ___________________________________________________________________
|
| Added: svn:eol-style
|
| + native
|
|
|
| Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/da/DanishAnalyzer.java
|
| ===================================================================
|
| --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/da/DanishAnalyzer.java (revision 0)
|
| +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/da/DanishAnalyzer.java (revision 0)
|
| @@ -0,0 +1,129 @@
|
| +package org.apache.lucene.analysis.da; |
| + |
| +/** |
| + * Licensed to the Apache Software Foundation (ASF) under one or more |
| + * contributor license agreements. See the NOTICE file distributed with |
| + * this work for additional information regarding copyright ownership. |
| + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| + * (the "License"); you may not use this file except in compliance with |
| + * the License. You may obtain a copy of the License at |
| + * |
| + * http://www.apache.org/licenses/LICENSE-2.0 |
| + * |
| + * Unless required by applicable law or agreed to in writing, software |
| + * distributed under the License is distributed on an "AS IS" BASIS, |
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + * See the License for the specific language governing permissions and |
| + * limitations under the License. |
| + */ |
| + |
| +import java.io.IOException; |
| +import java.io.Reader; |
| +import java.util.Set; |
| + |
| +import org.apache.lucene.analysis.Analyzer; |
| +import org.apache.lucene.analysis.CharArraySet; |
| +import org.apache.lucene.analysis.KeywordMarkerTokenFilter; |
| +import org.apache.lucene.analysis.LowerCaseFilter; |
| +import org.apache.lucene.analysis.StopFilter; |
| +import org.apache.lucene.analysis.StopwordAnalyzerBase; |
| +import org.apache.lucene.analysis.TokenStream; |
| +import org.apache.lucene.analysis.Tokenizer; |
| +import org.apache.lucene.analysis.WordlistLoader; |
| +import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link |
| +import org.apache.lucene.analysis.snowball.SnowballFilter; |
| +import org.apache.lucene.analysis.standard.StandardFilter; |
| +import org.apache.lucene.analysis.standard.StandardTokenizer; |
| +import org.apache.lucene.util.Version; |
| +import org.tartarus.snowball.ext.DanishStemmer; |
| + |
| +/** |
| + * {@link Analyzer} for Danish. |
| + */ |
| +public final class DanishAnalyzer extends StopwordAnalyzerBase { |
| + private final Set<?> stemExclusionSet; |
| + |
| + /** File containing default Danish stopwords. */ |
| + public final static String DEFAULT_STOPWORD_FILE = "danish_stop.txt"; |
| + |
| + /** |
| + * Returns an unmodifiable instance of the default stop words set. |
| + * @return default stop words set. |
| + */ |
| + public static Set<?> getDefaultStopSet(){ |
| + return DefaultSetHolder.DEFAULT_STOP_SET; |
| + } |
| + |
| + /** |
| + * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class |
| + * accesses the static final set the first time. |
| + */ |
| + private static class DefaultSetHolder { |
| + static final Set<?> DEFAULT_STOP_SET; |
| + |
| + static { |
| + try { |
| + DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class, |
| + DEFAULT_STOPWORD_FILE); |
| + } catch (IOException ex) { |
| + // default set should always be present as it is part of the |
| + // distribution (JAR) |
| + throw new RuntimeException("Unable to load default stopword set"); |
| + } |
| + } |
| + } |
| + |
| + /** |
| + * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}. |
| + */ |
| + public DanishAnalyzer(Version matchVersion) { |
| + this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET); |
| + } |
| + |
| + /** |
| + * Builds an analyzer with the given stop words. |
| + * |
| + * @param matchVersion lucene compatibility version |
| + * @param stopwords a stopword set |
| + */ |
| + public DanishAnalyzer(Version matchVersion, Set<?> stopwords) { |
| + this(matchVersion, stopwords, CharArraySet.EMPTY_SET); |
| + } |
| + |
| + /** |
| + * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is |
| + * provided this analyzer will add a {@link KeywordMarkerTokenFilter} before |
| + * stemming. |
| + * |
| + * @param matchVersion lucene compatibility version |
| + * @param stopwords a stopword set |
| + * @param stemExclusionSet a set of terms not to be stemmed |
| + */ |
| + public DanishAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) { |
| + super(matchVersion, stopwords); |
| + this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy( |
| + matchVersion, stemExclusionSet)); |
| + } |
| + |
| + /** |
| + * Creates a {@link TokenStreamComponents} which tokenizes all the text in the provided |
| + * {@link Reader}. |
| + * |
| + * @return A {@link TokenStreamComponents} built from an {@link StandardTokenizer} |
| + * filtered with {@link StandardFilter}, {@link LowerCaseFilter}, |
| + * {@link StopFilter}, {@link KeywordMarkerTokenFilter} if a stem |
| + * exclusion set is provided and {@link SnowballFilter}. |
| + */ |
| + @Override |
| + protected TokenStreamComponents createComponents(String fieldName, |
| + Reader reader) { |
| + final Tokenizer source = new StandardTokenizer(matchVersion, reader); |
| + TokenStream result = new StandardFilter(source); |
| + result = new LowerCaseFilter(matchVersion, result); |
| + result = new StopFilter(matchVersion, result, stopwords); |
| + if(!stemExclusionSet.isEmpty()) |
| + result = new KeywordMarkerTokenFilter(result, stemExclusionSet); |
| + result = new SnowballFilter(result, new DanishStemmer()); |
| + return new TokenStreamComponents(source, result); |
| + } |
| +} |
|
|
| Property changes on: contrib\analyzers\common\src\java\org\apache\lucene\analysis\da\DanishAnalyzer.java
|
| ___________________________________________________________________
|
| Added: svn:eol-style
|
| + native
|
|
|
| Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/da/package.html
|
| ===================================================================
|
| --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/da/package.html (revision 0)
|
| +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/da/package.html (revision 0)
|
| @@ -0,0 +1,22 @@
|
| +<!doctype html public "-//w3c//dtd html 4.0 transitional//en"> |
| +<!-- |
| + Licensed to the Apache Software Foundation (ASF) under one or more |
| + contributor license agreements. See the NOTICE file distributed with |
| + this work for additional information regarding copyright ownership. |
| + The ASF licenses this file to You under the Apache License, Version 2.0 |
| + (the "License"); you may not use this file except in compliance with |
| + the License. You may obtain a copy of the License at |
| + |
| + http://www.apache.org/licenses/LICENSE-2.0 |
| + |
| + Unless required by applicable law or agreed to in writing, software |
| + distributed under the License is distributed on an "AS IS" BASIS, |
| + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + See the License for the specific language governing permissions and |
| + limitations under the License. |
| +--> |
| +<html><head></head> |
| +<body> |
| +Analyzer for Danish. |
| +</body> |
| +</html> |
|
|
| Property changes on: contrib\analyzers\common\src\java\org\apache\lucene\analysis\da\package.html
|
| ___________________________________________________________________
|
| Added: svn:eol-style
|
| + native
|
|
|
| Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java
|
| ===================================================================
|
| --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java (revision 906571)
|
| +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java (working copy)
|
| @@ -36,10 +36,12 @@
|
| import org.apache.lucene.analysis.TokenStream; |
| import org.apache.lucene.analysis.Tokenizer; |
| import org.apache.lucene.analysis.WordlistLoader; |
| +import org.apache.lucene.analysis.snowball.SnowballFilter; |
| import org.apache.lucene.analysis.standard.StandardAnalyzer; |
| import org.apache.lucene.analysis.standard.StandardFilter; |
| import org.apache.lucene.analysis.standard.StandardTokenizer; |
| import org.apache.lucene.util.Version; |
| +import org.tartarus.snowball.ext.German2Stemmer; |
| |
| /** |
| * {@link Analyzer} for German language. |
| @@ -60,7 +62,7 @@
|
| * List of typical german stopwords. |
| * @deprecated use {@link #getDefaultStopSet()} instead |
| */ |
| - //TODO make this private in 3.1 |
| + //TODO make this private in 3.1, remove in 4.0 |
| @Deprecated |
| public final static String[] GERMAN_STOP_WORDS = { |
| "einer", "eine", "eines", "einem", "einen", |
| @@ -77,6 +79,9 @@
|
| "durch", "wegen", "wird" |
| }; |
| |
| + /** File containing default German stopwords. */ |
| + public final static String DEFAULT_STOPWORD_FILE = "german_stop.txt"; |
| + |
| /** |
| * Returns a set of default German-stopwords |
| * @return a set of default German-stopwords |
| @@ -86,8 +91,21 @@
|
| } |
| |
| private static class DefaultSetHolder { |
| - private static final Set<?> DEFAULT_SET = CharArraySet.unmodifiableSet(new CharArraySet( |
| + /** @deprecated remove in Lucene 4.0 */ |
| + @Deprecated |
| + private static final Set<?> DEFAULT_SET_30 = CharArraySet.unmodifiableSet(new CharArraySet( |
| Version.LUCENE_CURRENT, Arrays.asList(GERMAN_STOP_WORDS), false)); |
| + private static final Set<?> DEFAULT_SET; |
| + static { |
| + try { |
| + DEFAULT_SET = |
| + WordlistLoader.getSnowballWordSet(SnowballFilter.class, DEFAULT_STOPWORD_FILE); |
| + } catch (IOException ex) { |
| + // default set should always be present as it is part of the |
| + // distribution (JAR) |
| + throw new RuntimeException("Unable to load default stopword set"); |
| + } |
| + } |
| } |
| |
| /** |
| @@ -105,7 +123,9 @@
|
| * {@link #getDefaultStopSet()}. |
| */ |
| public GermanAnalyzer(Version matchVersion) { |
| - this(matchVersion, DefaultSetHolder.DEFAULT_SET); |
| + this(matchVersion, |
| + matchVersion.onOrAfter(Version.LUCENE_31) ? DefaultSetHolder.DEFAULT_SET |
| + : DefaultSetHolder.DEFAULT_SET_30); |
| } |
| |
| /** |
| @@ -199,8 +219,9 @@
|
| * |
| * @return {@link TokenStreamComponents} built from a |
| * {@link StandardTokenizer} filtered with {@link StandardFilter}, |
| - * {@link LowerCaseFilter}, {@link StopFilter}, and |
| - * {@link GermanStemFilter} |
| + * {@link LowerCaseFilter}, {@link StopFilter}, |
| + * {@link KeywordMarkerTokenFilter} if a stem exclusion set is provided, and |
| + * {@link SnowballFilter} |
| */ |
| @Override |
| protected TokenStreamComponents createComponents(String fieldName, |
| @@ -210,6 +231,10 @@
|
| result = new LowerCaseFilter(matchVersion, result); |
| result = new StopFilter( matchVersion, result, stopwords); |
| result = new KeywordMarkerTokenFilter(result, exclusionSet); |
| - return new TokenStreamComponents(source, new GermanStemFilter(result)); |
| + if (matchVersion.onOrAfter(Version.LUCENE_31)) |
| + result = new SnowballFilter(result, new German2Stemmer()); |
| + else |
| + result = new GermanStemFilter(result); |
| + return new TokenStreamComponents(source, result); |
| } |
| } |
| Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/sv/SwedishAnalyzer.java
|
| ===================================================================
|
| --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/sv/SwedishAnalyzer.java (revision 0)
|
| +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/sv/SwedishAnalyzer.java (revision 0)
|
| @@ -0,0 +1,129 @@
|
| +package org.apache.lucene.analysis.sv; |
| + |
| +/** |
| + * Licensed to the Apache Software Foundation (ASF) under one or more |
| + * contributor license agreements. See the NOTICE file distributed with |
| + * this work for additional information regarding copyright ownership. |
| + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| + * (the "License"); you may not use this file except in compliance with |
| + * the License. You may obtain a copy of the License at |
| + * |
| + * http://www.apache.org/licenses/LICENSE-2.0 |
| + * |
| + * Unless required by applicable law or agreed to in writing, software |
| + * distributed under the License is distributed on an "AS IS" BASIS, |
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + * See the License for the specific language governing permissions and |
| + * limitations under the License. |
| + */ |
| + |
| +import java.io.IOException; |
| +import java.io.Reader; |
| +import java.util.Set; |
| + |
| +import org.apache.lucene.analysis.Analyzer; |
| +import org.apache.lucene.analysis.CharArraySet; |
| +import org.apache.lucene.analysis.KeywordMarkerTokenFilter; |
| +import org.apache.lucene.analysis.LowerCaseFilter; |
| +import org.apache.lucene.analysis.StopFilter; |
| +import org.apache.lucene.analysis.StopwordAnalyzerBase; |
| +import org.apache.lucene.analysis.TokenStream; |
| +import org.apache.lucene.analysis.Tokenizer; |
| +import org.apache.lucene.analysis.WordlistLoader; |
| +import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link |
| +import org.apache.lucene.analysis.snowball.SnowballFilter; |
| +import org.apache.lucene.analysis.standard.StandardFilter; |
| +import org.apache.lucene.analysis.standard.StandardTokenizer; |
| +import org.apache.lucene.util.Version; |
| +import org.tartarus.snowball.ext.SwedishStemmer; |
| + |
| +/** |
| + * {@link Analyzer} for Swedish. |
| + */ |
| +public final class SwedishAnalyzer extends StopwordAnalyzerBase { |
| + private final Set<?> stemExclusionSet; |
| + |
| + /** File containing default Swedish stopwords. */ |
| + public final static String DEFAULT_STOPWORD_FILE = "swedish_stop.txt"; |
| + |
| + /** |
| + * Returns an unmodifiable instance of the default stop words set. |
| + * @return default stop words set. |
| + */ |
| + public static Set<?> getDefaultStopSet(){ |
| + return DefaultSetHolder.DEFAULT_STOP_SET; |
| + } |
| + |
| + /** |
| + * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class |
| + * accesses the static final set the first time. |
| + */ |
| + private static class DefaultSetHolder { |
| + static final Set<?> DEFAULT_STOP_SET; |
| + |
| + static { |
| + try { |
| + DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class, |
| + DEFAULT_STOPWORD_FILE); |
| + } catch (IOException ex) { |
| + // default set should always be present as it is part of the |
| + // distribution (JAR) |
| + throw new RuntimeException("Unable to load default stopword set"); |
| + } |
| + } |
| + } |
| + |
| + /** |
| + * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}. |
| + */ |
| + public SwedishAnalyzer(Version matchVersion) { |
| + this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET); |
| + } |
| + |
| + /** |
| + * Builds an analyzer with the given stop words. |
| + * |
| + * @param matchVersion lucene compatibility version |
| + * @param stopwords a stopword set |
| + */ |
| + public SwedishAnalyzer(Version matchVersion, Set<?> stopwords) { |
| + this(matchVersion, stopwords, CharArraySet.EMPTY_SET); |
| + } |
| + |
| + /** |
| + * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is |
| + * provided this analyzer will add a {@link KeywordMarkerTokenFilter} before |
| + * stemming. |
| + * |
| + * @param matchVersion lucene compatibility version |
| + * @param stopwords a stopword set |
| + * @param stemExclusionSet a set of terms not to be stemmed |
| + */ |
| + public SwedishAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) { |
| + super(matchVersion, stopwords); |
| + this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy( |
| + matchVersion, stemExclusionSet)); |
| + } |
| + |
| + /** |
| + * Creates a {@link TokenStreamComponents} which tokenizes all the text in the provided |
| + * {@link Reader}. |
| + * |
| + * @return A {@link TokenStreamComponents} built from an {@link StandardTokenizer} |
| + * filtered with {@link StandardFilter}, {@link LowerCaseFilter}, |
| + * {@link StopFilter}, {@link KeywordMarkerTokenFilter} if a stem |
| + * exclusion set is provided and {@link SnowballFilter}. |
| + */ |
| + @Override |
| + protected TokenStreamComponents createComponents(String fieldName, |
| + Reader reader) { |
| + final Tokenizer source = new StandardTokenizer(matchVersion, reader); |
| + TokenStream result = new StandardFilter(source); |
| + result = new LowerCaseFilter(matchVersion, result); |
| + result = new StopFilter(matchVersion, result, stopwords); |
| + if(!stemExclusionSet.isEmpty()) |
| + result = new KeywordMarkerTokenFilter(result, stemExclusionSet); |
| + result = new SnowballFilter(result, new SwedishStemmer()); |
| + return new TokenStreamComponents(source, result); |
| + } |
| +} |
|
|
| Property changes on: contrib\analyzers\common\src\java\org\apache\lucene\analysis\sv\SwedishAnalyzer.java
|
| ___________________________________________________________________
|
| Added: svn:eol-style
|
| + native
|
|
|
| Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/sv/package.html
|
| ===================================================================
|
| --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/sv/package.html (revision 0)
|
| +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/sv/package.html (revision 0)
|
| @@ -0,0 +1,22 @@
|
| +<!doctype html public "-//w3c//dtd html 4.0 transitional//en"> |
| +<!-- |
| + Licensed to the Apache Software Foundation (ASF) under one or more |
| + contributor license agreements. See the NOTICE file distributed with |
| + this work for additional information regarding copyright ownership. |
| + The ASF licenses this file to You under the Apache License, Version 2.0 |
| + (the "License"); you may not use this file except in compliance with |
| + the License. You may obtain a copy of the License at |
| + |
| + http://www.apache.org/licenses/LICENSE-2.0 |
| + |
| + Unless required by applicable law or agreed to in writing, software |
| + distributed under the License is distributed on an "AS IS" BASIS, |
| + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + See the License for the specific language governing permissions and |
| + limitations under the License. |
| +--> |
| +<html><head></head> |
| +<body> |
| +Analyzer for Swedish. |
| +</body> |
| +</html> |
|
|
| Property changes on: contrib\analyzers\common\src\java\org\apache\lucene\analysis\sv\package.html
|
| ___________________________________________________________________
|
| Added: svn:eol-style
|
| + native
|
|
|
| Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/fi/FinnishAnalyzer.java
|
| ===================================================================
|
| --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/fi/FinnishAnalyzer.java (revision 0)
|
| +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/fi/FinnishAnalyzer.java (revision 0)
|
| @@ -0,0 +1,129 @@
|
| +package org.apache.lucene.analysis.fi; |
| + |
| +/** |
| + * Licensed to the Apache Software Foundation (ASF) under one or more |
| + * contributor license agreements. See the NOTICE file distributed with |
| + * this work for additional information regarding copyright ownership. |
| + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| + * (the "License"); you may not use this file except in compliance with |
| + * the License. You may obtain a copy of the License at |
| + * |
| + * http://www.apache.org/licenses/LICENSE-2.0 |
| + * |
| + * Unless required by applicable law or agreed to in writing, software |
| + * distributed under the License is distributed on an "AS IS" BASIS, |
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + * See the License for the specific language governing permissions and |
| + * limitations under the License. |
| + */ |
| + |
| +import java.io.IOException; |
| +import java.io.Reader; |
| +import java.util.Set; |
| + |
| +import org.apache.lucene.analysis.Analyzer; |
| +import org.apache.lucene.analysis.CharArraySet; |
| +import org.apache.lucene.analysis.KeywordMarkerTokenFilter; |
| +import org.apache.lucene.analysis.LowerCaseFilter; |
| +import org.apache.lucene.analysis.StopFilter; |
| +import org.apache.lucene.analysis.StopwordAnalyzerBase; |
| +import org.apache.lucene.analysis.TokenStream; |
| +import org.apache.lucene.analysis.Tokenizer; |
| +import org.apache.lucene.analysis.WordlistLoader; |
| +import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link |
| +import org.apache.lucene.analysis.snowball.SnowballFilter; |
| +import org.apache.lucene.analysis.standard.StandardFilter; |
| +import org.apache.lucene.analysis.standard.StandardTokenizer; |
| +import org.apache.lucene.util.Version; |
| +import org.tartarus.snowball.ext.FinnishStemmer; |
| + |
| +/** |
| + * {@link Analyzer} for Finnish. |
| + */ |
| +public final class FinnishAnalyzer extends StopwordAnalyzerBase { |
| + private final Set<?> stemExclusionSet; |
| + |
| + /** File containing default Finnish stopwords. */ |
| + public final static String DEFAULT_STOPWORD_FILE = "finnish_stop.txt"; |
| + |
| + /** |
| + * Returns an unmodifiable instance of the default stop words set. |
| + * @return default stop words set. |
| + */ |
| + public static Set<?> getDefaultStopSet(){ |
| + return DefaultSetHolder.DEFAULT_STOP_SET; |
| + } |
| + |
| + /** |
| + * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class |
| + * accesses the static final set the first time. |
| + */ |
| + private static class DefaultSetHolder { |
| + static final Set<?> DEFAULT_STOP_SET; |
| + |
| + static { |
| + try { |
| + DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class, |
| + DEFAULT_STOPWORD_FILE); |
| + } catch (IOException ex) { |
| + // default set should always be present as it is part of the |
| + // distribution (JAR) |
| + throw new RuntimeException("Unable to load default stopword set"); |
| + } |
| + } |
| + } |
| + |
| + /** |
| + * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}. |
| + */ |
| + public FinnishAnalyzer(Version matchVersion) { |
| + this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET); |
| + } |
| + |
| + /** |
| + * Builds an analyzer with the given stop words. |
| + * |
| + * @param matchVersion lucene compatibility version |
| + * @param stopwords a stopword set |
| + */ |
| + public FinnishAnalyzer(Version matchVersion, Set<?> stopwords) { |
| + this(matchVersion, stopwords, CharArraySet.EMPTY_SET); |
| + } |
| + |
| + /** |
| + * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is |
| + * provided this analyzer will add a {@link KeywordMarkerTokenFilter} before |
| + * stemming. |
| + * |
| + * @param matchVersion lucene compatibility version |
| + * @param stopwords a stopword set |
| + * @param stemExclusionSet a set of terms not to be stemmed |
| + */ |
| + public FinnishAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) { |
| + super(matchVersion, stopwords); |
| + this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy( |
| + matchVersion, stemExclusionSet)); |
| + } |
| + |
| + /** |
| + * Creates a {@link TokenStreamComponents} which tokenizes all the text in the provided |
| + * {@link Reader}. |
| + * |
| + * @return A {@link TokenStreamComponents} built from an {@link StandardTokenizer} |
| + * filtered with {@link StandardFilter}, {@link LowerCaseFilter}, |
| + * {@link StopFilter}, {@link KeywordMarkerTokenFilter} if a stem |
| + * exclusion set is provided and {@link SnowballFilter}. |
| + */ |
| + @Override |
| + protected TokenStreamComponents createComponents(String fieldName, |
| + Reader reader) { |
| + final Tokenizer source = new StandardTokenizer(matchVersion, reader); |
| + TokenStream result = new StandardFilter(source); |
| + result = new LowerCaseFilter(matchVersion, result); |
| + result = new StopFilter(matchVersion, result, stopwords); |
| + if(!stemExclusionSet.isEmpty()) |
| + result = new KeywordMarkerTokenFilter(result, stemExclusionSet); |
| + result = new SnowballFilter(result, new FinnishStemmer()); |
| + return new TokenStreamComponents(source, result); |
| + } |
| +} |
|
|
| Property changes on: contrib\analyzers\common\src\java\org\apache\lucene\analysis\fi\FinnishAnalyzer.java
|
| ___________________________________________________________________
|
| Added: svn:eol-style
|
| + native
|
|
|
| Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/fi/package.html
|
| ===================================================================
|
| --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/fi/package.html (revision 0)
|
| +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/fi/package.html (revision 0)
|
| @@ -0,0 +1,22 @@
|
| +<!doctype html public "-//w3c//dtd html 4.0 transitional//en"> |
| +<!-- |
| + Licensed to the Apache Software Foundation (ASF) under one or more |
| + contributor license agreements. See the NOTICE file distributed with |
| + this work for additional information regarding copyright ownership. |
| + The ASF licenses this file to You under the Apache License, Version 2.0 |
| + (the "License"); you may not use this file except in compliance with |
| + the License. You may obtain a copy of the License at |
| + |
| + http://www.apache.org/licenses/LICENSE-2.0 |
| + |
| + Unless required by applicable law or agreed to in writing, software |
| + distributed under the License is distributed on an "AS IS" BASIS, |
| + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + See the License for the specific language governing permissions and |
| + limitations under the License. |
| +--> |
| +<html><head></head> |
| +<body> |
| +Analyzer for Finnish. |
| +</body> |
| +</html> |
|
|
| Property changes on: contrib\analyzers\common\src\java\org\apache\lucene\analysis\fi\package.html
|
| ___________________________________________________________________
|
| Added: svn:eol-style
|
| + native
|
|
|
| Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/StemmerOverrideFilter.java
|
| ===================================================================
|
| --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/StemmerOverrideFilter.java (revision 0)
|
| +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/StemmerOverrideFilter.java (revision 0)
|
| @@ -0,0 +1,70 @@
|
| +package org.apache.lucene.analysis.miscellaneous; |
| + |
| +/** |
| + * Licensed to the Apache Software Foundation (ASF) under one or more |
| + * contributor license agreements. See the NOTICE file distributed with |
| + * this work for additional information regarding copyright ownership. |
| + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| + * (the "License"); you may not use this file except in compliance with |
| + * the License. You may obtain a copy of the License at |
| + * |
| + * http://www.apache.org/licenses/LICENSE-2.0 |
| + * |
| + * Unless required by applicable law or agreed to in writing, software |
| + * distributed under the License is distributed on an "AS IS" BASIS, |
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + * See the License for the specific language governing permissions and |
| + * limitations under the License. |
| + */ |
| + |
| +import java.io.IOException; |
| +import java.util.Map; |
| + |
| +import org.apache.lucene.analysis.CharArrayMap; |
| +import org.apache.lucene.analysis.TokenFilter; |
| +import org.apache.lucene.analysis.TokenStream; |
| +import org.apache.lucene.analysis.tokenattributes.KeywordAttribute; |
| +import org.apache.lucene.analysis.tokenattributes.TermAttribute; |
| +import org.apache.lucene.util.Version; |
| + |
| +/** |
| + * Provides the ability to override any {@link KeywordAttribute} aware stemmer |
| + * with custom dictionary-based stemming. |
| + */ |
| +public final class StemmerOverrideFilter extends TokenFilter { |
| + private final CharArrayMap<String> dictionary; |
| + |
| + private final TermAttribute termAtt = addAttribute(TermAttribute.class); |
| + private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class); |
| + |
| + /** |
| + * Create a new StemmerOverrideFilter, performing dictionary-based stemming |
| + * with the provided <code>dictionary</code>. |
| + * <p> |
| + * Any dictionary-stemmed terms will be marked with {@link KeywordAttribute} |
| + * so that they will not be stemmed with stemmers down the chain. |
| + * </p> |
| + */ |
| + public StemmerOverrideFilter(Version matchVersion, TokenStream input, |
| + Map<?,String> dictionary) { |
| + super(input); |
| + this.dictionary = dictionary instanceof CharArrayMap ? |
| + (CharArrayMap<String>) dictionary : CharArrayMap.copy(matchVersion, dictionary); |
| + } |
| + |
| + @Override |
| + public boolean incrementToken() throws IOException { |
| + if (input.incrementToken()) { |
| + if (!keywordAtt.isKeyword()) { // don't muck with already-keyworded terms |
| + String stem = dictionary.get(termAtt.termBuffer(), 0, termAtt.termLength()); |
| + if (stem != null) { |
| + termAtt.setTermBuffer(stem); |
| + keywordAtt.setKeyword(true); |
| + } |
| + } |
| + return true; |
| + } else { |
| + return false; |
| + } |
| + } |
| +} |
|
|
| Property changes on: contrib\analyzers\common\src\java\org\apache\lucene\analysis\miscellaneous\StemmerOverrideFilter.java
|
| ___________________________________________________________________
|
| Added: svn:eol-style
|
| + native
|
|
|
| Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/EmptyTokenStream.java
|
| ===================================================================
|
| --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/EmptyTokenStream.java (revision 906571)
|
| +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/EmptyTokenStream.java (working copy)
|
| @@ -18,7 +18,6 @@
|
| */ |
| |
| import org.apache.lucene.analysis.TokenStream; |
| -import org.apache.lucene.analysis.Token; |
| |
| import java.io.IOException; |
| |
| Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java
|
| ===================================================================
|
| --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java (revision 906571)
|
| +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java (working copy)
|
| @@ -17,7 +17,6 @@
|
| * limitations under the License. |
| */ |
| |
| -import org.apache.lucene.analysis.Token; |
| import org.apache.lucene.analysis.Tokenizer; |
| import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; |
| import org.apache.lucene.analysis.tokenattributes.TermAttribute; |
| Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java
|
| ===================================================================
|
| --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java (revision 906571)
|
| +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java (working copy)
|
| @@ -19,7 +19,6 @@
|
| |
| import java.io.IOException; |
| |
| -import org.apache.lucene.analysis.Token; |
| import org.apache.lucene.analysis.TokenFilter; |
| import org.apache.lucene.analysis.TokenStream; |
| import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; |
| Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchStemmer.java
|
| ===================================================================
|
| --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchStemmer.java (revision 906571)
|
| +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchStemmer.java (working copy)
|
| @@ -25,8 +25,10 @@
|
| * refer to http://snowball.sourceforge.net/french/stemmer.html<br> |
| * (French stemming algorithm) for details |
| * </p> |
| + * @deprecated Use {@link org.tartarus.snowball.ext.FrenchStemmer} instead, |
| + * which has the same functionality. This filter will be removed in Lucene 4.0 |
| */ |
| - |
| +@Deprecated |
| public class FrenchStemmer { |
| |
| /** |
| Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchStemFilter.java
|
| ===================================================================
|
| --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchStemFilter.java (revision 906571)
|
| +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchStemFilter.java (working copy)
|
| @@ -20,6 +20,7 @@
|
| import org.apache.lucene.analysis.KeywordMarkerTokenFilter;// for javadoc |
| import org.apache.lucene.analysis.TokenFilter; |
| import org.apache.lucene.analysis.TokenStream; |
| +import org.apache.lucene.analysis.snowball.SnowballFilter; |
| import org.apache.lucene.analysis.tokenattributes.KeywordAttribute; |
| import org.apache.lucene.analysis.tokenattributes.TermAttribute; |
| |
| @@ -40,7 +41,11 @@
|
| * the {@link KeywordAttribute} before this {@link TokenStream}. |
| * </p> |
| * @see KeywordMarkerTokenFilter |
| + * @deprecated Use {@link SnowballFilter} with |
| + * {@link org.tartarus.snowball.ext.FrenchStemmer} instead, which has the |
| + * same functionality. This filter will be removed in Lucene 4.0 |
| */ |
| +@Deprecated |
| public final class FrenchStemFilter extends TokenFilter { |
| |
| /** |
| Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/ElisionFilter.java
|
| ===================================================================
|
| --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/ElisionFilter.java (revision 906571)
|
| +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/ElisionFilter.java (working copy)
|
| @@ -68,7 +68,7 @@
|
| /** |
| * Constructs an elision filter with standard stop words |
| */ |
| - protected ElisionFilter(Version matchVersion, TokenStream input) { |
| + public ElisionFilter(Version matchVersion, TokenStream input) { |
| this(matchVersion, input, DEFAULT_ARTICLES); |
| } |
| |
| @@ -77,7 +77,7 @@
|
| * @deprecated use {@link #ElisionFilter(Version, TokenStream)} instead |
| */ |
| @Deprecated |
| - protected ElisionFilter(TokenStream input) { |
| + public ElisionFilter(TokenStream input) { |
| this(Version.LUCENE_30, input); |
| } |
| |
| Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java
|
| ===================================================================
|
| --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java (revision 906571)
|
| +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java (working copy)
|
| @@ -27,6 +27,7 @@
|
| import org.apache.lucene.analysis.TokenStream; |
| import org.apache.lucene.analysis.Tokenizer; |
| import org.apache.lucene.analysis.WordlistLoader; |
| +import org.apache.lucene.analysis.snowball.SnowballFilter; |
| import org.apache.lucene.analysis.standard.StandardFilter; |
| import org.apache.lucene.analysis.standard.StandardTokenizer; |
| import org.apache.lucene.analysis.standard.StandardAnalyzer; // for javadoc |
| @@ -68,7 +69,7 @@
|
| * Extended list of typical French stopwords. |
| * @deprecated use {@link #getDefaultStopSet()} instead |
| */ |
| - // TODO make this private in 3.1 |
| + // TODO make this private in 3.1, remove in 4.0 |
| @Deprecated |
| public final static String[] FRENCH_STOP_WORDS = { |
| "a", "afin", "ai", "ainsi", "après", "attendu", "au", "aujourd", "auquel", "aussi", |
| @@ -95,6 +96,9 @@
|
| "été", "être", "ô" |
| }; |
| |
| + /** File containing default French stopwords. */ |
| + public final static String DEFAULT_STOPWORD_FILE = "french_stop.txt"; |
| + |
| /** |
| * Contains words that should be indexed but not stemmed. |
| */ |
| @@ -110,16 +114,31 @@
|
| } |
| |
| private static class DefaultSetHolder { |
| - static final Set<?> DEFAULT_STOP_SET = CharArraySet |
| + /** @deprecated remove this in Lucene 4.0 */ |
| + @Deprecated |
| + static final Set<?> DEFAULT_STOP_SET_30 = CharArraySet |
| .unmodifiableSet(new CharArraySet(Version.LUCENE_CURRENT, Arrays.asList(FRENCH_STOP_WORDS), |
| false)); |
| + static final Set<?> DEFAULT_STOP_SET; |
| + static { |
| + try { |
| + DEFAULT_STOP_SET = |
| + WordlistLoader.getSnowballWordSet(SnowballFilter.class, DEFAULT_STOPWORD_FILE); |
| + } catch (IOException ex) { |
| + // default set should always be present as it is part of the |
| + // distribution (JAR) |
| +        throw new RuntimeException("Unable to load default stopword set", ex); |
| + } |
| + } |
| } |
| |
| /** |
| - * Builds an analyzer with the default stop words ({@link #FRENCH_STOP_WORDS}). |
| + * Builds an analyzer with the default stop words ({@link #getDefaultStopSet}). |
| */ |
| public FrenchAnalyzer(Version matchVersion) { |
| - this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET); |
| + this(matchVersion, |
| + matchVersion.onOrAfter(Version.LUCENE_31) ? DefaultSetHolder.DEFAULT_STOP_SET |
| + : DefaultSetHolder.DEFAULT_STOP_SET_30); |
| } |
| |
| /** |
| @@ -207,20 +226,34 @@
|
| * {@link Reader}. |
| * |
| * @return {@link TokenStreamComponents} built from a {@link StandardTokenizer} |
| - * filtered with {@link StandardFilter}, {@link StopFilter}, |
| - * {@link FrenchStemFilter} and {@link LowerCaseFilter} |
| + * filtered with {@link StandardFilter}, {@link ElisionFilter}, |
| + * {@link LowerCaseFilter}, {@link StopFilter}, |
| + * {@link KeywordMarkerTokenFilter} if a stem exclusion set is provided, |
| + * and {@link SnowballFilter} |
| */ |
| @Override |
| protected TokenStreamComponents createComponents(String fieldName, |
| Reader reader) { |
| - final Tokenizer source = new StandardTokenizer(matchVersion, reader); |
| - TokenStream result = new StandardFilter(source); |
| - result = new StopFilter(matchVersion, result, stopwords); |
| - if(!excltable.isEmpty()) |
| - result = new KeywordMarkerTokenFilter(result, excltable); |
| - result = new FrenchStemFilter(result); |
| - // Convert to lowercase after stemming! |
| - return new TokenStreamComponents(source, new LowerCaseFilter(matchVersion, result)); |
| + if (matchVersion.onOrAfter(Version.LUCENE_31)) { |
| + final Tokenizer source = new StandardTokenizer(matchVersion, reader); |
| + TokenStream result = new StandardFilter(source); |
| + result = new ElisionFilter(matchVersion, result); |
| + result = new LowerCaseFilter(matchVersion, result); |
| + result = new StopFilter(matchVersion, result, stopwords); |
| + if(!excltable.isEmpty()) |
| + result = new KeywordMarkerTokenFilter(result, excltable); |
| + result = new SnowballFilter(result, new org.tartarus.snowball.ext.FrenchStemmer()); |
| + return new TokenStreamComponents(source, result); |
| + } else { |
| + final Tokenizer source = new StandardTokenizer(matchVersion, reader); |
| + TokenStream result = new StandardFilter(source); |
| + result = new StopFilter(matchVersion, result, stopwords); |
| + if(!excltable.isEmpty()) |
| + result = new KeywordMarkerTokenFilter(result, excltable); |
| + result = new FrenchStemFilter(result); |
| + // Convert to lowercase after stemming! |
| + return new TokenStreamComponents(source, new LowerCaseFilter(matchVersion, result)); |
| + } |
| } |
| } |
| |
| Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchStemmer.java
|
| ===================================================================
|
| --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchStemmer.java (revision 906571)
|
| +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchStemmer.java (working copy)
|
| @@ -26,8 +26,10 @@
|
| * the <a href="http://snowball.tartarus.org/algorithms/dutch/stemmer.html">dutch stemming</a> |
| * algorithm in Martin Porter's snowball project. |
| * </p> |
| + * @deprecated Use {@link org.tartarus.snowball.ext.DutchStemmer} instead, |
| + * which has the same functionality. This filter will be removed in Lucene 4.0 |
| */ |
| - |
| +@Deprecated |
| public class DutchStemmer { |
| /** |
| * Buffer for the terms while stemming them. |
| Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchStemFilter.java
|
| ===================================================================
|
| --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchStemFilter.java (revision 906571)
|
| +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchStemFilter.java (working copy)
|
| @@ -26,6 +26,7 @@
|
| import org.apache.lucene.analysis.KeywordMarkerTokenFilter;// for javadoc |
| import org.apache.lucene.analysis.TokenFilter; |
| import org.apache.lucene.analysis.TokenStream; |
| +import org.apache.lucene.analysis.snowball.SnowballFilter; |
| import org.apache.lucene.analysis.tokenattributes.KeywordAttribute; |
| import org.apache.lucene.analysis.tokenattributes.TermAttribute; |
| |
| @@ -42,7 +43,11 @@
|
| * the {@link KeywordAttribute} before this {@link TokenStream}. |
| * </p> |
| * @see KeywordMarkerTokenFilter |
| + * @deprecated Use {@link SnowballFilter} with |
| + * {@link org.tartarus.snowball.ext.DutchStemmer} instead, which has the |
| + * same functionality. This filter will be removed in Lucene 4.0 |
| */ |
| +@Deprecated |
| public final class DutchStemFilter extends TokenFilter { |
| /** |
| * The actual token in the input stream. |
| Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java
|
| ===================================================================
|
| --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java (revision 906571)
|
| +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java (working copy)
|
| @@ -20,11 +20,14 @@
|
| import org.apache.lucene.analysis.Analyzer; |
| import org.apache.lucene.analysis.CharArraySet; |
| import org.apache.lucene.analysis.KeywordMarkerTokenFilter; |
| +import org.apache.lucene.analysis.LowerCaseFilter; |
| import org.apache.lucene.analysis.ReusableAnalyzerBase; |
| import org.apache.lucene.analysis.StopFilter; |
| import org.apache.lucene.analysis.TokenStream; |
| import org.apache.lucene.analysis.Tokenizer; |
| import org.apache.lucene.analysis.WordlistLoader; |
| +import org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter; |
| +import org.apache.lucene.analysis.snowball.SnowballFilter; |
| import org.apache.lucene.analysis.standard.StandardFilter; |
| import org.apache.lucene.analysis.standard.StandardTokenizer; |
| import org.apache.lucene.analysis.standard.StandardAnalyzer; // for javadoc |
| @@ -33,7 +36,6 @@
|
| import java.io.File; |
| import java.io.IOException; |
| import java.io.Reader; |
| -import java.util.Arrays; |
| import java.util.Collections; |
| import java.util.HashMap; |
| import java.util.HashSet; |
| @@ -60,19 +62,11 @@
|
| * @deprecated use {@link #getDefaultStopSet()} instead |
| */ |
| @Deprecated |
| - public final static String[] DUTCH_STOP_WORDS = |
| - { |
| - "de", "en", "van", "ik", "te", "dat", "die", "in", "een", |
| - "hij", "het", "niet", "zijn", "is", "was", "op", "aan", "met", "als", "voor", "had", |
| - "er", "maar", "om", "hem", "dan", "zou", "of", "wat", "mijn", "men", "dit", "zo", |
| - "door", "over", "ze", "zich", "bij", "ook", "tot", "je", "mij", "uit", "der", "daar", |
| - "haar", "naar", "heb", "hoe", "heeft", "hebben", "deze", "u", "want", "nog", "zal", |
| - "me", "zij", "nu", "ge", "geen", "omdat", "iets", "worden", "toch", "al", "waren", |
| - "veel", "meer", "doen", "toen", "moet", "ben", "zonder", "kan", "hun", "dus", |
| - "alles", "onder", "ja", "eens", "hier", "wie", "werd", "altijd", "doch", "wordt", |
| - "wezen", "kunnen", "ons", "zelf", "tegen", "na", "reeds", "wil", "kon", "niets", |
| - "uw", "iemand", "geweest", "andere" |
| - }; |
| + public final static String[] DUTCH_STOP_WORDS = getDefaultStopSet().toArray(new String[0]); |
| + |
| + /** File containing default Dutch stopwords. */ |
| + public final static String DEFAULT_STOPWORD_FILE = "dutch_stop.txt"; |
| + |
| /** |
| * Returns an unmodifiable instance of the default stop-words set. |
| * @return an unmodifiable instance of the default stop-words set. |
| @@ -82,9 +76,18 @@
|
| } |
| |
| private static class DefaultSetHolder { |
| - static final Set<?> DEFAULT_STOP_SET = CharArraySet |
| - .unmodifiableSet(new CharArraySet(Version.LUCENE_CURRENT, |
| - Arrays.asList(DUTCH_STOP_WORDS), false)); |
| + static final Set<?> DEFAULT_STOP_SET; |
| + |
| + static { |
| + try { |
| + DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class, |
| + DEFAULT_STOPWORD_FILE); |
| + } catch (IOException ex) { |
| + // default set should always be present as it is part of the |
| + // distribution (JAR) |
| +        throw new RuntimeException("Unable to load default stopword set", ex); |
| + } |
| + } |
| } |
| |
| |
| @@ -223,18 +226,32 @@
|
| * text in the provided {@link Reader}. |
| * |
| * @return A {@link TokenStream} built from a {@link StandardTokenizer} |
| - * filtered with {@link StandardFilter}, {@link StopFilter}, |
| - * and {@link DutchStemFilter} |
| + * filtered with {@link StandardFilter}, {@link LowerCaseFilter}, |
| + * {@link StopFilter}, {@link KeywordMarkerTokenFilter} if a stem exclusion set is provided, |
| + * {@link StemmerOverrideFilter}, and {@link SnowballFilter} |
| */ |
| @Override |
| protected TokenStreamComponents createComponents(String fieldName, |
| Reader aReader) { |
| - final Tokenizer source = new StandardTokenizer(matchVersion, aReader); |
| - TokenStream result = new StandardFilter(source); |
| - result = new StopFilter(matchVersion, result, stoptable); |
| - if (!excltable.isEmpty()) |
| - result = new KeywordMarkerTokenFilter(result, excltable); |
| - result = new DutchStemFilter(result, stemdict); |
| - return new TokenStreamComponents(source, result); |
| + if (matchVersion.onOrAfter(Version.LUCENE_31)) { |
| + final Tokenizer source = new StandardTokenizer(matchVersion, aReader); |
| + TokenStream result = new StandardFilter(source); |
| + result = new LowerCaseFilter(matchVersion, result); |
| + result = new StopFilter(matchVersion, result, stoptable); |
| + if (!excltable.isEmpty()) |
| + result = new KeywordMarkerTokenFilter(result, excltable); |
| + if (!stemdict.isEmpty()) |
| + result = new StemmerOverrideFilter(matchVersion, result, stemdict); |
| + result = new SnowballFilter(result, new org.tartarus.snowball.ext.DutchStemmer()); |
| + return new TokenStreamComponents(source, result); |
| + } else { |
| + final Tokenizer source = new StandardTokenizer(matchVersion, aReader); |
| + TokenStream result = new StandardFilter(source); |
| + result = new StopFilter(matchVersion, result, stoptable); |
| + if (!excltable.isEmpty()) |
| + result = new KeywordMarkerTokenFilter(result, excltable); |
| + result = new DutchStemFilter(result, stemdict); |
| + return new TokenStreamComponents(source, result); |
| + } |
| } |
| } |
| Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java
|
| ===================================================================
|
| --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java (revision 906571)
|
| +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java (working copy)
|
| @@ -19,7 +19,6 @@
|
| import java.io.IOException; |
| import java.util.Locale; |
| import java.lang.Character.UnicodeBlock; |
| -import org.apache.lucene.analysis.Token; |
| import org.apache.lucene.analysis.TokenFilter; |
| import org.apache.lucene.analysis.TokenStream; |
| import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; |
| Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/hu/HungarianAnalyzer.java
|
| ===================================================================
|
| --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/hu/HungarianAnalyzer.java (revision 0)
|
| +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/hu/HungarianAnalyzer.java (revision 0)
|
| @@ -0,0 +1,129 @@
|
| +package org.apache.lucene.analysis.hu; |
| + |
| +/** |
| + * Licensed to the Apache Software Foundation (ASF) under one or more |
| + * contributor license agreements. See the NOTICE file distributed with |
| + * this work for additional information regarding copyright ownership. |
| + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| + * (the "License"); you may not use this file except in compliance with |
| + * the License. You may obtain a copy of the License at |
| + * |
| + * http://www.apache.org/licenses/LICENSE-2.0 |
| + * |
| + * Unless required by applicable law or agreed to in writing, software |
| + * distributed under the License is distributed on an "AS IS" BASIS, |
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + * See the License for the specific language governing permissions and |
| + * limitations under the License. |
| + */ |
| + |
| +import java.io.IOException; |
| +import java.io.Reader; |
| +import java.util.Set; |
| + |
| +import org.apache.lucene.analysis.Analyzer; |
| +import org.apache.lucene.analysis.CharArraySet; |
| +import org.apache.lucene.analysis.KeywordMarkerTokenFilter; |
| +import org.apache.lucene.analysis.LowerCaseFilter; |
| +import org.apache.lucene.analysis.StopFilter; |
| +import org.apache.lucene.analysis.StopwordAnalyzerBase; |
| +import org.apache.lucene.analysis.TokenStream; |
| +import org.apache.lucene.analysis.Tokenizer; |
| +import org.apache.lucene.analysis.WordlistLoader; |
| +import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link |
| +import org.apache.lucene.analysis.snowball.SnowballFilter; |
| +import org.apache.lucene.analysis.standard.StandardFilter; |
| +import org.apache.lucene.analysis.standard.StandardTokenizer; |
| +import org.apache.lucene.util.Version; |
| +import org.tartarus.snowball.ext.HungarianStemmer; |
| + |
| +/** |
| + * {@link Analyzer} for Hungarian. |
| + */ |
| +public final class HungarianAnalyzer extends StopwordAnalyzerBase { |
| + private final Set<?> stemExclusionSet; |
| + |
| + /** File containing default Hungarian stopwords. */ |
| + public final static String DEFAULT_STOPWORD_FILE = "hungarian_stop.txt"; |
| + |
| + /** |
| + * Returns an unmodifiable instance of the default stop words set. |
| + * @return default stop words set. |
| + */ |
| + public static Set<?> getDefaultStopSet(){ |
| + return DefaultSetHolder.DEFAULT_STOP_SET; |
| + } |
| + |
| + /** |
| + * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class |
| +   * accesses the static final set the first time. |
| + */ |
| + private static class DefaultSetHolder { |
| + static final Set<?> DEFAULT_STOP_SET; |
| + |
| + static { |
| + try { |
| + DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class, |
| + DEFAULT_STOPWORD_FILE); |
| + } catch (IOException ex) { |
| + // default set should always be present as it is part of the |
| + // distribution (JAR) |
| +        throw new RuntimeException("Unable to load default stopword set", ex); |
| + } |
| + } |
| + } |
| + |
| + /** |
| + * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}. |
| + */ |
| + public HungarianAnalyzer(Version matchVersion) { |
| + this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET); |
| + } |
| + |
| + /** |
| + * Builds an analyzer with the given stop words. |
| + * |
| + * @param matchVersion lucene compatibility version |
| + * @param stopwords a stopword set |
| + */ |
| + public HungarianAnalyzer(Version matchVersion, Set<?> stopwords) { |
| + this(matchVersion, stopwords, CharArraySet.EMPTY_SET); |
| + } |
| + |
| + /** |
| + * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is |
| + * provided this analyzer will add a {@link KeywordMarkerTokenFilter} before |
| + * stemming. |
| + * |
| + * @param matchVersion lucene compatibility version |
| + * @param stopwords a stopword set |
| + * @param stemExclusionSet a set of terms not to be stemmed |
| + */ |
| + public HungarianAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) { |
| + super(matchVersion, stopwords); |
| + this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy( |
| + matchVersion, stemExclusionSet)); |
| + } |
| + |
| + /** |
| + * Creates a {@link TokenStreamComponents} which tokenizes all the text in the provided |
| + * {@link Reader}. |
| + * |
| + * @return A {@link TokenStreamComponents} built from an {@link StandardTokenizer} |
| + * filtered with {@link StandardFilter}, {@link LowerCaseFilter}, |
| + * {@link StopFilter}, {@link KeywordMarkerTokenFilter} if a stem |
| + * exclusion set is provided and {@link SnowballFilter}. |
| + */ |
| + @Override |
| + protected TokenStreamComponents createComponents(String fieldName, |
| + Reader reader) { |
| + final Tokenizer source = new StandardTokenizer(matchVersion, reader); |
| + TokenStream result = new StandardFilter(source); |
| + result = new LowerCaseFilter(matchVersion, result); |
| + result = new StopFilter(matchVersion, result, stopwords); |
| + if(!stemExclusionSet.isEmpty()) |
| + result = new KeywordMarkerTokenFilter(result, stemExclusionSet); |
| + result = new SnowballFilter(result, new HungarianStemmer()); |
| + return new TokenStreamComponents(source, result); |
| + } |
| +} |
|
|
| Property changes on: contrib\analyzers\common\src\java\org\apache\lucene\analysis\hu\HungarianAnalyzer.java
|
| ___________________________________________________________________
|
| Added: svn:eol-style
|
| + native
|
|
|
| Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/hu/package.html
|
| ===================================================================
|
| --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/hu/package.html (revision 0)
|
| +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/hu/package.html (revision 0)
|
| @@ -0,0 +1,22 @@
|
| +<!doctype html public "-//w3c//dtd html 4.0 transitional//en"> |
| +<!-- |
| + Licensed to the Apache Software Foundation (ASF) under one or more |
| + contributor license agreements. See the NOTICE file distributed with |
| + this work for additional information regarding copyright ownership. |
| + The ASF licenses this file to You under the Apache License, Version 2.0 |
| + (the "License"); you may not use this file except in compliance with |
| + the License. You may obtain a copy of the License at |
| + |
| + http://www.apache.org/licenses/LICENSE-2.0 |
| + |
| + Unless required by applicable law or agreed to in writing, software |
| + distributed under the License is distributed on an "AS IS" BASIS, |
| + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + See the License for the specific language governing permissions and |
| + limitations under the License. |
| +--> |
| +<html><head></head> |
| +<body> |
| +Analyzer for Hungarian. |
| +</body> |
| +</html> |
|
|
| Property changes on: contrib\analyzers\common\src\java\org\apache\lucene\analysis\hu\package.html
|
| ___________________________________________________________________
|
| Added: svn:eol-style
|
| + native
|
|
|
| Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/no/NorwegianAnalyzer.java
|
| ===================================================================
|
| --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/no/NorwegianAnalyzer.java (revision 0)
|
| +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/no/NorwegianAnalyzer.java (revision 0)
|
| @@ -0,0 +1,130 @@
|
| +package org.apache.lucene.analysis.no; |
| + |
| +/** |
| + * Licensed to the Apache Software Foundation (ASF) under one or more |
| + * contributor license agreements. See the NOTICE file distributed with |
| + * this work for additional information regarding copyright ownership. |
| + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| + * (the "License"); you may not use this file except in compliance with |
| + * the License. You may obtain a copy of the License at |
| + * |
| + * http://www.apache.org/licenses/LICENSE-2.0 |
| + * |
| + * Unless required by applicable law or agreed to in writing, software |
| + * distributed under the License is distributed on an "AS IS" BASIS, |
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + * See the License for the specific language governing permissions and |
| + * limitations under the License. |
| + */ |
| + |
| +import java.io.IOException; |
| +import java.io.Reader; |
| +import java.util.Set; |
| + |
| +import org.apache.lucene.analysis.Analyzer; |
| +import org.apache.lucene.analysis.CharArraySet; |
| +import org.apache.lucene.analysis.KeywordMarkerTokenFilter; |
| +import org.apache.lucene.analysis.LowerCaseFilter; |
| +import org.apache.lucene.analysis.StopFilter; |
| +import org.apache.lucene.analysis.StopwordAnalyzerBase; |
| +import org.apache.lucene.analysis.TokenStream; |
| +import org.apache.lucene.analysis.Tokenizer; |
| +import org.apache.lucene.analysis.WordlistLoader; |
| +import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link |
| +import org.apache.lucene.analysis.snowball.SnowballFilter; |
| +import org.apache.lucene.analysis.standard.StandardFilter; |
| +import org.apache.lucene.analysis.standard.StandardTokenizer; |
| +import org.apache.lucene.util.Version; |
| +import org.tartarus.snowball.ext.NorwegianStemmer; |
| + |
| +/** |
| + * {@link Analyzer} for Norwegian. |
| + */ |
| +public final class NorwegianAnalyzer extends StopwordAnalyzerBase { |
| + private final Set<?> stemExclusionSet; |
| + |
| + /** File containing default Norwegian stopwords. */ |
| + public final static String DEFAULT_STOPWORD_FILE = "norwegian_stop.txt"; |
| + |
| + /** |
| + * Returns an unmodifiable instance of the default stop words set. |
| + * @return default stop words set. |
| + */ |
| + public static Set<?> getDefaultStopSet(){ |
| + return DefaultSetHolder.DEFAULT_STOP_SET; |
| + } |
| + |
| + /** |
| + * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class |
| +   * accesses the static final set the first time. |
| + */ |
| + private static class DefaultSetHolder { |
| + static final Set<?> DEFAULT_STOP_SET; |
| + |
| + static { |
| + try { |
| + DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class, |
| + DEFAULT_STOPWORD_FILE); |
| + } catch (IOException ex) { |
| + // default set should always be present as it is part of the |
| + // distribution (JAR) |
| +        throw new RuntimeException("Unable to load default stopword set", ex); |
| + } |
| + } |
| + } |
| + |
| + /** |
| + * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}. |
| + */ |
| + public NorwegianAnalyzer(Version matchVersion) { |
| + this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET); |
| + } |
| + |
| + /** |
| + * Builds an analyzer with the given stop words. |
| + * |
| + * @param matchVersion lucene compatibility version |
| + * @param stopwords a stopword set |
| + */ |
| + public NorwegianAnalyzer(Version matchVersion, Set<?> stopwords) { |
| + this(matchVersion, stopwords, CharArraySet.EMPTY_SET); |
| + } |
| + |
| + /** |
| + * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is |
| + * provided this analyzer will add a {@link KeywordMarkerTokenFilter} before |
| + * stemming. |
| + * |
| + * @param matchVersion lucene compatibility version |
| + * @param stopwords a stopword set |
| + * @param stemExclusionSet a set of terms not to be stemmed |
| + */ |
| + public NorwegianAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) { |
| + super(matchVersion, stopwords); |
| + this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy( |
| + matchVersion, stemExclusionSet)); |
| + } |
| + |
| + /** |
| + * Creates a {@link TokenStreamComponents} which tokenizes all the text in the provided |
| + * {@link Reader}. |
| + * |
| + * @return A {@link TokenStreamComponents} built from an {@link StandardTokenizer} |
| + * filtered with {@link StandardFilter}, {@link LowerCaseFilter}, |
| + * {@link StopFilter}, {@link KeywordMarkerTokenFilter} if a stem |
| + * exclusion set is provided and {@link SnowballFilter}. |
| + */ |
| + @Override |
| + protected TokenStreamComponents createComponents(String fieldName, |
| + Reader reader) { |
| + final Tokenizer source = new StandardTokenizer(matchVersion, reader); |
| + TokenStream result = new StandardFilter(source); |
| + result = new LowerCaseFilter(matchVersion, result); |
| + result = new StopFilter(matchVersion, result, stopwords); |
| + if(!stemExclusionSet.isEmpty()) |
| + result = new KeywordMarkerTokenFilter(result, stemExclusionSet); |
| + result = new SnowballFilter(result, new NorwegianStemmer()); |
| + return new TokenStreamComponents(source, result); |
| + } |
| +} |
| + |
|
|
| Property changes on: contrib\analyzers\common\src\java\org\apache\lucene\analysis\no\NorwegianAnalyzer.java
|
| ___________________________________________________________________
|
| Added: svn:eol-style
|
| + native
|
|
|
| Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/no/package.html
|
| ===================================================================
|
| --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/no/package.html (revision 0)
|
| +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/no/package.html (revision 0)
|
| @@ -0,0 +1,22 @@
|
| +<!doctype html public "-//w3c//dtd html 4.0 transitional//en"> |
| +<!-- |
| + Licensed to the Apache Software Foundation (ASF) under one or more |
| + contributor license agreements. See the NOTICE file distributed with |
| + this work for additional information regarding copyright ownership. |
| + The ASF licenses this file to You under the Apache License, Version 2.0 |
| + (the "License"); you may not use this file except in compliance with |
| + the License. You may obtain a copy of the License at |
| + |
| + http://www.apache.org/licenses/LICENSE-2.0 |
| + |
| + Unless required by applicable law or agreed to in writing, software |
| + distributed under the License is distributed on an "AS IS" BASIS, |
| + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + See the License for the specific language governing permissions and |
| + limitations under the License. |
| +--> |
| +<html><head></head> |
| +<body> |
| +Analyzer for Norwegian. |
| +</body> |
| +</html> |
|
|
| Property changes on: contrib\analyzers\common\src\java\org\apache\lucene\analysis\no\package.html
|
| ___________________________________________________________________
|
| Added: svn:eol-style
|
| + native
|
|
|
| Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/NumericPayloadTokenFilter.java
|
| ===================================================================
|
| --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/NumericPayloadTokenFilter.java (revision 906571)
|
| +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/NumericPayloadTokenFilter.java (working copy)
|
| @@ -17,7 +17,6 @@
|
| */ |
| |
| |
| -import org.apache.lucene.analysis.Token; |
| import org.apache.lucene.analysis.TokenFilter; |
| import org.apache.lucene.analysis.TokenStream; |
| import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; |
| Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/TypeAsPayloadTokenFilter.java
|
| ===================================================================
|
| --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/TypeAsPayloadTokenFilter.java (revision 906571)
|
| +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/TypeAsPayloadTokenFilter.java (working copy)
|
| @@ -19,7 +19,6 @@
|
| |
| import org.apache.lucene.analysis.TokenFilter; |
| import org.apache.lucene.analysis.TokenStream; |
| -import org.apache.lucene.analysis.Token; |
| import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; |
| import org.apache.lucene.analysis.tokenattributes.TypeAttribute; |
| import org.apache.lucene.index.Payload; |
| Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/TokenOffsetPayloadTokenFilter.java
|
| ===================================================================
|
| --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/TokenOffsetPayloadTokenFilter.java (revision 906571)
|
| +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/payloads/TokenOffsetPayloadTokenFilter.java (working copy)
|
| @@ -19,7 +19,6 @@
|
| |
| import java.io.IOException; |
| |
| -import org.apache.lucene.analysis.Token; |
| import org.apache.lucene.analysis.TokenFilter; |
| import org.apache.lucene.analysis.TokenStream; |
| import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; |
| Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/ro/RomanianAnalyzer.java
|
| ===================================================================
|
| --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/ro/RomanianAnalyzer.java (revision 0)
|
| +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/ro/RomanianAnalyzer.java (revision 0)
|
| @@ -0,0 +1,133 @@
|
| +package org.apache.lucene.analysis.ro; |
| + |
| +/** |
| + * Licensed to the Apache Software Foundation (ASF) under one or more |
| + * contributor license agreements. See the NOTICE file distributed with |
| + * this work for additional information regarding copyright ownership. |
| + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| + * (the "License"); you may not use this file except in compliance with |
| + * the License. You may obtain a copy of the License at |
| + * |
| + * http://www.apache.org/licenses/LICENSE-2.0 |
| + * |
| + * Unless required by applicable law or agreed to in writing, software |
| + * distributed under the License is distributed on an "AS IS" BASIS, |
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + * See the License for the specific language governing permissions and |
| + * limitations under the License. |
| + */ |
| + |
| +import java.io.IOException; |
| +import java.io.Reader; |
| +import java.util.Set; |
| + |
| +import org.apache.lucene.analysis.Analyzer; |
| +import org.apache.lucene.analysis.CharArraySet; |
| +import org.apache.lucene.analysis.KeywordMarkerTokenFilter; |
| +import org.apache.lucene.analysis.LowerCaseFilter; |
| +import org.apache.lucene.analysis.StopFilter; |
| +import org.apache.lucene.analysis.StopwordAnalyzerBase; |
| +import org.apache.lucene.analysis.TokenStream; |
| +import org.apache.lucene.analysis.Tokenizer; |
| +import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link |
| +import org.apache.lucene.analysis.snowball.SnowballFilter; |
| +import org.apache.lucene.analysis.standard.StandardFilter; |
| +import org.apache.lucene.analysis.standard.StandardTokenizer; |
| +import org.apache.lucene.util.Version; |
| +import org.tartarus.snowball.ext.RomanianStemmer; |
| + |
| +/** |
| + * {@link Analyzer} for Romanian. |
| + */ |
| +public final class RomanianAnalyzer extends StopwordAnalyzerBase { |
| + private final Set<?> stemExclusionSet; |
| + |
| + /** File containing default Romanian stopwords. */ |
| + public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt"; |
| + /** |
| + * The comment character in the stopwords file. |
| + * All lines prefixed with this will be ignored. |
| + */ |
| + private static final String STOPWORDS_COMMENT = "#"; |
| + |
| + /** |
| + * Returns an unmodifiable instance of the default stop words set. |
| + * @return default stop words set. |
| + */ |
| + public static Set<?> getDefaultStopSet(){ |
| + return DefaultSetHolder.DEFAULT_STOP_SET; |
| + } |
| + |
| + /** |
| + * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class |
| + * accesses the static final set the first time. |
| + */ |
| + private static class DefaultSetHolder { |
| + static final Set<?> DEFAULT_STOP_SET; |
| + |
| + static { |
| + try { |
| + DEFAULT_STOP_SET = loadStopwordSet(false, RomanianAnalyzer.class, |
| + DEFAULT_STOPWORD_FILE, STOPWORDS_COMMENT); |
| + } catch (IOException ex) { |
| + // default set should always be present as it is part of the |
| + // distribution (JAR) |
| + throw new RuntimeException("Unable to load default stopword set"); |
| + } |
| + } |
| + } |
| + |
| + /** |
| + * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}. |
| + */ |
| + public RomanianAnalyzer(Version matchVersion) { |
| + this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET); |
| + } |
| + |
| + /** |
| + * Builds an analyzer with the given stop words. |
| + * |
| + * @param matchVersion lucene compatibility version |
| + * @param stopwords a stopword set |
| + */ |
| + public RomanianAnalyzer(Version matchVersion, Set<?> stopwords) { |
| + this(matchVersion, stopwords, CharArraySet.EMPTY_SET); |
| + } |
| + |
| + /** |
| + * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is |
| + * provided this analyzer will add a {@link KeywordMarkerTokenFilter} before |
| + * stemming. |
| + * |
| + * @param matchVersion lucene compatibility version |
| + * @param stopwords a stopword set |
| + * @param stemExclusionSet a set of terms not to be stemmed |
| + */ |
| + public RomanianAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) { |
| + super(matchVersion, stopwords); |
| + this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy( |
| + matchVersion, stemExclusionSet)); |
| + } |
| + |
| + /** |
| + * Creates a {@link TokenStreamComponents} which tokenizes all the text in the provided |
| + * {@link Reader}. |
| + * |
| + * @return A {@link TokenStreamComponents} built from a {@link StandardTokenizer} |
| + * filtered with {@link StandardFilter}, {@link LowerCaseFilter}, |
| + * {@link StopFilter}, {@link KeywordMarkerTokenFilter} if a stem |
| + * exclusion set is provided and {@link SnowballFilter}. |
| + */ |
| + @Override |
| + protected TokenStreamComponents createComponents(String fieldName, |
| + Reader reader) { |
| + final Tokenizer source = new StandardTokenizer(matchVersion, reader); |
| + TokenStream result = new StandardFilter(source); |
| + result = new LowerCaseFilter(matchVersion, result); |
| + result = new StopFilter(matchVersion, result, stopwords); |
| + if(!stemExclusionSet.isEmpty()) |
| + result = new KeywordMarkerTokenFilter(result, stemExclusionSet); |
| + result = new SnowballFilter(result, new RomanianStemmer()); |
| + return new TokenStreamComponents(source, result); |
| + } |
| +} |
|
|
| Property changes on: contrib\analyzers\common\src\java\org\apache\lucene\analysis\ro\RomanianAnalyzer.java
|
| ___________________________________________________________________
|
| Added: svn:eol-style
|
| + native
|
|
|
| Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/ro/package.html
|
| ===================================================================
|
| --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/ro/package.html (revision 0)
|
| +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/ro/package.html (revision 0)
|
| @@ -0,0 +1,22 @@
|
| +<!doctype html public "-//w3c//dtd html 4.0 transitional//en"> |
| +<!-- |
| + Licensed to the Apache Software Foundation (ASF) under one or more |
| + contributor license agreements. See the NOTICE file distributed with |
| + this work for additional information regarding copyright ownership. |
| + The ASF licenses this file to You under the Apache License, Version 2.0 |
| + (the "License"); you may not use this file except in compliance with |
| + the License. You may obtain a copy of the License at |
| + |
| + http://www.apache.org/licenses/LICENSE-2.0 |
| + |
| + Unless required by applicable law or agreed to in writing, software |
| + distributed under the License is distributed on an "AS IS" BASIS, |
| + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + See the License for the specific language governing permissions and |
| + limitations under the License. |
| +--> |
| +<html><head></head> |
| +<body> |
| +Analyzer for Romanian. |
| +</body> |
| +</html> |
|
|
| Property changes on: contrib\analyzers\common\src\java\org\apache\lucene\analysis\ro\package.html
|
| ___________________________________________________________________
|
| Added: svn:eol-style
|
| + native
|
|
|
| Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/snowball/SnowballAnalyzer.java
|
| ===================================================================
|
| --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/snowball/SnowballAnalyzer.java (revision 906571)
|
| +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/snowball/SnowballAnalyzer.java (working copy)
|
| @@ -39,7 +39,10 @@
|
| * <li> As of 3.1, uses {@link TurkishLowerCaseFilter} for Turkish language. |
| * </ul> |
| * </p> |
| + * @deprecated Use the language-specific analyzer in contrib/analyzers instead. |
| + * This analyzer will be removed in Lucene 4.0 |
| */ |
| +@Deprecated |
| public final class SnowballAnalyzer extends Analyzer { |
| private String name; |
| private Set<?> stopSet; |
| Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/snowball/SnowballFilter.java
|
| ===================================================================
|
| --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/snowball/SnowballFilter.java (revision 906571)
|
| +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/snowball/SnowballFilter.java (working copy)
|
| @@ -21,6 +21,7 @@
|
| |
| import org.apache.lucene.analysis.TokenFilter; |
| import org.apache.lucene.analysis.TokenStream; |
| +import org.apache.lucene.analysis.tokenattributes.KeywordAttribute; |
| import org.apache.lucene.analysis.tokenattributes.TermAttribute; |
| import org.apache.lucene.analysis.tr.TurkishLowerCaseFilter; // javadoc @link |
| import org.apache.lucene.analysis.LowerCaseFilter; // javadoc @link |
| @@ -39,14 +40,14 @@
|
| */ |
| public final class SnowballFilter extends TokenFilter { |
| |
| - private SnowballProgram stemmer; |
| + private final SnowballProgram stemmer; |
| |
| - private TermAttribute termAtt; |
| + private final TermAttribute termAtt = addAttribute(TermAttribute.class); |
| + private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class); |
| |
| public SnowballFilter(TokenStream input, SnowballProgram stemmer) { |
| super(input); |
| this.stemmer = stemmer; |
| - termAtt = addAttribute(TermAttribute.class); |
| } |
| |
| /** |
| @@ -67,23 +68,24 @@
|
| } catch (Exception e) { |
| throw new RuntimeException(e.toString()); |
| } |
| - termAtt = addAttribute(TermAttribute.class); |
| } |
| |
| /** Returns the next input Token, after being stemmed */ |
| @Override |
| public final boolean incrementToken() throws IOException { |
| if (input.incrementToken()) { |
| - char termBuffer[] = termAtt.termBuffer(); |
| - final int length = termAtt.termLength(); |
| - stemmer.setCurrent(termBuffer, length); |
| - stemmer.stem(); |
| - final char finalTerm[] = stemmer.getCurrentBuffer(); |
| - final int newLength = stemmer.getCurrentBufferLength(); |
| - if (finalTerm != termBuffer) |
| - termAtt.setTermBuffer(finalTerm, 0, newLength); |
| - else |
| - termAtt.setTermLength(newLength); |
| + if (!keywordAttr.isKeyword()) { |
| + char termBuffer[] = termAtt.termBuffer(); |
| + final int length = termAtt.termLength(); |
| + stemmer.setCurrent(termBuffer, length); |
| + stemmer.stem(); |
| + final char finalTerm[] = stemmer.getCurrentBuffer(); |
| + final int newLength = stemmer.getCurrentBufferLength(); |
| + if (finalTerm != termBuffer) |
| + termAtt.setTermBuffer(finalTerm, 0, newLength); |
| + else |
| + termAtt.setTermLength(newLength); |
| + } |
| return true; |
| } else { |
| return false; |
| Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/pt/PortugueseAnalyzer.java
|
| ===================================================================
|
| --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/pt/PortugueseAnalyzer.java (revision 0)
|
| +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/pt/PortugueseAnalyzer.java (revision 0)
|
| @@ -0,0 +1,129 @@
|
| +package org.apache.lucene.analysis.pt; |
| + |
| +/** |
| + * Licensed to the Apache Software Foundation (ASF) under one or more |
| + * contributor license agreements. See the NOTICE file distributed with |
| + * this work for additional information regarding copyright ownership. |
| + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| + * (the "License"); you may not use this file except in compliance with |
| + * the License. You may obtain a copy of the License at |
| + * |
| + * http://www.apache.org/licenses/LICENSE-2.0 |
| + * |
| + * Unless required by applicable law or agreed to in writing, software |
| + * distributed under the License is distributed on an "AS IS" BASIS, |
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + * See the License for the specific language governing permissions and |
| + * limitations under the License. |
| + */ |
| + |
| +import java.io.IOException; |
| +import java.io.Reader; |
| +import java.util.Set; |
| + |
| +import org.apache.lucene.analysis.Analyzer; |
| +import org.apache.lucene.analysis.CharArraySet; |
| +import org.apache.lucene.analysis.KeywordMarkerTokenFilter; |
| +import org.apache.lucene.analysis.LowerCaseFilter; |
| +import org.apache.lucene.analysis.StopFilter; |
| +import org.apache.lucene.analysis.StopwordAnalyzerBase; |
| +import org.apache.lucene.analysis.TokenStream; |
| +import org.apache.lucene.analysis.Tokenizer; |
| +import org.apache.lucene.analysis.WordlistLoader; |
| +import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link |
| +import org.apache.lucene.analysis.snowball.SnowballFilter; |
| +import org.apache.lucene.analysis.standard.StandardFilter; |
| +import org.apache.lucene.analysis.standard.StandardTokenizer; |
| +import org.apache.lucene.util.Version; |
| +import org.tartarus.snowball.ext.PortugueseStemmer; |
| + |
| +/** |
| + * {@link Analyzer} for Portuguese. |
| + */ |
| +public final class PortugueseAnalyzer extends StopwordAnalyzerBase { |
| + private final Set<?> stemExclusionSet; |
| + |
| + /** File containing default Portuguese stopwords. */ |
| + public final static String DEFAULT_STOPWORD_FILE = "portuguese_stop.txt"; |
| + |
| + /** |
| + * Returns an unmodifiable instance of the default stop words set. |
| + * @return default stop words set. |
| + */ |
| + public static Set<?> getDefaultStopSet(){ |
| + return DefaultSetHolder.DEFAULT_STOP_SET; |
| + } |
| + |
| + /** |
| + * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class |
| + * accesses the static final set the first time. |
| + */ |
| + private static class DefaultSetHolder { |
| + static final Set<?> DEFAULT_STOP_SET; |
| + |
| + static { |
| + try { |
| + DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class, |
| + DEFAULT_STOPWORD_FILE); |
| + } catch (IOException ex) { |
| + // default set should always be present as it is part of the |
| + // distribution (JAR) |
| + throw new RuntimeException("Unable to load default stopword set"); |
| + } |
| + } |
| + } |
| + |
| + /** |
| + * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}. |
| + */ |
| + public PortugueseAnalyzer(Version matchVersion) { |
| + this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET); |
| + } |
| + |
| + /** |
| + * Builds an analyzer with the given stop words. |
| + * |
| + * @param matchVersion lucene compatibility version |
| + * @param stopwords a stopword set |
| + */ |
| + public PortugueseAnalyzer(Version matchVersion, Set<?> stopwords) { |
| + this(matchVersion, stopwords, CharArraySet.EMPTY_SET); |
| + } |
| + |
| + /** |
| + * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is |
| + * provided this analyzer will add a {@link KeywordMarkerTokenFilter} before |
| + * stemming. |
| + * |
| + * @param matchVersion lucene compatibility version |
| + * @param stopwords a stopword set |
| + * @param stemExclusionSet a set of terms not to be stemmed |
| + */ |
| + public PortugueseAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) { |
| + super(matchVersion, stopwords); |
| + this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy( |
| + matchVersion, stemExclusionSet)); |
| + } |
| + |
| + /** |
| + * Creates a {@link TokenStreamComponents} which tokenizes all the text in the provided |
| + * {@link Reader}. |
| + * |
| + * @return A {@link TokenStreamComponents} built from a {@link StandardTokenizer} |
| + * filtered with {@link StandardFilter}, {@link LowerCaseFilter}, |
| + * {@link StopFilter}, {@link KeywordMarkerTokenFilter} if a stem |
| + * exclusion set is provided and {@link SnowballFilter}. |
| + */ |
| + @Override |
| + protected TokenStreamComponents createComponents(String fieldName, |
| + Reader reader) { |
| + final Tokenizer source = new StandardTokenizer(matchVersion, reader); |
| + TokenStream result = new StandardFilter(source); |
| + result = new LowerCaseFilter(matchVersion, result); |
| + result = new StopFilter(matchVersion, result, stopwords); |
| + if(!stemExclusionSet.isEmpty()) |
| + result = new KeywordMarkerTokenFilter(result, stemExclusionSet); |
| + result = new SnowballFilter(result, new PortugueseStemmer()); |
| + return new TokenStreamComponents(source, result); |
| + } |
| +} |
|
|
| Property changes on: contrib\analyzers\common\src\java\org\apache\lucene\analysis\pt\PortugueseAnalyzer.java
|
| ___________________________________________________________________
|
| Added: svn:eol-style
|
| + native
|
|
|
| Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/pt/package.html
|
| ===================================================================
|
| --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/pt/package.html (revision 0)
|
| +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/pt/package.html (revision 0)
|
| @@ -0,0 +1,22 @@
|
| +<!doctype html public "-//w3c//dtd html 4.0 transitional//en"> |
| +<!-- |
| + Licensed to the Apache Software Foundation (ASF) under one or more |
| + contributor license agreements. See the NOTICE file distributed with |
| + this work for additional information regarding copyright ownership. |
| + The ASF licenses this file to You under the Apache License, Version 2.0 |
| + (the "License"); you may not use this file except in compliance with |
| + the License. You may obtain a copy of the License at |
| + |
| + http://www.apache.org/licenses/LICENSE-2.0 |
| + |
| + Unless required by applicable law or agreed to in writing, software |
| + distributed under the License is distributed on an "AS IS" BASIS, |
| + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + See the License for the specific language governing permissions and |
| + limitations under the License. |
| +--> |
| +<html><head></head> |
| +<body> |
| +Analyzer for Portuguese. |
| +</body> |
| +</html> |
|
|
| Property changes on: contrib\analyzers\common\src\java\org\apache\lucene\analysis\pt\package.html
|
| ___________________________________________________________________
|
| Added: svn:eol-style
|
| + native
|
|
|
| Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/tr/TurkishAnalyzer.java
|
| ===================================================================
|
| --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/tr/TurkishAnalyzer.java (revision 0)
|
| +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/tr/TurkishAnalyzer.java (revision 0)
|
| @@ -0,0 +1,132 @@
|
| +package org.apache.lucene.analysis.tr; |
| + |
| +/** |
| + * Licensed to the Apache Software Foundation (ASF) under one or more |
| + * contributor license agreements. See the NOTICE file distributed with |
| + * this work for additional information regarding copyright ownership. |
| + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| + * (the "License"); you may not use this file except in compliance with |
| + * the License. You may obtain a copy of the License at |
| + * |
| + * http://www.apache.org/licenses/LICENSE-2.0 |
| + * |
| + * Unless required by applicable law or agreed to in writing, software |
| + * distributed under the License is distributed on an "AS IS" BASIS, |
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + * See the License for the specific language governing permissions and |
| + * limitations under the License. |
| + */ |
| + |
| +import java.io.IOException; |
| +import java.io.Reader; |
| +import java.util.Set; |
| + |
| +import org.apache.lucene.analysis.Analyzer; |
| +import org.apache.lucene.analysis.CharArraySet; |
| +import org.apache.lucene.analysis.KeywordMarkerTokenFilter; |
| +import org.apache.lucene.analysis.StopFilter; |
| +import org.apache.lucene.analysis.StopwordAnalyzerBase; |
| +import org.apache.lucene.analysis.TokenStream; |
| +import org.apache.lucene.analysis.Tokenizer; |
| +import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link |
| +import org.apache.lucene.analysis.snowball.SnowballFilter; |
| +import org.apache.lucene.analysis.standard.StandardFilter; |
| +import org.apache.lucene.analysis.standard.StandardTokenizer; |
| +import org.apache.lucene.util.Version; |
| +import org.tartarus.snowball.ext.TurkishStemmer; |
| + |
| +/** |
| + * {@link Analyzer} for Turkish. |
| + */ |
| +public final class TurkishAnalyzer extends StopwordAnalyzerBase { |
| + private final Set<?> stemExclusionSet; |
| + |
| + /** File containing default Turkish stopwords. */ |
| + public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt"; |
| + /** |
| + * The comment character in the stopwords file. |
| + * All lines prefixed with this will be ignored. |
| + */ |
| + private static final String STOPWORDS_COMMENT = "#"; |
| + |
| + /** |
| + * Returns an unmodifiable instance of the default stop words set. |
| + * @return default stop words set. |
| + */ |
| + public static Set<?> getDefaultStopSet(){ |
| + return DefaultSetHolder.DEFAULT_STOP_SET; |
| + } |
| + |
| + /** |
| + * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class |
| + * accesses the static final set the first time. |
| + */ |
| + private static class DefaultSetHolder { |
| + static final Set<?> DEFAULT_STOP_SET; |
| + |
| + static { |
| + try { |
| + DEFAULT_STOP_SET = loadStopwordSet(false, TurkishAnalyzer.class, |
| + DEFAULT_STOPWORD_FILE, STOPWORDS_COMMENT); |
| + } catch (IOException ex) { |
| + // default set should always be present as it is part of the |
| + // distribution (JAR) |
| + throw new RuntimeException("Unable to load default stopword set"); |
| + } |
| + } |
| + } |
| + |
| + /** |
| + * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}. |
| + */ |
| + public TurkishAnalyzer(Version matchVersion) { |
| + this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET); |
| + } |
| + |
| + /** |
| + * Builds an analyzer with the given stop words. |
| + * |
| + * @param matchVersion lucene compatibility version |
| + * @param stopwords a stopword set |
| + */ |
| + public TurkishAnalyzer(Version matchVersion, Set<?> stopwords) { |
| + this(matchVersion, stopwords, CharArraySet.EMPTY_SET); |
| + } |
| + |
| + /** |
| + * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is |
| + * provided this analyzer will add a {@link KeywordMarkerTokenFilter} before |
| + * stemming. |
| + * |
| + * @param matchVersion lucene compatibility version |
| + * @param stopwords a stopword set |
| + * @param stemExclusionSet a set of terms not to be stemmed |
| + */ |
| + public TurkishAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) { |
| + super(matchVersion, stopwords); |
| + this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy( |
| + matchVersion, stemExclusionSet)); |
| + } |
| + |
| + /** |
| + * Creates a {@link TokenStreamComponents} which tokenizes all the text in the provided |
| + * {@link Reader}. |
| + * |
| + * @return A {@link TokenStreamComponents} built from a {@link StandardTokenizer} |
| + * filtered with {@link StandardFilter}, {@link TurkishLowerCaseFilter}, |
| + * {@link StopFilter}, {@link KeywordMarkerTokenFilter} if a stem |
| + * exclusion set is provided and {@link SnowballFilter}. |
| + */ |
| + @Override |
| + protected TokenStreamComponents createComponents(String fieldName, |
| + Reader reader) { |
| + final Tokenizer source = new StandardTokenizer(matchVersion, reader); |
| + TokenStream result = new StandardFilter(source); |
| + result = new TurkishLowerCaseFilter(result); |
| + result = new StopFilter(matchVersion, result, stopwords); |
| + if(!stemExclusionSet.isEmpty()) |
| + result = new KeywordMarkerTokenFilter(result, stemExclusionSet); |
| + result = new SnowballFilter(result, new TurkishStemmer()); |
| + return new TokenStreamComponents(source, result); |
| + } |
| +} |
|
|
| Property changes on: contrib\analyzers\common\src\java\org\apache\lucene\analysis\tr\TurkishAnalyzer.java
|
| ___________________________________________________________________
|
| Added: svn:eol-style
|
| + native
|
|
|
| Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/tr/package.html
|
| ===================================================================
|
| --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/tr/package.html (revision 906571)
|
| +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/tr/package.html (working copy)
|
| @@ -17,15 +17,6 @@
|
| --> |
| <html><head></head> |
| <body> |
| -Support for Turkish. |
| -<p> |
| -This package contains just the TokenStream for handling turkish casing, |
| -for a stemmer please see the snowball package. |
| -</p> |
| -<p> |
| -WARNING: SnowballAnalyzer uses LowerCaseFilter by default, even when the |
| -language is set to Turkish, so you will need to construct your own |
| -analyzer that combines TurkishLowerCaseFilter and SnowballFilter. |
| -</p> |
| +Analyzer for Turkish. |
| </body> |
| -</html>
|
| \ No newline at end of file
|
| +</html> |
| Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianLetterTokenizer.java
|
| ===================================================================
|
| --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianLetterTokenizer.java (revision 906571)
|
| +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianLetterTokenizer.java (working copy)
|
| @@ -21,6 +21,7 @@
|
| import org.apache.lucene.analysis.CharTokenizer; |
| import org.apache.lucene.analysis.Tokenizer; // for javadocs |
| import org.apache.lucene.analysis.LetterTokenizer; // for javadocs |
| +import org.apache.lucene.analysis.standard.StandardTokenizer; // for javadocs |
| import org.apache.lucene.util.AttributeSource; |
| import org.apache.lucene.util.Version; |
| |
| @@ -35,8 +36,11 @@
|
| * <li>As of 3.1, {@link CharTokenizer} uses an int based API to normalize and |
| * detect token characters. See {@link CharTokenizer#isTokenChar(int)} and |
| * {@link CharTokenizer#normalize(int)} for details.</li> |
| - * </ul> |
| + * </ul> |
| + * @deprecated Use {@link StandardTokenizer} instead, which has the same functionality. |
| + * This filter will be removed in Lucene 4.0 |
| */ |
| +@Deprecated |
| public class RussianLetterTokenizer extends CharTokenizer |
| { |
| private static final int DIGIT_0 = '0'; |
| Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianStemmer.java
|
| ===================================================================
|
| --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianStemmer.java (revision 906571)
|
| +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianStemmer.java (working copy)
|
| @@ -19,7 +19,10 @@
|
| |
| /** |
| * Russian stemming algorithm implementation (see http://snowball.sourceforge.net for detailed description). |
| + * @deprecated Use {@link org.tartarus.snowball.ext.RussianStemmer} instead, |
| + * which has the same functionality. This filter will be removed in Lucene 4.0 |
| */ |
| +@Deprecated |
| class RussianStemmer |
| { |
| // positions of RV, R1 and R2 respectively |
| Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianStemFilter.java
|
| ===================================================================
|
| --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianStemFilter.java (revision 906571)
|
| +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianStemFilter.java (working copy)
|
| @@ -24,6 +24,7 @@
|
| import org.apache.lucene.analysis.tokenattributes.KeywordAttribute; |
| import org.apache.lucene.analysis.tokenattributes.TermAttribute; |
| import org.apache.lucene.analysis.ru.RussianStemmer;//javadoc @link |
| +import org.apache.lucene.analysis.snowball.SnowballFilter; // javadoc @link |
| |
| import java.io.IOException; |
| |
| @@ -40,7 +41,11 @@
|
| * the {@link KeywordAttribute} before this {@link TokenStream}. |
| * </p> |
| * @see KeywordMarkerTokenFilter |
| + * @deprecated Use {@link SnowballFilter} with |
| + * {@link org.tartarus.snowball.ext.RussianStemmer} instead, which has the |
| + * same functionality. This filter will be removed in Lucene 4.0 |
| */ |
| +@Deprecated |
| public final class RussianStemFilter extends TokenFilter |
| { |
| /** |
| Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java
|
| ===================================================================
|
| --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java (revision 906571)
|
| +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java (working copy)
|
| @@ -17,6 +17,7 @@
|
| * limitations under the License. |
| */ |
| |
| +import java.io.IOException; |
| import java.io.Reader; |
| import java.util.Arrays; |
| import java.util.Map; |
| @@ -26,11 +27,15 @@
|
| import org.apache.lucene.analysis.CharArraySet; |
| import org.apache.lucene.analysis.LowerCaseFilter; |
| import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link |
| +import org.apache.lucene.analysis.snowball.SnowballFilter; |
| +import org.apache.lucene.analysis.standard.StandardFilter; |
| +import org.apache.lucene.analysis.standard.StandardTokenizer; |
| import org.apache.lucene.analysis.KeywordMarkerTokenFilter; |
| import org.apache.lucene.analysis.StopFilter; |
| import org.apache.lucene.analysis.StopwordAnalyzerBase; |
| import org.apache.lucene.analysis.TokenStream; |
| import org.apache.lucene.analysis.Tokenizer; |
| +import org.apache.lucene.analysis.WordlistLoader; |
| import org.apache.lucene.util.Version; |
| |
| /** |
| @@ -44,9 +49,11 @@
|
| public final class RussianAnalyzer extends StopwordAnalyzerBase |
| { |
| /** |
| - * List of typical Russian stopwords. |
| + * List of typical Russian stopwords. (for backwards compatibility) |
| + * @deprecated Remove this for LUCENE 4.0 |
| */ |
| - private static final String[] RUSSIAN_STOP_WORDS = { |
| + @Deprecated |
| + private static final String[] RUSSIAN_STOP_WORDS_30 = { |
| "а", "без", "более", "бы", "был", "была", "были", "было", "быть", "в", |
| "вам", "вас", "весь", "во", "вот", "все", "всего", "всех", "вы", "где", |
| "да", "даже", "для", "до", "его", "ее", "ей", "ею", "если", "есть", |
| @@ -59,10 +66,27 @@
|
| "чем", "что", "чтобы", "чье", "чья", "эта", "эти", "это", "я" |
| }; |
| |
| + /** File containing default Russian stopwords. */ |
| + public final static String DEFAULT_STOPWORD_FILE = "russian_stop.txt"; |
| + |
| private static class DefaultSetHolder { |
| - static final Set<?> DEFAULT_STOP_SET = CharArraySet |
| + /** @deprecated remove this for Lucene 4.0 */ |
| + @Deprecated |
| + static final Set<?> DEFAULT_STOP_SET_30 = CharArraySet |
| .unmodifiableSet(new CharArraySet(Version.LUCENE_CURRENT, |
| - Arrays.asList(RUSSIAN_STOP_WORDS), false)); |
| + Arrays.asList(RUSSIAN_STOP_WORDS_30), false)); |
| + static final Set<?> DEFAULT_STOP_SET; |
| + |
| + static { |
| + try { |
| + DEFAULT_STOP_SET = |
| + WordlistLoader.getSnowballWordSet(SnowballFilter.class, DEFAULT_STOPWORD_FILE); |
| + } catch (IOException ex) { |
| + // default set should always be present as it is part of the |
| + // distribution (JAR) |
| + throw new RuntimeException("Unable to load default stopword set"); |
| + } |
| + } |
| } |
| |
| private final Set<?> stemExclusionSet; |
| @@ -77,7 +101,9 @@
|
| } |
| |
| public RussianAnalyzer(Version matchVersion) { |
| - this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET); |
| + this(matchVersion, |
| + matchVersion.onOrAfter(Version.LUCENE_31) ? DefaultSetHolder.DEFAULT_STOP_SET |
| + : DefaultSetHolder.DEFAULT_STOP_SET_30); |
| } |
| |
| /** |
| @@ -132,19 +158,30 @@
|
| * provided {@link Reader}. |
| * |
| * @return {@link TokenStreamComponents} built from a |
| - * {@link RussianLetterTokenizer} filtered with |
| + * {@link StandardTokenizer} filtered with {@link StandardFilter}, |
| * {@link LowerCaseFilter}, {@link StopFilter}, |
| - * and {@link RussianStemFilter} |
| + * {@link KeywordMarkerTokenFilter} if a stem exclusion set is provided, |
| + * and {@link SnowballFilter} |
| */ |
| @Override |
| protected TokenStreamComponents createComponents(String fieldName, |
| Reader reader) { |
| - final Tokenizer source = new RussianLetterTokenizer(matchVersion, reader); |
| - TokenStream result = new LowerCaseFilter(matchVersion, source); |
| - result = new StopFilter(matchVersion, result, stopwords); |
| - if(!stemExclusionSet.isEmpty()) |
| - result = new KeywordMarkerTokenFilter(result, stemExclusionSet); |
| - return new TokenStreamComponents(source, new RussianStemFilter(result)); |
| - |
| + if (matchVersion.onOrAfter(Version.LUCENE_31)) { |
| + final Tokenizer source = new StandardTokenizer(matchVersion, reader); |
| + TokenStream result = new StandardFilter(source); |
| + result = new LowerCaseFilter(matchVersion, result); |
| + result = new StopFilter(matchVersion, result, stopwords); |
| + if (!stemExclusionSet.isEmpty()) result = new KeywordMarkerTokenFilter( |
| + result, stemExclusionSet); |
| + result = new SnowballFilter(result, new org.tartarus.snowball.ext.RussianStemmer()); |
| + return new TokenStreamComponents(source, result); |
| + } else { |
| + final Tokenizer source = new RussianLetterTokenizer(matchVersion, reader); |
| + TokenStream result = new LowerCaseFilter(matchVersion, source); |
| + result = new StopFilter(matchVersion, result, stopwords); |
| + if (!stemExclusionSet.isEmpty()) result = new KeywordMarkerTokenFilter( |
| + result, stemExclusionSet); |
| + return new TokenStreamComponents(source, new RussianStemFilter(result)); |
| + } |
| } |
| } |
| Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekAnalyzer.java
|
| ===================================================================
|
| --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekAnalyzer.java (revision 906571)
|
| +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekAnalyzer.java (working copy)
|
| @@ -24,6 +24,7 @@
|
| import org.apache.lucene.analysis.StopwordAnalyzerBase; |
| import org.apache.lucene.analysis.TokenStream; |
| import org.apache.lucene.analysis.Tokenizer; |
| +import org.apache.lucene.analysis.standard.StandardFilter; |
| import org.apache.lucene.analysis.standard.StandardTokenizer; |
| import org.apache.lucene.analysis.standard.StandardAnalyzer; // for javadoc |
| import org.apache.lucene.util.Version; |
| @@ -117,13 +118,15 @@
|
| * |
| * @return {@link TokenStreamComponents} built from a |
| * {@link StandardTokenizer} filtered with |
| - * {@link GreekLowerCaseFilter} and {@link StopFilter} |
| + * {@link GreekLowerCaseFilter}, {@link StandardFilter} and {@link StopFilter} |
| */ |
| @Override |
| protected TokenStreamComponents createComponents(String fieldName, |
| Reader reader) { |
| final Tokenizer source = new StandardTokenizer(matchVersion, reader); |
| - final TokenStream result = new GreekLowerCaseFilter(source); |
| + TokenStream result = new GreekLowerCaseFilter(source); |
| + if (matchVersion.onOrAfter(Version.LUCENE_31)) |
| + result = new StandardFilter(result); |
| return new TokenStreamComponents(source, new StopFilter(matchVersion, result, stopwords)); |
| } |
| } |
| Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/en/EnglishAnalyzer.java
|
| ===================================================================
|
| --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/en/EnglishAnalyzer.java (revision 0)
|
| +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/en/EnglishAnalyzer.java (revision 0)
|
| @@ -0,0 +1,113 @@
|
| +package org.apache.lucene.analysis.en; |
| + |
| +/** |
| + * Licensed to the Apache Software Foundation (ASF) under one or more |
| + * contributor license agreements. See the NOTICE file distributed with |
| + * this work for additional information regarding copyright ownership. |
| + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| + * (the "License"); you may not use this file except in compliance with |
| + * the License. You may obtain a copy of the License at |
| + * |
| + * http://www.apache.org/licenses/LICENSE-2.0 |
| + * |
| + * Unless required by applicable law or agreed to in writing, software |
| + * distributed under the License is distributed on an "AS IS" BASIS, |
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + * See the License for the specific language governing permissions and |
| + * limitations under the License. |
| + */ |
| + |
| +import java.io.Reader; |
| +import java.util.Set; |
| + |
| +import org.apache.lucene.analysis.Analyzer; |
| +import org.apache.lucene.analysis.CharArraySet; |
| +import org.apache.lucene.analysis.KeywordMarkerTokenFilter; |
| +import org.apache.lucene.analysis.LowerCaseFilter; |
| +import org.apache.lucene.analysis.PorterStemFilter; |
| +import org.apache.lucene.analysis.StopFilter; |
| +import org.apache.lucene.analysis.StopwordAnalyzerBase; |
| +import org.apache.lucene.analysis.TokenStream; |
| +import org.apache.lucene.analysis.Tokenizer; |
| +import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link |
| +import org.apache.lucene.analysis.standard.StandardAnalyzer; |
| +import org.apache.lucene.analysis.standard.StandardFilter; |
| +import org.apache.lucene.analysis.standard.StandardTokenizer; |
| +import org.apache.lucene.util.Version; |
| + |
| +/** |
| + * {@link Analyzer} for English. |
| + */ |
| +public final class EnglishAnalyzer extends StopwordAnalyzerBase { |
| + private final Set<?> stemExclusionSet; |
| + |
| + /** |
| + * Returns an unmodifiable instance of the default stop words set. |
| + * @return default stop words set. |
| + */ |
| + public static Set<?> getDefaultStopSet(){ |
| + return DefaultSetHolder.DEFAULT_STOP_SET; |
| + } |
| + |
| + /** |
| + * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class |
| + * accesses the static final set the first time. |
| + */ |
| + private static class DefaultSetHolder { |
| + static final Set<?> DEFAULT_STOP_SET = StandardAnalyzer.STOP_WORDS_SET; |
| + } |
| + |
| + /** |
| + * Builds an analyzer with the default stop words: {@link #getDefaultStopSet}. |
| + */ |
| + public EnglishAnalyzer(Version matchVersion) { |
| + this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET); |
| + } |
| + |
| + /** |
| + * Builds an analyzer with the given stop words. |
| + * |
| + * @param matchVersion lucene compatibility version |
| + * @param stopwords a stopword set |
| + */ |
| + public EnglishAnalyzer(Version matchVersion, Set<?> stopwords) { |
| + this(matchVersion, stopwords, CharArraySet.EMPTY_SET); |
| + } |
| + |
| + /** |
| + * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is |
| + * provided this analyzer will add a {@link KeywordMarkerTokenFilter} before |
| + * stemming. |
| + * |
| + * @param matchVersion lucene compatibility version |
| + * @param stopwords a stopword set |
| + * @param stemExclusionSet a set of terms not to be stemmed |
| + */ |
| + public EnglishAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) { |
| + super(matchVersion, stopwords); |
| + this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy( |
| + matchVersion, stemExclusionSet)); |
| + } |
| + |
| + /** |
| + * Creates a {@link TokenStreamComponents} which tokenizes all the text in the provided |
| + * {@link Reader}. |
| + * |
| + * @return A {@link TokenStreamComponents} built from a {@link StandardTokenizer} |
| + * filtered with {@link StandardFilter}, {@link LowerCaseFilter}, |
| + * {@link StopFilter}, {@link KeywordMarkerTokenFilter} if a stem |
| + * exclusion set is provided and {@link PorterStemFilter}. |
| + */ |
| + @Override |
| + protected TokenStreamComponents createComponents(String fieldName, |
| + Reader reader) { |
| + final Tokenizer source = new StandardTokenizer(matchVersion, reader); |
| + TokenStream result = new StandardFilter(source); |
| + result = new LowerCaseFilter(matchVersion, result); |
| + result = new StopFilter(matchVersion, result, stopwords); |
| + if(!stemExclusionSet.isEmpty()) |
| + result = new KeywordMarkerTokenFilter(result, stemExclusionSet); |
| + result = new PorterStemFilter(result); |
| + return new TokenStreamComponents(source, result); |
| + } |
| +} |
|
|
| Property changes on: contrib\analyzers\common\src\java\org\apache\lucene\analysis\en\EnglishAnalyzer.java
|
| ___________________________________________________________________
|
| Added: svn:eol-style
|
| + native
|
|
|
| Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/en/package.html
|
| ===================================================================
|
| --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/en/package.html (revision 0)
|
| +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/en/package.html (revision 0)
|
| @@ -0,0 +1,22 @@
|
| +<!doctype html public "-//w3c//dtd html 4.0 transitional//en"> |
| +<!-- |
| + Licensed to the Apache Software Foundation (ASF) under one or more |
| + contributor license agreements. See the NOTICE file distributed with |
| + this work for additional information regarding copyright ownership. |
| + The ASF licenses this file to You under the Apache License, Version 2.0 |
| + (the "License"); you may not use this file except in compliance with |
| + the License. You may obtain a copy of the License at |
| + |
| + http://www.apache.org/licenses/LICENSE-2.0 |
| + |
| + Unless required by applicable law or agreed to in writing, software |
| + distributed under the License is distributed on an "AS IS" BASIS, |
| + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + See the License for the specific language governing permissions and |
| + limitations under the License. |
| +--> |
| +<html><head></head> |
| +<body> |
| +Analyzer for English. |
| +</body> |
| +</html> |
|
|
| Property changes on: contrib\analyzers\common\src\java\org\apache\lucene\analysis\en\package.html
|
| ___________________________________________________________________
|
| Added: svn:eol-style
|
| + native
|
|
|
| Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/position/PositionFilter.java
|
| ===================================================================
|
| --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/position/PositionFilter.java (revision 906571)
|
| +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/position/PositionFilter.java (working copy)
|
| @@ -21,7 +21,6 @@
|
| |
| import org.apache.lucene.analysis.TokenFilter; |
| import org.apache.lucene.analysis.TokenStream; |
| -import org.apache.lucene.analysis.Token; |
| import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; |
| |
| /** Set the positionIncrement of all tokens to the "positionIncrement", |
| Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/es/SpanishAnalyzer.java
|
| ===================================================================
|
| --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/es/SpanishAnalyzer.java (revision 0)
|
| +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/es/SpanishAnalyzer.java (revision 0)
|
| @@ -0,0 +1,129 @@
|
| +package org.apache.lucene.analysis.es; |
| + |
| +/** |
| + * Licensed to the Apache Software Foundation (ASF) under one or more |
| + * contributor license agreements. See the NOTICE file distributed with |
| + * this work for additional information regarding copyright ownership. |
| + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| + * (the "License"); you may not use this file except in compliance with |
| + * the License. You may obtain a copy of the License at |
| + * |
| + * http://www.apache.org/licenses/LICENSE-2.0 |
| + * |
| + * Unless required by applicable law or agreed to in writing, software |
| + * distributed under the License is distributed on an "AS IS" BASIS, |
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + * See the License for the specific language governing permissions and |
| + * limitations under the License. |
| + */ |
| + |
| +import java.io.IOException; |
| +import java.io.Reader; |
| +import java.util.Set; |
| + |
| +import org.apache.lucene.analysis.Analyzer; |
| +import org.apache.lucene.analysis.CharArraySet; |
| +import org.apache.lucene.analysis.KeywordMarkerTokenFilter; |
| +import org.apache.lucene.analysis.LowerCaseFilter; |
| +import org.apache.lucene.analysis.StopFilter; |
| +import org.apache.lucene.analysis.StopwordAnalyzerBase; |
| +import org.apache.lucene.analysis.TokenStream; |
| +import org.apache.lucene.analysis.Tokenizer; |
| +import org.apache.lucene.analysis.WordlistLoader; |
| +import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link |
| +import org.apache.lucene.analysis.snowball.SnowballFilter; |
| +import org.apache.lucene.analysis.standard.StandardFilter; |
| +import org.apache.lucene.analysis.standard.StandardTokenizer; |
| +import org.apache.lucene.util.Version; |
| +import org.tartarus.snowball.ext.SpanishStemmer; |
| + |
| +/** |
| + * {@link Analyzer} for Spanish. |
| + */ |
| +public final class SpanishAnalyzer extends StopwordAnalyzerBase { |
| + private final Set<?> stemExclusionSet; |
| + |
| + /** File containing default Spanish stopwords. */ |
| + public final static String DEFAULT_STOPWORD_FILE = "spanish_stop.txt"; |
| + |
| + /** |
| + * Returns an unmodifiable instance of the default stop words set. |
| + * @return default stop words set. |
| + */ |
| + public static Set<?> getDefaultStopSet(){ |
| + return DefaultSetHolder.DEFAULT_STOP_SET; |
| + } |
| + |
| + /** |
| + * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class |
| + * accesses the static final set the first time. |
| + */ |
| + private static class DefaultSetHolder { |
| + static final Set<?> DEFAULT_STOP_SET; |
| + |
| + static { |
| + try { |
| + DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class, |
| + DEFAULT_STOPWORD_FILE); |
| + } catch (IOException ex) { |
| + // default set should always be present as it is part of the |
| + // distribution (JAR) |
| + throw new RuntimeException("Unable to load default stopword set"); |
| + } |
| + } |
| + } |
| + |
| + /** |
| + * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}. |
| + */ |
| + public SpanishAnalyzer(Version matchVersion) { |
| + this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET); |
| + } |
| + |
| + /** |
| + * Builds an analyzer with the given stop words. |
| + * |
| + * @param matchVersion lucene compatibility version |
| + * @param stopwords a stopword set |
| + */ |
| + public SpanishAnalyzer(Version matchVersion, Set<?> stopwords) { |
| + this(matchVersion, stopwords, CharArraySet.EMPTY_SET); |
| + } |
| + |
| + /** |
| + * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is |
| + * provided this analyzer will add a {@link KeywordMarkerTokenFilter} before |
| + * stemming. |
| + * |
| + * @param matchVersion lucene compatibility version |
| + * @param stopwords a stopword set |
| + * @param stemExclusionSet a set of terms not to be stemmed |
| + */ |
| + public SpanishAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) { |
| + super(matchVersion, stopwords); |
| + this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy( |
| + matchVersion, stemExclusionSet)); |
| + } |
| + |
| + /** |
| + * Creates a {@link TokenStreamComponents} which tokenizes all the text in the provided |
| + * {@link Reader}. |
| + * |
| + * @return A {@link TokenStreamComponents} built from a {@link StandardTokenizer} |
| + * filtered with {@link StandardFilter}, {@link LowerCaseFilter}, |
| + * {@link StopFilter}, {@link KeywordMarkerTokenFilter} if a stem |
| + * exclusion set is provided and {@link SnowballFilter}. |
| + */ |
| + @Override |
| + protected TokenStreamComponents createComponents(String fieldName, |
| + Reader reader) { |
| + final Tokenizer source = new StandardTokenizer(matchVersion, reader); |
| + TokenStream result = new StandardFilter(source); |
| + result = new LowerCaseFilter(matchVersion, result); |
| + result = new StopFilter(matchVersion, result, stopwords); |
| + if(!stemExclusionSet.isEmpty()) |
| + result = new KeywordMarkerTokenFilter(result, stemExclusionSet); |
| + result = new SnowballFilter(result, new SpanishStemmer()); |
| + return new TokenStreamComponents(source, result); |
| + } |
| +} |
|
|
| Property changes on: contrib\analyzers\common\src\java\org\apache\lucene\analysis\es\SpanishAnalyzer.java
|
| ___________________________________________________________________
|
| Added: svn:eol-style
|
| + native
|
|
|
| Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/es/package.html
|
| ===================================================================
|
| --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/es/package.html (revision 0)
|
| +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/es/package.html (revision 0)
|
| @@ -0,0 +1,22 @@
|
| +<!doctype html public "-//w3c//dtd html 4.0 transitional//en"> |
| +<!-- |
| + Licensed to the Apache Software Foundation (ASF) under one or more |
| + contributor license agreements. See the NOTICE file distributed with |
| + this work for additional information regarding copyright ownership. |
| + The ASF licenses this file to You under the Apache License, Version 2.0 |
| + (the "License"); you may not use this file except in compliance with |
| + the License. You may obtain a copy of the License at |
| + |
| + http://www.apache.org/licenses/LICENSE-2.0 |
| + |
| + Unless required by applicable law or agreed to in writing, software |
| + distributed under the License is distributed on an "AS IS" BASIS, |
| + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + See the License for the specific language governing permissions and |
| + limitations under the License. |
| +--> |
| +<html><head></head> |
| +<body> |
| +Analyzer for Spanish. |
| +</body> |
| +</html> |
|
|
| Property changes on: contrib\analyzers\common\src\java\org\apache\lucene\analysis\es\package.html
|
| ___________________________________________________________________
|
| Added: svn:eol-style
|
| + native
|
|
|
| Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/it/ItalianAnalyzer.java
|
| ===================================================================
|
| --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/it/ItalianAnalyzer.java (revision 0)
|
| +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/it/ItalianAnalyzer.java (revision 0)
|
| @@ -0,0 +1,129 @@
|
| +package org.apache.lucene.analysis.it; |
| + |
| +/** |
| + * Licensed to the Apache Software Foundation (ASF) under one or more |
| + * contributor license agreements. See the NOTICE file distributed with |
| + * this work for additional information regarding copyright ownership. |
| + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| + * (the "License"); you may not use this file except in compliance with |
| + * the License. You may obtain a copy of the License at |
| + * |
| + * http://www.apache.org/licenses/LICENSE-2.0 |
| + * |
| + * Unless required by applicable law or agreed to in writing, software |
| + * distributed under the License is distributed on an "AS IS" BASIS, |
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + * See the License for the specific language governing permissions and |
| + * limitations under the License. |
| + */ |
| + |
| +import java.io.IOException; |
| +import java.io.Reader; |
| +import java.util.Set; |
| + |
| +import org.apache.lucene.analysis.Analyzer; |
| +import org.apache.lucene.analysis.CharArraySet; |
| +import org.apache.lucene.analysis.KeywordMarkerTokenFilter; |
| +import org.apache.lucene.analysis.LowerCaseFilter; |
| +import org.apache.lucene.analysis.StopFilter; |
| +import org.apache.lucene.analysis.StopwordAnalyzerBase; |
| +import org.apache.lucene.analysis.TokenStream; |
| +import org.apache.lucene.analysis.Tokenizer; |
| +import org.apache.lucene.analysis.WordlistLoader; |
| +import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link |
| +import org.apache.lucene.analysis.snowball.SnowballFilter; |
| +import org.apache.lucene.analysis.standard.StandardFilter; |
| +import org.apache.lucene.analysis.standard.StandardTokenizer; |
| +import org.apache.lucene.util.Version; |
| +import org.tartarus.snowball.ext.ItalianStemmer; |
| + |
| +/** |
| + * {@link Analyzer} for Italian. |
| + */ |
| +public final class ItalianAnalyzer extends StopwordAnalyzerBase { |
| + private final Set<?> stemExclusionSet; |
| + |
| + /** File containing default Italian stopwords. */ |
| + public final static String DEFAULT_STOPWORD_FILE = "italian_stop.txt"; |
| + |
| + /** |
| + * Returns an unmodifiable instance of the default stop words set. |
| + * @return default stop words set. |
| + */ |
| + public static Set<?> getDefaultStopSet(){ |
| + return DefaultSetHolder.DEFAULT_STOP_SET; |
| + } |
| + |
| + /** |
| + * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class |
| + * accesses the static final set the first time. |
| + */ |
| + private static class DefaultSetHolder { |
| + static final Set<?> DEFAULT_STOP_SET; |
| + |
| + static { |
| + try { |
| + DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(SnowballFilter.class, |
| + DEFAULT_STOPWORD_FILE); |
| + } catch (IOException ex) { |
| + // default set should always be present as it is part of the |
| + // distribution (JAR) |
| + throw new RuntimeException("Unable to load default stopword set"); |
| + } |
| + } |
| + } |
| + |
| + /** |
| + * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}. |
| + */ |
| + public ItalianAnalyzer(Version matchVersion) { |
| + this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET); |
| + } |
| + |
| + /** |
| + * Builds an analyzer with the given stop words. |
| + * |
| + * @param matchVersion lucene compatibility version |
| + * @param stopwords a stopword set |
| + */ |
| + public ItalianAnalyzer(Version matchVersion, Set<?> stopwords) { |
| + this(matchVersion, stopwords, CharArraySet.EMPTY_SET); |
| + } |
| + |
| + /** |
| + * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is |
| + * provided this analyzer will add a {@link KeywordMarkerTokenFilter} before |
| + * stemming. |
| + * |
| + * @param matchVersion lucene compatibility version |
| + * @param stopwords a stopword set |
| + * @param stemExclusionSet a set of terms not to be stemmed |
| + */ |
| + public ItalianAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) { |
| + super(matchVersion, stopwords); |
| + this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy( |
| + matchVersion, stemExclusionSet)); |
| + } |
| + |
| + /** |
| + * Creates a {@link TokenStreamComponents} which tokenizes all the text in the provided |
| + * {@link Reader}. |
| + * |
| + * @return A {@link TokenStreamComponents} built from a {@link StandardTokenizer} |
| + * filtered with {@link StandardFilter}, {@link LowerCaseFilter}, |
| + * {@link StopFilter}, {@link KeywordMarkerTokenFilter} if a stem |
| + * exclusion set is provided and {@link SnowballFilter}. |
| + */ |
| + @Override |
| + protected TokenStreamComponents createComponents(String fieldName, |
| + Reader reader) { |
| + final Tokenizer source = new StandardTokenizer(matchVersion, reader); |
| + TokenStream result = new StandardFilter(source); |
| + result = new LowerCaseFilter(matchVersion, result); |
| + result = new StopFilter(matchVersion, result, stopwords); |
| + if(!stemExclusionSet.isEmpty()) |
| + result = new KeywordMarkerTokenFilter(result, stemExclusionSet); |
| + result = new SnowballFilter(result, new ItalianStemmer()); |
| + return new TokenStreamComponents(source, result); |
| + } |
| +} |
|
|
| Property changes on: contrib\analyzers\common\src\java\org\apache\lucene\analysis\it\ItalianAnalyzer.java
|
| ___________________________________________________________________
|
| Added: svn:eol-style
|
| + native
|
|
|
| Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/it/package.html
|
| ===================================================================
|
| --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/it/package.html (revision 0)
|
| +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/it/package.html (revision 0)
|
| @@ -0,0 +1,22 @@
|
| +<!doctype html public "-//w3c//dtd html 4.0 transitional//en"> |
| +<!-- |
| + Licensed to the Apache Software Foundation (ASF) under one or more |
| + contributor license agreements. See the NOTICE file distributed with |
| + this work for additional information regarding copyright ownership. |
| + The ASF licenses this file to You under the Apache License, Version 2.0 |
| + (the "License"); you may not use this file except in compliance with |
| + the License. You may obtain a copy of the License at |
| + |
| + http://www.apache.org/licenses/LICENSE-2.0 |
| + |
| + Unless required by applicable law or agreed to in writing, software |
| + distributed under the License is distributed on an "AS IS" BASIS, |
| + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + See the License for the specific language governing permissions and |
| + limitations under the License. |
| +--> |
| +<html><head></head> |
| +<body> |
| +Analyzer for Italian. |
| +</body> |
| +</html> |
|
|
| Property changes on: contrib\analyzers\common\src\java\org\apache\lucene\analysis\it\package.html
|
| ___________________________________________________________________
|
| Added: svn:eol-style
|
| + native
|
|
|
| Index: contrib/analyzers/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt
|
| ===================================================================
|
| --- contrib/analyzers/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt (revision 0)
|
| +++ contrib/analyzers/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt (revision 0)
|
| @@ -0,0 +1,233 @@
|
| +# This file was created by Jacques Savoy and is distributed under the BSD license. |
| +# See http://members.unine.ch/jacques.savoy/clef/index.html. |
| +# Also see http://www.opensource.org/licenses/bsd-license.html |
| +acea |
| +aceasta |
| +această |
| +aceea |
| +acei |
| +aceia |
| +acel |
| +acela |
| +acele |
| +acelea |
| +acest |
| +acesta |
| +aceste |
| +acestea |
| +aceşti |
| +aceştia |
| +acolo |
| +acum |
| +ai |
| +aia |
| +aibă |
| +aici |
| +al |
| +ăla |
| +ale |
| +alea |
| +ălea |
| +altceva |
| +altcineva |
| +am |
| +ar |
| +are |
| +aş |
| +aşadar |
| +asemenea |
| +asta |
| +ăsta |
| +astăzi |
| +astea |
| +ăstea |
| +ăştia |
| +asupra |
| +aţi |
| +au |
| +avea |
| +avem |
| +aveţi |
| +azi |
| +bine |
| +bucur |
| +bună |
| +ca |
| +că |
| +căci |
| +când |
| +care |
| +cărei |
| +căror |
| +cărui |
| +cât |
| +câte |
| +câţi |
| +către |
| +câtva |
| +ce |
| +cel |
| +ceva |
| +chiar |
| +cînd |
| +cine |
| +cineva |
| +cît |
| +cîte |
| +cîţi |
| +cîtva |
| +contra |
| +cu |
| +cum |
| +cumva |
| +curând |
| +curînd |
| +da |
| +dă |
| +dacă |
| +dar |
| +datorită |
| +de |
| +deci |
| +deja |
| +deoarece |
| +departe |
| +deşi |
| +din |
| +dinaintea |
| +dintr |
| +dintre |
| +drept |
| +după |
| +ea |
| +ei |
| +el |
| +ele |
| +eram |
| +este |
| +eşti |
| +eu |
| +face |
| +fără |
| +fi |
| +fie |
| +fiecare |
| +fii |
| +fim |
| +fiţi |
| +iar |
| +ieri |
| +îi |
| +îl |
| +îmi |
| +împotriva |
| +în |
| +înainte |
| +înaintea |
| +încât |
| +încît |
| +încotro |
| +între |
| +întrucât |
| +întrucît |
| +îţi |
| +la |
| +lângă |
| +le |
| +li |
| +lîngă |
| +lor |
| +lui |
| +mă |
| +mâine |
| +mea |
| +mei |
| +mele |
| +mereu |
| +meu |
| +mi |
| +mine |
| +mult |
| +multă |
| +mulţi |
| +ne |
| +nicăieri |
| +nici |
| +nimeni |
| +nişte |
| +noastră |
| +noastre |
| +noi |
| +noştri |
| +nostru |
| +nu |
| +ori |
| +oricând |
| +oricare |
| +oricât |
| +orice |
| +oricînd |
| +oricine |
| +oricît |
| +oricum |
| +oriunde |
| +până |
| +pe |
| +pentru |
| +peste |
| +pînă |
| +poate |
| +pot |
| +prea |
| +prima |
| +primul |
| +prin |
| +printr |
| +sa |
| +să |
| +săi |
| +sale |
| +sau |
| +său |
| +se |
| +şi |
| +sînt |
| +sîntem |
| +sînteţi |
| +spre |
| +sub |
| +sunt |
| +suntem |
| +sunteţi |
| +ta |
| +tăi |
| +tale |
| +tău |
| +te |
| +ţi |
| +ţie |
| +tine |
| +toată |
| +toate |
| +tot |
| +toţi |
| +totuşi |
| +tu |
| +un |
| +una |
| +unde |
| +undeva |
| +unei |
| +unele |
| +uneori |
| +unor |
| +vă |
| +vi |
| +voastră |
| +voastre |
| +voi |
| +voştri |
| +vostru |
| +vouă |
| +vreo |
| +vreun |
|
|
| Property changes on: contrib\analyzers\common\src\resources\org\apache\lucene\analysis\ro\stopwords.txt
|
| ___________________________________________________________________
|
| Added: svn:eol-style
|
| + native
|
|
|
| Index: contrib/analyzers/common/src/resources/org/apache/lucene/analysis/tr/stopwords.txt
|
| ===================================================================
|
| --- contrib/analyzers/common/src/resources/org/apache/lucene/analysis/tr/stopwords.txt (revision 0)
|
| +++ contrib/analyzers/common/src/resources/org/apache/lucene/analysis/tr/stopwords.txt (revision 0)
|
| @@ -0,0 +1,212 @@
|
| +# Turkish stopwords from LUCENE-559 |
| +# merged with the list from "Information Retrieval on Turkish Texts" |
| +# (http://www.users.muohio.edu/canf/papers/JASIST2008offPrint.pdf) |
| +acaba |
| +altmış |
| +altı |
| +ama |
| +ancak |
| +arada |
| +aslında |
| +ayrıca |
| +bana |
| +bazı |
| +belki |
| +ben |
| +benden |
| +beni |
| +benim |
| +beri |
| +beş |
| +bile |
| +bin |
| +bir |
| +birçok |
| +biri |
| +birkaç |
| +birkez |
| +birşey |
| +birşeyi |
| +biz |
| +bize |
| +bizden |
| +bizi |
| +bizim |
| +böyle |
| +böylece |
| +bu |
| +buna |
| +bunda |
| +bundan |
| +bunlar |
| +bunları |
| +bunların |
| +bunu |
| +bunun |
| +burada |
| +çok |
| +çünkü |
| +da |
| +daha |
| +dahi |
| +de |
| +defa |
| +değil |
| +diğer |
| +diye |
| +doksan |
| +dokuz |
| +dolayı |
| +dolayısıyla |
| +dört |
| +edecek |
| +eden |
| +ederek |
| +edilecek |
| +ediliyor |
| +edilmesi |
| +ediyor |
| +eğer |
| +elli |
| +en |
| +etmesi |
| +etti |
| +ettiği |
| +ettiğini |
| +gibi |
| +göre |
| +halen |
| +hangi |
| +hatta |
| +hem |
| +henüz |
| +hep |
| +hepsi |
| +her |
| +herhangi |
| +herkesin |
| +hiç |
| +hiçbir |
| +için |
| +iki |
| +ile |
| +ilgili |
| +ise |
| +işte |
| +itibaren |
| +itibariyle |
| +kadar |
| +karşın |
| +katrilyon |
| +kendi |
| +kendilerine |
| +kendini |
| +kendisi |
| +kendisine |
| +kendisini |
| +kez |
| +ki |
| +kim |
| +kimden |
| +kime |
| +kimi |
| +kimse |
| +kırk |
| +milyar |
| +milyon |
| +mu |
| +mü |
| +mı |
| +nasıl |
| +ne |
| +neden |
| +nedenle |
| +nerde |
| +nerede |
| +nereye |
| +niye |
| +niçin |
| +o |
| +olan |
| +olarak |
| +oldu |
| +olduğu |
| +olduğunu |
| +olduklarını |
| +olmadı |
| +olmadığı |
| +olmak |
| +olması |
| +olmayan |
| +olmaz |
| +olsa |
| +olsun |
| +olup |
| +olur |
| +olursa |
| +oluyor |
| +on |
| +ona |
| +ondan |
| +onlar |
| +onlardan |
| +onları |
| +onların |
| +onu |
| +onun |
| +otuz |
| +oysa |
| +öyle |
| +pek |
| +rağmen |
| +sadece |
| +sanki |
| +sekiz |
| +seksen |
| +sen |
| +senden |
| +seni |
| +senin |
| +siz |
| +sizden |
| +sizi |
| +sizin |
| +şey |
| +şeyden |
| +şeyi |
| +şeyler |
| +şöyle |
| +şu |
| +şuna |
| +şunda |
| +şundan |
| +şunları |
| +şunu |
| +tarafından |
| +trilyon |
| +tüm |
| +üç |
| +üzere |
| +var |
| +vardı |
| +ve |
| +veya |
| +ya |
| +yani |
| +yapacak |
| +yapılan |
| +yapılması |
| +yapıyor |
| +yapmak |
| +yaptı |
| +yaptığı |
| +yaptığını |
| +yaptıkları |
| +yedi |
| +yerine |
| +yetmiş |
| +yine |
| +yirmi |
| +yoksa |
| +yüz |
| +zaten |
|
|
| Property changes on: contrib\analyzers\common\src\resources\org\apache\lucene\analysis\tr\stopwords.txt
|
| ___________________________________________________________________
|
| Added: svn:eol-style
|
| + native
|
|
|