| Index: solr/src/test/org/apache/solr/analysis/TestGalicianStemFilterFactory.java
|
| ===================================================================
|
| --- solr/src/test/org/apache/solr/analysis/TestGalicianStemFilterFactory.java (revision 0)
|
| +++ solr/src/test/org/apache/solr/analysis/TestGalicianStemFilterFactory.java (revision 0)
|
| @@ -0,0 +1,36 @@
|
| +package org.apache.solr.analysis; |
| + |
| +/** |
| + * Licensed to the Apache Software Foundation (ASF) under one or more |
| + * contributor license agreements. See the NOTICE file distributed with |
| + * this work for additional information regarding copyright ownership. |
| + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| + * (the "License"); you may not use this file except in compliance with |
| + * the License. You may obtain a copy of the License at |
| + * |
| + * http://www.apache.org/licenses/LICENSE-2.0 |
| + * |
| + * Unless required by applicable law or agreed to in writing, software |
| + * distributed under the License is distributed on an "AS IS" BASIS, |
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + * See the License for the specific language governing permissions and |
| + * limitations under the License. |
| + */ |
| + |
| +import java.io.Reader; |
| +import java.io.StringReader; |
| + |
| +import org.apache.lucene.analysis.TokenStream; |
| +import org.apache.lucene.analysis.core.WhitespaceTokenizer; |
| + |
| +/** |
| + * Simple tests to ensure the Galician stem factory is working. |
| + */ |
| +public class TestGalicianStemFilterFactory extends BaseTokenTestCase { |
| + public void testStemming() throws Exception { |
| + Reader reader = new StringReader("cariñosa"); |
| + GalicianStemFilterFactory factory = new GalicianStemFilterFactory(); |
| + TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader)); |
| + assertTokenStreamContents(stream, new String[] { "cariñ" }); |
| + } |
| +} |
|
|
| Property changes on: solr\src\test\org\apache\solr\analysis\TestGalicianStemFilterFactory.java
|
| ___________________________________________________________________
|
| Added: svn:eol-style
|
| + native
|
|
|
| Index: solr/src/test/org/apache/solr/analysis/TestPortugueseStemFilterFactory.java
|
| ===================================================================
|
| --- solr/src/test/org/apache/solr/analysis/TestPortugueseStemFilterFactory.java (revision 0)
|
| +++ solr/src/test/org/apache/solr/analysis/TestPortugueseStemFilterFactory.java (revision 0)
|
| @@ -0,0 +1,36 @@
|
| +package org.apache.solr.analysis; |
| + |
| +/** |
| + * Licensed to the Apache Software Foundation (ASF) under one or more |
| + * contributor license agreements. See the NOTICE file distributed with |
| + * this work for additional information regarding copyright ownership. |
| + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| + * (the "License"); you may not use this file except in compliance with |
| + * the License. You may obtain a copy of the License at |
| + * |
| + * http://www.apache.org/licenses/LICENSE-2.0 |
| + * |
| + * Unless required by applicable law or agreed to in writing, software |
| + * distributed under the License is distributed on an "AS IS" BASIS, |
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + * See the License for the specific language governing permissions and |
| + * limitations under the License. |
| + */ |
| + |
| +import java.io.Reader; |
| +import java.io.StringReader; |
| + |
| +import org.apache.lucene.analysis.TokenStream; |
| +import org.apache.lucene.analysis.core.WhitespaceTokenizer; |
| + |
| +/** |
| + * Simple tests to ensure the Portuguese stem factory is working. |
| + */ |
| +public class TestPortugueseStemFilterFactory extends BaseTokenTestCase { |
| + public void testStemming() throws Exception { |
| + Reader reader = new StringReader("maluquice"); |
| + PortugueseStemFilterFactory factory = new PortugueseStemFilterFactory(); |
| + TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader)); |
| + assertTokenStreamContents(stream, new String[] { "maluc" }); |
| + } |
| +} |
|
|
| Property changes on: solr\src\test\org\apache\solr\analysis\TestPortugueseStemFilterFactory.java
|
| ___________________________________________________________________
|
| Added: svn:eol-style
|
| + native
|
|
|
| Index: solr/src/java/org/apache/solr/analysis/PortugueseStemFilterFactory.java
|
| ===================================================================
|
| --- solr/src/java/org/apache/solr/analysis/PortugueseStemFilterFactory.java (revision 0)
|
| +++ solr/src/java/org/apache/solr/analysis/PortugueseStemFilterFactory.java (revision 0)
|
| @@ -0,0 +1,28 @@
|
| +package org.apache.solr.analysis; |
| + |
| +/** |
| + * Licensed to the Apache Software Foundation (ASF) under one or more |
| + * contributor license agreements. See the NOTICE file distributed with |
| + * this work for additional information regarding copyright ownership. |
| + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| + * (the "License"); you may not use this file except in compliance with |
| + * the License. You may obtain a copy of the License at |
| + * |
| + * http://www.apache.org/licenses/LICENSE-2.0 |
| + * |
| + * Unless required by applicable law or agreed to in writing, software |
| + * distributed under the License is distributed on an "AS IS" BASIS, |
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + * See the License for the specific language governing permissions and |
| + * limitations under the License. |
| + */ |
| + |
| +import org.apache.lucene.analysis.TokenStream; |
| +import org.apache.lucene.analysis.pt.PortugueseStemFilter; |
| + |
| +/** Factory for {@link PortugueseStemFilter} */ |
| +public class PortugueseStemFilterFactory extends BaseTokenFilterFactory { |
| + public TokenStream create(TokenStream input) { |
| + return new PortugueseStemFilter(input); |
| + } |
| +} |
|
|
| Property changes on: solr\src\java\org\apache\solr\analysis\PortugueseStemFilterFactory.java
|
| ___________________________________________________________________
|
| Added: svn:eol-style
|
| + native
|
|
|
| Index: solr/src/java/org/apache/solr/analysis/GalicianStemFilterFactory.java
|
| ===================================================================
|
| --- solr/src/java/org/apache/solr/analysis/GalicianStemFilterFactory.java (revision 0)
|
| +++ solr/src/java/org/apache/solr/analysis/GalicianStemFilterFactory.java (revision 0)
|
| @@ -0,0 +1,28 @@
|
| +package org.apache.solr.analysis; |
| + |
| +/** |
| + * Licensed to the Apache Software Foundation (ASF) under one or more |
| + * contributor license agreements. See the NOTICE file distributed with |
| + * this work for additional information regarding copyright ownership. |
| + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| + * (the "License"); you may not use this file except in compliance with |
| + * the License. You may obtain a copy of the License at |
| + * |
| + * http://www.apache.org/licenses/LICENSE-2.0 |
| + * |
| + * Unless required by applicable law or agreed to in writing, software |
| + * distributed under the License is distributed on an "AS IS" BASIS, |
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + * See the License for the specific language governing permissions and |
| + * limitations under the License. |
| + */ |
| + |
| +import org.apache.lucene.analysis.TokenStream; |
| +import org.apache.lucene.analysis.gl.GalicianStemFilter; |
| + |
| +/** Factory for {@link GalicianStemFilter} */ |
| +public class GalicianStemFilterFactory extends BaseTokenFilterFactory { |
| + public TokenStream create(TokenStream input) { |
| + return new GalicianStemFilter(input); |
| + } |
| +} |
|
|
| Property changes on: solr\src\java\org\apache\solr\analysis\GalicianStemFilterFactory.java
|
| ___________________________________________________________________
|
| Added: svn:eol-style
|
| + native
|
|
|
| Index: modules/analysis/common/src/test/org/apache/lucene/analysis/pt/ptrslptestdata.zip
|
| ===================================================================
|
| Cannot display: file marked as a binary type.
|
| svn:mime-type = application/octet-stream
|
|
|
| Property changes on: modules\analysis\common\src\test\org\apache\lucene\analysis\pt\ptrslptestdata.zip
|
| ___________________________________________________________________
|
| Added: svn:mime-type
|
| + application/octet-stream
|
|
|
| Index: modules/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseStemFilter.java
|
| ===================================================================
|
| --- modules/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseStemFilter.java (revision 0)
|
| +++ modules/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseStemFilter.java (revision 0)
|
| @@ -0,0 +1,69 @@
|
| +package org.apache.lucene.analysis.pt; |
| + |
| +/** |
| + * Licensed to the Apache Software Foundation (ASF) under one or more |
| + * contributor license agreements. See the NOTICE file distributed with |
| + * this work for additional information regarding copyright ownership. |
| + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| + * (the "License"); you may not use this file except in compliance with |
| + * the License. You may obtain a copy of the License at |
| + * |
| + * http://www.apache.org/licenses/LICENSE-2.0 |
| + * |
| + * Unless required by applicable law or agreed to in writing, software |
| + * distributed under the License is distributed on an "AS IS" BASIS, |
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + * See the License for the specific language governing permissions and |
| + * limitations under the License. |
| + */ |
| + |
| +import static org.apache.lucene.analysis.util.VocabularyAssert.assertVocabulary; |
| + |
| +import java.io.IOException; |
| +import java.io.Reader; |
| + |
| +import org.apache.lucene.analysis.Analyzer; |
| +import org.apache.lucene.analysis.BaseTokenStreamTestCase; |
| +import org.apache.lucene.analysis.TokenStream; |
| +import org.apache.lucene.analysis.Tokenizer; |
| +import org.apache.lucene.analysis.core.LowerCaseFilter; |
| +import org.apache.lucene.analysis.standard.StandardTokenizer; |
| +import org.apache.lucene.analysis.util.ReusableAnalyzerBase; |
| + |
| +/** |
| + * Simple tests for {@link PortugueseStemFilter} |
| + */ |
| +public class TestPortugueseStemFilter extends BaseTokenStreamTestCase { |
| + private Analyzer analyzer = new ReusableAnalyzerBase() { |
| + @Override |
| + protected TokenStreamComponents createComponents(String fieldName, |
| + Reader reader) { |
| + Tokenizer source = new StandardTokenizer(TEST_VERSION_CURRENT, reader); |
| + TokenStream result = new LowerCaseFilter(TEST_VERSION_CURRENT, source); |
| + return new TokenStreamComponents(source, new PortugueseStemFilter(result)); |
| + } |
| + }; |
| + |
| + /** |
| + * Test the example from the paper "Assessing the impact of stemming accuracy |
| + * on information retrieval" |
| + */ |
| + public void testExamples() throws IOException { |
| + assertAnalyzesTo( |
| + analyzer, |
| + "O debate político, pelo menos o que vem a público, parece, de modo nada " |
| + + "surpreendente, restrito a temas menores. Mas há, evidentemente, " |
| + + "grandes questões em jogo nas eleições que se aproximam.", |
| + new String[] { |
| + "o", "debat", "politic", "pel", "menos", "o", "que", "vem", "a", |
| + "public", "parec", "de", "mod", "nad", "surpreend", "restrit", |
| + "a", "tem", "men", "mas", "ha", "evid", "grand", "quest", |
| + "em", "jog", "na", "eleic", "que", "se", "aproxim" |
| + }); |
| + } |
| + |
| + /** Test against a vocabulary from the reference impl */ |
| + public void testVocabulary() throws IOException { |
| + assertVocabulary(analyzer, getDataFile("ptrslptestdata.zip"), "ptrslp.txt"); |
| + } |
| +} |
|
|
| Property changes on: modules\analysis\common\src\test\org\apache\lucene\analysis\pt\TestPortugueseStemFilter.java
|
| ___________________________________________________________________
|
| Added: svn:eol-style
|
| + native
|
|
|
| Index: modules/analysis/common/src/test/org/apache/lucene/analysis/gl/TestGalicianStemFilter.java
|
| ===================================================================
|
| --- modules/analysis/common/src/test/org/apache/lucene/analysis/gl/TestGalicianStemFilter.java (revision 0)
|
| +++ modules/analysis/common/src/test/org/apache/lucene/analysis/gl/TestGalicianStemFilter.java (revision 0)
|
| @@ -0,0 +1,52 @@
|
| +package org.apache.lucene.analysis.gl; |
| + |
| +/** |
| + * Licensed to the Apache Software Foundation (ASF) under one or more |
| + * contributor license agreements. See the NOTICE file distributed with |
| + * this work for additional information regarding copyright ownership. |
| + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| + * (the "License"); you may not use this file except in compliance with |
| + * the License. You may obtain a copy of the License at |
| + * |
| + * http://www.apache.org/licenses/LICENSE-2.0 |
| + * |
| + * Unless required by applicable law or agreed to in writing, software |
| + * distributed under the License is distributed on an "AS IS" BASIS, |
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + * See the License for the specific language governing permissions and |
| + * limitations under the License. |
| + */ |
| + |
| +import static org.apache.lucene.analysis.util.VocabularyAssert.assertVocabulary; |
| + |
| +import java.io.IOException; |
| +import java.io.Reader; |
| + |
| +import org.apache.lucene.analysis.Analyzer; |
| +import org.apache.lucene.analysis.BaseTokenStreamTestCase; |
| +import org.apache.lucene.analysis.TokenStream; |
| +import org.apache.lucene.analysis.Tokenizer; |
| +import org.apache.lucene.analysis.core.LowerCaseFilter; |
| +import org.apache.lucene.analysis.standard.StandardTokenizer; |
| +import org.apache.lucene.analysis.util.ReusableAnalyzerBase; |
| + |
| +/** |
| + * Simple tests for {@link GalicianStemFilter} |
| + */ |
| +public class TestGalicianStemFilter extends BaseTokenStreamTestCase { |
| + private Analyzer analyzer = new ReusableAnalyzerBase() { |
| + @Override |
| + protected TokenStreamComponents createComponents(String fieldName, |
| + Reader reader) { |
| + Tokenizer source = new StandardTokenizer(TEST_VERSION_CURRENT, reader); |
| + TokenStream result = new LowerCaseFilter(TEST_VERSION_CURRENT, source); |
| + return new TokenStreamComponents(source, new GalicianStemFilter(result)); |
| + } |
| + }; |
| + |
| + |
| + /** Test against a vocabulary from the reference impl */ |
| + public void testVocabulary() throws IOException { |
| + assertVocabulary(analyzer, getDataFile("gltestdata.zip"), "gl.txt"); |
| + } |
| +} |
|
|
| Property changes on: modules\analysis\common\src\test\org\apache\lucene\analysis\gl\TestGalicianStemFilter.java
|
| ___________________________________________________________________
|
| Added: svn:eol-style
|
| + native
|
|
|
| Index: modules/analysis/common/src/test/org/apache/lucene/analysis/gl/gltestdata.zip
|
| ===================================================================
|
| Cannot display: file marked as a binary type.
|
| svn:mime-type = application/octet-stream
|
|
|
| Property changes on: modules\analysis\common\src\test\org\apache\lucene\analysis\gl\gltestdata.zip
|
| ___________________________________________________________________
|
| Added: svn:mime-type
|
| + application/octet-stream
|
|
|
| Index: modules/analysis/common/src/test/org/apache/lucene/analysis/gl/TestGalicianAnalyzer.java
|
| ===================================================================
|
| --- modules/analysis/common/src/test/org/apache/lucene/analysis/gl/TestGalicianAnalyzer.java (revision 0)
|
| +++ modules/analysis/common/src/test/org/apache/lucene/analysis/gl/TestGalicianAnalyzer.java (revision 0)
|
| @@ -0,0 +1,53 @@
|
| +package org.apache.lucene.analysis.gl; |
| + |
| +/** |
| + * Licensed to the Apache Software Foundation (ASF) under one or more |
| + * contributor license agreements. See the NOTICE file distributed with |
| + * this work for additional information regarding copyright ownership. |
| + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| + * (the "License"); you may not use this file except in compliance with |
| + * the License. You may obtain a copy of the License at |
| + * |
| + * http://www.apache.org/licenses/LICENSE-2.0 |
| + * |
| + * Unless required by applicable law or agreed to in writing, software |
| + * distributed under the License is distributed on an "AS IS" BASIS, |
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + * See the License for the specific language governing permissions and |
| + * limitations under the License. |
| + */ |
| + |
| +import java.io.IOException; |
| +import java.util.HashSet; |
| +import java.util.Set; |
| + |
| +import org.apache.lucene.analysis.Analyzer; |
| +import org.apache.lucene.analysis.BaseTokenStreamTestCase; |
| + |
| +public class TestGalicianAnalyzer extends BaseTokenStreamTestCase { |
| + /** This test fails with an NPE when the |
| + * stopwords file is missing from the classpath */ |
| + public void testResourcesAvailable() { |
| + new GalicianAnalyzer(TEST_VERSION_CURRENT); |
| + } |
| + |
| + /** test stopwords and stemming */ |
| + public void testBasics() throws IOException { |
| + Analyzer a = new GalicianAnalyzer(TEST_VERSION_CURRENT); |
| + // stemming |
| + checkOneTermReuse(a, "correspondente", "correspond"); |
| + checkOneTermReuse(a, "corresponderá", "correspond"); |
| + // stopword |
| + assertAnalyzesTo(a, "e", new String[] {}); |
| + } |
| + |
| + /** test use of exclusion set */ |
| + public void testExclude() throws IOException { |
| + Set<String> exclusionSet = new HashSet<String>(); |
| + exclusionSet.add("correspondente"); |
| + Analyzer a = new GalicianAnalyzer(TEST_VERSION_CURRENT, |
| + GalicianAnalyzer.getDefaultStopSet(), exclusionSet); |
| + checkOneTermReuse(a, "correspondente", "correspondente"); |
| + checkOneTermReuse(a, "corresponderá", "correspond"); |
| + } |
| +} |
|
|
| Property changes on: modules\analysis\common\src\test\org\apache\lucene\analysis\gl\TestGalicianAnalyzer.java
|
| ___________________________________________________________________
|
| Added: svn:eol-style
|
| + native
|
|
|
| Index: modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseStemmer.java
|
| ===================================================================
|
| --- modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseStemmer.java (revision 0)
|
| +++ modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseStemmer.java (revision 0)
|
| @@ -0,0 +1,102 @@
|
| +package org.apache.lucene.analysis.pt; |
| + |
| +/** |
| + * Licensed to the Apache Software Foundation (ASF) under one or more |
| + * contributor license agreements. See the NOTICE file distributed with |
| + * this work for additional information regarding copyright ownership. |
| + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| + * (the "License"); you may not use this file except in compliance with |
| + * the License. You may obtain a copy of the License at |
| + * |
| + * http://www.apache.org/licenses/LICENSE-2.0 |
| + * |
| + * Unless required by applicable law or agreed to in writing, software |
| + * distributed under the License is distributed on an "AS IS" BASIS, |
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + * See the License for the specific language governing permissions and |
| + * limitations under the License. |
| + */ |
| + |
| +import java.util.Map; |
| + |
| +/** |
| + * Portuguese stemmer implementing the RSLP (Removedor de Sufixos da Lingua Portuguesa) |
| + * algorithm. This is sometimes also referred to as the Orengo stemmer. |
| + * |
| + * @see RSLPStemmerBase |
| + */ |
| +public class PortugueseStemmer extends RSLPStemmerBase { |
| + private static final Step plural, feminine, adverb, augmentative, noun, verb, vowel; |
| + |
| + static { |
| + Map<String,Step> steps = parse(PortugueseStemmer.class, "portuguese.rslp"); |
| + plural = steps.get("Plural"); |
| + feminine = steps.get("Feminine"); |
| + adverb = steps.get("Adverb"); |
| + augmentative = steps.get("Augmentative"); |
| + noun = steps.get("Noun"); |
| + verb = steps.get("Verb"); |
| + vowel = steps.get("Vowel"); |
| + } |
| + |
| + /** |
| + * @param s buffer holding the word, oversized to at least <code>len+1</code>; modified in place |
| + * @param len number of valid chars in <code>s</code> on entry |
| + * @return number of valid chars in <code>s</code> after stemming and accent removal |
| + */ |
| + public int stem(char s[], int len) { |
| + assert s.length >= len + 1 : "this stemmer requires an oversized array of at least 1"; |
| + |
| + len = plural.apply(s, len); |
| + len = adverb.apply(s, len); |
| + len = feminine.apply(s, len); |
| + len = augmentative.apply(s, len); |
| + |
| + int oldlen = len; |
| + len = noun.apply(s, len); |
| + |
| + if (len == oldlen) { /* noun step left the length unchanged (suffix not removed): try the verb step */ |
| + oldlen = len; |
| + |
| + len = verb.apply(s, len); |
| + |
| + if (len == oldlen) { /* verb step left the length unchanged too: fall back to the vowel step */ |
| + len = vowel.apply(s, len); |
| + } |
| + } |
| + |
| + // rslp accent removal: replace each accented char in the first len chars with its unaccented letter |
| + for (int i = 0; i < len; i++) { |
| + switch(s[i]) { |
| + case 'à': |
| + case 'á': |
| + case 'â': |
| + case 'ã': |
| + case 'ä': |
| + case 'å': s[i] = 'a'; break; |
| + case 'ç': s[i] = 'c'; break; |
| + case 'è': |
| + case 'é': |
| + case 'ê': |
| + case 'ë': s[i] = 'e'; break; |
| + case 'ì': |
| + case 'í': |
| + case 'î': |
| + case 'ï': s[i] = 'i'; break; |
| + case 'ñ': s[i] = 'n'; break; |
| + case 'ò': |
| + case 'ó': |
| + case 'ô': |
| + case 'õ': |
| + case 'ö': s[i] = 'o'; break; |
| + case 'ù': |
| + case 'ú': |
| + case 'û': |
| + case 'ü': s[i] = 'u'; break; |
| + case 'ý': |
| + case 'ÿ': s[i] = 'y'; break; |
| + } |
| + } |
| + return len; |
| + } |
| +} |
|
|
| Property changes on: modules\analysis\common\src\java\org\apache\lucene\analysis\pt\PortugueseStemmer.java
|
| ___________________________________________________________________
|
| Added: svn:eol-style
|
| + native
|
|
|
| Index: modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseStemFilter.java
|
| ===================================================================
|
| --- modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseStemFilter.java (revision 0)
|
| +++ modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseStemFilter.java (revision 0)
|
| @@ -0,0 +1,60 @@
|
| +package org.apache.lucene.analysis.pt; |
| + |
| +/** |
| + * Licensed to the Apache Software Foundation (ASF) under one or more |
| + * contributor license agreements. See the NOTICE file distributed with |
| + * this work for additional information regarding copyright ownership. |
| + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| + * (the "License"); you may not use this file except in compliance with |
| + * the License. You may obtain a copy of the License at |
| + * |
| + * http://www.apache.org/licenses/LICENSE-2.0 |
| + * |
| + * Unless required by applicable law or agreed to in writing, software |
| + * distributed under the License is distributed on an "AS IS" BASIS, |
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + * See the License for the specific language governing permissions and |
| + * limitations under the License. |
| + */ |
| + |
| +import java.io.IOException; |
| + |
| +import org.apache.lucene.analysis.TokenFilter; |
| +import org.apache.lucene.analysis.TokenStream; |
| +import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter; |
| +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; |
| +import org.apache.lucene.analysis.tokenattributes.KeywordAttribute; |
| + |
| +/** |
| + * A {@link TokenFilter} that applies {@link PortugueseStemmer} to stem |
| + * Portuguese words. |
| + * <p> |
| + * To prevent terms from being stemmed, use an instance of |
| + * {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets |
| + * the {@link KeywordAttribute} before this {@link TokenStream}. |
| + * </p> |
| + */ |
| +public final class PortugueseStemFilter extends TokenFilter { |
| + private final PortugueseStemmer stemmer = new PortugueseStemmer(); |
| + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); |
| + private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class); |
| + |
| + public PortugueseStemFilter(TokenStream input) { |
| + super(input); |
| + } |
| + |
| + @Override |
| + public boolean incrementToken() throws IOException { |
| + if (input.incrementToken()) { |
| + if (!keywordAttr.isKeyword()) { |
| + // the stemmer can grow a word by 1 char (worst case '*ã' -> '*ão'), so request a buffer of len+1 |
| + final int len = termAtt.length(); |
| + final int newlen = stemmer.stem(termAtt.resizeBuffer(len+1), len); |
| + termAtt.setLength(newlen); |
| + } |
| + return true; |
| + } else { |
| + return false; |
| + } |
| + } |
| +} |
|
|
| Property changes on: modules\analysis\common\src\java\org\apache\lucene\analysis\pt\PortugueseStemFilter.java
|
| ___________________________________________________________________
|
| Added: svn:eol-style
|
| + native
|
|
|
| Index: modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseMinimalStemmer.java
|
| ===================================================================
|
| --- modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseMinimalStemmer.java (revision 1054344)
|
| +++ modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseMinimalStemmer.java (working copy)
|
| @@ -1,10 +1,5 @@
|
| package org.apache.lucene.analysis.pt; |
| |
| -import java.util.Arrays; |
| - |
| -import org.apache.lucene.analysis.util.CharArraySet; |
| -import org.apache.lucene.util.Version; |
| - |
| /** |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| @@ -31,89 +26,14 @@
|
| * which is just the plural reduction step of the RSLP |
| * algorithm from <i>A Stemming Algorithmm for the Portuguese Language</i>, |
| * Orengo et al. |
| + * @see RSLPStemmerBase |
| */ |
| -public class PortugueseMinimalStemmer { |
| +public class PortugueseMinimalStemmer extends RSLPStemmerBase { |
| |
| - private static final CharArraySet excIS = new CharArraySet(Version.LUCENE_31, |
| - Arrays.asList("lápis", "cais", "mais", "crúcis", "biquínis", "pois", |
| - "depois","dois","leis"), |
| - false); |
| + private static final Step pluralStep = |
| + parse(PortugueseMinimalStemmer.class, "portuguese.rslp").get("Plural"); |
| |
| - private static final CharArraySet excS = new CharArraySet(Version.LUCENE_31, |
| - Arrays.asList("aliás", "pires", "lápis", "cais", "mais", "mas", "menos", |
| - "férias", "fezes", "pêsames", "crúcis", "gás", "atrás", "moisés", |
| - "através", "convés", "ês", "país", "após", "ambas", "ambos", |
| - "messias", "depois"), |
| - false); |
| - |
| public int stem(char s[], int len) { |
| - if (len < 3 || s[len-1] != 's') |
| - return len; |
| - |
| - if (s[len-2] == 'n') { |
| - len--; |
| - s[len-1] = 'm'; |
| - return len; |
| - } |
| - |
| - if (len >= 6 && s[len-3] == 'õ' && s[len-2] == 'e') { |
| - len--; |
| - s[len-2] = 'ã'; |
| - s[len-1] = 'o'; |
| - return len; |
| - } |
| - |
| - if (len >= 4 && s[len-3] == 'ã' && s[len-2] == 'e') |
| - if (!(len == 4 && s[0] == 'm')) { |
| - len--; |
| - s[len-1] = 'o'; |
| - return len; |
| - } |
| - |
| - if (len >= 4 && s[len-2] == 'i') { |
| - if (s[len-3] == 'a') |
| - if (!(len == 4 && (s[0] == 'c' || s[0] == 'm'))) { |
| - len--; |
| - s[len-1] = 'l'; |
| - return len; |
| - } |
| - |
| - if (len >= 5 && s[len-3] == 'é') { |
| - len--; |
| - s[len-2] = 'e'; |
| - s[len-1] = 'l'; |
| - return len; |
| - } |
| - |
| - if (len >= 5 && s[len-3] == 'e') { |
| - len--; |
| - s[len-1] = 'l'; |
| - return len; |
| - } |
| - |
| - if (len >= 5 && s[len-3] == 'ó') { |
| - len--; |
| - s[len-2] = 'o'; |
| - s[len-1] = 'l'; |
| - return len; |
| - } |
| - |
| - if (!excIS.contains(s, 0, len)) { |
| - s[len-1] = 'l'; |
| - return len; |
| - } |
| - } |
| - |
| - if (len >= 6 && s[len-3] == 'l' && s[len-2] == 'e') |
| - return len - 2; |
| - |
| - if (len >= 6 && s[len-3] == 'r' && s[len-2] == 'e') |
| - if (!(len == 7 && s[0] == 'á' && s[1] == 'r' && s[2] == 'v' && s[3] == 'o')) |
| - return len - 2; |
| - |
| - if (excS.contains(s, 0, len)) |
| - return len; |
| - else |
| - return len-1; |
| + return pluralStep.apply(s, len); |
| } |
| } |
| Index: modules/analysis/common/src/java/org/apache/lucene/analysis/pt/RSLPStemmerBase.java
|
| ===================================================================
|
| --- modules/analysis/common/src/java/org/apache/lucene/analysis/pt/RSLPStemmerBase.java (revision 0)
|
| +++ modules/analysis/common/src/java/org/apache/lucene/analysis/pt/RSLPStemmerBase.java (revision 0)
|
| @@ -0,0 +1,345 @@
|
| +package org.apache.lucene.analysis.pt; |
| + |
| +/** |
| + * Licensed to the Apache Software Foundation (ASF) under one or more |
| + * contributor license agreements. See the NOTICE file distributed with |
| + * this work for additional information regarding copyright ownership. |
| + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| + * (the "License"); you may not use this file except in compliance with |
| + * the License. You may obtain a copy of the License at |
| + * |
| + * http://www.apache.org/licenses/LICENSE-2.0 |
| + * |
| + * Unless required by applicable law or agreed to in writing, software |
| + * distributed under the License is distributed on an "AS IS" BASIS, |
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + * See the License for the specific language governing permissions and |
| + * limitations under the License. |
| + */ |
| + |
| +import java.io.IOException; |
| +import java.io.InputStream; |
| +import java.io.InputStreamReader; |
| +import java.io.LineNumberReader; |
| +import java.util.ArrayList; |
| +import java.util.Arrays; |
| +import java.util.HashMap; |
| +import java.util.List; |
| +import java.util.Map; |
| +import java.util.regex.Matcher; |
| +import java.util.regex.Pattern; |
| + |
| +import org.apache.lucene.analysis.util.CharArraySet; |
| +import org.apache.lucene.util.Version; |
| + |
| +import static org.apache.lucene.analysis.util.StemmerUtil.*; |
| + |
| +/** |
| + * Base class for stemmers that use a set of RSLP-like stemming steps. |
| + * <p> |
| + * RSLP (Removedor de Sufixos da Lingua Portuguesa) is an algorithm designed |
| + * originally for stemming the Portuguese language, described in the paper |
| + * <i>A Stemming Algorithm for the Portuguese Language</i>, Orengo et al. |
| + * <p> |
| + * Since then, a plural-only modification (RSLP-S) as well as a modification |
| + * for the Galician language have been implemented. This class parses a configuration |
| + * file that describes {@link Step}s, where each Step contains a set of {@link Rule}s. |
| + * <p> |
| + * The general rule format is: |
| + * <blockquote>{ "suffix", N, "replacement", { "exception1", "exception2", ...}}</blockquote> |
| + * where: |
| + * <ul> |
| + * <li><code>suffix</code> is the suffix to be removed (such as "inho"). |
| + * <li><code>N</code> is the min stem size, where stem is defined as the candidate stem |
| + * after removing the suffix (but before appending the replacement!) |
| + * <li><code>replacement</code> is an optional string to append after removing the suffix. |
| + * This can be the empty string. |
| + * <li><code>exceptions</code> is an optional list of exceptions, patterns that should |
| + * not be stemmed. These patterns can be specified as whole word or suffix (ends-with) |
| + * patterns, depending upon the exceptions format flag in the step header. |
| + * </ul> |
| + * <p> |
| + * A step is an ordered list of rules, with a structure in this format: |
| + * <blockquote>{ "name", N, B, { "cond1", "cond2", ... } |
| + * ... rules ... }; |
| + * </blockquote> |
| + * where: |
| + * <ul> |
| + * <li><code>name</code> is a name for the step (such as "Plural"). |
| + * <li><code>N</code> is the min word size. Words that are less than this length bypass |
| + * the step completely, as an optimization. Note: N can be zero, in this case this |
| + * implementation will automatically calculate the appropriate value from the underlying |
| + * rules. |
| + * <li><code>B</code> is a "boolean" flag specifying how exceptions in the rules are matched. |
| + * A value of 1 indicates whole-word pattern matching, a value of 0 indicates that |
| + * exceptions are actually suffixes and should be matched with ends-with. |
| + * <li><code>conds</code> are an optional list of conditions to enter the step at all. If |
| + * the list is non-empty, then a word must end with one of these conditions or it will |
| + * bypass the step completely as an optimization. |
| + * </ul> |
| + * <p> |
| + * @see <a href="http://www.inf.ufrgs.br/~viviane/rslp/index.htm">RSLP description</a> |
| + * @lucene.internal |
| + */ |
| +public abstract class RSLPStemmerBase { |
| + |
| +  /** |
| +   * The plain rule form: strip a suffix and optionally append a replacement. |
| +   * It carries no exception list. |
| +   */ |
| +  protected static class Rule { |
| +    protected final char suffix[]; |
| +    protected final char replacement[]; |
| +    protected final int min; |
| + |
| +    /** |
| +     * Create a rule. |
| +     * @param suffix suffix to remove |
| +     * @param min minimum length of the stem that is left once the suffix is removed |
| +     * @param replacement string written in place of the suffix (may be empty) |
| +     */ |
| +    public Rule(String suffix, int min, String replacement) { |
| +      this.suffix = suffix.toCharArray(); |
| +      this.replacement = replacement.toCharArray(); |
| +      this.min = min; |
| +    } |
| + |
| +    /** |
| +     * @return true if the word ends with the suffix and the remaining stem |
| +     *         is at least <code>min</code> characters long. |
| +     */ |
| +    public boolean matches(char s[], int len) { |
| +      final int stemLength = len - suffix.length; |
| +      if (stemLength < min) { |
| +        return false; |
| +      } |
| +      return endsWith(s, len, suffix); |
| +    } |
| + |
| +    /** |
| +     * Overwrites the suffix in <code>s</code> with the replacement, starting |
| +     * at the position where the suffix began. |
| +     * @return new valid length of the string after firing this rule. |
| +     */ |
| +    public int replace(char s[], int len) { |
| +      final int stemLength = len - suffix.length; |
| +      if (replacement.length != 0) { |
| +        System.arraycopy(replacement, 0, s, stemLength, replacement.length); |
| +      } |
| +      return stemLength + replacement.length; |
| +    } |
| +  } |
| + |
| +  /** |
| +   * A rule whose exceptions are complete words, kept in a set and checked |
| +   * against the whole input word. |
| +   */ |
| +  protected static class RuleWithSetExceptions extends Rule { |
| +    protected final CharArraySet exceptions; |
| + |
| +    public RuleWithSetExceptions(String suffix, int min, String replacement, |
| +        String[] exceptions) { |
| +      super(suffix, min, replacement); |
| +      // an exception that does not end with the suffix can never be consulted |
| +      for (String exception : exceptions) { |
| +        if (exception.endsWith(suffix)) { |
| +          continue; |
| +        } |
| +        System.err.println("warning: useless exception '" + exception + "' does not end with '" + suffix + "'"); |
| +      } |
| +      this.exceptions = new CharArraySet(Version.LUCENE_31, |
| +          Arrays.asList(exceptions), false); |
| +    } |
| + |
| +    @Override |
| +    public boolean matches(char s[], int len) { |
| +      // the set is only consulted once the basic suffix/length test has passed |
| +      if (!super.matches(s, len)) { |
| +        return false; |
| +      } |
| +      return !exceptions.contains(s, 0, len); |
| +    } |
| +  } |
| + |
| +  /** |
| +   * A rule whose exceptions are themselves suffixes: the rule does not fire |
| +   * for a word that ends with any of them. |
| +   */ |
| +  protected static class RuleWithSuffixExceptions extends Rule { |
| +    // TODO: use a more efficient datastructure: automaton? |
| +    protected final char[][] exceptions; |
| + |
| +    public RuleWithSuffixExceptions(String suffix, int min, String replacement, |
| +        String[] exceptions) { |
| +      super(suffix, min, replacement); |
| +      // an exception that does not end with the suffix can never be consulted |
| +      for (String exception : exceptions) { |
| +        if (exception.endsWith(suffix)) { |
| +          continue; |
| +        } |
| +        System.err.println("warning: useless exception '" + exception + "' does not end with '" + suffix + "'"); |
| +      } |
| +      final int count = exceptions.length; |
| +      this.exceptions = new char[count][]; |
| +      for (int i = 0; i < count; i++) { |
| +        this.exceptions[i] = exceptions[i].toCharArray(); |
| +      } |
| +    } |
| + |
| +    @Override |
| +    public boolean matches(char s[], int len) { |
| +      if (!super.matches(s, len)) { |
| +        return false; |
| +      } |
| +      for (char[] exception : exceptions) { |
| +        if (endsWith(s, len, exception)) { |
| +          return false; |
| +        } |
| +      } |
| +      return true; |
| +    } |
| +  } |
| + |
| +  /** |
| +   * A named, ordered group of rules; at most one rule fires per application. |
| +   */ |
| +  protected static class Step { |
| +    protected final String name; |
| +    protected final Rule rules[]; |
| +    protected final int min; |
| +    protected final char[][] suffixes; |
| + |
| +    /** |
| +     * Create a new step |
| +     * @param name Step's name. |
| +     * @param rules an ordered list of rules. |
| +     * @param min minimum word size. if this is 0 it is derived from the rules as the |
| +     *        smallest <code>rule.min + rule.suffix.length</code>. |
| +     * @param suffixes optional list of conditional suffixes. may be null or empty, |
| +     *        in which case no condition is checked. |
| +     */ |
| +    public Step(String name, Rule rules[], int min, String suffixes[]) { |
| +      this.name = name; |
| +      this.rules = rules; |
| + |
| +      int minWordSize = min; |
| +      if (minWordSize == 0) { |
| +        minWordSize = Integer.MAX_VALUE; |
| +        for (int i = 0; i < rules.length; i++) { |
| +          final Rule rule = rules[i]; |
| +          minWordSize = Math.min(minWordSize, rule.min + rule.suffix.length); |
| +        } |
| +      } |
| +      this.min = minWordSize; |
| + |
| +      char[][] conditions = null; |
| +      if (suffixes != null && suffixes.length != 0) { |
| +        conditions = new char[suffixes.length][]; |
| +        for (int i = 0; i < suffixes.length; i++) { |
| +          conditions[i] = suffixes[i].toCharArray(); |
| +        } |
| +      } |
| +      this.suffixes = conditions; |
| +    } |
| + |
| +    /** |
| +     * Applies the first matching rule, if the word is long enough and ends |
| +     * with one of the conditional suffixes (when any are configured). |
| +     * @return new valid length of the string after applying the entire step. |
| +     */ |
| +    public int apply(char s[], int len) { |
| +      if (len < min) { |
| +        return len; |
| +      } |
| +      if (suffixes != null && !endsWithCondition(s, len)) { |
| +        return len; |
| +      } |
| +      for (Rule rule : rules) { |
| +        if (rule.matches(s, len)) { |
| +          return rule.replace(s, len); |
| +        } |
| +      } |
| +      return len; |
| +    } |
| + |
| +    /** @return true if the word ends with at least one conditional suffix. */ |
| +    private boolean endsWithCondition(char s[], int len) { |
| +      for (char[] condition : suffixes) { |
| +        if (endsWith(s, len, condition)) { |
| +          return true; |
| +        } |
| +      } |
| +      return false; |
| +    } |
| +  } |
| + |
| +  /** |
| +   * Parse a resource file into an RSLP stemmer description. |
| +   * <p> |
| +   * The resource is looked up through <code>clazz</code> and decoded as UTF-8. |
| +   * The underlying stream is always closed, also when parsing fails. |
| +   * |
| +   * @param clazz class used to locate the resource |
| +   * @param resource name of the resource holding the step definitions |
| +   * @return a Map containing the named Steps in this description. |
| +   * @throws IllegalArgumentException if the resource cannot be found |
| +   * @throws RuntimeException if the resource cannot be read or is malformed |
| +   */ |
| +  protected static Map<String,Step> parse(Class<? extends RSLPStemmerBase> clazz, String resource) { |
| +    // TODO: this parser is ugly, but works. use a jflex grammar instead. |
| +    InputStream is = clazz.getResourceAsStream(resource); |
| +    if (is == null) { |
| +      // fail with the resource name instead of an anonymous NullPointerException |
| +      throw new IllegalArgumentException("RSLP resource not found: " + resource); |
| +    } |
| +    try { |
| +      try { |
| +        LineNumberReader r = new LineNumberReader(new InputStreamReader(is, "UTF-8")); |
| +        Map<String,Step> steps = new HashMap<String,Step>(); |
| +        String step; |
| +        while ((step = readLine(r)) != null) { |
| +          Step s = parseStep(r, step); |
| +          steps.put(s.name, s); |
| +        } |
| +        return steps; |
| +      } finally { |
| +        // release the stream even if a step or rule is rejected |
| +        is.close(); |
| +      } |
| +    } catch (IOException e) { |
| +      throw new RuntimeException(e); |
| +    } |
| +  } |
| + |
| + // Step header line: { "name", N, B, { conds }, |
| + // groups: 1 = name, 2 = number, 3 = the 0/1 flag, 4 = raw contents of the inner braces |
| + private static final Pattern headerPattern = |
| + Pattern.compile("^\\{\\s*\"([^\"]*)\",\\s*([0-9]+),\\s*(0|1),\\s*\\{(.*)\\},\\s*$"); |
| + // Rule without replacement: { "suffix", N } followed by "," or by "};" |
| + // groups: 1 = suffix, 2 = number |
| + private static final Pattern stripPattern = |
| + Pattern.compile("^\\{\\s*\"([^\"]*)\",\\s*([0-9]+)\\s*\\}\\s*(,|(\\}\\s*;))$"); |
| + // Rule with replacement: { "suffix", N, "replacement"} followed by "," or by "};" |
| + // groups: 1 = suffix, 2 = number, 3 = replacement |
| + private static final Pattern repPattern = |
| + Pattern.compile("^\\{\\s*\"([^\"]*)\",\\s*([0-9]+),\\s*\"([^\"]*)\"\\}\\s*(,|(\\}\\s*;))$"); |
| + // Rule with replacement and exceptions: { "suffix", N, "replacement", { ... } } followed by "," or by "};" |
| + // groups: 1 = suffix, 2 = number, 3 = replacement, 4 = raw contents of the exception braces |
| + private static final Pattern excPattern = |
| + Pattern.compile("^\\{\\s*\"([^\"]*)\",\\s*([0-9]+),\\s*\"([^\"]*)\",\\s*\\{(.*)\\}\\s*\\}\\s*(,|(\\}\\s*;))$"); |
| + |
| +  /** |
| +   * Builds one Step from its header line and the rule lines that follow it. |
| +   * |
| +   * @param r reader positioned just after the header line |
| +   * @param header the already-read header line |
| +   * @throws RuntimeException if the header does not have the expected format |
| +   */ |
| +  private static Step parseStep(LineNumberReader r, String header) throws IOException { |
| +    final Matcher headerMatch = headerPattern.matcher(header); |
| +    final boolean wellFormed = headerMatch.find(); |
| +    if (!wellFormed) { |
| +      throw new RuntimeException("Illegal Step header specified at line " + r.getLineNumber()); |
| +    } |
| +    assert headerMatch.groupCount() == 4; |
| +    final String stepName = headerMatch.group(1); |
| +    final int minWordSize = Integer.parseInt(headerMatch.group(2)); |
| +    final int exceptionType = Integer.parseInt(headerMatch.group(3)); |
| +    final String[] conditions = parseList(headerMatch.group(4)); |
| +    // the rules are read last, after the whole header has been decoded |
| +    final Rule[] stepRules = parseRules(r, exceptionType); |
| +    return new Step(stepName, stepRules, minWordSize, conditions); |
| +  } |
| + |
| +  /** |
| +   * Reads the rules of one step, up to and including the line that ends with ';'. |
| +   * |
| +   * @param r reader positioned on the first rule line of the step |
| +   * @param type exception matching flag from the step header: 0 selects |
| +   *        suffix exceptions, any other value selects whole-word exceptions |
| +   * @return the rules in file order |
| +   * @throws RuntimeException if a line matches none of the rule formats, or if the |
| +   *         input ends before the step is terminated |
| +   */ |
| +  private static Rule[] parseRules(LineNumberReader r, int type) throws IOException { |
| +    List<Rule> rules = new ArrayList<Rule>(); |
| +    String line; |
| +    while ((line = readLine(r)) != null) { |
| +      rules.add(parseRule(line, type, r.getLineNumber())); |
| +      if (line.endsWith(";")) |
| +        return rules.toArray(new Rule[rules.size()]); |
| +    } |
| +    // end of input without the closing "};": report it here rather than |
| +    // handing a null rule array to Step, which would fail later with an NPE |
| +    throw new RuntimeException("Unterminated Step: end of file reached at line " + r.getLineNumber()); |
| +  } |
| + |
| +  /** Turns a single rule line into the matching Rule variant. */ |
| +  private static Rule parseRule(String line, int type, int lineNumber) { |
| +    Matcher matcher = stripPattern.matcher(line); |
| +    if (matcher.matches()) { |
| +      return new Rule(matcher.group(1), Integer.parseInt(matcher.group(2)), ""); |
| +    } |
| +    matcher = repPattern.matcher(line); |
| +    if (matcher.matches()) { |
| +      return new Rule(matcher.group(1), Integer.parseInt(matcher.group(2)), matcher.group(3)); |
| +    } |
| +    matcher = excPattern.matcher(line); |
| +    if (!matcher.matches()) { |
| +      throw new RuntimeException("Illegal Step rule specified at line " + lineNumber); |
| +    } |
| +    if (type == 0) { |
| +      return new RuleWithSuffixExceptions(matcher.group(1), |
| +          Integer.parseInt(matcher.group(2)), |
| +          matcher.group(3), |
| +          parseList(matcher.group(4))); |
| +    } |
| +    return new RuleWithSetExceptions(matcher.group(1), |
| +        Integer.parseInt(matcher.group(2)), |
| +        matcher.group(3), |
| +        parseList(matcher.group(4))); |
| +  } |
| + |
| +  /** |
| +   * Splits the contents of a brace list into its unquoted items. |
| +   * |
| +   * @param s text found between the braces: quoted strings separated by commas |
| +   * @return the items without their quotes; an empty array (never null) when the |
| +   *         list is empty or contains only whitespace |
| +   */ |
| +  private static String[] parseList(String s) { |
| +    // "{}" and "{ }" both mean "no items"; an empty array lets the rule |
| +    // constructors iterate it safely, and Step treats it like a missing list |
| +    if (s.trim().isEmpty()) |
| +      return new String[0]; |
| +    String list[] = s.split(","); |
| +    for (int i = 0; i < list.length; i++) |
| +      list[i] = parseString(list[i].trim()); |
| +    return list; |
| +  } |
| + |
| +  /** Drops the first and the last character of <code>s</code> (the surrounding quotes). */ |
| +  private static String parseString(String s) { |
| +    final int closingQuote = s.length() - 1; |
| +    return s.substring(1, closingQuote); |
| +  } |
| + |
| +  /** |
| +   * Returns the next line that carries content, trimmed. |
| +   * Blank lines and lines whose first non-blank character is '#' are skipped. |
| +   * |
| +   * @return the trimmed line, or null at end of input |
| +   */ |
| +  private static String readLine(LineNumberReader r) throws IOException { |
| +    for (String raw = r.readLine(); raw != null; raw = r.readLine()) { |
| +      final String trimmed = raw.trim(); |
| +      if (trimmed.isEmpty() || trimmed.charAt(0) == '#') { |
| +        continue; |
| +      } |
| +      return trimmed; |
| +    } |
| +    return null; |
| +  } |
| +} |
|
|
| Property changes on: modules\analysis\common\src\java\org\apache\lucene\analysis\pt\RSLPStemmerBase.java
|
| ___________________________________________________________________
|
| Added: svn:eol-style
|
| + native
|
|
|
| Index: modules/analysis/common/src/java/org/apache/lucene/analysis/gl/GalicianStemmer.java
|
| ===================================================================
|
| --- modules/analysis/common/src/java/org/apache/lucene/analysis/gl/GalicianStemmer.java (revision 0)
|
| +++ modules/analysis/common/src/java/org/apache/lucene/analysis/gl/GalicianStemmer.java (revision 0)
|
| @@ -0,0 +1,83 @@
|
| +package org.apache.lucene.analysis.gl; |
| + |
| +/** |
| + * Licensed to the Apache Software Foundation (ASF) under one or more |
| + * contributor license agreements. See the NOTICE file distributed with |
| + * this work for additional information regarding copyright ownership. |
| + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| + * (the "License"); you may not use this file except in compliance with |
| + * the License. You may obtain a copy of the License at |
| + * |
| + * http://www.apache.org/licenses/LICENSE-2.0 |
| + * |
| + * Unless required by applicable law or agreed to in writing, software |
| + * distributed under the License is distributed on an "AS IS" BASIS, |
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + * See the License for the specific language governing permissions and |
| + * limitations under the License. |
| + */ |
| + |
| +import java.util.Map; |
| + |
| +import org.apache.lucene.analysis.pt.RSLPStemmerBase; |
| + |
| +/** |
| + * Stemmer for Galician that follows "Regras do lematizador para o galego". |
| + * <p> |
| + * The steps are read once, when the class is initialized, from the |
| + * <code>galician.rslp</code> resource. |
| + * |
| + * @see RSLPStemmerBase |
| + * @see <a href="http://bvg.udc.es/recursos_lingua/stemming.jsp">Description of rules</a> |
| + */ |
| +public class GalicianStemmer extends RSLPStemmerBase { |
| +  private static final Step plural, unification, adverb, augmentative, noun, verb, vowel; |
| + |
| +  static { |
| +    final Map<String,Step> parsed = parse(GalicianStemmer.class, "galician.rslp"); |
| +    plural = parsed.get("Plural"); |
| +    unification = parsed.get("Unification"); |
| +    adverb = parsed.get("Adverb"); |
| +    augmentative = parsed.get("Augmentative"); |
| +    noun = parsed.get("Noun"); |
| +    verb = parsed.get("Verb"); |
| +    vowel = parsed.get("Vowel"); |
| +  } |
| + |
| +  /** |
| +   * Stems the word held in <code>s</code> in place. |
| +   * |
| +   * @param s buffer, oversized to at least <code>len+1</code> |
| +   * @param len initial valid length of buffer |
| +   * @return new valid length, stemmed |
| +   */ |
| +  public int stem(char s[], int len) { |
| +    assert s.length >= len + 1 : "this stemmer requires an oversized array of at least 1"; |
| + |
| +    int length = plural.apply(s, len); |
| +    length = unification.apply(s, length); |
| +    length = adverb.apply(s, length); |
| + |
| +    // the augmentative step is repeated until it no longer changes the length |
| +    while (true) { |
| +      final int before = length; |
| +      length = augmentative.apply(s, length); |
| +      if (length == before) { |
| +        break; |
| +      } |
| +    } |
| + |
| +    // the verb step only runs when the noun step left the length unchanged |
| +    final int beforeNoun = length; |
| +    length = noun.apply(s, length); |
| +    if (length == beforeNoun) { |
| +      length = verb.apply(s, length); |
| +    } |
| + |
| +    length = vowel.apply(s, length); |
| + |
| +    // RSLG accent removal |
| +    for (int i = 0; i < length; i++) { |
| +      final char c = s[i]; |
| +      if (c == 'á') { |
| +        s[i] = 'a'; |
| +      } else if (c == 'é' || c == 'ê') { |
| +        s[i] = 'e'; |
| +      } else if (c == 'í') { |
| +        s[i] = 'i'; |
| +      } else if (c == 'ó') { |
| +        s[i] = 'o'; |
| +      } else if (c == 'ú') { |
| +        s[i] = 'u'; |
| +      } |
| +    } |
| + |
| +    return length; |
| +  } |
| +} |
|
|
| Property changes on: modules\analysis\common\src\java\org\apache\lucene\analysis\gl\GalicianStemmer.java
|
| ___________________________________________________________________
|
| Added: svn:eol-style
|
| + native
|
|
|
| Index: modules/analysis/common/src/java/org/apache/lucene/analysis/gl/GalicianStemFilter.java
|
| ===================================================================
|
| --- modules/analysis/common/src/java/org/apache/lucene/analysis/gl/GalicianStemFilter.java (revision 0)
|
| +++ modules/analysis/common/src/java/org/apache/lucene/analysis/gl/GalicianStemFilter.java (revision 0)
|
| @@ -0,0 +1,60 @@
|
| +package org.apache.lucene.analysis.gl; |
| + |
| +/** |
| + * Licensed to the Apache Software Foundation (ASF) under one or more |
| + * contributor license agreements. See the NOTICE file distributed with |
| + * this work for additional information regarding copyright ownership. |
| + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| + * (the "License"); you may not use this file except in compliance with |
| + * the License. You may obtain a copy of the License at |
| + * |
| + * http://www.apache.org/licenses/LICENSE-2.0 |
| + * |
| + * Unless required by applicable law or agreed to in writing, software |
| + * distributed under the License is distributed on an "AS IS" BASIS, |
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + * See the License for the specific language governing permissions and |
| + * limitations under the License. |
| + */ |
| + |
| +import java.io.IOException; |
| + |
| +import org.apache.lucene.analysis.TokenFilter; |
| +import org.apache.lucene.analysis.TokenStream; |
| +import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter; |
| +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; |
| +import org.apache.lucene.analysis.tokenattributes.KeywordAttribute; |
| + |
| +/** |
| + * A {@link TokenFilter} that stems Galician words with {@link GalicianStemmer}. |
| + * <p> |
| + * Tokens whose {@link KeywordAttribute} is set are passed through unchanged; |
| + * use a {@link KeywordMarkerFilter} or a custom {@link TokenFilter} placed |
| + * before this {@link TokenStream} to protect terms from stemming. |
| + * </p> |
| + */ |
| +public final class GalicianStemFilter extends TokenFilter { |
| +  private final GalicianStemmer stemmer = new GalicianStemmer(); |
| +  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); |
| +  private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class); |
| + |
| +  public GalicianStemFilter(TokenStream input) { |
| +    super(input); |
| +  } |
| + |
| +  @Override |
| +  public boolean incrementToken() throws IOException { |
| +    if (!input.incrementToken()) { |
| +      return false; |
| +    } |
| +    if (keywordAttr.isKeyword()) { |
| +      return true; |
| +    } |
| +    // this stemmer increases word length by 1: worst case '*çom' -> '*ción' |
| +    final int length = termAtt.length(); |
| +    final char[] buffer = termAtt.resizeBuffer(length + 1); |
| +    final int stemmedLength = stemmer.stem(buffer, length); |
| +    termAtt.setLength(stemmedLength); |
| +    return true; |
| +  } |
| +} |
|
|
| Property changes on: modules\analysis\common\src\java\org\apache\lucene\analysis\gl\GalicianStemFilter.java
|
| ___________________________________________________________________
|
| Added: svn:eol-style
|
| + native
|
|
|
| Index: modules/analysis/common/src/java/org/apache/lucene/analysis/gl/GalicianAnalyzer.java
|
| ===================================================================
|
| --- modules/analysis/common/src/java/org/apache/lucene/analysis/gl/GalicianAnalyzer.java (revision 0)
|
| +++ modules/analysis/common/src/java/org/apache/lucene/analysis/gl/GalicianAnalyzer.java (revision 0)
|
| @@ -0,0 +1,129 @@
|
| +package org.apache.lucene.analysis.gl; |
| + |
| +/** |
| + * Licensed to the Apache Software Foundation (ASF) under one or more |
| + * contributor license agreements. See the NOTICE file distributed with |
| + * this work for additional information regarding copyright ownership. |
| + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| + * (the "License"); you may not use this file except in compliance with |
| + * the License. You may obtain a copy of the License at |
| + * |
| + * http://www.apache.org/licenses/LICENSE-2.0 |
| + * |
| + * Unless required by applicable law or agreed to in writing, software |
| + * distributed under the License is distributed on an "AS IS" BASIS, |
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + * See the License for the specific language governing permissions and |
| + * limitations under the License. |
| + */ |
| + |
| +import java.io.IOException; |
| +import java.io.Reader; |
| +import java.util.Set; |
| + |
| +import org.apache.lucene.analysis.Analyzer; |
| +import org.apache.lucene.analysis.core.LowerCaseFilter; |
| +import org.apache.lucene.analysis.core.StopFilter; |
| +import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter; |
| +import org.apache.lucene.analysis.TokenStream; |
| +import org.apache.lucene.analysis.Tokenizer; |
| +import org.apache.lucene.analysis.standard.StandardFilter; |
| +import org.apache.lucene.analysis.standard.StandardTokenizer; |
| +import org.apache.lucene.analysis.util.CharArraySet; |
| +import org.apache.lucene.analysis.util.StopwordAnalyzerBase; |
| +import org.apache.lucene.analysis.util.WordlistLoader; |
| +import org.apache.lucene.util.Version; |
| + |
| +/** |
| + * {@link Analyzer} for Galician. |
| + */ |
| +public final class GalicianAnalyzer extends StopwordAnalyzerBase { |
| +  /** Unmodifiable copy of the terms that must not be stemmed. */ |
| +  private final Set<?> stemExclusionSet; |
| + |
| +  /** File containing default Galician stopwords. */ |
| +  public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt"; |
| + |
| +  /** |
| +   * Returns an unmodifiable instance of the default stop words set. |
| +   * @return default stop words set. |
| +   */ |
| +  public static Set<?> getDefaultStopSet(){ |
| +    return DefaultSetHolder.DEFAULT_STOP_SET; |
| +  } |
| + |
| +  /** |
| +   * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class |
| +   * accesses the static final set the first time. |
| +   */ |
| +  private static class DefaultSetHolder { |
| +    static final Set<?> DEFAULT_STOP_SET; |
| + |
| +    static { |
| +      try { |
| +        DEFAULT_STOP_SET = WordlistLoader.getWordSet(GalicianAnalyzer.class, |
| +            DEFAULT_STOPWORD_FILE); |
| +      } catch (IOException ex) { |
| +        // default set should always be present as it is part of the |
| +        // distribution (JAR); keep the original failure as the cause so the |
| +        // underlying I/O problem is not lost |
| +        throw new RuntimeException("Unable to load default stopword set", ex); |
| +      } |
| +    } |
| +  } |
| + |
| +  /** |
| +   * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}. |
| +   * |
| +   * @param matchVersion lucene compatibility version |
| +   */ |
| +  public GalicianAnalyzer(Version matchVersion) { |
| +    this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET); |
| +  } |
| + |
| +  /** |
| +   * Builds an analyzer with the given stop words and an empty stem exclusion set. |
| +   * |
| +   * @param matchVersion lucene compatibility version |
| +   * @param stopwords a stopword set |
| +   */ |
| +  public GalicianAnalyzer(Version matchVersion, Set<?> stopwords) { |
| +    this(matchVersion, stopwords, CharArraySet.EMPTY_SET); |
| +  } |
| + |
| +  /** |
| +   * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is |
| +   * provided this analyzer will add a {@link KeywordMarkerFilter} before |
| +   * stemming. |
| +   * |
| +   * @param matchVersion lucene compatibility version |
| +   * @param stopwords a stopword set |
| +   * @param stemExclusionSet a set of terms not to be stemmed |
| +   */ |
| +  public GalicianAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) { |
| +    super(matchVersion, stopwords); |
| +    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy( |
| +        matchVersion, stemExclusionSet)); |
| +  } |
| + |
| +  /** |
| +   * Creates a |
| +   * {@link org.apache.lucene.analysis.util.ReusableAnalyzerBase.TokenStreamComponents} |
| +   * which tokenizes all the text in the provided {@link Reader}. |
| +   * |
| +   * @return A |
| +   *         {@link org.apache.lucene.analysis.util.ReusableAnalyzerBase.TokenStreamComponents} |
| +   *         built from an {@link StandardTokenizer} filtered with |
| +   *         {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter} |
| +   *         , {@link KeywordMarkerFilter} if a stem exclusion set is |
| +   *         provided and {@link GalicianStemFilter}. |
| +   */ |
| +  @Override |
| +  protected TokenStreamComponents createComponents(String fieldName, |
| +      Reader reader) { |
| +    final Tokenizer source = new StandardTokenizer(matchVersion, reader); |
| +    TokenStream result = new StandardFilter(matchVersion, source); |
| +    result = new LowerCaseFilter(matchVersion, result); |
| +    result = new StopFilter(matchVersion, result, stopwords); |
| +    // keyword marking is only inserted when there is something to protect |
| +    if(!stemExclusionSet.isEmpty()) |
| +      result = new KeywordMarkerFilter(result, stemExclusionSet); |
| +    result = new GalicianStemFilter(result); |
| +    return new TokenStreamComponents(source, result); |
| +  } |
| +} |
|
|
| Property changes on: modules\analysis\common\src\java\org\apache\lucene\analysis\gl\GalicianAnalyzer.java
|
| ___________________________________________________________________
|
| Added: svn:eol-style
|
| + native
|
|
|
| Index: modules/analysis/common/src/java/org/apache/lucene/analysis/gl/package.html
|
| ===================================================================
|
| --- modules/analysis/common/src/java/org/apache/lucene/analysis/gl/package.html (revision 0)
|
| +++ modules/analysis/common/src/java/org/apache/lucene/analysis/gl/package.html (revision 0)
|
| @@ -0,0 +1,22 @@
|
| +<!doctype html public "-//w3c//dtd html 4.0 transitional//en"> |
| +<!-- |
| + Licensed to the Apache Software Foundation (ASF) under one or more |
| + contributor license agreements. See the NOTICE file distributed with |
| + this work for additional information regarding copyright ownership. |
| + The ASF licenses this file to You under the Apache License, Version 2.0 |
| + (the "License"); you may not use this file except in compliance with |
| + the License. You may obtain a copy of the License at |
| + |
| + http://www.apache.org/licenses/LICENSE-2.0 |
| + |
| + Unless required by applicable law or agreed to in writing, software |
| + distributed under the License is distributed on an "AS IS" BASIS, |
| + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + See the License for the specific language governing permissions and |
| + limitations under the License. |
| +--> |
| +<html><head></head> |
| +<body> |
| +Analyzer for Galician. |
| +</body> |
| +</html> |
|
|
| Property changes on: modules\analysis\common\src\java\org\apache\lucene\analysis\gl\package.html
|
| ___________________________________________________________________
|
| Added: svn:eol-style
|
| + native
|
|
|
| Index: modules/analysis/common/src/java/org/apache/lucene/analysis/util/StemmerUtil.java
|
| ===================================================================
|
| --- modules/analysis/common/src/java/org/apache/lucene/analysis/util/StemmerUtil.java (revision 1054344)
|
| +++ modules/analysis/common/src/java/org/apache/lucene/analysis/util/StemmerUtil.java (working copy)
|
| @@ -57,6 +57,25 @@
|
| } |
| |
| /** |
| + * Returns true if the character array ends with the suffix. |
| + * |
| + * @param s Input Buffer |
| + * @param len length of input buffer |
| + * @param suffix Suffix string to test |
| + * @return true if <code>s</code> ends with <code>suffix</code> |
| + */ |
| +  public static boolean endsWith(char s[], int len, char suffix[]) { |
| +    if (suffix.length > len) |
| +      return false; |
| +    // index in s of the character that has to equal suffix[0] |
| +    final int start = len - suffix.length; |
| +    for (int k = suffix.length - 1; k >= 0; k--) |
| +      if (suffix[k] != s[start + k]) |
| +        return false; |
| +    return true; |
| +  } |
| + |
| + /** |
| * Delete a character in-place |
| * |
| * @param s Input Buffer |
| Index: modules/analysis/common/src/resources/org/apache/lucene/analysis/pt/portuguese.rslp
|
| ===================================================================
|
| --- modules/analysis/common/src/resources/org/apache/lucene/analysis/pt/portuguese.rslp (revision 0)
|
| +++ modules/analysis/common/src/resources/org/apache/lucene/analysis/pt/portuguese.rslp (revision 0)
|
| @@ -0,0 +1,456 @@
|
| +# Steps file for the RSLP stemmer.
|
| +
|
| +# Step 1: Plural Reduction
|
| +{ "Plural", 3, 1, {"s"},
|
| + # bons -> bom
|
| + {"ns",1,"m"},
|
| + # balões -> balão
|
| + {"ões",3,"ão"},
|
| + # capitães -> capitão
|
| + {"ães",1,"ão",{"mães"}},
|
| + # normais -> normal
|
| + {"ais",1,"al",{"cais","mais"}},
|
| + # papéis -> papel
|
| + {"éis",2,"el"},
|
| + # amáveis -> amável
|
| + {"eis",2,"el"},
|
| + # lençóis -> lençol
|
| + {"óis",2,"ol"},
|
| + # barris -> barril
|
| + {"is",2,"il",{"lápis","cais","mais","crúcis","biquínis","pois","depois","dois","leis"}},
|
| + # males -> mal
|
| + {"les",3,"l"},
|
| + # mares -> mar
|
| + {"res",3,"r", {"árvores"}},
|
| + # casas -> casa
|
| + {"s",2,"",{"aliás","pires","lápis","cais","mais","mas","menos","férias","fezes","pêsames","crúcis","gás","atrás","moisés","através","convés","ês","país","após","ambas","ambos","messias", "depois"}}};
|
| +
|
| +# Step 2: Adverb Reduction
|
| +{ "Adverb", 0, 0, {},
|
| + # felizmente -> feliz
|
| + {"mente",4,"",{"experimente"}}};
|
| +
|
| +# Step 3: Feminine Reduction
|
| +{ "Feminine", 3, 1, {"a","ã"},
|
| + # chefona -> chefão
|
| + {"ona",3,"ão",{"abandona","lona","iona","cortisona","monótona","maratona","acetona","detona","carona"}},
|
| + # vilã -> vilão
|
| + {"ã",2,"ão",{"amanhã","arapuã","fã","divã"}},
|
| + # professora -> professor
|
| + {"ora",3,"or"},
|
| + # americana -> americano
|
| + {"na",4,"no",{"carona","abandona","lona","iona","cortisona","monótona","maratona","acetona","detona","guiana","campana","grana","caravana","banana","paisana"}},
|
| + # sozinha -> sozinho
|
| + {"inha",3,"inho",{"rainha","linha","minha"}},
|
| + # inglesa -> inglês
|
| + {"esa",3,"ês",{"mesa","obesa","princesa","turquesa","ilesa","pesa","presa"}},
|
| + # famosa -> famoso
|
| + {"osa",3,"oso",{"mucosa","prosa"}},
|
| + # maníaca -> maníaco
|
| + {"íaca",3,"íaco"},
|
| + # prática -> prático
|
| + {"ica",3,"ico",{"dica"}},
|
| + # cansada -> cansado
|
| + {"ada",2,"ado",{"pitada"}},
|
| + # mantida -> mantido
|
| + {"ida",3,"ido",{"vida","dúvida"}},
|
| + {"ída",3,"ido",{"recaída","saída"}},
|
| + # prima -> primo
|
| + {"ima",3,"imo",{"vítima"}},
|
| + # passiva -> passivo
|
| + {"iva",3,"ivo",{"saliva","oliva"}},
|
| + # primeira -> primeiro
|
| + {"eira",3,"eiro",{"beira","cadeira","frigideira","bandeira","feira","capoeira","barreira","fronteira","besteira","poeira"}}};
|
| +
|
| +# Step 4: Augmentative/Diminutive Reduction
|
| +{ "Augmentative", 0, 1, {},
|
| + # cansadíssimo -> cansad
|
| + {"díssimo",5},
|
| + # amabilíssimo -> ama
|
| + {"abilíssimo",5},
|
| + # fortíssimo -> fort
|
| + {"íssimo",3},
|
| + {"ésimo",3},
|
| + # chiquérrimo -> chiqu
|
| + {"érrimo",4},
|
| + # pezinho -> pe
|
| + {"zinho",2},
|
| + # maluquinho -> maluc
|
| + {"quinho",4,"c"},
|
| + # amiguinho -> amig
|
| + {"uinho",4},
|
| + # cansadinho -> cansad
|
| + {"adinho",3},
|
| + # carrinho -> carr
|
| + {"inho",3,"",{"caminho","cominho"}},
|
| + # grandalhão -> grand
|
| + {"alhão",4},
|
| + # dentuça -> dent
|
| + {"uça",4},
|
| + # ricaço -> ric
|
| + {"aço",4,"",{"antebraço"}},
|
| + {"aça",4},
|
| + # cansadão -> cans
|
| + {"adão",4},
|
| + {"idão",4},
|
| + # corpázio -> corp
|
| + {"ázio",3,"",{"topázio"}},
|
| + # pratarraz -> prat
|
| + {"arraz",4},
|
| + {"zarrão",3},
|
| + {"arrão",4},
|
| + # bocarra -> boc
|
| + {"arra",3},
|
| + # calorzão -> calor
|
| + {"zão",2,"",{"coalizão"}},
|
| + # meninão -> menin
|
| + {"ão",3,"",{"camarão","chimarrão","canção","coração","embrião","grotão","glutão","ficção","fogão","feição","furacão","gamão","lampião","leão","macacão","nação","órfão","orgão","patrão","portão","quinhão","rincão","tração","falcão","espião","mamão","folião","cordão","aptidão","campeão","colchão","limão","leilão","melão","barão","milhão","bilhão","fusão","cristão","ilusão","capitão","estação","senão"}}};
|
| +
|
| +# Step 5: Noun Suffix Reduction
|
| +{ "Noun", 0, 0, {},
|
| + # existencialista -> exist
|
| + {"encialista",4},
|
| + # minimalista -> minim
|
| + {"alista",5},
|
| + # contagem -> cont
|
| + {"agem",3,"",{"coragem","chantagem","vantagem","carruagem"}},
|
| + # gerenciamento -> gerenc
|
| + {"iamento",4},
|
| + # monitoramento -> monitor
|
| + {"amento",3,"",{"firmamento","fundamento","departamento"}},
|
| + # nascimento -> nasc
|
| + {"imento",3},
|
| + {"mento",6,"",{"firmamento","elemento","complemento","instrumento","departamento"}},
|
| + # comercializado -> comerci
|
| + {"alizado",4},
|
| + # traumatizado -> traum
|
| + {"atizado",4},
|
| + {"tizado",4,"",{"alfabetizado"}},
|
| + # alfabetizado -> alfabet
|
| + {"izado",5,"",{"organizado","pulverizado"}},
|
| + # associativo -> associ
|
| + {"ativo",4,"",{"pejorativo","relativo"}},
|
| + # contraceptivo -> contracep
|
| + {"tivo",4,"",{"relativo"}},
|
| + # esportivo -> esport
|
| + {"ivo",4,"",{"passivo","possessivo","pejorativo","positivo"}},
|
| + # abalado -> abal
|
| + {"ado",2,"",{"grado"}},
|
| + # impedido -> imped
|
| + {"ido",3,"",{"cândido","consolido","rápido","decido","tímido","duvido","marido"}},
|
| + # ralador -> ral
|
| + {"ador",3},
|
| + # entendedor -> entend
|
| + {"edor",3},
|
| + # cumpridor -> cumpr
|
| + {"idor",4,"",{"ouvidor"}},
|
| + {"dor",4,"",{"ouvidor"}},
|
| + {"sor",4,"",{"assessor"}},
|
| + {"atoria",5},
|
| + {"tor",3,"",{"benfeitor","leitor","editor","pastor","produtor","promotor","consultor"}},
|
| + {"or",2,"",{"motor","melhor","redor","rigor","sensor","tambor","tumor","assessor","benfeitor","pastor","terior","favor","autor"}},
|
| + # comparabilidade -> compar
|
| + {"abilidade",5},
|
| + # abolicionista -> abol
|
| + {"icionista",4},
|
| + # intervencionista -> interven
|
| + {"cionista",5},
|
| + {"ionista",5},
|
| + {"ionar",5},
|
| + # profissional -> profiss
|
| + {"ional",4},
|
| + # referência -> refer
|
| + {"ência",3},
|
| + # repugnância -> repugn
|
| + {"ância",4,"",{"ambulância"}},
|
| + # abatedouro -> abat
|
| + {"edouro",3},
|
| + # fofoqueiro -> fofoc
|
| + {"queiro",3,"c"},
|
| + {"adeiro",4,"",{"desfiladeiro"}},
|
| + # brasileiro -> brasil
|
| + {"eiro",3,"",{"desfiladeiro","pioneiro","mosteiro"}},
|
| + {"uoso",3},
|
| + # gostoso -> gost
|
| + {"oso",3,"",{"precioso"}},
|
| + # comercializaç -> comerci
|
| + {"alizaç",5},
|
| + {"atizaç",5},
|
| + {"tizaç",5},
|
| + {"izaç",5,"",{"organizaç"}},
|
| + # alegaç -> aleg
|
| + {"aç",3,"",{"equaç","relaç"}},
|
| + # aboliç -> abol
|
| + {"iç",3,"",{"eleiç"}},
|
| + # anedotário -> anedot
|
| + {"ário",3,"",{"voluntário","salário","aniversário","diário","lionário","armário"}},
|
| + {"atório",3},
|
| + {"rio",5,"",{"voluntário","salário","aniversário","diário","compulsório","lionário","próprio","stério","armário"}},
|
| + # ministério -> minist
|
| + {"ério",6},
|
| + # chinês -> chin
|
| + {"ês",4},
|
| + # beleza -> bel
|
| + {"eza",3},
|
| + # rigidez -> rigid
|
| + {"ez",4},
|
| + # parentesco -> parent
|
| + {"esco",4},
|
| + # ocupante -> ocup
|
| + {"ante",2,"",{"gigante","elefante","adiante","possante","instante","restaurante"}},
|
| + # bombástico -> bomb
|
| + {"ástico",4,"",{"eclesiástico"}},
|
| + {"alístico",3},
|
| + {"áutico",4},
|
| + {"êutico",4},
|
| + {"tico",3,"",{"político","eclesiástico","diagnostico","prático","doméstico","diagnóstico","idêntico","alopático","artístico","autêntico","eclético","crítico","critico"}},
|
| + # polêmico -> polêm
|
| + {"ico",4,"",{"tico","público","explico"}},
|
| + # produtividade -> produt
|
| + {"ividade",5},
|
| + # profundidade -> profund
|
| + {"idade",4,"",{"autoridade","comunidade"}},
|
| + # aposentadoria -> aposentad
|
| + {"oria",4,"",{"categoria"}},
|
| + # existencial -> exist
|
| + {"encial",5},
|
| + # artista -> art
|
| + {"ista",4},
|
| + {"auta",5},
|
| + # maluquice -> maluc
|
| + {"quice",4,"c"},
|
| + # chatice -> chat
|
| + {"ice",4,"",{"cúmplice"}},
|
| + # demoníaco -> demon
|
| + {"íaco",3},
|
| + # decorrente -> decorr
|
| + {"ente",4,"",{"freqüente","alimente","acrescente","permanente","oriente","aparente"}},
|
| + {"ense",5},
|
| + # criminal -> crim
|
| + {"inal",3},
|
| + # americano -> americ
|
| + {"ano",4},
|
| + # amável -> am
|
| + {"ável",2,"",{"afável","razoável","potável","vulnerável"}},
|
| + # combustível -> combust
|
| + {"ível",3,"",{"possível"}},
|
| + {"vel",5,"",{"possível","vulnerável","solúvel"}},
|
| + {"bil",3,"vel"},
|
| + # cobertura -> cobert
|
| + {"ura",4,"",{"imatura","acupuntura","costura"}},
|
| + {"ural",4},
|
| + # consensual -> consens
|
| + {"ual",3,"",{"bissexual","virtual","visual","pontual"}},
|
| + # mundial -> mund
|
| + {"ial",3},
|
| + # experimental -> experiment
|
| + {"al",4,"",{"afinal","animal","estatal","bissexual","desleal","fiscal","formal","pessoal","liberal","postal","virtual","visual","pontual","sideral","sucursal"}},
|
| + {"alismo",4},
|
| + {"ivismo",4},
|
| + {"ismo",3,"",{"cinismo"}}};
|
| +
|
| +# Step 6: Verb Suffix Reduction
|
| +{ "Verb", 0, 0, {},
|
| + # cantaríamo -> cant
|
| + {"aríamo",2},
|
| + # cantássemo -> cant
|
| + {"ássemo",2},
|
| + # beberíamo -> beb
|
| + {"eríamo",2},
|
| + # bebêssemo -> beb
|
| + {"êssemo",2},
|
| + # partiríamo -> part
|
| + {"iríamo",3},
|
| + # partíssemo -> part
|
| + {"íssemo",3},
|
| + # cantáramo -> cant
|
| + {"áramo",2},
|
| + # cantárei -> cant
|
| + {"árei",2},
|
| + # cantaremo -> cant
|
| + {"aremo",2},
|
| + # cantariam -> cant
|
| + {"ariam",2},
|
| + # cantaríei -> cant
|
| + {"aríei",2},
|
| + # cantássei -> cant
|
| + {"ássei",2},
|
| + # cantassem -> cant
|
| + {"assem",2},
|
| + # cantávamo -> cant
|
| + {"ávamo",2},
|
| + # bebêramo -> beb
|
| + {"êramo",3},
|
| + # beberemo -> beb
|
| + {"eremo",3},
|
| + # beberiam -> beb
|
| + {"eriam",3},
|
| + # beberíei -> beb
|
| + {"eríei",3},
|
| + # bebêssei -> beb
|
| + {"êssei",3},
|
| + # bebessem -> beb
|
| + {"essem",3},
|
| + # partíramo -> part
|
| + {"íramo",3},
|
| + # partiremo -> part
|
| + {"iremo",3},
|
| + # partiriam -> part
|
| + {"iriam",3},
|
| + # partiríei -> part
|
| + {"iríei",3},
|
| + # partíssei -> part
|
| + {"íssei",3},
|
| + # partissem -> part
|
| + {"issem",3},
|
| + # cantando -> cant
|
| + {"ando",2},
|
| + # bebendo -> beb
|
| + {"endo",3},
|
| + # partindo -> part
|
| + {"indo",3},
|
| + # propondo -> prop
|
| + {"ondo",3},
|
| + # cantaram -> cant
|
| + {"aram",2},
|
| + {"arão",2},
|
| + # cantarde -> cant
|
| + {"arde",2},
|
| + # cantarei -> cant
|
| + {"arei",2},
|
| + # cantarem -> cant
|
| + {"arem",2},
|
| + # cantaria -> cant
|
| + {"aria",2},
|
| + # cantarmo -> cant
|
| + {"armo",2},
|
| + # cantasse -> cant
|
| + {"asse",2},
|
| + # cantaste -> cant
|
| + {"aste",2},
|
| + # cantavam -> cant
|
| + {"avam",2,"",{"agravam"}},
|
| + # cantávei -> cant
|
| + {"ávei",2},
|
| + # beberam -> beb
|
| + {"eram",3},
|
| + {"erão",3},
|
| + # beberde -> beb
|
| + {"erde",3},
|
| + # beberei -> beb
|
| + {"erei",3},
|
| + # bebêrei -> beb
|
| + {"êrei",3},
|
| + # beberem -> beb
|
| + {"erem",3},
|
| + # beberia -> beb
|
| + {"eria",3},
|
| + # bebermo -> beb
|
| + {"ermo",3},
|
| + # bebesse -> beb
|
| + {"esse",3},
|
| + # bebeste -> beb
|
| + {"este",3,"",{"faroeste","agreste"}},
|
| + # bebíamo -> beb
|
| + {"íamo",3},
|
| + # partiram -> part
|
| + {"iram",3},
|
| + # concluíram -> conclu
|
| + {"íram",3},
|
| + {"irão",2},
|
| + # partirde -> part
|
| + {"irde",2},
|
| + # partirei -> part
|
| + {"irei",3,"",{"admirei"}},
|
| + # partirem -> part
|
| + {"irem",3,"",{"adquirem"}},
|
| + # partiria -> part
|
| + {"iria",3},
|
| + # partirmo -> part
|
| + {"irmo",3},
|
| + # partisse -> part
|
| + {"isse",3},
|
| + # partiste -> part
|
| + {"iste",4},
|
| + {"iava",4,"",{"ampliava"}},
|
| + # cantamo -> cant
|
| + {"amo",2},
|
| + {"iona",3},
|
| + # cantara -> cant
|
| + {"ara",2,"",{"arara","prepara"}},
|
| + # cantará -> cant
|
| + {"ará",2,"",{"alvará"}},
|
| + # cantare -> cant
|
| + {"are",2,"",{"prepare"}},
|
| + # cantava -> cant
|
| + {"ava",2,"",{"agrava"}},
|
| + # cantemo -> cant
|
| + {"emo",2},
|
| + # bebera -> beb
|
| + {"era",3,"",{"acelera","espera"}},
|
| + # beberá -> beb
|
| + {"erá",3},
|
| + # bebere -> beb
|
| + {"ere",3,"",{"espere"}},
|
| + # bebiam -> beb
|
| + {"iam",3,"",{"enfiam","ampliam","elogiam","ensaiam"}},
|
| + # bebíei -> beb
|
| + {"íei",3},
|
| + # partimo -> part
|
| + {"imo",3,"",{"reprimo","intimo","íntimo","nimo","queimo","ximo"}},
|
| + # partira -> part
|
| + {"ira",3,"",{"fronteira","sátira"}},
|
| + {"ído",3},
|
| + # partirá -> part
|
| + {"irá",3},
|
| + {"tizar",4,"",{"alfabetizar"}},
|
| + {"izar",5,"",{"organizar"}},
|
| + {"itar",5,"",{"acreditar","explicitar","estreitar"}},
|
| + # partire -> part
|
| + {"ire",3,"",{"adquire"}},
|
| + # compomo -> comp
|
| + {"omo",3},
|
| + # cantai -> cant
|
| + {"ai",2},
|
| + # cantam -> cant
|
| + {"am",2},
|
| + # barbear -> barb
|
| + {"ear",4,"",{"alardear","nuclear"}},
|
| + # cantar -> cant
|
| + {"ar",2,"",{"azar","bazaar","patamar"}},
|
| + # cheguei -> cheg
|
| + {"uei",3},
|
| + {"uía",5,"u"},
|
| + # cantei -> cant
|
| + {"ei",3},
|
| + {"guem",3,"g"},
|
| + # cantem -> cant
|
| + {"em",2,"",{"alem","virgem"}},
|
| + # beber -> beb
|
| + {"er",2,"",{"éter","pier"}},
|
| + # bebeu -> beb
|
| + {"eu",3,"",{"chapeu"}},
|
| + # bebia -> beb
|
| + {"ia",3,"",{"estória","fatia","acia","praia","elogia","mania","lábia","aprecia","polícia","arredia","cheia","ásia"}},
|
| + # partir -> part
|
| + {"ir",3,"",{"freir"}},
|
| + # partiu -> part
|
| + {"iu",3},
|
| + {"eou",5},
|
| + # chegou -> cheg
|
| + {"ou",3},
|
| + # bebi -> beb
|
| + {"i",3}};
|
| +
|
| +# Step 7: Vowel Removal
|
| +{ "Vowel", 0, 0, {},
|
| + {"bil",2,"vel"},
|
| + {"gue",2,"g",{"gangue","jegue"}},
|
| + {"á",3},
|
| + {"ê",3,"",{"bebê"}},
|
| + # menina -> menin
|
| + {"a",3,"",{"ásia"}},
|
| + # grande -> grand
|
| + {"e",3},
|
| + # menino -> menin
|
| + {"o",3,"",{"ão"}}};
|
| Index: modules/analysis/common/src/resources/org/apache/lucene/analysis/gl/stopwords.txt
|
| ===================================================================
|
| --- modules/analysis/common/src/resources/org/apache/lucene/analysis/gl/stopwords.txt (revision 0)
|
| +++ modules/analysis/common/src/resources/org/apache/lucene/analysis/gl/stopwords.txt (revision 0)
|
| @@ -0,0 +1,161 @@
|
| +# galician stopwords |
| +a |
| +aínda |
| +alí |
| +aquel |
| +aquela |
| +aquelas |
| +aqueles |
| +aquilo |
| +aquí |
| +ao |
| +aos |
| +as |
| +así |
| +á |
| +ben |
| +cando |
| +che |
| +co |
| +coa |
| +comigo |
| +con |
| +connosco |
| +contigo |
| +convosco |
| +coas |
| +cos |
| +cun |
| +cuns |
| +cunha |
| +cunhas |
| +da |
| +dalgunha |
| +dalgunhas |
| +dalgún |
| +dalgúns |
| +das |
| +de |
| +del |
| +dela |
| +delas |
| +deles |
| +desde |
| +deste |
| +do |
| +dos |
| +dun |
| +duns |
| +dunha |
| +dunhas |
| +e |
| +el |
| +ela |
| +elas |
| +eles |
| +en |
| +era |
| +eran |
| +esa |
| +esas |
| +ese |
| +eses |
| +esta |
| +estar |
| +estaba |
| +está |
| +están |
| +este |
| +estes |
| +estiven |
| +estou |
| +eu |
| +é |
| +facer |
| +foi |
| +foron |
| +fun |
| +había |
| +hai |
| +iso |
| +isto |
| +la |
| +las |
| +lle |
| +lles |
| +lo |
| +los |
| +mais |
| +me |
| +meu |
| +meus |
| +min |
| +miña |
| +miñas |
| +moi |
| +na |
| +nas |
| +neste |
| +nin |
| +no |
| +non |
| +nos |
| +nosa |
| +nosas |
| +noso |
| +nosos |
| +nós |
| +nun |
| +nunha |
| +nuns |
| +nunhas |
| +o |
| +os |
| +ou |
| +ó |
| +ós |
| +para |
| +pero |
| +pode |
| +pois |
| +pola |
| +polas |
| +polo |
| +polos |
| +por |
| +que |
| +se |
| +senón |
| +ser |
| +seu |
| +seus |
| +sexa |
| +sido |
| +sobre |
| +súa |
| +súas |
| +tamén |
| +tan |
| +te |
| +ten |
| +teñen |
| +teño |
| +ter |
| +teu |
| +teus |
| +ti |
| +tido |
| +tiña |
| +tiven |
| +túa |
| +túas |
| +un |
| +unha |
| +unhas |
| +uns |
| +vos |
| +vosa |
| +vosas |
| +voso |
| +vosos |
| +vós |
|
|
| Property changes on: modules\analysis\common\src\resources\org\apache\lucene\analysis\gl\stopwords.txt
|
| ___________________________________________________________________
|
| Added: svn:eol-style
|
| + native
|
|
|
| Index: modules/analysis/common/src/resources/org/apache/lucene/analysis/gl/galician.rslp
|
| ===================================================================
|
| --- modules/analysis/common/src/resources/org/apache/lucene/analysis/gl/galician.rslp (revision 0)
|
| +++ modules/analysis/common/src/resources/org/apache/lucene/analysis/gl/galician.rslp (revision 0)
|
| @@ -0,0 +1,647 @@
|
| +# Steps file for the RSLP stemmer.
|
| +
|
| +# Step 1: Plural Reduction
|
| +{ "Plural", 3, 1, {"s"},
|
| + # bons -> bon
|
| + {"ns",1,"n",{"luns","furatapóns","furatapons"}},
|
| + # xamós -> xamón
|
| + {"ós",3,"ón"},
|
| + # balões -> balón
|
| + {"ões",3,"ón"},
|
| + # capitães -> capitão
|
| + {"ães",1,"ão",{"mães","magalhães"}},
|
| + # normais -> normal
|
| + {"ais",2,"al",{"cais","tais","mais","pais","ademais"}},
|
| + {"áis",2,"al",{"cáis","táis", "máis", "páis", "ademáis"}},
|
| + # papéis -> papel
|
| + {"éis",2,"el"},
|
| + # posíbeis -> posíbel
|
| + {"eis",2,"el"},
|
| + # espanhóis -> espanhol
|
| + {"óis",2,"ol",{"escornabóis"}},
|
| + # caracois -> caracol
|
| + {"ois",2,"ol",{"escornabois"}},
|
| + # cadrís -> cadril
|
| + {"ís",2,"il",{"país"}},
|
| + # cadris -> cadril
|
| + {"is",2,"il",{"menfis","pais","kinguis"}},
|
| + # males -> mal
|
| + {"les",2,"l",{"ingles","marselles","montreales","senegales","manizales","móstoles","nápoles"}},
|
| + # mares -> mar
|
| + {"res",3,"r",{"petres","henares","cáceres","baleares","linares","londres","mieres","miraflores","mércores","venres", "pires"}},
|
| + # luces -> luz
|
| + {"ces",2,"z"},
|
| + # luzes -> luz
|
| + {"zes",2,"z"},
|
| + # leises -> lei
|
| + {"ises",3,"z"},
|
| + # animás -> animal
|
| + {"ás",1,"al",{"más"}},
|
| + # gases -> gas
|
| + {"ses",2,"s"},
|
| + # casas -> casa
|
| + {"s",2,"",{"barbadés","barcelonés","cantonés","gabonés","llanés","medinés","escocés","escocês","francês","barcelonês","cantonês","macramés","reves","barcelones","cantones","gabones","llanes","magallanes","medines","escoces","frances","xoves","martes","aliás","pires","lápis","cais","mais","mas","menos","férias","pêsames","crúcis","país","cangas","atenas","asturias","canarias","filipinas","honduras","molucas","caldas","mascareñas","micenas","covarrubias","psoas","óculos","nupcias","xoves","martes","llanes"}}};
|
| +
|
| +{ "Unification", 0, 0, {},
|
| + # cansadíssimo -> cansadísimo
|
| + {"íssimo",5,"ísimo"},
|
| + # cansadíssima -> cansadísima
|
| + {"íssima",5,"ísima"},
|
| + # homaço -> homazo
|
| + {"aço",4,"azo"},
|
| + # mulheraça -> mulheraza
|
| + {"aça",4,"aza"},
|
| + # xentuça -> xentuza
|
| + {"uça",4,"uza"},
|
| + # manilhar -> manillar
|
| + {"lhar",2,"llar"},
|
| + # colher -> coller
|
| + {"lher",2,"ller"},
|
| + # melhor -> mellor
|
| + {"lhor",2,"llor"},
|
| + # alho -> allo
|
| + {"lho",1,"llo"},
|
| + # linhar -> liñar
|
| + {"nhar",2,"ñar"},
|
| + # penhor -> peñor
|
| + {"nhor",2,"ñor"},
|
| + # anho -> año
|
| + {"nho",1,"ño"},
|
| + # cunha -> cuña
|
| + {"nha",1,"ña"},
|
| + # hospitalário -> hospitalario
|
| + {"ário",3,"ario"},
|
| + # bibliotecária -> bibliotecaria
|
| + {"ária",3,"aria"},
|
| + # agradable -> agradábel
|
| + {"able",2,"ábel"},
|
| + # agradável -> agradábel
|
| + {"ável",2,"ábel"},
|
| + # imposible -> imposíbel
|
| + {"ible",2,"íbel"},
|
| + # imposível -> imposíbel
|
| + {"ível",2,"íbel"},
|
| + # imposiçom -> imposición
|
| + {"çom",2,"ción"},
|
| + # garagem -> garaxe
|
| + {"agem",2,"axe"},
|
| + # garage -> garaxe
|
| + {"age",2,"axe"},
|
| + # impressão -> impressón
|
| + {"ão",3,"ón"},
|
| + # irmao -> irmán
|
| + {"ao",1,"án"},
|
| + # irmau -> irmán
|
| + {"au",1,"án"},
|
| + # garrafom -> garrafón
|
| + {"om",3,"ón"},
|
| + # cantem -> canten
|
| + {"m",2,"n"}};
|
| +
|
| +{ "Adverb", 0, 0, {},
|
| + # felizmente -> feliz
|
| + {"mente",4,"",{"experimente","vehemente","sedimente"}}};
|
| +
|
| +{ "Augmentative", 0, 1, {},
|
| + # cansadísimo -> cansad
|
| + {"dísimo",5},
|
| + # cansadísima -> cansad
|
| + {"dísima",5},
|
| + # amabilísimo -> ama
|
| + {"bilísimo",3},
|
| + # amabilísima -> ama
|
| + {"bilísima",3},
|
| + # fortísimo -> fort
|
| + {"ísimo",3},
|
| + # fortísima -> fort
|
| + {"ísima",3},
|
| + # centésimo -> cent
|
| + {"ésimo",3},
|
| + # centésima -> cent
|
| + {"ésima",3},
|
| + # paupérrimo -> paup
|
| + {"érrimo",4},
|
| + # paupérrima -> paup
|
| + {"érrima",4},
|
| + # charlatana -> charlat
|
| + {"ana",2,"",{"argana","banana","choupana","espadana","faciana","iguana","lantana","macana","membrana","mesana","nirvana","obsidiana","palangana","pavana","persiana","pestana","porcelana","pseudomembrana","roldana","sábana","salangana","saragana","ventana"}},
|
| + # charlatán -> charlat
|
| + {"án",3,"",{"ademán","bardán","barregán","corricán","curricán","faisán","furacán","fustán","gabán","gabián","galán","gañán","lavacán","mazán","mourán","rabadán","serán","serrán","tabán","titán","tobogán","verán","volcán","volován"}},
|
| + # homazo -> hom
|
| + {"azo",4,"",{"abrazo","espazo","andazo","bagazo","balazo","bandazo","cachazo","carazo","denazo","engazo","famazo","lampreazo","pantocazo","pedazo","preñazo","regazo","ribazo","sobrazo","terrazo","trompazo"}},
|
| + # mulleraza -> muller
|
| + {"aza",3,"",{"alcarraza","ameaza","baraza","broucaza","burgaza","cabaza","cachaza","calaza","carpaza","carraza","coiraza","colmaza","fogaza","famaza","labaza","liñaza","melaza","mordaza","paraza","pinaza","rabaza","rapaza","trancaza"}},
|
| + # cascallo -> casc
|
| + {"allo",4,"",{"traballo"}},
|
| + # xentalla -> xent
|
| + {"alla",4},
|
| + # bocarra -> boc
|
| + {"arra",3,"",{"cigarra","cinzarra"}},
|
| + # medicastro -> medic
|
| + {"astro",3,"",{"balastro","bimbastro","canastro","retropilastro"}},
|
| + # poetastra -> poet
|
| + {"astra",3,"",{"banastra","canastra","contrapilastra","piastra","pilastra"}},
|
| + # corpázio -> corp
|
| + {"ázio",3,"",{"topázio"}},
|
| + # soutelo -> sout
|
| + {"elo",4,"",{"bacelo","barrelo","bicarelo","biquelo","boquelo","botelo","bouquelo","cacarelo","cachelo","cadrelo","campelo","candelo","cantelo","carabelo","carambelo","caramelo","cercelo","cerebelo","chocarelo","coitelo","conchelo","corbelo","cotobelo","couselo","destelo","desvelo","esfácelo","fandelo","fardelo","farelo","farnelo","flabelo","ganchelo","garfelo","involucelo","mantelo","montelo","outerelo","padicelo","pesadelo","pinguelo","piquelo","rampelo","rastrelo","restelo","tornecelo","trabelo","restrelo","portelo","ourelo","zarapelo"}},
|
| + # avioneta -> avion
|
| + {"eta",3,"",{"arqueta","atleta","avoceta","baioneta","baldeta","banqueta","barraganeta","barreta","borleta","buceta","caceta","calceta","caldeta","cambeta","canaleta","caneta","carreta","cerceta","chaparreta","chapeta","chareta","chincheta","colcheta","cometa","corbeta","corveta","cuneta","desteta","espeta","espoleta","estafeta","esteta","faceta","falanxeta","frasqueta","gaceta","gabeta","galleta","garabeta","gaveta","glorieta","lagareta","lambeta","lanceta","libreta","maceta","macheta","maleta","malleta","mareta","marreta","meseta","mofeta","muleta","peseta","planeta","raqueta","regreta","saqueta","veleta","vendeta","viñeta"}},
|
| + # guapete -> guap
|
| + {"ete",3,"",{"alfinete","ariete","bacinete","banquete","barallete","barrete","billete","binguelete","birrete","bonete","bosquete","bufete","burlete","cabalete","cacahuete","cavinete","capacete","carrete","casarete","casete","chupete","clarinete","colchete","colete","capete","curupete","disquete","estilete","falsete","ferrete","filete","gallardete","gobelete","inglete","machete","miquelete","molete","mosquete","piquete","ribete","rodete","rolete","roquete","sorvete","vedete","vendete"}},
|
| + # práctica -> práct
|
| + {"ica",3,"",{"andarica","botánica","botica","dialéctica","dinámica","física","formica","gráfica","marica","túnica"}},
|
| + # práctico -> práct
|
| + {"ico",3,"",{"conico","acetifico","acidifico"}},
|
| + # trapexo -> trap
|
| + {"exo",3,"",{"arpexo","arquexo","asexo","axexo","azulexo","badexo","bafexo","bocexo","bosquexo","boubexo","cacarexo","carrexo","cascarexo","castrexo","convexo","cotexo","desexo","despexo","forcexo","gabexo","gargarexo","gorgolexo","inconexo","manexo","merexo","narnexo","padexo","patexo","sopexo","varexo"}},
|
| + {"exa",3,"",{"airexa","bandexa","carrexa","envexa","igrexa","larexa","patexa","presexa","sobexa"}},
|
| + # multidão -> mult
|
| + {"idão",3},
|
| + # pequeniño -> pequeno
|
| + {"iño",3,"o",{"camiño","cariño","comiño","golfiño","padriño","sobriño","viciño","veciño"}},
|
| + # pequeniña -> pequena
|
| + {"iña",3,"a",{"camariña","campiña","entreliña","espiña","fariña","moriña","valiña"}},
|
| + # grandito -> grand
|
| + {"ito",3,""},
|
| + # grandita -> grand
|
| + {"ita",3,""},
|
| + # anomaloide -> anomal
|
| + {"oide",3,"",{"anaroide","aneroide","asteroide","axoide","cardioide","celuloide","coronoide","discoide","espermatozoide","espiroide","esquizoide","esteroide","glenoide","linfoide","hemorroide","melaloide","sacaroide","tetraploide","varioloide"}},
|
| + # cazola -> caz
|
| + {"ola",3,"",{"aixola","ampola","argola","arola","arteríola","bandola","bítola","bractéola","cachola","carambola","carapola","carola","carrandiola","catrapola","cebola","centola","champola","chatola","cirola","cítola","consola","corola","empola","escarola","esmola","estola","fitola","florícola","garañola","gárgola","garxola","glicocola","góndola","mariola","marola","michola","pirola","rebola","rupícola","saxícola","sémola","tachola","tómbola"}},
|
| + # pedrolo -> pedr
|
| + {"olo",3,"",{"arrolo","babiolo","cacharolo","caixarolo","carolo","carramolo","cascarolo","cirolo","codrolo","correolo","cotrolo","desconsolo","rebolo","repolo","subsolo","tixolo","tómbolo","torolo","trémolo","vacúolo","xermolo","zócolo"}},
|
| + # vellote -> vell
|
| + {"ote",3,"",{"aigote","alcaiote","barbarote","balote","billote","cachote","camarote","capote","cebote","chichote","citote","cocorote","escote","gañote","garrote","gavote","lamote","lapote","larapote","lingote","lítote","magote","marrote","matalote","pandote","paparote","rebote","tagarote","zarrote"}},
|
| + # mozota -> moz
|
| + {"ota",3,"",{"asíntota","caiota","cambota","chacota","compota","creosota","curota","derrota","díspota","gamota","maniota","pelota","picota","pillota","pixota","queirota","remota"}},
|
| + # gordocho -> gord
|
| + {"cho",3,"",{"abrocho","arrocho","carocho","falucho","bombacho","borracho","mostacho"}},
|
| + # gordecha -> gord
|
| + {"cha",3,"",{"borracha","carracha","estacha","garnacha","limacha","remolacha","abrocha"}},
|
| + # baratuco -> barat
|
| + {"uco",4,"",{"caduco","estuco","fachuco","malluco","saluco","trabuco"}},
|
| + # borrachuzo -> borrach
|
| + {"uzo",3,"",{"carriñouzo","fachuzo","mañuzo","mestruzo","tapuzo"}},
|
| + # xentuza -> xent
|
| + {"uza",3,"",{"barruza","chamuza","chapuza","charamuza","conduza","deduza","desluza","entreluza","induza","reluza","seduza","traduza","trasluza"}},
|
| + # babuxa -> bab
|
| + {"uxa",3,"",{"caramuxa","carrabouxa","cartuxa","coruxa","curuxa","gaturuxa","maruxa","meruxa","miruxa","moruxa","muruxa","papuxa","rabuxa","trouxa"}},
|
| + {"uxo",3,"",{"caramuxo","carouxo","carrabouxo","curuxo","debuxo","ganduxo","influxo","negouxo","pertuxo","refluxo"}},
|
| + # grupello -> grup
|
| + {"ello",3,"",{"alborello","artello","botello","cachafello","calello","casarello","cazabello","cercello","cocerello","concello","consello","desparello","escaravello","espello","fedello","fervello","gagafello","gorrobello","nortello","pendello","troupello","trebello"}},
|
| + # pontella -> pont
|
| + {"ella",3,"",{"alborella","bertorella","bocatella","botella","calella","cercella","gadella","grosella","lentella","movella","nocella","noitevella","parella","pelella","percebella","segorella","sabella"}}};
|
| +
|
| +{ "Noun", 0, 0, {},
|
| + # lealdade -> leal
|
| + {"dade",3,"",{"acridade","calidade"}},
|
| + # clarificar -> clar
|
| + {"ificar",2},
|
| + # brasileiro->brasil
|
| + {"eiro",3,"",{"agoireiro","bardalleiro","braseiro","barreiro","canteiro","capoeiro","carneiro","carteiro","cinceiro","faroleiro","mareiro","preguiceiro","quinteiro","raposeiro","retranqueiro","regueiro","sineiro","troleiro","ventureiro"}},
|
| + # marisqueira -> marisqu
|
| + {"eira",3,"",{"cabeleira","canteira","cocheira","folleira","milleira"}},
|
| + # hospitalario -> hospital
|
| + {"ario",3,"",{"armario","calcario","lionario","salario"}},
|
| + # bibliotecaria -> bibliotec
|
| + {"aria",3,"",{"cetaria","coronaria","fumaria","linaria","lunaria","parietaria","saponaria","serpentaria"}},
|
| + # humorístico -> humor
|
| + {"ístico",3,"",{"balístico", "ensaístico"}},
|
| + # castrista -> castr
|
| + {"ista",3,"",{"batista","ciclista","fadista","operista","tenista","verista"}},
|
| + # lavado -> lav
|
| + {"ado",2,"",{"grado","agrado"}},
|
| + # decanato -> decan
|
| + {"ato",2,"",{"agnato"}},
|
| + # xemido -> xem
|
| + {"ido",3,"",{"cándido","cândido","consolido","decidido","duvido","marido","rápido"}},
|
| + # mantida -> mant
|
| + {"ida",3,"",{"bastida","dúbida","dubida","duvida","ermida","éxida","guarida","lapicida","medida","morida"}},
|
| + {"ída",3},
|
| + # mantído -> mant
|
| + {"ido",3},
|
| + # orelludo -> orell
|
| + {"udo",3,"",{"estudo","escudo"}},
|
| + # orelluda -> orell
|
| + {"uda",3},
|
| + {"ada",3,"",{"abada","alhada","allada","pitada"}},
|
| + # comedela -> come
|
| + {"dela",3,"",{"cambadela","cavadela","forcadela","erisipidela","mortadela","espadela","fondedela","picadela","arandela","candela","cordela","escudela","pardela"}},
|
| + # fontela -> font
|
| + {"ela",3,"",{"canela","capela","cotela","cubela","curupela","escarapela","esparrela","estela","fardela","flanela","fornela","franela","gabela","gamela","gavela","glumela","granicela","lamela","lapela","malvela","manela","manganela","mexarela","micela","mistela","novela","ourela","panela","parcela","pasarela","patamela","patela","paxarela","pipela","pitela","postela","pubela","restela","sabela","salmonela","secuela","sentinela","soldanela","subela","temoncela","tesela","tixela","tramela","trapela","varela","vitela","xanela","xestela"}},
|
| + # agradábel -> agrad
|
| + {"ábel",2,"",{"afábel","fiábel"}},
|
| + # combustíbel -> combust
|
| + {"íbel",2,"",{"críbel","imposíbel","posíbel","fisíbel","falíbel"}},
|
| + # fabricante -> fabrica
|
| + {"nte",3,"",{"alimente","adiante","acrescente","elefante","frequente","freqüente","gigante","instante","oriente","permanente","posante","possante","restaurante"}},
|
| + # ignorancia -> ignora
|
| + {"ncia",3},
|
| + # temperanza -> tempera
|
| + {"nza",3},
|
| + {"acia",3,"",{"acracia","audacia","falacia","farmacia"}},
|
| + # inmundicia -> inmund
|
| + {"icia",3,"",{"caricia","delicia","ledicia","malicia","milicia","noticia","pericia","presbicia","primicia","regalicia","sevicia","tiricia"}},
|
| + # xustiza -> xust
|
| + {"iza",3,"",{"alvariza","baliza","cachiza","caniza","cañiza","carbaliza","carriza","chamariza","chapiza","fraguiza","latiza","longaniza","mañiza","nabiza","peliza","preguiza","rabiza"}},
|
| + # clarexar -> clar
|
| + {"exar",3,"",{"palmexar"}},
|
| + # administración -> administr
|
| + {"ación",2,"",{"aeración"}},
|
| + # expedición -> exped
|
| + {"ición",3,"",{"condición","gornición","monición","nutrición","petición","posición","sedición","volición"}},
|
| + # excepción -> except
|
| + {"ción",3,"t"},
|
| + # comprensión -> comprens
|
| + {"sión",3,"s",{"abrasión", "alusión"}},
|
| + # doazón -> do
|
| + {"azón",2,"",{"armazón"}},
|
| + # garrafón -> garraf
|
| + {"ón",3,"",{"abalón","acordeón","alción","aldrabón","alerón","aliñón","ambón","bombón","calzón","campón","canalón","cantón","capitón","cañón","centón","ciclón","collón","colofón","copón","cotón","cupón","petón","tirón","tourón","turón","unción","versión","zubón","zurrón"}},
|
| + # lambona -> lamb
|
| + {"ona",3,"",{"abandona","acetona","aleurona","amazona","anémona","bombona","cambona","carona","chacona","charamona","cincona","condona","cortisona","cretona","cretona","detona","estona","fitohormona","fregona","gerona","hidroquinona","hormona","lesiona","madona","maratona","matrona","metadona","monótona","neurona","pamplona","peptona","poltrona","proxesterona","quinona","quinona","silicona","sulfona"}},
|
| + # bretoa -> bretón
|
| + {"oa",3,"",{"abandoa","madroa","barbacoa","estoa","airoa","eiroa","amalloa","ámboa","améndoa","anchoa","antinéboa","avéntoa","avoa","bágoa","balboa","bisavoa","boroa","canoa","caroa","comadroa","coroa","éngoa","espácoa","filloa","fírgoa","grañoa","lagoa","lanzoa","magoa","mámoa","morzoa","noiteboa","noraboa","parañoa","persoa","queiroa","rañoa","táboa","tataravoa","teiroa"}},
|
| + # demoníaco -> demoní
|
| + {"aco",3},
|
| + # demoníaca -> demoní
|
| + {"aca",3,"",{"alpaca","barraca","bullaca","buraca","carraca","casaca","cavaca","cloaca","entresaca","ervellaca","espinaca","estaca","farraca","millaca","pastinaca","pataca","resaca","urraca","purraca"}},
|
| + # carballal -> carball
|
| + {"al",4,"",{"afinal","animal","estatal","bisexual","bissexual","desleal","fiscal","formal","pessoal","persoal","liberal","postal","virtual","visual","pontual","puntual","homosexual","heterosexual"}},
|
| + # nadador -> nada
|
| + {"dor",2,"",{"abaixador"}},
|
| + # benfeitor -> benfei
|
| + {"tor",3,"",{"autor","motor","pastor","pintor"}},
|
| + # produtor -> produt
|
| + {"or",2,"",{"asesor","assessor","favor","mellor","melhor","redor","rigor","sensor","tambor","tumor"}},
|
| + # profesora -> profes
|
| + {"ora",3,"",{"albacora","anáfora","áncora","apisoadora","ardora","ascospora","aurora","avéspora","bitácora","canéfora","cantimplora","catáfora","cepilladora","demora","descalcificadora","diáspora","empacadora","epífora","ecavadora","escora","eslora","espora","fotocompoñedora","fotocopiadora","grampadora","isícora","lavadora","lixadora","macrospora","madrépora","madrágora","masora","mellora","metáfora","microspora","milépora","milpéndora","nécora","oospora","padeadora","pasiflora","pécora","píldora","pólvora","ratinadora","rémora","retroescavadora","sófora","torradora","trémbora","uredospora","víbora","víncora","zoospora"}},
|
| + # zapataría -> zapat
|
| + {"aría",3,"",{"libraría"}},
|
| + # etiquetaxe -> etiquet
|
| + {"axe",3,"",{"aluaxe","amaraxe","amperaxe","bagaxe","balaxe","barcaxe","borraxe","bescaxe","cabotaxe","carraxe","cartilaxe","chantaxe","colaxe","coraxe","carruaxe","dragaxe","embalaxe","ensilaxe","epistaxe","fagundaxe","fichaxe","fogaxe","forraxe","fretaxe","friaxe","garaxe","homenaxe","leitaxe","liñaxe","listaxe","maraxe","marcaxe","maridaxe","masaxe","miraxe","montaxe","pasaxe","peaxe","portaxe","ramaxe","rebelaxe","rodaxe","romaxe","sintaxe","sondaxe","tiraxe","vantaxe","vendaxe","viraxe"}},
|
| + # movedizo -> move
|
| + {"dizo",3},
|
| + # limpeza -> limp
|
| + {"eza",3,"",{"alteza","beleza","fereza","fineza","vasteza","vileza"}},
|
| + # rixidez -> rixid
|
| + {"ez",3,"",{"acidez","adultez","adustez","avidez","candidez","mudez","nenez","nudez","pomez"}},
|
| + # mullerengo -> muller
|
| + {"engo",3},
|
| + # chairego -> chair
|
| + {"ego",3,"",{"corego","derrego","entrego","lamego","sarego","sartego"}},
|
| + # cariñoso -> cariñ
|
| + {"oso",3,"",{"afanoso","algoso","caldoso","caloso","cocoso","ditoso","favoso","fogoso","lamoso","mecoso","mocoso","precioso","rixoso","venoso","viroso","xesoso"}},
|
| + # cariñosa -> cariñ
|
| + {"osa",3,"",{"mucosa","glicosa","baldosa","celulosa","isoglosa","nitrocelulosa","levulosa","ortosa","pectosa","preciosa","sacarosa","serosa","ventosa"}},
|
| + # negrume -> negr
|
| + {"ume",3,"",{"agrume","albume","alcume","batume","cacume","cerrume","chorume","churume","costume","curtume","estrume","gafume","legume","perfume","queixume","zarrume"}},
|
| + # altura -> alt
|
| + {"ura",3,"",{"albura","armadura","imatura","costura"}},
|
| + # cuspiñar -> cusp
|
| + {"iñar",3},
|
| + # febril -> febr
|
| + {"il",3,"",{"abril","alfil","anil","atril","badil","baril","barril","brasil","cadril","candil","cantil","carril","chamil","chancil","civil","cubil","dátil","difícil","dócil","edil","estéril","fácil","fráxil","funil","fusil","grácil","gradil","hábil","hostil","marfil"}},
|
| + # principesco -> princip
|
| + {"esco",4},
|
| + # mourisco -> mour
|
| + {"isco",4},
|
| + # esportivo -> esport
|
| + {"ivo",3,"",{"pasivo","positivo","passivo","possessivo","posesivo","pexotarivo","relativo"}}};
|
| +
|
| +{ "Verb", 0, 0, {},
|
| + # amaba -> am
|
| + {"aba",2},
|
| + # andabade -> and
|
| + {"abade",2},
|
| + # andábade -> and
|
| + {"ábade",2},
|
| + # chorabamo -> chor
|
| + {"abamo",2},
|
| + # chorábamo -> chor
|
| + {"ábamo",2},
|
| + # moraban -> mor
|
| + {"aban",2},
|
| + # andache -> and
|
| + {"ache",2},
|
| + # andade -> and
|
| + {"ade",2},
|
| + {"an",2},
|
| + # cantando -> cant
|
| + {"ando",2},
|
| + # cantar -> cant
|
| + {"ar",2,"",{"azar","bazar","patamar"}},
|
| + # lembrarade -> lembr
|
| + {"arade",2},
|
| + {"aramo",2},
|
| + {"arán",2},
|
| + # cantaran -> cant
|
| + {"aran",2},
|
| + # convidárade -> convid
|
| + {"árade",2},
|
| + # convidaría -> convid
|
| + {"aría",2},
|
| + # cantariade -> cant
|
| + {"ariade",2},
|
| + # cantaríade -> cant
|
| + {"aríade",2},
|
| + # cantarian -> cant
|
| + {"arian",2},
|
| + # cantariamo -> cant
|
| + {"ariamo",2},
|
| + # pescaron -> pesc
|
| + {"aron",2},
|
| + # cantase -> cant
|
| + {"ase",2},
|
| + # cantasede -> cant
|
| + {"asede",2},
|
| + # cantásede -> cant
|
| + {"ásede",2},
|
| + # cantasemo -> cant
|
| + {"asemo",2},
|
| + # cantásemo -> cant
|
| + {"ásemo",2},
|
| + # cantasen -> cant
|
| + {"asen",2},
|
| + # loitavan -> loit
|
| + {"avan",2},
|
| + # cantaríamo -> cant
|
| + {"aríamo",2},
|
| + # cantassen -> cant
|
| + {"assen",2},
|
| + # cantássemo -> cant
|
| + {"ássemo",2},
|
| + # beberíamo -> beb
|
| + {"eríamo",2},
|
| + # bebêssemo -> beb
|
| + {"êssemo",2},
|
| + # partiríamo -> part
|
| + {"iríamo",3},
|
| + # partíssemo -> part
|
| + {"íssemo",3},
|
| + # cantáramo -> cant
|
| + {"áramo",2},
|
| + # cantárei -> cant
|
| + {"árei",2},
|
| + # cantaren -> cant
|
| + {"aren",2},
|
| + # cantaremo -> cant
|
| + {"aremo",2},
|
| + # cantaríei -> cant
|
| + {"aríei",2},
|
| + {"ássei",2},
|
| + # cantávamo -> cant
|
| + {"ávamo",2},
|
| + # bebêramo -> beb
|
| + {"êramo",1},
|
| + # beberemo -> beb
|
| + {"eremo",1},
|
| + # beberíei -> beb
|
| + {"eríei",1},
|
| + # bebêssei -> beb
|
| + {"êssei",1},
|
| + # partíramo -> part
|
| + {"íramo",3},
|
| + # partiremo -> part
|
| + {"iremo",3},
|
| + # partiríei -> part
|
| + {"iríei",3},
|
| + # partíssei -> part
|
| + {"íssei",3},
|
| + # partissen -> part
|
| + {"issen",3},
|
| + # bebendo -> beb
|
| + {"endo",1},
|
| + # partindo -> part
|
| + {"indo",3},
|
| + # propondo -> prop
|
| + {"ondo",3},
|
| + # cantarde -> cant
|
| + {"arde",2},
|
| + # cantarei -> cant
|
| + {"arei",2},
|
| + # cantaria -> cant
|
| + {"aria",2},
|
| + # cantarmo -> cant
|
| + {"armo",2},
|
| + # cantasse -> cant
|
| + {"asse",2},
|
| + {"aste",2},
|
| + # cantávei -> cant
|
| + {"ávei",2},
|
| + # perderão -> perd
|
| + {"erão",1},
|
| + # beberde -> beb
|
| + {"erde",1},
|
| + # beberei -> beb
|
| + {"erei",1},
|
| + # bebêrei -> beb
|
| + {"êrei",1},
|
| + # beberen -> beb
|
| + {"eren",2},
|
| + # beberia -> beb
|
| + {"eria",1},
|
| + # bebermo -> beb
|
| + {"ermo",1},
|
| + # bebeste -> beb
|
| + {"este",1,"",{"faroeste","agreste"}},
|
| + # bebíamo -> beb
|
| + {"íamo",1},
|
| + # fuxian -> fux
|
| + {"ian",2,"",{"enfian","eloxian","ensaian"}},
|
| + # partirde -> part
|
| + {"irde",2},
|
| + # partirei -> part
|
| + {"irei",3,"",{"admirei"}},
|
| + # partiren -> part
|
| + {"iren",3},
|
| + # partiria -> part
|
| + {"iria",3},
|
| + # partirmo -> part
|
| + {"irmo",3},
|
| + # partisse -> part
|
| + {"isse",3},
|
| + # partiste -> part
|
| + {"iste",4},
|
| + {"iava",1,"",{"ampliava"}},
|
| + # cantamo -> cant
|
| + {"amo",2},
|
| + # funciona -> func
|
| + {"iona",3},
|
| + # cantara -> cant
|
| + {"ara",2,"",{"arara","prepara"}},
|
| + # enviará -> envi
|
| + {"ará",2,"",{"alvará","bacará"}},
|
| + # cantare -> cant
|
| + {"are",2,"",{"prepare"}},
|
| + # cantava -> cant
|
| + {"ava",2,"",{"agrava"}},
|
| + # cantemo -> cant
|
| + {"emo",2},
|
| + # bebera -> beb
|
| + {"era",1,"",{"acelera","espera"}},
|
| + # beberá -> beb
|
| + {"erá",1},
|
| + # bebere -> beb
|
| + {"ere",1,"",{"espere"}},
|
| + # bebíei -> beb
|
| + {"íei",1},
|
| + # metin -> met
|
| + {"in",3},
|
| + # partimo -> part
|
| + {"imo",3,"",{"reprimo","intimo","íntimo","nimo","queimo","ximo"}},
|
| + # partira -> part
|
| + {"ira",3,"",{"fronteira","sátira"}},
|
| + {"ído",3},
|
| + # partirá -> part
|
| + {"irá",3},
|
| + # concretizar -> concre
|
| + {"tizar",4,"",{"alfabetizar"}},
|
| + {"izar",3,"",{"organizar"}},
|
| + # saltitar -> salt
|
| + {"itar",5,"",{"acreditar","explicitar","estreitar"}},
|
| + # partire -> part
|
| + {"ire",3,"",{"adquire"}},
|
| + # compomo -> comp
|
| + {"omo",3},
|
| + {"ai",2},
|
| + # barbear -> barb
|
| + {"ear",4,"",{"alardear","nuclear"}},
|
| + # cheguei -> cheg
|
| + {"uei",3},
|
| + {"uía",5,"u"},
|
| + # cantei -> cant
|
| + {"ei",3},
|
| + # beber -> beb
|
| + {"er",1,"",{"éter","pier"}},
|
| + # bebeu -> beb
|
| + {"eu",1,"",{"chapeu"}},
|
| + # bebia -> beb
|
| + {"ia",1,"",{"estória","fatia","acia","praia","elogia","mania","lábia","aprecia","polícia","arredia","cheia","ásia"}},
|
| + # partir -> part
|
| + {"ir",3},
|
| + # partiu -> part
|
| + {"iu",3},
|
| + # fraqueou -> fraqu
|
| + {"eou",5},
|
| + # chegou -> cheg
|
| + {"ou",3},
|
| + # bebi -> beb
|
| + {"i",1},
|
| + # varrede -> varr
|
| + {"ede",1,"",{"rede","bípede","céspede","parede","palmípede","vostede","hóspede","adrede"}},
|
| + # cantei -> cant
|
| + {"ei",3},
|
| + # anden -> and
|
| + {"en",2},
|
| + # descerade -> desc
|
| + {"erade",1},
|
| + # vivérade -> viv
|
| + {"érade",1},
|
| + # beberan -> beb
|
| + {"eran",2},
|
| + # colleramo -> coll
|
| + {"eramo",1},
|
| + # bebéramo -> beb
|
| + {"éramo",1},
|
| + # perderán -> perd
|
| + {"erán",1},
|
| + # varrería -> varr
|
| + {"ería",1},
|
| + # beberiade -> beb
|
| + {"eriade",1},
|
| + # beberíade -> beb
|
| + {"eríade",1},
|
| + # beberiamo -> beb
|
| + {"eriamo",1},
|
| + # beberian -> beb
|
| + {"erian",1},
|
| + # beberían -> beb
|
| + {"erían",1},
|
| + # perderon -> perd
|
| + {"eron",1},
|
| + # bebese -> beb
|
| + {"ese",1},
|
| + # bebesedes -> beb
|
| + {"esedes",1},
|
| + # bebésedes -> beb
|
| + {"ésedes",1},
|
| + # bebesemo -> beb
|
| + {"esemo",1},
|
| + # bebésemo -> beb
|
| + {"ésemo",1},
|
| + # bebesen -> beb
|
| + {"esen",1},
|
| + # bebêssede -> beb
|
| + {"êssede",1},
|
| + # chovía -> chov
|
| + {"ía",1},
|
| + # faciade -> fac
|
| + {"iade",1},
|
| + # facíade -> fac
|
| + {"íade",1},
|
| + # perdiamo -> perd
|
| + {"iamo",1},
|
| + # fuxían -> fux
|
| + {"ían",1},
|
| + # corriche -> corr
|
| + {"iche",1},
|
| + # partide -> part
|
| + {"ide",1},
|
| + # escribirade -> escrib
|
| + {"irade",3},
|
| + # parírade -> par
|
| + {"írade",3},
|
| + # partiramo -> part
|
| + {"iramo",3},
|
| + # fugirán -> fug
|
| + {"irán",3},
|
| + # viviría -> viv
|
| + {"iría",3},
|
| + # partiriade -> part
|
| + {"iriade",3},
|
| + # partiríade -> part
|
| + {"iríade",3},
|
| + # partiriamo -> part
|
| + {"iriamo",3},
|
| + # partirian -> part
|
| + {"irian",3},
|
| + # partirían -> part
|
| + {"irían",3},
|
| + # reflectiron -> reflect
|
| + {"iron",3},
|
| + # partise -> part
|
| + {"ise",3},
|
| + # partisede -> part
|
| + {"isede",3},
|
| + # partísede -> part
|
| + {"ísede",3},
|
| + # partisemo -> part
|
| + {"isemo",3},
|
| + # partísemo -> part
|
| + {"ísemo",3},
|
| + # partisen -> part
|
| + {"isen",3},
|
| + # partíssede -> part
|
| + {"íssede",3},
|
| + {"tizar",3,"",{"alfabetizar"}},
|
| + {"ondo",3}};
|
| +
|
| +{ "Vowel", 0, 0, {},
|
| + # segue -> seg
|
| + {"gue",2,"g",{"azougue","dengue","merengue","nurague","merengue","rengue"}},
|
| + {"que",2,"c",{"alambique","albaricoque","abaroque","alcrique","almadraque","almanaque","arenque","arinque","baduloque","ballestrinque","betoque","bivaque","bloque","bodaque","bosque","breque","buque","cacique","cheque","claque","contradique","coque","croque","dique","duque","enroque","espeque","estoque","estoraque","estraloque","estrinque","milicroque","monicreque","orinque","arinque","palenque","parque","penique","picabeque","pique","psique","raque","remolque","xeque","repenique","roque","sotobosque","tabique","tanque","toque","traque","truque","vivaque","xaque"}},
|
| + {"a",3,"",{"amasadela","cerva"}},
|
| + {"e",3,"",{"marte"}},
|
| + {"o",3,"",{"barro","fado","cabo","libro","cervo"}},
|
| + {"â",3},
|
| + {"ã",3,"",{"amanhã","arapuã","fã","divã","manhã"}},
|
| + {"ê",3},
|
| + {"ô",3},
|
| + {"á",3},
|
| + {"é",3},
|
| + {"ó",3},
|
| + # munxi -> munx
|
| + {"i",3}};
|