blob: 84ac8d24b0c63243144816783936ed84fd7ecde9 [file] [log] [blame]
Index: solr/src/test/org/apache/solr/analysis/TestGalicianStemFilterFactory.java
===================================================================
--- solr/src/test/org/apache/solr/analysis/TestGalicianStemFilterFactory.java (revision 0)
+++ solr/src/test/org/apache/solr/analysis/TestGalicianStemFilterFactory.java (revision 0)
@@ -0,0 +1,36 @@
+package org.apache.solr.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Reader;
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.core.WhitespaceTokenizer;
+
+/**
+ * Simple tests to ensure the Galician stem factory is working.
+ */
+public class TestGalicianStemFilterFactory extends BaseTokenTestCase {
+  /** Feeds one Galician word through a factory-created filter and checks the emitted stem. */
+  public void testStemming() throws Exception {
+    Reader input = new StringReader("cariñosa");
+    TokenStream stemmed = new GalicianStemFilterFactory().create(new WhitespaceTokenizer(DEFAULT_VERSION, input));
+    assertTokenStreamContents(stemmed, new String[] { "cariñ" });
+  }
+}
Property changes on: solr\src\test\org\apache\solr\analysis\TestGalicianStemFilterFactory.java
___________________________________________________________________
Added: svn:eol-style
+ native
Index: solr/src/test/org/apache/solr/analysis/TestPortugueseStemFilterFactory.java
===================================================================
--- solr/src/test/org/apache/solr/analysis/TestPortugueseStemFilterFactory.java (revision 0)
+++ solr/src/test/org/apache/solr/analysis/TestPortugueseStemFilterFactory.java (revision 0)
@@ -0,0 +1,36 @@
+package org.apache.solr.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Reader;
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.core.WhitespaceTokenizer;
+
+/**
+ * Simple tests to ensure the Portuguese stem factory is working.
+ */
+public class TestPortugueseStemFilterFactory extends BaseTokenTestCase {
+  /** Feeds one Portuguese word through a factory-created filter and checks the emitted stem. */
+  public void testStemming() throws Exception {
+    Reader input = new StringReader("maluquice");
+    TokenStream stemmed = new PortugueseStemFilterFactory().create(new WhitespaceTokenizer(DEFAULT_VERSION, input));
+    assertTokenStreamContents(stemmed, new String[] { "maluc" });
+  }
+}
Property changes on: solr\src\test\org\apache\solr\analysis\TestPortugueseStemFilterFactory.java
___________________________________________________________________
Added: svn:eol-style
+ native
Index: solr/src/java/org/apache/solr/analysis/PortugueseStemFilterFactory.java
===================================================================
--- solr/src/java/org/apache/solr/analysis/PortugueseStemFilterFactory.java (revision 0)
+++ solr/src/java/org/apache/solr/analysis/PortugueseStemFilterFactory.java (revision 0)
@@ -0,0 +1,28 @@
+package org.apache.solr.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.pt.PortugueseStemFilter;
+
+/** Factory for {@link PortugueseStemFilter}; this class adds no options of its own. */
+public class PortugueseStemFilterFactory extends BaseTokenFilterFactory {
+ public TokenStream create(TokenStream input) { // wraps the given stream in a new filter on every call
+ return new PortugueseStemFilter(input);
+ }
+}
Property changes on: solr\src\java\org\apache\solr\analysis\PortugueseStemFilterFactory.java
___________________________________________________________________
Added: svn:eol-style
+ native
Index: solr/src/java/org/apache/solr/analysis/GalicianStemFilterFactory.java
===================================================================
--- solr/src/java/org/apache/solr/analysis/GalicianStemFilterFactory.java (revision 0)
+++ solr/src/java/org/apache/solr/analysis/GalicianStemFilterFactory.java (revision 0)
@@ -0,0 +1,28 @@
+package org.apache.solr.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.gl.GalicianStemFilter;
+
+/** Factory for {@link GalicianStemFilter}; this class adds no options of its own. */
+public class GalicianStemFilterFactory extends BaseTokenFilterFactory {
+ public TokenStream create(TokenStream input) { // wraps the given stream in a new filter on every call
+ return new GalicianStemFilter(input);
+ }
+}
Property changes on: solr\src\java\org\apache\solr\analysis\GalicianStemFilterFactory.java
___________________________________________________________________
Added: svn:eol-style
+ native
Index: modules/analysis/common/src/test/org/apache/lucene/analysis/pt/ptrslptestdata.zip
===================================================================
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Property changes on: modules\analysis\common\src\test\org\apache\lucene\analysis\pt\ptrslptestdata.zip
___________________________________________________________________
Added: svn:mime-type
+ application/octet-stream
Index: modules/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseStemFilter.java
===================================================================
--- modules/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseStemFilter.java (revision 0)
+++ modules/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseStemFilter.java (revision 0)
@@ -0,0 +1,69 @@
+package org.apache.lucene.analysis.pt;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import static org.apache.lucene.analysis.util.VocabularyAssert.assertVocabulary;
+
+import java.io.IOException;
+import java.io.Reader;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.LowerCaseFilter;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
+
+/**
+ * Simple tests for {@link PortugueseStemFilter}
+ */
+public class TestPortugueseStemFilter extends BaseTokenStreamTestCase {
+ private Analyzer analyzer = new ReusableAnalyzerBase() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName,
+ Reader reader) {
+ Tokenizer source = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
+ TokenStream result = new LowerCaseFilter(TEST_VERSION_CURRENT, source);
+ return new TokenStreamComponents(source, new PortugueseStemFilter(result));
+ }
+ };
+
+ /**
+ * Test the example from the paper "Assessing the impact of stemming accuracy
+ * on information retrieval"
+ */
+ public void testExamples() throws IOException {
+ assertAnalyzesTo(
+ analyzer,
+ "O debate político, pelo menos o que vem a público, parece, de modo nada "
+ + "surpreendente, restrito a temas menores. Mas há, evidentemente, "
+ + "grandes questões em jogo nas eleições que se aproximam.",
+ new String[] {
+ "o", "debat", "politic", "pel", "menos", "o", "que", "vem", "a",
+ "public", "parec", "de", "mod", "nad", "surpreend", "restrit",
+ "a", "tem", "men", "mas", "ha", "evid", "grand", "quest",
+ "em", "jog", "na", "eleic", "que", "se", "aproxim"
+ });
+ }
+
+ /** Test against a vocabulary from the reference impl */
+ public void testVocabulary() throws IOException {
+ assertVocabulary(analyzer, getDataFile("ptrslptestdata.zip"), "ptrslp.txt");
+ }
+}
Property changes on: modules\analysis\common\src\test\org\apache\lucene\analysis\pt\TestPortugueseStemFilter.java
___________________________________________________________________
Added: svn:eol-style
+ native
Index: modules/analysis/common/src/test/org/apache/lucene/analysis/gl/TestGalicianStemFilter.java
===================================================================
--- modules/analysis/common/src/test/org/apache/lucene/analysis/gl/TestGalicianStemFilter.java (revision 0)
+++ modules/analysis/common/src/test/org/apache/lucene/analysis/gl/TestGalicianStemFilter.java (revision 0)
@@ -0,0 +1,52 @@
+package org.apache.lucene.analysis.gl;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import static org.apache.lucene.analysis.util.VocabularyAssert.assertVocabulary;
+
+import java.io.IOException;
+import java.io.Reader;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.LowerCaseFilter;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
+
+/**
+ * Simple tests for {@link GalicianStemFilter}
+ */
+public class TestGalicianStemFilter extends BaseTokenStreamTestCase {
+  /** Analysis chain under test: standard tokenization, lowercasing, then Galician stemming. */
+  private Analyzer analyzer = new ReusableAnalyzerBase() {
+    @Override
+    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+      Tokenizer tokenizer = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
+      TokenStream lowercased = new LowerCaseFilter(TEST_VERSION_CURRENT, tokenizer);
+      return new TokenStreamComponents(tokenizer, new GalicianStemFilter(lowercased));
+    }
+  };
+
+  /** Compares the analyzer's output with a vocabulary produced by the
+   *  reference implementation. */
+  public void testVocabulary() throws IOException {
+    assertVocabulary(analyzer, getDataFile("gltestdata.zip"), "gl.txt");
+  }
+}
Property changes on: modules\analysis\common\src\test\org\apache\lucene\analysis\gl\TestGalicianStemFilter.java
___________________________________________________________________
Added: svn:eol-style
+ native
Index: modules/analysis/common/src/test/org/apache/lucene/analysis/gl/gltestdata.zip
===================================================================
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Property changes on: modules\analysis\common\src\test\org\apache\lucene\analysis\gl\gltestdata.zip
___________________________________________________________________
Added: svn:mime-type
+ application/octet-stream
Index: modules/analysis/common/src/test/org/apache/lucene/analysis/gl/TestGalicianAnalyzer.java
===================================================================
--- modules/analysis/common/src/test/org/apache/lucene/analysis/gl/TestGalicianAnalyzer.java (revision 0)
+++ modules/analysis/common/src/test/org/apache/lucene/analysis/gl/TestGalicianAnalyzer.java (revision 0)
@@ -0,0 +1,53 @@
+package org.apache.lucene.analysis.gl;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+
+public class TestGalicianAnalyzer extends BaseTokenStreamTestCase {
+  /**
+   * Only constructs the analyzer; this fails with an NPE when the
+   * stopwords file is missing from the classpath.
+   */
+  public void testResourcesAvailable() {
+    new GalicianAnalyzer(TEST_VERSION_CURRENT);
+  }
+
+  /** Checks that two inflected forms share one stem and that a stopword yields no tokens. */
+  public void testBasics() throws IOException {
+    Analyzer galician = new GalicianAnalyzer(TEST_VERSION_CURRENT);
+    checkOneTermReuse(galician, "correspondente", "correspond"); // stemming
+    checkOneTermReuse(galician, "corresponderá", "correspond"); // stemming
+    assertAnalyzesTo(galician, "e", new String[] {}); // stopword
+  }
+
+  /** Checks that a word in the exclusion set comes out unstemmed while other words are still stemmed. */
+  public void testExclude() throws IOException {
+    Set<String> unstemmed = new HashSet<String>();
+    unstemmed.add("correspondente");
+    Analyzer galician =
+        new GalicianAnalyzer(TEST_VERSION_CURRENT, GalicianAnalyzer.getDefaultStopSet(), unstemmed);
+    checkOneTermReuse(galician, "correspondente", "correspondente");
+    checkOneTermReuse(galician, "corresponderá", "correspond");
+  }
+}
Property changes on: modules\analysis\common\src\test\org\apache\lucene\analysis\gl\TestGalicianAnalyzer.java
___________________________________________________________________
Added: svn:eol-style
+ native
Index: modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseStemmer.java
===================================================================
--- modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseStemmer.java (revision 0)
+++ modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseStemmer.java (revision 0)
@@ -0,0 +1,102 @@
+package org.apache.lucene.analysis.pt;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.Map;
+
+/**
+ * Portuguese stemmer implementing the RSLP (Removedor de Sufixos da Lingua Portuguesa)
+ * algorithm. This is sometimes also referred to as the Orengo stemmer.
+ *
+ * @see RSLPStemmerBase
+ */
+public class PortugueseStemmer extends RSLPStemmerBase {
+  /** RSLP steps, looked up by name in the static initializer below. */
+  private static final Step plural, feminine, adverb, augmentative, noun, verb, vowel;
+
+  static {
+    final Map<String,Step> byName = parse(PortugueseStemmer.class, "portuguese.rslp");
+    plural = byName.get("Plural");
+    feminine = byName.get("Feminine");
+    adverb = byName.get("Adverb");
+    augmentative = byName.get("Augmentative");
+    noun = byName.get("Noun");
+    verb = byName.get("Verb");
+    vowel = byName.get("Vowel");
+  }
+
+  /**
+   * Stems the word held in <code>s[0..len)</code> in place. The plural, adverb,
+   * feminine and augmentative steps always run. The noun step follows; the verb
+   * step is tried only when the noun step left the length unchanged, and the
+   * vowel step only when the verb step left it unchanged as well. Accented
+   * letters in the result are then replaced by their unaccented base letter.
+   *
+   * @param s buffer, oversized to at least <code>len+1</code>
+   * @param len initial valid length of buffer
+   * @return new valid length, stemmed
+   */
+  public int stem(char s[], int len) {
+    assert s.length >= len + 1 : "this stemmer requires an oversized array of at least 1";
+
+    len = plural.apply(s, len);
+    len = adverb.apply(s, len);
+    len = feminine.apply(s, len);
+    len = augmentative.apply(s, len);
+
+    final int beforeNoun = len;
+    len = noun.apply(s, len);
+    if (len == beforeNoun) { /* noun suffix not removed */
+      len = verb.apply(s, len);
+      if (len == beforeNoun) { /* verb suffix not removed either */
+        len = vowel.apply(s, len);
+      }
+    }
+
+    // rslp accent removal
+    for (int i = 0; i < len; i++) {
+      switch (s[i]) {
+        case 'à': case 'á': case 'â': case 'ã': case 'ä': case 'å':
+          s[i] = 'a';
+          break;
+        case 'ç':
+          s[i] = 'c';
+          break;
+        case 'è': case 'é': case 'ê': case 'ë':
+          s[i] = 'e';
+          break;
+        case 'ì': case 'í': case 'î': case 'ï':
+          s[i] = 'i';
+          break;
+        case 'ñ':
+          s[i] = 'n';
+          break;
+        case 'ò': case 'ó': case 'ô': case 'õ': case 'ö':
+          s[i] = 'o';
+          break;
+        case 'ù': case 'ú': case 'û': case 'ü':
+          s[i] = 'u';
+          break;
+        case 'ý': case 'ÿ':
+          s[i] = 'y';
+          break;
+      }
+    }
+    return len;
+  }
+}
Property changes on: modules\analysis\common\src\java\org\apache\lucene\analysis\pt\PortugueseStemmer.java
___________________________________________________________________
Added: svn:eol-style
+ native
Index: modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseStemFilter.java
===================================================================
--- modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseStemFilter.java (revision 0)
+++ modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseStemFilter.java (revision 0)
@@ -0,0 +1,60 @@
+package org.apache.lucene.analysis.pt;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
+
+/**
+ * A {@link TokenFilter} that applies {@link PortugueseStemmer} to stem
+ * Portuguese words.
+ * <p>
+ * To prevent terms from being stemmed use an instance of
+ * {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
+ * the {@link KeywordAttribute} before this {@link TokenStream}.
+ * </p>
+ */
+public final class PortugueseStemFilter extends TokenFilter {
+  private final PortugueseStemmer stemmer = new PortugueseStemmer();
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+  private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
+
+  public PortugueseStemFilter(TokenStream input) {
+    super(input);
+  }
+
+  /** Advances the wrapped stream and stems the new term in place unless it is flagged as a keyword. */
+  @Override
+  public boolean incrementToken() throws IOException {
+    if (!input.incrementToken()) {
+      return false;
+    }
+    if (keywordAttr.isKeyword()) {
+      return true; // keyword terms pass through unstemmed
+    }
+    // this stemmer increases word length by 1: worst case '*ã' -> '*ão'
+    final int originalLength = termAtt.length();
+    termAtt.setLength(stemmer.stem(termAtt.resizeBuffer(originalLength + 1), originalLength));
+    return true;
+  }
+}
Property changes on: modules\analysis\common\src\java\org\apache\lucene\analysis\pt\PortugueseStemFilter.java
___________________________________________________________________
Added: svn:eol-style
+ native
Index: modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseMinimalStemmer.java
===================================================================
--- modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseMinimalStemmer.java (revision 1054344)
+++ modules/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseMinimalStemmer.java (working copy)
@@ -1,10 +1,5 @@
package org.apache.lucene.analysis.pt;
-import java.util.Arrays;
-
-import org.apache.lucene.analysis.util.CharArraySet;
-import org.apache.lucene.util.Version;
-
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@@ -31,89 +26,14 @@
* which is just the plural reduction step of the RSLP
* algorithm from <i>A Stemming Algorithmm for the Portuguese Language</i>,
* Orengo et al.
+ * @see RSLPStemmerBase
*/
-public class PortugueseMinimalStemmer {
+public class PortugueseMinimalStemmer extends RSLPStemmerBase {
- private static final CharArraySet excIS = new CharArraySet(Version.LUCENE_31,
- Arrays.asList("lápis", "cais", "mais", "crúcis", "biquínis", "pois",
- "depois","dois","leis"),
- false);
+ private static final Step pluralStep =
+ parse(PortugueseMinimalStemmer.class, "portuguese.rslp").get("Plural"); // the "Plural" step parsed from portuguese.rslp
- private static final CharArraySet excS = new CharArraySet(Version.LUCENE_31,
- Arrays.asList("aliás", "pires", "lápis", "cais", "mais", "mas", "menos",
- "férias", "fezes", "pêsames", "crúcis", "gás", "atrás", "moisés",
- "através", "convés", "ês", "país", "após", "ambas", "ambos",
- "messias", "depois"),
- false);
-
 public int stem(char s[], int len) {
- if (len < 3 || s[len-1] != 's')
- return len;
-
- if (s[len-2] == 'n') {
- len--;
- s[len-1] = 'm';
- return len;
- }
-
- if (len >= 6 && s[len-3] == 'õ' && s[len-2] == 'e') {
- len--;
- s[len-2] = 'ã';
- s[len-1] = 'o';
- return len;
- }
-
- if (len >= 4 && s[len-3] == 'ã' && s[len-2] == 'e')
- if (!(len == 4 && s[0] == 'm')) {
- len--;
- s[len-1] = 'o';
- return len;
- }
-
- if (len >= 4 && s[len-2] == 'i') {
- if (s[len-3] == 'a')
- if (!(len == 4 && (s[0] == 'c' || s[0] == 'm'))) {
- len--;
- s[len-1] = 'l';
- return len;
- }
-
- if (len >= 5 && s[len-3] == 'é') {
- len--;
- s[len-2] = 'e';
- s[len-1] = 'l';
- return len;
- }
-
- if (len >= 5 && s[len-3] == 'e') {
- len--;
- s[len-1] = 'l';
- return len;
- }
-
- if (len >= 5 && s[len-3] == 'ó') {
- len--;
- s[len-2] = 'o';
- s[len-1] = 'l';
- return len;
- }
-
- if (!excIS.contains(s, 0, len)) {
- s[len-1] = 'l';
- return len;
- }
- }
-
- if (len >= 6 && s[len-3] == 'l' && s[len-2] == 'e')
- return len - 2;
-
- if (len >= 6 && s[len-3] == 'r' && s[len-2] == 'e')
- if (!(len == 7 && s[0] == 'á' && s[1] == 'r' && s[2] == 'v' && s[3] == 'o'))
- return len - 2;
-
- if (excS.contains(s, 0, len))
- return len;
- else
- return len-1;
+ return pluralStep.apply(s, len); // only the plural step is applied; returns the new valid length
 }
 }
Index: modules/analysis/common/src/java/org/apache/lucene/analysis/pt/RSLPStemmerBase.java
===================================================================
--- modules/analysis/common/src/java/org/apache/lucene/analysis/pt/RSLPStemmerBase.java (revision 0)
+++ modules/analysis/common/src/java/org/apache/lucene/analysis/pt/RSLPStemmerBase.java (revision 0)
@@ -0,0 +1,345 @@
+package org.apache.lucene.analysis.pt;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.LineNumberReader;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.util.Version;
+
+import static org.apache.lucene.analysis.util.StemmerUtil.*;
+
+/**
+ * Base class for stemmers that use a set of RSLP-like stemming steps.
+ * <p>
+ * RSLP (Removedor de Sufixos da Lingua Portuguesa) is an algorithm designed
+ * originally for stemming the Portuguese language, described in the paper
+ * <i>A Stemming Algorithm for the Portuguese Language</i>, Orengo et al.
+ * <p>
+ * Since this time a plural-only modification (RSLP-S) as well as a modification
+ * for the Galician language have been implemented. This class parses a configuration
+ * file that describes {@link Step}s, where each Step contains a set of {@link Rule}s.
+ * <p>
+ * The general rule format is:
+ * <blockquote>{ "suffix", N, "replacement", { "exception1", "exception2", ...}}</blockquote>
+ * where:
+ * <ul>
+ * <li><code>suffix</code> is the suffix to be removed (such as "inho").
+ * <li><code>N</code> is the min stem size, where stem is defined as the candidate stem
+ * after removing the suffix (but before appending the replacement!)
+ * <li><code>replacement</code> is an optional string to append after removing the suffix.
+ * This can be the empty string.
+ * <li><code>exceptions</code> is an optional list of exceptions, patterns that should
+ * not be stemmed. These patterns can be specified as whole word or suffix (ends-with)
+ * patterns, depending upon the exceptions format flag in the step header.
+ * </ul>
+ * <p>
+ * A step is an ordered list of rules, with a structure in this format:
+ * <blockquote>{ "name", N, B, { "cond1", "cond2", ... }
+ * ... rules ... };
+ * </blockquote>
+ * where:
+ * <ul>
+ * <li><code>name</code> is a name for the step (such as "Plural").
+ * <li><code>N</code> is the min word size. Words that are less than this length bypass
+ * the step completely, as an optimization. Note: N can be zero, in this case this
+ * implementation will automatically calculate the appropriate value from the underlying
+ * rules.
+ * <li><code>B</code> is a "boolean" flag specifying how exceptions in the rules are matched.
+ * A value of 1 indicates whole-word pattern matching, a value of 0 indicates that
+ * exceptions are actually suffixes and should be matched with ends-with.
+ * <li><code>conds</code> are an optional list of conditions to enter the step at all. If
+ * the list is non-empty, then a word must end with one of these conditions or it will
+ * bypass the step completely as an optimization.
+ * </ul>
+ * <p>
+ * @see <a href="http://www.inf.ufrgs.br/~viviane/rslp/index.htm">RSLP description</a>
+ * @lucene.internal
+ */
+public abstract class RSLPStemmerBase {
+
+ /**
+ * A basic rule, with no exceptions: removes <code>suffix</code> and appends <code>replacement</code>.
+ */
+ protected static class Rule {
+ protected final char suffix[]; // suffix this rule removes
+ protected final char replacement[]; // text written where the suffix was; may be empty
+ protected final int min; // minimum number of chars that must remain before the suffix
+
+ /**
+ * Create a rule.
+ * @param suffix suffix to remove
+ * @param min minimum stem length (chars left once the suffix is removed)
+ * @param replacement replacement string; may be empty
+ */
+ public Rule(String suffix, int min, String replacement) {
+ this.suffix = suffix.toCharArray();
+ this.replacement = replacement.toCharArray();
+ this.min = min;
+ }
+
+ /**
+ * @return true if the word ends with the suffix and at least <code>min</code> chars precede it.
+ */
+ public boolean matches(char s[], int len) {
+ return (len - suffix.length >= min && endsWith(s, len, suffix));
+ }
+
+ /**
+ * @return new valid length after firing this rule; the replacement is copied into <code>s</code> in place, so <code>s</code> must have room for it.
+ */
+ public int replace(char s[], int len) {
+ if (replacement.length > 0) {
+ System.arraycopy(replacement, 0, s, len - suffix.length, replacement.length);
+ }
+ return len - suffix.length + replacement.length;
+ }
+ }
+
+ /**
+ * A rule with a set of whole-word exceptions.
+ */
+ protected static class RuleWithSetExceptions extends Rule {
+ protected final CharArraySet exceptions;
+
+ public RuleWithSetExceptions(String suffix, int min, String replacement,
+ String[] exceptions) {
+ super(suffix, min, replacement);
+ for (int i = 0; i < exceptions.length; i++) {
+ if (!exceptions[i].endsWith(suffix))
+ System.err.println("warning: useless exception '" + exceptions[i] + "' does not end with '" + suffix + "'");
+ }
+ this.exceptions = new CharArraySet(Version.LUCENE_31,
+ Arrays.asList(exceptions), false);
+ }
+
+ @Override
+ public boolean matches(char s[], int len) {
+ return super.matches(s, len) && !exceptions.contains(s, 0, len);
+ }
+ }
+
+ /**
+ * A rule with a set of exceptional suffixes.
+ */
+ protected static class RuleWithSuffixExceptions extends Rule {
+ // TODO: use a more efficient datastructure: automaton?
+ protected final char[][] exceptions;
+
+ public RuleWithSuffixExceptions(String suffix, int min, String replacement,
+ String[] exceptions) {
+ super(suffix, min, replacement);
+ for (int i = 0; i < exceptions.length; i++) {
+ if (!exceptions[i].endsWith(suffix))
+ System.err.println("warning: useless exception '" + exceptions[i] + "' does not end with '" + suffix + "'");
+ }
+ this.exceptions = new char[exceptions.length][];
+ for (int i = 0; i < exceptions.length; i++)
+ this.exceptions[i] = exceptions[i].toCharArray();
+ }
+
+ @Override
+ public boolean matches(char s[], int len) {
+ if (!super.matches(s, len))
+ return false;
+
+ for (int i = 0; i < exceptions.length; i++)
+ if (endsWith(s, len, exceptions[i]))
+ return false;
+
+ return true;
+ }
+ }
+
+ /**
+ * A step containing a list of rules.
+ */
+ protected static class Step {
+ protected final String name;
+ protected final Rule rules[];
+ protected final int min;
+ protected final char[][] suffixes;
+
+ /**
+ * Create a new step
+ * @param name Step's name.
+ * @param rules an ordered list of rules.
+ * @param min minimum word size. if this is 0 it is automatically calculated.
+ * @param suffixes optional list of conditional suffixes. may be null.
+ */
+ public Step(String name, Rule rules[], int min, String suffixes[]) {
+ this.name = name;
+ this.rules = rules;
+ if (min == 0) {
+ min = Integer.MAX_VALUE;
+ for (Rule r : rules)
+ min = Math.min(min, r.min + r.suffix.length);
+ }
+ this.min = min;
+
+ if (suffixes == null || suffixes.length == 0) {
+ this.suffixes = null;
+ } else {
+ this.suffixes = new char[suffixes.length][];
+ for (int i = 0; i < suffixes.length; i++)
+ this.suffixes[i] = suffixes[i].toCharArray();
+ }
+ }
+
+ /**
+ * @return new valid length of the string after applying the entire step.
+ */
+ public int apply(char s[], int len) {
+ if (len < min)
+ return len;
+
+ if (suffixes != null) {
+ boolean found = false;
+
+ for (int i = 0; i < suffixes.length; i++)
+ if (endsWith(s, len, suffixes[i])) {
+ found = true;
+ break;
+ }
+
+ if (!found) return len;
+ }
+
+ for (int i = 0; i < rules.length; i++) {
+ if (rules[i].matches(s, len))
+ return rules[i].replace(s, len);
+ }
+
+ return len;
+ }
+ }
+
+ /**
+ * Parse a UTF-8 resource file, looked up relative to <code>clazz</code>, into an RSLP stemmer description.
+ * @return a Map containing the named Steps in this description; a missing or unreadable resource raises a RuntimeException.
+ */
+ protected static Map<String,Step> parse(Class<? extends RSLPStemmerBase> clazz, String resource) {
+ // TODO: this parser is ugly, but works. use a jflex grammar instead.
+ try {
+ InputStream is = clazz.getResourceAsStream(resource); // null (not an exception) when the resource is absent
+ if (is == null) throw new IOException("RSLP resource not found: " + resource + " (relative to " + clazz.getName() + ")");
+ LineNumberReader r = new LineNumberReader(new InputStreamReader(is, "UTF-8"));
+ Map<String,Step> steps = new HashMap<String,Step>();
+ for (String step = readLine(r); step != null; step = readLine(r)) { // every line seen here is a step header
+ Step s = parseStep(r, step);
+ steps.put(s.name, s);
+ }
+ r.close();
+ return steps;
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ private static final Pattern headerPattern =
+ Pattern.compile("^\\{\\s*\"([^\"]*)\",\\s*([0-9]+),\\s*(0|1),\\s*\\{(.*)\\},\\s*$");
+ private static final Pattern stripPattern =
+ Pattern.compile("^\\{\\s*\"([^\"]*)\",\\s*([0-9]+)\\s*\\}\\s*(,|(\\}\\s*;))$");
+ private static final Pattern repPattern =
+ Pattern.compile("^\\{\\s*\"([^\"]*)\",\\s*([0-9]+),\\s*\"([^\"]*)\"\\}\\s*(,|(\\}\\s*;))$");
+ private static final Pattern excPattern =
+ Pattern.compile("^\\{\\s*\"([^\"]*)\",\\s*([0-9]+),\\s*\"([^\"]*)\",\\s*\\{(.*)\\}\\s*\\}\\s*(,|(\\}\\s*;))$");
+
+ private static Step parseStep(LineNumberReader r, String header) throws IOException {
+ Matcher matcher = headerPattern.matcher(header);
+ if (!matcher.find()) {
+ throw new RuntimeException("Illegal Step header specified at line " + r.getLineNumber());
+ }
+ assert matcher.groupCount() == 4;
+ String name = matcher.group(1);
+ int min = Integer.parseInt(matcher.group(2));
+ int type = Integer.parseInt(matcher.group(3));
+ String suffixes[] = parseList(matcher.group(4));
+ Rule rules[] = parseRules(r, type);
+ return new Step(name, rules, min, suffixes);
+ }
+
+ private static Rule[] parseRules(LineNumberReader r, int type) throws IOException { // reads rule lines up to the one ending in ";"
+ List<Rule> rules = new ArrayList<Rule>();
+ String line;
+ while ((line = readLine(r)) != null) {
+ Matcher matcher = stripPattern.matcher(line); // { "suffix", N }
+ if (matcher.matches()) {
+ rules.add(new Rule(matcher.group(1), Integer.parseInt(matcher.group(2)), ""));
+ } else {
+ matcher = repPattern.matcher(line); // { "suffix", N, "replacement" }
+ if (matcher.matches()) {
+ rules.add(new Rule(matcher.group(1), Integer.parseInt(matcher.group(2)), matcher.group(3)));
+ } else {
+ matcher = excPattern.matcher(line); // { "suffix", N, "replacement", { exceptions } }
+ if (matcher.matches()) {
+ if (type == 0) { // step flag 0: exceptions are matched as suffixes
+ rules.add(new RuleWithSuffixExceptions(matcher.group(1),
+ Integer.parseInt(matcher.group(2)),
+ matcher.group(3),
+ parseList(matcher.group(4))));
+ } else { // otherwise exceptions are matched as whole words
+ rules.add(new RuleWithSetExceptions(matcher.group(1),
+ Integer.parseInt(matcher.group(2)),
+ matcher.group(3),
+ parseList(matcher.group(4))));
+ }
+ } else {
+ throw new RuntimeException("Illegal Step rule specified at line " + r.getLineNumber());
+ }
+ }
+ }
+ if (line.endsWith(";")) // the last rule of a step ends with "};"
+ return rules.toArray(new Rule[rules.size()]);
+ }
+ throw new RuntimeException("Unterminated Step: input ended at line " + r.getLineNumber() + " before the closing \"};\""); // never hand back null rules
+ }
+
+ private static String[] parseList(String s) { // s is the raw text between the braces of a { "a", "b", ... } list
+ if (s.trim().isEmpty()) // "{}" and "{ }" both mean "no entries"
+ return new String[0]; // empty, never null: the rule constructors read exceptions.length
+ String list[] = s.split(",");
+ for (int i = 0; i < list.length; i++)
+ list[i] = parseString(list[i].trim()); // drop the surrounding quotes of each entry
+ return list;
+ }
+
+ private static String parseString(String s) {
+ return s.substring(1, s.length()-1);
+ }
+
+ private static String readLine(LineNumberReader r) throws IOException { // next meaningful line, trimmed; null at end of input
+ String line = null;
+ while ((line = r.readLine()) != null) {
+ line = line.trim();
+ if (!line.isEmpty() && line.charAt(0) != '#') // skip blank lines and lines starting with '#'
+ return line;
+ }
+ return line; // only reached once r.readLine() returned null
+ }
+}
Property changes on: modules\analysis\common\src\java\org\apache\lucene\analysis\pt\RSLPStemmerBase.java
___________________________________________________________________
Added: svn:eol-style
+ native
Index: modules/analysis/common/src/java/org/apache/lucene/analysis/gl/GalicianStemmer.java
===================================================================
--- modules/analysis/common/src/java/org/apache/lucene/analysis/gl/GalicianStemmer.java (revision 0)
+++ modules/analysis/common/src/java/org/apache/lucene/analysis/gl/GalicianStemmer.java (revision 0)
@@ -0,0 +1,83 @@
+package org.apache.lucene.analysis.gl;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.Map;
+
+import org.apache.lucene.analysis.pt.RSLPStemmerBase;
+
+/**
+ * Galician stemmer implementing "Regras do lematizador para o galego".
+ *
+ * @see RSLPStemmerBase
+ * @see <a href="http://bvg.udc.es/recursos_lingua/stemming.jsp">Description of rules</a>
+ */
+public class GalicianStemmer extends RSLPStemmerBase {
+ private static final Step plural, unification, adverb, augmentative, noun, verb, vowel;
+
+ static {
+ Map<String,Step> steps = parse(GalicianStemmer.class, "galician.rslp");
+ plural = steps.get("Plural");
+ unification = steps.get("Unification");
+ adverb = steps.get("Adverb");
+ augmentative = steps.get("Augmentative");
+ noun = steps.get("Noun");
+ verb = steps.get("Verb");
+ vowel = steps.get("Vowel");
+ }
+
+ /**
+ * @param s buffer, oversized to at least <code>len+1</code>
+ * @param len initial valid length of buffer
+ * @return new valid length, stemmed
+ */
+ public int stem(char s[], int len) {
+ assert s.length >= len + 1 : "this stemmer requires an oversized array of at least 1";
+
+ len = plural.apply(s, len);
+ len = unification.apply(s, len);
+ len = adverb.apply(s, len);
+
+ int oldlen;
+ do {
+ oldlen = len;
+ len = augmentative.apply(s, len);
+ } while (len != oldlen);
+
+ oldlen = len;
+ len = noun.apply(s, len);
+ if (len == oldlen) { /* suffix not removed */
+ len = verb.apply(s, len);
+ }
+
+ len = vowel.apply(s, len);
+
+ // RSLG accent removal
+ for (int i = 0; i < len; i++)
+ switch(s[i]) {
+ case 'á': s[i] = 'a'; break;
+ case 'é':
+ case 'ê': s[i] = 'e'; break;
+ case 'í': s[i] = 'i'; break;
+ case 'ó': s[i] = 'o'; break;
+ case 'ú': s[i] = 'u'; break;
+ }
+
+ return len;
+ }
+}
Property changes on: modules\analysis\common\src\java\org\apache\lucene\analysis\gl\GalicianStemmer.java
___________________________________________________________________
Added: svn:eol-style
+ native
Index: modules/analysis/common/src/java/org/apache/lucene/analysis/gl/GalicianStemFilter.java
===================================================================
--- modules/analysis/common/src/java/org/apache/lucene/analysis/gl/GalicianStemFilter.java (revision 0)
+++ modules/analysis/common/src/java/org/apache/lucene/analysis/gl/GalicianStemFilter.java (revision 0)
@@ -0,0 +1,60 @@
+package org.apache.lucene.analysis.gl;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
+
+/**
+ * A {@link TokenFilter} that applies {@link GalicianStemmer} to stem
+ * Galician words.
+ * <p>
+ * To prevent terms from being stemmed use an instance of
+ * {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
+ * the {@link KeywordAttribute} before this {@link TokenStream}.
+ * </p>
+ */
+public final class GalicianStemFilter extends TokenFilter {
+ private final GalicianStemmer stemmer = new GalicianStemmer();
+ private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+ private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
+
+ public GalicianStemFilter(TokenStream input) {
+ super(input);
+ }
+
+ @Override
+ public boolean incrementToken() throws IOException {
+ if (input.incrementToken()) {
+ if (!keywordAttr.isKeyword()) { // terms flagged as keywords are passed through unstemmed
+ // this stemmer can grow a word by 1 char (worst case '*çom' -> '*ción'), hence the len+1 buffer below
+ final int len = termAtt.length();
+ final int newlen = stemmer.stem(termAtt.resizeBuffer(len+1), len);
+ termAtt.setLength(newlen);
+ }
+ return true;
+ } else {
+ return false;
+ }
+ }
+}
Property changes on: modules\analysis\common\src\java\org\apache\lucene\analysis\gl\GalicianStemFilter.java
___________________________________________________________________
Added: svn:eol-style
+ native
Index: modules/analysis/common/src/java/org/apache/lucene/analysis/gl/GalicianAnalyzer.java
===================================================================
--- modules/analysis/common/src/java/org/apache/lucene/analysis/gl/GalicianAnalyzer.java (revision 0)
+++ modules/analysis/common/src/java/org/apache/lucene/analysis/gl/GalicianAnalyzer.java (revision 0)
@@ -0,0 +1,129 @@
+package org.apache.lucene.analysis.gl;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+import java.util.Set;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.core.LowerCaseFilter;
+import org.apache.lucene.analysis.core.StopFilter;
+import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.standard.StandardFilter;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
+import org.apache.lucene.analysis.util.WordlistLoader;
+import org.apache.lucene.util.Version;
+
+/**
+ * {@link Analyzer} for Galician.
+ */
+public final class GalicianAnalyzer extends StopwordAnalyzerBase {
+ private final Set<?> stemExclusionSet;
+
+ /** File containing default Galician stopwords. */
+ public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
+
+ /**
+ * Returns an unmodifiable instance of the default stop words set.
+ * @return default stop words set.
+ */
+ public static Set<?> getDefaultStopSet(){
+ return DefaultSetHolder.DEFAULT_STOP_SET;
+ }
+
+ /**
+ * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
+ * accesses the static final set the first time.
+ */
+ private static class DefaultSetHolder {
+ static final Set<?> DEFAULT_STOP_SET;
+
+ static {
+ try {
+ DEFAULT_STOP_SET = WordlistLoader.getWordSet(GalicianAnalyzer.class,
+ DEFAULT_STOPWORD_FILE);
+ } catch (IOException ex) {
+ // default set should always be present as it is part of the
+ // distribution (JAR); keep the cause so a packaging problem can be diagnosed
+ throw new RuntimeException("Unable to load default stopword set", ex);
+ }
+ }
+ }
+
+ /**
+ * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
+ */
+ public GalicianAnalyzer(Version matchVersion) {
+ this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
+ }
+
+ /**
+ * Builds an analyzer with the given stop words.
+ *
+ * @param matchVersion lucene compatibility version
+ * @param stopwords a stopword set
+ */
+ public GalicianAnalyzer(Version matchVersion, Set<?> stopwords) {
+ this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
+ }
+
+ /**
+ * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
+ * provided this analyzer will add a {@link KeywordMarkerFilter} before
+ * stemming.
+ *
+ * @param matchVersion lucene compatibility version
+ * @param stopwords a stopword set
+ * @param stemExclusionSet a set of terms not to be stemmed
+ */
+ public GalicianAnalyzer(Version matchVersion, Set<?> stopwords, Set<?> stemExclusionSet) {
+ super(matchVersion, stopwords);
+ this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
+ matchVersion, stemExclusionSet));
+ }
+
+ /**
+ * Creates a
+ * {@link org.apache.lucene.analysis.util.ReusableAnalyzerBase.TokenStreamComponents}
+ * which tokenizes all the text in the provided {@link Reader}.
+ *
+ * @return A
+ * {@link org.apache.lucene.analysis.util.ReusableAnalyzerBase.TokenStreamComponents}
+ * built from an {@link StandardTokenizer} filtered with
+ * {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}
+ * , {@link KeywordMarkerFilter} if a stem exclusion set is
+ * provided and {@link GalicianStemFilter}.
+ */
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName,
+ Reader reader) {
+ final Tokenizer source = new StandardTokenizer(matchVersion, reader);
+ TokenStream result = new StandardFilter(matchVersion, source);
+ result = new LowerCaseFilter(matchVersion, result);
+ result = new StopFilter(matchVersion, result, stopwords);
+ if(!stemExclusionSet.isEmpty())
+ result = new KeywordMarkerFilter(result, stemExclusionSet);
+ result = new GalicianStemFilter(result);
+ return new TokenStreamComponents(source, result);
+ }
+}
Property changes on: modules\analysis\common\src\java\org\apache\lucene\analysis\gl\GalicianAnalyzer.java
___________________________________________________________________
Added: svn:eol-style
+ native
Index: modules/analysis/common/src/java/org/apache/lucene/analysis/gl/package.html
===================================================================
--- modules/analysis/common/src/java/org/apache/lucene/analysis/gl/package.html (revision 0)
+++ modules/analysis/common/src/java/org/apache/lucene/analysis/gl/package.html (revision 0)
@@ -0,0 +1,22 @@
+<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<html><head></head>
+<body>
+Analyzer for Galician.
+</body>
+</html>
Property changes on: modules\analysis\common\src\java\org\apache\lucene\analysis\gl\package.html
___________________________________________________________________
Added: svn:eol-style
+ native
Index: modules/analysis/common/src/java/org/apache/lucene/analysis/util/StemmerUtil.java
===================================================================
--- modules/analysis/common/src/java/org/apache/lucene/analysis/util/StemmerUtil.java (revision 1054344)
+++ modules/analysis/common/src/java/org/apache/lucene/analysis/util/StemmerUtil.java (working copy)
@@ -57,6 +57,25 @@
}
/**
+ * Returns true if the character array ends with the suffix.
+ *
+ * @param s Input Buffer
+ * @param len length of input buffer
+ * @param suffix Suffix string to test
+ * @return true if <code>s</code> ends with <code>suffix</code>
+ */
+ public static boolean endsWith(char s[], int len, char suffix[]) {
+ final int start = len - suffix.length; // index in s where the suffix would have to begin
+ if (start < 0)
+ return false; // suffix is longer than the valid part of s
+ for (int k = 0; k < suffix.length; k++) {
+ if (s[start + k] != suffix[k])
+ return false;
+ }
+ return true;
+ }
+
+ /**
* Delete a character in-place
*
* @param s Input Buffer
Index: modules/analysis/common/src/resources/org/apache/lucene/analysis/pt/portuguese.rslp
===================================================================
--- modules/analysis/common/src/resources/org/apache/lucene/analysis/pt/portuguese.rslp (revision 0)
+++ modules/analysis/common/src/resources/org/apache/lucene/analysis/pt/portuguese.rslp (revision 0)
@@ -0,0 +1,456 @@
+# Steps file for the RSLP stemmer.
+
+# Step 1: Plural Reduction
+{ "Plural", 3, 1, {"s"},
+ # bons -> bom
+ {"ns",1,"m"},
+ # balões -> balão
+ {"ões",3,"ão"},
+ # capitães -> capitão
+ {"ães",1,"ão",{"mães"}},
+ # normais -> normal
+ {"ais",1,"al",{"cais","mais"}},
+ # papéis -> papel
+ {"éis",2,"el"},
+ # amáveis -> amável
+ {"eis",2,"el"},
+ # lençóis -> lençol
+ {"óis",2,"ol"},
+ # barris -> barril
+ {"is",2,"il",{"lápis","cais","mais","crúcis","biquínis","pois","depois","dois","leis"}},
+ # males -> mal
+ {"les",3,"l"},
+ # mares -> mar
+ {"res",3,"r", {"árvores"}},
+ # casas -> casa
+ {"s",2,"",{"aliás","pires","lápis","cais","mais","mas","menos","férias","fezes","pêsames","crúcis","gás","atrás","moisés","através","convés","ês","país","após","ambas","ambos","messias", "depois"}}};
+
+# Step 2: Adverb Reduction
+{ "Adverb", 0, 0, {},
+ # felizmente -> feliz
+ {"mente",4,"",{"experimente"}}};
+
+# Step 3: Feminine Reduction
+{ "Feminine", 3, 1, {"a","ã"},
+ # chefona -> chefão
+ {"ona",3,"ão",{"abandona","lona","iona","cortisona","monótona","maratona","acetona","detona","carona"}},
+ # vilã -> vilão
+ {"ã",2,"ão",{"amanhã","arapuã","fã","divã"}},
+ # professora -> professor
+ {"ora",3,"or"},
+ # americana -> americano
+ {"na",4,"no",{"carona","abandona","lona","iona","cortisona","monótona","maratona","acetona","detona","guiana","campana","grana","caravana","banana","paisana"}},
+ # sozinha -> sozinho
+ {"inha",3,"inho",{"rainha","linha","minha"}},
+ # inglesa -> inglês
+ {"esa",3,"ês",{"mesa","obesa","princesa","turquesa","ilesa","pesa","presa"}},
+ # famosa -> famoso
+ {"osa",3,"oso",{"mucosa","prosa"}},
+ # maníaca -> maníaco
+ {"íaca",3,"íaco"},
+ # prática -> prático
+ {"ica",3,"ico",{"dica"}},
+ # cansada -> cansado
+ {"ada",2,"ado",{"pitada"}},
+ # mantida -> mantido
+ {"ida",3,"ido",{"vida","dúvida"}},
+ {"ída",3,"ido",{"recaída","saída"}},
+ # prima -> primo
+ {"ima",3,"imo",{"vítima"}},
+ # passiva -> passivo
+ {"iva",3,"ivo",{"saliva","oliva"}},
+ # primeira -> primeiro
+ {"eira",3,"eiro",{"beira","cadeira","frigideira","bandeira","feira","capoeira","barreira","fronteira","besteira","poeira"}}};
+
+# Step 4: Augmentative/Diminutive Reduction
+{ "Augmentative", 0, 1, {},
+ # cansadíssimo -> cansad
+ {"díssimo",5},
+ # amabilíssimo -> ama
+ {"abilíssimo",5},
+ # fortíssimo -> fort
+ {"íssimo",3},
+ {"ésimo",3},
+ # chiquérrimo -> chiqu
+ {"érrimo",4},
+ # pezinho -> pe
+ {"zinho",2},
+ # maluquinho -> maluc
+ {"quinho",4,"c"},
+ # amiguinho -> amig
+ {"uinho",4},
+ # cansadinho -> cansad
+ {"adinho",3},
+ # carrinho -> carr
+ {"inho",3,"",{"caminho","cominho"}},
+ # grandalhão -> grand
+ {"alhão",4},
+ # dentuça -> dent
+ {"uça",4},
+ # ricaço -> ric
+ {"aço",4,"",{"antebraço"}},
+ {"aça",4},
+ # cansadão -> cans
+ {"adão",4},
+ {"idão",4},
+ # corpázio -> corp
+ {"ázio",3,"",{"topázio"}},
+ # pratarraz -> prat
+ {"arraz",4},
+ {"zarrão",3},
+ {"arrão",4},
+ # bocarra -> boc
+ {"arra",3},
+ # calorzão -> calor
+ {"zão",2,"",{"coalizão"}},
+ # meninão -> menin
+ {"ão",3,"",{"camarão","chimarrão","canção","coração","embrião","grotão","glutão","ficção","fogão","feição","furacão","gamão","lampião","leão","macacão","nação","órfão","orgão","patrão","portão","quinhão","rincão","tração","falcão","espião","mamão","folião","cordão","aptidão","campeão","colchão","limão","leilão","melão","barão","milhão","bilhão","fusão","cristão","ilusão","capitão","estação","senão"}}};
+
+# Step 5: Noun Suffix Reduction
+{ "Noun", 0, 0, {},
+ # existencialista -> exist
+ {"encialista",4},
+ # minimalista -> minim
+ {"alista",5},
+ # contagem -> cont
+ {"agem",3,"",{"coragem","chantagem","vantagem","carruagem"}},
+ # gerenciamento -> gerenc
+ {"iamento",4},
+ # monitoramento -> monitor
+ {"amento",3,"",{"firmamento","fundamento","departamento"}},
+ # nascimento -> nasc
+ {"imento",3},
+ {"mento",6,"",{"firmamento","elemento","complemento","instrumento","departamento"}},
+ # comercializado -> comerci
+ {"alizado",4},
+ # traumatizado -> traum
+ {"atizado",4},
+ {"tizado",4,"",{"alfabetizado"}},
+ # alfabetizado -> alfabet
+ {"izado",5,"",{"organizado","pulverizado"}},
+ # associativo -> associ
+ {"ativo",4,"",{"pejorativo","relativo"}},
+ # contraceptivo -> contracep
+ {"tivo",4,"",{"relativo"}},
+ # esportivo -> esport
+ {"ivo",4,"",{"passivo","possessivo","pejorativo","positivo"}},
+ # abalado -> abal
+ {"ado",2,"",{"grado"}},
+ # impedido -> imped
+ {"ido",3,"",{"cândido","consolido","rápido","decido","tímido","duvido","marido"}},
+ # ralador -> ral
+ {"ador",3},
+ # entendedor -> entend
+ {"edor",3},
+ # cumpridor -> cumpr
+ {"idor",4,"",{"ouvidor"}},
+ {"dor",4,"",{"ouvidor"}},
+ {"sor",4,"",{"assessor"}},
+ {"atoria",5},
+ {"tor",3,"",{"benfeitor","leitor","editor","pastor","produtor","promotor","consultor"}},
+ {"or",2,"",{"motor","melhor","redor","rigor","sensor","tambor","tumor","assessor","benfeitor","pastor","terior","favor","autor"}},
+ # comparabilidade -> compar
+ {"abilidade",5},
+ # abolicionista -> abol
+ {"icionista",4},
+ # intervencionista -> interven
+ {"cionista",5},
+ {"ionista",5},
+ {"ionar",5},
+ # profissional -> profiss
+ {"ional",4},
+ # referência -> refer
+ {"ência",3},
+ # repugnância -> repugn
+ {"ância",4,"",{"ambulância"}},
+ # abatedouro -> abat
+ {"edouro",3},
+ # fofoqueiro -> fofoc
+ {"queiro",3,"c"},
+ {"adeiro",4,"",{"desfiladeiro"}},
+ # brasileiro -> brasil
+ {"eiro",3,"",{"desfiladeiro","pioneiro","mosteiro"}},
+ {"uoso",3},
+ # gostoso -> gost
+ {"oso",3,"",{"precioso"}},
+ # comercializaç -> comerci
+ {"alizaç",5},
+ {"atizaç",5},
+ {"tizaç",5},
+ {"izaç",5,"",{"organizaç"}},
+ # alegaç -> aleg
+ {"aç",3,"",{"equaç","relaç"}},
+ # aboliç -> abol
+ {"iç",3,"",{"eleiç"}},
+ # anedotário -> anedot
+ {"ário",3,"",{"voluntário","salário","aniversário","diário","lionário","armário"}},
+ {"atório",3},
+ {"rio",5,"",{"voluntário","salário","aniversário","diário","compulsório","lionário","próprio","stério","armário"}},
+ # ministério -> minist
+ {"ério",6},
+ # chinês -> chin
+ {"ês",4},
+ # beleza -> bel
+ {"eza",3},
+ # rigidez -> rigid
+ {"ez",4},
+ # parentesco -> parent
+ {"esco",4},
+ # ocupante -> ocup
+ {"ante",2,"",{"gigante","elefante","adiante","possante","instante","restaurante"}},
+ # bombástico -> bomb
+ {"ástico",4,"",{"eclesiástico"}},
+ {"alístico",3},
+ {"áutico",4},
+ {"êutico",4},
+ {"tico",3,"",{"político","eclesiástico","diagnostico","prático","doméstico","diagnóstico","idêntico","alopático","artístico","autêntico","eclético","crítico","critico"}},
+ # polêmico -> polêm
+ {"ico",4,"",{"tico","público","explico"}},
+ # produtividade -> produt
+ {"ividade",5},
+ # profundidade -> profund
+ {"idade",4,"",{"autoridade","comunidade"}},
+ # aposentadoria -> aposentad
+ {"oria",4,"",{"categoria"}},
+ # existencial -> exist
+ {"encial",5},
+ # artista -> art
+ {"ista",4},
+ {"auta",5},
+ # maluquice -> maluc
+ {"quice",4,"c"},
+ # chatice -> chat
+ {"ice",4,"",{"cúmplice"}},
+ # demoníaco -> demon
+ {"íaco",3},
+ # decorrente -> decorr
+ {"ente",4,"",{"freqüente","alimente","acrescente","permanente","oriente","aparente"}},
+ {"ense",5},
+ # criminal -> crim
+ {"inal",3},
+ # americano -> americ
+ {"ano",4},
+ # amável -> am
+ {"ável",2,"",{"afável","razoável","potável","vulnerável"}},
+ # combustível -> combust
+ {"ível",3,"",{"possível"}},
+ {"vel",5,"",{"possível","vulnerável","solúvel"}},
+ {"bil",3,"vel"},
+ # cobertura -> cobert
+ {"ura",4,"",{"imatura","acupuntura","costura"}},
+ {"ural",4},
+ # consensual -> consens
+ {"ual",3,"",{"bissexual","virtual","visual","pontual"}},
+ # mundial -> mund
+ {"ial",3},
+ # experimental -> experiment
+ {"al",4,"",{"afinal","animal","estatal","bissexual","desleal","fiscal","formal","pessoal","liberal","postal","virtual","visual","pontual","sideral","sucursal"}},
+ {"alismo",4},
+ {"ivismo",4},
+ {"ismo",3,"",{"cinismo"}}};
+
+# Step 6: Verb Suffix Reduction
+{ "Verb", 0, 0, {},
+ # cantaríamo -> cant
+ {"aríamo",2},
+ # cantássemo -> cant
+ {"ássemo",2},
+ # beberíamo -> beb
+ {"eríamo",2},
+ # bebêssemo -> beb
+ {"êssemo",2},
+ # partiríamo -> part
+ {"iríamo",3},
+ # partíssemo -> part
+ {"íssemo",3},
+ # cantáramo -> cant
+ {"áramo",2},
+ # cantárei -> cant
+ {"árei",2},
+ # cantaremo -> cant
+ {"aremo",2},
+ # cantariam -> cant
+ {"ariam",2},
+ # cantaríei -> cant
+ {"aríei",2},
+ # cantássei -> cant
+ {"ássei",2},
+ # cantassem -> cant
+ {"assem",2},
+ # cantávamo -> cant
+ {"ávamo",2},
+ # bebêramo -> beb
+ {"êramo",3},
+ # beberemo -> beb
+ {"eremo",3},
+ # beberiam -> beb
+ {"eriam",3},
+ # beberíei -> beb
+ {"eríei",3},
+ # bebêssei -> beb
+ {"êssei",3},
+ # bebessem -> beb
+ {"essem",3},
+ # partíramo -> part
+ {"íramo",3},
+ # partiremo -> part
+ {"iremo",3},
+ # partiriam -> part
+ {"iriam",3},
+ # partiríei -> part
+ {"iríei",3},
+ # partíssei -> part
+ {"íssei",3},
+ # partissem -> part
+ {"issem",3},
+ # cantando -> cant
+ {"ando",2},
+ # bebendo -> beb
+ {"endo",3},
+ # partindo -> part
+ {"indo",3},
+ # propondo -> prop
+ {"ondo",3},
+ # cantaram -> cant
+ {"aram",2},
+ {"arão",2},
+ # cantarde -> cant
+ {"arde",2},
+ # cantarei -> cant
+ {"arei",2},
+ # cantarem -> cant
+ {"arem",2},
+ # cantaria -> cant
+ {"aria",2},
+ # cantarmo -> cant
+ {"armo",2},
+ # cantasse -> cant
+ {"asse",2},
+ # cantaste -> cant
+ {"aste",2},
+ # cantavam -> cant
+ {"avam",2,"",{"agravam"}},
+ # cantávei -> cant
+ {"ávei",2},
+ # beberam -> beb
+ {"eram",3},
+ {"erão",3},
+ # beberde -> beb
+ {"erde",3},
+ # beberei -> beb
+ {"erei",3},
+ # bebêrei -> beb
+ {"êrei",3},
+ # beberem -> beb
+ {"erem",3},
+ # beberia -> beb
+ {"eria",3},
+ # bebermo -> beb
+ {"ermo",3},
+ # bebesse -> beb
+ {"esse",3},
+ # bebeste -> beb
+ {"este",3,"",{"faroeste","agreste"}},
+ # bebíamo -> beb
+ {"íamo",3},
+ # partiram -> part
+ {"iram",3},
+ # concluíram -> conclu
+ {"íram",3},
+ {"irão",2},
+ # partirde -> part
+ {"irde",2},
+ # partirei -> part
+ {"irei",3,"",{"admirei"}},
+ # partirem -> part
+ {"irem",3,"",{"adquirem"}},
+ # partiria -> part
+ {"iria",3},
+ # partirmo -> part
+ {"irmo",3},
+ # partisse -> part
+ {"isse",3},
+ # partiste -> part
+ {"iste",4},
+ {"iava",4,"",{"ampliava"}},
+ # cantamo -> cant
+ {"amo",2},
+ {"iona",3},
+ # cantara -> cant
+ {"ara",2,"",{"arara","prepara"}},
+ # cantará -> cant
+ {"ará",2,"",{"alvará"}},
+ # cantare -> cant
+ {"are",2,"",{"prepare"}},
+ # cantava -> cant
+ {"ava",2,"",{"agrava"}},
+ # cantemo -> cant
+ {"emo",2},
+ # bebera -> beb
+ {"era",3,"",{"acelera","espera"}},
+ # beberá -> beb
+ {"erá",3},
+ # bebere -> beb
+ {"ere",3,"",{"espere"}},
+ # bebiam -> beb
+ {"iam",3,"",{"enfiam","ampliam","elogiam","ensaiam"}},
+ # bebíei -> beb
+ {"íei",3},
+ # partimo -> part
+ {"imo",3,"",{"reprimo","intimo","íntimo","nimo","queimo","ximo"}},
+ # partira -> part
+ {"ira",3,"",{"fronteira","sátira"}},
+ {"ído",3},
+ # partirá -> part
+ {"irá",3},
+ {"tizar",4,"",{"alfabetizar"}},
+ {"izar",5,"",{"organizar"}},
+ {"itar",5,"",{"acreditar","explicitar","estreitar"}},
+ # partire -> part
+ {"ire",3,"",{"adquire"}},
+ # compomo -> comp
+ {"omo",3},
+ # cantai -> cant
+ {"ai",2},
+ # cantam -> cant
+ {"am",2},
+ # barbear -> barb
+ {"ear",4,"",{"alardear","nuclear"}},
+ # cantar -> cant
+ {"ar",2,"",{"azar","bazaar","patamar"}},
+ # cheguei -> cheg
+ {"uei",3},
+ {"uía",5,"u"},
+ # cantei -> cant
+ {"ei",3},
+ {"guem",3,"g"},
+ # cantem -> cant
+ {"em",2,"",{"alem","virgem"}},
+ # beber -> beb
+ {"er",2,"",{"éter","pier"}},
+ # bebeu -> beb
+ {"eu",3,"",{"chapeu"}},
+ # bebia -> beb
+ {"ia",3,"",{"estória","fatia","acia","praia","elogia","mania","lábia","aprecia","polícia","arredia","cheia","ásia"}},
+ # partir -> part
+ {"ir",3,"",{"freir"}},
+ # partiu -> part
+ {"iu",3},
+ {"eou",5},
+ # chegou -> cheg
+ {"ou",3},
+ # bebi -> beb
+ {"i",3}};
+
+# Step 7: Vowel Removal
+{ "Vowel", 0, 0, {},
+ {"bil",2,"vel"},
+ {"gue",2,"g",{"gangue","jegue"}},
+ {"á",3},
+ {"ê",3,"",{"bebê"}},
+ # menina -> menin
+ {"a",3,"",{"ásia"}},
+ # grande -> grand
+ {"e",3},
+ # menino -> menin
+ {"o",3,"",{"ão"}}};
Index: modules/analysis/common/src/resources/org/apache/lucene/analysis/gl/stopwords.txt
===================================================================
--- modules/analysis/common/src/resources/org/apache/lucene/analysis/gl/stopwords.txt (revision 0)
+++ modules/analysis/common/src/resources/org/apache/lucene/analysis/gl/stopwords.txt (revision 0)
@@ -0,0 +1,161 @@
+# Galician stopwords
+a
+aínda
+alí
+aquel
+aquela
+aquelas
+aqueles
+aquilo
+aquí
+ao
+aos
+as
+así
+ben
+cando
+che
+co
+coa
+comigo
+con
+connosco
+contigo
+convosco
+coas
+cos
+cun
+cuns
+cunha
+cunhas
+da
+dalgunha
+dalgunhas
+dalgún
+dalgúns
+das
+de
+del
+dela
+delas
+deles
+desde
+deste
+do
+dos
+dun
+duns
+dunha
+dunhas
+e
+el
+ela
+elas
+eles
+en
+era
+eran
+esa
+esas
+ese
+eses
+esta
+estar
+estaba
+está
+están
+este
+estes
+estiven
+estou
+eu
+facer
+foi
+foron
+fun
+había
+hai
+iso
+isto
+la
+las
+lle
+lles
+lo
+los
+mais
+me
+meu
+meus
+min
+miña
+miñas
+moi
+na
+nas
+neste
+nin
+no
+non
+nos
+nosa
+nosas
+noso
+nosos
+nós
+nun
+nunha
+nuns
+nunhas
+o
+os
+ou
+ós
+para
+pero
+pode
+pois
+pola
+polas
+polo
+polos
+por
+que
+se
+senón
+ser
+seu
+seus
+sexa
+sido
+sobre
+súa
+súas
+tamén
+tan
+te
+ten
+teñen
+teño
+ter
+teu
+teus
+ti
+tido
+tiña
+tiven
+túa
+túas
+un
+unha
+unhas
+uns
+vos
+vosa
+vosas
+voso
+vosos
+vós
Property changes on: modules\analysis\common\src\resources\org\apache\lucene\analysis\gl\stopwords.txt
___________________________________________________________________
Added: svn:eol-style
+ native
Index: modules/analysis/common/src/resources/org/apache/lucene/analysis/gl/galician.rslp
===================================================================
--- modules/analysis/common/src/resources/org/apache/lucene/analysis/gl/galician.rslp (revision 0)
+++ modules/analysis/common/src/resources/org/apache/lucene/analysis/gl/galician.rslp (revision 0)
@@ -0,0 +1,647 @@
+# Steps file for the RSLP stemmer.
+
+# Step 1: Plural Reduction
+{ "Plural", 3, 1, {"s"},
+ # bons -> bon
+ {"ns",1,"n",{"luns","furatapóns","furatapons"}},
+ # xamós -> xamón
+ {"ós",3,"ón"},
+ # balões -> balón
+ {"ões",3,"ón"},
+ # capitães -> capitão
+ {"ães",1,"ão",{"mães","magalhães"}},
+ # normais -> normal
+ {"ais",2,"al",{"cais","tais","mais","pais","ademais"}},
+ {"áis",2,"al",{"cáis","táis", "máis", "páis", "ademáis"}},
+ # papéis -> papel
+ {"éis",2,"el"},
+ # posíbeis -> posíbel
+ {"eis",2,"el"},
+ # espanhóis -> espanhol
+ {"óis",2,"ol",{"escornabóis"}},
+ # caracois -> caracol
+ {"ois",2,"ol",{"escornabois"}},
+ # cadrís -> cadril
+ {"ís",2,"il",{"país"}},
+ # cadris -> cadril
+ {"is",2,"il",{"menfis","pais","kinguis"}},
+ # males -> mal
+ {"les",2,"l",{"ingles","marselles","montreales","senegales","manizales","móstoles","nápoles"}},
+ # mares -> mar
+ {"res",3,"r",{"petres","henares","cáceres","baleares","linares","londres","mieres","miraflores","mércores","venres", "pires"}},
+ # luces -> luz
+ {"ces",2,"z"},
+ # luzes -> luz
+ {"zes",2,"z"},
+ # leises -> lei
+ {"ises",3,"z"},
+ # animás -> animal
+ {"ás",1,"al",{"más"}},
+ # gases -> gas
+ {"ses",2,"s"},
+ # casas -> casa
+ {"s",2,"",{"barbadés","barcelonés","cantonés","gabonés","llanés","medinés","escocés","escocês","francês","barcelonês","cantonês","macramés","reves","barcelones","cantones","gabones","llanes","magallanes","medines","escoces","frances","xoves","martes","aliás","pires","lápis","cais","mais","mas","menos","férias","pêsames","crúcis","país","cangas","atenas","asturias","canarias","filipinas","honduras","molucas","caldas","mascareñas","micenas","covarrubias","psoas","óculos","nupcias","xoves","martes","llanes"}}};
+
+{ "Unification", 0, 0, {},
+ # cansadíssimo -> cansadísimo
+ {"íssimo",5,"ísimo"},
+ # cansadíssima -> cansadísima
+ {"íssima",5,"ísima"},
+ # homaço -> homazo
+ {"aço",4,"azo"},
+ # mulheraça -> mulheraza
+ {"aça",4,"aza"},
+ # xentuça -> xentuza
+ {"uça",4,"uza"},
+ # manilhar -> manillar
+ {"lhar",2,"llar"},
+ # colher -> coller
+ {"lher",2,"ller"},
+ # melhor -> mellor
+ {"lhor",2,"llor"},
+ # alho -> allo
+ {"lho",1,"llo"},
+ # linhar -> liñar
+ {"nhar",2,"ñar"},
+ # penhor -> peñor
+ {"nhor",2,"ñor"},
+ # anho -> año
+ {"nho",1,"ño"},
+ # cunha -> cuña
+ {"nha",1,"ña"},
+ # hospitalário -> hospitalario
+ {"ário",3,"ario"},
+ # bibliotecária -> bibliotecaria
+ {"ária",3,"aria"},
+ # agradable -> agradábel
+ {"able",2,"ábel"},
+ # agradável -> agradábel
+ {"ável",2,"ábel"},
+ # imposible -> imposíbel
+ {"ible",2,"íbel"},
+ # imposível -> imposíbel
+ {"ível",2,"íbel"},
+ # imposiçom -> imposición
+ {"çom",2,"ción"},
+ # garagem -> garaxe
+ {"agem",2,"axe"},
+ # garage -> garaxe
+ {"age",2,"axe"},
+ # impressão -> impressón
+ {"ão",3,"ón"},
+ # irmao -> irmán
+ {"ao",1,"án"},
+ # irmau -> irmán
+ {"au",1,"án"},
+ # garrafom -> garrafón
+ {"om",3,"ón"},
+ # cantem -> canten
+ {"m",2,"n"}};
+
+{ "Adverb", 0, 0, {},
+ # felizmente -> feliz
+ {"mente",4,"",{"experimente","vehemente","sedimente"}}};
+
+{ "Augmentative", 0, 1, {},
+ # cansadísimo -> cansad
+ {"dísimo",5},
+ # cansadísima -> cansad
+ {"dísima",5},
+ # amabilísimo -> ama
+ {"bilísimo",3},
+ # amabilísima -> ama
+ {"bilísima",3},
+ # fortísimo -> fort
+ {"ísimo",3},
+ # fortísima -> fort
+ {"ísima",3},
+ # centésimo -> cent
+ {"ésimo",3},
+ # centésima -> cent
+ {"ésima",3},
+ # paupérrimo -> paup
+ {"érrimo",4},
+ # paupérrima -> paup
+ {"érrima",4},
+ # charlatana -> charlat
+ {"ana",2,"",{"argana","banana","choupana","espadana","faciana","iguana","lantana","macana","membrana","mesana","nirvana","obsidiana","palangana","pavana","persiana","pestana","porcelana","pseudomembrana","roldana","sábana","salangana","saragana","ventana"}},
+ # charlatán -> charlat
+ {"án",3,"",{"ademán","bardán","barregán","corricán","curricán","faisán","furacán","fustán","gabán","gabián","galán","gañán","lavacán","mazán","mourán","rabadán","serán","serrán","tabán","titán","tobogán","verán","volcán","volován"}},
+ # homazo -> hom
+ {"azo",4,"",{"abrazo","espazo","andazo","bagazo","balazo","bandazo","cachazo","carazo","denazo","engazo","famazo","lampreazo","pantocazo","pedazo","preñazo","regazo","ribazo","sobrazo","terrazo","trompazo"}},
+ # mulleraza -> muller
+ {"aza",3,"",{"alcarraza","ameaza","baraza","broucaza","burgaza","cabaza","cachaza","calaza","carpaza","carraza","coiraza","colmaza","fogaza","famaza","labaza","liñaza","melaza","mordaza","paraza","pinaza","rabaza","rapaza","trancaza"}},
+ # cascallo -> casc
+ {"allo",4,"",{"traballo"}},
+ # xentalla -> xent
+ {"alla",4},
+ # bocarra -> boc
+ {"arra",3,"",{"cigarra","cinzarra"}},
+ # medicastro -> medic
+ {"astro",3,"",{"balastro","bimbastro","canastro","retropilastro"}},
+ # poetastra -> poet
+ {"astra",3,"",{"banastra","canastra","contrapilastra","piastra","pilastra"}},
+ # corpázio -> corp
+ {"ázio",3,"",{"topázio"}},
+ # soutelo -> sout
+ {"elo",4,"",{"bacelo","barrelo","bicarelo","biquelo","boquelo","botelo","bouquelo","cacarelo","cachelo","cadrelo","campelo","candelo","cantelo","carabelo","carambelo","caramelo","cercelo","cerebelo","chocarelo","coitelo","conchelo","corbelo","cotobelo","couselo","destelo","desvelo","esfácelo","fandelo","fardelo","farelo","farnelo","flabelo","ganchelo","garfelo","involucelo","mantelo","montelo","outerelo","padicelo","pesadelo","pinguelo","piquelo","rampelo","rastrelo","restelo","tornecelo","trabelo","restrelo","portelo","ourelo","zarapelo"}},
+ # avioneta -> avion
+ {"eta",3,"",{"arqueta","atleta","avoceta","baioneta","baldeta","banqueta","barraganeta","barreta","borleta","buceta","caceta","calceta","caldeta","cambeta","canaleta","caneta","carreta","cerceta","chaparreta","chapeta","chareta","chincheta","colcheta","cometa","corbeta","corveta","cuneta","desteta","espeta","espoleta","estafeta","esteta","faceta","falanxeta","frasqueta","gaceta","gabeta","galleta","garabeta","gaveta","glorieta","lagareta","lambeta","lanceta","libreta","maceta","macheta","maleta","malleta","mareta","marreta","meseta","mofeta","muleta","peseta","planeta","raqueta","regreta","saqueta","veleta","vendeta","viñeta"}},
+ # guapete -> guap
+ {"ete",3,"",{"alfinete","ariete","bacinete","banquete","barallete","barrete","billete","binguelete","birrete","bonete","bosquete","bufete","burlete","cabalete","cacahuete","cavinete","capacete","carrete","casarete","casete","chupete","clarinete","colchete","colete","capete","curupete","disquete","estilete","falsete","ferrete","filete","gallardete","gobelete","inglete","machete","miquelete","molete","mosquete","piquete","ribete","rodete","rolete","roquete","sorvete","vedete","vendete"}},
+ # práctica -> práct
+ {"ica",3,"",{"andarica","botánica","botica","dialéctica","dinámica","física","formica","gráfica","marica","túnica"}},
+ # práctico -> práct
+ {"ico",3,"",{"conico","acetifico","acidifico"}},
+ # trapexo -> trap
+ {"exo",3,"",{"arpexo","arquexo","asexo","axexo","azulexo","badexo","bafexo","bocexo","bosquexo","boubexo","cacarexo","carrexo","cascarexo","castrexo","convexo","cotexo","desexo","despexo","forcexo","gabexo","gargarexo","gorgolexo","inconexo","manexo","merexo","narnexo","padexo","patexo","sopexo","varexo"}},
+ {"exa",3,"",{"airexa","bandexa","carrexa","envexa","igrexa","larexa","patexa","presexa","sobexa"}},
+ # multidão -> mult
+ {"idão",3},
+ # pequeniño -> pequeno
+ {"iño",3,"o",{"camiño","cariño","comiño","golfiño","padriño","sobriño","viciño","veciño"}},
+ # pequeniña -> pequena
+ {"iña",3,"a",{"camariña","campiña","entreliña","espiña","fariña","moriña","valiña"}},
+ # grandito -> grand
+ {"ito",3,""},
+ # grandita -> grand
+ {"ita",3,""},
+ # anomaloide -> anomal
+ {"oide",3,"",{"anaroide","aneroide","asteroide","axoide","cardioide","celuloide","coronoide","discoide","espermatozoide","espiroide","esquizoide","esteroide","glenoide","linfoide","hemorroide","melaloide","sacaroide","tetraploide","varioloide"}},
+ # cazola -> caz
+ {"ola",3,"",{"aixola","ampola","argola","arola","arteríola","bandola","bítola","bractéola","cachola","carambola","carapola","carola","carrandiola","catrapola","cebola","centola","champola","chatola","cirola","cítola","consola","corola","empola","escarola","esmola","estola","fitola","florícola","garañola","gárgola","garxola","glicocola","góndola","mariola","marola","michola","pirola","rebola","rupícola","saxícola","sémola","tachola","tómbola"}},
+ # pedrolo -> pedr
+ {"olo",3,"",{"arrolo","babiolo","cacharolo","caixarolo","carolo","carramolo","cascarolo","cirolo","codrolo","correolo","cotrolo","desconsolo","rebolo","repolo","subsolo","tixolo","tómbolo","torolo","trémolo","vacúolo","xermolo","zócolo"}},
+ # vellote -> vell
+ {"ote",3,"",{"aigote","alcaiote","barbarote","balote","billote","cachote","camarote","capote","cebote","chichote","citote","cocorote","escote","gañote","garrote","gavote","lamote","lapote","larapote","lingote","lítote","magote","marrote","matalote","pandote","paparote","rebote","tagarote","zarrote"}},
+ # mozota -> moz
+ {"ota",3,"",{"asíntota","caiota","cambota","chacota","compota","creosota","curota","derrota","díspota","gamota","maniota","pelota","picota","pillota","pixota","queirota","remota"}},
+ # gordocho -> gord
+ {"cho",3,"",{"abrocho","arrocho","carocho","falucho","bombacho","borracho","mostacho"}},
+ # gordecha -> gord
+ {"cha",3,"",{"borracha","carracha","estacha","garnacha","limacha","remolacha","abrocha"}},
+ # baratuco -> barat
+ {"uco",4,"",{"caduco","estuco","fachuco","malluco","saluco","trabuco"}},
+ # borrachuzo -> borrach
+ {"uzo",3,"",{"carriñouzo","fachuzo","mañuzo","mestruzo","tapuzo"}},
+ # xentuza -> xent
+ {"uza",3,"",{"barruza","chamuza","chapuza","charamuza","conduza","deduza","desluza","entreluza","induza","reluza","seduza","traduza","trasluza"}},
+ # babuxa -> bab
+ {"uxa",3,"",{"caramuxa","carrabouxa","cartuxa","coruxa","curuxa","gaturuxa","maruxa","meruxa","miruxa","moruxa","muruxa","papuxa","rabuxa","trouxa"}},
+ {"uxo",3,"",{"caramuxo","carouxo","carrabouxo","curuxo","debuxo","ganduxo","influxo","negouxo","pertuxo","refluxo"}},
+ # grupello -> grup
+ {"ello",3,"",{"alborello","artello","botello","cachafello","calello","casarello","cazabello","cercello","cocerello","concello","consello","desparello","escaravello","espello","fedello","fervello","gagafello","gorrobello","nortello","pendello","troupello","trebello"}},
+ # pontella -> pont
+ {"ella",3,"",{"alborella","bertorella","bocatella","botella","calella","cercella","gadella","grosella","lentella","movella","nocella","noitevella","parella","pelella","percebella","segorella","sabella"}}};
+
+{ "Noun", 0, 0, {},
+ # lealdade -> leal
+ {"dade",3,"",{"acridade","calidade"}},
+ # clarificar -> clar
+ {"ificar",2},
+ # brasileiro->brasil
+ {"eiro",3,"",{"agoireiro","bardalleiro","braseiro","barreiro","canteiro","capoeiro","carneiro","carteiro","cinceiro","faroleiro","mareiro","preguiceiro","quinteiro","raposeiro","retranqueiro","regueiro","sineiro","troleiro","ventureiro"}},
+ # marisqueira -> marisqu
+ {"eira",3,"",{"cabeleira","canteira","cocheira","folleira","milleira"}},
+ # hospitalario -> hospital
+ {"ario",3,"",{"armario","calcario","lionario","salario"}},
+ # bibliotecaria -> bibliotec
+ {"aria",3,"",{"cetaria","coronaria","fumaria","linaria","lunaria","parietaria","saponaria","serpentaria"}},
+ # humorístico -> humor
+ {"ístico",3,"",{"balístico", "ensaístico"}},
+ # castrista -> castr
+ {"ista",3,"",{"batista","ciclista","fadista","operista","tenista","verista"}},
+ # lavado -> lav
+ {"ado",2,"",{"grado","agrado"}},
+ # decanato -> decan
+ {"ato",2,"",{"agnato"}},
+ # xemido -> xem
+ {"ido",3,"",{"cándido","cândido","consolido","decidido","duvido","marido","rápido"}},
+ # mantida -> mant
+ {"ida",3,"",{"bastida","dúbida","dubida","duvida","ermida","éxida","guarida","lapicida","medida","morida"}},
+ {"ída",3},
+ # mantído -> mant
+ {"ido",3},
+ # orelludo -> orell
+ {"udo",3,"",{"estudo","escudo"}},
+ # orelluda -> orell
+ {"uda",3},
+ {"ada",3,"",{"abada","alhada","allada","pitada"}},
+ # comedela -> come
+ {"dela",3,"",{"cambadela","cavadela","forcadela","erisipidela","mortadela","espadela","fondedela","picadela","arandela","candela","cordela","escudela","pardela"}},
+ # fontela -> font
+ {"ela",3,"",{"canela","capela","cotela","cubela","curupela","escarapela","esparrela","estela","fardela","flanela","fornela","franela","gabela","gamela","gavela","glumela","granicela","lamela","lapela","malvela","manela","manganela","mexarela","micela","mistela","novela","ourela","panela","parcela","pasarela","patamela","patela","paxarela","pipela","pitela","postela","pubela","restela","sabela","salmonela","secuela","sentinela","soldanela","subela","temoncela","tesela","tixela","tramela","trapela","varela","vitela","xanela","xestela"}},
+ # agradábel -> agrad
+ {"ábel",2,"",{"afábel","fiábel"}},
+ # combustíbel -> combust
+ {"íbel",2,"",{"críbel","imposíbel","posíbel","fisíbel","falíbel"}},
+ # fabricante -> fabrica
+ {"nte",3,"",{"alimente","adiante","acrescente","elefante","frequente","freqüente","gigante","instante","oriente","permanente","posante","possante","restaurante"}},
+ # ignorancia -> ignora
+ {"ncia",3},
+ # temperanza -> tempera
+ {"nza",3},
+ {"acia",3,"",{"acracia","audacia","falacia","farmacia"}},
+ # inmundicia -> inmund
+ {"icia",3,"",{"caricia","delicia","ledicia","malicia","milicia","noticia","pericia","presbicia","primicia","regalicia","sevicia","tiricia"}},
+ # xustiza -> xust
+ {"iza",3,"",{"alvariza","baliza","cachiza","caniza","cañiza","carbaliza","carriza","chamariza","chapiza","fraguiza","latiza","longaniza","mañiza","nabiza","peliza","preguiza","rabiza"}},
+ # clarexar -> clar
+ {"exar",3,"",{"palmexar"}},
+ # administración -> administr
+ {"ación",2,"",{"aeración"}},
+ # expedición -> exped
+ {"ición",3,"",{"condición","gornición","monición","nutrición","petición","posición","sedición","volición"}},
+ # excepción -> except
+ {"ción",3,"t"},
+ # comprensión -> comprens
+ {"sión",3,"s",{"abrasión", "alusión"}},
+ # doazón -> do
+ {"azón",2,"",{"armazón"}},
+ # garrafón -> garraf
+ {"ón",3,"",{"abalón","acordeón","alción","aldrabón","alerón","aliñón","ambón","bombón","calzón","campón","canalón","cantón","capitón","cañón","centón","ciclón","collón","colofón","copón","cotón","cupón","petón","tirón","tourón","turón","unción","versión","zubón","zurrón"}},
+ # lambona -> lamb
+ {"ona",3,"",{"abandona","acetona","aleurona","amazona","anémona","bombona","cambona","carona","chacona","charamona","cincona","condona","cortisona","cretona","cretona","detona","estona","fitohormona","fregona","gerona","hidroquinona","hormona","lesiona","madona","maratona","matrona","metadona","monótona","neurona","pamplona","peptona","poltrona","proxesterona","quinona","quinona","silicona","sulfona"}},
+ # bretoa -> bret
+ {"oa",3,"",{"abandoa","madroa","barbacoa","estoa","airoa","eiroa","amalloa","ámboa","améndoa","anchoa","antinéboa","avéntoa","avoa","bágoa","balboa","bisavoa","boroa","canoa","caroa","comadroa","coroa","éngoa","espácoa","filloa","fírgoa","grañoa","lagoa","lanzoa","magoa","mámoa","morzoa","noiteboa","noraboa","parañoa","persoa","queiroa","rañoa","táboa","tataravoa","teiroa"}},
+ # demoníaco -> demoní
+ {"aco",3},
+ # demoníaca -> demoní
+ {"aca",3,"",{"alpaca","barraca","bullaca","buraca","carraca","casaca","cavaca","cloaca","entresaca","ervellaca","espinaca","estaca","farraca","millaca","pastinaca","pataca","resaca","urraca","purraca"}},
+ # carballal -> carball
+ {"al",4,"",{"afinal","animal","estatal","bisexual","bissexual","desleal","fiscal","formal","pessoal","persoal","liberal","postal","virtual","visual","pontual","puntual","homosexual","heterosexual"}},
+ # nadador -> nada
+ {"dor",2,"",{"abaixador"}},
+ # benfeitor -> benfei
+ {"tor",3,"",{"autor","motor","pastor","pintor"}},
+ # produtor -> produt
+ {"or",2,"",{"asesor","assessor","favor","mellor","melhor","redor","rigor","sensor","tambor","tumor"}},
+ # profesora -> profes
+ {"ora",3,"",{"albacora","anáfora","áncora","apisoadora","ardora","ascospora","aurora","avéspora","bitácora","canéfora","cantimplora","catáfora","cepilladora","demora","descalcificadora","diáspora","empacadora","epífora","ecavadora","escora","eslora","espora","fotocompoñedora","fotocopiadora","grampadora","isícora","lavadora","lixadora","macrospora","madrépora","madrágora","masora","mellora","metáfora","microspora","milépora","milpéndora","nécora","oospora","padeadora","pasiflora","pécora","píldora","pólvora","ratinadora","rémora","retroescavadora","sófora","torradora","trémbora","uredospora","víbora","víncora","zoospora"}},
+ # zapataría -> zapat
+ {"aría",3,"",{"libraría"}},
+ # etiquetaxe -> etiquet
+ {"axe",3,"",{"aluaxe","amaraxe","amperaxe","bagaxe","balaxe","barcaxe","borraxe","bescaxe","cabotaxe","carraxe","cartilaxe","chantaxe","colaxe","coraxe","carruaxe","dragaxe","embalaxe","ensilaxe","epistaxe","fagundaxe","fichaxe","fogaxe","forraxe","fretaxe","friaxe","garaxe","homenaxe","leitaxe","liñaxe","listaxe","maraxe","marcaxe","maridaxe","masaxe","miraxe","montaxe","pasaxe","peaxe","portaxe","ramaxe","rebelaxe","rodaxe","romaxe","sintaxe","sondaxe","tiraxe","vantaxe","vendaxe","viraxe"}},
+ # movedizo -> move
+ {"dizo",3},
+ # limpeza -> limp
+ {"eza",3,"",{"alteza","beleza","fereza","fineza","vasteza","vileza"}},
+ # rixidez -> rixid
+ {"ez",3,"",{"acidez","adultez","adustez","avidez","candidez","mudez","nenez","nudez","pomez"}},
+ # mullerengo -> muller
+ {"engo",3},
+ # chairego -> chair
+ {"ego",3,"",{"corego","derrego","entrego","lamego","sarego","sartego"}},
+ # cariñoso -> cariñ
+ {"oso",3,"",{"afanoso","algoso","caldoso","caloso","cocoso","ditoso","favoso","fogoso","lamoso","mecoso","mocoso","precioso","rixoso","venoso","viroso","xesoso"}},
+ # cariñosa -> cariñ
+ {"osa",3,"",{"mucosa","glicosa","baldosa","celulosa","isoglosa","nitrocelulosa","levulosa","ortosa","pectosa","preciosa","sacarosa","serosa","ventosa"}},
+ # negrume -> negr
+ {"ume",3,"",{"agrume","albume","alcume","batume","cacume","cerrume","chorume","churume","costume","curtume","estrume","gafume","legume","perfume","queixume","zarrume"}},
+ # altura -> alt
+ {"ura",3,"",{"albura","armadura","imatura","costura"}},
+ # cuspiñar -> cusp
+ {"iñar",3},
+ # febril -> febr
+ {"il",3,"",{"abril","alfil","anil","atril","badil","baril","barril","brasil","cadril","candil","cantil","carril","chamil","chancil","civil","cubil","dátil","difícil","dócil","edil","estéril","fácil","fráxil","funil","fusil","grácil","gradil","hábil","hostil","marfil"}},
+ # principesco -> princip
+ {"esco",4},
+ # mourisco -> mour
+ {"isco",4},
+ # esportivo -> esport
+ {"ivo",3,"",{"pasivo","positivo","passivo","possessivo","posesivo","pexotarivo","relativo"}}};
+
+{ "Verb", 0, 0, {},
+ # amaba -> am
+ {"aba",2},
+ # andabade -> and
+ {"abade",2},
+ # andábade -> and
+ {"ábade",2},
+ # chorabamo -> chor
+ {"abamo",2},
+ # chorábamo -> chor
+ {"ábamo",2},
+ # moraban -> mor
+ {"aban",2},
+ # andache -> and
+ {"ache",2},
+ # andade -> and
+ {"ade",2},
+ {"an",2},
+ # cantando -> cant
+ {"ando",2},
+ # cantar -> cant
+ {"ar",2,"",{"azar","bazar","patamar"}},
+ # lembrarade -> lembr
+ {"arade",2},
+ {"aramo",2},
+ {"arán",2},
+ # cantaran -> cant
+ {"aran",2},
+ # convidárade -> convid
+ {"árade",2},
+ # convidaría -> convid
+ {"aría",2},
+ # cantariade -> cant
+ {"ariade",2},
+ # cantaríade -> cant
+ {"aríade",2},
+ # cantarian -> cant
+ {"arian",2},
+ # cantariamo -> cant
+ {"ariamo",2},
+ # pescaron -> pesc
+ {"aron",2},
+ # cantase -> cant
+ {"ase",2},
+ # cantasede -> cant
+ {"asede",2},
+ # cantásede -> cant
+ {"ásede",2},
+ # cantasemo -> cant
+ {"asemo",2},
+ # cantásemo -> cant
+ {"ásemo",2},
+ # cantasen -> cant
+ {"asen",2},
+ # loitavan -> loit
+ {"avan",2},
+ # cantaríamo -> cant
+ {"aríamo",2},
+ # cantassen -> cant
+ {"assen",2},
+ # cantássemo -> cant
+ {"ássemo",2},
+ # beberíamo -> beb
+ {"eríamo",2},
+ # bebêssemo -> beb
+ {"êssemo",2},
+ # partiríamo -> part
+ {"iríamo",3},
+ # partíssemo -> part
+ {"íssemo",3},
+ # cantáramo -> cant
+ {"áramo",2},
+ # cantárei -> cant
+ {"árei",2},
+ # cantaren -> cant
+ {"aren",2},
+ # cantaremo -> cant
+ {"aremo",2},
+ # cantaríei -> cant
+ {"aríei",2},
+ {"ássei",2},
+ # cantávamo -> cant
+ {"ávamo",2},
+ # bebêramo -> beb
+ {"êramo",1},
+ # beberemo -> beb
+ {"eremo",1},
+ # beberíei -> beb
+ {"eríei",1},
+ # bebêssei -> beb
+ {"êssei",1},
+ # partíramo -> part
+ {"íramo",3},
+ # partiremo -> part
+ {"iremo",3},
+ # partiríei -> part
+ {"iríei",3},
+ # partíssei -> part
+ {"íssei",3},
+ # partissen -> part
+ {"issen",3},
+ # bebendo -> beb
+ {"endo",1},
+ # partindo -> part
+ {"indo",3},
+ # propondo -> prop
+ {"ondo",3},
+ # cantarde -> cant
+ {"arde",2},
+ # cantarei -> cant
+ {"arei",2},
+ # cantaria -> cant
+ {"aria",2},
+ # cantarmo -> cant
+ {"armo",2},
+ # cantasse -> cant
+ {"asse",2},
+ {"aste",2},
+ # cantávei -> cant
+ {"ávei",2},
+ # perderão -> perd
+ {"erão",1},
+ # beberde -> beb
+ {"erde",1},
+ # beberei -> beb
+ {"erei",1},
+ # bebêrei -> beb
+ {"êrei",1},
+ # beberen -> beb
+ {"eren",2},
+ # beberia -> beb
+ {"eria",1},
+ # bebermo -> beb
+ {"ermo",1},
+ # bebeste -> beb
+ {"este",1,"",{"faroeste","agreste"}},
+ # bebíamo -> beb
+ {"íamo",1},
+ # fuxian -> fux
+ {"ian",2,"",{"enfian","eloxian","ensaian"}},
+ # partirde -> part
+ {"irde",2},
+ # partirei -> part
+ {"irei",3,"",{"admirei"}},
+ # partiren -> part
+ {"iren",3},
+ # partiria -> part
+ {"iria",3},
+ # partirmo -> part
+ {"irmo",3},
+ # partisse -> part
+ {"isse",3},
+ # partiste -> part
+ {"iste",4},
+ {"iava",1,"",{"ampliava"}},
+ # cantamo -> cant
+ {"amo",2},
+ # funciona -> func
+ {"iona",3},
+ # cantara -> cant
+ {"ara",2,"",{"arara","prepara"}},
+ # enviará -> envi
+ {"ará",2,"",{"alvará","bacará"}},
+ # cantare -> cant
+ {"are",2,"",{"prepare"}},
+ # cantava -> cant
+ {"ava",2,"",{"agrava"}},
+ # cantemo -> cant
+ {"emo",2},
+ # bebera -> beb
+ {"era",1,"",{"acelera","espera"}},
+ # beberá -> beb
+ {"erá",1},
+ # bebere -> beb
+ {"ere",1,"",{"espere"}},
+ # bebíei -> beb
+ {"íei",1},
+ # metin -> met
+ {"in",3},
+ # partimo -> part
+ {"imo",3,"",{"reprimo","intimo","íntimo","nimo","queimo","ximo"}},
+ # partira -> part
+ {"ira",3,"",{"fronteira","sátira"}},
+ {"ído",3},
+ # partirá -> part
+ {"irá",3},
+ # concretizar -> concret
+ {"tizar",4,"",{"alfabetizar"}},
+ {"izar",3,"",{"organizar"}},
+ # saltitar -> salt
+ {"itar",5,"",{"acreditar","explicitar","estreitar"}},
+ # partire -> part
+ {"ire",3,"",{"adquire"}},
+ # compomo -> comp
+ {"omo",3},
+ {"ai",2},
+ # barbear -> barb
+ {"ear",4,"",{"alardear","nuclear"}},
+ # cheguei -> cheg
+ {"uei",3},
+ {"uía",5,"u"},
+ # cantei -> cant
+ {"ei",3},
+ # beber -> beb
+ {"er",1,"",{"éter","pier"}},
+ # bebeu -> beb
+ {"eu",1,"",{"chapeu"}},
+ # bebia -> beb
+ {"ia",1,"",{"estória","fatia","acia","praia","elogia","mania","lábia","aprecia","polícia","arredia","cheia","ásia"}},
+ # partir -> part
+ {"ir",3},
+ # partiu -> part
+ {"iu",3},
+ # fraqueou -> fraqu
+ {"eou",5},
+ # chegou -> cheg
+ {"ou",3},
+ # bebi -> beb
+ {"i",1},
+ # varrede -> varr
+ {"ede",1,"",{"rede","bípede","céspede","parede","palmípede","vostede","hóspede","adrede"}},
+ # cantei -> cant
+ {"ei",3},
+ # anden -> and
+ {"en",2},
+ # descerade -> desc
+ {"erade",1},
+ # vivérade -> viv
+ {"érade",1},
+ # beberan -> beb
+ {"eran",2},
+ # colleramo -> coll
+ {"eramo",1},
+ # bebéramo -> beb
+ {"éramo",1},
+ # perderán -> perd
+ {"erán",1},
+ # varrería -> varr
+ {"ería",1},
+ # beberiade -> beb
+ {"eriade",1},
+ # beberíade -> beb
+ {"eríade",1},
+ # beberiamo -> beb
+ {"eriamo",1},
+ # beberian -> beb
+ {"erian",1},
+ # beberían -> beb
+ {"erían",1},
+ # perderon -> perd
+ {"eron",1},
+ # bebese -> beb
+ {"ese",1},
+ # bebesedes -> beb
+ {"esedes",1},
+ # bebésedes -> beb
+ {"ésedes",1},
+ # bebesemo -> beb
+ {"esemo",1},
+ # bebésemo -> beb
+ {"ésemo",1},
+ # bebesen -> beb
+ {"esen",1},
+ # bebêssede -> beb
+ {"êssede",1},
+ # chovía -> chov
+ {"ía",1},
+ # faciade -> fac
+ {"iade",1},
+ # facíade -> fac
+ {"íade",1},
+ # perdiamo -> perd
+ {"iamo",1},
+ # fuxían -> fux
+ {"ían",1},
+ # corriche -> corr
+ {"iche",1},
+ # partide -> part
+ {"ide",1},
+ # escribirade -> escrib
+ {"irade",3},
+ # parírade -> par
+ {"írade",3},
+ # partiramo -> part
+ {"iramo",3},
+ # fugirán -> fug
+ {"irán",3},
+ # viviría -> viv
+ {"iría",3},
+ # partiriade -> part
+ {"iriade",3},
+ # partiríade -> part
+ {"iríade",3},
+ # partiriamo -> part
+ {"iriamo",3},
+ # partirian -> part
+ {"irian",3},
+ # partirían -> part
+ {"irían",3},
+ # reflectiron -> reflect
+ {"iron",3},
+ # partise -> part
+ {"ise",3},
+ # partisede -> part
+ {"isede",3},
+ # partísede -> part
+ {"ísede",3},
+ # partisemo -> part
+ {"isemo",3},
+ # partísemo -> part
+ {"ísemo",3},
+ # partisen -> part
+ {"isen",3},
+ # partíssede -> part
+ {"íssede",3},
+ {"tizar",3,"",{"alfabetizar"}},
+ {"ondo",3}};
+
+{ "Vowel", 0, 0, {},
+ # segue -> seg
+ {"gue",2,"g",{"azougue","dengue","merengue","nurague","merengue","rengue"}},
+ {"que",2,"c",{"alambique","albaricoque","abaroque","alcrique","almadraque","almanaque","arenque","arinque","baduloque","ballestrinque","betoque","bivaque","bloque","bodaque","bosque","breque","buque","cacique","cheque","claque","contradique","coque","croque","dique","duque","enroque","espeque","estoque","estoraque","estraloque","estrinque","milicroque","monicreque","orinque","arinque","palenque","parque","penique","picabeque","pique","psique","raque","remolque","xeque","repenique","roque","sotobosque","tabique","tanque","toque","traque","truque","vivaque","xaque"}},
+ {"a",3,"",{"amasadela","cerva"}},
+ {"e",3,"",{"marte"}},
+ {"o",3,"",{"barro","fado","cabo","libro","cervo"}},
+ {"â",3},
+ {"ã",3,"",{"amanhã","arapuã","fã","divã","manhã"}},
+ {"ê",3},
+ {"ô",3},
+ {"á",3},
+ {"é",3},
+ {"ó",3},
+ # munxi -> munx
+ {"i",3}};