| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.analysis.id; |
| |
| |
| import static org.apache.lucene.analysis.util.StemmerUtil.*; |
| |
| /** |
| * Stemmer for Indonesian. |
| * <p> |
| * Stems Indonesian words with the algorithm presented in: |
| * <i>A Study of Stemming Effects on Information Retrieval in |
| * Bahasa Indonesia</i>, Fadillah Z Tala. |
| * http://www.illc.uva.nl/Publications/ResearchReports/MoL-2003-02.text.pdf |
| */ |
| public class IndonesianStemmer { |
| private int numSyllables; |
| private int flags; |
| private static final int REMOVED_KE = 1; |
| private static final int REMOVED_PENG = 2; |
| private static final int REMOVED_DI = 4; |
| private static final int REMOVED_MENG = 8; |
| private static final int REMOVED_TER = 16; |
| private static final int REMOVED_BER = 32; |
| private static final int REMOVED_PE = 64; |
| |
| /** |
| * Stem a term (returning its new length). |
| * <p> |
| * Use <code>stemDerivational</code> to control whether full stemming |
| * or only light inflectional stemming is done. |
| */ |
| public int stem(char text[], int length, boolean stemDerivational) { |
| flags = 0; |
| numSyllables = 0; |
| for (int i = 0; i < length; i++) |
| if (isVowel(text[i])) |
| numSyllables++; |
| |
| if (numSyllables > 2) length = removeParticle(text, length); |
| if (numSyllables > 2) length = removePossessivePronoun(text, length); |
| |
| if (stemDerivational) |
| length = stemDerivational(text, length); |
| return length; |
| } |
| |
| private int stemDerivational(char text[], int length) { |
| int oldLength = length; |
| if (numSyllables > 2) length = removeFirstOrderPrefix(text, length); |
| if (oldLength != length) { // a rule is fired |
| oldLength = length; |
| if (numSyllables > 2) length = removeSuffix(text, length); |
| if (oldLength != length) // a rule is fired |
| if (numSyllables > 2) length = removeSecondOrderPrefix(text, length); |
| } else { // fail |
| if (numSyllables > 2) length = removeSecondOrderPrefix(text, length); |
| if (numSyllables > 2) length = removeSuffix(text, length); |
| } |
| return length; |
| } |
| |
| private boolean isVowel(char ch) { |
| switch(ch) { |
| case 'a': |
| case 'e': |
| case 'i': |
| case 'o': |
| case 'u': |
| return true; |
| default: |
| return false; |
| } |
| } |
| |
| private int removeParticle(char text[], int length) { |
| if (endsWith(text, length, "kah") || |
| endsWith(text, length, "lah") || |
| endsWith(text, length, "pun")) { |
| numSyllables--; |
| return length - 3; |
| } |
| |
| return length; |
| } |
| |
| private int removePossessivePronoun(char text[], int length) { |
| if (endsWith(text, length, "ku") || endsWith(text, length, "mu")) { |
| numSyllables--; |
| return length - 2; |
| } |
| |
| if (endsWith(text, length, "nya")) { |
| numSyllables--; |
| return length - 3; |
| } |
| |
| return length; |
| } |
| |
| private int removeFirstOrderPrefix(char text[], int length) { |
| if (startsWith(text, length, "meng")) { |
| flags |= REMOVED_MENG; |
| numSyllables--; |
| return deleteN(text, 0, length, 4); |
| } |
| |
| if (startsWith(text, length, "meny") && length > 4 && isVowel(text[4])) { |
| flags |= REMOVED_MENG; |
| text[3] = 's'; |
| numSyllables--; |
| return deleteN(text, 0, length, 3); |
| } |
| |
| if (startsWith(text, length, "men")) { |
| flags |= REMOVED_MENG; |
| numSyllables--; |
| return deleteN(text, 0, length, 3); |
| } |
| |
| if (startsWith(text, length, "mem")) { |
| flags |= REMOVED_MENG; |
| numSyllables--; |
| return deleteN(text, 0, length, 3); |
| } |
| |
| if (startsWith(text, length, "me")) { |
| flags |= REMOVED_MENG; |
| numSyllables--; |
| return deleteN(text, 0, length, 2); |
| } |
| |
| if (startsWith(text, length, "peng")) { |
| flags |= REMOVED_PENG; |
| numSyllables--; |
| return deleteN(text, 0, length, 4); |
| } |
| |
| if (startsWith(text, length, "peny") && length > 4 && isVowel(text[4])) { |
| flags |= REMOVED_PENG; |
| text[3] = 's'; |
| numSyllables--; |
| return deleteN(text, 0, length, 3); |
| } |
| |
| if (startsWith(text, length, "peny")) { |
| flags |= REMOVED_PENG; |
| numSyllables--; |
| return deleteN(text, 0, length, 4); |
| } |
| |
| if (startsWith(text, length, "pen") && length > 3 && isVowel(text[3])) { |
| flags |= REMOVED_PENG; |
| text[2] = 't'; |
| numSyllables--; |
| return deleteN(text, 0, length, 2); |
| } |
| |
| if (startsWith(text, length, "pen")) { |
| flags |= REMOVED_PENG; |
| numSyllables--; |
| return deleteN(text, 0, length, 3); |
| } |
| |
| if (startsWith(text, length, "pem")) { |
| flags |= REMOVED_PENG; |
| numSyllables--; |
| return deleteN(text, 0, length, 3); |
| } |
| |
| if (startsWith(text, length, "di")) { |
| flags |= REMOVED_DI; |
| numSyllables--; |
| return deleteN(text, 0, length, 2); |
| } |
| |
| if (startsWith(text, length, "ter")) { |
| flags |= REMOVED_TER; |
| numSyllables--; |
| return deleteN(text, 0, length, 3); |
| } |
| |
| if (startsWith(text, length, "ke")) { |
| flags |= REMOVED_KE; |
| numSyllables--; |
| return deleteN(text, 0, length, 2); |
| } |
| |
| return length; |
| } |
| |
| private int removeSecondOrderPrefix(char text[], int length) { |
| if (startsWith(text, length, "ber")) { |
| flags |= REMOVED_BER; |
| numSyllables--; |
| return deleteN(text, 0, length, 3); |
| } |
| |
| if (length == 7 && startsWith(text, length, "belajar")) { |
| flags |= REMOVED_BER; |
| numSyllables--; |
| return deleteN(text, 0, length, 3); |
| } |
| |
| if (startsWith(text, length, "be") && length > 4 |
| && !isVowel(text[2]) && text[3] == 'e' && text[4] == 'r') { |
| flags |= REMOVED_BER; |
| numSyllables--; |
| return deleteN(text, 0, length, 2); |
| } |
| |
| if (startsWith(text, length, "per")) { |
| numSyllables--; |
| return deleteN(text, 0, length, 3); |
| } |
| |
| if (length == 7 && startsWith(text, length, "pelajar")) { |
| numSyllables--; |
| return deleteN(text, 0, length, 3); |
| } |
| |
| if (startsWith(text, length, "pe")) { |
| flags |= REMOVED_PE; |
| numSyllables--; |
| return deleteN(text, 0, length, 2); |
| } |
| |
| return length; |
| } |
| |
| private int removeSuffix(char text[], int length) { |
| if (endsWith(text, length, "kan") |
| && (flags & REMOVED_KE) == 0 |
| && (flags & REMOVED_PENG) == 0 |
| && (flags & REMOVED_PE) == 0) { |
| numSyllables--; |
| return length - 3; |
| } |
| |
| if (endsWith(text, length, "an") |
| && (flags & REMOVED_DI) == 0 |
| && (flags & REMOVED_MENG) == 0 |
| && (flags & REMOVED_TER) == 0) { |
| numSyllables--; |
| return length - 2; |
| } |
| |
| if (endsWith(text, length, "i") |
| && !endsWith(text, length, "si") |
| && (flags & REMOVED_BER) == 0 |
| && (flags & REMOVED_KE) == 0 |
| && (flags & REMOVED_PENG) == 0) { |
| numSyllables--; |
| return length - 1; |
| } |
| return length; |
| } |
| } |