blob: 8bc555a440dbfe75a2b2c4646f2da51425cfc5b3 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.bn;
import static org.apache.lucene.analysis.util.StemmerUtil.endsWith;
/**
* Stemmer for Bengali.
* <p>
* The algorithm is based on the report in:
* <i>Natural Language Processing in an Indian Language (Bengali)-I: Verb Phrase Analysis</i>
* P Sengupta and B B Chaudhuri
* </p>
*
* <p>
* A few stemmer criteria are taken from:
* <i>http://members.unine.ch/jacques.savoy/clef/BengaliStemmerLight.java.txt</i>
* </p>
*/
public class BengaliStemmer {
public int stem(char buffer[], int len) {
// 8
if (len > 9 && (endsWith(buffer, len, "িয়াছিলাম")
|| endsWith(buffer, len, "িতেছিলাম")
|| endsWith(buffer, len, "িতেছিলেন")
|| endsWith(buffer, len, "ইতেছিলেন")
|| endsWith(buffer, len, "িয়াছিলেন")
|| endsWith(buffer, len, "ইয়াছিলেন")
))
return len - 8;
// 7
if ((len > 8) && (endsWith(buffer, len, "িতেছিলি")
|| endsWith(buffer, len, "িতেছিলে")
|| endsWith(buffer, len, "িয়াছিলা")
|| endsWith(buffer, len, "িয়াছিলে")
|| endsWith(buffer, len, "িতেছিলা")
|| endsWith(buffer, len, "িয়াছিলি")
|| endsWith(buffer, len, "য়েদেরকে")
))
return len - 7;
// 6
if ((len > 7) && (endsWith(buffer, len, "িতেছিস")
|| endsWith(buffer, len, "িতেছেন")
|| endsWith(buffer, len, "িয়াছিস")
|| endsWith(buffer, len, "িয়াছেন")
|| endsWith(buffer, len, "েছিলাম")
|| endsWith(buffer, len, "েছিলেন")
|| endsWith(buffer, len, "েদেরকে")
))
return len - 6;
// 5
if ((len > 6) && (endsWith(buffer, len, "িতেছি")
|| endsWith(buffer, len, "িতেছা")
|| endsWith(buffer, len, "িতেছে")
|| endsWith(buffer, len, "ছিলাম")
|| endsWith(buffer, len, "ছিলেন")
|| endsWith(buffer, len, "িয়াছি")
|| endsWith(buffer, len, "িয়াছা")
|| endsWith(buffer, len, "িয়াছে")
|| endsWith(buffer, len, "েছিলে")
|| endsWith(buffer, len, "েছিলা")
|| endsWith(buffer, len, "য়েদের")
|| endsWith(buffer, len, "দেরকে")
))
return len - 5;
// 4
if ((len > 5) && (endsWith(buffer, len, "িলাম")
|| endsWith(buffer, len, "িলেন")
|| endsWith(buffer, len, "িতাম")
|| endsWith(buffer, len, "িতেন")
|| endsWith(buffer, len, "িবেন")
|| endsWith(buffer, len, "ছিলি")
|| endsWith(buffer, len, "ছিলে")
|| endsWith(buffer, len, "ছিলা")
|| endsWith(buffer, len, "তেছে")
|| endsWith(buffer, len, "িতেছ")
|| endsWith(buffer, len, "খানা")
|| endsWith(buffer, len, "খানি")
|| endsWith(buffer, len, "গুলো")
|| endsWith(buffer, len, "গুলি")
|| endsWith(buffer, len, "য়েরা")
|| endsWith(buffer, len, "েদের")
))
return len - 4;
// 3
if ((len > 4) && (endsWith(buffer, len, "লাম")
|| endsWith(buffer, len, "িলি")
|| endsWith(buffer, len, "ইলি")
|| endsWith(buffer, len, "িলে")
|| endsWith(buffer, len, "ইলে")
|| endsWith(buffer, len, "লেন")
|| endsWith(buffer, len, "িলা")
|| endsWith(buffer, len, "ইলা")
|| endsWith(buffer, len, "তাম")
|| endsWith(buffer, len, "িতি")
|| endsWith(buffer, len, "ইতি")
|| endsWith(buffer, len, "িতে")
|| endsWith(buffer, len, "ইতে")
|| endsWith(buffer, len, "তেন")
|| endsWith(buffer, len, "িতা")
|| endsWith(buffer, len, "িবা")
|| endsWith(buffer, len, "ইবা")
|| endsWith(buffer, len, "িবি")
|| endsWith(buffer, len, "ইবি")
|| endsWith(buffer, len, "বেন")
|| endsWith(buffer, len, "িবে")
|| endsWith(buffer, len, "ইবে")
|| endsWith(buffer, len, "ছেন")
|| endsWith(buffer, len, "য়োন")
|| endsWith(buffer, len, "য়ের")
|| endsWith(buffer, len, "েরা")
|| endsWith(buffer, len, "দের")
))
return len - 3;
// 2
if ((len > 3) && (endsWith(buffer, len, "িস")
|| endsWith(buffer, len, "েন")
|| endsWith(buffer, len, "লি")
|| endsWith(buffer, len, "লে")
|| endsWith(buffer, len, "লা")
|| endsWith(buffer, len, "তি")
|| endsWith(buffer, len, "তে")
|| endsWith(buffer, len, "তা")
|| endsWith(buffer, len, "বি")
|| endsWith(buffer, len, "বে")
|| endsWith(buffer, len, "বা")
|| endsWith(buffer, len, "ছি")
|| endsWith(buffer, len, "ছা")
|| endsWith(buffer, len, "ছে")
|| endsWith(buffer, len, "ুন")
|| endsWith(buffer, len, "ুক")
|| endsWith(buffer, len, "টা")
|| endsWith(buffer, len, "টি")
|| endsWith(buffer, len, "নি")
|| endsWith(buffer, len, "ের")
|| endsWith(buffer, len, "তে")
|| endsWith(buffer, len, "রা")
|| endsWith(buffer, len, "কে")
))
return len - 2;
// 1
if ((len > 2) && (endsWith(buffer, len, "ি")
|| endsWith(buffer, len, "ী")
|| endsWith(buffer, len, "া")
|| endsWith(buffer, len, "ো")
|| endsWith(buffer, len, "ে")
|| endsWith(buffer, len, "ব")
|| endsWith(buffer, len, "ত")
))
return len - 1;
return len;
}
}