| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.analysis.ckb; |
| |
| import static org.apache.lucene.analysis.util.StemmerUtil.delete; |
| |
| /** |
| * Normalizes the Unicode representation of Sorani text. |
| * |
| * <p>Normalization consists of: |
| * |
| * <ul> |
| * <li>Alternate forms of 'y' (0064, 0649) are converted to 06CC (FARSI YEH) |
| * <li>Alternate form of 'k' (0643) is converted to 06A9 (KEHEH) |
| * <li>Alternate forms of vowel 'e' (0647+200C, word-final 0647, 0629) are converted to 06D5 (AE) |
| * <li>Alternate (joining) form of 'h' (06BE) is converted to 0647 |
| * <li>Alternate forms of 'rr' (0692, word-initial 0631) are converted to 0695 (REH WITH SMALL V |
| * BELOW) |
| * <li>Harakat, tatweel, and formatting characters such as directional controls are removed. |
| * </ul> |
| */ |
| public class SoraniNormalizer { |
| |
| static final char YEH = '\u064A'; |
| static final char DOTLESS_YEH = '\u0649'; |
| static final char FARSI_YEH = '\u06CC'; |
| |
| static final char KAF = '\u0643'; |
| static final char KEHEH = '\u06A9'; |
| |
| static final char HEH = '\u0647'; |
| static final char AE = '\u06D5'; |
| static final char ZWNJ = '\u200C'; |
| static final char HEH_DOACHASHMEE = '\u06BE'; |
| static final char TEH_MARBUTA = '\u0629'; |
| |
| static final char REH = '\u0631'; |
| static final char RREH = '\u0695'; |
| static final char RREH_ABOVE = '\u0692'; |
| |
| static final char TATWEEL = '\u0640'; |
| static final char FATHATAN = '\u064B'; |
| static final char DAMMATAN = '\u064C'; |
| static final char KASRATAN = '\u064D'; |
| static final char FATHA = '\u064E'; |
| static final char DAMMA = '\u064F'; |
| static final char KASRA = '\u0650'; |
| static final char SHADDA = '\u0651'; |
| static final char SUKUN = '\u0652'; |
| |
| /** |
| * Normalize an input buffer of Sorani text |
| * |
| * @param s input buffer |
| * @param len length of input buffer |
| * @return length of input buffer after normalization |
| */ |
| public int normalize(char s[], int len) { |
| for (int i = 0; i < len; i++) { |
| switch (s[i]) { |
| case YEH: |
| case DOTLESS_YEH: |
| s[i] = FARSI_YEH; |
| break; |
| case KAF: |
| s[i] = KEHEH; |
| break; |
| case ZWNJ: |
| if (i > 0 && s[i - 1] == HEH) { |
| s[i - 1] = AE; |
| } |
| len = delete(s, i, len); |
| i--; |
| break; |
| case HEH: |
| if (i == len - 1) { |
| s[i] = AE; |
| } |
| break; |
| case TEH_MARBUTA: |
| s[i] = AE; |
| break; |
| case HEH_DOACHASHMEE: |
| s[i] = HEH; |
| break; |
| case REH: |
| if (i == 0) { |
| s[i] = RREH; |
| } |
| break; |
| case RREH_ABOVE: |
| s[i] = RREH; |
| break; |
| case TATWEEL: |
| case KASRATAN: |
| case DAMMATAN: |
| case FATHATAN: |
| case FATHA: |
| case DAMMA: |
| case KASRA: |
| case SHADDA: |
| case SUKUN: |
| len = delete(s, i, len); |
| i--; |
| break; |
| default: |
| if (Character.getType(s[i]) == Character.FORMAT) { |
| len = delete(s, i, len); |
| i--; |
| } |
| } |
| } |
| return len; |
| } |
| } |