// blob: e76b195cf385955d8400c36b15f32e54ec6eddd5 [file] [log] [blame]
/* $Id$ */
/**
* Copyright 2005 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.CharArrayWriter;
import java.io.IOException;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
/**
* Normalizes extended latin (not greek, cyrillic etc.) umlauts and diacritics to their closest
* ASCII (basic Latin, up to 0x7f) equivalents. This class ensures that only 7-bit
* chars are returned.
*
* @author <a href="mailto:ronnie.kolehmainen@ub.uu.se">Ronnie Kolehmainen</a>
* @version $Revision$, $Date$
*/
public final class ASCIIFilter extends TokenFilter
{
    /** Value returned by {@link #foldToAscii(char)} when a character has no ASCII replacement. */
    private static final int NO_MAPPING = -1;

    /**
     * Constructor.
     * @param in the token stream to normalize.
     */
    public ASCIIFilter(TokenStream in)
    {
        super(in);
    }

    /**
     * Normalizes umlauts and diacritics to their closest
     * ASCII equivalents. Non-ASCII chars without a known
     * equivalent are stripped. All chars less than 0x80 are
     * left untouched.
     * <p>
     * A token that contains only ASCII chars is returned as is. Otherwise a
     * new token is created that carries the normalized text together with the
     * start offset, end offset, type and position increment of the original.
     *
     * @return the normalized token, or <tt>null</tt> if input
     * token from underlying stream is null.
     * @throws IOException if the underlying stream fails to deliver a token.
     * @throws RuntimeException if the underlying stream itself is null.
     */
    public final Token next() throws IOException
    {
        if (input == null) {
            throw new RuntimeException("ASCIIFilter::next input == null");
        }
        Token t = input.next();
        if (t == null) {
            return null;
        }
        char[] chars = t.termText().toCharArray();
        // Created lazily on the first non-ASCII char, so that pure ASCII
        // tokens pass through without any allocation or copying.
        CharArrayWriter charWriter = null;
        for (int i = 0; i < chars.length; i++) {
            char c = chars[i];
            if (c <= 0x7f) {
                // ascii: only needs copying once a rewrite has started
                if (charWriter != null) {
                    charWriter.write(c);
                }
                continue;
            }
            if (charWriter == null) {
                charWriter = new CharArrayWriter(chars.length);
                // fill with already scanned (ascii) chars
                charWriter.write(chars, 0, i);
            }
            int ascii = foldToAscii(c);
            if (ascii != NO_MAPPING) {
                charWriter.write(ascii);
            }
            // else: prune non-matching chars, i.e. write nothing
        }
        if (charWriter == null) {
            // no non-ascii char seen, token is unchanged
            return t;
        }
        Token folded = new Token(charWriter.toString(),
                                 t.startOffset(),
                                 t.endOffset(),
                                 t.type());
        // Keep the position of the original token; a freshly constructed
        // Token would otherwise fall back to the default increment.
        folded.setPositionIncrement(t.getPositionIncrement());
        return folded;
    }

    /**
     * Looks up the ASCII replacement for a single non-ASCII char.
     *
     * @param c the char to fold, expected to be greater than 0x7f.
     * @return the replacing ASCII char, or {@link #NO_MAPPING} if the char
     * has no known replacement.
     */
    private static int foldToAscii(char c)
    {
        switch (c) {
            case 0x00c0: case 0x00c1: case 0x00c2: case 0x00c3: case 0x00c4:
            case 0x00c5: case 0x00c6: case 0x0100: case 0x0102: case 0x0104:
            case 0x01cd: case 0x01fa: case 0x01fc: case 0x1ea0: case 0x1ea2:
            case 0x1ea4: case 0x1ea6: case 0x1ea8: case 0x1eaa: case 0x1eac:
            case 0x1eae: case 0x1eb0: case 0x1eb2: case 0x1eb4: case 0x1eb6:
                return 'A';
            case 0x00e0: case 0x00e1: case 0x00e2: case 0x00e3: case 0x00e4:
            case 0x00e5: case 0x00e6: case 0x0101: case 0x0103: case 0x0105:
            case 0x01ce: case 0x01fb: case 0x01fd: case 0x1ea1: case 0x1ea3:
            case 0x1ea5: case 0x1ea7: case 0x1ea9: case 0x1eab: case 0x1ead:
            case 0x1eaf: case 0x1eb1: case 0x1eb3: case 0x1eb5: case 0x1eb7:
                return 'a';
            case 0x00c8: case 0x00c9: case 0x00ca: case 0x00cb: case 0x0112:
            case 0x0114: case 0x0116: case 0x0118: case 0x011a: case 0x1eb8:
            case 0x1eba: case 0x1ebc: case 0x1ebe: case 0x1ec0: case 0x1ec2:
            case 0x1ec4: case 0x1ec6:
                return 'E';
            case 0x00e8: case 0x00e9: case 0x00ea: case 0x00eb: case 0x0113:
            case 0x0115: case 0x0117: case 0x0119: case 0x011b: case 0x1eb9:
            case 0x1ebb: case 0x1ebd: case 0x1ebf: case 0x1ec1: case 0x1ec3:
            case 0x1ec5: case 0x1ec7:
                return 'e';
            case 0x00cc: case 0x00cd: case 0x00ce: case 0x00cf: case 0x0128:
            case 0x012a: case 0x012c: case 0x012e: case 0x0130: case 0x01cf:
            case 0x1ec8: case 0x1eca:
                return 'I';
            case 0x00ec: case 0x00ed: case 0x00ee: case 0x00ef: case 0x0129:
            case 0x012b: case 0x012d: case 0x012f: case 0x0131: case 0x01d0:
            case 0x1ec9: case 0x1ecb:
                return 'i';
            case 0x00d2: case 0x00d3: case 0x00d4: case 0x00d5: case 0x00d6:
            case 0x00d8: case 0x014c: case 0x014e: case 0x0150: case 0x0152:
            case 0x01a0: case 0x01d1: case 0x01fe: case 0x1ecc: case 0x1ece:
            case 0x1ed0: case 0x1ed2: case 0x1ed4: case 0x1ed6: case 0x1ed8:
            case 0x1eda: case 0x1edc: case 0x1ede: case 0x1ee0: case 0x1ee2:
                return 'O';
            case 0x00f2: case 0x00f3: case 0x00f4: case 0x00f5: case 0x00f6:
            case 0x00f8: case 0x014d: case 0x014f: case 0x0151: case 0x0153:
            case 0x01a1: case 0x01d2: case 0x01ff: case 0x1ecd: case 0x1ecf:
            case 0x1ed1: case 0x1ed3: case 0x1ed5: case 0x1ed7: case 0x1ed9:
            case 0x1edb: case 0x1edd: case 0x1edf: case 0x1ee1: case 0x1ee3:
                return 'o';
            case 0x00d9: case 0x00da: case 0x00db: case 0x00dc: case 0x0168:
            case 0x016a: case 0x016c: case 0x016e: case 0x0170: case 0x0172:
            case 0x01af: case 0x01d3: case 0x01d5: case 0x01d7: case 0x01d9:
            case 0x01db: case 0x1ee4: case 0x1ee6: case 0x1ee8: case 0x1eea:
            case 0x1eec: case 0x1eee: case 0x1ef0:
                return 'U';
            case 0x00f9: case 0x00fa: case 0x00fb: case 0x00fc: case 0x0169:
            case 0x016b: case 0x016d: case 0x016f: case 0x0171: case 0x0173:
            case 0x01b0: case 0x01d4: case 0x01d6: case 0x01d8: case 0x01da:
            case 0x01dc: case 0x1ee5: case 0x1ee7: case 0x1ee9: case 0x1eeb:
            case 0x1eed: case 0x1eef: case 0x1ef1:
                return 'u';
            case 0x00dd: case 0x0176: case 0x0178: case 0x1ef2: case 0x1ef4:
            case 0x1ef6: case 0x1ef8:
                return 'Y';
            case 0x00fd: case 0x00ff: case 0x0177: case 0x1ef3: case 0x1ef5:
            case 0x1ef7: case 0x1ef9:
                return 'y';
            case 0x00c7: case 0x0106: case 0x0108: case 0x010a: case 0x010c:
                return 'C';
            case 0x00e7: case 0x0107: case 0x0109: case 0x010b: case 0x010d:
                return 'c';
            case 0x00d0: case 0x010e: case 0x0110:
                return 'D';
            // U+00F0 (small eth) is the lowercase partner of U+00D0 above
            case 0x00f0: case 0x010f: case 0x0111:
                return 'd';
            case 0x011c: case 0x011e: case 0x0120: case 0x0122:
                return 'G';
            case 0x011d: case 0x011f: case 0x0121: case 0x0123:
                return 'g';
            case 0x0124: case 0x0126:
                return 'H';
            case 0x0125: case 0x0127:
                return 'h';
            case 0x0134:
                return 'J';
            case 0x0135:
                return 'j';
            case 0x0136:
                return 'K';
            case 0x0137:
                return 'k';
            case 0x0139: case 0x013b: case 0x013d: case 0x013f: case 0x0141:
                return 'L';
            case 0x013a: case 0x013c: case 0x013e: case 0x0140: case 0x0142:
                return 'l';
            // U+00D1 (N with tilde) is the uppercase partner of U+00F1 below
            case 0x00d1: case 0x0143: case 0x0145: case 0x0147: case 0x014a:
                return 'N';
            case 0x00f1: case 0x0144: case 0x0146: case 0x0148: case 0x0149:
            case 0x014b:
                return 'n';
            case 0x0154: case 0x0156: case 0x0158:
                return 'R';
            case 0x0155: case 0x0157: case 0x0159:
                return 'r';
            case 0x015a: case 0x015c: case 0x015e: case 0x0160:
                return 'S';
            case 0x00df: case 0x015b: case 0x015d: case 0x015f: case 0x0161:
                return 's';
            case 0x0162: case 0x0164: case 0x0166:
                return 'T';
            case 0x0163: case 0x0165: case 0x0167:
                return 't';
            // W with grave, acute and diaeresis live at U+1E80..U+1E85
            // (even = capital, odd = small), not at U+0180..U+0185.
            case 0x0174: case 0x1e80: case 0x1e82: case 0x1e84:
                return 'W';
            case 0x0175: case 0x1e81: case 0x1e83: case 0x1e85:
                return 'w';
            case 0x0179: case 0x017b: case 0x017d:
                return 'Z';
            case 0x017a: case 0x017c: case 0x017e:
                return 'z';
            default:
                return NO_MAPPING;
        }
    }
}