blob: 07d78570f06790fd92403af25028c14bd9370937 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.standard;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
/**
 * This class implements the classic Lucene StandardTokenizer, as it was up until 3.0.
 */
@SuppressWarnings("fallthrough")
%%
// Name of the generated scanner class.
%class ClassicTokenizerImpl
// Unicode version used when resolving character classes such as [:letter:] and [:digit:].
%unicode 3.0
// The scanning method is declared to return int (the token-type constants returned by the rules).
%integer
// Name of the generated scanning method.
%function getNextToken
// NOTE(review): requests packed DFA tables; believed to be accepted only for compatibility on recent JFlex releases - confirm.
%pack
// Turns on character counting; the counter is exposed through yychar() in the user code below.
%char
// Size of the scan buffer, in chars.
%buffer 4096
%{
/*
 * Token-type codes, mirrored from ClassicTokenizer so that the rule actions
 * of this grammar can return them without qualification.
 */
public static final int ALPHANUM    = ClassicTokenizer.ALPHANUM;
public static final int APOSTROPHE  = ClassicTokenizer.APOSTROPHE;
public static final int ACRONYM     = ClassicTokenizer.ACRONYM;
public static final int COMPANY     = ClassicTokenizer.COMPANY;
public static final int EMAIL       = ClassicTokenizer.EMAIL;
public static final int HOST        = ClassicTokenizer.HOST;
public static final int NUM         = ClassicTokenizer.NUM;
public static final int CJ          = ClassicTokenizer.CJ;
public static final int ACRONYM_DEP = ClassicTokenizer.ACRONYM_DEP;

/** The same array instance as {@code ClassicTokenizer.TOKEN_TYPES}. */
public static final String[] TOKEN_TYPES = ClassicTokenizer.TOKEN_TYPES;
/**
 * Returns the scanner's current {@code yychar} counter.
 *
 * @return the value of the {@code yychar} field
 */
public final int yychar() {
  return this.yychar;
}
/**
 * Copies the text of the current token into the given attribute.
 *
 * @param t attribute that receives the chars of {@code zzBuffer} from
 *          {@code zzStartRead} (inclusive) to {@code zzMarkedPos} (exclusive)
 */
public final void getText(CharTermAttribute t) {
  final int offset = zzStartRead;
  final int length = zzMarkedPos - offset;
  t.copyBuffer(zzBuffer, offset, length);
}
/**
 * Unsupported: this scanner does not allow its buffer size to be changed.
 * NOTE(review): presumably the size is meant to stay at the value given by
 * the %buffer directive - confirm.
 *
 * @param numChars requested buffer size in chars; ignored
 * @throws UnsupportedOperationException always
 */
public final void setBufferSize(int numChars) {
  throw new UnsupportedOperationException();
}
%}
// Thai characters: code points U+0E00 through U+0E59
THAI = [\u0E00-\u0E59]
// basic word: a sequence of digits & letters (includes Thai to enable ThaiAnalyzer to function)
ALPHANUM = ({LETTER}|{THAI}|[:digit:])+
// internal apostrophes: O'Reilly, you're, O'Reilly's
// (two or more ALPHA runs joined by single apostrophes)
// use a post-filter to remove possessives
APOSTROPHE = {ALPHA} ("'" {ALPHA})+
// acronyms: U.S.A., I.B.M., etc.
// (two or more single LETTERs, each followed by a dot)
// use a post-filter to remove dots
ACRONYM = {LETTER} "." ({LETTER} ".")+
// same shape as ACRONYM, but every dot-terminated segment is a whole ALPHANUM run rather than one letter
// NOTE(review): the _DEP suffix presumably stands for "deprecated" - confirm against ClassicTokenizer
ACRONYM_DEP = {ALPHANUM} "." ({ALPHANUM} ".")+
// company names like AT&T and Excite@Home.
// (exactly two ALPHA runs joined by a single "&" or "@")
COMPANY = {ALPHA} ("&"|"@") {ALPHA}
// email addresses
// before the "@": ALPHANUM runs joined by ".", "-" or "_"
// after the "@": two or more ALPHANUM runs joined by "." or "-"
EMAIL = {ALPHANUM} (("."|"-"|"_") {ALPHANUM})* "@" {ALPHANUM} (("."|"-") {ALPHANUM})+
// hostname: two or more ALPHANUM runs joined by "."
HOST = {ALPHANUM} ((".") {ALPHANUM})+
// floating point, serial, model numbers, ip addresses, etc.
// segments are separated by a single P character;
// every other segment must have at least one digit
NUM = ({ALPHANUM} {P} {HAS_DIGIT}
| {HAS_DIGIT} {P} {ALPHANUM}
| {ALPHANUM} ({P} {HAS_DIGIT} {P} {ALPHANUM})+
| {HAS_DIGIT} ({P} {ALPHANUM} {P} {HAS_DIGIT})+
| {ALPHANUM} {P} {HAS_DIGIT} ({P} {ALPHANUM} {P} {HAS_DIGIT})+
| {HAS_DIGIT} {P} {ALPHANUM} ({P} {HAS_DIGIT} {P} {ALPHANUM})+)
// punctuation (the separators allowed between the segments of NUM)
P = ("_"|"-"|"/"|"."|",")
// a run of letters and digits containing at least one digit
HAS_DIGIT = ({LETTER}|[:digit:])* [:digit:] ({LETTER}|[:digit:])*
// one or more letters (no digits)
ALPHA = ({LETTER})+
// any [:letter:] that is not matched by CJ
// From the JFlex manual: "the expression that matches everything of <a> not matched by <b> is !(!<a>|<b>)"
LETTER = !(![:letter:]|{CJ})
// Chinese and Japanese (but NOT Korean, which is included in [:letter:])
CJ = [\u3100-\u312f\u3040-\u309F\u30A0-\u30FF\u31F0-\u31FF\u3300-\u337f\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\uff65-\uff9f]
%%
// One rule per token type: each action returns the type constant declared in the user-code section.
// NOTE(review): per the JFlex manual the longest match wins and, for equal lengths, the rule listed
// first wins, so the order of these rules is significant - do not reorder without checking.
{ALPHANUM} { return ALPHANUM; }
{APOSTROPHE} { return APOSTROPHE; }
{ACRONYM} { return ACRONYM; }
{COMPANY} { return COMPANY; }
{EMAIL} { return EMAIL; }
{HOST} { return HOST; }
{NUM} { return NUM; }
{CJ} { return CJ; }
{ACRONYM_DEP} { return ACRONYM_DEP; }
/** Ignore the rest */
// [^] matches any single character; its action returns no token type.
[^] { /* Break so we don't hit fall-through warning: */ break;/* ignore */ }