| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package org.apache.lucene.analysis.standard; |
| |
| import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; |
| |
| /** |
| * This class implements the classic lucene StandardTokenizer up until 3.0 |
| */ |
| @SuppressWarnings("fallthrough") |
| %% |
| |
| %class ClassicTokenizerImpl |
| %unicode 3.0 |
| %integer |
| %function getNextToken |
| %pack |
| %char |
| %buffer 4096 |
| |
| %{ |
| |
| public static final int ALPHANUM = ClassicTokenizer.ALPHANUM; |
| public static final int APOSTROPHE = ClassicTokenizer.APOSTROPHE; |
| public static final int ACRONYM = ClassicTokenizer.ACRONYM; |
| public static final int COMPANY = ClassicTokenizer.COMPANY; |
| public static final int EMAIL = ClassicTokenizer.EMAIL; |
| public static final int HOST = ClassicTokenizer.HOST; |
| public static final int NUM = ClassicTokenizer.NUM; |
| public static final int CJ = ClassicTokenizer.CJ; |
| public static final int ACRONYM_DEP = ClassicTokenizer.ACRONYM_DEP; |
| |
| public static final String [] TOKEN_TYPES = ClassicTokenizer.TOKEN_TYPES; |
| |
| public final int yychar() |
| { |
| return yychar; |
| } |
| |
| /** |
| * Fills CharTermAttribute with the current token text. |
| */ |
| public final void getText(CharTermAttribute t) { |
| t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead); |
| } |
| |
| public final void setBufferSize(int numChars) { |
| throw new UnsupportedOperationException(); |
| } |
| %} |
| |
| THAI = [\u0E00-\u0E59] |
| |
| // basic word: a sequence of digits & letters (includes Thai to enable ThaiAnalyzer to function) |
| ALPHANUM = ({LETTER}|{THAI}|[:digit:])+ |
| |
| // internal apostrophes: O'Reilly, you're, O'Reilly's |
| // use a post-filter to remove possessives |
| APOSTROPHE = {ALPHA} ("'" {ALPHA})+ |
| |
| // acronyms: U.S.A., I.B.M., etc. |
| // use a post-filter to remove dots |
| ACRONYM = {LETTER} "." ({LETTER} ".")+ |
| |
| ACRONYM_DEP = {ALPHANUM} "." ({ALPHANUM} ".")+ |
| |
| // company names like AT&T and Excite@Home. |
| COMPANY = {ALPHA} ("&"|"@") {ALPHA} |
| |
| // email addresses |
| EMAIL = {ALPHANUM} (("."|"-"|"_") {ALPHANUM})* "@" {ALPHANUM} (("."|"-") {ALPHANUM})+ |
| |
| // hostname |
| HOST = {ALPHANUM} ((".") {ALPHANUM})+ |
| |
| // floating point, serial, model numbers, ip addresses, etc. |
| // every other segment must have at least one digit |
| NUM = ({ALPHANUM} {P} {HAS_DIGIT} |
| | {HAS_DIGIT} {P} {ALPHANUM} |
| | {ALPHANUM} ({P} {HAS_DIGIT} {P} {ALPHANUM})+ |
| | {HAS_DIGIT} ({P} {ALPHANUM} {P} {HAS_DIGIT})+ |
| | {ALPHANUM} {P} {HAS_DIGIT} ({P} {ALPHANUM} {P} {HAS_DIGIT})+ |
| | {HAS_DIGIT} {P} {ALPHANUM} ({P} {HAS_DIGIT} {P} {ALPHANUM})+) |
| |
| // punctuation |
| P = ("_"|"-"|"/"|"."|",") |
| |
| // at least one digit |
| HAS_DIGIT = ({LETTER}|[:digit:])* [:digit:] ({LETTER}|[:digit:])* |
| |
| ALPHA = ({LETTER})+ |
| |
| // From the JFlex manual: "the expression that matches everything of <a> not matched by <b> is !(!<a>|<b>)" |
| LETTER = !(![:letter:]|{CJ}) |
| |
| // Chinese and Japanese (but NOT Korean, which is included in [:letter:]) |
| CJ = [\u3100-\u312f\u3040-\u309F\u30A0-\u30FF\u31F0-\u31FF\u3300-\u337f\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\uff65-\uff9f] |
| |
| %% |
| |
| {ALPHANUM} { return ALPHANUM; } |
| {APOSTROPHE} { return APOSTROPHE; } |
| {ACRONYM} { return ACRONYM; } |
| {COMPANY} { return COMPANY; } |
| {EMAIL} { return EMAIL; } |
| {HOST} { return HOST; } |
| {NUM} { return NUM; } |
| {CJ} { return CJ; } |
| {ACRONYM_DEP} { return ACRONYM_DEP; } |
| |
| /** Ignore the rest */ |
| [^] { /* Break so we don't hit fall-through warning: */ break;/* ignore */ } |