lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex - lucene-solr - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 package org.apache.lucene.analysis.standard;

 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

 /**
  * This class implements the classic Lucene StandardTokenizer (the behavior up until version 3.0).
  */
 @SuppressWarnings("fallthrough")
 %%

 // Name of the generated scanner class.
 %class ClassicTokenizerImpl
 // Use the Unicode 3.0 definitions for character classes such as [:letter:] and [:digit:].
 %unicode 3.0
 // Declare the scanning method with return type int (the rules return int token type ids).
 %integer
 // Name the scanning method getNextToken instead of the JFlex default name.
 %function getNextToken
 // Request packed DFA tables in the generated class.
 %pack
 // Turn on character counting so that the yychar field is maintained.
 %char
 // Initial size of the scan buffer, in chars.
 %buffer 4096

 %{

 // Token type ids returned by the lexical rules. Each one is an alias of the
 // identically named constant on ClassicTokenizer, so the two classes cannot drift apart.
 public static final int ALPHANUM = ClassicTokenizer.ALPHANUM;
 public static final int APOSTROPHE = ClassicTokenizer.APOSTROPHE;
 public static final int ACRONYM = ClassicTokenizer.ACRONYM;
 public static final int COMPANY = ClassicTokenizer.COMPANY;
 public static final int EMAIL = ClassicTokenizer.EMAIL;
 public static final int HOST = ClassicTokenizer.HOST;
 public static final int NUM = ClassicTokenizer.NUM;
 public static final int CJ = ClassicTokenizer.CJ;
 public static final int ACRONYM_DEP = ClassicTokenizer.ACRONYM_DEP;

 // Alias of ClassicTokenizer.TOKEN_TYPES (same array instance, not a copy).
 // NOTE(review): presumably the display names indexed by the ids above; confirm in ClassicTokenizer.
 public static final String[] TOKEN_TYPES = ClassicTokenizer.TOKEN_TYPES;

 /**
  * Exposes the scanner's {@code yychar} counter, which JFlex maintains because
  * the {@code %char} directive is enabled for this grammar.
  *
  * @return the current value of the {@code yychar} field
  */
 public final int yychar() {
   return this.yychar;
 }

 /**
  * Copies the text of the current match out of the scan buffer into the given attribute.
  *
  * @param t attribute that receives the characters between the start of the
  *          current match and the marked end position
  */
 public final void getText(CharTermAttribute t) {
   final int matchLength = zzMarkedPos - zzStartRead;
   t.copyBuffer(zzBuffer, zzStartRead, matchLength);
 }

    /**
     * Unsupported operation: this scanner does not allow its buffer size to be changed.
     *
     * @param numChars requested buffer size in chars (never used)
     * @throws UnsupportedOperationException always
     */
    public final void setBufferSize(int numChars) {
        throw new UnsupportedOperationException();
    }
 %}

 // Thai code points U+0E00..U+0E59; kept separate from LETTER so it can be added to ALPHANUM only.
 THAI       = [\u0E00-\u0E59]

 // basic word: a sequence of digits & letters (includes Thai to enable ThaiAnalyzer to function)
 ALPHANUM   = ({LETTER}|{THAI}|[:digit:])+

 // internal apostrophes: O'Reilly, you're, O'Reilly's
 // use a post-filter to remove possessives
 // At least one apostrophe is required, and every apostrophe must be followed by letters.
 APOSTROPHE =  {ALPHA} ("'" {ALPHA})+

 // acronyms: U.S.A., I.B.M., etc.
 // use a post-filter to remove dots
 // Single letters only, each followed by a dot; at least two letter-dot pairs, trailing dot included.
 ACRONYM    =  {LETTER} "." ({LETTER} ".")+

 // Same dotted shape as ACRONYM, but each segment is a whole ALPHANUM run rather than one letter.
 // NOTE(review): the _DEP suffix presumably means "deprecated"; confirm against ClassicTokenizer.
 ACRONYM_DEP  = {ALPHANUM} "." ({ALPHANUM} ".")+

 // company names like AT&T and Excite@Home.
 // Exactly one "&" or "@" between two letter-only runs.
 COMPANY    =  {ALPHA} ("&"|"@") {ALPHA}

 // email addresses
 // Local part: ALPHANUM segments joined by ".", "-" or "_".
 // Domain: at least two ALPHANUM segments joined by "." or "-".
 EMAIL      =  {ALPHANUM} (("."|"-"|"_") {ALPHANUM})* "@" {ALPHANUM} (("."|"-") {ALPHANUM})+

 // hostname
 // Two or more ALPHANUM segments joined by dots, with no trailing dot.
 HOST       =  {ALPHANUM} ((".") {ALPHANUM})+

 // floating point, serial, model numbers, ip addresses, etc.
 // every other segment must have at least one digit
 // Segments are separated by a single {P} character. The six alternatives enumerate the
 // sequences of two or more segments in which ALPHANUM and HAS_DIGIT segments strictly
 // alternate, starting with either kind and ending with either kind.
 NUM        = ({ALPHANUM} {P} {HAS_DIGIT}
            | {HAS_DIGIT} {P} {ALPHANUM}
            | {ALPHANUM} ({P} {HAS_DIGIT} {P} {ALPHANUM})+
            | {HAS_DIGIT} ({P} {ALPHANUM} {P} {HAS_DIGIT})+
            | {ALPHANUM} {P} {HAS_DIGIT} ({P} {ALPHANUM} {P} {HAS_DIGIT})+
            | {HAS_DIGIT} {P} {ALPHANUM} ({P} {HAS_DIGIT} {P} {ALPHANUM})+)

 // punctuation
 // The single-character separators allowed between the segments of NUM.
 P           = ("_"|"-"|"/"|"."|",")

 // at least one digit
 // A run of letters and digits containing a digit somewhere; unlike ALPHANUM, THAI is not included.
 HAS_DIGIT  = ({LETTER}|[:digit:])* [:digit:] ({LETTER}|[:digit:])*

 // One or more letters, with no digits and no Thai.
 ALPHA      = ({LETTER})+

 // From the JFlex manual: "the expression that matches everything of <a> not matched by <b> is !(!<a>|<b>)"
 // Here <a> is [:letter:] and <b> is CJ, so LETTER is any letter that is not in CJ.
 LETTER     = !(![:letter:]|{CJ})

 // Chinese and Japanese (but NOT Korean, which is included in [:letter:])
 // Ranges, in order: Bopomofo, Hiragana, Katakana, Katakana Phonetic Extensions, CJK Compatibility,
 // CJK Extension A, CJK Unified Ideographs, CJK Compatibility Ideographs, halfwidth Katakana.
 CJ         = [\u3100-\u312f\u3040-\u309F\u30A0-\u30FF\u31F0-\u31FF\u3300-\u337f\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\uff65-\uff9f]

 %%

 // Token rules. JFlex selects the longest match; when several rules match the same
 // longest text, the rule listed first wins. Each action returns the token type id.
 {ALPHANUM}                                                     { return ALPHANUM; }
 {APOSTROPHE}                                                   { return APOSTROPHE; }
 {ACRONYM}                                                      { return ACRONYM; }
 {COMPANY}                                                      { return COMPANY; }
 {EMAIL}                                                        { return EMAIL; }
 {HOST}                                                         { return HOST; }
 {NUM}                                                          { return NUM; }
 {CJ}                                                           { return CJ; }
 // Listed last, so on an equal-length match every rule above takes precedence over it.
 {ACRONYM_DEP}                                                  { return ACRONYM_DEP; }

 /** Ignore the rest */
 // [^] matches any single character; the action returns nothing, so no token is produced for it.
 [^]                                                            { /* Break so we don't hit fall-through warning: */ break;/* ignore */ }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	package org.apache.lucene.analysis.standard;

	import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

	/**
	* This class implements the classic Lucene StandardTokenizer (the behavior up until version 3.0).
	*/
	@SuppressWarnings("fallthrough")
	%%

	// Name of the generated scanner class.
	%class ClassicTokenizerImpl
	// Use the Unicode 3.0 definitions for character classes such as [:letter:] and [:digit:].
	%unicode 3.0
	// Declare the scanning method with return type int (the rules return int token type ids).
	%integer
	// Name the scanning method getNextToken instead of the JFlex default name.
	%function getNextToken
	// Request packed DFA tables in the generated class.
	%pack
	// Turn on character counting so that the yychar field is maintained.
	%char
	// Initial size of the scan buffer, in chars.
	%buffer 4096

	%{

	// Token type ids returned by the lexical rules. Each one is an alias of the
	// identically named constant on ClassicTokenizer, so the two classes cannot drift apart.
	public static final int ALPHANUM = ClassicTokenizer.ALPHANUM;
	public static final int APOSTROPHE = ClassicTokenizer.APOSTROPHE;
	public static final int ACRONYM = ClassicTokenizer.ACRONYM;
	public static final int COMPANY = ClassicTokenizer.COMPANY;
	public static final int EMAIL = ClassicTokenizer.EMAIL;
	public static final int HOST = ClassicTokenizer.HOST;
	public static final int NUM = ClassicTokenizer.NUM;
	public static final int CJ = ClassicTokenizer.CJ;
	public static final int ACRONYM_DEP = ClassicTokenizer.ACRONYM_DEP;

	// Alias of ClassicTokenizer.TOKEN_TYPES (same array instance, not a copy).
	// NOTE(review): presumably the display names indexed by the ids above; confirm in ClassicTokenizer.
	public static final String[] TOKEN_TYPES = ClassicTokenizer.TOKEN_TYPES;

	/**
	 * Exposes the scanner's {@code yychar} counter, which JFlex maintains because
	 * the {@code %char} directive is enabled for this grammar.
	 *
	 * @return the current value of the {@code yychar} field
	 */
	public final int yychar() {
		return this.yychar;
	}

	/**
	 * Copies the text of the current match out of the scan buffer into the given attribute.
	 *
	 * @param t attribute that receives the characters between the start of the
	 *          current match and the marked end position
	 */
	public final void getText(CharTermAttribute t) {
		final int matchLength = zzMarkedPos - zzStartRead;
		t.copyBuffer(zzBuffer, zzStartRead, matchLength);
	}

	/**
	 * Unsupported operation: this scanner does not allow its buffer size to be changed.
	 *
	 * @param numChars requested buffer size in chars (never used)
	 * @throws UnsupportedOperationException always
	 */
	public final void setBufferSize(int numChars) {
		throw new UnsupportedOperationException();
	}
	%}

	// Thai code points U+0E00..U+0E59; kept separate from LETTER so it can be added to ALPHANUM only.
	THAI = [\u0E00-\u0E59]

	// basic word: a sequence of digits & letters (includes Thai to enable ThaiAnalyzer to function)
	// The bars are unescaped alternation operators; an escaped bar would match a literal '|' character.
	ALPHANUM = ({LETTER}|{THAI}|[:digit:])+

	// internal apostrophes: O'Reilly, you're, O'Reilly's
	// use a post-filter to remove possessives
	// At least one apostrophe is required, and every apostrophe must be followed by letters.
	APOSTROPHE = {ALPHA} ("'" {ALPHA})+

	// acronyms: U.S.A., I.B.M., etc.
	// use a post-filter to remove dots
	// Single letters only, each followed by a dot; at least two letter-dot pairs, trailing dot included.
	ACRONYM = {LETTER} "." ({LETTER} ".")+

	// Same dotted shape as ACRONYM, but each segment is a whole ALPHANUM run rather than one letter.
	// NOTE(review): the _DEP suffix presumably means "deprecated"; confirm against ClassicTokenizer.
	ACRONYM_DEP = {ALPHANUM} "." ({ALPHANUM} ".")+

	// company names like AT&T and Excite@Home.
	// Exactly one "&" or "@" between two letter-only runs.
	COMPANY = {ALPHA} ("&"|"@") {ALPHA}

	// email addresses
	// Local part: ALPHANUM segments joined by ".", "-" or "_".
	// Domain: at least two ALPHANUM segments joined by "." or "-".
	EMAIL = {ALPHANUM} (("."|"-"|"_") {ALPHANUM})* "@" {ALPHANUM} (("."|"-") {ALPHANUM})+

	// hostname
	// Two or more ALPHANUM segments joined by dots, with no trailing dot.
	HOST = {ALPHANUM} ((".") {ALPHANUM})+

	// floating point, serial, model numbers, ip addresses, etc.
	// every other segment must have at least one digit
	// Segments are separated by a single {P} character. The six alternatives enumerate the
	// sequences of two or more segments in which ALPHANUM and HAS_DIGIT segments strictly
	// alternate, starting with either kind and ending with either kind.
	// Each continuation line begins with an unescaped alternation bar.
	NUM = ({ALPHANUM} {P} {HAS_DIGIT}
	    | {HAS_DIGIT} {P} {ALPHANUM}
	    | {ALPHANUM} ({P} {HAS_DIGIT} {P} {ALPHANUM})+
	    | {HAS_DIGIT} ({P} {ALPHANUM} {P} {HAS_DIGIT})+
	    | {ALPHANUM} {P} {HAS_DIGIT} ({P} {ALPHANUM} {P} {HAS_DIGIT})+
	    | {HAS_DIGIT} {P} {ALPHANUM} ({P} {HAS_DIGIT} {P} {ALPHANUM})+)

	// punctuation
	// The single-character separators allowed between the segments of NUM.
	P = ("_"|"-"|"/"|"."|",")

	// at least one digit
	// A run of letters and digits containing a digit somewhere; unlike ALPHANUM, THAI is not included.
	HAS_DIGIT = ({LETTER}|[:digit:])* [:digit:] ({LETTER}|[:digit:])*

	// One or more letters, with no digits and no Thai.
	ALPHA = ({LETTER})+

	// From the JFlex manual: "the expression that matches everything of <a> not matched by <b> is !(!<a>|<b>)"
	// Here <a> is [:letter:] and <b> is CJ, so LETTER is any letter that is not in CJ.
	LETTER = !(![:letter:]|{CJ})

	// Chinese and Japanese (but NOT Korean, which is included in [:letter:])
	// Ranges, in order: Bopomofo, Hiragana, Katakana, Katakana Phonetic Extensions, CJK Compatibility,
	// CJK Extension A, CJK Unified Ideographs, CJK Compatibility Ideographs, halfwidth Katakana.
	CJ = [\u3100-\u312f\u3040-\u309F\u30A0-\u30FF\u31F0-\u31FF\u3300-\u337f\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\uff65-\uff9f]

	%%

	// Token rules. JFlex selects the longest match; when several rules match the same
	// longest text, the rule listed first wins. Each action returns the token type id.
	{ALPHANUM}    { return ALPHANUM; }
	{APOSTROPHE}  { return APOSTROPHE; }
	{ACRONYM}     { return ACRONYM; }
	{COMPANY}     { return COMPANY; }
	{EMAIL}       { return EMAIL; }
	{HOST}        { return HOST; }
	{NUM}         { return NUM; }
	{CJ}          { return CJ; }
	// Listed last, so on an equal-length match every rule above takes precedence over it.
	{ACRONYM_DEP} { return ACRONYM_DEP; }

	/** Ignore the rest */
	// [^] matches any single character; the action returns nothing, so no token is produced for it.
	// The break must be a real statement between the two comments, not part of the comment text.
	[^]           { /* Break so we don't hit fall-through warning: */ break; /* ignore */ }