// src/java/org/apache/lucene/analysis/standard/StandardTokenizer.jj - lucene-solr - Git at Google

 /* ====================================================================
  * The Apache Software License, Version 1.1
  *
  * Copyright (c) 2001 The Apache Software Foundation.  All rights
  * reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  *
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  *
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in
  *    the documentation and/or other materials provided with the
  *    distribution.
  *
  * 3. The end-user documentation included with the redistribution,
  *    if any, must include the following acknowledgment:
  *       "This product includes software developed by the
  *        Apache Software Foundation (http://www.apache.org/)."
  *    Alternately, this acknowledgment may appear in the software itself,
  *    if and wherever such third-party acknowledgments normally appear.
  *
  * 4. The names "Apache" and "Apache Software Foundation" and
  *    "Apache Lucene" must not be used to endorse or promote products
  *    derived from this software without prior written permission. For
  *    written permission, please contact apache@apache.org.
  *
  * 5. Products derived from this software may not be called "Apache",
  *    "Apache Lucene", nor may "Apache" appear in their name, without
  *    prior written permission of the Apache Software Foundation.
  *
  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
  * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
  * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
  * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * ====================================================================
  *
  * This software consists of voluntary contributions made by many
  * individuals on behalf of the Apache Software Foundation.  For more
  * information on the Apache Software Foundation, please see
  * <http://www.apache.org/>.
  */

 // JavaCC generator options for this grammar.  The commented-out entries are
 // options that are not being set here.
 options {
   // Generate instance (non-static) members; the class below declares an
   // instance constructor.
   STATIC = false;
 //IGNORE_CASE = true;
 //BUILD_PARSER = false;
 //UNICODE_INPUT = true;
   // Characters are supplied through a caller-provided character stream; the
   // constructor below passes in a FastCharStream.
   USER_CHAR_STREAM = true;
   OPTIMIZE_TOKEN_MANAGER = true;
 //DEBUG_TOKEN_MANAGER = true;
 }
 PARSER_BEGIN(StandardTokenizer)

 package org.apache.lucene.analysis.standard;

 import java.io.*;

 /**
  * Tokenizer whose token rules are defined by the JavaCC grammar in this file.
  *
  * <p>Intended to work well for documents in most European languages.
  *
  * <p>Tokenizing requirements differ between applications.  When this grammar
  * is not a good fit, copy this source directory into your own project and
  * maintain a customized grammar-based tokenizer there.
  */
 public class StandardTokenizer extends org.apache.lucene.analysis.Tokenizer {

   /**
    * Creates a tokenizer that reads its text from the given reader.
    *
    * @param reader the character source to tokenize; it is wrapped in a
    *               {@code FastCharStream} and also stored in {@code input}
    */
   public StandardTokenizer(Reader reader) {
     this(new FastCharStream(reader));
     input = reader;
   }
 }

 PARSER_END(StandardTokenizer)

 // Lexical token definitions.  Names prefixed with '#' (P, HAS_DIGIT, ALPHA,
 // LETTER, DIGIT) are helper patterns that are only referenced from other
 // patterns in this block; the remaining names are the token kinds consumed
 // by the next() production.
 TOKEN : {					  // token patterns

   // basic word: a sequence of digits & letters
   <ALPHANUM: (<LETTER>|<DIGIT>)+ >

   // internal apostrophes: O'Reilly, you're, O'Reilly's
   // use a post-filter to remove possessives
 | <APOSTROPHE: <ALPHA> ("'" <ALPHA>)+ >

   // acronyms: U.S.A., I.B.M., etc.
   // use a post-filter to remove dots
 | <ACRONYM: <ALPHA> "." (<ALPHA> ".")+ >

   // company names like AT&T and Excite@Home.
 | <COMPANY: <ALPHA> ("&"|"@") <ALPHA> >

   // email addresses
 | <EMAIL: <ALPHANUM> "@" <ALPHANUM> ("." <ALPHANUM>)+ >

   // hostname
 | <HOST: <ALPHANUM> ("." <ALPHANUM>)+ >

   // floating point, serial, model numbers, ip addresses, etc.
   // every other segment must have at least one digit
   //
   // Segments are separated by a single <P> character.  The six alternatives
   // together cover every sequence of two or more segments in which ALPHANUM
   // and HAS_DIGIT segments strictly alternate, starting with either kind:
   // the first two are the two-segment cases, the next two the odd-length
   // cases, and the last two the even-length cases of four or more segments.
 | <NUM: (<ALPHANUM> <P> <HAS_DIGIT>
        | <HAS_DIGIT> <P> <ALPHANUM>
        | <ALPHANUM> (<P> <HAS_DIGIT> <P> <ALPHANUM>)+
        | <HAS_DIGIT> (<P> <ALPHANUM> <P> <HAS_DIGIT>)+
        | <ALPHANUM> <P> <HAS_DIGIT> (<P> <ALPHANUM> <P> <HAS_DIGIT>)+
        | <HAS_DIGIT> <P> <ALPHANUM> (<P> <HAS_DIGIT> <P> <ALPHANUM>)+
         )
   >
   // segment separator used inside NUM
 | <#P: ("_"|"-"|"/"|"."|",") >
 | <#HAS_DIGIT:					  // at least one digit
     (<LETTER>|<DIGIT>)*
     <DIGIT>
     (<LETTER>|<DIGIT>)*
   >

   // one or more letters, no digits required
 | < #ALPHA: (<LETTER>)+>
   // NOTE(review): the \u0100-\u1fff range below also contains every
   // non-ASCII range listed under DIGIT, so those characters match both
   // LETTER and DIGIT.
 | < #LETTER:					  // unicode letters
       [
        "\u0041"-"\u005a",
        "\u0061"-"\u007a",
        "\u00c0"-"\u00d6",
        "\u00d8"-"\u00f6",
        "\u00f8"-"\u00ff",
        "\u0100"-"\u1fff",
        "\u3040"-"\u318f",
        "\u3300"-"\u337f",
        "\u3400"-"\u3d2d",
        "\u4e00"-"\u9fff",
        "\uf900"-"\ufaff"
       ]
   >
 | < #DIGIT:					  // unicode digits
       [
        "\u0030"-"\u0039",
        "\u0660"-"\u0669",
        "\u06f0"-"\u06f9",
        "\u0966"-"\u096f",
        "\u09e6"-"\u09ef",
        "\u0a66"-"\u0a6f",
        "\u0ae6"-"\u0aef",
        "\u0b66"-"\u0b6f",
        "\u0be7"-"\u0bef",
        "\u0c66"-"\u0c6f",
        "\u0ce6"-"\u0cef",
        "\u0d66"-"\u0d6f",
        "\u0e50"-"\u0e59",
        "\u0ed0"-"\u0ed9",
        "\u1040"-"\u1049"
       ]
   >
 }

 // NOISE matches any single character (~[] is the complement of the empty
 // set); being in a SKIP section, a character matched this way is dropped
 // rather than returned as a token.
 SKIP : {					  // skip unrecognized chars
  <NOISE: ~[] >
 }

 /**
  * Reads the next token from the input.
  *
  * <p>The type of the returned token is the entry of {@link
  * StandardTokenizerConstants#tokenImage} indexed by the matched token kind.
  *
  * @return the next token, or {@code null} once end of input is reached
  */
 org.apache.lucene.analysis.Token next() throws IOException :
 {
   Token matched = null;
 }
 {
   (
       matched = <ALPHANUM>
     | matched = <APOSTROPHE>
     | matched = <ACRONYM>
     | matched = <COMPANY>
     | matched = <EMAIL>
     | matched = <HOST>
     | matched = <NUM>
     | matched = <EOF>
   )
   {
     // End of input is reported to the caller as null instead of a token.
     if (matched.kind == EOF) {
       return null;
     }
     // beginColumn/endColumn are forwarded as the second and third
     // constructor arguments -- presumably the token's start and end
     // positions; verify against org.apache.lucene.analysis.Token.
     return new org.apache.lucene.analysis.Token(
         matched.image, matched.beginColumn, matched.endColumn,
         tokenImage[matched.kind]);
   }
 }
	/* ====================================================================
	* The Apache Software License, Version 1.1
	*
	* Copyright (c) 2001 The Apache Software Foundation. All rights
	* reserved.
	*
	* Redistribution and use in source and binary forms, with or without
	* modification, are permitted provided that the following conditions
	* are met:
	*
	* 1. Redistributions of source code must retain the above copyright
	* notice, this list of conditions and the following disclaimer.
	*
	* 2. Redistributions in binary form must reproduce the above copyright
	* notice, this list of conditions and the following disclaimer in
	* the documentation and/or other materials provided with the
	* distribution.
	*
	* 3. The end-user documentation included with the redistribution,
	* if any, must include the following acknowledgment:
	* "This product includes software developed by the
	* Apache Software Foundation (http://www.apache.org/)."
	* Alternately, this acknowledgment may appear in the software itself,
	* if and wherever such third-party acknowledgments normally appear.
	*
	* 4. The names "Apache" and "Apache Software Foundation" and
	* "Apache Lucene" must not be used to endorse or promote products
	* derived from this software without prior written permission. For
	* written permission, please contact apache@apache.org.
	*
	* 5. Products derived from this software may not be called "Apache",
	* "Apache Lucene", nor may "Apache" appear in their name, without
	* prior written permission of the Apache Software Foundation.
	*
	* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
	* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
	* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
	* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
	* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
	* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
	* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
	* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
	* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
	* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
	* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	* SUCH DAMAGE.
	* ====================================================================
	*
	* This software consists of voluntary contributions made by many
	* individuals on behalf of the Apache Software Foundation. For more
	* information on the Apache Software Foundation, please see
	* <http://www.apache.org/>.
	*/

	// JavaCC generator options for this grammar.  The commented-out entries are
	// options that are not being set here.
	options {
	// Generate instance (non-static) members; the class below declares an
	// instance constructor.
	STATIC = false;
	//IGNORE_CASE = true;
	//BUILD_PARSER = false;
	//UNICODE_INPUT = true;
	// Characters are supplied through a caller-provided character stream; the
	// constructor below passes in a FastCharStream.
	USER_CHAR_STREAM = true;
	OPTIMIZE_TOKEN_MANAGER = true;
	//DEBUG_TOKEN_MANAGER = true;
	}
	PARSER_BEGIN(StandardTokenizer)

	package org.apache.lucene.analysis.standard;

	import java.io.*;

	/**
	 * Tokenizer whose token rules are defined by the JavaCC grammar in this file.
	 *
	 * <p>Intended to work well for documents in most European languages.
	 *
	 * <p>Tokenizing requirements differ between applications.  When this grammar
	 * is not a good fit, copy this source directory into your own project and
	 * maintain a customized grammar-based tokenizer there.
	 */
	public class StandardTokenizer extends org.apache.lucene.analysis.Tokenizer {

	  /**
	   * Creates a tokenizer that reads its text from the given reader.
	   *
	   * @param reader the character source to tokenize; it is wrapped in a
	   *               {@code FastCharStream} and also stored in {@code input}
	   */
	  public StandardTokenizer(Reader reader) {
	    this(new FastCharStream(reader));
	    input = reader;
	  }
	}

	PARSER_END(StandardTokenizer)

	// Lexical token definitions.  Names prefixed with '#' (P, HAS_DIGIT, ALPHA,
	// LETTER, DIGIT) are helper patterns that are only referenced from other
	// patterns in this block; the remaining names are the token kinds consumed
	// by the next() production.
	TOKEN : { // token patterns

	  // basic word: a sequence of digits & letters
	  <ALPHANUM: (<LETTER>|<DIGIT>)+ >

	  // internal apostrophes: O'Reilly, you're, O'Reilly's
	  // use a post-filter to remove possessives
	| <APOSTROPHE: <ALPHA> ("'" <ALPHA>)+ >

	  // acronyms: U.S.A., I.B.M., etc.
	  // use a post-filter to remove dots
	| <ACRONYM: <ALPHA> "." (<ALPHA> ".")+ >

	  // company names like AT&T and Excite@Home.
	| <COMPANY: <ALPHA> ("&"|"@") <ALPHA> >

	  // email addresses
	| <EMAIL: <ALPHANUM> "@" <ALPHANUM> ("." <ALPHANUM>)+ >

	  // hostname
	| <HOST: <ALPHANUM> ("." <ALPHANUM>)+ >

	  // floating point, serial, model numbers, ip addresses, etc.
	  // every other segment must have at least one digit
	  //
	  // Segments are separated by a single <P> character.  The six alternatives
	  // together cover every sequence of two or more segments in which ALPHANUM
	  // and HAS_DIGIT segments strictly alternate, starting with either kind:
	  // the first two are the two-segment cases, the next two the odd-length
	  // cases, and the last two the even-length cases of four or more segments.
	| <NUM: (<ALPHANUM> <P> <HAS_DIGIT>
	       | <HAS_DIGIT> <P> <ALPHANUM>
	       | <ALPHANUM> (<P> <HAS_DIGIT> <P> <ALPHANUM>)+
	       | <HAS_DIGIT> (<P> <ALPHANUM> <P> <HAS_DIGIT>)+
	       | <ALPHANUM> <P> <HAS_DIGIT> (<P> <ALPHANUM> <P> <HAS_DIGIT>)+
	       | <HAS_DIGIT> <P> <ALPHANUM> (<P> <HAS_DIGIT> <P> <ALPHANUM>)+
	        )
	  >
	  // segment separator used inside NUM
	| <#P: ("_"|"-"|"/"|"."|",") >
	| <#HAS_DIGIT: // at least one digit
	    (<LETTER>|<DIGIT>)*
	    <DIGIT>
	    (<LETTER>|<DIGIT>)*
	  >

	  // one or more letters, no digits required
	| < #ALPHA: (<LETTER>)+>
	  // NOTE(review): the \u0100-\u1fff range below also contains every
	  // non-ASCII range listed under DIGIT, so those characters match both
	  // LETTER and DIGIT.
	| < #LETTER: // unicode letters
	      [
	       "\u0041"-"\u005a",
	       "\u0061"-"\u007a",
	       "\u00c0"-"\u00d6",
	       "\u00d8"-"\u00f6",
	       "\u00f8"-"\u00ff",
	       "\u0100"-"\u1fff",
	       "\u3040"-"\u318f",
	       "\u3300"-"\u337f",
	       "\u3400"-"\u3d2d",
	       "\u4e00"-"\u9fff",
	       "\uf900"-"\ufaff"
	      ]
	  >
	| < #DIGIT: // unicode digits
	      [
	       "\u0030"-"\u0039",
	       "\u0660"-"\u0669",
	       "\u06f0"-"\u06f9",
	       "\u0966"-"\u096f",
	       "\u09e6"-"\u09ef",
	       "\u0a66"-"\u0a6f",
	       "\u0ae6"-"\u0aef",
	       "\u0b66"-"\u0b6f",
	       "\u0be7"-"\u0bef",
	       "\u0c66"-"\u0c6f",
	       "\u0ce6"-"\u0cef",
	       "\u0d66"-"\u0d6f",
	       "\u0e50"-"\u0e59",
	       "\u0ed0"-"\u0ed9",
	       "\u1040"-"\u1049"
	      ]
	  >
	}

	// NOISE matches any single character (~[] is the complement of the empty
	// set); being in a SKIP section, a character matched this way is dropped
	// rather than returned as a token.
	SKIP : { // skip unrecognized chars
	<NOISE: ~[] >
	}

	/**
	 * Returns the next token in the stream, or null at EOS.
	 *
	 * <p>The returned token's type is set to an element of {@link
	 * StandardTokenizerConstants#tokenImage}, selected by the matched token
	 * kind.
	 *
	 * @return the next token, or {@code null} once end of input is reached
	 */
	org.apache.lucene.analysis.Token next() throws IOException :
	{
	  Token token = null;
	}
	{
	  // Accept any one of the public token kinds, or end of input.
	  ( token = <ALPHANUM> |
	    token = <APOSTROPHE> |
	    token = <ACRONYM> |
	    token = <COMPANY> |
	    token = <EMAIL> |
	    token = <HOST> |
	    token = <NUM> |
	    token = <EOF>
	  )
	  {
	    if (token.kind == EOF) {
	      // End of input is reported to the caller as null.
	      return null;
	    } else {
	      // beginColumn/endColumn are forwarded as the second and third
	      // constructor arguments -- presumably the token's start and end
	      // positions; verify against org.apache.lucene.analysis.Token.
	      return
	        new org.apache.lucene.analysis.Token(token.image,
	                                             token.beginColumn, token.endColumn,
	                                             tokenImage[token.kind]);
	    }
	  }
	}