blob: c1fe60275eb32aeecd2c12ec6bcf57b9f8cc280b [file] [log] [blame]
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
// JavaCC generator options for this grammar. The commented-out entries are
// options that are left unset, so JavaCC's defaults apply to them.
options {
// Generate instance (non-static) members, so every tokenizer object is
// independent of the others; the constructor below creates one per Reader.
STATIC = false;
//IGNORE_CASE = true;
//BUILD_PARSER = false;
//UNICODE_INPUT = true;
// Characters are read through a caller-supplied character stream instead of
// one of JavaCC's generated stream classes; the constructor below passes a
// FastCharStream.
USER_CHAR_STREAM = true;
// NOTE(review): this option is not recognized by every JavaCC release --
// confirm against the JavaCC version used by the build.
OPTIMIZE_TOKEN_MANAGER = true;
//DEBUG_TOKEN_MANAGER = true;
}
PARSER_BEGIN(StandardTokenizer)
package org.apache.lucene.analysis.standard;
import java.io.*;
/**
 * Tokenizer whose lexical rules are defined by a JavaCC grammar.
 *
 * <p>It is intended to be a good tokenizer for documents written in most
 * European languages.
 *
 * <p>Tokenization needs differ between applications. If this tokenizer does
 * not fit yours, consider copying this source code directory into your project
 * and maintaining your own grammar-based tokenizer.
 */
public class StandardTokenizer extends org.apache.lucene.analysis.Tokenizer {

  /**
   * Creates a tokenizer that reads its text from the given reader.
   *
   * @param reader source of the characters to tokenize
   */
  public StandardTokenizer(Reader reader) {
    // Delegate to the constructor that takes a character stream (not declared
    // in this grammar; JavaCC generates it), wrapping the reader first.
    this(new FastCharStream(reader));
    // Also keep the reader in the `input` field (not declared in this grammar;
    // presumably inherited from Tokenizer -- confirm).
    input = reader;
  }
}
PARSER_END(StandardTokenizer)
// Lexical token definitions. Names prefixed with '#' are private helper
// expressions: they can be referenced from other definitions but never
// produce a token of their own.
// Per JavaCC's matching rules the longest match wins and, when several
// definitions match the same length, the one declared first is chosen, so
// the order of the definitions below is significant.
TOKEN : { // token patterns
// basic word: a sequence of digits & letters
<ALPHANUM: (<LETTER>|<DIGIT>)+ >
// internal apostrophes: O'Reilly, you're, O'Reilly's
// use a post-filter to remove possessives
| <APOSTROPHE: <ALPHA> ("'" <ALPHA>)+ >
// acronyms: U.S.A., I.B.M., etc.
// use a post-filter to remove dots
// (every letter group, including the last one, must be followed by a dot)
| <ACRONYM: <ALPHA> "." (<ALPHA> ".")+ >
// company names like AT&T and Excite@Home.
// (exactly two letter groups joined by a single '&' or '@')
| <COMPANY: <ALPHA> ("&"|"@") <ALPHA> >
// email addresses
// (the part after '@' needs at least two dot-separated segments)
| <EMAIL: <ALPHANUM> "@" <ALPHANUM> ("." <ALPHANUM>)+ >
// hostname
// (two or more alphanumeric segments separated by dots)
| <HOST: <ALPHANUM> ("." <ALPHANUM>)+ >
// floating point, serial, model numbers, ip addresses, etc.
// every other segment must have at least one digit
// The six alternatives together cover every sequence of two or more segments
// joined by <P> in which ALPHANUM and HAS_DIGIT segments alternate, starting
// with either kind.
| <NUM: (<ALPHANUM> <P> <HAS_DIGIT>
| <HAS_DIGIT> <P> <ALPHANUM>
| <ALPHANUM> (<P> <HAS_DIGIT> <P> <ALPHANUM>)+
| <HAS_DIGIT> (<P> <ALPHANUM> <P> <HAS_DIGIT>)+
| <ALPHANUM> <P> <HAS_DIGIT> (<P> <ALPHANUM> <P> <HAS_DIGIT>)+
| <HAS_DIGIT> <P> <ALPHANUM> (<P> <HAS_DIGIT> <P> <ALPHANUM>)+
)
>
// single separator character allowed between the segments of a NUM
| <#P: ("_"|"-"|"/"|"."|",") >
| <#HAS_DIGIT: // at least one digit
(<LETTER>|<DIGIT>)*
<DIGIT>
(<LETTER>|<DIGIT>)*
>
// one or more letters, no digits
| < #ALPHA: (<LETTER>)+>
// The first two ranges are ASCII A-Z and a-z; the next three cover
// \u00c0-\u00ff while leaving out \u00d7 and \u00f7.
// NOTE(review): the range \u0100-\u1fff also contains every non-ASCII range
// listed under DIGIT below, so those characters are matched by LETTER as well.
| < #LETTER: // unicode letters
[
"\u0041"-"\u005a",
"\u0061"-"\u007a",
"\u00c0"-"\u00d6",
"\u00d8"-"\u00f6",
"\u00f8"-"\u00ff",
"\u0100"-"\u1fff",
"\u3040"-"\u318f",
"\u3300"-"\u337f",
"\u3400"-"\u3d2d",
"\u4e00"-"\u9fff",
"\uf900"-"\ufaff"
]
>
// The first range is ASCII 0-9; the remaining ranges lie between \u0660 and
// \u1049.
| < #DIGIT: // unicode digits
[
"\u0030"-"\u0039",
"\u0660"-"\u0669",
"\u06f0"-"\u06f9",
"\u0966"-"\u096f",
"\u09e6"-"\u09ef",
"\u0a66"-"\u0a6f",
"\u0ae6"-"\u0aef",
"\u0b66"-"\u0b6f",
"\u0be7"-"\u0bef",
"\u0c66"-"\u0c6f",
"\u0ce6"-"\u0cef",
"\u0d66"-"\u0d6f",
"\u0e50"-"\u0e59",
"\u0ed0"-"\u0ed9",
"\u1040"-"\u1049"
]
>
}
// Catch-all rule: ~[] matches any single character, and matches in a SKIP
// section are discarded instead of being returned as tokens. Because this
// rule is declared after the TOKEN definitions and only ever matches one
// character, JavaCC's longest-match / first-declared rules select it only
// where no TOKEN definition matches.
SKIP : { // skip unrecognized chars
<NOISE: ~[] >
}
/**
 * Reads the next token from the stream.
 *
 * <p>The type of the returned token is the element of {@link
 * StandardTokenizerConstants#tokenImage} that corresponds to the kind of
 * token that was matched.
 *
 * @return the next token, or null once the end of the stream is reached
 */
org.apache.lucene.analysis.Token next() throws IOException :
{
  Token matched = null;
}
{
  ( matched = <ALPHANUM>
  | matched = <APOSTROPHE>
  | matched = <ACRONYM>
  | matched = <COMPANY>
  | matched = <EMAIL>
  | matched = <HOST>
  | matched = <NUM>
  | matched = <EOF>
  )
  {
    // End of stream: report it to the caller as null.
    if (matched.kind == EOF) {
      return null;
    }
    // Wrap the JavaCC token in a Lucene token built from the matched text,
    // the token's beginColumn/endColumn values and the image of its kind.
    // NOTE(review): presumably the character stream reports character offsets
    // through beginColumn/endColumn -- confirm against FastCharStream.
    return new org.apache.lucene.analysis.Token(
        matched.image,
        matched.beginColumn, matched.endColumn,
        tokenImage[matched.kind]);
  }
}