blob: b9a18ce74fa7420e9a4202c2ebebfd0251bf3831 [file] [log] [blame]
Index: E:/projects/lucene/trunk/common-build.xml
===================================================================
--- E:/projects/lucene/trunk/common-build.xml (revision 561292)
+++ E:/projects/lucene/trunk/common-build.xml (working copy)
@@ -7,16 +7,16 @@
The ASF licenses this file to You under the Apache License, Version 2.0
the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
-
+
http://www.apache.org/licenses/LICENSE-2.0
-
+
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
-
+
<project name="common" xmlns:artifact="antlib:org.apache.maven.artifact.ant">
<description>
This file is designed for importing into a main build file, and not intended
@@ -58,6 +58,7 @@
<property name="maven.dist.dir" location="dist/maven"/>
<property name="javacc.home" location="${common.dir}"/>
+ <property name="jflex.home" location="${common.dir}"/>
<property name="junit.output.dir" location="${build.dir}/test"/>
<property name="junit.reports" location="${build.dir}/test/reports"/>
@@ -71,11 +72,11 @@
<include name="FIND_NOTHING"/>
<exclude name="**/*"/>
</patternset>
-
+
<condition property="build-1-5-contrib">
<equals arg1="1.5" arg2="${ant.java.version}" />
</condition>
-
+
<property name="clover.db.dir" location="${build.dir}/test/clover/db"/>
<property name="clover.report.dir" location="${build.dir}/test/clover/reports"/>
@@ -98,6 +99,12 @@
/>
<available
+ property="jflex.present"
+ classname="JFlex.anttask.JFlexTask"
+ classpath="${jflex.home}/lib/JFlex.jar"
+ />
+
+ <available
property="junit.present"
classname="junit.framework.TestCase"
/>
@@ -106,7 +113,7 @@
property="maven.ant.tasks.present"
classname="org.apache.maven.artifact.ant.Pom"
/>
-
+
<target name="clean"
description="Removes contents of build and dist directories">
<delete dir="${build.dir}"/>
@@ -129,11 +136,25 @@
One or more of the JavaCC .jj files is newer than its corresponding
.java file. Run the "javacc" target to regenerate the artifacts.
</echo>
+ </target>
+
+ <target name="jflex-uptodate-check">
+ <uptodate property="jflex.files.uptodate">
+ <srcfiles dir="src" includes="**/*.jflex" />
+ <mapper type="glob" from="*.jflex" to="*.java"/>
+ </uptodate>
</target>
- <target name="init" depends="javacc-uptodate-check, javacc-notice">
+ <target name="jflex-notice" unless="jflex.files.uptodate">
+ <echo>
+ One or more of the JFlex .jflex files is newer than its corresponding
+ .java file. Run the "jflex" target to regenerate the artifacts.
+ </echo>
</target>
+ <target name="init" depends="javacc-uptodate-check, javacc-notice, jflex-uptodate-check, jflex-notice">
+ </target>
+
<target name="javacc-check">
<fail unless="javacc.present">
##################################################################
@@ -161,7 +182,30 @@
</fail>
</target>
-
+
+ <target name="jflex-check">
+ <fail unless="jflex.present">
+ ##################################################################
+ JFlex not found.
+ JFlex Home: ${jflex.home}
+
+ Please download and install JFlex from:
+
+ &lt;http://jflex.de/download.html&gt;
+
+ Then, create a build.properties file either in your home
+ directory, or within the Lucene directory and set the jflex.home
+ property to the path where JFlex is installed. For example,
+ if you installed JFlex in /usr/local/java/jflex-1.4.1, then set the
+ jflex.home property to:
+
+ jflex.home=/usr/local/java/jflex-1.4.1
+
+ ##################################################################
+ </fail>
+
+ </target>
+
<target name="compile-core" depends="init, clover"
description="Compiles core classes">
<compile
@@ -179,12 +223,12 @@
description="Packages the JAR file">
<jarify />
</target>
-
+
<target name="maven.ant.tasks-check">
<fail unless="maven.ant.tasks.present">
##################################################################
Maven ant tasks not found.
- Please make sure the maven-ant-tasks jar is in ANT_HOME/lib, or made
+ Please make sure the maven-ant-tasks jar is in ANT_HOME/lib, or made
available to Ant using other mechanisms like -lib or CLASSPATH.
##################################################################
</fail>
@@ -196,7 +240,7 @@
<attribute name="pom.xml" default="${pom.xml}"/>
<sequential>
<copy file="@{pom.xml}" tofile="${build.dir}/@{pom.xml}">
- <filterset begintoken="@" endtoken="@">
+ <filterset begintoken="@" endtoken="@">
<filter token="version" value="${version}"/>
</filterset>
</copy>
@@ -206,9 +250,9 @@
<pom refid="maven.project"/>
</artifact:install>
</sequential>
- </macrodef>
-
-
+ </macrodef>
+
+
<macrodef name="jarify" description="Builds a JAR file">
<attribute name="title" default="Lucene Search Engine: ${ant.project.name}" />
<element name="manifest-attributes" optional="yes"/>
@@ -218,7 +262,7 @@
outputproperty="svnversion" failifexecutionfails="false">
<arg line="."/>
</exec>
-
+
<jar
destfile="${build.dir}/${final.name}.jar"
basedir="${build.dir}/classes/java">
@@ -232,14 +276,14 @@
-->
<!-- Don't set 'Manifest-Version' it identifies the version of the
manifest file format, and should allways be 1.0 (the default)
-
- Don't set 'Created-by' attribute, it's purpose is
+
+ Don't set 'Created-by' attribute, its purpose is
to identify the version of java used to build the jar,
which ant will do by default.
-
+
Ant will happily override these with bogus strings if you
tell it to, so don't.
-
+
NOTE: we don't use section info because all of our manifest data
applies to the entire jar/war ... no package specific info.
-->
@@ -254,9 +298,9 @@
value="${version} ${svnversion} - ${user.name} - ${DSTAMP} ${TSTAMP}"/>
<attribute name="Implementation-Vendor"
value="The Apache Software Foundation"/>
- <attribute name="X-Compile-Source-JDK"
+ <attribute name="X-Compile-Source-JDK"
value="${javac.source}"/>
- <attribute name="X-Compile-Target-JDK"
+ <attribute name="X-Compile-Target-JDK"
value="${javac.target}"/>
<manifest-attributes/>
</manifest>
@@ -315,7 +359,7 @@
anywhere.
-->
<sysproperty key="lucene.common.dir" file="${common.dir}" />
-
+
<!-- contrib/ant IndexTaskTest needs these two system properties -->
<sysproperty key="docs.dir" file="src/test"/>
<sysproperty key="index.dir" file="${build.dir}/test/index"/>
@@ -339,7 +383,7 @@
<fail if="tests.failed">Tests failed!</fail>
<!-- life would be easier if echo had an 'if' attribute like fail -->
<delete file="${build.dir}/test/junitfailed.flag" />
-
+
</target>
<!--
Index: E:/projects/lucene/trunk/src/test/org/apache/lucene/analysis/TestStandardAnalyzer.java
===================================================================
--- E:/projects/lucene/trunk/src/test/org/apache/lucene/analysis/TestStandardAnalyzer.java (revision 560135)
+++ E:/projects/lucene/trunk/src/test/org/apache/lucene/analysis/TestStandardAnalyzer.java (working copy)
@@ -23,89 +23,161 @@
public class TestStandardAnalyzer extends TestCase {
- public void assertAnalyzesTo(Analyzer a, String input, String[] expected) throws Exception {
- TokenStream ts = a.tokenStream("dummy", new StringReader(input));
- for (int i = 0; i < expected.length; i++) {
- Token t = ts.next();
- assertNotNull(t);
- assertEquals(expected[i], t.termText());
+ private Analyzer a = new StandardAnalyzer();
+
+ public void assertAnalyzesTo(Analyzer a, String input, String[] expected) throws Exception {
+ assertAnalyzesTo(a, input, expected, null);
}
- assertNull(ts.next());
- ts.close();
- }
+ public void assertAnalyzesTo(Analyzer a, String input, String[] expectedImages, String[] expectedTypes) throws Exception {
+ TokenStream ts = a.tokenStream("dummy", new StringReader(input));
+ for (int i = 0; i < expectedImages.length; i++) {
+ Token t = ts.next();
+ assertNotNull(t);
+ assertEquals(expectedImages[i], t.termText());
+ if (expectedTypes != null)
+ {
+ assertEquals(expectedTypes[i], t.type());
+ }
+ }
+ assertNull(ts.next());
+ ts.close();
+ }
- public void testStandard() throws Exception {
- Analyzer a = new StandardAnalyzer();
- // alphanumeric tokens
- assertAnalyzesTo(a, "B2B", new String[]{"b2b"});
- assertAnalyzesTo(a, "2B", new String[]{"2b"});
+ public void testAlphanumeric() throws Exception {
+ // alphanumeric tokens
+ assertAnalyzesTo(a, "B2B", new String[]{"b2b"});
+ assertAnalyzesTo(a, "2B", new String[]{"2b"});
+ }
- // underscores are delimiters, but not in email addresses (below)
- assertAnalyzesTo(a, "word_having_underscore", new String[]{"word", "having", "underscore"});
- assertAnalyzesTo(a, "word_with_underscore_and_stopwords", new String[]{"word", "underscore", "stopwords"});
+ public void testUnderscores() throws Exception {
+ // underscores are delimiters, but not in email addresses (below)
+ assertAnalyzesTo(a, "word_having_underscore", new String[]{"word", "having", "underscore"});
+ assertAnalyzesTo(a, "word_with_underscore_and_stopwords", new String[]{"word", "underscore", "stopwords"});
+ }
- // other delimiters: "-", "/", ","
- assertAnalyzesTo(a, "some-dashed-phrase", new String[]{"some", "dashed", "phrase" });
- assertAnalyzesTo(a, "dogs,chase,cats", new String[]{"dogs", "chase", "cats"});
- assertAnalyzesTo(a, "ac/dc", new String[]{"ac", "dc"});
+ public void testDelimiters() throws Exception {
+ // other delimiters: "-", "/", ","
+ assertAnalyzesTo(a, "some-dashed-phrase", new String[]{"some", "dashed", "phrase" });
+ assertAnalyzesTo(a, "dogs,chase,cats", new String[]{"dogs", "chase", "cats"});
+ assertAnalyzesTo(a, "ac/dc", new String[]{"ac", "dc"});
+ }
- // internal apostrophes: O'Reilly, you're, O'Reilly's
- // possessives are actually removed by StardardFilter, not the tokenizer
- assertAnalyzesTo(a, "O'Reilly", new String[]{"o'reilly"});
- assertAnalyzesTo(a, "you're", new String[]{"you're"});
- assertAnalyzesTo(a, "she's", new String[]{"she"});
- assertAnalyzesTo(a, "Jim's", new String[]{"jim"});
- assertAnalyzesTo(a, "don't", new String[]{"don't"});
- assertAnalyzesTo(a, "O'Reilly's", new String[]{"o'reilly"});
+ public void testApostrophes() throws Exception {
+ // internal apostrophes: O'Reilly, you're, O'Reilly's
+ // possessives are actually removed by StandardFilter, not the tokenizer
+ assertAnalyzesTo(a, "O'Reilly", new String[]{"o'reilly"});
+ assertAnalyzesTo(a, "you're", new String[]{"you're"});
+ assertAnalyzesTo(a, "she's", new String[]{"she"});
+ assertAnalyzesTo(a, "Jim's", new String[]{"jim"});
+ assertAnalyzesTo(a, "don't", new String[]{"don't"});
+ assertAnalyzesTo(a, "O'Reilly's", new String[]{"o'reilly"});
+ }
- // t and s had been stopwords in Lucene <= 2.0, which made it impossible
- // to correctly search for these terms:
- assertAnalyzesTo(a, "s-class", new String[]{"s", "class"});
- assertAnalyzesTo(a, "t-com", new String[]{"t", "com"});
- // 'a' is still a stopword:
- assertAnalyzesTo(a, "a-class", new String[]{"class"});
+ public void testTSADash() throws Exception {
+ // t and s had been stopwords in Lucene <= 2.0, which made it impossible
+ // to correctly search for these terms:
+ assertAnalyzesTo(a, "s-class", new String[]{"s", "class"});
+ assertAnalyzesTo(a, "t-com", new String[]{"t", "com"});
+ // 'a' is still a stopword:
+ assertAnalyzesTo(a, "a-class", new String[]{"class"});
+ }
- // company names
- assertAnalyzesTo(a, "AT&T", new String[]{"at&t"});
- assertAnalyzesTo(a, "Excite@Home", new String[]{"excite@home"});
+ public void testCompanyNames() throws Exception {
+ // company names
+ assertAnalyzesTo(a, "AT&T", new String[]{"at&t"});
+ assertAnalyzesTo(a, "Excite@Home", new String[]{"excite@home"});
+ }
- // domain names
- assertAnalyzesTo(a, "www.nutch.org", new String[]{"www.nutch.org" });
+ public void testDomainNames() throws Exception {
+ // domain names
+ assertAnalyzesTo(a, "www.nutch.org", new String[]{"www.nutch.org" });
+ }
- // email addresses, possibly with underscores, periods, etc
- assertAnalyzesTo(a, "test@example.com", new String[]{"test@example.com"});
- assertAnalyzesTo(a, "first.lastname@example.com", new String[]{"first.lastname@example.com"});
- assertAnalyzesTo(a, "first_lastname@example.com", new String[]{"first_lastname@example.com"});
+ public void testEMailAddresses() throws Exception {
+ // email addresses, possibly with underscores, periods, etc
+ assertAnalyzesTo(a, "test@example.com", new String[]{"test@example.com"});
+ assertAnalyzesTo(a, "first.lastname@example.com", new String[]{"first.lastname@example.com"});
+ assertAnalyzesTo(a, "first_lastname@example.com", new String[]{"first_lastname@example.com"});
+ }
- // floating point, serial, model numbers, ip addresses, etc.
- // every other segment must have at least one digit
- assertAnalyzesTo(a, "21.35", new String[]{"21.35"});
- assertAnalyzesTo(a, "R2D2 C3PO", new String[]{"r2d2", "c3po"});
- assertAnalyzesTo(a, "216.239.63.104", new String[]{"216.239.63.104"});
- assertAnalyzesTo(a, "1-2-3", new String[]{"1-2-3"});
- assertAnalyzesTo(a, "a1-b2-c3", new String[]{"a1-b2-c3"});
- assertAnalyzesTo(a, "a1-b-c3", new String[]{"a1-b-c3"});
+ public void testNumeric() throws Exception {
+ // floating point, serial, model numbers, ip addresses, etc.
+ // every other segment must have at least one digit
+ assertAnalyzesTo(a, "21.35", new String[]{"21.35"});
+ assertAnalyzesTo(a, "R2D2 C3PO", new String[]{"r2d2", "c3po"});
+ assertAnalyzesTo(a, "216.239.63.104", new String[]{"216.239.63.104"});
+ assertAnalyzesTo(a, "1-2-3", new String[]{"1-2-3"});
+ assertAnalyzesTo(a, "a1-b2-c3", new String[]{"a1-b2-c3"});
+ assertAnalyzesTo(a, "a1-b-c3", new String[]{"a1-b-c3"});
+ }
- // numbers
- assertAnalyzesTo(a, "David has 5000 bones", new String[]{"david", "has", "5000", "bones"});
+ public void testTextWithNumbers() throws Exception {
+ // numbers
+ assertAnalyzesTo(a, "David has 5000 bones", new String[]{"david", "has", "5000", "bones"});
+ }
- // various
- assertAnalyzesTo(a, "C embedded developers wanted", new String[]{"c", "embedded", "developers", "wanted" });
- assertAnalyzesTo(a, "foo bar FOO BAR", new String[]{"foo", "bar", "foo", "bar"});
- assertAnalyzesTo(a, "foo bar . FOO <> BAR", new String[]{"foo", "bar", "foo", "bar"});
- assertAnalyzesTo(a, "\"QUOTED\" word", new String[]{"quoted", "word"});
+ public void testVariousText() throws Exception {
+ // various
+ assertAnalyzesTo(a, "C embedded developers wanted", new String[]{"c", "embedded", "developers", "wanted" });
+ assertAnalyzesTo(a, "foo bar FOO BAR", new String[]{"foo", "bar", "foo", "bar"});
+ assertAnalyzesTo(a, "foo bar . FOO <> BAR", new String[]{"foo", "bar", "foo", "bar"});
+ assertAnalyzesTo(a, "\"QUOTED\" word", new String[]{"quoted", "word"});
+ }
- // acronyms have their dots stripped
- assertAnalyzesTo(a, "U.S.A.", new String[]{ "usa" });
+ public void testAcronyms() throws Exception {
+ // acronyms have their dots stripped
+ assertAnalyzesTo(a, "U.S.A.", new String[]{ "usa" });
+ }
- // It would be nice to change the grammar in StandardTokenizer.jj to make "C#" and "C++" end up as tokens.
- assertAnalyzesTo(a, "C++", new String[]{"c"});
- assertAnalyzesTo(a, "C#", new String[]{"c"});
+ public void testCPlusPlusHash() throws Exception {
+ // It would be nice to change the grammar in StandardTokenizer.jj to make "C#" and "C++" end up as tokens.
+ assertAnalyzesTo(a, "C++", new String[]{"c"});
+ assertAnalyzesTo(a, "C#", new String[]{"c"});
+ }
- // Korean words
- assertAnalyzesTo(a, "안녕하세요 한글입니다", new String[]{"안녕하세요", "한글입니다"});
+ public void testKorean() throws Exception {
+ // Korean words
+ assertAnalyzesTo(a, "안녕하세요 한글입니다", new String[]{"안녕하세요", "한글입니다"});
+ }
- }
+ // Compliance with the "old" JavaCC-based analyzer, see:
+ // https://issues.apache.org/jira/browse/LUCENE-966#action_12516752
+
+ public void testComplianceFileName() throws Exception {
+ assertAnalyzesTo(a, "2004.jpg",
+ new String[] { "2004.jpg" },
+ new String[] { "<HOST>" });
+ }
+
+ public void testComplianceNumericIncorrect() throws Exception {
+ // The original analyzer produced a <HOST> token type
+ // for this (even though it looks more like a <NUM>)
+ assertAnalyzesTo(a, "62.46,37004,37009,type",
+ new String[] { "62.46,37004,37009,type" },
+ new String[] { "<NUM>" });
+ }
+
+ public void testComplianceNumericLong() throws Exception {
+ assertAnalyzesTo(a, "978-0-94045043-1,86408,86424,type",
+ new String[] { "978-0-94045043-1,86408,86424,type" },
+ new String[] { "<NUM>" });
+ }
+
+ public void testComplianceNumericFile() throws Exception {
+ assertAnalyzesTo(
+ a,
+ "78academyawards/rules/rule02.html,7194,7227,type",
+ new String[] { "78academyawards/rules/rule02.html,7194,7227,type" },
+ new String[] { "<NUM>" });
+ }
+
+ public void testComplianceNumericWithUnderscores() throws Exception {
+ assertAnalyzesTo(
+ a,
+ "2006-03-11t082958z_01_ban130523_rtridst_0_ozabs,2076,2123,type",
+ new String[] { "2006-03-11t082958z_01_ban130523_rtridst_0_ozabs,2076,2123,type" },
+ new String[] { "<NUM>" });
+ }
}
Property changes on: E:\projects\lucene\trunk\src\java\org\apache\lucene\analysis\standard
___________________________________________________________________
Name: svn:ignore
- Token.java
StandardTokenizer.java
StandardTokenizerTokenManager.java
TokenMgrError.java
CharStream.java
StandardTokenizerConstants.java
+ Token.java
StandardTokenizer.java
StandardTokenizerTokenManager.java
TokenMgrError.java
CharStream.java
StandardTokenizerConstants.java
Index: E:/projects/lucene/trunk/src/java/org/apache/lucene/analysis/standard/Token.java
===================================================================
--- E:/projects/lucene/trunk/src/java/org/apache/lucene/analysis/standard/Token.java (revision 560135)
+++ E:/projects/lucene/trunk/src/java/org/apache/lucene/analysis/standard/Token.java (working copy)
@@ -1,81 +0,0 @@
-/* Generated By:JavaCC: Do not edit this line. Token.java Version 3.0 */
-package org.apache.lucene.analysis.standard;
-
-/**
- * Describes the input token stream.
- */
-
-public class Token {
-
- /**
- * An integer that describes the kind of this token. This numbering
- * system is determined by JavaCCParser, and a table of these numbers is
- * stored in the file ...Constants.java.
- */
- public int kind;
-
- /**
- * beginLine and beginColumn describe the position of the first character
- * of this token; endLine and endColumn describe the position of the
- * last character of this token.
- */
- public int beginLine, beginColumn, endLine, endColumn;
-
- /**
- * The string image of the token.
- */
- public String image;
-
- /**
- * A reference to the next regular (non-special) token from the input
- * stream. If this is the last token from the input stream, or if the
- * token manager has not read tokens beyond this one, this field is
- * set to null. This is true only if this token is also a regular
- * token. Otherwise, see below for a description of the contents of
- * this field.
- */
- public Token next;
-
- /**
- * This field is used to access special tokens that occur prior to this
- * token, but after the immediately preceding regular (non-special) token.
- * If there are no such special tokens, this field is set to null.
- * When there are more than one such special token, this field refers
- * to the last of these special tokens, which in turn refers to the next
- * previous special token through its specialToken field, and so on
- * until the first special token (whose specialToken field is null).
- * The next fields of special tokens refer to other special tokens that
- * immediately follow it (without an intervening regular token). If there
- * is no such token, this field is null.
- */
- public Token specialToken;
-
- /**
- * Returns the image.
- */
- public String toString()
- {
- return image;
- }
-
- /**
- * Returns a new Token object, by default. However, if you want, you
- * can create and return subclass objects based on the value of ofKind.
- * Simply add the cases to the switch for all those special cases.
- * For example, if you have a subclass of Token called IDToken that
- * you want to create if ofKind is ID, simlpy add something like :
- *
- * case MyParserConstants.ID : return new IDToken();
- *
- * to the following switch statement. Then you can cast matchedToken
- * variable to the appropriate type and use it in your lexical actions.
- */
- public static final Token newToken(int ofKind)
- {
- switch(ofKind)
- {
- default : return new Token();
- }
- }
-
-}
Index: E:/projects/lucene/trunk/src/java/org/apache/lucene/analysis/standard/TokenMgrError.java
===================================================================
--- E:/projects/lucene/trunk/src/java/org/apache/lucene/analysis/standard/TokenMgrError.java (revision 560135)
+++ E:/projects/lucene/trunk/src/java/org/apache/lucene/analysis/standard/TokenMgrError.java (working copy)
@@ -1,133 +0,0 @@
-/* Generated By:JavaCC: Do not edit this line. TokenMgrError.java Version 3.0 */
-package org.apache.lucene.analysis.standard;
-
-public class TokenMgrError extends Error
-{
- /*
- * Ordinals for various reasons why an Error of this type can be thrown.
- */
-
- /**
- * Lexical error occured.
- */
- static final int LEXICAL_ERROR = 0;
-
- /**
- * An attempt wass made to create a second instance of a static token manager.
- */
- static final int STATIC_LEXER_ERROR = 1;
-
- /**
- * Tried to change to an invalid lexical state.
- */
- static final int INVALID_LEXICAL_STATE = 2;
-
- /**
- * Detected (and bailed out of) an infinite loop in the token manager.
- */
- static final int LOOP_DETECTED = 3;
-
- /**
- * Indicates the reason why the exception is thrown. It will have
- * one of the above 4 values.
- */
- int errorCode;
-
- /**
- * Replaces unprintable characters by their espaced (or unicode escaped)
- * equivalents in the given string
- */
- protected static final String addEscapes(String str) {
- StringBuffer retval = new StringBuffer();
- char ch;
- for (int i = 0; i < str.length(); i++) {
- switch (str.charAt(i))
- {
- case 0 :
- continue;
- case '\b':
- retval.append("\\b");
- continue;
- case '\t':
- retval.append("\\t");
- continue;
- case '\n':
- retval.append("\\n");
- continue;
- case '\f':
- retval.append("\\f");
- continue;
- case '\r':
- retval.append("\\r");
- continue;
- case '\"':
- retval.append("\\\"");
- continue;
- case '\'':
- retval.append("\\\'");
- continue;
- case '\\':
- retval.append("\\\\");
- continue;
- default:
- if ((ch = str.charAt(i)) < 0x20 || ch > 0x7e) {
- String s = "0000" + Integer.toString(ch, 16);
- retval.append("\\u" + s.substring(s.length() - 4, s.length()));
- } else {
- retval.append(ch);
- }
- continue;
- }
- }
- return retval.toString();
- }
-
- /**
- * Returns a detailed message for the Error when it is thrown by the
- * token manager to indicate a lexical error.
- * Parameters :
- * EOFSeen : indicates if EOF caused the lexicl error
- * curLexState : lexical state in which this error occured
- * errorLine : line number when the error occured
- * errorColumn : column number when the error occured
- * errorAfter : prefix that was seen before this error occured
- * curchar : the offending character
- * Note: You can customize the lexical error message by modifying this method.
- */
- protected static String LexicalError(boolean EOFSeen, int lexState, int errorLine, int errorColumn, String errorAfter, char curChar) {
- return("Lexical error at line " +
- errorLine + ", column " +
- errorColumn + ". Encountered: " +
- (EOFSeen ? "<EOF> " : ("\"" + addEscapes(String.valueOf(curChar)) + "\"") + " (" + (int)curChar + "), ") +
- "after : \"" + addEscapes(errorAfter) + "\"");
- }
-
- /**
- * You can also modify the body of this method to customize your error messages.
- * For example, cases like LOOP_DETECTED and INVALID_LEXICAL_STATE are not
- * of end-users concern, so you can return something like :
- *
- * "Internal Error : Please file a bug report .... "
- *
- * from this method for such cases in the release version of your parser.
- */
- public String getMessage() {
- return super.getMessage();
- }
-
- /*
- * Constructors of various flavors follow.
- */
-
- public TokenMgrError() {
- }
-
- public TokenMgrError(String message, int reason) {
- super(message);
- errorCode = reason;
- }
-
- public TokenMgrError(boolean EOFSeen, int lexState, int errorLine, int errorColumn, String errorAfter, char curChar, int reason) {
- this(LexicalError(EOFSeen, lexState, errorLine, errorColumn, errorAfter, curChar), reason);
- }
-}
Index: E:/projects/lucene/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex
===================================================================
--- E:/projects/lucene/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex (revision 0)
+++ E:/projects/lucene/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex (revision 0)
@@ -0,0 +1,107 @@
+package org.apache.lucene.analysis.standard;
+%%
+
+%class StandardTokenizerImpl
+%unicode
+%integer
+%function getNextToken
+%pack
+%char
+
+%{
+
+public static final int ALPHANUM = 0;
+public static final int APOSTROPHE = 1;
+public static final int ACRONYM = 2;
+public static final int COMPANY = 3;
+public static final int EMAIL = 4;
+public static final int HOST = 5;
+public static final int NUM = 6;
+public static final int CJ = 7;
+
+public static final String [] TOKEN_TYPES = new String [] {
+ "<ALPHANUM>",
+ "<APOSTROPHE>",
+ "<ACRONYM>",
+ "<COMPANY>",
+ "<EMAIL>",
+ "<HOST>",
+ "<NUM>",
+ "<CJ>"
+};
+
+public final int yychar()
+{
+ return yychar;
+}
+%}
+
+// basic word: a sequence of digits & letters
+ALPHANUM = ({LETTER}|{DIGIT}|{KOREAN})+
+
+// internal apostrophes: O'Reilly, you're, O'Reilly's
+// use a post-filter to remove possessives
+APOSTROPHE = {ALPHA} ("'" {ALPHA})+
+
+// acronyms: U.S.A., I.B.M., etc.
+// use a post-filter to remove dots
+ACRONYM = {ALPHA} "." ({ALPHA} ".")+
+
+// company names like AT&T and Excite@Home.
+COMPANY = {ALPHA} ("&"|"@") {ALPHA}
+
+// email addresses
+EMAIL = {ALPHANUM} (("."|"-"|"_") {ALPHANUM})* "@" {ALPHANUM} (("."|"-") {ALPHANUM})+
+
+// hostname
+HOST = {ALPHANUM} ("." {ALPHANUM})+
+
+// floating point, serial, model numbers, ip addresses, etc.
+// every other segment must have at least one digit
+/*
+NUM = ({ALPHANUM} {P} {HAS_DIGIT}
+ | {HAS_DIGIT} {P} {ALPHANUM}
+ | {ALPHANUM} ({P} {HAS_DIGIT} {P} {ALPHANUM})+
+ | {HAS_DIGIT} ({P} {ALPHANUM} {P} {HAS_DIGIT})+
+ | {ALPHANUM} {P} {HAS_DIGIT} ({P} {ALPHANUM} {P} {HAS_DIGIT})+
+ | {HAS_DIGIT} {P} {ALPHANUM} ({P} {HAS_DIGIT} {P} {ALPHANUM})+)
+*/
+
+NUM = ({P} ({HAS_DIGIT} | {ALPHANUM}))* {HAS_DIGIT} ({P} ({HAS_DIGIT} | {ALPHANUM}))*
+
+// punctuation
+P = ("_"|"-"|"/"|"."|",")
+
+// at least one digit
+HAS_DIGIT =
+ ({LETTER}|{DIGIT})*
+ {DIGIT}
+ ({LETTER}|{DIGIT})*
+
+ALPHA = ({LETTER})+
+
+
+LETTER = [\u0041-\u005a\u0061-\u007a\u00c0-\u00d6\u00d8-\u00f6\u00f8-\u00ff\u0100-\u1fff\uffa0-\uffdc]
+
+DIGIT = [\u0030-\u0039\u0660-\u0669\u06f0-\u06f9\u0966-\u096f\u09e6-\u09ef\u0a66-\u0a6f\u0ae6-\u0aef\u0b66-\u0b6f\u0be7-\u0bef\u0c66-\u0c6f\u0ce6-\u0cef\u0d66-\u0d6f\u0e50-\u0e59\u0ed0-\u0ed9\u1040-\u1049]
+
+KOREAN = [\uac00-\ud7af\u1100-\u11ff]
+
+// Chinese, Japanese
+CJ = [\u3040-\u318f\u3100-\u312f\u3040-\u309F\u30A0-\u30FF\u31F0-\u31FF\u3300-\u337f\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\uff65-\uff9f]
+
+WHITESPACE = \r\n | [ \r\n\t\f]
+
+%%
+
+{ALPHANUM} { return ALPHANUM; }
+{HOST} { return HOST; }
+{NUM} { return NUM; }
+{APOSTROPHE} { return APOSTROPHE; }
+{ACRONYM} { return ACRONYM; }
+{COMPANY} { return COMPANY; }
+{EMAIL} { return EMAIL; }
+{CJ} { return CJ; }
+
+/** Ignore the rest */
+. | {WHITESPACE} { /* ignore */ }
Index: E:/projects/lucene/trunk/src/java/org/apache/lucene/analysis/standard/StandardFilter.java
===================================================================
--- E:/projects/lucene/trunk/src/java/org/apache/lucene/analysis/standard/StandardFilter.java (revision 560135)
+++ E:/projects/lucene/trunk/src/java/org/apache/lucene/analysis/standard/StandardFilter.java (working copy)
@@ -17,12 +17,12 @@
* limitations under the License.
*/
-import org.apache.lucene.analysis.*;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
/** Normalizes tokens extracted with {@link StandardTokenizer}. */
-public final class StandardFilter extends TokenFilter
- implements StandardTokenizerConstants {
+public final class StandardFilter extends TokenFilter {
/** Construct filtering <i>in</i>. */
@@ -30,9 +30,9 @@
super(in);
}
- private static final String APOSTROPHE_TYPE = tokenImage[APOSTROPHE];
- private static final String ACRONYM_TYPE = tokenImage[ACRONYM];
-
+ private static final String APOSTROPHE_TYPE = StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.APOSTROPHE];
+ private static final String ACRONYM_TYPE = StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.ACRONYM];
+
/** Returns the next token in the stream, or null at EOS.
* <p>Removes <tt>'s</tt> from the end of words.
* <p>Removes dots from acronyms.
Index: E:/projects/lucene/trunk/src/java/org/apache/lucene/analysis/standard/FastCharStream.java
===================================================================
--- E:/projects/lucene/trunk/src/java/org/apache/lucene/analysis/standard/FastCharStream.java (revision 560135)
+++ E:/projects/lucene/trunk/src/java/org/apache/lucene/analysis/standard/FastCharStream.java (working copy)
@@ -1,122 +0,0 @@
-// FastCharStream.java
-package org.apache.lucene.analysis.standard;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.*;
-
-/** An efficient implementation of JavaCC's CharStream interface. <p>Note that
- * this does not do line-number counting, but instead keeps track of the
- * character position of the token in the input, as required by Lucene's {@link
- * org.apache.lucene.analysis.Token} API. */
-public final class FastCharStream implements CharStream {
- char[] buffer = null;
-
- int bufferLength = 0; // end of valid chars
- int bufferPosition = 0; // next char to read
-
- int tokenStart = 0; // offset in buffer
- int bufferStart = 0; // position in file of buffer
-
- Reader input; // source of chars
-
- /** Constructs from a Reader. */
- public FastCharStream(Reader r) {
- input = r;
- }
-
- public final char readChar() throws IOException {
- if (bufferPosition >= bufferLength)
- refill();
- return buffer[bufferPosition++];
- }
-
- private final void refill() throws IOException {
- int newPosition = bufferLength - tokenStart;
-
- if (tokenStart == 0) { // token won't fit in buffer
- if (buffer == null) { // first time: alloc buffer
- buffer = new char[2048];
- } else if (bufferLength == buffer.length) { // grow buffer
- char[] newBuffer = new char[buffer.length*2];
- System.arraycopy(buffer, 0, newBuffer, 0, bufferLength);
- buffer = newBuffer;
- }
- } else { // shift token to front
- System.arraycopy(buffer, tokenStart, buffer, 0, newPosition);
- }
-
- bufferLength = newPosition; // update state
- bufferPosition = newPosition;
- bufferStart += tokenStart;
- tokenStart = 0;
-
- int charsRead = // fill space in buffer
- input.read(buffer, newPosition, buffer.length-newPosition);
- if (charsRead == -1)
- throw new IOException("read past eof");
- else
- bufferLength += charsRead;
- }
-
- public final char BeginToken() throws IOException {
- tokenStart = bufferPosition;
- return readChar();
- }
-
- public final void backup(int amount) {
- bufferPosition -= amount;
- }
-
- public final String GetImage() {
- return new String(buffer, tokenStart, bufferPosition - tokenStart);
- }
-
- public final char[] GetSuffix(int len) {
- char[] value = new char[len];
- System.arraycopy(buffer, bufferPosition - len, value, 0, len);
- return value;
- }
-
- public final void Done() {
- try {
- input.close();
- } catch (IOException e) {
- System.err.println("Caught: " + e + "; ignoring.");
- }
- }
-
- public final int getColumn() {
- return bufferStart + bufferPosition;
- }
- public final int getLine() {
- return 1;
- }
- public final int getEndColumn() {
- return bufferStart + bufferPosition;
- }
- public final int getEndLine() {
- return 1;
- }
- public final int getBeginColumn() {
- return bufferStart + tokenStart;
- }
- public final int getBeginLine() {
- return 1;
- }
-}
Index: E:/projects/lucene/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizerTokenManager.java
===================================================================
--- E:/projects/lucene/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizerTokenManager.java (revision 560135)
+++ E:/projects/lucene/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizerTokenManager.java (working copy)
@@ -1,1233 +0,0 @@
-/* Generated By:JavaCC: Do not edit this line. StandardTokenizerTokenManager.java */
-package org.apache.lucene.analysis.standard;
-import java.io.*;
-
-public class StandardTokenizerTokenManager implements StandardTokenizerConstants
-{
- public java.io.PrintStream debugStream = System.out;
- public void setDebugStream(java.io.PrintStream ds) { debugStream = ds; }
-private final int jjMoveStringLiteralDfa0_0()
-{
- return jjMoveNfa_0(0, 0);
-}
-private final void jjCheckNAdd(int state)
-{
- if (jjrounds[state] != jjround)
- {
- jjstateSet[jjnewStateCnt++] = state;
- jjrounds[state] = jjround;
- }
-}
-private final void jjAddStates(int start, int end)
-{
- do {
- jjstateSet[jjnewStateCnt++] = jjnextStates[start];
- } while (start++ != end);
-}
-private final void jjCheckNAddTwoStates(int state1, int state2)
-{
- jjCheckNAdd(state1);
- jjCheckNAdd(state2);
-}
-private final void jjCheckNAddStates(int start, int end)
-{
- do {
- jjCheckNAdd(jjnextStates[start]);
- } while (start++ != end);
-}
-private final void jjCheckNAddStates(int start)
-{
- jjCheckNAdd(jjnextStates[start]);
- jjCheckNAdd(jjnextStates[start + 1]);
-}
-static final long[] jjbitVec0 = {
- 0xfff0000000000000L, 0xffffffffffffdfffL, 0xffffffffL, 0x600000000000000L
-};
-static final long[] jjbitVec2 = {
- 0x0L, 0xffffffffffffffffL, 0xffffffffffffffffL, 0xffffffffffffffffL
-};
-static final long[] jjbitVec3 = {
- 0xffffffffffffffffL, 0xffffffffffffffffL, 0xffffL, 0xffff000000000000L
-};
-static final long[] jjbitVec4 = {
- 0xffffffffffffffffL, 0xffffffffffffffffL, 0x0L, 0x0L
-};
-static final long[] jjbitVec5 = {
- 0xffffffffffffffffL, 0xffffffffffffffffL, 0xffffffffffffffffL, 0x0L
-};
-static final long[] jjbitVec6 = {
- 0x0L, 0xffffffe000000000L, 0xffffffffL, 0x0L
-};
-static final long[] jjbitVec7 = {
- 0x20000L, 0x0L, 0xfffff00000000000L, 0x7fffffL
-};
-static final long[] jjbitVec8 = {
- 0xffffffffffffffffL, 0xffffffffffffffffL, 0xffffffffffffL, 0x0L
-};
-static final long[] jjbitVec9 = {
- 0xfffffffeL, 0x0L, 0x0L, 0x0L
-};
-static final long[] jjbitVec10 = {
- 0x0L, 0x0L, 0x0L, 0xff7fffffff7fffffL
-};
-static final long[] jjbitVec11 = {
- 0x0L, 0x0L, 0xffffffff00000000L, 0x1fffffffL
-};
-static final long[] jjbitVec12 = {
- 0x1600L, 0x0L, 0x0L, 0x0L
-};
-static final long[] jjbitVec13 = {
- 0x0L, 0xffc000000000L, 0x0L, 0xffc000000000L
-};
-static final long[] jjbitVec14 = {
- 0x0L, 0x3ff00000000L, 0x0L, 0x3ff000000000000L
-};
-static final long[] jjbitVec15 = {
- 0x0L, 0xffc000000000L, 0x0L, 0xff8000000000L
-};
-static final long[] jjbitVec16 = {
- 0x0L, 0xffc000000000L, 0x0L, 0x0L
-};
-static final long[] jjbitVec17 = {
- 0x0L, 0x3ff0000L, 0x0L, 0x3ff0000L
-};
-static final long[] jjbitVec18 = {
- 0x0L, 0x3ffL, 0x0L, 0x0L
-};
-static final long[] jjbitVec19 = {
- 0xfffffffeL, 0x0L, 0xfffff00000000000L, 0x7fffffL
-};
-private final int jjMoveNfa_0(int startState, int curPos)
-{
- int[] nextStates;
- int startsAt = 0;
- jjnewStateCnt = 75;
- int i = 1;
- jjstateSet[0] = startState;
- int j, kind = 0x7fffffff;
- for (;;)
- {
- if (++jjround == 0x7fffffff)
- ReInitRounds();
- if (curChar < 64)
- {
- long l = 1L << curChar;
- MatchLoop: do
- {
- switch(jjstateSet[--i])
- {
- case 0:
- if ((0x3ff000000000000L & l) != 0L)
- {
- if (kind > 1)
- kind = 1;
- jjCheckNAddStates(0, 11);
- }
- if ((0x3ff000000000000L & l) != 0L)
- jjCheckNAddStates(12, 17);
- if ((0x3ff000000000000L & l) != 0L)
- jjCheckNAddStates(18, 23);
- break;
- case 2:
- if ((0x3ff000000000000L & l) != 0L)
- jjCheckNAddStates(18, 23);
- break;
- case 3:
- if ((0x3ff000000000000L & l) != 0L)
- jjCheckNAddTwoStates(3, 4);
- break;
- case 4:
- case 5:
- if ((0x3ff000000000000L & l) != 0L)
- jjCheckNAddTwoStates(5, 6);
- break;
- case 6:
- if ((0xf00000000000L & l) != 0L)
- jjCheckNAdd(7);
- break;
- case 7:
- if ((0x3ff000000000000L & l) == 0L)
- break;
- if (kind > 7)
- kind = 7;
- jjCheckNAdd(7);
- break;
- case 8:
- if ((0x3ff000000000000L & l) != 0L)
- jjCheckNAddTwoStates(8, 9);
- break;
- case 9:
- case 10:
- if ((0x3ff000000000000L & l) != 0L)
- jjCheckNAddTwoStates(10, 11);
- break;
- case 11:
- if ((0xf00000000000L & l) != 0L)
- jjCheckNAdd(12);
- break;
- case 12:
- if ((0x3ff000000000000L & l) != 0L)
- jjCheckNAddTwoStates(12, 13);
- break;
- case 13:
- if ((0xf00000000000L & l) != 0L)
- jjCheckNAddTwoStates(14, 15);
- break;
- case 14:
- if ((0x3ff000000000000L & l) != 0L)
- jjCheckNAddTwoStates(14, 15);
- break;
- case 15:
- case 16:
- if ((0x3ff000000000000L & l) == 0L)
- break;
- if (kind > 7)
- kind = 7;
- jjCheckNAddTwoStates(11, 16);
- break;
- case 17:
- if ((0x3ff000000000000L & l) != 0L)
- jjCheckNAddTwoStates(17, 18);
- break;
- case 18:
- case 19:
- if ((0x3ff000000000000L & l) != 0L)
- jjCheckNAddTwoStates(19, 20);
- break;
- case 20:
- if ((0xf00000000000L & l) != 0L)
- jjCheckNAdd(21);
- break;
- case 21:
- if ((0x3ff000000000000L & l) != 0L)
- jjCheckNAddTwoStates(21, 22);
- break;
- case 22:
- if ((0xf00000000000L & l) != 0L)
- jjCheckNAddTwoStates(23, 24);
- break;
- case 23:
- if ((0x3ff000000000000L & l) != 0L)
- jjCheckNAddTwoStates(23, 24);
- break;
- case 24:
- case 25:
- if ((0x3ff000000000000L & l) != 0L)
- jjCheckNAddTwoStates(25, 26);
- break;
- case 26:
- if ((0xf00000000000L & l) != 0L)
- jjCheckNAdd(27);
- break;
- case 27:
- if ((0x3ff000000000000L & l) == 0L)
- break;
- if (kind > 7)
- kind = 7;
- jjCheckNAddTwoStates(22, 27);
- break;
- case 28:
- if ((0x3ff000000000000L & l) != 0L)
- jjCheckNAddStates(12, 17);
- break;
- case 29:
- if ((0x3ff000000000000L & l) == 0L)
- break;
- if (kind > 1)
- kind = 1;
- jjCheckNAddStates(0, 11);
- break;
- case 30:
- if ((0x3ff000000000000L & l) == 0L)
- break;
- if (kind > 1)
- kind = 1;
- jjCheckNAdd(30);
- break;
- case 31:
- if ((0x3ff000000000000L & l) != 0L)
- jjCheckNAddStates(24, 26);
- break;
- case 32:
- if ((0x600000000000L & l) != 0L)
- jjCheckNAdd(33);
- break;
- case 33:
- if ((0x3ff000000000000L & l) != 0L)
- jjCheckNAddStates(27, 29);
- break;
- case 35:
- if ((0x3ff000000000000L & l) != 0L)
- jjCheckNAddTwoStates(35, 36);
- break;
- case 36:
- if ((0x600000000000L & l) != 0L)
- jjCheckNAdd(37);
- break;
- case 37:
- if ((0x3ff000000000000L & l) == 0L)
- break;
- if (kind > 5)
- kind = 5;
- jjCheckNAddTwoStates(36, 37);
- break;
- case 38:
- if ((0x3ff000000000000L & l) != 0L)
- jjCheckNAddTwoStates(38, 39);
- break;
- case 39:
- if (curChar == 46)
- jjCheckNAdd(40);
- break;
- case 40:
- if ((0x3ff000000000000L & l) == 0L)
- break;
- if (kind > 6)
- kind = 6;
- jjCheckNAddTwoStates(39, 40);
- break;
- case 41:
- if ((0x3ff000000000000L & l) != 0L)
- jjCheckNAddTwoStates(41, 42);
- break;
- case 42:
- if ((0xf00000000000L & l) != 0L)
- jjCheckNAddTwoStates(43, 44);
- break;
- case 43:
- if ((0x3ff000000000000L & l) != 0L)
- jjCheckNAddTwoStates(43, 44);
- break;
- case 44:
- case 45:
- if ((0x3ff000000000000L & l) == 0L)
- break;
- if (kind > 7)
- kind = 7;
- jjCheckNAdd(45);
- break;
- case 46:
- if ((0x3ff000000000000L & l) != 0L)
- jjCheckNAddTwoStates(46, 47);
- break;
- case 47:
- if ((0xf00000000000L & l) != 0L)
- jjCheckNAddTwoStates(48, 49);
- break;
- case 48:
- if ((0x3ff000000000000L & l) != 0L)
- jjCheckNAddTwoStates(48, 49);
- break;
- case 49:
- case 50:
- if ((0x3ff000000000000L & l) != 0L)
- jjCheckNAddTwoStates(50, 51);
- break;
- case 51:
- if ((0xf00000000000L & l) != 0L)
- jjCheckNAdd(52);
- break;
- case 52:
- if ((0x3ff000000000000L & l) == 0L)
- break;
- if (kind > 7)
- kind = 7;
- jjCheckNAddTwoStates(47, 52);
- break;
- case 53:
- if ((0x3ff000000000000L & l) != 0L)
- jjCheckNAddTwoStates(53, 54);
- break;
- case 54:
- if ((0xf00000000000L & l) != 0L)
- jjCheckNAddTwoStates(55, 56);
- break;
- case 55:
- if ((0x3ff000000000000L & l) != 0L)
- jjCheckNAddTwoStates(55, 56);
- break;
- case 56:
- case 57:
- if ((0x3ff000000000000L & l) != 0L)
- jjCheckNAddTwoStates(57, 58);
- break;
- case 58:
- if ((0xf00000000000L & l) != 0L)
- jjCheckNAdd(59);
- break;
- case 59:
- if ((0x3ff000000000000L & l) != 0L)
- jjCheckNAddTwoStates(59, 60);
- break;
- case 60:
- if ((0xf00000000000L & l) != 0L)
- jjCheckNAddTwoStates(61, 62);
- break;
- case 61:
- if ((0x3ff000000000000L & l) != 0L)
- jjCheckNAddTwoStates(61, 62);
- break;
- case 62:
- case 63:
- if ((0x3ff000000000000L & l) == 0L)
- break;
- if (kind > 7)
- kind = 7;
- jjCheckNAddTwoStates(58, 63);
- break;
- case 66:
- if (curChar == 39)
- jjstateSet[jjnewStateCnt++] = 67;
- break;
- case 69:
- if (curChar == 46)
- jjCheckNAdd(70);
- break;
- case 71:
- if (curChar != 46)
- break;
- if (kind > 3)
- kind = 3;
- jjCheckNAdd(70);
- break;
- case 73:
- if (curChar == 38)
- jjstateSet[jjnewStateCnt++] = 74;
- break;
- default : break;
- }
- } while(i != startsAt);
- }
- else if (curChar < 128)
- {
- long l = 1L << (curChar & 077);
- MatchLoop: do
- {
- switch(jjstateSet[--i])
- {
- case 0:
- if ((0x7fffffe07fffffeL & l) != 0L)
- jjCheckNAddStates(30, 35);
- if ((0x7fffffe07fffffeL & l) != 0L)
- {
- if (kind > 1)
- kind = 1;
- jjCheckNAddStates(0, 11);
- }
- if ((0x7fffffe07fffffeL & l) != 0L)
- jjCheckNAddStates(18, 23);
- break;
- case 2:
- if ((0x7fffffe07fffffeL & l) != 0L)
- jjCheckNAddStates(18, 23);
- break;
- case 3:
- if ((0x7fffffe07fffffeL & l) != 0L)
- jjCheckNAddTwoStates(3, 4);
- break;
- case 5:
- if ((0x7fffffe07fffffeL & l) != 0L)
- jjAddStates(36, 37);
- break;
- case 6:
- if (curChar == 95)
- jjCheckNAdd(7);
- break;
- case 7:
- if ((0x7fffffe07fffffeL & l) == 0L)
- break;
- if (kind > 7)
- kind = 7;
- jjCheckNAdd(7);
- break;
- case 8:
- if ((0x7fffffe07fffffeL & l) != 0L)
- jjCheckNAddTwoStates(8, 9);
- break;
- case 10:
- if ((0x7fffffe07fffffeL & l) != 0L)
- jjCheckNAddTwoStates(10, 11);
- break;
- case 11:
- if (curChar == 95)
- jjCheckNAdd(12);
- break;
- case 12:
- if ((0x7fffffe07fffffeL & l) != 0L)
- jjCheckNAddTwoStates(12, 13);
- break;
- case 13:
- if (curChar == 95)
- jjCheckNAddTwoStates(14, 15);
- break;
- case 14:
- if ((0x7fffffe07fffffeL & l) != 0L)
- jjCheckNAddTwoStates(14, 15);
- break;
- case 16:
- if ((0x7fffffe07fffffeL & l) == 0L)
- break;
- if (kind > 7)
- kind = 7;
- jjCheckNAddTwoStates(11, 16);
- break;
- case 17:
- if ((0x7fffffe07fffffeL & l) != 0L)
- jjCheckNAddTwoStates(17, 18);
- break;
- case 19:
- if ((0x7fffffe07fffffeL & l) != 0L)
- jjAddStates(38, 39);
- break;
- case 20:
- if (curChar == 95)
- jjCheckNAdd(21);
- break;
- case 21:
- if ((0x7fffffe07fffffeL & l) != 0L)
- jjCheckNAddTwoStates(21, 22);
- break;
- case 22:
- if (curChar == 95)
- jjCheckNAddTwoStates(23, 24);
- break;
- case 23:
- if ((0x7fffffe07fffffeL & l) != 0L)
- jjCheckNAddTwoStates(23, 24);
- break;
- case 25:
- if ((0x7fffffe07fffffeL & l) != 0L)
- jjAddStates(40, 41);
- break;
- case 26:
- if (curChar == 95)
- jjCheckNAdd(27);
- break;
- case 27:
- if ((0x7fffffe07fffffeL & l) == 0L)
- break;
- if (kind > 7)
- kind = 7;
- jjCheckNAddTwoStates(22, 27);
- break;
- case 29:
- if ((0x7fffffe07fffffeL & l) == 0L)
- break;
- if (kind > 1)
- kind = 1;
- jjCheckNAddStates(0, 11);
- break;
- case 30:
- if ((0x7fffffe07fffffeL & l) == 0L)
- break;
- if (kind > 1)
- kind = 1;
- jjCheckNAdd(30);
- break;
- case 31:
- if ((0x7fffffe07fffffeL & l) != 0L)
- jjCheckNAddStates(24, 26);
- break;
- case 32:
- if (curChar == 95)
- jjCheckNAdd(33);
- break;
- case 33:
- if ((0x7fffffe07fffffeL & l) != 0L)
- jjCheckNAddStates(27, 29);
- break;
- case 34:
- if (curChar == 64)
- jjCheckNAdd(35);
- break;
- case 35:
- if ((0x7fffffe07fffffeL & l) != 0L)
- jjCheckNAddTwoStates(35, 36);
- break;
- case 37:
- if ((0x7fffffe07fffffeL & l) == 0L)
- break;
- if (kind > 5)
- kind = 5;
- jjCheckNAddTwoStates(36, 37);
- break;
- case 38:
- if ((0x7fffffe07fffffeL & l) != 0L)
- jjCheckNAddTwoStates(38, 39);
- break;
- case 40:
- if ((0x7fffffe07fffffeL & l) == 0L)
- break;
- if (kind > 6)
- kind = 6;
- jjCheckNAddTwoStates(39, 40);
- break;
- case 41:
- if ((0x7fffffe07fffffeL & l) != 0L)
- jjCheckNAddTwoStates(41, 42);
- break;
- case 42:
- if (curChar == 95)
- jjCheckNAddTwoStates(43, 44);
- break;
- case 43:
- if ((0x7fffffe07fffffeL & l) != 0L)
- jjCheckNAddTwoStates(43, 44);
- break;
- case 45:
- if ((0x7fffffe07fffffeL & l) == 0L)
- break;
- if (kind > 7)
- kind = 7;
- jjstateSet[jjnewStateCnt++] = 45;
- break;
- case 46:
- if ((0x7fffffe07fffffeL & l) != 0L)
- jjCheckNAddTwoStates(46, 47);
- break;
- case 47:
- if (curChar == 95)
- jjCheckNAddTwoStates(48, 49);
- break;
- case 48:
- if ((0x7fffffe07fffffeL & l) != 0L)
- jjCheckNAddTwoStates(48, 49);
- break;
- case 50:
- if ((0x7fffffe07fffffeL & l) != 0L)
- jjAddStates(42, 43);
- break;
- case 51:
- if (curChar == 95)
- jjCheckNAdd(52);
- break;
- case 52:
- if ((0x7fffffe07fffffeL & l) == 0L)
- break;
- if (kind > 7)
- kind = 7;
- jjCheckNAddTwoStates(47, 52);
- break;
- case 53:
- if ((0x7fffffe07fffffeL & l) != 0L)
- jjCheckNAddTwoStates(53, 54);
- break;
- case 54:
- if (curChar == 95)
- jjCheckNAddTwoStates(55, 56);
- break;
- case 55:
- if ((0x7fffffe07fffffeL & l) != 0L)
- jjCheckNAddTwoStates(55, 56);
- break;
- case 57:
- if ((0x7fffffe07fffffeL & l) != 0L)
- jjCheckNAddTwoStates(57, 58);
- break;
- case 58:
- if (curChar == 95)
- jjCheckNAdd(59);
- break;
- case 59:
- if ((0x7fffffe07fffffeL & l) != 0L)
- jjCheckNAddTwoStates(59, 60);
- break;
- case 60:
- if (curChar == 95)
- jjCheckNAddTwoStates(61, 62);
- break;
- case 61:
- if ((0x7fffffe07fffffeL & l) != 0L)
- jjCheckNAddTwoStates(61, 62);
- break;
- case 63:
- if ((0x7fffffe07fffffeL & l) == 0L)
- break;
- if (kind > 7)
- kind = 7;
- jjCheckNAddTwoStates(58, 63);
- break;
- case 64:
- if ((0x7fffffe07fffffeL & l) != 0L)
- jjCheckNAddStates(30, 35);
- break;
- case 65:
- if ((0x7fffffe07fffffeL & l) != 0L)
- jjCheckNAddTwoStates(65, 66);
- break;
- case 67:
- if ((0x7fffffe07fffffeL & l) == 0L)
- break;
- if (kind > 2)
- kind = 2;
- jjCheckNAddTwoStates(66, 67);
- break;
- case 68:
- if ((0x7fffffe07fffffeL & l) != 0L)
- jjCheckNAddTwoStates(68, 69);
- break;
- case 70:
- if ((0x7fffffe07fffffeL & l) != 0L)
- jjAddStates(44, 45);
- break;
- case 72:
- if ((0x7fffffe07fffffeL & l) != 0L)
- jjCheckNAddTwoStates(72, 73);
- break;
- case 73:
- if (curChar == 64)
- jjCheckNAdd(74);
- break;
- case 74:
- if ((0x7fffffe07fffffeL & l) == 0L)
- break;
- if (kind > 4)
- kind = 4;
- jjCheckNAdd(74);
- break;
- default : break;
- }
- } while(i != startsAt);
- }
- else
- {
- int hiByte = (int)(curChar >> 8);
- int i1 = hiByte >> 6;
- long l1 = 1L << (hiByte & 077);
- int i2 = (curChar & 0xff) >> 6;
- long l2 = 1L << (curChar & 077);
- MatchLoop: do
- {
- switch(jjstateSet[--i])
- {
- case 0:
- if (jjCanMove_0(hiByte, i1, i2, l1, l2))
- {
- if (kind > 12)
- kind = 12;
- }
- if (jjCanMove_1(hiByte, i1, i2, l1, l2))
- {
- if (kind > 13)
- kind = 13;
- }
- if (jjCanMove_2(hiByte, i1, i2, l1, l2))
- jjCheckNAddStates(18, 23);
- if (jjCanMove_3(hiByte, i1, i2, l1, l2))
- jjCheckNAddStates(12, 17);
- if (jjCanMove_4(hiByte, i1, i2, l1, l2))
- {
- if (kind > 1)
- kind = 1;
- jjCheckNAddStates(0, 11);
- }
- if (jjCanMove_2(hiByte, i1, i2, l1, l2))
- jjCheckNAddStates(30, 35);
- break;
- case 1:
- if (jjCanMove_1(hiByte, i1, i2, l1, l2) && kind > 13)
- kind = 13;
- break;
- case 2:
- if (jjCanMove_2(hiByte, i1, i2, l1, l2))
- jjCheckNAddStates(18, 23);
- break;
- case 3:
- if (jjCanMove_2(hiByte, i1, i2, l1, l2))
- jjCheckNAddTwoStates(3, 4);
- break;
- case 4:
- if (jjCanMove_3(hiByte, i1, i2, l1, l2))
- jjCheckNAddTwoStates(5, 6);
- break;
- case 5:
- if (jjCanMove_2(hiByte, i1, i2, l1, l2))
- jjCheckNAddTwoStates(5, 6);
- break;
- case 7:
- if (!jjCanMove_4(hiByte, i1, i2, l1, l2))
- break;
- if (kind > 7)
- kind = 7;
- jjstateSet[jjnewStateCnt++] = 7;
- break;
- case 8:
- if (jjCanMove_2(hiByte, i1, i2, l1, l2))
- jjCheckNAddTwoStates(8, 9);
- break;
- case 9:
- if (jjCanMove_3(hiByte, i1, i2, l1, l2))
- jjCheckNAddTwoStates(10, 11);
- break;
- case 10:
- if (jjCanMove_2(hiByte, i1, i2, l1, l2))
- jjCheckNAddTwoStates(10, 11);
- break;
- case 12:
- if (jjCanMove_4(hiByte, i1, i2, l1, l2))
- jjAddStates(46, 47);
- break;
- case 14:
- if (jjCanMove_2(hiByte, i1, i2, l1, l2))
- jjAddStates(48, 49);
- break;
- case 15:
- if (!jjCanMove_3(hiByte, i1, i2, l1, l2))
- break;
- if (kind > 7)
- kind = 7;
- jjCheckNAddTwoStates(11, 16);
- break;
- case 16:
- if (!jjCanMove_2(hiByte, i1, i2, l1, l2))
- break;
- if (kind > 7)
- kind = 7;
- jjCheckNAddTwoStates(11, 16);
- break;
- case 17:
- if (jjCanMove_2(hiByte, i1, i2, l1, l2))
- jjCheckNAddTwoStates(17, 18);
- break;
- case 18:
- if (jjCanMove_3(hiByte, i1, i2, l1, l2))
- jjCheckNAddTwoStates(19, 20);
- break;
- case 19:
- if (jjCanMove_2(hiByte, i1, i2, l1, l2))
- jjCheckNAddTwoStates(19, 20);
- break;
- case 21:
- if (jjCanMove_4(hiByte, i1, i2, l1, l2))
- jjCheckNAddTwoStates(21, 22);
- break;
- case 23:
- if (jjCanMove_2(hiByte, i1, i2, l1, l2))
- jjAddStates(50, 51);
- break;
- case 24:
- if (jjCanMove_3(hiByte, i1, i2, l1, l2))
- jjCheckNAddTwoStates(25, 26);
- break;
- case 25:
- if (jjCanMove_2(hiByte, i1, i2, l1, l2))
- jjCheckNAddTwoStates(25, 26);
- break;
- case 27:
- if (!jjCanMove_4(hiByte, i1, i2, l1, l2))
- break;
- if (kind > 7)
- kind = 7;
- jjCheckNAddTwoStates(22, 27);
- break;
- case 28:
- if (jjCanMove_3(hiByte, i1, i2, l1, l2))
- jjCheckNAddStates(12, 17);
- break;
- case 29:
- if (!jjCanMove_4(hiByte, i1, i2, l1, l2))
- break;
- if (kind > 1)
- kind = 1;
- jjCheckNAddStates(0, 11);
- break;
- case 30:
- if (!jjCanMove_4(hiByte, i1, i2, l1, l2))
- break;
- if (kind > 1)
- kind = 1;
- jjCheckNAdd(30);
- break;
- case 31:
- if (jjCanMove_4(hiByte, i1, i2, l1, l2))
- jjCheckNAddStates(24, 26);
- break;
- case 33:
- if (jjCanMove_4(hiByte, i1, i2, l1, l2))
- jjCheckNAddStates(27, 29);
- break;
- case 35:
- if (jjCanMove_4(hiByte, i1, i2, l1, l2))
- jjCheckNAddTwoStates(35, 36);
- break;
- case 37:
- if (!jjCanMove_4(hiByte, i1, i2, l1, l2))
- break;
- if (kind > 5)
- kind = 5;
- jjCheckNAddTwoStates(36, 37);
- break;
- case 38:
- if (jjCanMove_4(hiByte, i1, i2, l1, l2))
- jjCheckNAddTwoStates(38, 39);
- break;
- case 40:
- if (!jjCanMove_4(hiByte, i1, i2, l1, l2))
- break;
- if (kind > 6)
- kind = 6;
- jjCheckNAddTwoStates(39, 40);
- break;
- case 41:
- if (jjCanMove_4(hiByte, i1, i2, l1, l2))
- jjCheckNAddTwoStates(41, 42);
- break;
- case 43:
- if (jjCanMove_2(hiByte, i1, i2, l1, l2))
- jjAddStates(52, 53);
- break;
- case 44:
- if (!jjCanMove_3(hiByte, i1, i2, l1, l2))
- break;
- if (kind > 7)
- kind = 7;
- jjCheckNAdd(45);
- break;
- case 45:
- if (!jjCanMove_2(hiByte, i1, i2, l1, l2))
- break;
- if (kind > 7)
- kind = 7;
- jjCheckNAdd(45);
- break;
- case 46:
- if (jjCanMove_4(hiByte, i1, i2, l1, l2))
- jjCheckNAddTwoStates(46, 47);
- break;
- case 48:
- if (jjCanMove_2(hiByte, i1, i2, l1, l2))
- jjAddStates(54, 55);
- break;
- case 49:
- if (jjCanMove_3(hiByte, i1, i2, l1, l2))
- jjCheckNAddTwoStates(50, 51);
- break;
- case 50:
- if (jjCanMove_2(hiByte, i1, i2, l1, l2))
- jjCheckNAddTwoStates(50, 51);
- break;
- case 52:
- if (!jjCanMove_4(hiByte, i1, i2, l1, l2))
- break;
- if (kind > 7)
- kind = 7;
- jjCheckNAddTwoStates(47, 52);
- break;
- case 53:
- if (jjCanMove_4(hiByte, i1, i2, l1, l2))
- jjCheckNAddTwoStates(53, 54);
- break;
- case 55:
- if (jjCanMove_2(hiByte, i1, i2, l1, l2))
- jjAddStates(56, 57);
- break;
- case 56:
- if (jjCanMove_3(hiByte, i1, i2, l1, l2))
- jjCheckNAddTwoStates(57, 58);
- break;
- case 57:
- if (jjCanMove_2(hiByte, i1, i2, l1, l2))
- jjCheckNAddTwoStates(57, 58);
- break;
- case 59:
- if (jjCanMove_4(hiByte, i1, i2, l1, l2))
- jjAddStates(58, 59);
- break;
- case 61:
- if (jjCanMove_2(hiByte, i1, i2, l1, l2))
- jjAddStates(60, 61);
- break;
- case 62:
- if (!jjCanMove_3(hiByte, i1, i2, l1, l2))
- break;
- if (kind > 7)
- kind = 7;
- jjCheckNAddTwoStates(58, 63);
- break;
- case 63:
- if (!jjCanMove_2(hiByte, i1, i2, l1, l2))
- break;
- if (kind > 7)
- kind = 7;
- jjCheckNAddTwoStates(58, 63);
- break;
- case 64:
- if (jjCanMove_2(hiByte, i1, i2, l1, l2))
- jjCheckNAddStates(30, 35);
- break;
- case 65:
- if (jjCanMove_2(hiByte, i1, i2, l1, l2))
- jjCheckNAddTwoStates(65, 66);
- break;
- case 67:
- if (!jjCanMove_2(hiByte, i1, i2, l1, l2))
- break;
- if (kind > 2)
- kind = 2;
- jjCheckNAddTwoStates(66, 67);
- break;
- case 68:
- if (jjCanMove_2(hiByte, i1, i2, l1, l2))
- jjCheckNAddTwoStates(68, 69);
- break;
- case 70:
- if (jjCanMove_2(hiByte, i1, i2, l1, l2))
- jjAddStates(44, 45);
- break;
- case 72:
- if (jjCanMove_2(hiByte, i1, i2, l1, l2))
- jjCheckNAddTwoStates(72, 73);
- break;
- case 74:
- if (!jjCanMove_2(hiByte, i1, i2, l1, l2))
- break;
- if (kind > 4)
- kind = 4;
- jjstateSet[jjnewStateCnt++] = 74;
- break;
- default : break;
- }
- } while(i != startsAt);
- }
- if (kind != 0x7fffffff)
- {
- jjmatchedKind = kind;
- jjmatchedPos = curPos;
- kind = 0x7fffffff;
- }
- ++curPos;
- if ((i = jjnewStateCnt) == (startsAt = 75 - (jjnewStateCnt = startsAt)))
- return curPos;
- try { curChar = input_stream.readChar(); }
- catch(java.io.IOException e) { return curPos; }
- }
-}
-static final int[] jjnextStates = {
- 30, 31, 32, 34, 38, 39, 41, 42, 46, 47, 53, 54, 5, 6, 10, 11,
- 19, 20, 3, 4, 8, 9, 17, 18, 31, 32, 34, 32, 33, 34, 65, 66,
- 68, 69, 72, 73, 5, 6, 19, 20, 25, 26, 50, 51, 70, 71, 12, 13,
- 14, 15, 23, 24, 43, 44, 48, 49, 55, 56, 59, 60, 61, 62,
-};
-private static final boolean jjCanMove_0(int hiByte, int i1, int i2, long l1, long l2)
-{
- switch(hiByte)
- {
- case 48:
- return ((jjbitVec2[i2] & l2) != 0L);
- case 49:
- return ((jjbitVec3[i2] & l2) != 0L);
- case 51:
- return ((jjbitVec4[i2] & l2) != 0L);
- case 77:
- return ((jjbitVec5[i2] & l2) != 0L);
- case 255:
- return ((jjbitVec6[i2] & l2) != 0L);
- default :
- if ((jjbitVec0[i1] & l1) != 0L)
- return true;
- return false;
- }
-}
-private static final boolean jjCanMove_1(int hiByte, int i1, int i2, long l1, long l2)
-{
- switch(hiByte)
- {
- case 215:
- return ((jjbitVec8[i2] & l2) != 0L);
- default :
- if ((jjbitVec7[i1] & l1) != 0L)
- return true;
- return false;
- }
-}
-private static final boolean jjCanMove_2(int hiByte, int i1, int i2, long l1, long l2)
-{
- switch(hiByte)
- {
- case 0:
- return ((jjbitVec10[i2] & l2) != 0L);
- case 255:
- return ((jjbitVec11[i2] & l2) != 0L);
- default :
- if ((jjbitVec9[i1] & l1) != 0L)
- return true;
- return false;
- }
-}
-private static final boolean jjCanMove_3(int hiByte, int i1, int i2, long l1, long l2)
-{
- switch(hiByte)
- {
- case 6:
- return ((jjbitVec14[i2] & l2) != 0L);
- case 11:
- return ((jjbitVec15[i2] & l2) != 0L);
- case 13:
- return ((jjbitVec16[i2] & l2) != 0L);
- case 14:
- return ((jjbitVec17[i2] & l2) != 0L);
- case 16:
- return ((jjbitVec18[i2] & l2) != 0L);
- default :
- if ((jjbitVec12[i1] & l1) != 0L)
- if ((jjbitVec13[i2] & l2) == 0L)
- return false;
- else
- return true;
- return false;
- }
-}
-private static final boolean jjCanMove_4(int hiByte, int i1, int i2, long l1, long l2)
-{
- switch(hiByte)
- {
- case 0:
- return ((jjbitVec10[i2] & l2) != 0L);
- case 215:
- return ((jjbitVec8[i2] & l2) != 0L);
- case 255:
- return ((jjbitVec11[i2] & l2) != 0L);
- default :
- if ((jjbitVec19[i1] & l1) != 0L)
- return true;
- return false;
- }
-}
-public static final String[] jjstrLiteralImages = {
-"", null, null, null, null, null, null, null, null, null, null, null, null,
-null, null, null, };
-public static final String[] lexStateNames = {
- "DEFAULT",
-};
-static final long[] jjtoToken = {
- 0x30ffL,
-};
-static final long[] jjtoSkip = {
- 0x8000L,
-};
-protected CharStream input_stream;
-private final int[] jjrounds = new int[75];
-private final int[] jjstateSet = new int[150];
-protected char curChar;
-public StandardTokenizerTokenManager(CharStream stream)
-{
- input_stream = stream;
-}
-public StandardTokenizerTokenManager(CharStream stream, int lexState)
-{
- this(stream);
- SwitchTo(lexState);
-}
-public void ReInit(CharStream stream)
-{
- jjmatchedPos = jjnewStateCnt = 0;
- curLexState = defaultLexState;
- input_stream = stream;
- ReInitRounds();
-}
-private final void ReInitRounds()
-{
- int i;
- jjround = 0x80000001;
- for (i = 75; i-- > 0;)
- jjrounds[i] = 0x80000000;
-}
-public void ReInit(CharStream stream, int lexState)
-{
- ReInit(stream);
- SwitchTo(lexState);
-}
-public void SwitchTo(int lexState)
-{
- if (lexState >= 1 || lexState < 0)
- throw new TokenMgrError("Error: Ignoring invalid lexical state : " + lexState + ". State unchanged.", TokenMgrError.INVALID_LEXICAL_STATE);
- else
- curLexState = lexState;
-}
-
-protected Token jjFillToken()
-{
- Token t = Token.newToken(jjmatchedKind);
- t.kind = jjmatchedKind;
- String im = jjstrLiteralImages[jjmatchedKind];
- t.image = (im == null) ? input_stream.GetImage() : im;
- t.beginLine = input_stream.getBeginLine();
- t.beginColumn = input_stream.getBeginColumn();
- t.endLine = input_stream.getEndLine();
- t.endColumn = input_stream.getEndColumn();
- return t;
-}
-
-int curLexState = 0;
-int defaultLexState = 0;
-int jjnewStateCnt;
-int jjround;
-int jjmatchedPos;
-int jjmatchedKind;
-
-public Token getNextToken()
-{
- int kind;
- Token specialToken = null;
- Token matchedToken;
- int curPos = 0;
-
- EOFLoop :
- for (;;)
- {
- try
- {
- curChar = input_stream.BeginToken();
- }
- catch(java.io.IOException e)
- {
- jjmatchedKind = 0;
- matchedToken = jjFillToken();
- return matchedToken;
- }
-
- jjmatchedKind = 0x7fffffff;
- jjmatchedPos = 0;
- curPos = jjMoveStringLiteralDfa0_0();
- if (jjmatchedPos == 0 && jjmatchedKind > 15)
- {
- jjmatchedKind = 15;
- }
- if (jjmatchedKind != 0x7fffffff)
- {
- if (jjmatchedPos + 1 < curPos)
- input_stream.backup(curPos - jjmatchedPos - 1);
- if ((jjtoToken[jjmatchedKind >> 6] & (1L << (jjmatchedKind & 077))) != 0L)
- {
- matchedToken = jjFillToken();
- return matchedToken;
- }
- else
- {
- continue EOFLoop;
- }
- }
- int error_line = input_stream.getEndLine();
- int error_column = input_stream.getEndColumn();
- String error_after = null;
- boolean EOFSeen = false;
- try { input_stream.readChar(); input_stream.backup(1); }
- catch (java.io.IOException e1) {
- EOFSeen = true;
- error_after = curPos <= 1 ? "" : input_stream.GetImage();
- if (curChar == '\n' || curChar == '\r') {
- error_line++;
- error_column = 0;
- }
- else
- error_column++;
- }
- if (!EOFSeen) {
- input_stream.backup(1);
- error_after = curPos <= 1 ? "" : input_stream.GetImage();
- }
- throw new TokenMgrError(EOFSeen, curLexState, error_line, error_column, error_after, curChar, TokenMgrError.LEXICAL_ERROR);
- }
-}
-
-}
Index: E:/projects/lucene/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizerConstants.java
===================================================================
--- E:/projects/lucene/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizerConstants.java (revision 560135)
+++ E:/projects/lucene/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizerConstants.java (working copy)
@@ -1,44 +0,0 @@
-/* Generated By:JavaCC: Do not edit this line. StandardTokenizerConstants.java */
-package org.apache.lucene.analysis.standard;
-
-public interface StandardTokenizerConstants {
-
- int EOF = 0;
- int ALPHANUM = 1;
- int APOSTROPHE = 2;
- int ACRONYM = 3;
- int COMPANY = 4;
- int EMAIL = 5;
- int HOST = 6;
- int NUM = 7;
- int P = 8;
- int HAS_DIGIT = 9;
- int ALPHA = 10;
- int LETTER = 11;
- int CJ = 12;
- int KOREAN = 13;
- int DIGIT = 14;
- int NOISE = 15;
-
- int DEFAULT = 0;
-
- String[] tokenImage = {
- "<EOF>",
- "<ALPHANUM>",
- "<APOSTROPHE>",
- "<ACRONYM>",
- "<COMPANY>",
- "<EMAIL>",
- "<HOST>",
- "<NUM>",
- "<P>",
- "<HAS_DIGIT>",
- "<ALPHA>",
- "<LETTER>",
- "<CJ>",
- "<KOREAN>",
- "<DIGIT>",
- "<NOISE>",
- };
-
-}
Index: E:/projects/lucene/trunk/src/java/org/apache/lucene/analysis/standard/CharStream.java
===================================================================
--- E:/projects/lucene/trunk/src/java/org/apache/lucene/analysis/standard/CharStream.java (revision 560135)
+++ E:/projects/lucene/trunk/src/java/org/apache/lucene/analysis/standard/CharStream.java (working copy)
@@ -1,110 +0,0 @@
-/* Generated By:JavaCC: Do not edit this line. CharStream.java Version 3.0 */
-package org.apache.lucene.analysis.standard;
-
-/**
- * This interface describes a character stream that maintains line and
- * column number positions of the characters. It also has the capability
- * to backup the stream to some extent. An implementation of this
- * interface is used in the TokenManager implementation generated by
- * JavaCCParser.
- *
- * All the methods except backup can be implemented in any fashion. backup
- * needs to be implemented correctly for the correct operation of the lexer.
- * Rest of the methods are all used to get information like line number,
- * column number and the String that constitutes a token and are not used
- * by the lexer. Hence their implementation won't affect the generated lexer's
- * operation.
- */
-
-public interface CharStream {
-
- /**
- * Returns the next character from the selected input. The method
- * of selecting the input is the responsibility of the class
- * implementing this interface. Can throw any java.io.IOException.
- */
- char readChar() throws java.io.IOException;
-
- /**
- * Returns the column position of the character last read.
- * @deprecated
- * @see #getEndColumn
- */
- int getColumn();
-
- /**
- * Returns the line number of the character last read.
- * @deprecated
- * @see #getEndLine
- */
- int getLine();
-
- /**
- * Returns the column number of the last character for current token (being
- * matched after the last call to BeginTOken).
- */
- int getEndColumn();
-
- /**
- * Returns the line number of the last character for current token (being
- * matched after the last call to BeginTOken).
- */
- int getEndLine();
-
- /**
- * Returns the column number of the first character for current token (being
- * matched after the last call to BeginTOken).
- */
- int getBeginColumn();
-
- /**
- * Returns the line number of the first character for current token (being
- * matched after the last call to BeginTOken).
- */
- int getBeginLine();
-
- /**
- * Backs up the input stream by amount steps. Lexer calls this method if it
- * had already read some characters, but could not use them to match a
- * (longer) token. So, they will be used again as the prefix of the next
- * token and it is the implemetation's responsibility to do this right.
- */
- void backup(int amount);
-
- /**
- * Returns the next character that marks the beginning of the next token.
- * All characters must remain in the buffer between two successive calls
- * to this method to implement backup correctly.
- */
- char BeginToken() throws java.io.IOException;
-
- /**
- * Returns a string made up of characters from the marked token beginning
- * to the current buffer position. Implementations have the choice of returning
- * anything that they want to. For example, for efficiency, one might decide
- * to just return null, which is a valid implementation.
- */
- String GetImage();
-
- /**
- * Returns an array of characters that make up the suffix of length 'len' for
- * the currently matched token. This is used to build up the matched string
- * for use in actions in the case of MORE. A simple and inefficient
- * implementation of this is as follows :
- *
- * {
- * String t = GetImage();
- * return t.substring(t.length() - len, t.length()).toCharArray();
- * }
- */
- char[] GetSuffix(int len);
-
- /**
- * The lexer calls this function to indicate that it is done with the stream
- * and hence implementations can free any resources held by this class.
- * Again, the body of this function can be just empty and it will not
- * affect the lexer's operation.
- */
- void Done();
-
-}
Index: E:/projects/lucene/trunk/src/java/org/apache/lucene/analysis/standard/ParseException.java
===================================================================
--- E:/projects/lucene/trunk/src/java/org/apache/lucene/analysis/standard/ParseException.java (revision 560135)
+++ E:/projects/lucene/trunk/src/java/org/apache/lucene/analysis/standard/ParseException.java (working copy)
@@ -1,194 +0,0 @@
-/* Generated By:JavaCC: Do not edit this line. ParseException.java Version 0.7pre6 */
-package org.apache.lucene.analysis.standard;
-
-/* Note: This file was also manually modified.
- * Regenerating it by JavaCC might undo these changes!. */
-
-/**
- * This exception is thrown when parse errors are encountered.
- * You can explicitly create objects of this exception type by
- * calling the method generateParseException in the generated
- * parser.
- *
- * You can modify this class to customize your error reporting
- * mechanisms so long as you retain the public fields.
- */
-public class ParseException extends java.io.IOException {
-
- /**
- * This constructor is used by the method "generateParseException"
- * in the generated parser. Calling this constructor generates
- * a new object of this type with the fields "currentToken",
- * "expectedTokenSequences", and "tokenImage" set. The boolean
- * flag "specialConstructor" is also set to true to indicate that
- * this constructor was used to create this object.
- * This constructor calls its super class with the empty string
- * to force the "toString" method of parent class "Throwable" to
- * print the error message in the form:
- * ParseException: &lt;result of getMessage&gt;
- */
- public ParseException(Token currentTokenVal,
- int[][] expectedTokenSequencesVal,
- String[] tokenImageVal
- )
- {
- super("");
- specialConstructor = true;
- currentToken = currentTokenVal;
- expectedTokenSequences = expectedTokenSequencesVal;
- tokenImage = tokenImageVal;
- }
-
- /**
- * The following constructors are for use by you for whatever
- * purpose you can think of. Constructing the exception in this
- * manner makes the exception behave in the normal way - i.e., as
- * documented in the class "Throwable". The fields "errorToken",
- * "expectedTokenSequences", and "tokenImage" do not contain
- * relevant information. The JavaCC generated code does not use
- * these constructors.
- */
-
- public ParseException() {
- super();
- specialConstructor = false;
- }
-
- public ParseException(String message) {
- super(message);
- specialConstructor = false;
- }
-
- /**
- * This variable determines which constructor was used to create
- * this object and thereby affects the semantics of the
- * "getMessage" method (see below).
- */
- protected boolean specialConstructor;
-
- /**
- * This is the last token that has been consumed successfully. If
- * this object has been created due to a parse error, the token
- * followng this token will (therefore) be the first error token.
- */
- public Token currentToken;
-
- /**
- * Each entry in this array is an array of integers. Each array
- * of integers represents a sequence of tokens (by their ordinal
- * values) that is expected at this point of the parse.
- */
- public int[][] expectedTokenSequences;
-
- /**
- * This is a reference to the "tokenImage" array of the generated
- * parser within which the parse error occurred. This array is
- * defined in the generated ...Constants interface.
- */
- public String[] tokenImage;
-
- /**
- * This method has the standard behavior when this object has been
- * created using the standard constructors. Otherwise, it uses
- * "currentToken" and "expectedTokenSequences" to generate a parse
- * error message and returns it. If this object has been created
- * due to a parse error, and you do not catch it (it gets thrown
- * from the parser), then this method is called during the printing
- * of the final stack trace, and hence the correct error message
- * gets displayed.
- */
- public String getMessage() {
- if (!specialConstructor) {
- return super.getMessage();
- }
- String expected = "";
- int maxSize = 0;
- for (int i = 0; i < expectedTokenSequences.length; i++) {
- if (maxSize < expectedTokenSequences[i].length) {
- maxSize = expectedTokenSequences[i].length;
- }
- for (int j = 0; j < expectedTokenSequences[i].length; j++) {
- expected += tokenImage[expectedTokenSequences[i][j]] + " ";
- }
- if (expectedTokenSequences[i][expectedTokenSequences[i].length - 1] != 0) {
- expected += "...";
- }
- expected += eol + " ";
- }
- String retval = "Encountered \"";
- Token tok = currentToken.next;
- for (int i = 0; i < maxSize; i++) {
- if (i != 0) retval += " ";
- if (tok.kind == 0) {
- retval += tokenImage[0];
- break;
- }
- retval += add_escapes(tok.image);
- tok = tok.next;
- }
- retval += "\" at line " + currentToken.next.beginLine + ", column " + currentToken.next.beginColumn + "." + eol;
- if (expectedTokenSequences.length == 1) {
- retval += "Was expecting:" + eol + " ";
- } else {
- retval += "Was expecting one of:" + eol + " ";
- }
- retval += expected;
- return retval;
- }
-
- /**
- * The end of line string for this machine.
- */
- protected String eol = System.getProperty("line.separator", "\n");
-
- /**
- * Used to convert raw characters to their escaped version
- * when these raw version cannot be used as part of an ASCII
- * string literal.
- */
- protected String add_escapes(String str) {
- StringBuffer retval = new StringBuffer();
- char ch;
- for (int i = 0; i < str.length(); i++) {
- switch (str.charAt(i))
- {
- case 0 :
- continue;
- case '\b':
- retval.append("\\b");
- continue;
- case '\t':
- retval.append("\\t");
- continue;
- case '\n':
- retval.append("\\n");
- continue;
- case '\f':
- retval.append("\\f");
- continue;
- case '\r':
- retval.append("\\r");
- continue;
- case '\"':
- retval.append("\\\"");
- continue;
- case '\'':
- retval.append("\\\'");
- continue;
- case '\\':
- retval.append("\\\\");
- continue;
- default:
- if ((ch = str.charAt(i)) < 0x20 || ch > 0x7e) {
- String s = "0000" + Integer.toString(ch, 16);
- retval.append("\\u" + s.substring(s.length() - 4, s.length()));
- } else {
- retval.append(ch);
- }
- continue;
- }
- }
- return retval.toString();
- }
-
-}
Index: E:/projects/lucene/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
===================================================================
--- E:/projects/lucene/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java (revision 560135)
+++ E:/projects/lucene/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java (working copy)
@@ -1,206 +1,68 @@
-/* Generated By:JavaCC: Do not edit this line. StandardTokenizer.java */
-package org.apache.lucene.analysis.standard;
-
-import java.io.*;
-
-/** A grammar-based tokenizer constructed with JavaCC.
+/*
+ * Carrot2 project.
*
- * <p> This should be a good tokenizer for most European-language documents:
+ * Copyright (C) 2002-2007, Dawid Weiss, Stanisław Osiński.
+ * Portions (C) Contributors listed in "carrot2.CONTRIBUTORS" file.
+ * All rights reserved.
*
- * <ul>
- * <li>Splits words at punctuation characters, removing punctuation. However, a
- * dot that's not followed by whitespace is considered part of a token.
- * <li>Splits words at hyphens, unless there's a number in the token, in which case
- * the whole token is interpreted as a product number and is not split.
- * <li>Recognizes email addresses and internet hostnames as one token.
- * </ul>
- *
- * <p>Many applications have specific tokenizer needs. If this tokenizer does
- * not suit your application, please consider copying this source code
- * directory to your project and maintaining your own grammar-based tokenizer.
+ * Refer to the full license file "carrot2.LICENSE"
+ * in the root folder of the repository checkout or at:
+ * http://www.carrot2.org/carrot2.LICENSE
*/
-public class StandardTokenizer extends org.apache.lucene.analysis.Tokenizer implements StandardTokenizerConstants {
- /** Constructs a tokenizer for this Reader. */
- public StandardTokenizer(Reader reader) {
- this(new FastCharStream(reader));
- this.input = reader;
- }
+package org.apache.lucene.analysis.standard;
-/** Returns the next token in the stream, or null at EOS.
- * <p>The returned token's type is set to an element of {@link
- * StandardTokenizerConstants#tokenImage}.
- */
- final public org.apache.lucene.analysis.Token next() throws ParseException, IOException {
- Token token = null;
- switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
- case ALPHANUM:
- token = jj_consume_token(ALPHANUM);
- break;
- case APOSTROPHE:
- token = jj_consume_token(APOSTROPHE);
- break;
- case ACRONYM:
- token = jj_consume_token(ACRONYM);
- break;
- case COMPANY:
- token = jj_consume_token(COMPANY);
- break;
- case EMAIL:
- token = jj_consume_token(EMAIL);
- break;
- case HOST:
- token = jj_consume_token(HOST);
- break;
- case NUM:
- token = jj_consume_token(NUM);
- break;
- case CJ:
- token = jj_consume_token(CJ);
- break;
- case 0:
- token = jj_consume_token(0);
- break;
- default:
- jj_la1[0] = jj_gen;
- jj_consume_token(-1);
- throw new ParseException();
- }
- if (token.kind == EOF) {
- {if (true) return null;}
- } else {
- {if (true) return
- new org.apache.lucene.analysis.Token(token.image,
- token.beginColumn,token.endColumn,
- tokenImage[token.kind]);}
- }
- throw new Error("Missing return statement in function");
- }
+import java.io.IOException;
+import java.io.Reader;
- public StandardTokenizerTokenManager token_source;
- public Token token, jj_nt;
- private int jj_ntk;
- private int jj_gen;
- final private int[] jj_la1 = new int[1];
- static private int[] jj_la1_0;
- static {
- jj_la1_0();
- }
- private static void jj_la1_0() {
- jj_la1_0 = new int[] {0x10ff,};
- }
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.Tokenizer;
- public StandardTokenizer(CharStream stream) {
- token_source = new StandardTokenizerTokenManager(stream);
- token = new Token();
- jj_ntk = -1;
- jj_gen = 0;
- for (int i = 0; i < 1; i++) jj_la1[i] = -1;
- }
+/**
+ * An interface to the tokenizer constructed with JFlex.
+ *
+ * @author Stanislaw Osinski
+ */
+public class StandardTokenizer extends Tokenizer {
+ /** A private instance of the JFlex-constructed scanner */
+ private final StandardTokenizerImpl scanner;
- public void ReInit(CharStream stream) {
- token_source.ReInit(stream);
- token = new Token();
- jj_ntk = -1;
- jj_gen = 0;
- for (int i = 0; i < 1; i++) jj_la1[i] = -1;
- }
-
- public StandardTokenizer(StandardTokenizerTokenManager tm) {
- token_source = tm;
- token = new Token();
- jj_ntk = -1;
- jj_gen = 0;
- for (int i = 0; i < 1; i++) jj_la1[i] = -1;
- }
-
- public void ReInit(StandardTokenizerTokenManager tm) {
- token_source = tm;
- token = new Token();
- jj_ntk = -1;
- jj_gen = 0;
- for (int i = 0; i < 1; i++) jj_la1[i] = -1;
- }
-
- final private Token jj_consume_token(int kind) throws ParseException {
- Token oldToken;
- if ((oldToken = token).next != null) token = token.next;
- else token = token.next = token_source.getNextToken();
- jj_ntk = -1;
- if (token.kind == kind) {
- jj_gen++;
- return token;
+ /**
+ * Creates a new instance of the {@link StandardTokenizer}. Attaches the
+ * <code>input</code> to a newly created JFlex scanner.
+ */
+ public StandardTokenizer(Reader input) {
+ this.input = input;
+ this.scanner = new StandardTokenizerImpl(input);
}
- token = oldToken;
- jj_kind = kind;
- throw generateParseException();
- }
- final public Token getNextToken() {
- if (token.next != null) token = token.next;
- else token = token.next = token_source.getNextToken();
- jj_ntk = -1;
- jj_gen++;
- return token;
- }
+ /*
+ * (non-Javadoc)
+ *
+ * @see org.apache.lucene.analysis.TokenStream#next()
+ */
+ public Token next() throws IOException {
+ int tokenType = scanner.getNextToken();
- final public Token getToken(int index) {
- Token t = token;
- for (int i = 0; i < index; i++) {
- if (t.next != null) t = t.next;
- else t = t.next = token_source.getNextToken();
- }
- return t;
- }
+ if (tokenType == StandardTokenizerImpl.YYEOF) {
+ return null;
+ }
- final private int jj_ntk() {
- if ((jj_nt=token.next) == null)
- return (jj_ntk = (token.next=token_source.getNextToken()).kind);
- else
- return (jj_ntk = jj_nt.kind);
- }
+ int startPosition = scanner.yychar();
- private java.util.Vector jj_expentries = new java.util.Vector();
- private int[] jj_expentry;
- private int jj_kind = -1;
-
- public ParseException generateParseException() {
- jj_expentries.removeAllElements();
- boolean[] la1tokens = new boolean[16];
- for (int i = 0; i < 16; i++) {
- la1tokens[i] = false;
+ final String tokenImage = scanner.yytext();
+ return new Token(tokenImage, startPosition, startPosition
+ + tokenImage.length(),
+ StandardTokenizerImpl.TOKEN_TYPES[tokenType]);
}
- if (jj_kind >= 0) {
- la1tokens[jj_kind] = true;
- jj_kind = -1;
- }
- for (int i = 0; i < 1; i++) {
- if (jj_la1[i] == jj_gen) {
- for (int j = 0; j < 32; j++) {
- if ((jj_la1_0[i] & (1<<j)) != 0) {
- la1tokens[j] = true;
- }
- }
- }
- }
- for (int i = 0; i < 16; i++) {
- if (la1tokens[i]) {
- jj_expentry = new int[1];
- jj_expentry[0] = i;
- jj_expentries.addElement(jj_expentry);
- }
- }
- int[][] exptokseq = new int[jj_expentries.size()][];
- for (int i = 0; i < jj_expentries.size(); i++) {
- exptokseq[i] = (int[])jj_expentries.elementAt(i);
- }
- return new ParseException(token, exptokseq, tokenImage);
- }
- final public void enable_tracing() {
- }
-
- final public void disable_tracing() {
- }
-
+ /*
+ * (non-Javadoc)
+ *
+ * @see org.apache.lucene.analysis.TokenStream#reset()
+ */
+ public void reset() throws IOException {
+ super.reset();
+ scanner.yyreset(input);
+ }
}
Index: E:/projects/lucene/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.jj
===================================================================
--- E:/projects/lucene/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.jj (revision 560135)
+++ E:/projects/lucene/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.jj (working copy)
@@ -1,196 +0,0 @@
-/**f
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-options {
- STATIC = false;
-//IGNORE_CASE = true;
-//BUILD_PARSER = false;
- UNICODE_INPUT = true;
- USER_CHAR_STREAM = true;
- OPTIMIZE_TOKEN_MANAGER = true;
-//DEBUG_TOKEN_MANAGER = true;
-}
-PARSER_BEGIN(StandardTokenizer)
-
-package org.apache.lucene.analysis.standard;
-
-import java.io.*;
-
-/** A grammar-based tokenizer constructed with JavaCC.
- *
- * <p> This should be a good tokenizer for most European-language documents:
- *
- * <ul>
- * <li>Splits words at punctuation characters, removing punctuation. However, a
- * dot that's not followed by whitespace is considered part of a token.
- * <li>Splits words at hyphens, unless there's a number in the token, in which case
- * the whole token is interpreted as a product number and is not split.
- * <li>Recognizes email addresses and internet hostnames as one token.
- * </ul>
- *
- * <p>Many applications have specific tokenizer needs. If this tokenizer does
- * not suit your application, please consider copying this source code
- * directory to your project and maintaining your own grammar-based tokenizer.
- */
-public class StandardTokenizer extends org.apache.lucene.analysis.Tokenizer {
-
- /** Constructs a tokenizer for this Reader. */
- public StandardTokenizer(Reader reader) {
- this(new FastCharStream(reader));
- this.input = reader;
- }
-}
-
-PARSER_END(StandardTokenizer)
-
-TOKEN : { // token patterns
-
- // basic word: a sequence of digits & letters
- <ALPHANUM: (<LETTER>|<DIGIT>|<KOREAN>)+ >
-
- // internal apostrophes: O'Reilly, you're, O'Reilly's
- // use a post-filter to remove possesives
-| <APOSTROPHE: <ALPHA> ("'" <ALPHA>)+ >
-
- // acronyms: U.S.A., I.B.M., etc.
- // use a post-filter to remove dots
-| <ACRONYM: <ALPHA> "." (<ALPHA> ".")+ >
-
- // company names like AT&T and Excite@Home.
-| <COMPANY: <ALPHA> ("&"|"@") <ALPHA> >
-
- // email addresses
-| <EMAIL: <ALPHANUM> (("."|"-"|"_") <ALPHANUM>)* "@" <ALPHANUM> (("."|"-") <ALPHANUM>)+ >
-
- // hostname
-| <HOST: <ALPHANUM> ("." <ALPHANUM>)+ >
-
- // floating point, serial, model numbers, ip addresses, etc.
- // every other segment must have at least one digit
-| <NUM: (<ALPHANUM> <P> <HAS_DIGIT>
- | <HAS_DIGIT> <P> <ALPHANUM>
- | <ALPHANUM> (<P> <HAS_DIGIT> <P> <ALPHANUM>)+
- | <HAS_DIGIT> (<P> <ALPHANUM> <P> <HAS_DIGIT>)+
- | <ALPHANUM> <P> <HAS_DIGIT> (<P> <ALPHANUM> <P> <HAS_DIGIT>)+
- | <HAS_DIGIT> <P> <ALPHANUM> (<P> <HAS_DIGIT> <P> <ALPHANUM>)+
- )
- >
-| <#P: ("_"|"-"|"/"|"."|",") >
-| <#HAS_DIGIT: // at least one digit
- (<LETTER>|<DIGIT>)*
- <DIGIT>
- (<LETTER>|<DIGIT>)*
- >
-
-| < #ALPHA: (<LETTER>)+>
-| < #LETTER: // unicode letters
- [
- "\u0041"-"\u005a",
- "\u0061"-"\u007a",
- "\u00c0"-"\u00d6",
- "\u00d8"-"\u00f6",
- "\u00f8"-"\u00ff",
- "\u0100"-"\u1fff",
- "\uffa0"-"\uffdc"
- ]
- >
-| < CJ: // Chinese, Japanese
- [
- "\u3040"-"\u318f",
- "\u3100"-"\u312f", // BaPoMoFo (aka ZhuYin)
- "\u3040"-"\u309F", // Japanese: Hiragana
- "\u30A0"-"\u30FF", // Japanese: Katakana
- "\u31F0"-"\u31FF", // Japanese: Katakana Phonetic Extensions
- "\u3300"-"\u337f",
- "\u3400"-"\u4dbf", // CJK Unified Ideographs Ext. A
- "\u4e00"-"\u9fff",
- "\uf900"-"\ufaff",
- "\uff65"-"\uff9f"
-
-// Otis: consider adding these, too
-//
-// 2E80-2EFF: CJK Radicals Supplement
-// 2F00-2FDF: Kangxi Radicals
-// 3190-319F: Kanbun
-// 31C0-31EF: CJK Strokes
-// 4E00-9FBF: CJK Unified
-// F900-FAFF: CJK Compatibility Ideographs
-
- ]
- >
-| < KOREAN: // Korean
- [
- "\uac00"-"\ud7af", // Hangul Syllables
- "\u1100"-"\u11ff" // Hangul Jamo
- // "\uac00"-"\ud7a3"
- ]
- >
-| < #DIGIT: // unicode digits
- [
- "\u0030"-"\u0039",
- "\u0660"-"\u0669",
- "\u06f0"-"\u06f9",
- "\u0966"-"\u096f",
- "\u09e6"-"\u09ef",
- "\u0a66"-"\u0a6f",
- "\u0ae6"-"\u0aef",
- "\u0b66"-"\u0b6f",
- "\u0be7"-"\u0bef",
- "\u0c66"-"\u0c6f",
- "\u0ce6"-"\u0cef",
- "\u0d66"-"\u0d6f",
- "\u0e50"-"\u0e59",
- "\u0ed0"-"\u0ed9",
- "\u1040"-"\u1049"
- ]
- >
-}
-
-SKIP : { // skip unrecognized chars
- <NOISE: ~[] >
-}
-
-/** Returns the next token in the stream, or null at EOS.
- * <p>The returned token's type is set to an element of {@link
- * StandardTokenizerConstants#tokenImage}.
- */
-org.apache.lucene.analysis.Token next() throws IOException :
-{
- Token token = null;
-}
-{
- ( token = <ALPHANUM> |
- token = <APOSTROPHE> |
- token = <ACRONYM> |
- token = <COMPANY> |
- token = <EMAIL> |
- token = <HOST> |
- token = <NUM> |
- token = <CJ> |
- token = <EOF>
- )
- {
- if (token.kind == EOF) {
- return null;
- } else {
- return
- new org.apache.lucene.analysis.Token(token.image,
- token.beginColumn,token.endColumn,
- tokenImage[token.kind]);
- }
- }
-}
Index: E:/projects/lucene/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java
===================================================================
--- E:/projects/lucene/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java (revision 0)
+++ E:/projects/lucene/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java (revision 0)
@@ -0,0 +1,606 @@
+/* The following code was generated by JFlex 1.4.1 on 07-08-01 09:29 */
+
+package org.apache.lucene.analysis.standard;
+
+/**
+ * This class is a scanner generated by
+ * <a href="http://www.jflex.de/">JFlex</a> 1.4.1
+ * on 07-08-01 09:29 from the specification file
+ * <tt>E:/projects/lucene/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex</tt>
+ */
+class StandardTokenizerImpl {
+
+ /** This character denotes the end of file */
+ public static final int YYEOF = -1;
+
+ /** initial size of the lookahead buffer */
+ private static final int ZZ_BUFFERSIZE = 16384;
+
+ /** lexical states */
+ public static final int YYINITIAL = 0;
+
+ /**
+ * Translates characters to character classes
+ */
+ private static final String ZZ_CMAP_PACKED =
+ "\11\0\1\0\1\16\1\0\1\0\1\15\22\0\1\0\5\0\1\3"+
+ "\1\1\4\0\1\7\1\5\1\2\1\7\12\11\6\0\1\4\32\10"+
+ "\4\0\1\6\1\0\32\10\105\0\27\10\1\0\37\10\1\0\u0568\10"+
+ "\12\12\206\10\12\12\u026c\10\12\12\166\10\12\12\166\10\12\12\166\10"+
+ "\12\12\166\10\12\12\167\10\11\12\166\10\12\12\166\10\12\12\166\10"+
+ "\12\12\340\10\12\12\166\10\12\12\u0166\10\12\12\266\10\u0100\10\u0e00\10"+
+ "\u1040\0\u0150\14\140\0\20\14\u0100\0\200\14\200\0\u19c0\14\100\0\u5200\14"+
+ "\u0c00\0\u2bb0\13\u2150\0\u0200\14\u0465\0\73\14\75\10\43\0";
+
+ /**
+ * Translates characters to character classes
+ */
+ private static final char [] ZZ_CMAP = zzUnpackCMap(ZZ_CMAP_PACKED);
+
+ /**
+ * Translates DFA states to action switch labels.
+ */
+ private static final int [] ZZ_ACTION = zzUnpackAction();
+
+ private static final String ZZ_ACTION_PACKED_0 =
+ "\1\0\2\1\4\2\1\3\1\1\15\0\1\4\1\5"+
+ "\2\6\2\7\2\0\1\6\2\4\1\6\1\10\1\0"+
+ "\1\10\1\11";
+
+ private static int [] zzUnpackAction() {
+ int [] result = new int[38];
+ int offset = 0;
+ offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result);
+ return result;
+ }
+
+ private static int zzUnpackAction(String packed, int offset, int [] result) {
+ int i = 0; /* index in packed string */
+ int j = offset; /* index in unpacked array */
+ int l = packed.length();
+ while (i < l) {
+ int count = packed.charAt(i++);
+ int value = packed.charAt(i++);
+ do result[j++] = value; while (--count > 0);
+ }
+ return j;
+ }
+
+
+ /**
+ * Translates a state to a row index in the transition table
+ */
+ private static final int [] ZZ_ROWMAP = zzUnpackRowMap();
+
+ private static final String ZZ_ROWMAP_PACKED_0 =
+ "\0\0\0\17\0\36\0\55\0\74\0\113\0\132\0\17"+
+ "\0\151\0\170\0\207\0\226\0\245\0\264\0\303\0\322"+
+ "\0\341\0\360\0\377\0\u010e\0\u011d\0\36\0\u012c\0\u013b"+
+ "\0\u014a\0\u0159\0\245\0\u0168\0\u0177\0\u0186\0\u0195\0\u01a4"+
+ "\0\u01b3\0\u01c2\0\226\0\u01d1\0\u010e\0\u01e0";
+
+ private static int [] zzUnpackRowMap() {
+ int [] result = new int[38];
+ int offset = 0;
+ offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result);
+ return result;
+ }
+
+ private static int zzUnpackRowMap(String packed, int offset, int [] result) {
+ int i = 0; /* index in packed string */
+ int j = offset; /* index in unpacked array */
+ int l = packed.length();
+ while (i < l) {
+ int high = packed.charAt(i++) << 16;
+ result[j++] = high | packed.charAt(i++);
+ }
+ return j;
+ }
+
+ /**
+ * The transition table of the DFA
+ */
+ private static final int [] ZZ_TRANS = zzUnpackTrans();
+
+ private static final String ZZ_TRANS_PACKED_0 =
+ "\2\2\1\3\2\2\3\3\1\4\1\5\1\6\1\7"+
+ "\1\10\1\11\1\2\27\0\4\12\4\0\1\13\1\14"+
+ "\1\15\1\16\2\17\1\0\1\4\1\5\1\6\1\7"+
+ "\5\0\1\20\1\0\1\21\2\22\1\23\3\5\1\7"+
+ "\4\0\1\13\1\24\1\15\1\16\2\22\1\23\1\6"+
+ "\1\5\1\6\1\7\5\0\1\25\1\0\1\21\2\17"+
+ "\1\0\4\7\21\0\1\2\2\0\1\26\2\0\3\26"+
+ "\1\12\2\27\1\12\13\0\1\30\1\0\1\30\14\0"+
+ "\1\31\1\32\1\31\1\32\13\0\1\33\1\0\1\33"+
+ "\14\0\1\34\1\35\1\34\1\35\13\0\4\36\13\0"+
+ "\4\37\13\0\4\35\13\0\4\40\13\0\4\41\13\0"+
+ "\1\42\1\37\1\42\1\37\13\0\4\32\5\0\1\23"+
+ "\2\0\3\23\3\27\1\12\4\0\1\13\6\0\1\30"+
+ "\1\0\1\30\6\0\1\43\1\0\1\21\2\17\1\0"+
+ "\1\31\1\32\1\31\1\32\5\0\1\25\1\0\1\21"+
+ "\2\17\1\0\4\32\5\0\1\44\2\0\1\44\2\0"+
+ "\1\34\1\35\1\34\1\35\5\0\1\44\2\0\1\44"+
+ "\2\0\4\35\5\0\1\17\1\0\1\21\2\17\1\0"+
+ "\4\36\5\0\1\20\1\0\1\21\2\22\1\23\4\37"+
+ "\5\0\1\22\1\0\1\21\2\22\1\23\4\40\5\0"+
+ "\1\23\2\0\3\23\4\41\5\0\1\45\1\0\1\21"+
+ "\2\22\1\23\1\42\1\37\1\42\1\37\13\0\4\46"+
+ "\5\0\1\44\2\0\1\44\2\0\4\46\3\0";
+
+ private static int [] zzUnpackTrans() {
+ int [] result = new int[495];
+ int offset = 0;
+ offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result);
+ return result;
+ }
+
+ private static int zzUnpackTrans(String packed, int offset, int [] result) {
+ int i = 0; /* index in packed string */
+ int j = offset; /* index in unpacked array */
+ int l = packed.length();
+ while (i < l) {
+ int count = packed.charAt(i++);
+ int value = packed.charAt(i++);
+ value--;
+ do result[j++] = value; while (--count > 0);
+ }
+ return j;
+ }
+
+
+ /* error codes */
+ private static final int ZZ_UNKNOWN_ERROR = 0;
+ private static final int ZZ_NO_MATCH = 1;
+ private static final int ZZ_PUSHBACK_2BIG = 2;
+
+ /* error messages for the codes above */
+ private static final String ZZ_ERROR_MSG[] = {
+ "Unkown internal scanner error",
+ "Error: could not match input",
+ "Error: pushback value was too large"
+ };
+
+ /**
+ * ZZ_ATTRIBUTE[aState] contains the attributes of state <code>aState</code>
+ */
+ private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute();
+
+ private static final String ZZ_ATTRIBUTE_PACKED_0 =
+ "\1\0\1\11\5\1\1\11\1\1\15\0\6\1\2\0"+
+ "\5\1\1\0\2\1";
+
+ private static int [] zzUnpackAttribute() {
+ int [] result = new int[38];
+ int offset = 0;
+ offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result);
+ return result;
+ }
+
+ private static int zzUnpackAttribute(String packed, int offset, int [] result) {
+ int i = 0; /* index in packed string */
+ int j = offset; /* index in unpacked array */
+ int l = packed.length();
+ while (i < l) {
+ int count = packed.charAt(i++);
+ int value = packed.charAt(i++);
+ do result[j++] = value; while (--count > 0);
+ }
+ return j;
+ }
+
+ /** the input device */
+ private java.io.Reader zzReader;
+
+ /** the current state of the DFA */
+ private int zzState;
+
+ /** the current lexical state */
+ private int zzLexicalState = YYINITIAL;
+
+ /** this buffer contains the current text to be matched and is
+ the source of the yytext() string */
+ private char zzBuffer[] = new char[ZZ_BUFFERSIZE];
+
+ /** the textposition at the last accepting state */
+ private int zzMarkedPos;
+
+ /** the textposition at the last state to be included in yytext */
+ private int zzPushbackPos;
+
+ /** the current text position in the buffer */
+ private int zzCurrentPos;
+
+ /** startRead marks the beginning of the yytext() string in the buffer */
+ private int zzStartRead;
+
+ /** endRead marks the last character in the buffer, that has been read
+ from input */
+ private int zzEndRead;
+
+ /** number of newlines encountered up to the start of the matched text */
+ private int yyline;
+
+ /** the number of characters up to the start of the matched text */
+ private int yychar;
+
+ /**
+ * the number of characters from the last newline up to the start of the
+ * matched text
+ */
+ private int yycolumn;
+
+ /**
+ * zzAtBOL == true <=> the scanner is currently at the beginning of a line
+ */
+ private boolean zzAtBOL = true;
+
+ /** zzAtEOF == true <=> the scanner is at the EOF */
+ private boolean zzAtEOF;
+
+ /* user code: */
+
+public static final int ALPHANUM = 0;
+public static final int APOSTROPHE = 1;
+public static final int ACRONYM = 2;
+public static final int COMPANY = 3;
+public static final int EMAIL = 4;
+public static final int HOST = 5;
+public static final int NUM = 6;
+public static final int CJ = 7;
+
+public static final String [] TOKEN_TYPES = new String [] {
+ "<ALPHANUM>",
+ "<APOSTROPHE>",
+ "<ACRONYM>",
+ "<COMPANY>",
+ "<EMAIL>",
+ "<HOST>",
+ "<NUM>",
+ "<CJ>"
+};
+
+public final int yychar()
+{
+ return yychar;
+}
+
+
+ /**
+ * Creates a new scanner
+ * There is also a java.io.InputStream version of this constructor.
+ *
+ * @param in the java.io.Reader to read input from.
+ */
+ StandardTokenizerImpl(java.io.Reader in) {
+ this.zzReader = in;
+ }
+
+ /**
+ * Creates a new scanner.
+ * There is also java.io.Reader version of this constructor.
+ *
+ * @param in the java.io.Inputstream to read input from.
+ */
+ StandardTokenizerImpl(java.io.InputStream in) {
+ this(new java.io.InputStreamReader(in));
+ }
+
+ /**
+ * Unpacks the compressed character translation table.
+ *
+ * @param packed the packed character translation table
+ * @return the unpacked character translation table
+ */
+ private static char [] zzUnpackCMap(String packed) {
+ char [] map = new char[0x10000];
+ int i = 0; /* index in packed string */
+ int j = 0; /* index in unpacked array */
+ while (i < 156) {
+ int count = packed.charAt(i++);
+ char value = packed.charAt(i++);
+ do map[j++] = value; while (--count > 0);
+ }
+ return map;
+ }
+
+
+ /**
+ * Refills the input buffer.
+ *
+ * @return <code>false</code>, iff there was new input.
+ *
+ * @exception java.io.IOException if any I/O-Error occurs
+ */
+ private boolean zzRefill() throws java.io.IOException {
+
+ /* first: make room (if you can) */
+ if (zzStartRead > 0) {
+ System.arraycopy(zzBuffer, zzStartRead,
+ zzBuffer, 0,
+ zzEndRead-zzStartRead);
+
+ /* translate stored positions */
+ zzEndRead-= zzStartRead;
+ zzCurrentPos-= zzStartRead;
+ zzMarkedPos-= zzStartRead;
+ zzPushbackPos-= zzStartRead;
+ zzStartRead = 0;
+ }
+
+ /* is the buffer big enough? */
+ if (zzCurrentPos >= zzBuffer.length) {
+ /* if not: blow it up */
+ char newBuffer[] = new char[zzCurrentPos*2];
+ System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length);
+ zzBuffer = newBuffer;
+ }
+
+ /* finally: fill the buffer with new input */
+ int numRead = zzReader.read(zzBuffer, zzEndRead,
+ zzBuffer.length-zzEndRead);
+
+ if (numRead < 0) {
+ return true;
+ }
+ else {
+ zzEndRead+= numRead;
+ return false;
+ }
+ }
+
+
+ /**
+ * Closes the input stream.
+ */
+ public final void yyclose() throws java.io.IOException {
+ zzAtEOF = true; /* indicate end of file */
+ zzEndRead = zzStartRead; /* invalidate buffer */
+
+ if (zzReader != null)
+ zzReader.close();
+ }
+
+
+ /**
+ * Resets the scanner to read from a new input stream.
+ * Does not close the old reader.
+ *
+ * All internal variables are reset, the old input stream
+ * <b>cannot</b> be reused (internal buffer is discarded and lost).
+ * Lexical state is set to <tt>ZZ_INITIAL</tt>.
+ *
+ * @param reader the new input stream
+ */
+ public final void yyreset(java.io.Reader reader) {
+ zzReader = reader;
+ zzAtBOL = true;
+ zzAtEOF = false;
+ zzEndRead = zzStartRead = 0;
+ zzCurrentPos = zzMarkedPos = zzPushbackPos = 0;
+ yyline = yychar = yycolumn = 0;
+ zzLexicalState = YYINITIAL;
+ }
+
+
+ /**
+ * Returns the current lexical state.
+ */
+ public final int yystate() {
+ return zzLexicalState;
+ }
+
+
+ /**
+ * Enters a new lexical state
+ *
+ * @param newState the new lexical state
+ */
+ public final void yybegin(int newState) {
+ zzLexicalState = newState;
+ }
+
+
+ /**
+ * Returns the text matched by the current regular expression.
+ */
+ public final String yytext() {
+ return new String( zzBuffer, zzStartRead, zzMarkedPos-zzStartRead );
+ }
+
+
+ /**
+ * Returns the character at position <tt>pos</tt> from the
+ * matched text.
+ *
+ * It is equivalent to yytext().charAt(pos), but faster
+ *
+ * @param pos the position of the character to fetch.
+ * A value from 0 to yylength()-1.
+ *
+ * @return the character at position pos
+ */
+ public final char yycharat(int pos) {
+ return zzBuffer[zzStartRead+pos];
+ }
+
+
+ /**
+ * Returns the length of the matched text region.
+ */
+ public final int yylength() {
+ return zzMarkedPos-zzStartRead;
+ }
+
+
+ /**
+ * Reports an error that occured while scanning.
+ *
+ * In a wellformed scanner (no or only correct usage of
+ * yypushback(int) and a match-all fallback rule) this method
+ * will only be called with things that "Can't Possibly Happen".
+ * If this method is called, something is seriously wrong
+ * (e.g. a JFlex bug producing a faulty scanner etc.).
+ *
+ * Usual syntax/scanner level error handling should be done
+ * in error fallback rules.
+ *
+ * @param errorCode the code of the errormessage to display
+ */
+ private void zzScanError(int errorCode) {
+ String message;
+ try {
+ message = ZZ_ERROR_MSG[errorCode];
+ }
+ catch (ArrayIndexOutOfBoundsException e) {
+ message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR];
+ }
+
+ throw new Error(message);
+ }
+
+
+ /**
+ * Pushes the specified amount of characters back into the input stream.
+ *
+ * They will be read again by then next call of the scanning method
+ *
+ * @param number the number of characters to be read again.
+ * This number must not be greater than yylength()!
+ */
+ public void yypushback(int number) {
+ if ( number > yylength() )
+ zzScanError(ZZ_PUSHBACK_2BIG);
+
+ zzMarkedPos -= number;
+ }
+
+
+ /**
+ * Resumes scanning until the next regular expression is matched,
+ * the end of input is encountered or an I/O-Error occurs.
+ *
+ * @return the next token
+ * @exception java.io.IOException if any I/O-Error occurs
+ */
+ public int getNextToken() throws java.io.IOException {
+ int zzInput;
+ int zzAction;
+
+ // cached fields:
+ int zzCurrentPosL;
+ int zzMarkedPosL;
+ int zzEndReadL = zzEndRead;
+ char [] zzBufferL = zzBuffer;
+ char [] zzCMapL = ZZ_CMAP;
+
+ int [] zzTransL = ZZ_TRANS;
+ int [] zzRowMapL = ZZ_ROWMAP;
+ int [] zzAttrL = ZZ_ATTRIBUTE;
+
+ while (true) {
+ zzMarkedPosL = zzMarkedPos;
+
+ yychar+= zzMarkedPosL-zzStartRead;
+
+ zzAction = -1;
+
+ zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL;
+
+ zzState = zzLexicalState;
+
+
+ zzForAction: {
+ while (true) {
+
+ if (zzCurrentPosL < zzEndReadL)
+ zzInput = zzBufferL[zzCurrentPosL++];
+ else if (zzAtEOF) {
+ zzInput = YYEOF;
+ break zzForAction;
+ }
+ else {
+ // store back cached positions
+ zzCurrentPos = zzCurrentPosL;
+ zzMarkedPos = zzMarkedPosL;
+ boolean eof = zzRefill();
+ // get translated positions and possibly new buffer
+ zzCurrentPosL = zzCurrentPos;
+ zzMarkedPosL = zzMarkedPos;
+ zzBufferL = zzBuffer;
+ zzEndReadL = zzEndRead;
+ if (eof) {
+ zzInput = YYEOF;
+ break zzForAction;
+ }
+ else {
+ zzInput = zzBufferL[zzCurrentPosL++];
+ }
+ }
+ int zzNext = zzTransL[ zzRowMapL[zzState] + zzCMapL[zzInput] ];
+ if (zzNext == -1) break zzForAction;
+ zzState = zzNext;
+
+ int zzAttributes = zzAttrL[zzState];
+ if ( (zzAttributes & 1) == 1 ) {
+ zzAction = zzState;
+ zzMarkedPosL = zzCurrentPosL;
+ if ( (zzAttributes & 8) == 8 ) break zzForAction;
+ }
+
+ }
+ }
+
+ // store back cached position
+ zzMarkedPos = zzMarkedPosL;
+
+ switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
+ case 6:
+ { return HOST;
+ }
+ case 10: break;
+ case 8:
+ { return ACRONYM;
+ }
+ case 11: break;
+ case 1:
+ { /* ignore */
+ }
+ case 12: break;
+ case 4:
+ { return NUM;
+ }
+ case 13: break;
+ case 3:
+ { return CJ;
+ }
+ case 14: break;
+ case 2:
+ { return ALPHANUM;
+ }
+ case 15: break;
+ case 7:
+ { return COMPANY;
+ }
+ case 16: break;
+ case 5:
+ { return APOSTROPHE;
+ }
+ case 17: break;
+ case 9:
+ { return EMAIL;
+ }
+ case 18: break;
+ default:
+ if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
+ zzAtEOF = true;
+ return YYEOF;
+ }
+ else {
+ zzScanError(ZZ_NO_MATCH);
+ }
+ }
+ }
+ }
+
+
+}
Index: E:/projects/lucene/trunk/src/java/org/apache/lucene/analysis/standard/package.html
===================================================================
--- E:/projects/lucene/trunk/src/java/org/apache/lucene/analysis/standard/package.html (revision 560135)
+++ E:/projects/lucene/trunk/src/java/org/apache/lucene/analysis/standard/package.html (working copy)
@@ -2,14 +2,9 @@
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
- <meta name="Author" content="Doug Cutting">
+ <meta name="Author" content="Stanislaw Osinski">
</head>
<body>
-A grammar-based tokenizer constructed with JavaCC.
-<p>Note that JavaCC defines lots of public classes, methods and fields
-that do not need to be public.&nbsp; These clutter the documentation.&nbsp;
-Sorry.
-<p>Note that because JavaCC defines a class named <tt>Token</tt>, <tt>org.apache.lucene.analysis.Token</tt>
-must always be fully qualified in source code in this package.
+A fast grammar-based tokenizer constructed with JFlex.
</body>
</html>
Index: E:/projects/lucene/trunk/build.xml
===================================================================
--- E:/projects/lucene/trunk/build.xml (revision 560135)
+++ E:/projects/lucene/trunk/build.xml (working copy)
@@ -7,9 +7,9 @@
The ASF licenses this file to You under the Apache License, Version 2.0
the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
-
+
http://www.apache.org/licenses/LICENSE-2.0
-
+
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -23,7 +23,7 @@
<import file="common-build.xml"/>
- <property name="build.demo.template" value="src/demo/demo-build.template"/>
+ <property name="build.demo.template" value="src/demo/demo-build.template"/>
<property name="demo.name" value="lucene-demos-${version}"/>
<property name="demo.war.name" value="luceneweb"/>
@@ -31,7 +31,7 @@
<!-- Type of checksum to compute for distribution files -->
<property name="checksum.algorithm" value="md5" />
-
+
<!-- Build classpath -->
<path id="classpath">
<pathelement location="${build.dir}/classes/java"/>
@@ -98,7 +98,7 @@
<target name="test" depends="test-core, test-contrib"
description="Runs all unit tests (including contribs)"
/>
-
+
<!-- ================================================================== -->
<!-- J A R -->
<!-- ================================================================== -->
@@ -122,11 +122,11 @@
value="${version}"/>
<attribute name="Implementation-Vendor"
value="The Apache Software Foundation"/>
- <attribute name="X-Compile-Source-JDK"
+ <attribute name="X-Compile-Source-JDK"
value="${javac.source}"/>
- <attribute name="X-Compile-Target-JDK"
+ <attribute name="X-Compile-Target-JDK"
value="${javac.target}"/>
- </manifest>
+ </manifest>
<metainf dir="${common.dir}">
<include name="LICENSE.txt"/>
<include name="NOTICE.txt"/>
@@ -152,11 +152,11 @@
value="${version}"/>
<attribute name="Implementation-Vendor"
value="The Apache Software Foundation"/>
- <attribute name="X-Compile-Source-JDK"
+ <attribute name="X-Compile-Source-JDK"
value="${javac.source}"/>
- <attribute name="X-Compile-Target-JDK"
+ <attribute name="X-Compile-Target-JDK"
value="${javac.target}"/>
- </manifest>
+ </manifest>
<metainf dir="${common.dir}">
<include name="LICENSE.txt"/>
<include name="NOTICE.txt"/>
@@ -224,7 +224,7 @@
<!-- ================================================================== -->
<target name="package" depends="jar-core, javadocs, war-demo, build-contrib, init-dist">
<copy file="${build.demo.template}" tofile="${build.dir}/build-demo.xml">
- <filterset begintoken="@PLACEHOLDER_" endtoken="@">
+ <filterset begintoken="@PLACEHOLDER_" endtoken="@">
<filter token="version" value="${version}"/>
<filter token="javac.source" value="${javac.source}"/>
<filter token="javac.target" value="${javac.target}"/>
@@ -372,14 +372,14 @@
<checksum forceOverwrite="yes" fileext=".md5">
<fileset dir="${maven.dist.dir}" excludes="**/*.md5"/>
</checksum>
-
+
<!-- do sha1 checksums -->
<checksum forceOverwrite="yes" algorithm="SHA" fileext=".sha1">
<fileset dir="${maven.dist.dir}" excludes="**/*.sha1, **/*.md5"/>
</checksum>
</sequential>
</target>
-
+
<!-- ================================================================== -->
<!-- Build the JavaCC files into the source tree -->
<!-- ================================================================== -->
@@ -414,35 +414,44 @@
</delete>
</target>
- <target name="javacc" depends="clean-javacc,javacc-StandardAnalyzer,javacc-QueryParser,javacc-HTMLParser"/>
+ <target name="javacc" depends="clean-javacc,javacc-QueryParser,javacc-HTMLParser"/>
- <target name="javacc-StandardAnalyzer" depends="init,javacc-check" if="javacc.present">
- <!-- generate this in a build directory so we can exclude ParseException -->
- <mkdir dir="${build.dir}/gen/org/apache/lucene/analysis/standard"/>
-
- <invoke-javacc target="src/java/org/apache/lucene/analysis/standard/StandardTokenizer.jj"
- outputDir="${build.dir}/gen/org/apache/lucene/analysis/standard"
- />
- <copy todir="src/java/org/apache/lucene/analysis/standard">
- <fileset dir="${build.dir}/gen/org/apache/lucene/analysis/standard">
- <include name="*.java"/>
- <exclude name="ParseException.java"/>
- </fileset>
- </copy>
- </target>
-
<target name="javacc-QueryParser" depends="init,javacc-check" if="javacc.present">
<invoke-javacc target="src/java/org/apache/lucene/queryParser/QueryParser.jj"
outputDir="src/java/org/apache/lucene/queryParser"
/>
</target>
-
+
<target name="javacc-HTMLParser" depends="init,javacc-check" if="javacc.present">
<invoke-javacc target="src/demo/org/apache/lucene/demo/html/HTMLParser.jj"
outputDir="src/demo/org/apache/lucene/demo/html"
/>
</target>
-
+
+ <!-- ================================================================== -->
+ <!-- Build the JFlex files into the source tree -->
+ <!-- ================================================================== -->
+
+ <target name="jflex" depends="clean-jflex,jflex-StandardAnalyzer" />
+
+ <target name="jflex-StandardAnalyzer" depends="init,jflex-check" if="jflex.present">
+ <taskdef classname="JFlex.anttask.JFlexTask" name="jflex">
+ <classpath location="${jflex.home}/lib/JFlex.jar" />
+ </taskdef>
+
+ <jflex file="src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex"
+ outdir="src/java/org/apache/lucene/analysis/standard"
+ nobak="on" />
+ </target>
+
+ <target name="clean-jflex">
+ <delete>
+ <fileset dir="src/java/org/apache/lucene/analysis/standard" includes="*.java">
+ <containsregexp expression="generated.*by.*JFlex"/>
+ </fileset>
+ </delete>
+ </target>
+
<macrodef name="contrib-crawl">
<attribute name="target" default=""/>
<attribute name="failonerror" default="true"/>
@@ -485,7 +494,7 @@
<attribute name="access"/>
<attribute name="destdir"/>
<sequential>
-
+
<dirset dir="contrib/gdata-server/src/core/src/java" id="gdata-server-core">
<include name="**" if="build-1-5-contrib" />
</dirset>
@@ -520,7 +529,7 @@
<!-- please keep this list up to date, and in alpha order... -->
<!-- with the minor exception of gdata which is managed by ref -->
-
+
<!-- ie: `find contrib/* -path \*src/java | grep -v gdata | sort` -->
<!-- if you make changes to the list of package sets, also -->
@@ -552,7 +561,7 @@
<!-- If the main javadoc Group listing includes an "Other -->
<!-- Packages" group after the ones listed here, then those -->
<!-- packages are not being matched by any of these rules -->
-
+
<group title="Core" packages="org.apache.*:org.apache.lucene.analysis:org.apache.lucene.analysis.standard*"/>
<group title="Demo" packages="org.apache.lucene.demo*"/>
@@ -585,7 +594,7 @@
</sequential>
</macrodef>
-
+
<!-- Macro for building checksum files
This is only needed until the "format" option is supported
by ant's built in checksum task
Index: E:/projects/lucene/trunk/contrib/miscellaneous/src/test/org/apache/lucene/misc/ChainedFilterTest.java
===================================================================
--- E:/projects/lucene/trunk/contrib/miscellaneous/src/test/org/apache/lucene/misc/ChainedFilterTest.java (revision 560135)
+++ E:/projects/lucene/trunk/contrib/miscellaneous/src/test/org/apache/lucene/misc/ChainedFilterTest.java (working copy)
@@ -18,8 +18,7 @@
*/
import junit.framework.TestCase;
-import java.util.Calendar;
-import java.util.Date;
+import java.util.*;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import org.apache.lucene.index.IndexWriter;
@@ -131,7 +130,7 @@
}
private Date parseDate(String s) throws ParseException {
- return new SimpleDateFormat("yyyy MMM dd").parse(s);
+ return new SimpleDateFormat("yyyy MMM dd", Locale.US).parse(s);
}
}