blob: 66099b4ceeda0800cfbf76a60b54b367519fc2ad [file] [log] [blame]
Index: contrib/regex/build.xml
===================================================================
--- contrib/regex/build.xml (revision 725886)
+++ contrib/regex/build.xml (working copy)
@@ -24,7 +24,7 @@
</description>
<path id="additional.dependencies">
- <fileset dir="lib" includes="*-oro-*.jar,*-regexp-*.jar"/>
+ <fileset dir="lib" includes="*-oro-*.jar,*-regexp-*.jar,*automaton*.jar"/>
</path>
<pathconvert property="project.classpath"
Index: contrib/regex/lib/automaton.LICENSE
===================================================================
--- contrib/regex/lib/automaton.LICENSE (revision 0)
+++ contrib/regex/lib/automaton.LICENSE (revision 0)
@@ -0,0 +1,29 @@
+
+The BSD License
+
+ The following is a BSD license template. To generate your own license, change the values of OWNER, ORGANIZATION and YEAR from their original values as given here, and substitute your own. Also, you may optionally omit clause 3 and still be OSD conformant.
+
+ Note: On January 9th, 2008 the OSI Board approved the "Simplified BSD License" variant used by FreeBSD and others, which omits the final "no-endorsement" clause and is thus roughly equivalent to the MIT License.
+
+ Historical Note: The original license used on BSD Unix had four clauses. The advertising clause (the third of four clauses) required you to acknowledge use of U.C. Berkeley code in your advertising of any product using that code. It was officially rescinded by the Director of the Office of Technology Licensing of the University of California on July 22nd, 1999. He states that clause 3 is "hereby deleted in its entirety." The four clause license has not been approved by OSI. The license below does not contain the advertising clause.
+
+ This prelude is not part of the license.
+
+<OWNER> = Regents of the University of California
+<ORGANIZATION> = University of California, Berkeley
+<YEAR> = 1998
+
+In the original BSD license, both occurrences of the phrase "COPYRIGHT HOLDERS AND CONTRIBUTORS" in the disclaimer read "REGENTS AND CONTRIBUTORS".
+
+Here is the license template:
+
+Copyright (c) <YEAR>, <OWNER>
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
+ * Neither the name of the <ORGANIZATION> nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
Index: contrib/regex/lib/automaton.jar
===================================================================
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Property changes on: contrib\regex\lib\automaton.jar
___________________________________________________________________
Added: svn:mime-type
+ application/octet-stream
Index: contrib/regex/src/java/org/apache/lucene/search/regex/AutomatonFilter.java
===================================================================
--- contrib/regex/src/java/org/apache/lucene/search/regex/AutomatonFilter.java (revision 0)
+++ contrib/regex/src/java/org/apache/lucene/search/regex/AutomatonFilter.java (revision 0)
@@ -0,0 +1,213 @@
+package org.apache.lucene.search.regex;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermDocs;
+import org.apache.lucene.index.TermEnum;
+import org.apache.lucene.search.DocIdSet;
+import org.apache.lucene.search.Filter;
+import org.apache.lucene.util.OpenBitSet;
+
+import dk.brics.automaton.Automaton;
+import dk.brics.automaton.RegExp;
+import dk.brics.automaton.RunAutomaton;
+
+/**
+ * <p>
+ * A RegexpFilter that utilizes the BRICS automaton package: http://www.brics.dk/automaton/
+ * </p>
+ *
+ * <p>
+ * The expression is converted to a DFA, and the state machine is used to optimize term enumeration.
+ * </p>
+ *
+ * <p>
+ * Please note that this regex syntax is a bit different from others in that ^ and $ are implied.
+ * </p>
+ *
+ */
+
+public class AutomatonFilter extends Filter {
+ private final Term term; /* field to enumerate plus the regular expression text */
+ private static final int ACCEPTED = -2; /* sentinel returned by run() when the whole string ends in an accept state */
+
+ /**
+ * Creates a filter for the field and regular expression carried by <code>term</code>.
+ * <p>
+ * The term text is expected to use the regex syntax of the BRICS package:
+ * http://www.brics.dk/automaton/
+ * </p>
+ * <p>
+ * Please note that this regex syntax is a bit different from others in that ^ and $ are implied.
+ * </p>
+ *
+ * @param term Term containing field and regular expression
+ */
+ public AutomatonFilter(Term term) {
+   /* Filter's no-argument constructor still runs, implicitly; the expression is not parsed here */
+   this.term = term;
+ }
+
+ /**
+ * Collects every document containing at least one term of the field that the expression accepts.
+ * Terms are enumerated in order, but instead of a binary accept/reject, a rejected term is cut
+ * back to the portion that did not enter a reject state in the DFA, and the enumeration seeks to
+ * the next possible string. Because the skip is bounded by alphabet size, it could be improved
+ * with the Automaton State/Transition classes, which provide character intervals; that might
+ * help for large numbers of CJK terms where the "alphabet" is large.
+ *
+ * @param reader index whose terms and postings are read
+ * @return bit set with one bit set per matching document
+ * @throws IOException if reading terms or postings fails
+ * @see org.apache.lucene.search.Filter#getDocIdSet(org.apache.lucene.index.IndexReader)
+ */
+ public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
+   OpenBitSet bits = new OpenBitSet(reader.maxDoc());
+   Automaton automaton = new RegExp(term.text()).toAutomaton();
+   String prefix = automaton.getCommonPrefix();
+   RunAutomaton runAutomaton = new RunAutomaton(automaton);
+   TermDocs termDocs = reader.termDocs();
+   TermEnum enumerator = null;
+   try {
+     /* if there is a static prefix, start there rather than at the first term of the field */
+     enumerator = reader.terms(term.createTerm(prefix));
+     Term t;
+     while ((t = enumerator.term()) != null) {
+       /* a different field means every term we need was seen; compared by value, not identity */
+       if (!t.field().equals(term.field())) {
+         break;
+       }
+       String termText = t.text();
+       /* ACCEPTED, or the highest index that did not go into a reject state */
+       int acceptStatus = run(runAutomaton, termText);
+       if (acceptStatus == ACCEPTED) {
+         /* match: add all the docs and keep enumerating */
+         termDocs.seek(t);
+         while (termDocs.next()) {
+           bits.set(termDocs.doc());
+         }
+         enumerator.next();
+       } else {
+         /* no match: seek to the next possible string after the portion that wasn't rejected */
+         enumerator.close();
+         enumerator = null; /* already closed; must not be closed again if the seek below throws */
+         enumerator = reader.terms(term.createTerm(nextString(termText, acceptStatus)));
+       }
+     }
+   } finally {
+     /* close both cursors on every path, including an IOException in the middle of the scan */
+     try {
+       if (enumerator != null) {
+         enumerator.close();
+       }
+     } finally {
+       termDocs.close();
+     }
+   }
+   return bits;
+ }
+
+ /**
+ * Helper function to generate the next possible Unicode String: keeps the characters up to and
+ * including index <code>acceptStatus</code>, then appends the following character plus one, or
+ * U+0000 when no character follows.
+ *
+ * @param termText String value of term
+ * @param acceptStatus max character position that did not enter into a reject state
+ * @return next possible unicode String
+ */
+ private static final String nextString(String termText, int acceptStatus) {
+   final int length = termText.length();
+   if (length == 0) {
+     return "\u0000"; /* empty string */
+   }
+   /* first index past the portion that is kept */
+   final int cut = acceptStatus + 1;
+   final StringBuffer next = new StringBuffer(termText.substring(0, cut));
+   /* -1 stands for "no character follows"; the increment below turns it into U+0000 */
+   final int following = (cut < length) ? (int) termText.charAt(cut) : -1;
+   /* relies on terms never containing U+FFFF; the cast would wrap that around to U+0000 */
+   next.append((char) (following + 1));
+   return next.toString();
+ }
+
+
+ /**
+ * Steps the automaton through <code>s</code> one character at a time and reports how far it got.
+ * This method returns one of three values:
+ * <ul>
+ * <li>ACCEPTED (-2): this is a match, the whole string was consumed and ends in an accept state.</li>
+ * <li>-1: no character was consumed without entering a reject state.</li>
+ * <li>n: position of the last character consumed without entering a reject state.</li>
+ * </ul>
+ *
+ * @param ra RunAutomaton
+ * @param s String
+ * @return index of the longest portion that doesn't enter reject state, or ACCEPTED, or -1
+ */
+ private static final int run(RunAutomaton ra, String s) {
+   int state = ra.getInitialState();
+   final int length = s.length();
+   /* index of the last character consumed without entering a reject state */
+   int lastGood = -1;
+   while (lastGood + 1 < length) {
+     state = ra.step(state, s.charAt(lastGood + 1));
+     if (state == -1) {
+       return lastGood; /* a run that entered the reject state is never reported as ACCEPTED */
+     }
+     lastGood++;
+   }
+   /* everything was consumed: a match only when the final state is an accept state */
+   final boolean accepted = state >= 0 && ra.isAccept(state);
+   return accepted ? ACCEPTED : lastGood;
+ }
+
+ /**
+ * Hash code computed from the term alone; a null term counts as 0.
+ */
+ public int hashCode() {
+   final int termHash = (term != null) ? term.hashCode() : 0;
+   /* same value as folding the term hash into a seed of 1 with the multiplier 31 */
+   final int seed = 31 * 1;
+   return seed + termHash;
+ }
+
+ /**
+ * Two filters are equal when they have exactly the same class and equal terms.
+ */
+ public boolean equals(Object obj) {
+   if (obj == this) {
+     return true;
+   }
+   /* null and instances of any other class (subclasses included) never match */
+   if (obj == null || obj.getClass() != getClass()) {
+     return false;
+   }
+   final Term theirs = ((AutomatonFilter) obj).term;
+   if (term == null) {
+     /* a null term only equals another null term */
+     return theirs == null;
+   }
+   return term.equals(theirs);
+ }
+
+}
Index: contrib/regex/src/java/org/apache/lucene/search/regex/AutomatonQuery.java
===================================================================
--- contrib/regex/src/java/org/apache/lucene/search/regex/AutomatonQuery.java (revision 0)
+++ contrib/regex/src/java/org/apache/lucene/search/regex/AutomatonQuery.java (revision 0)
@@ -0,0 +1,115 @@
+package org.apache.lucene.search.regex;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.ConstantScoreQuery;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.util.ToStringUtils;
+
+/**
+ * <p>
+ * A RegexpQuery that utilizes the BRICS automaton package: http://www.brics.dk/automaton/
+ * </p>
+ *
+ * <p>
+ * Queries are converted to a DFA, and the state machine is used to optimize term enumeration.
+ * </p>
+ * <p>Score is constant and equal to the boost.
+ * </p>
+ *
+ * <p>
+ * Please note that this regex syntax is a bit different from others in that ^ and $ are implied.
+ * </p>
+ *
+ */
+public class AutomatonQuery extends Query {
+ protected Term term; /* field plus regular expression; handed to AutomatonFilter on rewrite */
+
+ /**
+ * <p>
+ * Construct a new AutomatonQuery.
+ * </p>
+ * <p>Term is expected to contain regex syntax compatible with the BRICS package:
+ * http://www.brics.dk/automaton/
+ * </p>
+ * <p>
+ * Please note that this regex syntax is a bit different from others in that ^ and $ are implied.
+ * </p>
+ * @param term Term containing field and regular expression
+ */
+ public AutomatonQuery(Term term) {
+ this.term = term;
+ }
+
+ /** Rewrites into a ConstantScoreQuery over an AutomatonFilter for the same term, keeping the boost. */
+ public Query rewrite(IndexReader reader) throws IOException {
+   /* the reader is not consulted here; the filter is only constructed, not run */
+   final AutomatonFilter filter = new AutomatonFilter(term);
+   final Query constantScore = new ConstantScoreQuery(filter);
+   constantScore.setBoost(getBoost());
+   return constantScore;
+ }
+
+ /**
+ * Renders as <code>automatonQuery(term)</code> followed by the boost suffix from ToStringUtils.
+ * The <code>field</code> argument is not used.
+ *
+ * @param field ignored
+ * @return printable form of this query
+ */
+ public String toString(String field) {
+   final String described = "automatonQuery(" + term + ")";
+   return described + ToStringUtils.boost(getBoost());
+ }
+
+
+ /**
+ * Hash code computed from the term alone; a null term counts as 0.
+ */
+ public int hashCode() {
+   final int termHash = (term != null) ? term.hashCode() : 0;
+   /* same value as folding the term hash into a seed of 1 with the multiplier 31 */
+   final int seed = 31 * 1;
+   return seed + termHash;
+ }
+
+
+ /**
+ * Two queries are equal when they have exactly the same class and equal terms.
+ */
+ public boolean equals(Object obj) {
+   if (obj == this) {
+     return true;
+   }
+   /* null and instances of any other class (subclasses included) never match */
+   if (obj == null || obj.getClass() != getClass()) {
+     return false;
+   }
+   final Term theirs = ((AutomatonQuery) obj).term;
+   if (term == null) {
+     /* a null term only equals another null term */
+     return theirs == null;
+   }
+   return term.equals(theirs);
+ }
+
+}
Index: contrib/regex/src/test/org/apache/lucene/search/regex/TestAutomatonQuery.java
===================================================================
--- contrib/regex/src/test/org/apache/lucene/search/regex/TestAutomatonQuery.java (revision 0)
+++ contrib/regex/src/test/org/apache/lucene/search/regex/TestAutomatonQuery.java (revision 0)
@@ -0,0 +1,89 @@
+package org.apache.lucene.search.regex;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import junit.framework.TestCase;
+import org.apache.lucene.store.RAMDirectory;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.analysis.SimpleAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.search.IndexSearcher;
+
+import org.apache.lucene.search.spans.SpanNearQuery;
+import org.apache.lucene.search.spans.SpanQuery;
+
+public class TestAutomatonQuery extends TestCase {
+ private IndexSearcher searcher; /* opened in setUp, closed in tearDown */
+ private final String FN = "field"; /* name of the only indexed field */
+
+ public void setUp() {
+   RAMDirectory ramDir = new RAMDirectory();
+   try {
+     /* index a single in-memory document for the tests to query */
+     Document document = new Document();
+     document.add(new Field(FN, "the quick brown fox jumps over the lazy dog", Field.Store.NO, Field.Index.ANALYZED));
+     IndexWriter indexWriter = new IndexWriter(ramDir, new SimpleAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED);
+     indexWriter.addDocument(document);
+     indexWriter.optimize();
+     indexWriter.close();
+     searcher = new IndexSearcher(ramDir);
+   } catch (Exception failure) {
+     fail(failure.toString());
+   }
+ }
+
+ public void tearDown() {
+   try {
+     searcher.close(); /* releases the searcher opened in setUp */
+   } catch (Exception failure) {
+     fail(failure.toString());
+   }
+ }
+
+ private Term newTerm(String value) { return new Term(FN, value); } /* term on the test field FN */
+
+ private int regexQueryNrHits(String regex) throws Exception {
+   /* number of hits an AutomatonQuery for the expression gets on the test field */
+   return searcher.search(new AutomatonQuery(newTerm(regex))).length();
+ }
+
+
+ public void testRegex1() throws Exception {
+ assertEquals(1, regexQueryNrHits("q.[aeiou]c.*")); /* expected to match the indexed word "quick" in full */
+ }
+
+ public void testRegex2() throws Exception {
+ assertEquals(0, regexQueryNrHits(".[aeiou]c.*")); /* no word of the indexed sentence has a vowel second and 'c' third */
+ }
+
+ public void testRegex3() throws Exception {
+ assertEquals(0, regexQueryNrHits("q.[aeiou]c")); /* ^ and $ are implied: "quic" alone is not a word of the indexed sentence */
+ }
+
+
+ public void testEquals() throws Exception {
+   AutomatonQuery query1 = new AutomatonQuery(newTerm("foo.*"));
+   AutomatonQuery query2 = new AutomatonQuery(newTerm("foo.*")); /* same field, same expression */
+   assertEquals(query1, query2);
+   assertEquals(query1.hashCode(), query2.hashCode()); /* equal queries must hash alike */
+ }
+
+}
+