This commit was manufactured by cvs2svn to create tag 'lucene_1_4_2'.
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/tags/lucene_1_4_2@150560 13f79535-47bb-0310-9956-ffa450edef68
diff --git a/CHANGES.txt b/CHANGES.txt
index b16ef3c..de5af12 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -2,11 +2,44 @@
$Id$
-1.5 RC1
+1.4.2
+
+ 1. Fixed bug #31241: Sorting could lead to incorrect results (documents
+ missing, others duplicated) if the sort keys were not unique and there
+ were more than 100 matches. (Daniel Naber)
+
+ 2. Memory leak in Sort code (bug #31240) eliminated.
+ (Rafal Krzewski via Christoph and Daniel)
+
+ 3. FuzzyQuery now takes an additional parameter that specifies the
+ minimum similarity that is required for a term to match the query.
+ The QueryParser syntax for this is term~x, where x is a floating
+ point number between 0 and 1 (a bigger number means that a higher
+ similarity is required). Furthermore, a prefix can be specified
+ for a FuzzyQuery so that only terms starting with this prefix are
+ considered similar. This can speed up FuzzyQuery greatly.
+ (Daniel Naber, Christoph Goller)
+
+ 4. PhraseQuery and PhrasePrefixQuery now allow the explicit specification
+ of relative positions. (Christoph Goller)
+
+ 5. QueryParser changes: Fix for ArrayIndexOutOfBoundsExceptions
+ (patch #9110); some unused method parameters removed; the ability
+ to specify a minimum similarity for FuzzyQuery has been added.
+ (Christoph Goller)
+
+ 6. IndexSearcher optimization: a new ScoreDoc is no longer allocated
+ for every non-zero-scoring hit. This makes 'OR' queries that
+ contain common terms substantially faster. (cutting)
+
+
+1.4.1
1. Fixed a performance bug in hit sorting code, where values were not
correctly cached. (Aviran via cutting)
+ 2. Fixed errors in file format documentation. (Daniel Naber)
+
1.4 final
diff --git a/docs/index.html b/docs/index.html
index d80a5b8..18e22b7 100644
--- a/docs/index.html
+++ b/docs/index.html
@@ -167,11 +167,10 @@
</td></tr>
<tr><td>
<blockquote>
- <h3>1 July 2004 - Lucene 1.4 Final Released</h3>
- <p>A new release of Lucene is available with many new
- features and bug fixes. See <a href="http://cvs.apache.org/viewcvs.cgi/*checkout*/jakarta-lucene/CHANGES.txt?rev=1.94">CHANGES.txt</a>
+ <h3>1 October 2004 - Lucene 1.4.2 Released</h3>
+ <p>This fixes a few bugs in 1.4.1. See <a href="http://cvs.apache.org/viewcvs.cgi/*checkout*/jakarta-lucene/CHANGES.txt?rev=1.96.2.4">CHANGES.txt</a>
for details. Binary and source distributions are
- available <a href="http://cvs.apache.org/dist/jakarta/lucene/v1.4-final/">here</a>.
+ available <a href="http://cvs.apache.org/dist/jakarta/lucene/v1.4.2/">here</a>.
</p>
</blockquote>
</p>
diff --git a/src/java/org/apache/lucene/queryParser/QueryParser.java b/src/java/org/apache/lucene/queryParser/QueryParser.java
index e1eefff..f7fdbb3 100644
--- a/src/java/org/apache/lucene/queryParser/QueryParser.java
+++ b/src/java/org/apache/lucene/queryParser/QueryParser.java
@@ -73,6 +73,7 @@
Analyzer analyzer;
String field;
int phraseSlop = 0;
+ float fuzzyMinSim = FuzzyQuery.defaultMinSimilarity;
Locale locale = Locale.getDefault();
/** Parses a query string, returning a {@link org.apache.lucene.search.Query}.
@@ -115,6 +116,33 @@
}
}
+ /**
+ * @return Returns the analyzer.
+ */
+ public Analyzer getAnalyzer() {
+ return analyzer;
+ }
+
+ /**
+ * @return Returns the field.
+ */
+ public String getField() {
+ return field;
+ }
+
+ /**
+ * Get the default minimum similarity for fuzzy queries.
+ */
+ public float getFuzzyMinSim() {
+ return fuzzyMinSim;
+ }
+ /**
+ * Set the default minimum similarity for fuzzy queries.
+ */
+ public void setFuzzyMinSim(float fuzzyMinSim) {
+ this.fuzzyMinSim = fuzzyMinSim;
+ }
+
/**
* Sets the default slop for phrases. If zero, then exact phrase matches
* are required. Default value is zero.
@@ -172,18 +200,18 @@
return locale;
}
- protected void addClause(Vector clauses, int conj, int mods, Query q) {
+ protected void addClause(Vector clauses, int conj, int mods, Query q) {
boolean required, prohibited;
// If this term is introduced by AND, make the preceding term required,
// unless it's already prohibited
- if (conj == CONJ_AND) {
+ if (clauses.size() > 0 && conj == CONJ_AND) {
BooleanClause c = (BooleanClause) clauses.elementAt(clauses.size()-1);
if (!c.prohibited)
c.required = true;
}
- if (operator == DEFAULT_OPERATOR_AND && conj == CONJ_OR) {
+ if (clauses.size() > 0 && operator == DEFAULT_OPERATOR_AND && conj == CONJ_OR) {
// If this term is introduced by OR, make the preceding term optional,
// unless it's prohibited (that means we leave -a OR b but +a OR b-->a OR b)
// notice if the input is a OR b, first term is parsed as required; without
@@ -218,9 +246,7 @@
/**
* @exception ParseException throw in overridden method to disallow
*/
- protected Query getFieldQuery(String field,
- Analyzer analyzer,
- String queryText) throws ParseException {
+ protected Query getFieldQuery(String field, String queryText) throws ParseException {
// Use the analyzer to get all the tokens, and then build a TermQuery,
// PhraseQuery, or nothing based on the term count
@@ -262,17 +288,15 @@
}
/**
- * Base implementation delegates to {@link #getFieldQuery(String,Analyzer,String)}.
+ * Base implementation delegates to {@link #getFieldQuery(String,String)}.
* This method may be overridden, for example, to return
* a SpanNearQuery instead of a PhraseQuery.
*
* @exception ParseException throw in overridden method to disallow
*/
- protected Query getFieldQuery(String field,
- Analyzer analyzer,
- String queryText,
- int slop) throws ParseException {
- Query query = getFieldQuery(field, analyzer, queryText);
+ protected Query getFieldQuery(String field, String queryText, int slop)
+ throws ParseException {
+ Query query = getFieldQuery(field, queryText);
if (query instanceof PhraseQuery) {
((PhraseQuery) query).setSlop(slop);
@@ -285,7 +309,6 @@
* @exception ParseException throw in overridden method to disallow
*/
protected Query getRangeQuery(String field,
- Analyzer analyzer,
String part1,
String part2,
boolean inclusive) throws ParseException
@@ -400,10 +423,10 @@
* @return Resulting {@link Query} built for the term
* @exception ParseException throw in overridden method to disallow
*/
- protected Query getFuzzyQuery(String field, String termStr) throws ParseException
+ protected Query getFuzzyQuery(String field, String termStr, float minSimilarity) throws ParseException
{
Term t = new Term(field, termStr);
- return new FuzzyQuery(t);
+ return new FuzzyQuery(t, minSimilarity);
}
/**
@@ -422,6 +445,25 @@
return new String(caDest, 0, j);
}
+ /**
+ * Returns a String where those characters that QueryParser
+ * expects to be escaped are escaped, i.e. preceded by a <code>\</code>.
+ */
+ public static String escape(String s) {
+ StringBuffer sb = new StringBuffer();
+ for (int i = 0; i < s.length(); i++) {
+ char c = s.charAt(i);
+ // NOTE: keep this in sync with _ESCAPED_CHAR below!
+ if (c == '\\' || c == '+' || c == '-' || c == '!' || c == '(' || c == ')' || c == ':'
+ || c == '^' || c == '[' || c == ']' || c == '\"' || c == '{' || c == '}' || c == '~'
+ || c == '*' || c == '?') {
+ sb.append('\\');
+ }
+ sb.append(c);
+ }
+ return sb.toString();
+ }
+
public static void main(String[] args) throws Exception {
QueryParser qp = new QueryParser("field",
new org.apache.lucene.analysis.SimpleAnalyzer());
@@ -587,7 +629,7 @@
}
final public Query Term(String field) throws ParseException {
- Token term, boost=null, slop=null, goop1, goop2;
+ Token term, boost=null, fuzzySlop=null, goop1, goop2;
boolean prefix = false;
boolean wildcard = false;
boolean fuzzy = false;
@@ -619,9 +661,9 @@
throw new ParseException();
}
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
- case FUZZY:
- jj_consume_token(FUZZY);
- fuzzy=true;
+ case FUZZY_SLOP:
+ fuzzySlop = jj_consume_token(FUZZY_SLOP);
+ fuzzy=true;
break;
default:
jj_la1[8] = jj_gen;
@@ -632,9 +674,9 @@
jj_consume_token(CARAT);
boost = jj_consume_token(NUMBER);
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
- case FUZZY:
- jj_consume_token(FUZZY);
- fuzzy=true;
+ case FUZZY_SLOP:
+ fuzzySlop = jj_consume_token(FUZZY_SLOP);
+ fuzzy=true;
break;
default:
jj_la1[9] = jj_gen;
@@ -653,9 +695,16 @@
discardEscapeChar(term.image.substring
(0, term.image.length()-1)));
} else if (fuzzy) {
- q = getFuzzyQuery(field, termImage);
+ float fms = fuzzyMinSim;
+ try {
+ fms = Float.valueOf(fuzzySlop.image.substring(1)).floatValue();
+ } catch (Exception ignored) { }
+ if(fms < 0.0f || fms > 1.0f){
+ {if (true) throw new ParseException("Minimum similarity for a FuzzyQuery has to be between 0.0f and 1.0f !");}
+ }
+ q = getFuzzyQuery(field, termImage, fms);
} else {
- q = getFieldQuery(field, analyzer, termImage);
+ q = getFieldQuery(field, termImage);
}
break;
case RANGEIN_START:
@@ -712,7 +761,7 @@
} else {
goop2.image = discardEscapeChar(goop2.image);
}
- q = getRangeQuery(field, analyzer, goop1.image, goop2.image, true);
+ q = getRangeQuery(field, goop1.image, goop2.image, true);
break;
case RANGEEX_START:
jj_consume_token(RANGEEX_START);
@@ -769,13 +818,13 @@
goop2.image = discardEscapeChar(goop2.image);
}
- q = getRangeQuery(field, analyzer, goop1.image, goop2.image, false);
+ q = getRangeQuery(field, goop1.image, goop2.image, false);
break;
case QUOTED:
term = jj_consume_token(QUOTED);
switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
- case SLOP:
- slop = jj_consume_token(SLOP);
+ case FUZZY_SLOP:
+ fuzzySlop = jj_consume_token(FUZZY_SLOP);
break;
default:
jj_la1[19] = jj_gen;
@@ -792,15 +841,13 @@
}
int s = phraseSlop;
- if (slop != null) {
+ if (fuzzySlop != null) {
try {
- s = Float.valueOf(slop.image.substring(1)).intValue();
+ s = Float.valueOf(fuzzySlop.image.substring(1)).intValue();
}
catch (Exception ignored) { }
}
- q = getFieldQuery(field, analyzer,
- term.image.substring(1, term.image.length()-1),
- s);
+ q = getFieldQuery(field, term.image.substring(1, term.image.length()-1), s);
break;
default:
jj_la1[21] = jj_gen;
@@ -850,16 +897,11 @@
private int jj_gen;
final private int[] jj_la1 = new int[22];
static private int[] jj_la1_0;
- static private int[] jj_la1_1;
static {
jj_la1_0();
- jj_la1_1();
}
private static void jj_la1_0() {
- jj_la1_0 = new int[] {0x180,0x180,0xe00,0xe00,0x1f31f80,0x8000,0x1f31000,0x1320000,0x40000,0x40000,0x8000,0x18000000,0x2000000,0x18000000,0x8000,0x80000000,0x20000000,0x80000000,0x8000,0x80000,0x8000,0x1f30000,};
- }
- private static void jj_la1_1() {
- jj_la1_1 = new int[] {0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x0,0x1,0x0,0x0,0x0,0x0,};
+ jj_la1_0 = new int[] {0x180,0x180,0xe00,0xe00,0xfb1f80,0x8000,0xfb1000,0x9a0000,0x40000,0x40000,0x8000,0xc000000,0x1000000,0xc000000,0x8000,0xc0000000,0x10000000,0xc0000000,0x8000,0x40000,0x8000,0xfb0000,};
}
final private JJCalls[] jj_2_rtns = new JJCalls[1];
private boolean jj_rescan = false;
@@ -1008,8 +1050,8 @@
public ParseException generateParseException() {
jj_expentries.removeAllElements();
- boolean[] la1tokens = new boolean[33];
- for (int i = 0; i < 33; i++) {
+ boolean[] la1tokens = new boolean[32];
+ for (int i = 0; i < 32; i++) {
la1tokens[i] = false;
}
if (jj_kind >= 0) {
@@ -1022,13 +1064,10 @@
if ((jj_la1_0[i] & (1<<j)) != 0) {
la1tokens[j] = true;
}
- if ((jj_la1_1[i] & (1<<j)) != 0) {
- la1tokens[32+j] = true;
- }
}
}
}
- for (int i = 0; i < 33; i++) {
+ for (int i = 0; i < 32; i++) {
if (la1tokens[i]) {
jj_expentry = new int[1];
jj_expentry[0] = i;
diff --git a/src/java/org/apache/lucene/queryParser/QueryParser.jj b/src/java/org/apache/lucene/queryParser/QueryParser.jj
index 43fbf98..05cf75c 100644
--- a/src/java/org/apache/lucene/queryParser/QueryParser.jj
+++ b/src/java/org/apache/lucene/queryParser/QueryParser.jj
@@ -96,6 +96,7 @@
Analyzer analyzer;
String field;
int phraseSlop = 0;
+ float fuzzyMinSim = FuzzyQuery.defaultMinSimilarity;
Locale locale = Locale.getDefault();
/** Parses a query string, returning a {@link org.apache.lucene.search.Query}.
@@ -137,6 +138,33 @@
throw new ParseException("Too many boolean clauses");
}
}
+
+ /**
+ * @return Returns the analyzer.
+ */
+ public Analyzer getAnalyzer() {
+ return analyzer;
+ }
+
+ /**
+ * @return Returns the field.
+ */
+ public String getField() {
+ return field;
+ }
+
+ /**
+ * Get the default minimum similarity for fuzzy queries.
+ */
+ public float getFuzzyMinSim() {
+ return fuzzyMinSim;
+ }
+ /**
+ * Set the default minimum similarity for fuzzy queries.
+ */
+ public void setFuzzyMinSim(float fuzzyMinSim) {
+ this.fuzzyMinSim = fuzzyMinSim;
+ }
/**
* Sets the default slop for phrases. If zero, then exact phrase matches
@@ -194,19 +222,19 @@
public Locale getLocale() {
return locale;
}
-
- protected void addClause(Vector clauses, int conj, int mods, Query q) {
+
+ protected void addClause(Vector clauses, int conj, int mods, Query q) {
boolean required, prohibited;
// If this term is introduced by AND, make the preceding term required,
// unless it's already prohibited
- if (conj == CONJ_AND) {
+ if (clauses.size() > 0 && conj == CONJ_AND) {
BooleanClause c = (BooleanClause) clauses.elementAt(clauses.size()-1);
if (!c.prohibited)
c.required = true;
}
- if (operator == DEFAULT_OPERATOR_AND && conj == CONJ_OR) {
+ if (clauses.size() > 0 && operator == DEFAULT_OPERATOR_AND && conj == CONJ_OR) {
// If this term is introduced by OR, make the preceding term optional,
// unless it's prohibited (that means we leave -a OR b but +a OR b-->a OR b)
// notice if the input is a OR b, first term is parsed as required; without
@@ -241,9 +269,7 @@
/**
* @exception ParseException throw in overridden method to disallow
*/
- protected Query getFieldQuery(String field,
- Analyzer analyzer,
- String queryText) throws ParseException {
+ protected Query getFieldQuery(String field, String queryText) throws ParseException {
// Use the analyzer to get all the tokens, and then build a TermQuery,
// PhraseQuery, or nothing based on the term count
@@ -285,17 +311,15 @@
}
/**
- * Base implementation delegates to {@link #getFieldQuery(String,Analyzer,String)}.
+ * Base implementation delegates to {@link #getFieldQuery(String,String)}.
* This method may be overridden, for example, to return
* a SpanNearQuery instead of a PhraseQuery.
*
* @exception ParseException throw in overridden method to disallow
*/
- protected Query getFieldQuery(String field,
- Analyzer analyzer,
- String queryText,
- int slop) throws ParseException {
- Query query = getFieldQuery(field, analyzer, queryText);
+ protected Query getFieldQuery(String field, String queryText, int slop)
+ throws ParseException {
+ Query query = getFieldQuery(field, queryText);
if (query instanceof PhraseQuery) {
((PhraseQuery) query).setSlop(slop);
@@ -308,7 +332,6 @@
* @exception ParseException throw in overridden method to disallow
*/
protected Query getRangeQuery(String field,
- Analyzer analyzer,
String part1,
String part2,
boolean inclusive) throws ParseException
@@ -423,10 +446,10 @@
* @return Resulting {@link Query} built for the term
* @exception ParseException throw in overridden method to disallow
*/
- protected Query getFuzzyQuery(String field, String termStr) throws ParseException
+ protected Query getFuzzyQuery(String field, String termStr, float minSimilarity) throws ParseException
{
Term t = new Term(field, termStr);
- return new FuzzyQuery(t);
+ return new FuzzyQuery(t, minSimilarity);
}
/**
@@ -445,6 +468,25 @@
return new String(caDest, 0, j);
}
+ /**
+ * Returns a String where those characters that QueryParser
+ * expects to be escaped are escaped, i.e. preceded by a <code>\</code>.
+ */
+ public static String escape(String s) {
+ StringBuffer sb = new StringBuffer();
+ for (int i = 0; i < s.length(); i++) {
+ char c = s.charAt(i);
+ // NOTE: keep this in sync with _ESCAPED_CHAR below!
+ if (c == '\\' || c == '+' || c == '-' || c == '!' || c == '(' || c == ')' || c == ':'
+ || c == '^' || c == '[' || c == ']' || c == '\"' || c == '{' || c == '}' || c == '~'
+ || c == '*' || c == '?') {
+ sb.append('\\');
+ }
+ sb.append(c);
+ }
+ return sb.toString();
+ }
+
public static void main(String[] args) throws Exception {
QueryParser qp = new QueryParser("field",
new org.apache.lucene.analysis.SimpleAnalyzer());
@@ -461,6 +503,7 @@
<*> TOKEN : {
<#_NUM_CHAR: ["0"-"9"] >
+// NOTE: keep this in sync with escape(String) above!
| <#_ESCAPED_CHAR: "\\" [ "\\", "+", "-", "!", "(", ")", ":", "^",
"[", "]", "\"", "{", "}", "~", "*", "?" ] >
| <#_TERM_START_CHAR: ( ~[ " ", "\t", "\n", "\r", "+", "-", "!", "(", ")", ":", "^",
@@ -495,8 +538,7 @@
| <CARAT: "^" > : Boost
| <QUOTED: "\"" (~["\""])+ "\"">
| <TERM: <_TERM_START_CHAR> (<_TERM_CHAR>)* >
-| <FUZZY: "~" >
-| <SLOP: "~" (<_NUM_CHAR>)+ >
+| <FUZZY_SLOP: "~" ( (<_NUM_CHAR>)+ ( "." (<_NUM_CHAR>)+ )? )? >
| <PREFIXTERM: <_TERM_START_CHAR> (<_TERM_CHAR>)* "*" >
| <WILDTERM: <_TERM_START_CHAR>
(<_TERM_CHAR> | ( [ "*", "?" ] ))* >
@@ -605,7 +647,7 @@
Query Term(String field) : {
- Token term, boost=null, slop=null, goop1, goop2;
+ Token term, boost=null, fuzzySlop=null, goop1, goop2;
boolean prefix = false;
boolean wildcard = false;
boolean fuzzy = false;
@@ -620,8 +662,8 @@
| term=<WILDTERM> { wildcard=true; }
| term=<NUMBER>
)
- [ <FUZZY> { fuzzy=true; } ]
- [ <CARAT> boost=<NUMBER> [ <FUZZY> { fuzzy=true; } ] ]
+ [ fuzzySlop=<FUZZY_SLOP> { fuzzy=true; } ]
+ [ <CARAT> boost=<NUMBER> [ fuzzySlop=<FUZZY_SLOP> { fuzzy=true; } ] ]
{
String termImage=discardEscapeChar(term.image);
if (wildcard) {
@@ -631,9 +673,16 @@
discardEscapeChar(term.image.substring
(0, term.image.length()-1)));
} else if (fuzzy) {
- q = getFuzzyQuery(field, termImage);
+ float fms = fuzzyMinSim;
+ try {
+ fms = Float.valueOf(fuzzySlop.image.substring(1)).floatValue();
+ } catch (Exception ignored) { }
+ if(fms < 0.0f || fms > 1.0f){
+ throw new ParseException("Minimum similarity for a FuzzyQuery has to be between 0.0f and 1.0f !");
+ }
+ q = getFuzzyQuery(field, termImage, fms);
} else {
- q = getFieldQuery(field, analyzer, termImage);
+ q = getFieldQuery(field, termImage);
}
}
| ( <RANGEIN_START> ( goop1=<RANGEIN_GOOP>|goop1=<RANGEIN_QUOTED> )
@@ -651,7 +700,7 @@
} else {
goop2.image = discardEscapeChar(goop2.image);
}
- q = getRangeQuery(field, analyzer, goop1.image, goop2.image, true);
+ q = getRangeQuery(field, goop1.image, goop2.image, true);
}
| ( <RANGEEX_START> ( goop1=<RANGEEX_GOOP>|goop1=<RANGEEX_QUOTED> )
[ <RANGEEX_TO> ] ( goop2=<RANGEEX_GOOP>|goop2=<RANGEEX_QUOTED> )
@@ -669,23 +718,21 @@
goop2.image = discardEscapeChar(goop2.image);
}
- q = getRangeQuery(field, analyzer, goop1.image, goop2.image, false);
+ q = getRangeQuery(field, goop1.image, goop2.image, false);
}
| term=<QUOTED>
- [ slop=<SLOP> ]
+ [ fuzzySlop=<FUZZY_SLOP> ]
[ <CARAT> boost=<NUMBER> ]
{
int s = phraseSlop;
- if (slop != null) {
+ if (fuzzySlop != null) {
try {
- s = Float.valueOf(slop.image.substring(1)).intValue();
+ s = Float.valueOf(fuzzySlop.image.substring(1)).intValue();
}
catch (Exception ignored) { }
}
- q = getFieldQuery(field, analyzer,
- term.image.substring(1, term.image.length()-1),
- s);
+ q = getFieldQuery(field, term.image.substring(1, term.image.length()-1), s);
}
)
{
diff --git a/src/java/org/apache/lucene/queryParser/QueryParserConstants.java b/src/java/org/apache/lucene/queryParser/QueryParserConstants.java
index f95074e..e74067f 100644
--- a/src/java/org/apache/lucene/queryParser/QueryParserConstants.java
+++ b/src/java/org/apache/lucene/queryParser/QueryParserConstants.java
@@ -20,21 +20,20 @@
int CARAT = 15;
int QUOTED = 16;
int TERM = 17;
- int FUZZY = 18;
- int SLOP = 19;
- int PREFIXTERM = 20;
- int WILDTERM = 21;
- int RANGEIN_START = 22;
- int RANGEEX_START = 23;
- int NUMBER = 24;
- int RANGEIN_TO = 25;
- int RANGEIN_END = 26;
- int RANGEIN_QUOTED = 27;
- int RANGEIN_GOOP = 28;
- int RANGEEX_TO = 29;
- int RANGEEX_END = 30;
- int RANGEEX_QUOTED = 31;
- int RANGEEX_GOOP = 32;
+ int FUZZY_SLOP = 18;
+ int PREFIXTERM = 19;
+ int WILDTERM = 20;
+ int RANGEIN_START = 21;
+ int RANGEEX_START = 22;
+ int NUMBER = 23;
+ int RANGEIN_TO = 24;
+ int RANGEIN_END = 25;
+ int RANGEIN_QUOTED = 26;
+ int RANGEIN_GOOP = 27;
+ int RANGEEX_TO = 28;
+ int RANGEEX_END = 29;
+ int RANGEEX_QUOTED = 30;
+ int RANGEEX_GOOP = 31;
int Boost = 0;
int RangeEx = 1;
@@ -60,8 +59,7 @@
"\"^\"",
"<QUOTED>",
"<TERM>",
- "\"~\"",
- "<SLOP>",
+ "<FUZZY_SLOP>",
"<PREFIXTERM>",
"<WILDTERM>",
"\"[\"",
diff --git a/src/java/org/apache/lucene/queryParser/QueryParserTokenManager.java b/src/java/org/apache/lucene/queryParser/QueryParserTokenManager.java
index c4202da..22de31f 100644
--- a/src/java/org/apache/lucene/queryParser/QueryParserTokenManager.java
+++ b/src/java/org/apache/lucene/queryParser/QueryParserTokenManager.java
@@ -54,13 +54,11 @@
case 58:
return jjStopAtPos(0, 14);
case 91:
- return jjStopAtPos(0, 22);
+ return jjStopAtPos(0, 21);
case 94:
return jjStopAtPos(0, 15);
case 123:
- return jjStopAtPos(0, 23);
- case 126:
- return jjStartNfaWithStates_3(0, 18, 18);
+ return jjStopAtPos(0, 22);
default :
return jjMoveNfa_3(0, 0);
}
@@ -105,7 +103,7 @@
{
int[] nextStates;
int startsAt = 0;
- jjnewStateCnt = 31;
+ jjnewStateCnt = 33;
int i = 1;
jjstateSet[0] = startState;
int j, kind = 0x7fffffff;
@@ -169,56 +167,67 @@
case 18:
if ((0x3ff000000000000L & l) == 0L)
break;
- if (kind > 19)
- kind = 19;
- jjstateSet[jjnewStateCnt++] = 18;
+ if (kind > 18)
+ kind = 18;
+ jjAddStates(7, 8);
break;
case 19:
+ if (curChar == 46)
+ jjCheckNAdd(20);
+ break;
+ case 20:
+ if ((0x3ff000000000000L & l) == 0L)
+ break;
+ if (kind > 18)
+ kind = 18;
+ jjCheckNAdd(20);
+ break;
+ case 21:
if ((0x7bffd0f8ffffd9ffL & l) == 0L)
break;
if (kind > 17)
kind = 17;
jjCheckNAddStates(0, 6);
break;
- case 20:
+ case 22:
if ((0x7bfff8f8ffffd9ffL & l) == 0L)
break;
if (kind > 17)
kind = 17;
- jjCheckNAddTwoStates(20, 21);
+ jjCheckNAddTwoStates(22, 23);
break;
- case 22:
+ case 24:
if ((0x84002f0600000000L & l) == 0L)
break;
if (kind > 17)
kind = 17;
- jjCheckNAddTwoStates(20, 21);
+ jjCheckNAddTwoStates(22, 23);
break;
- case 23:
+ case 25:
if ((0x7bfff8f8ffffd9ffL & l) != 0L)
- jjCheckNAddStates(7, 9);
- break;
- case 24:
- if (curChar == 42 && kind > 20)
- kind = 20;
+ jjCheckNAddStates(9, 11);
break;
case 26:
- if ((0x84002f0600000000L & l) != 0L)
- jjCheckNAddStates(7, 9);
+ if (curChar == 42 && kind > 19)
+ kind = 19;
break;
- case 27:
- if ((0xfbfffcf8ffffd9ffL & l) == 0L)
- break;
- if (kind > 21)
- kind = 21;
- jjCheckNAddTwoStates(27, 28);
+ case 28:
+ if ((0x84002f0600000000L & l) != 0L)
+ jjCheckNAddStates(9, 11);
break;
case 29:
+ if ((0xfbfffcf8ffffd9ffL & l) == 0L)
+ break;
+ if (kind > 20)
+ kind = 20;
+ jjCheckNAddTwoStates(29, 30);
+ break;
+ case 31:
if ((0x84002f0600000000L & l) == 0L)
break;
- if (kind > 21)
- kind = 21;
- jjCheckNAddTwoStates(27, 28);
+ if (kind > 20)
+ kind = 20;
+ jjCheckNAddTwoStates(29, 30);
break;
default : break;
}
@@ -239,9 +248,13 @@
jjCheckNAddStates(0, 6);
}
else if (curChar == 126)
+ {
+ if (kind > 18)
+ kind = 18;
jjstateSet[jjnewStateCnt++] = 18;
+ }
if (curChar == 92)
- jjCheckNAddStates(10, 12);
+ jjCheckNAddStates(12, 14);
else if (curChar == 78)
jjstateSet[jjnewStateCnt++] = 11;
else if (curChar == 124)
@@ -292,70 +305,73 @@
jjstateSet[jjnewStateCnt++] = 11;
break;
case 15:
- jjAddStates(13, 14);
+ jjAddStates(15, 16);
break;
case 17:
- if (curChar == 126)
- jjstateSet[jjnewStateCnt++] = 18;
+ if (curChar != 126)
+ break;
+ if (kind > 18)
+ kind = 18;
+ jjstateSet[jjnewStateCnt++] = 18;
break;
- case 19:
+ case 21:
if ((0x97ffffff97ffffffL & l) == 0L)
break;
if (kind > 17)
kind = 17;
jjCheckNAddStates(0, 6);
break;
- case 20:
+ case 22:
if ((0x97ffffff97ffffffL & l) == 0L)
break;
if (kind > 17)
kind = 17;
- jjCheckNAddTwoStates(20, 21);
- break;
- case 21:
- if (curChar == 92)
- jjCheckNAddTwoStates(22, 22);
- break;
- case 22:
- if ((0x6800000078000000L & l) == 0L)
- break;
- if (kind > 17)
- kind = 17;
- jjCheckNAddTwoStates(20, 21);
+ jjCheckNAddTwoStates(22, 23);
break;
case 23:
- if ((0x97ffffff97ffffffL & l) != 0L)
- jjCheckNAddStates(7, 9);
- break;
- case 25:
if (curChar == 92)
- jjCheckNAddTwoStates(26, 26);
+ jjCheckNAddTwoStates(24, 24);
break;
- case 26:
- if ((0x6800000078000000L & l) != 0L)
- jjCheckNAddStates(7, 9);
- break;
- case 27:
- if ((0x97ffffff97ffffffL & l) == 0L)
- break;
- if (kind > 21)
- kind = 21;
- jjCheckNAddTwoStates(27, 28);
- break;
- case 28:
- if (curChar == 92)
- jjCheckNAddTwoStates(29, 29);
- break;
- case 29:
+ case 24:
if ((0x6800000078000000L & l) == 0L)
break;
- if (kind > 21)
- kind = 21;
- jjCheckNAddTwoStates(27, 28);
+ if (kind > 17)
+ kind = 17;
+ jjCheckNAddTwoStates(22, 23);
+ break;
+ case 25:
+ if ((0x97ffffff97ffffffL & l) != 0L)
+ jjCheckNAddStates(9, 11);
+ break;
+ case 27:
+ if (curChar == 92)
+ jjCheckNAddTwoStates(28, 28);
+ break;
+ case 28:
+ if ((0x6800000078000000L & l) != 0L)
+ jjCheckNAddStates(9, 11);
+ break;
+ case 29:
+ if ((0x97ffffff97ffffffL & l) == 0L)
+ break;
+ if (kind > 20)
+ kind = 20;
+ jjCheckNAddTwoStates(29, 30);
break;
case 30:
if (curChar == 92)
- jjCheckNAddStates(10, 12);
+ jjCheckNAddTwoStates(31, 31);
+ break;
+ case 31:
+ if ((0x6800000078000000L & l) == 0L)
+ break;
+ if (kind > 20)
+ kind = 20;
+ jjCheckNAddTwoStates(29, 30);
+ break;
+ case 32:
+ if (curChar == 92)
+ jjCheckNAddStates(12, 14);
break;
default : break;
}
@@ -381,25 +397,25 @@
break;
case 15:
if (jjCanMove_0(hiByte, i1, i2, l1, l2))
- jjAddStates(13, 14);
+ jjAddStates(15, 16);
break;
- case 20:
+ case 22:
if (!jjCanMove_0(hiByte, i1, i2, l1, l2))
break;
if (kind > 17)
kind = 17;
- jjCheckNAddTwoStates(20, 21);
+ jjCheckNAddTwoStates(22, 23);
break;
- case 23:
+ case 25:
if (jjCanMove_0(hiByte, i1, i2, l1, l2))
- jjCheckNAddStates(7, 9);
+ jjCheckNAddStates(9, 11);
break;
- case 27:
+ case 29:
if (!jjCanMove_0(hiByte, i1, i2, l1, l2))
break;
- if (kind > 21)
- kind = 21;
- jjCheckNAddTwoStates(27, 28);
+ if (kind > 20)
+ kind = 20;
+ jjCheckNAddTwoStates(29, 30);
break;
default : break;
}
@@ -412,7 +428,7 @@
kind = 0x7fffffff;
}
++curPos;
- if ((i = jjnewStateCnt) == (startsAt = 31 - (jjnewStateCnt = startsAt)))
+ if ((i = jjnewStateCnt) == (startsAt = 33 - (jjnewStateCnt = startsAt)))
return curPos;
try { curChar = input_stream.readChar(); }
catch(java.io.IOException e) { return curPos; }
@@ -423,9 +439,9 @@
switch (pos)
{
case 0:
- if ((active0 & 0x20000000L) != 0L)
+ if ((active0 & 0x10000000L) != 0L)
{
- jjmatchedKind = 32;
+ jjmatchedKind = 31;
return 4;
}
return -1;
@@ -450,9 +466,9 @@
switch(curChar)
{
case 84:
- return jjMoveStringLiteralDfa1_1(0x20000000L);
+ return jjMoveStringLiteralDfa1_1(0x10000000L);
case 125:
- return jjStopAtPos(0, 30);
+ return jjStopAtPos(0, 29);
default :
return jjMoveNfa_1(0, 0);
}
@@ -467,8 +483,8 @@
switch(curChar)
{
case 79:
- if ((active0 & 0x20000000L) != 0L)
- return jjStartNfaWithStates_1(1, 29, 4);
+ if ((active0 & 0x10000000L) != 0L)
+ return jjStartNfaWithStates_1(1, 28, 4);
break;
default :
break;
@@ -497,8 +513,8 @@
case 0:
if ((0xfffffffeffffffffL & l) != 0L)
{
- if (kind > 32)
- kind = 32;
+ if (kind > 31)
+ kind = 31;
jjCheckNAdd(4);
}
if ((0x100002600L & l) != 0L)
@@ -518,14 +534,14 @@
jjCheckNAddTwoStates(2, 3);
break;
case 3:
- if (curChar == 34 && kind > 31)
- kind = 31;
+ if (curChar == 34 && kind > 30)
+ kind = 30;
break;
case 4:
if ((0xfffffffeffffffffL & l) == 0L)
break;
- if (kind > 32)
- kind = 32;
+ if (kind > 31)
+ kind = 31;
jjCheckNAdd(4);
break;
default : break;
@@ -543,12 +559,12 @@
case 4:
if ((0xdfffffffffffffffL & l) == 0L)
break;
- if (kind > 32)
- kind = 32;
+ if (kind > 31)
+ kind = 31;
jjCheckNAdd(4);
break;
case 2:
- jjAddStates(15, 16);
+ jjAddStates(17, 18);
break;
default : break;
}
@@ -569,13 +585,13 @@
case 4:
if (!jjCanMove_0(hiByte, i1, i2, l1, l2))
break;
- if (kind > 32)
- kind = 32;
+ if (kind > 31)
+ kind = 31;
jjCheckNAdd(4);
break;
case 2:
if (jjCanMove_0(hiByte, i1, i2, l1, l2))
- jjAddStates(15, 16);
+ jjAddStates(17, 18);
break;
default : break;
}
@@ -620,9 +636,9 @@
case 0:
if ((0x3ff000000000000L & l) == 0L)
break;
- if (kind > 24)
- kind = 24;
- jjAddStates(17, 18);
+ if (kind > 23)
+ kind = 23;
+ jjAddStates(19, 20);
break;
case 1:
if (curChar == 46)
@@ -631,8 +647,8 @@
case 2:
if ((0x3ff000000000000L & l) == 0L)
break;
- if (kind > 24)
- kind = 24;
+ if (kind > 23)
+ kind = 23;
jjCheckNAdd(2);
break;
default : break;
@@ -683,9 +699,9 @@
switch (pos)
{
case 0:
- if ((active0 & 0x2000000L) != 0L)
+ if ((active0 & 0x1000000L) != 0L)
{
- jjmatchedKind = 28;
+ jjmatchedKind = 27;
return 4;
}
return -1;
@@ -710,9 +726,9 @@
switch(curChar)
{
case 84:
- return jjMoveStringLiteralDfa1_2(0x2000000L);
+ return jjMoveStringLiteralDfa1_2(0x1000000L);
case 93:
- return jjStopAtPos(0, 26);
+ return jjStopAtPos(0, 25);
default :
return jjMoveNfa_2(0, 0);
}
@@ -727,8 +743,8 @@
switch(curChar)
{
case 79:
- if ((active0 & 0x2000000L) != 0L)
- return jjStartNfaWithStates_2(1, 25, 4);
+ if ((active0 & 0x1000000L) != 0L)
+ return jjStartNfaWithStates_2(1, 24, 4);
break;
default :
break;
@@ -757,8 +773,8 @@
case 0:
if ((0xfffffffeffffffffL & l) != 0L)
{
- if (kind > 28)
- kind = 28;
+ if (kind > 27)
+ kind = 27;
jjCheckNAdd(4);
}
if ((0x100002600L & l) != 0L)
@@ -778,14 +794,14 @@
jjCheckNAddTwoStates(2, 3);
break;
case 3:
- if (curChar == 34 && kind > 27)
- kind = 27;
+ if (curChar == 34 && kind > 26)
+ kind = 26;
break;
case 4:
if ((0xfffffffeffffffffL & l) == 0L)
break;
- if (kind > 28)
- kind = 28;
+ if (kind > 27)
+ kind = 27;
jjCheckNAdd(4);
break;
default : break;
@@ -803,12 +819,12 @@
case 4:
if ((0xffffffffdfffffffL & l) == 0L)
break;
- if (kind > 28)
- kind = 28;
+ if (kind > 27)
+ kind = 27;
jjCheckNAdd(4);
break;
case 2:
- jjAddStates(15, 16);
+ jjAddStates(17, 18);
break;
default : break;
}
@@ -829,13 +845,13 @@
case 4:
if (!jjCanMove_0(hiByte, i1, i2, l1, l2))
break;
- if (kind > 28)
- kind = 28;
+ if (kind > 27)
+ kind = 27;
jjCheckNAdd(4);
break;
case 2:
if (jjCanMove_0(hiByte, i1, i2, l1, l2))
- jjAddStates(15, 16);
+ jjAddStates(17, 18);
break;
default : break;
}
@@ -855,8 +871,8 @@
}
}
static final int[] jjnextStates = {
- 20, 23, 24, 27, 28, 25, 21, 23, 24, 25, 22, 26, 29, 15, 16, 2,
- 3, 0, 1,
+ 22, 25, 26, 29, 30, 27, 23, 18, 19, 25, 26, 27, 24, 28, 31, 15,
+ 16, 2, 3, 0, 1,
};
private static final boolean jjCanMove_0(int hiByte, int i1, int i2, long l1, long l2)
{
@@ -872,8 +888,8 @@
}
public static final String[] jjstrLiteralImages = {
"", null, null, null, null, null, null, null, null, null, "\53", "\55", "\50",
-"\51", "\72", "\136", null, null, "\176", null, null, null, "\133", "\173", null,
-"\124\117", "\135", null, null, "\124\117", "\175", null, null, };
+"\51", "\72", "\136", null, null, null, null, null, "\133", "\173", null, "\124\117",
+"\135", null, null, "\124\117", "\175", null, null, };
public static final String[] lexStateNames = {
"Boost",
"RangeEx",
@@ -881,18 +897,18 @@
"DEFAULT",
};
public static final int[] jjnewLexState = {
- -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, -1, -1, -1, -1, -1, -1, 2, 1, 3,
- -1, 3, -1, -1, -1, 3, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, -1, -1, -1, -1, -1, 2, 1, 3, -1,
+ 3, -1, -1, -1, 3, -1, -1,
};
static final long[] jjtoToken = {
- 0x1ffffff81L,
+ 0xffffff81L,
};
static final long[] jjtoSkip = {
0x40L,
};
protected CharStream input_stream;
-private final int[] jjrounds = new int[31];
-private final int[] jjstateSet = new int[62];
+private final int[] jjrounds = new int[33];
+private final int[] jjstateSet = new int[66];
protected char curChar;
public QueryParserTokenManager(CharStream stream)
{
@@ -914,7 +930,7 @@
{
int i;
jjround = 0x80000001;
- for (i = 31; i-- > 0;)
+ for (i = 33; i-- > 0;)
jjrounds[i] = 0x80000000;
}
public void ReInit(CharStream stream, int lexState)
diff --git a/src/java/org/apache/lucene/search/ExactPhraseScorer.java b/src/java/org/apache/lucene/search/ExactPhraseScorer.java
index 8d28837..8232834 100644
--- a/src/java/org/apache/lucene/search/ExactPhraseScorer.java
+++ b/src/java/org/apache/lucene/search/ExactPhraseScorer.java
@@ -21,9 +21,9 @@
final class ExactPhraseScorer extends PhraseScorer {
- ExactPhraseScorer(Weight weight, TermPositions[] tps, Similarity similarity,
+ ExactPhraseScorer(Weight weight, TermPositions[] tps, int[] positions, Similarity similarity,
byte[] norms) throws IOException {
- super(weight, tps, similarity, norms);
+ super(weight, tps, positions, similarity, norms);
}
protected final float phraseFreq() throws IOException {
diff --git a/src/java/org/apache/lucene/search/FieldCacheImpl.java b/src/java/org/apache/lucene/search/FieldCacheImpl.java
index 26302da..28c74cd 100644
--- a/src/java/org/apache/lucene/search/FieldCacheImpl.java
+++ b/src/java/org/apache/lucene/search/FieldCacheImpl.java
@@ -388,7 +388,7 @@
store (reader, field, SortField.CUSTOM, retArray);
return retArray;
}
- return (String[]) ret;
+ return (Comparable[]) ret;
}
}
diff --git a/src/java/org/apache/lucene/search/FieldSortedHitQueue.java b/src/java/org/apache/lucene/search/FieldSortedHitQueue.java
index 9663c84..158fed8 100644
--- a/src/java/org/apache/lucene/search/FieldSortedHitQueue.java
+++ b/src/java/org/apache/lucene/search/FieldSortedHitQueue.java
@@ -95,6 +95,9 @@
c = (fields[i].reverse) ? comparators[i].compare (docB, docA)
: comparators[i].compare (docA, docB);
}
+ // avoid random sort order that could lead to duplicates (bug #31241):
+ if (c == 0)
+ return docA.doc > docB.doc;
return c > 0;
}
@@ -197,10 +200,9 @@
static ScoreDocComparator comparatorInt (final IndexReader reader, final String fieldname)
throws IOException {
final String field = fieldname.intern();
+ final int[] fieldOrder = FieldCache.DEFAULT.getInts (reader, field);
return new ScoreDocComparator() {
- final int[] fieldOrder = FieldCache.DEFAULT.getInts (reader, field);
-
public final int compare (final ScoreDoc i, final ScoreDoc j) {
final int fi = fieldOrder[i.doc];
final int fj = fieldOrder[j.doc];
@@ -229,10 +231,9 @@
static ScoreDocComparator comparatorFloat (final IndexReader reader, final String fieldname)
throws IOException {
final String field = fieldname.intern();
+ final float[] fieldOrder = FieldCache.DEFAULT.getFloats (reader, field);
return new ScoreDocComparator () {
- protected final float[] fieldOrder = FieldCache.DEFAULT.getFloats (reader, field);
-
public final int compare (final ScoreDoc i, final ScoreDoc j) {
final float fi = fieldOrder[i.doc];
final float fj = fieldOrder[j.doc];
@@ -261,8 +262,8 @@
static ScoreDocComparator comparatorString (final IndexReader reader, final String fieldname)
throws IOException {
final String field = fieldname.intern();
+ final FieldCache.StringIndex index = FieldCache.DEFAULT.getStringIndex (reader, field);
return new ScoreDocComparator () {
- final FieldCache.StringIndex index = FieldCache.DEFAULT.getStringIndex (reader, field);
public final int compare (final ScoreDoc i, final ScoreDoc j) {
final int fi = index.order[i.doc];
@@ -293,8 +294,8 @@
throws IOException {
final Collator collator = Collator.getInstance (locale);
final String field = fieldname.intern();
+ final String[] index = FieldCache.DEFAULT.getStrings (reader, field);
return new ScoreDocComparator() {
- final String[] index = FieldCache.DEFAULT.getStrings (reader, field);
public final int compare (final ScoreDoc i, final ScoreDoc j) {
return collator.compare (index[i.doc], index[j.doc]);
diff --git a/src/java/org/apache/lucene/search/FuzzyQuery.java b/src/java/org/apache/lucene/search/FuzzyQuery.java
index a318a5c..13b56e4 100644
--- a/src/java/org/apache/lucene/search/FuzzyQuery.java
+++ b/src/java/org/apache/lucene/search/FuzzyQuery.java
@@ -20,17 +20,83 @@
import org.apache.lucene.index.Term;
import java.io.IOException;
-/** Implements the fuzzy search query */
+/** Implements the fuzzy search query. The similarity measurement
+ * is based on the Levenshtein (edit distance) algorithm.
+ */
public final class FuzzyQuery extends MultiTermQuery {
- public FuzzyQuery(Term term) {
+
+ public final static float defaultMinSimilarity = 0.5f;
+ private float minimumSimilarity;
+ private int prefixLength;
+
+ /**
+ * Create a new FuzzyQuery that will match terms with a similarity
+ * of at least <code>minimumSimilarity</code> to <code>term</code>.
+ * If a <code>prefixLength</code> > 0 is specified, a common prefix
+ * of that length is also required.
+ *
+ * @param term the term to search for
+ * @param minimumSimilarity a value between 0 and 1 to set the required similarity
+ * between the query term and the matching terms. For example, for a
+ * <code>minimumSimilarity</code> of <code>0.5</code> a term of the same length
+ * as the query term is considered similar to the query term if the edit distance
+ * between both terms is less than <code>length(term)*0.5</code>
+ * @param prefixLength length of common (non-fuzzy) prefix
+ * @throws IllegalArgumentException if minimumSimilarity is > 1 or < 0
+ * or if prefixLength < 0 or >= <code>term.text().length()</code>.
+ */
+ public FuzzyQuery(Term term, float minimumSimilarity, int prefixLength) throws IllegalArgumentException {
super(term);
+
+ if (minimumSimilarity > 1.0f)
+ throw new IllegalArgumentException("minimumSimilarity > 1");
+ else if (minimumSimilarity < 0.0f)
+ throw new IllegalArgumentException("minimumSimilarity < 0");
+ this.minimumSimilarity = minimumSimilarity;
+
+ if(prefixLength < 0)
+ throw new IllegalArgumentException("prefixLength < 0");
+ else if(prefixLength >= term.text().length())
+ throw new IllegalArgumentException("prefixLength >= term.text().length()");
+ this.prefixLength = prefixLength;
+ }
+
+ /**
+ * Calls {@link #FuzzyQuery(Term, float, int) FuzzyQuery(term, minimumSimilarity, 0)}.
+ */
+ public FuzzyQuery(Term term, float minimumSimilarity) throws IllegalArgumentException {
+ this(term, minimumSimilarity, 0);
+ }
+
+ /**
+ * Calls {@link #FuzzyQuery(Term, float, int) FuzzyQuery(term, 0.5f, 0)}.
+ */
+ public FuzzyQuery(Term term) {
+ this(term, defaultMinSimilarity, 0);
+ }
+
+ /**
+ * Returns the minimum similarity that is required for this query to match.
+ * @return float value between 0.0 and 1.0
+ */
+ public float getMinSimilarity() {
+ return minimumSimilarity;
}
+ /**
+ * Returns the prefix length, i.e. the number of characters at the start
+ * of a term that must be identical (not fuzzy) to the query term if the query
+ * is to match that term.
+ */
+ public int getPrefixLength() {
+ return prefixLength;
+ }
+
protected FilteredTermEnum getEnum(IndexReader reader) throws IOException {
- return new FuzzyTermEnum(reader, getTerm());
+ return new FuzzyTermEnum(reader, getTerm(), minimumSimilarity, prefixLength);
}
public String toString(String field) {
- return super.toString(field) + '~';
+ return super.toString(field) + '~' + Float.toString(minimumSimilarity);
}
}
diff --git a/src/java/org/apache/lucene/search/FuzzyTermEnum.java b/src/java/org/apache/lucene/search/FuzzyTermEnum.java
index 9335e31..7ede172 100644
--- a/src/java/org/apache/lucene/search/FuzzyTermEnum.java
+++ b/src/java/org/apache/lucene/search/FuzzyTermEnum.java
@@ -26,21 +26,69 @@
the enumeration is greater than all that precede it. */
public final class FuzzyTermEnum extends FilteredTermEnum {
double distance;
- boolean fieldMatch = false;
boolean endEnum = false;
Term searchTerm = null;
String field = "";
String text = "";
int textlen;
+ String prefix = "";
+ int prefixLength = 0;
+ float minimumSimilarity;
+ double scale_factor;
+
+ /**
+ * Empty prefix and minSimilarity of 0.5f are used.
+ *
+ * @param reader
+ * @param term
+ * @throws IOException
+ * @see #FuzzyTermEnum(IndexReader, Term, float, int)
+ */
public FuzzyTermEnum(IndexReader reader, Term term) throws IOException {
+ this(reader, term, FuzzyQuery.defaultMinSimilarity, 0);
+ }
+
+ /**
+ * This is the standard FuzzyTermEnum with an empty prefix.
+ *
+ * @param reader
+ * @param term
+ * @param minSimilarity
+ * @throws IOException
+ * @see #FuzzyTermEnum(IndexReader, Term, float, int)
+ */
+ public FuzzyTermEnum(IndexReader reader, Term term, float minSimilarity) throws IOException {
+ this(reader, term, minSimilarity, 0);
+ }
+
+ /**
+ * Constructor for enumeration of all terms from specified <code>reader</code> which share a prefix of
+ * length <code>prefixLength</code> with <code>term</code> and which have a fuzzy similarity >
+ * <code>minSimilarity</code>.
+ *
+ * @param reader Delivers terms.
+ * @param term Pattern term.
+ * @param minSimilarity Minimum required similarity for terms from the reader. Default value is 0.5f.
+ * @param prefixLength Length of required common prefix. Default value is 0.
+ * @throws IOException
+ */
+ public FuzzyTermEnum(IndexReader reader, Term term, float minSimilarity, int prefixLength) throws IOException {
super();
+ minimumSimilarity = minSimilarity;
+ scale_factor = 1.0f / (1.0f - minimumSimilarity);
searchTerm = term;
field = searchTerm.field();
text = searchTerm.text();
textlen = text.length();
- setEnum(reader.terms(new Term(searchTerm.field(), "")));
+ if(prefixLength > 0 && prefixLength < textlen){
+ this.prefixLength = prefixLength;
+ prefix = text.substring(0, prefixLength);
+ text = text.substring(prefixLength);
+ textlen = text.length();
+ }
+ setEnum(reader.terms(new Term(searchTerm.field(), prefix)));
}
/**
@@ -48,19 +96,20 @@
calculate the distance between the given term and the comparing term.
*/
protected final boolean termCompare(Term term) {
- if (field == term.field()) {
- String target = term.text();
+ String termText = term.text();
+ if (field == term.field() && termText.startsWith(prefix)) {
+ String target = termText.substring(prefixLength);
int targetlen = target.length();
int dist = editDistance(text, target, textlen, targetlen);
distance = 1 - ((double)dist / (double)Math.min(textlen, targetlen));
- return (distance > FUZZY_THRESHOLD);
+ return (distance > minimumSimilarity);
}
endEnum = true;
return false;
}
protected final float difference() {
- return (float)((distance - FUZZY_THRESHOLD) * SCALE_FACTOR);
+ return (float)((distance - minimumSimilarity) * scale_factor);
}
public final boolean endEnum() {
@@ -71,9 +120,6 @@
* Compute Levenshtein distance
******************************/
- public static final double FUZZY_THRESHOLD = 0.5;
- public static final double SCALE_FACTOR = 1.0f / (1.0f - FUZZY_THRESHOLD);
-
/**
Finds and returns the smallest of three integers
*/
diff --git a/src/java/org/apache/lucene/search/IndexSearcher.java b/src/java/org/apache/lucene/search/IndexSearcher.java
index 0795ce3..ad2f935 100644
--- a/src/java/org/apache/lucene/search/IndexSearcher.java
+++ b/src/java/org/apache/lucene/search/IndexSearcher.java
@@ -90,11 +90,15 @@
final HitQueue hq = new HitQueue(nDocs);
final int[] totalHits = new int[1];
scorer.score(new HitCollector() {
+ private float minScore = 0.0f;
public final void collect(int doc, float score) {
if (score > 0.0f && // ignore zeroed buckets
(bits==null || bits.get(doc))) { // skip docs not in bits
totalHits[0]++;
- hq.insert(new ScoreDoc(doc, score));
+ if (hq.size() < nDocs || score >= minScore) {
+ hq.insert(new ScoreDoc(doc, score));
+ minScore = ((ScoreDoc)hq.top()).score; // maintain minScore
+ }
}
}
});
diff --git a/src/java/org/apache/lucene/search/PhrasePositions.java b/src/java/org/apache/lucene/search/PhrasePositions.java
index e035e7b..a4f1362 100644
--- a/src/java/org/apache/lucene/search/PhrasePositions.java
+++ b/src/java/org/apache/lucene/search/PhrasePositions.java
@@ -27,7 +27,7 @@
TermPositions tp; // stream of positions
PhrasePositions next; // used to make lists
- PhrasePositions(TermPositions t, int o) throws IOException {
+ PhrasePositions(TermPositions t, int o) {
tp = t;
offset = o;
}
diff --git a/src/java/org/apache/lucene/search/PhrasePrefixQuery.java b/src/java/org/apache/lucene/search/PhrasePrefixQuery.java
index 345cb38..b2be426 100644
--- a/src/java/org/apache/lucene/search/PhrasePrefixQuery.java
+++ b/src/java/org/apache/lucene/search/PhrasePrefixQuery.java
@@ -19,6 +19,7 @@
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
+import java.util.Vector;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultipleTermPositions;
@@ -40,42 +41,69 @@
public class PhrasePrefixQuery extends Query {
private String field;
private ArrayList termArrays = new ArrayList();
+ private Vector positions = new Vector();
private int slop = 0;
- /* Sets the phrase slop for this query.
+ /** Sets the phrase slop for this query.
* @see PhraseQuery#setSlop(int)
*/
public void setSlop(int s) { slop = s; }
- /* Sets the phrase slop for this query.
+ /** Gets the phrase slop for this query.
* @see PhraseQuery#getSlop()
*/
public int getSlop() { return slop; }
- /* Add a single term at the next position in the phrase.
+ /** Add a single term at the next position in the phrase.
* @see PhraseQuery#add(Term)
*/
public void add(Term term) { add(new Term[]{term}); }
- /* Add multiple terms at the next position in the phrase. Any of the terms
+ /** Add multiple terms at the next position in the phrase. Any of the terms
* may match.
*
* @see PhraseQuery#add(Term)
*/
public void add(Term[] terms) {
+ int position = 0;
+ if (positions.size() > 0)
+ position = ((Integer) positions.lastElement()).intValue() + 1;
+
+ add(terms, position);
+ }
+
+ /**
+ * Allows specifying the relative position of terms within the phrase.
+ *
+ * @see PhraseQuery#add(Term, int)
+ * @param terms
+ * @param position
+ */
+ public void add(Term[] terms, int position) {
if (termArrays.size() == 0)
field = terms[0].field();
-
- for (int i=0; i<terms.length; i++) {
+
+ for (int i = 0; i < terms.length; i++) {
if (terms[i].field() != field) {
- throw new IllegalArgumentException
- ("All phrase terms must be in the same field (" + field + "): "
- + terms[i]);
+ throw new IllegalArgumentException(
+ "All phrase terms must be in the same field (" + field + "): "
+ + terms[i]);
}
}
termArrays.add(terms);
+ positions.addElement(new Integer(position));
+ }
+
+ /**
+ * Returns the relative positions of terms in this phrase.
+ */
+ public int[] getPositions() {
+ int[] result = new int[positions.size()];
+ for (int i = 0; i < positions.size(); i++)
+ result[i] = ((Integer) positions.elementAt(i)).intValue();
+ return result;
}
private class PhrasePrefixWeight implements Weight {
@@ -131,10 +159,10 @@
}
if (slop == 0)
- return new ExactPhraseScorer(this, tps, getSimilarity(searcher),
+ return new ExactPhraseScorer(this, tps, getPositions(), getSimilarity(searcher),
reader.norms(field));
else
- return new SloppyPhraseScorer(this, tps, getSimilarity(searcher),
+ return new SloppyPhraseScorer(this, tps, getPositions(), getSimilarity(searcher),
slop, reader.norms(field));
}
@@ -222,7 +250,9 @@
Iterator i = termArrays.iterator();
while (i.hasNext()) {
Term[] terms = (Term[])i.next();
- buffer.append(terms[0].text() + (terms.length > 0 ? "*" : ""));
+ buffer.append(terms[0].text() + (terms.length > 1 ? "*" : ""));
+ if (i.hasNext())
+ buffer.append(" ");
}
buffer.append("\"");
diff --git a/src/java/org/apache/lucene/search/PhraseQuery.java b/src/java/org/apache/lucene/search/PhraseQuery.java
index d0e40da..ac65b81 100644
--- a/src/java/org/apache/lucene/search/PhraseQuery.java
+++ b/src/java/org/apache/lucene/search/PhraseQuery.java
@@ -29,6 +29,7 @@
public class PhraseQuery extends Query {
private String field;
private Vector terms = new Vector();
+ private Vector positions = new Vector();
private int slop = 0;
/** Constructs an empty phrase query. */
@@ -52,21 +53,51 @@
/** Returns the slop. See setSlop(). */
public int getSlop() { return slop; }
- /** Adds a term to the end of the query phrase. */
+ /**
+ * Adds a term to the end of the query phrase.
+ * The relative position of the term is the one immediately after the last term added.
+ */
public void add(Term term) {
- if (terms.size() == 0)
- field = term.field();
- else if (term.field() != field)
- throw new IllegalArgumentException
- ("All phrase terms must be in the same field: " + term);
-
- terms.addElement(term);
+ int position = 0;
+ if(positions.size() > 0)
+ position = ((Integer) positions.lastElement()).intValue() + 1;
+
+ add(term, position);
+ }
+
+ /**
+ * Adds a term to the end of the query phrase.
+ * The relative position of the term within the phrase is specified explicitly.
+ * This allows e.g. phrases with more than one term at the same position
+ * or phrases with gaps (e.g. in connection with stopwords).
+ *
+ * @param term
+ * @param position
+ */
+ public void add(Term term, int position) {
+ if (terms.size() == 0)
+ field = term.field();
+ else if (term.field() != field)
+ throw new IllegalArgumentException("All phrase terms must be in the same field: " + term);
+
+ terms.addElement(term);
+ positions.addElement(new Integer(position));
}
/** Returns the set of terms in this phrase. */
public Term[] getTerms() {
return (Term[])terms.toArray(new Term[0]);
}
+
+ /**
+ * Returns the relative positions of terms in this phrase.
+ */
+ public int[] getPositions() {
+ int[] result = new int[positions.size()];
+ for(int i = 0; i < positions.size(); i++)
+ result[i] = ((Integer) positions.elementAt(i)).intValue();
+ return result;
+ }
private class PhraseWeight implements Weight {
private Searcher searcher;
@@ -109,11 +140,11 @@
}
if (slop == 0) // optimize exact case
- return new ExactPhraseScorer(this, tps, getSimilarity(searcher),
+ return new ExactPhraseScorer(this, tps, getPositions(), getSimilarity(searcher),
reader.norms(field));
else
return
- new SloppyPhraseScorer(this, tps, getSimilarity(searcher), slop,
+ new SloppyPhraseScorer(this, tps, getPositions(), getSimilarity(searcher), slop,
reader.norms(field));
}
@@ -244,14 +275,16 @@
PhraseQuery other = (PhraseQuery)o;
return (this.getBoost() == other.getBoost())
&& (this.slop == other.slop)
- && this.terms.equals(other.terms);
+ && this.terms.equals(other.terms)
+ && this.positions.equals(other.positions);
}
/** Returns a hash code value for this object.*/
public int hashCode() {
return Float.floatToIntBits(getBoost())
^ Float.floatToIntBits(slop)
- ^ terms.hashCode();
+ ^ terms.hashCode()
+ ^ positions.hashCode();
}
}
diff --git a/src/java/org/apache/lucene/search/PhraseScorer.java b/src/java/org/apache/lucene/search/PhraseScorer.java
index afeaf4c..311b20e 100644
--- a/src/java/org/apache/lucene/search/PhraseScorer.java
+++ b/src/java/org/apache/lucene/search/PhraseScorer.java
@@ -32,8 +32,9 @@
private float freq;
- PhraseScorer(Weight weight, TermPositions[] tps, Similarity similarity,
- byte[] norms) throws IOException {
+
+ PhraseScorer(Weight weight, TermPositions[] tps, int[] positions, Similarity similarity,
+ byte[] norms) {
super(similarity);
this.norms = norms;
this.weight = weight;
@@ -41,7 +42,7 @@
// convert tps to a list
for (int i = 0; i < tps.length; i++) {
- PhrasePositions pp = new PhrasePositions(tps[i], i);
+ PhrasePositions pp = new PhrasePositions(tps[i], positions[i]);
if (last != null) { // add next to end of list
last.next = pp;
} else
diff --git a/src/java/org/apache/lucene/search/SloppyPhraseScorer.java b/src/java/org/apache/lucene/search/SloppyPhraseScorer.java
index 543970c..af6064d 100644
--- a/src/java/org/apache/lucene/search/SloppyPhraseScorer.java
+++ b/src/java/org/apache/lucene/search/SloppyPhraseScorer.java
@@ -23,9 +23,9 @@
final class SloppyPhraseScorer extends PhraseScorer {
private int slop;
- SloppyPhraseScorer(Weight weight, TermPositions[] tps, Similarity similarity,
- int slop, byte[] norms) throws IOException {
- super(weight, tps, similarity, norms);
+ SloppyPhraseScorer(Weight weight, TermPositions[] tps, int[] positions, Similarity similarity,
+ int slop, byte[] norms) {
+ super(weight, tps, positions, similarity, norms);
this.slop = slop;
}
diff --git a/src/java/org/apache/lucene/search/SortComparator.java b/src/java/org/apache/lucene/search/SortComparator.java
index c1a904f..50cfe8a 100644
--- a/src/java/org/apache/lucene/search/SortComparator.java
+++ b/src/java/org/apache/lucene/search/SortComparator.java
@@ -29,8 +29,8 @@
public ScoreDocComparator newComparator (final IndexReader reader, final String fieldname)
throws IOException {
final String field = fieldname.intern();
+ final Comparable[] cachedValues = FieldCache.DEFAULT.getCustom (reader, field, SortComparator.this);
return new ScoreDocComparator() {
- protected Comparable[] cachedValues = FieldCache.DEFAULT.getCustom (reader, field, SortComparator.this);
public int compare (ScoreDoc i, ScoreDoc j) {
return cachedValues[i.doc].compareTo (cachedValues[j.doc]);
diff --git a/src/test/org/apache/lucene/queryParser/TestQueryParser.java b/src/test/org/apache/lucene/queryParser/TestQueryParser.java
index c3519bb..8332dba 100644
--- a/src/test/org/apache/lucene/queryParser/TestQueryParser.java
+++ b/src/test/org/apache/lucene/queryParser/TestQueryParser.java
@@ -89,7 +89,7 @@
super(f, a);
}
- protected Query getFuzzyQuery(String field, String termStr) throws ParseException {
+ protected Query getFuzzyQuery(String field, String termStr, float minSimilarity) throws ParseException {
throw new ParseException("Fuzzy queries not allowed");
}
@@ -235,15 +235,29 @@
public void testWildcard() throws Exception {
assertQueryEquals("term*", null, "term*");
assertQueryEquals("term*^2", null, "term*^2.0");
- assertQueryEquals("term~", null, "term~");
- assertQueryEquals("term~^2", null, "term^2.0~");
- assertQueryEquals("term^2~", null, "term^2.0~");
+ assertQueryEquals("term~", null, "term~0.5");
+ assertQueryEquals("term~0.7", null, "term~0.7");
+ assertQueryEquals("term~^2", null, "term^2.0~0.5");
+ assertQueryEquals("term^2~", null, "term^2.0~0.5");
assertQueryEquals("term*germ", null, "term*germ");
assertQueryEquals("term*germ^3", null, "term*germ^3.0");
assertTrue(getQuery("term*", null) instanceof PrefixQuery);
assertTrue(getQuery("term*^2", null) instanceof PrefixQuery);
assertTrue(getQuery("term~", null) instanceof FuzzyQuery);
+ assertTrue(getQuery("term~0.7", null) instanceof FuzzyQuery);
+ FuzzyQuery fq = (FuzzyQuery)getQuery("term~0.7", null);
+ assertEquals(0.7f, fq.getMinSimilarity(), 0.1f);
+ assertEquals(0, fq.getPrefixLength());
+ fq = (FuzzyQuery)getQuery("term~", null);
+ assertEquals(0.5f, fq.getMinSimilarity(), 0.1f);
+ assertEquals(0, fq.getPrefixLength());
+ try {
+ getQuery("term~1.1", null); // value > 1, throws exception
+ fail();
+ } catch(ParseException pe) {
+ // expected exception
+ }
assertTrue(getQuery("term*germ", null) instanceof WildcardQuery);
/* Tests to see that wild card terms are (or are not) properly
@@ -317,7 +331,8 @@
public void testEscaped() throws Exception {
Analyzer a = new WhitespaceAnalyzer();
- /* assertQueryEquals("\\[brackets", a, "\\[brackets");
+
+ /*assertQueryEquals("\\[brackets", a, "\\[brackets");
assertQueryEquals("\\[brackets", null, "brackets");
assertQueryEquals("\\\\", a, "\\\\");
assertQueryEquals("\\+blah", a, "\\+blah");
@@ -337,38 +352,40 @@
assertQueryEquals("\\~blah", a, "\\~blah");
assertQueryEquals("\\*blah", a, "\\*blah");
assertQueryEquals("\\?blah", a, "\\?blah");
- assertQueryEquals("foo \\&& bar", a, "foo \\&& bar");
- assertQueryEquals("foo \\|| bar", a, "foo \\|| bar");
- assertQueryEquals("foo \\AND bar", a, "foo \\AND bar"); */
+ //assertQueryEquals("foo \\&\\& bar", a, "foo \\&\\& bar");
+ //assertQueryEquals("foo \\|| bar", a, "foo \\|| bar");
+ //assertQueryEquals("foo \\AND bar", a, "foo \\AND bar");*/
- assertQueryEquals("a\\-b:c",a,"a-b:c");
- assertQueryEquals("a\\+b:c",a,"a+b:c");
- assertQueryEquals("a\\:b:c",a,"a:b:c");
- assertQueryEquals("a\\\\b:c",a,"a\\b:c");
+ assertQueryEquals("a\\-b:c", a, "a-b:c");
+ assertQueryEquals("a\\+b:c", a, "a+b:c");
+ assertQueryEquals("a\\:b:c", a, "a:b:c");
+ assertQueryEquals("a\\\\b:c", a, "a\\b:c");
- assertQueryEquals("a:b\\-c",a,"a:b-c");
- assertQueryEquals("a:b\\+c",a,"a:b+c");
- assertQueryEquals("a:b\\:c",a,"a:b:c");
- assertQueryEquals("a:b\\\\c",a,"a:b\\c");
+ assertQueryEquals("a:b\\-c", a, "a:b-c");
+ assertQueryEquals("a:b\\+c", a, "a:b+c");
+ assertQueryEquals("a:b\\:c", a, "a:b:c");
+ assertQueryEquals("a:b\\\\c", a, "a:b\\c");
- assertQueryEquals("a:b\\-c*",a,"a:b-c*");
- assertQueryEquals("a:b\\+c*",a,"a:b+c*");
- assertQueryEquals("a:b\\:c*",a,"a:b:c*");
- assertQueryEquals("a:b\\\\c*",a,"a:b\\c*");
+ assertQueryEquals("a:b\\-c*", a, "a:b-c*");
+ assertQueryEquals("a:b\\+c*", a, "a:b+c*");
+ assertQueryEquals("a:b\\:c*", a, "a:b:c*");
- assertQueryEquals("a:b\\-?c",a,"a:b-?c");
- assertQueryEquals("a:b\\+?c",a,"a:b+?c");
- assertQueryEquals("a:b\\:?c",a,"a:b:?c");
- assertQueryEquals("a:b\\\\?c",a,"a:b\\?c");
+ assertQueryEquals("a:b\\\\c*", a, "a:b\\c*");
- assertQueryEquals("a:b\\-c~",a,"a:b-c~");
- assertQueryEquals("a:b\\+c~",a,"a:b+c~");
- assertQueryEquals("a:b\\:c~",a,"a:b:c~");
- assertQueryEquals("a:b\\\\c~",a,"a:b\\c~");
+ assertQueryEquals("a:b\\-?c", a, "a:b-?c");
+ assertQueryEquals("a:b\\+?c", a, "a:b+?c");
+ assertQueryEquals("a:b\\:?c", a, "a:b:?c");
- assertQueryEquals("[ a\\- TO a\\+ ]", null, "[a- TO a+]");
- assertQueryEquals("[ a\\: TO a\\~ ]", null, "[a: TO a~]");
- assertQueryEquals("[ a\\\\ TO a\\* ]", null, "[a\\ TO a*]");
+ assertQueryEquals("a:b\\\\?c", a, "a:b\\?c");
+
+ assertQueryEquals("a:b\\-c~", a, "a:b-c~0.5");
+ assertQueryEquals("a:b\\+c~", a, "a:b+c~0.5");
+ assertQueryEquals("a:b\\:c~", a, "a:b:c~0.5");
+ assertQueryEquals("a:b\\\\c~", a, "a:b\\c~0.5");
+
+ assertQueryEquals("[ a\\- TO a\\+ ]", null, "[a- TO a+]");
+ assertQueryEquals("[ a\\: TO a\\~ ]", null, "[a: TO a~]");
+ assertQueryEquals("[ a\\\\ TO a\\* ]", null, "[a\\ TO a*]");
}
public void testTabNewlineCarriageReturn()
diff --git a/xdocs/index.xml b/xdocs/index.xml
index ad3bb27..67d9560 100644
--- a/xdocs/index.xml
+++ b/xdocs/index.xml
@@ -23,14 +23,13 @@
<section name="Lucene News">
- <h3>1 July 2004 - Lucene 1.4 Final Released</h3>
+ <h3>1 October 2004 - Lucene 1.4.2 Released</h3>
- <p>A new release of Lucene is available with many new
- features and bug fixes. See <a
- href="http://cvs.apache.org/viewcvs.cgi/*checkout*/jakarta-lucene/CHANGES.txt?rev=1.94">CHANGES.txt</a>
+ <p>This fixes a few bugs in 1.4.1. See <a
+ href="http://cvs.apache.org/viewcvs.cgi/*checkout*/jakarta-lucene/CHANGES.txt?rev=1.96.2.4">CHANGES.txt</a>
for details. Binary and source distributions are
available <a
- href="http://cvs.apache.org/dist/jakarta/lucene/v1.4-final/">here</a>.
+ href="http://cvs.apache.org/dist/jakarta/lucene/v1.4.2/">here</a>.
</p>
</section>