blob: a5292c72fec1f96fbbd2c3a567fa5fa233824494 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.spelling;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
/**
 * Converts the query string to a Collection of Lucene tokens using a regular expression.
 * Boolean operators AND, OR, NOT are skipped, as is the range keyword TO when the query
 * looks like it may contain a range expression.
 *
 * Each term is checked to determine if it is optional, required or prohibited. Required
 * terms output a {@link Token} with the {@link QueryConverter#REQUIRED_TERM_FLAG} set.
 * Prohibited terms output a {@link Token} with the {@link QueryConverter#PROHIBITED_TERM_FLAG}
 * set. If the query uses the plus (+) and minus (-) to denote required and prohibited, this
 * determination will be accurate. In the case boolean AND/OR/NOTs are used, this
 * converter makes an uninformed guess as to whether the term would likely behave as if it
 * is Required or Prohibited and sets the flags accordingly. These flags are used downstream
 * to generate collations for {@link WordBreakSolrSpellChecker}, in cases where an original
 * term is split up into multiple Tokens.
 *
 * @since solr 1.3
 **/
public class SpellingQueryConverter extends QueryConverter {
  /*
   * The following builds up a regular expression that matches productions
   * of the syntax for NMTOKEN as per the W3C XML Recommendation - with one
   * important exception (see below).
   *
   * http://www.w3.org/TR/2008/REC-xml-20081126/ - version used as reference
   *
   * http://www.w3.org/TR/REC-xml/#NT-Nmtoken
   *
   * An NMTOKEN is a series of one or more NAMECHAR characters, which is an
   * extension of the NAMESTARTCHAR character class.
   *
   * The EXCEPTION referred to above concerns the colon, which is legal in an
   * NMTOKEN, but cannot currently be used as a valid field name within Solr,
   * as it is used to delimit the field name from the query string.
   *
   * Characters outside the Basic Multilingual Plane are not listed as ranges;
   * they are covered by SURROGATE_PAIR instead.
   */
  static final String[] NAMESTARTCHAR_PARTS = {
      "A-Z_a-z", "\\xc0-\\xd6", "\\xd8-\\xf6", "\\xf8-\\u02ff",
      "\\u0370-\\u037d", "\\u037f-\\u1fff",
      "\\u200c-\\u200d", "\\u2070-\\u218f",
      // The XML production is [#x3001-#xD7FF]. Starting this range at U+2001 (as an
      // earlier revision did) swallowed general punctuation, arrows, math symbols etc.
      // and made the three ranges listed just before it redundant.
      "\\u2c00-\\u2fef", "\\u3001-\\ud7ff",
      "\\uf900-\\ufdcf", "\\ufdf0-\\ufffd"
  };

  static final String[] ADDITIONAL_NAMECHAR_PARTS = {
      "\\-.0-9\\xb7", "\\u0300-\\u036f", "\\u203f-\\u2040"
  };

  static final String SURROGATE_PAIR = "\\p{Cs}{2}";

  static final String NMTOKEN;

  static {
    StringBuilder charClass = new StringBuilder();
    for (String part : NAMESTARTCHAR_PARTS) {
      charClass.append(part);
    }
    for (String part : ADDITIONAL_NAMECHAR_PARTS) {
      charClass.append(part);
    }
    NMTOKEN = "([" + charClass + "]|" + SURROGATE_PAIR + ")+";
  }

  /*
   * A word may not start where a "fieldname:" prefix or a boost/fuzzy style number
   * ("^2", ".5") starts; otherwise it is one character that is not ^ . : ( or
   * whitespace, followed by one or more letters, underscores, hyphens or ASCII digits.
   */
  static final String PATTERN = "(?:(?!(" + NMTOKEN + ":|[\\^.]\\d+)))[^^.:(\\s][\\p{L}_\\-0-9]+";
  // previous version: Pattern.compile("(?:(?!(\\w+:|\\d+)))\\w+");

  /** Compiled form of {@link #PATTERN}; used by {@link #convert(String)} to find words. */
  protected Pattern QUERY_REGEX = Pattern.compile(PATTERN);

  /**
   * Converts the original query string to a collection of Lucene Tokens.
   *
   * <p>Every word found by {@link #QUERY_REGEX} is passed to
   * {@link #analyze(Collection, String, int, int)} together with a flag value describing
   * whether it looks required, prohibited, or positioned just before a change of boolean
   * operator. The operators AND, OR and NOT themselves produce no tokens. If any boolean
   * operator was seen, every returned token additionally gets
   * {@link QueryConverter#TERM_IN_BOOLEAN_QUERY_FLAG}.
   *
   * <p>An {@link IOException} raised while analyzing a word is not propagated; the tokens
   * collected up to that point are kept and conversion continues with the next word.
   *
   * @param original the original query string, may be {@code null}
   * @return a Collection of Lucene Tokens; empty if {@code original} is {@code null}
   */
  @Override
  public Collection<Token> convert(String original) {
    if (original == null) { // this can happen with q.alt = and no query
      return Collections.emptyList();
    }
    // Cheap heuristic: an opening and a closing range bracket both occur somewhere.
    boolean mightContainRangeQuery = (original.indexOf('[') != -1 || original.indexOf('{') != -1)
        && (original.indexOf(']') != -1 || original.indexOf('}') != -1);
    Collection<Token> result = new ArrayList<>();
    Matcher matcher = QUERY_REGEX.matcher(original);
    // One word of look-ahead is kept so a term can be flagged based on what follows it.
    String nextWord = null;
    int nextStartIndex = 0;
    String lastBooleanOp = null;
    while (nextWord != null || matcher.find()) {
      String word;
      int startIndex;
      if (nextWord != null) {
        word = nextWord;
        startIndex = nextStartIndex;
        nextWord = null;
      } else {
        word = matcher.group(0);
        startIndex = matcher.start();
      }
      if (matcher.find()) {
        nextWord = matcher.group(0);
        nextStartIndex = matcher.start();
      }
      if (mightContainRangeQuery && "TO".equals(word)) {
        continue;
      }
      if (isBooleanOperator(word)) {
        lastBooleanOp = word;
        continue;
      }
      // treat "AND NOT" as "NOT"...
      if ("AND".equals(nextWord) && isAndNotAt(original, nextStartIndex)) {
        nextWord = "NOT";
      }
      int flagValue = 0;
      if (word.charAt(0) == '-'
          || (startIndex > 0 && original.charAt(startIndex - 1) == '-')) {
        flagValue = PROHIBITED_TERM_FLAG;
      } else if (word.charAt(0) == '+'
          || (startIndex > 0 && original.charAt(startIndex - 1) == '+')) {
        flagValue = REQUIRED_TERM_FLAG;
      } else if (nextWord != null
          && lastBooleanOp != null
          && !nextWord.equals(lastBooleanOp)
          && isBooleanOperator(nextWord)) {
        // we don't know the default operator so just assume the first operator isn't new.
        flagValue = TERM_PRECEDES_NEW_BOOLEAN_OPERATOR_FLAG;
      } else if (lastBooleanOp == null && "NOT".equals(nextWord)) {
        // ...unless the 1st boolean operator is a NOT, because only AND/OR can be default.
        flagValue = TERM_PRECEDES_NEW_BOOLEAN_OPERATOR_FLAG;
      }
      try {
        analyze(result, word, startIndex, flagValue);
      } catch (IOException ignored) {
        // Deliberately best-effort: a word that cannot be analyzed is skipped (any tokens
        // it already produced stay in the result) rather than failing the whole request.
        // TODO: shouldn't we log something?
      }
    }
    if (lastBooleanOp != null) {
      for (Token t : result) {
        t.setFlags(t.getFlags() | QueryConverter.TERM_IN_BOOLEAN_QUERY_FLAG);
      }
    }
    return result;
  }

  /** Returns true if {@code word} is exactly one of the operators AND, OR or NOT. */
  private static boolean isBooleanOperator(String word) {
    return "AND".equals(word) || "OR".equals(word) || "NOT".equals(word);
  }

  /**
   * Returns true if the operator phrase "AND NOT" starts at {@code index} in {@code query},
   * is followed by at least one more character, and that character cannot continue a word.
   * The last condition keeps a term such as "NOTHING" after "AND" from being mistaken for
   * the NOT operator.
   */
  private static boolean isAndNotAt(String query, int index) {
    int end = index + "AND NOT".length();
    return query.length() > end
        && query.startsWith("AND NOT", index)
        && !isWordCodePoint(query.codePointAt(end));
  }

  /** Mirrors the word-continuation class of {@link #PATTERN}: letters, '_', '-', ASCII digits. */
  private static boolean isWordCodePoint(int codePoint) {
    return Character.isLetter(codePoint)
        || codePoint == '_'
        || codePoint == '-'
        || (codePoint >= '0' && codePoint <= '9');
  }

  /**
   * Runs {@code text} through the configured analyzer and appends one {@link Token} per
   * produced term to {@code result}.
   *
   * <p>Token offsets are shifted by {@code offset} so that they refer to positions in the
   * full query string rather than in {@code text}. The token stream is closed on every
   * exit path, including when analysis throws.
   *
   * @param result collection that receives the generated tokens
   * @param text the word to analyze
   * @param offset start index of {@code text} within the original query string
   * @param flagsAttValue flag value assigned to every generated token
   * @throws IOException if the token stream fails while being consumed
   */
  protected void analyze(Collection<Token> result, String text, int offset, int flagsAttValue) throws IOException {
    try (TokenStream stream = analyzer.tokenStream("", text)) {
      // TODO: support custom attributes
      CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
      TypeAttribute typeAtt = stream.addAttribute(TypeAttribute.class);
      PayloadAttribute payloadAtt = stream.addAttribute(PayloadAttribute.class);
      PositionIncrementAttribute posIncAtt = stream.addAttribute(PositionIncrementAttribute.class);
      OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
      stream.reset();
      while (stream.incrementToken()) {
        Token token = new Token();
        token.copyBuffer(termAtt.buffer(), 0, termAtt.length());
        token.setOffset(offset + offsetAtt.startOffset(),
                        offset + offsetAtt.endOffset());
        token.setFlags(flagsAttValue); //overwriting any flags already set...
        token.setType(typeAtt.type());
        token.setPayload(payloadAtt.getPayload());
        token.setPositionIncrement(posIncAtt.getPositionIncrement());
        result.add(token);
      }
      stream.end();
    }
  }
}