| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.solr.spelling; |
| |
| import java.io.IOException; |
| import java.util.ArrayList; |
| import java.util.Collection; |
| import java.util.Collections; |
| import java.util.regex.Matcher; |
| import java.util.regex.Pattern; |
| |
| import org.apache.lucene.analysis.TokenStream; |
| import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; |
| import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; |
| import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; |
| import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; |
| import org.apache.lucene.analysis.tokenattributes.TypeAttribute; |
| |
| |
| /** |
| * Converts the query string to a Collection of Lucene tokens using a regular expression. |
| * Boolean operators AND, OR, NOT are skipped. |
| * |
| * Each term is checked to determine if it is optional, required or prohibited. Required |
| * terms output a {@link Token} with the {@link QueryConverter#REQUIRED_TERM_FLAG} set. |
| * Prohibited terms output a {@link Token} with the {@link QueryConverter#PROHIBITED_TERM_FLAG} |
| * set. If the query uses the plus (+) and minus (-) to denote required and prohibited, this |
| * determination will be accurate. In the case boolean AND/OR/NOTs are used, this |
| * converter makes an uninformed guess as to whether the term would likely behave as if it |
| * is Required or Prohibited and sets the flags accordingly. These flags are used downstream |
| * to generate collations for {@link WordBreakSolrSpellChecker}, in cases where an original |
| * term is split up into multiple Tokens. |
| * |
| * @since solr 1.3 |
| **/ |
| public class SpellingQueryConverter extends QueryConverter { |
| |
| /* |
| * The following builds up a regular expression that matches productions |
| * of the syntax for NMTOKEN as per the W3C XML Recommendation - with one |
| * important exception (see below). |
| * |
| * http://www.w3.org/TR/2008/REC-xml-20081126/ - version used as reference |
| * |
| * http://www.w3.org/TR/REC-xml/#NT-Nmtoken |
| * |
| * An NMTOKEN is a series of one or more NAMECHAR characters, which is an |
| * extension of the NAMESTARTCHAR character class. |
| * |
| * The EXCEPTION referred to above concerns the colon, which is legal in an |
| * NMTOKEN, but cannot currently be used as a valid field name within Solr, |
| * as it is used to delimit the field name from the query string. |
| */ |
| |
| final static String[] NAMESTARTCHAR_PARTS = { |
| "A-Z_a-z", "\\xc0-\\xd6", "\\xd8-\\xf6", "\\xf8-\\u02ff", |
| "\\u0370-\\u037d", "\\u037f-\\u1fff", |
| "\\u200c-\\u200d", "\\u2070-\\u218f", |
| "\\u2c00-\\u2fef", "\\u2001-\\ud7ff", |
| "\\uf900-\\ufdcf", "\\ufdf0-\\ufffd" |
| }; |
| final static String[] ADDITIONAL_NAMECHAR_PARTS = { |
| "\\-.0-9\\xb7", "\\u0300-\\u036f", "\\u203f-\\u2040" |
| }; |
| final static String SURROGATE_PAIR = "\\p{Cs}{2}"; |
| final static String NMTOKEN; |
| |
| static { |
| StringBuilder sb = new StringBuilder(); |
| for (String part : NAMESTARTCHAR_PARTS) |
| sb.append(part); |
| for (String part : ADDITIONAL_NAMECHAR_PARTS) |
| sb.append(part); |
| NMTOKEN = "([" + sb.toString() + "]|" + SURROGATE_PAIR + ")+"; |
| } |
| |
| final static String PATTERN = "(?:(?!(" + NMTOKEN + ":|[\\^.]\\d+)))[^^.:(\\s][\\p{L}_\\-0-9]+"; |
| // previous version: Pattern.compile("(?:(?!(\\w+:|\\d+)))\\w+"); |
| protected Pattern QUERY_REGEX = Pattern.compile(PATTERN); |
| |
| /** |
| * Converts the original query string to a collection of Lucene Tokens. |
| * @param original the original query string |
| * @return a Collection of Lucene Tokens |
| */ |
| @Override |
| public Collection<Token> convert(String original) { |
| if (original == null) { // this can happen with q.alt = and no query |
| return Collections.emptyList(); |
| } |
| boolean mightContainRangeQuery = (original.indexOf('[') != -1 || original.indexOf('{') != -1) |
| && (original.indexOf(']') != -1 || original.indexOf('}') != -1); |
| Collection<Token> result = new ArrayList<>(); |
| Matcher matcher = QUERY_REGEX.matcher(original); |
| String nextWord = null; |
| int nextStartIndex = 0; |
| String lastBooleanOp = null; |
| while (nextWord!=null || matcher.find()) { |
| String word = null; |
| int startIndex = 0; |
| if(nextWord != null) { |
| word = nextWord; |
| startIndex = nextStartIndex; |
| nextWord = null; |
| } else { |
| word = matcher.group(0); |
| startIndex = matcher.start(); |
| } |
| if(matcher.find()) { |
| nextWord = matcher.group(0); |
| nextStartIndex = matcher.start(); |
| } |
| if(mightContainRangeQuery && "TO".equals(word)) { |
| continue; |
| } |
| if("AND".equals(word) || "OR".equals(word) || "NOT".equals(word)) { |
| lastBooleanOp = word; |
| continue; |
| } |
| // treat "AND NOT" as "NOT"... |
| if ("AND".equals(nextWord) |
| && original.length() > nextStartIndex + 7 |
| && original.substring(nextStartIndex, nextStartIndex + 7).equals( |
| "AND NOT")) { |
| nextWord = "NOT"; |
| } |
| |
| int flagValue = 0; |
| if (word.charAt(0) == '-' |
| || (startIndex > 0 && original.charAt(startIndex - 1) == '-')) { |
| flagValue = PROHIBITED_TERM_FLAG; |
| } else if (word.charAt(0) == '+' |
| || (startIndex > 0 && original.charAt(startIndex - 1) == '+')) { |
| flagValue = REQUIRED_TERM_FLAG; |
| //we don't know the default operator so just assume the first operator isn't new. |
| } else if (nextWord != null |
| && lastBooleanOp != null |
| && !nextWord.equals(lastBooleanOp) |
| && ("AND".equals(nextWord) || "OR".equals(nextWord) || "NOT".equals(nextWord))) { |
| flagValue = TERM_PRECEDES_NEW_BOOLEAN_OPERATOR_FLAG; |
| //...unless the 1st boolean operator is a NOT, because only AND/OR can be default. |
| } else if (nextWord != null |
| && lastBooleanOp == null |
| && !nextWord.equals(lastBooleanOp) |
| && ("NOT".equals(nextWord))) { |
| flagValue = TERM_PRECEDES_NEW_BOOLEAN_OPERATOR_FLAG; |
| } |
| try { |
| analyze(result, word, startIndex, flagValue); |
| } catch (IOException e) { |
| // TODO: shouldn't we log something? |
| } |
| } |
| if(lastBooleanOp != null) { |
| for(Token t : result) { |
| int f = t.getFlags(); |
| t.setFlags(f |= QueryConverter.TERM_IN_BOOLEAN_QUERY_FLAG); |
| } |
| } |
| return result; |
| } |
| |
| protected void analyze(Collection<Token> result, String text, int offset, int flagsAttValue) throws IOException { |
| TokenStream stream = analyzer.tokenStream("", text); |
| // TODO: support custom attributes |
| CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class); |
| TypeAttribute typeAtt = stream.addAttribute(TypeAttribute.class); |
| PayloadAttribute payloadAtt = stream.addAttribute(PayloadAttribute.class); |
| PositionIncrementAttribute posIncAtt = stream.addAttribute(PositionIncrementAttribute.class); |
| OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class); |
| stream.reset(); |
| while (stream.incrementToken()) { |
| Token token = new Token(); |
| token.copyBuffer(termAtt.buffer(), 0, termAtt.length()); |
| token.setOffset(offset + offsetAtt.startOffset(), |
| offset + offsetAtt.endOffset()); |
| token.setFlags(flagsAttValue); //overwriting any flags already set... |
| token.setType(typeAtt.type()); |
| token.setPayload(payloadAtt.getPayload()); |
| token.setPositionIncrement(posIncAtt.getPositionIncrement()); |
| result.add(token); |
| } |
| stream.end(); |
| stream.close(); |
| } |
| } |
| |