blob: 642f98e62e8c3423d26123d750e9556d6f2b0a22 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.maven.index;
import javax.inject.Named;
import javax.inject.Singleton;
import java.io.IOException;
import java.io.StringReader;
import java.util.Locale;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.queryparser.classic.QueryParser.Operator;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.BoostQuery;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.WildcardQuery;
import org.apache.maven.index.context.NexusAnalyzer;
import org.apache.maven.index.creator.JarFileContentsIndexCreator;
import org.apache.maven.index.creator.MinimalArtifactInfoIndexCreator;
import org.apache.maven.index.expr.SearchExpression;
import org.apache.maven.index.expr.SearchTyped;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* The default {@link QueryCreator}, which constructs a Lucene query for the provided query text.
* <p>
* By default, wildcards are added so that the query text matches the beginning of the field value, or the beginning
* of a class/package name segment for the {@link ArtifactInfo#NAMES NAMES} field. This can be controlled by using
* special markers:
* <ul>
* <li>* - any character</li>
* <li>'^' - beginning of the text</li>
* <li>'$' or '&lt;' or ' ' - end of the text</li>
* </ul>
* For example:
* <ul>
* <li>junit - matches junit and junit-foo, but not foo-junit</li>
* <li>*junit - matches junit, junit-foo and foo-junit</li>
* <li>^junit$ - matches junit, but not junit-foo, nor foo-junit</li>
* </ul>
*
* @author Eugene Kuleshov
*/
@Singleton
@Named
public class DefaultQueryCreator implements QueryCreator {
/** Logger obtained for the runtime class of this instance (via {@code getClass()}). */
private final Logger logger = LoggerFactory.getLogger(getClass());

/**
 * Exposes the logger of this component to the class itself and to subclasses.
 *
 * @return the logger created when this instance was constructed
 */
protected Logger getLogger() {
    return this.logger;
}
// ==
/**
 * Picks the {@link IndexerField} of the given field that should serve the given search type.
 * <p>
 * The indexer fields are visited in iteration order and the first one accepted by
 * {@code type.matchesIndexerField(...)} is returned. When none is accepted, the last visited indexer field
 * is returned as a fallback.
 *
 * @param field the field whose indexer fields are inspected
 * @param type the search type used for matching
 * @return the first matching indexer field, otherwise the last indexer field, or {@code null} when the
 *         field exposes no indexer fields at all
 */
public IndexerField selectIndexerField(final Field field, final SearchType type) {
    IndexerField fallback = null;
    for (final IndexerField candidate : field.getIndexerFields()) {
        if (type.matchesIndexerField(candidate)) {
            return candidate;
        }
        // remember the most recently seen field in case nothing matches
        fallback = candidate;
    }
    return fallback;
}
/**
 * Constructs a query for the given field out of a search expression.
 * <p>
 * The search type is taken from the expression when it is a {@link SearchTyped}; every other expression
 * is searched as {@link SearchType#SCORED}.
 *
 * @param field the field to query
 * @param expression the expression supplying the query text (and possibly the search type)
 * @return the query built by {@link #constructQuery(Field, String, SearchType)}
 * @throws ParseException if the query text cannot be parsed
 */
public Query constructQuery(final Field field, final SearchExpression expression) throws ParseException {
    final SearchType searchType = expression instanceof SearchTyped
            ? ((SearchTyped) expression).getSearchType()
            : SearchType.SCORED;
    return constructQuery(field, expression.getStringValue(), searchType);
}
/**
 * Constructs a query for the given field, letting {@link #selectIndexerField(Field, SearchType)} choose the
 * indexer field that is actually queried.
 *
 * @param field the field to query, must not be {@code null}
 * @param query the query text
 * @param type the search type, must not be {@code null}
 * @return the query built by {@link #constructQuery(Field, IndexerField, String, SearchType)}
 * @throws NullPointerException if {@code type} or {@code field} is {@code null}
 * @throws ParseException if the query text cannot be parsed
 */
public Query constructQuery(final Field field, final String query, final SearchType type) throws ParseException {
    // guard clauses: the type is validated before the field
    if (type == null) {
        throw new NullPointerException("Cannot construct query with type of \"null\"!");
    }
    if (field == null) {
        throw new NullPointerException("Cannot construct query for field \"null\"!");
    }
    final IndexerField indexerField = selectIndexerField(field, type);
    return constructQuery(field, indexerField, query, type);
}
/**
 * Constructs a query against a raw Lucene field name.
 * <p>
 * The untokenized keyword fields (groupId, artifactId, version and class names) are always handled by
 * {@link #legacyConstructQuery(String, String)}. Every other field is parsed with a {@link QueryParser}
 * backed by a {@link NexusAnalyzer}; if parsing fails, {@link #legacyConstructQuery(String, String)} is
 * used as a fallback.
 *
 * @param field the Lucene field name (key) to query
 * @param query the query text, must not be {@code null}
 * @return the constructed query, or {@code null} when the legacy construction was used and the query text
 *         was empty
 * @deprecated the {@link Field}-based {@code constructQuery} overloads of this class are the non-deprecated
 *             alternatives
 */
@Deprecated
public Query constructQuery(String field, String query) {
    Query result;
    if (MinimalArtifactInfoIndexCreator.FLD_GROUP_ID_KW.getKey().equals(field)
            || MinimalArtifactInfoIndexCreator.FLD_ARTIFACT_ID_KW.getKey().equals(field)
            || MinimalArtifactInfoIndexCreator.FLD_VERSION_KW.getKey().equals(field)
            || JarFileContentsIndexCreator.FLD_CLASSNAMES_KW.getKey().equals(field)) {
        // these are special untokenized fields, kept for use cases like TreeView (exact matching).
        result = legacyConstructQuery(field, query);
    } else {
        QueryParser qp = new QueryParser(field, new NexusAnalyzer());
        // small cheap trick
        // if a query is not "expert" (does not contain field:val kind of expression)
        // but it contains star and/or punctuation chars, example: "common-log*",
        // lowercase it and turn the punctuation into spaces; "*" is not touched by the replacement.
        // Locale.ROOT keeps the lowercasing independent of the JVM default locale.
        if (!query.contains(":") && query.contains("*") && query.matches(".*(\\.|-|_).*")) {
            query = query.toLowerCase(Locale.ROOT).replaceAll("\\.|-|_", " ");
        }
        try {
            result = qp.parse(query);
        } catch (ParseException e) {
            getLogger()
                    .debug(
                            "Query parsing with \"legacy\" method, we got ParseException from QueryParser: {}",
                            e.getMessage());
            result = legacyConstructQuery(field, query);
        }
    }
    // parameterized logging: safe even when the legacy path returned null
    getLogger().debug("Query parsed as: {}", result);
    return result;
}
// ==
/**
 * Constructs the Lucene query for one concrete indexer field.
 * <p>
 * {@code null} is returned (after a warning is logged, where applicable) when no query can be built:
 * <ul>
 * <li>{@code indexerField} is {@code null} or is not indexed,</li>
 * <li>an {@link SearchType#EXACT} search is requested on a non-keyword field other than
 * {@link JarFileContentsIndexCreator#FLD_CLASSNAMES_KW},</li>
 * <li>the search type is neither {@link SearchType#EXACT} nor {@link SearchType#SCORED}.</li>
 * </ul>
 * A query text equal to {@link Field#NOT_PRESENT} yields a {@code "*"} wildcard query on the indexer field.
 *
 * @param field the ontology field being queried (used for diagnostics only)
 * @param indexerField the indexer field to query, may be {@code null}
 * @param query the query text
 * @param type the search type
 * @return the constructed query, or {@code null} in the cases listed above
 * @throws ParseException if the query text cannot be parsed by the {@link QueryParser}
 */
public Query constructQuery(
        final Field field, final IndexerField indexerField, final String query, final SearchType type)
        throws ParseException {
    if (indexerField == null) {
        getLogger()
                .warn(
                        "Querying for field \"{}\" without any indexer field was tried. "
                                + "Please review your code, and consider adding this field to index!",
                        field);
        return null;
    }
    if (!indexerField.isIndexed()) {
        getLogger()
                .warn(
                        "Querying for non-indexed field {}"
                                + " was tried. Please review your code or consider adding this field to index!",
                        field);
        return null;
    }
    if (Field.NOT_PRESENT.equals(query)) {
        return new WildcardQuery(new Term(indexerField.getKey(), "*"));
    }
    if (SearchType.EXACT.equals(type)) {
        return constructExactQuery(indexerField, query, type);
    } else if (SearchType.SCORED.equals(type)) {
        return constructScoredQuery(indexerField, query, type);
    } else {
        // what search type is this?
        return null;
    }
}

/** Builds the query for an EXACT search; returns {@code null} when the field cannot be searched exactly. */
private Query constructExactQuery(final IndexerField indexerField, final String query, final SearchType type) {
    if (indexerField.isKeyword()) {
        // no tokenization should happen against the field!
        if (query.contains("*") || query.contains("?")) {
            return new WildcardQuery(new Term(indexerField.getKey(), query));
        }
        // exactly what the caller wants
        return new TermQuery(new Term(indexerField.getKey(), query));
    }
    if (indexerField.isStored()) {
        // TODO: resolve this better! Decouple QueryCreator and IndexCreators!
        // This is a hack/workaround here
        if (JarFileContentsIndexCreator.FLD_CLASSNAMES_KW.equals(indexerField)) {
            // lowercased, dots turned into slashes, and a leading slash added unless the input had one
            final String path = query.toLowerCase(Locale.ROOT).replace('.', '/');
            final String termText = query.startsWith("/") ? path : "/" + path;
            return new TermQuery(new Term(indexerField.getKey(), termText));
        }
        getLogger()
                .warn(
                        "{} type of querying for non-keyword (but stored) field {}"
                                + " was tried. Please review your code, or indexCreator involved, "
                                + "since this type of querying of this field is currently unsupported.",
                        type,
                        indexerField.getOntology());
        // will never succeed (unless we supply him "filter" too, but that would kill performance)
        // and is possible with stored fields only
        return null;
    }
    getLogger()
            .warn(
                    "{} type of querying for non-keyword (and not stored) field {}"
                            + " was tried. Please review your code, or indexCreator involved, "
                            + "since this type of querying of this field is impossible.",
                    type,
                    indexerField.getOntology());
    // not a keyword indexerField, nor stored. No hope at all. Impossible even with "filtering"
    return null;
}

/** Builds the query for a SCORED search on a class-names, keyword or tokenized indexer field. */
private Query constructScoredQuery(final IndexerField indexerField, final String query, final SearchType type)
        throws ParseException {
    if (JarFileContentsIndexCreator.FLD_CLASSNAMES.equals(indexerField)) {
        final String classNamesQuery =
                query.toLowerCase(Locale.ROOT).replaceAll("\\.", " ").replaceAll("/", " ");
        // tokenization should happen against the field!
        final QueryParser classNamesParser = new QueryParser(indexerField.getKey(), new NexusAnalyzer());
        classNamesParser.setDefaultOperator(Operator.AND);
        return classNamesParser.parse(classNamesQuery);
    }
    if (indexerField.isKeyword()) {
        // no tokenization should happen against the field!
        if (query.contains("*") || query.contains("?")) {
            return new WildcardQuery(new Term(indexerField.getKey(), query));
        }
        // exact term match, plus a lower-boosted prefix match
        final Term t = new Term(indexerField.getKey(), query);
        return new BooleanQuery.Builder()
                .add(new TermQuery(t), Occur.SHOULD)
                .add(new BoostQuery(new PrefixQuery(t), 0.8f), Occur.SHOULD)
                .build();
    }

    // to save "original" query
    String qpQuery = query;
    // tokenization should happen against the field!
    final QueryParser qp = new QueryParser(indexerField.getKey(), new NexusAnalyzer());
    qp.setDefaultOperator(Operator.AND);
    // small cheap trick
    // if a query is not "expert" (does not contain field:val kind of expression)
    // but it contains star and/or punctuation chars, example: "common-log*"
    // since Lucene does not support multi-terms WITH wildcards.
    // So, here, we "mimic" NexusAnalyzer (this should be fixed!)
    // but do this with PRESERVING original query!
    if (qpQuery.matches(".*(\\.|-|_|/).*")) {
        qpQuery = qpQuery.toLowerCase(Locale.ROOT)
                .replaceAll("\\.|-|_|/", " ")
                // drop stars that ended up standing alone between/around the spaces
                .replaceAll(" \\* ", "")
                .replaceAll("^\\* ", "")
                .replaceAll(" \\*$", "");
    }
    // "fix" it with trailing "*" if not there, but only if it does not end with a space
    if (!qpQuery.endsWith("*") && !qpQuery.endsWith(" ")) {
        qpQuery += "*";
    }

    // a ParseException simply propagates: there is no fallback to the legacy construction anymore
    final BooleanQuery.Builder q1b = new BooleanQuery.Builder().add(qp.parse(qpQuery), Occur.SHOULD);
    if (qpQuery.contains(" ")) {
        q1b.add(qp.parse("\"" + qpQuery + "\""), Occur.SHOULD);
    }
    Query q2 = null;
    final int termCount = countTerms(indexerField, query);
    // try with KW only if the original query does not have spaces!
    if (!query.contains(" ") && termCount > 1) {
        // get the KW field; selectIndexerField may return null, so guard before dereferencing
        final IndexerField keywordField = selectIndexerField(indexerField.getOntology(), SearchType.EXACT);
        if (keywordField != null && keywordField.isKeyword()) {
            q2 = constructQuery(indexerField.getOntology(), keywordField, query, type);
        }
    }
    if (q2 == null) {
        return q1b.build();
    }
    return new BooleanQuery.Builder()
            // trick with order
            .add(q2, Occur.SHOULD)
            .add(q1b.build(), Occur.SHOULD)
            .build();
}
/**
 * Constructs a term, prefix or wildcard query from the "legacy" query syntax.
 * <p>
 * The query text is lowercased. A leading {@code '^'} anchors the match at the beginning of the value;
 * without it (and without a leading {@code '*'}) a leading wildcard is added. A trailing {@code ' '},
 * {@code '<'} or {@code '$'} anchors the match at the end of the value; without it (and without a trailing
 * {@code '*'}) a trailing wildcard is added. For the class-names fields dots are turned into slashes and
 * the pattern is aligned on a {@code '/'} segment boundary.
 *
 * @param field the Lucene field name (key) to query
 * @param query the query text in legacy syntax
 * @return a {@link TermQuery}, {@link PrefixQuery} or {@link WildcardQuery}; {@code null} when the query
 *         text is {@code null}, empty, or consists of the {@code '^'} anchor only
 */
public Query legacyConstructQuery(String field, String query) {
    if (query == null || query.length() == 0) {
        getLogger().info("Empty or null query for field:" + field);
        return null;
    }
    // Locale.ROOT keeps the lowercasing independent of the JVM default locale
    String q = query.toLowerCase(Locale.ROOT);
    final char h = query.charAt(0);
    final boolean anchoredAtStart = h == '^';
    if (anchoredAtStart) {
        q = q.substring(1);
        if (q.isEmpty()) {
            // only the anchor was given: nothing to search for (this used to fail with an index error)
            getLogger().info("Empty or null query for field:" + field);
            return null;
        }
    }
    if (JarFileContentsIndexCreator.FLD_CLASSNAMES_KW.getKey().equals(field)
            || JarFileContentsIndexCreator.FLD_CLASSNAMES.getKey().equals(field)) {
        q = q.replace('.', '/');
        if (anchoredAtStart) {
            if (q.charAt(0) != '/') {
                q = '/' + q;
            }
        } else if (h != '*') {
            q = "*/" + q;
        }
    } else if (!anchoredAtStart && h != '*') {
        q = "*" + q;
    }

    // q is guaranteed to be non-empty here
    final char c = q.charAt(q.length() - 1);
    if (c == ' ' || c == '<' || c == '$') {
        q = q.substring(0, q.length() - 1);
    } else if (c != '*') {
        q += "*";
    }

    final int n = q.indexOf('*');
    if (n == -1) {
        return new TermQuery(new Term(field, q));
    } else if (n > 0 && n == q.length() - 1) {
        // the only wildcard is the trailing one: a prefix query is enough
        return new PrefixQuery(new Term(field, q.substring(0, q.length() - 1)));
    }
    return new WildcardQuery(new Term(field, q));
}
// ==
/** Analyzer used by {@link #countTerms(IndexerField, String)} to tokenize query text. */
private final NexusAnalyzer nexusAnalyzer = new NexusAnalyzer();

/**
 * Counts the tokens the {@link NexusAnalyzer} produces for the query text on the given indexer field.
 *
 * @param indexerField the indexer field whose key is passed to the analyzer as the field name
 * @param query the query text to tokenize
 * @return the number of tokens produced, or {@code 1} if tokenizing failed with an {@link IOException}
 */
protected int countTerms(final IndexerField indexerField, final String query) {
    // try-with-resources guarantees close() even if reset()/incrementToken() throws
    try (TokenStream ts = nexusAnalyzer.tokenStream(indexerField.getKey(), new StringReader(query))) {
        ts.reset();
        int result = 0;
        while (ts.incrementToken()) {
            result++;
        }
        ts.end();
        return result;
    } catch (IOException e) {
        // not expected when reading from an in-memory string; fall back to "one term"
        getLogger().debug("Could not tokenize query text, assuming a single term", e);
        return 1;
    }
}
}