// blob: 67c002f6d9268dec37e5ed800134c1db6e4b374c [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.jackrabbit.oak.query;
import java.util.BitSet;
import java.util.HashSet;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.google.common.base.Splitter;
import com.google.common.collect.ImmutableSet;
import org.apache.jackrabbit.oak.api.PropertyState;
import org.apache.jackrabbit.oak.api.PropertyValue;
import org.apache.jackrabbit.oak.api.Tree;
import org.apache.jackrabbit.oak.api.Type;
import org.apache.jackrabbit.oak.commons.PathUtils;
import org.apache.jackrabbit.oak.query.ast.AndImpl;
import org.apache.jackrabbit.oak.query.ast.ConstraintImpl;
import org.apache.jackrabbit.oak.query.ast.FullTextSearchImpl;
import org.apache.jackrabbit.oak.query.ast.LiteralImpl;
import org.apache.jackrabbit.oak.query.ast.OrImpl;
import org.apache.jackrabbit.oak.plugins.memory.PropertyValues;
import static java.lang.Character.isLetterOrDigit;
import static org.apache.jackrabbit.util.Text.encodeIllegalXMLCharacters;
/**
 * Builds simple HTML excerpts ({@code <div><span>...</span></div>}) from the
 * string properties of a node, optionally wrapping the words that match the
 * query's full-text search terms in {@code <strong>} tags.
 */
class SimpleExcerptProvider {

    /** Column name that requests an excerpt over the node itself. */
    static final String REP_EXCERPT_FN = "rep:excerpt(.)";

    /** Markup that closes every excerpt. */
    static final String EXCERPT_END = "</span></div>";

    /** Markup that opens every excerpt. */
    static final String EXCERPT_BEGIN = "<div><span>";

    /**
     * Highlighting ignores case unless the system property
     * {@code oak.query.caseSensitiveHighlight} is set to {@code true}.
     */
    private static final boolean CASE_SENSITIVE_HIGHLIGHT =
            Boolean.getBoolean("oak.query.caseSensitiveHighlight");

    /** Maximum number of chars kept by the non-highlighting excerpt. */
    private static final int MAX_FRAGMENT_SIZE = 150;

    private SimpleExcerptProvider() {
        // static utility class
    }

    /**
     * Creates the excerpt for the node at the given path.
     *
     * @param path       the path of the node, may be {@code null}
     * @param columnName the excerpt column, e.g. {@code rep:excerpt(.)} or
     *                   {@code rep:excerpt(child/prop)}
     * @param query      the query; used to resolve the tree and to find the
     *                   full-text search terms
     * @param highlight  whether matching terms are wrapped in
     *                   {@code <strong>} tags
     * @return the excerpt markup, or {@code null} if the path is
     *         {@code null}, the tree does not exist, or a child named in the
     *         column's relative path is missing
     */
    static String getExcerpt(String path, String columnName,
            Query query, boolean highlight) {
        if (path == null) {
            return null;
        }
        Tree t = query.getTree(path);
        if (t == null || !t.exists()) {
            return null;
        }
        columnName = extractExcerptProperty(columnName);
        if (columnName != null && columnName.contains("/")) {
            // relative property path: walk down to the tree owning the property
            for (String p : PathUtils.elements(PathUtils.getParentPath(columnName))) {
                if (!t.hasChild(p)) {
                    return null;
                }
                t = t.getChild(p);
            }
            columnName = PathUtils.getName(columnName);
        }

        StringBuilder text = new StringBuilder();
        String separator = "";
        for (PropertyState p : t.getProperties()) {
            boolean isString = p.getType().tag() == Type.STRING.tag();
            boolean selected = columnName == null
                    || columnName.equalsIgnoreCase(p.getName());
            if (isString && selected) {
                // properties are separated by a single space
                text.append(separator);
                separator = " ";
                // NOTE(review): the values of one multi-valued property are
                // concatenated without a separator - confirm this is intended
                for (String v : p.getValue(Type.STRINGS)) {
                    text.append(v);
                }
            }
        }
        if (highlight) {
            return highlight(text, extractFulltext(query));
        }
        return noHighlight(text);
    }

    /**
     * Extracts the property (or relative property path) named between the
     * parentheses of an excerpt column.
     *
     * @param column the column name, may be {@code null}
     * @return the text between the first {@code '('} and the first
     *         {@code ')'}; {@code null} (meaning: use all string properties)
     *         for {@code rep:excerpt(.)}, for a {@code null} column, and for
     *         a column without a usable pair of parentheses
     */
    private static String extractExcerptProperty(String column) {
        // most frequent case first
        if (column == null || REP_EXCERPT_FN.equalsIgnoreCase(column)) {
            return null;
        }
        int begin = column.indexOf('(') + 1;
        int end = column.indexOf(')');
        if (end < begin) {
            // e.g. "rep:excerpt": there is nothing to cut out, and
            // substring() would throw
            return null;
        }
        return column.substring(begin, end);
    }

    /**
     * Collects the full-text search expressions of the query.
     *
     * @return the expressions; empty if the query is not a {@link QueryImpl}
     */
    private static Set<String> extractFulltext(Query q) {
        // TODO instanceof should not be used
        if (q instanceof QueryImpl) {
            return extractFulltext(((QueryImpl) q).getConstraint());
        }
        return ImmutableSet.of();
    }

    /**
     * Collects the literal full-text search expressions found in the
     * constraint, descending into "and" and "or" constraints.
     *
     * @return a new set with the expressions, possibly empty
     */
    private static Set<String> extractFulltext(ConstraintImpl c) {
        Set<String> tokens = new HashSet<String>();
        // TODO instanceof should not be used,
        // as it will break without us noticing if we extend the AST
        if (c instanceof FullTextSearchImpl) {
            FullTextSearchImpl f = (FullTextSearchImpl) c;
            if (f.getFullTextSearchExpression() instanceof LiteralImpl) {
                LiteralImpl l = (LiteralImpl) f.getFullTextSearchExpression();
                tokens.add(l.getLiteralValue().getValue(Type.STRING));
            }
        } else if (c instanceof AndImpl) {
            for (ConstraintImpl constraint : ((AndImpl) c).getConstraints()) {
                tokens.addAll(extractFulltext(constraint));
            }
        } else if (c instanceof OrImpl) {
            for (ConstraintImpl constraint : ((OrImpl) c).getConstraints()) {
                tokens.addAll(extractFulltext(constraint));
            }
        }
        return tokens;
    }

    /**
     * Tokenizes every expression of the set and returns the union of the
     * resulting tokens.
     */
    private static Set<String> tokenize(Set<String> in) {
        Set<String> tokens = new HashSet<String>();
        for (String s : in) {
            tokens.addAll(tokenize(s));
        }
        return tokens;
    }

    /**
     * Splits a search expression into tokens. Tokens are separated by
     * spaces; text enclosed in single or double quotes is kept as one token,
     * including its spaces. The quote characters themselves never become
     * part of a token, and empty tokens are dropped.
     */
    private static Set<String> tokenize(String in) {
        Set<String> out = new HashSet<String>();
        StringBuilder token = new StringBuilder();
        boolean quote = false;
        for (int i = 0; i < in.length(); ) {
            final int c = in.codePointAt(i);
            switch (c) {
            case ' ':
                if (quote) {
                    token.append(' ');
                } else if (token.length() > 0) {
                    out.add(token.toString());
                    token.setLength(0);
                }
                break;
            case '"':
            case '\'':
                if (quote) {
                    quote = false;
                    if (token.length() > 0) {
                        out.add(token.toString());
                        token.setLength(0);
                    }
                } else {
                    quote = true;
                }
                break;
            default:
                token.appendCodePoint(c);
            }
            i += Character.charCount(c);
        }
        if (token.length() > 0) {
            out.add(token.toString());
        }
        return out;
    }

    /**
     * Creates an excerpt without highlighting. Text longer than
     * {@link #MAX_FRAGMENT_SIZE} chars is cut at the last space at or before
     * that position (or at that position if there is no such space) and
     * {@code " ..."} is appended. The given builder is modified.
     */
    private static String noHighlight(StringBuilder text) {
        if (text.length() > MAX_FRAGMENT_SIZE) {
            int cut = text.lastIndexOf(" ", MAX_FRAGMENT_SIZE);
            if (cut == -1) {
                cut = MAX_FRAGMENT_SIZE;
                // do not cut between the two halves of a surrogate pair
                if (Character.isHighSurrogate(text.charAt(cut - 1))
                        && Character.isLowSurrogate(text.charAt(cut))) {
                    cut--;
                }
            }
            text.setLength(cut);
            text.append(" ...");
        }
        StringBuilder excerpt = new StringBuilder(EXCERPT_BEGIN);
        excerpt.append(encodeIllegalXMLCharacters(text.toString()));
        excerpt.append(EXCERPT_END);
        return excerpt.toString();
    }

    /**
     * Creates an excerpt of the complete text in which every word matching
     * one of the search tokens is wrapped in {@code <strong>} tags.
     * <p>
     * Tokens are matched against the raw text, and each highlighted or plain
     * run is XML-escaped on its own. Matching against the escaped text would
     * let a token such as {@code amp} match inside the entity {@code &amp;}
     * and break the markup.
     *
     * @param text        the raw (unescaped) text
     * @param searchToken the search expressions; they are tokenized first
     * @return the excerpt markup
     */
    static String highlight(StringBuilder text, Set<String> searchToken) {
        String raw = text.toString();
        BitSet highlight = new BitSet(raw.length());
        for (String token : tokenize(searchToken)) {
            highlight(raw, highlight, token);
        }
        StringBuilder excerpt = new StringBuilder(EXCERPT_BEGIN);
        int pos = 0;
        while (pos < raw.length()) {
            // [pos, end) is a maximal run of equally flagged chars
            boolean strong = highlight.get(pos);
            int end = strong ? highlight.nextClearBit(pos) : highlight.nextSetBit(pos);
            if (end < 0 || end > raw.length()) {
                end = raw.length();
            }
            if (strong) {
                excerpt.append("<strong>");
            }
            excerpt.append(encodeIllegalXMLCharacters(raw.substring(pos, end)));
            if (strong) {
                excerpt.append("</strong>");
            }
            pos = end;
        }
        excerpt.append(EXCERPT_END);
        return excerpt.toString();
    }

    /**
     * Flags the chars of all whole-word occurrences of the token. An
     * occurrence counts only if it is surrounded by delimiters or the text
     * boundaries. A token ending in {@code '*'} is a prefix match: the
     * occurrence is extended up to the next delimiter. A token consisting
     * only of {@code '*'} is ignored.
     *
     * @param text          the text to search
     * @param highlightBits receives one set bit per char to highlight
     * @param token         the token to look for
     */
    private static void highlight(String text, BitSet highlightBits, String token) {
        boolean isLike = false;
        if (token.endsWith("*")) {
            if (token.length() == 1) {
                // don't highlight the '*' character itself
                return;
            }
            token = token.substring(0, token.length() - 1);
            isLike = true;
        }
        if (token.length() == 0) {
            // an empty token matches everywhere without advancing
            return;
        }
        // compiled once per token rather than once per occurrence
        Matcher matcher = CASE_SENSITIVE_HIGHLIGHT
                ? null : ignoreCasePattern(token).matcher(text);
        int index = 0;
        while (index < text.length()) {
            int endIndex;
            if (matcher == null) {
                index = text.indexOf(token, index);
                endIndex = index + token.length();
            } else if (matcher.find(index)) {
                index = matcher.start();
                endIndex = matcher.end();
            } else {
                index = -1;
                endIndex = -1;
            }
            if (index < 0) {
                break;
            }
            if (isLike) {
                // extend to the end of the word, one code point at a time so
                // that a surrogate pair is never split
                while (endIndex < text.length()) {
                    int cp = text.codePointAt(endIndex);
                    if (isDelimeter(cp)) {
                        break;
                    }
                    endIndex += Character.charCount(cp);
                }
            }
            // the occurrence must start the text or follow a delimiter ...
            boolean isStartOk = index == 0
                    || isDelimeter(text.codePointBefore(index));
            // ... and must end the text or precede a delimiter
            boolean isEndOk = endIndex == text.length()
                    || isDelimeter(text.codePointAt(endIndex));
            if (isStartOk && isEndOk) {
                highlightBits.set(index, endIndex);
            }
            index = endIndex;
        }
    }

    /**
     * Finds the first case-insensitive occurrence of a string. Letters
     * outside US-ASCII are compared case-insensitively as well.
     *
     * @param str       the text to search, may be {@code null}
     * @param searchStr the literal text to find, may be {@code null}
     * @param startPos  the index to start searching at; negative values are
     *                  treated as 0
     * @return the index of the first occurrence at or after
     *         {@code startPos}, or -1 if there is none, if an argument is
     *         {@code null}, or if {@code startPos} is beyond the end of
     *         {@code str}
     */
    public static int indexOfIgnoreCase(String str, String searchStr, int startPos) {
        if (str == null || searchStr == null || startPos > str.length()) {
            return -1;
        }
        // An alternative is apache commons lang StringUtils.indexOfIgnoreCase,
        // but that would require a new dependency
        Matcher matcher = ignoreCasePattern(searchStr).matcher(str);
        if (matcher.find(Math.max(startPos, 0))) {
            return matcher.start();
        }
        return -1;
    }

    /**
     * Compiles a pattern matching the given text literally, ignoring case.
     * UNICODE_CASE is needed because CASE_INSENSITIVE alone only folds
     * US-ASCII letters.
     */
    private static Pattern ignoreCasePattern(String literal) {
        return Pattern.compile(Pattern.quote(literal),
                Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE);
    }

    /**
     * Whether the code point separates words, that is, whether it is
     * neither a letter nor a digit.
     */
    static boolean isDelimeter(int codePoint) {
        return !isLetterOrDigit(codePoint);
    }

    /**
     * Wraps an existing excerpt value in the excerpt markup. The string form
     * of the value is split at commas; the parts are trimmed, empty parts are
     * dropped, and the remaining parts are concatenated without a separator.
     * The parts are not XML-escaped here.
     *
     * @param value the excerpt value
     * @return a string property value holding the excerpt markup
     */
    static PropertyValue getExcerpt(PropertyValue value) {
        Splitter listSplitter = Splitter.on(',').trimResults().omitEmptyStrings();
        StringBuilder excerpt = new StringBuilder(EXCERPT_BEGIN);
        for (String v : listSplitter.splitToList(value.toString())) {
            excerpt.append(v);
        }
        excerpt.append(EXCERPT_END);
        return PropertyValues.newString(excerpt.toString());
    }
}