blob: 701197aa9804e85d16054de7582bd34a7cf445af [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.uhighlight;
/**
* Creates a formatted snippet from the top passages.
* <p>
* The default implementation marks the query terms as bold, and places
* ellipses between unconnected passages.
*/
public class DefaultPassageFormatter extends PassageFormatter {
/** text that will appear before highlighted terms */
protected final String preTag;
/** text that will appear after highlighted terms */
protected final String postTag;
/** text that will appear between two unconnected passages */
protected final String ellipsis;
/** true if we should escape for html */
protected final boolean escape;
/**
* Creates a new DefaultPassageFormatter with the default tags.
*/
public DefaultPassageFormatter() {
this("<b>", "</b>", "... ", false);
}
/**
* Creates a new DefaultPassageFormatter with custom tags.
*
* @param preTag text which should appear before a highlighted term.
* @param postTag text which should appear after a highlighted term.
* @param ellipsis text which should be used to connect two unconnected passages.
* @param escape true if text should be html-escaped
*/
public DefaultPassageFormatter(String preTag, String postTag, String ellipsis, boolean escape) {
if (preTag == null || postTag == null || ellipsis == null) {
throw new NullPointerException();
}
this.preTag = preTag;
this.postTag = postTag;
this.ellipsis = ellipsis;
this.escape = escape;
}
@Override
public String format(Passage passages[], String content) {
StringBuilder sb = new StringBuilder();
int pos = 0;
for (Passage passage : passages) {
// don't add ellipsis if its the first one, or if its connected.
if (passage.getStartOffset() > pos && pos > 0) {
sb.append(ellipsis);
}
pos = passage.getStartOffset();
for (int i = 0; i < passage.getNumMatches(); i++) {
int start = passage.getMatchStarts()[i];
assert start >= pos && start < passage.getEndOffset();
//append content before this start
append(sb, content, pos, start);
int end = passage.getMatchEnds()[i];
assert end > start;
// its possible to have overlapping terms.
// Look ahead to expand 'end' past all overlapping:
while (i + 1 < passage.getNumMatches() && passage.getMatchStarts()[i+1] < end) {
end = passage.getMatchEnds()[++i];
}
end = Math.min(end, passage.getEndOffset()); // in case match straddles past passage
sb.append(preTag);
append(sb, content, start, end);
sb.append(postTag);
pos = end;
}
// its possible a "term" from the analyzer could span a sentence boundary.
append(sb, content, pos, Math.max(pos, passage.getEndOffset()));
pos = passage.getEndOffset();
}
return sb.toString();
}
/**
* Appends original text to the response.
*
* @param dest resulting text, possibly transformed or encoded
* @param content original text content
* @param start index of the first character in content
* @param end index of the character following the last character in content
*/
protected void append(StringBuilder dest, String content, int start, int end) {
if (escape) {
// note: these are the rules from owasp.org
for (int i = start; i < end; i++) {
char ch = content.charAt(i);
switch (ch) {
case '&':
dest.append("&amp;");
break;
case '<':
dest.append("&lt;");
break;
case '>':
dest.append("&gt;");
break;
case '"':
dest.append("&quot;");
break;
case '\'':
dest.append("&#x27;");
break;
case '/':
dest.append("&#x2F;");
break;
default:
dest.append(ch);
}
}
} else {
dest.append(content, start, end);
}
}
}