/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search.uhighlight;


| /** |
| * Creates a formatted snippet from the top passages. |
| * <p> |
| * The default implementation marks the query terms as bold, and places |
| * ellipses between unconnected passages. |
| */ |
| public class DefaultPassageFormatter extends PassageFormatter { |
| /** text that will appear before highlighted terms */ |
| protected final String preTag; |
| /** text that will appear after highlighted terms */ |
| protected final String postTag; |
| /** text that will appear between two unconnected passages */ |
| protected final String ellipsis; |
| /** true if we should escape for html */ |
| protected final boolean escape; |
| |
| /** |
| * Creates a new DefaultPassageFormatter with the default tags. |
| */ |
| public DefaultPassageFormatter() { |
| this("<b>", "</b>", "... ", false); |
| } |
| |
| /** |
| * Creates a new DefaultPassageFormatter with custom tags. |
| * |
| * @param preTag text which should appear before a highlighted term. |
| * @param postTag text which should appear after a highlighted term. |
| * @param ellipsis text which should be used to connect two unconnected passages. |
| * @param escape true if text should be html-escaped |
| */ |
| public DefaultPassageFormatter(String preTag, String postTag, String ellipsis, boolean escape) { |
| if (preTag == null || postTag == null || ellipsis == null) { |
| throw new NullPointerException(); |
| } |
| this.preTag = preTag; |
| this.postTag = postTag; |
| this.ellipsis = ellipsis; |
| this.escape = escape; |
| } |
| |
| @Override |
| public String format(Passage passages[], String content) { |
| StringBuilder sb = new StringBuilder(); |
| int pos = 0; |
| for (Passage passage : passages) { |
| // don't add ellipsis if its the first one, or if its connected. |
| if (passage.getStartOffset() > pos && pos > 0) { |
| sb.append(ellipsis); |
| } |
| pos = passage.getStartOffset(); |
| for (int i = 0; i < passage.getNumMatches(); i++) { |
| int start = passage.getMatchStarts()[i]; |
| assert start >= pos && start < passage.getEndOffset(); |
| //append content before this start |
| append(sb, content, pos, start); |
| |
| int end = passage.getMatchEnds()[i]; |
| assert end > start; |
| // its possible to have overlapping terms. |
| // Look ahead to expand 'end' past all overlapping: |
| while (i + 1 < passage.getNumMatches() && passage.getMatchStarts()[i+1] < end) { |
| end = passage.getMatchEnds()[++i]; |
| } |
| end = Math.min(end, passage.getEndOffset()); // in case match straddles past passage |
| |
| sb.append(preTag); |
| append(sb, content, start, end); |
| sb.append(postTag); |
| |
| pos = end; |
| } |
| // its possible a "term" from the analyzer could span a sentence boundary. |
| append(sb, content, pos, Math.max(pos, passage.getEndOffset())); |
| pos = passage.getEndOffset(); |
| } |
| return sb.toString(); |
| } |
| |
| /** |
| * Appends original text to the response. |
| * |
| * @param dest resulting text, possibly transformed or encoded |
| * @param content original text content |
| * @param start index of the first character in content |
| * @param end index of the character following the last character in content |
| */ |
| protected void append(StringBuilder dest, String content, int start, int end) { |
| if (escape) { |
| // note: these are the rules from owasp.org |
| for (int i = start; i < end; i++) { |
| char ch = content.charAt(i); |
| switch (ch) { |
| case '&': |
| dest.append("&"); |
| break; |
| case '<': |
| dest.append("<"); |
| break; |
| case '>': |
| dest.append(">"); |
| break; |
| case '"': |
| dest.append("""); |
| break; |
| case '\'': |
| dest.append("'"); |
| break; |
| case '/': |
| dest.append("/"); |
| break; |
| default: |
| dest.append(ch); |
| } |
| } |
| } else { |
| dest.append(content, start, end); |
| } |
| } |
| } |