lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/DefaultPassageFormatter.java - lucene-solr - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.lucene.search.uhighlight;


 /**
  * Creates a formatted snippet from the top passages.
  * <p>
  * The default implementation marks the query terms as bold, and places
  * ellipses between unconnected passages.
  * <p>
  * Overlapping or nested matches inside a passage are merged into a single
  * highlighted region, and a match that runs past the end of its passage is
  * clipped at the passage end.
  */
 public class DefaultPassageFormatter extends PassageFormatter {
   /** text that will appear before highlighted terms */
   protected final String preTag;
   /** text that will appear after highlighted terms */
   protected final String postTag;
   /** text that will appear between two unconnected passages */
   protected final String ellipsis;
   /** true if we should escape for html */
   protected final boolean escape;

   /**
    * Creates a new DefaultPassageFormatter with the default tags:
    * {@code <b>} / {@code </b>} around matches, {@code "... "} as ellipsis,
    * and no html escaping.
    */
   public DefaultPassageFormatter() {
     this("<b>", "</b>", "... ", false);
   }

   /**
    * Creates a new DefaultPassageFormatter with custom tags.
    *
    * @param preTag   text which should appear before a highlighted term.
    * @param postTag  text which should appear after a highlighted term.
    * @param ellipsis text which should be used to connect two unconnected passages.
    * @param escape   true if text should be html-escaped
    * @throws NullPointerException if {@code preTag}, {@code postTag} or {@code ellipsis} is null
    */
   public DefaultPassageFormatter(String preTag, String postTag, String ellipsis, boolean escape) {
     // Fully-qualified on purpose: this file has no import section to extend.
     this.preTag = java.util.Objects.requireNonNull(preTag, "preTag");
     this.postTag = java.util.Objects.requireNonNull(postTag, "postTag");
     this.ellipsis = java.util.Objects.requireNonNull(ellipsis, "ellipsis");
     this.escape = escape;
   }

   /**
    * Formats the given passages of {@code content} into a single snippet.
    * <p>
    * Each passage contributes the text between its start and end offset, with
    * every match wrapped in {@link #preTag} and {@link #postTag}. The
    * {@link #ellipsis} is emitted before a passage whose start offset lies
    * beyond the (non-zero) end offset of the previously formatted passage.
    *
    * @param passages passages to format; presumably ordered by start offset,
    *                 since only the previous passage is consulted for the ellipsis
    * @param content  original text that the passage and match offsets refer to
    * @return the formatted snippet
    */
   @Override
   public String format(Passage[] passages, String content) {
     StringBuilder sb = new StringBuilder();
     int pos = 0;
     for (Passage passage : passages) {
       final int passageStart = passage.getStartOffset();
       final int passageEnd = passage.getEndOffset();
       // don't add ellipsis if it's the first one, or if it's connected.
       if (passageStart > pos && pos > 0) {
         sb.append(ellipsis);
       }
       pos = passageStart;

       final int numMatches = passage.getNumMatches();
       final int[] matchStarts = passage.getMatchStarts();
       final int[] matchEnds = passage.getMatchEnds();
       for (int i = 0; i < numMatches; i++) {
         int start = matchStarts[i];
         assert start >= pos && start < passageEnd;
         // append content before this start
         append(sb, content, pos, start);

         int end = matchEnds[i];
         assert end > start;
         // it's possible to have overlapping terms.
         //   Look ahead to expand 'end' past all overlapping. Math.max keeps a
         //   nested match (one that ends earlier than the current region) from
         //   shrinking the highlighted region.
         while (i + 1 < numMatches && matchStarts[i + 1] < end) {
           end = Math.max(end, matchEnds[++i]);
         }
         end = Math.min(end, passageEnd); // in case match straddles past passage

         sb.append(preTag);
         append(sb, content, start, end);
         sb.append(postTag);

         pos = end;
       }
       // it's possible a "term" from the analyzer could span a sentence boundary,
       // so never let the tail range run backwards.
       append(sb, content, pos, Math.max(pos, passageEnd));
       pos = passageEnd;
     }
     return sb.toString();
   }

   /**
    * Appends original text to the response.
    * <p>
    * When {@link #escape} is true, the characters {@code & < > " ' /} are
    * replaced by html entities; every other character is copied unchanged.
    * When it is false, the range is copied verbatim.
    *
    * @param dest    resulting text, possibly transformed or encoded
    * @param content original text content
    * @param start   index of the first character in content
    * @param end     index of the character following the last character in content
    */
   protected void append(StringBuilder dest, String content, int start, int end) {
     if (escape) {
       // note: these are the rules from owasp.org
       for (int i = start; i < end; i++) {
         char ch = content.charAt(i);
         switch (ch) {
           case '&':
             dest.append("&amp;");
             break;
           case '<':
             dest.append("&lt;");
             break;
           case '>':
             dest.append("&gt;");
             break;
           case '"':
             dest.append("&quot;");
             break;
           case '\'':
             dest.append("&#x27;");
             break;
           case '/':
             dest.append("&#x2F;");
             break;
           default:
             dest.append(ch);
         }
       }
     } else {
       dest.append(content, start, end);
     }
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.lucene.search.uhighlight;


	/**
	* Creates a formatted snippet from the top passages.
	* <p>
	* The default implementation marks the query terms as bold, and places
	* ellipses between unconnected passages.
	*/
	public class DefaultPassageFormatter extends PassageFormatter {
	/** text that will appear before highlighted terms */
	protected final String preTag;
	/** text that will appear after highlighted terms */
	protected final String postTag;
	/** text that will appear between two unconnected passages */
	protected final String ellipsis;
	/** true if we should escape for html */
	protected final boolean escape;

	/**
	* Creates a new DefaultPassageFormatter with the default tags.
	*/
	public DefaultPassageFormatter() {
	this("<b>", "</b>", "... ", false);
	}

	/**
	* Creates a new DefaultPassageFormatter with custom tags.
	*
	* @param preTag text which should appear before a highlighted term.
	* @param postTag text which should appear after a highlighted term.
	* @param ellipsis text which should be used to connect two unconnected passages.
	* @param escape true if text should be html-escaped
	*/
	public DefaultPassageFormatter(String preTag, String postTag, String ellipsis, boolean escape) {
	if (preTag == null \|\| postTag == null \|\| ellipsis == null) {
	throw new NullPointerException();
	}
	this.preTag = preTag;
	this.postTag = postTag;
	this.ellipsis = ellipsis;
	this.escape = escape;
	}

	@Override
	public String format(Passage passages[], String content) {
	StringBuilder sb = new StringBuilder();
	int pos = 0;
	for (Passage passage : passages) {
	// don't add ellipsis if its the first one, or if its connected.
	if (passage.getStartOffset() > pos && pos > 0) {
	sb.append(ellipsis);
	}
	pos = passage.getStartOffset();
	for (int i = 0; i < passage.getNumMatches(); i++) {
	int start = passage.getMatchStarts()[i];
	assert start >= pos && start < passage.getEndOffset();
	//append content before this start
	append(sb, content, pos, start);

	int end = passage.getMatchEnds()[i];
	assert end > start;
	// its possible to have overlapping terms.
	// Look ahead to expand 'end' past all overlapping:
	while (i + 1 < passage.getNumMatches() && passage.getMatchStarts()[i+1] < end) {
	end = passage.getMatchEnds()[++i];
	}
	end = Math.min(end, passage.getEndOffset()); // in case match straddles past passage

	sb.append(preTag);
	append(sb, content, start, end);
	sb.append(postTag);

	pos = end;
	}
	// its possible a "term" from the analyzer could span a sentence boundary.
	append(sb, content, pos, Math.max(pos, passage.getEndOffset()));
	pos = passage.getEndOffset();
	}
	return sb.toString();
	}

	/**
	* Appends original text to the response.
	*
	* @param dest resulting text, possibly transformed or encoded
	* @param content original text content
	* @param start index of the first character in content
	* @param end index of the character following the last character in content
	*/
	protected void append(StringBuilder dest, String content, int start, int end) {
	if (escape) {
	// note: these are the rules from owasp.org
	for (int i = start; i < end; i++) {
	char ch = content.charAt(i);
	switch (ch) {
	case '&':
	dest.append("&");
	break;
	case '<':
	dest.append("<");
	break;
	case '>':
	dest.append(">");
	break;
	case '"':
	dest.append(""");
	break;
	case '\'':
	dest.append("'");
	break;
	case '/':
	dest.append("/");
	break;
	default:
	dest.append(ch);
	}
	}
	} else {
	dest.append(content, start, end);
	}
	}
	}