app/src/main/java/org/apache/roller/weblogger/util/HTMLSanitizer.java - roller - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  The ASF licenses this file to You
  * under the Apache License, Version 2.0 (the "License"); you may not
  * use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.  For additional information regarding
  * copyright in this work, please see the NOTICE file in the top level
  * directory of this distribution.
  */

 /**
  Copyright (c) 2009 Open Lab, http://www.open-lab.com/
  Permission is hereby granted, free of charge, to any person obtaining
  a copy of this software and associated documentation files (the
  "Software"), to deal in the Software without restriction, including
  without limitation the rights to use, copy, modify, merge, publish,
  distribute, sublicense, and/or sell copies of the Software, and to
  permit persons to whom the Software is furnished to do so, subject to
  the following conditions:

  The above copyright notice and this permission notice shall be
  included in all copies or substantial portions of the Software.

  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  */

 package org.apache.roller.weblogger.util;

 import java.util.ArrayList;
 import java.util.List;
 import java.util.Stack;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;

 import org.apache.commons.validator.UrlValidator;
 import org.apache.roller.weblogger.config.WebloggerConfig;

 public class HTMLSanitizer {
     /**
      * Switch read by {@link #conditionallySanitize(String)}; initialised from the
      * "weblogAdminsUntrusted" configuration property (default false).
      */
     public static Boolean xssEnabled = WebloggerConfig.getBooleanProperty("weblogAdminsUntrusted", Boolean.FALSE);

     /** Tag names that are always removed; the sanitizer tests this list before {@link #allowedTags}. */
     public static Pattern forbiddenTags = Pattern.compile("^(script|object|embed|link|style|form|input)$");
     /** Tag names that are kept once their attributes have been filtered. */
     public static Pattern allowedTags = Pattern.compile("^(b|p|i|s|a|img|table|thead|tbody|tfoot|tr|th|td|dd|dl|dt|em|h1|h2|h3|h4|h5|h6|li|ul|ol|span|div|strike|strong|"
             + "sub|sup|pre|del|code|blockquote|strike|kbd|br|hr|area|map|object|embed|param|link|form|small|big)$");
     // <!--.........>
     private static final Pattern commentPattern = Pattern.compile("<!--.*");
     // <tag ....props.....>   group 1 = tag name, group 2 = attribute text
     private static final Pattern tagStartPattern = Pattern.compile("<(?i)(\\w+\\b)\\s*(.*)/?>$");
     // </tag .........>
     private static final Pattern tagClosePattern = Pattern.compile("</(?i)(\\w+\\b)\\s*>$");
     // tags that never get a closing tag pushed on the open-tag stack
     private static final Pattern standAloneTags = Pattern.compile("^(img|br|hr)$");
     // <tag ... />
     private static final Pattern selfClosed = Pattern.compile("<.+/>");
     // prop="...."   group 1 = attribute name, group 2 = double-quoted value
     private static final Pattern attributesPattern = Pattern.compile("(\\w*)\\s*=\\s*\"([^\"]*)\"");
     // color:red;    group 1 = property name, group 2 = property value
     private static final Pattern stylePattern = Pattern.compile("([^\\s^:]+)\\s*:\\s*([^;]+);?");
     // url('....')"  group 1 = the quoted url
     private static final Pattern urlStylePattern = Pattern.compile("(?i).*\\b\\s*url\\s*\\(['\"]([^)]*)['\"]\\)");
     // expression(....)"   thanks to Ben Summer
     // CSS function names are case-insensitive, so "EXPRESSION(" must be rejected as well.
     private static final Pattern forbiddenStylePattern = Pattern.compile("(?i)(?:(expression|eval|javascript))\\s*\\(");

     /**
      * This method should be used to test input.
      *
      * @param html
      * @return true if the input is "valid"
      */
     public static boolean isSanitized(String html) {
         // Run the full sanitizer and report only its validity flag.
         SanitizeResult outcome = sanitizer(html);
         return outcome.isValid;
     }

     /**
      * Cleans the given html before it is output in any html page
      *
      * @param html
      * @return sanitized html
      */
     public static String sanitize(String html) {
         // Only the encoded, printable markup of the result is of interest here.
         SanitizeResult outcome = sanitizer(html);
         return outcome.html;
     }

     /**
      * Sanitizes the given html only when {@link #xssEnabled} is set.
      *
      * @param ret html to clean, may be null
      * @return the sanitized html, or the argument unchanged when sanitizing
      *         is switched off or the argument is null
      */
     public static String conditionallySanitize(String ret) {
         // Nothing to do when sanitizing is disabled or there is no input.
         if (!xssEnabled || ret == null) {
             return ret;
         }
         return HTMLSanitizer.sanitize(ret);
     }

     /**
      * Used to get the text,  tags removed or encoded
      *
      * @param html
      * @return sanitized text
      */
     public static String getText(String html) {
         // The text view of the result holds the encoded, non-accepted tokens.
         SanitizeResult outcome = sanitizer(html);
         return outcome.text;
     }

     /**
      * This is the main method of sanitizing. It will be used both for validation and cleaning
      *
      * @param html
      * @return a SanitizeResult object
      */
     public static SanitizeResult sanitizer(String html) {
         // Delegate to the configurable overload with the class-wide tag lists.
         SanitizeResult outcome = sanitizer(html, allowedTags, forbiddenTags);
         return outcome;
     }

     /**
      * Sanitizes html using the supplied tag lists.
      *
      * The input is split into tokens (comments, tags, text) and each token is handled in turn:
      * comments, forbidden tags and unknown tags are recorded in invalidTags and kept out of
      * the html output; allowed open tags are rebuilt from their filtered attributes; allowed
      * close tags close every tag opened since the matching open tag; everything else is
      * treated as text and entity-encoded. Tags still open at the end are closed.
      *
      * @param html          the markup to sanitize
      * @param allowedTags   pattern matched against the lower-cased tag name; matching tags are kept
      * @param forbiddenTags pattern matched against the lower-cased tag name before allowedTags;
      *                      matching tags are dropped
      * @return the html, text and val views of the input plus the list of rejected tags
      */
     public static SanitizeResult sanitizer(String html, Pattern allowedTags, Pattern forbiddenTags) {
         SanitizeResult ret = new SanitizeResult();
         // names of accepted tags that still wait for their closing tag
         Stack<String> openTags = new Stack<String>();


         List<String> tokens = tokenize(html);

         // -------------------   LOOP for every token --------------------------
         for (String token : tokens) {
             // false means the token is entity-encoded before it is appended to html
             boolean isAcceptedToken = false;

             Matcher startMatcher = tagStartPattern.matcher(token);
             Matcher endMatcher = tagClosePattern.matcher(token);


             //--------------------------------------------------------------------------------  COMMENT    <!-- ......... -->
             // comments go to val only (terminated with "-->" if needed) and count as invalid
             if (commentPattern.matcher(token).find()) {
                 ret.val = ret.val + token + (token.endsWith("-->") ? "" : "-->");
                 ret.invalidTags.add(token + (token.endsWith("-->") ? "" : "-->"));
                 continue;


                 //--------------------------------------------------------------------------------  OPEN TAG    <tag .........>
             } else if (startMatcher.find()) {

                 //tag name extraction
                 String tag = startMatcher.group(1).toLowerCase();


                 //-----------------------------------------------------  FORBIDDEN TAG   <script .........>
                 // dropped from html, text and val alike
                 if (forbiddenTags.matcher(tag).find()) {
                     ret.invalidTags.add("<" + tag + ">");
                     continue;


                     // --------------------------------------------------  WELL KNOWN TAG
                 } else if (allowedTags.matcher(tag).find()) {


                     // the tag is rebuilt from scratch; only vetted attributes are appended
                     String cleanToken = "<" + tag;
                     String tokenBody = startMatcher.group(2);


                     //first test table consistency
                     //table tbody tfoot thead th tr td
                     // Stack.search returns a 1-based position or -1, so "< 1" means "not open"
                     if ("thead".equals(tag) || "tbody".equals(tag) || "tfoot".equals(tag) || "tr".equals(tag)) {
                         if (openTags.search("table") < 1) {
                             ret.invalidTags.add("<" + tag + ">");
                             continue;
                         }
                     } else if (("td".equals(tag) || "th".equals(tag)) && openTags.search("tr") < 1) {
                         ret.invalidTags.add("<" + tag + ">");
                         continue;
                     }


                     // then test properties
                     Matcher attributes = attributesPattern.matcher(tokenBody);

                     // URL flag: set once a valid href/src was seen, required for a, img and embed
                     boolean foundURL = false;
                     while (attributes.find()) {

                         String attr = attributes.group(1).toLowerCase();
                         String val = attributes.group(2);

                         // we will accept href in case of <A>
                         // <a href="......">
                         if ("a".equals(tag) && "href".equals(attr)) {
                             String[] customSchemes = {"http", "https"};
                             if (new UrlValidator(customSchemes).isValid(val)) {
                                 foundURL = true;
                             } else {
                                 // may be it is a mailto?
                                 // case <a href="mailto:pippo@pippo.com?subject=...."
                                 if (val.toLowerCase().startsWith("mailto:") && val.indexOf('@') >= 0) {
                                     // validate what follows the '@' by treating it as an http host
                                     String val1 = "http://www." + val.substring(val.indexOf('@') + 1);
                                     if (new UrlValidator(customSchemes).isValid(val1)) {
                                         foundURL = true;
                                     } else {
                                         ret.invalidTags.add(attr + " " + val);
                                         val = "";
                                     }
                                 } else {
                                     ret.invalidTags.add(attr + " " + val);
                                     val = "";
                                 }
                             }

                         } else if (tag.matches("img|embed") && "src".equals(attr)) {
                             // <img src="......">
                             String[] customSchemes = {"http", "https"};
                             if (new UrlValidator(customSchemes).isValid(val)) {
                                 foundURL = true;
                             } else {
                                 ret.invalidTags.add(attr + " " + val);
                                 val = "";
                             }
                         } else if ("href".equals(attr) || "src".equals(attr)) {
                             // <tag src/href="......">   skipped
                             // "continue" bypasses the append below, so the attribute is dropped
                             ret.invalidTags.add(tag + " " + attr + " " + val);
                             continue;
                         } else if (attr.matches("width|height")) {
                             // <tag width/height="......">
                             if (!val.toLowerCase().matches("\\d+%|\\d+$")) {
                                 // test numeric values
                                 ret.invalidTags.add(tag + " " + attr + " " + val);
                                 continue;
                             }

                         } else if ("style".equals(attr)) {
                             // <tag style="......">
                             // then test properties
                             Matcher styles = stylePattern.matcher(val);
                             // only declarations that pass both checks are copied over
                             String cleanStyle = "";

                             while (styles.find()) {
                                 String styleName = styles.group(1).toLowerCase();
                                 String styleValue = styles.group(2);

                                 // suppress invalid styles values
                                 if (forbiddenStylePattern.matcher(styleValue).find()) {
                                     ret.invalidTags.add(tag + " " + attr + " " + styleValue);
                                     continue;
                                 }

                                 // check if valid url
                                 Matcher urlStyleMatcher = urlStylePattern.matcher(styleValue);
                                 if (urlStyleMatcher.find()) {
                                     String[] customSchemes = {"http", "https"};
                                     String url = urlStyleMatcher.group(1);
                                     if (!new UrlValidator(customSchemes).isValid(url)) {
                                         ret.invalidTags.add(tag + " " + attr + " " + styleValue);
                                         continue;
                                     }
                                 }

                                 cleanStyle = cleanStyle + styleName + ":" + encode(styleValue) + ";";

                             }
                             val = cleanStyle;

                         } else if (attr.startsWith("on")) {
                             // skip all javascript events
                             ret.invalidTags.add(tag + " " + attr + " " + val);
                             continue;

                         } else {
                             // by default encode all properties
                             val = encode(val);
                         }

                         cleanToken = cleanToken + " " + attr + "=\"" + val + "\"";
                     }
                     cleanToken = cleanToken + ">";

                     isAcceptedToken = true;

                     // for <img> and <a>
                     // without a valid URL these tags are discarded entirely
                     if (tag.matches("a|img|embed") && !foundURL) {
                         isAcceptedToken = false;
                         cleanToken = "";
                     }

                     token = cleanToken;


                     // push the tag if require closure and it is accepted (otherwise is encoded)
                     // NOTE(review): tag is a bare name captured by \w+, so selfClosed ("<.+/>")
                     // cannot match it here - confirm whether the original token was intended
                     if (isAcceptedToken && !(standAloneTags.matcher(tag).find() || selfClosed.matcher(tag).find())) {
                         openTags.push(tag);
                     }

                     // --------------------------------------------------------------------------------  UNKNOWN TAG
                     // neither forbidden nor allowed: kept verbatim in val, absent from html and text
                 } else {
                     ret.invalidTags.add(token);
                     ret.val = ret.val + token;
                     continue;


                 }

                 // --------------------------------------------------------------------------------  CLOSE TAG </tag>
             } else if (endMatcher.find()) {
                 String tag = endMatcher.group(1).toLowerCase();

                 //is self closing
                 if (selfClosed.matcher(tag).find()) {
                     ret.invalidTags.add(token);
                     continue;
                 }
                 if (forbiddenTags.matcher(tag).find()) {
                     ret.invalidTags.add("/" + tag);
                     continue;
                 }
                 if (!allowedTags.matcher(tag).find()) {
                     ret.invalidTags.add(token);
                     ret.val = ret.val + token;
                     continue;
                 } else {


                     String cleanToken = "";

                     // check tag position in the stack
                     // search yields -1 when the tag was never opened; the loop is then skipped
                     // and the closing tag is replaced by an empty token
                     int pos = openTags.search(tag);
                     // if found on top ok
                     for (int i = 1; i <= pos; i++) {
                         //pop all elements before tag and close it
                         String poppedTag = openTags.pop();
                         cleanToken = cleanToken + "</" + poppedTag + ">";
                         isAcceptedToken = true;
                     }

                     token = cleanToken;
                 }

             }

             // reached by accepted tags, emptied tags and plain text
             ret.val = ret.val + token;

             if (isAcceptedToken) {
                 ret.html = ret.html + token;
                 //ret.text = ret.text + " ";
             } else {
                 String sanToken = htmlEncodeApexesAndTags(token);
                 ret.html = ret.html + sanToken;
                 ret.text = ret.text + htmlEncodeApexesAndTags(removeLineFeed(token));
             }


         }

         // must close remaining tags
         while (openTags.size() > 0) {
             //pop all elements before tag and close it
             String poppedTag = openTags.pop();
             ret.html = ret.html + "</" + poppedTag + ">";
             ret.val = ret.val + "</" + poppedTag + ">";
         }

         //set boolean value
         ret.isValid = ret.invalidTags.size() == 0;

         return ret;
     }

     /**
      * Splits html tag and tag content <......>.
      *
      * @param html
      * @return a list of token
      */
     private static List<String> tokenize(String html) {
         List<String> tokens = new ArrayList<String>();
         // A null document has no tokens; this avoids a NullPointerException below.
         if (html == null) {
             return tokens;
         }

         // Plain text between two pieces of markup is collected here.
         StringBuilder text = new StringBuilder();
         int len = html.length();
         int pos = 0;
         while (pos < len) {
             char c = html.charAt(pos);
             if (c != '<') {
                 text.append(c);
                 pos++;
                 continue;
             }

             // Markup starts: the text gathered so far becomes a token of its own.
             if (text.length() > 0) {
                 tokens.add(text.toString());
                 text.setLength(0);
             }

             // A comment runs up to "-->", any other markup up to the next ">".
             // Unterminated markup takes the rest of the input.
             String marker = html.startsWith("<!--", pos) ? "-->" : ">";
             int end = moveToMarkerEnd(pos, marker, html);
             tokens.add(html.substring(pos, end));
             pos = end;
         }

         // store the trailing text, if any
         if (text.length() > 0) {
             tokens.add(text.toString());
         }

         return tokens;
     }

     /**
      * Finds the position just past the next occurrence of a marker.
      *
      * @param pos    index at which the search starts
      * @param marker text to look for
      * @param s      string to search in
      * @return the index following the marker, or the length of s when the marker is absent
      */
     private static int moveToMarkerEnd(int pos, String marker, String s) {
         int found = s.indexOf(marker, pos);
         return found < 0 ? s.length() : found + marker.length();
     }

     /**
      * Contains the sanitizing results.
      * html is the sanitized, encoded html, ready to be printed. Unaccepted tags are encoded and text inside tags is always encoded. MUST BE USED WHEN PRINTING HTML
      * text is the text inside valid tags; invalid tags it contains are encoded. SHOULD BE USED TO PRINT EXCERPTS
      * val is the html source cleaned of unaccepted tags. It is not encoded. SHOULD BE USED IN SAVE ACTIONS
      * isValid is true when every tag was accepted without forced encoding
      * invalidTags is the list of tags that were encoded or removed
      */
     static class SanitizeResult {

         // Output for printing: accepted tags as rebuilt by the sanitizer, other tokens entity-encoded.
         public String html = "";
         // Entity-encoded, line-feed-free copy of every token that was not accepted as a tag.
         public String text = "";
         // Unencoded output: forbidden tags removed, comments and unknown tags kept.
         public String val = "";
         // Set by the sanitizer at the end of a run: true when invalidTags is empty.
         public boolean isValid = true;
         // Every tag or attribute the sanitizer rejected, in the order encountered.
         public List<String> invalidTags = new ArrayList<String>();
     }

     /**
      * Entity-encodes quotes and angle brackets, then turns line feeds into &lt;br&gt;.
      *
      * @param s text to encode; null is treated as the empty string
      * @return the encoded text, never null
      */
     public static String encode(String s) {
         String safe = (s == null) ? "" : s;
         String encoded = htmlEncodeApexesAndTags(safe);
         return convertLineFeedToBR(encoded);
     }

     /**
      * Encodes quotes first and angle brackets second.
      *
      * @param source text to encode, may be null
      * @return the encoded text, or null for null input
      */
     public static final String htmlEncodeApexesAndTags(String source) {
         String quotesEncoded = htmlEncodeApexes(source);
         return htmlEncodeTag(quotesEncoded);
     }

     /**
      * Replaces double and single quotes with their html entities.
      *
      * @param source text to encode, may be null
      * @return the encoded text, or null for null input
      */
     public static final String htmlEncodeApexes(String source) {
         if (source == null) {
             return null;
         }
         String[] quotes = {"\"", "'"};
         String[] entities = {"&quot;", "&#39;"};
         return replaceAllNoRegex(source, quotes, entities);
     }

     /**
      * Replaces angle brackets with their html entities.
      *
      * @param source text to encode, may be null
      * @return the encoded text, or null for null input
      */
     public static final String htmlEncodeTag(String source) {
         if (source == null) {
             return null;
         }
         String[] brackets = {"<", ">"};
         String[] entities = {"&lt;", "&gt;"};
         return replaceAllNoRegex(source, brackets, entities);
     }

     /**
      * Turns line feeds and form feeds into &lt;br&gt; and carriage returns into a space.
      *
      * @param text text to convert, may be null
      * @return the converted text, or null for null input
      */
     public static String convertLineFeedToBR(String text) {
         if (text == null) {
             return null;
         }
         String[] breaks = {"\n", "\f", "\r"};
         String[] substitutes = {"<br>", "<br>", " "};
         return replaceAllNoRegex(text, breaks, substitutes);
     }

     /**
      * Replaces line feeds, form feeds and carriage returns with a single space each.
      *
      * @param text text to convert, may be null
      * @return the converted text, or null for null input
      */
     public static String removeLineFeed(String text) {
         if (text == null) {
             return null;
         }
         String[] breaks = {"\n", "\f", "\r"};
         String[] blanks = {" ", " ", " "};
         return replaceAllNoRegex(text, breaks, blanks);
     }

     /**
      * Applies a series of literal replacements one after the other.
      *
      * @param source   text to work on
      * @param searches literals to look for, processed in array order
      * @param replaces replacement for the search literal at the same index
      * @return the text after all replacements have been applied
      */
     public static final String replaceAllNoRegex(String source, String searches[], String replaces[]) {
         String result = source;
         int index = 0;
         // each pass works on the output of the previous one
         while (index < searches.length) {
             result = replaceAllNoRegex(result, searches[index], replaces[index]);
             index++;
         }
         return result;
     }

     /**
      * Replaces every occurrence of a literal string, without regular expressions.
      *
      * @param source  text to work on; null yields the empty string
      * @param search  literal to look for; an empty literal leaves the source untouched
      * @param replace text inserted in place of each occurrence
      * @return the text with all occurrences replaced
      */
     public static final String replaceAllNoRegex(String source, String search, String replace) {
         if (source == null) {
             return new String(new StringBuilder());
         }
         if (search.length() == 0) {
             return source;
         }

         StringBuilder out = new StringBuilder();
         int from = 0;
         int hit = source.indexOf(search, from);
         while (hit != -1) {
             // copy the untouched stretch, then the replacement
             out.append(source.substring(from, hit));
             out.append(replace);
             from = hit + search.length();
             hit = source.indexOf(search, from);
         }
         // whatever follows the last occurrence
         if (from < source.length()) {
             out.append(source.substring(from));
         }
         return out.toString();
     }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. The ASF licenses this file to You
	* under the Apache License, Version 2.0 (the "License"); you may not
	* use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License. For additional information regarding
	* copyright in this work, please see the NOTICE file in the top level
	* directory of this distribution.
	*/

	/**
	Copyright (c) 2009 Open Lab, http://www.open-lab.com/
	Permission is hereby granted, free of charge, to any person obtaining
	a copy of this software and associated documentation files (the
	"Software"), to deal in the Software without restriction, including
	without limitation the rights to use, copy, modify, merge, publish,
	distribute, sublicense, and/or sell copies of the Software, and to
	permit persons to whom the Software is furnished to do so, subject to
	the following conditions:

	The above copyright notice and this permission notice shall be
	included in all copies or substantial portions of the Software.

	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
	EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
	NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
	LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
	OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
	WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
	*/

	package org.apache.roller.weblogger.util;

	import java.util.ArrayList;
	import java.util.List;
	import java.util.Stack;
	import java.util.regex.Matcher;
	import java.util.regex.Pattern;

	import org.apache.commons.validator.UrlValidator;
	import org.apache.roller.weblogger.config.WebloggerConfig;

	public class HTMLSanitizer {
	public static Boolean xssEnabled = WebloggerConfig.getBooleanProperty("weblogAdminsUntrusted", Boolean.FALSE);

	public static Pattern forbiddenTags = Pattern.compile("^(script\|object\|embed\|link\|style\|form\|input)$");
	public static Pattern allowedTags = Pattern.compile("^(b\|p\|i\|s\|a\|img\|table\|thead\|tbody\|tfoot\|tr\|th\|td\|dd\|dl\|dt\|em\|h1\|h2\|h3\|h4\|h5\|h6\|li\|ul\|ol\|span\|div\|strike\|strong\|"
	+ "sub\|sup\|pre\|del\|code\|blockquote\|strike\|kbd\|br\|hr\|area\|map\|object\|embed\|param\|link\|form\|small\|big)$");
	// <!--.........>
	private static Pattern commentPattern = Pattern.compile("<!--.*");
	// <tag ....props.....>
	private static Pattern tagStartPattern = Pattern.compile("<(?i)(\\w+\\b)\\s(.)/?>$");
	// </tag .........>
	private static Pattern tagClosePattern = Pattern.compile("</(?i)(\\w+\\b)\\s*>$");
	private static Pattern standAloneTags = Pattern.compile("^(img\|br\|hr)$");
	private static Pattern selfClosed = Pattern.compile("<.+/>");
	// prop="...."
	private static Pattern attributesPattern = Pattern.compile("(\\w)\\s=\\s\"([^\"])\"");
	// color:red;
	private static Pattern stylePattern = Pattern.compile("([^\\s^:]+)\\s:\\s([^;]+);?");
	// url('....')"
	private static Pattern urlStylePattern = Pattern.compile("(?i).\\b\\surl\\s\\(['\"]([^)])['\"]\\)");
	// expression(....)" thanks to Ben Summer
	private static Pattern forbiddenStylePattern = Pattern.compile("(?:(expression\|eval\|javascript))\\s*\\(");

	/**
	* This method should be used to test input.
	*
	* @param html
	* @return true if the input is "valid"
	*/
	public static boolean isSanitized(String html) {
	    // Run the full sanitizer and report only its validity flag.
	    SanitizeResult outcome = sanitizer(html);
	    return outcome.isValid;
	}

	/**
	* Cleans the given html before it is output in any html page
	*
	* @param html
	* @return sanitized html
	*/
	public static String sanitize(String html) {
	    // Only the encoded, printable markup of the result is of interest here.
	    SanitizeResult outcome = sanitizer(html);
	    return outcome.html;
	}

	/**
	 * Sanitizes the given html only when xssEnabled is set.
	 *
	 * @param ret html to clean, may be null
	 * @return the sanitized html, or the argument unchanged when sanitizing
	 *         is switched off or the argument is null
	 */
	public static String conditionallySanitize(String ret) {
	    // Nothing to do when sanitizing is disabled or there is no input.
	    if (!xssEnabled || ret == null) {
	        return ret;
	    }
	    return HTMLSanitizer.sanitize(ret);
	}

	/**
	* Used to get the text, tags removed or encoded
	*
	* @param html
	* @return sanitized text
	*/
	public static String getText(String html) {
	    // The text view of the result holds the encoded, non-accepted tokens.
	    SanitizeResult outcome = sanitizer(html);
	    return outcome.text;
	}

	/**
	* This is the main method of sanitizing. It will be used both for validation and cleaning
	*
	* @param html
	* @return a SanitizeResult object
	*/
	public static SanitizeResult sanitizer(String html) {
	    // Delegate to the configurable overload with the class-wide tag lists.
	    SanitizeResult outcome = sanitizer(html, allowedTags, forbiddenTags);
	    return outcome;
	}

	public static SanitizeResult sanitizer(String html, Pattern allowedTags, Pattern forbiddenTags) {
	SanitizeResult ret = new SanitizeResult();
	Stack<String> openTags = new Stack<String>();


	List<String> tokens = tokenize(html);

	// ------------------- LOOP for every token --------------------------
	for (String token : tokens) {
	boolean isAcceptedToken = false;

	Matcher startMatcher = tagStartPattern.matcher(token);
	Matcher endMatcher = tagClosePattern.matcher(token);


	//-------------------------------------------------------------------------------- COMMENT <!-- ......... -->
	if (commentPattern.matcher(token).find()) {
	ret.val = ret.val + token + (token.endsWith("-->") ? "" : "-->");
	ret.invalidTags.add(token + (token.endsWith("-->") ? "" : "-->"));
	continue;


	//-------------------------------------------------------------------------------- OPEN TAG <tag .........>
	} else if (startMatcher.find()) {

	//tag name extraction
	String tag = startMatcher.group(1).toLowerCase();


	//----------------------------------------------------- FORBIDDEN TAG <script .........>
	if (forbiddenTags.matcher(tag).find()) {
	ret.invalidTags.add("<" + tag + ">");
	continue;


	// -------------------------------------------------- WELL KNOWN TAG
	} else if (allowedTags.matcher(tag).find()) {


	String cleanToken = "<" + tag;
	String tokenBody = startMatcher.group(2);


	//first test table consistency
	//table tbody tfoot thead th tr td
	if ("thead".equals(tag) \|\| "tbody".equals(tag) \|\| "tfoot".equals(tag) \|\| "tr".equals(tag)) {
	if (openTags.search("table") < 1) {
	ret.invalidTags.add("<" + tag + ">");
	continue;
	}
	} else if (("td".equals(tag) \|\| "th".equals(tag)) && openTags.search("tr") < 1) {
	ret.invalidTags.add("<" + tag + ">");
	continue;
	}


	// then test properties
	Matcher attributes = attributesPattern.matcher(tokenBody);

	// URL flag
	boolean foundURL = false;
	while (attributes.find()) {

	String attr = attributes.group(1).toLowerCase();
	String val = attributes.group(2);

	// we will accept href in case of <A>
	// <a href="......">
	if ("a".equals(tag) && "href".equals(attr)) {
	String[] customSchemes = {"http", "https"};
	if (new UrlValidator(customSchemes).isValid(val)) {
	foundURL = true;
	} else {
	// may be it is a mailto?
	// case <a href="mailto:pippo@pippo.com?subject=...."
	if (val.toLowerCase().startsWith("mailto:") && val.indexOf('@') >= 0) {
	String val1 = "http://www." + val.substring(val.indexOf('@') + 1);
	if (new UrlValidator(customSchemes).isValid(val1)) {
	foundURL = true;
	} else {
	ret.invalidTags.add(attr + " " + val);
	val = "";
	}
	} else {
	ret.invalidTags.add(attr + " " + val);
	val = "";
	}
	}

	} else if (tag.matches("img\|embed") && "src".equals(attr)) {
	// <img src="......">
	String[] customSchemes = {"http", "https"};
	if (new UrlValidator(customSchemes).isValid(val)) {
	foundURL = true;
	} else {
	ret.invalidTags.add(attr + " " + val);
	val = "";
	}
	} else if ("href".equals(attr) \|\| "src".equals(attr)) {
	// <tag src/href="......"> skipped
	ret.invalidTags.add(tag + " " + attr + " " + val);
	continue;
	} else if (attr.matches("width\|height")) {
	// <tag width/height="......">
	if (!val.toLowerCase().matches("\\d+%\|\\d+$")) {
	// test numeric values
	ret.invalidTags.add(tag + " " + attr + " " + val);
	continue;
	}

	} else if ("style".equals(attr)) {
	// <tag style="......">
	// then test properties
	Matcher styles = stylePattern.matcher(val);
	String cleanStyle = "";

	while (styles.find()) {
	String styleName = styles.group(1).toLowerCase();
	String styleValue = styles.group(2);

	// suppress invalid styles values
	if (forbiddenStylePattern.matcher(styleValue).find()) {
	ret.invalidTags.add(tag + " " + attr + " " + styleValue);
	continue;
	}

	// check if valid url
	Matcher urlStyleMatcher = urlStylePattern.matcher(styleValue);
	if (urlStyleMatcher.find()) {
	String[] customSchemes = {"http", "https"};
	String url = urlStyleMatcher.group(1);
	if (!new UrlValidator(customSchemes).isValid(url)) {
	ret.invalidTags.add(tag + " " + attr + " " + styleValue);
	continue;
	}
	}

	cleanStyle = cleanStyle + styleName + ":" + encode(styleValue) + ";";

	}
	val = cleanStyle;

	} else if (attr.startsWith("on")) {
	// skip all javascript events
	ret.invalidTags.add(tag + " " + attr + " " + val);
	continue;

	} else {
	// by default encode all properties
	val = encode(val);
	}

	cleanToken = cleanToken + " " + attr + "=\"" + val + "\"";
	}
	cleanToken = cleanToken + ">";

	isAcceptedToken = true;

	// for <img> and <a>
	if (tag.matches("a\|img\|embed") && !foundURL) {
	isAcceptedToken = false;
	cleanToken = "";
	}

	token = cleanToken;


	// push the tag if require closure and it is accepted (otherwise is encoded)
	if (isAcceptedToken && !(standAloneTags.matcher(tag).find() \|\| selfClosed.matcher(tag).find())) {
	openTags.push(tag);
	}

	// -------------------------------------------------------------------------------- UNKNOWN TAG
	} else {
	ret.invalidTags.add(token);
	ret.val = ret.val + token;
	continue;


	}

	// -------------------------------------------------------------------------------- CLOSE TAG </tag>
	} else if (endMatcher.find()) {
	String tag = endMatcher.group(1).toLowerCase();

	//is self closing
	if (selfClosed.matcher(tag).find()) {
	ret.invalidTags.add(token);
	continue;
	}
	if (forbiddenTags.matcher(tag).find()) {
	ret.invalidTags.add("/" + tag);
	continue;
	}
	if (!allowedTags.matcher(tag).find()) {
	ret.invalidTags.add(token);
	ret.val = ret.val + token;
	continue;
	} else {


	String cleanToken = "";

	// check tag position in the stack
	int pos = openTags.search(tag);
	// if found on top ok
	for (int i = 1; i <= pos; i++) {
	//pop all elements before tag and close it
	String poppedTag = openTags.pop();
	cleanToken = cleanToken + "</" + poppedTag + ">";
	isAcceptedToken = true;
	}

	token = cleanToken;
	}

	}

	ret.val = ret.val + token;

	if (isAcceptedToken) {
	ret.html = ret.html + token;
	//ret.text = ret.text + " ";
	} else {
	String sanToken = htmlEncodeApexesAndTags(token);
	ret.html = ret.html + sanToken;
	ret.text = ret.text + htmlEncodeApexesAndTags(removeLineFeed(token));
	}


	}

	// must close remaining tags
	while (openTags.size() > 0) {
	//pop all elements before tag and close it
	String poppedTag = openTags.pop();
	ret.html = ret.html + "</" + poppedTag + ">";
	ret.val = ret.val + "</" + poppedTag + ">";
	}

	//set boolean value
	ret.isValid = ret.invalidTags.size() == 0;

	return ret;
	}

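	/**
	* Splits an HTML string into tokens: every markup run {@code <...>} (including
	* comments {@code <!-- ... -->}) becomes one token, and each stretch of text
	* between two markup runs becomes one token.
	*
	* @param html the HTML source to split
	* @return the list of tokens, in the order they appear in {@code html}
	*/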
	private static List<String> tokenize(String html) {
		// Typed list: the original used a raw ArrayList, which produced an
		// unchecked conversion when returned as List<String>.
		List<String> tokens = new ArrayList<String>();
		int len = html.length();

		// Index where the pending run of plain text begins. Text is cut out with a
		// single substring() instead of being concatenated character by character.
		int textStart = 0;
		int pos = 0;

		while (pos < len) {
			if (html.charAt(pos) != '<') {
				// ordinary character: it stays part of the pending text run
				pos++;
				continue;
			}

			// a "<" ends the pending text run; store it if it is not empty
			if (pos > textStart) {
				tokens.add(html.substring(textStart, pos));
			}

			// a comment runs up to "-->", any other markup up to the next ">";
			// when the end marker is missing the token extends to the end of input
			String endMarker = html.startsWith("<!--", pos) ? "-->" : ">";
			int end = moveToMarkerEnd(pos, endMarker, html);
			tokens.add(html.substring(pos, end));

			pos = end;
			textStart = end;
		}

		// store the trailing text, if any
		if (textStart < len) {
			tokens.add(html.substring(textStart));
		}

		return tokens;
	}

	/**
	 * Finds the position just past the first occurrence of {@code marker} in
	 * {@code s}, searching from {@code pos}.
	 *
	 * @param pos    index at which the search starts
	 * @param marker the text to look for
	 * @param s      the string being scanned
	 * @return the index immediately after the marker, or {@code s.length()} when
	 *         the marker does not occur at or after {@code pos}
	 */
	private static int moveToMarkerEnd(int pos, String marker, String s) {
		final int markerStart = s.indexOf(marker, pos);
		return (markerStart < 0) ? s.length() : markerStart + marker.length();
	}

	/**
	* Contains the sanitizing results.
	* html is the sanitized HTML, encoded and ready to be printed. Unaccepted tags are encoded and text inside tags is always encoded. MUST BE USED WHEN PRINTING HTML.
	* text is the text inside valid tags. Invalid tags are included in encoded form. SHOULD BE USED TO PRINT EXCERPTS.
	* val is the HTML source cleaned of unaccepted tags. It is not encoded. SHOULD BE USED IN SAVE ACTIONS.
	* isValid is true when every tag is accepted without forcing encoding.
	* invalidTags is the list of tags that were encoded or removed.
	*/
	static class SanitizeResult {

		/** Tags and attributes that were rejected; starts empty. */
		public List<String> invalidTags = new ArrayList<String>();

		/** Validity flag; starts as {@code true}. */
		public boolean isValid = true;

		/** Sanitized, encoded HTML output; starts empty. */
		public String html = "";

		/** Text-only output; starts empty. */
		public String text = "";

		/** Cleaned, non-encoded source; starts empty. */
		public String val = "";
	}

	/**
	 * Encodes a value for output: quotes and angle brackets are passed through
	 * {@link #htmlEncodeApexesAndTags(String)}, then line feeds through
	 * {@link #convertLineFeedToBR(String)}.
	 *
	 * @param s the value to encode; {@code null} is treated as the empty string
	 * @return the encoded value
	 */
	public static String encode(String s) {
		String input = (s != null) ? s : "";
		String escaped = htmlEncodeApexesAndTags(input);
		return convertLineFeedToBR(escaped);
	}

	/**
	 * Applies {@link #htmlEncodeApexes(String)} and then
	 * {@link #htmlEncodeTag(String)} to the given text.
	 *
	 * @param source the text to encode
	 * @return the result of running both encoders in that order
	 */
	public static final String htmlEncodeApexesAndTags(String source) {
		String quotesEncoded = htmlEncodeApexes(source);
		return htmlEncodeTag(quotesEncoded);
	}

	public static final String htmlEncodeApexes(String source) {
	if (source != null) {
	return replaceAllNoRegex(source, new String[]{"\"", "'"}, new String[]{""", "'"});
	} else {
	return null;
	}
	}

	/**
	 * Replaces angle brackets with HTML character references so the text cannot
	 * open or close a tag when printed.
	 *
	 * @param source the text to encode, may be {@code null}
	 * @return the text with {@code <} replaced by {@code &lt;} and {@code >}
	 *         replaced by {@code &gt;}, or {@code null} when {@code source} is
	 *         {@code null}
	 */
	public static final String htmlEncodeTag(String source) {
		if (source == null) {
			return null;
		}
		// String.replace(CharSequence, CharSequence) substitutes literally (no
		// regex). Replacing "<" with "<" would be a no-op and let markup through.
		return source.replace("<", "&lt;").replace(">", "&gt;");
	}

	/**
	 * Replaces line feeds and form feeds with a {@code <br>} tag and carriage
	 * returns with a single space.
	 *
	 * @param text the text to convert, may be {@code null}
	 * @return the converted text, or {@code null} when {@code text} is {@code null}
	 */
	public static String convertLineFeedToBR(String text) {
		if (text == null) {
			return null;
		}
		String[] breakChars = {"\n", "\f", "\r"};
		String[] substitutes = {"<br>", "<br>", " "};
		return replaceAllNoRegex(text, breakChars, substitutes);
	}

	/**
	 * Replaces every line feed, form feed and carriage return with a single space.
	 *
	 * @param text the text to flatten, may be {@code null}
	 * @return the flattened text, or {@code null} when {@code text} is {@code null}
	 */
	public static String removeLineFeed(String text) {
		if (text == null) {
			return null;
		}
		String[] breakChars = {"\n", "\f", "\r"};
		String[] blanks = {" ", " ", " "};
		return replaceAllNoRegex(text, breakChars, blanks);
	}

	/**
	 * Applies a series of literal replacements one after another:
	 * {@code searches[i]} is replaced by {@code replaces[i]} on the result of
	 * the previous replacement, in array order.
	 *
	 * @param source   the text to process
	 * @param searches the literal strings to look for
	 * @param replaces the replacement for the search string at the same index
	 * @return the text after all replacements have been applied
	 */
	public static final String replaceAllNoRegex(String source, String searches[], String replaces[]) {
		String result = source;
		int index = 0;
		while (index < searches.length) {
			result = replaceAllNoRegex(result, searches[index], replaces[index]);
			index++;
		}
		return result;
	}

	/**
	 * Replaces every occurrence of a literal string, without regular-expression
	 * interpretation. Matches are found left to right and do not overlap.
	 *
	 * @param source  the text to process
	 * @param search  the literal string to look for
	 * @param replace the text inserted in place of each match
	 * @return the processed text; {@code source} itself when {@code search} is
	 *         empty; the empty string when {@code source} is {@code null}
	 */
	public static final String replaceAllNoRegex(String source, String search, String replace) {
		// a null source yields an empty result rather than null
		if (source == null) {
			return "";
		}
		// an empty search string would match everywhere; leave the text untouched
		if (search.length() == 0) {
			return source;
		}

		StringBuilder out = new StringBuilder();
		int copyFrom = 0;
		int hit = source.indexOf(search, copyFrom);
		while (hit != -1) {
			// copy the text before the match, then the replacement
			out.append(source, copyFrom, hit);
			out.append(replace);
			copyFrom = hit + search.length();
			hit = source.indexOf(search, copyFrom);
		}
		// copy whatever follows the last match
		if (copyFrom < source.length()) {
			out.append(source, copyFrom, source.length());
		}
		return out.toString();
	}
	}