| //This module is adapted from the CodeIgniter framework |
| //The license is available at http://codeigniter.com/ |
| |
| var html_entity_decode = require('./entities').decode; |
| |
//Literal substrings that are never allowed, mapped to the text that replaces them.
//Script-related property names are blanked out; comment and CDATA openers/closers are
//entity-encoded so they are displayed as text instead of being parsed as markup.
var never_allowed_str = {
    'document.cookie': '[removed]',
    'document.write': '[removed]',
    '.parentNode': '[removed]',
    '.innerHTML': '[removed]',
    'window.location': '[removed]',
    '-moz-binding': '[removed]',
    '<!--': '&lt;!--',
    '-->': '--&gt;',
    '<![CDATA[': '&lt;![CDATA['
};
| |
//Regular-expression sources (compiled by the cleaner) whose matches are always
//replaced with '[removed]'.
var never_allowed_regex = (function() {
    var sources = [
        'javascript\\s*:',
        'expression\\s*(\\(|&\\#40;)',
        'vbscript\\s*:',
        'Redirect\\s+302'
    ];
    var table = {};
    for (var n = 0; n < sources.length; n++) {
        table[sources[n]] = '[removed]';
    }
    return table;
})();
| |
//Patterns for non-printable characters, both raw and URL-encoded. Newline (10),
//carriage return (13) and tab (09) are deliberately left alone. The URL-encoded
//forms are matched case-insensitively because %0B and %0b are the same escape.
var non_displayables = [
    /%0[0-8bcef]/gi,         // url encoded 00-08, 11, 12, 14, 15
    /%1[0-9a-f]/gi,          // url encoded 16-31
    /[\x00-\x08]/g,          // 00-08
    /\x0b/g, /\x0c/g,        // 11, 12
    /[\x0e-\x1f]/g           // 14-31
];
| |
//Words that get re-assembled when they were exploded with whitespace
//(e.g. "j a v a s c r i p t") so that the filters can recognise them.
var compact_words = 'javascript expression vbscript script applet alert document write cookie window'.split(' ');
| |
| exports.clean = function(str, is_image) { |
| |
| //Recursively clean objects and arrays |
| if (typeof str === 'object') { |
| for (var i in str) { |
| str[i] = exports.clean(str[i]); |
| } |
| return str; |
| } |
| |
| //Remove invisible characters |
| str = remove_invisible_characters(str); |
| |
| //Protect query string variables in URLs => 901119URL5918AMP18930PROTECT8198 |
| str = str.replace(/\&([a-z\_0-9]+)\=([a-z\_0-9]+)/i, xss_hash() + '$1=$2'); |
| |
| //Validate standard character entities - add a semicolon if missing. We do this to enable |
| //the conversion of entities to ASCII later. |
| str = str.replace(/(&\#?[0-9a-z]{2,})([\x00-\x20])*;?/i, '$1;$2'); |
| |
| //Validate UTF16 two byte encoding (x00) - just as above, adds a semicolon if missing. |
| str = str.replace(/(&\#x?)([0-9A-F]+);?/i, '$1;$2'); |
| |
| //Un-protect query string variables |
| str = str.replace(xss_hash(), '&'); |
| |
| //Decode just in case stuff like this is submitted: |
| //<a href="http://%77%77%77%2E%67%6F%6F%67%6C%65%2E%63%6F%6D">Google</a> |
| try{ |
| str = decodeURIComponent(str); |
| } |
| catch(error){ |
| // str was not actually URI-encoded |
| } |
| |
| //Convert character entities to ASCII - this permits our tests below to work reliably. |
| //We only convert entities that are within tags since these are the ones that will pose security problems. |
| str = str.replace(/[a-z]+=([\'\"]).*?\1/gi, function(m, match) { |
| return m.replace(match, convert_attribute(match)); |
| }); |
| |
| //Remove invisible characters again |
| str = remove_invisible_characters(str); |
| |
| //Convert tabs to spaces |
| str = str.replace('\t', ' '); |
| |
| //Captured the converted string for later comparison |
| var converted_string = str; |
| |
| //Remove strings that are never allowed |
| for (var i in never_allowed_str) { |
| str = str.replace(i, never_allowed_str[i]); |
| } |
| |
| //Remove regex patterns that are never allowed |
| for (var i in never_allowed_regex) { |
| str = str.replace(new RegExp(i, 'i'), never_allowed_regex[i]); |
| } |
| |
| //Compact any exploded words like: j a v a s c r i p t |
| // We only want to do this when it is followed by a non-word character |
| for (var i in compact_words) { |
| var spacified = compact_words[i].split('').join('\\s*')+'\\s*'; |
| |
| str = str.replace(new RegExp('('+spacified+')(\\W)', 'ig'), function(m, compat, after) { |
| return compat.replace(/\s+/g, '') + after; |
| }); |
| } |
| |
| //Remove disallowed Javascript in links or img tags |
| do { |
| var original = str; |
| |
| if (str.match(/<a/i)) { |
| str = str.replace(/<a\s+([^>]*?)(>|$)/gi, function(m, attributes, end_tag) { |
| attributes = filter_attributes(attributes.replace('<','').replace('>','')); |
| return m.replace(attributes, attributes.replace(/href=.*?(alert\(|alert&\#40;|javascript\:|charset\=|window\.|document\.|\.cookie|<script|<xss|base64\s*,)/gi, '')); |
| }); |
| } |
| |
| if (str.match(/<img/i)) { |
| str = str.replace(/<img\s+([^>]*?)(\s?\/?>|$)/gi, function(m, attributes, end_tag) { |
| attributes = filter_attributes(attributes.replace('<','').replace('>','')); |
| return m.replace(attributes, attributes.replace(/src=.*?(alert\(|alert&\#40;|javascript\:|charset\=|window\.|document\.|\.cookie|<script|<xss|base64\s*,)/gi, '')); |
| }); |
| } |
| |
| if (str.match(/script/i) || str.match(/xss/i)) { |
| str = str.replace(/<(\/*)(script|xss)(.*?)\>/gi, '[removed]'); |
| } |
| |
| } while(original != str); |
| |
| //Remove JavaScript Event Handlers - Note: This code is a little blunt. It removes the event |
| //handler and anything up to the closing >, but it's unlikely to be a problem. |
| event_handlers = ['[^a-z_\-]on\\w*']; |
| |
| //Adobe Photoshop puts XML metadata into JFIF images, including namespacing, |
| //so we have to allow this for images |
| if (!is_image) { |
| event_handlers.push('xmlns'); |
| } |
| |
| str = str.replace(new RegExp("<([^><]+?)("+event_handlers.join('|')+")(\\s*=\\s*[^><]*)([><]*)", 'i'), '<$1$4'); |
| |
| //Sanitize naughty HTML elements |
| //If a tag containing any of the words in the list |
| //below is found, the tag gets converted to entities. |
| //So this: <blink> |
| //Becomes: <blink> |
| naughty = 'alert|applet|audio|basefont|base|behavior|bgsound|blink|body|embed|expression|form|frameset|frame|head|html|ilayer|iframe|input|isindex|layer|link|meta|object|plaintext|style|script|textarea|title|video|xml|xss'; |
| str = str.replace(new RegExp('<(/*\\s*)('+naughty+')([^><]*)([><]*)', 'gi'), function(m, a, b, c, d) { |
| return '<' + a + b + c + d.replace('>','>').replace('<','<'); |
| }); |
| |
| //Sanitize naughty scripting elements Similar to above, only instead of looking for |
| //tags it looks for PHP and JavaScript commands that are disallowed. Rather than removing the |
| //code, it simply converts the parenthesis to entities rendering the code un-executable. |
| //For example: eval('some code') |
| //Becomes: eval('some code') |
| str = str.replace(/(alert|cmd|passthru|eval|exec|expression|system|fopen|fsockopen|file|file_get_contents|readfile|unlink)(\s*)\((.*?)\)/gi, '$1$2($3)'); |
| |
| //This adds a bit of extra precaution in case something got through the above filters |
| for (var i in never_allowed_str) { |
| str = str.replace(i, never_allowed_str[i]); |
| } |
| for (var i in never_allowed_regex) { |
| str = str.replace(new RegExp(i, 'i'), never_allowed_regex[i]); |
| } |
| |
| //Images are handled in a special way |
| if (is_image && str !== converted_string) { |
| throw new Error('Image may contain XSS'); |
| } |
| |
| return str; |
| } |
| |
| function remove_invisible_characters(str) { |
| for (var i in non_displayables) { |
| str = str.replace(non_displayables[i], ''); |
| } |
| return str; |
| } |
| |
| function xss_hash() { |
| //TODO: Create a random hash |
| return '!*$^#(@*#&'; |
| } |
| |
| function convert_attribute(str) { |
| return str.replace('>','>').replace('<','<').replace('\\','\\\\'); |
| } |
| |
| //Filter Attributes - filters tag attributes for consistency and safety |
| function filter_attributes(str) { |
| out = ''; |
| |
| str.replace(/\s*[a-z\-]+\s*=\s*(?:\042|\047)(?:[^\1]*?)\1/gi, function(m) { |
| $out += m.replace(/\/\*.*?\*\//g, ''); |
| }); |
| |
| return out; |
| } |