| 'use strict' |
| |
| var legacy = require('character-entities-legacy') |
| var invalid = require('character-reference-invalid') |
| var decimal = require('is-decimal') |
| var hexadecimal = require('is-hexadecimal') |
| var alphanumerical = require('is-alphanumerical') |
| var decodeEntity = require('./decode-entity') |
| |
| module.exports = parseEntities |
| |
// Cached built-ins used throughout this module.
var own = {}.hasOwnProperty
var fromCharCode = String.fromCharCode
// Stand-in used instead of a warning handler when none is configured.
var noop = Function.prototype

// Default settings.
// `parseEntities` falls back to the value given here for every option that is
// `null` or `undefined`.
var defaults = {
  warning: null,
  reference: null,
  text: null,
  warningContext: null,
  referenceContext: null,
  textContext: null,
  position: {},
  additional: null,
  attribute: false,
  nonTerminated: true
}
| |
// Characters.
// Character codes (as returned by `String#charCodeAt`) that the parser compares
// against.
var tab = 9 // '\t'
var lineFeed = 10 // '\n'
var formFeed = 12 // '\f'
var space = 32 // ' '
var ampersand = 38 // '&'
var semicolon = 59 // ';'
var lessThan = 60 // '<'
var equalsTo = 61 // '='
var numberSign = 35 // '#'
var uppercaseX = 88 // 'X'
var lowercaseX = 120 // 'x'
var replacementCharacter = 65533 // '�'
| |
// Reference types.
var name = 'named'
var hexa = 'hexadecimal'
var deci = 'decimal'

// Map of bases.
// Gives, per numeric reference type, the radix passed to `parseInt`.
var bases = {}

bases[hexa] = 16
bases[deci] = 10

// Map of types to tests.
// Each type of character reference accepts different characters.
// This test is used to detect whether a reference has ended (as the semicolon
// is not strictly needed).
var tests = {}

tests[name] = alphanumerical
tests[deci] = decimal
tests[hexa] = hexadecimal
| |
// Warning types.
// Numeric codes passed to the warning handler as its third argument.
var namedNotTerminated = 1
var numericNotTerminated = 2
var namedEmpty = 3
var numericEmpty = 4
var namedUnknown = 5
var numericDisallowed = 6
var numericProhibited = 7

// Warning messages.
// Human-readable reason for each warning code, passed to the warning handler
// as its first argument.
var messages = {}

messages[namedNotTerminated] =
  'Named character references must be terminated by a semicolon'
messages[numericNotTerminated] =
  'Numeric character references must be terminated by a semicolon'
messages[namedEmpty] = 'Named character references cannot be empty'
messages[numericEmpty] = 'Numeric character references cannot be empty'
messages[namedUnknown] = 'Named character references must be known'
messages[numericDisallowed] =
  'Numeric character references cannot be disallowed'
messages[numericProhibited] =
  'Numeric character references cannot be outside the permissible Unicode range'
| |
// Wrap to ensure clean parameters are given to `parse`.
//
// Builds a fresh settings object holding every key of `defaults`, taking the
// caller's value unless it is `null` or `undefined`, and returns whatever
// `parse(value, settings)` returns.
//
// `options.position` may be a point (used as is), or an object with `start`
// and/or `indent` fields, in which case `start` becomes the position and
// `indent` is stored as `settings.indent`.
function parseEntities(value, options) {
  var settings = {}
  var option
  var key

  if (!options) {
    options = {}
  }

  for (key in defaults) {
    option = options[key]
    settings[key] =
      option === null || option === undefined ? defaults[key] : option
  }

  if (settings.position.indent || settings.position.start) {
    settings.indent = settings.position.indent || []
    // `start` may be missing when only `indent` was given: fall back to an
    // empty point so `parse` can still read `line` / `column` / `offset`.
    settings.position = settings.position.start || {}
  }

  return parse(value, settings)
}
| |
// Parse entities.
//
// Walks over `value` one character code at a time and decodes named, decimal,
// and hexadecimal character references.
// Text between references is collected in `queue` and flushed before each
// decoded reference and at the end of `value`.
//
// `settings` holds the options prepared by `parseEntities`:
// `text`, `reference`, and `warning` are optional handlers, called with
// `textContext`, `referenceContext`, and `warningContext` as `this`;
// `position` (point) and `indent` (columns per line) seed the reported
// positions; `additional` is an extra character (or character code) that, when
// it directly follows an ampersand, prevents that ampersand from starting a
// reference; `nonTerminated` controls whether references without a closing
// semicolon are decoded; `attribute` changes how non-terminated named
// references followed by `=` or an alphanumerical are treated.
//
// Returns the decoded string.
// eslint-disable-next-line complexity
function parse(value, settings) {
  var additional = settings.additional
  var nonTerminated = settings.nonTerminated
  var handleText = settings.text
  var handleReference = settings.reference
  var handleWarning = settings.warning
  var textContext = settings.textContext
  var referenceContext = settings.referenceContext
  var warningContext = settings.warningContext
  // Tolerate a missing position so reading `column` / `line` / `offset` below
  // cannot throw.
  var pos = settings.position || {}
  var indent = settings.indent || []
  var length = value.length
  var index = 0
  var lines = -1
  var column = pos.column || 1
  var line = pos.line || 1
  var queue = ''
  var result = []
  var entityCharacters
  var namedEntity
  var terminated
  var characters
  var character
  var reference
  var following
  var warning
  var reason
  var output
  var entity
  var begin
  var start
  var type
  var test
  var prev
  var next
  var diff
  var end

  if (typeof additional === 'string') {
    additional = additional.charCodeAt(0)
  }

  // Cache the current point.
  prev = now()

  // Wrap `handleWarning`.
  warning = handleWarning ? parseError : noop

  // Ensure the algorithm walks over the first character and the end (inclusive).
  index--
  length++

  while (++index < length) {
    // If the previous character was a newline.
    if (character === lineFeed) {
      column = indent[lines] || 1
    }

    // `NaN` at the (inclusive) end of `value`.
    character = value.charCodeAt(index)

    if (character === ampersand) {
      following = value.charCodeAt(index + 1)

      // The behaviour depends on the identity of the next character.
      if (
        following === tab ||
        following === lineFeed ||
        following === formFeed ||
        following === space ||
        following === ampersand ||
        following === lessThan ||
        // `NaN`: the ampersand is the last character.
        following !== following ||
        (additional && following === additional)
      ) {
        // Not a character reference.
        // No characters are consumed, and nothing is returned.
        // This is not an error, either.
        queue += fromCharCode(character)
        column++

        continue
      }

      start = index + 1
      begin = start
      end = start

      if (following === numberSign) {
        // Numerical entity.
        end = ++begin

        // The behaviour further depends on the next character.
        following = value.charCodeAt(end)

        if (following === uppercaseX || following === lowercaseX) {
          // ASCII hex digits.
          type = hexa
          end = ++begin
        } else {
          // ASCII digits.
          type = deci
        }
      } else {
        // Named entity.
        type = name
      }

      entityCharacters = ''
      entity = ''
      characters = ''
      // Forget the previous reference: the branches below that decode nothing
      // leave this untouched, and a stale value would be emitted again.
      reference = ''
      test = tests[type]
      end--

      while (++end < length) {
        following = value.charCodeAt(end)

        if (!test(following)) {
          break
        }

        characters += fromCharCode(following)

        // Check if we can match a legacy named reference.
        // If so, we cache that as the last viable named reference.
        // This ensures we do not need to walk backwards later.
        if (type === name && own.call(legacy, characters)) {
          entityCharacters = characters
          entity = legacy[characters]
        }
      }

      terminated = value.charCodeAt(end) === semicolon

      if (terminated) {
        end++

        namedEntity = type === name ? decodeEntity(characters) : false

        if (namedEntity) {
          entityCharacters = characters
          entity = namedEntity
        }
      }

      diff = 1 + end - start

      if (!terminated && !nonTerminated) {
        // Empty.
      } else if (!characters) {
        // An empty (possible) entity is valid, unless it’s numeric (thus an
        // ampersand followed by an octothorp).
        if (type !== name) {
          warning(numericEmpty, diff)
        }
      } else if (type === name) {
        // An ampersand followed by anything unknown, and not terminated, is
        // invalid.
        if (terminated && !entity) {
          warning(namedUnknown, 1)
        } else {
          // If there is something after an entity name which is not known, cap
          // the reference.
          if (entityCharacters !== characters) {
            end = begin + entityCharacters.length
            diff = 1 + end - begin
            terminated = false
          }

          // If the reference is not terminated, warn.
          if (!terminated) {
            reason = entityCharacters ? namedNotTerminated : namedEmpty

            if (settings.attribute) {
              following = value.charCodeAt(end)

              if (following === equalsTo) {
                warning(reason, diff)
                entity = null
              } else if (alphanumerical(following)) {
                entity = null
              } else {
                warning(reason, diff)
              }
            } else {
              warning(reason, diff)
            }
          }
        }

        reference = entity
      } else {
        if (!terminated) {
          // All non-terminated numeric entities are not rendered, and trigger a
          // warning.
          warning(numericNotTerminated, diff)
        }

        // When terminated and number, parse as either hexadecimal or decimal.
        reference = parseInt(characters, bases[type])

        // Trigger a warning when the parsed number is prohibited, and replace
        // with replacement character.
        if (prohibited(reference)) {
          warning(numericProhibited, diff)
          reference = fromCharCode(replacementCharacter)
        } else if (reference in invalid) {
          // Trigger a warning when the parsed number is disallowed, and replace
          // by an alternative.
          warning(numericDisallowed, diff)
          reference = invalid[reference]
        } else {
          // Parse the number.
          output = ''

          // Trigger a warning when the parsed number should not be used.
          if (disallowed(reference)) {
            warning(numericDisallowed, diff)
          }

          // Stringify the number.
          // Code points above 0xffff are written as a surrogate pair.
          if (reference > 0xffff) {
            reference -= 0x10000
            output += fromCharCode(((reference >>> 10) & 0x3ff) | 0xd800)
            reference = 0xdc00 | (reference & 0x3ff)
          }

          reference = output + fromCharCode(reference)
        }
      }

      // Found it!
      // First eat the queued characters as normal text, then eat an entity.
      if (reference) {
        flush()

        prev = now()
        index = end - 1
        column += end - start + 1
        result.push(reference)
        next = now()
        next.offset++

        if (handleReference) {
          handleReference.call(
            referenceContext,
            reference,
            {start: prev, end: next},
            value.slice(start - 1, end)
          )
        }

        prev = next
      } else {
        // If we could not find a reference, queue the checked characters (as
        // normal characters), and move the pointer to their end.
        // This is possible because we can be certain neither newlines nor
        // ampersands are included.
        characters = value.slice(start - 1, end)
        queue += characters
        column += characters.length
        index = end - 1
      }
    } else {
      // Handle anything other than an ampersand, including newlines and EOF.
      if (character === lineFeed) {
        line++
        lines++
        column = 0
      }

      // `character` is `NaN` only at the end of `value`.
      if (character === character) {
        queue += fromCharCode(character)
        column++
      } else {
        flush()
      }
    }
  }

  // Return the reduced nodes, and any possible warnings.
  return result.join('')

  // Get current position.
  function now() {
    return {
      line: line,
      column: column,
      offset: index + (pos.offset || 0)
    }
  }

  // “Throw” a parse-error: a warning.
  // Calls the warning handler with the message for `code`, the current
  // position moved `offset` columns forward, and `code` itself.
  function parseError(code, offset) {
    var position = now()

    position.column += offset
    position.offset += offset

    handleWarning.call(warningContext, messages[code], position, code)
  }

  // Flush `queue` (normal text).
  // Macro invoked before each entity and at the end of `value`.
  // Does nothing when `queue` is empty.
  function flush() {
    if (queue) {
      result.push(queue)

      if (handleText) {
        handleText.call(textContext, queue, {start: prev, end: now()})
      }

      queue = ''
    }
  }
}
| |
// Check if `code` is outside the permissible unicode range: either in the
// surrogate range (0xd800–0xdfff) or above 0x10ffff.
function prohibited(code) {
  return (code >= 0xd800 && code <= 0xdfff) || code > 0x10ffff
}
| |
// Check if `code` is disallowed.
// Matches the ranges 0x0001–0x0008, 0x000d–0x001f, 0x007f–0x009f, and
// 0xfdd0–0xfdef, the single value 0x000b, and every code whose low 16 bits
// are 0xfffe or 0xffff.
function disallowed(code) {
  return (
    (code >= 0x0001 && code <= 0x0008) ||
    code === 0x000b ||
    (code >= 0x000d && code <= 0x001f) ||
    (code >= 0x007f && code <= 0x009f) ||
    (code >= 0xfdd0 && code <= 0xfdef) ||
    (code & 0xffff) === 0xffff ||
    (code & 0xffff) === 0xfffe
  )
}