| 'use strict'; |
| |
| const generate = require('regjsgen').generate; |
| const parse = require('regjsparser').parse; |
| const regenerate = require('regenerate'); |
| const unicodeMatchProperty = require('unicode-match-property-ecmascript'); |
| const unicodeMatchPropertyValue = require('unicode-match-property-value-ecmascript'); |
| const iuMappings = require('./data/iu-mappings.js'); |
| const ESCAPE_SETS = require('./data/character-class-escape-sets.js'); |
| |
// The universe of code points in `u` mode: every Unicode code point. It is the
// starting point when a character class has to be negated.
const UNICODE_SET = regenerate().addRange(0x000000, 0x10FFFF);
// Without the `u` flag, the range stops at 0xFFFF.
// https://mths.be/es6#sec-pattern-semantics
const BMP_SET = regenerate().addRange(0x0000, 0xFFFF);

// The code points `/./u` is supposed to match (https://mths.be/es6#sec-atom):
// everything except the `LineTerminator`s
// (https://mths.be/es6#sec-line-terminators).
const DOT_SET_UNICODE = UNICODE_SET.clone()
	.remove(0x000A) // Line Feed <LF>
	.remove(0x000D) // Carriage Return <CR>
	.remove(0x2028) // Line Separator <LS>
	.remove(0x2029); // Paragraph Separator <PS>
// The code points `/./` is supposed to match: the same set, restricted to the
// BMP.
const DOT_SET = DOT_SET_UNICODE.clone().intersection(BMP_SET);
| |
// Looks up the precomputed set for a character class escape. `character` is
// the key passed to the lookup table; which of the three tables in
// `ESCAPE_SETS` is consulted depends on the `unicode` and `ignoreCase` flags
// (`ignoreCase` only matters when `unicode` is set).
const getCharacterClassEscapeSet = (character, unicode, ignoreCase) => {
	let table;
	if (!unicode) {
		table = ESCAPE_SETS.REGULAR;
	} else if (ignoreCase) {
		table = ESCAPE_SETS.UNICODE_IGNORE_CASE;
	} else {
		table = ESCAPE_SETS.UNICODE;
	}
	return table.get(character);
};
| |
// Picks the shared set that `.` stands for. With `dotAll` the dot matches the
// whole universe (all of Unicode in `unicode` mode, the BMP otherwise);
// without it, the matching line-terminator-free set is used. The shared
// instance is returned as-is, not a copy.
const getDotSet = (unicode, dotAll) => {
	if (unicode) {
		return dotAll ? UNICODE_SET : DOT_SET_UNICODE;
	}
	return dotAll ? BMP_SET : DOT_SET;
};
| |
// Loads the module from `regenerate-unicode-properties` that corresponds to a
// property. With a `value` the module path is `<property>/<value>`; without
// one the property is looked up under `Binary_Property/`. Any failure while
// loading is reported as a single "Failed to recognize" error.
const getUnicodePropertyValueSet = (property, value) => {
	let path;
	if (value) {
		path = `${ property }/${ value }`;
	} else {
		path = `Binary_Property/${ property }`;
	}
	try {
		return require(`regenerate-unicode-properties/${ path }.js`);
	} catch (exception) {
		const message =
			`Failed to recognize value \`${ value }\` for property ` +
			`\`${ property }\`.`;
		throw new Error(message);
	}
};
| |
// Resolves a property escape that has no `=` in it. Such a name is either a
// `General_Category` value or the name of a binary property; the two
// interpretations are tried in that order.
const handleLoneUnicodePropertyNameOrValue = (value) => {
	const GENERAL_CATEGORY = 'General_Category';
	try {
		// `unicodeMatchPropertyValue` throws on invalid values, which lands us
		// in the (intentionally empty) `catch` below.
		const canonicalValue = unicodeMatchPropertyValue(GENERAL_CATEGORY, value);
		return getUnicodePropertyValueSet(GENERAL_CATEGORY, canonicalValue);
	} catch (exception) {
		// Not a `General_Category` value; fall back to the binary property
		// lookup.
	}
	// `unicodeMatchProperty` throws on invalid properties, so an unknown name
	// surfaces as that function's error.
	const binaryProperty = unicodeMatchProperty(value);
	return getUnicodePropertyValueSet(binaryProperty);
};
| |
// Builds the set of code points for a Unicode property escape.
//
// `value` is the property expression: either a lone name (a
// `General_Category` value or a binary property) or `Property=Value`.
// `isNegative` selects the complement within `UNICODE_SET`.
//
// The returned set is always a fresh object (a clone, or the result of
// `clone().remove()`), so callers never get hold of the shared property data.
// Errors thrown by the matcher/lookup helpers for unknown names or values
// propagate to the caller.
const getUnicodePropertyEscapeSet = (value, isNegative) => {
	// Split on the FIRST `=` only. Splitting on every `=` and reading just the
	// first two parts would silently accept malformed input such as
	// `Script=Latin=Greek` as if it were `Script=Latin`; keeping the tail in
	// the value lets the value matcher reject it.
	const separatorIndex = value.indexOf('=');
	let set;
	if (separatorIndex === -1) {
		set = handleLoneUnicodePropertyNameOrValue(value);
	} else {
		// The pattern consists of two parts, i.e. `Property=Value`.
		const property = unicodeMatchProperty(value.slice(0, separatorIndex));
		const propertyValue = unicodeMatchPropertyValue(
			property,
			value.slice(separatorIndex + 1)
		);
		set = getUnicodePropertyValueSet(property, propertyValue);
	}
	if (isNegative) {
		return UNICODE_SET.clone().remove(set);
	}
	return set.clone();
};
| |
// Walks the code points from `min` up to `max` (inclusive) and, for each one
// that has a case-folded counterpart according to `caseFold`, adds that
// counterpart to this set. The body runs at least once, i.e. `min` itself is
// always examined. Returns the set so that calls can be chained.
regenerate.prototype.iuAddRange = function(min, max) {
	let codePoint = min;
	do {
		const folded = caseFold(codePoint);
		if (folded) {
			this.add(folded);
		}
	} while (++codePoint <= max);
	return this;
};
| |
// Replaces the contents of the AST node `item` with the parse tree of
// `pattern`. The pattern is parsed with the `u` flag only when
// `config.useUnicodeFlag` is set. Character classes, groups and single values
// are copied onto `item` directly; any other tree is first wrapped in a
// non-capturing group. `item` is mutated in place via `Object.assign`.
const update = (item, pattern) => {
	const flags = config.useUnicodeFlag ? 'u' : '';
	const parsed = parse(pattern, flags);
	const needsNoWrapping =
		parsed.type === 'characterClass' ||
		parsed.type === 'group' ||
		parsed.type === 'value';
	Object.assign(item, needsNoWrapping ? parsed : wrap(parsed, pattern));
};
| |
// Produces a non-capturing group node (`behavior: 'ignore'`) whose only child
// is `tree` and whose `raw` source is `pattern` enclosed in `(?:…)`.
const wrap = (tree, pattern) => ({
	type: 'group',
	behavior: 'ignore',
	body: [tree],
	raw: `(?:${ pattern })`
});
| |
// Looks `codePoint` up in `iuMappings`. Returns the mapped entry, or `false`
// when there is no entry (or the entry is falsy).
const caseFold = (codePoint) => {
	const folded = iuMappings.get(codePoint);
	return folded ? folded : false;
};
| |
// Rewrites a character class node. Every member of the class is accumulated
// into one set, the set is complemented if the class is negated, and the node
// is then replaced in place (via `update`) with the parse tree of the set's
// string form. Returns the same `characterClassItem` object.
const processCharacterClass = (characterClassItem, regenerateOptions) => {
	// Case-folded counterparts are added only for `iu` patterns whose output
	// does not keep the `u` flag.
	const shouldAddFoldedForms = () =>
		config.ignoreCase && config.unicode && !config.useUnicodeFlag;

	let set = regenerate();
	for (const item of characterClassItem.body) {
		switch (item.type) {
			case 'value': {
				set.add(item.codePoint);
				if (shouldAddFoldedForms()) {
					const folded = caseFold(item.codePoint);
					if (folded) {
						set.add(folded);
					}
				}
				break;
			}
			case 'characterClassRange': {
				const rangeStart = item.min.codePoint;
				const rangeEnd = item.max.codePoint;
				set.addRange(rangeStart, rangeEnd);
				if (shouldAddFoldedForms()) {
					set.iuAddRange(rangeStart, rangeEnd);
				}
				break;
			}
			case 'characterClassEscape': {
				const escapeSet = getCharacterClassEscapeSet(
					item.value,
					config.unicode,
					config.ignoreCase
				);
				set.add(escapeSet);
				break;
			}
			case 'unicodePropertyEscape': {
				const propertySet = getUnicodePropertyEscapeSet(
					item.value,
					item.negative
				);
				set.add(propertySet);
				break;
			}
			// The `default` clause is only here as a safeguard; it should never be
			// reached. Code coverage tools should ignore it.
			/* istanbul ignore next */
			default:
				throw new Error(`Unknown term type: ${ item.type }`);
		}
	}

	if (characterClassItem.negative) {
		// Complement within the universe that applies to the current mode.
		const universe = config.unicode ? UNICODE_SET : BMP_SET;
		set = universe.clone().remove(set);
	}
	update(characterClassItem, set.toString(regenerateOptions));
	return characterClassItem;
};
| |
// Converts a named reference node into an index-based one, in place: the
// node's `matchIndex` is set to `index` and its `name` property is removed.
const updateNamedReference = (item, index) => {
	item.matchIndex = index;
	delete item.name;
};
| |
// Throws if `groups.unmatchedReferences` still has any keys, i.e. if some
// named reference was recorded without a group of that name ever being seen.
// The error message lists the offending names.
const assertNoUnmatchedReferences = (groups) => {
	const unmatchedReferencesNames = Object.keys(groups.unmatchedReferences);
	if (unmatchedReferencesNames.length === 0) {
		return;
	}
	throw new Error(`Unknown group names: ${unmatchedReferencesNames}`);
};
| |
// Recursively rewrites one AST node and returns it.
//
// - `item` is the node to process; it is mutated in place.
// - `regenerateOptions` is passed through to every `toString` call on a set.
// - `groups` is the bookkeeping object for named groups. This function reads
//   and writes `lastIndex`, `names` (name -> capture index) and
//   `unmatchedReferences` (name -> reference nodes seen before their group),
//   and calls `onNamedGroup(name, index)` when that callback is present.
//
// Throws when a group name is used twice, or when the node type is unknown.
const processTerm = (item, regenerateOptions, groups) => {
	switch (item.type) {
		case 'dot':
			update(
				item,
				getDotSet(config.unicode, config.dotAll).toString(regenerateOptions)
			);
			break;
		case 'characterClass':
			item = processCharacterClass(item, regenerateOptions);
			break;
		case 'unicodePropertyEscape':
			update(
				item,
				getUnicodePropertyEscapeSet(item.value, item.negative)
					.toString(regenerateOptions)
			);
			break;
		case 'characterClassEscape':
			update(
				item,
				getCharacterClassEscapeSet(
					item.value,
					config.unicode,
					config.ignoreCase
				).toString(regenerateOptions)
			);
			break;
		case 'group':
			// Only capturing groups take part in capture numbering.
			// Non-capturing groups (`behavior: 'ignore'`) and lookarounds must
			// not advance the counter, or every named group after them (and
			// every `\k<name>` rewritten to a numeric reference) is off.
			// NOTE(review): relies on the parser tagging capturing groups,
			// named ones included, with `behavior: 'normal'`.
			if (item.behavior === 'normal') {
				groups.lastIndex++;
			}
			if (item.name) {
				const name = item.name.value;

				if (groups.names[name]) {
					throw new Error(
						`Multiple groups with the same name (${ name }) are not allowed.`
					);
				}

				const index = groups.lastIndex;
				delete item.name;

				groups.names[name] = index;
				if (groups.onNamedGroup) {
					groups.onNamedGroup.call(null, name, index);
				}

				// Resolve references that appeared before this group.
				if (groups.unmatchedReferences[name]) {
					groups.unmatchedReferences[name].forEach(reference => {
						updateNamedReference(reference, index);
					});
					delete groups.unmatchedReferences[name];
				}
			}
			/* falls through */
		case 'alternative':
		case 'disjunction':
		case 'quantifier':
			item.body = item.body.map(term => {
				return processTerm(term, regenerateOptions, groups);
			});
			break;
		case 'value': {
			const codePoint = item.codePoint;
			const set = regenerate(codePoint);
			// Add the case-folded counterpart only for `iu` patterns whose
			// output does not keep the `u` flag.
			if (config.ignoreCase && config.unicode && !config.useUnicodeFlag) {
				const folded = caseFold(codePoint);
				if (folded) {
					set.add(folded);
				}
			}
			update(item, set.toString(regenerateOptions));
			break;
		}
		case 'reference':
			if (item.name) {
				const name = item.name.value;
				const index = groups.names[name];
				if (index) {
					updateNamedReference(item, index);
					break;
				}

				if (!groups.unmatchedReferences[name]) {
					groups.unmatchedReferences[name] = [];
				}
				// Keep track of references used before the corresponding group.
				groups.unmatchedReferences[name].push(item);
			}
			break;
		case 'anchor':
		case 'empty':
			// Nothing to do here.
			break;
		// The `default` clause is only here as a safeguard; it should never be
		// reached. Code coverage tools should ignore it.
		/* istanbul ignore next */
		default:
			throw new Error(`Unknown term type: ${ item.type }`);
	}
	return item;
};
| |
// Module-wide settings for the pattern currently being rewritten. All four
// start out as `false`.
const config = {
	ignoreCase: false,
	unicode: false,
	dotAll: false,
	useUnicodeFlag: false
};
// Entry point: rewrites the regular expression source `pattern` and returns
// the generated source string.
//
// - `flags` is the flag string of the original expression; `i`, `u` and `s`
//   are inspected here, and the whole string is handed to the parser.
// - `options` (optional) may carry `unicodePropertyEscape`, `namedGroup`,
//   `lookbehind`, `dotAllFlag`, `useUnicodeFlag` and an `onNamedGroup`
//   callback.
//
// The shared `config` object is overwritten on every call. Errors raised
// while parsing or processing, including the "Unknown group names" check for
// references without a group, propagate to the caller.
const rewritePattern = (pattern, flags, options) => {
	// Both helpers short-circuit on a missing `options` / `flags`, yielding
	// that falsy value itself rather than throwing.
	const option = (key) => options && options[key];
	const hasFlag = (flag) => flags && flags.includes(flag);

	const regjsparserFeatures = {
		unicodePropertyEscape: option('unicodePropertyEscape'),
		namedGroups: option('namedGroup'),
		lookbehind: option('lookbehind')
	};

	config.ignoreCase = hasFlag('i');
	config.unicode = hasFlag('u');
	// The `s` flag only counts when the caller opted in through `dotAllFlag`.
	config.dotAll = option('dotAllFlag') && hasFlag('s');
	config.useUnicodeFlag = option('useUnicodeFlag');

	const regenerateOptions = {
		hasUnicodeFlag: config.useUnicodeFlag,
		bmpOnly: !config.unicode
	};
	const groups = {
		onNamedGroup: option('onNamedGroup'),
		lastIndex: 0,
		names: Object.create(null), // { [name]: index }
		unmatchedReferences: Object.create(null) // { [name]: Array<reference> }
	};

	const tree = parse(pattern, flags, regjsparserFeatures);
	// Note: `processTerm` mutates `tree` and `groups`.
	processTerm(tree, regenerateOptions, groups);
	assertNoUnmatchedReferences(groups);
	return generate(tree);
};
| |
| module.exports = rewritePattern; |