| "use strict"; |
| const whatwgEncoding = require("whatwg-encoding"); |
| |
| // https://html.spec.whatwg.org/#encoding-sniffing-algorithm |
| module.exports = (uint8Array, { transportLayerEncodingLabel, defaultEncoding = "windows-1252" } = {}) => { |
| let encoding = whatwgEncoding.getBOMEncoding(uint8Array); |
| |
| if (encoding === null && transportLayerEncodingLabel !== undefined) { |
| encoding = whatwgEncoding.labelToName(transportLayerEncodingLabel); |
| } |
| |
| if (encoding === null) { |
| encoding = prescanMetaCharset(uint8Array); |
| } |
| |
| if (encoding === null) { |
| encoding = defaultEncoding; |
| } |
| |
| return encoding; |
| }; |
| |
| // https://html.spec.whatwg.org/multipage/syntax.html#prescan-a-byte-stream-to-determine-its-encoding |
/**
 * Scans at most the first 1024 bytes of the buffer for a `<meta>` element that
 * declares a character encoding.
 *
 * Comments, other tags and `<!…>` / `</…>` / `<?…>` constructs are skipped so that
 * their contents are not mistaken for a declaration.
 *
 * @param {Uint8Array} uint8Array - The bytes to scan.
 * @returns {string|null} The declared encoding, or `null` when no usable declaration
 *   is found inside the scanned window.
 */
function prescanMetaCharset(uint8Array) {
  const LESS_THAN = 0x3C;
  const GREATER_THAN = 0x3E;
  const HYPHEN = 0x2D;
  const EXCLAMATION = 0x21;
  const SLASH = 0x2F;
  const QUESTION = 0x3F;

  const limit = Math.min(uint8Array.byteLength, 1024);

  // True when `byte` is the given upper-case ASCII letter or its lower-case form.
  const isLetter = (byte, upper) => byte === upper || byte === upper + 0x20;
  const isAsciiAlpha = (byte) => (byte >= 0x41 && byte <= 0x5A) || (byte >= 0x61 && byte <= 0x7A);

  for (let pos = 0; pos < limit; pos++) {
    if (uint8Array[pos] !== LESS_THAN) {
      continue;
    }

    const next = uint8Array[pos + 1];

    // "<!--": skip forward to the ">" of the closing "-->". The search starts on the
    // byte right after "<!--", so the two opening dashes may also act as the closing ones.
    if (next === EXCLAMATION && uint8Array[pos + 2] === HYPHEN && uint8Array[pos + 3] === HYPHEN) {
      pos += 4;
      while (pos < limit) {
        const closesComment = uint8Array[pos] === GREATER_THAN &&
          uint8Array[pos - 1] === HYPHEN &&
          uint8Array[pos - 2] === HYPHEN;
        if (closesComment) {
          break;
        }
        pos++;
      }
      continue;
    }

    // "<meta" (any case) followed by whitespace or "/".
    const afterName = uint8Array[pos + 5];
    const isMetaTag = isLetter(next, 0x4D) &&
      isLetter(uint8Array[pos + 2], 0x45) &&
      isLetter(uint8Array[pos + 3], 0x54) &&
      isLetter(uint8Array[pos + 4], 0x41) &&
      (isSpaceCharacter(afterName) || afterName === SLASH);

    if (isMetaTag) {
      pos += 6;
      const seenNames = new Set();
      let gotPragma = false;
      let needPragma = null;
      let charset = null;

      for (;;) {
        const result = getAttribute(uint8Array, pos, limit);
        pos = result.i;

        const attribute = result.attr;
        if (!attribute) {
          break;
        }
        // Only the first occurrence of each attribute name is honoured.
        if (seenNames.has(attribute.name)) {
          continue;
        }
        seenNames.add(attribute.name);

        switch (attribute.name) {
          case "http-equiv":
            gotPragma = attribute.value === "content-type";
            break;
          case "content":
            if (!charset) {
              charset = extractCharacterEncodingFromMeta(attribute.value);
              if (charset !== null) {
                needPragma = true;
              }
            }
            break;
          case "charset":
            charset = whatwgEncoding.labelToName(attribute.value);
            needPragma = false;
            break;
          default:
            break;
        }
      }

      // Reject the element when it declared nothing, when a `content` declaration
      // lacks the `http-equiv="content-type"` pragma, or when the label was not recognised.
      if (needPragma === null || (needPragma && !gotPragma) || charset === null) {
        continue;
      }

      if (charset === "UTF-16LE" || charset === "UTF-16BE") {
        return "UTF-8";
      }
      if (charset === "x-user-defined") {
        return "windows-1252";
      }
      return charset;
    }

    // Any other tag that starts with an ASCII letter: skip its name, then its attributes.
    if (isAsciiAlpha(next)) {
      pos += 2;
      while (pos < limit && !isSpaceCharacter(uint8Array[pos]) && uint8Array[pos] !== GREATER_THAN) {
        pos++;
      }
      let result;
      do {
        result = getAttribute(uint8Array, pos, limit);
        pos = result.i;
      } while (result.attr);
      continue;
    }

    // "<!", "</" or "<?": skip forward to the next ">".
    if (next === EXCLAMATION || next === SLASH || next === QUESTION) {
      pos += 2;
      while (pos < limit && uint8Array[pos] !== GREATER_THAN) {
        pos++;
      }
    }
  }

  return null;
}
| |
| // https://html.spec.whatwg.org/multipage/syntax.html#concept-get-attributes-when-sniffing |
/**
 * Reads one attribute from a tag while sniffing, starting at index `i` and never
 * looking at bytes at or beyond index `l`.
 *
 * Names and values are lower-cased for the bytes A-Z only; every other byte is
 * converted with `String.fromCharCode` unchanged.
 *
 * @param {Uint8Array} uint8Array - The bytes being scanned.
 * @param {number} i - Index at which to start reading.
 * @param {number} l - Exclusive upper bound of the scan window.
 * @returns {{attr?: {name: string, value: string}, i: number}} `attr` is present when an
 *   attribute was read; it is absent when the tag's ">" was reached or when the window
 *   ended before the attribute was complete. `i` is the index at which the caller should
 *   resume: on ">" or "/" when one of those ended the attribute, just past a closing
 *   quote, and at or past `l` when the window ran out.
 */
function getAttribute(uint8Array, i, l) {
  // Converts a byte to a one-character string, lower-casing A-Z.
  const toLowerCaseChar = (byte) => String.fromCharCode(byte >= 0x41 && byte <= 0x5A ? byte + 0x20 : byte);

  // Skip whitespace and "/" in front of the attribute.
  while (i < l && (isSpaceCharacter(uint8Array[i]) || uint8Array[i] === 0x2F)) {
    i++;
  }
  // End of window, or ">" closing the tag: there is no attribute here.
  if (i >= l || uint8Array[i] === 0x3E) {
    return { i };
  }

  let name = "";
  let value = "";

  // Attribute name: runs until "=", whitespace, "/" or ">".
  for (; i < l; i++) {
    const c = uint8Array[i];

    // "=" ends the name, unless the name is still empty (then "=" is part of the name).
    if (c === 0x3D && name !== "") {
      i++;
      break;
    }

    if (isSpaceCharacter(c)) {
      // Whitespace after the name: an "=" may still follow.
      do {
        i++;
      } while (i < l && isSpaceCharacter(uint8Array[i]));

      if (i >= l) {
        return { i };
      }
      if (uint8Array[i] !== 0x3D) {
        return { attr: { name, value }, i };
      }
      i++;
      break;
    }

    // "/" or ">" ends an attribute that has no value.
    if (c === 0x2F || c === 0x3E) {
      return { attr: { name, value }, i };
    }

    name += toLowerCaseChar(c);
  }

  // The window ended inside the name or right after "=". Do not read the byte at `l`:
  // it lies outside the window the caller asked for.
  if (i >= l) {
    return { i };
  }

  // Skip whitespace between "=" and the value.
  while (i < l && isSpaceCharacter(uint8Array[i])) {
    i++;
  }
  if (i >= l) {
    return { i };
  }

  let c = uint8Array[i];

  // Quoted value: everything up to the matching quote.
  if (c === 0x22 || c === 0x27) {
    const quote = c;
    for (i++; i < l; i++) {
      c = uint8Array[i];
      if (c === quote) {
        return { attr: { name, value }, i: i + 1 };
      }
      value += toLowerCaseChar(c);
    }
    // Unterminated quote: the value is incomplete, so report no attribute instead of
    // falling through and treating the last byte as an unquoted value.
    return { i };
  }

  // ">" directly after "=": attribute with an empty value.
  if (c === 0x3E) {
    return { attr: { name, value }, i };
  }

  // Unquoted value: runs until whitespace or ">".
  value += toLowerCaseChar(c);
  for (i++; i < l; i++) {
    c = uint8Array[i];
    if (isSpaceCharacter(c) || c === 0x3E) {
      return { attr: { name, value }, i };
    }
    value += toLowerCaseChar(c);
  }

  // The window ended before the unquoted value was terminated.
  return { i };
}
| |
/**
 * Extracts the encoding named by a `charset=` declaration inside a string such as the
 * `content` attribute value of a `<meta>` element.
 *
 * The label may be wrapped in matching double or single quotes; otherwise it runs up to
 * the next ASCII whitespace character, ";" or the end of the string.
 *
 * @param {string} string - The text to search.
 * @returns {string|null} The result of `whatwgEncoding.labelToName` for the label that
 *   was found, or `null` when there is no `charset=` declaration, nothing follows the
 *   "=", or a quoted label is missing its closing quote.
 */
function extractCharacterEncodingFromMeta(string) {
  let position = 0;

  // Find an occurrence of "charset" that is followed (after optional whitespace) by "=".
  while (true) {
    const indexOfCharset = string.substring(position).search(/charset/ui);

    if (indexOfCharset === -1) {
      return null;
    }
    let subPosition = position + indexOfCharset + "charset".length;

    // charCodeAt() returns NaN past the end of the string, so a "charset" that ends the
    // string stops the skipping instead of throwing.
    while (isSpaceCharacter(string.charCodeAt(subPosition))) {
      ++subPosition;
    }

    if (string[subPosition] !== "=") {
      // Not a declaration; resume the search just before the character that was found.
      position = subPosition - 1;
      continue;
    }

    ++subPosition;

    while (isSpaceCharacter(string.charCodeAt(subPosition))) {
      ++subPosition;
    }

    position = subPosition;
    break;
  }

  // Nothing follows the "=" (and any whitespace after it).
  if (position >= string.length) {
    return null;
  }

  const first = string[position];
  if (first === "\"" || first === "'") {
    const closingIndex = string.indexOf(first, position + 1);

    // An unmatched quotation mark yields nothing.
    if (closingIndex === -1) {
      return null;
    }
    return whatwgEncoding.labelToName(string.substring(position + 1, closingIndex));
  }

  // Unquoted label: the character at `position` always belongs to it, so the terminator
  // search starts one character later.
  const indexOfTerminator = string.substring(position + 1).search(/[\x09\x0A\x0C\x0D\x20;]/u);
  const end = indexOfTerminator === -1 ?
    string.length :
    position + indexOfTerminator + 1;

  return whatwgEncoding.labelToName(string.substring(position, end));
}
| |
/**
 * Tells whether a byte value is one of the five whitespace bytes this module skips:
 * tab (0x09), line feed (0x0A), form feed (0x0C), carriage return (0x0D) or space (0x20).
 *
 * @param {number} c - The byte value to test.
 * @returns {boolean} `true` for one of the five bytes above, `false` for anything else.
 */
function isSpaceCharacter(c) {
  switch (c) {
    case 0x09:
    case 0x0A:
    case 0x0C:
    case 0x0D:
    case 0x20:
      return true;
    default:
      return false;
  }
}