node_modules/html-encoding-sniffer/lib/html-encoding-sniffer.js - nifi-fds - Git at Google

 "use strict";
 const whatwgEncoding = require("whatwg-encoding");

 // https://html.spec.whatwg.org/#encoding-sniffing-algorithm
 module.exports = (uint8Array, { transportLayerEncodingLabel, defaultEncoding = "windows-1252" } = {}) => {
   let encoding = whatwgEncoding.getBOMEncoding(uint8Array);

   if (encoding === null && transportLayerEncodingLabel !== undefined) {
     encoding = whatwgEncoding.labelToName(transportLayerEncodingLabel);
   }

   if (encoding === null) {
     encoding = prescanMetaCharset(uint8Array);
   }

   if (encoding === null) {
     encoding = defaultEncoding;
   }

   return encoding;
 };

 // https://html.spec.whatwg.org/multipage/syntax.html#prescan-a-byte-stream-to-determine-its-encoding
 /**
  * Scans the start of a byte buffer (at most 1024 bytes) for a `<meta>` tag
  * that declares an encoding.
  *
  * Comments, other tags, and `<!...>`, `</...>`, `<?...>` constructs are
  * skipped. A `<meta>` is accepted when it has a `charset` attribute, or a
  * `content` attribute carrying a charset together with
  * `http-equiv="content-type"`.
  *
  * @param uint8Array - bytes to scan; read through indexing and `byteLength`.
  * @returns the encoding found (UTF-16LE/UTF-16BE are reported as "UTF-8" and
  *   "x-user-defined" as "windows-1252"), or `null` when nothing usable is found.
  */
 function prescanMetaCharset(uint8Array) {
   const limit = Math.min(uint8Array.byteLength, 1024);

   // True when `byte` is the given lowercase ASCII letter in either case.
   const isLetter = (byte, lower) => byte === lower || byte === lower - 0x20;

   for (let pos = 0; pos < limit; pos++) {
     // Only "<" can start something interesting.
     if (uint8Array[pos] !== 0x3C) {
       continue;
     }

     const next = uint8Array[pos + 1];

     // "<!--": skip ahead to the ">" that closes the comment ("-->").
     if (next === 0x21 && uint8Array[pos + 2] === 0x2D && uint8Array[pos + 3] === 0x2D) {
       pos += 4;
       while (pos < limit &&
              !(uint8Array[pos] === 0x3E && uint8Array[pos - 1] === 0x2D && uint8Array[pos - 2] === 0x2D)) {
         pos++;
       }
       continue;
     }

     // "<meta" in any case, followed by whitespace or "/".
     const afterName = uint8Array[pos + 5];
     const startsMeta = isLetter(next, 0x6D) &&
       isLetter(uint8Array[pos + 2], 0x65) &&
       isLetter(uint8Array[pos + 3], 0x74) &&
       isLetter(uint8Array[pos + 4], 0x61) &&
       (isSpaceCharacter(afterName) || afterName === 0x2F);

     if (startsMeta) {
       pos += 6;
       const seenNames = new Set();
       let gotPragma = false;
       let needPragma = null;
       let charset = null;

       for (;;) {
         const result = getAttribute(uint8Array, pos, limit);
         pos = result.i;
         const attribute = result.attr;
         if (!attribute) {
           break;
         }
         // Only the first occurrence of each attribute name counts.
         if (seenNames.has(attribute.name)) {
           continue;
         }
         seenNames.add(attribute.name);

         switch (attribute.name) {
           case "http-equiv":
             gotPragma = attribute.value === "content-type";
             break;
           case "content":
             if (!charset) {
               charset = extractCharacterEncodingFromMeta(attribute.value);
               if (charset !== null) {
                 needPragma = true;
               }
             }
             break;
           case "charset":
             charset = whatwgEncoding.labelToName(attribute.value);
             needPragma = false;
             break;
           default:
             break;
         }
       }

       // Reject this <meta> and keep scanning unless it gave a usable charset.
       const missingPragma = needPragma === true && gotPragma === false;
       if (needPragma === null || missingPragma || charset === null) {
         continue;
       }

       if (charset === "UTF-16LE" || charset === "UTF-16BE") {
         return "UTF-8";
       }
       if (charset === "x-user-defined") {
         return "windows-1252";
       }
       return charset;
     }

     // "<" + ASCII letter: some other start tag. Skip its name, then consume
     // its attributes so that a ">" inside a quoted value is not misread.
     if ((next >= 0x41 && next <= 0x5A) || (next >= 0x61 && next <= 0x7A)) {
       pos += 2;
       while (pos < limit && !(isSpaceCharacter(uint8Array[pos]) || uint8Array[pos] === 0x3E)) {
         pos++;
       }
       let result;
       do {
         result = getAttribute(uint8Array, pos, limit);
         pos = result.i;
       } while (result.attr);
       continue;
     }

     // "<!", "</" or "<?": skip to the next ">".
     if (next === 0x21 || next === 0x2F || next === 0x3F) {
       pos += 2;
       while (pos < limit && uint8Array[pos] !== 0x3E) {
         pos++;
       }
     }
   }

   return null;
 }

 // https://html.spec.whatwg.org/multipage/syntax.html#concept-get-attributes-when-sniffing
 /**
  * Reads one attribute from a tag while sniffing, starting at index `i` and
  * never looking at or beyond index `l`.
  *
  * Names and values are lowercased for ASCII A-Z. If the bytes run out before
  * the attribute is complete (name with no terminator, "=" with no value,
  * unterminated quoted or unquoted value) no attribute is reported.
  *
  * @param uint8Array - bytes being scanned.
  * @param i - index to start reading from.
  * @param l - exclusive upper bound for reading.
  * @returns `{ attr: { name, value }, i }` when an attribute was read, where
  *   `i` is the index to resume from; otherwise `{ i }` (at a ">" or at `l`).
  */
 function getAttribute(uint8Array, i, l) {
   // Converts a byte to a one-character string, folding ASCII A-Z to a-z.
   const toLowerChar = (byte) => String.fromCharCode(byte >= 0x41 && byte <= 0x5A ? byte + 0x20 : byte);

   // Skip whitespace and "/" in front of the attribute.
   while (i < l && (isSpaceCharacter(uint8Array[i]) || uint8Array[i] === 0x2F)) {
     i++;
   }
   // ">" (end of tag) or out of bytes: there is no attribute here.
   if (i >= l || uint8Array[i] === 0x3E) {
     return { i };
   }

   let name = "";
   let value = "";
   let c;

   nameLoop: for (; i < l; i++) {
     c = uint8Array[i];
     // "=" ends the name, unless it is the very first byte of the name.
     if (c === 0x3D && name !== "") {
       i++;
       break;
     }
     // Whitespace after the name: an "=" may still follow.
     if (isSpaceCharacter(c)) {
       for (i++; i < l; i++) {
         c = uint8Array[i];
         if (isSpaceCharacter(c)) {
           continue;
         }
         // Anything but "=" means the attribute has no value.
         if (c !== 0x3D) {
           return { attr: { name, value }, i };
         }

         i++;
         break nameLoop;
       }
       break;
     }
     // "/" or ">" ends a value-less attribute.
     if (c === 0x2F || c === 0x3E) {
       return { attr: { name, value }, i };
     }
     name += toLowerChar(c);
   }

   // Skip whitespace between "=" and the value.
   while (i < l && isSpaceCharacter(uint8Array[i])) {
     i++;
   }
   // Out of bytes before a value started: do not read past the limit.
   if (i >= l) {
     return { i };
   }
   c = uint8Array[i];

   // Quoted value: runs up to the matching quote.
   if (c === 0x22 || c === 0x27) {
     const quote = c;
     for (i++; i < l; i++) {
       c = uint8Array[i];

       if (c === quote) {
         i++;
         return { attr: { name, value }, i };
       }

       value += toLowerChar(c);
     }
     // Closing quote never found: the attribute is incomplete.
     return { i };
   }

   // ">" right after "=": attribute with an empty value.
   if (c === 0x3E) {
     return { attr: { name, value }, i };
   }

   // Unquoted value: runs up to whitespace or ">".
   value += toLowerChar(c);
   for (i++; i < l; i++) {
     c = uint8Array[i];

     if (isSpaceCharacter(c) || c === 0x3E) {
       return { attr: { name, value }, i };
     }

     value += toLowerChar(c);
   }

   // Unquoted value ran into the limit without a terminator.
   return { i };
 }

 /**
  * Extracts the encoding named by a `charset=...` part of a string, e.g. the
  * value of a `<meta content="...">` attribute.
  *
  * The first "charset" (any case) that is followed by optional whitespace and
  * "=" is used. The label after "=" may be wrapped in single or double quotes;
  * otherwise it runs up to the next ASCII whitespace or ";".
  *
  * @param string - text to search.
  * @returns the result of `whatwgEncoding.labelToName` for the label found, or
  *   `null` when there is no "charset=", nothing follows the "=", or a quote
  *   is left unmatched.
  */
 function extractCharacterEncodingFromMeta(string) {
   let position = 0;

   while (true) {
     const indexOfCharset = string.substring(position).search(/charset/ui);

     if (indexOfCharset === -1) {
       return null;
     }
     let subPosition = position + indexOfCharset + "charset".length;

     // charCodeAt() yields NaN past the end of the string, which is not
     // whitespace, so these loops stop instead of throwing when "charset" or
     // "charset=" is the last thing in the string.
     while (isSpaceCharacter(string.charCodeAt(subPosition))) {
       ++subPosition;
     }

     if (string[subPosition] !== "=") {
       // Not a declaration; resume the search just before this point.
       position = subPosition - 1;
       continue;
     }

     ++subPosition;

     while (isSpaceCharacter(string.charCodeAt(subPosition))) {
       ++subPosition;
     }

     position = subPosition;
     break;
   }

   if (string[position] === "\"" || string[position] === "'") {
     const nextIndex = string.indexOf(string[position], position + 1);

     if (nextIndex !== -1) {
       return whatwgEncoding.labelToName(string.substring(position + 1, nextIndex));
     }

     // It is an unmatched quotation mark
     return null;
   }

   // Nothing after "=" yields null. A single trailing character is also
   // rejected, which keeps the original early return for that case.
   if (string.length <= position + 1) {
     return null;
   }

   const indexOfASCIIWhitespaceOrSemicolon = string.substring(position + 1).search(/\x09|\x0A|\x0C|\x0D|\x20|;/u);
   const end = indexOfASCIIWhitespaceOrSemicolon === -1 ?
     string.length :
     position + indexOfASCIIWhitespaceOrSemicolon + 1;

   return whatwgEncoding.labelToName(string.substring(position, end));
 }

 /**
  * Tells whether a byte / character code is one of the five whitespace codes
  * recognised by the sniffer: tab, line feed, form feed, carriage return, space.
  *
  * @param c - numeric code to test.
  * @returns `true` for 0x09, 0x0A, 0x0C, 0x0D and 0x20; `false` for anything else.
  */
 function isSpaceCharacter(c) {
   switch (c) {
     case 0x09: // tab
     case 0x0A: // line feed
     case 0x0C: // form feed
     case 0x0D: // carriage return
     case 0x20: // space
       return true;
     default:
       return false;
   }
 }
	"use strict";
	const whatwgEncoding = require("whatwg-encoding");

	// https://html.spec.whatwg.org/#encoding-sniffing-algorithm
	module.exports = (uint8Array, { transportLayerEncodingLabel, defaultEncoding = "windows-1252" } = {}) => {
	let encoding = whatwgEncoding.getBOMEncoding(uint8Array);

	if (encoding === null && transportLayerEncodingLabel !== undefined) {
	encoding = whatwgEncoding.labelToName(transportLayerEncodingLabel);
	}

	if (encoding === null) {
	encoding = prescanMetaCharset(uint8Array);
	}

	if (encoding === null) {
	encoding = defaultEncoding;
	}

	return encoding;
	};

	// https://html.spec.whatwg.org/multipage/syntax.html#prescan-a-byte-stream-to-determine-its-encoding
	/**
	 * Scans the start of a byte buffer (at most 1024 bytes) for a `<meta>` tag
	 * that declares an encoding.
	 *
	 * Comments, other tags, and `<!...>`, `</...>`, `<?...>` constructs are
	 * skipped. A `<meta>` is accepted when it has a `charset` attribute, or a
	 * `content` attribute carrying a charset together with
	 * `http-equiv="content-type"`.
	 *
	 * @param uint8Array - bytes to scan; read through indexing and `byteLength`.
	 * @returns the encoding found (UTF-16LE/UTF-16BE are reported as "UTF-8" and
	 *   "x-user-defined" as "windows-1252"), or `null` when nothing usable is found.
	 */
	function prescanMetaCharset(uint8Array) {
	  const limit = Math.min(uint8Array.byteLength, 1024);

	  // True when `byte` is the given lowercase ASCII letter in either case.
	  const isLetter = (byte, lower) => byte === lower || byte === lower - 0x20;

	  for (let pos = 0; pos < limit; pos++) {
	    // Only "<" can start something interesting.
	    if (uint8Array[pos] !== 0x3C) {
	      continue;
	    }

	    const next = uint8Array[pos + 1];

	    // "<!--": skip ahead to the ">" that closes the comment ("-->").
	    if (next === 0x21 && uint8Array[pos + 2] === 0x2D && uint8Array[pos + 3] === 0x2D) {
	      pos += 4;
	      while (pos < limit &&
	             !(uint8Array[pos] === 0x3E && uint8Array[pos - 1] === 0x2D && uint8Array[pos - 2] === 0x2D)) {
	        pos++;
	      }
	      continue;
	    }

	    // "<meta" in any case, followed by whitespace or "/".
	    const afterName = uint8Array[pos + 5];
	    const startsMeta = isLetter(next, 0x6D) &&
	      isLetter(uint8Array[pos + 2], 0x65) &&
	      isLetter(uint8Array[pos + 3], 0x74) &&
	      isLetter(uint8Array[pos + 4], 0x61) &&
	      (isSpaceCharacter(afterName) || afterName === 0x2F);

	    if (startsMeta) {
	      pos += 6;
	      const seenNames = new Set();
	      let gotPragma = false;
	      let needPragma = null;
	      let charset = null;

	      for (;;) {
	        const result = getAttribute(uint8Array, pos, limit);
	        pos = result.i;
	        const attribute = result.attr;
	        if (!attribute) {
	          break;
	        }
	        // Only the first occurrence of each attribute name counts.
	        if (seenNames.has(attribute.name)) {
	          continue;
	        }
	        seenNames.add(attribute.name);

	        switch (attribute.name) {
	          case "http-equiv":
	            gotPragma = attribute.value === "content-type";
	            break;
	          case "content":
	            if (!charset) {
	              charset = extractCharacterEncodingFromMeta(attribute.value);
	              if (charset !== null) {
	                needPragma = true;
	              }
	            }
	            break;
	          case "charset":
	            charset = whatwgEncoding.labelToName(attribute.value);
	            needPragma = false;
	            break;
	          default:
	            break;
	        }
	      }

	      // Reject this <meta> and keep scanning unless it gave a usable charset.
	      const missingPragma = needPragma === true && gotPragma === false;
	      if (needPragma === null || missingPragma || charset === null) {
	        continue;
	      }

	      if (charset === "UTF-16LE" || charset === "UTF-16BE") {
	        return "UTF-8";
	      }
	      if (charset === "x-user-defined") {
	        return "windows-1252";
	      }
	      return charset;
	    }

	    // "<" + ASCII letter: some other start tag. Skip its name, then consume
	    // its attributes so that a ">" inside a quoted value is not misread.
	    if ((next >= 0x41 && next <= 0x5A) || (next >= 0x61 && next <= 0x7A)) {
	      pos += 2;
	      while (pos < limit && !(isSpaceCharacter(uint8Array[pos]) || uint8Array[pos] === 0x3E)) {
	        pos++;
	      }
	      let result;
	      do {
	        result = getAttribute(uint8Array, pos, limit);
	        pos = result.i;
	      } while (result.attr);
	      continue;
	    }

	    // "<!", "</" or "<?": skip to the next ">".
	    if (next === 0x21 || next === 0x2F || next === 0x3F) {
	      pos += 2;
	      while (pos < limit && uint8Array[pos] !== 0x3E) {
	        pos++;
	      }
	    }
	  }

	  return null;
	}

	// https://html.spec.whatwg.org/multipage/syntax.html#concept-get-attributes-when-sniffing
	/**
	 * Reads one attribute from a tag while sniffing, starting at index `i` and
	 * never looking at or beyond index `l`.
	 *
	 * Names and values are lowercased for ASCII A-Z. If the bytes run out before
	 * the attribute is complete (name with no terminator, "=" with no value,
	 * unterminated quoted or unquoted value) no attribute is reported.
	 *
	 * @param uint8Array - bytes being scanned.
	 * @param i - index to start reading from.
	 * @param l - exclusive upper bound for reading.
	 * @returns `{ attr: { name, value }, i }` when an attribute was read, where
	 *   `i` is the index to resume from; otherwise `{ i }` (at a ">" or at `l`).
	 */
	function getAttribute(uint8Array, i, l) {
	  // Converts a byte to a one-character string, folding ASCII A-Z to a-z.
	  const toLowerChar = (byte) => String.fromCharCode(byte >= 0x41 && byte <= 0x5A ? byte + 0x20 : byte);

	  // Skip whitespace and "/" in front of the attribute.
	  while (i < l && (isSpaceCharacter(uint8Array[i]) || uint8Array[i] === 0x2F)) {
	    i++;
	  }
	  // ">" (end of tag) or out of bytes: there is no attribute here.
	  if (i >= l || uint8Array[i] === 0x3E) {
	    return { i };
	  }

	  let name = "";
	  let value = "";
	  let c;

	  nameLoop: for (; i < l; i++) {
	    c = uint8Array[i];
	    // "=" ends the name, unless it is the very first byte of the name.
	    if (c === 0x3D && name !== "") {
	      i++;
	      break;
	    }
	    // Whitespace after the name: an "=" may still follow.
	    if (isSpaceCharacter(c)) {
	      for (i++; i < l; i++) {
	        c = uint8Array[i];
	        if (isSpaceCharacter(c)) {
	          continue;
	        }
	        // Anything but "=" means the attribute has no value.
	        if (c !== 0x3D) {
	          return { attr: { name, value }, i };
	        }

	        i++;
	        break nameLoop;
	      }
	      break;
	    }
	    // "/" or ">" ends a value-less attribute.
	    if (c === 0x2F || c === 0x3E) {
	      return { attr: { name, value }, i };
	    }
	    name += toLowerChar(c);
	  }

	  // Skip whitespace between "=" and the value.
	  while (i < l && isSpaceCharacter(uint8Array[i])) {
	    i++;
	  }
	  // Out of bytes before a value started: do not read past the limit.
	  if (i >= l) {
	    return { i };
	  }
	  c = uint8Array[i];

	  // Quoted value: runs up to the matching quote.
	  if (c === 0x22 || c === 0x27) {
	    const quote = c;
	    for (i++; i < l; i++) {
	      c = uint8Array[i];

	      if (c === quote) {
	        i++;
	        return { attr: { name, value }, i };
	      }

	      value += toLowerChar(c);
	    }
	    // Closing quote never found: the attribute is incomplete.
	    return { i };
	  }

	  // ">" right after "=": attribute with an empty value.
	  if (c === 0x3E) {
	    return { attr: { name, value }, i };
	  }

	  // Unquoted value: runs up to whitespace or ">".
	  value += toLowerChar(c);
	  for (i++; i < l; i++) {
	    c = uint8Array[i];

	    if (isSpaceCharacter(c) || c === 0x3E) {
	      return { attr: { name, value }, i };
	    }

	    value += toLowerChar(c);
	  }

	  // Unquoted value ran into the limit without a terminator.
	  return { i };
	}

	/**
	 * Extracts the encoding named by a `charset=...` part of a string, e.g. the
	 * value of a `<meta content="...">` attribute.
	 *
	 * The first "charset" (any case) that is followed by optional whitespace and
	 * "=" is used. The label after "=" may be wrapped in single or double quotes;
	 * otherwise it runs up to the next ASCII whitespace or ";".
	 *
	 * @param string - text to search.
	 * @returns the result of `whatwgEncoding.labelToName` for the label found, or
	 *   `null` when there is no "charset=", nothing follows the "=", or a quote
	 *   is left unmatched.
	 */
	function extractCharacterEncodingFromMeta(string) {
	  let position = 0;

	  while (true) {
	    const indexOfCharset = string.substring(position).search(/charset/ui);

	    if (indexOfCharset === -1) {
	      return null;
	    }
	    let subPosition = position + indexOfCharset + "charset".length;

	    // charCodeAt() yields NaN past the end of the string, which is not
	    // whitespace, so these loops stop instead of throwing when "charset" or
	    // "charset=" is the last thing in the string.
	    while (isSpaceCharacter(string.charCodeAt(subPosition))) {
	      ++subPosition;
	    }

	    if (string[subPosition] !== "=") {
	      // Not a declaration; resume the search just before this point.
	      position = subPosition - 1;
	      continue;
	    }

	    ++subPosition;

	    while (isSpaceCharacter(string.charCodeAt(subPosition))) {
	      ++subPosition;
	    }

	    position = subPosition;
	    break;
	  }

	  if (string[position] === "\"" || string[position] === "'") {
	    const nextIndex = string.indexOf(string[position], position + 1);

	    if (nextIndex !== -1) {
	      return whatwgEncoding.labelToName(string.substring(position + 1, nextIndex));
	    }

	    // It is an unmatched quotation mark
	    return null;
	  }

	  // Nothing after "=" yields null. A single trailing character is also
	  // rejected, which keeps the early return this function already had.
	  if (string.length <= position + 1) {
	    return null;
	  }

	  const indexOfASCIIWhitespaceOrSemicolon = string.substring(position + 1).search(/\x09|\x0A|\x0C|\x0D|\x20|;/u);
	  const end = indexOfASCIIWhitespaceOrSemicolon === -1 ?
	    string.length :
	    position + indexOfASCIIWhitespaceOrSemicolon + 1;

	  return whatwgEncoding.labelToName(string.substring(position, end));
	}

	/**
	 * Tells whether a byte / character code is one of the five whitespace codes
	 * recognised by the sniffer: tab, line feed, form feed, carriage return, space.
	 *
	 * @param c - numeric code to test.
	 * @returns `true` for 0x09, 0x0A, 0x0C, 0x0D and 0x20; `false` for anything else.
	 */
	function isSpaceCharacter(c) {
	  return c === 0x09 || c === 0x0A || c === 0x0C || c === 0x0D || c === 0x20;
	}