// Source: urltomarkdown/node_modules/html-encoding-sniffer/lib/html-encoding-sniffer.js
// Repository viewer metadata: 296 lines, 7.5 KiB, JavaScript (history timestamp: 2022-01-08 17:05:05 +00:00).
"use strict";
const whatwgEncoding = require("whatwg-encoding");
// https://html.spec.whatwg.org/#encoding-sniffing-algorithm
module.exports = (buffer, { transportLayerEncodingLabel, defaultEncoding = "windows-1252" } = {}) => {
let encoding = whatwgEncoding.getBOMEncoding(buffer); // see https://github.com/whatwg/html/issues/1910
if (encoding === null && transportLayerEncodingLabel !== undefined) {
encoding = whatwgEncoding.labelToName(transportLayerEncodingLabel);
}
if (encoding === null) {
encoding = prescanMetaCharset(buffer);
}
if (encoding === null) {
encoding = defaultEncoding;
}
return encoding;
};
// https://html.spec.whatwg.org/multipage/syntax.html#prescan-a-byte-stream-to-determine-its-encoding
/**
 * Scans the start of a byte buffer for a `<meta>` element that declares an encoding.
 *
 * Only bytes at positions below `min(buffer.length, 1024)` are considered as tag starts.
 * Comments, other tags (including their attributes) and `<!`, `</`, `<?` constructs are
 * skipped so that their contents are not mistaken for a `<meta>` tag.
 *
 * @param {Uint8Array} buffer - Bytes to scan.
 * @returns {string|null} The declared encoding name (UTF-16LE/BE are reported as "UTF-8" and
 *   "x-user-defined" as "windows-1252"), or `null` when no usable declaration is found.
 */
function prescanMetaCharset(buffer) {
  const limit = Math.min(buffer.length, 1024);

  // True when `byte` is the given upper-case ASCII letter or its lower-case counterpart.
  const isLetterIgnoringCase = (byte, upper) => byte === upper || byte === upper + 0x20;
  const isAsciiLetter = (byte) => (byte >= 0x41 && byte <= 0x5A) || (byte >= 0x61 && byte <= 0x7A);

  for (let pos = 0; pos < limit; pos++) {
    // Only "<" starts anything of interest.
    if (buffer[pos] !== 0x3C) {
      continue;
    }

    const b1 = buffer[pos + 1];
    const b2 = buffer[pos + 2];
    const b3 = buffer[pos + 3];
    const b4 = buffer[pos + 4];
    const b5 = buffer[pos + 5];

    // "<!--": skip to the ">" of the first "-->" (the dashes may be those of "<!--" itself).
    if (b1 === 0x21 && b2 === 0x2D && b3 === 0x2D) {
      pos += 4;
      while (pos < limit) {
        if (buffer[pos] === 0x3E && buffer[pos - 1] === 0x2D && buffer[pos - 2] === 0x2D) {
          break;
        }
        pos++;
      }
      continue;
    }

    // "<meta" (any case) followed by a space character or "/".
    const startsMetaTag =
      isLetterIgnoringCase(b1, 0x4D) &&
      isLetterIgnoringCase(b2, 0x45) &&
      isLetterIgnoringCase(b3, 0x54) &&
      isLetterIgnoringCase(b4, 0x41) &&
      (isSpaceCharacter(b5) || b5 === 0x2F);

    if (startsMetaTag) {
      pos += 6;
      const seenNames = new Set();
      let gotPragma = false;
      let needPragma = null;
      let charset = null;

      for (;;) {
        const result = getAttribute(buffer, pos, limit);
        pos = result.i;
        const attribute = result.attr;
        if (!attribute) {
          break;
        }
        // Only the first occurrence of each attribute name counts.
        if (seenNames.has(attribute.name)) {
          continue;
        }
        seenNames.add(attribute.name);

        switch (attribute.name) {
          case "http-equiv":
            gotPragma = attribute.value === "content-type";
            break;
          case "content":
            if (!charset) {
              charset = extractCharacterEncodingFromMeta(attribute.value);
              if (charset !== null) {
                needPragma = true;
              }
            }
            break;
          case "charset":
            charset = whatwgEncoding.labelToName(attribute.value);
            needPragma = false;
            break;
          default:
            break;
        }
      }

      // Reject this <meta>: no encoding attribute seen, a `content` encoding without the
      // matching http-equiv pragma, or a label that did not resolve to an encoding.
      if (needPragma === null || (needPragma && !gotPragma) || charset === null) {
        continue;
      }

      if (charset === "UTF-16LE" || charset === "UTF-16BE") {
        return "UTF-8";
      }
      if (charset === "x-user-defined") {
        return "windows-1252";
      }
      return charset;
    }

    // Any other tag whose name starts with an ASCII letter: skip the name, then its attributes.
    if (isAsciiLetter(b1)) {
      pos += 2;
      while (pos < limit && !(isSpaceCharacter(buffer[pos]) || buffer[pos] === 0x3E)) {
        pos++;
      }
      let result;
      do {
        result = getAttribute(buffer, pos, limit);
        pos = result.i;
      } while (result.attr);
      continue;
    }

    // "<!", "</" or "<?": skip ahead to the next ">".
    if (b1 === 0x21 || b1 === 0x2F || b1 === 0x3F) {
      pos += 2;
      while (pos < limit && buffer[pos] !== 0x3E) {
        pos++;
      }
    }
  }

  return null;
}
// https://html.spec.whatwg.org/multipage/syntax.html#concept-get-attributes-when-sniffing
/**
 * Reads one attribute from `buffer`, starting at index `i` and treating `l` as the scan limit.
 *
 * Names and values are returned lower-cased (ASCII A-Z only). A name that is not followed by
 * "=" yields an empty value. Values may be double-quoted, single-quoted or unquoted.
 *
 * @param {Uint8Array} buffer - Bytes being scanned.
 * @param {number} i - Index at which to start.
 * @param {number} l - Exclusive upper bound for the scanning loops.
 * @returns {{attr?: {name: string, value: string}, i: number}} `attr` is present when an
 *   attribute was read; `i` is the index at which the caller should resume. When ">" ends the
 *   tag, or the limit is hit mid-attribute, only `i` is returned.
 */
function getAttribute(buffer, i, l) {
  const SLASH = 0x2F;
  const GREATER_THAN = 0x3E;
  const EQUALS = 0x3D;
  const DOUBLE_QUOTE = 0x22;
  const SINGLE_QUOTE = 0x27;

  // Converts a byte to a one-character string, folding ASCII A-Z to a-z.
  const toLowerChar = (byte) => String.fromCharCode(byte >= 0x41 && byte <= 0x5A ? byte + 0x20 : byte);
  const found = (name, value, resumeAt) => ({ attr: { name, value }, i: resumeAt });

  let pos = i;
  for (; pos < l; pos++) {
    let byte = buffer[pos];

    // Leading space characters and slashes are ignored.
    if (isSpaceCharacter(byte) || byte === SLASH) {
      continue;
    }
    // ">" closes the tag: there is no attribute here.
    if (byte === GREATER_THAN) {
      break;
    }

    // --- attribute name ---
    let name = "";
    while (pos < l) {
      byte = buffer[pos];
      if (byte === EQUALS && name !== "") {
        pos++;
        break;
      }
      if (isSpaceCharacter(byte)) {
        // Whitespace after the name: an "=" may still follow it.
        pos++;
        while (pos < l && isSpaceCharacter(buffer[pos])) {
          pos++;
        }
        if (pos < l) {
          if (buffer[pos] !== EQUALS) {
            return found(name, "", pos);
          }
          pos++;
        }
        break;
      }
      if (byte === SLASH || byte === GREATER_THAN) {
        return found(name, "", pos);
      }
      name += toLowerChar(byte);
      pos++;
    }

    // --- attribute value ---
    let value = "";
    byte = buffer[pos];
    if (isSpaceCharacter(byte)) {
      for (pos++; pos < l; pos++) {
        byte = buffer[pos];
        if (!isSpaceCharacter(byte)) {
          break;
        }
      }
    }

    if (byte === DOUBLE_QUOTE || byte === SINGLE_QUOTE) {
      const quote = byte;
      for (pos++; pos < l; pos++) {
        byte = buffer[pos];
        if (byte === quote) {
          // Resume just past the closing quote.
          return found(name, value, pos + 1);
        }
        value += toLowerChar(byte);
      }
    }

    if (byte === GREATER_THAN) {
      return found(name, value, pos);
    }

    // Unquoted value: runs until a space character or ">".
    value += toLowerChar(byte);
    for (pos++; pos < l; pos++) {
      byte = buffer[pos];
      if (isSpaceCharacter(byte) || byte === GREATER_THAN) {
        return found(name, value, pos);
      }
      value += toLowerChar(byte);
    }
  }

  return { i: pos };
}
/**
 * Extracts an encoding from the value of a `<meta content="...">` attribute, i.e. from text
 * such as `text/html; charset=utf-8`.
 *
 * The first `charset` keyword (any case) that is followed by optional whitespace and "=" is
 * used. Its value may be double-quoted, single-quoted, or unquoted (ending at whitespace,
 * ";" or the end of the string).
 *
 * @param {string} string - The attribute value to inspect.
 * @returns {string|null} The result of `whatwgEncoding.labelToName` for the extracted label,
 *   or `null` when there is no `charset=` part, nothing follows the "=", or an opening quote
 *   is never closed.
 */
function extractCharacterEncodingFromMeta(string) {
  // Returns the index of the first non-space character at or after `from`, never moving past
  // the end of the string. (Indexing past the end and calling a method on the result used to
  // throw for inputs such as "text/html; charset".)
  const skipSpaces = (from) => {
    let index = from;
    while (index < string.length && isSpaceCharacter(string.charCodeAt(index))) {
      ++index;
    }
    return index;
  };

  let position = 0;
  while (true) {
    const indexOfCharset = string.substring(position).search(/charset/i);
    if (indexOfCharset === -1) {
      return null;
    }

    const afterKeyword = skipSpaces(position + indexOfCharset + "charset".length);
    if (string[afterKeyword] !== "=") {
      // Not a "charset =" pair; resume the search just before the offending character.
      // This always advances `position`, so the loop terminates.
      position = afterKeyword - 1;
      continue;
    }

    position = skipSpaces(afterKeyword + 1);
    break;
  }

  // Nothing at all follows the "=".
  if (position >= string.length) {
    return null;
  }

  const first = string[position];
  if (first === "\"" || first === "'") {
    const closingIndex = string.indexOf(first, position + 1);
    if (closingIndex === -1) {
      // It is an unmatched quotation mark
      return null;
    }
    return whatwgEncoding.labelToName(string.substring(position + 1, closingIndex));
  }

  // Unquoted value: ends at the first ASCII whitespace or ";" after its first character.
  const indexOfASCIIWhitespaceOrSemicolon = string.substring(position + 1).search(/\x09|\x0A|\x0C|\x0D|\x20|;/);
  const end = indexOfASCIIWhitespaceOrSemicolon === -1 ?
    string.length :
    position + indexOfASCIIWhitespaceOrSemicolon + 1;
  return whatwgEncoding.labelToName(string.substring(position, end));
}
/**
 * Tells whether a byte / character code is one of the five space characters recognised by
 * the scanner: tab, line feed, form feed, carriage return or space.
 *
 * @param {number} c - The code to test.
 * @returns {boolean} `true` for 0x09, 0x0A, 0x0C, 0x0D and 0x20; `false` for anything else.
 */
function isSpaceCharacter(c) {
  switch (c) {
    case 0x09: // tab
    case 0x0A: // line feed
    case 0x0C: // form feed
    case 0x0D: // carriage return
    case 0x20: // space
      return true;
    default:
      return false;
  }
}