// urltomarkdown/node_modules/parse5/lib/tokenizer/preprocessor.js
//
// 160 lines
// 4.4 KiB
// JavaScript

'use strict';
const unicode = require('../common/unicode');
const ERR = require('../common/error-codes');
//Aliases
//NOTE: `$` is shorthand for the code point table exposed by ../common/unicode
//(used below for EOF, LINE_FEED and CARRIAGE_RETURN).
const $ = unicode.CODE_POINTS;
//Const
//NOTE: initial value of `Preprocessor#bufferWaterline` (1 << 16 === 65536).
//`dropParsedChunk()` only trims the buffer once the read position exceeds the waterline.
const DEFAULT_BUFFER_WATERLINE = 1 << 16;
//Preprocessor
//NOTE: HTML input preprocessing
//(see: https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream)
class Preprocessor {
constructor() {
this.html = null;
this.pos = -1;
this.lastGapPos = -1;
this.lastCharPos = -1;
this.gapStack = [];
this.skipNextNewLine = false;
this.lastChunkWritten = false;
this.endOfChunkHit = false;
this.bufferWaterline = DEFAULT_BUFFER_WATERLINE;
}
_err() {
// NOTE: err reporting is noop by default. Enabled by mixin.
}
_addGap() {
this.gapStack.push(this.lastGapPos);
this.lastGapPos = this.pos;
}
_processSurrogate(cp) {
//NOTE: try to peek a surrogate pair
if (this.pos !== this.lastCharPos) {
const nextCp = this.html.charCodeAt(this.pos + 1);
if (unicode.isSurrogatePair(nextCp)) {
//NOTE: we have a surrogate pair. Peek pair character and recalculate code point.
this.pos++;
//NOTE: add gap that should be avoided during retreat
this._addGap();
return unicode.getSurrogatePairCodePoint(cp, nextCp);
}
}
//NOTE: we are at the end of a chunk, therefore we can't infer surrogate pair yet.
else if (!this.lastChunkWritten) {
this.endOfChunkHit = true;
return $.EOF;
}
//NOTE: isolated surrogate
this._err(ERR.surrogateInInputStream);
return cp;
}
dropParsedChunk() {
if (this.pos > this.bufferWaterline) {
this.lastCharPos -= this.pos;
this.html = this.html.substring(this.pos);
this.pos = 0;
this.lastGapPos = -1;
this.gapStack = [];
}
}
write(chunk, isLastChunk) {
if (this.html) {
this.html += chunk;
} else {
this.html = chunk;
}
this.lastCharPos = this.html.length - 1;
this.endOfChunkHit = false;
this.lastChunkWritten = isLastChunk;
}
insertHtmlAtCurrentPos(chunk) {
this.html = this.html.substring(0, this.pos + 1) + chunk + this.html.substring(this.pos + 1, this.html.length);
this.lastCharPos = this.html.length - 1;
this.endOfChunkHit = false;
}
advance() {
this.pos++;
if (this.pos > this.lastCharPos) {
this.endOfChunkHit = !this.lastChunkWritten;
return $.EOF;
}
let cp = this.html.charCodeAt(this.pos);
//NOTE: any U+000A LINE FEED (LF) characters that immediately follow a U+000D CARRIAGE RETURN (CR) character
//must be ignored.
if (this.skipNextNewLine && cp === $.LINE_FEED) {
this.skipNextNewLine = false;
this._addGap();
return this.advance();
}
//NOTE: all U+000D CARRIAGE RETURN (CR) characters must be converted to U+000A LINE FEED (LF) characters
if (cp === $.CARRIAGE_RETURN) {
this.skipNextNewLine = true;
return $.LINE_FEED;
}
this.skipNextNewLine = false;
if (unicode.isSurrogate(cp)) {
cp = this._processSurrogate(cp);
}
//OPTIMIZATION: first check if code point is in the common allowed
//range (ASCII alphanumeric, whitespaces, big chunk of BMP)
//before going into detailed performance cost validation.
const isCommonValidRange =
(cp > 0x1f && cp < 0x7f) || cp === $.LINE_FEED || cp === $.CARRIAGE_RETURN || (cp > 0x9f && cp < 0xfdd0);
if (!isCommonValidRange) {
this._checkForProblematicCharacters(cp);
}
return cp;
}
_checkForProblematicCharacters(cp) {
if (unicode.isControlCodePoint(cp)) {
this._err(ERR.controlCharacterInInputStream);
} else if (unicode.isUndefinedCodePoint(cp)) {
this._err(ERR.noncharacterInInputStream);
}
}
retreat() {
if (this.pos === this.lastGapPos) {
this.lastGapPos = this.gapStack.pop();
this.pos--;
}
this.pos--;
}
}
module.exports = Preprocessor;