From 15c455cd244ac97cb71e5acbf54bf5563c6cafe5 Mon Sep 17 00:00:00 2001 From: macsplit Date: Thu, 12 May 2022 15:36:59 +0100 Subject: [PATCH] strip tags and decode entities in code blocks --- index.js | 11 +- node_modules/.package-lock.json | 6 +- node_modules/html-entities/CHANGELOG.md | 205 ------------------------ node_modules/html-entities/README.md | 66 ++++---- node_modules/html-entities/package.json | 16 +- package-lock.json | 14 +- package.json | 2 +- 7 files changed, 59 insertions(+), 261 deletions(-) delete mode 100644 node_modules/html-entities/CHANGELOG.md diff --git a/index.js b/index.js index 92c42b3..554e1b7 100755 --- a/index.js +++ b/index.js @@ -8,6 +8,7 @@ const table_to_markdown = require('./html_table_to_markdown.js'); const validURL = require('@7c/validurl'); const express = require('express'); const rateLimit = require('express-rate-limit'); +const htmlentities = require('html-entities'); const port = process.env.PORT; @@ -71,14 +72,14 @@ app.post('/', function(req, res) { if (!html) { res.status(400).send("Please provide a POST parameter called html"); } else { - try { + //try { let document = new JSDOM(html); let markdown = process_dom(url, document, res, inline_title, ignore_links); send_headers(res); res.send(markdown); - } catch (error) { - res.status(400).send("Could not parse that document"); - } + //} catch (error) { + // res.status(400).send("Could not parse that document"); + //} } }); @@ -172,6 +173,8 @@ function code_block_to_markdown (html) { const match_code = /^\s*]*>[\r\n]*([\s\S]*)<\/code>\s*$/ig.exec(inner_html); if (match_code && match_code[1]) inner_html = match_code[1]; + inner_html = inner_html.replace(/(<([^>]+)>)/ig, ""); + inner_html = htmlentities.decode(inner_html); const markdown = "```\n"+inner_html+"\n```\n"; return markdown; } diff --git a/node_modules/.package-lock.json b/node_modules/.package-lock.json index f566c0c..b4e59e0 100644 --- a/node_modules/.package-lock.json +++ b/node_modules/.package-lock.json @@ -485,9 +485,9 @@ } }, "node_modules/html-entities": { - "version": "2.3.2", - "resolved": "https://registry.npmjs.org/html-entities/-/html-entities-2.3.2.tgz", - "integrity": "sha512-c3Ab/url5ksaT0WyleslpBEthOzWhrjQbg75y7XUsfSzi3Dgzt0l8w5e7DylRn15MTlMMD58dTfzddNS2kcAjQ==" + "version": "2.3.3", + "resolved": "https://registry.npmjs.org/html-entities/-/html-entities-2.3.3.tgz", + "integrity": "sha512-DV5Ln36z34NNTDgnz0EWGBLZENelNAtkiFA4kyNOG2tDI6Mz1uSWiq1wAKdyjnJwyDiDO7Fa2SO1CTxPXL8VxA==" }, "node_modules/http-errors": { "version": "1.8.1", diff --git a/node_modules/html-entities/CHANGELOG.md b/node_modules/html-entities/CHANGELOG.md deleted file mode 100644 index 9071e09..0000000 --- a/node_modules/html-entities/CHANGELOG.md +++ /dev/null @@ -1,205 +0,0 @@ -2.3.2 ------ - - * Minimize data files, remove unnecessary files. - -2.3.1 ------ - - * Improve performance of `encode()`, `decode()` and `decodeEntity()` by using function inlining. - * Fix decoding HEX HTML entities in some cases. - -2.3.0 ------ - - * Add flow types. - -2.2.0 ------ - - * A fast `decodeEntity()` method to decode a single HTML entity. - -2.1.1 ------ - - * Speed up both `encode()` and `decode()` methods. - -2.1.0 ------ - - * Add `extensive` mode to `encode()` method. This mode encodes all non-printable characters, non-ASCII characters and all characters with named references. - -2.0.6 ------ - - * Handle invalid numeric HTML entities: mimic browser behaviour. - -2.0.5 ------ - - * Handling behaviour of ambiguous ampersands. - -2.0.4 ------ - - * Fix webpack build warning. - -2.0.3 ------ - - * Handle invalid numeric HTML entities. - -2.0.2 ------ - - * Handle `null` and `undefined` text values. - -2.0.1 ------ - - * Fix decoding numeric HTML entities. - -2.0.0 ------ - - * Performance was greatly improved. - * New API: simpler and more flexible. - - `htmlEntitiesInstance.encode(text)` -> `encode(text)` - - Before: - - ```js - import {AllHtmlEntities} from 'html-entities'; - - const entities = new AllHtmlEntities(); - console.log( - entities.encode('') - ); - ``` - - After: - - ```js - import {encode} from 'html-entities'; - - console.log( - encode('') - ); - ``` - - --- - - `instance.encodeNonASCII(text)` -> `encode(text, {mode: 'nonAscii'})` - - Before: - - ```js - import {AllHtmlEntities} from 'html-entities'; - - const entities = new AllHtmlEntities(); - console.log( - entities.encodeNonASCII('& © ∆') - ); - ``` - - After: - - ```js - import {encode} from 'html-entities'; - - console.log( - encode('& © ∆', {mode: 'nonAscii'}) - ); - ``` - - --- - - `instance.encodeNonASCII(text)` -> `encode(text, {mode: 'nonAsciiPrintable'})` - - Before: - - ```js - import {AllHtmlEntities} from 'html-entities'; - - const entities = new AllHtmlEntities(); - console.log( - entities.encodeNonASCII('& © ∆ \x01') - ); - ``` - - After: - - ```js - import {encode} from 'html-entities'; - - console.log( - encode('& © ∆ \x01', {mode: 'nonAsciiPrintable'}) - ); - ``` - - --- - - `instance.decode(text)` -> `decode(text)` - - Before: - - ```js - import {AllHtmlEntities} from 'html-entities'; - - const entities = new AllHtmlEntities(); - console.log( - entities.decode('<>&') - ); - ``` - - After: - - ```js - import {decode} from 'html-entities'; - - console.log( - decode('<>&') - ); - ``` - - --- - - Different XML/HTML versions are now implemented via options instead of different classes. - - Before: - - ```js - import {XmlEntities, Html4Entities, Html5Entities, AllHtmlEntities} from 'html-entities'; - - const xmlEntities = new XmlEntities(); - const html4Entities = new Html4Entities(); - const html5Entities = new Html5Entities(); - const allHtmlEntities = new AllHtmlEntities(); - - console.log(xmlEntities.encode('<>&')); - console.log(html4Entities.encode('<>&©')); - console.log(html5Entities.encode('<>&©℞')); - console.log(allHtmlEntities.encode('<>&©℞')); - - console.log(xmlEntities.decode('<>&')); - console.log(html4Entities.decode('<>&©')); - console.log(html5Entities.decode('<>&©℞')); - console.log(allHtmlEntities.decode('<>&©℞')); - ``` - - After: - - ```js - import {encode, decode} from 'html-entities'; - - console.log(encode('<>&', {level: 'xml'})); - console.log(encode('<>&©', {level: 'html4', mode: 'nonAscii'})); - console.log(encode('<>&©℞', {level: 'html5', mode: 'nonAscii'})); - console.log(encode('<>&©℞', {level: 'all', mode: 'nonAscii'})); - - console.log(decode('<>&', {level: 'xml'})); - console.log(decode('<>&©', {level: 'html4'})); - console.log(decode('<>&©℞', {level: 'html5'})); - console.log(decode('<>&©℞', {level: 'all'})); - ``` diff --git a/node_modules/html-entities/README.md b/node_modules/html-entities/README.md index 59f9d67..b07b366 100644 --- a/node_modules/html-entities/README.md +++ b/node_modules/html-entities/README.md @@ -124,68 +124,68 @@ Common Initialization / Load speed - * #1: html-entities x 2,544,400 ops/sec ±4.52% (77 runs sampled) - #2: entities x 1,757,526 ops/sec ±3.99% (81 runs sampled) - #3: he x 1,281,542 ops/sec ±9.31% (74 runs sampled) + * #1: html-entities x 2,632,942 ops/sec ±3.71% (72 runs sampled) + #2: entities x 1,379,154 ops/sec ±5.87% (75 runs sampled) + #3: he x 1,334,035 ops/sec ±3.14% (83 runs sampled) HTML5 Encode test - * #1: html-entities.encode - html5, nonAscii x 402,711 ops/sec ±0.61% (92 runs sampled) - * #2: html-entities.encode - html5, nonAsciiPrintable x 402,631 ops/sec ±2.99% (92 runs sampled) - * #3: html-entities.encode - html5, extensive x 269,162 ops/sec ±0.26% (97 runs sampled) - #4: entities.encodeNonAsciiHTML x 260,447 ops/sec ±2.53% (95 runs sampled) - #5: entities.encodeHTML x 101,059 ops/sec ±3.99% (91 runs sampled) - #6: he.encode x 93,180 ops/sec ±3.17% (92 runs sampled) + * #1: html-entities.encode - html5, nonAscii x 415,806 ops/sec ±0.73% (85 runs sampled) + * #2: html-entities.encode - html5, nonAsciiPrintable x 401,420 ops/sec ±0.35% (93 runs sampled) + #3: entities.encodeNonAsciiHTML x 401,235 ops/sec ±0.41% (88 runs sampled) + #4: entities.encodeHTML x 284,868 ops/sec ±0.45% (93 runs sampled) + * #5: html-entities.encode - html5, extensive x 237,613 ops/sec ±0.42% (93 runs sampled) + #6: he.encode x 91,459 ops/sec ±0.50% (84 runs sampled) Decode test - * #1: html-entities.decode - html5, attribute x 340,043 ops/sec ±2.82% (92 runs sampled) - * #2: html-entities.decode - html5, body x 330,002 ops/sec ±1.52% (87 runs sampled) - * #3: html-entities.decode - html5, strict x 320,582 ops/sec ±5.34% (88 runs sampled) - #4: entities.decodeHTMLStrict x 286,294 ops/sec ±3.14% (89 runs sampled) - #5: entities.decodeHTML x 232,856 ops/sec ±3.05% (90 runs sampled) - #6: he.decode x 163,300 ops/sec ±0.62% (92 runs sampled) + #1: entities.decodeHTMLStrict x 614,920 ops/sec ±0.41% (89 runs sampled) + #2: entities.decodeHTML x 577,698 ops/sec ±0.44% (90 runs sampled) + * #3: html-entities.decode - html5, strict x 323,680 ops/sec ±0.39% (92 runs sampled) + * #4: html-entities.decode - html5, body x 297,548 ops/sec ±0.45% (91 runs sampled) + * #5: html-entities.decode - html5, attribute x 293,617 ops/sec ±0.37% (94 runs sampled) + #6: he.decode x 145,383 ops/sec ±0.36% (94 runs sampled) HTML4 Encode test - * #1: html-entities.encode - html4, nonAsciiPrintable x 391,885 ops/sec ±0.27% (95 runs sampled) - * #2: html-entities.encode - html4, nonAscii x 400,086 ops/sec ±2.54% (94 runs sampled) - * #3: html-entities.encode - html4, extensive x 193,623 ops/sec ±2.70% (92 runs sampled) + * #1: html-entities.encode - html4, nonAscii x 379,799 ops/sec ±0.29% (96 runs sampled) + * #2: html-entities.encode - html4, nonAsciiPrintable x 350,003 ops/sec ±0.42% (92 runs sampled) + * #3: html-entities.encode - html4, extensive x 169,759 ops/sec ±0.43% (90 runs sampled) Decode test - * #1: html-entities.decode - html4, attribute x 356,174 ops/sec ±0.49% (96 runs sampled) - * #2: html-entities.decode - html4, body x 342,666 ops/sec ±2.38% (91 runs sampled) - * #3: html-entities.decode - html4, strict x 341,667 ops/sec ±4.46% (87 runs sampled) + * #1: html-entities.decode - html4, attribute x 291,048 ops/sec ±0.42% (92 runs sampled) + * #2: html-entities.decode - html4, strict x 287,110 ops/sec ±0.56% (93 runs sampled) + * #3: html-entities.decode - html4, body x 285,529 ops/sec ±0.57% (93 runs sampled) XML Encode test - * #1: html-entities.encode - xml, nonAscii x 450,968 ops/sec ±2.73% (92 runs sampled) - * #2: html-entities.encode - xml, nonAsciiPrintable x 432,058 ops/sec ±4.12% (93 runs sampled) - * #3: html-entities.encode - xml, extensive x 265,336 ops/sec ±3.41% (93 runs sampled) - #4: entities.encodeXML x 254,862 ops/sec ±3.01% (95 runs sampled) + #1: entities.encodeXML x 418,561 ops/sec ±0.80% (90 runs sampled) + * #2: html-entities.encode - xml, nonAsciiPrintable x 402,868 ops/sec ±0.30% (89 runs sampled) + * #3: html-entities.encode - xml, nonAscii x 403,669 ops/sec ±7.87% (83 runs sampled) + * #4: html-entities.encode - xml, extensive x 237,766 ops/sec ±0.45% (93 runs sampled) Decode test - * #1: html-entities.decode - xml, strict x 432,820 ops/sec ±0.53% (89 runs sampled) - * #2: html-entities.decode - xml, attribute x 426,037 ops/sec ±0.75% (94 runs sampled) - * #3: html-entities.decode - xml, body x 424,618 ops/sec ±3.47% (93 runs sampled) - #4: entities.decodeXML x 378,536 ops/sec ±2.48% (93 runs sampled) + #1: entities.decodeXML x 888,700 ops/sec ±0.48% (93 runs sampled) + * #2: html-entities.decode - xml, strict x 353,127 ops/sec ±0.40% (92 runs sampled) + * #3: html-entities.decode - xml, body x 355,796 ops/sec ±1.58% (86 runs sampled) + * #4: html-entities.decode - xml, attribute x 369,454 ops/sec ±8.74% (84 runs sampled) Escaping Escape test - * #1: html-entities.encode - xml, specialChars x 1,424,362 ops/sec ±0.55% (95 runs sampled) - #2: he.escape x 962,420 ops/sec ±3.12% (94 runs sampled) - #3: entities.escapeUTF8 x 443,138 ops/sec ±1.06% (90 runs sampled) - #4: entities.escape x 197,515 ops/sec ±2.73% (91 runs sampled) + #1: entities.escapeUTF8 x 1,308,013 ops/sec ±0.37% (91 runs sampled) + * #2: html-entities.encode - xml, specialChars x 1,258,760 ops/sec ±1.00% (93 runs sampled) + #3: he.escape x 822,569 ops/sec ±0.24% (94 runs sampled) + #4: entities.escape x 434,243 ops/sec ±0.34% (91 runs sampled) ``` License diff --git a/node_modules/html-entities/package.json b/node_modules/html-entities/package.json index 55615b6..65f14b1 100644 --- a/node_modules/html-entities/package.json +++ b/node_modules/html-entities/package.json @@ -1,6 +1,6 @@ { "name": "html-entities", - "version": "2.3.2", + "version": "2.3.3", "description": "Fastest HTML entities encode/decode library.", "keywords": [ "html", @@ -15,7 +15,6 @@ "name": "Marat Dulin", "email": "mdevils@yandex.ru" }, - "dependencies": {}, "devDependencies": { "@types/benchmark": "^2.1.0", "@types/chai": "^4.2.11", @@ -26,7 +25,7 @@ "@typescript-eslint/parser": "^4.6.1", "benchmark": "^2.1.4", "chai": "^4.2.0", - "entities": "^2.2.0", + "entities": "^3.0.1", "eslint": "^7.12.1", "eslint-config-prettier": "^6.15.0", "eslint-plugin-import": "^2.22.1", @@ -34,7 +33,7 @@ "flowgen": "^1.13.0", "he": "^1.2.0", "husky": "^4.3.6", - "mocha": "^7.1.2", + "mocha": "^9.1.3", "prettier": "^2.1.2", "terser": "^5.6.1", "ts-node": "^8.9.1", @@ -46,19 +45,20 @@ "type": "git", "url": "https://github.com/mdevils/html-entities.git" }, + "sideEffects": false, "main": "./lib/index.js", "typings": "./lib/index.d.ts", "types": "./lib/index.d.ts", "scripts": { "test": "TS_NODE_COMPILER=ttypescript mocha --recursive -r ts-node/register test/**/*.ts", - "test:lib": "TEST_LIB=1 yarn test", + "test:lib": "TEST_LIB=1 npm run test", "benchmark": "TS_NODE_COMPILER=ttypescript ts-node benchmark/benchmark", "lint": "eslint src/**.ts", "flow-type-gen": "flowgen --add-flow-header lib/index.d.ts -o lib/index.js.flow", "remove-unused-declarations": "find lib -type f \\( -name '*.d.ts' ! -name index.d.ts \\) | xargs rm", "minimize-lib-files": "find lib -type f \\( -name '*.js' ! -name index.js \\) | while read fn; do terser $fn -o $fn; done", - "build": "rm -Rf lib/* && ttsc && yarn remove-unused-declarations && yarn flow-type-gen && yarn minimize-lib-files && yarn test:lib", - "prepublishOnly": "yarn build" + "build": "rm -Rf lib/* && ttsc && npm run remove-unused-declarations && npm run flow-type-gen && npm run minimize-lib-files && npm run test:lib", + "prepublishOnly": "npm run build" }, "files": [ "lib", @@ -66,7 +66,7 @@ ], "husky": { "hooks": { - "pre-commit": "yarn lint && yarn test" + "pre-commit": "npm run lint && npm run test" } }, "license": "MIT" diff --git a/package-lock.json b/package-lock.json index fd26366..5cac65d 100644 --- a/package-lock.json +++ b/package-lock.json @@ -13,7 +13,7 @@ "@mozilla/readability": "^0.3.0", "express": "^4.17.1", "express-rate-limit": "^6.0.5", - "html-entities": "^2.3.2", + "html-entities": "^2.3.3", "jsdom": "^16.4.0", "justify-text": "^1.1.3", "turndown": "^7.0.0", @@ -501,9 +501,9 @@ } }, "node_modules/html-entities": { - "version": "2.3.2", - "resolved": "https://registry.npmjs.org/html-entities/-/html-entities-2.3.2.tgz", - "integrity": "sha512-c3Ab/url5ksaT0WyleslpBEthOzWhrjQbg75y7XUsfSzi3Dgzt0l8w5e7DylRn15MTlMMD58dTfzddNS2kcAjQ==" + "version": "2.3.3", + "resolved": "https://registry.npmjs.org/html-entities/-/html-entities-2.3.3.tgz", + "integrity": "sha512-DV5Ln36z34NNTDgnz0EWGBLZENelNAtkiFA4kyNOG2tDI6Mz1uSWiq1wAKdyjnJwyDiDO7Fa2SO1CTxPXL8VxA==" }, "node_modules/http-errors": { "version": "1.8.1", @@ -1545,9 +1545,9 @@ } }, "html-entities": { - "version": "2.3.2", - "resolved": "https://registry.npmjs.org/html-entities/-/html-entities-2.3.2.tgz", - "integrity": "sha512-c3Ab/url5ksaT0WyleslpBEthOzWhrjQbg75y7XUsfSzi3Dgzt0l8w5e7DylRn15MTlMMD58dTfzddNS2kcAjQ==" + "version": "2.3.3", + "resolved": "https://registry.npmjs.org/html-entities/-/html-entities-2.3.3.tgz", + "integrity": "sha512-DV5Ln36z34NNTDgnz0EWGBLZENelNAtkiFA4kyNOG2tDI6Mz1uSWiq1wAKdyjnJwyDiDO7Fa2SO1CTxPXL8VxA==" }, "http-errors": { "version": "1.8.1", diff --git a/package.json b/package.json index 5082d47..089e108 100644 --- a/package.json +++ b/package.json @@ -8,7 +8,7 @@ "@mozilla/readability": "^0.3.0", "express": "^4.17.1", "express-rate-limit": "^6.0.5", - "html-entities": "^2.3.2", + "html-entities": "^2.3.3", "jsdom": "^16.4.0", "justify-text": "^1.1.3", "turndown": "^7.0.0",