strip tags and decode entities in code blocks

main
macsplit 2022-05-12 15:36:59 +01:00
parent 536bbea0ea
commit 15c455cd24
7 changed files with 59 additions and 261 deletions

View File

@ -8,6 +8,7 @@ const table_to_markdown = require('./html_table_to_markdown.js');
const validURL = require('@7c/validurl');
const express = require('express');
const rateLimit = require('express-rate-limit');
const htmlentities = require('html-entities');
const port = process.env.PORT;
@ -71,14 +72,14 @@ app.post('/', function(req, res) {
if (!html) {
res.status(400).send("Please provide a POST parameter called html");
} else {
try {
//try {
let document = new JSDOM(html);
let markdown = process_dom(url, document, res, inline_title, ignore_links);
send_headers(res);
res.send(markdown);
} catch (error) {
res.status(400).send("Could not parse that document");
}
//} catch (error) {
// res.status(400).send("Could not parse that document");
//}
}
});
@ -172,6 +173,8 @@ function code_block_to_markdown (html) {
const match_code = /^\s*<code[^>]*>[\r\n]*([\s\S]*)<\/code>\s*$/ig.exec(inner_html);
if (match_code && match_code[1])
inner_html = match_code[1];
inner_html = inner_html.replace(/(<([^>]+)>)/ig, "");
inner_html = htmlentities.decode(inner_html);
const markdown = "```\n"+inner_html+"\n```\n";
return markdown;
}

6
node_modules/.package-lock.json generated vendored
View File

@ -485,9 +485,9 @@
}
},
"node_modules/html-entities": {
"version": "2.3.2",
"resolved": "https://registry.npmjs.org/html-entities/-/html-entities-2.3.2.tgz",
"integrity": "sha512-c3Ab/url5ksaT0WyleslpBEthOzWhrjQbg75y7XUsfSzi3Dgzt0l8w5e7DylRn15MTlMMD58dTfzddNS2kcAjQ=="
"version": "2.3.3",
"resolved": "https://registry.npmjs.org/html-entities/-/html-entities-2.3.3.tgz",
"integrity": "sha512-DV5Ln36z34NNTDgnz0EWGBLZENelNAtkiFA4kyNOG2tDI6Mz1uSWiq1wAKdyjnJwyDiDO7Fa2SO1CTxPXL8VxA=="
},
"node_modules/http-errors": {
"version": "1.8.1",

View File

@ -1,205 +0,0 @@
2.3.2
-----
* Minimize data files, remove unnecessary files.
2.3.1
-----
* Improve performance of `encode()`, `decode()` and `decodeEntity()` by using function inlining.
* Fix decoding HEX HTML entities in some cases.
2.3.0
-----
* Add flow types.
2.2.0
-----
* A fast `decodeEntity()` method to decode a single HTML entity.
2.1.1
-----
* Speed up both `encode()` and `decode()` methods.
2.1.0
-----
* Add `extensive` mode to `encode()` method. This mode encodes all non-printable characters, non-ASCII characters and all characters with named references.
2.0.6
-----
* Handle invalid numeric HTML entities: mimic browser behaviour.
2.0.5
-----
* Handling behaviour of ambiguous ampersands.
2.0.4
-----
* Fix webpack build warning.
2.0.3
-----
* Handle invalid numeric HTML entities.
2.0.2
-----
* Handle `null` and `undefined` text values.
2.0.1
-----
* Fix decoding numeric HTML entities.
2.0.0
-----
* Performance was greatly improved.
* New API: simpler and more flexible.
`htmlEntitiesInstance.encode(text)` -> `encode(text)`
Before:
```js
import {AllHtmlEntities} from 'html-entities';
const entities = new AllHtmlEntities();
console.log(
entities.encode('<Hello & World>')
);
```
After:
```js
import {encode} from 'html-entities';
console.log(
encode('<Hello & World>')
);
```
---
`instance.encodeNonASCII(text)` -> `encode(text, {mode: 'nonAscii'})`
Before:
```js
import {AllHtmlEntities} from 'html-entities';
const entities = new AllHtmlEntities();
console.log(
entities.encodeNonASCII('& © ∆')
);
```
After:
```js
import {encode} from 'html-entities';
console.log(
encode('& © ∆', {mode: 'nonAscii'})
);
```
---
`instance.encodeNonUTF(text)` -> `encode(text, {mode: 'nonAsciiPrintable'})`
Before:
```js
import {AllHtmlEntities} from 'html-entities';
const entities = new AllHtmlEntities();
console.log(
entities.encodeNonUTF('& © ∆ \x01')
);
```
After:
```js
import {encode} from 'html-entities';
console.log(
encode('& © ∆ \x01', {mode: 'nonAsciiPrintable'})
);
```
---
`instance.decode(text)` -> `decode(text)`
Before:
```js
import {AllHtmlEntities} from 'html-entities';
const entities = new AllHtmlEntities();
console.log(
entities.decode('&lt;&gt;&amp;')
);
```
After:
```js
import {decode} from 'html-entities';
console.log(
decode('&lt;&gt;&amp;')
);
```
---
Different XML/HTML versions are now implemented via options instead of different classes.
Before:
```js
import {XmlEntities, Html4Entities, Html5Entities, AllHtmlEntities} from 'html-entities';
const xmlEntities = new XmlEntities();
const html4Entities = new Html4Entities();
const html5Entities = new Html5Entities();
const allHtmlEntities = new AllHtmlEntities();
console.log(xmlEntities.encode('<>&'));
console.log(html4Entities.encode('<>&©'));
console.log(html5Entities.encode('<>&©℞'));
console.log(allHtmlEntities.encode('<>&©℞'));
console.log(xmlEntities.decode('&lt;&gt;&amp;'));
console.log(html4Entities.decode('&lt;&gt;&amp;&copy;'));
console.log(html5Entities.decode('&lt;&gt;&amp;&copy;&rx;'));
console.log(allHtmlEntities.decode('&lt;&gt;&amp;&copy;&rx;'));
```
After:
```js
import {encode, decode} from 'html-entities';
console.log(encode('<>&', {level: 'xml'}));
console.log(encode('<>&©', {level: 'html4', mode: 'nonAscii'}));
console.log(encode('<>&©℞', {level: 'html5', mode: 'nonAscii'}));
console.log(encode('<>&©℞', {level: 'all', mode: 'nonAscii'}));
console.log(decode('&lt;&gt;&amp;', {level: 'xml'}));
console.log(decode('&lt;&gt;&amp;&copy;', {level: 'html4'}));
console.log(decode('&lt;&gt;&amp;&copy;&rx;', {level: 'html5'}));
console.log(decode('&lt;&gt;&amp;&copy;&rx;', {level: 'all'}));
```

66
node_modules/html-entities/README.md generated vendored
View File

@ -124,68 +124,68 @@ Common
Initialization / Load speed
* #1: html-entities x 2,544,400 ops/sec ±4.52% (77 runs sampled)
#2: entities x 1,757,526 ops/sec ±3.99% (81 runs sampled)
#3: he x 1,281,542 ops/sec ±9.31% (74 runs sampled)
* #1: html-entities x 2,632,942 ops/sec ±3.71% (72 runs sampled)
#2: entities x 1,379,154 ops/sec ±5.87% (75 runs sampled)
#3: he x 1,334,035 ops/sec ±3.14% (83 runs sampled)
HTML5
Encode test
* #1: html-entities.encode - html5, nonAscii x 402,711 ops/sec ±0.61% (92 runs sampled)
* #2: html-entities.encode - html5, nonAsciiPrintable x 402,631 ops/sec ±2.99% (92 runs sampled)
* #3: html-entities.encode - html5, extensive x 269,162 ops/sec ±0.26% (97 runs sampled)
#4: entities.encodeNonAsciiHTML x 260,447 ops/sec ±2.53% (95 runs sampled)
#5: entities.encodeHTML x 101,059 ops/sec ±3.99% (91 runs sampled)
#6: he.encode x 93,180 ops/sec ±3.17% (92 runs sampled)
* #1: html-entities.encode - html5, nonAscii x 415,806 ops/sec ±0.73% (85 runs sampled)
* #2: html-entities.encode - html5, nonAsciiPrintable x 401,420 ops/sec ±0.35% (93 runs sampled)
#3: entities.encodeNonAsciiHTML x 401,235 ops/sec ±0.41% (88 runs sampled)
#4: entities.encodeHTML x 284,868 ops/sec ±0.45% (93 runs sampled)
* #5: html-entities.encode - html5, extensive x 237,613 ops/sec ±0.42% (93 runs sampled)
#6: he.encode x 91,459 ops/sec ±0.50% (84 runs sampled)
Decode test
* #1: html-entities.decode - html5, attribute x 340,043 ops/sec ±2.82% (92 runs sampled)
* #2: html-entities.decode - html5, body x 330,002 ops/sec ±1.52% (87 runs sampled)
* #3: html-entities.decode - html5, strict x 320,582 ops/sec ±5.34% (88 runs sampled)
#4: entities.decodeHTMLStrict x 286,294 ops/sec ±3.14% (89 runs sampled)
#5: entities.decodeHTML x 232,856 ops/sec ±3.05% (90 runs sampled)
#6: he.decode x 163,300 ops/sec ±0.62% (92 runs sampled)
#1: entities.decodeHTMLStrict x 614,920 ops/sec ±0.41% (89 runs sampled)
#2: entities.decodeHTML x 577,698 ops/sec ±0.44% (90 runs sampled)
* #3: html-entities.decode - html5, strict x 323,680 ops/sec ±0.39% (92 runs sampled)
* #4: html-entities.decode - html5, body x 297,548 ops/sec ±0.45% (91 runs sampled)
* #5: html-entities.decode - html5, attribute x 293,617 ops/sec ±0.37% (94 runs sampled)
#6: he.decode x 145,383 ops/sec ±0.36% (94 runs sampled)
HTML4
Encode test
* #1: html-entities.encode - html4, nonAsciiPrintable x 391,885 ops/sec ±0.27% (95 runs sampled)
* #2: html-entities.encode - html4, nonAscii x 400,086 ops/sec ±2.54% (94 runs sampled)
* #3: html-entities.encode - html4, extensive x 193,623 ops/sec ±2.70% (92 runs sampled)
* #1: html-entities.encode - html4, nonAscii x 379,799 ops/sec ±0.29% (96 runs sampled)
* #2: html-entities.encode - html4, nonAsciiPrintable x 350,003 ops/sec ±0.42% (92 runs sampled)
* #3: html-entities.encode - html4, extensive x 169,759 ops/sec ±0.43% (90 runs sampled)
Decode test
* #1: html-entities.decode - html4, attribute x 356,174 ops/sec ±0.49% (96 runs sampled)
* #2: html-entities.decode - html4, body x 342,666 ops/sec ±2.38% (91 runs sampled)
* #3: html-entities.decode - html4, strict x 341,667 ops/sec ±4.46% (87 runs sampled)
* #1: html-entities.decode - html4, attribute x 291,048 ops/sec ±0.42% (92 runs sampled)
* #2: html-entities.decode - html4, strict x 287,110 ops/sec ±0.56% (93 runs sampled)
* #3: html-entities.decode - html4, body x 285,529 ops/sec ±0.57% (93 runs sampled)
XML
Encode test
* #1: html-entities.encode - xml, nonAscii x 450,968 ops/sec ±2.73% (92 runs sampled)
* #2: html-entities.encode - xml, nonAsciiPrintable x 432,058 ops/sec ±4.12% (93 runs sampled)
* #3: html-entities.encode - xml, extensive x 265,336 ops/sec ±3.41% (93 runs sampled)
#4: entities.encodeXML x 254,862 ops/sec ±3.01% (95 runs sampled)
#1: entities.encodeXML x 418,561 ops/sec ±0.80% (90 runs sampled)
* #2: html-entities.encode - xml, nonAsciiPrintable x 402,868 ops/sec ±0.30% (89 runs sampled)
* #3: html-entities.encode - xml, nonAscii x 403,669 ops/sec ±7.87% (83 runs sampled)
* #4: html-entities.encode - xml, extensive x 237,766 ops/sec ±0.45% (93 runs sampled)
Decode test
* #1: html-entities.decode - xml, strict x 432,820 ops/sec ±0.53% (89 runs sampled)
* #2: html-entities.decode - xml, attribute x 426,037 ops/sec ±0.75% (94 runs sampled)
* #3: html-entities.decode - xml, body x 424,618 ops/sec ±3.47% (93 runs sampled)
#4: entities.decodeXML x 378,536 ops/sec ±2.48% (93 runs sampled)
#1: entities.decodeXML x 888,700 ops/sec ±0.48% (93 runs sampled)
* #2: html-entities.decode - xml, strict x 353,127 ops/sec ±0.40% (92 runs sampled)
* #3: html-entities.decode - xml, body x 355,796 ops/sec ±1.58% (86 runs sampled)
* #4: html-entities.decode - xml, attribute x 369,454 ops/sec ±8.74% (84 runs sampled)
Escaping
Escape test
* #1: html-entities.encode - xml, specialChars x 1,424,362 ops/sec ±0.55% (95 runs sampled)
#2: he.escape x 962,420 ops/sec ±3.12% (94 runs sampled)
#3: entities.escapeUTF8 x 443,138 ops/sec ±1.06% (90 runs sampled)
#4: entities.escape x 197,515 ops/sec ±2.73% (91 runs sampled)
#1: entities.escapeUTF8 x 1,308,013 ops/sec ±0.37% (91 runs sampled)
* #2: html-entities.encode - xml, specialChars x 1,258,760 ops/sec ±1.00% (93 runs sampled)
#3: he.escape x 822,569 ops/sec ±0.24% (94 runs sampled)
#4: entities.escape x 434,243 ops/sec ±0.34% (91 runs sampled)
```
License

View File

@ -1,6 +1,6 @@
{
"name": "html-entities",
"version": "2.3.2",
"version": "2.3.3",
"description": "Fastest HTML entities encode/decode library.",
"keywords": [
"html",
@ -15,7 +15,6 @@
"name": "Marat Dulin",
"email": "mdevils@yandex.ru"
},
"dependencies": {},
"devDependencies": {
"@types/benchmark": "^2.1.0",
"@types/chai": "^4.2.11",
@ -26,7 +25,7 @@
"@typescript-eslint/parser": "^4.6.1",
"benchmark": "^2.1.4",
"chai": "^4.2.0",
"entities": "^2.2.0",
"entities": "^3.0.1",
"eslint": "^7.12.1",
"eslint-config-prettier": "^6.15.0",
"eslint-plugin-import": "^2.22.1",
@ -34,7 +33,7 @@
"flowgen": "^1.13.0",
"he": "^1.2.0",
"husky": "^4.3.6",
"mocha": "^7.1.2",
"mocha": "^9.1.3",
"prettier": "^2.1.2",
"terser": "^5.6.1",
"ts-node": "^8.9.1",
@ -46,19 +45,20 @@
"type": "git",
"url": "https://github.com/mdevils/html-entities.git"
},
"sideEffects": false,
"main": "./lib/index.js",
"typings": "./lib/index.d.ts",
"types": "./lib/index.d.ts",
"scripts": {
"test": "TS_NODE_COMPILER=ttypescript mocha --recursive -r ts-node/register test/**/*.ts",
"test:lib": "TEST_LIB=1 yarn test",
"test:lib": "TEST_LIB=1 npm run test",
"benchmark": "TS_NODE_COMPILER=ttypescript ts-node benchmark/benchmark",
"lint": "eslint src/**.ts",
"flow-type-gen": "flowgen --add-flow-header lib/index.d.ts -o lib/index.js.flow",
"remove-unused-declarations": "find lib -type f \\( -name '*.d.ts' ! -name index.d.ts \\) | xargs rm",
"minimize-lib-files": "find lib -type f \\( -name '*.js' ! -name index.js \\) | while read fn; do terser $fn -o $fn; done",
"build": "rm -Rf lib/* && ttsc && yarn remove-unused-declarations && yarn flow-type-gen && yarn minimize-lib-files && yarn test:lib",
"prepublishOnly": "yarn build"
"build": "rm -Rf lib/* && ttsc && npm run remove-unused-declarations && npm run flow-type-gen && npm run minimize-lib-files && npm run test:lib",
"prepublishOnly": "npm run build"
},
"files": [
"lib",
@ -66,7 +66,7 @@
],
"husky": {
"hooks": {
"pre-commit": "yarn lint && yarn test"
"pre-commit": "npm run lint && npm run test"
}
},
"license": "MIT"

14
package-lock.json generated
View File

@ -13,7 +13,7 @@
"@mozilla/readability": "^0.3.0",
"express": "^4.17.1",
"express-rate-limit": "^6.0.5",
"html-entities": "^2.3.2",
"html-entities": "^2.3.3",
"jsdom": "^16.4.0",
"justify-text": "^1.1.3",
"turndown": "^7.0.0",
@ -501,9 +501,9 @@
}
},
"node_modules/html-entities": {
"version": "2.3.2",
"resolved": "https://registry.npmjs.org/html-entities/-/html-entities-2.3.2.tgz",
"integrity": "sha512-c3Ab/url5ksaT0WyleslpBEthOzWhrjQbg75y7XUsfSzi3Dgzt0l8w5e7DylRn15MTlMMD58dTfzddNS2kcAjQ=="
"version": "2.3.3",
"resolved": "https://registry.npmjs.org/html-entities/-/html-entities-2.3.3.tgz",
"integrity": "sha512-DV5Ln36z34NNTDgnz0EWGBLZENelNAtkiFA4kyNOG2tDI6Mz1uSWiq1wAKdyjnJwyDiDO7Fa2SO1CTxPXL8VxA=="
},
"node_modules/http-errors": {
"version": "1.8.1",
@ -1545,9 +1545,9 @@
}
},
"html-entities": {
"version": "2.3.2",
"resolved": "https://registry.npmjs.org/html-entities/-/html-entities-2.3.2.tgz",
"integrity": "sha512-c3Ab/url5ksaT0WyleslpBEthOzWhrjQbg75y7XUsfSzi3Dgzt0l8w5e7DylRn15MTlMMD58dTfzddNS2kcAjQ=="
"version": "2.3.3",
"resolved": "https://registry.npmjs.org/html-entities/-/html-entities-2.3.3.tgz",
"integrity": "sha512-DV5Ln36z34NNTDgnz0EWGBLZENelNAtkiFA4kyNOG2tDI6Mz1uSWiq1wAKdyjnJwyDiDO7Fa2SO1CTxPXL8VxA=="
},
"http-errors": {
"version": "1.8.1",

View File

@ -8,7 +8,7 @@
"@mozilla/readability": "^0.3.0",
"express": "^4.17.1",
"express-rate-limit": "^6.0.5",
"html-entities": "^2.3.2",
"html-entities": "^2.3.3",
"jsdom": "^16.4.0",
"justify-text": "^1.1.3",
"turndown": "^7.0.0",