Add per-domain regexp post-filtering

main
Lee Hanken 2022-01-13 14:40:21 +00:00
parent 3e4cbbf077
commit 07fb74e0b2
2 changed files with 29 additions and 1 deletions

View File

@ -2,6 +2,7 @@ const https = require('https');
const turndown = require('turndown');
const { Readability } = require('@mozilla/readability');
const JSDOM = require('jsdom').JSDOM;
const common_filters = require('./url_to_markdown_common_filters');
service = new turndown();
@ -37,7 +38,8 @@ function read_url(url, res) {
JSDOM.fromURL(url).then((document)=>{
let reader = new Readability(document.window.document);
let article = reader.parse();
let result = service.turndown(article.content);
let markdown = service.turndown(article.content);
let result = common_filters.filter(url, markdown);
res.send(result);
});
}

View File

@ -0,0 +1,26 @@
var urlparser = require('url');
module.exports = {
list: [
{
domain: /.*\.wikipedia\.org/,
remove: [
/\\\[\[edit\]\([^\s]+\s+"[^"]*"\)\\\]/i
]
}
],
filter: function (url, data) {
let domain = urlparser.parse(url).hostname
for (let i=0;i<this.list.length;i++) {
if (domain.match(this.list[i].domain)) {
for (let j=0;j<this.list[i].remove.length; j++) {
data = data.replace(this.list[i].remove[j],"");
}
}
}
return data;
}
}