add per domain regexp post filtering
parent
3e4cbbf077
commit
07fb74e0b2
4
index.js
4
index.js
|
@ -2,6 +2,7 @@ const https = require('https');
|
|||
const turndown = require('turndown');
|
||||
const { Readability } = require('@mozilla/readability');
|
||||
const JSDOM = require('jsdom').JSDOM;
|
||||
const common_filters = require('./url_to_markdown_common_filters');
|
||||
|
||||
service = new turndown();
|
||||
|
||||
|
@ -37,7 +38,8 @@ function read_url(url, res) {
|
|||
JSDOM.fromURL(url).then((document)=>{
|
||||
let reader = new Readability(document.window.document);
|
||||
let article = reader.parse();
|
||||
let result = service.turndown(article.content);
|
||||
let markdown = service.turndown(article.content);
|
||||
let result = common_filters.filter(url, markdown);
|
||||
res.send(result);
|
||||
});
|
||||
}
|
||||
|
|
|
@ -0,0 +1,26 @@
|
|||
var urlparser = require('url');
|
||||
|
||||
module.exports = {
|
||||
|
||||
list: [
|
||||
{
|
||||
domain: /.*\.wikipedia\.org/,
|
||||
remove: [
|
||||
/\\\[\[edit\]\([^\s]+\s+"[^"]*"\)\\\]/i
|
||||
]
|
||||
}
|
||||
],
|
||||
|
||||
filter: function (url, data) {
|
||||
let domain = urlparser.parse(url).hostname
|
||||
for (let i=0;i<this.list.length;i++) {
|
||||
if (domain.match(this.list[i].domain)) {
|
||||
for (let j=0;j<this.list[i].remove.length; j++) {
|
||||
data = data.replace(this.list[i].remove[j],"");
|
||||
}
|
||||
}
|
||||
}
|
||||
return data;
|
||||
}
|
||||
|
||||
}
|
Loading…
Reference in New Issue