Add per-domain regexp post-filtering
parent
3e4cbbf077
commit
07fb74e0b2
4
index.js
4
index.js
|
@ -2,6 +2,7 @@ const https = require('https');
|
||||||
const turndown = require('turndown');
|
const turndown = require('turndown');
|
||||||
const { Readability } = require('@mozilla/readability');
|
const { Readability } = require('@mozilla/readability');
|
||||||
const JSDOM = require('jsdom').JSDOM;
|
const JSDOM = require('jsdom').JSDOM;
|
||||||
|
const common_filters = require('./url_to_markdown_common_filters');
|
||||||
|
|
||||||
service = new turndown();
|
service = new turndown();
|
||||||
|
|
||||||
|
@ -37,7 +38,8 @@ function read_url(url, res) {
|
||||||
JSDOM.fromURL(url).then((document)=>{
|
JSDOM.fromURL(url).then((document)=>{
|
||||||
let reader = new Readability(document.window.document);
|
let reader = new Readability(document.window.document);
|
||||||
let article = reader.parse();
|
let article = reader.parse();
|
||||||
let result = service.turndown(article.content);
|
let markdown = service.turndown(article.content);
|
||||||
|
let result = common_filters.filter(url, markdown);
|
||||||
res.send(result);
|
res.send(result);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,26 @@
|
||||||
|
var urlparser = require('url');
|
||||||
|
|
||||||
|
module.exports = {
|
||||||
|
|
||||||
|
list: [
|
||||||
|
{
|
||||||
|
domain: /.*\.wikipedia\.org/,
|
||||||
|
remove: [
|
||||||
|
/\\\[\[edit\]\([^\s]+\s+"[^"]*"\)\\\]/i
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
|
||||||
|
filter: function (url, data) {
|
||||||
|
let domain = urlparser.parse(url).hostname
|
||||||
|
for (let i=0;i<this.list.length;i++) {
|
||||||
|
if (domain.match(this.list[i].domain)) {
|
||||||
|
for (let j=0;j<this.list[i].remove.length; j++) {
|
||||||
|
data = data.replace(this.list[i].remove[j],"");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return data;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
Loading…
Reference in New Issue