2022-01-08 17:05:05 +00:00
|
|
|
const https = require('https');
|
|
|
|
const turndown = require('turndown');
|
|
|
|
const { Readability } = require('@mozilla/readability');
|
|
|
|
const JSDOM = require('jsdom').JSDOM;
|
2022-01-13 14:40:21 +00:00
|
|
|
const common_filters = require('./url_to_markdown_common_filters');
|
2022-05-02 00:47:43 +00:00
|
|
|
const apple_dev_parser = require('./url_to_markdown_apple_dev_docs.js');
|
2022-01-30 20:14:46 +00:00
|
|
|
const table_to_markdown = require('./html_table_to_markdown.js');
|
2022-01-18 16:39:42 +00:00
|
|
|
const validURL = require('@7c/validurl');
|
2022-01-29 19:11:43 +00:00
|
|
|
const express = require('express');
|
2022-01-09 15:20:59 +00:00
|
|
|
const rateLimit = require('express-rate-limit');
|
2022-11-22 11:37:33 +00:00
|
|
|
const htmlEntities = require('html-entities');
|
2022-01-29 19:11:43 +00:00
|
|
|
|
|
|
|
// Port handed to app.listen(); taken from the environment with no fallback here.
const port = process.env.PORT;

// URL prefixes that are routed to dedicated readers instead of the generic one.
const apple_dev_prefix = "https://developer.apple.com";
const stackoverflow_prefix = "https://stackoverflow.com/questions";

// Express application and the shared HTML -> markdown converter.
const app = express();
const service = new turndown();

// Rate limiting: `max` requests per `windowMs` (30 seconds); the options are
// handed to express-rate-limit unchanged.
const rate_limit_options = {
	windowMs: 30 * 1000,
	max: 5,
	message: 'Rate limit exceeded',
	headers: true
};
const rateLimiter = rateLimit(rate_limit_options);
app.use(rateLimiter);

// Parse URL-encoded form bodies (used by the POST endpoint), up to 10mb.
const form_parser_options = {
	extended: true,
	limit: '10mb'
};
app.use(express.urlencoded(form_parser_options));
|
2022-01-09 15:20:59 +00:00
|
|
|
|
2022-01-08 17:05:05 +00:00
|
|
|
/**
 * GET /?url=<page>[&title=true][&links=false]
 *
 * Fetches the page at `url` and replies with its markdown conversion.
 *
 * Query parameters:
 *   url   - required; must be a string accepted by validURL().
 *   title - "true" prepends the page title as a heading (default: off).
 *   links - "false" asks the filters to drop links (default: links kept).
 *
 * Replies 400 when `url` is missing, is not a single string, or is rejected by
 * validURL().
 */
app.get('/', (req, res) => {
	const url = req.query.url;
	// Only the exact strings 'true' / 'false' flip these flags; anything else
	// (including a missing parameter) leaves them at their default of false.
	const inline_title = (req.query.title === 'true');
	const ignore_links = (req.query.links === 'false');

	// A repeated or bracketed parameter (?url=a&url=b, ?url[x]=y) arrives as an
	// array/object, and url.startsWith() below would throw on it. Treat any
	// non-string value as an invalid url instead.
	if (typeof url !== 'string' || !url || !validURL(url)) {
		res.status(400).send("Please specify a valid url query parameter");
		return;
	}

	// Headers are set before dispatching, so they also apply to whatever the
	// selected reader eventually sends.
	send_headers(res);
	if (url.startsWith(apple_dev_prefix)) {
		read_apple_url(url, res, inline_title, ignore_links);
	} else if (url.startsWith(stackoverflow_prefix)) {
		read_stack_url(url, res, inline_title, ignore_links);
	} else {
		read_url(url, res, inline_title, ignore_links);
	}
});
|
|
|
|
|
2022-01-29 19:11:43 +00:00
|
|
|
/**
 * POST /[?title=true][&links=false]
 *
 * Form fields:
 *   html - the document to convert (required unless `url` is a Stack Overflow
 *          question URL, which is fetched by the server instead).
 *   url  - optional source URL; when present it is passed on to the filters.
 *
 * The `title` and `links` flags are read from the query string, exactly as in
 * the GET endpoint. Replies 400 when `html` is missing or cannot be converted.
 */
app.post('/', function(req, res) {
	// Fall back to an empty object in case no body parser populated req.body.
	const body = req.body || {};
	const html = body.html;
	// With `extended: true` parsing a field can arrive as an array or object;
	// url.startsWith() would throw on those, so a non-string url is treated as
	// if it had not been supplied.
	const url = (typeof body.url === 'string') ? body.url : undefined;

	// Only the exact strings 'true' / 'false' flip these flags.
	const inline_title = (req.query.title === 'true');
	const ignore_links = (req.query.links === 'false');

	// Stack Overflow questions are always fetched server-side, ignoring `html`.
	if (url && validURL(url) && url.startsWith(stackoverflow_prefix)) {
		send_headers(res);
		read_stack_url(url, res, inline_title, ignore_links);
		return;
	}

	if (!html) {
		res.status(400).send("Please provide a POST parameter called html");
		return;
	}

	try {
		const document = new JSDOM(html);
		const markdown = process_dom(url, document, res, inline_title, ignore_links);
		send_headers(res);
		res.send(markdown);
	} catch (error) {
		// Any failure while parsing or converting is reported as a bad request.
		res.status(400).send("Could not parse that document");
	}
});
|
|
|
|
|
2022-01-08 17:05:05 +00:00
|
|
|
// Start the HTTP server on the configured port; the startup callback is
// intentionally a no-op.
app.listen(port, function () {});
|
|
|
|
|
2022-01-29 19:11:43 +00:00
|
|
|
/**
 * Set the response headers shared by the markdown endpoints.
 *
 * @param res response object exposing header(name, value)
 */
function send_headers(res) {
	const shared_headers = [
		["Access-Control-Allow-Origin", '*'],
		// Lets cross-origin callers read the X-Title header.
		["Access-Control-Expose-Headers", 'X-Title'],
		["Content-Type", 'text/markdown']
	];
	for (const [name, value] of shared_headers) {
		res.header(name, value);
	}
}
|
|
|
|
|
2022-11-22 22:12:45 +00:00
|
|
|
/**
 * Convert a parsed page into markdown.
 *
 * Steps: publish the page title in the X-Title header, optionally narrow the
 * document to one element, run Readability, swap <pre> blocks and tables for
 * placeholders, convert the rest with turndown, then substitute the
 * pre-rendered code/table markdown back in and apply the URL filters.
 *
 * @param url           source URL; when truthy the result is passed through
 *                      common_filters.filter(url, markdown, ignore_links)
 * @param document      JSDOM instance for the page
 * @param res           response object; receives the X-Title header
 * @param inline_title  when true, prepend "# <title>" to the result
 * @param ignore_links  forwarded to the filters
 * @param id            optional element id; when given, only that element's
 *                      inner HTML is converted
 * @returns the markdown string
 * @throws Error when `id` matches no element or Readability yields no article
 */
function process_dom(url, document, res, inline_title, ignore_links, id="") {
	const title = document.window.document.querySelector('title');
	if (title) {
		res.header("X-Title", encodeURIComponent(title.textContent));
	}

	let source = document;
	if (id) {
		const section = document.window.document.querySelector("#"+id);
		if (!section) {
			throw new Error("No element with id '"+id+"' found in document");
		}
		source = new JSDOM('<!DOCTYPE html>'+ section.innerHTML);
	}

	// parse() may return null when no article can be extracted; fail with a
	// clear message instead of a TypeError on `.content`.
	const article = new Readability(source.window.document).parse();
	if (!article) {
		throw new Error("Readability could not extract any content");
	}

	const replacements = [];
	let readable = format_codeblocks(article.content, replacements);
	readable = format_tables(readable, replacements);

	let markdown = service.turndown(readable);

	// Apply in reverse order (tables before code blocks): code blocks are
	// extracted first, so a table's rendered markdown may itself carry a
	// code-block placeholder that must still be resolved afterwards.
	// The replacer function stops `$$`, `$&`, `$1`, ... inside the rendered
	// markdown from being interpreted as replacement patterns.
	for (let i = replacements.length - 1; i >= 0; i--) {
		const entry = replacements[i];
		markdown = markdown.replace(entry.placeholder, () => entry.replacement);
	}

	let result = (url) ? common_filters.filter(url, markdown, ignore_links) : markdown;
	if (inline_title && title) {
		result = "# " + title.textContent + "\n" + result;
	}
	return result;
}
|
|
|
|
|
2022-04-08 10:16:54 +00:00
|
|
|
/**
 * Fetch `url`, convert the page to markdown and send it on `res`.
 * Any failure while fetching, converting or sending is answered with a 400.
 *
 * @param url           page to fetch
 * @param res           response object
 * @param inline_title  forwarded to process_dom
 * @param ignore_links  forwarded to process_dom
 */
function read_url(url, res, inline_title, ignore_links) {
	const on_loaded = (dom) => {
		const converted = process_dom(url, dom, res, inline_title, ignore_links);
		res.send(converted);
	};
	const on_failed = () => {
		res.status(400).send("Sorry, could not fetch and convert that URL");
	};
	JSDOM.fromURL(url).then(on_loaded).catch(on_failed);
}
|
2022-01-30 15:04:54 +00:00
|
|
|
|
2022-11-22 22:12:45 +00:00
|
|
|
/**
 * Fetch a Stack Overflow question page and send its markdown on `res`.
 *
 * The elements with ids 'question' and 'answers' are converted separately.
 * When the converted answers section starts with 'Your Answer', only the
 * question is sent; otherwise the answers follow under a "## Answer" heading.
 * Any failure is answered with a 400.
 *
 * @param url           question page to fetch
 * @param res           response object
 * @param inline_title  applied to the question part only
 * @param ignore_links  forwarded to process_dom for both parts
 */
function read_stack_url(url, res, inline_title, ignore_links) {
	JSDOM.fromURL(url)
		.then((dom) => {
			const question_md = process_dom(url, dom, res, inline_title, ignore_links, 'question');
			const answers_md = process_dom(url, dom, res, false, ignore_links, 'answers');
			const only_answer_form = answers_md.startsWith('Your Answer');
			const payload = only_answer_form
				? question_md
				: question_md + "\n\n## Answer\n"+ answers_md;
			res.send(payload);
		})
		.catch(() => {
			res.status(400).send("Sorry, could not fetch and convert that URL");
		});
}
|
|
|
|
|
2022-05-02 00:47:43 +00:00
|
|
|
/**
 * Convert an Apple developer documentation page to markdown.
 *
 * The page URL is mapped to a JSON URL by apple_dev_parser.dev_doc_url(), the
 * JSON is downloaded over HTTPS and rendered with
 * apple_dev_parser.parse_dev_doc_json().
 *
 * Every failure (bad URL, network error, non-JSON body, parser error) is
 * answered with a 400 instead of escaping as an uncaught exception or an
 * unhandled 'error' event.
 *
 * @param url           documentation page URL
 * @param res           response object
 * @param inline_title  forwarded to the parser
 * @param ignore_links  forwarded to the parser
 */
function read_apple_url(url, res, inline_title, ignore_links) {
	// Single failure path; guarded so a late error cannot send a second reply.
	const fail = () => {
		if (!res.headersSent) {
			res.status(400).send("Sorry, could not fetch and convert that URL");
		}
	};

	try {
		// Declared locally; the previous bare assignment created a global.
		const json_url = apple_dev_parser.dev_doc_url(url);

		const request = https.get(json_url, (apple_res) => {
			let body = "";
			// Decode as UTF-8 across chunk boundaries so multi-byte characters
			// split between chunks are not corrupted.
			apple_res.setEncoding("utf8");
			apple_res.on("data", (chunk) => {
				body += chunk;
			});
			apple_res.on("error", fail);
			apple_res.on("end", () => {
				let markdown;
				try {
					const json = JSON.parse(body);
					markdown = apple_dev_parser.parse_dev_doc_json(json, inline_title, ignore_links);
				} catch (error) {
					fail();
					return;
				}
				res.send(markdown);
			});
		});
		// Connection-level failures are reported on the request object.
		request.on("error", fail);
	} catch (error) {
		// dev_doc_url() or https.get() rejected the URL synchronously.
		fail();
	}
}
|
|
|
|
|
2022-01-30 15:04:54 +00:00
|
|
|
/**
 * Replace every <table>...</table> in `html` with a placeholder paragraph and
 * record the table's markdown rendering for later substitution.
 *
 * @param html          HTML string to scan
 * @param replacements  array that receives one
 *                      { placeholder, replacement } entry per table, appended
 *                      after any entries already present
 * @returns `html` with each matched table replaced by "<p>placeholder</p>"
 */
function format_tables(html, replacements) {
	// [\s\S] matches every character. The previous `(?:.|\n)` form skipped
	// tables containing \r, U+2028 or U+2029, which `.` does not match.
	const tables = html.match(/<table[^>]*>[\s\S]*?<\/table>/gi);
	if (!tables) {
		return html;
	}

	let result = html;
	tables.forEach((table, t) => {
		const markdown = table_to_markdown.convert(table);
		// Index plus a random suffix keeps placeholders distinct from page text.
		const placeholder = "urltomarkdowntableplaceholder"+t+Math.random();
		replacements.push({ placeholder: placeholder, replacement: markdown });
		// Replaces the first remaining occurrence of this exact table markup.
		result = result.replace(table, () => "<p>"+placeholder+"</p>");
	});
	return result;
}
|
|
|
|
|
2022-11-22 11:37:33 +00:00
|
|
|
/**
 * Replace every <pre>...</pre> in `html` with a placeholder paragraph and
 * record a fenced markdown code block for later substitution.
 *
 * For each block: <br> tags and <p> openers become newlines, all remaining
 * tags are stripped, HTML entities are decoded, and the text is wrapped in
 * ``` fences.
 *
 * @param html          HTML string to scan
 * @param replacements  array that receives one
 *                      { placeholder, replacement } entry per code block,
 *                      appended after any entries already present
 * @returns `html` with each matched block replaced by "<p>placeholder</p>"
 */
function format_codeblocks(html, replacements) {
	// [\s\S] matches every character. The previous `(?:.|\n)` form skipped
	// blocks containing \r, U+2028 or U+2029, which `.` does not match.
	const codeblocks = html.match(/<pre[^>]*>[\s\S]*?<\/pre>/gi);
	if (!codeblocks) {
		return html;
	}

	let result = html;
	codeblocks.forEach((codeblock, c) => {
		const stripped = codeblock
			.replace(/<br[^>]*>/g, "\n")
			.replace(/<p>/g, "\n")
			.replace(/<\/?[^>]+(>|$)/g, "");
		const text = htmlEntities.decode(stripped);
		const markdown = "```\n"+text+"\n```\n";
		// Index plus a random suffix keeps placeholders distinct from page text.
		const placeholder = "urltomarkdowncodeblockplaceholder"+c+Math.random();
		replacements.push({ placeholder: placeholder, replacement: markdown });
		// Replaces the first remaining occurrence of this exact block markup.
		result = result.replace(codeblock, () => "<p>"+placeholder+"</p>");
	});
	return result;
}
|
|
|
|
|