'use strict'; function extend (destination) { for (var i = 1; i < arguments.length; i++) { var source = arguments[i]; for (var key in source) { if (source.hasOwnProperty(key)) destination[key] = source[key]; } } return destination } function repeat (character, count) { return Array(count + 1).join(character) } function trimLeadingNewlines (string) { return string.replace(/^\n*/, '') } function trimTrailingNewlines (string) { // avoid match-at-end regexp bottleneck, see #370 var indexEnd = string.length; while (indexEnd > 0 && string[indexEnd - 1] === '\n') indexEnd--; return string.substring(0, indexEnd) } var blockElements = [ 'ADDRESS', 'ARTICLE', 'ASIDE', 'AUDIO', 'BLOCKQUOTE', 'BODY', 'CANVAS', 'CENTER', 'DD', 'DIR', 'DIV', 'DL', 'DT', 'FIELDSET', 'FIGCAPTION', 'FIGURE', 'FOOTER', 'FORM', 'FRAMESET', 'H1', 'H2', 'H3', 'H4', 'H5', 'H6', 'HEADER', 'HGROUP', 'HR', 'HTML', 'ISINDEX', 'LI', 'MAIN', 'MENU', 'NAV', 'NOFRAMES', 'NOSCRIPT', 'OL', 'OUTPUT', 'P', 'PRE', 'SECTION', 'TABLE', 'TBODY', 'TD', 'TFOOT', 'TH', 'THEAD', 'TR', 'UL' ]; function isBlock (node) { return is(node, blockElements) } var voidElements = [ 'AREA', 'BASE', 'BR', 'COL', 'COMMAND', 'EMBED', 'HR', 'IMG', 'INPUT', 'KEYGEN', 'LINK', 'META', 'PARAM', 'SOURCE', 'TRACK', 'WBR' ]; function isVoid (node) { return is(node, voidElements) } function hasVoid (node) { return has(node, voidElements) } var meaningfulWhenBlankElements = [ 'A', 'TABLE', 'THEAD', 'TBODY', 'TFOOT', 'TH', 'TD', 'IFRAME', 'SCRIPT', 'AUDIO', 'VIDEO' ]; function isMeaningfulWhenBlank (node) { return is(node, meaningfulWhenBlankElements) } function hasMeaningfulWhenBlank (node) { return has(node, meaningfulWhenBlankElements) } function is (node, tagNames) { return tagNames.indexOf(node.nodeName) >= 0 } function has (node, tagNames) { return ( node.getElementsByTagName && tagNames.some(function (tagName) { return node.getElementsByTagName(tagName).length }) ) } var rules = {}; rules.paragraph = { filter: 'p', replacement: function (content) { return '\n\n' + content + '\n\n' } }; rules.lineBreak = { filter: 'br', replacement: function (content, node, options) { return options.br + '\n' } }; rules.heading = { filter: ['h1', 'h2', 'h3', 'h4', 'h5', 'h6'], replacement: function (content, node, options) { var hLevel = Number(node.nodeName.charAt(1)); if (options.headingStyle === 'setext' && hLevel < 3) { var underline = repeat((hLevel === 1 ? '=' : '-'), content.length); return ( '\n\n' + content + '\n' + underline + '\n\n' ) } else { return '\n\n' + repeat('#', hLevel) + ' ' + content + '\n\n' } } }; rules.blockquote = { filter: 'blockquote', replacement: function (content) { content = content.replace(/^\n+|\n+$/g, ''); content = content.replace(/^/gm, '> '); return '\n\n' + content + '\n\n' } }; rules.list = { filter: ['ul', 'ol'], replacement: function (content, node) { var parent = node.parentNode; if (parent.nodeName === 'LI' && parent.lastElementChild === node) { return '\n' + content } else { return '\n\n' + content + '\n\n' } } }; rules.listItem = { filter: 'li', replacement: function (content, node, options) { content = content .replace(/^\n+/, '') // remove leading newlines .replace(/\n+$/, '\n') // replace trailing newlines with just a single one .replace(/\n/gm, '\n '); // indent var prefix = options.bulletListMarker + ' '; var parent = node.parentNode; if (parent.nodeName === 'OL') { var start = parent.getAttribute('start'); var index = Array.prototype.indexOf.call(parent.children, node); prefix = (start ? Number(start) + index : index + 1) + '. '; } return ( prefix + content + (node.nextSibling && !/\n$/.test(content) ? '\n' : '') ) } }; rules.indentedCodeBlock = { filter: function (node, options) { return ( options.codeBlockStyle === 'indented' && node.nodeName === 'PRE' && node.firstChild && node.firstChild.nodeName === 'CODE' ) }, replacement: function (content, node, options) { return ( '\n\n ' + node.firstChild.textContent.replace(/\n/g, '\n ') + '\n\n' ) } }; rules.fencedCodeBlock = { filter: function (node, options) { return ( options.codeBlockStyle === 'fenced' && node.nodeName === 'PRE' && node.firstChild && node.firstChild.nodeName === 'CODE' ) }, replacement: function (content, node, options) { var className = node.firstChild.getAttribute('class') || ''; var language = (className.match(/language-(\S+)/) || [null, ''])[1]; var code = node.firstChild.textContent; var fenceChar = options.fence.charAt(0); var fenceSize = 3; var fenceInCodeRegex = new RegExp('^' + fenceChar + '{3,}', 'gm'); var match; while ((match = fenceInCodeRegex.exec(code))) { if (match[0].length >= fenceSize) { fenceSize = match[0].length + 1; } } var fence = repeat(fenceChar, fenceSize); return ( '\n\n' + fence + language + '\n' + code.replace(/\n$/, '') + '\n' + fence + '\n\n' ) } }; rules.horizontalRule = { filter: 'hr', replacement: function (content, node, options) { return '\n\n' + options.hr + '\n\n' } }; rules.inlineLink = { filter: function (node, options) { return ( options.linkStyle === 'inlined' && node.nodeName === 'A' && node.getAttribute('href') ) }, replacement: function (content, node) { var href = node.getAttribute('href'); var title = cleanAttribute(node.getAttribute('title')); if (title) title = ' "' + title + '"'; return '[' + content + '](' + href + title + ')' } }; rules.referenceLink = { filter: function (node, options) { return ( options.linkStyle === 'referenced' && node.nodeName === 'A' && node.getAttribute('href') ) }, replacement: function (content, node, options) { var href = node.getAttribute('href'); var title = cleanAttribute(node.getAttribute('title')); if (title) title = ' "' + title + '"'; var replacement; var reference; switch (options.linkReferenceStyle) { case 'collapsed': replacement = '[' + content + '][]'; reference = '[' + content + ']: ' + href + title; break case 'shortcut': replacement = '[' + content + ']'; reference = '[' + content + ']: ' + href + title; break default: var id = this.references.length + 1; replacement = '[' + content + '][' + id + ']'; reference = '[' + id + ']: ' + href + title; } this.references.push(reference); return replacement }, references: [], append: function (options) { var references = ''; if (this.references.length) { references = '\n\n' + this.references.join('\n') + '\n\n'; this.references = []; // Reset references } return references } }; rules.emphasis = { filter: ['em', 'i'], replacement: function (content, node, options) { if (!content.trim()) return '' return options.emDelimiter + content + options.emDelimiter } }; rules.strong = { filter: ['strong', 'b'], replacement: function (content, node, options) { if (!content.trim()) return '' return options.strongDelimiter + content + options.strongDelimiter } }; rules.code = { filter: function (node) { var hasSiblings = node.previousSibling || node.nextSibling; var isCodeBlock = node.parentNode.nodeName === 'PRE' && !hasSiblings; return node.nodeName === 'CODE' && !isCodeBlock }, replacement: function (content) { if (!content) return '' content = content.replace(/\r?\n|\r/g, ' '); var extraSpace = /^`|^ .*?[^ ].* $|`$/.test(content) ? ' ' : ''; var delimiter = '`'; var matches = content.match(/`+/gm) || []; while (matches.indexOf(delimiter) !== -1) delimiter = delimiter + '`'; return delimiter + extraSpace + content + extraSpace + delimiter } }; rules.image = { filter: 'img', replacement: function (content, node) { var alt = cleanAttribute(node.getAttribute('alt')); var src = node.getAttribute('src') || ''; var title = cleanAttribute(node.getAttribute('title')); var titlePart = title ? ' "' + title + '"' : ''; return src ? '![' + alt + ']' + '(' + src + titlePart + ')' : '' } }; function cleanAttribute (attribute) { return attribute ? attribute.replace(/(\n+\s*)+/g, '\n') : '' } /** * Manages a collection of rules used to convert HTML to Markdown */ function Rules (options) { this.options = options; this._keep = []; this._remove = []; this.blankRule = { replacement: options.blankReplacement }; this.keepReplacement = options.keepReplacement; this.defaultRule = { replacement: options.defaultReplacement }; this.array = []; for (var key in options.rules) this.array.push(options.rules[key]); } Rules.prototype = { add: function (key, rule) { this.array.unshift(rule); }, keep: function (filter) { this._keep.unshift({ filter: filter, replacement: this.keepReplacement }); }, remove: function (filter) { this._remove.unshift({ filter: filter, replacement: function () { return '' } }); }, forNode: function (node) { if (node.isBlank) return this.blankRule var rule; if ((rule = findRule(this.array, node, this.options))) return rule if ((rule = findRule(this._keep, node, this.options))) return rule if ((rule = findRule(this._remove, node, this.options))) return rule return this.defaultRule }, forEach: function (fn) { for (var i = 0; i < this.array.length; i++) fn(this.array[i], i); } }; function findRule (rules, node, options) { for (var i = 0; i < rules.length; i++) { var rule = rules[i]; if (filterValue(rule, node, options)) return rule } return void 0 } function filterValue (rule, node, options) { var filter = rule.filter; if (typeof filter === 'string') { if (filter === node.nodeName.toLowerCase()) return true } else if (Array.isArray(filter)) { if (filter.indexOf(node.nodeName.toLowerCase()) > -1) return true } else if (typeof filter === 'function') { if (filter.call(rule, node, options)) return true } else { throw new TypeError('`filter` needs to be a string, array, or function') } } /** * The collapseWhitespace function is adapted from collapse-whitespace * by Luc Thevenard. * * The MIT License (MIT) * * Copyright (c) 2014 Luc Thevenard * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ /** * collapseWhitespace(options) removes extraneous whitespace from an the given element. * * @param {Object} options */ function collapseWhitespace (options) { var element = options.element; var isBlock = options.isBlock; var isVoid = options.isVoid; var isPre = options.isPre || function (node) { return node.nodeName === 'PRE' }; if (!element.firstChild || isPre(element)) return var prevText = null; var keepLeadingWs = false; var prev = null; var node = next(prev, element, isPre); while (node !== element) { if (node.nodeType === 3 || node.nodeType === 4) { // Node.TEXT_NODE or Node.CDATA_SECTION_NODE var text = node.data.replace(/[ \r\n\t]+/g, ' '); if ((!prevText || / $/.test(prevText.data)) && !keepLeadingWs && text[0] === ' ') { text = text.substr(1); } // `text` might be empty at this point. if (!text) { node = remove(node); continue } node.data = text; prevText = node; } else if (node.nodeType === 1) { // Node.ELEMENT_NODE if (isBlock(node) || node.nodeName === 'BR') { if (prevText) { prevText.data = prevText.data.replace(/ $/, ''); } prevText = null; keepLeadingWs = false; } else if (isVoid(node) || isPre(node)) { // Avoid trimming space around non-block, non-BR void elements and inline PRE. prevText = null; keepLeadingWs = true; } else if (prevText) { // Drop protection if set previously. keepLeadingWs = false; } } else { node = remove(node); continue } var nextNode = next(prev, node, isPre); prev = node; node = nextNode; } if (prevText) { prevText.data = prevText.data.replace(/ $/, ''); if (!prevText.data) { remove(prevText); } } } /** * remove(node) removes the given node from the DOM and returns the * next node in the sequence. * * @param {Node} node * @return {Node} node */ function remove (node) { var next = node.nextSibling || node.parentNode; node.parentNode.removeChild(node); return next } /** * next(prev, current, isPre) returns the next node in the sequence, given the * current and previous nodes. * * @param {Node} prev * @param {Node} current * @param {Function} isPre * @return {Node} */ function next (prev, current, isPre) { if ((prev && prev.parentNode === current) || isPre(current)) { return current.nextSibling || current.parentNode } return current.firstChild || current.nextSibling || current.parentNode } /* * Set up window for Node.js */ var root = (typeof window !== 'undefined' ? window : {}); /* * Parsing HTML strings */ function canParseHTMLNatively () { var Parser = root.DOMParser; var canParse = false; // Adapted from https://gist.github.com/1129031 // Firefox/Opera/IE throw errors on unsupported types try { // WebKit returns null on unsupported types if (new Parser().parseFromString('', 'text/html')) { canParse = true; } } catch (e) {} return canParse } function createHTMLParser () { var Parser = function () {}; { if (shouldUseActiveX()) { Parser.prototype.parseFromString = function (string) { var doc = new window.ActiveXObject('htmlfile'); doc.designMode = 'on'; // disable on-page scripts doc.open(); doc.write(string); doc.close(); return doc }; } else { Parser.prototype.parseFromString = function (string) { var doc = document.implementation.createHTMLDocument(''); doc.open(); doc.write(string); doc.close(); return doc }; } } return Parser } function shouldUseActiveX () { var useActiveX = false; try { document.implementation.createHTMLDocument('').open(); } catch (e) { if (window.ActiveXObject) useActiveX = true; } return useActiveX } var HTMLParser = canParseHTMLNatively() ? root.DOMParser : createHTMLParser(); function RootNode (input, options) { var root; if (typeof input === 'string') { var doc = htmlParser().parseFromString( // DOM parsers arrange elements in the and . // Wrapping in a custom element ensures elements are reliably arranged in // a single element. '' + input + '', 'text/html' ); root = doc.getElementById('turndown-root'); } else { root = input.cloneNode(true); } collapseWhitespace({ element: root, isBlock: isBlock, isVoid: isVoid, isPre: options.preformattedCode ? isPreOrCode : null }); return root } var _htmlParser; function htmlParser () { _htmlParser = _htmlParser || new HTMLParser(); return _htmlParser } function isPreOrCode (node) { return node.nodeName === 'PRE' || node.nodeName === 'CODE' } function Node (node, options) { node.isBlock = isBlock(node); node.isCode = node.nodeName === 'CODE' || node.parentNode.isCode; node.isBlank = isBlank(node); node.flankingWhitespace = flankingWhitespace(node, options); return node } function isBlank (node) { return ( !isVoid(node) && !isMeaningfulWhenBlank(node) && /^\s*$/i.test(node.textContent) && !hasVoid(node) && !hasMeaningfulWhenBlank(node) ) } function flankingWhitespace (node, options) { if (node.isBlock || (options.preformattedCode && node.isCode)) { return { leading: '', trailing: '' } } var edges = edgeWhitespace(node.textContent); // abandon leading ASCII WS if left-flanked by ASCII WS if (edges.leadingAscii && isFlankedByWhitespace('left', node, options)) { edges.leading = edges.leadingNonAscii; } // abandon trailing ASCII WS if right-flanked by ASCII WS if (edges.trailingAscii && isFlankedByWhitespace('right', node, options)) { edges.trailing = edges.trailingNonAscii; } return { leading: edges.leading, trailing: edges.trailing } } function edgeWhitespace (string) { var m = string.match(/^(([ \t\r\n]*)(\s*))[\s\S]*?((\s*?)([ \t\r\n]*))$/); return { leading: m[1], // whole string for whitespace-only strings leadingAscii: m[2], leadingNonAscii: m[3], trailing: m[4], // empty for whitespace-only strings trailingNonAscii: m[5], trailingAscii: m[6] } } function isFlankedByWhitespace (side, node, options) { var sibling; var regExp; var isFlanked; if (side === 'left') { sibling = node.previousSibling; regExp = / $/; } else { sibling = node.nextSibling; regExp = /^ /; } if (sibling) { if (sibling.nodeType === 3) { isFlanked = regExp.test(sibling.nodeValue); } else if (options.preformattedCode && sibling.nodeName === 'CODE') { isFlanked = false; } else if (sibling.nodeType === 1 && !isBlock(sibling)) { isFlanked = regExp.test(sibling.textContent); } } return isFlanked } var reduce = Array.prototype.reduce; var escapes = [ [/\\/g, '\\\\'], [/\*/g, '\\*'], [/^-/g, '\\-'], [/^\+ /g, '\\+ '], [/^(=+)/g, '\\$1'], [/^(#{1,6}) /g, '\\$1 '], [/`/g, '\\`'], [/^~~~/g, '\\~~~'], [/\[/g, '\\['], [/\]/g, '\\]'], [/^>/g, '\\>'], [/_/g, '\\_'], [/^(\d+)\. /g, '$1\\. '] ]; function TurndownService (options) { if (!(this instanceof TurndownService)) return new TurndownService(options) var defaults = { rules: rules, headingStyle: 'setext', hr: '* * *', bulletListMarker: '*', codeBlockStyle: 'indented', fence: '```', emDelimiter: '_', strongDelimiter: '**', linkStyle: 'inlined', linkReferenceStyle: 'full', br: ' ', preformattedCode: false, blankReplacement: function (content, node) { return node.isBlock ? '\n\n' : '' }, keepReplacement: function (content, node) { return node.isBlock ? '\n\n' + node.outerHTML + '\n\n' : node.outerHTML }, defaultReplacement: function (content, node) { return node.isBlock ? '\n\n' + content + '\n\n' : content } }; this.options = extend({}, defaults, options); this.rules = new Rules(this.options); } TurndownService.prototype = { /** * The entry point for converting a string or DOM node to Markdown * @public * @param {String|HTMLElement} input The string or DOM node to convert * @returns A Markdown representation of the input * @type String */ turndown: function (input) { if (!canConvert(input)) { throw new TypeError( input + ' is not a string, or an element/document/fragment node.' ) } if (input === '') return '' var output = process.call(this, new RootNode(input, this.options)); return postProcess.call(this, output) }, /** * Add one or more plugins * @public * @param {Function|Array} plugin The plugin or array of plugins to add * @returns The Turndown instance for chaining * @type Object */ use: function (plugin) { if (Array.isArray(plugin)) { for (var i = 0; i < plugin.length; i++) this.use(plugin[i]); } else if (typeof plugin === 'function') { plugin(this); } else { throw new TypeError('plugin must be a Function or an Array of Functions') } return this }, /** * Adds a rule * @public * @param {String} key The unique key of the rule * @param {Object} rule The rule * @returns The Turndown instance for chaining * @type Object */ addRule: function (key, rule) { this.rules.add(key, rule); return this }, /** * Keep a node (as HTML) that matches the filter * @public * @param {String|Array|Function} filter The unique key of the rule * @returns The Turndown instance for chaining * @type Object */ keep: function (filter) { this.rules.keep(filter); return this }, /** * Remove a node that matches the filter * @public * @param {String|Array|Function} filter The unique key of the rule * @returns The Turndown instance for chaining * @type Object */ remove: function (filter) { this.rules.remove(filter); return this }, /** * Escapes Markdown syntax * @public * @param {String} string The string to escape * @returns A string with Markdown syntax escaped * @type String */ escape: function (string) { return escapes.reduce(function (accumulator, escape) { return accumulator.replace(escape[0], escape[1]) }, string) } }; /** * Reduces a DOM node down to its Markdown string equivalent * @private * @param {HTMLElement} parentNode The node to convert * @returns A Markdown representation of the node * @type String */ function process (parentNode) { var self = this; return reduce.call(parentNode.childNodes, function (output, node) { node = new Node(node, self.options); var replacement = ''; if (node.nodeType === 3) { replacement = node.isCode ? node.nodeValue : self.escape(node.nodeValue); } else if (node.nodeType === 1) { replacement = replacementForNode.call(self, node); } return join(output, replacement) }, '') } /** * Appends strings as each rule requires and trims the output * @private * @param {String} output The conversion output * @returns A trimmed version of the ouput * @type String */ function postProcess (output) { var self = this; this.rules.forEach(function (rule) { if (typeof rule.append === 'function') { output = join(output, rule.append(self.options)); } }); return output.replace(/^[\t\r\n]+/, '').replace(/[\t\r\n\s]+$/, '') } /** * Converts an element node to its Markdown equivalent * @private * @param {HTMLElement} node The node to convert * @returns A Markdown representation of the node * @type String */ function replacementForNode (node) { var rule = this.rules.forNode(node); var content = process.call(this, node); var whitespace = node.flankingWhitespace; if (whitespace.leading || whitespace.trailing) content = content.trim(); return ( whitespace.leading + rule.replacement(content, node, this.options) + whitespace.trailing ) } /** * Joins replacement to the current output with appropriate number of new lines * @private * @param {String} output The current conversion output * @param {String} replacement The string to append to the output * @returns Joined output * @type String */ function join (output, replacement) { var s1 = trimTrailingNewlines(output); var s2 = trimLeadingNewlines(replacement); var nls = Math.max(output.length - s1.length, replacement.length - s2.length); var separator = '\n\n'.substring(0, nls); return s1 + separator + s2 } /** * Determines whether an input can be converted * @private * @param {String|HTMLElement} input Describe this parameter * @returns Describe what it returns * @type String|Object|Array|Boolean|Number */ function canConvert (input) { return ( input != null && ( typeof input === 'string' || (input.nodeType && ( input.nodeType === 1 || input.nodeType === 9 || input.nodeType === 11 )) ) ) } module.exports = TurndownService;