# -*- coding: utf-8 -*- """Markdown filters with mistune Used from markdown.py """ # Copyright (c) IPython Development Team. # Distributed under the terms of the Modified BSD License. from __future__ import print_function import base64 import os import mimetypes import re from functools import partial try: from html import escape html_escape = partial(escape, quote=False) except ImportError: # Python 2 from cgi import escape as html_escape import bs4 import mistune from pygments import highlight from pygments.lexers import get_lexer_by_name from pygments.formatters import HtmlFormatter from pygments.util import ClassNotFound from nbconvert.filters.strings import add_anchor class InvalidNotebook(Exception): pass class MathBlockGrammar(mistune.BlockGrammar): """This defines a single regex comprised of the different patterns that identify math content spanning multiple lines. These are used by the MathBlockLexer. """ multi_math_str = "|".join([r"^\$\$.*?\$\$", r"^\\\\\[.*?\\\\\]", r"^\\begin\{([a-z]*\*?)\}(.*?)\\end\{\1\}"]) multiline_math = re.compile(multi_math_str, re.DOTALL) class MathBlockLexer(mistune.BlockLexer): """ This acts as a pass-through to the MathInlineLexer. It is needed in order to avoid other block level rules splitting math sections apart. """ default_rules = (['multiline_math'] + mistune.BlockLexer.default_rules) def __init__(self, rules=None, **kwargs): if rules is None: rules = MathBlockGrammar() super().__init__(rules, **kwargs) def parse_multiline_math(self, m): """Add token to pass through mutiline math.""" self.tokens.append({ "type": "multiline_math", "text": m.group(0) }) class MathInlineGrammar(mistune.InlineGrammar): """This defines different ways of declaring math objects that should be passed through to mathjax unaffected. These are used by the MathInlineLexer. """ inline_math = re.compile(r"^\$(.+?)\$|^\\\\\((.+?)\\\\\)", re.DOTALL) block_math = re.compile(r"^\$\$(.*?)\$\$|^\\\\\[(.*?)\\\\\]", re.DOTALL) latex_environment = re.compile(r"^\\begin\{([a-z]*\*?)\}(.*?)\\end\{\1\}", re.DOTALL) text = re.compile(r'^[\s\S]+?(?=[\\%s\n' % \ mistune.escape(code) formatter = HtmlFormatter() return highlight(code, lexer, formatter) def block_html(self, html): embed_images = self.options.get('embed_images', False) if embed_images: html = self._html_embed_images(html) return super().block_html(html) def inline_html(self, html): embed_images = self.options.get('embed_images', False) if embed_images: html = self._html_embed_images(html) return super().inline_html(html) def header(self, text, level, raw=None): html = super().header(text, level, raw=raw) if self.options.get("exclude_anchor_links"): return html anchor_link_text = self.options.get('anchor_link_text', u'ΒΆ') return add_anchor(html, anchor_link_text=anchor_link_text) def escape_html(self, text): return html_escape(text) def block_math(self, text): return '$$%s$$' % self.escape_html(text) def latex_environment(self, name, text): name = self.escape_html(name) text = self.escape_html(text) return r'\begin{%s}%s\end{%s}' % (name, text, name) def inline_math(self, text): return '$%s$' % self.escape_html(text) def image(self, src, title, text): """Rendering a image with title and text. :param src: source link of the image. :param title: title text of the image. :param text: alt text of the image. """ attachments = self.options.get('attachments', {}) attachment_prefix = 'attachment:' embed_images = self.options.get('embed_images', False) if src.startswith(attachment_prefix): name = src[len(attachment_prefix):] if name not in attachments: raise InvalidNotebook("missing attachment: {}".format(name)) attachment = attachments[name] # we choose vector over raster, and lossless over lossy preferred_mime_types = ['image/svg+xml', 'image/png', 'image/jpeg'] for preferred_mime_type in preferred_mime_types: if preferred_mime_type in attachment: break else: # otherwise we choose the first mimetype we can find preferred_mime_type = list(attachment.keys())[0] mime_type = preferred_mime_type data = attachment[mime_type] src = 'data:' + mime_type + ';base64,' + data elif embed_images: base64_url = self._src_to_base64(src) if base64_url is not None: src = base64_url return super().image(src, title, text) def _src_to_base64(self, src): """Turn the source file into a base64 url. :param src: source link of the file. :return: the base64 url or None if the file was not found. """ path = self.options.get('path', '') src_path = os.path.join(path, src) if not os.path.exists(src_path): return None with open(src_path, 'rb') as fobj: mime_type = mimetypes.guess_type(src_path)[0] base64_data = base64.b64encode(fobj.read()) base64_data = base64_data.replace(b'\n', b'').decode('ascii') return 'data:{};base64,{}'.format(mime_type, base64_data) def _html_embed_images(self, html): parsed_html = bs4.BeautifulSoup(html, features="html.parser") imgs = parsed_html.find_all('img') # Replace img tags's sources by base64 dataurls for img in imgs: if 'src' not in img.attrs: continue base64_url = self._src_to_base64(img.attrs['src']) if base64_url is not None: img.attrs['src'] = base64_url return str(parsed_html) def markdown2html_mistune(source): """Convert a markdown string to HTML using mistune""" return MarkdownWithMath(renderer=IPythonRenderer( escape=False)).render(source)