#!/usr/bin/env python # -*- coding: utf-8 -*- # :Author: David Goodger, Günter Milde # Based on the html4css1 writer by David Goodger. # :Maintainer: docutils-develop@lists.sourceforge.net # :Revision: $Revision: 8644 $ # :Date: $Date: 2005-06-28$ # :Copyright: © 2016 David Goodger, Günter Milde # :License: Released under the terms of the `2-Clause BSD license`_, in short: # # Copying and distribution of this file, with or without modification, # are permitted in any medium without royalty provided the copyright # notice and this notice are preserved. # This file is offered as-is, without any warranty. # # .. _2-Clause BSD license: https://opensource.org/licenses/BSD-2-Clause """common definitions for Docutils HTML writers""" import base64 import mimetypes import os, os.path import re import sys try: # check for the Python Imaging Library import PIL.Image except ImportError: try: # sometimes PIL modules are put in PYTHONPATH's root import Image class PIL(object): pass # dummy wrapper PIL.Image = Image except ImportError: PIL = None import docutils from docutils import nodes, utils, writers, languages, io from docutils.utils.error_reporting import SafeString from docutils.transforms import writer_aux from docutils.utils.math import (unichar2tex, pick_math_environment, math2html, latex2mathml, tex2mathml_extern) if sys.version_info >= (3, 0): from urllib.request import url2pathname else: from urllib import url2pathname if sys.version_info >= (3, 0): unicode = str # noqa class Writer(writers.Writer): supported = ('html', 'xhtml') # update in subclass """Formats this writer supports.""" # default_stylesheets = [] # set in subclass! # default_stylesheet_dirs = ['.'] # set in subclass! default_template = 'template.txt' # default_template_path = ... # set in subclass! # settings_spec = ... # set in subclass! settings_defaults = {'output_encoding_error_handler': 'xmlcharrefreplace'} # config_section = ... # set in subclass! config_section_dependencies = ('writers', 'html writers') visitor_attributes = ( 'head_prefix', 'head', 'stylesheet', 'body_prefix', 'body_pre_docinfo', 'docinfo', 'body', 'body_suffix', 'title', 'subtitle', 'header', 'footer', 'meta', 'fragment', 'html_prolog', 'html_head', 'html_title', 'html_subtitle', 'html_body') def get_transforms(self): return writers.Writer.get_transforms(self) + [writer_aux.Admonitions] def translate(self): self.visitor = visitor = self.translator_class(self.document) self.document.walkabout(visitor) for attr in self.visitor_attributes: setattr(self, attr, getattr(visitor, attr)) self.output = self.apply_template() def apply_template(self): template_file = open(self.document.settings.template, 'rb') template = unicode(template_file.read(), 'utf-8') template_file.close() subs = self.interpolation_dict() return template % subs def interpolation_dict(self): subs = {} settings = self.document.settings for attr in self.visitor_attributes: subs[attr] = ''.join(getattr(self, attr)).rstrip('\n') subs['encoding'] = settings.output_encoding subs['version'] = docutils.__version__ return subs def assemble_parts(self): writers.Writer.assemble_parts(self) for part in self.visitor_attributes: self.parts[part] = ''.join(getattr(self, part)) class HTMLTranslator(nodes.NodeVisitor): """ Generic Docutils to HTML translator. See the `html4css1` and `html5_polyglot` writers for full featured HTML writers. .. IMPORTANT:: The `visit_*` and `depart_*` methods use a heterogeneous stack, `self.context`. When subclassing, make sure to be consistent in its use! Examples for robust coding: a) Override both `visit_*` and `depart_*` methods, don't call the parent functions. b) Extend both and unconditionally call the parent functions:: def visit_example(self, node): if foo: self.body.append('
') html4css1.HTMLTranslator.visit_example(self, node) def depart_example(self, node): html4css1.HTMLTranslator.depart_example(self, node) if foo: self.body.append('
') c) Extend both, calling the parent functions under the same conditions:: def visit_example(self, node): if foo: self.body.append('
\n') else: # call the parent method _html_base.HTMLTranslator.visit_example(self, node) def depart_example(self, node): if foo: self.body.append('
\n') else: # call the parent method _html_base.HTMLTranslator.depart_example(self, node) d) Extend one method (call the parent), but don't otherwise use the `self.context` stack:: def depart_example(self, node): _html_base.HTMLTranslator.depart_example(self, node) if foo: # implementation-specific code # that does not use `self.context` self.body.append('\n') This way, changes in stack use will not bite you. """ xml_declaration = '\n' doctype = '\n' doctype_mathml = doctype head_prefix_template = ('\n\n') content_type = '\n' generator = ('\n') # Template for the MathJax script in the header: mathjax_script = '\n' mathjax_url = 'file:/usr/share/javascript/mathjax/MathJax.js' """ URL of the MathJax javascript library. The MathJax library ought to be installed on the same server as the rest of the deployed site files and specified in the `math-output` setting appended to "mathjax". See `Docutils Configuration`__. __ http://docutils.sourceforge.net/docs/user/config.html#math-output The fallback tries a local MathJax installation at ``/usr/share/javascript/mathjax/MathJax.js``. """ stylesheet_link = '\n' embedded_stylesheet = '\n' words_and_spaces = re.compile(r'[^ \n]+| +|\n') # wrap point inside word: in_word_wrap_point = re.compile(r'.+\W\W.+|[-?].+', re.U) lang_attribute = 'lang' # name changes to 'xml:lang' in XHTML 1.1 special_characters = {ord('&'): u'&', ord('<'): u'<', ord('"'): u'"', ord('>'): u'>', ord('@'): u'@', # may thwart address harvesters } """Character references for characters with a special meaning in HTML.""" def __init__(self, document): nodes.NodeVisitor.__init__(self, document) self.settings = settings = document.settings lcode = settings.language_code self.language = languages.get_language(lcode, document.reporter) self.meta = [self.generator % docutils.__version__] self.head_prefix = [] self.html_prolog = [] if settings.xml_declaration: self.head_prefix.append(self.xml_declaration % settings.output_encoding) # self.content_type = "" # encoding not interpolated: self.html_prolog.append(self.xml_declaration) self.head = self.meta[:] self.stylesheet = [self.stylesheet_call(path) for path in utils.get_stylesheet_list(settings)] self.body_prefix = ['\n\n'] # document title, subtitle display self.body_pre_docinfo = [] # author, date, etc. self.docinfo = [] self.body = [] self.fragment = [] self.body_suffix = ['\n\n'] self.section_level = 0 self.initial_header_level = int(settings.initial_header_level) self.math_output = settings.math_output.split() self.math_output_options = self.math_output[1:] self.math_output = self.math_output[0].lower() self.context = [] """Heterogeneous stack. Used by visit_* and depart_* functions in conjunction with the tree traversal. Make sure that the pops correspond to the pushes.""" self.topic_classes = [] self.colspecs = [] self.compact_p = True self.compact_simple = False self.compact_field_list = False self.in_docinfo = False self.in_sidebar = False self.in_footnote_list = False self.title = [] self.subtitle = [] self.header = [] self.footer = [] self.html_head = [self.content_type] # charset not interpolated self.html_title = [] self.html_subtitle = [] self.html_body = [] self.in_document_title = 0 # len(self.body) or 0 self.in_mailto = False self.author_in_authors = False # for html4css1 self.math_header = [] def astext(self): return ''.join(self.head_prefix + self.head + self.stylesheet + self.body_prefix + self.body_pre_docinfo + self.docinfo + self.body + self.body_suffix) def encode(self, text): """Encode special characters in `text` & return.""" # Use only named entities known in both XML and HTML # other characters are automatically encoded "by number" if required. # @@@ A codec to do these and all other HTML entities would be nice. text = unicode(text) return text.translate(self.special_characters) def cloak_mailto(self, uri): """Try to hide a mailto: URL from harvesters.""" # Encode "@" using a URL octet reference (see RFC 1738). # Further cloaking with HTML entities will be done in the # `attval` function. return uri.replace('@', '%40') def cloak_email(self, addr): """Try to hide the link text of a email link from harversters.""" # Surround at-signs and periods with tags. ("@" has # already been encoded to "@" by the `encode` method.) addr = addr.replace('@', '@') addr = addr.replace('.', '.') return addr def attval(self, text, whitespace=re.compile('[\n\r\t\v\f]')): """Cleanse, HTML encode, and return attribute value text.""" encoded = self.encode(whitespace.sub(' ', text)) if self.in_mailto and self.settings.cloak_email_addresses: # Cloak at-signs ("%40") and periods with HTML entities. encoded = encoded.replace('%40', '%40') encoded = encoded.replace('.', '.') return encoded def stylesheet_call(self, path): """Return code to reference or embed stylesheet file `path`""" if self.settings.embed_stylesheet: try: content = io.FileInput(source_path=path, encoding='utf-8').read() self.settings.record_dependencies.add(path) except IOError as err: msg = u"Cannot embed stylesheet '%r': %s." % ( path, SafeString(err.strerror)) self.document.reporter.error(msg) return '<--- %s --->\n' % msg return self.embedded_stylesheet % content # else link to style file: if self.settings.stylesheet_path: # adapt path relative to output (cf. config.html#stylesheet-path) path = utils.relative_path(self.settings._destination, path) return self.stylesheet_link % self.encode(path) def starttag(self, node, tagname, suffix='\n', empty=False, **attributes): """ Construct and return a start tag given a node (id & class attributes are extracted), tag name, and optional attributes. """ tagname = tagname.lower() prefix = [] atts = {} ids = [] for (name, value) in attributes.items(): atts[name.lower()] = value classes = [] languages = [] # unify class arguments and move language specification for cls in node.get('classes', []) + atts.pop('class', '').split(): if cls.startswith('language-'): languages.append(cls[9:]) elif cls.strip() and cls not in classes: classes.append(cls) if languages: # attribute name is 'lang' in XHTML 1.0 but 'xml:lang' in 1.1 atts[self.lang_attribute] = languages[0] if classes: atts['class'] = ' '.join(classes) assert 'id' not in atts ids.extend(node.get('ids', [])) if 'ids' in atts: ids.extend(atts['ids']) del atts['ids'] if ids: atts['id'] = ids[0] for id in ids[1:]: # Add empty "span" elements for additional IDs. Note # that we cannot use empty "a" elements because there # may be targets inside of references, but nested "a" # elements aren't allowed in XHTML (even if they do # not all have a "href" attribute). if empty or isinstance(node, (nodes.bullet_list, nodes.docinfo, nodes.definition_list, nodes.enumerated_list, nodes.field_list, nodes.option_list, nodes.table)): # Insert target right in front of element. prefix.append('' % id) else: # Non-empty tag. Place the auxiliary tag # *inside* the element, as the first child. suffix += '' % id attlist = sorted(atts.items()) parts = [tagname] for name, value in attlist: # value=None was used for boolean attributes without # value, but this isn't supported by XHTML. assert value is not None if isinstance(value, list): values = [unicode(v) for v in value] parts.append('%s="%s"' % (name.lower(), self.attval(' '.join(values)))) else: parts.append('%s="%s"' % (name.lower(), self.attval(unicode(value)))) if empty: infix = ' /' else: infix = '' return ''.join(prefix) + '<%s%s>' % (' '.join(parts), infix) + suffix def emptytag(self, node, tagname, suffix='\n', **attributes): """Construct and return an XML-compatible empty tag.""" return self.starttag(node, tagname, suffix, empty=True, **attributes) def set_class_on_child(self, node, class_, index=0): """ Set class `class_` on the visible child no. index of `node`. Do nothing if node has fewer children than `index`. """ children = [n for n in node if not isinstance(n, nodes.Invisible)] try: child = children[index] except IndexError: return child['classes'].append(class_) def visit_Text(self, node): text = node.astext() encoded = self.encode(text) if self.in_mailto and self.settings.cloak_email_addresses: encoded = self.cloak_email(encoded) self.body.append(encoded) def depart_Text(self, node): pass def visit_abbreviation(self, node): # @@@ implementation incomplete ("title" attribute) self.body.append(self.starttag(node, 'abbr', '')) def depart_abbreviation(self, node): self.body.append('') def visit_acronym(self, node): # @@@ implementation incomplete ("title" attribute) self.body.append(self.starttag(node, 'acronym', '')) def depart_acronym(self, node): self.body.append('') def visit_address(self, node): self.visit_docinfo_item(node, 'address', meta=False) self.body.append(self.starttag(node, 'pre', suffix= '', CLASS='address')) def depart_address(self, node): self.body.append('\n\n') self.depart_docinfo_item() def visit_admonition(self, node): node['classes'].insert(0, 'admonition') self.body.append(self.starttag(node, 'div')) def depart_admonition(self, node=None): self.body.append('\n') attribution_formats = {'dash': (u'\u2014', ''), 'parentheses': ('(', ')'), 'parens': ('(', ')'), 'none': ('', '')} def visit_attribution(self, node): prefix, suffix = self.attribution_formats[self.settings.attribution] self.context.append(suffix) self.body.append( self.starttag(node, 'p', prefix, CLASS='attribution')) def depart_attribution(self, node): self.body.append(self.context.pop() + '

\n') def visit_author(self, node): if not(isinstance(node.parent, nodes.authors)): self.visit_docinfo_item(node, 'author') self.body.append('

') def depart_author(self, node): self.body.append('

') if isinstance(node.parent, nodes.authors): self.body.append('\n') else: self.depart_docinfo_item() def visit_authors(self, node): self.visit_docinfo_item(node, 'authors') def depart_authors(self, node): self.depart_docinfo_item() def visit_block_quote(self, node): self.body.append(self.starttag(node, 'blockquote')) def depart_block_quote(self, node): self.body.append('\n') def check_simple_list(self, node): """Check for a simple list that can be rendered compactly.""" visitor = SimpleListChecker(self.document) try: node.walk(visitor) except nodes.NodeFound: return False else: return True # Compact lists # ------------ # Include definition lists and field lists (in addition to ordered # and unordered lists) in the test if a list is "simple" (cf. the # html4css1.HTMLTranslator docstring and the SimpleListChecker class at # the end of this file). def is_compactable(self, node): # explicite class arguments have precedence if 'compact' in node['classes']: return True if 'open' in node['classes']: return False # check config setting: if (isinstance(node, (nodes.field_list, nodes.definition_list)) and not self.settings.compact_field_lists): return False if (isinstance(node, (nodes.enumerated_list, nodes.bullet_list)) and not self.settings.compact_lists): return False # Table of Contents: if (self.topic_classes == ['contents']): # TODO: look in parent nodes, remove self.topic_classes? return True # check the list items: return self.check_simple_list(node) def visit_bullet_list(self, node): atts = {} old_compact_simple = self.compact_simple self.context.append((self.compact_simple, self.compact_p)) self.compact_p = None self.compact_simple = self.is_compactable(node) if self.compact_simple and not old_compact_simple: atts['class'] = 'simple' self.body.append(self.starttag(node, 'ul', **atts)) def depart_bullet_list(self, node): self.compact_simple, self.compact_p = self.context.pop() self.body.append('\n') def visit_caption(self, node): self.body.append(self.starttag(node, 'p', '', CLASS='caption')) def depart_caption(self, node): self.body.append('

\n') def visit_citation(self, node): # Use definition list for bibliographic references. # Join adjacent citation entries. # TODO: use