1 files changed, 364 insertions, 0 deletions
diff --git a/markdown/extensions/md_in_html.py b/markdown/extensions/md_in_html.py
new file mode 100644
index 0000000..ec7dcba
--- /dev/null
+++ b/markdown/extensions/md_in_html.py
@@ -0,0 +1,364 @@
+"""
+Python-Markdown Markdown in HTML Extension
+===============================
+
+An implementation of [PHP Markdown Extra](http://michelf.com/projects/php-markdown/extra/)'s
+parsing of Markdown syntax in raw HTML.
+
+See <https://Python-Markdown.github.io/extensions/raw_html>
+for documentation.
+
+Copyright The Python Markdown Project
+
+License: [BSD](https://opensource.org/licenses/bsd-license.php)
+
+"""
+
+from . import Extension
+from ..blockprocessors import BlockProcessor
+from ..preprocessors import Preprocessor
+from ..postprocessors import RawHtmlPostprocessor
+from .. import util
+from ..htmlparser import HTMLExtractor, blank_line_re
+import xml.etree.ElementTree as etree
+
+
+class HTMLExtractorExtra(HTMLExtractor):
+    """
+    Override HTMLExtractor and create etree Elements for any elements which should have content parsed as Markdown.
+    """
+
+    def __init__(self, md, *args, **kwargs):
+        # All block-level tags.
+        self.block_level_tags = set(md.block_level_elements.copy())
+        # Block-level tags in which the content only gets span level parsing
+        self.span_tags = set(
+            ['address', 'dd', 'dt', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'legend', 'li', 'p', 'summary', 'td', 'th']
+        )
+        # Block-level tags which never get their content parsed.
+        self.raw_tags = set(['canvas', 'math', 'option', 'pre', 'script', 'style', 'textarea'])
+
+        super().__init__(md, *args, **kwargs)
+
+        # Block-level tags in which the content gets parsed as blocks
+        self.block_tags = set(self.block_level_tags) - (self.span_tags | self.raw_tags | self.empty_tags)
+        self.span_and_blocks_tags = self.block_tags | self.span_tags
+
+    def reset(self):
+        """Reset this instance.  Loses all unprocessed data."""
+        self.mdstack = []  # When markdown=1, stack contains a list of tags
+        self.treebuilder = etree.TreeBuilder()
+        self.mdstate = []  # one of 'block', 'span', 'off', or None
+        super().reset()
+
+    def close(self):
+        """Handle any buffered data."""
+        super().close()
+        # Handle any unclosed tags.
+        if self.mdstack:
+            # Close the outermost parent. handle_endtag will close all unclosed children.
+            self.handle_endtag(self.mdstack[0])
+
+    def get_element(self):
+        """ Return element from treebuilder and reset treebuilder for later use. """
+        element = self.treebuilder.close()
+        self.treebuilder = etree.TreeBuilder()
+        return element
+
+    def get_state(self, tag, attrs):
+        """ Return state from tag and `markdown` attr. One of 'block', 'span', or 'off'. """
+        md_attr = attrs.get('markdown', '0')
+        if md_attr == 'markdown':
+            # `<tag markdown>` is the same as `<tag markdown='1'>`.
+            md_attr = '1'
+        parent_state = self.mdstate[-1] if self.mdstate else None
+        if parent_state == 'off' or (parent_state == 'span' and md_attr != '0'):
+            # Only use the parent state if it is more restrictive than the markdown attribute.
+            md_attr = parent_state
+        if ((md_attr == '1' and tag in self.block_tags) or
+                (md_attr == 'block' and tag in self.span_and_blocks_tags)):
+            return 'block'
+        elif ((md_attr == '1' and tag in self.span_tags) or
+              (md_attr == 'span' and tag in self.span_and_blocks_tags)):
+            return 'span'
+        elif tag in self.block_level_tags:
+            return 'off'
+        else:  # pragma: no cover
+            return None
+
+    def handle_starttag(self, tag, attrs):
+        # Handle tags that should always be empty and do not specify a closing tag
+        if tag in self.empty_tags and (self.at_line_start() or self.intail):
+            attrs = {key: value if value is not None else key for key, value in attrs}
+            if "markdown" in attrs:
+                attrs.pop('markdown')
+                element = etree.Element(tag, attrs)
+                data = etree.tostring(element, encoding='unicode', method='html')
+            else:
+                data = self.get_starttag_text()
+            self.handle_empty_tag(data, True)
+            return
+
+        if tag in self.block_level_tags and (self.at_line_start() or self.intail):
+            # Valueless attr (ex: `<tag checked>`) results in `[('checked', None)]`.
+            # Convert to `{'checked': 'checked'}`.
+            attrs = {key: value if value is not None else key for key, value in attrs}
+            state = self.get_state(tag, attrs)
+            if self.inraw or (state in [None, 'off'] and not self.mdstack):
+                # fall back to default behavior
+                attrs.pop('markdown', None)
+                super().handle_starttag(tag, attrs)
+            else:
+                if 'p' in self.mdstack and tag in self.block_level_tags:
+                    # Close unclosed 'p' tag
+                    self.handle_endtag('p')
+                self.mdstate.append(state)
+                self.mdstack.append(tag)
+                attrs['markdown'] = state
+                self.treebuilder.start(tag, attrs)
+        else:
+            # Span level tag
+            if self.inraw:
+                super().handle_starttag(tag, attrs)
+            else:
+                text = self.get_starttag_text()
+                if self.mdstate and self.mdstate[-1] == "off":
+                    self.handle_data(self.md.htmlStash.store(text))
+                else:
+                    self.handle_data(text)
+                if tag in self.CDATA_CONTENT_ELEMENTS:
+                    # This is presumably a standalone tag in a code span (see #1036).
+                    self.clear_cdata_mode()
+
+    def handle_endtag(self, tag):
+        if tag in self.block_level_tags:
+            if self.inraw:
+                super().handle_endtag(tag)
+            elif tag in self.mdstack:
+                # Close element and any unclosed children
+                while self.mdstack:
+                    item = self.mdstack.pop()
+                    self.mdstate.pop()
+                    self.treebuilder.end(item)
+                    if item == tag:
+                        break
+                if not self.mdstack:
+                    # Last item in stack is closed. Stash it
+                    element = self.get_element()
+                    # Get last entry to see if it ends in newlines
+                    # If it is an element, assume there is no newlines
+                    item = self.cleandoc[-1] if self.cleandoc else ''
+                    # If we only have one newline before block element, add another
+                    if not item.endswith('\n\n') and item.endswith('\n'):
+                        self.cleandoc.append('\n')
+                    self.cleandoc.append(self.md.htmlStash.store(element))
+                    self.cleandoc.append('\n\n')
+                    self.state = []
+                    # Check if element has a tail
+                    if not blank_line_re.match(
+                            self.rawdata[self.line_offset + self.offset + len(self.get_endtag_text(tag)):]):
+                        # More content exists after endtag.
+                        self.intail = True
+            else:
+                # Treat orphan closing tag as a span level tag.
+                text = self.get_endtag_text(tag)
+                if self.mdstate and self.mdstate[-1] == "off":
+                    self.handle_data(self.md.htmlStash.store(text))
+                else:
+                    self.handle_data(text)
+        else:
+            # Span level tag
+            if self.inraw:
+                super().handle_endtag(tag)
+            else:
+                text = self.get_endtag_text(tag)
+                if self.mdstate and self.mdstate[-1] == "off":
+                    self.handle_data(self.md.htmlStash.store(text))
+                else:
+                    self.handle_data(text)
+
+    def handle_startendtag(self, tag, attrs):
+        if tag in self.empty_tags:
+            attrs = {key: value if value is not None else key for key, value in attrs}
+            if "markdown" in attrs:
+                attrs.pop('markdown')
+                element = etree.Element(tag, attrs)
+                data = etree.tostring(element, encoding='unicode', method='html')
+            else:
+                data = self.get_starttag_text()
+        else:
+            data = self.get_starttag_text()
+        self.handle_empty_tag(data, is_block=self.md.is_block_level(tag))
+
+    def handle_data(self, data):
+        if self.intail and '\n' in data:
+            self.intail = False
+        if self.inraw or not self.mdstack:
+            super().handle_data(data)
+        else:
+            self.treebuilder.data(data)
+
+    def handle_empty_tag(self, data, is_block):
+        if self.inraw or not self.mdstack:
+            super().handle_empty_tag(data, is_block)
+        else:
+            if self.at_line_start() and is_block:
+                self.handle_data('\n' + self.md.htmlStash.store(data) + '\n\n')
+            else:
+                self.handle_data(self.md.htmlStash.store(data))
+
+    def parse_pi(self, i):
+        if self.at_line_start() or self.intail or self.mdstack:
+            # The same override exists in HTMLExtractor without the check
+            # for mdstack. Therefore, use HTMLExtractor's parent instead.
+            return super(HTMLExtractor, self).parse_pi(i)
+        # This is not the beginning of a raw block so treat as plain data
+        # and avoid consuming any tags which may follow (see #1066).
+        self.handle_data('<?')
+        return i + 2
+
+    def parse_html_declaration(self, i):
+        if self.at_line_start() or self.intail or self.mdstack:
+            # The same override exists in HTMLExtractor without the check
+            # for mdstack. Therefore, use HTMLExtractor's parent instead.
+            return super(HTMLExtractor, self).parse_html_declaration(i)
+        # This is not the beginning of a raw block so treat as plain data
+        # and avoid consuming any tags which may follow (see #1066).
+        self.handle_data('<!')
+        return i + 2
+
+
+class HtmlBlockPreprocessor(Preprocessor):
+    """Remove html blocks from the text and store them for later retrieval."""
+
+    def run(self, lines):
+        source = '\n'.join(lines)
+        parser = HTMLExtractorExtra(self.md)
+        parser.feed(source)
+        parser.close()
+        return ''.join(parser.cleandoc).split('\n')
+
+
+class MarkdownInHtmlProcessor(BlockProcessor):
+    """Process Markdown Inside HTML Blocks which have been stored in the HtmlStash."""
+
+    def test(self, parent, block):
+        # ALways return True. `run` will return `False` it not a valid match.
+        return True
+
+    def parse_element_content(self, element):
+        """
+        Recursively parse the text content of an etree Element as Markdown.
+
+        Any block level elements generated from the Markdown will be inserted as children of the element in place
+        of the text content. All `markdown` attributes are removed. For any elements in which Markdown parsing has
+        been disabled, the text content of it and its chidlren are wrapped in an `AtomicString`.
+        """
+
+        md_attr = element.attrib.pop('markdown', 'off')
+
+        if md_attr == 'block':
+            # Parse content as block level
+            # The order in which the different parts are parsed (text, children, tails) is important here as the
+            # order of elements needs to be preserved. We can't be inserting items at a later point in the current
+            # iteration as we don't want to do raw processing on elements created from parsing Markdown text (for
+            # example). Therefore, the order of operations is children, tails, text.
+
+            # Recursively parse existing children from raw HTML
+            for child in list(element):
+                self.parse_element_content(child)
+
+            # Parse Markdown text in tail of children. Do this separate to avoid raw HTML parsing.
+            # Save the position of each item to be inserted later in reverse.
+            tails = []
+            for pos, child in enumerate(element):
+                if child.tail:
+                    block = child.tail.rstrip('\n')
+                    child.tail = ''
+                    # Use a dummy placeholder element.
+                    dummy = etree.Element('div')
+                    self.parser.parseBlocks(dummy, block.split('\n\n'))
+                    children = list(dummy)
+                    children.reverse()
+                    tails.append((pos + 1, children))
+
+            # Insert the elements created from the tails in reverse.
+            tails.reverse()
+            for pos, tail in tails:
+                for item in tail:
+                    element.insert(pos, item)
+
+            # Parse Markdown text content. Do this last to avoid raw HTML parsing.
+            if element.text:
+                block = element.text.rstrip('\n')
+                element.text = ''
+                # Use a dummy placeholder element as the content needs to get inserted before existing children.
+                dummy = etree.Element('div')
+                self.parser.parseBlocks(dummy, block.split('\n\n'))
+                children = list(dummy)
+                children.reverse()
+                for child in children:
+                    element.insert(0, child)
+
+        elif md_attr == 'span':
+            # Span level parsing will be handled by inlineprocessors.
+            # Walk children here to remove any `markdown` attributes.
+            for child in list(element):
+                self.parse_element_content(child)
+
+        else:
+            # Disable inline parsing for everything else
+            if element.text is None:
+                element.text = ''
+            element.text = util.AtomicString(element.text)
+            for child in list(element):
+                self.parse_element_content(child)
+                if child.tail:
+                    child.tail = util.AtomicString(child.tail)
+
+    def run(self, parent, blocks):
+        m = util.HTML_PLACEHOLDER_RE.match(blocks[0])
+        if m:
+            index = int(m.group(1))
+            element = self.parser.md.htmlStash.rawHtmlBlocks[index]
+            if isinstance(element, etree.Element):
+                # We have a matched element. Process it.
+                blocks.pop(0)
+                self.parse_element_content(element)
+                parent.append(element)
+                # Cleanup stash. Replace element with empty string to avoid confusing postprocessor.
+                self.parser.md.htmlStash.rawHtmlBlocks.pop(index)
+                self.parser.md.htmlStash.rawHtmlBlocks.insert(index, '')
+                # Confirm the match to the blockparser.
+                return True
+        # No match found.
+        return False
+
+
+class MarkdownInHTMLPostprocessor(RawHtmlPostprocessor):
+    def stash_to_string(self, text):
+        """ Override default to handle any etree elements still in the stash. """
+        if isinstance(text, etree.Element):
+            return self.md.serializer(text)
+        else:
+            return str(text)
+
+
+class MarkdownInHtmlExtension(Extension):
+    """Add Markdown parsing in HTML to Markdown class."""
+
+    def extendMarkdown(self, md):
+        """ Register extension instances. """
+
+        # Replace raw HTML preprocessor
+        md.preprocessors.register(HtmlBlockPreprocessor(md), 'html_block', 20)
+        # Add blockprocessor which handles the placeholders for etree elements
+        md.parser.blockprocessors.register(
+            MarkdownInHtmlProcessor(md.parser), 'markdown_block', 105
+        )
+        # Replace raw HTML postprocessor
+        md.postprocessors.register(MarkdownInHTMLPostprocessor(md), 'raw_html', 30)
+
+
+def makeExtension(**kwargs):  # pragma: no cover
+    return MarkdownInHtmlExtension(**kwargs)