1 files changed, 45 insertions, 178 deletions
diff --git a/markdown/preprocessors.py b/markdown/preprocessors.py
index ef04cab..e1023c5 100644
--- a/markdown/preprocessors.py
+++ b/markdown/preprocessors.py
@@ -1,24 +1,44 @@
-
 """
+Python Markdown
+
+A Python implementation of John Gruber's Markdown.
+
+Documentation: https://python-markdown.github.io/
+GitHub: https://github.com/Python-Markdown/markdown/
+PyPI: https://pypi.org/project/Markdown/
+
+Started by Manfred Stienstra (http://www.dwerg.net/).
+Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org).
+Currently maintained by Waylan Limberg (https://github.com/waylan),
+Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser).
+
+Copyright 2007-2018 The Python Markdown Project (v. 1.7 and later)
+Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
+Copyright 2004 Manfred Stienstra (the original version)
+
+License: BSD (see LICENSE.md for details).
+
 PRE-PROCESSORS
 =============================================================================
 
 Preprocessors work on source text before we start doing anything too
-complicated. 
+complicated.
 """
 
+from . import util
+from .htmlparser import HTMLExtractor
 import re
-import markdown
 
-HTML_PLACEHOLDER_PREFIX = markdown.STX+"wzxhzdk:"
-HTML_PLACEHOLDER = HTML_PLACEHOLDER_PREFIX + "%d" + markdown.ETX
 
-class Processor:
-    def __init__(self, markdown_instance=None):
-        if markdown_instance:
-            self.markdown = markdown_instance
+def build_preprocessors(md, **kwargs):
+    """ Build the default set of preprocessors used by Markdown. """
+    preprocessors = util.Registry()
+    preprocessors.register(NormalizeWhitespace(md), 'normalize_whitespace', 30)
+    preprocessors.register(HtmlBlockPreprocessor(md), 'html_block', 20)
+    return preprocessors
+
 
-class Preprocessor (Processor):
+class Preprocessor(util.Processor):
     """
     Preprocessors are run after the text is broken into lines.
 
@@ -36,180 +56,27 @@ class Preprocessor (Processor):
         the (possibly modified) list of lines.
 
         """
-        pass
+        pass  # pragma: no cover
 
-class HtmlStash:
-    """
-    This class is used for stashing HTML objects that we extract
-    in the beginning and replace with place-holders.
-    """
 
-    def __init__ (self):
-        """ Create a HtmlStash. """
-        self.html_counter = 0 # for counting inline html segments
-        self.rawHtmlBlocks=[]
+class NormalizeWhitespace(Preprocessor):
+    """ Normalize whitespace for consistent parsing. """
 
-    def store(self, html, safe=False):
-        """
-        Saves an HTML segment for later reinsertion.  Returns a
-        placeholder string that needs to be inserted into the
-        document.
-
-        Keyword arguments:
-
-        * html: an html segment
-        * safe: label an html segment as safe for safemode
-
-        Returns : a placeholder string
-
-        """
-        self.rawHtmlBlocks.append((html, safe))
-        placeholder = HTML_PLACEHOLDER % self.html_counter
-        self.html_counter += 1
-        return placeholder
-
-    def reset(self):
-        self.html_counter = 0
-        self.rawHtmlBlocks = []
+    def run(self, lines):
+        source = '\n'.join(lines)
+        source = source.replace(util.STX, "").replace(util.ETX, "")
+        source = source.replace("\r\n", "\n").replace("\r", "\n") + "\n\n"
+        source = source.expandtabs(self.md.tab_length)
+        source = re.sub(r'(?<=\n) +\n', '\n', source)
+        return source.split('\n')
 
 
 class HtmlBlockPreprocessor(Preprocessor):
     """Remove html blocks from the text and store them for later retrieval."""
 
-    right_tag_patterns = ["</%s>", "%s>"]
-
-    def _get_left_tag(self, block):
-        return block[1:].replace(">", " ", 1).split()[0].lower()
-
-    def _get_right_tag(self, left_tag, block):
-        for p in self.right_tag_patterns:
-            tag = p % left_tag
-            i = block.rfind(tag)
-            if i > 2:
-                return tag.lstrip("<").rstrip(">"), i + len(p)-2 + len(left_tag)
-        return block.rstrip()[-len(left_tag)-2:-1].lower(), len(block)
-
-    def _equal_tags(self, left_tag, right_tag):
-        if left_tag == 'div' or left_tag[0] in ['?', '@', '%']: # handle PHP, etc.
-            return True
-        if ("/" + left_tag) == right_tag:
-            return True
-        if (right_tag == "--" and left_tag == "--"):
-            return True
-        elif left_tag == right_tag[1:] \
-            and right_tag[0] != "<":
-            return True
-        else:
-            return False
-
-    def _is_oneliner(self, tag):
-        return (tag in ['hr', 'hr/'])
-
     def run(self, lines):
-        text = "\n".join(lines)
-        new_blocks = []
-        text = text.split("\n\n")
-        items = []
-        left_tag = ''
-        right_tag = ''
-        in_tag = False # flag
-
-        while text:
-            block = text[0]
-            if block.startswith("\n"):
-                block = block[1:]
-            text = text[1:]
-
-            if block.startswith("\n"):
-                block = block[1:]
-
-            if not in_tag:
-                if block.startswith("<"):
-                    left_tag = self._get_left_tag(block)
-                    right_tag, data_index = self._get_right_tag(left_tag, block)
-
-                    if block[1] == "!":
-                        # is a comment block
-                        left_tag = "--"
-                        right_tag, data_index = self._get_right_tag(left_tag, block)
-                        # keep checking conditions below and maybe just append
-                    
-                    if data_index < len(block) \
-                        and markdown.isBlockLevel(left_tag): 
-                        text.insert(0, block[data_index:])
-                        block = block[:data_index]
-
-                    if not (markdown.isBlockLevel(left_tag) \
-                        or block[1] in ["!", "?", "@", "%"]):
-                        new_blocks.append(block)
-                        continue
-
-                    if self._is_oneliner(left_tag):
-                        new_blocks.append(block.strip())
-                        continue
-
-                    if block.rstrip().endswith(">") \
-                        and self._equal_tags(left_tag, right_tag):
-                        new_blocks.append(
-                            self.markdown.htmlStash.store(block.strip()))
-                        continue
-                    else: #if not block[1] == "!":
-                        # if is block level tag and is not complete
-
-                        if markdown.isBlockLevel(left_tag) or left_tag == "--" \
-                            and not block.rstrip().endswith(">"):
-                            items.append(block.strip())
-                            in_tag = True
-                        else:
-                            new_blocks.append(
-                            self.markdown.htmlStash.store(block.strip()))
-
-                        continue
-
-                new_blocks.append(block)
-
-            else:
-                items.append(block.strip())
-
-                right_tag, data_index = self._get_right_tag(left_tag, block)
-
-                if self._equal_tags(left_tag, right_tag):
-                    # if find closing tag
-                    in_tag = False
-                    new_blocks.append(
-                        self.markdown.htmlStash.store('\n\n'.join(items)))
-                    items = []
-
-        if items:
-            new_blocks.append(self.markdown.htmlStash.store('\n\n'.join(items)))
-            new_blocks.append('\n')
-
-        new_text = "\n\n".join(new_blocks)
-        return new_text.split("\n")
-
-
-class ReferencePreprocessor(Preprocessor):
-    """ Remove reference definitions from text and store for later use. """
-
-    RE = re.compile(r'^(\ ?\ ?\ ?)\[([^\]]*)\]:\s*([^ ]*)(.*)$', re.DOTALL)
-
-    def run (self, lines):
-        new_text = [];
-        for line in lines:
-            m = self.RE.match(line)
-            if m:
-                id = m.group(2).strip().lower()
-                t = m.group(4).strip()  # potential title
-                if not t:
-                    self.markdown.references[id] = (m.group(3), t)
-                elif (len(t) >= 2
-                      and (t[0] == t[-1] == "\""
-                           or t[0] == t[-1] == "\'"
-                           or (t[0] == "(" and t[-1] == ")") ) ):
-                    self.markdown.references[id] = (m.group(3), t[1:-1])
-                else:
-                    new_text.append(line)
-            else:
-                new_text.append(line)
-
-        return new_text #+ "\n"
+        source = '\n'.join(lines)
+        parser = HTMLExtractor(self.md)
+        parser.feed(source)
+        parser.close()
+        return ''.join(parser.cleandoc).split('\n')