diff options
Diffstat (limited to 'markdown/preprocessors.py')
-rw-r--r-- | markdown/preprocessors.py | 223 |
1 files changed, 45 insertions, 178 deletions
diff --git a/markdown/preprocessors.py b/markdown/preprocessors.py index ef04cab..e1023c5 100644 --- a/markdown/preprocessors.py +++ b/markdown/preprocessors.py @@ -1,24 +1,44 @@ - """ +Python Markdown + +A Python implementation of John Gruber's Markdown. + +Documentation: https://python-markdown.github.io/ +GitHub: https://github.com/Python-Markdown/markdown/ +PyPI: https://pypi.org/project/Markdown/ + +Started by Manfred Stienstra (http://www.dwerg.net/). +Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org). +Currently maintained by Waylan Limberg (https://github.com/waylan), +Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser). + +Copyright 2007-2018 The Python Markdown Project (v. 1.7 and later) +Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b) +Copyright 2004 Manfred Stienstra (the original version) + +License: BSD (see LICENSE.md for details). + PRE-PROCESSORS ============================================================================= Preprocessors work on source text before we start doing anything too -complicated. +complicated. """ +from . import util +from .htmlparser import HTMLExtractor import re -import markdown -HTML_PLACEHOLDER_PREFIX = markdown.STX+"wzxhzdk:" -HTML_PLACEHOLDER = HTML_PLACEHOLDER_PREFIX + "%d" + markdown.ETX -class Processor: - def __init__(self, markdown_instance=None): - if markdown_instance: - self.markdown = markdown_instance +def build_preprocessors(md, **kwargs): + """ Build the default set of preprocessors used by Markdown. """ + preprocessors = util.Registry() + preprocessors.register(NormalizeWhitespace(md), 'normalize_whitespace', 30) + preprocessors.register(HtmlBlockPreprocessor(md), 'html_block', 20) + return preprocessors + -class Preprocessor (Processor): +class Preprocessor(util.Processor): """ Preprocessors are run after the text is broken into lines. @@ -36,180 +56,27 @@ class Preprocessor (Processor): the (possibly modified) list of lines. """ - pass + pass # pragma: no cover -class HtmlStash: - """ - This class is used for stashing HTML objects that we extract - in the beginning and replace with place-holders. - """ - def __init__ (self): - """ Create a HtmlStash. """ - self.html_counter = 0 # for counting inline html segments - self.rawHtmlBlocks=[] +class NormalizeWhitespace(Preprocessor): + """ Normalize whitespace for consistent parsing. """ - def store(self, html, safe=False): - """ - Saves an HTML segment for later reinsertion. Returns a - placeholder string that needs to be inserted into the - document. - - Keyword arguments: - - * html: an html segment - * safe: label an html segment as safe for safemode - - Returns : a placeholder string - - """ - self.rawHtmlBlocks.append((html, safe)) - placeholder = HTML_PLACEHOLDER % self.html_counter - self.html_counter += 1 - return placeholder - - def reset(self): - self.html_counter = 0 - self.rawHtmlBlocks = [] + def run(self, lines): + source = '\n'.join(lines) + source = source.replace(util.STX, "").replace(util.ETX, "") + source = source.replace("\r\n", "\n").replace("\r", "\n") + "\n\n" + source = source.expandtabs(self.md.tab_length) + source = re.sub(r'(?<=\n) +\n', '\n', source) + return source.split('\n') class HtmlBlockPreprocessor(Preprocessor): """Remove html blocks from the text and store them for later retrieval.""" - right_tag_patterns = ["</%s>", "%s>"] - - def _get_left_tag(self, block): - return block[1:].replace(">", " ", 1).split()[0].lower() - - def _get_right_tag(self, left_tag, block): - for p in self.right_tag_patterns: - tag = p % left_tag - i = block.rfind(tag) - if i > 2: - return tag.lstrip("<").rstrip(">"), i + len(p)-2 + len(left_tag) - return block.rstrip()[-len(left_tag)-2:-1].lower(), len(block) - - def _equal_tags(self, left_tag, right_tag): - if left_tag == 'div' or left_tag[0] in ['?', '@', '%']: # handle PHP, etc. - return True - if ("/" + left_tag) == right_tag: - return True - if (right_tag == "--" and left_tag == "--"): - return True - elif left_tag == right_tag[1:] \ - and right_tag[0] != "<": - return True - else: - return False - - def _is_oneliner(self, tag): - return (tag in ['hr', 'hr/']) - def run(self, lines): - text = "\n".join(lines) - new_blocks = [] - text = text.split("\n\n") - items = [] - left_tag = '' - right_tag = '' - in_tag = False # flag - - while text: - block = text[0] - if block.startswith("\n"): - block = block[1:] - text = text[1:] - - if block.startswith("\n"): - block = block[1:] - - if not in_tag: - if block.startswith("<"): - left_tag = self._get_left_tag(block) - right_tag, data_index = self._get_right_tag(left_tag, block) - - if block[1] == "!": - # is a comment block - left_tag = "--" - right_tag, data_index = self._get_right_tag(left_tag, block) - # keep checking conditions below and maybe just append - - if data_index < len(block) \ - and markdown.isBlockLevel(left_tag): - text.insert(0, block[data_index:]) - block = block[:data_index] - - if not (markdown.isBlockLevel(left_tag) \ - or block[1] in ["!", "?", "@", "%"]): - new_blocks.append(block) - continue - - if self._is_oneliner(left_tag): - new_blocks.append(block.strip()) - continue - - if block.rstrip().endswith(">") \ - and self._equal_tags(left_tag, right_tag): - new_blocks.append( - self.markdown.htmlStash.store(block.strip())) - continue - else: #if not block[1] == "!": - # if is block level tag and is not complete - - if markdown.isBlockLevel(left_tag) or left_tag == "--" \ - and not block.rstrip().endswith(">"): - items.append(block.strip()) - in_tag = True - else: - new_blocks.append( - self.markdown.htmlStash.store(block.strip())) - - continue - - new_blocks.append(block) - - else: - items.append(block.strip()) - - right_tag, data_index = self._get_right_tag(left_tag, block) - - if self._equal_tags(left_tag, right_tag): - # if find closing tag - in_tag = False - new_blocks.append( - self.markdown.htmlStash.store('\n\n'.join(items))) - items = [] - - if items: - new_blocks.append(self.markdown.htmlStash.store('\n\n'.join(items))) - new_blocks.append('\n') - - new_text = "\n\n".join(new_blocks) - return new_text.split("\n") - - -class ReferencePreprocessor(Preprocessor): - """ Remove reference definitions from text and store for later use. """ - - RE = re.compile(r'^(\ ?\ ?\ ?)\[([^\]]*)\]:\s*([^ ]*)(.*)$', re.DOTALL) - - def run (self, lines): - new_text = []; - for line in lines: - m = self.RE.match(line) - if m: - id = m.group(2).strip().lower() - t = m.group(4).strip() # potential title - if not t: - self.markdown.references[id] = (m.group(3), t) - elif (len(t) >= 2 - and (t[0] == t[-1] == "\"" - or t[0] == t[-1] == "\'" - or (t[0] == "(" and t[-1] == ")") ) ): - self.markdown.references[id] = (m.group(3), t[1:-1]) - else: - new_text.append(line) - else: - new_text.append(line) - - return new_text #+ "\n" + source = '\n'.join(lines) + parser = HTMLExtractor(self.md) + parser.feed(source) + parser.close() + return ''.join(parser.cleandoc).split('\n') |