diff options
Diffstat (limited to 'markdown/treeprocessors.py')
-rw-r--r-- | markdown/treeprocessors.py | 307 |
1 files changed, 218 insertions, 89 deletions
diff --git a/markdown/treeprocessors.py b/markdown/treeprocessors.py index 1dc612a..e9f48ca 100644 --- a/markdown/treeprocessors.py +++ b/markdown/treeprocessors.py @@ -1,16 +1,47 @@ -import markdown +""" +Python Markdown + +A Python implementation of John Gruber's Markdown. + +Documentation: https://python-markdown.github.io/ +GitHub: https://github.com/Python-Markdown/markdown/ +PyPI: https://pypi.org/project/Markdown/ + +Started by Manfred Stienstra (http://www.dwerg.net/). +Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org). +Currently maintained by Waylan Limberg (https://github.com/waylan), +Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser). + +Copyright 2007-2018 The Python Markdown Project (v. 1.7 and later) +Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b) +Copyright 2004 Manfred Stienstra (the original version) + +License: BSD (see LICENSE.md for details). +""" + import re +import xml.etree.ElementTree as etree +from . import util +from . import inlinepatterns + + +def build_treeprocessors(md, **kwargs): + """ Build the default treeprocessors for Markdown. """ + treeprocessors = util.Registry() + treeprocessors.register(InlineProcessor(md), 'inline', 20) + treeprocessors.register(PrettifyTreeprocessor(md), 'prettify', 10) + treeprocessors.register(UnescapeTreeprocessor(md), 'unescape', 0) + return treeprocessors + def isString(s): """ Check if it's string """ - return isinstance(s, unicode) or isinstance(s, str) + if not isinstance(s, util.AtomicString): + return isinstance(s, str) + return False -class Processor: - def __init__(self, markdown_instance=None): - if markdown_instance: - self.markdown = markdown_instance -class Treeprocessor(Processor): +class Treeprocessor(util.Processor): """ Treeprocessors are run on the ElementTree object before serialization. @@ -24,11 +55,11 @@ class Treeprocessor(Processor): def run(self, root): """ Subclasses of Treeprocessor should implement a `run` method, which - takes a root ElementTree. This method can return another ElementTree - object, and the existing root ElementTree will be replaced, or it can + takes a root ElementTree. This method can return another ElementTree + object, and the existing root ElementTree will be replaced, or it can modify the current tree and return None. """ - pass + pass # pragma: no cover class InlineProcessor(Treeprocessor): @@ -36,18 +67,20 @@ class InlineProcessor(Treeprocessor): A Treeprocessor that traverses a tree, applying inline patterns. """ - def __init__ (self, md): - self.__placeholder_prefix = markdown.INLINE_PLACEHOLDER_PREFIX - self.__placeholder_suffix = markdown.ETX + def __init__(self, md): + self.__placeholder_prefix = util.INLINE_PLACEHOLDER_PREFIX + self.__placeholder_suffix = util.ETX self.__placeholder_length = 4 + len(self.__placeholder_prefix) \ + len(self.__placeholder_suffix) - self.__placeholder_re = re.compile(markdown.INLINE_PLACEHOLDER % r'([0-9]{4})') - self.markdown = md + self.__placeholder_re = util.INLINE_PLACEHOLDER_RE + self.md = md + self.inlinePatterns = md.inlinePatterns + self.ancestors = [] def __makePlaceholder(self, type): """ Generate a placeholder """ id = "%04d" % len(self.stashed_nodes) - hash = markdown.INLINE_PLACEHOLDER % id + hash = util.INLINE_PLACEHOLDER % id return hash, id def __findPlaceholder(self, data, index): @@ -60,8 +93,8 @@ class InlineProcessor(Treeprocessor): * index: index, from which we start search Returns: placeholder id and string index, after the found placeholder. - """ + """ m = self.__placeholder_re.search(data, index) if m: return m.group(1), m.end() @@ -87,12 +120,13 @@ class InlineProcessor(Treeprocessor): Returns: String with placeholders. """ - if not isinstance(data, markdown.AtomicString): + if not isinstance(data, util.AtomicString): startIndex = 0 - while patternIndex < len(self.markdown.inlinePatterns): + count = len(self.inlinePatterns) + while patternIndex < count: data, matched, startIndex = self.__applyPattern( - self.markdown.inlinePatterns.value_for_index(patternIndex), - data, patternIndex, startIndex) + self.inlinePatterns[patternIndex], data, patternIndex, startIndex + ) if not matched: patternIndex += 1 return data @@ -118,19 +152,18 @@ class InlineProcessor(Treeprocessor): text = subnode.tail subnode.tail = None - childResult = self.__processPlaceholders(text, subnode) + childResult = self.__processPlaceholders(text, subnode, isText) if not isText and node is not subnode: - pos = node.getchildren().index(subnode) - node.remove(subnode) + pos = list(node).index(subnode) + 1 else: pos = 0 childResult.reverse() for newChild in childResult: - node.insert(pos, newChild) + node.insert(pos, newChild[0]) - def __processPlaceholders(self, data, parent): + def __processPlaceholders(self, data, parent, isText=True): """ Process string with placeholders and generate ElementTree tree. @@ -140,20 +173,25 @@ class InlineProcessor(Treeprocessor): * parent: Element, which contains processing inline data Returns: list with ElementTree elements with applied inline patterns. + """ def linkText(text): if text: if result: - if result[-1].tail: - result[-1].tail += text + if result[-1][0].tail: + result[-1][0].tail += text + else: + result[-1][0].tail = text + elif not isText: + if parent.tail: + parent.tail += text else: - result[-1].tail = text + parent.tail = text else: if parent.text: parent.text += text else: parent.text = text - result = [] strartIndex = 0 while data: @@ -168,28 +206,33 @@ class InlineProcessor(Treeprocessor): text = data[strartIndex:index] linkText(text) - if not isString(node): # it's Element - for child in [node] + node.getchildren(): + if not isString(node): # it's Element + for child in [node] + list(node): if child.tail: if child.tail.strip(): - self.__processElementText(node, child, False) + self.__processElementText( + node, child, False + ) if child.text: if child.text.strip(): self.__processElementText(child, child) - else: # it's just a string + else: # it's just a string linkText(node) strartIndex = phEndIndex continue strartIndex = phEndIndex - result.append(node) + result.append((node, self.ancestors[:])) - else: # wrong placeholder - end = index + len(prefix) + else: # wrong placeholder + end = index + len(self.__placeholder_prefix) linkText(data[strartIndex:end]) strartIndex = end else: text = data[strartIndex:] + if isinstance(data, util.AtomicString): + # We don't want to loose the AtomicString + text = util.AtomicString(text) linkText(text) data = "" @@ -205,94 +248,149 @@ class InlineProcessor(Treeprocessor): * data: the text to be processed * pattern: the pattern to be checked * patternIndex: index of current pattern - * startIndex: string index, from which we starting search + * startIndex: string index, from which we start searching Returns: String with placeholders instead of ElementTree elements. """ - match = pattern.getCompiledRegExp().match(data[startIndex:]) - leftData = data[:startIndex] + new_style = isinstance(pattern, inlinepatterns.InlineProcessor) + + for exclude in pattern.ANCESTOR_EXCLUDES: + if exclude.lower() in self.ancestors: + return data, False, 0 + + if new_style: + match = None + # Since handleMatch may reject our first match, + # we iterate over the buffer looking for matches + # until we can't find any more. + for match in pattern.getCompiledRegExp().finditer(data, startIndex): + node, start, end = pattern.handleMatch(match, data) + if start is None or end is None: + startIndex += match.end(0) + match = None + continue + break + else: # pragma: no cover + match = pattern.getCompiledRegExp().match(data[startIndex:]) + leftData = data[:startIndex] if not match: return data, False, 0 - node = pattern.handleMatch(match) + if not new_style: # pragma: no cover + node = pattern.handleMatch(match) + start = match.start(0) + end = match.end(0) if node is None: - return data, True, len(leftData) + match.span(len(match.groups()))[0] + return data, True, end if not isString(node): - if not isinstance(node.text, markdown.AtomicString): + if not isinstance(node.text, util.AtomicString): # We need to process current node too - for child in [node] + node.getchildren(): + for child in [node] + list(node): if not isString(node): if child.text: - child.text = self.__handleInline(child.text, - patternIndex + 1) + self.ancestors.append(child.tag.lower()) + child.text = self.__handleInline( + child.text, patternIndex + 1 + ) + self.ancestors.pop() if child.tail: - child.tail = self.__handleInline(child.tail, - patternIndex) + child.tail = self.__handleInline( + child.tail, patternIndex + ) placeholder = self.__stashNode(node, pattern.type()) - return "%s%s%s%s" % (leftData, - match.group(1), - placeholder, match.groups()[-1]), True, 0 - - def run(self, tree): + if new_style: + return "{}{}{}".format(data[:start], + placeholder, data[end:]), True, 0 + else: # pragma: no cover + return "{}{}{}{}".format(leftData, + match.group(1), + placeholder, match.groups()[-1]), True, 0 + + def __build_ancestors(self, parent, parents): + """Build the ancestor list.""" + ancestors = [] + while parent is not None: + if parent is not None: + ancestors.append(parent.tag.lower()) + parent = self.parent_map.get(parent) + ancestors.reverse() + parents.extend(ancestors) + + def run(self, tree, ancestors=None): """Apply inline patterns to a parsed Markdown tree. Iterate over ElementTree, find elements with inline tag, apply inline patterns and append newly created Elements to tree. If you don't - want process your data with inline paterns, instead of normal string, - use subclass AtomicString: + want to process your data with inline patterns, instead of normal + string, use subclass AtomicString: - node.text = markdown.AtomicString("data won't be processed with inline patterns") + node.text = markdown.AtomicString("This will not be processed.") Arguments: - * markdownTree: ElementTree object, representing Markdown tree. + * tree: ElementTree object, representing Markdown tree. + * ancestors: List of parent tag names that precede the tree node (if needed). Returns: ElementTree object with applied inline patterns. """ self.stashed_nodes = {} - stack = [tree] + # Ensure a valid parent list, but copy passed in lists + # to ensure we don't have the user accidentally change it on us. + tree_parents = [] if ancestors is None else ancestors[:] + + self.parent_map = {c: p for p in tree.iter() for c in p} + stack = [(tree, tree_parents)] while stack: - currElement = stack.pop() + currElement, parents = stack.pop() + + self.ancestors = parents + self.__build_ancestors(currElement, self.ancestors) + insertQueue = [] - for child in currElement.getchildren(): - if child.text and not isinstance(child.text, markdown.AtomicString): + for child in currElement: + if child.text and not isinstance( + child.text, util.AtomicString + ): + self.ancestors.append(child.tag.lower()) text = child.text child.text = None - lst = self.__processPlaceholders(self.__handleInline( - text), child) + lst = self.__processPlaceholders( + self.__handleInline(text), child + ) + for item in lst: + self.parent_map[item[0]] = child stack += lst insertQueue.append((child, lst)) - - if child.getchildren(): - stack.append(child) + self.ancestors.pop() + if child.tail: + tail = self.__handleInline(child.tail) + dumby = etree.Element('d') + child.tail = None + tailResult = self.__processPlaceholders(tail, dumby, False) + if dumby.tail: + child.tail = dumby.tail + pos = list(currElement).index(child) + 1 + tailResult.reverse() + for newChild in tailResult: + self.parent_map[newChild[0]] = currElement + currElement.insert(pos, newChild[0]) + if len(child): + self.parent_map[child] = currElement + stack.append((child, self.ancestors[:])) for element, lst in insertQueue: - if element.text: - element.text = \ - markdown.inlinepatterns.handleAttributes(element.text, - element) - i = 0 - for newChild in lst: - # Processing attributes - if newChild.tail: - newChild.tail = \ - markdown.inlinepatterns.handleAttributes(newChild.tail, - element) - if newChild.text: - newChild.text = \ - markdown.inlinepatterns.handleAttributes(newChild.text, - newChild) + for i, obj in enumerate(lst): + newChild = obj[0] element.insert(i, newChild) - i += 1 return tree @@ -303,15 +401,13 @@ class PrettifyTreeprocessor(Treeprocessor): """ Recursively add linebreaks to ElementTree children. """ i = "\n" - if markdown.isBlockLevel(elem.tag) and elem.tag not in ['code', 'pre']: + if self.md.is_block_level(elem.tag) and elem.tag not in ['code', 'pre']: if (not elem.text or not elem.text.strip()) \ - and len(elem) and markdown.isBlockLevel(elem[0].tag): + and len(elem) and self.md.is_block_level(elem[0].tag): elem.text = i for e in elem: - if markdown.isBlockLevel(e.tag): + if self.md.is_block_level(e.tag): self._prettifyETree(e) - if not elem.tail or not elem.tail.strip(): - elem.tail = i if not elem.tail or not elem.tail.strip(): elem.tail = i @@ -319,11 +415,44 @@ class PrettifyTreeprocessor(Treeprocessor): """ Add linebreaks to ElementTree root object. """ self._prettifyETree(root) - # Do <br />'s seperately as they are often in the middle of + # Do <br />'s separately as they are often in the middle of # inline content and missed by _prettifyETree. - brs = root.getiterator('br') + brs = root.iter('br') for br in brs: if not br.tail or not br.tail.strip(): br.tail = '\n' else: br.tail = '\n%s' % br.tail + # Clean up extra empty lines at end of code blocks. + pres = root.iter('pre') + for pre in pres: + if len(pre) and pre[0].tag == 'code': + code = pre[0] + # Only prettify code containing text only + if not len(code) and code.text is not None: + code.text = util.AtomicString(code.text.rstrip() + '\n') + + +class UnescapeTreeprocessor(Treeprocessor): + """ Restore escaped chars """ + + RE = re.compile(r'{}(\d+){}'.format(util.STX, util.ETX)) + + def _unescape(self, m): + return chr(int(m.group(1))) + + def unescape(self, text): + return self.RE.sub(self._unescape, text) + + def run(self, root): + """ Loop over all elements and unescape all text. """ + for elem in root.iter(): + # Unescape text content + if elem.text and not elem.tag == 'code': + elem.text = self.unescape(elem.text) + # Unescape tail content + if elem.tail: + elem.tail = self.unescape(elem.tail) + # Unescape attribute values + for key, value in elem.items(): + elem.set(key, self.unescape(value)) |