aboutsummaryrefslogtreecommitdiff
path: root/markdown/treeprocessors.py
diff options
context:
space:
mode:
Diffstat (limited to 'markdown/treeprocessors.py')
-rw-r--r-- markdown/treeprocessors.py 307
1 files changed, 218 insertions, 89 deletions
diff --git a/markdown/treeprocessors.py b/markdown/treeprocessors.py
index 1dc612a..e9f48ca 100644
--- a/markdown/treeprocessors.py
+++ b/markdown/treeprocessors.py
@@ -1,16 +1,47 @@
-import markdown
+"""
+Python Markdown
+
+A Python implementation of John Gruber's Markdown.
+
+Documentation: https://python-markdown.github.io/
+GitHub: https://github.com/Python-Markdown/markdown/
+PyPI: https://pypi.org/project/Markdown/
+
+Started by Manfred Stienstra (http://www.dwerg.net/).
+Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org).
+Currently maintained by Waylan Limberg (https://github.com/waylan),
+Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser).
+
+Copyright 2007-2018 The Python Markdown Project (v. 1.7 and later)
+Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
+Copyright 2004 Manfred Stienstra (the original version)
+
+License: BSD (see LICENSE.md for details).
+"""
+
import re
+import xml.etree.ElementTree as etree
+from . import util
+from . import inlinepatterns
+
+
+def build_treeprocessors(md, **kwargs):
+ """ Build the default treeprocessors for Markdown. """
+ treeprocessors = util.Registry()
+ treeprocessors.register(InlineProcessor(md), 'inline', 20)
+ treeprocessors.register(PrettifyTreeprocessor(md), 'prettify', 10)
+ treeprocessors.register(UnescapeTreeprocessor(md), 'unescape', 0)
+ return treeprocessors
+
def isString(s):
""" Check if it's string """
- return isinstance(s, unicode) or isinstance(s, str)
+ if not isinstance(s, util.AtomicString):
+ return isinstance(s, str)
+ return False
-class Processor:
- def __init__(self, markdown_instance=None):
- if markdown_instance:
- self.markdown = markdown_instance
-class Treeprocessor(Processor):
+class Treeprocessor(util.Processor):
"""
Treeprocessors are run on the ElementTree object before serialization.
@@ -24,11 +55,11 @@ class Treeprocessor(Processor):
def run(self, root):
"""
Subclasses of Treeprocessor should implement a `run` method, which
- takes a root ElementTree. This method can return another ElementTree
- object, and the existing root ElementTree will be replaced, or it can
+ takes a root ElementTree. This method can return another ElementTree
+ object, and the existing root ElementTree will be replaced, or it can
modify the current tree and return None.
"""
- pass
+ pass # pragma: no cover
class InlineProcessor(Treeprocessor):
@@ -36,18 +67,20 @@ class InlineProcessor(Treeprocessor):
A Treeprocessor that traverses a tree, applying inline patterns.
"""
- def __init__ (self, md):
- self.__placeholder_prefix = markdown.INLINE_PLACEHOLDER_PREFIX
- self.__placeholder_suffix = markdown.ETX
+ def __init__(self, md):
+ self.__placeholder_prefix = util.INLINE_PLACEHOLDER_PREFIX
+ self.__placeholder_suffix = util.ETX
self.__placeholder_length = 4 + len(self.__placeholder_prefix) \
+ len(self.__placeholder_suffix)
- self.__placeholder_re = re.compile(markdown.INLINE_PLACEHOLDER % r'([0-9]{4})')
- self.markdown = md
+ self.__placeholder_re = util.INLINE_PLACEHOLDER_RE
+ self.md = md
+ self.inlinePatterns = md.inlinePatterns
+ self.ancestors = []
def __makePlaceholder(self, type):
""" Generate a placeholder """
id = "%04d" % len(self.stashed_nodes)
- hash = markdown.INLINE_PLACEHOLDER % id
+ hash = util.INLINE_PLACEHOLDER % id
return hash, id
def __findPlaceholder(self, data, index):
@@ -60,8 +93,8 @@ class InlineProcessor(Treeprocessor):
* index: index, from which we start search
Returns: placeholder id and string index, after the found placeholder.
- """
+ """
m = self.__placeholder_re.search(data, index)
if m:
return m.group(1), m.end()
@@ -87,12 +120,13 @@ class InlineProcessor(Treeprocessor):
Returns: String with placeholders.
"""
- if not isinstance(data, markdown.AtomicString):
+ if not isinstance(data, util.AtomicString):
startIndex = 0
- while patternIndex < len(self.markdown.inlinePatterns):
+ count = len(self.inlinePatterns)
+ while patternIndex < count:
data, matched, startIndex = self.__applyPattern(
- self.markdown.inlinePatterns.value_for_index(patternIndex),
- data, patternIndex, startIndex)
+ self.inlinePatterns[patternIndex], data, patternIndex, startIndex
+ )
if not matched:
patternIndex += 1
return data
@@ -118,19 +152,18 @@ class InlineProcessor(Treeprocessor):
text = subnode.tail
subnode.tail = None
- childResult = self.__processPlaceholders(text, subnode)
+ childResult = self.__processPlaceholders(text, subnode, isText)
if not isText and node is not subnode:
- pos = node.getchildren().index(subnode)
- node.remove(subnode)
+ pos = list(node).index(subnode) + 1
else:
pos = 0
childResult.reverse()
for newChild in childResult:
- node.insert(pos, newChild)
+ node.insert(pos, newChild[0])
- def __processPlaceholders(self, data, parent):
+ def __processPlaceholders(self, data, parent, isText=True):
"""
Process string with placeholders and generate ElementTree tree.
@@ -140,20 +173,25 @@ class InlineProcessor(Treeprocessor):
* parent: Element, which contains processing inline data
Returns: list with ElementTree elements with applied inline patterns.
+
"""
def linkText(text):
if text:
if result:
- if result[-1].tail:
- result[-1].tail += text
+ if result[-1][0].tail:
+ result[-1][0].tail += text
+ else:
+ result[-1][0].tail = text
+ elif not isText:
+ if parent.tail:
+ parent.tail += text
else:
- result[-1].tail = text
+ parent.tail = text
else:
if parent.text:
parent.text += text
else:
parent.text = text
-
result = []
strartIndex = 0
while data:
@@ -168,28 +206,33 @@ class InlineProcessor(Treeprocessor):
text = data[strartIndex:index]
linkText(text)
- if not isString(node): # it's Element
- for child in [node] + node.getchildren():
+ if not isString(node): # it's Element
+ for child in [node] + list(node):
if child.tail:
if child.tail.strip():
- self.__processElementText(node, child, False)
+ self.__processElementText(
+ node, child, False
+ )
if child.text:
if child.text.strip():
self.__processElementText(child, child)
- else: # it's just a string
+ else: # it's just a string
linkText(node)
strartIndex = phEndIndex
continue
strartIndex = phEndIndex
- result.append(node)
+ result.append((node, self.ancestors[:]))
- else: # wrong placeholder
- end = index + len(prefix)
+ else: # wrong placeholder
+ end = index + len(self.__placeholder_prefix)
linkText(data[strartIndex:end])
strartIndex = end
else:
text = data[strartIndex:]
+ if isinstance(data, util.AtomicString):
+ # We don't want to lose the AtomicString
+ text = util.AtomicString(text)
linkText(text)
data = ""
@@ -205,94 +248,149 @@ class InlineProcessor(Treeprocessor):
* data: the text to be processed
* pattern: the pattern to be checked
* patternIndex: index of current pattern
- * startIndex: string index, from which we starting search
+ * startIndex: string index, from which we start searching
Returns: String with placeholders instead of ElementTree elements.
"""
- match = pattern.getCompiledRegExp().match(data[startIndex:])
- leftData = data[:startIndex]
+ new_style = isinstance(pattern, inlinepatterns.InlineProcessor)
+
+ for exclude in pattern.ANCESTOR_EXCLUDES:
+ if exclude.lower() in self.ancestors:
+ return data, False, 0
+
+ if new_style:
+ match = None
+ # Since handleMatch may reject our first match,
+ # we iterate over the buffer looking for matches
+ # until we can't find any more.
+ for match in pattern.getCompiledRegExp().finditer(data, startIndex):
+ node, start, end = pattern.handleMatch(match, data)
+ if start is None or end is None:
+ startIndex += match.end(0)
+ match = None
+ continue
+ break
+ else: # pragma: no cover
+ match = pattern.getCompiledRegExp().match(data[startIndex:])
+ leftData = data[:startIndex]
if not match:
return data, False, 0
- node = pattern.handleMatch(match)
+ if not new_style: # pragma: no cover
+ node = pattern.handleMatch(match)
+ start = match.start(0)
+ end = match.end(0)
if node is None:
- return data, True, len(leftData) + match.span(len(match.groups()))[0]
+ return data, True, end
if not isString(node):
- if not isinstance(node.text, markdown.AtomicString):
+ if not isinstance(node.text, util.AtomicString):
# We need to process current node too
- for child in [node] + node.getchildren():
+ for child in [node] + list(node):
if not isString(node):
if child.text:
- child.text = self.__handleInline(child.text,
- patternIndex + 1)
+ self.ancestors.append(child.tag.lower())
+ child.text = self.__handleInline(
+ child.text, patternIndex + 1
+ )
+ self.ancestors.pop()
if child.tail:
- child.tail = self.__handleInline(child.tail,
- patternIndex)
+ child.tail = self.__handleInline(
+ child.tail, patternIndex
+ )
placeholder = self.__stashNode(node, pattern.type())
- return "%s%s%s%s" % (leftData,
- match.group(1),
- placeholder, match.groups()[-1]), True, 0
-
- def run(self, tree):
+ if new_style:
+ return "{}{}{}".format(data[:start],
+ placeholder, data[end:]), True, 0
+ else: # pragma: no cover
+ return "{}{}{}{}".format(leftData,
+ match.group(1),
+ placeholder, match.groups()[-1]), True, 0
+
+ def __build_ancestors(self, parent, parents):
+ """Build the ancestor list."""
+ ancestors = []
+ while parent is not None:
+ if parent is not None:
+ ancestors.append(parent.tag.lower())
+ parent = self.parent_map.get(parent)
+ ancestors.reverse()
+ parents.extend(ancestors)
+
+ def run(self, tree, ancestors=None):
"""Apply inline patterns to a parsed Markdown tree.
Iterate over ElementTree, find elements with inline tag, apply inline
patterns and append newly created Elements to tree. If you don't
- want process your data with inline paterns, instead of normal string,
- use subclass AtomicString:
+ want to process your data with inline patterns, instead of normal
+ string, use subclass AtomicString:
- node.text = markdown.AtomicString("data won't be processed with inline patterns")
+ node.text = markdown.AtomicString("This will not be processed.")
Arguments:
- * markdownTree: ElementTree object, representing Markdown tree.
+ * tree: ElementTree object, representing Markdown tree.
+ * ancestors: List of parent tag names that precede the tree node (if needed).
Returns: ElementTree object with applied inline patterns.
"""
self.stashed_nodes = {}
- stack = [tree]
+ # Ensure a valid parent list, but copy passed in lists
+ # to ensure we don't have the user accidentally change it on us.
+ tree_parents = [] if ancestors is None else ancestors[:]
+
+ self.parent_map = {c: p for p in tree.iter() for c in p}
+ stack = [(tree, tree_parents)]
while stack:
- currElement = stack.pop()
+ currElement, parents = stack.pop()
+
+ self.ancestors = parents
+ self.__build_ancestors(currElement, self.ancestors)
+
insertQueue = []
- for child in currElement.getchildren():
- if child.text and not isinstance(child.text, markdown.AtomicString):
+ for child in currElement:
+ if child.text and not isinstance(
+ child.text, util.AtomicString
+ ):
+ self.ancestors.append(child.tag.lower())
text = child.text
child.text = None
- lst = self.__processPlaceholders(self.__handleInline(
- text), child)
+ lst = self.__processPlaceholders(
+ self.__handleInline(text), child
+ )
+ for item in lst:
+ self.parent_map[item[0]] = child
stack += lst
insertQueue.append((child, lst))
-
- if child.getchildren():
- stack.append(child)
+ self.ancestors.pop()
+ if child.tail:
+ tail = self.__handleInline(child.tail)
+ dumby = etree.Element('d')
+ child.tail = None
+ tailResult = self.__processPlaceholders(tail, dumby, False)
+ if dumby.tail:
+ child.tail = dumby.tail
+ pos = list(currElement).index(child) + 1
+ tailResult.reverse()
+ for newChild in tailResult:
+ self.parent_map[newChild[0]] = currElement
+ currElement.insert(pos, newChild[0])
+ if len(child):
+ self.parent_map[child] = currElement
+ stack.append((child, self.ancestors[:]))
for element, lst in insertQueue:
- if element.text:
- element.text = \
- markdown.inlinepatterns.handleAttributes(element.text,
- element)
- i = 0
- for newChild in lst:
- # Processing attributes
- if newChild.tail:
- newChild.tail = \
- markdown.inlinepatterns.handleAttributes(newChild.tail,
- element)
- if newChild.text:
- newChild.text = \
- markdown.inlinepatterns.handleAttributes(newChild.text,
- newChild)
+ for i, obj in enumerate(lst):
+ newChild = obj[0]
element.insert(i, newChild)
- i += 1
return tree
@@ -303,15 +401,13 @@ class PrettifyTreeprocessor(Treeprocessor):
""" Recursively add linebreaks to ElementTree children. """
i = "\n"
- if markdown.isBlockLevel(elem.tag) and elem.tag not in ['code', 'pre']:
+ if self.md.is_block_level(elem.tag) and elem.tag not in ['code', 'pre']:
if (not elem.text or not elem.text.strip()) \
- and len(elem) and markdown.isBlockLevel(elem[0].tag):
+ and len(elem) and self.md.is_block_level(elem[0].tag):
elem.text = i
for e in elem:
- if markdown.isBlockLevel(e.tag):
+ if self.md.is_block_level(e.tag):
self._prettifyETree(e)
- if not elem.tail or not elem.tail.strip():
- elem.tail = i
if not elem.tail or not elem.tail.strip():
elem.tail = i
@@ -319,11 +415,44 @@ class PrettifyTreeprocessor(Treeprocessor):
""" Add linebreaks to ElementTree root object. """
self._prettifyETree(root)
- # Do <br />'s seperately as they are often in the middle of
+ # Do <br />'s separately as they are often in the middle of
# inline content and missed by _prettifyETree.
- brs = root.getiterator('br')
+ brs = root.iter('br')
for br in brs:
if not br.tail or not br.tail.strip():
br.tail = '\n'
else:
br.tail = '\n%s' % br.tail
+ # Clean up extra empty lines at end of code blocks.
+ pres = root.iter('pre')
+ for pre in pres:
+ if len(pre) and pre[0].tag == 'code':
+ code = pre[0]
+ # Only prettify code containing text only
+ if not len(code) and code.text is not None:
+ code.text = util.AtomicString(code.text.rstrip() + '\n')
+
+
+class UnescapeTreeprocessor(Treeprocessor):
+ """ Restore escaped chars """
+
+ RE = re.compile(r'{}(\d+){}'.format(util.STX, util.ETX))
+
+ def _unescape(self, m):
+ return chr(int(m.group(1)))
+
+ def unescape(self, text):
+ return self.RE.sub(self._unescape, text)
+
+ def run(self, root):
+ """ Loop over all elements and unescape all text. """
+ for elem in root.iter():
+ # Unescape text content
+ if elem.text and not elem.tag == 'code':
+ elem.text = self.unescape(elem.text)
+ # Unescape tail content
+ if elem.tail:
+ elem.tail = self.unescape(elem.tail)
+ # Unescape attribute values
+ for key, value in elem.items():
+ elem.set(key, self.unescape(value))