diff options
Diffstat (limited to 'markdown/inlinepatterns.py')
-rw-r--r-- | markdown/inlinepatterns.py | 875 |
1 files changed, 695 insertions, 180 deletions
diff --git a/markdown/inlinepatterns.py b/markdown/inlinepatterns.py index 917a9d3..eb313bd 100644 --- a/markdown/inlinepatterns.py +++ b/markdown/inlinepatterns.py @@ -1,4 +1,23 @@ """ +Python Markdown + +A Python implementation of John Gruber's Markdown. + +Documentation: https://python-markdown.github.io/ +GitHub: https://github.com/Python-Markdown/markdown/ +PyPI: https://pypi.org/project/Markdown/ + +Started by Manfred Stienstra (http://www.dwerg.net/). +Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org). +Currently maintained by Waylan Limberg (https://github.com/waylan), +Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser). + +Copyright 2007-2018 The Python Markdown Project (v. 1.7 and later) +Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b) +Copyright 2004 Manfred Stienstra (the original version) + +License: BSD (see LICENSE.md for details). + INLINE PATTERNS ============================================================================= @@ -41,71 +60,129 @@ So, we apply the expressions in the following order: * finally we apply strong and emphasis """ -import markdown +from . import util +from collections import namedtuple import re -from urlparse import urlparse, urlunparse -import sys -if sys.version >= "3.0": - from html import entities as htmlentitydefs -else: - import htmlentitydefs +import xml.etree.ElementTree as etree +try: # pragma: no cover + from html import entities +except ImportError: # pragma: no cover + import htmlentitydefs as entities + + +def build_inlinepatterns(md, **kwargs): + """ Build the default set of inline patterns for Markdown. """ + inlinePatterns = util.Registry() + inlinePatterns.register(BacktickInlineProcessor(BACKTICK_RE), 'backtick', 190) + inlinePatterns.register(EscapeInlineProcessor(ESCAPE_RE, md), 'escape', 180) + inlinePatterns.register(ReferenceInlineProcessor(REFERENCE_RE, md), 'reference', 170) + inlinePatterns.register(LinkInlineProcessor(LINK_RE, md), 'link', 160) + inlinePatterns.register(ImageInlineProcessor(IMAGE_LINK_RE, md), 'image_link', 150) + inlinePatterns.register( + ImageReferenceInlineProcessor(IMAGE_REFERENCE_RE, md), 'image_reference', 140 + ) + inlinePatterns.register( + ShortReferenceInlineProcessor(REFERENCE_RE, md), 'short_reference', 130 + ) + inlinePatterns.register( + ShortImageReferenceInlineProcessor(IMAGE_REFERENCE_RE, md), 'short_image_ref', 125 + ) + inlinePatterns.register(AutolinkInlineProcessor(AUTOLINK_RE, md), 'autolink', 120) + inlinePatterns.register(AutomailInlineProcessor(AUTOMAIL_RE, md), 'automail', 110) + inlinePatterns.register(SubstituteTagInlineProcessor(LINE_BREAK_RE, 'br'), 'linebreak', 100) + inlinePatterns.register(HtmlInlineProcessor(HTML_RE, md), 'html', 90) + inlinePatterns.register(HtmlInlineProcessor(ENTITY_RE, md), 'entity', 80) + inlinePatterns.register(SimpleTextInlineProcessor(NOT_STRONG_RE), 'not_strong', 70) + inlinePatterns.register(AsteriskProcessor(r'\*'), 'em_strong', 60) + inlinePatterns.register(UnderscoreProcessor(r'_'), 'em_strong2', 50) + return inlinePatterns + """ The actual regular expressions for patterns ----------------------------------------------------------------------------- """ -NOBRACKET = r'[^\]\[]*' -BRK = ( r'\[(' - + (NOBRACKET + r'(\[')*6 - + (NOBRACKET+ r'\])*')*6 - + NOBRACKET + r')\]' ) NOIMG = r'(?<!\!)' -BACKTICK_RE = r'(?<!\\)(`+)(.+?)(?<!`)\2(?!`)' # `e=f()` or ``e=f("`")`` -ESCAPE_RE = r'\\(.)' # \< -EMPHASIS_RE = r'(\*)([^\*]+)\2' # *emphasis* -STRONG_RE = r'(\*{2}|_{2})(.+?)\2' # **strong** -STRONG_EM_RE = r'(\*{3}|_{3})(.+?)\2' # ***strong*** +# `e=f()` or ``e=f("`")`` +BACKTICK_RE = r'(?:(?<!\\)((?:\\{2})+)(?=`+)|(?<!\\)(`+)(.+?)(?<!`)\2(?!`))' + +# \< +ESCAPE_RE = r'\\(.)' + +# *emphasis* +EMPHASIS_RE = r'(\*)([^\*]+)\1' + +# **strong** +STRONG_RE = r'(\*{2})(.+?)\1' + +# __smart__strong__ +SMART_STRONG_RE = r'(?<!\w)(_{2})(?!_)(.+?)(?<!_)\1(?!\w)' -if markdown.SMART_EMPHASIS: - EMPHASIS_2_RE = r'(?<!\w)(_)(\S.+?)\2(?!\w)' # _emphasis_ -else: - EMPHASIS_2_RE = r'(_)(.+?)\2' # _emphasis_ +# _smart_emphasis_ +SMART_EMPHASIS_RE = r'(?<!\w)(_)(?!_)(.+?)(?<!_)\1(?!\w)' -LINK_RE = NOIMG + BRK + \ -r'''\(\s*(<.*?>|((?:(?:\(.*?\))|[^\(\)]))*?)\s*((['"])(.*?)\12)?\)''' -# [text](url) or [text](<url>) +# __strong _em__ +SMART_STRONG_EM_RE = r'(?<!\w)(\_)\1(?!\1)(.+?)(?<!\w)\1(?!\1)(.+?)\1{3}(?!\w)' + +# ***strongem*** or ***em*strong** +EM_STRONG_RE = r'(\*)\1{2}(.+?)\1(.*?)\1{2}' + +# ___strongem___ or ___em_strong__ +EM_STRONG2_RE = r'(_)\1{2}(.+?)\1(.*?)\1{2}' + +# ***strong**em* +STRONG_EM_RE = r'(\*)\1{2}(.+?)\1{2}(.*?)\1' + +# ___strong__em_ +STRONG_EM2_RE = r'(_)\1{2}(.+?)\1{2}(.*?)\1' + +# **strong*em*** +STRONG_EM3_RE = r'(\*)\1(?!\1)([^*]+?)\1(?!\1)(.+?)\1{3}' + +# [text](url) or [text](<url>) or [text](url "title") +LINK_RE = NOIMG + r'\[' -IMAGE_LINK_RE = r'\!' + BRK + r'\s*\((<.*?>|([^\)]*))\)' # ![alttxt](http://x.com/) or ![alttxt](<http://x.com/>) -REFERENCE_RE = NOIMG + BRK+ r'\s*\[([^\]]*)\]' # [Google][3] -IMAGE_REFERENCE_RE = r'\!' + BRK + '\s*\[([^\]]*)\]' # ![alt text][2] -NOT_STRONG_RE = r'((^| )(\*|_)( |$))' # stand-alone * or _ -AUTOLINK_RE = r'<((?:f|ht)tps?://[^>]*)>' # <http://www.123.com> -AUTOMAIL_RE = r'<([^> \!]*@[^> ]*)>' # <me@example.com> +IMAGE_LINK_RE = r'\!\[' + +# [Google][3] +REFERENCE_RE = LINK_RE + +# ![alt text][2] +IMAGE_REFERENCE_RE = IMAGE_LINK_RE + +# stand-alone * or _ +NOT_STRONG_RE = r'((^|\s)(\*|_)(\s|$))' + +# <http://www.123.com> +AUTOLINK_RE = r'<((?:[Ff]|[Hh][Tt])[Tt][Pp][Ss]?://[^<>]*)>' -HTML_RE = r'(\<([a-zA-Z/][^\>]*?|\!--.*?--)\>)' # <...> -ENTITY_RE = r'(&[\#a-zA-Z0-9]*;)' # & -LINE_BREAK_RE = r' \n' # two spaces at end of line -LINE_BREAK_2_RE = r' $' # two spaces at end of text +# <me@example.com> +AUTOMAIL_RE = r'<([^<> !]+@[^@<> ]+)>' + +# <...> +HTML_RE = r'(<(\/?[a-zA-Z][^<>@ ]*( [^<>]*)?|!--(?:(?!<!--|-->).)*--)>)' + +# "&" (decimal) or "&" (hex) or "&" (named) +ENTITY_RE = r'(&(?:\#[0-9]+|\#x[0-9a-fA-F]+|[a-zA-Z0-9]+);)' + +# two spaces at end of line +LINE_BREAK_RE = r' \n' def dequote(string): """Remove quotes from around a string.""" - if ( ( string.startswith('"') and string.endswith('"')) - or (string.startswith("'") and string.endswith("'")) ): + if ((string.startswith('"') and string.endswith('"')) or + (string.startswith("'") and string.endswith("'"))): return string[1:-1] else: return string -ATTR_RE = re.compile("\{@([^\}]*)=([^\}]*)}") # {@id=123} -def handleAttributes(text, parent): - """Set values of an element based on attribute definitions ({@id=123}).""" - def attributeCallback(match): - parent.set(match.group(1), match.group(2).replace('\n', ' ')) - return ATTR_RE.sub(attributeCallback, text) +class EmStrongItem(namedtuple('EmStrongItem', ['pattern', 'builder', 'tags'])): + """Emphasis/strong pattern item.""" """ @@ -113,10 +190,13 @@ The pattern classes ----------------------------------------------------------------------------- """ -class Pattern: + +class Pattern: # pragma: no cover """Base class that inline patterns subclass. """ - def __init__ (self, pattern, markdown_instance=None): + ANCESTOR_EXCLUDES = tuple() + + def __init__(self, pattern, md=None): """ Create an instant of an inline pattern. @@ -126,14 +206,12 @@ class Pattern: """ self.pattern = pattern - self.compiled_re = re.compile("^(.*?)%s(.*?)$" % pattern, re.DOTALL) + self.compiled_re = re.compile(r"^(.*?)%s(.*)$" % pattern, + re.DOTALL | re.UNICODE) - # Api for Markdown to pass safe_mode into instance - self.safe_mode = False - if markdown_instance: - self.markdown = markdown_instance + self.md = md - def getCompiledRegExp (self): + def getCompiledRegExp(self): """ Return a compiled regular expression. """ return self.compiled_re @@ -147,57 +225,163 @@ class Pattern: * m: A re match object containing a match of the pattern. """ - pass + pass # pragma: no cover def type(self): """ Return class name, to define pattern type """ return self.__class__.__name__ -BasePattern = Pattern # for backward compatibility + def unescape(self, text): + """ Return unescaped text given text with an inline placeholder. """ + try: + stash = self.md.treeprocessors['inline'].stashed_nodes + except KeyError: # pragma: no cover + return text + + def get_stash(m): + id = m.group(1) + if id in stash: + value = stash.get(id) + if isinstance(value, str): + return value + else: + # An etree Element - return text content only + return ''.join(value.itertext()) + return util.INLINE_PLACEHOLDER_RE.sub(get_stash, text) + + +class InlineProcessor(Pattern): + """ + Base class that inline patterns subclass. + + This is the newer style inline processor that uses a more + efficient and flexible search approach. + """ + + def __init__(self, pattern, md=None): + """ + Create an instant of an inline pattern. + + Keyword arguments: + + * pattern: A regular expression that matches a pattern + + """ + self.pattern = pattern + self.compiled_re = re.compile(pattern, re.DOTALL | re.UNICODE) + + # Api for Markdown to pass safe_mode into instance + self.safe_mode = False + self.md = md + + def handleMatch(self, m, data): + """Return a ElementTree element from the given match and the + start and end index of the matched text. + + If `start` and/or `end` are returned as `None`, it will be + assumed that the processor did not find a valid region of text. + + Subclasses should override this method. + + Keyword arguments: + + * m: A re match object containing a match of the pattern. + * data: The buffer current under analysis -class SimpleTextPattern (Pattern): + Returns: + + * el: The ElementTree element, text or None. + * start: The start of the region that has been matched or None. + * end: The end of the region that has been matched or None. + + """ + pass # pragma: no cover + + +class SimpleTextPattern(Pattern): # pragma: no cover """ Return a simple text of group(2) of a Pattern. """ def handleMatch(self, m): - text = m.group(2) - if text == markdown.INLINE_PLACEHOLDER_PREFIX: - return None - return text + return m.group(2) + + +class SimpleTextInlineProcessor(InlineProcessor): + """ Return a simple text of group(1) of a Pattern. """ + def handleMatch(self, m, data): + return m.group(1), m.start(0), m.end(0) + + +class EscapeInlineProcessor(InlineProcessor): + """ Return an escaped character. """ -class SimpleTagPattern (Pattern): + def handleMatch(self, m, data): + char = m.group(1) + if char in self.md.ESCAPED_CHARS: + return '{}{}{}'.format(util.STX, ord(char), util.ETX), m.start(0), m.end(0) + else: + return None, m.start(0), m.end(0) + + +class SimpleTagPattern(Pattern): # pragma: no cover """ Return element of type `tag` with a text attribute of group(3) of a Pattern. """ - def __init__ (self, pattern, tag): + def __init__(self, pattern, tag): Pattern.__init__(self, pattern) self.tag = tag def handleMatch(self, m): - el = markdown.etree.Element(self.tag) + el = etree.Element(self.tag) el.text = m.group(3) return el -class SubstituteTagPattern (SimpleTagPattern): - """ Return a eLement of type `tag` with no children. """ - def handleMatch (self, m): - return markdown.etree.Element(self.tag) +class SimpleTagInlineProcessor(InlineProcessor): + """ + Return element of type `tag` with a text attribute of group(2) + of a Pattern. + """ + def __init__(self, pattern, tag): + InlineProcessor.__init__(self, pattern) + self.tag = tag -class BacktickPattern (Pattern): - """ Return a `<code>` element containing the matching text. """ - def __init__ (self, pattern): - Pattern.__init__(self, pattern) - self.tag = "code" + def handleMatch(self, m, data): # pragma: no cover + el = etree.Element(self.tag) + el.text = m.group(2) + return el, m.start(0), m.end(0) + +class SubstituteTagPattern(SimpleTagPattern): # pragma: no cover + """ Return an element of type `tag` with no children. """ def handleMatch(self, m): - el = markdown.etree.Element(self.tag) - el.text = markdown.AtomicString(m.group(3).strip()) - return el + return etree.Element(self.tag) + + +class SubstituteTagInlineProcessor(SimpleTagInlineProcessor): + """ Return an element of type `tag` with no children. """ + def handleMatch(self, m, data): + return etree.Element(self.tag), m.start(0), m.end(0) + + +class BacktickInlineProcessor(InlineProcessor): + """ Return a `<code>` element containing the matching text. """ + def __init__(self, pattern): + InlineProcessor.__init__(self, pattern) + self.ESCAPED_BSLASH = '{}{}{}'.format(util.STX, ord('\\'), util.ETX) + self.tag = 'code' + + def handleMatch(self, m, data): + if m.group(3): + el = etree.Element(self.tag) + el.text = util.AtomicString(util.code_escape(m.group(3).strip())) + return el, m.start(0), m.end(0) + else: + return m.group(1).replace('\\\\', self.ESCAPED_BSLASH), m.start(0), m.end(0) -class DoubleTagPattern (SimpleTagPattern): +class DoubleTagPattern(SimpleTagPattern): # pragma: no cover """Return a ElementTree element nested in tag2 nested in tag1. Useful for strong emphasis etc. @@ -205,117 +389,432 @@ class DoubleTagPattern (SimpleTagPattern): """ def handleMatch(self, m): tag1, tag2 = self.tag.split(",") - el1 = markdown.etree.Element(tag1) - el2 = markdown.etree.SubElement(el1, tag2) + el1 = etree.Element(tag1) + el2 = etree.SubElement(el1, tag2) el2.text = m.group(3) + if len(m.groups()) == 5: + el2.tail = m.group(4) return el1 -class HtmlPattern (Pattern): +class DoubleTagInlineProcessor(SimpleTagInlineProcessor): + """Return a ElementTree element nested in tag2 nested in tag1. + + Useful for strong emphasis etc. + + """ + def handleMatch(self, m, data): # pragma: no cover + tag1, tag2 = self.tag.split(",") + el1 = etree.Element(tag1) + el2 = etree.SubElement(el1, tag2) + el2.text = m.group(2) + if len(m.groups()) == 3: + el2.tail = m.group(3) + return el1, m.start(0), m.end(0) + + +class HtmlInlineProcessor(InlineProcessor): """ Store raw inline html and return a placeholder. """ - def handleMatch (self, m): - rawhtml = m.group(2) - inline = True - place_holder = self.markdown.htmlStash.store(rawhtml) - return place_holder + def handleMatch(self, m, data): + rawhtml = self.unescape(m.group(1)) + place_holder = self.md.htmlStash.store(rawhtml) + return place_holder, m.start(0), m.end(0) + + def unescape(self, text): + """ Return unescaped text given text with an inline placeholder. """ + try: + stash = self.md.treeprocessors['inline'].stashed_nodes + except KeyError: # pragma: no cover + return text + + def get_stash(m): + id = m.group(1) + value = stash.get(id) + if value is not None: + try: + return self.md.serializer(value) + except Exception: + return r'\%s' % value + + return util.INLINE_PLACEHOLDER_RE.sub(get_stash, text) + + +class AsteriskProcessor(InlineProcessor): + """Emphasis processor for handling strong and em matches inside asterisks.""" + + PATTERNS = [ + EmStrongItem(re.compile(EM_STRONG_RE, re.DOTALL | re.UNICODE), 'double', 'strong,em'), + EmStrongItem(re.compile(STRONG_EM_RE, re.DOTALL | re.UNICODE), 'double', 'em,strong'), + EmStrongItem(re.compile(STRONG_EM3_RE, re.DOTALL | re.UNICODE), 'double2', 'strong,em'), + EmStrongItem(re.compile(STRONG_RE, re.DOTALL | re.UNICODE), 'single', 'strong'), + EmStrongItem(re.compile(EMPHASIS_RE, re.DOTALL | re.UNICODE), 'single', 'em') + ] + + def build_single(self, m, tag, idx): + """Return single tag.""" + el1 = etree.Element(tag) + text = m.group(2) + self.parse_sub_patterns(text, el1, None, idx) + return el1 + def build_double(self, m, tags, idx): + """Return double tag.""" -class LinkPattern (Pattern): - """ Return a link element from the given match. """ - def handleMatch(self, m): - el = markdown.etree.Element("a") - el.text = m.group(2) - title = m.group(11) - href = m.group(9) + tag1, tag2 = tags.split(",") + el1 = etree.Element(tag1) + el2 = etree.Element(tag2) + text = m.group(2) + self.parse_sub_patterns(text, el2, None, idx) + el1.append(el2) + if len(m.groups()) == 3: + text = m.group(3) + self.parse_sub_patterns(text, el1, el2, idx) + return el1 - if href: - if href[0] == "<": - href = href[1:-1] - el.set("href", self.sanitize_url(href.strip())) - else: - el.set("href", "") + def build_double2(self, m, tags, idx): + """Return double tags (variant 2): `<strong>text <em>text</em></strong>`.""" - if title: - title = dequote(title) #.replace('"', """) - el.set("title", title) - return el + tag1, tag2 = tags.split(",") + el1 = etree.Element(tag1) + el2 = etree.Element(tag2) + text = m.group(2) + self.parse_sub_patterns(text, el1, None, idx) + text = m.group(3) + el1.append(el2) + self.parse_sub_patterns(text, el2, None, idx) + return el1 - def sanitize_url(self, url): + def parse_sub_patterns(self, data, parent, last, idx): """ - Sanitize a url against xss attacks in "safe_mode". - - Rather than specifically blacklisting `javascript:alert("XSS")` and all - its aliases (see <http://ha.ckers.org/xss.html>), we whitelist known - safe url formats. Most urls contain a network location, however some - are known not to (i.e.: mailto links). Script urls do not contain a - location. Additionally, for `javascript:...`, the scheme would be - "javascript" but some aliases will appear to `urlparse()` to have no - scheme. On top of that relative links (i.e.: "foo/bar.html") have no - scheme. Therefore we must check "path", "parameters", "query" and - "fragment" for any literal colons. We don't check "scheme" for colons - because it *should* never have any and "netloc" must allow the form: - `username:password@host:port`. + Parses sub patterns. + + `data` (`str`): + text to evaluate. + + `parent` (`etree.Element`): + Parent to attach text and sub elements to. + + `last` (`etree.Element`): + Last appended child to parent. Can also be None if parent has no children. + + `idx` (`int`): + Current pattern index that was used to evaluate the parent. """ - locless_schemes = ['', 'mailto', 'news'] - scheme, netloc, path, params, query, fragment = url = urlparse(url) - safe_url = False - if netloc != '' or scheme in locless_schemes: - safe_url = True - - for part in url[2:]: - if ":" in part: - safe_url = False - - if self.markdown.safeMode and not safe_url: - return '' + + offset = 0 + pos = 0 + + length = len(data) + while pos < length: + # Find the start of potential emphasis or strong tokens + if self.compiled_re.match(data, pos): + matched = False + # See if the we can match an emphasis/strong pattern + for index, item in enumerate(self.PATTERNS): + # Only evaluate patterns that are after what was used on the parent + if index <= idx: + continue + m = item.pattern.match(data, pos) + if m: + # Append child nodes to parent + # Text nodes should be appended to the last + # child if present, and if not, it should + # be added as the parent's text node. + text = data[offset:m.start(0)] + if text: + if last is not None: + last.tail = text + else: + parent.text = text + el = self.build_element(m, item.builder, item.tags, index) + parent.append(el) + last = el + # Move our position past the matched hunk + offset = pos = m.end(0) + matched = True + if not matched: + # We matched nothing, move on to the next character + pos += 1 + else: + # Increment position as no potential emphasis start was found. + pos += 1 + + # Append any leftover text as a text node. + text = data[offset:] + if text: + if last is not None: + last.tail = text + else: + parent.text = text + + def build_element(self, m, builder, tags, index): + """Element builder.""" + + if builder == 'double2': + return self.build_double2(m, tags, index) + elif builder == 'double': + return self.build_double(m, tags, index) else: - return urlunparse(url) + return self.build_single(m, tags, index) + + def handleMatch(self, m, data): + """Parse patterns.""" + + el = None + start = None + end = None + + for index, item in enumerate(self.PATTERNS): + m1 = item.pattern.match(data, m.start(0)) + if m1: + start = m1.start(0) + end = m1.end(0) + el = self.build_element(m1, item.builder, item.tags, index) + break + return el, start, end -class ImagePattern(LinkPattern): + +class UnderscoreProcessor(AsteriskProcessor): + """Emphasis processor for handling strong and em matches inside underscores.""" + + PATTERNS = [ + EmStrongItem(re.compile(EM_STRONG2_RE, re.DOTALL | re.UNICODE), 'double', 'strong,em'), + EmStrongItem(re.compile(STRONG_EM2_RE, re.DOTALL | re.UNICODE), 'double', 'em,strong'), + EmStrongItem(re.compile(SMART_STRONG_EM_RE, re.DOTALL | re.UNICODE), 'double2', 'strong,em'), + EmStrongItem(re.compile(SMART_STRONG_RE, re.DOTALL | re.UNICODE), 'single', 'strong'), + EmStrongItem(re.compile(SMART_EMPHASIS_RE, re.DOTALL | re.UNICODE), 'single', 'em') + ] + + +class LinkInlineProcessor(InlineProcessor): + """ Return a link element from the given match. """ + RE_LINK = re.compile(r'''\(\s*(?:(<[^<>]*>)\s*(?:('[^']*'|"[^"]*")\s*)?\))?''', re.DOTALL | re.UNICODE) + RE_TITLE_CLEAN = re.compile(r'\s') + + def handleMatch(self, m, data): + text, index, handled = self.getText(data, m.end(0)) + + if not handled: + return None, None, None + + href, title, index, handled = self.getLink(data, index) + if not handled: + return None, None, None + + el = etree.Element("a") + el.text = text + + el.set("href", href) + + if title is not None: + el.set("title", title) + + return el, m.start(0), index + + def getLink(self, data, index): + """Parse data between `()` of `[Text]()` allowing recursive `()`. """ + + href = '' + title = None + handled = False + + m = self.RE_LINK.match(data, pos=index) + if m and m.group(1): + # Matches [Text](<link> "title") + href = m.group(1)[1:-1].strip() + if m.group(2): + title = m.group(2)[1:-1] + index = m.end(0) + handled = True + elif m: + # Track bracket nesting and index in string + bracket_count = 1 + backtrack_count = 1 + start_index = m.end() + index = start_index + last_bracket = -1 + + # Primary (first found) quote tracking. + quote = None + start_quote = -1 + exit_quote = -1 + ignore_matches = False + + # Secondary (second found) quote tracking. + alt_quote = None + start_alt_quote = -1 + exit_alt_quote = -1 + + # Track last character + last = '' + + for pos in range(index, len(data)): + c = data[pos] + if c == '(': + # Count nested ( + # Don't increment the bracket count if we are sure we're in a title. + if not ignore_matches: + bracket_count += 1 + elif backtrack_count > 0: + backtrack_count -= 1 + elif c == ')': + # Match nested ) to ( + # Don't decrement if we are sure we are in a title that is unclosed. + if ((exit_quote != -1 and quote == last) or (exit_alt_quote != -1 and alt_quote == last)): + bracket_count = 0 + elif not ignore_matches: + bracket_count -= 1 + elif backtrack_count > 0: + backtrack_count -= 1 + # We've found our backup end location if the title doesn't resolve. + if backtrack_count == 0: + last_bracket = index + 1 + + elif c in ("'", '"'): + # Quote has started + if not quote: + # We'll assume we are now in a title. + # Brackets are quoted, so no need to match them (except for the final one). + ignore_matches = True + backtrack_count = bracket_count + bracket_count = 1 + start_quote = index + 1 + quote = c + # Secondary quote (in case the first doesn't resolve): [text](link'"title") + elif c != quote and not alt_quote: + start_alt_quote = index + 1 + alt_quote = c + # Update primary quote match + elif c == quote: + exit_quote = index + 1 + # Update secondary quote match + elif alt_quote and c == alt_quote: + exit_alt_quote = index + 1 + + index += 1 + + # Link is closed, so let's break out of the loop + if bracket_count == 0: + # Get the title if we closed a title string right before link closed + if exit_quote >= 0 and quote == last: + href = data[start_index:start_quote - 1] + title = ''.join(data[start_quote:exit_quote - 1]) + elif exit_alt_quote >= 0 and alt_quote == last: + href = data[start_index:start_alt_quote - 1] + title = ''.join(data[start_alt_quote:exit_alt_quote - 1]) + else: + href = data[start_index:index - 1] + break + + if c != ' ': + last = c + + # We have a scenario: [test](link"notitle) + # When we enter a string, we stop tracking bracket resolution in the main counter, + # but we do keep a backup counter up until we discover where we might resolve all brackets + # if the title string fails to resolve. + if bracket_count != 0 and backtrack_count == 0: + href = data[start_index:last_bracket - 1] + index = last_bracket + bracket_count = 0 + + handled = bracket_count == 0 + + if title is not None: + title = self.RE_TITLE_CLEAN.sub(' ', dequote(self.unescape(title.strip()))) + + href = self.unescape(href).strip() + + return href, title, index, handled + + def getText(self, data, index): + """Parse the content between `[]` of the start of an image or link + resolving nested square brackets. + + """ + bracket_count = 1 + text = [] + for pos in range(index, len(data)): + c = data[pos] + if c == ']': + bracket_count -= 1 + elif c == '[': + bracket_count += 1 + index += 1 + if bracket_count == 0: + break + text.append(c) + return ''.join(text), index, bracket_count == 0 + + +class ImageInlineProcessor(LinkInlineProcessor): """ Return a img element from the given match. """ - def handleMatch(self, m): - el = markdown.etree.Element("img") - src_parts = m.group(9).split() - if src_parts: - src = src_parts[0] - if src[0] == "<" and src[-1] == ">": - src = src[1:-1] - el.set('src', self.sanitize_url(src)) - else: - el.set('src', "") - if len(src_parts) > 1: - el.set('title', dequote(" ".join(src_parts[1:]))) - if markdown.ENABLE_ATTRIBUTES: - truealt = handleAttributes(m.group(2), el) - else: - truealt = m.group(2) + def handleMatch(self, m, data): + text, index, handled = self.getText(data, m.end(0)) + if not handled: + return None, None, None - el.set('alt', truealt) - return el + src, title, index, handled = self.getLink(data, index) + if not handled: + return None, None, None -class ReferencePattern(LinkPattern): + el = etree.Element("img") + + el.set("src", src) + + if title is not None: + el.set("title", title) + + el.set('alt', self.unescape(text)) + return el, m.start(0), index + + +class ReferenceInlineProcessor(LinkInlineProcessor): """ Match to a stored reference and return link element. """ - def handleMatch(self, m): - if m.group(9): - id = m.group(9).lower() - else: - # if we got something like "[Google][]" - # we'll use "google" as the id - id = m.group(2).lower() + NEWLINE_CLEANUP_RE = re.compile(r'\s+', re.MULTILINE) - if not id in self.markdown.references: # ignore undefined refs - return None - href, title = self.markdown.references[id] + RE_LINK = re.compile(r'\s?\[([^\]]*)\]', re.DOTALL | re.UNICODE) - text = m.group(2) - return self.makeTag(href, title, text) + def handleMatch(self, m, data): + text, index, handled = self.getText(data, m.end(0)) + if not handled: + return None, None, None + + id, end, handled = self.evalId(data, index, text) + if not handled: + return None, None, None + + # Clean up linebreaks in id + id = self.NEWLINE_CLEANUP_RE.sub(' ', id) + if id not in self.md.references: # ignore undefined refs + return None, m.start(0), end + + href, title = self.md.references[id] + + return self.makeTag(href, title, text), m.start(0), end + + def evalId(self, data, index, text): + """ + Evaluate the id portion of [ref][id]. + + If [ref][] use [ref]. + """ + m = self.RE_LINK.match(data, pos=index) + if not m: + return None, index, False + else: + id = m.group(1).lower() + end = m.end(0) + if not id: + id = text.lower() + return id, end, True def makeTag(self, href, title, text): - el = markdown.etree.Element('a') + el = etree.Element('a') - el.set('href', self.sanitize_url(href)) + el.set('href', href) if title: el.set('title', title) @@ -323,49 +822,65 @@ class ReferencePattern(LinkPattern): return el -class ImageReferencePattern (ReferencePattern): +class ShortReferenceInlineProcessor(ReferenceInlineProcessor): + """Short form of reference: [google]. """ + def evalId(self, data, index, text): + """Evaluate the id from of [ref] """ + + return text.lower(), index, True + + +class ImageReferenceInlineProcessor(ReferenceInlineProcessor): """ Match to a stored reference and return img element. """ def makeTag(self, href, title, text): - el = markdown.etree.Element("img") - el.set("src", self.sanitize_url(href)) + el = etree.Element("img") + el.set("src", href) if title: el.set("title", title) - el.set("alt", text) + el.set("alt", self.unescape(text)) return el -class AutolinkPattern (Pattern): +class ShortImageReferenceInlineProcessor(ImageReferenceInlineProcessor): + """ Short form of inage reference: ![ref]. """ + def evalId(self, data, index, text): + """Evaluate the id from of [ref] """ + + return text.lower(), index, True + + +class AutolinkInlineProcessor(InlineProcessor): """ Return a link Element given an autolink (`<http://example/com>`). """ - def handleMatch(self, m): - el = markdown.etree.Element("a") - el.set('href', m.group(2)) - el.text = markdown.AtomicString(m.group(2)) - return el + def handleMatch(self, m, data): + el = etree.Element("a") + el.set('href', self.unescape(m.group(1))) + el.text = util.AtomicString(m.group(1)) + return el, m.start(0), m.end(0) + -class AutomailPattern (Pattern): +class AutomailInlineProcessor(InlineProcessor): """ Return a mailto link Element given an automail link (`<foo@example.com>`). """ - def handleMatch(self, m): - el = markdown.etree.Element('a') - email = m.group(2) + def handleMatch(self, m, data): + el = etree.Element('a') + email = self.unescape(m.group(1)) if email.startswith("mailto:"): email = email[len("mailto:"):] def codepoint2name(code): """Return entity definition by code, or the code if not defined.""" - entity = htmlentitydefs.codepoint2name.get(code) + entity = entities.codepoint2name.get(code) if entity: - return "%s%s;" % (markdown.AMP_SUBSTITUTE, entity) + return "{}{};".format(util.AMP_SUBSTITUTE, entity) else: - return "%s#%d;" % (markdown.AMP_SUBSTITUTE, code) + return "%s#%d;" % (util.AMP_SUBSTITUTE, code) letters = [codepoint2name(ord(letter)) for letter in email] - el.text = markdown.AtomicString(''.join(letters)) + el.text = util.AtomicString(''.join(letters)) mailto = "mailto:" + email - mailto = "".join([markdown.AMP_SUBSTITUTE + '#%d;' % + mailto = "".join([util.AMP_SUBSTITUTE + '#%d;' % ord(letter) for letter in mailto]) el.set('href', mailto) - return el - + return el, m.start(0), m.end(0) |