integrate c2c_markdown into v6_api (#1043)

* integrate c2c_markdown into v6_api * update bleach version
c2corg · Apr 19, 2021 · eb40019 · eb40019
1 parent 2430ee2
commit eb40019
Show file tree

Hide file tree

Showing 236 changed files with 2,405 additions and 4 deletions.
diff --git a/c2corg_api/markdown/README.md b/c2corg_api/markdown/README.md
@@ -0,0 +1,25 @@
+# Parsing the custom formatting syntax of camptocamp.org
+
+## Syntax
+
+Camptocamp.org uses markdown to format the text attributes of its documents. It relies on the base features of [Python-Markdown](https://github.com/waylan/Python-Markdown).
+
+On top of these features, the following custom tags are added:
+
+* LTag `L# | 6a | tremendous pitch`
+* Emojis `:smile:`
+* images `[img=123]Legend[/img]`
+* toc `[toc]`
+* alerts `!!!! This is an alert banner`
+* wikilinks `[[routes/123|Walker ridge]]`
+* custom headers `## Approach # 10 mn`
+* ptag (hard new line) `[p]`
+* video `[video]https://youtube.com/123[/video]`
+
+## Sanitizer
+
+The output is sanitized against XSS injection using [Mozilla Bleach](https://github.com/mozilla/bleach).
+
+## Reliability
+
+This parser has been tested and fuzzed (~100,000,000 tests). Issues have also been found in python markdown and bleach: [1](https://github.com/mozilla/bleach/issues/352), [2](https://github.com/Python-Markdown/markdown/issues/643), [3](https://github.com/Python-Markdown/markdown/issues/640) and [4](https://github.com/Python-Markdown/markdown/issues/639) :sunglasses:.
diff --git a/c2corg_api/markdown/__init__.py b/c2corg_api/markdown/__init__.py
@@ -0,0 +1,184 @@
+import markdown
+import bleach
+import binascii
+import os
+from threading import RLock
+
+from c2corg_api.markdown.wikilinks import C2CWikiLinkExtension
+from c2corg_api.markdown.img import C2CImageExtension
+from c2corg_api.markdown.video import C2CVideoExtension
+from c2corg_api.markdown.ltag import C2CLTagExtension
+from c2corg_api.markdown.header import C2CHeaderExtension
+from c2corg_api.markdown.ptag import C2CPTagExtension
+from c2corg_api.markdown.alerts import AlertExtension
+from c2corg_api.markdown.toc import C2CTocExtension
+from c2corg_api.markdown.emojis import C2CEmojiExtension
+from c2corg_api.markdown.nbsp import C2CNbspExtension
+from markdown.extensions.nl2br import Nl2BrExtension
+
+
+def _get_secret():
+    return binascii.hexlify(os.urandom(32)).decode('ascii')
+
+
+# HTML fragment returned by parse_code() in place of the rendered text when
+# the markdown parser or the cleaner raises.
+_PARSER_EXCEPTION_MESSAGE = """
+<div c2c:role="danger" style="font-weight:bold">
+Parser error, please send a mail to
+<a href="mailto:dev@camptocamp.org">dev@camptocamp.org</a>
+or post a message on
+<a href="https://forum.camptocamp.org/c/site-et-association/v6-suggestions-bugs-et-problemes">
+forum</a>.
+</div>
+"""  # noqa
+
+# Held by parse_code() for the whole parse + clean sequence.
+# RLock because this lock can be released
+# only by the thread who acquires it.
+_parser_lock = RLock()
+
+# Module-level caches, filled on first use by _get_markdown_parser() and
+# _get_cleaner() respectively.
+_markdown_parser = None
+_cleaner = None
+# Random placeholder tag name, regenerated each time this module is imported.
+# It is handed to the video extension and whitelisted in the cleaner;
+# parse_code() replaces it with "iframe" once cleaning is done.
+_iframe_secret_tag = "iframe_" + _get_secret()
+
+"""
+_***_secret_tag is used as a private key that stands in for critical HTML
+nodes and attributes. The key point is this: the parser emits the secret tag
+instead of the real node, bleach removes every critical node it finds, and a
+final step replaces the secret tag with the proper HTML node/attribute.
+
+PEP 506:
+os.urandom is the safe way to generate private data, whereas the random
+module only produces pseudo-random data that is unsuitable for secrets.
+hexlify() and decode('ascii') convert it to a lower-case string. Once v6_ui
+runs on Python 3.6 or higher, the secrets module will be used instead.
+
+How to hack C2C? To inject an iframe, you would need to know the value of
+_iframe_secret_tag, which only exists in the server's memory.
+"""
+
+
+def _get_cleaner():
+    global _cleaner
+
+    if not _cleaner:
+        allowed_tags = bleach.ALLOWED_TAGS + [
+            # headers
+            "h1", "h2", "h3", "h4", "h5", "h6",
+
+            # blocks
+            "div", "p", "pre", "hr", "center",
+
+            # inline nodes
+            "span", "br", "sub", "sup", "s", "del", "ins", "small",
+
+            # images
+            "figure", "img", "figcaption",
+
+            _iframe_secret_tag,
+
+            # tables
+            "table", "tr", "td", "th", "tbody"
+        ]
+
+        allowed_attributes = dict(bleach.ALLOWED_ATTRIBUTES)
+        allowed_extra_attributes = {
+            "a": [
+                "c2c:role",
+                "c2c:document-type",
+                "c2c:document-id",
+                "c2c:lang",
+                "c2c:slug",
+                "c2c:anchor"
+            ],
+            "h1": ["id", "c2c:role"],
+            "h2": ["id", "c2c:role"],
+            "h3": ["id", "c2c:role"],
+            "h4": ["id", "c2c:role"],
+            "h5": ["id", "c2c:role"],
+            "h6": ["id", "c2c:role"],
+            "table": ["c2c:role"],
+            "div": ["class", "style", "c2c:role"],
+            "td": ["colspan"],
+            "span": ["class", "translate", "id", "c2c:role"],
+            _iframe_secret_tag: ["src"],
+            "figure": ["c2c:position", "c2c:role", "c2c:size"],
+            "img": [
+                "alt",
+                "c2c:document-id",
+                "c2c:role",
+                "c2c:size",
+                "c2c:url-proxy",
+                "c2c:svg-name",
+                "c2c:emoji-db"
+            ],
+        }
+
+        for key in allowed_extra_attributes:
+            if key not in allowed_attributes:
+                allowed_attributes[key] = []
+
+            allowed_attributes[key] += allowed_extra_attributes[key]
+
+        _cleaner = bleach.Cleaner(tags=allowed_tags,
+                                  attributes=allowed_attributes,
+                                  styles=bleach.ALLOWED_STYLES + ["clear"],
+                                  protocols=bleach.ALLOWED_PROTOCOLS,
+                                  strip=False,
+                                  strip_comments=True)
+
+    return _cleaner
+
+
+def _get_markdown_parser():
+    global _markdown_parser
+    if not _markdown_parser:
+        extensions = [
+            C2CWikiLinkExtension(),
+            C2CImageExtension(),
+            Nl2BrExtension(),
+            C2CTocExtension(marker='[toc]', baselevel=2),
+            C2CVideoExtension(iframe_secret_tag=_iframe_secret_tag),
+            C2CLTagExtension(),
+            C2CHeaderExtension(),
+            C2CPTagExtension(),
+            AlertExtension(),
+            C2CEmojiExtension(),
+            C2CNbspExtension(),
+        ]
+        _markdown_parser = markdown.Markdown(output_format='xhtml5',
+                                             extensions=extensions,
+                                             enable_attributes=False)
+    return _markdown_parser
+
+
+def parse_code(text):
+    """
+    Get markdown, and returns HTML.
+    This function is thread-safe
+    """
+
+    # we need parsing to be thread safe because
+    # L numbering, and Markdown() has internal global variables
+
+    # for explanation about Lock context manager usage
+    # see https://docs.python.org/3/library/threading.html
+    # on paragraph 17.1.10 (with statement)
+    with _parser_lock:
+        parser = _get_markdown_parser()
+        cleaner = _get_cleaner()
+
+        # reset parser state. Otherwise, internals parser cache grows
+        # indefinitely, and performance decreases over time
+        parser.reset()
+
+        try:
+            text = parser.convert(text)
+
+            # we keep clean function into thread safe part,
+            # because we are not sure of this function
+            text = cleaner.clean(text=text)
+        except:  # noqa
+            text = _PARSER_EXCEPTION_MESSAGE
+
+    text = text.replace(_iframe_secret_tag, "iframe")
+
+    return text
diff --git a/c2corg_api/markdown/alerts.py b/c2corg_api/markdown/alerts.py
@@ -0,0 +1,73 @@
+from markdown.extensions import Extension
+from markdown.blockprocessors import BlockProcessor
+from markdown import util
+import re
+
+
+class AlertProcessor(BlockProcessor):
+    RE = re.compile(r'(^|\n)[ ]{0,3}(!{2,4})(([^!]|$).*)')
+
+    roles = {
+        "!!": "info",
+        "!!!": "warning",
+        "!!!!": "danger",
+    }
+
+    def test(self, parent, block):
+        return bool(self.RE.search(block))
+
+    def run(self, parent, blocks):
+        block = blocks.pop(0)
+        m = self.RE.search(block)
+        level = m.group(2)
+        tester = re.compile("^[ ]{0,3}" + level + "([^!]|$)")
+
+        before = block[:m.start()]  # Lines before blockquote
+        # Pass lines before alert banner
+        self.parser.parseBlocks(parent, [before])
+
+        after = block[m.start():].split('\n')
+        if len(after[0]) == 0:
+            after.pop(0)
+
+        block = []
+
+        # get all lines starting with the same prefix
+        while len(after):
+            is_same_level = bool(tester.search(after[0]))
+            if is_same_level:
+                block.append(after.pop(0))
+            else:
+                break
+
+        # Remove `!!!` from begining of each line.
+        block = '\n'.join([self.clean(line) for line in block])
+
+        quote = util.etree.SubElement(parent, 'div')
+        quote.set("c2c:role", self.roles[level])
+        # Recursively parse block with div as parent.
+        self.parser.parseChunk(quote, block)
+
+        # and continue parsing next part of the block
+        self.parser.parseBlocks(parent, ["\n".join(after)])
+
+    def clean(self, line):
+        """ Remove ``!`` from beginning of a line. """
+        m = self.RE.match(line)
+        if line.strip() in ("!!", "!!!", "!!!!"):
+            return ""
+        elif m:
+            return m.group(3)
+        else:
+            return line
+
+
+class AlertExtension(Extension):
+    def extendMarkdown(self, md, md_globals):  # noqa
+        md.parser.blockprocessors.add('c2calert',
+                                      AlertProcessor(md.parser),
+                                      "<paragraph")
+
+
+def makeExtension(*args, **kwargs):  # noqa
+    return AlertExtension(*args, **kwargs)
diff --git a/c2corg_api/markdown/emoji_databases/__init__.py b/c2corg_api/markdown/emoji_databases/__init__.py
diff --git a/c2corg_api/markdown/emoji_databases/c2c_activities.py b/c2corg_api/markdown/emoji_databases/c2c_activities.py
@@ -0,0 +1,65 @@
+# Path prefix of the activity icons.
+# NOTE(review): presumably prepended to each entry's "svg_name" by the emoji
+# extension - confirm against the consumer of this module.
+SVG_CDN = "/static/img/documents/activities/"
+
+# Identifier of this emoji database.
+name = "c2c-activities"
+
+# Maps an emoji shortcode (":name:") to its description. Every entry has
+# "category", "name" and "svg_name"; only the first two also have "unicode".
+# NOTE(review): "activitiy" looks like a misspelling of "activity". It is
+# left untouched because the value may be matched elsewhere - confirm
+# before renaming.
+emoji = {
+    ":rock_climbing:": {
+        "category": "activitiy",
+        "name": "rock climbing",
+        "svg_name": "rock_climbing",
+        "unicode": "1f9d7",
+    },
+    ":skitouring:": {
+        "category": "activitiy",
+        "name": "ski touring",
+        "svg_name": "skitouring",
+        "unicode": "26f7"
+    },
+    ":hiking:": {
+        "category": "activitiy",
+        "name": "hiking",
+        "svg_name": "hiking",
+    },
+    ":ice_climbing:": {
+        "category": "activitiy",
+        "name": "ice climbing",
+        "svg_name": "ice_climbing",
+    },
+    ":mountain_biking:": {
+        "category": "activitiy",
+        "name": "mountain biking",
+        "svg_name": "mountain_biking",
+    },
+    ":paragliding:": {
+        "category": "activitiy",
+        "name": "paragliding",
+        "svg_name": "paragliding",
+    },
+    ":slacklining:": {
+        "category": "activitiy",
+        "name": "slacklining",
+        "svg_name": "slacklining",
+    },
+    ":snow_ice_mixed:": {
+        "category": "activitiy",
+        "name": "snow ice mixed",
+        "svg_name": "snow_ice_mixed",
+    },
+    ":snowshoeing:": {
+        "category": "activitiy",
+        "name": "snowshoeing",
+        "svg_name": "snowshoeing",
+    },
+    ":via_ferrata:": {
+        "category": "activitiy",
+        "name": "via ferrata",
+        "svg_name": "via_ferrata",
+    },
+    ":mountain_climbing:": {
+        "category": "activitiy",
+        "name": "mountain climbing",
+        "svg_name": "mountain_climbing",
+    }
+}
+
+# No alternative shortcodes are defined for this database.
+aliases = {}