From 17e6c67530bf15b8f06bb7bdf4b139c4d4e2f1ea Mon Sep 17 00:00:00 2001
From: Ivan Gotovchits <ivg@ieee.org>
Date: Thu, 2 Feb 2017 16:14:37 -0500
Subject: [PATCH] simplifies quotation in the comment parser

This commit makes the comment parser more robust. Instead of trying to
support both kinds of quotes (single and double), we now use double
quotes as the only way to delimit a token that contains non-word
characters. A single quote is no longer considered a special symbol and
should not be escaped.

For the future pioneers of comment parsing I added a debug option that
is passed to the underlying shlex lexer (where all the problems
usually happen). If `debug=3`, the lexer will print every single state.

In case even this more robust version fails, I've added a final line
of defense: if a comment cannot be parsed, the BAP comments View will
not fail, but will just ignore the failed comment (of course, a
diagnostic message will be printed into the message box).
---
 plugins/bap/plugins/bap_comments.py | 12 ++++++++----
 plugins/bap/utils/bap_comment.py    | 19 ++++++++-----------
 tests/test_bap_comment.py           |  9 +++++++--
 3 files changed, 23 insertions(+), 17 deletions(-)

diff --git a/plugins/bap/plugins/bap_comments.py b/plugins/bap/plugins/bap_comments.py
index b78e572..da37427 100644
--- a/plugins/bap/plugins/bap_comments.py
+++ b/plugins/bap/plugins/bap_comments.py
@@ -45,10 +45,14 @@ def run(self, arg):
         for addr in ida.addresses():
             comm = idaapi.get_cmt(addr, 0)
             if comm:
-                parsed = bap_comment.parse(comm)
-                if parsed:
-                    for (name, data) in parsed.items():
-                        comms[(addr, name)] = data
+                try:
+                    parsed = bap_comment.parse(comm)
+                    if parsed:
+                        for (name, data) in parsed.items():
+                            comms[(addr, name)] = data
+                except:
+                    idc.Message("BAP> failed to parse string {0}\n{1}".
+                                format(comm, str(sys.exc_info()[1])))
         comms = [(name, addr, data)
                  for ((addr, name), data) in comms.items()]
         attrs = Attributes(comms)
diff --git a/plugins/bap/utils/bap_comment.py b/plugins/bap/utils/bap_comment.py
index e6207b7..245d95c 100644
--- a/plugins/bap/utils/bap_comment.py
+++ b/plugins/bap/utils/bap_comment.py
@@ -11,7 +11,7 @@
 Basically, the comment string includes an arbitrary amount of
 key=value pairs. If a value contains whitespaces, punctuation or any
 non-word character, then it should be delimited with double quotes. If
-a value contains quote character, then it should be escaped with the
+a value contains a quote character, then it should be escaped with the
 backslash character (the backslash character can escape
 itself). Properties that doesn't have values (or basically has a
 property of a unit type, so called boolean properties) are represented
@@ -96,15 +96,17 @@
 WORDCHARS = ''.join(['-:', string.ascii_letters, string.digits])
 
 
-def parse(comment):
+def parse(comment, debug=0):
     """ Parse comment string.
 
     Returns a dictionary that maps properties to their values.
     Raises SyntaxError if the comment is syntactically incorrect.
     Returns None if comment doesn't start with the `BAP:` prefix.
     """
-    lexer = shlex(comment)
+    lexer = shlex(comment, posix=True)
     lexer.wordchars = WORDCHARS
+    lexer.debug = debug
+    lexer.quotes = '"'
     result = {}
     key = ''
     values = []
@@ -193,14 +195,9 @@ def quote(token):
     >>> quote('hello, world')
     '"hello, world"'
     """
-    if set(token) - set(WORDCHARS):
-        if "'" not in token:
-            return "'{}'".format(token)
-        elif '"' not in token:
-            return '"{}"'.format(token)
-        else:  # we ran out of quotes, so we need
-            return "'{}'".format(''.join('\\'+c if c == "'" else c
-                                         for c in token))
+    if not token.startswith('"') and set(token) - set(WORDCHARS):
+        return '"{}"'.format(''.join('\\'+c if c == '"' else c
+                                     for c in token))
     else:
         return token
 
diff --git a/tests/test_bap_comment.py b/tests/test_bap_comment.py
index e100119..655131b 100644
--- a/tests/test_bap_comment.py
+++ b/tests/test_bap_comment.py
@@ -18,7 +18,7 @@ def test_dumps():
     assert 'BAP:' in dumps({'hello': []})
     assert dumps({'hello': ['cruel', 'world'], 'nice': [], 'thing': []}) == \
         'BAP: nice,thing hello=cruel,world'
-    assert dumps({'hello': ["world\'"]}) == 'BAP: hello="world\'"'
+    assert dumps({'hello': ["world'"]}) == 'BAP: hello="world\'"'
 
 
 def test_is_valid():
@@ -39,6 +39,11 @@ def test_roundup():
 
 
 def test_quotation():
-    data = 'BAP: chars=\'{"a", "b", "c"}\''
+    data = 'BAP: chars="{\\\"a\\\", \\\"b\\\", \\\"c\\\"}"'
     assert parse(data) == {'chars': ['{"a", "b", "c"}']}
     assert parse(data) == parse(dumps(parse(data)))
+
+
+def test_single_quote():
+    data = 'BAP: key="{can\\\'t do}"'
+    assert parse(data) == {'key': ["{can\\'t do}"]}