Merge pull request #27 from ivg/robust-comment-parser

simplifies quotation in the comment parser
BinaryAnalysisPlatform · Feb 2, 2017 · f9a2255 · f9a2255
2 parents 07ad00c + 17e6c67
commit f9a2255
Show file tree

Hide file tree

Showing 3 changed files with 23 additions and 17 deletions.
diff --git a/plugins/bap/plugins/bap_comments.py b/plugins/bap/plugins/bap_comments.py
@@ -45,10 +45,14 @@ def run(self, arg):
         for addr in ida.addresses():
             comm = idaapi.get_cmt(addr, 0)
             if comm:
-                parsed = bap_comment.parse(comm)
-                if parsed:
-                    for (name, data) in parsed.items():
-                        comms[(addr, name)] = data
+                try:
+                    parsed = bap_comment.parse(comm)
+                    if parsed:
+                        for (name, data) in parsed.items():
+                            comms[(addr, name)] = data
+                except:
+                    idc.Message("BAP> failed to parse string {0}\n{1}".
+                                format(comm, str(sys.exc_info()[1])))
         comms = [(name, addr, data)
                  for ((addr, name), data) in comms.items()]
         attrs = Attributes(comms)

diff --git a/plugins/bap/utils/bap_comment.py b/plugins/bap/utils/bap_comment.py
@@ -11,7 +11,7 @@
 Basically, the comment string includes an arbitrary amount of
 key=value pairs. If a value contains whitespaces, punctuation or any
 non-word character, then it should be delimited with double quotes. If
-a value contains quote character, then it should be escaped with the
+a value contains a quote character, then it should be escaped with the
 backslash character (the backslash character can escape
 itself). Properties that doesn't have values (or basically has a
 property of a unit type, so called boolean properties) are represented
@@ -96,15 +96,17 @@
 WORDCHARS = ''.join(['-:', string.ascii_letters, string.digits])
 
 
-def parse(comment):
+def parse(comment, debug=0):
     """ Parse comment string.
 
     Returns a dictionary that maps properties to their values.
     Raises SyntaxError if the comment is syntactically incorrect.
     Returns None if comment doesn't start with the `BAP:` prefix.
     """
-    lexer = shlex(comment)
+    lexer = shlex(comment, posix=True)
     lexer.wordchars = WORDCHARS
+    lexer.debug = debug
+    lexer.quotes = '"'
     result = {}
     key = ''
     values = []
@@ -193,14 +195,9 @@ def quote(token):
     >>> quote('hello, world')
     '"hello, world"'
     """
-    if set(token) - set(WORDCHARS):
-        if "'" not in token:
-            return "'{}'".format(token)
-        elif '"' not in token:
-            return '"{}"'.format(token)
-        else:  # we ran out of quotes, so we need
-            return "'{}'".format(''.join('\\'+c if c == "'" else c
-                                         for c in token))
+    if not token.startswith('"') and set(token) - set(WORDCHARS):
+        return '"{}"'.format(''.join('\\'+c if c == '"' else c
+                                     for c in token))
     else:
         return token
 

diff --git a/tests/test_bap_comment.py b/tests/test_bap_comment.py
@@ -18,7 +18,7 @@ def test_dumps():
     assert 'BAP:' in dumps({'hello': []})
     assert dumps({'hello': ['cruel', 'world'], 'nice': [], 'thing': []}) == \
         'BAP: nice,thing hello=cruel,world'
-    assert dumps({'hello': ["world\'"]}) == 'BAP: hello="world\'"'
+    assert dumps({'hello': ["world'"]}) == 'BAP: hello="world\'"'
 
 
 def test_is_valid():
@@ -39,6 +39,11 @@ def test_roundup():
 
 
 def test_quotation():
-    data = 'BAP: chars=\'{"a", "b", "c"}\''
+    data = 'BAP: chars="{\\\"a\\\", \\\"b\\\", \\\"c\\\"}"'
     assert parse(data) == {'chars': ['{"a", "b", "c"}']}
     assert parse(data) == parse(dumps(parse(data)))
+
+
+def test_single_quote():
+    data = 'BAP: key="{can\\\'t do}"'
+    assert parse(data) == {'key': ["{can\\'t do}"]}