From 17e6c67530bf15b8f06bb7bdf4b139c4d4e2f1ea Mon Sep 17 00:00:00 2001 From: Ivan Gotovchits Date: Thu, 2 Feb 2017 16:14:37 -0500 Subject: [PATCH] simplifies quotation in the comment parser This commit makes the comment parser more robust. Instead of trying to support both kinds of quotes (single and double) we now switch to the double quotes as the only method to delimit a token that has non-wordy symbols. A single quote is no longer considered a special symbol and should not be escaped. For the future pioneers of comment parsing I added a debug option that will be passed to the underlying shlex lexer (where all the problems usually happen). If `debug=3`, then it will print every single state. In case even this robust version fails, I've added a final line of defense - if a comment cannot be parsed the BAP comments View will not fail, but will just ignore the failed comment (of course a diagnostic message will be printed into the message box). --- plugins/bap/plugins/bap_comments.py | 12 ++++++++---- plugins/bap/utils/bap_comment.py | 19 ++++++++----------- tests/test_bap_comment.py | 9 +++++++-- 3 files changed, 23 insertions(+), 17 deletions(-) diff --git a/plugins/bap/plugins/bap_comments.py b/plugins/bap/plugins/bap_comments.py index b78e572..da37427 100644 --- a/plugins/bap/plugins/bap_comments.py +++ b/plugins/bap/plugins/bap_comments.py @@ -45,10 +45,14 @@ def run(self, arg): for addr in ida.addresses(): comm = idaapi.get_cmt(addr, 0) if comm: - parsed = bap_comment.parse(comm) - if parsed: - for (name, data) in parsed.items(): - comms[(addr, name)] = data + try: + parsed = bap_comment.parse(comm) + if parsed: + for (name, data) in parsed.items(): + comms[(addr, name)] = data + except: + idc.Message("BAP> failed to parse string {0}\n{1}". 
+ format(comm, str(sys.exc_info()[1]))) comms = [(name, addr, data) for ((addr, name), data) in comms.items()] attrs = Attributes(comms) diff --git a/plugins/bap/utils/bap_comment.py b/plugins/bap/utils/bap_comment.py index e6207b7..245d95c 100644 --- a/plugins/bap/utils/bap_comment.py +++ b/plugins/bap/utils/bap_comment.py @@ -11,7 +11,7 @@ Basically, the comment string includes an arbitrary amount of key=value pairs. If a value contains whitespaces, punctuation or any non-word character, then it should be delimited with double quotes. If -a value contains quote character, then it should be escaped with the +a value contains a quote character, then it should be escaped with the backslash character (the backslash character can escape itself). Properties that doesn't have values (or basically has a property of a unit type, so called boolean properties) are represented @@ -96,15 +96,17 @@ WORDCHARS = ''.join(['-:', string.ascii_letters, string.digits]) -def parse(comment): +def parse(comment, debug=0): """ Parse comment string. Returns a dictionary that maps properties to their values. Raises SyntaxError if the comment is syntactically incorrect. Returns None if comment doesn't start with the `BAP:` prefix. 
""" - lexer = shlex(comment) + lexer = shlex(comment, posix=True) lexer.wordchars = WORDCHARS + lexer.debug = debug + lexer.quotes = '"' result = {} key = '' values = [] @@ -193,14 +195,9 @@ def quote(token): >>> quote('hello, world') '"hello, world"' """ - if set(token) - set(WORDCHARS): - if "'" not in token: - return "'{}'".format(token) - elif '"' not in token: - return '"{}"'.format(token) - else: # we ran out of quotes, so we need - return "'{}'".format(''.join('\\'+c if c == "'" else c - for c in token)) + if not token.startswith('"') and set(token) - set(WORDCHARS): + return '"{}"'.format(''.join('\\'+c if c == '"' else c + for c in token)) else: return token diff --git a/tests/test_bap_comment.py b/tests/test_bap_comment.py index e100119..655131b 100644 --- a/tests/test_bap_comment.py +++ b/tests/test_bap_comment.py @@ -18,7 +18,7 @@ def test_dumps(): assert 'BAP:' in dumps({'hello': []}) assert dumps({'hello': ['cruel', 'world'], 'nice': [], 'thing': []}) == \ 'BAP: nice,thing hello=cruel,world' - assert dumps({'hello': ["world\'"]}) == 'BAP: hello="world\'"' + assert dumps({'hello': ["world'"]}) == 'BAP: hello="world\'"' def test_is_valid(): @@ -39,6 +39,11 @@ def test_roundup(): def test_quotation(): - data = 'BAP: chars=\'{"a", "b", "c"}\'' + data = 'BAP: chars="{\\\"a\\\", \\\"b\\\", \\\"c\\\"}"' assert parse(data) == {'chars': ['{"a", "b", "c"}']} assert parse(data) == parse(dumps(parse(data))) + + +def test_single_quote(): + data = 'BAP: key="{can\\\'t do}"' + assert parse(data) == {'key': ["{can\\'t do}"]}