-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathjsonply.py
executable file
·467 lines (382 loc) · 11.8 KB
/
jsonply.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
#!/usr/bin/python2.5
# Copyright 2009 DeWitt Clinton All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
'''A JSON parser built using the PLY (Python Lex-Yacc) library.
Sample usage:
>>> import jsonply
>>> jsonply.parse('{"foo": "bar", "arr": [1, {"a": -2.50e4}, true]}')
{u'arr': [1, {u'a': -25000.0}, True], u'foo': u'bar'}
'''
__author__ = 'dewitt@unto.net'
__version__ = '0.1-devel'
import ply
import ply.lex
import ply.yacc
import sys
# Every token name the JsonLexer may emit and the JsonParser may consume.
# Both classes expose this list as their `tokens` attribute, so it is the
# shared contract between them: a change here must be mirrored in both
# the lexer rules and the grammar.  The list is assembled from one
# group per lexer state.
JSON_TOKENS = (
    # Tokens produced in the initial (default) state
    [
        'BEGIN_ARRAY', 'BEGIN_OBJECT',
        'END_ARRAY', 'END_OBJECT',
        'NAME_SEPARATOR', 'VALUE_SEPARATOR',
        'QUOTATION_MARK',
        'FALSE', 'TRUE', 'NULL',
        'DECIMAL_POINT', 'DIGITS', 'E', 'MINUS', 'PLUS', 'ZERO',
    ]
    # Tokens produced in the string state
    + [
        'UNESCAPED', 'ESCAPE',
    ]
    # Tokens produced in the escaped state
    + [
        'REVERSE_SOLIDUS', 'SOLIDUS',
        'BACKSPACE_CHAR', 'FORM_FEED_CHAR', 'LINE_FEED_CHAR',
        'CARRIAGE_RETURN_CHAR', 'TAB_CHAR',
        'UNICODE_HEX',
    ]
)
class JsonLexer(object):
    '''A class-based wrapper around the ply.lex instance.

    The JsonLexer tokenizes an input string and produces LexToken instances
    corresponding to the JSON_TOKENS values.

    ply.lex takes the regular expression of each t_* method from that
    method's docstring, so those docstrings are patterns rather than prose;
    explanatory notes for the rules are written as comments instead.
    '''

    def __init__(self, **kwargs):
        '''Constructs the JsonLexer based on the tokenization rules herein.

        Successful construction builds the ply.lex instance and sets
        self.lexer.

        Args:
          **kwargs: Forwarded unchanged to ply.lex.lex().
        '''
        self.lexer = ply.lex.lex(module=self, **kwargs)

    # The JsonLexer uses the JSON_TOKENS values as a contract between
    # the lexer and the parser.
    tokens = JSON_TOKENS

    # The JsonLexer has three exclusive states:
    #
    #   default:
    #     The default context, tokenizing objects, arrays, numbers, etc.
    #   string:
    #     Within quote-delimited strings.
    #   escaped:
    #     A single-use state that treats the next character literally.
    states = (
        ('string', 'exclusive'),
        ('escaped', 'exclusive')
    )

    # Reports an unrecognized character (in any state) and skips past it.
    def t_ANY_error(self, t):
        data = t.lexer.lexdata
        # rfind() yields -1 when there is no earlier newline, so adding one
        # gives the index of the first character of the current line in
        # both cases; columns are therefore 1-based on every line.
        line_start = data.rfind('\n', 0, t.lexpos) + 1
        column = (t.lexpos - line_start) + 1
        # No rule in this lexer advances lexer.lineno, so the line number is
        # derived from the newlines that precede the offending character.
        line = data.count('\n', 0, t.lexpos) + 1
        print("Illegal character '%s' at line %d pos %d" % (
            t.value[0], line, column))
        t.lexer.skip(1)

    # Skips over '\s', '\t', '\n', and '\r' characters in the default state
    t_ignore = '\x20\x09\x0A\x0D'

    # Default state tokens
    t_BEGIN_ARRAY = r'\x5B'  # '['
    t_BEGIN_OBJECT = r'\x7B'  # '{'
    t_END_ARRAY = r'\x5D'  # ']'
    t_END_OBJECT = r'\x7D'  # '}'
    t_NAME_SEPARATOR = r'\x3A'  # ':'
    t_VALUE_SEPARATOR = r'\x2C'  # ','
    t_FALSE = r'\x66\x61\x6c\x73\x65'  # 'false'
    t_TRUE = r'\x74\x72\x75\x65'  # 'true'
    t_NULL = r'\x6e\x75\x6c\x6c'  # 'null'
    t_DECIMAL_POINT = r'\x2E'  # '.'
    t_DIGITS = r'[\x30-\x39]+'  # '0'..'9'
    t_E = r'[\x45\x65]'  # 'e' or 'E'
    t_MINUS = r'\x2D'  # '-'
    t_PLUS = r'\x2B'  # '+'
    t_ZERO = r'\x30'  # '0'

    # Enters the string state on an opening quotation mark
    def t_QUOTATION_MARK(self, t):
        r'\x22'  # '"'
        t.lexer.push_state('string')
        return t

    # Don't skip over any tokens inside the string state
    t_string_ignore = ''

    # TODO(dewitt): Verify that this matches the correct range, the spec
    # says '%x5D-10FFFF' but most pythons by default will not handle that
    #
    # The commas inside the character class are literal commas; ',' (0x2C)
    # already lies inside the \x23-\x5B range, so they are redundant.
    def t_string_UNESCAPED(self, t):
        r'[\x20-\x21,\x23-\x5B,\x5D-\xFF]+'
        # Only byte strings need decoding; calling unicode() with an
        # encoding on a value that is already unicode raises TypeError.
        if not isinstance(t.value, unicode):
            t.value = unicode(t.value, encoding='utf8')
        return t

    # Exits the string state on an unescaped closing quotation mark
    def t_string_QUOTATION_MARK(self, t):
        r'\x22'  # '"'
        t.lexer.pop_state()
        return t

    # Enter the escaped state on a '\' character
    def t_string_ESCAPE(self, t):
        r'\x5C'  # '\'
        t.lexer.push_state('escaped')
        return t

    # Don't skip over any tokens inside the escaped state
    t_escaped_ignore = ''

    # Every rule below pops the escaped state, returning to the string
    # state after exactly one escape sequence has been consumed.

    def t_escaped_QUOTATION_MARK(self, t):
        r'\x22'  # '"'
        t.lexer.pop_state()
        return t

    def t_escaped_REVERSE_SOLIDUS(self, t):
        r'\x5C'  # '\'
        t.lexer.pop_state()
        return t

    def t_escaped_SOLIDUS(self, t):
        r'\x2F'  # '/'
        t.lexer.pop_state()
        return t

    def t_escaped_BACKSPACE_CHAR(self, t):
        r'\x62'  # 'b'
        t.lexer.pop_state()
        t.value = unichr(0x0008)
        return t

    def t_escaped_FORM_FEED_CHAR(self, t):
        r'\x66'  # 'f'
        t.lexer.pop_state()
        t.value = unichr(0x000c)
        return t

    def t_escaped_CARRIAGE_RETURN_CHAR(self, t):
        r'\x72'  # 'r'
        t.lexer.pop_state()
        t.value = unichr(0x000d)
        return t

    def t_escaped_LINE_FEED_CHAR(self, t):
        r'\x6E'  # 'n'
        t.lexer.pop_state()
        t.value = unichr(0x000a)
        return t

    def t_escaped_TAB_CHAR(self, t):
        r'\x74'  # 't'
        t.lexer.pop_state()
        t.value = unichr(0x0009)
        return t

    # The character class lists only hex digits.  It previously contained
    # literal commas, which let input such as '\u12,4' through as a
    # UNICODE_HEX token that could not be converted to a code point.
    def t_escaped_UNICODE_HEX(self, t):
        r'\x75[\x30-\x39\x41-\x46\x61-\x66]{4}'  # 'uXXXX'
        t.lexer.pop_state()
        return t

    def tokenize(self, data, *args, **kwargs):
        '''Invoke the lexer on an input string and return the list of tokens.

        This is relatively inefficient and should only be used for
        testing/debugging as it slurps up all tokens into one list.

        Args:
          data: The input to be tokenized.
          *args: Accepted for call compatibility; not used.
          **kwargs: Accepted for call compatibility; not used.
        Returns:
          A list of LexTokens
        '''
        self.lexer.input(data)
        collected = []
        while True:
            token = self.lexer.token()
            if not token:
                # token() returned a false value: the input is exhausted.
                break
            collected.append(token)
        return collected
class JsonParser(object):
    '''A class-based wrapper around the ply.yacc instance.

    The JsonParser takes the tokenized output from the JsonLexer and
    parses it according to the JSON grammar rules.  The output is a
    python data structure that represents the input data.

    ply.yacc reads each grammar production from the docstring of the
    corresponding p_* method, so those docstrings are grammar text rather
    than prose; explanatory notes are written as comments instead.
    '''

    def __init__(self, lexer=None, **kwargs):
        '''Constructs the JsonParser based on the grammar contained herein.

        Successful construction builds the ply.yacc instance and sets
        self.parser.

        Args:
          lexer: A ply.lex or JsonLexer instance that will produce
            JSON_TOKENS.  When omitted a new JsonLexer is created.
          **kwargs: Forwarded unchanged to ply.yacc.yacc().
        '''
        if lexer is not None:
            if isinstance(lexer, JsonLexer):
                # Unwrap the underlying ply.lex instance.
                self.lexer = lexer.lexer
            else:
                # Assume that the lexer is a ply.lex instance or similar
                self.lexer = lexer
        else:
            self.lexer = JsonLexer().lexer
        self.parser = ply.yacc.yacc(module=self, **kwargs)

    # The JsonParser uses the JSON_TOKENS values as a contract between
    # the lexer and the parser.
    tokens = JSON_TOKENS

    # Define the parser

    # Start symbol: a JSON text is an object or an array.
    def p_text(self, p):
        '''text : object
                | array'''
        p[0] = p[1]

    def p_value(self, p):
        '''value : object
                 | array
                 | number
                 | string'''
        p[0] = p[1]

    def p_value_false(self, p):
        '''value : FALSE'''
        p[0] = False

    def p_value_true(self, p):
        '''value : TRUE'''
        p[0] = True

    def p_value_null(self, p):
        '''value : NULL'''
        p[0] = None

    # members is a list of (name, value) tuples; dict() builds the object.
    def p_object(self, p):
        '''object : BEGIN_OBJECT members END_OBJECT'''
        p[0] = dict(p[2])

    # Builds the member list left-recursively.  As written, the grammar
    # makes the VALUE_SEPARATOR after each member optional, so a trailing
    # separator, or none at all between members, is accepted.
    def p_members(self, p):
        '''members :
                   | members member VALUE_SEPARATOR
                   | members member'''
        if len(p) == 1:
            # Empty production: start a fresh list.
            p[0] = list()
        else:
            p[1].append(p[2])
            p[0] = p[1]

    def p_member(self, p):
        '''member : string NAME_SEPARATOR value'''
        p[0] = (p[1], p[3])

    # Same shape as p_members: the separator after each value is optional.
    def p_values(self, p):
        '''values :
                  | values value VALUE_SEPARATOR
                  | values value'''
        if len(p) == 1:
            # Empty production: start a fresh list.
            p[0] = list()
        else:
            p[1].append(p[2])
            p[0] = p[1]

    def p_array(self, p):
        '''array : BEGIN_ARRAY values END_ARRAY'''
        p[0] = p[2]

    def p_number_positive(self, p):
        '''number : integer
                  | float'''
        p[0] = p[1]

    def p_number_negative(self, p):
        '''number : MINUS integer
                  | MINUS float'''
        p[0] = -p[2]

    def p_integer(self, p):
        '''integer : int'''
        p[0] = p[1]

    def p_integer_exp(self, p):
        '''integer : int exp'''
        p[0] = p[1] * (10**p[2])

    # frac is already a float, so the sum is the full decimal value.
    def p_number_float(self, p):
        '''float : int frac'''
        p[0] = p[1] + p[2]

    def p_number_float_exp(self, p):
        '''float : int frac exp'''
        p[0] = (p[1] + p[2]) * (10**p[3])

    def p_exp_negative(self, p):
        '''exp : E MINUS DIGITS'''
        p[0] = -int(p[3])

    def p_exp(self, p):
        '''exp : E DIGITS'''
        p[0] = int(p[2])

    def p_exp_positive(self, p):
        '''exp : E PLUS DIGITS'''
        p[0] = int(p[3])

    # Prefixing '.' lets float() interpret the digits as the fraction.
    def p_frac(self, p):
        '''frac : DECIMAL_POINT DIGITS'''
        p[0] = float('.' + p[2])

    def p_int_zero(self, p):
        '''int : ZERO'''
        p[0] = int(0)

    # ply.lex tries string-defined rules in order of decreasing pattern
    # length, so the longer DIGITS pattern also claims a lone '0' and that
    # digit arrives here instead of as ZERO.  Only a multi-digit run that
    # starts with '0' is a genuine leading zero.
    def p_int_non_zero(self, p):
        '''int : DIGITS'''
        if len(p[1]) > 1 and p[1].startswith('0'):
            raise SyntaxError('Leading zeroes are not allowed.')
        p[0] = int(p[1])

    def p_string(self, p):
        '''string : QUOTATION_MARK chars QUOTATION_MARK'''
        p[0] = p[2]

    # Concatenates the characters of a string; the empty production
    # yields an empty unicode string.
    def p_chars(self, p):
        '''chars :
                 | chars char'''
        if len(p) == 1:
            p[0] = unicode()
        else:
            p[0] = p[1] + p[2]

    def p_char(self, p):
        '''char : UNESCAPED
                | ESCAPE QUOTATION_MARK
                | ESCAPE REVERSE_SOLIDUS
                | ESCAPE SOLIDUS
                | ESCAPE BACKSPACE_CHAR
                | ESCAPE FORM_FEED_CHAR
                | ESCAPE LINE_FEED_CHAR
                | ESCAPE CARRIAGE_RETURN_CHAR
                | ESCAPE TAB_CHAR'''
        # Because the subscript [-1] has special meaning for YaccProduction
        # slices we use [len(p) - 1] to always take the last value.
        p[0] = p[len(p) - 1]

    def p_char_unicode_hex(self, p):
        '''char : ESCAPE UNICODE_HEX'''
        # This looks more complicated than it is.  The escaped string is of
        # the form \uXXXX and is assigned to p[2].  We take the trailing
        # XXXX string via p[2][1:], parse it as a radix 16 (hex) integer,
        # and convert that to the corresponding unicode character.
        p[0] = unichr(int(p[2][1:], 16))

    # Reports a syntax error.  ply.yacc passes None instead of a token
    # when the error is detected at the end of the input.
    def p_error(self, p):
        if p is None:
            print("Syntax error at end of input")
        else:
            print("Syntax error at '%s'" % p)

    # Invoke the parser
    def parse(self, data, lexer=None, *args, **kwargs):
        '''Parse the input JSON data string into a python data structure.

        Args:
          data: An input data string
          lexer: An optional ply.lex instance that overrides the default
            lexer.
          *args: Forwarded unchanged to the ply.yacc parser's parse().
          **kwargs: Forwarded unchanged to the ply.yacc parser's parse().
        Returns:
          A python dict or list representing the input JSON data.
        '''
        if lexer is None:
            lexer = self.lexer
        return self.parser.parse(data, lexer=lexer, *args, **kwargs)
# Module-wide JsonParser, built on the first call to parse() and then
# reused by every later call.
parser = None


def parse(s):
    '''Parse a string-like object and return the corresponding python structure.

    The shared module-level JsonParser is created on first use.

    Args:
      s: a string-like object
    Returns:
      A python dict or array
    '''
    global parser
    shared = parser
    if shared is None:
        # First call: build the parser once and remember it.
        shared = parser = JsonParser()
    return shared.parse(s)
def parse_file(f):
    '''Parse a file-like object and return the corresponding python structure.

    The whole of f is read in a single read() call and handed to parse().

    Args:
      f: a file-like object
    Returns:
      A Python dict or array
    '''
    contents = f.read()
    return parse(contents)
def main(argv):
    '''Parses JSON files or stdin and prints the python data structure.

    Args:
      argv: The command line, program name first.  Every later entry is
        treated as the name of a file to parse; with no such entries the
        input is read from sys.stdin.
    '''
    filenames = argv[1:]
    if not filenames:
        print(parse_file(sys.stdin))
        return
    for filename in filenames:
        handle = open(filename)
        # try/finally rather than `with` so the file is always closed while
        # the module stays usable on the Python 2.5 named in its shebang.
        try:
            print(parse_file(handle))
        finally:
            handle.close()


if __name__ == '__main__':
    main(sys.argv)