-
Notifications
You must be signed in to change notification settings - Fork 1
/
fundyparse.py
254 lines (206 loc) · 8.95 KB
/
fundyparse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
#
# Copyright 2009 Benjamin Mellor
#
# This file is part of Fundy.
#
# Fundy is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
import py
import sys
from rpython.rlib.parsing.ebnfparse import parse_ebnf, check_for_missing_names
from rpython.rlib.parsing.parsing import PackratParser, ParseError
from rpython.rlib.parsing.lexer import Lexer, Token, SourcePos
from utils import preparer
def get_grammar(for_translation):
    """
    NOT_RPYTHON: This function is only called to process the grammarfile, which
    occurs before translation, so it does not need to be RPython.

    Reads 'fundy.grammar' (looked up relative to py.path.local()), applies the
    '#!if <condition>' / '#!endif' preprocessing directives and passes the
    remaining text to parse_ebnf.

    The for_translation argument should be True if the grammar should be built
    for translating the Fundy interpreter to low level code, or False for
    running on top of CPython. It is in scope for the '#!if' conditions, which
    are evaluated with eval() inside this function.

    Returns the (regexes, rules, ToAST) triple produced by parse_ebnf. If the
    grammar cannot be parsed, the error is printed and the process exits with
    status 1.
    """
    if_directive = '#!if '
    endif_directive = '#!endif'

    grammarfile = py.path.local().join('fundy.grammar')
    lines = []
    # Number of currently open '#!if' blocks that are being skipped.
    skipdepth = 0
    for line in grammarfile.readlines():
        if skipdepth != 0:
            # Inside a false '#!if': drop the line, but keep track of nested
            # directives so that an inner '#!endif' does not end the outer
            # skipped region prematurely.
            if line.startswith(endif_directive):
                skipdepth -= 1
            elif line.startswith(if_directive):
                skipdepth += 1
            continue

        if line.startswith(if_directive):
            # Slice the prefix off; str.lstrip() would strip a *set* of
            # characters and eat the start of conditions such as
            # 'for_translation'.
            condition = line[len(if_directive):].strip()
            # NOTE: eval() runs text from the grammar file; that file is
            # assumed to be a trusted part of the source tree.
            if not eval(condition):
                skipdepth += 1
            continue

        lines.append(line)

    grammar = '\n'.join(lines)
    try:
        regexes, rules, ToAST = parse_ebnf(grammar)
    except ParseError as e:
        print(e.nice_error_message(filename=str(grammarfile), source=grammar))
        sys.exit(1)
    return regexes, rules, ToAST
# Similar to ebnfparse.make_parse_function, but accepts a function to
# run the lexer's token stream through before passing it to the parser,
# and a list of extra names that are added by the post-processing stage.
def make_messy_parse_function(regexes, rules, eof=False, post_lexer=None,
                              extra_names=()):
    """
    NOT_RPYTHON: This function is only called to process the grammarfile, which
    occurs before translation, so it does not need to be RPython.

    Builds a lexer and a packrat parser from the given (name, regex) pairs and
    rules, and returns a function that turns Fundy source text into a "messy"
    AST. That tree can be tidied with the ToAST object obtained from
    parse_ebnf(grammar).

    If post_lexer is given, the token list produced by the lexer is passed
    through it before parsing. extra_names lists token names that only the
    post-lexer stage produces, so that they are not reported as missing.
    """
    token_names, token_regexes = zip(*regexes)
    ignored = ["IGNORE"] if "IGNORE" in token_names else []
    check_for_missing_names(token_names + extra_names, token_regexes, rules)

    lexer = Lexer(list(token_regexes), list(token_names), ignore=ignored)
    start_symbol = rules[0].nonterminal
    parser = PackratParser(rules, start_symbol)

    def parse(s):
        tokens = lexer.tokenize(s, eof=eof)
        if post_lexer is not None:
            tokens = post_lexer(tokens)
        return parser.parse(tokens)

    return parse
def process_indentation(tokens):
    """
    Turn layout into explicit TERM tokens, dropping every LINEBREAK token.

    A LINEBREAK token is assumed to cover a whole run of whitespace containing
    at least one newline, so two LINEBREAK tokens never appear back to back.

    The indent of a line is measured on the text after the last newline in
    the LINEBREAK token: its length, plus 3 for every tab so that a tab counts
    as 4 columns. The innermost open indentation level is the top of the
    indentation stack, which starts out holding the single level 0.

    For each LINEBREAK:
      * if the indent is less than or equal to the current level, a TERM is
        emitted; if a BEGIN was seen since the previous LINEBREAK, a second
        TERM is emitted to close the block that started and ended on one line;
      * while the indent is less than the current level, the stack is popped
        and one further TERM is emitted per level popped;
      * if the indent is greater than the current level and a BEGIN was seen
        since the previous LINEBREAK, the indent is pushed as a new level;
        otherwise the line is treated as a continuation and nothing is
        emitted.

    At EOF one TERM is emitted for every level left on the stack (including
    the base level), followed by the EOF token itself. All other tokens are
    passed through unchanged.

    NOTE: this function is called at runtime, so it must be RPython.
    """
    levels = [0]
    out = []
    pending_block = False

    for tok in tokens:
        kind = tok.name
        if kind == 'LINEBREAK':
            last_line = tok.source.split('\n')[-1]
            indent = len(last_line) + 3 * last_line.count('\t')

            if indent <= levels[-1]:
                out.append(Token('TERM', tok.source, tok.source_pos))
                if pending_block:
                    # A block was announced but the next line is not
                    # indented, so it began and ended on the same line and
                    # needs its own terminator.
                    out.append(Token('TERM', tok.source, tok.source_pos))

            # One TERM for every level this line dedents out of.
            while indent < levels[-1]:
                levels.pop()
                out.append(Token('TERM', tok.source, tok.source_pos))

            if pending_block and indent > levels[-1]:
                levels.append(indent)

            # Whatever the indentation was, a newline ends the search for the
            # start of a block.
            pending_block = False
        elif kind == 'BEGIN':
            pending_block = True
            out.append(tok)
        elif kind == 'EOF':
            # Close every level that is still open before the EOF token.
            while len(levels) > 0:
                levels.pop()
                out.append(Token('TERM', tok.source, tok.source_pos))
            out.append(tok)
        else:
            out.append(tok)

    return out
# end def process_indentation
def make_parse_function(for_translation):
    """
    Convenience function: go all the way from reading the grammar file to a
    ready-to-use parser.

    The grammar is loaded with get_grammar(for_translation), a messy parser is
    built that runs the token stream through process_indentation, and the
    result is wrapped so that the messy tree is tidied by the grammar's ToAST
    transformer. The returned function therefore maps Fundy source code to the
    sort of AST that the asteval module expects.

    Not called at runtime, so it does not need to be RPython, but the function
    it returns is.
    """
    regexes, rules, ToAST = get_grammar(for_translation)
    build_messy_tree = make_messy_parse_function(
        regexes, rules, eof=True, post_lexer=process_indentation)
    tidyer = ToAST()

    def parse(code):
        return tidyer.transform(build_messy_tree(code))

    return parse
class __SecretParser(object):
    """
    Holder for the parser state, which is either in translated mode or in
    untranslated mode. One parse function per mode is built on demand and
    cached on the instance as 'translated_parse' / 'untranslated_parse'.
    """

    def __init__(self):
        """
        NOT_RPYTHON: This class exists to encapsulate the state of the parser,
        which can either be in translated mode or untranslated mode, since the
        show statement can only be supported when running on top of CPython.

        The module-level parse function delegates to an instance of this
        class, so other modules do not need to know that parsing is
        parameterised on whether we are translating. Building a parser is not
        RPython, so the class as a whole is not either; by the time the parse
        method is analyzed the appropriate parse function has already been
        built.

        The author was reasonably sure, but not certain, that the branch in
        the parse method (and the second copy of the parse function and
        grammar) does not end up in the translated interpreter, because
        self.for_translation should be a constant from the translator's point
        of view.

        A new instance starts in untranslated mode.
        """
        self.for_translation = False
        self.setup(False)

    def setup(self, for_translation):
        """
        NOT_RPYTHON: Switch to the given mode, building and caching the parse
        function for that mode if it has not been built yet.
        """
        self.for_translation = for_translation
        slot = 'translated_parse' if for_translation else 'untranslated_parse'
        if getattr(self, slot, None) is None:
            setattr(self, slot, make_parse_function(for_translation))

    def parse(self, code):
        # Dispatch to the parse function built for the current mode.
        if self.for_translation:
            return self.translated_parse(code)
        return self.untranslated_parse(code)
# Single module-wide parser state. Creating it runs __SecretParser.__init__,
# which builds the untranslated parse function immediately, so the grammar
# file is read when this module is imported.
__secret_parser_state = __SecretParser()
# Hand the setup method to the preparer so the parser mode can be switched
# later. NOTE(review): presumably preparer calls it with the for_translation
# flag before translation starts -- confirm against utils.preparer.
preparer.register(__secret_parser_state.setup)
# This is the actual parse function that other modules can import from here.
# (finally!)
def parse(code):
    # Delegate to the shared parser state, which picks the parse function
    # matching the current (translated / untranslated) mode.
    tree = __secret_parser_state.parse(code)
    return tree
def show_parse(code):
    """
    NOT_RPYTHON: For testing/debugging use

    Parses code and calls view() on the resulting tree. If parsing fails, the
    formatted parse error is printed instead of being raised.
    """
    try:
        tree = parse(code)
        tree.view()
    except ParseError as err:
        print(err.nice_error_message(source=code))