-
Notifications
You must be signed in to change notification settings - Fork 11
/
tokenizer.py
68 lines (64 loc) · 2.76 KB
/
tokenizer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
# Kimi language interpreter in Python 3
# Anjana Vakil
# http://www.github.com/vakila/kimi
from errors import *
def _mask_string_literals(string):
    '''Return a copy of string with the contents of every complete "..." literal emptied.

    The quote characters themselves are kept, so text on either side of a
    literal never becomes adjacent. Quotes are paired left to right, exactly
    as tokenize() pairs them. An unterminated trailing literal is left
    untouched so that tokenize() can report it when it reaches it.
    >>> _mask_string_literals('(say "(hi" x)')
    '(say "" x)'
    >>> _mask_string_literals('(say "oops)')
    '(say "oops)'
    '''
    pieces = []
    position = 0
    while True:
        open_quote = string.find('"', position)
        if open_quote == -1:
            break
        close_quote = string.find('"', open_quote + 1)
        if close_quote == -1:
            # unterminated literal: keep the remainder verbatim
            break
        pieces.append(string[position:open_quote])
        pieces.append('""')
        position = close_quote + 1
    pieces.append(string[position:])
    return "".join(pieces)


def tokenize(string):
    '''Take a program as a string, return the tokenized program as a list of tokens.

    Each token is a (type, value) tuple:
      ('opening', None)   for "("
      ('closing', None)   for ")"
      ('literal', value)  for a double-quoted string (value is the text between
                          the quotes) or for anything int() accepts (value is the int)
      ('symbol', text)    for any other run of non-whitespace, non-special characters

    Syntax problems are reported through assert_or_throw / throw_error with
    error type "syntax":
      - unbalanced parentheses
      - three opening parentheses in a row
      - "(" at the end of the program, or followed by ")", a double quote or whitespace
      - a string literal with no closing quote
    Parentheses inside string literals are text, not code, and are ignored by
    the first two checks.

    >>> tokenize("-1")
    [('literal', -1)]
    >>> tokenize("(+ 1 2)")
    [('opening', None), ('symbol', '+'), ('literal', 1), ('literal', 2), ('closing', None)]
    >>> tokenize('(f "(")')
    [('opening', None), ('symbol', 'f'), ('literal', '('), ('closing', None)]
    '''
    # Validate parentheses on the code only; the contents of string literals
    # must not influence the balance or the "(((" check.
    code = _mask_string_literals(string)
    assert_or_throw(code.count('(') == code.count(')'), "syntax", "Mismatching parentheses!")
    assert_or_throw('(((' not in code, "syntax", 'Incorrect parenthesis use: "(((". Opening parenthesis must be immediately followed by a function.')

    special = ('(', ')', '"')
    whitespaces = (' ', '\n', '\t')
    tokens = []
    length = len(string)
    # Walk the program with an index instead of re-slicing the remaining
    # text for every character, which would make tokenizing quadratic.
    position = 0
    while position < length:
        this_char = string[position]

        if this_char in whitespaces:
            position += 1
            continue

        if this_char == "(":
            # the token is this character; it must be directly followed by a function
            next_char = string[position + 1:position + 2]  # "" at end of program
            if not next_char:
                throw_error("syntax", 'Incorrect parenthesis use: "(" at end of program.')
            elif next_char in (")", '"') or next_char in whitespaces:
                throw_error("syntax", "Incorrect parenthesis use: " + '"' + this_char + next_char + '". Opening parenthesis must be immediately followed by a function.')
            token = ('opening', None)
            position += 1
        elif this_char == ")":
            # the token is this character
            token = ('closing', None)
            position += 1
        elif this_char == '"':
            # the token is everything until the next "
            endquote_index = string.find('"', position + 1)
            if endquote_index == -1:
                # NOTE: as in the rest of this function, throw_error is
                # relied upon not to return.
                throw_error("syntax", "Improper string syntax.")
            token = ('literal', string[position + 1:endquote_index])
            position = endquote_index + 1
        else:
            # the token is everything until the next whitespace or special character
            end = position + 1
            while end < length and string[end] not in special and string[end] not in whitespaces:
                end += 1
            text = string[position:end]
            position = end
            try:
                # anything that can be converted to int is a literal number
                token = ('literal', int(text))
            except ValueError:
                # everything else is a symbol
                token = ('symbol', text)

        tokens.append(token)
    return tokens