-
Notifications
You must be signed in to change notification settings - Fork 29
/
parser.jl
360 lines (328 loc) · 11.6 KB
/
parser.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
"""
    isalnum(c)

Return `true` when `c` is a letter or a numeric character, as decided by
`isletter` and `isnumeric`.
"""
function isalnum(c)
    return isletter(c) || isnumeric(c)
end
"""
    is_url_char(c)

Return `true` when `c` lies in `'A'`..`'~'` or in `'\$'`..`'>'`, or is a form
feed or a tab. The function first asserts that the code point of `c` is below
`0x80`, i.e. that `c` is ASCII.
"""
function is_url_char(c)
    @assert UInt32(c) < 0x80
    in_upper_block = 'A' <= c <= '~'
    in_lower_block = '$' <= c <= '>'
    return in_upper_block || in_lower_block || c == '\f' || c == '\t'
end
"""
    is_mark(c)

Return `true` when `c` is one of the punctuation characters
`-`, `_`, `.`, `!`, `~`, `*`, `'`, `(` or `)`.
"""
function is_mark(c)
    return c in ('-', '_', '.', '!', '~', '*', '\'', '(', ')')
end
"""
    is_userinfo_char(c)

Return `true` when `c` is accepted inside the userinfo part of an authority:
anything `isalnum` or `is_mark` accepts, plus `%`, `;`, `:`, `&`, `+`, `\$`
and `,`.
"""
function is_userinfo_char(c)
    if isalnum(c) || is_mark(c)
        return true
    end
    return c in ('%', ';', ':', '&', '+', '$', ',')
end
"""
    isnum(c)

Return `true` when `c` is one of the decimal digits `'0'` through `'9'`.
"""
function isnum(c)
    return c >= '0' && c <= '9'
end
"""
    ishex(c)

Return `true` when `c` is a hexadecimal digit: a decimal digit accepted by
`isnum`, or a letter `a`..`f` in either case.
"""
function ishex(c)
    isnum(c) && return true
    lowered = lowercase(c)
    return 'a' <= lowered <= 'f'
end
"""
    is_host_char(c)

Return `true` when `c` may appear in a host name: `.`, `-`, `_`, `~`, or any
character accepted by `isalnum`.
"""
function is_host_char(c)
    # The tilde must be the Char literal '~'. Comparing a Char against the
    # String "~" is always false, which made '~' an invalid host character.
    return c in ('.', '-', '_', '~') || isalnum(c)
end
"""
    URI(scheme, host, port, path, query="", fragment="", userinfo="",
        specifies_authority=false)

Immutable record holding the components of a URI. `port` is converted to
`UInt16` by the constructor; the trailing four arguments are optional.
"""
struct URI
    scheme::String
    host::String
    port::UInt16
    path::String
    query::String
    fragment::String
    userinfo::String
    # Consulted when printing: together with a non-empty host it selects the
    # "scheme://" form over the "scheme:" form.
    specifies_authority::Bool

    function URI(
        scheme,
        host,
        port,
        path,
        query = "",
        fragment = "",
        userinfo = "",
        specifies_authority = false,
    )
        return new(
            scheme,
            host,
            UInt16(port),
            path,
            query,
            fragment,
            userinfo,
            specifies_authority,
        )
    end
end
"""
    ==(a::URI, b::URI)

Compare two `URI`s by delegating to `isequal(a, b)`.
"""
function ==(a::URI, b::URI)
    return isequal(a, b)
end
"""
    isequal(a::URI, b::URI)

Return `true` when `a` and `b` agree on scheme, host, port, path, query,
fragment and userinfo. The `specifies_authority` flag is not compared.
"""
function isequal(a::URI, b::URI)
    # NOTE(review): no matching `hash` method is visible in this part of the
    # file; confirm one exists before relying on URIs as Dict/Set keys.
    compared = (:scheme, :host, :port, :path, :query, :fragment, :userinfo)
    return all(name -> getfield(a, name) == getfield(b, name), compared)
end
"""
    URI(host, path)

Build an `http` URI for `host` and `path` on port 80, with empty query,
fragment and userinfo and with `specifies_authority` set to `true`.
"""
function URI(host, path)
    return URI("http", host, UInt16(80), path, "", "", "", true)
end
"""
    URI(uri::URI; scheme, host, port, path, query, fragment, userinfo,
        specifies_authority)

Build a new `URI` from `uri`, replacing only the components passed as keyword
arguments; every keyword defaults to the corresponding field of `uri`.
"""
function URI(
    uri::URI;
    scheme = uri.scheme,
    host = uri.host,
    port = uri.port,
    path = uri.path,
    query = uri.query,
    fragment = uri.fragment,
    userinfo = uri.userinfo,
    specifies_authority = uri.specifies_authority,
)
    return URI(scheme, host, port, path, query, fragment, userinfo, specifies_authority)
end
# URL parser based on the http-parser package by Joyent
# Licensed under the BSD license
# Check the percent-escape whose '%' sits at index `percent` of `authority`:
# two hexadecimal digits must follow it. Returns the index of the second digit.
function host_escape_end(authority, percent)
    second_digit = nextind(authority, percent, 2)
    if second_digit > ncodeunits(authority)
        error("Invalid escape sequence in host")
    end
    # The first digit is the character right after '%'. The previous code
    # looked one position too far and so tested the second digit twice.
    first_digit = nextind(authority, percent)
    if !ishex(authority[first_digit]) || !ishex(authority[second_digit])
        error("Invalid escape sequence in host")
    end
    return second_digit
end

"""
    parse_authority(authority, seen_at)

Split an authority string of the form `user@host:port` into its parts.

# Arguments
- `authority`: the authority text to scan.
- `seen_at`: when `true`, scanning starts in the userinfo state; otherwise it
  starts directly in the host state.

# Returns
A tuple `(host, port, user)`. `host` and `user` are substrings of `authority`
(an IPv6 host is returned without its brackets). `port` is a `UInt16`, `0` when
no port is present.

# Throws
`ErrorException` for an unexpected character, a malformed `%XX` escape in the
host, or a non-numeric port. A port that does not fit in `UInt16` makes the
final conversion throw.
"""
function parse_authority(authority, seen_at)
    host = ""
    port = ""
    user = ""
    last_state = state = seen_at ? :http_userinfo_start : :http_host_start

    # i:  index of the next character to read
    # li: index of the character read most recently (0 before the first read)
    # s:  index where the span of the current state starts
    i = firstindex(authority)
    li = s = 0
    while true
        if li > ncodeunits(authority)
            last_state = state
            state = :done
        end

        if s == 0
            s = li
        end

        # A state change closes the span of the previous state; the character
        # that triggered the change (at li) opens the span of the new state.
        if state != last_state
            r = s:prevind(authority, li)
            s = li
            if last_state == :http_userinfo
                user = authority[r]
            elseif last_state == :http_host || last_state == :http_host_v6
                host = authority[r]
            elseif last_state == :http_host_port
                port = authority[r]
            end
        end

        if state == :done
            break
        end

        # Input exhausted: move li past the end so the next pass finishes.
        if i > ncodeunits(authority)
            li = i
            continue
        end

        li = i
        (ch, i) = iterate(authority, i)

        last_state = state
        if state == :http_userinfo || state == :http_userinfo_start
            if ch == '@'
                state = :http_host_start
            elseif is_userinfo_char(ch)
                state = :http_userinfo
            else
                error("Unexpected character '$ch' in userinfo")
            end
        elseif state == :http_host_start
            if ch == '['
                state = :http_host_v6_start
            elseif ch == '%'
                # Validate only; the two digits are then read as host characters.
                host_escape_end(authority, li)
                state = :http_host
            elseif is_host_char(ch)
                state = :http_host
            else
                error("Unexpected character '$ch' at the beginning of the host string")
            end
        elseif state == :http_host
            if ch == ':'
                state = :http_host_port_start
            elseif ch == '%'
                # Validate the escape and skip past both hex digits.
                li = host_escape_end(authority, li)
                (ch, i) = iterate(authority, li)
            elseif !is_host_char(ch)
                error("Unexpected character '$ch' in host")
            end
        elseif state == :http_host_v6_end
            if ch != ':'
                error("Only port allowed in authority after IPv6 address")
            end
            state = :http_host_port_start
        elseif state == :http_host_v6 || state == :http_host_v6_start
            if ch == ']' && state == :http_host_v6
                state = :http_host_v6_end
            elseif ishex(ch) || ch == ':' || ch == '.'
                state = :http_host_v6
            else
                error("Unrecognized character in IPv6 address")
            end
        elseif state == :http_host_port || state == :http_host_port_start
            if !isnum(ch)
                error("Port must be numeric (decimal)")
            end
            state = :http_host_port
        else
            error("Unexpected state $state")
        end
    end
    return (host, UInt16(port == "" ? 0 : parse(Int, port, base=10)), user)
end
"""
    parse_url(url)

Parse the string `url` into a `URI` with a single pass of a state machine.

The scheme is lower-cased. The authority (everything between `//` and the next
`/` or `?`) is handed to `parse_authority` to obtain host, port and userinfo.
When the input has no path, the path defaults to `"/"`.

# Throws
`ErrorException` when `url` contains a non-ASCII character or a character that
is not allowed at its position. Errors raised by `parse_authority` propagate.
"""
function parse_url(url)
    scheme = ""
    server = ""
    query = ""
    fragment = ""
    path = "/"
    last_state = state = :req_spaces_before_url
    seen_at = false
    specifies_authority = false

    # i:  index of the next character to read
    # li: index of the character read most recently (0 before the first read)
    # s:  index where the span of the current state starts
    i = firstindex(url)
    li = s = 0
    while true
        if li > ncodeunits(url)
            last_state = state
            state = :done
        end

        if s == 0
            s = li
        end

        # A state change closes the span of the previous state.
        if state != last_state
            r = s:prevind(url, li)
            if last_state == :req_scheme_slash_slash && state == :req_path
                # "scheme:/path": the '/' consumed while in
                # :req_scheme_slash_slash is the first character of the path,
                # so the path span starts one character before li.
                s = prevind(url, li)
            else
                s = li
            end
            if last_state == :req_scheme
                scheme = url[r]
            elseif last_state == :req_server_start
                specifies_authority = true
            elseif last_state == :req_server
                server = url[r]
            elseif last_state == :req_query_string
                query = url[r]
            elseif last_state == :req_path
                path = url[r]
            elseif last_state == :req_fragment
                fragment = url[r]
            end
        end

        if state == :done
            break
        end

        # Input exhausted: move li past the end so the next pass finishes.
        if i > ncodeunits(url)
            li = i
            continue
        end

        li = i
        (ch, i) = iterate(url, i)

        if !isascii(ch)
            error("Non-ASCII characters not supported in URIs. Encode the URL and try again.")
        end

        last_state = state
        if state == :req_spaces_before_url
            if ch == '/' || ch == '*'
                state = :req_path
            elseif isletter(ch)
                state = :req_scheme
            else
                error("Unexpected start of URL")
            end
        elseif state == :req_scheme
            if ch == ':'
                state = :req_scheme_slash
            elseif !(isletter(ch) || isdigit(ch) || ch == '+' || ch == '-' || ch == '.')
                error("Unexpected character $ch after scheme")
            end
        elseif state == :req_scheme_slash
            if ch == '/'
                state = :req_scheme_slash_slash
            elseif is_url_char(ch)
                state = :req_path
            else
                error("Expecting scheme:path scheme:/path format not scheme:$ch")
            end
        elseif state == :req_scheme_slash_slash
            if ch == '/'
                state = :req_server_start
            elseif is_url_char(ch)
                state = :req_path
            else
                error("Expecting scheme:// or scheme: format not scheme:/$ch")
            end
        elseif state == :req_server_start || state == :req_server
            # In accordance with RFC 3986:
            # 'The authority component is preceded by a double slash ("//") and
            # is terminated by the next slash ("/")'
            # This is different from the joyent http-parser, which considers
            # empty hosts to be invalid. c.f. also the following part of
            # RFC 3986:
            # "If the URI scheme defines a default for host, then that default
            # applies when the host subcomponent is undefined or when the
            # registered name is empty (zero length). For example, the "file" URI
            # scheme is defined so that no authority, an empty host, and
            # "localhost" all mean the end-user's machine, whereas the "http"
            # scheme considers a missing authority or empty host invalid."
            if ch == '/'
                state = :req_path
            elseif ch == '?'
                state = :req_query_string_start
            elseif ch == '@'
                seen_at = true
                state = :req_server
            elseif is_userinfo_char(ch) || ch == '[' || ch == ']'
                state = :req_server
            else
                error("Unexpected character $ch in server")
            end
        elseif state == :req_path
            if ch == '?'
                state = :req_query_string_start
            elseif ch == '#'
                state = :req_fragment_start
            elseif !is_url_char(ch) && ch != '@'
                error("Path contained unexpected character")
            end
        elseif state == :req_query_string_start || state == :req_query_string
            if ch == '?'
                state = :req_query_string
            elseif ch == '#'
                state = :req_fragment_start
            elseif !is_url_char(ch)
                error("Query string contained unexpected character")
            else
                state = :req_query_string
            end
        elseif state == :req_fragment_start
            if ch == '?'
                state = :req_fragment
            elseif ch == '#'
                state = :req_fragment_start
            elseif ch != '#' && !is_url_char(ch)
                error("Start of fragment contained unexpected character")
            else
                state = :req_fragment
            end
        elseif state == :req_fragment
            if !is_url_char(ch) && ch != '?' && ch != '#'
                error("Fragment contained unexpected character")
            end
        else
            error("Unrecognized state")
        end
    end
    host, port, user = parse_authority(server, seen_at)
    return URI(
        lowercase(scheme), host, port, path, query, fragment, user, specifies_authority
    )
end
"""
    URI(url)

Build a `URI` by parsing `url` with `parse_url`.
"""
function URI(url)
    return parse_url(url)
end
"""
    show(io::IO, uri::URI)

Write `uri` to `io` as `URI(...)`, where `...` is the printed form of `uri`.
"""
function show(io::IO, uri::URI)
    return print(io, "URI(", uri, ")")
end
"""
    print(io::IO, uri::URI)

Write the textual form of `uri` to `io`.

When `uri.specifies_authority` is set or the host is non-empty, the output
starts with `scheme://`, followed by `userinfo@` (if any), the host, and
`:port` when the port is not `0`. Otherwise it starts with `scheme:`. The path
always follows; `?query` and `#fragment` are added only when non-empty.
"""
function print(io::IO, uri::URI)
    has_authority = uri.specifies_authority || !isempty(uri.host)
    if !has_authority
        print(io, uri.scheme, ":")
    else
        print(io, uri.scheme, "://")
        isempty(uri.userinfo) || print(io, uri.userinfo, '@')
        # A colon inside the host marks an IPv6 literal; it is written in brackets.
        if ':' in uri.host
            print(io, '[', uri.host, ']')
        else
            print(io, uri.host)
        end
        uri.port == 0 || print(io, ':', Int(uri.port))
    end
    print(io, uri.path)
    isempty(uri.query) || print(io, "?", uri.query)
    isempty(uri.fragment) || print(io, "#", uri.fragment)
    return nothing
end
"""
    show(io::IO, ::MIME"text/html", uri::URI)

Write `uri` to `io` as an HTML anchor whose target and label are both the
printed form of `uri`. The characters `&`, `<`, `>`, `"` and `'` are replaced
by HTML entities so the URI text cannot break out of the attribute or inject
markup.
"""
function show(io::IO, ::MIME"text/html", uri::URI)
    text = sprint(print, uri)
    # '&' is replaced first so the entities produced below are not escaped twice.
    for (raw, entity) in (
        "&" => "&amp;", "<" => "&lt;", ">" => "&gt;", "\"" => "&quot;", "'" => "&#39;"
    )
        text = replace(text, raw => entity)
    end
    print(io, "<a href=\"", text, "\">", text, "</a>")
    return nothing
end