-
Notifications
You must be signed in to change notification settings - Fork 2
/
keccak_top.v
434 lines (380 loc) · 16.4 KB
/
keccak_top.v
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
//-----------------------------------------------------------------------------
// keccak_top
//
// Configurable top level of a Keccak sponge (Keccak-f[25*W]) hardware core
// with optional Domain-Oriented Masking (DOM) side-channel protection.
//
// The datapath is split per share: each share gets its own state register
// and linear-step logic (Theta, Rho, Pi), generated below; the non-linear
// Chi (+Iota) step is shared across all shares inside keccak_chi_iota,
// which consumes the fresh randomness ZxDI. A central keccak_control unit
// sequences the round steps via the ctrl_* enable signals.
//
// The *_ITERATIVE parameters trade area for cycles: each step can either be
// done on the whole state in one cycle or on SLICES_PARALLEL slices per
// cycle. Elaboration-time sanity checks below reject inconsistent
// combinations (the "DONT_COMPILE" instantiations of non-existent modules
// with self-describing names fail elaboration on purpose).
//
// I/O summary (all per the visible port list):
//   ClkxCI / RstxRBI          clock, active-low asynchronous reset
//   RandomnessAvailablexSI    handshake: fresh randomness ZxDI is valid
//   StartAbsorbxSI /
//   StartSqueezexSI           start-of-phase strobes to the control unit
//   ReadyxSO                  core ready indication from the control unit
//   AbsorbSlicesxDI           input data, SHARES x ABSORB_LANES lanes of
//                             ABSORB_SLICES slices each
//   ZxDI                      fresh randomness for the masked Chi step
//   DataxDO                   output: either Chi output slices (when
//                             CONNECT_ABSORB_CHI) or the first RATE/W lanes
//                             of the state — see OUTPUT_CONNECT below
//-----------------------------------------------------------------------------
`timescale 1ns/1ns
module keccak_top #(
// BASIC OPTIONS
parameter RATE = 1088, // The rate of the sponge construction
parameter W = 64, // The width of one Keccak register, aka lane length
parameter SHARES = 1, // Number of shares to use (SHARES-1 == protection order).
//TODO Would be nice to have a coarse configuration setting in case we don't need full configurability
// 0 = Minimum area. Detailed description goes here
// 1 = Almost minimum area. Detailed description goes here
// ...
//parameter CONFIGURATION = 0,
// ADVANCED OPTIONS
// Number of lanes that are absorbed in one cycle.
// You probably want to set ABSORB_ITERATIVE=0 if this is not RATE/W,
// otherwise absorption will take a significant amount of time.
parameter ABSORB_LANES = RATE/W,
// Note: If all of the following *_ITERATIVE options are 0, all steps
// are done in one cycle. Consequently the whole computation takes
// 12+2*ld(W) (== #rounds) cycles
// 0 = State reset is done asynchronously
// 1 = State reset takes W cycles, but the more expensive Clear FF are
// not used
parameter RESET_ITERATIVE = 1,
// 0 = Absorb in one cycle. (Automatically true if W==SLICES_PARALLEL)
// 1 = Absorb concurrently with Theta step if W!=SLICES_PARALLEL.
// ABSORB_ITERATIVE implies THETA_ITERATIVE, thus THETA_ITERATIVE is
// ignored if (ABSORB_ITERATIVE == 1)
parameter ABSORB_ITERATIVE = 1,
// 0 = Theta executed in 1 cycle
// 1 = Theta takes W/SLICES_PARALLEL cycles
// ABSORB_ITERATIVE implies THETA_ITERATIVE, thus THETA_ITERATIVE is
// ignored if (ABSORB_ITERATIVE == 1)
parameter THETA_ITERATIVE = 1,
//TODO rename
// 0 = Rho+Pi executed in 1 cycle
// 1 = Rho is done in W cycles, Pi takes W/SLICES_PARALLEL cycles
parameter RHO_PI_ITERATIVE = 0,
// 0 = Chi+Iota executed in 1 cycle
// 1 = Chi+Iota takes W/SLICES_PARALLEL cycles if unprotected, and
// 2*W/SLICES_PARALLEL cycles if DOM is used (pipelining and double
// clocking decrease this to W/SLICES_PARALLEL+1 and W/SLICES_PARALLEL
// respectively)
parameter CHI_IOTA_ITERATIVE = 1,
// The iterative steps operate on SLICES_PARALLEL slices.
// Each iterative step thus takes W/SLICES_PARALLEL cycles.
// Note that an iterative Chi step drastically reduces the required
// randomness of a masked configuration (see further documentation for
// details).
parameter SLICES_PARALLEL = 1,
// The Chi step takes 2*ABSORB_SLICES cycles in DOM. Clock the inner domain
// register with the negative edge to overcome this. This should hardly
// influence the critical path, but increase throughput.
parameter CHI_DOUBLE_CLK = 1,
parameter LESS_RAND = 1,
// Controls the insertion of the inner-domain FF when using more than 1 share
// 0 = Only the cross-domain-FF are inserted
// 1 = Inner-domain FF are inserted and used as pipeline stage if the
// configuration allows it
// NOTE(review): CHI_DOUBLE_CLK and DOM_PIPELINE both default to 1, but the
// sanity check further below rejects exactly that combination
// (CHI_DOUBLE_CLK_and_DOM_PIPELINE_cannot_both_be_true), so the module
// cannot elaborate with all-default parameters — confirm which of the two
// is meant to default to 0.
parameter DOM_PIPELINE = 1,
// LOCAL PARAMETERS - DO NOT MODIFY
// Derived slice counts per step: iterative steps process SLICES_PARALLEL
// slices per cycle, non-iterative ones the full lane width.
parameter ABSORB_SLICES = ABSORB_ITERATIVE ? SLICES_PARALLEL : W,
parameter THETA_SLICES = THETA_ITERATIVE ? SLICES_PARALLEL : ABSORB_SLICES,
parameter CHI_SLICES = CHI_IOTA_ITERATIVE ? SLICES_PARALLEL : W,
// When absorb and chi are both slice-iterative over the full rate, the
// absorb XOR can be connected directly to the Chi output slices.
parameter CONNECT_ABSORB_CHI = (ABSORB_ITERATIVE && CHI_IOTA_ITERATIVE && RATE/W == ABSORB_LANES) ? 1 : 0,
parameter DATAOUT_SIZE = (CONNECT_ABSORB_CHI) ? 25*SLICES_PARALLEL : RATE
)(
input wire ClkxCI,
input wire RstxRBI,
input wire RandomnessAvailablexSI,
input wire StartAbsorbxSI,
input wire StartSqueezexSI,
//TODO a valid/ready signal pair for DataxDO from Chi step (last round output)
output wire ReadyxSO,
input wire[SHARES*(ABSORB_LANES*ABSORB_SLICES)-1:0] AbsorbSlicesxDI,
//TODO ZxDI length can be adjusted to LESS_RAND (should be no functional difference, since upper bits are just unused at the moment in that case)
input wire[(SHARES*SHARES-SHARES)/2 * 25 * CHI_SLICES - 1:0] ZxDI,
output reg[SHARES*DATAOUT_SIZE-1:0] DataxDO
);
// Flattened vector sizes for one share's worth of state / step data.
localparam STATE_SIZE = 5*5*W;
localparam THETA_SLICES_SIZE = 5*5*THETA_SLICES;
localparam CHI_SLICES_SIZE = 5*5*CHI_SLICES;
localparam PI_SLICES = SLICES_PARALLEL; //TODO hmm
localparam PI_SLICES_SIZE = 5*5*PI_SLICES;
localparam ABSORB_SLICES_SIZE = ABSORB_SLICES * ABSORB_LANES;
// Lane numbering used throughout: lane = 5*x + y (column-major in x).
function integer getLaneNr(input integer x_coord, input integer y_coord);
getLaneNr = 5*x_coord + y_coord;
endfunction
function integer getXCoord(input integer lane_nr);
getXCoord = lane_nr / 5;
endfunction
function integer getYCoord(input integer lane_nr);
getYCoord = lane_nr % 5;
endfunction
// Bit index of lane (x,y) of share s in a flattened vector whose lanes are
// 'len' bits wide.
function integer Idx(input integer s,
input integer x,
input integer y,
input integer len);
Idx = s*25*len + getLaneNr(x,y)*len;
endfunction
// Bit index of lane (x,y) of share s in a flattened full state (W-bit lanes).
function integer StateIdx(input integer s, input integer x, input integer y);
StateIdx = s*STATE_SIZE + getLaneNr(x,y)*W;
endfunction
// Shift 'shift' new slice bits into each W-bit lane of target_state from the
// 'slices' vector (lane-wise shift register update over all shares).
// NOTE(review): `QUEUE appears unused within this file — confirm whether it
// is referenced elsewhere before removing.
`define QUEUE(target_state, slices, slice_len, shift) \
for(i=0; i < SHARES; i=i+1) begin \
for(x=0; x < 5; x=x+1) begin \
for(y=0; y < 5; y=y+1) begin \
target_state[StateIdx(i,x,y) +: W] = { \
slices[Idx(i,x,y,slice_len) +: shift], \
StatexD[StateIdx(i,x,y) +: W] } >> shift; \
end \
end \
end
// Copy 'amount' slice bits per lane from src (lanes src_len bits wide,
// starting at slice offset start_slice) into dst (lanes dst_len bits wide),
// for all shares and all 25 lanes.
`define CON_SLICES(dst, dst_len, src, src_len, start_slice, amount) \
for(i=0; i < SHARES; i=i+1) begin \
for(x=0; x < 5; x=x+1) begin \
for(y=0; y < 5; y=y+1) begin \
dst[Idx(i,x,y,dst_len) +: amount] \
= src[Idx(i,x,y,src_len) + start_slice +: amount]; \
end \
end \
end
// Like CON_SLICES, but XORs the incoming absorb data onto the first RATE/W
// lanes (lane order for the input follows the Keccak rate layout x+5*y).
`define CON_ABSORB_XOR(dst, dst_len, src, src_len, amount) \
for(i=0; i < SHARES; i=i+1) begin \
for(x=0; x < 5; x=x+1) begin \
for(y=0; y < 5; y=y+1) begin \
if(x+5*y < RATE/W) begin \
dst[Idx(i,x,y, dst_len) +: amount] \
= AbsorbSlicesxDI[i*(ABSORB_LANES*ABSORB_SLICES) + ((x+5*y)%ABSORB_LANES)*ABSORB_SLICES +: ABSORB_SLICES] \
^ src[Idx(i,x,y,src_len) +: amount]; \
end \
end \
end \
end
// Elaboration-time parameter sanity checks: an invalid configuration
// instantiates a module whose (non-existent) name describes the violation,
// deliberately failing compilation.
generate begin
// Some parameter sanity checks
if(RATE % W || RATE > 25*W)
_RATE__must_be_a_multiple_of_the_lane_length__W__and_smaller_or_equal_to_25W DONT_COMPILE();
else if(SLICES_PARALLEL > W)
_SLICES_PARALLEL__must_be_smaller_or_equal_to_the_lane_length__W_ DONT_COMPILE();
else if(!(W == 1 || W == 2 || W == 4 || W == 8 || W == 16 || W == 32 || W == 64))
The_lane_length__W__must_be_a_power_of_2 DONT_COMPILE();
else if( (W==SLICES_PARALLEL) && (ABSORB_ITERATIVE || THETA_ITERATIVE || RHO_PI_ITERATIVE || CHI_IOTA_ITERATIVE) )
W_eq_SLICES_PARALLEL_but_at_least_one_step_should_be_iterative_according_to_a_xxITERATIVE_parameter DONT_COMPILE();
else if( (W!=SLICES_PARALLEL) && !(ABSORB_ITERATIVE || THETA_ITERATIVE || RHO_PI_ITERATIVE || CHI_IOTA_ITERATIVE) )
W_neq_SLICES_PARALLEL_but_no_iterative_step DONT_COMPILE();
else if( W==SLICES_PARALLEL && SHARES > 1 ) begin
// Just a warning. Comment the line if you really want this
//W_equals_SLICES_PARALLEL_but_using_a_masked_implementation_This_requires_a_lot_of_fresh_randomness_in_the_Chi_step DONT_COMPILE();
end
else if( ABSORB_LANES < 1 || ABSORB_LANES > RATE/W || (RATE/W) % ABSORB_LANES != 0)
Allowed_range_is_1_le_ABSORB_LANES_le_RATE_over_W___Further_restriction_is_RATE_over_W_mod_ABSORB_LANES_eq_0 DONT_COMPILE();
else if(RHO_PI_ITERATIVE && !CHI_IOTA_ITERATIVE)
Not_useful_because_Rho_and_pi_will_take_W_cycles_each_Set_CHI_IOTA_ITERATIVE_so_that_the_Pi_step_is_done_concurrently_with_Chi DONT_COMPILE();
else if(RHO_PI_ITERATIVE && SLICES_PARALLEL != 1)
The_iterative_Rho_step_is_done_by_shifting_the_lanes_Thus_we_need_W_cycles_and_not_W_over_SLICES_PARALLEL DONT_COMPILE();
else if(CHI_DOUBLE_CLK && DOM_PIPELINE)
CHI_DOUBLE_CLK_and_DOM_PIPELINE_cannot_both_be_true DONT_COMPILE();
end endgenerate
// Datapath interconnect signals (all SHARES shares flattened together).
wire[SHARES*STATE_SIZE-1:0] StatexD;
reg[SHARES*5*5*THETA_SLICES-1:0] SlicesToThetaxD;
wire[SHARES*5*5*THETA_SLICES-1:0] SlicesFromThetaxD;
reg[SHARES*5*5-1:0] SliceZ0ToThetaxD;
wire[SHARES*5*5-1:0] SliceZ0FromThetaxD;
wire[SHARES*5*5*SLICES_PARALLEL-1:0] SlicesFromPixD;
reg[SHARES*5*5*SLICES_PARALLEL-1:0] SlicesToPixD;
wire[SHARES*STATE_SIZE-1:0] StateFromRhoPixD;
reg[SHARES*STATE_SIZE-1:0] StateToRhoPixD;
reg[SHARES*5*5*CHI_SLICES-1:0] SlicesToChixD;
wire[SHARES*5*5*CHI_SLICES-1:0] SlicesFromChixD;
// Per-step enable/sequencing signals driven by keccak_control (see the
// StateCtrlxSO bundle at the bottom of the module for the packing order).
wire[24:0] ctrl_enable_lane;
wire ctrl_enable_absorb;
wire ctrl_enable_lambda;
wire ctrl_enable_theta;
wire ctrl_enable_rhopi;
wire ctrl_enable_chi_iota;
wire ctrl_theta_last;
wire ctrl_enable_absorb_theta;
wire ctrl_enable_DOM_ff;
wire ctrl_reset_state;
genvar i;
// One state register bank and one set of linear steps (Theta, Rho+Pi) per
// share; the linear steps operate on each share independently.
generate begin
for(i = 0; i < SHARES; i=i+1) begin : gen_linear_steps
//---------------------------------------------------------------------
// States
keccak_state #(
.RATE(RATE),
.W(W),
.RESET_ITERATIVE(RESET_ITERATIVE),
.ABSORB_ITERATIVE(ABSORB_ITERATIVE),
.THETA_ITERATIVE(THETA_ITERATIVE),
.RHO_PI_ITERATIVE(RHO_PI_ITERATIVE),
.CHI_IOTA_ITERATIVE(CHI_IOTA_ITERATIVE),
.CONNECT_ABSORB_CHI(CONNECT_ABSORB_CHI),
.SLICES_PARALLEL(SLICES_PARALLEL),
.THETA_SLICES(THETA_SLICES),
.CHI_SLICES(CHI_SLICES),
.ABSORB_LANES(ABSORB_LANES),
.ABSORB_SLICES(ABSORB_SLICES)
) SHARE(
.ClkxCI(ClkxCI),
.RstxRBI(RstxRBI),
.EnableLanexSI (ctrl_enable_lane ),
.ctrl_reset_state (ctrl_reset_state ),
.ctrl_enable_rhopi (ctrl_enable_rhopi ),
.ctrl_enable_theta (ctrl_enable_theta ),
.ctrl_theta_last (ctrl_theta_last ),
.ctrl_enable_lambda(ctrl_enable_lambda),
.ctrl_enable_absorb(ctrl_enable_absorb),
.AbsorbSlicesxDI(AbsorbSlicesxDI[i*(ABSORB_LANES*ABSORB_SLICES) +: (ABSORB_LANES*ABSORB_SLICES)]),
.SliceZ0FromThetaxDI(SliceZ0FromThetaxD[i*25 +: 25]),
.SlicesFromThetaxDI(SlicesFromThetaxD[i*THETA_SLICES_SIZE +: THETA_SLICES_SIZE]),
.StateFromRhoPixDI(StateFromRhoPixD[i*STATE_SIZE +: STATE_SIZE]),
.SlicesFromChixDI(SlicesFromChixD[i*CHI_SLICES_SIZE +: CHI_SLICES_SIZE]),
.StatexDO(StatexD[i*STATE_SIZE +: STATE_SIZE])
);
//---------------------------------------------------------------------
// Theta
keccak_theta #(.W(W), .SLICES_PARALLEL(THETA_SLICES)) THETA(
.ClkxCI(ClkxCI),
.RstxRBI(RstxRBI),
.RstSyncxRI(ctrl_theta_last),
.EnablexSI(ctrl_enable_theta),
.SlicesxDI(SlicesToThetaxD[i*THETA_SLICES_SIZE +: THETA_SLICES_SIZE]),
.SliceZ0xDI(SliceZ0ToThetaxD[i*25 +: 25]),
.SlicesxDO(SlicesFromThetaxD[i*THETA_SLICES_SIZE +: THETA_SLICES_SIZE]),
.SliceZ0xDO(SliceZ0FromThetaxD[i*25 +: 25])
);
//---------------------------------------------------------------------
// Rho + Pi
// Iterative variant: Rho is done by lane shifting inside keccak_state,
// so only a slice-based Pi is instantiated here. Otherwise a
// combinational full-state Rho+Pi is used.
if(RHO_PI_ITERATIVE) begin
keccak_pi #(
.SLICES_PARALLEL(PI_SLICES)
) PI (
.ClkxCI(ClkxCI),
.RstxRBI(RstxRBI),
.SlicesxDI(SlicesToPixD[i*PI_SLICES_SIZE +: PI_SLICES_SIZE]),
.SlicesxDO(SlicesFromPixD[i*PI_SLICES_SIZE +: PI_SLICES_SIZE])
);
end
else begin
keccak_rhopi #(.W(W)) RHOPI(
.StatexDI(StateToRhoPixD[i*STATE_SIZE +: STATE_SIZE]),
.StatexDO(StateFromRhoPixD[i*STATE_SIZE +: STATE_SIZE])
);
end
end
end
endgenerate
//-----------------------------------------------------------------------------
// Chi + Iota
// Single shared instance across all shares (the non-linear step is where the
// DOM sharing and the fresh randomness ZxDI come in). IotaRCxD carries the
// round-constant slice(s) from the control unit.
wire[CHI_SLICES-1:0] IotaRCxD;
keccak_chi_iota #(
.SHARES(SHARES),
.SLICES(CHI_SLICES),
.CHI_DOUBLE_CLK(CHI_DOUBLE_CLK),
.LESS_RAND(LESS_RAND),
.DOM_PIPELINE(DOM_PIPELINE)
) CHI(
.ClkxCI(ClkxCI),
.EnablexSI(ctrl_enable_DOM_ff),
.RstxRBI(RstxRBI),
.SlicesxDI(SlicesToChixD),
.ZxDI(ZxDI),
.IotaRCxDI(IotaRCxD),
.SlicesxDO(SlicesFromChixD)
);
//-----------------------------------------------------------------------------
// Connection of the steps in different configurations
// Output mux: either forward the Chi output slices directly (caller
// reassembles lanes over W/SLICES_PARALLEL cycles) or expose the first
// RATE/W lanes of the state register.
always @(*) begin : OUTPUT_CONNECT
integer i, x, y;
if(CONNECT_ABSORB_CHI) begin
// Outputs are slices from Chi. User has to store them and put them
// back together
DataxDO = SlicesFromChixD;
end
else begin
// Output is part of the state
for(i=0; i < SHARES; i=i+1)
for(x=0; x < 5; x=x+1)
for(y=0; y < 5; y=y+1)
if(x+5*y < RATE/W) // First Rate/W lanes
DataxDO[i*RATE + (x+5*y)*W +: W] = StatexD[StateIdx(i,x,y) +: W];
end
end
// Theta input selection: slices come from the state, or directly from Chi
// when absorb/Chi are chained (CONNECT_ABSORB_CHI); incoming message slices
// are XORed in during absorb cycles (ctrl_enable_absorb_theta).
always @(*) begin : THETA_CONNECT
integer i, x, y;
//`CON_SLICES(dst, dst_len, src, src_len, start_slice, amount)
if(THETA_ITERATIVE || ABSORB_ITERATIVE) begin
`CON_SLICES(SliceZ0ToThetaxD, 1, StatexD, W, THETA_SLICES, 1)
end
else begin
SliceZ0ToThetaxD = {SHARES*25{1'b0}};
end
if(CONNECT_ABSORB_CHI) begin
// ABSORB_SLICES == THETA_SLICES == CHI_SLICES
`CON_SLICES(SlicesToThetaxD, THETA_SLICES, SlicesFromChixD, CHI_SLICES, 0, THETA_SLICES)
if(ctrl_enable_absorb_theta) begin
`CON_ABSORB_XOR(SlicesToThetaxD, THETA_SLICES, SlicesFromChixD, CHI_SLICES, THETA_SLICES)
end
end
else if(ABSORB_ITERATIVE) begin
// Absorption and Theta step concurrent in slice based absorption
`CON_SLICES(SlicesToThetaxD, THETA_SLICES, StatexD, W, 0, THETA_SLICES)
if(ctrl_enable_absorb_theta) begin
`CON_ABSORB_XOR(SlicesToThetaxD, THETA_SLICES, StatexD, W, THETA_SLICES)
end
end
else begin
`CON_SLICES(SlicesToThetaxD, THETA_SLICES, StatexD, W, 0, THETA_SLICES)
end
end
// Rho/Pi input selection depending on whether the linear steps are chained
// combinationally or fed from the state register.
always @(*) begin : RHO_PI_CONNECT
integer i, x, y;
if(RHO_PI_ITERATIVE) begin
`CON_SLICES(SlicesToPixD, PI_SLICES, StatexD, W, 0, PI_SLICES)
end
else if(!RHO_PI_ITERATIVE && !THETA_ITERATIVE && !ABSORB_ITERATIVE) begin
// Linear steps not iterative -> chain Theta+Rho+Pi steps together
`CON_SLICES(StateToRhoPixD, W, SlicesFromThetaxD, W, 0, W)
end
else begin
StateToRhoPixD = StatexD;
end
end
// Chi input selection. For masked (SHARES > 1) or iterative Chi, the input
// must come from a register stage (state or Pi), never combinationally
// through the whole lambda chain.
always @(*) begin : CHI_CONNECT
integer i, x, y;
//`CON_SLICES(dst, dst_len, src, src_len, start_slice, amount)
// NOTE: The non-completeness property would not be fulfilled
// if chi(pi(rho(theta(STATE)))) is performed combinationally
if(CHI_IOTA_ITERATIVE || (SHARES > 1)) begin
if(RHO_PI_ITERATIVE) begin
`CON_SLICES(SlicesToChixD, CHI_SLICES, SlicesFromPixD, PI_SLICES, 0, CHI_SLICES)
end
else begin
`CON_SLICES(SlicesToChixD, CHI_SLICES, StatexD, W, 0, CHI_SLICES)
end
end
else begin
`CON_SLICES(SlicesToChixD, CHI_SLICES, StateFromRhoPixD, W, 0, CHI_SLICES)
end
end
//-----------------------------------------------------------------------------
// Control path
// Sequences rounds/steps and emits the round constant; the ctrl_* wires are
// unpacked from the StateCtrlxSO bundle in the order listed below.
keccak_control #(
.RATE(RATE),
.W(W),
.SHARES(SHARES),
.ABSORB_LANES(ABSORB_LANES),
.RESET_ITERATIVE(RESET_ITERATIVE),
.ABSORB_ITERATIVE(ABSORB_ITERATIVE),
.THETA_ITERATIVE(THETA_ITERATIVE),
.RHO_PI_ITERATIVE(RHO_PI_ITERATIVE),
.CHI_IOTA_ITERATIVE(CHI_IOTA_ITERATIVE),
.SLICES_PARALLEL(SLICES_PARALLEL),
.ABSORB_SLICES(ABSORB_SLICES),
.THETA_SLICES(THETA_SLICES),
.CHI_SLICES(CHI_SLICES),
.CHI_DOUBLE_CLK(CHI_DOUBLE_CLK),
.CONNECT_ABSORB_CHI(CONNECT_ABSORB_CHI),
.DOM_PIPELINE(DOM_PIPELINE)
) KECCAK_CONTROL (
.ClkxCI(ClkxCI),
.RstxRBI(RstxRBI),
.StartAbsorbxSI(StartAbsorbxSI),
.StartSqueezexSI(StartSqueezexSI),
.RandomnessAvailablexSI(RandomnessAvailablexSI),
.ReadyxSO(ReadyxSO),
.IotaRCxDO(IotaRCxD),
.StateCtrlxSO( {ctrl_enable_lane,
ctrl_enable_absorb,
ctrl_enable_lambda,
ctrl_enable_theta,
ctrl_enable_rhopi,
ctrl_enable_chi_iota,
ctrl_theta_last,
ctrl_enable_absorb_theta,
ctrl_enable_DOM_ff,
ctrl_reset_state } )
);
endmodule