This repository has been archived by the owner on Aug 2, 2019. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 12
/
log.asm
557 lines (444 loc) · 16.1 KB
/
log.asm
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
;
; MIT License
; -----------
;
; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
;
; Permission is hereby granted, free of charge, to any person obtaining a copy
; of this Software and associated documentation files (the "Software"), to deal
; in the Software without restriction, including without limitation the rights
; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
; copies of the Software, and to permit persons to whom the Software is
; furnished to do so, subject to the following conditions:
;
; The above copyright notice and this permission notice shall be included in
; all copies or substantial portions of the Software.
;
; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
; THE SOFTWARE.
;
; log.asm
;
; An implementation of the log libm function.
;
; Prototype:
;
; double log(double x);
;
;
; Algorithm:
;
; Based on:
; Ping-Tak Peter Tang
; "Table-driven implementation of the logarithm function in IEEE
; floating-point arithmetic"
; ACM Transactions on Mathematical Software (TOMS)
; Volume 16, Issue 4 (December 1990)
;
;
; x very close to 1.0 is handled by a separate code path; for all other x
; a brief explanation is given below
;
; x = (2^m)*A
; x = (2^m)*(G+g) with (1 <= G < 2) and (g <= 2^(-9))
; x = (2^m)*2*(G/2+g/2)
; x = (2^m)*2*(F+f) with (0.5 <= F < 1) and (f <= 2^(-10))
;
; Y = (2^(-1))*(2^(-m))*(2^m)*A
; Now, range of Y is: 0.5 <= Y < 1
;
; F = 0x100 + (first 8 mantissa bits) + (9th mantissa bit)
; Now, range of F is: 256 <= F <= 512
; F = F / 512
; Now, range of F is: 0.5 <= F <= 1
;
; f = -(Y-F), with (f <= 2^(-10))
;
; log(x) = m*log(2) + log(2) + log(F-f)
; log(x) = m*log(2) + log(2) + log(F) + log(1-(f/F))
; log(x) = m*log(2) + log(2*F) + log(1-r)
;
; r = (f/F), with (r <= 2^(-9))
; r = f*(1/F) with (1/F) precomputed to avoid division
;
; log(x) = m*log(2) + log(G) - poly
;
; log(G) is precomputed
; poly = r + (r^2)/2 + (r^3)/3 + (r^4)/4 + (r^5)/5 + (r^6)/6
;
; log(2) and log(G) need to be maintained in extra precision
; to avoid losing precision in the calculations
;
; Read-only constants used by log().
; Every named DQ constant is followed by a zero quadword so that it occupies
; a full 16-byte slot; together with the ALIGN 16 below this lets the code
; use the same symbol both as a QWORD scalar and as an XMMWORD operand.
.const
ALIGN 16
; ---- IEEE-754 double special values and bit masks ----
__real_ninf DQ 0fff0000000000000h ; -inf (returned for log(+/-0))
DQ 0000000000000000h
__real_inf DQ 7ff0000000000000h ; +inf (also the exponent-field mask)
DQ 0000000000000000h
__real_neg_qnan DQ 0fff8000000000000h ; negative quiet NaN (returned for x < 0)
DQ 0000000000000000h
; bit 51: OR-ed into a NaN input before it is handed to the special-case helper
__real_qnanbit DQ 0008000000000000h
DQ 0000000000000000h
; bit pattern of the smallest normal double; not referenced by the visible code
__real_min_norm DQ 0010000000000000h
DQ 0000000000000000h
__real_mant DQ 000FFFFFFFFFFFFFh ; mantissa bits (bits 0..51)
DQ 0000000000000000h
; integer 1023 (exponent bias), subtracted from the raw exponent field
__mask_1023 DQ 00000000000003ffh
DQ 0000000000000000h
; integer 1; not referenced by the visible code
__mask_001 DQ 0000000000000001h
DQ 0000000000000000h
; bits 44..51: the eight most significant mantissa bits (table index)
__mask_mant_all8 DQ 000ff00000000000h
DQ 0000000000000000h
; bit 43: the ninth mantissa bit, used to round the table index
__mask_mant9 DQ 0000080000000000h
DQ 0000000000000000h
; ---- floating-point constants ----
__real_two DQ 4000000000000000h ; 2
DQ 0000000000000000h
__real_one DQ 3ff0000000000000h ; 1
DQ 0000000000000000h
; the next two bounds are not referenced by the visible code; the near-one
; test is done with __real_threshold instead
__real_near_one_lt DQ 3fee000000000000h ; .9375
DQ 0000000000000000h
__real_near_one_gt DQ 3ff1000000000000h ; 1.0625
DQ 0000000000000000h
; 0.5 as a double; its bit pattern is OR-ed onto mantissa bits to build F and Y
__real_half DQ 3fe0000000000000h ; 1/2
DQ 0000000000000000h
; integer 0x100; not referenced by the visible code
__mask_100 DQ 0000000000000100h
DQ 0000000000000000h
; 2^-9 as a double; not referenced by the visible code
__real_1_over_512 DQ 3f60000000000000h
DQ 0000000000000000h
; coefficients 1/2 .. 1/6 of poly = r + r^2/2 + r^3/3 + r^4/4 + r^5/5 + r^6/6
__real_1_over_2 DQ 3fe0000000000000h
DQ 0000000000000000h
__real_1_over_3 DQ 3fd5555555555555h
DQ 0000000000000000h
__real_1_over_4 DQ 3fd0000000000000h
DQ 0000000000000000h
__real_1_over_5 DQ 3fc999999999999ah
DQ 0000000000000000h
__real_1_over_6 DQ 3fc5555555555555h
DQ 0000000000000000h
; -1023.0 as a double: the converted exponent of an input whose exponent
; field is zero (denormal or zero); used to detect that case
__mask_1023_f DQ 0c08ff80000000000h
DQ 0000000000000000h
; integer 2045, subtracted from the exponent field of the renormalised
; mantissa to recover the exponent of a denormal input
__mask_2045 DQ 00000000000007fdh
DQ 0000000000000000h
; |x - 1| below this value selects the near-one code path
__real_threshold DQ 3fb0000000000000h ; .0625
DQ 0000000000000000h
__real_notsign DQ 7ffFFFFFFFFFFFFFh ; ^sign bit (AND mask computing fabs)
DQ 0000000000000000h
; coefficients of u^3, u^5, u^7 and u^9 in the near-one polynomial
__real_ca1 DQ 3fb55555555554e6h ; 8.33333333333317923934e-02
DQ 0000000000000000h
__real_ca2 DQ 3f89999999bac6d4h ; 1.25000000037717509602e-02
DQ 0000000000000000h
__real_ca3 DQ 3f62492307f1519fh ; 2.23213998791944806202e-03
DQ 0000000000000000h
__real_ca4 DQ 3f3c8034c85dfff0h ; 4.34887777707614552256e-04
DQ 0000000000000000h
; log(2) split into a leading part and a small tail for extra precision
__real_log2_lead DQ 03fe62e42e0000000h ; 6.93147122859954833984e-01
DQ 00000000000000000h
__real_log2_tail DQ 03e6efa39ef35793ch ; 5.76999904754328540596e-08
DQ 00000000000000000h
; Codes passed in r8d to the special-case helper.
; these codes and the ones in the corresponding .c file have to match
__flag_x_zero DD 00000001
__flag_x_neg DD 00000002
__flag_x_nan DD 00000003
; Externally defined lookup tables, indexed by the rounded top mantissa bits:
; lead and tail parts of log(G), and the precomputed 1/F values.
EXTRN __log_256_lead:QWORD
EXTRN __log_256_tail:QWORD
EXTRN __log_F_inv_qword:QWORD
; Run-time switch: a non-zero value selects the FMA3 code path.
EXTRN __use_fma3_lib:DWORD
; Text macros so the procedure body can be written against generic names.
fname TEXTEQU <log>
fname_special TEXTEQU <_log_special>
; define local variable storage offsets
; save_xmm6 is the offset handed to the SaveXmm/RestoreXmm macros and
; stack_size the amount handed to StackAllocate/StackDeallocate.
; dummy_space is not referenced by the visible code.
save_xmm6 EQU 20h
dummy_space EQU 40h
stack_size EQU 58h
; NOTE(review): presumably provides the StackAllocate / SaveXmm / RestoreXmm /
; AVXRestoreXmm / StackDeallocate macros used by the procedure - confirm.
include fm.inc
; external function
EXTERN fname_special:PROC
.code
;-----------------------------------------------------------------------
; double log(double x)
;
; ABI:    Microsoft x64 style (PROC FRAME unwind data, xmm6 preserved,
;         third helper argument passed in r8d).
; In:     xmm0 = x
; Out:    xmm0 = log(x)
;           x = +inf        -> x returned unchanged
;           x = +/-0        -> fname_special(x, -inf,  __flag_x_zero)
;           x < 0 or -inf   -> fname_special(x, -qNaN, __flag_x_neg)
;           x = NaN         -> fname_special(x, x | quiet bit, __flag_x_nan)
;         NOTE(review): the helper is presumably (double x, double y,
;         unsigned code) returning the final value in xmm0 - confirm
;         against the corresponding .c file.
; Clobb:  rax, rcx, rdx, r8, r9, xmm1-xmm5, flags (plus whatever the
;         helper clobbers on the special-case paths). xmm6 is saved in
;         the prologue and restored on every exit path.
;
; Two implementations of the same algorithm follow; the one used is
; selected at run time by __use_fma3_lib (non-zero -> FMA3 path).
;
; Register roles on the main (table-driven) path:
;   rax  = raw bits of x, then the table index
;   xmm6 = m, the unbiased exponent of x as a double ("xexp")
;   xmm2 = Y (mantissa of x rescaled into [0.5, 1))
;   xmm1 = F (Y rounded to 8 mantissa bits), then f, r and poly
;-----------------------------------------------------------------------
ALIGN 16
PUBLIC fname
fname PROC FRAME
    StackAllocate stack_size
    SaveXmm xmm6, save_xmm6
    .ENDPROLOG

    cmp     DWORD PTR __use_fma3_lib, 0
    jne     Llog_fma3

;=======================================================================
; SSE2 implementation
;=======================================================================
Llog_sse2:
    ; compute exponent part
    movdqa  xmm3, xmm0
    movapd  xmm4, xmm0
    psrlq   xmm3, 52                            ; sign + biased exponent
    movd    rax, xmm0                           ; rax <-- raw bits of x
    psubq   xmm3, XMMWORD PTR __mask_1023       ; remove the exponent bias

    ; NaN or inf: clear the sign bit and compare the magnitude as an
    ; unsigned integer against the bit pattern of +inf
    mov     rcx, rax
    btr     rcx, 63
    cmp     rcx, QWORD PTR __real_inf
    jae     __x_is_inf_or_nan

    movdqa  xmm2, xmm0
    cvtdq2pd xmm6, xmm3                         ; xexp
    pand    xmm2, XMMWORD PTR __real_mant       ; xmm2 <-- mantissa bits of x
    subsd   xmm4, QWORD PTR __real_one          ; xmm4 <-- x - 1
    ; an exponent of exactly -1023 means the exponent field was zero
    comisd  xmm6, QWORD PTR __mask_1023_f
    je      __denormal_adjust

__continue_common:
    andpd   xmm4, XMMWORD PTR __real_notsign    ; xmm4 <-- |x - 1|

    ; compute index into the log tables:
    ; top 8 mantissa bits, rounded up by the 9th bit
    mov     r9, rax
    and     rax, QWORD PTR __mask_mant_all8
    and     r9, QWORD PTR __mask_mant9
    shl     r9, 1
    add     rax, r9
    movd    xmm1, rax                           ; rounded mantissa bits of F

    ; near one codepath
    comisd  xmm4, QWORD PTR __real_threshold
    jb      __near_one

    ; F, Y
    shr     rax, 44                             ; rax <-- table index
    por     xmm2, XMMWORD PTR __real_half       ; xmm2 <-- Y
    por     xmm1, XMMWORD PTR __real_half       ; xmm1 <-- F
    lea     r9, __log_F_inv_qword

    ; check for negative numbers or zero
    ; (the flags set here are still live at __x_is_zero_or_neg)
    xorpd   xmm5, xmm5
    comisd  xmm0, xmm5
    jbe     __x_is_zero_or_neg

    ; f = F - Y, r = f * inv
    subsd   xmm1, xmm2                          ; xmm1 <-- f = F - Y
    mulsd   xmm1, QWORD PTR [r9+rax*8]          ; xmm1 <-- r = f * inv
    movapd  xmm2, xmm1                          ; xmm2 <-- copy of r
    movapd  xmm0, xmm1                          ; xmm0 <-- copy of r
    lea     r9, QWORD PTR __log_256_lead

    ; poly = r + r^2/2 + r^3/3 + r^4/4 + r^5/5 + r^6/6
    movsd   xmm3, QWORD PTR __real_1_over_6
    movsd   xmm1, QWORD PTR __real_1_over_3
    mulsd   xmm3, xmm2                          ; xmm3 <-- r/6
    mulsd   xmm1, xmm2                          ; xmm1 <-- r/3
    mulsd   xmm0, xmm2                          ; xmm0 <-- r*r
    movapd  xmm4, xmm0                          ; xmm4 <-- copy of r*r
    addsd   xmm3, QWORD PTR __real_1_over_5     ; xmm3 <-- r/6 + 1/5
    addsd   xmm1, QWORD PTR __real_1_over_2     ; xmm1 <-- r/3 + 1/2
    mulsd   xmm4, xmm0                          ; xmm4 <-- r^4
    mulsd   xmm3, xmm2                          ; xmm3 <-- (r/6 + 1/5)*r
    mulsd   xmm1, xmm0                          ; xmm1 <-- (r/3 + 1/2)*r^2
    addsd   xmm3, QWORD PTR __real_1_over_4     ; xmm3 <-- (r/6 + 1/5)*r + 1/4
    addsd   xmm1, xmm2                          ; xmm1 <-- (r/3 + 1/2)*r^2 + r
    mulsd   xmm3, xmm4                          ; xmm3 <-- ((r/6+1/5)*r+1/4)*r^4
    addsd   xmm1, xmm3                          ; xmm1 <-- poly

    ; m*log(2)_tail + log(G)_tail - poly
    ; the small terms are summed first and added to the lead terms last
    movsd   xmm5, QWORD PTR __real_log2_tail
    mulsd   xmm5, xmm6                          ; xmm5 <-- m*log2_tail
    subsd   xmm5, xmm1                          ; xmm5 <-- m*log2_tail - poly
    movsd   xmm0, QWORD PTR [r9+rax*8]          ; xmm0 <-- log(G)_lead
    lea     rdx, QWORD PTR __log_256_tail
    movsd   xmm2, QWORD PTR [rdx+rax*8]         ; xmm2 <-- log(G)_tail
    addsd   xmm2, xmm5                          ; xmm2 <-- (m*log2_tail - poly) + log(G)_tail
    movsd   xmm4, QWORD PTR __real_log2_lead
    mulsd   xmm4, xmm6                          ; xmm4 <-- m*log2_lead
    addsd   xmm0, xmm4                          ; xmm0 <-- m*log2_lead + log(G)_lead
    addsd   xmm0, xmm2                          ; xmm0 <-- result = lead sum + tail sum

    RestoreXmm xmm6, save_xmm6
    StackDeallocate stack_size
    ret

ALIGN 16
__near_one:
    ; |x - 1| < threshold: evaluate
    ;   log(x) = r + (ca1*u^3 + ca2*u^5 + ca3*u^7 + ca4*u^9 - correction)
    ; with r = x - 1, u = 2*r/(2+r), correction = r * r/(2+r)
    ; r = x - 1.0
    movsd   xmm2, QWORD PTR __real_two
    subsd   xmm0, QWORD PTR __real_one          ; r
    addsd   xmm2, xmm0                          ; 2 + r
    movsd   xmm1, xmm0
    divsd   xmm1, xmm2                          ; r/(2+r) = u/2
    movsd   xmm4, QWORD PTR __real_ca2
    movsd   xmm5, QWORD PTR __real_ca4
    movsd   xmm6, xmm0
    mulsd   xmm6, xmm1                          ; correction
    addsd   xmm1, xmm1                          ; u
    movsd   xmm2, xmm1
    mulsd   xmm2, xmm1                          ; u^2
    mulsd   xmm4, xmm2                          ; ca2*u^2
    mulsd   xmm5, xmm2                          ; ca4*u^2
    addsd   xmm4, QWORD PTR __real_ca1          ; ca1 + ca2*u^2
    addsd   xmm5, QWORD PTR __real_ca3          ; ca3 + ca4*u^2
    mulsd   xmm2, xmm1                          ; u^3
    mulsd   xmm4, xmm2                          ; (ca1 + ca2*u^2)*u^3
    mulsd   xmm2, xmm2                          ; u^6
    mulsd   xmm2, xmm1                          ; u^7
    mulsd   xmm5, xmm2                          ; (ca3 + ca4*u^2)*u^7
    addsd   xmm4, xmm5                          ; odd polynomial in u
    subsd   xmm4, xmm6                          ; ... - correction
    addsd   xmm0, xmm4                          ; xmm0 <-- result

    RestoreXmm xmm6, save_xmm6
    StackDeallocate stack_size
    ret

ALIGN 16
__denormal_adjust:
    ; Exponent field of x is zero. Renormalise the mantissa: (1.mant - 1.0)
    ; is a normal double whose mantissa/exponent replace those of x.
    por     xmm2, XMMWORD PTR __real_one
    subsd   xmm2, QWORD PTR __real_one
    movsd   xmm5, xmm2
    pand    xmm2, XMMWORD PTR __real_mant       ; xmm2 <-- renormalised mantissa
    movd    rax, xmm2                           ; rax is only used for index bits
    psrlq   xmm5, 52
    psubd   xmm5, XMMWORD PTR __mask_2045       ; corrected exponent
    cvtdq2pd xmm6, xmm5                         ; xexp
    jmp     __continue_common

ALIGN 16
__x_is_zero_or_neg:
    ; flags come from "comisd xmm0, 0.0": not-equal here means x < 0
    jne     __x_is_neg

    movsd   xmm1, QWORD PTR __real_ninf
    mov     r8d, DWORD PTR __flag_x_zero
    call    fname_special
    jmp     __finish

ALIGN 16
__x_is_neg:
    movsd   xmm1, QWORD PTR __real_neg_qnan
    mov     r8d, DWORD PTR __flag_x_neg
    call    fname_special
    jmp     __finish

ALIGN 16
__x_is_inf_or_nan:
    cmp     rax, QWORD PTR __real_inf
    je      __finish                            ; log(+inf) = +inf, xmm0 untouched

    cmp     rax, QWORD PTR __real_ninf
    je      __x_is_neg                          ; -inf is treated like x < 0

    ; NaN: pass the input with its quiet bit set as the second argument
    or      rax, QWORD PTR __real_qnanbit
    movd    xmm1, rax
    mov     r8d, DWORD PTR __flag_x_nan
    call    fname_special
    jmp     __finish

ALIGN 16
__finish:
    RestoreXmm xmm6, save_xmm6
    StackDeallocate stack_size
    ret

;=======================================================================
; FMA3 implementation (same algorithm, VEX-encoded with fused
; multiply-adds)
;=======================================================================
ALIGN 16
Llog_fma3:
    ; compute exponent part
    vpsrlq  xmm3, xmm0, 52
    vmovq   rax, xmm0                           ; rax <-- raw bits of x
    vpsubq  xmm3, xmm3, QWORD PTR __mask_1023
    vcvtdq2pd xmm6, xmm3                        ; xexp

    ; NaN or inf: exponent field all ones
    vpand   xmm5, xmm0, QWORD PTR __real_inf
    vcomisd xmm5, QWORD PTR __real_inf
    je      Llog_fma3_x_is_inf_or_nan

    ; check for negative numbers or zero
    ; (the flags set here are still live at Llog_fma3_x_is_zero_or_neg)
    vpxor   xmm5, xmm5, xmm5
    vcomisd xmm0, xmm5
    jbe     Llog_fma3_x_is_zero_or_neg

    vpand   xmm2, xmm0, QWORD PTR __real_mant   ; xmm2 <-- mantissa bits of x
    vsubsd  xmm4, xmm0, QWORD PTR __real_one    ; xmm4 <-- x - 1
    ; an exponent of exactly -1023 means the exponent field was zero
    vcomisd xmm6, QWORD PTR __mask_1023_f
    je      Llog_fma3_denormal_adjust

Llog_fma3_continue_common:
    ; compute index into the log tables:
    ; top 8 mantissa bits, rounded up by the 9th bit
    vpand   xmm1, xmm0, QWORD PTR __mask_mant_all8
    vpand   xmm3, xmm0, QWORD PTR __mask_mant9
    vpsllq  xmm3, xmm3, 1
    vpaddq  xmm1, xmm3, xmm1
    vmovq   rax, xmm1

    ; near one codepath
    vpand   xmm4, xmm4, QWORD PTR __real_notsign ; xmm4 <-- |x - 1|
    vcomisd xmm4, QWORD PTR __real_threshold
    jb      Llog_fma3_near_one

    ; F,Y
    shr     rax, 44                             ; rax <-- table index
    vpor    xmm2, xmm2, QWORD PTR __real_half   ; xmm2 <-- Y
    vpor    xmm1, xmm1, QWORD PTR __real_half   ; xmm1 <-- F
    lea     r9, QWORD PTR __log_F_inv_qword

    ; f = F - Y,r = f * inv
    vsubsd  xmm1, xmm1, xmm2
    vmulsd  xmm1, xmm1, QWORD PTR [r9 + rax*8]

    lea     r9, QWORD PTR __log_256_lead

    ; poly = r + r^2/2 + r^3/3 + r^4/4 + r^5/5 + r^6/6
    vmulsd  xmm0, xmm1, xmm1                    ; r*r
    vmovsd  xmm3, QWORD PTR __real_1_over_6
    vmovsd  xmm5, QWORD PTR __real_1_over_3
    vfmadd213sd xmm3, xmm1, QWORD PTR __real_1_over_5 ; r*1/6 + 1/5
    vfmadd213sd xmm5, xmm1, QWORD PTR __real_1_over_2 ; 1/2+r*1/3
    vfmadd213sd xmm3, xmm1, QWORD PTR __real_1_over_4 ; 1/4+(1/5*r+r*r*1/6)
    vmulsd  xmm4, xmm0, xmm0                    ; r*r*r*r
    vfmadd231sd xmm1, xmm5, xmm0                ; r*r*(1/2+r*1/3) + r
    vfmadd231sd xmm1, xmm3, xmm4                ; xmm1 <-- poly

    ; m*log(2) + log(G) - poly
    ; the small terms are summed first and added to the lead terms last
    vmovsd  xmm5, QWORD PTR __real_log2_tail
    vfmsub213sd xmm5, xmm6, xmm1                ; m*log2_tail - poly
    vmovsd  xmm0, QWORD PTR [r9 + rax*8]        ; log(G)_lead
    lea     rdx, QWORD PTR __log_256_tail
    vmovsd  xmm1, QWORD PTR [rdx + rax*8]       ; log(G)_tail
    vaddsd  xmm1, xmm1, xmm5                    ; tail sum
    vfmadd231sd xmm0, xmm6, QWORD PTR __real_log2_lead ; m*log2_lead + log(G)_lead
    vaddsd  xmm0, xmm0, xmm1                    ; xmm0 <-- result

    AVXRestoreXmm xmm6, save_xmm6
    StackDeallocate stack_size
    ret

ALIGN 16
Llog_fma3_near_one:
    ; |x - 1| < threshold: evaluate
    ;   log(x) = r + (ca1*u^3 + ca2*u^5 + ca3*u^7 + ca4*u^9 - correction)
    ; with r = x - 1, u = 2*r/(2+r), correction = r * r/(2+r)
    ; r = x - 1.0
    vmovsd  xmm3, QWORD PTR __real_two
    vsubsd  xmm0, xmm0, QWORD PTR __real_one    ; r
    vaddsd  xmm3, xmm3, xmm0                    ; 2 + r
    vdivsd  xmm1, xmm0, xmm3                    ; r/(2+r) = u/2
    vmovsd  xmm4, QWORD PTR __real_ca2
    vmovsd  xmm5, QWORD PTR __real_ca4
    vmulsd  xmm3, xmm0, xmm1                    ; correction
    vaddsd  xmm1, xmm1, xmm1                    ; u
    vmulsd  xmm2, xmm1, xmm1                    ; u^2
    vfmadd213sd xmm4, xmm2, QWORD PTR __real_ca1 ; ca1 + ca2*u^2
    vfmadd213sd xmm5, xmm2, QWORD PTR __real_ca3 ; ca3 + ca4*u^2
    vmulsd  xmm2, xmm2, xmm1                    ; u^3
    vmulsd  xmm4, xmm4, xmm2                    ; (ca1 + ca2*u^2)*u^3
    vmulsd  xmm2, xmm2, xmm2                    ; u^6
    vmulsd  xmm2, xmm2, xmm1                    ; u^7
    vfmadd231sd xmm4, xmm5, xmm2                ; + (ca3 + ca4*u^2)*u^7
    vsubsd  xmm4, xmm4, xmm3                    ; ... - correction
    vaddsd  xmm0, xmm0, xmm4                    ; xmm0 <-- result

    AVXRestoreXmm xmm6, save_xmm6
    StackDeallocate stack_size
    ret

Llog_fma3_denormal_adjust:
    ; Exponent field of x is zero (x > 0 was already established).
    ; Renormalise the mantissa: (1.mant - 1.0) is a normal double whose
    ; mantissa/exponent replace those of x.
    vpor    xmm2, xmm2, QWORD PTR __real_one
    vsubsd  xmm2, xmm2, QWORD PTR __real_one
    vpsrlq  xmm5, xmm2, 52
    vpand   xmm2, xmm2, QWORD PTR __real_mant   ; xmm2 <-- renormalised mantissa
    vmovapd xmm0, xmm2                          ; the index is taken from xmm0
    vpsubd  xmm5, xmm5, XMMWORD PTR __mask_2045 ; corrected exponent
    vcvtdq2pd xmm6, xmm5                        ; xexp
    jmp     Llog_fma3_continue_common

ALIGN 16
Llog_fma3_x_is_zero_or_neg:
    ; flags come from "vcomisd xmm0, 0.0": not-equal here means x < 0
    jne     Llog_fma3_x_is_neg
    vmovsd  xmm1, QWORD PTR __real_ninf
    mov     r8d, DWORD PTR __flag_x_zero
    call    fname_special

    AVXRestoreXmm xmm6, save_xmm6
    StackDeallocate stack_size
    ret

ALIGN 16
Llog_fma3_x_is_neg:
    vmovsd  xmm1, QWORD PTR __real_neg_qnan
    mov     r8d, DWORD PTR __flag_x_neg
    call    fname_special

    AVXRestoreXmm xmm6, save_xmm6
    StackDeallocate stack_size
    ret

ALIGN 16
Llog_fma3_x_is_inf_or_nan:
    cmp     rax, QWORD PTR __real_inf
    je      Llog_fma3_finish                    ; log(+inf) = +inf, xmm0 untouched

    cmp     rax, QWORD PTR __real_ninf
    je      Llog_fma3_x_is_neg                  ; -inf is treated like x < 0

    ; NaN: pass the input with its quiet bit set as the second argument
    or      rax, QWORD PTR __real_qnanbit
    vmovq   xmm1, rax
    mov     r8d, DWORD PTR __flag_x_nan
    call    fname_special
    ; execution continues into Llog_fma3_finish through the ALIGN padding

ALIGN 16
Llog_fma3_finish:
    AVXRestoreXmm xmm6, save_xmm6
    StackDeallocate stack_size
    ret

fname endp
END