-
-
Notifications
You must be signed in to change notification settings - Fork 357
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #542 from Gathros/fft_x64
Adding FFT in X86_64
- Loading branch information
Showing
2 changed files
with
405 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,399 @@ | ||
.intel_syntax noprefix | ||
|
||
.section .rodata | ||
two: .double 2.0 | ||
one: .double 1.0 | ||
two_pi: .double -6.28318530718 | ||
rand_max: .long 4290772992 | ||
.long 1105199103 | ||
fmt: .string "%g\n" | ||
|
||
.section .text | ||
.global main | ||
.extern printf, memset, memcpy, srand, rand, time, cexp, __muldc3, cabs, log2 | ||
|
||
# rdi - array ptr | ||
# rsi - array size | ||
dft: | ||
push rbx | ||
push r12 | ||
push r13 | ||
push r14 | ||
push r15 | ||
mov r12, rdi # Save parameters | ||
mov r13, rsi | ||
sub rsp, r13 # Make a double complex array | ||
xor r14, r14 # Set index to 0 | ||
dft_loop_i: | ||
cmp r14, r13 # Check if index is equal to array size | ||
je dft_end_i | ||
lea rax, [rsp + r14] # Set tmp array to zero at r14 | ||
mov QWORD PTR [rax], 0 | ||
mov QWORD PTR [rax + 8], 0 | ||
xor r15, r15 # Set second index to 0 | ||
dft_loop_j: | ||
cmp r15, r13 # Check if the index is equal to array size | ||
je dft_end_j | ||
movsd xmm1, two_pi # Calculate xmm1 = -2pi * i * j / N | ||
mov rax, r14 | ||
imul rax, r15 | ||
shr rax, 4 | ||
cvtsi2sdq xmm2, rax | ||
mulsd xmm1, xmm2 | ||
cvtsi2sdq xmm2, r13 | ||
divsd xmm1, xmm2 | ||
pxor xmm0, xmm0 # Set xmm0 to 0 | ||
call cexp | ||
lea rax, [r12 + r15] # Calculate X[i] * cexp(-2pi * i * j / N) | ||
movsd xmm2, QWORD PTR [rax] | ||
movsd xmm3, QWORD PTR [rax + 8] | ||
call __muldc3 | ||
lea rax, [rsp + r14] | ||
movsd xmm6, QWORD PTR [rax] # Sum to tmp array | ||
movsd xmm7, QWORD PTR [rax + 8] | ||
addsd xmm6, xmm0 | ||
addsd xmm7, xmm1 | ||
movsd QWORD PTR [rax], xmm6 # Save to tmp array | ||
movsd QWORD PTR [rax + 8], xmm7 | ||
add r15, 16 | ||
jmp dft_loop_j | ||
dft_end_j: | ||
add r14, 16 | ||
jmp dft_loop_i | ||
dft_end_i: | ||
mov rdi, r12 # Move tmp array to array ptr | ||
mov rsi, rsp | ||
mov rdx, r13 | ||
call memcpy | ||
add rsp, r13 | ||
pop r15 | ||
pop r14 | ||
pop r13 | ||
pop r12 | ||
pop rbx | ||
ret | ||
|
||
# rdi - array ptr | ||
# rsi - array size | ||
cooley_tukey: | ||
cmp rsi, 16 # Check if size if greater then 1 | ||
jle cooley_tukey_return | ||
push rbx | ||
push r12 | ||
push r13 | ||
push r14 | ||
push r15 | ||
mov r12, rdi # Save parameters | ||
mov r13, rsi | ||
mov r14, rsi # Save N / 2 | ||
shr r14, 1 | ||
sub rsp, r14 # Make a tmp array | ||
xor r15, r15 | ||
mov rbx, r12 | ||
cooley_tukey_spliting: | ||
cmp r15, r14 | ||
je cooley_tukey_split | ||
lea rax, [r12 + 2 * r15] # Moving all odd entries to the front of the array | ||
movaps xmm0, XMMWORD PTR [rax + 16] | ||
movaps xmm1, XMMWORD PTR [rax] | ||
movaps XMMWORD PTR [rsp + r15], xmm0 | ||
movaps XMMWORD PTR [rbx], xmm1 | ||
add rbx, 16 | ||
add r15, 16 | ||
jmp cooley_tukey_spliting | ||
cooley_tukey_split: | ||
mov rax, rsp | ||
lea rdi, [r12 + r13] | ||
cooley_tukey_mov_data: | ||
cmp rbx, rdi | ||
je cooley_tukey_moved | ||
movaps xmm0, XMMWORD PTR [rax] | ||
movaps XMMWORD PTR [rbx], xmm0 | ||
add rbx, 16 | ||
add rax, 16 | ||
jmp cooley_tukey_mov_data | ||
cooley_tukey_moved: | ||
add rsp, r14 | ||
mov rdi, r12 # Makking a recursive call | ||
mov rsi, r14 | ||
call cooley_tukey | ||
lea rdi, [r12 + r14] # Makking a recursive call | ||
mov rsi, r14 | ||
call cooley_tukey | ||
lea rbx, [r12 + r14] | ||
mov r14, rbx | ||
mov r15, r12 | ||
cooley_tukey_loop: | ||
cmp r15, rbx | ||
je cooley_tukey_end | ||
pxor xmm0, xmm0 # Calculate cexp(-2.0 * I * M_PI * i / N) | ||
movsd xmm1, two_pi | ||
mov rax, r14 | ||
sub rax, rbx | ||
cvtsi2sdq xmm2, rax | ||
cvtsi2sdq xmm3, r13 | ||
divsd xmm2, xmm3 | ||
mulsd xmm1, xmm2 | ||
call cexp | ||
movq xmm2, QWORD PTR [r14] # Calculating X[i] - cexp() * X[i + N / 2] | ||
movq xmm3, QWORD PTR [r14 + 8] | ||
call __muldc3 | ||
movq xmm2, QWORD PTR [r15] | ||
movq xmm3, QWORD PTR [r15 + 8] | ||
subsd xmm2, xmm0 | ||
subsd xmm3, xmm1 | ||
movq QWORD PTR [r14], xmm2 # Save value in X[i + N / 2] | ||
movq QWORD PTR [r14 + 8], xmm3 | ||
movq xmm0, QWORD PTR [r15] # Calculating X[i] -= X[i + N / 2] - X[i] | ||
movq xmm1, QWORD PTR [r15 + 8] | ||
subsd xmm2, xmm0 | ||
subsd xmm3, xmm1 | ||
subsd xmm0, xmm2 | ||
subsd xmm1, xmm3 | ||
movq QWORD PTR [r15], xmm0 | ||
movq QWORD PTR [r15 + 8], xmm1 | ||
add r14, 16 | ||
add r15, 16 | ||
jmp cooley_tukey_loop | ||
cooley_tukey_end: | ||
pop r15 | ||
pop r14 | ||
pop r13 | ||
pop r12 | ||
pop rbx | ||
cooley_tukey_return: | ||
ret | ||
|
||
# rdi - array ptr | ||
# rsi - array size | ||
bit_reverse: | ||
push rbx | ||
push r12 | ||
push r13 | ||
push r14 | ||
push r15 | ||
mov r12, rdi # Save parameters | ||
mov r13, rsi | ||
shr r13, 4 | ||
xor r14, r14 # Loop through all entries | ||
bit_reverse_entries: | ||
cmp r14, r13 | ||
je bit_reverse_return | ||
cvtsi2sdq xmm0, r13 # Calculating the number of bit in N | ||
call log2 | ||
cvttsd2si rcx, xmm0 | ||
mov rdi, 1 # Calculating (1 << log2(N)) - 1 | ||
sal edi, cl | ||
sub edi, 1 | ||
sub ecx, 1 | ||
mov rax, r14 | ||
mov r15, r14 | ||
bit_reverse_loop: | ||
sar r15 # Check if r15 is 0 | ||
je bit_reverse_reversed | ||
sal rax, 1 # Calculating (rax << 1) | (r15 & 1) | ||
mov rsi, r15 | ||
and rsi, 1 | ||
or rax, rsi | ||
sub ecx, 1 # Decrement bit count | ||
jmp bit_reverse_loop | ||
bit_reverse_reversed: | ||
sal eax, cl # Calculate (rax << rcx) & (1 << bit count) | ||
and rax, rdi | ||
cmp rax, r14 # Check if rax is greater then r14 | ||
jle bit_reverse_no_swap # If so then swap entries | ||
shl rax, 4 # Times index by 16 to get bytes to entry | ||
shl r14, 4 | ||
movaps xmm0, XMMWORD PTR [r12 + rax] | ||
movaps xmm1, XMMWORD PTR [r12 + r14] | ||
movaps XMMWORD PTR [r12 + rax], xmm1 | ||
movaps XMMWORD PTR [r12 + r14], xmm0 | ||
shr r14, 4 | ||
bit_reverse_no_swap: | ||
add r14, 1 | ||
jmp bit_reverse_entries | ||
bit_reverse_return: | ||
pop r15 | ||
pop r14 | ||
pop r13 | ||
pop r12 | ||
pop rbx | ||
ret | ||
|
||
# rdi - array ptr | ||
# rsi - array size | ||
iterative_cooley_tukey: | ||
push r12 | ||
push r13 | ||
push r14 | ||
push r15 | ||
push rbx | ||
sub rsp, 48 | ||
mov r12, rdi | ||
mov r13, rsi | ||
call bit_reverse # Bit reversing array | ||
sar r13, 4 # Calculate log2(N) | ||
cvtsi2sdq xmm0, r13 | ||
call log2 | ||
cvttsd2si rax, xmm0 | ||
mov QWORD PTR [rsp], rax # Save it to the stack | ||
mov r14, 1 | ||
iter_ct_loop_i: | ||
cmp r14, rax # Check if r14 is greater then log2(N) | ||
jg iter_ct_end_i | ||
movsd xmm0, two # Calculate stride = 2^(r14) | ||
cvtsi2sdq xmm1, r14 | ||
call pow | ||
cvttsd2si r10, xmm0 | ||
mov QWORD PTR [rsp + 40], r10# move stride to stack | ||
movsd xmm1, two_pi # Calculating cexp(-2pi * I / stride) | ||
divsd xmm1, xmm0 | ||
pxor xmm0, xmm0 | ||
call cexp | ||
movq QWORD PTR [rsp + 8], xmm0 # Save it to stack | ||
movq QWORD PTR [rsp + 16], xmm1 | ||
xor r15, r15 | ||
iter_ct_loop_j: | ||
cmp r15, r13 # Check if r15 is less then array size | ||
je iter_ct_end_j | ||
movsd xmm4, one # Save 1 + 0i to stack | ||
pxor xmm5, xmm5 | ||
movsd QWORD PTR [rsp + 24], xmm4 | ||
movsd QWORD PTR [rsp + 32], xmm5 | ||
xor rbx, rbx | ||
mov rax, QWORD PTR [rsp + 40]# Calculate stride / 2 | ||
sar rax, 1 | ||
iter_ct_loop_k: | ||
cmp rbx, rax # Check if rbx is less then stride / 2 | ||
je iter_ct_end_k | ||
mov r8, r15 # Saving pointers to X[k + j + stride / 2] and X[k + j] | ||
add r8, rbx | ||
sal r8, 4 | ||
mov r9, QWORD PTR [rsp + 40] | ||
sal r9, 3 | ||
add r9, r8 | ||
lea r9, [r12 + r9] | ||
lea r8, [r12 + r8] | ||
movsd xmm0, QWORD PTR [r9] # Calculate X[k + j] - v * X[k + j + stride / 2] | ||
movsd xmm1, QWORD PTR [r9 + 8] | ||
movsd xmm2, QWORD PTR [rsp + 24] | ||
movsd xmm3, QWORD PTR [rsp + 32] | ||
call __muldc3 | ||
movsd xmm2, QWORD PTR [r8] | ||
movsd xmm3, QWORD PTR [r8 + 8] | ||
subsd xmm2, xmm0 | ||
subsd xmm3, xmm1 | ||
movsd QWORD PTR [r9], xmm2 # Saving answer | ||
movsd QWORD PTR [r9 + 8], xmm3 | ||
movsd xmm0, QWORD PTR [r8] # Calculating X[k + j] - (X[k + j + stride / 2] - X[k + j]) | ||
movsd xmm1, QWORD PTR [r8 + 8] | ||
subsd xmm2, xmm0 | ||
subsd xmm3, xmm1 | ||
subsd xmm0, xmm2 | ||
subsd xmm1, xmm3 | ||
movsd QWORD PTR [r8], xmm0 # Saving answer | ||
movsd QWORD PTR [r8 + 8], xmm1 | ||
movsd xmm0, QWORD PTR [rsp + 24] # Calculating v * w | ||
movsd xmm1, QWORD PTR [rsp + 32] | ||
movsd xmm2, QWORD PTR [rsp + 8] | ||
movsd xmm3, QWORD PTR [rsp + 16] | ||
call __muldc3 | ||
movsd QWORD PTR [rsp + 24], xmm0 # Saving answer | ||
movsd QWORD PTR [rsp + 32], xmm1 | ||
add rbx, 1 | ||
mov rax, QWORD PTR [rsp + 40] | ||
sar rax, 1 | ||
jmp iter_ct_loop_k | ||
iter_ct_end_k: | ||
add r15, QWORD PTR [rsp + 40] | ||
jmp iter_ct_loop_j | ||
iter_ct_end_j: | ||
add r14, 1 | ||
mov rax, QWORD PTR [rsp] | ||
jmp iter_ct_loop_i | ||
iter_ct_end_i: | ||
add rsp, 48 | ||
pop rbx | ||
pop r15 | ||
pop r14 | ||
pop r13 | ||
pop r12 | ||
ret | ||
|
||
# rdi - array a ptr | ||
# rsi - array b ptr | ||
# rdx - array size | ||
approx: | ||
push r12 | ||
push r13 | ||
push r14 | ||
push r15 | ||
mov r12, rdi | ||
mov r13, rsi | ||
mov r14, rdx | ||
lea r15, [rdi + rdx] | ||
sub rsp, 8 | ||
approx_loop: | ||
cmp r12, r15 | ||
je approx_return | ||
movsd xmm0, QWORD PTR[r13] | ||
movsd xmm1, QWORD PTR[r13 + 8] | ||
call cabs | ||
movsd QWORD PTR [rsp], xmm0 | ||
movsd xmm0, QWORD PTR[r12] | ||
movsd xmm1, QWORD PTR[r12 + 8] | ||
call cabs | ||
movsd xmm1, QWORD PTR [rsp] | ||
subsd xmm0, xmm1 | ||
mov rdi, OFFSET fmt | ||
mov rax, 1 | ||
call printf | ||
add r12, 16 | ||
add r13, 16 | ||
jmp approx_loop | ||
approx_return: | ||
add rsp, 8 | ||
pop r15 | ||
pop r14 | ||
pop r13 | ||
pop r12 | ||
ret | ||
|
||
main: | ||
push r12 | ||
sub rsp, 2048 | ||
mov rdi, 0 | ||
call time | ||
mov edi, eax | ||
call srand | ||
lea r12, [rsp + 1024] | ||
loop: | ||
cmp r12, rsp | ||
je end_loop | ||
sub r12, 16 | ||
call rand | ||
cvtsi2sd xmm0, rax | ||
divsd xmm0, rand_max | ||
lea rax, [r12 + 1024] | ||
movsd QWORD PTR [r12], xmm0 | ||
movsd QWORD PTR [rax], xmm0 | ||
mov QWORD PTR [r12 + 8], 0 | ||
mov QWORD PTR [rax + 8], 0 | ||
jmp loop | ||
end_loop: | ||
mov rdi, rsp | ||
mov rsi, 1024 | ||
call iterative_cooley_tukey | ||
lea rdi, [rsp + 1024] | ||
mov rsi, 1024 | ||
call cooley_tukey | ||
mov rdi, rsp | ||
lea rsi, [rsp + 1024] | ||
mov rdx, 1024 | ||
call approx | ||
xor rax, rax | ||
add rsp, 2048 | ||
pop r12 | ||
ret | ||
|
||
|
Oops, something went wrong.