Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding FFT in X86_64 #542

Merged
merged 4 commits into from
Dec 31, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
399 changes: 399 additions & 0 deletions contents/cooley_tukey/code/asm-x64/fft.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,399 @@
.intel_syntax noprefix

# Read-only constants shared by the routines in this file.
# All of them are loaded with scalar SSE instructions (movsd / divsd),
# so they are kept on an 8-byte boundary.
.section .rodata
    .balign 8
two:      .double 2.0                   # base handed to pow() when computing the stride
one:      .double 1.0                   # real part of the initial twiddle factor 1 + 0i
two_pi:   .double -6.283185307179586    # NOTE: this is MINUS two pi (forward-transform sign)
rand_max: .double 2147483647.0          # divisor applied to rand() in main; bit pattern
                                        # 0x41DFFFFFFFC00000. Presumably the libc RAND_MAX.
fmt:      .string "%g\n"                # printf format used by approx

.section .text
.global main
.extern printf, memset, memcpy, srand, rand, time, cexp, __muldc3, cabs, log2, pow

# dft: naive discrete Fourier transform, computed into a stack buffer and
# then copied back over the input array.
#
# ABI:  SysV AMD64
# In:   rdi - array ptr  (complex doubles: 8-byte real, then 8-byte imaginary)
#       rsi - array size IN BYTES (16 per element, so it must be a multiple of 16)
# Out:  the array at rdi is overwritten with its transform
# Regs: r12 = array ptr, r13 = size in bytes,
#       r14 = byte offset of output element i, r15 = byte offset of input element j
# Stack: allocates rsi bytes below the saved registers for the temporary result.
dft:
    push    rbx                         # never used as a variable: this fifth push is what
    push    r12                         # leaves rsp 16-byte aligned for the calls below
    push    r13
    push    r14
    push    r15
    mov     r12, rdi                    # Save parameters
    mov     r13, rsi
    sub     rsp, r13                    # tmp[] - one complex double per input element
    xor     r14d, r14d                  # i = 0 (as a byte offset)
.Ldft_loop_i:
    cmp     r14, r13                    # Done once the offset reaches the array size
    jae     .Ldft_end_i
    mov     QWORD PTR [rsp + r14], 0    # tmp[i] = 0 + 0i
    mov     QWORD PTR [rsp + r14 + 8], 0
    xor     r15d, r15d                  # j = 0 (as a byte offset)
.Ldft_loop_j:
    cmp     r15, r13
    jae     .Ldft_end_j
    # Angle = -2pi * i * j / N. Both offsets carry a factor of 16, so the
    # product is 256*i*j. Shifting right by 4 leaves 16*i*j, and dividing by
    # r13 (= 16*N) yields exactly i*j/N.
    movsd   xmm1, QWORD PTR [rip + two_pi]
    mov     rax, r14
    imul    rax, r15
    shr     rax, 4
    cvtsi2sd xmm2, rax
    mulsd   xmm1, xmm2
    cvtsi2sd xmm2, r13
    divsd   xmm1, xmm2
    pxor    xmm0, xmm0                  # Real part of the exponent is 0
    call    cexp                        # xmm0:xmm1 = cexp(0 + angle*I)
    movsd   xmm2, QWORD PTR [r12 + r15] # xmm2:xmm3 = X[j]
    movsd   xmm3, QWORD PTR [r12 + r15 + 8]
    call    __muldc3                    # xmm0:xmm1 = cexp(...) * X[j]
    addsd   xmm0, QWORD PTR [rsp + r14] # tmp[i] += product
    addsd   xmm1, QWORD PTR [rsp + r14 + 8]
    movsd   QWORD PTR [rsp + r14], xmm0
    movsd   QWORD PTR [rsp + r14 + 8], xmm1
    add     r15, 16
    jmp     .Ldft_loop_j
.Ldft_end_j:
    add     r14, 16
    jmp     .Ldft_loop_i
.Ldft_end_i:
    mov     rdi, r12                    # memcpy(X, tmp, size)
    mov     rsi, rsp
    mov     rdx, r13
    call    memcpy
    add     rsp, r13                    # Release tmp[]
    pop     r15
    pop     r14
    pop     r13
    pop     r12
    pop     rbx
    ret

# cooley_tukey: recursive radix-2 FFT, in place.
#
# ABI:  SysV AMD64
# In:   rdi - array ptr  (complex doubles: 8-byte real, then 8-byte imaginary)
#       rsi - array size IN BYTES (16 per element). The element count is
#             expected to be a power of two: each level halves the size and
#             walks it in 16-byte steps.
# Out:  the array at rdi is overwritten with its transform
# Regs: r12 = array ptr, r13 = size in bytes, r14 = half size in bytes
#       (later the odd-half cursor), r15 = loop offset / even-half cursor,
#       rbx = write cursor / start of the odd half
# Stack: size/2 bytes of scratch per recursion level, released before recursing.
cooley_tukey:
    cmp     rsi, 16                     # One element (or none): nothing to do
    jbe     .Lct_return
    push    rbx
    push    r12
    push    r13
    push    r14
    push    r15
    mov     r12, rdi                    # Save parameters
    mov     r13, rsi
    mov     r14, rsi                    # r14 = bytes in half the array
    shr     r14, 1
    sub     rsp, r14                    # Scratch buffer for the odd-indexed entries
    xor     r15d, r15d
    mov     rbx, r12                    # rbx = where the next even entry is written
.Lct_split:
    cmp     r15, r14
    je      .Lct_split_done
    # Even entries are packed to the front of the array, odd entries are
    # parked in the scratch buffer. Both are read before the write, and the
    # write position never gets ahead of the read position.
    movups  xmm0, XMMWORD PTR [r12 + 2*r15 + 16]   # X[2k + 1]
    movups  xmm1, XMMWORD PTR [r12 + 2*r15]        # X[2k]
    movups  XMMWORD PTR [rsp + r15], xmm0
    movups  XMMWORD PTR [rbx], xmm1
    add     rbx, 16
    add     r15, 16
    jmp     .Lct_split
.Lct_split_done:
    mov     rax, rsp                    # rax = read cursor in the scratch buffer
    lea     rdi, [r12 + r13]            # rdi = one past the end of the array
.Lct_copy_odds:
    cmp     rbx, rdi                    # rbx continues from the middle of the array
    je      .Lct_copied
    movups  xmm0, XMMWORD PTR [rax]
    movups  XMMWORD PTR [rbx], xmm0
    add     rbx, 16
    add     rax, 16
    jmp     .Lct_copy_odds
.Lct_copied:
    add     rsp, r14                    # Scratch no longer needed; rsp is aligned again
    mov     rdi, r12                    # Recurse on the even half
    mov     rsi, r14
    call    cooley_tukey
    lea     rdi, [r12 + r14]            # Recurse on the odd half
    mov     rsi, r14
    call    cooley_tukey
    lea     rbx, [r12 + r14]            # rbx = start of the odd half (fixed)
    mov     r14, rbx                    # r14 = cursor over X[k + N/2]
    mov     r15, r12                    # r15 = cursor over X[k]
.Lct_butterfly:
    cmp     r15, rbx
    je      .Lct_done
    # w = cexp(-2pi * I * k / N). r14 - rbx is 16*k and r13 is 16*N, so the
    # factors of 16 cancel in the division.
    pxor    xmm0, xmm0
    movsd   xmm1, QWORD PTR [rip + two_pi]
    mov     rax, r14
    sub     rax, rbx
    cvtsi2sd xmm2, rax
    cvtsi2sd xmm3, r13
    divsd   xmm2, xmm3
    mulsd   xmm1, xmm2
    call    cexp
    movsd   xmm2, QWORD PTR [r14]       # xmm0:xmm1 = w * X[k + N/2]
    movsd   xmm3, QWORD PTR [r14 + 8]
    call    __muldc3
    movsd   xmm2, QWORD PTR [r15]       # X[k + N/2] = X[k] - w * X[k + N/2]
    movsd   xmm3, QWORD PTR [r15 + 8]
    subsd   xmm2, xmm0
    subsd   xmm3, xmm1
    movsd   QWORD PTR [r14], xmm2
    movsd   QWORD PTR [r14 + 8], xmm3
    movsd   xmm0, QWORD PTR [r15]       # X[k] -= (X[k + N/2] - X[k])
    movsd   xmm1, QWORD PTR [r15 + 8]
    subsd   xmm2, xmm0
    subsd   xmm3, xmm1
    subsd   xmm0, xmm2
    subsd   xmm1, xmm3
    movsd   QWORD PTR [r15], xmm0
    movsd   QWORD PTR [r15 + 8], xmm1
    add     r14, 16
    add     r15, 16
    jmp     .Lct_butterfly
.Lct_done:
    pop     r15
    pop     r14
    pop     r13
    pop     r12
    pop     rbx
.Lct_return:
    ret

# bit_reverse: permute the array so that element i trades places with the
# element whose index is i with its log2(N) low bits reversed.
#
# ABI:  SysV AMD64
# In:   rdi - array ptr  (16-byte elements)
#       rsi - array size IN BYTES (N = rsi / 16; the mask logic assumes N is
#             a power of two)
# Out:  the array at rdi is permuted in place
# Regs: r12 = array ptr, r13 = N, r14 = current index i,
#       rbx = log2(N) - 1 (starting shift count), r15 = N - 1 style bit mask
# Scratch inside the loop (no calls there): rax, rcx, rdx, rsi, xmm0, xmm1
bit_reverse:
    push    rbx
    push    r12
    push    r13
    push    r14
    push    r15                         # Five pushes: rsp is 16-byte aligned for log2
    mov     r12, rdi                    # Save parameters
    mov     r13, rsi
    shr     r13, 4                      # r13 = N (element count)
    jz      .Lbr_return                 # Empty array: nothing to permute
    cvtsi2sd xmm0, r13                  # The bit width depends only on N, so it is
    call    log2                        # computed once, ahead of the loop
    cvttsd2si rbx, xmm0                 # rbx = (int)log2(N)
    mov     ecx, ebx
    mov     r15d, 1                     # r15 = (1 << log2(N)) - 1
    shl     r15d, cl
    sub     r15d, 1
    sub     ebx, 1                      # rbx = log2(N) - 1
    xor     r14d, r14d                  # i = 0
.Lbr_next_index:
    cmp     r14, r13
    je      .Lbr_return
    mov     ecx, ebx                    # ecx = shift count still owed
    mov     rax, r14                    # rax = accumulates the reversed bits
    mov     rdx, r14                    # rdx = bits of i not consumed yet
.Lbr_bits:
    shr     rdx, 1                      # Drop the lowest bit; stop when none remain
    jz      .Lbr_bits_done
    shl     rax, 1                      # rax = (rax << 1) | (rdx & 1)
    mov     rsi, rdx
    and     esi, 1
    or      rax, rsi
    sub     ecx, 1
    jmp     .Lbr_bits
.Lbr_bits_done:
    shl     eax, cl                     # Pad up to the full bit width, then mask
    and     rax, r15
    cmp     rax, r14                    # Swap only when reversed > i, so each pair
    jbe     .Lbr_no_swap                # is exchanged exactly once
    shl     rax, 4                      # Index -> byte offset
    mov     rdx, r14
    shl     rdx, 4
    movups  xmm0, XMMWORD PTR [r12 + rax]
    movups  xmm1, XMMWORD PTR [r12 + rdx]
    movups  XMMWORD PTR [r12 + rax], xmm1
    movups  XMMWORD PTR [r12 + rdx], xmm0
.Lbr_no_swap:
    add     r14, 1
    jmp     .Lbr_next_index
.Lbr_return:
    pop     r15
    pop     r14
    pop     r13
    pop     r12
    pop     rbx
    ret

# iterative_cooley_tukey: in-place radix-2 FFT. The array is bit-reversed
# first, then butterflies are applied stage by stage with the stride
# doubling each stage.
#
# ABI:  SysV AMD64
# In:   rdi - array ptr  (complex doubles: 8-byte real, then 8-byte imaginary)
#       rsi - array size IN BYTES (16 per element; element count is expected
#             to be a power of two)
# Out:  the array at rdi is overwritten with its transform
# Regs: r12 = array ptr, r13 = N (element count), r14 = stage number,
#       r15 = j (first index of the current group), rbx = k (index in the group)
# Frame (64 bytes, keeps rsp 16-byte aligned after the five pushes):
#       [rsp +  0]  log2(N)
#       [rsp +  8]  w, real part        w = cexp(-2pi * I / stride)
#       [rsp + 16]  w, imaginary part
#       [rsp + 24]  v, real part        v = running twiddle factor
#       [rsp + 32]  v, imaginary part
#       [rsp + 40]  stride
#       [rsp + 48]  &X[j + k]               spilled across __muldc3
#       [rsp + 56]  &X[j + k + stride/2]    spilled across __muldc3
iterative_cooley_tukey:
    push    r12
    push    r13
    push    r14
    push    r15
    push    rbx
    sub     rsp, 64
    mov     r12, rdi
    mov     r13, rsi
    call    bit_reverse                 # rdi/rsi still hold the incoming arguments
    sar     r13, 4                      # r13 = N
    cvtsi2sd xmm0, r13
    call    log2
    cvttsd2si rax, xmm0
    mov     QWORD PTR [rsp], rax        # Number of stages
    mov     r14, 1
.Lict_stage:
    cmp     r14, QWORD PTR [rsp]        # Stages run from 1 to log2(N) inclusive
    jg      .Lict_done
    movsd   xmm0, QWORD PTR [rip + two] # stride = 2^stage
    cvtsi2sd xmm1, r14
    call    pow
    cvttsd2si r10, xmm0
    mov     QWORD PTR [rsp + 40], r10
    movsd   xmm1, QWORD PTR [rip + two_pi]  # w = cexp(-2pi * I / stride)
    divsd   xmm1, xmm0                  # xmm0 still holds stride as a double
    pxor    xmm0, xmm0
    call    cexp
    movsd   QWORD PTR [rsp + 8], xmm0
    movsd   QWORD PTR [rsp + 16], xmm1
    xor     r15d, r15d                  # j = 0
.Lict_group:
    cmp     r15, r13                    # j advances by stride until it reaches N
    jae     .Lict_group_done
    movsd   xmm4, QWORD PTR [rip + one] # v = 1 + 0i
    movsd   QWORD PTR [rsp + 24], xmm4
    mov     QWORD PTR [rsp + 32], 0
    xor     ebx, ebx                    # k = 0
.Lict_butterfly:
    mov     rax, QWORD PTR [rsp + 40]   # k runs over [0, stride / 2)
    sar     rax, 1
    cmp     rbx, rax
    jae     .Lict_butterfly_done
    lea     r8, [r15 + rbx]             # r8 = &X[j + k]
    shl     r8, 4
    add     r8, r12
    mov     r9, QWORD PTR [rsp + 40]    # r9 = &X[j + k + stride/2]
    lea     r9, [r8 + 8*r9]             # (stride/2 elements = stride*8 bytes)
    # r8 and r9 are caller-saved, so __muldc3 is free to overwrite them.
    # Park both pointers in the frame and reload them after the call.
    mov     QWORD PTR [rsp + 48], r8
    mov     QWORD PTR [rsp + 56], r9
    movsd   xmm0, QWORD PTR [r9]        # xmm0:xmm1 = X[j + k + stride/2] * v
    movsd   xmm1, QWORD PTR [r9 + 8]
    movsd   xmm2, QWORD PTR [rsp + 24]
    movsd   xmm3, QWORD PTR [rsp + 32]
    call    __muldc3
    mov     r8, QWORD PTR [rsp + 48]
    mov     r9, QWORD PTR [rsp + 56]
    movsd   xmm2, QWORD PTR [r8]        # X[j + k + stride/2] = X[j + k] - product
    movsd   xmm3, QWORD PTR [r8 + 8]
    subsd   xmm2, xmm0
    subsd   xmm3, xmm1
    movsd   QWORD PTR [r9], xmm2
    movsd   QWORD PTR [r9 + 8], xmm3
    movsd   xmm0, QWORD PTR [r8]        # X[j + k] -= (X[j + k + stride/2] - X[j + k])
    movsd   xmm1, QWORD PTR [r8 + 8]
    subsd   xmm2, xmm0
    subsd   xmm3, xmm1
    subsd   xmm0, xmm2
    subsd   xmm1, xmm3
    movsd   QWORD PTR [r8], xmm0
    movsd   QWORD PTR [r8 + 8], xmm1
    movsd   xmm0, QWORD PTR [rsp + 24]  # v *= w
    movsd   xmm1, QWORD PTR [rsp + 32]
    movsd   xmm2, QWORD PTR [rsp + 8]
    movsd   xmm3, QWORD PTR [rsp + 16]
    call    __muldc3
    movsd   QWORD PTR [rsp + 24], xmm0
    movsd   QWORD PTR [rsp + 32], xmm1
    add     rbx, 1
    jmp     .Lict_butterfly
.Lict_butterfly_done:
    add     r15, QWORD PTR [rsp + 40]   # j += stride
    jmp     .Lict_group
.Lict_group_done:
    add     r14, 1
    jmp     .Lict_stage
.Lict_done:
    add     rsp, 64
    pop     rbx
    pop     r15
    pop     r14
    pop     r13
    pop     r12
    ret

# approx: for every element, print cabs(a[i]) - cabs(b[i]) on its own line
# using the "%g\n" format. Nothing is returned or compared against a
# tolerance here; judging the printed differences is left to the reader.
#
# ABI:  SysV AMD64
# In:   rdi - array a ptr  (complex doubles, 16 bytes each)
#       rsi - array b ptr  (same layout)
#       rdx - array size IN BYTES
# Regs: r12 = cursor in a, r13 = cursor in b, r14 = one past the end of a
# Frame: 16 bytes; [rsp] holds cabs(b[i]) while cabs(a[i]) is computed.
#        Three pushes plus 16 bytes keep rsp 16-byte aligned at every call.
approx:
    push    r12
    push    r13
    push    r14
    sub     rsp, 16
    mov     r12, rdi
    mov     r13, rsi
    lea     r14, [rdi + rdx]
.Lapprox_loop:
    cmp     r12, r14
    je      .Lapprox_done
    movsd   xmm0, QWORD PTR [r13]       # |b[i]|
    movsd   xmm1, QWORD PTR [r13 + 8]
    call    cabs
    movsd   QWORD PTR [rsp], xmm0       # xmm registers do not survive the next call
    movsd   xmm0, QWORD PTR [r12]       # |a[i]|
    movsd   xmm1, QWORD PTR [r12 + 8]
    call    cabs
    subsd   xmm0, QWORD PTR [rsp]       # xmm0 = |a[i]| - |b[i]|
    lea     rdi, [rip + fmt]            # RIP-relative, so the file also links as PIE
    mov     eax, 1                      # Variadic call: al = number of vector args
    call    printf
    add     r12, 16
    add     r13, 16
    jmp     .Lapprox_loop
.Lapprox_done:
    add     rsp, 16
    pop     r14
    pop     r13
    pop     r12
    ret

# main: fill two identical 64-element complex arrays with random real
# values, transform one with iterative_cooley_tukey and the other with
# cooley_tukey, then print the per-element magnitude differences via approx.
#
# ABI:  SysV AMD64
# Out:  eax = 0
# Frame (2048 bytes; the push plus the frame keep rsp 16-byte aligned):
#       [rsp +    0 .. rsp + 1023]  array handed to iterative_cooley_tukey
#       [rsp + 1024 .. rsp + 2047]  array handed to cooley_tukey
# Regs: r12 = fill cursor, walking the first array from its end to its start
main:
    push    r12
    sub     rsp, 2048
    xor     edi, edi                    # srand(time(NULL))
    call    time
    mov     edi, eax
    call    srand
    lea     r12, [rsp + 1024]           # Start one past the end of the first array
.Lmain_fill:
    cmp     r12, rsp
    je      .Lmain_filled
    sub     r12, 16
    call    rand
    cvtsi2sd xmm0, eax                  # rand returns a 32-bit int: only eax is defined
    divsd   xmm0, QWORD PTR [rip + rand_max]
    movsd   QWORD PTR [r12], xmm0       # Same real part in both arrays
    movsd   QWORD PTR [r12 + 1024], xmm0
    mov     QWORD PTR [r12 + 8], 0      # Imaginary parts are zero
    mov     QWORD PTR [r12 + 1032], 0
    jmp     .Lmain_fill
.Lmain_filled:
    mov     rdi, rsp                    # iterative_cooley_tukey(first, 1024)
    mov     esi, 1024
    call    iterative_cooley_tukey
    lea     rdi, [rsp + 1024]           # cooley_tukey(second, 1024)
    mov     esi, 1024
    call    cooley_tukey
    mov     rdi, rsp                    # approx(first, second, 1024)
    lea     rsi, [rsp + 1024]
    mov     edx, 1024
    call    approx
    xor     eax, eax                    # return 0
    add     rsp, 2048
    pop     r12
    ret
Gathros marked this conversation as resolved.
Show resolved Hide resolved


Loading