Skip to content

Commit

Permalink
Merge pull request #542 from Gathros/fft_x64
Browse files Browse the repository at this point in the history
Adding FFT in X86_64
  • Loading branch information
Gathros authored Dec 31, 2018
2 parents 6daa16f + 44de64f commit 1048a07
Show file tree
Hide file tree
Showing 2 changed files with 405 additions and 0 deletions.
399 changes: 399 additions & 0 deletions contents/cooley_tukey/code/asm-x64/fft.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,399 @@
# GNU as source using Intel operand order with unprefixed register names.
.intel_syntax noprefix

.section .rodata
# Scalar double constants read by the routines in .text.
two: .double 2.0
one: .double 1.0
# NOTE: despite the name, this constant is NEGATIVE two pi. Every user feeds it
# into the imaginary part of a cexp argument, giving the forward-transform sign.
two_pi: .double -6.28318530718
# The two 32-bit words below are the low and high halves of one little-endian
# IEEE-754 double, 0x41DFFFFFFFC00000 = 2147483647.0. main divides the value
# returned by rand by it (presumably this is RAND_MAX on the target libc - confirm).
rand_max: .long 4290772992
.long 1105199103
# printf format string used by approx to print one double per line.
fmt: .string "%g\n"

.section .text
.global main
# NOTE(review): pow is also called (from iterative_cooley_tukey) but is not listed
# below; GNU as treats every undefined symbol as external, so it still links.
.extern printf, memset, memcpy, srand, rand, time, cexp, __muldc3, cabs, log2

# dft - naive discrete Fourier transform, computed in place.
#
# ABI:   System V AMD64
# In:    rdi = pointer to an array of 16-byte complex doubles (real at +0, imag at +8)
#        rsi = array size in BYTES; must be a multiple of 16 because both loop
#              indices advance by 16 and stop only on exact equality with rsi
# Out:   the array is overwritten with its transform
# Stack: rsi bytes of scratch below the saved registers hold the result until the
#        final memcpy; five pushes plus the return address total 48 bytes, so rsp
#        stays 16-byte aligned at every call
# Calls: cexp, __muldc3, memcpy
# Regs:  r12 = array, r13 = size in bytes, r14 = output byte offset (i * 16),
#        r15 = input byte offset (j * 16)
dft:
        push    rbx                             # not used below; fifth push keeps rsp aligned
        push    r12
        push    r13
        push    r14
        push    r15
        mov     r12, rdi                        # Save parameters
        mov     r13, rsi
        sub     rsp, r13                        # Scratch complex array on the stack
        xor     r14, r14                        # i = 0
dft_loop_i:
        cmp     r14, r13                        # Done when the output offset reaches the size
        je      dft_end_i
        mov     QWORD PTR [rsp + r14], 0        # tmp[i] = 0 + 0i
        mov     QWORD PTR [rsp + r14 + 8], 0
        xor     r15, r15                        # j = 0
dft_loop_j:
        cmp     r15, r13                        # Done when the input offset reaches the size
        je      dft_end_j
        movsd   xmm1, QWORD PTR two_pi[rip]     # xmm1 = -2pi (RIP-relative so the file links as PIE)
        mov     rax, r14                        # rax = (i * 16) * (j * 16)
        imul    rax, r15
        shr     rax, 4                          # rax = i * j * 16; the 16 cancels against r13 below
        cvtsi2sdq xmm2, rax
        mulsd   xmm1, xmm2
        cvtsi2sdq xmm2, r13                     # r13 = N * 16
        divsd   xmm1, xmm2                      # xmm1 = -2pi * i * j / N
        pxor    xmm0, xmm0                      # Real part of the exponent is 0
        call    cexp                            # xmm0:xmm1 = cexp(0 + xmm1 * I)
        movsd   xmm2, QWORD PTR [r12 + r15]     # xmm2:xmm3 = X[j]
        movsd   xmm3, QWORD PTR [r12 + r15 + 8]
        call    __muldc3                        # xmm0:xmm1 = cexp(...) * X[j]
        movsd   xmm6, QWORD PTR [rsp + r14]     # tmp[i] += product
        movsd   xmm7, QWORD PTR [rsp + r14 + 8]
        addsd   xmm6, xmm0
        addsd   xmm7, xmm1
        movsd   QWORD PTR [rsp + r14], xmm6
        movsd   QWORD PTR [rsp + r14 + 8], xmm7
        add     r15, 16                         # Next input element
        jmp     dft_loop_j
dft_end_j:
        add     r14, 16                         # Next output element
        jmp     dft_loop_i
dft_end_i:
        mov     rdi, r12                        # Copy the scratch array over the input
        mov     rsi, rsp
        mov     rdx, r13
        call    memcpy
        add     rsp, r13                        # Release the scratch array
        pop     r15
        pop     r14
        pop     r13
        pop     r12
        pop     rbx
        ret

# cooley_tukey - recursive radix-2 FFT, computed in place.
#
# ABI:   System V AMD64
# In:    rdi = pointer to an array of 16-byte complex doubles (real at +0, imag at +8);
#              accessed with movaps, so it must be 16-byte aligned
#        rsi = array size in BYTES
# Out:   the array is overwritten with its transform; arrays of one element or
#        fewer (rsi <= 16) are returned untouched
# Note:  each level halves the size and uses the half size as a stack adjustment,
#        so the element count is expected to be a power of two
# Calls: cexp, __muldc3, itself
# Regs:  r12 = array, r13 = size in bytes, r14 = half size in bytes (later the
#        upper-half cursor), r15 = scratch offset (later the lower-half cursor),
#        rbx = write cursor (later the midpoint)
cooley_tukey:
        cmp     rsi, 16                         # Nothing to do unless there is more than one element
        jle     cooley_tukey_return
        push    rbx
        push    r12
        push    r13
        push    r14
        push    r15
        mov     r12, rdi                        # Save parameters
        mov     r13, rsi
        mov     r14, rsi                        # r14 = half of the size in bytes
        shr     r14, 1
        sub     rsp, r14                        # Scratch array for the odd-indexed elements
        xor     r15, r15
        mov     rbx, r12
cooley_tukey_splitting:
        cmp     r15, r14
        je      cooley_tukey_split
        lea     rax, [r12 + 2 * r15]            # rax -> pair (X[2k], X[2k + 1])
        movaps  xmm0, XMMWORD PTR [rax + 16]    # Odd-indexed element
        movaps  xmm1, XMMWORD PTR [rax]         # Even-indexed element
        movaps  XMMWORD PTR [rsp + r15], xmm0   # Odd elements are parked in the scratch array
        movaps  XMMWORD PTR [rbx], xmm1         # Even elements are packed into the lower half
        add     rbx, 16
        add     r15, 16
        jmp     cooley_tukey_splitting
cooley_tukey_split:
        mov     rax, rsp                        # rax walks the scratch array
        lea     rdi, [r12 + r13]                # rdi = one past the end of the array
cooley_tukey_mov_data:
        cmp     rbx, rdi                        # rbx starts at the midpoint after the split loop
        je      cooley_tukey_moved
        movaps  xmm0, XMMWORD PTR [rax]         # Copy the odd elements into the upper half
        movaps  XMMWORD PTR [rbx], xmm0
        add     rbx, 16
        add     rax, 16
        jmp     cooley_tukey_mov_data
cooley_tukey_moved:
        add     rsp, r14                        # Release the scratch array
        mov     rdi, r12                        # Recurse on the lower (even) half
        mov     rsi, r14
        call    cooley_tukey
        lea     rdi, [r12 + r14]                # Recurse on the upper (odd) half
        mov     rsi, r14
        call    cooley_tukey
        lea     rbx, [r12 + r14]                # rbx = midpoint, marks the end of the lower half
        mov     r14, rbx                        # r14 -> X[i + N / 2]
        mov     r15, r12                        # r15 -> X[i]
cooley_tukey_loop:
        cmp     r15, rbx
        je      cooley_tukey_end
        pxor    xmm0, xmm0                      # Calculate cexp(-2.0 * I * M_PI * i / N)
        movsd   xmm1, QWORD PTR two_pi[rip]     # -2pi (RIP-relative so the file links as PIE)
        mov     rax, r14                        # rax = i * 16
        sub     rax, rbx
        cvtsi2sdq xmm2, rax
        cvtsi2sdq xmm3, r13                     # r13 = N * 16, so the factor 16 cancels
        divsd   xmm2, xmm3
        mulsd   xmm1, xmm2
        call    cexp
        movq    xmm2, QWORD PTR [r14]           # xmm0:xmm1 = cexp() * X[i + N / 2]
        movq    xmm3, QWORD PTR [r14 + 8]
        call    __muldc3
        movq    xmm2, QWORD PTR [r15]           # xmm2:xmm3 = X[i] - cexp() * X[i + N / 2]
        movq    xmm3, QWORD PTR [r15 + 8]
        subsd   xmm2, xmm0
        subsd   xmm3, xmm1
        movq    QWORD PTR [r14], xmm2           # Save value in X[i + N / 2]
        movq    QWORD PTR [r14 + 8], xmm3
        movq    xmm0, QWORD PTR [r15]           # X[i] -= X[i + N / 2] - X[i], which equals
        movq    xmm1, QWORD PTR [r15 + 8]       # X[i] + cexp() * (old X[i + N / 2])
        subsd   xmm2, xmm0
        subsd   xmm3, xmm1
        subsd   xmm0, xmm2
        subsd   xmm1, xmm3
        movq    QWORD PTR [r15], xmm0
        movq    QWORD PTR [r15 + 8], xmm1
        add     r14, 16
        add     r15, 16
        jmp     cooley_tukey_loop
cooley_tukey_end:
        pop     r15
        pop     r14
        pop     r13
        pop     r12
        pop     rbx
cooley_tukey_return:
        ret

# bit_reverse - permute an array into bit-reversed index order, in place.
#
# ABI:   System V AMD64
# In:    rdi = pointer to an array of 16-byte elements; accessed with movaps, so
#              it must be 16-byte aligned
#        rsi = array size in BYTES (element count N = rsi / 16)
# Out:   for every index whose bit-reversed partner is larger, the two elements
#        are swapped; an empty array is left untouched
# Calls: log2, once, before the loop (the bit count does not change per element)
# Regs:  r12 = array, r13 = N, r14 = current index, rbx = log2(N) - 1,
#        rdi = (1 << log2(N)) - 1; rdi stays live across the loop because the
#        loop body makes no calls
bit_reverse:
        push    rbx
        push    r12
        push    r13
        push    r14
        push    r15
        mov     r12, rdi                        # Save parameters
        mov     r13, rsi
        shr     r13, 4                          # r13 = number of elements
        jz      bit_reverse_return              # Empty array: nothing to reverse, skip log2(0)
        cvtsi2sdq xmm0, r13                     # Calculating the number of bits in N
        call    log2
        cvttsd2si rcx, xmm0
        mov     edi, 1                          # rdi = (1 << log2(N)) - 1, the index mask
        sal     edi, cl
        sub     edi, 1
        mov     ebx, ecx                        # rbx = log2(N) - 1, the initial shift budget
        sub     ebx, 1
        xor     r14, r14                        # Loop through all entries
bit_reverse_entries:
        cmp     r14, r13
        je      bit_reverse_return
        mov     ecx, ebx                        # Remaining shift for this index
        mov     rax, r14                        # rax accumulates the reversed bits
        mov     r15, r14                        # r15 is consumed one bit at a time
bit_reverse_loop:
        sar     r15, 1                          # Drop the lowest bit; stop once nothing is left
        je      bit_reverse_reversed
        sal     rax, 1                          # Calculating (rax << 1) | (r15 & 1)
        mov     rsi, r15
        and     rsi, 1
        or      rax, rsi
        sub     ecx, 1                          # One fewer bit left to shift in
        jmp     bit_reverse_loop
bit_reverse_reversed:
        sal     eax, cl                         # Shift in the zero bits that were never visited
        and     rax, rdi                        # Keep only the low log2(N) bits
        cmp     rax, r14                        # Swap only when the partner index is larger,
        jle     bit_reverse_no_swap             # so that each pair is exchanged exactly once
        shl     rax, 4                          # Scale both indices to byte offsets
        shl     r14, 4
        movaps  xmm0, XMMWORD PTR [r12 + rax]
        movaps  xmm1, XMMWORD PTR [r12 + r14]
        movaps  XMMWORD PTR [r12 + rax], xmm1
        movaps  XMMWORD PTR [r12 + r14], xmm0
        shr     r14, 4                          # Back to an element index
bit_reverse_no_swap:
        add     r14, 1
        jmp     bit_reverse_entries
bit_reverse_return:
        pop     r15
        pop     r14
        pop     r13
        pop     r12
        pop     rbx
        ret

# iterative_cooley_tukey - iterative radix-2 FFT, computed in place.
#
# ABI:   System V AMD64
# In:    rdi = pointer to an array of 16-byte complex doubles (real at +0, imag at +8);
#              bit_reverse accesses it with movaps, so it must be 16-byte aligned
#        rsi = array size in BYTES (element count N = rsi / 16)
# Out:   the array is overwritten with its transform
# Note:  the j loop stops only on exact equality with N while stepping by the
#        stride, so N is expected to be a power of two
# Calls: bit_reverse, log2, pow, cexp, __muldc3
# Regs:  r12 = array, r13 = N, r14 = stage number i, r15 = block start j,
#        rbx = offset k inside the block
# Frame (64 bytes; five pushes plus the return address keep rsp 16-byte aligned):
#        [rsp +  0] log2(N)
#        [rsp +  8] w, real part          [rsp + 16] w, imaginary part
#        [rsp + 24] v, real part          [rsp + 32] v, imaginary part
#        [rsp + 40] stride
#        [rsp + 48] pointer to X[k + j]
#        [rsp + 56] pointer to X[k + j + stride / 2]
iterative_cooley_tukey:
        push    r12
        push    r13
        push    r14
        push    r15
        push    rbx
        sub     rsp, 64
        mov     r12, rdi
        mov     r13, rsi
        call    bit_reverse                     # Bit reversing array (rdi and rsi are still the arguments)
        sar     r13, 4                          # r13 = N; calculate log2(N)
        cvtsi2sdq xmm0, r13
        call    log2
        cvttsd2si rax, xmm0
        mov     QWORD PTR [rsp], rax            # Save it to the stack
        mov     r14, 1
iter_ct_loop_i:
        cmp     r14, rax                        # Stop once r14 is greater than log2(N)
        jg      iter_ct_end_i
        movsd   xmm0, QWORD PTR two[rip]        # Calculate stride = 2^(r14)
        cvtsi2sdq xmm1, r14
        call    pow
        cvttsd2si r10, xmm0
        mov     QWORD PTR [rsp + 40], r10       # Move stride to stack
        movsd   xmm1, QWORD PTR two_pi[rip]     # Calculating w = cexp(-2pi * I / stride)
        divsd   xmm1, xmm0
        pxor    xmm0, xmm0
        call    cexp
        movq    QWORD PTR [rsp + 8], xmm0       # Save it to stack
        movq    QWORD PTR [rsp + 16], xmm1
        xor     r15, r15
iter_ct_loop_j:
        cmp     r15, r13                        # Stop once the block start reaches N
        je      iter_ct_end_j
        movsd   xmm4, QWORD PTR one[rip]        # v = 1 + 0i, saved to stack
        pxor    xmm5, xmm5
        movsd   QWORD PTR [rsp + 24], xmm4
        movsd   QWORD PTR [rsp + 32], xmm5
        xor     rbx, rbx
        mov     rax, QWORD PTR [rsp + 40]       # Calculate stride / 2
        sar     rax, 1
iter_ct_loop_k:
        cmp     rbx, rax                        # Stop once rbx reaches stride / 2
        je      iter_ct_end_k
        mov     r8, r15                         # r8 = (k + j) * 16
        add     r8, rbx
        sal     r8, 4
        mov     r9, QWORD PTR [rsp + 40]        # r9 = (stride / 2) * 16 + r8
        sal     r9, 3
        add     r9, r8
        lea     r9, [r12 + r9]                  # r9 -> X[k + j + stride / 2]
        lea     r8, [r12 + r8]                  # r8 -> X[k + j]
        mov     QWORD PTR [rsp + 48], r8        # r8 and r9 are caller-saved, so spill them:
        mov     QWORD PTR [rsp + 56], r9        # the ABI lets __muldc3 overwrite both
        movsd   xmm0, QWORD PTR [r9]            # Calculate X[k + j] - v * X[k + j + stride / 2]
        movsd   xmm1, QWORD PTR [r9 + 8]
        movsd   xmm2, QWORD PTR [rsp + 24]
        movsd   xmm3, QWORD PTR [rsp + 32]
        call    __muldc3
        mov     r8, QWORD PTR [rsp + 48]        # Reload the element pointers after the call
        mov     r9, QWORD PTR [rsp + 56]
        movsd   xmm2, QWORD PTR [r8]
        movsd   xmm3, QWORD PTR [r8 + 8]
        subsd   xmm2, xmm0
        subsd   xmm3, xmm1
        movsd   QWORD PTR [r9], xmm2            # Saving answer
        movsd   QWORD PTR [r9 + 8], xmm3
        movsd   xmm0, QWORD PTR [r8]            # Calculating X[k + j] - (X[k + j + stride / 2] - X[k + j])
        movsd   xmm1, QWORD PTR [r8 + 8]
        subsd   xmm2, xmm0
        subsd   xmm3, xmm1
        subsd   xmm0, xmm2
        subsd   xmm1, xmm3
        movsd   QWORD PTR [r8], xmm0            # Saving answer
        movsd   QWORD PTR [r8 + 8], xmm1
        movsd   xmm0, QWORD PTR [rsp + 24]      # Calculating v * w
        movsd   xmm1, QWORD PTR [rsp + 32]
        movsd   xmm2, QWORD PTR [rsp + 8]
        movsd   xmm3, QWORD PTR [rsp + 16]
        call    __muldc3
        movsd   QWORD PTR [rsp + 24], xmm0      # Saving answer
        movsd   QWORD PTR [rsp + 32], xmm1
        add     rbx, 1
        mov     rax, QWORD PTR [rsp + 40]       # rax was clobbered by the calls; recompute stride / 2
        sar     rax, 1
        jmp     iter_ct_loop_k
iter_ct_end_k:
        add     r15, QWORD PTR [rsp + 40]       # Next block
        jmp     iter_ct_loop_j
iter_ct_end_j:
        add     r14, 1                          # Next stage
        mov     rax, QWORD PTR [rsp]            # Reload log2(N) for the stage test
        jmp     iter_ct_loop_i
iter_ct_end_i:
        add     rsp, 64
        pop     rbx
        pop     r15
        pop     r14
        pop     r13
        pop     r12
        ret

# approx - print, one per line, the difference in magnitude between the
#          corresponding elements of two complex arrays.
#
# ABI:   System V AMD64
# In:    rdi = pointer to array a of 16-byte complex doubles (real at +0, imag at +8)
#        rsi = pointer to array b, same layout
#        rdx = size of array a in BYTES; must be a multiple of 16 because the
#              loop advances by 16 and stops only on exact equality
# Out:   for each element, printf("%g\n", cabs(a[i]) - cabs(b[i]))
# Calls: cabs, printf
# Regs:  r12 = cursor into a, r13 = cursor into b, r14 = one past the end of a
# Stack: three pushes plus the return address total 32 bytes; the 16-byte local
#        area keeps rsp 16-byte aligned at every call. [rsp] holds cabs(b[i]).
approx:
        push    r12
        push    r13
        push    r14
        sub     rsp, 16
        mov     r12, rdi
        mov     r13, rsi
        lea     r14, [rdi + rdx]                # End marker for the cursor into a
approx_loop:
        cmp     r12, r14
        je      approx_return
        movsd   xmm0, QWORD PTR [r13]           # cabs(b[i])
        movsd   xmm1, QWORD PTR [r13 + 8]
        call    cabs
        movsd   QWORD PTR [rsp], xmm0           # Park it; xmm registers do not survive calls
        movsd   xmm0, QWORD PTR [r12]           # cabs(a[i])
        movsd   xmm1, QWORD PTR [r12 + 8]
        call    cabs
        subsd   xmm0, QWORD PTR [rsp]           # xmm0 = cabs(a[i]) - cabs(b[i])
        lea     rdi, fmt[rip]                   # RIP-relative so the file links as PIE
        mov     eax, 1                          # Variadic call: one vector register carries an argument
        call    printf
        add     r12, 16
        add     r13, 16
        jmp     approx_loop
approx_return:
        add     rsp, 16
        pop     r14
        pop     r13
        pop     r12
        ret

# main - fill two identical 64-element complex arrays with random real values,
#        run iterative_cooley_tukey on one and cooley_tukey on the other, then
#        let approx print the per-element difference of their magnitudes.
#
# ABI:   System V AMD64
# Out:   eax = 0
# Calls: time, srand, rand, iterative_cooley_tukey, cooley_tukey, approx
# Stack: one push plus the return address total 16 bytes, and the 2048-byte
#        local area keeps rsp 16-byte aligned. The first array occupies
#        [rsp, rsp + 1024) and the second [rsp + 1024, rsp + 2048).
# Regs:  r12 = fill cursor, walking the first array from its end down to rsp
main:
        push    r12
        sub     rsp, 2048
        xor     edi, edi                        # time(NULL)
        call    time
        mov     edi, eax                        # Seed the generator with the current time
        call    srand
        lea     r12, [rsp + 1024]               # Start one past the end of the first array
main_fill_loop:
        cmp     r12, rsp
        je      main_fill_done
        sub     r12, 16
        call    rand
        cvtsi2sd xmm0, eax                      # rand returns a 32-bit int; bits 32-63 of rax are not defined
        divsd   xmm0, QWORD PTR rand_max[rip]   # Scale by rand_max (RIP-relative so the file links as PIE)
        movsd   QWORD PTR [r12], xmm0           # Same real part in both arrays
        movsd   QWORD PTR [r12 + 1024], xmm0
        mov     QWORD PTR [r12 + 8], 0          # Imaginary parts are zero
        mov     QWORD PTR [r12 + 1032], 0
        jmp     main_fill_loop
main_fill_done:
        mov     rdi, rsp                        # Iterative FFT on the first array
        mov     rsi, 1024
        call    iterative_cooley_tukey
        lea     rdi, [rsp + 1024]               # Recursive FFT on the second array
        mov     rsi, 1024
        call    cooley_tukey
        mov     rdi, rsp                        # Print how far the two results differ
        lea     rsi, [rsp + 1024]
        mov     rdx, 1024
        call    approx
        xor     eax, eax                        # Exit status 0
        add     rsp, 2048
        pop     r12
        ret


Loading

0 comments on commit 1048a07

Please sign in to comment.