Merge pull request #542 from Gathros/fft_x64

Adding FFT in X86_64
algorithm-archivists · Dec 31, 2018 · 1048a07 · 1048a07
2 parents 6daa16f + 44de64f
commit 1048a07
Show file tree

Hide file tree

Showing 2 changed files with 405 additions and 0 deletions.
diff --git a/contents/cooley_tukey/code/asm-x64/fft.s b/contents/cooley_tukey/code/asm-x64/fft.s
@@ -0,0 +1,399 @@
+.intel_syntax noprefix
+
+.section .rodata
+  two:           .double 2.0
+  one:           .double 1.0
+  two_pi:        .double -6.28318530718
+  rand_max:      .long 4290772992
+                 .long 1105199103
+  fmt:           .string "%g\n"
+
+.section .text
+  .global main
+  .extern printf, memset, memcpy, srand, rand, time, cexp, __muldc3, cabs, log2
+
+# rdi - array ptr
+# rsi - array size
+dft:
+  push   rbx
+  push   r12
+  push   r13
+  push   r14
+  push   r15
+  mov    r12, rdi                    # Save parameters
+  mov    r13, rsi
+  sub    rsp, r13                    # Make a double complex array
+  xor    r14, r14                    # Set index to 0
+dft_loop_i:
+  cmp    r14, r13                    # Check if index is equal to array size
+  je     dft_end_i
+  lea    rax, [rsp + r14]            # Set tmp array to zero at r14
+  mov    QWORD PTR [rax], 0
+  mov    QWORD PTR [rax + 8], 0
+  xor    r15, r15                    # Set second index to 0
+dft_loop_j:
+  cmp    r15, r13                    # Check if the index is equal to array size
+  je     dft_end_j
+  movsd  xmm1, two_pi                # Calculate xmm1 = -2pi * i * j / N
+  mov    rax, r14
+  imul   rax, r15
+  shr    rax, 4
+  cvtsi2sdq xmm2, rax
+  mulsd  xmm1, xmm2
+  cvtsi2sdq xmm2, r13
+  divsd  xmm1, xmm2
+  pxor   xmm0, xmm0                  # Set xmm0 to 0
+  call   cexp
+  lea    rax, [r12 + r15]            # Calculate X[i] * cexp(-2pi * i * j / N)
+  movsd  xmm2, QWORD PTR [rax]
+  movsd  xmm3, QWORD PTR [rax + 8]
+  call   __muldc3
+  lea    rax, [rsp + r14]
+  movsd  xmm6, QWORD PTR [rax]       # Sum to tmp array
+  movsd  xmm7, QWORD PTR [rax + 8]
+  addsd  xmm6, xmm0
+  addsd  xmm7, xmm1
+  movsd  QWORD PTR [rax], xmm6       # Save to tmp array
+  movsd  QWORD PTR [rax + 8], xmm7
+  add    r15, 16
+  jmp    dft_loop_j
+dft_end_j:
+  add    r14, 16
+  jmp    dft_loop_i
+dft_end_i:
+  mov    rdi, r12                    # Move tmp array to array ptr
+  mov    rsi, rsp
+  mov    rdx, r13
+  call   memcpy
+  add    rsp, r13
+  pop    r15
+  pop    r14
+  pop    r13
+  pop    r12
+  pop    rbx
+  ret
+
+# rdi - array ptr
+# rsi - array size
+cooley_tukey:
+  cmp    rsi, 16                     # Check if size if greater then 1
+  jle    cooley_tukey_return
+  push   rbx
+  push   r12
+  push   r13
+  push   r14
+  push   r15
+  mov    r12, rdi                    # Save parameters
+  mov    r13, rsi
+  mov    r14, rsi                    # Save N / 2
+  shr    r14, 1
+  sub    rsp, r14                    # Make a tmp array
+  xor    r15, r15
+  mov    rbx, r12
+cooley_tukey_spliting:
+  cmp    r15, r14
+  je     cooley_tukey_split
+  lea    rax, [r12 + 2 * r15]        # Moving all odd entries to the front of the array
+  movaps xmm0, XMMWORD PTR [rax + 16]
+  movaps xmm1, XMMWORD PTR [rax]
+  movaps XMMWORD PTR [rsp + r15], xmm0
+  movaps XMMWORD PTR [rbx], xmm1
+  add    rbx, 16
+  add    r15, 16
+  jmp    cooley_tukey_spliting
+cooley_tukey_split:
+  mov    rax, rsp
+  lea    rdi, [r12 + r13]
+cooley_tukey_mov_data:
+  cmp    rbx, rdi
+  je     cooley_tukey_moved
+  movaps xmm0, XMMWORD PTR [rax]
+  movaps XMMWORD PTR [rbx], xmm0
+  add    rbx, 16
+  add    rax, 16
+  jmp    cooley_tukey_mov_data
+cooley_tukey_moved:
+  add    rsp, r14
+  mov    rdi, r12                   # Makking a recursive call
+  mov    rsi, r14
+  call   cooley_tukey
+  lea    rdi, [r12 + r14]           # Makking a recursive call
+  mov    rsi, r14
+  call   cooley_tukey
+  lea    rbx, [r12 + r14]
+  mov    r14, rbx
+  mov    r15, r12
+cooley_tukey_loop:
+  cmp    r15, rbx
+  je     cooley_tukey_end
+  pxor   xmm0, xmm0                 # Calculate cexp(-2.0 * I * M_PI * i / N)
+  movsd  xmm1, two_pi
+  mov    rax, r14
+  sub    rax, rbx
+  cvtsi2sdq xmm2, rax
+  cvtsi2sdq xmm3, r13
+  divsd  xmm2, xmm3
+  mulsd  xmm1, xmm2
+  call   cexp
+  movq   xmm2, QWORD PTR [r14]      # Calculating X[i] - cexp() * X[i + N / 2]
+  movq   xmm3, QWORD PTR [r14 + 8]
+  call   __muldc3
+  movq   xmm2, QWORD PTR [r15]
+  movq   xmm3, QWORD PTR [r15 + 8]
+  subsd  xmm2, xmm0
+  subsd  xmm3, xmm1
+  movq   QWORD PTR [r14], xmm2      # Save value in X[i + N / 2]
+  movq   QWORD PTR [r14 + 8], xmm3
+  movq   xmm0, QWORD PTR [r15]      # Calculating X[i] -= X[i + N / 2] - X[i]
+  movq   xmm1, QWORD PTR [r15 + 8]
+  subsd  xmm2, xmm0
+  subsd  xmm3, xmm1
+  subsd  xmm0, xmm2
+  subsd  xmm1, xmm3
+  movq   QWORD PTR [r15], xmm0
+  movq   QWORD PTR [r15 + 8], xmm1
+  add    r14, 16
+  add    r15, 16
+  jmp    cooley_tukey_loop
+cooley_tukey_end:
+  pop    r15
+  pop    r14
+  pop    r13
+  pop    r12
+  pop    rbx
+cooley_tukey_return:
+  ret
+
+# rdi - array ptr
+# rsi - array size
+bit_reverse:
+  push   rbx
+  push   r12
+  push   r13
+  push   r14
+  push   r15
+  mov    r12, rdi                  # Save parameters
+  mov    r13, rsi
+  shr    r13, 4
+  xor    r14, r14                  # Loop through all entries
+bit_reverse_entries:
+  cmp    r14, r13
+  je     bit_reverse_return
+  cvtsi2sdq xmm0, r13              # Calculating the number of bit in N
+  call   log2
+  cvttsd2si rcx, xmm0
+  mov    rdi, 1                    # Calculating (1 << log2(N)) - 1
+  sal    edi, cl
+  sub    edi, 1
+  sub    ecx, 1
+  mov    rax, r14
+  mov    r15, r14
+bit_reverse_loop:
+  sar    r15                       # Check if r15 is 0
+  je     bit_reverse_reversed
+  sal    rax, 1                    # Calculating (rax << 1) | (r15 & 1)
+  mov    rsi, r15
+  and    rsi, 1
+  or     rax, rsi
+  sub    ecx, 1                    # Decrement bit count
+  jmp    bit_reverse_loop
+bit_reverse_reversed:
+  sal    eax, cl                   # Calculate (rax << rcx) & (1 << bit count)
+  and    rax, rdi
+  cmp    rax, r14                  # Check if rax is greater then r14
+  jle    bit_reverse_no_swap       # If so then swap entries
+  shl    rax, 4                    # Times index by 16 to get bytes to entry
+  shl    r14, 4
+  movaps xmm0, XMMWORD PTR [r12 + rax]
+  movaps xmm1, XMMWORD PTR [r12 + r14]
+  movaps XMMWORD PTR [r12 + rax], xmm1
+  movaps XMMWORD PTR [r12 + r14], xmm0
+  shr    r14, 4
+bit_reverse_no_swap:
+  add    r14, 1
+  jmp    bit_reverse_entries
+bit_reverse_return:
+  pop    r15
+  pop    r14
+  pop    r13
+  pop    r12
+  pop    rbx
+  ret
+
+# rdi - array ptr
+# rsi - array size
+iterative_cooley_tukey:
+  push   r12
+  push   r13
+  push   r14
+  push   r15
+  push   rbx
+  sub    rsp, 48
+  mov    r12, rdi
+  mov    r13, rsi
+  call   bit_reverse              # Bit reversing array
+  sar    r13, 4                   # Calculate log2(N)
+  cvtsi2sdq xmm0, r13
+  call   log2
+  cvttsd2si rax, xmm0
+  mov    QWORD PTR [rsp], rax     # Save it to the stack
+  mov    r14, 1
+iter_ct_loop_i:
+  cmp    r14, rax                 # Check if r14 is greater then log2(N)
+  jg     iter_ct_end_i
+  movsd  xmm0, two                # Calculate stride = 2^(r14)
+  cvtsi2sdq xmm1, r14
+  call   pow
+  cvttsd2si r10, xmm0
+  mov    QWORD PTR [rsp + 40], r10# move stride to stack
+  movsd  xmm1, two_pi             # Calculating cexp(-2pi * I / stride)
+  divsd  xmm1, xmm0
+  pxor   xmm0, xmm0
+  call   cexp
+  movq   QWORD PTR [rsp + 8], xmm0  # Save it to stack
+  movq   QWORD PTR [rsp + 16], xmm1
+  xor    r15, r15
+iter_ct_loop_j:
+  cmp    r15, r13                 # Check if r15 is less then array size
+  je     iter_ct_end_j
+  movsd  xmm4, one                # Save 1 + 0i to stack
+  pxor   xmm5, xmm5
+  movsd  QWORD PTR [rsp + 24], xmm4
+  movsd  QWORD PTR [rsp + 32], xmm5
+  xor    rbx, rbx
+  mov    rax, QWORD PTR [rsp + 40]# Calculate stride / 2
+  sar    rax, 1
+iter_ct_loop_k:
+  cmp    rbx, rax                 # Check if rbx is less then stride / 2
+  je     iter_ct_end_k
+  mov    r8, r15                  # Saving pointers to X[k + j + stride / 2] and X[k + j]
+  add    r8, rbx
+  sal    r8, 4
+  mov    r9, QWORD PTR [rsp + 40]
+  sal    r9, 3
+  add    r9, r8
+  lea    r9, [r12 + r9]
+  lea    r8, [r12 + r8]
+  movsd  xmm0, QWORD PTR [r9]     # Calculate X[k + j] - v * X[k + j + stride / 2]
+  movsd  xmm1, QWORD PTR [r9 + 8]
+  movsd  xmm2, QWORD PTR [rsp + 24]
+  movsd  xmm3, QWORD PTR [rsp + 32]
+  call   __muldc3
+  movsd  xmm2, QWORD PTR [r8]
+  movsd  xmm3, QWORD PTR [r8 + 8]
+  subsd  xmm2, xmm0
+  subsd  xmm3, xmm1
+  movsd  QWORD PTR [r9], xmm2     # Saving answer
+  movsd  QWORD PTR [r9 + 8], xmm3
+  movsd  xmm0, QWORD PTR [r8]     # Calculating X[k + j] - (X[k + j + stride / 2] - X[k + j])
+  movsd  xmm1, QWORD PTR [r8 + 8]
+  subsd  xmm2, xmm0
+  subsd  xmm3, xmm1
+  subsd  xmm0, xmm2
+  subsd  xmm1, xmm3
+  movsd  QWORD PTR [r8], xmm0     # Saving answer
+  movsd  QWORD PTR [r8 + 8], xmm1
+  movsd  xmm0, QWORD PTR [rsp + 24] # Calculating v * w
+  movsd  xmm1, QWORD PTR [rsp + 32]
+  movsd  xmm2, QWORD PTR [rsp + 8]
+  movsd  xmm3, QWORD PTR [rsp + 16]
+  call   __muldc3
+  movsd  QWORD PTR [rsp + 24], xmm0 # Saving answer
+  movsd  QWORD PTR [rsp + 32], xmm1
+  add    rbx, 1
+  mov    rax, QWORD PTR [rsp + 40]
+  sar    rax, 1
+  jmp    iter_ct_loop_k
+iter_ct_end_k:
+  add    r15, QWORD PTR [rsp + 40]
+  jmp    iter_ct_loop_j
+iter_ct_end_j:
+  add    r14, 1
+  mov    rax, QWORD PTR [rsp]
+  jmp    iter_ct_loop_i
+iter_ct_end_i:
+  add    rsp, 48
+  pop    rbx
+  pop    r15
+  pop    r14
+  pop    r13
+  pop    r12
+  ret
+
+# rdi - array a ptr
+# rsi - array b ptr
+# rdx - array size
+approx:
+  push   r12
+  push   r13
+  push   r14
+  push   r15
+  mov    r12, rdi
+  mov    r13, rsi
+  mov    r14, rdx
+  lea    r15, [rdi + rdx]
+  sub    rsp, 8
+approx_loop:
+  cmp    r12, r15
+  je     approx_return
+  movsd  xmm0, QWORD PTR[r13]
+  movsd  xmm1, QWORD PTR[r13 + 8]
+  call   cabs
+  movsd  QWORD PTR [rsp], xmm0
+  movsd  xmm0, QWORD PTR[r12]
+  movsd  xmm1, QWORD PTR[r12 + 8]
+  call   cabs
+  movsd  xmm1, QWORD PTR [rsp]
+  subsd  xmm0, xmm1
+  mov    rdi, OFFSET fmt
+  mov    rax, 1
+  call   printf
+  add    r12, 16
+  add    r13, 16
+  jmp    approx_loop
+approx_return:
+  add    rsp, 8
+  pop    r15
+  pop    r14
+  pop    r13
+  pop    r12
+  ret
+
+main:
+  push   r12
+  sub    rsp, 2048
+  mov    rdi, 0
+  call   time
+  mov    edi, eax
+  call   srand
+  lea    r12, [rsp + 1024]
+loop:
+  cmp    r12, rsp
+  je     end_loop
+  sub    r12, 16
+  call   rand
+  cvtsi2sd xmm0, rax
+  divsd  xmm0, rand_max
+  lea    rax, [r12 + 1024]
+  movsd  QWORD PTR [r12], xmm0
+  movsd  QWORD PTR [rax], xmm0
+  mov    QWORD PTR [r12 + 8], 0
+  mov    QWORD PTR [rax + 8], 0
+  jmp    loop
+end_loop:
+  mov    rdi, rsp
+  mov    rsi, 1024
+  call   iterative_cooley_tukey
+  lea    rdi, [rsp + 1024]
+  mov    rsi, 1024
+  call   cooley_tukey
+  mov    rdi, rsp
+  lea    rsi, [rsp + 1024]
+  mov    rdx, 1024
+  call   approx
+  xor    rax, rax
+  add    rsp, 2048
+  pop    r12
+  ret
+
+