remainder_piby2_forFMA3.asm

;
;
; MIT License
; -----------
; 
; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
; 
; Permission is hereby granted, free of charge, to any person obtaining a copy
; of this Software and associated documentaon files (the "Software"), to deal
; in the Software without restriction, including without limitation the rights
; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
; copies of the Software, and to permit persons to whom the Software is
; furnished to do so, subject to the following conditions:
; 
; The above copyright notice and this permission notice shall be included in
; all copies or substantial portions of the Software.
; 
; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
; THE SOFTWARE.
;
; An implementation of the remainder by pi/2 function using fma3
; This is a service routine for use by trig functions coded in asm that use fma3
;
; On input,
;   xmm0 = x;
; On ouput
;   xmm0 = r
;   xmm1 = rr
;   rax = region

.const
ALIGN 16
L_piby2_lead   DQ 03ff921fb54442d18h, 03ff921fb54442d18h
L_fff800       DQ 0fffffffff8000000h, 0fffffffff8000000h 
L_piby2_part1  DQ 03ff921fb50000000h, 03ff921fb50000000h 
L_piby2_part2  DQ 03e5110b460000000h, 03e5110b460000000h
L_piby2_part3  DQ 03c91a62633145c06h, 03c91a62633145c06h
L_piby2_1      DQ 03FF921FB54400000h, 03FF921FB54400000h
L_piby2_2      DQ 03DD0B4611A600000h, 03DD0B4611A600000h
L_piby2_3      DQ 03BA3198A2E000000h, 03BA3198A2E000000h
L_piby2_1tail  DQ 03DD0B4611A626331h, 03DD0B4611A626331h
L_piby2_2tail  DQ 03BA3198A2E037073h, 03BA3198A2E037073h
L_piby2_3tail  DQ 0397B839A252049C1h, 0397B839A252049C1h
L_sign_mask    DQ 07FFFFFFFFFFFFFFFh, 07FFFFFFFFFFFFFFFh
L_twobypi      DQ 03FE45F306DC9C883h, 03FE45F306DC9C883h
L_point_five   DQ 03FE0000000000000h, 03FE0000000000000h
L_int_three    DQ 00000000000000003h, 00000000000000003h
L_inf_mask_64  DQ 07FF0000000000000h, 07FF0000000000000h
L_signbit      DQ 08000000000000000h, 08000000000000000h
;; constants for BDL reduction
L_r            DQ 03FE45F306DC9C883h, 03FE45F306DC9C883h  ; 2/pi
L_xc1          DQ 03FF921FB54442D18H, 03FF921FB54442D18h  ; pi/2 (L_piby2_lead)
L_xc2          DQ 03C91A62633145C00H, 03C91A62633145C00h  ; pi/2 part 2
L_xc3          DQ 0397B839A252049C0H, 0397B839A252049C0h  ; pi/2 part 3
; sigma is 3*2^(p-n-2) where n is 0 and p is 53.
L_sigma        DQ 04338000000000000h, 04338000000000000h  ; 6755399441055744.

EXTRN __L_2_by_pi_bits:BYTE

region      EQU        020h
stack_size  EQU        038h

include fm.inc

fname TEXTEQU  <__remainder_piby2_fma3>
fbname TEXTEQU <__remainder_piby2_fma3_bdl>

.code

PUBLIC fname
fname PROC FRAME
    StackAllocate stack_size
    .ENDPROLOG

    ; This function is not using rdx, r8, and r9 as pointers;
    ; all returns are in registers

    ; get the unbiased exponent and the mantissa part of x
    lea          r9,__L_2_by_pi_bits
 
    ; xexp = (x >> 52) - 1023
    vmovq        r11,xmm0
    mov          rcx,r11 
    shr          r11,52
    sub          r11,1023                 ; r11 <-- xexp = exponent of input x 

    ; calculate the last byte from which to start multiplication
    ; last = 134 - (xexp >> 3) 
    mov          r10,r11
    shr          r10,3
    sub          r10,134                  ; r10 <-- -last
    neg          r10                      ; r10 <-- last

    ; load 64 bits of 2_by_pi
    mov          rax,[r9 + r10]
 
    ; mantissa of x = ((x << 12) >> 12) | implied bit
    shl          rcx,12
    shr          rcx,12                   ; rcx <-- mantissa part of input x 
    bts          rcx,52                   ; add the implied bit as well 

    ; load next 128 bits of 2_by_pi 
    add          r10,8                    ; increment to next 8 bytes of 2_by_pi
    vmovdqu      xmm0,XMMWORD PTR[r9 + r10] 

    ; do three 64-bit multiplications with mant of x 
    mul          rcx
    mov          r8,rax                   ; r8 <-- last 64 bits of mul = res1[2]
    mov          r10,rdx                  ; r10 <-- carry
    vmovq        rax,xmm0
    mul          rcx
    ; resexp = xexp & 7 
    and          r11,7                    ; r11 <-- resexp = last 3 bits of xexp
    vpsrldq      xmm0,xmm0,8 
    add          rax,r10                  ; add the previous carry
    adc          rdx,0
    mov          r9,rax                   ; r9 <-- next 64 bits of mul = res1[1]
    mov          r10,rdx                  ; r10 <-- carry
    vmovq        rax,xmm0
    mul          rcx
    add          r10,rax                  ; r10 <-- most sig. 64 bits = res1[0]
 
    ; find the region 
    ; last three bits ltb = most sig bits >> (54 - resexp));
    ;   decimal point in last 18 bits ==> 8 lsb's in first 64 bits
    ;   and 8 msb's in next 64 bits
    ; point_five = ltb & 01h;
    ; region = ((ltb >> 1) + point_five) & 3; 
    mov          rcx,54
    mov          rax,r10
    sub          rcx,r11
    xor          rdx,rdx                  ; rdx <-- sign of x 
    shr          rax,cl 
    jnc          L__no_point_five
    ; if there is carry then negate the result of multiplication
    not          r10
    not          r9
    not          r8
    mov          rdx,08000000000000000h

ALIGN  16 
L__no_point_five:
    adc          rax,0
    and          rax,3                    ; rax now has region
    mov          QWORD PTR [region+rsp], rax

    ; calculate the number of integer bits and zero them out
    mov          rcx,r11 
    add          rcx,10                   ; rcx = no. of integer bits
    shl          r10,cl
    shr          r10,cl                   ; r10 contains only mant bits
    sub          rcx,64                   ; form the exponent
    mov          r11,rcx
 
    ; find the highest set bit
    bsr          rcx,r10
    jnz          L__form_mantissa
    mov          r10,r9
    mov          r9,r8
    mov          r8,0
    bsr          rcx,r10                  ; rcx = hsb
    sub          r11,64
 
ALIGN  16 
L__form_mantissa:
    add          r11,rcx                  ; for exp of x
    sub          rcx,52                   ; rcx = no. of bits to shift in r10 
    cmp          rcx,0
    jl           L__hsb_below_52
    je           L__form_numbers
    ; hsb above 52
    mov          r8,r10                   ; previous r8 not required
    shr          r10,cl                   ; r10 = mantissa of x with hsb at 52
    shr          r9,cl                    ; make space for bits from r10
    sub          rcx,64
    neg          rcx
    ; rcx <-- no of bits to shift r10 to move those bits to r9
    shl          r8,cl
    or           r9,r8                    ; r9 = mantissa bits of xx 
    jmp          L__form_numbers
 
ALIGN  16 
L__hsb_below_52:
    ; rcx has shift count (< 0)
    neg          rcx
    mov          rax,r9
    shl          r10,cl
    shl          r9,cl
    sub          rcx,64
    neg          rcx
    shr          rax,cl
    or           r10,rax
    shr          r8,cl
    or           r9,r8
 
ALIGN  16
;   Here r11 has unbiased exponent
;   r10 has mantissa, with implicit bit possibly set
;   rdx has the sign bit
L__form_numbers:
    add          r11,1023                 ; r11 <-- biased exponent
    btr          r10,52                   ; remove the implicit bit
    mov          rcx,r11                  ; rcx <-- copy of biased exponent
    or           r10,rdx                  ; put the sign 
    shl          rcx,52                   ; shift biased exponent into place
    or           r10,rcx                  ; r10 <-- x
    vmovq        xmm2,r10                 ; xmm1l <-- x
 
    ; form xx
;   xor          rcx,rcx ; Why is this necessary???
    bsr          rcx,r9                   ; scan for high bit of xx mantissa
    sub          rcx,64                   ; to shift the implied bit as well
    neg          rcx
    shl          r9,cl
    shr          r9,12
    add          rcx,52
    sub          r11,rcx
    shl          r11,52
    or           r9,rdx
    or           r9,r11
    vmovq        xmm1,r9                  ; xmm1 <-- xx
    vandpd       xmm4,xmm2,L_fff800       ; xmm4 <-- hx
    vsubsd       xmm0,xmm2,xmm4           ; xmm0 <-- tx
    vmulsd       xmm5,xmm2,L_piby2_lead   ; xmm5 <-- c
    vmulsd       xmm3,xmm4,L_piby2_part1
    vsubsd       xmm3,xmm3,xmm5
    vfmadd231sd  xmm3,xmm0,L_piby2_part1
    vfmadd231sd  xmm3,xmm4,L_piby2_part2
    vfmadd231sd  xmm3,xmm0,L_piby2_part2
    vmulsd       xmm4,xmm1,L_piby2_lead
    vfmadd231sd  xmm4,xmm2,L_piby2_part3
    vaddsd       xmm3,xmm3,xmm4           ; xmm3 <-- cc
    vaddsd       xmm0,xmm5,xmm3           ; xmm0 <--r
    vsubsd       xmm1,xmm5,xmm0
    vaddsd       xmm1,xmm1,xmm3           ; xmm1 <-- rr
    mov          rax, QWORD PTR [region+rsp]

    StackDeallocate stack_size
    ret
fname endp

ALIGN 16
PUBLIC fbname
fbname PROC FRAME
    .ENDPROLOG
    ; Boldo, Daumas, annd Li, "Formally Verified Argument
    ; Reduction With a Fused Multiply-Add,"
    ; IEEE Trans. Comp., vol. 58, #8, Aug. 2009
    ; coefficients are from table 1, mutatis mutandis
    ; algorithm is their formula 3.1 (for getting z from sigma) and
    ; algorithm 5.1 (and extended version) for actual reduction
    vmovapd      xmm1,xmm0
    vmovapd      xmm4,L_xc2               ; xmm4 <-- xc2
    vmovapd      xmm2,L_sigma
    vfmadd132sd  xmm1,xmm2,L_r            ; z = arg*r + sigma
    vsubsd       xmm1,xmm1,xmm2           ; xmm1 <-- z -= sigma
    vcvttpd2dq   xmm5,xmm1
    vmovq        rax, xmm5
    vmovapd      xmm2,xmm1
    vfnmadd132sd xmm2,xmm0,L_xc1          ; xmm2 <-- u = arg - z*xc1
    vmulsd       xmm3,xmm1,xmm4           ; xmm3 <-- p1 = z*xc2
    vmovapd      xmm0,xmm1                ; xmm0 <-- copy of z
    vfmsub213sd  xmm0,xmm4,xmm3           ; xmm0 <-- p2 = z*xc2 - p1
    vsubsd       xmm5,xmm2,xmm3           ; xmm5 <-- t1 = u - p1
    ; We really don't want to spill in this code, so we're commandeering xmm4
    vsubsd       xmm4,xmm2,xmm5           ; xmm4 <-- temp = u - t1
    vsubsd       xmm4,xmm4,xmm3           ; xmm4 <-- t2 = temp - p1
    ; used to use xmm4 here for L_xc2
    vfnmadd231sd xmm2,xmm1,L_xc2          ; xmm2 <-- v1 = -xc2*z + u
    vsubsd       xmm5,xmm5,xmm2           ; xmm5 <-- v2 = t1 - v1
    vaddsd       xmm5,xmm5,xmm4           ; xmm5 <-- v2 += t2
    vsubsd       xmm5,xmm5,xmm0           ; xmm5 <-- v2 -= p2
    vmovapd      xmm0,xmm2                ; xmm0 <-- arghead = v1
    vfnmadd132sd xmm1,xmm5,L_xc3          ; xmm1 <-- argtail = -xc3*z + v2
    and          rax, 3                   ; rax <-- region
    ret
fbname        endp
END