sinf.asm

;
; MIT License
; -----------
; 
; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
; 
; Permission is hereby granted, free of charge, to any person obtaining a copy
; of this Software and associated documentaon files (the "Software"), to deal
; in the Software without restriction, including without limitation the rights
; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
; copies of the Software, and to permit persons to whom the Software is
; furnished to do so, subject to the following conditions:
; 
; The above copyright notice and this permission notice shall be included in
; all copies or substantial portions of the Software.
; 
; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
; THE SOFTWARE.
;
;
; An implementation of the sinf function.
;
; Prototype   
;
;     float sinf(float x);
;
;   Computes sinf(x).
;   It will provide proper C99 return values,
;   but may not raise floating point status bits properly.
;   Based on the NAG C implementation.
;

.const
ALIGN 16
L_signbit        DQ 08000000000000000h
                 DQ 08000000000000000h
L_sign_mask      DQ 07FFFFFFFFFFFFFFFh
                 DQ 07FFFFFFFFFFFFFFFh
L_one            DQ 03FF0000000000000h
                 DQ 03FF0000000000000h
L_int_three      DQ 00000000000000003h
                 DQ 00000000000000003h
L_one_half       DQ 03FE0000000000000h
                 DQ 03FE0000000000000h
L_twobypi        DQ 03FE45F306DC9C883h
                 DQ 03FE45F306DC9C883h
L_piby2_1        DQ 03FF921FB54400000h
                 DQ 03FF921FB54400000h
L_one_sixth      DQ 03FC5555555555555h
                 DQ 03FC5555555555555h
L_piby2_1tail    DQ 03DD0B4611A626331h
                 DQ 03DD0B4611A626331h
L_piby2_2        DQ 03dd0b4611a600000h
                 DQ 03dd0b4611a600000h
L_piby2_2tail    DQ 03ba3198a2e037073h
                 DQ 03ba3198a2e037073h
L_inf_mask_32    DD 07F800000h
                 DD 07F800000h
                 DQ 07F8000007F800000h
L_int_two        DQ 00000000000000002h
                 DQ 00000000000000002h
L_piby2_lead     DQ 03ff921fb54442d18h
                 DQ 03ff921fb54442d18h
L_piby4          DQ 03fe921fb54442d18h
                 DQ 03fe921fb54442d18h
L_mask_3f2       DQ 03f20000000000000h
                 DQ 03f20000000000000h
L_mask_3f8       DQ 03f80000000000000h
                 DQ 03f80000000000000h

; Do these really need to be different?
L_large_x_fma3   DQ 04170008AC0000000h     ; 16779436
L_large_x_sse2   DQ 0416E848000000000h     ; 16000000

EXTRN __Lcosfarray:QWORD
EXTRN __Lsinfarray:QWORD
EXTRN __use_fma3_lib:DWORD
EXTRN __L_2_by_pi_bits:BYTE

; define local variable storage offsets
p_temp              EQU 010h     ; temporary for get/put bits operation
p_temp1             EQU 018h     ; temporary for get/put bits operation
region              EQU 020h     ; pointer to region for remainder_piby2
r                   EQU 028h     ; pointer to r for remainder_piby2
dummy_space         EQU 040h

stack_size          EQU 058h

include fm.inc

fname           TEXTEQU <sinf>
fname_special   TEXTEQU <_sinf_special>

;Define name and any external functions being called
EXTRN           __remainder_piby2d2f_forC : PROC    ; NEAR
EXTERN          fname_special        : PROC

.code
ALIGN 16
PUBLIC fname
fname PROC FRAME
    StackAllocate stack_size
    .ENDPROLOG   
    cmp          DWORD PTR __use_fma3_lib, 0
    jne          Lsinf_fma3

Lsinf_sse2:

    xorpd   xmm2, xmm2                                ; zeroed out for later use

;;  if NaN or inf
    movd    edx, xmm0
    mov     eax, 07f800000h
    mov     r10d, eax
    and     r10d, edx
    cmp     r10d, eax 
    jz      Lsinf_sse2_naninf

; GET_BITS_DP64(x, ux);
; get the input value to an integer register.
    cvtss2sd     xmm0, xmm0               ; convert input to double.
    movd         rdx, xmm0                ; rdx is ux

;  ax = (ux & ~SIGNBIT_DP64);
    mov     r10, rdx
    btr     r10, 63                                   ; r10 is ax
    mov     r8d, 1                                    ; for determining region later on

;;  if (ax <= 0x3fe921fb54442d18)  abs(x) <= pi/4 
    mov     rax, 03fe921fb54442d18h
    cmp     r10, rax
    jg      Lsinf_absx_gt_piby4

;;      if (ax < 0x3f80000000000000)  abs(x) < 2.0^(-7) 
    mov     rax, 3f80000000000000h
    cmp     r10, rax
    jge     Lsinf_sse2_small

;;          if (ax < 0x3f20000000000000) abs(x) < 2.0^(-13)
    mov     rax, 3f20000000000000h
    cmp     r10, rax
    jge     Lsinf_sse2_smaller

;                  sinf = x;
    jmp     Lsinf_sse2_cleanup 

ALIGN 16
Lsinf_sse2_smaller:   
;   sinf = x - x^3 * 0.1666666666666666666;
    movsd   xmm2, xmm0
    movsd   xmm4, QWORD PTR L_one_sixth   ; 0.1666666666666666666
    mulsd   xmm2, xmm2                    ; x^2
    mulsd   xmm2, xmm0                    ; x^3
    mulsd   xmm2, xmm4                    ; x^3 * 0.1666666666666666666
    subsd   xmm0, xmm2                    ; x - x^3 * 0.1666666666666666666
    jmp     Lsinf_sse2_cleanup  

ALIGN 16
Lsinf_sse2_small:   
    movsd   xmm2, xmm0                    ; x2 = r * r;
    mulsd   xmm2, xmm0                    ; x2

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; region 0 or 2     - do a sinf calculation
;  zs = x + x3((s1 + x2 * s2) + x4(s3 + x2 * s4));
    movsd   xmm1, QWORD PTR __Lsinfarray+18h          ; s4
    mulsd   xmm1, xmm2                                ; s4x2
    movsd   xmm4, xmm2                                ; move for x4
    movsd   xmm5, QWORD PTR __Lsinfarray+8h           ; s2
    mulsd   xmm4, xmm2                                ; x4
    movsd   xmm3, xmm0                                ; move for x3
    mulsd   xmm5, xmm2                                ; s2x2
    mulsd   xmm3, xmm2                                ; x3        
    addsd   xmm1, QWORD PTR __Lsinfarray+10h          ; s3+s4x2
    mulsd   xmm1, xmm4                                ; s3x4+s4x6
    addsd   xmm5, QWORD PTR __Lsinfarray              ; s1+s2x2
    addsd   xmm1, xmm5                                ; s1+s2x2+s3x4+s4x6
    mulsd   xmm1, xmm3                                ; x3(s1+s2x2+s3x4+s4x6)
    addsd   xmm0, xmm1                                ; x + x3(s1+s2x2+s3x4+s4x6)
    jmp     Lsinf_sse2_cleanup

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
ALIGN 16
Lsinf_absx_gt_piby4:   
;  xneg = (ax != ux);
    cmp     rdx, r10
    mov     r11d, 0
;;  if (xneg) x = -x;
    jz      Lsinf_sse2_reduce_moderate
    
    mov     r11d, 1
    subsd   xmm2, xmm0
    movsd   xmm0, xmm2

Lsinf_sse2_reduce_moderate:
;;  if (x < 5.0e6)
    cmp     r10, QWORD PTR L_large_x_sse2
    jae     Lsinf_sse2_reduce_large

; reduce  the argument to be in a range from -pi/4 to +pi/4
; by subtracting multiples of pi/2
    movsd   xmm2, xmm0
    movsd   xmm3, QWORD PTR L_twobypi
    movsd   xmm4, xmm0
    movsd   xmm5, QWORD PTR L_one_half              ; .5
    mulsd   xmm2, xmm3

;/* How many pi/2 is x a multiple of? */
;      xexp  = ax >> EXPSHIFTBITS_DP64;
    mov     r9, r10
    shr     r9, 52                                    ; >>EXPSHIFTBITS_DP64

;        npi2  = (int)(x * twobypi + 0.5);
    addsd   xmm2, xmm5                        ; npi2

    movsd   xmm3, QWORD PTR L_piby2_1
    cvttpd2dq  xmm0, xmm2                             ; convert to integer
    movsd   xmm1, QWORD PTR L_piby2_1tail
    cvtdq2pd   xmm2, xmm0                             ; and back to double.

;      /* Subtract the multiple from x to get an extra-precision remainder */
;      rhead  = x - npi2 * piby2_1;
    mulsd   xmm3, xmm2
    subsd   xmm4, xmm3                                ; rhead

;      rtail  = npi2 * piby2_1tail;
    mulsd   xmm1, xmm2
    movd    eax, xmm0

;      GET_BITS_DP64(rhead-rtail, uy);                   
; originally only rhead
    movsd   xmm0, xmm4
    subsd   xmm0, xmm1

    movsd   xmm3, QWORD PTR L_piby2_2
    movd    rcx, xmm0
    movsd   xmm5, QWORD PTR L_piby2_2tail

;    xmm0=r, xmm4=rhead, xmm1=rtail, xmm2=npi2, xmm3=temp for calc, xmm5= temp for calc
;      expdiff = xexp - ((uy & EXPBITS_DP64) >> EXPSHIFTBITS_DP64);
    shl     rcx, 1                                    ; strip any sign bit
    shr     rcx, 53                                   ; >> EXPSHIFTBITS_DP64 +1
    sub     r9, rcx                                   ; expdiff

;;      if (expdiff > 15)
    cmp     r9, 15
    jle     Lsinf_sse2_expdiff_le_15

;       The remainder is pretty small compared with x, which
;       implies that x is a near multiple of pi/2
;       (x matches the multiple to at least 15 bits)
;          t  = rhead;
    movsd   xmm1, xmm4

;          rtail  = npi2 * piby2_2;
    mulsd   xmm3, xmm2

;          rhead  = t - rtail;
    mulsd   xmm5, xmm2                                ; npi2 * piby2_2tail
    subsd   xmm4, xmm3                                ; rhead

;          rtail  = npi2 * piby2_2tail - ((t - rhead) - rtail);
    subsd   xmm1, xmm4                                ; t - rhead
    subsd   xmm1, xmm3                                ; -rtail
    subsd   xmm5, xmm1                                ; rtail

;      r = rhead - rtail;
    movsd   xmm0, xmm4

;HARSHA
;xmm1=rtail
    movsd   xmm1, xmm5
    subsd   xmm0, xmm5

;    xmm0=r, xmm4=rhead, xmm1=rtail
 
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
Lsinf_sse2_expdiff_le_15:  
    cmp     rcx, 03f2h                                ; is r < 2^-13 ?
    jge     Lsinf_sse2_calc_sincosf_piby4             ; use taylor series if not
    cmp     rcx, 03deh                                ; if r really small.
    jle     Lsinf_sse2_r_very_small                   ; then sinf(r) ~ r or 1

    movsd   xmm2, xmm0
    mulsd   xmm2, xmm0                                ; xmm2 <-- r^2

;;      if region is 0 or 2 do a sinf calc.
    and     r8d, eax
    jnz     Lsinf_sse2_small_calc_sin

; region 0 or 2 do a sinf calculation
; use simply polynomial
;              x - x*x*x*0.166666666666666666;
    movsd   xmm3, QWORD PTR L_one_sixth
    mulsd   xmm3, xmm0                                ; * x
    mulsd   xmm3, xmm2                                ; * x^2
    subsd   xmm0, xmm3                                ; xs
    jmp     Lsinf_sse2_adjust_region

ALIGN 16
Lsinf_sse2_small_calc_sin:   
; region 1 or 3 do a cosf calculation
; use simply polynomial
;              1.0 - x*x*0.5;
    movsd   xmm0, QWORD PTR L_one  ; 1.0
    mulsd   xmm2, QWORD PTR L_one_half              ; 0.5 *x^2
    subsd   xmm0, xmm2                                ; xc
    jmp     Lsinf_sse2_adjust_region

ALIGN 16
Lsinf_sse2_r_very_small:
;;      if region is 0 or 2    do a sinf calc. (sinf ~ x)
    and     r8d, eax
    jz      Lsinf_sse2_adjust_region

    movsd   xmm0, QWORD PTR L_one  ; cosf(r) is a 1
    jmp     Lsinf_sse2_adjust_region

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
ALIGN 16
Lsinf_sse2_reduce_large:   
;      Reduce x into range [-pi/4, pi/4]
;      __remainder_piby2d2f_forC(x, &r, &region);

    mov          QWORD PTR p_temp[rsp], r11
    lea          rdx, QWORD PTR r[rsp]
    lea          r8,  QWORD PTR region[rsp]
    movd         rcx, xmm0
    call         __remainder_piby2d2f_forC
    mov          r11, QWORD PTR p_temp[rsp]
    mov          r8d, 1                   ; for determining region later on
    movsd        xmm1, QWORD PTR r[rsp]   ; x
    mov          eax,  DWORD PTR region[rsp] ; region

; xmm0 = x, xmm4 = xx, r8d = 1, eax= region
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; perform taylor series to calc sinfx, cosfx
Lsinf_sse2_calc_sincosf_piby4:
;  x2 = r * r;
    movsd   xmm2, xmm0
    mulsd   xmm2, xmm0                                ; x2

;;      if region is 1 or 3, do a cosf calc.
    and     r8d, eax
    jnz     Lsinf_sse2_do_cosf_calc

; region is 0 or 2: do a sinf calc.
;  zs = x + x3((s1 + x2 * s2) + x4(s3 + x2 * s4));
Lsinf_sse2_do_sinf_calc:
    movsd   xmm1, QWORD PTR __Lsinfarray+18h          ; s4
    mulsd   xmm1, xmm2                                ; s4x2
    movsd   xmm4, xmm2                                ; move for x4    
    mulsd   xmm4, xmm2                                ; x4
    movsd   xmm5, QWORD PTR __Lsinfarray+8h           ; s2
    mulsd   xmm5, xmm2                                ; s2x2
    movsd   xmm3, xmm0                                ; move for x3
    mulsd   xmm3, xmm2                                ; x3        
    addsd   xmm1, QWORD PTR __Lsinfarray+10h          ; s3+s4x2
    mulsd   xmm1, xmm4                                ; s3x4+s4x6     
    addsd   xmm5, QWORD PTR __Lsinfarray              ; s1+s2x2
    addsd   xmm1, xmm5                                ; s1+s2x2+s3x4+s4x6
    mulsd   xmm1, xmm3                                ; x3(s1+s2x2+s3x4+s4x6)
    addsd   xmm0, xmm1                                ; x + x3(s1+s2x2+s3x4+s4x6)
    jmp     Lsinf_sse2_adjust_region

ALIGN 16
Lsinf_sse2_do_cosf_calc:   

; region 1 or 3     - do a cosf calculation
;  zc = 1-0.5*x2+ c1*x4 +c2*x6 +c3*x8;
;     zc = 1-0.5*x2+ c1*x4 +c2*x6 +c3*x8 + c4*x10 for a higher precision
    movsd   xmm1, QWORD PTR __Lcosfarray+20h          ; c4
    movsd   xmm4, xmm2                                ; move for x4
    mulsd   xmm1, xmm2                                ; c4x2
    movsd   xmm3, QWORD PTR __Lcosfarray+10h          ; c2
    mulsd   xmm4, xmm2                                ; x4
    movsd   xmm0, QWORD PTR __Lcosfarray              ; c0
    mulsd   xmm3, xmm2                                ; c2x2
    mulsd   xmm0, xmm2                                ; c0x2 (=-0.5x2)
    addsd   xmm1, QWORD PTR __Lcosfarray+18h          ; c3+c4x2
    mulsd   xmm1, xmm4                                ; c3x4 + c4x6
    addsd   xmm3, QWORD PTR __Lcosfarray+8h           ; c1+c2x2
    addsd   xmm1, xmm3                                ; c1 + c2x2 + c3x4 + c4x6
    mulsd   xmm1, xmm4                                ; c1x4 + c2x6 + c3x8 + c4x10
    addsd   xmm0, QWORD PTR L_one                     ; 1 - 0.5x2
    addsd   xmm0, xmm1                                ; 1 - 0.5x2 + c1x4 + c2x6 + c3x8 + c4x10

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;


Lsinf_sse2_adjust_region:           
; positive or negative
;      switch (region)
    shr     eax, 1
    mov     ecx, eax
    and     eax, r11d

    not     ecx
    not     r11d
    and     ecx, r11d

    or      eax, ecx
    and     eax, 1
    jnz     Lsinf_sse2_cleanup

;; if the original region 0, 1 and arg is negative, then we negate the result.
;; if the original region 2, 3 and arg is positive, then we negate the result.
    movsd   xmm2, xmm0
    xorpd   xmm0, xmm0
    subsd   xmm0, xmm2


Lsinf_sse2_cleanup:   
    cvtsd2ss xmm0, xmm0
    StackDeallocate stack_size
    ret

ALIGN 16
Lsinf_sse2_naninf:   
    call    fname_special
    StackDeallocate stack_size
    ret

ALIGN 16
Lsinf_fma3:
    vmovd        eax,xmm0
    mov          r8d,L_inf_mask_32
    and          eax,r8d
    cmp          eax, r8d
    jz           Lsinf_fma3_naninf

    vcvtss2sd    xmm5,xmm0,xmm0
    vmovq        r9,xmm5
    btr          r9,63                    ; r9 <-- |x|
    cmp          r9,L_piby4
    jg           Lsinf_fma3_range_reduce

    cmp          r9,L_mask_3f8
    jge          Lsinf_fma3_compute_sinf_piby_4

    cmp          r9,L_mask_3f2
    jge          Lsinf_fma3_compute_x_xxx_0_1666

    ; Here |x| < 2^-13; just return sin x ~ x
    StackDeallocate stack_size
    ret

ALIGN 16
Lsinf_fma3_compute_x_xxx_0_1666:
    ; Here |x| < 2^-7; return sin x ~ x + 1/6 x^3
    vmulsd       xmm1,xmm5,xmm5
    vmulsd       xmm0,xmm1,xmm5           ; xmm1 <-- x^3
    vfnmadd132sd xmm0,xmm5,L_one_sixth    ; x - x*x*x*0.166666666666666666
    jmp          Lsinf_fma3_return_sinf_s

ALIGN 16
Lsinf_fma3_compute_sinf_piby_4:
    vmovapd      xmm0,xmm5
    vmovsd       xmm1,__Lsinfarray+010h
    vmulsd       xmm3,xmm0,xmm0           ; xmm3 <-- x^2
    vfmadd231sd  xmm1,xmm3,__Lsinfarray+018h
    vfmadd213sd  xmm1,xmm3,__Lsinfarray+08h
    vfmadd213sd  xmm1,xmm3,__Lsinfarray
    vmulsd       xmm3,xmm0,xmm3           ; xmm3 <-- x^3
    vfmadd231sd  xmm0,xmm1,xmm3
    jmp          Lsinf_fma3_return_sinf_s

ALIGN 16
Lsinf_fma3_range_reduce:
    vmovq        xmm0,r9                  ; xmm0 <-- |x|
    cmp          r9,L_large_x_fma3
    jge          Lsinf_fma3_reduce_large

Lsinf_fma3_sinf_reduce_moderate:
    vandpd       xmm1,xmm0,L_sign_mask    ; xmm1 <-- |x|  mov should suffice WAT
    vmovapd      xmm2,L_twobypi
    vfmadd213sd  xmm2,xmm1,L_one_half
    vcvttpd2dq   xmm2,xmm2
    vpmovsxdq    xmm1,xmm2
    vandpd       xmm4,xmm1,L_int_three    ; xmm4 <-- region
    vshufps      xmm1 ,xmm1,xmm1,8
    vcvtdq2pd    xmm1,xmm1
    vmovdqa      xmm2,xmm0
    vfnmadd231sd xmm2,xmm1,L_piby2_1      ; xmm2 <-- rhead
    vmulsd       xmm3,xmm1,L_piby2_1tail  ; xmm3 <-- rtail
    vsubsd       xmm0,xmm2,xmm3           ; xmm0 <-- r_1
    vsubsd       xmm2,xmm2,xmm0
    vsubsd       xmm1,xmm2,xmm3           ; xmm4 <-- rr_1
    jmp          Lsinf_fma3_exit_s

ALIGN 16
Lsinf_fma3_reduce_large:
    lea          r9,__L_2_by_pi_bits
    ;xexp = (x >> 52) 1023
    vmovq        r11,xmm0
    mov          rcx,r11
    shr          r11,52
    sub          r11,1023                 ; r11 <-- xexp = exponent of input x
    ;calculate the last byte from which to start multiplication
    ;last = 134 (xexp >> 3)
    mov          r10,r11
    shr          r10,3
    sub          r10,134 ;r10 = last
    neg          r10 ;r10 = last
    ;load 64 bits of 2_by_pi
    mov          rax,[r9+r10]
    ;mantissa of x = ((x << 12) >> 12) | implied bit
    shl          rcx,12
    shr          rcx,12 ;rcx = mantissa part of input x
    bts          rcx,52 ;add the implied bit as well
    ;load next 128 bits of 2_by_pi
    add          r10,8 ;increment to next 8 bytes of 2_by_pi
    vmovdqu      xmm0,XMMWORD PTR[r9+r10]
    ;do three 64bit multiplications with mant of x
    mul          rcx
    mov          r8,rax                   ; r8 <-- last 64 bits of mul = res1[2]
    mov          r10,rdx                  ; r10 <-- carry
    vmovq        rax,xmm0
    mul          rcx
    ;resexp = xexp & 7
    and          r11,7                    ; r11 <-- resexp = last 3 bits
    psrldq       xmm0,8
    add          rax,r10                  ; add the previous carry
    adc          rdx,0
    mov          r9,rax                   ; r9 <-- next 64 bits of mul = res1[1]
    mov          r10,rdx                  ; r10 <-- carry
    vmovq        rax,xmm0
    mul          rcx
    add          r10,rax                  ; r10 = most sig 64 bits = res1[0]
    ;find the region
    ;last three bits ltb = most sig bits >> (54 resexp))
    ; decimal point in last 18 bits == 8 lsb's in first 64 bits 
    ; and 8 msb's in next 64 bits
    ;point_five = ltb & 01h;
    ;region = ((ltb >> 1) + point_five) & 3;
    mov          rcx,54
    mov          rax,r10
    sub          rcx,r11
    xor          rdx,rdx ;rdx = sign of x(i.e first part of x * 2bypi)
    shr          rax,cl
    jnc          Lsinf_fma3_no_point_five_f
    ;;if there is carry.. then negate the result of multiplication
    not          r10
    not          r9
    not          r8
    mov          rdx,08000000000000000h

Lsinf_fma3_no_point_five_f:
    adc          rax,0
    and          rax,3
    vmovd        xmm4,eax ;store region to xmm4
    ;calculate the number of integer bits and zero them out
    mov          rcx,r11
    add          rcx,10                   ; rcx <-- no. of integer bits
    shl          r10,cl
    shr          r10,cl                   ; r10 contains only mant bits
    sub          rcx,64                   ; form the exponent
    mov          r11,rcx
    ;find the highest set bit
    bsr          rcx,r10
    jnz          Lsinf_fma3_form_mantissa_f
    mov          r10,r9
    mov          r9,r8
    mov          r8,0
    bsr          rcx,r10                  ; rcx <-- hsb
    sub          r11,64

Lsinf_fma3_form_mantissa_f:
    add          r11,rcx ;for exp of x
    sub          rcx,52 ;rcx = no. of bits to shift in r10
    cmp          rcx,0
    jl           Lsinf_fma3_hsb_below_52_f
    je           Lsinf_fma3_form_numbers_f
    ;hsb above 52
    mov         r8,r10                    ; previous contents of r8 not required
    shr         r10,cl                    ; r10 = mantissa of x with hsb at 52
    shr         r9,cl                     ; make space for bits from r10
    sub         rcx,64
    neg         rcx
    shl         r8,cl
    or          r9,r8                     ; r9 = mantissa bits of xx
    jmp         Lsinf_fma3_form_numbers_f

ALIGN 16
Lsinf_fma3_hsb_below_52_f:
    neg          rcx
    mov          rax,r9
    shl          r10,cl
    shl          r9,cl
    sub          rcx,64
    neg          rcx
    shr          rax,cl
    or           r10,rax
    shr          r8,cl
    or           r9,r8

ALIGN 16
Lsinf_fma3_form_numbers_f:
    add          r11,1023
    btr          r10,52                   ; remove the implied bit
    mov          rcx,r11
    or           r10,rdx                  ; put the sign
    shl          rcx,52
    or           r10,rcx                  ; r10 <-- x
    vmovq        xmm0,r10                 ; xmm0 <-- x
    vmulsd       xmm0,xmm0,L_piby2_lead
Lsinf_fma3_exit_s:
    vmovq        rax,xmm4
    and          rax,01h
    cmp          rax,01h
    jz           Lsinf_fma3_cos_piby4_compute

Lsinf_fma3_sin_piby4_compute:
;;    vmovapd      xmm1,__Lsinfarray+010h
    vmovsd       xmm1,__Lsinfarray+010h
    vmulsd       xmm3,xmm0,xmm0
    vfmadd231sd  xmm1,xmm3,__Lsinfarray+018h
    vfmadd213sd  xmm1,xmm3,__Lsinfarray+008h
    vfmadd213sd  xmm1,xmm3,__Lsinfarray
    vmulsd       xmm3,xmm0,xmm3           ; xmm3 <-- x^3
    vfmadd231sd  xmm0,xmm1,xmm3
    jmp          Lsinf_fma3_exit_s_1

ALIGN 16
Lsinf_fma3_cos_piby4_compute:
    vmovapd      xmm2,L_one
    vmulsd       xmm3,xmm0,xmm0
    vfmadd231sd  xmm2,xmm3,__Lcosfarray   ; xmm2 <-- 1 + c0 x^2
    ; would simple Horner's be slower?
    vmovsd       xmm1,__Lcosfarray+018h       ; xmm1 <-- c3
    vfmadd231sd  xmm1,xmm3,__Lcosfarray+020h  ; xmm1 <--  c4 x^2+ c3
    vfmadd213sd  xmm1,xmm3,__Lcosfarray+010h  ; xmm1 <--  (c4 x^2+ c3)x^2 + c2
    vfmadd213sd  xmm1,xmm3,__Lcosfarray+008h  ; xmm1 <--  ((c4 x^2+ c3)x^2 + c2)x^2 + c1
    vmulsd       xmm3,xmm3,xmm3               ; xmm3 <-- x^4
    vmovdqa      xmm0,xmm2
    vfmadd231sd  xmm0,xmm1,xmm3
Lsinf_fma3_exit_s_1:
    ; assuming FMA3 ==> AVX ==> SSE4.1
    vpcmpeqq     xmm2,xmm4,XMMWORD PTR L_int_two
    vpcmpeqq     xmm3,xmm4,XMMWORD PTR L_int_three
    vorpd        xmm3,xmm2,xmm3
    vandnpd      xmm3,xmm3,L_signbit
    vxorpd       xmm0,xmm0,xmm3

    vandnpd      xmm1,xmm5,L_signbit
    vxorpd       xmm0,xmm1,xmm0
Lsinf_fma3_return_sinf_s:
    vcvtsd2ss    xmm0,xmm0,xmm0
    StackDeallocate stack_size
    ret
    
Lsinf_fma3_naninf:
    call         fname_special
    StackDeallocate stack_size
    ret

fname endp
END