Implement NVVMReflect in Julia. #280

maleadt · 2022-01-26T15:44:15Z

sin.(CuArray([1]))

Before:

; PTX CompilerJob of kernel broadcast_kernel(CUDA.CuKernelContext, CuDeviceVector{Float64, 1}, Base.Broadcast.Broadcasted{Nothing, Tuple{Base.OneTo{Int64}}, typeof(sin), Tuple{Base.Broadcast.Extruded{CuDeviceVector{Int64, 1}, Tuple{Bool}, Tuple{Int64}}}}, Int64) for sm_75
define ptx_kernel void @_Z27julia_broadcast_kernel_778315CuKernelContext13CuDeviceArrayI7Float64Li1ELi1EE11BroadcastedIv5TupleI5OneToI5Int64EE4_sinS3_I8ExtrudedIS0_IS5_Li1ELi1EES3_I4BoolES3_IS5_EEEES5_([1 x i64] %state, { i8 addrspace(1)*, i64, [1 x i64], i64 } %0, { [1 x { { i8 addrspace(1)*, i64, [1 x i64], i64 }, [1 x i8], [1 x i64] }], [1 x [1 x i64]] } %1, i64 signext %2) local_unnamed_addr #4 {
entry:
  %q.i.i = alloca i32, align 4
  %q.i.i.i = alloca i32, align 4
  %.fca.3.extract = extractvalue { i8 addrspace(1)*, i64, [1 x i64], i64 } %0, 3
  %.fca.0.0.2.0.extract = extractvalue { [1 x { { i8 addrspace(1)*, i64, [1 x i64], i64 }, [1 x i8], [1 x i64] }], [1 x [1 x i64]] } %1, 0, 0, 2, 0
  %.inv = icmp sgt i64 %2, 0
  %3 = select i1 %.inv, i64 %2, i64 0
  br i1 %.inv, label %L13.preheader, label %L76

L13.preheader:                                    ; preds = %entry
  %.fca.0.0.1.0.extract = extractvalue { [1 x { { i8 addrspace(1)*, i64, [1 x i64], i64 }, [1 x i8], [1 x i64] }], [1 x [1 x i64]] } %1, 0, 0, 1, 0
  %.fca.0.0.0.0.extract = extractvalue { [1 x { { i8 addrspace(1)*, i64, [1 x i64], i64 }, [1 x i8], [1 x i64] }], [1 x [1 x i64]] } %1, 0, 0, 0, 0
  %.fca.0.extract = extractvalue { i8 addrspace(1)*, i64, [1 x i64], i64 } %0, 0
  %4 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
  %5 = add nuw nsw i32 %4, 1
  %6 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
  %7 = zext i32 %6 to i64
  %8 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
  %9 = zext i32 %8 to i64
  %10 = mul nuw nsw i64 %9, %7
  %11 = zext i32 %5 to i64
  %12 = add nuw nsw i64 %10, %11
  %13 = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.x()
  %14 = mul i32 %8, %13
  %15 = sext i32 %14 to i64
  %16 = and i8 %.fca.0.0.1.0.extract, 1
  %.not16 = icmp eq i8 %16, 0
  %17 = bitcast i8 addrspace(1)* %.fca.0.0.0.0.extract to i64 addrspace(1)*
  %18 = bitcast i32* %q.i.i to i8*
  %19 = bitcast i32* %q.i.i.i to i8*
  %20 = bitcast i8 addrspace(1)* %.fca.0.extract to double addrspace(1)*
  br label %L13

L13:                                              ; preds = %__nv_sin.exit, %L13.preheader
  %value_phi3 = phi i64 [ %154, %__nv_sin.exit ], [ 1, %L13.preheader ]
  %21 = add nsw i64 %value_phi3, -1
  %22 = mul i64 %21, %15
  %23 = add i64 %12, %22
  %.not = icmp slt i64 %.fca.3.extract, %23
  br i1 %.not, label %L76, label %L77

L76:                                              ; preds = %__nv_sin.exit, %L13, %entry
  ret void

L77:                                              ; preds = %L13
  %24 = select i1 %.not16, i64 %.fca.0.0.2.0.extract, i64 %23
  %25 = add i64 %24, -1
  %26 = getelementptr inbounds i64, i64 addrspace(1)* %17, i64 %25
  %27 = load i64, i64 addrspace(1)* %26, align 8
  %28 = sitofp i64 %27 to double
  call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %18)
  call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %19)
  %29 = call i32 @__nvvm_reflect(i8* getelementptr inbounds ([12 x i8], [12 x i8]* @.str.1, i64 0, i64 0)) #5
  %30 = icmp eq i32 %29, 350
  br i1 %30, label %34, label %31

31:                                               ; preds = %L77
  %32 = call i32 @__nvvm_reflect(i8* getelementptr inbounds ([12 x i8], [12 x i8]* @.str.1, i64 0, i64 0)) #5
  %33 = icmp eq i32 %32, 370
  br i1 %33, label %34, label %89

34:                                               ; preds = %31, %L77
  %35 = call i32 @__nvvm_reflect(i8* getelementptr inbounds ([12 x i8], [12 x i8]* @.str.1, i64 0, i64 0)) #5
  %36 = icmp eq i32 %35, 200
  br i1 %36, label %43, label %37

37:                                               ; preds = %34
  %38 = call i32 @__nvvm_reflect(i8* getelementptr inbounds ([12 x i8], [12 x i8]* @.str.1, i64 0, i64 0)) #5
  %39 = icmp eq i32 %38, 350
  br i1 %39, label %43, label %40

40:                                               ; preds = %37
  %41 = call i32 @__nvvm_reflect(i8* getelementptr inbounds ([12 x i8], [12 x i8]* @.str.1, i64 0, i64 0)) #5
  %42 = icmp eq i32 %41, 370
  br i1 %42, label %43, label %46

43:                                               ; preds = %40, %37, %34
  %44 = call double @llvm.fabs.f64(double %28) #5
  %45 = fcmp oeq double %44, 0x7FF0000000000000
  br label %__nv_isinfd.exit.i.i

46:                                               ; preds = %40
  %47 = call i32 @llvm.nvvm.d2i.lo(double %28) #5
  %48 = call i32 @llvm.nvvm.d2i.hi(double %28) #5
  %49 = and i32 %48, 2147483647
  %50 = icmp eq i32 %49, 2146435072
  %51 = icmp eq i32 %47, 0
  %52 = and i1 %51, %50
  br label %__nv_isinfd.exit.i.i

__nv_isinfd.exit.i.i:                             ; preds = %46, %43
  %.03.in.i = phi i1 [ %45, %43 ], [ %52, %46 ]
  %53 = fmul double %28, 0.000000e+00
  %spec.select8.i = select i1 %.03.in.i, double %53, double %28
  %54 = fmul double %spec.select8.i, 0x3FE45F306DC9C883
  %55 = call i32 @llvm.nvvm.d2i.rn(double %54) #5
  store i32 %55, i32* %q.i.i.i, align 4
  %56 = sitofp i32 %55 to double
  %57 = fneg double %56
  %58 = call double @llvm.fma.f64(double %57, double 0x3FF921FB54442D18, double %spec.select8.i) #5
  %59 = call double @llvm.fma.f64(double %57, double 0x3C91A62633145C00, double %58) #5
  %60 = call double @llvm.fma.f64(double %57, double 0x397B839A252049C0, double %59) #5
  %61 = call i32 @llvm.nvvm.d2i.hi(double %spec.select8.i) #5
  %62 = and i32 %61, 2145386496
  %63 = icmp ugt i32 %62, 1105199103
  br i1 %63, label %64, label %__internal_trig_reduction_kerneld.exit.i.i

64:                                               ; preds = %__nv_isinfd.exit.i.i
  %65 = call fastcc double @__internal_trig_reduction_slowpathd(double %spec.select8.i, i32* nonnull %q.i.i.i) #5
  %.pre18 = load i32, i32* %q.i.i.i, align 4
  br label %__internal_trig_reduction_kerneld.exit.i.i

__internal_trig_reduction_kerneld.exit.i.i:       ; preds = %64, %__nv_isinfd.exit.i.i
  %66 = phi i32 [ %.pre18, %64 ], [ %55, %__nv_isinfd.exit.i.i ]
  %t.i.i.0.i = phi double [ %65, %64 ], [ %60, %__nv_isinfd.exit.i.i ]
  %67 = fmul double %t.i.i.0.i, %t.i.i.0.i
  %68 = call double @llvm.fma.f64(double %67, double 0xBDA8FF8320FD8164, double 0x3E21EEA7C1EF8528) #5
  %69 = call double @llvm.fma.f64(double %68, double %67, double 0xBE927E4F8E06E6D9) #5
  %70 = call double @llvm.fma.f64(double %69, double %67, double 0x3EFA01A019DDBCE9) #5
  %71 = call double @llvm.fma.f64(double %70, double %67, double 0xBF56C16C16C15D47) #5
  %72 = call double @llvm.fma.f64(double %71, double %67, double 0x3FA5555555555551) #5
  %73 = call double @llvm.fma.f64(double %72, double %67, double -5.000000e-01) #5
  %74 = call double @llvm.fma.f64(double %73, double %67, double 1.000000e+00) #5
  %75 = call double @llvm.fma.f64(double %67, double 0x3DE5DB65F9785EBA, double 0xBE5AE5F12CB0D246) #5
  %76 = call double @llvm.fma.f64(double %75, double %67, double 0x3EC71DE369ACE392) #5
  %77 = call double @llvm.fma.f64(double %76, double %67, double 0xBF2A01A019DB62A1) #5
  %78 = call double @llvm.fma.f64(double %77, double %67, double 0x3F81111111110818) #5
  %79 = call double @llvm.fma.f64(double %78, double %67, double 0xBFC5555555555554) #5
  %80 = call double @llvm.fma.f64(double %79, double %67, double 0.000000e+00) #5
  %81 = call double @llvm.fma.f64(double %80, double %t.i.i.0.i, double %t.i.i.0.i) #5
  %82 = and i32 %66, 1
  %.not6.i = icmp eq i32 %82, 0
  %spec.select.i = select i1 %.not6.i, double %81, double %74
  %83 = and i32 %66, 2
  %.not7.i = icmp eq i32 %83, 0
  br i1 %.not7.i, label %__nv_sincos.exit.i, label %84

84:                                               ; preds = %__internal_trig_reduction_kerneld.exit.i.i
  %85 = call i32 @llvm.nvvm.d2i.hi(double %spec.select.i) #5
  %86 = call i32 @llvm.nvvm.d2i.lo(double %spec.select.i) #5
  %87 = xor i32 %85, -2147483648
  %88 = call double @llvm.nvvm.lohi.i2d(i32 %86, i32 %87) #5
  br label %__nv_sincos.exit.i

89:                                               ; preds = %31
  %90 = call i32 @__nvvm_reflect(i8* getelementptr inbounds ([12 x i8], [12 x i8]* @.str.1, i64 0, i64 0)) #5
  %91 = icmp eq i32 %90, 200
  br i1 %91, label %98, label %92

92:                                               ; preds = %89
  %93 = call i32 @__nvvm_reflect(i8* getelementptr inbounds ([12 x i8], [12 x i8]* @.str.1, i64 0, i64 0)) #5
  %94 = icmp eq i32 %93, 350
  br i1 %94, label %98, label %95

95:                                               ; preds = %92
  %96 = call i32 @__nvvm_reflect(i8* getelementptr inbounds ([12 x i8], [12 x i8]* @.str.1, i64 0, i64 0)) #5
  %97 = icmp eq i32 %96, 370
  br i1 %97, label %98, label %101

98:                                               ; preds = %95, %92, %89
  %99 = call double @llvm.fabs.f64(double %28) #5
  %100 = fcmp oeq double %99, 0x7FF0000000000000
  br label %__nv_isinfd.exit.i

101:                                              ; preds = %95
  %102 = call i32 @llvm.nvvm.d2i.lo(double %28) #5
  %103 = call i32 @llvm.nvvm.d2i.hi(double %28) #5
  %104 = and i32 %103, 2147483647
  %105 = icmp eq i32 %104, 2146435072
  %106 = icmp eq i32 %102, 0
  %107 = and i1 %106, %105
  br label %__nv_isinfd.exit.i

__nv_isinfd.exit.i:                               ; preds = %101, %98
  %.04.in.i = phi i1 [ %100, %98 ], [ %107, %101 ]
  %108 = fmul double %28, 0.000000e+00
  %spec.select9.i = select i1 %.04.in.i, double %108, double %28
  %109 = fmul double %spec.select9.i, 0x3FE45F306DC9C883
  %110 = call i32 @llvm.nvvm.d2i.rn(double %109) #5
  store i32 %110, i32* %q.i.i, align 4
  %111 = sitofp i32 %110 to double
  %112 = fneg double %111
  %113 = call double @llvm.fma.f64(double %112, double 0x3FF921FB54442D18, double %spec.select9.i) #5
  %114 = call double @llvm.fma.f64(double %112, double 0x3C91A62633145C00, double %113) #5
  %115 = call double @llvm.fma.f64(double %112, double 0x397B839A252049C0, double %114) #5
  %116 = call i32 @llvm.nvvm.d2i.hi(double %spec.select9.i) #5
  %117 = and i32 %116, 2145386496
  %118 = icmp ugt i32 %117, 1105199103
  br i1 %118, label %119, label %__internal_trig_reduction_kerneld.exit.i

119:                                              ; preds = %__nv_isinfd.exit.i
  %120 = call fastcc double @__internal_trig_reduction_slowpathd(double %spec.select9.i, i32* nonnull %q.i.i) #5
  %.pre = load i32, i32* %q.i.i, align 4
  br label %__internal_trig_reduction_kerneld.exit.i

__internal_trig_reduction_kerneld.exit.i:         ; preds = %119, %__nv_isinfd.exit.i
  %121 = phi i32 [ %.pre, %119 ], [ %110, %__nv_isinfd.exit.i ]
  %t.i1.0.i = phi double [ %120, %119 ], [ %115, %__nv_isinfd.exit.i ]
  %122 = shl i32 %121, 3
  %123 = and i32 %122, 8
  %124 = zext i32 %123 to i64
  %125 = getelementptr inbounds [16 x double], [16 x double] addrspace(1)* @__cudart_sin_cos_coeffs, i64 0, i64 %124
  %126 = fmul double %t.i1.0.i, %t.i1.0.i
  %127 = and i32 %121, 1
  %.not.i = icmp eq i32 %127, 0
  %128 = select i1 %.not.i, double 0x3DE5DB65F9785EBA, double 0xBDA8FF8320FD8164
  %129 = getelementptr inbounds double, double addrspace(1)* %125, i64 1
  %130 = load double, double addrspace(1)* %129, align 8
  %131 = call double @llvm.fma.f64(double %128, double %126, double %130) #5
  %132 = getelementptr inbounds double, double addrspace(1)* %125, i64 2
  %133 = load double, double addrspace(1)* %132, align 8
  %134 = call double @llvm.fma.f64(double %131, double %126, double %133) #5
  %135 = getelementptr inbounds double, double addrspace(1)* %125, i64 3
  %136 = load double, double addrspace(1)* %135, align 8
  %137 = call double @llvm.fma.f64(double %134, double %126, double %136) #5
  %138 = getelementptr inbounds double, double addrspace(1)* %125, i64 4
  %139 = load double, double addrspace(1)* %138, align 8
  %140 = call double @llvm.fma.f64(double %137, double %126, double %139) #5
  %141 = getelementptr inbounds double, double addrspace(1)* %125, i64 5
  %142 = load double, double addrspace(1)* %141, align 8
  %143 = call double @llvm.fma.f64(double %140, double %126, double %142) #5
  %144 = getelementptr inbounds double, double addrspace(1)* %125, i64 6
  %145 = load double, double addrspace(1)* %144, align 8
  %146 = call double @llvm.fma.f64(double %143, double %126, double %145) #5
  %147 = call double @llvm.fma.f64(double %146, double %t.i1.0.i, double %t.i1.0.i) #5
  %148 = call double @llvm.fma.f64(double %146, double %126, double 1.000000e+00) #5
  %spec.select10.i = select i1 %.not.i, double %147, double %148
  %149 = and i32 %121, 2
  %.not5.i = icmp eq i32 %149, 0
  %150 = call double @llvm.fma.f64(double %spec.select10.i, double -1.000000e+00, double 0.000000e+00) #5
  %spec.select11.i = select i1 %.not5.i, double %spec.select10.i, double %150
  call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %18)
  call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %19)
  br label %__nv_sin.exit

__nv_sincos.exit.i:                               ; preds = %84, %__internal_trig_reduction_kerneld.exit.i.i
  %.01.i = phi double [ %88, %84 ], [ %spec.select.i, %__internal_trig_reduction_kerneld.exit.i.i ]
  call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %18)
  call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %19)
  br label %__nv_sin.exit

__nv_sin.exit:                                    ; preds = %__nv_sincos.exit.i, %__internal_trig_reduction_kerneld.exit.i
  %151 = phi double [ %spec.select11.i, %__internal_trig_reduction_kerneld.exit.i ], [ %.01.i, %__nv_sincos.exit.i ]
  %152 = add i64 %23, -1
  %153 = getelementptr inbounds double, double addrspace(1)* %20, i64 %152
  store double %151, double addrspace(1)* %153, align 8
  %.not17 = icmp eq i64 %value_phi3, %3
  %154 = add nuw i64 %value_phi3, 1
  br i1 %.not17, label %L76, label %L13
}

After:

; PTX CompilerJob of kernel broadcast_kernel(CUDA.CuKernelContext, CuDeviceVector{Float64, 1}, Base.Broadcast.Broadcasted{Nothing, Tuple{Base.OneTo{Int64}}, typeof(sin), Tuple{Base.Broadcast.Extruded{CuDeviceVector{Int64, 1}, Tuple{Bool}, Tuple{Int64}}}}, Int64) for sm_75
define ptx_kernel void @_Z27julia_broadcast_kernel_791115CuKernelContext13CuDeviceArrayI7Float64Li1ELi1EE11BroadcastedIv5TupleI5OneToI5Int64EE4_sinS3_I8ExtrudedIS0_IS5_Li1ELi1EES3_I4BoolES3_IS5_EEEES5_([1 x i64] %state, { i8 addrspace(1)*, i64, [1 x i64], i64 } %0, { [1 x { { i8 addrspace(1)*, i64, [1 x i64], i64 }, [1 x i8], [1 x i64] }], [1 x [1 x i64]] } %1, i64 signext %2) local_unnamed_addr #4 {
entry:
  %q.i.i = alloca i32, align 4
  %.fca.3.extract = extractvalue { i8 addrspace(1)*, i64, [1 x i64], i64 } %0, 3
  %.fca.0.0.2.0.extract = extractvalue { [1 x { { i8 addrspace(1)*, i64, [1 x i64], i64 }, [1 x i8], [1 x i64] }], [1 x [1 x i64]] } %1, 0, 0, 2, 0
  %.inv = icmp sgt i64 %2, 0
  %3 = select i1 %.inv, i64 %2, i64 0
  br i1 %.inv, label %L13.preheader, label %L76

L13.preheader:                                    ; preds = %entry
  %.fca.0.0.1.0.extract = extractvalue { [1 x { { i8 addrspace(1)*, i64, [1 x i64], i64 }, [1 x i8], [1 x i64] }], [1 x [1 x i64]] } %1, 0, 0, 1, 0
  %.fca.0.0.0.0.extract = extractvalue { [1 x { { i8 addrspace(1)*, i64, [1 x i64], i64 }, [1 x i8], [1 x i64] }], [1 x [1 x i64]] } %1, 0, 0, 0, 0
  %.fca.0.extract = extractvalue { i8 addrspace(1)*, i64, [1 x i64], i64 } %0, 0
  %4 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
  %5 = add nuw nsw i32 %4, 1
  %6 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
  %7 = zext i32 %6 to i64
  %8 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
  %9 = zext i32 %8 to i64
  %10 = mul nuw nsw i64 %9, %7
  %11 = zext i32 %5 to i64
  %12 = add nuw nsw i64 %10, %11
  %13 = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.x()
  %14 = mul i32 %8, %13
  %15 = sext i32 %14 to i64
  %16 = and i8 %.fca.0.0.1.0.extract, 1
  %.not16 = icmp eq i8 %16, 0
  %17 = bitcast i8 addrspace(1)* %.fca.0.0.0.0.extract to i64 addrspace(1)*
  %18 = bitcast i32* %q.i.i to i8*
  %19 = bitcast i8 addrspace(1)* %.fca.0.extract to double addrspace(1)*
  br label %L13

L13:                                              ; preds = %__internal_trig_reduction_kerneld.exit.i, %L13.preheader
  %value_phi3 = phi i64 [ %79, %__internal_trig_reduction_kerneld.exit.i ], [ 1, %L13.preheader ]
  %20 = add nsw i64 %value_phi3, -1
  %21 = mul i64 %20, %15
  %22 = add i64 %12, %21
  %.not = icmp slt i64 %.fca.3.extract, %22
  br i1 %.not, label %L76, label %L77

L76:                                              ; preds = %__internal_trig_reduction_kerneld.exit.i, %L13, %entry
  ret void

L77:                                              ; preds = %L13
  %23 = select i1 %.not16, i64 %.fca.0.0.2.0.extract, i64 %22
  %24 = add i64 %23, -1
  %25 = getelementptr inbounds i64, i64 addrspace(1)* %17, i64 %24
  %26 = load i64, i64 addrspace(1)* %25, align 8
  %27 = sitofp i64 %26 to double
  call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %18)
  %28 = call i32 @llvm.nvvm.d2i.lo(double %27) #5
  %29 = call i32 @llvm.nvvm.d2i.hi(double %27) #5
  %30 = and i32 %29, 2147483647
  %31 = icmp eq i32 %30, 2146435072
  %32 = icmp eq i32 %28, 0
  %33 = and i1 %32, %31
  %34 = fmul double %27, 0.000000e+00
  %spec.select9.i = select i1 %33, double %34, double %27
  %35 = fmul double %spec.select9.i, 0x3FE45F306DC9C883
  %36 = call i32 @llvm.nvvm.d2i.rn(double %35) #5
  store i32 %36, i32* %q.i.i, align 4
  %37 = sitofp i32 %36 to double
  %38 = fneg double %37
  %39 = call double @llvm.fma.f64(double %38, double 0x3FF921FB54442D18, double %spec.select9.i) #5
  %40 = call double @llvm.fma.f64(double %38, double 0x3C91A62633145C00, double %39) #5
  %41 = call double @llvm.fma.f64(double %38, double 0x397B839A252049C0, double %40) #5
  %42 = call i32 @llvm.nvvm.d2i.hi(double %spec.select9.i) #5
  %43 = and i32 %42, 2145386496
  %44 = icmp ugt i32 %43, 1105199103
  br i1 %44, label %45, label %__internal_trig_reduction_kerneld.exit.i

45:                                               ; preds = %L77
  %46 = call fastcc double @__internal_trig_reduction_slowpathd(double %spec.select9.i, i32* nonnull %q.i.i) #5
  %.pre = load i32, i32* %q.i.i, align 4
  br label %__internal_trig_reduction_kerneld.exit.i

__internal_trig_reduction_kerneld.exit.i:         ; preds = %45, %L77
  %47 = phi i32 [ %.pre, %45 ], [ %36, %L77 ]
  %t.i1.0.i = phi double [ %46, %45 ], [ %41, %L77 ]
  %48 = shl i32 %47, 3
  %49 = and i32 %48, 8
  %50 = zext i32 %49 to i64
  %51 = getelementptr inbounds [16 x double], [16 x double] addrspace(1)* @__cudart_sin_cos_coeffs, i64 0, i64 %50
  %52 = fmul double %t.i1.0.i, %t.i1.0.i
  %53 = and i32 %47, 1
  %.not.i = icmp eq i32 %53, 0
  %54 = select i1 %.not.i, double 0x3DE5DB65F9785EBA, double 0xBDA8FF8320FD8164
  %55 = getelementptr inbounds double, double addrspace(1)* %51, i64 1
  %56 = load double, double addrspace(1)* %55, align 8
  %57 = call double @llvm.fma.f64(double %54, double %52, double %56) #5
  %58 = getelementptr inbounds double, double addrspace(1)* %51, i64 2
  %59 = load double, double addrspace(1)* %58, align 8
  %60 = call double @llvm.fma.f64(double %57, double %52, double %59) #5
  %61 = getelementptr inbounds double, double addrspace(1)* %51, i64 3
  %62 = load double, double addrspace(1)* %61, align 8
  %63 = call double @llvm.fma.f64(double %60, double %52, double %62) #5
  %64 = getelementptr inbounds double, double addrspace(1)* %51, i64 4
  %65 = load double, double addrspace(1)* %64, align 8
  %66 = call double @llvm.fma.f64(double %63, double %52, double %65) #5
  %67 = getelementptr inbounds double, double addrspace(1)* %51, i64 5
  %68 = load double, double addrspace(1)* %67, align 8
  %69 = call double @llvm.fma.f64(double %66, double %52, double %68) #5
  %70 = getelementptr inbounds double, double addrspace(1)* %51, i64 6
  %71 = load double, double addrspace(1)* %70, align 8
  %72 = call double @llvm.fma.f64(double %69, double %52, double %71) #5
  %73 = call double @llvm.fma.f64(double %72, double %t.i1.0.i, double %t.i1.0.i) #5
  %74 = call double @llvm.fma.f64(double %72, double %52, double 1.000000e+00) #5
  %spec.select10.i = select i1 %.not.i, double %73, double %74
  %75 = and i32 %47, 2
  %.not5.i = icmp eq i32 %75, 0
  %76 = call double @llvm.fma.f64(double %spec.select10.i, double -1.000000e+00, double 0.000000e+00) #5
  %spec.select11.i = select i1 %.not5.i, double %spec.select10.i, double %76
  call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %18)
  %77 = add i64 %22, -1
  %78 = getelementptr inbounds double, double addrspace(1)* %19, i64 %77
  store double %spec.select11.i, double addrspace(1)* %78, align 8
  %.not17 = icmp eq i64 %value_phi3, %3
  %79 = add nuw i64 %value_phi3, 1
  br i1 %.not17, label %L76, label %L13
}

So 250 -> 120 lines, pretty significant.

maleadt · 2022-01-26T15:49:40Z

Weirdly, there is no change at the PTX level. Could it be that the pass was being run by the back-end after all? It's probably still good to run it earlier, to give LLVM more optimization opportunities.

Ah, yes: https://github.com/llvm/llvm-project/blob/75c22b382f2a7b0bb9499215a3d64e146e3f02cc/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp#L313-L318

vchuravy · 2022-02-08T20:37:38Z

Was trying this out.

ERROR: Unknown __nvvm_reflect argument: __CUDA_PREC_SQRT
Stacktrace:
  [1] error(s::String)
    @ Base ./error.jl:33
  [2] macro expansion
    @ ~/.julia/packages/GPUCompiler/7Za0O/src/ptx.jl:444 [inlined]
  [3] macro expansion
    @ ~/.julia/packages/TimerOutputs/5tW2E/src/TimerOutput.jl:252 [inlined]
  [4] nvvm_reflect!(fun::LLVM.Function)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/7Za0O/src/ptx.jl:410
  [5] function_pass_callback(ptr::Ptr{Nothing}, data::Ptr{Nothing})
    @ LLVM ~/.julia/packages/LLVM/vQ98J/src/pass.jl:49
  [6] LLVMRunPassManager
    @ ~/.julia/packages/LLVM/vQ98J/lib/12/libLLVM_h.jl:4741 [inlined]
  [7] run!
    @ ~/.julia/packages/LLVM/vQ98J/src/passmanager.jl:39 [inlined]

maleadt · 2022-02-09T17:31:01Z

LLVM doesn't handle it either, but is more conservative about failing: https://github.com/JuliaLang/llvm-project/blob/bc5644ee74f4cb42042257ac129d2be1c252e3f2/llvm/lib/Target/NVPTX/NVVMReflect.cpp#L163-L173

maleadt · 2022-02-14T20:13:48Z

OK, I added a couple more and demoted the error to a warning. Curiously, LLVM currently defaults to 0 for an unsupported flag. That means it is using --prec-div=false and --prec-sqrt=false, both of which NVCC puts under the --use_fast_math=true flag. So we were using some fastmath-y mode by default...

vchuravy · 2022-02-14T20:25:17Z

I also had to dig this out recently: https://llvm.org/docs/CompileCudaWithLLVM.html#flags-that-control-numerical-code

One of the biggest divergences from Clang is our choice not to use ffp-contract. As I mentioned on the call today, KA currently does that, but once it switches to method tables it won't do so by default anymore.

src/ptx.jl

vchuravy · 2022-02-14T20:27:07Z

src/ptx.jl

+        # handle possible cases
+        # XXX: put some of these properties in the compiler job?
+        #      and/or first set the "nvvm-reflect-*" module flag like Clang does?
+        fast_math = Base.JLOptions().fast_math == 1


Having this part of @cuda would be cool, but this is already a strict improvement!

maleadt · 2022-02-15T09:24:54Z

This should be good to go. I don't want to put too much effort into this pass, because I'm still hoping we can just use the LLVM back-end for this in the future (in that case we'll have to set the appropriate module flags so that LLVM can act in accordance with Julia's fastmath settings).

codecov · 2022-02-15T11:56:22Z

Codecov Report

Merging #280 (e86fe9d) into master (ba3a7e6) will decrease coverage by 0.62%.
The diff coverage is 95.55%.

@@            Coverage Diff             @@
##           master     #280      +/-   ##
==========================================
- Coverage   86.75%   86.12%   -0.63%     
==========================================
  Files          22       22              
  Lines        2016     2062      +46     
==========================================
+ Hits         1749     1776      +27     
- Misses        267      286      +19

Impacted Files	Coverage Δ
src/ptx.jl	`95.67% <95.55%> (-0.04%)`	⬇️
src/gcn.jl	`68.93% <0.00%> (-16.51%)`	⬇️
src/jlgen.jl	`81.81% <0.00%> (-0.50%)`	⬇️
src/irgen.jl	`94.91% <0.00%> (+0.26%)`	⬆️

Continue to review full report at Codecov.

Legend - Click here to learn more
Δ = absolute <relative> (impact), ø = not affected, ? = missing data
Powered by Codecov. Last update ba3a7e6...e86fe9d. Read the comment docs.

This reverts commit 64296ac.

maleadt added the ptx Stuff about the NVIDIA PTX back-end. label Jan 26, 2022

maleadt force-pushed the tb/nvvm_reflect branch from 448f9eb to f4311bf Compare January 26, 2022 15:57

maleadt marked this pull request as draft January 26, 2022 15:57

maleadt mentioned this pull request Feb 14, 2022

Performance deprecation using ^ on Float32 JuliaGPU/CUDA.jl#1358

Closed

maleadt force-pushed the tb/nvvm_reflect branch from f4311bf to 058b615 Compare February 14, 2022 20:12

vchuravy reviewed Feb 14, 2022

View reviewed changes

src/ptx.jl Outdated Show resolved Hide resolved

vchuravy reviewed Feb 14, 2022

View reviewed changes

maleadt marked this pull request as ready for review February 15, 2022 07:06

Implement NVVMReflect in Julia.

e86fe9d

maleadt force-pushed the tb/nvvm_reflect branch from 058b615 to e86fe9d Compare February 15, 2022 09:24

maleadt merged commit 64296ac into master Feb 15, 2022

maleadt deleted the tb/nvvm_reflect branch February 15, 2022 14:14

maleadt added a commit that referenced this pull request Feb 18, 2022

Revert "Implement NVVMReflect in Julia. (#280)"

5223e58

This reverts commit 64296ac.

maleadt added a commit that referenced this pull request Feb 18, 2022

Implement NVVMReflect in Julia. (#280)

0e1195e

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Implement NVVMReflect in Julia. #280

Implement NVVMReflect in Julia. #280

maleadt commented Jan 26, 2022

maleadt commented Jan 26, 2022 •

edited

Loading

vchuravy commented Feb 8, 2022

maleadt commented Feb 9, 2022

maleadt commented Feb 14, 2022

vchuravy commented Feb 14, 2022

vchuravy Feb 14, 2022

maleadt commented Feb 15, 2022

codecov bot commented Feb 15, 2022

Implement NVVMReflect in Julia. #280

Implement NVVMReflect in Julia. #280

Conversation

maleadt commented Jan 26, 2022

maleadt commented Jan 26, 2022 • edited Loading

vchuravy commented Feb 8, 2022

maleadt commented Feb 9, 2022

maleadt commented Feb 14, 2022

vchuravy commented Feb 14, 2022

vchuravy Feb 14, 2022

Choose a reason for hiding this comment

maleadt commented Feb 15, 2022

codecov bot commented Feb 15, 2022

Codecov Report

maleadt commented Jan 26, 2022 •

edited

Loading