.text .amdgcn_target "amdgcn-amd-amdhsa--gfx906:sramecc+:xnack-" .protected tailMul ; -- Begin function tailMul .globl tailMul .p2align 8 .type tailMul,@function tailMul: ; @tailMul ; %bb.0: s_sub_i32 s7, 0x200, s6 s_cmp_eq_u32 s6, 0 s_load_dwordx4 s[0:3], s[4:5], 0x8 s_load_dwordx2 s[10:11], s[4:5], 0x18 s_cselect_b64 s[8:9], -1, 0 s_and_b64 s[12:13], s[8:9], exec s_mov_b32 s15, 0 s_cselect_b32 s7, 0x100, s7 s_and_b32 s14, s6, 0xffffff00 s_lshl_b64 s[12:13], s[14:15], 4 s_waitcnt lgkmcnt(0) s_add_u32 s14, s0, s12 s_addc_u32 s16, s1, s13 s_lshl_b32 s17, s6, 7 s_and_b32 s19, s17, 0xf80 s_add_u32 s14, s14, s19 s_addc_u32 s17, s16, 0 s_lshl_b32 s16, s6, 13 s_and_b32 s20, s16, 0x1c0000 v_lshl_or_b32 v1, v0, 6, v0 s_add_u32 s16, s14, s20 v_and_b32_e32 v1, 0xe07, v1 s_addc_u32 s17, s17, 0 v_lshlrev_b32_e32 v73, 4, v1 v_mov_b32_e32 v1, s17 v_add_co_u32_e32 v5, vcc, s16, v73 v_addc_co_u32_e32 v6, vcc, 0, v1, vcc global_load_dwordx4 v[9:12], v73, s[16:17] s_mov_b32 s17, 0x10000 v_add_co_u32_e32 v1, vcc, s17, v5 v_addc_co_u32_e32 v2, vcc, 0, v6, vcc s_mov_b32 s16, 0x20000 v_add_co_u32_e32 v3, vcc, s16, v5 v_addc_co_u32_e32 v4, vcc, 0, v6, vcc s_mov_b32 s18, 0x30000 global_load_dwordx4 v[13:16], v[1:2], off global_load_dwordx4 v[17:20], v[3:4], off v_add_co_u32_e32 v1, vcc, s18, v5 v_addc_co_u32_e32 v2, vcc, 0, v6, vcc global_load_dwordx4 v[21:24], v[1:2], off v_lshlrev_b32_e32 v134, 4, v0 global_load_dwordx4 v[1:4], v134, s[10:11] global_load_dwordx4 v[5:8], v134, s[10:11] offset:1024 s_and_b32 s14, s7, 0xffffff00 s_lshl_b64 s[14:15], s[14:15], 4 s_add_u32 s0, s0, s14 s_addc_u32 s1, s1, s15 s_lshl_b32 s21, s7, 7 s_and_b32 s21, s21, 0xf80 s_add_u32 s0, s0, s21 s_addc_u32 s1, s1, 0 s_lshl_b32 s22, s7, 13 s_and_b32 s22, s22, 0x1c0000 s_add_u32 s0, s0, s22 s_addc_u32 s1, s1, 0 v_mov_b32_e32 v25, s1 v_add_co_u32_e32 v27, vcc, s0, v73 v_addc_co_u32_e32 v28, vcc, 0, v25, vcc v_add_co_u32_e32 v25, vcc, s17, v27 v_addc_co_u32_e32 v26, vcc, 0, v28, vcc global_load_dwordx4 v[45:48], 
v73, s[0:1] global_load_dwordx4 v[41:44], v[25:26], off s_add_u32 s0, s2, s12 s_addc_u32 s1, s3, s13 v_add_co_u32_e32 v25, vcc, s16, v27 s_add_u32 s0, s0, s19 v_addc_co_u32_e32 v26, vcc, 0, v28, vcc s_addc_u32 s1, s1, 0 v_add_co_u32_e32 v27, vcc, s18, v27 s_add_u32 s0, s0, s20 v_addc_co_u32_e32 v28, vcc, 0, v28, vcc s_addc_u32 s1, s1, 0 v_mov_b32_e32 v29, s1 v_add_co_u32_e32 v39, vcc, s0, v73 v_addc_co_u32_e32 v40, vcc, 0, v29, vcc v_add_co_u32_e32 v29, vcc, s17, v39 s_add_u32 s2, s2, s14 v_addc_co_u32_e32 v30, vcc, 0, v40, vcc s_addc_u32 s3, s3, s15 v_lshlrev_b32_e32 v130, 5, v0 v_add_co_u32_e32 v35, vcc, s16, v39 s_add_u32 s2, s2, s21 v_addc_co_u32_e32 v36, vcc, 0, v40, vcc s_addc_u32 s3, s3, 0 v_add_co_u32_e32 v39, vcc, s18, v39 s_add_u32 s2, s2, s22 v_addc_co_u32_e32 v40, vcc, 0, v40, vcc s_addc_u32 s3, s3, 0 v_or_b32_e32 v131, 64, v0 v_and_b32_e32 v81, 3, v0 v_lshlrev_b32_e32 v91, 3, v81 v_and_b32_e32 v89, 48, v0 v_lshlrev_b32_e32 v92, 4, v89 v_and_b32_e32 v93, 0x70, v131 s_waitcnt vmcnt(5) v_add_f64 v[31:32], v[11:12], -v[19:20] v_add_f64 v[37:38], v[9:10], v[17:18] v_add_f64 v[19:20], v[11:12], v[19:20] v_add_f64 v[11:12], v[9:10], -v[17:18] s_waitcnt vmcnt(4) v_add_f64 v[33:34], v[13:14], -v[21:22] v_add_f64 v[17:18], v[15:16], v[23:24] v_add_f64 v[15:16], v[15:16], -v[23:24] s_waitcnt vmcnt(2) v_mul_f64 v[9:10], v[1:2], v[7:8] v_add_f64 v[13:14], v[13:14], v[21:22] v_add_f64 v[23:24], v[31:32], -v[33:34] v_add_f64 v[21:22], v[19:20], -v[17:18] v_add_f64 v[69:70], v[11:12], v[15:16] v_add_f64 v[31:32], v[31:32], v[33:34] v_mul_f64 v[33:34], v[3:4], -v[7:8] v_fma_f64 v[97:98], v[5:6], v[3:4], v[9:10] v_add_f64 v[71:72], v[37:38], -v[13:14] v_add_f64 v[9:10], v[37:38], v[13:14] v_mul_f64 v[49:50], v[3:4], -v[23:24] v_add_f64 v[15:16], v[11:12], -v[15:16] v_mul_f64 v[13:14], v[7:8], -v[21:22] v_mov_b32_e32 v37, s3 v_fma_f64 v[99:100], v[5:6], v[1:2], v[33:34] v_mul_f64 v[33:34], v[97:98], -v[31:32] v_mul_f64 v[23:24], v[23:24], v[1:2] v_mul_f64 v[21:22], 
v[21:22], v[5:6] v_fma_f64 v[11:12], v[69:70], v[1:2], v[49:50] ds_write_b128 v130, v[9:12] v_fma_f64 v[9:10], v[71:72], v[5:6], v[13:14] v_fma_f64 v[11:12], v[15:16], v[99:100], v[33:34] v_add_co_u32_e32 v33, vcc, s2, v73 v_addc_co_u32_e32 v34, vcc, 0, v37, vcc v_mul_f64 v[37:38], v[31:32], v[99:100] v_add_co_u32_e32 v13, vcc, s17, v33 v_addc_co_u32_e32 v14, vcc, 0, v34, vcc ds_write_b128 v130, v[9:12] offset:16 v_add_co_u32_e32 v9, vcc, s16, v33 v_addc_co_u32_e32 v10, vcc, 0, v34, vcc v_add_co_u32_e32 v11, vcc, s18, v33 v_addc_co_u32_e32 v12, vcc, 0, v34, vcc global_load_dwordx4 v[65:68], v[25:26], off global_load_dwordx4 v[61:64], v[27:28], off global_load_dwordx4 v[49:52], v[29:30], off global_load_dwordx4 v[57:60], v[35:36], off global_load_dwordx4 v[53:56], v[39:40], off global_load_dwordx4 v[25:28], v[13:14], off global_load_dwordx4 v[33:36], v[9:10], off global_load_dwordx4 v[29:32], v[11:12], off v_add_f64 v[9:10], v[19:20], v[17:18] v_fma_f64 v[11:12], v[69:70], v[3:4], v[23:24] v_fma_f64 v[13:14], v[71:72], v[7:8], v[21:22] v_fma_f64 v[15:16], v[15:16], v[97:98], v[37:38] global_load_dwordx4 v[69:72], v73, s[0:1] global_load_dwordx4 v[37:40], v73, s[2:3] s_movk_i32 s0, 0xffe8 v_mad_i32_i24 v129, v0, s0, v130 ; wave barrier ds_read2st64_b64 v[17:20], v129 offset1:1 ds_read2st64_b64 v[21:24], v129 offset0:2 offset1:3 ; wave barrier ds_write_b128 v130, v[9:12] ds_write_b128 v130, v[13:16] offset:16 v_and_b32_e32 v9, 60, v0 v_and_b32_e32 v10, 0x7c, v131 v_lshlrev_b32_e32 v9, 4, v9 v_lshlrev_b32_e32 v13, 4, v10 ; wave barrier ds_read2st64_b64 v[73:76], v129 offset1:1 ds_read2st64_b64 v[77:80], v129 offset0:2 offset1:3 ; wave barrier global_load_dwordx4 v[9:12], v9, s[10:11] global_load_dwordx4 v[13:16], v13, s[10:11] s_waitcnt lgkmcnt(4) v_add_f64 v[83:84], v[19:20], -v[23:24] v_add_f64 v[19:20], v[19:20], v[23:24] v_add_f64 v[81:82], v[17:18], v[21:22] s_waitcnt lgkmcnt(0) v_add_f64 v[85:86], v[73:74], -v[77:78] v_add_f64 v[73:74], v[73:74], v[77:78] 
v_add_f64 v[77:78], v[75:76], v[79:80] v_add_f64 v[17:18], v[17:18], -v[21:22] v_add_f64 v[21:22], v[75:76], -v[79:80] s_movk_i32 s0, 0x780 v_and_or_b32 v133, v130, s0, v91 v_lshlrev_b32_e32 v91, 4, v93 v_add_f64 v[75:76], v[85:86], -v[83:84] v_add_f64 v[79:80], v[85:86], v[83:84] v_add_f64 v[83:84], v[73:74], -v[77:78] s_movk_i32 s0, 0x600 v_add_f64 v[85:86], v[17:18], v[21:22] v_add_f64 v[17:18], v[17:18], -v[21:22] s_waitcnt vmcnt(7) v_add_f64 v[113:114], v[49:50], v[53:54] v_add_f64 v[117:118], v[49:50], -v[53:54] v_add_f64 v[115:116], v[51:52], v[55:56] v_add_f64 v[119:120], v[51:52], -v[55:56] s_waitcnt vmcnt(3) v_add_f64 v[95:96], v[71:72], v[59:60] v_add_f64 v[109:110], v[71:72], -v[59:60] v_add_f64 v[93:94], v[69:70], v[57:58] v_add_f64 v[111:112], v[69:70], -v[57:58] s_waitcnt vmcnt(1) v_mul_f64 v[21:22], v[11:12], -v[75:76] s_waitcnt vmcnt(0) v_mul_f64 v[87:88], v[9:10], v[15:16] v_mul_f64 v[23:24], v[11:12], -v[15:16] v_mul_f64 v[75:76], v[9:10], v[75:76] v_fma_f64 v[21:22], v[85:86], v[9:10], v[21:22] v_fma_f64 v[101:102], v[13:14], v[11:12], v[87:88] v_fma_f64 v[103:104], v[13:14], v[9:10], v[23:24] v_add_f64 v[87:88], v[81:82], -v[19:20] v_mul_f64 v[23:24], v[15:16], -v[83:84] v_mul_f64 v[83:84], v[83:84], v[13:14] v_add_f64 v[19:20], v[81:82], v[19:20] v_fma_f64 v[85:86], v[85:86], v[11:12], v[75:76] v_mul_f64 v[89:90], v[101:102], -v[79:80] v_mul_f64 v[79:80], v[79:80], v[103:104] v_fma_f64 v[23:24], v[87:88], v[13:14], v[23:24] v_fma_f64 v[83:84], v[87:88], v[15:16], v[83:84] v_fma_f64 v[81:82], v[17:18], v[103:104], v[89:90] v_add_f64 v[89:90], v[73:74], v[77:78] v_fma_f64 v[17:18], v[17:18], v[101:102], v[79:80] ds_write2_b64 v133, v[19:20], v[21:22] offset1:4 ds_write2_b64 v133, v[23:24], v[81:82] offset0:8 offset1:12 ; wave barrier ds_read2st64_b64 v[73:76], v129 offset1:1 ds_read2st64_b64 v[77:80], v129 offset0:2 offset1:3 ; wave barrier ds_write2_b64 v133, v[89:90], v[85:86] offset1:4 ds_write2_b64 v133, v[83:84], v[17:18] offset0:8 
offset1:12 ; wave barrier ds_read2st64_b64 v[81:84], v129 offset1:1 ds_read2st64_b64 v[85:88], v129 offset0:2 offset1:3 ; wave barrier global_load_dwordx4 v[17:20], v92, s[10:11] global_load_dwordx4 v[21:24], v91, s[10:11] v_add_f64 v[91:92], v[47:48], v[67:68] v_add_f64 v[47:48], v[47:48], -v[67:68] v_add_f64 v[67:68], v[43:44], v[63:64] v_and_b32_e32 v89, 15, v0 v_lshlrev_b32_e32 v89, 3, v89 v_and_or_b32 v132, v130, s0, v89 v_add_f64 v[89:90], v[45:46], v[65:66] v_add_f64 v[45:46], v[45:46], -v[65:66] v_add_f64 v[65:66], v[41:42], v[61:62] v_add_f64 v[41:42], v[41:42], -v[61:62] v_add_f64 v[43:44], v[43:44], -v[63:64] v_add_f64 v[63:64], v[91:92], -v[67:68] v_add_f64 v[61:62], v[91:92], v[67:68] s_waitcnt lgkmcnt(0) v_add_f64 v[71:72], v[81:82], v[85:86] s_movk_i32 s0, 0x4000 v_add_f64 v[49:50], v[89:90], -v[65:66] v_add_f64 v[59:60], v[47:48], -v[41:42] v_add_f64 v[41:42], v[47:48], v[41:42] v_mul_f64 v[53:54], v[7:8], -v[63:64] v_add_f64 v[47:48], v[45:46], v[43:44] v_add_f64 v[57:58], v[89:90], v[65:66] v_mul_f64 v[69:70], v[63:64], v[5:6] v_add_f64 v[63:64], v[81:82], -v[85:86] v_add_f64 v[81:82], v[83:84], v[87:88] v_mul_f64 v[51:52], v[3:4], -v[59:60] v_add_f64 v[43:44], v[45:46], -v[43:44] v_fma_f64 v[65:66], v[49:50], v[5:6], v[53:54] v_add_f64 v[53:54], v[75:76], -v[79:80] v_add_f64 v[75:76], v[75:76], v[79:80] v_mul_f64 v[45:46], v[97:98], -v[41:42] v_mul_f64 v[55:56], v[59:60], v[1:2] v_add_f64 v[85:86], v[71:72], -v[81:82] v_fma_f64 v[59:60], v[47:48], v[1:2], v[51:52] v_add_f64 v[51:52], v[73:74], v[77:78] v_add_f64 v[73:74], v[73:74], -v[77:78] v_add_f64 v[77:78], v[83:84], -v[87:88] v_add_f64 v[83:84], v[63:64], -v[53:54] v_add_f64 v[53:54], v[63:64], v[53:54] v_fma_f64 v[63:64], v[47:48], v[3:4], v[55:56] v_mul_f64 v[41:42], v[41:42], v[99:100] v_add_f64 v[81:82], v[71:72], v[81:82] v_add_f64 v[87:88], v[51:52], -v[75:76] v_add_f64 v[51:52], v[51:52], v[75:76] v_fma_f64 v[69:70], v[49:50], v[7:8], v[69:70] v_fma_f64 v[71:72], v[43:44], v[97:98], 
v[41:42] s_waitcnt vmcnt(1) v_mul_f64 v[47:48], v[17:18], v[83:84] s_waitcnt vmcnt(0) v_mul_f64 v[67:68], v[17:18], v[23:24] v_mul_f64 v[79:80], v[19:20], -v[23:24] v_mul_f64 v[55:56], v[85:86], v[21:22] v_fma_f64 v[105:106], v[21:22], v[19:20], v[67:68] v_fma_f64 v[107:108], v[21:22], v[17:18], v[79:80] v_fma_f64 v[67:68], v[43:44], v[99:100], v[45:46] v_add_f64 v[45:46], v[73:74], v[77:78] v_add_f64 v[73:74], v[73:74], -v[77:78] v_mul_f64 v[77:78], v[19:20], -v[83:84] v_mul_f64 v[79:80], v[23:24], -v[85:86] v_add_f64 v[83:84], v[93:94], -v[113:114] v_mul_f64 v[89:90], v[105:106], -v[53:54] v_mul_f64 v[53:54], v[53:54], v[107:108] v_fma_f64 v[75:76], v[45:46], v[17:18], v[77:78] v_fma_f64 v[45:46], v[45:46], v[19:20], v[47:48] v_fma_f64 v[77:78], v[87:88], v[21:22], v[79:80] v_fma_f64 v[79:80], v[73:74], v[107:108], v[89:90] v_fma_f64 v[47:48], v[87:88], v[23:24], v[55:56] v_fma_f64 v[73:74], v[73:74], v[105:106], v[53:54] ds_write2_b64 v132, v[51:52], v[75:76] offset1:16 ds_write2_b64 v132, v[77:78], v[79:80] offset0:32 offset1:48 ; wave barrier ds_read2st64_b64 v[49:52], v129 offset1:1 ds_read2st64_b64 v[53:56], v129 offset0:2 offset1:3 ; wave barrier ds_write2_b64 v132, v[81:82], v[45:46] offset1:16 ds_write2_b64 v132, v[47:48], v[73:74] offset0:32 offset1:48 ; wave barrier ds_read2st64_b64 v[41:44], v129 offset1:1 ds_read2st64_b64 v[45:48], v129 offset0:2 offset1:3 ; wave barrier ds_write_b128 v130, v[57:60] ds_write_b128 v130, v[65:68] offset:16 ; wave barrier ds_read2st64_b64 v[57:60], v129 offset1:1 ds_read2st64_b64 v[65:68], v129 offset0:2 offset1:3 ; wave barrier ds_write_b128 v130, v[61:64] ds_write_b128 v130, v[69:72] offset:16 ; wave barrier ds_read2st64_b64 v[61:64], v129 offset1:1 ds_read2st64_b64 v[69:72], v129 offset0:2 offset1:3 v_add_f64 v[73:74], v[93:94], v[113:114] v_add_f64 v[79:80], v[109:110], -v[117:118] s_waitcnt lgkmcnt(4) v_add_f64 v[75:76], v[59:60], -v[67:68] v_add_f64 v[85:86], v[57:58], v[65:66] s_waitcnt lgkmcnt(0) v_add_f64 
v[77:78], v[61:62], -v[69:70] v_add_f64 v[61:62], v[61:62], v[69:70] v_add_f64 v[69:70], v[63:64], v[71:72] v_add_f64 v[57:58], v[57:58], -v[65:66] v_add_f64 v[63:64], v[63:64], -v[71:72] v_add_f64 v[59:60], v[59:60], v[67:68] v_add_f64 v[81:82], v[111:112], v[119:120] v_add_f64 v[71:72], v[111:112], -v[119:120] v_add_f64 v[65:66], v[77:78], -v[75:76] v_add_f64 v[75:76], v[77:78], v[75:76] v_add_f64 v[67:68], v[61:62], -v[69:70] v_add_f64 v[87:88], v[109:110], v[117:118] v_add_f64 v[77:78], v[57:58], v[63:64] v_add_f64 v[91:92], v[85:86], -v[59:60] v_add_f64 v[63:64], v[57:58], -v[63:64] v_add_f64 v[59:60], v[85:86], v[59:60] v_mul_f64 v[93:94], v[11:12], -v[65:66] v_mul_f64 v[65:66], v[9:10], v[65:66] v_mul_f64 v[109:110], v[15:16], -v[67:68] v_mul_f64 v[111:112], v[101:102], -v[75:76] v_mul_f64 v[67:68], v[13:14], v[67:68] v_mul_f64 v[75:76], v[103:104], v[75:76] v_add_f64 v[69:70], v[61:62], v[69:70] v_add_f64 v[89:90], v[95:96], -v[115:116] v_fma_f64 v[85:86], v[77:78], v[9:10], v[93:94] v_fma_f64 v[77:78], v[77:78], v[11:12], v[65:66] v_add_f64 v[57:58], v[95:96], v[115:116] v_fma_f64 v[93:94], v[91:92], v[13:14], v[109:110] v_fma_f64 v[95:96], v[63:64], v[103:104], v[111:112] v_fma_f64 v[67:68], v[91:92], v[15:16], v[67:68] v_fma_f64 v[75:76], v[63:64], v[101:102], v[75:76] ; wave barrier ds_write2_b64 v133, v[59:60], v[85:86] offset1:4 ds_write2_b64 v133, v[93:94], v[95:96] offset0:8 offset1:12 ; wave barrier ds_read2st64_b64 v[59:62], v129 offset1:1 ds_read2st64_b64 v[63:66], v129 offset0:2 offset1:3 ; wave barrier ds_write2_b64 v133, v[69:70], v[77:78] offset1:4 ds_write2_b64 v133, v[67:68], v[75:76] offset0:8 offset1:12 ; wave barrier ds_read2st64_b64 v[67:70], v129 offset1:1 ds_read2st64_b64 v[75:78], v129 offset0:2 offset1:3 v_mul_f64 v[91:92], v[3:4], -v[79:80] v_mul_f64 v[109:110], v[7:8], -v[89:90] s_waitcnt lgkmcnt(4) v_add_f64 v[93:94], v[61:62], -v[65:66] v_mul_f64 v[85:86], v[97:98], -v[87:88] s_waitcnt lgkmcnt(0) v_add_f64 v[95:96], v[67:68], 
-v[75:76] v_add_f64 v[67:68], v[67:68], v[75:76] v_add_f64 v[111:112], v[69:70], v[77:78] v_add_f64 v[113:114], v[59:60], v[63:64] v_add_f64 v[59:60], v[59:60], -v[63:64] v_add_f64 v[69:70], v[69:70], -v[77:78] v_mul_f64 v[79:80], v[79:80], v[1:2] v_add_f64 v[65:66], v[61:62], v[65:66] v_add_f64 v[77:78], v[95:96], -v[93:94] v_fma_f64 v[75:76], v[81:82], v[1:2], v[91:92] v_add_f64 v[91:92], v[67:68], -v[111:112] v_add_f64 v[93:94], v[95:96], v[93:94] v_fma_f64 v[61:62], v[83:84], v[5:6], v[109:110] v_fma_f64 v[63:64], v[71:72], v[99:100], v[85:86] v_add_f64 v[85:86], v[59:60], v[69:70] v_mul_f64 v[89:90], v[89:90], v[5:6] v_mul_f64 v[109:110], v[19:20], -v[77:78] v_mul_f64 v[77:78], v[17:18], v[77:78] v_mul_f64 v[87:88], v[87:88], v[99:100] v_add_f64 v[95:96], v[113:114], -v[65:66] v_add_f64 v[69:70], v[59:60], -v[69:70] v_mul_f64 v[115:116], v[23:24], -v[91:92] v_mul_f64 v[117:118], v[105:106], -v[93:94] v_fma_f64 v[59:60], v[81:82], v[3:4], v[79:80] v_mul_f64 v[79:80], v[21:22], v[91:92] v_mul_f64 v[81:82], v[107:108], v[93:94] v_add_f64 v[91:92], v[113:114], v[65:66] v_fma_f64 v[93:94], v[85:86], v[17:18], v[109:110] v_add_f64 v[111:112], v[67:68], v[111:112] v_fma_f64 v[85:86], v[85:86], v[19:20], v[77:78] v_fma_f64 v[109:110], v[95:96], v[21:22], v[115:116] v_fma_f64 v[113:114], v[69:70], v[107:108], v[117:118] v_fma_f64 v[95:96], v[95:96], v[23:24], v[79:80] v_fma_f64 v[81:82], v[69:70], v[105:106], v[81:82] v_fma_f64 v[65:66], v[83:84], v[7:8], v[89:90] v_fma_f64 v[67:68], v[71:72], v[97:98], v[87:88] ; wave barrier ds_write2_b64 v132, v[91:92], v[93:94] offset1:16 ds_write2_b64 v132, v[109:110], v[113:114] offset0:32 offset1:48 ; wave barrier ds_read2st64_b64 v[69:72], v129 offset1:1 ds_read2st64_b64 v[77:80], v129 offset0:2 offset1:3 ; wave barrier ds_write2_b64 v132, v[111:112], v[85:86] offset1:16 ds_write2_b64 v132, v[95:96], v[81:82] offset0:32 offset1:48 ; wave barrier ds_read2st64_b64 v[81:84], v129 offset1:1 ds_read2st64_b64 v[85:88], v129 offset0:2 
offset1:3 ; wave barrier ds_write_b128 v130, v[73:76] ds_write_b128 v130, v[61:64] offset:16 ; wave barrier ds_read2st64_b64 v[61:64], v129 offset1:1 ds_read2st64_b64 v[73:76], v129 offset0:2 offset1:3 ; wave barrier ds_write_b128 v130, v[57:60] ds_write_b128 v130, v[65:68] offset:16 ; wave barrier ds_read2st64_b64 v[57:60], v129 offset1:1 ds_read2st64_b64 v[65:68], v129 offset0:2 offset1:3 ; wave barrier v_add_f64 v[111:112], v[25:26], -v[29:30] v_add_f64 v[113:114], v[25:26], v[29:30] s_waitcnt lgkmcnt(4) v_add_f64 v[89:90], v[63:64], -v[75:76] v_add_f64 v[93:94], v[61:62], v[73:74] s_waitcnt lgkmcnt(0) v_add_f64 v[91:92], v[57:58], -v[65:66] v_add_f64 v[57:58], v[57:58], v[65:66] v_add_f64 v[65:66], v[59:60], v[67:68] v_add_f64 v[61:62], v[61:62], -v[73:74] v_add_f64 v[59:60], v[59:60], -v[67:68] v_add_f64 v[63:64], v[63:64], v[75:76] v_add_f64 v[115:116], v[27:28], v[31:32] v_add_f64 v[67:68], v[91:92], -v[89:90] v_add_f64 v[75:76], v[91:92], v[89:90] v_add_f64 v[73:74], v[57:58], -v[65:66] v_add_f64 v[65:66], v[57:58], v[65:66] v_add_f64 v[89:90], v[61:62], v[59:60] v_add_f64 v[59:60], v[61:62], -v[59:60] v_add_f64 v[91:92], v[93:94], -v[63:64] v_add_f64 v[63:64], v[93:94], v[63:64] v_mul_f64 v[61:62], v[11:12], -v[67:68] v_mul_f64 v[67:68], v[9:10], v[67:68] v_mul_f64 v[95:96], v[15:16], -v[73:74] v_mul_f64 v[109:110], v[101:102], -v[75:76] v_mul_f64 v[73:74], v[13:14], v[73:74] v_mul_f64 v[75:76], v[103:104], v[75:76] v_fma_f64 v[61:62], v[89:90], v[9:10], v[61:62] v_fma_f64 v[67:68], v[89:90], v[11:12], v[67:68] v_fma_f64 v[93:94], v[91:92], v[13:14], v[95:96] v_fma_f64 v[95:96], v[59:60], v[103:104], v[109:110] v_fma_f64 v[73:74], v[91:92], v[15:16], v[73:74] v_fma_f64 v[75:76], v[59:60], v[101:102], v[75:76] ds_write2_b64 v133, v[63:64], v[61:62] offset1:4 ds_write2_b64 v133, v[93:94], v[95:96] offset0:8 offset1:12 ; wave barrier ds_read2st64_b64 v[57:60], v129 offset1:1 ds_read2st64_b64 v[61:64], v129 offset0:2 offset1:3 ; wave barrier ds_write2_b64 
v133, v[65:66], v[67:68] offset1:4 ds_write2_b64 v133, v[73:74], v[75:76] offset0:8 offset1:12 ; wave barrier ds_read2st64_b64 v[65:68], v129 offset1:1 ds_read2st64_b64 v[73:76], v129 offset0:2 offset1:3 v_add_f64 v[93:94], v[37:38], v[33:34] v_add_f64 v[33:34], v[37:38], -v[33:34] s_waitcnt lgkmcnt(4) v_add_f64 v[89:90], v[59:60], -v[63:64] v_add_f64 v[95:96], v[57:58], v[61:62] s_waitcnt lgkmcnt(0) v_add_f64 v[91:92], v[65:66], -v[73:74] v_add_f64 v[65:66], v[65:66], v[73:74] v_add_f64 v[73:74], v[67:68], v[75:76] v_add_f64 v[57:58], v[57:58], -v[61:62] v_add_f64 v[61:62], v[67:68], -v[75:76] v_add_f64 v[59:60], v[59:60], v[63:64] v_add_f64 v[63:64], v[39:40], v[35:36] v_add_f64 v[35:36], v[39:40], -v[35:36] v_add_f64 v[67:68], v[91:92], -v[89:90] v_add_f64 v[89:90], v[91:92], v[89:90] v_add_f64 v[75:76], v[65:66], -v[73:74] v_add_f64 v[65:66], v[65:66], v[73:74] v_add_f64 v[37:38], v[57:58], v[61:62] v_add_f64 v[57:58], v[57:58], -v[61:62] v_add_f64 v[39:40], v[95:96], -v[59:60] v_add_f64 v[25:26], v[95:96], v[59:60] v_mul_f64 v[61:62], v[19:20], -v[67:68] v_mul_f64 v[109:110], v[105:106], -v[89:90] v_mul_f64 v[91:92], v[23:24], -v[75:76] v_mul_f64 v[29:30], v[17:18], v[67:68] v_add_f64 v[95:96], v[35:36], -v[111:112] v_mul_f64 v[75:76], v[21:22], v[75:76] v_mul_f64 v[73:74], v[107:108], v[89:90] ; wave barrier v_add_f64 v[35:36], v[35:36], v[111:112] v_fma_f64 v[59:60], v[37:38], v[17:18], v[61:62] v_fma_f64 v[67:68], v[57:58], v[107:108], v[109:110] v_fma_f64 v[61:62], v[39:40], v[21:22], v[91:92] v_add_f64 v[91:92], v[27:28], -v[31:32] v_fma_f64 v[37:38], v[37:38], v[19:20], v[29:30] ds_write2_b64 v132, v[25:26], v[59:60] offset1:16 ds_write2_b64 v132, v[61:62], v[67:68] offset0:32 offset1:48 v_add_f64 v[59:60], v[63:64], -v[115:116] ; wave barrier ds_read2st64_b64 v[25:28], v129 offset1:1 ds_read2st64_b64 v[29:32], v129 offset0:2 offset1:3 ; wave barrier v_fma_f64 v[39:40], v[39:40], v[23:24], v[75:76] v_fma_f64 v[57:58], v[57:58], v[105:106], v[73:74] 
ds_write2_b64 v132, v[65:66], v[37:38] offset1:16 v_add_f64 v[61:62], v[33:34], v[91:92] v_mul_f64 v[37:38], v[3:4], -v[95:96] v_mul_f64 v[89:90], v[1:2], v[95:96] v_add_f64 v[65:66], v[93:94], -v[113:114] v_add_f64 v[67:68], v[33:34], -v[91:92] v_mul_f64 v[73:74], v[7:8], -v[59:60] v_mul_f64 v[75:76], v[97:98], -v[35:36] v_mul_f64 v[91:92], v[59:60], v[5:6] v_mul_f64 v[95:96], v[35:36], v[99:100] ds_write2_b64 v132, v[39:40], v[57:58] offset0:32 offset1:48 v_fma_f64 v[35:36], v[61:62], v[1:2], v[37:38] v_add_f64 v[33:34], v[93:94], v[113:114] v_add_f64 v[57:58], v[63:64], v[115:116] v_fma_f64 v[59:60], v[61:62], v[3:4], v[89:90] v_fma_f64 v[37:38], v[65:66], v[5:6], v[73:74] v_fma_f64 v[39:40], v[67:68], v[99:100], v[75:76] v_fma_f64 v[61:62], v[65:66], v[7:8], v[91:92] v_fma_f64 v[63:64], v[67:68], v[97:98], v[95:96] ; wave barrier ds_read2st64_b64 v[65:68], v129 offset1:1 ds_read2st64_b64 v[73:76], v129 offset0:2 offset1:3 ; wave barrier ds_write_b128 v130, v[33:36] ds_write_b128 v130, v[37:40] offset:16 ; wave barrier ds_read2st64_b64 v[33:36], v129 offset1:1 ds_read2st64_b64 v[37:40], v129 offset0:2 offset1:3 ; wave barrier ds_write_b128 v130, v[57:60] ds_write_b128 v130, v[61:64] offset:16 ; wave barrier ds_read2st64_b64 v[57:60], v129 offset1:1 ds_read2st64_b64 v[61:64], v129 offset0:2 offset1:3 v_add_f64 v[89:90], v[49:50], v[53:54] v_add_f64 v[91:92], v[49:50], -v[53:54] s_waitcnt lgkmcnt(4) v_add_f64 v[49:50], v[35:36], -v[39:40] v_add_f64 v[93:94], v[51:52], v[55:56] s_waitcnt lgkmcnt(0) v_add_f64 v[53:54], v[57:58], -v[61:62] v_add_f64 v[95:96], v[51:52], -v[55:56] v_add_f64 v[51:52], v[57:58], v[61:62] v_add_f64 v[55:56], v[59:60], v[63:64] v_add_f64 v[57:58], v[33:34], v[37:38] v_add_f64 v[33:34], v[33:34], -v[37:38] v_add_f64 v[37:38], v[59:60], -v[63:64] v_add_f64 v[61:62], v[41:42], v[45:46] v_add_f64 v[59:60], v[53:54], -v[49:50] v_add_f64 v[35:36], v[35:36], v[39:40] v_add_f64 v[63:64], v[41:42], -v[45:46] v_add_f64 v[39:40], v[51:52], -v[55:56] 
v_add_f64 v[41:42], v[53:54], v[49:50] v_add_f64 v[109:110], v[43:44], v[47:48] v_add_f64 v[111:112], v[43:44], -v[47:48] v_add_f64 v[43:44], v[33:34], v[37:38] v_mul_f64 v[45:46], v[11:12], -v[59:60] v_mul_f64 v[59:60], v[9:10], v[59:60] v_add_f64 v[47:48], v[57:58], -v[35:36] v_add_f64 v[33:34], v[33:34], -v[37:38] v_mul_f64 v[37:38], v[15:16], -v[39:40] v_mul_f64 v[49:50], v[101:102], -v[41:42] v_mul_f64 v[39:40], v[13:14], v[39:40] v_mul_f64 v[41:42], v[103:104], v[41:42] v_fma_f64 v[45:46], v[43:44], v[9:10], v[45:46] v_add_f64 v[35:36], v[57:58], v[35:36] v_add_f64 v[51:52], v[51:52], v[55:56] v_fma_f64 v[43:44], v[43:44], v[11:12], v[59:60] v_fma_f64 v[37:38], v[47:48], v[13:14], v[37:38] v_fma_f64 v[49:50], v[33:34], v[103:104], v[49:50] v_fma_f64 v[47:48], v[47:48], v[15:16], v[39:40] v_fma_f64 v[41:42], v[33:34], v[101:102], v[41:42] ; wave barrier ds_write2_b64 v133, v[35:36], v[45:46] offset1:4 ds_write2_b64 v133, v[37:38], v[49:50] offset0:8 offset1:12 ; wave barrier ds_read2st64_b64 v[33:36], v129 offset1:1 ds_read2st64_b64 v[37:40], v129 offset0:2 offset1:3 ; wave barrier ds_write2_b64 v133, v[51:52], v[43:44] offset1:4 ds_write2_b64 v133, v[47:48], v[41:42] offset0:8 offset1:12 ; wave barrier ds_read2st64_b64 v[41:44], v129 offset1:1 ds_read2st64_b64 v[45:48], v129 offset0:2 offset1:3 v_add_f64 v[57:58], v[71:72], v[79:80] v_add_f64 v[59:60], v[71:72], -v[79:80] s_waitcnt lgkmcnt(4) v_add_f64 v[51:52], v[35:36], -v[39:40] v_add_f64 v[53:54], v[69:70], v[77:78] s_waitcnt lgkmcnt(0) v_add_f64 v[71:72], v[41:42], -v[45:46] v_add_f64 v[41:42], v[41:42], v[45:46] v_add_f64 v[45:46], v[43:44], v[47:48] v_add_f64 v[55:56], v[69:70], -v[77:78] v_add_f64 v[49:50], v[81:82], v[85:86] v_add_f64 v[69:70], v[81:82], -v[85:86] v_add_f64 v[81:82], v[33:34], v[37:38] v_add_f64 v[33:34], v[33:34], -v[37:38] v_add_f64 v[37:38], v[43:44], -v[47:48] v_add_f64 v[43:44], v[71:72], -v[51:52] v_add_f64 v[35:36], v[35:36], v[39:40] v_add_f64 v[39:40], v[41:42], -v[45:46] 
v_add_f64 v[47:48], v[71:72], v[51:52] v_add_f64 v[77:78], v[83:84], v[87:88] v_add_f64 v[79:80], v[83:84], -v[87:88] v_add_f64 v[85:86], v[25:26], v[29:30] v_add_f64 v[87:88], v[25:26], -v[29:30] v_add_f64 v[25:26], v[33:34], v[37:38] v_mul_f64 v[29:30], v[19:20], -v[43:44] v_add_f64 v[51:52], v[81:82], -v[35:36] v_add_f64 v[33:34], v[33:34], -v[37:38] v_mul_f64 v[37:38], v[23:24], -v[39:40] v_mul_f64 v[71:72], v[105:106], -v[47:48] v_add_f64 v[113:114], v[27:28], v[31:32] v_add_f64 v[115:116], v[27:28], -v[31:32] v_add_f64 v[31:32], v[81:82], v[35:36] v_fma_f64 v[27:28], v[25:26], v[17:18], v[29:30] ; wave barrier v_mul_f64 v[29:30], v[17:18], v[43:44] v_add_f64 v[41:42], v[41:42], v[45:46] v_fma_f64 v[35:36], v[51:52], v[21:22], v[37:38] v_fma_f64 v[37:38], v[33:34], v[107:108], v[71:72] ds_write2_b64 v132, v[31:32], v[27:28] offset1:16 ds_write2_b64 v132, v[35:36], v[37:38] offset0:32 offset1:48 v_lshl_add_u32 v37, v0, 9, s6 v_sub_u32_e32 v38, 0x8000, v37 v_cmp_lt_u32_e32 vcc, s0, v37 v_cndmask_b32_e32 v37, v37, v38, vcc v_cvt_f64_u32_e32 v[37:38], v37 s_mov_b32 s0, 0x54442d18 s_mov_b32 s1, 0x3f0921fb v_fma_f64 v[45:46], v[25:26], v[19:20], v[29:30] v_mul_f64 v[71:72], v[37:38], s[0:1] ; wave barrier ds_read2st64_b64 v[25:28], v129 offset1:1 ds_read2st64_b64 v[29:32], v129 offset0:2 offset1:3 s_mov_b32 s0, 0xbe8838d4 v_add_f64 v[123:124], v[65:66], v[73:74] v_add_f64 v[65:66], v[65:66], -v[73:74] s_mov_b32 s1, 0xbda8fae9 s_waitcnt lgkmcnt(0) v_add_f64 v[73:74], v[25:26], v[29:30] v_add_f64 v[25:26], v[25:26], -v[29:30] v_mul_f64 v[117:118], v[71:72], v[71:72] v_mov_b32_e32 v29, 0xbdb4b1c4 v_mov_b32_e32 v30, 0x3e21ee9e v_mul_f64 v[39:40], v[21:22], v[39:40] v_mul_f64 v[43:44], v[107:108], v[47:48] v_add_f64 v[135:136], v[67:68], v[75:76] v_add_f64 v[67:68], v[67:68], -v[75:76] v_add_f64 v[75:76], v[27:28], v[31:32] v_fma_f64 v[29:30], v[117:118], s[0:1], v[29:30] s_mov_b32 s0, 0xf9a43bb8 v_add_f64 v[27:28], v[27:28], -v[31:32] v_mov_b32_e32 v31, 0xb42fdfa7 
v_mov_b32_e32 v32, 0xbe5ae600 s_mov_b32 s1, 0x3de5e0b2 v_fma_f64 v[31:32], v[117:118], s[0:1], v[31:32] s_mov_b32 s0, 0x809c52ad s_mov_b32 s1, 0xbe927e4f v_fma_f64 v[29:30], v[29:30], v[117:118], s[0:1] v_fma_f64 v[35:36], v[51:52], v[23:24], v[39:40] v_fma_f64 v[33:34], v[33:34], v[105:106], v[43:44] s_mov_b32 s0, 0x796cde01 s_mov_b32 s1, 0x3ec71de3 v_fma_f64 v[31:32], v[31:32], v[117:118], s[0:1] s_mov_b32 s0, 0x19cb1590 s_mov_b32 s1, 0x3efa01a0 v_fma_f64 v[29:30], v[29:30], v[117:118], s[0:1] ; wave barrier ds_write2_b64 v132, v[41:42], v[45:46] offset1:16 ds_write2_b64 v132, v[35:36], v[33:34] offset0:32 offset1:48 ; wave barrier ds_read2st64_b64 v[33:36], v129 offset1:1 ds_read2st64_b64 v[37:40], v129 offset0:2 offset1:3 s_mov_b32 s0, 0x19e83e5c s_mov_b32 s1, 0xbf2a01a0 v_fma_f64 v[31:32], v[31:32], v[117:118], s[0:1] s_mov_b32 s0, 0x16c15177 s_mov_b32 s1, 0xbf56c16c s_waitcnt lgkmcnt(0) v_add_f64 v[81:82], v[33:34], v[37:38] v_add_f64 v[83:84], v[33:34], -v[37:38] v_fma_f64 v[37:38], v[29:30], v[117:118], s[0:1] s_mov_b32 s0, 0x11110bb3 s_mov_b32 s1, 0x3f811111 v_add_f64 v[41:42], v[53:54], v[57:58] v_add_f64 v[45:46], v[53:54], -v[57:58] v_fma_f64 v[53:54], v[31:32], v[117:118], s[0:1] s_mov_b32 s0, 0x5555554c s_mov_b32 s1, 0x3fa55555 v_add_f64 v[43:44], v[49:50], v[77:78] v_add_f64 v[47:48], v[49:50], -v[77:78] v_add_f64 v[49:50], v[55:56], -v[79:80] v_add_f64 v[29:30], v[55:56], v[79:80] v_fma_f64 v[55:56], v[37:38], v[117:118], s[0:1] s_mov_b32 s0, 0x55555555 s_mov_b32 s1, 0xbfc55555 v_add_f64 v[33:34], v[35:36], v[39:40] v_add_f64 v[35:36], v[35:36], -v[39:40] v_add_f64 v[51:52], v[69:70], v[59:60] v_add_f64 v[31:32], v[69:70], -v[59:60] v_fma_f64 v[69:70], v[53:54], v[117:118], s[0:1] v_mul_f64 v[77:78], v[71:72], v[117:118] v_fma_f64 v[79:80], v[55:56], v[117:118], -0.5 v_add_f64 v[59:60], v[83:84], v[27:28] v_add_f64 v[39:40], v[81:82], v[33:34] v_add_f64 v[55:56], v[81:82], -v[33:34] v_add_f64 v[57:58], v[25:26], -v[35:36] v_add_f64 v[33:34], 
v[25:26], v[35:36] v_add_f64 v[35:36], v[83:84], -v[27:28] v_fma_f64 v[25:26], v[69:70], v[77:78], v[71:72] v_fma_f64 v[27:28], v[79:80], v[117:118], 1.0 v_add_f64 v[37:38], v[73:74], v[75:76] v_add_f64 v[53:54], v[73:74], -v[75:76] v_add_f64 v[125:126], v[89:90], v[93:94] v_add_f64 v[69:70], v[89:90], -v[93:94] v_add_f64 v[71:72], v[61:62], -v[109:110] v_add_f64 v[127:128], v[61:62], v[109:110] v_add_f64 v[117:118], v[91:92], v[111:112] v_add_f64 v[81:82], v[91:92], -v[111:112] v_add_f64 v[83:84], v[63:64], v[95:96] v_add_f64 v[119:120], v[63:64], -v[95:96] v_add_f64 v[121:122], v[85:86], v[113:114] v_add_f64 v[73:74], v[85:86], -v[113:114] v_add_f64 v[75:76], v[123:124], -v[135:136] v_add_f64 v[123:124], v[123:124], v[135:136] v_add_f64 v[113:114], v[87:88], v[67:68] v_add_f64 v[77:78], v[87:88], -v[67:68] v_add_f64 v[79:80], v[65:66], v[115:116] v_add_f64 v[115:116], v[65:66], -v[115:116] v_cndmask_b32_e64 v112, -v26, -v28, vcc v_cndmask_b32_e32 v111, v25, v27, vcc v_cndmask_b32_e32 v110, v28, v26, vcc v_cndmask_b32_e32 v109, v27, v25, vcc s_and_b64 vcc, exec, s[8:9] s_cbranch_vccnz .LBB0_2 ; %bb.1: v_not_b32_e32 v25, v0 v_lshlrev_b32_e32 v177, 4, v25 ; wave barrier ds_write_b128 v177, v[41:44] offset:4096 ds_write_b128 v177, v[29:32] offset:3072 ds_write_b128 v177, v[45:48] offset:2048 ds_write_b128 v177, v[49:52] offset:1024 ; wave barrier ds_read_b128 v[25:28], v134 ds_read_b128 v[61:64], v134 offset:1024 ds_read_b128 v[65:68], v134 offset:2048 ds_read_b128 v[85:88], v134 offset:3072 ; wave barrier ds_write_b128 v177, v[37:40] offset:4096 ds_write_b128 v177, v[33:36] offset:3072 ds_write_b128 v177, v[53:56] offset:2048 ds_write_b128 v177, v[57:60] offset:1024 ; wave barrier ds_read_b128 v[89:92], v134 ds_read_b128 v[93:96], v134 offset:1024 s_waitcnt lgkmcnt(9) v_add_f64 v[143:144], v[127:128], v[27:28] v_add_f64 v[147:148], v[125:126], -v[25:26] v_add_f64 v[27:28], v[127:128], -v[27:28] s_waitcnt lgkmcnt(1) v_add_f64 v[145:146], v[121:122], -v[89:90] 
v_add_f64 v[149:150], v[123:124], v[91:92] v_add_f64 v[25:26], v[125:126], v[25:26] v_add_f64 v[161:162], v[71:72], v[67:68] v_add_f64 v[89:90], v[121:122], v[89:90] v_add_f64 v[91:92], v[123:124], -v[91:92] v_add_f64 v[67:68], v[71:72], -v[67:68] v_add_f64 v[167:168], v[117:118], -v[61:62] v_mul_f64 v[135:136], v[143:144], v[145:146] v_mul_f64 v[151:152], v[149:150], -v[143:144] v_mul_f64 v[155:156], v[149:150], -v[27:28] v_mul_f64 v[157:158], v[27:28], v[145:146] v_add_f64 v[61:62], v[117:118], v[61:62] v_fma_f64 v[153:154], v[147:148], v[149:150], v[135:136] ds_read_b128 v[135:138], v134 offset:2048 ds_read_b128 v[139:142], v134 offset:3072 v_fma_f64 v[151:152], v[147:148], v[145:146], v[151:152] v_fma_f64 v[149:150], v[25:26], v[149:150], v[157:158] v_fma_f64 v[145:146], v[25:26], v[145:146], v[155:156] s_waitcnt lgkmcnt(1) v_add_f64 v[163:164], v[73:74], -v[135:136] v_add_f64 v[165:166], v[75:76], v[137:138] v_add_f64 v[137:138], v[75:76], -v[137:138] v_mul_f64 v[159:160], v[111:112], v[153:154] v_mul_f64 v[153:154], -v[109:110], v[153:154] v_add_f64 v[135:136], v[73:74], v[135:136] v_fma_f64 v[143:144], v[143:144], v[91:92], v[149:150] v_fma_f64 v[145:146], v[147:148], v[89:90], v[145:146] v_mul_f64 v[157:158], v[161:162], v[163:164] ; wave barrier v_fma_f64 v[155:156], v[151:152], -v[109:110], v[159:160] v_fma_f64 v[151:152], v[151:152], -v[111:112], v[153:154] v_add_f64 v[153:154], v[69:70], -v[65:66] v_mul_f64 v[159:160], v[165:166], -v[161:162] v_add_f64 v[65:66], v[69:70], v[65:66] v_fma_f64 v[89:90], v[25:26], v[89:90], v[155:156] v_fma_f64 v[91:92], v[27:28], v[91:92], v[151:152] v_fma_f64 v[147:148], v[153:154], v[165:166], v[157:158] v_fma_f64 v[149:150], v[153:154], v[163:164], v[159:160] v_add_f64 v[151:152], v[113:114], -v[93:94] v_mul_f64 v[155:156], v[67:68], v[163:164] v_mul_f64 v[157:158], v[165:166], -v[67:68] v_add_f64 v[159:160], v[115:116], v[95:96] v_add_f64 v[25:26], v[145:146], v[89:90] v_add_f64 v[27:28], -v[91:92], -v[143:144] 
v_add_f64 v[91:92], v[91:92], -v[143:144] v_mul_f64 v[143:144], v[109:110], v[147:148] v_add_f64 v[89:90], v[89:90], -v[145:146] v_add_f64 v[145:146], v[119:120], v[63:64] v_mul_f64 v[147:148], v[111:112], -v[147:148] v_add_f64 v[63:64], v[119:120], -v[63:64] v_add_f64 v[93:94], v[113:114], v[93:94] v_add_f64 v[95:96], v[115:116], -v[95:96] v_fma_f64 v[143:144], v[149:150], v[111:112], v[143:144] v_mul_f64 v[169:170], v[145:146], v[151:152] v_fma_f64 v[147:148], v[149:150], v[109:110], v[147:148] v_fma_f64 v[149:150], v[65:66], v[165:166], v[155:156] v_fma_f64 v[155:156], v[65:66], v[163:164], v[157:158] s_waitcnt lgkmcnt(0) v_add_f64 v[163:164], v[77:78], -v[139:140] v_mul_f64 v[157:158], v[159:160], -v[145:146] v_fma_f64 v[143:144], v[67:68], v[137:138], v[143:144] v_add_f64 v[67:68], v[83:84], v[87:88] v_fma_f64 v[165:166], v[167:168], v[159:160], v[169:170] v_fma_f64 v[147:148], v[65:66], v[135:136], v[147:148] v_fma_f64 v[137:138], v[161:162], v[137:138], v[149:150] v_fma_f64 v[135:136], v[153:154], v[135:136], v[155:156] v_add_f64 v[149:150], v[81:82], -v[85:86] v_add_f64 v[153:154], v[79:80], v[141:142] v_fma_f64 v[65:66], v[167:168], v[151:152], v[157:158] v_mul_f64 v[155:156], v[67:68], v[163:164] v_mul_f64 v[157:158], v[109:110], -v[165:166] v_mul_f64 v[161:162], -v[111:112], v[165:166] v_add_f64 v[87:88], v[83:84], -v[87:88] v_mul_f64 v[169:170], v[63:64], v[151:152] v_add_f64 v[85:86], v[81:82], v[85:86] v_mul_f64 v[165:166], v[153:154], -v[67:68] v_add_f64 v[139:140], v[77:78], v[139:140] v_fma_f64 v[155:156], v[149:150], v[153:154], v[155:156] v_fma_f64 v[157:158], v[65:66], -v[111:112], v[157:158] v_fma_f64 v[65:66], v[65:66], v[109:110], v[161:162] v_mul_f64 v[161:162], v[159:160], -v[63:64] v_mul_f64 v[173:174], v[87:88], v[163:164] v_mul_f64 v[175:176], v[153:154], -v[87:88] v_fma_f64 v[165:166], v[149:150], v[163:164], v[165:166] v_fma_f64 v[159:160], v[61:62], v[159:160], v[169:170] v_mul_f64 v[171:172], v[111:112], v[155:156] v_mul_f64 
v[155:156], v[109:110], v[155:156] v_add_f64 v[141:142], v[79:80], -v[141:142] v_fma_f64 v[151:152], v[61:62], v[151:152], v[161:162] v_fma_f64 v[153:154], v[85:86], v[153:154], v[173:174] v_fma_f64 v[163:164], v[85:86], v[163:164], v[175:176] v_fma_f64 v[63:64], v[63:64], v[95:96], v[65:66] v_fma_f64 v[61:62], v[61:62], v[93:94], v[157:158] v_fma_f64 v[161:162], v[165:166], -v[109:110], v[171:172] v_fma_f64 v[155:156], v[165:166], v[111:112], v[155:156] v_fma_f64 v[145:146], v[145:146], v[95:96], v[159:160] v_fma_f64 v[151:152], v[167:168], v[93:94], v[151:152] v_fma_f64 v[153:154], v[67:68], v[141:142], v[153:154] v_fma_f64 v[149:150], v[149:150], v[139:140], v[163:164] v_add_f64 v[65:66], v[135:136], v[147:148] v_add_f64 v[67:68], -v[143:144], -v[137:138] v_fma_f64 v[157:158], v[87:88], v[141:142], v[161:162] v_fma_f64 v[155:156], v[85:86], v[139:140], v[155:156] v_add_f64 v[95:96], v[143:144], -v[137:138] v_add_f64 v[93:94], v[147:148], -v[135:136] v_add_f64 v[137:138], v[63:64], -v[145:146] v_add_f64 v[135:136], v[61:62], -v[151:152] v_add_f64 v[85:86], v[151:152], v[61:62] v_add_f64 v[87:88], -v[63:64], -v[145:146] v_add_f64 v[141:142], v[157:158], -v[153:154] v_add_f64 v[139:140], v[155:156], -v[149:150] v_add_f64 v[61:62], v[149:150], v[155:156] v_add_f64 v[63:64], -v[157:158], -v[153:154] ds_write_b128 v177, v[89:92] offset:4096 ds_write_b128 v177, v[135:138] offset:3072 ds_write_b128 v177, v[93:96] offset:2048 ds_write_b128 v177, v[139:142] offset:1024 ; wave barrier ds_read_b128 v[89:92], v134 ds_read_b128 v[93:96], v134 offset:1024 v_or_b32_e32 v135, 0x800, v134 v_or_b32_e32 v136, 0xc00, v134 s_load_dwordx2 s[0:1], s[4:5], 0x0 s_cbranch_execz .LBB0_3 s_branch .LBB0_8 .LBB0_2: ; implicit-def: $vgpr135 ; implicit-def: $vgpr136 ; implicit-def: $vgpr89_vgpr90_vgpr91_vgpr92 ; implicit-def: $vgpr93_vgpr94_vgpr95_vgpr96 ; implicit-def: $vgpr25_vgpr26_vgpr27_vgpr28 ; implicit-def: $vgpr85_vgpr86_vgpr87_vgpr88 ; implicit-def: $vgpr65_vgpr66_vgpr67_vgpr68 ; 
implicit-def: $vgpr61_vgpr62_vgpr63_vgpr64 s_load_dwordx2 s[0:1], s[4:5], 0x0 .LBB0_3: v_sub_u32_e32 v25, 0, v0 v_and_b32_e32 v25, 0x7f, v25 v_sub_u32_e32 v134, 0, v134 v_lshlrev_b32_e32 v85, 4, v25 v_lshl_add_u32 v135, v0, 3, v129 ; wave barrier ds_write_b128 v134, v[81:84] offset:1024 ds_write_b128 v85, v[69:72] ; wave barrier ds_read_b128 v[81:84], v135 ds_read_b128 v[65:68], v135 offset:1024 ; wave barrier ds_write_b128 v134, v[77:80] offset:1024 ds_write_b128 v85, v[73:76] ; wave barrier ds_read_b128 v[73:76], v135 ds_read_b128 v[61:64], v135 offset:1024 v_cmp_ne_u32_e32 vcc, 0, v0 ; implicit-def: $vgpr69_vgpr70_vgpr71_vgpr72 ; implicit-def: $vgpr25_vgpr26_vgpr27_vgpr28 s_and_saveexec_b64 s[2:3], vcc s_xor_b64 s[2:3], exec, s[2:3] s_cbranch_execz .LBB0_5 ; %bb.4: s_waitcnt lgkmcnt(0) v_add_f64 v[25:26], v[127:128], v[83:84] v_add_f64 v[27:28], v[121:122], -v[73:74] v_add_f64 v[69:70], v[125:126], -v[81:82] v_add_f64 v[71:72], v[123:124], v[75:76] v_add_f64 v[79:80], v[127:128], -v[83:84] v_add_f64 v[81:82], v[125:126], v[81:82] v_add_f64 v[73:74], v[121:122], v[73:74] v_add_f64 v[75:76], v[123:124], -v[75:76] ; implicit-def: $vgpr125_vgpr126 ; implicit-def: $vgpr127_vgpr128 ; implicit-def: $vgpr121_vgpr122 ; implicit-def: $vgpr123_vgpr124 v_mul_f64 v[77:78], v[25:26], v[27:28] v_mul_f64 v[83:84], v[71:72], -v[25:26] v_mul_f64 v[86:87], v[71:72], -v[79:80] v_mul_f64 v[88:89], v[79:80], v[27:28] v_fma_f64 v[77:78], v[69:70], v[71:72], v[77:78] v_fma_f64 v[83:84], v[69:70], v[27:28], v[83:84] v_fma_f64 v[27:28], v[81:82], v[27:28], v[86:87] v_fma_f64 v[71:72], v[81:82], v[71:72], v[88:89] v_mul_f64 v[90:91], v[111:112], v[77:78] v_mul_f64 v[77:78], -v[109:110], v[77:78] v_fma_f64 v[69:70], v[69:70], v[73:74], v[27:28] v_fma_f64 v[71:72], v[25:26], v[75:76], v[71:72] v_fma_f64 v[86:87], v[83:84], -v[109:110], v[90:91] v_fma_f64 v[77:78], v[83:84], -v[111:112], v[77:78] v_fma_f64 v[73:74], v[81:82], v[73:74], v[86:87] v_fma_f64 v[75:76], v[79:80], v[75:76], 
v[77:78] ; implicit-def: $vgpr81_vgpr82_vgpr83_vgpr84 v_add_f64 v[25:26], v[69:70], v[73:74] v_add_f64 v[27:28], -v[75:76], -v[71:72] v_add_f64 v[71:72], v[75:76], -v[71:72] v_add_f64 v[69:70], v[73:74], -v[69:70] ; implicit-def: $vgpr73_vgpr74_vgpr75_vgpr76 .LBB0_5: ; %Flow s_andn2_saveexec_b64 s[2:3], s[2:3] s_cbranch_execz .LBB0_7 ; %bb.6: v_add_f64 v[25:26], v[125:126], v[127:128] v_add_f64 v[27:28], v[125:126], -v[127:128] v_add_f64 v[69:70], v[121:122], v[123:124] v_add_f64 v[71:72], v[121:122], -v[123:124] v_mul_f64 v[25:26], v[25:26], v[69:70] v_mul_f64 v[27:28], v[27:28], v[71:72] s_waitcnt lgkmcnt(0) v_mul_f64 v[69:70], v[73:74], -v[83:84] v_mul_f64 v[71:72], v[83:84], -v[75:76] v_add_f64 v[77:78], v[25:26], v[27:28] v_add_f64 v[27:28], v[25:26], -v[27:28] v_fma_f64 v[69:70], v[81:82], -v[75:76], v[69:70] v_fma_f64 v[73:74], v[81:82], v[73:74], v[71:72] v_add_f64 v[25:26], v[77:78], v[77:78] v_add_f64 v[27:28], v[27:28], v[27:28] v_mul_f64 v[71:72], v[69:70], 4.0 v_mul_f64 v[69:70], v[73:74], 4.0 v_xor_b32_e32 v28, 0x80000000, v28 .LBB0_7: s_or_b64 exec, exec, s[2:3] s_waitcnt lgkmcnt(0) v_add_f64 v[73:74], v[119:120], v[67:68] v_add_f64 v[75:76], v[113:114], -v[61:62] v_add_f64 v[77:78], v[117:118], -v[65:66] v_add_f64 v[79:80], v[115:116], v[63:64] v_add_f64 v[67:68], v[119:120], -v[67:68] v_add_f64 v[65:66], v[117:118], v[65:66] v_add_f64 v[61:62], v[113:114], v[61:62] v_add_f64 v[63:64], v[115:116], -v[63:64] s_mov_b32 s2, 0x8da49510 v_mul_f64 v[81:82], v[73:74], v[75:76] s_mov_b32 s3, 0xbf13bd2c v_mul_f64 v[83:84], v[79:80], -v[73:74] v_mul_f64 v[86:87], v[79:80], -v[67:68] v_mul_f64 v[88:89], v[67:68], v[75:76] s_mov_b32 s5, 0x3f8921d1 s_mov_b32 s4, 0xfcdec784 ; wave barrier v_add_u32_e32 v136, 0x400, v135 v_fma_f64 v[81:82], v[77:78], v[79:80], v[81:82] v_fma_f64 v[83:84], v[77:78], v[75:76], v[83:84] v_fma_f64 v[75:76], v[65:66], v[75:76], v[86:87] v_fma_f64 v[79:80], v[65:66], v[79:80], v[88:89] v_mul_f64 v[90:91], -v[111:112], v[81:82] v_mul_f64 
v[81:82], v[109:110], -v[81:82] v_fma_f64 v[75:76], v[77:78], v[61:62], v[75:76] v_fma_f64 v[73:74], v[73:74], v[63:64], v[79:80] v_fma_f64 v[86:87], v[83:84], v[109:110], v[90:91] v_fma_f64 v[81:82], v[83:84], -v[111:112], v[81:82] v_fma_f64 v[77:78], v[67:68], v[63:64], v[86:87] v_fma_f64 v[79:80], v[65:66], v[61:62], v[81:82] v_fma_f64 v[65:66], v[109:110], s[2:3], v[109:110] v_fma_f64 v[81:82], v[111:112], s[2:3], v[111:112] v_add_f64 v[63:64], v[77:78], -v[73:74] v_add_f64 v[61:62], v[79:80], -v[75:76] v_fma_f64 v[83:84], v[111:112], s[4:5], v[65:66] s_mov_b32 s5, 0xbf8921d1 ds_write_b128 v134, v[61:64] offset:1024 ds_write_b128 v85, v[69:72] ; wave barrier ds_read_b128 v[65:68], v135 ds_read_b128 v[61:64], v135 offset:1024 ; wave barrier ds_write_b128 v134, v[49:52] offset:1008 ds_write_b128 v134, v[45:48] offset:2032 ; wave barrier ds_read_b128 v[45:48], v135 ds_read_b128 v[49:52], v135 offset:1024 ; wave barrier ds_write_b128 v134, v[57:60] offset:1008 ds_write_b128 v134, v[53:56] offset:2032 ; wave barrier ds_read_b128 v[53:56], v135 ds_read_b128 v[57:60], v135 offset:1024 v_fma_f64 v[69:70], v[109:110], s[4:5], v[81:82] s_waitcnt lgkmcnt(5) v_add_f64 v[71:72], v[41:42], v[45:46] v_add_f64 v[81:82], v[43:44], v[47:48] s_waitcnt lgkmcnt(1) v_add_f64 v[85:86], v[37:38], -v[53:54] v_add_f64 v[43:44], v[43:44], -v[47:48] v_add_f64 v[41:42], v[41:42], -v[45:46] v_add_f64 v[45:46], v[31:32], v[51:52] s_waitcnt lgkmcnt(0) v_add_f64 v[47:48], v[33:34], -v[57:58] v_add_f64 v[37:38], v[37:38], v[53:54] v_add_f64 v[53:54], v[39:40], -v[55:56] v_add_f64 v[39:40], v[39:40], v[55:56] v_mul_f64 v[55:56], v[81:82], v[85:86] v_add_f64 v[87:88], v[29:30], v[49:50] v_add_f64 v[29:30], v[29:30], -v[49:50] v_add_f64 v[49:50], v[35:36], v[59:60] v_mul_f64 v[89:90], v[45:46], v[47:48] v_add_f64 v[31:32], v[31:32], -v[51:52] v_add_f64 v[35:36], v[35:36], -v[59:60] v_mul_f64 v[51:52], v[39:40], -v[81:82] v_fma_f64 v[55:56], v[41:42], v[39:40], v[55:56] v_add_f64 v[33:34], 
v[33:34], v[57:58] v_mul_f64 v[57:58], v[39:40], -v[43:44] v_mul_f64 v[59:60], v[49:50], -v[45:46] v_fma_f64 v[89:90], v[29:30], v[49:50], v[89:90] v_mul_f64 v[91:92], v[43:44], v[85:86] v_mul_f64 v[93:94], v[49:50], -v[31:32] v_fma_f64 v[51:52], v[41:42], v[85:86], v[51:52] v_mul_f64 v[95:96], v[69:70], v[55:56] v_mul_f64 v[55:56], -v[83:84], v[55:56] v_mul_f64 v[109:110], v[31:32], v[47:48] v_fma_f64 v[59:60], v[29:30], v[47:48], v[59:60] v_mul_f64 v[111:112], -v[69:70], v[89:90] v_mul_f64 v[89:90], v[83:84], -v[89:90] v_fma_f64 v[39:40], v[71:72], v[39:40], v[91:92] v_fma_f64 v[57:58], v[71:72], v[85:86], v[57:58] v_fma_f64 v[85:86], v[51:52], -v[83:84], v[95:96] v_fma_f64 v[51:52], v[51:52], -v[69:70], v[55:56] v_fma_f64 v[49:50], v[87:88], v[49:50], v[109:110] v_fma_f64 v[47:48], v[87:88], v[47:48], v[93:94] v_fma_f64 v[55:56], v[59:60], v[83:84], v[111:112] v_fma_f64 v[59:60], v[59:60], -v[69:70], v[89:90] v_fma_f64 v[39:40], v[81:82], v[53:54], v[39:40] v_fma_f64 v[41:42], v[41:42], v[37:38], v[57:58] v_fma_f64 v[37:38], v[71:72], v[37:38], v[85:86] v_fma_f64 v[43:44], v[43:44], v[53:54], v[51:52] v_fma_f64 v[45:46], v[45:46], v[35:36], v[49:50] v_fma_f64 v[47:48], v[29:30], v[33:34], v[47:48] v_fma_f64 v[49:50], v[31:32], v[35:36], v[55:56] v_fma_f64 v[51:52], v[87:88], v[33:34], v[59:60] v_add_f64 v[85:86], v[75:76], v[79:80] v_add_f64 v[87:88], -v[77:78], -v[73:74] v_add_f64 v[89:90], v[41:42], v[37:38] v_add_f64 v[91:92], -v[43:44], -v[39:40] v_add_f64 v[29:30], v[37:38], -v[41:42] v_add_f64 v[31:32], v[43:44], -v[39:40] v_add_f64 v[35:36], v[49:50], -v[45:46] v_add_f64 v[33:34], v[51:52], -v[47:48] v_add_f64 v[93:94], v[47:48], v[51:52] v_add_f64 v[95:96], -v[49:50], -v[45:46] ; wave barrier ds_write_b128 v134, v[33:36] offset:1008 ds_write_b128 v134, v[29:32] offset:2032 ; wave barrier .LBB0_8: ; %Flow60 ds_read_b128 v[29:32], v135 ds_read_b128 v[33:36], v136 ; wave barrier s_lshl_b32 s2, s7, 9 s_and_b32 s2, s2, 0x1fe00 s_add_i32 s2, s2, s7 s_waitcnt 
lgkmcnt(0) v_add_f64 v[37:38], v[91:92], v[31:32] v_add_f64 v[31:32], v[91:92], -v[31:32] v_add_f64 v[39:40], v[93:94], -v[33:34] v_add_f64 v[41:42], v[95:96], v[35:36] v_add_f64 v[43:44], v[89:90], v[29:30] v_add_f64 v[29:30], v[89:90], -v[29:30] v_add_f64 v[35:36], v[95:96], -v[35:36] v_add_f64 v[33:34], v[93:94], v[33:34] s_and_b32 s2, s2, 0xffffff00 s_lshl_b32 s3, s6, 9 v_add_f64 v[45:46], v[31:32], -v[39:40] v_add_f64 v[47:48], v[37:38], -v[41:42] v_add_f64 v[31:32], v[31:32], v[39:40] v_add_f64 v[37:38], v[37:38], v[41:42] v_add_f64 v[51:52], v[29:30], -v[35:36] v_add_f64 v[39:40], v[29:30], v[35:36] v_add_f64 v[49:50], v[43:44], -v[33:34] v_add_f64 v[29:30], v[43:44], v[33:34] v_mul_f64 v[35:36], v[3:4], -v[45:46] v_mul_f64 v[45:46], v[1:2], v[45:46] v_mul_f64 v[53:54], v[7:8], -v[47:48] v_mul_f64 v[55:56], v[97:98], -v[31:32] v_mul_f64 v[47:48], v[5:6], v[47:48] v_mul_f64 v[57:58], v[99:100], v[31:32] s_and_b32 s3, s3, 0x1fe00 s_add_i32 s3, s3, s6 v_fma_f64 v[31:32], v[39:40], v[1:2], v[35:36] v_fma_f64 v[39:40], v[39:40], v[3:4], v[45:46] v_fma_f64 v[33:34], v[49:50], v[5:6], v[53:54] v_fma_f64 v[35:36], v[51:52], v[99:100], v[55:56] v_fma_f64 v[41:42], v[49:50], v[7:8], v[47:48] v_fma_f64 v[43:44], v[51:52], v[97:98], v[57:58] ds_write_b128 v130, v[29:32] ds_write_b128 v130, v[33:36] offset:16 ; wave barrier ds_read2st64_b64 v[29:32], v129 offset1:1 ds_read2st64_b64 v[33:36], v129 offset0:2 offset1:3 ; wave barrier ds_write_b128 v130, v[37:40] ds_write_b128 v130, v[41:44] offset:16 ; wave barrier ds_read2st64_b64 v[37:40], v129 offset1:1 ds_read2st64_b64 v[41:44], v129 offset0:2 offset1:3 ; wave barrier s_waitcnt lgkmcnt(4) v_add_f64 v[49:50], v[29:30], v[33:34] v_add_f64 v[29:30], v[29:30], -v[33:34] s_waitcnt lgkmcnt(0) v_add_f64 v[45:46], v[37:38], v[41:42] v_add_f64 v[37:38], v[37:38], -v[41:42] v_add_f64 v[41:42], v[31:32], -v[35:36] v_add_f64 v[47:48], v[39:40], v[43:44] v_add_f64 v[31:32], v[31:32], v[35:36] v_add_f64 v[33:34], v[39:40], -v[43:44] 
v_add_f64 v[35:36], v[37:38], -v[41:42] v_add_f64 v[39:40], v[45:46], -v[47:48] v_add_f64 v[37:38], v[37:38], v[41:42] v_add_f64 v[43:44], v[29:30], -v[33:34] v_add_f64 v[29:30], v[29:30], v[33:34] v_add_f64 v[41:42], v[49:50], -v[31:32] v_add_f64 v[31:32], v[49:50], v[31:32] v_add_f64 v[45:46], v[45:46], v[47:48] v_mul_f64 v[33:34], v[11:12], -v[35:36] v_mul_f64 v[35:36], v[9:10], v[35:36] v_mul_f64 v[51:52], v[15:16], -v[39:40] v_mul_f64 v[53:54], v[101:102], -v[37:38] v_mul_f64 v[39:40], v[13:14], v[39:40] v_mul_f64 v[37:38], v[103:104], v[37:38] v_fma_f64 v[33:34], v[29:30], v[9:10], v[33:34] v_fma_f64 v[47:48], v[29:30], v[11:12], v[35:36] v_fma_f64 v[49:50], v[41:42], v[13:14], v[51:52] v_fma_f64 v[51:52], v[43:44], v[103:104], v[53:54] v_fma_f64 v[39:40], v[41:42], v[15:16], v[39:40] v_fma_f64 v[37:38], v[43:44], v[101:102], v[37:38] ds_write2_b64 v133, v[31:32], v[33:34] offset1:4 ds_write2_b64 v133, v[49:50], v[51:52] offset0:8 offset1:12 ; wave barrier ds_read2st64_b64 v[29:32], v129 offset1:1 ds_read2st64_b64 v[33:36], v129 offset0:2 offset1:3 ; wave barrier ds_write2_b64 v133, v[45:46], v[47:48] offset1:4 ds_write2_b64 v133, v[39:40], v[37:38] offset0:8 offset1:12 ; wave barrier ds_read2st64_b64 v[37:40], v129 offset1:1 ds_read2st64_b64 v[41:44], v129 offset0:2 offset1:3 v_add_f64 v[45:46], v[65:66], v[25:26] v_add_f64 v[51:52], v[67:68], v[27:28] s_waitcnt lgkmcnt(4) v_add_f64 v[53:54], v[29:30], v[33:34] v_add_f64 v[29:30], v[29:30], -v[33:34] s_waitcnt lgkmcnt(0) v_add_f64 v[47:48], v[37:38], v[41:42] v_add_f64 v[37:38], v[37:38], -v[41:42] v_add_f64 v[41:42], v[31:32], -v[35:36] v_add_f64 v[49:50], v[39:40], v[43:44] v_add_f64 v[31:32], v[31:32], v[35:36] v_add_f64 v[33:34], v[39:40], -v[43:44] v_add_f64 v[27:28], v[27:28], -v[67:68] v_add_f64 v[43:44], v[61:62], v[85:86] ; wave barrier v_add_f64 v[35:36], v[37:38], -v[41:42] v_add_f64 v[39:40], v[47:48], -v[49:50] v_add_f64 v[37:38], v[37:38], v[41:42] v_add_f64 v[41:42], v[25:26], -v[65:66] 
v_add_f64 v[25:26], v[53:54], -v[31:32] v_add_f64 v[55:56], v[29:30], -v[33:34] v_add_f64 v[29:30], v[29:30], v[33:34] v_add_f64 v[31:32], v[53:54], v[31:32] v_mul_f64 v[33:34], v[19:20], -v[35:36] v_mul_f64 v[57:58], v[23:24], -v[39:40] v_mul_f64 v[59:60], v[105:106], -v[37:38] v_mul_f64 v[35:36], v[17:18], v[35:36] v_add_f64 v[65:66], v[63:64], v[87:88] v_mul_f64 v[39:40], v[21:22], v[39:40] v_mul_f64 v[37:38], v[107:108], v[37:38] v_add_f64 v[47:48], v[47:48], v[49:50] v_fma_f64 v[33:34], v[29:30], v[17:18], v[33:34] v_fma_f64 v[53:54], v[25:26], v[21:22], v[57:58] v_fma_f64 v[57:58], v[55:56], v[107:108], v[59:60] v_add_f64 v[59:60], v[85:86], -v[61:62] v_add_f64 v[61:62], v[87:88], -v[63:64] v_fma_f64 v[35:36], v[29:30], v[19:20], v[35:36] ds_write2_b64 v132, v[31:32], v[33:34] offset1:16 ds_write2_b64 v132, v[53:54], v[57:58] offset0:32 offset1:48 v_add_f64 v[33:34], v[51:52], -v[65:66] v_add_f64 v[49:50], v[27:28], -v[59:60] v_add_f64 v[53:54], v[59:60], v[27:28] v_fma_f64 v[39:40], v[25:26], v[23:24], v[39:40] v_fma_f64 v[37:38], v[55:56], v[105:106], v[37:38] ; wave barrier ds_read2st64_b64 v[25:28], v129 offset1:1 ds_read2st64_b64 v[29:32], v129 offset0:2 offset1:3 ; wave barrier ds_write2_b64 v132, v[47:48], v[35:36] offset1:16 v_add_f64 v[55:56], v[41:42], -v[61:62] v_add_f64 v[41:42], v[61:62], v[41:42] v_mul_f64 v[35:36], v[3:4], -v[49:50] v_mul_f64 v[49:50], v[1:2], v[49:50] v_add_f64 v[47:48], v[45:46], -v[43:44] v_mul_f64 v[57:58], v[7:8], -v[33:34] v_mul_f64 v[59:60], v[97:98], -v[53:54] v_mul_f64 v[61:62], v[5:6], v[33:34] v_mul_f64 v[53:54], v[99:100], v[53:54] v_add_f64 v[33:34], v[43:44], v[45:46] v_fma_f64 v[35:36], v[41:42], v[1:2], v[35:36] v_add_f64 v[1:2], v[65:66], v[51:52] v_fma_f64 v[3:4], v[41:42], v[3:4], v[49:50] ds_write2_b64 v132, v[39:40], v[37:38] offset0:32 offset1:48 v_fma_f64 v[37:38], v[47:48], v[5:6], v[57:58] v_fma_f64 v[39:40], v[55:56], v[99:100], v[59:60] v_fma_f64 v[5:6], v[47:48], v[7:8], v[61:62] v_fma_f64 v[7:8], 
v[55:56], v[97:98], v[53:54] ; wave barrier ds_read2st64_b64 v[41:44], v129 offset1:1 ds_read2st64_b64 v[45:48], v129 offset0:2 offset1:3 ; wave barrier ds_write_b128 v130, v[33:36] ds_write_b128 v130, v[37:40] offset:16 ; wave barrier ds_read2st64_b64 v[33:36], v129 offset1:1 ds_read2st64_b64 v[37:40], v129 offset0:2 offset1:3 ; wave barrier ds_write_b128 v130, v[1:4] ds_write_b128 v130, v[5:8] offset:16 ; wave barrier ds_read2st64_b64 v[1:4], v129 offset1:1 ds_read2st64_b64 v[5:8], v129 offset0:2 offset1:3 ; wave barrier s_waitcnt lgkmcnt(4) v_add_f64 v[53:54], v[33:34], v[37:38] v_add_f64 v[33:34], v[33:34], -v[37:38] s_waitcnt lgkmcnt(0) v_add_f64 v[49:50], v[1:2], v[5:6] v_add_f64 v[1:2], v[1:2], -v[5:6] v_add_f64 v[5:6], v[35:36], -v[39:40] v_add_f64 v[51:52], v[3:4], v[7:8] v_add_f64 v[3:4], v[3:4], -v[7:8] v_add_f64 v[35:36], v[35:36], v[39:40] v_add_f64 v[7:8], v[1:2], -v[5:6] v_add_f64 v[37:38], v[49:50], -v[51:52] v_add_f64 v[1:2], v[1:2], v[5:6] v_add_f64 v[39:40], v[33:34], -v[3:4] v_add_f64 v[3:4], v[33:34], v[3:4] v_add_f64 v[5:6], v[53:54], -v[35:36] v_add_f64 v[35:36], v[53:54], v[35:36] v_add_f64 v[49:50], v[49:50], v[51:52] v_mul_f64 v[33:34], v[11:12], -v[7:8] v_mul_f64 v[7:8], v[9:10], v[7:8] v_mul_f64 v[55:56], v[15:16], -v[37:38] v_mul_f64 v[57:58], v[101:102], -v[1:2] v_mul_f64 v[37:38], v[13:14], v[37:38] v_mul_f64 v[1:2], v[103:104], v[1:2] v_fma_f64 v[9:10], v[3:4], v[9:10], v[33:34] v_fma_f64 v[11:12], v[3:4], v[11:12], v[7:8] v_fma_f64 v[13:14], v[5:6], v[13:14], v[55:56] v_fma_f64 v[33:34], v[39:40], v[103:104], v[57:58] v_fma_f64 v[15:16], v[5:6], v[15:16], v[37:38] v_fma_f64 v[37:38], v[39:40], v[101:102], v[1:2] ds_write2_b64 v133, v[35:36], v[9:10] offset1:4 ds_write2_b64 v133, v[13:14], v[33:34] offset0:8 offset1:12 ; wave barrier ds_read2st64_b64 v[1:4], v129 offset1:1 ds_read2st64_b64 v[5:8], v129 offset0:2 offset1:3 ; wave barrier ds_write2_b64 v133, v[49:50], v[11:12] offset1:4 ds_write2_b64 v133, v[15:16], v[37:38] offset0:8 
offset1:12 ; wave barrier ds_read2st64_b64 v[9:12], v129 offset1:1 ds_read2st64_b64 v[13:16], v129 offset0:2 offset1:3 v_add_f64 v[33:34], v[41:42], v[45:46] v_add_f64 v[37:38], v[41:42], -v[45:46] v_add_f64 v[35:36], v[25:26], v[29:30] v_add_f64 v[29:30], v[25:26], -v[29:30] s_waitcnt lgkmcnt(0) v_add_f64 v[39:40], v[9:10], v[13:14] v_add_f64 v[9:10], v[9:10], -v[13:14] v_add_f64 v[13:14], v[3:4], -v[7:8] v_add_f64 v[41:42], v[11:12], v[15:16] v_add_f64 v[25:26], v[1:2], v[5:6] v_add_f64 v[1:2], v[1:2], -v[5:6] v_add_f64 v[3:4], v[3:4], v[7:8] v_add_f64 v[5:6], v[11:12], -v[15:16] v_add_f64 v[49:50], v[27:28], v[31:32] v_add_f64 v[31:32], v[27:28], -v[31:32] v_add_f64 v[7:8], v[9:10], -v[13:14] v_add_f64 v[11:12], v[39:40], -v[41:42] v_add_f64 v[9:10], v[9:10], v[13:14] v_add_f64 v[45:46], v[43:44], v[47:48] v_add_f64 v[13:14], v[25:26], -v[3:4] v_add_f64 v[15:16], v[1:2], -v[5:6] v_add_f64 v[1:2], v[1:2], v[5:6] v_add_f64 v[25:26], v[25:26], v[3:4] v_mul_f64 v[5:6], v[19:20], -v[7:8] v_mul_f64 v[27:28], v[23:24], -v[11:12] v_mul_f64 v[7:8], v[17:18], v[7:8] v_mul_f64 v[51:52], v[105:106], -v[9:10] v_mul_f64 v[11:12], v[21:22], v[11:12] v_mul_f64 v[9:10], v[107:108], v[9:10] ; wave barrier v_add_f64 v[3:4], v[33:34], v[45:46] v_add_f64 v[43:44], v[43:44], -v[47:48] v_fma_f64 v[5:6], v[1:2], v[17:18], v[5:6] v_fma_f64 v[17:18], v[13:14], v[21:22], v[27:28] v_add_f64 v[27:28], v[39:40], v[41:42] v_fma_f64 v[19:20], v[1:2], v[19:20], v[7:8] v_fma_f64 v[21:22], v[15:16], v[107:108], v[51:52] v_fma_f64 v[13:14], v[13:14], v[23:24], v[11:12] v_fma_f64 v[15:16], v[15:16], v[105:106], v[9:10] ds_write2_b64 v132, v[25:26], v[5:6] offset1:16 ds_write2_b64 v132, v[17:18], v[21:22] offset0:32 offset1:48 ; wave barrier ds_read2st64_b64 v[5:8], v129 offset1:1 ds_read2st64_b64 v[9:12], v129 offset0:2 offset1:3 ; wave barrier ds_write2_b64 v132, v[27:28], v[19:20] offset1:16 ds_write2_b64 v132, v[13:14], v[15:16] offset0:32 offset1:48 ; wave barrier ds_read2st64_b64 v[13:16], 
v129 offset1:1 ds_read2st64_b64 v[17:20], v129 offset0:2 offset1:3 v_add_f64 v[23:24], v[33:34], -v[45:46] v_add_f64 v[1:2], v[35:36], v[49:50] v_add_f64 v[21:22], v[35:36], -v[49:50] s_waitcnt lgkmcnt(4) v_add_f64 v[35:36], v[5:6], v[9:10] s_waitcnt lgkmcnt(0) v_add_f64 v[33:34], v[13:14], v[17:18] v_add_f64 v[41:42], v[15:16], v[19:20] v_add_f64 v[39:40], v[7:8], v[11:12] v_add_f64 v[45:46], v[7:8], -v[11:12] v_add_f64 v[25:26], v[29:30], -v[43:44] v_add_f64 v[29:30], v[29:30], v[43:44] v_add_f64 v[27:28], v[37:38], v[31:32] v_add_f64 v[43:44], v[5:6], -v[9:10] v_add_f64 v[31:32], v[37:38], -v[31:32] v_add_f64 v[7:8], v[33:34], v[41:42] v_add_f64 v[11:12], v[33:34], -v[41:42] v_or_b32_e32 v33, s2, v0 v_mov_b32_e32 v34, 0 v_add_f64 v[5:6], v[35:36], v[39:40] v_add_f64 v[9:10], v[35:36], -v[39:40] v_lshlrev_b64 v[35:36], 4, v[33:34] v_mov_b32_e32 v37, s1 v_add_co_u32_e32 v35, vcc, s0, v35 v_addc_co_u32_e32 v36, vcc, v37, v36, vcc v_or_b32_e32 v33, s2, v131 global_store_dwordx4 v[35:36], v[1:4], off v_add_f64 v[15:16], v[15:16], -v[19:20] v_lshlrev_b64 v[1:2], 4, v[33:34] v_or_b32_e32 v3, 0x80, v0 v_add_co_u32_e32 v1, vcc, s0, v1 v_addc_co_u32_e32 v2, vcc, v37, v2, vcc v_or_b32_e32 v33, s2, v3 v_add_f64 v[19:20], v[13:14], -v[17:18] global_store_dwordx4 v[1:2], v[29:32], off v_lshlrev_b64 v[1:2], 4, v[33:34] v_or_b32_e32 v4, 0xc0, v0 v_add_co_u32_e32 v1, vcc, s0, v1 v_addc_co_u32_e32 v2, vcc, v37, v2, vcc v_or_b32_e32 v33, s2, v4 global_store_dwordx4 v[1:2], v[21:24], off v_lshlrev_b64 v[1:2], 4, v[33:34] s_and_b32 s1, s3, 0xffffff00 v_add_co_u32_e32 v1, vcc, s0, v1 v_addc_co_u32_e32 v2, vcc, v37, v2, vcc v_or_b32_e32 v33, s1, v0 v_add_f64 v[13:14], v[43:44], -v[15:16] v_add_f64 v[17:18], v[43:44], v[15:16] v_add_f64 v[15:16], v[19:20], v[45:46] v_add_f64 v[19:20], v[19:20], -v[45:46] global_store_dwordx4 v[1:2], v[25:28], off v_lshlrev_b64 v[0:1], 4, v[33:34] v_or_b32_e32 v33, s1, v131 v_add_co_u32_e32 v0, vcc, s0, v0 v_addc_co_u32_e32 v1, vcc, v37, v1, vcc 
; ---------------------------------------------------------------------------
; NOTE(review): the line breaks of this listing appear to have been collapsed;
; the text below is left exactly as found. It contains, in order:
;
;  1. The end of kernel tailMul: the last four 16-byte global stores
;     (v[5:8], v[17:20], v[9:12], v[13:16]). Each address is formed as
;     (v[33:34] << 4) + {v37:s0}: v_lshlrev_b64 scales the element index by 16,
;     then v_add_co/v_addc add s0 (low dword) and v37 (high dword) with carry.
;     Between stores v33 is re-formed as s1|v3 and s1|v4 for the next index.
;     s_endpgm then terminates the wave.
;
;  2. The kernel descriptor for tailMul, placed in .rodata with 64-byte
;     alignment (.p2align 6):
;       - 4096 bytes of group segment (LDS), 0 bytes of private segment,
;         32 bytes of kernel arguments;
;       - 6 user SGPRs, with the private segment buffer and the kernarg
;         segment pointer enabled (matches the code's use of s[4:5] as the
;         kernarg base and s6 as the workgroup id -- presumably s[0:3] is the
;         unused private segment buffer; confirm against the ABI);
;       - only workgroup_id_x is requested; workitem id setting is 0;
;       - next free VGPR 178, next free SGPR 23; no flat scratch, no xnack mask;
;       - float round modes 0, denorm modes 3 for both 32-bit and 16/64-bit
;         (per the LLVM AMDGPU usage guide: round-to-nearest-even, denormals
;         not flushed), dx10_clamp 1, ieee_mode 1;
;       - every floating-point / integer exception enable is 0.
;
;  3. Back in .text: local label .Lfunc_end0 and the .size directive that
;     records the symbol size of tailMul as .Lfunc_end0 - tailMul.
;
;  4. The start of the compiler's "Kernel info" summary
;     (codeLenInByte = 10096); it continues on the following line.
; ---------------------------------------------------------------------------
global_store_dwordx4 v[0:1], v[5:8], off v_lshlrev_b64 v[0:1], 4, v[33:34] v_or_b32_e32 v33, s1, v3 v_add_co_u32_e32 v0, vcc, s0, v0 v_addc_co_u32_e32 v1, vcc, v37, v1, vcc global_store_dwordx4 v[0:1], v[17:20], off v_lshlrev_b64 v[0:1], 4, v[33:34] v_or_b32_e32 v33, s1, v4 v_add_co_u32_e32 v0, vcc, s0, v0 v_addc_co_u32_e32 v1, vcc, v37, v1, vcc global_store_dwordx4 v[0:1], v[9:12], off v_lshlrev_b64 v[0:1], 4, v[33:34] v_add_co_u32_e32 v0, vcc, s0, v0 v_addc_co_u32_e32 v1, vcc, v37, v1, vcc global_store_dwordx4 v[0:1], v[13:16], off s_endpgm .section .rodata,"a",@progbits .p2align 6, 0x0 .amdhsa_kernel tailMul .amdhsa_group_segment_fixed_size 4096 .amdhsa_private_segment_fixed_size 0 .amdhsa_kernarg_size 32 .amdhsa_user_sgpr_count 6 .amdhsa_user_sgpr_private_segment_buffer 1 .amdhsa_user_sgpr_dispatch_ptr 0 .amdhsa_user_sgpr_queue_ptr 0 .amdhsa_user_sgpr_kernarg_segment_ptr 1 .amdhsa_user_sgpr_dispatch_id 0 .amdhsa_user_sgpr_flat_scratch_init 0 .amdhsa_user_sgpr_private_segment_size 0 .amdhsa_uses_dynamic_stack 0 .amdhsa_system_sgpr_private_segment_wavefront_offset 0 .amdhsa_system_sgpr_workgroup_id_x 1 .amdhsa_system_sgpr_workgroup_id_y 0 .amdhsa_system_sgpr_workgroup_id_z 0 .amdhsa_system_sgpr_workgroup_info 0 .amdhsa_system_vgpr_workitem_id 0 .amdhsa_next_free_vgpr 178 .amdhsa_next_free_sgpr 23 .amdhsa_reserve_flat_scratch 0 .amdhsa_reserve_xnack_mask 0 .amdhsa_float_round_mode_32 0 .amdhsa_float_round_mode_16_64 0 .amdhsa_float_denorm_mode_32 3 .amdhsa_float_denorm_mode_16_64 3 .amdhsa_dx10_clamp 1 .amdhsa_ieee_mode 1 .amdhsa_fp16_overflow 0 .amdhsa_exception_fp_ieee_invalid_op 0 .amdhsa_exception_fp_denorm_src 0 .amdhsa_exception_fp_ieee_div_zero 0 .amdhsa_exception_fp_ieee_overflow 0 .amdhsa_exception_fp_ieee_underflow 0 .amdhsa_exception_fp_ieee_inexact 0 .amdhsa_exception_int_div_zero 0 .end_amdhsa_kernel .text .Lfunc_end0: .size tailMul, .Lfunc_end0-tailMul ; -- End function .section .AMDGPU.csdata,"",@progbits ; Kernel info: ; codeLenInByte = 10096 ;
; ---------------------------------------------------------------------------
; NOTE(review): line breaks appear to have been collapsed here as well; the
; text below is left exactly as found. It contains, in order:
;
;  1. The rest of the compiler's per-kernel statistics: 25 SGPRs, 178 VGPRs,
;     ScratchSize 0 (SCRATCH_EN 0), 4096 bytes of LDS per workgroup,
;     Occupancy 1, 6 user SGPRs, and only TGID_X enabled.
;
;  2. Toolchain bookkeeping: .ident (AMD clang 18.0.0git, roc-6.2.0), an empty
;     .note.GNU-stack section, and .addrsig.
;
;  3. The code-object metadata document between .amdgpu_metadata and
;     .end_amdgpu_metadata (the "---" ... "..." markers delimit it).
;     It describes one kernel:
;       name tailMul, descriptor symbol tailMul.kd, language OpenCL C 2.0;
;       arguments, each an 8-byte global_buffer of type 'T2*':
;         offset  0  out        write_only, restrict
;         offset  8  in         read_only, const, restrict
;         offset 16  a          read_only, const, restrict
;         offset 24  smallTrig  constant address space, const
;       kernarg segment: 32 bytes, 8-byte aligned;
;       group segment 4096 bytes, private segment 0 bytes;
;       required workgroup size 64 x 1 x 1 (max flat workgroup size 64);
;       wavefront size 64; 25 SGPRs / 178 VGPRs with no SGPR or VGPR spills;
;       no dynamic stack.
;     Target: amdgcn-amd-amdhsa--gfx906:sramecc+:xnack-, metadata version 1.2.
; ---------------------------------------------------------------------------
NumSgprs: 25 ; NumVgprs: 178 ; ScratchSize: 0 ; MemoryBound: 0 ; FloatMode: 240 ; IeeeMode: 1 ; LDSByteSize: 4096 bytes/workgroup (compile time only) ; SGPRBlocks: 3 ; VGPRBlocks: 44 ; NumSGPRsForWavesPerEU: 25 ; NumVGPRsForWavesPerEU: 178 ; Occupancy: 1 ; WaveLimiterHint : 1 ; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0 ; COMPUTE_PGM_RSRC2:USER_SGPR: 6 ; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0 ; COMPUTE_PGM_RSRC2:TGID_X_EN: 1 ; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 ; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 ; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 .ident "AMD clang version 18.0.0git (https://github.com/RadeonOpenCompute/llvm-project roc-6.2.0 24292 26466ce804ac523b398608f17388eb6d605a3f09)" .section ".note.GNU-stack","",@progbits .addrsig .amdgpu_metadata --- amdhsa.kernels: - .args: - .actual_access: write_only .address_space: global .is_restrict: true .name: out .offset: 0 .size: 8 .type_name: 'T2*' .value_kind: global_buffer - .actual_access: read_only .address_space: global .is_const: true .is_restrict: true .name: in .offset: 8 .size: 8 .type_name: 'T2*' .value_kind: global_buffer - .actual_access: read_only .address_space: global .is_const: true .is_restrict: true .name: a .offset: 16 .size: 8 .type_name: 'T2*' .value_kind: global_buffer - .address_space: constant .is_const: true .name: smallTrig .offset: 24 .size: 8 .type_name: 'T2*' .value_kind: global_buffer .group_segment_fixed_size: 4096 .kernarg_segment_align: 8 .kernarg_segment_size: 32 .language: OpenCL C .language_version: - 2 - 0 .max_flat_workgroup_size: 64 .name: tailMul .private_segment_fixed_size: 0 .reqd_workgroup_size: - 64 - 1 - 1 .sgpr_count: 25 .sgpr_spill_count: 0 .symbol: tailMul.kd .uses_dynamic_stack: false .vgpr_count: 178 .vgpr_spill_count: 0 .wavefront_size: 64 amdhsa.target: 'amdgcn-amd-amdhsa--gfx906:sramecc+:xnack-' amdhsa.version: - 1 - 2 ... .end_amdgpu_metadata