.text .amdgcn_target "amdgcn-amd-amdhsa--gfx906:sramecc+:xnack-" .protected tailMul ; -- Begin function tailMul .globl tailMul .p2align 8 .type tailMul,@function tailMul: ; @tailMul ; %bb.0: s_sub_i32 s7, 0x200, s6 s_cmp_eq_u32 s6, 0 s_load_dwordx4 s[0:3], s[4:5], 0x8 s_load_dwordx2 s[10:11], s[4:5], 0x18 s_cselect_b64 s[8:9], -1, 0 s_and_b64 s[12:13], s[8:9], exec s_mov_b32 s15, 0 s_cselect_b32 s7, 0x100, s7 s_and_b32 s14, s6, 0xffffff00 s_lshl_b64 s[12:13], s[14:15], 4 s_waitcnt lgkmcnt(0) s_add_u32 s14, s0, s12 s_addc_u32 s17, s1, s13 s_lshl_b32 s16, s6, 7 s_and_b32 s16, s16, 0xf80 s_add_u32 s14, s14, s16 s_addc_u32 s18, s17, 0 s_lshl_b32 s17, s6, 13 v_lshl_or_b32 v1, v0, 6, v0 s_and_b32 s17, s17, 0x1c0000 v_and_b32_e32 v1, 0xe07, v1 v_mov_b32_e32 v2, 0 s_add_u32 s14, s14, s17 v_lshlrev_b64 v[25:26], 4, v[1:2] s_addc_u32 s18, s18, 0 v_mov_b32_e32 v2, s18 v_add_co_u32_e32 v1, vcc, s14, v25 v_addc_co_u32_e32 v2, vcc, v2, v26, vcc s_mov_b32 s18, 0x10000 v_add_co_u32_e32 v3, vcc, s18, v1 v_addc_co_u32_e32 v4, vcc, 0, v2, vcc s_mov_b32 s19, 0x20000 global_load_dwordx4 v[9:12], v[1:2], off global_load_dwordx4 v[13:16], v[3:4], off v_add_co_u32_e32 v3, vcc, s19, v1 v_addc_co_u32_e32 v4, vcc, 0, v2, vcc s_mov_b32 s20, 0x30000 v_add_co_u32_e32 v1, vcc, s20, v1 v_addc_co_u32_e32 v2, vcc, 0, v2, vcc global_load_dwordx4 v[17:20], v[3:4], off global_load_dwordx4 v[21:24], v[1:2], off v_lshlrev_b32_e32 v134, 4, v0 global_load_dwordx4 v[1:4], v134, s[10:11] global_load_dwordx4 v[5:8], v134, s[10:11] offset:1024 s_and_b32 s14, s7, 0xffffff00 s_lshl_b64 s[14:15], s[14:15], 4 s_add_u32 s21, s0, s14 s_addc_u32 s22, s1, s15 s_lshl_b32 s0, s7, 7 s_and_b32 s0, s0, 0xf80 s_add_u32 s1, s21, s0 s_addc_u32 s21, s22, 0 s_lshl_b32 s22, s7, 13 s_and_b32 s22, s22, 0x1c0000 s_add_u32 s1, s1, s22 s_addc_u32 s21, s21, 0 v_mov_b32_e32 v27, s21 v_add_co_u32_e32 v35, vcc, s1, v25 v_addc_co_u32_e32 v36, vcc, v27, v26, vcc v_add_co_u32_e32 v31, vcc, s18, v35 s_add_u32 s1, s2, s12 v_addc_co_u32_e32 
v32, vcc, 0, v36, vcc s_addc_u32 s12, s3, s13 v_add_co_u32_e32 v37, vcc, s19, v35 s_add_u32 s1, s1, s16 v_addc_co_u32_e32 v38, vcc, 0, v36, vcc s_addc_u32 s12, s12, 0 v_add_co_u32_e32 v39, vcc, s20, v35 s_add_u32 s1, s1, s17 v_addc_co_u32_e32 v40, vcc, 0, v36, vcc s_addc_u32 s12, s12, 0 v_mov_b32_e32 v43, s12 v_add_co_u32_e32 v51, vcc, s1, v25 v_addc_co_u32_e32 v52, vcc, v43, v26, vcc v_add_co_u32_e32 v47, vcc, s18, v51 s_add_u32 s1, s2, s14 v_addc_co_u32_e32 v48, vcc, 0, v52, vcc s_addc_u32 s2, s3, s15 v_add_co_u32_e32 v53, vcc, s19, v51 s_add_u32 s0, s1, s0 v_addc_co_u32_e32 v54, vcc, 0, v52, vcc s_addc_u32 s1, s2, 0 v_add_co_u32_e32 v55, vcc, s20, v51 s_add_u32 s0, s0, s22 v_addc_co_u32_e32 v56, vcc, 0, v52, vcc s_addc_u32 s1, s1, 0 v_mov_b32_e32 v59, s1 v_add_co_u32_e32 v25, vcc, s0, v25 v_addc_co_u32_e32 v26, vcc, v59, v26, vcc v_add_co_u32_e32 v63, vcc, s18, v25 v_addc_co_u32_e32 v64, vcc, 0, v26, vcc global_load_dwordx4 v[27:30], v[35:36], off global_load_dwordx4 v[31:34], v[31:32], off global_load_dwordx4 v[35:38], v[37:38], off global_load_dwordx4 v[39:42], v[39:40], off global_load_dwordx4 v[43:46], v[51:52], off global_load_dwordx4 v[47:50], v[47:48], off global_load_dwordx4 v[51:54], v[53:54], off global_load_dwordx4 v[55:58], v[55:56], off global_load_dwordx4 v[59:62], v[25:26], off global_load_dwordx4 v[63:66], v[63:64], off v_lshlrev_b32_e32 v129, 3, v0 v_and_b32_e32 v91, 60, v0 v_and_b32_e32 v92, 3, v0 v_and_b32_e32 v93, 48, v0 s_waitcnt vmcnt(13) v_add_f64 v[75:76], v[9:10], v[17:18] v_add_f64 v[77:78], v[11:12], v[19:20] v_add_f64 v[11:12], v[11:12], -v[19:20] v_add_f64 v[9:10], v[9:10], -v[17:18] s_waitcnt vmcnt(12) v_add_f64 v[17:18], v[13:14], v[21:22] v_add_f64 v[13:14], v[13:14], -v[21:22] v_add_f64 v[79:80], v[15:16], v[23:24] s_waitcnt vmcnt(10) v_mul_f64 v[21:22], v[1:2], v[7:8] v_mul_f64 v[69:70], v[3:4], -v[7:8] v_add_co_u32_e32 v19, vcc, s19, v25 v_addc_co_u32_e32 v20, vcc, 0, v26, vcc v_add_f64 v[15:16], v[15:16], -v[23:24] 
v_add_co_u32_e32 v23, vcc, s20, v25 v_add_f64 v[67:68], v[11:12], -v[13:14] v_addc_co_u32_e32 v24, vcc, 0, v26, vcc v_add_f64 v[25:26], v[77:78], -v[79:80] v_add_f64 v[11:12], v[11:12], v[13:14] v_fma_f64 v[99:100], v[5:6], v[3:4], v[21:22] v_fma_f64 v[101:102], v[5:6], v[1:2], v[69:70] v_add_f64 v[13:14], v[9:10], v[15:16] v_mul_f64 v[81:82], v[3:4], -v[67:68] v_add_f64 v[21:22], v[75:76], -v[17:18] v_mul_f64 v[83:84], v[67:68], v[1:2] v_add_f64 v[85:86], v[9:10], -v[15:16] v_mul_f64 v[87:88], v[7:8], -v[25:26] v_mul_f64 v[89:90], v[99:100], -v[11:12] global_load_dwordx4 v[67:70], v[19:20], off global_load_dwordx4 v[71:74], v[23:24], off v_mul_f64 v[23:24], v[25:26], v[5:6] v_mul_f64 v[25:26], v[11:12], v[101:102] v_add_f64 v[9:10], v[75:76], v[17:18] v_fma_f64 v[11:12], v[13:14], v[1:2], v[81:82] v_fma_f64 v[15:16], v[13:14], v[3:4], v[83:84] v_fma_f64 v[17:18], v[21:22], v[5:6], v[87:88] v_fma_f64 v[19:20], v[85:86], v[101:102], v[89:90] v_add_f64 v[13:14], v[77:78], v[79:80] v_fma_f64 v[21:22], v[21:22], v[7:8], v[23:24] v_fma_f64 v[23:24], v[85:86], v[99:100], v[25:26] v_mul_u32_u24_e32 v25, 3, v0 v_lshl_add_u32 v132, v25, 3, v129 ds_write_b128 v132, v[9:12] ds_write_b128 v132, v[17:20] offset:16 v_mul_i32_i24_e32 v9, -3, v0 ; wave barrier v_lshl_add_u32 v130, v9, 3, v132 ds_read_b64 v[25:26], v129 ds_read_b64 v[75:76], v130 offset:1536 ds_read2st64_b64 v[17:20], v130 offset0:1 offset1:2 ; wave barrier ds_write_b128 v132, v[13:16] ds_write_b128 v132, v[21:24] offset:16 v_lshlrev_b32_e32 v13, 4, v91 ; wave barrier ds_read_b64 v[77:78], v129 ds_read_b64 v[79:80], v130 offset:1536 ds_read2st64_b64 v[21:24], v130 offset0:1 offset1:2 ; wave barrier global_load_dwordx4 v[9:12], v13, s[10:11] global_load_dwordx4 v[13:16], v13, s[10:11] offset:1024 s_waitcnt lgkmcnt(5) v_add_f64 v[81:82], v[17:18], -v[75:76] v_add_f64 v[17:18], v[17:18], v[75:76] v_add_f64 v[85:86], v[25:26], v[19:20] s_waitcnt lgkmcnt(0) v_add_f64 v[83:84], v[77:78], -v[23:24] v_add_f64 v[23:24], 
v[77:78], v[23:24] v_add_f64 v[77:78], v[21:22], v[79:80] v_add_f64 v[19:20], v[25:26], -v[19:20] v_add_f64 v[21:22], v[21:22], -v[79:80] v_lshlrev_b32_e32 v92, 3, v92 v_lshlrev_b32_e32 v94, 4, v93 v_lshl_or_b32 v133, v91, 5, v92 v_add_f64 v[25:26], v[83:84], -v[81:82] v_add_f64 v[79:80], v[83:84], v[81:82] v_add_f64 v[81:82], v[23:24], -v[77:78] v_add_f64 v[23:24], v[23:24], v[77:78] v_add_f64 v[83:84], v[19:20], v[21:22] v_add_f64 v[19:20], v[19:20], -v[21:22] s_movk_i32 s0, 0x4000 s_waitcnt vmcnt(11) v_add_f64 v[91:92], v[27:28], v[35:36] v_add_f64 v[27:28], v[27:28], -v[35:36] s_waitcnt vmcnt(10) v_add_f64 v[35:36], v[31:32], v[39:40] v_add_f64 v[31:32], v[31:32], -v[39:40] s_waitcnt vmcnt(7) v_add_f64 v[95:96], v[45:46], v[53:54] v_add_f64 v[53:54], v[45:46], -v[53:54] s_waitcnt vmcnt(6) v_add_f64 v[109:110], v[47:48], v[55:56] v_add_f64 v[111:112], v[49:50], v[57:58] v_add_f64 v[55:56], v[47:48], -v[55:56] v_add_f64 v[49:50], v[49:50], -v[57:58] v_add_f64 v[45:46], v[91:92], -v[35:36] s_waitcnt vmcnt(1) v_mul_f64 v[21:22], v[11:12], -v[25:26] s_waitcnt vmcnt(0) v_mul_f64 v[87:88], v[9:10], v[15:16] v_mul_f64 v[75:76], v[11:12], -v[15:16] v_mul_f64 v[25:26], v[9:10], v[25:26] v_fma_f64 v[21:22], v[83:84], v[9:10], v[21:22] v_fma_f64 v[97:98], v[13:14], v[11:12], v[87:88] v_fma_f64 v[103:104], v[13:14], v[9:10], v[75:76] v_add_f64 v[87:88], v[85:86], -v[17:18] v_mul_f64 v[75:76], v[15:16], -v[81:82] v_mul_f64 v[81:82], v[81:82], v[13:14] v_add_f64 v[17:18], v[85:86], v[17:18] v_fma_f64 v[25:26], v[83:84], v[11:12], v[25:26] v_mul_f64 v[89:90], v[97:98], -v[79:80] v_mul_f64 v[79:80], v[79:80], v[103:104] v_fma_f64 v[75:76], v[87:88], v[13:14], v[75:76] v_fma_f64 v[81:82], v[87:88], v[15:16], v[81:82] v_fma_f64 v[85:86], v[19:20], v[103:104], v[89:90] v_fma_f64 v[19:20], v[19:20], v[97:98], v[79:80] ds_write2_b64 v133, v[17:18], v[21:22] offset1:4 ds_write2_b64 v133, v[75:76], v[85:86] offset0:8 offset1:12 ; wave barrier ds_read_b64 v[83:84], v129 ds_read_b64 
v[85:86], v130 offset:1536 ds_read2st64_b64 v[75:78], v130 offset0:1 offset1:2 ; wave barrier ds_write2_b64 v133, v[23:24], v[25:26] offset1:4 ds_write2_b64 v133, v[81:82], v[19:20] offset0:8 offset1:12 ; wave barrier ds_read_b64 v[87:88], v129 ds_read_b64 v[89:90], v130 offset:1536 ds_read2st64_b64 v[79:82], v130 offset0:1 offset1:2 ; wave barrier global_load_dwordx4 v[17:20], v94, s[10:11] global_load_dwordx4 v[21:24], v94, s[10:11] offset:1024 v_and_b32_e32 v25, 15, v0 v_lshlrev_b32_e32 v25, 3, v25 v_lshl_or_b32 v131, v93, 5, v25 v_add_f64 v[25:26], v[29:30], v[37:38] v_add_f64 v[29:30], v[29:30], -v[37:38] v_add_f64 v[37:38], v[33:34], v[41:42] v_add_f64 v[33:34], v[33:34], -v[41:42] v_add_f64 v[93:94], v[43:44], v[51:52] v_add_f64 v[51:52], v[43:44], -v[51:52] v_add_f64 v[39:40], v[29:30], -v[31:32] v_add_f64 v[31:32], v[29:30], v[31:32] v_add_f64 v[41:42], v[25:26], -v[37:38] v_add_f64 v[25:26], v[25:26], v[37:38] v_add_f64 v[37:38], v[27:28], v[33:34] v_add_f64 v[29:30], v[91:92], v[35:36] v_add_f64 v[47:48], v[27:28], -v[33:34] v_mul_f64 v[43:44], v[39:40], v[1:2] v_mul_f64 v[39:40], v[3:4], -v[39:40] v_mul_f64 v[35:36], v[99:100], -v[31:32] v_mul_f64 v[57:58], v[31:32], v[101:102] v_mul_f64 v[33:34], v[7:8], -v[41:42] v_mul_f64 v[41:42], v[41:42], v[5:6] v_fma_f64 v[27:28], v[37:38], v[3:4], v[43:44] v_fma_f64 v[31:32], v[37:38], v[1:2], v[39:40] s_waitcnt lgkmcnt(5) v_add_f64 v[37:38], v[75:76], -v[85:86] s_waitcnt lgkmcnt(0) v_add_f64 v[39:40], v[87:88], -v[81:82] v_add_f64 v[81:82], v[87:88], v[81:82] v_add_f64 v[87:88], v[79:80], v[89:90] v_add_f64 v[75:76], v[75:76], v[85:86] v_add_f64 v[43:44], v[83:84], v[77:78] v_add_f64 v[77:78], v[83:84], -v[77:78] v_add_f64 v[79:80], v[79:80], -v[89:90] v_fma_f64 v[33:34], v[45:46], v[5:6], v[33:34] v_add_f64 v[83:84], v[39:40], -v[37:38] v_add_f64 v[37:38], v[39:40], v[37:38] v_add_f64 v[39:40], v[81:82], -v[87:88] v_fma_f64 v[35:36], v[47:48], v[101:102], v[35:36] v_add_f64 v[89:90], v[77:78], v[79:80] 
v_add_f64 v[77:78], v[77:78], -v[79:80] s_waitcnt vmcnt(1) v_mul_f64 v[79:80], v[19:20], -v[83:84] s_waitcnt vmcnt(0) v_mul_f64 v[91:92], v[17:18], v[23:24] v_mul_f64 v[85:86], v[19:20], -v[23:24] v_fma_f64 v[79:80], v[89:90], v[17:18], v[79:80] v_fma_f64 v[105:106], v[21:22], v[19:20], v[91:92] v_fma_f64 v[107:108], v[21:22], v[17:18], v[85:86] v_add_f64 v[91:92], v[43:44], -v[75:76] v_add_f64 v[43:44], v[43:44], v[75:76] v_mul_f64 v[75:76], v[17:18], v[83:84] v_mul_f64 v[85:86], v[23:24], -v[39:40] v_mul_f64 v[39:40], v[39:40], v[21:22] v_mul_f64 v[113:114], v[105:106], -v[37:38] v_mul_f64 v[83:84], v[37:38], v[107:108] v_fma_f64 v[37:38], v[45:46], v[7:8], v[41:42] v_add_f64 v[45:46], v[81:82], v[87:88] v_fma_f64 v[75:76], v[89:90], v[19:20], v[75:76] v_fma_f64 v[85:86], v[91:92], v[21:22], v[85:86] v_fma_f64 v[81:82], v[91:92], v[23:24], v[39:40] v_fma_f64 v[39:40], v[47:48], v[99:100], v[57:58] v_fma_f64 v[113:114], v[77:78], v[107:108], v[113:114] v_fma_f64 v[77:78], v[77:78], v[105:106], v[83:84] ds_write2_b64 v131, v[43:44], v[79:80] offset1:16 ds_write2_b64 v131, v[85:86], v[113:114] offset0:32 offset1:48 ; wave barrier ds_read_b64 v[79:80], v129 ds_read_b64 v[83:84], v130 offset:1536 ds_read2st64_b64 v[41:44], v130 offset0:1 offset1:2 ; wave barrier ds_write2_b64 v131, v[45:46], v[75:76] offset1:16 ds_write2_b64 v131, v[81:82], v[77:78] offset0:32 offset1:48 ; wave barrier ds_read_b64 v[81:82], v129 ds_read_b64 v[85:86], v130 offset:1536 ds_read2st64_b64 v[45:48], v130 offset0:1 offset1:2 ; wave barrier ds_write_b128 v132, v[29:32] ds_write_b128 v132, v[33:36] offset:16 ; wave barrier ds_read_b64 v[33:34], v129 ds_read_b64 v[35:36], v130 offset:1536 ds_read2st64_b64 v[29:32], v130 offset0:1 offset1:2 ; wave barrier ds_write_b128 v132, v[25:28] ds_write_b128 v132, v[37:40] offset:16 ; wave barrier ds_read_b64 v[37:38], v129 ds_read_b64 v[39:40], v130 offset:1536 ds_read2st64_b64 v[25:28], v130 offset0:1 offset1:2 v_add_f64 v[57:58], v[53:54], -v[55:56] 
v_add_f64 v[53:54], v[53:54], v[55:56] s_waitcnt lgkmcnt(5) v_add_f64 v[75:76], v[29:30], -v[35:36] v_add_f64 v[89:90], v[33:34], v[31:32] s_waitcnt lgkmcnt(0) v_add_f64 v[77:78], v[37:38], -v[27:28] v_add_f64 v[37:38], v[37:38], v[27:28] v_add_f64 v[87:88], v[25:26], v[39:40] v_add_f64 v[33:34], v[33:34], -v[31:32] v_add_f64 v[25:26], v[25:26], -v[39:40] v_add_f64 v[29:30], v[29:30], v[35:36] v_add_f64 v[31:32], v[93:94], v[109:110] v_add_f64 v[55:56], v[95:96], -v[111:112] v_add_f64 v[35:36], v[77:78], -v[75:76] v_add_f64 v[75:76], v[77:78], v[75:76] v_add_f64 v[39:40], v[37:38], -v[87:88] v_add_f64 v[77:78], v[93:94], -v[109:110] v_add_f64 v[93:94], v[33:34], v[25:26] v_add_f64 v[25:26], v[33:34], -v[25:26] v_add_f64 v[27:28], v[95:96], v[111:112] v_add_f64 v[91:92], v[51:52], v[49:50] v_mul_f64 v[33:34], v[11:12], -v[35:36] v_mul_f64 v[35:36], v[9:10], v[35:36] v_add_f64 v[95:96], v[89:90], -v[29:30] v_mul_f64 v[109:110], v[15:16], -v[39:40] v_mul_f64 v[111:112], v[97:98], -v[75:76] v_add_f64 v[51:52], v[51:52], -v[49:50] v_mul_f64 v[39:40], v[13:14], v[39:40] v_mul_f64 v[49:50], v[103:104], v[75:76] v_add_f64 v[29:30], v[89:90], v[29:30] v_fma_f64 v[33:34], v[93:94], v[9:10], v[33:34] v_add_f64 v[37:38], v[37:38], v[87:88] v_fma_f64 v[87:88], v[93:94], v[11:12], v[35:36] v_fma_f64 v[75:76], v[95:96], v[13:14], v[109:110] v_fma_f64 v[89:90], v[25:26], v[103:104], v[111:112] v_fma_f64 v[39:40], v[95:96], v[15:16], v[39:40] v_fma_f64 v[25:26], v[25:26], v[97:98], v[49:50] v_mul_f64 v[109:110], v[57:58], v[1:2] v_mul_f64 v[49:50], v[3:4], -v[57:58] ; wave barrier ds_write2_b64 v133, v[29:30], v[33:34] offset1:4 ds_write2_b64 v133, v[75:76], v[89:90] offset0:8 offset1:12 ; wave barrier ds_read_b64 v[29:30], v129 ds_read_b64 v[57:58], v130 offset:1536 ds_read2st64_b64 v[33:36], v130 offset0:1 offset1:2 ; wave barrier ds_write2_b64 v133, v[37:38], v[87:88] offset1:4 ds_write2_b64 v133, v[39:40], v[25:26] offset0:8 offset1:12 ; wave barrier ds_read_b64 v[25:26], v129 
ds_read_b64 v[75:76], v130 offset:1536 ds_read2st64_b64 v[37:40], v130 offset0:1 offset1:2 v_mul_f64 v[87:88], v[7:8], -v[55:56] v_mul_f64 v[89:90], v[99:100], -v[53:54] s_waitcnt lgkmcnt(5) v_add_f64 v[93:94], v[33:34], -v[57:58] v_add_f64 v[113:114], v[29:30], -v[35:36] s_waitcnt lgkmcnt(0) v_add_f64 v[95:96], v[25:26], -v[39:40] v_add_f64 v[25:26], v[25:26], v[39:40] v_add_f64 v[39:40], v[37:38], v[75:76] v_add_f64 v[37:38], v[37:38], -v[75:76] v_add_f64 v[111:112], v[29:30], v[35:36] v_add_f64 v[57:58], v[33:34], v[57:58] v_fma_f64 v[29:30], v[91:92], v[3:4], v[109:110] v_fma_f64 v[33:34], v[91:92], v[1:2], v[49:50] v_add_f64 v[75:76], v[95:96], -v[93:94] v_add_f64 v[93:94], v[95:96], v[93:94] v_add_f64 v[109:110], v[25:26], -v[39:40] v_fma_f64 v[35:36], v[77:78], v[5:6], v[87:88] v_add_f64 v[87:88], v[113:114], v[37:38] v_mul_f64 v[55:56], v[55:56], v[5:6] v_add_f64 v[91:92], v[111:112], -v[57:58] v_add_f64 v[95:96], v[113:114], -v[37:38] v_mul_f64 v[49:50], v[19:20], -v[75:76] v_mul_f64 v[75:76], v[17:18], v[75:76] v_mul_f64 v[113:114], v[23:24], -v[109:110] v_mul_f64 v[115:116], v[105:106], -v[93:94] v_fma_f64 v[37:38], v[51:52], v[101:102], v[89:90] v_mul_f64 v[89:90], v[21:22], v[109:110] v_mul_f64 v[93:94], v[107:108], v[93:94] v_mul_f64 v[53:54], v[53:54], v[101:102] v_add_f64 v[57:58], v[111:112], v[57:58] v_fma_f64 v[109:110], v[87:88], v[17:18], v[49:50] v_add_f64 v[25:26], v[25:26], v[39:40] v_fma_f64 v[39:40], v[87:88], v[19:20], v[75:76] v_fma_f64 v[111:112], v[91:92], v[21:22], v[113:114] v_fma_f64 v[113:114], v[95:96], v[107:108], v[115:116] v_fma_f64 v[49:50], v[77:78], v[7:8], v[55:56] v_fma_f64 v[75:76], v[91:92], v[23:24], v[89:90] v_fma_f64 v[77:78], v[95:96], v[105:106], v[93:94] v_fma_f64 v[51:52], v[51:52], v[99:100], v[53:54] ; wave barrier ds_write2_b64 v131, v[57:58], v[109:110] offset1:16 ds_write2_b64 v131, v[111:112], v[113:114] offset0:32 offset1:48 ; wave barrier ds_read_b64 v[87:88], v129 ds_read_b64 v[89:90], v130 offset:1536 
ds_read2st64_b64 v[53:56], v130 offset0:1 offset1:2 ; wave barrier ds_write2_b64 v131, v[25:26], v[39:40] offset1:16 ds_write2_b64 v131, v[75:76], v[77:78] offset0:32 offset1:48 ; wave barrier ds_read_b64 v[91:92], v129 ds_read_b64 v[93:94], v130 offset:1536 ds_read2st64_b64 v[75:78], v130 offset0:1 offset1:2 ; wave barrier ds_write_b128 v132, v[31:34] ds_write_b128 v132, v[35:38] offset:16 ; wave barrier ds_read_b64 v[35:36], v129 ds_read_b64 v[37:38], v130 offset:1536 ds_read2st64_b64 v[31:34], v130 offset0:1 offset1:2 ; wave barrier ds_write_b128 v132, v[27:30] ds_write_b128 v132, v[49:52] offset:16 ; wave barrier ds_read_b64 v[29:30], v129 ds_read2st64_b64 v[25:28], v130 offset0:1 offset1:2 ds_read_b64 v[39:40], v130 offset:1536 ; wave barrier v_add_f64 v[113:114], v[61:62], v[69:70] v_add_f64 v[61:62], v[61:62], -v[69:70] s_waitcnt lgkmcnt(5) v_add_f64 v[49:50], v[31:32], -v[37:38] s_waitcnt lgkmcnt(1) v_add_f64 v[51:52], v[29:30], -v[27:28] v_add_f64 v[27:28], v[29:30], v[27:28] s_waitcnt lgkmcnt(0) v_add_f64 v[29:30], v[25:26], v[39:40] v_add_f64 v[57:58], v[35:36], v[33:34] v_add_f64 v[33:34], v[35:36], -v[33:34] v_add_f64 v[25:26], v[25:26], -v[39:40] v_add_f64 v[31:32], v[31:32], v[37:38] v_add_f64 v[35:36], v[51:52], -v[49:50] v_add_f64 v[39:40], v[51:52], v[49:50] v_add_f64 v[37:38], v[27:28], -v[29:30] v_add_f64 v[29:30], v[27:28], v[29:30] v_add_f64 v[49:50], v[33:34], v[25:26] v_add_f64 v[95:96], v[57:58], -v[31:32] v_add_f64 v[25:26], v[33:34], -v[25:26] v_mul_f64 v[51:52], v[11:12], -v[35:36] v_mul_f64 v[109:110], v[97:98], -v[39:40] v_mul_f64 v[33:34], v[15:16], -v[37:38] v_mul_f64 v[35:36], v[9:10], v[35:36] v_mul_f64 v[37:38], v[13:14], v[37:38] v_mul_f64 v[39:40], v[103:104], v[39:40] v_add_f64 v[31:32], v[57:58], v[31:32] v_fma_f64 v[51:52], v[49:50], v[9:10], v[51:52] v_fma_f64 v[57:58], v[25:26], v[103:104], v[109:110] v_fma_f64 v[33:34], v[95:96], v[13:14], v[33:34] v_fma_f64 v[35:36], v[49:50], v[11:12], v[35:36] v_fma_f64 v[37:38], 
v[95:96], v[15:16], v[37:38] v_fma_f64 v[39:40], v[25:26], v[97:98], v[39:40] ds_write2_b64 v133, v[31:32], v[51:52] offset1:4 ds_write2_b64 v133, v[33:34], v[57:58] offset0:8 offset1:12 ; wave barrier ds_read_b64 v[33:34], v129 ds_read_b64 v[49:50], v130 offset:1536 ds_read2st64_b64 v[25:28], v130 offset0:1 offset1:2 ; wave barrier ds_write2_b64 v133, v[29:30], v[35:36] offset1:4 ds_write2_b64 v133, v[37:38], v[39:40] offset0:8 offset1:12 ; wave barrier ds_read_b64 v[35:36], v129 ds_read2st64_b64 v[29:32], v130 offset0:1 offset1:2 ds_read_b64 v[37:38], v130 offset:1536 ; wave barrier s_waitcnt lgkmcnt(5) v_add_f64 v[39:40], v[25:26], -v[49:50] v_add_f64 v[57:58], v[33:34], v[27:28] s_waitcnt lgkmcnt(1) v_add_f64 v[51:52], v[35:36], -v[31:32] v_add_f64 v[31:32], v[35:36], v[31:32] s_waitcnt lgkmcnt(0) v_add_f64 v[35:36], v[29:30], v[37:38] v_add_f64 v[27:28], v[33:34], -v[27:28] v_add_f64 v[29:30], v[29:30], -v[37:38] v_add_f64 v[25:26], v[25:26], v[49:50] v_add_f64 v[49:50], v[59:60], v[67:68] v_add_f64 v[59:60], v[59:60], -v[67:68] v_add_f64 v[33:34], v[51:52], -v[39:40] v_add_f64 v[39:40], v[51:52], v[39:40] v_add_f64 v[37:38], v[31:32], -v[35:36] v_add_f64 v[67:68], v[63:64], v[71:72] v_add_f64 v[51:52], v[27:28], v[29:30] v_add_f64 v[109:110], v[57:58], -v[25:26] v_add_f64 v[27:28], v[27:28], -v[29:30] v_add_f64 v[25:26], v[57:58], v[25:26] v_mul_f64 v[95:96], v[19:20], -v[33:34] v_mul_f64 v[111:112], v[105:106], -v[39:40] v_mul_f64 v[29:30], v[23:24], -v[37:38] v_mul_f64 v[33:34], v[17:18], v[33:34] v_mul_f64 v[37:38], v[21:22], v[37:38] v_add_f64 v[63:64], v[63:64], -v[71:72] v_mul_f64 v[39:40], v[107:108], v[39:40] v_fma_f64 v[69:70], v[51:52], v[17:18], v[95:96] v_fma_f64 v[57:58], v[27:28], v[107:108], v[111:112] v_fma_f64 v[29:30], v[109:110], v[21:22], v[29:30] v_add_f64 v[95:96], v[65:66], v[73:74] v_add_f64 v[65:66], v[65:66], -v[73:74] ds_write2_b64 v131, v[25:26], v[69:70] offset1:16 ds_write2_b64 v131, v[29:30], v[57:58] offset0:32 offset1:48 
v_add_f64 v[25:26], v[31:32], v[35:36] v_fma_f64 v[33:34], v[51:52], v[19:20], v[33:34] v_fma_f64 v[35:36], v[109:110], v[23:24], v[37:38] v_add_f64 v[37:38], v[61:62], -v[63:64] v_fma_f64 v[27:28], v[27:28], v[105:106], v[39:40] v_add_f64 v[39:40], v[113:114], -v[95:96] v_add_f64 v[51:52], v[61:62], v[63:64] ; wave barrier ds_read_b64 v[69:70], v129 ds_read_b64 v[71:72], v130 offset:1536 ds_read2st64_b64 v[29:32], v130 offset0:1 offset1:2 ; wave barrier ds_write2_b64 v131, v[25:26], v[33:34] offset1:16 ds_write2_b64 v131, v[35:36], v[27:28] offset0:32 offset1:48 v_add_f64 v[27:28], v[59:60], v[65:66] v_mul_f64 v[33:34], v[3:4], -v[37:38] v_add_f64 v[57:58], v[49:50], -v[67:68] v_add_f64 v[59:60], v[59:60], -v[65:66] v_mul_f64 v[61:62], v[7:8], -v[39:40] v_mul_f64 v[63:64], v[99:100], -v[51:52] v_mul_f64 v[65:66], v[1:2], v[37:38] v_mul_f64 v[73:74], v[39:40], v[5:6] v_mul_f64 v[51:52], v[51:52], v[101:102] v_fma_f64 v[35:36], v[27:28], v[1:2], v[33:34] v_add_f64 v[33:34], v[49:50], v[67:68] v_add_f64 v[25:26], v[113:114], v[95:96] v_fma_f64 v[37:38], v[57:58], v[5:6], v[61:62] v_fma_f64 v[39:40], v[59:60], v[101:102], v[63:64] v_fma_f64 v[27:28], v[27:28], v[3:4], v[65:66] v_fma_f64 v[49:50], v[57:58], v[7:8], v[73:74] v_fma_f64 v[51:52], v[59:60], v[99:100], v[51:52] ; wave barrier ds_read_b64 v[61:62], v129 ds_read_b64 v[63:64], v130 offset:1536 ds_read2st64_b64 v[57:60], v130 offset0:1 offset1:2 ; wave barrier ds_write_b128 v132, v[33:36] ds_write_b128 v132, v[37:40] offset:16 ; wave barrier ds_read_b64 v[37:38], v129 ds_read_b64 v[39:40], v130 offset:1536 ds_read2st64_b64 v[33:36], v130 offset0:1 offset1:2 ; wave barrier ds_write_b128 v132, v[25:28] ds_write_b128 v132, v[49:52] offset:16 ; wave barrier ds_read_b64 v[49:50], v129 ds_read2st64_b64 v[25:28], v130 offset0:1 offset1:2 ds_read_b64 v[51:52], v130 offset:1536 v_add_f64 v[95:96], v[79:80], v[43:44] s_waitcnt lgkmcnt(5) v_add_f64 v[65:66], v[33:34], -v[39:40] v_add_f64 v[73:74], v[37:38], -v[35:36] 
s_waitcnt lgkmcnt(1) v_add_f64 v[67:68], v[49:50], -v[27:28] v_add_f64 v[27:28], v[49:50], v[27:28] s_waitcnt lgkmcnt(0) v_add_f64 v[49:50], v[25:26], v[51:52] v_add_f64 v[25:26], v[25:26], -v[51:52] v_add_f64 v[35:36], v[37:38], v[35:36] v_add_f64 v[33:34], v[33:34], v[39:40] v_add_f64 v[79:80], v[79:80], -v[43:44] ; wave barrier v_add_f64 v[111:112], v[87:88], v[55:56] v_add_f64 v[51:52], v[67:68], -v[65:66] v_add_f64 v[39:40], v[67:68], v[65:66] v_add_f64 v[37:38], v[27:28], -v[49:50] v_add_f64 v[43:44], v[73:74], v[25:26] v_add_f64 v[67:68], v[41:42], v[83:84] v_add_f64 v[83:84], v[41:42], -v[83:84] v_add_f64 v[41:42], v[35:36], -v[33:34] v_add_f64 v[25:26], v[73:74], -v[25:26] v_mul_f64 v[65:66], v[11:12], -v[51:52] v_mul_f64 v[51:52], v[9:10], v[51:52] v_mul_f64 v[73:74], v[15:16], -v[37:38] v_mul_f64 v[109:110], v[97:98], -v[39:40] v_mul_f64 v[37:38], v[13:14], v[37:38] v_mul_f64 v[39:40], v[103:104], v[39:40] v_add_f64 v[33:34], v[35:36], v[33:34] v_add_f64 v[49:50], v[27:28], v[49:50] v_fma_f64 v[65:66], v[43:44], v[9:10], v[65:66] v_fma_f64 v[43:44], v[43:44], v[11:12], v[51:52] v_fma_f64 v[35:36], v[41:42], v[13:14], v[73:74] v_fma_f64 v[73:74], v[25:26], v[103:104], v[109:110] v_fma_f64 v[37:38], v[41:42], v[15:16], v[37:38] v_fma_f64 v[39:40], v[25:26], v[97:98], v[39:40] ds_write2_b64 v133, v[33:34], v[65:66] offset1:4 ds_write2_b64 v133, v[35:36], v[73:74] offset0:8 offset1:12 ; wave barrier ds_read_b64 v[51:52], v129 ds_read_b64 v[65:66], v130 offset:1536 ds_read2st64_b64 v[25:28], v130 offset0:1 offset1:2 ; wave barrier ds_write2_b64 v133, v[49:50], v[43:44] offset1:4 ds_write2_b64 v133, v[37:38], v[39:40] offset0:8 offset1:12 ; wave barrier ds_read_b64 v[39:40], v129 ds_read_b64 v[43:44], v130 offset:1536 ds_read2st64_b64 v[35:38], v130 offset0:1 offset1:2 v_add_f64 v[55:56], v[87:88], -v[55:56] v_add_f64 v[41:42], v[53:54], v[89:90] v_add_f64 v[49:50], v[53:54], -v[89:90] v_add_f64 v[53:54], v[91:92], v[77:78] v_add_f64 v[73:74], v[91:92], 
-v[77:78] s_waitcnt lgkmcnt(5) v_add_f64 v[77:78], v[25:26], -v[65:66] s_waitcnt lgkmcnt(0) v_add_f64 v[87:88], v[39:40], -v[37:38] v_add_f64 v[91:92], v[81:82], v[47:48] v_add_f64 v[81:82], v[81:82], -v[47:48] v_add_f64 v[47:48], v[51:52], -v[27:28] v_add_f64 v[109:110], v[35:36], -v[43:44] v_add_f64 v[89:90], v[75:76], v[93:94] v_add_f64 v[75:76], v[75:76], -v[93:94] v_add_f64 v[115:116], v[35:36], v[43:44] v_add_f64 v[113:114], v[87:88], -v[77:78] v_add_f64 v[27:28], v[51:52], v[27:28] v_add_f64 v[65:66], v[25:26], v[65:66] v_add_f64 v[93:94], v[45:46], v[85:86] v_add_f64 v[51:52], v[47:48], v[109:110] v_add_f64 v[85:86], v[45:46], -v[85:86] v_add_f64 v[45:46], v[39:40], v[37:38] v_add_f64 v[33:34], v[111:112], v[41:42] v_mul_f64 v[43:44], v[19:20], -v[113:114] v_add_f64 v[37:38], v[111:112], -v[41:42] v_add_f64 v[41:42], v[55:56], -v[75:76] v_add_f64 v[25:26], v[55:56], v[75:76] v_add_f64 v[75:76], v[27:28], v[65:66] v_add_f64 v[35:36], v[53:54], v[89:90] v_add_f64 v[39:40], v[53:54], -v[89:90] v_add_f64 v[53:54], v[45:46], -v[115:116] v_fma_f64 v[55:56], v[51:52], v[17:18], v[43:44] v_add_f64 v[77:78], v[87:88], v[77:78] ; wave barrier v_add_f64 v[65:66], v[27:28], -v[65:66] v_add_f64 v[47:48], v[47:48], -v[109:110] v_add_f64 v[43:44], v[73:74], v[49:50] v_add_f64 v[27:28], v[73:74], -v[49:50] v_add_f64 v[111:112], v[69:70], v[31:32] v_mul_f64 v[87:88], v[23:24], -v[53:54] ds_write2_b64 v131, v[75:76], v[55:56] offset1:16 v_lshl_add_u32 v75, v0, 9, s6 v_sub_u32_e32 v76, 0x8000, v75 v_cmp_lt_u32_e32 vcc, s0, v75 v_cndmask_b32_e32 v75, v75, v76, vcc v_cvt_f64_u32_e32 v[75:76], v75 s_mov_b32 s0, 0x54442d18 s_mov_b32 s1, 0x3f0921fb v_mul_f64 v[53:54], v[21:22], v[53:54] v_mul_f64 v[73:74], v[107:108], v[77:78] v_mul_f64 v[75:76], v[75:76], s[0:1] v_mul_f64 v[89:90], v[105:106], -v[77:78] v_add_f64 v[55:56], v[45:46], v[115:116] v_mul_f64 v[45:46], v[17:18], v[113:114] v_add_f64 v[69:70], v[69:70], -v[31:32] v_fma_f64 v[31:32], v[65:66], v[21:22], v[87:88] 
v_fma_f64 v[53:54], v[65:66], v[23:24], v[53:54] v_fma_f64 v[65:66], v[47:48], v[105:106], v[73:74] v_mul_f64 v[73:74], v[75:76], v[75:76] v_fma_f64 v[49:50], v[47:48], v[107:108], v[89:90] s_mov_b32 s0, 0xbe8838d4 v_fma_f64 v[51:52], v[51:52], v[19:20], v[45:46] v_add_f64 v[123:124], v[29:30], v[71:72] v_add_f64 v[71:72], v[29:30], -v[71:72] v_mov_b32_e32 v29, 0xbdb4b1c4 v_mov_b32_e32 v30, 0x3e21ee9e s_mov_b32 s1, 0xbda8fae9 v_fma_f64 v[29:30], v[73:74], s[0:1], v[29:30] s_mov_b32 s0, 0xf9a43bb8 ds_write2_b64 v131, v[31:32], v[49:50] offset0:32 offset1:48 ; wave barrier ds_read_b64 v[31:32], v129 ds_read_b64 v[49:50], v130 offset:1536 ds_read2st64_b64 v[45:48], v130 offset0:1 offset1:2 ; wave barrier ds_write2_b64 v131, v[55:56], v[51:52] offset1:16 ds_write2_b64 v131, v[53:54], v[65:66] offset0:32 offset1:48 v_mov_b32_e32 v51, 0xb42fdfa7 v_mov_b32_e32 v52, 0xbe5ae600 s_mov_b32 s1, 0x3de5e0b2 v_fma_f64 v[51:52], v[73:74], s[0:1], v[51:52] s_mov_b32 s0, 0x809c52ad s_mov_b32 s1, 0xbe927e4f v_fma_f64 v[29:30], v[29:30], v[73:74], s[0:1] s_mov_b32 s0, 0x796cde01 s_mov_b32 s1, 0x3ec71de3 s_waitcnt lgkmcnt(2) v_add_f64 v[53:54], v[31:32], v[47:48] v_add_f64 v[47:48], v[31:32], -v[47:48] v_fma_f64 v[31:32], v[51:52], v[73:74], s[0:1] s_mov_b32 s0, 0x19cb1590 s_mov_b32 s1, 0x3efa01a0 v_fma_f64 v[29:30], v[29:30], v[73:74], s[0:1] s_mov_b32 s0, 0x19e83e5c s_mov_b32 s1, 0xbf2a01a0 v_add_f64 v[55:56], v[45:46], v[49:50] v_add_f64 v[65:66], v[45:46], -v[49:50] v_fma_f64 v[49:50], v[31:32], v[73:74], s[0:1] s_mov_b32 s0, 0x16c15177 s_mov_b32 s1, 0xbf56c16c v_fma_f64 v[51:52], v[29:30], v[73:74], s[0:1] s_mov_b32 s0, 0x11110bb3 s_mov_b32 s1, 0x3f811111 ; wave barrier ds_read_b64 v[45:46], v129 ds_read_b64 v[77:78], v130 offset:1536 ds_read2st64_b64 v[29:32], v130 offset0:1 offset1:2 v_fma_f64 v[49:50], v[49:50], v[73:74], s[0:1] s_mov_b32 s0, 0x5555554c s_mov_b32 s1, 0x3fa55555 v_fma_f64 v[51:52], v[51:52], v[73:74], s[0:1] s_mov_b32 s0, 0x55555555 s_mov_b32 s1, 0xbfc55555 
v_mul_f64 v[109:110], v[75:76], v[73:74] s_waitcnt lgkmcnt(0) v_add_f64 v[87:88], v[45:46], v[31:32] v_fma_f64 v[89:90], v[49:50], v[73:74], s[0:1] v_add_f64 v[31:32], v[45:46], -v[31:32] v_add_f64 v[45:46], v[29:30], v[77:78] v_fma_f64 v[51:52], v[51:52], v[73:74], -0.5 v_add_f64 v[29:30], v[29:30], -v[77:78] v_add_f64 v[113:114], v[61:62], v[59:60] v_add_f64 v[61:62], v[61:62], -v[59:60] v_add_f64 v[115:116], v[57:58], v[63:64] v_add_f64 v[63:64], v[57:58], -v[63:64] v_fma_f64 v[89:90], v[89:90], v[109:110], v[75:76] v_add_f64 v[49:50], v[53:54], v[55:56] v_fma_f64 v[135:136], v[51:52], v[73:74], 1.0 v_add_f64 v[51:52], v[87:88], v[45:46] v_add_f64 v[53:54], v[53:54], -v[55:56] v_add_f64 v[55:56], v[87:88], -v[45:46] v_add_f64 v[57:58], v[47:48], -v[29:30] v_add_f64 v[45:46], v[47:48], v[29:30] v_add_f64 v[59:60], v[31:32], v[65:66] v_add_f64 v[47:48], v[31:32], -v[65:66] v_add_f64 v[125:126], v[95:96], v[67:68] v_add_f64 v[73:74], v[95:96], -v[67:68] v_add_f64 v[75:76], v[91:92], -v[93:94] v_add_f64 v[127:128], v[91:92], v[93:94] v_add_f64 v[117:118], v[79:80], v[85:86] v_add_f64 v[85:86], v[79:80], -v[85:86] v_add_f64 v[87:88], v[81:82], v[83:84] v_add_f64 v[119:120], v[81:82], -v[83:84] v_add_f64 v[121:122], v[111:112], v[123:124] v_add_f64 v[77:78], v[111:112], -v[123:124] v_add_f64 v[79:80], v[113:114], -v[115:116] v_add_f64 v[123:124], v[113:114], v[115:116] v_add_f64 v[113:114], v[69:70], v[63:64] v_add_f64 v[81:82], v[69:70], -v[63:64] v_add_f64 v[83:84], v[61:62], v[71:72] v_add_f64 v[115:116], v[61:62], -v[71:72] v_cndmask_b32_e64 v110, -v90, -v136, vcc v_cndmask_b32_e32 v109, v89, v135, vcc v_cndmask_b32_e32 v112, v136, v90, vcc v_cndmask_b32_e32 v111, v135, v89, vcc s_and_b64 vcc, exec, s[8:9] v_add_u32_e32 v135, v130, v129 s_cbranch_vccnz .LBB0_2 ; %bb.1: v_xor_b32_e32 v179, 0xff0, v134 v_xor_b32_e32 v180, 0xbf0, v134 v_xor_b32_e32 v181, 0x7f0, v134 v_xor_b32_e32 v178, 0x3f0, v134 ; wave barrier ds_write_b128 v179, v[33:36] ds_write_b128 v180, 
v[25:28] ds_write_b128 v181, v[37:40] ds_write_b128 v178, v[41:44] ; wave barrier ds_read_b128 v[29:32], v135 ds_read_b128 v[61:64], v135 offset:1024 ds_read_b128 v[65:68], v135 offset:2048 ds_read_b128 v[69:72], v135 offset:3072 ; wave barrier ds_write_b128 v179, v[49:52] ds_write_b128 v180, v[45:48] ds_write_b128 v181, v[53:56] ds_write_b128 v178, v[57:60] ; wave barrier ds_read_b128 v[89:92], v135 ds_read_b128 v[93:96], v135 offset:1024 s_waitcnt lgkmcnt(9) v_add_f64 v[144:145], v[127:128], v[31:32] v_add_f64 v[150:151], v[125:126], -v[29:30] v_add_f64 v[31:32], v[127:128], -v[31:32] s_waitcnt lgkmcnt(1) v_add_f64 v[146:147], v[121:122], -v[89:90] v_add_f64 v[148:149], v[123:124], v[91:92] v_add_f64 v[29:30], v[125:126], v[29:30] v_add_f64 v[89:90], v[121:122], v[89:90] v_add_f64 v[91:92], v[123:124], -v[91:92] v_add_f64 v[162:163], v[75:76], v[67:68] v_add_f64 v[67:68], v[75:76], -v[67:68] s_waitcnt lgkmcnt(0) v_add_f64 v[166:167], v[113:114], -v[93:94] v_mul_f64 v[136:137], v[144:145], v[146:147] v_mul_f64 v[152:153], v[148:149], -v[144:145] v_mul_f64 v[156:157], v[148:149], -v[31:32] v_mul_f64 v[158:159], v[31:32], v[146:147] v_add_f64 v[93:94], v[113:114], v[93:94] v_fma_f64 v[154:155], v[150:151], v[148:149], v[136:137] v_fma_f64 v[152:153], v[150:151], v[146:147], v[152:153] ds_read_b128 v[136:139], v135 offset:2048 ds_read_b128 v[140:143], v135 offset:3072 v_fma_f64 v[146:147], v[29:30], v[146:147], v[156:157] v_fma_f64 v[148:149], v[29:30], v[148:149], v[158:159] ; wave barrier s_waitcnt lgkmcnt(1) v_add_f64 v[164:165], v[77:78], -v[136:137] v_add_f64 v[158:159], v[79:80], v[138:139] v_mul_f64 v[160:161], v[109:110], v[154:155] v_mul_f64 v[154:155], -v[111:112], v[154:155] v_add_f64 v[138:139], v[79:80], -v[138:139] v_fma_f64 v[146:147], v[144:145], -v[91:92], v[146:147] v_fma_f64 v[144:145], v[144:145], v[89:90], v[148:149] v_add_f64 v[136:137], v[77:78], v[136:137] v_fma_f64 v[156:157], v[152:153], -v[111:112], v[160:161] v_fma_f64 v[152:153], 
v[152:153], -v[109:110], v[154:155] v_add_f64 v[154:155], v[73:74], -v[65:66] v_mul_f64 v[160:161], v[162:163], v[164:165] v_fma_f64 v[144:145], v[150:151], v[91:92], v[144:145] v_fma_f64 v[146:147], v[150:151], v[89:90], v[146:147] v_add_f64 v[65:66], v[73:74], v[65:66] v_fma_f64 v[148:149], v[31:32], -v[91:92], v[156:157] v_fma_f64 v[31:32], v[31:32], v[89:90], v[152:153] v_mul_f64 v[152:153], v[158:159], -v[162:163] v_fma_f64 v[156:157], v[154:155], v[158:159], v[160:161] v_add_f64 v[160:161], v[119:120], v[63:64] v_add_f64 v[63:64], v[119:120], -v[63:64] v_fma_f64 v[89:90], v[29:30], v[89:90], v[148:149] v_fma_f64 v[91:92], v[29:30], v[91:92], v[31:32] v_fma_f64 v[148:149], v[154:155], v[164:165], v[152:153] v_mul_f64 v[150:151], v[109:110], -v[156:157] v_mul_f64 v[152:153], v[111:112], v[156:157] v_mul_f64 v[156:157], v[67:68], v[164:165] v_mul_f64 v[170:171], v[63:64], v[166:167] v_add_f64 v[29:30], v[146:147], v[89:90] v_add_f64 v[31:32], -v[91:92], -v[144:145] v_add_f64 v[91:92], v[91:92], -v[144:145] v_mul_f64 v[144:145], v[158:159], -v[67:68] v_fma_f64 v[150:151], v[148:149], v[111:112], v[150:151] v_fma_f64 v[148:149], v[148:149], v[109:110], v[152:153] v_fma_f64 v[152:153], v[65:66], v[158:159], v[156:157] v_add_f64 v[156:157], v[115:116], v[95:96] v_add_f64 v[89:90], v[89:90], -v[146:147] v_add_f64 v[146:147], v[117:118], -v[61:62] v_mul_f64 v[158:159], v[160:161], v[166:167] v_fma_f64 v[144:145], v[65:66], v[164:165], v[144:145] v_fma_f64 v[150:151], v[67:68], -v[138:139], v[150:151] v_fma_f64 v[67:68], v[67:68], v[136:137], v[148:149] v_fma_f64 v[148:149], v[162:163], v[136:137], v[152:153] v_mul_f64 v[152:153], v[156:157], -v[160:161] s_waitcnt lgkmcnt(0) v_add_f64 v[164:165], v[81:82], -v[140:141] v_add_f64 v[61:62], v[117:118], v[61:62] v_fma_f64 v[158:159], v[146:147], v[156:157], v[158:159] v_fma_f64 v[144:145], v[162:163], -v[138:139], v[144:145] v_add_f64 v[162:163], v[87:88], v[71:72] v_fma_f64 v[150:151], v[65:66], v[136:137], v[150:151] 
v_fma_f64 v[65:66], v[65:66], v[138:139], v[67:68] v_add_f64 v[71:72], v[87:88], -v[71:72] v_add_f64 v[95:96], v[115:116], -v[95:96] v_add_f64 v[140:141], v[81:82], v[140:141] v_fma_f64 v[67:68], v[154:155], v[136:137], v[144:145] v_fma_f64 v[136:137], v[154:155], v[138:139], v[148:149] v_fma_f64 v[138:139], v[146:147], v[166:167], v[152:153] v_add_f64 v[144:145], v[85:86], -v[69:70] v_add_f64 v[148:149], v[83:84], v[142:143] v_mul_f64 v[152:153], v[162:163], v[164:165] v_mul_f64 v[154:155], v[111:112], -v[158:159] v_mul_f64 v[158:159], -v[109:110], v[158:159] v_add_f64 v[69:70], v[85:86], v[69:70] v_mul_f64 v[174:175], v[71:72], v[164:165] v_add_f64 v[142:143], v[83:84], -v[142:143] v_mul_f64 v[168:169], v[148:149], -v[162:163] v_fma_f64 v[152:153], v[144:145], v[148:149], v[152:153] v_fma_f64 v[154:155], v[138:139], -v[109:110], v[154:155] v_fma_f64 v[138:139], v[138:139], v[111:112], v[158:159] v_mul_f64 v[158:159], v[156:157], -v[63:64] v_mul_f64 v[176:177], v[148:149], -v[71:72] v_fma_f64 v[156:157], v[61:62], v[156:157], v[170:171] v_fma_f64 v[148:149], v[69:70], v[148:149], v[174:175] v_fma_f64 v[168:169], v[144:145], v[164:165], v[168:169] v_mul_f64 v[172:173], v[111:112], v[152:153] v_mul_f64 v[152:153], v[109:110], v[152:153] v_fma_f64 v[154:155], v[63:64], -v[95:96], v[154:155] v_fma_f64 v[158:159], v[61:62], v[166:167], v[158:159] v_fma_f64 v[164:165], v[69:70], v[164:165], v[176:177] v_fma_f64 v[63:64], v[63:64], v[93:94], v[138:139] v_fma_f64 v[156:157], v[160:161], v[93:94], v[156:157] v_fma_f64 v[148:149], v[162:163], v[140:141], v[148:149] v_fma_f64 v[166:167], v[168:169], v[109:110], v[172:173] v_fma_f64 v[152:153], v[168:169], -v[111:112], v[152:153] v_fma_f64 v[154:155], v[61:62], v[93:94], v[154:155] v_fma_f64 v[138:139], v[160:161], -v[95:96], v[158:159] v_fma_f64 v[160:161], v[61:62], v[95:96], v[63:64] v_add_f64 v[63:64], -v[65:66], -v[136:137] v_fma_f64 v[148:149], v[144:145], v[142:143], v[148:149] v_fma_f64 v[158:159], v[71:72], 
-v[142:143], v[166:167] v_fma_f64 v[71:72], v[71:72], v[140:141], v[152:153] v_fma_f64 v[152:153], v[162:163], -v[142:143], v[164:165] v_fma_f64 v[162:163], v[146:147], v[93:94], v[138:139] v_fma_f64 v[146:147], v[146:147], v[95:96], v[156:157] v_add_f64 v[95:96], v[65:66], -v[136:137] v_add_f64 v[93:94], v[150:151], -v[67:68] v_add_f64 v[61:62], v[67:68], v[150:151] v_fma_f64 v[156:157], v[69:70], v[140:141], v[158:159] v_fma_f64 v[158:159], v[69:70], v[142:143], v[71:72] v_fma_f64 v[144:145], v[144:145], v[140:141], v[152:153] v_add_f64 v[136:137], v[154:155], -v[162:163] v_add_f64 v[138:139], v[160:161], -v[146:147] v_add_f64 v[69:70], v[162:163], v[154:155] v_add_f64 v[71:72], -v[160:161], -v[146:147] v_add_f64 v[142:143], v[158:159], -v[148:149] v_add_f64 v[140:141], v[156:157], -v[144:145] v_add_f64 v[65:66], v[144:145], v[156:157] v_add_f64 v[67:68], -v[158:159], -v[148:149] ds_write_b128 v179, v[89:92] ds_write_b128 v180, v[136:139] ds_write_b128 v181, v[93:96] ds_write_b128 v178, v[140:143] ; wave barrier ds_read_b128 v[89:92], v135 ds_read_b128 v[93:96], v135 offset:1024 v_add_u32_e32 v137, 0x800, v135 v_add_u32_e32 v136, 0xc00, v135 s_load_dwordx2 s[0:1], s[4:5], 0x0 s_cbranch_execz .LBB0_3 s_branch .LBB0_8 .LBB0_2: ; implicit-def: $vgpr137 ; implicit-def: $vgpr136 ; implicit-def: $vgpr89_vgpr90_vgpr91_vgpr92 ; implicit-def: $vgpr93_vgpr94_vgpr95_vgpr96 ; implicit-def: $vgpr29_vgpr30_vgpr31_vgpr32 ; implicit-def: $vgpr69_vgpr70_vgpr71_vgpr72 ; implicit-def: $vgpr61_vgpr62_vgpr63_vgpr64 ; implicit-def: $vgpr65_vgpr66_vgpr67_vgpr68 s_load_dwordx2 s[0:1], s[4:5], 0x0 .LBB0_3: v_sub_u32_e32 v29, 0, v0 v_and_b32_e32 v29, 0x7f, v29 v_sub_u32_e32 v134, 0, v134 s_waitcnt lgkmcnt(0) v_lshlrev_b32_e32 v89, 4, v29 ; wave barrier ds_write_b128 v134, v[85:88] offset:1024 ds_write_b128 v89, v[73:76] ; wave barrier ds_read_b128 v[85:88], v135 ds_read_b128 v[65:68], v135 offset:1024 ; wave barrier ds_write_b128 v134, v[81:84] offset:1024 ds_write_b128 v89, v[77:80] ; 
wave barrier ds_read_b128 v[73:76], v135 ds_read_b128 v[61:64], v135 offset:1024 v_cmp_ne_u32_e32 vcc, 0, v0 ; implicit-def: $vgpr69_vgpr70_vgpr71_vgpr72 ; implicit-def: $vgpr29_vgpr30_vgpr31_vgpr32 s_and_saveexec_b64 s[2:3], vcc s_xor_b64 s[2:3], exec, s[2:3] s_cbranch_execz .LBB0_5 ; %bb.4: s_waitcnt lgkmcnt(5) v_add_f64 v[29:30], v[127:128], v[87:88] s_waitcnt lgkmcnt(1) v_add_f64 v[31:32], v[121:122], -v[73:74] v_add_f64 v[69:70], v[125:126], -v[85:86] v_add_f64 v[71:72], v[123:124], v[75:76] v_add_f64 v[79:80], v[127:128], -v[87:88] v_add_f64 v[83:84], v[125:126], v[85:86] v_add_f64 v[73:74], v[121:122], v[73:74] v_add_f64 v[75:76], v[123:124], -v[75:76] ; implicit-def: $vgpr125_vgpr126 ; implicit-def: $vgpr127_vgpr128 ; implicit-def: $vgpr121_vgpr122 ; implicit-def: $vgpr123_vgpr124 v_mul_f64 v[77:78], v[29:30], v[31:32] v_mul_f64 v[81:82], v[71:72], -v[29:30] v_mul_f64 v[85:86], v[71:72], -v[79:80] v_mul_f64 v[87:88], v[79:80], v[31:32] v_fma_f64 v[77:78], v[69:70], v[71:72], v[77:78] v_fma_f64 v[81:82], v[69:70], v[31:32], v[81:82] v_fma_f64 v[31:32], v[83:84], v[31:32], v[85:86] v_fma_f64 v[71:72], v[83:84], v[71:72], v[87:88] v_mul_f64 v[90:91], v[109:110], v[77:78] v_mul_f64 v[77:78], -v[111:112], v[77:78] v_fma_f64 v[31:32], v[29:30], -v[75:76], v[31:32] v_fma_f64 v[29:30], v[29:30], v[73:74], v[71:72] v_fma_f64 v[85:86], v[81:82], -v[111:112], v[90:91] v_fma_f64 v[77:78], v[81:82], -v[109:110], v[77:78] v_fma_f64 v[71:72], v[79:80], -v[75:76], v[85:86] v_fma_f64 v[77:78], v[79:80], v[73:74], v[77:78] v_fma_f64 v[79:80], v[69:70], v[73:74], v[31:32] v_fma_f64 v[69:70], v[69:70], v[75:76], v[29:30] ; implicit-def: $vgpr85_vgpr86_vgpr87_vgpr88 v_fma_f64 v[73:74], v[83:84], v[73:74], v[71:72] v_fma_f64 v[71:72], v[83:84], v[75:76], v[77:78] v_add_f64 v[29:30], v[79:80], v[73:74] v_add_f64 v[31:32], -v[71:72], -v[69:70] v_add_f64 v[71:72], v[71:72], -v[69:70] v_add_f64 v[69:70], v[73:74], -v[79:80] ; implicit-def: $vgpr73_vgpr74_vgpr75_vgpr76 .LBB0_5: ; 
%Flow s_andn2_saveexec_b64 s[2:3], s[2:3] s_cbranch_execz .LBB0_7 ; %bb.6: v_add_f64 v[29:30], v[125:126], v[127:128] v_add_f64 v[31:32], v[125:126], -v[127:128] v_add_f64 v[69:70], v[121:122], v[123:124] v_add_f64 v[71:72], v[121:122], -v[123:124] v_mul_f64 v[29:30], v[29:30], v[69:70] v_mul_f64 v[31:32], v[31:32], v[71:72] s_waitcnt lgkmcnt(1) v_mul_f64 v[69:70], v[73:74], -v[87:88] v_mul_f64 v[71:72], v[87:88], -v[75:76] v_add_f64 v[77:78], v[29:30], v[31:32] v_add_f64 v[31:32], v[29:30], -v[31:32] v_fma_f64 v[69:70], v[85:86], -v[75:76], v[69:70] v_fma_f64 v[73:74], v[85:86], v[73:74], v[71:72] v_add_f64 v[29:30], v[77:78], v[77:78] v_add_f64 v[31:32], v[31:32], v[31:32] v_mul_f64 v[71:72], v[69:70], 4.0 v_mul_f64 v[69:70], v[73:74], 4.0 v_xor_b32_e32 v32, 0x80000000, v32 .LBB0_7: s_or_b64 exec, exec, s[2:3] s_waitcnt lgkmcnt(1) v_add_f64 v[73:74], v[119:120], v[67:68] s_waitcnt lgkmcnt(0) v_add_f64 v[75:76], v[113:114], -v[61:62] v_add_f64 v[77:78], v[117:118], -v[65:66] v_add_f64 v[79:80], v[115:116], v[63:64] v_add_f64 v[67:68], v[119:120], -v[67:68] v_add_f64 v[65:66], v[117:118], v[65:66] v_add_f64 v[61:62], v[113:114], v[61:62] v_add_f64 v[63:64], v[115:116], -v[63:64] s_mov_b32 s2, 0x8da49510 v_mul_f64 v[81:82], v[73:74], v[75:76] s_mov_b32 s3, 0xbf13bd2c v_mul_f64 v[83:84], v[79:80], -v[73:74] v_mul_f64 v[85:86], v[79:80], -v[67:68] v_mul_f64 v[87:88], v[67:68], v[75:76] ; wave barrier v_add_u32_e32 v136, 0x400, v135 v_mov_b32_e32 v137, v135 v_fma_f64 v[81:82], v[77:78], v[79:80], v[81:82] v_fma_f64 v[83:84], v[77:78], v[75:76], v[83:84] v_fma_f64 v[75:76], v[65:66], v[75:76], v[85:86] v_fma_f64 v[79:80], v[65:66], v[79:80], v[87:88] v_mul_f64 v[90:91], -v[109:110], v[81:82] v_mul_f64 v[81:82], v[111:112], -v[81:82] v_fma_f64 v[75:76], v[73:74], -v[63:64], v[75:76] v_fma_f64 v[73:74], v[73:74], v[61:62], v[79:80] v_fma_f64 v[85:86], v[83:84], v[111:112], v[90:91] v_fma_f64 v[81:82], v[83:84], -v[109:110], v[81:82] v_fma_f64 v[75:76], v[77:78], v[61:62], 
v[75:76] v_fma_f64 v[73:74], v[77:78], v[63:64], v[73:74] v_fma_f64 v[79:80], v[67:68], v[61:62], v[85:86] v_fma_f64 v[67:68], v[67:68], -v[63:64], v[81:82] v_fma_f64 v[77:78], v[65:66], v[63:64], v[79:80] v_fma_f64 v[79:80], v[65:66], v[61:62], v[67:68] v_fma_f64 v[65:66], v[111:112], s[2:3], v[111:112] v_fma_f64 v[67:68], v[109:110], s[2:3], v[109:110] s_mov_b32 s3, 0x3f8921d1 s_mov_b32 s2, 0xfcdec784 v_add_f64 v[63:64], v[77:78], -v[73:74] v_add_f64 v[61:62], v[79:80], -v[75:76] v_fma_f64 v[81:82], v[109:110], s[2:3], v[65:66] s_mov_b32 s3, 0xbf8921d1 v_fma_f64 v[83:84], v[111:112], s[2:3], v[67:68] ds_write_b128 v134, v[61:64] offset:1024 ds_write_b128 v89, v[69:72] ; wave barrier ds_read_b128 v[61:64], v135 ds_read_b128 v[65:68], v135 offset:1024 ; wave barrier ds_write_b128 v134, v[41:44] offset:1008 ds_write_b128 v134, v[37:40] offset:2032 ; wave barrier ds_read_b128 v[37:40], v135 ds_read_b128 v[41:44], v135 offset:1024 ; wave barrier ds_write_b128 v134, v[57:60] offset:1008 ds_write_b128 v134, v[53:56] offset:2032 ; wave barrier ds_read_b128 v[53:56], v135 ds_read_b128 v[57:60], v135 offset:1024 ; wave barrier s_waitcnt lgkmcnt(5) v_add_f64 v[69:70], v[33:34], v[37:38] v_add_f64 v[71:72], v[35:36], -v[39:40] v_add_f64 v[35:36], v[35:36], v[39:40] s_waitcnt lgkmcnt(1) v_add_f64 v[39:40], v[49:50], -v[53:54] v_add_f64 v[33:34], v[33:34], -v[37:38] v_add_f64 v[37:38], v[27:28], v[43:44] s_waitcnt lgkmcnt(0) v_add_f64 v[85:86], v[45:46], -v[57:58] v_add_f64 v[49:50], v[49:50], v[53:54] v_add_f64 v[53:54], v[51:52], -v[55:56] v_add_f64 v[51:52], v[51:52], v[55:56] v_add_f64 v[55:56], v[25:26], v[41:42] v_mul_f64 v[87:88], v[35:36], v[39:40] v_add_f64 v[25:26], v[25:26], -v[41:42] v_add_f64 v[41:42], v[47:48], v[59:60] v_mul_f64 v[89:90], v[37:38], v[85:86] v_add_f64 v[27:28], v[27:28], -v[43:44] v_add_f64 v[43:44], v[45:46], v[57:58] v_add_f64 v[45:46], v[47:48], -v[59:60] v_mul_f64 v[47:48], v[51:52], -v[35:36] v_fma_f64 v[57:58], v[33:34], v[51:52], v[87:88] 
v_mul_f64 v[59:60], v[51:52], -v[71:72] v_mul_f64 v[87:88], v[41:42], -v[37:38] v_fma_f64 v[89:90], v[25:26], v[41:42], v[89:90] v_mul_f64 v[91:92], v[71:72], v[39:40] v_mul_f64 v[93:94], v[41:42], -v[27:28] v_mul_f64 v[95:96], v[27:28], v[85:86] v_fma_f64 v[47:48], v[33:34], v[39:40], v[47:48] v_mul_f64 v[109:110], v[83:84], v[57:58] v_mul_f64 v[57:58], -v[81:82], v[57:58] v_fma_f64 v[87:88], v[25:26], v[85:86], v[87:88] v_mul_f64 v[111:112], -v[83:84], v[89:90] v_mul_f64 v[89:90], v[81:82], -v[89:90] v_fma_f64 v[39:40], v[69:70], v[39:40], v[59:60] v_fma_f64 v[51:52], v[69:70], v[51:52], v[91:92] v_fma_f64 v[59:60], v[55:56], v[85:86], v[93:94] v_fma_f64 v[85:86], v[47:48], -v[81:82], v[109:110] v_fma_f64 v[47:48], v[47:48], -v[83:84], v[57:58] v_fma_f64 v[41:42], v[55:56], v[41:42], v[95:96] v_fma_f64 v[57:58], v[87:88], v[81:82], v[111:112] v_fma_f64 v[81:82], v[87:88], -v[83:84], v[89:90] v_fma_f64 v[39:40], v[35:36], -v[53:54], v[39:40] v_fma_f64 v[35:36], v[35:36], v[49:50], v[51:52] v_fma_f64 v[51:52], v[37:38], -v[45:46], v[59:60] v_fma_f64 v[59:60], v[71:72], -v[53:54], v[85:86] v_fma_f64 v[47:48], v[71:72], v[49:50], v[47:48] v_fma_f64 v[37:38], v[37:38], v[43:44], v[41:42] v_fma_f64 v[41:42], v[27:28], v[43:44], v[57:58] v_fma_f64 v[27:28], v[27:28], -v[45:46], v[81:82] v_fma_f64 v[39:40], v[33:34], v[49:50], v[39:40] v_fma_f64 v[57:58], v[33:34], v[53:54], v[35:36] v_fma_f64 v[51:52], v[25:26], v[43:44], v[51:52] v_fma_f64 v[33:34], v[69:70], v[49:50], v[59:60] v_fma_f64 v[47:48], v[69:70], v[53:54], v[47:48] v_fma_f64 v[37:38], v[25:26], v[45:46], v[37:38] v_fma_f64 v[41:42], v[55:56], v[45:46], v[41:42] v_fma_f64 v[43:44], v[55:56], v[43:44], v[27:28] v_add_f64 v[69:70], v[75:76], v[79:80] v_add_f64 v[71:72], -v[77:78], -v[73:74] v_add_f64 v[89:90], v[39:40], v[33:34] v_add_f64 v[91:92], -v[47:48], -v[57:58] v_add_f64 v[25:26], v[33:34], -v[39:40] v_add_f64 v[35:36], v[41:42], -v[37:38] v_add_f64 v[33:34], v[43:44], -v[51:52] v_add_f64 v[93:94], 
v[51:52], v[43:44] v_add_f64 v[95:96], -v[41:42], -v[37:38] v_add_f64 v[27:28], v[47:48], -v[57:58] ds_write_b128 v134, v[33:36] offset:1008 ds_write_b128 v134, v[25:28] offset:2032 ; wave barrier .LBB0_8: ; %Flow60 ds_read_b128 v[25:28], v137 ds_read_b128 v[33:36], v136 ; wave barrier s_lshl_b32 s3, s7, 9 s_and_b32 s3, s3, 0x1fe00 s_add_i32 s3, s3, s7 s_waitcnt lgkmcnt(0) v_add_f64 v[37:38], v[91:92], v[27:28] v_add_f64 v[27:28], v[91:92], -v[27:28] v_add_f64 v[39:40], v[93:94], -v[33:34] v_add_f64 v[41:42], v[95:96], v[35:36] v_add_f64 v[43:44], v[89:90], v[25:26] v_add_f64 v[25:26], v[89:90], -v[25:26] v_add_f64 v[35:36], v[95:96], -v[35:36] v_add_f64 v[33:34], v[93:94], v[33:34] s_and_b32 s3, s3, 0xffffff00 s_lshl_b32 s2, s6, 9 v_add_f64 v[45:46], v[27:28], -v[39:40] v_add_f64 v[47:48], v[37:38], -v[41:42] v_add_f64 v[27:28], v[27:28], v[39:40] v_add_f64 v[37:38], v[37:38], v[41:42] v_add_f64 v[51:52], v[25:26], -v[35:36] v_add_f64 v[39:40], v[25:26], v[35:36] v_add_f64 v[49:50], v[43:44], -v[33:34] v_add_f64 v[25:26], v[43:44], v[33:34] v_mul_f64 v[35:36], v[3:4], -v[45:46] v_mul_f64 v[53:54], v[7:8], -v[47:48] v_mul_f64 v[55:56], v[99:100], -v[27:28] v_mul_f64 v[43:44], v[1:2], v[45:46] v_mul_f64 v[45:46], v[5:6], v[47:48] v_mul_f64 v[47:48], v[101:102], v[27:28] s_and_b32 s2, s2, 0x1fe00 s_add_i32 s2, s2, s6 v_fma_f64 v[27:28], v[39:40], v[1:2], v[35:36] v_fma_f64 v[33:34], v[49:50], v[5:6], v[53:54] v_fma_f64 v[35:36], v[51:52], v[101:102], v[55:56] v_fma_f64 v[39:40], v[39:40], v[3:4], v[43:44] v_fma_f64 v[41:42], v[49:50], v[7:8], v[45:46] v_fma_f64 v[43:44], v[51:52], v[99:100], v[47:48] v_add_f64 v[55:56], v[63:64], v[31:32] ds_write_b128 v132, v[25:28] ds_write_b128 v132, v[33:36] offset:16 ; wave barrier ds_read_b64 v[45:46], v129 ds_read2st64_b64 v[25:28], v130 offset0:1 offset1:2 ds_read_b64 v[47:48], v130 offset:1536 ; wave barrier ds_write_b128 v132, v[37:40] ds_write_b128 v132, v[41:44] offset:16 ; wave barrier ds_read_b64 v[37:38], v129 
ds_read2st64_b64 v[33:36], v130 offset0:1 offset1:2 ds_read_b64 v[39:40], v130 offset:1536 s_waitcnt lgkmcnt(6) v_add_f64 v[49:50], v[45:46], v[27:28] v_add_f64 v[27:28], v[45:46], -v[27:28] ; wave barrier v_add_f64 v[31:32], v[31:32], -v[63:64] s_waitcnt lgkmcnt(1) v_add_f64 v[41:42], v[37:38], v[35:36] v_add_f64 v[35:36], v[37:38], -v[35:36] v_add_f64 v[37:38], v[25:26], -v[47:48] s_waitcnt lgkmcnt(0) v_add_f64 v[43:44], v[33:34], v[39:40] v_add_f64 v[33:34], v[33:34], -v[39:40] v_add_f64 v[25:26], v[25:26], v[47:48] v_add_f64 v[39:40], v[35:36], -v[37:38] v_add_f64 v[45:46], v[41:42], -v[43:44] v_add_f64 v[35:36], v[35:36], v[37:38] v_add_f64 v[47:48], v[27:28], -v[33:34] v_add_f64 v[27:28], v[27:28], v[33:34] v_add_f64 v[37:38], v[49:50], -v[25:26] v_add_f64 v[25:26], v[49:50], v[25:26] v_add_f64 v[41:42], v[41:42], v[43:44] v_mul_f64 v[33:34], v[11:12], -v[39:40] v_mul_f64 v[39:40], v[9:10], v[39:40] v_mul_f64 v[51:52], v[15:16], -v[45:46] v_mul_f64 v[53:54], v[97:98], -v[35:36] v_mul_f64 v[45:46], v[13:14], v[45:46] v_mul_f64 v[35:36], v[103:104], v[35:36] v_fma_f64 v[33:34], v[27:28], v[9:10], v[33:34] v_fma_f64 v[39:40], v[27:28], v[11:12], v[39:40] v_fma_f64 v[49:50], v[37:38], v[13:14], v[51:52] v_fma_f64 v[51:52], v[47:48], v[103:104], v[53:54] v_fma_f64 v[37:38], v[37:38], v[15:16], v[45:46] v_fma_f64 v[35:36], v[47:48], v[97:98], v[35:36] ds_write2_b64 v133, v[25:26], v[33:34] offset1:4 ds_write2_b64 v133, v[49:50], v[51:52] offset0:8 offset1:12 ; wave barrier ds_read_b64 v[43:44], v129 ds_read2st64_b64 v[25:28], v130 offset0:1 offset1:2 ds_read_b64 v[45:46], v130 offset:1536 ; wave barrier ds_write2_b64 v133, v[41:42], v[39:40] offset1:4 ds_write2_b64 v133, v[37:38], v[35:36] offset0:8 offset1:12 ; wave barrier ds_read_b64 v[37:38], v129 ds_read2st64_b64 v[33:36], v130 offset0:1 offset1:2 ds_read_b64 v[39:40], v130 offset:1536 ; wave barrier s_waitcnt lgkmcnt(6) v_add_f64 v[49:50], v[43:44], v[27:28] v_add_f64 v[27:28], v[43:44], -v[27:28] s_waitcnt 
lgkmcnt(1) v_add_f64 v[41:42], v[37:38], v[35:36] v_add_f64 v[35:36], v[37:38], -v[35:36] v_add_f64 v[37:38], v[25:26], -v[45:46] s_waitcnt lgkmcnt(0) v_add_f64 v[47:48], v[33:34], v[39:40] v_add_f64 v[25:26], v[25:26], v[45:46] v_add_f64 v[33:34], v[33:34], -v[39:40] v_add_f64 v[39:40], v[35:36], -v[37:38] v_add_f64 v[43:44], v[41:42], -v[47:48] v_add_f64 v[35:36], v[35:36], v[37:38] v_add_f64 v[37:38], v[49:50], -v[25:26] v_add_f64 v[45:46], v[27:28], -v[33:34] v_add_f64 v[27:28], v[27:28], v[33:34] v_add_f64 v[25:26], v[49:50], v[25:26] v_add_f64 v[41:42], v[41:42], v[47:48] v_mul_f64 v[33:34], v[19:20], -v[39:40] v_mul_f64 v[51:52], v[23:24], -v[43:44] v_mul_f64 v[53:54], v[105:106], -v[35:36] v_mul_f64 v[35:36], v[107:108], v[35:36] v_mul_f64 v[39:40], v[17:18], v[39:40] v_mul_f64 v[43:44], v[21:22], v[43:44] v_add_f64 v[47:48], v[65:66], v[69:70] v_fma_f64 v[33:34], v[27:28], v[17:18], v[33:34] v_fma_f64 v[49:50], v[37:38], v[21:22], v[51:52] v_fma_f64 v[51:52], v[45:46], v[107:108], v[53:54] v_add_f64 v[53:54], v[69:70], -v[65:66] v_fma_f64 v[35:36], v[45:46], v[105:106], v[35:36] v_add_f64 v[45:46], v[67:68], v[71:72] v_fma_f64 v[39:40], v[27:28], v[19:20], v[39:40] ds_write2_b64 v131, v[25:26], v[33:34] offset1:16 ds_write2_b64 v131, v[49:50], v[51:52] offset0:32 offset1:48 v_add_f64 v[33:34], v[61:62], v[29:30] v_add_f64 v[29:30], v[29:30], -v[61:62] v_add_f64 v[49:50], v[71:72], -v[67:68] v_add_f64 v[57:58], v[31:32], -v[53:54] v_fma_f64 v[37:38], v[37:38], v[23:24], v[43:44] v_add_f64 v[59:60], v[55:56], -v[45:46] v_add_f64 v[31:32], v[53:54], v[31:32] ; wave barrier ds_read_b64 v[43:44], v129 ds_read2st64_b64 v[25:28], v130 offset0:1 offset1:2 ds_read_b64 v[51:52], v130 offset:1536 ; wave barrier ds_write2_b64 v131, v[41:42], v[39:40] offset1:16 ds_write2_b64 v131, v[37:38], v[35:36] offset0:32 offset1:48 v_add_f64 v[41:42], v[49:50], v[29:30] v_mul_f64 v[35:36], v[3:4], -v[57:58] v_add_f64 v[37:38], v[33:34], -v[47:48] v_add_f64 v[39:40], v[29:30], 
-v[49:50] v_mul_f64 v[49:50], v[7:8], -v[59:60] v_mul_f64 v[53:54], v[99:100], -v[31:32] v_mul_f64 v[57:58], v[1:2], v[57:58] v_mul_f64 v[59:60], v[5:6], v[59:60] v_mul_f64 v[63:64], v[101:102], v[31:32] v_add_f64 v[29:30], v[47:48], v[33:34] v_fma_f64 v[31:32], v[41:42], v[1:2], v[35:36] v_add_f64 v[1:2], v[45:46], v[55:56] v_fma_f64 v[33:34], v[37:38], v[5:6], v[49:50] v_fma_f64 v[35:36], v[39:40], v[101:102], v[53:54] v_fma_f64 v[3:4], v[41:42], v[3:4], v[57:58] v_fma_f64 v[5:6], v[37:38], v[7:8], v[59:60] v_fma_f64 v[7:8], v[39:40], v[99:100], v[63:64] ; wave barrier ds_read_b64 v[61:62], v129 ds_read2st64_b64 v[37:40], v130 offset0:1 offset1:2 ds_read_b64 v[41:42], v130 offset:1536 ; wave barrier ds_write_b128 v132, v[29:32] ds_write_b128 v132, v[33:36] offset:16 ; wave barrier ds_read_b64 v[33:34], v129 ds_read2st64_b64 v[29:32], v130 offset0:1 offset1:2 ds_read_b64 v[35:36], v130 offset:1536 ; wave barrier ds_write_b128 v132, v[1:4] ds_write_b128 v132, v[5:8] offset:16 ; wave barrier ds_read_b64 v[5:6], v129 ds_read2st64_b64 v[1:4], v130 offset0:1 offset1:2 ds_read_b64 v[7:8], v130 offset:1536 s_waitcnt lgkmcnt(6) v_add_f64 v[49:50], v[33:34], v[31:32] v_add_f64 v[31:32], v[33:34], -v[31:32] ; wave barrier s_waitcnt lgkmcnt(1) v_add_f64 v[45:46], v[5:6], v[3:4] v_add_f64 v[3:4], v[5:6], -v[3:4] v_add_f64 v[5:6], v[29:30], -v[35:36] s_waitcnt lgkmcnt(0) v_add_f64 v[47:48], v[1:2], v[7:8] v_add_f64 v[1:2], v[1:2], -v[7:8] v_add_f64 v[29:30], v[29:30], v[35:36] v_add_f64 v[7:8], v[3:4], -v[5:6] v_add_f64 v[33:34], v[45:46], -v[47:48] v_add_f64 v[3:4], v[3:4], v[5:6] v_add_f64 v[35:36], v[31:32], -v[1:2] v_add_f64 v[1:2], v[31:32], v[1:2] v_add_f64 v[5:6], v[49:50], -v[29:30] v_add_f64 v[29:30], v[49:50], v[29:30] v_add_f64 v[45:46], v[45:46], v[47:48] v_mul_f64 v[31:32], v[11:12], -v[7:8] v_mul_f64 v[7:8], v[9:10], v[7:8] v_mul_f64 v[53:54], v[15:16], -v[33:34] v_mul_f64 v[55:56], v[97:98], -v[3:4] v_mul_f64 v[33:34], v[13:14], v[33:34] v_mul_f64 v[3:4], 
v[103:104], v[3:4] v_fma_f64 v[9:10], v[1:2], v[9:10], v[31:32] v_fma_f64 v[7:8], v[1:2], v[11:12], v[7:8] v_fma_f64 v[13:14], v[5:6], v[13:14], v[53:54] v_fma_f64 v[31:32], v[35:36], v[103:104], v[55:56] v_fma_f64 v[5:6], v[5:6], v[15:16], v[33:34] v_fma_f64 v[11:12], v[35:36], v[97:98], v[3:4] ds_write2_b64 v133, v[29:30], v[9:10] offset1:4 ds_write2_b64 v133, v[13:14], v[31:32] offset0:8 offset1:12 ; wave barrier ds_read_b64 v[9:10], v129 ds_read2st64_b64 v[1:4], v130 offset0:1 offset1:2 ds_read_b64 v[13:14], v130 offset:1536 ; wave barrier ds_write2_b64 v133, v[45:46], v[7:8] offset1:4 ds_write2_b64 v133, v[5:6], v[11:12] offset0:8 offset1:12 ; wave barrier ds_read_b64 v[11:12], v129 ds_read2st64_b64 v[5:8], v130 offset0:1 offset1:2 ds_read_b64 v[31:32], v130 offset:1536 v_add_f64 v[33:34], v[61:62], v[39:40] v_add_f64 v[29:30], v[61:62], -v[39:40] s_waitcnt lgkmcnt(6) v_add_f64 v[45:46], v[9:10], v[3:4] s_waitcnt lgkmcnt(1) v_add_f64 v[35:36], v[11:12], v[7:8] v_add_f64 v[7:8], v[11:12], -v[7:8] v_add_f64 v[11:12], v[1:2], -v[13:14] s_waitcnt lgkmcnt(0) v_add_f64 v[39:40], v[5:6], v[31:32] v_add_f64 v[3:4], v[9:10], -v[3:4] v_add_f64 v[9:10], v[1:2], v[13:14] v_add_f64 v[1:2], v[5:6], -v[31:32] v_add_f64 v[15:16], v[43:44], v[27:28] v_add_f64 v[27:28], v[43:44], -v[27:28] v_add_f64 v[43:44], v[37:38], v[41:42] v_add_f64 v[13:14], v[7:8], -v[11:12] v_add_f64 v[31:32], v[35:36], -v[39:40] v_add_f64 v[7:8], v[7:8], v[11:12] v_add_f64 v[37:38], v[37:38], -v[41:42] v_add_f64 v[41:42], v[3:4], -v[1:2] v_add_f64 v[47:48], v[3:4], v[1:2] v_add_f64 v[5:6], v[25:26], v[51:52] v_add_f64 v[25:26], v[25:26], -v[51:52] v_mul_f64 v[3:4], v[19:20], -v[13:14] v_mul_f64 v[13:14], v[17:18], v[13:14] v_add_f64 v[11:12], v[45:46], -v[9:10] v_mul_f64 v[49:50], v[23:24], -v[31:32] v_mul_f64 v[51:52], v[105:106], -v[7:8] v_mul_f64 v[31:32], v[21:22], v[31:32] v_mul_f64 v[7:8], v[107:108], v[7:8] v_add_f64 v[9:10], v[45:46], v[9:10] v_fma_f64 v[17:18], v[47:48], v[17:18], v[3:4] 
v_add_f64 v[35:36], v[35:36], v[39:40] v_fma_f64 v[13:14], v[47:48], v[19:20], v[13:14] v_fma_f64 v[21:22], v[11:12], v[21:22], v[49:50] v_fma_f64 v[45:46], v[41:42], v[107:108], v[51:52] v_fma_f64 v[19:20], v[11:12], v[23:24], v[31:32] v_fma_f64 v[7:8], v[41:42], v[105:106], v[7:8] v_add_f64 v[1:2], v[15:16], v[5:6] v_add_f64 v[3:4], v[33:34], v[43:44] v_add_f64 v[5:6], v[15:16], -v[5:6] ; wave barrier ds_write2_b64 v131, v[9:10], v[17:18] offset1:16 ds_write2_b64 v131, v[21:22], v[45:46] offset0:32 offset1:48 ; wave barrier ds_read_b64 v[23:24], v129 ds_read2st64_b64 v[9:12], v130 offset0:1 offset1:2 ds_read_b64 v[31:32], v130 offset:1536 ; wave barrier ds_write2_b64 v131, v[35:36], v[13:14] offset1:16 ds_write2_b64 v131, v[19:20], v[7:8] offset0:32 offset1:48 ; wave barrier ds_read_b64 v[35:36], v129 ds_read2st64_b64 v[13:16], v130 offset0:1 offset1:2 v_add_f64 v[7:8], v[33:34], -v[43:44] ds_read_b64 v[33:34], v130 offset:1536 s_waitcnt lgkmcnt(6) v_add_f64 v[43:44], v[23:24], -v[11:12] v_add_f64 v[17:18], v[27:28], -v[37:38] v_add_f64 v[21:22], v[27:28], v[37:38] s_waitcnt lgkmcnt(1) v_add_f64 v[27:28], v[35:36], v[15:16] s_waitcnt lgkmcnt(0) v_add_f64 v[41:42], v[13:14], v[33:34] v_add_f64 v[33:34], v[13:14], -v[33:34] v_add_f64 v[39:40], v[9:10], v[31:32] v_add_f64 v[35:36], v[35:36], -v[15:16] v_add_f64 v[31:32], v[9:10], -v[31:32] v_add_f64 v[37:38], v[23:24], v[11:12] v_add_f64 v[19:20], v[29:30], v[25:26] v_add_f64 v[23:24], v[29:30], -v[25:26] v_add_f64 v[11:12], v[27:28], v[41:42] v_add_f64 v[25:26], v[43:44], -v[33:34] v_add_f64 v[29:30], v[43:44], v[33:34] v_or_b32_e32 v33, s3, v0 v_mov_b32_e32 v34, 0 v_add_f64 v[15:16], v[27:28], -v[41:42] v_add_f64 v[27:28], v[35:36], v[31:32] v_add_f64 v[31:32], v[35:36], -v[31:32] v_lshlrev_b64 v[35:36], 4, v[33:34] v_add_f64 v[9:10], v[37:38], v[39:40] v_add_f64 v[13:14], v[37:38], -v[39:40] v_mov_b32_e32 v37, s1 v_add_co_u32_e32 v35, vcc, s0, v35 v_addc_co_u32_e32 v36, vcc, v37, v36, vcc global_store_dwordx4 
v[35:36], v[1:4], off                   ; operands of the global_store_dwordx4 mnemonic that ends the preceding line

; ---------------------------------------------------------------------------
; tailMul: final store sequence.
; Every global_store_dwordx4 below writes 16 bytes to
;     s0:v37 + (index << 4)
; s[0:1] was loaded from kernarg offset 0x0 (the `out` argument in the
; metadata below), and s1 was copied into v37 before this point, which is
; why s1 can be reused as a scalar temporary here.
; v34 was set to 0 earlier and serves as the high dword of the 64-bit index.
; One statement per line: `;` starts a comment that runs to end of line.
; ---------------------------------------------------------------------------
        s_and_b32 s1, s2, 0xffffff00            ; s1 = s2 with its low 8 bits cleared (base index of the second store group)

        ; First group, index = v33 | 64 (v33 still holds s3 | v0 here).
        v_or_b32_e32 v1, 64, v33
        v_mov_b32_e32 v2, v34
        v_lshlrev_b64 v[1:2], 4, v[1:2]         ; index * 16 bytes
        v_add_co_u32_e32 v1, vcc, s0, v1
        v_addc_co_u32_e32 v2, vcc, v37, v2, vcc ; 64-bit add, carry through vcc
        global_store_dwordx4 v[1:2], v[21:24], off

        ; First group, index = v33 | 0x80.
        v_or_b32_e32 v1, 0x80, v33
        v_mov_b32_e32 v2, v34
        v_lshlrev_b64 v[1:2], 4, v[1:2]
        v_or_b32_e32 v33, 0xc0, v33             ; prepare index | 0xc0 for the next store
        v_add_co_u32_e32 v1, vcc, s0, v1
        v_addc_co_u32_e32 v2, vcc, v37, v2, vcc
        global_store_dwordx4 v[1:2], v[5:8], off

        ; First group, index = v33 (now includes 0xc0).
        v_lshlrev_b64 v[1:2], 4, v[33:34]
        v_or_b32_e32 v33, s1, v0                ; v33 = s1 | v0: base index of the second group
        v_add_co_u32_e32 v1, vcc, s0, v1
        v_addc_co_u32_e32 v2, vcc, v37, v2, vcc
        global_store_dwordx4 v[1:2], v[17:20], off

        ; Second group, index = v33. v0 is overwritten from here on; it is no
        ; longer needed because v33 already contains it.
        v_lshlrev_b64 v[0:1], 4, v[33:34]
        v_add_co_u32_e32 v0, vcc, s0, v0
        v_addc_co_u32_e32 v1, vcc, v37, v1, vcc
        global_store_dwordx4 v[0:1], v[9:12], off

        ; Second group, index = v33 | 64.
        v_or_b32_e32 v0, 64, v33
        v_mov_b32_e32 v1, v34
        v_lshlrev_b64 v[0:1], 4, v[0:1]
        v_add_co_u32_e32 v0, vcc, s0, v0
        v_addc_co_u32_e32 v1, vcc, v37, v1, vcc
        global_store_dwordx4 v[0:1], v[29:32], off

        ; Second group, index = v33 | 0x80.
        v_or_b32_e32 v0, 0x80, v33
        v_mov_b32_e32 v1, v34
        v_lshlrev_b64 v[0:1], 4, v[0:1]
        v_or_b32_e32 v33, 0xc0, v33             ; prepare index | 0xc0 for the last store
        v_add_co_u32_e32 v0, vcc, s0, v0
        v_addc_co_u32_e32 v1, vcc, v37, v1, vcc
        global_store_dwordx4 v[0:1], v[13:16], off

        ; Second group, index = v33 (now includes 0xc0).
        v_lshlrev_b64 v[0:1], 4, v[33:34]
        v_add_co_u32_e32 v0, vcc, s0, v0
        v_addc_co_u32_e32 v1, vcc, v37, v1, vcc
        global_store_dwordx4 v[0:1], v[25:28], off
        s_endpgm

; ---------------------------------------------------------------------------
; Kernel descriptor for tailMul, emitted into .rodata with 64-byte alignment.
; These values have to agree with the register usage of the code above and
; with the YAML metadata at the end of the file.
; ---------------------------------------------------------------------------
        .section .rodata,#alloc
        .p2align 6, 0x0
        .amdhsa_kernel tailMul
                ; Memory sizes: 4096 bytes of LDS per workgroup, no scratch,
                ; 32 bytes of kernel arguments (four 8-byte pointers, see metadata).
                .amdhsa_group_segment_fixed_size 4096
                .amdhsa_private_segment_fixed_size 0
                .amdhsa_kernarg_size 32
                ; User SGPRs: only the private segment buffer and the kernarg
                ; segment pointer are enabled, for a total of 6. The code reads
                ; its arguments through s[4:5] and uses s6 as the next
                ; preloaded scalar.
                ; NOTE(review): s6 is presumably workgroup_id_x, the only
                ; system SGPR enabled below; confirm against the AMDGPU ABI.
                .amdhsa_user_sgpr_count 6
                .amdhsa_user_sgpr_private_segment_buffer 1
                .amdhsa_user_sgpr_dispatch_ptr 0
                .amdhsa_user_sgpr_queue_ptr 0
                .amdhsa_user_sgpr_kernarg_segment_ptr 1
                .amdhsa_user_sgpr_dispatch_id 0
                .amdhsa_user_sgpr_flat_scratch_init 0
                .amdhsa_user_sgpr_private_segment_size 0
                .amdhsa_uses_dynamic_stack 0
                ; System SGPRs/VGPRs: only workgroup_id_x is requested;
                ; workitem_id 0 selects the X work-item id alone.
                .amdhsa_system_sgpr_private_segment_wavefront_offset 0
                .amdhsa_system_sgpr_workgroup_id_x 1
                .amdhsa_system_sgpr_workgroup_id_y 0
                .amdhsa_system_sgpr_workgroup_id_z 0
                .amdhsa_system_sgpr_workgroup_info 0
                .amdhsa_system_vgpr_workitem_id 0
                ; Register budget: one past the highest register the code
                ; touches (v181 and s22).
                .amdhsa_next_free_vgpr 182
                .amdhsa_next_free_sgpr 23
                .amdhsa_reserve_flat_scratch 0
                .amdhsa_reserve_xnack_mask 0
                ; Floating-point mode: round fields 0, denorm fields 3.
                ; This is consistent with "FloatMode: 240" (0xF0) in the
                ; kernel info comments further down.
                .amdhsa_float_round_mode_32 0
                .amdhsa_float_round_mode_16_64 0
                .amdhsa_float_denorm_mode_32 3
                .amdhsa_float_denorm_mode_16_64 3
                .amdhsa_dx10_clamp 1
                .amdhsa_ieee_mode 1
                .amdhsa_fp16_overflow 0
                ; All floating-point and integer exception traps are disabled.
                .amdhsa_exception_fp_ieee_invalid_op 0
                .amdhsa_exception_fp_denorm_src 0
                .amdhsa_exception_fp_ieee_div_zero 0
                .amdhsa_exception_fp_ieee_overflow 0
                .amdhsa_exception_fp_ieee_underflow 0
                .amdhsa_exception_fp_ieee_inexact 0
                .amdhsa_exception_int_div_zero 0
        .end_amdhsa_kernel

        ; Back in .text: record the byte size of tailMul for the symbol table.
        .text
.Lfunc_end0:
        .size tailMul, .Lfunc_end0-tailMul
                                        ; -- End function
        .section .AMDGPU.csdata
; Kernel info:
; codeLenInByte = 14348
; NumSgprs: 25
; NumVgprs: 182
; ScratchSize: 0
; MemoryBound: 0
; FloatMode: 240
; IeeeMode: 1
; LDSByteSize: 4096 bytes/workgroup (compile time only)
; SGPRBlocks: 3
; VGPRBlocks: 45
; NumSGPRsForWavesPerEU: 25
; NumVGPRsForWavesPerEU: 182
; Occupancy: 1
; WaveLimiterHint : 1
; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0
; COMPUTE_PGM_RSRC2:USER_SGPR: 6
; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0
; COMPUTE_PGM_RSRC2:TGID_X_EN: 1
; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0
; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0
; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0
        .ident "AMD clang version 17.0.0 (https://github.com/RadeonOpenCompute/llvm-project roc-6.1.3 24193 669db884972e769450470020c06a6f132a8a065b)"
        .section ".note.GNU-stack"
        .addrsig

; ---------------------------------------------------------------------------
; Code-object metadata (YAML between .amdgpu_metadata and
; .end_amdgpu_metadata). It describes tailMul to the runtime:
;   - four 8-byte global_buffer arguments at kernarg offsets 0, 8, 16, 24:
;     out (write_only), in (read_only), a (read_only), smallTrig (constant)
;   - 32-byte kernarg segment, 4096 bytes of LDS, no private segment
;   - required workgroup size 64 x 1 x 1, wavefront size 64
;   - 25 SGPRs, 182 VGPRs, no spills
; The sizes and register counts repeat the kernel descriptor values above.
; No comments are placed inside the YAML itself, and its nesting is carried
; by indentation, so the leading spaces below are significant.
; ---------------------------------------------------------------------------
        .amdgpu_metadata
---
amdhsa.kernels:
  - .args:
      - .actual_access: write_only
        .address_space: global
        .is_restrict: true
        .name: out
        .offset: 0
        .size: 8
        .type_name: 'T2*'
        .value_kind: global_buffer
      - .actual_access: read_only
        .address_space: global
        .is_const: true
        .is_restrict: true
        .name: in
        .offset: 8
        .size: 8
        .type_name: 'T2*'
        .value_kind: global_buffer
      - .actual_access: read_only
        .address_space: global
        .is_const: true
        .is_restrict: true
        .name: a
        .offset: 16
        .size: 8
        .type_name: 'T2*'
        .value_kind: global_buffer
      - .address_space: constant
        .is_const: true
        .name: smallTrig
        .offset: 24
        .size: 8
        .type_name: 'T2*'
        .value_kind: global_buffer
    .group_segment_fixed_size: 4096
    .kernarg_segment_align: 8
    .kernarg_segment_size: 32
    .language: OpenCL C
    .language_version:
      - 2
      - 0
    .max_flat_workgroup_size: 64
    .name: tailMul
    .private_segment_fixed_size: 0
    .reqd_workgroup_size:
      - 64
      - 1
      - 1
    .sgpr_count: 25
    .sgpr_spill_count: 0
    .symbol: tailMul.kd
    .uses_dynamic_stack: false
    .vgpr_count: 182
    .vgpr_spill_count: 0
    .wavefront_size: 64
amdhsa.target: 'amdgcn-amd-amdhsa--gfx906:sramecc+:xnack-'
amdhsa.version:
  - 1
  - 2
...
        .end_amdgpu_metadata