.text .amdgcn_target "amdgcn-amd-amdhsa--gfx906:sramecc+:xnack-" .protected tailSquare ; -- Begin function tailSquare .globl tailSquare .p2align 8 .type tailSquare,@function tailSquare: ; @tailSquare ; %bb.0: s_sub_i32 s7, 0x200, s6 s_load_dwordx4 s[0:3], s[4:5], 0x8 s_cmp_eq_u32 s6, 0 s_cselect_b64 s[8:9], -1, 0 s_and_b64 s[12:13], s[8:9], exec s_mov_b32 s11, 0 s_cselect_b32 s7, 0x100, s7 s_and_b32 s10, s6, 0xffffff00 s_lshl_b64 s[12:13], s[10:11], 4 s_waitcnt lgkmcnt(0) s_add_u32 s10, s0, s12 s_addc_u32 s12, s1, s13 s_lshl_b32 s13, s6, 7 s_and_b32 s13, s13, 0xf80 s_add_u32 s10, s10, s13 s_addc_u32 s13, s12, 0 s_lshl_b32 s12, s6, 13 s_and_b32 s12, s12, 0x1c0000 v_lshl_or_b32 v1, v0, 6, v0 s_add_u32 s12, s10, s12 v_and_b32_e32 v1, 0xe07, v1 s_addc_u32 s13, s13, 0 v_lshlrev_b32_e32 v29, 4, v1 v_mov_b32_e32 v1, s13 v_add_co_u32_e32 v21, vcc, s12, v29 v_addc_co_u32_e32 v22, vcc, 0, v1, vcc global_load_dwordx4 v[9:12], v29, s[12:13] s_mov_b32 s13, 0x10000 v_add_co_u32_e32 v13, vcc, s13, v21 v_addc_co_u32_e32 v14, vcc, 0, v22, vcc s_mov_b32 s12, 0x20000 v_add_co_u32_e32 v17, vcc, s12, v21 v_addc_co_u32_e32 v18, vcc, 0, v22, vcc s_mov_b32 s14, 0x30000 v_add_co_u32_e32 v21, vcc, s14, v21 v_lshlrev_b32_e32 v101, 4, v0 v_addc_co_u32_e32 v22, vcc, 0, v22, vcc global_load_dwordx4 v[1:4], v101, s[2:3] global_load_dwordx4 v[5:8], v101, s[2:3] offset:1024 global_load_dwordx4 v[13:16], v[13:14], off global_load_dwordx4 v[17:20], v[17:18], off global_load_dwordx4 v[21:24], v[21:22], off s_movk_i32 s10, 0xffe8 v_lshlrev_b32_e32 v99, 5, v0 v_mad_i32_i24 v98, v0, s10, v99 s_and_b32 s10, s7, 0xffffff00 s_lshl_b64 s[10:11], s[10:11], 4 s_add_u32 s0, s0, s10 s_addc_u32 s1, s1, s11 s_lshl_b32 s10, s7, 7 s_and_b32 s10, s10, 0xf80 s_add_u32 s0, s0, s10 s_addc_u32 s1, s1, 0 s_lshl_b32 s10, s7, 13 s_and_b32 s10, s10, 0x1c0000 s_add_u32 s0, s0, s10 v_and_b32_e32 v25, 60, v0 s_addc_u32 s1, s1, 0 v_lshlrev_b32_e32 v53, 4, v25 global_load_dwordx4 v[25:28], v29, s[0:1] v_mov_b32_e32 v30, s1 
v_add_co_u32_e32 v47, vcc, s0, v29 v_addc_co_u32_e32 v48, vcc, 0, v30, vcc v_add_co_u32_e32 v29, vcc, s13, v47 v_addc_co_u32_e32 v30, vcc, 0, v48, vcc v_add_co_u32_e32 v41, vcc, s12, v47 v_addc_co_u32_e32 v42, vcc, 0, v48, vcc v_or_b32_e32 v97, 64, v0 v_add_co_u32_e32 v47, vcc, s14, v47 global_load_dwordx4 v[29:32], v[29:30], off v_addc_co_u32_e32 v48, vcc, 0, v48, vcc v_and_b32_e32 v57, 48, v0 v_lshlrev_b32_e32 v60, 4, v57 s_movk_i32 s0, 0x780 v_and_b32_e32 v61, 0x70, v97 v_and_b32_e32 v65, 15, v0 v_lshlrev_b32_e32 v71, 3, v65 s_movk_i32 s18, 0x600 v_and_or_b32 v102, v99, s18, v71 s_mov_b32 s10, 0x796cde01 s_mov_b32 s11, 0x3ec71de3 s_mov_b32 s12, 0x19e83e5c s_mov_b32 s13, 0xbf2a01a0 s_mov_b32 s14, 0x11110bb3 s_mov_b32 s15, 0x3f811111 v_mov_b32_e32 v63, 0xbdb4b1c4 v_mov_b32_e32 v64, 0x3e21ee9e s_mov_b32 s16, 0x809c52ad s_mov_b32 s17, 0xbe927e4f s_waitcnt vmcnt(5) v_mul_f64 v[33:34], v[1:2], v[7:8] v_mul_f64 v[39:40], v[3:4], -v[7:8] s_waitcnt vmcnt(3) v_add_f64 v[35:36], v[11:12], v[19:20] v_add_f64 v[11:12], v[11:12], -v[19:20] s_waitcnt vmcnt(2) v_add_f64 v[19:20], v[13:14], -v[21:22] v_add_f64 v[37:38], v[15:16], v[23:24] v_add_f64 v[15:16], v[15:16], -v[23:24] v_add_f64 v[13:14], v[13:14], v[21:22] v_fma_f64 v[73:74], v[5:6], v[3:4], v[33:34] v_add_f64 v[33:34], v[9:10], v[17:18] v_add_f64 v[9:10], v[9:10], -v[17:18] v_fma_f64 v[75:76], v[5:6], v[1:2], v[39:40] v_add_f64 v[17:18], v[11:12], -v[19:20] v_add_f64 v[21:22], v[35:36], -v[37:38] v_add_f64 v[11:12], v[11:12], v[19:20] v_add_f64 v[23:24], v[33:34], -v[13:14] v_add_f64 v[39:40], v[9:10], -v[15:16] v_add_f64 v[19:20], v[9:10], v[15:16] v_mul_f64 v[15:16], v[3:4], -v[17:18] v_mul_f64 v[43:44], v[7:8], -v[21:22] v_mul_f64 v[45:46], v[73:74], -v[11:12] v_mul_f64 v[49:50], v[17:18], v[1:2] v_mul_f64 v[21:22], v[21:22], v[5:6] v_mul_f64 v[51:52], v[11:12], v[75:76] v_add_f64 v[9:10], v[33:34], v[13:14] v_add_f64 v[17:18], v[35:36], v[37:38] v_fma_f64 v[11:12], v[19:20], v[1:2], v[15:16] v_fma_f64 v[13:14], 
v[23:24], v[5:6], v[43:44] v_fma_f64 v[15:16], v[39:40], v[75:76], v[45:46] v_fma_f64 v[19:20], v[19:20], v[3:4], v[49:50] v_fma_f64 v[21:22], v[23:24], v[7:8], v[21:22] v_fma_f64 v[23:24], v[39:40], v[73:74], v[51:52] global_load_dwordx4 v[33:36], v[41:42], off global_load_dwordx4 v[37:40], v[47:48], off ds_write_b128 v99, v[9:12] ds_write_b128 v99, v[13:16] offset:16 v_and_b32_e32 v9, 0x7c, v97 v_lshlrev_b32_e32 v13, 4, v9 ; wave barrier ds_read2st64_b64 v[41:44], v98 offset1:1 ds_read2st64_b64 v[45:48], v98 offset0:2 offset1:3 ; wave barrier ds_write_b128 v99, v[17:20] ds_write_b128 v99, v[21:24] offset:16 ; wave barrier ds_read2st64_b64 v[17:20], v98 offset1:1 ds_read2st64_b64 v[21:24], v98 offset0:2 offset1:3 ; wave barrier global_load_dwordx4 v[9:12], v53, s[2:3] global_load_dwordx4 v[13:16], v13, s[2:3] v_and_b32_e32 v49, 3, v0 v_lshlrev_b32_e32 v59, 3, v49 s_waitcnt lgkmcnt(4) v_add_f64 v[49:50], v[43:44], -v[47:48] s_waitcnt lgkmcnt(0) v_add_f64 v[51:52], v[17:18], v[21:22] v_add_f64 v[17:18], v[17:18], -v[21:22] v_add_f64 v[21:22], v[19:20], v[23:24] v_add_f64 v[19:20], v[19:20], -v[23:24] v_add_f64 v[23:24], v[43:44], v[47:48] v_add_f64 v[53:54], v[41:42], v[45:46] v_add_f64 v[41:42], v[41:42], -v[45:46] v_and_or_b32 v100, v99, s0, v59 s_movk_i32 s0, 0x4000 v_add_f64 v[45:46], v[17:18], -v[49:50] v_add_f64 v[47:48], v[51:52], -v[21:22] v_add_f64 v[17:18], v[17:18], v[49:50] v_add_f64 v[21:22], v[51:52], v[21:22] v_mov_b32_e32 v59, 0xb42fdfa7 v_add_f64 v[49:50], v[41:42], v[19:20] v_add_f64 v[19:20], v[41:42], -v[19:20] s_waitcnt vmcnt(3) v_add_f64 v[65:66], v[27:28], -v[35:36] s_waitcnt vmcnt(2) v_add_f64 v[67:68], v[29:30], -v[37:38] v_add_f64 v[27:28], v[27:28], v[35:36] v_add_f64 v[35:36], v[25:26], -v[33:34] v_add_f64 v[69:70], v[31:32], -v[39:40] v_add_f64 v[25:26], v[25:26], v[33:34] v_add_f64 v[31:32], v[31:32], v[39:40] v_add_f64 v[29:30], v[29:30], v[37:38] s_waitcnt vmcnt(1) v_mul_f64 v[41:42], v[11:12], -v[45:46] s_waitcnt vmcnt(0) v_mul_f64 
v[55:56], v[9:10], v[15:16] v_mul_f64 v[43:44], v[11:12], -v[15:16] v_mul_f64 v[45:46], v[9:10], v[45:46] v_add_f64 v[33:34], v[65:66], -v[67:68] v_add_f64 v[37:38], v[35:36], v[69:70] v_add_f64 v[65:66], v[65:66], v[67:68] v_add_f64 v[67:68], v[35:36], -v[69:70] v_fma_f64 v[41:42], v[49:50], v[9:10], v[41:42] v_fma_f64 v[77:78], v[13:14], v[11:12], v[55:56] v_fma_f64 v[79:80], v[13:14], v[9:10], v[43:44] v_add_f64 v[55:56], v[53:54], -v[23:24] v_mul_f64 v[43:44], v[15:16], -v[47:48] v_mul_f64 v[47:48], v[47:48], v[13:14] v_add_f64 v[23:24], v[53:54], v[23:24] v_fma_f64 v[49:50], v[49:50], v[11:12], v[45:46] v_mul_f64 v[35:36], v[3:4], -v[33:34] v_mul_f64 v[57:58], v[77:78], -v[17:18] v_mul_f64 v[17:18], v[17:18], v[79:80] v_mul_f64 v[33:34], v[1:2], v[33:34] v_fma_f64 v[43:44], v[55:56], v[13:14], v[43:44] v_fma_f64 v[51:52], v[55:56], v[15:16], v[47:48] v_add_f64 v[39:40], v[27:28], -v[31:32] v_add_f64 v[69:70], v[25:26], -v[29:30] v_add_f64 v[25:26], v[25:26], v[29:30] v_fma_f64 v[53:54], v[19:20], v[79:80], v[57:58] v_fma_f64 v[17:18], v[19:20], v[77:78], v[17:18] v_lshlrev_b32_e32 v57, 4, v61 ds_write2_b64 v100, v[23:24], v[41:42] offset1:4 ds_write2_b64 v100, v[43:44], v[53:54] offset0:8 offset1:12 ; wave barrier ds_read2st64_b64 v[41:44], v98 offset1:1 ds_read2st64_b64 v[45:48], v98 offset0:2 offset1:3 ; wave barrier ds_write2_b64 v100, v[21:22], v[49:50] offset1:4 ds_write2_b64 v100, v[51:52], v[17:18] offset0:8 offset1:12 ; wave barrier ds_read2st64_b64 v[49:52], v98 offset1:1 ds_read2st64_b64 v[53:56], v98 offset0:2 offset1:3 ; wave barrier global_load_dwordx4 v[17:20], v60, s[2:3] global_load_dwordx4 v[21:24], v57, s[2:3] v_lshl_add_u32 v57, v0, 9, s6 v_sub_u32_e32 v58, 0x8000, v57 v_cmp_lt_u32_e32 vcc, s0, v57 v_cndmask_b32_e32 v57, v57, v58, vcc v_cvt_f64_u32_e32 v[57:58], v57 s_mov_b32 s0, 0x54442d18 s_mov_b32 s1, 0x3f0921fb v_add_f64 v[29:30], v[27:28], v[31:32] v_mul_f64 v[57:58], v[57:58], s[0:1] v_fma_f64 v[27:28], v[37:38], v[1:2], v[35:36] 
v_fma_f64 v[31:32], v[37:38], v[3:4], v[33:34] s_waitcnt lgkmcnt(4) v_add_f64 v[33:34], v[43:44], -v[47:48] s_waitcnt lgkmcnt(0) v_add_f64 v[35:36], v[49:50], -v[53:54] v_add_f64 v[37:38], v[49:50], v[53:54] v_add_f64 v[49:50], v[51:52], v[55:56] v_add_f64 v[43:44], v[43:44], v[47:48] v_mul_f64 v[61:62], v[57:58], v[57:58] v_mul_f64 v[71:72], v[7:8], -v[39:40] v_add_f64 v[53:54], v[41:42], v[45:46] v_add_f64 v[41:42], v[41:42], -v[45:46] v_add_f64 v[45:46], v[51:52], -v[55:56] v_add_f64 v[51:52], v[35:36], -v[33:34] v_mul_f64 v[85:86], v[73:74], -v[65:66] v_add_f64 v[55:56], v[37:38], -v[49:50] v_add_f64 v[87:88], v[35:36], v[33:34] s_mov_b32 s2, 0xf9a43bb8 v_mov_b32_e32 v60, 0xbe5ae600 s_mov_b32 s3, 0x3de5e0b2 v_fma_f64 v[59:60], v[61:62], s[2:3], v[59:60] v_fma_f64 v[33:34], v[69:70], v[5:6], v[71:72] v_add_f64 v[71:72], v[41:42], v[45:46] v_add_f64 v[41:42], v[41:42], -v[45:46] v_mul_f64 v[39:40], v[5:6], v[39:40] v_mul_f64 v[65:66], v[65:66], v[75:76] v_add_f64 v[89:90], v[53:54], -v[43:44] v_fma_f64 v[35:36], v[67:68], v[75:76], v[85:86] v_fma_f64 v[59:60], v[59:60], v[61:62], s[10:11] v_add_f64 v[43:44], v[53:54], v[43:44] v_add_f64 v[49:50], v[37:38], v[49:50] s_mov_b32 s2, 0xbe8838d4 v_fma_f64 v[37:38], v[69:70], v[7:8], v[39:40] v_fma_f64 v[39:40], v[67:68], v[73:74], v[65:66] s_mov_b32 s3, 0xbda8fae9 s_mov_b32 s0, 0x55555555 v_fma_f64 v[59:60], v[59:60], v[61:62], s[12:13] v_fma_f64 v[63:64], v[61:62], s[2:3], v[63:64] s_mov_b32 s1, 0xbfc55555 v_mul_f64 v[65:66], v[57:58], v[61:62] s_mov_b32 s2, 0x19cb1590 s_mov_b32 s3, 0x3efa01a0 s_mov_b32 s10, 0x16c15177 s_mov_b32 s11, 0xbf56c16c v_fma_f64 v[59:60], v[59:60], v[61:62], s[14:15] v_fma_f64 v[63:64], v[63:64], v[61:62], s[16:17] v_fma_f64 v[59:60], v[59:60], v[61:62], s[0:1] v_fma_f64 v[63:64], v[63:64], v[61:62], s[2:3] s_mov_b32 s0, 0x5555554c s_mov_b32 s1, 0x3fa55555 v_fma_f64 v[65:66], v[59:60], v[65:66], v[57:58] v_fma_f64 v[63:64], v[63:64], v[61:62], s[10:11] v_fma_f64 v[63:64], v[63:64], v[61:62], 
s[0:1] s_waitcnt vmcnt(1) v_mul_f64 v[45:46], v[19:20], -v[51:52] s_waitcnt vmcnt(0) v_mul_f64 v[81:82], v[17:18], v[23:24] v_mul_f64 v[47:48], v[19:20], -v[23:24] v_mul_f64 v[51:52], v[17:18], v[51:52] v_fma_f64 v[63:64], v[63:64], v[61:62], -0.5 v_fma_f64 v[45:46], v[71:72], v[17:18], v[45:46] v_fma_f64 v[81:82], v[21:22], v[19:20], v[81:82] v_fma_f64 v[83:84], v[21:22], v[17:18], v[47:48] v_mul_f64 v[47:48], v[23:24], -v[55:56] v_mul_f64 v[55:56], v[55:56], v[21:22] v_fma_f64 v[51:52], v[71:72], v[19:20], v[51:52] v_mul_f64 v[91:92], v[81:82], -v[87:88] v_mul_f64 v[85:86], v[87:88], v[83:84] v_fma_f64 v[47:48], v[89:90], v[21:22], v[47:48] v_fma_f64 v[55:56], v[89:90], v[23:24], v[55:56] v_fma_f64 v[53:54], v[41:42], v[83:84], v[91:92] v_fma_f64 v[71:72], v[41:42], v[81:82], v[85:86] ds_write2_b64 v102, v[43:44], v[45:46] offset1:16 ds_write2_b64 v102, v[47:48], v[53:54] offset0:32 offset1:48 ; wave barrier ds_read2st64_b64 v[41:44], v98 offset1:1 ds_read2st64_b64 v[45:48], v98 offset0:2 offset1:3 ; wave barrier ds_write2_b64 v102, v[49:50], v[51:52] offset1:16 ds_write2_b64 v102, v[55:56], v[71:72] offset0:32 offset1:48 ; wave barrier ds_read2st64_b64 v[49:52], v98 offset1:1 ds_read2st64_b64 v[53:56], v98 offset0:2 offset1:3 ; wave barrier ds_write_b128 v99, v[25:28] ds_write_b128 v99, v[33:36] offset:16 ; wave barrier ds_read2st64_b64 v[25:28], v98 offset1:1 ds_read2st64_b64 v[33:36], v98 offset0:2 offset1:3 ; wave barrier ds_write_b128 v99, v[29:32] ds_write_b128 v99, v[37:40] offset:16 ; wave barrier ds_read2st64_b64 v[29:32], v98 offset1:1 ds_read2st64_b64 v[37:40], v98 offset0:2 offset1:3 ; wave barrier s_waitcnt lgkmcnt(4) v_add_f64 v[67:68], v[27:28], -v[35:36] v_add_f64 v[71:72], v[25:26], v[33:34] s_waitcnt lgkmcnt(0) v_add_f64 v[69:70], v[29:30], -v[37:38] v_add_f64 v[29:30], v[29:30], v[37:38] v_add_f64 v[37:38], v[31:32], v[39:40] v_add_f64 v[25:26], v[25:26], -v[33:34] v_add_f64 v[31:32], v[31:32], -v[39:40] v_add_f64 v[27:28], v[27:28], v[35:36] 
v_add_f64 v[33:34], v[69:70], -v[67:68] v_add_f64 v[39:40], v[69:70], v[67:68] v_add_f64 v[35:36], v[29:30], -v[37:38] v_add_f64 v[37:38], v[29:30], v[37:38] v_add_f64 v[57:58], v[25:26], v[31:32] v_add_f64 v[25:26], v[25:26], -v[31:32] v_add_f64 v[59:60], v[71:72], -v[27:28] v_add_f64 v[27:28], v[71:72], v[27:28] v_mul_f64 v[31:32], v[11:12], -v[33:34] v_mul_f64 v[33:34], v[9:10], v[33:34] v_mul_f64 v[67:68], v[15:16], -v[35:36] v_mul_f64 v[69:70], v[77:78], -v[39:40] v_mul_f64 v[35:36], v[13:14], v[35:36] v_mul_f64 v[39:40], v[79:80], v[39:40] v_fma_f64 v[71:72], v[63:64], v[61:62], 1.0 v_add_f64 v[61:62], v[49:50], v[53:54] v_fma_f64 v[31:32], v[57:58], v[9:10], v[31:32] v_fma_f64 v[33:34], v[57:58], v[11:12], v[33:34] v_fma_f64 v[67:68], v[59:60], v[13:14], v[67:68] v_fma_f64 v[69:70], v[25:26], v[79:80], v[69:70] v_fma_f64 v[35:36], v[59:60], v[15:16], v[35:36] v_fma_f64 v[39:40], v[25:26], v[77:78], v[39:40] ds_write2_b64 v100, v[27:28], v[31:32] offset1:4 ds_write2_b64 v100, v[67:68], v[69:70] offset0:8 offset1:12 ; wave barrier ds_read2st64_b64 v[25:28], v98 offset1:1 ds_read2st64_b64 v[29:32], v98 offset0:2 offset1:3 ; wave barrier ds_write2_b64 v100, v[37:38], v[33:34] offset1:4 ds_write2_b64 v100, v[35:36], v[39:40] offset0:8 offset1:12 ; wave barrier ds_read2st64_b64 v[33:36], v98 offset1:1 ds_read2st64_b64 v[37:40], v98 offset0:2 offset1:3 v_add_f64 v[57:58], v[41:42], v[45:46] v_add_f64 v[41:42], v[41:42], -v[45:46] v_add_f64 v[45:46], v[43:44], v[47:48] v_add_f64 v[67:68], v[43:44], -v[47:48] s_waitcnt lgkmcnt(4) v_add_f64 v[43:44], v[27:28], -v[31:32] s_waitcnt lgkmcnt(0) v_add_f64 v[47:48], v[33:34], -v[37:38] v_add_f64 v[33:34], v[33:34], v[37:38] v_add_f64 v[37:38], v[35:36], v[39:40] v_add_f64 v[59:60], v[25:26], v[29:30] v_add_f64 v[25:26], v[25:26], -v[29:30] v_add_f64 v[29:30], v[35:36], -v[39:40] v_add_f64 v[27:28], v[27:28], v[31:32] v_add_f64 v[49:50], v[49:50], -v[53:54] v_add_f64 v[31:32], v[47:48], -v[43:44] v_add_f64 v[39:40], 
v[47:48], v[43:44] v_add_f64 v[35:36], v[33:34], -v[37:38] v_add_f64 v[43:44], v[51:52], v[55:56] v_add_f64 v[47:48], v[51:52], -v[55:56] v_add_f64 v[51:52], v[25:26], v[29:30] v_add_f64 v[25:26], v[25:26], -v[29:30] v_add_f64 v[53:54], v[59:60], -v[27:28] v_mul_f64 v[29:30], v[19:20], -v[31:32] v_mul_f64 v[31:32], v[17:18], v[31:32] v_mul_f64 v[55:56], v[23:24], -v[35:36] v_mul_f64 v[63:64], v[81:82], -v[39:40] v_mul_f64 v[35:36], v[21:22], v[35:36] v_mul_f64 v[39:40], v[83:84], v[39:40] v_add_f64 v[27:28], v[59:60], v[27:28] v_add_f64 v[33:34], v[33:34], v[37:38] v_fma_f64 v[29:30], v[51:52], v[17:18], v[29:30] v_fma_f64 v[37:38], v[51:52], v[19:20], v[31:32] v_fma_f64 v[55:56], v[53:54], v[21:22], v[55:56] v_fma_f64 v[63:64], v[25:26], v[83:84], v[63:64] v_fma_f64 v[35:36], v[53:54], v[23:24], v[35:36] v_fma_f64 v[39:40], v[25:26], v[81:82], v[39:40] ; wave barrier ds_write2_b64 v102, v[27:28], v[29:30] offset1:16 ds_write2_b64 v102, v[55:56], v[63:64] offset0:32 offset1:48 ; wave barrier ds_read2st64_b64 v[25:28], v98 offset1:1 ds_read2st64_b64 v[29:32], v98 offset0:2 offset1:3 ; wave barrier ds_write2_b64 v102, v[33:34], v[37:38] offset1:16 ds_write2_b64 v102, v[35:36], v[39:40] offset0:32 offset1:48 ; wave barrier ds_read2st64_b64 v[33:36], v98 offset1:1 ds_read2st64_b64 v[37:40], v98 offset0:2 offset1:3 v_add_f64 v[93:94], v[57:58], v[45:46] v_add_f64 v[57:58], v[57:58], -v[45:46] v_add_f64 v[59:60], v[61:62], -v[43:44] v_add_f64 v[95:96], v[61:62], v[43:44] s_waitcnt lgkmcnt(4) v_add_f64 v[43:44], v[25:26], v[29:30] v_add_f64 v[45:46], v[27:28], v[31:32] s_waitcnt lgkmcnt(0) v_add_f64 v[51:52], v[33:34], v[37:38] v_add_f64 v[53:54], v[35:36], v[39:40] v_add_f64 v[25:26], v[25:26], -v[29:30] v_add_f64 v[29:30], v[35:36], -v[39:40] v_add_f64 v[27:28], v[27:28], -v[31:32] v_add_f64 v[31:32], v[33:34], -v[37:38] v_add_f64 v[89:90], v[41:42], v[47:48] v_add_f64 v[61:62], v[41:42], -v[47:48] v_add_f64 v[63:64], v[49:50], v[67:68] v_add_f64 v[33:34], v[43:44], 
v[45:46] v_add_f64 v[35:36], v[51:52], v[53:54] v_add_f64 v[41:42], v[43:44], -v[45:46] v_add_f64 v[43:44], v[51:52], -v[53:54] v_add_f64 v[45:46], v[25:26], -v[29:30] v_add_f64 v[29:30], v[25:26], v[29:30] v_add_f64 v[47:48], v[31:32], v[27:28] v_add_f64 v[31:32], v[31:32], -v[27:28] v_add_f64 v[91:92], v[49:50], -v[67:68] v_cndmask_b32_e64 v88, -v66, -v72, vcc v_cndmask_b32_e32 v87, v65, v71, vcc v_cndmask_b32_e32 v86, v72, v66, vcc v_cndmask_b32_e32 v85, v71, v65, vcc s_and_b64 vcc, exec, s[8:9] s_cbranch_vccnz .LBB0_2 ; %bb.1: v_not_b32_e32 v25, v0 v_lshlrev_b32_e32 v119, 4, v25 ; wave barrier ds_write_b128 v119, v[33:36] offset:4096 ds_write_b128 v119, v[29:32] offset:3072 ds_write_b128 v119, v[41:44] offset:2048 ds_write_b128 v119, v[45:48] offset:1024 ; wave barrier ds_read_b128 v[25:28], v101 ds_read_b128 v[49:52], v101 offset:1024 ds_read_b128 v[37:40], v101 offset:2048 ds_read_b128 v[53:56], v101 offset:3072 ; wave barrier v_or_b32_e32 v103, 0x800, v101 s_waitcnt lgkmcnt(3) v_add_f64 v[65:66], v[93:94], v[25:26] v_add_f64 v[25:26], v[93:94], -v[25:26] v_add_f64 v[67:68], v[95:96], -v[27:28] v_add_f64 v[27:28], v[95:96], v[27:28] v_or_b32_e32 v104, 0xc00, v101 v_add_f64 v[71:72], v[25:26], v[25:26] v_mul_f64 v[69:70], v[27:28], -v[27:28] v_mul_f64 v[71:72], v[27:28], v[71:72] v_fma_f64 v[69:70], v[25:26], v[25:26], v[69:70] v_mul_f64 v[105:106], v[87:88], v[71:72] v_mul_f64 v[71:72], -v[85:86], v[71:72] v_fma_f64 v[105:106], v[69:70], -v[85:86], v[105:106] v_fma_f64 v[69:70], v[69:70], -v[87:88], v[71:72] v_fma_f64 v[71:72], v[67:68], -v[67:68], v[105:106] v_add_f64 v[105:106], v[65:66], v[65:66] v_fma_f64 v[71:72], v[65:66], v[65:66], v[71:72] v_fma_f64 v[69:70], v[105:106], v[67:68], v[69:70] v_mul_f64 v[105:106], v[27:28], -v[67:68] v_fma_f64 v[105:106], v[65:66], v[25:26], v[105:106] v_mul_f64 v[25:26], v[67:68], v[25:26] v_add_f64 v[105:106], v[105:106], v[105:106] v_fma_f64 v[25:26], v[65:66], v[27:28], v[25:26] v_add_f64 v[65:66], v[25:26], v[25:26] 
v_add_f64 v[25:26], v[71:72], v[105:106] v_add_f64 v[27:28], -v[69:70], -v[65:66] v_add_f64 v[67:68], v[69:70], -v[65:66] s_waitcnt lgkmcnt(1) v_add_f64 v[69:70], v[57:58], v[37:38] v_add_f64 v[37:38], v[57:58], -v[37:38] v_add_f64 v[65:66], v[71:72], -v[105:106] v_add_f64 v[71:72], v[59:60], -v[39:40] v_add_f64 v[39:40], v[59:60], v[39:40] v_add_f64 v[107:108], v[37:38], v[37:38] v_mul_f64 v[105:106], v[39:40], -v[39:40] v_mul_f64 v[107:108], v[39:40], v[107:108] v_fma_f64 v[105:106], v[37:38], v[37:38], v[105:106] v_mul_f64 v[109:110], v[87:88], -v[107:108] v_mul_f64 v[107:108], v[85:86], v[107:108] v_fma_f64 v[109:110], v[105:106], v[85:86], v[109:110] v_fma_f64 v[105:106], v[105:106], v[87:88], v[107:108] v_fma_f64 v[107:108], v[71:72], -v[71:72], v[109:110] v_add_f64 v[109:110], v[69:70], v[69:70] v_fma_f64 v[107:108], v[69:70], v[69:70], v[107:108] v_fma_f64 v[105:106], v[109:110], v[71:72], v[105:106] v_mul_f64 v[109:110], v[39:40], -v[71:72] v_fma_f64 v[109:110], v[69:70], v[37:38], v[109:110] v_mul_f64 v[37:38], v[71:72], v[37:38] v_add_f64 v[109:110], v[109:110], v[109:110] v_fma_f64 v[37:38], v[69:70], v[39:40], v[37:38] v_add_f64 v[69:70], v[37:38], v[37:38] v_add_f64 v[37:38], v[107:108], v[109:110] v_add_f64 v[39:40], -v[105:106], -v[69:70] v_add_f64 v[71:72], v[105:106], -v[69:70] v_add_f64 v[105:106], v[89:90], v[49:50] v_add_f64 v[49:50], v[89:90], -v[49:50] v_add_f64 v[69:70], v[107:108], -v[109:110] v_add_f64 v[107:108], v[91:92], -v[51:52] v_add_f64 v[51:52], v[91:92], v[51:52] v_add_f64 v[111:112], v[49:50], v[49:50] v_mul_f64 v[109:110], v[51:52], -v[51:52] v_mul_f64 v[111:112], v[51:52], v[111:112] v_fma_f64 v[109:110], v[49:50], v[49:50], v[109:110] v_mul_f64 v[113:114], v[85:86], -v[111:112] v_mul_f64 v[111:112], -v[87:88], v[111:112] v_fma_f64 v[113:114], v[109:110], -v[87:88], v[113:114] v_fma_f64 v[109:110], v[109:110], v[85:86], v[111:112] v_fma_f64 v[111:112], v[107:108], -v[107:108], v[113:114] v_add_f64 v[113:114], v[105:106], 
v[105:106] v_fma_f64 v[111:112], v[105:106], v[105:106], v[111:112] v_fma_f64 v[109:110], v[113:114], v[107:108], v[109:110] v_mul_f64 v[113:114], v[51:52], -v[107:108] v_fma_f64 v[113:114], v[105:106], v[49:50], v[113:114] v_mul_f64 v[49:50], v[107:108], v[49:50] v_add_f64 v[113:114], v[113:114], v[113:114] v_fma_f64 v[49:50], v[105:106], v[51:52], v[49:50] v_add_f64 v[105:106], v[49:50], v[49:50] v_add_f64 v[49:50], v[111:112], v[113:114] v_add_f64 v[51:52], -v[109:110], -v[105:106] v_add_f64 v[107:108], v[109:110], -v[105:106] s_waitcnt lgkmcnt(0) v_add_f64 v[109:110], v[61:62], v[53:54] v_add_f64 v[53:54], v[61:62], -v[53:54] v_add_f64 v[105:106], v[111:112], -v[113:114] v_add_f64 v[111:112], v[63:64], -v[55:56] v_add_f64 v[55:56], v[63:64], v[55:56] v_add_f64 v[115:116], v[53:54], v[53:54] v_mul_f64 v[113:114], v[55:56], -v[55:56] v_mul_f64 v[115:116], v[55:56], v[115:116] v_fma_f64 v[113:114], v[53:54], v[53:54], v[113:114] v_mul_f64 v[117:118], v[85:86], v[115:116] v_mul_f64 v[115:116], v[87:88], v[115:116] v_fma_f64 v[117:118], v[113:114], v[87:88], v[117:118] v_fma_f64 v[113:114], v[113:114], -v[85:86], v[115:116] v_fma_f64 v[115:116], v[111:112], -v[111:112], v[117:118] v_add_f64 v[117:118], v[109:110], v[109:110] v_fma_f64 v[115:116], v[109:110], v[109:110], v[115:116] v_fma_f64 v[113:114], v[117:118], v[111:112], v[113:114] v_mul_f64 v[117:118], v[55:56], -v[111:112] v_fma_f64 v[117:118], v[109:110], v[53:54], v[117:118] v_mul_f64 v[53:54], v[111:112], v[53:54] v_add_f64 v[117:118], v[117:118], v[117:118] v_fma_f64 v[53:54], v[109:110], v[55:56], v[53:54] v_add_f64 v[109:110], v[53:54], v[53:54] v_add_f64 v[53:54], v[115:116], v[117:118] v_add_f64 v[55:56], -v[113:114], -v[109:110] v_add_f64 v[111:112], v[113:114], -v[109:110] v_add_f64 v[109:110], v[115:116], -v[117:118] ds_write_b128 v119, v[65:68] offset:4096 ds_write_b128 v119, v[105:108] offset:3072 ds_write_b128 v119, v[69:72] offset:2048 ds_write_b128 v119, v[109:112] offset:1024 ; wave barrier 
ds_read_b128 v[65:68], v101 ds_read_b128 v[69:72], v101 offset:1024 s_load_dwordx2 s[0:1], s[4:5], 0x0 s_cbranch_execz .LBB0_3 s_branch .LBB0_8 .LBB0_2: ; implicit-def: $vgpr103 ; implicit-def: $vgpr104 ; implicit-def: $vgpr65_vgpr66_vgpr67_vgpr68 ; implicit-def: $vgpr69_vgpr70_vgpr71_vgpr72 ; implicit-def: $vgpr25_vgpr26_vgpr27_vgpr28 ; implicit-def: $vgpr49_vgpr50_vgpr51_vgpr52 ; implicit-def: $vgpr37_vgpr38_vgpr39_vgpr40 ; implicit-def: $vgpr53_vgpr54_vgpr55_vgpr56 s_load_dwordx2 s[0:1], s[4:5], 0x0 .LBB0_3: v_sub_u32_e32 v25, 0, v0 v_sub_u32_e32 v101, 0, v101 v_and_b32_e32 v25, 0x7f, v25 ; wave barrier ds_write_b128 v101, v[61:64] offset:1024 v_lshlrev_b32_e32 v61, 4, v25 v_lshl_add_u32 v103, v0, 3, v98 ds_write_b128 v61, v[57:60] ; wave barrier ds_read_b128 v[53:56], v103 ds_read_b128 v[37:40], v103 offset:1024 v_cmp_ne_u32_e32 vcc, 0, v0 ; implicit-def: $vgpr49_vgpr50_vgpr51_vgpr52 ; implicit-def: $vgpr25_vgpr26_vgpr27_vgpr28 s_and_saveexec_b64 s[2:3], vcc s_xor_b64 s[2:3], exec, s[2:3] s_cbranch_execz .LBB0_5 ; %bb.4: s_waitcnt lgkmcnt(0) v_add_f64 v[25:26], v[93:94], -v[53:54] v_add_f64 v[27:28], v[95:96], v[55:56] v_add_f64 v[55:56], v[95:96], -v[55:56] v_add_f64 v[53:54], v[93:94], v[53:54] ; implicit-def: $vgpr95_vgpr96 ; implicit-def: $vgpr93_vgpr94 v_add_f64 v[49:50], v[25:26], v[25:26] v_mul_f64 v[51:52], v[27:28], -v[27:28] v_mul_f64 v[59:60], v[55:56], v[25:26] v_mul_f64 v[62:63], v[27:28], -v[55:56] v_add_f64 v[64:65], v[53:54], v[53:54] v_mul_f64 v[49:50], v[27:28], v[49:50] v_fma_f64 v[51:52], v[25:26], v[25:26], v[51:52] v_fma_f64 v[27:28], v[53:54], v[27:28], v[59:60] v_fma_f64 v[25:26], v[53:54], v[25:26], v[62:63] v_mul_f64 v[57:58], v[87:88], v[49:50] v_mul_f64 v[49:50], -v[85:86], v[49:50] v_fma_f64 v[57:58], v[51:52], -v[85:86], v[57:58] v_fma_f64 v[49:50], v[51:52], -v[87:88], v[49:50] v_fma_f64 v[51:52], v[55:56], -v[55:56], v[57:58] v_add_f64 v[57:58], v[27:28], v[27:28] v_fma_f64 v[49:50], v[64:65], v[55:56], v[49:50] v_add_f64 
v[55:56], v[25:26], v[25:26] v_fma_f64 v[53:54], v[53:54], v[53:54], v[51:52] v_add_f64 v[27:28], -v[49:50], -v[57:58] v_add_f64 v[51:52], v[49:50], -v[57:58] v_add_f64 v[25:26], v[53:54], v[55:56] v_add_f64 v[49:50], v[53:54], -v[55:56] ; implicit-def: $vgpr53_vgpr54_vgpr55_vgpr56 .LBB0_5: ; %Flow s_andn2_saveexec_b64 s[2:3], s[2:3] s_cbranch_execz .LBB0_7 ; %bb.6: v_add_f64 v[25:26], v[93:94], -v[95:96] v_add_f64 v[27:28], v[93:94], v[95:96] s_waitcnt lgkmcnt(0) v_add_f64 v[49:50], v[53:54], v[53:54] v_mul_f64 v[51:52], v[55:56], -v[55:56] v_mul_f64 v[25:26], v[25:26], v[25:26] v_mul_f64 v[27:28], v[27:28], v[27:28] v_mul_f64 v[49:50], v[49:50], -v[55:56] v_fma_f64 v[53:54], v[53:54], v[53:54], v[51:52] v_add_f64 v[55:56], v[25:26], -v[27:28] v_add_f64 v[25:26], v[25:26], v[27:28] v_mul_f64 v[51:52], v[49:50], 4.0 v_mul_f64 v[49:50], v[53:54], 4.0 v_add_f64 v[27:28], v[55:56], v[55:56] v_add_f64 v[25:26], v[25:26], v[25:26] .LBB0_7: s_or_b64 exec, exec, s[2:3] s_waitcnt lgkmcnt(0) v_add_f64 v[53:54], v[89:90], -v[37:38] v_add_f64 v[55:56], v[91:92], v[39:40] v_add_f64 v[39:40], v[91:92], -v[39:40] v_add_f64 v[37:38], v[89:90], v[37:38] ; wave barrier s_mov_b32 s2, 0x8da49510 s_mov_b32 s3, 0xbf13bd2c s_mov_b32 s5, 0x3f8921d1 s_mov_b32 s4, 0xfcdec784 v_add_f64 v[57:58], v[53:54], v[53:54] v_mul_f64 v[59:60], v[55:56], -v[55:56] v_mul_f64 v[64:65], v[39:40], v[53:54] v_mul_f64 v[66:67], v[55:56], -v[39:40] v_add_f64 v[68:69], v[37:38], v[37:38] v_mul_f64 v[57:58], v[55:56], v[57:58] v_fma_f64 v[59:60], v[53:54], v[53:54], v[59:60] v_fma_f64 v[55:56], v[37:38], v[55:56], v[64:65] v_fma_f64 v[53:54], v[37:38], v[53:54], v[66:67] v_mul_f64 v[62:63], v[85:86], -v[57:58] v_mul_f64 v[57:58], -v[87:88], v[57:58] v_add_f64 v[64:65], v[53:54], v[53:54] v_fma_f64 v[62:63], v[59:60], -v[87:88], v[62:63] v_fma_f64 v[57:58], v[59:60], v[85:86], v[57:58] v_fma_f64 v[59:60], v[39:40], -v[39:40], v[62:63] v_add_f64 v[62:63], v[55:56], v[55:56] v_fma_f64 v[57:58], v[68:69], 
v[39:40], v[57:58] v_fma_f64 v[68:69], v[87:88], s[2:3], v[87:88] v_fma_f64 v[59:60], v[37:38], v[37:38], v[59:60] v_add_f64 v[39:40], v[57:58], -v[62:63] v_add_f64 v[37:38], v[59:60], -v[64:65] ds_write_b128 v101, v[37:40] offset:1024 ds_write_b128 v61, v[49:52] ; wave barrier ds_read_b128 v[37:40], v103 ds_read_b128 v[53:56], v103 offset:1024 ; wave barrier ds_write_b128 v101, v[45:48] offset:1008 ds_write_b128 v101, v[41:44] offset:2032 ; wave barrier ds_read_b128 v[41:44], v103 ds_read_b128 v[45:48], v103 offset:1024 v_fma_f64 v[49:50], v[85:86], s[2:3], v[85:86] ; wave barrier s_waitcnt lgkmcnt(1) v_add_f64 v[51:52], v[33:34], -v[41:42] s_waitcnt lgkmcnt(0) v_add_f64 v[66:67], v[29:30], -v[45:46] v_add_f64 v[70:71], v[35:36], v[43:44] v_add_f64 v[33:34], v[33:34], v[41:42] v_add_f64 v[35:36], v[35:36], -v[43:44] v_fma_f64 v[49:50], v[87:88], s[4:5], v[49:50] v_add_f64 v[87:88], v[31:32], v[47:48] s_mov_b32 s5, 0xbf8921d1 v_add_f64 v[89:90], v[51:52], v[51:52] v_add_f64 v[91:92], v[66:67], v[66:67] v_fma_f64 v[68:69], v[85:86], s[4:5], v[68:69] v_mul_f64 v[41:42], v[70:71], -v[70:71] v_add_f64 v[31:32], v[31:32], -v[47:48] v_add_f64 v[29:30], v[29:30], v[45:46] v_mul_f64 v[43:44], v[87:88], -v[87:88] v_mul_f64 v[47:48], v[70:71], -v[35:36] v_mul_f64 v[85:86], v[70:71], v[89:90] v_mul_f64 v[89:90], v[87:88], v[91:92] v_mul_f64 v[95:96], v[35:36], v[51:52] v_fma_f64 v[41:42], v[51:52], v[51:52], v[41:42] v_mul_f64 v[106:107], v[31:32], v[66:67] v_mul_f64 v[108:109], v[87:88], -v[31:32] v_fma_f64 v[43:44], v[66:67], v[66:67], v[43:44] v_add_f64 v[45:46], v[33:34], v[33:34] v_mul_f64 v[91:92], v[68:69], v[85:86] v_mul_f64 v[93:94], v[49:50], -v[89:90] v_mul_f64 v[85:86], -v[49:50], v[85:86] v_mul_f64 v[89:90], -v[68:69], v[89:90] v_add_f64 v[104:105], v[29:30], v[29:30] v_fma_f64 v[70:71], v[33:34], v[70:71], v[95:96] v_fma_f64 v[47:48], v[33:34], v[51:52], v[47:48] v_fma_f64 v[51:52], v[29:30], v[87:88], v[106:107] v_fma_f64 v[91:92], v[41:42], -v[49:50], v[91:92] 
v_fma_f64 v[93:94], v[43:44], -v[68:69], v[93:94] v_fma_f64 v[41:42], v[41:42], -v[68:69], v[85:86] v_fma_f64 v[43:44], v[43:44], v[49:50], v[89:90] v_fma_f64 v[49:50], v[29:30], v[66:67], v[108:109] v_add_f64 v[70:71], v[70:71], v[70:71] v_add_f64 v[47:48], v[47:48], v[47:48] v_fma_f64 v[66:67], v[35:36], -v[35:36], v[91:92] v_fma_f64 v[68:69], v[31:32], -v[31:32], v[93:94] v_fma_f64 v[35:36], v[45:46], v[35:36], v[41:42] v_add_f64 v[41:42], v[51:52], v[51:52] v_fma_f64 v[43:44], v[104:105], v[31:32], v[43:44] v_add_f64 v[45:46], v[49:50], v[49:50] v_add_f64 v[51:52], -v[57:58], -v[62:63] v_add_f64 v[49:50], v[59:60], v[64:65] v_fma_f64 v[85:86], v[33:34], v[33:34], v[66:67] v_fma_f64 v[87:88], v[29:30], v[29:30], v[68:69] v_add_f64 v[67:68], -v[35:36], -v[70:71] v_add_f64 v[31:32], v[35:36], -v[70:71] v_add_f64 v[71:72], -v[43:44], -v[41:42] v_add_f64 v[35:36], v[43:44], -v[41:42] v_add_u32_e32 v104, 0x400, v103 v_add_f64 v[65:66], v[85:86], v[47:48] v_add_f64 v[33:34], v[87:88], -v[45:46] v_add_f64 v[69:70], v[87:88], v[45:46] v_add_f64 v[29:30], v[85:86], -v[47:48] ds_write_b128 v101, v[33:36] offset:1008 ds_write_b128 v101, v[29:32] offset:2032 ; wave barrier .LBB0_8: ; %Flow60 ds_read_b128 v[29:32], v103 ds_read_b128 v[33:36], v104 ; wave barrier s_lshl_b32 s2, s7, 9 s_and_b32 s2, s2, 0x1fe00 s_add_i32 s2, s2, s7 s_waitcnt lgkmcnt(0) v_add_f64 v[41:42], v[67:68], v[31:32] v_add_f64 v[31:32], v[67:68], -v[31:32] v_add_f64 v[43:44], v[69:70], -v[33:34] v_add_f64 v[45:46], v[71:72], v[35:36] v_add_f64 v[47:48], v[65:66], v[29:30] v_add_f64 v[29:30], v[65:66], -v[29:30] v_add_f64 v[35:36], v[71:72], -v[35:36] v_add_f64 v[33:34], v[69:70], v[33:34] s_and_b32 s2, s2, 0xffffff00 s_lshl_b32 s3, s6, 9 v_add_f64 v[57:58], v[31:32], -v[43:44] v_add_f64 v[59:60], v[41:42], -v[45:46] v_add_f64 v[31:32], v[31:32], v[43:44] v_add_f64 v[41:42], v[41:42], v[45:46] v_add_f64 v[63:64], v[29:30], -v[35:36] v_add_f64 v[43:44], v[29:30], v[35:36] v_add_f64 v[61:62], v[47:48], 
-v[33:34] v_add_f64 v[29:30], v[47:48], v[33:34] v_mul_f64 v[35:36], v[3:4], -v[57:58] v_mul_f64 v[57:58], v[1:2], v[57:58] v_mul_f64 v[65:66], v[7:8], -v[59:60] v_mul_f64 v[67:68], v[73:74], -v[31:32] v_mul_f64 v[59:60], v[5:6], v[59:60] v_mul_f64 v[69:70], v[75:76], v[31:32] s_and_b32 s3, s3, 0x1fe00 s_add_i32 s3, s3, s6 v_fma_f64 v[31:32], v[43:44], v[1:2], v[35:36] v_fma_f64 v[43:44], v[43:44], v[3:4], v[57:58] v_fma_f64 v[33:34], v[61:62], v[5:6], v[65:66] v_fma_f64 v[35:36], v[63:64], v[75:76], v[67:68] v_fma_f64 v[45:46], v[61:62], v[7:8], v[59:60] v_fma_f64 v[47:48], v[63:64], v[73:74], v[69:70] v_add_f64 v[69:70], v[55:56], v[51:52] ds_write_b128 v99, v[29:32] ds_write_b128 v99, v[33:36] offset:16 ; wave barrier ds_read2st64_b64 v[29:32], v98 offset1:1 ds_read2st64_b64 v[33:36], v98 offset0:2 offset1:3 ; wave barrier ds_write_b128 v99, v[41:44] ds_write_b128 v99, v[45:48] offset:16 ; wave barrier ds_read2st64_b64 v[41:44], v98 offset1:1 ds_read2st64_b64 v[45:48], v98 offset0:2 offset1:3 ; wave barrier v_add_f64 v[51:52], v[51:52], -v[55:56] s_waitcnt lgkmcnt(4) v_add_f64 v[61:62], v[29:30], v[33:34] v_add_f64 v[29:30], v[29:30], -v[33:34] s_waitcnt lgkmcnt(0) v_add_f64 v[57:58], v[41:42], v[45:46] v_add_f64 v[41:42], v[41:42], -v[45:46] v_add_f64 v[45:46], v[31:32], -v[35:36] v_add_f64 v[59:60], v[43:44], v[47:48] v_add_f64 v[31:32], v[31:32], v[35:36] v_add_f64 v[33:34], v[43:44], -v[47:48] v_add_f64 v[35:36], v[41:42], -v[45:46] v_add_f64 v[43:44], v[57:58], -v[59:60] v_add_f64 v[41:42], v[41:42], v[45:46] v_add_f64 v[47:48], v[29:30], -v[33:34] v_add_f64 v[29:30], v[29:30], v[33:34] v_add_f64 v[45:46], v[61:62], -v[31:32] v_add_f64 v[31:32], v[61:62], v[31:32] v_add_f64 v[57:58], v[57:58], v[59:60] v_mul_f64 v[33:34], v[11:12], -v[35:36] v_mul_f64 v[35:36], v[9:10], v[35:36] v_mul_f64 v[63:64], v[15:16], -v[43:44] v_mul_f64 v[65:66], v[77:78], -v[41:42] v_mul_f64 v[43:44], v[13:14], v[43:44] v_mul_f64 v[41:42], v[79:80], v[41:42] v_fma_f64 v[33:34], 
v[29:30], v[9:10], v[33:34] v_fma_f64 v[59:60], v[29:30], v[11:12], v[35:36] v_fma_f64 v[61:62], v[45:46], v[13:14], v[63:64] v_fma_f64 v[63:64], v[47:48], v[79:80], v[65:66] v_fma_f64 v[43:44], v[45:46], v[15:16], v[43:44] v_fma_f64 v[41:42], v[47:48], v[77:78], v[41:42] ds_write2_b64 v100, v[31:32], v[33:34] offset1:4 ds_write2_b64 v100, v[61:62], v[63:64] offset0:8 offset1:12 ; wave barrier ds_read2st64_b64 v[29:32], v98 offset1:1 ds_read2st64_b64 v[33:36], v98 offset0:2 offset1:3 ; wave barrier ds_write2_b64 v100, v[57:58], v[59:60] offset1:4 ds_write2_b64 v100, v[43:44], v[41:42] offset0:8 offset1:12 ; wave barrier ds_read2st64_b64 v[41:44], v98 offset1:1 ds_read2st64_b64 v[45:48], v98 offset0:2 offset1:3 v_add_f64 v[63:64], v[39:40], v[27:28] v_add_f64 v[27:28], v[27:28], -v[39:40] s_waitcnt lgkmcnt(4) v_add_f64 v[65:66], v[29:30], v[33:34] v_add_f64 v[29:30], v[29:30], -v[33:34] s_waitcnt lgkmcnt(0) v_add_f64 v[59:60], v[41:42], v[45:46] v_add_f64 v[41:42], v[41:42], -v[45:46] v_add_f64 v[45:46], v[31:32], -v[35:36] v_add_f64 v[61:62], v[43:44], v[47:48] v_add_f64 v[31:32], v[31:32], v[35:36] v_add_f64 v[33:34], v[43:44], -v[47:48] v_add_f64 v[57:58], v[37:38], v[25:26] v_add_f64 v[37:38], v[25:26], -v[37:38] v_add_f64 v[43:44], v[53:54], v[49:50] v_add_f64 v[49:50], v[49:50], -v[53:54] v_add_f64 v[35:36], v[41:42], -v[45:46] v_add_f64 v[39:40], v[59:60], -v[61:62] v_add_f64 v[41:42], v[41:42], v[45:46] v_add_f64 v[45:46], v[29:30], -v[33:34] v_add_f64 v[29:30], v[29:30], v[33:34] v_add_f64 v[25:26], v[65:66], -v[31:32] v_add_f64 v[31:32], v[65:66], v[31:32] v_add_f64 v[53:54], v[59:60], v[61:62] v_mul_f64 v[33:34], v[19:20], -v[35:36] v_mul_f64 v[47:48], v[23:24], -v[39:40] v_mul_f64 v[67:68], v[81:82], -v[41:42] v_mul_f64 v[35:36], v[17:18], v[35:36] v_mul_f64 v[39:40], v[21:22], v[39:40] v_mul_f64 v[41:42], v[83:84], v[41:42] ; wave barrier v_fma_f64 v[33:34], v[29:30], v[17:18], v[33:34] v_fma_f64 v[47:48], v[25:26], v[21:22], v[47:48] v_fma_f64 
v[65:66], v[45:46], v[83:84], v[67:68] v_fma_f64 v[35:36], v[29:30], v[19:20], v[35:36] v_fma_f64 v[39:40], v[25:26], v[23:24], v[39:40] v_fma_f64 v[41:42], v[45:46], v[81:82], v[41:42] v_add_f64 v[45:46], v[27:28], -v[49:50] ds_write2_b64 v102, v[31:32], v[33:34] offset1:16 ds_write2_b64 v102, v[47:48], v[65:66] offset0:32 offset1:48 v_add_f64 v[33:34], v[63:64], -v[69:70] v_add_f64 v[47:48], v[49:50], v[27:28] ; wave barrier ds_read2st64_b64 v[25:28], v98 offset1:1 ds_read2st64_b64 v[29:32], v98 offset0:2 offset1:3 ; wave barrier ds_write2_b64 v102, v[53:54], v[35:36] offset1:16 v_add_f64 v[53:54], v[37:38], -v[51:52] v_add_f64 v[51:52], v[51:52], v[37:38] v_mul_f64 v[35:36], v[3:4], -v[45:46] ds_write2_b64 v102, v[39:40], v[41:42] offset0:32 offset1:48 v_mul_f64 v[41:42], v[1:2], v[45:46] v_add_f64 v[49:50], v[57:58], -v[43:44] v_mul_f64 v[37:38], v[7:8], -v[33:34] v_mul_f64 v[55:56], v[73:74], -v[47:48] v_mul_f64 v[45:46], v[5:6], v[33:34] v_mul_f64 v[47:48], v[75:76], v[47:48] v_add_f64 v[33:34], v[43:44], v[57:58] v_fma_f64 v[35:36], v[51:52], v[1:2], v[35:36] v_add_f64 v[1:2], v[69:70], v[63:64] v_fma_f64 v[3:4], v[51:52], v[3:4], v[41:42] v_fma_f64 v[37:38], v[49:50], v[5:6], v[37:38] v_fma_f64 v[39:40], v[53:54], v[75:76], v[55:56] v_fma_f64 v[5:6], v[49:50], v[7:8], v[45:46] v_fma_f64 v[7:8], v[53:54], v[73:74], v[47:48] ; wave barrier ds_read2st64_b64 v[41:44], v98 offset1:1 ds_read2st64_b64 v[45:48], v98 offset0:2 offset1:3 ; wave barrier ds_write_b128 v99, v[33:36] ds_write_b128 v99, v[37:40] offset:16 ; wave barrier ds_read2st64_b64 v[33:36], v98 offset1:1 ds_read2st64_b64 v[37:40], v98 offset0:2 offset1:3 ; wave barrier ds_write_b128 v99, v[1:4] ds_write_b128 v99, v[5:8] offset:16 ; wave barrier ds_read2st64_b64 v[1:4], v98 offset1:1 ds_read2st64_b64 v[5:8], v98 offset0:2 offset1:3 ; wave barrier s_waitcnt lgkmcnt(4) v_add_f64 v[53:54], v[33:34], v[37:38] v_add_f64 v[33:34], v[33:34], -v[37:38] s_waitcnt lgkmcnt(0) v_add_f64 v[49:50], v[1:2], v[5:6] 
v_add_f64 v[1:2], v[1:2], -v[5:6] v_add_f64 v[5:6], v[35:36], -v[39:40] v_add_f64 v[51:52], v[3:4], v[7:8] v_add_f64 v[3:4], v[3:4], -v[7:8] v_add_f64 v[35:36], v[35:36], v[39:40] v_add_f64 v[7:8], v[1:2], -v[5:6] v_add_f64 v[37:38], v[49:50], -v[51:52] v_add_f64 v[1:2], v[1:2], v[5:6] v_add_f64 v[39:40], v[33:34], -v[3:4] v_add_f64 v[3:4], v[33:34], v[3:4] v_add_f64 v[5:6], v[53:54], -v[35:36] v_add_f64 v[35:36], v[53:54], v[35:36] v_add_f64 v[49:50], v[49:50], v[51:52] v_mul_f64 v[33:34], v[11:12], -v[7:8] v_mul_f64 v[7:8], v[9:10], v[7:8] v_mul_f64 v[55:56], v[15:16], -v[37:38] v_mul_f64 v[57:58], v[77:78], -v[1:2] v_mul_f64 v[37:38], v[13:14], v[37:38] v_mul_f64 v[1:2], v[79:80], v[1:2] v_fma_f64 v[9:10], v[3:4], v[9:10], v[33:34] v_fma_f64 v[11:12], v[3:4], v[11:12], v[7:8] v_fma_f64 v[13:14], v[5:6], v[13:14], v[55:56] v_fma_f64 v[33:34], v[39:40], v[79:80], v[57:58] v_fma_f64 v[15:16], v[5:6], v[15:16], v[37:38] v_fma_f64 v[37:38], v[39:40], v[77:78], v[1:2] ds_write2_b64 v100, v[35:36], v[9:10] offset1:4 ds_write2_b64 v100, v[13:14], v[33:34] offset0:8 offset1:12 ; wave barrier ds_read2st64_b64 v[1:4], v98 offset1:1 ds_read2st64_b64 v[5:8], v98 offset0:2 offset1:3 ; wave barrier ds_write2_b64 v100, v[49:50], v[11:12] offset1:4 ds_write2_b64 v100, v[15:16], v[37:38] offset0:8 offset1:12 ; wave barrier ds_read2st64_b64 v[9:12], v98 offset1:1 ds_read2st64_b64 v[13:16], v98 offset0:2 offset1:3 v_add_f64 v[33:34], v[41:42], v[45:46] v_add_f64 v[37:38], v[41:42], -v[45:46] v_add_f64 v[35:36], v[25:26], v[29:30] v_add_f64 v[29:30], v[25:26], -v[29:30] s_waitcnt lgkmcnt(0) v_add_f64 v[39:40], v[9:10], v[13:14] v_add_f64 v[9:10], v[9:10], -v[13:14] v_add_f64 v[13:14], v[3:4], -v[7:8] v_add_f64 v[41:42], v[11:12], v[15:16] v_add_f64 v[25:26], v[1:2], v[5:6] v_add_f64 v[1:2], v[1:2], -v[5:6] v_add_f64 v[3:4], v[3:4], v[7:8] v_add_f64 v[5:6], v[11:12], -v[15:16] v_add_f64 v[49:50], v[27:28], v[31:32] v_add_f64 v[31:32], v[27:28], -v[31:32] v_add_f64 v[7:8], v[9:10], 
-v[13:14] v_add_f64 v[11:12], v[39:40], -v[41:42] v_add_f64 v[9:10], v[9:10], v[13:14] v_add_f64 v[45:46], v[43:44], v[47:48] v_add_f64 v[13:14], v[25:26], -v[3:4] v_add_f64 v[15:16], v[1:2], -v[5:6] v_add_f64 v[1:2], v[1:2], v[5:6] v_add_f64 v[25:26], v[25:26], v[3:4] v_mul_f64 v[5:6], v[19:20], -v[7:8] v_mul_f64 v[27:28], v[23:24], -v[11:12] v_mul_f64 v[7:8], v[17:18], v[7:8] v_mul_f64 v[51:52], v[81:82], -v[9:10] v_mul_f64 v[11:12], v[21:22], v[11:12] v_mul_f64 v[9:10], v[83:84], v[9:10] ; wave barrier v_add_f64 v[3:4], v[33:34], v[45:46] v_add_f64 v[43:44], v[43:44], -v[47:48] v_fma_f64 v[5:6], v[1:2], v[17:18], v[5:6] v_fma_f64 v[17:18], v[13:14], v[21:22], v[27:28] v_add_f64 v[27:28], v[39:40], v[41:42] v_fma_f64 v[19:20], v[1:2], v[19:20], v[7:8] v_fma_f64 v[21:22], v[15:16], v[83:84], v[51:52] v_fma_f64 v[13:14], v[13:14], v[23:24], v[11:12] v_fma_f64 v[15:16], v[15:16], v[81:82], v[9:10] ds_write2_b64 v102, v[25:26], v[5:6] offset1:16 ds_write2_b64 v102, v[17:18], v[21:22] offset0:32 offset1:48 ; wave barrier ds_read2st64_b64 v[5:8], v98 offset1:1 ds_read2st64_b64 v[9:12], v98 offset0:2 offset1:3 ; wave barrier ds_write2_b64 v102, v[27:28], v[19:20] offset1:16 ds_write2_b64 v102, v[13:14], v[15:16] offset0:32 offset1:48 ; wave barrier ds_read2st64_b64 v[13:16], v98 offset1:1 ds_read2st64_b64 v[17:20], v98 offset0:2 offset1:3 v_add_f64 v[23:24], v[33:34], -v[45:46] v_add_f64 v[1:2], v[35:36], v[49:50] v_add_f64 v[21:22], v[35:36], -v[49:50] s_waitcnt lgkmcnt(4) v_add_f64 v[35:36], v[5:6], v[9:10] s_waitcnt lgkmcnt(0) v_add_f64 v[33:34], v[13:14], v[17:18] v_add_f64 v[41:42], v[15:16], v[19:20] v_add_f64 v[39:40], v[7:8], v[11:12] v_add_f64 v[45:46], v[7:8], -v[11:12] v_add_f64 v[25:26], v[29:30], -v[43:44] v_add_f64 v[29:30], v[29:30], v[43:44] v_add_f64 v[27:28], v[37:38], v[31:32] v_add_f64 v[43:44], v[5:6], -v[9:10] v_add_f64 v[31:32], v[37:38], -v[31:32] v_add_f64 v[7:8], v[33:34], v[41:42] v_add_f64 v[11:12], v[33:34], -v[41:42] v_or_b32_e32 v33, s2, 
v0 v_mov_b32_e32 v34, 0 v_add_f64 v[5:6], v[35:36], v[39:40] v_add_f64 v[9:10], v[35:36], -v[39:40] v_lshlrev_b64 v[35:36], 4, v[33:34] v_mov_b32_e32 v37, s1 v_add_co_u32_e32 v35, vcc, s0, v35 v_addc_co_u32_e32 v36, vcc, v37, v36, vcc v_or_b32_e32 v33, s2, v97 global_store_dwordx4 v[35:36], v[1:4], off v_add_f64 v[15:16], v[15:16], -v[19:20] v_lshlrev_b64 v[1:2], 4, v[33:34] v_or_b32_e32 v3, 0x80, v0 v_add_co_u32_e32 v1, vcc, s0, v1 v_addc_co_u32_e32 v2, vcc, v37, v2, vcc v_or_b32_e32 v33, s2, v3 v_add_f64 v[19:20], v[13:14], -v[17:18] global_store_dwordx4 v[1:2], v[29:32], off v_lshlrev_b64 v[1:2], 4, v[33:34] v_or_b32_e32 v4, 0xc0, v0 v_add_co_u32_e32 v1, vcc, s0, v1 v_addc_co_u32_e32 v2, vcc, v37, v2, vcc v_or_b32_e32 v33, s2, v4 global_store_dwordx4 v[1:2], v[21:24], off v_lshlrev_b64 v[1:2], 4, v[33:34] s_and_b32 s1, s3, 0xffffff00 v_add_co_u32_e32 v1, vcc, s0, v1 v_addc_co_u32_e32 v2, vcc, v37, v2, vcc v_or_b32_e32 v33, s1, v0 v_add_f64 v[13:14], v[43:44], -v[15:16] v_add_f64 v[17:18], v[43:44], v[15:16] v_add_f64 v[15:16], v[19:20], v[45:46] v_add_f64 v[19:20], v[19:20], -v[45:46] global_store_dwordx4 v[1:2], v[25:28], off v_lshlrev_b64 v[0:1], 4, v[33:34] v_or_b32_e32 v33, s1, v97 v_add_co_u32_e32 v0, vcc, s0, v0 v_addc_co_u32_e32 v1, vcc, v37, v1, vcc global_store_dwordx4 v[0:1], v[5:8], off v_lshlrev_b64 v[0:1], 4, v[33:34] v_or_b32_e32 v33, s1, v3 v_add_co_u32_e32 v0, vcc, s0, v0 v_addc_co_u32_e32 v1, vcc, v37, v1, vcc global_store_dwordx4 v[0:1], v[17:20], off v_lshlrev_b64 v[0:1], 4, v[33:34] v_or_b32_e32 v33, s1, v4 v_add_co_u32_e32 v0, vcc, s0, v0 v_addc_co_u32_e32 v1, vcc, v37, v1, vcc global_store_dwordx4 v[0:1], v[9:12], off v_lshlrev_b64 v[0:1], 4, v[33:34] v_add_co_u32_e32 v0, vcc, s0, v0 v_addc_co_u32_e32 v1, vcc, v37, v1, vcc global_store_dwordx4 v[0:1], v[13:16], off s_endpgm .section .rodata,"a",@progbits .p2align 6, 0x0 .amdhsa_kernel tailSquare .amdhsa_group_segment_fixed_size 4096 .amdhsa_private_segment_fixed_size 0 .amdhsa_kernarg_size 
24 .amdhsa_user_sgpr_count 6 .amdhsa_user_sgpr_private_segment_buffer 1 .amdhsa_user_sgpr_dispatch_ptr 0 .amdhsa_user_sgpr_queue_ptr 0 .amdhsa_user_sgpr_kernarg_segment_ptr 1 .amdhsa_user_sgpr_dispatch_id 0 .amdhsa_user_sgpr_flat_scratch_init 0 .amdhsa_user_sgpr_private_segment_size 0 .amdhsa_uses_dynamic_stack 0 .amdhsa_system_sgpr_private_segment_wavefront_offset 0 .amdhsa_system_sgpr_workgroup_id_x 1 .amdhsa_system_sgpr_workgroup_id_y 0 .amdhsa_system_sgpr_workgroup_id_z 0 .amdhsa_system_sgpr_workgroup_info 0 .amdhsa_system_vgpr_workitem_id 0 .amdhsa_next_free_vgpr 120 .amdhsa_next_free_sgpr 19 .amdhsa_reserve_flat_scratch 0 .amdhsa_reserve_xnack_mask 0 .amdhsa_float_round_mode_32 0 .amdhsa_float_round_mode_16_64 0 .amdhsa_float_denorm_mode_32 3 .amdhsa_float_denorm_mode_16_64 3 .amdhsa_dx10_clamp 1 .amdhsa_ieee_mode 1 .amdhsa_fp16_overflow 0 .amdhsa_exception_fp_ieee_invalid_op 0 .amdhsa_exception_fp_denorm_src 0 .amdhsa_exception_fp_ieee_div_zero 0 .amdhsa_exception_fp_ieee_overflow 0 .amdhsa_exception_fp_ieee_underflow 0 .amdhsa_exception_fp_ieee_inexact 0 .amdhsa_exception_int_div_zero 0 .end_amdhsa_kernel .text .Lfunc_end0: .size tailSquare, .Lfunc_end0-tailSquare ; -- End function .section .AMDGPU.csdata,"",@progbits ; Kernel info: ; codeLenInByte = 7604 ; NumSgprs: 21 ; NumVgprs: 120 ; ScratchSize: 0 ; MemoryBound: 0 ; FloatMode: 240 ; IeeeMode: 1 ; LDSByteSize: 4096 bytes/workgroup (compile time only) ; SGPRBlocks: 2 ; VGPRBlocks: 29 ; NumSGPRsForWavesPerEU: 21 ; NumVGPRsForWavesPerEU: 120 ; Occupancy: 2 ; WaveLimiterHint : 1 ; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0 ; COMPUTE_PGM_RSRC2:USER_SGPR: 6 ; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0 ; COMPUTE_PGM_RSRC2:TGID_X_EN: 1 ; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 ; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 ; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 .ident "AMD clang version 18.0.0git (https://github.com/RadeonOpenCompute/llvm-project roc-6.2.0 24292 26466ce804ac523b398608f17388eb6d605a3f09)" .section ".note.GNU-stack","",@progbits .addrsig 
; ---------------------------------------------------------------------------
; Code-object metadata for the tailSquare kernel, expressed as a YAML document
; placed between the .amdgpu_metadata / .end_amdgpu_metadata directives.
;
; Kernel arguments (each an 8-byte global_buffer of type 'T2*'):
;   out        offset  0   global address space, write_only, restrict
;   in         offset  8   global address space, read_only, const + restrict
;   smallTrig  offset 16   constant address space, const
;
; Resources recorded below: 4096 bytes of group segment, 0 bytes of private
; segment, 24 bytes of kernarg (8-byte aligned), 21 SGPRs and 120 VGPRs with
; no spills.  The kernel requires a 64x1x1 work-group and wavefront size 64.
; ---------------------------------------------------------------------------
        .amdgpu_metadata
---
amdhsa.kernels:
  - .args:
      - .actual_access: write_only
        .address_space: global
        .is_restrict: true
        .name: out
        .offset: 0
        .size: 8
        .type_name: 'T2*'
        .value_kind: global_buffer
      - .actual_access: read_only
        .address_space: global
        .is_const: true
        .is_restrict: true
        .name: in
        .offset: 8
        .size: 8
        .type_name: 'T2*'
        .value_kind: global_buffer
      - .address_space: constant
        .is_const: true
        .name: smallTrig
        .offset: 16
        .size: 8
        .type_name: 'T2*'
        .value_kind: global_buffer
    .group_segment_fixed_size: 4096
    .kernarg_segment_align: 8
    .kernarg_segment_size: 24
    .language: OpenCL C
    .language_version:
      - 2
      - 0
    .max_flat_workgroup_size: 64
    .name: tailSquare
    .private_segment_fixed_size: 0
    .reqd_workgroup_size:
      - 64
      - 1
      - 1
    .sgpr_count: 21
    .sgpr_spill_count: 0
    .symbol: tailSquare.kd
    .uses_dynamic_stack: false
    .vgpr_count: 120
    .vgpr_spill_count: 0
    .wavefront_size: 64
amdhsa.target: 'amdgcn-amd-amdhsa--gfx906:sramecc+:xnack-'
amdhsa.version:
  - 1
  - 2
...

        .end_amdgpu_metadata