.text .amdgcn_target "amdgcn-amd-amdhsa--gfx906:sramecc+:xnack-" .protected tailSquare ; -- Begin function tailSquare .globl tailSquare .p2align 8 .type tailSquare,@function tailSquare: ; @tailSquare ; %bb.0: s_sub_i32 s7, 0x200, s6 s_load_dwordx4 s[0:3], s[4:5], 0x8 s_cmp_eq_u32 s6, 0 s_cselect_b64 s[8:9], -1, 0 s_and_b64 s[12:13], s[8:9], exec s_mov_b32 s11, 0 s_cselect_b32 s7, 0x100, s7 s_and_b32 s10, s6, 0xffffff00 s_lshl_b64 s[12:13], s[10:11], 4 s_waitcnt lgkmcnt(0) s_add_u32 s10, s0, s12 s_addc_u32 s12, s1, s13 s_lshl_b32 s13, s6, 7 s_and_b32 s13, s13, 0xf80 s_add_u32 s10, s10, s13 s_addc_u32 s12, s12, 0 s_lshl_b32 s13, s6, 13 v_lshl_or_b32 v1, v0, 6, v0 s_and_b32 s13, s13, 0x1c0000 v_and_b32_e32 v1, 0xe07, v1 v_mov_b32_e32 v2, 0 s_add_u32 s10, s10, s13 v_lshlrev_b64 v[17:18], 4, v[1:2] s_addc_u32 s12, s12, 0 v_mov_b32_e32 v1, s12 v_add_co_u32_e32 v23, vcc, s10, v17 v_addc_co_u32_e32 v24, vcc, v1, v18, vcc s_mov_b32 s12, 0x10000 v_add_co_u32_e32 v1, vcc, s12, v23 v_addc_co_u32_e32 v2, vcc, 0, v24, vcc s_mov_b32 s13, 0x20000 v_add_co_u32_e32 v19, vcc, s13, v23 v_addc_co_u32_e32 v20, vcc, 0, v24, vcc s_mov_b32 s14, 0x30000 global_load_dwordx4 v[9:12], v[23:24], off global_load_dwordx4 v[13:16], v[1:2], off v_add_co_u32_e32 v23, vcc, s14, v23 v_lshlrev_b32_e32 v99, 4, v0 v_addc_co_u32_e32 v24, vcc, 0, v24, vcc global_load_dwordx4 v[1:4], v99, s[2:3] global_load_dwordx4 v[5:8], v99, s[2:3] offset:1024 global_load_dwordx4 v[19:22], v[19:20], off global_load_dwordx4 v[23:26], v[23:24], off s_and_b32 s10, s7, 0xffffff00 s_lshl_b64 s[10:11], s[10:11], 4 s_add_u32 s0, s0, s10 s_addc_u32 s1, s1, s11 s_lshl_b32 s10, s7, 7 s_and_b32 s10, s10, 0xf80 s_add_u32 s0, s0, s10 s_addc_u32 s1, s1, 0 s_lshl_b32 s10, s7, 13 s_and_b32 s10, s10, 0x1c0000 s_add_u32 s0, s0, s10 v_lshlrev_b32_e32 v97, 3, v0 v_mul_u32_u24_e32 v27, 3, v0 s_addc_u32 s1, s1, 0 v_mul_i32_i24_e32 v28, -3, v0 v_lshl_add_u32 v100, v27, 3, v97 v_mov_b32_e32 v27, s1 v_add_co_u32_e32 v17, vcc, s0, v17 v_lshl_add_u32 v98, v28, 3, v100 v_addc_co_u32_e32 v18, vcc, v27, v18, vcc v_add_co_u32_e32 v29, vcc, s12, v17 v_addc_co_u32_e32 v30, vcc, 0, v18, vcc v_add_co_u32_e32 v33, vcc, s13, v17 v_addc_co_u32_e32 v34, vcc, 0, v18, vcc v_add_co_u32_e32 v41, vcc, s14, v17 v_addc_co_u32_e32 v42, vcc, 0, v18, vcc v_and_b32_e32 v59, 60, v0 s_movk_i32 s10, 0x4000 v_lshl_add_u32 v57, v0, 9, s6 v_sub_u32_e32 v58, 0x8000, v57 v_cmp_lt_u32_e32 vcc, s10, v57 v_cndmask_b32_e32 v62, v57, v58, vcc s_mov_b32 s0, 0x54442d18 v_and_b32_e32 v65, 48, v0 s_mov_b32 s1, 0x3f0921fb v_lshlrev_b32_e32 v61, 4, v65 v_and_b32_e32 v66, 15, v0 v_lshlrev_b32_e32 v66, 3, v66 v_lshl_or_b32 v101, v65, 5, v66 v_mov_b32_e32 v63, 0xb42fdfa7 v_mov_b32_e32 v64, 0xbe5ae600 s_mov_b32 s10, 0x16c15177 s_mov_b32 s11, 0xbf56c16c v_add_u32_e32 v103, v98, v97 s_waitcnt vmcnt(2) v_mul_f64 v[27:28], v[1:2], v[7:8] s_waitcnt vmcnt(1) v_add_f64 v[35:36], v[11:12], v[21:22] v_add_f64 v[11:12], v[11:12], -v[21:22] s_waitcnt vmcnt(0) v_add_f64 v[21:22], v[13:14], -v[23:24] v_add_f64 v[37:38], v[15:16], v[25:26] v_mul_f64 v[31:32], v[3:4], -v[7:8] v_add_f64 v[39:40], v[9:10], v[19:20] v_add_f64 v[9:10], v[9:10], -v[19:20] v_add_f64 v[15:16], v[15:16], -v[25:26] v_fma_f64 v[73:74], v[5:6], v[3:4], v[27:28] v_add_f64 v[13:14], v[13:14], v[23:24] v_add_f64 v[19:20], v[11:12], -v[21:22] v_add_f64 v[23:24], v[35:36], -v[37:38] v_add_f64 v[11:12], v[11:12], v[21:22] v_fma_f64 v[75:76], v[5:6], v[1:2], v[31:32] global_load_dwordx4 v[25:28], v[17:18], off global_load_dwordx4 v[29:32], v[29:30], off v_add_f64 v[43:44], v[9:10], -v[15:16] v_add_f64 v[45:46], v[9:10], v[15:16] v_add_f64 v[21:22], v[39:40], -v[13:14] v_mul_f64 v[15:16], v[3:4], -v[19:20] v_mul_f64 v[47:48], v[7:8], -v[23:24] v_mul_f64 v[49:50], v[73:74], -v[11:12] v_mul_f64 v[19:20], v[19:20], v[1:2] v_mul_f64 v[23:24], v[23:24], v[5:6] v_mul_f64 v[51:52], v[11:12], v[75:76] v_add_f64 v[9:10], v[39:40], v[13:14] v_add_f64 v[17:18], v[35:36], v[37:38] v_fma_f64 v[11:12], v[45:46], v[1:2], v[15:16] v_fma_f64 v[13:14], v[21:22], v[5:6], v[47:48] v_fma_f64 v[15:16], v[43:44], v[75:76], v[49:50] v_fma_f64 v[19:20], v[45:46], v[3:4], v[19:20] v_fma_f64 v[21:22], v[21:22], v[7:8], v[23:24] v_fma_f64 v[23:24], v[43:44], v[73:74], v[51:52] global_load_dwordx4 v[33:36], v[33:34], off global_load_dwordx4 v[37:40], v[41:42], off ds_write_b128 v100, v[9:12] ds_write_b128 v100, v[13:16] offset:16 v_lshlrev_b32_e32 v13, 4, v59 ; wave barrier ds_read_b64 v[45:46], v97 ds_read2st64_b64 v[41:44], v98 offset0:1 offset1:2 ds_read_b64 v[47:48], v98 offset:1536 ; wave barrier ds_write_b128 v100, v[17:20] ds_write_b128 v100, v[21:24] offset:16 ; wave barrier ds_read_b64 v[21:22], v97 ds_read2st64_b64 v[17:20], v98 offset0:1 offset1:2 ds_read_b64 v[23:24], v98 offset:1536 ; wave barrier global_load_dwordx4 v[9:12], v13, s[2:3] global_load_dwordx4 v[13:16], v13, s[2:3] offset:1024 v_and_b32_e32 v49, 3, v0 v_lshlrev_b32_e32 v60, 3, v49 s_waitcnt lgkmcnt(1) v_add_f64 v[49:50], v[21:22], -v[19:20] v_add_f64 v[51:52], v[41:42], -v[47:48] v_add_f64 v[19:20], v[21:22], v[19:20] s_waitcnt lgkmcnt(0) v_add_f64 v[21:22], v[17:18], v[23:24] v_add_f64 v[41:42], v[41:42], v[47:48] v_add_f64 v[53:54], v[45:46], v[43:44] v_add_f64 v[43:44], v[45:46], -v[43:44] v_add_f64 v[17:18], v[17:18], -v[23:24] v_lshl_or_b32 v102, v59, 5, v60 v_add_f64 v[23:24], v[49:50], -v[51:52] v_add_f64 v[45:46], v[49:50], v[51:52] v_add_f64 v[49:50], v[19:20], -v[21:22] v_add_f64 v[19:20], v[19:20], v[21:22] v_add_f64 v[51:52], v[43:44], v[17:18] v_add_f64 v[17:18], v[43:44], -v[17:18] s_waitcnt vmcnt(3) v_add_f64 v[65:66], v[27:28], -v[35:36] s_waitcnt vmcnt(2) v_add_f64 v[67:68], v[29:30], -v[37:38] v_add_f64 v[69:70], v[25:26], v[33:34] v_add_f64 v[27:28], v[27:28], v[35:36] v_add_f64 v[35:36], v[31:32], v[39:40] v_add_f64 v[25:26], v[25:26], -v[33:34] v_add_f64 v[31:32], v[31:32], -v[39:40] v_add_f64 v[29:30], v[29:30], v[37:38] s_waitcnt vmcnt(1) v_mul_f64 v[43:44], v[11:12], -v[23:24] s_waitcnt vmcnt(0) v_mul_f64 v[55:56], v[9:10], v[15:16] v_mul_f64 v[47:48], v[11:12], -v[15:16] v_mul_f64 v[23:24], v[9:10], v[23:24] v_add_f64 v[33:34], v[65:66], -v[67:68] v_add_f64 v[39:40], v[65:66], v[67:68] v_add_f64 v[67:68], v[25:26], -v[31:32] v_add_f64 v[31:32], v[25:26], v[31:32] v_fma_f64 v[43:44], v[51:52], v[9:10], v[43:44] v_fma_f64 v[77:78], v[13:14], v[11:12], v[55:56] v_fma_f64 v[79:80], v[13:14], v[9:10], v[47:48] v_add_f64 v[55:56], v[53:54], -v[41:42] v_mul_f64 v[47:48], v[15:16], -v[49:50] v_mul_f64 v[49:50], v[49:50], v[13:14] v_add_f64 v[41:42], v[53:54], v[41:42] v_fma_f64 v[21:22], v[51:52], v[11:12], v[23:24] v_mul_f64 v[81:82], v[3:4], -v[33:34] v_mul_f64 v[57:58], v[77:78], -v[45:46] v_mul_f64 v[45:46], v[45:46], v[79:80] v_mul_f64 v[33:34], v[1:2], v[33:34] v_fma_f64 v[47:48], v[55:56], v[13:14], v[47:48] v_fma_f64 v[23:24], v[55:56], v[15:16], v[49:50] v_add_f64 v[37:38], v[27:28], -v[35:36] v_add_f64 v[65:66], v[69:70], -v[29:30] v_add_f64 v[25:26], v[69:70], v[29:30] v_fma_f64 v[53:54], v[17:18], v[79:80], v[57:58] v_fma_f64 v[17:18], v[17:18], v[77:78], v[45:46] v_cvt_f64_u32_e32 v[57:58], v62 ds_write2_b64 v102, v[41:42], v[43:44] offset1:4 ds_write2_b64 v102, v[47:48], v[53:54] offset0:8 offset1:12 v_mul_f64 v[49:50], v[57:58], s[0:1] ; wave barrier ds_read_b64 v[51:52], v97 ds_read2st64_b64 v[41:44], v98 offset0:1 offset1:2 ds_read_b64 v[53:54], v98 offset:1536 ; wave barrier ds_write2_b64 v102, v[19:20], v[21:22] offset1:4 ds_write2_b64 v102, v[23:24], v[17:18] offset0:8 offset1:12 ; wave barrier ds_read_b64 v[55:56], v97 ds_read2st64_b64 v[45:48], v98 offset0:1 offset1:2 ds_read_b64 v[57:58], v98 offset:1536 ; wave barrier global_load_dwordx4 v[17:20], v61, s[2:3] global_load_dwordx4 v[21:24], v61, s[2:3] offset:1024 s_mov_b32 s0, 0xbe8838d4 v_mov_b32_e32 v61, 0xbdb4b1c4 v_mov_b32_e32 v62, 0x3e21ee9e v_mul_f64 v[59:60], v[49:50], v[49:50] s_mov_b32 s1, 0xbda8fae9 v_add_f64 v[29:30], v[27:28], v[35:36] v_fma_f64 v[27:28], v[31:32], v[1:2], v[81:82] v_fma_f64 v[31:32], v[31:32], v[3:4], v[33:34] s_waitcnt lgkmcnt(5) v_add_f64 v[33:34], v[41:42], -v[53:54] s_waitcnt lgkmcnt(1) v_add_f64 v[81:82], v[55:56], -v[47:48] v_add_f64 v[47:48], v[55:56], v[47:48] v_fma_f64 v[61:62], v[59:60], s[0:1], v[61:62] s_mov_b32 s0, 0xf9a43bb8 s_mov_b32 s1, 0x3de5e0b2 v_fma_f64 v[63:64], v[59:60], s[0:1], v[63:64] s_waitcnt lgkmcnt(0) v_add_f64 v[55:56], v[45:46], v[57:58] v_add_f64 v[41:42], v[41:42], v[53:54] s_mov_b32 s2, 0x809c52ad s_mov_b32 s0, 0x796cde01 s_mov_b32 s3, 0xbe927e4f s_mov_b32 s1, 0x3ec71de3 v_fma_f64 v[61:62], v[61:62], v[59:60], s[2:3] v_fma_f64 v[63:64], v[63:64], v[59:60], s[0:1] v_add_f64 v[85:86], v[51:52], v[43:44] v_add_f64 v[43:44], v[51:52], -v[43:44] v_add_f64 v[45:46], v[45:46], -v[57:58] v_add_f64 v[51:52], v[81:82], -v[33:34] v_mul_f64 v[35:36], v[7:8], -v[37:38] v_mul_f64 v[69:70], v[73:74], -v[39:40] v_add_f64 v[57:58], v[81:82], v[33:34] v_add_f64 v[87:88], v[47:48], -v[55:56] s_mov_b32 s2, 0x19cb1590 s_mov_b32 s0, 0x19e83e5c s_mov_b32 s3, 0x3efa01a0 s_mov_b32 s1, 0xbf2a01a0 v_fma_f64 v[61:62], v[61:62], v[59:60], s[2:3] v_fma_f64 v[63:64], v[63:64], v[59:60], s[0:1] v_add_f64 v[89:90], v[43:44], v[45:46] v_add_f64 v[43:44], v[43:44], -v[45:46] v_fma_f64 v[33:34], v[65:66], v[5:6], v[35:36] v_add_f64 v[91:92], v[85:86], -v[41:42] v_fma_f64 v[35:36], v[67:68], v[75:76], v[69:70] v_mul_f64 v[37:38], v[5:6], v[37:38] v_mul_f64 v[39:40], v[39:40], v[75:76] s_mov_b32 s2, 0x11110bb3 s_mov_b32 s3, 0x3f811111 v_fma_f64 v[61:62], v[61:62], v[59:60], s[10:11] v_fma_f64 v[63:64], v[63:64], v[59:60], s[2:3] v_add_f64 v[41:42], v[85:86], v[41:42] v_add_f64 v[47:48], v[47:48], v[55:56] v_fma_f64 v[37:38], v[65:66], v[7:8], v[37:38] v_fma_f64 v[39:40], v[67:68], v[73:74], v[39:40] s_mov_b32 s0, 0x55555555 s_mov_b32 s10, 0x5555554c s_mov_b32 s1, 0xbfc55555 s_mov_b32 s11, 0x3fa55555 v_fma_f64 v[61:62], v[61:62], v[59:60], s[10:11] v_mul_f64 v[71:72], v[49:50], v[59:60] v_fma_f64 v[63:64], v[63:64], v[59:60], s[0:1] v_fma_f64 v[61:62], v[61:62], v[59:60], -0.5 v_fma_f64 v[49:50], v[63:64], v[71:72], v[49:50] v_fma_f64 v[67:68], v[61:62], v[59:60], 1.0 s_waitcnt vmcnt(1) v_mul_f64 v[45:46], v[19:20], -v[51:52] s_waitcnt vmcnt(0) v_mul_f64 v[83:84], v[17:18], v[23:24] v_mul_f64 v[53:54], v[19:20], -v[23:24] v_mul_f64 v[51:52], v[17:18], v[51:52] v_mul_f64 v[69:70], v[87:88], v[21:22] v_fma_f64 v[45:46], v[89:90], v[17:18], v[45:46] v_fma_f64 v[81:82], v[21:22], v[19:20], v[83:84] v_fma_f64 v[83:84], v[21:22], v[17:18], v[53:54] v_mul_f64 v[53:54], v[23:24], -v[87:88] v_fma_f64 v[51:52], v[89:90], v[19:20], v[51:52] v_fma_f64 v[55:56], v[91:92], v[23:24], v[69:70] v_cndmask_b32_e32 v88, v68, v50, vcc v_cndmask_b32_e32 v87, v67, v49, vcc v_mul_f64 v[93:94], v[81:82], -v[57:58] v_mul_f64 v[57:58], v[57:58], v[83:84] v_fma_f64 v[53:54], v[91:92], v[21:22], v[53:54] v_fma_f64 v[85:86], v[43:44], v[83:84], v[93:94] v_fma_f64 v[57:58], v[43:44], v[81:82], v[57:58] ds_write2_b64 v101, v[41:42], v[45:46] offset1:16 ds_write2_b64 v101, v[53:54], v[85:86] offset0:32 offset1:48 ; wave barrier ds_read_b64 v[53:54], v97 ds_read_b64 v[65:66], v98 offset:1536 ds_read2st64_b64 v[41:44], v98 offset0:1 offset1:2 ; wave barrier ds_write2_b64 v101, v[47:48], v[51:52] offset1:16 ds_write2_b64 v101, v[55:56], v[57:58] offset0:32 offset1:48 ; wave barrier ds_read_b64 v[51:52], v97 ds_read_b64 v[55:56], v98 offset:1536 ds_read2st64_b64 v[45:48], v98 offset0:1 offset1:2 ; wave barrier ds_write_b128 v100, v[25:28] ds_write_b128 v100, v[33:36] offset:16 ; wave barrier ds_read_b64 v[33:34], v97 ds_read_b64 v[35:36], v98 offset:1536 ds_read2st64_b64 v[25:28], v98 offset0:1 offset1:2 ; wave barrier ds_write_b128 v100, v[29:32] ds_write_b128 v100, v[37:40] offset:16 ; wave barrier ds_read_b64 v[37:38], v97 ds_read2st64_b64 v[29:32], v98 offset0:1 offset1:2 ds_read_b64 v[39:40], v98 offset:1536 ; wave barrier s_waitcnt lgkmcnt(14) v_add_f64 v[71:72], v[53:54], v[43:44] v_add_f64 v[43:44], v[53:54], -v[43:44] s_waitcnt lgkmcnt(5) v_add_f64 v[57:58], v[25:26], -v[35:36] s_waitcnt lgkmcnt(1) v_add_f64 v[63:64], v[37:38], -v[31:32] v_add_f64 v[31:32], v[37:38], v[31:32] s_waitcnt lgkmcnt(0) v_add_f64 v[37:38], v[29:30], v[39:40] v_add_f64 v[59:60], v[33:34], v[27:28] v_add_f64 v[27:28], v[33:34], -v[27:28] v_add_f64 v[29:30], v[29:30], -v[39:40] v_add_f64 v[25:26], v[25:26], v[35:36] v_cndmask_b32_e64 v86, -v50, -v68, vcc v_add_f64 v[33:34], v[63:64], -v[57:58] v_add_f64 v[39:40], v[63:64], v[57:58] v_add_f64 v[35:36], v[31:32], -v[37:38] v_add_f64 v[31:32], v[31:32], v[37:38] v_cndmask_b32_e32 v85, v49, v67, vcc v_add_f64 v[57:58], v[27:28], v[29:30] v_add_f64 v[27:28], v[27:28], -v[29:30] v_add_f64 v[61:62], v[59:60], -v[25:26] v_mul_f64 v[29:30], v[11:12], -v[33:34] v_mul_f64 v[33:34], v[9:10], v[33:34] v_mul_f64 v[63:64], v[15:16], -v[35:36] v_mul_f64 v[69:70], v[77:78], -v[39:40] v_mul_f64 v[35:36], v[13:14], v[35:36] v_mul_f64 v[39:40], v[79:80], v[39:40] v_add_f64 v[25:26], v[59:60], v[25:26] s_and_b64 vcc, s[8:9], exec v_fma_f64 v[29:30], v[57:58], v[9:10], v[29:30] v_fma_f64 v[33:34], v[57:58], v[11:12], v[33:34] v_fma_f64 v[59:60], v[61:62], v[13:14], v[63:64] v_fma_f64 v[63:64], v[27:28], v[79:80], v[69:70] v_fma_f64 v[35:36], v[61:62], v[15:16], v[35:36] v_fma_f64 v[37:38], v[27:28], v[77:78], v[39:40] ds_write2_b64 v102, v[25:26], v[29:30] offset1:4 ds_write2_b64 v102, v[59:60], v[63:64] offset0:8 offset1:12 ; wave barrier ds_read_b64 v[53:54], v97 ds_read_b64 v[57:58], v98 offset:1536 ds_read2st64_b64 v[25:28], v98 offset0:1 offset1:2 ; wave barrier ds_write2_b64 v102, v[31:32], v[33:34] offset1:4 ds_write2_b64 v102, v[35:36], v[37:38] offset0:8 offset1:12 ; wave barrier ds_read_b64 v[33:34], v97 ds_read2st64_b64 v[29:32], v98 offset0:1 offset1:2 ds_read_b64 v[35:36], v98 offset:1536 v_add_f64 v[39:40], v[41:42], v[65:66] v_add_f64 v[65:66], v[41:42], -v[65:66] s_waitcnt lgkmcnt(5) v_add_f64 v[41:42], v[25:26], -v[57:58] s_waitcnt lgkmcnt(1) v_add_f64 v[59:60], v[33:34], -v[31:32] v_add_f64 v[31:32], v[33:34], v[31:32] s_waitcnt lgkmcnt(0) v_add_f64 v[33:34], v[29:30], v[35:36] v_add_f64 v[37:38], v[51:52], v[47:48] v_add_f64 v[51:52], v[51:52], -v[47:48] v_add_f64 v[47:48], v[53:54], v[27:28] v_add_f64 v[27:28], v[53:54], -v[27:28] v_add_f64 v[29:30], v[29:30], -v[35:36] v_add_f64 v[53:54], v[59:60], -v[41:42] v_add_f64 v[25:26], v[25:26], v[57:58] v_add_f64 v[61:62], v[31:32], -v[33:34] v_add_f64 v[41:42], v[59:60], v[41:42] v_add_f64 v[35:36], v[45:46], v[55:56] v_add_f64 v[45:46], v[45:46], -v[55:56] v_add_f64 v[93:94], v[71:72], v[39:40] v_add_f64 v[55:56], v[27:28], v[29:30] v_add_f64 v[27:28], v[27:28], -v[29:30] v_mul_f64 v[29:30], v[19:20], -v[53:54] v_add_f64 v[57:58], v[71:72], -v[39:40] v_mul_f64 v[39:40], v[17:18], v[53:54] v_add_f64 v[63:64], v[47:48], -v[25:26] v_mul_f64 v[59:60], v[23:24], -v[61:62] v_mul_f64 v[69:70], v[81:82], -v[41:42] v_add_f64 v[25:26], v[47:48], v[25:26] v_mul_f64 v[47:48], v[21:22], v[61:62] v_mul_f64 v[41:42], v[83:84], v[41:42] v_fma_f64 v[29:30], v[55:56], v[17:18], v[29:30] v_add_f64 v[31:32], v[31:32], v[33:34] v_fma_f64 v[33:34], v[55:56], v[19:20], v[39:40] v_fma_f64 v[53:54], v[63:64], v[21:22], v[59:60] v_fma_f64 v[61:62], v[27:28], v[83:84], v[69:70] v_add_f64 v[59:60], v[37:38], -v[35:36] v_fma_f64 v[39:40], v[63:64], v[23:24], v[47:48] v_fma_f64 v[41:42], v[27:28], v[81:82], v[41:42] v_add_f64 v[95:96], v[37:38], v[35:36] ; wave barrier ds_write2_b64 v101, v[25:26], v[29:30] offset1:16 ds_write2_b64 v101, v[53:54], v[61:62] offset0:32 offset1:48 ; wave barrier ds_read_b64 v[35:36], v97 ds_read_b64 v[37:38], v98 offset:1536 ds_read2st64_b64 v[25:28], v98 offset0:1 offset1:2 ; wave barrier ds_write2_b64 v101, v[31:32], v[33:34] offset1:16 ds_write2_b64 v101, v[39:40], v[41:42] offset0:32 offset1:48 ; wave barrier ds_read_b64 v[33:34], v97 ds_read2st64_b64 v[29:32], v98 offset0:1 offset1:2 ds_read_b64 v[39:40], v98 offset:1536 v_add_f64 v[89:90], v[43:44], v[45:46] s_waitcnt lgkmcnt(5) v_add_f64 v[41:42], v[35:36], v[27:28] v_add_f64 v[47:48], v[25:26], v[37:38] s_waitcnt lgkmcnt(1) v_add_f64 v[53:54], v[33:34], v[31:32] s_waitcnt lgkmcnt(0) v_add_f64 v[55:56], v[29:30], v[39:40] v_add_f64 v[27:28], v[35:36], -v[27:28] v_add_f64 v[29:30], v[29:30], -v[39:40] v_add_f64 v[25:26], v[25:26], -v[37:38] v_add_f64 v[31:32], v[33:34], -v[31:32] v_add_f64 v[61:62], v[43:44], -v[45:46] v_add_f64 v[63:64], v[51:52], v[65:66] v_add_f64 v[37:38], v[41:42], v[47:48] v_add_f64 v[39:40], v[53:54], v[55:56] v_add_f64 v[41:42], v[41:42], -v[47:48] v_add_f64 v[43:44], v[53:54], -v[55:56] v_add_f64 v[45:46], v[27:28], -v[29:30] v_add_f64 v[33:34], v[27:28], v[29:30] v_add_f64 v[47:48], v[31:32], v[25:26] v_add_f64 v[35:36], v[31:32], -v[25:26] v_add_f64 v[91:92], v[51:52], -v[65:66] s_cbranch_vccnz .LBB0_2 ; %bb.1: v_xor_b32_e32 v121, 0xff0, v99 v_xor_b32_e32 v122, 0xbf0, v99 v_xor_b32_e32 v123, 0x7f0, v99 v_xor_b32_e32 v120, 0x3f0, v99 ; wave barrier ds_write_b128 v121, v[37:40] ds_write_b128 v122, v[33:36] ds_write_b128 v123, v[41:44] ds_write_b128 v120, v[45:48] ; wave barrier ds_read_b128 v[25:28], v103 ds_read_b128 v[49:52], v103 offset:1024 ds_read_b128 v[29:32], v103 offset:2048 ds_read_b128 v[53:56], v103 offset:3072 ; wave barrier v_add_u32_e32 v105, 0x800, v103 s_waitcnt lgkmcnt(3) v_add_f64 v[65:66], v[93:94], v[25:26] v_add_f64 v[25:26], v[93:94], -v[25:26] v_add_f64 v[67:68], v[95:96], -v[27:28] v_add_f64 v[27:28], v[95:96], v[27:28] v_add_u32_e32 v104, 0xc00, v103 v_add_f64 v[71:72], v[25:26], v[25:26] v_mul_f64 v[69:70], v[27:28], -v[27:28] v_mul_f64 v[71:72], v[27:28], v[71:72] v_fma_f64 v[69:70], v[25:26], v[25:26], v[69:70] v_mul_f64 v[106:107], v[85:86], v[71:72] v_mul_f64 v[71:72], -v[87:88], v[71:72] v_fma_f64 v[106:107], v[69:70], -v[87:88], v[106:107] v_fma_f64 v[69:70], v[69:70], -v[85:86], v[71:72] v_fma_f64 v[71:72], v[67:68], -v[67:68], v[106:107] v_add_f64 v[106:107], v[65:66], v[65:66] v_fma_f64 v[71:72], v[65:66], v[65:66], v[71:72] v_fma_f64 v[69:70], v[106:107], v[67:68], v[69:70] v_mul_f64 v[106:107], v[27:28], -v[67:68] v_fma_f64 v[106:107], v[65:66], v[25:26], v[106:107] v_mul_f64 v[25:26], v[67:68], v[25:26] v_add_f64 v[106:107], v[106:107], v[106:107] v_fma_f64 v[25:26], v[65:66], v[27:28], v[25:26] v_add_f64 v[65:66], v[25:26], v[25:26] v_add_f64 v[25:26], v[71:72], v[106:107] v_add_f64 v[27:28], -v[69:70], -v[65:66] v_add_f64 v[67:68], v[69:70], -v[65:66] s_waitcnt lgkmcnt(1) v_add_f64 v[69:70], v[57:58], v[29:30] v_add_f64 v[29:30], v[57:58], -v[29:30] v_add_f64 v[65:66], v[71:72], -v[106:107] v_add_f64 v[71:72], v[59:60], -v[31:32] v_add_f64 v[31:32], v[59:60], v[31:32] v_add_f64 v[108:109], v[29:30], v[29:30] v_mul_f64 v[106:107], v[31:32], -v[31:32] v_mul_f64 v[108:109], v[31:32], v[108:109] v_fma_f64 v[106:107], v[29:30], v[29:30], v[106:107] v_mul_f64 v[110:111], v[85:86], -v[108:109] v_mul_f64 v[108:109], v[87:88], v[108:109] v_fma_f64 v[110:111], v[106:107], v[87:88], v[110:111] v_fma_f64 v[106:107], v[106:107], v[85:86], v[108:109] v_fma_f64 v[108:109], v[71:72], -v[71:72], v[110:111] v_add_f64 v[110:111], v[69:70], v[69:70] v_fma_f64 v[108:109], v[69:70], v[69:70], v[108:109] v_fma_f64 v[106:107], v[110:111], v[71:72], v[106:107] v_mul_f64 v[110:111], v[31:32], -v[71:72] v_fma_f64 v[110:111], v[69:70], v[29:30], v[110:111] v_mul_f64 v[29:30], v[71:72], v[29:30] v_add_f64 v[110:111], v[110:111], v[110:111] v_fma_f64 v[29:30], v[69:70], v[31:32], v[29:30] v_add_f64 v[69:70], v[29:30], v[29:30] v_add_f64 v[29:30], v[108:109], v[110:111] v_add_f64 v[31:32], -v[106:107], -v[69:70] v_add_f64 v[71:72], v[106:107], -v[69:70] v_add_f64 v[106:107], v[89:90], v[49:50] v_add_f64 v[49:50], v[89:90], -v[49:50] v_add_f64 v[69:70], v[108:109], -v[110:111] v_add_f64 v[108:109], v[91:92], -v[51:52] v_add_f64 v[51:52], v[91:92], v[51:52] v_add_f64 v[112:113], v[49:50], v[49:50] v_mul_f64 v[110:111], v[51:52], -v[51:52] v_mul_f64 v[112:113], v[51:52], v[112:113] v_fma_f64 v[110:111], v[49:50], v[49:50], v[110:111] v_mul_f64 v[114:115], v[87:88], -v[112:113] v_mul_f64 v[112:113], -v[85:86], v[112:113] v_fma_f64 v[114:115], v[110:111], -v[85:86], v[114:115] v_fma_f64 v[110:111], v[110:111], v[87:88], v[112:113] v_fma_f64 v[112:113], v[108:109], -v[108:109], v[114:115] v_add_f64 v[114:115], v[106:107], v[106:107] v_fma_f64 v[112:113], v[106:107], v[106:107], v[112:113] v_fma_f64 v[110:111], v[114:115], v[108:109], v[110:111] v_mul_f64 v[114:115], v[51:52], -v[108:109] v_fma_f64 v[114:115], v[106:107], v[49:50], v[114:115] v_mul_f64 v[49:50], v[108:109], v[49:50] v_add_f64 v[114:115], v[114:115], v[114:115] v_fma_f64 v[49:50], v[106:107], v[51:52], v[49:50] v_add_f64 v[106:107], v[49:50], v[49:50] v_add_f64 v[49:50], v[112:113], v[114:115] v_add_f64 v[51:52], -v[110:111], -v[106:107] v_add_f64 v[108:109], v[110:111], -v[106:107] s_waitcnt lgkmcnt(0) v_add_f64 v[110:111], v[61:62], v[53:54] v_add_f64 v[53:54], v[61:62], -v[53:54] v_add_f64 v[106:107], v[112:113], -v[114:115] v_add_f64 v[112:113], v[63:64], -v[55:56] v_add_f64 v[55:56], v[63:64], v[55:56] v_add_f64 v[116:117], v[53:54], v[53:54] v_mul_f64 v[114:115], v[55:56], -v[55:56] v_mul_f64 v[116:117], v[55:56], v[116:117] v_fma_f64 v[114:115], v[53:54], v[53:54], v[114:115] v_mul_f64 v[118:119], v[87:88], v[116:117] v_mul_f64 v[116:117], v[85:86], v[116:117] v_fma_f64 v[118:119], v[114:115], v[85:86], v[118:119] v_fma_f64 v[114:115], v[114:115], -v[87:88], v[116:117] v_fma_f64 v[116:117], v[112:113], -v[112:113], v[118:119] v_add_f64 v[118:119], v[110:111], v[110:111] v_fma_f64 v[116:117], v[110:111], v[110:111], v[116:117] v_fma_f64 v[114:115], v[118:119], v[112:113], v[114:115] v_mul_f64 v[118:119], v[55:56], -v[112:113] v_fma_f64 v[118:119], v[110:111], v[53:54], v[118:119] v_mul_f64 v[53:54], v[112:113], v[53:54] v_add_f64 v[118:119], v[118:119], v[118:119] v_fma_f64 v[53:54], v[110:111], v[55:56], v[53:54] v_add_f64 v[110:111], v[53:54], v[53:54] v_add_f64 v[53:54], v[116:117], v[118:119] v_add_f64 v[55:56], -v[114:115], -v[110:111] v_add_f64 v[112:113], v[114:115], -v[110:111] v_add_f64 v[110:111], v[116:117], -v[118:119] ds_write_b128 v121, v[65:68] ds_write_b128 v122, v[106:109] ds_write_b128 v123, v[69:72] ds_write_b128 v120, v[110:113] ; wave barrier ds_read_b128 v[65:68], v103 ds_read_b128 v[69:72], v103 offset:1024 s_load_dwordx2 s[0:1], s[4:5], 0x0 s_cbranch_execz .LBB0_3 s_branch .LBB0_8 .LBB0_2: ; implicit-def: $vgpr105 ; implicit-def: $vgpr104 ; implicit-def: $vgpr65_vgpr66_vgpr67_vgpr68 ; implicit-def: $vgpr69_vgpr70_vgpr71_vgpr72 ; implicit-def: $vgpr25_vgpr26_vgpr27_vgpr28 ; implicit-def: $vgpr49_vgpr50_vgpr51_vgpr52 ; implicit-def: $vgpr29_vgpr30_vgpr31_vgpr32 ; implicit-def: $vgpr53_vgpr54_vgpr55_vgpr56 s_load_dwordx2 s[0:1], s[4:5], 0x0 .LBB0_3: v_sub_u32_e32 v25, 0, v0 v_sub_u32_e32 v99, 0, v99 v_and_b32_e32 v25, 0x7f, v25 ; wave barrier ds_write_b128 v99, v[61:64] offset:1024 v_lshlrev_b32_e32 v61, 4, v25 ds_write_b128 v61, v[57:60] ; wave barrier ds_read_b128 v[53:56], v103 ds_read_b128 v[29:32], v103 offset:1024 v_cmp_ne_u32_e32 vcc, 0, v0 ; implicit-def: $vgpr49_vgpr50_vgpr51_vgpr52 ; implicit-def: $vgpr25_vgpr26_vgpr27_vgpr28 s_and_saveexec_b64 s[2:3], vcc s_xor_b64 s[2:3], exec, s[2:3] s_cbranch_execz .LBB0_5 ; %bb.4: s_waitcnt lgkmcnt(0) v_add_f64 v[25:26], v[93:94], -v[53:54] v_add_f64 v[27:28], v[95:96], v[55:56] v_add_f64 v[55:56], v[95:96], -v[55:56] v_add_f64 v[53:54], v[93:94], v[53:54] ; implicit-def: $vgpr95_vgpr96 ; implicit-def: $vgpr93_vgpr94 v_add_f64 v[49:50], v[25:26], v[25:26] v_mul_f64 v[51:52], v[27:28], -v[27:28] v_mul_f64 v[59:60], v[55:56], v[25:26] v_mul_f64 v[62:63], v[27:28], -v[55:56] v_add_f64 v[64:65], v[53:54], v[53:54] v_mul_f64 v[49:50], v[27:28], v[49:50] v_fma_f64 v[51:52], v[25:26], v[25:26], v[51:52] v_fma_f64 v[27:28], v[53:54], v[27:28], v[59:60] v_fma_f64 v[25:26], v[53:54], v[25:26], v[62:63] v_mul_f64 v[57:58], v[85:86], v[49:50] v_mul_f64 v[49:50], -v[87:88], v[49:50] v_fma_f64 v[57:58], v[51:52], -v[87:88], v[57:58] v_fma_f64 v[49:50], v[51:52], -v[85:86], v[49:50] v_fma_f64 v[51:52], v[55:56], -v[55:56], v[57:58] v_add_f64 v[57:58], v[27:28], v[27:28] v_fma_f64 v[49:50], v[64:65], v[55:56], v[49:50] v_add_f64 v[55:56], v[25:26], v[25:26] v_fma_f64 v[53:54], v[53:54], v[53:54], v[51:52] v_add_f64 v[27:28], -v[49:50], -v[57:58] v_add_f64 v[51:52], v[49:50], -v[57:58] v_add_f64 v[25:26], v[53:54], v[55:56] v_add_f64 v[49:50], v[53:54], -v[55:56] ; implicit-def: $vgpr53_vgpr54_vgpr55_vgpr56 .LBB0_5: ; %Flow s_andn2_saveexec_b64 s[2:3], s[2:3] s_cbranch_execz .LBB0_7 ; %bb.6: v_add_f64 v[25:26], v[93:94], -v[95:96] v_add_f64 v[27:28], v[93:94], v[95:96] s_waitcnt lgkmcnt(0) v_add_f64 v[49:50], v[53:54], v[53:54] v_mul_f64 v[51:52], v[55:56], -v[55:56] v_mul_f64 v[25:26], v[25:26], v[25:26] v_mul_f64 v[27:28], v[27:28], v[27:28] v_mul_f64 v[49:50], v[49:50], -v[55:56] v_fma_f64 v[53:54], v[53:54], v[53:54], v[51:52] v_add_f64 v[55:56], v[25:26], -v[27:28] v_add_f64 v[25:26], v[25:26], v[27:28] v_mul_f64 v[51:52], v[49:50], 4.0 v_mul_f64 v[49:50], v[53:54], 4.0 v_add_f64 v[27:28], v[55:56], v[55:56] v_add_f64 v[25:26], v[25:26], v[25:26] .LBB0_7: s_or_b64 exec, exec, s[2:3] s_waitcnt lgkmcnt(0) v_add_f64 v[53:54], v[89:90], -v[29:30] v_add_f64 v[55:56], v[91:92], v[31:32] v_add_f64 v[31:32], v[91:92], -v[31:32] v_add_f64 v[29:30], v[89:90], v[29:30] ; wave barrier s_mov_b32 s2, 0x8da49510 s_mov_b32 s3, 0xbf13bd2c s_mov_b32 s5, 0x3f8921d1 s_mov_b32 s4, 0xfcdec784 v_add_f64 v[57:58], v[53:54], v[53:54] v_mul_f64 v[59:60], v[55:56], -v[55:56] v_mul_f64 v[64:65], v[31:32], v[53:54] v_mul_f64 v[66:67], v[55:56], -v[31:32] v_add_f64 v[68:69], v[29:30], v[29:30] v_mul_f64 v[57:58], v[55:56], v[57:58] v_fma_f64 v[59:60], v[53:54], v[53:54], v[59:60] v_fma_f64 v[55:56], v[29:30], v[55:56], v[64:65] v_fma_f64 v[53:54], v[29:30], v[53:54], v[66:67] v_mul_f64 v[62:63], v[87:88], -v[57:58] v_mul_f64 v[57:58], -v[85:86], v[57:58] v_add_f64 v[64:65], v[53:54], v[53:54] v_fma_f64 v[62:63], v[59:60], -v[85:86], v[62:63] v_fma_f64 v[57:58], v[59:60], v[87:88], v[57:58] v_fma_f64 v[59:60], v[31:32], -v[31:32], v[62:63] v_add_f64 v[62:63], v[55:56], v[55:56] v_fma_f64 v[57:58], v[68:69], v[31:32], v[57:58] v_fma_f64 v[68:69], v[85:86], s[2:3], v[85:86] v_fma_f64 v[59:60], v[29:30], v[29:30], v[59:60] v_add_f64 v[31:32], v[57:58], -v[62:63] v_add_f64 v[29:30], v[59:60], -v[64:65] ds_write_b128 v99, v[29:32] offset:1024 ds_write_b128 v61, v[49:52] ; wave barrier ds_read_b128 v[29:32], v103 ds_read_b128 v[53:56], v103 offset:1024 ; wave barrier ds_write_b128 v99, v[45:48] offset:1008 ds_write_b128 v99, v[41:44] offset:2032 ; wave barrier ds_read_b128 v[41:44], v103 ds_read_b128 v[45:48], v103 offset:1024 v_fma_f64 v[49:50], v[87:88], s[2:3], v[87:88] ; wave barrier s_waitcnt lgkmcnt(1) v_add_f64 v[66:67], v[37:38], -v[41:42] s_waitcnt lgkmcnt(0) v_add_f64 v[51:52], v[33:34], -v[45:46] v_add_f64 v[70:71], v[39:40], v[43:44] v_add_f64 v[37:38], v[37:38], v[41:42] v_add_f64 v[39:40], v[39:40], -v[43:44] v_fma_f64 v[49:50], v[85:86], s[4:5], v[49:50] v_add_f64 v[85:86], v[35:36], v[47:48] s_mov_b32 s5, 0xbf8921d1 v_add_f64 v[91:92], v[66:67], v[66:67] v_add_f64 v[89:90], v[51:52], v[51:52] v_fma_f64 v[68:69], v[87:88], s[4:5], v[68:69] v_mul_f64 v[41:42], v[70:71], -v[70:71] v_add_f64 v[35:36], v[35:36], -v[47:48] v_add_f64 v[33:34], v[33:34], v[45:46] v_mul_f64 v[43:44], v[85:86], -v[85:86] v_mul_f64 v[47:48], v[70:71], -v[39:40] v_mul_f64 v[95:96], v[39:40], v[66:67] v_mul_f64 v[87:88], v[85:86], v[89:90] v_mul_f64 v[89:90], v[70:71], v[91:92] v_fma_f64 v[41:42], v[66:67], v[66:67], v[41:42] v_mul_f64 v[106:107], v[35:36], v[51:52] v_mul_f64 v[108:109], v[85:86], -v[35:36] v_fma_f64 v[43:44], v[51:52], v[51:52], v[43:44] v_add_f64 v[45:46], v[37:38], v[37:38] v_add_f64 v[104:105], v[33:34], v[33:34] v_mul_f64 v[91:92], v[49:50], -v[87:88] v_mul_f64 v[93:94], v[68:69], v[89:90] v_mul_f64 v[89:90], -v[49:50], v[89:90] v_mul_f64 v[87:88], -v[68:69], v[87:88] v_fma_f64 v[70:71], v[37:38], v[70:71], v[95:96] v_fma_f64 v[47:48], v[37:38], v[66:67], v[47:48] v_fma_f64 v[66:67], v[33:34], v[85:86], v[106:107] v_fma_f64 v[91:92], v[43:44], -v[68:69], v[91:92] v_fma_f64 v[93:94], v[41:42], -v[49:50], v[93:94] v_fma_f64 v[41:42], v[41:42], -v[68:69], v[89:90] v_fma_f64 v[43:44], v[43:44], v[49:50], v[87:88] v_fma_f64 v[49:50], v[33:34], v[51:52], v[108:109] v_add_f64 v[70:71], v[70:71], v[70:71] v_add_f64 v[47:48], v[47:48], v[47:48] v_fma_f64 v[51:52], v[35:36], -v[35:36], v[91:92] v_fma_f64 v[68:69], v[39:40], -v[39:40], v[93:94] v_fma_f64 v[39:40], v[45:46], v[39:40], v[41:42] v_add_f64 v[41:42], v[66:67], v[66:67] v_fma_f64 v[43:44], v[104:105], v[35:36], v[43:44] v_add_f64 v[45:46], v[49:50], v[49:50] v_add_f64 v[49:50], v[59:60], v[64:65] v_add_u32_e32 v104, 0x400, v103 v_fma_f64 v[85:86], v[33:34], v[33:34], v[51:52] v_fma_f64 v[87:88], v[37:38], v[37:38], v[68:69] v_add_f64 v[51:52], -v[57:58], -v[62:63] v_add_f64 v[67:68], -v[39:40], -v[70:71] v_add_f64 v[35:36], v[39:40], -v[70:71] v_add_f64 v[71:72], -v[43:44], -v[41:42] v_add_f64 v[39:40], v[43:44], -v[41:42] v_mov_b32_e32 v105, v103 v_add_f64 v[37:38], v[85:86], -v[45:46] v_add_f64 v[65:66], v[87:88], v[47:48] v_add_f64 v[69:70], v[85:86], v[45:46] v_add_f64 v[33:34], v[87:88], -v[47:48] ds_write_b128 v99, v[37:40] offset:1008 ds_write_b128 v99, v[33:36] offset:2032 ; wave barrier .LBB0_8: ; %Flow60 ds_read_b128 v[33:36], v105 ds_read_b128 v[37:40], v104 ; wave barrier s_lshl_b32 s3, s7, 9 s_and_b32 s3, s3, 0x1fe00 s_add_i32 s3, s3, s7 s_waitcnt lgkmcnt(0) v_add_f64 v[41:42], v[67:68], v[35:36] v_add_f64 v[35:36], v[67:68], -v[35:36] v_add_f64 v[43:44], v[69:70], -v[37:38] v_add_f64 v[45:46], v[71:72], v[39:40] v_add_f64 v[47:48], v[65:66], v[33:34] v_add_f64 v[33:34], v[65:66], -v[33:34] v_add_f64 v[39:40], v[71:72], -v[39:40] v_add_f64 v[37:38], v[69:70], v[37:38] s_and_b32 s3, s3, 0xffffff00 s_lshl_b32 s2, s6, 9 v_add_f64 v[57:58], v[35:36], -v[43:44] v_add_f64 v[59:60], v[41:42], -v[45:46] v_add_f64 v[35:36], v[35:36], v[43:44] v_add_f64 v[41:42], v[41:42], v[45:46] v_add_f64 v[63:64], v[33:34], -v[39:40] v_add_f64 v[43:44], v[33:34], v[39:40] v_add_f64 v[61:62], v[47:48], -v[37:38] v_add_f64 v[33:34], v[47:48], v[37:38] v_mul_f64 v[39:40], v[3:4], -v[57:58] v_mul_f64 v[65:66], v[7:8], -v[59:60] v_mul_f64 v[67:68], v[73:74], -v[35:36] v_mul_f64 v[47:48], v[1:2], v[57:58] v_mul_f64 v[57:58], v[5:6], v[59:60] v_mul_f64 v[59:60], v[75:76], v[35:36] s_and_b32 s2, s2, 0x1fe00 s_add_i32 s2, s2, s6 v_fma_f64 v[35:36], v[43:44], v[1:2], v[39:40] v_fma_f64 v[37:38], v[61:62], v[5:6], v[65:66] v_fma_f64 v[39:40], v[63:64], v[75:76], v[67:68] v_fma_f64 v[43:44], v[43:44], v[3:4], v[47:48] v_fma_f64 v[45:46], v[61:62], v[7:8], v[57:58] v_fma_f64 v[47:48], v[63:64], v[73:74], v[59:60] v_add_f64 v[67:68], v[31:32], v[27:28] ds_write_b128 v100, v[33:36] ds_write_b128 v100, v[37:40] offset:16 ; wave barrier ds_read_b64 v[57:58], v97 ds_read2st64_b64 v[33:36], v98 offset0:1 offset1:2 ds_read_b64 v[59:60], v98 offset:1536 ; wave barrier ds_write_b128 v100, v[41:44] ds_write_b128 v100, v[45:48] offset:16 ; wave barrier ds_read_b64 v[41:42], v97 ds_read2st64_b64 v[37:40], v98 offset0:1 offset1:2 ds_read_b64 v[43:44], v98 offset:1536 s_waitcnt lgkmcnt(6) v_add_f64 v[61:62], v[57:58], v[35:36] v_add_f64 v[35:36], v[57:58], -v[35:36] ; wave barrier v_add_f64 v[31:32], v[27:28], -v[31:32] s_waitcnt lgkmcnt(1) v_add_f64 v[45:46], v[41:42], v[39:40] v_add_f64 v[39:40], v[41:42], -v[39:40] v_add_f64 v[41:42], v[33:34], -v[59:60] s_waitcnt lgkmcnt(0) v_add_f64 v[47:48], v[37:38], v[43:44] v_add_f64 v[37:38], v[37:38], -v[43:44] v_add_f64 v[33:34], v[33:34], v[59:60] v_add_f64 v[43:44], v[39:40], -v[41:42] v_add_f64 v[57:58], v[45:46], -v[47:48] v_add_f64 v[39:40], v[39:40], v[41:42] v_add_f64 v[59:60], v[35:36], -v[37:38] v_add_f64 v[35:36], v[35:36], v[37:38] v_add_f64 v[41:42], v[61:62], -v[33:34] v_add_f64 v[33:34], v[61:62], v[33:34] v_add_f64 v[45:46], v[45:46], v[47:48] v_mul_f64 v[37:38], v[11:12], -v[43:44] v_mul_f64 v[63:64], v[15:16], -v[57:58] v_mul_f64 v[65:66], v[77:78], -v[39:40] v_mul_f64 v[43:44], v[9:10], v[43:44] v_mul_f64 v[57:58], v[13:14], v[57:58] v_mul_f64 v[39:40], v[79:80], v[39:40] v_fma_f64 v[37:38], v[35:36], v[9:10], v[37:38] v_fma_f64 v[61:62], v[41:42], v[13:14], v[63:64] v_fma_f64 v[63:64], v[59:60], v[79:80], v[65:66] v_fma_f64 v[43:44], v[35:36], v[11:12], v[43:44] v_fma_f64 v[41:42], v[41:42], v[15:16], v[57:58] v_fma_f64 v[39:40], v[59:60], v[77:78], v[39:40] ds_write2_b64 v102, v[33:34], v[37:38] offset1:4 ds_write2_b64 v102, v[61:62], v[63:64] offset0:8 offset1:12 ; wave barrier ds_read_b64 v[47:48], v97 ds_read2st64_b64 v[33:36], v98 offset0:1 offset1:2 ds_read_b64 v[57:58], v98 offset:1536 ; wave barrier ds_write2_b64 v102, v[45:46], v[43:44] offset1:4 ds_write2_b64 v102, v[41:42], v[39:40] offset0:8 offset1:12 ; wave barrier ds_read_b64 v[41:42], v97 ds_read2st64_b64 v[37:40], v98 offset0:1 offset1:2 ds_read_b64 v[43:44], v98 offset:1536 s_waitcnt lgkmcnt(6) v_add_f64 v[61:62], v[47:48], v[35:36] v_add_f64 v[35:36], v[47:48], -v[35:36] ; wave barrier s_waitcnt lgkmcnt(1) v_add_f64 v[45:46], v[41:42], v[39:40] v_add_f64 v[39:40], v[41:42], -v[39:40] v_add_f64 v[41:42], v[33:34], -v[57:58] s_waitcnt lgkmcnt(0) v_add_f64 v[59:60], v[37:38], v[43:44] v_add_f64 v[37:38], v[37:38], -v[43:44] v_add_f64 v[33:34], v[33:34], v[57:58] v_add_f64 v[43:44], v[39:40], -v[41:42] v_add_f64 v[47:48], v[45:46], -v[59:60] v_add_f64 v[39:40], v[39:40], v[41:42] v_add_f64 v[57:58], v[35:36], -v[37:38] v_add_f64 v[35:36], v[35:36], v[37:38] v_add_f64 v[41:42], v[61:62], -v[33:34] v_add_f64 v[33:34], v[61:62], v[33:34] v_add_f64 v[45:46], v[45:46], v[59:60] v_mul_f64 v[37:38], v[19:20], -v[43:44] v_mul_f64 v[63:64], v[23:24], -v[47:48] v_mul_f64 v[65:66], v[81:82], -v[39:40] v_mul_f64 v[43:44], v[17:18], v[43:44] v_mul_f64 v[47:48], v[21:22], v[47:48] v_mul_f64 v[39:40], v[83:84], v[39:40] v_fma_f64 v[37:38], v[35:36], v[17:18], v[37:38] v_fma_f64 v[61:62], v[41:42], v[21:22], v[63:64] v_fma_f64 v[63:64], v[57:58], v[83:84], v[65:66] v_add_f64 v[65:66], v[49:50], -v[53:54] ds_write2_b64 v101, v[33:34], v[37:38] offset1:16 ds_write2_b64 v101, v[61:62], v[63:64] offset0:32 offset1:48 v_add_f64 v[37:38], v[55:56], v[51:52] v_fma_f64 v[35:36], v[35:36], v[19:20], v[43:44] v_fma_f64 v[41:42], v[41:42], v[23:24], v[47:48] v_add_f64 v[33:34], v[29:30], v[25:26] v_add_f64 v[29:30], v[25:26], -v[29:30] v_add_f64 v[47:48], v[53:54], v[49:50] v_add_f64 v[49:50], v[51:52], -v[55:56] v_add_f64 v[53:54], v[31:32], -v[65:66] v_fma_f64 v[39:40], v[57:58], v[81:82], v[39:40] v_add_f64 v[55:56], v[67:68], -v[37:38] v_add_f64 v[31:32], v[65:66], v[31:32] ; wave barrier ds_read_b64 v[43:44], v97 ds_read2st64_b64 v[25:28], v98 offset0:1 offset1:2 ds_read_b64 v[51:52], v98 offset:1536 ; wave barrier ds_write2_b64 v101, v[45:46], v[35:36] offset1:16 ds_write2_b64 v101, v[41:42], v[39:40] offset0:32 offset1:48 v_add_f64 v[45:46], v[49:50], v[29:30] v_mul_f64 v[35:36], v[3:4], -v[53:54] v_add_f64 v[39:40], v[33:34], -v[47:48] v_add_f64 v[41:42], v[29:30], -v[49:50] v_mul_f64 v[49:50], v[7:8], -v[55:56] v_mul_f64 v[57:58], v[73:74], -v[31:32] v_mul_f64 v[53:54], v[1:2], v[53:54] v_mul_f64 v[55:56], v[5:6], v[55:56] v_mul_f64 v[61:62], v[75:76], v[31:32] v_add_f64 v[29:30], v[47:48], v[33:34] v_fma_f64 v[31:32], v[45:46], v[1:2], v[35:36] v_add_f64 v[1:2], v[37:38], v[67:68] v_fma_f64 v[33:34], v[39:40], v[5:6], v[49:50] v_fma_f64 v[35:36], v[41:42], v[75:76], v[57:58] v_fma_f64 v[3:4], v[45:46], v[3:4], v[53:54] v_fma_f64 v[5:6], v[39:40], v[7:8], v[55:56] v_fma_f64 v[7:8], v[41:42], v[73:74], v[61:62] ; wave barrier ds_read_b64 v[59:60], v97 ds_read2st64_b64 v[37:40], v98 offset0:1 offset1:2 ds_read_b64 v[41:42], v98 offset:1536 ; wave barrier ds_write_b128 v100, v[29:32] ds_write_b128 v100, v[33:36] offset:16 ; wave barrier ds_read_b64 v[33:34], v97 ds_read2st64_b64 v[29:32], v98 offset0:1 offset1:2 ds_read_b64 v[35:36], v98 offset:1536 ; wave barrier ds_write_b128 v100, v[1:4] ds_write_b128 v100, v[5:8] offset:16 ; wave barrier ds_read_b64 v[5:6], v97 ds_read2st64_b64 v[1:4], v98 offset0:1 offset1:2 ds_read_b64 v[7:8], v98 offset:1536 s_waitcnt lgkmcnt(6) v_add_f64 v[49:50], v[33:34], v[31:32] v_add_f64 v[31:32], v[33:34], -v[31:32] ; wave barrier s_waitcnt lgkmcnt(1) v_add_f64 v[45:46], v[5:6], v[3:4] v_add_f64 v[3:4], v[5:6], -v[3:4] v_add_f64 v[5:6], v[29:30], -v[35:36] s_waitcnt lgkmcnt(0) v_add_f64 v[47:48], v[1:2], v[7:8] v_add_f64 v[1:2], v[1:2], -v[7:8] v_add_f64 v[29:30], v[29:30], v[35:36] v_add_f64 v[7:8], v[3:4], -v[5:6] v_add_f64 v[33:34], v[45:46], -v[47:48] v_add_f64 v[3:4], v[3:4], v[5:6] v_add_f64 v[35:36], v[31:32], -v[1:2] v_add_f64 v[1:2], v[31:32], v[1:2] v_add_f64 v[5:6], v[49:50], -v[29:30] v_add_f64 v[29:30], v[49:50], v[29:30] v_add_f64 v[45:46], v[45:46], v[47:48] v_mul_f64 v[31:32], v[11:12], -v[7:8] v_mul_f64 v[7:8], v[9:10], v[7:8] v_mul_f64 v[53:54], v[15:16], -v[33:34] v_mul_f64 v[55:56], v[77:78], -v[3:4] v_mul_f64 v[33:34], v[13:14], v[33:34] v_mul_f64 v[3:4], v[79:80], v[3:4] v_fma_f64 v[9:10], v[1:2], v[9:10], v[31:32] v_fma_f64 v[7:8], v[1:2], v[11:12], v[7:8] v_fma_f64 v[13:14], v[5:6], v[13:14], v[53:54] v_fma_f64 v[31:32], v[35:36], v[79:80], v[55:56] v_fma_f64 v[5:6], v[5:6], v[15:16], v[33:34] v_fma_f64 v[11:12], v[35:36], v[77:78], v[3:4] ds_write2_b64 v102, v[29:30], v[9:10] offset1:4 ds_write2_b64 v102, v[13:14], v[31:32] offset0:8 offset1:12 ; wave barrier ds_read_b64 v[9:10], v97 ds_read2st64_b64 v[1:4], v98 offset0:1 offset1:2 ds_read_b64 v[13:14], v98 offset:1536 ; wave barrier ds_write2_b64 v102, v[45:46], v[7:8] offset1:4 ds_write2_b64 v102, v[5:6], v[11:12] offset0:8 offset1:12 ; wave barrier ds_read_b64 v[11:12], v97 ds_read2st64_b64 v[5:8], v98 offset0:1 offset1:2 ds_read_b64 v[31:32], v98 offset:1536 v_add_f64 v[33:34], v[59:60], v[39:40] v_add_f64 v[29:30], v[59:60], -v[39:40] s_waitcnt lgkmcnt(6) v_add_f64 v[45:46], v[9:10], v[3:4] s_waitcnt lgkmcnt(1) v_add_f64 v[35:36], v[11:12], v[7:8] v_add_f64 v[7:8], v[11:12], -v[7:8] v_add_f64 v[11:12], v[1:2], -v[13:14] s_waitcnt lgkmcnt(0) v_add_f64 v[39:40], v[5:6], v[31:32] v_add_f64 v[3:4], v[9:10], -v[3:4] v_add_f64 v[9:10], v[1:2], v[13:14] v_add_f64 v[1:2], v[5:6], -v[31:32] v_add_f64 v[15:16], v[43:44], v[27:28] v_add_f64 v[27:28], v[43:44], -v[27:28] v_add_f64 v[43:44], v[37:38], v[41:42] v_add_f64 v[13:14], v[7:8], -v[11:12] v_add_f64 v[31:32], v[35:36], -v[39:40] v_add_f64 v[7:8], v[7:8], v[11:12] v_add_f64 v[37:38], v[37:38], -v[41:42] v_add_f64 v[41:42], v[3:4], -v[1:2] v_add_f64 v[47:48], v[3:4], v[1:2] v_add_f64 v[5:6], v[25:26], v[51:52] v_add_f64 v[25:26], v[25:26], -v[51:52] v_mul_f64 v[3:4], v[19:20], -v[13:14] v_mul_f64 v[13:14], v[17:18], v[13:14] v_add_f64 v[11:12], v[45:46], -v[9:10] v_mul_f64 v[49:50], v[23:24], -v[31:32] v_mul_f64 v[51:52], v[81:82], -v[7:8] v_mul_f64 v[31:32], v[21:22], v[31:32] v_mul_f64 v[7:8], v[83:84], v[7:8] v_add_f64 v[9:10], v[45:46], v[9:10] v_fma_f64 v[17:18], v[47:48], v[17:18], v[3:4] v_add_f64 v[35:36], v[35:36], v[39:40] v_fma_f64 v[13:14], v[47:48], v[19:20], v[13:14] v_fma_f64 v[21:22], v[11:12], v[21:22], v[49:50] v_fma_f64 v[45:46], v[41:42], v[83:84], v[51:52] v_fma_f64 v[19:20], v[11:12], v[23:24], v[31:32] v_fma_f64 v[7:8], v[41:42], v[81:82], v[7:8] v_add_f64 v[1:2], v[15:16], v[5:6] v_add_f64 v[3:4], v[33:34], v[43:44] v_add_f64 v[5:6], v[15:16], -v[5:6] ; wave barrier ds_write2_b64 v101, v[9:10], v[17:18] offset1:16 ds_write2_b64 v101, v[21:22], v[45:46] offset0:32 offset1:48 ; wave barrier ds_read_b64 v[23:24], v97 ds_read2st64_b64 v[9:12], v98 offset0:1 offset1:2 ds_read_b64 v[31:32], v98 offset:1536 ; wave barrier ds_write2_b64 v101, v[35:36], v[13:14] offset1:16 ds_write2_b64 v101, v[19:20], v[7:8] offset0:32 offset1:48 ; wave barrier ds_read_b64 v[35:36], v97 ds_read2st64_b64 v[13:16], v98 offset0:1 offset1:2 v_add_f64 v[7:8], v[33:34], -v[43:44] ds_read_b64 v[33:34], v98 offset:1536 s_waitcnt lgkmcnt(6) v_add_f64 v[43:44], v[23:24], -v[11:12] v_add_f64 v[17:18], v[27:28], -v[37:38] v_add_f64 v[21:22], v[27:28], v[37:38] s_waitcnt lgkmcnt(1) v_add_f64 v[27:28], v[35:36], v[15:16] s_waitcnt lgkmcnt(0) v_add_f64 v[41:42], v[13:14], v[33:34] v_add_f64 v[33:34], v[13:14], -v[33:34] v_add_f64 v[39:40], v[9:10], v[31:32] v_add_f64 v[35:36], v[35:36], -v[15:16] v_add_f64 v[31:32], v[9:10], -v[31:32] v_add_f64 v[37:38], v[23:24], v[11:12] v_add_f64 v[19:20], v[29:30], v[25:26] v_add_f64 v[23:24], v[29:30], -v[25:26] v_add_f64 v[11:12], v[27:28], v[41:42] v_add_f64 v[25:26], v[43:44], -v[33:34] v_add_f64 v[29:30], v[43:44], v[33:34] v_or_b32_e32 v33, s3, v0 v_mov_b32_e32 v34, 0 v_add_f64 v[15:16], v[27:28], -v[41:42] v_add_f64 v[27:28], v[35:36], v[31:32] v_add_f64 v[31:32], v[35:36], -v[31:32] v_lshlrev_b64 v[35:36], 4, v[33:34] v_add_f64 v[9:10], v[37:38], v[39:40] v_add_f64 v[13:14], v[37:38], -v[39:40] v_mov_b32_e32 v37, s1 v_add_co_u32_e32 v35, vcc, s0, v35 v_addc_co_u32_e32 v36, vcc, v37, v36, vcc global_store_dwordx4 v[35:36], v[1:4], off s_and_b32 s1, s2, 0xffffff00 v_or_b32_e32 v1, 64, v33 v_mov_b32_e32 v2, v34 v_lshlrev_b64 v[1:2], 4, v[1:2] v_add_co_u32_e32 v1, vcc, s0, v1 v_addc_co_u32_e32 v2, vcc, v37, v2, vcc global_store_dwordx4 v[1:2], v[21:24], off v_or_b32_e32 v1, 0x80, v33 v_mov_b32_e32 v2, v34 v_lshlrev_b64 v[1:2], 4, v[1:2] v_or_b32_e32 v33, 0xc0, v33 v_add_co_u32_e32 v1, vcc, s0, v1 v_addc_co_u32_e32 v2, vcc, v37, v2, vcc global_store_dwordx4 v[1:2], v[5:8], off v_lshlrev_b64 v[1:2], 4, v[33:34] v_or_b32_e32 v33, s1, v0 v_add_co_u32_e32 v1, vcc, s0, v1 v_addc_co_u32_e32 v2, vcc, v37, v2, vcc global_store_dwordx4 v[1:2], v[17:20], off v_lshlrev_b64 v[0:1], 4, v[33:34] v_add_co_u32_e32 v0, vcc, s0, v0 v_addc_co_u32_e32 v1, vcc, v37, v1, vcc global_store_dwordx4 v[0:1], v[9:12], off v_or_b32_e32 v0, 64, v33 v_mov_b32_e32 v1, v34 v_lshlrev_b64 v[0:1], 4, v[0:1] v_add_co_u32_e32 v0, vcc, s0, v0 v_addc_co_u32_e32 v1, vcc, v37, v1, vcc global_store_dwordx4 v[0:1], v[29:32], off v_or_b32_e32 v0, 0x80, v33 v_mov_b32_e32 v1, v34 v_lshlrev_b64 v[0:1], 4, v[0:1] v_or_b32_e32 v33, 0xc0, v33 v_add_co_u32_e32 v0, vcc, s0, v0 v_addc_co_u32_e32 v1, vcc, v37, v1, vcc global_store_dwordx4 v[0:1], v[13:16], off v_lshlrev_b64 v[0:1], 4, v[33:34] v_add_co_u32_e32 v0, vcc, s0, v0 v_addc_co_u32_e32 v1, vcc, v37, v1, vcc global_store_dwordx4 v[0:1], v[25:28], off s_endpgm .section .rodata,#alloc .p2align 6, 0x0 .amdhsa_kernel tailSquare .amdhsa_group_segment_fixed_size 4096 .amdhsa_private_segment_fixed_size 0 .amdhsa_kernarg_size 24 .amdhsa_user_sgpr_count 6 .amdhsa_user_sgpr_private_segment_buffer 1 .amdhsa_user_sgpr_dispatch_ptr 0 .amdhsa_user_sgpr_queue_ptr 0 .amdhsa_user_sgpr_kernarg_segment_ptr 1 .amdhsa_user_sgpr_dispatch_id 0 .amdhsa_user_sgpr_flat_scratch_init 0 .amdhsa_user_sgpr_private_segment_size 0 .amdhsa_uses_dynamic_stack 0 .amdhsa_system_sgpr_private_segment_wavefront_offset 0 .amdhsa_system_sgpr_workgroup_id_x 1 .amdhsa_system_sgpr_workgroup_id_y 0 .amdhsa_system_sgpr_workgroup_id_z 0 .amdhsa_system_sgpr_workgroup_info 0 .amdhsa_system_vgpr_workitem_id 0 .amdhsa_next_free_vgpr 124 .amdhsa_next_free_sgpr 15 .amdhsa_reserve_flat_scratch 0 .amdhsa_reserve_xnack_mask 0 .amdhsa_float_round_mode_32 0 .amdhsa_float_round_mode_16_64 0 .amdhsa_float_denorm_mode_32 3 .amdhsa_float_denorm_mode_16_64 3 .amdhsa_dx10_clamp 1 .amdhsa_ieee_mode 1 .amdhsa_fp16_overflow 0 .amdhsa_exception_fp_ieee_invalid_op 0 .amdhsa_exception_fp_denorm_src 0 .amdhsa_exception_fp_ieee_div_zero 0 .amdhsa_exception_fp_ieee_overflow 0 .amdhsa_exception_fp_ieee_underflow 0 .amdhsa_exception_fp_ieee_inexact 0 .amdhsa_exception_int_div_zero 0 .end_amdhsa_kernel .text .Lfunc_end0: .size tailSquare, .Lfunc_end0-tailSquare ; -- End function .section .AMDGPU.csdata ; Kernel info: ; codeLenInByte = 10484 ; NumSgprs: 17 ; NumVgprs: 124 ; ScratchSize: 0 ; MemoryBound: 0 ; FloatMode: 240 ; IeeeMode: 1 ; LDSByteSize: 4096 bytes/workgroup (compile time only) ; SGPRBlocks: 2 ; VGPRBlocks: 30 ; NumSGPRsForWavesPerEU: 17 ; NumVGPRsForWavesPerEU: 124 ; Occupancy: 2 ; WaveLimiterHint : 1 ; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0 ; COMPUTE_PGM_RSRC2:USER_SGPR: 6 ; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0 ; COMPUTE_PGM_RSRC2:TGID_X_EN: 1 ; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 ; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 ; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 .ident "AMD clang version 17.0.0 (https://github.com/RadeonOpenCompute/llvm-project roc-6.1.3 24193 669db884972e769450470020c06a6f132a8a065b)" .section ".note.GNU-stack" .addrsig .amdgpu_metadata --- amdhsa.kernels: - .args: - .actual_access: write_only .address_space: global .is_restrict: true .name: out .offset: 0 .size: 8 .type_name: 'T2*' .value_kind: global_buffer - .actual_access: read_only .address_space: global .is_const: true .is_restrict: true .name: in .offset: 8 .size: 8 .type_name: 'T2*' .value_kind: global_buffer - .address_space: constant .is_const: true .name: smallTrig .offset: 16 .size: 8 .type_name: 'T2*' .value_kind: global_buffer .group_segment_fixed_size: 4096 .kernarg_segment_align: 8 .kernarg_segment_size: 24 .language: OpenCL C .language_version: - 2 - 0 .max_flat_workgroup_size: 64 .name: tailSquare .private_segment_fixed_size: 0 .reqd_workgroup_size: - 64 - 1 - 1 .sgpr_count: 17 .sgpr_spill_count: 0 .symbol: tailSquare.kd .uses_dynamic_stack: false .vgpr_count: 124 .vgpr_spill_count: 0 .wavefront_size: 64 amdhsa.target: 'amdgcn-amd-amdhsa--gfx906:sramecc+:xnack-' amdhsa.version: - 1 - 2 ... .end_amdgpu_metadata