GenericSchedulerBase::SchedCandidate &Cand,
SchedBoundary &Zone) {
if (Zone.isTop()) {
- if (Cand.SU->getDepth() > Zone.getScheduledLatency()) {
+ // Prefer the candidate with the lesser depth, but only if one of them has
+ // depth greater than the total latency scheduled so far, otherwise either
+ // of them could be scheduled now with no stall.
+ if (std::max(TryCand.SU->getDepth(), Cand.SU->getDepth()) >
+ Zone.getScheduledLatency()) {
if (tryLess(TryCand.SU->getDepth(), Cand.SU->getDepth(),
TryCand, Cand, GenericSchedulerBase::TopDepthReduce))
return true;
TryCand, Cand, GenericSchedulerBase::TopPathReduce))
return true;
} else {
- if (Cand.SU->getHeight() > Zone.getScheduledLatency()) {
+ // Prefer the candidate with the lesser height, but only if one of them has
+ // height greater than the total latency scheduled so far, otherwise either
+ // of them could be scheduled now with no stall.
+ if (std::max(TryCand.SU->getHeight(), Cand.SU->getHeight()) >
+ Zone.getScheduledLatency()) {
if (tryLess(TryCand.SU->getHeight(), Cand.SU->getHeight(),
TryCand, Cand, GenericSchedulerBase::BotHeightReduce))
return true;
; NONE16: fmov s1, wzr
; NONE16: fmov d2, xzr
; NONE16: movi{{(.16b)?}} v3{{(.2d)?}}, #0
-; ZEROFP: ldr h0,{{.*}}
-; ZEROFP: movi v{{[0-3]+}}.2d, #0
-; ZEROFP: movi v{{[0-3]+}}.2d, #0
-; ZEROFP: movi v{{[0-3]+}}.2d, #0
+; ZEROFP-DAG: ldr h0,{{.*}}
+; ZEROFP-DAG: movi v{{[0-3]+}}.2d, #0
+; ZEROFP-DAG: movi v{{[0-3]+}}.2d, #0
+; ZEROFP-DAG: movi v{{[0-3]+}}.2d, #0
; ZERO16: movi v{{[0-3]+}}.2d, #0
; ZERO16: movi v{{[0-3]+}}.2d, #0
; ZERO16: movi v{{[0-3]+}}.2d, #0
; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 offset:1
; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64 offset:2
; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 offset:3
-; SI-NEXT: s_movk_i32 s6, 0xff
+; SI-NEXT: s_movk_i32 s0, 0xff
; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_mov_b64 s[6:7], s[2:3]
; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_and_b32_e32 v1, s6, v2
+; SI-NEXT: v_and_b32_e32 v1, s0, v2
; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_and_b32_e32 v2, s6, v3
+; SI-NEXT: v_and_b32_e32 v2, s0, v3
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v3, s6, v4
+; SI-NEXT: v_and_b32_e32 v3, s0, v4
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v4, s6, v0
+; SI-NEXT: v_and_b32_e32 v4, s0, v0
; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v1
; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v2
; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v3
; SI-NEXT: v_cvt_f32_ubyte0_e32 v3, v4
-; SI-NEXT: s_mov_b64 s[6:7], s[2:3]
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 offset:1
; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64 offset:2
; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 offset:3
-; SI-NEXT: s_movk_i32 s6, 0xff
+; SI-NEXT: s_movk_i32 s0, 0xff
; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_mov_b64 s[6:7], s[2:3]
; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_and_b32_e32 v1, s6, v2
+; SI-NEXT: v_and_b32_e32 v1, s0, v2
; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_and_b32_e32 v2, s6, v3
+; SI-NEXT: v_and_b32_e32 v2, s0, v3
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_and_b32_e32 v3, s6, v4
+; SI-NEXT: v_and_b32_e32 v3, s0, v4
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v4, s6, v0
+; SI-NEXT: v_and_b32_e32 v4, s0, v0
; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v1
; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v2
; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v3
; SI-NEXT: v_cvt_f32_ubyte0_e32 v3, v4
-; SI-NEXT: s_mov_b64 s[6:7], s[2:3]
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; CHECK-NEXT: s_cmp_lg_u32 s4, 0
; CHECK-NEXT: s_cbranch_scc0 BB4_6
; CHECK-NEXT: ; %bb.1: ; %bb2
-; CHECK-NEXT: s_getpc_b64 s[6:7]
-; CHECK-NEXT: s_add_u32 s6, s6, const.ptr@gotpcrel32@lo+4
-; CHECK-NEXT: s_addc_u32 s7, s7, const.ptr@gotpcrel32@hi+4
-; CHECK-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0
-; CHECK-NEXT: s_mov_b32 s4, -1
+; CHECK-NEXT: s_getpc_b64 s[4:5]
+; CHECK-NEXT: s_add_u32 s4, s4, const.ptr@gotpcrel32@lo+4
+; CHECK-NEXT: s_addc_u32 s5, s5, const.ptr@gotpcrel32@hi+4
+; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; CHECK-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, 1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0
+; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v0, s6
-; CHECK-NEXT: v_mov_b32_e32 v1, s7
+; CHECK-NEXT: v_mov_b32_e32 v0, s4
+; CHECK-NEXT: v_mov_b32_e32 v1, s5
; CHECK-NEXT: flat_load_dword v0, v[0:1]
-; CHECK-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, 1
+; CHECK-NEXT: s_mov_b32 s4, -1
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_cmp_gt_f32_e32 vcc, 1.0, v0
; CHECK-NEXT: s_xor_b64 s[8:9], vcc, s[6:7]
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: s_load_dword s4, s[4:5], 0x4
-; CI-NEXT: v_mov_b32_e32 v2, 42
+; CI-NEXT: v_mov_b32_e32 v0, 42
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: v_mov_b32_e32 v0, s2
-; CI-NEXT: v_mov_b32_e32 v3, s4
-; CI-NEXT: ds_inc_rtn_u32 v4, v3, v2
-; CI-NEXT: ds_inc_rtn_u32 v5, v3, v2
-; CI-NEXT: v_mov_b32_e32 v3, s1
-; CI-NEXT: v_mov_b32_e32 v2, s0
-; CI-NEXT: v_mov_b32_e32 v1, s3
+; CI-NEXT: v_mov_b32_e32 v2, s2
+; CI-NEXT: v_mov_b32_e32 v1, s4
+; CI-NEXT: ds_inc_rtn_u32 v4, v1, v0
+; CI-NEXT: ds_inc_rtn_u32 v5, v1, v0
+; CI-NEXT: v_mov_b32_e32 v0, s0
+; CI-NEXT: v_mov_b32_e32 v1, s1
+; CI-NEXT: v_mov_b32_e32 v3, s3
; CI-NEXT: s_waitcnt lgkmcnt(1)
-; CI-NEXT: flat_store_dword v[2:3], v4
+; CI-NEXT: flat_store_dword v[0:1], v4
; CI-NEXT: s_waitcnt lgkmcnt(1)
-; CI-NEXT: flat_store_dword v[0:1], v5
+; CI-NEXT: flat_store_dword v[2:3], v5
; CI-NEXT: s_endpgm
;
; VI-LABEL: nocse_lds_atomic_inc_ret_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_load_dword s4, s[4:5], 0x10
-; VI-NEXT: v_mov_b32_e32 v2, 42
+; VI-NEXT: v_mov_b32_e32 v0, 42
; VI-NEXT: s_mov_b32 m0, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v3, s4
-; VI-NEXT: ds_inc_rtn_u32 v4, v3, v2
-; VI-NEXT: ds_inc_rtn_u32 v5, v3, v2
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v1, s4
+; VI-NEXT: ds_inc_rtn_u32 v4, v1, v0
+; VI-NEXT: ds_inc_rtn_u32 v5, v1, v0
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: s_waitcnt lgkmcnt(1)
-; VI-NEXT: flat_store_dword v[2:3], v4
+; VI-NEXT: flat_store_dword v[0:1], v4
; VI-NEXT: s_waitcnt lgkmcnt(1)
-; VI-NEXT: flat_store_dword v[0:1], v5
+; VI-NEXT: flat_store_dword v[2:3], v5
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: nocse_lds_atomic_inc_ret_i32:
; GFX7-NEXT: s_mov_b64 s[0:1], s[6:7]
; GFX7-NEXT: buffer_load_dword v3, v[1:2], s[0:3], 0 addr64
; GFX7-NEXT: buffer_load_dword v4, v[1:2], s[0:3], 0 addr64 offset:4
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX7-NEXT: buffer_load_dword v0, v[1:2], s[0:3], 0 addr64 offset:8
+; GFX7-NEXT: buffer_load_dword v1, v[1:2], s[0:3], 0 addr64 offset:8
; GFX7-NEXT: s_cmp_lg_u32 s8, 0
-; GFX7-NEXT: s_cselect_b32 s6, 1, 0
-; GFX7-NEXT: s_and_b32 s0, 1, s6
+; GFX7-NEXT: s_cselect_b32 s0, 1, 0
+; GFX7-NEXT: s_and_b32 s0, 1, s0
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX7-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_and_b64 vcc, vcc, s[0:1]
; GFX7-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_div_fmas_f32 v0, v3, v4, v0
+; GFX7-NEXT: v_div_fmas_f32 v0, v3, v4, v1
; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:8
; GFX7-NEXT: s_endpgm
;
define amdgpu_kernel void @simplify_bfe_u32_multi_use_arg(i32 addrspace(1)* %out0,
; GFX6-LABEL: simplify_bfe_u32_multi_use_arg:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
-; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb
-; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
+; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd
; GFX6-NEXT: s_mov_b32 s6, -1
; GFX6-NEXT: s_mov_b32 s7, 0xf000
-; GFX6-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0
+; GFX6-NEXT: s_load_dword s8, s[2:3], 0x0
+; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
+; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
+; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: s_and_b32 s0, s0, 63
-; GFX6-NEXT: s_bfe_u32 s1, s0, 0x20002
-; GFX6-NEXT: v_mov_b32_e32 v1, s1
-; GFX6-NEXT: v_mov_b32_e32 v0, s0
+; GFX6-NEXT: s_and_b32 s8, s8, 63
+; GFX6-NEXT: s_bfe_u32 s9, s8, 0x20002
+; GFX6-NEXT: v_mov_b32_e32 v1, s9
+; GFX6-NEXT: v_mov_b32_e32 v0, s8
; GFX6-NEXT: buffer_store_dword v1, off, s[4:7], 0
-; GFX6-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT: s_endpgm
i32 addrspace(1)* %out1,
i32 addrspace(1)* %in) #0 {
; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NOUNALIGNED-NEXT: v_add_co_u32_e32 v2, vcc, 11, v0
; GFX9-NOUNALIGNED-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
-; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v6, v[2:3], off offset:-6
-; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v7, v[2:3], off offset:-5
-; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v8, v[2:3], off offset:-4
-; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v9, v[2:3], off offset:-3
-; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v10, v[2:3], off offset:-2
-; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v11, v[2:3], off offset:-1
-; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v12, v[2:3], off
+; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v4, v[2:3], off offset:-6
+; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v5, v[2:3], off offset:-5
+; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v6, v[2:3], off offset:-4
+; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v7, v[2:3], off offset:-3
+; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v8, v[2:3], off offset:-2
+; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v9, v[2:3], off offset:-1
+; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v10, v[2:3], off
; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v0, v[0:1], off
; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v1, v[2:3], off offset:-10
-; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v13, v[2:3], off offset:-9
-; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v14, v[2:3], off offset:-8
+; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v11, v[2:3], off offset:-9
+; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v12, v[2:3], off offset:-8
; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v2, v[2:3], off offset:-7
-; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v4, 0xff
+; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v3, 0xff
; GFX9-NOUNALIGNED-NEXT: s_movk_i32 s4, 0xff
+; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v13, 8
; GFX9-NOUNALIGNED-NEXT: s_mov_b32 s5, 8
-; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v5, 8
; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(11)
-; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v6, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v4, v13, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(10)
-; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v7, v7, v4
+; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v5, v5, v3
; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(9)
-; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v8, v8, v4
+; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v6, v6, v3
; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(7)
-; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v5, v5, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v8, v13, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(6)
-; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v10, v11, v4
+; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v9, v9, v3
; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(5)
-; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v11, v12, v4
+; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v10, v10, v3
; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(3)
; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v3, s4, v13
+; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v11, s4, v11
; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v13, s4, v14
+; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v12, s4, v12
; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v2, v2, v4, v6
+; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v2, v2, v3, v4
; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v0, s4, v1
-; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v3
-; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v3, 24, v13
-; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v6, 16, v7
-; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v7, 24, v8
-; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v4, v9, v4, v5
-; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v5, 16, v10
-; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v8, 24, v11
-; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v0, v0, v1, v3
-; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v1, v2, v6, v7
-; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v2, v4, v5, v8
+; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v11
+; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v11, 24, v12
+; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v4, 16, v5
+; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v5, 24, v6
+; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v3, v7, v3, v8
+; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v6, 16, v9
+; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v7, 24, v10
+; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v0, v0, v1, v11
+; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v1, v2, v4, v5
+; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v2, v3, v6, v7
; GFX9-NOUNALIGNED-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-UNALIGNED-LABEL: v_load_constant_v3i32_align1:
; GFX7-NOUNALIGNED-NEXT: s_mov_b32 s6, 0
; GFX7-NOUNALIGNED-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NOUNALIGNED-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v3, v[0:1], s[4:7], 0 addr64 offset:5
-; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v4, v[0:1], s[4:7], 0 addr64 offset:6
-; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v5, v[0:1], s[4:7], 0 addr64 offset:7
-; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v6, v[0:1], s[4:7], 0 addr64 offset:8
-; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v7, v[0:1], s[4:7], 0 addr64 offset:9
-; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v8, v[0:1], s[4:7], 0 addr64 offset:10
-; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v9, v[0:1], s[4:7], 0 addr64 offset:11
-; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v10, v[0:1], s[4:7], 0 addr64
-; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v11, v[0:1], s[4:7], 0 addr64 offset:1
-; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v12, v[0:1], s[4:7], 0 addr64 offset:2
-; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v13, v[0:1], s[4:7], 0 addr64 offset:3
-; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:4
-; GFX7-NOUNALIGNED-NEXT: v_mov_b32_e32 v2, 0xff
-; GFX7-NOUNALIGNED-NEXT: s_movk_i32 s8, 0xff
+; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v2, v[0:1], s[4:7], 0 addr64
+; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v3, v[0:1], s[4:7], 0 addr64 offset:1
+; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v4, v[0:1], s[4:7], 0 addr64 offset:2
+; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v5, v[0:1], s[4:7], 0 addr64 offset:3
+; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v6, v[0:1], s[4:7], 0 addr64 offset:4
+; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v7, v[0:1], s[4:7], 0 addr64 offset:5
+; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v8, v[0:1], s[4:7], 0 addr64 offset:6
+; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v9, v[0:1], s[4:7], 0 addr64 offset:7
+; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v10, v[0:1], s[4:7], 0 addr64 offset:8
+; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v11, v[0:1], s[4:7], 0 addr64 offset:9
+; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v12, v[0:1], s[4:7], 0 addr64 offset:10
+; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:11
+; GFX7-NOUNALIGNED-NEXT: v_mov_b32_e32 v1, 0xff
+; GFX7-NOUNALIGNED-NEXT: s_movk_i32 s4, 0xff
; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(11)
-; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v3, v3, v2
+; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v2, s4, v2
; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(10)
-; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v4, v4, v2
+; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v3, s4, v3
; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(9)
-; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v5, v5, v2
+; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v4, s4, v4
; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(8)
-; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v6, v6, v2
+; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v5, s4, v5
; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(7)
-; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v7, v7, v2
+; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v6, s4, v6
; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(6)
-; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v8, v8, v2
+; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v7, v7, v1
; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(5)
-; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v2, v9, v2
+; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v8, v8, v1
; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4)
-; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v1, s8, v10
+; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v9, v9, v1
; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(3)
-; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v10, s8, v11
+; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v10, v10, v1
; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2)
-; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v11, s8, v12
-; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v9, 8, v10
+; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v11, v11, v1
+; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v12, v12, v1
; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v0, s8, v0
-; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v3, 8, v3
-; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v7, 8, v7
+; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v0, v0, v1
+; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 8, v3
+; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v4, 24, v5
+; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v5, 8, v7
+; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v7, 16, v8
+; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v8, 24, v9
+; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v9, 8, v11
+; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v11, 16, v12
+; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v12, 24, v0
+; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v0, v2, v1
+; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v1, v6, v5
+; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v2, v10, v9
; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v0, v0, v3
-; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v12, s8, v13
-; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v10, 16, v11
-; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v1, v1, v9
-; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v8, 16, v8
-; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v3, v6, v7
-; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v11, 24, v12
-; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v5, 24, v5
-; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 24, v2
-; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v3, v3, v8
-; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v1, v1, v10
-; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v4, v0, v4
-; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v0, v1, v11
-; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v1, v4, v5
-; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v2, v3, v2
+; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v1, v1, v7
+; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v2, v2, v11
+; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v0, v0, v4
+; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v1, v1, v8
+; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v2, v2, v12
; GFX7-NOUNALIGNED-NEXT: s_setpc_b64 s[30:31]
%load = load <3 x i32>, <3 x i32> addrspace(4)* %ptr, align 1
ret <3 x i32> %load
; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NOUNALIGNED-NEXT: v_add_co_u32_e32 v2, vcc, 10, v0
; GFX9-NOUNALIGNED-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
-; GFX9-NOUNALIGNED-NEXT: global_load_ushort v5, v[2:3], off
+; GFX9-NOUNALIGNED-NEXT: global_load_ushort v4, v[2:3], off
; GFX9-NOUNALIGNED-NEXT: global_load_ushort v0, v[0:1], off
; GFX9-NOUNALIGNED-NEXT: global_load_ushort v1, v[2:3], off offset:-8
-; GFX9-NOUNALIGNED-NEXT: global_load_ushort v6, v[2:3], off offset:-6
-; GFX9-NOUNALIGNED-NEXT: global_load_ushort v7, v[2:3], off offset:-4
+; GFX9-NOUNALIGNED-NEXT: global_load_ushort v5, v[2:3], off offset:-6
+; GFX9-NOUNALIGNED-NEXT: global_load_ushort v6, v[2:3], off offset:-4
; GFX9-NOUNALIGNED-NEXT: global_load_ushort v2, v[2:3], off offset:-2
-; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v4, 0xffff
+; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v3, 0xffff
; GFX9-NOUNALIGNED-NEXT: s_mov_b32 s4, 0xffff
; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(5)
-; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v5, v5, v4
-; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v4, v4, v3
+; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(3)
; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v1, s4, v1
; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v3, v7, v4
-; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v6, v6, v3
+; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v0, s4, v1
-; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v1, v6, v4, v3
+; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v1, v5, v3, v6
; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v2, v2, v4, v5
+; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v2, v2, v3, v4
; GFX9-NOUNALIGNED-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-UNALIGNED-LABEL: v_load_constant_v3i32_align2:
; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:4
; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:6
; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:8
-; GFX7-NOUNALIGNED-NEXT: s_mov_b32 s8, 0xffff
+; GFX7-NOUNALIGNED-NEXT: s_mov_b32 s4, 0xffff
; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4)
-; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v1, s8, v3
+; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v1, s4, v3
; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(3)
-; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v3, s8, v4
+; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v3, s4, v4
; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2)
-; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v4, s8, v5
+; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v4, s4, v5
; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v5, s8, v6
+; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v5, s4, v6
; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v6, s8, v0
-; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v0, s8, v2
+; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v6, s4, v0
+; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v0, s4, v2
; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v3, 16, v5
; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v5, 16, v0
; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v16, s2
; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 9
; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0
-; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v20, v[10:11], off
-; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v12, v[12:13], off
-; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v11, s3
-; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v10, s2
+; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v19, s3
+; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v18, s2
; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 10
; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0
; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NOUNALIGNED-NEXT: s_add_u32 s0, s0, 11
-; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v13, v[14:15], off
-; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v14, v[16:17], off
-; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v15, v[10:11], off
+; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v20, v[10:11], off
+; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v21, v[12:13], off
+; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v14, v[14:15], off
+; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v15, v[16:17], off
+; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v16, v[18:19], off
+; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s1, s1, 0
; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v11, s3
+; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v13, s1
; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v10, s2
-; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s1, s1, 0
-; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v16, v[10:11], off
-; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v11, s1
-; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v10, s0
+; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v12, s0
; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v10, v[10:11], off
+; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v11, v[12:13], off
; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v0, v[0:1], off
; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v1, v[2:3], off
; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v2, v[4:5], off
; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v3, v[6:7], off
; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v4, v[8:9], off
-; GFX9-NOUNALIGNED-NEXT: s_mov_b32 s5, 8
-; GFX9-NOUNALIGNED-NEXT: s_movk_i32 s4, 0xff
-; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v18, 0xff
-; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v19, 8
+; GFX9-NOUNALIGNED-NEXT: s_mov_b32 s1, 8
+; GFX9-NOUNALIGNED-NEXT: s_movk_i32 s0, 0xff
+; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v5, 0xff
+; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v6, 8
; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(3)
-; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v0, s4, v1
+; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v1, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v0, s0, v1
; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v1, s4, v2
+; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v1, s0, v2
; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v2, s4, v3
+; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v2, s0, v3
; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 24, v2
; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v0, v0, v1, v2
-; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v1, v12, v18
-; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v2, v13, v18
+; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v1, v21, v5
+; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v2, v14, v5
; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s0, v0
-; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v0, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v0, v6, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v4, v18, v0
+; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v4, v5, v0
; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 24, v2
; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v1, v0, v1, v2
-; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v0, v19, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v2, v10, v18
+; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v0, v6, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v2, v11, v5
; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s1, v1
-; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v1, v16, v18
-; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v14, v18, v0
+; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v1, v10, v5
+; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v15, v5, v0
; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 24, v2
; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v2, v0, v1, v2
; GFX7-NOUNALIGNED: ; %bb.0:
; GFX7-NOUNALIGNED-NEXT: s_mov_b32 s2, -1
; GFX7-NOUNALIGNED-NEXT: s_mov_b32 s3, 0xf000
-; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 offset:5
-; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v2, off, s[0:3], 0 offset:6
-; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v3, off, s[0:3], 0 offset:7
-; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v4, off, s[0:3], 0 offset:8
-; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v5, off, s[0:3], 0 offset:9
-; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v6, off, s[0:3], 0 offset:10
-; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v7, off, s[0:3], 0 offset:11
-; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v8, off, s[0:3], 0
-; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v9, off, s[0:3], 0 offset:1
-; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v10, off, s[0:3], 0 offset:2
-; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v11, off, s[0:3], 0 offset:3
-; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v12, off, s[0:3], 0 offset:4
-; GFX7-NOUNALIGNED-NEXT: v_mov_b32_e32 v0, 0xff
-; GFX7-NOUNALIGNED-NEXT: s_movk_i32 s4, 0xff
+; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v0, off, s[0:3], 0
+; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 offset:1
+; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v2, off, s[0:3], 0 offset:2
+; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v3, off, s[0:3], 0 offset:3
+; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v4, off, s[0:3], 0 offset:4
+; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v5, off, s[0:3], 0 offset:5
+; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v6, off, s[0:3], 0 offset:6
+; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v7, off, s[0:3], 0 offset:7
+; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v8, off, s[0:3], 0 offset:8
+; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v9, off, s[0:3], 0 offset:9
+; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v10, off, s[0:3], 0 offset:10
+; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v11, off, s[0:3], 0 offset:11
+; GFX7-NOUNALIGNED-NEXT: v_mov_b32_e32 v12, 0xff
+; GFX7-NOUNALIGNED-NEXT: s_movk_i32 s0, 0xff
; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(11)
-; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v1, v1, v0
+; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v0, s0, v0
; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(10)
-; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v2, v2, v0
+; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v1, s0, v1
; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(9)
-; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v3, v3, v0
-; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(8)
-; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v4, v4, v0
+; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v2, s0, v2
+; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 8, v1
; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(7)
-; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v5, v5, v0
+; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v4, s0, v4
; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(6)
-; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v6, v6, v0
+; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v5, v5, v12
+; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(5)
+; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v6, v6, v12
; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v5, 8, v5
-; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4)
-; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v8, s4, v8
; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(3)
-; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v9, s4, v9
-; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v0, v7, v0
+; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v8, v8, v12
; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2)
-; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v10, s4, v10
-; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v7, 8, v9
+; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v9, v9, v12
; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v11, s4, v11
+; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v10, v10, v12
+; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v9, 8, v9
+; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v3, s0, v3
+; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v7, v7, v12
; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v12, s4, v12
-; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 8, v1
-; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v9, 16, v10
-; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v10, 24, v11
-; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v11, 24, v0
-; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v0, v8, v7
+; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v11, v11, v12
+; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v0, v0, v1
; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v1, v12, v1
+; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v1, v4, v5
; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v4, v4, v5
-; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v0, v0, v9
-; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v2, v4, v6
+; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v4, v8, v9
+; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v0, v0, v2
; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v3, 24, v3
-; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v0, v0, v10
-; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v1, v1, v3
+; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v7, 24, v7
+; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v1, v1, v6
+; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v11, 24, v11
+; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v2, v4, v10
+; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v0, v0, v3
+; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v1, v1, v7
; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v2, v2, v11
; GFX7-NOUNALIGNED-NEXT: v_readfirstlane_b32 s0, v0
; GFX7-NOUNALIGNED-NEXT: v_readfirstlane_b32 s1, v1
; GFX9-NOUNALIGNED-NEXT: global_load_ushort v2, v[4:5], off
; GFX9-NOUNALIGNED-NEXT: global_load_ushort v3, v[6:7], off
; GFX9-NOUNALIGNED-NEXT: global_load_ushort v4, v[8:9], off
-; GFX9-NOUNALIGNED-NEXT: s_mov_b32 s4, 0xffff
-; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v12, 0xffff
+; GFX9-NOUNALIGNED-NEXT: s_mov_b32 s0, 0xffff
+; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v5, 0xffff
; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(3)
-; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v1, s4, v1
+; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v1, s0, v1
; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v0, s4, v1
+; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v0, s0, v1
; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v0, v3, v12
+; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v0, v3, v5
; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v1, v2, v12, v0
-; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v0, v10, v12
+; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v1, v2, v5, v0
+; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v0, v10, v5
; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v2, v4, v12, v0
+; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v2, v4, v5, v0
; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s1, v1
; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s2, v2
; GFX9-NOUNALIGNED-NEXT: ; return to shader part epilog
; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v3, off, s[0:3], 0 offset:4
; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v4, off, s[0:3], 0 offset:6
; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v5, off, s[0:3], 0 offset:8
-; GFX7-NOUNALIGNED-NEXT: s_mov_b32 s4, 0xffff
+; GFX7-NOUNALIGNED-NEXT: s_mov_b32 s0, 0xffff
; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(5)
-; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v0, s4, v0
+; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v0, s0, v0
; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4)
-; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v1, s4, v1
+; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v1, s0, v1
; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(3)
-; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v2, s4, v2
+; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v2, s0, v2
; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2)
-; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v3, s4, v3
+; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v3, s0, v3
; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v4, s4, v4
+; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v4, s0, v4
; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v5, s4, v5
+; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v5, s0, v5
; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v6, 16, v0
; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: s_add_u32 s4, s4, gv3@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, gv3@gotpcrel32@hi+4
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_mov_b32 s0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_getpc_b64 s[0:1]
; GFX9-NEXT: s_add_u32 s0, s0, gv0@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s1, s1, gv0@gotpcrel32@hi+4
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX9-NEXT: s_getpc_b64 s[2:3]
; GFX9-NEXT: s_add_u32 s2, s2, gv1@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s3, s3, gv1@gotpcrel32@hi+4
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: v_mov_b32_e32 v3, 1
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b64 s[4:5], 0
; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: s_mov_b32 s8, 0
+; GFX6-NEXT: s_mov_b32 s4, 0
; GFX6-NEXT: v_mov_b32_e32 v1, 0
-; GFX6-NEXT: v_mov_b32_e32 v2, s8
+; GFX6-NEXT: v_mov_b32_e32 v2, s4
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%load = load i32, i32 addrspace(1)* %ptr
; GFX9-NEXT: s_mov_b32 s4, 0
; GFX9-NEXT: s_mov_b32 s5, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-NEXT: v_mov_b32_e32 v2, s5
-; GFX9-NEXT: v_mov_b32_e32 v3, s4
+; GFX9-NEXT: v_mov_b32_e32 v2, s4
+; GFX9-NEXT: v_mov_b32_e32 v3, s5
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-NEXT: s_mov_b32 s4, 0
; GFX8-NEXT: s_mov_b32 s5, 0
; GFX8-NEXT: v_mov_b32_e32 v1, 0
-; GFX8-NEXT: v_mov_b32_e32 v2, s5
-; GFX8-NEXT: v_mov_b32_e32 v3, s4
+; GFX8-NEXT: v_mov_b32_e32 v2, s4
+; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b64 s[4:5], 0
; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: s_mov_b32 s8, 0
; GFX6-NEXT: s_mov_b32 s4, 0
+; GFX6-NEXT: s_mov_b32 s5, 0
; GFX6-NEXT: v_mov_b32_e32 v1, 0
; GFX6-NEXT: v_mov_b32_e32 v2, s4
-; GFX6-NEXT: v_mov_b32_e32 v3, s8
+; GFX6-NEXT: v_mov_b32_e32 v3, s5
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%load = load i32, i32 addrspace(1)* %ptr
; GCN-LABEL: {{^}}v_test_add_v2i16_inline_neg1:
; GFX9: v_pk_sub_u16 v{{[0-9]+}}, v{{[0-9]+}}, 1 op_sel_hi:[1,0]{{$}}
-; VI: v_mov_b32_e32 v[[SCONST:[0-9]+]], -1
-; VI: flat_load_dword [[LOAD:v[0-9]+]]
+; VI-DAG: v_mov_b32_e32 v[[SCONST:[0-9]+]], -1
+; VI-DAG: flat_load_dword [[LOAD:v[0-9]+]]
; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, [[LOAD]], v[[SCONST]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, -1, [[LOAD]]
; VI: v_or_b32_e32
;
; GCN-LABEL: udiv_i64_pow2k_denom:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s0, s4
-; GCN-NEXT: s_mov_b32 s1, s5
-; GCN-NEXT: s_lshr_b64 s[4:5], s[6:7], 12
-; GCN-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NEXT: v_mov_b32_e32 v1, s5
-; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT: s_mov_b32 s4, s0
+; GCN-NEXT: s_mov_b32 s5, s1
+; GCN-NEXT: s_lshr_b64 s[0:1], s[2:3], 12
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GCN-NEXT: s_endpgm
%r = udiv i64 %x, 4096
store i64 %r, i64 addrspace(1)* %out
;
; GCN-LABEL: sdiv_i64_pow2k_denom:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s0, s4
-; GCN-NEXT: s_ashr_i32 s4, s7, 31
-; GCN-NEXT: s_lshr_b32 s4, s4, 20
-; GCN-NEXT: s_add_u32 s4, s6, s4
-; GCN-NEXT: s_mov_b32 s1, s5
-; GCN-NEXT: s_addc_u32 s5, s7, 0
-; GCN-NEXT: s_ashr_i64 s[4:5], s[4:5], 12
-; GCN-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NEXT: v_mov_b32_e32 v1, s5
-; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT: s_mov_b32 s4, s0
+; GCN-NEXT: s_ashr_i32 s0, s3, 31
+; GCN-NEXT: s_lshr_b32 s0, s0, 20
+; GCN-NEXT: s_add_u32 s0, s2, s0
+; GCN-NEXT: s_mov_b32 s5, s1
+; GCN-NEXT: s_addc_u32 s1, s3, 0
+; GCN-NEXT: s_ashr_i64 s[0:1], s[0:1], 12
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GCN-NEXT: s_endpgm
%r = sdiv i64 %x, 4096
store i64 %r, i64 addrspace(1)* %out
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64
-; SI-NEXT: s_mov_b32 s6, 0xff00ff
-; SI-NEXT: s_mov_b32 s8, 0xf0f0f0f
-; SI-NEXT: s_mov_b32 s9, 0xf0f0f0f0
-; SI-NEXT: s_mov_b32 s10, 0x33333333
-; SI-NEXT: s_mov_b32 s11, 0xcccccccc
-; SI-NEXT: s_mov_b32 s0, 0x55555555
-; SI-NEXT: s_mov_b32 s1, 0xaaaaaaaa
+; SI-NEXT: s_mov_b32 s0, 0xff00ff
+; SI-NEXT: s_mov_b32 s1, 0xf0f0f0f
+; SI-NEXT: s_mov_b32 s2, 0xf0f0f0f0
+; SI-NEXT: s_mov_b32 s3, 0x33333333
+; SI-NEXT: s_mov_b32 s6, 0xcccccccc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_alignbit_b32 v2, v0, v0, 8
; SI-NEXT: v_alignbit_b32 v0, v0, v0, 24
; SI-NEXT: v_alignbit_b32 v3, v1, v1, 8
; SI-NEXT: v_alignbit_b32 v1, v1, v1, 24
-; SI-NEXT: v_bfi_b32 v2, s6, v0, v2
-; SI-NEXT: v_bfi_b32 v4, s6, v1, v3
-; SI-NEXT: v_and_b32_e32 v1, s8, v2
-; SI-NEXT: v_and_b32_e32 v0, s8, v4
-; SI-NEXT: v_and_b32_e32 v3, s9, v2
-; SI-NEXT: v_and_b32_e32 v2, s9, v4
+; SI-NEXT: v_bfi_b32 v2, s0, v0, v2
+; SI-NEXT: v_bfi_b32 v4, s0, v1, v3
+; SI-NEXT: v_and_b32_e32 v1, s1, v2
+; SI-NEXT: v_and_b32_e32 v0, s1, v4
+; SI-NEXT: v_and_b32_e32 v3, s2, v2
+; SI-NEXT: v_and_b32_e32 v2, s2, v4
; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 4
; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 4
-; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_mov_b32 s0, 0x55555555
; SI-NEXT: v_or_b32_e32 v3, v3, v1
; SI-NEXT: v_or_b32_e32 v2, v2, v0
-; SI-NEXT: v_and_b32_e32 v1, s10, v3
-; SI-NEXT: v_and_b32_e32 v0, s10, v2
-; SI-NEXT: v_and_b32_e32 v3, s11, v3
-; SI-NEXT: v_and_b32_e32 v2, s11, v2
+; SI-NEXT: v_and_b32_e32 v1, s3, v3
+; SI-NEXT: v_and_b32_e32 v0, s3, v2
+; SI-NEXT: v_and_b32_e32 v3, s6, v3
+; SI-NEXT: v_and_b32_e32 v2, s6, v2
; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2
; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 2
+; SI-NEXT: s_mov_b32 s1, 0xaaaaaaaa
; SI-NEXT: v_or_b32_e32 v3, v3, v1
; SI-NEXT: v_or_b32_e32 v2, v2, v0
; SI-NEXT: v_and_b32_e32 v1, s0, v3
; SI-NEXT: v_and_b32_e32 v2, s1, v2
; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 1
; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 1
+; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: v_or_b32_e32 v1, v3, v1
; SI-NEXT: v_or_b32_e32 v0, v2, v0
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; FLAT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; FLAT-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; FLAT-NEXT: s_mov_b32 s6, 0x10203
-; FLAT-NEXT: s_mov_b32 s2, 0x33333333
-; FLAT-NEXT: s_mov_b32 s3, 0xcccccccc
+; FLAT-NEXT: s_mov_b32 s2, 0xf0f0f0f0
+; FLAT-NEXT: s_mov_b32 s3, 0x33333333
+; FLAT-NEXT: s_mov_b32 s6, 0xcccccccc
; FLAT-NEXT: s_waitcnt lgkmcnt(0)
; FLAT-NEXT: v_mov_b32_e32 v1, s1
; FLAT-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; FLAT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; FLAT-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; FLAT-NEXT: s_mov_b32 s0, 0xf0f0f0f
-; FLAT-NEXT: s_mov_b32 s1, 0xf0f0f0f0
+; FLAT-NEXT: s_mov_b32 s0, 0x10203
+; FLAT-NEXT: s_mov_b32 s1, 0xf0f0f0f
; FLAT-NEXT: s_mov_b32 s7, 0xf000
; FLAT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; FLAT-NEXT: v_perm_b32 v2, 0, v0, s6
-; FLAT-NEXT: v_perm_b32 v4, 0, v1, s6
-; FLAT-NEXT: v_and_b32_e32 v1, s0, v2
-; FLAT-NEXT: v_and_b32_e32 v0, s0, v4
-; FLAT-NEXT: v_and_b32_e32 v3, s1, v2
-; FLAT-NEXT: v_and_b32_e32 v2, s1, v4
+; FLAT-NEXT: v_perm_b32 v2, 0, v0, s0
+; FLAT-NEXT: v_perm_b32 v4, 0, v1, s0
+; FLAT-NEXT: v_and_b32_e32 v1, s1, v2
+; FLAT-NEXT: v_and_b32_e32 v0, s1, v4
+; FLAT-NEXT: v_and_b32_e32 v3, s2, v2
+; FLAT-NEXT: v_and_b32_e32 v2, s2, v4
; FLAT-NEXT: v_lshlrev_b64 v[0:1], 4, v[0:1]
; FLAT-NEXT: v_lshrrev_b64 v[2:3], 4, v[2:3]
; FLAT-NEXT: s_mov_b32 s0, 0x55555555
; FLAT-NEXT: v_or_b32_e32 v3, v3, v1
; FLAT-NEXT: v_or_b32_e32 v2, v2, v0
-; FLAT-NEXT: v_and_b32_e32 v1, s2, v3
-; FLAT-NEXT: v_and_b32_e32 v0, s2, v2
-; FLAT-NEXT: v_and_b32_e32 v3, s3, v3
-; FLAT-NEXT: v_and_b32_e32 v2, s3, v2
+; FLAT-NEXT: v_and_b32_e32 v1, s3, v3
+; FLAT-NEXT: v_and_b32_e32 v0, s3, v2
+; FLAT-NEXT: v_and_b32_e32 v3, s6, v3
+; FLAT-NEXT: v_and_b32_e32 v2, s6, v2
; FLAT-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
; FLAT-NEXT: v_lshrrev_b64 v[2:3], 2, v[2:3]
; FLAT-NEXT: s_mov_b32 s1, 0xaaaaaaaa
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 addr64
-; SI-NEXT: s_mov_b32 s8, 0xff00ff
-; SI-NEXT: s_mov_b32 s9, 0xf0f0f0f
-; SI-NEXT: s_mov_b32 s10, 0xf0f0f0f0
-; SI-NEXT: s_mov_b32 s11, 0x33333333
-; SI-NEXT: s_mov_b32 s12, 0xcccccccc
-; SI-NEXT: s_mov_b32 s13, 0x55555555
-; SI-NEXT: s_mov_b32 s14, 0xaaaaaaaa
+; SI-NEXT: s_mov_b32 s0, 0xff00ff
+; SI-NEXT: s_mov_b32 s1, 0xf0f0f0f
+; SI-NEXT: s_mov_b32 s2, 0xf0f0f0f0
+; SI-NEXT: s_mov_b32 s3, 0x33333333
+; SI-NEXT: s_mov_b32 s8, 0xcccccccc
+; SI-NEXT: s_mov_b32 s9, 0x55555555
+; SI-NEXT: s_mov_b32 s10, 0xaaaaaaaa
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_alignbit_b32 v4, v2, v2, 8
; SI-NEXT: v_alignbit_b32 v7, v1, v1, 8
; SI-NEXT: v_alignbit_b32 v1, v1, v1, 24
; SI-NEXT: v_alignbit_b32 v3, v3, v3, 24
-; SI-NEXT: v_bfi_b32 v2, s8, v2, v4
-; SI-NEXT: v_bfi_b32 v4, s8, v3, v5
-; SI-NEXT: v_bfi_b32 v6, s8, v0, v6
-; SI-NEXT: v_bfi_b32 v8, s8, v1, v7
-; SI-NEXT: v_and_b32_e32 v1, s9, v2
-; SI-NEXT: v_and_b32_e32 v0, s9, v4
-; SI-NEXT: v_and_b32_e32 v3, s10, v2
-; SI-NEXT: v_and_b32_e32 v2, s10, v4
-; SI-NEXT: v_and_b32_e32 v5, s9, v6
-; SI-NEXT: v_and_b32_e32 v4, s9, v8
-; SI-NEXT: v_and_b32_e32 v7, s10, v6
-; SI-NEXT: v_and_b32_e32 v6, s10, v8
+; SI-NEXT: v_bfi_b32 v2, s0, v2, v4
+; SI-NEXT: v_bfi_b32 v4, s0, v3, v5
+; SI-NEXT: v_bfi_b32 v6, s0, v0, v6
+; SI-NEXT: v_bfi_b32 v8, s0, v1, v7
+; SI-NEXT: v_and_b32_e32 v1, s1, v2
+; SI-NEXT: v_and_b32_e32 v0, s1, v4
+; SI-NEXT: v_and_b32_e32 v3, s2, v2
+; SI-NEXT: v_and_b32_e32 v2, s2, v4
+; SI-NEXT: v_and_b32_e32 v5, s1, v6
+; SI-NEXT: v_and_b32_e32 v4, s1, v8
+; SI-NEXT: v_and_b32_e32 v7, s2, v6
+; SI-NEXT: v_and_b32_e32 v6, s2, v8
; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 4
; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 4
; SI-NEXT: v_lshl_b64 v[4:5], v[4:5], 4
; SI-NEXT: v_or_b32_e32 v2, v2, v0
; SI-NEXT: v_or_b32_e32 v7, v7, v5
; SI-NEXT: v_or_b32_e32 v6, v6, v4
-; SI-NEXT: v_and_b32_e32 v1, s11, v3
-; SI-NEXT: v_and_b32_e32 v0, s11, v2
-; SI-NEXT: v_and_b32_e32 v5, s11, v7
-; SI-NEXT: v_and_b32_e32 v4, s11, v6
-; SI-NEXT: v_and_b32_e32 v3, s12, v3
-; SI-NEXT: v_and_b32_e32 v2, s12, v2
-; SI-NEXT: v_and_b32_e32 v7, s12, v7
-; SI-NEXT: v_and_b32_e32 v6, s12, v6
+; SI-NEXT: v_and_b32_e32 v1, s3, v3
+; SI-NEXT: v_and_b32_e32 v0, s3, v2
+; SI-NEXT: v_and_b32_e32 v5, s3, v7
+; SI-NEXT: v_and_b32_e32 v4, s3, v6
+; SI-NEXT: v_and_b32_e32 v3, s8, v3
+; SI-NEXT: v_and_b32_e32 v2, s8, v2
+; SI-NEXT: v_and_b32_e32 v7, s8, v7
+; SI-NEXT: v_and_b32_e32 v6, s8, v6
; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2
; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 2
; SI-NEXT: v_lshl_b64 v[4:5], v[4:5], 2
; SI-NEXT: v_or_b32_e32 v2, v2, v0
; SI-NEXT: v_or_b32_e32 v7, v7, v5
; SI-NEXT: v_or_b32_e32 v6, v6, v4
-; SI-NEXT: v_and_b32_e32 v1, s13, v3
-; SI-NEXT: v_and_b32_e32 v0, s13, v2
-; SI-NEXT: v_and_b32_e32 v5, s13, v7
-; SI-NEXT: v_and_b32_e32 v4, s13, v6
-; SI-NEXT: v_and_b32_e32 v3, s14, v3
-; SI-NEXT: v_and_b32_e32 v2, s14, v2
-; SI-NEXT: v_and_b32_e32 v7, s14, v7
-; SI-NEXT: v_and_b32_e32 v6, s14, v6
+; SI-NEXT: v_and_b32_e32 v1, s9, v3
+; SI-NEXT: v_and_b32_e32 v0, s9, v2
+; SI-NEXT: v_and_b32_e32 v5, s9, v7
+; SI-NEXT: v_and_b32_e32 v4, s9, v6
+; SI-NEXT: v_and_b32_e32 v3, s10, v3
+; SI-NEXT: v_and_b32_e32 v2, s10, v2
+; SI-NEXT: v_and_b32_e32 v7, s10, v7
+; SI-NEXT: v_and_b32_e32 v6, s10, v6
; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 1
; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 1
; SI-NEXT: v_lshl_b64 v[4:5], v[4:5], 1
; FLAT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; FLAT-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; FLAT-NEXT: s_mov_b32 s10, 0x10203
-; FLAT-NEXT: s_mov_b32 s2, 0x33333333
-; FLAT-NEXT: s_mov_b32 s3, 0xcccccccc
+; FLAT-NEXT: s_mov_b32 s2, 0xf0f0f0f0
+; FLAT-NEXT: s_mov_b32 s3, 0x33333333
+; FLAT-NEXT: s_mov_b32 s8, 0xcccccccc
; FLAT-NEXT: s_waitcnt lgkmcnt(0)
; FLAT-NEXT: v_mov_b32_e32 v1, s1
; FLAT-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; FLAT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; FLAT-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
-; FLAT-NEXT: s_mov_b32 s0, 0xf0f0f0f
-; FLAT-NEXT: s_mov_b32 s1, 0xf0f0f0f0
-; FLAT-NEXT: s_mov_b32 s8, 0x55555555
-; FLAT-NEXT: s_mov_b32 s9, 0xaaaaaaaa
+; FLAT-NEXT: s_mov_b32 s0, 0x10203
+; FLAT-NEXT: s_mov_b32 s1, 0xf0f0f0f
+; FLAT-NEXT: s_mov_b32 s9, 0x55555555
+; FLAT-NEXT: s_mov_b32 s10, 0xaaaaaaaa
; FLAT-NEXT: s_mov_b32 s7, 0xf000
; FLAT-NEXT: s_mov_b32 s6, -1
; FLAT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; FLAT-NEXT: v_perm_b32 v6, 0, v0, s10
-; FLAT-NEXT: v_perm_b32 v4, 0, v3, s10
-; FLAT-NEXT: v_perm_b32 v2, 0, v2, s10
-; FLAT-NEXT: v_perm_b32 v8, 0, v1, s10
-; FLAT-NEXT: v_and_b32_e32 v1, s0, v2
-; FLAT-NEXT: v_and_b32_e32 v0, s0, v4
-; FLAT-NEXT: v_and_b32_e32 v3, s1, v2
-; FLAT-NEXT: v_and_b32_e32 v2, s1, v4
-; FLAT-NEXT: v_and_b32_e32 v5, s0, v6
-; FLAT-NEXT: v_and_b32_e32 v4, s0, v8
-; FLAT-NEXT: v_and_b32_e32 v7, s1, v6
-; FLAT-NEXT: v_and_b32_e32 v6, s1, v8
+; FLAT-NEXT: v_perm_b32 v6, 0, v0, s0
+; FLAT-NEXT: v_perm_b32 v4, 0, v3, s0
+; FLAT-NEXT: v_perm_b32 v2, 0, v2, s0
+; FLAT-NEXT: v_perm_b32 v8, 0, v1, s0
+; FLAT-NEXT: v_and_b32_e32 v1, s1, v2
+; FLAT-NEXT: v_and_b32_e32 v0, s1, v4
+; FLAT-NEXT: v_and_b32_e32 v3, s2, v2
+; FLAT-NEXT: v_and_b32_e32 v2, s2, v4
+; FLAT-NEXT: v_and_b32_e32 v5, s1, v6
+; FLAT-NEXT: v_and_b32_e32 v4, s1, v8
+; FLAT-NEXT: v_and_b32_e32 v7, s2, v6
+; FLAT-NEXT: v_and_b32_e32 v6, s2, v8
; FLAT-NEXT: v_lshlrev_b64 v[0:1], 4, v[0:1]
; FLAT-NEXT: v_lshrrev_b64 v[2:3], 4, v[2:3]
; FLAT-NEXT: v_lshlrev_b64 v[4:5], 4, v[4:5]
; FLAT-NEXT: v_or_b32_e32 v2, v2, v0
; FLAT-NEXT: v_or_b32_e32 v7, v7, v5
; FLAT-NEXT: v_or_b32_e32 v6, v6, v4
-; FLAT-NEXT: v_and_b32_e32 v1, s2, v3
-; FLAT-NEXT: v_and_b32_e32 v0, s2, v2
-; FLAT-NEXT: v_and_b32_e32 v5, s2, v7
-; FLAT-NEXT: v_and_b32_e32 v4, s2, v6
-; FLAT-NEXT: v_and_b32_e32 v3, s3, v3
-; FLAT-NEXT: v_and_b32_e32 v2, s3, v2
-; FLAT-NEXT: v_and_b32_e32 v7, s3, v7
-; FLAT-NEXT: v_and_b32_e32 v6, s3, v6
+; FLAT-NEXT: v_and_b32_e32 v1, s3, v3
+; FLAT-NEXT: v_and_b32_e32 v0, s3, v2
+; FLAT-NEXT: v_and_b32_e32 v5, s3, v7
+; FLAT-NEXT: v_and_b32_e32 v4, s3, v6
+; FLAT-NEXT: v_and_b32_e32 v3, s8, v3
+; FLAT-NEXT: v_and_b32_e32 v2, s8, v2
+; FLAT-NEXT: v_and_b32_e32 v7, s8, v7
+; FLAT-NEXT: v_and_b32_e32 v6, s8, v6
; FLAT-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
; FLAT-NEXT: v_lshrrev_b64 v[2:3], 2, v[2:3]
; FLAT-NEXT: v_lshlrev_b64 v[4:5], 2, v[4:5]
; FLAT-NEXT: v_or_b32_e32 v2, v2, v0
; FLAT-NEXT: v_or_b32_e32 v7, v7, v5
; FLAT-NEXT: v_or_b32_e32 v6, v6, v4
-; FLAT-NEXT: v_and_b32_e32 v1, s8, v3
-; FLAT-NEXT: v_and_b32_e32 v0, s8, v2
-; FLAT-NEXT: v_and_b32_e32 v5, s8, v7
-; FLAT-NEXT: v_and_b32_e32 v4, s8, v6
-; FLAT-NEXT: v_and_b32_e32 v3, s9, v3
-; FLAT-NEXT: v_and_b32_e32 v2, s9, v2
-; FLAT-NEXT: v_and_b32_e32 v7, s9, v7
-; FLAT-NEXT: v_and_b32_e32 v6, s9, v6
+; FLAT-NEXT: v_and_b32_e32 v1, s9, v3
+; FLAT-NEXT: v_and_b32_e32 v0, s9, v2
+; FLAT-NEXT: v_and_b32_e32 v5, s9, v7
+; FLAT-NEXT: v_and_b32_e32 v4, s9, v6
+; FLAT-NEXT: v_and_b32_e32 v3, s10, v3
+; FLAT-NEXT: v_and_b32_e32 v2, s10, v2
+; FLAT-NEXT: v_and_b32_e32 v7, s10, v7
+; FLAT-NEXT: v_and_b32_e32 v6, s10, v6
; FLAT-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
; FLAT-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3]
; FLAT-NEXT: v_lshlrev_b64 v[4:5], 1, v[4:5]
;
; VI-LABEL: test_bswap_i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0x10203
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s0, s4
-; VI-NEXT: s_load_dword s4, s[6:7], 0x0
-; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: s_mov_b32 s4, s0
+; VI-NEXT: s_load_dword s0, s[2:3], 0x0
+; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_perm_b32 v0, 0, s4, v0
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: v_perm_b32 v0, 0, s0, v0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%val = load i32, i32 addrspace(1)* %in, align 4
%bswap = call i32 @llvm.bswap.i32(i32 %val) nounwind readnone
;
; VI-LABEL: test_bswap_v2i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0x10203
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s0, s4
-; VI-NEXT: s_mov_b32 s1, s5
-; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
+; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; VI-NEXT: s_mov_b32 s4, s0
+; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_perm_b32 v1, 0, s5, v0
-; VI-NEXT: v_perm_b32 v0, 0, s4, v0
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: v_perm_b32 v1, 0, s3, v0
+; VI-NEXT: v_perm_b32 v0, 0, s2, v0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
%val = load <2 x i32>, <2 x i32> addrspace(1)* %in, align 8
%bswap = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %val) nounwind readnone
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
-; VI-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_perm_b32 v3, 0, s7, v0
-; VI-NEXT: v_perm_b32 v2, 0, s6, v0
-; VI-NEXT: v_perm_b32 v1, 0, s5, v0
-; VI-NEXT: v_perm_b32 v0, 0, s4, v0
+; VI-NEXT: v_perm_b32 v3, 0, s11, v0
+; VI-NEXT: v_perm_b32 v2, 0, s10, v0
+; VI-NEXT: v_perm_b32 v1, 0, s9, v0
+; VI-NEXT: v_perm_b32 v0, 0, s8, v0
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: s_endpgm
%val = load <4 x i32>, <4 x i32> addrspace(1)* %in, align 16
;
; VI-LABEL: test_bswap_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0x10203
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s0, s4
-; VI-NEXT: s_mov_b32 s1, s5
-; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
+; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; VI-NEXT: s_mov_b32 s4, s0
+; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_perm_b32 v1, 0, s4, v0
-; VI-NEXT: v_perm_b32 v0, 0, s5, v0
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: v_perm_b32 v1, 0, s2, v0
+; VI-NEXT: v_perm_b32 v0, 0, s3, v0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
%val = load i64, i64 addrspace(1)* %in, align 8
%bswap = call i64 @llvm.bswap.i64(i64 %val) nounwind readnone
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
-; VI-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_perm_b32 v3, 0, s6, v0
-; VI-NEXT: v_perm_b32 v2, 0, s7, v0
-; VI-NEXT: v_perm_b32 v1, 0, s4, v0
-; VI-NEXT: v_perm_b32 v0, 0, s5, v0
+; VI-NEXT: v_perm_b32 v3, 0, s10, v0
+; VI-NEXT: v_perm_b32 v2, 0, s11, v0
+; VI-NEXT: v_perm_b32 v1, 0, s8, v0
+; VI-NEXT: v_perm_b32 v0, 0, s9, v0
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: s_endpgm
%val = load <2 x i64>, <2 x i64> addrspace(1)* %in, align 16
; GCN-NEXT: s_add_u32 s0, s0, s9
; GCN-NEXT: s_addc_u32 s1, s1, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v2, s4
-; GCN-NEXT: v_mov_b32_e32 v3, s5
-; GCN-NEXT: global_load_ushort v4, v[2:3], off
-; GCN-NEXT: v_mov_b32_e32 v0, s6
-; GCN-NEXT: v_mov_b32_e32 v1, s7
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: v_mov_b32_e32 v1, s5
+; GCN-NEXT: global_load_ushort v2, v[0:1], off
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_short v4, off, s[0:3], 0 offset:4
-; GCN-NEXT: global_load_ushort v4, v[2:3], off offset:2
+; GCN-NEXT: buffer_store_short v2, off, s[0:3], 0 offset:4
+; GCN-NEXT: global_load_ushort v2, v[0:1], off offset:2
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_short v4, off, s[0:3], 0 offset:6
-; GCN-NEXT: global_load_ushort v2, v[2:3], off offset:4
+; GCN-NEXT: buffer_store_short v2, off, s[0:3], 0 offset:6
+; GCN-NEXT: global_load_ushort v2, v[0:1], off offset:4
+; GCN-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NEXT: v_mov_b32_e32 v1, s7
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_short v2, off, s[0:3], 0 offset:8
; GCN-NEXT: buffer_load_ushort v2, off, s[0:3], 0 offset:4
; SI-LABEL: test_copy_v4i8_x2:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
-; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
-; SI-NEXT: s_mov_b32 s3, 0xf000
-; SI-NEXT: s_mov_b32 s10, 0
-; SI-NEXT: s_mov_b32 s11, s3
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
+; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: s_mov_b32 s2, 0
+; SI-NEXT: s_mov_b32 s3, s11
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
-; SI-NEXT: s_mov_b32 s2, -1
-; SI-NEXT: s_mov_b32 s0, s4
-; SI-NEXT: s_mov_b32 s1, s5
-; SI-NEXT: s_mov_b32 s12, s6
-; SI-NEXT: s_mov_b32 s13, s7
-; SI-NEXT: s_mov_b32 s14, s2
-; SI-NEXT: s_mov_b32 s15, s3
+; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
+; SI-NEXT: s_mov_b32 s10, -1
+; SI-NEXT: s_mov_b32 s8, s4
+; SI-NEXT: s_mov_b32 s9, s5
+; SI-NEXT: s_mov_b32 s2, s10
+; SI-NEXT: s_mov_b32 s0, s6
+; SI-NEXT: s_mov_b32 s1, s7
; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; SI-NEXT: buffer_store_dword v0, off, s[12:15], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_copy_v4i8_x2:
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s6
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
-; VI-NEXT: s_mov_b32 s9, s7
-; VI-NEXT: s_mov_b32 s10, s2
; VI-NEXT: s_mov_b32 s11, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s11, 0xf000
-; SI-NEXT: s_mov_b32 s22, 0
-; SI-NEXT: s_mov_b32 s23, s11
+; SI-NEXT: s_mov_b32 s14, 0
+; SI-NEXT: s_mov_b32 s15, s11
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_mov_b64 s[20:21], s[6:7]
+; SI-NEXT: s_mov_b64 s[12:13], s[6:7]
; SI-NEXT: v_mov_b32_e32 v1, 0
-; SI-NEXT: buffer_load_dword v0, v[0:1], s[20:23], 0 addr64
+; SI-NEXT: buffer_load_dword v0, v[0:1], s[12:15], 0 addr64
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
+; SI-NEXT: s_mov_b32 s14, s10
+; SI-NEXT: s_mov_b32 s6, s10
+; SI-NEXT: s_mov_b32 s7, s11
; SI-NEXT: s_mov_b32 s12, s2
; SI-NEXT: s_mov_b32 s13, s3
-; SI-NEXT: s_mov_b32 s14, s10
-; SI-NEXT: s_mov_b32 s15, s11
-; SI-NEXT: s_mov_b32 s16, s4
-; SI-NEXT: s_mov_b32 s17, s5
-; SI-NEXT: s_mov_b32 s18, s10
-; SI-NEXT: s_mov_b32 s19, s11
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT: buffer_store_dword v0, off, s[12:15], 0
-; SI-NEXT: buffer_store_dword v0, off, s[16:19], 0
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_copy_v4i8_x3:
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: s_mov_b32 s8, s0
; VI-NEXT: s_mov_b32 s9, s1
+; VI-NEXT: s_mov_b32 s15, s11
+; VI-NEXT: s_mov_b32 s6, s10
+; VI-NEXT: s_mov_b32 s7, s11
; VI-NEXT: s_mov_b32 s12, s2
; VI-NEXT: s_mov_b32 s13, s3
-; VI-NEXT: s_mov_b32 s15, s11
-; VI-NEXT: s_mov_b32 s16, s4
-; VI-NEXT: s_mov_b32 s17, s5
-; VI-NEXT: s_mov_b32 s18, s10
-; VI-NEXT: s_mov_b32 s19, s11
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT: buffer_store_dword v0, off, s[12:15], 0
-; VI-NEXT: buffer_store_dword v0, off, s[16:19], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%tid.x = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
define amdgpu_kernel void @test_copy_v4i8_x4(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %out3, <4 x i8> addrspace(1)* %in) nounwind {
; SI-LABEL: test_copy_v4i8_x4:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9
-; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x11
-; SI-NEXT: s_mov_b32 s3, 0xf000
-; SI-NEXT: s_mov_b32 s14, 0
-; SI-NEXT: s_mov_b32 s15, s3
+; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x11
+; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: s_mov_b32 s6, 0
+; SI-NEXT: s_mov_b32 s7, s11
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: buffer_load_dword v0, v[0:1], s[12:15], 0 addr64
-; SI-NEXT: s_mov_b32 s2, -1
-; SI-NEXT: s_mov_b32 s0, s4
-; SI-NEXT: s_mov_b32 s1, s5
-; SI-NEXT: s_mov_b32 s20, s8
-; SI-NEXT: s_mov_b32 s21, s9
-; SI-NEXT: s_mov_b32 s8, s10
-; SI-NEXT: s_mov_b32 s9, s11
-; SI-NEXT: s_mov_b32 s16, s6
-; SI-NEXT: s_mov_b32 s17, s7
-; SI-NEXT: s_mov_b32 s18, s2
-; SI-NEXT: s_mov_b32 s19, s3
-; SI-NEXT: s_mov_b32 s22, s2
-; SI-NEXT: s_mov_b32 s23, s3
-; SI-NEXT: s_mov_b32 s10, s2
-; SI-NEXT: s_mov_b32 s11, s3
+; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s10, -1
+; SI-NEXT: s_mov_b32 s14, s10
+; SI-NEXT: s_mov_b32 s15, s11
+; SI-NEXT: s_mov_b32 s18, s10
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s8, s0
+; SI-NEXT: s_mov_b32 s9, s1
+; SI-NEXT: s_mov_b32 s19, s11
+; SI-NEXT: s_mov_b32 s22, s10
+; SI-NEXT: s_mov_b32 s23, s11
+; SI-NEXT: s_mov_b32 s12, s2
+; SI-NEXT: s_mov_b32 s13, s3
+; SI-NEXT: s_mov_b32 s16, s4
+; SI-NEXT: s_mov_b32 s17, s5
+; SI-NEXT: s_mov_b32 s20, s6
+; SI-NEXT: s_mov_b32 s21, s7
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; SI-NEXT: buffer_store_dword v0, off, s[12:15], 0
; SI-NEXT: buffer_store_dword v0, off, s[16:19], 0
; SI-NEXT: buffer_store_dword v0, off, s[20:23], 0
-; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_copy_v4i8_x4:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x44
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s11, 0xf000
+; VI-NEXT: s_mov_b32 s10, -1
+; VI-NEXT: s_mov_b32 s14, s10
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s16, s8
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
-; VI-NEXT: s_mov_b32 s0, s4
-; VI-NEXT: s_mov_b32 s1, s5
-; VI-NEXT: s_mov_b32 s17, s9
-; VI-NEXT: s_mov_b32 s8, s10
-; VI-NEXT: s_mov_b32 s9, s11
-; VI-NEXT: s_mov_b32 s12, s6
-; VI-NEXT: s_mov_b32 s13, s7
-; VI-NEXT: s_mov_b32 s14, s2
-; VI-NEXT: s_mov_b32 s15, s3
-; VI-NEXT: s_mov_b32 s18, s2
-; VI-NEXT: s_mov_b32 s19, s3
-; VI-NEXT: s_mov_b32 s10, s2
-; VI-NEXT: s_mov_b32 s11, s3
-; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s15, s11
+; VI-NEXT: s_mov_b32 s18, s10
+; VI-NEXT: s_mov_b32 s19, s11
+; VI-NEXT: s_mov_b32 s22, s10
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_mov_b32 s8, s0
+; VI-NEXT: s_mov_b32 s9, s1
+; VI-NEXT: s_mov_b32 s23, s11
+; VI-NEXT: s_mov_b32 s12, s2
+; VI-NEXT: s_mov_b32 s13, s3
+; VI-NEXT: s_mov_b32 s16, s4
+; VI-NEXT: s_mov_b32 s17, s5
+; VI-NEXT: s_mov_b32 s20, s6
+; VI-NEXT: s_mov_b32 s21, s7
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT: buffer_store_dword v0, off, s[12:15], 0
; VI-NEXT: buffer_store_dword v0, off, s[16:19], 0
-; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; VI-NEXT: buffer_store_dword v0, off, s[20:23], 0
; VI-NEXT: s_endpgm
%tid.x = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
; SI-LABEL: test_copy_v4i8_extra_use:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
-; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
-; SI-NEXT: s_mov_b32 s3, 0xf000
-; SI-NEXT: s_mov_b32 s10, 0
-; SI-NEXT: s_mov_b32 s11, s3
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
+; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: s_mov_b32 s2, 0
+; SI-NEXT: s_mov_b32 s3, s11
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
+; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_mov_b32 s12, 0xff00
; SI-NEXT: s_movk_i32 s13, 0xff
-; SI-NEXT: s_mov_b32 s0, s4
-; SI-NEXT: s_mov_b32 s1, s5
-; SI-NEXT: s_mov_b32 s2, -1
-; SI-NEXT: s_mov_b32 s4, s6
-; SI-NEXT: s_mov_b32 s5, s7
-; SI-NEXT: s_mov_b32 s6, s2
-; SI-NEXT: s_mov_b32 s7, s3
+; SI-NEXT: s_mov_b32 s10, -1
+; SI-NEXT: s_mov_b32 s8, s4
+; SI-NEXT: s_mov_b32 s9, s5
+; SI-NEXT: s_mov_b32 s2, s10
+; SI-NEXT: s_mov_b32 s0, s6
+; SI-NEXT: s_mov_b32 s1, s7
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v3, vcc, 9, v0
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v1, vcc, 0x9000000, v1
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; SI-NEXT: buffer_store_dword v1, off, s[4:7], 0
+; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_copy_v4i8_extra_use:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT: s_movk_i32 s10, 0x900
-; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_movk_i32 s12, 0xff00
+; VI-NEXT: s_movk_i32 s13, 0xff
+; VI-NEXT: s_movk_i32 s14, 0x900
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s9
-; VI-NEXT: v_add_u32_e32 v0, vcc, s8, v0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
-; VI-NEXT: s_movk_i32 s8, 0xff00
-; VI-NEXT: s_movk_i32 s9, 0xff
-; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: s_mov_b32 s4, s6
-; VI-NEXT: s_mov_b32 s5, s7
-; VI-NEXT: s_mov_b32 s6, s2
-; VI-NEXT: s_mov_b32 s7, s3
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; VI-NEXT: v_and_b32_e32 v4, s8, v1
+; VI-NEXT: v_and_b32_e32 v4, s12, v1
; VI-NEXT: v_add_u16_e32 v1, 9, v1
; VI-NEXT: v_add_u16_e32 v3, 9, v0
-; VI-NEXT: v_and_b32_e32 v1, s9, v1
+; VI-NEXT: v_and_b32_e32 v1, s13, v1
; VI-NEXT: v_or_b32_e32 v1, v4, v1
-; VI-NEXT: v_and_b32_e32 v2, s8, v0
-; VI-NEXT: v_and_b32_e32 v3, s9, v3
+; VI-NEXT: v_and_b32_e32 v2, s12, v0
+; VI-NEXT: v_and_b32_e32 v3, s13, v3
; VI-NEXT: v_or_b32_e32 v2, v2, v3
-; VI-NEXT: v_add_u16_e32 v1, s10, v1
-; VI-NEXT: v_add_u16_e32 v2, s10, v2
+; VI-NEXT: v_add_u16_e32 v1, s14, v1
+; VI-NEXT: v_add_u16_e32 v2, s14, v2
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_or_b32_e32 v1, v2, v1
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; VI-NEXT: buffer_store_dword v1, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v1, off, s[8:11], 0
; VI-NEXT: s_endpgm
%tid.x = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s11, 0xf000
-; SI-NEXT: s_mov_b32 s18, 0
-; SI-NEXT: s_mov_b32 s19, s11
+; SI-NEXT: s_mov_b32 s14, 0
+; SI-NEXT: s_mov_b32 s15, s11
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_mov_b64 s[16:17], s[6:7]
+; SI-NEXT: s_mov_b64 s[12:13], s[6:7]
; SI-NEXT: v_mov_b32_e32 v1, 0
-; SI-NEXT: buffer_load_dword v0, v[0:1], s[16:19], 0 addr64
-; SI-NEXT: s_mov_b32 s12, s4
-; SI-NEXT: s_mov_b32 s13, s5
-; SI-NEXT: s_mov_b32 s4, 0xff00
-; SI-NEXT: s_movk_i32 s5, 0xff
+; SI-NEXT: buffer_load_dword v0, v[0:1], s[12:15], 0 addr64
+; SI-NEXT: s_mov_b32 s16, 0xff00
+; SI-NEXT: s_movk_i32 s17, 0xff
; SI-NEXT: s_mov_b32 s10, -1
+; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
-; SI-NEXT: s_mov_b32 s0, s2
-; SI-NEXT: s_mov_b32 s1, s3
-; SI-NEXT: s_mov_b32 s2, s10
-; SI-NEXT: s_mov_b32 s3, s11
-; SI-NEXT: s_mov_b32 s14, s10
-; SI-NEXT: s_mov_b32 s15, s11
+; SI-NEXT: s_mov_b32 s12, s2
+; SI-NEXT: s_mov_b32 s13, s3
+; SI-NEXT: s_mov_b32 s6, s10
+; SI-NEXT: s_mov_b32 s7, s11
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v3, vcc, 9, v0
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; SI-NEXT: v_and_b32_e32 v4, s4, v1
+; SI-NEXT: v_and_b32_e32 v4, s16, v1
; SI-NEXT: v_add_i32_e32 v1, vcc, 9, v1
-; SI-NEXT: v_and_b32_e32 v2, s4, v0
-; SI-NEXT: v_and_b32_e32 v3, s5, v3
+; SI-NEXT: v_and_b32_e32 v2, s16, v0
+; SI-NEXT: v_and_b32_e32 v3, s17, v3
; SI-NEXT: v_or_b32_e32 v2, v2, v3
-; SI-NEXT: v_and_b32_e32 v1, s5, v1
+; SI-NEXT: v_and_b32_e32 v1, s17, v1
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x900, v2
; SI-NEXT: v_or_b32_e32 v1, v4, v1
; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v1, vcc, 0x9000000, v1
; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], 0
-; SI-NEXT: buffer_store_dword v0, off, s[12:15], 0
+; SI-NEXT: buffer_store_dword v1, off, s[12:15], 0
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_copy_v4i8_x2_extra_use:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT: s_mov_b32 s11, 0xf000
-; VI-NEXT: s_mov_b32 s10, -1
-; VI-NEXT: s_mov_b32 s14, s10
+; VI-NEXT: s_movk_i32 s16, 0xff00
+; VI-NEXT: s_movk_i32 s17, 0xff
+; VI-NEXT: s_movk_i32 s18, 0x900
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
-; VI-NEXT: s_mov_b32 s12, s4
-; VI-NEXT: s_movk_i32 s4, 0xff00
-; VI-NEXT: s_mov_b32 s13, s5
-; VI-NEXT: s_movk_i32 s5, 0xff
-; VI-NEXT: s_movk_i32 s6, 0x900
+; VI-NEXT: s_mov_b32 s11, 0xf000
+; VI-NEXT: s_mov_b32 s10, -1
+; VI-NEXT: s_mov_b32 s14, s10
+; VI-NEXT: s_mov_b32 s15, s11
; VI-NEXT: s_mov_b32 s8, s0
; VI-NEXT: s_mov_b32 s9, s1
-; VI-NEXT: s_mov_b32 s0, s2
-; VI-NEXT: s_mov_b32 s1, s3
-; VI-NEXT: s_mov_b32 s2, s10
-; VI-NEXT: s_mov_b32 s3, s11
-; VI-NEXT: s_mov_b32 s15, s11
+; VI-NEXT: s_mov_b32 s12, s2
+; VI-NEXT: s_mov_b32 s13, s3
+; VI-NEXT: s_mov_b32 s6, s10
+; VI-NEXT: s_mov_b32 s7, s11
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; VI-NEXT: v_and_b32_e32 v4, s4, v1
+; VI-NEXT: v_and_b32_e32 v4, s16, v1
; VI-NEXT: v_add_u16_e32 v1, 9, v1
; VI-NEXT: v_add_u16_e32 v3, 9, v0
-; VI-NEXT: v_and_b32_e32 v1, s5, v1
+; VI-NEXT: v_and_b32_e32 v1, s17, v1
; VI-NEXT: v_or_b32_e32 v1, v4, v1
-; VI-NEXT: v_and_b32_e32 v2, s4, v0
-; VI-NEXT: v_and_b32_e32 v3, s5, v3
+; VI-NEXT: v_and_b32_e32 v2, s16, v0
+; VI-NEXT: v_and_b32_e32 v3, s17, v3
; VI-NEXT: v_or_b32_e32 v2, v2, v3
-; VI-NEXT: v_add_u16_e32 v1, s6, v1
-; VI-NEXT: v_add_u16_e32 v2, s6, v2
+; VI-NEXT: v_add_u16_e32 v1, s18, v1
+; VI-NEXT: v_add_u16_e32 v2, s18, v2
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_or_b32_e32 v1, v2, v1
; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], 0
-; VI-NEXT: buffer_store_dword v0, off, s[12:15], 0
+; VI-NEXT: buffer_store_dword v1, off, s[12:15], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%tid.x = call i32 @llvm.amdgcn.workitem.id.x()
%in.ptr = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
define amdgpu_kernel void @test_copy_v3i8_align4(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind {
; SI-LABEL: test_copy_v3i8_align4:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
-; SI-NEXT: s_mov_b32 s10, 0
-; SI-NEXT: s_mov_b32 s11, s3
+; SI-NEXT: s_mov_b32 s6, 0
+; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
+; SI-NEXT: s_mov_b64 s[4:5], s[10:11]
; SI-NEXT: v_mov_b32_e32 v1, 0
-; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
+; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b32 s2, -1
-; SI-NEXT: s_mov_b32 s0, s4
-; SI-NEXT: s_mov_b32 s1, s5
+; SI-NEXT: s_mov_b32 s0, s8
+; SI-NEXT: s_mov_b32 s1, s9
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
;
; VI-LABEL: test_copy_v3i8_align4:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s7
-; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
-; VI-NEXT: s_mov_b32 s0, s4
-; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
-; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; VI-NEXT: v_mov_b32_e32 v5, 0
-; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: v_lshlrev_b32_e32 v3, 3, v0
+; VI-NEXT: v_mov_b32_e32 v4, 0
+; VI-NEXT: v_mov_b32_e32 v2, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v6, s3
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v0
-; VI-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc
-; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
-; VI-NEXT: v_add_u32_e32 v4, vcc, s2, v0
-; VI-NEXT: v_addc_u32_e32 v5, vcc, v6, v5, vcc
+; VI-NEXT: v_mov_b32_e32 v5, s3
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v3
+; VI-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc
+; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; VI-NEXT: v_add_u32_e32 v3, vcc, s2, v3
+; VI-NEXT: v_addc_u32_e32 v4, vcc, v5, v4, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_ffbh_u32_e32 v0, v2
-; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0
-; VI-NEXT: v_ffbh_u32_e32 v6, v3
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
-; VI-NEXT: v_or_b32_e32 v2, v2, v3
-; VI-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; VI-NEXT: v_cndmask_b32_e32 v0, 64, v0, vcc
-; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
+; VI-NEXT: v_ffbh_u32_e32 v5, v0
+; VI-NEXT: v_add_u32_e32 v5, vcc, 32, v5
+; VI-NEXT: v_ffbh_u32_e32 v6, v1
+; VI-NEXT: v_or_b32_e32 v0, v0, v1
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
+; VI-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; VI-NEXT: v_cndmask_b32_e32 v1, 64, v1, vcc
+; VI-NEXT: flat_store_dwordx2 v[3:4], v[1:2]
; VI-NEXT: s_endpgm
;
; EG-LABEL: v_ctlz_i64:
; VI-NEXT: v_mov_b32_e32 v2, s1
; VI-NEXT: v_add_u32_e32 v1, vcc, s0, v1
; VI-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc
+; VI-NEXT: flat_load_dwordx2 v[1:2], v[1:2]
; VI-NEXT: v_add_u32_e32 v3, vcc, s2, v0
-; VI-NEXT: flat_load_dwordx2 v[0:1], v[1:2]
; VI-NEXT: v_addc_u32_e32 v4, vcc, v5, v4, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_ffbh_u32_e32 v2, v0
-; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v2
-; VI-NEXT: v_ffbh_u32_e32 v5, v1
-; VI-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
-; VI-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; VI-NEXT: v_cndmask_b32_e32 v0, 64, v1, vcc
+; VI-NEXT: v_ffbh_u32_e32 v0, v1
+; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0
+; VI-NEXT: v_ffbh_u32_e32 v5, v2
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
+; VI-NEXT: v_or_b32_e32 v1, v1, v2
+; VI-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
+; VI-NEXT: v_cndmask_b32_e32 v0, 64, v0, vcc
; VI-NEXT: flat_store_dword v[3:4], v0
; VI-NEXT: s_endpgm
;
; SI-NOSDWA: v_or_b32_e32 [[VAL2:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
; SI-NOSDWA: v_ffbl_b32_e32 v{{[0-9]+}}, [[VAL1]]
; SI-NOSDWA: v_ffbl_b32_e32 v{{[0-9]+}}, [[VAL2]]
-; SI-SDWA: v_or_b32_e32
; SI-SDWA: v_or_b32_sdwa
+; SI-SDWA: v_or_b32_e32
+; SI-SDWA: v_or_b32_e32
; SI-SDWA: v_or_b32_e32 [[VAL1:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
; SI-SDWA: v_ffbl_b32_e32 v{{[0-9]+}}, [[VAL1]]
-; SI-SDWA: v_or_b32_e32
; SI-SDWA: v_or_b32_sdwa
; SI-SDWA: v_or_b32_e32 [[VAL2:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
; SI-SDWA: v_ffbl_b32_e32 v{{[0-9]+}}, [[VAL2]]
define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %out2, <4 x i8> addrspace(1)* noalias %in) nounwind {
; SI-LABEL: load_v4i8_to_v4f32_2_uses:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
-; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb
-; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
-; SI-NEXT: s_mov_b32 s11, 0xf000
-; SI-NEXT: s_mov_b32 s2, 0
-; SI-NEXT: s_mov_b32 s3, s11
+; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s6, 0
+; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: buffer_load_dword v4, v[0:1], s[0:3], 0 addr64
-; SI-NEXT: s_mov_b32 s10, -1
-; SI-NEXT: s_mov_b32 s6, s10
-; SI-NEXT: s_mov_b32 s7, s11
-; SI-NEXT: s_movk_i32 s12, 0xff
+; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
+; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_movk_i32 s8, 0xff
+; SI-NEXT: s_mov_b32 s6, s2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4
; SI-NEXT: v_lshrrev_b32_e32 v6, 24, v4
; SI-NEXT: v_cvt_f32_ubyte1_e32 v1, v4
; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v4
; SI-NEXT: v_add_i32_e32 v4, vcc, 9, v4
+; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v0, s12, v4
+; SI-NEXT: v_and_b32_e32 v0, s8, v4
; SI-NEXT: v_add_i32_e32 v2, vcc, 9, v5
; SI-NEXT: v_or_b32_e32 v0, v7, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v6
-; SI-NEXT: v_and_b32_e32 v2, s12, v2
+; SI-NEXT: v_and_b32_e32 v2, s8, v2
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x900, v0
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x9000000, v0
-; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: load_v4i8_to_v4f32_2_uses:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT: s_mov_b32 s11, 0xf000
-; VI-NEXT: s_mov_b32 s10, -1
-; VI-NEXT: s_mov_b32 s6, s10
+; VI-NEXT: v_mov_b32_e32 v5, 9
+; VI-NEXT: s_movk_i32 s8, 0x900
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dword v5, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v4, 9
-; VI-NEXT: s_mov_b32 s7, s11
-; VI-NEXT: s_movk_i32 s0, 0x900
+; VI-NEXT: flat_load_dword v4, v[0:1]
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s6, s2
+; VI-NEXT: s_mov_b32 s7, s3
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_lshrrev_b32_e32 v6, 24, v5
-; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v5
-; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v5
-; VI-NEXT: v_cvt_f32_ubyte1_e32 v1, v5
-; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v5
+; VI-NEXT: v_lshrrev_b32_e32 v6, 24, v4
+; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v4
+; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v4
+; VI-NEXT: v_cvt_f32_ubyte1_e32 v1, v4
+; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v4
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
-; VI-NEXT: v_and_b32_e32 v7, 0xffffff00, v5
-; VI-NEXT: v_add_u16_e32 v8, 9, v5
-; VI-NEXT: v_add_u16_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_and_b32_e32 v7, 0xffffff00, v4
+; VI-NEXT: v_add_u16_e32 v8, 9, v4
+; VI-NEXT: v_add_u16_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v6
; VI-NEXT: v_or_b32_sdwa v0, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_add_u16_e32 v0, s0, v0
+; VI-NEXT: v_mov_b32_e32 v2, s8
+; VI-NEXT: v_add_u16_e32 v0, s8, v0
; VI-NEXT: v_add_u16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%tid.x = call i32 @llvm.amdgcn.workitem.id.x()
%in.ptr = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64 offset:5
-; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 offset:6
-; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64
-; SI-NEXT: buffer_load_ubyte v5, v[0:1], s[0:3], 0 addr64 offset:1
+; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64
+; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 offset:1
; SI-NEXT: buffer_load_ubyte v6, v[0:1], s[0:3], 0 addr64 offset:2
-; SI-NEXT: buffer_load_ubyte v7, v[0:1], s[0:3], 0 addr64 offset:3
-; SI-NEXT: buffer_load_ubyte v8, v[0:1], s[0:3], 0 addr64 offset:4
+; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64 offset:3
+; SI-NEXT: buffer_load_ubyte v7, v[0:1], s[0:3], 0 addr64 offset:4
+; SI-NEXT: buffer_load_ubyte v5, v[0:1], s[0:3], 0 addr64 offset:5
+; SI-NEXT: buffer_load_ubyte v8, v[0:1], s[0:3], 0 addr64 offset:6
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: s_waitcnt vmcnt(4)
-; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v4
+; SI-NEXT: s_waitcnt vmcnt(6)
+; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v2
+; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: v_cvt_f32_ubyte2_e32 v1, v3
; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_cvt_f32_ubyte2_e32 v1, v5
-; SI-NEXT: v_cvt_f32_ubyte2_e32 v5, v2
-; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7
-; SI-NEXT: v_or_b32_e32 v2, v7, v6
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_ubyte0_e32 v4, v8
-; SI-NEXT: v_cvt_f32_ubyte0_e32 v8, v3
+; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v4
+; SI-NEXT: v_or_b32_e32 v2, v9, v6
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_cvt_f32_ubyte0_e32 v4, v7
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f32_ubyte0_e32 v7, v8
; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v2
+; SI-NEXT: v_cvt_f32_ubyte2_e32 v5, v5
; SI-NEXT: v_cvt_f32_ubyte2_e32 v2, v2
-; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:24
+; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:24
; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[4:7], 0 offset:16
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT: s_endpgm
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
; SI-DAG: v_cmp_nlt_f32_e32 vcc, v[[A_F32_1]], v[[B_F32_1]]
-; VI-DAG: v_cmp_nlt_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]]
+; VI-DAG: v_cmp_nlt_f16_e32 vcc, v[[B_V2_F16]], v[[A_V2_F16]]
; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]]
-; VI: v_cmp_nlt_f16_e32 vcc, v[[A_F16_1]], v[[B_F16_1]]
+; VI: v_cmp_nlt_f16_e32 vcc, v[[B_F16_1]], v[[A_F16_1]]
; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
; GCN: s_endpgm
; GCN-SAFE-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
; GCN-SAFE-DAG: v_sub_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
-; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[ADD]]
+; GCN-SAFE-DAG: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[ADD]]
; GCN-NSZ-DAG: v_sub_f32_e32 [[NEG_ADD:v[0-9]+]], [[A]], [[B]]
; GCN-NSZ-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
-; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
-; GCN-SAFE: v_fma_f32 [[FMA:v[0-9]+]]
-; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[FMA]]
+; GCN-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
+; GCN-SAFE-DAG: v_fma_f32 [[FMA:v[0-9]+]]
+; GCN-SAFE-DAG: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[FMA]]
; GCN-NSZ-DAG: v_fma_f32 [[NEG_FMA:v[0-9]+]], [[A]], [[B]], -[[C]]
; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_FMA]]
define amdgpu_kernel void @urem16_invariant_denom(i16 addrspace(1)* nocapture %arg, i16 %arg1) {
; GFX9-LABEL: urem16_invariant_denom:
; GFX9: ; %bb.0: ; %bb
-; GFX9-NEXT: s_load_dword s3, s[0:1], 0x2c
-; GFX9-NEXT: s_mov_b32 s2, 0xffff
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GFX9-NEXT: s_mov_b32 s4, 0xffff
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v3, 0
-; GFX9-NEXT: s_movk_i32 s6, 0x400
+; GFX9-NEXT: s_movk_i32 s8, 0x400
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_and_b32 s3, s2, s3
-; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3
+; GFX9-NEXT: s_and_b32 s5, s4, s2
+; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s5
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0
; GFX9-NEXT: BB5_1: ; %bb3
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: v_and_b32_e32 v2, s2, v4
+; GFX9-NEXT: v_and_b32_e32 v2, s4, v4
; GFX9-NEXT: v_cvt_f32_u32_e32 v8, v2
; GFX9-NEXT: v_lshlrev_b64 v[5:6], 1, v[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v7, s5
-; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], s4, v5
-; GFX9-NEXT: v_addc_co_u32_e64 v6, s[0:1], v7, v6, s[0:1]
-; GFX9-NEXT: v_mul_f32_e32 v7, v8, v1
-; GFX9-NEXT: v_trunc_f32_e32 v7, v7
-; GFX9-NEXT: v_cvt_u32_f32_e32 v9, v7
-; GFX9-NEXT: v_mad_f32 v7, -v7, v0, v8
-; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v7|, v0
; GFX9-NEXT: v_add_u16_e32 v4, 1, v4
-; GFX9-NEXT: v_addc_co_u32_e64 v7, s[0:1], 0, v9, s[0:1]
-; GFX9-NEXT: v_mul_lo_u32 v7, v7, s3
-; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s6, v4
+; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s8, v4
+; GFX9-NEXT: v_mul_f32_e32 v9, v8, v1
+; GFX9-NEXT: v_trunc_f32_e32 v9, v9
+; GFX9-NEXT: v_cvt_u32_f32_e32 v10, v9
+; GFX9-NEXT: v_mad_f32 v8, -v9, v0, v8
+; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v8|, v0
+; GFX9-NEXT: v_mov_b32_e32 v7, s7
+; GFX9-NEXT: v_addc_co_u32_e64 v8, s[2:3], 0, v10, s[2:3]
+; GFX9-NEXT: v_mul_lo_u32 v8, v8, s5
+; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], s6, v5
; GFX9-NEXT: s_and_b64 vcc, exec, vcc
-; GFX9-NEXT: v_sub_u32_e32 v2, v2, v7
+; GFX9-NEXT: v_addc_co_u32_e64 v6, s[0:1], v7, v6, s[0:1]
+; GFX9-NEXT: v_sub_u32_e32 v2, v2, v8
; GFX9-NEXT: global_store_short v[5:6], v2, off
; GFX9-NEXT: s_cbranch_vccz BB5_1
; GFX9-NEXT: ; %bb.2: ; %bb2
; GFX9-LABEL: srem16_invariant_denom:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v3, 0
-; GFX9-NEXT: s_movk_i32 s3, 0x400
+; GFX9-NEXT: s_movk_i32 s5, 0x400
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_sext_i32_i16 s2, s2
-; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2
+; GFX9-NEXT: s_sext_i32_i16 s4, s2
+; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s4
; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0
; GFX9-NEXT: BB7_1: ; %bb3
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_bfe_i32 v7, v4, 0, 16
-; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v4
; GFX9-NEXT: v_cvt_f32_i32_e32 v10, v7
+; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v4
+; GFX9-NEXT: v_xor_b32_e32 v9, s4, v7
; GFX9-NEXT: v_lshlrev_b64 v[5:6], 1, v[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v8, s5
-; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], s4, v5
-; GFX9-NEXT: v_addc_co_u32_e64 v6, s[0:1], v8, v6, s[0:1]
-; GFX9-NEXT: v_mul_f32_e32 v8, v10, v1
-; GFX9-NEXT: v_xor_b32_e32 v9, s2, v7
-; GFX9-NEXT: v_trunc_f32_e32 v8, v8
; GFX9-NEXT: v_ashrrev_i32_e32 v2, 30, v9
-; GFX9-NEXT: v_cvt_i32_f32_e32 v9, v8
-; GFX9-NEXT: v_mad_f32 v8, -v8, v0, v10
+; GFX9-NEXT: v_mul_f32_e32 v9, v10, v1
+; GFX9-NEXT: v_trunc_f32_e32 v9, v9
+; GFX9-NEXT: v_cvt_i32_f32_e32 v11, v9
+; GFX9-NEXT: v_mad_f32 v9, -v9, v0, v10
; GFX9-NEXT: v_or_b32_e32 v2, 1, v2
-; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v8|, |v0|
-; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[0:1]
-; GFX9-NEXT: v_add_u32_e32 v2, v9, v2
-; GFX9-NEXT: v_mul_lo_u32 v2, v2, s2
+; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v9|, |v0|
+; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[2:3]
+; GFX9-NEXT: v_add_u32_e32 v2, v11, v2
+; GFX9-NEXT: v_mul_lo_u32 v2, v2, s4
; GFX9-NEXT: v_add_u16_e32 v4, 1, v4
-; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s3, v4
+; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s5, v4
+; GFX9-NEXT: v_mov_b32_e32 v8, s7
+; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], s6, v5
; GFX9-NEXT: s_and_b64 vcc, exec, vcc
; GFX9-NEXT: v_sub_u32_e32 v2, v7, v2
+; GFX9-NEXT: v_addc_co_u32_e64 v6, s[0:1], v8, v6, s[0:1]
; GFX9-NEXT: global_store_short v[5:6], v2, off
; GFX9-NEXT: s_cbranch_vccz BB7_1
; GFX9-NEXT: ; %bb.2: ; %bb2
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_ushort v2, v[0:1]
-; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0
+; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX8-NEXT: s_mov_b32 s0, 0xffff
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_and_b32 s3, s1, s0
-; GFX8-NEXT: s_lshr_b32 s1, s1, 16
-; GFX8-NEXT: s_and_b32 s0, s2, s0
+; GFX8-NEXT: s_and_b32 s3, s2, s0
; GFX8-NEXT: s_lshr_b32 s2, s2, 16
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: s_and_b32 s0, s1, s0
+; GFX8-NEXT: s_lshr_b32 s1, s1, 16
+; GFX8-NEXT: v_mov_b32_e32 v3, s2
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2
+; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2
; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2
; GFX8-NEXT: flat_store_short v[0:1], v2
; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NODL-NEXT: global_load_ushort v2, v[0:1], off
-; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0
+; GFX9-NODL-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX9-NODL-NEXT: s_mov_b32 s0, 0xffff
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: s_and_b32 s3, s1, s0
-; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 16
-; GFX9-NODL-NEXT: s_and_b32 s0, s2, s0
+; GFX9-NODL-NEXT: s_and_b32 s3, s2, s0
; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 16
-; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NODL-NEXT: s_and_b32 s0, s1, s0
+; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 16
+; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v3, v2
+; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s1, v3, v2
; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v3, v2
; GFX9-NODL-NEXT: global_store_short v[0:1], v2, off
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_mov_b32_e32 v2, s6
; GFX8-NEXT: v_mov_b32_e32 v3, s7
-; GFX8-NEXT: flat_load_ushort v2, v[2:3]
; GFX8-NEXT: flat_load_ushort v0, v[0:1]
+; GFX8-NEXT: flat_load_ushort v1, v[2:3]
; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1) lgkmcnt(0)
-; GFX8-NEXT: v_bfe_i32 v3, v2, 0, 8
-; GFX8-NEXT: v_lshrrev_b16_e32 v2, 8, v2
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_bfe_i32 v1, v0, 0, 8
+; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 8
; GFX8-NEXT: v_lshrrev_b16_e32 v0, 8, v0
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_bfe_i32 v3, v1, 0, 8
+; GFX8-NEXT: v_lshrrev_b16_e32 v1, 8, v1
; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 8
-; GFX8-NEXT: v_bfe_i32 v2, v2, 0, 8
-; GFX8-NEXT: v_mad_i32_i24 v0, v2, v0, s2
-; GFX8-NEXT: v_mad_i32_i24 v2, v3, v1, v0
+; GFX8-NEXT: v_bfe_i32 v1, v1, 0, 8
+; GFX8-NEXT: v_mad_i32_i24 v0, v1, v0, s2
+; GFX8-NEXT: v_mad_i32_i24 v2, v3, v2, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s6
; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s7
-; GFX9-NODL-NEXT: global_load_ushort v2, v[2:3], off
; GFX9-NODL-NEXT: global_load_ushort v0, v[0:1], off
+; GFX9-NODL-NEXT: global_load_ushort v1, v[2:3], off
; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0
; GFX9-NODL-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NODL-NEXT: v_bfe_i32 v3, v2, 0, 8
-; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v2, 8, v2
-; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT: v_bfe_i32 v1, v0, 0, 8
+; GFX9-NODL-NEXT: v_bfe_i32 v2, v0, 0, 8
; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v0, 8, v0
+; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NODL-NEXT: v_bfe_i32 v3, v1, 0, 8
+; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v1, 8, v1
; GFX9-NODL-NEXT: v_bfe_i32 v0, v0, 0, 8
-; GFX9-NODL-NEXT: v_bfe_i32 v2, v2, 0, 8
+; GFX9-NODL-NEXT: v_bfe_i32 v1, v1, 0, 8
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: v_mad_i32_i24 v0, v2, v0, s2
-; GFX9-NODL-NEXT: v_mad_i32_i24 v2, v3, v1, v0
+; GFX9-NODL-NEXT: v_mad_i32_i24 v0, v1, v0, s2
+; GFX9-NODL-NEXT: v_mad_i32_i24 v2, v3, v2, v0
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off
; GFX9-DL-NEXT: v_mov_b32_e32 v1, s5
; GFX9-DL-NEXT: v_mov_b32_e32 v2, s6
; GFX9-DL-NEXT: v_mov_b32_e32 v3, s7
-; GFX9-DL-NEXT: global_load_ushort v2, v[2:3], off
; GFX9-DL-NEXT: global_load_ushort v0, v[0:1], off
+; GFX9-DL-NEXT: global_load_ushort v1, v[2:3], off
; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0
; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
-; GFX9-DL-NEXT: v_bfe_i32 v3, v2, 0, 8
-; GFX9-DL-NEXT: v_lshrrev_b16_e32 v2, 8, v2
-; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_bfe_i32 v1, v0, 0, 8
+; GFX9-DL-NEXT: v_bfe_i32 v2, v0, 0, 8
; GFX9-DL-NEXT: v_lshrrev_b16_e32 v0, 8, v0
+; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DL-NEXT: v_bfe_i32 v3, v1, 0, 8
+; GFX9-DL-NEXT: v_lshrrev_b16_e32 v1, 8, v1
; GFX9-DL-NEXT: v_bfe_i32 v0, v0, 0, 8
-; GFX9-DL-NEXT: v_bfe_i32 v2, v2, 0, 8
+; GFX9-DL-NEXT: v_bfe_i32 v1, v1, 0, 8
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: v_mad_i32_i24 v0, v2, v0, s2
-; GFX9-DL-NEXT: v_mad_i32_i24 v2, v3, v1, v0
+; GFX9-DL-NEXT: v_mad_i32_i24 v0, v1, v0, s2
+; GFX9-DL-NEXT: v_mad_i32_i24 v2, v3, v2, v0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_ushort v2, v[0:1]
; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_sext_i32_i8 s3, s2
-; GFX8-NEXT: s_bfe_i32 s5, s2, 0x80008
+; GFX8-NEXT: s_sext_i32_i8 s2, s0
+; GFX8-NEXT: s_sext_i32_i8 s3, s1
+; GFX8-NEXT: s_bfe_i32 s5, s1, 0x80008
; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: s_bfe_i32 s7, s2, 0x80010
-; GFX8-NEXT: s_sext_i32_i8 s1, s0
+; GFX8-NEXT: s_bfe_i32 s7, s1, 0x80010
; GFX8-NEXT: s_bfe_i32 s4, s0, 0x80008
; GFX8-NEXT: v_mov_b32_e32 v4, s5
; GFX8-NEXT: s_bfe_i32 s6, s0, 0x80010
-; GFX8-NEXT: s_ashr_i32 s2, s2, 24
+; GFX8-NEXT: s_ashr_i32 s1, s1, 24
; GFX8-NEXT: v_mov_b32_e32 v5, s7
; GFX8-NEXT: s_ashr_i32 s0, s0, 24
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mad_i32_i24 v2, s1, v3, v2
+; GFX8-NEXT: v_mad_i32_i24 v2, s2, v3, v2
; GFX8-NEXT: v_mad_i32_i24 v2, s4, v4, v2
; GFX8-NEXT: v_mad_i32_i24 v2, s6, v5, v2
-; GFX8-NEXT: v_mov_b32_e32 v3, s2
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mad_i32_i24 v2, s0, v3, v2
; GFX8-NEXT: flat_store_short v[0:1], v2
; GFX8-NEXT: s_endpgm
; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NODL-NEXT: global_load_ushort v2, v[0:1], off
; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: s_sext_i32_i8 s3, s2
-; GFX9-NODL-NEXT: s_bfe_i32 s5, s2, 0x80008
+; GFX9-NODL-NEXT: s_sext_i32_i8 s2, s0
+; GFX9-NODL-NEXT: s_sext_i32_i8 s3, s1
+; GFX9-NODL-NEXT: s_bfe_i32 s5, s1, 0x80008
; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3
-; GFX9-NODL-NEXT: s_bfe_i32 s7, s2, 0x80010
-; GFX9-NODL-NEXT: s_sext_i32_i8 s1, s0
+; GFX9-NODL-NEXT: s_bfe_i32 s7, s1, 0x80010
; GFX9-NODL-NEXT: s_bfe_i32 s4, s0, 0x80008
; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s5
; GFX9-NODL-NEXT: s_bfe_i32 s6, s0, 0x80010
-; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 24
+; GFX9-NODL-NEXT: s_ashr_i32 s1, s1, 24
; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s7
; GFX9-NODL-NEXT: s_ashr_i32 s0, s0, 24
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s1, v3, v2
+; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s2, v3, v2
; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s4, v4, v2
; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s6, v5, v2
-; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2
+; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s0, v3, v2
; GFX9-NODL-NEXT: global_store_short v[0:1], v2, off
; GFX9-NODL-NEXT: s_endpgm
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_movk_i32 s8, 0xff
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_load_dword s6, s[6:7], 0x0
; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0
; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
-; GFX7-NEXT: s_movk_i32 s5, 0xff
+; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_and_b32 s7, s6, s5
-; GFX7-NEXT: s_bfe_u32 s8, s6, 0x80008
-; GFX7-NEXT: s_and_b32 s5, s4, s5
-; GFX7-NEXT: v_mov_b32_e32 v1, s7
-; GFX7-NEXT: s_bfe_u32 s10, s6, 0x80010
+; GFX7-NEXT: s_and_b32 s7, s4, s8
+; GFX7-NEXT: s_and_b32 s6, s5, s8
+; GFX7-NEXT: s_bfe_u32 s8, s5, 0x80008
+; GFX7-NEXT: v_mov_b32_e32 v1, s6
+; GFX7-NEXT: s_bfe_u32 s10, s5, 0x80010
; GFX7-NEXT: s_bfe_u32 s9, s4, 0x80008
; GFX7-NEXT: v_mov_b32_e32 v2, s8
; GFX7-NEXT: s_bfe_u32 s11, s4, 0x80010
-; GFX7-NEXT: s_lshr_b32 s6, s6, 24
+; GFX7-NEXT: s_lshr_b32 s5, s5, 24
; GFX7-NEXT: v_mov_b32_e32 v3, s10
; GFX7-NEXT: s_lshr_b32 s4, s4, 24
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mad_u32_u24 v0, s5, v1, v0
+; GFX7-NEXT: v_mad_u32_u24 v0, s7, v1, v0
; GFX7-NEXT: v_mad_u32_u24 v0, s9, v2, v0
; GFX7-NEXT: v_mad_u32_u24 v0, s11, v3, v0
-; GFX7-NEXT: v_mov_b32_e32 v1, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: v_mad_u32_u24 v0, s4, v1, v0
; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_ubyte v2, v[0:1]
-; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0
+; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX8-NEXT: s_movk_i32 s0, 0xff
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_bfe_u32 s5, s2, 0x80008
-; GFX8-NEXT: s_bfe_u32 s7, s2, 0x80010
-; GFX8-NEXT: s_and_b32 s3, s1, s0
-; GFX8-NEXT: s_and_b32 s0, s2, s0
-; GFX8-NEXT: s_bfe_u32 s4, s1, 0x80008
+; GFX8-NEXT: s_bfe_u32 s5, s1, 0x80008
+; GFX8-NEXT: s_and_b32 s3, s2, s0
+; GFX8-NEXT: s_bfe_u32 s4, s2, 0x80008
+; GFX8-NEXT: s_and_b32 s0, s1, s0
; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: s_bfe_u32 s6, s1, 0x80010
+; GFX8-NEXT: s_bfe_u32 s6, s2, 0x80010
; GFX8-NEXT: v_mov_b32_e32 v4, s4
-; GFX8-NEXT: s_lshr_b32 s1, s1, 24
-; GFX8-NEXT: v_mov_b32_e32 v5, s6
+; GFX8-NEXT: s_bfe_u32 s7, s1, 0x80010
; GFX8-NEXT: s_lshr_b32 s2, s2, 24
+; GFX8-NEXT: v_mov_b32_e32 v5, s6
+; GFX8-NEXT: s_lshr_b32 s1, s1, 24
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2
; GFX8-NEXT: v_mad_u32_u24 v2, s5, v4, v2
; GFX8-NEXT: v_mad_u32_u24 v2, s7, v5, v2
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2
+; GFX8-NEXT: v_mov_b32_e32 v3, s2
+; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2
; GFX8-NEXT: flat_store_byte v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NODL-NEXT: global_load_ubyte v2, v[0:1], off
-; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0
+; GFX9-NODL-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: s_bfe_u32 s5, s2, 0x80008
-; GFX9-NODL-NEXT: s_bfe_u32 s7, s2, 0x80010
-; GFX9-NODL-NEXT: s_and_b32 s3, s1, s0
-; GFX9-NODL-NEXT: s_and_b32 s0, s2, s0
-; GFX9-NODL-NEXT: s_bfe_u32 s4, s1, 0x80008
+; GFX9-NODL-NEXT: s_bfe_u32 s5, s1, 0x80008
+; GFX9-NODL-NEXT: s_and_b32 s3, s2, s0
+; GFX9-NODL-NEXT: s_bfe_u32 s4, s2, 0x80008
+; GFX9-NODL-NEXT: s_and_b32 s0, s1, s0
; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3
-; GFX9-NODL-NEXT: s_bfe_u32 s6, s1, 0x80010
+; GFX9-NODL-NEXT: s_bfe_u32 s6, s2, 0x80010
; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s4
-; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24
-; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s6
+; GFX9-NODL-NEXT: s_bfe_u32 s7, s1, 0x80010
; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 24
+; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s6
+; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v3, v2
; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s5, v4, v2
; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s7, v5, v2
-; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v3, v2
+; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2
+; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s1, v3, v2
; GFX9-NODL-NEXT: global_store_byte v[0:1], v2, off
; GFX9-NODL-NEXT: s_endpgm
;
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_movk_i32 s8, 0xff
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_load_dword s6, s[6:7], 0x0
; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0
; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
-; GFX7-NEXT: s_movk_i32 s5, 0xff
+; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_and_b32 s7, s6, s5
-; GFX7-NEXT: s_bfe_u32 s8, s6, 0x80008
-; GFX7-NEXT: s_and_b32 s5, s4, s5
-; GFX7-NEXT: v_mov_b32_e32 v1, s7
-; GFX7-NEXT: s_bfe_u32 s10, s6, 0x80010
+; GFX7-NEXT: s_and_b32 s7, s4, s8
+; GFX7-NEXT: s_and_b32 s6, s5, s8
+; GFX7-NEXT: s_bfe_u32 s8, s5, 0x80008
+; GFX7-NEXT: v_mov_b32_e32 v1, s6
+; GFX7-NEXT: s_bfe_u32 s10, s5, 0x80010
; GFX7-NEXT: s_bfe_u32 s9, s4, 0x80008
; GFX7-NEXT: v_mov_b32_e32 v2, s8
; GFX7-NEXT: s_bfe_u32 s11, s4, 0x80010
-; GFX7-NEXT: s_lshr_b32 s6, s6, 24
+; GFX7-NEXT: s_lshr_b32 s5, s5, 24
; GFX7-NEXT: v_mov_b32_e32 v3, s10
; GFX7-NEXT: s_lshr_b32 s4, s4, 24
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mad_u32_u24 v0, s5, v1, v0
+; GFX7-NEXT: v_mad_u32_u24 v0, s7, v1, v0
; GFX7-NEXT: v_mad_u32_u24 v0, s9, v2, v0
; GFX7-NEXT: v_mad_u32_u24 v0, s11, v3, v0
-; GFX7-NEXT: v_mov_b32_e32 v1, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: v_mad_u32_u24 v0, s4, v1, v0
; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_ushort v2, v[0:1]
; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX8-NEXT: s_movk_i32 s0, 0xff
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_bfe_u32 s5, s2, 0x80008
-; GFX8-NEXT: s_bfe_u32 s7, s2, 0x80010
-; GFX8-NEXT: v_mov_b32_e32 v4, s5
; GFX8-NEXT: s_and_b32 s3, s1, s0
; GFX8-NEXT: s_and_b32 s0, s2, s0
+; GFX8-NEXT: s_bfe_u32 s5, s2, 0x80008
; GFX8-NEXT: v_mov_b32_e32 v3, s0
+; GFX8-NEXT: s_bfe_u32 s7, s2, 0x80010
; GFX8-NEXT: s_bfe_u32 s4, s1, 0x80008
+; GFX8-NEXT: v_mov_b32_e32 v4, s5
; GFX8-NEXT: s_bfe_u32 s6, s1, 0x80010
; GFX8-NEXT: s_lshr_b32 s2, s2, 24
; GFX8-NEXT: v_mov_b32_e32 v5, s7
; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NODL-NEXT: global_load_ushort v2, v[0:1], off
; GFX9-NODL-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: s_bfe_u32 s5, s2, 0x80008
-; GFX9-NODL-NEXT: s_bfe_u32 s7, s2, 0x80010
-; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s5
; GFX9-NODL-NEXT: s_and_b32 s3, s1, s0
; GFX9-NODL-NEXT: s_and_b32 s0, s2, s0
+; GFX9-NODL-NEXT: s_bfe_u32 s5, s2, 0x80008
; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NODL-NEXT: s_bfe_u32 s7, s2, 0x80010
; GFX9-NODL-NEXT: s_bfe_u32 s4, s1, 0x80008
+; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s5
; GFX9-NODL-NEXT: s_bfe_u32 s6, s1, 0x80010
; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 24
; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s7
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_movk_i32 s8, 0xff
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_load_dword s6, s[6:7], 0x0
; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0
; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
-; GFX7-NEXT: s_movk_i32 s5, 0xff
+; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_and_b32 s7, s6, s5
-; GFX7-NEXT: s_bfe_u32 s8, s6, 0x80008
-; GFX7-NEXT: s_and_b32 s5, s4, s5
-; GFX7-NEXT: v_mov_b32_e32 v1, s7
-; GFX7-NEXT: s_bfe_u32 s10, s6, 0x80010
+; GFX7-NEXT: s_and_b32 s7, s4, s8
+; GFX7-NEXT: s_and_b32 s6, s5, s8
+; GFX7-NEXT: s_bfe_u32 s8, s5, 0x80008
+; GFX7-NEXT: v_mov_b32_e32 v1, s6
+; GFX7-NEXT: s_bfe_u32 s10, s5, 0x80010
; GFX7-NEXT: s_bfe_u32 s9, s4, 0x80008
; GFX7-NEXT: v_mov_b32_e32 v2, s8
; GFX7-NEXT: s_bfe_u32 s11, s4, 0x80010
-; GFX7-NEXT: s_lshr_b32 s6, s6, 24
+; GFX7-NEXT: s_lshr_b32 s5, s5, 24
; GFX7-NEXT: v_mov_b32_e32 v3, s10
; GFX7-NEXT: s_lshr_b32 s4, s4, 24
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mad_u32_u24 v0, s5, v1, v0
+; GFX7-NEXT: v_mad_u32_u24 v0, s7, v1, v0
; GFX7-NEXT: v_mad_u32_u24 v0, s9, v2, v0
; GFX7-NEXT: v_mad_u32_u24 v0, s11, v3, v0
-; GFX7-NEXT: v_mov_b32_e32 v1, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: v_mad_u32_u24 v0, s4, v1, v0
; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_ubyte v2, v[0:1]
-; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0
+; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX8-NEXT: s_movk_i32 s0, 0xff
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_bfe_u32 s5, s2, 0x80008
-; GFX8-NEXT: s_bfe_u32 s7, s2, 0x80010
-; GFX8-NEXT: s_and_b32 s3, s1, s0
-; GFX8-NEXT: s_and_b32 s0, s2, s0
-; GFX8-NEXT: s_bfe_u32 s4, s1, 0x80008
+; GFX8-NEXT: s_bfe_u32 s5, s1, 0x80008
+; GFX8-NEXT: s_and_b32 s3, s2, s0
+; GFX8-NEXT: s_bfe_u32 s4, s2, 0x80008
+; GFX8-NEXT: s_and_b32 s0, s1, s0
; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: s_bfe_u32 s6, s1, 0x80010
+; GFX8-NEXT: s_bfe_u32 s6, s2, 0x80010
; GFX8-NEXT: v_mov_b32_e32 v4, s4
-; GFX8-NEXT: s_lshr_b32 s1, s1, 24
-; GFX8-NEXT: v_mov_b32_e32 v5, s6
+; GFX8-NEXT: s_bfe_u32 s7, s1, 0x80010
; GFX8-NEXT: s_lshr_b32 s2, s2, 24
+; GFX8-NEXT: v_mov_b32_e32 v5, s6
+; GFX8-NEXT: s_lshr_b32 s1, s1, 24
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2
; GFX8-NEXT: v_mad_u32_u24 v2, s5, v4, v2
; GFX8-NEXT: v_mad_u32_u24 v2, s7, v5, v2
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2
+; GFX8-NEXT: v_mov_b32_e32 v3, s2
+; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2
; GFX8-NEXT: flat_store_byte v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NODL-NEXT: global_load_ubyte v2, v[0:1], off
-; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0
+; GFX9-NODL-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: s_bfe_u32 s5, s2, 0x80008
-; GFX9-NODL-NEXT: s_bfe_u32 s7, s2, 0x80010
-; GFX9-NODL-NEXT: s_and_b32 s3, s1, s0
-; GFX9-NODL-NEXT: s_and_b32 s0, s2, s0
-; GFX9-NODL-NEXT: s_bfe_u32 s4, s1, 0x80008
+; GFX9-NODL-NEXT: s_bfe_u32 s5, s1, 0x80008
+; GFX9-NODL-NEXT: s_and_b32 s3, s2, s0
+; GFX9-NODL-NEXT: s_bfe_u32 s4, s2, 0x80008
+; GFX9-NODL-NEXT: s_and_b32 s0, s1, s0
; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3
-; GFX9-NODL-NEXT: s_bfe_u32 s6, s1, 0x80010
+; GFX9-NODL-NEXT: s_bfe_u32 s6, s2, 0x80010
; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s4
-; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24
-; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s6
+; GFX9-NODL-NEXT: s_bfe_u32 s7, s1, 0x80010
; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 24
+; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s6
+; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v3, v2
; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s5, v4, v2
; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s7, v5, v2
-; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v3, v2
+; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2
+; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s1, v3, v2
; GFX9-NODL-NEXT: global_store_byte v[0:1], v2, off
; GFX9-NODL-NEXT: s_endpgm
;
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_mov_b32 s8, 0xffff
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_load_dword s6, s[6:7], 0x0
; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0
; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
+; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_sext_i32_i8 s7, s6
-; GFX7-NEXT: s_bfe_u32 s9, s6, 0x80008
-; GFX7-NEXT: s_sext_i32_i8 s5, s4
+; GFX7-NEXT: s_sext_i32_i8 s6, s4
+; GFX7-NEXT: s_sext_i32_i8 s7, s5
+; GFX7-NEXT: s_bfe_u32 s9, s5, 0x80008
; GFX7-NEXT: s_and_b32 s7, s7, s8
; GFX7-NEXT: s_bfe_u32 s10, s4, 0x80008
; GFX7-NEXT: v_mov_b32_e32 v1, s9
-; GFX7-NEXT: s_bfe_u32 s11, s6, 0x80010
-; GFX7-NEXT: s_and_b32 s5, s5, s8
+; GFX7-NEXT: s_bfe_u32 s11, s5, 0x80010
+; GFX7-NEXT: s_and_b32 s6, s6, s8
; GFX7-NEXT: v_mov_b32_e32 v3, s7
; GFX7-NEXT: s_bfe_u32 s12, s4, 0x80010
-; GFX7-NEXT: s_lshr_b32 s6, s6, 24
+; GFX7-NEXT: s_lshr_b32 s5, s5, 24
; GFX7-NEXT: v_mov_b32_e32 v2, s11
; GFX7-NEXT: s_lshr_b32 s4, s4, 24
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mad_u32_u24 v0, s10, v1, v0
-; GFX7-NEXT: v_mad_u32_u24 v0, s5, v3, v0
+; GFX7-NEXT: v_mad_u32_u24 v0, s6, v3, v0
; GFX7-NEXT: v_mad_u32_u24 v0, s12, v2, v0
-; GFX7-NEXT: v_mov_b32_e32 v1, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: v_mad_u32_u24 v0, s4, v1, v0
; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_ushort v2, v[0:1]
; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_bfe_u32 s5, s2, 0x80008
-; GFX8-NEXT: s_sext_i32_i8 s3, s2
-; GFX8-NEXT: v_mov_b32_e32 v3, s5
-; GFX8-NEXT: s_bfe_u32 s7, s2, 0x80010
; GFX8-NEXT: s_bfe_u32 s4, s0, 0x80008
-; GFX8-NEXT: s_sext_i32_i8 s1, s0
+; GFX8-NEXT: s_bfe_u32 s5, s1, 0x80008
+; GFX8-NEXT: s_sext_i32_i8 s3, s1
+; GFX8-NEXT: v_mov_b32_e32 v3, s5
+; GFX8-NEXT: s_bfe_u32 s7, s1, 0x80010
+; GFX8-NEXT: s_sext_i32_i8 s2, s0
; GFX8-NEXT: v_mov_b32_e32 v4, s3
; GFX8-NEXT: s_bfe_u32 s6, s0, 0x80010
-; GFX8-NEXT: s_lshr_b32 s2, s2, 24
+; GFX8-NEXT: s_lshr_b32 s1, s1, 24
; GFX8-NEXT: v_mov_b32_e32 v5, s7
; GFX8-NEXT: s_lshr_b32 s0, s0, 24
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mad_u32_u24 v2, s4, v3, v2
-; GFX8-NEXT: v_mad_i32_i24 v2, s1, v4, v2
+; GFX8-NEXT: v_mad_i32_i24 v2, s2, v4, v2
; GFX8-NEXT: v_mad_u32_u24 v2, s6, v5, v2
-; GFX8-NEXT: v_mov_b32_e32 v3, s2
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2
; GFX8-NEXT: flat_store_short v[0:1], v2
; GFX8-NEXT: s_endpgm
; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NODL-NEXT: global_load_ushort v2, v[0:1], off
; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: s_bfe_u32 s5, s2, 0x80008
-; GFX9-NODL-NEXT: s_sext_i32_i8 s3, s2
-; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s5
-; GFX9-NODL-NEXT: s_bfe_u32 s7, s2, 0x80010
; GFX9-NODL-NEXT: s_bfe_u32 s4, s0, 0x80008
-; GFX9-NODL-NEXT: s_sext_i32_i8 s1, s0
+; GFX9-NODL-NEXT: s_bfe_u32 s5, s1, 0x80008
+; GFX9-NODL-NEXT: s_sext_i32_i8 s3, s1
+; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s5
+; GFX9-NODL-NEXT: s_bfe_u32 s7, s1, 0x80010
+; GFX9-NODL-NEXT: s_sext_i32_i8 s2, s0
; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s3
; GFX9-NODL-NEXT: s_bfe_u32 s6, s0, 0x80010
-; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 24
+; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24
; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s7
; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 24
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s4, v3, v2
-; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s1, v4, v2
+; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s2, v4, v2
; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s6, v5, v2
-; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2
+; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v3, v2
; GFX9-NODL-NEXT: global_store_short v[0:1], v2, off
; GFX9-NODL-NEXT: s_endpgm
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off
; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x80008
-; GFX9-DL-NEXT: s_sext_i32_i8 s3, s2
-; GFX9-DL-NEXT: v_mov_b32_e32 v3, s5
-; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x80010
; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x80008
-; GFX9-DL-NEXT: s_sext_i32_i8 s1, s0
+; GFX9-DL-NEXT: s_bfe_u32 s5, s1, 0x80008
+; GFX9-DL-NEXT: s_sext_i32_i8 s3, s1
+; GFX9-DL-NEXT: v_mov_b32_e32 v3, s5
+; GFX9-DL-NEXT: s_bfe_u32 s7, s1, 0x80010
+; GFX9-DL-NEXT: s_sext_i32_i8 s2, s0
; GFX9-DL-NEXT: v_mov_b32_e32 v4, s3
; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x80010
-; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 24
+; GFX9-DL-NEXT: s_lshr_b32 s1, s1, 24
; GFX9-DL-NEXT: v_mov_b32_e32 v5, s7
; GFX9-DL-NEXT: s_lshr_b32 s0, s0, 24
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v3, v2
-; GFX9-DL-NEXT: v_mad_i32_i24 v2, s1, v4, v2
+; GFX9-DL-NEXT: v_mad_i32_i24 v2, s2, v4, v2
; GFX9-DL-NEXT: v_mad_u32_u24 v2, s6, v5, v2
-; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2
+; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1
; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v3, v2
; GFX9-DL-NEXT: global_store_short v[0:1], v2, off
; GFX9-DL-NEXT: s_endpgm
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_movk_i32 s8, 0xff
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_load_dword s6, s[6:7], 0x0
; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0
; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
-; GFX7-NEXT: s_movk_i32 s7, 0xff
+; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_bfe_u32 s10, s6, 0x80008
-; GFX7-NEXT: s_bfe_u32 s12, s6, 0x80010
-; GFX7-NEXT: s_lshr_b32 s9, s6, 24
-; GFX7-NEXT: s_and_b32 s6, s6, s7
-; GFX7-NEXT: s_lshr_b32 s5, s4, 24
-; GFX7-NEXT: s_bfe_u32 s8, s4, 0x80008
+; GFX7-NEXT: s_lshr_b32 s6, s4, 24
+; GFX7-NEXT: s_bfe_u32 s10, s5, 0x80008
+; GFX7-NEXT: s_bfe_u32 s12, s5, 0x80010
+; GFX7-NEXT: s_lshr_b32 s9, s5, 24
+; GFX7-NEXT: s_and_b32 s5, s5, s8
+; GFX7-NEXT: s_bfe_u32 s7, s4, 0x80008
; GFX7-NEXT: s_bfe_u32 s11, s4, 0x80010
-; GFX7-NEXT: s_and_b32 s4, s4, s7
-; GFX7-NEXT: v_mov_b32_e32 v1, s6
+; GFX7-NEXT: s_and_b32 s4, s4, s8
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: v_mov_b32_e32 v2, s10
; GFX7-NEXT: v_mov_b32_e32 v3, s12
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mad_u32_u24 v0, s4, v1, v0
-; GFX7-NEXT: v_mad_u32_u24 v0, s8, v2, v0
+; GFX7-NEXT: v_mad_u32_u24 v0, s7, v2, v0
; GFX7-NEXT: v_mad_u32_u24 v0, s11, v3, v0
; GFX7-NEXT: v_mov_b32_e32 v1, s9
-; GFX7-NEXT: v_mad_u32_u24 v0, s5, v1, v0
+; GFX7-NEXT: v_mad_u32_u24 v0, s6, v1, v0
; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX8-NEXT: s_movk_i32 s2, 0xff
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_ubyte v2, v[0:1]
-; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0
+; GFX8-NEXT: s_movk_i32 s0, 0xff
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_bfe_i32 s6, s3, 0x40000
-; GFX8-NEXT: s_lshr_b32 s4, s3, 12
-; GFX8-NEXT: s_bfe_i32 s8, s3, 0x40004
-; GFX8-NEXT: s_bfe_i32 s10, s3, 0x40008
-; GFX8-NEXT: s_lshr_b32 s1, s0, 12
-; GFX8-NEXT: s_bfe_i32 s5, s0, 0x40000
+; GFX8-NEXT: s_lshr_b32 s3, s1, 12
+; GFX8-NEXT: s_bfe_i32 s6, s2, 0x40000
+; GFX8-NEXT: s_lshr_b32 s4, s2, 12
+; GFX8-NEXT: s_bfe_i32 s8, s2, 0x40004
+; GFX8-NEXT: s_bfe_i32 s10, s2, 0x40008
+; GFX8-NEXT: s_bfe_i32 s5, s1, 0x40000
; GFX8-NEXT: v_mov_b32_e32 v6, s6
-; GFX8-NEXT: v_lshlrev_b16_e64 v4, 12, s1
+; GFX8-NEXT: v_lshlrev_b16_e64 v4, 12, s3
; GFX8-NEXT: v_lshlrev_b16_e64 v5, 12, s4
-; GFX8-NEXT: s_bfe_i32 s7, s0, 0x40004
-; GFX8-NEXT: s_bfe_i32 s9, s0, 0x40008
+; GFX8-NEXT: s_bfe_i32 s7, s1, 0x40004
+; GFX8-NEXT: s_bfe_i32 s9, s1, 0x40008
; GFX8-NEXT: v_mov_b32_e32 v3, s10
; GFX8-NEXT: v_mov_b32_e32 v7, s8
; GFX8-NEXT: v_ashrrev_i16_e32 v4, 12, v4
; GFX8-NEXT: v_ashrrev_i16_e32 v5, 12, v5
; GFX8-NEXT: v_mul_i32_i24_e32 v3, s9, v3
-; GFX8-NEXT: s_bfe_i32 s12, s3, 0x40010
-; GFX8-NEXT: v_and_b32_e32 v4, s2, v4
-; GFX8-NEXT: v_and_b32_e32 v5, s2, v5
-; GFX8-NEXT: s_bfe_i32 s14, s3, 0x40014
-; GFX8-NEXT: s_bfe_i32 s11, s0, 0x40010
+; GFX8-NEXT: s_bfe_i32 s12, s2, 0x40010
+; GFX8-NEXT: v_and_b32_e32 v4, s0, v4
+; GFX8-NEXT: v_and_b32_e32 v5, s0, v5
+; GFX8-NEXT: s_bfe_i32 s14, s2, 0x40014
+; GFX8-NEXT: s_bfe_i32 s11, s1, 0x40010
; GFX8-NEXT: v_mov_b32_e32 v8, s12
-; GFX8-NEXT: s_bfe_i32 s16, s3, 0x40018
-; GFX8-NEXT: s_bfe_i32 s13, s0, 0x40014
+; GFX8-NEXT: s_bfe_i32 s16, s2, 0x40018
+; GFX8-NEXT: s_bfe_i32 s13, s1, 0x40014
; GFX8-NEXT: v_mov_b32_e32 v9, s14
-; GFX8-NEXT: s_bfe_i32 s15, s0, 0x40018
-; GFX8-NEXT: s_ashr_i32 s3, s3, 28
+; GFX8-NEXT: s_bfe_i32 s15, s1, 0x40018
+; GFX8-NEXT: s_ashr_i32 s2, s2, 28
; GFX8-NEXT: v_mov_b32_e32 v10, s16
-; GFX8-NEXT: s_ashr_i32 s0, s0, 28
+; GFX8-NEXT: s_ashr_i32 s1, s1, 28
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mad_i32_i24 v2, s5, v6, v2
; GFX8-NEXT: v_mad_i32_i24 v2, s7, v7, v2
; GFX8-NEXT: v_mad_i32_i24 v2, s11, v8, v2
; GFX8-NEXT: v_mad_i32_i24 v2, s13, v9, v2
; GFX8-NEXT: v_mad_i32_i24 v2, s15, v10, v2
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: v_mad_i32_i24 v2, s0, v3, v2
+; GFX8-NEXT: v_mov_b32_e32 v3, s2
+; GFX8-NEXT: v_mad_i32_i24 v2, s1, v3, v2
; GFX8-NEXT: flat_store_byte v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX9-NEXT: s_movk_i32 s2, 0xff
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: global_load_ubyte v2, v[0:1], off
-; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX9-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX9-NEXT: s_load_dword s2, s[6:7], 0x0
+; GFX9-NEXT: s_movk_i32 s0, 0xff
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_bfe_i32 s6, s3, 0x40000
-; GFX9-NEXT: s_lshr_b32 s4, s3, 12
-; GFX9-NEXT: s_bfe_i32 s8, s3, 0x40004
-; GFX9-NEXT: s_bfe_i32 s10, s3, 0x40008
-; GFX9-NEXT: s_lshr_b32 s1, s0, 12
-; GFX9-NEXT: s_bfe_i32 s5, s0, 0x40000
+; GFX9-NEXT: s_lshr_b32 s3, s1, 12
+; GFX9-NEXT: s_bfe_i32 s6, s2, 0x40000
+; GFX9-NEXT: s_lshr_b32 s4, s2, 12
+; GFX9-NEXT: s_bfe_i32 s8, s2, 0x40004
+; GFX9-NEXT: s_bfe_i32 s10, s2, 0x40008
+; GFX9-NEXT: s_bfe_i32 s5, s1, 0x40000
; GFX9-NEXT: v_mov_b32_e32 v6, s6
-; GFX9-NEXT: v_lshlrev_b16_e64 v4, 12, s1
+; GFX9-NEXT: v_lshlrev_b16_e64 v4, 12, s3
; GFX9-NEXT: v_lshlrev_b16_e64 v5, 12, s4
-; GFX9-NEXT: s_bfe_i32 s7, s0, 0x40004
-; GFX9-NEXT: s_bfe_i32 s9, s0, 0x40008
+; GFX9-NEXT: s_bfe_i32 s7, s1, 0x40004
+; GFX9-NEXT: s_bfe_i32 s9, s1, 0x40008
; GFX9-NEXT: v_mov_b32_e32 v3, s10
; GFX9-NEXT: v_mov_b32_e32 v7, s8
; GFX9-NEXT: v_ashrrev_i16_e32 v4, 12, v4
; GFX9-NEXT: v_ashrrev_i16_e32 v5, 12, v5
; GFX9-NEXT: v_mul_i32_i24_e32 v3, s9, v3
-; GFX9-NEXT: s_bfe_i32 s12, s3, 0x40010
-; GFX9-NEXT: v_and_b32_e32 v4, s2, v4
-; GFX9-NEXT: v_and_b32_e32 v5, s2, v5
-; GFX9-NEXT: s_bfe_i32 s14, s3, 0x40014
-; GFX9-NEXT: s_bfe_i32 s11, s0, 0x40010
+; GFX9-NEXT: s_bfe_i32 s12, s2, 0x40010
+; GFX9-NEXT: v_and_b32_e32 v4, s0, v4
+; GFX9-NEXT: v_and_b32_e32 v5, s0, v5
+; GFX9-NEXT: s_bfe_i32 s14, s2, 0x40014
+; GFX9-NEXT: s_bfe_i32 s11, s1, 0x40010
; GFX9-NEXT: v_mov_b32_e32 v8, s12
-; GFX9-NEXT: s_bfe_i32 s16, s3, 0x40018
-; GFX9-NEXT: s_bfe_i32 s13, s0, 0x40014
+; GFX9-NEXT: s_bfe_i32 s16, s2, 0x40018
+; GFX9-NEXT: s_bfe_i32 s13, s1, 0x40014
; GFX9-NEXT: v_mov_b32_e32 v9, s14
-; GFX9-NEXT: s_bfe_i32 s15, s0, 0x40018
-; GFX9-NEXT: s_ashr_i32 s3, s3, 28
+; GFX9-NEXT: s_bfe_i32 s15, s1, 0x40018
+; GFX9-NEXT: s_ashr_i32 s2, s2, 28
; GFX9-NEXT: v_mov_b32_e32 v10, s16
-; GFX9-NEXT: s_ashr_i32 s0, s0, 28
+; GFX9-NEXT: s_ashr_i32 s1, s1, 28
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mad_i32_i24 v2, s5, v6, v2
; GFX9-NEXT: v_mad_i32_i24 v2, s7, v7, v2
; GFX9-NEXT: v_mad_i32_i24 v2, s11, v8, v2
; GFX9-NEXT: v_mad_i32_i24 v2, s13, v9, v2
; GFX9-NEXT: v_mad_i32_i24 v2, s15, v10, v2
-; GFX9-NEXT: v_mov_b32_e32 v3, s3
-; GFX9-NEXT: v_mad_i32_i24 v2, s0, v3, v2
+; GFX9-NEXT: v_mov_b32_e32 v3, s2
+; GFX9-NEXT: v_mad_i32_i24 v2, s1, v3, v2
; GFX9-NEXT: global_store_byte v[0:1], v2, off
; GFX9-NEXT: s_endpgm
;
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX9-DL-NEXT: s_movk_i32 s2, 0xff
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off
-; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX9-DL-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0
+; GFX9-DL-NEXT: s_movk_i32 s0, 0xff
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: s_bfe_i32 s6, s3, 0x40000
-; GFX9-DL-NEXT: s_lshr_b32 s4, s3, 12
-; GFX9-DL-NEXT: s_bfe_i32 s8, s3, 0x40004
-; GFX9-DL-NEXT: s_bfe_i32 s10, s3, 0x40008
-; GFX9-DL-NEXT: s_lshr_b32 s1, s0, 12
-; GFX9-DL-NEXT: s_bfe_i32 s5, s0, 0x40000
+; GFX9-DL-NEXT: s_lshr_b32 s3, s1, 12
+; GFX9-DL-NEXT: s_bfe_i32 s6, s2, 0x40000
+; GFX9-DL-NEXT: s_lshr_b32 s4, s2, 12
+; GFX9-DL-NEXT: s_bfe_i32 s8, s2, 0x40004
+; GFX9-DL-NEXT: s_bfe_i32 s10, s2, 0x40008
+; GFX9-DL-NEXT: s_bfe_i32 s5, s1, 0x40000
; GFX9-DL-NEXT: v_mov_b32_e32 v6, s6
-; GFX9-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s1
+; GFX9-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s3
; GFX9-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s4
-; GFX9-DL-NEXT: s_bfe_i32 s7, s0, 0x40004
-; GFX9-DL-NEXT: s_bfe_i32 s9, s0, 0x40008
+; GFX9-DL-NEXT: s_bfe_i32 s7, s1, 0x40004
+; GFX9-DL-NEXT: s_bfe_i32 s9, s1, 0x40008
; GFX9-DL-NEXT: v_mov_b32_e32 v3, s10
; GFX9-DL-NEXT: v_mov_b32_e32 v7, s8
; GFX9-DL-NEXT: v_ashrrev_i16_e32 v4, 12, v4
; GFX9-DL-NEXT: v_ashrrev_i16_e32 v5, 12, v5
; GFX9-DL-NEXT: v_mul_i32_i24_e32 v3, s9, v3
-; GFX9-DL-NEXT: s_bfe_i32 s12, s3, 0x40010
-; GFX9-DL-NEXT: v_and_b32_e32 v4, s2, v4
-; GFX9-DL-NEXT: v_and_b32_e32 v5, s2, v5
-; GFX9-DL-NEXT: s_bfe_i32 s14, s3, 0x40014
-; GFX9-DL-NEXT: s_bfe_i32 s11, s0, 0x40010
+; GFX9-DL-NEXT: s_bfe_i32 s12, s2, 0x40010
+; GFX9-DL-NEXT: v_and_b32_e32 v4, s0, v4
+; GFX9-DL-NEXT: v_and_b32_e32 v5, s0, v5
+; GFX9-DL-NEXT: s_bfe_i32 s14, s2, 0x40014
+; GFX9-DL-NEXT: s_bfe_i32 s11, s1, 0x40010
; GFX9-DL-NEXT: v_mov_b32_e32 v8, s12
-; GFX9-DL-NEXT: s_bfe_i32 s16, s3, 0x40018
-; GFX9-DL-NEXT: s_bfe_i32 s13, s0, 0x40014
+; GFX9-DL-NEXT: s_bfe_i32 s16, s2, 0x40018
+; GFX9-DL-NEXT: s_bfe_i32 s13, s1, 0x40014
; GFX9-DL-NEXT: v_mov_b32_e32 v9, s14
-; GFX9-DL-NEXT: s_bfe_i32 s15, s0, 0x40018
-; GFX9-DL-NEXT: s_ashr_i32 s3, s3, 28
+; GFX9-DL-NEXT: s_bfe_i32 s15, s1, 0x40018
+; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 28
; GFX9-DL-NEXT: v_mov_b32_e32 v10, s16
-; GFX9-DL-NEXT: s_ashr_i32 s0, s0, 28
+; GFX9-DL-NEXT: s_ashr_i32 s1, s1, 28
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
; GFX9-DL-NEXT: v_mad_i32_i24 v2, s5, v6, v2
; GFX9-DL-NEXT: v_mad_i32_i24 v2, s7, v7, v2
; GFX9-DL-NEXT: v_mad_i32_i24 v2, s11, v8, v2
; GFX9-DL-NEXT: v_mad_i32_i24 v2, s13, v9, v2
; GFX9-DL-NEXT: v_mad_i32_i24 v2, s15, v10, v2
-; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3
-; GFX9-DL-NEXT: v_mad_i32_i24 v2, s0, v3, v2
+; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2
+; GFX9-DL-NEXT: v_mad_i32_i24 v2, s1, v3, v2
; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off
; GFX9-DL-NEXT: s_endpgm
;
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_mov_b32 s8, 0xffff
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_load_dword s6, s[6:7], 0x0
; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0
; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
+; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_bfe_i32 s15, s6, 0x40018
-; GFX7-NEXT: s_bfe_i32 s16, s6, 0x40014
-; GFX7-NEXT: s_bfe_i32 s17, s6, 0x40010
-; GFX7-NEXT: s_bfe_i32 s18, s6, 0x40000
-; GFX7-NEXT: s_bfe_i32 s19, s6, 0x40004
-; GFX7-NEXT: s_bfe_i32 s20, s6, 0x40008
-; GFX7-NEXT: s_ashr_i32 s14, s6, 28
-; GFX7-NEXT: s_bfe_i32 s6, s6, 0x4000c
-; GFX7-NEXT: s_ashr_i32 s5, s4, 28
+; GFX7-NEXT: s_ashr_i32 s6, s4, 28
+; GFX7-NEXT: s_bfe_i32 s15, s5, 0x40018
+; GFX7-NEXT: s_bfe_i32 s16, s5, 0x40014
+; GFX7-NEXT: s_bfe_i32 s17, s5, 0x40010
+; GFX7-NEXT: s_bfe_i32 s18, s5, 0x40000
+; GFX7-NEXT: s_bfe_i32 s19, s5, 0x40004
+; GFX7-NEXT: s_bfe_i32 s20, s5, 0x40008
+; GFX7-NEXT: s_ashr_i32 s14, s5, 28
+; GFX7-NEXT: s_bfe_i32 s5, s5, 0x4000c
; GFX7-NEXT: s_bfe_i32 s7, s4, 0x40018
; GFX7-NEXT: s_bfe_i32 s9, s4, 0x40014
; GFX7-NEXT: s_bfe_i32 s10, s4, 0x40010
; GFX7-NEXT: s_bfe_i32 s13, s4, 0x40008
; GFX7-NEXT: v_mov_b32_e32 v2, s20
; GFX7-NEXT: s_bfe_i32 s4, s4, 0x4000c
-; GFX7-NEXT: v_mov_b32_e32 v1, s6
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: v_mul_i32_i24_e32 v1, s4, v1
; GFX7-NEXT: v_mul_i32_i24_e32 v2, s13, v2
; GFX7-NEXT: v_mul_i32_i24_e32 v3, s12, v3
; GFX7-NEXT: v_mad_i32_i24 v0, s9, v6, v0
; GFX7-NEXT: v_mad_i32_i24 v0, s7, v7, v0
; GFX7-NEXT: v_mov_b32_e32 v1, s14
-; GFX7-NEXT: v_mad_i32_i24 v0, s5, v1, v0
+; GFX7-NEXT: v_mad_i32_i24 v0, s6, v1, v0
; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX9-NEXT: s_mov_b32 s2, 0xffff
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: global_load_ubyte v2, v[0:1], off
-; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0
-; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0
+; GFX9-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX9-NEXT: s_load_dword s2, s[6:7], 0x0
+; GFX9-NEXT: s_mov_b32 s0, 0xffff
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_lshr_b32 s7, s0, 4
-; GFX9-NEXT: s_lshr_b32 s14, s1, 4
-; GFX9-NEXT: v_lshlrev_b16_e64 v3, 12, s0
-; GFX9-NEXT: v_lshlrev_b16_e64 v4, 12, s1
+; GFX9-NEXT: s_lshr_b32 s7, s1, 4
+; GFX9-NEXT: s_lshr_b32 s14, s2, 4
+; GFX9-NEXT: v_lshlrev_b16_e64 v3, 12, s1
+; GFX9-NEXT: v_lshlrev_b16_e64 v4, 12, s2
; GFX9-NEXT: v_lshlrev_b16_e64 v7, 12, s7
; GFX9-NEXT: v_lshlrev_b16_e64 v14, 12, s14
-; GFX9-NEXT: s_lshr_b32 s8, s0, 12
-; GFX9-NEXT: s_lshr_b32 s9, s0, 8
-; GFX9-NEXT: s_lshr_b32 s15, s1, 12
-; GFX9-NEXT: s_lshr_b32 s16, s1, 8
+; GFX9-NEXT: s_lshr_b32 s8, s1, 12
+; GFX9-NEXT: s_lshr_b32 s9, s1, 8
+; GFX9-NEXT: s_lshr_b32 s15, s2, 12
+; GFX9-NEXT: s_lshr_b32 s16, s2, 8
; GFX9-NEXT: v_lshlrev_b16_e64 v5, 12, s9
; GFX9-NEXT: v_lshlrev_b16_e64 v6, 12, s8
; GFX9-NEXT: v_lshlrev_b16_e64 v12, 12, s16
; GFX9-NEXT: v_mul_lo_u16_e32 v3, v3, v4
; GFX9-NEXT: v_mul_lo_u16_sdwa v7, v7, v14 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v3, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_lshr_b32 s3, s0, 20
-; GFX9-NEXT: s_lshr_b32 s4, s0, 16
-; GFX9-NEXT: s_lshr_b32 s10, s1, 20
-; GFX9-NEXT: s_lshr_b32 s11, s1, 16
+; GFX9-NEXT: s_lshr_b32 s3, s1, 20
+; GFX9-NEXT: s_lshr_b32 s4, s1, 16
+; GFX9-NEXT: s_lshr_b32 s10, s2, 20
+; GFX9-NEXT: s_lshr_b32 s11, s2, 16
; GFX9-NEXT: v_mul_lo_u16_sdwa v6, v6, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-NEXT: v_mul_lo_u16_e32 v5, v5, v12
; GFX9-NEXT: v_lshlrev_b16_e64 v10, 12, s4
; GFX9-NEXT: v_lshlrev_b16_e64 v11, 12, s3
; GFX9-NEXT: v_lshlrev_b16_e64 v17, 12, s11
; GFX9-NEXT: v_lshlrev_b16_e64 v18, 12, s10
-; GFX9-NEXT: s_lshr_b32 s5, s0, 28
-; GFX9-NEXT: s_lshr_b32 s6, s0, 24
-; GFX9-NEXT: s_lshr_b32 s12, s1, 28
-; GFX9-NEXT: s_lshr_b32 s13, s1, 24
-; GFX9-NEXT: v_and_b32_e32 v3, s2, v3
+; GFX9-NEXT: s_lshr_b32 s5, s1, 28
+; GFX9-NEXT: s_lshr_b32 s6, s1, 24
+; GFX9-NEXT: s_lshr_b32 s12, s2, 28
+; GFX9-NEXT: s_lshr_b32 s13, s2, 24
+; GFX9-NEXT: v_and_b32_e32 v3, s0, v3
; GFX9-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_lshlrev_b16_e64 v8, 12, s6
; GFX9-NEXT: v_lshlrev_b16_e64 v9, 12, s5
; GFX9-NEXT: v_mul_lo_u16_sdwa v9, v9, v16 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-NEXT: v_mul_lo_u16_e32 v8, v8, v15
; GFX9-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_and_b32_e32 v4, s2, v4
+; GFX9-NEXT: v_and_b32_e32 v4, s0, v4
; GFX9-NEXT: v_or_b32_e32 v6, v4, v8
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_u32_e32 v2, v3, v2
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX9-DL-NEXT: s_mov_b32 s2, 0xffff
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off
-; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0
-; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0
+; GFX9-DL-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0
+; GFX9-DL-NEXT: s_mov_b32 s0, 0xffff
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: s_lshr_b32 s7, s0, 4
-; GFX9-DL-NEXT: s_lshr_b32 s14, s1, 4
-; GFX9-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s0
-; GFX9-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s1
+; GFX9-DL-NEXT: s_lshr_b32 s7, s1, 4
+; GFX9-DL-NEXT: s_lshr_b32 s14, s2, 4
+; GFX9-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s1
+; GFX9-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s2
; GFX9-DL-NEXT: v_lshlrev_b16_e64 v7, 12, s7
; GFX9-DL-NEXT: v_lshlrev_b16_e64 v14, 12, s14
-; GFX9-DL-NEXT: s_lshr_b32 s8, s0, 12
-; GFX9-DL-NEXT: s_lshr_b32 s9, s0, 8
-; GFX9-DL-NEXT: s_lshr_b32 s15, s1, 12
-; GFX9-DL-NEXT: s_lshr_b32 s16, s1, 8
+; GFX9-DL-NEXT: s_lshr_b32 s8, s1, 12
+; GFX9-DL-NEXT: s_lshr_b32 s9, s1, 8
+; GFX9-DL-NEXT: s_lshr_b32 s15, s2, 12
+; GFX9-DL-NEXT: s_lshr_b32 s16, s2, 8
; GFX9-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s9
; GFX9-DL-NEXT: v_lshlrev_b16_e64 v6, 12, s8
; GFX9-DL-NEXT: v_lshlrev_b16_e64 v12, 12, s16
; GFX9-DL-NEXT: v_mul_lo_u16_e32 v3, v3, v4
; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v7, v7, v14 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-DL-NEXT: v_or_b32_sdwa v3, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-DL-NEXT: s_lshr_b32 s3, s0, 20
-; GFX9-DL-NEXT: s_lshr_b32 s4, s0, 16
-; GFX9-DL-NEXT: s_lshr_b32 s10, s1, 20
-; GFX9-DL-NEXT: s_lshr_b32 s11, s1, 16
+; GFX9-DL-NEXT: s_lshr_b32 s3, s1, 20
+; GFX9-DL-NEXT: s_lshr_b32 s4, s1, 16
+; GFX9-DL-NEXT: s_lshr_b32 s10, s2, 20
+; GFX9-DL-NEXT: s_lshr_b32 s11, s2, 16
; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v6, v6, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-DL-NEXT: v_mul_lo_u16_e32 v5, v5, v12
; GFX9-DL-NEXT: v_lshlrev_b16_e64 v10, 12, s4
; GFX9-DL-NEXT: v_lshlrev_b16_e64 v11, 12, s3
; GFX9-DL-NEXT: v_lshlrev_b16_e64 v17, 12, s11
; GFX9-DL-NEXT: v_lshlrev_b16_e64 v18, 12, s10
-; GFX9-DL-NEXT: s_lshr_b32 s5, s0, 28
-; GFX9-DL-NEXT: s_lshr_b32 s6, s0, 24
-; GFX9-DL-NEXT: s_lshr_b32 s12, s1, 28
-; GFX9-DL-NEXT: s_lshr_b32 s13, s1, 24
-; GFX9-DL-NEXT: v_and_b32_e32 v3, s2, v3
+; GFX9-DL-NEXT: s_lshr_b32 s5, s1, 28
+; GFX9-DL-NEXT: s_lshr_b32 s6, s1, 24
+; GFX9-DL-NEXT: s_lshr_b32 s12, s2, 28
+; GFX9-DL-NEXT: s_lshr_b32 s13, s2, 24
+; GFX9-DL-NEXT: v_and_b32_e32 v3, s0, v3
; GFX9-DL-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-DL-NEXT: v_lshlrev_b16_e64 v8, 12, s6
; GFX9-DL-NEXT: v_lshlrev_b16_e64 v9, 12, s5
; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v9, v9, v16 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-DL-NEXT: v_mul_lo_u16_e32 v8, v8, v15
; GFX9-DL-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-DL-NEXT: v_and_b32_e32 v4, s2, v4
+; GFX9-DL-NEXT: v_and_b32_e32 v4, s0, v4
; GFX9-DL-NEXT: v_or_b32_e32 v6, v4, v8
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
; GFX9-DL-NEXT: v_add_u32_e32 v2, v3, v2
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX9-NEXT: s_mov_b32 s2, 0xffff
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: global_load_ubyte v2, v[0:1], off
-; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0
-; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0
+; GFX9-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX9-NEXT: s_load_dword s2, s[6:7], 0x0
+; GFX9-NEXT: s_mov_b32 s0, 0xffff
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_bfe_u32 s3, s0, 0x40010
-; GFX9-NEXT: s_bfe_u32 s10, s1, 0x40010
-; GFX9-NEXT: s_bfe_u32 s11, s1, 0x40014
-; GFX9-NEXT: s_bfe_u32 s12, s1, 0x40018
-; GFX9-NEXT: s_lshr_b32 s13, s1, 28
-; GFX9-NEXT: s_and_b32 s14, s1, 15
-; GFX9-NEXT: s_bfe_u32 s15, s1, 0x40004
-; GFX9-NEXT: s_bfe_u32 s16, s1, 0x40008
+; GFX9-NEXT: s_bfe_u32 s3, s1, 0x40010
+; GFX9-NEXT: s_bfe_u32 s10, s2, 0x40010
+; GFX9-NEXT: s_bfe_u32 s11, s2, 0x40014
+; GFX9-NEXT: s_bfe_u32 s12, s2, 0x40018
+; GFX9-NEXT: s_lshr_b32 s13, s2, 28
+; GFX9-NEXT: s_and_b32 s14, s2, 15
+; GFX9-NEXT: s_bfe_u32 s15, s2, 0x40004
+; GFX9-NEXT: s_bfe_u32 s16, s2, 0x40008
; GFX9-NEXT: v_mov_b32_e32 v3, s10
-; GFX9-NEXT: s_bfe_u32 s1, s1, 0x4000c
-; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40014
+; GFX9-NEXT: s_bfe_u32 s2, s2, 0x4000c
+; GFX9-NEXT: s_bfe_u32 s4, s1, 0x40014
; GFX9-NEXT: v_mov_b32_e32 v4, s11
-; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40018
+; GFX9-NEXT: s_bfe_u32 s5, s1, 0x40018
; GFX9-NEXT: v_mov_b32_e32 v5, s12
-; GFX9-NEXT: s_lshr_b32 s6, s0, 28
+; GFX9-NEXT: s_lshr_b32 s6, s1, 28
; GFX9-NEXT: v_mov_b32_e32 v6, s13
-; GFX9-NEXT: s_and_b32 s7, s0, 15
+; GFX9-NEXT: s_and_b32 s7, s1, 15
; GFX9-NEXT: v_mov_b32_e32 v7, s14
-; GFX9-NEXT: s_bfe_u32 s8, s0, 0x40004
+; GFX9-NEXT: s_bfe_u32 s8, s1, 0x40004
; GFX9-NEXT: v_mov_b32_e32 v8, s15
-; GFX9-NEXT: s_bfe_u32 s9, s0, 0x40008
+; GFX9-NEXT: s_bfe_u32 s9, s1, 0x40008
; GFX9-NEXT: v_mov_b32_e32 v9, s16
-; GFX9-NEXT: s_bfe_u32 s0, s0, 0x4000c
-; GFX9-NEXT: v_mov_b32_e32 v10, s1
+; GFX9-NEXT: s_bfe_u32 s1, s1, 0x4000c
+; GFX9-NEXT: v_mov_b32_e32 v10, s2
; GFX9-NEXT: v_mul_lo_u16_e32 v3, s3, v3
; GFX9-NEXT: v_mul_lo_u16_sdwa v4, s4, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-NEXT: v_mul_lo_u16_e32 v5, s5, v5
; GFX9-NEXT: v_or_b32_sdwa v4, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-NEXT: v_or_b32_e32 v5, v7, v8
; GFX9-NEXT: v_mul_lo_u16_e32 v9, s9, v9
-; GFX9-NEXT: v_mul_lo_u16_sdwa v10, s0, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NEXT: v_and_b32_e32 v5, s2, v5
+; GFX9-NEXT: v_mul_lo_u16_sdwa v10, s1, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_and_b32_e32 v5, s0, v5
; GFX9-NEXT: v_or_b32_sdwa v6, v9, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-NEXT: v_or_b32_e32 v6, v5, v6
; GFX9-NEXT: v_lshrrev_b32_e32 v7, 8, v6
-; GFX9-NEXT: v_and_b32_e32 v3, s2, v3
+; GFX9-NEXT: v_and_b32_e32 v3, s0, v3
; GFX9-NEXT: v_or_b32_e32 v4, v3, v4
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_u32_e32 v2, v5, v2
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX9-DL-NEXT: s_mov_b32 s2, 0xffff
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off
-; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0
-; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0
+; GFX9-DL-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0
+; GFX9-DL-NEXT: s_mov_b32 s0, 0xffff
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: s_bfe_u32 s3, s0, 0x40010
-; GFX9-DL-NEXT: s_bfe_u32 s10, s1, 0x40010
-; GFX9-DL-NEXT: s_bfe_u32 s11, s1, 0x40014
-; GFX9-DL-NEXT: s_bfe_u32 s12, s1, 0x40018
-; GFX9-DL-NEXT: s_lshr_b32 s13, s1, 28
-; GFX9-DL-NEXT: s_and_b32 s14, s1, 15
-; GFX9-DL-NEXT: s_bfe_u32 s15, s1, 0x40004
-; GFX9-DL-NEXT: s_bfe_u32 s16, s1, 0x40008
+; GFX9-DL-NEXT: s_bfe_u32 s3, s1, 0x40010
+; GFX9-DL-NEXT: s_bfe_u32 s10, s2, 0x40010
+; GFX9-DL-NEXT: s_bfe_u32 s11, s2, 0x40014
+; GFX9-DL-NEXT: s_bfe_u32 s12, s2, 0x40018
+; GFX9-DL-NEXT: s_lshr_b32 s13, s2, 28
+; GFX9-DL-NEXT: s_and_b32 s14, s2, 15
+; GFX9-DL-NEXT: s_bfe_u32 s15, s2, 0x40004
+; GFX9-DL-NEXT: s_bfe_u32 s16, s2, 0x40008
; GFX9-DL-NEXT: v_mov_b32_e32 v3, s10
-; GFX9-DL-NEXT: s_bfe_u32 s1, s1, 0x4000c
-; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x40014
+; GFX9-DL-NEXT: s_bfe_u32 s2, s2, 0x4000c
+; GFX9-DL-NEXT: s_bfe_u32 s4, s1, 0x40014
; GFX9-DL-NEXT: v_mov_b32_e32 v4, s11
-; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40018
+; GFX9-DL-NEXT: s_bfe_u32 s5, s1, 0x40018
; GFX9-DL-NEXT: v_mov_b32_e32 v5, s12
-; GFX9-DL-NEXT: s_lshr_b32 s6, s0, 28
+; GFX9-DL-NEXT: s_lshr_b32 s6, s1, 28
; GFX9-DL-NEXT: v_mov_b32_e32 v6, s13
-; GFX9-DL-NEXT: s_and_b32 s7, s0, 15
+; GFX9-DL-NEXT: s_and_b32 s7, s1, 15
; GFX9-DL-NEXT: v_mov_b32_e32 v7, s14
-; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x40004
+; GFX9-DL-NEXT: s_bfe_u32 s8, s1, 0x40004
; GFX9-DL-NEXT: v_mov_b32_e32 v8, s15
-; GFX9-DL-NEXT: s_bfe_u32 s9, s0, 0x40008
+; GFX9-DL-NEXT: s_bfe_u32 s9, s1, 0x40008
; GFX9-DL-NEXT: v_mov_b32_e32 v9, s16
-; GFX9-DL-NEXT: s_bfe_u32 s0, s0, 0x4000c
-; GFX9-DL-NEXT: v_mov_b32_e32 v10, s1
+; GFX9-DL-NEXT: s_bfe_u32 s1, s1, 0x4000c
+; GFX9-DL-NEXT: v_mov_b32_e32 v10, s2
; GFX9-DL-NEXT: v_mul_lo_u16_e32 v3, s3, v3
; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v4, s4, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-DL-NEXT: v_mul_lo_u16_e32 v5, s5, v5
; GFX9-DL-NEXT: v_or_b32_sdwa v4, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-DL-NEXT: v_or_b32_e32 v5, v7, v8
; GFX9-DL-NEXT: v_mul_lo_u16_e32 v9, s9, v9
-; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v10, s0, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-DL-NEXT: v_and_b32_e32 v5, s2, v5
+; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v10, s1, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-DL-NEXT: v_and_b32_e32 v5, s0, v5
; GFX9-DL-NEXT: v_or_b32_sdwa v6, v9, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-DL-NEXT: v_or_b32_e32 v6, v5, v6
; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v6
-; GFX9-DL-NEXT: v_and_b32_e32 v3, s2, v3
+; GFX9-DL-NEXT: v_and_b32_e32 v3, s0, v3
; GFX9-DL-NEXT: v_or_b32_e32 v4, v3, v4
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
; GFX9-DL-NEXT: v_add_u32_e32 v2, v5, v2
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: s_load_dword s4, s[4:5], 0xc
; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: s_load_dword s2, s[2:3], 0x0
; CI-NEXT: v_mov_b32_e32 v0, s0
-; CI-NEXT: s_load_dword s0, s[2:3], 0x0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: s_and_b32 s1, s4, 0xffff
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_and_b32 s0, s0, 0xffff0000
+; CI-NEXT: s_and_b32 s0, s2, 0xffff0000
; CI-NEXT: s_or_b32 s0, s1, s0
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_store_dword v[0:1], v2
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_load_dword s4, s[4:5], 0x30
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_lshr_b32 s0, s0, 16
+; GFX9-NEXT: s_lshr_b32 s0, s2, 16
; GFX9-NEXT: s_pack_ll_b32_b16 s1, s4, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s1
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: s_load_dword s4, s[4:5], 0xc
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: v_mov_b32_e32 v0, s0
-; CI-NEXT: s_load_dword s0, s[2:3], 0x0
+; CI-NEXT: s_load_dword s2, s[2:3], 0x0
; CI-NEXT: v_mov_b32_e32 v1, s1
-; CI-NEXT: s_and_b32 s1, s4, 0xffff
+; CI-NEXT: v_mov_b32_e32 v0, s0
+; CI-NEXT: s_and_b32 s0, s4, 0xffff
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_lshr_b32 s0, s0, 16
-; CI-NEXT: s_lshl_b32 s2, s0, 16
-; CI-NEXT: s_or_b32 s1, s1, s2
-; CI-NEXT: v_mov_b32_e32 v2, s1
+; CI-NEXT: s_lshr_b32 s1, s2, 16
+; CI-NEXT: s_lshl_b32 s2, s1, 16
+; CI-NEXT: s_or_b32 s0, s0, s2
+; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: ;;#ASMSTART
-; CI-NEXT: ; use s0
+; CI-NEXT: ; use s1
; CI-NEXT: ;;#ASMEND
; CI-NEXT: s_endpgm
%vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: s_load_dword s4, s[4:5], 0xc
; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: s_load_dword s2, s[2:3], 0x0
; CI-NEXT: v_mov_b32_e32 v0, s0
-; CI-NEXT: s_load_dword s0, s[2:3], 0x0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: s_lshr_b32 s1, s4, 16
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_and_b32 s0, s0, 0xffff0000
+; CI-NEXT: s_and_b32 s0, s2, 0xffff0000
; CI-NEXT: s_or_b32 s0, s1, s0
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_store_dword v[0:1], v2
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX9-NEXT: s_lshr_b32 s0, s4, 16
; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: s_lshr_b32 s1, s4, 16
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_pack_lh_b32_b16 s0, s1, s0
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: s_pack_lh_b32_b16 s1, s0, s2
+; GFX9-NEXT: v_mov_b32_e32 v2, s1
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: ;;#ASMSTART
-; GFX9-NEXT: ; use s1
+; GFX9-NEXT: ; use s0
; GFX9-NEXT: ;;#ASMEND
; GFX9-NEXT: s_endpgm
;
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_load_dword s4, s[4:5], 0x10
; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_load_dword s2, s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_load_dword s0, s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_lshr_b32 s1, s4, 16
+; VI-NEXT: s_lshr_b32 s0, s4, 16
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_and_b32 s0, s0, 0xffff0000
-; VI-NEXT: s_or_b32 s0, s1, s0
-; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: s_and_b32 s1, s2, 0xffff0000
+; VI-NEXT: s_or_b32 s1, s0, s1
+; VI-NEXT: v_mov_b32_e32 v2, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: ;;#ASMSTART
-; VI-NEXT: ; use s1
+; VI-NEXT: ; use s0
; VI-NEXT: ;;#ASMEND
; VI-NEXT: s_endpgm
;
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: s_load_dword s4, s[4:5], 0x4
; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: s_load_dword s2, s[2:3], 0x0
; CI-NEXT: v_mov_b32_e32 v0, s0
-; CI-NEXT: s_load_dword s0, s[2:3], 0x0
; CI-NEXT: v_mov_b32_e32 v1, s1
-; CI-NEXT: s_lshr_b32 s1, s4, 16
+; CI-NEXT: s_lshr_b32 s0, s4, 16
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_and_b32 s0, s0, 0xffff0000
-; CI-NEXT: s_or_b32 s0, s1, s0
-; CI-NEXT: v_mov_b32_e32 v2, s0
+; CI-NEXT: s_and_b32 s1, s2, 0xffff0000
+; CI-NEXT: s_or_b32 s1, s0, s1
+; CI-NEXT: v_mov_b32_e32 v2, s1
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: ;;#ASMSTART
-; CI-NEXT: ; use s1
+; CI-NEXT: ; use s0
; CI-NEXT: ;;#ASMEND
; CI-NEXT: s_endpgm
%vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: s_load_dword s4, s[4:5], 0xc
; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: s_load_dword s2, s[2:3], 0x0
; CI-NEXT: v_mov_b32_e32 v0, s0
-; CI-NEXT: s_load_dword s0, s[2:3], 0x0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: s_lshl_b32 s1, s4, 16
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_and_b32 s0, s0, 0xffff
+; CI-NEXT: s_and_b32 s0, s2, 0xffff
; CI-NEXT: s_or_b32 s0, s0, s1
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CI-NEXT: flat_load_dword v0, v[0:1]
-; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
-; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: flat_load_dword v3, v[0:1]
+; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
+; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: s_lshr_b32 s0, s4, 16
-; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; CI-NEXT: v_or_b32_e32 v0, s0, v0
-; CI-NEXT: flat_store_dword v[2:3], v0
+; CI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
+; CI-NEXT: v_or_b32_e32 v2, s0, v2
+; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: global_load_dword v0, v[0:1], off
-; GFX9-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
-; GFX9-NEXT: v_mov_b32_e32 v1, 0x4500
-; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT: global_load_dword v3, v[0:1], off
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1
-; GFX9-NEXT: global_store_dword v[2:3], v0, off
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v3
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x4500
+; GFX9-NEXT: v_lshl_or_b32 v2, v2, 16, v3
+; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_insertelement_v2f16_0:
; GFX9-NEXT: v_mov_b32_e32 v2, 0x3e703e7
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: s_load_dword s4, s[4:5], 0x0
+; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0
-; GFX9-NEXT: s_load_dword s1, s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_lshl_b32 s0, s0, 4
+; GFX9-NEXT: s_lshl_b32 s0, s4, 4
; GFX9-NEXT: s_lshl_b32 s0, 0xffff, s0
-; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_mov_b32_e32 v3, s2
; GFX9-NEXT: v_bfi_b32 v2, s0, v2, v3
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_endpgm
; VI-NEXT: v_mov_b32_e32 v2, 0x3e703e7
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_load_dword s4, s[4:5], 0x0
+; VI-NEXT: s_load_dword s2, s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_load_dword s0, s[4:5], 0x0
-; VI-NEXT: s_load_dword s1, s[2:3], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b32 s0, s0, 4
+; VI-NEXT: s_lshl_b32 s0, s4, 4
; VI-NEXT: s_lshl_b32 s0, 0xffff, s0
-; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v3, s2
; VI-NEXT: v_bfi_b32 v2, s0, v2, v3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
; CI-NEXT: v_mov_b32_e32 v2, 0x3e703e7
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
+; CI-NEXT: s_load_dword s4, s[4:5], 0x0
+; CI-NEXT: s_load_dword s2, s[2:3], 0x0
; CI-NEXT: v_mov_b32_e32 v1, s1
-; CI-NEXT: s_load_dword s0, s[4:5], 0x0
-; CI-NEXT: s_load_dword s1, s[2:3], 0x0
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_lshl_b32 s0, s0, 4
+; CI-NEXT: s_lshl_b32 s0, s4, 4
; CI-NEXT: s_lshl_b32 s0, 0xffff, s0
-; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: v_mov_b32_e32 v3, s2
; CI-NEXT: v_bfi_b32 v2, s0, v2, v3
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v4
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: v_mov_b32_e32 v3, s5
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s4, v4
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX9-NEXT: global_load_dword v0, v[0:1], off
-; GFX9-NEXT: global_load_dword v1, v[2:3], off
-; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s0, v4
-; GFX9-NEXT: s_mov_b32 s0, 0xffff
-; GFX9-NEXT: v_mov_b32_e32 v5, s1
-; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1
-; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s0
+; GFX9-NEXT: global_load_dword v2, v[2:3], off
+; GFX9-NEXT: global_load_dword v3, v[0:1], off
+; GFX9-NEXT: s_mov_b32 s2, 0xffff
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v4
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: s_mov_b32 s0, 0x12341234
-; GFX9-NEXT: v_bfi_b32 v0, v1, s0, v0
-; GFX9-NEXT: global_store_dword v[4:5], v0, off
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 4, v2
+; GFX9-NEXT: v_lshlrev_b32_e64 v2, v2, s2
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_bfi_b32 v2, v2, s0, v3
+; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_insertelement_v2f16_dynamic_vgpr:
; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-NEXT: flat_load_dword v0, v[0:1]
-; VI-NEXT: flat_load_dword v1, v[2:3]
-; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4
-; VI-NEXT: s_mov_b32 s0, 0xffff
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
-; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_lshlrev_b32_e32 v1, 4, v1
-; VI-NEXT: v_lshlrev_b32_e64 v1, v1, s0
+; VI-NEXT: flat_load_dword v2, v[2:3]
+; VI-NEXT: flat_load_dword v3, v[0:1]
+; VI-NEXT: s_mov_b32 s2, 0xffff
+; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4
+; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_mov_b32 s0, 0x12341234
-; VI-NEXT: v_bfi_b32 v0, v1, s0, v0
-; VI-NEXT: flat_store_dword v[4:5], v0
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
+; VI-NEXT: v_lshlrev_b32_e32 v2, 4, v2
+; VI-NEXT: v_lshlrev_b32_e64 v2, v2, s2
+; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT: v_bfi_b32 v2, v2, s0, v3
+; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; CI-LABEL: v_insertelement_v2f16_dynamic_vgpr:
; CI-NEXT: v_add_i32_e32 v2, vcc, s4, v4
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; CI-NEXT: flat_load_dword v2, v[2:3]
-; CI-NEXT: flat_load_dword v0, v[0:1]
-; CI-NEXT: v_add_i32_e32 v4, vcc, s0, v4
-; CI-NEXT: v_mov_b32_e32 v5, s1
+; CI-NEXT: flat_load_dword v3, v[0:1]
+; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v4
+; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: s_mov_b32 s0, 0x12341234
-; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
+; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
-; CI-NEXT: v_lshlrev_b32_e32 v1, 4, v2
-; CI-NEXT: v_lshl_b32_e32 v1, 0xffff, v1
+; CI-NEXT: v_lshlrev_b32_e32 v2, 4, v2
+; CI-NEXT: v_lshl_b32_e32 v2, 0xffff, v2
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CI-NEXT: v_bfi_b32 v0, v1, s0, v0
-; CI-NEXT: flat_store_dword v[4:5], v0
+; CI-NEXT: v_bfi_b32 v2, v2, s0, v3
+; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v4
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: global_load_dword v4, v[0:1], off
+; GFX9-NEXT: global_load_dword v2, v[0:1], off
; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
-; GFX9-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
-; GFX9-NEXT: s_mov_b32 s1, 0
-; GFX9-NEXT: s_mov_b32 s0, 0xffff
-; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT: s_mov_b32 s3, 0
+; GFX9-NEXT: s_mov_b32 s2, 0xffff
+; GFX9-NEXT: v_mov_b32_e32 v5, s1
+; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s0, v4
+; GFX9-NEXT: s_pack_ll_b32_b16 s1, s4, s4
+; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_lshlrev_b32_e32 v4, 4, v4
-; GFX9-NEXT: v_lshlrev_b64 v[4:5], v4, s[0:1]
-; GFX9-NEXT: s_pack_ll_b32_b16 s0, s4, s4
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 4, v2
+; GFX9-NEXT: v_lshlrev_b64 v[2:3], v2, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_bfi_b32 v1, v5, s0, v1
-; GFX9-NEXT: v_bfi_b32 v0, v4, s0, v0
-; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
+; GFX9-NEXT: v_bfi_b32 v1, v3, s1, v1
+; GFX9-NEXT: v_bfi_b32 v0, v2, s1, v0
+; GFX9-NEXT: global_store_dwordx2 v[4:5], v[0:1], off
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_insertelement_v4i16_dynamic_vgpr:
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v4, v[0:1]
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
-; VI-NEXT: s_mov_b32 s0, 0xffff
+; VI-NEXT: s_mov_b32 s2, 0xffff
; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: s_and_b32 s2, s4, s0
-; VI-NEXT: s_mov_b32 s1, 0
-; VI-NEXT: s_lshl_b32 s3, s2, 16
+; VI-NEXT: s_mov_b32 s3, 0
+; VI-NEXT: s_and_b32 s1, s4, s2
+; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: s_lshl_b32 s0, s1, 16
+; VI-NEXT: s_or_b32 s0, s1, s0
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v4
-; VI-NEXT: v_lshlrev_b64 v[4:5], v4, s[0:1]
-; VI-NEXT: s_or_b32 s0, s2, s3
+; VI-NEXT: v_lshlrev_b64 v[4:5], v4, s[2:3]
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_bfi_b32 v1, v5, s0, v1
; VI-NEXT: v_bfi_b32 v0, v4, s0, v0
;
; CI-LABEL: v_insertelement_v4i16_dynamic_vgpr:
; CI: ; %bb.0:
-; CI-NEXT: flat_load_dword v4, v[0:1]
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: s_load_dword s4, s[4:5], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; CI-NEXT: s_mov_b32 s6, 0xffff
-; CI-NEXT: s_mov_b32 s7, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT: flat_load_dword v4, v[0:1]
; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; CI-NEXT: s_mov_b32 s2, 0xffff
+; CI-NEXT: s_mov_b32 s3, 0
; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: s_lshl_b32 s1, s4, 16
-; CI-NEXT: s_and_b32 s3, s4, s6
+; CI-NEXT: s_and_b32 s4, s4, s2
; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
-; CI-NEXT: s_or_b32 s0, s3, s1
+; CI-NEXT: s_or_b32 s0, s4, s1
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; CI-NEXT: s_waitcnt vmcnt(1)
+; CI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
; CI-NEXT: v_lshlrev_b32_e32 v4, 4, v4
-; CI-NEXT: v_lshl_b64 v[4:5], s[6:7], v4
+; CI-NEXT: v_lshl_b64 v[4:5], s[2:3], v4
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CI-NEXT: v_bfi_b32 v1, v5, s0, v1
; CI-NEXT: v_bfi_b32 v0, v4, s0, v0
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
-; GFX9-NEXT: s_pack_ll_b32_b16 s2, s4, s4
+; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s4
; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: s_mov_b32 s3, 0
+; GFX9-NEXT: s_mov_b32 s2, 0xffff
+; GFX9-NEXT: s_lshl_b32 s1, s5, 4
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
-; GFX9-NEXT: s_mov_b32 s1, 0
-; GFX9-NEXT: s_mov_b32 s0, 0xffff
-; GFX9-NEXT: s_lshl_b32 s3, s5, 4
-; GFX9-NEXT: v_mov_b32_e32 v4, s2
-; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s3
-; GFX9-NEXT: v_mov_b32_e32 v5, s2
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], s1
+; GFX9-NEXT: v_mov_b32_e32 v4, s4
+; GFX9-NEXT: v_mov_b32_e32 v5, s4
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_bfi_b32 v1, s1, v5, v1
-; GFX9-NEXT: v_bfi_b32 v0, s0, v4, v0
+; GFX9-NEXT: v_bfi_b32 v1, s1, v4, v1
+; GFX9-NEXT: v_bfi_b32 v0, s0, v5, v0
; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
; GFX9-NEXT: s_endpgm
;
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
-; VI-NEXT: s_mov_b32 s0, 0xffff
+; VI-NEXT: s_mov_b32 s2, 0xffff
; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: s_mov_b32 s1, 0
-; VI-NEXT: s_lshl_b32 s2, s5, 4
-; VI-NEXT: s_and_b32 s3, s4, s0
-; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], s2
-; VI-NEXT: s_lshl_b32 s2, s3, 16
-; VI-NEXT: s_or_b32 s2, s3, s2
+; VI-NEXT: s_mov_b32 s3, 0
+; VI-NEXT: s_lshl_b32 s1, s5, 4
+; VI-NEXT: s_and_b32 s4, s4, s2
+; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], s1
+; VI-NEXT: s_lshl_b32 s2, s4, 16
+; VI-NEXT: s_or_b32 s2, s4, s2
; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: v_mov_b32_e32 v5, s2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
-; CI-NEXT: s_mov_b32 s0, 0xffff
-; CI-NEXT: s_and_b32 s2, s4, s0
-; CI-NEXT: s_lshl_b32 s4, s4, 16
+; CI-NEXT: s_mov_b32 s2, 0xffff
; CI-NEXT: v_mov_b32_e32 v3, s1
-; CI-NEXT: s_or_b32 s2, s2, s4
-; CI-NEXT: s_mov_b32 s1, 0
-; CI-NEXT: s_lshl_b32 s3, s5, 4
-; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], s3
+; CI-NEXT: s_and_b32 s6, s4, s2
+; CI-NEXT: s_mov_b32 s3, 0
+; CI-NEXT: s_lshl_b32 s1, s5, 4
+; CI-NEXT: s_lshl_b32 s4, s4, 16
+; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
+; CI-NEXT: s_lshl_b64 s[0:1], s[2:3], s1
+; CI-NEXT: s_or_b32 s2, s6, s4
; CI-NEXT: v_mov_b32_e32 v4, s2
; CI-NEXT: v_mov_b32_e32 v5, s2
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
;CHECK: buffer_atomic_swap v0, v2, s[0:3], 0 offen glc
;CHECK: s_waitcnt vmcnt(0)
;CHECK: buffer_atomic_swap v0, v[1:2], s[0:3], 0 idxen offen glc
+;SICI: v_mov_b32_e32 v1, 0x2000
;CHECK: s_waitcnt vmcnt(0)
;CHECK: buffer_atomic_swap v0, v2, s[0:3], 0 offen offset:42 glc
;CHECK-DAG: s_waitcnt vmcnt(0)
;CHECK-NOT: s_waitcnt
;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, s[0:3], 0 idxen glc
;CHECK: s_waitcnt vmcnt(0)
-;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, v2, s[0:3], 0 idxen glc
+;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, s[0:3], 0 idxen glc
;CHECK: s_waitcnt vmcnt(0)
;CHECK: s_movk_i32 [[SOFS:s[0-9]+]], 0x1ffc
;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, s[0:3], 0 idxen offen glc
; VI-LABEL: simplify_bfe_u32_multi_use_arg:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: s_mov_b32 s10, s2
-; VI-NEXT: s_mov_b32 s11, s3
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_mov_b32 s11, 0xf000
+; VI-NEXT: s_mov_b32 s10, -1
+; VI-NEXT: s_mov_b32 s2, s10
+; VI-NEXT: s_mov_b32 s3, s11
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s0, s4
-; VI-NEXT: s_mov_b32 s1, s5
-; VI-NEXT: s_mov_b32 s4, s6
-; VI-NEXT: s_mov_b32 s5, s7
-; VI-NEXT: s_mov_b32 s6, s2
-; VI-NEXT: s_mov_b32 s7, s3
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0
+; VI-NEXT: s_mov_b32 s8, s4
+; VI-NEXT: s_mov_b32 s9, s5
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_and_b32_e32 v0, 63, v0
; VI-NEXT: v_bfe_u32 v1, v0, 2, 2
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], 0
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v1, off, s[8:11], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
i32 addrspace(1)* %out1,
i32 addrspace(1)* %in) #0 {
; GFX9-LABEL: cos_v2f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: v_mov_b32_e32 v2, 0x3118
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: v_mov_b32_e32 v1, 0x3118
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mul_f16_e32 v1, 0.15915494, v0
-; GFX9-NEXT: v_cos_f16_e32 v3, v1
-; GFX9-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT: v_cos_f16_e32 v2, v0
+; GFX9-NEXT: v_mul_f16_e32 v2, 0.15915494, v0
+; GFX9-NEXT: v_cos_f16_e32 v2, v2
+; GFX9-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: v_cos_f16_e32 v3, v0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: v_lshl_or_b32 v2, v2, 16, v3
+; GFX9-NEXT: v_lshl_or_b32 v2, v3, 16, v2
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_endpgm
%a.val = load <2 x half>, <2 x half> addrspace(1)* %a
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; VI-DAG: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[C_F16_1]], s[[A_F16]], v[[B_F16_1]]
-; VI-DAG: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[C_V2_F16]], s[[A_F16]], v[[B_V2_F16]]
+; VI-DAG: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[B_F16_1]], s[[A_F16]], v[[C_F16_1]]
+; VI-DAG: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[B_V2_F16]], s[[A_F16]], v[[C_V2_F16]]
; GFX9: v_pk_fma_f16 v[[R_V2_F16:[0-9]+]], v[[C_V2_F16]], s[[A_F16]], v[[B_V2_F16]]
; SI: buffer_load_dword v[[C_V2_F16:[0-9]+]]
; SI: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; VIGFX9: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; VIGFX9: buffer_load_dword v[[C_V2_F16:[0-9]+]]
+; VI: buffer_load_dword v[[C_V2_F16:[0-9]+]]
+; VIGFX9: buffer_load_dword v[[A_V2_F16:[0-9]+]]
+; GFX9: buffer_load_dword v[[C_V2_F16:[0-9]+]]
; SI: s_mov_b32 s[[B_F32:[0-9]+]], 0x40400000{{$}}
; VIGFX9: s_movk_i32 s[[B_F16:[0-9]+]], 0x4200{{$}}
; SI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
; SI: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; VIGFX9: buffer_load_dword v[[A_V2_F16:[0-9]+]]
+; GFX9: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; VIGFX9: buffer_load_dword v[[B_V2_F16:[0-9]+]]
+; VI: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; SI: s_mov_b32 s[[C_F32:[0-9]+]], 0x40400000{{$}}
; VIGFX9: s_movk_i32 s[[C_F16:[0-9]+]], 0x4200{{$}}
; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s14, s2
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s0, s4
-; VI-NEXT: s_mov_b32 s1, s5
-; VI-NEXT: s_mov_b32 s4, s6
-; VI-NEXT: s_mov_b32 s5, s7
-; VI-NEXT: s_mov_b32 s6, s2
-; VI-NEXT: s_mov_b32 s7, s3
+; VI-NEXT: s_mov_b32 s12, s6
+; VI-NEXT: s_mov_b32 s13, s7
+; VI-NEXT: s_mov_b32 s15, s3
+; VI-NEXT: s_mov_b32 s10, s2
; VI-NEXT: s_mov_b32 s11, s3
-; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0
+; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0
; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_max_f16_e32 v0, v0, v0
; VI-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s14, s2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s0, s4
-; GFX9-NEXT: s_mov_b32 s1, s5
-; GFX9-NEXT: s_mov_b32 s4, s6
-; GFX9-NEXT: s_mov_b32 s5, s7
-; GFX9-NEXT: s_mov_b32 s6, s2
-; GFX9-NEXT: s_mov_b32 s7, s3
+; GFX9-NEXT: s_mov_b32 s12, s6
+; GFX9-NEXT: s_mov_b32 s13, s7
+; GFX9-NEXT: s_mov_b32 s15, s3
+; GFX9-NEXT: s_mov_b32 s10, s2
; GFX9-NEXT: s_mov_b32 s11, s3
-; GFX9-NEXT: buffer_load_ushort v0, off, s[4:7], 0
+; GFX9-NEXT: buffer_load_ushort v0, off, s[12:15], 0
; GFX9-NEXT: buffer_load_ushort v1, off, s[8:11], 0
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_max_f16_e32 v0, v0, v0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-LABEL: maxnum_v2f16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_load_dword s6, s[6:7], 0x0
+; GFX9-NEXT: s_load_dword s7, s[0:1], 0x0
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: s_mov_b32 s1, s5
-; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0
-; GFX9-NEXT: s_load_dword s5, s[8:9], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_pk_max_f16 v1, s4, s4
-; GFX9-NEXT: v_pk_max_f16 v0, s5, s5
+; GFX9-NEXT: v_pk_max_f16 v1, s6, s6
+; GFX9-NEXT: v_pk_max_f16 v0, s7, s7
; GFX9-NEXT: v_pk_max_f16 v0, v1, v0
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX9-LABEL: maxnum_v2f16_imm_a:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s0, s4
-; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0
-; GFX9-NEXT: s_mov_b32 s1, s5
+; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX9-NEXT: s_mov_b32 s4, s0
+; GFX9-NEXT: s_mov_b32 s0, 0x44004200
+; GFX9-NEXT: s_mov_b32 s5, s1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_pk_max_f16 v0, s4, s4
-; GFX9-NEXT: s_mov_b32 s4, 0x44004200
-; GFX9-NEXT: v_pk_max_f16 v0, v0, s4
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: v_pk_max_f16 v0, s2, s2
+; GFX9-NEXT: v_pk_max_f16 v0, v0, s0
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
<2 x half> addrspace(1)* %r,
<2 x half> addrspace(1)* %b) #0 {
;
; GFX9-LABEL: maxnum_v2f16_imm_b:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s0, s4
-; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0
-; GFX9-NEXT: s_mov_b32 s1, s5
+; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX9-NEXT: s_mov_b32 s4, s0
+; GFX9-NEXT: s_mov_b32 s0, 0x42004400
+; GFX9-NEXT: s_mov_b32 s5, s1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_pk_max_f16 v0, s4, s4
-; GFX9-NEXT: s_mov_b32 s4, 0x42004400
-; GFX9-NEXT: v_pk_max_f16 v0, v0, s4
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: v_pk_max_f16 v0, s2, s2
+; GFX9-NEXT: v_pk_max_f16 v0, v0, s0
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
<2 x half> addrspace(1)* %r,
<2 x half> addrspace(1)* %a) #0 {
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: s_mov_b32 s1, s5
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_pk_max_f16 v0, s5, s5
-; GFX9-NEXT: v_pk_max_f16 v2, s4, s4
+; GFX9-NEXT: v_pk_max_f16 v0, s7, s7
+; GFX9-NEXT: v_pk_max_f16 v2, s6, s6
; GFX9-NEXT: v_pk_max_f16 v1, v0, s8
; GFX9-NEXT: v_pk_max_f16 v0, v2, s9
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s14, s2
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s0, s4
-; VI-NEXT: s_mov_b32 s1, s5
-; VI-NEXT: s_mov_b32 s4, s6
-; VI-NEXT: s_mov_b32 s5, s7
-; VI-NEXT: s_mov_b32 s6, s2
-; VI-NEXT: s_mov_b32 s7, s3
+; VI-NEXT: s_mov_b32 s12, s6
+; VI-NEXT: s_mov_b32 s13, s7
+; VI-NEXT: s_mov_b32 s15, s3
+; VI-NEXT: s_mov_b32 s10, s2
; VI-NEXT: s_mov_b32 s11, s3
-; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0
+; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0
; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_max_f16_e32 v0, v0, v0
; VI-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s14, s2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s0, s4
-; GFX9-NEXT: s_mov_b32 s1, s5
-; GFX9-NEXT: s_mov_b32 s4, s6
-; GFX9-NEXT: s_mov_b32 s5, s7
-; GFX9-NEXT: s_mov_b32 s6, s2
-; GFX9-NEXT: s_mov_b32 s7, s3
+; GFX9-NEXT: s_mov_b32 s12, s6
+; GFX9-NEXT: s_mov_b32 s13, s7
+; GFX9-NEXT: s_mov_b32 s15, s3
+; GFX9-NEXT: s_mov_b32 s10, s2
; GFX9-NEXT: s_mov_b32 s11, s3
-; GFX9-NEXT: buffer_load_ushort v0, off, s[4:7], 0
+; GFX9-NEXT: buffer_load_ushort v0, off, s[12:15], 0
; GFX9-NEXT: buffer_load_ushort v1, off, s[8:11], 0
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_max_f16_e32 v0, v0, v0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-LABEL: minnum_v2f16_ieee:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_load_dword s6, s[6:7], 0x0
+; GFX9-NEXT: s_load_dword s7, s[0:1], 0x0
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: s_mov_b32 s1, s5
-; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0
-; GFX9-NEXT: s_load_dword s5, s[8:9], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_pk_max_f16 v1, s4, s4
-; GFX9-NEXT: v_pk_max_f16 v0, s5, s5
+; GFX9-NEXT: v_pk_max_f16 v1, s6, s6
+; GFX9-NEXT: v_pk_max_f16 v0, s7, s7
; GFX9-NEXT: v_pk_min_f16 v0, v1, v0
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX9-LABEL: minnum_v2f16_imm_a:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s0, s4
-; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0
-; GFX9-NEXT: s_mov_b32 s1, s5
+; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX9-NEXT: s_mov_b32 s4, s0
+; GFX9-NEXT: s_mov_b32 s0, 0x44004200
+; GFX9-NEXT: s_mov_b32 s5, s1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_pk_max_f16 v0, s4, s4
-; GFX9-NEXT: s_mov_b32 s4, 0x44004200
-; GFX9-NEXT: v_pk_min_f16 v0, v0, s4
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: v_pk_max_f16 v0, s2, s2
+; GFX9-NEXT: v_pk_min_f16 v0, v0, s0
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
<2 x half> addrspace(1)* %r,
<2 x half> addrspace(1)* %b) #0 {
;
; GFX9-LABEL: minnum_v2f16_imm_b:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s0, s4
-; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0
-; GFX9-NEXT: s_mov_b32 s1, s5
+; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX9-NEXT: s_mov_b32 s4, s0
+; GFX9-NEXT: s_mov_b32 s0, 0x42004400
+; GFX9-NEXT: s_mov_b32 s5, s1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_pk_max_f16 v0, s4, s4
-; GFX9-NEXT: s_mov_b32 s4, 0x42004400
-; GFX9-NEXT: v_pk_min_f16 v0, v0, s4
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: v_pk_max_f16 v0, s2, s2
+; GFX9-NEXT: v_pk_min_f16 v0, v0, s0
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
<2 x half> addrspace(1)* %r,
<2 x half> addrspace(1)* %a) #0 {
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: s_mov_b32 s1, s5
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_pk_max_f16 v0, s5, s5
-; GFX9-NEXT: v_pk_max_f16 v2, s4, s4
+; GFX9-NEXT: v_pk_max_f16 v0, s7, s7
+; GFX9-NEXT: v_pk_max_f16 v2, s6, s6
; GFX9-NEXT: v_pk_min_f16 v1, v0, s8
; GFX9-NEXT: v_pk_min_f16 v0, v2, s9
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[0:1], s[6:7]
; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
-; SI-NEXT: s_movk_i32 s9, 0xfc01
-; SI-NEXT: s_mov_b32 s7, 0xfffff
-; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: s_brev_b32 s8, -2
+; SI-NEXT: s_movk_i32 s7, 0xfc01
+; SI-NEXT: s_mov_b32 s1, 0xfffff
+; SI-NEXT: s_mov_b32 s0, -1
+; SI-NEXT: s_brev_b32 s6, -2
; SI-NEXT: v_mov_b32_e32 v8, 0x3ff00000
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_bfe_u32 v4, v3, 20, 11
-; SI-NEXT: v_add_i32_e32 v6, vcc, s9, v4
-; SI-NEXT: v_lshr_b64 v[4:5], s[6:7], v6
+; SI-NEXT: v_add_i32_e32 v6, vcc, s7, v4
+; SI-NEXT: v_lshr_b64 v[4:5], s[0:1], v6
; SI-NEXT: v_and_b32_e32 v7, 0x80000000, v3
; SI-NEXT: v_not_b32_e32 v4, v4
; SI-NEXT: v_not_b32_e32 v5, v5
; SI-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc
; SI-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc
; SI-NEXT: v_add_f64 v[6:7], v[2:3], -v[4:5]
-; SI-NEXT: v_bfi_b32 v2, s8, v8, v3
+; SI-NEXT: v_bfi_b32 v2, s6, v8, v3
; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5
; SI-NEXT: s_mov_b64 s[6:7], s[2:3]
; SI-NEXT: v_cndmask_b32_e32 v3, 0, v2, vcc
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_mov_b64 s[0:1], s[6:7]
; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
-; CI-NEXT: s_brev_b32 s6, -2
+; CI-NEXT: s_brev_b32 s0, -2
; CI-NEXT: v_mov_b32_e32 v8, 0x3ff00000
+; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_trunc_f64_e32 v[4:5], v[2:3]
; CI-NEXT: v_add_f64 v[6:7], v[2:3], -v[4:5]
-; CI-NEXT: v_bfi_b32 v2, s6, v8, v3
+; CI-NEXT: v_bfi_b32 v2, s0, v8, v3
; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5
-; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
; CI-NEXT: v_cndmask_b32_e32 v3, 0, v2, vcc
; CI-NEXT: v_mov_b32_e32 v2, 0
; CI-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3]
; GFX9-LABEL: sin_v2f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: v_mov_b32_e32 v2, 0x3118
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: v_mov_b32_e32 v1, 0x3118
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mul_f16_e32 v1, 0.15915494, v0
-; GFX9-NEXT: v_sin_f16_e32 v3, v1
-; GFX9-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT: v_sin_f16_e32 v2, v0
+; GFX9-NEXT: v_mul_f16_e32 v2, 0.15915494, v0
+; GFX9-NEXT: v_sin_f16_e32 v2, v2
+; GFX9-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: v_sin_f16_e32 v3, v0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: v_lshl_or_b32 v2, v2, 16, v3
+; GFX9-NEXT: v_lshl_or_b32 v2, v3, 16, v2
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_endpgm
%a.val = load <2 x half>, <2 x half> addrspace(1)* %a
; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b64 s[0:1], s[6:7]
+; GCN-NEXT: buffer_load_dword v1, v[1:2], s[0:3], 0 addr64
; GCN-NEXT: v_lshlrev_b32_e32 v3, 3, v0
-; GCN-NEXT: buffer_load_dword v0, v[1:2], s[0:3], 0 addr64
; GCN-NEXT: s_mov_b64 s[6:7], s[2:3]
; GCN-NEXT: v_mov_b32_e32 v4, v2
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_or_b32_e32 v0, 0x800000, v0
+; GCN-NEXT: v_or_b32_e32 v0, 0x800000, v1
; GCN-NEXT: v_mul_i32_i24_e32 v0, -7, v0
; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0
; GCN-NEXT: buffer_store_dwordx2 v[1:2], v[3:4], s[4:7], 0 addr64
; VI-LABEL: v_lshr_v2i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v5, v[0:1]
+; VI-NEXT: flat_load_dword v2, v[2:3]
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
-; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-NEXT: v_add_u32_e32 v4, vcc, 4, v0
-; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dword v0, v[0:1]
-; VI-NEXT: flat_load_dword v1, v[4:5]
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_lshrrev_b16_e32 v4, v1, v0
-; VI-NEXT: v_lshrrev_b16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_or_b32_e32 v0, v4, v0
-; VI-NEXT: flat_store_dword v[2:3], v0
+; VI-NEXT: v_lshrrev_b16_e32 v3, v2, v5
+; VI-NEXT: v_lshrrev_b16_sdwa v2, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT: v_or_b32_e32 v2, v3, v2
+; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; CI-LABEL: v_lshr_v2i16:
; CI-NEXT: s_mov_b64 s[0:1], s[6:7]
; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4
-; CI-NEXT: s_mov_b32 s8, 0xffff
+; CI-NEXT: s_mov_b32 s0, 0xffff
; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
; CI-NEXT: s_waitcnt vmcnt(1)
; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v2
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v3
-; CI-NEXT: v_and_b32_e32 v2, s8, v2
-; CI-NEXT: v_and_b32_e32 v3, s8, v3
+; CI-NEXT: v_and_b32_e32 v2, s0, v2
+; CI-NEXT: v_and_b32_e32 v3, s0, v3
; CI-NEXT: v_lshr_b32_e32 v2, v2, v3
; CI-NEXT: v_lshr_b32_e32 v3, v4, v5
; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: flat_load_dword v3, v[0:1]
; VI-NEXT: s_lshr_b32 s1, s0, 16
-; VI-NEXT: v_mov_b32_e32 v4, s1
-; VI-NEXT: v_mov_b32_e32 v3, s5
-; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2
-; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2
+; VI-NEXT: v_mov_b32_e32 v2, s1
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_lshrrev_b16_e32 v1, s0, v0
-; VI-NEXT: v_lshrrev_b16_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; VI-NEXT: v_or_b32_e32 v0, v1, v0
-; VI-NEXT: flat_store_dword v[2:3], v0
+; VI-NEXT: v_lshrrev_b16_e32 v4, s0, v3
+; VI-NEXT: v_lshrrev_b16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; VI-NEXT: v_or_b32_e32 v2, v4, v2
+; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; CI-LABEL: lshr_v_s_v2i16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
-; CI-NEXT: s_load_dword s0, s[0:1], 0xd
-; CI-NEXT: s_mov_b32 s8, 0xffff
+; CI-NEXT: s_load_dword s8, s[0:1], 0xd
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_lshr_b32 s9, s0, 16
-; CI-NEXT: s_and_b32 s10, s0, s8
; CI-NEXT: s_mov_b64 s[0:1], s[6:7]
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
+; CI-NEXT: s_mov_b32 s0, 0xffff
+; CI-NEXT: s_lshr_b32 s1, s8, 16
+; CI-NEXT: s_and_b32 s8, s8, s0
; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; CI-NEXT: v_and_b32_e32 v2, s8, v2
-; CI-NEXT: v_lshrrev_b32_e32 v3, s9, v3
-; CI-NEXT: v_lshrrev_b32_e32 v2, s10, v2
+; CI-NEXT: v_and_b32_e32 v2, s0, v2
+; CI-NEXT: v_lshrrev_b32_e32 v3, s1, v3
+; CI-NEXT: v_lshrrev_b32_e32 v2, s8, v2
; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; CI-NEXT: v_or_b32_e32 v2, v2, v3
; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: flat_load_dword v3, v[0:1]
; VI-NEXT: s_lshr_b32 s1, s0, 16
-; VI-NEXT: v_mov_b32_e32 v4, s1
-; VI-NEXT: v_mov_b32_e32 v3, s5
-; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2
-; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2
+; VI-NEXT: v_mov_b32_e32 v2, s1
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_lshrrev_b16_e64 v1, v0, s0
-; VI-NEXT: v_lshrrev_b16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_or_b32_e32 v0, v1, v0
-; VI-NEXT: flat_store_dword v[2:3], v0
+; VI-NEXT: v_lshrrev_b16_e64 v4, v3, s0
+; VI-NEXT: v_lshrrev_b16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v2, v4, v2
+; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; CI-LABEL: lshr_s_v_v2i16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
-; CI-NEXT: s_load_dword s0, s[0:1], 0xd
-; CI-NEXT: s_mov_b32 s8, 0xffff
+; CI-NEXT: s_load_dword s8, s[0:1], 0xd
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_lshr_b32 s9, s0, 16
-; CI-NEXT: s_and_b32 s10, s0, s8
; CI-NEXT: s_mov_b64 s[0:1], s[6:7]
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
+; CI-NEXT: s_mov_b32 s0, 0xffff
+; CI-NEXT: s_lshr_b32 s1, s8, 16
+; CI-NEXT: s_and_b32 s8, s8, s0
; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; CI-NEXT: v_and_b32_e32 v2, s8, v2
-; CI-NEXT: v_lshr_b32_e32 v3, s9, v3
-; CI-NEXT: v_lshr_b32_e32 v2, s10, v2
+; CI-NEXT: v_and_b32_e32 v2, s0, v2
+; CI-NEXT: v_lshr_b32_e32 v3, s1, v3
+; CI-NEXT: v_lshr_b32_e32 v2, s8, v2
; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; CI-NEXT: v_or_b32_e32 v2, v2, v3
; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dword v0, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
-; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; VI-NEXT: flat_load_dword v3, v[0:1]
+; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_lshrrev_b16_e64 v1, v0, 8
-; VI-NEXT: v_lshrrev_b16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_or_b32_e32 v0, v1, v0
-; VI-NEXT: flat_store_dword v[2:3], v0
+; VI-NEXT: v_lshrrev_b16_e64 v2, v3, 8
+; VI-NEXT: v_lshrrev_b16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v2, v2, v3
+; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; CI-LABEL: lshr_imm_v_v2i16:
; GFX9-LABEL: v_lshr_v4i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v4
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
+; GFX9-NEXT: global_load_dwordx2 v[2:3], v[0:1], off
; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off offset:8
-; GFX9-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
-; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT: v_mov_b32_e32 v5, s1
+; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s0, v4
+; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_pk_lshrrev_b16 v1, v1, v5
-; GFX9-NEXT: v_pk_lshrrev_b16 v0, v0, v4
-; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
+; GFX9-NEXT: v_pk_lshrrev_b16 v1, v1, v3
+; GFX9-NEXT: v_pk_lshrrev_b16 v0, v0, v2
+; GFX9-NEXT: global_store_dwordx2 v[4:5], v[0:1], off
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_lshr_v4i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
-; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-NEXT: v_add_u32_e32 v4, vcc, 8, v0
-; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5]
+; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
+; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4
+; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_lshrrev_b16_e32 v6, v5, v1
-; VI-NEXT: v_lshrrev_b16_sdwa v1, v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_lshrrev_b16_e32 v5, v4, v0
-; VI-NEXT: v_lshrrev_b16_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT: v_lshrrev_b16_e32 v6, v3, v1
+; VI-NEXT: v_lshrrev_b16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT: v_lshrrev_b16_e32 v3, v2, v0
+; VI-NEXT: v_lshrrev_b16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_or_b32_e32 v1, v6, v1
-; VI-NEXT: v_or_b32_e32 v0, v5, v0
-; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: v_or_b32_e32 v0, v3, v0
+; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
; VI-NEXT: s_endpgm
;
; CI-LABEL: v_lshr_v4i16:
; CI-NEXT: s_mov_b64 s[0:1], s[6:7]
; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; CI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[0:3], 0 addr64 offset:8
-; CI-NEXT: s_mov_b32 s8, 0xffff
+; CI-NEXT: s_mov_b32 s0, 0xffff
; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
; CI-NEXT: s_waitcnt vmcnt(1)
; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v2
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v4
; CI-NEXT: v_lshrrev_b32_e32 v9, 16, v5
-; CI-NEXT: v_and_b32_e32 v2, s8, v2
-; CI-NEXT: v_and_b32_e32 v4, s8, v4
-; CI-NEXT: v_and_b32_e32 v3, s8, v3
-; CI-NEXT: v_and_b32_e32 v5, s8, v5
+; CI-NEXT: v_and_b32_e32 v2, s0, v2
+; CI-NEXT: v_and_b32_e32 v4, s0, v4
+; CI-NEXT: v_and_b32_e32 v3, s0, v3
+; CI-NEXT: v_and_b32_e32 v5, s0, v5
; CI-NEXT: v_lshr_b32_e32 v3, v3, v5
; CI-NEXT: v_lshr_b32_e32 v5, v7, v9
; CI-NEXT: v_lshr_b32_e32 v2, v2, v4
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_mov_b64 s[0:1], s[6:7]
; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
-; CI-NEXT: s_mov_b32 s8, 0xff00ff
+; CI-NEXT: s_mov_b32 s0, 0xff00ff
; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_lshrrev_b32_e32 v3, 8, v3
; CI-NEXT: v_lshrrev_b32_e32 v2, 8, v2
-; CI-NEXT: v_and_b32_e32 v3, s8, v3
-; CI-NEXT: v_and_b32_e32 v2, s8, v2
+; CI-NEXT: v_and_b32_e32 v3, s0, v3
+; CI-NEXT: v_and_b32_e32 v2, s0, v2
; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; CI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
; it.
; GCN-LABEL: {{^}}madak_2_use_f32:
-; GFX8_9_10: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
+; GFX9: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
+; GFX10: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
; GFX6-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; GFX6-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; GFX6-DAG: buffer_load_dword [[VC:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8
; GFX8_9_10: {{flat|global}}_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}
; GFX8_9_10: {{flat|global}}_load_dword [[VC:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}
; GFX6-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
+; GFX8-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
; GFX6_8_9-DAG: v_madak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
; GFX10-MAD-DAG:v_madak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
; FMA-DAG: v_fmaak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-NEXT: flat_load_dword v0, v[0:1]
-; VI-NEXT: flat_load_dword v1, v[2:3]
-; VI-NEXT: v_mov_b32_e32 v5, s5
-; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4
-; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
+; VI-NEXT: flat_load_dword v5, v[0:1]
+; VI-NEXT: flat_load_dword v2, v[2:3]
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_max_i16_e32 v2, v0, v1
-; VI-NEXT: v_max_i16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_or_b32_e32 v0, v2, v0
-; VI-NEXT: flat_store_dword v[4:5], v0
+; VI-NEXT: v_max_i16_e32 v3, v5, v2
+; VI-NEXT: v_max_i16_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT: v_or_b32_e32 v2, v3, v2
+; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_imax_sge_v2i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; VI-NEXT: v_lshlrev_b32_e32 v6, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
-; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
+; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v6
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-NEXT: v_mov_b32_e32 v5, s5
-; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4
-; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
-; VI-NEXT: v_add_u32_e32 v6, vcc, 4, v0
-; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_ushort v6, v[6:7]
-; VI-NEXT: flat_load_dword v7, v[0:1]
+; VI-NEXT: v_add_u32_e32 v4, vcc, 4, v0
+; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_ushort v4, v[4:5]
+; VI-NEXT: flat_load_dword v5, v[0:1]
; VI-NEXT: v_add_u32_e32 v0, vcc, 4, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
-; VI-NEXT: flat_load_ushort v0, v[0:1]
-; VI-NEXT: flat_load_dword v8, v[2:3]
-; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v4
-; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc
+; VI-NEXT: flat_load_dword v7, v[2:3]
+; VI-NEXT: flat_load_ushort v8, v[0:1]
+; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v6
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
-; VI-NEXT: v_max_i16_e32 v0, v6, v0
+; VI-NEXT: v_max_i16_e32 v6, v5, v7
+; VI-NEXT: v_max_i16_sdwa v5, v5, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_max_i16_e32 v1, v7, v8
-; VI-NEXT: v_max_i16_sdwa v7, v7, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_or_b32_e32 v1, v1, v7
-; VI-NEXT: flat_store_short v[2:3], v0
-; VI-NEXT: flat_store_dword v[4:5], v1
+; VI-NEXT: v_max_i16_e32 v4, v4, v8
+; VI-NEXT: v_or_b32_e32 v5, v6, v5
+; VI-NEXT: flat_store_short v[2:3], v4
+; VI-NEXT: flat_store_dword v[0:1], v5
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_imax_sge_v3i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v5, 3, v0
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: v_mov_b32_e32 v6, 0
-; GFX9-NEXT: v_mov_b32_e32 v7, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v4
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v5
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v5
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX9-NEXT: global_load_short_d16 v7, v[0:1], off offset:4
-; GFX9-NEXT: global_load_dword v0, v[0:1], off
-; GFX9-NEXT: global_load_short_d16 v6, v[2:3], off offset:4
-; GFX9-NEXT: global_load_dword v1, v[2:3], off
-; GFX9-NEXT: v_mov_b32_e32 v5, s5
-; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s4, v4
-; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
+; GFX9-NEXT: global_load_short_d16 v6, v[0:1], off offset:4
+; GFX9-NEXT: global_load_dword v7, v[0:1], off
+; GFX9-NEXT: global_load_short_d16 v4, v[2:3], off offset:4
+; GFX9-NEXT: global_load_dword v2, v[2:3], off
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v5
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_pk_max_i16 v3, v6, v4
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_pk_max_i16 v0, v0, v1
-; GFX9-NEXT: v_pk_max_i16 v1, v7, v6
-; GFX9-NEXT: global_store_short v[4:5], v1, off offset:4
-; GFX9-NEXT: global_store_dword v[4:5], v0, off
+; GFX9-NEXT: v_pk_max_i16 v2, v7, v2
+; GFX9-NEXT: global_store_short v[0:1], v3, off offset:4
+; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep0 = getelementptr <3 x i16>, <3 x i16> addrspace(1)* %aptr, i32 %tid
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-NEXT: flat_load_dword v0, v[0:1]
-; VI-NEXT: flat_load_dword v1, v[2:3]
-; VI-NEXT: v_mov_b32_e32 v5, s5
-; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4
-; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
+; VI-NEXT: flat_load_dword v5, v[0:1]
+; VI-NEXT: flat_load_dword v2, v[2:3]
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_max_u16_e32 v2, v0, v1
-; VI-NEXT: v_max_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_or_b32_e32 v0, v2, v0
-; VI-NEXT: flat_store_dword v[4:5], v0
+; VI-NEXT: v_max_u16_e32 v3, v5, v2
+; VI-NEXT: v_max_u16_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT: v_or_b32_e32 v2, v3, v2
+; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_umax_ugt_v2i16:
; GCN-NEXT: v_and_b32_e32 v2, 0x3ff, v2
; GCN-NEXT: v_lshlrev_b32_e32 v2, 4, v2
; GCN-NEXT: v_add_u32_e32 v0, v0, v2
+; GCN-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen
+; GCN-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen offset:4
+; GCN-NEXT: buffer_load_dword v5, v0, s[0:3], 0 offen offset:8
+; GCN-NEXT: buffer_load_dword v6, v0, s[0:3], 0 offen offset:12
+; GCN-NEXT: buffer_load_dword v7, v0, s[0:3], 0 offen offset:16
+; GCN-NEXT: buffer_load_dword v8, v0, s[0:3], 0 offen offset:20
+; GCN-NEXT: buffer_load_dword v9, v0, s[0:3], 0 offen offset:24
+; GCN-NEXT: buffer_load_dword v10, v0, s[0:3], 0 offen offset:28
+; GCN-NEXT: buffer_load_dword v11, v0, s[0:3], 0 offen offset:32
+; GCN-NEXT: buffer_load_dword v12, v0, s[0:3], 0 offen offset:36
+; GCN-NEXT: buffer_load_dword v13, v0, s[0:3], 0 offen offset:40
+; GCN-NEXT: buffer_load_dword v14, v0, s[0:3], 0 offen offset:44
+; GCN-NEXT: buffer_load_dword v15, v0, s[0:3], 0 offen offset:48
+; GCN-NEXT: buffer_load_dword v16, v0, s[0:3], 0 offen offset:52
+; GCN-NEXT: buffer_load_dword v17, v0, s[0:3], 0 offen offset:56
; GCN-NEXT: v_add_u32_e32 v1, v1, v2
-; GCN-NEXT: buffer_load_dword v6, v0, s[0:3], 0 offen offset:20
-; GCN-NEXT: buffer_load_dword v7, v0, s[0:3], 0 offen offset:24
-; GCN-NEXT: buffer_load_dword v8, v0, s[0:3], 0 offen offset:28
-; GCN-NEXT: buffer_load_dword v9, v0, s[0:3], 0 offen offset:32
-; GCN-NEXT: buffer_load_dword v10, v0, s[0:3], 0 offen offset:36
-; GCN-NEXT: buffer_load_dword v11, v0, s[0:3], 0 offen offset:40
-; GCN-NEXT: buffer_load_dword v12, v0, s[0:3], 0 offen offset:44
-; GCN-NEXT: buffer_load_dword v13, v0, s[0:3], 0 offen offset:48
-; GCN-NEXT: buffer_load_dword v14, v0, s[0:3], 0 offen offset:52
-; GCN-NEXT: buffer_load_dword v15, v0, s[0:3], 0 offen offset:56
-; GCN-NEXT: buffer_load_dword v16, v0, s[0:3], 0 offen offset:60
-; GCN-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
-; GCN-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen offset:4
-; GCN-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen offset:8
-; GCN-NEXT: buffer_load_dword v5, v0, s[0:3], 0 offen offset:12
+; GCN-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:60
; GCN-NEXT: s_nop 0
-; GCN-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
-; GCN-NEXT: s_nop 0
-; GCN-NEXT: s_waitcnt vmcnt(4)
-; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
-; GCN-NEXT: s_waitcnt vmcnt(4)
-; GCN-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen offset:4
-; GCN-NEXT: s_waitcnt vmcnt(4)
-; GCN-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen offset:8
-; GCN-NEXT: s_waitcnt vmcnt(4)
-; GCN-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen offset:12
-; GCN-NEXT: s_waitcnt vmcnt(4)
-; GCN-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
-; GCN-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen offset:20
-; GCN-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen offset:24
-; GCN-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen offset:28
-; GCN-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen offset:32
-; GCN-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen offset:36
-; GCN-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen offset:40
-; GCN-NEXT: buffer_store_dword v12, v1, s[0:3], 0 offen offset:44
-; GCN-NEXT: buffer_store_dword v13, v1, s[0:3], 0 offen offset:48
-; GCN-NEXT: buffer_store_dword v14, v1, s[0:3], 0 offen offset:52
-; GCN-NEXT: buffer_store_dword v15, v1, s[0:3], 0 offen offset:56
-; GCN-NEXT: buffer_store_dword v16, v1, s[0:3], 0 offen offset:60
+; GCN-NEXT: s_waitcnt vmcnt(15)
+; GCN-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt vmcnt(15)
+; GCN-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen offset:4
+; GCN-NEXT: s_waitcnt vmcnt(15)
+; GCN-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen offset:8
+; GCN-NEXT: s_waitcnt vmcnt(15)
+; GCN-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen offset:12
+; GCN-NEXT: s_waitcnt vmcnt(15)
+; GCN-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen offset:16
+; GCN-NEXT: s_waitcnt vmcnt(15)
+; GCN-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen offset:20
+; GCN-NEXT: s_waitcnt vmcnt(15)
+; GCN-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen offset:24
+; GCN-NEXT: s_waitcnt vmcnt(15)
+; GCN-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen offset:28
+; GCN-NEXT: s_waitcnt vmcnt(15)
+; GCN-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen offset:32
+; GCN-NEXT: s_waitcnt vmcnt(15)
+; GCN-NEXT: buffer_store_dword v12, v1, s[0:3], 0 offen offset:36
+; GCN-NEXT: s_waitcnt vmcnt(15)
+; GCN-NEXT: buffer_store_dword v13, v1, s[0:3], 0 offen offset:40
+; GCN-NEXT: s_waitcnt vmcnt(15)
+; GCN-NEXT: buffer_store_dword v14, v1, s[0:3], 0 offen offset:44
+; GCN-NEXT: s_waitcnt vmcnt(15)
+; GCN-NEXT: buffer_store_dword v15, v1, s[0:3], 0 offen offset:48
+; GCN-NEXT: s_waitcnt vmcnt(15)
+; GCN-NEXT: buffer_store_dword v16, v1, s[0:3], 0 offen offset:52
+; GCN-NEXT: s_waitcnt vmcnt(15)
+; GCN-NEXT: buffer_store_dword v17, v1, s[0:3], 0 offen offset:56
+; GCN-NEXT: s_waitcnt vmcnt(15)
+; GCN-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:60
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
bb:
; GFX9-NEXT: v_and_b32_e32 v5, 1, v18
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5
; GFX9-NEXT: v_cmp_lt_u32_e64 s[4:5], v0, v1
-; GFX9-NEXT: s_and_saveexec_b64 s[10:11], s[4:5]
+; GFX9-NEXT: s_and_saveexec_b64 s[8:9], s[4:5]
; GFX9-NEXT: s_cbranch_execz BB1_3
; GFX9-NEXT: ; %bb.1: ; %bb19
; GFX9-NEXT: v_cvt_f32_u32_e32 v7, v6
; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v7
; GFX9-NEXT: v_lshlrev_b32_e32 v6, 2, v2
; GFX9-NEXT: v_add_u32_e32 v7, v17, v12
-; GFX9-NEXT: s_mov_b64 s[12:13], 0
+; GFX9-NEXT: s_mov_b64 s[10:11], 0
; GFX9-NEXT: BB1_2: ; %bb23
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_cvt_f32_u32_e32 v8, v0
; GFX9-NEXT: v_add_u32_e32 v0, v0, v2
; GFX9-NEXT: v_madak_f32 v8, v8, v4, 0x3727c5ac
; GFX9-NEXT: v_cvt_u32_f32_e32 v8, v8
-; GFX9-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v1
; GFX9-NEXT: v_mul_u32_u24_e32 v18, v8, v5
; GFX9-NEXT: v_add_u32_e32 v8, v8, v16
-; GFX9-NEXT: v_cmp_lt_u32_e64 s[6:7], v8, v13
+; GFX9-NEXT: v_cmp_lt_u32_e64 s[4:5], v8, v13
; GFX9-NEXT: v_mul_lo_u32 v8, v8, v15
; GFX9-NEXT: v_sub_u32_e32 v19, v9, v18
-; GFX9-NEXT: v_cmp_lt_u32_e64 s[8:9], v19, v14
-; GFX9-NEXT: s_and_b64 s[6:7], s[6:7], s[8:9]
+; GFX9-NEXT: v_cmp_lt_u32_e64 s[6:7], v19, v14
+; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7]
; GFX9-NEXT: v_sub_u32_e32 v12, v12, v18
-; GFX9-NEXT: s_and_b64 s[6:7], s[6:7], vcc
; GFX9-NEXT: v_add_u32_e32 v8, v12, v8
+; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], vcc
; GFX9-NEXT: v_mov_b32_e32 v9, 0
-; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[6:7]
+; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5]
; GFX9-NEXT: v_lshlrev_b64 v[8:9], 2, v[8:9]
-; GFX9-NEXT: s_or_b64 s[12:13], s[4:5], s[12:13]
-; GFX9-NEXT: v_add_co_u32_e64 v8, s[4:5], v10, v8
-; GFX9-NEXT: v_addc_co_u32_e64 v9, s[4:5], v11, v9, s[4:5]
+; GFX9-NEXT: v_add_co_u32_e64 v8, s[6:7], v10, v8
+; GFX9-NEXT: v_addc_co_u32_e64 v9, s[6:7], v11, v9, s[6:7]
; GFX9-NEXT: global_load_dword v8, v[8:9], off
+; GFX9-NEXT: v_cmp_ge_u32_e64 s[6:7], v0, v1
+; GFX9-NEXT: s_or_b64 s[10:11], s[6:7], s[10:11]
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[6:7]
+; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5]
; GFX9-NEXT: ds_write_b32 v3, v8
; GFX9-NEXT: v_add_u32_e32 v3, v3, v6
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[12:13]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[10:11]
; GFX9-NEXT: s_cbranch_execnz BB1_2
; GFX9-NEXT: BB1_3: ; %Flow3
-; GFX9-NEXT: s_or_b64 exec, exec, s[10:11]
+; GFX9-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
bb:
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
;
-; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
+; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
-; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
+; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
-; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
+; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
entry:
%call = tail call i64 @_Z13get_global_idj(i32 0)
%conv = and i64 %call, 255
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_mov_b32 s8, s0
-; SI-NEXT: s_mov_b32 s9, s1
-; SI-NEXT: s_mov_b32 s12, s2
-; SI-NEXT: s_mov_b32 s13, s3
-; SI-NEXT: s_mov_b32 s0, s4
-; SI-NEXT: s_mov_b32 s1, s5
-; SI-NEXT: s_mov_b32 s2, s10
-; SI-NEXT: s_mov_b32 s3, s11
+; SI-NEXT: s_mov_b32 s12, s4
+; SI-NEXT: s_mov_b32 s13, s5
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], 0
+; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0
+; SI-NEXT: s_mov_b32 s8, s0
+; SI-NEXT: s_mov_b32 s9, s1
+; SI-NEXT: s_mov_b32 s4, s2
+; SI-NEXT: s_mov_b32 s5, s3
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v2, vcc, v1, v0
; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v1
; SI-NEXT: s_xor_b64 s[0:1], vcc, s[0:1]
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0
-; SI-NEXT: buffer_store_byte v0, off, s[12:15], 0
+; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_saddo_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: v_mov_b32_e32 v5, s5
-; VI-NEXT: v_mov_b32_e32 v6, s6
-; VI-NEXT: v_mov_b32_e32 v7, s7
-; VI-NEXT: flat_load_dword v4, v[4:5]
-; VI-NEXT: flat_load_dword v5, v[6:7]
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: flat_load_dword v4, v[0:1]
+; VI-NEXT: flat_load_dword v5, v[2:3]
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v4, s4
-; GFX9-NEXT: v_mov_b32_e32 v5, s5
-; GFX9-NEXT: v_mov_b32_e32 v6, s6
-; GFX9-NEXT: v_mov_b32_e32 v7, s7
-; GFX9-NEXT: global_load_dword v4, v[4:5], off
-; GFX9-NEXT: global_load_dword v5, v[6:7], off
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: v_mov_b32_e32 v2, s6
+; GFX9-NEXT: v_mov_b32_e32 v3, s7
+; GFX9-NEXT: global_load_dword v4, v[0:1], off
+; GFX9-NEXT: global_load_dword v5, v[2:3], off
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_mov_b32 s8, s0
-; SI-NEXT: s_mov_b32 s9, s1
-; SI-NEXT: s_mov_b32 s12, s2
-; SI-NEXT: s_mov_b32 s13, s3
-; SI-NEXT: s_mov_b32 s0, s4
-; SI-NEXT: s_mov_b32 s1, s5
-; SI-NEXT: s_mov_b32 s2, s10
-; SI-NEXT: s_mov_b32 s3, s11
+; SI-NEXT: s_mov_b32 s12, s4
+; SI-NEXT: s_mov_b32 s13, s5
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
-; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0
; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0
+; SI-NEXT: s_mov_b32 s8, s0
+; SI-NEXT: s_mov_b32 s9, s1
+; SI-NEXT: s_mov_b32 s4, s2
+; SI-NEXT: s_mov_b32 s5, s3
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v4, vcc, v0, v2
; SI-NEXT: v_addc_u32_e32 v5, vcc, v1, v3, vcc
; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[8:11], 0
; SI-NEXT: s_xor_b64 s[0:1], vcc, s[0:1]
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; SI-NEXT: buffer_store_byte v0, off, s[12:15], 0
+; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_saddo_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: v_mov_b32_e32 v5, s5
-; VI-NEXT: v_mov_b32_e32 v6, s6
-; VI-NEXT: v_mov_b32_e32 v7, s7
-; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5]
-; VI-NEXT: flat_load_dwordx2 v[6:7], v[6:7]
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
+; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v6, s2
+; VI-NEXT: v_mov_b32_e32 v7, s3
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_add_u32_e32 v8, vcc, v4, v6
-; VI-NEXT: v_addc_u32_e32 v9, vcc, v5, v7, vcc
-; VI-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[6:7]
-; VI-NEXT: v_cmp_lt_i64_e64 s[0:1], v[8:9], v[4:5]
-; VI-NEXT: flat_store_dwordx2 v[0:1], v[8:9]
+; VI-NEXT: v_add_u32_e32 v8, vcc, v0, v2
+; VI-NEXT: v_addc_u32_e32 v9, vcc, v1, v3, vcc
+; VI-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[2:3]
+; VI-NEXT: v_cmp_lt_i64_e64 s[0:1], v[8:9], v[0:1]
+; VI-NEXT: flat_store_dwordx2 v[4:5], v[8:9]
; VI-NEXT: s_xor_b64 s[0:1], vcc, s[0:1]
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; VI-NEXT: flat_store_byte v[2:3], v0
+; VI-NEXT: flat_store_byte v[6:7], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_saddo_i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v4, s4
-; GFX9-NEXT: v_mov_b32_e32 v5, s5
-; GFX9-NEXT: v_mov_b32_e32 v6, s6
-; GFX9-NEXT: v_mov_b32_e32 v7, s7
-; GFX9-NEXT: global_load_dwordx2 v[4:5], v[4:5], off
-; GFX9-NEXT: global_load_dwordx2 v[6:7], v[6:7], off
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: v_mov_b32_e32 v2, s2
-; GFX9-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: v_mov_b32_e32 v2, s6
+; GFX9-NEXT: v_mov_b32_e32 v3, s7
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off
+; GFX9-NEXT: v_mov_b32_e32 v4, s0
+; GFX9-NEXT: v_mov_b32_e32 v5, s1
+; GFX9-NEXT: v_mov_b32_e32 v6, s2
+; GFX9-NEXT: v_mov_b32_e32 v7, s3
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v4, v6
-; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v5, v7, vcc
-; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[6:7]
-; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], v[8:9], v[4:5]
-; GFX9-NEXT: global_store_dwordx2 v[0:1], v[8:9], off
+; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v0, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v1, v3, vcc
+; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[2:3]
+; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], v[8:9], v[0:1]
+; GFX9-NEXT: global_store_dwordx2 v[4:5], v[8:9], off
; GFX9-NEXT: s_xor_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; GFX9-NEXT: global_store_byte v[2:3], v0, off
+; GFX9-NEXT: global_store_byte v[6:7], v0, off
; GFX9-NEXT: s_endpgm
%a = load i64, i64 addrspace(1)* %aptr, align 4
%b = load i64, i64 addrspace(1)* %bptr, align 4
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_mov_b32 s8, s0
-; SI-NEXT: s_mov_b32 s9, s1
-; SI-NEXT: s_mov_b32 s12, s2
-; SI-NEXT: s_mov_b32 s13, s3
-; SI-NEXT: s_mov_b32 s0, s4
-; SI-NEXT: s_mov_b32 s1, s5
-; SI-NEXT: s_mov_b32 s2, s10
-; SI-NEXT: s_mov_b32 s3, s11
+; SI-NEXT: s_mov_b32 s12, s4
+; SI-NEXT: s_mov_b32 s13, s5
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
-; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0
; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0
+; SI-NEXT: s_mov_b32 s8, s0
+; SI-NEXT: s_mov_b32 s9, s1
+; SI-NEXT: s_mov_b32 s12, s2
+; SI-NEXT: s_mov_b32 s13, s3
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v5, vcc, v1, v3
; SI-NEXT: v_add_i32_e32 v4, vcc, v0, v2
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: v_mov_b32_e32 v5, s5
-; VI-NEXT: v_mov_b32_e32 v6, s6
-; VI-NEXT: v_mov_b32_e32 v7, s7
-; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5]
-; VI-NEXT: flat_load_dwordx2 v[6:7], v[6:7]
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
+; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v6, s2
+; VI-NEXT: v_mov_b32_e32 v7, s3
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_add_u32_e32 v9, vcc, v5, v7
-; VI-NEXT: v_add_u32_e32 v8, vcc, v4, v6
-; VI-NEXT: v_cmp_gt_i32_e64 s[0:1], 0, v7
-; VI-NEXT: v_cmp_lt_i32_e64 s[4:5], v9, v5
-; VI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v6
-; VI-NEXT: v_cmp_lt_i32_e64 s[2:3], v8, v4
+; VI-NEXT: v_add_u32_e32 v9, vcc, v1, v3
+; VI-NEXT: v_add_u32_e32 v8, vcc, v0, v2
+; VI-NEXT: v_cmp_gt_i32_e64 s[0:1], 0, v3
+; VI-NEXT: v_cmp_lt_i32_e64 s[4:5], v9, v1
; VI-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5]
-; VI-NEXT: flat_store_dwordx2 v[0:1], v[8:9]
+; VI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2
+; VI-NEXT: v_cmp_lt_i32_e64 s[2:3], v8, v0
; VI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
; VI-NEXT: s_xor_b64 s[0:1], vcc, s[2:3]
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: flat_store_dwordx2 v[4:5], v[8:9]
+; VI-NEXT: flat_store_dwordx2 v[6:7], v[0:1]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_saddo_v2i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v4, s4
-; GFX9-NEXT: v_mov_b32_e32 v5, s5
-; GFX9-NEXT: v_mov_b32_e32 v6, s6
-; GFX9-NEXT: v_mov_b32_e32 v7, s7
-; GFX9-NEXT: global_load_dwordx2 v[4:5], v[4:5], off
-; GFX9-NEXT: global_load_dwordx2 v[6:7], v[6:7], off
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: v_mov_b32_e32 v2, s2
-; GFX9-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: v_mov_b32_e32 v2, s6
+; GFX9-NEXT: v_mov_b32_e32 v3, s7
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off
+; GFX9-NEXT: v_mov_b32_e32 v4, s0
+; GFX9-NEXT: v_mov_b32_e32 v5, s1
+; GFX9-NEXT: v_mov_b32_e32 v6, s2
+; GFX9-NEXT: v_mov_b32_e32 v7, s3
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_add_u32_e32 v9, v5, v7
-; GFX9-NEXT: v_add_u32_e32 v8, v4, v6
-; GFX9-NEXT: v_cmp_gt_i32_e64 s[0:1], 0, v7
-; GFX9-NEXT: v_cmp_lt_i32_e64 s[4:5], v9, v5
-; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 0, v6
-; GFX9-NEXT: v_cmp_lt_i32_e64 s[2:3], v8, v4
+; GFX9-NEXT: v_add_u32_e32 v9, v1, v3
+; GFX9-NEXT: v_add_u32_e32 v8, v0, v2
+; GFX9-NEXT: v_cmp_gt_i32_e64 s[0:1], 0, v3
+; GFX9-NEXT: v_cmp_lt_i32_e64 s[4:5], v9, v1
; GFX9-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5]
-; GFX9-NEXT: global_store_dwordx2 v[0:1], v[8:9], off
+; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2
+; GFX9-NEXT: v_cmp_lt_i32_e64 s[2:3], v8, v0
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
; GFX9-NEXT: s_xor_b64 s[0:1], vcc, s[2:3]
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
+; GFX9-NEXT: global_store_dwordx2 v[4:5], v[8:9], off
+; GFX9-NEXT: global_store_dwordx2 v[6:7], v[0:1], off
; GFX9-NEXT: s_endpgm
%a = load <2 x i32>, <2 x i32> addrspace(1)* %aptr, align 4
%b = load <2 x i32>, <2 x i32> addrspace(1)* %bptr, align 4
; GCN-NOHSA-DAG: s_mov_b32 [[OFFSET0:s[0-9]+]], 0x9a40{{$}}
; CI-NOHSA-DAG: s_mov_b32 [[OFFSET1:s[0-9]+]], 0x9a50{{$}}
; CI-NOHSA-NOT: v_add
-; SI: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:16
; CI-NOHSA: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET1]] addr64{{$}}
; GCN-NOHSA: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET0]] addr64{{$}}
+; SI: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:16
; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; SI: s_mov_b32 {{s[0-9]+}}, 0x13480
; SI: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:16
; SI: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:32
-; SI: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:48
; SI: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], {{s[0-9]+}} addr64
+; SI: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:48
; CI-NOHSA-DAG: s_mov_b32 [[OFFSET0:s[0-9]+]], 0x13480{{$}}
; CI-NOHSA-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET0]] addr64{{$}}
; CI-NOHSA-DAG: s_mov_b32 [[OFFSET1:s[0-9]+]], 0x13490{{$}}
; CHECK: INLINEASM &"", 1 /* sideeffect attdialect */, 851978 /* regdef:VGPR_LO16 */, def dead [[COPY1]], 851978 /* regdef:VGPR_LO16 */, def dead [[COPY]].sub1, 2147483657 /* reguse tiedto:$0 */, [[COPY1]], 2147549193 /* reguse tiedto:$1 */, [[COPY]].sub1
; CHECK: %11.sub0:vreg_512 = COPY [[COPY]].sub0
; CHECK: %11.sub3:vreg_512 = COPY [[COPY]].sub3
- ; CHECK: dead %10:vgpr_32 = V_ADD_CO_U32_e32 4, [[V_MOV_B32_e32_1]], implicit-def dead $vcc, implicit $exec
; CHECK: %11.sub2:vreg_512 = COPY undef [[V_MOV_B32_e32_]]
; CHECK: %11.sub5:vreg_512 = COPY undef [[V_MOV_B32_e32_]]
; CHECK: [[COPY2:%[0-9]+]]:vreg_512 = COPY %11
+ ; CHECK: dead %10:vgpr_32 = V_ADD_CO_U32_e32 4, [[V_MOV_B32_e32_1]], implicit-def dead $vcc, implicit $exec
; CHECK: S_BRANCH %bb.1
bb.0:
liveins: $sgpr6_sgpr7
; CHECK: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
; CHECK: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[DEF]], 0, 0, 0, 0, implicit $exec
; CHECK: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF]], 8, 0, 0, 0, implicit $exec
+ ; CHECK: [[COPY1:%[0-9]+]]:vreg_64 = COPY [[GLOBAL_LOAD_DWORDX2_]]
+ ; CHECK: undef %6.sub0:vreg_64 = V_ADD_F32_e32 [[DEF]].sub0, [[COPY1]].sub0, implicit $mode, implicit $exec
+ ; CHECK: dead undef %6.sub1:vreg_64 = V_ADD_F32_e32 [[DEF]].sub1, [[COPY1]].sub0, implicit $mode, implicit $exec
+ ; CHECK: [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY1]], 0, 0, 0, 0, implicit $exec
; CHECK: undef %4.sub0:vreg_64 = V_MOV_B32_e32 111, implicit $exec
; CHECK: [[DEF1:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
; CHECK: [[DEF2:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
; CHECK: undef %11.sub1:vreg_64 = IMPLICIT_DEF
; CHECK: [[DEF4:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; CHECK: [[DEF5:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
- ; CHECK: [[COPY1:%[0-9]+]]:vreg_64 = COPY [[GLOBAL_LOAD_DWORDX2_]]
- ; CHECK: undef %6.sub0:vreg_64 = V_ADD_F32_e32 [[DEF]].sub0, [[COPY1]].sub0, implicit $mode, implicit $exec
- ; CHECK: dead undef %6.sub1:vreg_64 = V_ADD_F32_e32 [[DEF]].sub1, [[COPY1]].sub0, implicit $mode, implicit $exec
; CHECK: [[DEF6:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
- ; CHECK: [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY1]], 0, 0, 0, 0, implicit $exec
; CHECK: undef %19.sub0:vreg_64 = V_ADD_F32_e32 [[GLOBAL_LOAD_DWORD1]], [[GLOBAL_LOAD_DWORDX2_]].sub0, implicit $mode, implicit $exec
; CHECK: [[DEF7:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
; CHECK: %19.sub1:vreg_64 = V_ADD_F32_e32 [[GLOBAL_LOAD_DWORD]], [[GLOBAL_LOAD_DWORD]], implicit $mode, implicit $exec
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b32 s10, s2
+; GCN-NEXT: s_mov_b32 s11, s3
; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s8, s6
+; GCN-NEXT: s_mov_b32 s9, s7
+; GCN-NEXT: buffer_load_dword v0, off, s[8:11], 0
; GCN-NEXT: s_mov_b32 s0, s4
; GCN-NEXT: s_mov_b32 s1, s5
-; GCN-NEXT: s_mov_b32 s4, s6
-; GCN-NEXT: s_mov_b32 s5, s7
-; GCN-NEXT: s_mov_b32 s6, s2
-; GCN-NEXT: s_mov_b32 s7, s3
-; GCN-NEXT: buffer_load_dword v0, off, s[4:7], 0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GCN-NEXT: v_lshrrev_b32_e32 v1, 30, v1
; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; TONGA-NEXT: s_mov_b32 s3, 0xf000
; TONGA-NEXT: s_mov_b32 s2, -1
+; TONGA-NEXT: s_mov_b32 s10, s2
+; TONGA-NEXT: s_mov_b32 s11, s3
; TONGA-NEXT: s_waitcnt lgkmcnt(0)
+; TONGA-NEXT: s_mov_b32 s8, s6
+; TONGA-NEXT: s_mov_b32 s9, s7
+; TONGA-NEXT: buffer_load_dword v0, off, s[8:11], 0
; TONGA-NEXT: s_mov_b32 s0, s4
; TONGA-NEXT: s_mov_b32 s1, s5
-; TONGA-NEXT: s_mov_b32 s4, s6
-; TONGA-NEXT: s_mov_b32 s5, s7
-; TONGA-NEXT: s_mov_b32 s6, s2
-; TONGA-NEXT: s_mov_b32 s7, s3
-; TONGA-NEXT: buffer_load_dword v0, off, s[4:7], 0
; TONGA-NEXT: s_waitcnt vmcnt(0)
; TONGA-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; TONGA-NEXT: v_lshrrev_b32_e32 v1, 30, v1
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b32 s10, s2
+; GCN-NEXT: s_mov_b32 s11, s3
; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s8, s6
+; GCN-NEXT: s_mov_b32 s9, s7
+; GCN-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
; GCN-NEXT: s_mov_b32 s0, s4
; GCN-NEXT: s_mov_b32 s1, s5
-; GCN-NEXT: s_mov_b32 s4, s6
-; GCN-NEXT: s_mov_b32 s5, s7
-; GCN-NEXT: s_mov_b32 s6, s2
-; GCN-NEXT: s_mov_b32 s7, s3
-; GCN-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_ashrrev_i32_e32 v2, 31, v0
; GCN-NEXT: v_ashrrev_i32_e32 v3, 31, v1
; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; TONGA-NEXT: s_mov_b32 s3, 0xf000
; TONGA-NEXT: s_mov_b32 s2, -1
+; TONGA-NEXT: s_mov_b32 s10, s2
+; TONGA-NEXT: s_mov_b32 s11, s3
; TONGA-NEXT: s_waitcnt lgkmcnt(0)
+; TONGA-NEXT: s_mov_b32 s8, s6
+; TONGA-NEXT: s_mov_b32 s9, s7
+; TONGA-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
; TONGA-NEXT: s_mov_b32 s0, s4
; TONGA-NEXT: s_mov_b32 s1, s5
-; TONGA-NEXT: s_mov_b32 s4, s6
-; TONGA-NEXT: s_mov_b32 s5, s7
-; TONGA-NEXT: s_mov_b32 s6, s2
-; TONGA-NEXT: s_mov_b32 s7, s3
-; TONGA-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
; TONGA-NEXT: s_waitcnt vmcnt(0)
; TONGA-NEXT: v_ashrrev_i32_e32 v2, 31, v0
; TONGA-NEXT: v_ashrrev_i32_e32 v3, 31, v1
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s8, s6
+; GFX9-NEXT: s_mov_b32 s9, s7
+; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: s_mov_b32 s1, s5
-; GFX9-NEXT: s_mov_b32 s4, s6
-; GFX9-NEXT: s_mov_b32 s5, s7
-; GFX9-NEXT: s_mov_b32 s6, s2
-; GFX9-NEXT: s_mov_b32 s7, s3
-; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v0
; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v1
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: s_mov_b32 s11, 0xf000
; GFX9-NEXT: s_mov_b32 s10, -1
-; GFX9-NEXT: s_mov_b32 s4, 0x4f7ffffe
+; GFX9-NEXT: s_mov_b32 s6, s10
+; GFX9-NEXT: s_mov_b32 s7, s11
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s4, s2
+; GFX9-NEXT: s_mov_b32 s5, s3
+; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GFX9-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
+; GFX9-NEXT: s_mov_b32 s2, 0x4f7ffffe
; GFX9-NEXT: s_mov_b32 s8, s0
; GFX9-NEXT: s_mov_b32 s9, s1
-; GFX9-NEXT: s_mov_b32 s0, s2
-; GFX9-NEXT: s_mov_b32 s1, s3
-; GFX9-NEXT: s_mov_b32 s2, s10
-; GFX9-NEXT: s_mov_b32 s3, s11
-; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0
-; GFX9-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_ashrrev_i32_e32 v8, 31, v0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cvt_f32_u32_e32 v14, v7
; GFX9-NEXT: v_rcp_iflag_f32_e32 v10, v10
; GFX9-NEXT: v_rcp_iflag_f32_e32 v12, v12
-; GFX9-NEXT: v_mul_f32_e32 v8, s4, v8
+; GFX9-NEXT: v_mul_f32_e32 v8, s2, v8
; GFX9-NEXT: v_rcp_iflag_f32_e32 v14, v14
; GFX9-NEXT: v_cvt_u32_f32_e32 v8, v8
-; GFX9-NEXT: v_mul_f32_e32 v10, s4, v10
-; GFX9-NEXT: v_mul_f32_e32 v12, s4, v12
+; GFX9-NEXT: v_mul_f32_e32 v10, s2, v10
+; GFX9-NEXT: v_mul_f32_e32 v12, s2, v12
; GFX9-NEXT: v_cvt_u32_f32_e32 v10, v10
; GFX9-NEXT: v_sub_u32_e32 v9, 0, v4
-; GFX9-NEXT: v_mul_f32_e32 v14, s4, v14
+; GFX9-NEXT: v_mul_f32_e32 v14, s2, v14
; GFX9-NEXT: v_cvt_u32_f32_e32 v12, v12
; GFX9-NEXT: v_mul_lo_u32 v9, v9, v8
; GFX9-NEXT: v_cvt_u32_f32_e32 v14, v14
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b32 s10, s2
+; GCN-NEXT: s_mov_b32 s11, s3
; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s8, s6
+; GCN-NEXT: s_mov_b32 s9, s7
+; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GCN-NEXT: s_mov_b32 s0, s4
; GCN-NEXT: s_mov_b32 s1, s5
-; GCN-NEXT: s_mov_b32 s4, s6
-; GCN-NEXT: s_mov_b32 s5, s7
-; GCN-NEXT: s_mov_b32 s6, s2
-; GCN-NEXT: s_mov_b32 s7, s3
-; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_ashrrev_i32_e32 v4, 31, v0
; GCN-NEXT: v_ashrrev_i32_e32 v5, 31, v1
; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; TONGA-NEXT: s_mov_b32 s3, 0xf000
; TONGA-NEXT: s_mov_b32 s2, -1
+; TONGA-NEXT: s_mov_b32 s10, s2
+; TONGA-NEXT: s_mov_b32 s11, s3
; TONGA-NEXT: s_waitcnt lgkmcnt(0)
+; TONGA-NEXT: s_mov_b32 s8, s6
+; TONGA-NEXT: s_mov_b32 s9, s7
+; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; TONGA-NEXT: s_mov_b32 s0, s4
; TONGA-NEXT: s_mov_b32 s1, s5
-; TONGA-NEXT: s_mov_b32 s4, s6
-; TONGA-NEXT: s_mov_b32 s5, s7
-; TONGA-NEXT: s_mov_b32 s6, s2
-; TONGA-NEXT: s_mov_b32 s7, s3
-; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
; TONGA-NEXT: s_waitcnt vmcnt(0)
; TONGA-NEXT: v_ashrrev_i32_e32 v4, 31, v0
; TONGA-NEXT: v_ashrrev_i32_e32 v5, 31, v1
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s8, s6
+; GFX9-NEXT: s_mov_b32 s9, s7
+; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: s_mov_b32 s1, s5
-; GFX9-NEXT: s_mov_b32 s4, s6
-; GFX9-NEXT: s_mov_b32 s5, s7
-; GFX9-NEXT: s_mov_b32 s6, s2
-; GFX9-NEXT: s_mov_b32 s7, s3
-; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_ashrrev_i32_e32 v4, 31, v0
; GFX9-NEXT: v_ashrrev_i32_e32 v5, 31, v1
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b32 s10, s2
+; GCN-NEXT: s_mov_b32 s11, s3
; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s8, s6
+; GCN-NEXT: s_mov_b32 s9, s7
+; GCN-NEXT: buffer_load_ushort v0, off, s[8:11], 0
+; GCN-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:2
+; GCN-NEXT: buffer_load_ushort v2, off, s[8:11], 0 offset:4
+; GCN-NEXT: buffer_load_ubyte v3, off, s[8:11], 0 offset:6
; GCN-NEXT: s_mov_b32 s0, s4
; GCN-NEXT: s_mov_b32 s1, s5
-; GCN-NEXT: s_mov_b32 s4, s6
-; GCN-NEXT: s_mov_b32 s5, s7
-; GCN-NEXT: s_mov_b32 s6, s2
-; GCN-NEXT: s_mov_b32 s7, s3
-; GCN-NEXT: buffer_load_ushort v0, off, s[4:7], 0
-; GCN-NEXT: buffer_load_ubyte v1, off, s[4:7], 0 offset:2
-; GCN-NEXT: buffer_load_ushort v2, off, s[4:7], 0 offset:4
-; GCN-NEXT: buffer_load_ubyte v3, off, s[4:7], 0 offset:6
; GCN-NEXT: s_waitcnt vmcnt(2)
; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GCN-NEXT: v_or_b32_e32 v0, v0, v1
; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; TONGA-NEXT: s_mov_b32 s3, 0xf000
; TONGA-NEXT: s_mov_b32 s2, -1
+; TONGA-NEXT: s_mov_b32 s10, s2
+; TONGA-NEXT: s_mov_b32 s11, s3
; TONGA-NEXT: s_waitcnt lgkmcnt(0)
+; TONGA-NEXT: s_mov_b32 s8, s6
+; TONGA-NEXT: s_mov_b32 s9, s7
+; TONGA-NEXT: buffer_load_ushort v0, off, s[8:11], 0
+; TONGA-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:2
+; TONGA-NEXT: buffer_load_ushort v2, off, s[8:11], 0 offset:4
+; TONGA-NEXT: buffer_load_ubyte v3, off, s[8:11], 0 offset:6
; TONGA-NEXT: s_mov_b32 s0, s4
; TONGA-NEXT: s_mov_b32 s1, s5
-; TONGA-NEXT: s_mov_b32 s4, s6
-; TONGA-NEXT: s_mov_b32 s5, s7
-; TONGA-NEXT: s_mov_b32 s6, s2
-; TONGA-NEXT: s_mov_b32 s7, s3
-; TONGA-NEXT: buffer_load_ushort v0, off, s[4:7], 0
-; TONGA-NEXT: buffer_load_ubyte v1, off, s[4:7], 0 offset:2
-; TONGA-NEXT: buffer_load_ushort v2, off, s[4:7], 0 offset:4
-; TONGA-NEXT: buffer_load_ubyte v3, off, s[4:7], 0 offset:6
; TONGA-NEXT: s_waitcnt vmcnt(2)
; TONGA-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; TONGA-NEXT: v_or_b32_e32 v0, v0, v1
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s8, s6
+; GFX9-NEXT: s_mov_b32 s9, s7
+; GFX9-NEXT: buffer_load_ushort v0, off, s[8:11], 0
+; GFX9-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:2
+; GFX9-NEXT: buffer_load_ushort v2, off, s[8:11], 0 offset:4
+; GFX9-NEXT: buffer_load_ubyte v3, off, s[8:11], 0 offset:6
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: s_mov_b32 s1, s5
-; GFX9-NEXT: s_mov_b32 s4, s6
-; GFX9-NEXT: s_mov_b32 s5, s7
-; GFX9-NEXT: s_mov_b32 s6, s2
-; GFX9-NEXT: s_mov_b32 s7, s3
-; GFX9-NEXT: buffer_load_ushort v0, off, s[4:7], 0
-; GFX9-NEXT: buffer_load_ubyte v1, off, s[4:7], 0 offset:2
-; GFX9-NEXT: buffer_load_ushort v2, off, s[4:7], 0 offset:4
-; GFX9-NEXT: buffer_load_ubyte v3, off, s[4:7], 0 offset:6
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_or_b32_e32 v0, v0, v1
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b32 s10, s2
+; GCN-NEXT: s_mov_b32 s11, s3
; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s8, s6
+; GCN-NEXT: s_mov_b32 s9, s7
+; GCN-NEXT: buffer_load_ushort v0, off, s[8:11], 0
+; GCN-NEXT: buffer_load_sbyte v1, off, s[8:11], 0 offset:2
+; GCN-NEXT: buffer_load_ushort v2, off, s[8:11], 0 offset:4
+; GCN-NEXT: buffer_load_sbyte v3, off, s[8:11], 0 offset:6
; GCN-NEXT: s_mov_b32 s0, s4
; GCN-NEXT: s_mov_b32 s1, s5
-; GCN-NEXT: s_mov_b32 s4, s6
-; GCN-NEXT: s_mov_b32 s5, s7
-; GCN-NEXT: s_mov_b32 s6, s2
-; GCN-NEXT: s_mov_b32 s7, s3
-; GCN-NEXT: buffer_load_ushort v0, off, s[4:7], 0
-; GCN-NEXT: buffer_load_sbyte v1, off, s[4:7], 0 offset:2
-; GCN-NEXT: buffer_load_ushort v2, off, s[4:7], 0 offset:4
-; GCN-NEXT: buffer_load_sbyte v3, off, s[4:7], 0 offset:6
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v3
; GCN-NEXT: v_or_b32_e32 v2, v2, v4
; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; TONGA-NEXT: s_mov_b32 s3, 0xf000
; TONGA-NEXT: s_mov_b32 s2, -1
+; TONGA-NEXT: s_mov_b32 s10, s2
+; TONGA-NEXT: s_mov_b32 s11, s3
; TONGA-NEXT: s_waitcnt lgkmcnt(0)
+; TONGA-NEXT: s_mov_b32 s8, s6
+; TONGA-NEXT: s_mov_b32 s9, s7
+; TONGA-NEXT: buffer_load_ushort v0, off, s[8:11], 0
+; TONGA-NEXT: buffer_load_sbyte v1, off, s[8:11], 0 offset:2
+; TONGA-NEXT: buffer_load_ushort v2, off, s[8:11], 0 offset:4
+; TONGA-NEXT: buffer_load_sbyte v3, off, s[8:11], 0 offset:6
; TONGA-NEXT: s_mov_b32 s0, s4
; TONGA-NEXT: s_mov_b32 s1, s5
-; TONGA-NEXT: s_mov_b32 s4, s6
-; TONGA-NEXT: s_mov_b32 s5, s7
-; TONGA-NEXT: s_mov_b32 s6, s2
-; TONGA-NEXT: s_mov_b32 s7, s3
-; TONGA-NEXT: buffer_load_ushort v0, off, s[4:7], 0
-; TONGA-NEXT: buffer_load_sbyte v1, off, s[4:7], 0 offset:2
-; TONGA-NEXT: buffer_load_ushort v2, off, s[4:7], 0 offset:4
-; TONGA-NEXT: buffer_load_sbyte v3, off, s[4:7], 0 offset:6
; TONGA-NEXT: s_waitcnt vmcnt(0)
; TONGA-NEXT: v_lshlrev_b32_e32 v4, 16, v3
; TONGA-NEXT: v_or_b32_e32 v2, v2, v4
; GCN-NEXT: s_mov_b32 s0, s4
; GCN-NEXT: s_mov_b32 s1, s5
; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0
-; GCN-NEXT: s_mov_b32 s0, 0x1389c755
-; GCN-NEXT: s_mov_b32 s4, s6
-; GCN-NEXT: s_mov_b32 s5, s7
-; GCN-NEXT: s_mov_b32 s6, s2
-; GCN-NEXT: s_mov_b32 s7, s3
+; GCN-NEXT: s_mov_b32 s4, 0x1389c755
+; GCN-NEXT: s_mov_b32 s0, s6
+; GCN-NEXT: s_mov_b32 s1, s7
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_hi_i32 v0, v0, s0
-; GCN-NEXT: v_mul_hi_i32 v1, v1, s0
-; GCN-NEXT: v_mul_hi_i32 v2, v2, s0
-; GCN-NEXT: v_mul_hi_i32 v3, v3, s0
+; GCN-NEXT: v_mul_hi_i32 v0, v0, s4
+; GCN-NEXT: v_mul_hi_i32 v1, v1, s4
+; GCN-NEXT: v_mul_hi_i32 v2, v2, s4
+; GCN-NEXT: v_mul_hi_i32 v3, v3, s4
; GCN-NEXT: v_lshrrev_b32_e32 v4, 31, v0
; GCN-NEXT: v_ashrrev_i32_e32 v0, 12, v0
; GCN-NEXT: v_lshrrev_b32_e32 v5, 31, v1
; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v5
; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v6
; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v7
-; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GCN-NEXT: s_endpgm
;
; TONGA-LABEL: scalarize_mulhs_4xi32:
; TONGA-NEXT: s_mov_b32 s0, s4
; TONGA-NEXT: s_mov_b32 s1, s5
; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0
-; TONGA-NEXT: s_mov_b32 s0, 0x1389c755
-; TONGA-NEXT: s_mov_b32 s4, s6
-; TONGA-NEXT: s_mov_b32 s5, s7
-; TONGA-NEXT: s_mov_b32 s6, s2
-; TONGA-NEXT: s_mov_b32 s7, s3
+; TONGA-NEXT: s_mov_b32 s4, 0x1389c755
+; TONGA-NEXT: s_mov_b32 s0, s6
+; TONGA-NEXT: s_mov_b32 s1, s7
; TONGA-NEXT: s_waitcnt vmcnt(0)
-; TONGA-NEXT: v_mul_hi_i32 v0, v0, s0
-; TONGA-NEXT: v_mul_hi_i32 v1, v1, s0
-; TONGA-NEXT: v_mul_hi_i32 v2, v2, s0
-; TONGA-NEXT: v_mul_hi_i32 v3, v3, s0
+; TONGA-NEXT: v_mul_hi_i32 v0, v0, s4
+; TONGA-NEXT: v_mul_hi_i32 v1, v1, s4
+; TONGA-NEXT: v_mul_hi_i32 v2, v2, s4
+; TONGA-NEXT: v_mul_hi_i32 v3, v3, s4
; TONGA-NEXT: v_lshrrev_b32_e32 v4, 31, v0
; TONGA-NEXT: v_ashrrev_i32_e32 v0, 12, v0
; TONGA-NEXT: v_lshrrev_b32_e32 v5, 31, v1
; TONGA-NEXT: v_add_u32_e32 v1, vcc, v1, v5
; TONGA-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; TONGA-NEXT: v_add_u32_e32 v3, vcc, v3, v7
-; TONGA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; TONGA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; TONGA-NEXT: s_endpgm
;
; GFX9-LABEL: scalarize_mulhs_4xi32:
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0
-; GFX9-NEXT: s_mov_b32 s0, 0x1389c755
-; GFX9-NEXT: s_mov_b32 s4, s6
-; GFX9-NEXT: s_mov_b32 s5, s7
-; GFX9-NEXT: s_mov_b32 s6, s2
-; GFX9-NEXT: s_mov_b32 s7, s3
+; GFX9-NEXT: s_mov_b32 s4, 0x1389c755
+; GFX9-NEXT: s_mov_b32 s0, s6
+; GFX9-NEXT: s_mov_b32 s1, s7
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mul_hi_i32 v0, v0, s0
-; GFX9-NEXT: v_mul_hi_i32 v1, v1, s0
-; GFX9-NEXT: v_mul_hi_i32 v2, v2, s0
-; GFX9-NEXT: v_mul_hi_i32 v3, v3, s0
+; GFX9-NEXT: v_mul_hi_i32 v0, v0, s4
+; GFX9-NEXT: v_mul_hi_i32 v1, v1, s4
+; GFX9-NEXT: v_mul_hi_i32 v2, v2, s4
+; GFX9-NEXT: v_mul_hi_i32 v3, v3, s4
; GFX9-NEXT: v_lshrrev_b32_e32 v4, 31, v0
; GFX9-NEXT: v_ashrrev_i32_e32 v0, 12, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v5, 31, v1
; GFX9-NEXT: v_add_u32_e32 v1, v1, v5
; GFX9-NEXT: v_add_u32_e32 v2, v2, v6
; GFX9-NEXT: v_add_u32_e32 v3, v3, v7
-; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: scalarize_mulhs_4xi32:
define amdgpu_kernel void @s_test_sdiv24_k_num_i64(i64 addrspace(1)* %out, i64 %x) {
; GCN-LABEL: s_test_sdiv24_k_num_i64:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_ashr_i64 s[6:7], s[6:7], 40
-; GCN-NEXT: v_cvt_f32_i32_e32 v0, s6
-; GCN-NEXT: s_mov_b32 s7, 0x41c00000
-; GCN-NEXT: s_mov_b32 s0, s4
-; GCN-NEXT: s_ashr_i32 s4, s6, 30
+; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 40
+; GCN-NEXT: v_cvt_f32_i32_e32 v0, s2
+; GCN-NEXT: s_mov_b32 s3, 0x41c00000
+; GCN-NEXT: s_mov_b32 s4, s0
+; GCN-NEXT: s_ashr_i32 s0, s2, 30
; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v0
-; GCN-NEXT: s_mov_b32 s1, s5
-; GCN-NEXT: s_or_b32 s6, s4, 1
-; GCN-NEXT: v_mul_f32_e32 v1, s7, v1
+; GCN-NEXT: s_mov_b32 s5, s1
+; GCN-NEXT: s_or_b32 s2, s0, 1
+; GCN-NEXT: v_mul_f32_e32 v1, s3, v1
; GCN-NEXT: v_trunc_f32_e32 v1, v1
-; GCN-NEXT: v_mad_f32 v2, -v1, v0, s7
+; GCN-NEXT: v_mad_f32 v2, -v1, v0, s3
; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1
-; GCN-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, |v0|
-; GCN-NEXT: s_cmp_lg_u32 s4, 0
-; GCN-NEXT: s_cselect_b32 s4, s6, 0
-; GCN-NEXT: v_add_i32_e32 v0, vcc, s4, v1
+; GCN-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0|
+; GCN-NEXT: s_cmp_lg_u32 s0, 0
+; GCN-NEXT: s_cselect_b32 s0, s2, 0
+; GCN-NEXT: v_add_i32_e32 v0, vcc, s0, v1
; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24
; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GCN-NEXT: s_endpgm
;
; GCN-IR-LABEL: s_test_sdiv24_k_num_i64:
; GCN-IR: ; %bb.0:
-; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GCN-IR-NEXT: s_mov_b32 s3, 0xf000
-; GCN-IR-NEXT: s_mov_b32 s2, -1
+; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN-IR-NEXT: s_mov_b32 s7, 0xf000
+; GCN-IR-NEXT: s_mov_b32 s6, -1
; GCN-IR-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT: s_ashr_i64 s[6:7], s[6:7], 40
-; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s6
-; GCN-IR-NEXT: s_mov_b32 s7, 0x41c00000
-; GCN-IR-NEXT: s_mov_b32 s0, s4
-; GCN-IR-NEXT: s_ashr_i32 s4, s6, 30
+; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[2:3], 40
+; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s2
+; GCN-IR-NEXT: s_mov_b32 s3, 0x41c00000
+; GCN-IR-NEXT: s_mov_b32 s4, s0
+; GCN-IR-NEXT: s_ashr_i32 s0, s2, 30
; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v1, v0
-; GCN-IR-NEXT: s_mov_b32 s1, s5
-; GCN-IR-NEXT: s_or_b32 s6, s4, 1
-; GCN-IR-NEXT: v_mul_f32_e32 v1, s7, v1
+; GCN-IR-NEXT: s_mov_b32 s5, s1
+; GCN-IR-NEXT: s_or_b32 s2, s0, 1
+; GCN-IR-NEXT: v_mul_f32_e32 v1, s3, v1
; GCN-IR-NEXT: v_trunc_f32_e32 v1, v1
-; GCN-IR-NEXT: v_mad_f32 v2, -v1, v0, s7
+; GCN-IR-NEXT: v_mad_f32 v2, -v1, v0, s3
; GCN-IR-NEXT: v_cvt_i32_f32_e32 v1, v1
-; GCN-IR-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, |v0|
-; GCN-IR-NEXT: s_cmp_lg_u32 s4, 0
-; GCN-IR-NEXT: s_cselect_b32 s4, s6, 0
-; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, s4, v1
+; GCN-IR-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0|
+; GCN-IR-NEXT: s_cmp_lg_u32 s0, 0
+; GCN-IR-NEXT: s_cselect_b32 s0, s2, 0
+; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, s0, v1
; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 24
; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GCN-IR-NEXT: s_endpgm
%x.shr = ashr i64 %x, 40
%result = sdiv i64 24, %x.shr
; GCN-LABEL: {{^}}mul_v2i16:
; NOSDWA: v_lshrrev_b32_e32 v[[DST0:[0-9]+]], 16, v{{[0-9]+}}
; NOSDWA: v_lshrrev_b32_e32 v[[DST1:[0-9]+]], 16, v{{[0-9]+}}
-; NOSDWA: v_mul_u32_u24_e32 v[[DST_MUL:[0-9]+]], v[[DST1]], v[[DST0]]
+; NOSDWA: v_mul_u32_u24_e32 v[[DST_MUL:[0-9]+]], v[[DST0]], v[[DST1]]
; NOSDWA: v_lshlrev_b32_e32 v[[DST_SHL:[0-9]+]], 16, v[[DST_MUL]]
; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[DST_SHL]]
; NOSDWA-NOT: v_mul_u32_u24_sdwa
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_mov_b32 s18, s2
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s0, s4
-; VI-NEXT: s_mov_b32 s1, s5
-; VI-NEXT: s_mov_b32 s4, s6
-; VI-NEXT: s_mov_b32 s5, s7
-; VI-NEXT: s_mov_b32 s6, s2
-; VI-NEXT: s_mov_b32 s7, s3
-; VI-NEXT: s_mov_b32 s16, s8
-; VI-NEXT: s_mov_b32 s17, s9
+; VI-NEXT: s_mov_b32 s16, s6
+; VI-NEXT: s_mov_b32 s17, s7
+; VI-NEXT: s_mov_b32 s19, s3
+; VI-NEXT: s_mov_b32 s20, s8
+; VI-NEXT: s_mov_b32 s21, s9
; VI-NEXT: s_mov_b32 s8, s10
; VI-NEXT: s_mov_b32 s9, s11
-; VI-NEXT: s_mov_b32 s19, s3
+; VI-NEXT: s_mov_b32 s22, s2
+; VI-NEXT: s_mov_b32 s23, s3
; VI-NEXT: s_mov_b32 s10, s2
; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_mov_b32 s14, s2
; VI-NEXT: s_mov_b32 s15, s3
-; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0
-; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0
+; VI-NEXT: buffer_load_ushort v0, off, s[16:19], 0
+; VI-NEXT: buffer_load_ushort v1, off, s[20:23], 0
; VI-NEXT: buffer_load_ushort v2, off, s[8:11], 0
; VI-NEXT: buffer_load_ushort v3, off, s[12:15], 0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_cmp_lt_f16_e32 vcc, v0, v1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_mov_b32 s14, s10
; VI-NEXT: s_mov_b32 s15, s11
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s0
-; VI-NEXT: s_mov_b32 s9, s1
-; VI-NEXT: s_mov_b32 s0, s2
-; VI-NEXT: s_mov_b32 s1, s3
-; VI-NEXT: s_mov_b32 s2, s10
-; VI-NEXT: s_mov_b32 s3, s11
-; VI-NEXT: s_mov_b32 s12, s4
-; VI-NEXT: s_mov_b32 s13, s5
+; VI-NEXT: s_mov_b32 s12, s2
+; VI-NEXT: s_mov_b32 s13, s3
+; VI-NEXT: s_mov_b32 s16, s4
+; VI-NEXT: s_mov_b32 s17, s5
; VI-NEXT: s_mov_b32 s4, s6
; VI-NEXT: s_mov_b32 s5, s7
+; VI-NEXT: s_mov_b32 s18, s10
+; VI-NEXT: s_mov_b32 s19, s11
; VI-NEXT: s_mov_b32 s6, s10
; VI-NEXT: s_mov_b32 s7, s11
-; VI-NEXT: buffer_load_ushort v0, off, s[0:3], 0
-; VI-NEXT: buffer_load_ushort v1, off, s[12:15], 0
+; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0
+; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0
; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s8, s0
+; VI-NEXT: s_mov_b32 s9, s1
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_cmp_lt_f16_e32 vcc, 0.5, v0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_mov_b32 s14, s10
; VI-NEXT: s_mov_b32 s15, s11
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s0
-; VI-NEXT: s_mov_b32 s9, s1
-; VI-NEXT: s_mov_b32 s0, s2
-; VI-NEXT: s_mov_b32 s1, s3
-; VI-NEXT: s_mov_b32 s2, s10
-; VI-NEXT: s_mov_b32 s3, s11
-; VI-NEXT: s_mov_b32 s12, s4
-; VI-NEXT: s_mov_b32 s13, s5
+; VI-NEXT: s_mov_b32 s12, s2
+; VI-NEXT: s_mov_b32 s13, s3
+; VI-NEXT: s_mov_b32 s16, s4
+; VI-NEXT: s_mov_b32 s17, s5
; VI-NEXT: s_mov_b32 s4, s6
; VI-NEXT: s_mov_b32 s5, s7
+; VI-NEXT: s_mov_b32 s18, s10
+; VI-NEXT: s_mov_b32 s19, s11
; VI-NEXT: s_mov_b32 s6, s10
; VI-NEXT: s_mov_b32 s7, s11
-; VI-NEXT: buffer_load_ushort v0, off, s[0:3], 0
-; VI-NEXT: buffer_load_ushort v1, off, s[12:15], 0
+; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0
+; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0
; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s8, s0
+; VI-NEXT: s_mov_b32 s9, s1
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_cmp_gt_f16_e32 vcc, 0.5, v0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_mov_b32 s14, s10
; VI-NEXT: s_mov_b32 s15, s11
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s0
-; VI-NEXT: s_mov_b32 s9, s1
-; VI-NEXT: s_mov_b32 s0, s2
-; VI-NEXT: s_mov_b32 s1, s3
-; VI-NEXT: s_mov_b32 s2, s10
-; VI-NEXT: s_mov_b32 s3, s11
-; VI-NEXT: s_mov_b32 s12, s4
-; VI-NEXT: s_mov_b32 s13, s5
-; VI-NEXT: buffer_load_ushort v0, off, s[0:3], 0
-; VI-NEXT: buffer_load_ushort v1, off, s[12:15], 0
+; VI-NEXT: s_mov_b32 s12, s2
+; VI-NEXT: s_mov_b32 s13, s3
+; VI-NEXT: s_mov_b32 s16, s4
+; VI-NEXT: s_mov_b32 s17, s5
; VI-NEXT: s_mov_b32 s4, s6
; VI-NEXT: s_mov_b32 s5, s7
+; VI-NEXT: s_mov_b32 s18, s10
+; VI-NEXT: s_mov_b32 s19, s11
; VI-NEXT: s_mov_b32 s6, s10
; VI-NEXT: s_mov_b32 s7, s11
-; VI-NEXT: buffer_load_ushort v3, off, s[4:7], 0
-; VI-NEXT: v_mov_b32_e32 v2, 0x3800
+; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0
+; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0
+; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0
+; VI-NEXT: v_mov_b32_e32 v3, 0x3800
+; VI-NEXT: s_mov_b32 s8, s0
+; VI-NEXT: s_mov_b32 s9, s1
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_cmp_nlt_f16_e32 vcc, v0, v1
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
+; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
; VI-NEXT: buffer_store_short v0, off, s[8:11], 0
; VI-NEXT: s_endpgm
half addrspace(1)* %r,
; VI-NEXT: s_mov_b32 s14, s10
; VI-NEXT: s_mov_b32 s15, s11
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s0
-; VI-NEXT: s_mov_b32 s9, s1
-; VI-NEXT: s_mov_b32 s0, s2
-; VI-NEXT: s_mov_b32 s1, s3
-; VI-NEXT: s_mov_b32 s2, s10
-; VI-NEXT: s_mov_b32 s3, s11
-; VI-NEXT: s_mov_b32 s12, s4
-; VI-NEXT: s_mov_b32 s13, s5
-; VI-NEXT: buffer_load_ushort v0, off, s[0:3], 0
-; VI-NEXT: buffer_load_ushort v1, off, s[12:15], 0
+; VI-NEXT: s_mov_b32 s12, s2
+; VI-NEXT: s_mov_b32 s13, s3
+; VI-NEXT: s_mov_b32 s16, s4
+; VI-NEXT: s_mov_b32 s17, s5
; VI-NEXT: s_mov_b32 s4, s6
; VI-NEXT: s_mov_b32 s5, s7
+; VI-NEXT: s_mov_b32 s18, s10
+; VI-NEXT: s_mov_b32 s19, s11
; VI-NEXT: s_mov_b32 s6, s10
; VI-NEXT: s_mov_b32 s7, s11
-; VI-NEXT: buffer_load_ushort v3, off, s[4:7], 0
-; VI-NEXT: v_mov_b32_e32 v2, 0x3800
+; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0
+; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0
+; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0
+; VI-NEXT: v_mov_b32_e32 v3, 0x3800
+; VI-NEXT: s_mov_b32 s8, s0
+; VI-NEXT: s_mov_b32 s9, s1
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_cmp_lt_f16_e32 vcc, v0, v1
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
+; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
; VI-NEXT: buffer_store_short v0, off, s[8:11], 0
; VI-NEXT: s_endpgm
half addrspace(1)* %r,
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_mov_b32 s18, s2
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s0, s4
-; VI-NEXT: s_mov_b32 s1, s5
-; VI-NEXT: s_mov_b32 s4, s6
-; VI-NEXT: s_mov_b32 s5, s7
-; VI-NEXT: s_mov_b32 s6, s2
-; VI-NEXT: s_mov_b32 s7, s3
-; VI-NEXT: s_mov_b32 s16, s8
-; VI-NEXT: s_mov_b32 s17, s9
+; VI-NEXT: s_mov_b32 s16, s6
+; VI-NEXT: s_mov_b32 s17, s7
+; VI-NEXT: s_mov_b32 s19, s3
+; VI-NEXT: s_mov_b32 s20, s8
+; VI-NEXT: s_mov_b32 s21, s9
; VI-NEXT: s_mov_b32 s8, s10
; VI-NEXT: s_mov_b32 s9, s11
-; VI-NEXT: s_mov_b32 s19, s3
+; VI-NEXT: s_mov_b32 s22, s2
+; VI-NEXT: s_mov_b32 s23, s3
; VI-NEXT: s_mov_b32 s10, s2
; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_mov_b32 s14, s2
; VI-NEXT: s_mov_b32 s15, s3
-; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0
-; VI-NEXT: buffer_load_dword v1, off, s[16:19], 0
+; VI-NEXT: buffer_load_dword v0, off, s[16:19], 0
+; VI-NEXT: buffer_load_dword v1, off, s[20:23], 0
; VI-NEXT: buffer_load_dword v2, off, s[12:15], 0
; VI-NEXT: buffer_load_dword v3, off, s[8:11], 0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v0
; VI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: s_mov_b32 s15, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s2
+; SI-NEXT: s_mov_b32 s13, s3
; SI-NEXT: s_mov_b32 s16, s4
; SI-NEXT: s_mov_b32 s17, s5
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s7
-; SI-NEXT: s_mov_b32 s13, s3
-; SI-NEXT: s_mov_b32 s6, s10
-; SI-NEXT: s_mov_b32 s7, s11
; SI-NEXT: s_mov_b32 s18, s10
; SI-NEXT: s_mov_b32 s19, s11
+; SI-NEXT: s_mov_b32 s6, s10
+; SI-NEXT: s_mov_b32 s7, s11
; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0
; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0
; VI-NEXT: s_mov_b32 s14, s10
; VI-NEXT: s_mov_b32 s15, s11
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s0
-; VI-NEXT: s_mov_b32 s9, s1
-; VI-NEXT: s_mov_b32 s0, s2
-; VI-NEXT: s_mov_b32 s1, s3
-; VI-NEXT: s_mov_b32 s12, s4
-; VI-NEXT: s_mov_b32 s13, s5
+; VI-NEXT: s_mov_b32 s12, s2
+; VI-NEXT: s_mov_b32 s13, s3
+; VI-NEXT: s_mov_b32 s16, s4
+; VI-NEXT: s_mov_b32 s17, s5
; VI-NEXT: s_mov_b32 s4, s6
; VI-NEXT: s_mov_b32 s5, s7
-; VI-NEXT: s_mov_b32 s2, s10
-; VI-NEXT: s_mov_b32 s3, s11
+; VI-NEXT: s_mov_b32 s18, s10
+; VI-NEXT: s_mov_b32 s19, s11
; VI-NEXT: s_mov_b32 s6, s10
; VI-NEXT: s_mov_b32 s7, s11
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0
-; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0
+; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0
+; VI-NEXT: buffer_load_dword v1, off, s[16:19], 0
; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0
-; VI-NEXT: s_movk_i32 s0, 0x3900
+; VI-NEXT: s_movk_i32 s2, 0x3900
+; VI-NEXT: s_mov_b32 s8, s0
+; VI-NEXT: s_mov_b32 s9, s1
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; VI-NEXT: v_cmp_lt_f16_e32 vcc, 0.5, v0
; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: v_cmp_lt_f16_e32 vcc, s0, v3
+; VI-NEXT: v_cmp_lt_f16_e32 vcc, s2, v3
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; SI-NEXT: s_mov_b32 s15, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s2
+; SI-NEXT: s_mov_b32 s13, s3
; SI-NEXT: s_mov_b32 s16, s4
; SI-NEXT: s_mov_b32 s17, s5
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s7
-; SI-NEXT: s_mov_b32 s13, s3
-; SI-NEXT: s_mov_b32 s6, s10
-; SI-NEXT: s_mov_b32 s7, s11
; SI-NEXT: s_mov_b32 s18, s10
; SI-NEXT: s_mov_b32 s19, s11
+; SI-NEXT: s_mov_b32 s6, s10
+; SI-NEXT: s_mov_b32 s7, s11
; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0
; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0
; VI-NEXT: s_mov_b32 s14, s10
; VI-NEXT: s_mov_b32 s15, s11
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s0
-; VI-NEXT: s_mov_b32 s9, s1
-; VI-NEXT: s_mov_b32 s0, s2
-; VI-NEXT: s_mov_b32 s1, s3
-; VI-NEXT: s_mov_b32 s12, s4
-; VI-NEXT: s_mov_b32 s13, s5
+; VI-NEXT: s_mov_b32 s12, s2
+; VI-NEXT: s_mov_b32 s13, s3
+; VI-NEXT: s_mov_b32 s16, s4
+; VI-NEXT: s_mov_b32 s17, s5
; VI-NEXT: s_mov_b32 s4, s6
; VI-NEXT: s_mov_b32 s5, s7
-; VI-NEXT: s_mov_b32 s2, s10
-; VI-NEXT: s_mov_b32 s3, s11
+; VI-NEXT: s_mov_b32 s18, s10
+; VI-NEXT: s_mov_b32 s19, s11
; VI-NEXT: s_mov_b32 s6, s10
; VI-NEXT: s_mov_b32 s7, s11
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0
-; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0
+; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0
+; VI-NEXT: buffer_load_dword v1, off, s[16:19], 0
; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0
-; VI-NEXT: s_movk_i32 s0, 0x3900
+; VI-NEXT: s_movk_i32 s2, 0x3900
+; VI-NEXT: s_mov_b32 s8, s0
+; VI-NEXT: s_mov_b32 s9, s1
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; VI-NEXT: v_cmp_gt_f16_e32 vcc, 0.5, v0
; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: v_cmp_gt_f16_e32 vcc, s0, v3
+; VI-NEXT: v_cmp_gt_f16_e32 vcc, s2, v3
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_mov_b32 s16, s4
-; SI-NEXT: s_mov_b32 s17, s5
; SI-NEXT: s_mov_b32 s12, s2
; SI-NEXT: s_mov_b32 s13, s3
-; SI-NEXT: s_mov_b32 s18, s10
-; SI-NEXT: s_mov_b32 s19, s11
+; SI-NEXT: s_mov_b32 s16, s4
+; SI-NEXT: s_mov_b32 s17, s5
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s7
+; SI-NEXT: s_mov_b32 s18, s10
+; SI-NEXT: s_mov_b32 s19, s11
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
-; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0
-; SI-NEXT: buffer_load_dword v3, off, s[16:19], 0
-; SI-NEXT: v_mov_b32_e32 v2, 0x3f200000
+; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0
+; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0
+; SI-NEXT: v_mov_b32_e32 v3, 0x3f200000
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_cvt_f32_f16_e32 v4, v0
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1
; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
; SI-NEXT: v_cvt_f32_f16_e32 v6, v6
-; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v5
-; SI-NEXT: v_cndmask_b32_e32 v0, v2, v6, vcc
-; SI-NEXT: v_cmp_nlt_f32_e32 vcc, v4, v3
+; SI-NEXT: v_cndmask_b32_e32 v0, v3, v6, vcc
+; SI-NEXT: v_cmp_nlt_f32_e32 vcc, v4, v1
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-NEXT: v_cndmask_b32_e32 v1, 0.5, v1, vcc
+; SI-NEXT: v_cndmask_b32_e32 v1, 0.5, v2, vcc
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT: v_or_b32_e32 v0, v1, v0
; VI-NEXT: s_mov_b32 s14, s10
; VI-NEXT: s_mov_b32 s15, s11
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s0
-; VI-NEXT: s_mov_b32 s9, s1
-; VI-NEXT: s_mov_b32 s0, s2
-; VI-NEXT: s_mov_b32 s1, s3
-; VI-NEXT: s_mov_b32 s12, s4
-; VI-NEXT: s_mov_b32 s13, s5
-; VI-NEXT: s_mov_b32 s2, s10
-; VI-NEXT: s_mov_b32 s3, s11
+; VI-NEXT: s_mov_b32 s12, s2
+; VI-NEXT: s_mov_b32 s13, s3
+; VI-NEXT: s_mov_b32 s16, s4
+; VI-NEXT: s_mov_b32 s17, s5
; VI-NEXT: s_mov_b32 s4, s6
; VI-NEXT: s_mov_b32 s5, s7
+; VI-NEXT: s_mov_b32 s18, s10
+; VI-NEXT: s_mov_b32 s19, s11
; VI-NEXT: s_mov_b32 s6, s10
; VI-NEXT: s_mov_b32 s7, s11
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0
-; VI-NEXT: buffer_load_dword v1, off, s[4:7], 0
-; VI-NEXT: buffer_load_dword v4, off, s[12:15], 0
-; VI-NEXT: v_mov_b32_e32 v2, 0x3800
-; VI-NEXT: v_mov_b32_e32 v3, 0x3900
+; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0
+; VI-NEXT: buffer_load_dword v1, off, s[16:19], 0
+; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0
+; VI-NEXT: v_mov_b32_e32 v3, 0x3800
+; VI-NEXT: v_mov_b32_e32 v4, 0x3900
+; VI-NEXT: s_mov_b32 s8, s0
+; VI-NEXT: s_mov_b32 s9, s1
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v0
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_cmp_nlt_f16_e32 vcc, v0, v1
+; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v1
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_cmp_nlt_f16_e32 vcc, v0, v4
-; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v4
-; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2
; VI-NEXT: v_cmp_nlt_f16_e32 vcc, v6, v5
-; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_mov_b32 s16, s4
-; SI-NEXT: s_mov_b32 s17, s5
; SI-NEXT: s_mov_b32 s12, s2
; SI-NEXT: s_mov_b32 s13, s3
-; SI-NEXT: s_mov_b32 s18, s10
-; SI-NEXT: s_mov_b32 s19, s11
+; SI-NEXT: s_mov_b32 s16, s4
+; SI-NEXT: s_mov_b32 s17, s5
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s7
+; SI-NEXT: s_mov_b32 s18, s10
+; SI-NEXT: s_mov_b32 s19, s11
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
-; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0
-; SI-NEXT: buffer_load_dword v3, off, s[16:19], 0
-; SI-NEXT: v_mov_b32_e32 v2, 0x3f200000
+; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0
+; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0
+; SI-NEXT: v_mov_b32_e32 v3, 0x3f200000
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0
; SI-NEXT: s_waitcnt vmcnt(1)
-; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3
+; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2
; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v6, v6
-; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-NEXT: v_cmp_lt_f32_e32 vcc, v4, v5
-; SI-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc
-; SI-NEXT: v_cmp_lt_f32_e32 vcc, v0, v3
-; SI-NEXT: v_cndmask_b32_e32 v0, 0.5, v1, vcc
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc
+; SI-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
+; SI-NEXT: v_cndmask_b32_e32 v0, 0.5, v2, vcc
+; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3
; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT: s_endpgm
; VI-NEXT: s_mov_b32 s14, s10
; VI-NEXT: s_mov_b32 s15, s11
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s0
-; VI-NEXT: s_mov_b32 s9, s1
-; VI-NEXT: s_mov_b32 s0, s2
-; VI-NEXT: s_mov_b32 s1, s3
-; VI-NEXT: s_mov_b32 s12, s4
-; VI-NEXT: s_mov_b32 s13, s5
-; VI-NEXT: s_mov_b32 s2, s10
-; VI-NEXT: s_mov_b32 s3, s11
+; VI-NEXT: s_mov_b32 s12, s2
+; VI-NEXT: s_mov_b32 s13, s3
+; VI-NEXT: s_mov_b32 s16, s4
+; VI-NEXT: s_mov_b32 s17, s5
; VI-NEXT: s_mov_b32 s4, s6
; VI-NEXT: s_mov_b32 s5, s7
+; VI-NEXT: s_mov_b32 s18, s10
+; VI-NEXT: s_mov_b32 s19, s11
; VI-NEXT: s_mov_b32 s6, s10
; VI-NEXT: s_mov_b32 s7, s11
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0
-; VI-NEXT: buffer_load_dword v1, off, s[4:7], 0
-; VI-NEXT: buffer_load_dword v4, off, s[12:15], 0
-; VI-NEXT: v_mov_b32_e32 v2, 0x3800
-; VI-NEXT: v_mov_b32_e32 v3, 0x3900
+; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0
+; VI-NEXT: buffer_load_dword v1, off, s[16:19], 0
+; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0
+; VI-NEXT: v_mov_b32_e32 v3, 0x3800
+; VI-NEXT: v_mov_b32_e32 v4, 0x3900
+; VI-NEXT: s_mov_b32 s8, s0
+; VI-NEXT: s_mov_b32 s9, s1
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v0
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_cmp_lt_f16_e32 vcc, v0, v1
+; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v1
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_cmp_lt_f16_e32 vcc, v0, v4
-; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v4
-; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2
; VI-NEXT: v_cmp_lt_f16_e32 vcc, v6, v5
-; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; VI-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; after 64-bit shift is split.
; GCN-LABEL: {{^}}lshr_and_i64_35:
-; GCN: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
-; GCN: buffer_load_dword v[[LO:[0-9]+]]
+; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
+; GCN-DAG: buffer_load_dword v[[LO:[0-9]+]]
; GCN: v_bfe_u32 v[[BFE:[0-9]+]], v[[LO]], 8, 23
; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}}
define amdgpu_kernel void @lshr_and_i64_35(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b32 s10, s2
+; GCN-NEXT: s_mov_b32 s11, s3
; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s8, s6
+; GCN-NEXT: s_mov_b32 s9, s7
+; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GCN-NEXT: s_mov_b32 s0, s4
; GCN-NEXT: s_mov_b32 s1, s5
-; GCN-NEXT: s_mov_b32 s4, s6
-; GCN-NEXT: s_mov_b32 s5, s7
-; GCN-NEXT: s_mov_b32 s6, s2
-; GCN-NEXT: s_mov_b32 s7, s3
-; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshl_b32_e32 v1, v1, v3
; GCN-NEXT: v_lshl_b32_e32 v0, v0, v2
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b32 s10, s2
+; GCN-NEXT: s_mov_b32 s11, s3
; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s8, s6
+; GCN-NEXT: s_mov_b32 s9, s7
+; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; GCN-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; GCN-NEXT: s_mov_b32 s0, s4
; GCN-NEXT: s_mov_b32 s1, s5
-; GCN-NEXT: s_mov_b32 s4, s6
-; GCN-NEXT: s_mov_b32 s5, s7
-; GCN-NEXT: s_mov_b32 s6, s2
-; GCN-NEXT: s_mov_b32 s7, s3
-; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
-; GCN-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshl_b32_e32 v3, v3, v7
; GCN-NEXT: v_lshl_b32_e32 v2, v2, v6
; GCN-NEXT: s_mov_b32 s8, s6
; GCN-NEXT: s_mov_b32 s9, s7
; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GCN-NEXT: s_mov_b64 s[12:13], s[6:7]
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_mov_b32 s14, 0
; GCN-NEXT: s_mov_b32 s15, s3
-; GCN-NEXT: s_mov_b64 s[12:13], s[6:7]
; GCN-NEXT: buffer_load_dword v2, off, s[8:11], 0
; GCN-NEXT: buffer_load_dword v0, v[0:1], s[12:15], 0 addr64 offset:4
+; GCN-NEXT: s_mov_b32 s6, 0xffff
; GCN-NEXT: s_mov_b32 s0, s4
-; GCN-NEXT: s_mov_b32 s4, 0xffff
; GCN-NEXT: s_mov_b32 s1, s5
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v2
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v0
-; GCN-NEXT: v_and_b32_e32 v0, s4, v0
+; GCN-NEXT: v_and_b32_e32 v0, s6, v0
; GCN-NEXT: v_lshl_b32_e32 v0, v2, v0
; GCN-NEXT: v_lshl_b32_e32 v1, v1, v3
-; GCN-NEXT: v_and_b32_e32 v0, s4, v0
+; GCN-NEXT: v_and_b32_e32 v0, s6, v0
; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GCN-NEXT: v_or_b32_e32 v0, v0, v1
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GCN-NEXT: s_mov_b64 s[0:1], s[6:7]
; GCN-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; GCN-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[0:3], 0 addr64 offset:8
-; GCN-NEXT: s_mov_b32 s8, 0xffff
+; GCN-NEXT: s_mov_b32 s0, 0xffff
; GCN-NEXT: s_mov_b64 s[6:7], s[2:3]
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v2
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v8, s8, v4
+; GCN-NEXT: v_and_b32_e32 v8, s0, v4
; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GCN-NEXT: v_and_b32_e32 v9, s8, v5
+; GCN-NEXT: v_and_b32_e32 v9, s0, v5
; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v3
; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GCN-NEXT: v_lshl_b32_e32 v5, v7, v5
; GCN-NEXT: v_lshl_b32_e32 v4, v6, v4
; GCN-NEXT: v_lshl_b32_e32 v2, v2, v8
; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GCN-NEXT: v_and_b32_e32 v3, s8, v3
+; GCN-NEXT: v_and_b32_e32 v3, s0, v3
; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GCN-NEXT: v_and_b32_e32 v2, s8, v2
+; GCN-NEXT: v_and_b32_e32 v2, s0, v2
; GCN-NEXT: v_or_b32_e32 v3, v3, v5
; GCN-NEXT: v_or_b32_e32 v2, v2, v4
; GCN-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b32 s10, s2
+; GCN-NEXT: s_mov_b32 s11, s3
; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s8, s6
+; GCN-NEXT: s_mov_b32 s9, s7
+; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; GCN-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
+; GCN-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32
+; GCN-NEXT: buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:48
; GCN-NEXT: s_mov_b32 s0, s4
; GCN-NEXT: s_mov_b32 s1, s5
-; GCN-NEXT: s_mov_b32 s4, s6
-; GCN-NEXT: s_mov_b32 s5, s7
-; GCN-NEXT: s_mov_b32 s6, s2
-; GCN-NEXT: s_mov_b32 s7, s3
-; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
-; GCN-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
-; GCN-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32
-; GCN-NEXT: buffer_load_dwordx4 v[11:14], off, s[4:7], 0 offset:48
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_lshl_b64 v[2:3], v[2:3], v10
; GCN-NEXT: s_waitcnt vmcnt(0)
; VI-LABEL: v_shl_v2i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v5, v[0:1]
+; VI-NEXT: flat_load_dword v2, v[2:3]
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
-; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-NEXT: v_add_u32_e32 v4, vcc, 4, v0
-; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dword v0, v[0:1]
-; VI-NEXT: flat_load_dword v1, v[4:5]
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_lshlrev_b16_e32 v4, v1, v0
-; VI-NEXT: v_lshlrev_b16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_or_b32_e32 v0, v4, v0
-; VI-NEXT: flat_store_dword v[2:3], v0
+; VI-NEXT: v_lshlrev_b16_e32 v3, v2, v5
+; VI-NEXT: v_lshlrev_b16_sdwa v2, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT: v_or_b32_e32 v2, v3, v2
+; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; CI-LABEL: v_shl_v2i16:
; CI-NEXT: s_mov_b64 s[0:1], s[6:7]
; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4
-; CI-NEXT: s_mov_b32 s8, 0xffff
+; CI-NEXT: s_mov_b32 s0, 0xffff
; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
; CI-NEXT: s_waitcnt vmcnt(1)
; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v2
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: v_and_b32_e32 v5, s8, v3
+; CI-NEXT: v_and_b32_e32 v5, s0, v3
; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; CI-NEXT: v_lshl_b32_e32 v3, v4, v3
; CI-NEXT: v_lshl_b32_e32 v2, v2, v5
; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; CI-NEXT: v_and_b32_e32 v2, s8, v2
+; CI-NEXT: v_and_b32_e32 v2, s0, v2
; CI-NEXT: v_or_b32_e32 v2, v2, v3
; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; CI-NEXT: s_endpgm
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: flat_load_dword v3, v[0:1]
; VI-NEXT: s_lshr_b32 s1, s0, 16
-; VI-NEXT: v_mov_b32_e32 v4, s1
-; VI-NEXT: v_mov_b32_e32 v3, s5
-; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2
-; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2
+; VI-NEXT: v_mov_b32_e32 v2, s1
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_lshlrev_b16_e32 v1, s0, v0
-; VI-NEXT: v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; VI-NEXT: v_or_b32_e32 v0, v1, v0
-; VI-NEXT: flat_store_dword v[2:3], v0
+; VI-NEXT: v_lshlrev_b16_e32 v4, s0, v3
+; VI-NEXT: v_lshlrev_b16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; VI-NEXT: v_or_b32_e32 v2, v4, v2
+; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; CI-LABEL: shl_v_s_v2i16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
-; CI-NEXT: s_load_dword s0, s[0:1], 0xd
-; CI-NEXT: s_mov_b32 s8, 0xffff
+; CI-NEXT: s_load_dword s8, s[0:1], 0xd
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_lshr_b32 s9, s0, 16
-; CI-NEXT: s_and_b32 s10, s0, s8
; CI-NEXT: s_mov_b64 s[0:1], s[6:7]
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
+; CI-NEXT: s_mov_b32 s0, 0xffff
+; CI-NEXT: s_lshr_b32 s1, s8, 16
+; CI-NEXT: s_and_b32 s8, s8, s0
; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; CI-NEXT: v_lshlrev_b32_e32 v2, s10, v2
-; CI-NEXT: v_lshlrev_b32_e32 v3, s9, v3
-; CI-NEXT: v_and_b32_e32 v2, s8, v2
+; CI-NEXT: v_lshlrev_b32_e32 v2, s8, v2
+; CI-NEXT: v_lshlrev_b32_e32 v3, s1, v3
+; CI-NEXT: v_and_b32_e32 v2, s0, v2
; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; CI-NEXT: v_or_b32_e32 v2, v2, v3
; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dword v0, v[0:1]
+; VI-NEXT: flat_load_dword v3, v[0:1]
; VI-NEXT: s_lshr_b32 s1, s0, 16
-; VI-NEXT: v_mov_b32_e32 v4, s1
-; VI-NEXT: v_mov_b32_e32 v3, s5
-; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2
-; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2
+; VI-NEXT: v_mov_b32_e32 v2, s1
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_lshlrev_b16_e64 v1, v0, s0
-; VI-NEXT: v_lshlrev_b16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_or_b32_e32 v0, v1, v0
-; VI-NEXT: flat_store_dword v[2:3], v0
+; VI-NEXT: v_lshlrev_b16_e64 v4, v3, s0
+; VI-NEXT: v_lshlrev_b16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v2, v4, v2
+; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; CI-LABEL: shl_s_v_v2i16:
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_mov_b32 s0, 0xffff
-; CI-NEXT: s_lshr_b32 s9, s8, 16
+; CI-NEXT: s_lshr_b32 s1, s8, 16
; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_and_b32_e32 v3, s0, v2
; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; CI-NEXT: v_lshl_b32_e32 v2, s9, v2
+; CI-NEXT: v_lshl_b32_e32 v2, s1, v2
; CI-NEXT: v_lshl_b32_e32 v3, s8, v3
; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; CI-NEXT: v_and_b32_e32 v3, s0, v3
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dword v0, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
-; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; VI-NEXT: flat_load_dword v3, v[0:1]
+; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_lshlrev_b16_e64 v1, v0, 8
-; VI-NEXT: v_lshlrev_b16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_or_b32_e32 v0, v1, v0
-; VI-NEXT: flat_store_dword v[2:3], v0
+; VI-NEXT: v_lshlrev_b16_e64 v2, v3, 8
+; VI-NEXT: v_lshlrev_b16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v2, v2, v3
+; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; CI-LABEL: shl_imm_v_v2i16:
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dword v0, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
-; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; VI-NEXT: flat_load_dword v3, v[0:1]
+; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v0
-; VI-NEXT: v_and_b32_e32 v1, 0xff000000, v1
-; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
-; VI-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-NEXT: flat_store_dword v[2:3], v0
+; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v3
+; VI-NEXT: v_and_b32_e32 v2, 0xff000000, v2
+; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3
+; VI-NEXT: v_or_b32_e32 v2, v3, v2
+; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; CI-LABEL: shl_v_imm_v2i16:
; GFX9-LABEL: v_shl_v4i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v4
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
+; GFX9-NEXT: global_load_dwordx2 v[2:3], v[0:1], off
; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off offset:8
-; GFX9-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
-; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT: v_mov_b32_e32 v5, s1
+; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s0, v4
+; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_pk_lshlrev_b16 v1, v1, v5
-; GFX9-NEXT: v_pk_lshlrev_b16 v0, v0, v4
-; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
+; GFX9-NEXT: v_pk_lshlrev_b16 v1, v1, v3
+; GFX9-NEXT: v_pk_lshlrev_b16 v0, v0, v2
+; GFX9-NEXT: global_store_dwordx2 v[4:5], v[0:1], off
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_shl_v4i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
-; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-NEXT: v_add_u32_e32 v4, vcc, 8, v0
-; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5]
+; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
+; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4
+; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_lshlrev_b16_e32 v6, v5, v1
-; VI-NEXT: v_lshlrev_b16_sdwa v1, v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_lshlrev_b16_e32 v5, v4, v0
-; VI-NEXT: v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT: v_lshlrev_b16_e32 v6, v3, v1
+; VI-NEXT: v_lshlrev_b16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT: v_lshlrev_b16_e32 v3, v2, v0
+; VI-NEXT: v_lshlrev_b16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_or_b32_e32 v1, v6, v1
-; VI-NEXT: v_or_b32_e32 v0, v5, v0
-; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: v_or_b32_e32 v0, v3, v0
+; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
; VI-NEXT: s_endpgm
;
; CI-LABEL: v_shl_v4i16:
; CI-NEXT: s_mov_b64 s[0:1], s[6:7]
; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; CI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[0:3], 0 addr64 offset:8
-; CI-NEXT: s_mov_b32 s8, 0xffff
+; CI-NEXT: s_mov_b32 s0, 0xffff
; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
; CI-NEXT: s_waitcnt vmcnt(1)
; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v2
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: v_and_b32_e32 v8, s8, v4
+; CI-NEXT: v_and_b32_e32 v8, s0, v4
; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; CI-NEXT: v_and_b32_e32 v9, s8, v5
+; CI-NEXT: v_and_b32_e32 v9, s0, v5
; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v3
; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; CI-NEXT: v_lshl_b32_e32 v5, v7, v5
; CI-NEXT: v_lshl_b32_e32 v4, v6, v4
; CI-NEXT: v_lshl_b32_e32 v2, v2, v8
; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; CI-NEXT: v_and_b32_e32 v3, s8, v3
+; CI-NEXT: v_and_b32_e32 v3, s0, v3
; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; CI-NEXT: v_and_b32_e32 v2, s8, v2
+; CI-NEXT: v_and_b32_e32 v2, s0, v2
; CI-NEXT: v_or_b32_e32 v3, v3, v5
; CI-NEXT: v_or_b32_e32 v2, v2, v4
; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
-; VI-NEXT: s_mov_b32 s0, 0xff000000
+; VI-NEXT: s_mov_b32 s2, 0xff000000
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v1
; VI-NEXT: v_lshlrev_b16_e32 v5, 8, v0
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0
-; VI-NEXT: v_and_b32_e32 v0, s0, v0
+; VI-NEXT: v_and_b32_e32 v0, s2, v0
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT: v_and_b32_e32 v4, s0, v4
+; VI-NEXT: v_and_b32_e32 v4, s2, v4
; VI-NEXT: v_or_b32_e32 v1, v1, v4
; VI-NEXT: v_or_b32_e32 v0, v5, v0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_mov_b64 s[0:1], s[6:7]
; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
-; CI-NEXT: s_mov_b32 s8, 0xff00
+; CI-NEXT: s_mov_b32 s0, 0xff00
; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_lshrrev_b32_e32 v4, 8, v3
; CI-NEXT: v_lshlrev_b32_e32 v3, 8, v3
-; CI-NEXT: v_and_b32_e32 v4, s8, v4
+; CI-NEXT: v_and_b32_e32 v4, s0, v4
; CI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; CI-NEXT: v_and_b32_e32 v3, s8, v3
+; CI-NEXT: v_and_b32_e32 v3, s0, v3
; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; CI-NEXT: v_or_b32_e32 v3, v3, v4
; CI-NEXT: v_and_b32_e32 v2, 0xff00ff00, v2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v3, v[0:1]
; VI-NEXT: flat_load_dword v4, v[0:1]
-; VI-NEXT: flat_load_dword v0, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
-; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
-; VI-NEXT: v_subrev_u32_e32 v1, vcc, 64, v4
+; VI-NEXT: v_subrev_u32_e32 v2, vcc, 64, v3
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_subrev_u32_e32 v0, vcc, 64, v0
-; VI-NEXT: flat_store_dword v[2:3], v1
-; VI-NEXT: flat_store_dword v[2:3], v0
+; VI-NEXT: v_subrev_u32_e32 v3, vcc, 64, v4
+; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: flat_store_dword v[0:1], v3
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_i32_x_sub_64_multi_use:
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: global_load_dword v3, v[0:1], off
; GFX9-NEXT: global_load_dword v4, v[0:1], off
-; GFX9-NEXT: global_load_dword v0, v[0:1], off
-; GFX9-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
-; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_subrev_u32_e32 v1, 64, v4
+; GFX9-NEXT: v_subrev_u32_e32 v2, 64, v3
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_subrev_u32_e32 v0, 64, v0
-; GFX9-NEXT: global_store_dword v[2:3], v1, off
-; GFX9-NEXT: global_store_dword v[2:3], v0, off
+; GFX9-NEXT: v_subrev_u32_e32 v3, 64, v4
+; GFX9-NEXT: global_store_dword v[0:1], v2, off
+; GFX9-NEXT: global_store_dword v[0:1], v3, off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_i32_x_sub_64_multi_use:
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_ushort v3, v[0:1]
; VI-NEXT: flat_load_ushort v4, v[0:1]
-; VI-NEXT: flat_load_ushort v0, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
-; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
-; VI-NEXT: v_subrev_u16_e32 v1, 64, v4
+; VI-NEXT: v_subrev_u16_e32 v2, 64, v3
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_subrev_u16_e32 v0, 64, v0
-; VI-NEXT: flat_store_short v[2:3], v1
-; VI-NEXT: flat_store_short v[2:3], v0
+; VI-NEXT: v_subrev_u16_e32 v3, 64, v4
+; VI-NEXT: flat_store_short v[0:1], v2
+; VI-NEXT: flat_store_short v[0:1], v3
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_i16_x_sub_64_multi_use:
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: global_load_ushort v3, v[0:1], off
; GFX9-NEXT: global_load_ushort v4, v[0:1], off
-; GFX9-NEXT: global_load_ushort v0, v[0:1], off
-; GFX9-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
-; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_subrev_u16_e32 v1, 64, v4
+; GFX9-NEXT: v_subrev_u16_e32 v2, 64, v3
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_subrev_u16_e32 v0, 64, v0
-; GFX9-NEXT: global_store_short v[2:3], v1, off
-; GFX9-NEXT: global_store_short v[2:3], v0, off
+; GFX9-NEXT: v_subrev_u16_e32 v3, 64, v4
+; GFX9-NEXT: global_store_short v[0:1], v2, off
+; GFX9-NEXT: global_store_short v[0:1], v3, off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_i16_x_sub_64_multi_use:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT: v_mov_b32_e32 v4, 64
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dword v0, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v1, 64
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
-; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; VI-NEXT: flat_load_dword v3, v[0:1]
+; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_sub_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_subrev_u16_e32 v0, 64, v0
-; VI-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-NEXT: flat_store_dword v[2:3], v0
+; VI-NEXT: v_sub_u16_sdwa v2, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_subrev_u16_e32 v3, 64, v3
+; VI-NEXT: v_or_b32_e32 v2, v3, v2
+; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_v2i16_x_sub_64_64:
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dword v0, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
-; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; VI-NEXT: flat_load_dword v3, v[0:1]
+; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_add_u16_e32 v1, -7, v0
-; VI-NEXT: v_sub_u16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_or_b32_e32 v0, v1, v0
-; VI-NEXT: flat_store_dword v[2:3], v0
+; VI-NEXT: v_add_u16_e32 v2, -7, v3
+; VI-NEXT: v_sub_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v2, v2, v3
+; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_v2i16_x_sub_7_64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT: v_mov_b32_e32 v4, 0xffffff85
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dword v0, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v1, 0xffffff85
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
-; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; VI-NEXT: flat_load_dword v3, v[0:1]
+; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_add_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_subrev_u16_e32 v0, 64, v0
-; VI-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-NEXT: flat_store_dword v[2:3], v0
+; VI-NEXT: v_add_u16_sdwa v2, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_subrev_u16_e32 v3, 64, v3
+; VI-NEXT: v_or_b32_e32 v2, v3, v2
+; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_v2i16_x_sub_64_123:
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dword v0, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
-; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; VI-NEXT: flat_load_dword v3, v[0:1]
+; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
-; VI-NEXT: v_add_u16_e32 v0, -7, v0
-; VI-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-NEXT: flat_store_dword v[2:3], v0
+; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
+; VI-NEXT: v_add_u16_e32 v3, -7, v3
+; VI-NEXT: v_or_b32_e32 v2, v3, v2
+; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_v2i16_x_sub_7_0:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT: v_mov_b32_e32 v4, 32
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dword v0, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v1, 32
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
-; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; VI-NEXT: flat_load_dword v3, v[0:1]
+; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_sub_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_subrev_u16_e32 v0, 32, v0
-; VI-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-NEXT: flat_store_dword v[2:3], v0
+; VI-NEXT: v_sub_u16_sdwa v2, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_subrev_u16_e32 v3, 32, v3
+; VI-NEXT: v_or_b32_e32 v2, v3, v2
+; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_v2i16_x_add_neg32_neg32:
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dword v0, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
-; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; VI-NEXT: flat_load_dword v3, v[0:1]
+; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
-; VI-NEXT: v_subrev_u16_e32 v0, 32, v0
-; VI-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-NEXT: flat_store_dword v[2:3], v0
+; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
+; VI-NEXT: v_subrev_u16_e32 v3, 32, v3
+; VI-NEXT: v_or_b32_e32 v2, v3, v2
+; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_v2i16_x_add_neg32_0:
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dword v0, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
-; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; VI-NEXT: flat_load_dword v3, v[0:1]
+; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_add_u16_e32 v1, -16, v0
-; VI-NEXT: v_add_u16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_or_b32_e32 v0, v1, v0
-; VI-NEXT: flat_store_dword v[2:3], v0
+; VI-NEXT: v_add_u16_e32 v2, -16, v3
+; VI-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v2, v2, v3
+; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_v2i16_x_add_neg16_neg16:
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dword v0, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
-; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; VI-NEXT: flat_load_dword v3, v[0:1]
+; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
-; VI-NEXT: v_add_u16_e32 v0, -16, v0
-; VI-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-NEXT: flat_store_dword v[2:3], v0
+; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
+; VI-NEXT: v_add_u16_e32 v3, -16, v3
+; VI-NEXT: v_or_b32_e32 v2, v3, v2
+; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_v2i16_x_add_neg16_0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v3, v[0:1]
+; VI-NEXT: s_movk_i32 s2, 0xc400
+; VI-NEXT: v_mov_b32_e32 v4, s2
+; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dword v0, v[0:1]
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
-; VI-NEXT: s_movk_i32 s0, 0xc400
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_add_u16_e32 v1, s0, v0
-; VI-NEXT: v_add_u16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_or_b32_e32 v0, v1, v0
-; VI-NEXT: flat_store_dword v[2:3], v0
+; VI-NEXT: v_add_u16_e32 v2, s2, v3
+; VI-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v2, v2, v3
+; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_v2i16_x_add_neg_fpone:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v3, v[0:1]
+; VI-NEXT: s_movk_i32 s2, 0x4400
+; VI-NEXT: v_mov_b32_e32 v4, s2
+; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dword v0, v[0:1]
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
-; VI-NEXT: s_movk_i32 s0, 0x4400
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_add_u16_e32 v1, s0, v0
-; VI-NEXT: v_add_u16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_or_b32_e32 v0, v1, v0
-; VI-NEXT: flat_store_dword v[2:3], v0
+; VI-NEXT: v_add_u16_e32 v2, s2, v3
+; VI-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v2, v2, v3
+; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_v2i16_x_add_neg_negfpone:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v3, v[0:1]
+; VI-NEXT: s_movk_i32 s2, 0x4000
+; VI-NEXT: v_mov_b32_e32 v4, s2
+; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dword v0, v[0:1]
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
-; VI-NEXT: s_movk_i32 s0, 0x4000
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_add_u16_e32 v1, s0, v0
-; VI-NEXT: v_add_u16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_or_b32_e32 v0, v1, v0
-; VI-NEXT: flat_store_dword v[2:3], v0
+; VI-NEXT: v_add_u16_e32 v2, s2, v3
+; VI-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v2, v2, v3
+; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_v2i16_x_add_neg_fptwo:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_dword v3, v[0:1]
+; VI-NEXT: s_movk_i32 s2, 0xc000
+; VI-NEXT: v_mov_b32_e32 v4, s2
+; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dword v0, v[0:1]
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
-; VI-NEXT: s_movk_i32 s0, 0xc000
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_add_u16_e32 v1, s0, v0
-; VI-NEXT: v_add_u16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_or_b32_e32 v0, v1, v0
-; VI-NEXT: flat_store_dword v[2:3], v0
+; VI-NEXT: v_add_u16_e32 v2, s2, v3
+; VI-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v2, v2, v3
+; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_v2i16_x_add_neg_negfptwo:
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_mov_b32 s10, s2
+; SI-NEXT: s_mov_b32 s11, s3
; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s8, s6
+; SI-NEXT: s_mov_b32 s9, s7
+; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
; SI-NEXT: s_mov_b32 s0, s4
; SI-NEXT: s_mov_b32 s1, s5
-; SI-NEXT: s_mov_b32 s4, s6
-; SI-NEXT: s_mov_b32 s5, s7
-; SI-NEXT: s_mov_b32 s6, s2
-; SI-NEXT: s_mov_b32 s7, s3
-; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_ashrrev_i32_e32 v1, 24, v0
; SI-NEXT: v_bfe_i32 v2, v0, 16, 8
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
+; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
-; VI-NEXT: s_mov_b32 s4, s6
-; VI-NEXT: s_mov_b32 s5, s7
-; VI-NEXT: s_mov_b32 s6, s2
-; VI-NEXT: s_mov_b32 s7, s3
-; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b16_e32 v1, 8, v0
; VI-NEXT: v_ashrrev_i32_e32 v2, 24, v0
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_mov_b32 s10, s2
+; SI-NEXT: s_mov_b32 s11, s3
; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s8, s6
+; SI-NEXT: s_mov_b32 s9, s7
+; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
; SI-NEXT: s_mov_b32 s0, s4
; SI-NEXT: s_mov_b32 s1, s5
-; SI-NEXT: s_mov_b32 s4, s6
-; SI-NEXT: s_mov_b32 s5, s7
-; SI-NEXT: s_mov_b32 s6, s2
-; SI-NEXT: s_mov_b32 s7, s3
-; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_ashr_i64 v[2:3], v[0:1], 48
; SI-NEXT: v_ashrrev_i32_e32 v3, 16, v0
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
+; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
-; VI-NEXT: s_mov_b32 s4, s6
-; VI-NEXT: s_mov_b32 s5, s7
-; VI-NEXT: s_mov_b32 s6, s2
-; VI-NEXT: s_mov_b32 s7, s3
-; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_ashrrev_i32_e32 v3, 16, v0
; VI-NEXT: v_bfe_i32 v0, v0, 0, 16
; VI-LABEL: v_test_sub_v2i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s9
-; VI-NEXT: v_add_u32_e32 v2, vcc, s8, v2
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: flat_load_dword v1, v[2:3]
-; VI-NEXT: s_mov_b32 s0, s4
-; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_sub_u16_e32 v2, v0, v1
; VI-NEXT: v_sub_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_or_b32_e32 v0, v2, v0
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s0, s4
-; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_load_dword s4, s[6:7], 0x0
-; VI-NEXT: s_load_dword s5, s[8:9], 0x0
+; VI-NEXT: s_load_dword s6, s[8:9], 0x0
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshr_b32 s6, s4, 16
-; VI-NEXT: s_lshr_b32 s7, s5, 16
-; VI-NEXT: s_sub_i32 s4, s4, s5
-; VI-NEXT: s_sub_i32 s5, s6, s7
+; VI-NEXT: s_lshr_b32 s5, s4, 16
+; VI-NEXT: s_lshr_b32 s7, s6, 16
+; VI-NEXT: s_sub_i32 s4, s4, s6
+; VI-NEXT: s_sub_i32 s5, s5, s7
; VI-NEXT: s_and_b32 s4, s4, 0xffff
; VI-NEXT: s_lshl_b32 s5, s5, 16
; VI-NEXT: s_or_b32 s4, s4, s5
;
; VI-LABEL: v_test_sub_v2i16_constant:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT: v_mov_b32_e32 v2, 0xfffffe38
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s7
-; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
-; VI-NEXT: s_mov_b32 s0, s4
-; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: v_mov_b32_e32 v1, 0xfffffe38
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_add_u16_e32 v1, 0xffffff85, v0
-; VI-NEXT: v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_or_b32_e32 v0, v1, v0
+; VI-NEXT: v_add_u16_e32 v2, 0xffffff85, v0
+; VI-NEXT: v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v0, v2, v0
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
;
; VI-LABEL: v_test_sub_v2i16_neg_constant:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT: v_mov_b32_e32 v2, 0x3df
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s7
-; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
-; VI-NEXT: s_mov_b32 s0, s4
-; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: v_mov_b32_e32 v1, 0x3df
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_add_u16_e32 v1, 0x34d, v0
-; VI-NEXT: v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_or_b32_e32 v0, v1, v0
+; VI-NEXT: v_add_u16_e32 v2, 0x34d, v0
+; VI-NEXT: v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v0, v2, v0
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
;
; VI-LABEL: v_test_sub_v2i16_inline_neg1:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT: v_mov_b32_e32 v2, 1
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s7
-; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
-; VI-NEXT: s_mov_b32 s0, s4
-; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: v_mov_b32_e32 v1, 1
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_add_u16_e32 v1, 1, v0
-; VI-NEXT: v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_or_b32_e32 v0, v1, v0
+; VI-NEXT: v_add_u16_e32 v2, 1, v0
+; VI-NEXT: v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v0, v2, v0
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
;
; VI-LABEL: v_test_sub_v2i16_inline_lo_zero_hi:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s7
-; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
-; VI-NEXT: s_mov_b32 s0, s4
-; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
; VI-NEXT: v_subrev_u16_e32 v0, 32, v0
; GFX9-LABEL: v_test_sub_v2i16_zext_to_v2i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: v_mov_b32_e32 v3, s9
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s8, v2
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: global_load_dword v0, v[0:1], off
; GFX9-NEXT: global_load_dword v1, v[2:3], off
-; GFX9-NEXT: s_mov_b32 s0, s4
-; GFX9-NEXT: s_mov_b32 s1, s5
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_sub_i16 v0, v0, v1
; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_test_sub_v2i16_zext_to_v2i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s9
-; VI-NEXT: v_add_u32_e32 v2, vcc, s8, v2
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v1, v[0:1]
; VI-NEXT: flat_load_dword v2, v[2:3]
-; VI-NEXT: s_mov_b32 s0, s4
-; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_sub_u16_e32 v0, v1, v2
; VI-NEXT: v_sub_u16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.out = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %out, i32 %tid
; GFX9-LABEL: v_test_sub_v2i16_zext_to_v2i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: v_mov_b32_e32 v3, s9
-; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s8, v2
-; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v3, vcc
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: global_load_dword v0, v[0:1], off
-; GFX9-NEXT: global_load_dword v1, v[4:5], off
-; GFX9-NEXT: s_mov_b32 s0, s4
-; GFX9-NEXT: s_mov_b32 s1, s5
+; GFX9-NEXT: global_load_dword v1, v[2:3], off
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: v_mov_b32_e32 v3, 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_sub_i16 v1, v0, v1
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1
; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v1
; GFX9-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_test_sub_v2i16_zext_to_v2i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s9
-; VI-NEXT: v_add_u32_e32 v2, vcc, s8, v2
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v4, v[0:1]
; VI-NEXT: flat_load_dword v2, v[2:3]
; VI-NEXT: v_mov_b32_e32 v1, 0
-; VI-NEXT: s_mov_b32 s0, s4
-; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v3, v1
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_sub_u16_e32 v0, v4, v2
; VI-NEXT: v_sub_u16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.out = getelementptr inbounds <2 x i64>, <2 x i64> addrspace(1)* %out, i32 %tid
; GFX9-LABEL: v_test_sub_v2i16_sext_to_v2i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: v_mov_b32_e32 v3, s9
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s8, v2
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: global_load_dword v0, v[0:1], off
; GFX9-NEXT: global_load_dword v1, v[2:3], off
-; GFX9-NEXT: s_mov_b32 s0, s4
-; GFX9-NEXT: s_mov_b32 s1, s5
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_sub_i16 v0, v0, v1
; GFX9-NEXT: v_ashrrev_i32_e32 v1, 16, v0
; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_test_sub_v2i16_sext_to_v2i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s9
-; VI-NEXT: v_add_u32_e32 v2, vcc, s8, v2
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: flat_load_dword v1, v[2:3]
-; VI-NEXT: s_mov_b32 s0, s4
-; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_sub_u16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_sub_u16_e32 v0, v0, v1
; VI-NEXT: v_bfe_i32 v0, v0, 0, 16
; VI-NEXT: v_bfe_i32 v1, v2, 0, 16
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.out = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %out, i32 %tid
; GFX9-LABEL: v_test_sub_v2i16_sext_to_v2i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: v_mov_b32_e32 v3, s9
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s8, v2
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: global_load_dword v0, v[0:1], off
; GFX9-NEXT: global_load_dword v1, v[2:3], off
-; GFX9-NEXT: s_mov_b32 s0, s4
-; GFX9-NEXT: s_mov_b32 s1, s5
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_sub_i16 v1, v0, v1
; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v1
; GFX9-NEXT: v_bfe_i32 v2, v2, 0, 16
; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v2
-; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_test_sub_v2i16_sext_to_v2i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s9
-; VI-NEXT: v_add_u32_e32 v2, vcc, s8, v2
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: flat_load_dword v1, v[2:3]
-; VI-NEXT: s_mov_b32 s0, s4
-; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_sub_u16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_sub_u16_e32 v0, v0, v1
; VI-NEXT: v_bfe_i32 v2, v2, 0, 16
; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; VI-NEXT: v_ashrrev_i32_e32 v3, 31, v2
-; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.out = getelementptr inbounds <2 x i64>, <2 x i64> addrspace(1)* %out, i32 %tid
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_load_dword s2, s[4:5], 0x0
+; VI-NEXT: s_load_dword s3, s[6:7], 0x0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_load_dword s0, s[4:5], 0x0
-; VI-NEXT: s_load_dword s1, s[6:7], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_sext_i32_i16 s0, s0
-; VI-NEXT: s_sext_i32_i16 s1, s1
+; VI-NEXT: s_sext_i32_i16 s0, s2
+; VI-NEXT: s_sext_i32_i16 s1, s3
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_mul_i32_i24_e32 v2, s1, v2
; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
define amdgpu_kernel void @s_test_udiv24_k_den_i64(i64 addrspace(1)* %out, i64 %x) {
; GCN-LABEL: s_test_udiv24_k_den_i64:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s6, 0x46b6fe00
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: s_lshr_b32 s0, s7, 8
-; GCN-NEXT: v_cvt_f32_u32_e32 v0, s0
-; GCN-NEXT: s_mov_b32 s0, s4
-; GCN-NEXT: s_mov_b32 s1, s5
+; GCN-NEXT: s_lshr_b32 s2, s3, 8
+; GCN-NEXT: v_cvt_f32_u32_e32 v0, s2
+; GCN-NEXT: s_mov_b32 s2, 0x46b6fe00
+; GCN-NEXT: s_mov_b32 s4, s0
+; GCN-NEXT: s_mov_b32 s5, s1
; GCN-NEXT: v_mul_f32_e32 v1, 0x38331158, v0
; GCN-NEXT: v_trunc_f32_e32 v1, v1
; GCN-NEXT: v_cvt_u32_f32_e32 v2, v1
-; GCN-NEXT: v_mad_f32 v0, -v1, s6, v0
-; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, s6
+; GCN-NEXT: v_mad_f32 v0, -v1, s2, v0
+; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, s2
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc
; GCN-NEXT: v_and_b32_e32 v0, 0xffffff, v0
-; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GCN-NEXT: s_endpgm
;
; GCN-IR-LABEL: s_test_udiv24_k_den_i64:
; GCN-IR: ; %bb.0:
-; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN-IR-NEXT: s_mov_b32 s7, 0xf000
+; GCN-IR-NEXT: s_mov_b32 s6, -1
; GCN-IR-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT: s_mov_b32 s6, 0x46b6fe00
-; GCN-IR-NEXT: s_mov_b32 s3, 0xf000
-; GCN-IR-NEXT: s_mov_b32 s2, -1
-; GCN-IR-NEXT: s_lshr_b32 s0, s7, 8
-; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s0
-; GCN-IR-NEXT: s_mov_b32 s0, s4
-; GCN-IR-NEXT: s_mov_b32 s1, s5
+; GCN-IR-NEXT: s_lshr_b32 s2, s3, 8
+; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s2
+; GCN-IR-NEXT: s_mov_b32 s2, 0x46b6fe00
+; GCN-IR-NEXT: s_mov_b32 s4, s0
+; GCN-IR-NEXT: s_mov_b32 s5, s1
; GCN-IR-NEXT: v_mul_f32_e32 v1, 0x38331158, v0
; GCN-IR-NEXT: v_trunc_f32_e32 v1, v1
; GCN-IR-NEXT: v_cvt_u32_f32_e32 v2, v1
-; GCN-IR-NEXT: v_mad_f32 v0, -v1, s6, v0
-; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, s6
+; GCN-IR-NEXT: v_mad_f32 v0, -v1, s2, v0
+; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, s2
; GCN-IR-NEXT: v_mov_b32_e32 v1, 0
; GCN-IR-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc
; GCN-IR-NEXT: v_and_b32_e32 v0, 0xffffff, v0
-; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GCN-IR-NEXT: s_endpgm
%x.shr = lshr i64 %x, 40
%result = udiv i64 %x.shr, 23423
define amdgpu_kernel void @s_test_urem24_k_den_i64(i64 addrspace(1)* %out, i64 %x) {
; GCN-LABEL: s_test_urem24_k_den_i64:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GCN-NEXT: s_mov_b32 s1, 0x46b6fe00
-; GCN-NEXT: s_movk_i32 s0, 0x5b7f
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN-NEXT: s_mov_b32 s4, 0x46b6fe00
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_lshr_b32 s6, s7, 8
-; GCN-NEXT: v_cvt_f32_u32_e32 v0, s6
+; GCN-NEXT: s_lshr_b32 s2, s3, 8
+; GCN-NEXT: v_cvt_f32_u32_e32 v0, s2
+; GCN-NEXT: s_movk_i32 s3, 0x5b7f
+; GCN-NEXT: s_mov_b32 s5, s1
; GCN-NEXT: v_mul_f32_e32 v1, 0x38331158, v0
; GCN-NEXT: v_trunc_f32_e32 v1, v1
; GCN-NEXT: v_cvt_u32_f32_e32 v2, v1
-; GCN-NEXT: v_mad_f32 v0, -v1, s1, v0
-; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, s1
-; GCN-NEXT: s_mov_b32 s1, s5
+; GCN-NEXT: v_mad_f32 v0, -v1, s4, v0
+; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, s4
+; GCN-NEXT: s_mov_b32 s4, s0
; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc
-; GCN-NEXT: v_mul_lo_u32 v0, v0, s0
-; GCN-NEXT: s_mov_b32 s0, s4
+; GCN-NEXT: v_mul_lo_u32 v0, v0, s3
; GCN-NEXT: v_mov_b32_e32 v1, 0
-; GCN-NEXT: v_sub_i32_e32 v0, vcc, s6, v0
+; GCN-NEXT: v_sub_i32_e32 v0, vcc, s2, v0
; GCN-NEXT: v_and_b32_e32 v0, 0xffffff, v0
-; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GCN-NEXT: s_endpgm
;
; GCN-IR-LABEL: s_test_urem24_k_den_i64:
; GCN-IR: ; %bb.0:
-; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GCN-IR-NEXT: s_mov_b32 s1, 0x46b6fe00
-; GCN-IR-NEXT: s_movk_i32 s0, 0x5b7f
-; GCN-IR-NEXT: s_mov_b32 s3, 0xf000
-; GCN-IR-NEXT: s_mov_b32 s2, -1
+; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN-IR-NEXT: s_mov_b32 s4, 0x46b6fe00
+; GCN-IR-NEXT: s_mov_b32 s7, 0xf000
+; GCN-IR-NEXT: s_mov_b32 s6, -1
; GCN-IR-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT: s_lshr_b32 s6, s7, 8
-; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s6
+; GCN-IR-NEXT: s_lshr_b32 s2, s3, 8
+; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s2
+; GCN-IR-NEXT: s_movk_i32 s3, 0x5b7f
+; GCN-IR-NEXT: s_mov_b32 s5, s1
; GCN-IR-NEXT: v_mul_f32_e32 v1, 0x38331158, v0
; GCN-IR-NEXT: v_trunc_f32_e32 v1, v1
; GCN-IR-NEXT: v_cvt_u32_f32_e32 v2, v1
-; GCN-IR-NEXT: v_mad_f32 v0, -v1, s1, v0
-; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, s1
-; GCN-IR-NEXT: s_mov_b32 s1, s5
+; GCN-IR-NEXT: v_mad_f32 v0, -v1, s4, v0
+; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, s4
+; GCN-IR-NEXT: s_mov_b32 s4, s0
; GCN-IR-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc
-; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s0
-; GCN-IR-NEXT: s_mov_b32 s0, s4
+; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s3
; GCN-IR-NEXT: v_mov_b32_e32 v1, 0
-; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s6, v0
+; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s2, v0
; GCN-IR-NEXT: v_and_b32_e32 v0, 0xffffff, v0
-; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GCN-IR-NEXT: s_endpgm
%x.shr = lshr i64 %x, 40
%result = urem i64 %x.shr, 23423
; SI-NEXT: s_mov_b32 s9, s11
; SI-NEXT: s_mov_b32 s10, s2
; SI-NEXT: s_mov_b32 s11, s3
-; SI-NEXT: buffer_load_ushort v0, off, s[16:19], 0
-; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0
; SI-NEXT: s_mov_b32 s14, s2
; SI-NEXT: s_mov_b32 s15, s3
-; SI-NEXT: buffer_load_ushort v3, off, s[12:15], 0
-; SI-NEXT: v_mov_b32_e32 v2, 0x41200000
+; SI-NEXT: buffer_load_ushort v0, off, s[16:19], 0
+; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0
+; SI-NEXT: buffer_load_ushort v2, off, s[12:15], 0
+; SI-NEXT: v_mov_b32_e32 v3, 0x41200000
; SI-NEXT: s_mov_b32 s0, s4
; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: s_mov_b32 s8, s6
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-NEXT: v_madak_f32 v1, v0, v1, 0x41200000
-; SI-NEXT: v_mac_f32_e32 v2, v0, v3
+; SI-NEXT: v_mac_f32_e32 v3, v0, v2
; SI-NEXT: v_cvt_f16_f32_e32 v0, v1
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v2
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v3
; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
; SI-NEXT: buffer_store_short v1, off, s[8:11], 0
; SI-NEXT: s_endpgm
; VI-NEXT: s_mov_b32 s9, s11
; VI-NEXT: s_mov_b32 s10, s2
; VI-NEXT: s_mov_b32 s11, s3
-; VI-NEXT: buffer_load_ushort v0, off, s[16:19], 0
-; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0
; VI-NEXT: s_mov_b32 s14, s2
; VI-NEXT: s_mov_b32 s15, s3
-; VI-NEXT: buffer_load_ushort v3, off, s[12:15], 0
-; VI-NEXT: v_mov_b32_e32 v2, 0x4900
+; VI-NEXT: buffer_load_ushort v0, off, s[16:19], 0
+; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0
+; VI-NEXT: buffer_load_ushort v2, off, s[12:15], 0
+; VI-NEXT: v_mov_b32_e32 v3, 0x4900
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
-; VI-NEXT: s_mov_b32 s4, s6
-; VI-NEXT: s_mov_b32 s5, s7
-; VI-NEXT: s_mov_b32 s6, s2
-; VI-NEXT: s_mov_b32 s7, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_madak_f16 v1, v0, v1, 0x4900
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mac_f16_e32 v2, v0, v3
+; VI-NEXT: v_mac_f16_e32 v3, v0, v2
; VI-NEXT: buffer_store_short v1, off, s[0:3], 0
-; VI-NEXT: buffer_store_short v2, off, s[4:7], 0
+; VI-NEXT: buffer_store_short v3, off, s[8:11], 0
; VI-NEXT: s_endpgm
half addrspace(1)* %r0,
half addrspace(1)* %r1,
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd
-; GCN-NEXT: v_mov_b32_e32 v2, 0
+; GCN-NEXT: v_mov_b32_e32 v5, 0
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b64 s[0:1], s[6:7]
-; GCN-NEXT: v_lshlrev_b32_e32 v1, 4, v0
-; GCN-NEXT: v_lshlrev_b32_e32 v4, 2, v0
-; GCN-NEXT: v_mov_b32_e32 v5, v2
-; GCN-NEXT: buffer_load_dwordx4 v[0:3], v[1:2], s[0:3], 0 addr64
-; GCN-NEXT: v_mov_b32_e32 v6, s8
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 4, v0
+; GCN-NEXT: buffer_load_dwordx4 v[1:4], v[4:5], s[0:3], 0 addr64
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 2, v0
+; GCN-NEXT: v_mov_b32_e32 v0, s8
; GCN-NEXT: v_cmp_eq_u32_e64 vcc, s9, 3
+; GCN-NEXT: v_mov_b32_e32 v7, v5
; GCN-NEXT: s_mov_b64 s[6:7], s[2:3]
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc
; GCN-NEXT: v_cmp_eq_u32_e64 vcc, s9, 2
-; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc
; GCN-NEXT: v_cmp_eq_u32_e64 vcc, s9, 1
-; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
; GCN-NEXT: v_cmp_eq_u32_e64 vcc, s9, 0
-; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
; GCN-NEXT: v_cmp_eq_u32_e64 vcc, s10, 1
-; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GCN-NEXT: v_cmp_eq_u32_e64 vcc, s10, 2
; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GCN-NEXT: v_cmp_eq_u32_e64 vcc, s10, 3
+; GCN-NEXT: v_cmp_eq_u32_e64 vcc, s10, 2
; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
-; GCN-NEXT: buffer_store_dword v0, v[4:5], s[4:7], 0 addr64
+; GCN-NEXT: v_cmp_eq_u32_e64 vcc, s10, 3
+; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
+; GCN-NEXT: buffer_store_dword v0, v[6:7], s[4:7], 0 addr64
; GCN-NEXT: s_endpgm
%id = call i32 @llvm.amdgcn.workitem.id.x()
%id.ext = sext i32 %id to i64
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dword v2, v[2:3], off
; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4
-; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff
+; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff
; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_and_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX9-NEXT: v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0
; GFX9-NEXT: v_mov_b32_e32 v1, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off
; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4
-; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff
+; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_and_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v3
; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
; SI-LABEL: widen_i16_constant_load:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
-; SI-NEXT: s_mov_b32 s4, 0
-; SI-NEXT: s_mov_b32 s7, 0xf000
-; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: s_mov_b32 s5, s4
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_load_dword s0, s[0:1], 0x0
+; SI-NEXT: s_load_dword s1, s[0:1], 0x0
+; SI-NEXT: s_mov_b32 s0, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_addk_i32 s0, 0x3e7
-; SI-NEXT: s_or_b32 s0, s0, 4
-; SI-NEXT: v_mov_b32_e32 v0, s0
-; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
+; SI-NEXT: s_addk_i32 s1, 0x3e7
+; SI-NEXT: s_or_b32 s4, s1, 4
+; SI-NEXT: s_mov_b32 s1, s0
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: widen_i16_constant_load:
; SI-LABEL: widen_i16_constant_load_zext_i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
-; SI-NEXT: s_mov_b32 s4, 0
-; SI-NEXT: s_mov_b32 s7, 0xf000
-; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: s_mov_b32 s5, s4
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_load_dword s0, s[0:1], 0x0
+; SI-NEXT: s_load_dword s1, s[0:1], 0x0
+; SI-NEXT: s_mov_b32 s0, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_and_b32 s0, s0, 0xffff
-; SI-NEXT: s_addk_i32 s0, 0x3e7
-; SI-NEXT: s_or_b32 s0, s0, 4
-; SI-NEXT: v_mov_b32_e32 v0, s0
-; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: s_and_b32 s1, s1, 0xffff
+; SI-NEXT: s_addk_i32 s1, 0x3e7
+; SI-NEXT: s_or_b32 s4, s1, 4
+; SI-NEXT: s_mov_b32 s1, s0
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: widen_i16_constant_load_zext_i32:
; SI-LABEL: widen_i16_constant_load_sext_i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
-; SI-NEXT: s_mov_b32 s4, 0
-; SI-NEXT: s_mov_b32 s7, 0xf000
-; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: s_mov_b32 s5, s4
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_load_dword s0, s[0:1], 0x0
+; SI-NEXT: s_load_dword s1, s[0:1], 0x0
+; SI-NEXT: s_mov_b32 s0, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_sext_i32_i16 s0, s0
-; SI-NEXT: s_addk_i32 s0, 0x3e7
-; SI-NEXT: s_or_b32 s0, s0, 4
-; SI-NEXT: v_mov_b32_e32 v0, s0
-; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: s_sext_i32_i16 s1, s1
+; SI-NEXT: s_addk_i32 s1, 0x3e7
+; SI-NEXT: s_or_b32 s4, s1, 4
+; SI-NEXT: s_mov_b32 s1, s0
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: widen_i16_constant_load_sext_i32:
define amdgpu_kernel void @widen_i17_constant_load(i17 addrspace(4)* %arg) {
; SI-LABEL: widen_i17_constant_load:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9
+; SI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s0, 0
-; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_mov_b32 s1, s0
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_load_dword s7, s[8:9], 0x0
+; SI-NEXT: s_load_dword s7, s[6:7], 0x0
; SI-NEXT: s_mov_b32 s4, 2
; SI-NEXT: s_mov_b32 s5, s0
; SI-NEXT: s_mov_b32 s6, s2
define amdgpu_kernel void @widen_v2i8_constant_load(<2 x i8> addrspace(4)* %arg) {
; SI-LABEL: widen_v2i8_constant_load:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
-; SI-NEXT: s_mov_b32 s4, 0
-; SI-NEXT: s_mov_b32 s7, 0xf000
-; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: s_mov_b32 s5, s4
+; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s0, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_load_dword s0, s[0:1], 0x0
+; SI-NEXT: s_load_dword s1, s[2:3], 0x0
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_and_b32 s1, s0, 0xff00
-; SI-NEXT: s_add_i32 s0, s0, 12
-; SI-NEXT: s_or_b32 s0, s0, 4
-; SI-NEXT: s_and_b32 s0, s0, 0xff
-; SI-NEXT: s_or_b32 s0, s1, s0
-; SI-NEXT: s_addk_i32 s0, 0x2c00
-; SI-NEXT: s_or_b32 s0, s0, 0x300
-; SI-NEXT: v_mov_b32_e32 v0, s0
-; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
+; SI-NEXT: s_and_b32 s4, s1, 0xff00
+; SI-NEXT: s_add_i32 s1, s1, 12
+; SI-NEXT: s_or_b32 s1, s1, 4
+; SI-NEXT: s_and_b32 s1, s1, 0xff
+; SI-NEXT: s_or_b32 s1, s4, s1
+; SI-NEXT: s_addk_i32 s1, 0x2c00
+; SI-NEXT: s_or_b32 s4, s1, 0x300
+; SI-NEXT: s_mov_b32 s1, s0
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: widen_v2i8_constant_load:
; SI-LABEL: widen_i1_constant_load:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
-; SI-NEXT: s_mov_b32 s4, 0
-; SI-NEXT: s_mov_b32 s7, 0xf000
-; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: s_mov_b32 s5, s4
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_load_dword s0, s[0:1], 0x0
+; SI-NEXT: s_load_dword s1, s[0:1], 0x0
+; SI-NEXT: s_mov_b32 s0, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_and_b32 s0, s0, 1
-; SI-NEXT: v_mov_b32_e32 v0, s0
-; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
+; SI-NEXT: s_and_b32 s4, s1, 1
+; SI-NEXT: s_mov_b32 s1, s0
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: widen_i1_constant_load:
; SI-LABEL: widen_i16_zextload_i64_constant_load:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
-; SI-NEXT: s_mov_b32 s4, 0
-; SI-NEXT: s_mov_b32 s7, 0xf000
-; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: s_mov_b32 s5, s4
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_load_dword s0, s[0:1], 0x0
+; SI-NEXT: s_load_dword s1, s[0:1], 0x0
+; SI-NEXT: s_mov_b32 s0, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_and_b32 s0, s0, 0xffff
-; SI-NEXT: s_addk_i32 s0, 0x3e7
-; SI-NEXT: s_or_b32 s0, s0, 4
-; SI-NEXT: v_mov_b32_e32 v0, s0
-; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: s_and_b32 s1, s1, 0xffff
+; SI-NEXT: s_addk_i32 s1, 0x3e7
+; SI-NEXT: s_or_b32 s4, s1, 4
+; SI-NEXT: s_mov_b32 s1, s0
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: widen_i16_zextload_i64_constant_load:
; SI-LABEL: widen_i1_zext_to_i64_constant_load:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
-; SI-NEXT: s_mov_b32 s4, 0
-; SI-NEXT: s_mov_b32 s7, 0xf000
-; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: s_mov_b32 s5, s4
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_load_dword s0, s[0:1], 0x0
+; SI-NEXT: s_load_dword s1, s[0:1], 0x0
+; SI-NEXT: s_mov_b32 s0, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_and_b32 s0, s0, 1
-; SI-NEXT: s_add_u32 s0, s0, 0x3e7
-; SI-NEXT: s_addc_u32 s1, 0, 0
-; SI-NEXT: v_mov_b32_e32 v0, s0
-; SI-NEXT: v_mov_b32_e32 v1, s1
-; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT: s_and_b32 s1, s1, 1
+; SI-NEXT: s_add_u32 s4, s1, 0x3e7
+; SI-NEXT: s_addc_u32 s5, 0, 0
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: s_mov_b32 s1, s0
+; SI-NEXT: v_mov_b32_e32 v1, s5
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: widen_i1_zext_to_i64_constant_load:
; SI-LABEL: widen_i16_global_invariant_load:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
-; SI-NEXT: s_mov_b32 s4, 0
-; SI-NEXT: s_mov_b32 s7, 0xf000
-; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: s_mov_b32 s5, s4
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_load_dword s0, s[0:1], 0x0
+; SI-NEXT: s_load_dword s1, s[0:1], 0x0
+; SI-NEXT: s_mov_b32 s0, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_addk_i32 s0, 0x3e7
-; SI-NEXT: s_or_b32 s0, s0, 1
-; SI-NEXT: v_mov_b32_e32 v0, s0
-; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
+; SI-NEXT: s_addk_i32 s1, 0x3e7
+; SI-NEXT: s_or_b32 s4, s1, 1
+; SI-NEXT: s_mov_b32 s1, s0
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: widen_i16_global_invariant_load:
ret void
; CHECK-LABEL: test2
; CHECK: addi 3, 3, 8
-; CHECK: lxvx [[LD:[0-9]+]], 0, 3
; CHECK: addi [[REG:[0-9]+]], 4, 4
+; CHECK: lxvx [[LD:[0-9]+]], 0, 3
; CHECK: stxvx [[LD]], 0, [[REG]]
}
--- /dev/null
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr9 -run-pass=machine-scheduler -o - %s | FileCheck %s
+---
+# Check that the machine-scheduler's BotHeightReduce heuristic places the
+# "LD 8" load in between the final run of MULLDs and the LDXs that feed them,
+# to try to hide the latency of those LDXs.
+name: test
+tracksRegLiveness: true
+body: |
+ ; CHECK-LABEL: name: test
+ ; CHECK: bb.0:
+ ; CHECK: successors: %bb.1(0x80000000)
+ ; CHECK: liveins: $x3, $x4
+ ; CHECK: [[COPY:%[0-9]+]]:g8rc_and_g8rc_nox0 = COPY $x4
+ ; CHECK: [[COPY1:%[0-9]+]]:g8rc_and_g8rc_nox0 = COPY $x3
+ ; CHECK: [[ADDI8_:%[0-9]+]]:g8rc_and_g8rc_nox0 = ADDI8 [[COPY1]], 1
+ ; CHECK: [[CMPLDI:%[0-9]+]]:crrc = CMPLDI [[COPY]], 1
+ ; CHECK: [[LI8_:%[0-9]+]]:g8rc_and_g8rc_nox0 = LI8 1
+ ; CHECK: [[ISEL8_:%[0-9]+]]:g8rc = ISEL8 [[COPY]], [[LI8_]], [[CMPLDI]].sub_gt
+ ; CHECK: MTCTR8loop [[ISEL8_]], implicit-def dead $ctr8
+ ; CHECK: [[LI8_1:%[0-9]+]]:g8rc = LI8 0
+ ; CHECK: [[LI8_2:%[0-9]+]]:g8rc = LI8 2
+ ; CHECK: [[LI8_3:%[0-9]+]]:g8rc = LI8 3
+ ; CHECK: [[LI8_4:%[0-9]+]]:g8rc = LI8 5
+ ; CHECK: [[LI8_5:%[0-9]+]]:g8rc = LI8 6
+ ; CHECK: [[LI8_6:%[0-9]+]]:g8rc = LI8 7
+ ; CHECK: bb.1:
+ ; CHECK: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; CHECK: [[ADDI8_1:%[0-9]+]]:g8rc = ADDI8 [[ADDI8_]], 1
+ ; CHECK: [[LD:%[0-9]+]]:g8rc = LD 0, [[ADDI8_]] :: (load 8)
+ ; CHECK: [[LDX:%[0-9]+]]:g8rc = LDX [[ADDI8_]], [[LI8_]] :: (load 8)
+ ; CHECK: [[LDX1:%[0-9]+]]:g8rc = LDX [[ADDI8_]], [[LI8_3]] :: (load 8)
+ ; CHECK: [[LD1:%[0-9]+]]:g8rc = LD 4, [[ADDI8_]] :: (load 8)
+ ; CHECK: [[LDX2:%[0-9]+]]:g8rc = LDX [[ADDI8_]], [[LI8_4]] :: (load 8)
+ ; CHECK: [[LDX3:%[0-9]+]]:g8rc = LDX [[ADDI8_]], [[LI8_5]] :: (load 8)
+ ; CHECK: [[LDX4:%[0-9]+]]:g8rc = LDX [[ADDI8_]], [[LI8_6]] :: (load 8)
+ ; CHECK: [[LDX5:%[0-9]+]]:g8rc = LDX [[ADDI8_]], [[LI8_2]] :: (load 8)
+ ; CHECK: [[MULLD:%[0-9]+]]:g8rc = MULLD [[LDX]], [[LD]]
+ ; CHECK: [[LD2:%[0-9]+]]:g8rc = LD 8, [[ADDI8_]] :: (load 8)
+ ; CHECK: [[MULLD1:%[0-9]+]]:g8rc = MULLD [[MULLD]], [[LDX5]]
+ ; CHECK: [[MULLD2:%[0-9]+]]:g8rc = MULLD [[MULLD1]], [[LDX1]]
+ ; CHECK: [[MULLD3:%[0-9]+]]:g8rc = MULLD [[MULLD2]], [[LD1]]
+ ; CHECK: [[MULLD4:%[0-9]+]]:g8rc = MULLD [[MULLD3]], [[LDX2]]
+ ; CHECK: [[MULLD5:%[0-9]+]]:g8rc = MULLD [[MULLD4]], [[LDX3]]
+ ; CHECK: [[MULLD6:%[0-9]+]]:g8rc = MULLD [[MULLD5]], [[LDX4]]
+ ; CHECK: [[MADDLD8_:%[0-9]+]]:g8rc = MADDLD8 [[MULLD6]], [[LD2]], [[MADDLD8_]]
+ ; CHECK: [[COPY2:%[0-9]+]]:g8rc_and_g8rc_nox0 = COPY [[ADDI8_1]]
+ ; CHECK: BDNZ8 %bb.1, implicit-def dead $ctr8, implicit $ctr8
+ ; CHECK: B %bb.2
+ ; CHECK: bb.2:
+ bb.0:
+ liveins: $x3, $x4
+
+ %0:g8rc_and_g8rc_nox0 = COPY $x4
+ %1:g8rc_and_g8rc_nox0 = COPY $x3
+ %2:g8rc_and_g8rc_nox0 = ADDI8 %1, 1
+ %3:crrc = CMPLDI %0, 1
+ %4:g8rc_and_g8rc_nox0 = LI8 1
+ %5:g8rc = ISEL8 %0, %4, %3.sub_gt
+ MTCTR8loop %5, implicit-def dead $ctr8
+ %6:g8rc = LI8 0
+ %7:g8rc = LI8 2
+ %8:g8rc = LI8 3
+ %9:g8rc = LI8 5
+ %10:g8rc = LI8 6
+ %11:g8rc = LI8 7
+
+ bb.1:
+ %12:g8rc = ADDI8 %2, 1
+ %13:g8rc = LD 0, %2 :: (load 8)
+ %14:g8rc = LDX %2, %4 :: (load 8)
+ %16:g8rc = LDX %2, %8 :: (load 8)
+ %17:g8rc = LD 4, %2 :: (load 8)
+ %18:g8rc = LDX %2, %9 :: (load 8)
+ %19:g8rc = LDX %2, %10 :: (load 8)
+ %20:g8rc = LDX %2, %11 :: (load 8)
+ %21:g8rc = LD 8, %2 :: (load 8)
+ %22:g8rc = MULLD %14, %13
+ %15:g8rc = LDX %2, %7 :: (load 8)
+ %23:g8rc = MULLD %22, %15
+ %24:g8rc = MULLD %23, %16
+ %25:g8rc = MULLD %24, %17
+ %26:g8rc = MULLD %25, %18
+ %27:g8rc = MULLD %26, %19
+ %28:g8rc = MULLD %27, %20
+ %6:g8rc = MADDLD8 %28, %21, %6
+ %2:g8rc_and_g8rc_nox0 = COPY %12
+ BDNZ8 %bb.1, implicit-def dead $ctr8, implicit $ctr8
+ B %bb.2
+
+ bb.2:
+...
; CHECK-P9-LABEL: no_RAUW_in_combine_during_legalize:
; CHECK-P9: # %bb.0: # %entry
; CHECK-P9-NEXT: sldi r4, r4, 2
+; CHECK-P9-NEXT: xxlxor v4, v4, v4
; CHECK-P9-NEXT: lxsiwzx v2, r3, r4
; CHECK-P9-NEXT: addis r3, r2, .LCPI16_0@toc@ha
; CHECK-P9-NEXT: addi r3, r3, .LCPI16_0@toc@l
; CHECK-P9-NEXT: lxvx v3, 0, r3
-; CHECK-P9-NEXT: xxlxor v4, v4, v4
; CHECK-P9-NEXT: vperm v2, v4, v2, v3
; CHECK-P9-NEXT: blr
;
; CHECK-LABEL: test1:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: li 5, -13
-; CHECK-NEXT: lxvx 0, 3, 5
-; CHECK-NEXT: li 5, 19
-; CHECK-NEXT: lxvx 1, 3, 5
-; CHECK-NEXT: li 5, 3
; CHECK-NEXT: li 6, 7
; CHECK-NEXT: li 7, 11
; CHECK-NEXT: li 8, 15
-; CHECK-NEXT: mfvsrld 9, 0
-; CHECK-NEXT: ldx 5, 3, 5
+; CHECK-NEXT: lxvx 0, 3, 5
+; CHECK-NEXT: li 5, 19
; CHECK-NEXT: ldx 6, 3, 6
; CHECK-NEXT: ldx 7, 3, 7
+; CHECK-NEXT: lxvx 1, 3, 5
+; CHECK-NEXT: li 5, 3
+; CHECK-NEXT: ldx 5, 3, 5
; CHECK-NEXT: ldx 3, 3, 8
+; CHECK-NEXT: mfvsrld 9, 0
; CHECK-NEXT: mffprd 8, 0
; CHECK-NEXT: mfvsrld 10, 1
; CHECK-NEXT: mffprd 11, 1
; CHECK-P9-BE-LABEL: test_consecutive_i32:
; CHECK-P9-BE: # %bb.0: # %entry
; CHECK-P9-BE-NEXT: xxsldwi vs0, vs34, vs34, 3
+; CHECK-P9-BE-NEXT: li r3, 4
; CHECK-P9-BE-NEXT: stfiwx f0, 0, r5
; CHECK-P9-BE-NEXT: xxsldwi vs0, vs34, vs34, 1
-; CHECK-P9-BE-NEXT: li r3, 4
; CHECK-P9-BE-NEXT: stfiwx f0, r5, r3
; CHECK-P9-BE-NEXT: blr
entry:
; CHECK-P9-LABEL: test_consecutive_float:
; CHECK-P9: # %bb.0: # %entry
; CHECK-P9-NEXT: xxsldwi vs0, vs34, vs34, 1
+; CHECK-P9-NEXT: li r3, 4
; CHECK-P9-NEXT: stfiwx f0, 0, r5
; CHECK-P9-NEXT: xxsldwi vs0, vs34, vs34, 3
-; CHECK-P9-NEXT: li r3, 4
; CHECK-P9-NEXT: stfiwx f0, r5, r3
; CHECK-P9-NEXT: blr
;
; CHECK-P9-LABEL: test_stores_exceed_vec_size:
; CHECK-P9: # %bb.0: # %entry
; CHECK-P9-NEXT: addis r3, r2, .LCPI16_0@toc@ha
+; CHECK-P9-NEXT: xxsldwi vs0, vs34, vs34, 1
; CHECK-P9-NEXT: addi r3, r3, .LCPI16_0@toc@l
; CHECK-P9-NEXT: lxvx vs35, 0, r3
-; CHECK-P9-NEXT: xxsldwi vs0, vs34, vs34, 1
; CHECK-P9-NEXT: li r3, 16
; CHECK-P9-NEXT: stfiwx f0, r5, r3
; CHECK-P9-NEXT: li r3, 20
; CHECK-P9-BE-LABEL: test_stores_exceed_vec_size:
; CHECK-P9-BE: # %bb.0: # %entry
; CHECK-P9-BE-NEXT: xxspltw vs0, vs34, 0
-; CHECK-P9-BE-NEXT: xxsldwi vs0, vs34, vs0, 2
; CHECK-P9-BE-NEXT: li r3, 16
; CHECK-P9-BE-NEXT: stxsiwx vs34, r5, r3
; CHECK-P9-BE-NEXT: li r3, 20
+; CHECK-P9-BE-NEXT: xxsldwi vs0, vs34, vs0, 2
; CHECK-P9-BE-NEXT: stxv vs0, 0(r5)
; CHECK-P9-BE-NEXT: xxsldwi vs0, vs34, vs34, 1
; CHECK-P9-BE-NEXT: stfiwx f0, r5, r3
; CHECK-P9-LABEL: test_5_consecutive_stores_of_bytes:
; CHECK-P9: # %bb.0: # %entry
; CHECK-P9-NEXT: vsldoi v3, v2, v2, 4
+; CHECK-P9-NEXT: li r3, 1
; CHECK-P9-NEXT: stxsibx vs35, 0, r5
; CHECK-P9-NEXT: vsldoi v3, v2, v2, 12
-; CHECK-P9-NEXT: li r3, 1
; CHECK-P9-NEXT: stxsibx vs35, r5, r3
; CHECK-P9-NEXT: vsldoi v3, v2, v2, 15
; CHECK-P9-NEXT: li r3, 2
; CHECK-P9-BE-LABEL: test_5_consecutive_stores_of_bytes:
; CHECK-P9-BE: # %bb.0: # %entry
; CHECK-P9-BE-NEXT: vsldoi v3, v2, v2, 13
+; CHECK-P9-BE-NEXT: li r3, 1
; CHECK-P9-BE-NEXT: stxsibx vs35, 0, r5
; CHECK-P9-BE-NEXT: vsldoi v3, v2, v2, 5
-; CHECK-P9-BE-NEXT: li r3, 1
; CHECK-P9-BE-NEXT: stxsibx vs35, r5, r3
; CHECK-P9-BE-NEXT: vsldoi v3, v2, v2, 2
; CHECK-P9-BE-NEXT: li r3, 2
; CHECK-P9-NEXT: li r3, 4
; CHECK-P9-NEXT: stxsibx vs35, r5, r3
; CHECK-P9-NEXT: vsldoi v3, v2, v2, 4
+; CHECK-P9-NEXT: li r3, 5
; CHECK-P9-NEXT: stxsibx vs35, 0, r5
; CHECK-P9-NEXT: vsldoi v3, v2, v2, 8
-; CHECK-P9-NEXT: li r3, 5
; CHECK-P9-NEXT: stxsibx vs35, r5, r3
; CHECK-P9-NEXT: vsldoi v3, v2, v2, 13
; CHECK-P9-NEXT: li r3, 6
; CHECK-P9-BE-NEXT: li r3, 4
; CHECK-P9-BE-NEXT: stxsibx vs35, r5, r3
; CHECK-P9-BE-NEXT: vsldoi v3, v2, v2, 13
+; CHECK-P9-BE-NEXT: li r3, 5
; CHECK-P9-BE-NEXT: stxsibx vs35, 0, r5
; CHECK-P9-BE-NEXT: vsldoi v3, v2, v2, 9
-; CHECK-P9-BE-NEXT: li r3, 5
; CHECK-P9-BE-NEXT: stxsibx vs35, r5, r3
; CHECK-P9-BE-NEXT: vsldoi v3, v2, v2, 4
; CHECK-P9-BE-NEXT: li r3, 6
; CHECK-P9-BE: # %bb.0: # %entry
; CHECK-P9-BE-NEXT: xxsldwi vs0, vs34, vs34, 3
; CHECK-P9-BE-NEXT: li r3, 4
-; CHECK-P9-BE-NEXT: stfiwx f0, r7, r3
; CHECK-P9-BE-NEXT: stxsiwx vs35, 0, r7
+; CHECK-P9-BE-NEXT: stfiwx f0, r7, r3
; CHECK-P9-BE-NEXT: blr
entry:
%vecext = extractelement <4 x i32> %a, i32 0
; CHECK-P9-BE-LABEL: test_elements_from_three_vec:
; CHECK-P9-BE: # %bb.0: # %entry
; CHECK-P9-BE-NEXT: xxsldwi vs0, vs34, vs34, 2
+; CHECK-P9-BE-NEXT: li r3, 4
; CHECK-P9-BE-NEXT: stfiwx f0, 0, r9
; CHECK-P9-BE-NEXT: xxsldwi vs0, vs35, vs35, 1
-; CHECK-P9-BE-NEXT: li r3, 4
; CHECK-P9-BE-NEXT: stfiwx f0, r9, r3
; CHECK-P9-BE-NEXT: li r3, 8
; CHECK-P9-BE-NEXT: stxsiwx vs36, r9, r3
; CHECK-LABEL: testMixedAggregate_03:
; CHECK: # %bb.0: # %entry
; CHECK: mtvsrwa v2, r3
-; CHECK: xscvsdqp v2, v2
-; CHECK: mtvsrdd v3, r6, r5
+; CHECK-DAG: xscvsdqp v2, v2
+; CHECK-DAG: mtvsrdd v3, r6, r5
; CHECK: xsaddqp v2, v3, v2
; CHECK: mtvsrd v[[REG1:[0-9]+]], r10
; CHECK: xscvsdqp v[[REG:[0-9]+]], v[[REG1]]
; CHECK-NEXT: bltlr cr0
; CHECK-NEXT: # %bb.1: # %if.end
; CHECK-NEXT: addi r3, r1, 40
+; CHECK-NEXT: addi [[REG2:r[0-9]+]], r1, 72
; CHECK-NEXT: lxvx v3, 0, r3
+; CHECK-NEXT: std [[REG2]], -8(r1)
; CHECK-NEXT: xsaddqp v2, v3, v2
; CHECK-NEXT: lxv v3, 16(r3)
; CHECK-NEXT: xsaddqp v2, v2, v3
-; CHECK-NEXT: addi [[REG2:r[0-9]+]], r1, 72
-; CHECK-NEXT: std [[REG2]], -8(r1)
; CHECK-NEXT: blr
entry:
%ap = alloca i8*, align 8
; CHECK-LABEL: qpConv2dp_03:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: addis r5, r2, .LC7@toc@ha
+; CHECK-NEXT: sldi r4, r4, 3
; CHECK-NEXT: ld r5, .LC7@toc@l(r5)
; CHECK-NEXT: lxvx v2, 0, r5
; CHECK-NEXT: xscvqpdp v2, v2
-; CHECK-NEXT: sldi r4, r4, 3
; CHECK-NEXT: stxsdx v2, r3, r4
; CHECK-NEXT: blr
entry:
; CHECK-LABEL: qpConv2sp_03:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: addis r5, r2, .LC7@toc@ha
+; CHECK-NEXT: sldi r4, r4, 2
; CHECK-NEXT: ld r5, .LC7@toc@l(r5)
; CHECK-NEXT: lxv v2, 48(r5)
; CHECK-NEXT: xscvqpdpo v2, v2
; CHECK-NEXT: xsrsp f0, v2
-; CHECK-NEXT: sldi r4, r4, 2
; CHECK-NEXT: stfsx f0, r3, r4
; CHECK-NEXT: blr
entry:
; CHECK: # %bb.0: # %entry
; CHECK: lwz r3, 96(r1)
; CHECK: add r4, r7, r9
+; CHECK: xscpsgndp v[[REG0:[0-9]+]], f1, f1
; CHECK: add r4, r4, r10
+; CHECK: xscvdpqp v[[REG0]], v[[REG0]]
; CHECK: add r3, r4, r3
; CHECK: clrldi r3, r3, 32
; CHECK: std r3, 0(r6)
; CHECK: lxv v[[REG1:[0-9]+]], 0(r8)
-; CHECK: xscpsgndp v[[REG0:[0-9]+]], f1, f1
-; CHECK: xscvdpqp v[[REG0]], v[[REG0]]
; CHECK: xsaddqp v2, v[[REG1]], v2
; CHECK: xsaddqp v2, v2, v3
; CHECK-NEXT: blr
; CHECK-LABEL: mixParam_02f:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: add r4, r4, r6
+; CHECK-NEXT: xscpsgndp v[[REG0:[0-9]+]], f1, f1
; CHECK-NEXT: add r4, r4, r7
+; CHECK-NEXT: xscvdpqp v[[REG0]], v[[REG0]]
; CHECK-NEXT: add r4, r4, r8
; CHECK-NEXT: clrldi r4, r4, 32
; CHECK-DAG: std r4, 0(r3)
; CHECK-DAG: lxv v[[REG1:[0-9]+]], 0(r5)
-; CHECK-NEXT: xscpsgndp v[[REG0:[0-9]+]], f1, f1
-; CHECK-NEXT: xscvdpqp v[[REG0]], v[[REG0]]
; CHECK-NEXT: xsaddqp v2, v[[REG1]], v2
; CHECK-NEXT: xsaddqp v2, v2, v[[REG0]]
; CHECK-NEXT: blr
; CHECK-NEXT: std 0, 16(1)
; CHECK-NEXT: stdu 1, -192(1)
; CHECK-NEXT: addis 3, 2, a1@toc@ha
+; CHECK-NEXT: addis 5, 2, a16@toc@ha
+; CHECK-NEXT: addis 6, 2, a17@toc@ha
+; CHECK-NEXT: addis 4, 2, a15@toc@ha
; CHECK-NEXT: lfd 1, a1@toc@l(3)
; CHECK-NEXT: addis 3, 2, a2@toc@ha
+; CHECK-NEXT: addi 5, 5, a16@toc@l
+; CHECK-NEXT: addi 6, 6, a17@toc@l
+; CHECK-NEXT: ld 4, a15@toc@l(4)
; CHECK-NEXT: lfd 2, a2@toc@l(3)
; CHECK-NEXT: addis 3, 2, a3@toc@ha
+; CHECK-NEXT: lxvx 34, 0, 6
+; CHECK-NEXT: lxvx 0, 0, 5
+; CHECK-NEXT: li 5, 152
; CHECK-NEXT: lfd 3, a3@toc@l(3)
; CHECK-NEXT: addis 3, 2, a4@toc@ha
; CHECK-NEXT: lfd 4, a4@toc@l(3)
; CHECK-NEXT: addis 3, 2, a11@toc@ha
; CHECK-NEXT: lfd 11, a11@toc@l(3)
; CHECK-NEXT: addis 3, 2, a12@toc@ha
-; CHECK-NEXT: addis 5, 2, a16@toc@ha
-; CHECK-NEXT: addis 6, 2, a17@toc@ha
-; CHECK-NEXT: addi 6, 6, a17@toc@l
-; CHECK-NEXT: lxvx 34, 0, 6
; CHECK-NEXT: lfd 12, a12@toc@l(3)
; CHECK-NEXT: addis 3, 2, a13@toc@ha
-; CHECK-NEXT: addi 5, 5, a16@toc@l
-; CHECK-NEXT: addis 4, 2, a15@toc@ha
-; CHECK-NEXT: lxvx 0, 0, 5
-; CHECK-NEXT: ld 4, a15@toc@l(4)
-; CHECK-NEXT: li 5, 152
; CHECK-NEXT: lfd 13, a13@toc@l(3)
; CHECK-NEXT: addis 3, 2, a14@toc@ha
; CHECK-NEXT: ld 3, a14@toc@l(3)
; CHECK-NEXT: lhz r3, 0(r3)
; CHECK-NEXT: xxmrghd vs0, vs0, vs1
; CHECK-NEXT: mtfprwz f3, r3
+; CHECK-NEXT: xvcvdpsp vs35, vs0
; CHECK-NEXT: xscvhpdp f3, f3
; CHECK-NEXT: xxmrghd vs2, vs2, vs3
; CHECK-NEXT: xvcvdpsp vs34, vs2
-; CHECK-NEXT: xvcvdpsp vs35, vs0
; CHECK-NEXT: vmrgew v2, v3, v2
; CHECK-NEXT: blr
;
; CHECK-LABEL: test_trunc32_vec4:
; CHECK: # %bb.0:
; CHECK-NEXT: xxsldwi vs0, vs34, vs34, 3
+; CHECK-NEXT: xxsldwi vs1, vs34, vs34, 1
; CHECK-NEXT: xscvspdpn f0, vs0
+; CHECK-NEXT: xscvspdpn f1, vs1
; CHECK-NEXT: xscvdphp f0, f0
; CHECK-NEXT: mffprwz r3, f0
; CHECK-NEXT: xxswapd vs0, vs34
-; CHECK-NEXT: xxsldwi vs1, vs34, vs34, 1
-; CHECK-NEXT: xscvspdpn f1, vs1
; CHECK-NEXT: xscvspdpn f0, vs0
; CHECK-NEXT: xscvdphp f0, f0
; CHECK-NEXT: xscvdphp f1, f1
; CHECK-NEXT: xscvdphp f1, f1
; CHECK-NEXT: sth r4, 4(r5)
; CHECK-NEXT: mffprwz r4, f0
-; CHECK-NEXT: sth r4, 2(r5)
; CHECK-NEXT: sth r3, 0(r5)
+; CHECK-NEXT: sth r4, 2(r5)
; CHECK-NEXT: mffprwz r6, f1
; CHECK-NEXT: sth r6, 6(r5)
; CHECK-NEXT: blr
; CHECK-NEXT: xscvdphp f1, vs34
; CHECK-NEXT: mffprwz r4, f1
; CHECK-NEXT: xscvdphp f1, vs35
+; CHECK-NEXT: sth r3, 0(r7)
; CHECK-NEXT: sth r4, 2(r7)
; CHECK-NEXT: mffprwz r4, f0
; CHECK-NEXT: sth r4, 4(r7)
-; CHECK-NEXT: sth r3, 0(r7)
; CHECK-NEXT: mffprwz r5, f1
; CHECK-NEXT: sth r5, 6(r7)
; CHECK-NEXT: blr
; CHECK-LABEL: test_sitofp_fadd_i32:
; CHECK: # %bb.0:
; CHECK-NEXT: mtfprwa f1, r3
-; CHECK-NEXT: xscvsxdsp f1, f1
; CHECK-NEXT: lhz r4, 0(r4)
+; CHECK-NEXT: xscvsxdsp f1, f1
; CHECK-NEXT: mtfprwz f0, r4
; CHECK-NEXT: xscvhpdp f0, f0
; CHECK-NEXT: xscvdphp f1, f1
; CHECK-P9-BE-LABEL: load_swap11:
; CHECK-P9-BE: # %bb.0:
; CHECK-P9-BE-NEXT: addis r3, r2, .LCPI3_0@toc@ha
-; CHECK-P9-BE-NEXT: addi r3, r3, .LCPI3_0@toc@l
; CHECK-P9-BE-NEXT: lxv v2, 0(r4)
+; CHECK-P9-BE-NEXT: addi r3, r3, .LCPI3_0@toc@l
; CHECK-P9-BE-NEXT: lxvx v3, 0, r3
; CHECK-P9-BE-NEXT: vperm v2, v2, v2, v3
; CHECK-P9-BE-NEXT: blr
; CHECK-P9-BE-LABEL: load_swap21:
; CHECK-P9-BE: # %bb.0:
; CHECK-P9-BE-NEXT: addis r3, r2, .LCPI5_0@toc@ha
-; CHECK-P9-BE-NEXT: addi r3, r3, .LCPI5_0@toc@l
; CHECK-P9-BE-NEXT: lxv v2, 0(r4)
+; CHECK-P9-BE-NEXT: addi r3, r3, .LCPI5_0@toc@l
; CHECK-P9-BE-NEXT: lxvx v3, 0, r3
; CHECK-P9-BE-NEXT: vperm v2, v2, v2, v3
; CHECK-P9-BE-NEXT: blr
; CHECK-P9-BE-LABEL: load_swap51:
; CHECK-P9-BE: # %bb.0:
; CHECK-P9-BE-NEXT: addis r3, r2, .LCPI10_0@toc@ha
-; CHECK-P9-BE-NEXT: addi r3, r3, .LCPI10_0@toc@l
; CHECK-P9-BE-NEXT: lxv v2, 0(r4)
+; CHECK-P9-BE-NEXT: addi r3, r3, .LCPI10_0@toc@l
; CHECK-P9-BE-NEXT: lxvx v3, 0, r3
; CHECK-P9-BE-NEXT: vperm v2, v2, v2, v3
; CHECK-P9-BE-NEXT: blr
; CHECK: .LBB0_2: #
; CHECK-NEXT: ldx r9, r3, r6
; CHECK-NEXT: ldx r10, r3, r7
-; CHECK-NEXT: mulld r9, r10, r9
; CHECK-NEXT: ldx r11, r3, r8
-; CHECK-NEXT: mulld r9, r9, r11
; CHECK-NEXT: ld r12, 0(r3)
; CHECK-NEXT: addi r3, r3, 1
+; CHECK-NEXT: mulld r9, r10, r9
+; CHECK-NEXT: mulld r9, r9, r11
; CHECK-NEXT: maddld r5, r9, r12, r5
; CHECK-NEXT: bdnz .LBB0_2
%3 = sext i32 %1 to i64
; CHECK: .LBB1_2: #
; CHECK-NEXT: ldx r9, r6, r7
; CHECK-NEXT: ld r10, 0(r6)
-; CHECK-NEXT: mulld r9, r10, r9
; CHECK-NEXT: ldx r11, r6, r5
-; CHECK-NEXT: mulld r9, r9, r11
; CHECK-NEXT: addi r8, r6, 1
; CHECK-NEXT: ld r6, 4(r6)
+; CHECK-NEXT: mulld r9, r10, r9
+; CHECK-NEXT: mulld r9, r9, r11
; CHECK-NEXT: maddld r3, r9, r6, r3
; CHECK-NEXT: mr r6, r8
; CHECK-NEXT: bdnz .LBB1_2
; CHECK: .LBB2_2: #
; CHECK-NEXT: ldx r12, r9, r6
; CHECK-NEXT: ld r0, 0(r9)
-; CHECK-NEXT: mulld r12, r0, r12
+; CHECK-NEXT: ldx r30, r9, r5
+; CHECK-NEXT: ldx r29, r9, r7
; CHECK-NEXT: addi r11, r9, 1
-; CHECK-NEXT: ldx r30, r9, r7
-; CHECK-NEXT: ld r29, 4(r9)
-; CHECK-NEXT: ldx r28, r9, r8
-; CHECK-NEXT: ld r27, 12(r9)
-; CHECK-NEXT: ld r26, 8(r9)
-; CHECK-NEXT: ldx r25, r9, r10
-; CHECK-NEXT: ldx r9, r9, r5
-; CHECK-NEXT: mulld r9, r12, r9
-; CHECK-NEXT: mulld r9, r9, r30
-; CHECK-NEXT: mulld r9, r9, r29
-; CHECK-NEXT: mulld r9, r9, r28
-; CHECK-NEXT: mulld r9, r9, r27
-; CHECK-NEXT: mulld r9, r9, r26
-; CHECK-NEXT: maddld r3, r9, r25, r3
+; CHECK-NEXT: mulld r12, r0, r12
+; CHECK-NEXT: ld r28, 4(r9)
+; CHECK-NEXT: ldx r27, r9, r8
+; CHECK-NEXT: ld r26, 12(r9)
+; CHECK-NEXT: ld r25, 8(r9)
+; CHECK-NEXT: ldx r9, r9, r10
+; CHECK-NEXT: mulld r12, r12, r30
+; CHECK-NEXT: mulld r12, r12, r29
+; CHECK-NEXT: mulld r12, r12, r28
+; CHECK-NEXT: mulld r12, r12, r27
+; CHECK-NEXT: mulld r12, r12, r26
+; CHECK-NEXT: mulld r12, r12, r25
+; CHECK-NEXT: maddld r3, r12, r9, r3
; CHECK-NEXT: mr r9, r11
; CHECK-NEXT: bdnz .LBB2_2
%3 = sext i32 %1 to i64
; CHECK: .LBB3_2: #
; CHECK-NEXT: ldu r8, 4(r3)
; CHECK-NEXT: ldx r9, r3, r7
-; CHECK-NEXT: mulld r8, r8, r9
; CHECK-NEXT: ldx r10, r3, r6
-; CHECK-NEXT: mulld r8, r8, r10
; CHECK-NEXT: ld r11, 4(r3)
+; CHECK-NEXT: mulld r8, r8, r9
+; CHECK-NEXT: mulld r8, r8, r10
; CHECK-NEXT: maddld r5, r8, r11, r5
; CHECK-NEXT: bdnz .LBB3_2
%3 = sext i32 %1 to i64
; CHECK: .LBB5_2: #
; CHECK-NEXT: ld r8, 0(r3)
; CHECK-NEXT: ldx r9, r3, r7
-; CHECK-NEXT: mulld r8, r9, r8
-; CHECK-NEXT: ld r9, 4(r3)
-; CHECK-NEXT: mulld r8, r8, r9
-; CHECK-NEXT: ld r10, 8(r3)
+; CHECK-NEXT: ld r10, 4(r3)
+; CHECK-NEXT: ld r11, 8(r3)
; CHECK-NEXT: addi r3, r3, 1
+; CHECK-NEXT: mulld r8, r9, r8
+; CHECK-NEXT: ld r12, 0(r4)
+; CHECK-NEXT: ldx r0, r4, r7
+; CHECK-NEXT: ld r30, 4(r4)
+; CHECK-NEXT: ld r9, 8(r4)
+; CHECK-NEXT: addi r4, r4, 1
; CHECK-NEXT: mulld r8, r8, r10
-; CHECK-NEXT: ld r11, 0(r4)
; CHECK-NEXT: mulld r8, r8, r11
-; CHECK-NEXT: ldx r12, r4, r7
; CHECK-NEXT: mulld r8, r8, r12
-; CHECK-NEXT: ld r0, 4(r4)
; CHECK-NEXT: mulld r8, r8, r0
-; CHECK-NEXT: ld r30, 8(r4)
-; CHECK-NEXT: addi r4, r4, 1
-; CHECK-NEXT: maddld r6, r8, r30, r6
+; CHECK-NEXT: mulld r8, r8, r30
+; CHECK-NEXT: maddld r6, r8, r9, r6
; CHECK-NEXT: bdnz .LBB5_2
%4 = sext i32 %2 to i64
%5 = icmp eq i32 %2, 0
; CHECK-NEXT: lfsx f0, r3, r4
; CHECK-NEXT: xscvuxdsp f4, f4
; CHECK-NEXT: lfs f2, 20(r3)
-; CHECK-NEXT: xsmulsp f0, f0, f4
-; CHECK-NEXT: xsmulsp f0, f2, f0
; CHECK-NEXT: lfs f3, 60(r3)
; CHECK-NEXT: addi r3, r3, 1
+; CHECK-NEXT: xsmulsp f0, f0, f4
+; CHECK-NEXT: xsmulsp f0, f2, f0
; CHECK-NEXT: xsmulsp f0, f3, f0
; CHECK-NEXT: xsaddsp f1, f1, f0
; CHECK-NEXT: bdnz .LBB8_2
; CHECK-P9-NEXT: b .LBB1_2
; CHECK-P9-NEXT: .LBB1_7: # %while.end
; CHECK-P9-NEXT: lis r3, -13108
-; CHECK-P9-NEXT: ori r3, r3, 52429
-; CHECK-P9-NEXT: mullw r3, r28, r3
; CHECK-P9-NEXT: lis r4, 13107
+; CHECK-P9-NEXT: ori r3, r3, 52429
; CHECK-P9-NEXT: ori r4, r4, 13108
+; CHECK-P9-NEXT: mullw r3, r28, r3
; CHECK-P9-NEXT: cmplw r3, r4
; CHECK-P9-NEXT: blt cr0, .LBB1_9
; CHECK-P9-NEXT: # %bb.8: # %if.then8
; PC64LE9-NEXT: li 3, 0
; PC64LE9-NEXT: xxlxor 2, 2, 2
; PC64LE9-NEXT: xxlxor 4, 4, 4
+; PC64LE9-NEXT: mr 30, 4
; PC64LE9-NEXT: std 3, 8(4)
; PC64LE9-NEXT: fmr 1, 31
; PC64LE9-NEXT: fmr 3, 31
-; PC64LE9-NEXT: mr 30, 4
; PC64LE9-NEXT: stfd 31, 0(4)
; PC64LE9-NEXT: bl __gcc_qadd
; PC64LE9-NEXT: nop
; CHECK-NEXT: std 0, 16(1)
; CHECK-NEXT: stdu 1, -64(1)
; CHECK-NEXT: addis 3, 2, g@toc@ha
-; CHECK-NEXT: lwz 3, g@toc@l(3)
; CHECK-NEXT: std 30, 48(1) # 8-byte Folded Spill
+; CHECK-NEXT: lwz 3, g@toc@l(3)
; CHECK-NEXT: extswsli 30, 3, 2
; CHECK-NEXT: addis 3, 2, f@got@tlsld@ha
; CHECK-NEXT: addi 3, 3, f@got@tlsld@l
; CHECK-NEXT: .LBB0_6: # %L1057.preheader
; CHECK-NEXT: .LBB0_7: # %L670
; CHECK-NEXT: lis r5, 4095
-; CHECK-NEXT: ori r5, r5, 65533
-; CHECK-NEXT: sldi r5, r5, 4
; CHECK-NEXT: cmpdi r3, 0
; CHECK-NEXT: sradi r4, r3, 63
+; CHECK-NEXT: ori r5, r5, 65533
+; CHECK-NEXT: crnot 4*cr5+gt, eq
+; CHECK-NEXT: sldi r5, r5, 4
; CHECK-NEXT: mulhdu r3, r3, r5
; CHECK-NEXT: maddld r6, r4, r5, r3
-; CHECK-NEXT: crnot 4*cr5+gt, eq
; CHECK-NEXT: cmpld r6, r3
; CHECK-NEXT: mulld r3, r4, r5
; CHECK-NEXT: cmpldi cr1, r3, 0
; P9-NOVSX-NEXT: rldimi r5, r3, 28, 0
; P9-NOVSX-NEXT: rotldi r3, r3, 28
; P9-NOVSX-NEXT: rldimi r3, r4, 28, 0
+; P9-NOVSX-NEXT: std r5, -8(r1)
; P9-NOVSX-NEXT: std r3, -16(r1)
; P9-NOVSX-NEXT: addi r3, r1, -16
-; P9-NOVSX-NEXT: std r5, -8(r1)
; P9-NOVSX-NEXT: lvx v2, 0, r3
; P9-NOVSX-NEXT: blr
;
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: lxsd v5, 0(r5)
; CHECK-NEXT: addis r5, r2, .LCPI0_0@toc@ha
+; CHECK-NEXT: xxlxor v3, v3, v3
+; CHECK-NEXT: li r6, 0
; CHECK-NEXT: addi r5, r5, .LCPI0_0@toc@l
; CHECK-NEXT: lxvx v2, 0, r5
; CHECK-NEXT: addis r5, r2, .LCPI0_1@toc@ha
; CHECK-NEXT: addi r5, r5, .LCPI0_1@toc@l
; CHECK-NEXT: lxvx v4, 0, r5
; CHECK-NEXT: li r5, 4
-; CHECK-NEXT: xxlxor v3, v3, v3
; CHECK-NEXT: vperm v0, v3, v5, v2
; CHECK-NEXT: mtctr r5
; CHECK-NEXT: li r5, 0
; CHECK-NEXT: vperm v1, v3, v5, v4
-; CHECK-NEXT: li r6, 0
; CHECK-NEXT: xvnegsp v5, v0
; CHECK-NEXT: xvnegsp v0, v1
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB0_1: # %for.cond1.preheader
; CHECK-NEXT: #
; CHECK-NEXT: lxsd v1, 0(r3)
+; CHECK-NEXT: add r7, r3, r4
; CHECK-NEXT: vperm v6, v3, v1, v4
; CHECK-NEXT: vperm v1, v3, v1, v2
; CHECK-NEXT: xvnegsp v1, v1
; CHECK-NEXT: xvnegsp v6, v6
-; CHECK-NEXT: add r7, r3, r4
; CHECK-NEXT: vabsduw v1, v1, v5
; CHECK-NEXT: vabsduw v6, v6, v0
; CHECK-NEXT: vadduwm v1, v6, v1
; CHECK-NEXT: vextuwrx r3, r5, v1
; CHECK-NEXT: vperm v7, v3, v6, v4
; CHECK-NEXT: vperm v6, v3, v6, v2
+; CHECK-NEXT: add r6, r3, r6
+; CHECK-NEXT: add r3, r7, r4
; CHECK-NEXT: xvnegsp v6, v6
; CHECK-NEXT: xvnegsp v1, v7
; CHECK-NEXT: vabsduw v6, v6, v5
-; CHECK-NEXT: add r6, r3, r6
; CHECK-NEXT: vabsduw v1, v1, v0
; CHECK-NEXT: vadduwm v1, v1, v6
; CHECK-NEXT: xxswapd v6, v1
; CHECK-NEXT: xxspltw v6, v1, 2
; CHECK-NEXT: vadduwm v1, v1, v6
; CHECK-NEXT: vextuwrx r8, r5, v1
-; CHECK-NEXT: add r3, r7, r4
; CHECK-NEXT: add r6, r8, r6
; CHECK-NEXT: bdnz .LBB0_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; P9BE: # %bb.0: # %entry
; P9BE-NEXT: lfd f0, 0(r5)
; P9BE-NEXT: addis r5, r2, .LCPI0_0@toc@ha
+; P9BE-NEXT: xxlxor v3, v3, v3
+; P9BE-NEXT: li r6, 0
; P9BE-NEXT: addi r5, r5, .LCPI0_0@toc@l
; P9BE-NEXT: lxvx v2, 0, r5
; P9BE-NEXT: addis r5, r2, .LCPI0_1@toc@ha
+; P9BE-NEXT: xxlor v5, vs0, vs0
; P9BE-NEXT: addi r5, r5, .LCPI0_1@toc@l
; P9BE-NEXT: lxvx v4, 0, r5
; P9BE-NEXT: li r5, 4
-; P9BE-NEXT: xxlor v5, vs0, vs0
-; P9BE-NEXT: xxlxor v3, v3, v3
; P9BE-NEXT: vperm v0, v3, v5, v2
; P9BE-NEXT: mtctr r5
; P9BE-NEXT: li r5, 0
; P9BE-NEXT: vperm v1, v3, v5, v4
-; P9BE-NEXT: li r6, 0
; P9BE-NEXT: xvnegsp v5, v0
; P9BE-NEXT: xvnegsp v0, v1
; P9BE-NEXT: .p2align 4
; P9BE-NEXT: .LBB0_1: # %for.cond1.preheader
; P9BE-NEXT: #
; P9BE-NEXT: lfd f0, 0(r3)
+; P9BE-NEXT: add r7, r3, r4
; P9BE-NEXT: xxlor v1, vs0, vs0
; P9BE-NEXT: lfdx f0, r3, r4
; P9BE-NEXT: vperm v6, v3, v1, v4
; P9BE-NEXT: xxlor v6, vs0, vs0
; P9BE-NEXT: vperm v7, v3, v6, v4
; P9BE-NEXT: vperm v6, v3, v6, v2
-; P9BE-NEXT: add r7, r3, r4
; P9BE-NEXT: vextuwlx r3, r5, v1
; P9BE-NEXT: xvnegsp v6, v6
+; P9BE-NEXT: add r6, r3, r6
; P9BE-NEXT: xvnegsp v1, v7
-; P9BE-NEXT: vabsduw v1, v1, v0
+; P9BE-NEXT: add r3, r7, r4
; P9BE-NEXT: vabsduw v6, v6, v5
+; P9BE-NEXT: vabsduw v1, v1, v0
; P9BE-NEXT: vadduwm v1, v1, v6
; P9BE-NEXT: xxswapd v6, v1
-; P9BE-NEXT: add r6, r3, r6
; P9BE-NEXT: vadduwm v1, v1, v6
; P9BE-NEXT: xxspltw v6, v1, 1
; P9BE-NEXT: vadduwm v1, v1, v6
; P9BE-NEXT: vextuwlx r8, r5, v1
-; P9BE-NEXT: add r3, r7, r4
; P9BE-NEXT: add r6, r8, r6
; P9BE-NEXT: bdnz .LBB0_1
; P9BE-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: lxsd v2, 0(r3)
; CHECK-NEXT: addis r3, r2, .LCPI1_0@toc@ha
+; CHECK-NEXT: lxsd v1, 0(r4)
+; CHECK-NEXT: xxlxor v3, v3, v3
; CHECK-NEXT: addi r3, r3, .LCPI1_0@toc@l
; CHECK-NEXT: lxvx v4, 0, r3
; CHECK-NEXT: addis r3, r2, .LCPI1_1@toc@ha
; CHECK-NEXT: addi r3, r3, .LCPI1_1@toc@l
; CHECK-NEXT: lxvx v0, 0, r3
-; CHECK-NEXT: lxsd v1, 0(r4)
-; CHECK-NEXT: xxlxor v3, v3, v3
+; CHECK-NEXT: li r3, 0
; CHECK-NEXT: vperm v5, v3, v2, v4
; CHECK-NEXT: vperm v2, v3, v2, v0
; CHECK-NEXT: vperm v0, v3, v1, v0
; CHECK-NEXT: vadduwm v2, v2, v3
; CHECK-NEXT: xxspltw v3, v2, 2
; CHECK-NEXT: vadduwm v2, v2, v3
-; CHECK-NEXT: li r3, 0
; CHECK-NEXT: vextuwrx r3, r3, v2
; CHECK-NEXT: extsw r3, r3
; CHECK-NEXT: blr
; P9BE: # %bb.0: # %entry
; P9BE-NEXT: lfd f0, 0(r3)
; P9BE-NEXT: addis r3, r2, .LCPI1_0@toc@ha
+; P9BE-NEXT: xxlxor v3, v3, v3
; P9BE-NEXT: addi r3, r3, .LCPI1_0@toc@l
; P9BE-NEXT: lxvx v4, 0, r3
; P9BE-NEXT: addis r3, r2, .LCPI1_1@toc@ha
; P9BE-NEXT: xxlor v2, vs0, vs0
; P9BE-NEXT: lfd f0, 0(r4)
; P9BE-NEXT: lxvx v0, 0, r3
-; P9BE-NEXT: xxlxor v3, v3, v3
; P9BE-NEXT: xxlor v1, vs0, vs0
+; P9BE-NEXT: li r3, 0
; P9BE-NEXT: vperm v5, v3, v2, v4
; P9BE-NEXT: vperm v2, v3, v2, v0
; P9BE-NEXT: vperm v0, v3, v1, v0
; P9BE-NEXT: vadduwm v2, v2, v3
; P9BE-NEXT: xxspltw v3, v2, 1
; P9BE-NEXT: vadduwm v2, v2, v3
-; P9BE-NEXT: li r3, 0
; P9BE-NEXT: vextuwlx r3, r3, v2
; P9BE-NEXT: extsw r3, r3
; P9BE-NEXT: blr
; CHECK-NEXT: add r5, r3, r4
; CHECK-NEXT: lxsiwzx v2, r3, r4
; CHECK-NEXT: addis r3, r2, .LCPI2_0@toc@ha
+; CHECK-NEXT: xxlxor v3, v3, v3
; CHECK-NEXT: addi r3, r3, .LCPI2_0@toc@l
; CHECK-NEXT: lxvx v4, 0, r3
; CHECK-NEXT: li r3, 4
; CHECK-NEXT: lxsiwzx v5, r5, r3
-; CHECK-NEXT: xxlxor v3, v3, v3
; CHECK-NEXT: vperm v2, v2, v3, v4
; CHECK-NEXT: vperm v3, v5, v3, v4
; CHECK-NEXT: vspltisw v4, 8
; P9BE-NEXT: add r5, r3, r4
; P9BE-NEXT: lfiwzx f0, r3, r4
; P9BE-NEXT: addis r3, r2, .LCPI2_0@toc@ha
+; P9BE-NEXT: xxlxor v3, v3, v3
+; P9BE-NEXT: xxsldwi v2, f0, f0, 1
; P9BE-NEXT: addi r3, r3, .LCPI2_0@toc@l
; P9BE-NEXT: lxvx v4, 0, r3
; P9BE-NEXT: li r3, 4
-; P9BE-NEXT: xxsldwi v2, f0, f0, 1
; P9BE-NEXT: lfiwzx f0, r5, r3
-; P9BE-NEXT: xxlxor v3, v3, v3
; P9BE-NEXT: vperm v2, v3, v2, v4
; P9BE-NEXT: xxsldwi v5, f0, f0, 1
; P9BE-NEXT: vperm v3, v3, v5, v4
; CHECK-LABEL: test16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: sldi r4, r4, 1
-; CHECK-NEXT: add r6, r3, r4
; CHECK-NEXT: li r7, 16
-; CHECK-NEXT: lxsihzx v2, r6, r7
+; CHECK-NEXT: add r6, r3, r4
; CHECK-NEXT: lxsihzx v4, r3, r4
+; CHECK-NEXT: addis r3, r2, .LCPI3_0@toc@ha
+; CHECK-NEXT: lxsihzx v2, r6, r7
; CHECK-NEXT: li r6, 0
+; CHECK-NEXT: addi r3, r3, .LCPI3_0@toc@l
; CHECK-NEXT: mtvsrd v3, r6
; CHECK-NEXT: vsplth v4, v4, 3
; CHECK-NEXT: vsplth v2, v2, 3
-; CHECK-NEXT: addis r3, r2, .LCPI3_0@toc@ha
-; CHECK-NEXT: addi r3, r3, .LCPI3_0@toc@l
; CHECK-NEXT: vmrghh v4, v3, v4
; CHECK-NEXT: vmrghh v2, v3, v2
; CHECK-NEXT: vsplth v3, v3, 3
; P9BE-LABEL: test16:
; P9BE: # %bb.0: # %entry
; P9BE-NEXT: sldi r4, r4, 1
-; P9BE-NEXT: add r6, r3, r4
; P9BE-NEXT: li r7, 16
-; P9BE-NEXT: lxsihzx v2, r6, r7
+; P9BE-NEXT: add r6, r3, r4
; P9BE-NEXT: lxsihzx v4, r3, r4
+; P9BE-NEXT: addis r3, r2, .LCPI3_0@toc@ha
+; P9BE-NEXT: lxsihzx v2, r6, r7
; P9BE-NEXT: li r6, 0
+; P9BE-NEXT: addi r3, r3, .LCPI3_0@toc@l
; P9BE-NEXT: sldi r6, r6, 48
; P9BE-NEXT: vsplth v4, v4, 3
; P9BE-NEXT: mtvsrd v3, r6
; P9BE-NEXT: vsplth v2, v2, 3
-; P9BE-NEXT: addis r3, r2, .LCPI3_0@toc@ha
-; P9BE-NEXT: addi r3, r3, .LCPI3_0@toc@l
; P9BE-NEXT: vmrghh v4, v3, v4
; P9BE-NEXT: vmrghh v2, v3, v2
; P9BE-NEXT: vsplth v3, v3, 0
; CHECK-NEXT: mtvsrd v3, r3
; CHECK-NEXT: li r3, 8
; CHECK-NEXT: lxsibzx v5, r6, r3
+; CHECK-NEXT: vspltb v4, v3, 7
; CHECK-NEXT: addis r3, r2, .LCPI4_0@toc@ha
-; CHECK-NEXT: addi r3, r3, .LCPI4_0@toc@l
; CHECK-NEXT: vspltb v2, v2, 7
+; CHECK-NEXT: addi r3, r3, .LCPI4_0@toc@l
; CHECK-NEXT: vmrghb v2, v3, v2
-; CHECK-NEXT: vspltb v4, v3, 7
; CHECK-NEXT: vspltb v5, v5, 7
; CHECK-NEXT: vmrglh v2, v2, v4
; CHECK-NEXT: vmrghb v3, v3, v5
; P9BE: # %bb.0: # %entry
; P9BE-NEXT: add r6, r3, r4
; P9BE-NEXT: li r7, 8
-; P9BE-NEXT: lxsibzx v2, r6, r7
; P9BE-NEXT: lxsibzx v4, r3, r4
+; P9BE-NEXT: addis r3, r2, .LCPI4_0@toc@ha
+; P9BE-NEXT: lxsibzx v2, r6, r7
; P9BE-NEXT: li r6, 0
+; P9BE-NEXT: addi r3, r3, .LCPI4_0@toc@l
; P9BE-NEXT: sldi r6, r6, 56
; P9BE-NEXT: vspltb v4, v4, 7
; P9BE-NEXT: mtvsrd v3, r6
; P9BE-NEXT: vmrghb v4, v3, v4
; P9BE-NEXT: vmrghb v2, v3, v2
; P9BE-NEXT: vspltb v3, v3, 0
-; P9BE-NEXT: addis r3, r2, .LCPI4_0@toc@ha
-; P9BE-NEXT: addi r3, r3, .LCPI4_0@toc@l
; P9BE-NEXT: vmrghh v4, v4, v3
; P9BE-NEXT: xxspltw v3, v3, 0
; P9BE-NEXT: vmrghw v2, v4, v2
; CHECK-P9-LABEL: foo3_fmf:
; CHECK-P9: # %bb.0:
; CHECK-P9-NEXT: addis 3, 2, .LCPI20_2@toc@ha
-; CHECK-P9-NEXT: lfd 2, .LCPI20_2@toc@l(3)
; CHECK-P9-NEXT: xsabsdp 0, 1
+; CHECK-P9-NEXT: lfd 2, .LCPI20_2@toc@l(3)
; CHECK-P9-NEXT: xscmpudp 0, 0, 2
; CHECK-P9-NEXT: xxlxor 0, 0, 0
; CHECK-P9-NEXT: blt 0, .LBB20_2
; CHECK-P9-LABEL: goo3_fmf:
; CHECK-P9: # %bb.0:
; CHECK-P9-NEXT: addis 3, 2, .LCPI22_2@toc@ha
-; CHECK-P9-NEXT: lfs 2, .LCPI22_2@toc@l(3)
; CHECK-P9-NEXT: xsabsdp 0, 1
+; CHECK-P9-NEXT: lfs 2, .LCPI22_2@toc@l(3)
; CHECK-P9-NEXT: fcmpu 0, 0, 2
; CHECK-P9-NEXT: xxlxor 0, 0, 0
; CHECK-P9-NEXT: blt 0, .LBB22_2
; CHECK-NEXT: .cfi_offset r31, -8
; CHECK-NEXT: .cfi_offset r2, -152
; CHECK-NEXT: lis 5, 4
+; CHECK-NEXT: std 30, 704(1) # 8-byte Folded Spill
+; CHECK-NEXT: std 29, 696(1) # 8-byte Folded Spill
; CHECK-NEXT: ori 6, 5, 6292
+; CHECK-NEXT: std 28, 688(1) # 8-byte Folded Spill
+; CHECK-NEXT: std 27, 680(1) # 8-byte Folded Spill
+; CHECK-NEXT: std 26, 672(1) # 8-byte Folded Spill
+; CHECK-NEXT: std 25, 664(1) # 8-byte Folded Spill
; CHECK-NEXT: ori 5, 5, 6291
+; CHECK-NEXT: std 14, 576(1) # 8-byte Folded Spill
+; CHECK-NEXT: std 15, 584(1) # 8-byte Folded Spill
+; CHECK-NEXT: std 16, 592(1) # 8-byte Folded Spill
+; CHECK-NEXT: std 17, 600(1) # 8-byte Folded Spill
+; CHECK-NEXT: std 18, 608(1) # 8-byte Folded Spill
+; CHECK-NEXT: std 19, 616(1) # 8-byte Folded Spill
+; CHECK-NEXT: std 20, 624(1) # 8-byte Folded Spill
+; CHECK-NEXT: std 21, 632(1) # 8-byte Folded Spill
+; CHECK-NEXT: std 22, 640(1) # 8-byte Folded Spill
+; CHECK-NEXT: std 23, 648(1) # 8-byte Folded Spill
+; CHECK-NEXT: std 24, 656(1) # 8-byte Folded Spill
+; CHECK-NEXT: std 31, 712(1) # 8-byte Folded Spill
+; CHECK-NEXT: std 2, 568(1) # 8-byte Folded Spill
; CHECK-NEXT: sldi 6, 6, 32
; CHECK-NEXT: oris 7, 6, 13030
; CHECK-NEXT: oris 8, 6, 13066
-; CHECK-NEXT: ori 7, 7, 3704
; CHECK-NEXT: oris 9, 6, 13054
+; CHECK-NEXT: oris 10, 6, 13042
+; CHECK-NEXT: oris 11, 6, 13078
+; CHECK-NEXT: oris 12, 6, 13115
+; CHECK-NEXT: oris 0, 6, 13103
+; CHECK-NEXT: oris 30, 6, 13091
+; CHECK-NEXT: oris 29, 6, 13127
+; CHECK-NEXT: oris 28, 6, 13164
+; CHECK-NEXT: oris 27, 6, 13152
+; CHECK-NEXT: oris 26, 6, 13139
+; CHECK-NEXT: oris 25, 6, 13176
+; CHECK-NEXT: ori 7, 7, 3704
; CHECK-NEXT: ori 8, 8, 44408
; CHECK-NEXT: ori 9, 9, 30840
-; CHECK-NEXT: add 7, 4, 7
-; CHECK-NEXT: oris 10, 6, 13042
; CHECK-NEXT: ori 10, 10, 17272
-; CHECK-NEXT: std 7, 384(1) # 8-byte Folded Spill
-; CHECK-NEXT: add 7, 4, 8
-; CHECK-NEXT: oris 11, 6, 13078
; CHECK-NEXT: ori 11, 11, 57976
-; CHECK-NEXT: std 7, 376(1) # 8-byte Folded Spill
-; CHECK-NEXT: add 7, 4, 9
-; CHECK-NEXT: oris 12, 6, 13115
; CHECK-NEXT: ori 12, 12, 33144
-; CHECK-NEXT: std 7, 368(1) # 8-byte Folded Spill
-; CHECK-NEXT: add 7, 4, 10
-; CHECK-NEXT: oris 0, 6, 13103
; CHECK-NEXT: ori 0, 0, 19576
-; CHECK-NEXT: std 7, 360(1) # 8-byte Folded Spill
-; CHECK-NEXT: add 7, 4, 11
-; CHECK-NEXT: std 30, 704(1) # 8-byte Folded Spill
-; CHECK-NEXT: oris 30, 6, 13091
; CHECK-NEXT: ori 30, 30, 6008
-; CHECK-NEXT: std 7, 352(1) # 8-byte Folded Spill
-; CHECK-NEXT: add 7, 4, 12
-; CHECK-NEXT: std 29, 696(1) # 8-byte Folded Spill
-; CHECK-NEXT: oris 29, 6, 13127
; CHECK-NEXT: ori 29, 29, 46712
+; CHECK-NEXT: ori 28, 28, 21880
+; CHECK-NEXT: ori 27, 27, 8312
+; CHECK-NEXT: ori 26, 26, 60280
+; CHECK-NEXT: ori 25, 25, 35448
+; CHECK-NEXT: add 7, 4, 7
; CHECK-NEXT: sldi 5, 5, 32
; CHECK-NEXT: oris 5, 5, 29347
; CHECK-NEXT: ori 5, 5, 20088
+; CHECK-NEXT: std 7, 384(1) # 8-byte Folded Spill
+; CHECK-NEXT: add 7, 4, 8
; CHECK-NEXT: lis 8, 402
+; CHECK-NEXT: std 7, 376(1) # 8-byte Folded Spill
+; CHECK-NEXT: add 7, 4, 9
; CHECK-NEXT: lis 9, 451
+; CHECK-NEXT: std 7, 368(1) # 8-byte Folded Spill
+; CHECK-NEXT: add 7, 4, 10
; CHECK-NEXT: lis 10, 500
+; CHECK-NEXT: std 7, 360(1) # 8-byte Folded Spill
+; CHECK-NEXT: add 7, 4, 11
; CHECK-NEXT: lis 11, 549
-; CHECK-NEXT: std 31, 712(1) # 8-byte Folded Spill
-; CHECK-NEXT: std 2, 568(1) # 8-byte Folded Spill
+; CHECK-NEXT: std 7, 352(1) # 8-byte Folded Spill
+; CHECK-NEXT: add 7, 4, 12
; CHECK-NEXT: std 7, 344(1) # 8-byte Folded Spill
; CHECK-NEXT: add 7, 4, 0
-; CHECK-NEXT: std 28, 688(1) # 8-byte Folded Spill
-; CHECK-NEXT: oris 28, 6, 13164
-; CHECK-NEXT: ori 28, 28, 21880
; CHECK-NEXT: std 7, 336(1) # 8-byte Folded Spill
; CHECK-NEXT: add 7, 4, 30
-; CHECK-NEXT: std 27, 680(1) # 8-byte Folded Spill
-; CHECK-NEXT: oris 27, 6, 13152
-; CHECK-NEXT: ori 27, 27, 8312
; CHECK-NEXT: std 7, 328(1) # 8-byte Folded Spill
; CHECK-NEXT: add 7, 4, 29
-; CHECK-NEXT: std 26, 672(1) # 8-byte Folded Spill
-; CHECK-NEXT: oris 26, 6, 13139
-; CHECK-NEXT: ori 26, 26, 60280
; CHECK-NEXT: std 7, 320(1) # 8-byte Folded Spill
; CHECK-NEXT: add 7, 4, 28
-; CHECK-NEXT: std 25, 664(1) # 8-byte Folded Spill
-; CHECK-NEXT: oris 25, 6, 13176
-; CHECK-NEXT: ori 25, 25, 35448
; CHECK-NEXT: std 7, 312(1) # 8-byte Folded Spill
; CHECK-NEXT: add 7, 4, 27
; CHECK-NEXT: std 7, 304(1) # 8-byte Folded Spill
; CHECK-NEXT: lis 5, 268
; CHECK-NEXT: std 4, 256(1) # 8-byte Folded Spill
; CHECK-NEXT: lis 4, 585
+; CHECK-NEXT: std 6, 264(1) # 8-byte Folded Spill
+; CHECK-NEXT: lis 6, 305
+; CHECK-NEXT: std 7, 272(1) # 8-byte Folded Spill
+; CHECK-NEXT: lis 7, 354
; CHECK-NEXT: ori 4, 4, 61440
; CHECK-NEXT: std 4, 560(1) # 8-byte Folded Spill
; CHECK-NEXT: lis 4, 48
; CHECK-NEXT: std 4, 192(1) # 8-byte Folded Spill
; CHECK-NEXT: ori 4, 5, 36352
; CHECK-NEXT: lis 5, 317
+; CHECK-NEXT: ld 30, 192(1) # 8-byte Folded Reload
; CHECK-NEXT: std 4, 184(1) # 8-byte Folded Spill
; CHECK-NEXT: ori 4, 5, 25088
; CHECK-NEXT: lis 5, 366
+; CHECK-NEXT: ld 29, 184(1) # 8-byte Folded Reload
; CHECK-NEXT: std 4, 176(1) # 8-byte Folded Spill
; CHECK-NEXT: ori 4, 5, 13824
; CHECK-NEXT: lis 5, 415
+; CHECK-NEXT: ld 28, 176(1) # 8-byte Folded Reload
; CHECK-NEXT: std 4, 168(1) # 8-byte Folded Spill
; CHECK-NEXT: ori 4, 5, 2560
; CHECK-NEXT: lis 5, 463
+; CHECK-NEXT: ld 27, 168(1) # 8-byte Folded Reload
; CHECK-NEXT: std 4, 160(1) # 8-byte Folded Spill
; CHECK-NEXT: ori 4, 5, 56832
; CHECK-NEXT: lis 5, 512
+; CHECK-NEXT: ld 26, 160(1) # 8-byte Folded Reload
; CHECK-NEXT: std 4, 152(1) # 8-byte Folded Spill
; CHECK-NEXT: ori 4, 5, 45568
; CHECK-NEXT: lis 5, 561
+; CHECK-NEXT: ld 25, 152(1) # 8-byte Folded Reload
; CHECK-NEXT: std 4, 144(1) # 8-byte Folded Spill
; CHECK-NEXT: ori 4, 5, 34304
; CHECK-NEXT: lis 5, 12
+; CHECK-NEXT: ld 24, 144(1) # 8-byte Folded Reload
; CHECK-NEXT: std 4, 136(1) # 8-byte Folded Spill
; CHECK-NEXT: ori 4, 5, 13568
; CHECK-NEXT: lis 5, 61
+; CHECK-NEXT: ld 23, 136(1) # 8-byte Folded Reload
; CHECK-NEXT: std 4, 128(1) # 8-byte Folded Spill
; CHECK-NEXT: ori 4, 5, 2304
; CHECK-NEXT: lis 5, 109
; CHECK-NEXT: std 4, 120(1) # 8-byte Folded Spill
; CHECK-NEXT: ori 4, 5, 56576
; CHECK-NEXT: lis 5, 158
+; CHECK-NEXT: ld 0, 120(1) # 8-byte Folded Reload
; CHECK-NEXT: std 4, 112(1) # 8-byte Folded Spill
; CHECK-NEXT: ori 4, 5, 45312
; CHECK-NEXT: lis 5, 207
+; CHECK-NEXT: ld 22, 112(1) # 8-byte Folded Reload
; CHECK-NEXT: std 4, 104(1) # 8-byte Folded Spill
; CHECK-NEXT: ori 4, 5, 34048
; CHECK-NEXT: lis 5, 256
-; CHECK-NEXT: std 6, 264(1) # 8-byte Folded Spill
-; CHECK-NEXT: lis 6, 305
-; CHECK-NEXT: ld 30, 192(1) # 8-byte Folded Reload
-; CHECK-NEXT: ld 29, 184(1) # 8-byte Folded Reload
-; CHECK-NEXT: ld 28, 176(1) # 8-byte Folded Reload
-; CHECK-NEXT: ld 27, 168(1) # 8-byte Folded Reload
-; CHECK-NEXT: ld 26, 160(1) # 8-byte Folded Reload
-; CHECK-NEXT: ld 25, 152(1) # 8-byte Folded Reload
-; CHECK-NEXT: ld 0, 120(1) # 8-byte Folded Reload
+; CHECK-NEXT: ld 21, 104(1) # 8-byte Folded Reload
; CHECK-NEXT: std 4, 96(1) # 8-byte Folded Spill
; CHECK-NEXT: ori 4, 5, 22784
-; CHECK-NEXT: std 7, 272(1) # 8-byte Folded Spill
-; CHECK-NEXT: lis 7, 354
+; CHECK-NEXT: ld 5, 248(1) # 8-byte Folded Reload
+; CHECK-NEXT: ld 20, 96(1) # 8-byte Folded Reload
; CHECK-NEXT: std 4, 88(1) # 8-byte Folded Spill
; CHECK-NEXT: ori 4, 6, 11520
; CHECK-NEXT: ld 6, 240(1) # 8-byte Folded Reload
+; CHECK-NEXT: ld 19, 88(1) # 8-byte Folded Reload
; CHECK-NEXT: std 4, 80(1) # 8-byte Folded Spill
; CHECK-NEXT: ori 4, 7, 256
; CHECK-NEXT: ld 7, 232(1) # 8-byte Folded Reload
+; CHECK-NEXT: ld 18, 80(1) # 8-byte Folded Reload
; CHECK-NEXT: std 4, 72(1) # 8-byte Folded Spill
; CHECK-NEXT: ori 4, 8, 54528
; CHECK-NEXT: ld 8, 224(1) # 8-byte Folded Reload
+; CHECK-NEXT: ld 17, 72(1) # 8-byte Folded Reload
; CHECK-NEXT: std 4, 64(1) # 8-byte Folded Spill
; CHECK-NEXT: ori 4, 9, 43264
; CHECK-NEXT: ld 9, 216(1) # 8-byte Folded Reload
+; CHECK-NEXT: ld 16, 64(1) # 8-byte Folded Reload
; CHECK-NEXT: std 4, 56(1) # 8-byte Folded Spill
; CHECK-NEXT: ori 4, 10, 32000
; CHECK-NEXT: ld 10, 208(1) # 8-byte Folded Reload
+; CHECK-NEXT: ld 15, 56(1) # 8-byte Folded Reload
; CHECK-NEXT: std 4, 48(1) # 8-byte Folded Spill
; CHECK-NEXT: ori 4, 11, 20736
; CHECK-NEXT: ld 11, 200(1) # 8-byte Folded Reload
-; CHECK-NEXT: std 4, 40(1) # 8-byte Folded Spill
-; CHECK-NEXT: std 14, 576(1) # 8-byte Folded Spill
-; CHECK-NEXT: std 15, 584(1) # 8-byte Folded Spill
-; CHECK-NEXT: std 16, 592(1) # 8-byte Folded Spill
-; CHECK-NEXT: std 17, 600(1) # 8-byte Folded Spill
-; CHECK-NEXT: std 18, 608(1) # 8-byte Folded Spill
-; CHECK-NEXT: std 19, 616(1) # 8-byte Folded Spill
-; CHECK-NEXT: std 20, 624(1) # 8-byte Folded Spill
-; CHECK-NEXT: std 21, 632(1) # 8-byte Folded Spill
-; CHECK-NEXT: std 22, 640(1) # 8-byte Folded Spill
-; CHECK-NEXT: std 23, 648(1) # 8-byte Folded Spill
-; CHECK-NEXT: std 24, 656(1) # 8-byte Folded Spill
-; CHECK-NEXT: ld 5, 248(1) # 8-byte Folded Reload
-; CHECK-NEXT: ld 24, 144(1) # 8-byte Folded Reload
-; CHECK-NEXT: ld 23, 136(1) # 8-byte Folded Reload
-; CHECK-NEXT: ld 22, 112(1) # 8-byte Folded Reload
-; CHECK-NEXT: ld 21, 104(1) # 8-byte Folded Reload
-; CHECK-NEXT: ld 20, 96(1) # 8-byte Folded Reload
-; CHECK-NEXT: ld 19, 88(1) # 8-byte Folded Reload
-; CHECK-NEXT: ld 18, 80(1) # 8-byte Folded Reload
-; CHECK-NEXT: ld 17, 72(1) # 8-byte Folded Reload
-; CHECK-NEXT: ld 16, 64(1) # 8-byte Folded Reload
-; CHECK-NEXT: ld 15, 56(1) # 8-byte Folded Reload
; CHECK-NEXT: ld 14, 48(1) # 8-byte Folded Reload
+; CHECK-NEXT: std 4, 40(1) # 8-byte Folded Spill
; CHECK-NEXT: li 4, 0
; CHECK-NEXT: ld 31, 40(1) # 8-byte Folded Reload
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: stdux 3, 12, 2
; CHECK-NEXT: ld 2, 552(1) # 8-byte Folded Reload
; CHECK-NEXT: stdx 3, 12, 5
+; CHECK-NEXT: stdx 3, 12, 6
+; CHECK-NEXT: stdx 3, 12, 7
+; CHECK-NEXT: stdx 3, 12, 8
+; CHECK-NEXT: stdx 3, 12, 9
+; CHECK-NEXT: stdx 3, 12, 10
+; CHECK-NEXT: stdx 3, 12, 11
+; CHECK-NEXT: stdx 3, 12, 30
+; CHECK-NEXT: stdx 3, 12, 29
+; CHECK-NEXT: stdx 3, 12, 28
+; CHECK-NEXT: stdx 3, 12, 27
+; CHECK-NEXT: stdx 3, 12, 26
+; CHECK-NEXT: stdx 3, 12, 25
+; CHECK-NEXT: stdx 3, 12, 24
+; CHECK-NEXT: stdx 3, 12, 23
+; CHECK-NEXT: stdx 3, 12, 4
+; CHECK-NEXT: stdx 3, 12, 0
+; CHECK-NEXT: stdx 3, 12, 22
+; CHECK-NEXT: stdx 3, 12, 21
+; CHECK-NEXT: stdx 3, 12, 20
+; CHECK-NEXT: stdx 3, 12, 19
+; CHECK-NEXT: stdx 3, 12, 18
+; CHECK-NEXT: stdx 3, 12, 17
+; CHECK-NEXT: stdx 3, 12, 16
+; CHECK-NEXT: stdx 3, 12, 15
+; CHECK-NEXT: stdx 3, 12, 14
+; CHECK-NEXT: stdx 3, 12, 31
; CHECK-NEXT: stdx 3, 12, 2
; CHECK-NEXT: ld 2, 544(1) # 8-byte Folded Reload
; CHECK-NEXT: stdx 3, 12, 2
; CHECK-NEXT: stdx 3, 12, 2
; CHECK-NEXT: ld 2, 400(1) # 8-byte Folded Reload
; CHECK-NEXT: stdx 3, 12, 2
-; CHECK-NEXT: stdx 3, 12, 6
-; CHECK-NEXT: stdx 3, 12, 7
-; CHECK-NEXT: stdx 3, 12, 8
-; CHECK-NEXT: stdx 3, 12, 9
-; CHECK-NEXT: stdx 3, 12, 10
-; CHECK-NEXT: stdx 3, 12, 11
-; CHECK-NEXT: stdx 3, 12, 30
-; CHECK-NEXT: stdx 3, 12, 29
-; CHECK-NEXT: stdx 3, 12, 28
-; CHECK-NEXT: stdx 3, 12, 27
-; CHECK-NEXT: stdx 3, 12, 26
-; CHECK-NEXT: stdx 3, 12, 25
-; CHECK-NEXT: stdx 3, 12, 24
-; CHECK-NEXT: stdx 3, 12, 23
-; CHECK-NEXT: stdx 3, 12, 4
-; CHECK-NEXT: stdx 3, 12, 0
-; CHECK-NEXT: stdx 3, 12, 22
-; CHECK-NEXT: stdx 3, 12, 21
-; CHECK-NEXT: stdx 3, 12, 20
-; CHECK-NEXT: stdx 3, 12, 19
-; CHECK-NEXT: stdx 3, 12, 18
-; CHECK-NEXT: stdx 3, 12, 17
-; CHECK-NEXT: stdx 3, 12, 16
-; CHECK-NEXT: stdx 3, 12, 15
-; CHECK-NEXT: stdx 3, 12, 14
-; CHECK-NEXT: stdx 3, 12, 31
; CHECK-NEXT: bdnz .LBB0_2
; CHECK-NEXT: # %bb.3:
; CHECK-NEXT: ld 12, 384(1) # 8-byte Folded Reload
+; CHECK-NEXT: lwz 4, 396(1) # 4-byte Folded Reload
+; CHECK-NEXT: addi 4, 4, 1
; CHECK-NEXT: std 3, 0(12)
; CHECK-NEXT: ld 12, 376(1) # 8-byte Folded Reload
; CHECK-NEXT: std 3, 0(12)
; CHECK-NEXT: ld 12, 288(1) # 8-byte Folded Reload
; CHECK-NEXT: std 3, 0(12)
; CHECK-NEXT: ld 12, 280(1) # 8-byte Folded Reload
-; CHECK-NEXT: lwz 4, 396(1) # 4-byte Folded Reload
-; CHECK-NEXT: addi 4, 4, 1
; CHECK-NEXT: std 3, 0(12)
; CHECK-NEXT: ld 12, 272(1) # 8-byte Folded Reload
; CHECK-NEXT: std 3, 0(12)
; PPC64LE-NEXT: std 0, 16(1)
; PPC64LE-NEXT: stdu 1, -32(1)
; PPC64LE-NEXT: addis 3, 2, .LC0@toc@ha
-; PPC64LE-NEXT: ld 3, .LC0@toc@l(3)
; PPC64LE-NEXT: li 4, 0
+; PPC64LE-NEXT: ld 3, .LC0@toc@l(3)
; PPC64LE-NEXT: std 4, 0(3)
; PPC64LE-NEXT: bl barney.94
; PPC64LE-NEXT: nop
; P9LE-LABEL: s2v_test_f2:
; P9LE: # %bb.0: # %entry
; P9LE-NEXT: addi r3, r3, 4
-; P9LE-NEXT: lxsiwzx v3, 0, r3
; P9LE-NEXT: vmrglw v2, v2, v2
+; P9LE-NEXT: lxsiwzx v3, 0, r3
; P9LE-NEXT: vmrghw v2, v2, v3
; P9LE-NEXT: blr
; P9LE-LABEL: s2v_test_f3:
; P9LE: # %bb.0: # %entry
; P9LE-NEXT: sldi r4, r7, 2
-; P9LE-NEXT: lxsiwzx v3, r3, r4
; P9LE-NEXT: vmrglw v2, v2, v2
+; P9LE-NEXT: lxsiwzx v3, r3, r4
; P9LE-NEXT: vmrghw v2, v2, v3
; P9LE-NEXT: blr
; P9BE-LABEL: s2v_test_f3:
; P9BE: # %bb.0: # %entry
; P9BE: sldi r4, r7, 2
-; P9BE: lfiwzx f0, r3, r4
+; P9BE-DAG: lfiwzx f0, r3, r4
; P9BE-DAG: xxspltw v2, v2, 1
-; P9BE-DAG: xxsldwi v3, f0, f0, 1
+; P9BE: xxsldwi v3, f0, f0, 1
; P9BE: vmrghw v2, v3, v2
; P9BE-NEXT: blr
; P9LE-LABEL: s2v_test_f4:
; P9LE: # %bb.0: # %entry
; P9LE-NEXT: addi r3, r3, 4
-; P9LE-NEXT: lxsiwzx v3, 0, r3
; P9LE-NEXT: vmrglw v2, v2, v2
+; P9LE-NEXT: lxsiwzx v3, 0, r3
; P9LE-NEXT: vmrghw v2, v2, v3
; P9LE-NEXT: blr
; P9BE-LABEL: s2v_test_f4:
; P9BE: # %bb.0: # %entry
; P9BE: addi r3, r3, 4
-; P9BE: lfiwzx f0, 0, r3
+; P9BE-DAG: lfiwzx f0, 0, r3
; P9BE-DAG: xxspltw v2, v2, 1
-; P9BE-DAG: xxsldwi v3, f0, f0, 1
+; P9BE: xxsldwi v3, f0, f0, 1
; P9BE: vmrghw v2, v3, v2
; P9BE-NEXT: blr
; CHECK-P9-NEXT: addi 6, 6, 16
; CHECK-P9-NEXT: rldicr 5, 5, 0, 58
; CHECK-P9-NEXT: addi 5, 5, -32
+; CHECK-P9-NEXT: lxvdsx 0, 0, 6
; CHECK-P9-NEXT: rldicl 5, 5, 59, 5
; CHECK-P9-NEXT: addi 5, 5, 1
-; CHECK-P9-NEXT: lxvdsx 0, 0, 6
; CHECK-P9-NEXT: mtctr 5
; CHECK-P9-NEXT: .p2align 4
; CHECK-P9-NEXT: .LBB0_1: # %vector.body
; CHECK-P9-NEXT: xvmuldp 4, 4, 0
; CHECK-P9-NEXT: xvmuldp 3, 3, 0
; CHECK-P9-NEXT: xvmuldp 5, 5, 0
+; CHECK-P9-NEXT: addi 4, 4, 256
+; CHECK-P9-NEXT: xvmuldp 6, 6, 0
; CHECK-P9-NEXT: stxv 1, 16(3)
+; CHECK-P9-NEXT: stxv 2, 0(3)
; CHECK-P9-NEXT: stxv 3, 48(3)
; CHECK-P9-NEXT: stxv 4, 32(3)
; CHECK-P9-NEXT: stxv 5, 240(3)
-; CHECK-P9-NEXT: addi 4, 4, 256
-; CHECK-P9-NEXT: xvmuldp 6, 6, 0
-; CHECK-P9-NEXT: stxv 2, 0(3)
; CHECK-P9-NEXT: stxv 6, 224(3)
; CHECK-P9-NEXT: addi 3, 3, 256
; CHECK-P9-NEXT: bdnz .LBB0_1
; CHECK-P9-NO-HEURISTIC-NEXT: rldicr 5, 5, 0, 58
; CHECK-P9-NO-HEURISTIC-NEXT: addi 6, 6, 16
; CHECK-P9-NO-HEURISTIC-NEXT: addi 5, 5, -32
+; CHECK-P9-NO-HEURISTIC-NEXT: lxvdsx 0, 0, 6
; CHECK-P9-NO-HEURISTIC-NEXT: rldicl 5, 5, 59, 5
; CHECK-P9-NO-HEURISTIC-NEXT: addi 5, 5, 1
-; CHECK-P9-NO-HEURISTIC-NEXT: lxvdsx 0, 0, 6
; CHECK-P9-NO-HEURISTIC-NEXT: mtctr 5
; CHECK-P9-NO-HEURISTIC-NEXT: .p2align 4
; CHECK-P9-NO-HEURISTIC-NEXT: .LBB0_1: # %vector.body
; CHECK-P9-NO-HEURISTIC-NEXT: xvmuldp 3, 3, 0
; CHECK-P9-NO-HEURISTIC-NEXT: xvmuldp 6, 6, 0
; CHECK-P9-NO-HEURISTIC-NEXT: xvmuldp 5, 5, 0
+; CHECK-P9-NO-HEURISTIC-NEXT: addi 4, 4, 256
; CHECK-P9-NO-HEURISTIC-NEXT: stxv 1, 16(3)
; CHECK-P9-NO-HEURISTIC-NEXT: stxv 2, 0(3)
; CHECK-P9-NO-HEURISTIC-NEXT: stxv 3, 48(3)
; CHECK-P9-NO-HEURISTIC-NEXT: stxv 4, 32(3)
; CHECK-P9-NO-HEURISTIC-NEXT: stxv 5, 240(3)
; CHECK-P9-NO-HEURISTIC-NEXT: stxv 6, 224(3)
-; CHECK-P9-NO-HEURISTIC-NEXT: addi 4, 4, 256
; CHECK-P9-NO-HEURISTIC-NEXT: addi 3, 3, 256
; CHECK-P9-NO-HEURISTIC-NEXT: bdnz .LBB0_1
; CHECK-P9-NO-HEURISTIC-NEXT: # %bb.2: # %return.block
; CHECK-NEXT: isellt 3, 3, 4
; CHECK-NEXT: li 4, 0
; CHECK-NEXT: addi 3, 3, 1
-; CHECK-NEXT: mtctr 3
-; CHECK-NEXT: li 3, 1
; CHECK-NEXT: li 7, -1
; CHECK-NEXT: li 5, 0
+; CHECK-NEXT: mtctr 3
+; CHECK-NEXT: li 3, 1
; CHECK-NEXT: lbz 5, 0(5)
; CHECK-NEXT: bdz .LBB0_6
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: add 4, 4, 6
; CHECK-NEXT: .LBB0_6:
; CHECK-NEXT: xori 5, 5, 84
-; CHECK-NEXT: cntlzw 5, 5
; CHECK-NEXT: clrldi 3, 3, 32
+; CHECK-NEXT: li 7, 0
+; CHECK-NEXT: li 8, 3
; CHECK-NEXT: std 3, 104(1)
+; CHECK-NEXT: cntlzw 5, 5
; CHECK-NEXT: addis 3, 2, .LC0@toc@ha
+; CHECK-NEXT: li 10, 0
; CHECK-NEXT: ld 3, .LC0@toc@l(3)
-; CHECK-NEXT: li 7, 0
-; CHECK-NEXT: li 8, 3
; CHECK-NEXT: srwi 5, 5, 5
; CHECK-NEXT: add 4, 4, 5
; CHECK-NEXT: li 5, 0
; CHECK-NEXT: std 5, 120(1)
; CHECK-NEXT: li 5, 3
-; CHECK-NEXT: std 5, 96(1)
; CHECK-NEXT: clrldi 6, 4, 32
; CHECK-NEXT: li 4, 3
+; CHECK-NEXT: std 5, 96(1)
; CHECK-NEXT: li 5, 0
-; CHECK-NEXT: li 10, 0
; CHECK-NEXT: bl printf
; CHECK-NEXT: nop
%1 = load i32, i32* undef, align 4
; CHECK: # %bb.0:
; CHECK-NEXT: lha 3, 0(3)
; CHECK-NEXT: li 5, 1
-; CHECK-NEXT: sldi 5, 5, 62
; CHECK-NEXT: lhz 4, 0(0)
+; CHECK-NEXT: sldi 5, 5, 62
; CHECK-NEXT: mtctr 5
; CHECK-NEXT: srawi 3, 3, 1
; CHECK-NEXT: addze 3, 3
; CHECK-NEXT: nop
; CHECK-NEXT: addi 7, 30, -4
; CHECK-NEXT: mtctr 3
-; CHECK-NEXT: lwzu 8, 4(7)
; CHECK-NEXT: addi 4, 29, -8
; CHECK-NEXT: li 5, 0
+; CHECK-NEXT: lwzu 8, 4(7)
; CHECK-NEXT: bdz .LBB0_5
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: extswsli 6, 5, 5
; CHECK-LABEL: foo:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: addis r5, r2, x@toc@ha
-; CHECK-NEXT: addi r5, r5, x@toc@l
-; CHECK-NEXT: addi r5, r5, -8
; CHECK-NEXT: addis r6, r2, y@toc@ha
; CHECK-NEXT: li r7, 340
+; CHECK-NEXT: addi r5, r5, x@toc@l
+; CHECK-NEXT: addi r5, r5, -8
; CHECK-NEXT: addi r3, r6, y@toc@l
; CHECK-NEXT: lwz r6, y@toc@l(r6)
; CHECK-NEXT: mtctr r7
+; CHECK-NEXT: addi r4, r3, -8
; CHECK-NEXT: lwzu r7, 12(r5)
; CHECK-NEXT: maddld r6, r7, r7, r6
; CHECK-NEXT: lwz r7, 4(r5)
-; CHECK-NEXT: addi r4, r3, -8
; CHECK-NEXT: stwu r6, 12(r4)
; CHECK-NEXT: maddld r6, r7, r7, r6
; CHECK-NEXT: lwz r7, 8(r5)
; CHECK-NEXT: #
; CHECK-NEXT: maddld r7, r7, r7, r6
; CHECK-NEXT: lwzu r8, 12(r5)
-; CHECK-NEXT: maddld r8, r8, r8, r7
; CHECK-NEXT: stw r6, 4(r4)
; CHECK-NEXT: lwz r6, 4(r5)
-; CHECK-NEXT: maddld r6, r6, r6, r8
+; CHECK-NEXT: maddld r8, r8, r8, r7
; CHECK-NEXT: stw r7, 8(r4)
; CHECK-NEXT: lwz r7, 8(r5)
+; CHECK-NEXT: maddld r6, r6, r6, r8
; CHECK-NEXT: stwu r8, 12(r4)
; CHECK-NEXT: bdnz .LBB0_1
; CHECK-NEXT: # %bb.2:
; P9LE-LABEL: fold_srem_vec_1:
; P9LE: # %bb.0:
; P9LE-NEXT: li r3, 0
-; P9LE-NEXT: vextuhrx r3, r3, v2
; P9LE-NEXT: lis r4, -21386
+; P9LE-NEXT: vextuhrx r3, r3, v2
; P9LE-NEXT: ori r4, r4, 37253
; P9LE-NEXT: extsh r3, r3
; P9LE-NEXT: mulhw r4, r3, r4
; P9LE-NEXT: lis r4, 31710
; P9LE-NEXT: mtvsrd v3, r3
; P9LE-NEXT: li r3, 2
+; P9LE-NEXT: ori r4, r4, 63421
; P9LE-NEXT: vextuhrx r3, r3, v2
; P9LE-NEXT: extsh r3, r3
-; P9LE-NEXT: ori r4, r4, 63421
; P9LE-NEXT: mulhw r4, r3, r4
; P9LE-NEXT: sub r4, r4, r3
; P9LE-NEXT: srwi r5, r4, 31
; P9LE-NEXT: lis r4, 21399
; P9LE-NEXT: mtvsrd v4, r3
; P9LE-NEXT: li r3, 4
+; P9LE-NEXT: ori r4, r4, 33437
; P9LE-NEXT: vextuhrx r3, r3, v2
+; P9LE-NEXT: vmrghh v3, v4, v3
; P9LE-NEXT: extsh r3, r3
-; P9LE-NEXT: ori r4, r4, 33437
; P9LE-NEXT: mulhw r4, r3, r4
; P9LE-NEXT: srwi r5, r4, 31
; P9LE-NEXT: srawi r4, r4, 5
; P9LE-NEXT: add r4, r4, r5
; P9LE-NEXT: mulli r4, r4, 98
; P9LE-NEXT: sub r3, r3, r4
-; P9LE-NEXT: vmrghh v3, v4, v3
+; P9LE-NEXT: lis r4, -16728
; P9LE-NEXT: mtvsrd v4, r3
; P9LE-NEXT: li r3, 6
-; P9LE-NEXT: vextuhrx r3, r3, v2
-; P9LE-NEXT: lis r4, -16728
; P9LE-NEXT: ori r4, r4, 63249
+; P9LE-NEXT: vextuhrx r3, r3, v2
; P9LE-NEXT: extsh r3, r3
; P9LE-NEXT: mulhw r4, r3, r4
; P9LE-NEXT: srwi r5, r4, 31
; P9BE-LABEL: fold_srem_vec_1:
; P9BE: # %bb.0:
; P9BE-NEXT: li r3, 2
-; P9BE-NEXT: vextuhlx r3, r3, v2
; P9BE-NEXT: lis r4, 31710
+; P9BE-NEXT: vextuhlx r3, r3, v2
; P9BE-NEXT: ori r4, r4, 63421
; P9BE-NEXT: extsh r3, r3
; P9BE-NEXT: mulhw r4, r3, r4
; P9BE-NEXT: sub r3, r3, r4
; P9BE-NEXT: lis r4, -21386
; P9BE-NEXT: sldi r3, r3, 48
+; P9BE-NEXT: ori r4, r4, 37253
; P9BE-NEXT: mtvsrd v3, r3
; P9BE-NEXT: li r3, 0
; P9BE-NEXT: vextuhlx r3, r3, v2
; P9BE-NEXT: extsh r3, r3
-; P9BE-NEXT: ori r4, r4, 37253
; P9BE-NEXT: mulhw r4, r3, r4
; P9BE-NEXT: add r4, r4, r3
; P9BE-NEXT: srwi r5, r4, 31
; P9BE-NEXT: sub r3, r3, r4
; P9BE-NEXT: lis r4, -16728
; P9BE-NEXT: sldi r3, r3, 48
+; P9BE-NEXT: ori r4, r4, 63249
; P9BE-NEXT: mtvsrd v4, r3
; P9BE-NEXT: li r3, 6
; P9BE-NEXT: vextuhlx r3, r3, v2
+; P9BE-NEXT: vmrghh v3, v4, v3
; P9BE-NEXT: extsh r3, r3
-; P9BE-NEXT: ori r4, r4, 63249
; P9BE-NEXT: mulhw r4, r3, r4
; P9BE-NEXT: srwi r5, r4, 31
; P9BE-NEXT: srawi r4, r4, 8
; P9BE-NEXT: sub r3, r3, r4
; P9BE-NEXT: lis r4, 21399
; P9BE-NEXT: sldi r3, r3, 48
-; P9BE-NEXT: vmrghh v3, v4, v3
+; P9BE-NEXT: ori r4, r4, 33437
; P9BE-NEXT: mtvsrd v4, r3
; P9BE-NEXT: li r3, 4
; P9BE-NEXT: vextuhlx r3, r3, v2
; P9BE-NEXT: extsh r3, r3
-; P9BE-NEXT: ori r4, r4, 33437
; P9BE-NEXT: mulhw r4, r3, r4
; P9BE-NEXT: srwi r5, r4, 31
; P9BE-NEXT: srawi r4, r4, 5
; P9LE-LABEL: fold_srem_vec_2:
; P9LE: # %bb.0:
; P9LE-NEXT: li r3, 0
-; P9LE-NEXT: vextuhrx r3, r3, v2
; P9LE-NEXT: lis r4, -21386
+; P9LE-NEXT: vextuhrx r3, r3, v2
; P9LE-NEXT: ori r4, r4, 37253
; P9LE-NEXT: extsh r3, r3
; P9LE-NEXT: mulhw r5, r3, r4
; P9LE-NEXT: mtvsrd v4, r3
; P9LE-NEXT: li r3, 4
; P9LE-NEXT: vextuhrx r3, r3, v2
+; P9LE-NEXT: vmrghh v3, v4, v3
; P9LE-NEXT: extsh r3, r3
; P9LE-NEXT: mulhw r5, r3, r4
; P9LE-NEXT: add r5, r5, r3
; P9LE-NEXT: add r5, r5, r6
; P9LE-NEXT: mulli r5, r5, 95
; P9LE-NEXT: sub r3, r3, r5
-; P9LE-NEXT: vmrghh v3, v4, v3
; P9LE-NEXT: mtvsrd v4, r3
; P9LE-NEXT: li r3, 6
; P9LE-NEXT: vextuhrx r3, r3, v2
; P9BE-LABEL: fold_srem_vec_2:
; P9BE: # %bb.0:
; P9BE-NEXT: li r3, 6
-; P9BE-NEXT: vextuhlx r3, r3, v2
; P9BE-NEXT: lis r4, -21386
+; P9BE-NEXT: vextuhlx r3, r3, v2
; P9BE-NEXT: ori r4, r4, 37253
; P9BE-NEXT: extsh r3, r3
; P9BE-NEXT: mulhw r5, r3, r4
; P9BE-NEXT: mtvsrd v4, r3
; P9BE-NEXT: li r3, 2
; P9BE-NEXT: vextuhlx r3, r3, v2
+; P9BE-NEXT: vmrghh v3, v4, v3
; P9BE-NEXT: extsh r3, r3
; P9BE-NEXT: mulhw r5, r3, r4
; P9BE-NEXT: add r5, r5, r3
; P9BE-NEXT: mulli r5, r5, 95
; P9BE-NEXT: sub r3, r3, r5
; P9BE-NEXT: sldi r3, r3, 48
-; P9BE-NEXT: vmrghh v3, v4, v3
; P9BE-NEXT: mtvsrd v4, r3
; P9BE-NEXT: li r3, 0
; P9BE-NEXT: vextuhlx r3, r3, v2
; P9LE-LABEL: combine_srem_sdiv:
; P9LE: # %bb.0:
; P9LE-NEXT: li r3, 0
-; P9LE-NEXT: vextuhrx r3, r3, v2
; P9LE-NEXT: lis r4, -21386
+; P9LE-NEXT: vextuhrx r3, r3, v2
; P9LE-NEXT: ori r4, r4, 37253
; P9LE-NEXT: extsh r3, r3
; P9LE-NEXT: mulhw r5, r3, r4
; P9LE-NEXT: mtvsrd v4, r3
; P9LE-NEXT: li r3, 4
; P9LE-NEXT: vextuhrx r3, r3, v2
+; P9LE-NEXT: vmrghh v3, v4, v3
; P9LE-NEXT: extsh r7, r3
; P9LE-NEXT: mulhw r8, r7, r4
; P9LE-NEXT: add r7, r8, r7
; P9LE-NEXT: add r7, r7, r8
; P9LE-NEXT: mulli r8, r7, 95
; P9LE-NEXT: sub r3, r3, r8
-; P9LE-NEXT: vmrghh v3, v4, v3
; P9LE-NEXT: mtvsrd v4, r3
; P9LE-NEXT: li r3, 6
; P9LE-NEXT: vextuhrx r3, r3, v2
; P9LE-NEXT: srawi r4, r4, 6
; P9LE-NEXT: add r4, r4, r8
; P9LE-NEXT: mulli r8, r4, 95
+; P9LE-NEXT: mtvsrd v5, r4
; P9LE-NEXT: sub r3, r3, r8
; P9LE-NEXT: mtvsrd v2, r3
; P9LE-NEXT: vmrghh v2, v2, v4
; P9LE-NEXT: mtvsrd v3, r5
; P9LE-NEXT: vmrghh v3, v4, v3
; P9LE-NEXT: mtvsrd v4, r7
-; P9LE-NEXT: mtvsrd v5, r4
; P9LE-NEXT: vmrghh v4, v5, v4
; P9LE-NEXT: vmrglw v3, v4, v3
; P9LE-NEXT: vadduhm v2, v2, v3
; P9BE-LABEL: combine_srem_sdiv:
; P9BE: # %bb.0:
; P9BE-NEXT: li r3, 6
-; P9BE-NEXT: vextuhlx r3, r3, v2
; P9BE-NEXT: lis r5, -21386
+; P9BE-NEXT: vextuhlx r3, r3, v2
; P9BE-NEXT: ori r5, r5, 37253
; P9BE-NEXT: extsh r4, r3
; P9BE-NEXT: mulhw r6, r4, r5
; P9BE-NEXT: mtvsrd v4, r3
; P9BE-NEXT: li r3, 2
; P9BE-NEXT: vextuhlx r3, r3, v2
+; P9BE-NEXT: vmrghh v3, v4, v3
; P9BE-NEXT: extsh r7, r3
; P9BE-NEXT: mulhw r8, r7, r5
; P9BE-NEXT: add r7, r8, r7
; P9BE-NEXT: mulli r8, r7, 95
; P9BE-NEXT: sub r3, r3, r8
; P9BE-NEXT: sldi r3, r3, 48
-; P9BE-NEXT: vmrghh v3, v4, v3
; P9BE-NEXT: mtvsrd v4, r3
; P9BE-NEXT: li r3, 0
; P9BE-NEXT: vextuhlx r3, r3, v2
; P9LE-NEXT: lis r4, -21386
; P9LE-NEXT: mtvsrd v4, r3
; P9LE-NEXT: li r3, 6
+; P9LE-NEXT: ori r4, r4, 37253
; P9LE-NEXT: vextuhrx r3, r3, v2
+; P9LE-NEXT: vmrghh v3, v4, v3
; P9LE-NEXT: extsh r3, r3
-; P9LE-NEXT: ori r4, r4, 37253
; P9LE-NEXT: mulhw r4, r3, r4
; P9LE-NEXT: add r4, r4, r3
; P9LE-NEXT: srwi r5, r4, 31
; P9LE-NEXT: add r4, r4, r5
; P9LE-NEXT: mulli r4, r4, 95
; P9LE-NEXT: sub r3, r3, r4
-; P9LE-NEXT: vmrghh v3, v4, v3
; P9LE-NEXT: mtvsrd v4, r3
; P9LE-NEXT: li r3, 4
; P9LE-NEXT: vextuhrx r3, r3, v2
; P9BE-NEXT: sub r3, r3, r4
; P9BE-NEXT: lis r4, -21386
; P9BE-NEXT: sldi r3, r3, 48
+; P9BE-NEXT: ori r4, r4, 37253
; P9BE-NEXT: mtvsrd v4, r3
; P9BE-NEXT: li r3, 6
; P9BE-NEXT: vextuhlx r3, r3, v2
+; P9BE-NEXT: vmrghh v3, v4, v3
; P9BE-NEXT: extsh r3, r3
-; P9BE-NEXT: ori r4, r4, 37253
; P9BE-NEXT: mulhw r4, r3, r4
; P9BE-NEXT: add r4, r4, r3
; P9BE-NEXT: srwi r5, r4, 31
; P9BE-NEXT: mulli r4, r4, 95
; P9BE-NEXT: sub r3, r3, r4
; P9BE-NEXT: sldi r3, r3, 48
-; P9BE-NEXT: vmrghh v3, v4, v3
; P9BE-NEXT: mtvsrd v4, r3
; P9BE-NEXT: li r3, 4
; P9BE-NEXT: vextuhlx r3, r3, v2
; P9LE-LABEL: dont_fold_srem_one:
; P9LE: # %bb.0:
; P9LE-NEXT: li r3, 2
-; P9LE-NEXT: vextuhrx r3, r3, v2
; P9LE-NEXT: lis r4, -14230
+; P9LE-NEXT: vextuhrx r3, r3, v2
; P9LE-NEXT: ori r4, r4, 30865
; P9LE-NEXT: extsh r3, r3
; P9LE-NEXT: mulhw r4, r3, r4
; P9LE-NEXT: lis r4, -19946
; P9LE-NEXT: mtvsrd v3, r3
; P9LE-NEXT: li r3, 0
+; P9LE-NEXT: ori r4, r4, 17097
; P9LE-NEXT: mtvsrd v4, r3
; P9LE-NEXT: li r3, 4
; P9LE-NEXT: vextuhrx r3, r3, v2
+; P9LE-NEXT: vmrghh v3, v3, v4
; P9LE-NEXT: extsh r3, r3
-; P9LE-NEXT: ori r4, r4, 17097
; P9LE-NEXT: mulhw r4, r3, r4
; P9LE-NEXT: add r4, r4, r3
; P9LE-NEXT: srwi r5, r4, 31
; P9LE-NEXT: add r4, r4, r5
; P9LE-NEXT: mulli r4, r4, 23
; P9LE-NEXT: sub r3, r3, r4
-; P9LE-NEXT: vmrghh v3, v3, v4
+; P9LE-NEXT: lis r4, 24749
; P9LE-NEXT: mtvsrd v4, r3
; P9LE-NEXT: li r3, 6
-; P9LE-NEXT: vextuhrx r3, r3, v2
-; P9LE-NEXT: lis r4, 24749
; P9LE-NEXT: ori r4, r4, 47143
+; P9LE-NEXT: vextuhrx r3, r3, v2
; P9LE-NEXT: extsh r3, r3
; P9LE-NEXT: mulhw r4, r3, r4
; P9LE-NEXT: srwi r5, r4, 31
; P9BE-LABEL: dont_fold_srem_one:
; P9BE: # %bb.0:
; P9BE-NEXT: li r3, 4
-; P9BE-NEXT: vextuhlx r3, r3, v2
; P9BE-NEXT: lis r4, -19946
+; P9BE-NEXT: vextuhlx r3, r3, v2
; P9BE-NEXT: ori r4, r4, 17097
; P9BE-NEXT: extsh r3, r3
; P9BE-NEXT: mulhw r4, r3, r4
; P9BE-NEXT: sub r3, r3, r4
; P9BE-NEXT: lis r4, 24749
; P9BE-NEXT: sldi r3, r3, 48
+; P9BE-NEXT: ori r4, r4, 47143
; P9BE-NEXT: mtvsrd v3, r3
; P9BE-NEXT: li r3, 6
; P9BE-NEXT: vextuhlx r3, r3, v2
; P9BE-NEXT: extsh r3, r3
-; P9BE-NEXT: ori r4, r4, 47143
; P9BE-NEXT: mulhw r4, r3, r4
; P9BE-NEXT: srwi r5, r4, 31
; P9BE-NEXT: srawi r4, r4, 11
; P9BE-NEXT: sub r3, r3, r4
; P9BE-NEXT: lis r4, -14230
; P9BE-NEXT: sldi r3, r3, 48
+; P9BE-NEXT: ori r4, r4, 30865
; P9BE-NEXT: mtvsrd v4, r3
; P9BE-NEXT: li r3, 2
; P9BE-NEXT: vextuhlx r3, r3, v2
+; P9BE-NEXT: vmrghh v3, v3, v4
; P9BE-NEXT: extsh r3, r3
-; P9BE-NEXT: ori r4, r4, 30865
; P9BE-NEXT: mulhw r4, r3, r4
; P9BE-NEXT: add r4, r4, r3
; P9BE-NEXT: srwi r5, r4, 31
; P9BE-NEXT: mtvsrd v2, r3
; P9BE-NEXT: li r3, 0
; P9BE-NEXT: sldi r3, r3, 48
-; P9BE-NEXT: vmrghh v3, v3, v4
; P9BE-NEXT: mtvsrd v4, r3
; P9BE-NEXT: vmrghh v2, v4, v2
; P9BE-NEXT: vmrghw v2, v2, v3
; P9LE-LABEL: dont_fold_urem_i16_smax:
; P9LE: # %bb.0:
; P9LE-NEXT: li r3, 4
-; P9LE-NEXT: vextuhrx r3, r3, v2
; P9LE-NEXT: lis r4, -19946
+; P9LE-NEXT: vextuhrx r3, r3, v2
; P9LE-NEXT: ori r4, r4, 17097
; P9LE-NEXT: extsh r3, r3
; P9LE-NEXT: mulhw r4, r3, r4
; P9LE-NEXT: lis r4, 24749
; P9LE-NEXT: mtvsrd v3, r3
; P9LE-NEXT: li r3, 6
+; P9LE-NEXT: ori r4, r4, 47143
; P9LE-NEXT: vextuhrx r3, r3, v2
; P9LE-NEXT: extsh r3, r3
-; P9LE-NEXT: ori r4, r4, 47143
; P9LE-NEXT: mulhw r4, r3, r4
; P9LE-NEXT: srwi r5, r4, 31
; P9LE-NEXT: srawi r4, r4, 11
; P9LE-NEXT: mtvsrd v4, r3
; P9LE-NEXT: li r3, 2
; P9LE-NEXT: vextuhrx r3, r3, v2
+; P9LE-NEXT: vmrghh v3, v4, v3
; P9LE-NEXT: extsh r3, r3
; P9LE-NEXT: srawi r4, r3, 15
; P9LE-NEXT: addze r4, r4
; P9LE-NEXT: sub r3, r3, r4
; P9LE-NEXT: mtvsrd v2, r3
; P9LE-NEXT: li r3, 0
-; P9LE-NEXT: vmrghh v3, v4, v3
; P9LE-NEXT: mtvsrd v4, r3
; P9LE-NEXT: vmrghh v2, v2, v4
; P9LE-NEXT: vmrglw v2, v3, v2
; P9BE-LABEL: dont_fold_urem_i16_smax:
; P9BE: # %bb.0:
; P9BE-NEXT: li r3, 4
-; P9BE-NEXT: vextuhlx r3, r3, v2
; P9BE-NEXT: lis r4, -19946
+; P9BE-NEXT: vextuhlx r3, r3, v2
; P9BE-NEXT: ori r4, r4, 17097
; P9BE-NEXT: extsh r3, r3
; P9BE-NEXT: mulhw r4, r3, r4
; P9BE-NEXT: sub r3, r3, r4
; P9BE-NEXT: lis r4, 24749
; P9BE-NEXT: sldi r3, r3, 48
+; P9BE-NEXT: ori r4, r4, 47143
; P9BE-NEXT: mtvsrd v3, r3
; P9BE-NEXT: li r3, 6
; P9BE-NEXT: vextuhlx r3, r3, v2
; P9BE-NEXT: extsh r3, r3
-; P9BE-NEXT: ori r4, r4, 47143
; P9BE-NEXT: mulhw r4, r3, r4
; P9BE-NEXT: srwi r5, r4, 31
; P9BE-NEXT: srawi r4, r4, 11
; P9BE-NEXT: mtvsrd v4, r3
; P9BE-NEXT: li r3, 2
; P9BE-NEXT: vextuhlx r3, r3, v2
+; P9BE-NEXT: vmrghh v3, v3, v4
; P9BE-NEXT: extsh r3, r3
; P9BE-NEXT: srawi r4, r3, 15
; P9BE-NEXT: addze r4, r4
; P9BE-NEXT: mtvsrd v2, r3
; P9BE-NEXT: li r3, 0
; P9BE-NEXT: sldi r3, r3, 48
-; P9BE-NEXT: vmrghh v3, v3, v4
; P9BE-NEXT: mtvsrd v4, r3
; P9BE-NEXT: vmrghh v2, v4, v2
; P9BE-NEXT: vmrghw v2, v2, v3
; P9LE-LABEL: dont_fold_srem_i64:
; P9LE: # %bb.0:
; P9LE-NEXT: lis r4, 24749
+; P9LE-NEXT: mfvsrd r3, v3
; P9LE-NEXT: ori r4, r4, 47142
; P9LE-NEXT: sldi r4, r4, 32
; P9LE-NEXT: oris r4, r4, 58853
-; P9LE-NEXT: mfvsrd r3, v3
; P9LE-NEXT: ori r4, r4, 6055
; P9LE-NEXT: mulhd r4, r3, r4
; P9LE-NEXT: rldicl r5, r4, 1, 63
; P9LE-NEXT: sub r4, r4, r5
; P9LE-NEXT: mtvsrdd v3, r3, r4
; P9LE-NEXT: lis r4, 25653
+; P9LE-NEXT: mfvsrd r3, v2
; P9LE-NEXT: ori r4, r4, 15432
; P9LE-NEXT: sldi r4, r4, 32
; P9LE-NEXT: oris r4, r4, 1603
-; P9LE-NEXT: mfvsrd r3, v2
; P9LE-NEXT: ori r4, r4, 21445
; P9LE-NEXT: mulhd r4, r3, r4
; P9LE-NEXT: rldicl r5, r4, 1, 63
; P9BE-LABEL: dont_fold_srem_i64:
; P9BE: # %bb.0:
; P9BE-NEXT: lis r4, 24749
+; P9BE-NEXT: mfvsrld r3, v3
; P9BE-NEXT: ori r4, r4, 47142
; P9BE-NEXT: sldi r4, r4, 32
; P9BE-NEXT: oris r4, r4, 58853
-; P9BE-NEXT: mfvsrld r3, v3
; P9BE-NEXT: ori r4, r4, 6055
; P9BE-NEXT: mulhd r4, r3, r4
; P9BE-NEXT: rldicl r5, r4, 1, 63
; P9BE-NEXT: sub r4, r4, r5
; P9BE-NEXT: mtvsrdd v3, r4, r3
; P9BE-NEXT: lis r4, 25653
+; P9BE-NEXT: mfvsrld r3, v2
; P9BE-NEXT: ori r4, r4, 15432
; P9BE-NEXT: sldi r4, r4, 32
; P9BE-NEXT: oris r4, r4, 1603
-; P9BE-NEXT: mfvsrld r3, v2
; P9BE-NEXT: ori r4, r4, 21445
; P9BE-NEXT: mulhd r4, r3, r4
; P9BE-NEXT: rldicl r5, r4, 1, 63
; CHECK-P9-LE-NEXT: stdu r1, -48(r1)
; CHECK-P9-LE-NEXT: rldic r3, r3, 2, 30
; CHECK-P9-LE-NEXT: addi r3, r3, 15
+; CHECK-P9-LE-NEXT: li r6, -32768
+; CHECK-P9-LE-NEXT: mr r31, r1
+; CHECK-P9-LE-NEXT: addi r4, r31, 48
; CHECK-P9-LE-NEXT: rldicl r3, r3, 60, 4
; CHECK-P9-LE-NEXT: rldicl r3, r3, 4, 29
; CHECK-P9-LE-NEXT: neg r5, r3
-; CHECK-P9-LE-NEXT: li r6, -32768
; CHECK-P9-LE-NEXT: divd r7, r5, r6
-; CHECK-P9-LE-NEXT: mulld r6, r7, r6
-; CHECK-P9-LE-NEXT: mr r31, r1
-; CHECK-P9-LE-NEXT: addi r4, r31, 48
; CHECK-P9-LE-NEXT: add r3, r1, r5
+; CHECK-P9-LE-NEXT: mulld r6, r7, r6
; CHECK-P9-LE-NEXT: sub r5, r5, r6
; CHECK-P9-LE-NEXT: stdux r4, r1, r5
; CHECK-P9-LE-NEXT: cmpd r1, r3
; CHECK-P9-LE-NEXT: cmpd r1, r3
; CHECK-P9-LE-NEXT: bne cr0, .LBB0_1
; CHECK-P9-LE-NEXT: .LBB0_2:
-; CHECK-P9-LE-NEXT: addi r3, r1, 32
; CHECK-P9-LE-NEXT: li r4, 1
+; CHECK-P9-LE-NEXT: addi r3, r1, 32
; CHECK-P9-LE-NEXT: stw r4, 4792(r3)
; CHECK-P9-LE-NEXT: lwz r3, 0(r3)
; CHECK-P9-LE-NEXT: ld r1, 0(r1)
; CHECK-P9-LE-NEXT: stdu r1, -48(r1)
; CHECK-P9-LE-NEXT: rldic r4, r3, 2, 30
; CHECK-P9-LE-NEXT: addi r4, r4, 15
+; CHECK-P9-LE-NEXT: li r7, -4096
+; CHECK-P9-LE-NEXT: mr r31, r1
+; CHECK-P9-LE-NEXT: addi r5, r31, 48
; CHECK-P9-LE-NEXT: rldicl r4, r4, 60, 4
; CHECK-P9-LE-NEXT: rldicl r4, r4, 4, 29
; CHECK-P9-LE-NEXT: neg r6, r4
-; CHECK-P9-LE-NEXT: li r7, -4096
; CHECK-P9-LE-NEXT: divd r8, r6, r7
-; CHECK-P9-LE-NEXT: mulld r7, r8, r7
-; CHECK-P9-LE-NEXT: mr r31, r1
-; CHECK-P9-LE-NEXT: addi r5, r31, 48
; CHECK-P9-LE-NEXT: add r4, r1, r6
+; CHECK-P9-LE-NEXT: mulld r7, r8, r7
; CHECK-P9-LE-NEXT: sub r6, r6, r7
; CHECK-P9-LE-NEXT: stdux r5, r1, r6
; CHECK-P9-LE-NEXT: cmpd r1, r4
; CHECK-P9-LE-NEXT: cmpd r1, r4
; CHECK-P9-LE-NEXT: bne cr0, .LBB1_1
; CHECK-P9-LE-NEXT: .LBB1_2:
-; CHECK-P9-LE-NEXT: addi r4, r1, 32
; CHECK-P9-LE-NEXT: extswsli r3, r3, 2
-; CHECK-P9-LE-NEXT: add r3, r4, r3
; CHECK-P9-LE-NEXT: li r5, 1
+; CHECK-P9-LE-NEXT: addi r4, r1, 32
+; CHECK-P9-LE-NEXT: add r3, r4, r3
; CHECK-P9-LE-NEXT: stw r5, 4096(r3)
; CHECK-P9-LE-NEXT: lwz r3, 0(r4)
; CHECK-P9-LE-NEXT: ld r1, 0(r1)
; CHECK-P9-LE-NEXT: stdu r1, -48(r1)
; CHECK-P9-LE-NEXT: rldic r3, r3, 2, 30
; CHECK-P9-LE-NEXT: addi r3, r3, 15
-; CHECK-P9-LE-NEXT: rldicl r3, r3, 60, 4
-; CHECK-P9-LE-NEXT: rldicl r3, r3, 4, 29
; CHECK-P9-LE-NEXT: lis r5, -1
; CHECK-P9-LE-NEXT: ori r5, r5, 0
-; CHECK-P9-LE-NEXT: neg r6, r3
-; CHECK-P9-LE-NEXT: divd r7, r6, r5
-; CHECK-P9-LE-NEXT: mulld r7, r7, r5
; CHECK-P9-LE-NEXT: mr r31, r1
; CHECK-P9-LE-NEXT: addi r4, r31, 48
+; CHECK-P9-LE-NEXT: rldicl r3, r3, 60, 4
+; CHECK-P9-LE-NEXT: rldicl r3, r3, 4, 29
+; CHECK-P9-LE-NEXT: neg r6, r3
+; CHECK-P9-LE-NEXT: divd r7, r6, r5
; CHECK-P9-LE-NEXT: add r3, r1, r6
+; CHECK-P9-LE-NEXT: mulld r7, r7, r5
; CHECK-P9-LE-NEXT: sub r6, r6, r7
; CHECK-P9-LE-NEXT: stdux r4, r1, r6
; CHECK-P9-LE-NEXT: cmpd r1, r3
; CHECK-P9-LE-NEXT: cmpd r1, r3
; CHECK-P9-LE-NEXT: bne cr0, .LBB2_1
; CHECK-P9-LE-NEXT: .LBB2_2:
-; CHECK-P9-LE-NEXT: addi r3, r1, 32
; CHECK-P9-LE-NEXT: li r4, 1
+; CHECK-P9-LE-NEXT: addi r3, r1, 32
; CHECK-P9-LE-NEXT: stw r4, 4792(r3)
; CHECK-P9-LE-NEXT: lwz r3, 0(r3)
; CHECK-P9-LE-NEXT: ld r1, 0(r1)
--- /dev/null
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr9 -run-pass=postmisched -o - %s | FileCheck %s
+---
+# Check that postmisched's TopDepthReduce heuristic moves the MULLD later
+# because of the dependency on x5
+name: test
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: test
+ ; CHECK: renamable $x5 = LD 0, killed renamable $x5 :: (load 8)
+ ; CHECK: renamable $x4 = LD 0, killed renamable $x4 :: (load 8)
+ ; CHECK: renamable $x5 = MULLD killed renamable $x5, renamable $x3
+ ; CHECK: renamable $x3 = MADDLD8 killed renamable $x4, killed renamable $x3, killed renamable $x5
+ renamable $x5 = LD 0, killed renamable $x5 :: (load 8)
+ renamable $x5 = MULLD killed renamable $x5, renamable $x3
+ renamable $x4 = LD 0, killed renamable $x4 :: (load 8)
+ renamable $x3 = MADDLD8 killed renamable $x4, killed renamable $x3, killed renamable $x5
+...
; P9BE-NEXT: mtfprwz f0, r3
; P9BE-NEXT: li r3, 2
; P9BE-NEXT: vextuhlx r3, r3, v2
+; P9BE-NEXT: xscvuxddp f0, f0
; P9BE-NEXT: clrlwi r3, r3, 16
; P9BE-NEXT: mtfprwz f1, r3
-; P9BE-NEXT: xscvuxddp f0, f0
; P9BE-NEXT: xscvuxddp f1, f1
; P9BE-NEXT: xxmrghd v2, vs0, vs1
; P9BE-NEXT: blr
; P9LE-NEXT: mtfprwz f0, r3
; P9LE-NEXT: li r3, 2
; P9LE-NEXT: vextuhrx r3, r3, v2
+; P9LE-NEXT: xscvuxddp f0, f0
; P9LE-NEXT: clrlwi r3, r3, 16
; P9LE-NEXT: mtfprwz f1, r3
-; P9LE-NEXT: xscvuxddp f0, f0
; P9LE-NEXT: xscvuxddp f1, f1
; P9LE-NEXT: xxmrghd v2, vs1, vs0
; P9LE-NEXT: blr
; CHECK-LABEL: test_xaddr:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: li r4, 0
-; CHECK-NEXT: ori r4, r4, 40000
; CHECK-NEXT: std r3, -8(r1)
+; CHECK-NEXT: ori r4, r4, 40000
; CHECK-NEXT: lbzx r3, r3, r4
; CHECK-NEXT: blr
entry:
; CHECK-LABEL: test_xoaddr:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: addi r3, r3, 8
-; CHECK-NEXT: lxvx vs0, 0, r3
; CHECK-NEXT: addi r4, r4, 4
+; CHECK-NEXT: lxvx vs0, 0, r3
; CHECK-NEXT: stxvx vs0, 0, r4
; CHECK-NEXT: blr
entry:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: addi r4, r3, -8
; CHECK-NEXT: li r3, 8
+; CHECK-NEXT: li r5, 3
; CHECK-NEXT: mtctr r3
; CHECK-NEXT: li r3, 0
-; CHECK-NEXT: li r5, 3
; loop instruction number is changed from 5 to 4, so its align is changed from 5 to 4.
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB4_1: # %for.body
; P9LE-LABEL: fold_urem_vec_1:
; P9LE: # %bb.0:
; P9LE-NEXT: li r3, 4
-; P9LE-NEXT: vextuhrx r3, r3, v2
; P9LE-NEXT: lis r4, 21399
+; P9LE-NEXT: lis r5, 8456
+; P9LE-NEXT: vextuhrx r3, r3, v2
; P9LE-NEXT: ori r4, r4, 33437
+; P9LE-NEXT: ori r5, r5, 16913
; P9LE-NEXT: clrlwi r3, r3, 16
; P9LE-NEXT: mulhwu r4, r3, r4
; P9LE-NEXT: srwi r4, r4, 5
; P9LE-NEXT: lis r4, 16727
; P9LE-NEXT: mtvsrd v3, r3
; P9LE-NEXT: li r3, 6
+; P9LE-NEXT: ori r4, r4, 2287
; P9LE-NEXT: vextuhrx r3, r3, v2
; P9LE-NEXT: clrlwi r3, r3, 16
-; P9LE-NEXT: ori r4, r4, 2287
; P9LE-NEXT: mulhwu r4, r3, r4
; P9LE-NEXT: srwi r4, r4, 8
; P9LE-NEXT: mulli r4, r4, 1003
; P9LE-NEXT: mtvsrd v4, r3
; P9LE-NEXT: li r3, 2
; P9LE-NEXT: vextuhrx r3, r3, v2
-; P9LE-NEXT: lis r5, 8456
-; P9LE-NEXT: ori r5, r5, 16913
; P9LE-NEXT: vmrghh v3, v4, v3
; P9LE-NEXT: clrlwi r4, r3, 16
; P9LE-NEXT: rlwinm r3, r3, 30, 18, 31
; P9LE-NEXT: lis r4, 22765
; P9LE-NEXT: mtvsrd v4, r3
; P9LE-NEXT: li r3, 0
+; P9LE-NEXT: ori r4, r4, 8969
; P9LE-NEXT: vextuhrx r3, r3, v2
; P9LE-NEXT: clrlwi r3, r3, 16
-; P9LE-NEXT: ori r4, r4, 8969
; P9LE-NEXT: mulhwu r4, r3, r4
; P9LE-NEXT: sub r5, r3, r4
; P9LE-NEXT: srwi r5, r5, 1
; P9BE-LABEL: fold_urem_vec_1:
; P9BE: # %bb.0:
; P9BE-NEXT: li r3, 6
-; P9BE-NEXT: vextuhlx r3, r3, v2
; P9BE-NEXT: lis r4, 16727
+; P9BE-NEXT: lis r5, 8456
+; P9BE-NEXT: vextuhlx r3, r3, v2
; P9BE-NEXT: ori r4, r4, 2287
+; P9BE-NEXT: ori r5, r5, 16913
; P9BE-NEXT: clrlwi r3, r3, 16
; P9BE-NEXT: mulhwu r4, r3, r4
; P9BE-NEXT: srwi r4, r4, 8
; P9BE-NEXT: sub r3, r3, r4
; P9BE-NEXT: lis r4, 21399
; P9BE-NEXT: sldi r3, r3, 48
+; P9BE-NEXT: ori r4, r4, 33437
; P9BE-NEXT: mtvsrd v3, r3
; P9BE-NEXT: li r3, 4
; P9BE-NEXT: vextuhlx r3, r3, v2
; P9BE-NEXT: clrlwi r3, r3, 16
-; P9BE-NEXT: ori r4, r4, 33437
; P9BE-NEXT: mulhwu r4, r3, r4
; P9BE-NEXT: srwi r4, r4, 5
; P9BE-NEXT: mulli r4, r4, 98
; P9BE-NEXT: mtvsrd v4, r3
; P9BE-NEXT: li r3, 2
; P9BE-NEXT: vextuhlx r3, r3, v2
-; P9BE-NEXT: lis r5, 8456
-; P9BE-NEXT: ori r5, r5, 16913
; P9BE-NEXT: vmrghh v3, v4, v3
; P9BE-NEXT: clrlwi r4, r3, 16
; P9BE-NEXT: rlwinm r3, r3, 30, 18, 31
; P9BE-NEXT: sub r3, r4, r3
; P9BE-NEXT: lis r4, 22765
; P9BE-NEXT: sldi r3, r3, 48
+; P9BE-NEXT: ori r4, r4, 8969
; P9BE-NEXT: mtvsrd v4, r3
; P9BE-NEXT: li r3, 0
; P9BE-NEXT: vextuhlx r3, r3, v2
; P9BE-NEXT: clrlwi r3, r3, 16
-; P9BE-NEXT: ori r4, r4, 8969
; P9BE-NEXT: mulhwu r4, r3, r4
; P9BE-NEXT: sub r5, r3, r4
; P9BE-NEXT: srwi r5, r5, 1
; P9LE-LABEL: fold_urem_vec_2:
; P9LE: # %bb.0:
; P9LE-NEXT: li r3, 0
-; P9LE-NEXT: vextuhrx r3, r3, v2
; P9LE-NEXT: lis r4, 22765
+; P9LE-NEXT: vextuhrx r3, r3, v2
; P9LE-NEXT: ori r4, r4, 8969
; P9LE-NEXT: clrlwi r3, r3, 16
; P9LE-NEXT: mulhwu r5, r3, r4
; P9LE-NEXT: mtvsrd v4, r3
; P9LE-NEXT: li r3, 4
; P9LE-NEXT: vextuhrx r3, r3, v2
+; P9LE-NEXT: vmrghh v3, v4, v3
; P9LE-NEXT: clrlwi r3, r3, 16
; P9LE-NEXT: mulhwu r5, r3, r4
; P9LE-NEXT: sub r6, r3, r5
; P9LE-NEXT: srwi r5, r5, 6
; P9LE-NEXT: mulli r5, r5, 95
; P9LE-NEXT: sub r3, r3, r5
-; P9LE-NEXT: vmrghh v3, v4, v3
; P9LE-NEXT: mtvsrd v4, r3
; P9LE-NEXT: li r3, 6
; P9LE-NEXT: vextuhrx r3, r3, v2
; P9BE-LABEL: fold_urem_vec_2:
; P9BE: # %bb.0:
; P9BE-NEXT: li r3, 6
-; P9BE-NEXT: vextuhlx r3, r3, v2
; P9BE-NEXT: lis r4, 22765
+; P9BE-NEXT: vextuhlx r3, r3, v2
; P9BE-NEXT: ori r4, r4, 8969
; P9BE-NEXT: clrlwi r3, r3, 16
; P9BE-NEXT: mulhwu r5, r3, r4
; P9BE-NEXT: mtvsrd v4, r3
; P9BE-NEXT: li r3, 2
; P9BE-NEXT: vextuhlx r3, r3, v2
+; P9BE-NEXT: vmrghh v3, v4, v3
; P9BE-NEXT: clrlwi r3, r3, 16
; P9BE-NEXT: mulhwu r5, r3, r4
; P9BE-NEXT: sub r6, r3, r5
; P9BE-NEXT: mulli r5, r5, 95
; P9BE-NEXT: sub r3, r3, r5
; P9BE-NEXT: sldi r3, r3, 48
-; P9BE-NEXT: vmrghh v3, v4, v3
; P9BE-NEXT: mtvsrd v4, r3
; P9BE-NEXT: li r3, 0
; P9BE-NEXT: vextuhlx r3, r3, v2
; P9LE-LABEL: combine_urem_udiv:
; P9LE: # %bb.0:
; P9LE-NEXT: li r3, 0
-; P9LE-NEXT: vextuhrx r3, r3, v2
; P9LE-NEXT: lis r4, 22765
+; P9LE-NEXT: vextuhrx r3, r3, v2
; P9LE-NEXT: ori r4, r4, 8969
; P9LE-NEXT: clrlwi r3, r3, 16
; P9LE-NEXT: mulhwu r5, r3, r4
; P9LE-NEXT: mtvsrd v4, r3
; P9LE-NEXT: li r3, 4
; P9LE-NEXT: vextuhrx r3, r3, v2
+; P9LE-NEXT: vmrghh v3, v4, v3
; P9LE-NEXT: clrlwi r7, r3, 16
; P9LE-NEXT: mulhwu r8, r7, r4
; P9LE-NEXT: sub r7, r7, r8
; P9LE-NEXT: srwi r7, r7, 6
; P9LE-NEXT: mulli r8, r7, 95
; P9LE-NEXT: sub r3, r3, r8
-; P9LE-NEXT: vmrghh v3, v4, v3
; P9LE-NEXT: mtvsrd v4, r3
; P9LE-NEXT: li r3, 6
; P9LE-NEXT: vextuhrx r3, r3, v2
; P9LE-NEXT: add r4, r8, r4
; P9LE-NEXT: srwi r4, r4, 6
; P9LE-NEXT: mulli r8, r4, 95
+; P9LE-NEXT: mtvsrd v5, r4
; P9LE-NEXT: sub r3, r3, r8
; P9LE-NEXT: mtvsrd v2, r3
; P9LE-NEXT: vmrghh v2, v2, v4
; P9LE-NEXT: mtvsrd v3, r5
; P9LE-NEXT: vmrghh v3, v4, v3
; P9LE-NEXT: mtvsrd v4, r7
-; P9LE-NEXT: mtvsrd v5, r4
; P9LE-NEXT: vmrghh v4, v5, v4
; P9LE-NEXT: vmrglw v3, v4, v3
; P9LE-NEXT: vadduhm v2, v2, v3
; P9BE-LABEL: combine_urem_udiv:
; P9BE: # %bb.0:
; P9BE-NEXT: li r3, 6
-; P9BE-NEXT: vextuhlx r3, r3, v2
; P9BE-NEXT: lis r5, 22765
+; P9BE-NEXT: vextuhlx r3, r3, v2
; P9BE-NEXT: ori r5, r5, 8969
; P9BE-NEXT: clrlwi r4, r3, 16
; P9BE-NEXT: mulhwu r6, r4, r5
; P9BE-NEXT: mtvsrd v4, r3
; P9BE-NEXT: li r3, 2
; P9BE-NEXT: vextuhlx r3, r3, v2
+; P9BE-NEXT: vmrghh v3, v4, v3
; P9BE-NEXT: clrlwi r7, r3, 16
; P9BE-NEXT: mulhwu r8, r7, r5
; P9BE-NEXT: sub r7, r7, r8
; P9BE-NEXT: mulli r8, r7, 95
; P9BE-NEXT: sub r3, r3, r8
; P9BE-NEXT: sldi r3, r3, 48
-; P9BE-NEXT: vmrghh v3, v4, v3
; P9BE-NEXT: mtvsrd v4, r3
; P9BE-NEXT: li r3, 0
; P9BE-NEXT: vextuhlx r3, r3, v2
; P9LE-LABEL: dont_fold_urem_power_of_two:
; P9LE: # %bb.0:
; P9LE-NEXT: li r3, 0
+; P9LE-NEXT: lis r4, 22765
; P9LE-NEXT: vextuhrx r3, r3, v2
+; P9LE-NEXT: ori r4, r4, 8969
; P9LE-NEXT: clrlwi r3, r3, 26
; P9LE-NEXT: mtvsrd v3, r3
; P9LE-NEXT: li r3, 2
; P9LE-NEXT: mtvsrd v4, r3
; P9LE-NEXT: li r3, 6
; P9LE-NEXT: vextuhrx r3, r3, v2
-; P9LE-NEXT: lis r4, 22765
-; P9LE-NEXT: ori r4, r4, 8969
; P9LE-NEXT: vmrghh v3, v4, v3
; P9LE-NEXT: clrlwi r3, r3, 16
; P9LE-NEXT: mulhwu r4, r3, r4
; P9BE-LABEL: dont_fold_urem_power_of_two:
; P9BE: # %bb.0:
; P9BE-NEXT: li r3, 2
+; P9BE-NEXT: lis r4, 22765
; P9BE-NEXT: vextuhlx r3, r3, v2
+; P9BE-NEXT: ori r4, r4, 8969
; P9BE-NEXT: clrlwi r3, r3, 27
; P9BE-NEXT: sldi r3, r3, 48
; P9BE-NEXT: mtvsrd v3, r3
; P9BE-NEXT: mtvsrd v4, r3
; P9BE-NEXT: li r3, 6
; P9BE-NEXT: vextuhlx r3, r3, v2
-; P9BE-NEXT: lis r4, 22765
-; P9BE-NEXT: ori r4, r4, 8969
; P9BE-NEXT: vmrghh v3, v4, v3
; P9BE-NEXT: clrlwi r3, r3, 16
; P9BE-NEXT: mulhwu r4, r3, r4
; P9LE-LABEL: dont_fold_urem_one:
; P9LE: # %bb.0:
; P9LE-NEXT: li r3, 4
-; P9LE-NEXT: vextuhrx r3, r3, v2
; P9LE-NEXT: lis r4, -19946
+; P9LE-NEXT: lis r5, -14230
+; P9LE-NEXT: vextuhrx r3, r3, v2
; P9LE-NEXT: ori r4, r4, 17097
+; P9LE-NEXT: ori r5, r5, 30865
; P9LE-NEXT: clrlwi r3, r3, 16
; P9LE-NEXT: mulhwu r4, r3, r4
; P9LE-NEXT: srwi r4, r4, 4
; P9LE-NEXT: lis r4, 24749
; P9LE-NEXT: mtvsrd v3, r3
; P9LE-NEXT: li r3, 6
+; P9LE-NEXT: ori r4, r4, 47143
; P9LE-NEXT: vextuhrx r3, r3, v2
; P9LE-NEXT: clrlwi r3, r3, 16
-; P9LE-NEXT: ori r4, r4, 47143
; P9LE-NEXT: mulhwu r4, r3, r4
; P9LE-NEXT: srwi r4, r4, 11
; P9LE-NEXT: mulli r4, r4, 5423
; P9LE-NEXT: mtvsrd v4, r3
; P9LE-NEXT: li r3, 2
; P9LE-NEXT: vextuhrx r3, r3, v2
-; P9LE-NEXT: lis r5, -14230
-; P9LE-NEXT: ori r5, r5, 30865
; P9LE-NEXT: vmrghh v3, v4, v3
; P9LE-NEXT: clrlwi r4, r3, 16
; P9LE-NEXT: rlwinm r3, r3, 31, 17, 31
; P9BE-LABEL: dont_fold_urem_one:
; P9BE: # %bb.0:
; P9BE-NEXT: li r3, 6
-; P9BE-NEXT: vextuhlx r3, r3, v2
; P9BE-NEXT: lis r4, 24749
+; P9BE-NEXT: lis r5, -14230
+; P9BE-NEXT: vextuhlx r3, r3, v2
; P9BE-NEXT: ori r4, r4, 47143
+; P9BE-NEXT: ori r5, r5, 30865
; P9BE-NEXT: clrlwi r3, r3, 16
; P9BE-NEXT: mulhwu r4, r3, r4
; P9BE-NEXT: srwi r4, r4, 11
; P9BE-NEXT: sub r3, r3, r4
; P9BE-NEXT: lis r4, -19946
; P9BE-NEXT: sldi r3, r3, 48
+; P9BE-NEXT: ori r4, r4, 17097
; P9BE-NEXT: mtvsrd v3, r3
; P9BE-NEXT: li r3, 4
; P9BE-NEXT: vextuhlx r3, r3, v2
; P9BE-NEXT: clrlwi r3, r3, 16
-; P9BE-NEXT: ori r4, r4, 17097
; P9BE-NEXT: mulhwu r4, r3, r4
; P9BE-NEXT: srwi r4, r4, 4
; P9BE-NEXT: mulli r4, r4, 23
; P9BE-NEXT: mtvsrd v4, r3
; P9BE-NEXT: li r3, 2
; P9BE-NEXT: vextuhlx r3, r3, v2
-; P9BE-NEXT: lis r5, -14230
-; P9BE-NEXT: ori r5, r5, 30865
; P9BE-NEXT: vmrghh v3, v4, v3
; P9BE-NEXT: clrlwi r4, r3, 16
; P9BE-NEXT: rlwinm r3, r3, 31, 17, 31
; P9LE-LABEL: dont_fold_urem_i64:
; P9LE: # %bb.0:
; P9LE-NEXT: lis r4, 25644
+; P9LE-NEXT: mfvsrld r3, v3
; P9LE-NEXT: ori r4, r4, 34192
; P9LE-NEXT: sldi r4, r4, 32
; P9LE-NEXT: oris r4, r4, 45590
-; P9LE-NEXT: mfvsrld r3, v3
; P9LE-NEXT: ori r4, r4, 17097
; P9LE-NEXT: mulhdu r4, r3, r4
; P9LE-NEXT: sub r5, r3, r4
; P9LE-NEXT: sub r4, r4, r5
; P9LE-NEXT: lis r5, 25653
; P9LE-NEXT: ori r5, r5, 15432
-; P9LE-NEXT: sldi r5, r5, 32
; P9LE-NEXT: mtvsrdd v3, r4, r3
; P9LE-NEXT: mfvsrd r3, v2
+; P9LE-NEXT: sldi r5, r5, 32
; P9LE-NEXT: rldicl r4, r3, 63, 1
; P9LE-NEXT: oris r5, r5, 1603
; P9LE-NEXT: ori r5, r5, 21445
; P9BE-LABEL: dont_fold_urem_i64:
; P9BE: # %bb.0:
; P9BE-NEXT: lis r4, 25644
+; P9BE-NEXT: mfvsrd r3, v3
; P9BE-NEXT: ori r4, r4, 34192
; P9BE-NEXT: sldi r4, r4, 32
; P9BE-NEXT: oris r4, r4, 45590
-; P9BE-NEXT: mfvsrd r3, v3
; P9BE-NEXT: ori r4, r4, 17097
; P9BE-NEXT: mulhdu r4, r3, r4
; P9BE-NEXT: sub r5, r3, r4
; P9BE-NEXT: add r4, r5, r4
; P9BE-NEXT: lis r5, -16037
; P9BE-NEXT: rldicl r4, r4, 60, 4
-; P9BE-NEXT: mulli r4, r4, 23
; P9BE-NEXT: ori r5, r5, 28749
+; P9BE-NEXT: mulli r4, r4, 23
; P9BE-NEXT: sldi r5, r5, 32
; P9BE-NEXT: oris r5, r5, 52170
; P9BE-NEXT: ori r5, r5, 12109
; P9BE-NEXT: sub r4, r4, r5
; P9BE-NEXT: lis r5, 25653
; P9BE-NEXT: ori r5, r5, 15432
-; P9BE-NEXT: sldi r5, r5, 32
; P9BE-NEXT: mtvsrdd v3, r3, r4
; P9BE-NEXT: mfvsrld r3, v2
+; P9BE-NEXT: sldi r5, r5, 32
; P9BE-NEXT: rldicl r4, r3, 63, 1
; P9BE-NEXT: oris r5, r5, 1603
; P9BE-NEXT: ori r5, r5, 21445
; CHECK-P9-LABEL: test_v8i16_sign_negative:
; CHECK-P9: # %bb.0: # %entry
; CHECK-P9-NEXT: addis 3, 2, .LCPI6_0@toc@ha
-; CHECK-P9-NEXT: addi 3, 3, .LCPI6_0@toc@l
; CHECK-P9-NEXT: vadduhm 2, 2, 3
+; CHECK-P9-NEXT: addi 3, 3, .LCPI6_0@toc@l
; CHECK-P9-NEXT: lxvx 35, 0, 3
; CHECK-P9-NEXT: vadduhm 2, 2, 3
; CHECK-P9-NEXT: vspltish 3, 1
define dso_local void @test(i32* %Arr, i32 signext %Len) {
; CHECK-LABEL: test:
; CHECK: lxvx [[REG:vs[0-9]+]], r{{[0-9]+}}, r{{[0-9]+}}
-; CHECK-NEXT: xxbrw vs{{[0-9]+}}, [[REG]]
+; CHECK-NOT: [[REG]]
+; CHECK: xxbrw vs{{[0-9]+}}, [[REG]]
entry:
%cmp1 = icmp slt i32 0, %Len
br i1 %cmp1, label %for.body.lr.ph, label %for.cond.cleanup
; CHECK-P9-NEXT: xxswapd v2, vs0
; CHECK-P9-NEXT: xscvspdpn f0, vs0
; CHECK-P9-NEXT: xxsldwi vs1, v2, v2, 3
+; CHECK-P9-NEXT: xscvdpsxws f0, f0
; CHECK-P9-NEXT: xscvspdpn f1, vs1
; CHECK-P9-NEXT: xscvdpsxws f1, f1
-; CHECK-P9-NEXT: xscvdpsxws f0, f0
; CHECK-P9-NEXT: mffprwz r3, f1
; CHECK-P9-NEXT: mtvsrd v2, r3
; CHECK-P9-NEXT: mffprwz r3, f0
; CHECK-P9-LABEL: test8elt:
; CHECK-P9: # %bb.0: # %entry
; CHECK-P9-NEXT: lxv vs1, 0(r3)
+; CHECK-P9-NEXT: lxv vs0, 16(r3)
; CHECK-P9-NEXT: xxsldwi vs2, vs1, vs1, 3
; CHECK-P9-NEXT: xscvspdpn f2, vs2
; CHECK-P9-NEXT: xscvdpsxws f2, f2
-; CHECK-P9-NEXT: lxv vs0, 16(r3)
; CHECK-P9-NEXT: mffprwz r3, f2
; CHECK-P9-NEXT: xxswapd vs2, vs1
; CHECK-P9-NEXT: mtvsrd v2, r3
; CHECK-BE-LABEL: test8elt:
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: lxv vs1, 16(r3)
+; CHECK-BE-NEXT: lxv vs0, 0(r3)
; CHECK-BE-NEXT: xxsldwi vs2, vs1, vs1, 3
; CHECK-BE-NEXT: xscvspdpn f2, vs2
; CHECK-BE-NEXT: xscvdpsxws f2, f2
-; CHECK-BE-NEXT: lxv vs0, 0(r3)
; CHECK-BE-NEXT: mffprwz r3, f2
; CHECK-BE-NEXT: xxswapd vs2, vs1
; CHECK-BE-NEXT: sldi r3, r3, 48
; CHECK-BE-NEXT: xscvspdpn f1, vs1
; CHECK-BE-NEXT: mtvsrd v4, r3
; CHECK-BE-NEXT: xscvdpsxws f1, f1
+; CHECK-BE-NEXT: vmrghh v3, v3, v4
+; CHECK-BE-NEXT: vmrghw v2, v3, v2
; CHECK-BE-NEXT: mffprwz r3, f1
; CHECK-BE-NEXT: xxswapd vs1, vs0
-; CHECK-BE-NEXT: xscvspdpn f1, vs1
-; CHECK-BE-NEXT: xscvdpsxws f1, f1
-; CHECK-BE-NEXT: vmrghh v3, v3, v4
; CHECK-BE-NEXT: sldi r3, r3, 48
-; CHECK-BE-NEXT: vmrghw v2, v3, v2
+; CHECK-BE-NEXT: xscvspdpn f1, vs1
; CHECK-BE-NEXT: mtvsrd v3, r3
+; CHECK-BE-NEXT: xscvdpsxws f1, f1
; CHECK-BE-NEXT: mffprwz r3, f1
; CHECK-BE-NEXT: xscvspdpn f1, vs0
; CHECK-BE-NEXT: xxsldwi vs0, vs0, vs0, 1
; CHECK-P9-LABEL: test16elt:
; CHECK-P9: # %bb.0: # %entry
; CHECK-P9-NEXT: lxv vs2, 0(r4)
+; CHECK-P9-NEXT: lxv vs1, 16(r4)
+; CHECK-P9-NEXT: lxv vs0, 32(r4)
; CHECK-P9-NEXT: xxsldwi vs3, vs2, vs2, 3
; CHECK-P9-NEXT: xxswapd vs4, vs2
+; CHECK-P9-NEXT: xscvspdpn f5, vs2
+; CHECK-P9-NEXT: xxsldwi vs2, vs2, vs2, 1
+; CHECK-P9-NEXT: xxsldwi vs6, vs1, vs1, 3
; CHECK-P9-NEXT: xscvspdpn f3, vs3
; CHECK-P9-NEXT: xscvspdpn f4, vs4
+; CHECK-P9-NEXT: xscvspdpn f2, vs2
; CHECK-P9-NEXT: xscvdpsxws f3, f3
; CHECK-P9-NEXT: xscvdpsxws f4, f4
-; CHECK-P9-NEXT: xscvspdpn f5, vs2
-; CHECK-P9-NEXT: xxsldwi vs2, vs2, vs2, 1
-; CHECK-P9-NEXT: xscvspdpn f2, vs2
; CHECK-P9-NEXT: xscvdpsxws f2, f2
; CHECK-P9-NEXT: mffprwz r5, f3
-; CHECK-P9-NEXT: lxv vs1, 16(r4)
-; CHECK-P9-NEXT: xxsldwi vs6, vs1, vs1, 3
; CHECK-P9-NEXT: xxswapd vs3, vs1
; CHECK-P9-NEXT: mtvsrd v2, r5
; CHECK-P9-NEXT: mffprwz r5, f4
; CHECK-P9-NEXT: xscvspdpn f3, vs3
; CHECK-P9-NEXT: mtvsrd v3, r5
; CHECK-P9-NEXT: vmrghh v2, v3, v2
+; CHECK-P9-NEXT: xscvdpsxws f3, f3
; CHECK-P9-NEXT: mffprwz r5, f4
; CHECK-P9-NEXT: xscvspdpn f4, vs6
; CHECK-P9-NEXT: mtvsrd v3, r5
; CHECK-P9-NEXT: xscvspdpn f2, vs1
; CHECK-P9-NEXT: xxsldwi vs1, vs1, vs1, 1
; CHECK-P9-NEXT: xscvdpsxws f4, f4
-; CHECK-P9-NEXT: xscvdpsxws f3, f3
-; CHECK-P9-NEXT: lxv vs0, 32(r4)
; CHECK-P9-NEXT: mtvsrd v4, r5
; CHECK-P9-NEXT: xscvdpsxws f2, f2
+; CHECK-P9-NEXT: xscvspdpn f1, vs1
; CHECK-P9-NEXT: vmrghh v3, v3, v4
+; CHECK-P9-NEXT: xscvdpsxws f1, f1
; CHECK-P9-NEXT: vmrglw v2, v3, v2
; CHECK-P9-NEXT: mffprwz r5, f4
-; CHECK-P9-NEXT: xscvspdpn f1, vs1
-; CHECK-P9-NEXT: xscvdpsxws f1, f1
; CHECK-P9-NEXT: mtvsrd v4, r5
; CHECK-P9-NEXT: mffprwz r5, f3
; CHECK-P9-NEXT: xxsldwi vs3, vs0, vs0, 3
; CHECK-P9-NEXT: mtvsrd v4, r4
; CHECK-P9-NEXT: mffprwz r4, f0
; CHECK-P9-NEXT: xxsldwi vs0, vs1, vs1, 3
+; CHECK-P9-NEXT: stxv vs2, 0(r3)
; CHECK-P9-NEXT: mtvsrd v2, r4
; CHECK-P9-NEXT: xscvspdpn f0, vs0
; CHECK-P9-NEXT: vmrghh v2, v4, v2
; CHECK-P9-NEXT: vmrglw v3, v4, v3
; CHECK-P9-NEXT: xxmrgld vs0, v3, v2
; CHECK-P9-NEXT: stxv vs0, 16(r3)
-; CHECK-P9-NEXT: stxv vs2, 0(r3)
; CHECK-P9-NEXT: blr
;
; CHECK-BE-LABEL: test16elt:
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: lxv vs1, 16(r4)
+; CHECK-BE-NEXT: lxv vs0, 0(r4)
; CHECK-BE-NEXT: xxsldwi vs2, vs1, vs1, 3
-; CHECK-BE-NEXT: xscvspdpn f2, vs2
; CHECK-BE-NEXT: xxswapd vs3, vs1
-; CHECK-BE-NEXT: xscvspdpn f3, vs3
-; CHECK-BE-NEXT: xscvdpsxws f2, f2
-; CHECK-BE-NEXT: xscvdpsxws f3, f3
-; CHECK-BE-NEXT: mffprwz r5, f2
; CHECK-BE-NEXT: xscvspdpn f4, vs1
; CHECK-BE-NEXT: xxsldwi vs1, vs1, vs1, 1
+; CHECK-BE-NEXT: xscvspdpn f2, vs2
+; CHECK-BE-NEXT: xscvspdpn f3, vs3
; CHECK-BE-NEXT: xscvspdpn f1, vs1
+; CHECK-BE-NEXT: xscvdpsxws f2, f2
+; CHECK-BE-NEXT: xscvdpsxws f3, f3
; CHECK-BE-NEXT: xscvdpsxws f1, f1
+; CHECK-BE-NEXT: mffprwz r5, f2
+; CHECK-BE-NEXT: xxsldwi vs2, vs0, vs0, 3
; CHECK-BE-NEXT: sldi r5, r5, 48
+; CHECK-BE-NEXT: xscvspdpn f2, vs2
; CHECK-BE-NEXT: mtvsrd v2, r5
; CHECK-BE-NEXT: mffprwz r5, f3
; CHECK-BE-NEXT: xscvdpsxws f3, f4
-; CHECK-BE-NEXT: lxv vs0, 0(r4)
-; CHECK-BE-NEXT: xxsldwi vs2, vs0, vs0, 3
-; CHECK-BE-NEXT: xscvspdpn f2, vs2
; CHECK-BE-NEXT: sldi r5, r5, 48
+; CHECK-BE-NEXT: xscvdpsxws f2, f2
; CHECK-BE-NEXT: mtvsrd v3, r5
; CHECK-BE-NEXT: vmrghh v2, v3, v2
; CHECK-BE-NEXT: mffprwz r5, f3
; CHECK-BE-NEXT: mtvsrd v3, r5
; CHECK-BE-NEXT: mffprwz r5, f1
; CHECK-BE-NEXT: xxswapd vs1, vs0
-; CHECK-BE-NEXT: xscvdpsxws f2, f2
; CHECK-BE-NEXT: sldi r5, r5, 48
; CHECK-BE-NEXT: xscvspdpn f1, vs1
; CHECK-BE-NEXT: mtvsrd v4, r5
; CHECK-BE-NEXT: lxv vs0, 32(r4)
; CHECK-BE-NEXT: xscvspdpn f5, vs1
; CHECK-BE-NEXT: xxsldwi vs2, vs1, vs1, 3
-; CHECK-BE-NEXT: xscvspdpn f2, vs2
-; CHECK-BE-NEXT: xscvdpsxws f5, f5
-; CHECK-BE-NEXT: sldi r5, r5, 48
; CHECK-BE-NEXT: xxswapd vs3, vs1
+; CHECK-BE-NEXT: xxsldwi vs1, vs1, vs1, 1
+; CHECK-BE-NEXT: sldi r5, r5, 48
+; CHECK-BE-NEXT: xscvdpsxws f5, f5
+; CHECK-BE-NEXT: xscvspdpn f2, vs2
; CHECK-BE-NEXT: mtvsrd v0, r5
-; CHECK-BE-NEXT: vmrghh v5, v5, v0
; CHECK-BE-NEXT: xscvspdpn f3, vs3
-; CHECK-BE-NEXT: xxsldwi vs1, vs1, vs1, 1
; CHECK-BE-NEXT: xscvspdpn f1, vs1
+; CHECK-BE-NEXT: vmrghh v5, v5, v0
; CHECK-BE-NEXT: xscvdpsxws f2, f2
-; CHECK-BE-NEXT: vmrghw v3, v5, v4
; CHECK-BE-NEXT: xscvdpsxws f3, f3
+; CHECK-BE-NEXT: xscvdpsxws f1, f1
+; CHECK-BE-NEXT: vmrghw v3, v5, v4
; CHECK-BE-NEXT: mffprwz r4, f5
; CHECK-BE-NEXT: xxmrghd vs4, v3, v2
; CHECK-BE-NEXT: sldi r4, r4, 48
; CHECK-BE-NEXT: mtvsrd v2, r4
; CHECK-BE-NEXT: mffprwz r4, f2
-; CHECK-BE-NEXT: xscvdpsxws f1, f1
; CHECK-BE-NEXT: stxv vs4, 0(r3)
; CHECK-BE-NEXT: sldi r4, r4, 48
; CHECK-BE-NEXT: mtvsrd v3, r4
; CHECK-BE-NEXT: mffprwz r4, f1
; CHECK-BE-NEXT: xxsldwi vs1, vs0, vs0, 3
; CHECK-BE-NEXT: sldi r4, r4, 48
-; CHECK-BE-NEXT: xscvspdpn f1, vs1
-; CHECK-BE-NEXT: xscvdpsxws f1, f1
; CHECK-BE-NEXT: vmrghh v3, v4, v3
-; CHECK-BE-NEXT: mtvsrd v4, r4
-; CHECK-BE-NEXT: mffprwz r4, f1
-; CHECK-BE-NEXT: xxswapd vs1, vs0
; CHECK-BE-NEXT: xscvspdpn f1, vs1
+; CHECK-BE-NEXT: mtvsrd v4, r4
; CHECK-BE-NEXT: xscvdpsxws f1, f1
; CHECK-BE-NEXT: vmrghh v2, v2, v4
-; CHECK-BE-NEXT: sldi r4, r4, 48
; CHECK-BE-NEXT: vmrghw v2, v2, v3
+; CHECK-BE-NEXT: mffprwz r4, f1
+; CHECK-BE-NEXT: xxswapd vs1, vs0
+; CHECK-BE-NEXT: sldi r4, r4, 48
+; CHECK-BE-NEXT: xscvspdpn f1, vs1
; CHECK-BE-NEXT: mtvsrd v3, r4
+; CHECK-BE-NEXT: xscvdpsxws f1, f1
; CHECK-BE-NEXT: mffprwz r4, f1
; CHECK-BE-NEXT: xscvspdpn f1, vs0
; CHECK-BE-NEXT: xxsldwi vs0, vs0, vs0, 1
; CHECK-P9-NEXT: xxswapd v2, vs0
; CHECK-P9-NEXT: xscvspdpn f0, vs0
; CHECK-P9-NEXT: xxsldwi vs1, v2, v2, 3
+; CHECK-P9-NEXT: xscvdpsxws f0, f0
; CHECK-P9-NEXT: xscvspdpn f1, vs1
; CHECK-P9-NEXT: xscvdpsxws f1, f1
-; CHECK-P9-NEXT: xscvdpsxws f0, f0
; CHECK-P9-NEXT: mffprwz r3, f1
; CHECK-P9-NEXT: mtvsrd v2, r3
; CHECK-P9-NEXT: mffprwz r3, f0
; CHECK-P9-LABEL: test8elt_signed:
; CHECK-P9: # %bb.0: # %entry
; CHECK-P9-NEXT: lxv vs1, 0(r3)
+; CHECK-P9-NEXT: lxv vs0, 16(r3)
; CHECK-P9-NEXT: xxsldwi vs2, vs1, vs1, 3
; CHECK-P9-NEXT: xscvspdpn f2, vs2
; CHECK-P9-NEXT: xscvdpsxws f2, f2
-; CHECK-P9-NEXT: lxv vs0, 16(r3)
; CHECK-P9-NEXT: mffprwz r3, f2
; CHECK-P9-NEXT: xxswapd vs2, vs1
; CHECK-P9-NEXT: mtvsrd v2, r3
; CHECK-BE-LABEL: test8elt_signed:
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: lxv vs1, 16(r3)
+; CHECK-BE-NEXT: lxv vs0, 0(r3)
; CHECK-BE-NEXT: xxsldwi vs2, vs1, vs1, 3
; CHECK-BE-NEXT: xscvspdpn f2, vs2
; CHECK-BE-NEXT: xscvdpsxws f2, f2
-; CHECK-BE-NEXT: lxv vs0, 0(r3)
; CHECK-BE-NEXT: mffprwz r3, f2
; CHECK-BE-NEXT: xxswapd vs2, vs1
; CHECK-BE-NEXT: sldi r3, r3, 48
; CHECK-BE-NEXT: xscvspdpn f1, vs1
; CHECK-BE-NEXT: mtvsrd v4, r3
; CHECK-BE-NEXT: xscvdpsxws f1, f1
+; CHECK-BE-NEXT: vmrghh v3, v3, v4
+; CHECK-BE-NEXT: vmrghw v2, v3, v2
; CHECK-BE-NEXT: mffprwz r3, f1
; CHECK-BE-NEXT: xxswapd vs1, vs0
-; CHECK-BE-NEXT: xscvspdpn f1, vs1
-; CHECK-BE-NEXT: xscvdpsxws f1, f1
-; CHECK-BE-NEXT: vmrghh v3, v3, v4
; CHECK-BE-NEXT: sldi r3, r3, 48
-; CHECK-BE-NEXT: vmrghw v2, v3, v2
+; CHECK-BE-NEXT: xscvspdpn f1, vs1
; CHECK-BE-NEXT: mtvsrd v3, r3
+; CHECK-BE-NEXT: xscvdpsxws f1, f1
; CHECK-BE-NEXT: mffprwz r3, f1
; CHECK-BE-NEXT: xscvspdpn f1, vs0
; CHECK-BE-NEXT: xxsldwi vs0, vs0, vs0, 1
; CHECK-P9-LABEL: test16elt_signed:
; CHECK-P9: # %bb.0: # %entry
; CHECK-P9-NEXT: lxv vs2, 0(r4)
+; CHECK-P9-NEXT: lxv vs1, 16(r4)
+; CHECK-P9-NEXT: lxv vs0, 32(r4)
; CHECK-P9-NEXT: xxsldwi vs3, vs2, vs2, 3
; CHECK-P9-NEXT: xxswapd vs4, vs2
+; CHECK-P9-NEXT: xscvspdpn f5, vs2
+; CHECK-P9-NEXT: xxsldwi vs2, vs2, vs2, 1
+; CHECK-P9-NEXT: xxsldwi vs6, vs1, vs1, 3
; CHECK-P9-NEXT: xscvspdpn f3, vs3
; CHECK-P9-NEXT: xscvspdpn f4, vs4
+; CHECK-P9-NEXT: xscvspdpn f2, vs2
; CHECK-P9-NEXT: xscvdpsxws f3, f3
; CHECK-P9-NEXT: xscvdpsxws f4, f4
-; CHECK-P9-NEXT: xscvspdpn f5, vs2
-; CHECK-P9-NEXT: xxsldwi vs2, vs2, vs2, 1
-; CHECK-P9-NEXT: xscvspdpn f2, vs2
; CHECK-P9-NEXT: xscvdpsxws f2, f2
; CHECK-P9-NEXT: mffprwz r5, f3
-; CHECK-P9-NEXT: lxv vs1, 16(r4)
-; CHECK-P9-NEXT: xxsldwi vs6, vs1, vs1, 3
; CHECK-P9-NEXT: xxswapd vs3, vs1
; CHECK-P9-NEXT: mtvsrd v2, r5
; CHECK-P9-NEXT: mffprwz r5, f4
; CHECK-P9-NEXT: xscvspdpn f3, vs3
; CHECK-P9-NEXT: mtvsrd v3, r5
; CHECK-P9-NEXT: vmrghh v2, v3, v2
+; CHECK-P9-NEXT: xscvdpsxws f3, f3
; CHECK-P9-NEXT: mffprwz r5, f4
; CHECK-P9-NEXT: xscvspdpn f4, vs6
; CHECK-P9-NEXT: mtvsrd v3, r5
; CHECK-P9-NEXT: xscvspdpn f2, vs1
; CHECK-P9-NEXT: xxsldwi vs1, vs1, vs1, 1
; CHECK-P9-NEXT: xscvdpsxws f4, f4
-; CHECK-P9-NEXT: xscvdpsxws f3, f3
-; CHECK-P9-NEXT: lxv vs0, 32(r4)
; CHECK-P9-NEXT: mtvsrd v4, r5
; CHECK-P9-NEXT: xscvdpsxws f2, f2
+; CHECK-P9-NEXT: xscvspdpn f1, vs1
; CHECK-P9-NEXT: vmrghh v3, v3, v4
+; CHECK-P9-NEXT: xscvdpsxws f1, f1
; CHECK-P9-NEXT: vmrglw v2, v3, v2
; CHECK-P9-NEXT: mffprwz r5, f4
-; CHECK-P9-NEXT: xscvspdpn f1, vs1
-; CHECK-P9-NEXT: xscvdpsxws f1, f1
; CHECK-P9-NEXT: mtvsrd v4, r5
; CHECK-P9-NEXT: mffprwz r5, f3
; CHECK-P9-NEXT: xxsldwi vs3, vs0, vs0, 3
; CHECK-P9-NEXT: mtvsrd v4, r4
; CHECK-P9-NEXT: mffprwz r4, f0
; CHECK-P9-NEXT: xxsldwi vs0, vs1, vs1, 3
+; CHECK-P9-NEXT: stxv vs2, 0(r3)
; CHECK-P9-NEXT: mtvsrd v2, r4
; CHECK-P9-NEXT: xscvspdpn f0, vs0
; CHECK-P9-NEXT: vmrghh v2, v4, v2
; CHECK-P9-NEXT: vmrglw v3, v4, v3
; CHECK-P9-NEXT: xxmrgld vs0, v3, v2
; CHECK-P9-NEXT: stxv vs0, 16(r3)
-; CHECK-P9-NEXT: stxv vs2, 0(r3)
; CHECK-P9-NEXT: blr
;
; CHECK-BE-LABEL: test16elt_signed:
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: lxv vs1, 16(r4)
+; CHECK-BE-NEXT: lxv vs0, 0(r4)
; CHECK-BE-NEXT: xxsldwi vs2, vs1, vs1, 3
-; CHECK-BE-NEXT: xscvspdpn f2, vs2
; CHECK-BE-NEXT: xxswapd vs3, vs1
-; CHECK-BE-NEXT: xscvspdpn f3, vs3
-; CHECK-BE-NEXT: xscvdpsxws f2, f2
-; CHECK-BE-NEXT: xscvdpsxws f3, f3
-; CHECK-BE-NEXT: mffprwz r5, f2
; CHECK-BE-NEXT: xscvspdpn f4, vs1
; CHECK-BE-NEXT: xxsldwi vs1, vs1, vs1, 1
+; CHECK-BE-NEXT: xscvspdpn f2, vs2
+; CHECK-BE-NEXT: xscvspdpn f3, vs3
; CHECK-BE-NEXT: xscvspdpn f1, vs1
+; CHECK-BE-NEXT: xscvdpsxws f2, f2
+; CHECK-BE-NEXT: xscvdpsxws f3, f3
; CHECK-BE-NEXT: xscvdpsxws f1, f1
+; CHECK-BE-NEXT: mffprwz r5, f2
+; CHECK-BE-NEXT: xxsldwi vs2, vs0, vs0, 3
; CHECK-BE-NEXT: sldi r5, r5, 48
+; CHECK-BE-NEXT: xscvspdpn f2, vs2
; CHECK-BE-NEXT: mtvsrd v2, r5
; CHECK-BE-NEXT: mffprwz r5, f3
; CHECK-BE-NEXT: xscvdpsxws f3, f4
-; CHECK-BE-NEXT: lxv vs0, 0(r4)
-; CHECK-BE-NEXT: xxsldwi vs2, vs0, vs0, 3
-; CHECK-BE-NEXT: xscvspdpn f2, vs2
; CHECK-BE-NEXT: sldi r5, r5, 48
+; CHECK-BE-NEXT: xscvdpsxws f2, f2
; CHECK-BE-NEXT: mtvsrd v3, r5
; CHECK-BE-NEXT: vmrghh v2, v3, v2
; CHECK-BE-NEXT: mffprwz r5, f3
; CHECK-BE-NEXT: mtvsrd v3, r5
; CHECK-BE-NEXT: mffprwz r5, f1
; CHECK-BE-NEXT: xxswapd vs1, vs0
-; CHECK-BE-NEXT: xscvdpsxws f2, f2
; CHECK-BE-NEXT: sldi r5, r5, 48
; CHECK-BE-NEXT: xscvspdpn f1, vs1
; CHECK-BE-NEXT: mtvsrd v4, r5
; CHECK-BE-NEXT: lxv vs0, 32(r4)
; CHECK-BE-NEXT: xscvspdpn f5, vs1
; CHECK-BE-NEXT: xxsldwi vs2, vs1, vs1, 3
-; CHECK-BE-NEXT: xscvspdpn f2, vs2
-; CHECK-BE-NEXT: xscvdpsxws f5, f5
-; CHECK-BE-NEXT: sldi r5, r5, 48
; CHECK-BE-NEXT: xxswapd vs3, vs1
+; CHECK-BE-NEXT: xxsldwi vs1, vs1, vs1, 1
+; CHECK-BE-NEXT: sldi r5, r5, 48
+; CHECK-BE-NEXT: xscvdpsxws f5, f5
+; CHECK-BE-NEXT: xscvspdpn f2, vs2
; CHECK-BE-NEXT: mtvsrd v0, r5
-; CHECK-BE-NEXT: vmrghh v5, v5, v0
; CHECK-BE-NEXT: xscvspdpn f3, vs3
-; CHECK-BE-NEXT: xxsldwi vs1, vs1, vs1, 1
; CHECK-BE-NEXT: xscvspdpn f1, vs1
+; CHECK-BE-NEXT: vmrghh v5, v5, v0
; CHECK-BE-NEXT: xscvdpsxws f2, f2
-; CHECK-BE-NEXT: vmrghw v3, v5, v4
; CHECK-BE-NEXT: xscvdpsxws f3, f3
+; CHECK-BE-NEXT: xscvdpsxws f1, f1
+; CHECK-BE-NEXT: vmrghw v3, v5, v4
; CHECK-BE-NEXT: mffprwz r4, f5
; CHECK-BE-NEXT: xxmrghd vs4, v3, v2
; CHECK-BE-NEXT: sldi r4, r4, 48
; CHECK-BE-NEXT: mtvsrd v2, r4
; CHECK-BE-NEXT: mffprwz r4, f2
-; CHECK-BE-NEXT: xscvdpsxws f1, f1
; CHECK-BE-NEXT: stxv vs4, 0(r3)
; CHECK-BE-NEXT: sldi r4, r4, 48
; CHECK-BE-NEXT: mtvsrd v3, r4
; CHECK-BE-NEXT: mffprwz r4, f1
; CHECK-BE-NEXT: xxsldwi vs1, vs0, vs0, 3
; CHECK-BE-NEXT: sldi r4, r4, 48
-; CHECK-BE-NEXT: xscvspdpn f1, vs1
-; CHECK-BE-NEXT: xscvdpsxws f1, f1
; CHECK-BE-NEXT: vmrghh v3, v4, v3
-; CHECK-BE-NEXT: mtvsrd v4, r4
-; CHECK-BE-NEXT: mffprwz r4, f1
-; CHECK-BE-NEXT: xxswapd vs1, vs0
; CHECK-BE-NEXT: xscvspdpn f1, vs1
+; CHECK-BE-NEXT: mtvsrd v4, r4
; CHECK-BE-NEXT: xscvdpsxws f1, f1
; CHECK-BE-NEXT: vmrghh v2, v2, v4
-; CHECK-BE-NEXT: sldi r4, r4, 48
; CHECK-BE-NEXT: vmrghw v2, v2, v3
+; CHECK-BE-NEXT: mffprwz r4, f1
+; CHECK-BE-NEXT: xxswapd vs1, vs0
+; CHECK-BE-NEXT: sldi r4, r4, 48
+; CHECK-BE-NEXT: xscvspdpn f1, vs1
; CHECK-BE-NEXT: mtvsrd v3, r4
+; CHECK-BE-NEXT: xscvdpsxws f1, f1
; CHECK-BE-NEXT: mffprwz r4, f1
; CHECK-BE-NEXT: xscvspdpn f1, vs0
; CHECK-BE-NEXT: xxsldwi vs0, vs0, vs0, 1
; CHECK-P9-NEXT: xxswapd v2, vs0
; CHECK-P9-NEXT: xscvspdpn f0, vs0
; CHECK-P9-NEXT: xxsldwi vs1, v2, v2, 3
+; CHECK-P9-NEXT: xscvdpsxws f0, f0
; CHECK-P9-NEXT: xscvspdpn f1, vs1
; CHECK-P9-NEXT: xscvdpsxws f1, f1
-; CHECK-P9-NEXT: xscvdpsxws f0, f0
; CHECK-P9-NEXT: mffprwz r3, f1
; CHECK-P9-NEXT: mtvsrd v2, r3
; CHECK-P9-NEXT: mffprwz r3, f0
; CHECK-P9-LABEL: test8elt:
; CHECK-P9: # %bb.0: # %entry
; CHECK-P9-NEXT: lxv vs1, 0(r3)
+; CHECK-P9-NEXT: lxv vs0, 16(r3)
; CHECK-P9-NEXT: xxsldwi vs2, vs1, vs1, 3
; CHECK-P9-NEXT: xscvspdpn f2, vs2
; CHECK-P9-NEXT: xscvdpsxws f2, f2
-; CHECK-P9-NEXT: lxv vs0, 16(r3)
; CHECK-P9-NEXT: mffprwz r3, f2
; CHECK-P9-NEXT: xxswapd vs2, vs1
; CHECK-P9-NEXT: mtvsrd v2, r3
; CHECK-BE-LABEL: test8elt:
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: lxv vs1, 16(r3)
+; CHECK-BE-NEXT: lxv vs0, 0(r3)
; CHECK-BE-NEXT: xxsldwi vs2, vs1, vs1, 3
; CHECK-BE-NEXT: xscvspdpn f2, vs2
; CHECK-BE-NEXT: xscvdpsxws f2, f2
-; CHECK-BE-NEXT: lxv vs0, 0(r3)
; CHECK-BE-NEXT: mffprwz r3, f2
; CHECK-BE-NEXT: xxswapd vs2, vs1
; CHECK-BE-NEXT: sldi r3, r3, 56
; CHECK-BE-NEXT: xscvspdpn f1, vs1
; CHECK-BE-NEXT: mtvsrd v4, r3
; CHECK-BE-NEXT: xscvdpsxws f1, f1
+; CHECK-BE-NEXT: vmrghb v3, v3, v4
+; CHECK-BE-NEXT: vmrghh v2, v3, v2
; CHECK-BE-NEXT: mffprwz r3, f1
; CHECK-BE-NEXT: xxswapd vs1, vs0
-; CHECK-BE-NEXT: xscvspdpn f1, vs1
-; CHECK-BE-NEXT: xscvdpsxws f1, f1
-; CHECK-BE-NEXT: vmrghb v3, v3, v4
; CHECK-BE-NEXT: sldi r3, r3, 56
-; CHECK-BE-NEXT: vmrghh v2, v3, v2
+; CHECK-BE-NEXT: xscvspdpn f1, vs1
; CHECK-BE-NEXT: mtvsrd v3, r3
+; CHECK-BE-NEXT: xscvdpsxws f1, f1
; CHECK-BE-NEXT: mffprwz r3, f1
; CHECK-BE-NEXT: xscvspdpn f1, vs0
; CHECK-BE-NEXT: xxsldwi vs0, vs0, vs0, 1
; CHECK-P9-LABEL: test16elt:
; CHECK-P9: # %bb.0: # %entry
; CHECK-P9-NEXT: lxv vs3, 0(r3)
-; CHECK-P9-NEXT: xxsldwi vs4, vs3, vs3, 3
-; CHECK-P9-NEXT: xscvspdpn f4, vs4
-; CHECK-P9-NEXT: xscvdpsxws f4, f4
; CHECK-P9-NEXT: lxv vs0, 48(r3)
; CHECK-P9-NEXT: lxv vs1, 32(r3)
; CHECK-P9-NEXT: lxv vs2, 16(r3)
+; CHECK-P9-NEXT: xxsldwi vs4, vs3, vs3, 3
+; CHECK-P9-NEXT: xscvspdpn f4, vs4
+; CHECK-P9-NEXT: xscvdpsxws f4, f4
; CHECK-P9-NEXT: mffprwz r3, f4
; CHECK-P9-NEXT: xxswapd vs4, vs3
; CHECK-P9-NEXT: mtvsrd v2, r3
; CHECK-BE-LABEL: test16elt:
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: lxv vs3, 48(r3)
-; CHECK-BE-NEXT: xxsldwi vs4, vs3, vs3, 3
-; CHECK-BE-NEXT: xscvspdpn f4, vs4
-; CHECK-BE-NEXT: xscvdpsxws f4, f4
; CHECK-BE-NEXT: lxv vs0, 0(r3)
; CHECK-BE-NEXT: lxv vs1, 16(r3)
; CHECK-BE-NEXT: lxv vs2, 32(r3)
+; CHECK-BE-NEXT: xxsldwi vs4, vs3, vs3, 3
+; CHECK-BE-NEXT: xscvspdpn f4, vs4
+; CHECK-BE-NEXT: xscvdpsxws f4, f4
; CHECK-BE-NEXT: mffprwz r3, f4
; CHECK-BE-NEXT: xxswapd vs4, vs3
; CHECK-BE-NEXT: sldi r3, r3, 56
; CHECK-BE-NEXT: xscvspdpn f3, vs3
; CHECK-BE-NEXT: mtvsrd v4, r3
; CHECK-BE-NEXT: xscvdpsxws f3, f3
+; CHECK-BE-NEXT: vmrghb v3, v3, v4
+; CHECK-BE-NEXT: vmrghh v2, v3, v2
; CHECK-BE-NEXT: mffprwz r3, f3
; CHECK-BE-NEXT: xxswapd vs3, vs2
-; CHECK-BE-NEXT: xscvspdpn f3, vs3
-; CHECK-BE-NEXT: xscvdpsxws f3, f3
-; CHECK-BE-NEXT: vmrghb v3, v3, v4
; CHECK-BE-NEXT: sldi r3, r3, 56
-; CHECK-BE-NEXT: vmrghh v2, v3, v2
+; CHECK-BE-NEXT: xscvspdpn f3, vs3
; CHECK-BE-NEXT: mtvsrd v3, r3
+; CHECK-BE-NEXT: xscvdpsxws f3, f3
; CHECK-BE-NEXT: mffprwz r3, f3
; CHECK-BE-NEXT: xscvspdpn f3, vs2
; CHECK-BE-NEXT: xxsldwi vs2, vs2, vs2, 1
; CHECK-BE-NEXT: xscvspdpn f2, vs2
; CHECK-BE-NEXT: mtvsrd v5, r3
; CHECK-BE-NEXT: xscvdpsxws f2, f2
-; CHECK-BE-NEXT: mffprwz r3, f2
-; CHECK-BE-NEXT: xxswapd vs2, vs1
-; CHECK-BE-NEXT: xscvspdpn f2, vs2
-; CHECK-BE-NEXT: xscvdpsxws f2, f2
; CHECK-BE-NEXT: vmrghb v4, v4, v5
; CHECK-BE-NEXT: vmrghh v3, v4, v3
-; CHECK-BE-NEXT: sldi r3, r3, 56
; CHECK-BE-NEXT: vmrghw v2, v3, v2
+; CHECK-BE-NEXT: mffprwz r3, f2
+; CHECK-BE-NEXT: xxswapd vs2, vs1
+; CHECK-BE-NEXT: sldi r3, r3, 56
+; CHECK-BE-NEXT: xscvspdpn f2, vs2
; CHECK-BE-NEXT: mtvsrd v3, r3
+; CHECK-BE-NEXT: xscvdpsxws f2, f2
; CHECK-BE-NEXT: mffprwz r3, f2
; CHECK-BE-NEXT: xscvspdpn f2, vs1
; CHECK-BE-NEXT: xxsldwi vs1, vs1, vs1, 1
; CHECK-BE-NEXT: xscvspdpn f1, vs1
; CHECK-BE-NEXT: mtvsrd v5, r3
; CHECK-BE-NEXT: xscvdpsxws f1, f1
+; CHECK-BE-NEXT: vmrghb v4, v4, v5
+; CHECK-BE-NEXT: vmrghh v3, v4, v3
; CHECK-BE-NEXT: mffprwz r3, f1
; CHECK-BE-NEXT: xxswapd vs1, vs0
-; CHECK-BE-NEXT: xscvspdpn f1, vs1
-; CHECK-BE-NEXT: xscvdpsxws f1, f1
-; CHECK-BE-NEXT: vmrghb v4, v4, v5
; CHECK-BE-NEXT: sldi r3, r3, 56
-; CHECK-BE-NEXT: vmrghh v3, v4, v3
+; CHECK-BE-NEXT: xscvspdpn f1, vs1
; CHECK-BE-NEXT: mtvsrd v4, r3
+; CHECK-BE-NEXT: xscvdpsxws f1, f1
; CHECK-BE-NEXT: mffprwz r3, f1
; CHECK-BE-NEXT: xscvspdpn f1, vs0
; CHECK-BE-NEXT: xxsldwi vs0, vs0, vs0, 1
; CHECK-P9-NEXT: xxswapd v2, vs0
; CHECK-P9-NEXT: xscvspdpn f0, vs0
; CHECK-P9-NEXT: xxsldwi vs1, v2, v2, 3
+; CHECK-P9-NEXT: xscvdpsxws f0, f0
; CHECK-P9-NEXT: xscvspdpn f1, vs1
; CHECK-P9-NEXT: xscvdpsxws f1, f1
-; CHECK-P9-NEXT: xscvdpsxws f0, f0
; CHECK-P9-NEXT: mffprwz r3, f1
; CHECK-P9-NEXT: mtvsrd v2, r3
; CHECK-P9-NEXT: mffprwz r3, f0
; CHECK-P9-LABEL: test8elt_signed:
; CHECK-P9: # %bb.0: # %entry
; CHECK-P9-NEXT: lxv vs1, 0(r3)
+; CHECK-P9-NEXT: lxv vs0, 16(r3)
; CHECK-P9-NEXT: xxsldwi vs2, vs1, vs1, 3
; CHECK-P9-NEXT: xscvspdpn f2, vs2
; CHECK-P9-NEXT: xscvdpsxws f2, f2
-; CHECK-P9-NEXT: lxv vs0, 16(r3)
; CHECK-P9-NEXT: mffprwz r3, f2
; CHECK-P9-NEXT: xxswapd vs2, vs1
; CHECK-P9-NEXT: mtvsrd v2, r3
; CHECK-BE-LABEL: test8elt_signed:
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: lxv vs1, 16(r3)
+; CHECK-BE-NEXT: lxv vs0, 0(r3)
; CHECK-BE-NEXT: xxsldwi vs2, vs1, vs1, 3
; CHECK-BE-NEXT: xscvspdpn f2, vs2
; CHECK-BE-NEXT: xscvdpsxws f2, f2
-; CHECK-BE-NEXT: lxv vs0, 0(r3)
; CHECK-BE-NEXT: mffprwz r3, f2
; CHECK-BE-NEXT: xxswapd vs2, vs1
; CHECK-BE-NEXT: sldi r3, r3, 56
; CHECK-BE-NEXT: xscvspdpn f1, vs1
; CHECK-BE-NEXT: mtvsrd v4, r3
; CHECK-BE-NEXT: xscvdpsxws f1, f1
+; CHECK-BE-NEXT: vmrghb v3, v3, v4
+; CHECK-BE-NEXT: vmrghh v2, v3, v2
; CHECK-BE-NEXT: mffprwz r3, f1
; CHECK-BE-NEXT: xxswapd vs1, vs0
-; CHECK-BE-NEXT: xscvspdpn f1, vs1
-; CHECK-BE-NEXT: xscvdpsxws f1, f1
-; CHECK-BE-NEXT: vmrghb v3, v3, v4
; CHECK-BE-NEXT: sldi r3, r3, 56
-; CHECK-BE-NEXT: vmrghh v2, v3, v2
+; CHECK-BE-NEXT: xscvspdpn f1, vs1
; CHECK-BE-NEXT: mtvsrd v3, r3
+; CHECK-BE-NEXT: xscvdpsxws f1, f1
; CHECK-BE-NEXT: mffprwz r3, f1
; CHECK-BE-NEXT: xscvspdpn f1, vs0
; CHECK-BE-NEXT: xxsldwi vs0, vs0, vs0, 1
; CHECK-P9-LABEL: test16elt_signed:
; CHECK-P9: # %bb.0: # %entry
; CHECK-P9-NEXT: lxv vs3, 0(r3)
-; CHECK-P9-NEXT: xxsldwi vs4, vs3, vs3, 3
-; CHECK-P9-NEXT: xscvspdpn f4, vs4
-; CHECK-P9-NEXT: xscvdpsxws f4, f4
; CHECK-P9-NEXT: lxv vs0, 48(r3)
; CHECK-P9-NEXT: lxv vs1, 32(r3)
; CHECK-P9-NEXT: lxv vs2, 16(r3)
+; CHECK-P9-NEXT: xxsldwi vs4, vs3, vs3, 3
+; CHECK-P9-NEXT: xscvspdpn f4, vs4
+; CHECK-P9-NEXT: xscvdpsxws f4, f4
; CHECK-P9-NEXT: mffprwz r3, f4
; CHECK-P9-NEXT: xxswapd vs4, vs3
; CHECK-P9-NEXT: mtvsrd v2, r3
; CHECK-BE-LABEL: test16elt_signed:
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: lxv vs3, 48(r3)
-; CHECK-BE-NEXT: xxsldwi vs4, vs3, vs3, 3
-; CHECK-BE-NEXT: xscvspdpn f4, vs4
-; CHECK-BE-NEXT: xscvdpsxws f4, f4
; CHECK-BE-NEXT: lxv vs0, 0(r3)
; CHECK-BE-NEXT: lxv vs1, 16(r3)
; CHECK-BE-NEXT: lxv vs2, 32(r3)
+; CHECK-BE-NEXT: xxsldwi vs4, vs3, vs3, 3
+; CHECK-BE-NEXT: xscvspdpn f4, vs4
+; CHECK-BE-NEXT: xscvdpsxws f4, f4
; CHECK-BE-NEXT: mffprwz r3, f4
; CHECK-BE-NEXT: xxswapd vs4, vs3
; CHECK-BE-NEXT: sldi r3, r3, 56
; CHECK-BE-NEXT: xscvspdpn f3, vs3
; CHECK-BE-NEXT: mtvsrd v4, r3
; CHECK-BE-NEXT: xscvdpsxws f3, f3
+; CHECK-BE-NEXT: vmrghb v3, v3, v4
+; CHECK-BE-NEXT: vmrghh v2, v3, v2
; CHECK-BE-NEXT: mffprwz r3, f3
; CHECK-BE-NEXT: xxswapd vs3, vs2
-; CHECK-BE-NEXT: xscvspdpn f3, vs3
-; CHECK-BE-NEXT: xscvdpsxws f3, f3
-; CHECK-BE-NEXT: vmrghb v3, v3, v4
; CHECK-BE-NEXT: sldi r3, r3, 56
-; CHECK-BE-NEXT: vmrghh v2, v3, v2
+; CHECK-BE-NEXT: xscvspdpn f3, vs3
; CHECK-BE-NEXT: mtvsrd v3, r3
+; CHECK-BE-NEXT: xscvdpsxws f3, f3
; CHECK-BE-NEXT: mffprwz r3, f3
; CHECK-BE-NEXT: xscvspdpn f3, vs2
; CHECK-BE-NEXT: xxsldwi vs2, vs2, vs2, 1
; CHECK-BE-NEXT: xscvspdpn f2, vs2
; CHECK-BE-NEXT: mtvsrd v5, r3
; CHECK-BE-NEXT: xscvdpsxws f2, f2
-; CHECK-BE-NEXT: mffprwz r3, f2
-; CHECK-BE-NEXT: xxswapd vs2, vs1
-; CHECK-BE-NEXT: xscvspdpn f2, vs2
-; CHECK-BE-NEXT: xscvdpsxws f2, f2
; CHECK-BE-NEXT: vmrghb v4, v4, v5
; CHECK-BE-NEXT: vmrghh v3, v4, v3
-; CHECK-BE-NEXT: sldi r3, r3, 56
; CHECK-BE-NEXT: vmrghw v2, v3, v2
+; CHECK-BE-NEXT: mffprwz r3, f2
+; CHECK-BE-NEXT: xxswapd vs2, vs1
+; CHECK-BE-NEXT: sldi r3, r3, 56
+; CHECK-BE-NEXT: xscvspdpn f2, vs2
; CHECK-BE-NEXT: mtvsrd v3, r3
+; CHECK-BE-NEXT: xscvdpsxws f2, f2
; CHECK-BE-NEXT: mffprwz r3, f2
; CHECK-BE-NEXT: xscvspdpn f2, vs1
; CHECK-BE-NEXT: xxsldwi vs1, vs1, vs1, 1
; CHECK-BE-NEXT: xscvspdpn f1, vs1
; CHECK-BE-NEXT: mtvsrd v5, r3
; CHECK-BE-NEXT: xscvdpsxws f1, f1
+; CHECK-BE-NEXT: vmrghb v4, v4, v5
+; CHECK-BE-NEXT: vmrghh v3, v4, v3
; CHECK-BE-NEXT: mffprwz r3, f1
; CHECK-BE-NEXT: xxswapd vs1, vs0
-; CHECK-BE-NEXT: xscvspdpn f1, vs1
-; CHECK-BE-NEXT: xscvdpsxws f1, f1
-; CHECK-BE-NEXT: vmrghb v4, v4, v5
; CHECK-BE-NEXT: sldi r3, r3, 56
-; CHECK-BE-NEXT: vmrghh v3, v4, v3
+; CHECK-BE-NEXT: xscvspdpn f1, vs1
; CHECK-BE-NEXT: mtvsrd v4, r3
+; CHECK-BE-NEXT: xscvdpsxws f1, f1
; CHECK-BE-NEXT: mffprwz r3, f1
; CHECK-BE-NEXT: xscvspdpn f1, vs0
; CHECK-BE-NEXT: xxsldwi vs0, vs0, vs0, 1
; CHECK-P9-LABEL: test4elt:
; CHECK-P9: # %bb.0: # %entry
; CHECK-P9-NEXT: lxv vs1, 0(r3)
+; CHECK-P9-NEXT: lxv vs0, 16(r3)
; CHECK-P9-NEXT: xscvdpsxws f2, f1
; CHECK-P9-NEXT: xxswapd vs1, vs1
; CHECK-P9-NEXT: xscvdpsxws f1, f1
-; CHECK-P9-NEXT: lxv vs0, 16(r3)
; CHECK-P9-NEXT: mffprwz r3, f2
; CHECK-P9-NEXT: mtvsrd v2, r3
; CHECK-P9-NEXT: mffprwz r3, f1
; CHECK-BE-LABEL: test4elt:
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: lxv vs1, 16(r3)
+; CHECK-BE-NEXT: lxv vs0, 0(r3)
; CHECK-BE-NEXT: xscvdpsxws f2, f1
; CHECK-BE-NEXT: xxswapd vs1, vs1
; CHECK-BE-NEXT: xscvdpsxws f1, f1
-; CHECK-BE-NEXT: lxv vs0, 0(r3)
; CHECK-BE-NEXT: mffprwz r3, f2
; CHECK-BE-NEXT: sldi r3, r3, 48
; CHECK-BE-NEXT: mtvsrd v2, r3
; CHECK-P9-LABEL: test8elt:
; CHECK-P9: # %bb.0: # %entry
; CHECK-P9-NEXT: lxv vs3, 0(r3)
-; CHECK-P9-NEXT: xscvdpsxws f4, f3
-; CHECK-P9-NEXT: xxswapd vs3, vs3
-; CHECK-P9-NEXT: xscvdpsxws f3, f3
; CHECK-P9-NEXT: lxv vs2, 16(r3)
; CHECK-P9-NEXT: lxv vs0, 48(r3)
; CHECK-P9-NEXT: lxv vs1, 32(r3)
+; CHECK-P9-NEXT: xscvdpsxws f4, f3
+; CHECK-P9-NEXT: xxswapd vs3, vs3
+; CHECK-P9-NEXT: xscvdpsxws f3, f3
; CHECK-P9-NEXT: mffprwz r3, f4
; CHECK-P9-NEXT: mtvsrd v2, r3
; CHECK-P9-NEXT: mffprwz r3, f3
; CHECK-BE-LABEL: test8elt:
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: lxv vs3, 48(r3)
-; CHECK-BE-NEXT: xscvdpsxws f4, f3
-; CHECK-BE-NEXT: xxswapd vs3, vs3
-; CHECK-BE-NEXT: xscvdpsxws f3, f3
; CHECK-BE-NEXT: lxv vs2, 32(r3)
; CHECK-BE-NEXT: lxv vs0, 0(r3)
; CHECK-BE-NEXT: lxv vs1, 16(r3)
+; CHECK-BE-NEXT: xscvdpsxws f4, f3
+; CHECK-BE-NEXT: xxswapd vs3, vs3
+; CHECK-BE-NEXT: xscvdpsxws f3, f3
; CHECK-BE-NEXT: mffprwz r3, f4
; CHECK-BE-NEXT: sldi r3, r3, 48
; CHECK-BE-NEXT: mtvsrd v2, r3
; CHECK-P9-NEXT: lxv vs3, 0(r4)
; CHECK-P9-NEXT: lxv vs2, 16(r4)
; CHECK-P9-NEXT: lxv vs1, 32(r4)
-; CHECK-P9-NEXT: xscvdpsxws f4, f3
; CHECK-P9-NEXT: lxv vs0, 48(r4)
+; CHECK-P9-NEXT: xscvdpsxws f4, f3
; CHECK-P9-NEXT: xscvdpsxws f5, f2
; CHECK-P9-NEXT: xscvdpsxws f6, f1
; CHECK-P9-NEXT: xxswapd vs3, vs3
; CHECK-P9-NEXT: xscvdpsxws f7, f0
+; CHECK-P9-NEXT: xxswapd vs2, vs2
+; CHECK-P9-NEXT: xxswapd vs1, vs1
; CHECK-P9-NEXT: xxswapd vs0, vs0
-; CHECK-P9-NEXT: xscvdpsxws f0, f0
-; CHECK-P9-NEXT: mffprwz r5, f4
; CHECK-P9-NEXT: xscvdpsxws f3, f3
-; CHECK-P9-NEXT: xxswapd vs2, vs2
; CHECK-P9-NEXT: xscvdpsxws f2, f2
+; CHECK-P9-NEXT: xscvdpsxws f1, f1
+; CHECK-P9-NEXT: xscvdpsxws f0, f0
+; CHECK-P9-NEXT: mffprwz r5, f4
; CHECK-P9-NEXT: mtvsrd v2, r5
; CHECK-P9-NEXT: mffprwz r5, f5
; CHECK-P9-NEXT: mtvsrd v3, r5
; CHECK-P9-NEXT: mtvsrd v5, r5
; CHECK-P9-NEXT: mffprwz r5, f3
; CHECK-P9-NEXT: lxv vs3, 64(r4)
-; CHECK-P9-NEXT: xxswapd vs1, vs1
-; CHECK-P9-NEXT: xscvdpsxws f1, f1
; CHECK-P9-NEXT: mtvsrd v0, r5
; CHECK-P9-NEXT: mffprwz r5, f2
; CHECK-P9-NEXT: lxv vs2, 80(r4)
; CHECK-BE-LABEL: test16elt:
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: lxv vs4, 48(r4)
+; CHECK-BE-NEXT: lxv vs3, 32(r4)
+; CHECK-BE-NEXT: lxv vs2, 16(r4)
+; CHECK-BE-NEXT: lxv vs1, 0(r4)
; CHECK-BE-NEXT: xscvdpsxws f5, f4
; CHECK-BE-NEXT: xxswapd vs4, vs4
-; CHECK-BE-NEXT: lxv vs3, 32(r4)
; CHECK-BE-NEXT: xscvdpsxws f6, f3
; CHECK-BE-NEXT: xxswapd vs3, vs3
-; CHECK-BE-NEXT: xscvdpsxws f4, f4
-; CHECK-BE-NEXT: mffprwz r5, f5
-; CHECK-BE-NEXT: sldi r5, r5, 48
-; CHECK-BE-NEXT: lxv vs2, 16(r4)
-; CHECK-BE-NEXT: xscvdpsxws f3, f3
; CHECK-BE-NEXT: xscvdpsxws f7, f2
+; CHECK-BE-NEXT: lxv vs0, 112(r4)
; CHECK-BE-NEXT: xxswapd vs2, vs2
+; CHECK-BE-NEXT: xscvdpsxws f4, f4
+; CHECK-BE-NEXT: xscvdpsxws f3, f3
; CHECK-BE-NEXT: xscvdpsxws f2, f2
+; CHECK-BE-NEXT: mffprwz r5, f5
+; CHECK-BE-NEXT: sldi r5, r5, 48
; CHECK-BE-NEXT: mtvsrd v2, r5
; CHECK-BE-NEXT: mffprwz r5, f4
-; CHECK-BE-NEXT: sldi r5, r5, 48
-; CHECK-BE-NEXT: lxv vs1, 0(r4)
; CHECK-BE-NEXT: xscvdpsxws f4, f1
; CHECK-BE-NEXT: xxswapd vs1, vs1
+; CHECK-BE-NEXT: sldi r5, r5, 48
; CHECK-BE-NEXT: xscvdpsxws f1, f1
; CHECK-BE-NEXT: mtvsrd v3, r5
; CHECK-BE-NEXT: mffprwz r5, f6
; CHECK-BE-NEXT: sldi r5, r5, 48
-; CHECK-BE-NEXT: lxv vs0, 112(r4)
; CHECK-BE-NEXT: vmrghh v2, v2, v3
; CHECK-BE-NEXT: mtvsrd v3, r5
; CHECK-BE-NEXT: mffprwz r5, f3
; CHECK-BE-NEXT: vmrghh v4, v4, v1
; CHECK-BE-NEXT: mtvsrd v1, r5
; CHECK-BE-NEXT: xscvdpsxws f2, f2
-; CHECK-BE-NEXT: vmrghh v5, v5, v1
; CHECK-BE-NEXT: mffprwz r5, f0
; CHECK-BE-NEXT: lxv vs0, 64(r4)
+; CHECK-BE-NEXT: vmrghh v5, v5, v1
+; CHECK-BE-NEXT: sldi r5, r5, 48
; CHECK-BE-NEXT: mffprwz r4, f3
-; CHECK-BE-NEXT: sldi r4, r4, 48
+; CHECK-BE-NEXT: mtvsrd v1, r5
; CHECK-BE-NEXT: vmrghw v3, v5, v4
+; CHECK-BE-NEXT: sldi r4, r4, 48
+; CHECK-BE-NEXT: vmrghh v0, v0, v1
; CHECK-BE-NEXT: xxmrghd vs3, v3, v2
; CHECK-BE-NEXT: mtvsrd v2, r4
; CHECK-BE-NEXT: mffprwz r4, f2
; CHECK-BE-NEXT: xxswapd vs1, vs1
; CHECK-BE-NEXT: sldi r4, r4, 48
; CHECK-BE-NEXT: xscvdpsxws f1, f1
+; CHECK-BE-NEXT: stxv vs3, 0(r3)
; CHECK-BE-NEXT: mtvsrd v3, r4
; CHECK-BE-NEXT: vmrghh v2, v2, v3
; CHECK-BE-NEXT: mffprwz r4, f2
; CHECK-BE-NEXT: sldi r4, r4, 48
+; CHECK-BE-NEXT: vmrghw v2, v2, v0
; CHECK-BE-NEXT: mtvsrd v3, r4
; CHECK-BE-NEXT: mffprwz r4, f1
; CHECK-BE-NEXT: xscvdpsxws f1, f0
; CHECK-BE-NEXT: sldi r4, r4, 48
; CHECK-BE-NEXT: mtvsrd v4, r4
; CHECK-BE-NEXT: mffprwz r4, f0
-; CHECK-BE-NEXT: sldi r5, r5, 48
-; CHECK-BE-NEXT: mtvsrd v1, r5
-; CHECK-BE-NEXT: vmrghh v0, v0, v1
-; CHECK-BE-NEXT: vmrghw v2, v2, v0
-; CHECK-BE-NEXT: stxv vs3, 0(r3)
; CHECK-BE-NEXT: sldi r4, r4, 48
; CHECK-BE-NEXT: mtvsrd v5, r4
; CHECK-BE-NEXT: vmrghh v4, v4, v5
; CHECK-P9-LABEL: test4elt_signed:
; CHECK-P9: # %bb.0: # %entry
; CHECK-P9-NEXT: lxv vs1, 0(r3)
+; CHECK-P9-NEXT: lxv vs0, 16(r3)
; CHECK-P9-NEXT: xscvdpsxws f2, f1
; CHECK-P9-NEXT: xxswapd vs1, vs1
; CHECK-P9-NEXT: xscvdpsxws f1, f1
-; CHECK-P9-NEXT: lxv vs0, 16(r3)
; CHECK-P9-NEXT: mffprwz r3, f2
; CHECK-P9-NEXT: mtvsrd v2, r3
; CHECK-P9-NEXT: mffprwz r3, f1
; CHECK-BE-LABEL: test4elt_signed:
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: lxv vs1, 16(r3)
+; CHECK-BE-NEXT: lxv vs0, 0(r3)
; CHECK-BE-NEXT: xscvdpsxws f2, f1
; CHECK-BE-NEXT: xxswapd vs1, vs1
; CHECK-BE-NEXT: xscvdpsxws f1, f1
-; CHECK-BE-NEXT: lxv vs0, 0(r3)
; CHECK-BE-NEXT: mffprwz r3, f2
; CHECK-BE-NEXT: sldi r3, r3, 48
; CHECK-BE-NEXT: mtvsrd v2, r3
; CHECK-P9-LABEL: test8elt_signed:
; CHECK-P9: # %bb.0: # %entry
; CHECK-P9-NEXT: lxv vs3, 0(r3)
-; CHECK-P9-NEXT: xscvdpsxws f4, f3
-; CHECK-P9-NEXT: xxswapd vs3, vs3
-; CHECK-P9-NEXT: xscvdpsxws f3, f3
; CHECK-P9-NEXT: lxv vs2, 16(r3)
; CHECK-P9-NEXT: lxv vs0, 48(r3)
; CHECK-P9-NEXT: lxv vs1, 32(r3)
+; CHECK-P9-NEXT: xscvdpsxws f4, f3
+; CHECK-P9-NEXT: xxswapd vs3, vs3
+; CHECK-P9-NEXT: xscvdpsxws f3, f3
; CHECK-P9-NEXT: mffprwz r3, f4
; CHECK-P9-NEXT: mtvsrd v2, r3
; CHECK-P9-NEXT: mffprwz r3, f3
; CHECK-BE-LABEL: test8elt_signed:
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: lxv vs3, 48(r3)
-; CHECK-BE-NEXT: xscvdpsxws f4, f3
-; CHECK-BE-NEXT: xxswapd vs3, vs3
-; CHECK-BE-NEXT: xscvdpsxws f3, f3
; CHECK-BE-NEXT: lxv vs2, 32(r3)
; CHECK-BE-NEXT: lxv vs0, 0(r3)
; CHECK-BE-NEXT: lxv vs1, 16(r3)
+; CHECK-BE-NEXT: xscvdpsxws f4, f3
+; CHECK-BE-NEXT: xxswapd vs3, vs3
+; CHECK-BE-NEXT: xscvdpsxws f3, f3
; CHECK-BE-NEXT: mffprwz r3, f4
; CHECK-BE-NEXT: sldi r3, r3, 48
; CHECK-BE-NEXT: mtvsrd v2, r3
; CHECK-P9-NEXT: lxv vs3, 0(r4)
; CHECK-P9-NEXT: lxv vs2, 16(r4)
; CHECK-P9-NEXT: lxv vs1, 32(r4)
-; CHECK-P9-NEXT: xscvdpsxws f4, f3
; CHECK-P9-NEXT: lxv vs0, 48(r4)
+; CHECK-P9-NEXT: xscvdpsxws f4, f3
; CHECK-P9-NEXT: xscvdpsxws f5, f2
; CHECK-P9-NEXT: xscvdpsxws f6, f1
; CHECK-P9-NEXT: xxswapd vs3, vs3
; CHECK-P9-NEXT: xscvdpsxws f7, f0
+; CHECK-P9-NEXT: xxswapd vs2, vs2
+; CHECK-P9-NEXT: xxswapd vs1, vs1
; CHECK-P9-NEXT: xxswapd vs0, vs0
-; CHECK-P9-NEXT: xscvdpsxws f0, f0
-; CHECK-P9-NEXT: mffprwz r5, f4
; CHECK-P9-NEXT: xscvdpsxws f3, f3
-; CHECK-P9-NEXT: xxswapd vs2, vs2
; CHECK-P9-NEXT: xscvdpsxws f2, f2
+; CHECK-P9-NEXT: xscvdpsxws f1, f1
+; CHECK-P9-NEXT: xscvdpsxws f0, f0
+; CHECK-P9-NEXT: mffprwz r5, f4
; CHECK-P9-NEXT: mtvsrd v2, r5
; CHECK-P9-NEXT: mffprwz r5, f5
; CHECK-P9-NEXT: mtvsrd v3, r5
; CHECK-P9-NEXT: mtvsrd v5, r5
; CHECK-P9-NEXT: mffprwz r5, f3
; CHECK-P9-NEXT: lxv vs3, 64(r4)
-; CHECK-P9-NEXT: xxswapd vs1, vs1
-; CHECK-P9-NEXT: xscvdpsxws f1, f1
; CHECK-P9-NEXT: mtvsrd v0, r5
; CHECK-P9-NEXT: mffprwz r5, f2
; CHECK-P9-NEXT: lxv vs2, 80(r4)
; CHECK-BE-LABEL: test16elt_signed:
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: lxv vs4, 48(r4)
+; CHECK-BE-NEXT: lxv vs3, 32(r4)
+; CHECK-BE-NEXT: lxv vs2, 16(r4)
+; CHECK-BE-NEXT: lxv vs1, 0(r4)
; CHECK-BE-NEXT: xscvdpsxws f5, f4
; CHECK-BE-NEXT: xxswapd vs4, vs4
-; CHECK-BE-NEXT: lxv vs3, 32(r4)
; CHECK-BE-NEXT: xscvdpsxws f6, f3
; CHECK-BE-NEXT: xxswapd vs3, vs3
-; CHECK-BE-NEXT: xscvdpsxws f4, f4
-; CHECK-BE-NEXT: mffprwz r5, f5
-; CHECK-BE-NEXT: sldi r5, r5, 48
-; CHECK-BE-NEXT: lxv vs2, 16(r4)
-; CHECK-BE-NEXT: xscvdpsxws f3, f3
; CHECK-BE-NEXT: xscvdpsxws f7, f2
+; CHECK-BE-NEXT: lxv vs0, 112(r4)
; CHECK-BE-NEXT: xxswapd vs2, vs2
+; CHECK-BE-NEXT: xscvdpsxws f4, f4
+; CHECK-BE-NEXT: xscvdpsxws f3, f3
; CHECK-BE-NEXT: xscvdpsxws f2, f2
+; CHECK-BE-NEXT: mffprwz r5, f5
+; CHECK-BE-NEXT: sldi r5, r5, 48
; CHECK-BE-NEXT: mtvsrd v2, r5
; CHECK-BE-NEXT: mffprwz r5, f4
-; CHECK-BE-NEXT: sldi r5, r5, 48
-; CHECK-BE-NEXT: lxv vs1, 0(r4)
; CHECK-BE-NEXT: xscvdpsxws f4, f1
; CHECK-BE-NEXT: xxswapd vs1, vs1
+; CHECK-BE-NEXT: sldi r5, r5, 48
; CHECK-BE-NEXT: xscvdpsxws f1, f1
; CHECK-BE-NEXT: mtvsrd v3, r5
; CHECK-BE-NEXT: mffprwz r5, f6
; CHECK-BE-NEXT: sldi r5, r5, 48
-; CHECK-BE-NEXT: lxv vs0, 112(r4)
; CHECK-BE-NEXT: vmrghh v2, v2, v3
; CHECK-BE-NEXT: mtvsrd v3, r5
; CHECK-BE-NEXT: mffprwz r5, f3
; CHECK-BE-NEXT: vmrghh v4, v4, v1
; CHECK-BE-NEXT: mtvsrd v1, r5
; CHECK-BE-NEXT: xscvdpsxws f2, f2
-; CHECK-BE-NEXT: vmrghh v5, v5, v1
; CHECK-BE-NEXT: mffprwz r5, f0
; CHECK-BE-NEXT: lxv vs0, 64(r4)
+; CHECK-BE-NEXT: vmrghh v5, v5, v1
+; CHECK-BE-NEXT: sldi r5, r5, 48
; CHECK-BE-NEXT: mffprwz r4, f3
-; CHECK-BE-NEXT: sldi r4, r4, 48
+; CHECK-BE-NEXT: mtvsrd v1, r5
; CHECK-BE-NEXT: vmrghw v3, v5, v4
+; CHECK-BE-NEXT: sldi r4, r4, 48
+; CHECK-BE-NEXT: vmrghh v0, v0, v1
; CHECK-BE-NEXT: xxmrghd vs3, v3, v2
; CHECK-BE-NEXT: mtvsrd v2, r4
; CHECK-BE-NEXT: mffprwz r4, f2
; CHECK-BE-NEXT: xxswapd vs1, vs1
; CHECK-BE-NEXT: sldi r4, r4, 48
; CHECK-BE-NEXT: xscvdpsxws f1, f1
+; CHECK-BE-NEXT: stxv vs3, 0(r3)
; CHECK-BE-NEXT: mtvsrd v3, r4
; CHECK-BE-NEXT: vmrghh v2, v2, v3
; CHECK-BE-NEXT: mffprwz r4, f2
; CHECK-BE-NEXT: sldi r4, r4, 48
+; CHECK-BE-NEXT: vmrghw v2, v2, v0
; CHECK-BE-NEXT: mtvsrd v3, r4
; CHECK-BE-NEXT: mffprwz r4, f1
; CHECK-BE-NEXT: xscvdpsxws f1, f0
; CHECK-BE-NEXT: sldi r4, r4, 48
; CHECK-BE-NEXT: mtvsrd v4, r4
; CHECK-BE-NEXT: mffprwz r4, f0
-; CHECK-BE-NEXT: sldi r5, r5, 48
-; CHECK-BE-NEXT: mtvsrd v1, r5
-; CHECK-BE-NEXT: vmrghh v0, v0, v1
-; CHECK-BE-NEXT: vmrghw v2, v2, v0
-; CHECK-BE-NEXT: stxv vs3, 0(r3)
; CHECK-BE-NEXT: sldi r4, r4, 48
; CHECK-BE-NEXT: mtvsrd v5, r4
; CHECK-BE-NEXT: vmrghh v4, v4, v5
; CHECK-P9: # %bb.0: # %entry
; CHECK-P9-NEXT: lxv vs2, 0(r4)
; CHECK-P9-NEXT: lxv vs3, 16(r4)
-; CHECK-P9-NEXT: xxmrgld vs4, vs3, vs2
-; CHECK-P9-NEXT: xxmrghd vs2, vs3, vs2
; CHECK-P9-NEXT: lxv vs0, 32(r4)
; CHECK-P9-NEXT: lxv vs1, 48(r4)
+; CHECK-P9-NEXT: xxmrgld vs4, vs3, vs2
+; CHECK-P9-NEXT: xxmrghd vs2, vs3, vs2
; CHECK-P9-NEXT: xvcvdpuxws v2, vs4
; CHECK-P9-NEXT: xvcvdpuxws v3, vs2
; CHECK-P9-NEXT: xxmrgld vs2, vs1, vs0
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: lxv vs2, 16(r4)
; CHECK-BE-NEXT: lxv vs3, 0(r4)
-; CHECK-BE-NEXT: xxmrgld vs4, vs3, vs2
-; CHECK-BE-NEXT: xxmrghd vs2, vs3, vs2
; CHECK-BE-NEXT: lxv vs0, 48(r4)
; CHECK-BE-NEXT: lxv vs1, 32(r4)
+; CHECK-BE-NEXT: xxmrgld vs4, vs3, vs2
+; CHECK-BE-NEXT: xxmrghd vs2, vs3, vs2
; CHECK-BE-NEXT: xvcvdpuxws v2, vs4
; CHECK-BE-NEXT: xvcvdpuxws v3, vs2
; CHECK-BE-NEXT: xxmrgld vs2, vs1, vs0
; CHECK-P9: # %bb.0: # %entry
; CHECK-P9-NEXT: lxv vs6, 0(r4)
; CHECK-P9-NEXT: lxv vs7, 16(r4)
-; CHECK-P9-NEXT: xxmrgld vs8, vs7, vs6
-; CHECK-P9-NEXT: xxmrghd vs6, vs7, vs6
; CHECK-P9-NEXT: lxv vs4, 32(r4)
; CHECK-P9-NEXT: lxv vs5, 48(r4)
+; CHECK-P9-NEXT: xxmrgld vs8, vs7, vs6
+; CHECK-P9-NEXT: xxmrghd vs6, vs7, vs6
; CHECK-P9-NEXT: xxmrgld vs7, vs5, vs4
; CHECK-P9-NEXT: xxmrghd vs4, vs5, vs4
-; CHECK-P9-NEXT: xvcvdpuxws v2, vs8
-; CHECK-P9-NEXT: xvcvdpuxws v3, vs6
; CHECK-P9-NEXT: lxv vs2, 64(r4)
; CHECK-P9-NEXT: lxv vs3, 80(r4)
+; CHECK-P9-NEXT: lxv vs0, 96(r4)
+; CHECK-P9-NEXT: lxv vs1, 112(r4)
+; CHECK-P9-NEXT: xvcvdpuxws v2, vs8
+; CHECK-P9-NEXT: xvcvdpuxws v3, vs6
; CHECK-P9-NEXT: xvcvdpuxws v4, vs7
; CHECK-P9-NEXT: vmrgew v2, v3, v2
; CHECK-P9-NEXT: xvcvdpuxws v3, vs4
; CHECK-P9-NEXT: xxmrgld vs4, vs3, vs2
; CHECK-P9-NEXT: xxmrghd vs2, vs3, vs2
-; CHECK-P9-NEXT: lxv vs0, 96(r4)
-; CHECK-P9-NEXT: lxv vs1, 112(r4)
; CHECK-P9-NEXT: stxv v2, 0(r3)
; CHECK-P9-NEXT: xvcvdpuxws v5, vs2
; CHECK-P9-NEXT: xxmrgld vs2, vs1, vs0
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: lxv vs6, 16(r4)
; CHECK-BE-NEXT: lxv vs7, 0(r4)
-; CHECK-BE-NEXT: xxmrgld vs8, vs7, vs6
-; CHECK-BE-NEXT: xxmrghd vs6, vs7, vs6
; CHECK-BE-NEXT: lxv vs4, 48(r4)
; CHECK-BE-NEXT: lxv vs5, 32(r4)
+; CHECK-BE-NEXT: xxmrgld vs8, vs7, vs6
+; CHECK-BE-NEXT: xxmrghd vs6, vs7, vs6
; CHECK-BE-NEXT: xxmrgld vs7, vs5, vs4
; CHECK-BE-NEXT: xxmrghd vs4, vs5, vs4
-; CHECK-BE-NEXT: xvcvdpuxws v2, vs8
-; CHECK-BE-NEXT: xvcvdpuxws v3, vs6
; CHECK-BE-NEXT: lxv vs2, 80(r4)
; CHECK-BE-NEXT: lxv vs3, 64(r4)
+; CHECK-BE-NEXT: lxv vs0, 112(r4)
+; CHECK-BE-NEXT: lxv vs1, 96(r4)
+; CHECK-BE-NEXT: xvcvdpuxws v2, vs8
+; CHECK-BE-NEXT: xvcvdpuxws v3, vs6
; CHECK-BE-NEXT: xvcvdpuxws v4, vs7
; CHECK-BE-NEXT: vmrgew v2, v3, v2
; CHECK-BE-NEXT: xvcvdpuxws v3, vs4
; CHECK-BE-NEXT: xxmrgld vs4, vs3, vs2
; CHECK-BE-NEXT: xxmrghd vs2, vs3, vs2
-; CHECK-BE-NEXT: lxv vs0, 112(r4)
-; CHECK-BE-NEXT: lxv vs1, 96(r4)
; CHECK-BE-NEXT: stxv v2, 0(r3)
; CHECK-BE-NEXT: xvcvdpuxws v5, vs2
; CHECK-BE-NEXT: xxmrgld vs2, vs1, vs0
; CHECK-P9: # %bb.0: # %entry
; CHECK-P9-NEXT: lxv vs2, 0(r4)
; CHECK-P9-NEXT: lxv vs3, 16(r4)
-; CHECK-P9-NEXT: xxmrgld vs4, vs3, vs2
-; CHECK-P9-NEXT: xxmrghd vs2, vs3, vs2
; CHECK-P9-NEXT: lxv vs0, 32(r4)
; CHECK-P9-NEXT: lxv vs1, 48(r4)
+; CHECK-P9-NEXT: xxmrgld vs4, vs3, vs2
+; CHECK-P9-NEXT: xxmrghd vs2, vs3, vs2
; CHECK-P9-NEXT: xvcvdpsxws v2, vs4
; CHECK-P9-NEXT: xvcvdpsxws v3, vs2
; CHECK-P9-NEXT: xxmrgld vs2, vs1, vs0
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: lxv vs2, 16(r4)
; CHECK-BE-NEXT: lxv vs3, 0(r4)
-; CHECK-BE-NEXT: xxmrgld vs4, vs3, vs2
-; CHECK-BE-NEXT: xxmrghd vs2, vs3, vs2
; CHECK-BE-NEXT: lxv vs0, 48(r4)
; CHECK-BE-NEXT: lxv vs1, 32(r4)
+; CHECK-BE-NEXT: xxmrgld vs4, vs3, vs2
+; CHECK-BE-NEXT: xxmrghd vs2, vs3, vs2
; CHECK-BE-NEXT: xvcvdpsxws v2, vs4
; CHECK-BE-NEXT: xvcvdpsxws v3, vs2
; CHECK-BE-NEXT: xxmrgld vs2, vs1, vs0
; CHECK-P9: # %bb.0: # %entry
; CHECK-P9-NEXT: lxv vs6, 0(r4)
; CHECK-P9-NEXT: lxv vs7, 16(r4)
-; CHECK-P9-NEXT: xxmrgld vs8, vs7, vs6
-; CHECK-P9-NEXT: xxmrghd vs6, vs7, vs6
; CHECK-P9-NEXT: lxv vs4, 32(r4)
; CHECK-P9-NEXT: lxv vs5, 48(r4)
+; CHECK-P9-NEXT: xxmrgld vs8, vs7, vs6
+; CHECK-P9-NEXT: xxmrghd vs6, vs7, vs6
; CHECK-P9-NEXT: xxmrgld vs7, vs5, vs4
; CHECK-P9-NEXT: xxmrghd vs4, vs5, vs4
-; CHECK-P9-NEXT: xvcvdpsxws v2, vs8
-; CHECK-P9-NEXT: xvcvdpsxws v3, vs6
; CHECK-P9-NEXT: lxv vs2, 64(r4)
; CHECK-P9-NEXT: lxv vs3, 80(r4)
+; CHECK-P9-NEXT: lxv vs0, 96(r4)
+; CHECK-P9-NEXT: lxv vs1, 112(r4)
+; CHECK-P9-NEXT: xvcvdpsxws v2, vs8
+; CHECK-P9-NEXT: xvcvdpsxws v3, vs6
; CHECK-P9-NEXT: xvcvdpsxws v4, vs7
; CHECK-P9-NEXT: vmrgew v2, v3, v2
; CHECK-P9-NEXT: xvcvdpsxws v3, vs4
; CHECK-P9-NEXT: xxmrgld vs4, vs3, vs2
; CHECK-P9-NEXT: xxmrghd vs2, vs3, vs2
-; CHECK-P9-NEXT: lxv vs0, 96(r4)
-; CHECK-P9-NEXT: lxv vs1, 112(r4)
; CHECK-P9-NEXT: stxv v2, 0(r3)
; CHECK-P9-NEXT: xvcvdpsxws v5, vs2
; CHECK-P9-NEXT: xxmrgld vs2, vs1, vs0
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: lxv vs6, 16(r4)
; CHECK-BE-NEXT: lxv vs7, 0(r4)
-; CHECK-BE-NEXT: xxmrgld vs8, vs7, vs6
-; CHECK-BE-NEXT: xxmrghd vs6, vs7, vs6
; CHECK-BE-NEXT: lxv vs4, 48(r4)
; CHECK-BE-NEXT: lxv vs5, 32(r4)
+; CHECK-BE-NEXT: xxmrgld vs8, vs7, vs6
+; CHECK-BE-NEXT: xxmrghd vs6, vs7, vs6
; CHECK-BE-NEXT: xxmrgld vs7, vs5, vs4
; CHECK-BE-NEXT: xxmrghd vs4, vs5, vs4
-; CHECK-BE-NEXT: xvcvdpsxws v2, vs8
-; CHECK-BE-NEXT: xvcvdpsxws v3, vs6
; CHECK-BE-NEXT: lxv vs2, 80(r4)
; CHECK-BE-NEXT: lxv vs3, 64(r4)
+; CHECK-BE-NEXT: lxv vs0, 112(r4)
+; CHECK-BE-NEXT: lxv vs1, 96(r4)
+; CHECK-BE-NEXT: xvcvdpsxws v2, vs8
+; CHECK-BE-NEXT: xvcvdpsxws v3, vs6
; CHECK-BE-NEXT: xvcvdpsxws v4, vs7
; CHECK-BE-NEXT: vmrgew v2, v3, v2
; CHECK-BE-NEXT: xvcvdpsxws v3, vs4
; CHECK-BE-NEXT: xxmrgld vs4, vs3, vs2
; CHECK-BE-NEXT: xxmrghd vs2, vs3, vs2
-; CHECK-BE-NEXT: lxv vs0, 112(r4)
-; CHECK-BE-NEXT: lxv vs1, 96(r4)
; CHECK-BE-NEXT: stxv v2, 0(r3)
; CHECK-BE-NEXT: xvcvdpsxws v5, vs2
; CHECK-BE-NEXT: xxmrgld vs2, vs1, vs0
; CHECK-P9-LABEL: test4elt:
; CHECK-P9: # %bb.0: # %entry
; CHECK-P9-NEXT: lxv vs1, 0(r3)
+; CHECK-P9-NEXT: lxv vs0, 16(r3)
; CHECK-P9-NEXT: xscvdpsxws f2, f1
; CHECK-P9-NEXT: xxswapd vs1, vs1
; CHECK-P9-NEXT: xscvdpsxws f1, f1
-; CHECK-P9-NEXT: lxv vs0, 16(r3)
; CHECK-P9-NEXT: mffprwz r3, f2
; CHECK-P9-NEXT: mtvsrd v2, r3
; CHECK-P9-NEXT: mffprwz r3, f1
; CHECK-BE-LABEL: test4elt:
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: lxv vs1, 16(r3)
+; CHECK-BE-NEXT: lxv vs0, 0(r3)
; CHECK-BE-NEXT: xscvdpsxws f2, f1
; CHECK-BE-NEXT: xxswapd vs1, vs1
; CHECK-BE-NEXT: xscvdpsxws f1, f1
-; CHECK-BE-NEXT: lxv vs0, 0(r3)
; CHECK-BE-NEXT: mffprwz r3, f2
; CHECK-BE-NEXT: sldi r3, r3, 56
; CHECK-BE-NEXT: mtvsrd v2, r3
; CHECK-P9-LABEL: test8elt:
; CHECK-P9: # %bb.0: # %entry
; CHECK-P9-NEXT: lxv vs3, 0(r3)
-; CHECK-P9-NEXT: xscvdpsxws f4, f3
-; CHECK-P9-NEXT: xxswapd vs3, vs3
-; CHECK-P9-NEXT: xscvdpsxws f3, f3
; CHECK-P9-NEXT: lxv vs2, 16(r3)
; CHECK-P9-NEXT: lxv vs0, 48(r3)
; CHECK-P9-NEXT: lxv vs1, 32(r3)
+; CHECK-P9-NEXT: xscvdpsxws f4, f3
+; CHECK-P9-NEXT: xxswapd vs3, vs3
+; CHECK-P9-NEXT: xscvdpsxws f3, f3
; CHECK-P9-NEXT: mffprwz r3, f4
; CHECK-P9-NEXT: mtvsrd v2, r3
; CHECK-P9-NEXT: mffprwz r3, f3
; CHECK-BE-LABEL: test8elt:
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: lxv vs3, 48(r3)
-; CHECK-BE-NEXT: xscvdpsxws f4, f3
-; CHECK-BE-NEXT: xxswapd vs3, vs3
-; CHECK-BE-NEXT: xscvdpsxws f3, f3
; CHECK-BE-NEXT: lxv vs2, 32(r3)
; CHECK-BE-NEXT: lxv vs0, 0(r3)
; CHECK-BE-NEXT: lxv vs1, 16(r3)
+; CHECK-BE-NEXT: xscvdpsxws f4, f3
+; CHECK-BE-NEXT: xxswapd vs3, vs3
+; CHECK-BE-NEXT: xscvdpsxws f3, f3
; CHECK-BE-NEXT: mffprwz r3, f4
; CHECK-BE-NEXT: sldi r3, r3, 56
; CHECK-BE-NEXT: mtvsrd v2, r3
; CHECK-P9-LABEL: test16elt:
; CHECK-P9: # %bb.0: # %entry
; CHECK-P9-NEXT: lxv vs7, 0(r3)
-; CHECK-P9-NEXT: xscvdpsxws f8, f7
-; CHECK-P9-NEXT: xxswapd vs7, vs7
-; CHECK-P9-NEXT: xscvdpsxws f7, f7
; CHECK-P9-NEXT: lxv vs6, 16(r3)
; CHECK-P9-NEXT: lxv vs0, 112(r3)
; CHECK-P9-NEXT: lxv vs1, 96(r3)
+; CHECK-P9-NEXT: xscvdpsxws f8, f7
+; CHECK-P9-NEXT: xxswapd vs7, vs7
; CHECK-P9-NEXT: lxv vs2, 80(r3)
; CHECK-P9-NEXT: lxv vs3, 64(r3)
; CHECK-P9-NEXT: lxv vs4, 48(r3)
; CHECK-P9-NEXT: lxv vs5, 32(r3)
+; CHECK-P9-NEXT: xscvdpsxws f7, f7
; CHECK-P9-NEXT: mffprwz r3, f8
; CHECK-P9-NEXT: mtvsrd v2, r3
; CHECK-P9-NEXT: mffprwz r3, f7
; CHECK-BE-LABEL: test16elt:
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: lxv vs7, 112(r3)
-; CHECK-BE-NEXT: xscvdpsxws f8, f7
-; CHECK-BE-NEXT: xxswapd vs7, vs7
-; CHECK-BE-NEXT: xscvdpsxws f7, f7
; CHECK-BE-NEXT: lxv vs6, 96(r3)
; CHECK-BE-NEXT: lxv vs0, 0(r3)
; CHECK-BE-NEXT: lxv vs1, 16(r3)
+; CHECK-BE-NEXT: xscvdpsxws f8, f7
+; CHECK-BE-NEXT: xxswapd vs7, vs7
; CHECK-BE-NEXT: lxv vs2, 32(r3)
; CHECK-BE-NEXT: lxv vs3, 48(r3)
; CHECK-BE-NEXT: lxv vs4, 64(r3)
; CHECK-BE-NEXT: lxv vs5, 80(r3)
+; CHECK-BE-NEXT: xscvdpsxws f7, f7
; CHECK-BE-NEXT: mffprwz r3, f8
; CHECK-BE-NEXT: sldi r3, r3, 56
; CHECK-BE-NEXT: mtvsrd v2, r3
; CHECK-P9-LABEL: test4elt_signed:
; CHECK-P9: # %bb.0: # %entry
; CHECK-P9-NEXT: lxv vs1, 0(r3)
+; CHECK-P9-NEXT: lxv vs0, 16(r3)
; CHECK-P9-NEXT: xscvdpsxws f2, f1
; CHECK-P9-NEXT: xxswapd vs1, vs1
; CHECK-P9-NEXT: xscvdpsxws f1, f1
-; CHECK-P9-NEXT: lxv vs0, 16(r3)
; CHECK-P9-NEXT: mffprwz r3, f2
; CHECK-P9-NEXT: mtvsrd v2, r3
; CHECK-P9-NEXT: mffprwz r3, f1
; CHECK-BE-LABEL: test4elt_signed:
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: lxv vs1, 16(r3)
+; CHECK-BE-NEXT: lxv vs0, 0(r3)
; CHECK-BE-NEXT: xscvdpsxws f2, f1
; CHECK-BE-NEXT: xxswapd vs1, vs1
; CHECK-BE-NEXT: xscvdpsxws f1, f1
-; CHECK-BE-NEXT: lxv vs0, 0(r3)
; CHECK-BE-NEXT: mffprwz r3, f2
; CHECK-BE-NEXT: sldi r3, r3, 56
; CHECK-BE-NEXT: mtvsrd v2, r3
; CHECK-P9-LABEL: test8elt_signed:
; CHECK-P9: # %bb.0: # %entry
; CHECK-P9-NEXT: lxv vs3, 0(r3)
-; CHECK-P9-NEXT: xscvdpsxws f4, f3
-; CHECK-P9-NEXT: xxswapd vs3, vs3
-; CHECK-P9-NEXT: xscvdpsxws f3, f3
; CHECK-P9-NEXT: lxv vs2, 16(r3)
; CHECK-P9-NEXT: lxv vs0, 48(r3)
; CHECK-P9-NEXT: lxv vs1, 32(r3)
+; CHECK-P9-NEXT: xscvdpsxws f4, f3
+; CHECK-P9-NEXT: xxswapd vs3, vs3
+; CHECK-P9-NEXT: xscvdpsxws f3, f3
; CHECK-P9-NEXT: mffprwz r3, f4
; CHECK-P9-NEXT: mtvsrd v2, r3
; CHECK-P9-NEXT: mffprwz r3, f3
; CHECK-BE-LABEL: test8elt_signed:
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: lxv vs3, 48(r3)
-; CHECK-BE-NEXT: xscvdpsxws f4, f3
-; CHECK-BE-NEXT: xxswapd vs3, vs3
-; CHECK-BE-NEXT: xscvdpsxws f3, f3
; CHECK-BE-NEXT: lxv vs2, 32(r3)
; CHECK-BE-NEXT: lxv vs0, 0(r3)
; CHECK-BE-NEXT: lxv vs1, 16(r3)
+; CHECK-BE-NEXT: xscvdpsxws f4, f3
+; CHECK-BE-NEXT: xxswapd vs3, vs3
+; CHECK-BE-NEXT: xscvdpsxws f3, f3
; CHECK-BE-NEXT: mffprwz r3, f4
; CHECK-BE-NEXT: sldi r3, r3, 56
; CHECK-BE-NEXT: mtvsrd v2, r3
; CHECK-P9-LABEL: test16elt_signed:
; CHECK-P9: # %bb.0: # %entry
; CHECK-P9-NEXT: lxv vs7, 0(r3)
-; CHECK-P9-NEXT: xscvdpsxws f8, f7
-; CHECK-P9-NEXT: xxswapd vs7, vs7
-; CHECK-P9-NEXT: xscvdpsxws f7, f7
; CHECK-P9-NEXT: lxv vs6, 16(r3)
; CHECK-P9-NEXT: lxv vs0, 112(r3)
; CHECK-P9-NEXT: lxv vs1, 96(r3)
+; CHECK-P9-NEXT: xscvdpsxws f8, f7
+; CHECK-P9-NEXT: xxswapd vs7, vs7
; CHECK-P9-NEXT: lxv vs2, 80(r3)
; CHECK-P9-NEXT: lxv vs3, 64(r3)
; CHECK-P9-NEXT: lxv vs4, 48(r3)
; CHECK-P9-NEXT: lxv vs5, 32(r3)
+; CHECK-P9-NEXT: xscvdpsxws f7, f7
; CHECK-P9-NEXT: mffprwz r3, f8
; CHECK-P9-NEXT: mtvsrd v2, r3
; CHECK-P9-NEXT: mffprwz r3, f7
; CHECK-BE-LABEL: test16elt_signed:
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: lxv vs7, 112(r3)
-; CHECK-BE-NEXT: xscvdpsxws f8, f7
-; CHECK-BE-NEXT: xxswapd vs7, vs7
-; CHECK-BE-NEXT: xscvdpsxws f7, f7
; CHECK-BE-NEXT: lxv vs6, 96(r3)
; CHECK-BE-NEXT: lxv vs0, 0(r3)
; CHECK-BE-NEXT: lxv vs1, 16(r3)
+; CHECK-BE-NEXT: xscvdpsxws f8, f7
+; CHECK-BE-NEXT: xxswapd vs7, vs7
; CHECK-BE-NEXT: lxv vs2, 32(r3)
; CHECK-BE-NEXT: lxv vs3, 48(r3)
; CHECK-BE-NEXT: lxv vs4, 64(r3)
; CHECK-BE-NEXT: lxv vs5, 80(r3)
+; CHECK-BE-NEXT: xscvdpsxws f7, f7
; CHECK-BE-NEXT: mffprwz r3, f8
; CHECK-BE-NEXT: sldi r3, r3, 56
; CHECK-BE-NEXT: mtvsrd v2, r3
; CHECK-P9-NEXT: mtfprwz f0, r3
; CHECK-P9-NEXT: li r3, 2
; CHECK-P9-NEXT: xscvuxdsp f0, f0
-; CHECK-P9-NEXT: xscvdpspn vs0, f0
; CHECK-P9-NEXT: vextuhrx r3, r3, v2
; CHECK-P9-NEXT: clrlwi r3, r3, 16
+; CHECK-P9-NEXT: xscvdpspn vs0, f0
; CHECK-P9-NEXT: xxsldwi v3, vs0, vs0, 3
; CHECK-P9-NEXT: mtfprwz f0, r3
; CHECK-P9-NEXT: xscvuxdsp f0, f0
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: mtvsrd v2, r3
; CHECK-BE-NEXT: addis r3, r2, .LCPI1_0@toc@ha
+; CHECK-BE-NEXT: xxlxor v4, v4, v4
; CHECK-BE-NEXT: addi r3, r3, .LCPI1_0@toc@l
; CHECK-BE-NEXT: lxvx v3, 0, r3
-; CHECK-BE-NEXT: xxlxor v4, v4, v4
; CHECK-BE-NEXT: vperm v2, v2, v4, v3
; CHECK-BE-NEXT: xvcvuxwsp v2, v2
; CHECK-BE-NEXT: blr
; CHECK-BE-LABEL: test8elt:
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: addis r4, r2, .LCPI2_0@toc@ha
+; CHECK-BE-NEXT: xxlxor v4, v4, v4
; CHECK-BE-NEXT: addi r4, r4, .LCPI2_0@toc@l
; CHECK-BE-NEXT: lxvx v3, 0, r4
-; CHECK-BE-NEXT: xxlxor v4, v4, v4
; CHECK-BE-NEXT: addis r4, r2, .LCPI2_1@toc@ha
; CHECK-BE-NEXT: addi r4, r4, .LCPI2_1@toc@l
; CHECK-BE-NEXT: vperm v3, v2, v4, v3
; CHECK-P9-NEXT: lxv v2, 16(r4)
; CHECK-P9-NEXT: lxv v3, 0(r4)
; CHECK-P9-NEXT: addis r4, r2, .LCPI3_0@toc@ha
+; CHECK-P9-NEXT: xxlxor v5, v5, v5
; CHECK-P9-NEXT: addi r4, r4, .LCPI3_0@toc@l
; CHECK-P9-NEXT: lxvx v4, 0, r4
-; CHECK-P9-NEXT: xxlxor v5, v5, v5
; CHECK-P9-NEXT: addis r4, r2, .LCPI3_1@toc@ha
; CHECK-P9-NEXT: addi r4, r4, .LCPI3_1@toc@l
; CHECK-P9-NEXT: vperm v0, v5, v3, v4
; CHECK-BE-NEXT: lxv v2, 16(r4)
; CHECK-BE-NEXT: lxv v3, 0(r4)
; CHECK-BE-NEXT: addis r4, r2, .LCPI3_0@toc@ha
+; CHECK-BE-NEXT: xxlxor v5, v5, v5
; CHECK-BE-NEXT: addi r4, r4, .LCPI3_0@toc@l
; CHECK-BE-NEXT: lxvx v4, 0, r4
-; CHECK-BE-NEXT: xxlxor v5, v5, v5
; CHECK-BE-NEXT: addis r4, r2, .LCPI3_1@toc@ha
; CHECK-BE-NEXT: addi r4, r4, .LCPI3_1@toc@l
; CHECK-BE-NEXT: vperm v0, v3, v5, v4
; CHECK-P9-NEXT: mtfprwa f0, r3
; CHECK-P9-NEXT: li r3, 2
; CHECK-P9-NEXT: xscvsxdsp f0, f0
-; CHECK-P9-NEXT: xscvdpspn vs0, f0
; CHECK-P9-NEXT: vextuhrx r3, r3, v2
; CHECK-P9-NEXT: extsh r3, r3
+; CHECK-P9-NEXT: xscvdpspn vs0, f0
; CHECK-P9-NEXT: xxsldwi v3, vs0, vs0, 3
; CHECK-P9-NEXT: mtfprwa f0, r3
; CHECK-P9-NEXT: xscvsxdsp f0, f0
; CHECK-BE-LABEL: test8elt_signed:
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: addis r4, r2, .LCPI6_0@toc@ha
+; CHECK-BE-NEXT: xxlxor v4, v4, v4
; CHECK-BE-NEXT: addi r4, r4, .LCPI6_0@toc@l
; CHECK-BE-NEXT: lxvx v3, 0, r4
-; CHECK-BE-NEXT: xxlxor v4, v4, v4
; CHECK-BE-NEXT: vperm v3, v4, v2, v3
; CHECK-BE-NEXT: vmrghh v2, v2, v2
; CHECK-BE-NEXT: vextsh2w v3, v3
; CHECK-P9-NEXT: vmrglh v4, v3, v3
; CHECK-P9-NEXT: vmrghh v3, v3, v3
; CHECK-P9-NEXT: vextsh2w v3, v3
+; CHECK-P9-NEXT: vextsh2w v4, v4
; CHECK-P9-NEXT: xvcvsxwsp vs1, v3
; CHECK-P9-NEXT: vmrglh v3, v2, v2
; CHECK-P9-NEXT: vmrghh v2, v2, v2
-; CHECK-P9-NEXT: vextsh2w v4, v4
; CHECK-P9-NEXT: xvcvsxwsp vs0, v4
; CHECK-P9-NEXT: vextsh2w v3, v3
; CHECK-P9-NEXT: vextsh2w v2, v2
; CHECK-BE-NEXT: lxv v2, 16(r4)
; CHECK-BE-NEXT: lxv v3, 0(r4)
; CHECK-BE-NEXT: addis r4, r2, .LCPI7_0@toc@ha
+; CHECK-BE-NEXT: xxlxor v5, v5, v5
; CHECK-BE-NEXT: addi r4, r4, .LCPI7_0@toc@l
; CHECK-BE-NEXT: lxvx v4, 0, r4
-; CHECK-BE-NEXT: xxlxor v5, v5, v5
; CHECK-BE-NEXT: vperm v0, v5, v3, v4
; CHECK-BE-NEXT: vperm v4, v5, v2, v4
; CHECK-BE-NEXT: vmrghh v3, v3, v3
; CHECK-P9: # %bb.0: # %entry
; CHECK-P9-NEXT: mtvsrws v2, r3
; CHECK-P9-NEXT: addis r3, r2, .LCPI0_0@toc@ha
+; CHECK-P9-NEXT: xxlxor v4, v4, v4
; CHECK-P9-NEXT: addi r3, r3, .LCPI0_0@toc@l
; CHECK-P9-NEXT: lxvx v3, 0, r3
-; CHECK-P9-NEXT: xxlxor v4, v4, v4
; CHECK-P9-NEXT: vperm v2, v4, v2, v3
; CHECK-P9-NEXT: xvcvuxddp v2, v2
; CHECK-P9-NEXT: blr
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: mtvsrws v2, r3
; CHECK-BE-NEXT: addis r3, r2, .LCPI0_0@toc@ha
+; CHECK-BE-NEXT: xxlxor v4, v4, v4
; CHECK-BE-NEXT: addi r3, r3, .LCPI0_0@toc@l
; CHECK-BE-NEXT: lxvx v3, 0, r3
-; CHECK-BE-NEXT: xxlxor v4, v4, v4
; CHECK-BE-NEXT: vperm v2, v2, v4, v3
; CHECK-BE-NEXT: xvcvuxddp v2, v2
; CHECK-BE-NEXT: blr
; CHECK-P9: # %bb.0: # %entry
; CHECK-P9-NEXT: mtvsrd v2, r4
; CHECK-P9-NEXT: addis r4, r2, .LCPI1_0@toc@ha
+; CHECK-P9-NEXT: xxlxor v4, v4, v4
; CHECK-P9-NEXT: addi r4, r4, .LCPI1_0@toc@l
; CHECK-P9-NEXT: lxvx v3, 0, r4
-; CHECK-P9-NEXT: xxlxor v4, v4, v4
; CHECK-P9-NEXT: addis r4, r2, .LCPI1_1@toc@ha
; CHECK-P9-NEXT: addi r4, r4, .LCPI1_1@toc@l
; CHECK-P9-NEXT: vperm v3, v4, v2, v3
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: mtvsrd v2, r4
; CHECK-BE-NEXT: addis r4, r2, .LCPI1_0@toc@ha
+; CHECK-BE-NEXT: xxlxor v4, v4, v4
; CHECK-BE-NEXT: addi r4, r4, .LCPI1_0@toc@l
; CHECK-BE-NEXT: lxvx v3, 0, r4
-; CHECK-BE-NEXT: xxlxor v4, v4, v4
; CHECK-BE-NEXT: addis r4, r2, .LCPI1_1@toc@ha
; CHECK-BE-NEXT: addi r4, r4, .LCPI1_1@toc@l
; CHECK-BE-NEXT: vperm v3, v2, v4, v3
; CHECK-P9-LABEL: test8elt:
; CHECK-P9: # %bb.0: # %entry
; CHECK-P9-NEXT: addis r4, r2, .LCPI2_0@toc@ha
+; CHECK-P9-NEXT: xxlxor v4, v4, v4
; CHECK-P9-NEXT: addi r4, r4, .LCPI2_0@toc@l
; CHECK-P9-NEXT: lxvx v3, 0, r4
-; CHECK-P9-NEXT: xxlxor v4, v4, v4
; CHECK-P9-NEXT: addis r4, r2, .LCPI2_1@toc@ha
; CHECK-P9-NEXT: addi r4, r4, .LCPI2_1@toc@l
; CHECK-P9-NEXT: vperm v3, v4, v2, v3
; CHECK-BE-LABEL: test8elt:
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: addis r4, r2, .LCPI2_0@toc@ha
+; CHECK-BE-NEXT: xxlxor v4, v4, v4
; CHECK-BE-NEXT: addi r4, r4, .LCPI2_0@toc@l
; CHECK-BE-NEXT: lxvx v3, 0, r4
-; CHECK-BE-NEXT: xxlxor v4, v4, v4
; CHECK-BE-NEXT: addis r4, r2, .LCPI2_1@toc@ha
; CHECK-BE-NEXT: addi r4, r4, .LCPI2_1@toc@l
; CHECK-BE-NEXT: vperm v3, v2, v4, v3
; CHECK-P9-NEXT: lxv v2, 16(r4)
; CHECK-P9-NEXT: lxv v3, 0(r4)
; CHECK-P9-NEXT: addis r4, r2, .LCPI3_0@toc@ha
+; CHECK-P9-NEXT: xxlxor v5, v5, v5
; CHECK-P9-NEXT: addi r4, r4, .LCPI3_0@toc@l
; CHECK-P9-NEXT: lxvx v4, 0, r4
-; CHECK-P9-NEXT: xxlxor v5, v5, v5
; CHECK-P9-NEXT: addis r4, r2, .LCPI3_1@toc@ha
; CHECK-P9-NEXT: addi r4, r4, .LCPI3_1@toc@l
; CHECK-P9-NEXT: vperm v0, v5, v3, v4
; CHECK-BE-NEXT: lxv v2, 16(r4)
; CHECK-BE-NEXT: lxv v3, 0(r4)
; CHECK-BE-NEXT: addis r4, r2, .LCPI3_0@toc@ha
+; CHECK-BE-NEXT: xxlxor v5, v5, v5
; CHECK-BE-NEXT: addi r4, r4, .LCPI3_0@toc@l
; CHECK-BE-NEXT: lxvx v4, 0, r4
-; CHECK-BE-NEXT: xxlxor v5, v5, v5
; CHECK-BE-NEXT: addis r4, r2, .LCPI3_1@toc@ha
; CHECK-BE-NEXT: addi r4, r4, .LCPI3_1@toc@l
; CHECK-BE-NEXT: vperm v0, v3, v5, v4
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: mtvsrd v2, r4
; CHECK-BE-NEXT: addis r4, r2, .LCPI5_0@toc@ha
+; CHECK-BE-NEXT: xxlxor v3, v3, v3
; CHECK-BE-NEXT: addi r4, r4, .LCPI5_0@toc@l
; CHECK-BE-NEXT: lxvx v4, 0, r4
-; CHECK-BE-NEXT: xxlxor v3, v3, v3
-; CHECK-BE-NEXT: vperm v3, v3, v2, v4
; CHECK-BE-NEXT: addis r4, r2, .LCPI5_1@toc@ha
-; CHECK-BE-NEXT: vextsh2d v3, v3
; CHECK-BE-NEXT: addi r4, r4, .LCPI5_1@toc@l
+; CHECK-BE-NEXT: vperm v3, v3, v2, v4
+; CHECK-BE-NEXT: vextsh2d v3, v3
; CHECK-BE-NEXT: xvcvsxddp vs0, v3
; CHECK-BE-NEXT: lxvx v3, 0, r4
; CHECK-BE-NEXT: vperm v2, v2, v2, v3
; CHECK-BE-LABEL: test8elt_signed:
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: addis r4, r2, .LCPI6_0@toc@ha
+; CHECK-BE-NEXT: xxlxor v4, v4, v4
; CHECK-BE-NEXT: addi r4, r4, .LCPI6_0@toc@l
; CHECK-BE-NEXT: lxvx v3, 0, r4
-; CHECK-BE-NEXT: xxlxor v4, v4, v4
-; CHECK-BE-NEXT: vperm v3, v4, v2, v3
; CHECK-BE-NEXT: addis r4, r2, .LCPI6_1@toc@ha
; CHECK-BE-NEXT: addi r4, r4, .LCPI6_1@toc@l
+; CHECK-BE-NEXT: vperm v3, v4, v2, v3
; CHECK-BE-NEXT: vextsh2d v3, v3
; CHECK-BE-NEXT: xvcvsxddp vs0, v3
; CHECK-BE-NEXT: lxvx v3, 0, r4
; CHECK-P9-LABEL: test16elt_signed:
; CHECK-P9: # %bb.0: # %entry
; CHECK-P9-NEXT: addis r5, r2, .LCPI7_0@toc@ha
-; CHECK-P9-NEXT: addi r5, r5, .LCPI7_0@toc@l
; CHECK-P9-NEXT: lxv v2, 0(r4)
+; CHECK-P9-NEXT: addi r5, r5, .LCPI7_0@toc@l
; CHECK-P9-NEXT: lxvx v3, 0, r5
; CHECK-P9-NEXT: addis r5, r2, .LCPI7_1@toc@ha
; CHECK-P9-NEXT: addi r5, r5, .LCPI7_1@toc@l
; CHECK-P9-NEXT: xvcvsxddp vs1, v4
; CHECK-P9-NEXT: vperm v4, v2, v2, v0
; CHECK-P9-NEXT: vperm v2, v2, v2, v1
+; CHECK-P9-NEXT: stxv vs0, 0(r3)
; CHECK-P9-NEXT: vextsh2d v4, v4
; CHECK-P9-NEXT: xvcvsxddp vs2, v4
; CHECK-P9-NEXT: lxv v4, 16(r4)
+; CHECK-P9-NEXT: stxv vs1, 16(r3)
; CHECK-P9-NEXT: vextsh2d v2, v2
; CHECK-P9-NEXT: xvcvsxddp vs3, v2
; CHECK-P9-NEXT: vperm v2, v4, v4, v3
; CHECK-P9-NEXT: stxv vs2, 32(r3)
; CHECK-P9-NEXT: vextsh2d v2, v2
; CHECK-P9-NEXT: stxv vs3, 48(r3)
-; CHECK-P9-NEXT: stxv vs1, 16(r3)
; CHECK-P9-NEXT: xvcvsxddp vs4, v2
; CHECK-P9-NEXT: vperm v2, v4, v4, v5
; CHECK-P9-NEXT: vextsh2d v2, v2
; CHECK-P9-NEXT: xvcvsxddp vs6, v2
; CHECK-P9-NEXT: vperm v2, v4, v4, v1
; CHECK-P9-NEXT: stxv vs5, 80(r3)
-; CHECK-P9-NEXT: stxv vs6, 96(r3)
; CHECK-P9-NEXT: vextsh2d v2, v2
; CHECK-P9-NEXT: xvcvsxddp vs7, v2
+; CHECK-P9-NEXT: stxv vs6, 96(r3)
; CHECK-P9-NEXT: stxv vs7, 112(r3)
-; CHECK-P9-NEXT: stxv vs0, 0(r3)
; CHECK-P9-NEXT: blr
;
; CHECK-BE-LABEL: test16elt_signed:
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: addis r5, r2, .LCPI7_0@toc@ha
-; CHECK-BE-NEXT: addi r5, r5, .LCPI7_0@toc@l
-; CHECK-BE-NEXT: lxvx v2, 0, r5
; CHECK-BE-NEXT: lxv v4, 0(r4)
; CHECK-BE-NEXT: lxv v1, 16(r4)
+; CHECK-BE-NEXT: xxlxor v5, v5, v5
+; CHECK-BE-NEXT: addis r4, r2, .LCPI7_2@toc@ha
+; CHECK-BE-NEXT: addi r5, r5, .LCPI7_0@toc@l
+; CHECK-BE-NEXT: addi r4, r4, .LCPI7_2@toc@l
+; CHECK-BE-NEXT: lxvx v2, 0, r5
; CHECK-BE-NEXT: addis r5, r2, .LCPI7_1@toc@ha
; CHECK-BE-NEXT: addi r5, r5, .LCPI7_1@toc@l
-; CHECK-BE-NEXT: addis r4, r2, .LCPI7_2@toc@ha
-; CHECK-BE-NEXT: xxlxor v5, v5, v5
-; CHECK-BE-NEXT: vperm v0, v5, v4, v2
; CHECK-BE-NEXT: lxvx v3, 0, r5
+; CHECK-BE-NEXT: vperm v0, v5, v4, v2
; CHECK-BE-NEXT: vperm v2, v5, v1, v2
; CHECK-BE-NEXT: vextsh2d v2, v2
-; CHECK-BE-NEXT: addi r4, r4, .LCPI7_2@toc@l
; CHECK-BE-NEXT: vextsh2d v0, v0
; CHECK-BE-NEXT: xvcvsxddp vs2, v2
; CHECK-BE-NEXT: vperm v2, v5, v1, v3
+; CHECK-BE-NEXT: xvcvsxddp vs0, v0
+; CHECK-BE-NEXT: vperm v0, v5, v4, v3
; CHECK-BE-NEXT: vextsh2d v2, v2
-; CHECK-BE-NEXT: stxv vs2, 80(r3)
+; CHECK-BE-NEXT: vextsh2d v0, v0
; CHECK-BE-NEXT: xvcvsxddp vs3, v2
; CHECK-BE-NEXT: lxvx v2, 0, r4
-; CHECK-BE-NEXT: xvcvsxddp vs0, v0
-; CHECK-BE-NEXT: vperm v0, v5, v4, v3
-; CHECK-BE-NEXT: vperm v3, v4, v4, v2
; CHECK-BE-NEXT: addis r4, r2, .LCPI7_3@toc@ha
-; CHECK-BE-NEXT: vextsh2d v0, v0
; CHECK-BE-NEXT: xvcvsxddp vs1, v0
+; CHECK-BE-NEXT: addi r4, r4, .LCPI7_3@toc@l
+; CHECK-BE-NEXT: stxv vs2, 80(r3)
+; CHECK-BE-NEXT: stxv vs0, 16(r3)
+; CHECK-BE-NEXT: vperm v3, v4, v4, v2
+; CHECK-BE-NEXT: vperm v2, v1, v1, v2
+; CHECK-BE-NEXT: stxv vs3, 112(r3)
; CHECK-BE-NEXT: stxv vs1, 48(r3)
; CHECK-BE-NEXT: vextsh2d v3, v3
-; CHECK-BE-NEXT: addi r4, r4, .LCPI7_3@toc@l
+; CHECK-BE-NEXT: vextsh2d v2, v2
; CHECK-BE-NEXT: xvcvsxddp vs4, v3
; CHECK-BE-NEXT: lxvx v3, 0, r4
-; CHECK-BE-NEXT: vperm v2, v1, v1, v2
-; CHECK-BE-NEXT: vextsh2d v2, v2
; CHECK-BE-NEXT: xvcvsxddp vs6, v2
-; CHECK-BE-NEXT: vperm v2, v1, v1, v3
; CHECK-BE-NEXT: vperm v4, v4, v4, v3
+; CHECK-BE-NEXT: vperm v2, v1, v1, v3
+; CHECK-BE-NEXT: stxv vs6, 64(r3)
+; CHECK-BE-NEXT: stxv vs4, 0(r3)
; CHECK-BE-NEXT: vextsh2d v4, v4
; CHECK-BE-NEXT: vextsh2d v2, v2
-; CHECK-BE-NEXT: xvcvsxddp vs7, v2
; CHECK-BE-NEXT: xvcvsxddp vs5, v4
-; CHECK-BE-NEXT: stxv vs3, 112(r3)
-; CHECK-BE-NEXT: stxv vs6, 64(r3)
-; CHECK-BE-NEXT: stxv vs0, 16(r3)
-; CHECK-BE-NEXT: stxv vs4, 0(r3)
+; CHECK-BE-NEXT: xvcvsxddp vs7, v2
; CHECK-BE-NEXT: stxv vs7, 96(r3)
; CHECK-BE-NEXT: stxv vs5, 32(r3)
; CHECK-BE-NEXT: blr
; CHECK-P9-LABEL: test8elt:
; CHECK-P9: # %bb.0: # %entry
; CHECK-P9-NEXT: lxv vs1, 0(r4)
-; CHECK-P9-NEXT: xxmrglw v2, vs1, vs1
; CHECK-P9-NEXT: lxv vs0, 16(r4)
+; CHECK-P9-NEXT: xxmrglw v2, vs1, vs1
; CHECK-P9-NEXT: xvcvuxwdp vs2, v2
; CHECK-P9-NEXT: xxmrghw v2, vs1, vs1
; CHECK-P9-NEXT: xvcvuxwdp vs1, v2
; CHECK-BE-LABEL: test8elt:
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: lxv vs1, 0(r4)
-; CHECK-BE-NEXT: xxmrghw v2, vs1, vs1
; CHECK-BE-NEXT: lxv vs0, 16(r4)
+; CHECK-BE-NEXT: xxmrghw v2, vs1, vs1
; CHECK-BE-NEXT: xvcvuxwdp vs2, v2
; CHECK-BE-NEXT: xxmrglw v2, vs1, vs1
; CHECK-BE-NEXT: xvcvuxwdp vs1, v2
; CHECK-P9-LABEL: test16elt:
; CHECK-P9: # %bb.0: # %entry
; CHECK-P9-NEXT: lxv vs0, 0(r4)
-; CHECK-P9-NEXT: xxmrglw v2, vs0, vs0
; CHECK-P9-NEXT: lxv vs2, 16(r4)
+; CHECK-P9-NEXT: lxv vs5, 32(r4)
; CHECK-P9-NEXT: lxv vs4, 48(r4)
+; CHECK-P9-NEXT: xxmrglw v2, vs0, vs0
; CHECK-P9-NEXT: xvcvuxwdp vs1, v2
; CHECK-P9-NEXT: xxmrghw v2, vs0, vs0
-; CHECK-P9-NEXT: lxv vs5, 32(r4)
; CHECK-P9-NEXT: xvcvuxwdp vs0, v2
; CHECK-P9-NEXT: xxmrglw v2, vs2, vs2
; CHECK-P9-NEXT: xvcvuxwdp vs3, v2
; CHECK-BE-LABEL: test16elt:
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: lxv vs0, 0(r4)
-; CHECK-BE-NEXT: xxmrghw v2, vs0, vs0
; CHECK-BE-NEXT: lxv vs2, 16(r4)
+; CHECK-BE-NEXT: lxv vs5, 32(r4)
; CHECK-BE-NEXT: lxv vs4, 48(r4)
+; CHECK-BE-NEXT: xxmrghw v2, vs0, vs0
; CHECK-BE-NEXT: xvcvuxwdp vs1, v2
; CHECK-BE-NEXT: xxmrglw v2, vs0, vs0
-; CHECK-BE-NEXT: lxv vs5, 32(r4)
; CHECK-BE-NEXT: xvcvuxwdp vs0, v2
; CHECK-BE-NEXT: xxmrghw v2, vs2, vs2
; CHECK-BE-NEXT: xvcvuxwdp vs3, v2
; CHECK-P9-LABEL: test8elt_signed:
; CHECK-P9: # %bb.0: # %entry
; CHECK-P9-NEXT: lxv vs1, 0(r4)
-; CHECK-P9-NEXT: xxmrglw v2, vs1, vs1
; CHECK-P9-NEXT: lxv vs0, 16(r4)
+; CHECK-P9-NEXT: xxmrglw v2, vs1, vs1
; CHECK-P9-NEXT: xvcvsxwdp vs2, v2
; CHECK-P9-NEXT: xxmrghw v2, vs1, vs1
; CHECK-P9-NEXT: xvcvsxwdp vs1, v2
; CHECK-BE-LABEL: test8elt_signed:
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: lxv vs1, 0(r4)
-; CHECK-BE-NEXT: xxmrghw v2, vs1, vs1
; CHECK-BE-NEXT: lxv vs0, 16(r4)
+; CHECK-BE-NEXT: xxmrghw v2, vs1, vs1
; CHECK-BE-NEXT: xvcvsxwdp vs2, v2
; CHECK-BE-NEXT: xxmrglw v2, vs1, vs1
; CHECK-BE-NEXT: xvcvsxwdp vs1, v2
; CHECK-P9-LABEL: test16elt_signed:
; CHECK-P9: # %bb.0: # %entry
; CHECK-P9-NEXT: lxv vs0, 0(r4)
-; CHECK-P9-NEXT: xxmrglw v2, vs0, vs0
; CHECK-P9-NEXT: lxv vs2, 16(r4)
+; CHECK-P9-NEXT: lxv vs5, 32(r4)
; CHECK-P9-NEXT: lxv vs4, 48(r4)
+; CHECK-P9-NEXT: xxmrglw v2, vs0, vs0
; CHECK-P9-NEXT: xvcvsxwdp vs1, v2
; CHECK-P9-NEXT: xxmrghw v2, vs0, vs0
-; CHECK-P9-NEXT: lxv vs5, 32(r4)
; CHECK-P9-NEXT: xvcvsxwdp vs0, v2
; CHECK-P9-NEXT: xxmrglw v2, vs2, vs2
; CHECK-P9-NEXT: xvcvsxwdp vs3, v2
; CHECK-BE-LABEL: test16elt_signed:
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: lxv vs0, 0(r4)
-; CHECK-BE-NEXT: xxmrghw v2, vs0, vs0
; CHECK-BE-NEXT: lxv vs2, 16(r4)
+; CHECK-BE-NEXT: lxv vs5, 32(r4)
; CHECK-BE-NEXT: lxv vs4, 48(r4)
+; CHECK-BE-NEXT: xxmrghw v2, vs0, vs0
; CHECK-BE-NEXT: xvcvsxwdp vs1, v2
; CHECK-BE-NEXT: xxmrglw v2, vs0, vs0
-; CHECK-BE-NEXT: lxv vs5, 32(r4)
; CHECK-BE-NEXT: xvcvsxwdp vs0, v2
; CHECK-BE-NEXT: xxmrghw v2, vs2, vs2
; CHECK-BE-NEXT: xvcvsxwdp vs3, v2
; CHECK-P9-LABEL: test4elt:
; CHECK-P9: # %bb.0: # %entry
; CHECK-P9-NEXT: lxv v3, 0(r3)
-; CHECK-P9-NEXT: xvcvuxdsp vs0, v3
; CHECK-P9-NEXT: lxv v2, 16(r3)
+; CHECK-P9-NEXT: xvcvuxdsp vs0, v3
; CHECK-P9-NEXT: xxsldwi v3, vs0, vs0, 3
; CHECK-P9-NEXT: xvcvuxdsp vs0, v2
; CHECK-P9-NEXT: xxsldwi v2, vs0, vs0, 3
; CHECK-BE-LABEL: test4elt:
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: lxv v3, 16(r3)
-; CHECK-BE-NEXT: xvcvuxdsp vs0, v3
; CHECK-BE-NEXT: lxv v2, 0(r3)
+; CHECK-BE-NEXT: xvcvuxdsp vs0, v3
; CHECK-BE-NEXT: xxsldwi v3, vs0, vs0, 3
; CHECK-BE-NEXT: xvcvuxdsp vs0, v2
; CHECK-BE-NEXT: xxsldwi v2, vs0, vs0, 3
; CHECK-P9-LABEL: test8elt:
; CHECK-P9: # %bb.0: # %entry
; CHECK-P9-NEXT: lxv v5, 0(r4)
-; CHECK-P9-NEXT: xvcvuxdsp vs0, v5
; CHECK-P9-NEXT: lxv v4, 16(r4)
+; CHECK-P9-NEXT: lxv v3, 32(r4)
+; CHECK-P9-NEXT: lxv v2, 48(r4)
+; CHECK-P9-NEXT: xvcvuxdsp vs0, v5
; CHECK-P9-NEXT: xxsldwi v5, vs0, vs0, 3
; CHECK-P9-NEXT: xvcvuxdsp vs0, v4
-; CHECK-P9-NEXT: lxv v3, 32(r4)
; CHECK-P9-NEXT: xxsldwi v4, vs0, vs0, 3
; CHECK-P9-NEXT: xvcvuxdsp vs0, v3
-; CHECK-P9-NEXT: lxv v2, 48(r4)
; CHECK-P9-NEXT: vpkudum v3, v4, v5
; CHECK-P9-NEXT: stxv v3, 0(r3)
; CHECK-P9-NEXT: xxsldwi v4, vs0, vs0, 3
; CHECK-BE-LABEL: test8elt:
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: lxv v5, 16(r4)
-; CHECK-BE-NEXT: xvcvuxdsp vs0, v5
; CHECK-BE-NEXT: lxv v4, 0(r4)
+; CHECK-BE-NEXT: lxv v3, 48(r4)
+; CHECK-BE-NEXT: lxv v2, 32(r4)
+; CHECK-BE-NEXT: xvcvuxdsp vs0, v5
; CHECK-BE-NEXT: xxsldwi v5, vs0, vs0, 3
; CHECK-BE-NEXT: xvcvuxdsp vs0, v4
-; CHECK-BE-NEXT: lxv v3, 48(r4)
; CHECK-BE-NEXT: xxsldwi v4, vs0, vs0, 3
; CHECK-BE-NEXT: xvcvuxdsp vs0, v3
-; CHECK-BE-NEXT: lxv v2, 32(r4)
; CHECK-BE-NEXT: vpkudum v3, v4, v5
; CHECK-BE-NEXT: stxv v3, 0(r3)
; CHECK-BE-NEXT: xxsldwi v4, vs0, vs0, 3
; CHECK-P9-LABEL: test16elt:
; CHECK-P9: # %bb.0: # %entry
; CHECK-P9-NEXT: lxv v7, 0(r4)
-; CHECK-P9-NEXT: xvcvuxdsp vs0, v7
; CHECK-P9-NEXT: lxv v6, 16(r4)
+; CHECK-P9-NEXT: lxv v1, 32(r4)
+; CHECK-P9-NEXT: lxv v0, 48(r4)
+; CHECK-P9-NEXT: xvcvuxdsp vs0, v7
+; CHECK-P9-NEXT: lxv v5, 64(r4)
+; CHECK-P9-NEXT: lxv v4, 80(r4)
+; CHECK-P9-NEXT: lxv v3, 96(r4)
+; CHECK-P9-NEXT: lxv v2, 112(r4)
; CHECK-P9-NEXT: xxsldwi v7, vs0, vs0, 3
; CHECK-P9-NEXT: xvcvuxdsp vs0, v6
-; CHECK-P9-NEXT: lxv v1, 32(r4)
; CHECK-P9-NEXT: xxsldwi v6, vs0, vs0, 3
; CHECK-P9-NEXT: xvcvuxdsp vs0, v1
-; CHECK-P9-NEXT: lxv v0, 48(r4)
; CHECK-P9-NEXT: vpkudum v1, v6, v7
+; CHECK-P9-NEXT: stxv v1, 0(r3)
; CHECK-P9-NEXT: xxsldwi v6, vs0, vs0, 3
; CHECK-P9-NEXT: xvcvuxdsp vs0, v0
-; CHECK-P9-NEXT: lxv v5, 64(r4)
-; CHECK-P9-NEXT: stxv v1, 0(r3)
; CHECK-P9-NEXT: xxsldwi v0, vs0, vs0, 3
; CHECK-P9-NEXT: xvcvuxdsp vs0, v5
-; CHECK-P9-NEXT: lxv v4, 80(r4)
; CHECK-P9-NEXT: vpkudum v0, v0, v6
; CHECK-P9-NEXT: stxv v0, 16(r3)
; CHECK-P9-NEXT: xxsldwi v5, vs0, vs0, 3
; CHECK-P9-NEXT: xvcvuxdsp vs0, v4
-; CHECK-P9-NEXT: lxv v3, 96(r4)
; CHECK-P9-NEXT: xxsldwi v4, vs0, vs0, 3
; CHECK-P9-NEXT: xvcvuxdsp vs0, v3
-; CHECK-P9-NEXT: lxv v2, 112(r4)
; CHECK-P9-NEXT: vpkudum v4, v4, v5
; CHECK-P9-NEXT: stxv v4, 32(r3)
; CHECK-P9-NEXT: xxsldwi v3, vs0, vs0, 3
; CHECK-BE-LABEL: test16elt:
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: lxv v7, 16(r4)
-; CHECK-BE-NEXT: xvcvuxdsp vs0, v7
; CHECK-BE-NEXT: lxv v6, 0(r4)
+; CHECK-BE-NEXT: lxv v1, 48(r4)
+; CHECK-BE-NEXT: lxv v0, 32(r4)
+; CHECK-BE-NEXT: xvcvuxdsp vs0, v7
+; CHECK-BE-NEXT: lxv v5, 80(r4)
+; CHECK-BE-NEXT: lxv v4, 64(r4)
+; CHECK-BE-NEXT: lxv v3, 112(r4)
+; CHECK-BE-NEXT: lxv v2, 96(r4)
; CHECK-BE-NEXT: xxsldwi v7, vs0, vs0, 3
; CHECK-BE-NEXT: xvcvuxdsp vs0, v6
-; CHECK-BE-NEXT: lxv v1, 48(r4)
; CHECK-BE-NEXT: xxsldwi v6, vs0, vs0, 3
; CHECK-BE-NEXT: xvcvuxdsp vs0, v1
-; CHECK-BE-NEXT: lxv v0, 32(r4)
; CHECK-BE-NEXT: vpkudum v1, v6, v7
+; CHECK-BE-NEXT: stxv v1, 0(r3)
; CHECK-BE-NEXT: xxsldwi v6, vs0, vs0, 3
; CHECK-BE-NEXT: xvcvuxdsp vs0, v0
-; CHECK-BE-NEXT: lxv v5, 80(r4)
-; CHECK-BE-NEXT: stxv v1, 0(r3)
; CHECK-BE-NEXT: xxsldwi v0, vs0, vs0, 3
; CHECK-BE-NEXT: xvcvuxdsp vs0, v5
-; CHECK-BE-NEXT: lxv v4, 64(r4)
; CHECK-BE-NEXT: vpkudum v0, v0, v6
; CHECK-BE-NEXT: stxv v0, 16(r3)
; CHECK-BE-NEXT: xxsldwi v5, vs0, vs0, 3
; CHECK-BE-NEXT: xvcvuxdsp vs0, v4
-; CHECK-BE-NEXT: lxv v3, 112(r4)
; CHECK-BE-NEXT: xxsldwi v4, vs0, vs0, 3
; CHECK-BE-NEXT: xvcvuxdsp vs0, v3
-; CHECK-BE-NEXT: lxv v2, 96(r4)
; CHECK-BE-NEXT: vpkudum v4, v4, v5
; CHECK-BE-NEXT: stxv v4, 32(r3)
; CHECK-BE-NEXT: xxsldwi v3, vs0, vs0, 3
; CHECK-P9-LABEL: test4elt_signed:
; CHECK-P9: # %bb.0: # %entry
; CHECK-P9-NEXT: lxv v3, 0(r3)
-; CHECK-P9-NEXT: xvcvsxdsp vs0, v3
; CHECK-P9-NEXT: lxv v2, 16(r3)
+; CHECK-P9-NEXT: xvcvsxdsp vs0, v3
; CHECK-P9-NEXT: xxsldwi v3, vs0, vs0, 3
; CHECK-P9-NEXT: xvcvsxdsp vs0, v2
; CHECK-P9-NEXT: xxsldwi v2, vs0, vs0, 3
; CHECK-BE-LABEL: test4elt_signed:
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: lxv v3, 16(r3)
-; CHECK-BE-NEXT: xvcvsxdsp vs0, v3
; CHECK-BE-NEXT: lxv v2, 0(r3)
+; CHECK-BE-NEXT: xvcvsxdsp vs0, v3
; CHECK-BE-NEXT: xxsldwi v3, vs0, vs0, 3
; CHECK-BE-NEXT: xvcvsxdsp vs0, v2
; CHECK-BE-NEXT: xxsldwi v2, vs0, vs0, 3
; CHECK-P9-LABEL: test8elt_signed:
; CHECK-P9: # %bb.0: # %entry
; CHECK-P9-NEXT: lxv v5, 0(r4)
-; CHECK-P9-NEXT: xvcvsxdsp vs0, v5
; CHECK-P9-NEXT: lxv v4, 16(r4)
+; CHECK-P9-NEXT: lxv v3, 32(r4)
+; CHECK-P9-NEXT: lxv v2, 48(r4)
+; CHECK-P9-NEXT: xvcvsxdsp vs0, v5
; CHECK-P9-NEXT: xxsldwi v5, vs0, vs0, 3
; CHECK-P9-NEXT: xvcvsxdsp vs0, v4
-; CHECK-P9-NEXT: lxv v3, 32(r4)
; CHECK-P9-NEXT: xxsldwi v4, vs0, vs0, 3
; CHECK-P9-NEXT: xvcvsxdsp vs0, v3
-; CHECK-P9-NEXT: lxv v2, 48(r4)
; CHECK-P9-NEXT: vpkudum v3, v4, v5
; CHECK-P9-NEXT: stxv v3, 0(r3)
; CHECK-P9-NEXT: xxsldwi v4, vs0, vs0, 3
; CHECK-BE-LABEL: test8elt_signed:
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: lxv v5, 16(r4)
-; CHECK-BE-NEXT: xvcvsxdsp vs0, v5
; CHECK-BE-NEXT: lxv v4, 0(r4)
+; CHECK-BE-NEXT: lxv v3, 48(r4)
+; CHECK-BE-NEXT: lxv v2, 32(r4)
+; CHECK-BE-NEXT: xvcvsxdsp vs0, v5
; CHECK-BE-NEXT: xxsldwi v5, vs0, vs0, 3
; CHECK-BE-NEXT: xvcvsxdsp vs0, v4
-; CHECK-BE-NEXT: lxv v3, 48(r4)
; CHECK-BE-NEXT: xxsldwi v4, vs0, vs0, 3
; CHECK-BE-NEXT: xvcvsxdsp vs0, v3
-; CHECK-BE-NEXT: lxv v2, 32(r4)
; CHECK-BE-NEXT: vpkudum v3, v4, v5
; CHECK-BE-NEXT: stxv v3, 0(r3)
; CHECK-BE-NEXT: xxsldwi v4, vs0, vs0, 3
; CHECK-P9-LABEL: test16elt_signed:
; CHECK-P9: # %bb.0: # %entry
; CHECK-P9-NEXT: lxv v7, 0(r4)
-; CHECK-P9-NEXT: xvcvsxdsp vs0, v7
; CHECK-P9-NEXT: lxv v6, 16(r4)
+; CHECK-P9-NEXT: lxv v1, 32(r4)
+; CHECK-P9-NEXT: lxv v0, 48(r4)
+; CHECK-P9-NEXT: xvcvsxdsp vs0, v7
+; CHECK-P9-NEXT: lxv v5, 64(r4)
+; CHECK-P9-NEXT: lxv v4, 80(r4)
+; CHECK-P9-NEXT: lxv v3, 96(r4)
+; CHECK-P9-NEXT: lxv v2, 112(r4)
; CHECK-P9-NEXT: xxsldwi v7, vs0, vs0, 3
; CHECK-P9-NEXT: xvcvsxdsp vs0, v6
-; CHECK-P9-NEXT: lxv v1, 32(r4)
; CHECK-P9-NEXT: xxsldwi v6, vs0, vs0, 3
; CHECK-P9-NEXT: xvcvsxdsp vs0, v1
-; CHECK-P9-NEXT: lxv v0, 48(r4)
; CHECK-P9-NEXT: vpkudum v1, v6, v7
+; CHECK-P9-NEXT: stxv v1, 0(r3)
; CHECK-P9-NEXT: xxsldwi v6, vs0, vs0, 3
; CHECK-P9-NEXT: xvcvsxdsp vs0, v0
-; CHECK-P9-NEXT: lxv v5, 64(r4)
-; CHECK-P9-NEXT: stxv v1, 0(r3)
; CHECK-P9-NEXT: xxsldwi v0, vs0, vs0, 3
; CHECK-P9-NEXT: xvcvsxdsp vs0, v5
-; CHECK-P9-NEXT: lxv v4, 80(r4)
; CHECK-P9-NEXT: vpkudum v0, v0, v6
; CHECK-P9-NEXT: stxv v0, 16(r3)
; CHECK-P9-NEXT: xxsldwi v5, vs0, vs0, 3
; CHECK-P9-NEXT: xvcvsxdsp vs0, v4
-; CHECK-P9-NEXT: lxv v3, 96(r4)
; CHECK-P9-NEXT: xxsldwi v4, vs0, vs0, 3
; CHECK-P9-NEXT: xvcvsxdsp vs0, v3
-; CHECK-P9-NEXT: lxv v2, 112(r4)
; CHECK-P9-NEXT: vpkudum v4, v4, v5
; CHECK-P9-NEXT: stxv v4, 32(r3)
; CHECK-P9-NEXT: xxsldwi v3, vs0, vs0, 3
; CHECK-BE-LABEL: test16elt_signed:
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: lxv v7, 16(r4)
-; CHECK-BE-NEXT: xvcvsxdsp vs0, v7
; CHECK-BE-NEXT: lxv v6, 0(r4)
+; CHECK-BE-NEXT: lxv v1, 48(r4)
+; CHECK-BE-NEXT: lxv v0, 32(r4)
+; CHECK-BE-NEXT: xvcvsxdsp vs0, v7
+; CHECK-BE-NEXT: lxv v5, 80(r4)
+; CHECK-BE-NEXT: lxv v4, 64(r4)
+; CHECK-BE-NEXT: lxv v3, 112(r4)
+; CHECK-BE-NEXT: lxv v2, 96(r4)
; CHECK-BE-NEXT: xxsldwi v7, vs0, vs0, 3
; CHECK-BE-NEXT: xvcvsxdsp vs0, v6
-; CHECK-BE-NEXT: lxv v1, 48(r4)
; CHECK-BE-NEXT: xxsldwi v6, vs0, vs0, 3
; CHECK-BE-NEXT: xvcvsxdsp vs0, v1
-; CHECK-BE-NEXT: lxv v0, 32(r4)
; CHECK-BE-NEXT: vpkudum v1, v6, v7
+; CHECK-BE-NEXT: stxv v1, 0(r3)
; CHECK-BE-NEXT: xxsldwi v6, vs0, vs0, 3
; CHECK-BE-NEXT: xvcvsxdsp vs0, v0
-; CHECK-BE-NEXT: lxv v5, 80(r4)
-; CHECK-BE-NEXT: stxv v1, 0(r3)
; CHECK-BE-NEXT: xxsldwi v0, vs0, vs0, 3
; CHECK-BE-NEXT: xvcvsxdsp vs0, v5
-; CHECK-BE-NEXT: lxv v4, 64(r4)
; CHECK-BE-NEXT: vpkudum v0, v0, v6
; CHECK-BE-NEXT: stxv v0, 16(r3)
; CHECK-BE-NEXT: xxsldwi v5, vs0, vs0, 3
; CHECK-BE-NEXT: xvcvsxdsp vs0, v4
-; CHECK-BE-NEXT: lxv v3, 112(r4)
; CHECK-BE-NEXT: xxsldwi v4, vs0, vs0, 3
; CHECK-BE-NEXT: xvcvsxdsp vs0, v3
-; CHECK-BE-NEXT: lxv v2, 96(r4)
; CHECK-BE-NEXT: vpkudum v4, v4, v5
; CHECK-BE-NEXT: stxv v4, 32(r3)
; CHECK-BE-NEXT: xxsldwi v3, vs0, vs0, 3
; CHECK-P9-NEXT: mtfprwz f0, r3
; CHECK-P9-NEXT: li r3, 1
; CHECK-P9-NEXT: xscvuxdsp f0, f0
-; CHECK-P9-NEXT: xscvdpspn vs0, f0
; CHECK-P9-NEXT: vextubrx r3, r3, v2
; CHECK-P9-NEXT: clrlwi r3, r3, 24
+; CHECK-P9-NEXT: xscvdpspn vs0, f0
; CHECK-P9-NEXT: xxsldwi v3, vs0, vs0, 3
; CHECK-P9-NEXT: mtfprwz f0, r3
; CHECK-P9-NEXT: xscvuxdsp f0, f0
; CHECK-P9: # %bb.0: # %entry
; CHECK-P9-NEXT: mtvsrws v2, r3
; CHECK-P9-NEXT: addis r3, r2, .LCPI1_0@toc@ha
+; CHECK-P9-NEXT: xxlxor v4, v4, v4
; CHECK-P9-NEXT: addi r3, r3, .LCPI1_0@toc@l
; CHECK-P9-NEXT: lxvx v3, 0, r3
-; CHECK-P9-NEXT: xxlxor v4, v4, v4
; CHECK-P9-NEXT: vperm v2, v4, v2, v3
; CHECK-P9-NEXT: xvcvuxwsp v2, v2
; CHECK-P9-NEXT: blr
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: mtvsrws v2, r3
; CHECK-BE-NEXT: addis r3, r2, .LCPI1_0@toc@ha
+; CHECK-BE-NEXT: xxlxor v4, v4, v4
; CHECK-BE-NEXT: addi r3, r3, .LCPI1_0@toc@l
; CHECK-BE-NEXT: lxvx v3, 0, r3
-; CHECK-BE-NEXT: xxlxor v4, v4, v4
; CHECK-BE-NEXT: vperm v2, v2, v4, v3
; CHECK-BE-NEXT: xvcvuxwsp v2, v2
; CHECK-BE-NEXT: blr
; CHECK-P9: # %bb.0: # %entry
; CHECK-P9-NEXT: mtvsrd v2, r4
; CHECK-P9-NEXT: addis r4, r2, .LCPI2_0@toc@ha
+; CHECK-P9-NEXT: xxlxor v4, v4, v4
; CHECK-P9-NEXT: addi r4, r4, .LCPI2_0@toc@l
; CHECK-P9-NEXT: lxvx v3, 0, r4
-; CHECK-P9-NEXT: xxlxor v4, v4, v4
; CHECK-P9-NEXT: addis r4, r2, .LCPI2_1@toc@ha
; CHECK-P9-NEXT: addi r4, r4, .LCPI2_1@toc@l
; CHECK-P9-NEXT: vperm v3, v4, v2, v3
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: mtvsrd v2, r4
; CHECK-BE-NEXT: addis r4, r2, .LCPI2_0@toc@ha
+; CHECK-BE-NEXT: xxlxor v4, v4, v4
; CHECK-BE-NEXT: addi r4, r4, .LCPI2_0@toc@l
; CHECK-BE-NEXT: lxvx v3, 0, r4
-; CHECK-BE-NEXT: xxlxor v4, v4, v4
; CHECK-BE-NEXT: addis r4, r2, .LCPI2_1@toc@ha
; CHECK-BE-NEXT: addi r4, r4, .LCPI2_1@toc@l
; CHECK-BE-NEXT: vperm v3, v2, v4, v3
; CHECK-P9-LABEL: test16elt:
; CHECK-P9: # %bb.0: # %entry
; CHECK-P9-NEXT: addis r4, r2, .LCPI3_0@toc@ha
+; CHECK-P9-NEXT: xxlxor v4, v4, v4
; CHECK-P9-NEXT: addi r4, r4, .LCPI3_0@toc@l
; CHECK-P9-NEXT: lxvx v3, 0, r4
-; CHECK-P9-NEXT: xxlxor v4, v4, v4
; CHECK-P9-NEXT: addis r4, r2, .LCPI3_1@toc@ha
; CHECK-P9-NEXT: addi r4, r4, .LCPI3_1@toc@l
; CHECK-P9-NEXT: vperm v3, v4, v2, v3
; CHECK-BE-LABEL: test16elt:
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: addis r4, r2, .LCPI3_0@toc@ha
+; CHECK-BE-NEXT: xxlxor v4, v4, v4
; CHECK-BE-NEXT: addi r4, r4, .LCPI3_0@toc@l
; CHECK-BE-NEXT: lxvx v3, 0, r4
-; CHECK-BE-NEXT: xxlxor v4, v4, v4
; CHECK-BE-NEXT: addis r4, r2, .LCPI3_1@toc@ha
; CHECK-BE-NEXT: addi r4, r4, .LCPI3_1@toc@l
; CHECK-BE-NEXT: vperm v3, v2, v4, v3
; CHECK-P9-NEXT: mtfprwa f0, r3
; CHECK-P9-NEXT: li r3, 1
; CHECK-P9-NEXT: xscvsxdsp f0, f0
-; CHECK-P9-NEXT: xscvdpspn vs0, f0
; CHECK-P9-NEXT: vextubrx r3, r3, v2
; CHECK-P9-NEXT: extsb r3, r3
+; CHECK-P9-NEXT: xscvdpspn vs0, f0
; CHECK-P9-NEXT: xxsldwi v3, vs0, vs0, 3
; CHECK-P9-NEXT: mtfprwa f0, r3
; CHECK-P9-NEXT: xscvsxdsp f0, f0
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: mtvsrd v2, r4
; CHECK-BE-NEXT: addis r4, r2, .LCPI6_0@toc@ha
+; CHECK-BE-NEXT: xxlxor v3, v3, v3
; CHECK-BE-NEXT: addi r4, r4, .LCPI6_0@toc@l
; CHECK-BE-NEXT: lxvx v4, 0, r4
-; CHECK-BE-NEXT: xxlxor v3, v3, v3
-; CHECK-BE-NEXT: vperm v3, v3, v2, v4
; CHECK-BE-NEXT: addis r4, r2, .LCPI6_1@toc@ha
-; CHECK-BE-NEXT: vextsb2w v3, v3
; CHECK-BE-NEXT: addi r4, r4, .LCPI6_1@toc@l
+; CHECK-BE-NEXT: vperm v3, v3, v2, v4
+; CHECK-BE-NEXT: vextsb2w v3, v3
; CHECK-BE-NEXT: xvcvsxwsp vs0, v3
; CHECK-BE-NEXT: lxvx v3, 0, r4
; CHECK-BE-NEXT: vperm v2, v2, v2, v3
; CHECK-BE-LABEL: test16elt_signed:
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: addis r4, r2, .LCPI7_0@toc@ha
+; CHECK-BE-NEXT: xxlxor v4, v4, v4
; CHECK-BE-NEXT: addi r4, r4, .LCPI7_0@toc@l
; CHECK-BE-NEXT: lxvx v3, 0, r4
-; CHECK-BE-NEXT: xxlxor v4, v4, v4
-; CHECK-BE-NEXT: vperm v3, v4, v2, v3
; CHECK-BE-NEXT: addis r4, r2, .LCPI7_1@toc@ha
; CHECK-BE-NEXT: addi r4, r4, .LCPI7_1@toc@l
+; CHECK-BE-NEXT: vperm v3, v4, v2, v3
; CHECK-BE-NEXT: vextsb2w v3, v3
; CHECK-BE-NEXT: xvcvsxwsp vs0, v3
; CHECK-BE-NEXT: lxvx v3, 0, r4
; CHECK-P9: # %bb.0: # %entry
; CHECK-P9-NEXT: mtvsrws v2, r3
; CHECK-P9-NEXT: addis r3, r2, .LCPI0_0@toc@ha
+; CHECK-P9-NEXT: xxlxor v4, v4, v4
; CHECK-P9-NEXT: addi r3, r3, .LCPI0_0@toc@l
; CHECK-P9-NEXT: lxvx v3, 0, r3
-; CHECK-P9-NEXT: xxlxor v4, v4, v4
; CHECK-P9-NEXT: vperm v2, v4, v2, v3
; CHECK-P9-NEXT: xvcvuxddp v2, v2
; CHECK-P9-NEXT: blr
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: mtvsrws v2, r3
; CHECK-BE-NEXT: addis r3, r2, .LCPI0_0@toc@ha
+; CHECK-BE-NEXT: xxlxor v4, v4, v4
; CHECK-BE-NEXT: addi r3, r3, .LCPI0_0@toc@l
; CHECK-BE-NEXT: lxvx v3, 0, r3
-; CHECK-BE-NEXT: xxlxor v4, v4, v4
; CHECK-BE-NEXT: vperm v2, v2, v4, v3
; CHECK-BE-NEXT: xvcvuxddp v2, v2
; CHECK-BE-NEXT: blr
; CHECK-P9: # %bb.0: # %entry
; CHECK-P9-NEXT: mtvsrws v2, r4
; CHECK-P9-NEXT: addis r4, r2, .LCPI1_0@toc@ha
+; CHECK-P9-NEXT: xxlxor v4, v4, v4
; CHECK-P9-NEXT: addi r4, r4, .LCPI1_0@toc@l
; CHECK-P9-NEXT: lxvx v3, 0, r4
-; CHECK-P9-NEXT: xxlxor v4, v4, v4
; CHECK-P9-NEXT: addis r4, r2, .LCPI1_1@toc@ha
; CHECK-P9-NEXT: addi r4, r4, .LCPI1_1@toc@l
; CHECK-P9-NEXT: vperm v3, v4, v2, v3
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: mtvsrws v2, r4
; CHECK-BE-NEXT: addis r4, r2, .LCPI1_0@toc@ha
+; CHECK-BE-NEXT: xxlxor v4, v4, v4
; CHECK-BE-NEXT: addi r4, r4, .LCPI1_0@toc@l
; CHECK-BE-NEXT: lxvx v3, 0, r4
-; CHECK-BE-NEXT: xxlxor v4, v4, v4
; CHECK-BE-NEXT: addis r4, r2, .LCPI1_1@toc@ha
; CHECK-BE-NEXT: addi r4, r4, .LCPI1_1@toc@l
; CHECK-BE-NEXT: vperm v3, v2, v4, v3
; CHECK-P9: # %bb.0: # %entry
; CHECK-P9-NEXT: mtvsrd v2, r4
; CHECK-P9-NEXT: addis r4, r2, .LCPI2_0@toc@ha
+; CHECK-P9-NEXT: xxlxor v4, v4, v4
; CHECK-P9-NEXT: addi r4, r4, .LCPI2_0@toc@l
; CHECK-P9-NEXT: lxvx v3, 0, r4
-; CHECK-P9-NEXT: xxlxor v4, v4, v4
; CHECK-P9-NEXT: addis r4, r2, .LCPI2_1@toc@ha
; CHECK-P9-NEXT: addi r4, r4, .LCPI2_1@toc@l
; CHECK-P9-NEXT: vperm v3, v4, v2, v3
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: mtvsrd v2, r4
; CHECK-BE-NEXT: addis r4, r2, .LCPI2_0@toc@ha
+; CHECK-BE-NEXT: xxlxor v4, v4, v4
; CHECK-BE-NEXT: addi r4, r4, .LCPI2_0@toc@l
; CHECK-BE-NEXT: lxvx v3, 0, r4
-; CHECK-BE-NEXT: xxlxor v4, v4, v4
; CHECK-BE-NEXT: addis r4, r2, .LCPI2_1@toc@ha
; CHECK-BE-NEXT: addi r4, r4, .LCPI2_1@toc@l
; CHECK-BE-NEXT: vperm v3, v2, v4, v3
; CHECK-P9-LABEL: test16elt:
; CHECK-P9: # %bb.0: # %entry
; CHECK-P9-NEXT: addis r4, r2, .LCPI3_0@toc@ha
+; CHECK-P9-NEXT: xxlxor v4, v4, v4
; CHECK-P9-NEXT: addi r4, r4, .LCPI3_0@toc@l
; CHECK-P9-NEXT: lxvx v3, 0, r4
-; CHECK-P9-NEXT: xxlxor v4, v4, v4
; CHECK-P9-NEXT: addis r4, r2, .LCPI3_1@toc@ha
; CHECK-P9-NEXT: addi r4, r4, .LCPI3_1@toc@l
; CHECK-P9-NEXT: vperm v3, v4, v2, v3
; CHECK-BE-LABEL: test16elt:
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: addis r4, r2, .LCPI3_0@toc@ha
+; CHECK-BE-NEXT: xxlxor v4, v4, v4
; CHECK-BE-NEXT: addi r4, r4, .LCPI3_0@toc@l
; CHECK-BE-NEXT: lxvx v3, 0, r4
-; CHECK-BE-NEXT: xxlxor v4, v4, v4
; CHECK-BE-NEXT: addis r4, r2, .LCPI3_1@toc@ha
; CHECK-BE-NEXT: addi r4, r4, .LCPI3_1@toc@l
; CHECK-BE-NEXT: vperm v3, v2, v4, v3
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: mtvsrws v2, r4
; CHECK-BE-NEXT: addis r4, r2, .LCPI5_0@toc@ha
+; CHECK-BE-NEXT: xxlxor v3, v3, v3
; CHECK-BE-NEXT: addi r4, r4, .LCPI5_0@toc@l
; CHECK-BE-NEXT: lxvx v4, 0, r4
-; CHECK-BE-NEXT: xxlxor v3, v3, v3
-; CHECK-BE-NEXT: vperm v3, v3, v2, v4
; CHECK-BE-NEXT: addis r4, r2, .LCPI5_1@toc@ha
-; CHECK-BE-NEXT: vextsb2d v3, v3
; CHECK-BE-NEXT: addi r4, r4, .LCPI5_1@toc@l
+; CHECK-BE-NEXT: vperm v3, v3, v2, v4
+; CHECK-BE-NEXT: vextsb2d v3, v3
; CHECK-BE-NEXT: xvcvsxddp vs0, v3
; CHECK-BE-NEXT: lxvx v3, 0, r4
; CHECK-BE-NEXT: vperm v2, v2, v2, v3
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: mtvsrd v2, r4
; CHECK-BE-NEXT: addis r4, r2, .LCPI6_0@toc@ha
+; CHECK-BE-NEXT: xxlxor v4, v4, v4
; CHECK-BE-NEXT: addi r4, r4, .LCPI6_0@toc@l
; CHECK-BE-NEXT: lxvx v3, 0, r4
-; CHECK-BE-NEXT: xxlxor v4, v4, v4
-; CHECK-BE-NEXT: vperm v3, v4, v2, v3
; CHECK-BE-NEXT: addis r4, r2, .LCPI6_1@toc@ha
-; CHECK-BE-NEXT: vextsb2d v3, v3
; CHECK-BE-NEXT: addi r4, r4, .LCPI6_1@toc@l
+; CHECK-BE-NEXT: vperm v3, v4, v2, v3
+; CHECK-BE-NEXT: vextsb2d v3, v3
; CHECK-BE-NEXT: xvcvsxddp vs0, v3
; CHECK-BE-NEXT: lxvx v3, 0, r4
; CHECK-BE-NEXT: addis r4, r2, .LCPI6_2@toc@ha
; CHECK-BE-LABEL: test16elt_signed:
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: addis r4, r2, .LCPI7_0@toc@ha
+; CHECK-BE-NEXT: xxlxor v3, v3, v3
; CHECK-BE-NEXT: addi r4, r4, .LCPI7_0@toc@l
; CHECK-BE-NEXT: lxvx v4, 0, r4
-; CHECK-BE-NEXT: xxlxor v3, v3, v3
-; CHECK-BE-NEXT: vperm v4, v3, v2, v4
; CHECK-BE-NEXT: addis r4, r2, .LCPI7_1@toc@ha
; CHECK-BE-NEXT: addi r4, r4, .LCPI7_1@toc@l
+; CHECK-BE-NEXT: vperm v4, v3, v2, v4
; CHECK-BE-NEXT: vextsb2d v4, v4
; CHECK-BE-NEXT: xvcvsxddp vs0, v4
; CHECK-BE-NEXT: lxvx v4, 0, r4
; PC64LE9-NEXT: addis 3, 2, .LCPI6_2@toc@ha
; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE9-NEXT: stxv 1, 32(1) # 16-byte Folded Spill
-; PC64LE9-NEXT: lfs 1, .LCPI6_2@toc@l(3)
; PC64LE9-NEXT: fmr 2, 31
+; PC64LE9-NEXT: lfs 1, .LCPI6_2@toc@l(3)
; PC64LE9-NEXT: bl fmod
; PC64LE9-NEXT: nop
; PC64LE9-NEXT: lxv 0, 32(1) # 16-byte Folded Reload
; PC64LE9-NEXT: nop
; PC64LE9-NEXT: addis 3, 2, .LCPI7_2@toc@ha
; PC64LE9-NEXT: fmr 30, 1
-; PC64LE9-NEXT: lfs 1, .LCPI7_2@toc@l(3)
; PC64LE9-NEXT: fmr 2, 31
+; PC64LE9-NEXT: lfs 1, .LCPI7_2@toc@l(3)
; PC64LE9-NEXT: bl fmodf
; PC64LE9-NEXT: nop
; PC64LE9-NEXT: addis 3, 2, .LCPI7_3@toc@ha
; PC64LE9-NEXT: fmr 29, 1
-; PC64LE9-NEXT: lfs 1, .LCPI7_3@toc@l(3)
; PC64LE9-NEXT: fmr 2, 31
+; PC64LE9-NEXT: lfs 1, .LCPI7_3@toc@l(3)
; PC64LE9-NEXT: bl fmodf
; PC64LE9-NEXT: nop
; PC64LE9-NEXT: xscvdpspn 0, 1
+; PC64LE9-NEXT: addis 3, 2, .LCPI7_4@toc@ha
+; PC64LE9-NEXT: addi 3, 3, .LCPI7_4@toc@l
+; PC64LE9-NEXT: lxvx 36, 0, 3
; PC64LE9-NEXT: xxsldwi 34, 0, 0, 3
; PC64LE9-NEXT: xscvdpspn 0, 29
; PC64LE9-NEXT: xxsldwi 35, 0, 0, 3
; PC64LE9-NEXT: xscvdpspn 0, 30
-; PC64LE9-NEXT: addis 3, 2, .LCPI7_4@toc@ha
-; PC64LE9-NEXT: addi 3, 3, .LCPI7_4@toc@l
-; PC64LE9-NEXT: lxvx 36, 0, 3
; PC64LE9-NEXT: vmrghw 2, 3, 2
; PC64LE9-NEXT: xxsldwi 35, 0, 0, 3
; PC64LE9-NEXT: vperm 2, 3, 2, 4
; PC64LE9-NEXT: std 0, 16(1)
; PC64LE9-NEXT: stdu 1, -80(1)
; PC64LE9-NEXT: addis 3, 2, .LCPI8_0@toc@ha
+; PC64LE9-NEXT: stfd 31, 72(1) # 8-byte Folded Spill
+; PC64LE9-NEXT: stxv 63, 48(1) # 16-byte Folded Spill
; PC64LE9-NEXT: lfs 1, .LCPI8_0@toc@l(3)
; PC64LE9-NEXT: addis 3, 2, .LCPI8_1@toc@ha
-; PC64LE9-NEXT: stfd 31, 72(1) # 8-byte Folded Spill
; PC64LE9-NEXT: lfs 31, .LCPI8_1@toc@l(3)
; PC64LE9-NEXT: fmr 2, 31
-; PC64LE9-NEXT: stxv 63, 48(1) # 16-byte Folded Spill
; PC64LE9-NEXT: bl fmod
; PC64LE9-NEXT: nop
; PC64LE9-NEXT: addis 3, 2, .LCPI8_2@toc@ha
; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE9-NEXT: stxv 1, 32(1) # 16-byte Folded Spill
-; PC64LE9-NEXT: lfs 1, .LCPI8_2@toc@l(3)
; PC64LE9-NEXT: fmr 2, 31
+; PC64LE9-NEXT: lfs 1, .LCPI8_2@toc@l(3)
; PC64LE9-NEXT: bl fmod
; PC64LE9-NEXT: nop
; PC64LE9-NEXT: lxv 0, 32(1) # 16-byte Folded Reload
; PC64LE9-NEXT: addis 3, 2, .LCPI8_3@toc@ha
; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1
+; PC64LE9-NEXT: fmr 2, 31
; PC64LE9-NEXT: xxmrghd 63, 1, 0
; PC64LE9-NEXT: lfs 1, .LCPI8_3@toc@l(3)
-; PC64LE9-NEXT: fmr 2, 31
; PC64LE9-NEXT: bl fmod
; PC64LE9-NEXT: nop
; PC64LE9-NEXT: fmr 3, 1
; PC64LE9-NEXT: std 0, 16(1)
; PC64LE9-NEXT: stdu 1, -80(1)
; PC64LE9-NEXT: addis 3, 2, .LCPI9_0@toc@ha
+; PC64LE9-NEXT: stfd 31, 72(1) # 8-byte Folded Spill
+; PC64LE9-NEXT: stxv 63, 48(1) # 16-byte Folded Spill
; PC64LE9-NEXT: lfs 1, .LCPI9_0@toc@l(3)
; PC64LE9-NEXT: addis 3, 2, .LCPI9_1@toc@ha
-; PC64LE9-NEXT: stfd 31, 72(1) # 8-byte Folded Spill
; PC64LE9-NEXT: lfs 31, .LCPI9_1@toc@l(3)
; PC64LE9-NEXT: fmr 2, 31
-; PC64LE9-NEXT: stxv 63, 48(1) # 16-byte Folded Spill
; PC64LE9-NEXT: bl fmod
; PC64LE9-NEXT: nop
; PC64LE9-NEXT: addis 3, 2, .LCPI9_2@toc@ha
; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE9-NEXT: stxv 1, 32(1) # 16-byte Folded Spill
-; PC64LE9-NEXT: lfs 1, .LCPI9_2@toc@l(3)
; PC64LE9-NEXT: fmr 2, 31
+; PC64LE9-NEXT: lfs 1, .LCPI9_2@toc@l(3)
; PC64LE9-NEXT: bl fmod
; PC64LE9-NEXT: nop
; PC64LE9-NEXT: lxv 0, 32(1) # 16-byte Folded Reload
; PC64LE9-NEXT: addis 3, 2, .LCPI9_3@toc@ha
; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1
+; PC64LE9-NEXT: fmr 2, 31
; PC64LE9-NEXT: xxmrghd 63, 1, 0
; PC64LE9-NEXT: lfs 1, .LCPI9_3@toc@l(3)
-; PC64LE9-NEXT: fmr 2, 31
; PC64LE9-NEXT: bl fmod
; PC64LE9-NEXT: nop
; PC64LE9-NEXT: addis 3, 2, .LCPI9_4@toc@ha
; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE9-NEXT: stxv 1, 32(1) # 16-byte Folded Spill
-; PC64LE9-NEXT: lfs 1, .LCPI9_4@toc@l(3)
; PC64LE9-NEXT: fmr 2, 31
+; PC64LE9-NEXT: lfs 1, .LCPI9_4@toc@l(3)
; PC64LE9-NEXT: bl fmod
; PC64LE9-NEXT: nop
; PC64LE9-NEXT: lxv 0, 32(1) # 16-byte Folded Reload
; PC64LE9-LABEL: constrained_vector_fadd_v3f32:
; PC64LE9: # %bb.0: # %entry
; PC64LE9-NEXT: addis 3, 2, .LCPI17_0@toc@ha
+; PC64LE9-NEXT: xxlxor 1, 1, 1
; PC64LE9-NEXT: lfs 0, .LCPI17_0@toc@l(3)
; PC64LE9-NEXT: addis 3, 2, .LCPI17_1@toc@ha
; PC64LE9-NEXT: lfs 2, .LCPI17_1@toc@l(3)
; PC64LE9-NEXT: addis 3, 2, .LCPI17_2@toc@ha
-; PC64LE9-NEXT: xsaddsp 2, 0, 2
; PC64LE9-NEXT: lfs 3, .LCPI17_2@toc@l(3)
-; PC64LE9-NEXT: xxlxor 1, 1, 1
+; PC64LE9-NEXT: addis 3, 2, .LCPI17_3@toc@ha
+; PC64LE9-NEXT: addi 3, 3, .LCPI17_3@toc@l
; PC64LE9-NEXT: xsaddsp 1, 0, 1
+; PC64LE9-NEXT: lxvx 36, 0, 3
+; PC64LE9-NEXT: xsaddsp 2, 0, 2
; PC64LE9-NEXT: xsaddsp 0, 0, 3
; PC64LE9-NEXT: xscvdpspn 0, 0
; PC64LE9-NEXT: xxsldwi 34, 0, 0, 3
; PC64LE9-NEXT: xscvdpspn 0, 2
; PC64LE9-NEXT: xxsldwi 35, 0, 0, 3
; PC64LE9-NEXT: xscvdpspn 0, 1
-; PC64LE9-NEXT: addis 3, 2, .LCPI17_3@toc@ha
-; PC64LE9-NEXT: addi 3, 3, .LCPI17_3@toc@l
-; PC64LE9-NEXT: lxvx 36, 0, 3
; PC64LE9-NEXT: vmrghw 2, 3, 2
; PC64LE9-NEXT: xxsldwi 35, 0, 0, 3
; PC64LE9-NEXT: vperm 2, 3, 2, 4
; PC64LE9-LABEL: constrained_vector_fadd_v3f64:
; PC64LE9: # %bb.0: # %entry
; PC64LE9-NEXT: addis 3, 2, .LCPI18_0@toc@ha
+; PC64LE9-NEXT: xxlxor 1, 1, 1
; PC64LE9-NEXT: lfd 0, .LCPI18_0@toc@l(3)
; PC64LE9-NEXT: addis 3, 2, .LCPI18_1@toc@ha
-; PC64LE9-NEXT: xxlxor 1, 1, 1
; PC64LE9-NEXT: addi 3, 3, .LCPI18_1@toc@l
; PC64LE9-NEXT: xsadddp 3, 0, 1
; PC64LE9-NEXT: lxvx 0, 0, 3
; PC64LE9-LABEL: constrained_vector_fsub_v3f32:
; PC64LE9: # %bb.0: # %entry
; PC64LE9-NEXT: addis 3, 2, .LCPI22_0@toc@ha
+; PC64LE9-NEXT: xxlxor 1, 1, 1
; PC64LE9-NEXT: lfs 0, .LCPI22_0@toc@l(3)
; PC64LE9-NEXT: addis 3, 2, .LCPI22_1@toc@ha
; PC64LE9-NEXT: lfs 2, .LCPI22_1@toc@l(3)
; PC64LE9-NEXT: addis 3, 2, .LCPI22_2@toc@ha
-; PC64LE9-NEXT: xssubsp 2, 0, 2
; PC64LE9-NEXT: lfs 3, .LCPI22_2@toc@l(3)
-; PC64LE9-NEXT: xxlxor 1, 1, 1
+; PC64LE9-NEXT: addis 3, 2, .LCPI22_3@toc@ha
+; PC64LE9-NEXT: addi 3, 3, .LCPI22_3@toc@l
; PC64LE9-NEXT: xssubsp 1, 0, 1
+; PC64LE9-NEXT: lxvx 36, 0, 3
+; PC64LE9-NEXT: xssubsp 2, 0, 2
; PC64LE9-NEXT: xssubsp 0, 0, 3
; PC64LE9-NEXT: xscvdpspn 0, 0
; PC64LE9-NEXT: xxsldwi 34, 0, 0, 3
; PC64LE9-NEXT: xscvdpspn 0, 2
; PC64LE9-NEXT: xxsldwi 35, 0, 0, 3
; PC64LE9-NEXT: xscvdpspn 0, 1
-; PC64LE9-NEXT: addis 3, 2, .LCPI22_3@toc@ha
-; PC64LE9-NEXT: addi 3, 3, .LCPI22_3@toc@l
-; PC64LE9-NEXT: lxvx 36, 0, 3
; PC64LE9-NEXT: vmrghw 2, 3, 2
; PC64LE9-NEXT: xxsldwi 35, 0, 0, 3
; PC64LE9-NEXT: vperm 2, 3, 2, 4
; PC64LE9-LABEL: constrained_vector_fsub_v3f64:
; PC64LE9: # %bb.0: # %entry
; PC64LE9-NEXT: addis 3, 2, .LCPI23_0@toc@ha
+; PC64LE9-NEXT: xxlxor 1, 1, 1
; PC64LE9-NEXT: lfd 0, .LCPI23_0@toc@l(3)
; PC64LE9-NEXT: addis 3, 2, .LCPI23_1@toc@ha
-; PC64LE9-NEXT: xxlxor 1, 1, 1
; PC64LE9-NEXT: addi 3, 3, .LCPI23_1@toc@l
; PC64LE9-NEXT: xssubdp 3, 0, 1
; PC64LE9-NEXT: lxvx 0, 0, 3
; PC64LE9-NEXT: addis 3, 2, .LCPI31_2@toc@ha
; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE9-NEXT: stxv 1, 32(1) # 16-byte Folded Spill
-; PC64LE9-NEXT: lfd 1, .LCPI31_2@toc@l(3)
; PC64LE9-NEXT: fmr 2, 31
+; PC64LE9-NEXT: lfd 1, .LCPI31_2@toc@l(3)
; PC64LE9-NEXT: bl pow
; PC64LE9-NEXT: nop
; PC64LE9-NEXT: lxv 0, 32(1) # 16-byte Folded Reload
; PC64LE9-NEXT: nop
; PC64LE9-NEXT: addis 3, 2, .LCPI32_2@toc@ha
; PC64LE9-NEXT: fmr 30, 1
-; PC64LE9-NEXT: lfs 1, .LCPI32_2@toc@l(3)
; PC64LE9-NEXT: fmr 2, 31
+; PC64LE9-NEXT: lfs 1, .LCPI32_2@toc@l(3)
; PC64LE9-NEXT: bl powf
; PC64LE9-NEXT: nop
; PC64LE9-NEXT: addis 3, 2, .LCPI32_3@toc@ha
; PC64LE9-NEXT: fmr 29, 1
-; PC64LE9-NEXT: lfs 1, .LCPI32_3@toc@l(3)
; PC64LE9-NEXT: fmr 2, 31
+; PC64LE9-NEXT: lfs 1, .LCPI32_3@toc@l(3)
; PC64LE9-NEXT: bl powf
; PC64LE9-NEXT: nop
; PC64LE9-NEXT: xscvdpspn 0, 1
+; PC64LE9-NEXT: addis 3, 2, .LCPI32_4@toc@ha
+; PC64LE9-NEXT: addi 3, 3, .LCPI32_4@toc@l
+; PC64LE9-NEXT: lxvx 36, 0, 3
; PC64LE9-NEXT: xxsldwi 34, 0, 0, 3
; PC64LE9-NEXT: xscvdpspn 0, 29
; PC64LE9-NEXT: xxsldwi 35, 0, 0, 3
; PC64LE9-NEXT: xscvdpspn 0, 30
-; PC64LE9-NEXT: addis 3, 2, .LCPI32_4@toc@ha
-; PC64LE9-NEXT: addi 3, 3, .LCPI32_4@toc@l
-; PC64LE9-NEXT: lxvx 36, 0, 3
; PC64LE9-NEXT: vmrghw 2, 3, 2
; PC64LE9-NEXT: xxsldwi 35, 0, 0, 3
; PC64LE9-NEXT: vperm 2, 3, 2, 4
; PC64LE9-NEXT: std 0, 16(1)
; PC64LE9-NEXT: stdu 1, -80(1)
; PC64LE9-NEXT: addis 3, 2, .LCPI33_0@toc@ha
+; PC64LE9-NEXT: stfd 31, 72(1) # 8-byte Folded Spill
+; PC64LE9-NEXT: stxv 63, 48(1) # 16-byte Folded Spill
; PC64LE9-NEXT: lfs 1, .LCPI33_0@toc@l(3)
; PC64LE9-NEXT: addis 3, 2, .LCPI33_1@toc@ha
-; PC64LE9-NEXT: stfd 31, 72(1) # 8-byte Folded Spill
; PC64LE9-NEXT: lfs 31, .LCPI33_1@toc@l(3)
; PC64LE9-NEXT: fmr 2, 31
-; PC64LE9-NEXT: stxv 63, 48(1) # 16-byte Folded Spill
; PC64LE9-NEXT: bl pow
; PC64LE9-NEXT: nop
; PC64LE9-NEXT: addis 3, 2, .LCPI33_2@toc@ha
; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE9-NEXT: stxv 1, 32(1) # 16-byte Folded Spill
-; PC64LE9-NEXT: lfd 1, .LCPI33_2@toc@l(3)
; PC64LE9-NEXT: fmr 2, 31
+; PC64LE9-NEXT: lfd 1, .LCPI33_2@toc@l(3)
; PC64LE9-NEXT: bl pow
; PC64LE9-NEXT: nop
; PC64LE9-NEXT: lxv 0, 32(1) # 16-byte Folded Reload
; PC64LE9-NEXT: addis 3, 2, .LCPI33_3@toc@ha
; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1
+; PC64LE9-NEXT: fmr 2, 31
; PC64LE9-NEXT: xxmrghd 63, 1, 0
; PC64LE9-NEXT: lfd 1, .LCPI33_3@toc@l(3)
-; PC64LE9-NEXT: fmr 2, 31
; PC64LE9-NEXT: bl pow
; PC64LE9-NEXT: nop
; PC64LE9-NEXT: fmr 3, 1
; PC64LE9-NEXT: std 0, 16(1)
; PC64LE9-NEXT: stdu 1, -80(1)
; PC64LE9-NEXT: addis 3, 2, .LCPI34_0@toc@ha
+; PC64LE9-NEXT: stfd 31, 72(1) # 8-byte Folded Spill
+; PC64LE9-NEXT: stxv 63, 48(1) # 16-byte Folded Spill
; PC64LE9-NEXT: lfd 1, .LCPI34_0@toc@l(3)
; PC64LE9-NEXT: addis 3, 2, .LCPI34_1@toc@ha
-; PC64LE9-NEXT: stfd 31, 72(1) # 8-byte Folded Spill
; PC64LE9-NEXT: lfs 31, .LCPI34_1@toc@l(3)
; PC64LE9-NEXT: fmr 2, 31
-; PC64LE9-NEXT: stxv 63, 48(1) # 16-byte Folded Spill
; PC64LE9-NEXT: bl pow
; PC64LE9-NEXT: nop
; PC64LE9-NEXT: addis 3, 2, .LCPI34_2@toc@ha
; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE9-NEXT: stxv 1, 32(1) # 16-byte Folded Spill
-; PC64LE9-NEXT: lfd 1, .LCPI34_2@toc@l(3)
; PC64LE9-NEXT: fmr 2, 31
+; PC64LE9-NEXT: lfd 1, .LCPI34_2@toc@l(3)
; PC64LE9-NEXT: bl pow
; PC64LE9-NEXT: nop
; PC64LE9-NEXT: lxv 0, 32(1) # 16-byte Folded Reload
; PC64LE9-NEXT: addis 3, 2, .LCPI34_3@toc@ha
; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1
+; PC64LE9-NEXT: fmr 2, 31
; PC64LE9-NEXT: xxmrghd 63, 1, 0
; PC64LE9-NEXT: lfd 1, .LCPI34_3@toc@l(3)
-; PC64LE9-NEXT: fmr 2, 31
; PC64LE9-NEXT: bl pow
; PC64LE9-NEXT: nop
; PC64LE9-NEXT: addis 3, 2, .LCPI34_4@toc@ha
; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE9-NEXT: stxv 1, 32(1) # 16-byte Folded Spill
-; PC64LE9-NEXT: lfd 1, .LCPI34_4@toc@l(3)
; PC64LE9-NEXT: fmr 2, 31
+; PC64LE9-NEXT: lfd 1, .LCPI34_4@toc@l(3)
; PC64LE9-NEXT: bl pow
; PC64LE9-NEXT: nop
; PC64LE9-NEXT: lxv 0, 32(1) # 16-byte Folded Reload
; PC64LE9-NEXT: std 0, 16(1)
; PC64LE9-NEXT: stdu 1, -32(1)
; PC64LE9-NEXT: addis 3, 2, .LCPI35_0@toc@ha
-; PC64LE9-NEXT: lfs 1, .LCPI35_0@toc@l(3)
; PC64LE9-NEXT: li 4, 3
+; PC64LE9-NEXT: lfs 1, .LCPI35_0@toc@l(3)
; PC64LE9-NEXT: bl __powisf2
; PC64LE9-NEXT: nop
; PC64LE9-NEXT: addi 1, 1, 32
; PC64LE9-NEXT: std 0, 16(1)
; PC64LE9-NEXT: stdu 1, -48(1)
; PC64LE9-NEXT: addis 3, 2, .LCPI36_0@toc@ha
-; PC64LE9-NEXT: lfd 1, .LCPI36_0@toc@l(3)
; PC64LE9-NEXT: li 4, 3
+; PC64LE9-NEXT: lfd 1, .LCPI36_0@toc@l(3)
; PC64LE9-NEXT: bl __powidf2
; PC64LE9-NEXT: nop
; PC64LE9-NEXT: addis 3, 2, .LCPI36_1@toc@ha
; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE9-NEXT: stxv 1, 32(1) # 16-byte Folded Spill
-; PC64LE9-NEXT: lfd 1, .LCPI36_1@toc@l(3)
; PC64LE9-NEXT: li 4, 3
+; PC64LE9-NEXT: lfd 1, .LCPI36_1@toc@l(3)
; PC64LE9-NEXT: bl __powidf2
; PC64LE9-NEXT: nop
; PC64LE9-NEXT: lxv 0, 32(1) # 16-byte Folded Reload
; PC64LE9-NEXT: std 0, 16(1)
; PC64LE9-NEXT: stdu 1, -48(1)
; PC64LE9-NEXT: addis 3, 2, .LCPI37_0@toc@ha
-; PC64LE9-NEXT: lfs 1, .LCPI37_0@toc@l(3)
; PC64LE9-NEXT: li 4, 3
+; PC64LE9-NEXT: lfs 1, .LCPI37_0@toc@l(3)
; PC64LE9-NEXT: bl __powisf2
; PC64LE9-NEXT: nop
; PC64LE9-NEXT: addis 3, 2, .LCPI37_1@toc@ha
; PC64LE9-NEXT: fmr 31, 1
-; PC64LE9-NEXT: lfs 1, .LCPI37_1@toc@l(3)
; PC64LE9-NEXT: li 4, 3
+; PC64LE9-NEXT: lfs 1, .LCPI37_1@toc@l(3)
; PC64LE9-NEXT: bl __powisf2
; PC64LE9-NEXT: nop
; PC64LE9-NEXT: addis 3, 2, .LCPI37_2@toc@ha
; PC64LE9-NEXT: fmr 30, 1
-; PC64LE9-NEXT: lfs 1, .LCPI37_2@toc@l(3)
; PC64LE9-NEXT: li 4, 3
+; PC64LE9-NEXT: lfs 1, .LCPI37_2@toc@l(3)
; PC64LE9-NEXT: bl __powisf2
; PC64LE9-NEXT: nop
; PC64LE9-NEXT: xscvdpspn 0, 1
+; PC64LE9-NEXT: addis 3, 2, .LCPI37_3@toc@ha
+; PC64LE9-NEXT: addi 3, 3, .LCPI37_3@toc@l
+; PC64LE9-NEXT: lxvx 36, 0, 3
; PC64LE9-NEXT: xxsldwi 34, 0, 0, 3
; PC64LE9-NEXT: xscvdpspn 0, 30
; PC64LE9-NEXT: xxsldwi 35, 0, 0, 3
; PC64LE9-NEXT: xscvdpspn 0, 31
-; PC64LE9-NEXT: addis 3, 2, .LCPI37_3@toc@ha
-; PC64LE9-NEXT: addi 3, 3, .LCPI37_3@toc@l
-; PC64LE9-NEXT: lxvx 36, 0, 3
; PC64LE9-NEXT: vmrghw 2, 3, 2
; PC64LE9-NEXT: xxsldwi 35, 0, 0, 3
; PC64LE9-NEXT: vperm 2, 3, 2, 4
; PC64LE9-NEXT: std 0, 16(1)
; PC64LE9-NEXT: stdu 1, -64(1)
; PC64LE9-NEXT: addis 3, 2, .LCPI38_0@toc@ha
-; PC64LE9-NEXT: lfd 1, .LCPI38_0@toc@l(3)
; PC64LE9-NEXT: li 4, 3
; PC64LE9-NEXT: stxv 63, 48(1) # 16-byte Folded Spill
+; PC64LE9-NEXT: lfd 1, .LCPI38_0@toc@l(3)
; PC64LE9-NEXT: bl __powidf2
; PC64LE9-NEXT: nop
; PC64LE9-NEXT: addis 3, 2, .LCPI38_1@toc@ha
; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE9-NEXT: stxv 1, 32(1) # 16-byte Folded Spill
-; PC64LE9-NEXT: lfs 1, .LCPI38_1@toc@l(3)
; PC64LE9-NEXT: li 4, 3
+; PC64LE9-NEXT: lfs 1, .LCPI38_1@toc@l(3)
; PC64LE9-NEXT: bl __powidf2
; PC64LE9-NEXT: nop
; PC64LE9-NEXT: lxv 0, 32(1) # 16-byte Folded Reload
; PC64LE9-NEXT: addis 3, 2, .LCPI38_2@toc@ha
; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1
+; PC64LE9-NEXT: li 4, 3
; PC64LE9-NEXT: xxmrghd 63, 0, 1
; PC64LE9-NEXT: lfd 1, .LCPI38_2@toc@l(3)
-; PC64LE9-NEXT: li 4, 3
; PC64LE9-NEXT: bl __powidf2
; PC64LE9-NEXT: nop
; PC64LE9-NEXT: fmr 3, 1
; PC64LE9-NEXT: std 0, 16(1)
; PC64LE9-NEXT: stdu 1, -64(1)
; PC64LE9-NEXT: addis 3, 2, .LCPI39_0@toc@ha
-; PC64LE9-NEXT: lfd 1, .LCPI39_0@toc@l(3)
; PC64LE9-NEXT: li 4, 3
; PC64LE9-NEXT: stxv 63, 48(1) # 16-byte Folded Spill
+; PC64LE9-NEXT: lfd 1, .LCPI39_0@toc@l(3)
; PC64LE9-NEXT: bl __powidf2
; PC64LE9-NEXT: nop
; PC64LE9-NEXT: addis 3, 2, .LCPI39_1@toc@ha
; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE9-NEXT: stxv 1, 32(1) # 16-byte Folded Spill
-; PC64LE9-NEXT: lfd 1, .LCPI39_1@toc@l(3)
; PC64LE9-NEXT: li 4, 3
+; PC64LE9-NEXT: lfd 1, .LCPI39_1@toc@l(3)
; PC64LE9-NEXT: bl __powidf2
; PC64LE9-NEXT: nop
; PC64LE9-NEXT: lxv 0, 32(1) # 16-byte Folded Reload
; PC64LE9-NEXT: addis 3, 2, .LCPI39_2@toc@ha
; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1
+; PC64LE9-NEXT: li 4, 3
; PC64LE9-NEXT: xxmrghd 63, 1, 0
; PC64LE9-NEXT: lfd 1, .LCPI39_2@toc@l(3)
-; PC64LE9-NEXT: li 4, 3
; PC64LE9-NEXT: bl __powidf2
; PC64LE9-NEXT: nop
; PC64LE9-NEXT: addis 3, 2, .LCPI39_3@toc@ha
; PC64LE9-NEXT: # kill: def $f1 killed $f1 def $vsl1
; PC64LE9-NEXT: stxv 1, 32(1) # 16-byte Folded Spill
-; PC64LE9-NEXT: lfd 1, .LCPI39_3@toc@l(3)
; PC64LE9-NEXT: li 4, 3
+; PC64LE9-NEXT: lfd 1, .LCPI39_3@toc@l(3)
; PC64LE9-NEXT: bl __powidf2
; PC64LE9-NEXT: nop
; PC64LE9-NEXT: lxv 0, 32(1) # 16-byte Folded Reload
; PC64LE9-NEXT: bl sinf
; PC64LE9-NEXT: nop
; PC64LE9-NEXT: xscvdpspn 0, 1
+; PC64LE9-NEXT: addis 3, 2, .LCPI42_3@toc@ha
+; PC64LE9-NEXT: addi 3, 3, .LCPI42_3@toc@l
; PC64LE9-NEXT: xxsldwi 34, 0, 0, 3
; PC64LE9-NEXT: xscvdpspn 0, 30
; PC64LE9-NEXT: xxsldwi 35, 0, 0, 3
; PC64LE9-NEXT: xscvdpspn 0, 31
-; PC64LE9-NEXT: addis 3, 2, .LCPI42_3@toc@ha
-; PC64LE9-NEXT: addi 3, 3, .LCPI42_3@toc@l
; PC64LE9-NEXT: vmrghw 2, 3, 2
; PC64LE9-NEXT: lxvx 35, 0, 3
; PC64LE9-NEXT: xxsldwi 36, 0, 0, 3
; PC64LE9-NEXT: std 0, 16(1)
; PC64LE9-NEXT: stdu 1, -64(1)
; PC64LE9-NEXT: addis 3, 2, .LCPI43_0@toc@ha
-; PC64LE9-NEXT: lfd 1, .LCPI43_0@toc@l(3)
; PC64LE9-NEXT: stxv 63, 48(1) # 16-byte Folded Spill
+; PC64LE9-NEXT: lfd 1, .LCPI43_0@toc@l(3)
; PC64LE9-NEXT: bl sin
; PC64LE9-NEXT: nop
; PC64LE9-NEXT: addis 3, 2, .LCPI43_1@toc@ha
; PC64LE9-NEXT: std 0, 16(1)
; PC64LE9-NEXT: stdu 1, -64(1)
; PC64LE9-NEXT: addis 3, 2, .LCPI44_0@toc@ha
-; PC64LE9-NEXT: lfd 1, .LCPI44_0@toc@l(3)
; PC64LE9-NEXT: stxv 63, 48(1) # 16-byte Folded Spill
+; PC64LE9-NEXT: lfd 1, .LCPI44_0@toc@l(3)
; PC64LE9-NEXT: bl sin
; PC64LE9-NEXT: nop
; PC64LE9-NEXT: addis 3, 2, .LCPI44_1@toc@ha
; PC64LE9-NEXT: bl cosf
; PC64LE9-NEXT: nop
; PC64LE9-NEXT: xscvdpspn 0, 1
+; PC64LE9-NEXT: addis 3, 2, .LCPI47_3@toc@ha
+; PC64LE9-NEXT: addi 3, 3, .LCPI47_3@toc@l
; PC64LE9-NEXT: xxsldwi 34, 0, 0, 3
; PC64LE9-NEXT: xscvdpspn 0, 30
; PC64LE9-NEXT: xxsldwi 35, 0, 0, 3
; PC64LE9-NEXT: xscvdpspn 0, 31
-; PC64LE9-NEXT: addis 3, 2, .LCPI47_3@toc@ha
-; PC64LE9-NEXT: addi 3, 3, .LCPI47_3@toc@l
; PC64LE9-NEXT: vmrghw 2, 3, 2
; PC64LE9-NEXT: lxvx 35, 0, 3
; PC64LE9-NEXT: xxsldwi 36, 0, 0, 3
; PC64LE9-NEXT: std 0, 16(1)
; PC64LE9-NEXT: stdu 1, -64(1)
; PC64LE9-NEXT: addis 3, 2, .LCPI48_0@toc@ha
-; PC64LE9-NEXT: lfd 1, .LCPI48_0@toc@l(3)
; PC64LE9-NEXT: stxv 63, 48(1) # 16-byte Folded Spill
+; PC64LE9-NEXT: lfd 1, .LCPI48_0@toc@l(3)
; PC64LE9-NEXT: bl cos
; PC64LE9-NEXT: nop
; PC64LE9-NEXT: addis 3, 2, .LCPI48_1@toc@ha
; PC64LE9-NEXT: std 0, 16(1)
; PC64LE9-NEXT: stdu 1, -64(1)
; PC64LE9-NEXT: addis 3, 2, .LCPI49_0@toc@ha
-; PC64LE9-NEXT: lfd 1, .LCPI49_0@toc@l(3)
; PC64LE9-NEXT: stxv 63, 48(1) # 16-byte Folded Spill
+; PC64LE9-NEXT: lfd 1, .LCPI49_0@toc@l(3)
; PC64LE9-NEXT: bl cos
; PC64LE9-NEXT: nop
; PC64LE9-NEXT: addis 3, 2, .LCPI49_1@toc@ha
; PC64LE9-NEXT: bl expf
; PC64LE9-NEXT: nop
; PC64LE9-NEXT: xscvdpspn 0, 1
+; PC64LE9-NEXT: addis 3, 2, .LCPI52_3@toc@ha
+; PC64LE9-NEXT: addi 3, 3, .LCPI52_3@toc@l
; PC64LE9-NEXT: xxsldwi 34, 0, 0, 3
; PC64LE9-NEXT: xscvdpspn 0, 30
; PC64LE9-NEXT: xxsldwi 35, 0, 0, 3
; PC64LE9-NEXT: xscvdpspn 0, 31
-; PC64LE9-NEXT: addis 3, 2, .LCPI52_3@toc@ha
-; PC64LE9-NEXT: addi 3, 3, .LCPI52_3@toc@l
; PC64LE9-NEXT: vmrghw 2, 3, 2
; PC64LE9-NEXT: lxvx 35, 0, 3
; PC64LE9-NEXT: xxsldwi 36, 0, 0, 3
; PC64LE9-NEXT: std 0, 16(1)
; PC64LE9-NEXT: stdu 1, -64(1)
; PC64LE9-NEXT: addis 3, 2, .LCPI53_0@toc@ha
-; PC64LE9-NEXT: lfd 1, .LCPI53_0@toc@l(3)
; PC64LE9-NEXT: stxv 63, 48(1) # 16-byte Folded Spill
+; PC64LE9-NEXT: lfd 1, .LCPI53_0@toc@l(3)
; PC64LE9-NEXT: bl exp
; PC64LE9-NEXT: nop
; PC64LE9-NEXT: addis 3, 2, .LCPI53_1@toc@ha
; PC64LE9-NEXT: std 0, 16(1)
; PC64LE9-NEXT: stdu 1, -64(1)
; PC64LE9-NEXT: addis 3, 2, .LCPI54_0@toc@ha
-; PC64LE9-NEXT: lfd 1, .LCPI54_0@toc@l(3)
; PC64LE9-NEXT: stxv 63, 48(1) # 16-byte Folded Spill
+; PC64LE9-NEXT: lfd 1, .LCPI54_0@toc@l(3)
; PC64LE9-NEXT: bl exp
; PC64LE9-NEXT: nop
; PC64LE9-NEXT: addis 3, 2, .LCPI54_1@toc@ha
; PC64LE9-NEXT: bl exp2f
; PC64LE9-NEXT: nop
; PC64LE9-NEXT: xscvdpspn 0, 1
+; PC64LE9-NEXT: addis 3, 2, .LCPI57_3@toc@ha
+; PC64LE9-NEXT: addi 3, 3, .LCPI57_3@toc@l
; PC64LE9-NEXT: xxsldwi 34, 0, 0, 3
; PC64LE9-NEXT: xscvdpspn 0, 30
; PC64LE9-NEXT: xxsldwi 35, 0, 0, 3
; PC64LE9-NEXT: xscvdpspn 0, 31
-; PC64LE9-NEXT: addis 3, 2, .LCPI57_3@toc@ha
-; PC64LE9-NEXT: addi 3, 3, .LCPI57_3@toc@l
; PC64LE9-NEXT: vmrghw 2, 3, 2
; PC64LE9-NEXT: lxvx 35, 0, 3
; PC64LE9-NEXT: xxsldwi 36, 0, 0, 3
; PC64LE9-NEXT: std 0, 16(1)
; PC64LE9-NEXT: stdu 1, -64(1)
; PC64LE9-NEXT: addis 3, 2, .LCPI58_0@toc@ha
-; PC64LE9-NEXT: lfd 1, .LCPI58_0@toc@l(3)
; PC64LE9-NEXT: stxv 63, 48(1) # 16-byte Folded Spill
+; PC64LE9-NEXT: lfd 1, .LCPI58_0@toc@l(3)
; PC64LE9-NEXT: bl exp2
; PC64LE9-NEXT: nop
; PC64LE9-NEXT: addis 3, 2, .LCPI58_1@toc@ha
; PC64LE9-NEXT: std 0, 16(1)
; PC64LE9-NEXT: stdu 1, -64(1)
; PC64LE9-NEXT: addis 3, 2, .LCPI59_0@toc@ha
-; PC64LE9-NEXT: lfd 1, .LCPI59_0@toc@l(3)
; PC64LE9-NEXT: stxv 63, 48(1) # 16-byte Folded Spill
+; PC64LE9-NEXT: lfd 1, .LCPI59_0@toc@l(3)
; PC64LE9-NEXT: bl exp2
; PC64LE9-NEXT: nop
; PC64LE9-NEXT: addis 3, 2, .LCPI59_1@toc@ha
; PC64LE9-NEXT: bl logf
; PC64LE9-NEXT: nop
; PC64LE9-NEXT: xscvdpspn 0, 1
+; PC64LE9-NEXT: addis 3, 2, .LCPI62_3@toc@ha
+; PC64LE9-NEXT: addi 3, 3, .LCPI62_3@toc@l
; PC64LE9-NEXT: xxsldwi 34, 0, 0, 3
; PC64LE9-NEXT: xscvdpspn 0, 30
; PC64LE9-NEXT: xxsldwi 35, 0, 0, 3
; PC64LE9-NEXT: xscvdpspn 0, 31
-; PC64LE9-NEXT: addis 3, 2, .LCPI62_3@toc@ha
-; PC64LE9-NEXT: addi 3, 3, .LCPI62_3@toc@l
; PC64LE9-NEXT: vmrghw 2, 3, 2
; PC64LE9-NEXT: lxvx 35, 0, 3
; PC64LE9-NEXT: xxsldwi 36, 0, 0, 3
; PC64LE9-NEXT: std 0, 16(1)
; PC64LE9-NEXT: stdu 1, -64(1)
; PC64LE9-NEXT: addis 3, 2, .LCPI63_0@toc@ha
-; PC64LE9-NEXT: lfd 1, .LCPI63_0@toc@l(3)
; PC64LE9-NEXT: stxv 63, 48(1) # 16-byte Folded Spill
+; PC64LE9-NEXT: lfd 1, .LCPI63_0@toc@l(3)
; PC64LE9-NEXT: bl log
; PC64LE9-NEXT: nop
; PC64LE9-NEXT: addis 3, 2, .LCPI63_1@toc@ha
; PC64LE9-NEXT: std 0, 16(1)
; PC64LE9-NEXT: stdu 1, -64(1)
; PC64LE9-NEXT: addis 3, 2, .LCPI64_0@toc@ha
-; PC64LE9-NEXT: lfd 1, .LCPI64_0@toc@l(3)
; PC64LE9-NEXT: stxv 63, 48(1) # 16-byte Folded Spill
+; PC64LE9-NEXT: lfd 1, .LCPI64_0@toc@l(3)
; PC64LE9-NEXT: bl log
; PC64LE9-NEXT: nop
; PC64LE9-NEXT: addis 3, 2, .LCPI64_1@toc@ha
; PC64LE9-NEXT: bl log10f
; PC64LE9-NEXT: nop
; PC64LE9-NEXT: xscvdpspn 0, 1
+; PC64LE9-NEXT: addis 3, 2, .LCPI67_3@toc@ha
+; PC64LE9-NEXT: addi 3, 3, .LCPI67_3@toc@l
; PC64LE9-NEXT: xxsldwi 34, 0, 0, 3
; PC64LE9-NEXT: xscvdpspn 0, 30
; PC64LE9-NEXT: xxsldwi 35, 0, 0, 3
; PC64LE9-NEXT: xscvdpspn 0, 31
-; PC64LE9-NEXT: addis 3, 2, .LCPI67_3@toc@ha
-; PC64LE9-NEXT: addi 3, 3, .LCPI67_3@toc@l
; PC64LE9-NEXT: vmrghw 2, 3, 2
; PC64LE9-NEXT: lxvx 35, 0, 3
; PC64LE9-NEXT: xxsldwi 36, 0, 0, 3
; PC64LE9-NEXT: std 0, 16(1)
; PC64LE9-NEXT: stdu 1, -64(1)
; PC64LE9-NEXT: addis 3, 2, .LCPI68_0@toc@ha
-; PC64LE9-NEXT: lfd 1, .LCPI68_0@toc@l(3)
; PC64LE9-NEXT: stxv 63, 48(1) # 16-byte Folded Spill
+; PC64LE9-NEXT: lfd 1, .LCPI68_0@toc@l(3)
; PC64LE9-NEXT: bl log10
; PC64LE9-NEXT: nop
; PC64LE9-NEXT: addis 3, 2, .LCPI68_1@toc@ha
; PC64LE9-NEXT: std 0, 16(1)
; PC64LE9-NEXT: stdu 1, -64(1)
; PC64LE9-NEXT: addis 3, 2, .LCPI69_0@toc@ha
-; PC64LE9-NEXT: lfd 1, .LCPI69_0@toc@l(3)
; PC64LE9-NEXT: stxv 63, 48(1) # 16-byte Folded Spill
+; PC64LE9-NEXT: lfd 1, .LCPI69_0@toc@l(3)
; PC64LE9-NEXT: bl log10
; PC64LE9-NEXT: nop
; PC64LE9-NEXT: addis 3, 2, .LCPI69_1@toc@ha
; PC64LE9-NEXT: bl log2f
; PC64LE9-NEXT: nop
; PC64LE9-NEXT: xscvdpspn 0, 1
+; PC64LE9-NEXT: addis 3, 2, .LCPI72_3@toc@ha
+; PC64LE9-NEXT: addi 3, 3, .LCPI72_3@toc@l
; PC64LE9-NEXT: xxsldwi 34, 0, 0, 3
; PC64LE9-NEXT: xscvdpspn 0, 30
; PC64LE9-NEXT: xxsldwi 35, 0, 0, 3
; PC64LE9-NEXT: xscvdpspn 0, 31
-; PC64LE9-NEXT: addis 3, 2, .LCPI72_3@toc@ha
-; PC64LE9-NEXT: addi 3, 3, .LCPI72_3@toc@l
; PC64LE9-NEXT: vmrghw 2, 3, 2
; PC64LE9-NEXT: lxvx 35, 0, 3
; PC64LE9-NEXT: xxsldwi 36, 0, 0, 3
; PC64LE9-NEXT: std 0, 16(1)
; PC64LE9-NEXT: stdu 1, -64(1)
; PC64LE9-NEXT: addis 3, 2, .LCPI73_0@toc@ha
-; PC64LE9-NEXT: lfd 1, .LCPI73_0@toc@l(3)
; PC64LE9-NEXT: stxv 63, 48(1) # 16-byte Folded Spill
+; PC64LE9-NEXT: lfd 1, .LCPI73_0@toc@l(3)
; PC64LE9-NEXT: bl log2
; PC64LE9-NEXT: nop
; PC64LE9-NEXT: addis 3, 2, .LCPI73_1@toc@ha
; PC64LE9-NEXT: std 0, 16(1)
; PC64LE9-NEXT: stdu 1, -64(1)
; PC64LE9-NEXT: addis 3, 2, .LCPI74_0@toc@ha
-; PC64LE9-NEXT: lfd 1, .LCPI74_0@toc@l(3)
; PC64LE9-NEXT: stxv 63, 48(1) # 16-byte Folded Spill
+; PC64LE9-NEXT: lfd 1, .LCPI74_0@toc@l(3)
; PC64LE9-NEXT: bl log2
; PC64LE9-NEXT: nop
; PC64LE9-NEXT: addis 3, 2, .LCPI74_1@toc@ha
; PC64LE9-NEXT: bl nearbyintf
; PC64LE9-NEXT: nop
; PC64LE9-NEXT: xscvdpspn 0, 1
+; PC64LE9-NEXT: addis 3, 2, .LCPI82_3@toc@ha
+; PC64LE9-NEXT: addi 3, 3, .LCPI82_3@toc@l
; PC64LE9-NEXT: xxsldwi 34, 0, 0, 3
; PC64LE9-NEXT: xscvdpspn 0, 30
; PC64LE9-NEXT: xxsldwi 35, 0, 0, 3
; PC64LE9-NEXT: xscvdpspn 0, 31
-; PC64LE9-NEXT: addis 3, 2, .LCPI82_3@toc@ha
-; PC64LE9-NEXT: addi 3, 3, .LCPI82_3@toc@l
; PC64LE9-NEXT: vmrghw 2, 3, 2
; PC64LE9-NEXT: lxvx 35, 0, 3
; PC64LE9-NEXT: xxsldwi 36, 0, 0, 3
; PC64LE9-NEXT: bl fmaxf
; PC64LE9-NEXT: nop
; PC64LE9-NEXT: addis 3, 2, .LCPI87_4@toc@ha
-; PC64LE9-NEXT: lfs 2, .LCPI87_4@toc@l(3)
; PC64LE9-NEXT: fmr 29, 1
; PC64LE9-NEXT: fmr 1, 31
+; PC64LE9-NEXT: lfs 2, .LCPI87_4@toc@l(3)
; PC64LE9-NEXT: bl fmaxf
; PC64LE9-NEXT: nop
; PC64LE9-NEXT: xscvdpspn 0, 1
+; PC64LE9-NEXT: addis 3, 2, .LCPI87_5@toc@ha
+; PC64LE9-NEXT: addi 3, 3, .LCPI87_5@toc@l
+; PC64LE9-NEXT: lxvx 36, 0, 3
; PC64LE9-NEXT: xxsldwi 34, 0, 0, 3
; PC64LE9-NEXT: xscvdpspn 0, 29
; PC64LE9-NEXT: xxsldwi 35, 0, 0, 3
; PC64LE9-NEXT: xscvdpspn 0, 30
-; PC64LE9-NEXT: addis 3, 2, .LCPI87_5@toc@ha
-; PC64LE9-NEXT: addi 3, 3, .LCPI87_5@toc@l
-; PC64LE9-NEXT: lxvx 36, 0, 3
; PC64LE9-NEXT: vmrghw 2, 3, 2
; PC64LE9-NEXT: xxsldwi 35, 0, 0, 3
; PC64LE9-NEXT: vperm 2, 3, 2, 4
; PC64LE9-NEXT: bl fmax
; PC64LE9-NEXT: nop
; PC64LE9-NEXT: addis 3, 2, .LCPI88_2@toc@ha
+; PC64LE9-NEXT: fmr 3, 1
; PC64LE9-NEXT: addi 3, 3, .LCPI88_2@toc@l
; PC64LE9-NEXT: lxvx 0, 0, 3
; PC64LE9-NEXT: addis 3, 2, .LCPI88_3@toc@ha
; PC64LE9-NEXT: addi 3, 3, .LCPI88_3@toc@l
-; PC64LE9-NEXT: fmr 3, 1
; PC64LE9-NEXT: lxvx 1, 0, 3
; PC64LE9-NEXT: xvmaxdp 2, 1, 0
; PC64LE9-NEXT: xxswapd 1, 2
; PC64LE9-NEXT: bl fminf
; PC64LE9-NEXT: nop
; PC64LE9-NEXT: addis 3, 2, .LCPI92_4@toc@ha
-; PC64LE9-NEXT: lfs 2, .LCPI92_4@toc@l(3)
; PC64LE9-NEXT: fmr 29, 1
; PC64LE9-NEXT: fmr 1, 31
+; PC64LE9-NEXT: lfs 2, .LCPI92_4@toc@l(3)
; PC64LE9-NEXT: bl fminf
; PC64LE9-NEXT: nop
; PC64LE9-NEXT: xscvdpspn 0, 1
+; PC64LE9-NEXT: addis 3, 2, .LCPI92_5@toc@ha
+; PC64LE9-NEXT: addi 3, 3, .LCPI92_5@toc@l
+; PC64LE9-NEXT: lxvx 36, 0, 3
; PC64LE9-NEXT: xxsldwi 34, 0, 0, 3
; PC64LE9-NEXT: xscvdpspn 0, 29
; PC64LE9-NEXT: xxsldwi 35, 0, 0, 3
; PC64LE9-NEXT: xscvdpspn 0, 30
-; PC64LE9-NEXT: addis 3, 2, .LCPI92_5@toc@ha
-; PC64LE9-NEXT: addi 3, 3, .LCPI92_5@toc@l
-; PC64LE9-NEXT: lxvx 36, 0, 3
; PC64LE9-NEXT: vmrghw 2, 3, 2
; PC64LE9-NEXT: xxsldwi 35, 0, 0, 3
; PC64LE9-NEXT: vperm 2, 3, 2, 4
; PC64LE9-NEXT: bl fmin
; PC64LE9-NEXT: nop
; PC64LE9-NEXT: addis 3, 2, .LCPI93_2@toc@ha
+; PC64LE9-NEXT: fmr 3, 1
; PC64LE9-NEXT: addi 3, 3, .LCPI93_2@toc@l
; PC64LE9-NEXT: lxvx 0, 0, 3
; PC64LE9-NEXT: addis 3, 2, .LCPI93_3@toc@ha
; PC64LE9-NEXT: addi 3, 3, .LCPI93_3@toc@l
-; PC64LE9-NEXT: fmr 3, 1
; PC64LE9-NEXT: lxvx 1, 0, 3
; PC64LE9-NEXT: xvmindp 2, 1, 0
; PC64LE9-NEXT: xxswapd 1, 2
; NOFUSION_MISCHEDPOSTRA-LABEL: macrofuse_alu_je:
; NOFUSION_MISCHEDPOSTRA: # %bb.0: # %entry
; NOFUSION_MISCHEDPOSTRA-NEXT: movl %edi, %eax
-; NOFUSION_MISCHEDPOSTRA-NEXT: addl $-512, %eax # imm = 0xFE00
; NOFUSION_MISCHEDPOSTRA-NEXT: movb $1, (%rsi)
+; NOFUSION_MISCHEDPOSTRA-NEXT: addl $-512, %eax # imm = 0xFE00
; NOFUSION_MISCHEDPOSTRA-NEXT: je .LBB2_2
; NOFUSION_MISCHEDPOSTRA-NEXT: # %bb.1: # %if.then
; NOFUSION_MISCHEDPOSTRA-NEXT: movl $1, %eax
; BRANCHFUSIONONLY_MISCHEDPOSTRA-LABEL: macrofuse_alu_je:
; BRANCHFUSIONONLY_MISCHEDPOSTRA: # %bb.0: # %entry
; BRANCHFUSIONONLY_MISCHEDPOSTRA-NEXT: movl %edi, %eax
-; BRANCHFUSIONONLY_MISCHEDPOSTRA-NEXT: addl $-512, %eax # imm = 0xFE00
; BRANCHFUSIONONLY_MISCHEDPOSTRA-NEXT: movb $1, (%rsi)
+; BRANCHFUSIONONLY_MISCHEDPOSTRA-NEXT: addl $-512, %eax # imm = 0xFE00
; BRANCHFUSIONONLY_MISCHEDPOSTRA-NEXT: je .LBB2_2
; BRANCHFUSIONONLY_MISCHEDPOSTRA-NEXT: # %bb.1: # %if.then
; BRANCHFUSIONONLY_MISCHEDPOSTRA-NEXT: movl $1, %eax
; NOFUSION_MISCHEDPOSTRA-LABEL: macrofuse_dec_je:
; NOFUSION_MISCHEDPOSTRA: # %bb.0: # %entry
; NOFUSION_MISCHEDPOSTRA-NEXT: movl %edi, %eax
-; NOFUSION_MISCHEDPOSTRA-NEXT: decl %eax
; NOFUSION_MISCHEDPOSTRA-NEXT: movb $1, (%rsi)
+; NOFUSION_MISCHEDPOSTRA-NEXT: decl %eax
; NOFUSION_MISCHEDPOSTRA-NEXT: je .LBB3_2
; NOFUSION_MISCHEDPOSTRA-NEXT: # %bb.1: # %if.then
; NOFUSION_MISCHEDPOSTRA-NEXT: movl $1, %eax
; BRANCHFUSIONONLY_MISCHEDPOSTRA-LABEL: macrofuse_dec_je:
; BRANCHFUSIONONLY_MISCHEDPOSTRA: # %bb.0: # %entry
; BRANCHFUSIONONLY_MISCHEDPOSTRA-NEXT: movl %edi, %eax
-; BRANCHFUSIONONLY_MISCHEDPOSTRA-NEXT: decl %eax
; BRANCHFUSIONONLY_MISCHEDPOSTRA-NEXT: movb $1, (%rsi)
+; BRANCHFUSIONONLY_MISCHEDPOSTRA-NEXT: decl %eax
; BRANCHFUSIONONLY_MISCHEDPOSTRA-NEXT: je .LBB3_2
; BRANCHFUSIONONLY_MISCHEDPOSTRA-NEXT: # %bb.1: # %if.then
; BRANCHFUSIONONLY_MISCHEDPOSTRA-NEXT: movl $1, %eax
--- /dev/null
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=x86_64 -enable-post-misched -run-pass=postmisched -o - %s | FileCheck %s
+---
+# Check that postmisched's TopDepthReduce heuristic schedules the DEC32r after
+# the independent MOV8mi, since the DEC32r depends on the MOV32rr def of $eax.
+name: test
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: test
+ ; CHECK: $eax = MOV32rr killed $edi
+ ; CHECK: MOV8mi killed renamable $rsi, 1, $noreg, 0, $noreg, 1 :: (store 1)
+ ; CHECK: renamable $eax = DEC32r killed renamable $eax, implicit-def $eflags
+ $eax = MOV32rr $edi
+ renamable $eax = DEC32r killed renamable $eax, implicit-def $eflags
+ MOV8mi killed renamable $rsi, 1, $noreg, 0, $noreg, 1 :: (store 1)
+...