namespace {
+// Extract a 32-bit scalar value from a constant (integer or FP) node.
+// Integer constants are sign-extended before truncation to 32 bits so
+// negative 16-bit values keep their sign; the caller masks each 16-bit
+// half when packing. Returns false if N is not a constant.
+static bool getConstantValue(SDValue N, uint32_t &Out) {
+ if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N)) {
+ Out = C->getAPIntValue().getSExtValue();
+ return true;
+ }
+
+ if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N)) {
+ // FP constants are packed by raw bit pattern, not numeric value.
+ Out = C->getValueAPF().bitcastToAPInt().getSExtValue();
+ return true;
+ }
+
+ return false;
+}
+
+// TODO: Handle undef as zero
+// Fold a 2-element BUILD_VECTOR of constants into a single S_MOV_B32:
+// operand 0 goes in bits [15:0], operand 1 in bits [31:16]. If Negate
+// is set, each half is negated before packing. Returns nullptr when
+// either operand is not a constant.
+static SDNode *packConstantV2I16(const SDNode *N, SelectionDAG &DAG,
+ bool Negate = false) {
+ assert(N->getOpcode() == ISD::BUILD_VECTOR && N->getNumOperands() == 2);
+ uint32_t LHSVal, RHSVal;
+ if (getConstantValue(N->getOperand(0), LHSVal) &&
+ getConstantValue(N->getOperand(1), RHSVal)) {
+ SDLoc SL(N);
+ // Unsigned negation is well-defined; the << 16 discards any bits
+ // above 15 of the high half, so only the low half needs masking.
+ uint32_t K = Negate ?
+ (-LHSVal & 0xffff) | (-RHSVal << 16) :
+ (LHSVal & 0xffff) | (RHSVal << 16);
+ return DAG.getMachineNode(AMDGPU::S_MOV_B32, SL, N->getValueType(0),
+ DAG.getTargetConstant(K, SL, MVT::i32));
+ }
+
+ return nullptr;
+}
+
+// Convenience wrapper around packConstantV2I16 that negates both
+// halves before packing (used by the getNegV2I16Imm SDNodeXForm).
+static SDNode *packNegConstantV2I16(const SDNode *N, SelectionDAG &DAG) {
+ return packConstantV2I16(N, DAG, true);
+}
+
/// AMDGPU specific code to select AMDGPU machine instructions for
/// SelectionDAG operations.
class AMDGPUDAGToDAGISel : public SelectionDAGISel {
private:
std::pair<SDValue, SDValue> foldFrameIndex(SDValue N) const;
bool isNoNanSrc(SDValue N) const;
- bool isInlineImmediate(const SDNode *N) const;
+ bool isInlineImmediate(const SDNode *N, bool Negated = false) const;
+ // Returns true if the negation of constant node N can be encoded as
+ // an inline immediate (thin wrapper over isInlineImmediate).
+ bool isNegInlineImmediate(const SDNode *N) const {
+ return isInlineImmediate(N, true);
+ }
+
bool isVGPRImm(const SDNode *N) const;
bool isUniformLoad(const SDNode *N) const;
bool isUniformBr(const SDNode *N) const;
return CurDAG->isKnownNeverNaN(N);
}
-bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N) const {
+// Returns true if constant node N (integer or FP) can be encoded as an
+// inline immediate on this subtarget. With Negated set, the negated
+// value is tested instead; for FP constants the negation is applied to
+// the raw bit pattern (bitcastToAPInt), matching packConstantV2I16.
+bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N,
+ bool Negated) const {
+ // TODO: Handle undef
+
const SIInstrInfo *TII = Subtarget->getInstrInfo();
+ if (Negated) {
+ if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N))
+ return TII->isInlineConstant(-C->getAPIntValue());
- if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N))
- return TII->isInlineConstant(C->getAPIntValue());
+ if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N))
+ return TII->isInlineConstant(-C->getValueAPF().bitcastToAPInt());
- if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N))
- return TII->isInlineConstant(C->getValueAPF().bitcastToAPInt());
+ } else {
+ if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N))
+ return TII->isInlineConstant(C->getAPIntValue());
+
+ if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N))
+ return TII->isInlineConstant(C->getValueAPF().bitcastToAPInt());
+ }
+ // Non-constant nodes are never inline immediates.
return false;
}
llvm_unreachable("invalid vector size");
}
-static bool getConstantValue(SDValue N, uint32_t &Out) {
- if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N)) {
- Out = C->getAPIntValue().getZExtValue();
- return true;
- }
-
- if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N)) {
- Out = C->getValueAPF().bitcastToAPInt().getZExtValue();
- return true;
- }
-
- return false;
-}
-
void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
EVT VT = N->getValueType(0);
unsigned NumVectorElts = VT.getVectorNumElements();
unsigned NumVectorElts = VT.getVectorNumElements();
if (VT.getScalarSizeInBits() == 16) {
if (Opc == ISD::BUILD_VECTOR && NumVectorElts == 2) {
- uint32_t LHSVal, RHSVal;
- if (getConstantValue(N->getOperand(0), LHSVal) &&
- getConstantValue(N->getOperand(1), RHSVal)) {
- uint32_t K = LHSVal | (RHSVal << 16);
- CurDAG->SelectNodeTo(N, AMDGPU::S_MOV_B32, VT,
- CurDAG->getTargetConstant(K, SDLoc(N), MVT::i32));
+ if (SDNode *Packed = packConstantV2I16(N, *CurDAG)) {
+ ReplaceNode(N, Packed);
return;
}
}
return N->getZExtValue() < 32;
}]>;
+// SDNodeXForm: replace a constant v2i16 build_vector with a single i32
+// target constant holding both halves negated and packed (low element
+// in bits [15:0]); delegates to packNegConstantV2I16.
+def getNegV2I16Imm : SDNodeXForm<build_vector, [{
+ return SDValue(packNegConstantV2I16(N, *CurDAG), 0);
+}]>;
+
+
+// TODO: Handle undef as 0
+// Matches a constant v2i16 build_vector whose negation is encodable as
+// an inline immediate: either both halves are the same negatable
+// constant (op_sel_hi broadcast case), or one half is zero and the
+// other negates to an inline constant. On match, getNegV2I16Imm packs
+// the negated halves into one i32 immediate.
+def NegSubInlineConstV216 : PatLeaf<(build_vector), [{
+ assert(N->getNumOperands() == 2);
+ assert(N->getOperand(0).getValueType().getSizeInBits() == 16);
+ SDValue Src0 = N->getOperand(0);
+ SDValue Src1 = N->getOperand(1);
+ if (Src0 == Src1)
+ return isNegInlineImmediate(Src0.getNode());
+
+ return (isNullConstant(Src0) && isNegInlineImmediate(Src1.getNode())) ||
+ (isNullConstant(Src1) && isNegInlineImmediate(Src0.getNode()));
+}], getNegV2I16Imm>;
+
//===----------------------------------------------------------------------===//
// Custom Operands
//===----------------------------------------------------------------------===//
def V_PK_ASHRREV_I16 : VOP3PInst<"v_pk_ashrrev_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, ashr_rev>;
def V_PK_LSHRREV_B16 : VOP3PInst<"v_pk_lshrrev_b16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, lshr_rev>;
+
+// Undo sub x, c -> add x, -c canonicalization since c is more likely
+// an inline immediate than -c.
+// The constant will be emitted as a mov, and folded later.
+// TODO: We could directly encode the immediate now
+// Selects (add x, -c) as V_PK_SUB_U16 x, c when -c matches
+// NegSubInlineConstV216; the xform re-packs the negated constant.
+def : GCNPat<
+ (add (v2i16 (VOP3PMods0 v2i16:$src0, i32:$src0_modifiers, i1:$clamp)), NegSubInlineConstV216:$src1),
+ (V_PK_SUB_U16 $src0_modifiers, $src0, SRCMODS.OP_SEL_1, NegSubInlineConstV216:$src1, $clamp)
+>;
+
multiclass MadFmaMixPats<SDPatternOperator fma_like,
Instruction mix_inst,
Instruction mixlo_inst,
}
; GCN-LABEL: {{^}}v_test_add_v2i16_inline_neg1:
-; GFX9: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, -1 op_sel_hi:[1,0]{{$}}
+; GFX9: v_pk_sub_u16 v{{[0-9]+}}, v{{[0-9]+}}, 1 op_sel_hi:[1,0]{{$}}
; VI: v_mov_b32_e32 v[[SCONST:[0-9]+]], -1
; VI: flat_load_dword [[LOAD:v[0-9]+]]
; GCN-LABEL: {{^}}chain_hi_to_lo_group_other_dep:
; GFX900: ds_read_u16_d16_hi v1, v0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
-; GFX900-NEXT: v_pk_add_u16 v1, v1, 12 op_sel_hi:[1,0]
+; GFX900-NEXT: v_pk_sub_u16 v1, v1, -12 op_sel_hi:[1,0]
; GFX900-NEXT: ds_read_u16_d16 v1, v0 offset:2
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: v_mov_b32_e32 v0, v1
; GFX900-NEXT: ds_read_u16_d16_hi v0, v0
; GFX900-NEXT: v_mov_b32_e32 [[MASK:v[0-9]+]], 0xffff
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
-; GFX900-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0]
+; GFX900-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0]
; GFX900-NEXT: v_bfi_b32 v0, [[MASK]], v1, v0
; GFX900-NEXT: s_setpc_b64
define <2 x i16> @chain_hi_to_lo_group_other_dep_multi_chain(i16 addrspace(3)* %ptr) {
; GCN-LABEL: {{^}}chain_hi_to_lo_private_other_dep:
; GFX900: buffer_load_short_d16_hi v1, v0, s[0:3], s4 offen
; GFX900-NEXT: s_waitcnt vmcnt(0)
-; GFX900-NEXT: v_pk_add_u16 v1, v1, 12 op_sel_hi:[1,0]
+; GFX900-NEXT: v_pk_sub_u16 v1, v1, -12 op_sel_hi:[1,0]
; GFX900-NEXT: buffer_load_short_d16 v1, v0, s[0:3], s4 offen offset:2
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: v_mov_b32_e32 v0, v1
; GFX900-NEXT: global_load_short_d16_hi v0, v[0:1], off
; GFX900-NEXT: v_mov_b32_e32 [[MASK:v[0-9]+]], 0xffff
; GFX900-NEXT: s_waitcnt vmcnt(0)
-; GFX900-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0]
+; GFX900-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0]
; GFX900-NEXT: v_bfi_b32 v0, [[MASK]], v2, v0
; GFX900-NEXT: s_setpc_b64
define <2 x i16> @chain_hi_to_lo_global_other_dep(i16 addrspace(1)* %ptr) {
; GFX900-NEXT: flat_load_short_d16_hi v0, v[0:1]
; GFX900-NEXT: v_mov_b32_e32 [[MASK:v[0-9]+]], 0xffff
; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX900-NEXT: v_pk_add_u16 v0, v0, 12 op_sel_hi:[1,0]
+; GFX900-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0]
; GFX900-NEXT: v_bfi_b32 v0, v1, v2, v0
; GFX900-NEXT: s_setpc_b64
define <2 x i16> @chain_hi_to_lo_flat_other_dep(i16 addrspace(0)* %ptr) {
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX9-NEXT: s_movk_i32 s4, 0xffe0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_pk_add_u16 v2, v3, s4 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_sub_u16 v2, v3, 32 op_sel_hi:[1,0]
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX9-NEXT: s_mov_b32 s4, 0xffe00000
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_pk_add_u16 v2, v3, s4
+; GFX9-NEXT: v_pk_sub_u16 v2, v3, 32 op_sel:[0,1] op_sel_hi:[1,0]
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX9-NEXT: s_mov_b32 s4, 0xffe0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_pk_add_u16 v2, v3, s4
+; GFX9-NEXT: v_pk_sub_u16 v2, v3, 32
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_pk_add_u16 v2, v3, -16 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_sub_u16 v2, v3, 16 op_sel_hi:[1,0]
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_pk_add_u16 v2, v3, -16 op_sel:[0,1] op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_sub_u16 v2, v3, 16 op_sel:[0,1] op_sel_hi:[1,0]
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_pk_add_u16 v2, v3, -16
+; GFX9-NEXT: v_pk_sub_u16 v2, v3, 16
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_pk_add_u16 v2, v3, -4.0 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_sub_u16 v2, v3, 1.0 op_sel_hi:[1,0]
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_pk_add_u16 v2, v3, 4.0 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_sub_u16 v2, v3, -1.0 op_sel_hi:[1,0]
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_pk_add_u16 v2, v3, 2.0 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_sub_u16 v2, v3, -2.0 op_sel_hi:[1,0]
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_pk_add_u16 v2, v3, -2.0 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_sub_u16 v2, v3, 2.0 op_sel_hi:[1,0]
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
; GFX9: s_load_dword [[VAL:s[0-9]+]]
; GFX9: v_pk_sub_i16 [[SUB:v[0-9]+]], 0, [[VAL]]
; GFX9: v_pk_max_i16 [[MAX:v[0-9]+]], [[VAL]], [[SUB]]
-; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]], [[MAX]], 2
+; GFX9: v_pk_sub_u16 [[ADD:v[0-9]+]], [[MAX]], -2 op_sel_hi:[1,0]
; CIVI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
; CIVI: s_sub_i32
; GFX9: global_load_dword [[VAL:v[0-9]+]]
; GFX9: v_pk_sub_i16 [[SUB:v[0-9]+]], 0, [[VAL]]
; GFX9: v_pk_max_i16 [[MAX:v[0-9]+]], [[VAL]], [[SUB]]
-; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]], [[MAX]], 2
+; GFX9: v_pk_sub_u16 [[ADD:v[0-9]+]], [[MAX]], -2 op_sel_hi:[1,0]
; VI: v_mov_b32_e32 [[TWO:v[0-9]+]], 2
; VI: v_lshrrev_b32_e32 v{{[0-9]+}}, 16,
; GFX9: s_load_dword [[VAL:s[0-9]+]]
; GFX9: v_pk_sub_i16 [[SUB:v[0-9]+]], 0, [[VAL]]
; GFX9: v_pk_max_i16 [[MAX:v[0-9]+]], [[VAL]], [[SUB]]
-; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]], [[MAX]], 2
+; GFX9: v_pk_sub_u16 [[ADD:v[0-9]+]], [[MAX]], -2 op_sel_hi:[1,0]
define amdgpu_kernel void @s_abs_v2i16_2(<2 x i16> addrspace(1)* %out, <2 x i16> %val) #0 {
%z0 = insertelement <2 x i16> undef, i16 0, i16 0
%z1 = insertelement <2 x i16> %z0, i16 0, i16 1
; GFX9: buffer_load_dword [[VAL:v[0-9]+]]
; GFX9: v_pk_sub_i16 [[SUB:v[0-9]+]], 0, [[VAL]]
; GFX9: v_pk_max_i16 [[MAX:v[0-9]+]], [[VAL]], [[SUB]]
-; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]], [[MAX]], 2
+; GFX9: v_pk_sub_u16 [[ADD:v[0-9]+]], [[MAX]], -2 op_sel_hi:[1,0]
define amdgpu_kernel void @v_abs_v2i16_2(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %src) #0 {
%z0 = insertelement <2 x i16> undef, i16 0, i16 0
%z1 = insertelement <2 x i16> %z0, i16 0, i16 1
; GFX9-DAG: v_pk_sub_i16 [[SUB1:v[0-9]+]], 0, s[[VAL1]]
; GFX9-DAG: v_pk_max_i16 [[MAX0:v[0-9]+]], s[[VAL0]], [[SUB0]]
; GFX9-DAG: v_pk_max_i16 [[MAX1:v[0-9]+]], s[[VAL1]], [[SUB1]]
-; GFX9-DAG: v_pk_add_u16 [[ADD0:v[0-9]+]], [[MAX0]], 2 op_sel_hi:[1,0]
-; GFX9-DAG: v_pk_add_u16 [[ADD1:v[0-9]+]], [[MAX1]], 2 op_sel_hi:[1,0]
+; GFX9-DAG: v_pk_sub_u16 [[ADD0:v[0-9]+]], [[MAX0]], -2 op_sel_hi:[1,0]
+; GFX9-DAG: v_pk_sub_u16 [[ADD1:v[0-9]+]], [[MAX1]], -2 op_sel_hi:[1,0]
define amdgpu_kernel void @s_abs_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %val) #0 {
%z0 = insertelement <4 x i16> undef, i16 0, i16 0
%z1 = insertelement <4 x i16> %z0, i16 0, i16 1
; GFX9-DAG: v_pk_sub_i16 [[SUB0:v[0-9]+]], 0, v[[VAL0]]
; GFX9-DAG: v_pk_max_i16 [[MAX0:v[0-9]+]], v[[VAL0]], [[SUB0]]
-; GFX9-DAG: v_pk_add_u16 [[ADD0:v[0-9]+]], [[MAX0]], 2
+; GFX9-DAG: v_pk_sub_u16 [[ADD0:v[0-9]+]], [[MAX0]], -2 op_sel_hi:[1,0]
; GFX9-DAG: v_pk_sub_i16 [[SUB1:v[0-9]+]], 0, v[[VAL1]]
; GFX9-DAG: v_pk_max_i16 [[MAX1:v[0-9]+]], v[[VAL1]], [[SUB1]]
-; GFX9-DAG: v_pk_add_u16 [[ADD1:v[0-9]+]], [[MAX1]], 2
+; GFX9-DAG: v_pk_sub_u16 [[ADD1:v[0-9]+]], [[MAX1]], -2 op_sel_hi:[1,0]
define amdgpu_kernel void @v_abs_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %src) #0 {
%z0 = insertelement <4 x i16> undef, i16 0, i16 0
%z1 = insertelement <4 x i16> %z0, i16 0, i16 1