[ARM] Add FP16 vector insert/extract patterns

author Mikhail Maltsev <mikhail.maltsev@arm.com>

Tue, 4 Jun 2019 09:39:55 +0000 (09:39 +0000)

committer Mikhail Maltsev <mikhail.maltsev@arm.com>

Tue, 4 Jun 2019 09:39:55 +0000 (09:39 +0000)
author Mikhail Maltsev <mikhail.maltsev@arm.com>
Tue, 4 Jun 2019 09:39:55 +0000 (09:39 +0000)
committer Mikhail Maltsev <mikhail.maltsev@arm.com>
Tue, 4 Jun 2019 09:39:55 +0000 (09:39 +0000)
diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td

index effee0f..1c7bbab 100644 (file)
--- a/lib/Target/ARM/ARMInstrNEON.td
+++ b/lib/Target/ARM/ARMInstrNEON.td
@@ -1117,6 +1117,12 @@ def VLD1LNq8Pseudo  : VLD1QLNPseudo<v16i8, extloadi8>;
  def VLD1LNq16Pseudo : VLD1QLNPseudo<v8i16, extloadi16>;
  def VLD1LNq32Pseudo : VLD1QLNPseudo<v4i32, load>;
  
+def : Pat<(vector_insert (v4f16 DPR:$src),
+                         (f16 (load addrmode6:$addr)), imm:$lane),
+          (VLD1LNd16 addrmode6:$addr, DPR:$src, imm:$lane)>;
+def : Pat<(vector_insert (v8f16 QPR:$src),
+                         (f16 (load addrmode6:$addr)), imm:$lane),
+          (VLD1LNq16Pseudo addrmode6:$addr, QPR:$src, imm:$lane)>;
  def : Pat<(vector_insert (v2f32 DPR:$src),
                           (f32 (load addrmode6:$addr)), imm:$lane),
            (VLD1LNd32 addrmode6:$addr, DPR:$src, imm:$lane)>;
@@ -2175,6 +2181,11 @@ def : Pat<(store (extractelt (v2f32 DPR:$src), imm:$lane), addrmode6:$addr),
  def : Pat<(store (extractelt (v4f32 QPR:$src), imm:$lane), addrmode6:$addr),
            (VST1LNq32Pseudo addrmode6:$addr, QPR:$src, imm:$lane)>;
  
+def : Pat<(store (extractelt (v4f16 DPR:$src), imm:$lane), addrmode6:$addr),
+          (VST1LNd16 addrmode6:$addr, DPR:$src, imm:$lane)>;
+def : Pat<(store (extractelt (v8f16 QPR:$src), imm:$lane), addrmode6:$addr),
+          (VST1LNq16Pseudo addrmode6:$addr, QPR:$src, imm:$lane)>;
+
  // ...with address register writeback:
  class VST1LNWB<bits<4> op11_8, bits<4> op7_4, string Dt, ValueType Ty,
                 PatFrag StoreOp, SDNode ExtractOp, Operand AdrMode>
@@ -2504,6 +2515,13 @@ def SSubReg_f32_reg : SDNodeXForm<imm, [{
                                     MVT::i32);
  }]>;
  
+// Extract S sub-registers of Q/D registers containing a given f16 lane.
+def SSubReg_f16_reg : SDNodeXForm<imm, [{
+  assert(ARM::ssub_3 == ARM::ssub_0+3 && "Unexpected subreg numbering");
+  return CurDAG->getTargetConstant(ARM::ssub_0 + N->getZExtValue()/2, SDLoc(N),
+                                   MVT::i32);
+}]>;
+
  // Translate lane numbers from Q registers to D subregs.
  def SubReg_i8_lane  : SDNodeXForm<imm, [{
    return CurDAG->getTargetConstant(N->getZExtValue() & 7, SDLoc(N), MVT::i32);
@@ -6223,6 +6241,32 @@ def : Pat<(extractelt (v4f32 QPR:$src1), imm:$src2),
  def : Pat<(extractelt (v2f64 QPR:$src1), imm:$src2),
            (EXTRACT_SUBREG QPR:$src1, (DSubReg_f64_reg imm:$src2))>;
  
+def imm_even : ImmLeaf<i32, [{ return (Imm & 1) == 0; }]>;
+def imm_odd : ImmLeaf<i32, [{ return (Imm & 1) == 1; }]>;
+
+def : Pat<(extractelt (v4f16 DPR:$src), imm_even:$lane),
+            (EXTRACT_SUBREG
+                (v2f32 (COPY_TO_REGCLASS (v4f16 DPR:$src), DPR_VFP2)),
+                (SSubReg_f16_reg imm_even:$lane))>;
+
+def : Pat<(extractelt (v4f16 DPR:$src), imm_odd:$lane),
+            (COPY_TO_REGCLASS
+              (VMOVH (EXTRACT_SUBREG
+                  (v2f32 (COPY_TO_REGCLASS (v4f16 DPR:$src), DPR_VFP2)),
+                  (SSubReg_f16_reg imm_odd:$lane))),
+              HPR)>;
+
+def : Pat<(extractelt (v8f16 QPR:$src), imm_even:$lane),
+            (EXTRACT_SUBREG
+                (v4f32 (COPY_TO_REGCLASS (v8f16 QPR:$src), QPR_VFP2)),
+                (SSubReg_f16_reg imm_even:$lane))>;
+
+def : Pat<(extractelt (v8f16 QPR:$src), imm_odd:$lane),
+            (COPY_TO_REGCLASS
+              (VMOVH (EXTRACT_SUBREG
+                  (v4f32 (COPY_TO_REGCLASS (v8f16 QPR:$src), QPR_VFP2)),
+                  (SSubReg_f16_reg imm_odd:$lane))),
+              HPR)>;
  
  //   VMOV     : Vector Set Lane (move ARM core register to scalar)
  
@@ -6281,6 +6325,15 @@ def : Pat<(v4f32 (insertelt QPR:$src1, SPR:$src2, imm:$src3)),
            (INSERT_SUBREG (v4f32 (COPY_TO_REGCLASS QPR:$src1, QPR_VFP2)),
                                  SPR:$src2, (SSubReg_f32_reg imm:$src3))>;
  
+def : Pat<(insertelt (v4f16 DPR:$src1), HPR:$src2, imm:$lane),
+          (v4f16 (VSETLNi16 DPR:$src1, (VMOVRH $src2), imm:$lane))>;
+def : Pat<(insertelt (v8f16 QPR:$src1), HPR:$src2, imm:$lane),
+          (v8f16 (INSERT_SUBREG QPR:$src1,
+                   (v4i16 (VSETLNi16 (v4i16 (EXTRACT_SUBREG QPR:$src1,
+                                      (DSubReg_i16_reg imm:$lane))),
+                             (VMOVRH $src2), (SubReg_i16_lane imm:$lane))),
+                   (DSubReg_i16_reg imm:$lane)))>;
+
  //def : Pat<(v2i64 (insertelt QPR:$src1, DPR:$src2, imm:$src3)),
  //          (INSERT_SUBREG QPR:$src1, DPR:$src2, (DSubReg_f64_reg imm:$src3))>;
  def : Pat<(v2f64 (insertelt QPR:$src1, DPR:$src2, imm:$src3)),
diff --git a/test/CodeGen/ARM/fp16-insert-extract.ll b/test/CodeGen/ARM/fp16-insert-extract.ll

new file mode 100644 (file)

index 0000000..617a4df
--- /dev/null
+++ b/test/CodeGen/ARM/fp16-insert-extract.ll
@@ -0,0 +1,72 @@
+; RUN: llc -mtriple=arm-eabi -mattr=+armv8.2-a,+fullfp16,+neon -float-abi=hard -O1 < %s | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mattr=+armv8.2-a,+fullfp16,+neon -float-abi=soft -O1 < %s | FileCheck %s
+
+define float @test_vget_lane_f16_1(<4 x half> %a) nounwind {
+; CHECK-LABEL: test_vget_lane_f16_1:
+; CHECK:      vmovx.f16 s0, s0
+; CHECK-NEXT: vcvtb.f32.f16 s0, s0
+entry:
+  %elt = extractelement <4 x half> %a, i32 1
+  %conv = fpext half %elt to float
+  ret float %conv
+}
+
+define float @test_vget_lane_f16_2(<4 x half> %a) nounwind {
+; CHECK-LABEL: test_vget_lane_f16_2:
+; CHECK-NOT:  vmovx.f16
+; CHECK:      vcvtb.f32.f16 s0, s1
+entry:
+  %elt = extractelement <4 x half> %a, i32 2
+  %conv = fpext half %elt to float
+  ret float %conv
+}
+
+define float @test_vget_laneq_f16_6(<8 x half> %a) nounwind {
+; CHECK-LABEL: test_vget_laneq_f16_6:
+; CHECK-NOT:  vmovx.f16
+; CHECK:      vcvtb.f32.f16 s0, s3
+entry:
+  %elt = extractelement <8 x half> %a, i32 6
+  %conv = fpext half %elt to float
+  ret float %conv
+}
+
+define float @test_vget_laneq_f16_7(<8 x half> %a) nounwind {
+; CHECK-LABEL: test_vget_laneq_f16_7:
+; CHECK:      vmovx.f16 s0, s3
+; CHECK:      vcvtb.f32.f16 s0, s0
+entry:
+  %elt = extractelement <8 x half> %a, i32 7
+  %conv = fpext half %elt to float
+  ret float %conv
+}
+
+define <4 x half> @test_vset_lane_f16(<4 x half> %a, float %fb) nounwind {
+; CHECK-LABEL: test_vset_lane_f16:
+; CHECK: vmov.f16 r[[GPR:[0-9]+]], s{{[0-9]+}}
+; CHECK: vmov.16  d{{[0-9]+}}[3], r[[GPR]]
+entry:
+  %b = fptrunc float %fb to half
+  %x = insertelement <4 x half> %a, half %b, i32 3
+  ret <4 x half> %x
+}
+
+define <8 x half> @test_vset_laneq_f16_1(<8 x half> %a, float %fb) nounwind {
+; CHECK-LABEL: test_vset_laneq_f16_1:
+; CHECK: vmov.f16 r[[GPR:[0-9]+]], s{{[0-9]+}}
+; CHECK: vmov.16  d{{[0-9]+}}[1], r[[GPR]]
+entry:
+  %b = fptrunc float %fb to half
+  %x = insertelement <8 x half> %a, half %b, i32 1
+  ret <8 x half> %x
+}
+
+define <8 x half> @test_vset_laneq_f16_7(<8 x half> %a, float %fb) nounwind {
+; CHECK-LABEL: test_vset_laneq_f16_7:
+; CHECK: vmov.f16 r[[GPR:[0-9]+]], s{{[0-9]+}}
+; CHECK: vmov.16  d{{[0-9]+}}[3], r[[GPR]]
+entry:
+  %b = fptrunc float %fb to half
+  %x = insertelement <8 x half> %a, half %b, i32 7
+  ret <8 x half> %x
+}
diff --git a/test/CodeGen/ARM/fp16-vldlane-vstlane.ll b/test/CodeGen/ARM/fp16-vldlane-vstlane.ll

new file mode 100644 (file)

index 0000000..2a73583
--- /dev/null
+++ b/test/CodeGen/ARM/fp16-vldlane-vstlane.ll
@@ -0,0 +1,56 @@
+; RUN: llc -mtriple=arm-eabi -mattr=+armv8.2-a,+fullfp16,+neon -float-abi=hard -O1 < %s | FileCheck %s
+; RUN: llc -mtriple=arm-eabi -mattr=+armv8.2-a,+fullfp16,+neon -float-abi=soft -O1 < %s | FileCheck %s
+
+define <4 x half> @vld1d_lane_f16(half* %pa, <4 x half> %v4) nounwind {
+; CHECK-LABEL: vld1d_lane_f16:
+; CHECK: vld1.16 {d{{[0-9]+}}[3]}, [r0:16]
+entry:
+  %a = load half, half* %pa
+  %res = insertelement <4 x half> %v4, half %a, i32 3
+  ret <4 x half> %res
+}
+
+define <8 x half> @vld1q_lane_f16_1(half* %pa, <8 x half> %v8) nounwind {
+; CHECK-LABEL: vld1q_lane_f16_1:
+; CHECK: vld1.16 {d{{[0-9]+}}[1]}, [r0:16]
+entry:
+  %a = load half, half* %pa
+  %res = insertelement <8 x half> %v8, half %a, i32 1
+  ret <8 x half> %res
+}
+
+define <8 x half> @vld1q_lane_f16_7(half* %pa, <8 x half> %v8) nounwind {
+; CHECK-LABEL: vld1q_lane_f16_7:
+; CHECK: vld1.16 {d{{[0-9]+}}[3]}, [r0:16]
+entry:
+  %a = load half, half* %pa
+  %res = insertelement <8 x half> %v8, half %a, i32 7
+  ret <8 x half> %res
+}
+
+define void @vst1d_lane_f16(half* %pa, <4 x half> %v4) nounwind {
+; CHECK-LABEL: vst1d_lane_f16:
+; CHECK: vst1.16 {d{{[0-9]+}}[3]}, [r0:16]
+entry:
+  %a = extractelement <4 x half> %v4, i32 3
+  store half %a, half* %pa
+  ret void
+}
+
+define void @vst1q_lane_f16_7(half* %pa, <8 x half> %v8) nounwind {
+; CHECK-LABEL: vst1q_lane_f16_7:
+; CHECK: vst1.16 {d{{[0-9]+}}[3]}, [r0:16]
+entry:
+  %a = extractelement <8 x half> %v8, i32 7
+  store half %a, half* %pa
+  ret void
+}
+
+define void @vst1q_lane_f16_1(half* %pa, <8 x half> %v8) nounwind {
+; CHECK-LABEL: vst1q_lane_f16_1:
+; CHECK: vst1.16 {d{{[0-9]+}}[1]}, [r0:16]
+entry:
+  %a = extractelement <8 x half> %v8, i32 1
+  store half %a, half* %pa
+  ret void
+}
author	Mikhail Maltsev <mikhail.maltsev@arm.com>
	Tue, 4 Jun 2019 09:39:55 +0000 (09:39 +0000)
committer	Mikhail Maltsev <mikhail.maltsev@arm.com>
	Tue, 4 Jun 2019 09:39:55 +0000 (09:39 +0000)
lib/Target/ARM/ARMInstrNEON.td		patch \| blob \| history
test/CodeGen/ARM/fp16-insert-extract.ll	[new file with mode: 0644]	patch \| blob
test/CodeGen/ARM/fp16-vldlane-vstlane.ll	[new file with mode: 0644]	patch \| blob