From 95ea50e4adf76b75fcc0ad29cacd10642db091a6 Mon Sep 17 00:00:00 2001
From: "Kazushi (Jam) Marukawa" <marukawa@nec.com>
Date: Sat, 5 Dec 2020 16:53:39 +0900
Subject: [PATCH] [VE] Correct LVLGen (LVL instruction insert pass)

SX Aurora VE uses an intermediate representation similar to VP as its MIR.
VE itself uses an individual VL register as its own vector length register at
the hardware level.  So, LLVM needs to insert a load VL (LVL) instruction just
before vector instructions if the value of VL is changed.  This LVLGen pass
generates LVL instructions for that purpose.  Previously, a bug was pointed
out in D91416.  This patch corrects that bug and adds a regression test.

Reviewed By: craig.topper

Differential Revision: https://reviews.llvm.org/D92716
---
 llvm/lib/Target/VE/LVLGen.cpp                | 25 ++++++++++-------
 llvm/test/CodeGen/VE/VELIntrinsics/lvlgen.ll | 41 +++++++++++++++++++++++++++-
 2 files changed, 55 insertions(+), 11 deletions(-)

diff --git a/llvm/lib/Target/VE/LVLGen.cpp b/llvm/lib/Target/VE/LVLGen.cpp
index 08b350a581d..c4588926af9 100644
--- a/llvm/lib/Target/VE/LVLGen.cpp
+++ b/llvm/lib/Target/VE/LVLGen.cpp
@@ -68,6 +68,12 @@ bool LVLGen::runOnMachineBasicBlock(MachineBasicBlock &MBB) {
   for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end();) {
     MachineBasicBlock::iterator MI = I;
 
+    // Check whether MI uses a vector length operand.  If so, prepare the VL
+    // register for it.  We would like to reuse the VL register as much as
+    // possible, and also to keep the number of LEA instructions as small as
+    // possible.  Therefore, we use a regular scalar register to hold the
+    // immediate value loaded into the VL register, and try to reuse identical
+    // scalar registers to avoid new LVLr instructions as much as possible.
     unsigned Reg = getVL(*MI);
     if (Reg != VE::NoRegister) {
       LLVM_DEBUG(dbgs() << "Vector instruction found: ");
@@ -78,6 +84,8 @@ bool LVLGen::runOnMachineBasicBlock(MachineBasicBlock &MBB) {
                         << ". ");
 
       if (!HasRegForVL || RegForVL != Reg) {
+        // Use VL, but a different value in a different scalar register.
+        // So, generate new LVL instruction just before the current instruction.
         LLVM_DEBUG(dbgs() << "Generate a LVL instruction to load "
                           << RegName(Reg) << ".\n");
         BuildMI(MBB, I, MI->getDebugLoc(), TII->get(VE::LVLr)).addReg(Reg);
@@ -87,18 +95,15 @@ bool LVLGen::runOnMachineBasicBlock(MachineBasicBlock &MBB) {
       } else {
         LLVM_DEBUG(dbgs() << "Reuse current VL.\n");
       }
-    } else if (HasRegForVL) {
-      // Old VL is overwritten, so disable HasRegForVL.
-      if (MI->findRegisterDefOperandIdx(RegForVL, false, false, TRI) != -1) {
-        LLVM_DEBUG(dbgs() << RegName(RegForVL) << " is killed: ");
-        LLVM_DEBUG(MI->dump());
-        HasRegForVL = false;
-      }
     }
+    // Check the update of a given scalar register holding an immediate value
+    // for VL register.  Also, a call doesn't preserve VL register.
     if (HasRegForVL) {
-      // The latest VL is killed, so disable HasRegForVL.
-      if (MI->killsRegister(RegForVL, TRI)) {
-        LLVM_DEBUG(dbgs() << RegName(RegForVL) << " is killed: ");
+      if (MI->definesRegister(RegForVL, TRI) ||
+          MI->modifiesRegister(RegForVL, TRI) ||
+          MI->killsRegister(RegForVL, TRI) || MI->isCall()) {
+        // The latest VL needs to be updated, so disable HasRegForVL.
+        LLVM_DEBUG(dbgs() << RegName(RegForVL) << " is needed to be updated: ");
         LLVM_DEBUG(MI->dump());
         HasRegForVL = false;
       }
diff --git a/llvm/test/CodeGen/VE/VELIntrinsics/lvlgen.ll b/llvm/test/CodeGen/VE/VELIntrinsics/lvlgen.ll
index ac889e7b60c..c4db6244245 100644
--- a/llvm/test/CodeGen/VE/VELIntrinsics/lvlgen.ll
+++ b/llvm/test/CodeGen/VE/VELIntrinsics/lvlgen.ll
@@ -42,7 +42,6 @@ define void @switching_vl(i32 %evl, i32 %evl2, i8* %P, i8* %Q) {
 ; Check that no redundant 'lvl' is inserted when vector length does not change
 ; in a basic block.
 
-
 ; Function Attrs: nounwind
 define void @stable_vl(i32 %evl, i8* %P, i8* %Q) {
 ; CHECK-LABEL: stable_vl:
@@ -64,3 +63,43 @@ define void @stable_vl(i32 %evl, i8* %P, i8* %Q) {
   tail call void @llvm.ve.vl.vst.vssl(<256 x double> %l2, i64 16, i8* %Q, i32 %evl)
   ret void
 }
+
+;;; Check the case where we have a call in the middle of vector instructions.
+
+; Function Attrs: nounwind
+define void @call_invl(i32 %evl, i8* %P, i8* %Q) {
+; CHECK-LABEL: call_invl:
+; CHECK:       .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:    st %s18, 288(, %s11) # 8-byte Folded Spill
+; CHECK-NEXT:    st %s19, 296(, %s11) # 8-byte Folded Spill
+; CHECK-NEXT:    st %s20, 304(, %s11) # 8-byte Folded Spill
+; CHECK-NEXT:    or %s18, 0, %s1
+; CHECK-NEXT:    and %s20, %s0, (32)0
+; CHECK-NEXT:    lvl %s20
+; CHECK-NEXT:    vld %v0, 8, %s1
+; CHECK-NEXT:    or %s19, 0, %s2
+; CHECK-NEXT:    vst %v0, 16, %s2
+; CHECK-NEXT:    lea %s0, fun@lo
+; CHECK-NEXT:    and %s0, %s0, (32)0
+; CHECK-NEXT:    lea.sl %s12, fun@hi(, %s0)
+; CHECK-NEXT:    bsic %s10, (, %s12)
+; CHECK-NEXT:    lvl %s20
+; CHECK-NEXT:    vld %v0, 16, %s18
+; CHECK-NEXT:    vst %v0, 16, %s19
+; CHECK-NEXT:    vld %v0, 8, %s18
+; CHECK-NEXT:    vst %v0, 16, %s19
+; CHECK-NEXT:    ld %s20, 304(, %s11) # 8-byte Folded Reload
+; CHECK-NEXT:    ld %s19, 296(, %s11) # 8-byte Folded Reload
+; CHECK-NEXT:    ld %s18, 288(, %s11) # 8-byte Folded Reload
+; CHECK-NEXT:    or %s11, 0, %s9
+  %l0 = tail call <256 x double> @llvm.ve.vl.vld.vssl(i64 8, i8* %P, i32 %evl)
+  tail call void @llvm.ve.vl.vst.vssl(<256 x double> %l0, i64 16, i8* %Q, i32 %evl)
+  call void @fun()
+  %l1 = tail call <256 x double> @llvm.ve.vl.vld.vssl(i64 16, i8* %P, i32 %evl)
+  tail call void @llvm.ve.vl.vst.vssl(<256 x double> %l1, i64 16, i8* %Q, i32 %evl)
+  %l2 = tail call <256 x double> @llvm.ve.vl.vld.vssl(i64 8, i8* %P, i32 %evl)
+  tail call void @llvm.ve.vl.vst.vssl(<256 x double> %l2, i64 16, i8* %Q, i32 %evl)
+  ret void
+}
+
+declare void @fun()
-- 
2.11.0