From 45d8d9a866ac6f971981071a82693e9249d696d8 Mon Sep 17 00:00:00 2001 From: Justin Bogner Date: Wed, 6 Jul 2016 20:02:45 +0000 Subject: [PATCH] NVPTX: Replace uses of cuda.syncthreads with nvvm.barrier0 Everywhere where cuda.syncthreads or __syncthreads is used, use the properly namespaced nvvm.barrier0 instead. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@274664 91177308-0d34-0410-b5e6-96231b3b80d8 --- docs/LangRef.rst | 2 +- docs/NVPTXUsage.rst | 2 +- include/llvm/IR/IntrinsicsNVVM.td | 2 -- lib/Target/NVPTX/NVPTXInstrInfo.cpp | 2 +- lib/Target/NVPTX/NVPTXIntrinsics.td | 3 --- test/CodeGen/NVPTX/MachineSink-convergent.ll | 4 ++-- test/CodeGen/NVPTX/TailDuplication-convergent.ll | 4 ++-- test/CodeGen/NVPTX/access-non-generic.ll | 12 ++++++------ test/CodeGen/NVPTX/noduplicate-syncthreads.ll | 8 ++++---- test/Feature/intrinsic-noduplicate.ll | 6 +++--- test/Transforms/FunctionAttrs/convergent.ll | 6 +++--- 11 files changed, 23 insertions(+), 28 deletions(-) diff --git a/docs/LangRef.rst b/docs/LangRef.rst index 70c24050310..699f56c2d62 100644 --- a/docs/LangRef.rst +++ b/docs/LangRef.rst @@ -1318,7 +1318,7 @@ example: The ``convergent`` attribute may appear on functions or call/invoke instructions. When it appears on a function, it indicates that calls to this function should not be made control-dependent on additional values. - For example, the intrinsic ``llvm.cuda.syncthreads`` is ``convergent``, so + For example, the intrinsic ``llvm.nvvm.barrier0`` is ``convergent``, so calls to this intrinsic cannot be made control-dependent on additional values. diff --git a/docs/NVPTXUsage.rst b/docs/NVPTXUsage.rst index cf072eca134..8b8c40f1fd7 100644 --- a/docs/NVPTXUsage.rst +++ b/docs/NVPTXUsage.rst @@ -566,7 +566,7 @@ Intrinsic CUDA Equivalent ``i32 @llvm.nvvm.read.ptx.sreg.ctaid.{x,y,z}`` blockIdx.{x,y,z} ``i32 @llvm.nvvm.read.ptx.sreg.ntid.{x,y,z}`` blockDim.{x,y,z} ``i32 @llvm.nvvm.read.ptx.sreg.nctaid.{x,y,z}`` gridDim.{x,y,z} -``void @llvm.cuda.syncthreads()`` __syncthreads() +``void @llvm.nvvm.barrier0()`` __syncthreads() ================================================ ==================== diff --git a/include/llvm/IR/IntrinsicsNVVM.td b/include/llvm/IR/IntrinsicsNVVM.td index f5838bbd4c7..b446e4d4074 100644 --- a/include/llvm/IR/IntrinsicsNVVM.td +++ b/include/llvm/IR/IntrinsicsNVVM.td @@ -729,8 +729,6 @@ def llvm_anyi64ptr_ty : LLVMAnyPointerType; // (space)i64* [IntrArgMemOnly, NoCapture<0>]>; // Bar.Sync - def int_cuda_syncthreads : GCCBuiltin<"__syncthreads">, - Intrinsic<[], [], [IntrConvergent]>; def int_nvvm_barrier0 : GCCBuiltin<"__nvvm_bar0">, Intrinsic<[], [], [IntrConvergent]>; def int_nvvm_barrier0_popc : GCCBuiltin<"__nvvm_bar0_popc">, diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.cpp b/lib/Target/NVPTX/NVPTXInstrInfo.cpp index 3cebfce0f93..cd1d355937f 100644 --- a/lib/Target/NVPTX/NVPTXInstrInfo.cpp +++ b/lib/Target/NVPTX/NVPTXInstrInfo.cpp @@ -112,7 +112,7 @@ bool NVPTXInstrInfo::isStoreInstr(const MachineInstr &MI, bool NVPTXInstrInfo::CanTailMerge(const MachineInstr *MI) const { unsigned addrspace = 0; - if (MI->getOpcode() == NVPTX::INT_CUDA_SYNCTHREADS) + if (MI->getOpcode() == NVPTX::INT_BARRIER0) return false; if (isLoadInstr(*MI, addrspace)) if (addrspace == NVPTX::PTXLdStInstCode::SHARED) diff --git a/lib/Target/NVPTX/NVPTXIntrinsics.td b/lib/Target/NVPTX/NVPTXIntrinsics.td index 6f7df39c771..6e40421323e 100644 --- a/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -33,9 +33,6 @@ def immDouble1 : PatLeaf<(fpimm), [{ // 
Synchronization and shuffle functions //----------------------------------- let isConvergent = 1 in { -def INT_CUDA_SYNCTHREADS : NVPTXInst<(outs), (ins), - "bar.sync \t0;", - [(int_cuda_syncthreads)]>; def INT_BARRIER0 : NVPTXInst<(outs), (ins), "bar.sync \t0;", [(int_nvvm_barrier0)]>; diff --git a/test/CodeGen/NVPTX/MachineSink-convergent.ll b/test/CodeGen/NVPTX/MachineSink-convergent.ll index c06fe224688..91c80182e2f 100644 --- a/test/CodeGen/NVPTX/MachineSink-convergent.ll +++ b/test/CodeGen/NVPTX/MachineSink-convergent.ll @@ -1,7 +1,7 @@ ; RUN: llc < %s | FileCheck %s target triple = "nvptx64-nvidia-cuda" -declare void @llvm.cuda.syncthreads() +declare void @llvm.nvvm.barrier0() ; Load a value, then syncthreads. Branch, and use the loaded value only on one ; side of the branch. The load shouldn't be sunk beneath the call, because @@ -11,7 +11,7 @@ Start: ; CHECK: ld.u32 %ptr_val = load i32, i32* %ptr ; CHECK: bar.sync - call void @llvm.cuda.syncthreads() + call void @llvm.nvvm.barrier0() br i1 %cond, label %L1, label %L2 L1: %ptr_val2 = add i32 %ptr_val, 100 diff --git a/test/CodeGen/NVPTX/TailDuplication-convergent.ll b/test/CodeGen/NVPTX/TailDuplication-convergent.ll index 02b562d85b9..fc6867eca41 100644 --- a/test/CodeGen/NVPTX/TailDuplication-convergent.ll +++ b/test/CodeGen/NVPTX/TailDuplication-convergent.ll @@ -2,7 +2,7 @@ target triple = "nvptx64-nvidia-cuda" declare void @foo() -declare void @llvm.cuda.syncthreads() +declare void @llvm.nvvm.barrier0() ; syncthreads shouldn't be duplicated. ; CHECK: .func call_syncthreads @@ -20,7 +20,7 @@ L2: store i32 1, i32* %a br label %L42 L42: - call void @llvm.cuda.syncthreads() + call void @llvm.nvvm.barrier0() br label %Ret } diff --git a/test/CodeGen/NVPTX/access-non-generic.ll b/test/CodeGen/NVPTX/access-non-generic.ll index 8645ae612d4..3cd5a922508 100644 --- a/test/CodeGen/NVPTX/access-non-generic.ll +++ b/test/CodeGen/NVPTX/access-non-generic.ll @@ -34,7 +34,7 @@ define void @ld_st_shared_f32(i32 %i, float %v) { store float %v, float* addrspacecast (float addrspace(3)* @scalar to float*), align 4 ; PTX: st.shared.f32 [scalar], %f{{[0-9]+}}; ; use syncthreads to disable optimizations across components - call void @llvm.cuda.syncthreads() + call void @llvm.nvvm.barrier0() ; PTX: bar.sync 0; ; cast; load @@ -45,7 +45,7 @@ define void @ld_st_shared_f32(i32 %i, float %v) { ; cast; store store float %v, float* %2, align 4 ; PTX: st.shared.f32 [scalar], %f{{[0-9]+}}; - call void @llvm.cuda.syncthreads() + call void @llvm.nvvm.barrier0() ; PTX: bar.sync 0; ; load gep cast @@ -55,7 +55,7 @@ define void @ld_st_shared_f32(i32 %i, float %v) { ; store gep cast store float %v, float* getelementptr inbounds ([10 x float], [10 x float]* addrspacecast ([10 x float] addrspace(3)* @array to [10 x float]*), i32 0, i32 5), align 4 ; PTX: st.shared.f32 [array+20], %f{{[0-9]+}}; - call void @llvm.cuda.syncthreads() + call void @llvm.nvvm.barrier0() ; PTX: bar.sync 0; ; gep cast; load @@ -66,7 +66,7 @@ define void @ld_st_shared_f32(i32 %i, float %v) { ; gep cast; store store float %v, float* %5, align 4 ; PTX: st.shared.f32 [array+20], %f{{[0-9]+}}; - call void @llvm.cuda.syncthreads() + call void @llvm.nvvm.barrier0() ; PTX: bar.sync 0; ; cast; gep; load @@ -78,7 +78,7 @@ define void @ld_st_shared_f32(i32 %i, float %v) { ; cast; gep; store store float %v, float* %8, align 4 ; PTX: st.shared.f32 [%{{(r|rl|rd)[0-9]+}}], %f{{[0-9]+}}; - call void @llvm.cuda.syncthreads() + call void @llvm.nvvm.barrier0() ; PTX: bar.sync 0; ret void @@ -181,7 +181,7 @@ 
exit: ret void } -declare void @llvm.cuda.syncthreads() #3 +declare void @llvm.nvvm.barrier0() #3 declare void @use(float) diff --git a/test/CodeGen/NVPTX/noduplicate-syncthreads.ll b/test/CodeGen/NVPTX/noduplicate-syncthreads.ll index 2fec31b3791..ca7fb6eddfe 100644 --- a/test/CodeGen/NVPTX/noduplicate-syncthreads.ll +++ b/test/CodeGen/NVPTX/noduplicate-syncthreads.ll @@ -3,8 +3,8 @@ ; Make sure the call to syncthreads is not duplicate here by the LLVM ; optimizations, because it has the noduplicate attribute set. -; CHECK: call void @llvm.cuda.syncthreads -; CHECK-NOT: call void @llvm.cuda.syncthreads +; CHECK: call void @llvm.nvvm.barrier0 +; CHECK-NOT: call void @llvm.nvvm.barrier0 ; Function Attrs: nounwind define void @foo(float* %output) #1 { @@ -37,7 +37,7 @@ if.else: ; preds = %entry br label %if.end if.end: ; preds = %if.else, %if.then - call void @llvm.cuda.syncthreads() + call void @llvm.nvvm.barrier0() %6 = load float*, float** %output.addr, align 8 %arrayidx6 = getelementptr inbounds float, float* %6, i64 0 %7 = load float, float* %arrayidx6, align 4 @@ -68,7 +68,7 @@ if.end17: ; preds = %if.else13, %if.then } ; Function Attrs: noduplicate nounwind -declare void @llvm.cuda.syncthreads() #2 +declare void @llvm.nvvm.barrier0() #2 !0 = !{void (float*)* @foo, !"kernel", i32 1} !1 = !{null, !"align", i32 8} diff --git a/test/Feature/intrinsic-noduplicate.ll b/test/Feature/intrinsic-noduplicate.ll index 370026223e8..4f2ae1c698c 100644 --- a/test/Feature/intrinsic-noduplicate.ll +++ b/test/Feature/intrinsic-noduplicate.ll @@ -1,9 +1,9 @@ ; RUN: llvm-as < %s | llvm-dis | FileCheck %s ; Make sure LLVM knows about the convergent attribute on the -; llvm.cuda.syncthreads intrinsic. +; llvm.nvvm.barrier0 intrinsic. -declare void @llvm.cuda.syncthreads() +declare void @llvm.nvvm.barrier0() -; CHECK: declare void @llvm.cuda.syncthreads() #[[ATTRNUM:[0-9]+]] +; CHECK: declare void @llvm.nvvm.barrier0() #[[ATTRNUM:[0-9]+]] ; CHECK: attributes #[[ATTRNUM]] = { convergent nounwind } diff --git a/test/Transforms/FunctionAttrs/convergent.ll b/test/Transforms/FunctionAttrs/convergent.ll index bc21d85ec22..37886b82b87 100644 --- a/test/Transforms/FunctionAttrs/convergent.ll +++ b/test/Transforms/FunctionAttrs/convergent.ll @@ -59,15 +59,15 @@ define i32 @indirect_non_convergent_call(i32 ()* %f) convergent norecurse { ; CHECK: Function Attrs ; CHECK-SAME: convergent -; CHECK-NEXT: declare void @llvm.cuda.syncthreads() -declare void @llvm.cuda.syncthreads() convergent +; CHECK-NEXT: declare void @llvm.nvvm.barrier0() +declare void @llvm.nvvm.barrier0() convergent ; CHECK: Function Attrs ; CHECK-SAME: convergent ; CHECK-NEXT: define i32 @intrinsic() define i32 @intrinsic() convergent { ; Implicitly convergent, because the intrinsic is convergent. - call void @llvm.cuda.syncthreads() + call void @llvm.nvvm.barrier0() ret i32 0 } -- 2.11.0
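
For illustration, here is a minimal standalone LLVM IR sketch (not part of the commit above; the kernel @example and its parameters are hypothetical) showing the renamed intrinsic in use. The convergent declaration matches the updated IntrinsicsNVVM.td entry, and the call is emitted as "bar.sync 0;" through the INT_BARRIER0 instruction:

; Standalone sketch, assuming only the post-patch intrinsic name.
target triple = "nvptx64-nvidia-cuda"

; Convergent, so calls to this intrinsic cannot be made
; control-dependent on additional values (see the LangRef change).
declare void @llvm.nvvm.barrier0() convergent

define void @example(float* %out, float %v) {
entry:
  store float %v, float* %out, align 4
  ; Equivalent to CUDA's __syncthreads(); lowered to "bar.sync 0;".
  call void @llvm.nvvm.barrier0()
  ret void
}

Because the declaration is convergent, passes such as MachineSink and TailDuplication must not sink loads past the call or duplicate it, which is exactly what the updated tests above check.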