utiltools/X86MAC64/cuda/include/sm_32_intrinsics.h

   1 /*
   2  * Copyright 1993-2012 NVIDIA Corporation.  All rights reserved.
   3  *
   4  * NOTICE TO LICENSEE:
   5  *
   6  * This source code and/or documentation ("Licensed Deliverables") are
   7  * subject to NVIDIA intellectual property rights under U.S. and
   8  * international Copyright laws.
   9  *
  10  * These Licensed Deliverables contained herein is PROPRIETARY and
  11  * CONFIDENTIAL to NVIDIA and is being provided under the terms and
  12  * conditions of a form of NVIDIA software license agreement by and
  13  * between NVIDIA and Licensee ("License Agreement") or electronically
  14  * accepted by Licensee.  Notwithstanding any terms or conditions to
  15  * the contrary in the License Agreement, reproduction or disclosure
  16  * of the Licensed Deliverables to any third party without the express
  17  * written consent of NVIDIA is prohibited.
  18  *
  19  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
  20  * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
  21  * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
  22  * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
  23  * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
  24  * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
  25  * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
  26  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
  27  * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
  28  * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
  29  * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
  30  * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
  31  * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
  32  * OF THESE LICENSED DELIVERABLES.
  33  *
  34  * U.S. Government End Users.  These Licensed Deliverables are a
  35  * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
  36  * 1995), consisting of "commercial computer software" and "commercial
  37  * computer software documentation" as such terms are used in 48
  38  * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
  39  * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
  40  * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
  41  * U.S. Government End Users acquire the Licensed Deliverables with
  42  * only those rights set forth herein.
  43  *
  44  * Any use of the Licensed Deliverables in individual and commercial
  45  * software must include, in the user documentation and internal
  46  * comments to the code, the above Disclaimer and U.S. Government End
  47  * Users Notice.
  48  */
  49
  50 #if !defined(__SM_32_INTRINSICS_H__)
  51 #define __SM_32_INTRINSICS_H__
  52
  53 #if defined(__cplusplus) && defined(__CUDACC__)
  54
  55 #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320
  56
  57 /*******************************************************************************
  58 *                                                                              *
  59 *                                                                              *
  60 *                                                                              *
  61 *******************************************************************************/
  62
  63 #include "builtin_types.h"
  64 #include "device_types.h"
  65 #include "host_defines.h"
  66
// In here are intrinsics which are built in to the compiler. These may be
// referenced by intrinsic implementations from this file.
// The linkage block below is intentionally empty: it declares nothing and is
// kept only as the place where compiler built-ins would be listed.
extern "C"
{
    // There are no intrinsics built in to the compiler for SM-3.5,
    // all intrinsics are now implemented as inline PTX below.
}
  74
  75 /*******************************************************************************
  76 *                                                                              *
  77 *  Below are implementations of SM-3.5 intrinsics which are included as        *
  78 *  source (instead of being built in to the compiler)                          *
  79 *                                                                              *
  80 *******************************************************************************/
  81
// LDG is a "load from global via texture path" command which can exhibit higher
// bandwidth on GK110 than a regular LD.
//
// __LDG_PTR is the inline-PTX operand constraint used for the address operand of
// every __ldg() overload below. It has to match the size of a pointer:
//   "l" - 64-bit register, chosen for 64-bit MSVC (_WIN64) and LP64 targets
//   "r" - 32-bit register, chosen for every other target
// The macro is private to this header and is #undef'd after the last __ldg().
#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__)
#define __LDG_PTR   "l"
#else
#define __LDG_PTR   "r"
#endif
  90
  91 static __device__ __inline__ char __ldg(const char *ptr) { unsigned int ret; asm volatile ("ld.global.nc.s8 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr)); return (char)ret; }
  92 static __device__ __inline__ short __ldg(const short *ptr) { unsigned short ret; asm volatile ("ld.global.nc.s16 %0, [%1];"  : "=h"(ret) : __LDG_PTR (ptr)); return (short)ret; }
  93 static __device__ __inline__ int __ldg(const int *ptr) { unsigned int ret; asm volatile ("ld.global.nc.s32 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr)); return (int)ret; }
  94 static __device__ __inline__ long long __ldg(const long long *ptr) { unsigned long long ret; asm volatile ("ld.global.nc.s64 %0, [%1];"  : "=l"(ret) : __LDG_PTR (ptr)); return (long long)ret; }
  95 static __device__ __inline__ char2 __ldg(const char2 *ptr) { char2 ret; int2 tmp; asm volatile ("ld.global.nc.v2.s8 {%0,%1}, [%2];"  : "=r"(tmp.x), "=r"(tmp.y) : __LDG_PTR (ptr)); ret.x = (char)tmp.x; ret.y = (char)tmp.y; return ret; }
  96 static __device__ __inline__ char4 __ldg(const char4 *ptr) { char4 ret; int4 tmp; asm volatile ("ld.global.nc.v4.s8 {%0,%1,%2,%3}, [%4];"  : "=r"(tmp.x), "=r"(tmp.y), "=r"(tmp.z), "=r"(tmp.w) : __LDG_PTR (ptr)); ret.x = (char)tmp.x; ret.y = (char)tmp.y; ret.z = (char)tmp.z; ret.w = (char)tmp.w; return ret; }
  97 static __device__ __inline__ short2 __ldg(const short2 *ptr) { short2 ret; asm volatile ("ld.global.nc.v2.s16 {%0,%1}, [%2];"  : "=h"(ret.x), "=h"(ret.y) : __LDG_PTR (ptr)); return ret; }
  98 static __device__ __inline__ short4 __ldg(const short4 *ptr) { short4 ret; asm volatile ("ld.global.nc.v4.s16 {%0,%1,%2,%3}, [%4];"  : "=h"(ret.x), "=h"(ret.y), "=h"(ret.z), "=h"(ret.w) : __LDG_PTR (ptr)); return ret; }
  99 static __device__ __inline__ int2 __ldg(const int2 *ptr) { int2 ret; asm volatile ("ld.global.nc.v2.s32 {%0,%1}, [%2];"  : "=r"(ret.x), "=r"(ret.y) : __LDG_PTR (ptr)); return ret; }
 100 static __device__ __inline__ int4 __ldg(const int4 *ptr) { int4 ret; asm volatile ("ld.global.nc.v4.s32 {%0,%1,%2,%3}, [%4];"  : "=r"(ret.x), "=r"(ret.y), "=r"(ret.z), "=r"(ret.w) : __LDG_PTR (ptr)); return ret; }
 101 static __device__ __inline__ longlong2 __ldg(const longlong2 *ptr) { longlong2 ret; asm volatile ("ld.global.nc.v2.s64 %0, [%1];"  : "=l"(ret.x), "=l"(ret.y) : __LDG_PTR (ptr)); return ret; }
 102
 103 static __device__ __inline__ unsigned char __ldg(const unsigned char *ptr) { unsigned int ret; asm volatile ("ld.global.nc.u8 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr));  return (unsigned char)ret; }
 104 static __device__ __inline__ unsigned short __ldg(const unsigned short *ptr) { unsigned short ret; asm volatile ("ld.global.nc.u16 %0, [%1];"  : "=h"(ret) : __LDG_PTR (ptr)); return ret; }
 105 static __device__ __inline__ unsigned int __ldg(const unsigned int *ptr) { unsigned int ret; asm volatile ("ld.global.nc.u32 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr)); return ret; }
 106 static __device__ __inline__ unsigned long long __ldg(const unsigned long long *ptr) { unsigned long long ret; asm volatile ("ld.global.nc.u64 %0, [%1];"  : "=l"(ret) : __LDG_PTR (ptr)); return ret; }
 107 static __device__ __inline__ uchar2 __ldg(const uchar2 *ptr) { uchar2 ret; uint2 tmp; asm volatile ("ld.global.nc.v2.u8 {%0,%1}, [%2];"  : "=r"(tmp.x), "=r"(tmp.y) : __LDG_PTR (ptr)); ret.x = (unsigned char)tmp.x; ret.y = (unsigned char)tmp.y; return ret; }
 108 static __device__ __inline__ uchar4 __ldg(const uchar4 *ptr) { uchar4 ret; uint4 tmp; asm volatile ("ld.global.nc.v4.u8 {%0,%1,%2,%3}, [%4];"  : "=r"(tmp.x), "=r"(tmp.y), "=r"(tmp.z), "=r"(tmp.w) : __LDG_PTR (ptr)); ret.x = (unsigned char)tmp.x; ret.y = (unsigned char)tmp.y; ret.z = (unsigned char)tmp.z; ret.w = (unsigned char)tmp.w; return ret; }
 109 static __device__ __inline__ ushort2 __ldg(const ushort2 *ptr) { ushort2 ret; asm volatile ("ld.global.nc.v2.u16 {%0,%1}, [%2];"  : "=h"(ret.x), "=h"(ret.y) : __LDG_PTR (ptr)); return ret; }
 110 static __device__ __inline__ ushort4 __ldg(const ushort4 *ptr) { ushort4 ret; asm volatile ("ld.global.nc.v4.u16 {%0,%1,%2,%3}, [%4];"  : "=h"(ret.x), "=h"(ret.y), "=h"(ret.z), "=h"(ret.w) : __LDG_PTR (ptr)); return ret; }
 111 static __device__ __inline__ uint2 __ldg(const uint2 *ptr) { uint2 ret; asm volatile ("ld.global.nc.v2.u32 {%0,%1}, [%2];"  : "=r"(ret.x), "=r"(ret.y) : __LDG_PTR (ptr)); return ret; }
 112 static __device__ __inline__ uint4 __ldg(const uint4 *ptr) { uint4 ret; asm volatile ("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4];"  : "=r"(ret.x), "=r"(ret.y), "=r"(ret.z), "=r"(ret.w) : __LDG_PTR (ptr)); return ret; }
 113 static __device__ __inline__ ulonglong2 __ldg(const ulonglong2 *ptr) { ulonglong2 ret; asm volatile ("ld.global.nc.v2.u64 %0, [%1];"  : "=l"(ret.x), "=l"(ret.y) : __LDG_PTR (ptr)); return ret; }
 114
 115 static __device__ __inline__ float __ldg(const float *ptr) { float ret; asm volatile ("ld.global.nc.f32 %0, [%1];"  : "=f"(ret) : __LDG_PTR (ptr)); return ret; }
 116 static __device__ __inline__ double __ldg(const double *ptr) { double ret; asm volatile ("ld.global.nc.f64 %0, [%1];"  : "=d"(ret) : __LDG_PTR (ptr)); return ret; }
 117 static __device__ __inline__ float2 __ldg(const float2 *ptr) { float2 ret; asm volatile ("ld.global.nc.v2.f32 {%0,%1}, [%2];"  : "=f"(ret.x), "=f"(ret.y) : __LDG_PTR (ptr)); return ret; }
 118 static __device__ __inline__ float4 __ldg(const float4 *ptr) { float4 ret; asm volatile ("ld.global.nc.v4.f32 {%0,%1,%2,%3}, [%4];"  : "=f"(ret.x), "=f"(ret.y), "=f"(ret.z), "=f"(ret.w) : __LDG_PTR (ptr)); return ret; }
 119 static __device__ __inline__ double2 __ldg(const double2 *ptr) { double2 ret; asm volatile ("ld.global.nc.v2.f64 {%0,%1}, [%2];"  : "=d"(ret.x), "=d"(ret.y) : __LDG_PTR (ptr)); return ret; }
 120
 121 #undef __LDG_PTR
 122
 123
// SHF is the "funnel shift" operation - an accelerated left/right shift with carry
// operating on 64-bit quantities, which are concatenations of two 32-bit registers.
// Both PTX shift-amount modes are exposed: the plain intrinsics (__funnelshift_l and
// __funnelshift_r) use "wrap", the "c"-suffixed ones (__funnelshift_lc and
// __funnelshift_rc) use "clamp".
 128
 129 // This shifts [b:a] left by "shift" bits, returning the most significant bits of the result.
 130 static __device__ inline unsigned int __funnelshift_l(unsigned int lo, unsigned int hi, unsigned int shift)
 131 {
 132     unsigned int ret;
 133     asm volatile ("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(lo), "r"(hi), "r"(shift));
 134     return ret;
 135 }
 136 static __device__ inline unsigned int __funnelshift_lc(unsigned int lo, unsigned int hi, unsigned int shift)
 137 {
 138     unsigned int ret;
 139     asm volatile ("shf.l.clamp.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(lo), "r"(hi), "r"(shift));
 140     return ret;
 141 }
 142
 143 // This shifts [b:a] right by "shift" bits, returning the least significant bits of the result.
 144 static __device__ inline unsigned int __funnelshift_r(unsigned int lo, unsigned int hi, unsigned int shift)
 145 {
 146     unsigned int ret;
 147     asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(lo), "r"(hi), "r"(shift));
 148     return ret;
 149 }
 150 static __device__ inline unsigned int __funnelshift_rc(unsigned int lo, unsigned int hi, unsigned int shift)
 151 {
 152     unsigned int ret;
 153     asm volatile ("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(lo), "r"(hi), "r"(shift));
 154     return ret;
 155 }
 156
 157
 158 #endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 320 */
 159
 160 #endif /* __cplusplus && __CUDACC__ */
 161
 162 #endif /* !__SM_32_INTRINSICS_H__ */