util/X86LINUX64/cuda-6.5/include/curand_mtgp32_kernel.h

   1 /*
   2  * Copyright 2010-2014 NVIDIA Corporation.  All rights reserved.
   3  *
   4  * NOTICE TO LICENSEE:
   5  *
   6  * This source code and/or documentation ("Licensed Deliverables") are
   7  * subject to NVIDIA intellectual property rights under U.S. and
   8  * international Copyright laws.
   9  *
  10  * These Licensed Deliverables contained herein is PROPRIETARY and
  11  * CONFIDENTIAL to NVIDIA and is being provided under the terms and
  12  * conditions of a form of NVIDIA software license agreement by and
  13  * between NVIDIA and Licensee ("License Agreement") or electronically
  14  * accepted by Licensee.  Notwithstanding any terms or conditions to
  15  * the contrary in the License Agreement, reproduction or disclosure
  16  * of the Licensed Deliverables to any third party without the express
  17  * written consent of NVIDIA is prohibited.
  18  *
  19  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
  20  * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
  21  * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
  22  * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
  23  * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
  24  * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
  25  * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
  26  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
  27  * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
  28  * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
  29  * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
  30  * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
  31  * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
  32  * OF THESE LICENSED DELIVERABLES.
  33  *
  34  * U.S. Government End Users.  These Licensed Deliverables are a
  35  * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
  36  * 1995), consisting of "commercial computer software" and "commercial
  37  * computer software documentation" as such terms are used in 48
  38  * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
  39  * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
  40  * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
  41  * U.S. Government End Users acquire the Licensed Deliverables with
  42  * only those rights set forth herein.
  43  *
  44  * Any use of the Licensed Deliverables in individual and commercial
  45  * software must include, in the user documentation and internal
  46  * comments to the code, the above Disclaimer and U.S. Government End
  47  * Users Notice.
  48  */
  49
  50 /*
  51  * curand_mtgp32_kernel.h
  52  *
  53  *
  54  * MTGP32-11213
  55  *
  56  * Mersenne Twister RNG for the GPU
  57  *
  58  * The period of generated integers is 2<sup>11213</sup>-1.
  59  *
  60  * This code generates 32-bit unsigned integers, and
  61  * single precision floating point numbers uniformly distributed
  62  * in the range [1, 2). (float r; 1.0 <= r < 2.0)
  63  */
  64
  65 /*
  66  * Copyright (c) 2009, 2010 Mutsuo Saito, Makoto Matsumoto and Hiroshima
  67  * University.  All rights reserved.
  68  * Copyright (c) 2011 Mutsuo Saito, Makoto Matsumoto, Hiroshima
  69  * University and University of Tokyo.  All rights reserved.
  70  *
  71  * Redistribution and use in source and binary forms, with or without
  72  * modification, are permitted provided that the following conditions are
  73  * met:
  74  *
  75  *     * Redistributions of source code must retain the above copyright
  76  *       notice, this list of conditions and the following disclaimer.
  77  *     * Redistributions in binary form must reproduce the above
  78  *       copyright notice, this list of conditions and the following
  79  *       disclaimer in the documentation and/or other materials provided
  80  *       with the distribution.
  81  *     * Neither the name of the Hiroshima University nor the names of
  82  *       its contributors may be used to endorse or promote products
  83  *       derived from this software without specific prior written
  84  *       permission.
  85  *
  86  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  87  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  88  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  89  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  90  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  91  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  92  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  93  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  94  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  95  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  96  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  97  */
  98 #if !defined CURAND_MTGP32_KERNEL_H
  99 #define CURAND_MTGP32_KERNEL_H
 100
 101 //#if !defined(QUALIFIERS)
 102 //#define QUALIFIERS static inline __device__
 103 #define QUALIFIERS_MTGP32 static __forceinline__ __device__ __host__
 104 //#endif
 105
 106 #include <cuda.h>
 107 #include <stdlib.h>
 108 #include <memory.h>
 109 #include <string.h>
 110 #include "curand.h"
 111 #include "curand_mtgp32.h"
 112
 113 /**
 114  * \addtogroup DEVICE Device API
 115  *
 116  * @{
 117  */
 118
 119 #ifndef __CUDA_ARCH__
 120 // define blockDim and threadIdx for host compatibility call
 121 extern const dim3 blockDim;
 122 extern const uint3 threadIdx;
 123 #endif
 124
 125
 126 /*
 127  * The function of the recursion formula calculation.
 128  *
 129  * @param[in] X1 the farthest part of state array.
 130  * @param[in] X2 the second farthest part of state array.
 131  * @param[in] Y a part of state array.
 132  * @param[in] bid block id.
 133  * @return output
 134  */
 135 QUALIFIERS_MTGP32 unsigned int para_rec(mtgp32_kernel_params_t * k,unsigned int X1, unsigned int X2, unsigned int Y, int bid) {
 136     unsigned int X = (X1 & k->mask[0]) ^ X2;
 137     unsigned int MAT;
 138
 139     X ^= X << k->sh1_tbl[bid];
 140     Y = X ^ (Y >> k->sh2_tbl[bid]);
 141     MAT = k->param_tbl[bid][Y & 0x0f];
 142     return Y ^ MAT;
 143 }
 144
 145 /*
 146  * The tempering function.
 147  *
 148  * @param[in] V the output value should be tempered.
 149  * @param[in] T the tempering helper value.
 150  * @param[in] bid block id.
 151  * @return the tempered value.
 152  */
 153 QUALIFIERS_MTGP32 unsigned int temper(mtgp32_kernel_params_t * k,unsigned int V, unsigned int T, int bid) {
 154     unsigned int MAT;
 155
 156     T ^= T >> 16;
 157     T ^= T >> 8;
 158     MAT = k->temper_tbl[bid][T & 0x0f];
 159     return V ^ MAT;
 160 }
 161
 162 /*
 163  * The tempering and converting function.
 164  * By using the preset table, converting to IEEE format
 165  * and tempering are done simultaneously.
 166  *
 167  * @param[in] V the output value should be tempered.
 168  * @param[in] T the tempering helper value.
 169  * @param[in] bid block id.
 170  * @return the tempered and converted value.
 171  */
 172 QUALIFIERS_MTGP32 unsigned int temper_single(mtgp32_kernel_params_t * k,unsigned int V, unsigned int T, int bid) {
 173     unsigned int MAT;
 174     unsigned int r;
 175
 176     T ^= T >> 16;
 177     T ^= T >> 8;
 178     MAT = k->single_temper_tbl[bid][T & 0x0f];
 179     r = (V >> 9) ^ MAT;
 180     return r;
 181 }
 182
 183 /**
 184  * \brief Return 32-bits of pseudorandomness from a mtgp32 generator.
 185  *
 186  * Return 32-bits of pseudorandomness from the mtgp32 generator in \p state,
 187  * increment position of generator by the number of threads in the block.
 188  * Note the number of threads in the block can not exceed 256.
 189  *
 190  * \param state - Pointer to state to update
 191  *
 192  * \return 32-bits of pseudorandomness as an unsigned int, all bits valid to use.
 193  */
 194 QUALIFIERS_MTGP32 unsigned int curand(curandStateMtgp32_t *state)
 195 {
 196     unsigned int t;
 197     unsigned int d;
 198     int pos = state->k->pos_tbl[state->pIdx];
 199     unsigned int r;
 200     unsigned int o;
 201
 202     d = blockDim.z * blockDim.y * blockDim.x;
 203     //assert( d <= 256 );
 204     t = (blockDim.z * blockDim.y * threadIdx.z) + (blockDim.x * threadIdx.y) + threadIdx.x;
 205     r = para_rec(state->k, state->s[(t + state->offset) & MTGP32_STATE_MASK],
 206              state->s[(t + state->offset + 1) & MTGP32_STATE_MASK],
 207              state->s[(t + state->offset + pos) & MTGP32_STATE_MASK],
 208              state->pIdx);
 209
 210     state->s[(t + state->offset + MTGPDC_N) & MTGP32_STATE_MASK] = r;
 211     o = temper(state->k, r,
 212            state->s[(t + state->offset + pos -1) & MTGP32_STATE_MASK],
 213            state->pIdx);
 214 #if __CUDA_ARCH__ != 0
 215     __syncthreads();
 216 #endif
 217     if (t == 0)
 218     {
 219         state->offset = (state->offset + d) & MTGP32_STATE_MASK;
 220     }
 221 #if __CUDA_ARCH__ != 0
 222     __syncthreads();
 223 #endif
 224     return o;
 225
 226 }
 227 /**
 228  * \brief Return 32-bits of pseudorandomness from a specific position in a mtgp32 generator.
 229  *
 230  * Return 32-bits of pseudorandomness from position \p index of the mtgp32 generator in \p state,
 231  * increment position of generator by \p n positions, which must be the total number of positions
 232  * upddated in the state by the thread block, for this invocation.
 233  *
 234  * Note :
 235  * Thread indices must range from 0...\ n - 1.
 236  * The number of positions updated may not exceed 256.
 237  * A thread block may update more than one state, but a given state may not be updated by more than one thread block.
 238  *
 239  * \param state - Pointer to state to update
 240  * \param index - Index (0..255) of the position within the state to draw from and update
 241  * \param n - The total number of postions in this state that are being updated by this invocation
 242  *
 243  * \return 32-bits of pseudorandomness as an unsigned int, all bits valid to use.
 244  */
 245 QUALIFIERS_MTGP32 unsigned int curand_mtgp32_specific(curandStateMtgp32_t *state, unsigned char index, unsigned char n)
 246 {
 247     unsigned int t;
 248     int pos = state->k->pos_tbl[state->pIdx];
 249     unsigned int r;
 250     unsigned int o;
 251
 252     t = index;
 253     r = para_rec(state->k, state->s[(t + state->offset) & MTGP32_STATE_MASK],
 254              state->s[(t + state->offset + 1) & MTGP32_STATE_MASK],
 255              state->s[(t + state->offset + pos) & MTGP32_STATE_MASK],
 256              state->pIdx);
 257
 258     state->s[(t + state->offset + MTGPDC_N) & MTGP32_STATE_MASK] = r;
 259     o = temper(state->k, r,
 260            state->s[(t + state->offset + pos -1) & MTGP32_STATE_MASK],
 261            state->pIdx);
 262 #if __CUDA_ARCH__ != 0
 263     __syncthreads();
 264 #endif
 265     if (index == 0)
 266     {
 267         state->offset = (state->offset + n) & MTGP32_STATE_MASK;
 268     }
 269 #if __CUDA_ARCH__ != 0
 270     __syncthreads();
 271 #endif
 272     return o;
 273 }
 274 /**
 275  * \brief Return a uniformly distributed float from a mtgp32 generator.
 276  *
 277  * Return a uniformly distributed float between \p 0.0f and \p 1.0f
 278  * from the mtgp32 generator in \p state, increment position of generator.
 279  * Output range excludes \p 0.0f but includes \p 1.0f.  Denormalized floating
 280  * point outputs are never returned.
 281  *
 282  * Note: This alternate derivation of a uniform float is provided for completeness
 283  * with the original source
 284  *
 285  * \param state - Pointer to state to update
 286  *
 287  * \return uniformly distributed float between \p 0.0f and \p 1.0f
 288  */
 289 QUALIFIERS_MTGP32 float curand_mtgp32_single(curandStateMtgp32_t *state)
 290 {
 291     unsigned int t;
 292     unsigned int d;
 293     int pos = state->k->pos_tbl[state->pIdx];
 294     unsigned int r;
 295     union mtgp32_u_to_f {
 296         unsigned int u;
 297         float f;
 298     }o;
 299
 300
 301     t = blockDim.z * blockDim.y;
 302     d = t * blockDim.x;
 303     //assert( d <= 256 );
 304     t += threadIdx.x;
 305     r = para_rec(state->k, state->s[(t + state->offset) & MTGP32_STATE_MASK],
 306              state->s[(t + state->offset + 1) & MTGP32_STATE_MASK],
 307              state->s[(t + state->offset + pos) & MTGP32_STATE_MASK],
 308              state->pIdx);
 309
 310     state->s[t] = r;
 311     o.u = temper_single(state->k, r,
 312                         state->s[(t + state->offset + pos -1) & MTGP32_STATE_MASK],
 313                         state->pIdx);
 314 #ifdef __CUDA_ARCH__
 315     __syncthreads();
 316 #endif
 317     if (threadIdx.x == 0)
 318     {
 319         state->offset = (state->offset + d) & MTGP32_STATE_MASK;
 320     }
 321 #if __CUDA_ARCH__ != 0
 322     __syncthreads();
 323 #endif
 324     return o.f;
 325 }
 326
 327 /**
 328  * \brief Return a uniformly distributed float from a specific position in a mtgp32 generator.
 329  *
 330  * Return a uniformly distributed float between \p 0.0f and \p 1.0f
 331  * from position \p index of the mtgp32 generator in \p state, and
 332  * increment position of generator by \p n positions, which must be the total number of positions
 333  * upddated in the state by the thread block, for this invocation.
 334  * Output range excludes \p 0.0f but includes \p 1.0f.  Denormalized floating
 335  * point outputs are never returned.
 336  *
 337  * Note 1:
 338  * Thread indices must range from 0...\p n - 1.
 339  * The number of positions updated may not exceed 256.
 340  * A thread block may update more than one state, but a given state may not be updated by more than one thread block.
 341  *
 342  * Note 2: This alternate derivation of a uniform float is provided for completeness
 343  * with the original source
 344  *
 345  * \param state - Pointer to state to update
 346  * \param index - Index (0..255) of the position within the state to draw from and update
 347  * \param n - The total number of postions in this state that are being updated by this invocation
 348  *
 349  * \return uniformly distributed float between \p 0.0f and \p 1.0f
 350  */
 351 QUALIFIERS_MTGP32 float curand_mtgp32_single_specific(curandStateMtgp32_t *state, unsigned char index, unsigned char n)
 352 {
 353     unsigned int t;
 354     int pos = state->k->pos_tbl[state->pIdx];
 355     unsigned int r;
 356     union mtgp32_u_to_f {
 357         unsigned int u;
 358         float f;
 359     }o;
 360
 361
 362     t = index;
 363     r = para_rec(state->k, state->s[(t + state->offset) & MTGP32_STATE_MASK],
 364              state->s[(t + state->offset + 1) & MTGP32_STATE_MASK],
 365              state->s[(t + state->offset + pos) & MTGP32_STATE_MASK],
 366              state->pIdx);
 367
 368     state->s[t] = r;
 369     o.u = temper_single(state->k, r,
 370                         state->s[(t + state->offset + pos -1) & MTGP32_STATE_MASK],
 371                         state->pIdx);
 372 #ifdef __CUDA_ARCH__
 373     __syncthreads();
 374 #endif
 375     if (threadIdx.x == 0)
 376     {
 377         state->offset = (state->offset + n) & MTGP32_STATE_MASK;
 378     }
 379 #if __CUDA_ARCH__ != 0
 380     __syncthreads();
 381 #endif
 382     return o.f;
 383 }
 384
 385 /** @} */
 386
 387 #endif
 388
 389 #undef QUALIFIERS_MTGP32