2 * Copyright 2010-2014 NVIDIA Corporation. All rights reserved.
6 * This source code and/or documentation ("Licensed Deliverables") are
7 * subject to NVIDIA intellectual property rights under U.S. and
8 * international Copyright laws.
10 * These Licensed Deliverables contained herein is PROPRIETARY and
11 * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12 * conditions of a form of NVIDIA software license agreement by and
13 * between NVIDIA and Licensee ("License Agreement") or electronically
14 * accepted by Licensee. Notwithstanding any terms or conditions to
15 * the contrary in the License Agreement, reproduction or disclosure
16 * of the Licensed Deliverables to any third party without the express
17 * written consent of NVIDIA is prohibited.
19 * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20 * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21 * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22 * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23 * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24 * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25 * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26 * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27 * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28 * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29 * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30 * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31 * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32 * OF THESE LICENSED DELIVERABLES.
34 * U.S. Government End Users. These Licensed Deliverables are a
35 * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36 * 1995), consisting of "commercial computer software" and "commercial
37 * computer software documentation" as such terms are used in 48
38 * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39 * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40 * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41 * U.S. Government End Users acquire the Licensed Deliverables with
42 * only those rights set forth herein.
44 * Any use of the Licensed Deliverables in individual and commercial
45 * software must include, in the user documentation and internal
46 * comments to the code, the above Disclaimer and U.S. Government End
51 * curand_mtgp32_kernel.h
56 * Mersenne Twister RNG for the GPU
58 * The period of generated integers is 2<sup>11213</sup>-1.
60 * This code generates 32-bit unsigned integers, and
61 * single precision floating point numbers uniformly distributed
62 * in the range [1, 2). (float r; 1.0 <= r < 2.0)
66 * Copyright (c) 2009, 2010 Mutsuo Saito, Makoto Matsumoto and Hiroshima
67 * University. All rights reserved.
68 * Copyright (c) 2011 Mutsuo Saito, Makoto Matsumoto, Hiroshima
69 * University and University of Tokyo. All rights reserved.
71 * Redistribution and use in source and binary forms, with or without
72 * modification, are permitted provided that the following conditions are
75 * * Redistributions of source code must retain the above copyright
76 * notice, this list of conditions and the following disclaimer.
77 * * Redistributions in binary form must reproduce the above
78 * copyright notice, this list of conditions and the following
79 * disclaimer in the documentation and/or other materials provided
80 * with the distribution.
81 * * Neither the name of the Hiroshima University nor the names of
82 * its contributors may be used to endorse or promote products
83 * derived from this software without specific prior written
86 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
87 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
88 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
89 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
90 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
91 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
92 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
93 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
94 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
95 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
96 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
98 #if !defined CURAND_MTGP32_KERNEL_H
99 #define CURAND_MTGP32_KERNEL_H
101 //#if !defined(QUALIFIERS)
102 //#define QUALIFIERS static inline __device__
103 #define QUALIFIERS_MTGP32 static __forceinline__ __device__ __host__
111 #include "curand_mtgp32.h"
114 * \addtogroup DEVICE Device API
119 #ifndef __CUDA_ARCH__
120 // define blockDim and threadIdx for host compatibility call
121 extern const dim3 blockDim;
122 extern const uint3 threadIdx;
127 * The function of the recursion formula calculation.
129 * @param[in] X1 the farthest part of state array.
130 * @param[in] X2 the second farthest part of state array.
131 * @param[in] Y a part of state array.
132 * @param[in] bid block id.
135 QUALIFIERS_MTGP32 unsigned int para_rec(mtgp32_kernel_params_t * k,unsigned int X1, unsigned int X2, unsigned int Y, int bid) {
136 unsigned int X = (X1 & k->mask[0]) ^ X2;
139 X ^= X << k->sh1_tbl[bid];
140 Y = X ^ (Y >> k->sh2_tbl[bid]);
141 MAT = k->param_tbl[bid][Y & 0x0f];
146 * The tempering function.
148 * @param[in] V the output value should be tempered.
149 * @param[in] T the tempering helper value.
150 * @param[in] bid block id.
151 * @return the tempered value.
153 QUALIFIERS_MTGP32 unsigned int temper(mtgp32_kernel_params_t * k,unsigned int V, unsigned int T, int bid) {
158 MAT = k->temper_tbl[bid][T & 0x0f];
163 * The tempering and converting function.
164 * By using the preset table, converting to IEEE format
165 * and tempering are done simultaneously.
167 * @param[in] V the output value should be tempered.
168 * @param[in] T the tempering helper value.
169 * @param[in] bid block id.
170 * @return the tempered and converted value.
172 QUALIFIERS_MTGP32 unsigned int temper_single(mtgp32_kernel_params_t * k,unsigned int V, unsigned int T, int bid) {
178 MAT = k->single_temper_tbl[bid][T & 0x0f];
184 * \brief Return 32-bits of pseudorandomness from a mtgp32 generator.
186 * Return 32-bits of pseudorandomness from the mtgp32 generator in \p state,
187 * increment position of generator by the number of threads in the block.
188 * Note the number of threads in the block can not exceed 256.
190 * \param state - Pointer to state to update
192 * \return 32-bits of pseudorandomness as an unsigned int, all bits valid to use.
194 QUALIFIERS_MTGP32 unsigned int curand(curandStateMtgp32_t *state)
198 int pos = state->k->pos_tbl[state->pIdx];
202 d = blockDim.z * blockDim.y * blockDim.x;
203 //assert( d <= 256 );
204 t = (blockDim.z * blockDim.y * threadIdx.z) + (blockDim.x * threadIdx.y) + threadIdx.x;
205 r = para_rec(state->k, state->s[(t + state->offset) & MTGP32_STATE_MASK],
206 state->s[(t + state->offset + 1) & MTGP32_STATE_MASK],
207 state->s[(t + state->offset + pos) & MTGP32_STATE_MASK],
210 state->s[(t + state->offset + MTGPDC_N) & MTGP32_STATE_MASK] = r;
211 o = temper(state->k, r,
212 state->s[(t + state->offset + pos -1) & MTGP32_STATE_MASK],
214 #if __CUDA_ARCH__ != 0
219 state->offset = (state->offset + d) & MTGP32_STATE_MASK;
221 #if __CUDA_ARCH__ != 0
228 * \brief Return 32-bits of pseudorandomness from a specific position in a mtgp32 generator.
230 * Return 32-bits of pseudorandomness from position \p index of the mtgp32 generator in \p state,
231 * increment position of generator by \p n positions, which must be the total number of positions
232 * upddated in the state by the thread block, for this invocation.
235 * Thread indices must range from 0...\ n - 1.
236 * The number of positions updated may not exceed 256.
237 * A thread block may update more than one state, but a given state may not be updated by more than one thread block.
239 * \param state - Pointer to state to update
240 * \param index - Index (0..255) of the position within the state to draw from and update
241 * \param n - The total number of postions in this state that are being updated by this invocation
243 * \return 32-bits of pseudorandomness as an unsigned int, all bits valid to use.
245 QUALIFIERS_MTGP32 unsigned int curand_mtgp32_specific(curandStateMtgp32_t *state, unsigned char index, unsigned char n)
248 int pos = state->k->pos_tbl[state->pIdx];
253 r = para_rec(state->k, state->s[(t + state->offset) & MTGP32_STATE_MASK],
254 state->s[(t + state->offset + 1) & MTGP32_STATE_MASK],
255 state->s[(t + state->offset + pos) & MTGP32_STATE_MASK],
258 state->s[(t + state->offset + MTGPDC_N) & MTGP32_STATE_MASK] = r;
259 o = temper(state->k, r,
260 state->s[(t + state->offset + pos -1) & MTGP32_STATE_MASK],
262 #if __CUDA_ARCH__ != 0
267 state->offset = (state->offset + n) & MTGP32_STATE_MASK;
269 #if __CUDA_ARCH__ != 0
275 * \brief Return a uniformly distributed float from a mtgp32 generator.
277 * Return a uniformly distributed float between \p 0.0f and \p 1.0f
278 * from the mtgp32 generator in \p state, increment position of generator.
279 * Output range excludes \p 0.0f but includes \p 1.0f. Denormalized floating
280 * point outputs are never returned.
282 * Note: This alternate derivation of a uniform float is provided for completeness
283 * with the original source
285 * \param state - Pointer to state to update
287 * \return uniformly distributed float between \p 0.0f and \p 1.0f
289 QUALIFIERS_MTGP32 float curand_mtgp32_single(curandStateMtgp32_t *state)
293 int pos = state->k->pos_tbl[state->pIdx];
295 union mtgp32_u_to_f {
301 t = blockDim.z * blockDim.y;
303 //assert( d <= 256 );
305 r = para_rec(state->k, state->s[(t + state->offset) & MTGP32_STATE_MASK],
306 state->s[(t + state->offset + 1) & MTGP32_STATE_MASK],
307 state->s[(t + state->offset + pos) & MTGP32_STATE_MASK],
311 o.u = temper_single(state->k, r,
312 state->s[(t + state->offset + pos -1) & MTGP32_STATE_MASK],
317 if (threadIdx.x == 0)
319 state->offset = (state->offset + d) & MTGP32_STATE_MASK;
321 #if __CUDA_ARCH__ != 0
328 * \brief Return a uniformly distributed float from a specific position in a mtgp32 generator.
330 * Return a uniformly distributed float between \p 0.0f and \p 1.0f
331 * from position \p index of the mtgp32 generator in \p state, and
332 * increment position of generator by \p n positions, which must be the total number of positions
333 * upddated in the state by the thread block, for this invocation.
334 * Output range excludes \p 0.0f but includes \p 1.0f. Denormalized floating
335 * point outputs are never returned.
338 * Thread indices must range from 0...\p n - 1.
339 * The number of positions updated may not exceed 256.
340 * A thread block may update more than one state, but a given state may not be updated by more than one thread block.
342 * Note 2: This alternate derivation of a uniform float is provided for completeness
343 * with the original source
345 * \param state - Pointer to state to update
346 * \param index - Index (0..255) of the position within the state to draw from and update
347 * \param n - The total number of postions in this state that are being updated by this invocation
349 * \return uniformly distributed float between \p 0.0f and \p 1.0f
351 QUALIFIERS_MTGP32 float curand_mtgp32_single_specific(curandStateMtgp32_t *state, unsigned char index, unsigned char n)
354 int pos = state->k->pos_tbl[state->pIdx];
356 union mtgp32_u_to_f {
363 r = para_rec(state->k, state->s[(t + state->offset) & MTGP32_STATE_MASK],
364 state->s[(t + state->offset + 1) & MTGP32_STATE_MASK],
365 state->s[(t + state->offset + pos) & MTGP32_STATE_MASK],
369 o.u = temper_single(state->k, r,
370 state->s[(t + state->offset + pos -1) & MTGP32_STATE_MASK],
375 if (threadIdx.x == 0)
377 state->offset = (state->offset + n) & MTGP32_STATE_MASK;
379 #if __CUDA_ARCH__ != 0
389 #undef QUALIFIERS_MTGP32