2 * Copyright 1993-2013 NVIDIA Corporation. All rights reserved.
4 * Please refer to the NVIDIA end user license agreement (EULA) associated
5 * with this source code for terms and conditions that govern your use of
6 * this software. Any use, reproduction, disclosure, or distribution of
7 * this software and related documentation outside the terms of the EULA
8 * is strictly prohibited.
12 // Type-specific implementation of rounded arithmetic operators.
13 // Thin layer over the CUDA intrinsics.
15 #ifndef CUDA_INTERVAL_ROUNDED_ARITH_H
16 #define CUDA_INTERVAL_ROUNDED_ARITH_H
18 // Temporary workaround for CUDA 3.0/3.1 on SM 1.3 devices:
19 // missing double-precision div/sqrt with directed rounding
20 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ <= 130
24 // Generic class, no actual implementation yet
28 __device__ T add_down(const T &x, const T &y);
29 __device__ T add_up(const T &x, const T &y);
30 __device__ T sub_down(const T &x, const T &y);
31 __device__ T sub_up(const T &x, const T &y);
32 __device__ T mul_down(const T &x, const T &y);
33 __device__ T mul_up(const T &x, const T &y);
34 __device__ T div_down(const T &x, const T &y);
35 __device__ T div_up(const T &x, const T &y);
36 __device__ T median(const T &x, const T &y);
37 __device__ T sqrt_down(const T &x);
38 __device__ T sqrt_up(const T &x);
39 __device__ T int_down(const T &x);
40 __device__ T int_up(const T &x);
42 __device__ T pos_inf();
43 __device__ T neg_inf();
44 __device__ __host__ T nan();
45 __device__ T min(T const &x, T const &y);
46 __device__ T max(T const &x, T const &y);
50 // Specialization for float
52 struct rounded_arith<float>
54 __device__ float add_down(const float &x, const float &y)
56 return __fadd_rd(x, y);
59 __device__ float add_up(const float &x, const float &y)
61 return __fadd_ru(x, y);
64 __device__ float sub_down(const float &x, const float &y)
66 return __fadd_rd(x, -y);
69 __device__ float sub_up(const float &x, const float &y)
71 return __fadd_ru(x, -y);
74 __device__ float mul_down(const float &x, const float &y)
76 return __fmul_rd(x, y);
79 __device__ float mul_up(const float &x, const float &y)
81 return __fmul_ru(x, y);
84 __device__ float div_down(const float &x, const float &y)
86 return __fdiv_rd(x, y);
89 __device__ float div_up(const float &x, const float &y)
91 return __fdiv_ru(x, y);
94 __device__ float median(const float &x, const float &y)
99 __device__ float sqrt_down(const float &x)
101 return __fsqrt_rd(x);
104 __device__ float sqrt_up(const float &x)
106 return __fsqrt_ru(x);
109 __device__ float int_down(const float &x)
114 __device__ float int_up(const float &x)
119 __device__ float neg_inf()
121 return __int_as_float(0xff800000);
124 __device__ float pos_inf()
126 return __int_as_float(0x7f800000);
129 __device__ __host__ float nan()
134 __device__ float min(float const &x, float const &y)
139 __device__ float max(float const &x, float const &y)
145 __device__ double my_div_ru(double a, double b)
149 double y0 = (double)(1.0f / __double2float_rn(b)); // y0 22 bits
151 // Cubic iteration from Alex
152 double e1 = __fma_rn(-b, y0, 1.0);
153 double e2 = __fma_rn(e1, e1, e1);
154 double y2 = __fma_rn(y0, e2, y0); // y2 ~ 1/b 66 bits + rounding RN => faithful
156 double e3 = __fma_rn(-b, y2, 1.0); // M
157 double y3 = __fma_rn(y2, e3, y2); // y3 ~ 1/b correct except for 1/(2-eps)
159 double q0 = __dmul_rn(a, y0);
160 double r0 = __fma_rn(-q0, b, a); // M
161 double q1 = __fma_rn(r0, y2, q0); // q1 ~ a/b faithful
163 double r1 = __fma_ru(-q1, b, a); // M
164 double q2 = __fma_ru(r1, y3, q1); // q2 = a/b correct
165 return ldexp(q2, -expo);
168 __device__ double my_div_rd(double a, double b)
172 double y0 = (double)(1.0f / __double2float_rn(b)); // y0 22 bits
174 double e1 = __fma_rn(-b, y0, 1.0);
175 double e2 = __fma_rn(e1, e1, e1);
176 double y2 = __fma_rn(y0, e2, y0); // y2 ~ 1/b 66 bits + rounding RN => faithful
178 double e3 = __fma_rn(-b, y2, 1.0); // M
179 double y3 = __fma_rn(y2, e3, y2); // y3 ~ 1/b correct except for 1/(2-eps)
181 double q0 = __dmul_rn(a, y0);
182 double r0 = __fma_rn(-q0, b, a); // M
183 double q1 = __fma_rn(r0, y2, q0); // q1 ~ a/b faithful
185 double r1 = __fma_rd(-q1, b, a); // M
186 double q2 = __fma_rd(r1, y3, q1);
187 return ldexp(q2, -expo);
190 // Specialization for double
192 struct rounded_arith<double>
194 __device__ double add_down(const double &x, const double &y)
196 return __dadd_rd(x, y);
199 __device__ double add_up(const double &x, const double &y)
201 return __dadd_ru(x, y);
204 __device__ double sub_down(const double &x, const double &y)
206 return __dadd_rd(x, -y);
209 __device__ double sub_up(const double &x, const double &y)
211 return __dadd_ru(x, -y);
214 __device__ double mul_down(const double &x, const double &y)
216 return __dmul_rd(x, y);
219 __device__ double mul_up(const double &x, const double &y)
221 return __dmul_ru(x, y);
224 __device__ double div_down(const double &x, const double &y)
226 #ifndef NO_DOUBLE_DIV
227 return __ddiv_rd(x, y);
229 return my_div_rd(x, y);
233 __device__ double div_up(const double &x, const double &y)
235 #ifndef NO_DOUBLE_DIV
236 return __ddiv_ru(x, y);
238 return my_div_ru(x, y);
241 __device__ double median(const double &x, const double &y)
246 #ifndef NO_DOUBLE_DIV
247 __device__ double sqrt_down(const double &x)
249 return __dsqrt_rd(x);
252 __device__ double sqrt_up(const double &x)
254 return __dsqrt_ru(x);
257 __device__ double int_down(const double &x)
262 __device__ double int_up(const double &x)
267 __device__ double neg_inf()
269 return __longlong_as_double(0xfff0000000000000ull);
272 __device__ double pos_inf()
274 return __longlong_as_double(0x7ff0000000000000ull);
276 __device__ __host__ double nan()
281 __device__ double min(double const &x, double const &y)
286 __device__ double max(double const &x, double const &y)