/*
 *  Copyright 2008-2013 NVIDIA Corporation
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
/*! \file copy.h
 *  \brief CUDA implementation of device-to-device copy,
 *         based on Gregory Diamos' memcpy code.
 */

#pragma once
#include <thrust/detail/config.h>

#include <thrust/pair.h>
#include <thrust/iterator/iterator_traits.h>
#include <thrust/detail/type_traits.h>
#include <thrust/detail/dispatch/is_trivial_copy.h>
#include <thrust/detail/raw_reference_cast.h>

namespace thrust
{
namespace system
{
namespace cuda
{
namespace detail
{
namespace block
{
namespace trivial_copy_detail
{

template<typename Size>
inline __device__ thrust::pair<Size,Size> quotient_and_remainder(Size n, Size d)
{
  Size quotient  = n / d;
  Size remainder = n - d * quotient;
  return thrust::make_pair(quotient, remainder);
} // end quotient_and_remainder()
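
// Worked example: for num_bytes = 37 and d = sizeof(int2) = 8,
// quotient_and_remainder(37, 8) returns (4, 5) -- four 8-byte wide
// elements to copy with aligned_copy below, plus a 5-byte tail to
// copy one char at a time.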

// assumes the addresses dst & src are aligned to T boundaries
template<typename Context,
         typename T>
__device__ __thrust_forceinline__
void aligned_copy(Context context, T *dst, const T *src, unsigned int num_elements)
{
  for(unsigned int i = context.thread_index();
      i < num_elements;
      i += context.block_dimension())
  {
    dst[i] = src[i];
  }
} // end aligned_copy()

} // end namespace trivial_copy_detail

template <typename Context>
__device__ __thrust_forceinline__
void trivial_copy(Context context, void *destination_, const void *source_, size_t num_bytes)
{
  // reinterpret as bytes
  char *destination  = reinterpret_cast<char*>(destination_);
  const char *source = reinterpret_cast<const char*>(source_);

  // TODO replace this with uint64
#if THRUST_DEVICE_COMPILER != THRUST_DEVICE_COMPILER_NVCC
  // CUDA's vector types are unavailable outside nvcc; shadow them with
  // 8-byte stand-ins so the alignment logic below still compiles
  typedef long long int2;
  typedef long long uint2;
#endif // THRUST_DEVICE_COMPILER_NVCC

  // check alignment
  // XXX can we do this in three steps?
  //     1. copy until alignment is met
  //     2. copy with wide loads & stores
  //     3. get the remainder
  if(reinterpret_cast<size_t>(destination) % sizeof(uint2) != 0 || reinterpret_cast<size_t>(source) % sizeof(uint2) != 0)
  {
    // at least one pointer is misaligned; fall back to a byte-by-byte copy
    for(unsigned int i = context.thread_index(); i < num_bytes; i += context.block_dimension())
    {
      destination[i] = source[i];
    }
  }
  else
  {
    // it's aligned; do a wide copy

    // this pair stores the number of int2s in the aligned portion of the arrays
    // and the number of bytes in the remainder
    const thrust::pair<size_t,size_t> num_wide_elements_and_remainder_bytes =
      trivial_copy_detail::quotient_and_remainder(num_bytes, sizeof(int2));

    // copy int2 elements
    trivial_copy_detail::aligned_copy(context,
                                      reinterpret_cast<int2*>(destination),
                                      reinterpret_cast<const int2*>(source),
                                      num_wide_elements_and_remainder_bytes.first);

    // XXX we could copy int elements here

    // copy the remainder byte by byte

    // to find the beginning of the remainder arrays, point at the beginning and skip
    // past the aligned portion, i.e. sizeof(int2) times the number of int2s it comprises
    const char *remainder_first  = source      + sizeof(int2) * num_wide_elements_and_remainder_bytes.first;
    char       *remainder_result = destination + sizeof(int2) * num_wide_elements_and_remainder_bytes.first;

    trivial_copy_detail::aligned_copy(context, remainder_result, remainder_first, num_wide_elements_and_remainder_bytes.second);
  }
} // end trivial_copy()
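
// A minimal sketch of how a kernel might drive trivial_copy. The
// example_context type below is hypothetical, not part of this header;
// it merely models the Context concept used throughout this file
// (thread_index(), block_dimension(), barrier()):
//
//   struct example_context
//   {
//     __device__ unsigned int thread_index() const    { return threadIdx.x; }
//     __device__ unsigned int block_dimension() const { return blockDim.x;  }
//     __device__ void barrier() const                 { __syncthreads();    }
//   };
//
//   __global__ void example_memcpy_kernel(void *dst, const void *src, size_t num_bytes)
//   {
//     thrust::system::cuda::detail::block::trivial_copy(example_context(), dst, src, num_bytes);
//   }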

namespace detail
{
namespace dispatch
{

template<typename Context,
         typename RandomAccessIterator1,
         typename RandomAccessIterator2>
__thrust_forceinline__ __device__
RandomAccessIterator2 copy(Context context,
                           RandomAccessIterator1 first,
                           RandomAccessIterator1 last,
                           RandomAccessIterator2 result,
                           thrust::detail::true_type is_trivial_copy)
{
  typedef typename thrust::iterator_value<RandomAccessIterator1>::type T;

  const T *src = &thrust::raw_reference_cast(*first);
  T       *dst = &thrust::raw_reference_cast(*result);

  size_t n = (last - first);
  thrust::system::cuda::detail::block::trivial_copy(context, dst, src, n * sizeof(T));

  return result + n;
} // end copy()

template<typename Context,
         typename RandomAccessIterator1,
         typename RandomAccessIterator2>
__thrust_forceinline__ __device__
RandomAccessIterator2 copy(Context context,
                           RandomAccessIterator1 first,
                           RandomAccessIterator1 last,
                           RandomAccessIterator2 result,
                           thrust::detail::false_type is_trivial_copy)
{
  RandomAccessIterator2 end_of_output = result + (last - first);

  // advance iterators to this thread's starting position
  first  += context.thread_index();
  result += context.thread_index();

  // stride through the range, one element per thread per iteration
  for(;
      first < last;
      first  += context.block_dimension(),
      result += context.block_dimension())
  {
    *result = *first;
  }

  return end_of_output;
} // end copy()

} // end namespace dispatch
} // end namespace detail

template<typename Context,
         typename RandomAccessIterator1,
         typename RandomAccessIterator2>
__thrust_forceinline__ __device__
RandomAccessIterator2 copy(Context context,
                           RandomAccessIterator1 first,
                           RandomAccessIterator1 last,
                           RandomAccessIterator2 result)
{
  return detail::dispatch::copy(context, first, last, result,
#if __CUDA_ARCH__ < 200
    // the trivial path does not work reliably on pre-Fermi devices due to
    // "Warning: ... assuming global memory space" issues, so always copy elementwise
    thrust::detail::false_type()
#else
    typename thrust::detail::dispatch::is_trivial_copy<RandomAccessIterator1,RandomAccessIterator2>::type()
#endif // __CUDA_ARCH__
  );
} // end copy()
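
// The trivial/elementwise decision above is made entirely at compile time.
// Hypothetical usage sketch (example_context is defined in the comment above
// trivial_copy): raw pointers to the same trivially-copyable type satisfy
// is_trivial_copy and lower to the bytewise trivial_copy, whereas an exotic
// source such as a transform_iterator takes the elementwise path.
//
//   __global__ void example_copy_kernel(const float *in, float *out, size_t n)
//   {
//     // float* -> float* is a trivial copy, so this dispatches to trivial_copy
//     thrust::system::cuda::detail::block::copy(example_context(), in, in + n, out);
//   }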

template<typename Context, typename RandomAccessIterator1, typename Size, typename RandomAccessIterator2>
__device__ __thrust_forceinline__
RandomAccessIterator2 copy_n(Context &ctx, RandomAccessIterator1 first, Size n, RandomAccessIterator2 result)
{
  for(Size i = ctx.thread_index(); i < n; i += ctx.block_dimension())
  {
    result[i] = first[i];
  }

  // ensure the copy is visible to every thread in the block before returning
  ctx.barrier();

  return result + n;
} // end copy_n()
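
// A hypothetical sketch of copy_n staging data through shared memory
// (example_context is defined in the comment above trivial_copy; assumes
// n <= 256 and a one-dimensional block):
//
//   __global__ void example_stage_kernel(const int *in, int *out, int n)
//   {
//     __shared__ int buffer[256];
//     example_context ctx;
//
//     // the barrier inside copy_n makes buffer visible to the whole block
//     thrust::system::cuda::detail::block::copy_n(ctx, in, n, buffer);
//     thrust::system::cuda::detail::block::copy_n(ctx, buffer, n, out);
//   }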

} // end namespace block
} // end namespace detail
} // end namespace cuda
} // end namespace system
} // end namespace thrust