util/X86LINUX64/cuda-6.5/include/thrust/system/cuda/detail/block/copy.h

   1 /*
   2  *  Copyright 2008-2013 NVIDIA Corporation
   3  *
   4  *  Licensed under the Apache License, Version 2.0 (the "License");
   5  *  you may not use this file except in compliance with the License.
   6  *  You may obtain a copy of the License at
   7  *
   8  *      http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  *  Unless required by applicable law or agreed to in writing, software
  11  *  distributed under the License is distributed on an "AS IS" BASIS,
  12  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  *  See the License for the specific language governing permissions and
  14  *  limitations under the License.
  15  */
  16
  17 /*! \file copy.h
  18  *  \brief CUDA implementation of device-to-device copy,
  19  *         based on Gregory Diamos' memcpy code.
  20  */
  21
  22 #pragma once
  23
  24 #include <thrust/detail/config.h>
  25
  26 #include <thrust/pair.h>
  27
  28 #include <thrust/detail/type_traits.h>
  29 #include <thrust/detail/dispatch/is_trivial_copy.h>
  30 #include <thrust/detail/raw_reference_cast.h>
  31
  32 namespace thrust
  33 {
  34 namespace system
  35 {
  36 namespace cuda
  37 {
  38 namespace detail
  39 {
  40 namespace block
  41 {
  42
  43 namespace trivial_copy_detail
  44 {
  45
  46
  47 template<typename Size>
  48   inline __device__ thrust::pair<Size,Size> quotient_and_remainder(Size n, Size d)
  49 {
  50   Size quotient  = n / d;
  51   Size remainder = n - d * quotient;
  52   return thrust::make_pair(quotient,remainder);
  53 } // end quotient_and_remainder()
  54
  55
  56 // assumes the addresses dst & src are aligned to T boundaries
  57 template<typename Context,
  58          typename T>
  59 __device__ __thrust_forceinline__
  60 void aligned_copy(Context context, T *dst, const T *src, unsigned int num_elements)
  61 {
  62   for(unsigned int i = context.thread_index();
  63       i < num_elements;
  64       i += context.block_dimension())
  65   {
  66     dst[i] = src[i];
  67   }
  68 } // end aligned_copy()
  69
  70
  71 } // end namespace trivial_copy_detail
  72
  73
  74 template <typename Context>
  75 __device__ __thrust_forceinline__
  76 void trivial_copy(Context context, void* destination_, const void* source_, size_t num_bytes)
  77 {
  78   // reinterpret at bytes
  79   char* destination  = reinterpret_cast<char*>(destination_);
  80   const char* source = reinterpret_cast<const char*>(source_);
  81
  82   // TODO replace this with uint64
  83 #if THRUST_DEVICE_COMPILER != THRUST_DEVICE_COMPILER_NVCC
  84   typedef long long  int2;
  85   typedef long long uint2;
  86 #endif // THRUST_DEVICE_COMPILER_NVCC
  87
  88   // check alignment
  89   // XXX can we do this in three steps?
  90   //     1. copy until alignment is met
  91   //     2. go hog wild
  92   //     3. get the remainder
  93   if(reinterpret_cast<size_t>(destination) % sizeof(uint2) != 0 || reinterpret_cast<size_t>(source) % sizeof(uint2) != 0)
  94   {
  95     for(unsigned int i = context.thread_index(); i < num_bytes; i += context.block_dimension())
  96     {
  97       destination[i] = source[i];
  98     }
  99   }
 100   else
 101   {
 102     // it's aligned; do a wide copy
 103
 104     // this pair stores the number of int2s in the aligned portion of the arrays
 105     // and the number of bytes in the remainder
 106     const thrust::pair<size_t,size_t> num_wide_elements_and_remainder_bytes = trivial_copy_detail::quotient_and_remainder(num_bytes, sizeof(int2));
 107
 108     // copy int2 elements
 109     trivial_copy_detail::aligned_copy(context,
 110                                       reinterpret_cast<int2*>(destination),
 111                                       reinterpret_cast<const int2*>(source),
 112                                       num_wide_elements_and_remainder_bytes.first);
 113
 114     // XXX we could copy int elements here
 115
 116     // copy remainder byte by byte
 117
 118     // to find the beginning of the remainder arrays, we need to point at the beginning, and then skip the number of bytes in the aligned portion
 119     // this is sizeof(int2) times the number of int2s comprising the aligned portion
 120     const char *remainder_first  = reinterpret_cast<const char*>(source + sizeof(int2) * num_wide_elements_and_remainder_bytes.first);
 121           char *remainder_result = reinterpret_cast<char*>(destination  + sizeof(int2) * num_wide_elements_and_remainder_bytes.first);
 122
 123     trivial_copy_detail::aligned_copy(context, remainder_result, remainder_first, num_wide_elements_and_remainder_bytes.second);
 124   }
 125 } // end trivial_copy()
 126
 127
 128 namespace detail
 129 {
 130 namespace dispatch
 131 {
 132
 133 template<typename Context,
 134          typename RandomAccessIterator1,
 135          typename RandomAccessIterator2>
 136   __thrust_forceinline__ __device__
 137   RandomAccessIterator2 copy(Context context,
 138                              RandomAccessIterator1 first,
 139                              RandomAccessIterator1 last,
 140                              RandomAccessIterator2 result,
 141                              thrust::detail::true_type is_trivial_copy)
 142 {
 143   typedef typename thrust::iterator_value<RandomAccessIterator1>::type T;
 144
 145   const T *src = &thrust::raw_reference_cast(*first);
 146         T *dst = &thrust::raw_reference_cast(*result);
 147
 148   size_t n = (last - first);
 149   thrust::system::cuda::detail::block::trivial_copy(context, dst, src, n * sizeof(T));
 150   return result + n;
 151 } // end copy()
 152
 153 template<typename Context,
 154          typename RandomAccessIterator1,
 155          typename RandomAccessIterator2>
 156   __thrust_forceinline__ __device__
 157   RandomAccessIterator2 copy(Context context,
 158                              RandomAccessIterator1 first,
 159                              RandomAccessIterator1 last,
 160                              RandomAccessIterator2 result,
 161                              thrust::detail::false_type is_trivial_copy)
 162 {
 163   RandomAccessIterator2 end_of_output = result + (last - first);
 164
 165   // advance iterators
 166   first  += context.thread_index();
 167   result += context.thread_index();
 168
 169   for(;
 170       first < last;
 171       first  += context.block_dimension(),
 172       result += context.block_dimension())
 173   {
 174     *result = *first;
 175   } // end for
 176
 177   return end_of_output;
 178 } // end copy()
 179
 180 } // end namespace dispatch
 181 } // end namespace detail
 182
 183 template<typename Context,
 184          typename RandomAccessIterator1,
 185          typename RandomAccessIterator2>
 186   __thrust_forceinline__ __device__
 187   RandomAccessIterator2 copy(Context context,
 188                              RandomAccessIterator1 first,
 189                              RandomAccessIterator1 last,
 190                              RandomAccessIterator2 result)
 191 {
 192   return detail::dispatch::copy(context, first, last, result,
 193 #if __CUDA_ARCH__ < 200
 194       // does not work reliably on pre-Fermi due to "Warning: ... assuming global memory space" issues
 195       thrust::detail::false_type()
 196 #else
 197       typename thrust::detail::dispatch::is_trivial_copy<RandomAccessIterator1,RandomAccessIterator2>::type()
 198 #endif
 199       );
 200 } // end copy()
 201
 202
 203 template<typename Context, typename RandomAccessIterator1, typename Size, typename RandomAccessIterator2>
 204 inline __device__
 205 RandomAccessIterator2 copy_n(Context &ctx, RandomAccessIterator1 first, Size n, RandomAccessIterator2 result)
 206 {
 207   for(Size i = ctx.thread_index(); i < n; i += ctx.block_dimension())
 208   {
 209     result[i] = first[i];
 210   }
 211
 212   ctx.barrier();
 213
 214   return result + n;
 215 }
 216
 217
 218 } // end namespace block
 219 } // end namespace detail
 220 } // end namespace cuda
 221 } // end namespace system
 222 } // end namespace thrust
 223