CUB
util_device.cuh
/******************************************************************************
 * Copyright (c) 2011, Duane Merrill.  All rights reserved.
 * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

#pragma once

#include "util_arch.cuh"
#include "util_debug.cuh"
#include "util_namespace.cuh"
#include "util_macro.cuh"

/// Optional outer namespace(s)
CUB_NS_PREFIX

/// CUB namespace
namespace cub {

#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document


/**
 * Empty kernel, used to query PTX manifest metadata (e.g., version) for the current device
 */
template <typename T>
__global__ void EmptyKernel(void) { }


/**
 * Alias temporaries to externally-allocated device storage (or simply return the amount of storage needed)
 */
template <int ALLOCATIONS>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t AliasTemporaries(
    void    *d_temp_storage,                    ///< [in] Device allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
    size_t  &temp_storage_bytes,                ///< [in,out] Size in bytes of \p d_temp_storage allocation
    void*   (&allocations)[ALLOCATIONS],        ///< [out] Pointers to device allocations needed
    size_t  (&allocation_sizes)[ALLOCATIONS])   ///< [in] Sizes in bytes of device allocations needed
{
    const int ALIGN_BYTES   = 256;
    const int ALIGN_MASK    = ~(ALIGN_BYTES - 1);

    // Compute exclusive prefix sum over allocation requests
    size_t allocation_offsets[ALLOCATIONS];
    size_t bytes_needed = 0;
    for (int i = 0; i < ALLOCATIONS; ++i)
    {
        size_t allocation_bytes = (allocation_sizes[i] + ALIGN_BYTES - 1) & ALIGN_MASK;
        allocation_offsets[i] = bytes_needed;
        bytes_needed += allocation_bytes;
    }

    // Check if the caller is simply requesting the size of the storage allocation
    if (!d_temp_storage)
    {
        temp_storage_bytes = bytes_needed;
        return cudaSuccess;
    }

    // Check if enough storage provided
    if (temp_storage_bytes < bytes_needed)
    {
        return CubDebug(cudaErrorInvalidValue);
    }

    // Alias
    for (int i = 0; i < ALLOCATIONS; ++i)
    {
        allocations[i] = static_cast<char*>(d_temp_storage) + allocation_offsets[i];
    }

    return cudaSuccess;
}
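
/*
 * Illustrative usage sketch (not part of the original header): the typical
 * two-pass pattern for AliasTemporaries.  A first call with a NULL
 * d_temp_storage only computes the total bytes needed; after the caller
 * allocates that much device memory, a second call carves it into aligned
 * sub-allocations.  The names d_temp_storage, allocations, and
 * allocation_sizes below are illustrative only.
 *
 *     void   *allocations[2];
 *     size_t  allocation_sizes[2] = {100, 200};
 *     size_t  temp_storage_bytes  = 0;
 *
 *     // Query pass: no work is done, only the required size is computed
 *     AliasTemporaries<2>(NULL, temp_storage_bytes, allocations, allocation_sizes);
 *
 *     void *d_temp_storage = NULL;
 *     cudaMalloc(&d_temp_storage, temp_storage_bytes);
 *
 *     // Aliasing pass: allocations[0] and allocations[1] now point into d_temp_storage
 *     AliasTemporaries<2>(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes);
 */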


#endif  // DOXYGEN_SHOULD_SKIP_THIS


/**
 * Retrieves the PTX version that will be used on the current device (major * 100 + minor * 10)
 */
CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t PtxVersion(int &ptx_version)
{
    struct Dummy
    {
        /// Type definition of the EmptyKernel kernel entry point
        typedef void (*EmptyKernelPtr)();

        /// Force EmptyKernel<void> to be generated if this class is used
        CUB_RUNTIME_FUNCTION __forceinline__
        EmptyKernelPtr Empty()
        {
            return EmptyKernel<void>;
        }
    };


#ifndef CUB_RUNTIME_ENABLED

    // CUDA API calls not supported from this device
    return cudaErrorInvalidConfiguration;

#elif (CUB_PTX_ARCH > 0)

    ptx_version = CUB_PTX_ARCH;
    return cudaSuccess;

#else

    cudaError_t error = cudaSuccess;
    do
    {
        cudaFuncAttributes empty_kernel_attrs;
        if (CubDebug(error = cudaFuncGetAttributes(&empty_kernel_attrs, EmptyKernel<void>))) break;
        ptx_version = empty_kernel_attrs.ptxVersion * 10;
    }
    while (0);

    return error;

#endif
}
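
/*
 * Illustrative usage sketch (not part of the original header): querying the
 * PTX version targeted for the current device, e.g. 350 for PTX 3.5.
 *
 *     int ptx_version;
 *     cudaError_t error = PtxVersion(ptx_version);
 *     if (error == cudaSuccess)
 *         printf("Compiled for PTX version %d\n", ptx_version);
 */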


/**
 * Retrieves the SM version (major * 100 + minor * 10) of the specified device ordinal
 */
CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t SmVersion(int &sm_version, int device_ordinal)
{
#ifndef CUB_RUNTIME_ENABLED

    // CUDA API calls not supported from this device
    return cudaErrorInvalidConfiguration;

#else

    cudaError_t error = cudaSuccess;
    do
    {
        // Fill in SM version
        int major, minor;
        if (CubDebug(error = cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device_ordinal))) break;
        if (CubDebug(error = cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device_ordinal))) break;
        sm_version = major * 100 + minor * 10;
    }
    while (0);

    return error;

#endif
}
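
/*
 * Illustrative usage sketch (not part of the original header): retrieving the
 * SM version of the current device, e.g. 350 for compute capability 3.5.
 *
 *     int device_ordinal;
 *     cudaGetDevice(&device_ordinal);
 *
 *     int sm_version;
 *     cudaError_t error = SmVersion(sm_version, device_ordinal);
 */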


#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document

/**
 * Synchronize the specified \p stream (or the entire device when stream-specific synchronization is unavailable)
 */
CUB_RUNTIME_FUNCTION __forceinline__
static cudaError_t SyncStream(cudaStream_t stream)
{
#if (CUB_PTX_ARCH == 0)
    return cudaStreamSynchronize(stream);
#else
    // Device can't yet sync on a specific stream
    return cudaDeviceSynchronize();
#endif
}


/**
 * Computes maximum SM occupancy in thread blocks for the given kernel function pointer \p kernel_ptr on the specified SM architecture
 */
template <typename KernelPtr>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t MaxSmOccupancy(
    int         &max_sm_occupancy,      ///< [out] Maximum number of thread blocks that can reside on a single SM
    int         sm_version,             ///< [in] The SM architecture to run on
    KernelPtr   kernel_ptr,             ///< [in] Kernel pointer for which to compute SM occupancy
    int         block_threads)          ///< [in] Number of threads per thread block
{
#ifndef CUB_RUNTIME_ENABLED

    // CUDA API calls not supported from this device
    return CubDebug(cudaErrorInvalidConfiguration);

#else

    cudaError_t error = cudaSuccess;
    do
    {
        int warp_threads        = 1 << CUB_LOG_WARP_THREADS(sm_version);
        int max_sm_blocks       = CUB_MAX_SM_BLOCKS(sm_version);
        int max_sm_warps        = CUB_MAX_SM_THREADS(sm_version) / warp_threads;
        int regs_by_block       = CUB_REGS_BY_BLOCK(sm_version);
        int max_sm_registers    = CUB_MAX_SM_REGISTERS(sm_version);
        int warp_alloc_unit     = CUB_WARP_ALLOC_UNIT(sm_version);
        int smem_alloc_unit     = CUB_SMEM_ALLOC_UNIT(sm_version);
        int reg_alloc_unit      = CUB_REG_ALLOC_UNIT(sm_version);
        int smem_bytes          = CUB_SMEM_BYTES(sm_version);

        // Get kernel attributes
        cudaFuncAttributes kernel_attrs;
        if (CubDebug(error = cudaFuncGetAttributes(&kernel_attrs, kernel_ptr))) break;

        // Number of warps per threadblock
        int block_warps = (block_threads + warp_threads - 1) / warp_threads;

        // Max warp occupancy
        int max_warp_occupancy = (block_warps > 0) ?
            max_sm_warps / block_warps :
            max_sm_blocks;

        // Maximum register occupancy
        int max_reg_occupancy;
        if ((block_threads == 0) || (kernel_attrs.numRegs == 0))
        {
            // Prevent divide-by-zero
            max_reg_occupancy = max_sm_blocks;
        }
        else if (regs_by_block)
        {
            // Allocates registers by threadblock
            int block_regs = CUB_ROUND_UP_NEAREST(kernel_attrs.numRegs * warp_threads * block_warps, reg_alloc_unit);
            max_reg_occupancy = max_sm_registers / block_regs;
        }
        else
        {
            // Allocates registers by warp
            int sm_sides                = warp_alloc_unit;
            int sm_registers_per_side   = max_sm_registers / sm_sides;
            int regs_per_warp           = CUB_ROUND_UP_NEAREST(kernel_attrs.numRegs * warp_threads, reg_alloc_unit);
            int warps_per_side          = sm_registers_per_side / regs_per_warp;
            int warps                   = warps_per_side * sm_sides;
            max_reg_occupancy           = warps / block_warps;
        }

        // Shared memory per threadblock
        int block_allocated_smem = CUB_ROUND_UP_NEAREST(
            kernel_attrs.sharedSizeBytes,
            smem_alloc_unit);

        // Max shared memory occupancy
        int max_smem_occupancy = (block_allocated_smem > 0) ?
            (smem_bytes / block_allocated_smem) :
            max_sm_blocks;

        // Max occupancy
        max_sm_occupancy = CUB_MIN(
            CUB_MIN(max_sm_blocks, max_warp_occupancy),
            CUB_MIN(max_smem_occupancy, max_reg_occupancy));

//      printf("max_smem_occupancy(%d), max_warp_occupancy(%d), max_reg_occupancy(%d) \n", max_smem_occupancy, max_warp_occupancy, max_reg_occupancy);

    } while (0);

    return error;

#endif  // CUB_RUNTIME_ENABLED
}
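
/*
 * Worked example of the register-limited branch above (illustrative, with
 * assumed hardware numbers, not taken from the original header): suppose
 * max_sm_registers = 65536, reg_alloc_unit = 256, warp_alloc_unit = 4, and a
 * kernel using 32 registers per thread launched with 128 threads (4 warps)
 * on an architecture that allocates registers by warp.  Then
 * regs_per_warp = CUB_ROUND_UP_NEAREST(32 * 32, 256) = 1024,
 * sm_registers_per_side = 65536 / 4 = 16384,
 * warps_per_side = 16384 / 1024 = 16, warps = 16 * 4 = 64, and
 * max_reg_occupancy = 64 / 4 = 16 thread blocks.
 */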

#endif  // Do not document


/**
 * \brief Computes maximum SM occupancy in thread blocks for executing the given kernel function pointer \p kernel_ptr on the current device with \p block_threads per thread block
 */
template <typename KernelPtr>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t MaxSmOccupancy(
    int         &max_sm_occupancy,      ///< [out] Maximum number of thread blocks that can reside on a single SM
    KernelPtr   kernel_ptr,             ///< [in] Kernel pointer for which to compute SM occupancy
    int         block_threads)          ///< [in] Number of threads per thread block
{
#ifndef CUB_RUNTIME_ENABLED

    // CUDA API calls not supported from this device
    return CubDebug(cudaErrorInvalidConfiguration);

#else

    cudaError_t error = cudaSuccess;
    do
    {
        // Get device ordinal
        int device_ordinal;
        if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;

        // Get device SM version
        int sm_version;
        if (CubDebug(error = SmVersion(sm_version, device_ordinal))) break;

        // Get SM occupancy
        if (CubDebug(error = MaxSmOccupancy(max_sm_occupancy, sm_version, kernel_ptr, block_threads))) break;

    } while (0);

    return error;

#endif  // CUB_RUNTIME_ENABLED

}
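
/*
 * Illustrative usage sketch (not part of the original header): computing the
 * occupancy of a hypothetical kernel ExampleKernel when launched with 256
 * threads per thread block.  ExampleKernel is a name invented for this sketch.
 *
 *     __global__ void ExampleKernel(int *d_out);
 *
 *     int max_sm_occupancy;
 *     cudaError_t error = MaxSmOccupancy(max_sm_occupancy, ExampleKernel, 256);
 *     // On success, max_sm_occupancy holds the maximum number of resident
 *     // thread blocks of ExampleKernel per SM on the current device
 */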

// end group UtilMgmt

}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)