38 #include "util_namespace.cuh"
39 #include "util_macro.cuh"
53 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
/**
 * Empty kernel whose only purpose is to carry PTX compilation metadata
 * (e.g., the targeted PTX version) that can later be queried via
 * cudaFuncGetAttributes (see PtxVersion).
 *
 * The template parameter makes each instantiation distinct; this file
 * instantiates it as EmptyKernel<void>, so the template header is required
 * (it had been dropped from this block).
 */
template <typename T>
__global__ void EmptyKernel(void) { }
/**
 * Aliases a single device temporary-storage allocation into ALLOCATIONS
 * sub-allocations, each aligned to a 256-byte boundary.
 *
 * Two-pass usage convention (standard CUB device-API pattern):
 *  - When \p d_temp_storage is NULL, no aliasing is performed; the total
 *    number of bytes required is written to \p temp_storage_bytes and
 *    cudaSuccess is returned.
 *  - Otherwise each \p allocations[i] is pointed at its aligned offset
 *    within \p d_temp_storage; returns cudaErrorInvalidValue when the
 *    provided \p temp_storage_bytes is too small.
 */
template <
    int ALLOCATIONS>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t AliasTemporaries(
    void    *d_temp_storage,                    ///< [in] Device allocation to alias (NULL to query required size)
    size_t  &temp_storage_bytes,                ///< [in,out] Size of \p d_temp_storage in bytes (set to the bytes needed when querying)
    void*   (&allocations)[ALLOCATIONS],        ///< [out] Pointers to each aliased sub-allocation
    size_t  (&allocation_sizes)[ALLOCATIONS])   ///< [in] Requested bytes for each sub-allocation
{
    const int ALIGN_BYTES = 256;
    const int ALIGN_MASK  = ~(ALIGN_BYTES - 1);

    // Compute each sub-allocation's aligned offset (exclusive prefix sum of
    // the sizes rounded up to ALIGN_BYTES)
    size_t allocation_offsets[ALLOCATIONS];
    size_t bytes_needed = 0;
    for (int i = 0; i < ALLOCATIONS; ++i)
    {
        size_t allocation_bytes = (allocation_sizes[i] + ALIGN_BYTES - 1) & ALIGN_MASK;
        allocation_offsets[i] = bytes_needed;
        bytes_needed += allocation_bytes;
    }

    // Size-query pass: report the required bytes and return.  (This NULL
    // guard restores the branch lost in extraction; without it the
    // assignment would render the size check below unreachable.)
    if (!d_temp_storage)
    {
        temp_storage_bytes = bytes_needed;
        return cudaSuccess;
    }

    // Fail if the caller's allocation is too small
    if (temp_storage_bytes < bytes_needed)
    {
        return CubDebug(cudaErrorInvalidValue);
    }

    // Alias each sub-allocation into the provided storage
    for (int i = 0; i < ALLOCATIONS; ++i)
    {
        allocations[i] = static_cast<char*>(d_temp_storage) + allocation_offsets[i];
    }

    return cudaSuccess;
}
111 #endif // DOXYGEN_SHOULD_SKIP_THIS
/**
 * Retrieves the PTX version that will be used on the current device,
 * encoded as (major * 100 + minor * 10), e.g., 350 for PTX ISA 3.5.
 */
CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t PtxVersion(
    int &ptx_version)   ///< [out] PTX version encoded as (major * 100 + minor * 10)
{
    // Local type whose only purpose is to force EmptyKernel<void> to be
    // instantiated and retained by the compiler so its attributes can be
    // queried below.
    // NOTE(review): the enclosing struct wrapper was reconstructed around the
    // surviving typedef/Empty() lines -- confirm against the canonical header.
    struct Dummy
    {
        /// Type of the EmptyKernel kernel entry point
        typedef void (*EmptyKernelPtr)();

        /// Returns the EmptyKernel<void> entry point, forcing its instantiation
        CUB_RUNTIME_FUNCTION __forceinline__
        EmptyKernelPtr Empty()
        {
            return EmptyKernel<void>;
        }
    };

#ifndef CUB_RUNTIME_ENABLED

    // CUDA API calls are not supported from this device
    return cudaErrorInvalidConfiguration;

#elif (CUB_PTX_ARCH > 0)

    // Device code path: the PTX version is known at compile time
    ptx_version = CUB_PTX_ARCH;
    return cudaSuccess;

#else

    // Host code path: query the compiled attributes of the empty kernel
    cudaError_t error = cudaSuccess;
    do
    {
        cudaFuncAttributes empty_kernel_attrs;
        if (CubDebug(error = cudaFuncGetAttributes(&empty_kernel_attrs, EmptyKernel<void>))) break;

        // cudaFuncAttributes::ptxVersion is (major * 10 + minor);
        // scale by 10 to match this file's (major * 100 + minor * 10) encoding
        ptx_version = empty_kernel_attrs.ptxVersion * 10;
    }
    while (0);

    return error;

#endif
}
/**
 * Retrieves the SM (compute-capability) version of the given device,
 * encoded as (major * 100 + minor * 10), e.g., 350 for sm_35.
 */
CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t SmVersion(
    int &sm_version,        ///< [out] SM version encoded as (major * 100 + minor * 10)
    int device_ordinal)     ///< [in] CUDA device ordinal to query
{
#ifndef CUB_RUNTIME_ENABLED

    // CUDA API calls are not supported from this device
    return cudaErrorInvalidConfiguration;

#else

    cudaError_t error = cudaSuccess;
    do
    {
        // Query the device's compute capability
        int major, minor;
        if (CubDebug(error = cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device_ordinal))) break;
        if (CubDebug(error = cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device_ordinal))) break;

        // Encode as (major * 100 + minor * 10)
        sm_version = major * 100 + minor * 10;
    }
    while (0);

    return error;

#endif  // CUB_RUNTIME_ENABLED
}
190 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
/**
 * Synchronizes the given \p stream, returning the resulting CUDA error
 * (which also surfaces any prior asynchronous execution error).
 */
CUB_RUNTIME_FUNCTION __forceinline__
static cudaError_t SyncStream(cudaStream_t stream)
{
#if (CUB_PTX_ARCH == 0)
    // Host code path: synchronize on the specific stream
    return cudaStreamSynchronize(stream);
#else
    // Device code path (CUDA dynamic parallelism): the device runtime cannot
    // synchronize on an individual stream, so synchronize all device work
    return cudaDeviceSynchronize();
#endif
}
/**
 * Computes the maximum SM occupancy (in resident thread blocks) for the
 * given kernel, as the minimum of four per-SM limits: the resident-block
 * limit, the warp limit, the register-file limit, and the shared-memory
 * limit.
 *
 * NOTE(review): the original signature line and the declarations of the
 * per-architecture tuning constants were lost in extraction; they are
 * reconstructed here from the identifiers the surviving body references
 * (warp_threads, max_sm_warps, max_sm_blocks, regs_by_block,
 * max_sm_registers, warp_alloc_unit, smem_bytes, smem_alloc_unit,
 * reg_alloc_unit) -- confirm against the canonical header.
 */
template <
    typename KernelPtr>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t MaxSmOccupancy(
    int         &max_sm_occupancy,  ///< [out] Maximum number of thread blocks that can reside on a single SM
    int         sm_version,         ///< [in] SM version (major * 100 + minor * 10) used to select tuning constants
    KernelPtr   kernel_ptr,         ///< [in] Kernel entry point for which to compute occupancy
    int         block_threads)      ///< [in] Number of threads per thread block
{
#ifndef CUB_RUNTIME_ENABLED

    // CUDA API calls are not supported from this device
    return CubDebug(cudaErrorInvalidConfiguration);

#else

    cudaError_t error = cudaSuccess;
    do
    {
        // Per-architecture resource limits and allocation granularities
        int warp_threads     = 1 << CUB_LOG_WARP_THREADS(sm_version);
        int max_sm_blocks    = CUB_MAX_SM_BLOCKS(sm_version);
        int max_sm_warps     = CUB_MAX_SM_THREADS(sm_version) / warp_threads;
        int regs_by_block    = CUB_REGS_BY_BLOCK(sm_version);
        int max_sm_registers = CUB_MAX_SM_REGISTERS(sm_version);
        int warp_alloc_unit  = CUB_WARP_ALLOC_UNIT(sm_version);
        int smem_bytes       = CUB_SMEM_BYTES(sm_version);
        int smem_alloc_unit  = CUB_SMEM_ALLOC_UNIT(sm_version);
        int reg_alloc_unit   = CUB_REG_ALLOC_UNIT(sm_version);

        // Query the kernel's compiled resource usage (registers, static smem)
        cudaFuncAttributes kernel_attrs;
        if (CubDebug(error = cudaFuncGetAttributes(&kernel_attrs, kernel_ptr))) break;

        // Warps per thread block (rounded up)
        int block_warps = (block_threads + warp_threads - 1) / warp_threads;

        // Limit imposed by the SM's warp capacity
        int max_warp_occupancy = (block_warps > 0) ?
            max_sm_warps / block_warps :
            max_sm_blocks;

        // Limit imposed by the SM's register file
        int max_reg_occupancy;
        if ((block_threads == 0) || (kernel_attrs.numRegs == 0))
        {
            // Zero-sized blocks or register-free kernel: not register-bound
            max_reg_occupancy = max_sm_blocks;
        }
        else if (regs_by_block)
        {
            // Architecture allocates registers at thread-block granularity
            int block_regs = CUB_ROUND_UP_NEAREST(kernel_attrs.numRegs * warp_threads * block_warps, reg_alloc_unit);
            max_reg_occupancy = max_sm_registers / block_regs;
        }
        else
        {
            // Architecture allocates registers at warp granularity, with the
            // register file split into warp_alloc_unit "sides"
            int sm_sides              = warp_alloc_unit;
            int sm_registers_per_side = max_sm_registers / sm_sides;
            int regs_per_warp         = CUB_ROUND_UP_NEAREST(kernel_attrs.numRegs * warp_threads, reg_alloc_unit);
            int warps_per_side        = sm_registers_per_side / regs_per_warp;
            int warps                 = warps_per_side * sm_sides;
            max_reg_occupancy         = warps / block_warps;
        }

        // Static shared memory per block, rounded up to the allocation unit
        int block_allocated_smem = CUB_ROUND_UP_NEAREST(
            kernel_attrs.sharedSizeBytes,
            smem_alloc_unit);

        // Limit imposed by the SM's shared memory
        int max_smem_occupancy = (block_allocated_smem > 0) ?
            (smem_bytes / block_allocated_smem) :
            max_sm_blocks;

        // Overall occupancy is the most restrictive of the four limits
        max_sm_occupancy = CUB_MIN(
            CUB_MIN(max_sm_blocks, max_warp_occupancy),
            CUB_MIN(max_smem_occupancy, max_reg_occupancy));
    }
    while (0);

    return error;

#endif  // CUB_RUNTIME_ENABLED
}
298 #endif // Do not document
/**
 * Computes the maximum SM occupancy (in resident thread blocks) for the
 * given kernel on the current device.
 *
 * NOTE(review): the function-name line and loop body were partially lost in
 * extraction; this wrapper is reconstructed to look up the current device's
 * SM version via SmVersion and delegate to the tuning-aware overload --
 * confirm against the canonical header.
 */
template <
    typename KernelPtr>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t MaxSmOccupancy(
    int         &max_sm_occupancy,  ///< [out] Maximum number of thread blocks that can reside on a single SM
    KernelPtr   kernel_ptr,         ///< [in] Kernel entry point for which to compute occupancy
    int         block_threads)      ///< [in] Number of threads per thread block
{
#ifndef CUB_RUNTIME_ENABLED

    // CUDA API calls are not supported from this device
    return CubDebug(cudaErrorInvalidConfiguration);

#else

    cudaError_t error = cudaSuccess;
    do
    {
        // Current device ordinal
        int device_ordinal;
        if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;

        // SM version of the current device
        int sm_version;
        if (CubDebug(error = SmVersion(sm_version, device_ordinal))) break;

        // Delegate to the tuning-aware overload
        if (CubDebug(error = MaxSmOccupancy(max_sm_occupancy, sm_version, kernel_ptr, block_threads))) break;
    }
    while (0);

    return error;

#endif  // CUB_RUNTIME_ENABLED
}