CUB
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Groups
Classes | Macros | Functions
Device, kernel, and storage management

Classes

struct  cub::CachingDeviceAllocator
 A simple caching allocator for device memory allocations. More...
 

Macros

#define CUB_PTX_ARCH   0
 CUB_PTX_ARCH reflects the PTX version targeted by the active compiler pass (or zero during the host pass).
 
#define CUB_RUNTIME_ENABLED
 Whether or not the source targeted by the active compiler pass is allowed to invoke device kernels or methods from the CUDA runtime API.
 
#define CUB_RUNTIME_FUNCTION   __host__ __device__
 
#define CUB_LOG_WARP_THREADS(arch)   (5)
 Base-2 logarithm of the number of threads per warp.
 
#define CUB_WARP_THREADS(arch)   (1 << CUB_LOG_WARP_THREADS(arch))
 Number of threads per warp.
 
#define CUB_LOG_SMEM_BANKS(arch)
 Base-2 logarithm of the number of shared-memory (smem) banks. More...
 
#define CUB_SMEM_BANKS(arch)   (1 << CUB_LOG_SMEM_BANKS(arch))
 Number of smem banks.
 
#define CUB_SMEM_BANK_BYTES(arch)   (4)
 Number of bytes per smem bank.
 
#define CUB_SMEM_BYTES(arch)
 Number of smem bytes provisioned per SM. More...
 
#define CUB_SMEM_ALLOC_UNIT(arch)
 Smem allocation size in bytes. More...
 
#define CUB_REGS_BY_BLOCK(arch)
 Whether or not the architecture allocates registers by block (or by warp) More...
 
#define CUB_REG_ALLOC_UNIT(arch)
 Number of registers allocated at a time per block (or by warp) More...
 
#define CUB_WARP_ALLOC_UNIT(arch)
 Granularity of warps for which registers are allocated. More...
 
#define CUB_MAX_SM_THREADS(arch)
 Maximum number of threads per SM. More...
 
#define CUB_MAX_SM_BLOCKS(arch)
 Maximum number of thread blocks per SM. More...
 
#define CUB_MAX_BLOCK_THREADS(arch)
 Maximum number of threads per thread block. More...
 
#define CUB_MAX_SM_REGISTERS(arch)
 Maximum number of registers per SM. More...
 
#define CUB_SUBSCRIPTION_FACTOR(arch)
 Oversubscription factor. More...
 
#define CUB_PREFER_CONFLICT_OVER_PADDING(arch)
 Conflict threshold X: padding overhead is preferred over X-way conflicts when X is greater than this threshold. More...
 
#define CubDebug(e)   cub::Debug((e), __FILE__, __LINE__)
 Debug macro.
 
#define CubDebugExit(e)   if (cub::Debug((e), __FILE__, __LINE__)) { exit(1); }
 Debug macro with exit.
 
#define CubLog(format,...)   printf(format,__VA_ARGS__);
 Log macro for printf statements.
 

Functions

__host__ __device__
__forceinline__ cudaError_t 
cub::Debug (cudaError_t error, const char *filename, int line)
 CUB error reporting function (prints error messages to stderr). More...
 
CUB_RUNTIME_FUNCTION
__forceinline__ cudaError_t 
cub::PtxVersion (int &ptx_version)
 Retrieves the PTX version that will be used on the current device (major * 100 + minor * 10) More...
 
CUB_RUNTIME_FUNCTION
__forceinline__ cudaError_t 
cub::SmVersion (int &sm_version, int device_ordinal)
 Retrieves the SM version (major * 100 + minor * 10)
 
template<typename KernelPtr >
CUB_RUNTIME_FUNCTION
__forceinline__ cudaError_t 
cub::MaxSmOccupancy (int &max_sm_occupancy, KernelPtr kernel_ptr, int block_threads)
 Computes maximum SM occupancy in thread blocks for executing the given kernel function pointer kernel_ptr on the current device with block_threads per thread block. More...
 

Macro Definition Documentation

#define CUB_LOG_SMEM_BANKS (   arch)
Value:
((arch >= 200) ? \
(5) : \
(4))

Base-2 logarithm of the number of shared-memory (smem) banks.

Definition at line 78 of file util_arch.cuh.

#define CUB_SMEM_BYTES (   arch)
Value:
((arch >= 200) ? \
(48 * 1024) : \
(16 * 1024))

Number of smem bytes provisioned per SM.

Definition at line 92 of file util_arch.cuh.

#define CUB_SMEM_ALLOC_UNIT (   arch)
Value:
((arch >= 300) ? \
(256) : \
((arch >= 200) ? \
(128) : \
(512)))

Smem allocation size in bytes.

Definition at line 98 of file util_arch.cuh.

#define CUB_REGS_BY_BLOCK (   arch)
Value:
((arch >= 200) ? \
(false) : \
(true))

Whether or not the architecture allocates registers by block (or by warp)

Definition at line 106 of file util_arch.cuh.

#define CUB_REG_ALLOC_UNIT (   arch)
Value:
((arch >= 300) ? \
(256) : \
((arch >= 200) ? \
(64) : \
((arch >= 120) ? \
(512) : \
(256))))

Number of registers allocated at a time per block (or by warp)

Definition at line 112 of file util_arch.cuh.

#define CUB_WARP_ALLOC_UNIT (   arch)
Value:
((arch >= 300) ? \
(4) : \
(2))

Granularity of warps for which registers are allocated.

Definition at line 122 of file util_arch.cuh.

#define CUB_MAX_SM_THREADS (   arch)
Value:
((arch >= 300) ? \
(2048) : \
((arch >= 200) ? \
(1536) : \
((arch >= 120) ? \
(1024) : \
(768))))

Maximum number of threads per SM.

Definition at line 128 of file util_arch.cuh.

#define CUB_MAX_SM_BLOCKS (   arch)
Value:
((arch >= 300) ? \
(16) : \
(8))

Maximum number of thread blocks per SM.

Definition at line 138 of file util_arch.cuh.

#define CUB_MAX_BLOCK_THREADS (   arch)
Value:
((arch >= 200) ? \
(1024) : \
(512))

Maximum number of threads per thread block.

Definition at line 144 of file util_arch.cuh.

#define CUB_MAX_SM_REGISTERS (   arch)
Value:
((arch >= 300) ? \
(64 * 1024) : \
((arch >= 200) ? \
(32 * 1024) : \
((arch >= 120) ? \
(16 * 1024) : \
(8 * 1024))))

Maximum number of registers per SM.

Definition at line 150 of file util_arch.cuh.

#define CUB_SUBSCRIPTION_FACTOR (   arch)
Value:
((arch >= 300) ? \
(5) : \
((arch >= 200) ? \
(3) : \
(10)))

Oversubscription factor.

Definition at line 160 of file util_arch.cuh.

#define CUB_PREFER_CONFLICT_OVER_PADDING (   arch)
Value:
((arch >= 300) ? \
(1) : \
(4))

Conflict threshold X: padding overhead is preferred over X-way conflicts when X is greater than this threshold.

Definition at line 168 of file util_arch.cuh.

Function Documentation

__host__ __device__ __forceinline__ cudaError_t cub::Debug ( cudaError_t  error,
const char *  filename,
int  line 
)

CUB error reporting function (prints error messages to stderr)

If CUB_STDERR is defined and error is not cudaSuccess, the corresponding error message is printed to stderr (or stdout in device code) along with the supplied source context.

Returns
The CUDA error.

Definition at line 68 of file util_debug.cuh.

CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t cub::PtxVersion ( int &  ptx_version)

Retrieves the PTX version that will be used on the current device (major * 100 + minor * 10)

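Implementation notes (these appear to describe helpers internal to the function rather than its public interface):

- Type definition of the EmptyKernel kernel entry point.
- Force EmptyKernel<void> to be generated if this class is used.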

Definition at line 118 of file util_device.cuh.

template<typename KernelPtr >
CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t cub::MaxSmOccupancy ( int &  max_sm_occupancy,
KernelPtr  kernel_ptr,
int  block_threads 
)

Computes maximum SM occupancy in thread blocks for executing the given kernel function pointer kernel_ptr on the current device with block_threads per thread block.

Snippet
The code snippet below illustrates the use of the MaxSmOccupancy function.
#include <cub/cub.cuh> // or equivalently <cub/util_device.cuh>
template <typename T>
__global__ void ExampleKernel()
{
// Allocate shared memory for BlockScan
__shared__ volatile T buffer[4096];
...
}
...
// Determine SM occupancy for ExampleKernel specialized for unsigned char
int max_sm_occupancy;
MaxSmOccupancy(max_sm_occupancy, ExampleKernel<unsigned char>, 64);
// max_sm_occupancy <-- 4 on SM10
// max_sm_occupancy <-- 8 on SM20
// max_sm_occupancy <-- 12 on SM35
Parameters
[out] max_sm_occupancy: Maximum number of thread blocks that can reside on a single SM
[in] kernel_ptr: Kernel pointer for which to compute SM occupancy
[in] block_threads: Number of threads per thread block
Examples:
example_block_radix_sort.cu, example_block_reduce.cu, and example_block_scan.cu.

Definition at line 334 of file util_device.cuh.