#if (CUB_PTX_ARCH == 0)     // Host-only STL includes (std::multiset / std::map used below)
    #include <set>
    #include <map>
#endif

#include "util_namespace.cuh"
#include "util_debug.cuh"       // CubDebug / CubLog
#include "host/spinlock.cuh"
#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document

    /// Sentinel for an invalid device ordinal
    INVALID_DEVICE_ORDINAL = -1,
    // Integer power function for unsigned base and exponent (used to compute bin sizes)
    static unsigned int IntPow(unsigned int base, unsigned int exp)
    {
        unsigned int retval = 1;
        while (exp-- > 0) retval = retval * base;
        return retval;
    }
    // Rounds <value> up to the nearest <base>^<power>, returning both the power and the rounded byte count
    static void NearestPowerOf(unsigned int &power, size_t &rounded_bytes, unsigned int base, size_t value)
    {
        power = 0; rounded_bytes = 1;
        while (rounded_bytes < value) { rounded_bytes *= base; power++; }
    }
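
    // Worked example (illustrative, not from the original source): with base 8, a
    // request of 1,000 bytes rounds up to power 4 and rounded_bytes 4,096, since
    // 8^3 = 512 < 1,000 <= 8^4 = 4,096.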
    // Descriptor for device memory allocations
    struct BlockDescriptor
    {
        void*           d_ptr;      // Device pointer
        size_t          bytes;      // Size of allocation in bytes
        unsigned int    bin;        // Bin enumeration
        int             device;     // Device ordinal

        // Constructor: pointer-keyed search key (used when returning a live block)
        BlockDescriptor(void *d_ptr, int device) :
            d_ptr(d_ptr), bytes(0), bin(0), device(device) {}

        // Constructor: size/bin-keyed search key (used when requesting an allocation)
        BlockDescriptor(size_t bytes, unsigned int bin, int device) :
            d_ptr(NULL), bytes(bytes), bin(bin), device(device) {}
        // Comparison functor: order by device ordinal, then by pointer
        static bool PtrCompare(const BlockDescriptor &a, const BlockDescriptor &b)
        {
            if (a.device < b.device) {
                return true;
            }
            else if (a.device > b.device) {
                return false;
            }
            else {
                return (a.d_ptr < b.d_ptr);
            }
        }
        // Comparison functor: order by device ordinal, then by allocation size
        static bool SizeCompare(const BlockDescriptor &a, const BlockDescriptor &b)
        {
            if (a.device < b.device) {
                return true;
            }
            else if (a.device > b.device) {
                return false;
            }
            else {
                return (a.bytes < b.bytes);
            }
        }
    };
    typedef bool (*Compare)(const BlockDescriptor &, const BlockDescriptor &);
#if (CUB_PTX_ARCH == 0)     // Only define STL container members in host code
    typedef std::multiset<BlockDescriptor, Compare> CachedBlocks;   // Cached blocks, ordered by size
    typedef std::multiset<BlockDescriptor, Compare> BusyBlocks;     // Live blocks, ordered by pointer
    typedef std::map<int, size_t> GpuCachedBytes;                   // Device ordinal -> aggregate cached bytes
#endif // CUB_PTX_ARCH
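
// Summary comment (added here, not part of the original source): cached_blocks
// holds idle allocations ordered by (device, bytes) so that lower_bound() on a
// size-keyed search key finds the smallest adequate free block; live_blocks holds
// outstanding allocations ordered by (device, d_ptr) so the free path can look a
// pointer up in O(log n); cached_bytes tracks each device's total cached bytes,
// which is compared against max_cached_bytes when blocks are returned.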
    unsigned int    bin_growth;         // Geometric growth factor for bin sizes
    unsigned int    min_bin;            // Minimum bin enumeration
    unsigned int    max_bin;            // Maximum bin enumeration

    size_t          min_bin_bytes;      // Minimum bin size (bin_growth ^ min_bin)
    size_t          max_bin_bytes;      // Maximum bin size (bin_growth ^ max_bin)
    size_t          max_cached_bytes;   // Maximum aggregate cached bytes per device
#if (CUB_PTX_ARCH == 0)     // Only define STL container members in host code
    GpuCachedBytes  cached_bytes;       // Aggregate cached bytes per device
    CachedBlocks    cached_blocks;      // Cached device allocations available for reuse
    BusyBlocks      live_blocks;        // Live device allocations currently in use
#endif // CUB_PTX_ARCH

#endif // DOXYGEN_SHOULD_SKIP_THIS
    // Constructor with an explicit bin configuration
        unsigned int    bin_growth,             // Geometric growth factor for bin sizes
        unsigned int    min_bin,                // Minimum bin enumeration
        unsigned int    max_bin,                // Maximum bin enumeration
        size_t          max_cached_bytes,       // Maximum aggregate cached bytes per device
        bool            skip_cleanup = false) : // Skip freeing cached blocks in the destructor
        cached_blocks(BlockDescriptor::SizeCompare),    // Cached blocks ordered by size
        live_blocks(BlockDescriptor::PtrCompare),       // Live blocks ordered by pointer
        bin_growth(bin_growth),
        min_bin_bytes(IntPow(bin_growth, min_bin)),
        max_bin_bytes(IntPow(bin_growth, max_bin)),
        max_cached_bytes(max_cached_bytes)
    // Default constructor: default bin configuration, with the cache capped just
    // below three blocks of the largest bin size
        bool            skip_cleanup = false) :
        cached_blocks(BlockDescriptor::SizeCompare),
        live_blocks(BlockDescriptor::PtrCompare),
        skip_cleanup(skip_cleanup),
        min_bin_bytes(IntPow(bin_growth, min_bin)),
        max_bin_bytes(IntPow(bin_growth, max_bin)),
        max_cached_bytes((max_bin_bytes * 3) - 1)       // Just under three maximum-sized bins
    // Sets the limit on the number of bytes this allocator is allowed to cache per device
        size_t max_cached_bytes)

#if (CUB_PTX_ARCH > 0)
        // Caching functionality is only defined on the host
        return CubDebug(cudaErrorInvalidConfiguration);
#else
        this->max_cached_bytes = max_cached_bytes;

        if (debug) CubLog("New max_cached_bytes(%lld)\n", (long long) max_cached_bytes);

#endif // CUB_PTX_ARCH
    // Provides a suitable device allocation for the requested size on the specified device,
    // reusing a cached block when possible
#if (CUB_PTX_ARCH > 0)
        // Caching functionality is only defined on the host
        return CubDebug(cudaErrorInvalidConfiguration);
#else
        int entrypoint_device   = INVALID_DEVICE_ORDINAL;
        cudaError_t error       = cudaSuccess;

        // Round the requested size up to the nearest bin size
        unsigned int bin;
        size_t bin_bytes;
        NearestPowerOf(bin, bin_bytes, bin_growth, bytes);
        if (bin < min_bin) { bin = min_bin; bin_bytes = min_bin_bytes; }    // Clamp undersized requests to the smallest bin
        if (bin > max_bin) { bin = (unsigned int) -1; bin_bytes = bytes; }  // Oversized requests: out-of-range bin, allocated exactly

        // Search key describing the desired block on the given device
        BlockDescriptor search_key(bin_bytes, bin, device);
        // Find the smallest cached block in the same bin on the same device
        CachedBlocks::iterator block_itr = cached_blocks.lower_bound(search_key);
        if ((block_itr != cached_blocks.end()) &&
            (block_itr->device == device) &&
            (block_itr->bin == search_key.bin))
        {
            // Reuse the cached block: move it from the free set to the live set
            search_key = *block_itr;
            live_blocks.insert(search_key);

            cached_blocks.erase(block_itr);
            cached_bytes[device] -= search_key.bytes;

            if (debug) CubLog("\tdevice %d reused cached block (%lld bytes). %lld available blocks cached (%lld bytes), %lld live blocks outstanding.\n",
                device, (long long) search_key.bytes, (long long) cached_blocks.size(), (long long) cached_bytes[device], (long long) live_blocks.size());
        }
        else
        {
            // No suitable cached block: allocate a new block on the requested device
            if (CubDebug(error = cudaGetDevice(&entrypoint_device))) break;
            if (CubDebug(error = cudaSetDevice(device))) break;

            if (CubDebug(error = cudaMalloc(&search_key.d_ptr, search_key.bytes))) break;

            // Track the new allocation as live
            live_blocks.insert(search_key);

            if (debug) CubLog("\tdevice %d allocating new device block %lld bytes. %lld available blocks cached (%lld bytes), %lld live blocks outstanding.\n",
                device, (long long) search_key.bytes, (long long) cached_blocks.size(), (long long) cached_bytes[device], (long long) live_blocks.size());
        }
        // Copy the device pointer to the output parameter (NULL on error)
        *d_ptr = search_key.d_ptr;

        // Attempt to revert to the entry-point device if we changed it
        if (entrypoint_device != INVALID_DEVICE_ORDINAL)
        {
            if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error;
        }

        return error;

#endif // CUB_PTX_ARCH
    // Same allocation path, but for the current device
#if (CUB_PTX_ARCH > 0)
        // Caching functionality is only defined on the host
        return CubDebug(cudaErrorInvalidConfiguration);
#else
        cudaError_t error = cudaSuccess;

        // Resolve the current device ordinal, then defer to the per-device path above
        if (CubDebug(error = cudaGetDevice(&current_device))) break;

#endif // CUB_PTX_ARCH
    // Returns a live allocation on the specified device to the allocator,
    // caching it for reuse or freeing it outright
#if (CUB_PTX_ARCH > 0)
        // Caching functionality is only defined on the host
        return CubDebug(cudaErrorInvalidConfiguration);
#else
        int entrypoint_device   = INVALID_DEVICE_ORDINAL;
        cudaError_t error       = cudaSuccess;

        // Pointer-keyed search key for locating the live block being returned
        BlockDescriptor search_key(d_ptr, device);
        // Locate the corresponding live-block descriptor
        BusyBlocks::iterator block_itr = live_blocks.find(search_key);
        if (block_itr == live_blocks.end())
        {
            // The pointer was not allocated by this allocator
            if (CubDebug(error = cudaErrorUnknown)) break;
        }
        else
        {
            // Remove the block from the live set
            search_key = *block_itr;
            live_blocks.erase(block_itr);
            // Keep the returned allocation if it fits under the per-device cache limit
            if (cached_bytes[device] + search_key.bytes <= max_cached_bytes)
            {
                cached_blocks.insert(search_key);
                cached_bytes[device] += search_key.bytes;

                if (debug) CubLog("\tdevice %d returned %lld bytes. %lld available blocks cached (%lld bytes), %lld live blocks outstanding.\n",
                    device, (long long) search_key.bytes, (long long) cached_blocks.size(), (long long) cached_bytes[device], (long long) live_blocks.size());
            }
            else
            {
                // Over the per-device cache limit: actually free the allocation on its device
                if (CubDebug(error = cudaGetDevice(&entrypoint_device))) break;
                if (CubDebug(error = cudaSetDevice(device))) break;

                if (CubDebug(error = cudaFree(d_ptr))) break;

                if (debug) CubLog("\tdevice %d freed %lld bytes. %lld available blocks cached (%lld bytes), %lld live blocks outstanding.\n",
                    device, (long long) search_key.bytes, (long long) cached_blocks.size(), (long long) cached_bytes[device], (long long) live_blocks.size());
            }
        }
        // Attempt to revert to the entry-point device if we changed it
        if (entrypoint_device != INVALID_DEVICE_ORDINAL)
        {
            if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error;
        }

        return error;

#endif // CUB_PTX_ARCH
    // Same free path, but for the current device
#if (CUB_PTX_ARCH > 0)
        // Caching functionality is only defined on the host
        return CubDebug(cudaErrorInvalidConfiguration);
#else
        cudaError_t error = cudaSuccess;

        // Resolve the current device ordinal, then defer to the per-device path above
        if (CubDebug(error = cudaGetDevice(&current_device))) break;

#endif // CUB_PTX_ARCH
    // Frees all cached device allocations on all devices
#if (CUB_PTX_ARCH > 0)
        // Caching functionality is only defined on the host
        return CubDebug(cudaErrorInvalidConfiguration);
#else
        cudaError_t error       = cudaSuccess;
        int entrypoint_device   = INVALID_DEVICE_ORDINAL;
        int current_device      = INVALID_DEVICE_ORDINAL;
        // Free every cached block, switching devices as needed
        while (!cached_blocks.empty())
        {
            // Get the first cached block
            CachedBlocks::iterator begin = cached_blocks.begin();

            // Record the entry-point device the first time through
            if (entrypoint_device == INVALID_DEVICE_ORDINAL)
            {
                if (CubDebug(error = cudaGetDevice(&entrypoint_device))) break;
            }

            // Switch to the block's device if necessary
            if (begin->device != current_device)
            {
                if (CubDebug(error = cudaSetDevice(begin->device))) break;
                current_device = begin->device;
            }

            // Free the device memory and drop the bookkeeping entry
            if (CubDebug(error = cudaFree(begin->d_ptr))) break;

            // Capture the size before erasing so the debug log below does not read a dead iterator
            size_t block_bytes = begin->bytes;
            cached_bytes[current_device] -= block_bytes;
            cached_blocks.erase(begin);
            if (debug) CubLog("\tdevice %d freed %lld bytes. %lld available blocks cached (%lld bytes), %lld live blocks outstanding.\n",
                current_device, (long long) block_bytes, (long long) cached_blocks.size(), (long long) cached_bytes[current_device], (long long) live_blocks.size());
        }
        // Attempt to revert to the entry-point device if we changed it
        if (entrypoint_device != INVALID_DEVICE_ORDINAL)
        {
            if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error;
        }

        return error;

#endif // CUB_PTX_ARCH
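
// Usage sketch (added for illustration, not part of the original source). It assumes
// the enclosing struct is cub::CachingDeviceAllocator and that the methods above are
// named DeviceAllocate, DeviceFree, and FreeAllCached, as in the released CUB headers;
// none of those names appear in this excerpt, so treat them as assumptions.
//
//     cub::CachingDeviceAllocator allocator;            // default bin configuration
//     void *d_scratch = NULL;
//     if (allocator.DeviceAllocate(&d_scratch, 4096) == cudaSuccess)
//     {
//         // ... launch kernels that use d_scratch ...
//         allocator.DeviceFree(d_scratch);              // returns the block to the cache
//     }
//     allocator.FreeAllCached();                        // release all cached blocks to the driver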