util_allocator.cuh
1 /******************************************************************************
2  * Copyright (c) 2011, Duane Merrill. All rights reserved.
3  * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of the NVIDIA CORPORATION nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  *
27  ******************************************************************************/
28 
/******************************************************************************
 * Simple caching allocator for device memory allocations. The allocator is
 * thread-safe and capable of managing device allocations on multiple devices.
 ******************************************************************************/

#pragma once

#if (CUB_PTX_ARCH == 0)
    #include <set>      // NVCC (EDG, really) takes FOREVER to compile std::map
    #include <map>
#endif

#include <math.h>

#include "util_namespace.cuh"
#include "util_debug.cuh"

#include "host/spinlock.cuh"

/// Optional outer namespace(s)
CUB_NS_PREFIX

/// CUB namespace
namespace cub {

/******************************************************************************
 * CachingDeviceAllocator (host use)
 ******************************************************************************/

/**
 * \brief A simple caching allocator for device memory allocations. The
 * allocator is thread-safe and capable of managing cached device allocations
 * on multiple devices.
 */
struct CachingDeviceAllocator
{
#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document

    //---------------------------------------------------------------------
    // Type definitions and constants
    //---------------------------------------------------------------------

    /// Out-of-bounds device ordinal
    enum
    {
        INVALID_DEVICE_ORDINAL = -1,
    };

    /**
     * Integer pow function for unsigned base and exponent
     */
    static unsigned int IntPow(
        unsigned int base,
        unsigned int exp)
    {
        unsigned int retval = 1;
        while (exp > 0)
        {
            if (exp & 1) {
                retval = retval * base;     // multiply the result by the current base
            }
            base = base * base;             // square the base
            exp = exp >> 1;                 // divide the exponent in half
        }
        return retval;
    }

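    // Example: IntPow(8, 3) runs exponentiation-by-squaring in O(log exp)
    // multiplies:
    //   exp=3 (odd):  retval = 1 * 8  = 8;    base = 64;   exp = 1
    //   exp=1 (odd):  retval = 8 * 64 = 512;  base = 4096; exp = 0
    // and returns 8^3 = 512.
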
    /**
     * Rounds up to the nearest power of the given base, returning both the
     * exponent and the rounded size
     */
    static void NearestPowerOf(
        unsigned int &power,
        size_t &rounded_bytes,
        unsigned int base,
        size_t value)
    {
        power = 0;
        rounded_bytes = 1;

        while (rounded_bytes < value)
        {
            rounded_bytes *= base;
            power++;
        }
    }

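    // Example: NearestPowerOf(power, rounded_bytes, 8, 600) walks
    // rounded_bytes through 1, 8, 64, 512, 4096 and stops at the first power
    // of 8 that is >= 600, yielding power = 4 and rounded_bytes = 4096.
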
    /**
     * Descriptor for device memory allocations
     */
    struct BlockDescriptor
    {
        int             device;     // device ordinal
        void*           d_ptr;      // Device pointer
        size_t          bytes;      // Size of allocation in bytes
        unsigned int    bin;        // Bin enumeration

        // Constructor (search key for freeing by pointer)
        BlockDescriptor(void *d_ptr, int device) :
            device(device),
            d_ptr(d_ptr),
            bytes(0),
            bin(0) {}

        // Constructor (search key for allocating by size)
        BlockDescriptor(size_t bytes, unsigned int bin, int device) :
            device(device),
            d_ptr(NULL),
            bytes(bytes),
            bin(bin) {}

        // Comparison functor for comparing device pointers
        static bool PtrCompare(const BlockDescriptor &a, const BlockDescriptor &b)
        {
            if (a.device < b.device) {
                return true;
            } else if (a.device > b.device) {
                return false;
            } else {
                return (a.d_ptr < b.d_ptr);
            }
        }

        // Comparison functor for comparing allocation sizes
        static bool SizeCompare(const BlockDescriptor &a, const BlockDescriptor &b)
        {
            if (a.device < b.device) {
                return true;
            } else if (a.device > b.device) {
                return false;
            } else {
                return (a.bytes < b.bytes);
            }
        }
    };

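    // Note: both comparators order blocks by device first, so each device's
    // blocks form a contiguous run within a single multiset. SizeCompare then
    // orders by size, which lets lower_bound() find the smallest cached block
    // that fits a request; PtrCompare orders by pointer, which lets find()
    // locate the exact descriptor when a pointer is freed.
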
    /// BlockDescriptor comparator function interface
    typedef bool (*Compare)(const BlockDescriptor &, const BlockDescriptor &);

#if (CUB_PTX_ARCH == 0)     // Only define STL container members in host code

    /// Set type for cached blocks (ordered by size)
    typedef std::multiset<BlockDescriptor, Compare> CachedBlocks;

    /// Set type for live blocks (ordered by ptr)
    typedef std::multiset<BlockDescriptor, Compare> BusyBlocks;

    /// Map type of device ordinals to the number of cached bytes on each device
    typedef std::map<int, size_t> GpuCachedBytes;

#endif // CUB_PTX_ARCH

    //---------------------------------------------------------------------
    // Fields
    //---------------------------------------------------------------------

    Spinlock        spin_lock;          // Spinlock for thread-safety

    unsigned int    bin_growth;         // Geometric growth factor for bin-sizes
    unsigned int    min_bin;            // Minimum bin enumeration
    unsigned int    max_bin;            // Maximum bin enumeration

    size_t          min_bin_bytes;      // Minimum bin size
    size_t          max_bin_bytes;      // Maximum bin size
    size_t          max_cached_bytes;   // Maximum aggregate cached bytes per device

    bool            debug;              // Whether to print (de)allocation events to stdout
    bool            skip_cleanup;       // Whether to skip FreeAllCached() at destruction

#if (CUB_PTX_ARCH == 0)     // Only define STL container members in host code

    GpuCachedBytes  cached_bytes;       // Aggregate cached bytes per device
    CachedBlocks    cached_blocks;      // Cached device allocations available for reuse
    BusyBlocks      live_blocks;        // Live device allocations currently in use

#endif // CUB_PTX_ARCH

#endif // DOXYGEN_SHOULD_SKIP_THIS

    //---------------------------------------------------------------------
    // Methods
    //---------------------------------------------------------------------

    /**
     * Constructor
     */
    CachingDeviceAllocator(
        unsigned int bin_growth,        // Geometric growth factor for bin-sizes
        unsigned int min_bin,           // Minimum bin
        unsigned int max_bin,           // Maximum bin
        size_t max_cached_bytes,        // Maximum aggregate cached bytes per device
        bool skip_cleanup = false)      // Whether to skip FreeAllCached() at destruction
    :
    #if (CUB_PTX_ARCH == 0)     // Only define STL container members in host code
        cached_blocks(BlockDescriptor::SizeCompare),
        live_blocks(BlockDescriptor::PtrCompare),
    #endif
        debug(false),
        skip_cleanup(skip_cleanup),
        spin_lock(0),
        bin_growth(bin_growth),
        min_bin(min_bin),
        max_bin(max_bin),
        min_bin_bytes(IntPow(bin_growth, min_bin)),
        max_bin_bytes(IntPow(bin_growth, max_bin)),
        max_cached_bytes(max_cached_bytes)
    {}

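    // Hypothetical configuration (illustrative sketch only): an allocator
    // with growth factor 2 and bins from 2^10 (1KB) to 2^20 (1MB), caching
    // at most 16MB per device:
    //
    //   cub::CachingDeviceAllocator allocator(2, 10, 20, 16 * 1024 * 1024);
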
    /**
     * Default constructor. Configured with:
     *      bin_growth       = 8
     *      min_bin          = 3
     *      max_bin          = 7
     *      max_cached_bytes = (bin_growth ^ max_bin) * 3 - 1
     */
    CachingDeviceAllocator(
        bool skip_cleanup = false)      // Whether to skip FreeAllCached() at destruction
    :
    #if (CUB_PTX_ARCH == 0)     // Only define STL container members in host code
        cached_blocks(BlockDescriptor::SizeCompare),
        live_blocks(BlockDescriptor::PtrCompare),
    #endif
        skip_cleanup(skip_cleanup),
        debug(false),
        spin_lock(0),
        bin_growth(8),
        min_bin(3),
        max_bin(7),
        min_bin_bytes(IntPow(bin_growth, min_bin)),
        max_bin_bytes(IntPow(bin_growth, max_bin)),
        max_cached_bytes((max_bin_bytes * 3) - 1)
    {}

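    // With these defaults: min_bin_bytes = 8^3 = 512 bytes, max_bin_bytes =
    // 8^7 = 2,097,152 bytes (2MB), and max_cached_bytes = (3 * 2MB) - 1 =
    // 6,291,455 bytes of cache per device.
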
    /**
     * Sets the limit on the number of bytes this allocator is allowed to
     * cache per device.
     */
    cudaError_t SetMaxCachedBytes(
        size_t max_cached_bytes)
    {
    #if (CUB_PTX_ARCH > 0)
        // Caching functionality only defined on host
        return CubDebug(cudaErrorInvalidConfiguration);
    #else

        // Lock
        Lock(&spin_lock);

        this->max_cached_bytes = max_cached_bytes;

        if (debug) CubLog("New max_cached_bytes(%lld)\n", (long long) max_cached_bytes);

        // Unlock
        Unlock(&spin_lock);

        return cudaSuccess;

    #endif // CUB_PTX_ARCH
    }

    /**
     * Provides a suitable allocation of device memory for the given size on
     * the specified device.
     */
    cudaError_t DeviceAllocate(
        void** d_ptr,
        size_t bytes,
        int device)
    {
    #if (CUB_PTX_ARCH > 0)
        // Caching functionality only defined on host
        return CubDebug(cudaErrorInvalidConfiguration);
    #else

        bool locked = false;
        int entrypoint_device = INVALID_DEVICE_ORDINAL;
        cudaError_t error = cudaSuccess;

        // Round up to nearest bin size
        unsigned int bin;
        size_t bin_bytes;
        NearestPowerOf(bin, bin_bytes, bin_growth, bytes);
        if (bin < min_bin) {
            bin = min_bin;
            bin_bytes = min_bin_bytes;
        }

        // Check if bin is greater than our maximum bin
        if (bin > max_bin)
        {
            // Allocate the request exactly and give out-of-range bin
            bin = (unsigned int) -1;
            bin_bytes = bytes;
        }

        BlockDescriptor search_key(bin_bytes, bin, device);

        // Lock
        if (!locked) {
            Lock(&spin_lock);
            locked = true;
        }

        do {
            // Find a free block big enough within the same bin on the same device
            CachedBlocks::iterator block_itr = cached_blocks.lower_bound(search_key);
            if ((block_itr != cached_blocks.end()) &&
                (block_itr->device == device) &&
                (block_itr->bin == search_key.bin))
            {
                // Reuse existing cache block. Insert into live blocks.
                search_key = *block_itr;
                live_blocks.insert(search_key);

                // Remove from free blocks
                cached_blocks.erase(block_itr);
                cached_bytes[device] -= search_key.bytes;

                if (debug) CubLog("\tdevice %d reused cached block (%lld bytes). %lld available blocks cached (%lld bytes), %lld live blocks outstanding.\n",
                    device, (long long) search_key.bytes, (long long) cached_blocks.size(), (long long) cached_bytes[device], (long long) live_blocks.size());
            }
            else
            {
                // Need to allocate a new cache block. Unlock.
                if (locked) {
                    Unlock(&spin_lock);
                    locked = false;
                }

                // Set to specified device
                if (CubDebug(error = cudaGetDevice(&entrypoint_device))) break;
                if (CubDebug(error = cudaSetDevice(device))) break;

                // Allocate
                if (CubDebug(error = cudaMalloc(&search_key.d_ptr, search_key.bytes))) break;

                // Lock
                if (!locked) {
                    Lock(&spin_lock);
                    locked = true;
                }

                // Insert into live blocks
                live_blocks.insert(search_key);

                if (debug) CubLog("\tdevice %d allocating new device block %lld bytes. %lld available blocks cached (%lld bytes), %lld live blocks outstanding.\n",
                    device, (long long) search_key.bytes, (long long) cached_blocks.size(), (long long) cached_bytes[device], (long long) live_blocks.size());
            }
        } while(0);

        // Unlock
        if (locked) {
            Unlock(&spin_lock);
            locked = false;
        }

        // Copy device pointer to output parameter (NULL on error)
        *d_ptr = search_key.d_ptr;

        // Attempt to revert back to previous device if necessary
        if (entrypoint_device != INVALID_DEVICE_ORDINAL)
        {
            if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error;
        }

        return error;

    #endif // CUB_PTX_ARCH
    }

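    // Usage sketch (illustrative; `allocator` is an assumed instance of this
    // class): request 1MB on device 1. The request is rounded up to the
    // nearest bin size and, when a suitable cached block exists, served from
    // that device's cache rather than by a new cudaMalloc:
    //
    //   void *d_data = NULL;
    //   cudaError_t error = allocator.DeviceAllocate(&d_data, 1024 * 1024, 1);
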
    /**
     * Provides a suitable allocation of device memory for the given size on
     * the current device.
     */
    cudaError_t DeviceAllocate(
        void** d_ptr,
        size_t bytes)
    {
    #if (CUB_PTX_ARCH > 0)
        // Caching functionality only defined on host
        return CubDebug(cudaErrorInvalidConfiguration);
    #else
        cudaError_t error = cudaSuccess;
        do {
            int current_device;
            if (CubDebug(error = cudaGetDevice(&current_device))) break;
            if (CubDebug(error = DeviceAllocate(d_ptr, bytes, current_device))) break;
        } while(0);

        return error;

    #endif // CUB_PTX_ARCH
    }

    /**
     * Frees a live allocation of device memory on the specified device,
     * returning it to the allocator.
     */
    cudaError_t DeviceFree(
        void* d_ptr,
        int device)
    {
    #if (CUB_PTX_ARCH > 0)
        // Caching functionality only defined on host
        return CubDebug(cudaErrorInvalidConfiguration);
    #else

        bool locked = false;
        int entrypoint_device = INVALID_DEVICE_ORDINAL;
        cudaError_t error = cudaSuccess;

        BlockDescriptor search_key(d_ptr, device);

        // Lock
        if (!locked) {
            Lock(&spin_lock);
            locked = true;
        }

        do {
            // Find corresponding block descriptor
            BusyBlocks::iterator block_itr = live_blocks.find(search_key);
            if (block_itr == live_blocks.end())
            {
                // Cannot find pointer
                if (CubDebug(error = cudaErrorUnknown)) break;
            }
            else
            {
                // Remove from live blocks
                search_key = *block_itr;
                live_blocks.erase(block_itr);

                // Check if we should keep the returned allocation
                if (cached_bytes[device] + search_key.bytes <= max_cached_bytes)
                {
                    // Insert returned allocation into free blocks
                    cached_blocks.insert(search_key);
                    cached_bytes[device] += search_key.bytes;

                    if (debug) CubLog("\tdevice %d returned %lld bytes. %lld available blocks cached (%lld bytes), %lld live blocks outstanding.\n",
                        device, (long long) search_key.bytes, (long long) cached_blocks.size(), (long long) cached_bytes[device], (long long) live_blocks.size());
                }
                else
                {
                    // Free the returned allocation. Unlock.
                    if (locked) {
                        Unlock(&spin_lock);
                        locked = false;
                    }

                    // Set to specified device
                    if (CubDebug(error = cudaGetDevice(&entrypoint_device))) break;
                    if (CubDebug(error = cudaSetDevice(device))) break;

                    // Free device memory
                    if (CubDebug(error = cudaFree(d_ptr))) break;

                    if (debug) CubLog("\tdevice %d freed %lld bytes. %lld available blocks cached (%lld bytes), %lld live blocks outstanding.\n",
                        device, (long long) search_key.bytes, (long long) cached_blocks.size(), (long long) cached_bytes[device], (long long) live_blocks.size());
                }
            }
        } while (0);

        // Unlock
        if (locked) {
            Unlock(&spin_lock);
            locked = false;
        }

        // Attempt to revert back to entry-point device if necessary
        if (entrypoint_device != INVALID_DEVICE_ORDINAL)
        {
            if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error;
        }

        return error;

    #endif // CUB_PTX_ARCH
    }

    /**
     * Frees a live allocation of device memory on the current device,
     * returning it to the allocator.
     */
    cudaError_t DeviceFree(
        void* d_ptr)
    {
    #if (CUB_PTX_ARCH > 0)
        // Caching functionality only defined on host
        return CubDebug(cudaErrorInvalidConfiguration);
    #else

        int current_device;
        cudaError_t error = cudaSuccess;

        do {
            if (CubDebug(error = cudaGetDevice(&current_device))) break;
            if (CubDebug(error = DeviceFree(d_ptr, current_device))) break;
        } while(0);

        return error;

    #endif // CUB_PTX_ARCH
    }

    /**
     * Frees all cached device allocations on all devices.
     */
    cudaError_t FreeAllCached()
    {
    #if (CUB_PTX_ARCH > 0)
        // Caching functionality only defined on host
        return CubDebug(cudaErrorInvalidConfiguration);
    #else

        cudaError_t error = cudaSuccess;
        bool locked = false;
        int entrypoint_device = INVALID_DEVICE_ORDINAL;
        int current_device = INVALID_DEVICE_ORDINAL;

        // Lock
        if (!locked) {
            Lock(&spin_lock);
            locked = true;
        }

        while (!cached_blocks.empty())
        {
            // Get first block
            CachedBlocks::iterator begin = cached_blocks.begin();

            // Get entry-point device ordinal if necessary
            if (entrypoint_device == INVALID_DEVICE_ORDINAL)
            {
                if (CubDebug(error = cudaGetDevice(&entrypoint_device))) break;
            }

            // Set current device ordinal if necessary
            if (begin->device != current_device)
            {
                if (CubDebug(error = cudaSetDevice(begin->device))) break;
                current_device = begin->device;
            }

            // Free device memory
            if (CubDebug(error = cudaFree(begin->d_ptr))) break;

            // Reduce balance and erase entry (save the size first: the
            // iterator is invalid after erase())
            size_t block_bytes = begin->bytes;
            cached_bytes[current_device] -= block_bytes;
            cached_blocks.erase(begin);

            if (debug) CubLog("\tdevice %d freed %lld bytes. %lld available blocks cached (%lld bytes), %lld live blocks outstanding.\n",
                current_device, (long long) block_bytes, (long long) cached_blocks.size(), (long long) cached_bytes[current_device], (long long) live_blocks.size());
        }

        // Unlock
        if (locked) {
            Unlock(&spin_lock);
            locked = false;
        }

        // Attempt to revert back to entry-point device if necessary
        if (entrypoint_device != INVALID_DEVICE_ORDINAL)
        {
            if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error;
        }

        return error;

    #endif // CUB_PTX_ARCH
    }

    /**
     * Destructor
     */
    virtual ~CachingDeviceAllocator()
    {
        if (!skip_cleanup)
            FreeAllCached();
    }

};
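
// End-to-end usage sketch (illustrative only; everything other than the
// CachingDeviceAllocator interface defined above is assumed):
//
//   cub::CachingDeviceAllocator allocator;       // default 8/3/7 configuration
//
//   void *d_scratch = NULL;
//   if (allocator.DeviceAllocate(&d_scratch, 4096) == cudaSuccess)
//   {
//       // ... launch kernels that use d_scratch ...
//
//       // Returns the block to the cache, or cudaFrees it if the cache
//       // is already at its max_cached_bytes limit
//       allocator.DeviceFree(d_scratch);
//   }
//
//   // Optionally reclaim all cached blocks before the allocator is destroyed
//   allocator.FreeAllCached();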


/** @} */       // end group UtilMgmt

}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)