block_store.cuh
/******************************************************************************
 * Copyright (c) 2011, Duane Merrill.  All rights reserved.
 * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/**
 * \file
 * Operations for writing linear segments of data from the CUDA thread block
 */

#pragma once

#include <iterator>

#include "block_exchange.cuh"
#include "../util_ptx.cuh"
#include "../util_macro.cuh"
#include "../util_type.cuh"
#include "../util_namespace.cuh"

/// Optional outer namespace(s)
CUB_NS_PREFIX

/// CUB namespace
namespace cub {

/**
 * \addtogroup UtilIo
 * @{
 */

/******************************************************************
 * Blocked arrangement I/O (direct)
 ******************************************************************/

/**
 * Store a blocked arrangement of data across a thread block into a
 * linear segment of items.
 */
template <
    typename            T,
    int                 ITEMS_PER_THREAD,
    typename            OutputIterator>
__device__ __forceinline__ void StoreDirectBlocked(
    int                 linear_tid,
    OutputIterator      block_itr,
    T                   (&items)[ITEMS_PER_THREAD])
{
    // Store directly in thread-blocked order
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
    {
        block_itr[(linear_tid * ITEMS_PER_THREAD) + ITEM] = items[ITEM];
    }
}
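
/*
 * Usage sketch (illustrative, not part of the original listing): a
 * hypothetical 128-thread kernel in which each thread stores 4 consecutive
 * ints. Thread-0 writes d_out[0..3], thread-1 writes d_out[4..7], and so on.
 *
 *     __global__ void ExampleBlockedKernel(int *d_out)
 *     {
 *         int items[4];
 *         ...                                     // populate items
 *         StoreDirectBlocked(threadIdx.x, d_out, items);
 *     }
 */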

/**
 * Store a blocked arrangement of data across a thread block into a
 * linear segment of items, guarded by range.
 */
template <
    typename            T,
    int                 ITEMS_PER_THREAD,
    typename            OutputIterator>
__device__ __forceinline__ void StoreDirectBlocked(
    int                 linear_tid,
    OutputIterator      block_itr,
    T                   (&items)[ITEMS_PER_THREAD],
    int                 valid_items)
{
    // Store directly in thread-blocked order
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
    {
        if (ITEM + (linear_tid * ITEMS_PER_THREAD) < valid_items)
        {
            block_itr[(linear_tid * ITEMS_PER_THREAD) + ITEM] = items[ITEM];
        }
    }
}
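
/*
 * Usage sketch (illustrative): the guarded overload for a final, partially
 * full tile. With 4 items per thread and valid_items = 5, thread-0 writes
 * d_out[0..3], thread-1 writes only d_out[4], and all later threads write
 * nothing. num_valid is a hypothetical kernel parameter.
 *
 *     __global__ void ExampleGuardedKernel(int *d_out, int num_valid)
 *     {
 *         int items[4];
 *         ...                                     // populate items
 *         StoreDirectBlocked(threadIdx.x, d_out, items, num_valid);
 *     }
 */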

/**
 * Store a blocked arrangement of data across a thread block into a linear
 * segment of items using CUDA's built-in vectorized stores as a coalescing
 * optimization. The vectorization is only applied when ITEMS_PER_THREAD is
 * evenly divisible by a power-of-two vector width; the caller is responsible
 * for supplying a suitably aligned pointer.
 */
template <
    typename            T,
    int                 ITEMS_PER_THREAD>
__device__ __forceinline__ void StoreDirectBlockedVectorized(
    int                 linear_tid,
    T                   *block_ptr,
    T                   (&items)[ITEMS_PER_THREAD])
{
    enum
    {
        // Maximum CUDA vector size is 4 elements
        MAX_VEC_SIZE = CUB_MIN(4, ITEMS_PER_THREAD),

        // Vector size must be a power of two and an even divisor of the items per thread
        VEC_SIZE = ((((MAX_VEC_SIZE - 1) & MAX_VEC_SIZE) == 0) && ((ITEMS_PER_THREAD % MAX_VEC_SIZE) == 0)) ?
            MAX_VEC_SIZE :
            1,

        VECTORS_PER_THREAD = ITEMS_PER_THREAD / VEC_SIZE,
    };

    // Vector type
    typedef typename CubVector<T, VEC_SIZE>::Type Vector;

    // Alias global pointer
    Vector *block_ptr_vectors = reinterpret_cast<Vector *>(block_ptr);

    // Alias pointers (use "raw" array here which should get optimized away to prevent conservative PTXAS lmem spilling)
    Vector raw_vector[VECTORS_PER_THREAD];
    T *raw_items = reinterpret_cast<T*>(raw_vector);

    // Copy
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
    {
        raw_items[ITEM] = items[ITEM];
    }

    // Direct-store using vector types
    StoreDirectBlocked(linear_tid, block_ptr_vectors, raw_vector);
}
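
/*
 * Usage sketch (illustrative): identical call shape to StoreDirectBlocked,
 * but requires a raw pointer rather than an iterator. With T = int and
 * ITEMS_PER_THREAD = 4, VEC_SIZE resolves to 4 and each thread issues one
 * int4 store; d_out is assumed to be vector-aligned. If ITEMS_PER_THREAD
 * were 3 (not a power-of-two multiple), VEC_SIZE falls back to 1 and the
 * stores degenerate to scalar.
 *
 *     __global__ void ExampleVectorizedKernel(int *d_out)
 *     {
 *         int items[4];
 *         ...                                     // populate items
 *         StoreDirectBlockedVectorized(threadIdx.x, d_out, items);
 *     }
 */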

/******************************************************************
 * Striped arrangement I/O (direct)
 ******************************************************************/

/**
 * Store a striped arrangement of data across a thread block into a
 * linear segment of items.
 */
template <
    int                 BLOCK_THREADS,
    typename            T,
    int                 ITEMS_PER_THREAD,
    typename            OutputIterator>
__device__ __forceinline__ void StoreDirectStriped(
    int                 linear_tid,
    OutputIterator      block_itr,
    T                   (&items)[ITEMS_PER_THREAD])
{
    // Store directly in striped order
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
    {
        block_itr[(ITEM * BLOCK_THREADS) + linear_tid] = items[ITEM];
    }
}
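
/*
 * Usage sketch (illustrative): a hypothetical 128-thread kernel storing a
 * striped arrangement of 4 ints per thread. Thread-0 writes d_out[0],
 * d_out[128], d_out[256], d_out[384]; thread-1 writes d_out[1], d_out[129],
 * etc. Consecutive threads touch consecutive addresses on each iteration,
 * so the writes coalesce naturally.
 *
 *     __global__ void ExampleStripedKernel(int *d_out)
 *     {
 *         int items[4];
 *         ...                                     // populate items
 *         StoreDirectStriped<128>(threadIdx.x, d_out, items);
 *     }
 */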

/**
 * Store a striped arrangement of data across a thread block into a
 * linear segment of items, guarded by range.
 */
template <
    int                 BLOCK_THREADS,
    typename            T,
    int                 ITEMS_PER_THREAD,
    typename            OutputIterator>
__device__ __forceinline__ void StoreDirectStriped(
    int                 linear_tid,
    OutputIterator      block_itr,
    T                   (&items)[ITEMS_PER_THREAD],
    int                 valid_items)
{
    // Store directly in striped order
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
    {
        if ((ITEM * BLOCK_THREADS) + linear_tid < valid_items)
        {
            block_itr[(ITEM * BLOCK_THREADS) + linear_tid] = items[ITEM];
        }
    }
}
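
/*
 * Usage sketch (illustrative): the guarded striped overload. With 128
 * threads, 4 items per thread, and valid_items = 130, thread-0 writes
 * d_out[0] and d_out[128], thread-1 writes d_out[1] and d_out[129], and
 * every other thread writes only its ITEM-0 element.
 *
 *     StoreDirectStriped<128>(threadIdx.x, d_out, items, 130);
 */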

/******************************************************************
 * Warp-striped arrangement I/O (direct)
 ******************************************************************/

/**
 * Store a warp-striped arrangement of data across a thread block into a
 * linear segment of items.
 */
template <
    typename            T,
    int                 ITEMS_PER_THREAD,
    typename            OutputIterator>
__device__ __forceinline__ void StoreDirectWarpStriped(
    int                 linear_tid,
    OutputIterator      block_itr,
    T                   (&items)[ITEMS_PER_THREAD])
{
    int tid         = linear_tid & (CUB_PTX_WARP_THREADS - 1);
    int wid         = linear_tid >> CUB_PTX_LOG_WARP_THREADS;
    int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD;

    // Store directly in warp-striped order
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
    {
        block_itr[warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS)] = items[ITEM];
    }
}
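
/*
 * Usage sketch (illustrative): a hypothetical 64-thread kernel (two warps of
 * 32) storing a warp-striped arrangement of 4 ints per thread. Warp-0 owns
 * d_out[0..127]; within it, lane-0 writes d_out[0], d_out[32], d_out[64],
 * d_out[96]. Warp-1 owns d_out[128..255] with the same lane pattern.
 *
 *     __global__ void ExampleWarpStripedKernel(int *d_out)
 *     {
 *         int items[4];
 *         ...                                     // populate items
 *         StoreDirectWarpStriped(threadIdx.x, d_out, items);
 *     }
 */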

/**
 * Store a warp-striped arrangement of data across a thread block into a
 * linear segment of items, guarded by range.
 */
template <
    typename            T,
    int                 ITEMS_PER_THREAD,
    typename            OutputIterator>
__device__ __forceinline__ void StoreDirectWarpStriped(
    int                 linear_tid,
    OutputIterator      block_itr,
    T                   (&items)[ITEMS_PER_THREAD],
    int                 valid_items)
{
    int tid         = linear_tid & (CUB_PTX_WARP_THREADS - 1);
    int wid         = linear_tid >> CUB_PTX_LOG_WARP_THREADS;
    int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD;

    // Store directly in warp-striped order
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
    {
        if (warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS) < valid_items)
        {
            block_itr[warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS)] = items[ITEM];
        }
    }
}
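
/*
 * Usage sketch (illustrative): the guarded warp-striped overload simply
 * skips any slot whose flattened offset falls at or beyond valid_items, e.g.
 *
 *     StoreDirectWarpStriped(threadIdx.x, d_out, items, num_valid);
 *
 * where num_valid is a hypothetical count of remaining outputs.
 */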

/** @} */  // end group UtilIo


//-----------------------------------------------------------------------------
// Generic BlockStore abstraction
//-----------------------------------------------------------------------------

/**
 * cub::BlockStoreAlgorithm enumerates alternative algorithms for cub::BlockStore
 * to write a blocked arrangement of items across a CUDA thread block to a
 * linear segment of memory.
 */
enum BlockStoreAlgorithm
{
    /// A blocked arrangement of data is written directly to memory.
    BLOCK_STORE_DIRECT,

    /// A blocked arrangement of data is written directly to memory using CUDA's
    /// built-in vectorized stores as a coalescing optimization.
    BLOCK_STORE_VECTORIZE,

    /// A blocked arrangement is locally transposed into a striped arrangement
    /// which is then written to memory.
    BLOCK_STORE_TRANSPOSE,

    /// A blocked arrangement is locally transposed into a warp-striped
    /// arrangement which is then written to memory.
    BLOCK_STORE_WARP_TRANSPOSE,
};
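
/*
 * Illustrative note (not from the original source): the algorithm is selected
 * at compile time through BlockStore's ALGORITHM template parameter, e.g.
 *
 *     typedef BlockStore<int*, 128, 4, BLOCK_STORE_WARP_TRANSPOSE> BlockStoreT;
 */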

/**
 * The BlockStore class provides collective data movement methods for writing
 * a blocked arrangement of items partitioned across a CUDA thread block to a
 * linear segment of memory.
 */
template <
    typename                OutputIterator,
    int                     BLOCK_DIM_X,
    int                     ITEMS_PER_THREAD,
    BlockStoreAlgorithm     ALGORITHM = BLOCK_STORE_DIRECT,
    bool                    WARP_TIME_SLICING = false,
    int                     BLOCK_DIM_Y = 1,
    int                     BLOCK_DIM_Z = 1,
    int                     PTX_ARCH = CUB_PTX_ARCH>
class BlockStore
{
private:

    /******************************************************************************
     * Constants and type definitions
     ******************************************************************************/

    /// Constants
    enum
    {
        /// The thread block size in threads
        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
    };

    // Data type of output iterator
    typedef typename std::iterator_traits<OutputIterator>::value_type T;

    /******************************************************************************
     * Algorithmic variants
     ******************************************************************************/

    /// Store helper
    template <BlockStoreAlgorithm _POLICY, int DUMMY>
    struct StoreInternal;

    /**
     * BLOCK_STORE_DIRECT specialization of store helper
     */
    template <int DUMMY>
    struct StoreInternal<BLOCK_STORE_DIRECT, DUMMY>
    {
        /// Shared memory storage layout type
        typedef NullType TempStorage;

        /// Linear thread-id
        int linear_tid;

        /// Constructor
        __device__ __forceinline__ StoreInternal(
            TempStorage &temp_storage,
            int linear_tid)
        :
            linear_tid(linear_tid)
        {}

        /// Store items into a linear segment of memory
        __device__ __forceinline__ void Store(
            OutputIterator  block_itr,
            T               (&items)[ITEMS_PER_THREAD])
        {
            StoreDirectBlocked(linear_tid, block_itr, items);
        }

        /// Store items into a linear segment of memory, guarded by range
        __device__ __forceinline__ void Store(
            OutputIterator  block_itr,
            T               (&items)[ITEMS_PER_THREAD],
            int             valid_items)
        {
            StoreDirectBlocked(linear_tid, block_itr, items, valid_items);
        }
    };


    /**
     * BLOCK_STORE_VECTORIZE specialization of store helper
     */
    template <int DUMMY>
    struct StoreInternal<BLOCK_STORE_VECTORIZE, DUMMY>
    {
        /// Shared memory storage layout type
        typedef NullType TempStorage;

        /// Linear thread-id
        int linear_tid;

        /// Constructor
        __device__ __forceinline__ StoreInternal(
            TempStorage &temp_storage,
            int linear_tid)
        :
            linear_tid(linear_tid)
        {}

        /// Store items into a linear segment of memory, specialized for native
        /// pointer types (attempts vectorization)
        __device__ __forceinline__ void Store(
            T               *block_ptr,
            T               (&items)[ITEMS_PER_THREAD])
        {
            StoreDirectBlockedVectorized(linear_tid, block_ptr, items);
        }

        /// Store items into a linear segment of memory, specialized for opaque
        /// iterator types (skips vectorization)
        template <typename _OutputIterator>
        __device__ __forceinline__ void Store(
            _OutputIterator block_itr,
            T               (&items)[ITEMS_PER_THREAD])
        {
            StoreDirectBlocked(linear_tid, block_itr, items);
        }

        /// Store items into a linear segment of memory, guarded by range
        __device__ __forceinline__ void Store(
            OutputIterator  block_itr,
            T               (&items)[ITEMS_PER_THREAD],
            int             valid_items)
        {
            StoreDirectBlocked(linear_tid, block_itr, items, valid_items);
        }
    };

    /**
     * BLOCK_STORE_TRANSPOSE specialization of store helper
     */
    template <int DUMMY>
    struct StoreInternal<BLOCK_STORE_TRANSPOSE, DUMMY>
    {
        // BlockExchange utility type for keys
        typedef BlockExchange<T, BLOCK_DIM_X, ITEMS_PER_THREAD, WARP_TIME_SLICING, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;

        /// Shared memory storage layout type
        typedef typename BlockExchange::TempStorage _TempStorage;

        /// Alias wrapper allowing storage to be unioned
        struct TempStorage : Uninitialized<_TempStorage> {};

        /// Thread reference to shared storage
        _TempStorage &temp_storage;

        /// Linear thread-id
        int linear_tid;

        /// Constructor
        __device__ __forceinline__ StoreInternal(
            TempStorage &temp_storage,
            int linear_tid)
        :
            temp_storage(temp_storage.Alias()),
            linear_tid(linear_tid)
        {}

        /// Store items into a linear segment of memory
        __device__ __forceinline__ void Store(
            OutputIterator  block_itr,
            T               (&items)[ITEMS_PER_THREAD])
        {
            BlockExchange(temp_storage).BlockedToStriped(items);
            StoreDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items);
        }

        /// Store items into a linear segment of memory, guarded by range
        __device__ __forceinline__ void Store(
            OutputIterator  block_itr,
            T               (&items)[ITEMS_PER_THREAD],
            int             valid_items)
        {
            BlockExchange(temp_storage).BlockedToStriped(items);
            StoreDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items, valid_items);
        }
    };


    /**
     * BLOCK_STORE_WARP_TRANSPOSE specialization of store helper
     */
    template <int DUMMY>
    struct StoreInternal<BLOCK_STORE_WARP_TRANSPOSE, DUMMY>
    {
        enum
        {
            WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH)
        };

        // Assert BLOCK_THREADS must be a multiple of WARP_THREADS
        CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS");

        // BlockExchange utility type for keys
        typedef BlockExchange<T, BLOCK_DIM_X, ITEMS_PER_THREAD, WARP_TIME_SLICING, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;

        /// Shared memory storage layout type
        typedef typename BlockExchange::TempStorage _TempStorage;

        /// Alias wrapper allowing storage to be unioned
        struct TempStorage : Uninitialized<_TempStorage> {};

        /// Thread reference to shared storage
        _TempStorage &temp_storage;

        /// Linear thread-id
        int linear_tid;

        /// Constructor
        __device__ __forceinline__ StoreInternal(
            TempStorage &temp_storage,
            int linear_tid)
        :
            temp_storage(temp_storage.Alias()),
            linear_tid(linear_tid)
        {}

        /// Store items into a linear segment of memory
        __device__ __forceinline__ void Store(
            OutputIterator  block_itr,
            T               (&items)[ITEMS_PER_THREAD])
        {
            BlockExchange(temp_storage).BlockedToWarpStriped(items);
            StoreDirectWarpStriped(linear_tid, block_itr, items);
        }

        /// Store items into a linear segment of memory, guarded by range
        __device__ __forceinline__ void Store(
            OutputIterator  block_itr,
            T               (&items)[ITEMS_PER_THREAD],
            int             valid_items)
        {
            BlockExchange(temp_storage).BlockedToWarpStriped(items);
            StoreDirectWarpStriped(linear_tid, block_itr, items, valid_items);
        }
    };

    /******************************************************************************
     * Type definitions
     ******************************************************************************/

    /// Internal store implementation to use
    typedef StoreInternal<ALGORITHM, 0> InternalStore;

    /// Shared memory storage layout type
    typedef typename InternalStore::TempStorage _TempStorage;


    /******************************************************************************
     * Utility methods
     ******************************************************************************/

    /// Internal storage allocator
    __device__ __forceinline__ _TempStorage& PrivateStorage()
    {
        __shared__ _TempStorage private_storage;
        return private_storage;
    }


    /******************************************************************************
     * Thread fields
     ******************************************************************************/

    /// Thread reference to shared storage
    _TempStorage &temp_storage;

    /// Linear thread-id
    int linear_tid;

public:

    /// The temporary storage type required by BlockStore for thread communication
    struct TempStorage : Uninitialized<_TempStorage> {};


    /******************************************************************
     * Collective constructors
     ******************************************************************/

    /// Collective constructor using a private static allocation of shared
    /// memory as temporary storage.
    __device__ __forceinline__ BlockStore()
    :
        temp_storage(PrivateStorage()),
        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
    {}

    /// Collective constructor using the specified memory allocation as
    /// temporary storage.
    __device__ __forceinline__ BlockStore(
        TempStorage &temp_storage)
    :
        temp_storage(temp_storage.Alias()),
        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
    {}


    /******************************************************************
     * Data movement
     ******************************************************************/

    /// Store items into a linear segment of memory.
    __device__ __forceinline__ void Store(
        OutputIterator  block_itr,
        T               (&items)[ITEMS_PER_THREAD])
    {
        InternalStore(temp_storage, linear_tid).Store(block_itr, items);
    }

    /// Store items into a linear segment of memory, guarded by range.
    __device__ __forceinline__ void Store(
        OutputIterator  block_itr,
        T               (&items)[ITEMS_PER_THREAD],
        int             valid_items)
    {
        InternalStore(temp_storage, linear_tid).Store(block_itr, items, valid_items);
    }
};
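
/*
 * Usage sketch (illustrative, not part of the original listing): a
 * hypothetical kernel that stores 512 ints (128 threads owning 4 consecutive
 * ints each) using the warp-transpose algorithm, reserving the required
 * temporary storage in shared memory.
 *
 *     __global__ void ExampleBlockStoreKernel(int *d_out)
 *     {
 *         // Specialize BlockStore for 128 threads owning 4 ints each
 *         typedef BlockStore<int*, 128, 4, BLOCK_STORE_WARP_TRANSPOSE> BlockStoreT;
 *
 *         // Allocate shared memory for BlockStore
 *         __shared__ typename BlockStoreT::TempStorage temp_storage;
 *
 *         // Obtain a blocked arrangement of items across threads
 *         int thread_data[4];
 *         ...
 *
 *         // Collectively store items into d_out
 *         BlockStoreT(temp_storage).Store(d_out, thread_data);
 *     }
 */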

}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)