CUB: block_load.cuh
/******************************************************************************
 * Copyright (c) 2011, Duane Merrill.  All rights reserved.
 * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/**
 * \file
 * Operations for reading linear segments of data from memory
 */

#pragma once

#include <iterator>

#include "block_exchange.cuh"
#include "../util_ptx.cuh"
#include "../util_macro.cuh"
#include "../util_type.cuh"
#include "../util_namespace.cuh"

/// Optional outer namespace(s)
CUB_NS_PREFIX

/// CUB namespace
namespace cub {

/******************************************************************//**
 * \name Blocked arrangement I/O (direct)
 *********************************************************************/

/// Load a linear segment of items into a blocked arrangement across the thread block.
template <
    typename        T,
    int             ITEMS_PER_THREAD,
    typename        InputIterator>
__device__ __forceinline__ void LoadDirectBlocked(
    int             linear_tid,
    InputIterator   block_itr,
    T               (&items)[ITEMS_PER_THREAD])
{
    // Load directly in thread-blocked order
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
    {
        items[ITEM] = block_itr[(linear_tid * ITEMS_PER_THREAD) + ITEM];
    }
}


/// Load a linear segment of items into a blocked arrangement across the thread block,
/// guarded by range.
template <
    typename        T,
    int             ITEMS_PER_THREAD,
    typename        InputIterator>
__device__ __forceinline__ void LoadDirectBlocked(
    int             linear_tid,
    InputIterator   block_itr,
    T               (&items)[ITEMS_PER_THREAD],
    int             valid_items)
{
    // Number of items still in-range for this thread's segment
    int bounds = valid_items - (linear_tid * ITEMS_PER_THREAD);

    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
    {
        if (ITEM < bounds)
        {
            items[ITEM] = block_itr[(linear_tid * ITEMS_PER_THREAD) + ITEM];
        }
    }
}


/// Load a linear segment of items into a blocked arrangement across the thread block,
/// guarded by range, with a fall-back assignment of out-of-bound elements.
template <
    typename        T,
    int             ITEMS_PER_THREAD,
    typename        InputIterator>
__device__ __forceinline__ void LoadDirectBlocked(
    int             linear_tid,
    InputIterator   block_itr,
    T               (&items)[ITEMS_PER_THREAD],
    int             valid_items,
    T               oob_default)
{
    // Assign the out-of-bounds default to all items, then load the in-range subset
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
    {
        items[ITEM] = oob_default;
    }

    LoadDirectBlocked(linear_tid, block_itr, items, valid_items);
}

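/*
 * Editor's sketch (not part of the original header): how the three LoadDirectBlocked
 * overloads are typically invoked from a kernel.  The kernel name, the 4-item
 * granularity, and the -1 default are illustrative assumptions.
 *
 *     __global__ void ExampleKernel(int *d_in, int num_items)
 *     {
 *         int thread_data[4];
 *
 *         // Unguarded: each thread reads its own 4 consecutive items
 *         cub::LoadDirectBlocked(threadIdx.x, d_in, thread_data);
 *
 *         // Guarded: items past num_items are simply left unwritten
 *         cub::LoadDirectBlocked(threadIdx.x, d_in, thread_data, num_items);
 *
 *         // Guarded with fall-back: items past num_items are set to -1
 *         cub::LoadDirectBlocked(threadIdx.x, d_in, thread_data, num_items, -1);
 *     }
 */
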
/// Load a linear segment of items into a blocked arrangement across the thread block,
/// using vectorized loads when the element count and type permit.
template <
    typename        T,
    int             ITEMS_PER_THREAD>
__device__ __forceinline__ void LoadDirectBlockedVectorized(
    int             linear_tid,
    T               *block_ptr,
    T               (&items)[ITEMS_PER_THREAD])
{
    enum
    {
        // Maximum CUDA vector size is 4 elements
        MAX_VEC_SIZE = CUB_MIN(4, ITEMS_PER_THREAD),

        // Vector size must be a power of two and an even divisor of the items per thread
        VEC_SIZE = ((((MAX_VEC_SIZE - 1) & MAX_VEC_SIZE) == 0) && ((ITEMS_PER_THREAD % MAX_VEC_SIZE) == 0)) ?
            MAX_VEC_SIZE :
            1,

        VECTORS_PER_THREAD = ITEMS_PER_THREAD / VEC_SIZE,
    };

    // Vector type
    typedef typename CubVector<T, VEC_SIZE>::Type Vector;

    // Vector items
    Vector vec_items[VECTORS_PER_THREAD];

    // Aliased input ptr
    Vector *ptr = reinterpret_cast<Vector*>(block_ptr + (linear_tid * VEC_SIZE * VECTORS_PER_THREAD));

    // Load directly in thread-blocked order
    #pragma unroll
    for (int ITEM = 0; ITEM < VECTORS_PER_THREAD; ITEM++)
    {
        vec_items[ITEM] = ptr[ITEM];
    }

    // Copy the vector lanes back out as scalar items
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
    {
        items[ITEM] = reinterpret_cast<T*>(vec_items)[ITEM];
    }
}

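/*
 * Editor's note (sketch, not in the original header): VEC_SIZE resolves to 4 when
 * ITEMS_PER_THREAD is a multiple of 4, to 2 when ITEMS_PER_THREAD is 2, and
 * otherwise to 1, in which case the routine degenerates to per-item loads.  The
 * pointer must also be suitably aligned for the vector type.  Illustrative call:
 *
 *     float thread_data[4];
 *     // Compiles to a single float4 load per thread (d_in is a hypothetical float*)
 *     cub::LoadDirectBlockedVectorized(threadIdx.x, d_in, thread_data);
 */
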
/******************************************************************//**
 * \name Striped arrangement I/O (direct)
 *********************************************************************/

/// Load a linear segment of items into a striped arrangement across the thread block.
template <
    int             BLOCK_THREADS,
    typename        T,
    int             ITEMS_PER_THREAD,
    typename        InputIterator>
__device__ __forceinline__ void LoadDirectStriped(
    int             linear_tid,
    InputIterator   block_itr,
    T               (&items)[ITEMS_PER_THREAD])
{
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
    {
        items[ITEM] = block_itr[(ITEM * BLOCK_THREADS) + linear_tid];
    }
}


/// Load a linear segment of items into a striped arrangement across the thread block,
/// guarded by range.
template <
    int             BLOCK_THREADS,
    typename        T,
    int             ITEMS_PER_THREAD,
    typename        InputIterator>
__device__ __forceinline__ void LoadDirectStriped(
    int             linear_tid,
    InputIterator   block_itr,
    T               (&items)[ITEMS_PER_THREAD],
    int             valid_items)
{
    // Remaining in-range distance from this thread's first item
    int bounds = valid_items - linear_tid;

    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
    {
        if (ITEM * BLOCK_THREADS < bounds)
        {
            items[ITEM] = block_itr[linear_tid + (ITEM * BLOCK_THREADS)];
        }
    }
}


/// Load a linear segment of items into a striped arrangement across the thread block,
/// guarded by range, with a fall-back assignment of out-of-bound elements.
template <
    int             BLOCK_THREADS,
    typename        T,
    int             ITEMS_PER_THREAD,
    typename        InputIterator>
__device__ __forceinline__ void LoadDirectStriped(
    int             linear_tid,
    InputIterator   block_itr,
    T               (&items)[ITEMS_PER_THREAD],
    int             valid_items,
    T               oob_default)
{
    // Assign the out-of-bounds default to all items, then load the in-range subset
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
    {
        items[ITEM] = oob_default;
    }

    LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items, valid_items);
}

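/*
 * Editor's sketch (not part of the original header): a striped load with a
 * 128-thread block and 4 items per thread.  Thread t receives items t, t+128,
 * t+256, and t+384, which keeps each global read fully coalesced.
 *
 *     __global__ void ExampleKernel(int *d_in, int num_items)
 *     {
 *         int thread_data[4];
 *         cub::LoadDirectStriped<128>(threadIdx.x, d_in, thread_data, num_items);
 *     }
 */
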
/******************************************************************//**
 * \name Warp-striped arrangement I/O (direct)
 *********************************************************************/

/// Load a linear segment of items into a warp-striped arrangement across the thread block.
template <
    typename        T,
    int             ITEMS_PER_THREAD,
    typename        InputIterator>
__device__ __forceinline__ void LoadDirectWarpStriped(
    int             linear_tid,
    InputIterator   block_itr,
    T               (&items)[ITEMS_PER_THREAD])
{
    int tid         = linear_tid & (CUB_PTX_WARP_THREADS - 1);
    int wid         = linear_tid >> CUB_PTX_LOG_WARP_THREADS;
    int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD;

    // Load directly in warp-striped order
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
    {
        items[ITEM] = block_itr[warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS)];
    }
}


/// Load a linear segment of items into a warp-striped arrangement across the thread block,
/// guarded by range.
template <
    typename        T,
    int             ITEMS_PER_THREAD,
    typename        InputIterator>
__device__ __forceinline__ void LoadDirectWarpStriped(
    int             linear_tid,
    InputIterator   block_itr,
    T               (&items)[ITEMS_PER_THREAD],
    int             valid_items)
{
    int tid         = linear_tid & (CUB_PTX_WARP_THREADS - 1);
    int wid         = linear_tid >> CUB_PTX_LOG_WARP_THREADS;
    int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD;
    int bounds      = valid_items - warp_offset - tid;

    // Load directly in warp-striped order
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
    {
        if ((ITEM * CUB_PTX_WARP_THREADS) < bounds)
        {
            items[ITEM] = block_itr[warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS)];
        }
    }
}


/// Load a linear segment of items into a warp-striped arrangement across the thread block,
/// guarded by range, with a fall-back assignment of out-of-bound elements.
template <
    typename        T,
    int             ITEMS_PER_THREAD,
    typename        InputIterator>
__device__ __forceinline__ void LoadDirectWarpStriped(
    int             linear_tid,
    InputIterator   block_itr,
    T               (&items)[ITEMS_PER_THREAD],
    int             valid_items,
    T               oob_default)
{
    // Assign the out-of-bounds default to all items, then load the in-range subset
    #pragma unroll
    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
    {
        items[ITEM] = oob_default;
    }

    LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items);
}

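/*
 * Editor's note (sketch, not in the original header): in the warp-striped layout
 * each warp owns a contiguous slice of CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD
 * items, and lane l of warp w reads items warp_offset + l, warp_offset + l + 32,
 * and so on.  Reads stay coalesced without any cross-warp coordination, e.g.:
 *
 *     int thread_data[4];
 *     cub::LoadDirectWarpStriped(threadIdx.x, d_in, thread_data);
 */
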
//@}  // end group UtilIo


//-----------------------------------------------------------------------------
// Generic BlockLoad abstraction
//-----------------------------------------------------------------------------

/// Algorithmic variants of cub::BlockLoad
enum BlockLoadAlgorithm
{
    /// Read a blocked arrangement directly, one load per item
    BLOCK_LOAD_DIRECT,

    /// Read a blocked arrangement using vectorized loads where possible
    BLOCK_LOAD_VECTORIZE,

    /// Read a striped arrangement, then transpose it to a blocked arrangement in shared memory
    BLOCK_LOAD_TRANSPOSE,

    /// Read a warp-striped arrangement, then transpose it to a blocked arrangement in shared memory
    BLOCK_LOAD_WARP_TRANSPOSE,
};

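/*
 * Editor's note (summary, an editorial addition): BLOCK_LOAD_DIRECT needs no shared
 * memory but issues one scalar load per item; BLOCK_LOAD_VECTORIZE fuses those into
 * vector loads when given a raw, aligned pointer; BLOCK_LOAD_TRANSPOSE and
 * BLOCK_LOAD_WARP_TRANSPOSE trade shared-memory traffic (via BlockExchange) for
 * coalesced global reads regardless of ITEMS_PER_THREAD.
 */
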
/// \brief BlockLoad provides collective data movement methods for loading a linear
/// segment of items from memory into a blocked arrangement across a CUDA thread block.
template <
    typename            InputIterator,
    int                 BLOCK_DIM_X,
    int                 ITEMS_PER_THREAD,
    BlockLoadAlgorithm  ALGORITHM           = BLOCK_LOAD_DIRECT,
    bool                WARP_TIME_SLICING   = false,
    int                 BLOCK_DIM_Y         = 1,
    int                 BLOCK_DIM_Z         = 1,
    int                 PTX_ARCH            = CUB_PTX_ARCH>
class BlockLoad
{
private:

    /******************************************************************************
     * Constants and type definitions
     ******************************************************************************/

    /// Constants
    enum
    {
        /// The thread block size in threads
        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
    };

    // Data type of input iterator
    typedef typename std::iterator_traits<InputIterator>::value_type T;


    /******************************************************************************
     * Algorithmic variants
     ******************************************************************************/

    /// Load helper, specialized per algorithm
    template <BlockLoadAlgorithm _POLICY, int DUMMY>
    struct LoadInternal;

    /// BLOCK_LOAD_DIRECT specialization of the load helper
    template <int DUMMY>
    struct LoadInternal<BLOCK_LOAD_DIRECT, DUMMY>
    {
        /// Shared memory storage layout type (none required)
        typedef NullType TempStorage;

        /// Linear thread-id
        int linear_tid;

        /// Constructor
        __device__ __forceinline__ LoadInternal(
            TempStorage &temp_storage,
            int linear_tid)
        :
            linear_tid(linear_tid)
        {}

        /// Load a linear segment of items from memory
        __device__ __forceinline__ void Load(
            InputIterator   block_itr,
            T               (&items)[ITEMS_PER_THREAD])
        {
            LoadDirectBlocked(linear_tid, block_itr, items);
        }

        /// Load a linear segment of items from memory, guarded by range
        __device__ __forceinline__ void Load(
            InputIterator   block_itr,
            T               (&items)[ITEMS_PER_THREAD],
            int             valid_items)
        {
            LoadDirectBlocked(linear_tid, block_itr, items, valid_items);
        }

        /// Load a linear segment of items from memory, guarded by range, with a
        /// fall-back assignment of out-of-bound elements
        __device__ __forceinline__ void Load(
            InputIterator   block_itr,
            T               (&items)[ITEMS_PER_THREAD],
            int             valid_items,
            T               oob_default)
        {
            LoadDirectBlocked(linear_tid, block_itr, items, valid_items, oob_default);
        }

    };


    /// BLOCK_LOAD_VECTORIZE specialization of the load helper
    template <int DUMMY>
    struct LoadInternal<BLOCK_LOAD_VECTORIZE, DUMMY>
    {
        /// Shared memory storage layout type (none required)
        typedef NullType TempStorage;

        /// Linear thread-id
        int linear_tid;

        /// Constructor
        __device__ __forceinline__ LoadInternal(
            TempStorage &temp_storage,
            int linear_tid)
        :
            linear_tid(linear_tid)
        {}

        /// Load a linear segment of items from memory, specialized for native
        /// pointer types (attempts vectorization)
        __device__ __forceinline__ void Load(
            T               *block_ptr,
            T               (&items)[ITEMS_PER_THREAD])
        {
            LoadDirectBlockedVectorized(linear_tid, block_ptr, items);
        }

        /// Load a linear segment of items from memory, specialized for opaque
        /// input iterators (skips vectorization)
        template <
            typename T,
            typename _InputIterator>
        __device__ __forceinline__ void Load(
            _InputIterator  block_itr,
            T               (&items)[ITEMS_PER_THREAD])
        {
            LoadDirectBlocked(linear_tid, block_itr, items);
        }

        /// Load a linear segment of items from memory, guarded by range
        __device__ __forceinline__ void Load(
            InputIterator   block_itr,
            T               (&items)[ITEMS_PER_THREAD],
            int             valid_items)
        {
            LoadDirectBlocked(linear_tid, block_itr, items, valid_items);
        }

        /// Load a linear segment of items from memory, guarded by range, with a
        /// fall-back assignment of out-of-bound elements
        __device__ __forceinline__ void Load(
            InputIterator   block_itr,
            T               (&items)[ITEMS_PER_THREAD],
            int             valid_items,
            T               oob_default)
        {
            LoadDirectBlocked(linear_tid, block_itr, items, valid_items, oob_default);
        }

    };

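    /*
     * Editor's note (an editorial addition): in the specialization above, overload
     * resolution prefers the non-template T* Load when the caller passes a raw
     * pointer, which is what enables vectorized loads; any other iterator type
     * falls through to the templated overload and loads items one at a time.
     */
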
    /// BLOCK_LOAD_TRANSPOSE specialization of the load helper
    template <int DUMMY>
    struct LoadInternal<BLOCK_LOAD_TRANSPOSE, DUMMY>
    {
        // BlockExchange utility type for keys
        typedef BlockExchange<T, BLOCK_DIM_X, ITEMS_PER_THREAD, WARP_TIME_SLICING, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;

        /// Shared memory storage layout type
        typedef typename BlockExchange::TempStorage _TempStorage;

        /// Alias wrapper allowing storage to be unioned
        struct TempStorage : Uninitialized<_TempStorage> {};

        /// Thread reference to shared storage
        _TempStorage &temp_storage;

        /// Linear thread-id
        int linear_tid;

        /// Constructor
        __device__ __forceinline__ LoadInternal(
            TempStorage &temp_storage,
            int linear_tid)
        :
            temp_storage(temp_storage.Alias()),
            linear_tid(linear_tid)
        {}

        /// Load a linear segment of items from memory
        __device__ __forceinline__ void Load(
            InputIterator   block_itr,
            T               (&items)[ITEMS_PER_THREAD])
        {
            LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items);
            BlockExchange(temp_storage).StripedToBlocked(items);
        }

        /// Load a linear segment of items from memory, guarded by range
        __device__ __forceinline__ void Load(
            InputIterator   block_itr,
            T               (&items)[ITEMS_PER_THREAD],
            int             valid_items)
        {
            LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items, valid_items);
            BlockExchange(temp_storage).StripedToBlocked(items);
        }

        /// Load a linear segment of items from memory, guarded by range, with a
        /// fall-back assignment of out-of-bound elements
        __device__ __forceinline__ void Load(
            InputIterator   block_itr,
            T               (&items)[ITEMS_PER_THREAD],
            int             valid_items,
            T               oob_default)
        {
            LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items, valid_items, oob_default);
            BlockExchange(temp_storage).StripedToBlocked(items);
        }

    };


    /// BLOCK_LOAD_WARP_TRANSPOSE specialization of the load helper
    template <int DUMMY>
    struct LoadInternal<BLOCK_LOAD_WARP_TRANSPOSE, DUMMY>
    {
        enum
        {
            WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH)
        };

        // Assert BLOCK_THREADS must be a multiple of WARP_THREADS
        CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS");

        // BlockExchange utility type for keys
        typedef BlockExchange<T, BLOCK_DIM_X, ITEMS_PER_THREAD, WARP_TIME_SLICING, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;

        /// Shared memory storage layout type
        typedef typename BlockExchange::TempStorage _TempStorage;

        /// Alias wrapper allowing storage to be unioned
        struct TempStorage : Uninitialized<_TempStorage> {};

        /// Thread reference to shared storage
        _TempStorage &temp_storage;

        /// Linear thread-id
        int linear_tid;

        /// Constructor
        __device__ __forceinline__ LoadInternal(
            TempStorage &temp_storage,
            int linear_tid)
        :
            temp_storage(temp_storage.Alias()),
            linear_tid(linear_tid)
        {}

        /// Load a linear segment of items from memory
        __device__ __forceinline__ void Load(
            InputIterator   block_itr,
            T               (&items)[ITEMS_PER_THREAD])
        {
            LoadDirectWarpStriped(linear_tid, block_itr, items);
            BlockExchange(temp_storage).WarpStripedToBlocked(items);
        }

        /// Load a linear segment of items from memory, guarded by range
        __device__ __forceinline__ void Load(
            InputIterator   block_itr,
            T               (&items)[ITEMS_PER_THREAD],
            int             valid_items)
        {
            LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items);
            BlockExchange(temp_storage).WarpStripedToBlocked(items);
        }

        /// Load a linear segment of items from memory, guarded by range, with a
        /// fall-back assignment of out-of-bound elements
        __device__ __forceinline__ void Load(
            InputIterator   block_itr,
            T               (&items)[ITEMS_PER_THREAD],
            int             valid_items,
            T               oob_default)
        {
            LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items, oob_default);
            BlockExchange(temp_storage).WarpStripedToBlocked(items);
        }
    };


    /******************************************************************************
     * Type definitions
     ******************************************************************************/

    /// Internal load implementation to use
    typedef LoadInternal<ALGORITHM, 0> InternalLoad;

    /// Shared memory storage layout type
    typedef typename InternalLoad::TempStorage _TempStorage;


    /******************************************************************************
     * Utility methods
     ******************************************************************************/

    /// Internal storage allocator
    __device__ __forceinline__ _TempStorage& PrivateStorage()
    {
        __shared__ _TempStorage private_storage;
        return private_storage;
    }


    /******************************************************************************
     * Thread fields
     ******************************************************************************/

    /// Thread reference to shared storage
    _TempStorage &temp_storage;

    /// Linear thread-id
    int linear_tid;

public:

    /// The temporary storage type required by BlockLoad's collective operations
    struct TempStorage : Uninitialized<_TempStorage> {};


    /******************************************************************//**
     * \name Collective constructors
     *********************************************************************/

    /// Collective constructor using a private static allocation of shared memory as temporary storage.
    __device__ __forceinline__ BlockLoad()
    :
        temp_storage(PrivateStorage()),
        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
    {}

    /// Collective constructor using the specified memory allocation as temporary storage.
    __device__ __forceinline__ BlockLoad(
        TempStorage &temp_storage)
    :
        temp_storage(temp_storage.Alias()),
        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
    {}

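    /*
     * Editor's sketch (an editorial addition; the typedef and variable names are
     * illustrative): the two construction modes.
     *
     *     typedef cub::BlockLoad<int*, 128, 4> BlockLoad;
     *
     *     // (1) Implicit private shared-memory allocation
     *     int thread_data[4];
     *     BlockLoad().Load(d_in, thread_data);
     *
     *     // (2) Caller-provided storage, e.g. so it can be unioned with other usage
     *     __shared__ typename BlockLoad::TempStorage temp_storage;
     *     BlockLoad(temp_storage).Load(d_in, thread_data);
     */
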

    /******************************************************************//**
     * \name Data movement
     *********************************************************************/

    /// Load a linear segment of items from memory.
    __device__ __forceinline__ void Load(
        InputIterator   block_itr,
        T               (&items)[ITEMS_PER_THREAD])
    {
        InternalLoad(temp_storage, linear_tid).Load(block_itr, items);
    }

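    /*
     * Editor's sketch (kernel and pointer names are illustrative): a 1D block of
     * 128 threads loading a segment of 512 consecutive ints, 4 per thread.
     *
     *     __global__ void ExampleKernel(int *d_data)
     *     {
     *         typedef cub::BlockLoad<int*, 128, 4, cub::BLOCK_LOAD_WARP_TRANSPOSE> BlockLoad;
     *         __shared__ typename BlockLoad::TempStorage temp_storage;
     *
     *         int thread_data[4];
     *         BlockLoad(temp_storage).Load(d_data, thread_data);
     *     }
     */
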

    /// Load a linear segment of items from memory, guarded by range.
    __device__ __forceinline__ void Load(
        InputIterator   block_itr,
        T               (&items)[ITEMS_PER_THREAD],
        int             valid_items)
    {
        InternalLoad(temp_storage, linear_tid).Load(block_itr, items, valid_items);
    }

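    /*
     * Editor's sketch (illustrative): as above, but only valid_items of the segment
     * may be read; threads mapped past valid_items leave their items unmodified.
     *
     *     BlockLoad(temp_storage).Load(d_data, thread_data, valid_items);
     */
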

    /// Load a linear segment of items from memory, guarded by range, with a
    /// fall-back assignment of out-of-bound elements.
    __device__ __forceinline__ void Load(
        InputIterator   block_itr,
        T               (&items)[ITEMS_PER_THREAD],
        int             valid_items,
        T               oob_default)
    {
        InternalLoad(temp_storage, linear_tid).Load(block_itr, items, valid_items, oob_default);
    }

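    /*
     * Editor's sketch (illustrative): out-of-bounds items are assigned a default
     * value instead of being left unmodified, here -1.
     *
     *     BlockLoad(temp_storage).Load(d_data, thread_data, valid_items, -1);
     */
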

};


}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)