39 #include "../util_ptx.cuh"
40 #include "../util_macro.cuh"
41 #include "../util_type.cuh"
42 #include "../util_namespace.cuh"
// Fragment of an unguarded "blocked"-arrangement load: each thread copies
// ITEMS_PER_THREAD consecutive elements beginning at
// linear_tid * ITEMS_PER_THREAD into its private array.
// NOTE(review): the function's opening signature lines are missing from this
// extract; the leading numerals are line numbers fused in from the original
// file — presumably this is cub::LoadDirectBlocked. Confirm against the
// original source.
74 typename InputIterator>
77 InputIterator block_itr,
78 T (&items)[ITEMS_PER_THREAD])
82 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
// Blocked mapping: this thread's ITEM-th element lives at
// (linear_tid * ITEMS_PER_THREAD) + ITEM in the block-wide segment.
// No bounds check — caller must guarantee a full tile of valid input.
84 items[ITEM] = block_itr[(linear_tid * ITEMS_PER_THREAD) + ITEM];
// Fragment of the guarded "blocked"-arrangement load (valid_items variant).
// Same blocked indexing as the unguarded form, but limited to in-range items.
100 int ITEMS_PER_THREAD,
101 typename InputIterator>
104 InputIterator block_itr,
105 T (&items)[ITEMS_PER_THREAD],
// bounds = number of valid elements remaining at this thread's starting
// offset; may be <= 0 for threads wholly past the end.
108 int bounds = valid_items - (linear_tid * ITEMS_PER_THREAD);
111 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
// NOTE(review): the per-item guard (presumably `if (ITEM < bounds)`) falls
// in the lines elided from this extract between the loop header and the
// assignment below — verify against the original source.
115 items[ITEM] = block_itr[(linear_tid * ITEMS_PER_THREAD) + ITEM];
// Fragment of the "blocked" load variant that takes an out-of-bounds default.
// Visible portion pre-fills every slot with oob_default; the subsequent
// guarded load that overwrites the in-range slots is in lines elided from
// this extract — TODO confirm against the original source.
132 int ITEMS_PER_THREAD,
133 typename InputIterator>
136 InputIterator block_itr,
137 T (&items)[ITEMS_PER_THREAD],
142 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
// Every element defaults to oob_default before any real data is loaded.
144 items[ITEM] = oob_default;
// Fragment of the vectorized blocked load: elements are fetched through a
// wider `Vector` type to widen global-memory transactions, then unpacked
// into the per-thread items[] array.
167 int ITEMS_PER_THREAD>
171 T (&items)[ITEMS_PER_THREAD])
// Vector width is capped at 4 elements...
176 MAX_VEC_SIZE = CUB_MIN(4, ITEMS_PER_THREAD),
// ...and only used when it is a power of two that evenly divides
// ITEMS_PER_THREAD; the fallback value for the false branch is in a line
// elided from this extract (presumably 1 — scalar loads).
179 VEC_SIZE = ((((MAX_VEC_SIZE - 1) & MAX_VEC_SIZE) == 0) && ((ITEMS_PER_THREAD % MAX_VEC_SIZE) == 0)) ?
183 VECTORS_PER_THREAD = ITEMS_PER_THREAD / VEC_SIZE,
// Temporary vector-typed staging registers.
190 Vector vec_items[VECTORS_PER_THREAD];
// Reinterpret this thread's blocked segment of block_ptr as vectors.
// NOTE(review): correctness requires block_ptr to be suitably aligned for
// Vector — the precondition text is outside this extract; confirm.
193 Vector *ptr =
reinterpret_cast<Vector*
>(block_ptr + (linear_tid * VEC_SIZE * VECTORS_PER_THREAD));
197 for (
int ITEM = 0; ITEM < VECTORS_PER_THREAD; ITEM++)
// Wide loads: one Vector fetch per iteration.
199 vec_items[ITEM] = ptr[ITEM];
204 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
// Unpack the staged vectors back into scalar items via aliasing cast.
206 items[ITEM] =
reinterpret_cast<T*
>(vec_items)[ITEM];
// Fragment of the unguarded "striped"-arrangement load: item ITEM of thread
// linear_tid is taken from offset (ITEM * BLOCK_THREADS) + linear_tid, so
// adjacent threads read adjacent elements on every iteration (a
// coalescing-friendly access pattern).
232 int ITEMS_PER_THREAD,
233 typename InputIterator>
236 InputIterator block_itr,
237 T (&items)[ITEMS_PER_THREAD])
240 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
// No bounds check — caller must guarantee a full tile of valid input.
242 items[ITEM] = block_itr[(ITEM * BLOCK_THREADS) + linear_tid];
// Fragment of the guarded "striped"-arrangement load (valid_items variant).
260 int ITEMS_PER_THREAD,
261 typename InputIterator>
264 InputIterator block_itr,
265 T (&items)[ITEMS_PER_THREAD],
// bounds = how far past this thread's base offset the valid region extends;
// each stride of BLOCK_THREADS consumes that much of it.
268 int bounds = valid_items - linear_tid;
271 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
// Only load items whose striped offset is still inside the valid region;
// out-of-range slots are left untouched here.
273 if (ITEM * BLOCK_THREADS < bounds)
275 items[ITEM] = block_itr[linear_tid + (ITEM * BLOCK_THREADS)];
// Fragment of the "striped" load variant with an out-of-bounds default:
// pre-fill every slot with oob_default, then delegate to the guarded
// striped load so only in-range slots are overwritten.
294 int ITEMS_PER_THREAD,
295 typename InputIterator>
298 InputIterator block_itr,
299 T (&items)[ITEMS_PER_THREAD],
304 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
306 items[ITEM] = oob_default;
// Guarded load overwrites only the valid prefix; the rest keep oob_default.
309 LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items, valid_items);
// Fragment of the unguarded "warp-striped" load: each warp owns a contiguous
// segment of warp-size * ITEMS_PER_THREAD elements, and within that segment
// lanes read in a striped pattern (stride = warp width), keeping each warp's
// accesses contiguous across lanes.
335 int ITEMS_PER_THREAD,
336 typename InputIterator>
339 InputIterator block_itr,
340 T (&items)[ITEMS_PER_THREAD])
// Lane index within the warp (mask form assumes warp size is a power of two).
342 int tid = linear_tid & (CUB_PTX_WARP_THREADS - 1);
// Warp index within the block.
343 int wid = linear_tid >> CUB_PTX_LOG_WARP_THREADS;
// Start of this warp's contiguous segment.
344 int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD;
348 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
// No bounds check — caller must guarantee a full tile of valid input.
350 items[ITEM] = block_itr[warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS)];
// Fragment of the guarded "warp-striped" load (valid_items variant).
// Same lane/warp decomposition as the unguarded form, plus a per-item guard.
369 int ITEMS_PER_THREAD,
370 typename InputIterator>
373 InputIterator block_itr,
374 T (&items)[ITEMS_PER_THREAD],
// Lane index within the warp (mask form assumes warp size is a power of two).
377 int tid = linear_tid & (CUB_PTX_WARP_THREADS - 1);
// Warp index within the block.
378 int wid = linear_tid >> CUB_PTX_LOG_WARP_THREADS;
// Start of this warp's contiguous segment.
379 int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD;
// Valid elements remaining at this lane's base position within the tile.
380 int bounds = valid_items - warp_offset - tid;
384 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
// Skip items whose striped offset falls past the valid region;
// out-of-range slots are left untouched here.
386 if ((ITEM * CUB_PTX_WARP_THREADS) < bounds)
388 items[ITEM] = block_itr[warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS)];
// Fragment of the "warp-striped" load variant with an out-of-bounds default.
// Visible portion pre-fills every slot with oob_default; the delegating call
// to the guarded warp-striped load is in lines elided from this extract —
// TODO confirm against the original source.
408 int ITEMS_PER_THREAD,
409 typename InputIterator>
412 InputIterator block_itr,
413 T (&items)[ITEMS_PER_THREAD],
418 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
414 items[ITEM] = oob_default;
// Fragment of the BlockLoad class template's parameter list and constants.
// NOTE(review): the class head and several parameters (algorithm selector,
// block dimensions, PTX_ARCH) fall in lines elided from this extract.
585 typename InputIterator,
587 int ITEMS_PER_THREAD,
// Warp-time-slicing defaults off; when enabled it presumably trades shared
// memory for extra synchronization — confirm against the original docs.
589 bool WARP_TIME_SLICING =
false,
// Total threads per block, derived from the 3D block shape parameters.
605 BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
// Element type is deduced from the iterator, not passed explicitly.
609 typedef typename std::iterator_traits<InputIterator>::value_type T;
// Internal policy dispatch: LoadInternal is specialized per load algorithm.
// The DUMMY parameter exists so full specialization on _POLICY is legal
// inside the enclosing class template.
617 template <BlockLoadAlgorithm _POLICY,
int DUMMY>
// BLOCK_LOAD_DIRECT specialization: thin wrappers over the direct blocked
// loads; no shared-memory exchange needed. (Wrapper bodies are in lines
// elided from this extract.)
625 struct LoadInternal<BLOCK_LOAD_DIRECT, DUMMY>
// Constructor records the caller's linear thread rank; the temp-storage
// parameter (elided here) is unused by this policy.
634 __device__ __forceinline__ LoadInternal(
638 linear_tid(linear_tid)
// Load a full tile (unguarded).
642 __device__ __forceinline__
void Load(
643 InputIterator block_itr,
644 T (&items)[ITEMS_PER_THREAD])
// Load a partial tile, guarded by valid_items (trailing parameter elided).
650 __device__ __forceinline__
void Load(
651 InputIterator block_itr,
652 T (&items)[ITEMS_PER_THREAD],
// Load a partial tile, guarded, filling out-of-range slots with a default
// (trailing parameters elided).
659 __device__ __forceinline__
void Load(
660 InputIterator block_itr,
661 T (&items)[ITEMS_PER_THREAD],
// BLOCK_LOAD_VECTORIZE specialization: uses the vectorized blocked load when
// the source supports it, with non-vectorizable overloads alongside.
675 struct LoadInternal<BLOCK_LOAD_VECTORIZE, DUMMY>
// Constructor records the caller's linear thread rank; no shared memory is
// needed by this policy.
684 __device__ __forceinline__ LoadInternal(
688 linear_tid(linear_tid)
// Full-tile load — signature's pointer parameter is elided from this
// extract; presumably takes a raw T* so the vectorized path applies.
692 __device__ __forceinline__
void Load(
694 T (&items)[ITEMS_PER_THREAD])
// Overload templated on the iterator type — presumably the fallback for
// iterators that cannot be vectorized (non-pointer types); confirm against
// the original source.
702 typename _InputIterator>
703 __device__ __forceinline__
void Load(
704 _InputIterator block_itr,
705 T (&items)[ITEMS_PER_THREAD])
// Guarded partial-tile load (valid_items parameter elided).
711 __device__ __forceinline__
void Load(
712 InputIterator block_itr,
713 T (&items)[ITEMS_PER_THREAD],
// Guarded partial-tile load with out-of-bounds default (parameters elided).
720 __device__ __forceinline__
void Load(
721 InputIterator block_itr,
722 T (&items)[ITEMS_PER_THREAD],
// BLOCK_LOAD_TRANSPOSE specialization: loads in the coalescing-friendly
// striped arrangement, then transposes to blocked arrangement through
// shared memory via BlockExchange.
736 struct LoadInternal<BLOCK_LOAD_TRANSPOSE, DUMMY>
// Constructor aliases the caller-provided temp storage (shared memory for
// the exchange) and records the linear thread rank.
754 __device__ __forceinline__ LoadInternal(
758 temp_storage(temp_storage.Alias()),
759 linear_tid(linear_tid)
// Full-tile load: striped gather...
763 __device__ __forceinline__
void Load(
764 InputIterator block_itr,
765 T (&items)[ITEMS_PER_THREAD])
// NOTE(review): the StripedToBlocked exchange that must follow this gather
// is in lines elided from this extract — confirm against the original.
767 LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items);
// Guarded partial-tile load, then exchange (exchange line elided here too).
772 __device__ __forceinline__
void Load(
773 InputIterator block_itr,
774 T (&items)[ITEMS_PER_THREAD],
777 LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items, valid_items);
// Guarded partial-tile load with default fill, then transpose
// striped -> blocked through shared memory.
782 __device__ __forceinline__
void Load(
783 InputIterator block_itr,
784 T (&items)[ITEMS_PER_THREAD],
788 LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items, valid_items, oob_default);
789 BlockExchange(temp_storage).StripedToBlocked(items);
// BLOCK_LOAD_WARP_TRANSPOSE specialization (struct head elided from this
// extract): warp-striped gather followed by a warp-scoped shared-memory
// transpose to blocked arrangement.
// The warp-striped mapping only tiles correctly when the block is a whole
// number of warps — enforced at compile time here.
807 CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0),
"BLOCK_THREADS must be a multiple of WARP_THREADS");
// Shared-memory exchange primitive used for the transpose; honors the
// class's WARP_TIME_SLICING and 3D block shape parameters.
810 typedef BlockExchange<T, BLOCK_DIM_X, ITEMS_PER_THREAD, WARP_TIME_SLICING, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;
813 typedef typename BlockExchange::TempStorage _TempStorage;
// Constructor aliases the caller-provided temp storage and records the
// linear thread rank.
825 __device__ __forceinline__ LoadInternal(
829 temp_storage(temp_storage.Alias()),
830 linear_tid(linear_tid)
// Full-tile load (body elided from this extract).
834 __device__ __forceinline__
void Load(
835 InputIterator block_itr,
836 T (&items)[ITEMS_PER_THREAD])
// Guarded partial-tile load (body elided).
843 __device__ __forceinline__
void Load(
844 InputIterator block_itr,
845 T (&items)[ITEMS_PER_THREAD],
// Guarded partial-tile load with default fill, ending in the
// warp-striped -> blocked transpose through shared memory.
854 __device__ __forceinline__
void Load(
855 InputIterator block_itr,
856 T (&items)[ITEMS_PER_THREAD],
861 BlockExchange(temp_storage).WarpStripedToBlocked(items);
// BlockLoad plumbing: the concrete implementation is selected at compile
// time by the ALGORITHM policy via the LoadInternal specializations.
871 typedef LoadInternal<ALGORITHM, 0> InternalLoad;
// Temp-storage layout is whatever the selected policy requires (empty for
// the direct/vectorized policies, a BlockExchange buffer for transpose).
875 typedef typename InternalLoad::TempStorage _TempStorage;
// Fallback shared-memory allocation used when the caller does not supply
// temp storage. Function-scope __shared__ — one instance per block.
883 __device__ __forceinline__ _TempStorage& PrivateStorage()
885 __shared__ _TempStorage private_storage;
886 return private_storage;
// Reference to whichever temp storage (caller-provided or private) is used.
895 _TempStorage &temp_storage;
// Default constructor: private shared storage, row-major linear thread
// rank derived from the 3D block shape.
916 temp_storage(PrivateStorage()),
917 linear_tid(
RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
// Constructor taking caller-provided temp storage (aliased, not copied).
927 temp_storage(temp_storage.Alias()),
928 linear_tid(
RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
// Public BlockLoad interface: three Load overloads, each constructing the
// policy-selected InternalLoad on the fly and forwarding.
// Full-tile load (caller guarantees a complete tile of valid input).
977 __device__ __forceinline__
void Load(
978 InputIterator block_itr,
979 T (&items)[ITEMS_PER_THREAD])
981 InternalLoad(temp_storage, linear_tid).Load(block_itr, items);
// Partial-tile load guarded by valid_items; out-of-range slots are not
// assigned by this overload.
1022 __device__ __forceinline__
void Load(
1023 InputIterator block_itr,
1024 T (&items)[ITEMS_PER_THREAD],
1027 InternalLoad(temp_storage, linear_tid).Load(block_itr, items, valid_items);
// Partial-tile load guarded by valid_items, with out-of-range slots set to
// oob_default.
1069 __device__ __forceinline__
void Load(
1070 InputIterator block_itr,
1071 T (&items)[ITEMS_PER_THREAD],
1075 InternalLoad(temp_storage, linear_tid).Load(block_itr, items, valid_items, oob_default);