39 #include "../util_ptx.cuh"
40 #include "../util_macro.cuh"
41 #include "../util_type.cuh"
42 #include "../util_namespace.cuh"
// NOTE(review): fragmentary excerpt — the enclosing function signature, braces,
// and any #pragma unroll lines are not visible here. Presumably this is the
// unguarded StoreDirectBlocked helper — confirm against the full file.
// Visible behavior: thread `linear_tid` writes its ITEMS_PER_THREAD items to a
// *blocked* arrangement, i.e. the contiguous output range
// [linear_tid * ITEMS_PER_THREAD, (linear_tid + 1) * ITEMS_PER_THREAD).
73 typename OutputIterator>
76 OutputIterator block_itr,
77 T (&items)[ITEMS_PER_THREAD])
// Unguarded loop: every item is stored — there is no valid_items bound in this
// variant, so the caller must guarantee the output range is fully writable.
81 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
83 block_itr[(linear_tid * ITEMS_PER_THREAD) + ITEM] = items[ITEM];
// NOTE(review): fragmentary excerpt — signature/braces not visible. This appears
// to be the guarded counterpart of the blocked store above: same blocked layout,
// but each write is range-checked against `valid_items` (declared on a line not
// shown here — TODO confirm its declaration in the full file).
100 typename OutputIterator>
103 OutputIterator block_itr,
104 T (&items)[ITEMS_PER_THREAD],
// Guarded loop: only items whose global blocked offset falls below valid_items
// are written, so a partially-full final tile does not write out of bounds.
109 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
111 if (ITEM + (linear_tid * ITEMS_PER_THREAD) < valid_items)
113 block_itr[(linear_tid * ITEMS_PER_THREAD) + ITEM] = items[ITEM];
// NOTE(review): fragmentary excerpt — the enclosing signature, the Vector typedef,
// the final vector store loop, and the braces are not visible. Presumably this is
// the vectorized blocked store (raw pointer variant) — confirm in the full file.
138 int ITEMS_PER_THREAD>
142 T (&items)[ITEMS_PER_THREAD])
// Vectorization parameters (visible below):
//  - MAX_VEC_SIZE caps the vector width at 4 items (or fewer if the thread
//    holds fewer than 4 items).
147 MAX_VEC_SIZE = CUB_MIN(4, ITEMS_PER_THREAD),
//  - VEC_SIZE falls back to scalar unless MAX_VEC_SIZE is a power of two
//    ((MAX_VEC_SIZE - 1) & MAX_VEC_SIZE == 0) AND it evenly divides
//    ITEMS_PER_THREAD; the non-vector fallback value is on a line not shown.
150 VEC_SIZE = ((((MAX_VEC_SIZE - 1) & MAX_VEC_SIZE) == 0) && ((ITEMS_PER_THREAD % MAX_VEC_SIZE) == 0)) ?
154 VECTORS_PER_THREAD = ITEMS_PER_THREAD / VEC_SIZE,
// Reinterpret the destination as vectors. NOTE(review): this presumes block_ptr
// is suitably aligned for the Vector type — alignment precondition is not
// visible here; verify it is documented at the call site.
161 Vector *block_ptr_vectors =
reinterpret_cast<Vector *
>(block_ptr);
// Stage the thread's items into a local vector array, aliased as scalars, so
// the (not-visible) store loop can issue wide vector writes.
164 Vector raw_vector[VECTORS_PER_THREAD];
165 T *raw_items =
reinterpret_cast<T*
>(raw_vector);
// Scalar copy into the staging buffer; the vectorized flush of raw_vector to
// block_ptr_vectors happens on lines not shown in this excerpt.
169 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
171 raw_items[ITEM] = items[ITEM];
// NOTE(review): fragmentary excerpt — signature/braces not visible. Given the
// StoreDirectStriped<BLOCK_THREADS> call later in this file, this is presumably
// the unguarded striped store: item ITEM of thread `linear_tid` goes to output
// offset (ITEM * BLOCK_THREADS) + linear_tid, so consecutive threads write
// consecutive elements (coalesced-friendly for global memory).
200 int ITEMS_PER_THREAD,
201 typename OutputIterator>
204 OutputIterator block_itr,
205 T (&items)[ITEMS_PER_THREAD])
// Unguarded: all BLOCK_THREADS * ITEMS_PER_THREAD slots must be writable.
209 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
211 block_itr[(ITEM * BLOCK_THREADS) + linear_tid] = items[ITEM];
// NOTE(review): fragmentary excerpt — signature/braces not visible. Guarded
// counterpart of the striped store: same (ITEM * BLOCK_THREADS) + linear_tid
// layout, with each write range-checked against `valid_items` (its declaration
// line is not shown here — confirm in the full file).
229 int ITEMS_PER_THREAD,
230 typename OutputIterator>
233 OutputIterator block_itr,
234 T (&items)[ITEMS_PER_THREAD],
// Only offsets below valid_items are written, so a partial final tile is safe.
239 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
241 if ((ITEM * BLOCK_THREADS) + linear_tid < valid_items)
243 block_itr[(ITEM * BLOCK_THREADS) + linear_tid] = items[ITEM];
// NOTE(review): fragmentary excerpt — signature/braces not visible. Presumably
// the unguarded warp-striped store: each warp owns a contiguous region of
// CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD outputs, and within it, lanes write
// in a striped pattern (lane-consecutive, hence coalesced per warp).
271 int ITEMS_PER_THREAD,
272 typename OutputIterator>
275 OutputIterator block_itr,
276 T (&items)[ITEMS_PER_THREAD])
// Decompose the block-linear thread id into lane id and warp id; these bit
// tricks require CUB_PTX_WARP_THREADS to be a power of two.
278 int tid = linear_tid & (CUB_PTX_WARP_THREADS - 1);
279 int wid = linear_tid >> CUB_PTX_LOG_WARP_THREADS;
280 int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD;
// Unguarded: the whole warp region must be writable.
284 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
286 block_itr[warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS)] = items[ITEM];
// NOTE(review): fragmentary excerpt — signature/braces not visible. Guarded
// counterpart of the warp-striped store above: identical index math, with each
// write range-checked against `valid_items` (declared on a line not shown).
305 int ITEMS_PER_THREAD,
306 typename OutputIterator>
309 OutputIterator block_itr,
310 T (&items)[ITEMS_PER_THREAD],
// Lane id / warp id / per-warp base offset, as in the unguarded variant.
313 int tid = linear_tid & (CUB_PTX_WARP_THREADS - 1);
314 int wid = linear_tid >> CUB_PTX_LOG_WARP_THREADS;
315 int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD;
// Only offsets below valid_items are written — safe for a partial final tile.
319 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
321 if (warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS) < valid_items)
323 block_itr[warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS)] = items[ITEM];
// NOTE(review): fragmentary excerpt of a class template — its name, opening
// `class`/`struct` line, access specifiers, most braces, and its closing brace
// are not visible in this view. From the BlockStoreAlgorithm / BLOCK_STORE_*
// policy tags and the BlockExchange typedef below, this is presumably a
// BlockStore-style collective with per-algorithm StoreInternal specializations
// — confirm against the full file before relying on any of these notes.
//
// Visible template parameters: the output iterator type, items per thread, an
// optional warp-time-slicing flag (default false), and 3D block dimensions
// folded into BLOCK_THREADS below.
487 typename OutputIterator,
489 int ITEMS_PER_THREAD,
491 bool WARP_TIME_SLICING =
false,
// Total thread count of the (possibly 3D) thread block.
506 BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
// Element type is deduced from the output iterator.
510 typedef typename std::iterator_traits<OutputIterator>::value_type T;
// Primary template for the per-algorithm implementation; DUMMY exists only to
// allow specialization inside the class (a common C++03-era workaround).
518 template <BlockStoreAlgorithm _POLICY,
int DUMMY>
519 struct StoreInternal;
// --- BLOCK_STORE_DIRECT specialization ---------------------------------------
// Stores items directly in blocked order; constructor bodies and the Store
// implementations are largely not visible in this excerpt.
526 struct StoreInternal<BLOCK_STORE_DIRECT, DUMMY>
535 __device__ __forceinline__ StoreInternal(
539 linear_tid(linear_tid)
// Full-tile store (no guard).
543 __device__ __forceinline__
void Store(
544 OutputIterator block_itr,
545 T (&items)[ITEMS_PER_THREAD])
// Partial-tile store, guarded by valid_items (declaration not visible here).
551 __device__ __forceinline__
void Store(
552 OutputIterator block_itr,
553 T (&items)[ITEMS_PER_THREAD],
// --- BLOCK_STORE_VECTORIZE specialization ------------------------------------
// Presumably dispatches to the vectorized blocked store for raw pointers and
// falls back to the scalar path for generic iterators — bodies not visible.
565 struct StoreInternal<BLOCK_STORE_VECTORIZE, DUMMY>
574 __device__ __forceinline__ StoreInternal(
578 linear_tid(linear_tid)
// Overload taking (per the visible fragment) just the items array; its first
// parameter line is missing from this excerpt — TODO confirm (likely T*).
582 __device__ __forceinline__
void Store(
584 T (&items)[ITEMS_PER_THREAD])
// Generic-iterator overload: templated on _OutputIterator, so it cannot use
// the pointer-reinterpreting vector path.
590 template <
typename _OutputIterator>
591 __device__ __forceinline__
void Store(
592 _OutputIterator block_itr,
593 T (&items)[ITEMS_PER_THREAD])
// Guarded (valid_items) overload — vectorization is not applicable when the
// tile may be partial, per the visible structure.
599 __device__ __forceinline__
void Store(
600 OutputIterator block_itr,
601 T (&items)[ITEMS_PER_THREAD],
// --- BLOCK_STORE_TRANSPOSE specialization ------------------------------------
// Visibly ends with StoreDirectStriped calls: items are (presumably, via the
// BlockExchange member whose use is not visible) converted from blocked to
// striped arrangement, then written striped for coalescing.
613 struct StoreInternal<BLOCK_STORE_TRANSPOSE, DUMMY>
631 __device__ __forceinline__ StoreInternal(
635 temp_storage(temp_storage.Alias()),
636 linear_tid(linear_tid)
// Full-tile store: final write is striped across BLOCK_THREADS.
640 __device__ __forceinline__
void Store(
641 OutputIterator block_itr,
642 T (&items)[ITEMS_PER_THREAD])
645 StoreDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items);
// Partial-tile store: guarded striped write.
649 __device__ __forceinline__
void Store(
650 OutputIterator block_itr,
651 T (&items)[ITEMS_PER_THREAD],
655 StoreDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items, valid_items);
// Compile-time precondition for the warp-granular path that follows: the block
// must decompose into whole warps.
672 CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0),
"BLOCK_THREADS must be a multiple of WARP_THREADS");
// BlockExchange provides the blocked<->striped data movement and supplies the
// shared-memory TempStorage type aliased below.
675 typedef BlockExchange<T, BLOCK_DIM_X, ITEMS_PER_THREAD, WARP_TIME_SLICING, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;
678 typedef typename BlockExchange::TempStorage _TempStorage;
690 __device__ __forceinline__ StoreInternal(
694 temp_storage(temp_storage.Alias()),
695 linear_tid(linear_tid)
// Full-tile and guarded Store entry points for this specialization (bodies not
// visible in this excerpt).
699 __device__ __forceinline__
void Store(
700 OutputIterator block_itr,
701 T (&items)[ITEMS_PER_THREAD])
708 __device__ __forceinline__
void Store(
709 OutputIterator block_itr,
710 T (&items)[ITEMS_PER_THREAD],
// --- Outer-class plumbing -----------------------------------------------------
// Select the concrete implementation for the configured ALGORITHM and expose
// its TempStorage type.
723 typedef StoreInternal<ALGORITHM, 0> InternalStore;
727 typedef typename InternalStore::TempStorage _TempStorage;
// Fallback storage when the caller does not supply TempStorage: a function-
// scope __shared__ instance. NOTE(review): returning a reference to __shared__
// is valid only because shared memory outlives the call within the block.
735 __device__ __forceinline__ _TempStorage& PrivateStorage()
737 __shared__ _TempStorage private_storage;
738 return private_storage;
// Reference member bound to either PrivateStorage() or caller-provided storage.
747 _TempStorage &temp_storage;
// Constructor initializer fragments: default (private storage) and
// user-supplied TempStorage variants; both derive linear_tid from the 3D
// thread index in row-major order.
769 temp_storage(PrivateStorage()),
770 linear_tid(
RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
780 temp_storage(temp_storage.Alias()),
781 linear_tid(
RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
// Public full-tile Store: constructs the policy implementation on the fly and
// forwards. Collective: per the temp_storage sharing, presumably all block
// threads must call this together — confirm with the full file's docs.
832 __device__ __forceinline__
void Store(
833 OutputIterator block_itr,
834 T (&items)[ITEMS_PER_THREAD])
836 InternalStore(temp_storage, linear_tid).Store(block_itr, items);
// Public guarded Store: forwards with valid_items for partial final tiles.
880 __device__ __forceinline__
void Store(
881 OutputIterator block_itr,
882 T (&items)[ITEMS_PER_THREAD],
885 InternalStore(temp_storage, linear_tid).Store(block_itr, items, valid_items);