36 #include "../util_ptx.cuh"
37 #include "../util_arch.cuh"
38 #include "../util_macro.cuh"
39 #include "../util_type.cuh"
40 #include "../util_namespace.cuh"
111 int ITEMS_PER_THREAD,
112 bool WARP_TIME_SLICING =
false,
// Compile-time sizing constants for the exchange.
// Total threads in the block: product of the three block dimensions.
128 BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
// Threads per warp, derived from LOG_WARP_THREADS (defined outside this view).
131 WARP_THREADS = 1 << LOG_WARP_THREADS,
// Warp count, rounded up so a partially filled last warp is counted.
132 WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,
// Bank count, derived from LOG_SMEM_BANKS (defined outside this view).
135 SMEM_BANKS = 1 << LOG_SMEM_BANKS,
// Items held by the whole block.
137 TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD,
// One slice per warp when WARP_TIME_SLICING is set, otherwise a single slice.
139 TIME_SLICES = (WARP_TIME_SLICING) ? WARPS : 1,
// Threads served per slice: at most one warp when time slicing, else the whole block.
141 TIME_SLICED_THREADS = (WARP_TIME_SLICING) ? CUB_MIN(BLOCK_THREADS, WARP_THREADS) : BLOCK_THREADS,
// Items exchanged per slice; this sizes the exchange buffer.
142 TIME_SLICED_ITEMS = TIME_SLICED_THREADS * ITEMS_PER_THREAD,
// Per-warp thread and item counts, independent of WARP_TIME_SLICING.
// When WARP_TIME_SLICING is true these equal the TIME_SLICED_* values above.
144 WARP_TIME_SLICED_THREADS = CUB_MIN(BLOCK_THREADS, WARP_THREADS),
145 WARP_TIME_SLICED_ITEMS = WARP_TIME_SLICED_THREADS * ITEMS_PER_THREAD,
// Extra slots: one per 2^LOG_SMEM_BANKS slice items when INSERT_PADDING is set
// (INSERT_PADDING is defined outside this view), otherwise none.
149 PADDING_ITEMS = (INSERT_PADDING) ? (TIME_SLICED_ITEMS >> LOG_SMEM_BANKS) : 0,
// Exchange buffer type: TIME_SLICED_ITEMS elements of T plus PADDING_ITEMS extra slots.
157 typedef T _TempStorage[TIME_SLICED_ITEMS + PADDING_ITEMS];
// Reference to the buffer every exchange method stores into and loads from.
172 _TempStorage &temp_storage;
// Returns a reference to a function-local __shared__ _TempStorage instance.
// One of the constructors binds temp_storage to this buffer.
186 __device__ __forceinline__ _TempStorage& PrivateStorage()
188 __shared__ _TempStorage private_storage;
189 return private_storage;
// BlockedToStriped: moves items through temp_storage from a blocked layout
// (thread t owns indices t*ITEMS_PER_THREAD + ITEM) to a striped layout
// (thread t owns indices ITEM*BLOCK_THREADS + t).
//   items - in: this thread's blocked items; out: its striped items.
// NOTE(review): the lines between the store loop and the load loop are not
// visible here; a block-wide barrier is presumably among them -- confirm.
196 __device__ __forceinline__
void BlockedToStriped(
197 T items[ITEMS_PER_THREAD],
// Store pass: each item goes to its blocked index.
201 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
203 int item_offset = (linear_tid * ITEMS_PER_THREAD) + ITEM;
// Padding: advance the index by one slot per 2^LOG_SMEM_BANKS elements.
204 if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
205 temp_storage[item_offset] = items[ITEM];
// Load pass: each item is read back from its striped index.
211 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
213 int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid;
214 if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
215 items[ITEM] = temp_storage[item_offset];
// BlockedToStriped, Int2Type<true> overload: blocked-to-striped exchange done
// in TIME_SLICES passes, each pass reusing temp_storage for one slice of
// TIME_SLICED_ITEMS elements.
//   items        - in: this thread's blocked items; out: its striped items.
//   time_slicing - overload-selection tag; not read in the visible lines.
// NOTE(review): no barriers are visible between the store and load phases;
// they are presumably on elided lines -- confirm.
223 __device__ __forceinline__
void BlockedToStriped(
224 T items[ITEMS_PER_THREAD],
225 Int2Type<true> time_slicing)
// Loads are staged here and copied to items only after the slice loop, so
// items is still unmodified when this thread's warp reaches its store pass.
227 T temp_items[ITEMS_PER_THREAD];
230 for (
int SLICE = 0; SLICE < TIME_SLICES; SLICE++)
// Tile index range [SLICE_OFFSET, SLICE_OOB) covered by this slice.
232 const int SLICE_OFFSET = SLICE * TIME_SLICED_ITEMS;
233 const int SLICE_OOB = SLICE_OFFSET + TIME_SLICED_ITEMS;
// Only the warp whose id matches the slice stores, at lane-relative blocked indices.
237 if (warp_id == SLICE)
240 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
242 int item_offset = (lane_id * ITEMS_PER_THREAD) + ITEM;
243 if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
244 temp_storage[item_offset] = items[ITEM];
// Load pass: pick up every striped element that falls inside this slice.
251 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
// Tile index range [STRIP_OFFSET, STRIP_OOB) of striped row ITEM.
254 const int STRIP_OFFSET = ITEM * BLOCK_THREADS;
255 const int STRIP_OOB = STRIP_OFFSET + BLOCK_THREADS;
// Skip rows that do not overlap the slice at all.
257 if ((SLICE_OFFSET < STRIP_OOB) && (SLICE_OOB > STRIP_OFFSET))
// This thread's element of the row, expressed relative to the slice start.
259 int item_offset = STRIP_OFFSET + linear_tid - SLICE_OFFSET;
260 if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS))
262 if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
263 temp_items[ITEM] = temp_storage[item_offset];
// Publish the staged striped items.
271 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
273 items[ITEM] = temp_items[ITEM];
// BlockedToWarpStriped, Int2Type<false> overload: each warp rearranges its own
// items inside the temp_storage region that starts at warp_offset, from
// lane-blocked order (lane_id*ITEMS_PER_THREAD + ITEM) to lane-striped order
// (ITEM*WARP_TIME_SLICED_THREADS + lane_id).
//   items        - in: blocked items; out: warp-striped items.
//   time_slicing - overload-selection tag; not read in the visible lines.
// NOTE(review): any synchronization between the two loops is not visible here -- confirm.
281 __device__ __forceinline__
void BlockedToWarpStriped(
282 T items[ITEMS_PER_THREAD],
283 Int2Type<false> time_slicing)
// Store pass: lane-blocked index within this warp's region.
286 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
288 int item_offset = warp_offset + ITEM + (lane_id * ITEMS_PER_THREAD);
// Padding: advance the index by one slot per 2^LOG_SMEM_BANKS elements.
289 if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
290 temp_storage[item_offset] = items[ITEM];
// Load pass: lane-striped index within the same region.
294 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
296 int item_offset = warp_offset + (ITEM * WARP_TIME_SLICED_THREADS) + lane_id;
297 if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
298 items[ITEM] = temp_storage[item_offset];
// BlockedToWarpStriped, Int2Type<true> overload: same lane-blocked to
// lane-striped rearrangement, but warps take turns (one per SLICE) and index
// temp_storage from 0 instead of from warp_offset.
//   items        - in: blocked items; out: warp-striped items.
//   time_slicing - overload-selection tag; not read in the visible lines.
// NOTE(review): braces and barriers are not visible; the load loop is
// presumably also inside the warp_id == SLICE branch -- confirm.
305 __device__ __forceinline__
void BlockedToWarpStriped(
306 T items[ITEMS_PER_THREAD],
307 Int2Type<true> time_slicing)
310 for (
int SLICE = 0; SLICE < TIME_SLICES; ++SLICE)
// Only the warp whose id matches the current slice takes part.
314 if (warp_id == SLICE)
// Store pass: lane-blocked index.
317 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
319 int item_offset = ITEM + (lane_id * ITEMS_PER_THREAD);
320 if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
321 temp_storage[item_offset] = items[ITEM];
// Load pass: lane-striped index.
325 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
327 int item_offset = (ITEM * WARP_TIME_SLICED_THREADS) + lane_id;
328 if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
329 items[ITEM] = temp_storage[item_offset];
// StripedToBlocked, Int2Type<false> overload: moves items through temp_storage
// from a striped layout (ITEM*BLOCK_THREADS + linear_tid) to a blocked layout
// (linear_tid*ITEMS_PER_THREAD + ITEM).
//   items        - in: this thread's striped items; out: its blocked items.
//   time_slicing - overload-selection tag; not read in the visible lines.
// NOTE(review): the lines between the two loops are not visible; a block-wide
// barrier is presumably among them -- confirm.
339 __device__ __forceinline__
void StripedToBlocked(
340 T items[ITEMS_PER_THREAD],
341 Int2Type<false> time_slicing)
// Store pass: each item goes to its striped index.
344 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
346 int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid;
// Padding: advance the index by one slot per 2^LOG_SMEM_BANKS elements.
347 if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
348 temp_storage[item_offset] = items[ITEM];
// Load pass: each item is read back from its blocked index.
355 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
357 int item_offset = (linear_tid * ITEMS_PER_THREAD) + ITEM;
358 if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
359 items[ITEM] = temp_storage[item_offset];
// StripedToBlocked, Int2Type<true> overload: striped-to-blocked exchange done
// in TIME_SLICES passes. In each pass every thread stores the striped elements
// that fall inside the slice, then the warp matching the slice loads its
// blocked items.
//   items        - in: this thread's striped items; out: its blocked items.
//   time_slicing - overload-selection tag; not read in the visible lines.
// NOTE(review): no barriers are visible between the store and load phases;
// they are presumably on elided lines -- confirm.
367 __device__ __forceinline__
void StripedToBlocked(
368 T items[ITEMS_PER_THREAD],
369 Int2Type<true> time_slicing)
// Loads are staged here so items stays intact for the store passes of later slices.
372 T temp_items[ITEMS_PER_THREAD];
375 for (
int SLICE = 0; SLICE < TIME_SLICES; SLICE++)
// Tile index range [SLICE_OFFSET, SLICE_OOB) covered by this slice.
377 const int SLICE_OFFSET = SLICE * TIME_SLICED_ITEMS;
378 const int SLICE_OOB = SLICE_OFFSET + TIME_SLICED_ITEMS;
// Store pass, all threads: write the striped elements that land in this slice.
383 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
// Tile index range [STRIP_OFFSET, STRIP_OOB) of striped row ITEM.
386 const int STRIP_OFFSET = ITEM * BLOCK_THREADS;
387 const int STRIP_OOB = STRIP_OFFSET + BLOCK_THREADS;
// Skip rows that do not overlap the slice at all.
389 if ((SLICE_OFFSET < STRIP_OOB) && (SLICE_OOB > STRIP_OFFSET))
// This thread's element of the row, expressed relative to the slice start.
391 int item_offset = STRIP_OFFSET + linear_tid - SLICE_OFFSET;
392 if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS))
394 if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
395 temp_storage[item_offset] = items[ITEM];
// Load pass: only the warp whose id matches the slice reads, at lane-relative blocked indices.
402 if (warp_id == SLICE)
405 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
407 int item_offset = (lane_id * ITEMS_PER_THREAD) + ITEM;
408 if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
409 temp_items[ITEM] = temp_storage[item_offset];
// Publish the staged blocked items.
416 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
418 items[ITEM] = temp_items[ITEM];
// WarpStripedToBlocked, Int2Type<false> overload: each warp rearranges its own
// items inside the temp_storage region that starts at warp_offset, from
// lane-striped order (ITEM*WARP_TIME_SLICED_THREADS + lane_id) to lane-blocked
// order (lane_id*ITEMS_PER_THREAD + ITEM).
//   items        - in: warp-striped items; out: blocked items.
//   time_slicing - overload-selection tag; not read in the visible lines.
// NOTE(review): any synchronization between the two loops is not visible here -- confirm.
426 __device__ __forceinline__
void WarpStripedToBlocked(
427 T items[ITEMS_PER_THREAD],
428 Int2Type<false> time_slicing)
// Store pass: lane-striped index within this warp's region.
431 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
433 int item_offset = warp_offset + (ITEM * WARP_TIME_SLICED_THREADS) + lane_id;
// Padding: advance the index by one slot per 2^LOG_SMEM_BANKS elements.
434 if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
435 temp_storage[item_offset] = items[ITEM];
// Load pass: lane-blocked index within the same region.
439 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
441 int item_offset = warp_offset + ITEM + (lane_id * ITEMS_PER_THREAD);
442 if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
443 items[ITEM] = temp_storage[item_offset];
// WarpStripedToBlocked, Int2Type<true> overload: same lane-striped to
// lane-blocked rearrangement, but warps take turns (one per SLICE) and index
// temp_storage from 0 instead of from warp_offset.
//   items        - in: warp-striped items; out: blocked items.
//   time_slicing - overload-selection tag; not read in the visible lines.
// NOTE(review): braces and barriers are not visible; the load loop is
// presumably also inside the warp_id == SLICE branch -- confirm.
451 __device__ __forceinline__
void WarpStripedToBlocked(
452 T items[ITEMS_PER_THREAD],
453 Int2Type<true> time_slicing)
456 for (
int SLICE = 0; SLICE < TIME_SLICES; ++SLICE)
// Only the warp whose id matches the current slice takes part.
460 if (warp_id == SLICE)
// Store pass: lane-striped index.
463 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
465 int item_offset = (ITEM * WARP_TIME_SLICED_THREADS) + lane_id;
466 if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
467 temp_storage[item_offset] = items[ITEM];
// Load pass: lane-blocked index.
471 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
473 int item_offset = ITEM + (lane_id * ITEMS_PER_THREAD);
474 if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
475 items[ITEM] = temp_storage[item_offset];
// ScatterToBlocked, Int2Type<false> overload: stores each item at the
// temp_storage index given by its rank, then reloads items in blocked order
// (linear_tid*ITEMS_PER_THREAD + ITEM).
//   Offset       - element type of ranks; each rank is converted to int.
//   items        - in: items to scatter; out: this thread's blocked items.
//   ranks        - destination index for each item; used unchecked, so callers
//                  presumably must keep every rank inside the tile -- confirm.
//   time_slicing - overload-selection tag; not read in the visible lines.
// NOTE(review): SHR_ADD is defined elsewhere; presumably it computes
// (item_offset >> LOG_SMEM_BANKS) + item_offset, matching the padding
// expression of the non-scatter methods -- confirm. A barrier between the two
// loops is not visible here -- confirm.
485 template <
typename Offset>
486 __device__ __forceinline__
void ScatterToBlocked(
487 T items[ITEMS_PER_THREAD],
488 Offset ranks[ITEMS_PER_THREAD],
489 Int2Type<false> time_slicing)
// Store pass: rank-addressed.
492 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
494 int item_offset = ranks[ITEM];
495 if (INSERT_PADDING) item_offset =
SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
496 temp_storage[item_offset] = items[ITEM];
// Load pass: blocked index.
502 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
504 int item_offset = (linear_tid * ITEMS_PER_THREAD) + ITEM;
505 if (INSERT_PADDING) item_offset =
SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
506 items[ITEM] = temp_storage[item_offset];
// ScatterToBlocked, Int2Type<true> overload: rank-addressed scatter done in
// TIME_SLICES passes. In each pass every thread stores the items whose rank
// falls inside the slice, then the warp matching the slice loads its blocked
// items.
//   Offset       - element type of ranks.
//   items        - in: items to scatter; out: this thread's blocked items.
//   ranks        - destination tile index for each item.
//   time_slicing - overload-selection tag; not read in the visible lines.
// NOTE(review): SHR_ADD is defined elsewhere; presumably it computes
// (item_offset >> LOG_SMEM_BANKS) + item_offset -- confirm. Barriers between
// phases are not visible here -- confirm.
513 template <
typename Offset>
514 __device__ __forceinline__
void ScatterToBlocked(
515 T items[ITEMS_PER_THREAD],
516 Offset ranks[ITEMS_PER_THREAD],
517 Int2Type<true> time_slicing)
// Loads are staged here so items stays intact for the store passes of later slices.
519 T temp_items[ITEMS_PER_THREAD];
522 for (
int SLICE = 0; SLICE < TIME_SLICES; SLICE++)
// First tile index covered by this slice.
526 const int SLICE_OFFSET = TIME_SLICED_ITEMS * SLICE;
// Store pass, all threads: keep only ranks that land inside the slice.
529 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
531 int item_offset = ranks[ITEM] - SLICE_OFFSET;
// The bound is WARP_TIME_SLICED_ITEMS, which by its definition equals
// TIME_SLICED_ITEMS whenever WARP_TIME_SLICING is true.
532 if ((item_offset >= 0) && (item_offset < WARP_TIME_SLICED_ITEMS))
534 if (INSERT_PADDING) item_offset =
SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
535 temp_storage[item_offset] = items[ITEM];
// Load pass: only the warp whose id matches the slice reads, at lane-relative blocked indices.
541 if (warp_id == SLICE)
544 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
546 int item_offset = (lane_id * ITEMS_PER_THREAD) + ITEM;
547 if (INSERT_PADDING) item_offset =
SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
548 temp_items[ITEM] = temp_storage[item_offset];
// Publish the staged blocked items.
555 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
557 items[ITEM] = temp_items[ITEM];
// ScatterToStriped, Int2Type<false> overload: stores each item at the
// temp_storage index given by its rank, then reloads items in striped order
// (ITEM*BLOCK_THREADS + linear_tid).
//   Offset       - element type of ranks; each rank is converted to int.
//   items        - in: items to scatter; out: this thread's striped items.
//   ranks        - destination index for each item; used unchecked, so callers
//                  presumably must keep every rank inside the tile -- confirm.
//   time_slicing - overload-selection tag; not read in the visible lines.
// NOTE(review): SHR_ADD is defined elsewhere; presumably it computes
// (item_offset >> LOG_SMEM_BANKS) + item_offset -- confirm. A barrier between
// the two loops is not visible here -- confirm.
565 template <
typename Offset>
566 __device__ __forceinline__
void ScatterToStriped(
567 T items[ITEMS_PER_THREAD],
568 Offset ranks[ITEMS_PER_THREAD],
569 Int2Type<false> time_slicing)
// Store pass: rank-addressed.
572 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
574 int item_offset = ranks[ITEM];
575 if (INSERT_PADDING) item_offset =
SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
576 temp_storage[item_offset] = items[ITEM];
// Load pass: striped index.
582 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
584 int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid;
585 if (INSERT_PADDING) item_offset =
SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
586 items[ITEM] = temp_storage[item_offset];
// ScatterToStriped, Int2Type<true> overload: rank-addressed scatter done in
// TIME_SLICES passes. In each pass every thread stores the items whose rank
// falls inside the slice, then every thread loads the striped elements that
// fall inside the same slice.
//   Offset       - element type of ranks.
//   items        - in: items to scatter; out: this thread's striped items.
//   ranks        - destination tile index for each item.
//   time_slicing - overload-selection tag; not read in the visible lines.
// NOTE(review): the store pass pads with SHR_ADD while the load pass pads with
// "+= item_offset >> LOG_SMEM_BANKS"; the two must agree for loads to hit the
// stored slots, so SHR_ADD presumably computes the same value -- confirm.
// Barriers between phases are not visible here -- confirm.
594 template <
typename Offset>
595 __device__ __forceinline__
void ScatterToStriped(
596 T items[ITEMS_PER_THREAD],
597 Offset ranks[ITEMS_PER_THREAD],
598 Int2Type<true> time_slicing)
// Loads are staged here so items stays intact for the store passes of later slices.
600 T temp_items[ITEMS_PER_THREAD];
603 for (
int SLICE = 0; SLICE < TIME_SLICES; SLICE++)
// Tile index range [SLICE_OFFSET, SLICE_OOB) covered by this slice.
605 const int SLICE_OFFSET = SLICE * TIME_SLICED_ITEMS;
606 const int SLICE_OOB = SLICE_OFFSET + TIME_SLICED_ITEMS;
// Store pass, all threads: keep only ranks that land inside the slice.
611 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
613 int item_offset = ranks[ITEM] - SLICE_OFFSET;
// The bound is WARP_TIME_SLICED_ITEMS, which by its definition equals
// TIME_SLICED_ITEMS whenever WARP_TIME_SLICING is true.
614 if ((item_offset >= 0) && (item_offset < WARP_TIME_SLICED_ITEMS))
616 if (INSERT_PADDING) item_offset =
SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
617 temp_storage[item_offset] = items[ITEM];
// Load pass, all threads: pick up every striped element inside this slice.
624 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
// Tile index range [STRIP_OFFSET, STRIP_OOB) of striped row ITEM.
627 const int STRIP_OFFSET = ITEM * BLOCK_THREADS;
628 const int STRIP_OOB = STRIP_OFFSET + BLOCK_THREADS;
// Skip rows that do not overlap the slice at all.
630 if ((SLICE_OFFSET < STRIP_OOB) && (SLICE_OOB > STRIP_OFFSET))
// This thread's element of the row, expressed relative to the slice start.
632 int item_offset = STRIP_OFFSET + linear_tid - SLICE_OFFSET;
633 if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS))
635 if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
636 temp_items[ITEM] = temp_storage[item_offset];
// Publish the staged striped items.
644 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
646 items[ITEM] = temp_items[ITEM];
// Constructor initializer list: binds temp_storage to the buffer returned by PrivateStorage().
663 temp_storage(PrivateStorage()),
// Thread index returned by RowMajorTid for the block dimensions.
664 linear_tid(
RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)),
// With a single warp the id is the constant 0 and the division is skipped.
665 warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS),
// NOTE(review): the lane_id initializer is not visible here; presumably on the elided line -- confirm.
// First temp_storage index of this warp's region, used by the non-time-sliced warp-striped exchanges.
667 warp_offset(warp_id * WARP_TIME_SLICED_ITEMS)
// Constructor initializer list: binds the member to the result of Alias() on the
// temp_storage argument (the argument shadows the member inside the parentheses).
677 temp_storage(temp_storage.Alias()),
// Thread index returned by RowMajorTid for the block dimensions.
678 linear_tid(
RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)),
// With a single warp the id is the constant 0 and the division is skipped.
679 warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS),
// NOTE(review): the lane_id initializer is not visible here; presumably on the elided line -- confirm.
// First temp_storage index of this warp's region, used by the non-time-sliced warp-striped exchanges.
681 warp_offset(warp_id * WARP_TIME_SLICED_ITEMS)
728 T items[ITEMS_PER_THREAD])
774 T items[ITEMS_PER_THREAD])
819 T items[ITEMS_PER_THREAD])
866 T items[ITEMS_PER_THREAD])
887 template <
typename Offset>
889 T items[ITEMS_PER_THREAD],
890 Offset ranks[ITEMS_PER_THREAD])
904 template <
typename Offset>
906 T items[ITEMS_PER_THREAD],
907 Offset ranks[ITEMS_PER_THREAD])
// Rank-guarded scatter to striped order (the function-name line is not visible
// here). Items with a negative rank are not stored; afterwards every thread
// reloads ITEMS_PER_THREAD elements in striped order (ITEM*BLOCK_THREADS + linear_tid).
//   Offset - element type of ranks; presumably a signed type, since the guard tests ranks[ITEM] >= 0.
//   items  - in: items to scatter; out: this thread's striped items.
//   ranks  - destination index per item; a negative value skips the store.
// NOTE(review): slots that no thread stores to are still read by the load pass,
// so callers presumably ignore those outputs -- confirm. SHR_ADD is defined
// elsewhere; presumably (item_offset >> LOG_SMEM_BANKS) + item_offset -- confirm.
// A barrier between the two loops is not visible here -- confirm.
921 template <
typename Offset>
923 T items[ITEMS_PER_THREAD],
924 Offset ranks[ITEMS_PER_THREAD])
// Store pass: rank-addressed, guarded.
927 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
929 int item_offset = ranks[ITEM];
// The padded offset is computed before the guard, but it is only used when the rank is non-negative.
930 if (INSERT_PADDING) item_offset =
SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
931 if (ranks[ITEM] >= 0)
932 temp_storage[item_offset] = items[ITEM];
// Load pass: striped index.
938 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
940 int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid;
941 if (INSERT_PADDING) item_offset =
SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
942 items[ITEM] = temp_storage[item_offset];
// Scatter to striped order with a per-item validity flag (the function-name
// line is not visible here). Each item is stored at the temp_storage index given
// by its rank; afterwards every thread reloads ITEMS_PER_THREAD elements in
// striped order (ITEM*BLOCK_THREADS + linear_tid).
//   Offset    - element type of ranks.
//   ValidFlag - element type of is_valid (this is the type used in the parameter
//               list; the template parameter name below is split across two lines).
//   items     - in: items to scatter; out: this thread's striped items.
//   ranks     - destination index per item.
//   is_valid  - per-item flag.
// NOTE(review): is_valid is never read in the visible lines, so the store below
// appears unconditional. The embedded numbering skips from 965 to 967, so a
// guard such as a test of is_valid[ITEM] is presumably on the elided line --
// confirm against the full file. SHR_ADD is defined elsewhere; presumably
// (item_offset >> LOG_SMEM_BANKS) + item_offset -- confirm. A barrier between
// the two loops is not visible here -- confirm.
955 template <
typename Offset,
typename Val
idFlag>
957 T items[ITEMS_PER_THREAD],
958 Offset ranks[ITEMS_PER_THREAD],
959 ValidFlag is_valid[ITEMS_PER_THREAD])
// Store pass: rank-addressed.
962 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
964 int item_offset = ranks[ITEM];
965 if (INSERT_PADDING) item_offset =
SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
967 temp_storage[item_offset] = items[ITEM];
// Load pass: striped index.
973 for (
int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
975 int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid;
976 if (INSERT_PADDING) item_offset =
SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
977 items[ITEM] = temp_storage[item_offset];