block_exchange.cuh
/******************************************************************************
 * Copyright (c) 2011, Duane Merrill.  All rights reserved.
 * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/**
 * \file
 * The cub::BlockExchange class provides collective methods for rearranging data partitioned across a CUDA thread block.
 */

#pragma once

#include "../util_ptx.cuh"
#include "../util_arch.cuh"
#include "../util_macro.cuh"
#include "../util_type.cuh"
#include "../util_namespace.cuh"

/// Optional outer namespace(s)
CUB_NS_PREFIX

/// CUB namespace
namespace cub {

/**
 * \brief BlockExchange provides collective methods for rearranging data partitioned across a CUDA thread block (e.g., converting between blocked, striped, and warp-striped arrangements).
 */
template <
    typename    T,
    int         BLOCK_DIM_X,
    int         ITEMS_PER_THREAD,
    bool        WARP_TIME_SLICING   = false,
    int         BLOCK_DIM_Y         = 1,
    int         BLOCK_DIM_Z         = 1,
    int         PTX_ARCH            = CUB_PTX_ARCH>
class BlockExchange
{
private:

    /******************************************************************************
     * Constants
     ******************************************************************************/

    /// Constants
    enum
    {
        /// The thread block size in threads
        BLOCK_THREADS               = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,

        LOG_WARP_THREADS            = CUB_LOG_WARP_THREADS(PTX_ARCH),
        WARP_THREADS                = 1 << LOG_WARP_THREADS,
        WARPS                       = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,

        LOG_SMEM_BANKS              = CUB_LOG_SMEM_BANKS(PTX_ARCH),
        SMEM_BANKS                  = 1 << LOG_SMEM_BANKS,

        TILE_ITEMS                  = BLOCK_THREADS * ITEMS_PER_THREAD,

        TIME_SLICES                 = (WARP_TIME_SLICING) ? WARPS : 1,

        TIME_SLICED_THREADS         = (WARP_TIME_SLICING) ? CUB_MIN(BLOCK_THREADS, WARP_THREADS) : BLOCK_THREADS,
        TIME_SLICED_ITEMS           = TIME_SLICED_THREADS * ITEMS_PER_THREAD,

        WARP_TIME_SLICED_THREADS    = CUB_MIN(BLOCK_THREADS, WARP_THREADS),
        WARP_TIME_SLICED_ITEMS      = WARP_TIME_SLICED_THREADS * ITEMS_PER_THREAD,

        // Insert padding if the number of items per thread is a power of two
        INSERT_PADDING              = 0, // Mooch PowerOfTwo<ITEMS_PER_THREAD>::VALUE,
        PADDING_ITEMS               = (INSERT_PADDING) ? (TIME_SLICED_ITEMS >> LOG_SMEM_BANKS) : 0,
    };

    /******************************************************************************
     * Type definitions
     ******************************************************************************/

    /// Shared memory storage layout type
    typedef T _TempStorage[TIME_SLICED_ITEMS + PADDING_ITEMS];

public:

    /// The operations exposed by BlockExchange require a temporary shared memory allocation of this nested type for thread communication
    struct TempStorage : Uninitialized<_TempStorage> {};

private:

    /******************************************************************************
     * Thread fields
     ******************************************************************************/

    /// Shared storage reference
    _TempStorage &temp_storage;

    /// Linear thread-id
    int linear_tid;
    int lane_id;
    int warp_id;
    int warp_offset;


    /******************************************************************************
     * Utility methods
     ******************************************************************************/

    /// Internal storage allocator
    __device__ __forceinline__ _TempStorage& PrivateStorage()
    {
        __shared__ _TempStorage private_storage;
        return private_storage;
    }


    /// Transposes data items from a blocked arrangement to a striped arrangement (specialized for no timeslicing)
    __device__ __forceinline__ void BlockedToStriped(
        T items[ITEMS_PER_THREAD],
        Int2Type<false> time_slicing)
    {
        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
        {
            int item_offset = (linear_tid * ITEMS_PER_THREAD) + ITEM;
            if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
            temp_storage[item_offset] = items[ITEM];
        }

        __syncthreads();

        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
        {
            int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid;
            if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
            items[ITEM] = temp_storage[item_offset];
        }
    }


    /// Transposes data items from a blocked arrangement to a striped arrangement (specialized for warp-timeslicing)
    __device__ __forceinline__ void BlockedToStriped(
        T items[ITEMS_PER_THREAD],
        Int2Type<true> time_slicing)
    {
        T temp_items[ITEMS_PER_THREAD];

        #pragma unroll
        for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++)
        {
            const int SLICE_OFFSET  = SLICE * TIME_SLICED_ITEMS;
            const int SLICE_OOB     = SLICE_OFFSET + TIME_SLICED_ITEMS;

            __syncthreads();

            if (warp_id == SLICE)
            {
                #pragma unroll
                for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
                {
                    int item_offset = (lane_id * ITEMS_PER_THREAD) + ITEM;
                    if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
                    temp_storage[item_offset] = items[ITEM];
                }
            }

            __syncthreads();

            #pragma unroll
            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
            {
                // Read a strip of items
                const int STRIP_OFFSET  = ITEM * BLOCK_THREADS;
                const int STRIP_OOB     = STRIP_OFFSET + BLOCK_THREADS;

                if ((SLICE_OFFSET < STRIP_OOB) && (SLICE_OOB > STRIP_OFFSET))
                {
                    int item_offset = STRIP_OFFSET + linear_tid - SLICE_OFFSET;
                    if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS))
                    {
                        if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
                        temp_items[ITEM] = temp_storage[item_offset];
                    }
                }
            }
        }

        // Copy
        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
        {
            items[ITEM] = temp_items[ITEM];
        }
    }


    /// Transposes data items from a blocked arrangement to a warp-striped arrangement (specialized for no timeslicing)
    __device__ __forceinline__ void BlockedToWarpStriped(
        T items[ITEMS_PER_THREAD],
        Int2Type<false> time_slicing)
    {
        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
        {
            int item_offset = warp_offset + ITEM + (lane_id * ITEMS_PER_THREAD);
            if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
            temp_storage[item_offset] = items[ITEM];
        }

        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
        {
            int item_offset = warp_offset + (ITEM * WARP_TIME_SLICED_THREADS) + lane_id;
            if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
            items[ITEM] = temp_storage[item_offset];
        }
    }

    /// Transposes data items from a blocked arrangement to a warp-striped arrangement (specialized for warp-timeslicing)
    __device__ __forceinline__ void BlockedToWarpStriped(
        T items[ITEMS_PER_THREAD],
        Int2Type<true> time_slicing)
    {
        #pragma unroll
        for (int SLICE = 0; SLICE < TIME_SLICES; ++SLICE)
        {
            __syncthreads();

            if (warp_id == SLICE)
            {
                #pragma unroll
                for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
                {
                    int item_offset = ITEM + (lane_id * ITEMS_PER_THREAD);
                    if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
                    temp_storage[item_offset] = items[ITEM];
                }

                #pragma unroll
                for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
                {
                    int item_offset = (ITEM * WARP_TIME_SLICED_THREADS) + lane_id;
                    if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
                    items[ITEM] = temp_storage[item_offset];
                }
            }
        }
    }


    /// Transposes data items from a striped arrangement to a blocked arrangement (specialized for no timeslicing)
    __device__ __forceinline__ void StripedToBlocked(
        T items[ITEMS_PER_THREAD],
        Int2Type<false> time_slicing)
    {
        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
        {
            int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid;
            if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
            temp_storage[item_offset] = items[ITEM];
        }

        __syncthreads();

        // No timeslicing
        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
        {
            int item_offset = (linear_tid * ITEMS_PER_THREAD) + ITEM;
            if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
            items[ITEM] = temp_storage[item_offset];
        }
    }


    /// Transposes data items from a striped arrangement to a blocked arrangement (specialized for warp-timeslicing)
    __device__ __forceinline__ void StripedToBlocked(
        T items[ITEMS_PER_THREAD],
        Int2Type<true> time_slicing)
    {
        // Warp time-slicing
        T temp_items[ITEMS_PER_THREAD];

        #pragma unroll
        for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++)
        {
            const int SLICE_OFFSET  = SLICE * TIME_SLICED_ITEMS;
            const int SLICE_OOB     = SLICE_OFFSET + TIME_SLICED_ITEMS;

            __syncthreads();

            #pragma unroll
            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
            {
                // Write a strip of items
                const int STRIP_OFFSET  = ITEM * BLOCK_THREADS;
                const int STRIP_OOB     = STRIP_OFFSET + BLOCK_THREADS;

                if ((SLICE_OFFSET < STRIP_OOB) && (SLICE_OOB > STRIP_OFFSET))
                {
                    int item_offset = STRIP_OFFSET + linear_tid - SLICE_OFFSET;
                    if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS))
                    {
                        if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
                        temp_storage[item_offset] = items[ITEM];
                    }
                }
            }

            __syncthreads();

            if (warp_id == SLICE)
            {
                #pragma unroll
                for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
                {
                    int item_offset = (lane_id * ITEMS_PER_THREAD) + ITEM;
                    if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
                    temp_items[ITEM] = temp_storage[item_offset];
                }
            }
        }

        // Copy
        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
        {
            items[ITEM] = temp_items[ITEM];
        }
    }


    /// Transposes data items from a warp-striped arrangement to a blocked arrangement (specialized for no timeslicing)
    __device__ __forceinline__ void WarpStripedToBlocked(
        T items[ITEMS_PER_THREAD],
        Int2Type<false> time_slicing)
    {
        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
        {
            int item_offset = warp_offset + (ITEM * WARP_TIME_SLICED_THREADS) + lane_id;
            if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
            temp_storage[item_offset] = items[ITEM];
        }

        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
        {
            int item_offset = warp_offset + ITEM + (lane_id * ITEMS_PER_THREAD);
            if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
            items[ITEM] = temp_storage[item_offset];
        }
    }


    /// Transposes data items from a warp-striped arrangement to a blocked arrangement (specialized for warp-timeslicing)
    __device__ __forceinline__ void WarpStripedToBlocked(
        T items[ITEMS_PER_THREAD],
        Int2Type<true> time_slicing)
    {
        #pragma unroll
        for (int SLICE = 0; SLICE < TIME_SLICES; ++SLICE)
        {
            __syncthreads();

            if (warp_id == SLICE)
            {
                #pragma unroll
                for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
                {
                    int item_offset = (ITEM * WARP_TIME_SLICED_THREADS) + lane_id;
                    if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
                    temp_storage[item_offset] = items[ITEM];
                }

                #pragma unroll
                for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
                {
                    int item_offset = ITEM + (lane_id * ITEMS_PER_THREAD);
                    if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
                    items[ITEM] = temp_storage[item_offset];
                }
            }
        }
    }


    /// Exchanges data items annotated by rank into a blocked arrangement (specialized for no timeslicing)
    template <typename Offset>
    __device__ __forceinline__ void ScatterToBlocked(
        T items[ITEMS_PER_THREAD],
        Offset ranks[ITEMS_PER_THREAD],
        Int2Type<false> time_slicing)
    {
        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
        {
            int item_offset = ranks[ITEM];
            if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
            temp_storage[item_offset] = items[ITEM];
        }

        __syncthreads();

        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
        {
            int item_offset = (linear_tid * ITEMS_PER_THREAD) + ITEM;
            if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
            items[ITEM] = temp_storage[item_offset];
        }
    }

    /// Exchanges data items annotated by rank into a blocked arrangement (specialized for warp-timeslicing)
    template <typename Offset>
    __device__ __forceinline__ void ScatterToBlocked(
        T items[ITEMS_PER_THREAD],
        Offset ranks[ITEMS_PER_THREAD],
        Int2Type<true> time_slicing)
    {
        T temp_items[ITEMS_PER_THREAD];

        #pragma unroll
        for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++)
        {
            __syncthreads();

            const int SLICE_OFFSET = TIME_SLICED_ITEMS * SLICE;

            #pragma unroll
            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
            {
                int item_offset = ranks[ITEM] - SLICE_OFFSET;
                if ((item_offset >= 0) && (item_offset < WARP_TIME_SLICED_ITEMS))
                {
                    if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
                    temp_storage[item_offset] = items[ITEM];
                }
            }

            __syncthreads();

            if (warp_id == SLICE)
            {
                #pragma unroll
                for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
                {
                    int item_offset = (lane_id * ITEMS_PER_THREAD) + ITEM;
                    if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
                    temp_items[ITEM] = temp_storage[item_offset];
                }
            }
        }

        // Copy
        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
        {
            items[ITEM] = temp_items[ITEM];
        }
    }


    /// Exchanges data items annotated by rank into a striped arrangement (specialized for no timeslicing)
    template <typename Offset>
    __device__ __forceinline__ void ScatterToStriped(
        T items[ITEMS_PER_THREAD],
        Offset ranks[ITEMS_PER_THREAD],
        Int2Type<false> time_slicing)
    {
        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
        {
            int item_offset = ranks[ITEM];
            if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
            temp_storage[item_offset] = items[ITEM];
        }

        __syncthreads();

        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
        {
            int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid;
            if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
            items[ITEM] = temp_storage[item_offset];
        }
    }


    /// Exchanges data items annotated by rank into a striped arrangement (specialized for warp-timeslicing)
    template <typename Offset>
    __device__ __forceinline__ void ScatterToStriped(
        T items[ITEMS_PER_THREAD],
        Offset ranks[ITEMS_PER_THREAD],
        Int2Type<true> time_slicing)
    {
        T temp_items[ITEMS_PER_THREAD];

        #pragma unroll
        for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++)
        {
            const int SLICE_OFFSET  = SLICE * TIME_SLICED_ITEMS;
            const int SLICE_OOB     = SLICE_OFFSET + TIME_SLICED_ITEMS;

            __syncthreads();

            #pragma unroll
            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
            {
                int item_offset = ranks[ITEM] - SLICE_OFFSET;
                if ((item_offset >= 0) && (item_offset < WARP_TIME_SLICED_ITEMS))
                {
                    if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
                    temp_storage[item_offset] = items[ITEM];
                }
            }

            __syncthreads();

            #pragma unroll
            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
            {
                // Read a strip of items
                const int STRIP_OFFSET  = ITEM * BLOCK_THREADS;
                const int STRIP_OOB     = STRIP_OFFSET + BLOCK_THREADS;

                if ((SLICE_OFFSET < STRIP_OOB) && (SLICE_OOB > STRIP_OFFSET))
                {
                    int item_offset = STRIP_OFFSET + linear_tid - SLICE_OFFSET;
                    if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS))
                    {
                        if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
                        temp_items[ITEM] = temp_storage[item_offset];
                    }
                }
            }
        }

        // Copy
        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
        {
            items[ITEM] = temp_items[ITEM];
        }
    }


public:

    /******************************************************************
     * Collective constructors
     ******************************************************************/

    /// Collective constructor using a private static allocation of shared memory as temporary storage
    __device__ __forceinline__ BlockExchange()
    :
        temp_storage(PrivateStorage()),
        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)),
        warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS),
        lane_id(LaneId()),
        warp_offset(warp_id * WARP_TIME_SLICED_ITEMS)
    {}


    /// Collective constructor using the specified memory allocation as temporary storage
    __device__ __forceinline__ BlockExchange(
        TempStorage &temp_storage)          ///< [in] Reference to memory allocation having layout type TempStorage
    :
        temp_storage(temp_storage.Alias()),
        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)),
        warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS),
        lane_id(LaneId()),
        warp_offset(warp_id * WARP_TIME_SLICED_ITEMS)
    {}


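    // Construction sketch (added for illustration; not part of the original header): the
    // collective is typically specialized per thread block and bound to a TempStorage
    // allocation declared in shared memory. The kernel name and the 128-thread / 4-item
    // configuration below are assumptions chosen for the example.
    //
    //    __global__ void ExampleConstructKernel()
    //    {
    //        // Specialize BlockExchange for 128 threads owning 4 integer items each
    //        typedef cub::BlockExchange<int, 128, 4> BlockExchangeT;
    //
    //        // Allocate shared memory for the collective
    //        __shared__ typename BlockExchangeT::TempStorage temp_storage;
    //
    //        // Collectively construct the BlockExchange, binding the shared allocation
    //        BlockExchangeT block_exchange(temp_storage);
    //    }
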
    /******************************************************************
     * Structured exchanges
     ******************************************************************/

    /// Transposes data items from a striped arrangement (successive strips of BLOCK_THREADS items) to a blocked arrangement (ITEMS_PER_THREAD consecutive items per thread)
    __device__ __forceinline__ void StripedToBlocked(
        T items[ITEMS_PER_THREAD])          ///< [in-out] Items to exchange, converting between striped and blocked arrangements
    {
        StripedToBlocked(items, Int2Type<WARP_TIME_SLICING>());
    }

    /// Transposes data items from a blocked arrangement (ITEMS_PER_THREAD consecutive items per thread) to a striped arrangement (successive strips of BLOCK_THREADS items)
    __device__ __forceinline__ void BlockedToStriped(
        T items[ITEMS_PER_THREAD])          ///< [in-out] Items to exchange, converting between blocked and striped arrangements
    {
        BlockedToStriped(items, Int2Type<WARP_TIME_SLICING>());
    }

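    // Usage sketch (added for illustration; not part of the original header): a blocked
    // arrangement held in registers is converted to a striped arrangement so that the
    // subsequent global store is coalesced. The kernel name, d_out parameter, and the
    // 128-thread / 4-item configuration are assumptions chosen for the example.
    //
    //    __global__ void ExampleBlockedToStriped(int *d_out)
    //    {
    //        typedef cub::BlockExchange<int, 128, 4> BlockExchangeT;
    //        __shared__ typename BlockExchangeT::TempStorage temp_storage;
    //
    //        // Each thread owns 4 consecutive items of the tile (blocked arrangement)
    //        int items[4];
    //        for (int i = 0; i < 4; ++i)
    //            items[i] = (threadIdx.x * 4) + i;
    //
    //        // Convert to a striped arrangement across the block
    //        BlockExchangeT(temp_storage).BlockedToStriped(items);
    //
    //        // Striped (coalesced) store: thread t writes tile positions t, t+128, t+256, t+384
    //        int block_offset = blockIdx.x * 128 * 4;
    //        for (int i = 0; i < 4; ++i)
    //            d_out[block_offset + (i * 128) + threadIdx.x] = items[i];
    //    }
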

    /// Transposes data items from a warp-striped arrangement (strips of WARP_THREADS items within each warp) to a blocked arrangement (ITEMS_PER_THREAD consecutive items per thread)
    __device__ __forceinline__ void WarpStripedToBlocked(
        T items[ITEMS_PER_THREAD])          ///< [in-out] Items to exchange, converting between warp-striped and blocked arrangements
    {
        WarpStripedToBlocked(items, Int2Type<WARP_TIME_SLICING>());
    }

    /// Transposes data items from a blocked arrangement (ITEMS_PER_THREAD consecutive items per thread) to a warp-striped arrangement (strips of WARP_THREADS items within each warp)
    __device__ __forceinline__ void BlockedToWarpStriped(
        T items[ITEMS_PER_THREAD])          ///< [in-out] Items to exchange, converting between blocked and warp-striped arrangements
    {
        BlockedToWarpStriped(items, Int2Type<WARP_TIME_SLICING>());
    }


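    // Usage sketch (added for illustration; not part of the original header): data is read
    // from global memory in a warp-striped pattern (each warp reads 32-thread-wide strips of
    // its own tile segment), then rearranged into a blocked arrangement for per-thread
    // sequential work. The kernel name, d_in parameter, 32-thread warp size, and the
    // 128-thread / 4-item configuration are assumptions chosen for the example.
    //
    //    __global__ void ExampleWarpStripedToBlocked(const int *d_in)
    //    {
    //        typedef cub::BlockExchange<int, 128, 4> BlockExchangeT;
    //        __shared__ typename BlockExchangeT::TempStorage temp_storage;
    //
    //        int warp = threadIdx.x / 32;
    //        int lane = threadIdx.x % 32;
    //        int block_offset = blockIdx.x * 128 * 4;
    //        int warp_segment = block_offset + (warp * 32 * 4);
    //
    //        // Warp-striped load: lane l of each warp reads elements l, l+32, l+64, l+96
    //        int items[4];
    //        for (int i = 0; i < 4; ++i)
    //            items[i] = d_in[warp_segment + (i * 32) + lane];
    //
    //        // Rearrange so each thread holds 4 consecutive elements of its warp's segment
    //        BlockExchangeT(temp_storage).WarpStripedToBlocked(items);
    //    }
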
    /******************************************************************
     * Scatter exchanges
     ******************************************************************/

    /// Exchanges data items annotated by rank into a blocked arrangement
    template <typename Offset>
    __device__ __forceinline__ void ScatterToBlocked(
        T items[ITEMS_PER_THREAD],          ///< [in-out] Items to exchange
        Offset ranks[ITEMS_PER_THREAD])     ///< [in] Corresponding scatter ranks
    {
        ScatterToBlocked(items, ranks, Int2Type<WARP_TIME_SLICING>());
    }


    /// Exchanges data items annotated by rank into a striped arrangement
    template <typename Offset>
    __device__ __forceinline__ void ScatterToStriped(
        T items[ITEMS_PER_THREAD],          ///< [in-out] Items to exchange
        Offset ranks[ITEMS_PER_THREAD])     ///< [in] Corresponding scatter ranks
    {
        ScatterToStriped(items, ranks, Int2Type<WARP_TIME_SLICING>());
    }


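    // Usage sketch (added for illustration; not part of the original header): each item is
    // annotated with a destination rank and scattered into a striped arrangement. Here the
    // ranks simply reverse the tile; in practice they typically come from a ranking pass.
    // The kernel name, d_out parameter, and 128-thread / 4-item configuration are assumptions.
    //
    //    __global__ void ExampleScatterToStriped(int *d_out)
    //    {
    //        typedef cub::BlockExchange<int, 128, 4> BlockExchangeT;
    //        __shared__ typename BlockExchangeT::TempStorage temp_storage;
    //
    //        // Blocked arrangement: thread t owns tile items t*4 .. t*4+3
    //        int items[4];
    //        int ranks[4];
    //        for (int i = 0; i < 4; ++i)
    //        {
    //            int tile_index = (threadIdx.x * 4) + i;
    //            items[i] = tile_index;
    //            ranks[i] = (128 * 4 - 1) - tile_index;   // reverse the tile
    //        }
    //
    //        // Scatter by rank, then read back in a striped arrangement
    //        BlockExchangeT(temp_storage).ScatterToStriped(items, ranks);
    //
    //        // Thread t now holds the items destined for striped positions t, t+128, ...
    //        int block_offset = blockIdx.x * 128 * 4;
    //        for (int i = 0; i < 4; ++i)
    //            d_out[block_offset + (i * 128) + threadIdx.x] = items[i];
    //    }
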

    /// Exchanges data items annotated by rank into a striped arrangement, guarded by rank (items with a negative rank are not exchanged)
    template <typename Offset>
    __device__ __forceinline__ void ScatterToStripedGuarded(
        T items[ITEMS_PER_THREAD],          ///< [in-out] Items to exchange
        Offset ranks[ITEMS_PER_THREAD])     ///< [in] Corresponding scatter ranks
    {
        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
        {
            int item_offset = ranks[ITEM];
            if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
            if (ranks[ITEM] >= 0)
                temp_storage[item_offset] = items[ITEM];
        }

        __syncthreads();

        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
        {
            int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid;
            if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
            items[ITEM] = temp_storage[item_offset];
        }
    }

    /// Exchanges valid data items annotated by rank into a striped arrangement (only items whose validity flag is set are written to shared memory)
    template <typename Offset, typename ValidFlag>
    __device__ __forceinline__ void ScatterToStriped(
        T items[ITEMS_PER_THREAD],              ///< [in-out] Items to exchange
        Offset ranks[ITEMS_PER_THREAD],         ///< [in] Corresponding scatter ranks
        ValidFlag is_valid[ITEMS_PER_THREAD])   ///< [in] Corresponding flag denoting item validity
    {
        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
        {
            int item_offset = ranks[ITEM];
            if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
            if (is_valid[ITEM])
                temp_storage[item_offset] = items[ITEM];
        }

        __syncthreads();

        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
        {
            int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid;
            if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
            items[ITEM] = temp_storage[item_offset];
        }
    }


};

}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)