// block_scan.cuh — CUB collective block-wide prefix-scan primitives.
// NOTE(review): this file was recovered from a Doxygen "documentation of this
// file" page; the HTML navigation text has been replaced by this comment so
// the header is a valid translation unit again.
1 /******************************************************************************
2  * Copyright (c) 2011, Duane Merrill. All rights reserved.
3  * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of the NVIDIA CORPORATION nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  *
27  ******************************************************************************/
28 
34 #pragma once
35 
36 #include "specializations/block_scan_raking.cuh"
37 #include "specializations/block_scan_warp_scans.cuh"
38 #include "../util_arch.cuh"
39 #include "../util_type.cuh"
40 #include "../util_ptx.cuh"
41 #include "../util_namespace.cuh"
42 
44 CUB_NS_PREFIX
45 
47 namespace cub {
48 
49 
50 
51 /******************************************************************************
52  * Scan utility types
53  ******************************************************************************/
54 
55 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
56 
/**
 * Binary combining operator for key-value pairs: values are reduced only
 * when both pairs carry the same key; otherwise the value belonging to the
 * second (newer) key wins.  The result always carries the second pair's key.
 */
template <typename ReductionOp>
struct ReduceByKeyOp
{
    ReductionOp op;     ///< Wrapped binary reduction operator

    /// Constructor
    __device__ __forceinline__ ReduceByKeyOp(ReductionOp op) : op(op) {}

    /// Combine \p first and \p second into a single key-value pair
    template <typename KeyValuePair>
    __device__ __forceinline__ KeyValuePair operator()(
        const KeyValuePair &first,
        const KeyValuePair &second)
    {
        KeyValuePair result;
        result.key = second.key;

        // Same key: reduce the two values.  Different key: the value for the
        // newer key replaces the running value outright.
        if (second.key != first.key)
            result.value = second.value;
        else
            result.value = op(first.value, second.value);

        return result;
    }
};
84 
85 
86 
/**
 * Binary combining operator for segmented scans over key-value pairs, where
 * \c key accumulates a count of segment heads and \c value is reduced within
 * a segment (reset whenever \p second starts a new segment, i.e. has a
 * non-zero key).
 *
 * Bug fix: the else-branch previously ended with a bare `return;` from a
 * value-returning function — undefined behavior that discarded the computed
 * pair.  Both branches also duplicated the key computation; folded together.
 */
template <typename ReductionOp>
struct SegmentedOp
{
    ReductionOp op;     ///< Wrapped binary reduction operator

    /// Constructor
    __device__ __forceinline__ SegmentedOp(ReductionOp op) : op(op) {}

    /// Combine \p first and \p second into a single key-value pair
    template <typename KeyValuePair>
    __device__ __forceinline__ KeyValuePair operator()(
        const KeyValuePair &first,
        const KeyValuePair &second)
    {
        KeyValuePair retval;

        // Head-flag count accumulates across both inputs in either case.
        retval.key = first.key + second.key;

        retval.value = (second.key) ?
            second.value :                      // second starts a new segment: restart the reduction
            op(first.value, second.value);      // same segment: reduce the values

        return retval;
    }
};
117 
118 #endif // DOXYGEN_SHOULD_SKIP_THIS
119 
120 
121 
122 /******************************************************************************
123  * Algorithmic variants
124  ******************************************************************************/
125 
/**
 * \brief BlockScanAlgorithm enumerates alternative algorithms for
 * cub::BlockScan to compute a parallel prefix scan across a CUDA thread block.
 *
 * NOTE(review): the Doxygen extraction dropped the enum name and enumerator
 * lines; the identifiers below are reconstructed from their uses in
 * SAFE_ALGORITHM and the Raking/WarpScans typedefs further down — confirm
 * against upstream CUB.
 */
enum BlockScanAlgorithm
{
    BLOCK_SCAN_RAKING,          ///< Raking (sequential-reduction-based) scan; works for any block size
    BLOCK_SCAN_RAKING_MEMOIZE,  ///< Raking scan variant that memoizes intermediate partials (see Raking typedef's extra bool parameter)
    BLOCK_SCAN_WARP_SCANS,      ///< Warpscans-based scan; requires a whole number of warps (see SAFE_ALGORITHM guard)
};
182 
183 
184 /******************************************************************************
185  * Block scan
186  ******************************************************************************/
187 
/**
 * \brief The BlockScan class provides collective prefix-scan methods
 * (exclusive/inclusive sums and generic scans) across a CUDA thread block.
 *
 * \tparam T            Data type being scanned
 * \tparam BLOCK_DIM_X  The thread block length in threads along the X dimension
 * \tparam ALGORITHM    cub::BlockScanAlgorithm to use
 * \tparam BLOCK_DIM_Y  The thread block length in threads along the Y dimension
 * \tparam BLOCK_DIM_Z  The thread block length in threads along the Z dimension
 * \tparam PTX_ARCH     The PTX compute capability being compiled for
 *
 * NOTE(review): the Doxygen extraction dropped the ALGORITHM template
 * parameter, the `class BlockScan` line, and the raking fallback branch of
 * SAFE_ALGORITHM; all three are reconstructed here from their uses below —
 * confirm against upstream CUB.
 */
template <
    typename            T,
    int                 BLOCK_DIM_X,
    BlockScanAlgorithm  ALGORITHM = BLOCK_SCAN_RAKING,
    int                 BLOCK_DIM_Y = 1,
    int                 BLOCK_DIM_Z = 1,
    int                 PTX_ARCH = CUB_PTX_ARCH>
class BlockScan
{
private:

    /******************************************************************************
     * Constants and type definitions
     ******************************************************************************/

    /// Constants
    enum
    {
        /// The thread block size in threads
        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
    };

    /// BLOCK_SCAN_WARP_SCANS only supports blocks comprising a whole number of
    /// warps; otherwise silently fall back to the raking algorithm, which
    /// handles any block size.
    static const BlockScanAlgorithm SAFE_ALGORITHM =
        ((ALGORITHM == BLOCK_SCAN_WARP_SCANS) && (BLOCK_THREADS % CUB_WARP_THREADS(PTX_ARCH) != 0)) ?
            BLOCK_SCAN_RAKING :
            ALGORITHM;

    typedef BlockScanWarpScans<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> WarpScans;
    typedef BlockScanRaking<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, (SAFE_ALGORITHM == BLOCK_SCAN_RAKING_MEMOIZE), PTX_ARCH> Raking;

    /// Internal specialization selected by the (safe) algorithm
    typedef typename If<(SAFE_ALGORITHM == BLOCK_SCAN_WARP_SCANS),
        WarpScans,
        Raking>::Type InternalBlockScan;

    /// Shared memory storage layout type for BlockScan
    typedef typename InternalBlockScan::TempStorage _TempStorage;
302 
303  /******************************************************************************
304  * Thread fields
305  ******************************************************************************/
306 
    /// Reference to the shared memory storage used by the internal scan
    /// (either user-supplied via the TempStorage constructor, or private)
    _TempStorage &temp_storage;

    /// Linear (row-major) rank of the calling thread within the thread block
    int linear_tid;


    /******************************************************************************
     * Utility methods
     ******************************************************************************/

    /// Returns a block-private shared memory allocation, used when the caller
    /// did not supply TempStorage.  Returning a reference to a __shared__
    /// variable is safe here: shared memory is block-scoped, not stack-scoped.
    __device__ __forceinline__ _TempStorage& PrivateStorage()
    {
        __shared__ _TempStorage private_storage;
        return private_storage;
    }


public:

    /// The opaque temporary shared memory storage type required by BlockScan's
    /// collective operations (an Uninitialized wrapper around _TempStorage)
    struct TempStorage : Uninitialized<_TempStorage> {};
330 
331 
332  /******************************************************************/
336 
    /// \brief Collective constructor using a private static allocation of
    /// shared memory as temporary storage.  The calling thread's rank is its
    /// row-major position within the (possibly 3D) thread block.
    __device__ __forceinline__ BlockScan()
    :
        temp_storage(PrivateStorage()),
        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
    {}


    /// \brief Collective constructor using the specified memory allocation as
    /// temporary storage (aliased to the internal storage layout).
    __device__ __forceinline__ BlockScan(
        TempStorage &temp_storage)      ///< [in] Reference to memory allocation having layout type TempStorage
    :
        temp_storage(temp_storage.Alias()),
        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
    {}
356 
357 
358 
359 
360 
361 
363  /******************************************************************/
367 
368 
404  __device__ __forceinline__ void ExclusiveSum(
405  T input,
406  T &output)
407  {
408  T block_aggregate;
409  InternalBlockScan(temp_storage).ExclusiveSum(input, output, block_aggregate);
410  }
411 
412 
450  __device__ __forceinline__ void ExclusiveSum(
451  T input,
452  T &output,
453  T &block_aggregate)
454  {
455  InternalBlockScan(temp_storage).ExclusiveSum(input, output, block_aggregate);
456  }
457 
458 
534  template <typename BlockPrefixCallbackOp>
535  __device__ __forceinline__ void ExclusiveSum(
536  T input,
537  T &output,
538  T &block_aggregate,
539  BlockPrefixCallbackOp &block_prefix_callback_op)
540  {
541  InternalBlockScan(temp_storage).ExclusiveSum(input, output, block_aggregate, block_prefix_callback_op);
542  }
543 
544 
546  /******************************************************************/
550 
551 
590  template <int ITEMS_PER_THREAD>
591  __device__ __forceinline__ void ExclusiveSum(
592  T (&input)[ITEMS_PER_THREAD],
593  T (&output)[ITEMS_PER_THREAD])
594  {
595  // Reduce consecutive thread items in registers
596  Sum scan_op;
597  T thread_partial = ThreadReduce(input, scan_op);
598 
599  // Exclusive threadblock-scan
600  ExclusiveSum(thread_partial, thread_partial);
601 
602  // Exclusive scan in registers with prefix
603  ThreadScanExclusive(input, output, scan_op, thread_partial);
604  }
605 
606 
647  template <int ITEMS_PER_THREAD>
648  __device__ __forceinline__ void ExclusiveSum(
649  T (&input)[ITEMS_PER_THREAD],
650  T (&output)[ITEMS_PER_THREAD],
651  T &block_aggregate)
652  {
653  // Reduce consecutive thread items in registers
654  Sum scan_op;
655  T thread_partial = ThreadReduce(input, scan_op);
656 
657  // Exclusive threadblock-scan
658  ExclusiveSum(thread_partial, thread_partial, block_aggregate);
659 
660  // Exclusive scan in registers with prefix
661  ThreadScanExclusive(input, output, scan_op, thread_partial);
662  }
663 
664 
752  template <
753  int ITEMS_PER_THREAD,
754  typename BlockPrefixCallbackOp>
755  __device__ __forceinline__ void ExclusiveSum(
756  T (&input)[ITEMS_PER_THREAD],
757  T (&output)[ITEMS_PER_THREAD],
758  T &block_aggregate,
759  BlockPrefixCallbackOp &block_prefix_callback_op)
760  {
761  // Reduce consecutive thread items in registers
762  Sum scan_op;
763  T thread_partial = ThreadReduce(input, scan_op);
764 
765  // Exclusive threadblock-scan
766  ExclusiveSum(thread_partial, thread_partial, block_aggregate, block_prefix_callback_op);
767 
768  // Exclusive scan in registers with prefix
769  ThreadScanExclusive(input, output, scan_op, thread_partial);
770  }
771 
772 
773 
775  /******************************************************************/
779 
780 
818  template <typename ScanOp>
819  __device__ __forceinline__ void ExclusiveScan(
820  T input,
821  T &output,
822  T identity,
823  ScanOp scan_op)
824  {
825  T block_aggregate;
826  InternalBlockScan(temp_storage).ExclusiveScan(input, output, identity, scan_op, block_aggregate);
827  }
828 
829 
869  template <typename ScanOp>
870  __device__ __forceinline__ void ExclusiveScan(
871  T input,
872  T &output,
873  T identity,
874  ScanOp scan_op,
875  T &block_aggregate)
876  {
877  InternalBlockScan(temp_storage).ExclusiveScan(input, output, identity, scan_op, block_aggregate);
878  }
879 
880 
959  template <
960  typename ScanOp,
961  typename BlockPrefixCallbackOp>
962  __device__ __forceinline__ void ExclusiveScan(
963  T input,
964  T &output,
965  T identity,
966  ScanOp scan_op,
967  T &block_aggregate,
968  BlockPrefixCallbackOp &block_prefix_callback_op)
969  {
970  InternalBlockScan(temp_storage).ExclusiveScan(input, output, identity, scan_op, block_aggregate, block_prefix_callback_op);
971  }
972 
973 
975  /******************************************************************/
979 
980 
1023  template <
1024  int ITEMS_PER_THREAD,
1025  typename ScanOp>
1026  __device__ __forceinline__ void ExclusiveScan(
1027  T (&input)[ITEMS_PER_THREAD],
1028  T (&output)[ITEMS_PER_THREAD],
1029  T identity,
1030  ScanOp scan_op)
1031  {
1032  // Reduce consecutive thread items in registers
1033  T thread_partial = ThreadReduce(input, scan_op);
1034 
1035  // Exclusive threadblock-scan
1036  ExclusiveScan(thread_partial, thread_partial, identity, scan_op);
1037 
1038  // Exclusive scan in registers with prefix
1039  ThreadScanExclusive(input, output, scan_op, thread_partial);
1040  }
1041 
1042 
1085  template <
1086  int ITEMS_PER_THREAD,
1087  typename ScanOp>
1088  __device__ __forceinline__ void ExclusiveScan(
1089  T (&input)[ITEMS_PER_THREAD],
1090  T (&output)[ITEMS_PER_THREAD],
1091  T identity,
1092  ScanOp scan_op,
1093  T &block_aggregate)
1094  {
1095  // Reduce consecutive thread items in registers
1096  T thread_partial = ThreadReduce(input, scan_op);
1097 
1098  // Exclusive threadblock-scan
1099  ExclusiveScan(thread_partial, thread_partial, identity, scan_op, block_aggregate);
1100 
1101  // Exclusive scan in registers with prefix
1102  ThreadScanExclusive(input, output, scan_op, thread_partial);
1103  }
1104 
1105 
1195  template <
1196  int ITEMS_PER_THREAD,
1197  typename ScanOp,
1198  typename BlockPrefixCallbackOp>
1199  __device__ __forceinline__ void ExclusiveScan(
1200  T (&input)[ITEMS_PER_THREAD],
1201  T (&output)[ITEMS_PER_THREAD],
1202  T identity,
1203  ScanOp scan_op,
1204  T &block_aggregate,
1205  BlockPrefixCallbackOp &block_prefix_callback_op)
1206  {
1207  // Reduce consecutive thread items in registers
1208  T thread_partial = ThreadReduce(input, scan_op);
1209 
1210  // Exclusive threadblock-scan
1211  ExclusiveScan(thread_partial, thread_partial, identity, scan_op, block_aggregate, block_prefix_callback_op);
1212 
1213  // Exclusive scan in registers with prefix
1214  ThreadScanExclusive(input, output, scan_op, thread_partial);
1215  }
1216 
1217 
1219 
1220 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
1221 
1222  /******************************************************************/
1226 
1227 
1238  template <typename ScanOp>
1239  __device__ __forceinline__ void ExclusiveScan(
1240  T input,
1241  T &output,
1242  ScanOp scan_op)
1243  {
1244  T block_aggregate;
1245  InternalBlockScan(temp_storage).ExclusiveScan(input, output, scan_op, block_aggregate);
1246  }
1247 
1248 
1259  template <typename ScanOp>
1260  __device__ __forceinline__ void ExclusiveScan(
1261  T input,
1262  T &output,
1263  ScanOp scan_op,
1264  T &block_aggregate)
1265  {
1266  InternalBlockScan(temp_storage).ExclusiveScan(input, output, scan_op, block_aggregate);
1267  }
1268 
1269 
1286  template <
1287  typename ScanOp,
1288  typename BlockPrefixCallbackOp>
1289  __device__ __forceinline__ void ExclusiveScan(
1290  T input,
1291  T &output,
1292  ScanOp scan_op,
1293  T &block_aggregate,
1294  BlockPrefixCallbackOp &block_prefix_callback_op)
1295  {
1296  InternalBlockScan(temp_storage).ExclusiveScan(input, output, scan_op, block_aggregate, block_prefix_callback_op);
1297  }
1298 
1299 
1301  /******************************************************************/
1305 
1306 
1319  template <
1320  int ITEMS_PER_THREAD,
1321  typename ScanOp>
1322  __device__ __forceinline__ void ExclusiveScan(
1323  T (&input)[ITEMS_PER_THREAD],
1324  T (&output)[ITEMS_PER_THREAD],
1325  ScanOp scan_op)
1326  {
1327  // Reduce consecutive thread items in registers
1328  T thread_partial = ThreadReduce(input, scan_op);
1329 
1330  // Exclusive threadblock-scan
1331  ExclusiveScan(thread_partial, thread_partial, scan_op);
1332 
1333  // Exclusive scan in registers with prefix
1334  ThreadScanExclusive(input, output, scan_op, thread_partial, (linear_tid != 0));
1335  }
1336 
1337 
1350  template <
1351  int ITEMS_PER_THREAD,
1352  typename ScanOp>
1353  __device__ __forceinline__ void ExclusiveScan(
1354  T (&input)[ITEMS_PER_THREAD],
1355  T (&output)[ITEMS_PER_THREAD],
1356  ScanOp scan_op,
1357  T &block_aggregate)
1358  {
1359  // Reduce consecutive thread items in registers
1360  T thread_partial = ThreadReduce(input, scan_op);
1361 
1362  // Exclusive threadblock-scan
1363  ExclusiveScan(thread_partial, thread_partial, scan_op, block_aggregate);
1364 
1365  // Exclusive scan in registers with prefix
1366  ThreadScanExclusive(input, output, scan_op, thread_partial, (linear_tid != 0));
1367  }
1368 
1369 
1388  template <
1389  int ITEMS_PER_THREAD,
1390  typename ScanOp,
1391  typename BlockPrefixCallbackOp>
1392  __device__ __forceinline__ void ExclusiveScan(
1393  T (&input)[ITEMS_PER_THREAD],
1394  T (&output)[ITEMS_PER_THREAD],
1395  ScanOp scan_op,
1396  T &block_aggregate,
1397  BlockPrefixCallbackOp &block_prefix_callback_op)
1398  {
1399  // Reduce consecutive thread items in registers
1400  T thread_partial = ThreadReduce(input, scan_op);
1401 
1402  // Exclusive threadblock-scan
1403  ExclusiveScan(thread_partial, thread_partial, scan_op, block_aggregate, block_prefix_callback_op);
1404 
1405  // Exclusive scan in registers with prefix
1406  ThreadScanExclusive(input, output, scan_op, thread_partial);
1407  }
1408 
1409 
1411 
1412 #endif // DOXYGEN_SHOULD_SKIP_THIS
1413 
1414  /******************************************************************/
1418 
1419 
1455  __device__ __forceinline__ void InclusiveSum(
1456  T input,
1457  T &output)
1458  {
1459  T block_aggregate;
1460  InternalBlockScan(temp_storage).InclusiveSum(input, output, block_aggregate);
1461  }
1462 
1463 
1501  __device__ __forceinline__ void InclusiveSum(
1502  T input,
1503  T &output,
1504  T &block_aggregate)
1505  {
1506  InternalBlockScan(temp_storage).InclusiveSum(input, output, block_aggregate);
1507  }
1508 
1509 
1510 
1586  template <typename BlockPrefixCallbackOp>
1587  __device__ __forceinline__ void InclusiveSum(
1588  T input,
1589  T &output,
1590  T &block_aggregate,
1591  BlockPrefixCallbackOp &block_prefix_callback_op)
1592  {
1593  InternalBlockScan(temp_storage).InclusiveSum(input, output, block_aggregate, block_prefix_callback_op);
1594  }
1595 
1596 
1598  /******************************************************************/
1602 
1603 
1642  template <int ITEMS_PER_THREAD>
1643  __device__ __forceinline__ void InclusiveSum(
1644  T (&input)[ITEMS_PER_THREAD],
1645  T (&output)[ITEMS_PER_THREAD])
1646  {
1647  if (ITEMS_PER_THREAD == 1)
1648  {
1649  InclusiveSum(input[0], output[0]);
1650  }
1651  else
1652  {
1653  // Reduce consecutive thread items in registers
1654  Sum scan_op;
1655  T thread_partial = ThreadReduce(input, scan_op);
1656 
1657  // Exclusive threadblock-scan
1658  ExclusiveSum(thread_partial, thread_partial);
1659 
1660  // Inclusive scan in registers with prefix
1661  ThreadScanInclusive(input, output, scan_op, thread_partial, (linear_tid != 0));
1662  }
1663  }
1664 
1665 
1709  template <int ITEMS_PER_THREAD>
1710  __device__ __forceinline__ void InclusiveSum(
1711  T (&input)[ITEMS_PER_THREAD],
1712  T (&output)[ITEMS_PER_THREAD],
1713  T &block_aggregate)
1714  {
1715  if (ITEMS_PER_THREAD == 1)
1716  {
1717  InclusiveSum(input[0], output[0], block_aggregate);
1718  }
1719  else
1720  {
1721  // Reduce consecutive thread items in registers
1722  Sum scan_op;
1723  T thread_partial = ThreadReduce(input, scan_op);
1724 
1725  // Exclusive threadblock-scan
1726  ExclusiveSum(thread_partial, thread_partial, block_aggregate);
1727 
1728  // Inclusive scan in registers with prefix
1729  ThreadScanInclusive(input, output, scan_op, thread_partial, (linear_tid != 0));
1730  }
1731  }
1732 
1733 
1821  template <
1822  int ITEMS_PER_THREAD,
1823  typename BlockPrefixCallbackOp>
1824  __device__ __forceinline__ void InclusiveSum(
1825  T (&input)[ITEMS_PER_THREAD],
1826  T (&output)[ITEMS_PER_THREAD],
1827  T &block_aggregate,
1828  BlockPrefixCallbackOp &block_prefix_callback_op)
1829  {
1830  if (ITEMS_PER_THREAD == 1)
1831  {
1832  InclusiveSum(input[0], output[0], block_aggregate, block_prefix_callback_op);
1833  }
1834  else
1835  {
1836  // Reduce consecutive thread items in registers
1837  Sum scan_op;
1838  T thread_partial = ThreadReduce(input, scan_op);
1839 
1840  // Exclusive threadblock-scan
1841  ExclusiveSum(thread_partial, thread_partial, block_aggregate, block_prefix_callback_op);
1842 
1843  // Inclusive scan in registers with prefix
1844  ThreadScanInclusive(input, output, scan_op, thread_partial);
1845  }
1846  }
1847 
1848 
1850  /******************************************************************/
1854 
1855 
1893  template <typename ScanOp>
1894  __device__ __forceinline__ void InclusiveScan(
1895  T input,
1896  T &output,
1897  ScanOp scan_op)
1898  {
1899  T block_aggregate;
1900  InclusiveScan(input, output, scan_op, block_aggregate);
1901  }
1902 
1903 
1943  template <typename ScanOp>
1944  __device__ __forceinline__ void InclusiveScan(
1945  T input,
1946  T &output,
1947  ScanOp scan_op,
1948  T &block_aggregate)
1949  {
1950  InternalBlockScan(temp_storage).InclusiveScan(input, output, scan_op, block_aggregate);
1951  }
1952 
1953 
2032  template <
2033  typename ScanOp,
2034  typename BlockPrefixCallbackOp>
2035  __device__ __forceinline__ void InclusiveScan(
2036  T input,
2037  T &output,
2038  ScanOp scan_op,
2039  T &block_aggregate,
2040  BlockPrefixCallbackOp &block_prefix_callback_op)
2041  {
2042  InternalBlockScan(temp_storage).InclusiveScan(input, output, scan_op, block_aggregate, block_prefix_callback_op);
2043  }
2044 
2045 
2047  /******************************************************************/
2051 
2052 
2093  template <
2094  int ITEMS_PER_THREAD,
2095  typename ScanOp>
2096  __device__ __forceinline__ void InclusiveScan(
2097  T (&input)[ITEMS_PER_THREAD],
2098  T (&output)[ITEMS_PER_THREAD],
2099  ScanOp scan_op)
2100  {
2101  if (ITEMS_PER_THREAD == 1)
2102  {
2103  InclusiveScan(input[0], output[0], scan_op);
2104  }
2105  else
2106  {
2107  // Reduce consecutive thread items in registers
2108  T thread_partial = ThreadReduce(input, scan_op);
2109 
2110  // Exclusive threadblock-scan
2111  ExclusiveScan(thread_partial, thread_partial, scan_op);
2112 
2113  // Inclusive scan in registers with prefix
2114  ThreadScanInclusive(input, output, scan_op, thread_partial, (linear_tid != 0));
2115  }
2116  }
2117 
2118 
2163  template <
2164  int ITEMS_PER_THREAD,
2165  typename ScanOp>
2166  __device__ __forceinline__ void InclusiveScan(
2167  T (&input)[ITEMS_PER_THREAD],
2168  T (&output)[ITEMS_PER_THREAD],
2169  ScanOp scan_op,
2170  T &block_aggregate)
2171  {
2172  if (ITEMS_PER_THREAD == 1)
2173  {
2174  InclusiveScan(input[0], output[0], scan_op, block_aggregate);
2175  }
2176  else
2177  {
2178  // Reduce consecutive thread items in registers
2179  T thread_partial = ThreadReduce(input, scan_op);
2180 
2181  // Exclusive threadblock-scan
2182  ExclusiveScan(thread_partial, thread_partial, scan_op, block_aggregate);
2183 
2184  // Inclusive scan in registers with prefix
2185  ThreadScanInclusive(input, output, scan_op, thread_partial, (linear_tid != 0));
2186  }
2187  }
2188 
2189 
2279  template <
2280  int ITEMS_PER_THREAD,
2281  typename ScanOp,
2282  typename BlockPrefixCallbackOp>
2283  __device__ __forceinline__ void InclusiveScan(
2284  T (&input)[ITEMS_PER_THREAD],
2285  T (&output)[ITEMS_PER_THREAD],
2286  ScanOp scan_op,
2287  T &block_aggregate,
2288  BlockPrefixCallbackOp &block_prefix_callback_op)
2289  {
2290  if (ITEMS_PER_THREAD == 1)
2291  {
2292  InclusiveScan(input[0], output[0], scan_op, block_aggregate, block_prefix_callback_op);
2293  }
2294  else
2295  {
2296  // Reduce consecutive thread items in registers
2297  T thread_partial = ThreadReduce(input, scan_op);
2298 
2299  // Exclusive threadblock-scan
2300  ExclusiveScan(thread_partial, thread_partial, scan_op, block_aggregate, block_prefix_callback_op);
2301 
2302  // Inclusive scan in registers with prefix
2303  ThreadScanInclusive(input, output, scan_op, thread_partial);
2304  }
2305  }
2306 
2308 
2309 
2310 };
2311 
2316 } // CUB namespace
2317 CUB_NS_POSTFIX // Optional outer namespace(s)
2318