36 #include "specializations/block_scan_raking.cuh"
37 #include "specializations/block_scan_warp_scans.cuh"
38 #include "../util_arch.cuh"
39 #include "../util_type.cuh"
40 #include "../util_ptx.cuh"
41 #include "../util_namespace.cuh"
55 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
// Function object that wraps a caller-supplied reduction operator (kept in the `op` member)
// and applies it to key-value pairs.
60 template <
typename ReductionOp>
// Constructor: copies the supplied operator into the `op` member.
66 __device__ __forceinline__ ReduceByKeyOp(ReductionOp op) : op(op) {}
// Call operator: combines two key-value pairs into a single result pair.
69 template <
typename KeyValuePair>
70 __device__ __forceinline__ KeyValuePair operator()(
71 const KeyValuePair &first,
72 const KeyValuePair &second)
// Value selection: when the keys are equal, the two values are combined with `op`.
// The branch taken when the keys differ is not visible in this excerpt.
76 retval.value = (second.key != first.key) ?
78 op(first.value, second.value);
// The result always takes the key of the second operand.
80 retval.key = second.key;
// Function object that wraps a caller-supplied reduction operator (kept in the `op` member)
// and applies it to key-value pairs; result keys are the sum of the operand keys.
90 template <
typename ReductionOp>
// Constructor: copies the supplied operator into the `op` member.
96 __device__ __forceinline__ SegmentedOp(ReductionOp op) : op(op) {}
// Call operator: combines two key-value pairs into a single result pair.
99 template <
typename KeyValuePair>
100 __device__ __forceinline__ KeyValuePair operator()(
101 const KeyValuePair &first,
102 const KeyValuePair &second)
// First visible result path: the second operand's value is taken unchanged.
// NOTE(review): the condition guarding this path is not visible here; presumably it
// tests second.key (a segment-head flag) - confirm against the full source.
106 retval.value = second.value;
107 retval.key = first.key + second.key;
// Second visible result path: the two values are combined with `op`.
// On both paths the result key is first.key + second.key.
111 retval.value = op(first.value, second.value);
112 retval.key = first.key + second.key;
118 #endif // DOXYGEN_SHOULD_SKIP_THIS
// Total thread count of the block: the product of the three block dimensions.
277 BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
// Candidate implementation built on BlockScanWarpScans, parameterized by T, the block
// dimensions and PTX_ARCH.
291 typedef BlockScanWarpScans<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> WarpScans;
// Candidate implementation built on BlockScanRaking; it additionally receives a compile-time
// boolean that is true when SAFE_ALGORITHM equals BLOCK_SCAN_RAKING_MEMOIZE.
292 typedef BlockScanRaking<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, (SAFE_ALGORITHM == BLOCK_SCAN_RAKING_MEMOIZE), PTX_ARCH> Raking;
// Tail of the type-selection expression that defines InternalBlockScan; Raking is its last
// argument. The selecting condition is outside this excerpt.
297 Raking>::Type InternalBlockScan;
// Storage type required by the selected implementation.
300 typedef typename InternalBlockScan::TempStorage _TempStorage;
// Reference to the storage instance used by this object; bound in the constructors.
308 _TempStorage &temp_storage;
// Returns a reference to a function-local `__shared__` _TempStorage object.
// Used by the constructor that initializes temp_storage from PrivateStorage().
319 __device__ __forceinline__ _TempStorage& PrivateStorage()
// Declared __shared__, so the object is not a per-thread local and the returned
// reference does not dangle after the function returns.
321 __shared__ _TempStorage private_storage;
322 return private_storage;
// Constructor initializer list (first visible form): binds temp_storage to the object
// returned by PrivateStorage() and computes linear_tid with RowMajorTid over the three
// block dimensions.
342 temp_storage(PrivateStorage()),
343 linear_tid(
RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
// Constructor initializer list (second visible form): binds temp_storage to the result of
// calling Alias() on an object also named temp_storage.
// NOTE(review): that object is presumably the constructor's caller-supplied storage
// parameter shadowing the member - the signature is not visible here; confirm.
353 temp_storage(temp_storage.Alias()),
354 linear_tid(
RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
// Body line of one overload (signature not visible): constructs an InternalBlockScan over
// this object's temp_storage and forwards input, output and block_aggregate to its ExclusiveSum.
409 InternalBlockScan(temp_storage).ExclusiveSum(input, output, block_aggregate);
// Body line of a second overload (signature not visible): identical forwarding call.
455 InternalBlockScan(temp_storage).ExclusiveSum(input, output, block_aggregate);
// Overload templated on the type of a block-prefix callback object.
534 template <
typename BlockPrefixCallbackOp>
// The callback is taken by non-const reference.
539 BlockPrefixCallbackOp &block_prefix_callback_op)
// Forwards input, output, block_aggregate and the callback to InternalBlockScan's ExclusiveSum,
// using this object's temp_storage.
541 InternalBlockScan(temp_storage).ExclusiveSum(input, output, block_aggregate, block_prefix_callback_op);
// Multi-item overload: input and output are references to fixed-size arrays of
// ITEMS_PER_THREAD elements of T.
// NOTE(review): the function name is not visible in this excerpt; the position suggests
// an ExclusiveSum overload - confirm.
590 template <
int ITEMS_PER_THREAD>
592 T (&input)[ITEMS_PER_THREAD],
593 T (&output)[ITEMS_PER_THREAD])
// Reduce this thread's items to a single partial using scan_op (declared outside this excerpt).
597 T thread_partial = ThreadReduce(input, scan_op);
// Exclusive scan over this thread's items, passing thread_partial as the prefix argument.
// Any step that updates thread_partial between these two lines is not visible here.
603 ThreadScanExclusive(input, output, scan_op, thread_partial);
// Multi-item overload: input and output are references to fixed-size arrays of
// ITEMS_PER_THREAD elements of T.
647 template <
int ITEMS_PER_THREAD>
649 T (&input)[ITEMS_PER_THREAD],
650 T (&output)[ITEMS_PER_THREAD],
// Step 1: reduce this thread's items to a single partial using scan_op.
655 T thread_partial = ThreadReduce(input, scan_op);
// Step 2: single-item ExclusiveSum over the partials, in place (thread_partial is both the
// input and the output argument); block_aggregate is passed through.
658 ExclusiveSum(thread_partial, thread_partial, block_aggregate);
// Step 3: exclusive scan over this thread's items, passing the updated thread_partial as
// the prefix argument.
661 ThreadScanExclusive(input, output, scan_op, thread_partial);
// Multi-item overload templated on the per-thread item count and on the type of a
// block-prefix callback object.
753 int ITEMS_PER_THREAD,
754 typename BlockPrefixCallbackOp>
756 T (&input)[ITEMS_PER_THREAD],
757 T (&output)[ITEMS_PER_THREAD],
// The callback is taken by non-const reference.
759 BlockPrefixCallbackOp &block_prefix_callback_op)
// Step 1: reduce this thread's items to a single partial using scan_op.
763 T thread_partial = ThreadReduce(input, scan_op);
// Step 2: single-item ExclusiveSum over the partials, in place, forwarding block_aggregate
// and the callback.
766 ExclusiveSum(thread_partial, thread_partial, block_aggregate, block_prefix_callback_op);
// Step 3: exclusive scan over this thread's items, passing the updated thread_partial as
// the prefix argument.
769 ThreadScanExclusive(input, output, scan_op, thread_partial);
// Overload templated on the scan operator type.
818 template <
typename ScanOp>
// Forwards input, output, identity, scan_op and block_aggregate to InternalBlockScan's
// ExclusiveScan, using this object's temp_storage.
826 InternalBlockScan(temp_storage).ExclusiveScan(input, output, identity, scan_op, block_aggregate);
// Overload templated on the scan operator type (signature not visible in this excerpt).
869 template <
typename ScanOp>
// Forwards input, output, identity, scan_op and block_aggregate to InternalBlockScan's
// ExclusiveScan, using this object's temp_storage.
877 InternalBlockScan(temp_storage).ExclusiveScan(input, output, identity, scan_op, block_aggregate);
// Last template parameter: the type of a block-prefix callback object.
961 typename BlockPrefixCallbackOp>
// The callback is taken by non-const reference.
968 BlockPrefixCallbackOp &block_prefix_callback_op)
// Forwards input, output, identity, scan_op, block_aggregate and the callback to
// InternalBlockScan's ExclusiveScan, using this object's temp_storage.
970 InternalBlockScan(temp_storage).ExclusiveScan(input, output, identity, scan_op, block_aggregate, block_prefix_callback_op);
// Multi-item overload: input and output are references to fixed-size arrays of
// ITEMS_PER_THREAD elements of T.
1024 int ITEMS_PER_THREAD,
1027 T (&input)[ITEMS_PER_THREAD],
1028 T (&output)[ITEMS_PER_THREAD],
// Step 1: reduce this thread's items to a single partial using scan_op.
1033 T thread_partial = ThreadReduce(input, scan_op);
// Step 2: single-item ExclusiveScan over the partials, in place, with the supplied identity.
1036 ExclusiveScan(thread_partial, thread_partial, identity, scan_op);
// Step 3: exclusive scan over this thread's items, passing the updated thread_partial as
// the prefix argument.
1039 ThreadScanExclusive(input, output, scan_op, thread_partial);
// Multi-item overload: input and output are references to fixed-size arrays of
// ITEMS_PER_THREAD elements of T.
1086 int ITEMS_PER_THREAD,
1089 T (&input)[ITEMS_PER_THREAD],
1090 T (&output)[ITEMS_PER_THREAD],
// Step 1: reduce this thread's items to a single partial using scan_op.
1096 T thread_partial = ThreadReduce(input, scan_op);
// Step 2: single-item ExclusiveScan over the partials, in place, with the supplied identity;
// block_aggregate is passed through.
1099 ExclusiveScan(thread_partial, thread_partial, identity, scan_op, block_aggregate);
// Step 3: exclusive scan over this thread's items, passing the updated thread_partial as
// the prefix argument.
1102 ThreadScanExclusive(input, output, scan_op, thread_partial);
// Multi-item overload templated on the per-thread item count and on the type of a
// block-prefix callback object (other template parameters are not visible here).
1196 int ITEMS_PER_THREAD,
1198 typename BlockPrefixCallbackOp>
1200 T (&input)[ITEMS_PER_THREAD],
1201 T (&output)[ITEMS_PER_THREAD],
// The callback is taken by non-const reference.
1205 BlockPrefixCallbackOp &block_prefix_callback_op)
// Step 1: reduce this thread's items to a single partial using scan_op.
1208 T thread_partial = ThreadReduce(input, scan_op);
// Step 2: single-item ExclusiveScan over the partials, in place, with the supplied identity,
// forwarding block_aggregate and the callback.
1211 ExclusiveScan(thread_partial, thread_partial, identity, scan_op, block_aggregate, block_prefix_callback_op);
// Step 3: exclusive scan over this thread's items, passing the updated thread_partial as
// the prefix argument.
1214 ThreadScanExclusive(input, output, scan_op, thread_partial);
1220 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
// Overload templated on the scan operator type; unlike the overloads above, no identity
// argument is passed on.
1238 template <
typename ScanOp>
// Forwards input, output, scan_op and block_aggregate to InternalBlockScan's ExclusiveScan,
// using this object's temp_storage.
1245 InternalBlockScan(temp_storage).ExclusiveScan(input, output, scan_op, block_aggregate);
// Overload templated on the scan operator type (signature not visible in this excerpt);
// no identity argument is passed on.
1259 template <
typename ScanOp>
// Forwards input, output, scan_op and block_aggregate to InternalBlockScan's ExclusiveScan,
// using this object's temp_storage.
1266 InternalBlockScan(temp_storage).ExclusiveScan(input, output, scan_op, block_aggregate);
// Last template parameter: the type of a block-prefix callback object.
1288 typename BlockPrefixCallbackOp>
// The callback is taken by non-const reference.
1294 BlockPrefixCallbackOp &block_prefix_callback_op)
// Forwards input, output, scan_op, block_aggregate and the callback to InternalBlockScan's
// ExclusiveScan (no identity argument), using this object's temp_storage.
1296 InternalBlockScan(temp_storage).ExclusiveScan(input, output, scan_op, block_aggregate, block_prefix_callback_op);
// Multi-item overload: input and output are references to fixed-size arrays of
// ITEMS_PER_THREAD elements of T.
1320 int ITEMS_PER_THREAD,
1323 T (&input)[ITEMS_PER_THREAD],
1324 T (&output)[ITEMS_PER_THREAD],
// Reduce this thread's items to a single partial using scan_op.
1328 T thread_partial = ThreadReduce(input, scan_op);
// Exclusive scan over this thread's items with thread_partial as the prefix argument.
// Any step that updates thread_partial between these two lines is not visible here.
// NOTE(review): the trailing argument is false only for linear_tid == 0; presumably it
// tells the helper whether to apply the prefix - confirm against ThreadScanExclusive.
1334 ThreadScanExclusive(input, output, scan_op, thread_partial, (linear_tid != 0));
// Multi-item overload: input and output are references to fixed-size arrays of
// ITEMS_PER_THREAD elements of T.
1351 int ITEMS_PER_THREAD,
1354 T (&input)[ITEMS_PER_THREAD],
1355 T (&output)[ITEMS_PER_THREAD],
// Step 1: reduce this thread's items to a single partial using scan_op.
1360 T thread_partial = ThreadReduce(input, scan_op);
// Step 2: single-item ExclusiveScan over the partials, in place, without an identity;
// block_aggregate is passed through.
1363 ExclusiveScan(thread_partial, thread_partial, scan_op, block_aggregate);
// Step 3: exclusive scan over this thread's items with thread_partial as the prefix argument.
// NOTE(review): the trailing argument is false only for linear_tid == 0; presumably it
// tells the helper whether to apply the prefix - confirm against ThreadScanExclusive.
1366 ThreadScanExclusive(input, output, scan_op, thread_partial, (linear_tid != 0));
// Multi-item overload templated on the per-thread item count and on the type of a
// block-prefix callback object (other template parameters are not visible here).
1389 int ITEMS_PER_THREAD,
1391 typename BlockPrefixCallbackOp>
1393 T (&input)[ITEMS_PER_THREAD],
1394 T (&output)[ITEMS_PER_THREAD],
// The callback is taken by non-const reference.
1397 BlockPrefixCallbackOp &block_prefix_callback_op)
// Step 1: reduce this thread's items to a single partial using scan_op.
1400 T thread_partial = ThreadReduce(input, scan_op);
// Step 2: single-item ExclusiveScan over the partials, in place, without an identity,
// forwarding block_aggregate and the callback.
1403 ExclusiveScan(thread_partial, thread_partial, scan_op, block_aggregate, block_prefix_callback_op);
// Step 3: exclusive scan over this thread's items with thread_partial as the prefix argument.
// Unlike the sibling overloads without a callback, no (linear_tid != 0) flag is passed here.
1406 ThreadScanExclusive(input, output, scan_op, thread_partial);
1412 #endif // DOXYGEN_SHOULD_SKIP_THIS
// Body line of one overload (signature not visible): constructs an InternalBlockScan over
// this object's temp_storage and forwards input, output and block_aggregate to its InclusiveSum.
1460 InternalBlockScan(temp_storage).InclusiveSum(input, output, block_aggregate);
// Body line of a second overload (signature not visible): identical forwarding call.
1506 InternalBlockScan(temp_storage).InclusiveSum(input, output, block_aggregate);
// Overload templated on the type of a block-prefix callback object.
1586 template <
typename BlockPrefixCallbackOp>
// The callback is taken by non-const reference.
1591 BlockPrefixCallbackOp &block_prefix_callback_op)
// Forwards input, output, block_aggregate and the callback to InternalBlockScan's InclusiveSum,
// using this object's temp_storage.
1593 InternalBlockScan(temp_storage).InclusiveSum(input, output, block_aggregate, block_prefix_callback_op);
// Multi-item overload: input and output are references to fixed-size arrays of
// ITEMS_PER_THREAD elements of T.
// NOTE(review): the function name is not visible in this excerpt; the position suggests
// an InclusiveSum overload - confirm.
1642 template <
int ITEMS_PER_THREAD>
1644 T (&input)[ITEMS_PER_THREAD],
1645 T (&output)[ITEMS_PER_THREAD])
// Special case on the compile-time item count; the body of this branch is not visible here.
1647 if (ITEMS_PER_THREAD == 1)
// General path: reduce this thread's items to a single partial using scan_op.
1655 T thread_partial = ThreadReduce(input, scan_op);
// Inclusive scan over this thread's items with thread_partial as the prefix argument.
// Any step that updates thread_partial between these two lines is not visible here.
// NOTE(review): the trailing argument is false only for linear_tid == 0; presumably it
// tells the helper whether to apply the prefix - confirm against ThreadScanInclusive.
1661 ThreadScanInclusive(input, output, scan_op, thread_partial, (linear_tid != 0));
// Multi-item overload: input and output are references to fixed-size arrays of
// ITEMS_PER_THREAD elements of T.
// NOTE(review): the function name is not visible in this excerpt; the position suggests
// an InclusiveSum overload - confirm.
1709 template <
int ITEMS_PER_THREAD>
1711 T (&input)[ITEMS_PER_THREAD],
1712 T (&output)[ITEMS_PER_THREAD],
// Special case on the compile-time item count; the body of this branch is not visible here.
1715 if (ITEMS_PER_THREAD == 1)
// General path, step 1: reduce this thread's items to a single partial using scan_op.
1723 T thread_partial = ThreadReduce(input, scan_op);
// Step 2: single-item ExclusiveSum over the partials, in place; block_aggregate is passed through.
1726 ExclusiveSum(thread_partial, thread_partial, block_aggregate);
// Step 3: inclusive scan over this thread's items with thread_partial as the prefix argument.
// NOTE(review): the trailing argument is false only for linear_tid == 0; presumably it
// tells the helper whether to apply the prefix - confirm against ThreadScanInclusive.
1729 ThreadScanInclusive(input, output, scan_op, thread_partial, (linear_tid != 0));
// Multi-item overload templated on the per-thread item count and on the type of a
// block-prefix callback object.
1822 int ITEMS_PER_THREAD,
1823 typename BlockPrefixCallbackOp>
1825 T (&input)[ITEMS_PER_THREAD],
1826 T (&output)[ITEMS_PER_THREAD],
// The callback is taken by non-const reference.
1828 BlockPrefixCallbackOp &block_prefix_callback_op)
// Special case on the compile-time item count: with a single item, call the single-item
// InclusiveSum on element 0 directly.
1830 if (ITEMS_PER_THREAD == 1)
1832 InclusiveSum(input[0], output[0], block_aggregate, block_prefix_callback_op);
// General path, step 1: reduce this thread's items to a single partial using scan_op.
1838 T thread_partial = ThreadReduce(input, scan_op);
// Step 2: single-item ExclusiveSum over the partials, in place, forwarding block_aggregate
// and the callback.
1841 ExclusiveSum(thread_partial, thread_partial, block_aggregate, block_prefix_callback_op);
// Step 3: inclusive scan over this thread's items with thread_partial as the prefix argument.
1844 ThreadScanInclusive(input, output, scan_op, thread_partial);
// Template header of an overload parameterized on the scan operator type; its signature
// and body are outside this excerpt.
1893 template <
typename ScanOp>
// A further overload parameterized on the scan operator type.
1943 template <
typename ScanOp>
// Forwards input, output, scan_op and block_aggregate to InternalBlockScan's InclusiveScan,
// using this object's temp_storage.
1950 InternalBlockScan(temp_storage).InclusiveScan(input, output, scan_op, block_aggregate);
// Last template parameter: the type of a block-prefix callback object.
2034 typename BlockPrefixCallbackOp>
// The callback is taken by non-const reference.
2040 BlockPrefixCallbackOp &block_prefix_callback_op)
// Forwards input, output, scan_op, block_aggregate and the callback to InternalBlockScan's
// InclusiveScan, using this object's temp_storage.
2042 InternalBlockScan(temp_storage).InclusiveScan(input, output, scan_op, block_aggregate, block_prefix_callback_op);
// Multi-item overload: input and output are references to fixed-size arrays of
// ITEMS_PER_THREAD elements of T.
// NOTE(review): the function name is not visible in this excerpt; the position suggests
// an InclusiveScan overload - confirm.
2094 int ITEMS_PER_THREAD,
2097 T (&input)[ITEMS_PER_THREAD],
2098 T (&output)[ITEMS_PER_THREAD],
// Special case on the compile-time item count; the body of this branch is not visible here.
2101 if (ITEMS_PER_THREAD == 1)
// General path: reduce this thread's items to a single partial using scan_op.
2108 T thread_partial = ThreadReduce(input, scan_op);
// Inclusive scan over this thread's items with thread_partial as the prefix argument.
// Any step that updates thread_partial between these two lines is not visible here.
// NOTE(review): the trailing argument is false only for linear_tid == 0; presumably it
// tells the helper whether to apply the prefix - confirm against ThreadScanInclusive.
2114 ThreadScanInclusive(input, output, scan_op, thread_partial, (linear_tid != 0));
// Multi-item overload: input and output are references to fixed-size arrays of
// ITEMS_PER_THREAD elements of T.
2164 int ITEMS_PER_THREAD,
2167 T (&input)[ITEMS_PER_THREAD],
2168 T (&output)[ITEMS_PER_THREAD],
// Special case on the compile-time item count: with a single item, call the single-item
// InclusiveScan on element 0 directly.
2172 if (ITEMS_PER_THREAD == 1)
2174 InclusiveScan(input[0], output[0], scan_op, block_aggregate);
// General path, step 1: reduce this thread's items to a single partial using scan_op.
2179 T thread_partial = ThreadReduce(input, scan_op);
// Step 2: single-item ExclusiveScan over the partials, in place, without an identity;
// block_aggregate is passed through.
2182 ExclusiveScan(thread_partial, thread_partial, scan_op, block_aggregate);
// Step 3: inclusive scan over this thread's items with thread_partial as the prefix argument.
// NOTE(review): the trailing argument is false only for linear_tid == 0; presumably it
// tells the helper whether to apply the prefix - confirm against ThreadScanInclusive.
2185 ThreadScanInclusive(input, output, scan_op, thread_partial, (linear_tid != 0));
// Multi-item overload templated on the per-thread item count and on the type of a
// block-prefix callback object (other template parameters are not visible here).
2280 int ITEMS_PER_THREAD,
2282 typename BlockPrefixCallbackOp>
2284 T (&input)[ITEMS_PER_THREAD],
2285 T (&output)[ITEMS_PER_THREAD],
// The callback is taken by non-const reference.
2288 BlockPrefixCallbackOp &block_prefix_callback_op)
// Special case on the compile-time item count: with a single item, call the single-item
// InclusiveScan on element 0 directly.
2290 if (ITEMS_PER_THREAD == 1)
2292 InclusiveScan(input[0], output[0], scan_op, block_aggregate, block_prefix_callback_op);
// General path, step 1: reduce this thread's items to a single partial using scan_op.
2297 T thread_partial = ThreadReduce(input, scan_op);
// Step 2: single-item ExclusiveScan over the partials, in place, without an identity,
// forwarding block_aggregate and the callback.
2300 ExclusiveScan(thread_partial, thread_partial, scan_op, block_aggregate, block_prefix_callback_op);
// Step 3: inclusive scan over this thread's items with thread_partial as the prefix argument.
// Unlike the sibling overloads without a callback, no (linear_tid != 0) flag is passed here.
2303 ThreadScanInclusive(input, output, scan_op, thread_partial);