CUB
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Groups
device_reduce.cuh
Go to the documentation of this file.
1 
2 /******************************************************************************
3  * Copyright (c) 2011, Duane Merrill. All rights reserved.
4  * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions are met:
8  * * Redistributions of source code must retain the above copyright
9  * notice, this list of conditions and the following disclaimer.
10  * * Redistributions in binary form must reproduce the above copyright
11  * notice, this list of conditions and the following disclaimer in the
12  * documentation and/or other materials provided with the distribution.
13  * * Neither the name of the NVIDIA CORPORATION nor the
14  * names of its contributors may be used to endorse or promote products
15  * derived from this software without specific prior written permission.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
20  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
21  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
22  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
23  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
24  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  *
28  ******************************************************************************/
29 
35 #pragma once
36 
37 #include <stdio.h>
38 #include <iterator>
39 
40 #include "dispatch/device_reduce_dispatch.cuh"
41 #include "../util_namespace.cuh"
42 
44 CUB_NS_PREFIX
45 
47 namespace cub {
48 
49 
90 {
144  template <
145  typename InputIterator,
146  typename OutputIterator,
147  typename ReductionOp>
148  CUB_RUNTIME_FUNCTION
149  static cudaError_t Reduce(
150  void *d_temp_storage,
151  size_t &temp_storage_bytes,
152  InputIterator d_in,
153  OutputIterator d_out,
154  int num_items,
155  ReductionOp reduction_op,
156  cudaStream_t stream = 0,
157  bool debug_synchronous = false)
158  {
159  // Signed integer type for global offsets
160  typedef int Offset;
161 
162  // Dispatch type
163  typedef DeviceReduceDispatch<InputIterator, OutputIterator, Offset, ReductionOp> DeviceReduceDispatch;
164 
165  return DeviceReduceDispatch::Dispatch(
166  d_temp_storage,
167  temp_storage_bytes,
168  d_in,
169  d_out,
170  num_items,
171  reduction_op,
172  stream,
173  debug_synchronous);
174  }
175 
176 
222  template <
223  typename InputIterator,
224  typename OutputIterator>
225  CUB_RUNTIME_FUNCTION
226  static cudaError_t Sum(
227  void *d_temp_storage,
228  size_t &temp_storage_bytes,
229  InputIterator d_in,
230  OutputIterator d_out,
231  int num_items,
232  cudaStream_t stream = 0,
233  bool debug_synchronous = false)
234  {
235  // Signed integer type for global offsets
236  typedef int Offset;
237 
238  // Dispatch type
239  typedef DeviceReduceDispatch<InputIterator, OutputIterator, Offset, cub::Sum> DeviceReduceDispatch;
240 
241  return DeviceReduceDispatch::Dispatch(
242  d_temp_storage,
243  temp_storage_bytes,
244  d_in,
245  d_out,
246  num_items,
247  cub::Sum(),
248  stream,
249  debug_synchronous);
250  }
251 
252 
294  template <
295  typename InputIterator,
296  typename OutputIterator>
297  CUB_RUNTIME_FUNCTION
298  static cudaError_t Min(
299  void *d_temp_storage,
300  size_t &temp_storage_bytes,
301  InputIterator d_in,
302  OutputIterator d_out,
303  int num_items,
304  cudaStream_t stream = 0,
305  bool debug_synchronous = false)
306  {
307  // Signed integer type for global offsets
308  typedef int Offset;
309 
310  // Dispatch type
311  typedef DeviceReduceDispatch<InputIterator, OutputIterator, Offset, cub::Min> DeviceReduceDispatch;
312 
313  return DeviceReduceDispatch::Dispatch(
314  d_temp_storage,
315  temp_storage_bytes,
316  d_in,
317  d_out,
318  num_items,
319  cub::Min(),
320  stream,
321  debug_synchronous);
322  }
323 
324 
371  template <
372  typename InputIterator,
373  typename OutputIterator>
374  CUB_RUNTIME_FUNCTION
375  static cudaError_t ArgMin(
376  void *d_temp_storage,
377  size_t &temp_storage_bytes,
378  InputIterator d_in,
379  OutputIterator d_out,
380  int num_items,
381  cudaStream_t stream = 0,
382  bool debug_synchronous = false)
383  {
384  // Signed integer type for global offsets
385  typedef int Offset;
386 
387  // Wrapped input iterator
389  ArgIndexInputIterator d_argmin_in(d_in, 0);
390 
391  // Dispatch type
392  typedef DeviceReduceDispatch<ArgIndexInputIterator, OutputIterator, Offset, cub::ArgMin> DeviceReduceDispatch;
393 
394  return DeviceReduceDispatch::Dispatch(
395  d_temp_storage,
396  temp_storage_bytes,
397  d_argmin_in,
398  d_out,
399  num_items,
400  cub::ArgMin(),
401  stream,
402  debug_synchronous);
403  }
404 
405 
447  template <
448  typename InputIterator,
449  typename OutputIterator>
450  CUB_RUNTIME_FUNCTION
451  static cudaError_t Max(
452  void *d_temp_storage,
453  size_t &temp_storage_bytes,
454  InputIterator d_in,
455  OutputIterator d_out,
456  int num_items,
457  cudaStream_t stream = 0,
458  bool debug_synchronous = false)
459  {
460  // Signed integer type for global offsets
461  typedef int Offset;
462 
463  // Dispatch type
464  typedef DeviceReduceDispatch<InputIterator, OutputIterator, Offset, cub::Max> DeviceReduceDispatch;
465 
466  return DeviceReduceDispatch::Dispatch(
467  d_temp_storage,
468  temp_storage_bytes,
469  d_in,
470  d_out,
471  num_items,
472  cub::Max(),
473  stream,
474  debug_synchronous);
475  }
476 
477 
524  template <
525  typename InputIterator,
526  typename OutputIterator>
527  CUB_RUNTIME_FUNCTION
528  static cudaError_t ArgMax(
529  void *d_temp_storage,
530  size_t &temp_storage_bytes,
531  InputIterator d_in,
532  OutputIterator d_out,
533  int num_items,
534  cudaStream_t stream = 0,
535  bool debug_synchronous = false)
536  {
537  // Signed integer type for global offsets
538  typedef int Offset;
539 
540  // Wrapped input iterator
542  ArgIndexInputIterator d_argmax_in(d_in, 0);
543 
544  // Dispatch type
545  typedef DeviceReduceDispatch<ArgIndexInputIterator, OutputIterator, Offset, cub::ArgMax> DeviceReduceDispatch;
546 
547  return DeviceReduceDispatch::Dispatch(
548  d_temp_storage,
549  temp_storage_bytes,
550  d_argmax_in,
551  d_out,
552  num_items,
553  cub::ArgMax(),
554  stream,
555  debug_synchronous);
556  }
557 
558 
640  template <
641  typename KeyInputIterator,
642  typename KeyOutputIterator,
643  typename ValueInputIterator,
644  typename ValueOutputIterator,
645  typename NumSegmentsIterator,
646  typename ReductionOp>
647  CUB_RUNTIME_FUNCTION __forceinline__
648  static cudaError_t ReduceByKey(
649  void *d_temp_storage,
650  size_t &temp_storage_bytes,
651  KeyInputIterator d_keys_in,
652  KeyOutputIterator d_keys_out,
653  ValueInputIterator d_values_in,
654  ValueOutputIterator d_values_out,
655  NumSegmentsIterator d_num_segments,
656  ReductionOp reduction_op,
657  int num_items,
658  cudaStream_t stream = 0,
659  bool debug_synchronous = false)
660  {
661  typedef int Offset; // Signed integer type for global offsets
662  typedef NullType* FlagIterator; // Flag iterator type (not used)
663  typedef NullType SelectOp; // Selection op (not used)
664  typedef Equality EqualityOp; // Default == operator
665 
666  return DeviceReduceByKeyDispatch<KeyInputIterator, KeyOutputIterator, ValueInputIterator, ValueOutputIterator, NumSegmentsIterator, EqualityOp, ReductionOp, Offset>::Dispatch(
667  d_temp_storage,
668  temp_storage_bytes,
669  d_keys_in,
670  d_keys_out,
671  d_values_in,
672  d_values_out,
673  d_num_segments,
674  EqualityOp(),
675  reduction_op,
676  num_items,
677  stream,
678  debug_synchronous);
679  }
680 
681 
748  template <
749  typename InputIterator,
750  typename OutputIterator,
751  typename CountsOutputIterator,
752  typename NumSegmentsIterator>
753  CUB_RUNTIME_FUNCTION __forceinline__
754  static cudaError_t RunLengthEncode(
755  void *d_temp_storage,
756  size_t &temp_storage_bytes,
757  InputIterator d_in,
758  OutputIterator d_compacted_out,
759  CountsOutputIterator d_counts_out,
760  NumSegmentsIterator d_num_segments,
761  int num_items,
762  cudaStream_t stream = 0,
763  bool debug_synchronous = false)
764  {
765  // Data type of value iterator
766  typedef typename std::iterator_traits<CountsOutputIterator>::value_type Value;
767 
768  typedef int Offset; // Signed integer type for global offsets
769  typedef NullType* FlagIterator; // Flag iterator type (not used)
770  typedef NullType SelectOp; // Selection op (not used)
771  typedef Equality EqualityOp; // Default == operator
772  typedef cub::Sum ReductionOp; // Value reduction operator
773 
774  // Generator type for providing 1s values for run-length reduction
775  typedef ConstantInputIterator<Value, Offset> CountsInputIterator;
776 
777  Value one_val;
778  one_val = 1;
779 
780  return DeviceReduceByKeyDispatch<InputIterator, OutputIterator, CountsInputIterator, CountsOutputIterator, NumSegmentsIterator, EqualityOp, ReductionOp, Offset>::Dispatch(
781  d_temp_storage,
782  temp_storage_bytes,
783  d_in,
784  d_compacted_out,
785  CountsInputIterator(one_val),
786  d_counts_out,
787  d_num_segments,
788  EqualityOp(),
789  ReductionOp(),
790  num_items,
791  stream,
792  debug_synchronous);
793  }
794 
795 };
796 
801 } // CUB namespace
802 CUB_NS_POSTFIX // Optional outer namespace(s)
803 
804