CUB
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Groups
util_ptx.cuh
Go to the documentation of this file.
1 /******************************************************************************
2  * Copyright (c) 2011, Duane Merrill. All rights reserved.
3  * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of the NVIDIA CORPORATION nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  *
27  ******************************************************************************/
28 
35 #pragma once
36 
37 #include "util_type.cuh"
38 #include "util_arch.cuh"
39 #include "util_namespace.cuh"
40 
42 CUB_NS_PREFIX
43 
45 namespace cub {
46 
47 
54 /******************************************************************************
55  * PTX helper macros
56  ******************************************************************************/
57 
58 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
59 
63 #if defined(_WIN64) || defined(__LP64__)
64  #define __CUB_LP64__ 1
65  // 64-bit register modifier for inlined asm
66  #define _CUB_ASM_PTR_ "l"
67  #define _CUB_ASM_PTR_SIZE_ "u64"
68 #else
69  #define __CUB_LP64__ 0
70  // 32-bit register modifier for inlined asm
71  #define _CUB_ASM_PTR_ "r"
72  #define _CUB_ASM_PTR_SIZE_ "u32"
73 #endif
74 
75 #endif // DOXYGEN_SHOULD_SKIP_THIS
76 
77 
78 /******************************************************************************
79  * Inlined PTX intrinsics
80  ******************************************************************************/
81 
/**
 * \brief Shift-right then add.  Returns (\p x >> \p shift) + \p addend.
 */
__device__ __forceinline__ unsigned int SHR_ADD(
    unsigned int x,
    unsigned int shift,
    unsigned int addend)
{
    unsigned int ret;
#if CUB_PTX_ARCH >= 200
    // SM20+: one fused video instruction instead of separate shift and add
    asm("vshr.u32.u32.u32.clamp.add %0, %1, %2, %3;" :
        "=r"(ret) : "r"(x), "r"(shift), "r"(addend));
#else
    ret = (x >> shift) + addend;
#endif
    return ret;
}
99 
100 
/**
 * \brief Shift-left then add.  Returns (\p x << \p shift) + \p addend.
 */
__device__ __forceinline__ unsigned int SHL_ADD(
    unsigned int x,
    unsigned int shift,
    unsigned int addend)
{
    unsigned int ret;
#if CUB_PTX_ARCH >= 200
    // SM20+: one fused video instruction instead of separate shift and add
    asm("vshl.u32.u32.u32.clamp.add %0, %1, %2, %3;" :
        "=r"(ret) : "r"(x), "r"(shift), "r"(addend));
#else
    ret = (x << shift) + addend;
#endif
    return ret;
}
118 
119 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
120 
/**
 * Bitfield-extract implementation for source types of 1, 2, or 4 bytes
 * (selected by the \p byte_len dispatch tag, which carries
 * sizeof(UnsignedBits)).  Returns the \p num_bits bits of \p source
 * beginning at \p bit_start, right-justified in the result.
 */
template <typename UnsignedBits, int BYTE_LEN>
__device__ __forceinline__ unsigned int BFE(
    UnsignedBits source,
    unsigned int bit_start,
    unsigned int num_bits,
    Int2Type<BYTE_LEN> byte_len)
{
    unsigned int bits;
#if CUB_PTX_ARCH >= 200
    // Single bfe.u32 instruction on SM20+
    asm("bfe.u32 %0, %1, %2, %3;" : "=r"(bits) : "r"((unsigned int) source), "r"(bit_start), "r"(num_bits));
#else
    // NOTE(review): (1 << num_bits) is undefined for num_bits == 32;
    // presumably callers always use field widths < 32 here — confirm.
    const unsigned int MASK = (1 << num_bits) - 1;
    bits = (source >> bit_start) & MASK;
#endif
    return bits;
}
140 
141 
/**
 * Bitfield-extract implementation specialized for 8-byte source types.
 * Returns the \p num_bits bits of \p source beginning at \p bit_start,
 * right-justified in the result.
 */
template <typename UnsignedBits>
__device__ __forceinline__ unsigned int BFE(
    UnsignedBits source,
    unsigned int bit_start,
    unsigned int num_bits,
    Int2Type<8> byte_len)
{
    // Shift the requested field down to bit 0, then mask off everything above it
    unsigned long long shifted    = source >> bit_start;
    unsigned long long field_mask = (1ull << num_bits) - 1;
    return (unsigned int) (shifted & field_mask);
}
155 
156 #endif // DOXYGEN_SHOULD_SKIP_THIS
157 
/**
 * \brief Bitfield-extract.  Extracts \p num_bits bits of \p source starting
 * at \p bit_start, returning them right-justified.  Dispatches on
 * sizeof(UnsignedBits) to a 32-bit or 64-bit implementation.
 */
template <typename UnsignedBits>
__device__ __forceinline__ unsigned int BFE(
    UnsignedBits source,
    unsigned int bit_start,
    unsigned int num_bits)
{
    return BFE(source, bit_start, num_bits, Int2Type<sizeof(UnsignedBits)>());
}
169 
170 
/**
 * \brief Bitfield-insert.  Inserts the \p num_bits least-significant bits of
 * \p y into \p x at bit-offset \p bit_start, placing the composed word in
 * \p ret.
 */
__device__ __forceinline__ void BFI(
    unsigned int &ret,          // [out] composed word
    unsigned int x,             // background word
    unsigned int y,             // word supplying the inserted field
    unsigned int bit_start,     // bit offset of the field
    unsigned int num_bits)      // width of the field
{
#if CUB_PTX_ARCH >= 200
    // bfi.b32 d, a, b, start, len inserts the field from operand a (here y)
    // into operand b (here x)
    asm("bfi.b32 %0, %1, %2, %3, %4;" :
        "=r"(ret) : "r"(y), "r"(x), "r"(bit_start), "r"(num_bits));
#else
    // Fixed: the previous fallback inserted x's bits into y — the opposite of
    // the bfi.b32 path above.  Field bits come from y; all other bits from x.
    y <<= bit_start;
    unsigned int MASK = ((1 << num_bits) - 1) << bit_start;
    ret = (x & ~MASK) | (y & MASK);
#endif
}
191 
192 
/**
 * \brief Three-operand add.  Returns \p x + \p y + \p z.
 */
__device__ __forceinline__ unsigned int IADD3(unsigned int x, unsigned int y, unsigned int z)
{
#if CUB_PTX_ARCH >= 200
    // SM20+: single fused video add
    asm("vadd.u32.u32.u32.add %0, %1, %2, %3;" : "=r"(x) : "r"(x), "r"(y), "r"(z));
#else
    x = x + y + z;
#endif
    return x;
}
205 
206 
/**
 * \brief Byte-permute.  Picks four arbitrary bytes from the eight-byte pair
 * (\p b, \p a), selected by the four 4-bit indices packed in \p index
 * (PTX prmt.b32).
 *
 * NOTE(review): no pre-SM20 fallback here, unlike the intrinsics above —
 * presumably only reached on SM20+ code paths; confirm.
 */
__device__ __forceinline__ int PRMT(unsigned int a, unsigned int b, unsigned int index)
{
    int ret;
    asm("prmt.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(a), "r"(b), "r"(index));
    return ret;
}
239 
240 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
241 
/**
 * Sync the first \p count threads of the thread block at named barrier #1
 * (PTX requires \p count to be a multiple of the warp size).
 */
__device__ __forceinline__ void BAR(int count)
{
    asm volatile("bar.sync 1, %0;" : : "r"(count));
}
249 
250 
/**
 * Floating-point multiply with round-toward-zero rounding.
 * Returns \p a * \p b (mul.rz.f32).
 */
__device__ __forceinline__ float FMUL_RZ(float a, float b)
{
    float d;
    asm("mul.rz.f32 %0, %1, %2;" : "=f"(d) : "f"(a), "f"(b));
    return d;
}
260 
261 
/**
 * Fused multiply-add with round-toward-zero rounding.
 * Returns \p a * \p b + \p c (fma.rz.f32).
 */
__device__ __forceinline__ float FFMA_RZ(float a, float b, float c)
{
    float d;
    asm("fma.rz.f32 %0, %1, %2, %3;" : "=f"(d) : "f"(a), "f"(b), "f"(c));
    return d;
}
271 
272 #endif // DOXYGEN_SHOULD_SKIP_THIS
273 
/**
 * \brief Terminates the calling thread (PTX exit instruction).
 */
__device__ __forceinline__ void ThreadExit() {
    asm("exit;");
}
280 
281 
/**
 * \brief Returns the row-major linearization of the calling thread's
 * multidimensional thread-block coordinates.  Terms for collapsed
 * (size-1) block dimensions are skipped entirely.
 */
__device__ __forceinline__ int RowMajorTid(int block_dim_x, int block_dim_y, int block_dim_z)
{
    int tid = threadIdx.x;
    if (block_dim_y != 1)
        tid += threadIdx.y * block_dim_x;            // whole rows below us
    if (block_dim_z != 1)
        tid += threadIdx.z * block_dim_x * block_dim_y;  // whole planes below us
    return tid;
}
291 
292 
/**
 * \brief Returns the warp lane ID of the calling thread
 * (special register %laneid).
 */
__device__ __forceinline__ unsigned int LaneId()
{
    unsigned int ret;
    asm("mov.u32 %0, %laneid;" : "=r"(ret) );
    return ret;
}
302 
303 
/**
 * \brief Returns the warp ID of the calling thread (special register
 * %warpid).  NOTE(review): per the PTX ISA, %warpid is volatile — it need
 * not correspond to a stable zero-based ranking within the block; treat as
 * an identifier only.
 */
__device__ __forceinline__ unsigned int WarpId()
{
    unsigned int ret;
    asm("mov.u32 %0, %warpid;" : "=r"(ret) );
    return ret;
}
313 
/**
 * \brief Returns the warp lane mask of all lanes less than the calling
 * thread's lane (special register %lanemask_lt).
 */
__device__ __forceinline__ unsigned int LaneMaskLt()
{
    unsigned int ret;
    asm("mov.u32 %0, %lanemask_lt;" : "=r"(ret) );
    return ret;
}
323 
/**
 * \brief Returns the warp lane mask of all lanes less than or equal to the
 * calling thread's lane (special register %lanemask_le).
 */
__device__ __forceinline__ unsigned int LaneMaskLe()
{
    unsigned int ret;
    asm("mov.u32 %0, %lanemask_le;" : "=r"(ret) );
    return ret;
}
333 
/**
 * \brief Returns the warp lane mask of all lanes greater than the calling
 * thread's lane (special register %lanemask_gt).
 */
__device__ __forceinline__ unsigned int LaneMaskGt()
{
    unsigned int ret;
    asm("mov.u32 %0, %lanemask_gt;" : "=r"(ret) );
    return ret;
}
343 
/**
 * \brief Returns the warp lane mask of all lanes greater than or equal to
 * the calling thread's lane (special register %lanemask_ge).
 */
__device__ __forceinline__ unsigned int LaneMaskGe()
{
    unsigned int ret;
    asm("mov.u32 %0, %lanemask_ge;" : "=r"(ret) );
    return ret;
}
353  // end group UtilPtx
355 
356 
357 
358 
/**
 * \brief Shuffle-up for arbitrarily-sized types.  Each thread obtains
 * \p input from the thread \p src_offset lanes below it; lanes within
 * \p src_offset of lane 0 keep their own \p input (shfl clamps at the
 * segment floor).
 *
 * T is moved through a sequence of 32-bit shfl.up operations, one per
 * ShuffleWord, so any size of T is supported.  Uses the legacy
 * (pre-Volta, mask-less) shfl instruction.
 */
template <typename T>
__device__ __forceinline__ T ShuffleUp(
    T input,
    int src_offset)
{
    enum
    {
        SHFL_C = 0,     // shfl.up c-operand: clamp segment floor at lane 0
    };

    // Word type through which T is moved (defined by UnitWord<T>)
    typedef typename UnitWord<T>::ShuffleWord ShuffleWord;

    // Number of ShuffleWords needed to cover T (rounded up)
    const int WORDS = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord);
    T output;
    ShuffleWord *output_alias = reinterpret_cast<ShuffleWord *>(&output);
    ShuffleWord *input_alias = reinterpret_cast<ShuffleWord *>(&input);

    // Shuffle each constituent word independently
    #pragma unroll
    for (int WORD = 0; WORD < WORDS; ++WORD)
    {
        unsigned int shuffle_word = input_alias[WORD];
        asm(
            " shfl.up.b32 %0, %1, %2, %3;"
            : "=r"(shuffle_word) : "r"(shuffle_word), "r"(src_offset), "r"(SHFL_C));
        output_alias[WORD] = (ShuffleWord) shuffle_word;
    }

    return output;
}
416 
417 
/**
 * \brief Shuffle-down for arbitrarily-sized types.  Each thread obtains
 * \p input from the thread \p src_offset lanes above it; lanes within
 * \p src_offset of the top lane keep their own \p input (shfl clamps at
 * the segment ceiling).
 *
 * T is moved through a sequence of 32-bit shfl.down operations, one per
 * ShuffleWord, so any size of T is supported.  Uses the legacy
 * (pre-Volta, mask-less) shfl instruction.
 */
template <typename T>
__device__ __forceinline__ T ShuffleDown(
    T input,
    int src_offset)
{
    enum
    {
        SHFL_C = CUB_PTX_WARP_THREADS - 1,   // shfl.down c-operand: clamp at top lane
    };

    // Word type through which T is moved (defined by UnitWord<T>)
    typedef typename UnitWord<T>::ShuffleWord ShuffleWord;

    // Number of ShuffleWords needed to cover T (rounded up)
    const int WORDS = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord);
    T output;
    ShuffleWord *output_alias = reinterpret_cast<ShuffleWord *>(&output);
    ShuffleWord *input_alias = reinterpret_cast<ShuffleWord *>(&input);

    // Shuffle each constituent word independently
    #pragma unroll
    for (int WORD = 0; WORD < WORDS; ++WORD)
    {
        unsigned int shuffle_word = input_alias[WORD];
        asm(
            " shfl.down.b32 %0, %1, %2, %3;"
            : "=r"(shuffle_word) : "r"(shuffle_word), "r"(src_offset), "r"(SHFL_C));
        output_alias[WORD] = (ShuffleWord) shuffle_word;
    }

    return output;
}
475 
476 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
477 
/**
 * Broadcast \p input from lane \p src_lane to every thread in the logical
 * warp, word-by-word via shfl.idx.  The c-operand encodes
 * \p logical_warp_threads - 1, so logical_warp_threads is presumably a
 * power of two no larger than the hardware warp width — confirm at call
 * sites.  Uses the legacy (pre-Volta, mask-less) shfl instruction.
 */
template <typename T>
__device__ __forceinline__ T ShuffleBroadcast(
    T input,
    int src_lane,
    int logical_warp_threads)
{
    // Word type through which T is moved (defined by UnitWord<T>)
    typedef typename UnitWord<T>::ShuffleWord ShuffleWord;

    // Number of ShuffleWords needed to cover T (rounded up)
    const int WORDS = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord);
    T output;
    ShuffleWord *output_alias = reinterpret_cast<ShuffleWord *>(&output);
    ShuffleWord *input_alias = reinterpret_cast<ShuffleWord *>(&input);

    // Shuffle each constituent word independently
    #pragma unroll
    for (int WORD = 0; WORD < WORDS; ++WORD)
    {
        unsigned int shuffle_word = input_alias[WORD];
        asm("shfl.idx.b32 %0, %1, %2, %3;"
            : "=r"(shuffle_word) : "r"(shuffle_word), "r"(src_lane), "r"(logical_warp_threads - 1));
        output_alias[WORD] = (ShuffleWord) shuffle_word;
    }

    return output;
}
509 
510 #endif // DOXYGEN_SHOULD_SKIP_THIS
511 
512 
/**
 * \brief Broadcast \p input from lane \p src_lane to every thread in the
 * (full hardware) warp.
 */
template <typename T>
__device__ __forceinline__ T ShuffleBroadcast(
    T input,
    int src_lane)
{
    return ShuffleBroadcast(input, src_lane, CUB_PTX_WARP_THREADS);
}
548 
549 
550 
551 
552 
/**
 * \brief Warp vote "all": returns nonzero only if \p cond is nonzero for
 * every thread in the warp.
 */
__device__ __forceinline__ int WarpAll(int cond)
{
#if CUB_PTX_ARCH < 120

    // Pre-SM12 has no __all(): emulate with one shared flag per warp.
    // No explicit sync between the writes and the read — relies on
    // warp-synchronous execution on these architectures.
    __shared__ volatile int warp_signals[CUB_PTX_MAX_SM_THREADS / CUB_PTX_WARP_THREADS];

    // Lane 0 optimistically sets the warp's flag ...
    if (LaneId() == 0)
        warp_signals[WarpId()] = 1;

    // ... then any lane with a false predicate clears it
    if (cond == 0)
        warp_signals[WarpId()] = 0;

    return warp_signals[WarpId()];

#else

    // NOTE(review): legacy mask-less __all(); fine for the pre-Volta
    // toolchains this header targets, but would need __all_sync on SM70+.
    return __all(cond);

#endif
}
577 
578 
/**
 * \brief Warp vote "any": returns nonzero if \p cond is nonzero for at
 * least one thread in the warp.
 */
__device__ __forceinline__ int WarpAny(int cond)
{
#if CUB_PTX_ARCH < 120

    // Pre-SM12 has no __any(): emulate with one shared flag per warp.
    // No explicit sync between the writes and the read — relies on
    // warp-synchronous execution on these architectures.
    __shared__ volatile int warp_signals[CUB_PTX_MAX_SM_THREADS / CUB_PTX_WARP_THREADS];

    // Lane 0 clears the warp's flag ...
    if (LaneId() == 0)
        warp_signals[WarpId()] = 0;

    // ... then any lane with a true predicate sets it
    if (cond)
        warp_signals[WarpId()] = 1;

    return warp_signals[WarpId()];

#else

    // NOTE(review): legacy mask-less __any(); fine for the pre-Volta
    // toolchains this header targets, but would need __any_sync on SM70+.
    return __any(cond);

#endif
}
603 
604 
605 } // CUB namespace
606 CUB_NS_POSTFIX // Optional outer namespace(s)