CUB
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Groups
thread_store.cuh
Go to the documentation of this file.
1 /******************************************************************************
2  * Copyright (c) 2011, Duane Merrill. All rights reserved.
3  * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of the NVIDIA CORPORATION nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  *
27  ******************************************************************************/
28 
34 #pragma once
35 
36 #include <cuda.h>
37 
38 #include "../util_ptx.cuh"
39 #include "../util_type.cuh"
40 #include "../util_namespace.cuh"
41 
43 CUB_NS_PREFIX
44 
46 namespace cub {
47 
54 //-----------------------------------------------------------------------------
55 // Tags and constants
56 //-----------------------------------------------------------------------------
57 
/**
 * Enumeration of cache modifiers for memory store operations.
 *
 * NOTE(review): the enum header and enumerator list were missing from this
 * copy of the file (only the braces survived).  The type name and the six
 * enumerators are reconstructed from their uses elsewhere in this file;
 * the enumerator ORDER is an assumption -- confirm against upstream CUB.
 */
enum CacheStoreModifier
{
    STORE_DEFAULT,      // No modifier: plain dereference-and-assign
    STORE_WB,           // Specialized through CUB_STORE_ALL (PTX write-back)
    STORE_CG,           // Specialized through CUB_STORE_ALL (PTX cache-global)
    STORE_CS,           // Specialized through CUB_STORE_ALL (PTX cache-streaming)
    STORE_WT,           // Specialized through CUB_STORE_ALL (PTX write-through)
    STORE_VOLATILE      // Store through a volatile-qualified pointer
};
70 
71 
110 template <
111  CacheStoreModifier MODIFIER,
112  typename OutputIterator,
113  typename T>
114 __device__ __forceinline__ void ThreadStore(OutputIterator itr, T val);
115 
116 
118 
119 
120 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
121 
122 
124 template <int COUNT, int MAX>
125 struct IterateThreadStore
126 {
127  template <CacheStoreModifier MODIFIER, typename T>
128  static __device__ __forceinline__ void Store(T *ptr, T *vals)
129  {
130  ThreadStore<MODIFIER>(ptr + COUNT, vals[COUNT]);
131  IterateThreadStore<COUNT + 1, MAX>::template Store<MODIFIER>(ptr, vals);
132  }
133 
134  template <typename OutputIterator, typename T>
135  static __device__ __forceinline__ void Dereference(OutputIterator ptr, T *vals)
136  {
137  ptr[COUNT] = vals[COUNT];
138  IterateThreadStore<COUNT + 1, MAX>::Dereference(ptr, vals);
139  }
140 
141 };
142 
144 template <int MAX>
145 struct IterateThreadStore<MAX, MAX>
146 {
147  template <CacheStoreModifier MODIFIER, typename T>
148  static __device__ __forceinline__ void Store(T *ptr, T *vals) {}
149 
150  template <typename OutputIterator, typename T>
151  static __device__ __forceinline__ void Dereference(OutputIterator ptr, T *vals) {}
152 };
153 
154 
/**
 * Defines the ThreadStore<cub_modifier, ...> explicit specializations for the
 * 16-byte vector types uint4 and ulonglong2.  Each one issues a single
 * vectorized PTX "st" instruction whose qualifier is spelled by ptx_modifier.
 */
#define CUB_STORE_16(cub_modifier, ptx_modifier) \
    template<> \
    __device__ __forceinline__ void ThreadStore<cub_modifier, uint4*, uint4>(uint4* ptr, uint4 val) \
    { \
        asm volatile ( \
            "st."#ptx_modifier".v4.u32 [%0], {%1, %2, %3, %4};" \
            : /* no outputs */ \
            : _CUB_ASM_PTR_(ptr), "r"(val.x), "r"(val.y), "r"(val.z), "r"(val.w)); \
    } \
    template<> \
    __device__ __forceinline__ void ThreadStore<cub_modifier, ulonglong2*, ulonglong2>(ulonglong2* ptr, ulonglong2 val) \
    { \
        asm volatile ( \
            "st."#ptx_modifier".v2.u64 [%0], {%1, %2};" \
            : /* no outputs */ \
            : _CUB_ASM_PTR_(ptr), "l"(val.x), "l"(val.y)); \
    }
177 
178 
/**
 * Defines the ThreadStore<cub_modifier, ...> explicit specializations for the
 * 8-byte types ushort4, uint2 and unsigned long long.  Each one issues a
 * single PTX "st" instruction whose qualifier is spelled by ptx_modifier.
 */
#define CUB_STORE_8(cub_modifier, ptx_modifier) \
    template<> \
    __device__ __forceinline__ void ThreadStore<cub_modifier, ushort4*, ushort4>(ushort4* ptr, ushort4 val) \
    { \
        asm volatile ( \
            "st."#ptx_modifier".v4.u16 [%0], {%1, %2, %3, %4};" \
            : /* no outputs */ \
            : _CUB_ASM_PTR_(ptr), "h"(val.x), "h"(val.y), "h"(val.z), "h"(val.w)); \
    } \
    template<> \
    __device__ __forceinline__ void ThreadStore<cub_modifier, uint2*, uint2>(uint2* ptr, uint2 val) \
    { \
        asm volatile ( \
            "st."#ptx_modifier".v2.u32 [%0], {%1, %2};" \
            : /* no outputs */ \
            : _CUB_ASM_PTR_(ptr), "r"(val.x), "r"(val.y)); \
    } \
    template<> \
    __device__ __forceinline__ void ThreadStore<cub_modifier, unsigned long long*, unsigned long long>(unsigned long long* ptr, unsigned long long val) \
    { \
        asm volatile ( \
            "st."#ptx_modifier".u64 [%0], %1;" \
            : /* no outputs */ \
            : _CUB_ASM_PTR_(ptr), "l"(val)); \
    }
208 
/**
 * Defines the ThreadStore<cub_modifier, unsigned int*, unsigned int> explicit
 * specialization: one 32-bit PTX "st" with the ptx_modifier qualifier.
 */
#define CUB_STORE_4(cub_modifier, ptx_modifier) \
    template<> \
    __device__ __forceinline__ void ThreadStore<cub_modifier, unsigned int*, unsigned int>(unsigned int* ptr, unsigned int val) \
    { \
        asm volatile ( \
            "st."#ptx_modifier".u32 [%0], %1;" \
            : /* no outputs */ \
            : _CUB_ASM_PTR_(ptr), "r"(val)); \
    }
220 
221 
/**
 * Defines the ThreadStore<cub_modifier, unsigned short*, unsigned short>
 * explicit specialization: one 16-bit PTX "st" with the ptx_modifier qualifier.
 */
#define CUB_STORE_2(cub_modifier, ptx_modifier) \
    template<> \
    __device__ __forceinline__ void ThreadStore<cub_modifier, unsigned short*, unsigned short>(unsigned short* ptr, unsigned short val) \
    { \
        asm volatile ( \
            "st."#ptx_modifier".u16 [%0], %1;" \
            : /* no outputs */ \
            : _CUB_ASM_PTR_(ptr), "h"(val)); \
    }
233 
234 
/**
 * Defines the ThreadStore<cub_modifier, unsigned char*, unsigned char>
 * explicit specialization.  The byte is passed to the asm block widened to
 * unsigned short (operand constraint "h"), converted into a scoped .u8
 * register with cvt.u8.u16, and that register is then stored with the
 * ptx_modifier qualifier.
 */
#define CUB_STORE_1(cub_modifier, ptx_modifier) \
    template<> \
    __device__ __forceinline__ void ThreadStore<cub_modifier, unsigned char*, unsigned char>(unsigned char* ptr, unsigned char val) \
    { \
        asm volatile ( \
            "{" \
            " .reg .u8 datum;" \
            " cvt.u8.u16 datum, %1;" \
            " st."#ptx_modifier".u8 [%0], datum;" \
            "}" \
            : /* no outputs */ \
            : _CUB_ASM_PTR_(ptr), "h"((unsigned short) val)); \
    }
251 
/**
 * Defines every ThreadStore explicit specialization (16-, 8-, 4-, 2- and
 * 1-byte variants) for one (cub_modifier, ptx_modifier) pairing.
 *
 * The final line deliberately carries no line-continuation backslash, so the
 * macro body ends here and cannot absorb whatever line follows it.
 */
#define CUB_STORE_ALL(cub_modifier, ptx_modifier) \
    CUB_STORE_16(cub_modifier, ptx_modifier) \
    CUB_STORE_8(cub_modifier, ptx_modifier) \
    CUB_STORE_4(cub_modifier, ptx_modifier) \
    CUB_STORE_2(cub_modifier, ptx_modifier) \
    CUB_STORE_1(cub_modifier, ptx_modifier)
261 
262 
266 #if CUB_PTX_ARCH >= 200
267  CUB_STORE_ALL(STORE_WB, ca)
268  CUB_STORE_ALL(STORE_CG, cg)
269  CUB_STORE_ALL(STORE_CS, cs)
270  CUB_STORE_ALL(STORE_WT, wt)
271 #else
272  CUB_STORE_ALL(STORE_WB, global)
273  CUB_STORE_ALL(STORE_CG, global)
274  CUB_STORE_ALL(STORE_CS, global)
275  CUB_STORE_ALL(STORE_WT, volatile.global)
276 #endif
277 
278 
282 template <typename OutputIterator, typename T>
283 __device__ __forceinline__ void ThreadStore(
284  OutputIterator itr,
285  T val,
286  Int2Type<STORE_DEFAULT> modifier,
287  Int2Type<false> is_pointer)
288 {
289  *itr = val;
290 }
291 
292 
296 template <typename T>
297 __device__ __forceinline__ void ThreadStore(
298  T *ptr,
299  T val,
300  Int2Type<STORE_DEFAULT> modifier,
301  Int2Type<true> is_pointer)
302 {
303  *ptr = val;
304 }
305 
306 
310 template <typename T>
311 __device__ __forceinline__ void ThreadStoreVolatilePtr(
312  T *ptr,
313  T val,
314  Int2Type<true> is_primitive)
315 {
316  *reinterpret_cast<volatile T*>(ptr) = val;
317 }
318 
319 
323 template <typename T>
324 __device__ __forceinline__ void ThreadStoreVolatilePtr(
325  T *ptr,
326  T val,
327  Int2Type<false> is_primitive)
328 {
329 #if CUB_PTX_ARCH <= 130
330 
331  *ptr = val;
332  __threadfence_block();
333 
334 #else
335 
336  typedef typename UnitWord<T>::VolatileWord VolatileWord; // Word type for memcopying
337 
338  const int VOLATILE_MULTIPLE = sizeof(T) / sizeof(VolatileWord);
339 
340  VolatileWord words[VOLATILE_MULTIPLE];
341  *reinterpret_cast<T*>(words) = val;
342 
343 // VolatileWord *words = reinterpret_cast<VolatileWord*>(&val);
344 
345  IterateThreadStore<0, VOLATILE_MULTIPLE>::template Dereference(
346  reinterpret_cast<volatile VolatileWord*>(ptr),
347  words);
348 
349 #endif // CUB_PTX_ARCH <= 130
350 
351 }
352 
353 
357 template <typename T>
358 __device__ __forceinline__ void ThreadStore(
359  T *ptr,
360  T val,
361  Int2Type<STORE_VOLATILE> modifier,
362  Int2Type<true> is_pointer)
363 {
364  ThreadStoreVolatilePtr(ptr, val, Int2Type<Traits<T>::PRIMITIVE>());
365 }
366 
367 
371 template <typename T, int MODIFIER>
372 __device__ __forceinline__ void ThreadStore(
373  T *ptr,
374  T val,
375  Int2Type<MODIFIER> modifier,
376  Int2Type<true> is_pointer)
377 {
378  typedef typename UnitWord<T>::DeviceWord DeviceWord; // Word type for memcopying
379 
380  const int DEVICE_MULTIPLE = sizeof(T) / sizeof(DeviceWord);
381 
382  DeviceWord words[DEVICE_MULTIPLE];
383 
384  *reinterpret_cast<T*>(words) = val;
385 
386  IterateThreadStore<0, DEVICE_MULTIPLE>::template Store<CacheStoreModifier(MODIFIER)>(
387  reinterpret_cast<DeviceWord*>(ptr),
388  words);
389 }
390 
391 
395 template <CacheStoreModifier MODIFIER, typename OutputIterator, typename T>
396 __device__ __forceinline__ void ThreadStore(OutputIterator itr, T val)
397 {
398  ThreadStore(
399  itr,
400  val,
401  Int2Type<MODIFIER>(),
402  Int2Type<IsPointer<OutputIterator>::VALUE>());
403 }
404 
405 
406 
407 #endif // DOXYGEN_SHOULD_SKIP_THIS
408 
409  // end group UtilIo
411 
412 
413 } // CUB namespace
414 CUB_NS_POSTFIX // Optional outer namespace(s)