38 #include "../util_ptx.cuh"
39 #include "../util_type.cuh"
40 #include "../util_namespace.cuh"
// Forward declaration of the device-side ThreadStore(itr, val) function template.
// Helpers further down (e.g. IterateThreadStore::Store) call ThreadStore<MODIFIER>
// before its definition, which appears near the end of this file.
// NOTE(review): only part of the template parameter list is visible in this
// excerpt; the definition later in the file uses
// <CacheStoreModifier MODIFIER, typename OutputIterator, typename T> -- confirm.
112 typename OutputIterator,
114 __device__ __forceinline__
void ThreadStore(OutputIterator itr, T val);
120 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
// Compile-time recursion helper: each instantiation handles element COUNT and
// then delegates to IterateThreadStore<COUNT + 1, MAX>. The <MAX, MAX>
// specialization that follows supplies empty bodies, which ends the recursion.
124 template <
int COUNT,
int MAX>
125 struct IterateThreadStore
// Store: writes vals[COUNT] to address (ptr + COUNT) through
// ThreadStore<MODIFIER>, then recurses for the next index.
127 template <CacheStoreModifier MODIFIER,
typename T>
128 static __device__ __forceinline__
void Store(T *ptr, T *vals)
130 ThreadStore<MODIFIER>(ptr + COUNT, vals[COUNT]);
131 IterateThreadStore<COUNT + 1, MAX>::template Store<MODIFIER>(ptr, vals);
// Dereference: plain indexed assignment ptr[COUNT] = vals[COUNT] (no cache
// modifier involved), then recurses for the next index. OutputIterator only
// needs to support operator[] with an assignable result here.
134 template <
typename OutputIterator,
typename T>
135 static __device__ __forceinline__
void Dereference(OutputIterator ptr, T *vals)
137 ptr[COUNT] = vals[COUNT];
138 IterateThreadStore<COUNT + 1, MAX>::Dereference(ptr, vals);
// Terminating specialization of IterateThreadStore for COUNT == MAX: both
// Store and Dereference have empty bodies, so the recursion started by the
// primary template stops here without touching memory.
// NOTE(review): the `template <int MAX>` header of this partial specialization
// is not visible in this excerpt -- presumably on a preceding line.
145 struct IterateThreadStore<MAX, MAX>
// No-op Store (end of recursion).
147 template <CacheStoreModifier MODIFIER,
typename T>
148 static __device__ __forceinline__
void Store(T *ptr, T *vals) {}
// No-op Dereference (end of recursion).
150 template <
typename OutputIterator,
typename T>
151 static __device__ __forceinline__
void Dereference(OutputIterator ptr, T *vals) {}
// CUB_STORE_16(cub_modifier, ptx_modifier): defines ThreadStore specializations
// for the given cub_modifier on uint4* and ulonglong2* destinations. Each one
// issues a single inline-PTX vector store, "st.<ptx_modifier>.v4.u32" and
// "st.<ptx_modifier>.v2.u64" respectively; ptx_modifier is stringized into the
// instruction text and the address operand is passed via _CUB_ASM_PTR_(ptr).
// NOTE(review): the value-operand lines and closing braces of the macro body
// are not visible in this excerpt.
158 #define CUB_STORE_16(cub_modifier, ptx_modifier) \
160 __device__ __forceinline__ void ThreadStore<cub_modifier, uint4*, uint4>(uint4* ptr, uint4 val) \
162 asm volatile ("st."#ptx_modifier".v4.u32 [%0], {%1, %2, %3, %4};" : : \
163 _CUB_ASM_PTR_(ptr), \
170 __device__ __forceinline__ void ThreadStore<cub_modifier, ulonglong2*, ulonglong2>(ulonglong2* ptr, ulonglong2 val) \
172 asm volatile ("st."#ptx_modifier".v2.u64 [%0], {%1, %2};" : : \
173 _CUB_ASM_PTR_(ptr), \
// CUB_STORE_8(cub_modifier, ptx_modifier): defines ThreadStore specializations
// for the given cub_modifier on ushort4*, uint2* and unsigned long long*
// destinations, using the inline-PTX stores "st.<ptx_modifier>.v4.u16",
// "st.<ptx_modifier>.v2.u32" and "st.<ptx_modifier>.u64" respectively.
// NOTE(review): the value-operand lines and closing braces of the macro body
// are not visible in this excerpt.
182 #define CUB_STORE_8(cub_modifier, ptx_modifier) \
184 __device__ __forceinline__ void ThreadStore<cub_modifier, ushort4*, ushort4>(ushort4* ptr, ushort4 val) \
186 asm volatile ("st."#ptx_modifier".v4.u16 [%0], {%1, %2, %3, %4};" : : \
187 _CUB_ASM_PTR_(ptr), \
194 __device__ __forceinline__ void ThreadStore<cub_modifier, uint2*, uint2>(uint2* ptr, uint2 val) \
196 asm volatile ("st."#ptx_modifier".v2.u32 [%0], {%1, %2};" : : \
197 _CUB_ASM_PTR_(ptr), \
202 __device__ __forceinline__ void ThreadStore<cub_modifier, unsigned long long*, unsigned long long>(unsigned long long* ptr, unsigned long long val) \
204 asm volatile ("st."#ptx_modifier".u64 [%0], %1;" : : \
205 _CUB_ASM_PTR_(ptr), \
// CUB_STORE_4(cub_modifier, ptx_modifier): defines the ThreadStore
// specialization for the given cub_modifier on unsigned int* destinations,
// implemented as the inline-PTX store "st.<ptx_modifier>.u32".
// NOTE(review): the value-operand line and closing brace of the macro body
// are not visible in this excerpt.
212 #define CUB_STORE_4(cub_modifier, ptx_modifier) \
214 __device__ __forceinline__ void ThreadStore<cub_modifier, unsigned int*, unsigned int>(unsigned int* ptr, unsigned int val) \
216 asm volatile ("st."#ptx_modifier".u32 [%0], %1;" : : \
217 _CUB_ASM_PTR_(ptr), \
// CUB_STORE_2(cub_modifier, ptx_modifier): defines the ThreadStore
// specialization for the given cub_modifier on unsigned short* destinations,
// implemented as the inline-PTX store "st.<ptx_modifier>.u16".
// NOTE(review): the value-operand line and closing brace of the macro body
// are not visible in this excerpt.
225 #define CUB_STORE_2(cub_modifier, ptx_modifier) \
227 __device__ __forceinline__ void ThreadStore<cub_modifier, unsigned short*, unsigned short>(unsigned short* ptr, unsigned short val) \
229 asm volatile ("st."#ptx_modifier".u16 [%0], %1;" : : \
230 _CUB_ASM_PTR_(ptr), \
// CUB_STORE_1(cub_modifier, ptx_modifier): defines the ThreadStore
// specialization for the given cub_modifier on unsigned char* destinations.
// The value is handed to the asm block widened to unsigned short (operand
// constraint "h"), narrowed inside the PTX with "cvt.u8.u16" into a temporary
// named `datum`, and then written with "st.<ptx_modifier>.u8".
// NOTE(review): the asm opening (including the declaration of `datum`) and the
// closing lines of the macro body are not visible in this excerpt.
238 #define CUB_STORE_1(cub_modifier, ptx_modifier) \
240 __device__ __forceinline__ void ThreadStore<cub_modifier, unsigned char*, unsigned char>(unsigned char* ptr, unsigned char val) \
245 " cvt.u8.u16 datum, %1;" \
246 " st."#ptx_modifier".u8 [%0], datum;" \
248 _CUB_ASM_PTR_(ptr), \
249 "h"((unsigned short) val)); \
// CUB_STORE_ALL(cub_modifier, ptx_modifier): convenience macro that expands
// CUB_STORE_16, CUB_STORE_8, CUB_STORE_4, CUB_STORE_2 and CUB_STORE_1 with the
// same (cub_modifier, ptx_modifier) pair, i.e. it emits every ThreadStore
// specialization defined by those macros for one cache modifier.
255 #define CUB_STORE_ALL(cub_modifier, ptx_modifier) \
256 CUB_STORE_16(cub_modifier, ptx_modifier) \
257 CUB_STORE_8(cub_modifier, ptx_modifier) \
258 CUB_STORE_4(cub_modifier, ptx_modifier) \
259 CUB_STORE_2(cub_modifier, ptx_modifier) \
260 CUB_STORE_1(cub_modifier, ptx_modifier) \
266 #if CUB_PTX_ARCH >= 200
// Expansions of CUB_STORE_ALL that generate the ThreadStore specializations for
// STORE_CG, STORE_CS and STORE_WT. As written here, STORE_CG and STORE_CS map
// to the PTX qualifier "global" and STORE_WT to "volatile.global".
// NOTE(review): several lines between the #if above and these expansions are
// not visible in this excerpt, and no matching #else/#endif is shown, so it is
// unclear which preprocessor branch these three expansions belong to --
// confirm against the full file before relying on this mapping.
273 CUB_STORE_ALL(STORE_CG, global)
274 CUB_STORE_ALL(STORE_CS, global)
275 CUB_STORE_ALL(STORE_WT, volatile.global)
// Tag-dispatched overload selected when the modifier tag is
// Int2Type<STORE_DEFAULT> and the is_pointer tag is Int2Type<false>, i.e. the
// destination is a generic OutputIterator rather than a raw pointer.
// NOTE(review): the function name, its leading parameters and its body are
// not visible in this excerpt.
282 template <
typename OutputIterator,
typename T>
286 Int2Type<STORE_DEFAULT> modifier,
287 Int2Type<false> is_pointer)
// Tag-dispatched overload selected when the modifier tag is
// Int2Type<STORE_DEFAULT> and the is_pointer tag is Int2Type<true>, i.e. the
// destination is a raw pointer.
// NOTE(review): the function name, its leading parameters and its body are
// not visible in this excerpt.
296 template <
typename T>
300 Int2Type<STORE_DEFAULT> modifier,
301 Int2Type<true> is_pointer)
// ThreadStoreVolatilePtr overload chosen by the Int2Type<true> is_primitive
// tag: writes `val` with a single assignment through `ptr` reinterpreted as
// `volatile T*`.
// NOTE(review): the leading parameters (the `ptr` and `val` used below) are
// declared on lines not visible in this excerpt.
310 template <
typename T>
311 __device__ __forceinline__
void ThreadStoreVolatilePtr(
314 Int2Type<true> is_primitive)
316 *
reinterpret_cast<volatile T*
>(ptr) = val;
// ThreadStoreVolatilePtr overload chosen by the Int2Type<false> is_primitive
// tag (T is not a primitive type). The visible code shows two pieces:
//   * under `#if CUB_PTX_ARCH <= 130`, a call to __threadfence_block();
//   * a word-wise path that copies `val` into a local array of
//     UnitWord<T>::VolatileWord, then writes the array element by element to
//     `ptr` (viewed as volatile VolatileWord*) via
//     IterateThreadStore<0, VOLATILE_MULTIPLE>::Dereference.
// NOTE(review): the lines separating these two pieces (presumably an #else),
// the statement before the fence, and the final argument of the Dereference
// call are not visible in this excerpt -- confirm against the full file.
323 template <
typename T>
324 __device__ __forceinline__
void ThreadStoreVolatilePtr(
327 Int2Type<false> is_primitive)
329 #if CUB_PTX_ARCH <= 130
332 __threadfence_block();
// Word type used for the volatile element-wise copy of T.
336 typedef typename UnitWord<T>::VolatileWord VolatileWord;
// Number of VolatileWord elements that make up one T (integer division;
// assumes sizeof(T) is a multiple of sizeof(VolatileWord) -- TODO confirm).
338 const int VOLATILE_MULTIPLE =
sizeof(T) /
sizeof(VolatileWord);
// Stage `val` in a local word array by writing it through a T* alias.
340 VolatileWord words[VOLATILE_MULTIPLE];
341 *
reinterpret_cast<T*
>(words) = val;
// NOTE(review): `::template Dereference(` is not followed by a template
// argument list. Some compilers diagnose this form; the recursive call inside
// IterateThreadStore::Dereference omits the `template` keyword.
345 IterateThreadStore<0, VOLATILE_MULTIPLE>::template Dereference(
346 reinterpret_cast<volatile VolatileWord*>(ptr),
349 #endif // CUB_PTX_ARCH <= 130
// Tag-dispatched overload selected for Int2Type<STORE_VOLATILE> on a raw
// pointer (is_pointer = Int2Type<true>). It forwards to ThreadStoreVolatilePtr,
// choosing the primitive or non-primitive overload from Traits<T>::PRIMITIVE.
// NOTE(review): the function name and its leading parameters (`ptr`, `val`)
// are on lines not visible in this excerpt.
357 template <
typename T>
361 Int2Type<STORE_VOLATILE> modifier,
362 Int2Type<true> is_pointer)
364 ThreadStoreVolatilePtr(ptr, val, Int2Type<Traits<T>::PRIMITIVE>());
// Tag-dispatched overload for an arbitrary Int2Type<MODIFIER> on a raw pointer
// (is_pointer = Int2Type<true>). It copies `val` into a local array of
// UnitWord<T>::DeviceWord and stores that array word by word through
// IterateThreadStore<0, DEVICE_MULTIPLE>::Store<CacheStoreModifier(MODIFIER)>,
// with `ptr` reinterpreted as DeviceWord*.
// NOTE(review): the function name, its leading parameters (`ptr`, `val`) and
// the final argument of the Store call are on lines not visible in this
// excerpt.
371 template <
typename T,
int MODIFIER>
375 Int2Type<MODIFIER> modifier,
376 Int2Type<true> is_pointer)
// Word type used for the modifier-specific stores of T.
378 typedef typename UnitWord<T>::DeviceWord DeviceWord;
// Number of DeviceWord elements that make up one T (integer division;
// assumes sizeof(T) is a multiple of sizeof(DeviceWord) -- TODO confirm).
380 const int DEVICE_MULTIPLE =
sizeof(T) /
sizeof(DeviceWord);
// Stage `val` in a local word array by writing it through a T* alias.
382 DeviceWord words[DEVICE_MULTIPLE];
384 *
reinterpret_cast<T*
>(words) = val;
// MODIFIER arrives as an int template parameter and is converted back to the
// CacheStoreModifier enum for the per-word ThreadStore calls.
386 IterateThreadStore<0, DEVICE_MULTIPLE>::template Store<CacheStoreModifier(MODIFIER)>(
387 reinterpret_cast<DeviceWord*
>(ptr),
// Definition of the ThreadStore(itr, val) entry point. It performs compile-time
// tag dispatch by constructing Int2Type<MODIFIER>() and
// Int2Type<IsPointer<OutputIterator>::VALUE>() and passing them as the trailing
// arguments of an internal call, so the overload is selected by the cache
// modifier and by whether OutputIterator is a raw pointer.
// NOTE(review): the callee name and its leading arguments are on lines not
// visible in this excerpt.
395 template <CacheStoreModifier MODIFIER,
typename OutputIterator,
typename T>
396 __device__ __forceinline__
void ThreadStore(OutputIterator itr, T val)
401 Int2Type<MODIFIER>(),
402 Int2Type<IsPointer<OutputIterator>::VALUE>());
407 #endif // DOXYGEN_SHOULD_SKIP_THIS