40 #include "../util_ptx.cuh"
41 #include "../util_type.cuh"
42 #include "../util_namespace.cuh"
109 typename InputIterator>
// Forward declaration of the thread-level load entry point. It takes an input
// iterator and returns that iterator's value_type as reported by
// std::iterator_traits.
// NOTE(review): the opening of this template parameter list is not visible in
// this excerpt; the MODIFIER argument that callers pass explicitly
// (ThreadLoad<MODIFIER>(...)) is presumably declared there -- confirm.
110 __device__ __forceinline__
typename std::iterator_traits<InputIterator>::value_type
ThreadLoad(InputIterator itr);
116 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
// Compile-time counted helper: each instantiation handles element COUNT and
// then hands off to IterateThreadLoad<COUNT + 1, MAX>. The
// IterateThreadLoad<MAX, MAX> specialization has empty members, which ends the
// chain.
//   COUNT - index of the element handled by this instantiation
//   MAX   - index at which the chain stops (element MAX itself is not touched)
120 template <
int COUNT,
int MAX>
121 struct IterateThreadLoad
// Reads element COUNT through ThreadLoad using cache modifier MODIFIER and
// stores it in vals[COUNT], then continues with element COUNT + 1.
//   ptr  - base address of the elements to read
//   vals - output array receiving the loaded elements
123 template <CacheLoadModifier MODIFIER,
typename T>
124 static __device__ __forceinline__
void Load(T *ptr, T *vals)
// Load this instantiation's element with the requested modifier.
126 vals[COUNT] = ThreadLoad<MODIFIER>(ptr + COUNT);
// Hand off to the next index; "template" is needed because Load is a
// dependent member template.
127 IterateThreadLoad<COUNT + 1, MAX>::template Load<MODIFIER>(ptr, vals);
// Same walk as Load, but reads each element with plain operator[] on ptr
// instead of going through ThreadLoad, so it works for any indexable
// iterator type.
//   ptr  - indexable source
//   vals - output array receiving the elements
130 template <
typename InputIterator,
typename T>
131 static __device__ __forceinline__
void Dereference(InputIterator ptr, T *vals)
133 vals[COUNT] = ptr[COUNT];
134 IterateThreadLoad<COUNT + 1, MAX>::Dereference(ptr, vals);
// Terminating specialization (COUNT == MAX): both members are intentionally
// empty so the COUNT + 1 hand-off in the primary template stops here.
// NOTE(review): the "template <...>" header introducing MAX is not visible in
// this excerpt.
141 struct IterateThreadLoad<MAX, MAX>
// No-op: nothing left to load.
143 template <CacheLoadModifier MODIFIER,
typename T>
144 static __device__ __forceinline__
void Load(T *ptr, T *vals) {}
// No-op: nothing left to dereference.
146 template <
typename InputIterator,
typename T>
147 static __device__ __forceinline__
void Dereference(InputIterator ptr, T *vals) {}
// CUB_LOAD_16(cub_modifier, ptx_modifier)
// Emits ThreadLoad specializations for uint4* and ulonglong2* under the given
// cub_modifier. Each is implemented with an inline-PTX vector load:
// "ld.<ptx_modifier>.v4.u32" for uint4 and "ld.<ptx_modifier>.v2.u64" for
// ulonglong2.
//   cub_modifier - CacheLoadModifier value used as the first template argument
//   ptx_modifier - token sequence stringified (#) into the PTX "ld" mnemonic
// The address operand is passed through _CUB_ASM_PTR_(ptr).
// NOTE(review): the output-operand and return lines are not visible in this
// excerpt.
154 #define CUB_LOAD_16(cub_modifier, ptx_modifier) \
156 __device__ __forceinline__ uint4 ThreadLoad<cub_modifier, uint4*>(uint4* ptr) \
159 asm volatile ("ld."#ptx_modifier".v4.u32 {%0, %1, %2, %3}, [%4];" : \
164 _CUB_ASM_PTR_(ptr)); \
168 __device__ __forceinline__ ulonglong2 ThreadLoad<cub_modifier, ulonglong2*>(ulonglong2* ptr) \
171 asm volatile ("ld."#ptx_modifier".v2.u64 {%0, %1}, [%2];" : \
174 _CUB_ASM_PTR_(ptr)); \
// CUB_LOAD_8(cub_modifier, ptx_modifier)
// Emits ThreadLoad specializations for ushort4*, uint2* and
// unsigned long long* under the given cub_modifier, using the inline-PTX loads
// "ld.<ptx_modifier>.v4.u16", "ld.<ptx_modifier>.v2.u32" and
// "ld.<ptx_modifier>.u64" respectively.
//   cub_modifier - CacheLoadModifier value used as the first template argument
//   ptx_modifier - token sequence stringified (#) into the PTX "ld" mnemonic
// NOTE(review): the output-operand and return lines are not visible in this
// excerpt.
181 #define CUB_LOAD_8(cub_modifier, ptx_modifier) \
183 __device__ __forceinline__ ushort4 ThreadLoad<cub_modifier, ushort4*>(ushort4* ptr) \
186 asm volatile ("ld."#ptx_modifier".v4.u16 {%0, %1, %2, %3}, [%4];" : \
191 _CUB_ASM_PTR_(ptr)); \
195 __device__ __forceinline__ uint2 ThreadLoad<cub_modifier, uint2*>(uint2* ptr) \
198 asm volatile ("ld."#ptx_modifier".v2.u32 {%0, %1}, [%2];" : \
201 _CUB_ASM_PTR_(ptr)); \
205 __device__ __forceinline__ unsigned long long ThreadLoad<cub_modifier, unsigned long long*>(unsigned long long* ptr) \
207 unsigned long long retval; \
208 asm volatile ("ld."#ptx_modifier".u64 %0, [%1];" : \
210 _CUB_ASM_PTR_(ptr)); \
// CUB_LOAD_4(cub_modifier, ptx_modifier)
// Emits the ThreadLoad specialization for unsigned int* under the given
// cub_modifier, implemented with the inline-PTX load "ld.<ptx_modifier>.u32"
// into a local retval.
//   cub_modifier - CacheLoadModifier value used as the first template argument
//   ptx_modifier - token sequence stringified (#) into the PTX "ld" mnemonic
// NOTE(review): the output-operand and return lines are not visible in this
// excerpt.
217 #define CUB_LOAD_4(cub_modifier, ptx_modifier) \
219 __device__ __forceinline__ unsigned int ThreadLoad<cub_modifier, unsigned int*>(unsigned int* ptr) \
221 unsigned int retval; \
222 asm volatile ("ld."#ptx_modifier".u32 %0, [%1];" : \
224 _CUB_ASM_PTR_(ptr)); \
// CUB_LOAD_2(cub_modifier, ptx_modifier)
// Emits the ThreadLoad specialization for unsigned short* under the given
// cub_modifier, implemented with the inline-PTX load "ld.<ptx_modifier>.u16"
// into a local retval.
//   cub_modifier - CacheLoadModifier value used as the first template argument
//   ptx_modifier - token sequence stringified (#) into the PTX "ld" mnemonic
// NOTE(review): the output-operand and return lines are not visible in this
// excerpt.
232 #define CUB_LOAD_2(cub_modifier, ptx_modifier) \
234 __device__ __forceinline__ unsigned short ThreadLoad<cub_modifier, unsigned short*>(unsigned short* ptr) \
236 unsigned short retval; \
237 asm volatile ("ld."#ptx_modifier".u16 %0, [%1];" : \
239 _CUB_ASM_PTR_(ptr)); \
// CUB_LOAD_1(cub_modifier, ptx_modifier)
// Emits the ThreadLoad specialization for unsigned char* under the given
// cub_modifier. The PTX loads the byte with "ld.<ptx_modifier>.u8" into a
// scratch register named datum, widens it with "cvt.u16.u8" into operand %0,
// and the C++ side narrows the 16-bit retval back to unsigned char on return.
//   cub_modifier - CacheLoadModifier value used as the first template argument
//   ptx_modifier - token sequence stringified (#) into the PTX "ld" mnemonic
// NOTE(review): retval is a 16-bit variable presumably because inline-asm
// operand constraints offer no 8-bit register class -- confirm against the
// inline PTX documentation. The declaration of datum and the operand
// constraint lines are not visible in this excerpt.
247 #define CUB_LOAD_1(cub_modifier, ptx_modifier) \
249 __device__ __forceinline__ unsigned char ThreadLoad<cub_modifier, unsigned char*>(unsigned char* ptr) \
251 unsigned short retval; \
255 " ld."#ptx_modifier".u8 datum, [%1];" \
256 " cvt.u16.u8 %0, datum;" \
259 _CUB_ASM_PTR_(ptr)); \
260 return (unsigned char) retval; \
// CUB_LOAD_ALL(cub_modifier, ptx_modifier)
// Convenience macro: expands CUB_LOAD_16, CUB_LOAD_8, CUB_LOAD_4, CUB_LOAD_2
// and CUB_LOAD_1 with the same (cub_modifier, ptx_modifier) pair, so one
// invocation emits every ThreadLoad specialization those macros define for
// that cache modifier.
267 #define CUB_LOAD_ALL(cub_modifier, ptx_modifier) \
268 CUB_LOAD_16(cub_modifier, ptx_modifier) \
269 CUB_LOAD_8(cub_modifier, ptx_modifier) \
270 CUB_LOAD_4(cub_modifier, ptx_modifier) \
271 CUB_LOAD_2(cub_modifier, ptx_modifier) \
272 CUB_LOAD_1(cub_modifier, ptx_modifier) \
// Expand CUB_LOAD_ALL once per cache modifier to emit the ThreadLoad
// specializations.
// NOTE(review): the embedded numbering jumps from 278 to 286, so the
// preprocessor branch (#if vs. a possible #else) that the three expansions
// below belong to is not visible in this excerpt -- confirm against the full
// file before relying on which architectures they apply to.
278 #if CUB_PTX_ARCH >= 200
// LOAD_CG and LOAD_CV are emitted with the "volatile.global" PTX qualifier;
// LOAD_CS is emitted with plain "global".
286 CUB_LOAD_ALL(LOAD_CG, volatile.global)
287 CUB_LOAD_ALL(LOAD_CS, global)
288 CUB_LOAD_ALL(LOAD_CV, volatile.global)
// NOTE(review): the contents of this architecture-specific section are not
// visible in this excerpt.
291 #if CUB_PTX_ARCH >= 350
// ThreadLoad overload selected by tag dispatch for LOAD_DEFAULT on a
// non-pointer iterator (is_pointer tag is Int2Type<false>). Returns the
// iterator's value_type as reported by std::iterator_traits.
//   modifier   - tag carrying the cache modifier (LOAD_DEFAULT)
//   is_pointer - tag indicating the iterator is not a raw pointer
// NOTE(review): the iterator parameter and the function body are not visible
// in this excerpt.
301 template <
typename InputIterator>
302 __device__ __forceinline__
typename std::iterator_traits<InputIterator>::value_type
ThreadLoad(
304 Int2Type<LOAD_DEFAULT> modifier,
305 Int2Type<false> is_pointer)
// Overload selected by tag dispatch for LOAD_DEFAULT on a raw pointer
// (is_pointer tag is Int2Type<true>).
//   modifier   - tag carrying the cache modifier (LOAD_DEFAULT)
//   is_pointer - tag indicating the source is a raw pointer
// NOTE(review): the function name, pointer parameter and body are not visible
// in this excerpt; presumably this is the ThreadLoad pointer counterpart of
// the LOAD_DEFAULT iterator overload -- confirm.
314 template <
typename T>
317 Int2Type<LOAD_DEFAULT> modifier,
318 Int2Type<true> is_pointer)
// Volatile load of a T selected when the is_primitive tag is Int2Type<true>.
// Reads the value through a volatile-qualified view of ptr so the access is
// performed as a volatile read.
//   is_primitive - tag indicating T is treated as a primitive type
// NOTE(review): the ptr parameter line, the closing #endif and the return
// statement are not visible in this excerpt.
327 template <
typename T>
328 __device__ __forceinline__ T ThreadLoadVolatilePointer(
330 Int2Type<true> is_primitive)
// Single volatile read of the whole value.
332 T retval = *
reinterpret_cast<volatile T*
>(ptr);
// Only when CUB_PTX_ARCH <= 130: one-byte types additionally issue a
// block-level memory fence after the read.
334 #if (CUB_PTX_ARCH <= 130)
335 if (
sizeof(T) == 1) __threadfence_block();
// Volatile load of a T selected when the is_primitive tag is Int2Type<false>.
// Two strategies are chosen at preprocessing time on CUB_PTX_ARCH; see the
// notes on each section below.
//   is_primitive - tag indicating T is not treated as a primitive type
// NOTE(review): the ptr parameter line, the declaration of retval, the
// directive separating the two strategies and the return statement are not
// visible in this excerpt.
345 template <
typename T>
346 __device__ __forceinline__ T ThreadLoadVolatilePointer(
348 Int2Type<false> is_primitive)
// CUB_PTX_ARCH <= 130 path: issues a block-level memory fence.
// NOTE(review): the load that accompanies this fence is not visible here.
351 #if CUB_PTX_ARCH <= 130
354 __threadfence_block();
// Word-wise path: T is read as VOLATILE_MULTIPLE words of type
// UnitWord<T>::VolatileWord.
359 typedef typename UnitWord<T>::VolatileWord VolatileWord;
// Number of VolatileWord units that make up one T.
361 const int VOLATILE_MULTIPLE =
sizeof(T) /
sizeof(VolatileWord);
// View the result object as an array of words so it can be filled in place.
373 VolatileWord *words =
reinterpret_cast<VolatileWord*
>(&retval);
// Copy each word via a volatile-qualified view of the source pointer.
// NOTE(review): the second argument is not visible; presumably it is words.
374 IterateThreadLoad<0, VOLATILE_MULTIPLE>::Dereference(
375 reinterpret_cast<volatile VolatileWord*>(ptr),
379 #endif // CUB_PTX_ARCH <= 130
// Overload selected by tag dispatch for LOAD_VOLATILE on a raw pointer.
// Forwards to ThreadLoadVolatilePointer, picking the primitive or
// non-primitive implementation from Traits<T>::PRIMITIVE.
//   modifier   - tag carrying the cache modifier (LOAD_VOLATILE)
//   is_pointer - tag indicating the source is a raw pointer
// NOTE(review): the function name and pointer parameter lines are not visible
// in this excerpt.
386 template <
typename T>
389 Int2Type<LOAD_VOLATILE> modifier,
390 Int2Type<true> is_pointer)
393 return ThreadLoadVolatilePointer(ptr, Int2Type<Traits<T>::PRIMITIVE>());
// Overload selected by tag dispatch for an arbitrary cache modifier on a raw
// pointer. T is loaded as DEVICE_MULTIPLE words of type
// UnitWord<T>::DeviceWord, each fetched through the ThreadLoad specialization
// for that word type and modifier, and the words are then reinterpreted as
// a T.
//   modifier   - tag carrying the cache modifier as an int (MODIFIER)
//   is_pointer - tag indicating the source is a raw pointer
// NOTE(review): the function name and pointer parameter lines are not visible
// in this excerpt.
400 template <
typename T,
int MODIFIER>
403 Int2Type<MODIFIER> modifier,
404 Int2Type<true> is_pointer)
// Word type used to move T's bytes with the modified loads.
406 typedef typename UnitWord<T>::DeviceWord DeviceWord;
// Number of DeviceWord units that make up one T.
408 const int DEVICE_MULTIPLE =
sizeof(T) /
sizeof(DeviceWord);
// Local staging buffer that receives the loaded words.
410 DeviceWord words[DEVICE_MULTIPLE];
// Load every word with the requested modifier (MODIFIER is converted back to
// the CacheLoadModifier enum for the template argument).
// NOTE(review): the second argument is not visible; presumably it is words.
412 IterateThreadLoad<0, DEVICE_MULTIPLE>::template Load<CacheLoadModifier(MODIFIER)>(
413 reinterpret_cast<DeviceWord*
>(ptr),
// Reinterpret the staged words as the requested type.
416 return *
reinterpret_cast<T*
>(words);
425 typename InputIterator>
// Definition of the generic ThreadLoad entry point. It builds two tag objects
// -- Int2Type<MODIFIER> for the cache modifier and
// Int2Type<IsPointer<InputIterator>::VALUE> for "is this a raw pointer" -- and
// passes them on so overload resolution picks the matching implementation.
//   itr - iterator (or pointer) to load from
// NOTE(review): the opening of the template parameter list (which presumably
// declares MODIFIER) and the line naming the callee are not visible in this
// excerpt.
426 __device__ __forceinline__
typename std::iterator_traits<InputIterator>::value_type
ThreadLoad(InputIterator itr)
431 Int2Type<MODIFIER>(),
432 Int2Type<IsPointer<InputIterator>::VALUE>());
437 #endif // DOXYGEN_SHOULD_SKIP_THIS