21 #ifndef ROCPRIM_DEVICE_DETAIL_DEVICE_HISTOGRAM_HPP_ 22 #define ROCPRIM_DEVICE_DETAIL_DEVICE_HISTOGRAM_HPP_ 26 #include <type_traits> 28 #include "../../config.hpp" 29 #include "../../detail/various.hpp" 30 #include "../../functional.hpp" 31 #include "../../intrinsics.hpp" 32 #include "../../type_traits.hpp" 34 #include "../../block/block_load.hpp" 36 #include "uint_fast_div.hpp" 38 BEGIN_ROCPRIM_NAMESPACE
44 template<
class T,
unsigned int Size>
51 ROCPRIM_HOST_DEVICE
fixed_array(
const T values[Size])
53 for(
unsigned int i = 0; i < Size; i++)
55 this->values[i] = values[i];
59 ROCPRIM_HOST_DEVICE T& operator[](
unsigned int index)
64 ROCPRIM_HOST_DEVICE
const T& operator[](
unsigned int index)
const 70 template<
class Level,
class Enable =
void>
80 ROCPRIM_HOST_DEVICE
sample_to_bin_even(
unsigned int bins, Level lower_level, Level upper_level)
82 , lower_level(lower_level)
83 , upper_level(upper_level)
84 , scale((upper_level - lower_level) / bins)
87 template<
class Sample>
88 ROCPRIM_HOST_DEVICE
bool operator()(Sample sample,
unsigned int& bin)
const 90 const Level s =
static_cast<Level
>(sample);
91 if(s >= lower_level && s < upper_level)
93 bin =
static_cast<unsigned int>((s - lower_level) / scale);
101 template<
class Level>
104 typename std::enable_if<std::is_integral<Level>::value && (sizeof(Level) <= 4)>::type>
117 , lower_level(lower_level)
118 , upper_level(upper_level)
119 , scale((upper_level - lower_level) / bins)
122 template<
class Sample>
123 ROCPRIM_HOST_DEVICE
inline bool operator()(Sample sample,
unsigned int& bin)
const 125 const Level s =
static_cast<Level
>(sample);
126 if(s >= lower_level && s < upper_level)
128 bin =
static_cast<unsigned int>(s - lower_level) / scale;
136 template<
class Level>
138 typename std::enable_if<rocprim::is_floating_point<Level>::value>::type>
151 , lower_level(lower_level)
152 , upper_level(upper_level)
153 , inv_scale(static_cast<Level>(bins) / (upper_level - lower_level))
156 template<
class Sample>
157 ROCPRIM_HOST_DEVICE
inline bool operator()(Sample sample,
unsigned int& bin)
const 159 const Level s =
static_cast<Level
>(sample);
160 if(s >= lower_level && s < upper_level)
162 bin =
static_cast<unsigned int>((s - lower_level) * inv_scale);
171 ROCPRIM_HOST_DEVICE
inline unsigned int upper_bound(
const T* values,
unsigned int count, T value)
173 unsigned int current = 0;
176 const unsigned int step = count / 2;
177 const unsigned int next = current + step;
178 if(value < values[next])
191 template<
class Level>
195 const Level* level_values;
199 ROCPRIM_HOST_DEVICE
inline sample_to_bin_range(
unsigned int bins,
const Level* level_values)
200 : bins(bins), level_values(level_values)
203 template<
class Sample>
204 ROCPRIM_HOST_DEVICE
inline bool operator()(Sample sample,
unsigned int& bin)
const 206 const Level s =
static_cast<Level
>(sample);
207 bin = upper_bound(level_values, bins + 1, s) - 1;
212 template<
class T,
unsigned int Size>
219 template<
unsigned int ItemsPerThread,
unsigned int Channels,
class Sample>
221 : std::integral_constant<bool,
222 ((sizeof(Sample) * Channels == 1) || (sizeof(Sample) * Channels == 2))
223 && (sizeof(Sample) * Channels * ItemsPerThread % sizeof(int) == 0)
224 && (sizeof(Sample) * Channels * ItemsPerThread / sizeof(int) > 0)>
227 template<
unsigned int BlockSize,
unsigned int ItemsPerThread,
unsigned int Channels,
class Sample>
228 ROCPRIM_DEVICE ROCPRIM_INLINE
229 typename std::enable_if<is_sample_vectorizable<ItemsPerThread, Channels, Sample>::value>::type
230 load_samples(
unsigned int flat_id,
234 using packed_samples_type =
int[
sizeof(Sample) * Channels * ItemsPerThread /
sizeof(
int)];
236 if(reinterpret_cast<uintptr_t>(samples) %
sizeof(int) == 0)
239 block_load_direct_striped<BlockSize>(flat_id,
240 reinterpret_cast<const int*
>(samples),
241 reinterpret_cast<packed_samples_type&>(values));
245 block_load_direct_striped<BlockSize>(
252 template<
unsigned int BlockSize,
unsigned int ItemsPerThread,
unsigned int Channels,
class Sample>
253 ROCPRIM_DEVICE ROCPRIM_INLINE
254 typename std::enable_if<!is_sample_vectorizable<ItemsPerThread, Channels, Sample>::value>::type
255 load_samples(
unsigned int flat_id,
259 block_load_direct_striped<BlockSize>(
265 template<
unsigned int BlockSize,
266 unsigned int ItemsPerThread,
267 unsigned int Channels,
269 class SampleIterator>
270 ROCPRIM_DEVICE ROCPRIM_INLINE
void 271 load_samples(
unsigned int flat_id,
272 SampleIterator samples,
275 Sample tmp[Channels * ItemsPerThread];
277 for(
unsigned int i = 0; i < ItemsPerThread; i++)
279 for(
unsigned int channel = 0; channel < Channels; channel++)
281 values[i].values[channel] = tmp[i * Channels + channel];
286 template<
unsigned int BlockSize,
287 unsigned int ItemsPerThread,
288 unsigned int Channels,
290 class SampleIterator>
291 ROCPRIM_DEVICE ROCPRIM_INLINE
void 292 load_samples(
unsigned int flat_id,
293 SampleIterator samples,
295 unsigned int valid_count)
297 Sample tmp[Channels * ItemsPerThread];
299 for(
unsigned int i = 0; i < ItemsPerThread; i++)
301 for(
unsigned int channel = 0; channel < Channels; channel++)
303 values[i].values[channel] = tmp[i * Channels + channel];
308 template<
unsigned int BlockSize,
unsigned int ActiveChannels,
class Counter>
312 const unsigned int flat_id = ::rocprim::detail::block_thread_id<0>();
313 const unsigned int block_id = ::rocprim::detail::block_id<0>();
315 const unsigned int index = block_id * BlockSize + flat_id;
316 for(
unsigned int channel = 0; channel < ActiveChannels; channel++)
318 if(index < bins[channel])
320 histogram[channel][index] = 0;
325 template<
unsigned int BlockSize,
326 unsigned int ItemsPerThread,
327 unsigned int Channels,
328 unsigned int ActiveChannels,
329 class SampleIterator,
332 ROCPRIM_DEVICE ROCPRIM_INLINE
void 333 histogram_shared(SampleIterator samples,
334 unsigned int columns,
336 unsigned int row_stride,
337 unsigned int rows_per_block,
338 unsigned int shared_histograms,
342 unsigned int* block_histogram_start)
344 using sample_type =
typename std::iterator_traits<SampleIterator>::value_type;
347 constexpr
unsigned int items_per_block = BlockSize * ItemsPerThread;
349 const unsigned int flat_id = ::rocprim::detail::block_thread_id<0>();
350 const unsigned int block_id0 = ::rocprim::detail::block_id<0>();
351 const unsigned int block_id1 = ::rocprim::detail::block_id<1>();
352 const unsigned int grid_size0 = ::rocprim::detail::grid_size<0>();
356 unsigned int total_bins = 0;
357 for(
unsigned int channel = 0; channel < ActiveChannels; channel++)
359 block_histogram[channel] = block_histogram_start + total_bins;
360 total_bins += bins[channel];
364 const unsigned int thread_shift = (flat_id % shared_histograms) * total_bins;
367 for(
unsigned int i = flat_id; i < total_bins * shared_histograms; i += BlockSize)
369 block_histogram_start[i] = 0;
373 const unsigned int start_row = block_id1 * rows_per_block;
374 const unsigned int end_row =
::rocprim::min(rows, start_row + rows_per_block);
375 for(
unsigned int row = start_row; row < end_row; row++)
377 SampleIterator row_samples = samples + row * row_stride;
379 unsigned int block_offset = block_id0 * items_per_block;
380 while(block_offset < columns)
382 sample_vector_type values[ItemsPerThread];
384 if(block_offset + items_per_block <= columns)
386 load_samples<BlockSize>(flat_id, row_samples + Channels * block_offset, values);
388 for(
unsigned int i = 0; i < ItemsPerThread; i++)
390 for(
unsigned int channel = 0; channel < ActiveChannels; channel++)
393 if(sample_to_bin_op[channel](values[i].values[channel], bin))
395 ::rocprim::detail::atomic_add(block_histogram[channel] + bin
404 const unsigned int valid_count = columns - block_offset;
405 load_samples<BlockSize>(flat_id,
406 row_samples + Channels * block_offset,
410 for(
unsigned int i = 0; i < ItemsPerThread; i++)
412 if(flat_id * ItemsPerThread + i < valid_count)
414 for(
unsigned int channel = 0; channel < ActiveChannels; channel++)
417 if(sample_to_bin_op[channel](values[i].values[channel], bin))
419 ::rocprim::detail::atomic_add(block_histogram[channel] + bin
428 block_offset += grid_size0 * items_per_block;
433 for(
unsigned int channel = 0; channel < ActiveChannels; channel++)
435 for(
unsigned int bin = flat_id; bin < bins[channel]; bin += BlockSize)
437 unsigned int total = 0;
438 for(
unsigned int i = 0; i < shared_histograms; i++)
440 total += block_histogram[channel][bin + i * total_bins];
444 ::rocprim::detail::atomic_add(&histogram[channel][bin], total);
450 template<
unsigned int BlockSize,
451 unsigned int ItemsPerThread,
452 unsigned int Channels,
453 unsigned int ActiveChannels,
454 class SampleIterator,
457 ROCPRIM_DEVICE ROCPRIM_INLINE
void 458 histogram_global(SampleIterator samples,
459 unsigned int columns,
460 unsigned int row_stride,
465 using sample_type =
typename std::iterator_traits<SampleIterator>::value_type;
468 constexpr
unsigned int items_per_block = BlockSize * ItemsPerThread;
470 const unsigned int flat_id = ::rocprim::detail::block_thread_id<0>();
471 const unsigned int block_id0 = ::rocprim::detail::block_id<0>();
472 const unsigned int block_id1 = ::rocprim::detail::block_id<1>();
473 const unsigned int block_offset = block_id0 * items_per_block;
475 samples += block_id1 * row_stride + Channels * block_offset;
477 sample_vector_type values[ItemsPerThread];
478 unsigned int valid_count;
479 if(block_offset + items_per_block <= columns)
481 valid_count = items_per_block;
482 load_samples<BlockSize>(flat_id, samples, values);
486 valid_count = columns - block_offset;
487 load_samples<BlockSize>(flat_id, samples, values, valid_count);
490 for(
unsigned int i = 0; i < ItemsPerThread; i++)
492 for(
unsigned int channel = 0; channel < ActiveChannels; channel++)
495 if(sample_to_bin_op[channel](values[i].values[channel], bin))
497 const unsigned int pos = flat_id * ItemsPerThread + i;
499 for(
unsigned int b = 0; b < bins_bits[channel]; b++)
501 const unsigned int bit_set = bin & (1u << b);
503 same_bin_lanes_mask &= (bit_set ? bit_set_mask : ~bit_set_mask);
506 const unsigned int prev_same_bin_count
508 if(prev_same_bin_count == 0)
512 ::rocprim::detail::atomic_add(&histogram[channel][bin], same_bin_count);
521 END_ROCPRIM_NAMESPACE
523 #endif // ROCPRIM_DEVICE_DETAIL_DEVICE_HISTOGRAM_HPP_
ROCPRIM_DEVICE ROCPRIM_INLINE unsigned int block_id()
Returns block identifier in a multidimensional grid by dimension.
Definition: thread.hpp:258
The block_histogram class is a block level parallel primitive which provides methods for constructing...
Definition: block_histogram.hpp:135
ROCPRIM_DEVICE ROCPRIM_INLINE unsigned int masked_bit_count(lane_mask_type x, unsigned int add=0)
Masked bit count.
Definition: warp.hpp:48
ROCPRIM_HOST_DEVICE constexpr T min(const T &a, const T &b)
Returns the minimum of its arguments.
Definition: functional.hpp:63
Definition: device_histogram.hpp:71
Definition: device_histogram.hpp:220
Deprecated: Configuration of device-level scan primitives.
Definition: block_histogram.hpp:62
ROCPRIM_DEVICE ROCPRIM_INLINE lane_mask_type ballot(int predicate)
Evaluate predicate for all active work-items in the warp and return an integer whose i-th bit is set ...
Definition: warp.hpp:38
Definition: uint_fast_div.hpp:31
ROCPRIM_DEVICE ROCPRIM_INLINE void syncthreads()
Synchronize all threads in a block (tile)
Definition: thread.hpp:216
Definition: device_histogram.hpp:45
BEGIN_ROCPRIM_NAMESPACE ROCPRIM_DEVICE ROCPRIM_INLINE void block_load_direct_blocked(unsigned int flat_id, InputIterator block_input, T(&items)[ItemsPerThread])
Loads data from contiguous memory into a blocked arrangement of items across the thread block...
Definition: block_load_func.hpp:58
Definition: benchmark_block_histogram.cpp:64
Definition: device_histogram.hpp:192
Definition: device_histogram.hpp:213
unsigned long long int lane_mask_type
The lane_mask_type is an integer that contains one bit per thread.
Definition: types.hpp:164
ROCPRIM_DEVICE ROCPRIM_INLINE unsigned int bit_count(unsigned int x)
Bit count.
Definition: bit.hpp:42