rocPRIM
device_histogram.hpp
1 // Copyright (c) 2017-2023 Advanced Micro Devices, Inc. All rights reserved.
2 //
3 // Permission is hereby granted, free of charge, to any person obtaining a copy
4 // of this software and associated documentation files (the "Software"), to deal
5 // in the Software without restriction, including without limitation the rights
6 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 // copies of the Software, and to permit persons to whom the Software is
8 // furnished to do so, subject to the following conditions:
9 //
10 // The above copyright notice and this permission notice shall be included in
11 // all copies or substantial portions of the Software.
12 //
13 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 // THE SOFTWARE.
20 
21 #ifndef ROCPRIM_DEVICE_DETAIL_DEVICE_HISTOGRAM_HPP_
22 #define ROCPRIM_DEVICE_DETAIL_DEVICE_HISTOGRAM_HPP_
23 
24 #include <cmath>
25 #include <iterator>
26 #include <type_traits>
27 
28 #include "../../config.hpp"
29 #include "../../detail/various.hpp"
30 #include "../../functional.hpp"
31 #include "../../intrinsics.hpp"
32 #include "../../type_traits.hpp"
33 
34 #include "../../block/block_load.hpp"
35 
36 #include "uint_fast_div.hpp"
37 
38 BEGIN_ROCPRIM_NAMESPACE
39 
40 namespace detail
41 {
42 
43 // Special wrapper for passing fixed-length arrays (i.e. T values[Size]) into kernels
44 template<class T, unsigned int Size>
46 {
47 private:
48  T values[Size];
49 
50 public:
51  ROCPRIM_HOST_DEVICE fixed_array(const T values[Size])
52  {
53  for(unsigned int i = 0; i < Size; i++)
54  {
55  this->values[i] = values[i];
56  }
57  }
58 
59  ROCPRIM_HOST_DEVICE T& operator[](unsigned int index)
60  {
61  return values[index];
62  }
63 
64  ROCPRIM_HOST_DEVICE const T& operator[](unsigned int index) const
65  {
66  return values[index];
67  }
68 };
69 
70 template<class Level, class Enable = void>
72 {
73  unsigned int bins;
74  Level lower_level;
75  Level upper_level;
76  Level scale;
77 
78  ROCPRIM_HOST_DEVICE sample_to_bin_even() = default;
79 
80  ROCPRIM_HOST_DEVICE sample_to_bin_even(unsigned int bins, Level lower_level, Level upper_level)
81  : bins(bins)
82  , lower_level(lower_level)
83  , upper_level(upper_level)
84  , scale((upper_level - lower_level) / bins)
85  {}
86 
87  template<class Sample>
88  ROCPRIM_HOST_DEVICE bool operator()(Sample sample, unsigned int& bin) const
89  {
90  const Level s = static_cast<Level>(sample);
91  if(s >= lower_level && s < upper_level)
92  {
93  bin = static_cast<unsigned int>((s - lower_level) / scale);
94  return true;
95  }
96  return false;
97  }
98 };
99 
100 // This specialization uses fast division (uint_fast_div) for integers smaller than 64 bit
101 template<class Level>
103  Level,
104  typename std::enable_if<std::is_integral<Level>::value && (sizeof(Level) <= 4)>::type>
105 {
106  unsigned int bins;
107  Level lower_level;
108  Level upper_level;
109  uint_fast_div scale;
110 
111  ROCPRIM_HOST_DEVICE inline sample_to_bin_even() = default;
112 
113  ROCPRIM_HOST_DEVICE inline sample_to_bin_even(unsigned int bins,
114  Level lower_level,
115  Level upper_level)
116  : bins(bins)
117  , lower_level(lower_level)
118  , upper_level(upper_level)
119  , scale((upper_level - lower_level) / bins)
120  {}
121 
122  template<class Sample>
123  ROCPRIM_HOST_DEVICE inline bool operator()(Sample sample, unsigned int& bin) const
124  {
125  const Level s = static_cast<Level>(sample);
126  if(s >= lower_level && s < upper_level)
127  {
128  bin = static_cast<unsigned int>(s - lower_level) / scale;
129  return true;
130  }
131  return false;
132  }
133 };
134 
135 // This specialization uses multiplication by inv divisor for floats
136 template<class Level>
137 struct sample_to_bin_even<Level,
138  typename std::enable_if<rocprim::is_floating_point<Level>::value>::type>
139 {
140  unsigned int bins;
141  Level lower_level;
142  Level upper_level;
143  Level inv_scale;
144 
145  ROCPRIM_HOST_DEVICE inline sample_to_bin_even() = default;
146 
147  ROCPRIM_HOST_DEVICE inline sample_to_bin_even(unsigned int bins,
148  Level lower_level,
149  Level upper_level)
150  : bins(bins)
151  , lower_level(lower_level)
152  , upper_level(upper_level)
153  , inv_scale(static_cast<Level>(bins) / (upper_level - lower_level))
154  {}
155 
156  template<class Sample>
157  ROCPRIM_HOST_DEVICE inline bool operator()(Sample sample, unsigned int& bin) const
158  {
159  const Level s = static_cast<Level>(sample);
160  if(s >= lower_level && s < upper_level)
161  {
162  bin = static_cast<unsigned int>((s - lower_level) * inv_scale);
163  return true;
164  }
165  return false;
166  }
167 };
168 
169 // Returns index of the first element in values that is greater than value, or count if no such element is found.
170 template<class T>
171 ROCPRIM_HOST_DEVICE inline unsigned int upper_bound(const T* values, unsigned int count, T value)
172 {
173  unsigned int current = 0;
174  while(count > 0)
175  {
176  const unsigned int step = count / 2;
177  const unsigned int next = current + step;
178  if(value < values[next])
179  {
180  count = step;
181  }
182  else
183  {
184  current = next + 1;
185  count -= step + 1;
186  }
187  }
188  return current;
189 }
190 
191 template<class Level>
193 {
194  unsigned int bins;
195  const Level* level_values;
196 
197  ROCPRIM_HOST_DEVICE inline sample_to_bin_range() = default;
198 
199  ROCPRIM_HOST_DEVICE inline sample_to_bin_range(unsigned int bins, const Level* level_values)
200  : bins(bins), level_values(level_values)
201  {}
202 
203  template<class Sample>
204  ROCPRIM_HOST_DEVICE inline bool operator()(Sample sample, unsigned int& bin) const
205  {
206  const Level s = static_cast<Level>(sample);
207  bin = upper_bound(level_values, bins + 1, s) - 1;
208  return bin < bins;
209  }
210 };
211 
212 template<class T, unsigned int Size>
214 {
215  T values[Size];
216 };
217 
218 // Checks if it is possible to load 2 or 4 sample_vector<Sample, Channels> as one 32-bit value
219 template<unsigned int ItemsPerThread, unsigned int Channels, class Sample>
221  : std::integral_constant<bool,
222  ((sizeof(Sample) * Channels == 1) || (sizeof(Sample) * Channels == 2))
223  && (sizeof(Sample) * Channels * ItemsPerThread % sizeof(int) == 0)
224  && (sizeof(Sample) * Channels * ItemsPerThread / sizeof(int) > 0)>
225 {};
226 
227 template<unsigned int BlockSize, unsigned int ItemsPerThread, unsigned int Channels, class Sample>
228 ROCPRIM_DEVICE ROCPRIM_INLINE
229  typename std::enable_if<is_sample_vectorizable<ItemsPerThread, Channels, Sample>::value>::type
230  load_samples(unsigned int flat_id,
231  Sample* samples,
232  sample_vector<Sample, Channels> (&values)[ItemsPerThread])
233 {
234  using packed_samples_type = int[sizeof(Sample) * Channels * ItemsPerThread / sizeof(int)];
235 
236  if(reinterpret_cast<uintptr_t>(samples) % sizeof(int) == 0)
237  {
238  // the pointer is aligned by 4 bytes
239  block_load_direct_striped<BlockSize>(flat_id,
240  reinterpret_cast<const int*>(samples),
241  reinterpret_cast<packed_samples_type&>(values));
242  }
243  else
244  {
245  block_load_direct_striped<BlockSize>(
246  flat_id,
247  reinterpret_cast<const sample_vector<Sample, Channels>*>(samples),
248  values);
249  }
250 }
251 
252 template<unsigned int BlockSize, unsigned int ItemsPerThread, unsigned int Channels, class Sample>
253 ROCPRIM_DEVICE ROCPRIM_INLINE
254  typename std::enable_if<!is_sample_vectorizable<ItemsPerThread, Channels, Sample>::value>::type
255  load_samples(unsigned int flat_id,
256  Sample* samples,
257  sample_vector<Sample, Channels> (&values)[ItemsPerThread])
258 {
259  block_load_direct_striped<BlockSize>(
260  flat_id,
261  reinterpret_cast<const sample_vector<Sample, Channels>*>(samples),
262  values);
263 }
264 
265 template<unsigned int BlockSize,
266  unsigned int ItemsPerThread,
267  unsigned int Channels,
268  class Sample,
269  class SampleIterator>
270 ROCPRIM_DEVICE ROCPRIM_INLINE void
271  load_samples(unsigned int flat_id,
272  SampleIterator samples,
273  sample_vector<Sample, Channels> (&values)[ItemsPerThread])
274 {
275  Sample tmp[Channels * ItemsPerThread];
276  block_load_direct_blocked(flat_id, samples, tmp);
277  for(unsigned int i = 0; i < ItemsPerThread; i++)
278  {
279  for(unsigned int channel = 0; channel < Channels; channel++)
280  {
281  values[i].values[channel] = tmp[i * Channels + channel];
282  }
283  }
284 }
285 
286 template<unsigned int BlockSize,
287  unsigned int ItemsPerThread,
288  unsigned int Channels,
289  class Sample,
290  class SampleIterator>
291 ROCPRIM_DEVICE ROCPRIM_INLINE void
292  load_samples(unsigned int flat_id,
293  SampleIterator samples,
294  sample_vector<Sample, Channels> (&values)[ItemsPerThread],
295  unsigned int valid_count)
296 {
297  Sample tmp[Channels * ItemsPerThread];
298  block_load_direct_blocked(flat_id, samples, tmp, valid_count * Channels);
299  for(unsigned int i = 0; i < ItemsPerThread; i++)
300  {
301  for(unsigned int channel = 0; channel < Channels; channel++)
302  {
303  values[i].values[channel] = tmp[i * Channels + channel];
304  }
305  }
306 }
307 
308 template<unsigned int BlockSize, unsigned int ActiveChannels, class Counter>
309 ROCPRIM_DEVICE ROCPRIM_INLINE void init_histogram(fixed_array<Counter*, ActiveChannels> histogram,
311 {
312  const unsigned int flat_id = ::rocprim::detail::block_thread_id<0>();
313  const unsigned int block_id = ::rocprim::detail::block_id<0>();
314 
315  const unsigned int index = block_id * BlockSize + flat_id;
316  for(unsigned int channel = 0; channel < ActiveChannels; channel++)
317  {
318  if(index < bins[channel])
319  {
320  histogram[channel][index] = 0;
321  }
322  }
323 }
324 
325 template<unsigned int BlockSize,
326  unsigned int ItemsPerThread,
327  unsigned int Channels,
328  unsigned int ActiveChannels,
329  class SampleIterator,
330  class Counter,
331  class SampleToBinOp>
332 ROCPRIM_DEVICE ROCPRIM_INLINE void
333  histogram_shared(SampleIterator samples,
334  unsigned int columns,
335  unsigned int rows,
336  unsigned int row_stride,
337  unsigned int rows_per_block,
338  unsigned int shared_histograms,
342  unsigned int* block_histogram_start)
343 {
344  using sample_type = typename std::iterator_traits<SampleIterator>::value_type;
345  using sample_vector_type = sample_vector<sample_type, Channels>;
346 
347  constexpr unsigned int items_per_block = BlockSize * ItemsPerThread;
348 
349  const unsigned int flat_id = ::rocprim::detail::block_thread_id<0>();
350  const unsigned int block_id0 = ::rocprim::detail::block_id<0>();
351  const unsigned int block_id1 = ::rocprim::detail::block_id<1>();
352  const unsigned int grid_size0 = ::rocprim::detail::grid_size<0>();
353 
354  // starts of the first histogram for each channel
355  unsigned int* block_histogram[ActiveChannels];
356  unsigned int total_bins = 0;
357  for(unsigned int channel = 0; channel < ActiveChannels; channel++)
358  {
359  block_histogram[channel] = block_histogram_start + total_bins;
360  total_bins += bins[channel];
361  }
362 
363  // partial histogram to work with
364  const unsigned int thread_shift = (flat_id % shared_histograms) * total_bins;
365 
366  // fill all histograms with 0
367  for(unsigned int i = flat_id; i < total_bins * shared_histograms; i += BlockSize)
368  {
369  block_histogram_start[i] = 0;
370  }
372 
373  const unsigned int start_row = block_id1 * rows_per_block;
374  const unsigned int end_row = ::rocprim::min(rows, start_row + rows_per_block);
375  for(unsigned int row = start_row; row < end_row; row++)
376  {
377  SampleIterator row_samples = samples + row * row_stride;
378 
379  unsigned int block_offset = block_id0 * items_per_block;
380  while(block_offset < columns)
381  {
382  sample_vector_type values[ItemsPerThread];
383 
384  if(block_offset + items_per_block <= columns)
385  {
386  load_samples<BlockSize>(flat_id, row_samples + Channels * block_offset, values);
387 
388  for(unsigned int i = 0; i < ItemsPerThread; i++)
389  {
390  for(unsigned int channel = 0; channel < ActiveChannels; channel++)
391  {
392  unsigned int bin;
393  if(sample_to_bin_op[channel](values[i].values[channel], bin))
394  {
395  ::rocprim::detail::atomic_add(block_histogram[channel] + bin
396  + thread_shift,
397  1);
398  }
399  }
400  }
401  }
402  else
403  {
404  const unsigned int valid_count = columns - block_offset;
405  load_samples<BlockSize>(flat_id,
406  row_samples + Channels * block_offset,
407  values,
408  valid_count);
409 
410  for(unsigned int i = 0; i < ItemsPerThread; i++)
411  {
412  if(flat_id * ItemsPerThread + i < valid_count)
413  {
414  for(unsigned int channel = 0; channel < ActiveChannels; channel++)
415  {
416  unsigned int bin;
417  if(sample_to_bin_op[channel](values[i].values[channel], bin))
418  {
419  ::rocprim::detail::atomic_add(block_histogram[channel] + bin
420  + thread_shift,
421  1);
422  }
423  }
424  }
425  }
426  }
427 
428  block_offset += grid_size0 * items_per_block;
429  }
430  }
432 
433  for(unsigned int channel = 0; channel < ActiveChannels; channel++)
434  {
435  for(unsigned int bin = flat_id; bin < bins[channel]; bin += BlockSize)
436  {
437  unsigned int total = 0;
438  for(unsigned int i = 0; i < shared_histograms; i++)
439  {
440  total += block_histogram[channel][bin + i * total_bins];
441  }
442  if(total > 0)
443  {
444  ::rocprim::detail::atomic_add(&histogram[channel][bin], total);
445  }
446  }
447  }
448 }
449 
450 template<unsigned int BlockSize,
451  unsigned int ItemsPerThread,
452  unsigned int Channels,
453  unsigned int ActiveChannels,
454  class SampleIterator,
455  class Counter,
456  class SampleToBinOp>
457 ROCPRIM_DEVICE ROCPRIM_INLINE void
458  histogram_global(SampleIterator samples,
459  unsigned int columns,
460  unsigned int row_stride,
464 {
465  using sample_type = typename std::iterator_traits<SampleIterator>::value_type;
466  using sample_vector_type = sample_vector<sample_type, Channels>;
467 
468  constexpr unsigned int items_per_block = BlockSize * ItemsPerThread;
469 
470  const unsigned int flat_id = ::rocprim::detail::block_thread_id<0>();
471  const unsigned int block_id0 = ::rocprim::detail::block_id<0>();
472  const unsigned int block_id1 = ::rocprim::detail::block_id<1>();
473  const unsigned int block_offset = block_id0 * items_per_block;
474 
475  samples += block_id1 * row_stride + Channels * block_offset;
476 
477  sample_vector_type values[ItemsPerThread];
478  unsigned int valid_count;
479  if(block_offset + items_per_block <= columns)
480  {
481  valid_count = items_per_block;
482  load_samples<BlockSize>(flat_id, samples, values);
483  }
484  else
485  {
486  valid_count = columns - block_offset;
487  load_samples<BlockSize>(flat_id, samples, values, valid_count);
488  }
489 
490  for(unsigned int i = 0; i < ItemsPerThread; i++)
491  {
492  for(unsigned int channel = 0; channel < ActiveChannels; channel++)
493  {
494  unsigned int bin;
495  if(sample_to_bin_op[channel](values[i].values[channel], bin))
496  {
497  const unsigned int pos = flat_id * ItemsPerThread + i;
498  lane_mask_type same_bin_lanes_mask = ::rocprim::ballot(pos < valid_count);
499  for(unsigned int b = 0; b < bins_bits[channel]; b++)
500  {
501  const unsigned int bit_set = bin & (1u << b);
502  const lane_mask_type bit_set_mask = ::rocprim::ballot(bit_set);
503  same_bin_lanes_mask &= (bit_set ? bit_set_mask : ~bit_set_mask);
504  }
505  const unsigned int same_bin_count = ::rocprim::bit_count(same_bin_lanes_mask);
506  const unsigned int prev_same_bin_count
507  = ::rocprim::masked_bit_count(same_bin_lanes_mask);
508  if(prev_same_bin_count == 0)
509  {
510  // Write the number of lanes having this bin,
511  // if the current lane is the first (and maybe only) lane with this bin.
512  ::rocprim::detail::atomic_add(&histogram[channel][bin], same_bin_count);
513  }
514  }
515  }
516  }
517 }
518 
519 } // namespace detail
520 
521 END_ROCPRIM_NAMESPACE
522 
523 #endif // ROCPRIM_DEVICE_DETAIL_DEVICE_HISTOGRAM_HPP_
ROCPRIM_DEVICE ROCPRIM_INLINE unsigned int block_id()
Returns block identifier in a multidimensional grid by dimension.
Definition: thread.hpp:258
The block_histogram class is a block level parallel primitive which provides methods for constructing...
Definition: block_histogram.hpp:135
ROCPRIM_DEVICE ROCPRIM_INLINE unsigned int masked_bit_count(lane_mask_type x, unsigned int add=0)
Masked bit count.
Definition: warp.hpp:48
ROCPRIM_HOST_DEVICE constexpr T min(const T &a, const T &b)
Returns the minimum of its arguments.
Definition: functional.hpp:63
Definition: device_histogram.hpp:71
Definition: device_histogram.hpp:220
Deprecated: Configuration of device-level scan primitives.
Definition: block_histogram.hpp:62
ROCPRIM_DEVICE ROCPRIM_INLINE lane_mask_type ballot(int predicate)
Evaluate predicate for all active work-items in the warp and return an integer whose i-th bit is set ...
Definition: warp.hpp:38
Definition: uint_fast_div.hpp:31
ROCPRIM_DEVICE ROCPRIM_INLINE void syncthreads()
Synchronize all threads in a block (tile)
Definition: thread.hpp:216
Definition: device_histogram.hpp:45
BEGIN_ROCPRIM_NAMESPACE ROCPRIM_DEVICE ROCPRIM_INLINE void block_load_direct_blocked(unsigned int flat_id, InputIterator block_input, T(&items)[ItemsPerThread])
Loads data from continuous memory into a blocked arrangement of items across the thread block...
Definition: block_load_func.hpp:58
Definition: benchmark_block_histogram.cpp:64
Definition: device_histogram.hpp:192
Definition: device_histogram.hpp:213
unsigned long long int lane_mask_type
The lane_mask_type is an integer that contains one bit per thread.
Definition: types.hpp:164
ROCPRIM_DEVICE ROCPRIM_INLINE unsigned int bit_count(unsigned int x)
Bit count.
Definition: bit.hpp:42