rocPRIM
block_histogram_atomic.hpp
1 // Copyright (c) 2017-2023 Advanced Micro Devices, Inc. All rights reserved.
2 //
3 // Permission is hereby granted, free of charge, to any person obtaining a copy
4 // of this software and associated documentation files (the "Software"), to deal
5 // in the Software without restriction, including without limitation the rights
6 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 // copies of the Software, and to permit persons to whom the Software is
8 // furnished to do so, subject to the following conditions:
9 //
10 // The above copyright notice and this permission notice shall be included in
11 // all copies or substantial portions of the Software.
12 //
13 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 // THE SOFTWARE.
20 
21 #ifndef ROCPRIM_BLOCK_DETAIL_BLOCK_HISTOGRAM_ATOMIC_HPP_
22 #define ROCPRIM_BLOCK_DETAIL_BLOCK_HISTOGRAM_ATOMIC_HPP_
23 
24 #include <type_traits>
25 
26 #include "../../config.hpp"
27 #include "../../detail/various.hpp"
28 
29 #include "../../intrinsics.hpp"
30 #include "../../functional.hpp"
31 
32 BEGIN_ROCPRIM_NAMESPACE
33 
34 namespace detail
35 {
36 
37 template<
38  class T,
39  unsigned int BlockSizeX,
40  unsigned int BlockSizeY,
41  unsigned int BlockSizeZ,
42  unsigned int ItemsPerThread,
43  unsigned int Bins
44 >
46 {
47  static constexpr unsigned int BlockSize = BlockSizeX * BlockSizeY * BlockSizeZ;
48  static_assert(
49  std::is_convertible<T, unsigned int>::value,
50  "T must be convertible to unsigned int"
51  );
52 
53 public:
54  using storage_type = typename ::rocprim::detail::empty_storage_type;
55 
56  template<class Counter>
57  ROCPRIM_DEVICE ROCPRIM_INLINE
58  void composite(T (&input)[ItemsPerThread],
59  Counter hist[Bins])
60  {
61  static_assert(
62  std::is_same<Counter, unsigned int>::value || std::is_same<Counter, int>::value ||
63  std::is_same<Counter, float>::value || std::is_same<Counter, unsigned long long>::value,
64  "Counter must be type that is supported by atomics (float, int, unsigned int, unsigned long long)"
65  );
66  ROCPRIM_UNROLL
67  for(unsigned int i = 0; i < ItemsPerThread; ++i)
68  {
69  const unsigned int bin = static_cast<unsigned int>(input[i]);
70 
71  // Get a mask with the threads that have the same value for `bin`.
72  ::rocprim::lane_mask_type peer_mask = ballot(1);
73  ROCPRIM_UNROLL
74  for(unsigned int b = 1; b < Bins; b <<= 1)
75  {
76  const unsigned int bit_set = bin & b;
77  const auto bit_set_mask = ballot(bit_set);
78  peer_mask &= (bit_set ? bit_set_mask : ~bit_set_mask);
79  }
80 
81  // The total number of threads in the warp which also have this digit.
82  const unsigned int bin_count = bit_count(peer_mask);
83 
84  // The number of threads in the warp that have the same digit AND whose lane id is lower
85  // than the current thread's.
86  const unsigned int peer_digit_prefix = masked_bit_count(peer_mask);
87 
88  // Set counter value.
89  if(peer_digit_prefix == 0)
90  {
91  detail::atomic_add(&hist[bin], Counter(bin_count));
92  }
93  }
95  }
96 
97  template<class Counter>
98  ROCPRIM_DEVICE ROCPRIM_INLINE
99  void composite(T (&input)[ItemsPerThread],
100  Counter hist[Bins],
101  storage_type& storage)
102  {
103  (void) storage;
104  this->composite(input, hist);
105  }
106 };
107 
108 } // end namespace detail
109 
110 END_ROCPRIM_NAMESPACE
111 
112 #endif // ROCPRIM_BLOCK_DETAIL_BLOCK_HISTOGRAM_ATOMIC_HPP_
ROCPRIM_DEVICE ROCPRIM_INLINE unsigned int masked_bit_count(lane_mask_type x, unsigned int add=0)
Masked bit count.
Definition: warp.hpp:48
Deprecated: Configuration of device-level scan primitives.
Definition: block_histogram.hpp:62
Definition: block_histogram_atomic.hpp:45
ROCPRIM_DEVICE ROCPRIM_INLINE lane_mask_type ballot(int predicate)
Evaluate predicate for all active work-items in the warp and return an integer whose i-th bit is set ...
Definition: warp.hpp:38
ROCPRIM_DEVICE ROCPRIM_INLINE void syncthreads()
Synchronize all threads in a block (tile)
Definition: thread.hpp:216
unsigned long long int lane_mask_type
The lane_mask_type is an integer that contains one bit per thread.
Definition: types.hpp:164
ROCPRIM_DEVICE ROCPRIM_INLINE unsigned int bit_count(unsigned int x)
Bit count.
Definition: bit.hpp:42