rocPRIM
block_scan_reduce_then_scan.hpp
1 // Copyright (c) 2017-2021 Advanced Micro Devices, Inc. All rights reserved.
2 //
3 // Permission is hereby granted, free of charge, to any person obtaining a copy
4 // of this software and associated documentation files (the "Software"), to deal
5 // in the Software without restriction, including without limitation the rights
6 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 // copies of the Software, and to permit persons to whom the Software is
8 // furnished to do so, subject to the following conditions:
9 //
10 // The above copyright notice and this permission notice shall be included in
11 // all copies or substantial portions of the Software.
12 //
13 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 // THE SOFTWARE.
20 
21 #ifndef ROCPRIM_BLOCK_DETAIL_BLOCK_SCAN_REDUCE_THEN_SCAN_HPP_
22 #define ROCPRIM_BLOCK_DETAIL_BLOCK_SCAN_REDUCE_THEN_SCAN_HPP_
23 
24 #include <type_traits>
25 
26 #include "../../config.hpp"
27 #include "../../detail/various.hpp"
28 
29 #include "../../intrinsics.hpp"
30 #include "../../functional.hpp"
31 
32 #include "../../warp/warp_scan.hpp"
33 
34 BEGIN_ROCPRIM_NAMESPACE
35 
36 namespace detail
37 {
38 
39 template<
40  class T,
41  unsigned int BlockSizeX,
42  unsigned int BlockSizeY,
43  unsigned int BlockSizeZ
44 >
46 {
47  static constexpr unsigned int BlockSize = BlockSizeX * BlockSizeY * BlockSizeZ;
48  // Number of items to reduce per thread
49  static constexpr unsigned int thread_reduction_size_ =
51 
52  // Warp scan, warp_scan_crosslane does not require shared memory (storage), but
53  // logical warp size must be a power of two.
54  static constexpr unsigned int warp_size_ =
55  detail::get_min_warp_size(BlockSize, ::rocprim::device_warp_size());
56  using warp_scan_prefix_type = ::rocprim::detail::warp_scan_crosslane<T, warp_size_>;
57 
58  // Minimize LDS bank conflicts
59  static constexpr unsigned int banks_no_ = ::rocprim::detail::get_lds_banks_no();
60  static constexpr bool has_bank_conflicts_ =
61  ::rocprim::detail::is_power_of_two(thread_reduction_size_) && thread_reduction_size_ > 1;
62  static constexpr unsigned int bank_conflicts_padding =
63  has_bank_conflicts_ ? (warp_size_ * thread_reduction_size_ / banks_no_) : 0;
64 
65  struct storage_type_
66  {
67  T threads[warp_size_ * thread_reduction_size_ + bank_conflicts_padding];
68  };
69 
70 public:
72 
73  template<class BinaryFunction>
74  ROCPRIM_DEVICE ROCPRIM_INLINE
75  void inclusive_scan(T input,
76  T& output,
77  storage_type& storage,
78  BinaryFunction scan_op)
79  {
80  const auto flat_tid = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
81  this->inclusive_scan_impl(flat_tid, input, output, storage, scan_op);
82  }
83 
84  template<class BinaryFunction>
85  ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
86  void inclusive_scan(T input,
87  T& output,
88  BinaryFunction scan_op)
89  {
90  ROCPRIM_SHARED_MEMORY storage_type storage;
91  this->inclusive_scan(input, output, storage, scan_op);
92  }
93 
94  template<class BinaryFunction>
95  ROCPRIM_DEVICE ROCPRIM_INLINE
96  void inclusive_scan(T input,
97  T& output,
98  T& reduction,
99  storage_type& storage,
100  BinaryFunction scan_op)
101  {
102  storage_type_& storage_ = storage.get();
103  this->inclusive_scan(input, output, storage, scan_op);
104  reduction = storage_.threads[index(BlockSize - 1)];
105  }
106 
107  template<class BinaryFunction>
108  ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
109  void inclusive_scan(T input,
110  T& output,
111  T& reduction,
112  BinaryFunction scan_op)
113  {
114  ROCPRIM_SHARED_MEMORY storage_type storage;
115  this->inclusive_scan(input, output, reduction, storage, scan_op);
116  }
117 
118  template<class PrefixCallback, class BinaryFunction>
119  ROCPRIM_DEVICE ROCPRIM_INLINE
120  void inclusive_scan(T input,
121  T& output,
122  storage_type& storage,
123  PrefixCallback& prefix_callback_op,
124  BinaryFunction scan_op)
125  {
126  const auto flat_tid = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
127  const auto warp_id = ::rocprim::warp_id(flat_tid);
128  storage_type_& storage_ = storage.get();
129  this->inclusive_scan_impl(flat_tid, input, output, storage, scan_op);
130  // Include block prefix (this operation overwrites storage_.threads[0])
131  T block_prefix = this->get_block_prefix(
132  flat_tid, warp_id,
133  storage_.threads[index(BlockSize - 1)], // block reduction
134  prefix_callback_op, storage
135  );
136  output = scan_op(block_prefix, output);
137  }
138 
139  template<unsigned int ItemsPerThread, class BinaryFunction>
140  ROCPRIM_DEVICE ROCPRIM_INLINE
141  void inclusive_scan(T (&input)[ItemsPerThread],
142  T (&output)[ItemsPerThread],
143  storage_type& storage,
144  BinaryFunction scan_op)
145  {
146  // Reduce thread items
147  T thread_input = input[0];
148  ROCPRIM_UNROLL
149  for(unsigned int i = 1; i < ItemsPerThread; i++)
150  {
151  thread_input = scan_op(thread_input, input[i]);
152  }
153 
154  // Scan of reduced values to get prefixes
155  const auto flat_tid = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
156  this->exclusive_scan_impl(
157  flat_tid,
158  thread_input, thread_input, // input, output
159  storage,
160  scan_op
161  );
162 
163  // Include prefix (first thread does not have prefix)
164  output[0] = input[0];
165  if(flat_tid != 0) output[0] = scan_op(thread_input, input[0]);
166  // Final thread-local scan
167  ROCPRIM_UNROLL
168  for(unsigned int i = 1; i < ItemsPerThread; i++)
169  {
170  output[i] = scan_op(output[i-1], input[i]);
171  }
172  }
173 
174  template<unsigned int ItemsPerThread, class BinaryFunction>
175  ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
176  void inclusive_scan(T (&input)[ItemsPerThread],
177  T (&output)[ItemsPerThread],
178  BinaryFunction scan_op)
179  {
180  ROCPRIM_SHARED_MEMORY storage_type storage;
181  this->inclusive_scan(input, output, storage, scan_op);
182  }
183 
184  template<unsigned int ItemsPerThread, class BinaryFunction>
185  ROCPRIM_DEVICE ROCPRIM_INLINE
186  void inclusive_scan(T (&input)[ItemsPerThread],
187  T (&output)[ItemsPerThread],
188  T& reduction,
189  storage_type& storage,
190  BinaryFunction scan_op)
191  {
192  storage_type_& storage_ = storage.get();
193  this->inclusive_scan(input, output, storage, scan_op);
194  // Save reduction result
195  reduction = storage_.threads[index(BlockSize - 1)];
196  }
197 
198  template<unsigned int ItemsPerThread, class BinaryFunction>
199  ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
200  void inclusive_scan(T (&input)[ItemsPerThread],
201  T (&output)[ItemsPerThread],
202  T& reduction,
203  BinaryFunction scan_op)
204  {
205  ROCPRIM_SHARED_MEMORY storage_type storage;
206  this->inclusive_scan(input, output, reduction, storage, scan_op);
207  }
208 
209  template<
210  class PrefixCallback,
211  unsigned int ItemsPerThread,
212  class BinaryFunction
213  >
214  ROCPRIM_DEVICE ROCPRIM_INLINE
215  void inclusive_scan(T (&input)[ItemsPerThread],
216  T (&output)[ItemsPerThread],
217  storage_type& storage,
218  PrefixCallback& prefix_callback_op,
219  BinaryFunction scan_op)
220  {
221  storage_type_& storage_ = storage.get();
222  // Reduce thread items
223  T thread_input = input[0];
224  ROCPRIM_UNROLL
225  for(unsigned int i = 1; i < ItemsPerThread; i++)
226  {
227  thread_input = scan_op(thread_input, input[i]);
228  }
229 
230  // Scan of reduced values to get prefixes
231  const auto flat_tid = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
232  this->exclusive_scan_impl(
233  flat_tid,
234  thread_input, thread_input, // input, output
235  storage,
236  scan_op
237  );
238 
239  // this operation overwrites storage_.threads[0]
240  T block_prefix = this->get_block_prefix(
241  flat_tid, ::rocprim::warp_id(flat_tid),
242  storage_.threads[index(BlockSize - 1)], // block reduction
243  prefix_callback_op, storage
244  );
245 
246  // Include prefix (first thread does not have prefix)
247  output[0] = input[0];
248  if(flat_tid != 0) output[0] = scan_op(thread_input, input[0]);
249  // Include block prefix
250  output[0] = scan_op(block_prefix, output[0]);
251  // Final thread-local scan
252  ROCPRIM_UNROLL
253  for(unsigned int i = 1; i < ItemsPerThread; i++)
254  {
255  output[i] = scan_op(output[i-1], input[i]);
256  }
257  }
258 
259  template<class BinaryFunction>
260  ROCPRIM_DEVICE ROCPRIM_INLINE
261  void exclusive_scan(T input,
262  T& output,
263  T init,
264  storage_type& storage,
265  BinaryFunction scan_op)
266  {
267  const auto flat_tid = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
268  this->exclusive_scan_impl(flat_tid, input, output, init, storage, scan_op);
269  }
270 
271  template<class BinaryFunction>
272  ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
273  void exclusive_scan(T input,
274  T& output,
275  T init,
276  BinaryFunction scan_op)
277  {
278  ROCPRIM_SHARED_MEMORY storage_type storage;
279  this->exclusive_scan(input, output, init, storage, scan_op);
280  }
281 
282  template<class BinaryFunction>
283  ROCPRIM_DEVICE ROCPRIM_INLINE
284  void exclusive_scan(T input,
285  T& output,
286  T init,
287  T& reduction,
288  storage_type& storage,
289  BinaryFunction scan_op)
290  {
291  const auto flat_tid = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
292  storage_type_& storage_ = storage.get();
293  this->exclusive_scan_impl(
294  flat_tid, input, output, init, storage, scan_op
295  );
296  // Save reduction result
297  reduction = storage_.threads[index(BlockSize - 1)];
298  }
299 
300  template<class BinaryFunction>
301  ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
302  void exclusive_scan(T input,
303  T& output,
304  T init,
305  T& reduction,
306  BinaryFunction scan_op)
307  {
308  ROCPRIM_SHARED_MEMORY storage_type storage;
309  this->exclusive_scan(input, output, init, reduction, storage, scan_op);
310  }
311 
312  template<class PrefixCallback, class BinaryFunction>
313  ROCPRIM_DEVICE ROCPRIM_INLINE
314  void exclusive_scan(T input,
315  T& output,
316  storage_type& storage,
317  PrefixCallback& prefix_callback_op,
318  BinaryFunction scan_op)
319  {
320  const auto flat_tid = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
321  const auto warp_id = ::rocprim::warp_id(flat_tid);
322  storage_type_& storage_ = storage.get();
323  this->exclusive_scan_impl(
324  flat_tid, input, output, storage, scan_op
325  );
326  // Get reduction result
327  T reduction = storage_.threads[index(BlockSize - 1)];
328  // Include block prefix (this operation overwrites storage_.threads[0])
329  T block_prefix = this->get_block_prefix(
330  flat_tid, warp_id, reduction,
331  prefix_callback_op, storage
332  );
333  output = scan_op(block_prefix, output);
334  if(flat_tid == 0) output = block_prefix;
335  }
336 
337  template<unsigned int ItemsPerThread, class BinaryFunction>
338  ROCPRIM_DEVICE ROCPRIM_INLINE
339  void exclusive_scan(T (&input)[ItemsPerThread],
340  T (&output)[ItemsPerThread],
341  T init,
342  storage_type& storage,
343  BinaryFunction scan_op)
344  {
345  // Reduce thread items
346  T thread_input = input[0];
347  ROCPRIM_UNROLL
348  for(unsigned int i = 1; i < ItemsPerThread; i++)
349  {
350  thread_input = scan_op(thread_input, input[i]);
351  }
352 
353  // Scan of reduced values to get prefixes
354  const auto flat_tid = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
355  this->exclusive_scan_impl(
356  flat_tid,
357  thread_input, thread_input, // input, output
358  init,
359  storage,
360  scan_op
361  );
362 
363  // Include init value
364  T prev = input[0];
365  T exclusive = init;
366  if(flat_tid != 0)
367  {
368  exclusive = thread_input;
369  }
370  output[0] = exclusive;
371  ROCPRIM_UNROLL
372  for(unsigned int i = 1; i < ItemsPerThread; i++)
373  {
374  exclusive = scan_op(exclusive, prev);
375  prev = input[i];
376  output[i] = exclusive;
377  }
378  }
379 
380  template<unsigned int ItemsPerThread, class BinaryFunction>
381  ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
382  void exclusive_scan(T (&input)[ItemsPerThread],
383  T (&output)[ItemsPerThread],
384  T init,
385  BinaryFunction scan_op)
386  {
387  ROCPRIM_SHARED_MEMORY storage_type storage;
388  this->exclusive_scan(input, output, init, storage, scan_op);
389  }
390 
391  template<unsigned int ItemsPerThread, class BinaryFunction>
392  ROCPRIM_DEVICE ROCPRIM_INLINE
393  void exclusive_scan(T (&input)[ItemsPerThread],
394  T (&output)[ItemsPerThread],
395  T init,
396  T& reduction,
397  storage_type& storage,
398  BinaryFunction scan_op)
399  {
400  storage_type_& storage_ = storage.get();
401  this->exclusive_scan(input, output, init, storage, scan_op);
402  // Save reduction result
403  reduction = storage_.threads[index(BlockSize - 1)];
404  }
405 
406  template<unsigned int ItemsPerThread, class BinaryFunction>
407  ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
408  void exclusive_scan(T (&input)[ItemsPerThread],
409  T (&output)[ItemsPerThread],
410  T init,
411  T& reduction,
412  BinaryFunction scan_op)
413  {
414  ROCPRIM_SHARED_MEMORY storage_type storage;
415  this->exclusive_scan(input, output, init, reduction, storage, scan_op);
416  }
417 
418  template<
419  class PrefixCallback,
420  unsigned int ItemsPerThread,
421  class BinaryFunction
422  >
423  ROCPRIM_DEVICE ROCPRIM_INLINE
424  void exclusive_scan(T (&input)[ItemsPerThread],
425  T (&output)[ItemsPerThread],
426  storage_type& storage,
427  PrefixCallback& prefix_callback_op,
428  BinaryFunction scan_op)
429  {
430  storage_type_& storage_ = storage.get();
431  // Reduce thread items
432  T thread_input = input[0];
433  ROCPRIM_UNROLL
434  for(unsigned int i = 1; i < ItemsPerThread; i++)
435  {
436  thread_input = scan_op(thread_input, input[i]);
437  }
438 
439  // Scan of reduced values to get prefixes
440  const auto flat_tid = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
441  this->exclusive_scan_impl(
442  flat_tid,
443  thread_input, thread_input, // input, output
444  storage,
445  scan_op
446  );
447 
448  // this operation overwrites storage_.warp_prefixes[0]
449  T block_prefix = this->get_block_prefix(
450  flat_tid, ::rocprim::warp_id(flat_tid),
451  storage_.threads[index(BlockSize - 1)], // block reduction
452  prefix_callback_op, storage
453  );
454 
455  // Include init value and block prefix
456  T prev = input[0];
457  T exclusive = block_prefix;
458  if(flat_tid != 0)
459  {
460  exclusive = scan_op(block_prefix, thread_input);
461  }
462  output[0] = exclusive;
463  ROCPRIM_UNROLL
464  for(unsigned int i = 1; i < ItemsPerThread; i++)
465  {
466  exclusive = scan_op(exclusive, prev);
467  prev = input[i];
468  output[i] = exclusive;
469  }
470  }
471 
472 private:
473 
474  // Calculates inclusive scan results and stores them in storage_.threads,
475  // result for each thread is stored in storage_.threads[flat_tid], and sets
476  // output to storage_.threads[flat_tid]
477  template<class BinaryFunction>
478  ROCPRIM_DEVICE ROCPRIM_INLINE
479  void inclusive_scan_impl(const unsigned int flat_tid,
480  T input,
481  T& output,
482  storage_type& storage,
483  BinaryFunction scan_op)
484  {
485  storage_type_& storage_ = storage.get();
486  // Calculate inclusive scan,
487  // result for each thread is stored in storage_.threads[flat_tid]
488  this->inclusive_scan_base(flat_tid, input, storage, scan_op);
489  output = storage_.threads[index(flat_tid)];
490  }
491 
492  // Calculates inclusive scan results and stores them in storage_.threads,
493  // result for each thread is stored in storage_.threads[flat_tid]
494  template<class BinaryFunction>
495  ROCPRIM_DEVICE ROCPRIM_INLINE
496  void inclusive_scan_base(const unsigned int flat_tid,
497  T input,
498  storage_type& storage,
499  BinaryFunction scan_op)
500  {
501  storage_type_& storage_ = storage.get();
502  storage_.threads[index(flat_tid)] = input;
504  if(flat_tid < warp_size_)
505  {
506  const unsigned int idx_start = index(flat_tid * thread_reduction_size_);
507  const unsigned int idx_end = idx_start + thread_reduction_size_;
508 
509  T thread_reduction = storage_.threads[idx_start];
510  ROCPRIM_UNROLL
511  for(unsigned int i = idx_start + 1; i < idx_end; i++)
512  {
513  thread_reduction = scan_op(
514  thread_reduction, storage_.threads[i]
515  );
516  }
517 
518  // Calculate warp prefixes
519  warp_scan_prefix_type().inclusive_scan(thread_reduction, thread_reduction, scan_op);
520  thread_reduction = warp_shuffle_up(thread_reduction, 1, warp_size_);
521 
522  // Include warp prefix
523  thread_reduction = scan_op(thread_reduction, storage_.threads[idx_start]);
524  if(flat_tid == 0)
525  {
526  thread_reduction = input;
527  }
528 
529  storage_.threads[idx_start] = thread_reduction;
530  ROCPRIM_UNROLL
531  for(unsigned int i = idx_start + 1; i < idx_end; i++)
532  {
533  thread_reduction = scan_op(
534  thread_reduction, storage_.threads[i]
535  );
536  storage_.threads[i] = thread_reduction;
537  }
538  }
540  }
541 
542  template<class BinaryFunction>
543  ROCPRIM_DEVICE ROCPRIM_INLINE
544  void exclusive_scan_impl(const unsigned int flat_tid,
545  T input,
546  T& output,
547  T init,
548  storage_type& storage,
549  BinaryFunction scan_op)
550  {
551  storage_type_& storage_ = storage.get();
552  // Calculates inclusive scan, result for each thread is stored in storage_.threads[flat_tid]
553  this->inclusive_scan_base(flat_tid, input, storage, scan_op);
554  output = init;
555  if(flat_tid != 0) output = scan_op(init, storage_.threads[index(flat_tid-1)]);
556  }
557 
558  template<class BinaryFunction>
559  ROCPRIM_DEVICE ROCPRIM_INLINE
560  void exclusive_scan_impl(const unsigned int flat_tid,
561  T input,
562  T& output,
563  storage_type& storage,
564  BinaryFunction scan_op)
565  {
566  storage_type_& storage_ = storage.get();
567  // Calculates inclusive scan, result for each thread is stored in storage_.threads[flat_tid]
568  this->inclusive_scan_base(flat_tid, input, storage, scan_op);
569  if(flat_tid > 0)
570  {
571  output = storage_.threads[index(flat_tid-1)];
572  }
573  }
574 
575  // OVERWRITES storage_.threads[0]
576  template<class PrefixCallback, class BinaryFunction>
577  ROCPRIM_DEVICE ROCPRIM_INLINE
578  void include_block_prefix(const unsigned int flat_tid,
579  const unsigned int warp_id,
580  const T input,
581  T& output,
582  const T reduction,
583  PrefixCallback& prefix_callback_op,
584  storage_type& storage,
585  BinaryFunction scan_op)
586  {
587  T block_prefix = this->get_block_prefix(
588  flat_tid, warp_id, reduction,
589  prefix_callback_op, storage
590  );
591  output = scan_op(block_prefix, input);
592  }
593 
594  // OVERWRITES storage_.threads[0]
595  template<class PrefixCallback>
596  ROCPRIM_DEVICE ROCPRIM_INLINE
597  T get_block_prefix(const unsigned int flat_tid,
598  const unsigned int warp_id,
599  const T reduction,
600  PrefixCallback& prefix_callback_op,
601  storage_type& storage)
602  {
603  storage_type_& storage_ = storage.get();
604  if(warp_id == 0)
605  {
606  T block_prefix = prefix_callback_op(reduction);
607  if(flat_tid == 0)
608  {
609  // Reuse storage_.threads[0] which should not be
610  // needed at that point.
611  storage_.threads[0] = block_prefix;
612  }
613  }
615  return storage_.threads[0];
616  }
617 
618  // Change index to minimize LDS bank conflicts if necessary
619  ROCPRIM_DEVICE ROCPRIM_INLINE
620  unsigned int index(unsigned int n) const
621  {
622  // Move every 32-bank wide "row" (32 banks * 4 bytes) by one item
623  return has_bank_conflicts_ ? (n + (n/banks_no_)) : n;
624  }
625 };
626 
627 } // end namespace detail
628 
629 END_ROCPRIM_NAMESPACE
630 
631 #endif // ROCPRIM_BLOCK_DETAIL_BLOCK_SCAN_REDUCE_THEN_SCAN_HPP_
Definition: benchmark_block_scan.cpp:63
ROCPRIM_DEVICE ROCPRIM_INLINE constexpr unsigned int device_warp_size()
Returns a number of threads in a hardware warp for the actual target.
Definition: thread.hpp:70
Definition: block_scan_reduce_then_scan.hpp:45
ROCPRIM_DEVICE ROCPRIM_INLINE T warp_shuffle_up(const T &input, const unsigned int delta, const int width=device_warp_size())
Shuffle up for any data type.
Definition: warp_shuffle.hpp:197
Deprecated: Configuration of device-level scan primitives.
Definition: block_histogram.hpp:62
const unsigned int warp_id
Returns warp id in a block (tile).
Definition: benchmark_warp_exchange.cpp:153
ROCPRIM_DEVICE ROCPRIM_INLINE void syncthreads()
Synchronize all threads in a block (tile)
Definition: thread.hpp:216
Definition: benchmark_block_scan.cpp:100