cuda-kat
CUDA kernel author's tools
Classes | Macros | Enumerations | Functions
warp.cuh File Reference

CUDA device computation warp-level primitives, i.e. More...

#include "../grid_info.cuh"
#include <kat/on_device/shuffle.cuh>
#include <kat/on_device/builtins.cuh>
#include <kat/on_device/atomics.cuh>
#include <kat/on_device/non-builtins.cuh>
#include <kat/on_device/ptx.cuh>
#include <kat/on_device/math.cuh>
#include <kat/on_device/common.cuh>
#include <type_traits>

Classes

struct  kat::linear_grid::collaborative::warp::search_result_t< T >
 A structure for warp-level search results. More...
 

Enumerations

enum  kat::linear_grid::collaborative::warp::detail::predicate_computation_length_slack_t {
  kat::linear_grid::collaborative::warp::detail::has_no_slack,
  kat::linear_grid::collaborative::warp::detail::may_have_full_warps_of_slack,
  kat::linear_grid::collaborative::warp::detail::may_have_arbitrary_slack
}
 Sometimes you have a run-time-determined range of indices for which you need to compute some predicate, but you have it constrained to be a nice round value, say a multiple of warp_size * warp_size - or even a constant multiple, but by means of, say, a configuration file. More...
 

Functions

KAT_FD unsigned kat::num_lanes_in (lane_mask_t mask)
 
template<bool ReturnWarpSizeForEmptyMask = true>
KAT_FD int kat::first_lane_in (lane_mask_t mask)
 Determines which lane is the first within a lane mask (considered in LSB-to-MSB order) More...
 
KAT_FD int kat::last_lane_in (lane_mask_t mask)
 Determines which lane is the first within a lane mask (considered in LSB-to-MSB order) More...
 
KAT_FD bool kat::collaborative::warp::all_lanes_satisfy (int condition)
 Checks whether a condition holds for an entire warp of threads. More...
 
KAT_FD bool kat::collaborative::warp::no_lanes_satisfy (int condition)
 Checks whether a condition holds for none of the threads in a warp. More...
 
KAT_FD bool kat::collaborative::warp::all_lanes_agree_on (int condition)
 Checks whether a condition holds for an entire warp of threads. More...
 
KAT_FD bool kat::collaborative::warp::some_lanes_satisfy (int condition)
 Checks whether a condition holds for at least one of the threads in a warp. More...
 
KAT_FD native_word_t kat::collaborative::warp::num_lanes_satisfying (int condition)
 Count the lanes in a warp for which some condition holds. More...
 
KAT_FD native_word_t kat::collaborative::warp::num_lanes_agreeing_on (int condition)
 Count the lanes in a warp which have the same condition value as the calling lane. More...
 
KAT_FD bool kat::collaborative::warp::majority_vote (int condition)
 Check whether a condition holds for most lanes in a warp. More...
 
template<typename T >
KAT_FD bool kat::collaborative::warp::in_unique_lane_with (T value)
 Compares values provided by each lane in a warp, checking for uniqueness. More...
 
template<typename T >
KAT_FD T kat::collaborative::warp::get_from_lane (T value, int source_lane)
 
template<typename T >
KAT_FD T kat::collaborative::warp::get_from_first_lane (T value)
 
template<typename T >
KAT_FD T kat::collaborative::warp::get_from_last_lane (T value)
 
template<bool DefinedResultOnNoSatisfaction = true>
KAT_FD native_word_t kat::collaborative::warp::first_lane_satisfying (int condition)
 Determines which is the first lane within a warp which satisfies some boolean condition. More...
 
KAT_FD lane_mask_t kat::collaborative::warp::get_active_lanes ()
 
KAT_FD unsigned kat::collaborative::warp::num_active_lanes ()
 
template<bool PreferFirstLane = true>
KAT_FD unsigned kat::collaborative::warp::detail::select_leader_lane_among (unsigned lanes_mask)
 
template<bool PreferFirstLane = true>
KAT_FD bool kat::collaborative::warp::detail::am_leader_lane (unsigned active_lanes_mask)
 
KAT_FD unsigned kat::collaborative::warp::detail::lane_index_among (lane_mask_t mask)
 
template<bool PreferFirstLane = true>
KAT_FD unsigned kat::collaborative::warp::select_leader_lane ()
 This is a mechanism for making exactly one lane act instead of the whole warp, which supports the case of some threads being inactive (e.g. More...
 
template<bool PreferFirstLane = true>
KAT_FD bool kat::collaborative::warp::am_leader_lane ()
 This applies the leader lane selection mechanism to obtain a condition for being the leader lane. More...
 
template<typename Function >
KAT_FD std::result_of< Function()>::type kat::collaborative::warp::have_a_single_lane_compute (Function f, unsigned designated_computing_lane)
 
template<typename Function , bool PreferFirstLane = true>
KAT_FD std::result_of< Function()>::type kat::collaborative::warp::have_a_single_lane_compute (Function f)
 
template<typename Function >
KAT_FD std::result_of< Function()>::type kat::collaborative::warp::have_first_lane_compute (Function f)
 
template<typename Function >
KAT_FD std::result_of< Function()>::type kat::collaborative::warp::have_last_lane_compute (Function f)
 
KAT_FD unsigned kat::collaborative::warp::index_among_active_lanes ()
 
KAT_FD unsigned kat::collaborative::warp::last_active_lane_index ()
 
template<typename T >
KAT_FD T kat::collaborative::warp::active_lanes_atomically_increment (T *counter)
 When every (active) lane in a warp needs to increment a counter, use this function to avoid all of them having to execute atomics; each active lane gets the "previous value" as though they had all incremented in order. More...
 
template<typename Function , typename Size = unsigned>
KAT_FD void kat::collaborative::warp::at_warp_stride (Size length, Function f)
 
template<typename T , bool AssumeNeedlesAreSorted = false>
KAT_FD search_result_t< T > kat::linear_grid::collaborative::warp::multisearch (const T &lane_needle, const T &lane_hay_straw)
 Have each lane search for its own value of interest within the sorted sequence of single values provided by all the warp lanes. More...
 
template<typename Function , typename Size = unsigned>
KAT_FD void kat::linear_grid::collaborative::warp::at_warp_stride (Size length, Function f)
 
template<typename Predicate , typename Size = native_word_t, detail::predicate_computation_length_slack_t PossibilityOfSlack = detail::may_have_arbitrary_slack>
KAT_FD void kat::linear_grid::collaborative::warp::compute_predicate_at_warp_stride (unsigned *computed_predicate, Predicate &predicate, Size length)
 Arrange the computation of a predicate by a warp so that both reads and writes are coalesced and divergence is minimized. More...
 

Detailed Description

CUDA device computation warp-level primitives, i.e.

those involving interaction of most/all of each warp's lanes, but no inter-warp interaction.

Todo:
  1. Some of these assume linear grids, others do not - sort them out.
  2. Use a lane index type

Enumeration Type Documentation

§ predicate_computation_length_slack_t

Sometimes you have a run-time-determined range of indices for which you need to compute some predicate, but you have it constrained to be a nice round value, say a multiple of warp_size * warp_size - or even a constant multiple, but by means of, say, a configuration file.

Now, the compiler will not be able to figure out this is the case, and will compile in the condition checks and the code for handling slack - which you don't actually need. To prevent that and cater to my OCD, you can use a template parameter to indicate how nicely-behaving your input length is.

Enumerator
has_no_slack 

Length known to be multiple of warp_size * warp_size.

may_have_full_warps_of_slack 

Length only known to be multiple of warp_size.

may_have_arbitrary_slack 

Length may have any value, make no assumptions about it.

Function Documentation

§ active_lanes_atomically_increment()

template<typename T >
KAT_FD T kat::collaborative::warp::active_lanes_atomically_increment ( T *  counter)

When every (active) lane in a warp needs to increment a counter, use this function to avoid all of them having to execute atomics; each active lane gets the "previous value" as though they had all incremented in order.

Note
It's not clear to me how better this is from just using atomics and being done with it
Todo:
extend this to other atomic operations

§ all_lanes_agree_on()

KAT_FD bool kat::collaborative::warp::all_lanes_agree_on ( int  condition)

Checks whether a condition holds for an entire warp of threads.

Parameters
conditionA boolean value (passed as an integer since that's what nVIDIA GPUs actually check with the HW instruction
Returns
true if condition is non-zero for all threads

§ all_lanes_satisfy()

KAT_FD bool kat::collaborative::warp::all_lanes_satisfy ( int  condition)

Checks whether a condition holds for an entire warp of threads.

Parameters
conditionA boolean value (passed as an integer since that's what nVIDIA GPUs actually check with the HW instruction
Returns
true if condition is non-zero for all threads

§ am_leader_lane()

template<bool PreferFirstLane = true>
KAT_FD bool kat::collaborative::warp::am_leader_lane ( )

This applies the leader lane selection mechanism to obtain a condition for being the leader lane.

Template Parameters
PreferFirstLaneif true, the first lane will be the acting leader whenever it is active - at the cost of making this functions slightly more expensive (an extra subtraction instruction)
Returns
1 for exactly one of the active lanes in each warp, 0 for all others

§ compute_predicate_at_warp_stride()

template<typename Predicate , typename Size = native_word_t, detail::predicate_computation_length_slack_t PossibilityOfSlack = detail::may_have_arbitrary_slack>
KAT_FD void kat::linear_grid::collaborative::warp::compute_predicate_at_warp_stride ( unsigned *  computed_predicate,
Predicate &  predicate,
Size  length 
)

Arrange the computation of a predicate by a warp so that both reads and writes are coalesced and divergence is minimized.

This relies on the fact that there are as many threads in a warp as there are bits in the native register size (a 4-byte unsigned int has 32 bits, and warp size is 32). This means that the whole warp computing the predicate once (i.e. for 32 elements), the results in bits exactly suffice for a single write of a 4-byte value by a single thread. Since we want coalesced writes, of 128 bytes at a time, we need that many results for each of the lanes in a warp, i.e. we need to have 32 times all-warp computations of the predicate, with every consecutive 32 providing the write value to one of the lanes. That's how this function arranges the computation.

Note
The predicate is passed no input data other than the index within the range 0..length - 1; if you need such input, pass a closure (e.g. a lambda which captures the relevant data).
the last bits of the last bit container - those which are beyond
Parameters
lengthbits overall - are slack bits; it is assumed we're allowed to write anything to them.
Note
There is no inter-warp collaboration here; the outputs may be completely disjoint; and any warp's range's slack is separate. See kat::linear_grid::collabopration::grid::compute_predicate_at_warp_stride for the case of a single joint range to cover.
Parameters
computed_predicatethe result of the computation of the predicate for each of the indices in the range
lengthThe number of elements for which to compute the predicate, which is also the number of bits (not the size in bytes) of computed_predicate
predicate

§ first_lane_in()

template<bool ReturnWarpSizeForEmptyMask = true>
KAT_FD int kat::first_lane_in ( lane_mask_t  mask)

Determines which lane is the first within a lane mask (considered in LSB-to-MSB order)

Template Parameters
ReturnWarpSizeForEmptyMaskwhen set to true, the semantics of this function will be consistent for empty (all-zero) lane masks, in that the "first lane" inside the mask will be one past the last lane - like in the last_lane_in() function
Returns
index of the first 1-bit in the warp-size-bit mask; if no lanes have a corresponding 1-bit, -1 or 32 (warp_size) is returned, depending on
Template Parameters
ReturnWarpSizeForEmptyMask

§ first_lane_satisfying()

template<bool DefinedResultOnNoSatisfaction = true>
KAT_FD native_word_t kat::collaborative::warp::first_lane_satisfying ( int  condition)

Determines which is the first lane within a warp which satisfies some boolean condition.

Parameters
[in]conditionan essentially-boolena value representing whether or not the calling thread (lane) satisfies the condition; typed as an int since that's what CUDA primitives take mostly.
Returns
index of the first lane in the warp for which condition is non-zero. If no lane has non-zero condition, then the result is either undefined, or if
Template Parameters
DefinedResultOnNoSatisfactionwas set - 32 (warp_size).

§ in_unique_lane_with()

template<typename T >
KAT_FD bool kat::collaborative::warp::in_unique_lane_with ( value)

Compares values provided by each lane in a warp, checking for uniqueness.

Parameters
valueto check for matches; each lane brings its own
lane_maskthe lanes participating in the uniqueness check (other lanes' values are not considered); should be uniform among participating lanes
Returns
true if the lane this value provided had no matches among the values provided by other lanes

§ last_lane_in()

KAT_FD int kat::last_lane_in ( lane_mask_t  mask)

Determines which lane is the first within a lane mask (considered in LSB-to-MSB order)

Returns
index of the first 1-bit in the warp-size-bit mask; if no lanes have a corresponding 1-bit, 32 is returned;

§ majority_vote()

KAT_FD bool kat::collaborative::warp::majority_vote ( int  condition)

Check whether a condition holds for most lanes in a warp.

Parameters
conditionA boolean value (passed as an integer since that's what nVIDIA GPUs actually check with the HW instruction

§ multisearch()

template<typename T , bool AssumeNeedlesAreSorted = false>
KAT_FD search_result_t<T> kat::linear_grid::collaborative::warp::multisearch ( const T &  lane_needle,
const T &  lane_hay_straw 
)

Have each lane search for its own value of interest within the sorted sequence of single values provided by all the warp lanes.

Note
The amount of time this function takes is very much data-dependent!
this function assumes all warp lanes are active.
Todo:
Does it matter if the needles, as opposed to the hay straws, are sorted? I wonder.
Todo:
consider specializing for non-full warps
Todo:
Specialize for smaller and larger data types: For larger ones, compare 4-byte parts of the datum separately (assuming
Template Parameters
Tis bitwise-comparable); for smaller ones, consider having lanes collect multiple data and pass it on to other lanes, which can then spare some shuffling.
Parameters
lane_needlethe value the current lane wants to search for
lane_hay_strawthe warp_size hay "straws" passed by the lanes make up the entire "haystack" we search in. They must be known to be in order, i < j => straw of lane i < straw of lane j. They are accessed using intra-warp shuffling, solely.
Returns
For lane i, the index of the first lane j with straw-of-j > needle-of-i, along with straw-of-j; if there is no such lane j, warp_size is returned as the lane index and an arbitrary value is returned as the result.

§ no_lanes_satisfy()

KAT_FD bool kat::collaborative::warp::no_lanes_satisfy ( int  condition)

Checks whether a condition holds for none of the threads in a warp.

Parameters
conditionA boolean value (passed as an integer since that's what nVIDIA GPUs actually check with the HW instruction
Returns
true if condition is zero for all threads

§ num_lanes_agreeing_on()

KAT_FD native_word_t kat::collaborative::warp::num_lanes_agreeing_on ( int  condition)

Count the lanes in a warp which have the same condition value as the calling lane.

Parameters
conditionthe condition value for each lane (true if non-zero)
Returns
the number of threads in the warp whose condition is the same value as the calling lane (including the calling lane itself)

§ num_lanes_satisfying()

KAT_FD native_word_t kat::collaborative::warp::num_lanes_satisfying ( int  condition)

Count the lanes in a warp for which some condition holds.

Parameters
conditionthe condition value for each lane (true if non-zero)
Returns
the number of threads in the warp whose condition is true (non-zero)

§ select_leader_lane()

template<bool PreferFirstLane = true>
KAT_FD unsigned kat::collaborative::warp::select_leader_lane ( )

This is a mechanism for making exactly one lane act instead of the whole warp, which supports the case of some threads being inactive (e.g.

having exited).

Template Parameters
PreferFirstLaneif true, the first lane will be the acting leader whenever it is active - at the cost of making this functions slightly more expensive (an extra subtraction instruction)
Returns
Iif PreferFirstLane is true, and the first lane is active, then 0; the index of some active lane otherwise.

§ some_lanes_satisfy()

KAT_FD bool kat::collaborative::warp::some_lanes_satisfy ( int  condition)

Checks whether a condition holds for at least one of the threads in a warp.

Parameters
conditionA boolean value (passed as an integer since that's what nVIDIA GPUs actually check with the HW instruction
Returns
true if condition is non-zero for at least one thread