CUDA device computation warp-level primitives, i.e. More...

#include "../grid_info.cuh"
#include <kat/on_device/shuffle.cuh>
#include <kat/on_device/builtins.cuh>
#include <kat/on_device/atomics.cuh>
#include <kat/on_device/non-builtins.cuh>
#include <kat/on_device/ptx.cuh>
#include <kat/on_device/math.cuh>
#include <kat/on_device/common.cuh>
#include <type_traits>

Classes
struct	kat::linear_grid::collaborative::warp::search_result_t< T >
	A structure for warp-level search results. More...

Enumerations
enum	kat::linear_grid::collaborative::warp::detail::predicate_computation_length_slack_t { kat::linear_grid::collaborative::warp::detail::has_no_slack, kat::linear_grid::collaborative::warp::detail::may_have_full_warps_of_slack, kat::linear_grid::collaborative::warp::detail::may_have_arbitrary_slack }
	Sometimes you have a run-time-determined range of indices for which you need to compute some predicate, but you have it constrained to be a nice round value, say a multiple of warp_size * warp_size - or even a constant multiple, but by means of, say, a configuration file. More...

Functions
KAT_FD unsigned	kat::num_lanes_in (lane_mask_t mask)

template<bool ReturnWarpSizeForEmptyMask = true>
KAT_FD int	kat::first_lane_in (lane_mask_t mask)
	Determines which lane is the first within a lane mask (considered in LSB-to-MSB order) More...

KAT_FD int	kat::last_lane_in (lane_mask_t mask)
	Determines which lane is the first within a lane mask (considered in LSB-to-MSB order) More...

KAT_FD bool	kat::collaborative::warp::all_lanes_satisfy (int condition)
	Checks whether a condition holds for an entire warp of threads. More...

KAT_FD bool	kat::collaborative::warp::no_lanes_satisfy (int condition)
	Checks whether a condition holds for none of the threads in a warp. More...

KAT_FD bool	kat::collaborative::warp::all_lanes_agree_on (int condition)
	Checks whether a condition holds for an entire warp of threads. More...

KAT_FD bool	kat::collaborative::warp::some_lanes_satisfy (int condition)
	Checks whether a condition holds for at least one of the threads in a warp. More...

KAT_FD native_word_t	kat::collaborative::warp::num_lanes_satisfying (int condition)
	Count the lanes in a warp for which some condition holds. More...

KAT_FD native_word_t	kat::collaborative::warp::num_lanes_agreeing_on (int condition)
	Count the lanes in a warp which have the same condition value as the calling lane. More...

KAT_FD bool	kat::collaborative::warp::majority_vote (int condition)
	Check whether a condition holds for most lanes in a warp. More...

template<typename T >
KAT_FD bool	kat::collaborative::warp::in_unique_lane_with (T value)
	Compares values provided by each lane in a warp, checking for uniqueness. More...

template<typename T >
KAT_FD T	kat::collaborative::warp::get_from_lane (T value, int source_lane)

template<typename T >
KAT_FD T	kat::collaborative::warp::get_from_first_lane (T value)

template<typename T >
KAT_FD T	kat::collaborative::warp::get_from_last_lane (T value)

template<bool DefinedResultOnNoSatisfaction = true>
KAT_FD native_word_t	kat::collaborative::warp::first_lane_satisfying (int condition)
	Determines which is the first lane within a warp which satisfies some boolean condition. More...

KAT_FD lane_mask_t	kat::collaborative::warp::get_active_lanes ()

KAT_FD unsigned	kat::collaborative::warp::num_active_lanes ()

template<bool PreferFirstLane = true>
KAT_FD unsigned	kat::collaborative::warp::detail::select_leader_lane_among (unsigned lanes_mask)

template<bool PreferFirstLane = true>
KAT_FD bool	kat::collaborative::warp::detail::am_leader_lane (unsigned active_lanes_mask)

KAT_FD unsigned	kat::collaborative::warp::detail::lane_index_among (lane_mask_t mask)

template<bool PreferFirstLane = true>
KAT_FD unsigned	kat::collaborative::warp::select_leader_lane ()
	This is a mechanism for making exactly one lane act instead of the whole warp, which supports the case of some threads being inactive (e.g. More...

template<bool PreferFirstLane = true>
KAT_FD bool	kat::collaborative::warp::am_leader_lane ()
	This applies the leader lane selection mechanism to obtain a condition for being the leader lane. More...

template<typename Function >
KAT_FD std::result_of< Function()>::type	kat::collaborative::warp::have_a_single_lane_compute (Function f, unsigned designated_computing_lane)

template<typename Function , bool PreferFirstLane = true>
KAT_FD std::result_of< Function()>::type	kat::collaborative::warp::have_a_single_lane_compute (Function f)

template<typename Function >
KAT_FD std::result_of< Function()>::type	kat::collaborative::warp::have_first_lane_compute (Function f)

template<typename Function >
KAT_FD std::result_of< Function()>::type	kat::collaborative::warp::have_last_lane_compute (Function f)

KAT_FD unsigned	kat::collaborative::warp::index_among_active_lanes ()

KAT_FD unsigned	kat::collaborative::warp::last_active_lane_index ()

template<typename T >
KAT_FD T	kat::collaborative::warp::active_lanes_atomically_increment (T *counter)
	When every (active) lane in a warp needs to increment a counter, use this function to avoid all of them having to execute atomics; each active lane gets the "previous value" as though they had all incremented in order. More...

template<typename Function , typename Size = unsigned>
KAT_FD void	kat::collaborative::warp::at_warp_stride (Size length, Function f)

template<typename T , bool AssumeNeedlesAreSorted = false>
KAT_FD search_result_t< T >	kat::linear_grid::collaborative::warp::multisearch (const T &lane_needle, const T &lane_hay_straw)
	Have each lane search for its own value of interest within the sorted sequence of single values provided by all the warp lanes. More...

template<typename Function , typename Size = unsigned>
KAT_FD void	kat::linear_grid::collaborative::warp::at_warp_stride (Size length, Function f)

template<typename Predicate , typename Size = native_word_t, detail::predicate_computation_length_slack_t PossibilityOfSlack = detail::may_have_arbitrary_slack>
KAT_FD void	kat::linear_grid::collaborative::warp::compute_predicate_at_warp_stride (unsigned *computed_predicate, Predicate &predicate, Size length)
	Arrange the computation of a predicate by a warp so that both reads and writes are coalesced and divergence is minimized. More...

Detailed Description

CUDA device computation warp-level primitives, i.e.

those involving interaction of most/all of each warp's lanes, but no inter-warp interaction.

Todo:

Some of these assume linear grids, others do not - sort them out.
Use a lane index type

Enumeration Type Documentation

§ predicate_computation_length_slack_t

enum kat::linear_grid::collaborative::warp::detail::predicate_computation_length_slack_t

Sometimes you have a run-time-determined range of indices for which you need to compute some predicate, but you have it constrained to be a nice round value, say a multiple of warp_size * warp_size - or even a constant multiple, but by means of, say, a configuration file.

Now, the compiler will not be able to figure out this is the case, and will compile in the condition checks and the code for handling slack - which you don't actually need. To prevent that and cater to my OCD, you can use a template parameter to indicate how nicely-behaving your input length is.

Enumerator
has_no_slack	Length known to be multiple of warp_size * warp_size.
may_have_full_warps_of_slack	Length only known to be multiple of warp_size.
may_have_arbitrary_slack	Length may have any value, make no assumptions about it.

Function Documentation

§ active_lanes_atomically_increment()

template<typename T >

KAT_FD T kat::collaborative::warp::active_lanes_atomically_increment ( T * counter )

When every (active) lane in a warp needs to increment a counter, use this function to avoid all of them having to execute atomics; each active lane gets the "previous value" as though they had all incremented in order.

Note: It's not clear to me how better this is from just using atomics and being done with it

Todo:: extend this to other atomic operations

§ all_lanes_agree_on()

KAT_FD bool kat::collaborative::warp::all_lanes_agree_on ( int condition )

Checks whether a condition holds for an entire warp of threads.

Parameters

condition A boolean value (passed as an integer since that's what nVIDIA GPUs actually check with the HW instruction

Returns: true if condition is non-zero for all threads

§ all_lanes_satisfy()

KAT_FD bool kat::collaborative::warp::all_lanes_satisfy ( int condition )

Checks whether a condition holds for an entire warp of threads.

Parameters

condition A boolean value (passed as an integer since that's what nVIDIA GPUs actually check with the HW instruction

Returns: true if condition is non-zero for all threads

§ am_leader_lane()

template<bool PreferFirstLane = true>

KAT_FD bool kat::collaborative::warp::am_leader_lane ( )

This applies the leader lane selection mechanism to obtain a condition for being the leader lane.

Template Parameters

PreferFirstLane if true, the first lane will be the acting leader whenever it is active - at the cost of making this functions slightly more expensive (an extra subtraction instruction)

Returns: 1 for exactly one of the active lanes in each warp, 0 for all others

§ compute_predicate_at_warp_stride()

template<typename Predicate , typename Size = native_word_t, detail::predicate_computation_length_slack_t PossibilityOfSlack = detail::may_have_arbitrary_slack>

KAT_FD void kat::linear_grid::collaborative::warp::compute_predicate_at_warp_stride	(	unsigned *	computed_predicate,
		Predicate &	predicate,
		Size	length
	)

Arrange the computation of a predicate by a warp so that both reads and writes are coalesced and divergence is minimized.

This relies on the fact that there are as many threads in a warp as there are bits in the native register size (a 4-byte unsigned int has 32 bits, and warp size is 32). This means that the whole warp computing the predicate once (i.e. for 32 elements), the results in bits exactly suffice for a single write of a 4-byte value by a single thread. Since we want coalesced writes, of 128 bytes at a time, we need that many results for each of the lanes in a warp, i.e. we need to have 32 times all-warp computations of the predicate, with every consecutive 32 providing the write value to one of the lanes. That's how this function arranges the computation.

Note: The predicate is passed no input data other than the index within the range 0..length - 1; if you need such input, pass a closure (e.g. a lambda which captures the relevant data).; the last bits of the last bit container - those which are beyond

Parameters

length bits overall - are slack bits; it is assumed we're allowed to write anything to them.

Note: There is no inter-warp collaboration here; the outputs may be completely disjoint; and any warp's range's slack is separate. See kat::linear_grid::collabopration::grid::compute_predicate_at_warp_stride for the case of a single joint range to cover.

Parameters

computed_predicate	the result of the computation of the predicate for each of the indices in the range
length	The number of elements for which to compute the predicate, which is also the number of bits (not the size in bytes) of `computed_predicate`
predicate

§ first_lane_in()

template<bool ReturnWarpSizeForEmptyMask = true>

KAT_FD int kat::first_lane_in ( lane_mask_t mask )

Determines which lane is the first within a lane mask (considered in LSB-to-MSB order)

Template Parameters

ReturnWarpSizeForEmptyMask when set to true, the semantics of this function will be consistent for empty (all-zero) lane masks, in that the "first lane" inside the mask will be one past the last lane - like in the last_lane_in() function

Returns: index of the first 1-bit in the warp-size-bit mask; if no lanes have a corresponding 1-bit, -1 or 32 (warp_size) is returned, depending on

Template Parameters

ReturnWarpSizeForEmptyMask

§ first_lane_satisfying()

template<bool DefinedResultOnNoSatisfaction = true>

KAT_FD native_word_t kat::collaborative::warp::first_lane_satisfying ( int condition )

Determines which is the first lane within a warp which satisfies some boolean condition.

Parameters

[in] condition an essentially-boolena value representing whether or not the calling thread (lane) satisfies the condition; typed as an int since that's what CUDA primitives take mostly.

Returns: index of the first lane in the warp for which condition is non-zero. If no lane has non-zero condition, then the result is either undefined, or if

Template Parameters

DefinedResultOnNoSatisfaction was set - 32 (warp_size).

§ in_unique_lane_with()

template<typename T >

KAT_FD bool kat::collaborative::warp::in_unique_lane_with ( T value )

Compares values provided by each lane in a warp, checking for uniqueness.

Parameters

value	to check for matches; each lane brings its own
lane_mask	the lanes participating in the uniqueness check (other lanes' values are not considered); should be uniform among participating lanes

Returns: true if the lane this value provided had no matches among the values provided by other lanes

§ last_lane_in()

KAT_FD int kat::last_lane_in ( lane_mask_t mask )

Determines which lane is the first within a lane mask (considered in LSB-to-MSB order)

Returns: index of the first 1-bit in the warp-size-bit mask; if no lanes have a corresponding 1-bit, 32 is returned;

§ majority_vote()

KAT_FD bool kat::collaborative::warp::majority_vote ( int condition )

Check whether a condition holds for most lanes in a warp.

Parameters

condition A boolean value (passed as an integer since that's what nVIDIA GPUs actually check with the HW instruction

§ multisearch()

template<typename T , bool AssumeNeedlesAreSorted = false>

KAT_FD search_result_t<T> kat::linear_grid::collaborative::warp::multisearch	(	const T &	lane_needle,
		const T &	lane_hay_straw
	)

Have each lane search for its own value of interest within the sorted sequence of single values provided by all the warp lanes.

Note: The amount of time this function takes is very much data-dependent!; this function assumes all warp lanes are active.

Todo:: Does it matter if the needles, as opposed to the hay straws, are sorted? I wonder.

Todo:: consider specializing for non-full warps

Todo:: Specialize for smaller and larger data types: For larger ones, compare 4-byte parts of the datum separately (assuming

Template Parameters

T	is bitwise-comparable); for smaller ones, consider having lanes collect multiple data and pass it on to other lanes, which can then spare some shuffling.

Parameters

lane_needle	the value the current lane wants to search for
lane_hay_straw	the warp_size hay "straws" passed by the lanes make up the entire "haystack" we search in. They must be known to be in order, i < j => straw of lane i < straw of lane j. They are accessed using intra-warp shuffling, solely.

Returns: For lane i, the index of the first lane j with straw-of-j > needle-of-i, along with straw-of-j; if there is no such lane j, warp_size is returned as the lane index and an arbitrary value is returned as the result.

§ no_lanes_satisfy()

KAT_FD bool kat::collaborative::warp::no_lanes_satisfy ( int condition )

Checks whether a condition holds for none of the threads in a warp.

Parameters

condition A boolean value (passed as an integer since that's what nVIDIA GPUs actually check with the HW instruction

Returns: true if condition is zero for all threads

§ num_lanes_agreeing_on()

KAT_FD native_word_t kat::collaborative::warp::num_lanes_agreeing_on ( int condition )

Count the lanes in a warp which have the same condition value as the calling lane.

Parameters

condition the condition value for each lane (true if non-zero)

Returns: the number of threads in the warp whose condition is the same value as the calling lane (including the calling lane itself)

§ num_lanes_satisfying()

KAT_FD native_word_t kat::collaborative::warp::num_lanes_satisfying ( int condition )

Count the lanes in a warp for which some condition holds.

Parameters

condition the condition value for each lane (true if non-zero)

Returns: the number of threads in the warp whose condition is true (non-zero)

§ select_leader_lane()

template<bool PreferFirstLane = true>

KAT_FD unsigned kat::collaborative::warp::select_leader_lane ( )

This is a mechanism for making exactly one lane act instead of the whole warp, which supports the case of some threads being inactive (e.g.

having exited).

Template Parameters

PreferFirstLane if true, the first lane will be the acting leader whenever it is active - at the cost of making this functions slightly more expensive (an extra subtraction instruction)

Returns: Iif PreferFirstLane is true, and the first lane is active, then 0; the index of some active lane otherwise.

§ some_lanes_satisfy()

KAT_FD bool kat::collaborative::warp::some_lanes_satisfy ( int condition )

Checks whether a condition holds for at least one of the threads in a warp.

Parameters

condition A boolean value (passed as an integer since that's what nVIDIA GPUs actually check with the HW instruction

Returns: true if condition is non-zero for at least one thread

Classes

Enumerations

Functions

Detailed Description

Enumeration Type Documentation

§ predicate_computation_length_slack_t

Function Documentation

§ active_lanes_atomically_increment()

§ all_lanes_agree_on()

§ all_lanes_satisfy()

§ am_leader_lane()

§ compute_predicate_at_warp_stride()

§ first_lane_in()

§ first_lane_satisfying()

§ in_unique_lane_with()

§ last_lane_in()

§ majority_vote()

§ multisearch()

§ no_lanes_satisfy()

§ num_lanes_agreeing_on()

§ num_lanes_satisfying()

§ select_leader_lane()

§ some_lanes_satisfy()