cuda-kat
CUDA kernel author's tools
block.cuh File Reference

CUDA device computation block-level primitives. More...

#include <kat/on_device/collaboration/warp.cuh>
#include <kat/on_device/shared_memory/basic.cuh>
#include <kat/on_device/common.cuh>
#include <kat/on_device/math.cuh>
#include <kat/on_device/grid_info.cuh>
#include <type_traits>

Functions

template<typename T , bool Synchronize = true>
KAT_FD void kat::collaborative::block::share_per_warp_data (T datum, T *__restrict__ where_to_make_available, unsigned writing_lane_id)
 Share one element of type T for each warp with the entire block - using a single array in shared memory for all shared values. More...
 
template<typename T , bool Synchronize = true>
KAT_FD void kat::collaborative::block::share_per_warp_data (T datum, T *__restrict__ where_to_make_available)
 A variant of share_per_warp_data, with the writing lane chosen dynamically in each warp based on which lanes are actually active.
 
KAT_FD void kat::collaborative::block::barrier ()
 
template<typename T , bool Synchronize = true, unsigned Dimensionality = 3>
KAT_FD T kat::collaborative::block::get_from_thread (const T &value, kat::position_t source_thread_position)
 Have all block threads obtain a value held by just one of the threads (and likely not otherwise easily accessible to the rest of the block's threads). More...
 
template<typename T , bool Synchronize = true>
KAT_FD T kat::collaborative::block::get_from_first_thread (T &&value)
 Have all block threads obtain a value held by the first thread in the block (and likely not otherwise easily accessible to the rest of the block's threads). More...
 
template<typename Function , typename Size = size_t>
KAT_FD void kat::linear_grid::collaborative::block::at_block_stride (Size length, const Function &f)
 Have all threads in (one/some/all) blocks perform some action over the linear range of 0..length-1 - the same range for each block. More...
 
template<typename T , bool Synchronize = true>
KAT_FD void kat::linear_grid::collaborative::block::share_per_warp_data (T datum, T *__restrict__ where_to_make_available, unsigned writing_lane_id)
 Share one element of type T for each warp with the entire block - using a single array in shared memory for all shared values. More...
 
template<typename T , bool Synchronize = true>
KAT_FD void kat::linear_grid::collaborative::block::share_per_warp_data (T datum, T *__restrict__ where_to_make_available)
 A variant of share_per_warp_data, with the writing lane chosen dynamically in each warp based on which lanes are actually active.
 
KAT_FD void kat::linear_grid::collaborative::block::barrier ()
 
template<typename T , bool Synchronize = true>
KAT_FD T kat::linear_grid::collaborative::block::get_from_thread (T &&value, unsigned source_thread_id)
 Have all block threads obtain a value held by just one of the threads (and likely not otherwise easily accessible to the rest of the block's threads). More...
 
template<typename T , bool Synchronize = true>
KAT_FD T kat::linear_grid::collaborative::block::get_from_first_thread (T &&value)
 Have all block threads obtain a value held by the first thread in the block (and likely not otherwise easily accessible to the rest of the block's threads). More...
 

Detailed Description

CUDA device computation block-level primitives, i.e.

those involving the interaction of many or all of a block's threads, but no inter-block interaction.

Todo:
Some of these assume linear grids, others do not - sort them out

Function Documentation

§ at_block_stride()

template<typename Function , typename Size = size_t>
KAT_FD void kat::linear_grid::collaborative::block::at_block_stride ( Size  length,
const Function &  f 
)

Have all threads in (one/some/all) blocks perform some action over the linear range of 0..length-1 - the same range for each block.

Note
This function works best when the block size is a multiple of the warp size; it still works otherwise, but more slowly.
Parameters
length	The length of the range (of integers) on which to act
f	The callable to execute for each element of the sequence.
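
A minimal usage sketch, assuming f is invocable with a single index argument; the kernel and buffer names below are illustrative, not part of the library:

```cuda
#include <kat/on_device/collaboration/block.cuh>

// Every block applies the same computation over the index range
// 0..length-1, each block writing into its own slice of the output.
__global__ void scale_per_block_slice(float* data, size_t length, float factor)
{
    namespace block = kat::linear_grid::collaborative::block;
    float* slice = data + blockIdx.x * length;
    block::at_block_stride(length, [&](size_t i) {
        slice[i] *= factor;  // invoked once for each index in 0..length-1
    });
}
```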

§ get_from_first_thread() [1/2]

template<typename T , bool Synchronize = true>
KAT_FD T kat::collaborative::block::get_from_first_thread ( T &&  value)

Have all block threads obtain a value held by the first thread in the block (and likely not otherwise easily accessible to the rest of the block's threads).

Note
uses shared memory for "broadcasting" the value
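
A usage sketch under the assumption that only thread 0 holds a meaningful value before the call (buffer names are illustrative):

```cuda
#include <kat/on_device/collaboration/block.cuh>

// Thread 0 of each block reads a per-block parameter, then broadcasts
// it to all other threads in the block.
__global__ void apply_block_offset(int* data, const int* block_offsets)
{
    namespace block = kat::collaborative::block;
    int offset = 0;
    if (threadIdx.x == 0) { offset = block_offsets[blockIdx.x]; }
    // Shared memory is used internally for the broadcast; with the default
    // Synchronize = true, the value is safe to use after the call returns.
    offset = block::get_from_first_thread(offset);
    data[blockIdx.x * blockDim.x + threadIdx.x] += offset;
}
```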

§ get_from_first_thread() [2/2]

template<typename T , bool Synchronize = true>
KAT_FD T kat::linear_grid::collaborative::block::get_from_first_thread ( T &&  value)

Have all block threads obtain a value held by the first thread in the block (and likely not otherwise easily accessible to the rest of the block's threads).

Note
uses shared memory for "broadcasting" the value

§ get_from_thread() [1/2]

template<typename T , bool Synchronize = true, unsigned Dimensionality = 3>
KAT_FD T kat::collaborative::block::get_from_thread ( const T &  value,
kat::position_t  source_thread_position 
)

Have all block threads obtain a value held by just one of the threads (and likely not otherwise easily accessible to the rest of the block's threads).

Note
uses shared memory for the "broadcast" by the thread holding the relevant value

§ get_from_thread() [2/2]

template<typename T , bool Synchronize = true>
KAT_FD T kat::linear_grid::collaborative::block::get_from_thread ( T &&  value,
unsigned  source_thread_id 
)

Have all block threads obtain a value held by just one of the threads (and likely not otherwise easily accessible to the rest of the block's threads).

Note
uses shared memory for the "broadcast" by the thread holding the relevant value
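
A sketch of the linear-grid variant, assuming source_thread_id is an intra-block (not global) thread index; all names are illustrative:

```cuda
#include <kat/on_device/collaboration/block.cuh>

// All threads obtain the value held by the last thread of the block.
__global__ void broadcast_from_last(const int* in, int* out)
{
    namespace block = kat::linear_grid::collaborative::block;
    unsigned global_id = blockIdx.x * blockDim.x + threadIdx.x;
    int mine = in[global_id];
    // The thread with intra-block index blockDim.x - 1 is the source;
    // every thread receives its value via shared memory.
    int last = block::get_from_thread(mine, blockDim.x - 1);
    out[global_id] = last;
}
```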

§ share_per_warp_data() [1/2]

template<typename T , bool Synchronize = true>
KAT_FD void kat::collaborative::block::share_per_warp_data ( T  datum,
T *__restrict__  where_to_make_available,
unsigned  writing_lane_id 
)

Share one element of type T for each warp with the entire block - using a single array in shared memory for all shared values.

Parameters
datum	a warp-specific (but not thread-specific) piece of data, one for each warp, which is to be shared with the whole block
where_to_make_available	the array into which the various warp-specific data will be stored, by warp index
writing_lane_id	which lane in each warp should perform the write operations
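
A usage sketch, assuming the destination array resides in shared memory and each warp's lanes all pass the same datum; the kernel name, array size, and output scheme are illustrative:

```cuda
#include <kat/on_device/collaboration/block.cuh>

// Lane 0 of each warp publishes a warp-uniform datum into a shared-memory
// array with one slot per warp; after the call (with the default
// Synchronize = true), any thread may read any warp's entry.
__global__ void publish_warp_ids(unsigned* out)
{
    enum { max_warps_per_block = 32 };
    __shared__ unsigned per_warp_values[max_warps_per_block];

    unsigned datum = threadIdx.x / warpSize;  // same value in every lane of a warp
    kat::collaborative::block::share_per_warp_data(datum, per_warp_values, 0u);

    // Any thread can now read any warp's entry; e.g., thread 0 records
    // the last warp's shared value.
    if (threadIdx.x == 0) {
        out[blockIdx.x] = per_warp_values[(blockDim.x - 1) / warpSize];
    }
}
```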

§ share_per_warp_data() [2/2]

template<typename T , bool Synchronize = true>
KAT_FD void kat::linear_grid::collaborative::block::share_per_warp_data ( T  datum,
T *__restrict__  where_to_make_available,
unsigned  writing_lane_id 
)

Share one element of type T for each warp with the entire block - using a single array in shared memory for all shared values.

Parameters
datum	a warp-specific (but not thread-specific) piece of data, one for each warp, which is to be shared with the whole block
where_to_make_available	the array into which the various warp-specific data will be stored, by warp index
writing_lane_id	which lane in each warp should perform the write operations
Note
If different lanes in a warp pass different datum values, behavior is not guaranteed.