Non-templated wrappers for PTX instructions for which NVIDIA does not provide wrappers through the CUDA <device_functions.h> header. More...

#include "detail/define_macros.cuh"
#include <kat/on_device/common.cuh>
#include <cstdint>
#include <cassert>
#include <type_traits>
#include "detail/undefine_macros.cuh"

Namespaces
	kat::ptx
	Code exposing CUDA's PTX intermediate representation instructions to C++ code.

Macros
#define	CUDA_KAT_ON_DEVICE_PTX_MISCELLANY_CUH_

#define	DEFINE_IS_IN_MEMORY_SPACE(_which_space)
	See the relevant section of the CUDA PTX reference for details on these instructions. More...

#define	DEFINE_BFIND(ptx_type)

#define	DEFINE_PRMT_WITH_MODE(selection_mode_name, selection_mode)

#define	DEFINE_BFE(ptx_type)
	Extracts the bits with 0-based indices start_position...start_position+num_bits-1, counting from least to most significant, from the bit field bits. More...

#define	DEFINE_SAD(ptx_type_1, unsigned_ptx_type_1)
	Adds the absolute difference of two values to a base value. More...

#define	DEFINE_SAD_(x) DEFINE_SAD(x, MAKE_UNSIGNED(x));

Functions
KAT_FD void	kat::ptx::trap ()
	Aborts execution (of the entire kernel grid) and generates an interrupt to the host CPU.

KAT_FD void	kat::ptx::exit ()
	Ends execution of the current thread of this kernel/grid.

	kat::ptx::DEFINE_IS_IN_MEMORY_SPACE (const) DEFINE_IS_IN_MEMORY_SPACE(global) DEFINE_IS_IN_MEMORY_SPACE(local) DEFINE_IS_IN_MEMORY_SPACE(shared) DEFINE_BFIND(s32) DEFINE_BFIND(s64) DEFINE_BFIND(u32) DEFINE_BFIND(u64) DEFINE_PRMT_WITH_MODE(forward_4_extract

f4e	kat::ptx::DEFINE_PRMT_WITH_MODE (backward_4_extract, b4e) DEFINE_PRMT_WITH_MODE(replicate_8

f4e rc8	kat::ptx::DEFINE_PRMT_WITH_MODE (replicate_16, rc16) DEFINE_PRMT_WITH_MODE(edge_clam_left

f4e rc8 ecl	kat::ptx::DEFINE_PRMT_WITH_MODE (edge_clam_right, ecl) KAT_FD uint32_t prmt(uint32_t first
	See the relevant section of the CUDA PTX reference for an explanation of exactly what this does. More...

	kat::ptx::asm ("prmt.b32 %0, %1, %2, %3;" :"=r"(result) :"r"(first), "r"(second), "r"(byte_selectors))

	kat::ptx::DEFINE_BFE (s32) DEFINE_BFE(s64) DEFINE_BFE(u32) DEFINE_BFE(u64) KAT_FD uint32_t bfi(uint32_t bits_to_insert

	kat::ptx::asm ("bfi.b32 %0, %1, %2, %3, %4;" :"=r"(ret) :"r"(bits_to_insert), "r"(existing_bit_field), "r"(start_position), "r"(num_bits))

KAT_FD uint64_t	kat::ptx::bfi (uint64_t bits_to_insert, uint64_t existing_bit_field, uint32_t start_position, uint32_t num_bits)

	kat::ptx::DEFINE_SAD_ (u16)

	kat::ptx::DEFINE_SAD_ (u32)

	kat::ptx::DEFINE_SAD_ (u64)

	kat::ptx::DEFINE_SAD_ (s16)

	kat::ptx::DEFINE_SAD_ (s32)

	kat::ptx::DEFINE_SAD_ (s64)

Variables
f4e rc8 ecl uint32_t	kat::ptx::second

f4e rc8 ecl uint32_t uint32_t	kat::ptx::byte_selectors

return	kat::ptx::result

uint32_t	kat::ptx::existing_bit_field

uint32_t uint32_t	kat::ptx::start_position

uint32_t uint32_t uint32_t	kat::ptx::num_bits

return	kat::ptx::ret

Detailed Description

Non-templated wrappers for PTX instructions for which NVIDIA does not provide wrappers through the CUDA <device_functions.h> header.

Macro Definition Documentation

§ DEFINE_BFE

#define DEFINE_BFE ( ptx_type )

Value:

KAT_FD CPP_TYPE_BY_PTX_TYPE(ptx_type) /* bfe(): wraps PTX `bfe.<ptx_type>`; result has the same C++ type as `bits` */ \
bfe( \
    CPP_TYPE_BY_PTX_TYPE(ptx_type) bits, /* the bit field to extract from */ \
    uint32_t start_position, /* 0-based index (from the least significant bit) of the first bit to extract */ \
    uint32_t num_bits) /* number of consecutive bits to extract */ \
{ \
    CPP_TYPE_BY_PTX_TYPE(ptx_type) extracted_bits;  /* receives asm output operand %0 */ \
    asm ( \
        "bfe." PTX_STRINGIFY(ptx_type) " %0, %1, %2, %3;" \
        : "=" SIZE_CONSTRAINT(ptx_type) (extracted_bits) /* %0: output; register constraint selected by ptx_type */ \
        : SIZE_CONSTRAINT(ptx_type) (bits) /* %1: input; same constraint as the output */ \
        , "r" (start_position) /* %2: bound with the "r" constraint regardless of ptx_type */ \
        , "r" (num_bits) /* %3: likewise bound with "r" */ \
    );\
    return extracted_bits; \
}

Extracts the bits with 0-based indices start_position...start_position+num_bits-1, counting from least to most significant, from the bit field bits.

Has sign-extension semantics for signed inputs, which are a bit tricky; see the PTX ISA guide:

http://docs.nvidia.com/cuda/parallel-thread-execution/index.html

TODO: CUB 1.5.2's BFE wrapper seems kind of fishy. Why does Duane Merrill not use PTX for extraction from 64-bit fields? This wrapper takes a different route and uses the PTX instruction for 64-bit fields as well.

§ DEFINE_BFIND

#define DEFINE_BFIND ( ptx_type )

Value:

KAT_FD uint32_t /* bfind(): wraps PTX `bfind.<ptx_type>`; the result is a uint32_t for every ptx_type */ \
bfind(CPP_TYPE_BY_PTX_TYPE(ptx_type) val) /* val: passed to the instruction as input operand %1 */ \
{ \
    uint32_t ret;  /* receives asm output operand %0 */ \
    asm ( \
        "bfind." PTX_STRINGIFY(ptx_type) " %0, %1;" \
        : "=r"(ret) : SIZE_CONSTRAINT(ptx_type) (val)); /* NOTE(review): consult the PTX ISA `bfind` entry for the meaning of the returned position and its not-found value */ \
    return ret; \
}

§ DEFINE_IS_IN_MEMORY_SPACE

#define DEFINE_IS_IN_MEMORY_SPACE ( _which_space )

Value:

KAT_FD int32_t is_in_ ## _which_space ## _memory (const void *ptr) /* generates is_in_<_which_space>_memory(); ptr is the address to classify */ \
{ \
    int32_t result; /* receives asm output operand %0 */ \
    asm ("{\n\t" \
        ".reg .pred p;\n\t" \
        "isspacep." PTX_STRINGIFY(_which_space) " p, %1;\n\t" \
        "selp.b32 %0, 1, 0, p;\n\t" \
        "}" \
        : "=r"(result) : PTR_SIZE_CONSTRAINT(ptr)); /* isspacep sets predicate p for ptr; selp then stores 1 in result when p is set, otherwise 0 */ \
    return result; \
}

See the relevant section of the CUDA PTX reference for details on these instructions.

§ DEFINE_PRMT_WITH_MODE

#define DEFINE_PRMT_WITH_MODE	(	selection_mode_name,
		selection_mode
	)

Value:

KAT_FD  uint32_t prmt_ ## selection_mode_name (uint32_t first, uint32_t second, uint32_t control_bits) /* generates prmt_<selection_mode_name>(), wrapping PTX `prmt.b32.<selection_mode>` */ \
{ \
    uint32_t result; /* receives asm output operand %0 */ \
    asm("prmt.b32." PTX_STRINGIFY(selection_mode) " %0, %1, %2, %3;" \
        : "=r"(result) : "r"(first), "r"(second), "r"(control_bits)); /* %1, %2, %3 are first, second and control_bits, in that order */ \
    return result; \
}

§ DEFINE_SAD

#define DEFINE_SAD	(	ptx_type_1,
		unsigned_ptx_type_1
	)

Value:

KAT_FD CPP_TYPE_BY_PTX_TYPE(unsigned_ptx_type_1) sad( /* sad(): wraps PTX `sad.<ptx_type_1>`; returns addend + |x - y| in the unsigned counterpart type */ \
    CPP_TYPE_BY_PTX_TYPE(ptx_type_1) x, /* value from which y is subtracted */ \
    CPP_TYPE_BY_PTX_TYPE(ptx_type_1) y, /* value subtracted from x */ \
    CPP_TYPE_BY_PTX_TYPE(unsigned_ptx_type_1) addend) /* base value to which |x - y| is added */ \
{ \
    CPP_TYPE_BY_PTX_TYPE(unsigned_ptx_type_1) result;  /* receives asm output operand %0 */ \
    asm ( \
        "sad." PTX_STRINGIFY(ptx_type_1) " %0, %1, %2, %3;" \
        : "=" SIZE_CONSTRAINT(unsigned_ptx_type_1) (result) /* %0: output, constrained by the unsigned type */ \
        : SIZE_CONSTRAINT(ptx_type_1) (x) /* %1 */ \
        , SIZE_CONSTRAINT(ptx_type_1) (y) /* %2 */ \
        , SIZE_CONSTRAINT(unsigned_ptx_type_1) (addend) /* %3: constrained by the unsigned type, like the output */ \
    );\
    return result; \
}

Adds the absolute difference of two values to a base value.

Parameters

x	value from which to subtract `y`
y	value to subtract from `x`
addend	base value to which to add `|x-y|`

Returns: addend + |x - y|

Namespaces

Macros

Functions

Variables

Detailed Description

Macro Definition Documentation

§ DEFINE_BFE

§ DEFINE_BFIND

§ DEFINE_IS_IN_MEMORY_SPACE

§ DEFINE_PRMT_WITH_MODE

§ DEFINE_SAD