cuda-kat
CUDA kernel author's tools
Namespaces | Macros | Functions | Variables
miscellany.cuh File Reference

Non-templated wrappers for PTX instructions for which NVIDIA does not provide wrappers through the CUDA <device_functions.h> header. More...

#include "detail/define_macros.cuh"
#include <kat/on_device/common.cuh>
#include <cstdint>
#include <cassert>
#include <type_traits>
#include "detail/undefine_macros.cuh"

Namespaces

 kat::ptx
 Code exposing CUDA's PTX intermediate representation instructions to C++ code.
 

Macros

#define CUDA_KAT_ON_DEVICE_PTX_MISCELLANY_CUH_
 
#define DEFINE_IS_IN_MEMORY_SPACE(_which_space)
 See the section on the isspacep instruction in the CUDA PTX reference for details on these instructions. More...
 
#define DEFINE_BFIND(ptx_type)
 
#define DEFINE_PRMT_WITH_MODE(selection_mode_name, selection_mode)
 
#define DEFINE_BFE(ptx_type)
 Extracts the bits with 0-based indices start_position...start_position+num_bits-1, counting from least to most significant, from the bit field bits. More...
 
#define DEFINE_SAD(ptx_type_1, unsigned_ptx_type_1)
 Adds the absolute difference of two values to a base value. More...
 
#define DEFINE_SAD_(x)   DEFINE_SAD(x, MAKE_UNSIGNED(x));
 

Functions

KAT_FD void kat::ptx::trap ()
 Aborts execution (of the entire kernel grid) and generates an interrupt to the host CPU.
 
KAT_FD void kat::ptx::exit ()
 Ends execution of the current thread of this kernel/grid.
 
 kat::ptx::DEFINE_IS_IN_MEMORY_SPACE (const) DEFINE_IS_IN_MEMORY_SPACE(global) DEFINE_IS_IN_MEMORY_SPACE(local) DEFINE_IS_IN_MEMORY_SPACE(shared) DEFINE_BFIND(s32) DEFINE_BFIND(s64) DEFINE_BFIND(u32) DEFINE_BFIND(u64) DEFINE_PRMT_WITH_MODE(forward_4_extract
 
f4e kat::ptx::DEFINE_PRMT_WITH_MODE (backward_4_extract, b4e) DEFINE_PRMT_WITH_MODE(replicate_8
 
f4e rc8 kat::ptx::DEFINE_PRMT_WITH_MODE (replicate_16, rc16) DEFINE_PRMT_WITH_MODE(edge_clam_left
 
f4e rc8 ecl kat::ptx::DEFINE_PRMT_WITH_MODE (edge_clam_right, ecl) KAT_FD uint32_t prmt(uint32_t first
 See the section on the prmt instruction in the CUDA PTX reference for an explanation of what this does exactly. More...
 
 kat::ptx::asm ("prmt.b32 %0, %1, %2, %3;" :"=r"(result) :"r"(first), "r"(second), "r"(byte_selectors))
 
 kat::ptx::DEFINE_BFE (s32) DEFINE_BFE(s64) DEFINE_BFE(u32) DEFINE_BFE(u64) KAT_FD uint32_t bfi(uint32_t bits_to_insert
 
 kat::ptx::asm ("bfi.b32 %0, %1, %2, %3, %4;" :"=r"(ret) :"r"(bits_to_insert), "r"(existing_bit_field), "r"(start_position), "r"(num_bits))
 
KAT_FD uint64_t kat::ptx::bfi (uint64_t bits_to_insert, uint64_t existing_bit_field, uint32_t start_position, uint32_t num_bits)
 
 kat::ptx::DEFINE_SAD_ (u16)
 
 kat::ptx::DEFINE_SAD_ (u32)
 
 kat::ptx::DEFINE_SAD_ (u64)
 
 kat::ptx::DEFINE_SAD_ (s16)
 
 kat::ptx::DEFINE_SAD_ (s32)
 
 kat::ptx::DEFINE_SAD_ (s64)
 

Variables

f4e rc8 ecl uint32_t kat::ptx::second
 
f4e rc8 ecl uint32_t uint32_t kat::ptx::byte_selectors
 
return kat::ptx::result
 
uint32_t kat::ptx::existing_bit_field
 
uint32_t uint32_t kat::ptx::start_position
 
uint32_t uint32_t uint32_t kat::ptx::num_bits
 
return kat::ptx::ret
 

Detailed Description

Non-templated wrappers for PTX instructions for which NVIDIA does not provide wrappers through the CUDA <device_functions.h> header.

Macro Definition Documentation

§ DEFINE_BFE

#define DEFINE_BFE (   ptx_type)
Value:
KAT_FD CPP_TYPE_BY_PTX_TYPE(ptx_type) \
bfe( \
CPP_TYPE_BY_PTX_TYPE(ptx_type) bits, \
uint32_t start_position, \
uint32_t num_bits) \
{ \
CPP_TYPE_BY_PTX_TYPE(ptx_type) extracted_bits; \
asm ( \
"bfe." PTX_STRINGIFY(ptx_type) " %0, %1, %2, %3;" \
: "=" SIZE_CONSTRAINT(ptx_type) (extracted_bits) \
: SIZE_CONSTRAINT(ptx_type) (bits) \
, "r" (start_position) \
, "r" (num_bits) \
);\
return extracted_bits; \
}

Extracts the bits with 0-based indices start_position...start_position+num_bits-1, counting from least to most significant, from the bit field bits.

For signed inputs, the sign-extension semantics are a bit tricky; see the description of the bfe instruction in the PTX ISA guide:

http://docs.nvidia.com/cuda/parallel-thread-execution/index.html

TODO: CUB 1.5.2's BFE wrapper seems kind of fishy. Why does Duane Merrill not use PTX for extraction from 64-bit fields? I'll take a different route.

§ DEFINE_BFIND

#define DEFINE_BFIND (   ptx_type)
Value:
KAT_FD uint32_t \
bfind(CPP_TYPE_BY_PTX_TYPE(ptx_type) val) \
{ \
uint32_t ret; \
asm ( \
"bfind." PTX_STRINGIFY(ptx_type) " %0, %1;" \
: "=r"(ret) : SIZE_CONSTRAINT(ptx_type) (val)); \
return ret; \
}

§ DEFINE_IS_IN_MEMORY_SPACE

#define DEFINE_IS_IN_MEMORY_SPACE (   _which_space)
Value:
KAT_FD int32_t is_in_ ## _which_space ## _memory (const void *ptr) \
{ \
int32_t result; \
asm ("{\n\t" \
".reg .pred p;\n\t" \
"isspacep." PTX_STRINGIFY(_which_space) " p, %1;\n\t" \
"selp.b32 %0, 1, 0, p;\n\t" \
"}" \
: "=r"(result) : PTR_SIZE_CONSTRAINT(ptr)); \
return result; \
}

See the section on the isspacep instruction in the CUDA PTX reference for details on these instructions.

§ DEFINE_PRMT_WITH_MODE

#define DEFINE_PRMT_WITH_MODE (   selection_mode_name,
  selection_mode 
)
Value:
KAT_FD uint32_t prmt_ ## selection_mode_name (uint32_t first, uint32_t second, uint32_t control_bits) \
{ \
uint32_t result; \
asm("prmt.b32." PTX_STRINGIFY(selection_mode) " %0, %1, %2, %3;" \
: "=r"(result) : "r"(first), "r"(second), "r"(control_bits)); \
return result; \
}

§ DEFINE_SAD

#define DEFINE_SAD (   ptx_type_1,
  unsigned_ptx_type_1 
)
Value:
KAT_FD CPP_TYPE_BY_PTX_TYPE(unsigned_ptx_type_1) sad( \
CPP_TYPE_BY_PTX_TYPE(ptx_type_1) x, \
CPP_TYPE_BY_PTX_TYPE(ptx_type_1) y, \
CPP_TYPE_BY_PTX_TYPE(unsigned_ptx_type_1) addend) \
{ \
CPP_TYPE_BY_PTX_TYPE(unsigned_ptx_type_1) result; \
asm ( \
"sad." PTX_STRINGIFY(ptx_type_1) " %0, %1, %2, %3;" \
: "=" SIZE_CONSTRAINT(unsigned_ptx_type_1) (result) \
: SIZE_CONSTRAINT(ptx_type_1) (x) \
, SIZE_CONSTRAINT(ptx_type_1) (y) \
, SIZE_CONSTRAINT(unsigned_ptx_type_1) (addend) \
);\
return result; \
}

Adds the absolute difference of two values to a base value.

Parameters
x — value from which to subtract y
y — value to subtract from x
addend — base value to which to add |x-y|
Returns
addend + |x - y|