cuda-kat
CUDA kernel author's tools
|
Non-templated wrappers for PTX instructions for which NVIDIA does not provide wrappers through the CUDA <device_functions.h>
header.
More...
#include "detail/define_macros.cuh"
#include <kat/on_device/common.cuh>
#include <cstdint>
#include <cassert>
#include <type_traits>
#include "detail/undefine_macros.cuh"
Namespaces | |
kat::ptx | |
Code exposing CUDA's PTX intermediate representation instructions to C++ code. | |
Macros | |
#define | CUDA_KAT_ON_DEVICE_PTX_MISCELLANY_CUH_ |
#define | DEFINE_IS_IN_MEMORY_SPACE(_which_space) |
See the relevant section of the CUDA PTX reference for details on these instructions. More... | |
#define | DEFINE_BFIND(ptx_type) |
#define | DEFINE_PRMT_WITH_MODE(selection_mode_name, selection_mode) |
#define | DEFINE_BFE(ptx_type) |
Extracts the bits with 0-based indices start_pos...start_pos+length-1, counting from least to most significant, from the bit field `field`. More... | |
#define | DEFINE_SAD(ptx_type_1, unsigned_ptx_type_1) |
Adds the absolute difference of two values to a base value. More... | |
#define | DEFINE_SAD_(x) DEFINE_SAD(x, MAKE_UNSIGNED(x)); |
Functions | |
KAT_FD void | kat::ptx::trap () |
Aborts execution (of the entire kernel grid) and generates an interrupt to the host CPU. | |
KAT_FD void | kat::ptx::exit () |
Ends execution of the current thread of this kernel/grid. | |
kat::ptx::DEFINE_IS_IN_MEMORY_SPACE (const) DEFINE_IS_IN_MEMORY_SPACE(global) DEFINE_IS_IN_MEMORY_SPACE(local) DEFINE_IS_IN_MEMORY_SPACE(shared) DEFINE_BFIND(s32) DEFINE_BFIND(s64) DEFINE_BFIND(u32) DEFINE_BFIND(u64) DEFINE_PRMT_WITH_MODE(forward_4_extract | |
f4e | kat::ptx::DEFINE_PRMT_WITH_MODE (backward_4_extract, b4e) DEFINE_PRMT_WITH_MODE(replicate_8 |
f4e rc8 | kat::ptx::DEFINE_PRMT_WITH_MODE (replicate_16, rc16) DEFINE_PRMT_WITH_MODE(edge_clam_left |
f4e rc8 ecl | kat::ptx::DEFINE_PRMT_WITH_MODE (edge_clam_right, ecl) KAT_FD uint32_t prmt(uint32_t first |
See the relevant section of the CUDA PTX reference for an explanation of exactly what this does. More... | |
kat::ptx::asm ("prmt.b32 %0, %1, %2, %3;" :"=r"(result) :"r"(first), "r"(second), "r"(byte_selectors)) | |
kat::ptx::DEFINE_BFE (s32) DEFINE_BFE(s64) DEFINE_BFE(u32) DEFINE_BFE(u64) KAT_FD uint32_t bfi(uint32_t bits_to_insert | |
kat::ptx::asm ("bfi.b32 %0, %1, %2, %3, %4;" :"=r"(ret) :"r"(bits_to_insert), "r"(existing_bit_field), "r"(start_position), "r"(num_bits)) | |
KAT_FD uint64_t | kat::ptx::bfi (uint64_t bits_to_insert, uint64_t existing_bit_field, uint32_t start_position, uint32_t num_bits) |
kat::ptx::DEFINE_SAD_ (u16) | |
kat::ptx::DEFINE_SAD_ (u32) | |
kat::ptx::DEFINE_SAD_ (u64) | |
kat::ptx::DEFINE_SAD_ (s16) | |
kat::ptx::DEFINE_SAD_ (s32) | |
kat::ptx::DEFINE_SAD_ (s64) | |
Non-templated wrappers for PTX instructions for which NVIDIA does not provide wrappers through the CUDA <device_functions.h>
header.
#define DEFINE_BFE | ( | ptx_type | ) |
Extracts the bits with 0-based indices start_pos...start_pos+length-1, counting from least to most significant, from the bit field `field`.
Has sign-extension semantics for signed inputs, which are a bit tricky; see the PTX ISA guide:
http://docs.nvidia.com/cuda/parallel-thread-execution/index.html
TODO: CUB 1.5.2's BFE wrapper seems kind of fishy. Why does Duane Merrill not use PTX for extraction from 64-bit fields? I'll take a different route.
#define DEFINE_BFIND | ( | ptx_type | ) |
#define DEFINE_IS_IN_MEMORY_SPACE | ( | _which_space | ) |
See the relevant section of the CUDA PTX reference for details on these instructions.
#define DEFINE_PRMT_WITH_MODE | ( | selection_mode_name, | |
selection_mode | |||
) |
#define DEFINE_SAD | ( | ptx_type_1, | |
unsigned_ptx_type_1 | |||
) |
Adds the absolute difference of two values to a base value.
x | value from which to subtract y |
y | value to subtract from x |
addend | base value to which to add |x-y| |
Returns: addend + |x - y|