24 #ifndef MLPACK_METHODS_ANN_LAYER_MULTIHEAD_ATTENTION_HPP 25 #define MLPACK_METHODS_ANN_LAYER_MULTIHEAD_ATTENTION_HPP 58 typename InputDataType = arma::mat,
59 typename OutputDataType = arma::mat,
60 typename RegularizerType = NoRegularizer
62 class MultiheadAttention
79 const size_t srcSeqLen,
80 const size_t embedDim,
81 const size_t numHeads);
96 void Forward(
const arma::Mat<eT>& input, arma::Mat<eT>& output);
106 template<
typename eT>
107 void Backward(
const arma::Mat<eT>& ,
108 const arma::Mat<eT>& gy,
118 template<
typename eT>
119 void Gradient(
const arma::Mat<eT>& input,
120 const arma::Mat<eT>& error,
121 arma::Mat<eT>& gradient);
124 size_t WeightSize()
const {
return 4 * (embedDim + 1) * embedDim; }
/**
 * Serialize the layer.  Only the declaration is visible here; the body is
 * defined elsewhere.
 *
 * @param ar Archive object passed by reference.
 *
 * The second (uint32_t) argument is left unnamed in this declaration.
 */
template<typename Archive>
void serialize(Archive& ar, const uint32_t);
168 OutputDataType
const&
Delta()
const {
return delta; }
170 OutputDataType&
Delta() {
return delta; }
173 OutputDataType
const&
Gradient()
const {
return grad; }
182 size_t InputShape()
const 184 return embedDim * (tgtSeqLen + 2 * srcSeqLen);
189 typedef typename OutputDataType::elem_type ElemType;
207 OutputDataType attnMask;
210 OutputDataType keyPaddingMask;
213 OutputDataType queryWt;
216 OutputDataType keyWt;
219 OutputDataType valueWt;
222 OutputDataType outWt;
225 OutputDataType qBias;
228 OutputDataType kBias;
231 OutputDataType vBias;
234 OutputDataType outBias;
237 OutputDataType weights;
240 arma::Cube<ElemType> qProj;
243 arma::Cube<ElemType> kProj;
246 arma::Cube<ElemType> vProj;
249 arma::Cube<ElemType> scores;
252 arma::Cube<ElemType> attnOut;
258 OutputDataType delta;
264 OutputDataType outputParameter;
267 RegularizerType regularizer;
OutputDataType const & Gradient() const
Get the gradient.
Definition: multihead_attention.hpp:173
size_t TgtSeqLen() const
Get the target sequence length.
Definition: multihead_attention.hpp:133
void serialize(Archive &ar, const uint32_t)
Serialize the layer.
Definition: multihead_attention_impl.hpp:437
Linear algebra utility functions, generally performed on matrices or vectors.
Definition: cv.hpp:1
void Backward(const arma::Mat< eT > &, const arma::Mat< eT > &gy, arma::Mat< eT > &g)
Ordinary feed backward pass of a neural network, calculating the function f(x) by propagating x backwards through f.
Definition: multihead_attention_impl.hpp:195
OutputDataType const & Parameters() const
Get the parameters.
Definition: multihead_attention.hpp:178
OutputDataType & AttentionMask()
Modify the two dimensional Attention Mask.
Definition: multihead_attention.hpp:155
size_t SrcSeqLen() const
Get the source sequence length.
Definition: multihead_attention.hpp:138
The core includes that mlpack expects; standard C++ includes and Armadillo.
OutputDataType & Delta()
Modify the delta.
Definition: multihead_attention.hpp:170
void Forward(const arma::Mat< eT > &input, arma::Mat< eT > &output)
Ordinary feed forward pass of a neural network, evaluating the function f(x) by propagating the activity forward through f.
Definition: multihead_attention_impl.hpp:89
OutputDataType & Gradient()
Modify the gradient.
Definition: multihead_attention.hpp:175
OutputDataType const & OutputParameter() const
Get the output parameter.
Definition: multihead_attention.hpp:163
Implementation of the Softmax layer.
Definition: softmax.hpp:38
size_t & TgtSeqLen()
Modify the target sequence length.
Definition: multihead_attention.hpp:135
size_t & NumHeads()
Modify the number of attention heads.
Definition: multihead_attention.hpp:150
OutputDataType & Parameters()
Modify the parameters.
Definition: multihead_attention.hpp:180
OutputDataType const & Delta() const
Get the delta.
Definition: multihead_attention.hpp:168
size_t WeightSize() const
Get the size of the weights.
Definition: multihead_attention.hpp:124
OutputDataType & KeyPaddingMask()
Modify the Key Padding Mask.
Definition: multihead_attention.hpp:160
OutputDataType const & KeyPaddingMask() const
Get Key Padding Mask.
Definition: multihead_attention.hpp:158
size_t NumHeads() const
Get the number of attention heads.
Definition: multihead_attention.hpp:148
OutputDataType const & AttentionMask() const
Get the two dimensional Attention Mask.
Definition: multihead_attention.hpp:153
size_t EmbedDim() const
Get the embedding dimension.
Definition: multihead_attention.hpp:143
MultiheadAttention()
Default constructor.
Definition: multihead_attention_impl.hpp:27
size_t & EmbedDim()
Modify the embedding dimension.
Definition: multihead_attention.hpp:145
OutputDataType & OutputParameter()
Modify the output parameter.
Definition: multihead_attention.hpp:165
size_t & SrcSeqLen()
Modify the source sequence length.
Definition: multihead_attention.hpp:140
void Reset()
Reset the layer parameters.
Definition: multihead_attention_impl.hpp:63