37 #define BLOCK_SZ_M 384
38 #define BLOCK_SZ_K 384
39 #define BLOCK_SZ_N 4096
179 void sgescal(
int m,
int n,
float alpha,
float *X,
int incRowX,
int incColX);
Class for performing matrix multiplication on float type arrays.
static float SGEMM_BUFF_B[BLOCK_SZ_K *BLOCK_SZ_N]
void pack_micro_A(int k, const float *A, int incRowA, int incColA, float *buffer)
Packs micro panels of size BLOCK_SZ_MR rows by k columns from A without padding.
void sgemm_macro_kernel(int mc, int nc, int kc, float alpha, float beta, float *C, int incRowC, int incColC)
Macro kernel for the multiplication of blocks of A and B.
void pack_buffer_B(int kc, int nc, const float *B, int incRowB, int incColB, float *buffer)
Packs panels from B with padding if needed.
void sgeaxpy(int m, int n, float alpha, const float *X, int incRowX, int incColX, float *Y, int incRowY, int incColY)
Computes Y += alpha*X (float precision AX + Y)
void sgemm_micro_kernel(int kc, float alpha, const float *A, const float *B, float beta, float *C, int incRowC, int incColC)
Computes the micro kernel that multiplies panels from A and B.
static float SGEMM_BUFF_A[BLOCK_SZ_M *BLOCK_SZ_K]
void pack_buffer_A(int mc, int kc, const float *A, int incRowA, int incColA, float *buffer)
Packs panels from A with padding if needed.
static float SGEMM_BUFF_C[BLOCK_SZ_MR *BLOCK_SZ_NR]
void sgemm_nn(int m, int n, int k, float alpha, const float *A, int incRowA, int incColA, const float *B, int incRowB, int incColB, float beta, float *C, int incRowC, int incColC)
Main SGEMM entrypoint, computes C <- beta*C + alpha*A*B.
void sgescal(int m, int n, float alpha, float *X, int incRowX, int incColX)
Scales elements of X by alpha.
void pack_micro_B(int k, const float *B, int incRowB, int incColB, float *buffer)
Packs micro panels of size BLOCK_SZ_NR columns by k rows from B without padding.
The source C++ openGPMP namespace.