57 for (j = 0; j < k; ++j) {
59 buffer[i] =
A[i * incRowA];
78 for (i = 0; i < mp; ++i) {
79 pack_micro_A(kc,
A, incRowA, incColA, buffer);
84 for (j = 0; j < kc; ++j) {
85 for (i = 0; i < _mr; ++i) {
86 buffer[i] =
A[i * incRowA];
105 for (i = 0; i < k; ++i) {
107 buffer[j] =
B[j * incColB];
126 for (j = 0; j < np; ++j) {
127 pack_micro_B(kc,
B, incRowB, incColB, buffer);
132 for (i = 0; i < kc; ++i) {
133 for (j = 0; j < _nr; ++j) {
134 buffer[j] =
B[j * incColB];
162 for (l = 0; l < kc; ++l) {
173 if (fabs(beta - 0.0f) < std::numeric_limits<float>::epsilon()) {
176 C[i * incRowC + j * incColC] = 0.0f;
179 }
else if (fabs(beta - 1.0f) > std::numeric_limits<float>::epsilon()) {
182 C[i * incRowC + j * incColC] *= beta;
190 if (fabs(alpha - 1.0f) < std::numeric_limits<float>::epsilon()) {
193 C[i * incRowC + j * incColC] += AB[i + j *
BLOCK_SZ_MR];
201 C[i * incRowC + j * incColC] += alpha * AB[i + j *
BLOCK_SZ_MR];
219 if (fabs(alpha - 1.0f) > std::numeric_limits<float>::epsilon()) {
221 for (j = 0; j < n; ++j) {
222 for (i = 0; i < m; ++i) {
223 Y[i * incRowY + j * incColY] +=
224 alpha * X[i * incRowX + j * incColX];
230 for (j = 0; j < n; ++j) {
231 for (i = 0; i < m; ++i) {
232 Y[i * incRowY + j * incColY] += X[i * incRowX + j * incColX];
247 if (fabs(alpha - 0.0f) > std::numeric_limits<float>::epsilon()) {
248 for (j = 0; j < n; ++j) {
249 for (i = 0; i < m; ++i) {
250 X[i * incRowX + j * incColX] *= alpha;
256 for (j = 0; j < n; ++j) {
257 for (i = 0; i < m; ++i) {
258 X[i * incRowX + j * incColX] = 0.0f;
284 for (j = 0; j < np; ++j) {
285 nr = (j != np - 1 || _nr == 0) ?
BLOCK_SZ_NR : _nr;
287 for (i = 0; i < mp; ++i) {
288 mr = (i != mp - 1 || _mr == 0) ?
BLOCK_SZ_MR : _mr;
301 sgemm_micro_kernel(kc,
359 if (fabs(alpha) < std::numeric_limits<float>::epsilon() || k == 0) {
360 sgescal(m, n, beta,
C, incRowC, incColC);
364 for (j = 0; j < nb; ++j) {
365 nc = (j != nb - 1 || _nc == 0) ?
BLOCK_SZ_N : _nc;
367 for (l = 0; l < kb; ++l) {
368 kc = (l != kb - 1 || _kc == 0) ?
BLOCK_SZ_K : _kc;
369 _beta = (l == 0) ? beta : 1.0f;
379 for (i = 0; i < mb; ++i) {
380 mc = (i != mb - 1 || _mc == 0) ?
BLOCK_SZ_M : _mc;
static float SGEMM_BUFF_B[BLOCK_SZ_K *BLOCK_SZ_N]
void pack_micro_A(int k, const float *A, int incRowA, int incColA, float *buffer)
Packs micro panels of size BLOCK_SZ_MR rows by k columns from A without padding.
void sgemm_macro_kernel(int mc, int nc, int kc, float alpha, float beta, float *C, int incRowC, int incColC)
Macro kernel for the multiplication of blocks of A and B.
void pack_buffer_B(int kc, int nc, const float *B, int incRowB, int incColB, float *buffer)
Packs panels from B with padding if needed.
void sgeaxpy(int m, int n, float alpha, const float *X, int incRowX, int incColX, float *Y, int incRowY, int incColY)
Computes Y += alpha*X (single-precision AXPY: alpha*X plus Y).
void sgemm_micro_kernel(int kc, float alpha, const float *A, const float *B, float beta, float *C, int incRowC, int incColC)
Micro kernel that multiplies panels from A and B.
static float SGEMM_BUFF_A[BLOCK_SZ_M *BLOCK_SZ_K]
void pack_buffer_A(int mc, int kc, const float *A, int incRowA, int incColA, float *buffer)
Packs panels from A with padding if needed.
static float SGEMM_BUFF_C[BLOCK_SZ_MR *BLOCK_SZ_NR]
void sgemm_nn(int m, int n, int k, float alpha, const float *A, int incRowA, int incColA, const float *B, int incRowB, int incColB, float beta, float *C, int incRowC, int incColC)
Main SGEMM entry point; computes C <- beta*C + alpha*A*B.
void sgescal(int m, int n, float alpha, float *X, int incRowX, int incColX)
Scales elements of X by alpha.
void pack_micro_B(int k, const float *B, int incRowB, int incColB, float *buffer)
Packs micro panels of size BLOCK_SZ_NR columns by k rows from B without padding.