57 for (j = 0; j < k; ++j) {
59 buffer[i] =
A[i * incRowA];
78 for (i = 0; i < mp; ++i) {
79 pack_micro_A(kc,
A, incRowA, incColA, buffer);
84 for (j = 0; j < kc; ++j) {
85 for (i = 0; i < _mr; ++i) {
86 buffer[i] =
A[i * incRowA];
105 for (i = 0; i < k; ++i) {
107 buffer[j] =
B[j * incColB];
126 for (j = 0; j < np; ++j) {
127 pack_micro_B(kc,
B, incRowB, incColB, buffer);
132 for (i = 0; i < kc; ++i) {
133 for (j = 0; j < _nr; ++j) {
134 buffer[j] =
B[j * incColB];
162 for (l = 0; l < kc; ++l) {
176 C[i * incRowC + j * incColC] = 0;
179 }
else if (beta != 1) {
182 C[i * incRowC + j * incColC] *= beta;
193 C[i * incRowC + j * incColC] += AB[i + j *
BLOCK_SZ_MR];
201 C[i * incRowC + j * incColC] += alpha * AB[i + j *
BLOCK_SZ_MR];
220 for (j = 0; j < n; ++j) {
221 for (i = 0; i < m; ++i) {
222 Y[i * incRowY + j * incColY] +=
223 alpha * X[i * incRowX + j * incColX];
229 for (j = 0; j < n; ++j) {
230 for (i = 0; i < m; ++i) {
231 Y[i * incRowY + j * incColY] += X[i * incRowX + j * incColX];
247 for (j = 0; j < n; ++j) {
248 for (i = 0; i < m; ++i) {
249 X[i * incRowX + j * incColX] *= alpha;
255 for (j = 0; j < n; ++j) {
256 for (i = 0; i < m; ++i) {
257 X[i * incRowX + j * incColX] = 0;
283 for (j = 0; j < np; ++j) {
284 nr = (j != np - 1 || _nr == 0) ?
BLOCK_SZ_NR : _nr;
286 for (i = 0; i < mp; ++i) {
287 mr = (i != mp - 1 || _mr == 0) ?
BLOCK_SZ_MR : _mr;
300 igemm_micro_kernel(kc,
358 if (alpha == 0 || k == 0) {
359 igescal(m, n, beta,
C, incRowC, incColC);
363 for (j = 0; j < nb; ++j) {
364 nc = (j != nb - 1 || _nc == 0) ?
BLOCK_SZ_N : _nc;
366 for (l = 0; l < kb; ++l) {
367 kc = (l != kb - 1 || _kc == 0) ?
BLOCK_SZ_K : _kc;
368 _beta = (l == 0) ? beta : 1.0;
378 for (i = 0; i < mb; ++i) {
379 mc = (i != mb - 1 || _mc == 0) ?
BLOCK_SZ_M : _mc;
void igemm_macro_kernel(int mc, int nc, int kc, int alpha, int beta, int *C, int incRowC, int incColC)
Macro kernel for the multiplication of blocks of A and B.
void igemm_nn(int m, int n, int k, int alpha, const int *A, int incRowA, int incColA, const int *B, int incRowB, int incColB, int beta, int *C, int incRowC, int incColC)
Main IGEMM entry point; computes C <- beta*C + alpha*A*B.
void pack_buffer_B(int kc, int nc, const int *B, int incRowB, int incColB, int *buffer)
Packs panels from B with padding if needed.
void igeaxpy(int m, int n, int alpha, const int *X, int incRowX, int incColX, int *Y, int incRowY, int incColY)
Computes Y += alpha*X (integer AXPY: alpha*X + Y).
void igemm_micro_kernel(int kc, int alpha, const int *A, const int *B, int beta, int *C, int incRowC, int incColC)
Micro kernel for the multiplication of micro panels from A and B.
void igescal(int m, int n, int alpha, int *X, int incRowX, int incColX)
Scales elements of X by alpha.
static int IGEMM_BUFF_B[BLOCK_SZ_K *BLOCK_SZ_N]
void pack_micro_A(int k, const int *A, int incRowA, int incColA, int *buffer)
Packs micro panels of size BLOCK_SZ_MR rows by k columns from A without padding.
static int IGEMM_BUFF_A[BLOCK_SZ_M *BLOCK_SZ_K]
void pack_micro_B(int k, const int *B, int incRowB, int incColB, int *buffer)
Packs micro panels of size BLOCK_SZ_NR columns by k rows from B without padding.
static int IGEMM_BUFF_C[BLOCK_SZ_MR *BLOCK_SZ_NR]
void pack_buffer_A(int mc, int kc, const int *A, int incRowA, int incColA, int *buffer)
Packs panels from A with padding if needed.