70 extern void dgemm_kernel_asm(
const double *
A,
97 const double *nextB) {
128 for (j = 0; j < k; ++j) {
130 buffer[i] =
A[i * incRowA];
149 for (i = 0; i < mp; ++i) {
150 pack_micro_A(kc,
A, incRowA, incColA, buffer);
155 for (j = 0; j < kc; ++j) {
156 for (i = 0; i < _mr; ++i) {
157 buffer[i] =
A[i * incRowA];
176 for (i = 0; i < k; ++i) {
178 buffer[j] =
B[j * incColB];
197 for (j = 0; j < np; ++j) {
198 pack_micro_B(kc,
B, incRowB, incColB, buffer);
203 for (i = 0; i < kc; ++i) {
204 for (j = 0; j < _nr; ++j) {
205 buffer[j] =
B[j * incColB];
228 if (fabs(alpha - 1.0) > std::numeric_limits<double>::epsilon()) {
230 for (j = 0; j < n; ++j) {
231 for (i = 0; i < m; ++i) {
232 Y[i * incRowY + j * incColY] +=
233 alpha * X[i * incRowX + j * incColX];
239 for (j = 0; j < n; ++j) {
240 for (i = 0; i < m; ++i) {
241 Y[i * incRowY + j * incColY] += X[i * incRowX + j * incColX];
256 if (fabs(alpha - 0.0) > std::numeric_limits<double>::epsilon()) {
257 for (j = 0; j < n; ++j) {
258 for (i = 0; i < m; ++i) {
259 X[i * incRowX + j * incColX] *= alpha;
265 for (j = 0; j < n; ++j) {
266 for (i = 0; i < m; ++i) {
267 X[i * incRowX + j * incColX] = 0.0;
295 const double *nextA =
nullptr;
296 const double *nextB =
nullptr;
300 for (j = 0; j < np; ++j) {
301 nr = (j != np - 1 || _nr == 0) ?
BLOCK_SZ_NR : _nr;
303 for (i = 0; i < mp; ++i) {
304 mr = (i != mp - 1 || _mr == 0) ?
BLOCK_SZ_MR : _mr;
337 dgemm_micro_kernel(kc,
348 dgemm_micro_kernel(kc,
408 if (fabs(alpha) < std::numeric_limits<double>::epsilon() || k == 0) {
409 dgescal(m, n, beta,
C, incRowC, incColC);
413 for (j = 0; j < nb; ++j) {
414 nc = (j != nb - 1 || _nc == 0) ?
BLOCK_SZ_N : _nc;
416 for (l = 0; l < kb; ++l) {
417 kc = (l != kb - 1 || _kc == 0) ?
BLOCK_SZ_K : _kc;
418 _beta = (l == 0) ? beta : 1.0;
428 for (i = 0; i < mb; ++i) {
429 mc = (i != mb - 1 || _mc == 0) ?
BLOCK_SZ_M : _mc;
void dgemm_micro_kernel(int kc, double alpha, const double *A, const double *B, double beta, double *C, int incRowC, int incColC)
Micro kernel: multiplies packed micro panels of A and B and accumulates the result into C, applying the alpha and beta scaling factors.
void pack_buffer_B(int kc, int nc, const double *B, int incRowB, int incColB, double *buffer)
Packs panels from B with padding if needed.
void pack_micro_A(int k, const double *A, int incRowA, int incColA, double *buffer)
Packs micro panels of size BLOCK_SZ_MR rows by k columns from A without padding.
void dgeaxpy(int m, int n, double alpha, const double *X, int incRowX, int incColX, double *Y, int incRowY, int incColY)
Computes Y += alpha*X (double-precision alpha*X plus Y, an AXPY-style update applied element-wise to matrices).
void pack_micro_B(int k, const double *B, int incRowB, int incColB, double *buffer)
Packs micro panels of size BLOCK_SZ_NR columns by k rows from B without padding.
void dgescal(int m, int n, double alpha, double *X, int incRowX, int incColX)
Scales elements of X by alpha.
void dgemm_macro_kernel(int mc, int nc, int kc, double alpha, double beta, double *C, int incRowC, int incColC)
Macro kernel for the multiplication of blocks of A and B.
void pack_buffer_A(int mc, int kc, const double *A, int incRowA, int incColA, double *buffer)
Packs panels from A with padding if needed.
void dgemm_nn(int m, int n, int k, double alpha, const double *A, int incRowA, int incColA, const double *B, int incRowB, int incColB, double beta, double *C, int incRowC, int incColC)
Main DGEMM entry point, computes C <- beta*C + alpha*A*B.