openGPMP
Open Source Mathematics Package
Public Member Functions | Static Public Member Functions | List of all members
gpmp::linalg::DGEMM Class Reference

Class for performing matrix multiplication on double type arrays. More...

#include <_dgemm.hpp>

Public Member Functions

void pack_micro_A (int k, const double *A, int incRowA, int incColA, double *buffer)
 Packs micro panels of size BLOCK_SZ_MR rows by k columns from A without padding. More...
 
void pack_buffer_A (int mc, int kc, const double *A, int incRowA, int incColA, double *buffer)
 Packs panels from A with padding if needed. More...
 
void pack_micro_B (int k, const double *B, int incRowB, int incColB, double *buffer)
 Packs micro panels of size BLOCK_SZ_NR columns by k rows from B without padding. More...
 
void pack_buffer_B (int kc, int nc, const double *B, int incRowB, int incColB, double *buffer)
 Packs panels from B with padding if needed. More...
 
void dgemm_micro_kernel (int kc, double alpha, const double *A, const double *B, double beta, double *C, int incRowC, int incColC)
 Computes the micro kernel that multiplies panels from A and B. More...
 
void dgemm_micro_kernel (long kc, double alpha, const double *A, const double *B, double beta, double *C, long incRowC, long incColC, const double *nextA, const double *nextB)
 Perform a micro-kernel operation for double-precision matrix-matrix multiplication (DGEMM) More...
 
void dgeaxpy (int m, int n, double alpha, const double *X, int incRowX, int incColX, double *Y, int incRowY, int incColY)
 Computes Y += alpha*X (double precision AX + Y) More...
 
void dgescal (int m, int n, double alpha, double *X, int incRowX, int incColX)
 Scales elements of X by alpha. More...
 
void dgemm_macro_kernel (int mc, int nc, int kc, double alpha, double beta, double *C, int incRowC, int incColC)
 Macro kernel for the multiplication of blocks of A and B. More...
 
void dgemm_nn (int m, int n, int k, double alpha, const double *A, int incRowA, int incColA, const double *B, int incRowB, int incColB, double beta, double *C, int incRowC, int incColC)
 Main DGEMM entrypoint, computes C <- beta*C + alpha*A*B. More...
 

Static Public Member Functions

static double DGEMM_BUFF_A[BLOCK_SZ_M * BLOCK_SZ_K] __attribute__((aligned(32)))
 
static double DGEMM_BUFF_B[BLOCK_SZ_K * BLOCK_SZ_N] __attribute__((aligned(32)))
 
static double DGEMM_BUFF_C[BLOCK_SZ_MR * BLOCK_SZ_NR] __attribute__((aligned(32)))
 

Detailed Description

Class for performing matrix multiplication on double type arrays.

Definition at line 50 of file _dgemm.hpp.

Member Function Documentation

◆ __attribute__() [1/3]

static double DGEMM_BUFF_A [BLOCK_SZ_M * BLOCK_SZ_K] gpmp::linalg::DGEMM::__attribute__ ( (aligned(32))  )
static

Buffer for storing packed micro panels of A

◆ __attribute__() [2/3]

static double DGEMM_BUFF_B [BLOCK_SZ_K * BLOCK_SZ_N] gpmp::linalg::DGEMM::__attribute__ ( (aligned(32))  )
static

Buffer for storing packed micro panels of B

◆ __attribute__() [3/3]

static double DGEMM_BUFF_C [BLOCK_SZ_MR * BLOCK_SZ_NR] gpmp::linalg::DGEMM::__attribute__ ( (aligned(32))  )
static

Buffer for storing intermediate results

◆ dgeaxpy()

void gpmp::linalg::DGEMM::dgeaxpy ( int  m,
int  n,
double  alpha,
const double *  X,
int  incRowX,
int  incColX,
double *  Y,
int  incRowY,
int  incColY 
)

Computes Y += alpha*X (double precision AX + Y)

Parameters
m: Number of rows
n: Number of columns
alpha: Scalar alpha
X: Pointer to matrix X
incRowX: Increment between consecutive rows of X
incColX: Increment between consecutive columns of X
Y: Pointer to matrix Y
incRowY: Increment between consecutive rows of Y
incColY: Increment between consecutive columns of Y

Definition at line 217 of file dgemm_arr.cpp.

225  {
226  int i, j;
227 
228  if (fabs(alpha - 1.0) > std::numeric_limits<double>::epsilon()) {
229 
230  for (j = 0; j < n; ++j) {
231  for (i = 0; i < m; ++i) {
232  Y[i * incRowY + j * incColY] +=
233  alpha * X[i * incRowX + j * incColX];
234  }
235  }
236  }
237 
238  else {
239  for (j = 0; j < n; ++j) {
240  for (i = 0; i < m; ++i) {
241  Y[i * incRowY + j * incColY] += X[i * incRowX + j * incColX];
242  }
243  }
244  }
245 }

◆ dgemm_macro_kernel()

void gpmp::linalg::DGEMM::dgemm_macro_kernel ( int  mc,
int  nc,
int  kc,
double  alpha,
double  beta,
double *  C,
int  incRowC,
int  incColC 
)

Macro kernel for the multiplication of blocks of A and B.

Parameters
mc: Number of rows in the block of C
nc: Number of columns in the block of C
kc: Number of columns in the block of A and rows in the block of B
alpha: Scalar alpha
beta: Scalar beta
C: Pointer to the output matrix C
incRowC: Increment between consecutive rows of C
incColC: Increment between consecutive columns of C

Definition at line 275 of file dgemm_arr.cpp.

282  {
283 
284  int mp = (mc + BLOCK_SZ_MR - 1) / BLOCK_SZ_MR;
285  int np = (nc + BLOCK_SZ_NR - 1) / BLOCK_SZ_NR;
286 
287  int _mr = mc % BLOCK_SZ_MR;
288  int _nr = nc % BLOCK_SZ_NR;
289 
290  int mr, nr;
291  int i, j;
292 
293 #if defined(__SSE__)
294 
295  const double *nextA = nullptr;
296  const double *nextB = nullptr;
297 
298 #endif
299 
300  for (j = 0; j < np; ++j) {
301  nr = (j != np - 1 || _nr == 0) ? BLOCK_SZ_NR : _nr;
302 
303  for (i = 0; i < mp; ++i) {
304  mr = (i != mp - 1 || _mr == 0) ? BLOCK_SZ_MR : _mr;
305 
306  if (mr == BLOCK_SZ_MR && nr == BLOCK_SZ_NR) {
307 #if defined(__SSE__)
308  dgemm_micro_kernel(
309  kc,
310  alpha,
311  &DGEMM_BUFF_A[i * kc * BLOCK_SZ_MR],
312  &DGEMM_BUFF_B[j * kc * BLOCK_SZ_NR],
313  beta,
314  &C[i * BLOCK_SZ_MR * incRowC + j * BLOCK_SZ_NR * incColC],
315  incRowC,
316  incColC,
317  nextA,
318  nextB);
319 
320 #else
321  dgemm_micro_kernel(
322  kc,
323  alpha,
324  &DGEMM_BUFF_A[i * kc * BLOCK_SZ_MR],
325  &DGEMM_BUFF_B[j * kc * BLOCK_SZ_NR],
326  beta,
327  &C[i * BLOCK_SZ_MR * incRowC + j * BLOCK_SZ_NR * incColC],
328  incRowC,
329  incColC);
330 
331 #endif
332  }
333 
334  else {
335 
336 #if defined(__SSE__)
337  dgemm_micro_kernel(kc,
338  alpha,
339  &DGEMM_BUFF_A[i * kc * BLOCK_SZ_MR],
340  &DGEMM_BUFF_B[j * kc * BLOCK_SZ_NR],
341  0.0,
342  DGEMM_BUFF_C,
343  1,
344  BLOCK_SZ_MR,
345  nextA,
346  nextB);
347 #else
348  dgemm_micro_kernel(kc,
349  alpha,
350  &DGEMM_BUFF_A[i * kc * BLOCK_SZ_MR],
351  &DGEMM_BUFF_B[j * kc * BLOCK_SZ_NR],
352  0.0,
353  DGEMM_BUFF_C,
354  1,
355  BLOCK_SZ_MR);
356 
357 #endif
358  dgescal(
359  mr,
360  nr,
361  beta,
362  &C[i * BLOCK_SZ_MR * incRowC + j * BLOCK_SZ_NR * incColC],
363  incRowC,
364  incColC);
365  dgeaxpy(
366  mr,
367  nr,
368  1.0,
369  DGEMM_BUFF_C,
370  1,
371  BLOCK_SZ_MR,
372  &C[i * BLOCK_SZ_MR * incRowC + j * BLOCK_SZ_NR * incColC],
373  incRowC,
374  incColC);
375  }
376  }
377  }
378 }
#define BLOCK_SZ_MR
Definition: _dgemm.hpp:40
#define BLOCK_SZ_NR
Definition: _dgemm.hpp:41
void dgemm_micro_kernel(int kc, double alpha, const double *A, const double *B, double beta, double *C, int incRowC, int incColC)
Computes the micro kernel that multiplies panels from A and B.
void dgeaxpy(int m, int n, double alpha, const double *X, int incRowX, int incColX, double *Y, int incRowY, int incColY)
Computes Y += alpha*X (double precision AX + Y)
Definition: dgemm_arr.cpp:217
void dgescal(int m, int n, double alpha, double *X, int incRowX, int incColX)
Scales elements of X by alpha.
Definition: dgemm_arr.cpp:248
list C
Definition: linalg.py:24

References BLOCK_SZ_MR, BLOCK_SZ_NR, and python.linalg::C.

◆ dgemm_micro_kernel() [1/2]

void gpmp::linalg::DGEMM::dgemm_micro_kernel ( int  kc,
double  alpha,
const double *  A,
const double *  B,
double  beta,
double *  C,
int  incRowC,
int  incColC 
)

Computes the micro kernel that multiplies panels from A and B.

Parameters
kc: Number of columns in the packed panel of A and rows in the packed panel of B
alpha: Scalar alpha
A: Pointer to the packed panel A
B: Pointer to the packed panel B
beta: Scalar beta
C: Pointer to the output matrix C
incRowC: Increment between consecutive rows of C
incColC: Increment between consecutive columns of C

Double precision GEneral Matrix-Matrix product kernel without acceleration (the overload the macro kernel calls when __SSE__ is not defined)

Definition at line 38 of file dgemm_kernel.cpp.

45  {
46  double AB[BLOCK_SZ_MR * BLOCK_SZ_NR];
47 
48  int i, j, l;
49 
50  // Compute AB = A*B
51  for (l = 0; l < BLOCK_SZ_MR * BLOCK_SZ_NR; ++l) {
52  AB[l] = 0;
53  }
54  for (l = 0; l < kc; ++l) {
55  for (j = 0; j < BLOCK_SZ_NR; ++j) {
56  for (i = 0; i < BLOCK_SZ_MR; ++i) {
57  AB[i + j * BLOCK_SZ_MR] += A[i] * B[j];
58  }
59  }
60  A += BLOCK_SZ_MR;
61  B += BLOCK_SZ_NR;
62  }
63 
64  // Update C <- beta*C
65  if (fabs(beta - 0.0) < std::numeric_limits<double>::epsilon()) {
66  for (j = 0; j < BLOCK_SZ_NR; ++j) {
67  for (i = 0; i < BLOCK_SZ_MR; ++i) {
68  C[i * incRowC + j * incColC] = 0.0;
69  }
70  }
71  } else if (fabs(beta - 1.0) > std::numeric_limits<double>::epsilon()) {
72  for (j = 0; j < BLOCK_SZ_NR; ++j) {
73  for (i = 0; i < BLOCK_SZ_MR; ++i) {
74  C[i * incRowC + j * incColC] *= beta;
75  }
76  }
77  }
78 
79  // Update C <- C + alpha*AB (note: the case alpha==0.0 was already treated
80  // in
81  // the above layer dgemm_nn)
82  if (fabs(alpha - 1.0) < std::numeric_limits<double>::epsilon()) {
83  for (j = 0; j < BLOCK_SZ_NR; ++j) {
84  for (i = 0; i < BLOCK_SZ_MR; ++i) {
85  C[i * incRowC + j * incColC] += AB[i + j * BLOCK_SZ_MR];
86  }
87  }
88  }
89 
90  else {
91  for (j = 0; j < BLOCK_SZ_NR; ++j) {
92  for (i = 0; i < BLOCK_SZ_MR; ++i) {
93  C[i * incRowC + j * incColC] += alpha * AB[i + j * BLOCK_SZ_MR];
94  }
95  }
96  }
97 }
list A
Definition: linalg.py:22
list B
Definition: linalg.py:23

References python.linalg::A, python.linalg::B, BLOCK_SZ_MR, BLOCK_SZ_NR, and python.linalg::C.

◆ dgemm_micro_kernel() [2/2]

void gpmp::linalg::DGEMM::dgemm_micro_kernel ( long  kc,
double  alpha,
const double *  A,
const double *  B,
double  beta,
double *  C,
long  incRowC,
long  incColC,
const double *  nextA,
const double *  nextB 
)

Perform a micro-kernel operation for double-precision matrix-matrix multiplication (DGEMM)

This function implements a micro-kernel operation for DGEMM, which is used as a building block in larger DGEMM routines. The micro-kernel performs a basic matrix multiplication operation with optimizations tailored for x86 processors with SSE (Streaming SIMD Extensions).

Parameters
kc: The size of the inner dimension of the matrices A and B
alpha: Scaling factor for the matrix multiplication
A: Pointer to the first input matrix A in row-major order
B: Pointer to the second input matrix B in column-major order
beta: Scaling factor for the matrix C
C: Pointer to the output matrix C in row-major order
incRowC: Increment for moving between rows of the matrix C
incColC: Increment for moving between columns of the matrix C
nextA: Pointer to the next block of matrix A (unused in this micro-kernel)
nextB: Pointer to the next block of matrix B (unused in this micro-kernel)

Double precision GEneral Matrix-Matrix product

Definition at line 88 of file dgemm_arr.cpp.

97  {
98  long kb = kc / 4;
99  long kl = kc % 4;
100 
101  dgemm_kernel_asm(A,
102  B,
103  C,
104  nextA,
105  nextB,
106  kl,
107  kb,
108  incRowC,
109  incColC,
110  alpha,
111  beta);
112 }

References python.linalg::A, python.linalg::B, and python.linalg::C.

◆ dgemm_nn()

void gpmp::linalg::DGEMM::dgemm_nn ( int  m,
int  n,
int  k,
double  alpha,
const double *  A,
int  incRowA,
int  incColA,
const double *  B,
int  incRowB,
int  incColB,
double  beta,
double *  C,
int  incRowC,
int  incColC 
)

Main DGEMM entrypoint, computes C <- beta*C + alpha*A*B.

Parameters
m: Number of rows of A and rows of C
n: Number of columns of B and columns of C
k: Number of columns of A and rows of B
alpha: Scalar alpha
A: Pointer to matrix A
incRowA: Increment between consecutive rows of A
incColA: Increment between consecutive columns of A
B: Pointer to matrix B
incRowB: Increment between consecutive rows of B
incColB: Increment between consecutive columns of B
beta: Scalar beta
C: Pointer to matrix C
incRowC: Increment between consecutive rows of C
incColC: Increment between consecutive columns of C

Definition at line 381 of file dgemm_arr.cpp.

394  {
395  int mb = (m + BLOCK_SZ_M - 1) / BLOCK_SZ_M;
396  int nb = (n + BLOCK_SZ_N - 1) / BLOCK_SZ_N;
397  int kb = (k + BLOCK_SZ_K - 1) / BLOCK_SZ_K;
398 
399  int _mc = m % BLOCK_SZ_M;
400  int _nc = n % BLOCK_SZ_N;
401  int _kc = k % BLOCK_SZ_K;
402 
403  int mc, nc, kc;
404  int i, j, l;
405 
406  double _beta;
407 
408  if (fabs(alpha) < std::numeric_limits<double>::epsilon() || k == 0) {
409  dgescal(m, n, beta, C, incRowC, incColC);
410  return;
411  }
412 
413  for (j = 0; j < nb; ++j) {
414  nc = (j != nb - 1 || _nc == 0) ? BLOCK_SZ_N : _nc;
415 
416  for (l = 0; l < kb; ++l) {
417  kc = (l != kb - 1 || _kc == 0) ? BLOCK_SZ_K : _kc;
418  _beta = (l == 0) ? beta : 1.0;
419 
420  pack_buffer_B(
421  kc,
422  nc,
423  &B[l * BLOCK_SZ_K * incRowB + j * BLOCK_SZ_N * incColB],
424  incRowB,
425  incColB,
426  DGEMM_BUFF_B);
427 
428  for (i = 0; i < mb; ++i) {
429  mc = (i != mb - 1 || _mc == 0) ? BLOCK_SZ_M : _mc;
430 
431  pack_buffer_A(
432  mc,
433  kc,
434  &A[i * BLOCK_SZ_M * incRowA + l * BLOCK_SZ_K * incColA],
435  incRowA,
436  incColA,
437  DGEMM_BUFF_A);
438 
439  dgemm_macro_kernel(
440  mc,
441  nc,
442  kc,
443  alpha,
444  _beta,
445  &C[i * BLOCK_SZ_M * incRowC + j * BLOCK_SZ_N * incColC],
446  incRowC,
447  incColC);
448  }
449  }
450  }
451 }
#define BLOCK_SZ_M
Definition: _dgemm.hpp:37
#define BLOCK_SZ_N
Definition: _dgemm.hpp:39
#define BLOCK_SZ_K
Definition: _dgemm.hpp:38
void pack_buffer_B(int kc, int nc, const double *B, int incRowB, int incColB, double *buffer)
Packs panels from B with padding if needed.
Definition: dgemm_arr.cpp:186
void dgemm_macro_kernel(int mc, int nc, int kc, double alpha, double beta, double *C, int incRowC, int incColC)
Macro kernel for the multiplication of blocks of A and B.
Definition: dgemm_arr.cpp:275
void pack_buffer_A(int mc, int kc, const double *A, int incRowA, int incColA, double *buffer)
Packs panels from A with padding if needed.
Definition: dgemm_arr.cpp:138

References python.linalg::A, python.linalg::B, BLOCK_SZ_K, BLOCK_SZ_M, BLOCK_SZ_N, and python.linalg::C.

◆ dgescal()

void gpmp::linalg::DGEMM::dgescal ( int  m,
int  n,
double  alpha,
double *  X,
int  incRowX,
int  incColX 
)

Scales elements of X by alpha.

Parameters
m: Number of rows
n: Number of columns
alpha: Scalar alpha
X: Pointer to matrix X
incRowX: Increment between consecutive rows of X
incColX: Increment between consecutive columns of X

Definition at line 248 of file dgemm_arr.cpp.

253  {
254  int i, j;
255 
256  if (fabs(alpha - 0.0) > std::numeric_limits<double>::epsilon()) {
257  for (j = 0; j < n; ++j) {
258  for (i = 0; i < m; ++i) {
259  X[i * incRowX + j * incColX] *= alpha;
260  }
261  }
262  }
263 
264  else {
265  for (j = 0; j < n; ++j) {
266  for (i = 0; i < m; ++i) {
267  X[i * incRowX + j * incColX] = 0.0;
268  }
269  }
270  }
271 }

◆ pack_buffer_A()

void gpmp::linalg::DGEMM::pack_buffer_A ( int  mc,
int  kc,
const double *  A,
int  incRowA,
int  incColA,
double *  buffer 
)

Packs panels from A with padding if needed.

Parameters
mc: Number of rows to pack
kc: Number of columns to pack
A: Pointer to the source matrix A
incRowA: Increment between consecutive rows of A
incColA: Increment between consecutive columns of A
buffer: Pointer to the buffer to store the packed panels

Definition at line 138 of file dgemm_arr.cpp.

143  {
144  int mp = mc / BLOCK_SZ_MR;
145  int _mr = mc % BLOCK_SZ_MR;
146 
147  int i, j;
148 
149  for (i = 0; i < mp; ++i) {
150  pack_micro_A(kc, A, incRowA, incColA, buffer);
151  buffer += kc * BLOCK_SZ_MR;
152  A += BLOCK_SZ_MR * incRowA;
153  }
154  if (_mr > 0) {
155  for (j = 0; j < kc; ++j) {
156  for (i = 0; i < _mr; ++i) {
157  buffer[i] = A[i * incRowA];
158  }
159  for (i = _mr; i < BLOCK_SZ_MR; ++i) {
160  buffer[i] = 0.0;
161  }
162  buffer += BLOCK_SZ_MR;
163  A += incColA;
164  }
165  }
166 }
void pack_micro_A(int k, const double *A, int incRowA, int incColA, double *buffer)
Packs micro panels of size BLOCK_SZ_MR rows by k columns from A without padding.
Definition: dgemm_arr.cpp:121

References python.linalg::A, and BLOCK_SZ_MR.

◆ pack_buffer_B()

void gpmp::linalg::DGEMM::pack_buffer_B ( int  kc,
int  nc,
const double *  B,
int  incRowB,
int  incColB,
double *  buffer 
)

Packs panels from B with padding if needed.

Parameters
kc: Number of rows to pack
nc: Number of columns to pack
B: Pointer to the source matrix B
incRowB: Increment between consecutive rows of B
incColB: Increment between consecutive columns of B
buffer: Pointer to the buffer to store the packed panels

Definition at line 186 of file dgemm_arr.cpp.

191  {
192  int np = nc / BLOCK_SZ_NR;
193  int _nr = nc % BLOCK_SZ_NR;
194 
195  int i, j;
196 
197  for (j = 0; j < np; ++j) {
198  pack_micro_B(kc, B, incRowB, incColB, buffer);
199  buffer += kc * BLOCK_SZ_NR;
200  B += BLOCK_SZ_NR * incColB;
201  }
202  if (_nr > 0) {
203  for (i = 0; i < kc; ++i) {
204  for (j = 0; j < _nr; ++j) {
205  buffer[j] = B[j * incColB];
206  }
207  for (j = _nr; j < BLOCK_SZ_NR; ++j) {
208  buffer[j] = 0.0;
209  }
210  buffer += BLOCK_SZ_NR;
211  B += incRowB;
212  }
213  }
214 }
void pack_micro_B(int k, const double *B, int incRowB, int incColB, double *buffer)
Packs micro panels of size BLOCK_SZ_NR columns by k rows from B without padding.
Definition: dgemm_arr.cpp:169

References python.linalg::B, and BLOCK_SZ_NR.

◆ pack_micro_A()

void gpmp::linalg::DGEMM::pack_micro_A ( int  k,
const double *  A,
int  incRowA,
int  incColA,
double *  buffer 
)

Packs micro panels of size BLOCK_SZ_MR rows by k columns from A without padding.

Parameters
k: Number of columns to pack
A: Pointer to the source matrix A
incRowA: Increment between consecutive rows of A
incColA: Increment between consecutive columns of A
buffer: Pointer to the buffer to store the packed micro panels

Definition at line 121 of file dgemm_arr.cpp.

125  {
126  int i, j;
127 
128  for (j = 0; j < k; ++j) {
129  for (i = 0; i < BLOCK_SZ_MR; ++i) {
130  buffer[i] = A[i * incRowA];
131  }
132  buffer += BLOCK_SZ_MR;
133  A += incColA;
134  }
135 }

References python.linalg::A, and BLOCK_SZ_MR.

◆ pack_micro_B()

void gpmp::linalg::DGEMM::pack_micro_B ( int  k,
const double *  B,
int  incRowB,
int  incColB,
double *  buffer 
)

Packs micro panels of size BLOCK_SZ_NR columns by k rows from B without padding.

Parameters
k: Number of rows to pack
B: Pointer to the source matrix B
incRowB: Increment between consecutive rows of B
incColB: Increment between consecutive columns of B
buffer: Pointer to the buffer to store the packed micro panels

Definition at line 169 of file dgemm_arr.cpp.

173  {
174  int i, j;
175 
176  for (i = 0; i < k; ++i) {
177  for (j = 0; j < BLOCK_SZ_NR; ++j) {
178  buffer[j] = B[j * incColB];
179  }
180  buffer += BLOCK_SZ_NR;
181  B += incRowB;
182  }
183 }

References python.linalg::B, and BLOCK_SZ_NR.


The documentation for this class was generated from the following files: