openGPMP
Open Source Mathematics Package
Public Member Functions | Static Public Attributes | List of all members
gpmp::linalg::SGEMM Class Reference

Class for performing matrix multiplication on float type arrays. More...

#include <_sgemm.hpp>

Public Member Functions

void pack_micro_A (int k, const float *A, int incRowA, int incColA, float *buffer)
 Packs micro panels of size BLOCK_SZ_MR rows by k columns from A without padding. More...
 
void pack_buffer_A (int mc, int kc, const float *A, int incRowA, int incColA, float *buffer)
 Packs panels from A with padding if needed. More...
 
void pack_micro_B (int k, const float *B, int incRowB, int incColB, float *buffer)
 Packs micro panels of size BLOCK_SZ_NR columns by k rows from B without padding. More...
 
void pack_buffer_B (int kc, int nc, const float *B, int incRowB, int incColB, float *buffer)
 Packs panels from B with padding if needed. More...
 
void sgemm_micro_kernel (int kc, float alpha, const float *A, const float *B, float beta, float *C, int incRowC, int incColC)
 Computes the micro kernel that multiplies panels from A and B. More...
 
void sgeaxpy (int m, int n, float alpha, const float *X, int incRowX, int incColX, float *Y, int incRowY, int incColY)
 Computes Y += alpha*X (float precision AX + Y) More...
 
void sgescal (int m, int n, float alpha, float *X, int incRowX, int incColX)
 Scales elements of X by alpha. More...
 
void sgemm_macro_kernel (int mc, int nc, int kc, float alpha, float beta, float *C, int incRowC, int incColC)
 Macro kernel for the multiplication of blocks of A and B. More...
 
void sgemm_nn (int m, int n, int k, float alpha, const float *A, int incRowA, int incColA, const float *B, int incRowB, int incColB, float beta, float *C, int incRowC, int incColC)
 Main SGEMM entrypoint, computes C <- beta*C + alpha*A*B. More...
 

Static Public Attributes

static float SGEMM_BUFF_A [BLOCK_SZ_M *BLOCK_SZ_K]
 
static float SGEMM_BUFF_B [BLOCK_SZ_K *BLOCK_SZ_N]
 
static float SGEMM_BUFF_C [BLOCK_SZ_MR *BLOCK_SZ_NR]
 

Detailed Description

Class for performing matrix multiplication on float type arrays.

Definition at line 50 of file _sgemm.hpp.

Member Function Documentation

◆ pack_buffer_A()

void gpmp::linalg::SGEMM::pack_buffer_A ( int  mc,
int  kc,
const float *  A,
int  incRowA,
int  incColA,
float *  buffer 
)

Packs panels from A with padding if needed.

Parameters
mc: Number of rows to pack
kc: Number of columns to pack
A: Pointer to the source matrix A
incRowA: Increment between consecutive rows of A
incColA: Increment between consecutive columns of A
buffer: Pointer to the buffer to store the packed panels

Definition at line 67 of file sgemm_arr.cpp.

72  {
73  int mp = mc / BLOCK_SZ_MR;
74  int _mr = mc % BLOCK_SZ_MR;
75 
76  int i, j;
77 
78  for (i = 0; i < mp; ++i) {
79  pack_micro_A(kc, A, incRowA, incColA, buffer);
80  buffer += kc * BLOCK_SZ_MR;
81  A += BLOCK_SZ_MR * incRowA;
82  }
83  if (_mr > 0) {
84  for (j = 0; j < kc; ++j) {
85  for (i = 0; i < _mr; ++i) {
86  buffer[i] = A[i * incRowA];
87  }
88  for (i = _mr; i < BLOCK_SZ_MR; ++i) {
89  buffer[i] = 0.0f;
90  }
91  buffer += BLOCK_SZ_MR;
92  A += incColA;
93  }
94  }
95 }
#define BLOCK_SZ_MR
Definition: _dgemm.hpp:40
void pack_micro_A(int k, const float *A, int incRowA, int incColA, float *buffer)
Packs micro panels of size BLOCK_SZ_MR rows by k columns from A without padding.
Definition: sgemm_arr.cpp:50
list A
Definition: linalg.py:22

References python.linalg::A, and BLOCK_SZ_MR.

◆ pack_buffer_B()

void gpmp::linalg::SGEMM::pack_buffer_B ( int  kc,
int  nc,
const float *  B,
int  incRowB,
int  incColB,
float *  buffer 
)

Packs panels from B with padding if needed.

Parameters
kc: Number of rows to pack
nc: Number of columns to pack
B: Pointer to the source matrix B
incRowB: Increment between consecutive rows of B
incColB: Increment between consecutive columns of B
buffer: Pointer to the buffer to store the packed panels

Definition at line 115 of file sgemm_arr.cpp.

120  {
121  int np = nc / BLOCK_SZ_NR;
122  int _nr = nc % BLOCK_SZ_NR;
123 
124  int i, j;
125 
126  for (j = 0; j < np; ++j) {
127  pack_micro_B(kc, B, incRowB, incColB, buffer);
128  buffer += kc * BLOCK_SZ_NR;
129  B += BLOCK_SZ_NR * incColB;
130  }
131  if (_nr > 0) {
132  for (i = 0; i < kc; ++i) {
133  for (j = 0; j < _nr; ++j) {
134  buffer[j] = B[j * incColB];
135  }
136  for (j = _nr; j < BLOCK_SZ_NR; ++j) {
137  buffer[j] = 0.0f;
138  }
139  buffer += BLOCK_SZ_NR;
140  B += incRowB;
141  }
142  }
143 }
#define BLOCK_SZ_NR
Definition: _dgemm.hpp:41
void pack_micro_B(int k, const float *B, int incRowB, int incColB, float *buffer)
Packs micro panels of size BLOCK_SZ_NR columns by k rows from B without padding.
Definition: sgemm_arr.cpp:98
list B
Definition: linalg.py:23

References python.linalg::B, and BLOCK_SZ_NR.

◆ pack_micro_A()

void gpmp::linalg::SGEMM::pack_micro_A ( int  k,
const float *  A,
int  incRowA,
int  incColA,
float *  buffer 
)

Packs micro panels of size BLOCK_SZ_MR rows by k columns from A without padding.

Parameters
k: Number of columns to pack
A: Pointer to the source matrix A
incRowA: Increment between consecutive rows of A
incColA: Increment between consecutive columns of A
buffer: Pointer to the buffer to store the packed micro panels

Definition at line 50 of file sgemm_arr.cpp.

54  {
55  int i, j;
56 
57  for (j = 0; j < k; ++j) {
58  for (i = 0; i < BLOCK_SZ_MR; ++i) {
59  buffer[i] = A[i * incRowA];
60  }
61  buffer += BLOCK_SZ_MR;
62  A += incColA;
63  }
64 }

References python.linalg::A, and BLOCK_SZ_MR.

◆ pack_micro_B()

void gpmp::linalg::SGEMM::pack_micro_B ( int  k,
const float *  B,
int  incRowB,
int  incColB,
float *  buffer 
)

Packs micro panels of size BLOCK_SZ_NR columns by k rows from B without padding.

Parameters
k: Number of rows to pack
B: Pointer to the source matrix B
incRowB: Increment between consecutive rows of B
incColB: Increment between consecutive columns of B
buffer: Pointer to the buffer to store the packed micro panels

Definition at line 98 of file sgemm_arr.cpp.

102  {
103  int i, j;
104 
105  for (i = 0; i < k; ++i) {
106  for (j = 0; j < BLOCK_SZ_NR; ++j) {
107  buffer[j] = B[j * incColB];
108  }
109  buffer += BLOCK_SZ_NR;
110  B += incRowB;
111  }
112 }

References python.linalg::B, and BLOCK_SZ_NR.

◆ sgeaxpy()

void gpmp::linalg::SGEMM::sgeaxpy ( int  m,
int  n,
float  alpha,
const float *  X,
int  incRowX,
int  incColX,
float *  Y,
int  incRowY,
int  incColY 
)

Computes Y += alpha*X (float precision AX + Y)

Parameters
m: Number of rows of X and Y
n: Number of columns of X and Y
alpha: Scalar alpha that multiplies X
X: Pointer to matrix X (read only)
incRowX: Increment between consecutive rows of X
incColX: Increment between consecutive columns of X
Y: Pointer to matrix Y (updated in place)
incRowY: Increment between consecutive rows of Y
incColY: Increment between consecutive columns of Y

Definition at line 208 of file sgemm_arr.cpp.

216  {
217  int i, j;
218 
219  if (fabs(alpha - 1.0f) > std::numeric_limits<float>::epsilon()) {
220 
221  for (j = 0; j < n; ++j) {
222  for (i = 0; i < m; ++i) {
223  Y[i * incRowY + j * incColY] +=
224  alpha * X[i * incRowX + j * incColX];
225  }
226  }
227  }
228 
229  else {
230  for (j = 0; j < n; ++j) {
231  for (i = 0; i < m; ++i) {
232  Y[i * incRowY + j * incColY] += X[i * incRowX + j * incColX];
233  }
234  }
235  }
236 }

◆ sgemm_macro_kernel()

void gpmp::linalg::SGEMM::sgemm_macro_kernel ( int  mc,
int  nc,
int  kc,
float  alpha,
float  beta,
float *  C,
int  incRowC,
int  incColC 
)

Macro kernel for the multiplication of blocks of A and B.

Parameters
mc: Number of rows in the block of C
nc: Number of columns in the block of C
kc: Number of columns in the packed block of A and rows in the packed block of B
alpha: Scalar alpha
beta: Scalar beta
C: Pointer to the output matrix C
incRowC: Increment between consecutive rows of C
incColC: Increment between consecutive columns of C

Definition at line 266 of file sgemm_arr.cpp.

273  {
274 
275  int mp = (mc + BLOCK_SZ_MR - 1) / BLOCK_SZ_MR;
276  int np = (nc + BLOCK_SZ_NR - 1) / BLOCK_SZ_NR;
277 
278  int _mr = mc % BLOCK_SZ_MR;
279  int _nr = nc % BLOCK_SZ_NR;
280 
281  int mr, nr;
282  int i, j;
283 
284  for (j = 0; j < np; ++j) {
285  nr = (j != np - 1 || _nr == 0) ? BLOCK_SZ_NR : _nr;
286 
287  for (i = 0; i < mp; ++i) {
288  mr = (i != mp - 1 || _mr == 0) ? BLOCK_SZ_MR : _mr;
289 
290  if (mr == BLOCK_SZ_MR && nr == BLOCK_SZ_NR) {
291  sgemm_micro_kernel(
292  kc,
293  alpha,
294  &SGEMM_BUFF_A[i * kc * BLOCK_SZ_MR],
295  &SGEMM_BUFF_B[j * kc * BLOCK_SZ_NR],
296  beta,
297  &C[i * BLOCK_SZ_MR * incRowC + j * BLOCK_SZ_NR * incColC],
298  incRowC,
299  incColC);
300  } else {
301  sgemm_micro_kernel(kc,
302  alpha,
303  &SGEMM_BUFF_A[i * kc * BLOCK_SZ_MR],
304  &SGEMM_BUFF_B[j * kc * BLOCK_SZ_NR],
305  0.0f,
306  SGEMM_BUFF_C,
307  1,
308  BLOCK_SZ_MR);
309  sgescal(
310  mr,
311  nr,
312  beta,
313  &C[i * BLOCK_SZ_MR * incRowC + j * BLOCK_SZ_NR * incColC],
314  incRowC,
315  incColC);
316  sgeaxpy(
317  mr,
318  nr,
319  1.0f,
320  SGEMM_BUFF_C,
321  1,
322  BLOCK_SZ_MR,
323  &C[i * BLOCK_SZ_MR * incRowC + j * BLOCK_SZ_NR * incColC],
324  incRowC,
325  incColC);
326  }
327  }
328  }
329 }
static float SGEMM_BUFF_B[BLOCK_SZ_K *BLOCK_SZ_N]
Definition: _sgemm.hpp:55
void sgeaxpy(int m, int n, float alpha, const float *X, int incRowX, int incColX, float *Y, int incRowY, int incColY)
Computes Y += alpha*X (float precision AX + Y)
Definition: sgemm_arr.cpp:208
void sgemm_micro_kernel(int kc, float alpha, const float *A, const float *B, float beta, float *C, int incRowC, int incColC)
Computes the micro kernel that multiplies panels from A and B.
Definition: sgemm_arr.cpp:146
static float SGEMM_BUFF_A[BLOCK_SZ_M *BLOCK_SZ_K]
Definition: _sgemm.hpp:53
static float SGEMM_BUFF_C[BLOCK_SZ_MR *BLOCK_SZ_NR]
Definition: _sgemm.hpp:57
void sgescal(int m, int n, float alpha, float *X, int incRowX, int incColX)
Scales elements of X by alpha.
Definition: sgemm_arr.cpp:239
list C
Definition: linalg.py:24

References BLOCK_SZ_MR, BLOCK_SZ_NR, and python.linalg::C.

◆ sgemm_micro_kernel()

void gpmp::linalg::SGEMM::sgemm_micro_kernel ( int  kc,
float  alpha,
const float *  A,
const float *  B,
float  beta,
float *  C,
int  incRowC,
int  incColC 
)

Computes the micro kernel that multiplies panels from A and B.

Parameters
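kc: Number of columns of the packed panel A (equal to the number of rows of the packed panel B)
alpha: Scalar alpha
A: Pointer to the packed panel A
B: Pointer to the packed panel B
beta: Scalar beta
C: Pointer to the BLOCK_SZ_MR by BLOCK_SZ_NR block of the output matrix C
incRowC: Increment between consecutive rows of C
incColC: Increment between consecutive columns of C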

Definition at line 146 of file sgemm_arr.cpp.

153  {
154  float AB[BLOCK_SZ_MR * BLOCK_SZ_NR];
155 
156  int i, j, l;
157 
158  // Compute AB = A*B
159  for (l = 0; l < BLOCK_SZ_MR * BLOCK_SZ_NR; ++l) {
160  AB[l] = 0;
161  }
162  for (l = 0; l < kc; ++l) {
163  for (j = 0; j < BLOCK_SZ_NR; ++j) {
164  for (i = 0; i < BLOCK_SZ_MR; ++i) {
165  AB[i + j * BLOCK_SZ_MR] += A[i] * B[j];
166  }
167  }
168  A += BLOCK_SZ_MR;
169  B += BLOCK_SZ_NR;
170  }
171 
172  // Update C <- beta*C
173  if (fabs(beta - 0.0f) < std::numeric_limits<float>::epsilon()) {
174  for (j = 0; j < BLOCK_SZ_NR; ++j) {
175  for (i = 0; i < BLOCK_SZ_MR; ++i) {
176  C[i * incRowC + j * incColC] = 0.0f;
177  }
178  }
179  } else if (fabs(beta - 1.0f) > std::numeric_limits<float>::epsilon()) {
180  for (j = 0; j < BLOCK_SZ_NR; ++j) {
181  for (i = 0; i < BLOCK_SZ_MR; ++i) {
182  C[i * incRowC + j * incColC] *= beta;
183  }
184  }
185  }
186 
187  // Update C <- C + alpha*AB (note: the case alpha==0.0f was already treated
188  // in
189  // the above layer sgemm_nn)
190  if (fabs(alpha - 1.0f) < std::numeric_limits<float>::epsilon()) {
191  for (j = 0; j < BLOCK_SZ_NR; ++j) {
192  for (i = 0; i < BLOCK_SZ_MR; ++i) {
193  C[i * incRowC + j * incColC] += AB[i + j * BLOCK_SZ_MR];
194  }
195  }
196  }
197 
198  else {
199  for (j = 0; j < BLOCK_SZ_NR; ++j) {
200  for (i = 0; i < BLOCK_SZ_MR; ++i) {
201  C[i * incRowC + j * incColC] += alpha * AB[i + j * BLOCK_SZ_MR];
202  }
203  }
204  }
205 }

References python.linalg::A, python.linalg::B, BLOCK_SZ_MR, BLOCK_SZ_NR, and python.linalg::C.

◆ sgemm_nn()

void gpmp::linalg::SGEMM::sgemm_nn ( int  m,
int  n,
int  k,
float  alpha,
const float *  A,
int  incRowA,
int  incColA,
const float *  B,
int  incRowB,
int  incColB,
float  beta,
float *  C,
int  incRowC,
int  incColC 
)

Main SGEMM entrypoint, computes C <- beta*C + alpha*A*B.

Parameters
m: Number of rows of A and rows of C
n: Number of columns of B and columns of C
k: Number of columns of A and rows of B
alpha: Scalar alpha
A: Pointer to matrix A
incRowA: Increment between consecutive rows of A
incColA: Increment between consecutive columns of A
B: Pointer to matrix B
incRowB: Increment between consecutive rows of B
incColB: Increment between consecutive columns of B
beta: Scalar beta
C: Pointer to matrix C
incRowC: Increment between consecutive rows of C
incColC: Increment between consecutive columns of C

Definition at line 332 of file sgemm_arr.cpp.

345  {
346  int mb = (m + BLOCK_SZ_M - 1) / BLOCK_SZ_M;
347  int nb = (n + BLOCK_SZ_N - 1) / BLOCK_SZ_N;
348  int kb = (k + BLOCK_SZ_K - 1) / BLOCK_SZ_K;
349 
350  int _mc = m % BLOCK_SZ_M;
351  int _nc = n % BLOCK_SZ_N;
352  int _kc = k % BLOCK_SZ_K;
353 
354  int mc, nc, kc;
355  int i, j, l;
356 
357  float _beta;
358 
359  if (fabs(alpha) < std::numeric_limits<float>::epsilon() || k == 0) {
360  sgescal(m, n, beta, C, incRowC, incColC);
361  return;
362  }
363 
364  for (j = 0; j < nb; ++j) {
365  nc = (j != nb - 1 || _nc == 0) ? BLOCK_SZ_N : _nc;
366 
367  for (l = 0; l < kb; ++l) {
368  kc = (l != kb - 1 || _kc == 0) ? BLOCK_SZ_K : _kc;
369  _beta = (l == 0) ? beta : 1.0f;
370 
371  pack_buffer_B(
372  kc,
373  nc,
374  &B[l * BLOCK_SZ_K * incRowB + j * BLOCK_SZ_N * incColB],
375  incRowB,
376  incColB,
377  SGEMM_BUFF_B);
378 
379  for (i = 0; i < mb; ++i) {
380  mc = (i != mb - 1 || _mc == 0) ? BLOCK_SZ_M : _mc;
381 
382  pack_buffer_A(
383  mc,
384  kc,
385  &A[i * BLOCK_SZ_M * incRowA + l * BLOCK_SZ_K * incColA],
386  incRowA,
387  incColA,
388  SGEMM_BUFF_A);
389 
390  sgemm_macro_kernel(
391  mc,
392  nc,
393  kc,
394  alpha,
395  _beta,
396  &C[i * BLOCK_SZ_M * incRowC + j * BLOCK_SZ_N * incColC],
397  incRowC,
398  incColC);
399  }
400  }
401  }
402 }
#define BLOCK_SZ_M
Definition: _dgemm.hpp:37
#define BLOCK_SZ_N
Definition: _dgemm.hpp:39
#define BLOCK_SZ_K
Definition: _dgemm.hpp:38
void sgemm_macro_kernel(int mc, int nc, int kc, float alpha, float beta, float *C, int incRowC, int incColC)
Macro kernel for the multiplication of blocks of A and B.
Definition: sgemm_arr.cpp:266
void pack_buffer_B(int kc, int nc, const float *B, int incRowB, int incColB, float *buffer)
Packs panels from B with padding if needed.
Definition: sgemm_arr.cpp:115
void pack_buffer_A(int mc, int kc, const float *A, int incRowA, int incColA, float *buffer)
Packs panels from A with padding if needed.
Definition: sgemm_arr.cpp:67

References python.linalg::A, python.linalg::B, BLOCK_SZ_K, BLOCK_SZ_M, BLOCK_SZ_N, and python.linalg::C.

◆ sgescal()

void gpmp::linalg::SGEMM::sgescal ( int  m,
int  n,
float  alpha,
float *  X,
int  incRowX,
int  incColX 
)

Scales elements of X by alpha.

Parameters
m: Number of rows of X
n: Number of columns of X
alpha: Scalar alpha
X: Pointer to matrix X (scaled in place)
incRowX: Increment between consecutive rows of X
incColX: Increment between consecutive columns of X

Definition at line 239 of file sgemm_arr.cpp.

244  {
245  int i, j;
246 
247  if (fabs(alpha - 0.0f) > std::numeric_limits<float>::epsilon()) {
248  for (j = 0; j < n; ++j) {
249  for (i = 0; i < m; ++i) {
250  X[i * incRowX + j * incColX] *= alpha;
251  }
252  }
253  }
254 
255  else {
256  for (j = 0; j < n; ++j) {
257  for (i = 0; i < m; ++i) {
258  X[i * incRowX + j * incColX] = 0.0f;
259  }
260  }
261  }
262 }

Member Data Documentation

◆ SGEMM_BUFF_A

float gpmp::linalg::SGEMM::SGEMM_BUFF_A
static

Buffer for storing packed micro panels of A

Single precision GEneral Matrix-Matrix product

Definition at line 53 of file _sgemm.hpp.

◆ SGEMM_BUFF_B

float gpmp::linalg::SGEMM::SGEMM_BUFF_B
static

Buffer for storing packed micro panels of B

Definition at line 55 of file _sgemm.hpp.

◆ SGEMM_BUFF_C

float gpmp::linalg::SGEMM::SGEMM_BUFF_C
static

Buffer for storing intermediate results

Definition at line 57 of file _sgemm.hpp.


The documentation for this class was generated from the following files: