openGPMP
Open Source Mathematics Package
sgemm_arr.cpp
Go to the documentation of this file.
1 /*************************************************************************
2  *
3  * Project
4  * _____ _____ __ __ _____
5  * / ____| __ \| \/ | __ \
6  * ___ _ __ ___ _ __ | | __| |__) | \ / | |__) |
7  * / _ \| '_ \ / _ \ '_ \| | |_ | ___/| |\/| | ___/
8  *| (_) | |_) | __/ | | | |__| | | | | | | |
9  * \___/| .__/ \___|_| |_|\_____|_| |_| |_|_|
10  * | |
11  * |_|
12  *
13  * Copyright (C) Akiel Aries, <akiel@akiel.org>, et al.
14  *
15  * This software is licensed as described in the file LICENSE, which
16  * you should have received as part of this distribution. The terms
17  * among other details are referenced in the official documentation
18  * seen here : https://akielaries.github.io/openGPMP/ along with
19  * important files seen in this project.
20  *
21  * You may opt to use, copy, modify, merge, publish, distribute
22  * and/or sell copies of the Software, and permit persons to whom
23  * the Software is furnished to do so, under the terms of the
24  * LICENSE file. As this is an Open Source effort, all implementations
25  * must be of the same methodology.
26  *
27  *
28  *
29  * This software is distributed on an AS IS basis, WITHOUT
30  * WARRANTY OF ANY KIND, either express or implied.
31  *
32  ************************************************************************/
33 
35 #include <cmath>
36 #include <limits>
38 #include <stddef.h>
39 #include <stdio.h>
40 #include <stdlib.h>
41 #include <time.h>
42 
43 // MATRIX BUFFERS
47 
48 // pack micro panels of size BLOCK_SZ_MR rows by k columns from A without
49 // padding
51  const float *A,
52  int incRowA,
53  int incColA,
54  float *buffer) {
55  int i, j;
56 
57  for (j = 0; j < k; ++j) {
58  for (i = 0; i < BLOCK_SZ_MR; ++i) {
59  buffer[i] = A[i * incRowA];
60  }
61  buffer += BLOCK_SZ_MR;
62  A += incColA;
63  }
64 }
65 
66 // packs panels from A with padding if needed
68  int kc,
69  const float *A,
70  int incRowA,
71  int incColA,
72  float *buffer) {
73  int mp = mc / BLOCK_SZ_MR;
74  int _mr = mc % BLOCK_SZ_MR;
75 
76  int i, j;
77 
78  for (i = 0; i < mp; ++i) {
79  pack_micro_A(kc, A, incRowA, incColA, buffer);
80  buffer += kc * BLOCK_SZ_MR;
81  A += BLOCK_SZ_MR * incRowA;
82  }
83  if (_mr > 0) {
84  for (j = 0; j < kc; ++j) {
85  for (i = 0; i < _mr; ++i) {
86  buffer[i] = A[i * incRowA];
87  }
88  for (i = _mr; i < BLOCK_SZ_MR; ++i) {
89  buffer[i] = 0.0f;
90  }
91  buffer += BLOCK_SZ_MR;
92  A += incColA;
93  }
94  }
95 }
96 
97 // packing complete panels from B of size BLOCK_SZ_NR by k columns
99  const float *B,
100  int incRowB,
101  int incColB,
102  float *buffer) {
103  int i, j;
104 
105  for (i = 0; i < k; ++i) {
106  for (j = 0; j < BLOCK_SZ_NR; ++j) {
107  buffer[j] = B[j * incColB];
108  }
109  buffer += BLOCK_SZ_NR;
110  B += incRowB;
111  }
112 }
113 
114 // packing panels from B with padding if needed
116  int nc,
117  const float *B,
118  int incRowB,
119  int incColB,
120  float *buffer) {
121  int np = nc / BLOCK_SZ_NR;
122  int _nr = nc % BLOCK_SZ_NR;
123 
124  int i, j;
125 
126  for (j = 0; j < np; ++j) {
127  pack_micro_B(kc, B, incRowB, incColB, buffer);
128  buffer += kc * BLOCK_SZ_NR;
129  B += BLOCK_SZ_NR * incColB;
130  }
131  if (_nr > 0) {
132  for (i = 0; i < kc; ++i) {
133  for (j = 0; j < _nr; ++j) {
134  buffer[j] = B[j * incColB];
135  }
136  for (j = _nr; j < BLOCK_SZ_NR; ++j) {
137  buffer[j] = 0.0f;
138  }
139  buffer += BLOCK_SZ_NR;
140  B += incRowB;
141  }
142  }
143 }
144 
145 // micro kernel that multiplies panels from A and B
147  float alpha,
148  const float *A,
149  const float *B,
150  float beta,
151  float *C,
152  int incRowC,
153  int incColC) {
154  float AB[BLOCK_SZ_MR * BLOCK_SZ_NR];
155 
156  int i, j, l;
157 
158  // Compute AB = A*B
159  for (l = 0; l < BLOCK_SZ_MR * BLOCK_SZ_NR; ++l) {
160  AB[l] = 0;
161  }
162  for (l = 0; l < kc; ++l) {
163  for (j = 0; j < BLOCK_SZ_NR; ++j) {
164  for (i = 0; i < BLOCK_SZ_MR; ++i) {
165  AB[i + j * BLOCK_SZ_MR] += A[i] * B[j];
166  }
167  }
168  A += BLOCK_SZ_MR;
169  B += BLOCK_SZ_NR;
170  }
171 
172  // Update C <- beta*C
173  if (fabs(beta - 0.0f) < std::numeric_limits<float>::epsilon()) {
174  for (j = 0; j < BLOCK_SZ_NR; ++j) {
175  for (i = 0; i < BLOCK_SZ_MR; ++i) {
176  C[i * incRowC + j * incColC] = 0.0f;
177  }
178  }
179  } else if (fabs(beta - 1.0f) > std::numeric_limits<float>::epsilon()) {
180  for (j = 0; j < BLOCK_SZ_NR; ++j) {
181  for (i = 0; i < BLOCK_SZ_MR; ++i) {
182  C[i * incRowC + j * incColC] *= beta;
183  }
184  }
185  }
186 
187  // Update C <- C + alpha*AB (note: the case alpha==0.0f was already treated
188  // in
189  // the above layer sgemm_nn)
190  if (fabs(alpha - 1.0f) < std::numeric_limits<float>::epsilon()) {
191  for (j = 0; j < BLOCK_SZ_NR; ++j) {
192  for (i = 0; i < BLOCK_SZ_MR; ++i) {
193  C[i * incRowC + j * incColC] += AB[i + j * BLOCK_SZ_MR];
194  }
195  }
196  }
197 
198  else {
199  for (j = 0; j < BLOCK_SZ_NR; ++j) {
200  for (i = 0; i < BLOCK_SZ_MR; ++i) {
201  C[i * incRowC + j * incColC] += alpha * AB[i + j * BLOCK_SZ_MR];
202  }
203  }
204  }
205 }
206 
207 // Compute Y += alpha*X (float precision AX + Y)
209  int n,
210  float alpha,
211  const float *X,
212  int incRowX,
213  int incColX,
214  float *Y,
215  int incRowY,
216  int incColY) {
217  int i, j;
218 
219  if (fabs(alpha - 1.0f) > std::numeric_limits<float>::epsilon()) {
220 
221  for (j = 0; j < n; ++j) {
222  for (i = 0; i < m; ++i) {
223  Y[i * incRowY + j * incColY] +=
224  alpha * X[i * incRowX + j * incColX];
225  }
226  }
227  }
228 
229  else {
230  for (j = 0; j < n; ++j) {
231  for (i = 0; i < m; ++i) {
232  Y[i * incRowY + j * incColY] += X[i * incRowX + j * incColX];
233  }
234  }
235  }
236 }
237 
238 // Compute X *= alpha (scale elements)
240  int n,
241  float alpha,
242  float *X,
243  int incRowX,
244  int incColX) {
245  int i, j;
246 
247  if (fabs(alpha - 0.0f) > std::numeric_limits<float>::epsilon()) {
248  for (j = 0; j < n; ++j) {
249  for (i = 0; i < m; ++i) {
250  X[i * incRowX + j * incColX] *= alpha;
251  }
252  }
253  }
254 
255  else {
256  for (j = 0; j < n; ++j) {
257  for (i = 0; i < m; ++i) {
258  X[i * incRowX + j * incColX] = 0.0f;
259  }
260  }
261  }
262 }
263 
264 // Macro Kernel for the multiplication of blocks of A and B. We assume that
265 // these blocks were previously packed to buffers SGEMM_BUFF_A and SGEMM_BUFF_B.
267  int nc,
268  int kc,
269  float alpha,
270  float beta,
271  float *C,
272  int incRowC,
273  int incColC) {
274 
275  int mp = (mc + BLOCK_SZ_MR - 1) / BLOCK_SZ_MR;
276  int np = (nc + BLOCK_SZ_NR - 1) / BLOCK_SZ_NR;
277 
278  int _mr = mc % BLOCK_SZ_MR;
279  int _nr = nc % BLOCK_SZ_NR;
280 
281  int mr, nr;
282  int i, j;
283 
284  for (j = 0; j < np; ++j) {
285  nr = (j != np - 1 || _nr == 0) ? BLOCK_SZ_NR : _nr;
286 
287  for (i = 0; i < mp; ++i) {
288  mr = (i != mp - 1 || _mr == 0) ? BLOCK_SZ_MR : _mr;
289 
290  if (mr == BLOCK_SZ_MR && nr == BLOCK_SZ_NR) {
291  sgemm_micro_kernel(
292  kc,
293  alpha,
294  &SGEMM_BUFF_A[i * kc * BLOCK_SZ_MR],
295  &SGEMM_BUFF_B[j * kc * BLOCK_SZ_NR],
296  beta,
297  &C[i * BLOCK_SZ_MR * incRowC + j * BLOCK_SZ_NR * incColC],
298  incRowC,
299  incColC);
300  } else {
301  sgemm_micro_kernel(kc,
302  alpha,
303  &SGEMM_BUFF_A[i * kc * BLOCK_SZ_MR],
304  &SGEMM_BUFF_B[j * kc * BLOCK_SZ_NR],
305  0.0f,
306  SGEMM_BUFF_C,
307  1,
308  BLOCK_SZ_MR);
309  sgescal(
310  mr,
311  nr,
312  beta,
313  &C[i * BLOCK_SZ_MR * incRowC + j * BLOCK_SZ_NR * incColC],
314  incRowC,
315  incColC);
316  sgeaxpy(
317  mr,
318  nr,
319  1.0f,
320  SGEMM_BUFF_C,
321  1,
322  BLOCK_SZ_MR,
323  &C[i * BLOCK_SZ_MR * incRowC + j * BLOCK_SZ_NR * incColC],
324  incRowC,
325  incColC);
326  }
327  }
328  }
329 }
330 
331 // Main SGEMM entrypoint, compute C <- beta*C + alpha*A*B
333  int n,
334  int k,
335  float alpha,
336  const float *A,
337  int incRowA,
338  int incColA,
339  const float *B,
340  int incRowB,
341  int incColB,
342  float beta,
343  float *C,
344  int incRowC,
345  int incColC) {
346  int mb = (m + BLOCK_SZ_M - 1) / BLOCK_SZ_M;
347  int nb = (n + BLOCK_SZ_N - 1) / BLOCK_SZ_N;
348  int kb = (k + BLOCK_SZ_K - 1) / BLOCK_SZ_K;
349 
350  int _mc = m % BLOCK_SZ_M;
351  int _nc = n % BLOCK_SZ_N;
352  int _kc = k % BLOCK_SZ_K;
353 
354  int mc, nc, kc;
355  int i, j, l;
356 
357  float _beta;
358 
359  if (fabs(alpha) < std::numeric_limits<float>::epsilon() || k == 0) {
360  sgescal(m, n, beta, C, incRowC, incColC);
361  return;
362  }
363 
364  for (j = 0; j < nb; ++j) {
365  nc = (j != nb - 1 || _nc == 0) ? BLOCK_SZ_N : _nc;
366 
367  for (l = 0; l < kb; ++l) {
368  kc = (l != kb - 1 || _kc == 0) ? BLOCK_SZ_K : _kc;
369  _beta = (l == 0) ? beta : 1.0f;
370 
371  pack_buffer_B(
372  kc,
373  nc,
374  &B[l * BLOCK_SZ_K * incRowB + j * BLOCK_SZ_N * incColB],
375  incRowB,
376  incColB,
377  SGEMM_BUFF_B);
378 
379  for (i = 0; i < mb; ++i) {
380  mc = (i != mb - 1 || _mc == 0) ? BLOCK_SZ_M : _mc;
381 
382  pack_buffer_A(
383  mc,
384  kc,
385  &A[i * BLOCK_SZ_M * incRowA + l * BLOCK_SZ_K * incColA],
386  incRowA,
387  incColA,
388  SGEMM_BUFF_A);
389 
390  sgemm_macro_kernel(
391  mc,
392  nc,
393  kc,
394  alpha,
395  _beta,
396  &C[i * BLOCK_SZ_M * incRowC + j * BLOCK_SZ_N * incColC],
397  incRowC,
398  incColC);
399  }
400  }
401  }
402 }
#define BLOCK_SZ_M
Definition: _dgemm.hpp:37
#define BLOCK_SZ_MR
Definition: _dgemm.hpp:40
#define BLOCK_SZ_N
Definition: _dgemm.hpp:39
#define BLOCK_SZ_NR
Definition: _dgemm.hpp:41
#define BLOCK_SZ_K
Definition: _dgemm.hpp:38
static float SGEMM_BUFF_B[BLOCK_SZ_K *BLOCK_SZ_N]
Definition: _sgemm.hpp:55
void pack_micro_A(int k, const float *A, int incRowA, int incColA, float *buffer)
Packs micro panels of size BLOCK_SZ_MR rows by k columns from A without padding.
Definition: sgemm_arr.cpp:50
void sgemm_macro_kernel(int mc, int nc, int kc, float alpha, float beta, float *C, int incRowC, int incColC)
Macro kernel for the multiplication of blocks of A and B.
Definition: sgemm_arr.cpp:266
void pack_buffer_B(int kc, int nc, const float *B, int incRowB, int incColB, float *buffer)
Packs panels from B with padding if needed.
Definition: sgemm_arr.cpp:115
void sgeaxpy(int m, int n, float alpha, const float *X, int incRowX, int incColX, float *Y, int incRowY, int incColY)
Computes Y += alpha*X (float precision AX + Y)
Definition: sgemm_arr.cpp:208
void sgemm_micro_kernel(int kc, float alpha, const float *A, const float *B, float beta, float *C, int incRowC, int incColC)
Computes the micro kernel that multiplies panels from A and B.
Definition: sgemm_arr.cpp:146
static float SGEMM_BUFF_A[BLOCK_SZ_M *BLOCK_SZ_K]
Definition: _sgemm.hpp:53
void pack_buffer_A(int mc, int kc, const float *A, int incRowA, int incColA, float *buffer)
Packs panels from A with padding if needed.
Definition: sgemm_arr.cpp:67
static float SGEMM_BUFF_C[BLOCK_SZ_MR *BLOCK_SZ_NR]
Definition: _sgemm.hpp:57
void sgemm_nn(int m, int n, int k, float alpha, const float *A, int incRowA, int incColA, const float *B, int incRowB, int incColB, float beta, float *C, int incRowC, int incColC)
Main SGEMM entrypoint, computes C <- beta*C + alpha*A*B.
Definition: sgemm_arr.cpp:332
void sgescal(int m, int n, float alpha, float *X, int incRowX, int incColX)
Scales elements of X by alpha.
Definition: sgemm_arr.cpp:239
void pack_micro_B(int k, const float *B, int incRowB, int incColB, float *buffer)
Packs micro panels of size BLOCK_SZ_NR columns by k rows from B without padding.
Definition: sgemm_arr.cpp:98
list C
Definition: linalg.py:24
list A
Definition: linalg.py:22
list B
Definition: linalg.py:23