openGPMP
Open Source Mathematics Package
igemm_arr.cpp
Go to the documentation of this file.
1 /*************************************************************************
2  *
3  * Project
4  * _____ _____ __ __ _____
5  * / ____| __ \| \/ | __ \
6  * ___ _ __ ___ _ __ | | __| |__) | \ / | |__) |
7  * / _ \| '_ \ / _ \ '_ \| | |_ | ___/| |\/| | ___/
8  *| (_) | |_) | __/ | | | |__| | | | | | | |
9  * \___/| .__/ \___|_| |_|\_____|_| |_| |_|_|
10  * | |
11  * |_|
12  *
13  * Copyright (C) Akiel Aries, <akiel@akiel.org>, et al.
14  *
15  * This software is licensed as described in the file LICENSE, which
16  * you should have received as part of this distribution. The terms
17  * among other details are referenced in the official documentation
18  * seen here : https://akielaries.github.io/openGPMP/ along with
19  * important files seen in this project.
20  *
21  * You may opt to use, copy, modify, merge, publish, distribute
22  * and/or sell copies of the Software, and permit persons to whom
23  * the Software is furnished to do so, under the terms of the
24  * LICENSE file. As this is an Open Source effort, all implementations
25  * must be of the same methodology.
26  *
27  *
28  *
29  * This software is distributed on an AS IS basis, WITHOUT
30  * WARRANTY OF ANY KIND, either express or implied.
31  *
32  ************************************************************************/
33 
35 #include <cmath>
36 #include <limits>
38 #include <stddef.h>
39 #include <stdio.h>
40 #include <stdlib.h>
41 #include <time.h>
42 
43 // MATRIX BUFFERS
47 
48 // pack micro panels of size BLOCK_SZ_MR rows by k columns from A without
49 // padding
51  const int *A,
52  int incRowA,
53  int incColA,
54  int *buffer) {
55  int i, j;
56 
57  for (j = 0; j < k; ++j) {
58  for (i = 0; i < BLOCK_SZ_MR; ++i) {
59  buffer[i] = A[i * incRowA];
60  }
61  buffer += BLOCK_SZ_MR;
62  A += incColA;
63  }
64 }
65 
66 // packs panels from A with padding if needed
68  int kc,
69  const int *A,
70  int incRowA,
71  int incColA,
72  int *buffer) {
73  int mp = mc / BLOCK_SZ_MR;
74  int _mr = mc % BLOCK_SZ_MR;
75 
76  int i, j;
77 
78  for (i = 0; i < mp; ++i) {
79  pack_micro_A(kc, A, incRowA, incColA, buffer);
80  buffer += kc * BLOCK_SZ_MR;
81  A += BLOCK_SZ_MR * incRowA;
82  }
83  if (_mr > 0) {
84  for (j = 0; j < kc; ++j) {
85  for (i = 0; i < _mr; ++i) {
86  buffer[i] = A[i * incRowA];
87  }
88  for (i = _mr; i < BLOCK_SZ_MR; ++i) {
89  buffer[i] = 0;
90  }
91  buffer += BLOCK_SZ_MR;
92  A += incColA;
93  }
94  }
95 }
96 
97 // packing complete panels from B of size BLOCK_SZ_NR by k columns
99  const int *B,
100  int incRowB,
101  int incColB,
102  int *buffer) {
103  int i, j;
104 
105  for (i = 0; i < k; ++i) {
106  for (j = 0; j < BLOCK_SZ_NR; ++j) {
107  buffer[j] = B[j * incColB];
108  }
109  buffer += BLOCK_SZ_NR;
110  B += incRowB;
111  }
112 }
113 
114 // packing panels from B with padding if needed
116  int nc,
117  const int *B,
118  int incRowB,
119  int incColB,
120  int *buffer) {
121  int np = nc / BLOCK_SZ_NR;
122  int _nr = nc % BLOCK_SZ_NR;
123 
124  int i, j;
125 
126  for (j = 0; j < np; ++j) {
127  pack_micro_B(kc, B, incRowB, incColB, buffer);
128  buffer += kc * BLOCK_SZ_NR;
129  B += BLOCK_SZ_NR * incColB;
130  }
131  if (_nr > 0) {
132  for (i = 0; i < kc; ++i) {
133  for (j = 0; j < _nr; ++j) {
134  buffer[j] = B[j * incColB];
135  }
136  for (j = _nr; j < BLOCK_SZ_NR; ++j) {
137  buffer[j] = 0;
138  }
139  buffer += BLOCK_SZ_NR;
140  B += incRowB;
141  }
142  }
143 }
144 
145 // micro kernel that multiplies panels from A and B
147  int alpha,
148  const int *A,
149  const int *B,
150  int beta,
151  int *C,
152  int incRowC,
153  int incColC) {
154  int AB[BLOCK_SZ_MR * BLOCK_SZ_NR];
155 
156  int i, j, l;
157 
158  // Compute AB = A*B
159  for (l = 0; l < BLOCK_SZ_MR * BLOCK_SZ_NR; ++l) {
160  AB[l] = 0;
161  }
162  for (l = 0; l < kc; ++l) {
163  for (j = 0; j < BLOCK_SZ_NR; ++j) {
164  for (i = 0; i < BLOCK_SZ_MR; ++i) {
165  AB[i + j * BLOCK_SZ_MR] += A[i] * B[j];
166  }
167  }
168  A += BLOCK_SZ_MR;
169  B += BLOCK_SZ_NR;
170  }
171 
172  // Update C <- beta*C
173  if (beta == 0) {
174  for (j = 0; j < BLOCK_SZ_NR; ++j) {
175  for (i = 0; i < BLOCK_SZ_MR; ++i) {
176  C[i * incRowC + j * incColC] = 0;
177  }
178  }
179  } else if (beta != 1) {
180  for (j = 0; j < BLOCK_SZ_NR; ++j) {
181  for (i = 0; i < BLOCK_SZ_MR; ++i) {
182  C[i * incRowC + j * incColC] *= beta;
183  }
184  }
185  }
186 
187  // Update C <- C + alpha*AB (note: the case alpha==0 was already treated
188  // in
189  // the above layer igemm_nn)
190  if (alpha == 1) {
191  for (j = 0; j < BLOCK_SZ_NR; ++j) {
192  for (i = 0; i < BLOCK_SZ_MR; ++i) {
193  C[i * incRowC + j * incColC] += AB[i + j * BLOCK_SZ_MR];
194  }
195  }
196  }
197 
198  else {
199  for (j = 0; j < BLOCK_SZ_NR; ++j) {
200  for (i = 0; i < BLOCK_SZ_MR; ++i) {
201  C[i * incRowC + j * incColC] += alpha * AB[i + j * BLOCK_SZ_MR];
202  }
203  }
204  }
205 }
206 
207 // Compute Y += alpha*X (int precision AX + Y)
209  int n,
210  int alpha,
211  const int *X,
212  int incRowX,
213  int incColX,
214  int *Y,
215  int incRowY,
216  int incColY) {
217  int i, j;
218 
219  if (alpha != 1) {
220  for (j = 0; j < n; ++j) {
221  for (i = 0; i < m; ++i) {
222  Y[i * incRowY + j * incColY] +=
223  alpha * X[i * incRowX + j * incColX];
224  }
225  }
226  }
227 
228  else {
229  for (j = 0; j < n; ++j) {
230  for (i = 0; i < m; ++i) {
231  Y[i * incRowY + j * incColY] += X[i * incRowX + j * incColX];
232  }
233  }
234  }
235 }
236 
237 // Compute X *= alpha (scale elements)
239  int n,
240  int alpha,
241  int *X,
242  int incRowX,
243  int incColX) {
244  int i, j;
245 
246  if (alpha != 0) {
247  for (j = 0; j < n; ++j) {
248  for (i = 0; i < m; ++i) {
249  X[i * incRowX + j * incColX] *= alpha;
250  }
251  }
252  }
253 
254  else {
255  for (j = 0; j < n; ++j) {
256  for (i = 0; i < m; ++i) {
257  X[i * incRowX + j * incColX] = 0;
258  }
259  }
260  }
261 }
262 
263 // Macro Kernel for the multiplication of blocks of A and B. We assume that
264 // these blocks were previously packed to buffers IGEMM_BUFF_A and IGEMM_BUFF_B.
266  int nc,
267  int kc,
268  int alpha,
269  int beta,
270  int *C,
271  int incRowC,
272  int incColC) {
273 
274  int mp = (mc + BLOCK_SZ_MR - 1) / BLOCK_SZ_MR;
275  int np = (nc + BLOCK_SZ_NR - 1) / BLOCK_SZ_NR;
276 
277  int _mr = mc % BLOCK_SZ_MR;
278  int _nr = nc % BLOCK_SZ_NR;
279 
280  int mr, nr;
281  int i, j;
282 
283  for (j = 0; j < np; ++j) {
284  nr = (j != np - 1 || _nr == 0) ? BLOCK_SZ_NR : _nr;
285 
286  for (i = 0; i < mp; ++i) {
287  mr = (i != mp - 1 || _mr == 0) ? BLOCK_SZ_MR : _mr;
288 
289  if (mr == BLOCK_SZ_MR && nr == BLOCK_SZ_NR) {
290  igemm_micro_kernel(
291  kc,
292  alpha,
293  &IGEMM_BUFF_A[i * kc * BLOCK_SZ_MR],
294  &IGEMM_BUFF_B[j * kc * BLOCK_SZ_NR],
295  beta,
296  &C[i * BLOCK_SZ_MR * incRowC + j * BLOCK_SZ_NR * incColC],
297  incRowC,
298  incColC);
299  } else {
300  igemm_micro_kernel(kc,
301  alpha,
302  &IGEMM_BUFF_A[i * kc * BLOCK_SZ_MR],
303  &IGEMM_BUFF_B[j * kc * BLOCK_SZ_NR],
304  0,
305  IGEMM_BUFF_C,
306  1,
307  BLOCK_SZ_MR);
308  igescal(
309  mr,
310  nr,
311  beta,
312  &C[i * BLOCK_SZ_MR * incRowC + j * BLOCK_SZ_NR * incColC],
313  incRowC,
314  incColC);
315  igeaxpy(
316  mr,
317  nr,
318  1.0,
319  IGEMM_BUFF_C,
320  1,
321  BLOCK_SZ_MR,
322  &C[i * BLOCK_SZ_MR * incRowC + j * BLOCK_SZ_NR * incColC],
323  incRowC,
324  incColC);
325  }
326  }
327  }
328 }
329 
330 // Main IGEMM entrypoint, compute C <- beta*C + alpha*A*B
332  int n,
333  int k,
334  int alpha,
335  const int *A,
336  int incRowA,
337  int incColA,
338  const int *B,
339  int incRowB,
340  int incColB,
341  int beta,
342  int *C,
343  int incRowC,
344  int incColC) {
345  int mb = (m + BLOCK_SZ_M - 1) / BLOCK_SZ_M;
346  int nb = (n + BLOCK_SZ_N - 1) / BLOCK_SZ_N;
347  int kb = (k + BLOCK_SZ_K - 1) / BLOCK_SZ_K;
348 
349  int _mc = m % BLOCK_SZ_M;
350  int _nc = n % BLOCK_SZ_N;
351  int _kc = k % BLOCK_SZ_K;
352 
353  int mc, nc, kc;
354  int i, j, l;
355 
356  int _beta;
357 
358  if (alpha == 0 || k == 0) {
359  igescal(m, n, beta, C, incRowC, incColC);
360  return;
361  }
362 
363  for (j = 0; j < nb; ++j) {
364  nc = (j != nb - 1 || _nc == 0) ? BLOCK_SZ_N : _nc;
365 
366  for (l = 0; l < kb; ++l) {
367  kc = (l != kb - 1 || _kc == 0) ? BLOCK_SZ_K : _kc;
368  _beta = (l == 0) ? beta : 1.0;
369 
370  pack_buffer_B(
371  kc,
372  nc,
373  &B[l * BLOCK_SZ_K * incRowB + j * BLOCK_SZ_N * incColB],
374  incRowB,
375  incColB,
376  IGEMM_BUFF_B);
377 
378  for (i = 0; i < mb; ++i) {
379  mc = (i != mb - 1 || _mc == 0) ? BLOCK_SZ_M : _mc;
380 
381  pack_buffer_A(
382  mc,
383  kc,
384  &A[i * BLOCK_SZ_M * incRowA + l * BLOCK_SZ_K * incColA],
385  incRowA,
386  incColA,
387  IGEMM_BUFF_A);
388 
389  igemm_macro_kernel(
390  mc,
391  nc,
392  kc,
393  alpha,
394  _beta,
395  &C[i * BLOCK_SZ_M * incRowC + j * BLOCK_SZ_N * incColC],
396  incRowC,
397  incColC);
398  }
399  }
400  }
401 }
#define BLOCK_SZ_M
Definition: _dgemm.hpp:37
#define BLOCK_SZ_MR
Definition: _dgemm.hpp:40
#define BLOCK_SZ_N
Definition: _dgemm.hpp:39
#define BLOCK_SZ_NR
Definition: _dgemm.hpp:41
#define BLOCK_SZ_K
Definition: _dgemm.hpp:38
void igemm_macro_kernel(int mc, int nc, int kc, int alpha, int beta, int *C, int incRowC, int incColC)
Macro kernel for the multiplication of blocks of A and B.
Definition: igemm_arr.cpp:265
void igemm_nn(int m, int n, int k, int alpha, const int *A, int incRowA, int incColA, const int *B, int incRowB, int incColB, int beta, int *C, int incRowC, int incColC)
Main IGEMM entrypoint, computes C <- beta*C + alpha*A*B.
Definition: igemm_arr.cpp:331
void pack_buffer_B(int kc, int nc, const int *B, int incRowB, int incColB, int *buffer)
Packs panels from B with padding if needed.
Definition: igemm_arr.cpp:115
void igeaxpy(int m, int n, int alpha, const int *X, int incRowX, int incColX, int *Y, int incRowY, int incColY)
Computes Y += alpha*X (int precision AX + Y)
Definition: igemm_arr.cpp:208
void igemm_micro_kernel(int kc, int alpha, const int *A, const int *B, int beta, int *C, int incRowC, int incColC)
Computes the micro kernel that multiplies panels from A and B.
Definition: igemm_arr.cpp:146
void igescal(int m, int n, int alpha, int *X, int incRowX, int incColX)
Scales elements of X by alpha.
Definition: igemm_arr.cpp:238
static int IGEMM_BUFF_B[BLOCK_SZ_K *BLOCK_SZ_N]
Definition: _igemm.hpp:55
void pack_micro_A(int k, const int *A, int incRowA, int incColA, int *buffer)
Packs micro panels of size BLOCK_SZ_MR rows by k columns from A without padding.
Definition: igemm_arr.cpp:50
static int IGEMM_BUFF_A[BLOCK_SZ_M *BLOCK_SZ_K]
Definition: _igemm.hpp:53
void pack_micro_B(int k, const int *B, int incRowB, int incColB, int *buffer)
Packs micro panels of size BLOCK_SZ_NR columns by k rows from B without padding.
Definition: igemm_arr.cpp:98
static int IGEMM_BUFF_C[BLOCK_SZ_MR *BLOCK_SZ_NR]
Definition: _igemm.hpp:57
void pack_buffer_A(int mc, int kc, const int *A, int incRowA, int incColA, int *buffer)
Packs panels from A with padding if needed.
Definition: igemm_arr.cpp:67
list C
Definition: linalg.py:24
list A
Definition: linalg.py:22
list B
Definition: linalg.py:23