openGPMP
Open Source Mathematics Package
dgemm_arr.cpp
Go to the documentation of this file.
1 /*************************************************************************
2  *
3  * Project
4  * _____ _____ __ __ _____
5  * / ____| __ \| \/ | __ \
6  * ___ _ __ ___ _ __ | | __| |__) | \ / | |__) |
7  * / _ \| '_ \ / _ \ '_ \| | |_ | ___/| |\/| | ___/
8  *| (_) | |_) | __/ | | | |__| | | | | | | |
9  * \___/| .__/ \___|_| |_|\_____|_| |_| |_|_|
10  * | |
11  * |_|
12  *
13  * Copyright (C) Akiel Aries, <akiel@akiel.org>, et al.
14  *
15  * This software is licensed as described in the file LICENSE, which
16  * you should have received as part of this distribution. The terms
17  * among other details are referenced in the official documentation
18  * seen here : https://akielaries.github.io/openGPMP/ along with
19  * important files seen in this project.
20  *
21  * You may opt to use, copy, modify, merge, publish, distribute
22  * and/or sell copies of the Software, and permit persons to whom
23  * the Software is furnished to do so, under the terms of the
24  * LICENSE file. As this is an Open Source effort, all implementations
25  * must be of the same methodology.
26  *
27  *
28  *
29  * This software is distributed on an AS IS basis, WITHOUT
30  * WARRANTY OF ANY KIND, either express or implied.
31  *
32  ************************************************************************/
33 
35 #include <cmath>
36 #include <limits>
38 #include <stddef.h>
39 #include <stdio.h>
40 #include <stdlib.h>
41 #include <time.h>
42 
#if defined(__SSE2__)

#ifdef __cplusplus
extern "C" {
#endif

// ASM micro kernel function: hand-written SSE2 assembly that computes one
// micro tile of C. `kb` is the number of 4x-unrolled k iterations and `kl`
// the leftover (< 4) iterations; `nextA`/`nextB` are prefetch hints for the
// panels used by the following call. C linkage is required so the symbol
// name matches the assembly object file.
extern void dgemm_kernel_asm(const double *A,
                             const double *B,
                             double *C,
                             const double *nextA,
                             const double *nextB,
                             long kl,
                             long kb,
                             long incRowC,
                             long incColC,
                             double alpha,
                             double beta);

#ifdef __cplusplus
}
#endif

#endif
87 
89  double alpha,
90  const double *A,
91  const double *B,
92  double beta,
93  double *C,
94  long incRowC,
95  long incColC,
96  const double *nextA,
97  const double *nextB) {
98  long kb = kc / 4;
99  long kl = kc % 4;
100 
101  dgemm_kernel_asm(A,
102  B,
103  C,
104  nextA,
105  nextB,
106  kl,
107  kb,
108  incRowC,
109  incColC,
110  alpha,
111  beta);
112 }
113 
// MATRIX BUFFERS: statically allocated packing storage shared by all DGEMM
// calls. DGEMM_BUFF_A holds one mc x kc block of A, DGEMM_BUFF_B one
// kc x nc block of B, and DGEMM_BUFF_C one micro tile used as scratch for
// fringe (partial) tiles in the macro kernel.
// NOTE(review): static buffers make these routines non-reentrant — confirm
// callers never run DGEMM concurrently from multiple threads.
double gpmp::linalg::DGEMM::DGEMM_BUFF_A[BLOCK_SZ_M * BLOCK_SZ_K];
double gpmp::linalg::DGEMM::DGEMM_BUFF_B[BLOCK_SZ_K * BLOCK_SZ_N];
double gpmp::linalg::DGEMM::DGEMM_BUFF_C[BLOCK_SZ_MR * BLOCK_SZ_NR];
118 
119 // pack micro panels of size BLOCK_SZ_MR rows by k columns from A without
120 // padding
122  const double *A,
123  int incRowA,
124  int incColA,
125  double *buffer) {
126  int i, j;
127 
128  for (j = 0; j < k; ++j) {
129  for (i = 0; i < BLOCK_SZ_MR; ++i) {
130  buffer[i] = A[i * incRowA];
131  }
132  buffer += BLOCK_SZ_MR;
133  A += incColA;
134  }
135 }
136 
137 // packs panels from A with padding if needed
139  int kc,
140  const double *A,
141  int incRowA,
142  int incColA,
143  double *buffer) {
144  int mp = mc / BLOCK_SZ_MR;
145  int _mr = mc % BLOCK_SZ_MR;
146 
147  int i, j;
148 
149  for (i = 0; i < mp; ++i) {
150  pack_micro_A(kc, A, incRowA, incColA, buffer);
151  buffer += kc * BLOCK_SZ_MR;
152  A += BLOCK_SZ_MR * incRowA;
153  }
154  if (_mr > 0) {
155  for (j = 0; j < kc; ++j) {
156  for (i = 0; i < _mr; ++i) {
157  buffer[i] = A[i * incRowA];
158  }
159  for (i = _mr; i < BLOCK_SZ_MR; ++i) {
160  buffer[i] = 0.0;
161  }
162  buffer += BLOCK_SZ_MR;
163  A += incColA;
164  }
165  }
166 }
167 
168 // packing complete panels from B of size BLOCK_SZ_NR by k columns
170  const double *B,
171  int incRowB,
172  int incColB,
173  double *buffer) {
174  int i, j;
175 
176  for (i = 0; i < k; ++i) {
177  for (j = 0; j < BLOCK_SZ_NR; ++j) {
178  buffer[j] = B[j * incColB];
179  }
180  buffer += BLOCK_SZ_NR;
181  B += incRowB;
182  }
183 }
184 
185 // packing panels from B with padding if needed
187  int nc,
188  const double *B,
189  int incRowB,
190  int incColB,
191  double *buffer) {
192  int np = nc / BLOCK_SZ_NR;
193  int _nr = nc % BLOCK_SZ_NR;
194 
195  int i, j;
196 
197  for (j = 0; j < np; ++j) {
198  pack_micro_B(kc, B, incRowB, incColB, buffer);
199  buffer += kc * BLOCK_SZ_NR;
200  B += BLOCK_SZ_NR * incColB;
201  }
202  if (_nr > 0) {
203  for (i = 0; i < kc; ++i) {
204  for (j = 0; j < _nr; ++j) {
205  buffer[j] = B[j * incColB];
206  }
207  for (j = _nr; j < BLOCK_SZ_NR; ++j) {
208  buffer[j] = 0.0;
209  }
210  buffer += BLOCK_SZ_NR;
211  B += incRowB;
212  }
213  }
214 }
215 
216 // Compute Y += alpha*X (double precision AX + Y)
218  int n,
219  double alpha,
220  const double *X,
221  int incRowX,
222  int incColX,
223  double *Y,
224  int incRowY,
225  int incColY) {
226  int i, j;
227 
228  if (fabs(alpha - 1.0) > std::numeric_limits<double>::epsilon()) {
229 
230  for (j = 0; j < n; ++j) {
231  for (i = 0; i < m; ++i) {
232  Y[i * incRowY + j * incColY] +=
233  alpha * X[i * incRowX + j * incColX];
234  }
235  }
236  }
237 
238  else {
239  for (j = 0; j < n; ++j) {
240  for (i = 0; i < m; ++i) {
241  Y[i * incRowY + j * incColY] += X[i * incRowX + j * incColX];
242  }
243  }
244  }
245 }
246 
247 // Compute X *= alpha (scale elements)
249  int n,
250  double alpha,
251  double *X,
252  int incRowX,
253  int incColX) {
254  int i, j;
255 
256  if (fabs(alpha - 0.0) > std::numeric_limits<double>::epsilon()) {
257  for (j = 0; j < n; ++j) {
258  for (i = 0; i < m; ++i) {
259  X[i * incRowX + j * incColX] *= alpha;
260  }
261  }
262  }
263 
264  else {
265  for (j = 0; j < n; ++j) {
266  for (i = 0; i < m; ++i) {
267  X[i * incRowX + j * incColX] = 0.0;
268  }
269  }
270  }
271 }
272 
273 // Macro Kernel for the multiplication of blocks of A and B. We assume that
274 // these blocks were previously packed to buffers DGEMM_BUFF_A and DGEMM_BUFF_B.
276  int nc,
277  int kc,
278  double alpha,
279  double beta,
280  double *C,
281  int incRowC,
282  int incColC) {
283 
284  int mp = (mc + BLOCK_SZ_MR - 1) / BLOCK_SZ_MR;
285  int np = (nc + BLOCK_SZ_NR - 1) / BLOCK_SZ_NR;
286 
287  int _mr = mc % BLOCK_SZ_MR;
288  int _nr = nc % BLOCK_SZ_NR;
289 
290  int mr, nr;
291  int i, j;
292 
293 #if defined(__SSE__)
294 
295  const double *nextA = nullptr;
296  const double *nextB = nullptr;
297 
298 #endif
299 
300  for (j = 0; j < np; ++j) {
301  nr = (j != np - 1 || _nr == 0) ? BLOCK_SZ_NR : _nr;
302 
303  for (i = 0; i < mp; ++i) {
304  mr = (i != mp - 1 || _mr == 0) ? BLOCK_SZ_MR : _mr;
305 
306  if (mr == BLOCK_SZ_MR && nr == BLOCK_SZ_NR) {
307 #if defined(__SSE__)
308  dgemm_micro_kernel(
309  kc,
310  alpha,
311  &DGEMM_BUFF_A[i * kc * BLOCK_SZ_MR],
312  &DGEMM_BUFF_B[j * kc * BLOCK_SZ_NR],
313  beta,
314  &C[i * BLOCK_SZ_MR * incRowC + j * BLOCK_SZ_NR * incColC],
315  incRowC,
316  incColC,
317  nextA,
318  nextB);
319 
320 #else
321  dgemm_micro_kernel(
322  kc,
323  alpha,
324  &DGEMM_BUFF_A[i * kc * BLOCK_SZ_MR],
325  &DGEMM_BUFF_B[j * kc * BLOCK_SZ_NR],
326  beta,
327  &C[i * BLOCK_SZ_MR * incRowC + j * BLOCK_SZ_NR * incColC],
328  incRowC,
329  incColC);
330 
331 #endif
332  }
333 
334  else {
335 
336 #if defined(__SSE__)
337  dgemm_micro_kernel(kc,
338  alpha,
339  &DGEMM_BUFF_A[i * kc * BLOCK_SZ_MR],
340  &DGEMM_BUFF_B[j * kc * BLOCK_SZ_NR],
341  0.0,
342  DGEMM_BUFF_C,
343  1,
344  BLOCK_SZ_MR,
345  nextA,
346  nextB);
347 #else
348  dgemm_micro_kernel(kc,
349  alpha,
350  &DGEMM_BUFF_A[i * kc * BLOCK_SZ_MR],
351  &DGEMM_BUFF_B[j * kc * BLOCK_SZ_NR],
352  0.0,
353  DGEMM_BUFF_C,
354  1,
355  BLOCK_SZ_MR);
356 
357 #endif
358  dgescal(
359  mr,
360  nr,
361  beta,
362  &C[i * BLOCK_SZ_MR * incRowC + j * BLOCK_SZ_NR * incColC],
363  incRowC,
364  incColC);
365  dgeaxpy(
366  mr,
367  nr,
368  1.0,
369  DGEMM_BUFF_C,
370  1,
371  BLOCK_SZ_MR,
372  &C[i * BLOCK_SZ_MR * incRowC + j * BLOCK_SZ_NR * incColC],
373  incRowC,
374  incColC);
375  }
376  }
377  }
378 }
379 
380 // Main DGEMM entrypoint, compute C <- beta*C + alpha*A*B
382  int n,
383  int k,
384  double alpha,
385  const double *A,
386  int incRowA,
387  int incColA,
388  const double *B,
389  int incRowB,
390  int incColB,
391  double beta,
392  double *C,
393  int incRowC,
394  int incColC) {
395  int mb = (m + BLOCK_SZ_M - 1) / BLOCK_SZ_M;
396  int nb = (n + BLOCK_SZ_N - 1) / BLOCK_SZ_N;
397  int kb = (k + BLOCK_SZ_K - 1) / BLOCK_SZ_K;
398 
399  int _mc = m % BLOCK_SZ_M;
400  int _nc = n % BLOCK_SZ_N;
401  int _kc = k % BLOCK_SZ_K;
402 
403  int mc, nc, kc;
404  int i, j, l;
405 
406  double _beta;
407 
408  if (fabs(alpha) < std::numeric_limits<double>::epsilon() || k == 0) {
409  dgescal(m, n, beta, C, incRowC, incColC);
410  return;
411  }
412 
413  for (j = 0; j < nb; ++j) {
414  nc = (j != nb - 1 || _nc == 0) ? BLOCK_SZ_N : _nc;
415 
416  for (l = 0; l < kb; ++l) {
417  kc = (l != kb - 1 || _kc == 0) ? BLOCK_SZ_K : _kc;
418  _beta = (l == 0) ? beta : 1.0;
419 
420  pack_buffer_B(
421  kc,
422  nc,
423  &B[l * BLOCK_SZ_K * incRowB + j * BLOCK_SZ_N * incColB],
424  incRowB,
425  incColB,
426  DGEMM_BUFF_B);
427 
428  for (i = 0; i < mb; ++i) {
429  mc = (i != mb - 1 || _mc == 0) ? BLOCK_SZ_M : _mc;
430 
431  pack_buffer_A(
432  mc,
433  kc,
434  &A[i * BLOCK_SZ_M * incRowA + l * BLOCK_SZ_K * incColA],
435  incRowA,
436  incColA,
437  DGEMM_BUFF_A);
438 
439  dgemm_macro_kernel(
440  mc,
441  nc,
442  kc,
443  alpha,
444  _beta,
445  &C[i * BLOCK_SZ_M * incRowC + j * BLOCK_SZ_N * incColC],
446  incRowC,
447  incColC);
448  }
449  }
450  }
451 }
#define BLOCK_SZ_M
Definition: _dgemm.hpp:37
#define BLOCK_SZ_MR
Definition: _dgemm.hpp:40
#define BLOCK_SZ_N
Definition: _dgemm.hpp:39
#define BLOCK_SZ_NR
Definition: _dgemm.hpp:41
#define BLOCK_SZ_K
Definition: _dgemm.hpp:38
void dgemm_micro_kernel(int kc, double alpha, const double *A, const double *B, double beta, double *C, int incRowC, int incColC)
Micro kernel that multiplies packed panels from A and B and updates a tile of C.
void pack_buffer_B(int kc, int nc, const double *B, int incRowB, int incColB, double *buffer)
Packs panels from B with padding if needed.
Definition: dgemm_arr.cpp:186
void pack_micro_A(int k, const double *A, int incRowA, int incColA, double *buffer)
Packs micro panels of size BLOCK_SZ_MR rows by k columns from A without padding.
Definition: dgemm_arr.cpp:121
void dgeaxpy(int m, int n, double alpha, const double *X, int incRowX, int incColX, double *Y, int incRowY, int incColY)
Computes Y += alpha*X (double-precision alpha times X plus Y, for strided matrices).
Definition: dgemm_arr.cpp:217
void pack_micro_B(int k, const double *B, int incRowB, int incColB, double *buffer)
Packs micro panels of size BLOCK_SZ_NR columns by k rows from B without padding.
Definition: dgemm_arr.cpp:169
void dgescal(int m, int n, double alpha, double *X, int incRowX, int incColX)
Scales elements of X by alpha.
Definition: dgemm_arr.cpp:248
void dgemm_macro_kernel(int mc, int nc, int kc, double alpha, double beta, double *C, int incRowC, int incColC)
Macro kernel for the multiplication of blocks of A and B.
Definition: dgemm_arr.cpp:275
void pack_buffer_A(int mc, int kc, const double *A, int incRowA, int incColA, double *buffer)
Packs panels from A with padding if needed.
Definition: dgemm_arr.cpp:138
void dgemm_nn(int m, int n, int k, double alpha, const double *A, int incRowA, int incColA, const double *B, int incRowB, int incColB, double beta, double *C, int incRowC, int incColC)
Main DGEMM entrypoint, computes C <- beta*C + alpha*A*B.
Definition: dgemm_arr.cpp:381
list C
Definition: linalg.py:24
list A
Definition: linalg.py:22
list B
Definition: linalg.py:23