Class for performing matrix multiplication on double type arrays. More...

#include <_dgemm.hpp>

Public Member Functions
void	pack_micro_A (int k, const double *A, int incRowA, int incColA, double *buffer)
	Packs micro panels of size BLOCK_SZ_MR rows by k columns from A without padding. More...

void	pack_buffer_A (int mc, int kc, const double *A, int incRowA, int incColA, double *buffer)
	Packs panels from A with padding if needed. More...

void	pack_micro_B (int k, const double *B, int incRowB, int incColB, double *buffer)
	Packs micro panels of size BLOCK_SZ_NR columns by k rows from B without padding. More...

void	pack_buffer_B (int kc, int nc, const double *B, int incRowB, int incColB, double *buffer)
	Packs panels from B with padding if needed. More...

void	dgemm_micro_kernel (int kc, double alpha, const double *A, const double *B, double beta, double *C, int incRowC, int incColC)
	Computes the micro kernel that multiplies panels from A and B. More...

void	dgemm_micro_kernel (long kc, double alpha, const double *A, const double *B, double beta, double *C, long incRowC, long incColC, const double *nextA, const double *nextB)
	Perform a micro-kernel operation for double-precision matrix-matrix multiplication (DGEMM) More...

void	dgeaxpy (int m, int n, double alpha, const double *X, int incRowX, int incColX, double *Y, int incRowY, int incColY)
	Computes Y += alpha*X (double precision AX + Y) More...

void	dgescal (int m, int n, double alpha, double *X, int incRowX, int incColX)
	Scales elements of X by alpha. More...

void	dgemm_macro_kernel (int mc, int nc, int kc, double alpha, double beta, double *C, int incRowC, int incColC)
	Macro kernel for the multiplication of blocks of A and B. More...

void	dgemm_nn (int m, int n, int k, double alpha, const double *A, int incRowA, int incColA, const double *B, int incRowB, int incColB, double beta, double *C, int incRowC, int incColC)
	Main DGEMM entrypoint, computes C <- beta*C + alpha*A*B. More...

Static Public Member Functions
static double DGEMM_BUFF_A[BLOCK_SZ_M *BLOCK_SZ_K]	__attribute__ ((aligned(32)))

static double DGEMM_BUFF_B[BLOCK_SZ_K *BLOCK_SZ_N]	__attribute__ ((aligned(32)))

static double DGEMM_BUFF_C[BLOCK_SZ_MR *BLOCK_SZ_NR]	__attribute__ ((aligned(32)))

Detailed Description

Class for performing matrix multiplication on double type arrays.

Definition at line 50 of file _dgemm.hpp.

Member Function Documentation

◆ __attribute__() [1/3]

static double DGEMM_BUFF_A [BLOCK_SZ_M * BLOCK_SZ_K] gpmp::linalg::DGEMM::__attribute__ ( (aligned(32)) )

static

Buffer for storing packed micro panels of A

◆ __attribute__() [2/3]

static double DGEMM_BUFF_B [BLOCK_SZ_K * BLOCK_SZ_N] gpmp::linalg::DGEMM::__attribute__ ( (aligned(32)) )

static

Buffer for storing packed micro panels of B

◆ __attribute__() [3/3]

static double DGEMM_BUFF_C [BLOCK_SZ_MR * BLOCK_SZ_NR] gpmp::linalg::DGEMM::__attribute__ ( (aligned(32)) )

static

Buffer for storing intermediate results

◆ dgeaxpy()

void gpmp::linalg::DGEMM::dgeaxpy	(	int	m,
		int	n,
		double	alpha,
		const double *	X,
		int	incRowX,
		int	incColX,
		double *	Y,
		int	incRowY,
		int	incColY
	)

Computes Y += alpha*X (double precision AX + Y)

Parameters

m	Number of rows
n	Number of columns
alpha	Scalar alpha
X	Pointer to matrix X
incRowX	Increment between consecutive rows of X
incColX	Increment between consecutive columns of X
Y	Pointer to matrix Y
incRowY	Increment between consecutive rows of Y
incColY	Increment between consecutive columns of Y

Definition at line 217 of file dgemm_arr.cpp.

                                                {
     int i, j;
  
     if (fabs(alpha - 1.0) > std::numeric_limits<double>::epsilon()) {
  
         for (j = 0; j < n; ++j) {
             for (i = 0; i < m; ++i) {
                 Y[i * incRowY + j * incColY] +=
                     alpha * X[i * incRowX + j * incColX];
             }
         }
     }
  
     else {
         for (j = 0; j < n; ++j) {
             for (i = 0; i < m; ++i) {
                 Y[i * incRowY + j * incColY] += X[i * incRowX + j * incColX];
             }
         }
     }
 }

◆ dgemm_macro_kernel()

void gpmp::linalg::DGEMM::dgemm_macro_kernel	(	int	mc,
		int	nc,
		int	kc,
		double	alpha,
		double	beta,
		double *	C,
		int	incRowC,
		int	incColC
	)

Macro kernel for the multiplication of blocks of A and B.

Parameters

mc	Number of rows in the block of C
nc	Number of columns in the block of C
kc	Number of columns in the blocks of A and rows of B
alpha	Scalar alpha
beta	Scalar beta
C	Pointer to the output matrix C
incRowC	Increment between consecutive rows of C
incColC	Increment between consecutive columns of C

Definition at line 275 of file dgemm_arr.cpp.

                                                           {
  
     int mp = (mc + BLOCK_SZ_MR - 1) / BLOCK_SZ_MR;
     int np = (nc + BLOCK_SZ_NR - 1) / BLOCK_SZ_NR;
  
     int _mr = mc % BLOCK_SZ_MR;
     int _nr = nc % BLOCK_SZ_NR;
  
     int mr, nr;
     int i, j;
  
 #if defined(__SSE__)
  
     const double *nextA = nullptr;
     const double *nextB = nullptr;
  
 #endif
  
     for (j = 0; j < np; ++j) {
         nr = (j != np - 1 || _nr == 0) ? BLOCK_SZ_NR : _nr;
  
         for (i = 0; i < mp; ++i) {
             mr = (i != mp - 1 || _mr == 0) ? BLOCK_SZ_MR : _mr;
  
             if (mr == BLOCK_SZ_MR && nr == BLOCK_SZ_NR) {
 #if defined(__SSE__)
                 dgemm_micro_kernel(
                     kc,
                     alpha,
                     &DGEMM_BUFF_A[i * kc * BLOCK_SZ_MR],
                     &DGEMM_BUFF_B[j * kc * BLOCK_SZ_NR],
                     beta,
                     &C[i * BLOCK_SZ_MR * incRowC + j * BLOCK_SZ_NR * incColC],
                     incRowC,
                     incColC,
                     nextA,
                     nextB);
  
 #else
                 dgemm_micro_kernel(
                     kc,
                     alpha,
                     &DGEMM_BUFF_A[i * kc * BLOCK_SZ_MR],
                     &DGEMM_BUFF_B[j * kc * BLOCK_SZ_NR],
                     beta,
                     &C[i * BLOCK_SZ_MR * incRowC + j * BLOCK_SZ_NR * incColC],
                     incRowC,
                     incColC);
  
 #endif
             }
  
             else {
  
 #if defined(__SSE__)
                 dgemm_micro_kernel(kc,
                                    alpha,
                                    &DGEMM_BUFF_A[i * kc * BLOCK_SZ_MR],
                                    &DGEMM_BUFF_B[j * kc * BLOCK_SZ_NR],
                                    0.0,
                                    DGEMM_BUFF_C,
                                    1,
                                    BLOCK_SZ_MR,
                                    nextA,
                                    nextB);
 #else
                 dgemm_micro_kernel(kc,
                                    alpha,
                                    &DGEMM_BUFF_A[i * kc * BLOCK_SZ_MR],
                                    &DGEMM_BUFF_B[j * kc * BLOCK_SZ_NR],
                                    0.0,
                                    DGEMM_BUFF_C,
                                    1,
                                    BLOCK_SZ_MR);
  
 #endif
                 dgescal(
                     mr,
                     nr,
                     beta,
                     &C[i * BLOCK_SZ_MR * incRowC + j * BLOCK_SZ_NR * incColC],
                     incRowC,
                     incColC);
                 dgeaxpy(
                     mr,
                     nr,
                     1.0,
                     DGEMM_BUFF_C,
                     1,
                     BLOCK_SZ_MR,
                     &C[i * BLOCK_SZ_MR * incRowC + j * BLOCK_SZ_NR * incColC],
                     incRowC,
                     incColC);
             }
         }
     }
 }

References BLOCK_SZ_MR, BLOCK_SZ_NR, and python.linalg::C.

◆ dgemm_micro_kernel() [1/2]

void gpmp::linalg::DGEMM::dgemm_micro_kernel	(	int	kc,
		double	alpha,
		const double *	A,
		const double *	B,
		double	beta,
		double *	C,
		int	incRowC,
		int	incColC
	)

Computes the micro kernel that multiplies panels from A and B.

Parameters

kc	Number of columns
alpha	Scalar alpha
A	Pointer to the packed panel A
B	Pointer to the packed panel B
beta	Scalar beta
C	Pointer to the output matrix C
incRowC	Increment between consecutive rows of C
incColC	Increment between consecutive columns of C

Double-precision general matrix-matrix (DGEMM) product kernel without hardware acceleration.

Definition at line 38 of file dgemm_kernel.cpp.

                                                           {
     double AB[BLOCK_SZ_MR * BLOCK_SZ_NR];
  
     int i, j, l;
  
     // Compute AB = A*B
     for (l = 0; l < BLOCK_SZ_MR * BLOCK_SZ_NR; ++l) {
         AB[l] = 0;
     }
     for (l = 0; l < kc; ++l) {
         for (j = 0; j < BLOCK_SZ_NR; ++j) {
             for (i = 0; i < BLOCK_SZ_MR; ++i) {
                 AB[i + j * BLOCK_SZ_MR] += A[i] * B[j];
             }
         }
         A += BLOCK_SZ_MR;
         B += BLOCK_SZ_NR;
     }
  
     // Update C <- beta*C
     if (fabs(beta - 0.0) < std::numeric_limits<double>::epsilon()) {
         for (j = 0; j < BLOCK_SZ_NR; ++j) {
             for (i = 0; i < BLOCK_SZ_MR; ++i) {
                 C[i * incRowC + j * incColC] = 0.0;
             }
         }
     } else if (fabs(beta - 1.0) > std::numeric_limits<double>::epsilon()) {
         for (j = 0; j < BLOCK_SZ_NR; ++j) {
             for (i = 0; i < BLOCK_SZ_MR; ++i) {
                 C[i * incRowC + j * incColC] *= beta;
             }
         }
     }
  
     // Update C <- C + alpha*AB (note: the case alpha==0.0 was already treated
     // in
     //                                  the above layer dgemm_nn)
     if (fabs(alpha - 1.0) < std::numeric_limits<double>::epsilon()) {
         for (j = 0; j < BLOCK_SZ_NR; ++j) {
             for (i = 0; i < BLOCK_SZ_MR; ++i) {
                 C[i * incRowC + j * incColC] += AB[i + j * BLOCK_SZ_MR];
             }
         }
     }
  
     else {
         for (j = 0; j < BLOCK_SZ_NR; ++j) {
             for (i = 0; i < BLOCK_SZ_MR; ++i) {
                 C[i * incRowC + j * incColC] += alpha * AB[i + j * BLOCK_SZ_MR];
             }
         }
     }
 }

References python.linalg::A, python.linalg::B, BLOCK_SZ_MR, BLOCK_SZ_NR, and python.linalg::C.

◆ dgemm_micro_kernel() [2/2]

void gpmp::linalg::DGEMM::dgemm_micro_kernel	(	long	kc,
		double	alpha,
		const double *	A,
		const double *	B,
		double	beta,
		double *	C,
		long	incRowC,
		long	incColC,
		const double *	nextA,
		const double *	nextB
	)

Perform a micro-kernel operation for double-precision matrix-matrix multiplication (DGEMM)

This function implements a micro-kernel operation for DGEMM, which is used as a building block in larger DGEMM routines. The micro-kernel performs a basic matrix multiplication operation with optimizations tailored for the SSE (Streaming SIMD Extensions) x86 architecture.

Parameters

kc	The size of the inner dimension of the matrices A and B
alpha	Scaling factor for the matrix multiplication
A	Pointer to the first input matrix A in row-major order
B	Pointer to the second input matrix B in column-major order
beta	Scaling factor for the matrix C
C	Pointer to the output matrix C in row-major order
incRowC	Increment for moving between rows of the matrix C
incColC	Increment for moving between columns of the matrix C
nextA	Pointer to the next block of matrix A (not used by this wrapper itself; forwarded unchanged to dgemm_kernel_asm)
nextB	Pointer to the next block of matrix B (not used by this wrapper itself; forwarded unchanged to dgemm_kernel_asm)

Double precision GEneral Matrix-Matrix product

Definition at line 88 of file dgemm_arr.cpp.

                                                                   {
     // Split the inner dimension into whole groups of four (kb) plus a
     // remainder of 0..3 (kl), so that kc == 4 * kb + kl.
     // NOTE(review): presumably the assembly kernel runs a 4x-unrolled loop kb
     // times and a clean-up loop kl times -- confirm in dgemm_kernel_asm.
     long kb = kc / 4;
     long kl = kc % 4;
  
     // All remaining work is delegated to dgemm_kernel_asm, which is defined
     // outside this listing. Note the argument order: kl precedes kb, and
     // alpha/beta come last.
     dgemm_kernel_asm(A,
                      B,
                      C,
                      nextA,
                      nextB,
                      kl,
                      kb,
                      incRowC,
                      incColC,
                      alpha,
                      beta);
 }

References python.linalg::A, python.linalg::B, and python.linalg::C.

◆ dgemm_nn()

void gpmp::linalg::DGEMM::dgemm_nn	(	int	m,
		int	n,
		int	k,
		double	alpha,
		const double *	A,
		int	incRowA,
		int	incColA,
		const double *	B,
		int	incRowB,
		int	incColB,
		double	beta,
		double *	C,
		int	incRowC,
		int	incColC
	)

Main DGEMM entrypoint, computes C <- beta*C + alpha*A*B.

Parameters

m	Number of rows of A and rows of C
n	Number of columns of B and columns of C
k	Number of columns of A and rows of B
alpha	Scalar alpha
A	Pointer to matrix A
incRowA	Increment between consecutive rows of A
incColA	Increment between consecutive columns of A
B	Pointer to matrix B
incRowB	Increment between consecutive rows of B
incColB	Increment between consecutive columns of B
beta	Scalar beta
C	Pointer to matrix C
incRowC	Increment between consecutive rows of C
incColC	Increment between consecutive columns of C

Definition at line 381 of file dgemm_arr.cpp.

                                                 {
     int mb = (m + BLOCK_SZ_M - 1) / BLOCK_SZ_M;
     int nb = (n + BLOCK_SZ_N - 1) / BLOCK_SZ_N;
     int kb = (k + BLOCK_SZ_K - 1) / BLOCK_SZ_K;
  
     int _mc = m % BLOCK_SZ_M;
     int _nc = n % BLOCK_SZ_N;
     int _kc = k % BLOCK_SZ_K;
  
     int mc, nc, kc;
     int i, j, l;
  
     double _beta;
  
     if (fabs(alpha) < std::numeric_limits<double>::epsilon() || k == 0) {
         dgescal(m, n, beta, C, incRowC, incColC);
         return;
     }
  
     for (j = 0; j < nb; ++j) {
         nc = (j != nb - 1 || _nc == 0) ? BLOCK_SZ_N : _nc;
  
         for (l = 0; l < kb; ++l) {
             kc = (l != kb - 1 || _kc == 0) ? BLOCK_SZ_K : _kc;
             _beta = (l == 0) ? beta : 1.0;
  
             pack_buffer_B(
                 kc,
                 nc,
                 &B[l * BLOCK_SZ_K * incRowB + j * BLOCK_SZ_N * incColB],
                 incRowB,
                 incColB,
                 DGEMM_BUFF_B);
  
             for (i = 0; i < mb; ++i) {
                 mc = (i != mb - 1 || _mc == 0) ? BLOCK_SZ_M : _mc;
  
                 pack_buffer_A(
                     mc,
                     kc,
                     &A[i * BLOCK_SZ_M * incRowA + l * BLOCK_SZ_K * incColA],
                     incRowA,
                     incColA,
                     DGEMM_BUFF_A);
  
                 dgemm_macro_kernel(
                     mc,
                     nc,
                     kc,
                     alpha,
                     _beta,
                     &C[i * BLOCK_SZ_M * incRowC + j * BLOCK_SZ_N * incColC],
                     incRowC,
                     incColC);
             }
         }
     }
 }

References python.linalg::A, python.linalg::B, BLOCK_SZ_K, BLOCK_SZ_M, BLOCK_SZ_N, and python.linalg::C.

◆ dgescal()

void gpmp::linalg::DGEMM::dgescal	(	int	m,
		int	n,
		double	alpha,
		double *	X,
		int	incRowX,
		int	incColX
	)

Scales elements of X by alpha.

Parameters

m	Number of rows
n	Number of columns
alpha	Scalar alpha
X	Pointer to matrix X
incRowX	Increment between consecutive rows of X
incColX	Increment between consecutive columns of X

Definition at line 248 of file dgemm_arr.cpp.

                                                {
     int i, j;
  
     if (fabs(alpha - 0.0) > std::numeric_limits<double>::epsilon()) {
         for (j = 0; j < n; ++j) {
             for (i = 0; i < m; ++i) {
                 X[i * incRowX + j * incColX] *= alpha;
             }
         }
     }
  
     else {
         for (j = 0; j < n; ++j) {
             for (i = 0; i < m; ++i) {
                 X[i * incRowX + j * incColX] = 0.0;
             }
         }
     }
 }

◆ pack_buffer_A()

void gpmp::linalg::DGEMM::pack_buffer_A	(	int	mc,
		int	kc,
		const double *	A,
		int	incRowA,
		int	incColA,
		double *	buffer
	)

Packs panels from A with padding if needed.

Parameters

mc	Number of rows to pack
kc	Number of columns to pack
A	Pointer to the source matrix A
incRowA	Increment between consecutive rows of A
incColA	Increment between consecutive columns of A
buffer	Pointer to the buffer to store the packed panels

Definition at line 138 of file dgemm_arr.cpp.

                                                         {
     int mp = mc / BLOCK_SZ_MR;
     int _mr = mc % BLOCK_SZ_MR;
  
     int i, j;
  
     for (i = 0; i < mp; ++i) {
         pack_micro_A(kc, A, incRowA, incColA, buffer);
         buffer += kc * BLOCK_SZ_MR;
         A += BLOCK_SZ_MR * incRowA;
     }
     if (_mr > 0) {
         for (j = 0; j < kc; ++j) {
             for (i = 0; i < _mr; ++i) {
                 buffer[i] = A[i * incRowA];
             }
             for (i = _mr; i < BLOCK_SZ_MR; ++i) {
                 buffer[i] = 0.0;
             }
             buffer += BLOCK_SZ_MR;
             A += incColA;
         }
     }
 }

References python.linalg::A, and BLOCK_SZ_MR.

◆ pack_buffer_B()

void gpmp::linalg::DGEMM::pack_buffer_B	(	int	kc,
		int	nc,
		const double *	B,
		int	incRowB,
		int	incColB,
		double *	buffer
	)

Packs panels from B with padding if needed.

Parameters

kc	Number of rows to pack
nc	Number of columns to pack
B	Pointer to the source matrix B
incRowB	Increment between consecutive rows of B
incColB	Increment between consecutive columns of B
buffer	Pointer to the buffer to store the packed panels

Definition at line 186 of file dgemm_arr.cpp.

                                                         {
     int np = nc / BLOCK_SZ_NR;
     int _nr = nc % BLOCK_SZ_NR;
  
     int i, j;
  
     for (j = 0; j < np; ++j) {
         pack_micro_B(kc, B, incRowB, incColB, buffer);
         buffer += kc * BLOCK_SZ_NR;
         B += BLOCK_SZ_NR * incColB;
     }
     if (_nr > 0) {
         for (i = 0; i < kc; ++i) {
             for (j = 0; j < _nr; ++j) {
                 buffer[j] = B[j * incColB];
             }
             for (j = _nr; j < BLOCK_SZ_NR; ++j) {
                 buffer[j] = 0.0;
             }
             buffer += BLOCK_SZ_NR;
             B += incRowB;
         }
     }
 }

References python.linalg::B, and BLOCK_SZ_NR.

◆ pack_micro_A()

void gpmp::linalg::DGEMM::pack_micro_A	(	int	k,
		const double *	A,
		int	incRowA,
		int	incColA,
		double *	buffer
	)

Packs micro panels of size BLOCK_SZ_MR rows by k columns from A without padding.

Parameters

k	Number of columns to pack
A	Pointer to the source matrix A
incRowA	Increment between consecutive rows of A
incColA	Increment between consecutive columns of A
buffer	Pointer to the buffer to store the packed micro panels

Definition at line 121 of file dgemm_arr.cpp.

                                                        {
     int i, j;
  
     for (j = 0; j < k; ++j) {
         for (i = 0; i < BLOCK_SZ_MR; ++i) {
             buffer[i] = A[i * incRowA];
         }
         buffer += BLOCK_SZ_MR;
         A += incColA;
     }
 }

References python.linalg::A, and BLOCK_SZ_MR.

◆ pack_micro_B()

void gpmp::linalg::DGEMM::pack_micro_B	(	int	k,
		const double *	B,
		int	incRowB,
		int	incColB,
		double *	buffer
	)

Packs micro panels of size BLOCK_SZ_NR columns by k rows from B without padding.

Parameters

k	Number of rows to pack
B	Pointer to the source matrix B
incRowB	Increment between consecutive rows of B
incColB	Increment between consecutive columns of B
buffer	Pointer to the buffer to store the packed micro panels

Definition at line 169 of file dgemm_arr.cpp.

                                                        {
     int i, j;
  
     for (i = 0; i < k; ++i) {
         for (j = 0; j < BLOCK_SZ_NR; ++j) {
             buffer[j] = B[j * incColB];
         }
         buffer += BLOCK_SZ_NR;
         B += incRowB;
     }
 }

References python.linalg::B, and BLOCK_SZ_NR.

The documentation for this class was generated from the following files:

include/openGPMP/linalg/_dgemm.hpp
modules/linalg/dgemm_arr.cpp
modules/linalg/dgemm_kernel.cpp

Public Member Functions

Static Public Member Functions

Detailed Description

Member Function Documentation

◆ __attribute__() [1/3]

◆ __attribute__() [2/3]

◆ __attribute__() [3/3]

◆ dgeaxpy()

◆ dgemm_macro_kernel()

◆ dgemm_micro_kernel() [1/2]

◆ dgemm_micro_kernel() [2/2]

◆ dgemm_nn()

◆ dgescal()

◆ pack_buffer_A()

◆ pack_buffer_B()

◆ pack_micro_A()

◆ pack_micro_B()

◆ __attribute__() [1/3]

◆ __attribute__() [2/3]

◆ __attribute__() [3/3]