Class for performing matrix multiplication on float type arrays. More...

#include <_sgemm.hpp>

Public Member Functions
void	pack_micro_A (int k, const float *A, int incRowA, int incColA, float *buffer)
	Packs micro panels of size BLOCK_SZ_MR rows by k columns from A without padding. More...

void	pack_buffer_A (int mc, int kc, const float *A, int incRowA, int incColA, float *buffer)
	Packs panels from A with padding if needed. More...

void	pack_micro_B (int k, const float *B, int incRowB, int incColB, float *buffer)
	Packs micro panels of size BLOCK_SZ_NR columns by k rows from B without padding. More...

void	pack_buffer_B (int kc, int nc, const float *B, int incRowB, int incColB, float *buffer)
	Packs panels from B with padding if needed. More...

void	sgemm_micro_kernel (int kc, float alpha, const float *A, const float *B, float beta, float *C, int incRowC, int incColC)
	Computes the micro kernel that multiplies panels from A and B. More...

void	sgeaxpy (int m, int n, float alpha, const float *X, int incRowX, int incColX, float *Y, int incRowY, int incColY)
	Computes Y += alpha*X (float precision AX + Y) More...

void	sgescal (int m, int n, float alpha, float *X, int incRowX, int incColX)
	Scales elements of X by alpha. More...

void	sgemm_macro_kernel (int mc, int nc, int kc, float alpha, float beta, float *C, int incRowC, int incColC)
	Macro kernel for the multiplication of blocks of A and B. More...

void	sgemm_nn (int m, int n, int k, float alpha, const float *A, int incRowA, int incColA, const float *B, int incRowB, int incColB, float beta, float *C, int incRowC, int incColC)
	Main SGEMM entrypoint, computes C <- beta*C + alpha*A*B. More...

Static Public Attributes
static float	SGEMM_BUFF_A [BLOCK_SZ_M *BLOCK_SZ_K]

static float	SGEMM_BUFF_B [BLOCK_SZ_K *BLOCK_SZ_N]

static float	SGEMM_BUFF_C [BLOCK_SZ_MR *BLOCK_SZ_NR]

Detailed Description

Class for performing matrix multiplication on float type arrays.

Definition at line 50 of file _sgemm.hpp.

Member Function Documentation

◆ pack_buffer_A()

void gpmp::linalg::SGEMM::pack_buffer_A	(	int	mc,
		int	kc,
		const float *	A,
		int	incRowA,
		int	incColA,
		float *	buffer
	)

Packs panels from A with padding if needed.

Parameters

mc	Number of rows to pack
kc	Number of columns to pack
A	Pointer to the source matrix A
incRowA	Increment between consecutive rows of A
incColA	Increment between consecutive columns of A
buffer	Pointer to the buffer to store the packed panels

Definition at line 67 of file sgemm_arr.cpp.

                                                        {
     int mp = mc / BLOCK_SZ_MR;
     int _mr = mc % BLOCK_SZ_MR;
  
     int i, j;
  
     for (i = 0; i < mp; ++i) {
         pack_micro_A(kc, A, incRowA, incColA, buffer);
         buffer += kc * BLOCK_SZ_MR;
         A += BLOCK_SZ_MR * incRowA;
     }
     if (_mr > 0) {
         for (j = 0; j < kc; ++j) {
             for (i = 0; i < _mr; ++i) {
                 buffer[i] = A[i * incRowA];
             }
             for (i = _mr; i < BLOCK_SZ_MR; ++i) {
                 buffer[i] = 0.0f;
             }
             buffer += BLOCK_SZ_MR;
             A += incColA;
         }
     }
 }

References python.linalg::A, and BLOCK_SZ_MR.

◆ pack_buffer_B()

void gpmp::linalg::SGEMM::pack_buffer_B	(	int	kc,
		int	nc,
		const float *	B,
		int	incRowB,
		int	incColB,
		float *	buffer
	)

Packs panels from B with padding if needed.

Parameters

kc	Number of rows to pack
nc	Number of columns to pack
B	Pointer to the source matrix B
incRowB	Increment between consecutive rows of B
incColB	Increment between consecutive columns of B
buffer	Pointer to the buffer to store the packed panels

Definition at line 115 of file sgemm_arr.cpp.

                                                        {
     int np = nc / BLOCK_SZ_NR;
     int _nr = nc % BLOCK_SZ_NR;
  
     int i, j;
  
     for (j = 0; j < np; ++j) {
         pack_micro_B(kc, B, incRowB, incColB, buffer);
         buffer += kc * BLOCK_SZ_NR;
         B += BLOCK_SZ_NR * incColB;
     }
     if (_nr > 0) {
         for (i = 0; i < kc; ++i) {
             for (j = 0; j < _nr; ++j) {
                 buffer[j] = B[j * incColB];
             }
             for (j = _nr; j < BLOCK_SZ_NR; ++j) {
                 buffer[j] = 0.0f;
             }
             buffer += BLOCK_SZ_NR;
             B += incRowB;
         }
     }
 }

References python.linalg::B, and BLOCK_SZ_NR.

◆ pack_micro_A()

void gpmp::linalg::SGEMM::pack_micro_A	(	int	k,
		const float *	A,
		int	incRowA,
		int	incColA,
		float *	buffer
	)

Packs micro panels of size BLOCK_SZ_MR rows by k columns from A without padding.

Parameters

k	Number of columns to pack
A	Pointer to the source matrix A
incRowA	Increment between consecutive rows of A
incColA	Increment between consecutive columns of A
buffer	Pointer to the buffer to store the packed micro panels

Definition at line 50 of file sgemm_arr.cpp.

                                                       {
     int i, j;
  
     for (j = 0; j < k; ++j) {
         for (i = 0; i < BLOCK_SZ_MR; ++i) {
             buffer[i] = A[i * incRowA];
         }
         buffer += BLOCK_SZ_MR;
         A += incColA;
     }
 }

References python.linalg::A, and BLOCK_SZ_MR.

◆ pack_micro_B()

void gpmp::linalg::SGEMM::pack_micro_B	(	int	k,
		const float *	B,
		int	incRowB,
		int	incColB,
		float *	buffer
	)

Packs micro panels of size BLOCK_SZ_NR columns by k rows from B without padding.

Parameters

k	Number of rows to pack
B	Pointer to the source matrix B
incRowB	Increment between consecutive rows of B
incColB	Increment between consecutive columns of B
buffer	Pointer to the buffer to store the packed micro panels

Definition at line 98 of file sgemm_arr.cpp.

                                                       {
     int i, j;
  
     for (i = 0; i < k; ++i) {
         for (j = 0; j < BLOCK_SZ_NR; ++j) {
             buffer[j] = B[j * incColB];
         }
         buffer += BLOCK_SZ_NR;
         B += incRowB;
     }
 }

References python.linalg::B, and BLOCK_SZ_NR.

◆ sgeaxpy()

void gpmp::linalg::SGEMM::sgeaxpy	(	int	m,
		int	n,
		float	alpha,
		const float *	X,
		int	incRowX,
		int	incColX,
		float *	Y,
		int	incRowY,
		int	incColY
	)

Computes Y += alpha*X (float precision AX + Y)

Parameters

m	Number of rows
n	Number of columns
alpha	Scalar alpha
X	Pointer to matrix X
incRowX	Increment between consecutive rows of X
incColX	Increment between consecutive columns of X
Y	Pointer to matrix Y
incRowY	Increment between consecutive rows of Y
incColY	Increment between consecutive columns of Y

Definition at line 208 of file sgemm_arr.cpp.

                                                {
     int i, j;
  
     if (fabs(alpha - 1.0f) > std::numeric_limits<float>::epsilon()) {
  
         for (j = 0; j < n; ++j) {
             for (i = 0; i < m; ++i) {
                 Y[i * incRowY + j * incColY] +=
                     alpha * X[i * incRowX + j * incColX];
             }
         }
     }
  
     else {
         for (j = 0; j < n; ++j) {
             for (i = 0; i < m; ++i) {
                 Y[i * incRowY + j * incColY] += X[i * incRowX + j * incColX];
             }
         }
     }
 }

◆ sgemm_macro_kernel()

void gpmp::linalg::SGEMM::sgemm_macro_kernel	(	int	mc,
		int	nc,
		int	kc,
		float	alpha,
		float	beta,
		float *	C,
		int	incRowC,
		int	incColC
	)

Macro kernel for the multiplication of blocks of A and B.

Parameters

mc	Number of rows in the block of C
nc	Number of columns in the block of C
kc	Number of columns in the blocks of A and rows of B
alpha	Scalar alpha
beta	Scalar beta
C	Pointer to the output matrix C
incRowC	Increment between consecutive rows of C
incColC	Increment between consecutive columns of C

Definition at line 266 of file sgemm_arr.cpp.

                                                           {
  
     int mp = (mc + BLOCK_SZ_MR - 1) / BLOCK_SZ_MR;
     int np = (nc + BLOCK_SZ_NR - 1) / BLOCK_SZ_NR;
  
     int _mr = mc % BLOCK_SZ_MR;
     int _nr = nc % BLOCK_SZ_NR;
  
     int mr, nr;
     int i, j;
  
     for (j = 0; j < np; ++j) {
         nr = (j != np - 1 || _nr == 0) ? BLOCK_SZ_NR : _nr;
  
         for (i = 0; i < mp; ++i) {
             mr = (i != mp - 1 || _mr == 0) ? BLOCK_SZ_MR : _mr;
  
             if (mr == BLOCK_SZ_MR && nr == BLOCK_SZ_NR) {
                 sgemm_micro_kernel(
                     kc,
                     alpha,
                     &SGEMM_BUFF_A[i * kc * BLOCK_SZ_MR],
                     &SGEMM_BUFF_B[j * kc * BLOCK_SZ_NR],
                     beta,
                     &C[i * BLOCK_SZ_MR * incRowC + j * BLOCK_SZ_NR * incColC],
                     incRowC,
                     incColC);
             } else {
                 sgemm_micro_kernel(kc,
                                    alpha,
                                    &SGEMM_BUFF_A[i * kc * BLOCK_SZ_MR],
                                    &SGEMM_BUFF_B[j * kc * BLOCK_SZ_NR],
                                    0.0f,
                                    SGEMM_BUFF_C,
                                    1,
                                    BLOCK_SZ_MR);
                 sgescal(
                     mr,
                     nr,
                     beta,
                     &C[i * BLOCK_SZ_MR * incRowC + j * BLOCK_SZ_NR * incColC],
                     incRowC,
                     incColC);
                 sgeaxpy(
                     mr,
                     nr,
                     1.0f,
                     SGEMM_BUFF_C,
                     1,
                     BLOCK_SZ_MR,
                     &C[i * BLOCK_SZ_MR * incRowC + j * BLOCK_SZ_NR * incColC],
                     incRowC,
                     incColC);
             }
         }
     }
 }

References BLOCK_SZ_MR, BLOCK_SZ_NR, and python.linalg::C.

◆ sgemm_micro_kernel()

void gpmp::linalg::SGEMM::sgemm_micro_kernel	(	int	kc,
		float	alpha,
		const float *	A,
		const float *	B,
		float	beta,
		float *	C,
		int	incRowC,
		int	incColC
	)

Computes the micro kernel that multiplies panels from A and B.

Parameters

kc	Number of columns of the packed panel A and rows of the packed panel B
alpha	Scalar alpha
A	Pointer to the packed panel A
B	Pointer to the packed panel B
beta	Scalar beta
C	Pointer to the output matrix C
incRowC	Increment between consecutive rows of C
incColC	Increment between consecutive columns of C

Definition at line 146 of file sgemm_arr.cpp.

                                                           {
     float AB[BLOCK_SZ_MR * BLOCK_SZ_NR];
  
     int i, j, l;
  
     // Compute AB = A*B
     for (l = 0; l < BLOCK_SZ_MR * BLOCK_SZ_NR; ++l) {
         AB[l] = 0;
     }
     for (l = 0; l < kc; ++l) {
         for (j = 0; j < BLOCK_SZ_NR; ++j) {
             for (i = 0; i < BLOCK_SZ_MR; ++i) {
                 AB[i + j * BLOCK_SZ_MR] += A[i] * B[j];
             }
         }
         A += BLOCK_SZ_MR;
         B += BLOCK_SZ_NR;
     }
  
     // Update C <- beta*C
     if (fabs(beta - 0.0f) < std::numeric_limits<float>::epsilon()) {
         for (j = 0; j < BLOCK_SZ_NR; ++j) {
             for (i = 0; i < BLOCK_SZ_MR; ++i) {
                 C[i * incRowC + j * incColC] = 0.0f;
             }
         }
     } else if (fabs(beta - 1.0f) > std::numeric_limits<float>::epsilon()) {
         for (j = 0; j < BLOCK_SZ_NR; ++j) {
             for (i = 0; i < BLOCK_SZ_MR; ++i) {
                 C[i * incRowC + j * incColC] *= beta;
             }
         }
     }
  
     // Update C <- C + alpha*AB (note: the case alpha==0.0f was already treated
     // in
     //                                  the above layer sgemm_nn)
     if (fabs(alpha - 1.0f) < std::numeric_limits<float>::epsilon()) {
         for (j = 0; j < BLOCK_SZ_NR; ++j) {
             for (i = 0; i < BLOCK_SZ_MR; ++i) {
                 C[i * incRowC + j * incColC] += AB[i + j * BLOCK_SZ_MR];
             }
         }
     }
  
     else {
         for (j = 0; j < BLOCK_SZ_NR; ++j) {
             for (i = 0; i < BLOCK_SZ_MR; ++i) {
                 C[i * incRowC + j * incColC] += alpha * AB[i + j * BLOCK_SZ_MR];
             }
         }
     }
 }

References python.linalg::A, python.linalg::B, BLOCK_SZ_MR, BLOCK_SZ_NR, and python.linalg::C.

◆ sgemm_nn()

void gpmp::linalg::SGEMM::sgemm_nn	(	int	m,
		int	n,
		int	k,
		float	alpha,
		const float *	A,
		int	incRowA,
		int	incColA,
		const float *	B,
		int	incRowB,
		int	incColB,
		float	beta,
		float *	C,
		int	incRowC,
		int	incColC
	)

Main SGEMM entrypoint, computes C <- beta*C + alpha*A*B.

Parameters

m	Number of rows of A and rows of C
n	Number of columns of B and columns of C
k	Number of columns of A and rows of B
alpha	Scalar alpha
A	Pointer to matrix A
incRowA	Increment between consecutive rows of A
incColA	Increment between consecutive columns of A
B	Pointer to matrix B
incRowB	Increment between consecutive rows of B
incColB	Increment between consecutive columns of B
beta	Scalar beta
C	Pointer to matrix C
incRowC	Increment between consecutive rows of C
incColC	Increment between consecutive columns of C

Definition at line 332 of file sgemm_arr.cpp.

                                                 {
     int mb = (m + BLOCK_SZ_M - 1) / BLOCK_SZ_M;
     int nb = (n + BLOCK_SZ_N - 1) / BLOCK_SZ_N;
     int kb = (k + BLOCK_SZ_K - 1) / BLOCK_SZ_K;
  
     int _mc = m % BLOCK_SZ_M;
     int _nc = n % BLOCK_SZ_N;
     int _kc = k % BLOCK_SZ_K;
  
     int mc, nc, kc;
     int i, j, l;
  
     float _beta;
  
     if (fabs(alpha) < std::numeric_limits<float>::epsilon() || k == 0) {
         sgescal(m, n, beta, C, incRowC, incColC);
         return;
     }
  
     for (j = 0; j < nb; ++j) {
         nc = (j != nb - 1 || _nc == 0) ? BLOCK_SZ_N : _nc;
  
         for (l = 0; l < kb; ++l) {
             kc = (l != kb - 1 || _kc == 0) ? BLOCK_SZ_K : _kc;
             _beta = (l == 0) ? beta : 1.0f;
  
             pack_buffer_B(
                 kc,
                 nc,
                 &B[l * BLOCK_SZ_K * incRowB + j * BLOCK_SZ_N * incColB],
                 incRowB,
                 incColB,
                 SGEMM_BUFF_B);
  
             for (i = 0; i < mb; ++i) {
                 mc = (i != mb - 1 || _mc == 0) ? BLOCK_SZ_M : _mc;
  
                 pack_buffer_A(
                     mc,
                     kc,
                     &A[i * BLOCK_SZ_M * incRowA + l * BLOCK_SZ_K * incColA],
                     incRowA,
                     incColA,
                     SGEMM_BUFF_A);
  
                 sgemm_macro_kernel(
                     mc,
                     nc,
                     kc,
                     alpha,
                     _beta,
                     &C[i * BLOCK_SZ_M * incRowC + j * BLOCK_SZ_N * incColC],
                     incRowC,
                     incColC);
             }
         }
     }
 }

References python.linalg::A, python.linalg::B, BLOCK_SZ_K, BLOCK_SZ_M, BLOCK_SZ_N, and python.linalg::C.

◆ sgescal()

void gpmp::linalg::SGEMM::sgescal	(	int	m,
		int	n,
		float	alpha,
		float *	X,
		int	incRowX,
		int	incColX
	)

Scales elements of X by alpha.

Parameters

m	Number of rows
n	Number of columns
alpha	Scalar alpha
X	Pointer to matrix X
incRowX	Increment between consecutive rows of X
incColX	Increment between consecutive columns of X

Definition at line 239 of file sgemm_arr.cpp.

                                                {
     int i, j;
  
     if (fabs(alpha - 0.0f) > std::numeric_limits<float>::epsilon()) {
         for (j = 0; j < n; ++j) {
             for (i = 0; i < m; ++i) {
                 X[i * incRowX + j * incColX] *= alpha;
             }
         }
     }
  
     else {
         for (j = 0; j < n; ++j) {
             for (i = 0; i < m; ++i) {
                 X[i * incRowX + j * incColX] = 0.0f;
             }
         }
     }
 }

Member Data Documentation

◆ SGEMM_BUFF_A

float gpmp::linalg::SGEMM::SGEMM_BUFF_A

static

Buffer for storing packed micro panels of A

Single precision GEneral Matrix-Matrix product

Definition at line 53 of file _sgemm.hpp.

◆ SGEMM_BUFF_B

float gpmp::linalg::SGEMM::SGEMM_BUFF_B

static

Buffer for storing packed micro panels of B

Definition at line 55 of file _sgemm.hpp.

◆ SGEMM_BUFF_C

float gpmp::linalg::SGEMM::SGEMM_BUFF_C

static

Buffer for storing intermediate results

Definition at line 57 of file _sgemm.hpp.

The documentation for this class was generated from the following files:

include/openGPMP/linalg/_sgemm.hpp
modules/linalg/sgemm_arr.cpp

Public Member Functions

Static Public Attributes

Detailed Description

Member Function Documentation

◆ pack_buffer_A()

◆ pack_buffer_B()

◆ pack_micro_A()

◆ pack_micro_B()

◆ sgeaxpy()

◆ sgemm_macro_kernel()

◆ sgemm_micro_kernel()

◆ sgemm_nn()

◆ sgescal()

Member Data Documentation

◆ SGEMM_BUFF_A

◆ SGEMM_BUFF_B

◆ SGEMM_BUFF_C