40 #if defined(__x86_64__) || defined(__amd64__) || defined(__amd64)
50 #include <immintrin.h>
58 void gpmp::linalg::Mtx::mtx_add(
const double *
A,
64 for (
int i = 0; i <
rows; ++i) {
67 for (; j <
cols - 3; j += 4) {
69 __m256d a = _mm256_loadu_pd(&
A[i *
cols + j]);
70 __m256d b = _mm256_loadu_pd(&
B[i *
cols + j]);
71 __m256d c = _mm256_loadu_pd(&
C[i *
cols + j]);
73 c = _mm256_add_pd(a, b);
76 _mm256_storeu_pd(&
C[i *
cols + j], c);
80 for (; j <
cols; ++j) {
96 if (cols_a != rows_a) {
98 std::cerr <<
"Matching error";
104 for (
int i = 0; i < rows_a; ++i) {
105 for (
int j = 0; j < cols_b - 3; j += 4) {
107 __m256d sum_vec = _mm256_setzero_pd();
109 for (
int k = 0; k < cols_a; ++k) {
110 __m256d a_vec = _mm256_set1_pd(
A[i * cols_a + k]);
112 __m256d b_vec = _mm256_loadu_pd(&
B[k * cols_b + j]);
114 __m256d prod = _mm256_mul_pd(a_vec, b_vec);
116 sum_vec = _mm256_add_pd(sum_vec, prod);
118 _mm256_storeu_pd(&
C[i * cols_b + j], sum_vec);
122 for (
int j = cols_b - cols_b % 4; j < cols_b; ++j) {
125 for (
int k = 0; k < cols_a; ++k) {
126 sum +=
A[i * cols_a + k] *
B[k * cols_b + j];
128 C[i * cols_b + j] = sum;
135 std_mtx_mult(
A,
B,
C, rows_a, cols_a, cols_b);
void std_mtx_add(const T *A, const T *B, T *C, int rows, int cols)
Perform matrix addition on two matrices as flat arrays.
void mtx_mult(std::vector< double > A, std::vector< double > B, std::vector< double > C)