40 #if defined(__x86_64__) || defined(__amd64__) || defined(__amd64)
50 #include <immintrin.h>
58 void gpmp::linalg::Mtx::mtx_add(
const std::vector<std::vector<double>> &
A,
59 const std::vector<std::vector<double>> &
B,
60 std::vector<std::vector<double>> &
C) {
61 const int rows =
A.size();
62 const int cols =
A[0].size();
65 for (
int i = 0; i <
rows; ++i) {
68 for (; j <
cols - 3; j += 4) {
70 __m256d a = _mm256_loadu_pd(&
A[i][j]);
71 __m256d b = _mm256_loadu_pd(&
B[i][j]);
72 __m256d c = _mm256_loadu_pd(&
C[i][j]);
75 c = _mm256_add_pd(a, b);
78 _mm256_storeu_pd(&
C[i][j], c);
82 for (; j <
cols; ++j) {
83 C[i][j] =
A[i][j] +
B[i][j];
92 void gpmp::linalg::Mtx::mtx_sub(
const std::vector<std::vector<double>> &
A,
93 const std::vector<std::vector<double>> &
B,
94 std::vector<std::vector<double>> &
C) {
95 const int rows =
A.size();
96 const int cols =
A[0].size();
98 for (
int i = 0; i <
rows; ++i) {
101 for (; j <
cols - 3; j += 4) {
103 __m256d a = _mm256_loadu_pd(&
A[i][j]);
104 __m256d b = _mm256_loadu_pd(&
B[i][j]);
105 __m256d c = _mm256_loadu_pd(&
C[i][j]);
108 c = _mm256_sub_pd(a, b);
111 _mm256_storeu_pd(&
C[i][j], c);
115 for (; j <
cols; ++j) {
116 C[i][j] =
A[i][j] -
B[i][j];
123 const std::vector<std::vector<double>> &
B,
124 std::vector<std::vector<double>> &
C) {
125 const int rows_a =
A.size();
126 const int cols_a =
A[0].size();
127 const int cols_b =
B[0].size();
129 for (
int i = 0; i < rows_a; ++i) {
130 for (
int j = 0; j < cols_b; j += 4) {
132 __m256d c = _mm256_setzero_pd();
134 for (
int k = 0; k < cols_a; ++k) {
136 __m256d a = _mm256_set1_pd(
A[i][k]);
137 __m256d b = _mm256_loadu_pd(&
B[k][j]);
140 __m256d prod = _mm256_mul_pd(a, b);
143 c = _mm256_add_pd(c, prod);
147 _mm256_storeu_pd(&
C[i][j], c);
151 for (
int j = cols_b - cols_b % 4; j < cols_b; ++j) {
154 for (
int k = 0; k < cols_a; ++k) {
155 sum +=
A[i][k] *
B[k][j];
163 void gpmp::linalg::Mtx::mtx_tpose(std::vector<std::vector<double>> &matrix) {
164 const int rows = matrix.size();
165 const int cols = matrix[0].size();
167 for (
int i = 0; i <
rows; i += 4) {
168 for (
int j = i; j <
cols; j += 4) {
169 __m256d row1 = _mm256_loadu_pd(&matrix[i][j]);
170 __m256d row2 = _mm256_loadu_pd(&matrix[i + 1][j]);
171 __m256d row3 = _mm256_loadu_pd(&matrix[i + 2][j]);
172 __m256d row4 = _mm256_loadu_pd(&matrix[i + 3][j]);
174 __m256d tmp1, tmp2, tmp3, tmp4;
177 tmp1 = _mm256_unpacklo_pd(row1, row2);
178 tmp2 = _mm256_unpackhi_pd(row1, row2);
179 tmp3 = _mm256_unpacklo_pd(row3, row4);
180 tmp4 = _mm256_unpackhi_pd(row3, row4);
182 row1 = _mm256_permute2f128_pd(tmp1, tmp3, 0x20);
183 row2 = _mm256_permute2f128_pd(tmp2, tmp4, 0x20);
184 row3 = _mm256_permute2f128_pd(tmp1, tmp3, 0x31);
185 row4 = _mm256_permute2f128_pd(tmp2, tmp4, 0x31);
188 _mm256_storeu_pd(&matrix[i][j], row1);
189 _mm256_storeu_pd(&matrix[i + 1][j], row2);
190 _mm256_storeu_pd(&matrix[i + 2][j], row3);
191 _mm256_storeu_pd(&matrix[i + 3][j], row4);
void std_mtx_add(const T *A, const T *B, T *C, int rows, int cols)
Perform matrix addition on two matrices as flat arrays.
void mtx_mult(std::vector< double > A, std::vector< double > B, std::vector< double > C)