40 #if defined(__x86_64__) || defined(__amd64__) || defined(__amd64)
47 #elif defined(__SSE2__)
49 #include <emmintrin.h>
50 #include <smmintrin.h>
58 void gpmp::linalg::Mtx::mtx_add(
const std::vector<std::vector<double>> &
A,
59 const std::vector<std::vector<double>> &
B,
60 std::vector<std::vector<double>> &
C) {
61 const int rows =
A.size();
62 const int cols =
A[0].size();
64 for (
int i = 0; i <
rows; ++i) {
67 for (; j <
cols - 1; j += 2) {
69 __m128d a = _mm_loadu_pd(&
A[i][j]);
70 __m128d b = _mm_loadu_pd(&
B[i][j]);
71 __m128d c = _mm_loadu_pd(&
C[i][j]);
77 _mm_storeu_pd(&
C[i][j], c);
81 for (; j <
cols; ++j) {
82 C[i][j] =
A[i][j] +
B[i][j];
88 void gpmp::linalg::Mtx::mtx_sub(
const std::vector<std::vector<double>> &
A,
89 const std::vector<std::vector<double>> &
B,
90 std::vector<std::vector<double>> &
C) {
91 const int rows =
A.size();
92 const int cols =
A[0].size();
94 for (
int i = 0; i <
rows; ++i) {
97 for (; j <
cols - 1; j += 2) {
99 __m128d a = _mm_loadu_pd(&
A[i][j]);
100 __m128d b = _mm_loadu_pd(&
B[i][j]);
101 __m128d c = _mm_loadu_pd(&
C[i][j]);
104 c = _mm_sub_pd(a, b);
107 _mm_storeu_pd(&
C[i][j], c);
111 for (; j <
cols; ++j) {
112 C[i][j] =
A[i][j] -
B[i][j];
119 const std::vector<std::vector<double>> &
B,
120 std::vector<std::vector<double>> &
C) {
121 const int rows_a =
A.size();
122 const int cols_a =
A[0].size();
123 const int cols_b =
B[0].size();
125 for (
int i = 0; i < rows_a; ++i) {
126 for (
int j = 0; j < cols_b; j += 2) {
128 __m128d c = _mm_setzero_pd();
130 for (
int k = 0; k < cols_a; ++k) {
132 __m128d a = _mm_set1_pd(
A[i][k]);
133 __m128d b = _mm_loadu_pd(&
B[k][j]);
136 __m128d prod = _mm_mul_pd(a, b);
139 c = _mm_add_pd(c, prod);
143 _mm_storeu_pd(&
C[i][j], c);
147 for (
int j = cols_b - cols_b % 2; j < cols_b; ++j) {
150 for (
int k = 0; k < cols_a; ++k) {
151 sum +=
A[i][k] *
B[k][j];
160 void gpmp::linalg::Mtx::mtx_tpose(std::vector<std::vector<double>> &matrix) {
161 const int rows = matrix.size();
162 const int cols = matrix[0].size();
164 for (
int i = 0; i <
rows; i += 2) {
165 for (
int j = i; j <
cols; j += 2) {
166 __m128d row1 = _mm_loadu_pd(&matrix[i][j]);
167 __m128d row2 = _mm_loadu_pd(&matrix[i + 1][j]);
170 __m128d tmp1 = _mm_unpacklo_pd(row1, row2);
171 __m128d tmp2 = _mm_unpackhi_pd(row1, row2);
174 _mm_storeu_pd(&matrix[i][j], tmp1);
175 _mm_storeu_pd(&matrix[i + 1][j], tmp2);
void mtx_mult(std::vector< double > A, std::vector< double > B, std::vector< double > C)