40 #if defined(__x86_64__) || defined(__amd64__) || defined(__amd64)
49 #include <emmintrin.h>
50 #include <smmintrin.h>
57 void gpmp::linalg::Mtx::mtx_add(
const std::vector<std::vector<int>> &
A,
58 const std::vector<std::vector<int>> &
B,
59 std::vector<std::vector<int>> &
C) {
60 const int rows =
A.size();
61 const int cols =
A[0].size();
64 for (
int i = 0; i <
rows; ++i) {
67 for (; j <
cols - 3; j += 4) {
69 __m128i a = _mm_loadu_si128(
70 reinterpret_cast<const __m128i *
>(&
A[i][j]));
71 __m128i b = _mm_loadu_si128(
72 reinterpret_cast<const __m128i *
>(&
B[i][j]));
73 __m128i c = _mm_loadu_si128(
74 reinterpret_cast<const __m128i *
>(&
C[i][j]));
77 c = _mm_add_epi32(a, b);
80 _mm_storeu_si128(
reinterpret_cast<__m128i *
>(&
C[i][j]), c);
84 for (; j <
cols; ++j) {
85 C[i][j] =
A[i][j] +
B[i][j];
94 void gpmp::linalg::Mtx::mtx_sub(
const std::vector<std::vector<int>> &
A,
95 const std::vector<std::vector<int>> &
B,
96 std::vector<std::vector<int>> &
C) {
97 const int rows =
A.size();
98 const int cols =
A[0].size();
100 for (
int i = 0; i <
rows; ++i) {
103 for (; j <
cols - 3; j += 4) {
106 _mm_loadu_si128(
reinterpret_cast<const __m128i *
>(&
A[i][j]));
108 _mm_loadu_si128(
reinterpret_cast<const __m128i *
>(&
B[i][j]));
110 _mm_loadu_si128(
reinterpret_cast<const __m128i *
>(&
C[i][j]));
113 c = _mm_sub_epi32(a, b);
116 _mm_storeu_si128(
reinterpret_cast<__m128i *
>(&
C[i][j]), c);
120 for (; j <
cols; ++j) {
121 C[i][j] =
A[i][j] -
B[i][j];
127 const std::vector<std::vector<int>> &
B,
128 std::vector<std::vector<int>> &
C) {
129 const int rows_a =
A.size();
130 const int cols_a =
A[0].size();
131 const int cols_b =
B[0].size();
133 for (
int i = 0; i < rows_a; ++i) {
134 for (
int j = 0; j < cols_b; j += 4) {
136 __m128i c = _mm_setzero_si128();
138 for (
int k = 0; k < cols_a; ++k) {
140 __m128i a = _mm_set1_epi32(
A[i][k]);
141 __m128i b = _mm_loadu_si128(
142 reinterpret_cast<const __m128i *
>(&
B[k][j]));
145 __m128i prod = _mm_mullo_epi32(a, b);
148 c = _mm_add_epi32(c, prod);
152 _mm_storeu_si128(
reinterpret_cast<__m128i *
>(&
C[i][j]), c);
156 for (
int j = cols_b - cols_b % 4; j < cols_b; ++j) {
159 for (
int k = 0; k < cols_a; ++k) {
160 sum +=
A[i][k] *
B[k][j];
169 void gpmp::linalg::Mtx::mtx_tpose(std::vector<std::vector<int>> &matrix) {
170 const int rows = matrix.size();
171 const int cols = matrix[0].size();
173 for (
int i = 0; i <
rows; i += 4) {
174 for (
int j = i; j <
cols; j += 4) {
175 __m128i row1 = _mm_loadu_si128(
176 reinterpret_cast<const __m128i *
>(&matrix[i][j]));
177 __m128i row2 = _mm_loadu_si128(
178 reinterpret_cast<const __m128i *
>(&matrix[i + 1][j]));
179 __m128i row3 = _mm_loadu_si128(
180 reinterpret_cast<const __m128i *
>(&matrix[i + 2][j]));
181 __m128i row4 = _mm_loadu_si128(
182 reinterpret_cast<const __m128i *
>(&matrix[i + 3][j]));
184 __m128i tmp1, tmp2, tmp3, tmp4;
187 tmp1 = _mm_unpacklo_epi32(row1, row2);
188 tmp2 = _mm_unpacklo_epi32(row3, row4);
189 tmp3 = _mm_unpackhi_epi32(row1, row2);
190 tmp4 = _mm_unpackhi_epi32(row3, row4);
192 row1 = _mm_unpacklo_epi64(tmp1, tmp2);
193 row2 = _mm_unpackhi_epi64(tmp1, tmp2);
194 row3 = _mm_unpacklo_epi64(tmp3, tmp4);
195 row4 = _mm_unpackhi_epi64(tmp3, tmp4);
198 _mm_storeu_si128(
reinterpret_cast<__m128i *
>(&matrix[i][j]), row1);
199 _mm_storeu_si128(
reinterpret_cast<__m128i *
>(&matrix[i + 1][j]),
201 _mm_storeu_si128(
reinterpret_cast<__m128i *
>(&matrix[i + 2][j]),
203 _mm_storeu_si128(
reinterpret_cast<__m128i *
>(&matrix[i + 3][j]),
void std_mtx_add(const T *A, const T *B, T *C, int rows, int cols)
Perform matrix addition on two matrices as flat arrays.
void mtx_mult(std::vector< double > A, std::vector< double > B, std::vector< double > C)