#if defined(__x86_64__) || defined(__amd64__) || defined(__amd64)

#include <immintrin.h>
void gpmp::linalg::Mtx::mtx_add(const std::vector<std::vector<int>> &A,
                                const std::vector<std::vector<int>> &B,
                                std::vector<std::vector<int>> &C) {
    const int rows = A.size();
    const int cols = A[0].size();

    for (int i = 0; i < rows; ++i) {
        int j = 0;

        // process 8 ints at a time with AVX2
        for (; j < cols - 7; j += 8) {
            __m256i a = _mm256_loadu_si256(
                reinterpret_cast<const __m256i *>(&A[i][j]));
            __m256i b = _mm256_loadu_si256(
                reinterpret_cast<const __m256i *>(&B[i][j]));

            // element-wise addition of the two vectors
            __m256i c = _mm256_add_epi32(a, b);

            // store the result back into C
            _mm256_storeu_si256(reinterpret_cast<__m256i *>(&C[i][j]), c);
        }

        // scalar tail for the remaining columns
        for (; j < cols; ++j) {
            C[i][j] = A[i][j] + B[i][j];
        }
    }
}
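// Usage sketch (illustrative, not part of the library): element-wise
// addition of two 4x10 matrices, a width that exercises both the AVX2
// loop (first 8 columns) and the scalar tail (last 2). A default-
// constructed Mtx and a pre-sized output matrix C are assumed.
//
//   gpmp::linalg::Mtx mtx;
//   std::vector<std::vector<int>> A(4, std::vector<int>(10, 1));
//   std::vector<std::vector<int>> B(4, std::vector<int>(10, 2));
//   std::vector<std::vector<int>> C(4, std::vector<int>(10, 0));
//   mtx.mtx_add(A, B, C); // every C[i][j] becomes 3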
void gpmp::linalg::Mtx::mtx_sub(const std::vector<std::vector<int>> &A,
                                const std::vector<std::vector<int>> &B,
                                std::vector<std::vector<int>> &C) {
    const int rows = A.size();
    const int cols = A[0].size();

    for (int i = 0; i < rows; ++i) {
        int j = 0;

        // process 8 ints at a time with AVX2
        for (; j < cols - 7; j += 8) {
            __m256i a = _mm256_loadu_si256(
                reinterpret_cast<const __m256i *>(&A[i][j]));
            __m256i b = _mm256_loadu_si256(
                reinterpret_cast<const __m256i *>(&B[i][j]));

            // element-wise subtraction of the two vectors
            __m256i c = _mm256_sub_epi32(a, b);

            // store the result back into C
            _mm256_storeu_si256(reinterpret_cast<__m256i *>(&C[i][j]), c);
        }

        // scalar tail for the remaining columns
        for (; j < cols; ++j) {
            C[i][j] = A[i][j] - B[i][j];
        }
    }
}
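// A simple cross-check of the SIMD path against the scalar definition,
// useful when porting these kernels (illustrative sketch; requires
// <cassert>):
//
//   gpmp::linalg::Mtx mtx;
//   std::vector<std::vector<int>> A(4, std::vector<int>(10, 7));
//   std::vector<std::vector<int>> B(4, std::vector<int>(10, 3));
//   std::vector<std::vector<int>> C(4, std::vector<int>(10, 0));
//   mtx.mtx_sub(A, B, C);
//   for (std::size_t i = 0; i < A.size(); ++i)
//       for (std::size_t j = 0; j < A[0].size(); ++j)
//           assert(C[i][j] == A[i][j] - B[i][j]);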
void gpmp::linalg::Mtx::mtx_mult(const std::vector<std::vector<int>> &A,
                                 const std::vector<std::vector<int>> &B,
                                 std::vector<std::vector<int>> &C) {
    const int rows_a = A.size();
    const int cols_a = A[0].size();
    const int cols_b = B[0].size();

    for (int i = 0; i < rows_a; ++i) {
        // process 8 output columns at a time; stop before the ragged tail
        for (int j = 0; j + 7 < cols_b; j += 8) {
            // accumulator for C[i][j..j+7]
            __m256i c = _mm256_setzero_si256();

            for (int k = 0; k < cols_a; ++k) {
                // broadcast A[i][k] to all 8 lanes
                __m256i a = _mm256_set1_epi32(A[i][k]);
                __m256i b = _mm256_loadu_si256(
                    reinterpret_cast<const __m256i *>(&B[k][j]));

                // multiply and accumulate
                __m256i prod = _mm256_mullo_epi32(a, b);
                c = _mm256_add_epi32(c, prod);
            }

            // store the accumulated result in the output matrix
            _mm256_storeu_si256(reinterpret_cast<__m256i *>(&C[i][j]), c);
        }

        // scalar code for the remaining columns
        for (int j = cols_b - cols_b % 8; j < cols_b; ++j) {
            int sum = 0;
            for (int k = 0; k < cols_a; ++k) {
                sum += A[i][k] * B[k][j];
            }
            C[i][j] = sum;
        }
    }
}
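// Shape contract (illustrative sketch): the inner loop above accumulates
// C[i][j..j+7] += A[i][k] * B[k][j..j+7] for each k, so A must be
// rows_a x cols_a, B must be cols_a x cols_b, and the caller must size
// C to rows_a x cols_b before the call.
//
//   gpmp::linalg::Mtx mtx;
//   std::vector<std::vector<int>> A(2, std::vector<int>(3, 1)); // 2x3
//   std::vector<std::vector<int>> B(3, std::vector<int>(4, 2)); // 3x4
//   std::vector<std::vector<int>> C(2, std::vector<int>(4, 0)); // 2x4
//   mtx.mtx_mult(A, B, C); // every C[i][j] becomes 1*2 * 3 = 6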
void gpmp::linalg::Mtx::mtx_tpose(std::vector<std::vector<int>> &matrix) {
    const int rows = matrix.size();
    const int cols = matrix[0].size();

    // transpose the matrix 8x8 block at a time; dimensions are assumed to
    // be multiples of 8 (each block is transposed in place, so off-diagonal
    // blocks are not exchanged with their (j, i) counterparts)
    for (int i = 0; i < rows; i += 8) {
        for (int j = i; j < cols; j += 8) {
            // load an 8x8 block of 32-bit elements
            __m256i row1 = _mm256_loadu_si256(
                reinterpret_cast<const __m256i *>(&matrix[i][j]));
            __m256i row2 = _mm256_loadu_si256(
                reinterpret_cast<const __m256i *>(&matrix[i + 1][j]));
            __m256i row3 = _mm256_loadu_si256(
                reinterpret_cast<const __m256i *>(&matrix[i + 2][j]));
            __m256i row4 = _mm256_loadu_si256(
                reinterpret_cast<const __m256i *>(&matrix[i + 3][j]));
            __m256i row5 = _mm256_loadu_si256(
                reinterpret_cast<const __m256i *>(&matrix[i + 4][j]));
            __m256i row6 = _mm256_loadu_si256(
                reinterpret_cast<const __m256i *>(&matrix[i + 5][j]));
            __m256i row7 = _mm256_loadu_si256(
                reinterpret_cast<const __m256i *>(&matrix[i + 6][j]));
            __m256i row8 = _mm256_loadu_si256(
                reinterpret_cast<const __m256i *>(&matrix[i + 7][j]));

            __m256i tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;

            // interleave 32-bit elements within each 128-bit lane
            tmp1 = _mm256_unpacklo_epi32(row1, row2);
            tmp2 = _mm256_unpacklo_epi32(row3, row4);
            tmp3 = _mm256_unpacklo_epi32(row5, row6);
            tmp4 = _mm256_unpacklo_epi32(row7, row8);
            tmp5 = _mm256_unpackhi_epi32(row1, row2);
            tmp6 = _mm256_unpackhi_epi32(row3, row4);
            tmp7 = _mm256_unpackhi_epi32(row5, row6);
            tmp8 = _mm256_unpackhi_epi32(row7, row8);

            // interleave 64-bit pairs within each 128-bit lane
            row1 = _mm256_unpacklo_epi64(tmp1, tmp2);
            row2 = _mm256_unpacklo_epi64(tmp3, tmp4);
            row3 = _mm256_unpackhi_epi64(tmp1, tmp2);
            row4 = _mm256_unpackhi_epi64(tmp3, tmp4);
            row5 = _mm256_unpacklo_epi64(tmp5, tmp6);
            row6 = _mm256_unpacklo_epi64(tmp7, tmp8);
            row7 = _mm256_unpackhi_epi64(tmp5, tmp6);
            row8 = _mm256_unpackhi_epi64(tmp7, tmp8);

            // the unpacks shuffle only within 128-bit lanes, so exchange
            // lanes to complete the 8x8 transpose
            tmp1 = _mm256_permute2x128_si256(row1, row2, 0x20);
            tmp2 = _mm256_permute2x128_si256(row3, row4, 0x20);
            tmp3 = _mm256_permute2x128_si256(row5, row6, 0x20);
            tmp4 = _mm256_permute2x128_si256(row7, row8, 0x20);
            tmp5 = _mm256_permute2x128_si256(row1, row2, 0x31);
            tmp6 = _mm256_permute2x128_si256(row3, row4, 0x31);
            tmp7 = _mm256_permute2x128_si256(row5, row6, 0x31);
            tmp8 = _mm256_permute2x128_si256(row7, row8, 0x31);

            // store the transposed rows back into the block
            _mm256_storeu_si256(reinterpret_cast<__m256i *>(&matrix[i][j]), tmp1);
            _mm256_storeu_si256(reinterpret_cast<__m256i *>(&matrix[i + 1][j]), tmp2);
            _mm256_storeu_si256(reinterpret_cast<__m256i *>(&matrix[i + 2][j]), tmp3);
            _mm256_storeu_si256(reinterpret_cast<__m256i *>(&matrix[i + 3][j]), tmp4);
            _mm256_storeu_si256(reinterpret_cast<__m256i *>(&matrix[i + 4][j]), tmp5);
            _mm256_storeu_si256(reinterpret_cast<__m256i *>(&matrix[i + 5][j]), tmp6);
            _mm256_storeu_si256(reinterpret_cast<__m256i *>(&matrix[i + 6][j]), tmp7);
            _mm256_storeu_si256(reinterpret_cast<__m256i *>(&matrix[i + 7][j]), tmp8);
        }
    }
}
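// Usage sketch (illustrative): in-place transpose of a single 8x8 block,
// the shape the kernel above is safe for; larger multiples of 8 would
// also need the off-diagonal block swap noted above.
//
//   gpmp::linalg::Mtx mtx;
//   std::vector<std::vector<int>> M(8, std::vector<int>(8));
//   for (int i = 0; i < 8; ++i)
//       for (int j = 0; j < 8; ++j)
//           M[i][j] = i * 8 + j;
//   mtx.mtx_tpose(M); // now M[i][j] == j * 8 + i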