#if defined(__x86_64__) || defined(__amd64__) || defined(__amd64)

#include <immintrin.h>

void gpmp::linalg::Mtx::mtx_add(const int16_t *A,
                                const int16_t *B,
                                int16_t *C,
                                int rows,
                                int cols) {
    for (int i = 0; i < rows; ++i) {
        int j = 0;
        // process 16 int16_t elements per iteration in a 256-bit register
        for (; j < cols - 15; j += 16) {
            __m256i a = _mm256_loadu_si256(
                reinterpret_cast<const __m256i *>(&A[i * cols + j]));
            __m256i b = _mm256_loadu_si256(
                reinterpret_cast<const __m256i *>(&B[i * cols + j]));

            // elementwise C = A + B (wraps around on int16_t overflow)
            __m256i c = _mm256_add_epi16(a, b);

            _mm256_storeu_si256(reinterpret_cast<__m256i *>(&C[i * cols + j]),
                                c);
        }

        // scalar tail for columns not covered by the 16-wide loop
        for (; j < cols; ++j) {
            C[i * cols + j] = A[i * cols + j] + B[i * cols + j];
        }
    }
}
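// Example (illustrative sketch, not part of the original source): driving
// mtx_add from caller code. Assumes a default-constructible
// gpmp::linalg::Mtx and the (A, B, C, rows, cols) parameter order above.
//
//   #include <cstdint>
//   #include <vector>
//
//   gpmp::linalg::Mtx mtx;
//   const int rows = 4, cols = 37; // 37 exercises both SIMD and tail loops
//   std::vector<int16_t> A(rows * cols, 1);
//   std::vector<int16_t> B(rows * cols, 2);
//   std::vector<int16_t> C(rows * cols, 0);
//   mtx.mtx_add(A.data(), B.data(), C.data(), rows, cols);
//   // every element of C is now 3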
void gpmp::linalg::Mtx::mtx_sub(const int16_t *A,
                                const int16_t *B,
                                int16_t *C,
                                int rows,
                                int cols) {
    for (int i = 0; i < rows; ++i) {
        int j = 0;
        // process 16 int16_t elements per iteration in a 256-bit register
        for (; j < cols - 15; j += 16) {
            __m256i a = _mm256_loadu_si256(
                reinterpret_cast<const __m256i *>(&A[i * cols + j]));
            __m256i b = _mm256_loadu_si256(
                reinterpret_cast<const __m256i *>(&B[i * cols + j]));

            // elementwise C = A - B (wraps around on int16_t overflow)
            __m256i c = _mm256_sub_epi16(a, b);

            _mm256_storeu_si256(reinterpret_cast<__m256i *>(&C[i * cols + j]),
                                c);
        }

        // scalar tail for columns not covered by the 16-wide loop
        for (; j < cols; ++j) {
            C[i * cols + j] = A[i * cols + j] - B[i * cols + j];
        }
    }
}
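// Note on overflow: _mm256_add_epi16 and _mm256_sub_epi16 wrap around on
// int16_t overflow, matching the scalar tail loops above. If saturating
// arithmetic were wanted instead, AVX2 offers drop-in variants (sketch,
// not part of the original source):
//
//   __m256i sat_sum  = _mm256_adds_epi16(a, b); // clamps to [-32768, 32767]
//   __m256i sat_diff = _mm256_subs_epi16(a, b);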
void gpmp::linalg::Mtx::mtx_mult(const int16_t *A,
                                 const int16_t *B,
                                 int16_t *C,
                                 int rows_a,
                                 int cols_a,
                                 int cols_b) {
    for (int i = 0; i < rows_a; ++i) {
        // 16 output columns per iteration; stop before overrunning the row
        for (int j = 0; j < cols_b - 15; j += 16) {
            __m256i c = _mm256_setzero_si256();

            for (int k = 0; k < cols_a; ++k) {
                // broadcast A[i][k], multiply by a 16-wide slice of row k
                // of B, and accumulate into the output slice
                __m256i a = _mm256_set1_epi16(A[i * cols_a + k]);
                __m256i b = _mm256_loadu_si256(
                    reinterpret_cast<const __m256i *>(&B[k * cols_b + j]));
                __m256i prod = _mm256_mullo_epi16(a, b);
                c = _mm256_add_epi16(c, prod);
            }

            _mm256_storeu_si256(
                reinterpret_cast<__m256i *>(&C[i * cols_b + j]),
                c);
        }

        // scalar tail for output columns not covered by the 16-wide loop
        for (int j = cols_b - cols_b % 16; j < cols_b; ++j) {
            int sum = 0;
            for (int k = 0; k < cols_a; ++k) {
                sum += A[i * cols_a + k] * B[k * cols_b + j];
            }
            // truncation to int16_t matches the wraparound semantics of
            // _mm256_mullo_epi16 / _mm256_add_epi16
            C[i * cols_b + j] = static_cast<int16_t>(sum);
        }
    }
}
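// Reference check (illustrative sketch, not part of the original source):
// a scalar triple loop that should agree with the SIMD path above for any
// rows_a x cols_a by cols_a x cols_b operands, including the mod-2^16
// wraparound of the vectorized products.
//
//   for (int i = 0; i < rows_a; ++i) {
//       for (int j = 0; j < cols_b; ++j) {
//           int16_t sum = 0;
//           for (int k = 0; k < cols_a; ++k)
//               sum = static_cast<int16_t>(
//                   sum + A[i * cols_a + k] * B[k * cols_b + j]);
//           C[i * cols_b + j] = sum;
//       }
//   }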
void gpmp::linalg::Mtx::mtx_mult(const std::vector<double> &A,
                                 const std::vector<double> &B,
                                 std::vector<double> &C)
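// The definition is truncated at this point. One possible body
// (illustrative sketch only; it assumes square, row-major matrices with
// the dimension inferred from A.size(), which the signature above does
// not confirm):
//
//   #include <cmath>
//
//   const int n = static_cast<int>(std::sqrt(A.size()));
//   for (int i = 0; i < n; ++i) {
//       for (int j = 0; j < n; ++j) {
//           double sum = 0.0;
//           for (int k = 0; k < n; ++k)
//               sum += A[i * n + k] * B[k * n + j];
//           C[i * n + j] = sum;
//       }
//   }
//
// Passing C by reference lets the product reach the caller; a by-value C
// would only be modified locally and the result would be discarded.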