#include <cstdint>
#include <vector>

#if defined(__x86_64__) || defined(__amd64__) || defined(__amd64)

#include <immintrin.h>
void gpmp::linalg::Mtx::mtx_add(const int8_t *A,
                                const int8_t *B,
                                int8_t *C,
                                int rows,
                                int cols) {
    for (int i = 0; i < rows; ++i) {
        int j = 0;
        // process 32 int8_t elements per iteration with AVX2
        for (; j < cols - 31; j += 32) {
            __m256i a = _mm256_loadu_si256(
                reinterpret_cast<const __m256i *>(&A[i * cols + j]));
            __m256i b = _mm256_loadu_si256(
                reinterpret_cast<const __m256i *>(&B[i * cols + j]));

            // C = A + B; C is overwritten, so there is no need to load it
            __m256i c = _mm256_add_epi8(a, b);

            _mm256_storeu_si256(
                reinterpret_cast<__m256i *>(&C[i * cols + j]), c);
        }

        // scalar tail for the columns the 32-wide loop did not cover
        for (; j < cols; ++j) {
            C[i * cols + j] = A[i * cols + j] + B[i * cols + j];
        }
    }
}
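// Usage sketch for the int8_t add/sub paths (hypothetical values, not
// from the library's tests); both expect row-major arrays of
// rows * cols elements, and Mtx is assumed default-constructible here:
//
//   std::vector<int8_t> A(4 * 64, 1), B(4 * 64, 2), C(4 * 64);
//   gpmp::linalg::Mtx mtx;
//   mtx.mtx_add(A.data(), B.data(), C.data(), 4, 64);   // C[i] == 3
//   mtx.mtx_sub(A.data(), B.data(), C.data(), 4, 64);   // C[i] == -1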
void gpmp::linalg::Mtx::mtx_sub(const int8_t *A,
                                const int8_t *B,
                                int8_t *C,
                                int rows,
                                int cols) {
    for (int i = 0; i < rows; ++i) {
        int j = 0;
        for (; j < cols - 31; j += 32) {
            __m256i a = _mm256_loadu_si256(
                reinterpret_cast<const __m256i *>(&A[i * cols + j]));
            __m256i b = _mm256_loadu_si256(
                reinterpret_cast<const __m256i *>(&B[i * cols + j]));

            // C = A - B; the previous contents of C are never used
            __m256i c = _mm256_sub_epi8(a, b);

            _mm256_storeu_si256(
                reinterpret_cast<__m256i *>(&C[i * cols + j]), c);
        }

        // scalar tail
        for (; j < cols; ++j) {
            C[i * cols + j] = A[i * cols + j] - B[i * cols + j];
        }
    }
}
void gpmp::linalg::Mtx::mtx_mult(const int8_t *A,
                                 const int8_t *B,
                                 int8_t *C,
                                 int rows_a,
                                 int cols_a,
                                 int cols_b) {
    for (int i = 0; i < rows_a; ++i) {
        // vectorized loop over full 32-column blocks of C; bounding at
        // cols_b - 31 avoids reading and writing past the end of a row
        for (int j = 0; j < cols_b - 31; j += 32) {
            __m256i c = _mm256_setzero_si256();

            for (int k = 0; k < cols_a; ++k) {
                // broadcast A[i][k] and load 32 int8_t elements of row k of B
                __m256i a = _mm256_set1_epi8(A[i * cols_a + k]);
                __m256i b = _mm256_loadu_si256(
                    reinterpret_cast<const __m256i *>(&B[k * cols_b + j]));

                // _mm256_maddubs_epi16 multiplies the unsigned bytes of a
                // by the signed bytes of b and sums adjacent pairs into
                // int16 accumulators
                __m256i prod = _mm256_maddubs_epi16(a, b);
                c = _mm256_add_epi16(c, prod);
            }

            // scale the int16 accumulators down and saturate to int8
            c = _mm256_srai_epi16(c, 8);
            c = _mm256_packs_epi16(c, _mm256_setzero_si256());

            _mm256_storeu_si256(
                reinterpret_cast<__m256i *>(&C[i * cols_b + j]), c);
        }

        // scalar tail for the remaining columns
        for (int j = cols_b - cols_b % 32; j < cols_b; ++j) {
            int sum = 0;
            for (int k = 0; k < cols_a; ++k) {
                sum += A[i * cols_a + k] * B[k * cols_b + j];
            }
            C[i * cols_b + j] = static_cast<int8_t>(sum);
        }
    }
}
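// A scalar reference for the int8_t product (a validation sketch, not
// part of the library): the SIMD path above is approximate, since
// _mm256_maddubs_epi16 treats the broadcast A byte as unsigned and
// folds adjacent column products together before the final
// shift-and-pack, so its output can differ from this reference.
inline void mtx_mult_i8_ref(const int8_t *A,
                            const int8_t *B,
                            int8_t *C,
                            int rows_a,
                            int cols_a,
                            int cols_b) {
    for (int i = 0; i < rows_a; ++i) {
        for (int j = 0; j < cols_b; ++j) {
            int sum = 0;
            for (int k = 0; k < cols_a; ++k) {
                sum += A[i * cols_a + k] * B[k * cols_b + j];
            }
            C[i * cols_b + j] = static_cast<int8_t>(sum);
        }
    }
}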
// pass A and B by const reference and C by reference: taking C by value
// would discard the computed result
void mtx_mult(const std::vector<double> &A,
              const std::vector<double> &B,
              std::vector<double> &C);
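// A minimal sketch of the double-precision multiply (hypothetical, since
// the declaration above carries no dimension parameters): assuming
// row-major storage and explicit dimensions, an i-k-j loop order keeps
// the inner B accesses sequential for better cache behavior.
inline void mtx_mult_f64_naive(const std::vector<double> &A,
                               const std::vector<double> &B,
                               std::vector<double> &C,
                               int rows_a,
                               int cols_a,
                               int cols_b) {
    C.assign(static_cast<size_t>(rows_a) * cols_b, 0.0);
    for (int i = 0; i < rows_a; ++i) {
        for (int k = 0; k < cols_a; ++k) {
            double a = A[i * cols_a + k];
            for (int j = 0; j < cols_b; ++j) {
                C[i * cols_b + j] += a * B[k * cols_b + j];
            }
        }
    }
}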