40 #if defined(__x86_64__) || defined(__amd64__) || defined(__amd64)
47 #elif defined(__SSE2__)
49 #include <emmintrin.h>
50 #include <smmintrin.h>
57 void gpmp::linalg::Mtx::mtx_add(
const int16_t *
A,
62 for (
int i = 0; i <
rows; ++i) {
64 for (; j <
cols - 7; j += 8) {
65 __m128i a = _mm_loadu_si128(
66 reinterpret_cast<const __m128i *
>(&
A[i *
cols + j]));
67 __m128i b = _mm_loadu_si128(
68 reinterpret_cast<const __m128i *
>(&
B[i *
cols + j]));
69 __m128i c = _mm_loadu_si128(
70 reinterpret_cast<const __m128i *
>(&
C[i *
cols + j]));
73 c = _mm_add_epi16(c, _mm_add_epi16(a, b));
76 _mm_storeu_si128(
reinterpret_cast<__m128i *
>(&
C[i *
cols + j]), c);
79 for (; j <
cols; ++j) {