40 #if defined(__x86_64__) || defined(__amd64__) || defined(__amd64) 
   47 #elif defined(__SSE2__) 
   49 #include <emmintrin.h> 
   50 #include <smmintrin.h> 
   57 void gpmp::linalg::Mtx::mtx_add(
const int16_t *
A,
 
   62     for (
int i = 0; i < 
rows; ++i) {
 
   64         for (; j < 
cols - 7; j += 8) {
 
   65             __m128i a = _mm_loadu_si128(
 
   66                 reinterpret_cast<const __m128i *
>(&
A[i * 
cols + j]));
 
   67             __m128i b = _mm_loadu_si128(
 
   68                 reinterpret_cast<const __m128i *
>(&
B[i * 
cols + j]));
 
   69             __m128i c = _mm_loadu_si128(
 
   70                 reinterpret_cast<const __m128i *
>(&
C[i * 
cols + j]));
 
   73             c = _mm_add_epi16(c, _mm_add_epi16(a, b));
 
   76             _mm_storeu_si128(
reinterpret_cast<__m128i *
>(&
C[i * 
cols + j]), c);
 
   79         for (; j < 
cols; ++j) {