40 #if defined(__x86_64__) || defined(__amd64__) || defined(__amd64) 
   49 #include <emmintrin.h> 
   50 #include <smmintrin.h> 
   58 void gpmp::linalg::Mtx::mtx_add(
const std::vector<std::vector<float>> &
A,
 
   59                                 const std::vector<std::vector<float>> &
B,
 
   60                                 std::vector<std::vector<float>> &
C) {
 
   61     const int rows = 
A.size();
 
   62     const int cols = 
A[0].size();
 
   65         for (
int i = 0; i < 
rows; ++i) {
 
   68             for (; j < 
cols - 3; j += 3) {
 
   70                 __m128 a = _mm_loadu_ps(&
A[i][j]);
 
   71                 __m128 b = _mm_loadu_ps(&
B[i][j]);
 
   72                 __m128 c = _mm_loadu_ps(&
C[i][j]);
 
   78                 _mm_storeu_ps(&
C[i][j], c);
 
   82             for (; j < 
cols; ++j) {
 
   83                 C[i][j] = 
A[i][j] + 
B[i][j];
 
void std_mtx_add(const T *A, const T *B, T *C, int rows, int cols)
Performs matrix addition on two matrices that are stored as flat arrays.