40 #if defined(__x86_64__) || defined(__amd64__) || defined(__amd64) 
   47 #elif defined(__SSE2__) 
   49 #include <emmintrin.h> 
   50 #include <smmintrin.h> 
   58 void gpmp::linalg::Mtx::mtx_add(
const std::vector<std::vector<double>> &
A,
 
   59                                 const std::vector<std::vector<double>> &
B,
 
   60                                 std::vector<std::vector<double>> &
C) {
 
   61     const int rows = 
A.size();
 
   62     const int cols = 
A[0].size();
 
   64     for (
int i = 0; i < 
rows; ++i) {
 
   67         for (; j < 
cols - 1; j += 2) {
 
   69             __m128d a = _mm_loadu_pd(&
A[i][j]);
 
   70             __m128d b = _mm_loadu_pd(&
B[i][j]);
 
   71             __m128d c = _mm_loadu_pd(&
C[i][j]);
 
   77             _mm_storeu_pd(&
C[i][j], c);
 
   81         for (; j < 
cols; ++j) {
 
   82             C[i][j] = 
A[i][j] + 
B[i][j];
 
   88 void gpmp::linalg::Mtx::mtx_sub(
const std::vector<std::vector<double>> &
A,
 
   89                                 const std::vector<std::vector<double>> &
B,
 
   90                                 std::vector<std::vector<double>> &
C) {
 
   91     const int rows = 
A.size();
 
   92     const int cols = 
A[0].size();
 
   94     for (
int i = 0; i < 
rows; ++i) {
 
   97         for (; j < 
cols - 1; j += 2) {
 
   99             __m128d a = _mm_loadu_pd(&
A[i][j]);
 
  100             __m128d b = _mm_loadu_pd(&
B[i][j]);
 
  101             __m128d c = _mm_loadu_pd(&
C[i][j]);
 
  104             c = _mm_sub_pd(a, b);
 
  107             _mm_storeu_pd(&
C[i][j], c);
 
  111         for (; j < 
cols; ++j) {
 
  112             C[i][j] = 
A[i][j] - 
B[i][j];
 
  119                                  const std::vector<std::vector<double>> &
B,
 
  120                                  std::vector<std::vector<double>> &
C) {
 
  121     const int rows_a = 
A.size();
 
  122     const int cols_a = 
A[0].size();
 
  123     const int cols_b = 
B[0].size();
 
  125     for (
int i = 0; i < rows_a; ++i) {
 
  126         for (
int j = 0; j < cols_b; j += 2) {
 
  128             __m128d c = _mm_setzero_pd();
 
  130             for (
int k = 0; k < cols_a; ++k) {
 
  132                 __m128d a = _mm_set1_pd(
A[i][k]);
 
  133                 __m128d b = _mm_loadu_pd(&
B[k][j]);
 
  136                 __m128d prod = _mm_mul_pd(a, b);
 
  139                 c = _mm_add_pd(c, prod);
 
  143             _mm_storeu_pd(&
C[i][j], c);
 
  147         for (
int j = cols_b - cols_b % 2; j < cols_b; ++j) {
 
  150             for (
int k = 0; k < cols_a; ++k) {
 
  151                 sum += 
A[i][k] * 
B[k][j];
 
  160 void gpmp::linalg::Mtx::mtx_tpose(std::vector<std::vector<double>> &matrix) {
 
  161     const int rows = matrix.size();
 
  162     const int cols = matrix[0].size();
 
  164     for (
int i = 0; i < 
rows; i += 2) {
 
  165         for (
int j = i; j < 
cols; j += 2) {
 
  166             __m128d row1 = _mm_loadu_pd(&matrix[i][j]);
 
  167             __m128d row2 = _mm_loadu_pd(&matrix[i + 1][j]);
 
  170             __m128d tmp1 = _mm_unpacklo_pd(row1, row2);
 
  171             __m128d tmp2 = _mm_unpackhi_pd(row1, row2);
 
  174             _mm_storeu_pd(&matrix[i][j], tmp1);
 
  175             _mm_storeu_pd(&matrix[i + 1][j], tmp2);
 
void mtx_mult(std::vector< double > A, std::vector< double > B, std::vector< double > C)