mtx_avx2_vec_i32.cpp
/*************************************************************************
 *
 *  Project
 *                         _____ _____  __  __ _____
 *                        / ____|  __ \|  \/  |  __ \
 *  ___  _ __   ___ _ __ | |  __| |__) | \  / | |__) |
 * / _ \| '_ \ / _ \ '_ \| | |_ |  ___/| |\/| |  ___/
 *| (_) | |_) |  __/ | | | |__| | |    | |  | | |
 * \___/| .__/ \___|_| |_|\_____|_|    |_|  |_|_|
 *      | |
 *      |_|
 *
 * Copyright (C) Akiel Aries, <akiel@akiel.org>, et al.
 *
 * This software is licensed as described in the file LICENSE, which
 * you should have received as part of this distribution. The terms
 * among other details are referenced in the official documentation
 * seen here : https://akielaries.github.io/openGPMP/ along with
 * important files seen in this project.
 *
 * You may opt to use, copy, modify, merge, publish, distribute
 * and/or sell copies of the Software, and permit persons to whom
 * the Software is furnished to do so, under the terms of the
 * LICENSE file. As this is an Open Source effort, all implementations
 * must be of the same methodology.
 *
 *
 *
 * This software is distributed on an AS IS basis, WITHOUT
 * WARRANTY OF ANY KIND, either express or implied.
 *
 ************************************************************************/
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <openGPMP/linalg/mtx.hpp>
#include <vector>

#if defined(__x86_64__) || defined(__amd64__) || defined(__amd64)

/************************************************************************
 *
 * Matrix Operations for AVX ISA
 *
 ************************************************************************/
#if defined(__AVX2__)

// AVX family intrinsics
#include <immintrin.h>

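// a 256-bit __m256i register holds eight 32-bit integers, so the kernels in
// this file walk each row 8 columns at a time; the add, subtract, and
// multiply kernels fall back to scalar code for any leftover columns
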
/************************************************************************
 *
 * Matrix Operations on vector<vector>
 *
 ************************************************************************/
// matrix addition using Intel intrinsics, accepts integer types
void gpmp::linalg::Mtx::mtx_add(const std::vector<std::vector<int>> &A,
                                const std::vector<std::vector<int>> &B,
                                std::vector<std::vector<int>> &C) {
    const int rows = A.size();
    const int cols = A[0].size();

    if (rows > 16) {
        for (int i = 0; i < rows; ++i) {
            int j = 0;
            // process 8 elements at a time while at least 8 columns remain
            for (; j < cols - 7; j += 8) {
                // load 8 elements each from A and B using SIMD
                __m256i a = _mm256_loadu_si256(
                    reinterpret_cast<const __m256i *>(&A[i][j]));
                __m256i b = _mm256_loadu_si256(
                    reinterpret_cast<const __m256i *>(&B[i][j]));

                // perform vectorized addition
                __m256i c = _mm256_add_epi32(a, b);

                // store the result back to the C matrix
                _mm256_storeu_si256(reinterpret_cast<__m256i *>(&C[i][j]), c);
            }

            // handle the remaining columns not covered by the 8-wide blocks
            for (; j < cols; ++j) {
                C[i][j] = A[i][j] + B[i][j];
            }
        }
    }

    // fall back to standard matrix addition for small matrices
    else {
        std_mtx_add(A, B, C);
    }
}

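/*
 * usage sketch (illustrative only, not part of this translation unit): the
 * kernel writes directly into C, so all three matrices are assumed to be
 * pre-sized to the same dimensions; the instance name is hypothetical
 *
 *   gpmp::linalg::Mtx mtx;
 *   std::vector<std::vector<int>> A(32, std::vector<int>(32, 1));
 *   std::vector<std::vector<int>> B(32, std::vector<int>(32, 2));
 *   std::vector<std::vector<int>> C(32, std::vector<int>(32, 0));
 *   mtx.mtx_add(A, B, C); // every C[i][j] becomes 3
 */
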
// matrix subtraction using Intel intrinsics, accepts integer types
void gpmp::linalg::Mtx::mtx_sub(const std::vector<std::vector<int>> &A,
                                const std::vector<std::vector<int>> &B,
                                std::vector<std::vector<int>> &C) {
    const int rows = A.size();
    const int cols = A[0].size();

    for (int i = 0; i < rows; ++i) {
        int j = 0;
        // process 8 elements at a time while at least 8 columns remain
        for (; j < cols - 7; j += 8) {
            // load 8 elements each from A and B using SIMD
            __m256i a =
                _mm256_loadu_si256(reinterpret_cast<const __m256i *>(&A[i][j]));
            __m256i b =
                _mm256_loadu_si256(reinterpret_cast<const __m256i *>(&B[i][j]));

            // perform vectorized subtraction
            __m256i c = _mm256_sub_epi32(a, b);

            // store the result back to the C matrix
            _mm256_storeu_si256(reinterpret_cast<__m256i *>(&C[i][j]), c);
        }

        // handle the remaining columns not covered by the 8-wide blocks
        for (; j < cols; ++j) {
            C[i][j] = A[i][j] - B[i][j];
        }
    }
}

// matrix multiplication using Intel intrinsics, accepts integer types
void gpmp::linalg::Mtx::mtx_mult(const std::vector<std::vector<int>> &A,
                                 const std::vector<std::vector<int>> &B,
                                 std::vector<std::vector<int>> &C) {
    const int rows_a = A.size();
    const int cols_a = A[0].size();
    const int cols_b = B[0].size();

    for (int i = 0; i < rows_a; ++i) {
        // process full blocks of 8 columns of B (and C) at a time
        for (int j = 0; j + 8 <= cols_b; j += 8) {
            // initialize a vector of zeros for the result
            __m256i c = _mm256_setzero_si256();

            for (int k = 0; k < cols_a; ++k) {
                // broadcast A[i][k] and load 8 elements of row k of B
                __m256i a = _mm256_set1_epi32(A[i][k]);
                __m256i b = _mm256_loadu_si256(
                    reinterpret_cast<const __m256i *>(&B[k][j]));

                // perform vectorized multiplication
                __m256i prod = _mm256_mullo_epi32(a, b);

                // perform vectorized addition
                c = _mm256_add_epi32(c, prod);
            }

            // store the result back to the C matrix
            _mm256_storeu_si256(reinterpret_cast<__m256i *>(&C[i][j]), c);
        }

        // handle the remaining columns not covered by the 8-wide blocks
        for (int j = cols_b - cols_b % 8; j < cols_b; ++j) {
            int sum = 0;

            for (int k = 0; k < cols_a; ++k) {
                sum += A[i][k] * B[k][j];
            }

            C[i][j] = sum;
        }
    }
}

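/*
 * the multiplication kernel above forms eight adjacent entries of C at once:
 * for a fixed row i and column block [j, j + 8) it accumulates
 *
 *   C[i][j..j+7] = sum over k of A[i][k] * B[k][j..j+7]
 *
 * by broadcasting the scalar A[i][k] across a register and multiplying it
 * with eight consecutive elements of row k of B, so eight dot products are
 * built in parallel; products wrap on overflow because _mm256_mullo_epi32
 * keeps only the low 32 bits of each 64-bit intermediate
 */
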
// transpose matrices of type int using Intel intrinsics
void gpmp::linalg::Mtx::mtx_tpose(std::vector<std::vector<int>> &matrix) {
    const int rows = matrix.size();
    const int cols = matrix[0].size();

    // transpose an 8x8 block of 32-bit integers held in 8 AVX2 registers
    auto transpose8x8 = [](__m256i (&r)[8]) {
        // interleave 32-bit elements of adjacent row pairs
        __m256i t0 = _mm256_unpacklo_epi32(r[0], r[1]);
        __m256i t1 = _mm256_unpackhi_epi32(r[0], r[1]);
        __m256i t2 = _mm256_unpacklo_epi32(r[2], r[3]);
        __m256i t3 = _mm256_unpackhi_epi32(r[2], r[3]);
        __m256i t4 = _mm256_unpacklo_epi32(r[4], r[5]);
        __m256i t5 = _mm256_unpackhi_epi32(r[4], r[5]);
        __m256i t6 = _mm256_unpacklo_epi32(r[6], r[7]);
        __m256i t7 = _mm256_unpackhi_epi32(r[6], r[7]);

        // interleave 64-bit pairs, gathering 4 column elements per lane
        __m256i u0 = _mm256_unpacklo_epi64(t0, t2);
        __m256i u1 = _mm256_unpackhi_epi64(t0, t2);
        __m256i u2 = _mm256_unpacklo_epi64(t1, t3);
        __m256i u3 = _mm256_unpackhi_epi64(t1, t3);
        __m256i u4 = _mm256_unpacklo_epi64(t4, t6);
        __m256i u5 = _mm256_unpackhi_epi64(t4, t6);
        __m256i u6 = _mm256_unpacklo_epi64(t5, t7);
        __m256i u7 = _mm256_unpackhi_epi64(t5, t7);

        // cross the 128-bit lanes to form the transposed rows
        r[0] = _mm256_permute2x128_si256(u0, u4, 0x20);
        r[1] = _mm256_permute2x128_si256(u1, u5, 0x20);
        r[2] = _mm256_permute2x128_si256(u2, u6, 0x20);
        r[3] = _mm256_permute2x128_si256(u3, u7, 0x20);
        r[4] = _mm256_permute2x128_si256(u0, u4, 0x31);
        r[5] = _mm256_permute2x128_si256(u1, u5, 0x31);
        r[6] = _mm256_permute2x128_si256(u2, u6, 0x31);
        r[7] = _mm256_permute2x128_si256(u3, u7, 0x31);
    };

    // process the matrix in 8x8 blocks on and above the diagonal; assumes a
    // square matrix whose dimensions are a multiple of 8
    for (int i = 0; i < rows; i += 8) {
        for (int j = i; j < cols; j += 8) {
            __m256i upper[8];
            for (int k = 0; k < 8; ++k) {
                upper[k] = _mm256_loadu_si256(
                    reinterpret_cast<const __m256i *>(&matrix[i + k][j]));
            }
            transpose8x8(upper);

            if (i == j) {
                // diagonal block: store the transposed block in place
                for (int k = 0; k < 8; ++k) {
                    _mm256_storeu_si256(
                        reinterpret_cast<__m256i *>(&matrix[i + k][j]),
                        upper[k]);
                }
            } else {
                // off-diagonal block: transpose the mirror block as well and
                // swap the two across the diagonal
                __m256i lower[8];
                for (int k = 0; k < 8; ++k) {
                    lower[k] = _mm256_loadu_si256(
                        reinterpret_cast<const __m256i *>(&matrix[j + k][i]));
                }
                transpose8x8(lower);

                for (int k = 0; k < 8; ++k) {
                    _mm256_storeu_si256(
                        reinterpret_cast<__m256i *>(&matrix[i + k][j]),
                        lower[k]);
                    _mm256_storeu_si256(
                        reinterpret_cast<__m256i *>(&matrix[j + k][i]),
                        upper[k]);
                }
            }
        }
    }
}

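/*
 * sketch of the in-register 8x8 transpose stage above, tracing the first
 * output row (a_rc denotes the element at row r, column c of the block):
 *
 *   unpack*_epi32 : interleave row pairs     -> a00 a10 a01 a11 | a04 a14 a05 a15
 *   unpack*_epi64 : interleave pair results  -> a00 a10 a20 a30 | a04 a14 a24 a34
 *   permute2x128  : cross the 128-bit lanes  -> a00 a10 a20 a30 a40 a50 a60 a70
 *
 * the final permute is needed because the unpack instructions only shuffle
 * within each 128-bit lane
 */
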
#endif

#endif
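
// note: the AVX2 path in this file is compiled only when the translation unit
// is built with AVX2 enabled (e.g. -mavx2, or an -march setting that implies
// it, on GCC/Clang); otherwise the guards above leave the file with only its
// includes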