openGPMP
Open Source Mathematics Package
mtx_avx2_arr_i32.cpp
/*************************************************************************
 *
 *  Project
 *                          _____ _____  __  __ _____
 *                         / ____|  __ \|  \/  |  __ \
 *    ___  _ __   ___ _ __ | |  __| |__) | \  / | |__) |
 *   / _ \| '_ \ / _ \ '_ \| | |_ |  ___/| |\/| |  ___/
 *  | (_) | |_) |  __/ | | | |__| | |    | |  | | |
 *   \___/| .__/ \___|_| |_|\_____|_|    |_|  |_|_|
 *        | |
 *        |_|
 *
 * Copyright (C) Akiel Aries, <akiel@akiel.org>, et al.
 *
 * This software is licensed as described in the file LICENSE, which
 * you should have received as part of this distribution. The terms,
 * among other details, are referenced in the official documentation
 * at https://akielaries.github.io/openGPMP/ along with other
 * important files in this project.
 *
 * You may opt to use, copy, modify, merge, publish, distribute
 * and/or sell copies of the Software, and permit persons to whom
 * the Software is furnished to do so, under the terms of the
 * LICENSE file. As this is an Open Source effort, all implementations
 * must be of the same methodology.
 *
 * This software is distributed on an AS IS basis, WITHOUT
 * WARRANTY OF ANY KIND, either express or implied.
 *
 ************************************************************************/
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <openGPMP/linalg/mtx.hpp>
#include <vector>

#if defined(__x86_64__) || defined(__amd64__) || defined(__amd64)

/************************************************************************
 *
 * Matrix Operations for AVX ISA
 *
 ************************************************************************/
#if defined(__AVX2__)

// AVX family intrinsics
#include <immintrin.h>
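
// note: __AVX2__ is defined when this translation unit is compiled with AVX2
// support enabled (e.g. -mavx2 or -march=native on GCC/Clang); without it,
// the implementations in this file are compiled out entirely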

/************************************************************************
 *
 * Matrix Operations on Arrays
 *
 ************************************************************************/

// matrix addition using Intel intrinsics, accepts integer arrays as matrices
void gpmp::linalg::Mtx::mtx_add(const int *A,
                                const int *B,
                                int *C,
                                int rows,
                                int cols) {
    // BUG FIXME: the vectorized path is only taken for matrices with more
    // than 184 rows; smaller inputs fall through to the standard routine
    if (rows > 184) {
        for (int i = 0; i < rows; ++i) {
            int j = 0;
            // process 8 elements at a time; requires at least 8 columns
            for (; j < cols - 7; j += 8) {
                // load 8 elements from the A and B matrices using SIMD
                __m256i a = _mm256_loadu_si256(
                    reinterpret_cast<const __m256i *>(&A[i * cols + j]));
                __m256i b = _mm256_loadu_si256(
                    reinterpret_cast<const __m256i *>(&B[i * cols + j]));

                // perform vectorized addition
                __m256i c = _mm256_add_epi32(a, b);

                // store the result to the C matrix
                _mm256_storeu_si256(
                    reinterpret_cast<__m256i *>(&C[i * cols + j]),
                    c);
            }

            // handle the remaining columns when cols is not a multiple of 8
            for (; j < cols; ++j) {
                C[i * cols + j] = A[i * cols + j] + B[i * cols + j];
            }
        }
    }

    else {
        // use standard matrix addition
        std_mtx_add(A, B, C, rows, cols);
    }
}

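/* Usage sketch (illustrative, not part of the original source): calling the
 * array-based overload on flat row-major buffers. The instance call, sizes,
 * and values below are hypothetical.
 *
 *   #include <openGPMP/linalg/mtx.hpp>
 *   #include <vector>
 *
 *   int main() {
 *       const int rows = 256, cols = 256; // > 184 rows, so the SIMD path
 *       std::vector<int> A(rows * cols, 1);
 *       std::vector<int> B(rows * cols, 2);
 *       std::vector<int> C(rows * cols, 0);
 *
 *       gpmp::linalg::Mtx mtx;
 *       mtx.mtx_add(A.data(), B.data(), C.data(), rows, cols);
 *       // every element of C is now 3
 *       return 0;
 *   }
 */
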
void gpmp::linalg::Mtx::mtx_sub(const int *A,
                                const int *B,
                                int *C,
                                int rows,
                                int cols) {
    for (int i = 0; i < rows; ++i) {
        int j = 0;
        for (; j < cols - 7; j += 8) {
            // load 8 elements from the A and B matrices using SIMD
            __m256i a = _mm256_loadu_si256(
                reinterpret_cast<const __m256i *>(&A[i * cols + j]));
            __m256i b = _mm256_loadu_si256(
                reinterpret_cast<const __m256i *>(&B[i * cols + j]));

            // perform vectorized subtraction
            __m256i c = _mm256_sub_epi32(a, b);

            // store the result back to the C matrix
            _mm256_storeu_si256(reinterpret_cast<__m256i *>(&C[i * cols + j]),
                                c);
        }

        // handle the remaining columns when cols is not a multiple of 8
        for (; j < cols; ++j) {
            C[i * cols + j] = A[i * cols + j] - B[i * cols + j];
        }
    }
}

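/* Usage sketch (illustrative, not part of the original source): unlike
 * mtx_add, mtx_sub always takes the SIMD path regardless of size. With the
 * hypothetical 10x10 matrices below, columns 0..7 of each row are computed
 * with AVX2 and columns 8..9 by the scalar remainder loop.
 *
 *   std::vector<int> A(10 * 10, 5), B(10 * 10, 3), C(10 * 10, 0);
 *   gpmp::linalg::Mtx mtx;
 *   mtx.mtx_sub(A.data(), B.data(), C.data(), 10, 10);
 *   // every element of C is now 2
 */
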
void gpmp::linalg::Mtx::mtx_mult(const int *A,
                                 const int *B,
                                 int *C,
                                 int rows_a,
                                 int cols_a,
                                 int cols_b) {
    for (int i = 0; i < rows_a; ++i) {
        // process 8 output columns at a time; the bound keeps the 256-bit
        // loads and stores from running past the end of each row
        for (int j = 0; j < cols_b - 7; j += 8) {
            __m256i c = _mm256_setzero_si256();

            for (int k = 0; k < cols_a; ++k) {
                // broadcast A[i][k] and multiply it with 8 elements of
                // row k of B
                __m256i a = _mm256_set1_epi32(A[i * cols_a + k]);
                __m256i b = _mm256_loadu_si256(
                    reinterpret_cast<const __m256i *>(&B[k * cols_b + j]));

                __m256i prod = _mm256_mullo_epi32(a, b);
                c = _mm256_add_epi32(c, prod);
            }

            _mm256_storeu_si256(reinterpret_cast<__m256i *>(&C[i * cols_b + j]),
                                c);
        }

        // handle the remaining columns
        for (int j = cols_b - cols_b % 8; j < cols_b; ++j) {
            int sum = 0;

            for (int k = 0; k < cols_a; ++k) {
                sum += A[i * cols_a + k] * B[k * cols_b + j];
            }

            C[i * cols_b + j] = sum;
        }
    }
}

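/* Usage sketch (illustrative, not part of the original source): multiplying
 * a hypothetical 3x4 matrix A by a 4x8 matrix B into a 3x8 matrix C. A is
 * rows_a x cols_a, B is cols_a x cols_b, and C must hold rows_a * cols_b
 * ints. Products and sums are 32-bit here, so large inputs can overflow;
 * the int64_t overload below keeps the accumulation in 64 bits.
 *
 *   const int rows_a = 3, cols_a = 4, cols_b = 8;
 *   std::vector<int> A(rows_a * cols_a, 2);
 *   std::vector<int> B(cols_a * cols_b, 3);
 *   std::vector<int> C(rows_a * cols_b, 0);
 *
 *   gpmp::linalg::Mtx mtx;
 *   mtx.mtx_mult(A.data(), B.data(), C.data(), rows_a, cols_a, cols_b);
 *   // every element of C is now 2 * 3 * cols_a = 24
 */
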
void gpmp::linalg::Mtx::mtx_mult(const int *A,
                                 const int *B,
                                 int64_t *C,
                                 int rows_a,
                                 int cols_a,
                                 int cols_b) {

    for (int i = 0; i < rows_a; ++i) {
        // each iteration produces 8 int64_t results via two 256-bit stores,
        // so advance 8 columns at a time and stop before overrunning the row
        for (int j = 0; j < cols_b - 7; j += 8) {
            __m256i c_lo = _mm256_setzero_si256();
            __m256i c_hi = _mm256_setzero_si256();

            for (int k = 0; k < cols_a; ++k) {
                __m256i a = _mm256_set1_epi32(A[i * cols_a + k]);
                __m256i b = _mm256_loadu_si256(
                    reinterpret_cast<const __m256i *>(&B[k * cols_b + j]));

                // perform 32-bit integer multiplication; note that an
                // individual product which overflows 32 bits is truncated
                // before widening, only the accumulation is 64-bit
                __m256i prod = _mm256_mullo_epi32(a, b);

                // widen the low and high four 32-bit products to 64 bits
                __m256i prod_lo =
                    _mm256_cvtepi32_epi64(_mm256_extractf128_si256(prod, 0));
                __m256i prod_hi =
                    _mm256_cvtepi32_epi64(_mm256_extractf128_si256(prod, 1));

                // add to the accumulators
                c_lo = _mm256_add_epi64(c_lo, prod_lo);
                c_hi = _mm256_add_epi64(c_hi, prod_hi);
            }

            // store the result back to the C matrix
            _mm256_storeu_si256(reinterpret_cast<__m256i *>(&C[i * cols_b + j]),
                                c_lo);
            _mm256_storeu_si256(
                reinterpret_cast<__m256i *>(&C[i * cols_b + j + 4]),
                c_hi);
        }

        // handle the remaining columns
        for (int j = cols_b - cols_b % 8; j < cols_b; ++j) {
            int64_t sum = 0;

            for (int k = 0; k < cols_a; ++k) {
                sum +=
                    static_cast<int64_t>(A[i * cols_a + k]) * B[k * cols_b + j];
            }

            C[i * cols_b + j] = sum;
        }
    }
}

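/* Usage sketch (illustrative, not part of the original source): the same
 * hypothetical multiplication as above, but accumulating into int64_t so
 * the dot-product sums cannot overflow 32 bits.
 *
 *   const int rows_a = 3, cols_a = 4, cols_b = 8;
 *   std::vector<int> A(rows_a * cols_a, 2), B(cols_a * cols_b, 3);
 *   std::vector<int64_t> C64(rows_a * cols_b, 0);
 *
 *   gpmp::linalg::Mtx mtx;
 *   mtx.mtx_mult(A.data(), B.data(), C64.data(), rows_a, cols_a, cols_b);
 */
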
void gpmp::linalg::Mtx::mtx_tpose(const int *A, int *C, int rows, int cols) {
    // transpose full 8x8 blocks using AVX2 intrinsics; the loop bounds keep
    // the 256-bit loads and stores inside the matrices when the dimensions
    // are not multiples of 8
    for (int i = 0; i + 7 < rows; i += 8) {
        for (int j = 0; j + 7 < cols; j += 8) {
            // load 8x8 block from A
            __m256i a0 = _mm256_loadu_si256(
                reinterpret_cast<const __m256i *>(&A[i * cols + j]));
            __m256i a1 = _mm256_loadu_si256(
                reinterpret_cast<const __m256i *>(&A[(i + 1) * cols + j]));
            __m256i a2 = _mm256_loadu_si256(
                reinterpret_cast<const __m256i *>(&A[(i + 2) * cols + j]));
            __m256i a3 = _mm256_loadu_si256(
                reinterpret_cast<const __m256i *>(&A[(i + 3) * cols + j]));
            __m256i a4 = _mm256_loadu_si256(
                reinterpret_cast<const __m256i *>(&A[(i + 4) * cols + j]));
            __m256i a5 = _mm256_loadu_si256(
                reinterpret_cast<const __m256i *>(&A[(i + 5) * cols + j]));
            __m256i a6 = _mm256_loadu_si256(
                reinterpret_cast<const __m256i *>(&A[(i + 6) * cols + j]));
            __m256i a7 = _mm256_loadu_si256(
                reinterpret_cast<const __m256i *>(&A[(i + 7) * cols + j]));

            // transpose the 8x8 block: interleave 32-bit elements, then
            // 64-bit pairs, then swap 128-bit lanes
            __m256i t0 = _mm256_unpacklo_epi32(a0, a1);
            __m256i t1 = _mm256_unpacklo_epi32(a2, a3);
            __m256i t2 = _mm256_unpacklo_epi32(a4, a5);
            __m256i t3 = _mm256_unpacklo_epi32(a6, a7);
            __m256i t4 = _mm256_unpackhi_epi32(a0, a1);
            __m256i t5 = _mm256_unpackhi_epi32(a2, a3);
            __m256i t6 = _mm256_unpackhi_epi32(a4, a5);
            __m256i t7 = _mm256_unpackhi_epi32(a6, a7);

            __m256i tt0 = _mm256_unpacklo_epi64(t0, t1);
            __m256i tt1 = _mm256_unpackhi_epi64(t0, t1);
            __m256i tt2 = _mm256_unpacklo_epi64(t2, t3);
            __m256i tt3 = _mm256_unpackhi_epi64(t2, t3);
            __m256i tt4 = _mm256_unpacklo_epi64(t4, t5);
            __m256i tt5 = _mm256_unpackhi_epi64(t4, t5);
            __m256i tt6 = _mm256_unpacklo_epi64(t6, t7);
            __m256i tt7 = _mm256_unpackhi_epi64(t6, t7);

            __m256i ttt0 = _mm256_permute2x128_si256(tt0, tt2, 0x20);
            __m256i ttt1 = _mm256_permute2x128_si256(tt1, tt3, 0x20);
            __m256i ttt2 = _mm256_permute2x128_si256(tt4, tt6, 0x20);
            __m256i ttt3 = _mm256_permute2x128_si256(tt5, tt7, 0x20);
            __m256i ttt4 = _mm256_permute2x128_si256(tt0, tt2, 0x31);
            __m256i ttt5 = _mm256_permute2x128_si256(tt1, tt3, 0x31);
            __m256i ttt6 = _mm256_permute2x128_si256(tt4, tt6, 0x31);
            __m256i ttt7 = _mm256_permute2x128_si256(tt5, tt7, 0x31);

            // store transposed block to C
            _mm256_storeu_si256(reinterpret_cast<__m256i *>(&C[j * rows + i]),
                                ttt0);
            _mm256_storeu_si256(
                reinterpret_cast<__m256i *>(&C[(j + 1) * rows + i]),
                ttt1);
            _mm256_storeu_si256(
                reinterpret_cast<__m256i *>(&C[(j + 2) * rows + i]),
                ttt2);
            _mm256_storeu_si256(
                reinterpret_cast<__m256i *>(&C[(j + 3) * rows + i]),
                ttt3);
            _mm256_storeu_si256(
                reinterpret_cast<__m256i *>(&C[(j + 4) * rows + i]),
                ttt4);
            _mm256_storeu_si256(
                reinterpret_cast<__m256i *>(&C[(j + 5) * rows + i]),
                ttt5);
            _mm256_storeu_si256(
                reinterpret_cast<__m256i *>(&C[(j + 6) * rows + i]),
                ttt6);
            _mm256_storeu_si256(
                reinterpret_cast<__m256i *>(&C[(j + 7) * rows + i]),
                ttt7);
        }
    }

    // transpose the leftover bottom rows element by element
    for (int i = rows - (rows % 8); i < rows; ++i) {
        for (int j = 0; j < cols; ++j) {
            C[j * rows + i] = A[i * cols + j];
        }
    }

    // transpose the leftover rightmost columns of the rows handled above
    for (int i = 0; i < rows - (rows % 8); ++i) {
        for (int j = cols - (cols % 8); j < cols; ++j) {
            C[j * rows + i] = A[i * cols + j];
        }
    }
}

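/* Usage sketch (illustrative, not part of the original source): transposing
 * a hypothetical 16x8 matrix. A is rows x cols and C receives the cols x rows
 * transpose, so C[j][i] == A[i][j] for all i, j.
 *
 *   const int rows = 16, cols = 8;
 *   std::vector<int> A(rows * cols);
 *   std::vector<int> C(cols * rows, 0);
 *   for (int i = 0; i < rows * cols; ++i) {
 *       A[i] = i;
 *   }
 *
 *   gpmp::linalg::Mtx mtx;
 *   mtx.mtx_tpose(A.data(), C.data(), rows, cols);
 *   // C[j * rows + i] == A[i * cols + j]
 */
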
// AVX2
#endif

// x86
#endif