Line data Source code
1 : /*************************************************************************
2 : *
3 : * Project
4 : * _____ _____ __ __ _____
5 : * / ____| __ \| \/ | __ \
6 : * ___ _ __ ___ _ __ | | __| |__) | \ / | |__) |
7 : * / _ \| '_ \ / _ \ '_ \| | |_ | ___/| |\/| | ___/
8 : *| (_) | |_) | __/ | | | |__| | | | | | | |
9 : * \___/| .__/ \___|_| |_|\_____|_| |_| |_|_|
10 : * | |
11 : * |_|
12 : *
13 : * Copyright (C) Akiel Aries, <akiel@akiel.org>, et al.
14 : *
15 : * This software is licensed as described in the file LICENSE, which
16 : * you should have received as part of this distribution. The terms
17 : * among other details are referenced in the official documentation
18 : * seen here : https://akielaries.github.io/openGPMP/ along with
19 : * important files seen in this project.
20 : *
21 : * You may opt to use, copy, modify, merge, publish, distribute
22 : * and/or sell copies of the Software, and permit persons to whom
23 : * the Software is furnished to do so, under the terms of the
24 : * LICENSE file. As this is an Open Source effort, all implementations
25 : * must be of the same methodology.
26 : *
27 : *
28 : *
29 : * This software is distributed on an AS IS basis, WITHOUT
30 : * WARRANTY OF ANY KIND, either express or implied.
31 : *
32 : ************************************************************************/
33 : #include <cassert>
34 : #include <cstddef>
35 : #include <cstdint>
36 : #include <iostream>
37 : #include <openGPMP/linalg/mtx.hpp>
38 : #include <vector>
39 :
40 : #if defined(__x86_64__) || defined(__amd64__) || defined(__amd64)
41 :
42 : /************************************************************************
43 : *
44 : * Matrix Operations for AVX ISA
45 : *
46 : ************************************************************************/
47 : #if defined(__AVX2__)
48 :
49 : // AVX family intrinsics
50 : #include <immintrin.h>
51 :
52 : /************************************************************************
53 : *
54 : * Matrix Operations on vector<vector>
55 : *
56 : ************************************************************************/
57 : // matrix addition using Intel intrinsic, accepts integer types
58 3 : void gpmp::linalg::Mtx::mtx_add(const std::vector<std::vector<int>> &A,
59 : const std::vector<std::vector<int>> &B,
60 : std::vector<std::vector<int>> &C) {
61 3 : const int rows = A.size();
62 3 : const int cols = A[0].size();
63 :
64 3 : if (rows > 16) {
65 2115 : for (int i = 0; i < rows; ++i) {
66 2112 : int j = 0;
67 : // requires at least size 8x8 size matrices
68 264768 : for (; j < cols - 7; j += 8) {
69 : // load 8 elements from A, B, and C matrices using SIMD
70 262656 : __m256i a = _mm256_loadu_si256(
71 262656 : reinterpret_cast<const __m256i *>(&A[i][j]));
72 262656 : __m256i b = _mm256_loadu_si256(
73 262656 : reinterpret_cast<const __m256i *>(&B[i][j]));
74 262656 : __m256i c = _mm256_loadu_si256(
75 262656 : reinterpret_cast<const __m256i *>(&C[i][j]));
76 :
77 : // perform vectorized addition
78 262656 : c = _mm256_add_epi32(a, b);
79 :
80 : // store the result back to the C matrix
81 262656 : _mm256_storeu_si256(reinterpret_cast<__m256i *>(&C[i][j]), c);
82 : }
83 :
84 : // handle the remaining elements that are not multiples of 8
85 2112 : for (; j < cols; ++j) {
86 0 : C[i][j] = A[i][j] + B[i][j];
87 : }
88 : }
89 : }
90 :
91 : // use standard matrix addition
92 : else {
93 0 : std_mtx_add(A, B, C);
94 : }
95 3 : }
96 :
97 : // matrix subtraction using Intel intrinsics, accepts integer types
98 0 : void gpmp::linalg::Mtx::mtx_sub(const std::vector<std::vector<int>> &A,
99 : const std::vector<std::vector<int>> &B,
100 : std::vector<std::vector<int>> &C) {
101 0 : const int rows = A.size();
102 0 : const int cols = A[0].size();
103 :
104 0 : for (int i = 0; i < rows; ++i) {
105 0 : int j = 0;
106 : // requires at least size 8x8 size matrices
107 0 : for (; j < cols - 7; j += 8) {
108 : // load 8 elements from A, B, and C matrices using SIMD
109 : __m256i a =
110 0 : _mm256_loadu_si256(reinterpret_cast<const __m256i *>(&A[i][j]));
111 : __m256i b =
112 0 : _mm256_loadu_si256(reinterpret_cast<const __m256i *>(&B[i][j]));
113 : __m256i c =
114 0 : _mm256_loadu_si256(reinterpret_cast<const __m256i *>(&C[i][j]));
115 :
116 : // perform vectorized subtraction
117 0 : c = _mm256_sub_epi32(a, b);
118 :
119 : // store the result back to the C matrix
120 0 : _mm256_storeu_si256(reinterpret_cast<__m256i *>(&C[i][j]), c);
121 : }
122 :
123 : // handle the remaining elements that are not multiples of 8
124 0 : for (; j < cols; ++j) {
125 0 : C[i][j] = A[i][j] - B[i][j];
126 : }
127 : }
128 0 : }
129 :
130 : // matrix multiplication using Intel intrinsics, accepts integer types
131 1 : void gpmp::linalg::Mtx::mtx_mult(const std::vector<std::vector<int>> &A,
132 : const std::vector<std::vector<int>> &B,
133 : std::vector<std::vector<int>> &C) {
134 1 : const int rows_a = A.size();
135 1 : const int cols_a = A[0].size();
136 1 : const int cols_b = B[0].size();
137 :
138 1025 : for (int i = 0; i < rows_a; ++i) {
139 132096 : for (int j = 0; j < cols_b; j += 8) {
140 : // initialize a vector of zeros for the result
141 131072 : __m256i c = _mm256_setzero_si256();
142 :
143 134348800 : for (int k = 0; k < cols_a; ++k) {
144 : // load 8 elements from matrices A and B using SIMD
145 134217728 : __m256i a = _mm256_set1_epi32(A[i][k]);
146 134217728 : __m256i b = _mm256_loadu_si256(
147 134217728 : reinterpret_cast<const __m256i *>(&B[k][j]));
148 :
149 : // perform vectorized multiplication
150 134217728 : __m256i prod = _mm256_mullo_epi32(a, b);
151 :
152 : // perform vectorized addition
153 134217728 : c = _mm256_add_epi32(c, prod);
154 : }
155 :
156 : // store the result back to the C matrix
157 131072 : _mm256_storeu_si256(reinterpret_cast<__m256i *>(&C[i][j]), c);
158 : }
159 :
160 : // handle the remaining elements that are not multiples of 8
161 1024 : for (int j = cols_b - cols_b % 8; j < cols_b; ++j) {
162 0 : int sum = 0;
163 :
164 0 : for (int k = 0; k < cols_a; ++k) {
165 0 : sum += A[i][k] * B[k][j];
166 : }
167 :
168 0 : C[i][j] = sum;
169 : }
170 : }
171 1 : }
172 :
173 : // transpose matrices of type int using Intel intrinsics
174 0 : void gpmp::linalg::Mtx::mtx_tpose(std::vector<std::vector<int>> &matrix) {
175 0 : const int rows = matrix.size();
176 0 : const int cols = matrix[0].size();
177 :
178 0 : for (int i = 0; i < rows; i += 8) {
179 0 : for (int j = i; j < cols; j += 8) {
180 0 : __m256i row1 = _mm256_loadu_si256(
181 0 : reinterpret_cast<const __m256i *>(&matrix[i][j]));
182 0 : __m256i row2 = _mm256_loadu_si256(
183 0 : reinterpret_cast<const __m256i *>(&matrix[i + 1][j]));
184 0 : __m256i row3 = _mm256_loadu_si256(
185 0 : reinterpret_cast<const __m256i *>(&matrix[i + 2][j]));
186 0 : __m256i row4 = _mm256_loadu_si256(
187 0 : reinterpret_cast<const __m256i *>(&matrix[i + 3][j]));
188 0 : __m256i row5 = _mm256_loadu_si256(
189 0 : reinterpret_cast<const __m256i *>(&matrix[i + 4][j]));
190 0 : __m256i row6 = _mm256_loadu_si256(
191 0 : reinterpret_cast<const __m256i *>(&matrix[i + 5][j]));
192 0 : __m256i row7 = _mm256_loadu_si256(
193 0 : reinterpret_cast<const __m256i *>(&matrix[i + 6][j]));
194 0 : __m256i row8 = _mm256_loadu_si256(
195 0 : reinterpret_cast<const __m256i *>(&matrix[i + 7][j]));
196 :
197 : __m256i tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
198 :
199 : // transpose 8x8 submatrix
200 0 : tmp1 = _mm256_unpacklo_epi32(row1, row2);
201 0 : tmp2 = _mm256_unpacklo_epi32(row3, row4);
202 0 : tmp3 = _mm256_unpacklo_epi32(row5, row6);
203 0 : tmp4 = _mm256_unpacklo_epi32(row7, row8);
204 0 : tmp5 = _mm256_unpackhi_epi32(row1, row2);
205 0 : tmp6 = _mm256_unpackhi_epi32(row3, row4);
206 0 : tmp7 = _mm256_unpackhi_epi32(row5, row6);
207 0 : tmp8 = _mm256_unpackhi_epi32(row7, row8);
208 :
209 0 : row1 = _mm256_unpacklo_epi64(tmp1, tmp2);
210 0 : row2 = _mm256_unpacklo_epi64(tmp3, tmp4);
211 0 : row3 = _mm256_unpackhi_epi64(tmp1, tmp2);
212 0 : row4 = _mm256_unpackhi_epi64(tmp3, tmp4);
213 0 : row5 = _mm256_unpacklo_epi64(tmp5, tmp6);
214 0 : row6 = _mm256_unpacklo_epi64(tmp7, tmp8);
215 0 : row7 = _mm256_unpackhi_epi64(tmp5, tmp6);
216 0 : row8 = _mm256_unpackhi_epi64(tmp7, tmp8);
217 :
218 : // store the transposed 8x8 submatrix back to the matrix
219 0 : _mm256_storeu_si256(reinterpret_cast<__m256i *>(&matrix[i][j]),
220 : row1);
221 0 : _mm256_storeu_si256(reinterpret_cast<__m256i *>(&matrix[i + 1][j]),
222 : row2);
223 0 : _mm256_storeu_si256(reinterpret_cast<__m256i *>(&matrix[i + 2][j]),
224 : row3);
225 0 : _mm256_storeu_si256(reinterpret_cast<__m256i *>(&matrix[i + 3][j]),
226 : row4);
227 0 : _mm256_storeu_si256(reinterpret_cast<__m256i *>(&matrix[i + 4][j]),
228 : row5);
229 0 : _mm256_storeu_si256(reinterpret_cast<__m256i *>(&matrix[i + 5][j]),
230 : row6);
231 0 : _mm256_storeu_si256(reinterpret_cast<__m256i *>(&matrix[i + 6][j]),
232 : row7);
233 0 : _mm256_storeu_si256(reinterpret_cast<__m256i *>(&matrix[i + 7][j]),
234 : row8);
235 : }
236 : }
237 0 : }
238 :
239 : #endif
240 :
241 : #endif
|