Line data Source code
1 : /*************************************************************************
2 : *
3 : * Project
4 : * _____ _____ __ __ _____
5 : * / ____| __ \| \/ | __ \
6 : * ___ _ __ ___ _ __ | | __| |__) | \ / | |__) |
7 : * / _ \| '_ \ / _ \ '_ \| | |_ | ___/| |\/| | ___/
8 : *| (_) | |_) | __/ | | | |__| | | | | | | |
9 : * \___/| .__/ \___|_| |_|\_____|_| |_| |_|_|
10 : * | |
11 : * |_|
12 : *
13 : * Copyright (C) Akiel Aries, <akiel@akiel.org>, et al.
14 : *
15 : * This software is licensed as described in the file LICENSE, which
16 : * you should have received as part of this distribution. The terms
17 : * among other details are referenced in the official documentation
18 : * seen here : https://akielaries.github.io/openGPMP/ along with
19 : * important files seen in this project.
20 : *
21 : * You may opt to use, copy, modify, merge, publish, distribute
22 : * and/or sell copies of the Software, and permit persons to whom
23 : * the Software is furnished to do so, under the terms of the
24 : * LICENSE file. As this is an Open Source effort, all implementations
25 : * must be of the same methodology.
26 : *
27 : *
28 : *
29 : * This software is distributed on an AS IS basis, WITHOUT
30 : * WARRANTY OF ANY KIND, either express or implied.
31 : *
32 : ************************************************************************/
33 : #include <cassert>
34 : #include <cstddef>
35 : #include <cstdint>
36 : #include <iostream>
37 : #include <openGPMP/linalg/mtx.hpp>
38 : #include <vector>
39 :
40 : #if defined(__x86_64__) || defined(__amd64__) || defined(__amd64)
41 :
42 : /************************************************************************
43 : *
44 : * Matrix Operations for AVX ISA
45 : *
46 : ************************************************************************/
47 : #if defined(__AVX2__)
48 :
49 : // AVX family intrinsics
50 : #include <immintrin.h>
51 :
52 : /************************************************************************
53 : *
54 : * Matrix Operations on vector<vector>
55 : *
56 : ************************************************************************/
57 : // matrix addition using Intel intrinsics, accepts type double
58 3 : void gpmp::linalg::Mtx::mtx_add(const std::vector<std::vector<double>> &A,
59 : const std::vector<std::vector<double>> &B,
60 : std::vector<std::vector<double>> &C) {
61 3 : const int rows = A.size();
62 3 : const int cols = A[0].size();
63 :
64 3 : if (rows > 8) {
65 2115 : for (int i = 0; i < rows; ++i) {
66 2112 : int j = 0;
67 : // requires at least size 4x4 matrices
68 527424 : for (; j < cols - 3; j += 4) {
69 : // load 4 elements from A, B, and C matrices using SIMD
70 525312 : __m256d a = _mm256_loadu_pd(&A[i][j]);
71 525312 : __m256d b = _mm256_loadu_pd(&B[i][j]);
72 1050624 : __m256d c = _mm256_loadu_pd(&C[i][j]);
73 :
74 : // perform vectorized addition
75 525312 : c = _mm256_add_pd(a, b);
76 :
77 : // store the result back to the C matrix
78 525312 : _mm256_storeu_pd(&C[i][j], c);
79 : }
80 :
81 : // handle the remaining elements that are not multiples of 4
82 2112 : for (; j < cols; ++j) {
83 0 : C[i][j] = A[i][j] + B[i][j];
84 : }
85 : }
86 : } else {
87 0 : std_mtx_add(A, B, C);
88 : }
89 3 : }
90 :
91 : // matrix subtraction using Intel intrinsics, accepts double types
92 0 : void gpmp::linalg::Mtx::mtx_sub(const std::vector<std::vector<double>> &A,
93 : const std::vector<std::vector<double>> &B,
94 : std::vector<std::vector<double>> &C) {
95 0 : const int rows = A.size();
96 0 : const int cols = A[0].size();
97 :
98 0 : for (int i = 0; i < rows; ++i) {
99 0 : int j = 0;
100 : // requires at least size 4x4 matrices
101 0 : for (; j < cols - 3; j += 4) {
102 : // load 4 elements from A, B, and C matrices using SIMD
103 0 : __m256d a = _mm256_loadu_pd(&A[i][j]);
104 0 : __m256d b = _mm256_loadu_pd(&B[i][j]);
105 0 : __m256d c = _mm256_loadu_pd(&C[i][j]);
106 :
107 : // perform vectorized subtraction
108 0 : c = _mm256_sub_pd(a, b);
109 :
110 : // store the result back to the C matrix
111 0 : _mm256_storeu_pd(&C[i][j], c);
112 : }
113 :
114 : // handle the remaining elements that are not multiples of 4
115 0 : for (; j < cols; ++j) {
116 0 : C[i][j] = A[i][j] - B[i][j];
117 : }
118 : }
119 0 : }
120 :
121 : // matrix multiplication using Intel intrinsics, accepts double types
122 0 : void gpmp::linalg::Mtx::mtx_mult(const std::vector<std::vector<double>> &A,
123 : const std::vector<std::vector<double>> &B,
124 : std::vector<std::vector<double>> &C) {
125 0 : const int rows_a = A.size();
126 0 : const int cols_a = A[0].size();
127 0 : const int cols_b = B[0].size();
128 :
129 0 : for (int i = 0; i < rows_a; ++i) {
130 0 : for (int j = 0; j < cols_b; j += 4) {
131 : // initialize a vector of zeros for the result
132 0 : __m256d c = _mm256_setzero_pd();
133 :
134 0 : for (int k = 0; k < cols_a; ++k) {
135 : // load 4 elements from matrices A and B using SIMD
136 0 : __m256d a = _mm256_set1_pd(A[i][k]);
137 0 : __m256d b = _mm256_loadu_pd(&B[k][j]);
138 :
139 : // perform vectorized multiplication
140 0 : __m256d prod = _mm256_mul_pd(a, b);
141 :
142 : // perform vectorized addition
143 0 : c = _mm256_add_pd(c, prod);
144 : }
145 :
146 : // store the result back to the C matrix
147 0 : _mm256_storeu_pd(&C[i][j], c);
148 : }
149 :
150 : // handle the remaining elements that are not multiples of 4
151 0 : for (int j = cols_b - cols_b % 4; j < cols_b; ++j) {
152 0 : double sum = 0.0;
153 :
154 0 : for (int k = 0; k < cols_a; ++k) {
155 0 : sum += A[i][k] * B[k][j];
156 : }
157 :
158 0 : C[i][j] = sum;
159 : }
160 : }
161 0 : }
162 :
163 0 : void gpmp::linalg::Mtx::mtx_tpose(std::vector<std::vector<double>> &matrix) {
164 0 : const int rows = matrix.size();
165 0 : const int cols = matrix[0].size();
166 :
167 0 : for (int i = 0; i < rows; i += 4) {
168 0 : for (int j = i; j < cols; j += 4) {
169 0 : __m256d row1 = _mm256_loadu_pd(&matrix[i][j]);
170 0 : __m256d row2 = _mm256_loadu_pd(&matrix[i + 1][j]);
171 0 : __m256d row3 = _mm256_loadu_pd(&matrix[i + 2][j]);
172 0 : __m256d row4 = _mm256_loadu_pd(&matrix[i + 3][j]);
173 :
174 : __m256d tmp1, tmp2, tmp3, tmp4;
175 :
176 : // Transpose 4x4 submatrix
177 0 : tmp1 = _mm256_unpacklo_pd(row1, row2);
178 0 : tmp2 = _mm256_unpackhi_pd(row1, row2);
179 0 : tmp3 = _mm256_unpacklo_pd(row3, row4);
180 0 : tmp4 = _mm256_unpackhi_pd(row3, row4);
181 :
182 0 : row1 = _mm256_permute2f128_pd(tmp1, tmp3, 0x20);
183 0 : row2 = _mm256_permute2f128_pd(tmp2, tmp4, 0x20);
184 0 : row3 = _mm256_permute2f128_pd(tmp1, tmp3, 0x31);
185 0 : row4 = _mm256_permute2f128_pd(tmp2, tmp4, 0x31);
186 :
187 : // Store the transposed 4x4 submatrix back to the matrix
188 0 : _mm256_storeu_pd(&matrix[i][j], row1);
189 0 : _mm256_storeu_pd(&matrix[i + 1][j], row2);
190 0 : _mm256_storeu_pd(&matrix[i + 2][j], row3);
191 0 : _mm256_storeu_pd(&matrix[i + 3][j], row4);
192 : }
193 : }
194 0 : }
195 :
196 : #endif
197 :
198 : #endif
|