Line data Source code
1 : /************************************************************************* 2 : * 3 : * Project 4 : * _____ _____ __ __ _____ 5 : * / ____| __ \| \/ | __ \ 6 : * ___ _ __ ___ _ __ | | __| |__) | \ / | |__) | 7 : * / _ \| '_ \ / _ \ '_ \| | |_ | ___/| |\/| | ___/ 8 : *| (_) | |_) | __/ | | | |__| | | | | | | | 9 : * \___/| .__/ \___|_| |_|\_____|_| |_| |_|_| 10 : * | | 11 : * |_| 12 : * 13 : * Copyright (C) Akiel Aries, <akiel@akiel.org>, et al. 14 : * 15 : * This software is licensed as described in the file LICENSE, which 16 : * you should have received as part of this distribution. The terms 17 : * among other details are referenced in the official documentation 18 : * seen here : https://akielaries.github.io/openGPMP/ along with 19 : * important files seen in this project. 20 : * 21 : * You may opt to use, copy, modify, merge, publish, distribute 22 : * and/or sell copies of the Software, and permit persons to whom 23 : * the Software is furnished to do so, under the terms of the 24 : * LICENSE file. As this is an Open Source effort, all implementations 25 : * must be of the same methodology. 26 : * 27 : * 28 : * 29 : * This software is distributed on an AS IS basis, WITHOUT 30 : * WARRANTY OF ANY KIND, either express or implied. 31 : * 32 : ************************************************************************/ 33 : #include <cmath> 34 : #include <iostream> 35 : #include <numeric> 36 : #include <openGPMP/linalg/vector.hpp> 37 : #include <stdexcept> 38 : #include <vector> 39 : 40 : #if defined(__x86_64__) || defined(__amd64__) || defined(__amd64) 41 : 42 : /************************************************************************ 43 : * 44 : * Vector Operations for AVX ISA 45 : * 46 : ************************************************************************/ 47 : #if defined(__AVX2__) 48 : 49 : // AVX family intrinsics 50 : #include <immintrin.h> 51 : 52 : /************************************************************************ 53 : * 54 : * Vector Operations on Vectors 55 : * 56 : ************************************************************************/ 57 : 58 4 : void gpmp::linalg::vector_add(const std::vector<double> &vec1, 59 : const std::vector<double> &vec2, 60 : std::vector<double> &result) { 61 4 : const size_t size = vec1.size(); 62 4 : const double *data1 = vec1.data(); 63 4 : const double *data2 = vec2.data(); 64 4 : double *result_data = result.data(); 65 : 66 4 : if (size > 4) { 67 2 : size_t i = 0; 68 : // perform vectorized addition as long as there are at least 4 elements 69 : // remaining 70 2778057 : for (; i < size - 3; i += 4) { 71 : // Load 4 elements from vec1 and vec2 72 2778055 : __m256d a = _mm256_loadu_pd(data1 + i); 73 5556110 : __m256d b = _mm256_loadu_pd(data2 + i); 74 : 75 2778055 : __m256d c = _mm256_add_pd(a, b); 76 : 77 : // Store the result back to result vector 78 2778055 : _mm256_storeu_pd(result_data + i, c); 79 : } 80 : 81 : // Perform standard addition on the remaining elements 82 4 : for (; i < size; ++i) { 83 2 : result_data[i] = data1[i] + data2[i]; 84 : } 85 : } else { // If size is not greater than 4, perform standard addition 86 10 : for (size_t i = 0; i < size; ++i) { 87 8 : result_data[i] = data1[i] + data2[i]; 88 : } 89 : } 90 4 : } 91 : 92 : // Vector subtraction using AVX2 intrinsics, operates on double types 93 4 : void gpmp::linalg::vector_sub(const std::vector<double> &vec1, 94 : const std::vector<double> &vec2, 95 : std::vector<double> &result) { 96 4 : const int vecSize = vec1.size(); 97 4 : const int remainder = vecSize % 4; 98 4 : const int vecSizeAligned = vecSize - remainder; 99 : 100 4 : if (vecSize > 4) { 101 2778063 : for (int i = 0; i < vecSizeAligned; i += 4) { 102 2778059 : __m256d vec1Data = _mm256_loadu_pd(&vec1[i]); 103 5556118 : __m256d vec2Data = _mm256_loadu_pd(&vec2[i]); 104 2778059 : __m256d sub = _mm256_sub_pd(vec1Data, vec2Data); 105 2778059 : _mm256_storeu_pd(&result[i], sub); 106 : } 107 : 108 : // Perform standard subtraction on the remaining elements 109 6 : for (int i = vecSizeAligned; i < vecSize; ++i) { 110 2 : result[i] = vec1[i] - vec2[i]; 111 : } 112 : } else { 113 0 : for (int i = 0; i < vecSize; ++i) { 114 0 : result[i] = vec1[i] - vec2[i]; 115 : } 116 : } 117 4 : } 118 : 119 4 : void gpmp::linalg::scalar_mult(const std::vector<double> &vec, 120 : double scalar, 121 : std::vector<double> &result) { 122 4 : const int vecSize = vec.size(); 123 4 : const int remainder = vecSize % 4; 124 4 : const int vecSizeAligned = vecSize - remainder; 125 : 126 : // Set the scalar value into a vector register 127 4 : __m256d scalarVector = _mm256_set1_pd(scalar); 128 : 129 : // Process aligned part of the vector 130 2778061 : for (int i = 0; i < vecSizeAligned; i += 4) { 131 : // Load 4 double elements from the vector into a AVX register 132 5556114 : __m256d vecData = _mm256_loadu_pd(&vec[i]); 133 : 134 : // Multiply the elements with the scalar 135 2778057 : __m256d resultData = _mm256_mul_pd(vecData, scalarVector); 136 : 137 : // Store the result back to the result vector 138 2778057 : _mm256_storeu_pd(&result[i], resultData); 139 : } 140 : 141 : // Process the remaining part of the vector 142 6 : for (int i = vecSizeAligned; i < vecSize; ++i) { 143 2 : result[i] = vec[i] * scalar; 144 : } 145 4 : } 146 : 147 3 : double gpmp::linalg::dot_product(const std::vector<double> &vec1, 148 : const std::vector<double> &vec2) { 149 3 : const int vecSize = vec1.size(); 150 3 : const int remainder = vecSize % 4; 151 3 : const int vecSizeAligned = vecSize - remainder; 152 : 153 : // Initialize the result to zero 154 3 : __m256d dotProduct = _mm256_setzero_pd(); 155 : 156 : // Process aligned part of the vectors 157 2778059 : for (int i = 0; i < vecSizeAligned; i += 4) { 158 : // Load 4 double elements from each vector into AVX registers 159 2778056 : __m256d vec1Data = _mm256_loadu_pd(&vec1[i]); 160 5556112 : __m256d vec2Data = _mm256_loadu_pd(&vec2[i]); 161 : 162 : // Multiply the elements pairwise and accumulate the result 163 : dotProduct = 164 5556112 : _mm256_add_pd(dotProduct, _mm256_mul_pd(vec1Data, vec2Data)); 165 : } 166 : 167 : // Sum up the elements in the dotProduct register 168 3 : double result = 0.0; 169 : alignas(32) double temp[4]; 170 : _mm256_store_pd(temp, dotProduct); 171 15 : for (int i = 0; i < 4; ++i) { 172 12 : result += temp[i]; 173 : } 174 : 175 : // Process the remaining part of the vectors 176 5 : for (int i = vecSizeAligned; i < vecSize; ++i) { 177 2 : result += vec1[i] * vec2[i]; 178 : } 179 : 180 3 : return result; 181 : } 182 : 183 : #endif 184 : 185 : #endif