Line data Source code
1 : /************************************************************************* 2 : * 3 : * Project 4 : * _____ _____ __ __ _____ 5 : * / ____| __ \| \/ | __ \ 6 : * ___ _ __ ___ _ __ | | __| |__) | \ / | |__) | 7 : * / _ \| '_ \ / _ \ '_ \| | |_ | ___/| |\/| | ___/ 8 : *| (_) | |_) | __/ | | | |__| | | | | | | | 9 : * \___/| .__/ \___|_| |_|\_____|_| |_| |_|_| 10 : * | | 11 : * |_| 12 : * 13 : * Copyright (C) Akiel Aries, <akiel@akiel.org>, et al. 14 : * 15 : * This software is licensed as described in the file LICENSE, which 16 : * you should have received as part of this distribution. The terms 17 : * among other details are referenced in the official documentation 18 : * seen here : https://akielaries.github.io/openGPMP/ along with 19 : * important files seen in this project. 20 : * 21 : * You may opt to use, copy, modify, merge, publish, distribute 22 : * and/or sell copies of the Software, and permit persons to whom 23 : * the Software is furnished to do so, under the terms of the 24 : * LICENSE file. As this is an Open Source effort, all implementations 25 : * must be of the same methodology. 26 : * 27 : * 28 : * 29 : * This software is distributed on an AS IS basis, WITHOUT 30 : * WARRANTY OF ANY KIND, either express or implied. 31 : * 32 : ************************************************************************/ 33 : #include <cassert> 34 : #include <cstddef> 35 : #include <cstdint> 36 : #include <iostream> 37 : #include <openGPMP/linalg/mtx.hpp> 38 : #include <vector> 39 : 40 : #if defined(__x86_64__) || defined(__amd64__) || defined(__amd64) 41 : 42 : /************************************************************************ 43 : * 44 : * Matrix Operations for AVX ISA 45 : * 46 : ************************************************************************/ 47 : #if defined(__AVX2__) 48 : 49 : // AVX family intrinsics 50 : #include <immintrin.h> 51 : 52 : /************************************************************************ 53 : * 54 : * Matrix Operations on Arrays 55 : * 56 : ************************************************************************/ 57 : 58 : // matrix addition using Intel intrinsics, accepts float arrays as matrices 59 0 : void gpmp::linalg::Mtx::mtx_add(const float *A, 60 : const float *B, 61 : float *C, 62 : int rows, 63 : int cols) { 64 0 : if (rows > 16) { 65 0 : for (int i = 0; i < rows; ++i) { 66 0 : int j = 0; 67 : // requires at least size 4x4 size matrices 68 0 : for (; j < cols - 7; j += 8) { 69 : // load 4 elements from A, B, and C matrices using SIMD 70 0 : __m256 a = _mm256_loadu_ps(&A[i * cols + j]); 71 0 : __m256 b = _mm256_loadu_ps(&B[i * cols + j]); 72 0 : __m256 c = _mm256_loadu_ps(&C[i * cols + j]); 73 : // perform vectorized addition and accumulate the result 74 0 : c = _mm256_add_ps(a, b); 75 : 76 : // store the result back to the C matrix 77 0 : _mm256_storeu_ps(&C[i * cols + j], c); 78 : } 79 : 80 : // handle the remaining elements that are not multiples of 8 81 0 : for (; j < cols; ++j) { 82 0 : C[i * cols + j] = A[i * cols + j] + B[i * cols + j]; 83 : } 84 : } 85 : } else { 86 : // use standard matrix addition 87 0 : std_mtx_add(A, B, C, rows, cols); 88 : } 89 0 : } 90 : 91 : #endif 92 : 93 : // x86 94 : #endif