Line data Source code
1 : /************************************************************************* 2 : * 3 : * Project 4 : * _____ _____ __ __ _____ 5 : * / ____| __ \| \/ | __ \ 6 : * ___ _ __ ___ _ __ | | __| |__) | \ / | |__) | 7 : * / _ \| '_ \ / _ \ '_ \| | |_ | ___/| |\/| | ___/ 8 : *| (_) | |_) | __/ | | | |__| | | | | | | | 9 : * \___/| .__/ \___|_| |_|\_____|_| |_| |_|_| 10 : * | | 11 : * |_| 12 : * 13 : * Copyright (C) Akiel Aries, <akiel@akiel.org>, et al. 14 : * 15 : * This software is licensed as described in the file LICENSE, which 16 : * you should have received as part of this distribution. The terms 17 : * among other details are referenced in the official documentation 18 : * seen here : https://akielaries.github.io/openGPMP/ along with 19 : * important files seen in this project. 20 : * 21 : * You may opt to use, copy, modify, merge, publish, distribute 22 : * and/or sell copies of the Software, and permit persons to whom 23 : * the Software is furnished to do so, under the terms of the 24 : * LICENSE file. As this is an Open Source effort, all implementations 25 : * must be of the same methodology. 26 : * 27 : * 28 : * 29 : * This software is distributed on an AS IS basis, WITHOUT 30 : * WARRANTY OF ANY KIND, either express or implied. 31 : * 32 : ************************************************************************/ 33 : #include <cassert> 34 : #include <cstddef> 35 : #include <cstdint> 36 : #include <iostream> 37 : #include <openGPMP/linalg/mtx.hpp> 38 : #include <vector> 39 : 40 : #if defined(__x86_64__) || defined(__amd64__) || defined(__amd64) 41 : 42 : /************************************************************************ 43 : * 44 : * Matrix Operations for AVX ISA 45 : * 46 : ************************************************************************/ 47 : #if defined(__AVX2__) 48 : 49 : // AVX family intrinsics 50 : #include <immintrin.h> 51 : 52 : /************************************************************************ 53 : * 54 : * Matrix Operations on Arrays 55 : * 56 : ************************************************************************/ 57 : // matrix addition for 16-bit integers using 256-bit SIMD registers 58 3 : void gpmp::linalg::Mtx::mtx_add(const int16_t *A, 59 : const int16_t *B, 60 : int16_t *C, 61 : int rows, 62 : int cols) { 63 : // BUG FIXME 64 2351 : for (int i = 0; i < rows; ++i) { 65 2348 : int j = 0; 66 138820 : for (; j < cols - 15; j += 16) { 67 136472 : __m256i a = _mm256_loadu_si256( 68 136472 : reinterpret_cast<const __m256i *>(&A[i * cols + j])); 69 136472 : __m256i b = _mm256_loadu_si256( 70 136472 : reinterpret_cast<const __m256i *>(&B[i * cols + j])); 71 136472 : __m256i c = _mm256_loadu_si256( 72 136472 : reinterpret_cast<const __m256i *>(&C[i * cols + j])); 73 : 74 : // Perform vectorized addition and accumulate the result 75 136472 : c = _mm256_add_epi16(c, _mm256_add_epi16(a, b)); 76 : 77 : // Store the result back to the C matrix 78 136472 : _mm256_storeu_si256(reinterpret_cast<__m256i *>(&C[i * cols + j]), 79 : c); 80 : } 81 : 82 5948 : for (; j < cols; ++j) { 83 3600 : C[i * cols + j] = A[i * cols + j] + B[i * cols + j]; 84 : } 85 : } 86 3 : } 87 : 88 1 : void gpmp::linalg::Mtx::mtx_sub(const int16_t *A, 89 : const int16_t *B, 90 : int16_t *C, 91 : int rows, 92 : int cols) { 93 1025 : for (int i = 0; i < rows; ++i) { 94 1024 : int j = 0; 95 66560 : for (; j < cols - 15; j += 16) { 96 65536 : __m256i a = _mm256_loadu_si256( 97 65536 : reinterpret_cast<const __m256i *>(&A[i * cols + j])); 98 65536 : __m256i b = _mm256_loadu_si256( 99 65536 : reinterpret_cast<const __m256i *>(&B[i * cols + j])); 100 65536 : __m256i c = _mm256_loadu_si256( 101 65536 : reinterpret_cast<const __m256i *>(&C[i * cols + j])); 102 : 103 : // Perform vectorized subtraction and accumulate the result 104 65536 : c = _mm256_sub_epi16(a, b); 105 : 106 : // Store the result back to the C matrix 107 65536 : _mm256_storeu_si256(reinterpret_cast<__m256i *>(&C[i * cols + j]), 108 : c); 109 : } 110 : 111 1024 : for (; j < cols; ++j) { 112 0 : C[i * cols + j] = A[i * cols + j] - B[i * cols + j]; 113 : } 114 : } 115 1 : } 116 : 117 1 : void gpmp::linalg::Mtx::mtx_mult(const int16_t *A, 118 : const int16_t *B, 119 : int16_t *C, 120 : int rows_a, 121 : int cols_a, 122 : int cols_b) { 123 1025 : for (int i = 0; i < rows_a; ++i) { 124 66560 : for (int j = 0; j < cols_b; j += 16) { 125 65536 : __m256i c = _mm256_setzero_si256(); 126 : 127 67174400 : for (int k = 0; k < cols_a; ++k) { 128 67108864 : __m256i a = _mm256_set1_epi16(A[i * cols_a + k]); 129 67108864 : __m256i b = _mm256_loadu_si256( 130 67108864 : reinterpret_cast<const __m256i *>(&B[k * cols_b + j])); 131 : 132 67108864 : __m256i prod = _mm256_mullo_epi16(a, b); 133 67108864 : c = _mm256_add_epi16(c, prod); 134 : } 135 : 136 65536 : _mm256_storeu_si256(reinterpret_cast<__m256i *>(&C[i * cols_b + j]), 137 : c); 138 : } 139 : 140 : // Handle remaining elements 141 1024 : for (int j = cols_b - cols_b % 16; j < cols_b; ++j) { 142 0 : int sum = 0; 143 : 144 0 : for (int k = 0; k < cols_a; ++k) { 145 0 : sum += A[i * cols_a + k] * B[k * cols_b + j]; 146 : } 147 : 148 0 : C[i * cols_b + j] = sum; 149 : } 150 : } 151 1 : } 152 : 153 : #endif 154 : 155 : // x86 156 : #endif