LCOV - code coverage report

Current view: top level - modules/linalg/avx - mtx_avx2_vec_i32.cpp
Test:         lcov.info
Date:         2024-05-13 05:06:06

               Hit   Total   Coverage
Lines:          33      99     33.3 %
Functions:       2       4     50.0 %

/*************************************************************************
 *
 *  Project
 *                         _____ _____  __  __ _____
 *                        / ____|  __ \|  \/  |  __ \
 *  ___  _ __   ___ _ __ | |  __| |__) | \  / | |__) |
 * / _ \| '_ \ / _ \ '_ \| | |_ |  ___/| |\/| |  ___/
 *| (_) | |_) |  __/ | | | |__| | |    | |  | | |
 * \___/| .__/ \___|_| |_|\_____|_|    |_|  |_|_|
 *      | |
 *      |_|
 *
 * Copyright (C) Akiel Aries, <akiel@akiel.org>, et al.
 *
 * This software is licensed as described in the file LICENSE, which
 * you should have received as part of this distribution. The terms
 * among other details are referenced in the official documentation
 * seen here : https://akielaries.github.io/openGPMP/ along with
 * important files seen in this project.
 *
 * You may opt to use, copy, modify, merge, publish, distribute
 * and/or sell copies of the Software, and permit persons to whom
 * the Software is furnished to do so, under the terms of the
 * LICENSE file. As this is an Open Source effort, all implementations
 * must be of the same methodology.
 *
 *
 *
 * This software is distributed on an AS IS basis, WITHOUT
 * WARRANTY OF ANY KIND, either express or implied.
 *
 ************************************************************************/
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <openGPMP/linalg/mtx.hpp>
#include <vector>

#if defined(__x86_64__) || defined(__amd64__) || defined(__amd64)

/************************************************************************
 *
 * Matrix Operations for AVX ISA
 *
 ************************************************************************/
#if defined(__AVX2__)

// AVX family intrinsics
#include <immintrin.h>

/************************************************************************
 *
 * Matrix Operations on vector<vector>
 *
 ************************************************************************/
      57             : // matrix addition using Intel intrinsic, accepts integer types
      58           3 : void gpmp::linalg::Mtx::mtx_add(const std::vector<std::vector<int>> &A,
      59             :                                 const std::vector<std::vector<int>> &B,
      60             :                                 std::vector<std::vector<int>> &C) {
      61           3 :     const int rows = A.size();
      62           3 :     const int cols = A[0].size();
      63             : 
      64           3 :     if (rows > 16) {
      65        2115 :         for (int i = 0; i < rows; ++i) {
      66        2112 :             int j = 0;
      67             :             // requires at least size 8x8 size matrices
      68      264768 :             for (; j < cols - 7; j += 8) {
      69             :                 // load 8 elements from A, B, and C matrices using SIMD
      70      262656 :                 __m256i a = _mm256_loadu_si256(
      71      262656 :                     reinterpret_cast<const __m256i *>(&A[i][j]));
      72      262656 :                 __m256i b = _mm256_loadu_si256(
      73      262656 :                     reinterpret_cast<const __m256i *>(&B[i][j]));
      74      262656 :                 __m256i c = _mm256_loadu_si256(
      75      262656 :                     reinterpret_cast<const __m256i *>(&C[i][j]));
      76             : 
      77             :                 // perform vectorized addition
      78      262656 :                 c = _mm256_add_epi32(a, b);
      79             : 
      80             :                 // store the result back to the C matrix
      81      262656 :                 _mm256_storeu_si256(reinterpret_cast<__m256i *>(&C[i][j]), c);
      82             :             }
      83             : 
      84             :             // handle the remaining elements that are not multiples of 8
      85        2112 :             for (; j < cols; ++j) {
      86           0 :                 C[i][j] = A[i][j] + B[i][j];
      87             :             }
      88             :         }
      89             :     }
      90             : 
      91             :     // use standard matrix addition
      92             :     else {
      93           0 :         std_mtx_add(A, B, C);
      94             :     }
      95           3 : }
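
// A minimal usage sketch for the routine above; it assumes
// gpmp::linalg::Mtx is default-constructible and picks dimensions that
// take the AVX2 path (more than 16 rows, column count a multiple of 8):
//
//     #include <openGPMP/linalg/mtx.hpp>
//     #include <vector>
//
//     int main() {
//         const int n = 32;
//         std::vector<std::vector<int>> A(n, std::vector<int>(n, 1));
//         std::vector<std::vector<int>> B(n, std::vector<int>(n, 2));
//         std::vector<std::vector<int>> C(n, std::vector<int>(n, 0));
//
//         gpmp::linalg::Mtx mtx;
//         mtx.mtx_add(A, B, C); // every C[i][j] is now 3
//         return 0;
//     }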

// matrix subtraction using Intel intrinsics, accepts integer types
void gpmp::linalg::Mtx::mtx_sub(const std::vector<std::vector<int>> &A,
                                const std::vector<std::vector<int>> &B,
                                std::vector<std::vector<int>> &C) {
    const int rows = A.size();
    const int cols = A[0].size();

    for (int i = 0; i < rows; ++i) {
        int j = 0;
        // vectorized loop: processes 8 ints per iteration, so it needs
        // at least 8 remaining columns
        for (; j < cols - 7; j += 8) {
            // load 8 elements each from A and B using SIMD
            __m256i a =
                _mm256_loadu_si256(reinterpret_cast<const __m256i *>(&A[i][j]));
            __m256i b =
                _mm256_loadu_si256(reinterpret_cast<const __m256i *>(&B[i][j]));

            // perform vectorized subtraction
            __m256i c = _mm256_sub_epi32(a, b);

            // store the result back to the C matrix
            _mm256_storeu_si256(reinterpret_cast<__m256i *>(&C[i][j]), c);
        }

        // handle the remaining elements that are not multiples of 8
        for (; j < cols; ++j) {
            C[i][j] = A[i][j] - B[i][j];
        }
    }
}

// matrix multiplication using Intel intrinsics, accepts integer types
void gpmp::linalg::Mtx::mtx_mult(const std::vector<std::vector<int>> &A,
                                 const std::vector<std::vector<int>> &B,
                                 std::vector<std::vector<int>> &C) {
    const int rows_a = A.size();
    const int cols_a = A[0].size();
    const int cols_b = B[0].size();

    for (int i = 0; i < rows_a; ++i) {
        // vectorized loop: stop 8 columns short of the end so the 8-wide
        // loads and stores stay within bounds
        for (int j = 0; j < cols_b - 7; j += 8) {
            // initialize a vector of zeros for the result
            __m256i c = _mm256_setzero_si256();

            for (int k = 0; k < cols_a; ++k) {
                // broadcast A[i][k] and load 8 elements of row k of B
                __m256i a = _mm256_set1_epi32(A[i][k]);
                __m256i b = _mm256_loadu_si256(
                    reinterpret_cast<const __m256i *>(&B[k][j]));

                // perform vectorized multiplication, keeping the low
                // 32 bits of each product
                __m256i prod = _mm256_mullo_epi32(a, b);

                // perform vectorized addition
                c = _mm256_add_epi32(c, prod);
            }

            // store the result back to the C matrix
            _mm256_storeu_si256(reinterpret_cast<__m256i *>(&C[i][j]), c);
        }

        // handle the remaining elements that are not multiples of 8
        for (int j = cols_b - cols_b % 8; j < cols_b; ++j) {
            int sum = 0;

            for (int k = 0; k < cols_a; ++k) {
                sum += A[i][k] * B[k][j];
            }

            C[i][j] = sum;
        }
    }
}
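
// The kernel above uses the broadcast formulation of matrix multiply:
// _mm256_set1_epi32 splats A[i][k] across a register, _mm256_mullo_epi32
// multiplies it against eight consecutive entries of row k of B (keeping
// the low 32 bits of each product), and _mm256_add_epi32 accumulates.
// A scalar sketch of the same computation, one output at a time:
//
//     for (int i = 0; i < rows_a; ++i) {
//         for (int j = 0; j < cols_b; ++j) {
//             int sum = 0;
//             for (int k = 0; k < cols_a; ++k) {
//                 sum += A[i][k] * B[k][j]; // A[i][k] is the broadcast value
//             }
//             C[i][j] = sum;
//         }
//     }
//
// The vector loop simply computes eight of these sums (j through j+7) at
// once, which is why a scalar cleanup loop handles any leftover columns.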

// transpose an 8x8 block of 32-bit ints held in eight AVX2 registers;
// the unpack instructions shuffle only within 128-bit lanes, so a final
// permute stage is required to exchange the lanes
static inline void transpose_8x8_i32(__m256i r[8]) {
    // stage 1: interleave 32-bit elements of adjacent rows
    __m256i t0 = _mm256_unpacklo_epi32(r[0], r[1]);
    __m256i t1 = _mm256_unpackhi_epi32(r[0], r[1]);
    __m256i t2 = _mm256_unpacklo_epi32(r[2], r[3]);
    __m256i t3 = _mm256_unpackhi_epi32(r[2], r[3]);
    __m256i t4 = _mm256_unpacklo_epi32(r[4], r[5]);
    __m256i t5 = _mm256_unpackhi_epi32(r[4], r[5]);
    __m256i t6 = _mm256_unpacklo_epi32(r[6], r[7]);
    __m256i t7 = _mm256_unpackhi_epi32(r[6], r[7]);

    // stage 2: interleave 64-bit pairs
    __m256i u0 = _mm256_unpacklo_epi64(t0, t2);
    __m256i u1 = _mm256_unpackhi_epi64(t0, t2);
    __m256i u2 = _mm256_unpacklo_epi64(t1, t3);
    __m256i u3 = _mm256_unpackhi_epi64(t1, t3);
    __m256i u4 = _mm256_unpacklo_epi64(t4, t6);
    __m256i u5 = _mm256_unpackhi_epi64(t4, t6);
    __m256i u6 = _mm256_unpacklo_epi64(t5, t7);
    __m256i u7 = _mm256_unpackhi_epi64(t5, t7);

    // stage 3: exchange 128-bit lanes to complete the transpose
    r[0] = _mm256_permute2x128_si256(u0, u4, 0x20);
    r[1] = _mm256_permute2x128_si256(u1, u5, 0x20);
    r[2] = _mm256_permute2x128_si256(u2, u6, 0x20);
    r[3] = _mm256_permute2x128_si256(u3, u7, 0x20);
    r[4] = _mm256_permute2x128_si256(u0, u4, 0x31);
    r[5] = _mm256_permute2x128_si256(u1, u5, 0x31);
    r[6] = _mm256_permute2x128_si256(u2, u6, 0x31);
    r[7] = _mm256_permute2x128_si256(u3, u7, 0x31);
}

// transpose matrices of type int using Intel intrinsics; works in place
// on square matrices whose dimensions are a multiple of 8
void gpmp::linalg::Mtx::mtx_tpose(std::vector<std::vector<int>> &matrix) {
    const int rows = matrix.size();
    const int cols = matrix[0].size();

    for (int i = 0; i < rows; i += 8) {
        for (int j = i; j < cols; j += 8) {
            // load the 8x8 block starting at (i, j) and transpose it
            __m256i blk[8];
            for (int k = 0; k < 8; ++k) {
                blk[k] = _mm256_loadu_si256(
                    reinterpret_cast<const __m256i *>(&matrix[i + k][j]));
            }
            transpose_8x8_i32(blk);

            if (i == j) {
                // diagonal block: store the transposed block in place
                for (int k = 0; k < 8; ++k) {
                    _mm256_storeu_si256(
                        reinterpret_cast<__m256i *>(&matrix[i + k][j]),
                        blk[k]);
                }
            } else {
                // off-diagonal block: transpose the mirrored block at
                // (j, i) too, then store each block in the other's place
                __m256i mir[8];
                for (int k = 0; k < 8; ++k) {
                    mir[k] = _mm256_loadu_si256(
                        reinterpret_cast<const __m256i *>(&matrix[j + k][i]));
                }
                transpose_8x8_i32(mir);

                for (int k = 0; k < 8; ++k) {
                    _mm256_storeu_si256(
                        reinterpret_cast<__m256i *>(&matrix[i + k][j]),
                        mir[k]);
                    _mm256_storeu_si256(
                        reinterpret_cast<__m256i *>(&matrix[j + k][i]),
                        blk[k]);
                }
            }
        }
    }
}
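
// A minimal sanity check for the transpose, assuming gpmp::linalg::Mtx is
// default-constructible; the matrix must be square with dimensions a
// multiple of 8:
//
//     const int n = 16;
//     std::vector<std::vector<int>> M(n, std::vector<int>(n));
//     for (int r = 0; r < n; ++r)
//         for (int c = 0; c < n; ++c)
//             M[r][c] = r * n + c;
//
//     gpmp::linalg::Mtx mtx;
//     mtx.mtx_tpose(M); // afterwards M[r][c] == c * n + r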

#endif // __AVX2__

#endif // __x86_64__ / __amd64__

Generated by: LCOV version 1.14