LCOV - code coverage report
Current view: top level - modules/linalg/avx - vector_avx2_i32.cpp (source / functions) Hit Total Coverage
Test: lcov.info Lines: 75 90 83.3 %
Date: 2024-05-13 05:06:06 Functions: 8 16 50.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*************************************************************************
       2             :  *
       3             :  *  Project
       4             :  *                         _____ _____  __  __ _____
       5             :  *                        / ____|  __ \|  \/  |  __ \
       6             :  *  ___  _ __   ___ _ __ | |  __| |__) | \  / | |__) |
       7             :  * / _ \| '_ \ / _ \ '_ \| | |_ |  ___/| |\/| |  ___/
       8             :  *| (_) | |_) |  __/ | | | |__| | |    | |  | | |
       9             :  * \___/| .__/ \___|_| |_|\_____|_|    |_|  |_|_|
      10             :  *      | |
      11             :  *      |_|
      12             :  *
      13             :  * Copyright (C) Akiel Aries, <akiel@akiel.org>, et al.
      14             :  *
      15             :  * This software is licensed as described in the file LICENSE, which
      16             :  * you should have received as part of this distribution. The terms
      17             :  * among other details are referenced in the official documentation
      18             :  * seen here : https://akielaries.github.io/openGPMP/ along with
      19             :  * important files seen in this project.
      20             :  *
      21             :  * You may opt to use, copy, modify, merge, publish, distribute
      22             :  * and/or sell copies of the Software, and permit persons to whom
      23             :  * the Software is furnished to do so, under the terms of the
      24             :  * LICENSE file. As this is an Open Source effort, all implementations
      25             :  * must be of the same methodology.
      26             :  *
      27             :  *
      28             :  *
      29             :  * This software is distributed on an AS IS basis, WITHOUT
      30             :  * WARRANTY OF ANY KIND, either express or implied.
      31             :  *
      32             :  ************************************************************************/
      33             : #include <cmath>
      34             : #include <cstdint>
      35             : #include <iostream>
      36             : #include <numeric>
      37             : #include <openGPMP/linalg/vector.hpp>
      38             : #include <stdexcept>
      39             : #include <vector>
      40             : 
      41             : #if defined(__x86_64__) || defined(__amd64__) || defined(__amd64)
      42             : 
      43             : /************************************************************************
      44             :  *
      45             :  * Vector Operations for AVX ISA
      46             :  *
      47             :  ************************************************************************/
      48             : #if defined(__AVX2__)
      49             : 
      50             : // AVX family intrinsics
      51             : #include <immintrin.h>
      52             : 
      53             : /************************************************************************
      54             :  *
      55             :  * Vector Operations on Vectors
      56             :  *
      57             :  ************************************************************************/
      58             : 
      59             : /*****************************************************************************/
      60             : 
      61             : template <typename T>
      62           4 : void gpmp::linalg::vector_add_i32(const T *data1,
      63             :                                   const T *data2,
      64             :                                   T *result_data,
      65             :                                   size_t size) {
      66           4 :     if (size > 16) {
      67           2 :         size_t i = 0;
      68     1389029 :         for (; i < size - 7; i += 8) {
      69             :             // Load 8 elements from vec1 and vec2
      70     1389027 :             __m256i a = _mm256_loadu_si256(
      71     1389027 :                 reinterpret_cast<const __m256i *>(data1 + i));
      72     1389027 :             __m256i b = _mm256_loadu_si256(
      73     1389027 :                 reinterpret_cast<const __m256i *>(data2 + i));
      74             : 
      75             :             // Perform vectorized addition
      76     1389027 :             __m256i c = _mm256_add_epi32(a, b);
      77             : 
      78             :             // Store the result back to result vector
      79     1389027 :             _mm256_storeu_si256(reinterpret_cast<__m256i *>(result_data + i),
      80             :                                 c);
      81             :         }
      82           8 :         for (; i < size; ++i) {
      83           6 :             result_data[i] = data1[i] + data2[i];
      84             :         }
      85             :     }
      86             : 
      87             :     else {
      88             :         // if size is not a multiple of 8, perform standard addition
      89          18 :         for (size_t i = 0; i < size; ++i) {
      90          16 :             result_data[i] = data1[i] + data2[i];
      91             :         }
      92             :     }
      93           4 : }
      94             : 
      95           4 : void gpmp::linalg::vector_add(const std::vector<int32_t> &vec1,
      96             :                               const std::vector<int32_t> &vec2,
      97             :                               std::vector<int32_t> &result) {
      98             : 
      99           4 :     const size_t size = vec1.size();
     100           4 :     vector_add_i32(vec1.data(), vec2.data(), result.data(), size);
     101           4 : }
     102             : 
     103           0 : void gpmp::linalg::vector_add(const std::vector<uint32_t> &vec1,
     104             :                               const std::vector<uint32_t> &vec2,
     105             :                               std::vector<uint32_t> &result) {
     106           0 :     const size_t size = vec1.size();
     107           0 :     vector_add_i32(vec1.data(), vec2.data(), result.data(), size);
     108           0 : }
     109             : 
     110             : template <typename T>
     111           4 : void gpmp::linalg::vector_sub_i32(const T *data1,
     112             :                                   const T *data2,
     113             :                                   T *result_data,
     114             :                                   size_t size) {
     115             : 
     116           4 :     if (size > 16) {
     117           2 :         size_t i = 0;
     118     1389029 :         for (; i < size - 7; i += 8) {
     119             :             //__m256i a = _mm256_loadu_si256(reinterpret_cast<const __m256i
     120             :             //*>(&data1[i]));
     121     1389027 :             __m256i a = _mm256_loadu_si256(
     122     1389027 :                 reinterpret_cast<const __m256i *>(data1 + i));
     123             : 
     124             :             //__m256i b = _mm256_loadu_si256(reinterpret_cast<const __m256i
     125             :             //*>(&data2[i]));
     126     1389027 :             __m256i b = _mm256_loadu_si256(
     127     1389027 :                 reinterpret_cast<const __m256i *>(data2 + i));
     128             : 
     129     1389027 :             __m256i c = _mm256_sub_epi32(a, b);
     130             :             //_mm256_storeu_si256(reinterpret_cast<__m256i *>(&result_data[i]),
     131             :             // c);
     132             :             // Store the result back to result vector
     133     1389027 :             _mm256_storeu_si256(reinterpret_cast<__m256i *>(result_data + i),
     134             :                                 c);
     135             :         }
     136             : 
     137           8 :         for (; i < size; ++i) {
     138           6 :             result_data[i] = data1[i] - data2[i];
     139             :         }
     140             :     }
     141             : 
     142             :     else {
     143          18 :         for (size_t i = 0; i < size; ++i) {
     144          16 :             result_data[i] = data1[i] - data2[i];
     145             :         }
     146             :     }
     147           4 : }
     148             : 
     149           4 : void gpmp::linalg::vector_sub(const std::vector<int32_t> &vec1,
     150             :                               const std::vector<int32_t> &vec2,
     151             :                               std::vector<int32_t> &result) {
     152             : 
     153           4 :     const size_t size = vec1.size();
     154           4 :     vector_sub_i32(vec1.data(), vec2.data(), result.data(), size);
     155           4 : }
     156             : 
     157           0 : void gpmp::linalg::vector_sub(const std::vector<uint32_t> &vec1,
     158             :                               const std::vector<uint32_t> &vec2,
     159             :                               std::vector<uint32_t> &result) {
     160           0 :     const size_t size = vec1.size();
     161           0 :     vector_sub_i32(vec1.data(), vec2.data(), result.data(), size);
     162           0 : }
     163             : 
     164             : template <typename T>
     165           4 : void gpmp::linalg::scalar_mult_i32(const T *data,
     166             :                                    int scalar,
     167             :                                    T *result_data,
     168             :                                    size_t size) {
     169           4 :     if (size > 16) {
     170           2 :         size_t i = 0;
     171           2 :         __m256i scalar_vec = _mm256_set1_epi32(scalar);
     172             :         // Perform vectorized multiplication as long as there are at least 8
     173             :         // elements remaining
     174     1389029 :         for (; i < size - 7; i += 8) {
     175             :             // Load 8 elements from vec
     176             :             __m256i a =
     177     2778054 :                 _mm256_loadu_si256(reinterpret_cast<const __m256i *>(data + i));
     178             : 
     179             :             // Perform vectorized multiplication
     180     1389027 :             __m256i c = _mm256_mullo_epi32(a, scalar_vec);
     181             : 
     182             :             // Store the result back to result vector
     183     1389027 :             _mm256_storeu_si256(reinterpret_cast<__m256i *>(result_data + i),
     184             :                                 c);
     185             :         }
     186             : 
     187             :         // Perform standard multiplication on the remaining elements
     188           8 :         for (; i < size; ++i) {
     189           6 :             result_data[i] = data[i] * scalar;
     190             :         }
     191             :     } else {
     192          18 :         for (size_t i = 0; i < size; ++i) {
     193          16 :             result_data[i] = data[i] * scalar;
     194             :         }
     195             :     }
     196           4 : }
     197             : 
     198           4 : void gpmp::linalg::scalar_mult(const std::vector<int32_t> &vec1,
     199             :                                int scalar,
     200             :                                std::vector<int32_t> &result) {
     201             : 
     202           4 :     const size_t size = vec1.size();
     203           4 :     scalar_mult_i32(vec1.data(), scalar, result.data(), size);
     204           4 : }
     205             : 
     206           0 : void gpmp::linalg::scalar_mult(const std::vector<uint32_t> &vec1,
     207             :                                int scalar,
     208             :                                std::vector<uint32_t> &result) {
     209           0 :     const size_t size = vec1.size();
     210           0 :     scalar_mult_i32(vec1.data(), scalar, result.data(), size);
     211           0 : }
     212             : 
     213             : template <typename T>
     214           3 : T gpmp::linalg::dot_product_i32(const T *vec1, const T *vec2, size_t size) {
     215           3 :     int result = 0;
     216           3 :     if (size > 16) {
     217           2 :         size_t i = 0;
     218             :         // Perform vectorized multiplication and addition as long as there are
     219             :         // at least 8 elements remaining
     220     1389029 :         for (; i < size - 7; i += 8) {
     221             :             // Load 8 elements from vec1 and vec2
     222             :             __m256i a =
     223     1389027 :                 _mm256_loadu_si256(reinterpret_cast<const __m256i *>(vec1 + i));
     224             :             __m256i b =
     225     2778054 :                 _mm256_loadu_si256(reinterpret_cast<const __m256i *>(vec2 + i));
     226             : 
     227             :             // Perform vectorized multiplication and addition
     228     1389027 :             __m256i c = _mm256_mullo_epi32(a, b);
     229     1389027 :             __m256i sum = _mm256_hadd_epi32(c, c);
     230     1389027 :             sum = _mm256_hadd_epi32(sum, sum);
     231             : 
     232             :             // Accumulate the result
     233     1389027 :             result += _mm256_extract_epi32(sum, 0);
     234     1389027 :             result += _mm256_extract_epi32(sum, 4);
     235             :         }
     236             : 
     237             :         // Perform standard dot product on the remaining elements
     238           8 :         for (; i < size; ++i) {
     239           6 :             result += vec1[i] * vec2[i];
     240             :         }
     241             :     } else {
     242           9 :         for (size_t i = 0; i < size; ++i) {
     243           8 :             result += vec1[i] * vec2[i];
     244             :         }
     245             :     }
     246             : 
     247           3 :     return result;
     248             : }
     249             : 
     250           3 : int gpmp::linalg::dot_product(const std::vector<int32_t> &vec1,
     251             :                               const std::vector<int32_t> &vec2) {
     252           3 :     const size_t size = vec1.size();
     253           3 :     return dot_product_i32(vec1.data(), vec2.data(), size);
     254             : }
     255             : 
     256           0 : int gpmp::linalg::dot_product(const std::vector<uint32_t> &vec1,
     257             :                               const std::vector<uint32_t> &vec2) {
     258           0 :     const size_t size = vec1.size();
     259           0 :     return dot_product_i32(vec1.data(), vec2.data(), size);
     260             : }
     261             : 
     262             : #endif // INTRINS
     263             : 
     264             : // x86
     265             : #endif

Generated by: LCOV version 1.14