openGPMP
Open Source Mathematics Package
vector_avx2_i32.cpp
/*************************************************************************
 *
 * Project
 *                            _____ _____  __  __ _____
 *                           / ____|  __ \|  \/  |  __ \
 *    ___  _ __   ___ _ __  | |  __| |__) | \  / | |__) |
 *   / _ \| '_ \ / _ \ '_ \ | | |_ |  ___/| |\/| |  ___/
 *  | (_) | |_) |  __/ | | || |__| | |    | |  | | |
 *   \___/| .__/ \___|_| |_| \_____|_|    |_|  |_|_|
 *        | |
 *        |_|
 *
 * Copyright (C) Akiel Aries, <akiel@akiel.org>, et al.
 *
 * This software is licensed as described in the file LICENSE, which
 * you should have received as part of this distribution. The terms
 * among other details are referenced in the official documentation
 * seen here : https://akielaries.github.io/openGPMP/ along with
 * important files seen in this project.
 *
 * You may opt to use, copy, modify, merge, publish, distribute
 * and/or sell copies of the Software, and permit persons to whom
 * the Software is furnished to do so, under the terms of the
 * LICENSE file. As this is an Open Source effort, all implementations
 * must be of the same methodology.
 *
 *
 *
 * This software is distributed on an AS IS basis, WITHOUT
 * WARRANTY OF ANY KIND, either express or implied.
 *
 ************************************************************************/
#include <cmath>
#include <cstdint>
#include <iostream>
#include <numeric>
#include <stdexcept>
#include <vector>

#if defined(__x86_64__) || defined(__amd64__) || defined(__amd64)

/************************************************************************
 *
 * Vector Operations for AVX ISA
 *
 ************************************************************************/
#if defined(__AVX2__)

// AVX family intrinsics
#include <immintrin.h>

/************************************************************************
 *
 * Vector Operations on Vectors
 *
 ************************************************************************/

/*****************************************************************************/

template <typename T>
void gpmp::linalg::vector_add_i32(const T *data1,
                                  const T *data2,
                                  T *result_data,
                                  size_t size) {
    if (size > 16) {
        size_t i = 0;
        for (; i < size - 7; i += 8) {
            // Load 8 elements from vec1 and vec2
            __m256i a = _mm256_loadu_si256(
                reinterpret_cast<const __m256i *>(data1 + i));
            __m256i b = _mm256_loadu_si256(
                reinterpret_cast<const __m256i *>(data2 + i));

            // Perform vectorized addition
            __m256i c = _mm256_add_epi32(a, b);

            // Store the result back to result vector
            _mm256_storeu_si256(reinterpret_cast<__m256i *>(result_data + i),
                                c);
        }
        // Handle any trailing elements that do not fill a full 8-wide register
        for (; i < size; ++i) {
            result_data[i] = data1[i] + data2[i];
        }
    }

    else {
        // for small sizes (16 elements or fewer), perform standard scalar
        // addition
        for (size_t i = 0; i < size; ++i) {
            result_data[i] = data1[i] + data2[i];
        }
    }
}

void gpmp::linalg::vector_add(const std::vector<int32_t> &vec1,
                              const std::vector<int32_t> &vec2,
                              std::vector<int32_t> &result) {

    const size_t size = vec1.size();
    vector_add_i32(vec1.data(), vec2.data(), result.data(), size);
}

void gpmp::linalg::vector_add(const std::vector<uint32_t> &vec1,
                              const std::vector<uint32_t> &vec2,
                              std::vector<uint32_t> &result) {
    const size_t size = vec1.size();
    vector_add_i32(vec1.data(), vec2.data(), result.data(), size);
}
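A minimal usage sketch for these overloads (not part of this file). The wrappers size the operation from vec1.size() alone and write through result.data() without resizing, so the caller is expected to pre-size result and supply a vec2 of at least the same length. The include path below is an assumption; adjust it to wherever your openGPMP build exposes the gpmp::linalg vector declarations.

#include <cstdint>
#include <iostream>
#include <vector>
#include <openGPMP/linalg.hpp> // assumed header path for the gpmp::linalg declarations

int main() {
    std::vector<int32_t> a(32, 2);
    std::vector<int32_t> b(32, 3);
    std::vector<int32_t> out(a.size()); // result must be pre-sized by the caller

    gpmp::linalg::vector_add(a, b, out);

    std::cout << out[0] << '\n'; // prints 5
    return 0;
}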

template <typename T>
void gpmp::linalg::vector_sub_i32(const T *data1,
                                  const T *data2,
                                  T *result_data,
                                  size_t size) {

    if (size > 16) {
        size_t i = 0;
        for (; i < size - 7; i += 8) {
            // Load 8 elements from each input
            __m256i a = _mm256_loadu_si256(
                reinterpret_cast<const __m256i *>(data1 + i));
            __m256i b = _mm256_loadu_si256(
                reinterpret_cast<const __m256i *>(data2 + i));

            // Perform vectorized subtraction
            __m256i c = _mm256_sub_epi32(a, b);

            // Store the result back to result vector
            _mm256_storeu_si256(reinterpret_cast<__m256i *>(result_data + i),
                                c);
        }

        // Handle any trailing elements
        for (; i < size; ++i) {
            result_data[i] = data1[i] - data2[i];
        }
    }

    else {
        // for small sizes (16 elements or fewer), perform standard scalar
        // subtraction
        for (size_t i = 0; i < size; ++i) {
            result_data[i] = data1[i] - data2[i];
        }
    }
}

void gpmp::linalg::vector_sub(const std::vector<int32_t> &vec1,
                              const std::vector<int32_t> &vec2,
                              std::vector<int32_t> &result) {

    const size_t size = vec1.size();
    vector_sub_i32(vec1.data(), vec2.data(), result.data(), size);
}

void gpmp::linalg::vector_sub(const std::vector<uint32_t> &vec1,
                              const std::vector<uint32_t> &vec2,
                              std::vector<uint32_t> &result) {
    const size_t size = vec1.size();
    vector_sub_i32(vec1.data(), vec2.data(), result.data(), size);
}
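As with the addition wrappers, subtraction derives its element count from vec1.size() only and performs no bounds or size checks, so vec2 and result must be at least as long as vec1. A short sketch under that assumption, dropping into a main() like the one above:

    std::vector<uint32_t> x(20, 10);
    std::vector<uint32_t> y(20, 4);
    std::vector<uint32_t> diff(x.size());
    gpmp::linalg::vector_sub(x, y, diff); // every element of diff becomes 6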

template <typename T>
void gpmp::linalg::scalar_mult_i32(const T *data,
                                   int scalar,
                                   T *result_data,
                                   size_t size) {
    if (size > 16) {
        size_t i = 0;
        __m256i scalar_vec = _mm256_set1_epi32(scalar);
        // Perform vectorized multiplication as long as there are at least 8
        // elements remaining
        for (; i < size - 7; i += 8) {
            // Load 8 elements from vec
            __m256i a =
                _mm256_loadu_si256(reinterpret_cast<const __m256i *>(data + i));

            // Perform vectorized multiplication
            __m256i c = _mm256_mullo_epi32(a, scalar_vec);

            // Store the result back to result vector
            _mm256_storeu_si256(reinterpret_cast<__m256i *>(result_data + i),
                                c);
        }

        // Perform standard multiplication on the remaining elements
        for (; i < size; ++i) {
            result_data[i] = data[i] * scalar;
        }
    } else {
        for (size_t i = 0; i < size; ++i) {
            result_data[i] = data[i] * scalar;
        }
    }
}

void gpmp::linalg::scalar_mult(const std::vector<int32_t> &vec1,
                               int scalar,
                               std::vector<int32_t> &result) {

    const size_t size = vec1.size();
    scalar_mult_i32(vec1.data(), scalar, result.data(), size);
}

void gpmp::linalg::scalar_mult(const std::vector<uint32_t> &vec1,
                               int scalar,
                               std::vector<uint32_t> &result) {
    const size_t size = vec1.size();
    scalar_mult_i32(vec1.data(), scalar, result.data(), size);
}
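A brief sketch of the scalar-multiply wrappers, again assuming the gpmp::linalg declarations are visible through the appropriate openGPMP header. Note that _mm256_mullo_epi32 keeps the low 32 bits of each 32x32-bit product, which matches the 32-bit width of the destination elements:

    std::vector<int32_t> v(24, 7);
    std::vector<int32_t> scaled(v.size());
    gpmp::linalg::scalar_mult(v, 3, scaled); // each element of scaled becomes 21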

template <typename T>
T gpmp::linalg::dot_product_i32(const T *vec1, const T *vec2, size_t size) {
    int result = 0;
    if (size > 16) {
        size_t i = 0;
        // Perform vectorized multiplication and addition as long as there are
        // at least 8 elements remaining
        for (; i < size - 7; i += 8) {
            // Load 8 elements from vec1 and vec2
            __m256i a =
                _mm256_loadu_si256(reinterpret_cast<const __m256i *>(vec1 + i));
            __m256i b =
                _mm256_loadu_si256(reinterpret_cast<const __m256i *>(vec2 + i));

            // Multiply element-wise, then horizontally add twice within each
            // 128-bit lane: element 0 ends up holding the low-lane sum and
            // element 4 the high-lane sum
            __m256i c = _mm256_mullo_epi32(a, b);
            __m256i sum = _mm256_hadd_epi32(c, c);
            sum = _mm256_hadd_epi32(sum, sum);

            // Accumulate the result
            result += _mm256_extract_epi32(sum, 0);
            result += _mm256_extract_epi32(sum, 4);
        }

        // Perform standard dot product on the remaining elements
        for (; i < size; ++i) {
            result += vec1[i] * vec2[i];
        }
    } else {
        for (size_t i = 0; i < size; ++i) {
            result += vec1[i] * vec2[i];
        }
    }

    return result;
}

int gpmp::linalg::dot_product(const std::vector<int32_t> &vec1,
                              const std::vector<int32_t> &vec2) {
    const size_t size = vec1.size();
    return dot_product_i32(vec1.data(), vec2.data(), size);
}

int gpmp::linalg::dot_product(const std::vector<uint32_t> &vec1,
                              const std::vector<uint32_t> &vec2) {
    const size_t size = vec1.size();
    return dot_product_i32(vec1.data(), vec2.data(), size);
}

#endif // INTRINS

// x86
#endif
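To see that the lane-wise reduction in dot_product_i32 lines up with the plain definition of a dot product, here is a self-contained check (the include path is an assumption, as before). For values that stay within 32-bit range, the SIMD path and the scalar loop agree exactly:

#include <cassert>
#include <cstdint>
#include <vector>
#include <openGPMP/linalg.hpp> // assumed header path for gpmp::linalg

int main() {
    std::vector<int32_t> a(40), b(40);
    for (size_t i = 0; i < a.size(); ++i) {
        a[i] = static_cast<int32_t>(i);
        b[i] = static_cast<int32_t>(2 * i);
    }

    int simd = gpmp::linalg::dot_product(a, b);

    // scalar reference: sum of element-wise products
    int ref = 0;
    for (size_t i = 0; i < a.size(); ++i) {
        ref += a[i] * b[i];
    }

    assert(simd == ref); // both equal 41080 for this data
    return 0;
}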