openGPMP
Open Source Mathematics Package
vector_avx2_i8.cpp
Go to the documentation of this file.
1 /*************************************************************************
2  *
3  * Project
4  * _____ _____ __ __ _____
5  * / ____| __ \| \/ | __ \
6  * ___ _ __ ___ _ __ | | __| |__) | \ / | |__) |
7  * / _ \| '_ \ / _ \ '_ \| | |_ | ___/| |\/| | ___/
8  *| (_) | |_) | __/ | | | |__| | | | | | | |
9  * \___/| .__/ \___|_| |_|\_____|_| |_| |_|_|
10  * | |
11  * |_|
12  *
13  * Copyright (C) Akiel Aries, <akiel@akiel.org>, et al.
14  *
15  * This software is licensed as described in the file LICENSE, which
16  * you should have received as part of this distribution. The terms
17  * among other details are referenced in the official documentation
18  * seen here : https://akielaries.github.io/openGPMP/ along with
19  * important files seen in this project.
20  *
21  * You may opt to use, copy, modify, merge, publish, distribute
22  * and/or sell copies of the Software, and permit persons to whom
23  * the Software is furnished to do so, under the terms of the
24  * LICENSE file. As this is an Open Source effort, all implementations
25  * must be of the same methodology.
26  *
27  *
28  *
29  * This software is distributed on an AS IS basis, WITHOUT
30  * WARRANTY OF ANY KIND, either express or implied.
31  *
32  ************************************************************************/
33 #include <cmath>
34 #include <cstdint>
35 #include <iostream>
36 #include <numeric>
38 #include <stdexcept>
39 #include <vector>
40 
41 #if defined(__x86_64__) || defined(__amd64__) || defined(__amd64)
42 
43 /************************************************************************
44  *
45  * Vector Operations for AVX ISA
46  *
47  ************************************************************************/
48 #if defined(__AVX2__)
49 
50 // AVX family intrinsics
51 #include <immintrin.h>
52 
53 /************************************************************************
54  *
55  * Vector Operations on Vectors
56  *
57  ************************************************************************/
58 
59 /*****************************************************************************/
60 
61 template <typename T>
62 void gpmp::linalg::vector_add_i8(const T *data1,
63  const T *data2,
64  T *result_data,
65  size_t size) {
66  size_t i = 0;
67  if (size > 64) {
68  for (; i < size - 31; i += 32) {
69  __m256i a = _mm256_loadu_si256(
70  reinterpret_cast<const __m256i *>(data1 + i));
71  __m256i b = _mm256_loadu_si256(
72  reinterpret_cast<const __m256i *>(data2 + i));
73  __m256i c = _mm256_add_epi8(a, b);
74  _mm256_storeu_si256(reinterpret_cast<__m256i *>(result_data + i),
75  c);
76  }
77  }
78  for (; i < size; ++i) {
79  result_data[i] = data1[i] + data2[i];
80  }
81 }
82 
83 void gpmp::linalg::vector_add(const std::vector<int8_t> &vec1,
84  const std::vector<int8_t> &vec2,
85  std::vector<int8_t> &result) {
86 
87  const size_t size = vec1.size();
88  vector_add_i8(vec1.data(), vec2.data(), result.data(), size);
89 }
90 
91 void gpmp::linalg::vector_add(const std::vector<uint8_t> &vec1,
92  const std::vector<uint8_t> &vec2,
93  std::vector<uint8_t> &result) {
94  const size_t size = vec1.size();
95  vector_add_i8(vec1.data(), vec2.data(), result.data(), size);
96 }
97 
98 template <typename T>
99 void gpmp::linalg::vector_sub_i8(const T *data1,
100  const T *data2,
101  T *result_data,
102  size_t size) {
103  size_t i = 0;
104 
105  if (size > 64) {
106  for (; i < size - 31; i += 32) {
107  __m256i a = _mm256_loadu_si256(
108  reinterpret_cast<const __m256i *>(data1 + i));
109  __m256i b = _mm256_loadu_si256(
110  reinterpret_cast<const __m256i *>(data2 + i));
111  __m256i c = _mm256_sub_epi8(a, b);
112  _mm256_storeu_si256(reinterpret_cast<__m256i *>(result_data + i),
113  c);
114  }
115  }
116 
117  // Perform standard subtraction on the remaining elements
118  for (; i < size; ++i) {
119  result_data[i] = data1[i] - data2[i];
120  }
121 }
122 
123 void gpmp::linalg::vector_sub(const std::vector<int8_t> &vec1,
124  const std::vector<int8_t> &vec2,
125  std::vector<int8_t> &result) {
126 
127  const size_t size = vec1.size();
128  vector_sub_i8(vec1.data(), vec2.data(), result.data(), size);
129 }
130 
131 void gpmp::linalg::vector_sub(const std::vector<uint8_t> &vec1,
132  const std::vector<uint8_t> &vec2,
133  std::vector<uint8_t> &result) {
134  const size_t size = vec1.size();
135  vector_sub_i8(vec1.data(), vec2.data(), result.data(), size);
136 }
137 
138 template <typename T>
139 void gpmp::linalg::scalar_mult_i8(const T *data,
140  int scalar,
141  T *result_data,
142  size_t size) {
143  size_t i = 0;
144 
145  if (size > 64) {
146  __m256i scalar_vec = _mm256_set1_epi16(scalar);
147 
148  for (; i < size - 31; i += 32) {
149  // Load 32 elements from vec
150  __m256i a =
151  _mm256_loadu_si256(reinterpret_cast<const __m256i *>(data + i));
152 
153  // Split the 16-bit values into two 8-bit halves
154  __m256i a_low = _mm256_unpacklo_epi8(a, _mm256_setzero_si256());
155  __m256i a_high = _mm256_unpackhi_epi8(a, _mm256_setzero_si256());
156 
157  // Perform vectorized multiplication of 16-bit integers
158  __m256i c_low = _mm256_mullo_epi16(a_low, scalar_vec);
159  __m256i c_high = _mm256_mullo_epi16(a_high, scalar_vec);
160 
161  // Pack the 16-bit integers back to 8-bit integers
162  __m256i c = _mm256_packus_epi16(c_low, c_high);
163 
164  // Store the result back to result vector
165  _mm256_storeu_si256(reinterpret_cast<__m256i *>(result_data + i),
166  c);
167  }
168  }
169  for (; i < size; ++i) {
170  result_data[i] = data[i] * scalar;
171  }
172 }
173 
174 // TODO/FIXME : the result should be templated too
175 void gpmp::linalg::scalar_mult(const std::vector<int8_t> &vec1,
176  int scalar,
177  std::vector<int8_t> &result) {
178 
179  const size_t size = vec1.size();
180  scalar_mult_i8(vec1.data(), scalar, result.data(), size);
181 }
182 
183 void gpmp::linalg::scalar_mult(const std::vector<uint8_t> &vec1,
184  int scalar,
185  std::vector<uint8_t> &result) {
186  const size_t size = vec1.size();
187  scalar_mult_i8(vec1.data(), scalar, result.data(), size);
188 }
189 
190 template <typename T>
191 T gpmp::linalg::dot_product_i8(const T *vec1, const T *vec2, size_t size) {
192  int result = 0;
193  if (size > 32) {
194  size_t i = 0;
195  // Perform vectorized multiplication and addition as long as there are
196  // at least 16 elements remaining
197  for (; i < size - 15; i += 16) {
198  // Load 16 elements from vec1 and vec2, unpacking them to 16-bit
199  // integers
200  __m256i a = _mm256_cvtepu8_epi16(
201  _mm_loadu_si128(reinterpret_cast<const __m128i *>(vec1 + i)));
202  __m256i b = _mm256_cvtepu8_epi16(
203  _mm_loadu_si128(reinterpret_cast<const __m128i *>(vec2 + i)));
204 
205  // Perform vectorized multiplication and addition
206  __m256i c = _mm256_mullo_epi16(a, b);
207  __m256i sum = _mm256_hadd_epi16(c, c);
208  sum = _mm256_hadd_epi16(sum, sum);
209  sum = _mm256_hadd_epi16(sum, sum);
210 
211  // Accumulate the result
212  result += _mm256_extract_epi16(sum, 0);
213  result += _mm256_extract_epi16(sum, 8);
214  }
215 
216  // Perform standard dot product on the remaining elements
217  for (; i < size; ++i) {
218  result += vec1[i] * vec2[i];
219  }
220  } else {
221  for (size_t i = 0; i < size; ++i) {
222  result += vec1[i] * vec2[i];
223  }
224  }
225 
226  return result;
227 }
228 
229 int gpmp::linalg::dot_product(const std::vector<int8_t> &vec1,
230  const std::vector<int8_t> &vec2) {
231  const size_t size = vec1.size();
232  return dot_product_i8(vec1.data(), vec2.data(), size);
233 }
234 
235 int gpmp::linalg::dot_product(const std::vector<uint8_t> &vec1,
236  const std::vector<uint8_t> &vec2) {
237  const size_t size = vec1.size();
238  return dot_product_i8(vec1.data(), vec2.data(), size);
239 }
240 
241 #endif // INTRINS
242 
243 // x86
244 #endif
int dot_product(const std::vector< int8_t > &vec1, const std::vector< int8_t > &vec2)
Computes the dot product for vectors of signed 8-bit integers.
void vector_add_i8(const T *data1, const T *data2, T *result_data, size_t size)
Performs vector addition for signed 8-bit integers.
void vector_sub(const std::vector< int8_t > &vec1, const std::vector< int8_t > &vec2, std::vector< int8_t > &result)
Performs vector subtraction for vectors of signed 8-bit integers.
void scalar_mult(const std::vector< int8_t > &vec1, int scalar, std::vector< int8_t > &result)
Performs scalar multiplication for vectors of signed 8-bit integers.
void vector_sub_i8(const T *data1, const T *data2, T *result_data, size_t size)
Performs vector subtraction for signed 8-bit integers.
void vector_add(const std::vector< int8_t > &vec1, const std::vector< int8_t > &vec2, std::vector< int8_t > &result)
Performs vector addition for vectors of signed 8-bit integers.
T dot_product_i8(const T *vec1, const T *vec2, size_t size)
Computes the dot product for signed 8-bit integer vectors.
void scalar_mult_i8(const T *data, int scalar, T *result_data, size_t size)
Performs scalar multiplication for signed 8-bit integers.