41 #if defined(__x86_64__) || defined(__amd64__) || defined(__amd64)
51 #include <immintrin.h>
// Body fragments of an element-wise 32-bit addition routine. The enclosing
// function header is not part of this listing (presumably vector_add_i32 —
// TODO confirm).
//
// Vector path: every iteration consumes eight 32-bit elements, loading them
// from data1 and data2 with unaligned 256-bit loads and adding lane-wise.
// NOTE(review): if `size` is an unsigned size_t, `size - 7` wraps around for
// size < 7 and this loop would then read and write past the buffers. A bound
// such as `i + 8 <= size` avoids the wrap — confirm against the full function.
68 for (; i < size - 7; i += 8) {
70 __m256i a = _mm256_loadu_si256(
71 reinterpret_cast<const __m256i *
>(data1 + i));
72 __m256i b = _mm256_loadu_si256(
73 reinterpret_cast<const __m256i *
>(data2 + i));
76 __m256i c = _mm256_add_epi32(a, b);
// Unaligned 256-bit store into result_data at offset i (the stored operand is
// cut from this listing).
79 _mm256_storeu_si256(
reinterpret_cast<__m256i *
>(result_data + i),
// Scalar tail: continues from the current i and handles the elements the
// 8-wide loop did not cover.
82 for (; i < size; ++i) {
83 result_data[i] = data1[i] + data2[i];
// Plain scalar loop over all elements (presumably the non-SIMD branch of the
// preprocessor guard — TODO confirm).
89 for (
size_t i = 0; i < size; ++i) {
90 result_data[i] = data1[i] + data2[i];
// Parameter tail of an overload taking std::vector<int32_t> operands and an
// output vector by reference (presumably a vector_add overload — TODO confirm;
// the first parameter and the function name are cut from this listing).
96 const std::vector<int32_t> &vec2,
97 std::vector<int32_t> &result) {
// The element count is taken from vec1 only.
// NOTE(review): whether vec2 and result are size-checked or resized is not
// visible in this listing — verify.
99 const size_t size = vec1.size();
// Same shape for std::vector<uint32_t> operands.
104 const std::vector<uint32_t> &vec2,
105 std::vector<uint32_t> &result) {
106 const size_t size = vec1.size();
// Template header and body fragments of an element-wise 32-bit subtraction
// routine. The signature line is cut from this listing (presumably
// vector_sub_i32 — TODO confirm).
110 template <
typename T>
// Vector path: every iteration consumes eight 32-bit elements, loading them
// from data1 and data2 with unaligned 256-bit loads and subtracting lane-wise
// (data1 - data2).
// NOTE(review): if `size` is an unsigned size_t, `size - 7` wraps around for
// size < 7 and this loop would then read and write past the buffers. A bound
// such as `i + 8 <= size` avoids the wrap — confirm against the full function.
118 for (; i < size - 7; i += 8) {
121 __m256i a = _mm256_loadu_si256(
122 reinterpret_cast<const __m256i *
>(data1 + i));
126 __m256i b = _mm256_loadu_si256(
127 reinterpret_cast<const __m256i *
>(data2 + i));
129 __m256i c = _mm256_sub_epi32(a, b);
// Unaligned 256-bit store into result_data at offset i (the stored operand is
// cut from this listing).
133 _mm256_storeu_si256(
reinterpret_cast<__m256i *
>(result_data + i),
// Scalar tail: continues from the current i and handles the elements the
// 8-wide loop did not cover.
137 for (; i < size; ++i) {
138 result_data[i] = data1[i] - data2[i];
// Plain scalar loop over all elements (presumably the non-SIMD branch of the
// preprocessor guard — TODO confirm).
143 for (
size_t i = 0; i < size; ++i) {
144 result_data[i] = data1[i] - data2[i];
// Parameter tail of an overload taking std::vector<int32_t> operands and an
// output vector by reference (presumably a vector_sub overload — TODO confirm;
// the first parameter and the function name are cut from this listing).
150 const std::vector<int32_t> &vec2,
151 std::vector<int32_t> &result) {
// The element count is taken from vec1 only.
// NOTE(review): whether vec2 and result are size-checked or resized is not
// visible in this listing — verify.
153 const size_t size = vec1.size();
// Same shape for std::vector<uint32_t> operands.
158 const std::vector<uint32_t> &vec2,
159 std::vector<uint32_t> &result) {
160 const size_t size = vec1.size();
// Template header and body fragments of a routine that multiplies every
// 32-bit element by one scalar. The signature line is cut from this listing
// (presumably scalar_mult_i32 — TODO confirm).
164 template <
typename T>
// Broadcast the scalar into all eight 32-bit lanes once, outside the loop.
171 __m256i scalar_vec = _mm256_set1_epi32(scalar);
// Vector path: every iteration consumes eight 32-bit elements.
// NOTE(review): if `size` is an unsigned size_t, `size - 7` wraps around for
// size < 7 and this loop would then read and write past the buffers. A bound
// such as `i + 8 <= size` avoids the wrap — confirm against the full function.
174 for (; i < size - 7; i += 8) {
// Unaligned 256-bit load from data at offset i (the variable receiving it is
// cut from this listing; the multiply below reads it as `a`).
177 _mm256_loadu_si256(
reinterpret_cast<const __m256i *
>(data + i));
// mullo keeps the low 32 bits of each lane-wise product.
180 __m256i c = _mm256_mullo_epi32(a, scalar_vec);
// Unaligned 256-bit store into result_data at offset i (the stored operand is
// cut from this listing).
183 _mm256_storeu_si256(
reinterpret_cast<__m256i *
>(result_data + i),
// Scalar tail: continues from the current i and handles the elements the
// 8-wide loop did not cover.
188 for (; i < size; ++i) {
189 result_data[i] = data[i] * scalar;
// Plain scalar loop over all elements (presumably the non-SIMD branch of the
// preprocessor guard — TODO confirm).
192 for (
size_t i = 0; i < size; ++i) {
193 result_data[i] = data[i] * scalar;
// Parameter tail of an overload writing into a std::vector<int32_t> output
// (presumably a scalar_mult overload — TODO confirm; the leading parameters
// and the function name are cut from this listing).
200 std::vector<int32_t> &result) {
// The element count is taken from vec1.
// NOTE(review): whether result is resized before being written is not visible
// in this listing — verify.
202 const size_t size = vec1.size();
// Same shape for a std::vector<uint32_t> output.
208 std::vector<uint32_t> &result) {
209 const size_t size = vec1.size();
// Template header and body fragments of a 32-bit integer dot-product routine
// that accumulates into `result`. The signature line and the declaration of
// `result` are cut from this listing (presumably dot_product_i32 — TODO
// confirm).
213 template <
typename T>
// Vector path: every iteration consumes eight 32-bit elements from each input.
// NOTE(review): if `size` is an unsigned size_t, `size - 7` wraps around for
// size < 7 and this loop would then read past the buffers. A bound such as
// `i + 8 <= size` avoids the wrap — confirm against the full function.
220 for (; i < size - 7; i += 8) {
// Unaligned 256-bit loads from vec1 and vec2 at offset i (the variables
// receiving them are cut from this listing; the multiply below reads them as
// `a` and `b`).
223 _mm256_loadu_si256(
reinterpret_cast<const __m256i *
>(vec1 + i));
225 _mm256_loadu_si256(
reinterpret_cast<const __m256i *
>(vec2 + i));
// Lane-wise products, keeping the low 32 bits of each.
228 __m256i c = _mm256_mullo_epi32(a, b);
// hadd works inside each 128-bit half independently. After two passes,
// element 0 holds the sum of products 0..3 and element 4 holds the sum of
// products 4..7.
229 __m256i sum = _mm256_hadd_epi32(c, c);
230 sum = _mm256_hadd_epi32(sum, sum);
// Fold the two half-sums into the running total.
233 result += _mm256_extract_epi32(sum, 0);
234 result += _mm256_extract_epi32(sum, 4);
// Scalar tail: continues from the current i and handles the elements the
// 8-wide loop did not cover.
238 for (; i < size; ++i) {
239 result += vec1[i] * vec2[i];
// Plain scalar loop over all elements (presumably the non-SIMD branch of the
// preprocessor guard — TODO confirm).
242 for (
size_t i = 0; i < size; ++i) {
243 result += vec1[i] * vec2[i];
// Parameter tail of an overload taking std::vector<int32_t> operands
// (presumably a dot_product overload — TODO confirm; the first parameter, the
// return type and the function name are cut from this listing).
251 const std::vector<int32_t> &vec2) {
// The element count is taken from vec1 only.
// NOTE(review): whether vec2's size is checked against it is not visible in
// this listing — verify.
252 const size_t size = vec1.size();
// Same shape for std::vector<uint32_t> operands.
257 const std::vector<uint32_t> &vec2) {
258 const size_t size = vec1.size();
int dot_product(const std::vector< int8_t > &vec1, const std::vector< int8_t > &vec2)
Computes the dot product for vectors of signed 8-bit integers.
void vector_add_i32(const T *data1, const T *data2, T *result_data, size_t size)
Performs vector addition for signed 32-bit integers.
void vector_sub_i32(const T *data1, const T *data2, T *result_data, size_t size)
Performs vector subtraction for signed 32-bit integers.
void vector_sub(const std::vector< int8_t > &vec1, const std::vector< int8_t > &vec2, std::vector< int8_t > &result)
Performs vector subtraction for vectors of signed 8-bit integers.
void scalar_mult(const std::vector< int8_t > &vec1, int scalar, std::vector< int8_t > &result)
Performs scalar multiplication for vectors of signed 8-bit integers.
void scalar_mult_i32(const T *data, int scalar, T *result_data, size_t size)
Performs scalar multiplication for signed 32-bit integers.
void vector_add(const std::vector< int8_t > &vec1, const std::vector< int8_t > &vec2, std::vector< int8_t > &result)
Performs vector addition for vectors of signed 8-bit integers.
T dot_product_i32(const T *vec1, const T *vec2, size_t size)
Computes the dot product for signed 32-bit integer vectors.