41 #if defined(__x86_64__) || defined(__amd64__) || defined(__amd64)
51 #include <immintrin.h>
// 32-wide step of the element-wise 8-bit addition. The bound is phrased as
// `i + 32 <= size` so that no subtraction is performed on the unsigned size:
// with a size_t, `size - 31` wraps to a huge value whenever size < 31, and the
// vector loop would then read and write far beyond the buffers.
68 for (; i + 32 <= size; i += 32) {
// Unaligned 256-bit loads: 32 consecutive bytes from each input.
69 __m256i a = _mm256_loadu_si256(
70 reinterpret_cast<const __m256i *
>(data1 + i));
71 __m256i b = _mm256_loadu_si256(
72 reinterpret_cast<const __m256i *
>(data2 + i));
// Byte-lane addition; each lane wraps on overflow (no saturation).
73 __m256i c = _mm256_add_epi8(a, b);
// Unaligned 256-bit store into the output at the same offset.
74 _mm256_storeu_si256(
reinterpret_cast<__m256i *
>(result_data + i),
// Scalar tail: finishes the elements the 32-wide loop did not cover.
78 for (; i < size; ++i) {
79 result_data[i] = data1[i] + data2[i];
// int8_t overload: second operand and the output vector.
84 const std::vector<int8_t> &vec2,
85 std::vector<int8_t> &result) {
// The element count is taken from vec1 alone.
// NOTE(review): no size check or resize of vec2/result is visible here -- confirm.
87 const size_t size = vec1.size();
// uint8_t overload with the same parameter shape.
92 const std::vector<uint8_t> &vec2,
93 std::vector<uint8_t> &result) {
// As above, the element count comes from vec1 only.
94 const size_t size = vec1.size();
// 32-wide step of the element-wise 8-bit subtraction. The bound is phrased as
// `i + 32 <= size` so that no subtraction is performed on the unsigned size:
// with a size_t, `size - 31` wraps to a huge value whenever size < 31, and the
// vector loop would then read and write far beyond the buffers.
106 for (; i + 32 <= size; i += 32) {
// Unaligned 256-bit loads: 32 consecutive bytes from each input.
107 __m256i a = _mm256_loadu_si256(
108 reinterpret_cast<const __m256i *
>(data1 + i));
109 __m256i b = _mm256_loadu_si256(
110 reinterpret_cast<const __m256i *
>(data2 + i));
// Byte-lane subtraction (a - b); each lane wraps on overflow (no saturation).
111 __m256i c = _mm256_sub_epi8(a, b);
// Unaligned 256-bit store into the output at the same offset.
112 _mm256_storeu_si256(
reinterpret_cast<__m256i *
>(result_data + i),
// Scalar tail: finishes the elements the 32-wide loop did not cover.
118 for (; i < size; ++i) {
119 result_data[i] = data1[i] - data2[i];
// int8_t overload: second operand and the output vector.
124 const std::vector<int8_t> &vec2,
125 std::vector<int8_t> &result) {
// The element count is taken from vec1 alone and forwarded, together with the
// raw buffers of all three vectors, to vector_sub_i8.
// NOTE(review): no size check or resize of vec2/result is visible here -- confirm.
127 const size_t size = vec1.size();
128 vector_sub_i8(vec1.data(), vec2.data(), result.data(), size);
// uint8_t overload: same forwarding, with unsigned element buffers.
132 const std::vector<uint8_t> &vec2,
133 std::vector<uint8_t> &result) {
134 const size_t size = vec1.size();
135 vector_sub_i8(vec1.data(), vec2.data(), result.data(), size);
// Multiplies 8-bit elements by an integer scalar; T is the element type.
138 template <
typename T>
// The scalar replicated into every 16-bit lane (only its low 16 bits are kept).
146 __m256i scalar_vec = _mm256_set1_epi16(scalar);
// 32 elements per iteration. The bound is phrased as `i + 32 <= size` so that
// no subtraction is performed on the unsigned size: with a size_t,
// `size - 31` wraps to a huge value whenever size < 31, and the vector loop
// would then read and write far beyond the buffers.
148 for (; i + 32 <= size; i += 32) {
151 _mm256_loadu_si256(
reinterpret_cast<const __m256i *
>(data + i));
// Interleaving with zero widens each byte to a zero-extended 16-bit lane
// (the low and the high eight bytes of each 128-bit half, respectively).
154 __m256i a_low = _mm256_unpacklo_epi8(a, _mm256_setzero_si256());
155 __m256i a_high = _mm256_unpackhi_epi8(a, _mm256_setzero_si256());
// Lane-wise multiply, keeping the low 16 bits of each product.
158 __m256i c_low = _mm256_mullo_epi16(a_low, scalar_vec);
159 __m256i c_high = _mm256_mullo_epi16(a_high, scalar_vec);
// Narrow back to bytes with unsigned saturation: each signed 16-bit lane is
// clamped to 0..255. The in-lane pack undoes the in-lane unpack ordering.
// NOTE(review): the scalar tail below converts the int product to T instead
// of saturating, and the zero-extension above ignores the sign of int8_t
// inputs, so the vector and scalar paths can disagree -- confirm which
// behaviour is intended.
162 __m256i c = _mm256_packus_epi16(c_low, c_high);
// Unaligned 256-bit store of the 32 result bytes.
165 _mm256_storeu_si256(
reinterpret_cast<__m256i *
>(result_data + i),
// Scalar tail: finishes the elements the 32-wide loop did not cover.
169 for (; i < size; ++i) {
170 result_data[i] = data[i] * scalar;
// int8_t overload: output vector parameter.
177 std::vector<int8_t> &result) {
// The element count is taken from vec1.
// NOTE(review): no size check or resize of result is visible here -- confirm.
179 const size_t size = vec1.size();
// uint8_t overload with the same parameter shape.
185 std::vector<uint8_t> &result) {
186 const size_t size = vec1.size();
// Dot product of two 8-bit buffers; T is the element type.
190 template <
typename T>
// 16 elements per iteration. The bound is phrased as `i + 16 <= size` so that
// no subtraction is performed on the unsigned size: with a size_t,
// `size - 15` wraps to a huge value whenever size < 15, and the vector loop
// would then read far beyond the buffers.
197 for (; i + 16 <= size; i += 16) {
// Load 16 bytes from each input and zero-extend them to sixteen 16-bit lanes.
// NOTE(review): zero-extension treats int8_t inputs as 0..255; negative values
// would need _mm256_cvtepi8_epi16 -- confirm against the callers.
200 __m256i a = _mm256_cvtepu8_epi16(
201 _mm_loadu_si128(
reinterpret_cast<const __m128i *
>(vec1 + i)));
202 __m256i b = _mm256_cvtepu8_epi16(
203 _mm_loadu_si128(
reinterpret_cast<const __m128i *
>(vec2 + i)));
// Lane-wise products (low 16 bits), then three horizontal adds that operate
// within each 128-bit half: afterwards element 0 holds the sum of the lower
// half's eight products and element 8 the sum of the upper half's.
// NOTE(review): everything here is 16-bit arithmetic, so large products or
// partial sums wrap before they reach `result` -- confirm the expected range.
206 __m256i c = _mm256_mullo_epi16(a, b);
207 __m256i sum = _mm256_hadd_epi16(c, c);
208 sum = _mm256_hadd_epi16(sum, sum);
209 sum = _mm256_hadd_epi16(sum, sum);
// Fold the two per-half totals into the running result.
212 result += _mm256_extract_epi16(sum, 0);
213 result += _mm256_extract_epi16(sum, 8);
// Scalar tail: finishes the elements the 16-wide loop did not cover.
217 for (; i < size; ++i) {
218 result += vec1[i] * vec2[i];
// Plain scalar loop over the whole range.
// NOTE(review): this loop restarts at index 0; presumably it is the non-SIMD
// alternative selected by a conditional not visible here -- confirm, otherwise
// every product would be accumulated twice.
221 for (
size_t i = 0; i < size; ++i) {
222 result += vec1[i] * vec2[i];
// int8_t overload: second operand.
230 const std::vector<int8_t> &vec2) {
// The element count is taken from vec1 alone.
// NOTE(review): no check that vec2 has at least as many elements is visible here -- confirm.
231 const size_t size = vec1.size();
// uint8_t overload with the same parameter shape.
236 const std::vector<uint8_t> &vec2) {
237 const size_t size = vec1.size();
int dot_product(const std::vector< int8_t > &vec1, const std::vector< int8_t > &vec2)
Computes the dot product for vectors of signed 8-bit integers.
void vector_add_i8(const T *data1, const T *data2, T *result_data, size_t size)
Performs vector addition for signed 8-bit integers.
void vector_sub(const std::vector< int8_t > &vec1, const std::vector< int8_t > &vec2, std::vector< int8_t > &result)
Performs vector subtraction for vectors of signed 8-bit integers.
void scalar_mult(const std::vector< int8_t > &vec1, int scalar, std::vector< int8_t > &result)
Performs scalar multiplication for vectors of signed 8-bit integers.
void vector_sub_i8(const T *data1, const T *data2, T *result_data, size_t size)
Performs element-wise vector subtraction on 8-bit integers; T is the element type (the vector_sub overloads pass both int8_t and uint8_t data).
void vector_add(const std::vector< int8_t > &vec1, const std::vector< int8_t > &vec2, std::vector< int8_t > &result)
Performs vector addition for vectors of signed 8-bit integers.
T dot_product_i8(const T *vec1, const T *vec2, size_t size)
Computes the dot product for signed 8-bit integer vectors.
void scalar_mult_i8(const T *data, int scalar, T *result_data, size_t size)
Performs scalar multiplication for signed 8-bit integers.