openGPMP
Open Source Mathematics Package
vector_arm_i32.cpp
Go to the documentation of this file.
1 /*************************************************************************
2  *
3  * Project
4  * _____ _____ __ __ _____
5  * / ____| __ \| \/ | __ \
6  * ___ _ __ ___ _ __ | | __| |__) | \ / | |__) |
7  * / _ \| '_ \ / _ \ '_ \| | |_ | ___/| |\/| | ___/
8  *| (_) | |_) | __/ | | | |__| | | | | | | |
9  * \___/| .__/ \___|_| |_|\_____|_| |_| |_|_|
10  * | |
11  * |_|
12  *
13  * Copyright (C) Akiel Aries, <akiel@akiel.org>, et al.
14  *
15  * This software is licensed as described in the file LICENSE, which
16  * you should have received as part of this distribution. The terms
17  * among other details are referenced in the official documentation
18  * seen here : https://akielaries.github.io/openGPMP/ along with
19  * important files seen in this project.
20  *
21  * You may opt to use, copy, modify, merge, publish, distribute
22  * and/or sell copies of the Software, and permit persons to whom
23  * the Software is furnished to do so, under the terms of the
24  * LICENSE file. As this is an Open Source effort, all implementations
25  * must be of the same methodology.
26  *
27  *
28  *
29  * This software is distributed on an AS IS basis, WITHOUT
30  * WARRANTY OF ANY KIND, either express or implied.
31  *
32  ************************************************************************/
33 #include <cmath>
34 #include <cstdint>
35 #include <iostream>
36 #include <numeric>
38 #include <stdexcept>
39 #include <vector>
40 
41 /************************************************************************
42  *
43  * Vector Operations for ARM NEON CPUs
44  *
45  ************************************************************************/
46 #if defined(__ARM_ARCH_ISA_A64) || defined(__ARM_NEON) || \
47  defined(__ARM_ARCH) || defined(__aarch64__)
48 
49 // ARM intrinsic function header
50 #include <arm_neon.h>
51 /************************************************************************
52  *
53  * Vector Operations on Vectors
54  *
55  ************************************************************************/
56 
// Element-wise addition of two integer vectors using ARM NEON intrinsics.
//
// Writes result[i] = vec1[i] + vec2[i] for every index of vec1. Full groups
// of four elements are added with one NEON instruction each; whatever is
// left over (at most three elements) is added one element at a time, so
// sizes that are not a multiple of four still benefit from the NEON path.
//
// vec1   - first operand; its size decides how many elements are processed
// vec2   - second operand; read for vec1.size() elements
// result - output; written for vec1.size() elements
//
// NOTE: no bounds checking or resizing is done here. vec2 and result must
// each already hold at least vec1.size() elements.
void gpmp::linalg::vector_add(const std::vector<int> &vec1,
                              const std::vector<int> &vec2,
                              std::vector<int> &result) {
    const size_t size = vec1.size();
    const int *data1 = vec1.data();
    const int *data2 = vec2.data();
    int *result_data = result.data();

    // Largest multiple of 4 that does not exceed size; everything below
    // this index can be handled four lanes at a time
    const size_t simd_end = size - (size % 4);

    size_t i = 0;
    for (; i < simd_end; i += 4) {
        // Load 4 elements from vec1 and vec2
        const int32x4_t a = vld1q_s32(data1 + i);
        const int32x4_t b = vld1q_s32(data2 + i);

        // Add the four lanes and store them back to the result vector
        vst1q_s32(result_data + i, vaddq_s32(a, b));
    }

    // Scalar tail for the remaining (size % 4) elements
    for (; i < size; ++i) {
        result_data[i] = data1[i] + data2[i];
    }
}
86 
// Element-wise subtraction of two integer vectors using ARM NEON intrinsics.
//
// Writes result[i] = vec1[i] - vec2[i] for every index of vec1. The main
// loop handles eight elements per iteration as two independent 4-lane
// subtractions; the remaining (size % 8) elements are subtracted one at a
// time.
//
// vec1   - minuend; its size decides how many elements are processed
// vec2   - subtrahend; read for vec1.size() elements
// result - output; written for vec1.size() elements
//
// NOTE: no bounds checking or resizing is done here. vec2 and result must
// each already hold at least vec1.size() elements.
void gpmp::linalg::vector_sub(const std::vector<int> &vec1,
                              const std::vector<int> &vec2,
                              std::vector<int> &result) {
    // size_t (not int) so that the element count and the loop indices
    // cannot be truncated for very large vectors
    const size_t size = vec1.size();
    const int *data1 = vec1.data();
    const int *data2 = vec2.data();
    int *result_data = result.data();

    // Largest multiple of 8 that does not exceed size
    const size_t simd_end = size - (size % 8);

    size_t i = 0;
    for (; i < simd_end; i += 8) {
        // Load two groups of 4 elements from each input
        const int32x4_t lhs_lo = vld1q_s32(data1 + i);
        const int32x4_t lhs_hi = vld1q_s32(data1 + i + 4);
        const int32x4_t rhs_lo = vld1q_s32(data2 + i);
        const int32x4_t rhs_hi = vld1q_s32(data2 + i + 4);

        // Subtract lane-wise and store both halves
        vst1q_s32(result_data + i, vsubq_s32(lhs_lo, rhs_lo));
        vst1q_s32(result_data + i + 4, vsubq_s32(lhs_hi, rhs_hi));
    }

    // Scalar tail for the remaining (size % 8) elements
    for (; i < size; ++i) {
        result_data[i] = data1[i] - data2[i];
    }
}
112 
// Scalar multiplication of an integer vector using ARM NEON intrinsics.
//
// Writes result[i] = vec[i] * scalar for every index of vec. The scalar is
// broadcast into all four lanes of a NEON register once; full groups of
// four elements are then multiplied lane-wise, and the leftover elements
// (fewer than four) are multiplied one at a time.
//
// vec    - input vector; its size decides how many elements are processed
// scalar - factor applied to every element
// result - output; written for vec.size() elements, no resizing is done
void gpmp::linalg::scalar_mult(const std::vector<int> &vec,
                               int scalar,
                               std::vector<int> &result) {
    const int32_t *src = vec.data();
    int32_t *dst = result.data();
    const size_t count = vec.size();

    // Number of leading elements that form complete groups of four
    const size_t simd_count = count - (count % 4);

    // Same factor in every lane
    const int32x4_t factor = vdupq_n_s32(scalar);

    size_t idx = 0;
    while (idx < simd_count) {
        // Load four elements, multiply them by the factor, store them
        const int32x4_t chunk = vld1q_s32(src + idx);
        vst1q_s32(dst + idx, vmulq_s32(chunk, factor));
        idx += 4;
    }

    // Leftover elements (the whole input when it has fewer than four)
    while (idx < count) {
        dst[idx] = src[idx] * scalar;
        ++idx;
    }
}
145 
// Dot product of two integer vectors using ARM NEON intrinsics.
//
// Returns the sum of vec1[i] * vec2[i] over every index of vec1. Full
// groups of four elements are multiplied and accumulated lane-wise in a
// NEON register; the four partial sums are then added together, and any
// leftover elements (fewer than four) are accumulated one at a time.
//
// vec1 - first operand; its size decides how many elements are processed
// vec2 - second operand; read for vec1.size() elements, no bounds check
//
// Returns the accumulated dot product as an int.
int gpmp::linalg::dot_product(const std::vector<int> &vec1,
                              const std::vector<int> &vec2) {
    const int32_t *lhs = vec1.data();
    const int32_t *rhs = vec2.data();
    const size_t count = vec1.size();

    // Number of leading elements that form complete groups of four
    const size_t simd_count = count - (count % 4);

    // Four running partial sums, one per lane
    int32x4_t acc = vdupq_n_s32(0);

    size_t idx = 0;
    while (idx < simd_count) {
        // acc += lhs[idx..idx+3] * rhs[idx..idx+3], lane by lane
        acc = vmlaq_s32(acc, vld1q_s32(lhs + idx), vld1q_s32(rhs + idx));
        idx += 4;
    }

    // Collapse the four lanes into one scalar (lane 0 first, lane 3 last)
    int total = vgetq_lane_s32(acc, 0) + vgetq_lane_s32(acc, 1) +
                vgetq_lane_s32(acc, 2) + vgetq_lane_s32(acc, 3);

    // Leftover elements (the whole input when it has fewer than four)
    while (idx < count) {
        total += lhs[idx] * rhs[idx];
        ++idx;
    }

    return total;
}
187 
188 // ARM NEON
189 #endif
int dot_product(const std::vector< int8_t > &vec1, const std::vector< int8_t > &vec2)
Computes the dot product for vectors of signed 8-bit integers.
void vector_sub(const std::vector< int8_t > &vec1, const std::vector< int8_t > &vec2, std::vector< int8_t > &result)
Performs vector subtraction for vectors of signed 8-bit integers.
void scalar_mult(const std::vector< int8_t > &vec1, int scalar, std::vector< int8_t > &result)
Performs scalar multiplication for vectors of signed 8-bit integers.
void vector_add(const std::vector< int8_t > &vec1, const std::vector< int8_t > &vec2, std::vector< int8_t > &result)
Performs vector addition for vectors of signed 8-bit integers.