openGPMP
Open Source Mathematics Package
vector_arm_f64.cpp
Go to the documentation of this file.
1 /*************************************************************************
2  *
3  * Project
4  * _____ _____ __ __ _____
5  * / ____| __ \| \/ | __ \
6  * ___ _ __ ___ _ __ | | __| |__) | \ / | |__) |
7  * / _ \| '_ \ / _ \ '_ \| | |_ | ___/| |\/| | ___/
8  *| (_) | |_) | __/ | | | |__| | | | | | | |
9  * \___/| .__/ \___|_| |_|\_____|_| |_| |_|_|
10  * | |
11  * |_|
12  *
13  * Copyright (C) Akiel Aries, <akiel@akiel.org>, et al.
14  *
15  * This software is licensed as described in the file LICENSE, which
16  * you should have received as part of this distribution. The terms
17  * among other details are referenced in the official documentation
18  * seen here : https://akielaries.github.io/openGPMP/ along with
19  * important files seen in this project.
20  *
21  * You may opt to use, copy, modify, merge, publish, distribute
22  * and/or sell copies of the Software, and permit persons to whom
23  * the Software is furnished to do so, under the terms of the
24  * LICENSE file. As this is an Open Source effort, all implementations
25  * must be of the same methodology.
26  *
27  *
28  *
29  * This software is distributed on an AS IS basis, WITHOUT
30  * WARRANTY OF ANY KIND, either express or implied.
31  *
32  ************************************************************************/
33 #include <cmath>
34 #include <cstdint>
35 #include <iostream>
36 #include <numeric>
38 #include <stdexcept>
39 #include <vector>
40 
41 /************************************************************************
42  *
43  * Vector Operations for ARM NEON CPUs
44  *
45  ************************************************************************/
46 #if defined(__ARM_ARCH_ISA_A64) || defined(__ARM_NEON) || \
47  defined(__ARM_ARCH) || defined(__aarch64__)
48 
49 // ARM intrinsic function header
50 #include <arm_neon.h>
51 /************************************************************************
52  *
53  * Vector Operations on Vectors
54  *
55  ************************************************************************/
56 
/**
 * @brief Element-wise addition of two double-precision vectors using ARM
 *        NEON intrinsics.
 *
 * Computes result[i] = vec1[i] + vec2[i] for every i in [0, vec1.size()).
 * Elements are processed two at a time in 128-bit registers; a trailing
 * odd element (if any) is handled by a scalar step, so odd-sized inputs
 * still take the vectorised path for all complete pairs.
 *
 * @param vec1 first operand; its size determines how many elements are added
 * @param vec2 second operand; must hold at least vec1.size() elements
 * @param result destination; grown to vec1.size() when it is smaller,
 *        elements past vec1.size() are left untouched
 * @throws std::invalid_argument if vec2 is shorter than vec1
 */
void gpmp::linalg::vector_add(const std::vector<double> &vec1,
                              const std::vector<double> &vec2,
                              std::vector<double> &result) {
    const size_t size = vec1.size();

    // Reject inputs that would make the loads below read past the end of vec2
    if (vec2.size() < size) {
        throw std::invalid_argument(
            "vector_add: vec2 must have at least as many elements as vec1");
    }
    // Make sure every store below lands inside the destination buffer
    if (result.size() < size) {
        result.resize(size);
    }

    // Pointers are taken after the potential resize so they stay valid
    const double *data1 = vec1.data();
    const double *data2 = vec2.data();
    double *result_data = result.data();

    size_t i = 0;

    // Vectorised part: two doubles per iteration
    for (; i + 2 <= size; i += 2) {
        const float64x2_t a = vld1q_f64(data1 + i);
        const float64x2_t b = vld1q_f64(data2 + i);
        vst1q_f64(result_data + i, vaddq_f64(a, b));
    }

    // Scalar tail: at most one element remains when size is odd
    for (; i < size; ++i) {
        result_data[i] = data1[i] + data2[i];
    }
}
86 
/**
 * @brief Element-wise subtraction of two double-precision vectors using
 *        ARM NEON intrinsics.
 *
 * Computes result[i] = vec1[i] - vec2[i] for every i in [0, vec1.size()).
 * The main loop is unrolled to handle four doubles (two 128-bit registers)
 * per iteration; the remaining zero to three elements are handled by a
 * scalar loop.
 *
 * @param vec1 minuend; its size determines how many elements are processed
 * @param vec2 subtrahend; must hold at least vec1.size() elements
 * @param result destination; grown to vec1.size() when it is smaller,
 *        elements past vec1.size() are left untouched
 * @throws std::invalid_argument if vec2 is shorter than vec1
 */
void gpmp::linalg::vector_sub(const std::vector<double> &vec1,
                              const std::vector<double> &vec2,
                              std::vector<double> &result) {
    // size_t (not int) so very large vectors are not truncated
    const size_t size = vec1.size();

    // Reject inputs that would make the loads below read past the end of vec2
    if (vec2.size() < size) {
        throw std::invalid_argument(
            "vector_sub: vec2 must have at least as many elements as vec1");
    }
    // Make sure every store below lands inside the destination buffer
    if (result.size() < size) {
        result.resize(size);
    }

    // Pointers are taken after the potential resize so they stay valid
    const double *data1 = vec1.data();
    const double *data2 = vec2.data();
    double *result_data = result.data();

    // Largest multiple of 4 that does not exceed size
    const size_t aligned_size = size - (size % 4);

    size_t i = 0;

    // Vectorised part: two registers (four doubles) per iteration
    for (; i < aligned_size; i += 4) {
        const float64x2_t lhs_lo = vld1q_f64(data1 + i);
        const float64x2_t lhs_hi = vld1q_f64(data1 + i + 2);
        const float64x2_t rhs_lo = vld1q_f64(data2 + i);
        const float64x2_t rhs_hi = vld1q_f64(data2 + i + 2);

        vst1q_f64(result_data + i, vsubq_f64(lhs_lo, rhs_lo));
        vst1q_f64(result_data + i + 2, vsubq_f64(lhs_hi, rhs_hi));
    }

    // Scalar tail for the remaining (size % 4) elements
    for (; i < size; ++i) {
        result_data[i] = data1[i] - data2[i];
    }
}
112 
/**
 * @brief Multiplies every element of a double-precision vector by a scalar
 *        using ARM NEON intrinsics.
 *
 * Computes result[i] = vec[i] * scalar for every i in [0, vec.size()).
 * Elements are processed two at a time in 128-bit registers; a trailing
 * odd element (if any) is handled by a scalar step.
 *
 * @param vec input vector
 * @param scalar factor applied to each element
 * @param result destination; grown to vec.size() when it is smaller,
 *        elements past vec.size() are left untouched
 */
void gpmp::linalg::scalar_mult(const std::vector<double> &vec,
                               double scalar,
                               std::vector<double> &result) {
    const size_t size = vec.size();

    // Make sure every store below lands inside the destination buffer
    if (result.size() < size) {
        result.resize(size);
    }

    // Pointers are taken after the potential resize so they stay valid
    const double *data = vec.data();
    double *result_data = result.data();

    // Broadcast the scalar into both lanes once, outside the loop
    const float64x2_t scalar_vec = vdupq_n_f64(scalar);

    size_t i = 0;

    // Vectorised part: two doubles per iteration. Written as `i + 2 <= size`
    // so that empty and single-element inputs need no separate branch.
    for (; i + 2 <= size; i += 2) {
        const float64x2_t a = vld1q_f64(data + i);
        vst1q_f64(result_data + i, vmulq_f64(a, scalar_vec));
    }

    // Scalar tail: at most one element remains when size is odd
    for (; i < size; ++i) {
        result_data[i] = data[i] * scalar;
    }
}
145 
/**
 * @brief Dot product of two double-precision vectors using ARM NEON
 *        intrinsics.
 *
 * Sums vec1[i] * vec2[i] over i in [0, vec1.size()). Products of
 * even-indexed and odd-indexed elements are accumulated in separate SIMD
 * lanes, the two lanes are then added together, and a trailing odd element
 * (if any) is added last.
 *
 * @param vec1 first operand; its size determines how many terms are summed
 * @param vec2 second operand; must hold at least vec1.size() elements
 * @return the accumulated dot product (0.0 for empty input)
 * @throws std::invalid_argument if vec2 is shorter than vec1
 */
double gpmp::linalg::dot_product(const std::vector<double> &vec1,
                                 const std::vector<double> &vec2) {
    const size_t size = vec1.size();

    // Reject inputs that would make the loads below read past the end of vec2
    if (vec2.size() < size) {
        throw std::invalid_argument(
            "dot_product: vec2 must have at least as many elements as vec1");
    }

    const double *data1 = vec1.data();
    const double *data2 = vec2.data();

    // Two running partial sums, one per lane
    float64x2_t sum_vec = vdupq_n_f64(0.0);

    size_t i = 0;

    // Vectorised part: two doubles per iteration. Written as `i + 2 <= size`
    // so that empty and single-element inputs need no separate branch.
    for (; i + 2 <= size; i += 2) {
        const float64x2_t a = vld1q_f64(data1 + i);
        const float64x2_t b = vld1q_f64(data2 + i);

        // Separate multiply and add (no fused multiply-add intrinsic)
        sum_vec = vaddq_f64(sum_vec, vmulq_f64(a, b));
    }

    // Horizontal reduction: spill both lanes and add them
    double lanes[2];
    vst1q_f64(lanes, sum_vec);
    double result = lanes[0] + lanes[1];

    // Scalar tail: at most one element remains when size is odd
    for (; i < size; ++i) {
        result += data1[i] * data2[i];
    }

    return result;
}
186 
187 // ARM NEON
188 #endif
int dot_product(const std::vector< int8_t > &vec1, const std::vector< int8_t > &vec2)
Computes the dot product for vectors of signed 8-bit integers.
void vector_sub(const std::vector< int8_t > &vec1, const std::vector< int8_t > &vec2, std::vector< int8_t > &result)
Performs vector subtraction for vectors of signed 8-bit integers.
void scalar_mult(const std::vector< int8_t > &vec1, int scalar, std::vector< int8_t > &result)
Performs scalar multiplication for vectors of signed 8-bit integers.
void vector_add(const std::vector< int8_t > &vec1, const std::vector< int8_t > &vec2, std::vector< int8_t > &result)
Performs vector addition for vectors of signed 8-bit integers.