#if defined(__ARM_ARCH_ISA_A64) || defined(__ARM_NEON) ||                     \
    defined(__ARM_ARCH) || defined(__aarch64__)

// matrix addition for single-precision floats using ARM NEON intrinsics
void gpmp::linalg::Mtx::mtx_add(const std::vector<std::vector<float>> &A,
                                const std::vector<std::vector<float>> &B,
                                std::vector<std::vector<float>> &C) {
    const int rows = A.size();
    const int cols = A[0].size();

    for (int i = 0; i < rows; ++i) {
        int j = 0;
        // process 4 columns at a time with 128-bit NEON registers
        for (; j < cols - 3; j += 4) {
            // load 4 floats from A, B, and C
            float32x4_t a = vld1q_f32(&A[i][j]);
            float32x4_t b = vld1q_f32(&B[i][j]);
            float32x4_t c = vld1q_f32(&C[i][j]);

            // perform the vectorized addition
            c = vaddq_f32(a, b);

            // store the result back into C
            vst1q_f32(&C[i][j], c);
        }

        // handle any remaining columns that do not fill a full vector
        for (; j < cols; ++j) {
            C[i][j] = A[i][j] + B[i][j];
        }
    }
}
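
// Example usage (illustrative sketch, not part of this translation unit):
// mtx_add is defined above as a member of gpmp::linalg::Mtx, so it is assumed
// here to be callable on an Mtx instance with pre-sized output storage.
//
//   gpmp::linalg::Mtx mtx;
//   std::vector<std::vector<float>> A = {{1.0f, 2.0f}, {3.0f, 4.0f}};
//   std::vector<std::vector<float>> B = {{5.0f, 6.0f}, {7.0f, 8.0f}};
//   std::vector<std::vector<float>> C(2, std::vector<float>(2, 0.0f));
//   mtx.mtx_add(A, B, C); // C == {{6, 8}, {10, 12}}
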
// matrix subtraction for single-precision floats using ARM NEON intrinsics
void gpmp::linalg::Mtx::mtx_sub(const std::vector<std::vector<float>> &A,
                                const std::vector<std::vector<float>> &B,
                                std::vector<std::vector<float>> &C) {
    const int rows = A.size();
    const int cols = A[0].size();

    for (int i = 0; i < rows; ++i) {
        int j = 0;
        // process 4 columns at a time with 128-bit NEON registers
        for (; j < cols - 3; j += 4) {
            // load 4 floats from A, B, and C
            float32x4_t a = vld1q_f32(&A[i][j]);
            float32x4_t b = vld1q_f32(&B[i][j]);
            float32x4_t c = vld1q_f32(&C[i][j]);

            // perform the vectorized subtraction
            c = vsubq_f32(a, b);

            // store the result back into C
            vst1q_f32(&C[i][j], c);
        }

        // handle any remaining columns that do not fill a full vector
        for (; j < cols; ++j) {
            C[i][j] = A[i][j] - B[i][j];
        }
    }
}

// in-place matrix transpose for doubles using ARM NEON intrinsics,
// operating on 2x2 blocks on and above the diagonal
void gpmp::linalg::Mtx::mtx_tpose(std::vector<std::vector<double>> &matrix) {
    const int rows = matrix.size();
    const int cols = matrix[0].size();

    for (int i = 0; i < rows; i += 2) {
        for (int j = i; j < cols; j += 2) {
            // de-interleaving loads of four doubles from rows i and i + 1
            float64x2x2_t row1 = vld2q_f64(&matrix[i][j]);
            float64x2x2_t row2 = vld2q_f64(&matrix[i + 1][j]);

            // combine the low lanes of the two rows to build the
            // transposed 2x2 block
            float64x2x2_t transposed;
            transposed.val[0] = vcombine_f64(vget_low_f64(row1.val[0]),
                                             vget_low_f64(row2.val[0]));
            transposed.val[1] = vcombine_f64(vget_low_f64(row1.val[1]),
                                             vget_low_f64(row2.val[1]));

            // interleaving store of the transposed block back into row i
            vst2q_f64(&matrix[i][j], transposed);