openGPMP
Open Source Mathematics Package
_gpu_mtx.cpp
Go to the documentation of this file.
1 /*************************************************************************
2  *
3  * Project
4  * _____ _____ __ __ _____
5  * / ____| __ \| \/ | __ \
6  * ___ _ __ ___ _ __ | | __| |__) | \ / | |__) |
7  * / _ \| '_ \ / _ \ '_ \| | |_ | ___/| |\/| | ___/
8  *| (_) | |_) | __/ | | | |__| | | | | | | |
9  * \___/| .__/ \___|_| |_|\_____|_| |_| |_|_|
10  * | |
11  * |_|
12  *
13  * Copyright (C) Akiel Aries, <akiel@akiel.org>, et al.
14  *
15  * This software is licensed as described in the file LICENSE, which
16  * you should have received as part of this distribution. The terms
17  * among other details are referenced in the official documentation
18  * seen here : https://akielaries.github.io/openGPMP/ along with
19  * important files seen in this project.
20  *
21  * You may opt to use, copy, modify, merge, publish, distribute
22  * and/or sell copies of the Software, and permit persons to whom
23  * the Software is furnished to do so, under the terms of the
24  * LICENSE file. As this is an Open Source effort, all implementations
25  * must be of the same methodology.
26  *
27  *
28  *
29  * This software is distributed on an AS IS basis, WITHOUT
30  * WARRANTY OF ANY KIND, either express or implied.
31  *
32  ************************************************************************/
33 
38 #include "../../include/linalg/_gpu_mtx.h"
39 #include <chrono>
40 #include <iostream>
41 #include <math.h>
42 #include <random>
43 #include <stddef.h>
44 #include <stdio.h>
45 #include <stdlib.h>
46 #include <sys/time.h>
47 #include <time.h>
48 #include <vector>
49 
50 // using deprecated functions from OpenCL
51 #define CL_USE_DEPRECATED_OPENCL_1_2_APIS
52 #define CL_TARGET_OPENCL_VERSION 300
53 
54 // if system is Apple, use this header
55 #ifdef __APPLE__
56 #include <OpenCL/opencl.h>
57 #else
58 #include <CL/cl.h>
59 #endif
60 
// Not referenced in the visible part of this file.
// NOTE(review): presumably a scratch-buffer size; confirm before removing.
#define MEM_SIZE (128)

// Upper bound, in bytes, on the OpenCL kernel source read from disk
// (0x100000 = 1 MiB). Defined exactly once; the listing previously repeated
// this definition a few lines further down.
#define MAX_SOURCE_SIZE (0x100000)

// Print a banner line containing `title` to stdout.
// NOTE: the expansion already ends with ';' and is kept that way so existing
// call sites (with or without their own semicolon) keep compiling.
#define PRINT_LINE(title) printf("\n========== %s ==========\n", title);
66 
67 // Helper function to check for OpenCL errors
68 void checkError(cl_int status, const char *errorMsg) {
69  if (status != CL_SUCCESS) {
70  std::cout << errorMsg << std::endl;
71  exit(1);
72  }
73 }
74 
75 void gpu_mtx_add(const std::vector<std::vector<int>> &A,
76  const std::vector<std::vector<int>> &B,
77  std::vector<std::vector<int>> &C) {
78  // Matrix dimensions
79  const int N = A.size();
80  const int M = A[0].size();
81 
82  // Input matrices
83  std::vector<int> flat_A(N * M);
84  std::vector<int> flat_B(N * M);
85  std::vector<int> flat_C(N * M, 0);
86 
87  // Flatten input matrices
88  for (int i = 0; i < N; i++) {
89  for (int j = 0; j < M; j++) {
90  flat_A[i * M + j] = A[i][j];
91  flat_B[i * M + j] = B[i][j];
92  }
93  }
94  std::chrono::steady_clock::time_point start_time_u =
95  std::chrono::steady_clock::now();
96  // Load the kernel source code
97  FILE *file = fopen("_gpu_mtx_kernel.c", "r");
98  if (!file) {
99  std::cout << "Failed to load kernel." << std::endl;
100  return;
101  }
102  char *source_str = (char *)malloc(MAX_SOURCE_SIZE);
103  size_t source_size = fread(source_str, 1, MAX_SOURCE_SIZE, file);
104  fclose(file);
105 
106  // Get platform and device information
107  cl_platform_id platform_id = NULL;
108  cl_device_id device_id = NULL;
109  cl_uint ret_num_devices;
110  cl_uint ret_num_platforms;
111 
112  cl_int status = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
113  status |= clGetDeviceIDs(platform_id,
114  CL_DEVICE_TYPE_GPU,
115  1,
116  &device_id,
117  &ret_num_devices);
118  checkError(status, "Error getting platform and device information.");
119 
120  // Create an OpenCL context
121  cl_context context =
122  clCreateContext(NULL, 1, &device_id, NULL, NULL, &status);
123  checkError(status, "Error creating context.");
124 
125  // Create a command queue
126  cl_command_queue command_queue =
127  clCreateCommandQueue(context, device_id, 0, &status);
128  checkError(status, "Error creating command queue.");
129 
130  // Create memory buffers on the device for each matrix
131  cl_mem mem_obj_A = clCreateBuffer(context,
132  CL_MEM_READ_ONLY,
133  N * M * sizeof(int),
134  NULL,
135  &status);
136  cl_mem mem_obj_B = clCreateBuffer(context,
137  CL_MEM_READ_ONLY,
138  N * M * sizeof(int),
139  NULL,
140  &status);
141  cl_mem mem_obj_C = clCreateBuffer(context,
142  CL_MEM_WRITE_ONLY,
143  N * M * sizeof(int),
144  NULL,
145  &status);
146 
147  // Write matrices A and B to the device memory
148  status = clEnqueueWriteBuffer(command_queue,
149  mem_obj_A,
150  CL_TRUE,
151  0,
152  N * M * sizeof(int),
153  flat_A.data(),
154  0,
155  NULL,
156  NULL);
157  status |= clEnqueueWriteBuffer(command_queue,
158  mem_obj_B,
159  CL_TRUE,
160  0,
161  N * M * sizeof(int),
162  flat_B.data(),
163  0,
164  NULL,
165  NULL);
166  checkError(status, "Error writing matrices to device memory.");
167 
168  // Create a program from the kernel source code
169  cl_program program = clCreateProgramWithSource(context,
170  1,
171  (const char **)&source_str,
172  (const size_t *)&source_size,
173  &status);
174  checkError(status, "Error creating program.");
175 
176  // Build the program
177  status = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
178  checkError(status, "Error building program.");
179 
180  // Create the OpenCL kernel
181  cl_kernel kernel = clCreateKernel(program, "gpu_mtx_add", &status);
182  checkError(status, "Error creating kernel.");
183 
184  // Set the arguments of the kernel
185  status = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&mem_obj_A);
186  status |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&mem_obj_B);
187  status |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&mem_obj_C);
188  checkError(status, "Error setting kernel arguments.");
189 
190  // Define the global and local workgroup sizes
191  size_t global_work_size[2] = {N, M};
192  size_t local_work_size[2] = {1, 1};
193 
194  // Enqueue the kernel for execution
195  status = clEnqueueNDRangeKernel(command_queue,
196  kernel,
197  2,
198  NULL,
199  global_work_size,
200  local_work_size,
201  0,
202  NULL,
203  NULL);
204  checkError(status, "Error enqueueing kernel.");
205 
206  // Read the result from the device memory
207  status = clEnqueueReadBuffer(command_queue,
208  mem_obj_C,
209  CL_TRUE,
210  0,
211  N * M * sizeof(int),
212  flat_C.data(),
213  0,
214  NULL,
215  NULL);
216  checkError(status, "Error reading result from device memory.");
217  std::chrono::steady_clock::time_point end_time_u =
218  std::chrono::steady_clock::now();
219  std::cout << "Time elapsed: "
220  << std::chrono::duration_cast<std::chrono::milliseconds>(
221  end_time_u - start_time_u)
222  .count()
223  << " ms" << std::endl;
224 
225  // Populate the output matrix C
226  for (int i = 0; i < N; i++) {
227  for (int j = 0; j < M; j++) {
228  C[i][j] = flat_C[i * M + j];
229  }
230  }
231 
232  // Cleanup
233  clReleaseKernel(kernel);
234  clReleaseProgram(program);
235  clReleaseMemObject(mem_obj_A);
236  clReleaseMemObject(mem_obj_B);
237  clReleaseMemObject(mem_obj_C);
238  clReleaseCommandQueue(command_queue);
239  clReleaseContext(context);
240  free(source_str);
241 }
242 
243 int main() {
244  int matrixSize = 8192;
245  // std::vector<std::vector<int>> A = {{1, 2, 3}, {4, 5, 6}, {7, 8, 9}};
246  // std::vector<std::vector<int>> B = {{9, 8, 7}, {6, 5, 4}, {3, 2, 1}};
247  // std::vector<std::vector<int>> C(A.size(), std::vector<int>(A[0].size()));
248  std::vector<std::vector<int>> A(matrixSize, std::vector<int>(matrixSize));
249  std::vector<std::vector<int>> B(matrixSize, std::vector<int>(matrixSize));
250  std::vector<std::vector<int>> C(matrixSize, std::vector<int>(matrixSize));
251 
252  // Initialize random number generator
253  std::random_device rd;
254  std::mt19937 gen(rd());
255  std::uniform_int_distribution<int> distribution(1, 100);
256 
257  // Populate matrices A and B with random values
258  for (int i = 0; i < matrixSize; ++i) {
259  for (int j = 0; j < matrixSize; ++j) {
260  A[i][j] = distribution(gen);
261  B[i][j] = distribution(gen);
262  }
263  }
264 
265  std::chrono::steady_clock::time_point start_time_u =
266  std::chrono::steady_clock::now();
267 
268  gpu_mtx_add(A, B, C);
269  std::chrono::steady_clock::time_point end_time_u =
270  std::chrono::steady_clock::now();
271 
272  // Print the result matrix
273  /*
274  std::cout << "Matrix A:" << std::endl;
275  for (int i = 0; i < matrixSize; ++i) {
276  for (int j = 0; j < matrixSize; ++j) {
277  std::cout << A[i][j] << " ";
278  }
279  std::cout << std::endl;
280  }
281 
282  std::cout << "Matrix B:" << std::endl;
283  for (int i = 0; i < matrixSize; ++i) {
284  for (int j = 0; j < matrixSize; ++j) {
285  std::cout << B[i][j] << " ";
286  }
287  std::cout << std::endl;
288  }
289 
290  std::cout << "Result matrix:\n";
291  for (const auto &row : C) {
292  for (int element : row) {
293  std::cout << element << " ";
294  }
295  std::cout << "\n";
296  }*/
297 
298  std::cout << "Time elapsed: "
299  << std::chrono::duration_cast<std::chrono::milliseconds>(
300  end_time_u - start_time_u)
301  .count()
302  << " ms" << std::endl;
303 
304  return 0;
305 }
void gpu_mtx_add(const std::vector< std::vector< int >> &A, const std::vector< std::vector< int >> &B, std::vector< std::vector< int >> &C)
Definition: _gpu_mtx.cpp:75
#define MAX_SOURCE_SIZE
Definition: _gpu_mtx.cpp:65
int main()
Definition: _gpu_mtx.cpp:243
void checkError(cl_int status, const char *errorMsg)
Definition: _gpu_mtx.cpp:68
const int N
list C
Definition: linalg.py:24
list A
Definition: linalg.py:22
list B
Definition: linalg.py:23
constexpr int matrixSize
Definition: mtx.cpp:6