openGPMP
Open Source Mathematics Package
Macros | Functions
_gpu_mtx.cpp File Reference

OpenCL-based GPU acceleration. More...

#include "../../include/linalg/_gpu_mtx.h"
#include <chrono>
#include <iostream>
#include <math.h>
#include <random>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <time.h>
#include <vector>
#include <CL/cl.h>

Go to the source code of this file.

Macros

#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
 
#define CL_TARGET_OPENCL_VERSION   300
 
#define MEM_SIZE   (128)
 
#define MAX_SOURCE_SIZE   (0x100000)
 
#define PRINT_LINE(title)   printf("\n========== %s ==========\n", title);
 
#define MAX_SOURCE_SIZE   (0x100000)
 

Functions

void checkError (cl_int status, const char *errorMsg)
 
void gpu_mtx_add (const std::vector< std::vector< int >> &A, const std::vector< std::vector< int >> &B, std::vector< std::vector< int >> &C)
 
int main ()
 

Detailed Description

OpenCL-based GPU acceleration.

Definition in file _gpu_mtx.cpp.

Macro Definition Documentation

◆ CL_TARGET_OPENCL_VERSION

#define CL_TARGET_OPENCL_VERSION   300

Definition at line 52 of file _gpu_mtx.cpp.

◆ CL_USE_DEPRECATED_OPENCL_1_2_APIS

#define CL_USE_DEPRECATED_OPENCL_1_2_APIS

Definition at line 51 of file _gpu_mtx.cpp.

◆ MAX_SOURCE_SIZE [1/2]

#define MAX_SOURCE_SIZE   (0x100000)

Definition at line 65 of file _gpu_mtx.cpp.

◆ MAX_SOURCE_SIZE [2/2]

#define MAX_SOURCE_SIZE   (0x100000)

Definition at line 65 of file _gpu_mtx.cpp. (Note: MAX_SOURCE_SIZE is defined twice in the source with identical values; the duplicate definition is redundant and can be removed.)

◆ MEM_SIZE

#define MEM_SIZE   (128)

Definition at line 61 of file _gpu_mtx.cpp.

◆ PRINT_LINE

#define PRINT_LINE (   title)    printf("\n========== %s ==========\n", title);

Definition at line 63 of file _gpu_mtx.cpp.

Function Documentation

◆ checkError()

void checkError ( cl_int  status,
const char *  errorMsg 
)

Definition at line 68 of file _gpu_mtx.cpp.

68  {
69  if (status != CL_SUCCESS) {
70  std::cout << errorMsg << std::endl;
71  exit(1);
72  }
73 }

Referenced by gpu_mtx_add().

◆ gpu_mtx_add()

void gpu_mtx_add ( const std::vector< std::vector< int >> &  A,
const std::vector< std::vector< int >> &  B,
std::vector< std::vector< int >> &  C 
)

Definition at line 75 of file _gpu_mtx.cpp.

77  {
78  // Matrix dimensions
79  const int N = A.size();
80  const int M = A[0].size();
81 
82  // Input matrices
83  std::vector<int> flat_A(N * M);
84  std::vector<int> flat_B(N * M);
85  std::vector<int> flat_C(N * M, 0);
86 
87  // Flatten input matrices
88  for (int i = 0; i < N; i++) {
89  for (int j = 0; j < M; j++) {
90  flat_A[i * M + j] = A[i][j];
91  flat_B[i * M + j] = B[i][j];
92  }
93  }
94  std::chrono::steady_clock::time_point start_time_u =
95  std::chrono::steady_clock::now();
96  // Load the kernel source code
97  FILE *file = fopen("_gpu_mtx_kernel.c", "r");
98  if (!file) {
99  std::cout << "Failed to load kernel." << std::endl;
100  return;
101  }
102  char *source_str = (char *)malloc(MAX_SOURCE_SIZE);
103  size_t source_size = fread(source_str, 1, MAX_SOURCE_SIZE, file);
104  fclose(file);
105 
106  // Get platform and device information
107  cl_platform_id platform_id = NULL;
108  cl_device_id device_id = NULL;
109  cl_uint ret_num_devices;
110  cl_uint ret_num_platforms;
111 
112  cl_int status = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
113  status |= clGetDeviceIDs(platform_id,
114  CL_DEVICE_TYPE_GPU,
115  1,
116  &device_id,
117  &ret_num_devices);
118  checkError(status, "Error getting platform and device information.");
119 
120  // Create an OpenCL context
121  cl_context context =
122  clCreateContext(NULL, 1, &device_id, NULL, NULL, &status);
123  checkError(status, "Error creating context.");
124 
125  // Create a command queue
126  cl_command_queue command_queue =
127  clCreateCommandQueue(context, device_id, 0, &status);
128  checkError(status, "Error creating command queue.");
129 
130  // Create memory buffers on the device for each matrix
131  cl_mem mem_obj_A = clCreateBuffer(context,
132  CL_MEM_READ_ONLY,
133  N * M * sizeof(int),
134  NULL,
135  &status);
136  cl_mem mem_obj_B = clCreateBuffer(context,
137  CL_MEM_READ_ONLY,
138  N * M * sizeof(int),
139  NULL,
140  &status);
141  cl_mem mem_obj_C = clCreateBuffer(context,
142  CL_MEM_WRITE_ONLY,
143  N * M * sizeof(int),
144  NULL,
145  &status);
146 
147  // Write matrices A and B to the device memory
148  status = clEnqueueWriteBuffer(command_queue,
149  mem_obj_A,
150  CL_TRUE,
151  0,
152  N * M * sizeof(int),
153  flat_A.data(),
154  0,
155  NULL,
156  NULL);
157  status |= clEnqueueWriteBuffer(command_queue,
158  mem_obj_B,
159  CL_TRUE,
160  0,
161  N * M * sizeof(int),
162  flat_B.data(),
163  0,
164  NULL,
165  NULL);
166  checkError(status, "Error writing matrices to device memory.");
167 
168  // Create a program from the kernel source code
169  cl_program program = clCreateProgramWithSource(context,
170  1,
171  (const char **)&source_str,
172  (const size_t *)&source_size,
173  &status);
174  checkError(status, "Error creating program.");
175 
176  // Build the program
177  status = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
178  checkError(status, "Error building program.");
179 
180  // Create the OpenCL kernel
181  cl_kernel kernel = clCreateKernel(program, "gpu_mtx_add", &status);
182  checkError(status, "Error creating kernel.");
183 
184  // Set the arguments of the kernel
185  status = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&mem_obj_A);
186  status |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&mem_obj_B);
187  status |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&mem_obj_C);
188  checkError(status, "Error setting kernel arguments.");
189 
190  // Define the global and local workgroup sizes
191  size_t global_work_size[2] = {N, M};
192  size_t local_work_size[2] = {1, 1};
193 
194  // Enqueue the kernel for execution
195  status = clEnqueueNDRangeKernel(command_queue,
196  kernel,
197  2,
198  NULL,
199  global_work_size,
200  local_work_size,
201  0,
202  NULL,
203  NULL);
204  checkError(status, "Error enqueueing kernel.");
205 
206  // Read the result from the device memory
207  status = clEnqueueReadBuffer(command_queue,
208  mem_obj_C,
209  CL_TRUE,
210  0,
211  N * M * sizeof(int),
212  flat_C.data(),
213  0,
214  NULL,
215  NULL);
216  checkError(status, "Error reading result from device memory.");
217  std::chrono::steady_clock::time_point end_time_u =
218  std::chrono::steady_clock::now();
219  std::cout << "Time elapsed: "
220  << std::chrono::duration_cast<std::chrono::milliseconds>(
221  end_time_u - start_time_u)
222  .count()
223  << " ms" << std::endl;
224 
225  // Populate the output matrix C
226  for (int i = 0; i < N; i++) {
227  for (int j = 0; j < M; j++) {
228  C[i][j] = flat_C[i * M + j];
229  }
230  }
231 
232  // Cleanup
233  clReleaseKernel(kernel);
234  clReleaseProgram(program);
235  clReleaseMemObject(mem_obj_A);
236  clReleaseMemObject(mem_obj_B);
237  clReleaseMemObject(mem_obj_C);
238  clReleaseCommandQueue(command_queue);
239  clReleaseContext(context);
240  free(source_str);
241 }
#define MAX_SOURCE_SIZE
Definition: _gpu_mtx.cpp:65
void checkError(cl_int status, const char *errorMsg)
Definition: _gpu_mtx.cpp:68
const int N
list C
Definition: linalg.py:24
list A
Definition: linalg.py:22
list B
Definition: linalg.py:23

References python.linalg::A, python.linalg::B, python.linalg::C, checkError(), MAX_SOURCE_SIZE, and N.

Referenced by main().

◆ main()

int main ( void  )

Definition at line 243 of file _gpu_mtx.cpp.

243  {
244  int matrixSize = 8192;
245  // std::vector<std::vector<int>> A = {{1, 2, 3}, {4, 5, 6}, {7, 8, 9}};
246  // std::vector<std::vector<int>> B = {{9, 8, 7}, {6, 5, 4}, {3, 2, 1}};
247  // std::vector<std::vector<int>> C(A.size(), std::vector<int>(A[0].size()));
248  std::vector<std::vector<int>> A(matrixSize, std::vector<int>(matrixSize));
249  std::vector<std::vector<int>> B(matrixSize, std::vector<int>(matrixSize));
250  std::vector<std::vector<int>> C(matrixSize, std::vector<int>(matrixSize));
251 
252  // Initialize random number generator
253  std::random_device rd;
254  std::mt19937 gen(rd());
255  std::uniform_int_distribution<int> distribution(1, 100);
256 
257  // Populate matrices A and B with random values
258  for (int i = 0; i < matrixSize; ++i) {
259  for (int j = 0; j < matrixSize; ++j) {
260  A[i][j] = distribution(gen);
261  B[i][j] = distribution(gen);
262  }
263  }
264 
265  std::chrono::steady_clock::time_point start_time_u =
266  std::chrono::steady_clock::now();
267 
268  gpu_mtx_add(A, B, C);
269  std::chrono::steady_clock::time_point end_time_u =
270  std::chrono::steady_clock::now();
271 
272  // Print the result matrix
273  /*
274  std::cout << "Matrix A:" << std::endl;
275  for (int i = 0; i < matrixSize; ++i) {
276  for (int j = 0; j < matrixSize; ++j) {
277  std::cout << A[i][j] << " ";
278  }
279  std::cout << std::endl;
280  }
281 
282  std::cout << "Matrix B:" << std::endl;
283  for (int i = 0; i < matrixSize; ++i) {
284  for (int j = 0; j < matrixSize; ++j) {
285  std::cout << B[i][j] << " ";
286  }
287  std::cout << std::endl;
288  }
289 
290  std::cout << "Result matrix:\n";
291  for (const auto &row : C) {
292  for (int element : row) {
293  std::cout << element << " ";
294  }
295  std::cout << "\n";
296  }*/
297 
298  std::cout << "Time elapsed: "
299  << std::chrono::duration_cast<std::chrono::milliseconds>(
300  end_time_u - start_time_u)
301  .count()
302  << " ms" << std::endl;
303 
304  return 0;
305 }
void gpu_mtx_add(const std::vector< std::vector< int >> &A, const std::vector< std::vector< int >> &B, std::vector< std::vector< int >> &C)
Definition: _gpu_mtx.cpp:75
constexpr int matrixSize
Definition: mtx.cpp:6

References python.linalg::A, python.linalg::B, python.linalg::C, gpu_mtx_add(), and matrixSize.