openGPMP
Open Source Mathematics Package
_gpu_mtx_add.c
Go to the documentation of this file.
1 /*************************************************************************
2  *
3  * Project
4  * _____ _____ __ __ _____
5  * / ____| __ \| \/ | __ \
6  * ___ _ __ ___ _ __ | | __| |__) | \ / | |__) |
7  * / _ \| '_ \ / _ \ '_ \| | |_ | ___/| |\/| | ___/
8  *| (_) | |_) | __/ | | | |__| | | | | | | |
9  * \___/| .__/ \___|_| |_|\_____|_| |_| |_|_|
10  * | |
11  * |_|
12  *
13  * Copyright (C) Akiel Aries, <akiel@akiel.org>, et al.
14  *
15  * This software is licensed as described in the file LICENSE, which
16  * you should have received as part of this distribution. The terms
17  * among other details are referenced in the official documentation
18  * seen here : https://akielaries.github.io/openGPMP/ along with
19  * important files seen in this project.
20  *
21  * You may opt to use, copy, modify, merge, publish, distribute
22  * and/or sell copies of the Software, and permit persons to whom
23  * the Software is furnished to do so, under the terms of the
24  * LICENSE file. As this is an Open Source effort, all implementations
25  * must be of the same methodology.
26  *
27  *
28  *
29  * This software is distributed on an AS IS basis, WITHOUT
30  * WARRANTY OF ANY KIND, either express or implied.
31  *
32  ************************************************************************/
33 
34 #include "../../include/linalg/_gpu_mtx.h"
35 #include <math.h>
36 #include <stddef.h>
37 #include <stdio.h>
38 #include <stdlib.h>
39 #include <sys/time.h>
40 #include <time.h>
41 
42 /* gpu */
43 #define CL_USE_DEPRECATED_OPENCL_1_2_APIS
44 #define CL_TARGET_OPENCL_VERSION 300
45 
46 #ifdef __APPLE__
47 #include <OpenCL/opencl.h>
48 #else
49 #include <CL/cl.h>
50 #endif
51 
/* Size, in bytes, for small fixed scratch buffers. */
#define MEM_SIZE (128)
/* Upper bound, in bytes, on how much kernel source text is read from disk. */
#define MAX_SOURCE_SIZE (0x100000)
/*
 * Print a banner line of the form "========== title ==========" to stdout,
 * preceded and followed by a newline.
 *
 * The expansion is a single expression with no trailing semicolon, so the
 * macro behaves like an ordinary function call: `PRINT_LINE("x");` is one
 * statement and it is safe as the body of an unbraced if/else.
 */
#define PRINT_LINE(title) printf("\n========== %s ==========\n", (title))
55 
/**
 * Fill an integer vector with a single constant value.
 *
 * @param vec          destination buffer; indices [0, len) are overwritten
 * @param len          number of elements to write (nothing is written if <= 0)
 * @param set_one_flag non-zero fills with 1, zero fills with 0
 */
void init_vec(int *vec, int len, int set_one_flag) {
    /* The fill value is the same for every element, so pick it once. */
    const int fill = set_one_flag ? 1 : 0;
    int idx = 0;

    while (idx < len) {
        vec[idx++] = fill;
    }
}
64 
/**
 * Fill an integer vector with pseudo-random values, each either 0 or 1.
 *
 * The C library generator is seeded from the wall clock the first time this
 * function runs. It is deliberately NOT reseeded on later calls: time() has
 * one-second resolution, so reseeding per call made every call within the
 * same second restart the identical sequence and return identical vectors.
 *
 * @param vec destination buffer; indices [0, len) are overwritten
 * @param len number of elements to write (nothing is written if <= 0)
 */
void rand_vec(int *vec, int len) {
    static int seeded = 0;

    if (!seeded) {
        srand((unsigned)time(NULL));
        seeded = 1;
    }

    for (int i = 0; i < len; i++) {
        vec[i] = rand() % 2;
    }
}
71 
/**
 * Element-wise integer vector addition on the host: res[i] = a[i] + b[i].
 *
 * Elements are processed in ascending index order.
 *
 * @param a   first addend, read at indices [0, len)
 * @param b   second addend, read at indices [0, len)
 * @param res destination, written at indices [0, len)
 * @param len number of elements to process (nothing is done if <= 0)
 */
void add_vec_cpu(const int *a, const int *b, int *res, const int len) {
    int idx = 0;

    while (idx < len) {
        const int sum = a[idx] + b[idx];
        res[idx] = sum;
        ++idx;
    }
}
77 
/**
 * Write a vector to stdout on one line: each element in decimal followed by
 * a single space, then a terminating newline.
 *
 * @param vec elements to print, read at indices [0, len)
 * @param len number of elements to print (only the newline is printed if <= 0)
 */
void print_vec(int *vec, int len) {
    int idx = 0;

    while (idx < len) {
        printf("%d ", vec[idx]);
        ++idx;
    }
    printf("\n");
}
84 
/**
 * Compare two vectors element by element and print how many positions hold
 * equal values, both as "matches / len" and as a fraction with two decimals.
 *
 * @param v1  first vector, read at indices [0, len)
 * @param v2  second vector, read at indices [0, len)
 * @param len number of elements to compare
 */
void check_result(int *v1, int *v2, int len) {
    int matches = 0;
    int idx = 0;

    while (idx < len) {
        /* A comparison yields 0 or 1, so it can be summed directly. */
        matches += (v1[idx] == v2[idx]);
        ++idx;
    }

    const float rate = (float)matches / len;
    printf("correct rate: %d / %d , %1.2f\n", matches, len, rate);
}
97 
98 void accl_mtx_exec() {
99  struct timeval start, finish;
100  double duration;
101  srand((unsigned)time(NULL));
102  clock_t startTime, endTime;
103  double totalTime;
104 
105  /* generate vector a and b */
106  int len = 64;
107  int *a, *b, *c, *c_d;
108  a = (int *)malloc(len * sizeof(int));
109  b = (int *)malloc(len * sizeof(int));
110  c = (int *)malloc(len * sizeof(int));
111  c_d = (int *)malloc(len * sizeof(int));
112  size_t data_size = len * sizeof(int);
113 
114  PRINT_LINE("INIT VALUE");
115  /* vector addition, cpu version */
116  printf("a: ");
117  init_vec(a, len, 1);
118  print_vec(a, len);
119 
120  printf("b: ");
121  rand_vec(b, len);
122  print_vec(b, len);
123 
124  printf("c: ");
125  init_vec(c, len, 0);
126 
127  startTime = clock();
128  add_vec_cpu(a, b, c, len);
129  endTime = clock();
130  // calculate difference to get total
131  totalTime = (double)(endTime - startTime) / CLOCKS_PER_SEC;
132  print_vec(c, len);
133  printf("CPU: %f\n", totalTime);
134 
135  /* vector addition, gpu version */
136  cl_mem a_buff, b_buff, c_buff;
137  a_buff = b_buff = c_buff = NULL;
138 
139  cl_platform_id platform_id = NULL;
140  cl_uint ret_num_platforms;
141 
142  cl_device_id device_id = NULL;
143  cl_uint ret_num_devices;
144 
145  cl_context context = NULL;
146  cl_kernel kernel = NULL;
147  cl_program program = NULL;
148 
149  cl_command_queue command_queue = NULL;
150  cl_int ret;
151 
152  /* Load the source code containing the kernel */
153  char string[MEM_SIZE];
154  FILE *fp;
155  char fileName[] = "./_gpu_kernel_mtx_add.c";
156  char *source_str;
157  size_t source_size;
158 
159  fp = fopen(fileName, "r");
160  if (!fp) {
161 
162  fprintf(stderr, "Failed to load kernel.\n");
163  exit(1);
164  }
165  source_str = (char *)malloc(MAX_SOURCE_SIZE);
166  source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
167  fclose(fp);
168 
169  // Platform
170  ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
171  if (ret != CL_SUCCESS) {
172  printf("Failed to get platform ID.\n");
173  goto error;
174  }
175  // Device
176  ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
177  if (ret != CL_SUCCESS) {
178  printf("Failed to get device ID.\n");
179  goto error;
180  }
181  // Context
182  context = clCreateContext(NULL, 1, &device_id, NULL, NULL, NULL); //&ret);
183  if (ret != CL_SUCCESS) {
184  printf("Failed to create OpenCL context.\n");
185  goto error;
186  }
187  command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
188  if (ret != CL_SUCCESS) {
189  printf("Failed to create command queue %d\n", (int)ret);
190  goto error;
191  }
192  // Memory Buffer
193  a_buff = clCreateBuffer(context, CL_MEM_READ_ONLY, data_size, NULL, &ret);
194  b_buff = clCreateBuffer(context, CL_MEM_READ_ONLY, data_size, NULL, &ret);
195  c_buff = clCreateBuffer(context, CL_MEM_WRITE_ONLY, data_size, NULL, &ret);
196 
197  ret = clEnqueueWriteBuffer(command_queue,
198  a_buff,
199  CL_TRUE,
200  0,
201  data_size,
202  (void *)a,
203  0,
204  NULL,
205  NULL);
206  ret |= clEnqueueWriteBuffer(command_queue,
207  b_buff,
208  CL_TRUE,
209  0,
210  data_size,
211  (void *)b,
212  0,
213  NULL,
214  NULL);
215  if (ret != CL_SUCCESS) {
216  printf("Failed to copy date from host to device: %d\n", (int)ret);
217  goto error;
218  }
219  // Create Kernel Program from source
220  program = clCreateProgramWithSource(context,
221  1,
222  (const char **)&source_str,
223  (const size_t *)&source_size,
224  &ret);
225  if (ret != CL_SUCCESS) {
226  printf("Failed to create OpenCL program from source %d\n", (int)ret);
227  goto error;
228  }
229  // Build Kernel Program
230  ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
231  if (ret != CL_SUCCESS) {
232  printf("Failed to build program %d\n", (int)ret);
233  char build_log[16348];
234  clGetProgramBuildInfo(program,
235  device_id,
236  CL_PROGRAM_BUILD_LOG,
237  sizeof(build_log),
238  build_log,
239  NULL);
240  printf("Error in kernel: %s\n", build_log);
241  goto error;
242  }
243  // Create OpenCL Kernel
244  kernel = clCreateKernel(program, "add_vec_gpu", &ret);
245  if (ret != CL_SUCCESS) {
246  printf("Failed to create kernel %d\n", (int)ret);
247  goto error;
248  }
249  ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&a_buff);
250  ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&b_buff);
251  ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&c_buff);
252  ret |= clSetKernelArg(kernel, 3, sizeof(cl_int), (void *)&len);
253  if (ret != CL_SUCCESS) {
254  printf("Failed to set kernel arguments %d\n", (int)ret);
255  goto error;
256  }
257 
258  /* Execute OpenCL Kernel */
259  // executed using a single work-item
260  // ret = clEnqueueTask(command_queue, kernel, 0, NULL, NULL);
261 
262  size_t global_work_size, local_work_size;
263  // Number of work items in each local work group
264  local_work_size = len;
265  // Number of total work items - localSize must be devisor
266  global_work_size =
267  (size_t)ceil(len / (float)local_work_size) * local_work_size;
268 
269  // size_t local_work_size[2] = { 8, 8 };
270  // size_t global_work_size[2] = { 1, len };
271  ret = clEnqueueNDRangeKernel(command_queue,
272  kernel,
273  1,
274  NULL,
275  &global_work_size,
276  &local_work_size,
277  0,
278  NULL,
279  NULL);
280  if (ret != CL_SUCCESS) {
281  printf("Failed to execute kernel for execution %d\n", (int)ret);
282  goto error;
283  }
284 
285  init_vec(c_d, len, 0);
286  /* Copy results from the memory buffer */
287  ret = clEnqueueReadBuffer(command_queue,
288  c_buff,
289  CL_TRUE,
290  0,
291  data_size,
292  (void *)c_d,
293  0,
294  NULL,
295  NULL);
296  if (ret != CL_SUCCESS) {
297  printf("Failed to copy data from device to host %d\n", (int)ret);
298  goto error;
299  }
300 
301  /* Display Result */
302  PRINT_LINE("CHECK RESULT cpu-verison && gpu-version");
303  printf("c_d: ");
304  print_vec(c_d, len);
305  check_result(c, c_d, len);
306  printf("len-1=%d, c_d[%d]==c[%d]: %d, c_d[%d]=%d, c[%d]=%d \n",
307  len - 1,
308  len - 1,
309  len - 1,
310  c_d[len - 1] == c[len - 1],
311  len - 1,
312  c_d[len - 1],
313  len - 1,
314  c[len - 1]);
315 
316  PRINT_LINE("CHECK RESULT ELEMENT BY ELEMENT");
317  printf("idx c c_d\n");
318  for (int i = 0; i < len; i++) {
319  printf("%2d %2d %2d \n", i, c[i], c_d[i]);
320  }
321 
322  /* Finalization */
323 error:
324 
325  /* free device resources */
326  clFlush(command_queue);
327  clFinish(command_queue);
328  clReleaseKernel(kernel);
329  clReleaseProgram(program);
330 
331  clReleaseMemObject(a_buff);
332  clReleaseMemObject(b_buff);
333  clReleaseMemObject(c_buff);
334 
335  clReleaseCommandQueue(command_queue);
336  clReleaseContext(context);
337 
338  /* free host resources */
339  free(source_str);
340  free(a);
341  free(b);
342  free(c);
343 }
344 // int main () {accl_mtx_exec(); return 0;}
void rand_vec(int *vec, int len)
Definition: _gpu_mtx_add.c:65
#define MAX_SOURCE_SIZE
Definition: _gpu_mtx_add.c:53
#define MEM_SIZE
Definition: _gpu_mtx_add.c:52
#define PRINT_LINE(title)
Definition: _gpu_mtx_add.c:54
void add_vec_cpu(const int *a, const int *b, int *res, const int len)
Definition: _gpu_mtx_add.c:72
void init_vec(int *vec, int len, int set_one_flag)
GPU kernel acceleration utility/helper functions.
Definition: _gpu_mtx_add.c:56
void accl_mtx_exec()
Definition: _gpu_mtx_add.c:98
void print_vec(int *vec, int len)
Definition: _gpu_mtx_add.c:78
void check_result(int *v1, int *v2, int len)
Definition: _gpu_mtx_add.c:85