#include "../../include/linalg/_gpu_mtx.h"
#include <math.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <time.h>
#include <CL/cl.h>

Macros
#define	CL_USE_DEPRECATED_OPENCL_1_2_APIS

#define	CL_TARGET_OPENCL_VERSION 300

#define	MEM_SIZE (128)

#define	MAX_SOURCE_SIZE (0x100000)

#define	PRINT_LINE(title) printf("\n========== %s ==========\n", title);

Functions
void	init_vec (int *vec, int len, int set_one_flag)
	GPU kernel acceleration utility/helper functions. More...

void	rand_vec (int *vec, int len)

void	add_vec_cpu (const int *a, const int *b, int *res, const int len)

void	print_vec (int *vec, int len)

void	check_result (int *v1, int *v2, int len)

void	accl_mtx_exec ()

Macro Definition Documentation

◆ CL_TARGET_OPENCL_VERSION

#define CL_TARGET_OPENCL_VERSION 300

Definition at line 44 of file _gpu_mtx_add.c.

◆ CL_USE_DEPRECATED_OPENCL_1_2_APIS

#define CL_USE_DEPRECATED_OPENCL_1_2_APIS

Definition at line 43 of file _gpu_mtx_add.c.

◆ MAX_SOURCE_SIZE

#define MAX_SOURCE_SIZE (0x100000)

Definition at line 53 of file _gpu_mtx_add.c.

◆ MEM_SIZE

#define MEM_SIZE (128)

Definition at line 52 of file _gpu_mtx_add.c.

◆ PRINT_LINE

#define PRINT_LINE ( title ) printf("\n========== %s ==========\n", title);

Definition at line 54 of file _gpu_mtx_add.c.

Function Documentation

◆ accl_mtx_exec()

void accl_mtx_exec ( )

Definition at line 98 of file _gpu_mtx_add.c.

                      {
     struct timeval start, finish;
     double duration;
     srand((unsigned)time(NULL));
     clock_t startTime, endTime;
     double totalTime;
  
     /* generate vector a and b */
     int len = 64;
     int *a, *b, *c, *c_d;
     a = (int *)malloc(len * sizeof(int));
     b = (int *)malloc(len * sizeof(int));
     c = (int *)malloc(len * sizeof(int));
     c_d = (int *)malloc(len * sizeof(int));
     size_t data_size = len * sizeof(int);
  
     PRINT_LINE("INIT VALUE");
     /* vector addition, cpu version */
     printf("a: ");
     init_vec(a, len, 1);
     print_vec(a, len);
  
     printf("b: ");
     rand_vec(b, len);
     print_vec(b, len);
  
     printf("c: ");
     init_vec(c, len, 0);
  
     startTime = clock();
     add_vec_cpu(a, b, c, len);
     endTime = clock();
     // calculate difference to get total
     totalTime = (double)(endTime - startTime) / CLOCKS_PER_SEC;
     print_vec(c, len);
     printf("CPU: %f\n", totalTime);
  
     /* vector addition, gpu version  */
     cl_mem a_buff, b_buff, c_buff;
     a_buff = b_buff = c_buff = NULL;
  
     cl_platform_id platform_id = NULL;
     cl_uint ret_num_platforms;
  
     cl_device_id device_id = NULL;
     cl_uint ret_num_devices;
  
     cl_context context = NULL;
     cl_kernel kernel = NULL;
     cl_program program = NULL;
  
     cl_command_queue command_queue = NULL;
     cl_int ret;
  
     /* Load the source code containing the kernel */
     char string[MEM_SIZE];
     FILE *fp;
     char fileName[] = "./_gpu_kernel_mtx_add.c";
     char *source_str;
     size_t source_size;
  
     fp = fopen(fileName, "r");
     if (!fp) {
  
         fprintf(stderr, "Failed to load kernel.\n");
         exit(1);
     }
     source_str = (char *)malloc(MAX_SOURCE_SIZE);
     source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
     fclose(fp);
  
     // Platform
     ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
     if (ret != CL_SUCCESS) {
         printf("Failed to get platform ID.\n");
         goto error;
     }
     // Device
     ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
     if (ret != CL_SUCCESS) {
         printf("Failed to get device ID.\n");
         goto error;
     }
     // Context
     context = clCreateContext(NULL, 1, &device_id, NULL, NULL, NULL); //&ret);
     if (ret != CL_SUCCESS) {
         printf("Failed to create OpenCL context.\n");
         goto error;
     }
     command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
     if (ret != CL_SUCCESS) {
         printf("Failed to create command queue %d\n", (int)ret);
         goto error;
     }
     // Memory Buffer
     a_buff = clCreateBuffer(context, CL_MEM_READ_ONLY, data_size, NULL, &ret);
     b_buff = clCreateBuffer(context, CL_MEM_READ_ONLY, data_size, NULL, &ret);
     c_buff = clCreateBuffer(context, CL_MEM_WRITE_ONLY, data_size, NULL, &ret);
  
     ret = clEnqueueWriteBuffer(command_queue,
                                a_buff,
                                CL_TRUE,
                                0,
                                data_size,
                                (void *)a,
                                0,
                                NULL,
                                NULL);
     ret |= clEnqueueWriteBuffer(command_queue,
                                 b_buff,
                                 CL_TRUE,
                                 0,
                                 data_size,
                                 (void *)b,
                                 0,
                                 NULL,
                                 NULL);
     if (ret != CL_SUCCESS) {
         printf("Failed to copy date from host to device: %d\n", (int)ret);
         goto error;
     }
     // Create Kernel Program from source
     program = clCreateProgramWithSource(context,
                                         1,
                                         (const char **)&source_str,
                                         (const size_t *)&source_size,
                                         &ret);
     if (ret != CL_SUCCESS) {
         printf("Failed to create OpenCL program from source %d\n", (int)ret);
         goto error;
     }
     // Build Kernel Program
     ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
     if (ret != CL_SUCCESS) {
         printf("Failed to build program %d\n", (int)ret);
         char build_log[16348];
         clGetProgramBuildInfo(program,
                               device_id,
                               CL_PROGRAM_BUILD_LOG,
                               sizeof(build_log),
                               build_log,
                               NULL);
         printf("Error in kernel: %s\n", build_log);
         goto error;
     }
     // Create OpenCL Kernel
     kernel = clCreateKernel(program, "add_vec_gpu", &ret);
     if (ret != CL_SUCCESS) {
         printf("Failed to create kernel %d\n", (int)ret);
         goto error;
     }
     ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&a_buff);
     ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&b_buff);
     ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&c_buff);
     ret |= clSetKernelArg(kernel, 3, sizeof(cl_int), (void *)&len);
     if (ret != CL_SUCCESS) {
         printf("Failed to set kernel arguments %d\n", (int)ret);
         goto error;
     }
  
     /* Execute OpenCL Kernel */
     // executed using a single work-item
     // ret = clEnqueueTask(command_queue, kernel, 0, NULL, NULL);
  
     size_t global_work_size, local_work_size;
     // Number of work items in each local work group
     local_work_size = len;
     // Number of total work items - localSize must be devisor
     global_work_size =
         (size_t)ceil(len / (float)local_work_size) * local_work_size;
  
     // size_t local_work_size[2] = { 8, 8 };
     // size_t global_work_size[2] = { 1, len };
     ret = clEnqueueNDRangeKernel(command_queue,
                                  kernel,
                                  1,
                                  NULL,
                                  &global_work_size,
                                  &local_work_size,
                                  0,
                                  NULL,
                                  NULL);
     if (ret != CL_SUCCESS) {
         printf("Failed to execute kernel for execution %d\n", (int)ret);
         goto error;
     }
  
     init_vec(c_d, len, 0);
     /* Copy results from the memory buffer */
     ret = clEnqueueReadBuffer(command_queue,
                               c_buff,
                               CL_TRUE,
                               0,
                               data_size,
                               (void *)c_d,
                               0,
                               NULL,
                               NULL);
     if (ret != CL_SUCCESS) {
         printf("Failed to copy data from device to host %d\n", (int)ret);
         goto error;
     }
  
     /* Display Result */
     PRINT_LINE("CHECK RESULT cpu-verison && gpu-version");
     printf("c_d: ");
     print_vec(c_d, len);
     check_result(c, c_d, len);
     printf("len-1=%d, c_d[%d]==c[%d]: %d, c_d[%d]=%d, c[%d]=%d \n",
            len - 1,
            len - 1,
            len - 1,
            c_d[len - 1] == c[len - 1],
            len - 1,
            c_d[len - 1],
            len - 1,
            c[len - 1]);
  
     PRINT_LINE("CHECK RESULT ELEMENT BY ELEMENT");
     printf("idx  c  c_d\n");
     for (int i = 0; i < len; i++) {
         printf("%2d %2d %2d \n", i, c[i], c_d[i]);
     }
  
     /* Finalization */
 error:
  
     /* free device resources */
     clFlush(command_queue);
     clFinish(command_queue);
     clReleaseKernel(kernel);
     clReleaseProgram(program);
  
     clReleaseMemObject(a_buff);
     clReleaseMemObject(b_buff);
     clReleaseMemObject(c_buff);
  
     clReleaseCommandQueue(command_queue);
     clReleaseContext(context);
  
     /* free host resources */
     free(source_str);
     free(a);
     free(b);
     free(c);
 }

References add_vec_cpu(), check_result(), init_vec(), MAX_SOURCE_SIZE, MEM_SIZE, PRINT_LINE, print_vec(), and rand_vec().

Referenced by main().

◆ add_vec_cpu()

void add_vec_cpu	(	const int *	a,
		const int *	b,
		int *	res,
		const int	len
	)

Definition at line 72 of file _gpu_mtx_add.c.

                                                                       {
     for (int i = 0; i < len; i++) {
         res[i] = a[i] + b[i];
     }
 }

References python.linalg::res.

Referenced by accl_mtx_exec().

◆ check_result()

void check_result	(	int *	v1,
		int *	v2,
		int	len
	)

Definition at line 85 of file _gpu_mtx_add.c.

                                              {
     int correct_num = 0;
     for (int i = 0; i < len; i++) {
         if (v1[i] == v2[i]) {
             correct_num += 1;
         }
     }
     printf("correct rate: %d / %d , %1.2f\n",
            correct_num,
            len,
            (float)correct_num / len);
 }

Referenced by accl_mtx_exec().

◆ init_vec()

void init_vec	(	int *	vec,
		int	len,
		int	set_one_flag
	)

GPU kernel acceleration utility/helper functions.

Definition at line 56 of file _gpu_mtx_add.c.

                                                    {
     for (int i = 0; i < len; i++) {
         if (set_one_flag)
             vec[i] = 1;
         else
             vec[i] = 0;
     }
 }

Referenced by accl_mtx_exec().

◆ print_vec()

void print_vec	(	int *	vec,
		int	len
	)

Definition at line 78 of file _gpu_mtx_add.c.

                                   {
     for (int i = 0; i < len; i++) {
         printf("%d ", vec[i]);
     }
     printf("\n");
 }

Referenced by accl_mtx_exec().

◆ rand_vec()

void rand_vec	(	int *	vec,
		int	len
	)

Definition at line 65 of file _gpu_mtx_add.c.

                                  {
     srand((unsigned)time(0));
     for (int i = 0; i < len; i++) {
         vec[i] = rand() % 2;
     }
 }

Referenced by accl_mtx_exec().

Macros

Functions

Macro Definition Documentation

◆ CL_TARGET_OPENCL_VERSION

◆ CL_USE_DEPRECATED_OPENCL_1_2_APIS

◆ MAX_SOURCE_SIZE

◆ MEM_SIZE

◆ PRINT_LINE

Function Documentation

◆ accl_mtx_exec()

◆ add_vec_cpu()

◆ check_result()

◆ init_vec()

◆ print_vec()

◆ rand_vec()