openGPMP/__gpu__mtx__add_8c_source.html

 /*************************************************************************

  *

  *  Project

  *                         _____ _____  __  __ _____

  *                        / ____|  __ \|  \/  |  __ \

  *  ___  _ __   ___ _ __ | |  __| |__) | \  / | |__) |

  * / _ \| '_ \ / _ \ '_ \| | |_ |  ___/| |\/| |  ___/

  *| (_) | |_) |  __/ | | | |__| | |    | |  | | |

  * \___/| .__/ \___|_| |_|\_____|_|    |_|  |_|_|

  *      | |

  *      |_|

  *

  * Copyright (C) Akiel Aries, <akiel@akiel.org>, et al.

  *

  * This software is licensed as described in the file LICENSE, which

  * you should have received as part of this distribution. The terms

  * among other details are referenced in the official documentation

  * seen here : https://akielaries.github.io/openGPMP/ along with

  * important files seen in this project.

  *

  * You may opt to use, copy, modify, merge, publish, distribute

  * and/or sell copies of the Software, and permit persons to whom

  * the Software is furnished to do so, under the terms of the

  * LICENSE file. As this is an Open Source effort, all implementations

  * must be of the same methodology.

  *

  *

  *

  * This software is distributed on an AS IS basis, WITHOUT

  * WARRANTY OF ANY KIND, either express or implied.

  *

  ************************************************************************/


 #include "../../include/linalg/_gpu_mtx.h"

 #include <math.h>

 #include <stddef.h>

 #include <stdio.h>

 #include <stdlib.h>

 #include <sys/time.h>

 #include <time.h>


 /* gpu */

 #define CL_USE_DEPRECATED_OPENCL_1_2_APIS

 #define CL_TARGET_OPENCL_VERSION 300


 #ifdef __APPLE__

 #include <OpenCL/opencl.h>

 #else

 #include <CL/cl.h>

 #endif


 /* Length, in chars, of a small fixed-size scratch buffer. */
 #define MEM_SIZE (128)

 /* Upper bound, in bytes, on the kernel source text read from disk (1 MiB). */
 #define MAX_SOURCE_SIZE (0x100000)

 /* Print a banner containing `title` to stdout. Note that the expansion
  * already ends with ';', so `PRINT_LINE("x");` yields an extra empty
  * statement. */
 #define PRINT_LINE(title) printf("\n========== %s ==========\n", title);


 /*
  * Fill the first `len` entries of `vec` with one constant value.
  *
  * vec          : destination array, must hold at least `len` ints
  * len          : number of entries to write; nothing is written if len <= 0
  * set_one_flag : non-zero -> every entry becomes 1, zero -> every entry 0
  */
 void init_vec(int *vec, int len, int set_one_flag) {
     /* Decide the fill value once instead of branching per element. */
     const int fill = set_one_flag ? 1 : 0;
     int idx = 0;

     while (idx < len) {
         vec[idx] = fill;
         ++idx;
     }
 }


 /*
  * Fill the first `len` entries of `vec` with pseudo-random 0/1 values.
  *
  * The generator is re-seeded from the current time on every call, so two
  * calls within the same second produce the same sequence.
  *
  * vec : destination array, must hold at least `len` ints
  * len : number of entries to write; nothing is written if len <= 0
  */
 void rand_vec(int *vec, int len) {
     int *slot = vec;

     srand((unsigned)time(0));

     /* Count down the remaining entries while walking the array forward. */
     for (int left = len; left > 0; --left) {
         *slot = rand() % 2;
         ++slot;
     }
 }


 /*
  * Element-wise sum on the host: res[i] = a[i] + b[i] for i in [0, len).
  *
  * a, b : input arrays with at least `len` ints each
  * res  : output array with room for at least `len` ints
  * len  : number of elements to add; nothing is written if len <= 0
  */
 void add_vec_cpu(const int *a, const int *b, int *res, const int len) {
     int pos = 0;

     /* Walk forward so the write order matches a plain index loop. */
     while (pos < len) {
         const int lhs = a[pos];
         const int rhs = b[pos];
         res[pos] = lhs + rhs;
         ++pos;
     }
 }


 /*
  * Write the first `len` entries of `vec` to stdout on a single line, each
  * followed by one space, then terminate the line with a newline.
  *
  * vec : array to print, must hold at least `len` ints
  * len : number of entries to print; only the newline is printed if len <= 0
  */
 void print_vec(int *vec, int len) {
     int shown = 0;

     while (shown < len) {
         printf("%d ", vec[shown]);
         ++shown;
     }

     printf("\n");
 }


 /*
  * Compare two vectors element by element and print how many positions
  * agree, as "matches / len" followed by the ratio with two decimals.
  *
  * v1, v2 : arrays to compare, each with at least `len` ints
  * len    : number of positions to compare
  */
 void check_result(int *v1, int *v2, int len) {
     int matches = 0;
     int pos = 0;

     while (pos < len) {
         /* A comparison yields 1 on equality and 0 otherwise. */
         matches += (v1[pos] == v2[pos]);
         ++pos;
     }

     printf("correct rate: %d / %d , %1.2f\n",
            matches,
            len,
            (float)matches / len);
 }


 /*
  * Demo and self-check for the OpenCL vector-add kernel.
  *
  * Builds two 64-element int vectors (a = all ones, b = random 0/1), adds
  * them on the CPU, then adds them again with the OpenCL kernel
  * "add_vec_gpu" and prints both results so they can be compared.
  *
  * The kernel source is read at run time from "./_gpu_kernel_mtx_add.c",
  * i.e. relative to the current working directory. If that file cannot be
  * opened the process terminates with exit(1). Any other failure prints a
  * message and returns after releasing whatever had been acquired so far.
  */
 void accl_mtx_exec() {
     /* Host buffers start as NULL so the shared cleanup code can free them
      * unconditionally, whichever step failed. */
     int *a = NULL, *b = NULL, *c = NULL, *c_d = NULL;
     char *source_str = NULL;
     size_t source_size = 0;

     /* OpenCL handles start as NULL; cleanup releases only those that were
      * actually created. */
     cl_mem a_buff = NULL, b_buff = NULL, c_buff = NULL;
     cl_platform_id platform_id = NULL;
     cl_uint ret_num_platforms = 0;
     cl_device_id device_id = NULL;
     cl_context context = NULL;
     cl_kernel kernel = NULL;
     cl_program program = NULL;
     cl_command_queue command_queue = NULL;
     cl_int ret = CL_SUCCESS;

     clock_t startTime, endTime;
     double totalTime;
     FILE *fp = NULL;
     char fileName[] = "./_gpu_kernel_mtx_add.c";
     size_t global_work_size, local_work_size;

     int len = 64;
     size_t data_size = (size_t)len * sizeof(int);

     srand((unsigned)time(NULL));

     /* generate vector a and b */
     a = (int *)malloc(data_size);
     b = (int *)malloc(data_size);
     c = (int *)malloc(data_size);
     c_d = (int *)malloc(data_size);
     if (!a || !b || !c || !c_d) {
         fprintf(stderr, "Failed to allocate host vectors.\n");
         goto error;
     }

     PRINT_LINE("INIT VALUE");

     /* vector addition, cpu version */
     printf("a: ");
     init_vec(a, len, 1);
     print_vec(a, len);

     printf("b: ");
     rand_vec(b, len);
     print_vec(b, len);

     printf("c: ");
     init_vec(c, len, 0);

     startTime = clock();
     add_vec_cpu(a, b, c, len);
     endTime = clock();
     /* elapsed CPU time in seconds */
     totalTime = (double)(endTime - startTime) / CLOCKS_PER_SEC;
     print_vec(c, len);
     printf("CPU: %f\n", totalTime);

     /* vector addition, gpu version */

     /* Load the source code containing the kernel */
     fp = fopen(fileName, "r");
     if (!fp) {
         /* Kept as a hard exit to preserve the existing behaviour. */
         fprintf(stderr, "Failed to load kernel.\n");
         exit(1);
     }
     /* One extra byte so the text can always be NUL-terminated: a length of
      * 0 tells clCreateProgramWithSource to treat the string as
      * NUL-terminated, which would otherwise read uninitialised memory for
      * an empty file. */
     source_str = (char *)malloc(MAX_SOURCE_SIZE + 1);
     if (!source_str) {
         fclose(fp);
         fprintf(stderr, "Failed to allocate kernel source buffer.\n");
         goto error;
     }
     source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
     source_str[source_size] = '\0';
     fclose(fp);

     // Platform
     ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
     if (ret != CL_SUCCESS || ret_num_platforms == 0) {
         printf("Failed to get platform ID.\n");
         goto error;
     }

     // Device
     ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
     if (ret != CL_SUCCESS) {
         printf("Failed to get device ID.\n");
         goto error;
     }

     // Context: pass &ret so the check below tests this call rather than
     // the stale status left by clGetDeviceIDs.
     context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);
     if (ret != CL_SUCCESS) {
         printf("Failed to create OpenCL context.\n");
         goto error;
     }

     command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
     if (ret != CL_SUCCESS) {
         printf("Failed to create command queue %d\n", (int)ret);
         goto error;
     }

     // Memory Buffer: stop at the first failure so `ret` keeps its code.
     a_buff = clCreateBuffer(context, CL_MEM_READ_ONLY, data_size, NULL, &ret);
     if (ret == CL_SUCCESS) {
         b_buff =
             clCreateBuffer(context, CL_MEM_READ_ONLY, data_size, NULL, &ret);
     }
     if (ret == CL_SUCCESS) {
         c_buff =
             clCreateBuffer(context, CL_MEM_WRITE_ONLY, data_size, NULL, &ret);
     }
     if (ret != CL_SUCCESS) {
         printf("Failed to create device buffers %d\n", (int)ret);
         goto error;
     }

     /* Blocking writes of both inputs. The second runs only if the first
      * succeeded, so the reported code is a real OpenCL error rather than
      * the bitwise OR of two codes. */
     ret = clEnqueueWriteBuffer(command_queue,
                                a_buff,
                                CL_TRUE,
                                0,
                                data_size,
                                (void *)a,
                                0,
                                NULL,
                                NULL);
     if (ret == CL_SUCCESS) {
         ret = clEnqueueWriteBuffer(command_queue,
                                    b_buff,
                                    CL_TRUE,
                                    0,
                                    data_size,
                                    (void *)b,
                                    0,
                                    NULL,
                                    NULL);
     }
     if (ret != CL_SUCCESS) {
         printf("Failed to copy date from host to device: %d\n", (int)ret);
         goto error;
     }

     // Create Kernel Program from source
     program = clCreateProgramWithSource(context,
                                         1,
                                         (const char **)&source_str,
                                         (const size_t *)&source_size,
                                         &ret);
     if (ret != CL_SUCCESS) {
         printf("Failed to create OpenCL program from source %d\n", (int)ret);
         goto error;
     }

     // Build Kernel Program
     ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
     if (ret != CL_SUCCESS) {
         char build_log[16348];

         printf("Failed to build program %d\n", (int)ret);
         /* Start with an empty string in case the query fails and leaves
          * the buffer untouched. */
         build_log[0] = '\0';
         clGetProgramBuildInfo(program,
                               device_id,
                               CL_PROGRAM_BUILD_LOG,
                               sizeof(build_log),
                               build_log,
                               NULL);
         printf("Error in kernel: %s\n", build_log);
         goto error;
     }

     // Create OpenCL Kernel
     kernel = clCreateKernel(program, "add_vec_gpu", &ret);
     if (ret != CL_SUCCESS) {
         printf("Failed to create kernel %d\n", (int)ret);
         goto error;
     }

     /* Kernel arguments: a, b, c buffers and the element count. */
     ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&a_buff);
     if (ret == CL_SUCCESS) {
         ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&b_buff);
     }
     if (ret == CL_SUCCESS) {
         ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&c_buff);
     }
     if (ret == CL_SUCCESS) {
         ret = clSetKernelArg(kernel, 3, sizeof(cl_int), (void *)&len);
     }
     if (ret != CL_SUCCESS) {
         printf("Failed to set kernel arguments %d\n", (int)ret);
         goto error;
     }

     /* Execute OpenCL Kernel */

     // Number of work items in each local work group
     local_work_size = len;
     // Total number of work items, rounded up to a multiple of the local
     // size (the local size must be a divisor of the global size)
     global_work_size =
         (size_t)ceil(len / (float)local_work_size) * local_work_size;

     ret = clEnqueueNDRangeKernel(command_queue,
                                  kernel,
                                  1,
                                  NULL,
                                  &global_work_size,
                                  &local_work_size,
                                  0,
                                  NULL,
                                  NULL);
     if (ret != CL_SUCCESS) {
         printf("Failed to execute kernel for execution %d\n", (int)ret);
         goto error;
     }

     init_vec(c_d, len, 0);

     /* Copy results from the memory buffer (blocking read) */
     ret = clEnqueueReadBuffer(command_queue,
                               c_buff,
                               CL_TRUE,
                               0,
                               data_size,
                               (void *)c_d,
                               0,
                               NULL,
                               NULL);
     if (ret != CL_SUCCESS) {
         printf("Failed to copy data from device to host %d\n", (int)ret);
         goto error;
     }

     /* Display Result */
     PRINT_LINE("CHECK RESULT cpu-verison && gpu-version");
     printf("c_d: ");
     print_vec(c_d, len);
     check_result(c, c_d, len);
     printf("len-1=%d, c_d[%d]==c[%d]: %d, c_d[%d]=%d, c[%d]=%d \n",
            len - 1,
            len - 1,
            len - 1,
            c_d[len - 1] == c[len - 1],
            len - 1,
            c_d[len - 1],
            len - 1,
            c[len - 1]);

     PRINT_LINE("CHECK RESULT ELEMENT BY ELEMENT");
     printf("idx  c  c_d\n");
     for (int i = 0; i < len; i++) {
         printf("%2d %2d %2d \n", i, c[i], c_d[i]);
     }

     /* Finalization: reached on success and on every `goto error`. */
 error:
     /* free device resources; skip handles that were never created, since
      * passing NULL to the OpenCL release/flush calls is an error */
     if (command_queue) {
         clFlush(command_queue);
         clFinish(command_queue);
     }
     if (kernel) {
         clReleaseKernel(kernel);
     }
     if (program) {
         clReleaseProgram(program);
     }
     if (a_buff) {
         clReleaseMemObject(a_buff);
     }
     if (b_buff) {
         clReleaseMemObject(b_buff);
     }
     if (c_buff) {
         clReleaseMemObject(c_buff);
     }
     if (command_queue) {
         clReleaseCommandQueue(command_queue);
     }
     if (context) {
         clReleaseContext(context);
     }

     /* free host resources (free(NULL) is a no-op) */
     free(source_str);
     free(a);
     free(b);
     free(c);
     free(c_d);
 }

 // int main () {accl_mtx_exec(); return 0;}

rand_vec
void rand_vec(int *vec, int len)
Definition: _gpu_mtx_add.c:65

MAX_SOURCE_SIZE
#define MAX_SOURCE_SIZE
Definition: _gpu_mtx_add.c:53

MEM_SIZE
#define MEM_SIZE
Definition: _gpu_mtx_add.c:52

PRINT_LINE
#define PRINT_LINE(title)
Definition: _gpu_mtx_add.c:54

add_vec_cpu
void add_vec_cpu(const int *a, const int *b, int *res, const int len)
Definition: _gpu_mtx_add.c:72

init_vec
void init_vec(int *vec, int len, int set_one_flag)
GPU kernel acceleration utility/helper functions.
Definition: _gpu_mtx_add.c:56

accl_mtx_exec
void accl_mtx_exec()
Definition: _gpu_mtx_add.c:98

print_vec
void print_vec(int *vec, int len)
Definition: _gpu_mtx_add.c:78

check_result
void check_result(int *v1, int *v2, int len)
Definition: _gpu_mtx_add.c:85

python.linalg.res
res
Definition: linalg.py:29