34 #include "../../include/linalg/_gpu_mtx.h" 
   43 #define CL_USE_DEPRECATED_OPENCL_1_2_APIS 
   44 #define CL_TARGET_OPENCL_VERSION 300 
   47 #include <OpenCL/opencl.h> 
   52 #define MEM_SIZE (128) 
   53 #define MAX_SOURCE_SIZE (0x100000) 
   54 #define PRINT_LINE(title) printf("\n========== %s ==========\n", title);
 
   56 void init_vec(
int *vec, 
int len, 
int set_one_flag) {
 
   57     for (
int i = 0; i < len; i++) {
 
   66     srand((
unsigned)time(0));
 
   67     for (
int i = 0; i < len; i++) {
 
   73     for (
int i = 0; i < len; i++) {
 
   79     for (
int i = 0; i < len; i++) {
 
   80         printf(
"%d ", vec[i]);
 
   87     for (
int i = 0; i < len; i++) {
 
   92     printf(
"correct rate: %d / %d , %1.2f\n",
 
   95            (
float)correct_num / len);
 
   99     struct timeval start, finish;
 
  101     srand((
unsigned)time(NULL));
 
  102     clock_t startTime, endTime;
 
  107     int *a, *b, *c, *c_d;
 
  108     a = (
int *)malloc(len * 
sizeof(
int));
 
  109     b = (
int *)malloc(len * 
sizeof(
int));
 
  110     c = (
int *)malloc(len * 
sizeof(
int));
 
  111     c_d = (
int *)malloc(len * 
sizeof(
int));
 
  112     size_t data_size = len * 
sizeof(int);
 
  131     totalTime = (double)(endTime - startTime) / CLOCKS_PER_SEC;
 
  133     printf(
"CPU: %f\n", totalTime);
 
  136     cl_mem a_buff, b_buff, c_buff;
 
  137     a_buff = b_buff = c_buff = NULL;
 
  139     cl_platform_id platform_id = NULL;
 
  140     cl_uint ret_num_platforms;
 
  142     cl_device_id device_id = NULL;
 
  143     cl_uint ret_num_devices;
 
  145     cl_context context = NULL;
 
  146     cl_kernel kernel = NULL;
 
  147     cl_program program = NULL;
 
  149     cl_command_queue command_queue = NULL;
 
  155     char fileName[] = 
"./_gpu_kernel_mtx_add.c";
 
  159     fp = fopen(fileName, 
"r");
 
  162         fprintf(stderr, 
"Failed to load kernel.\n");
 
  170     ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
 
  171     if (ret != CL_SUCCESS) {
 
  172         printf(
"Failed to get platform ID.\n");
 
  176     ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
 
  177     if (ret != CL_SUCCESS) {
 
  178         printf(
"Failed to get device ID.\n");
 
  182     context = clCreateContext(NULL, 1, &device_id, NULL, NULL, NULL); 
 
  183     if (ret != CL_SUCCESS) {
 
  184         printf(
"Failed to create OpenCL context.\n");
 
  187     command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
 
  188     if (ret != CL_SUCCESS) {
 
  189         printf(
"Failed to create command queue %d\n", (
int)ret);
 
  193     a_buff = clCreateBuffer(context, CL_MEM_READ_ONLY, data_size, NULL, &ret);
 
  194     b_buff = clCreateBuffer(context, CL_MEM_READ_ONLY, data_size, NULL, &ret);
 
  195     c_buff = clCreateBuffer(context, CL_MEM_WRITE_ONLY, data_size, NULL, &ret);
 
  197     ret = clEnqueueWriteBuffer(command_queue,
 
  206     ret |= clEnqueueWriteBuffer(command_queue,
 
  215     if (ret != CL_SUCCESS) {
 
  216         printf(
"Failed to copy date from host to device: %d\n", (
int)ret);
 
  220     program = clCreateProgramWithSource(context,
 
  222                                         (
const char **)&source_str,
 
  223                                         (
const size_t *)&source_size,
 
  225     if (ret != CL_SUCCESS) {
 
  226         printf(
"Failed to create OpenCL program from source %d\n", (
int)ret);
 
  230     ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
 
  231     if (ret != CL_SUCCESS) {
 
  232         printf(
"Failed to build program %d\n", (
int)ret);
 
  233         char build_log[16348];
 
  234         clGetProgramBuildInfo(program,
 
  236                               CL_PROGRAM_BUILD_LOG,
 
  240         printf(
"Error in kernel: %s\n", build_log);
 
  244     kernel = clCreateKernel(program, 
"add_vec_gpu", &ret);
 
  245     if (ret != CL_SUCCESS) {
 
  246         printf(
"Failed to create kernel %d\n", (
int)ret);
 
  249     ret = clSetKernelArg(kernel, 0, 
sizeof(cl_mem), (
void *)&a_buff);
 
  250     ret |= clSetKernelArg(kernel, 1, 
sizeof(cl_mem), (
void *)&b_buff);
 
  251     ret |= clSetKernelArg(kernel, 2, 
sizeof(cl_mem), (
void *)&c_buff);
 
  252     ret |= clSetKernelArg(kernel, 3, 
sizeof(cl_int), (
void *)&len);
 
  253     if (ret != CL_SUCCESS) {
 
  254         printf(
"Failed to set kernel arguments %d\n", (
int)ret);
 
  262     size_t global_work_size, local_work_size;
 
  264     local_work_size = len;
 
  267         (size_t)ceil(len / (
float)local_work_size) * local_work_size;
 
  271     ret = clEnqueueNDRangeKernel(command_queue,
 
  280     if (ret != CL_SUCCESS) {
 
  281         printf(
"Failed to execute kernel for execution %d\n", (
int)ret);
 
  287     ret = clEnqueueReadBuffer(command_queue,
 
  296     if (ret != CL_SUCCESS) {
 
  297         printf(
"Failed to copy data from device to host %d\n", (
int)ret);
 
  302     PRINT_LINE(
"CHECK RESULT cpu-verison && gpu-version");
 
  306     printf(
"len-1=%d, c_d[%d]==c[%d]: %d, c_d[%d]=%d, c[%d]=%d \n",
 
  310            c_d[len - 1] == c[len - 1],
 
  316     PRINT_LINE(
"CHECK RESULT ELEMENT BY ELEMENT");
 
  317     printf(
"idx  c  c_d\n");
 
  318     for (
int i = 0; i < len; i++) {
 
  319         printf(
"%2d %2d %2d \n", i, c[i], c_d[i]);
 
  326     clFlush(command_queue);
 
  327     clFinish(command_queue);
 
  328     clReleaseKernel(kernel);
 
  329     clReleaseProgram(program);
 
  331     clReleaseMemObject(a_buff);
 
  332     clReleaseMemObject(b_buff);
 
  333     clReleaseMemObject(c_buff);
 
  335     clReleaseCommandQueue(command_queue);
 
  336     clReleaseContext(context);
 
void rand_vec(int *vec, int len)
#define PRINT_LINE(title)
void add_vec_cpu(const int *a, const int *b, int *res, const int len)
void init_vec(int *vec, int len, int set_one_flag)
GPU kernel acceleration utility/helper functions.
void print_vec(int *vec, int len)
void check_result(int *v1, int *v2, int len)