34 #include "../../include/linalg/_gpu_mtx.h"
43 #define CL_USE_DEPRECATED_OPENCL_1_2_APIS
44 #define CL_TARGET_OPENCL_VERSION 300
47 #include <OpenCL/opencl.h>
/* Constant 128. Not referenced by any line visible in this listing. */
52 #define MEM_SIZE (128)
/* Constant 0x100000 (1 MiB). Presumably the cap on how many bytes of kernel
 * source are read from disk; the read itself is not among the visible lines. */
53 #define MAX_SOURCE_SIZE (0x100000)
/* Prints `title` as a banner: a blank line, then the title framed by ten '='
 * characters on each side, then a newline.
 * NOTE(review): the expansion already ends in ';', so `PRINT_LINE(x);` expands
 * to two statements (the call plus an empty one) and cannot be used as the
 * unbraced body of an `if` that has an `else`. `title` is also not
 * parenthesized in the expansion. */
54 #define PRINT_LINE(title) printf("\n========== %s ==========\n", title);
/* init_vec: takes an int buffer `vec`, its element count `len`, and
 * `set_one_flag`. Only the header of a loop over [0, len) is visible; what is
 * stored in each element, and how `set_one_flag` is used, cannot be confirmed
 * from this listing. */
56 void init_vec(
int *vec,
int len,
int set_one_flag) {
57 for (
int i = 0; i < len; i++) {
/* Seeds the C library PRNG from the current wall-clock time, then loops over
 * [0, len). Presumably the body of rand_vec; its header and loop body are not
 * visible. */
66 srand((
unsigned)time(0));
67 for (
int i = 0; i < len; i++) {
/* Loop over [0, len); presumably add_vec_cpu's element-wise sum. The body is
 * not visible. */
73 for (
int i = 0; i < len; i++) {
/* Prints every element of `vec` in index order, each followed by one space.
 * Presumably the body of print_vec. */
79 for (
int i = 0; i < len; i++) {
80 printf(
"%d ", vec[i]);
/* Loop over [0, len); presumably check_result's comparison pass. The body is
 * not visible. */
87 for (
int i = 0; i < len; i++) {
/* Reports two integers and the ratio correct_num / len as a float with two
 * decimals. The two integer arguments are on lines not shown here. */
92 printf(
"correct rate: %d / %d , %1.2f\n",
95 (
float)correct_num / len);
/* Driver body: allocates host vectors, times a CPU run, then sets up OpenCL,
 * runs the "add_vec_gpu" kernel and compares the results. The enclosing
 * function header is not among the visible lines (presumably main). */
/* `start` / `finish` are declared here; no use of them is visible. */
99 struct timeval start, finish;
/* Seeds the C library PRNG from the current time. */
101 srand((
unsigned)time(NULL));
102 clock_t startTime, endTime;
/* Four host buffers of `len` ints each. Judging by the comparison printed
 * further down, `c` holds the CPU result and `c_d` the result read back from
 * the device.
 * NOTE(review): no NULL check on these malloc results is visible, and no
 * matching free() appears among the visible cleanup lines. */
107 int *a, *b, *c, *c_d;
108 a = (
int *)malloc(len *
sizeof(
int));
109 b = (
int *)malloc(len *
sizeof(
int));
110 c = (
int *)malloc(len *
sizeof(
int));
111 c_d = (
int *)malloc(len *
sizeof(
int));
/* Byte size of one vector; used as the size of each device buffer below. */
112 size_t data_size = len *
sizeof(int);
/* Converts a clock() tick difference to seconds and prints it as the CPU
 * time. The assignments to startTime / endTime are on lines not shown. */
131 totalTime = (double)(endTime - startTime) / CLOCKS_PER_SEC;
133 printf(
"CPU: %f\n", totalTime);
/* OpenCL handles, all starting out as NULL. */
136 cl_mem a_buff, b_buff, c_buff;
137 a_buff = b_buff = c_buff = NULL;
139 cl_platform_id platform_id = NULL;
140 cl_uint ret_num_platforms;
142 cl_device_id device_id = NULL;
/* NOTE(review): ret_num_devices is declared, but the visible clGetDeviceIDs
 * call passes NULL for the device-count output, so it is not written there. */
143 cl_uint ret_num_devices;
145 cl_context context = NULL;
146 cl_kernel kernel = NULL;
147 cl_program program = NULL;
149 cl_command_queue command_queue = NULL;
/* Kernel source path, relative to the process's current working directory. */
155 char fileName[] =
"./_gpu_kernel_mtx_add.c";
/* Opens the kernel source in text read mode; the message below goes to
 * stderr, presumably when fopen returns NULL (the test is not visible). */
159 fp = fopen(fileName,
"r");
162 fprintf(stderr,
"Failed to load kernel.\n");
/* Requests at most one platform ID. */
170 ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
171 if (ret != CL_SUCCESS) {
172 printf(
"Failed to get platform ID.\n");
/* Requests at most one GPU-type device on that platform. */
176 ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
177 if (ret != CL_SUCCESS) {
178 printf(
"Failed to get device ID.\n");
/* NOTE(review): the last argument (errcode_ret) is NULL, so `ret` is not
 * updated by this call. The check that follows re-tests the value left by
 * clGetDeviceIDs and cannot detect a context-creation failure. Passing &ret
 * as the final argument would make the check meaningful. */
182 context = clCreateContext(NULL, 1, &device_id, NULL, NULL, NULL);
183 if (ret != CL_SUCCESS) {
184 printf(
"Failed to create OpenCL context.\n");
/* Command queue with default properties (0). clCreateCommandQueue is the
 * OpenCL 1.x entry point, deprecated since OpenCL 2.0. */
187 command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
188 if (ret != CL_SUCCESS) {
189 printf(
"Failed to create command queue %d\n", (
int)ret);
/* Device buffers: a and b are read-only for the kernel, c is write-only.
 * NOTE(review): each call overwrites `ret`, so the first two error codes are
 * discarded unchecked; no check of the third is visible before `ret` is
 * reassigned by the first buffer write. */
193 a_buff = clCreateBuffer(context, CL_MEM_READ_ONLY, data_size, NULL, &ret);
194 b_buff = clCreateBuffer(context, CL_MEM_READ_ONLY, data_size, NULL, &ret);
195 c_buff = clCreateBuffer(context, CL_MEM_WRITE_ONLY, data_size, NULL, &ret);
/* Two host-to-device writes (arguments not visible).
 * NOTE(review): the status codes are combined with |=. OpenCL error codes are
 * negative integers, so when both calls fail with different codes the value
 * printed below may match neither of them. */
197 ret = clEnqueueWriteBuffer(command_queue,
206 ret |= clEnqueueWriteBuffer(command_queue,
215 if (ret != CL_SUCCESS) {
216 printf(
"Failed to copy date from host to device: %d\n", (
int)ret);
/* Creates the program object from the loaded source text. */
220 program = clCreateProgramWithSource(context,
222 (
const char **)&source_str,
223 (
const size_t *)&source_size,
225 if (ret != CL_SUCCESS) {
226 printf(
"Failed to create OpenCL program from source %d\n", (
int)ret);
/* Builds for the single selected device with no build options; on failure
 * the build log is fetched and printed. */
230 ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
231 if (ret != CL_SUCCESS) {
232 printf(
"Failed to build program %d\n", (
int)ret);
/* NOTE(review): 16348 looks like a transposition of 16384 (16 KiB) - confirm.
 * The size passed to clGetProgramBuildInfo is not visible here. */
233 char build_log[16348];
234 clGetProgramBuildInfo(program,
236 CL_PROGRAM_BUILD_LOG,
240 printf(
"Error in kernel: %s\n", build_log);
/* Looks up the kernel entry point named "add_vec_gpu". */
244 kernel = clCreateKernel(program,
"add_vec_gpu", &ret);
245 if (ret != CL_SUCCESS) {
246 printf(
"Failed to create kernel %d\n", (
int)ret);
/* Kernel arguments: 0..2 are the a, b and c device buffers, 3 is `len` passed
 * by value with sizeof(cl_int). The declaration of `len` is not visible - it
 * needs to be a 32-bit int for that size to be right. Status codes are OR-ed
 * together as above. */
249 ret = clSetKernelArg(kernel, 0,
sizeof(cl_mem), (
void *)&a_buff);
250 ret |= clSetKernelArg(kernel, 1,
sizeof(cl_mem), (
void *)&b_buff);
251 ret |= clSetKernelArg(kernel, 2,
sizeof(cl_mem), (
void *)&c_buff);
252 ret |= clSetKernelArg(kernel, 3,
sizeof(cl_int), (
void *)&len);
253 if (ret != CL_SUCCESS) {
254 printf(
"Failed to set kernel arguments %d\n", (
int)ret);
/* Work sizes. The local size is set to `len`, and the expression below rounds
 * `len` up to a multiple of the local size (the assignment target is on a
 * line not shown, presumably global_work_size). With local == len that is
 * just `len`.
 * NOTE(review): unless local_work_size is reassigned on a hidden line, a
 * `len` larger than the device's maximum work-group size makes the enqueue
 * fail, and `len` == 0 produces 0/0 in the float division. */
262 size_t global_work_size, local_work_size;
264 local_work_size = len;
267 (size_t)ceil(len / (
float)local_work_size) * local_work_size;
/* Launches the kernel (arguments not visible). */
271 ret = clEnqueueNDRangeKernel(command_queue,
280 if (ret != CL_SUCCESS) {
281 printf(
"Failed to execute kernel for execution %d\n", (
int)ret);
/* Copies a result back from the device (arguments not visible; presumably
 * c_buff into c_d). */
287 ret = clEnqueueReadBuffer(command_queue,
296 if (ret != CL_SUCCESS) {
297 printf(
"Failed to copy data from device to host %d\n", (
int)ret);
/* Prints a banner, then a line about the last element that includes whether
 * c_d[len - 1] equals c[len - 1]. */
302 PRINT_LINE(
"CHECK RESULT cpu-verison && gpu-version");
306 printf(
"len-1=%d, c_d[%d]==c[%d]: %d, c_d[%d]=%d, c[%d]=%d \n",
310 c_d[len - 1] == c[len - 1],
/* Prints a three-column table: index, c[i], c_d[i], one row per element. */
316 PRINT_LINE(
"CHECK RESULT ELEMENT BY ELEMENT");
317 printf(
"idx c c_d\n");
318 for (
int i = 0; i < len; i++) {
319 printf(
"%2d %2d %2d \n", i, c[i], c_d[i]);
/* Teardown: flush the queue and wait for it to finish, then release the
 * kernel, program, the three buffers, the queue and the context. Return
 * values of these calls are not examined. */
326 clFlush(command_queue);
327 clFinish(command_queue);
328 clReleaseKernel(kernel);
329 clReleaseProgram(program);
331 clReleaseMemObject(a_buff);
332 clReleaseMemObject(b_buff);
333 clReleaseMemObject(c_buff);
335 clReleaseCommandQueue(command_queue);
336 clReleaseContext(context);
void rand_vec(int *vec, int len)
#define PRINT_LINE(title)
void add_vec_cpu(const int *a, const int *b, int *res, const int len)
void init_vec(int *vec, int len, int set_one_flag)
GPU kernel acceleration utility/helper functions.
void print_vec(int *vec, int len)
void check_result(int *v1, int *v2, int len)