/**
* OpenCL C GPU kernel
*/
/**
 * Computes one element of the matrix product C = A * B.
 *
 * Each work-item produces the element of C at row get_global_id(1) and
 * column get_global_id(0). All three matrices are indexed as flat,
 * row-major arrays.
 *
 * @param C   output matrix; rows have wB elements (one per column of B)
 * @param A   left operand, read-only; rows have wA elements
 * @param B   right operand, read-only; rows have wB elements, and wA rows
 *            are read
 * @param wA  width of A (number of columns of A / rows of B)
 * @param wB  width of B (number of columns of B and of C)
 *
 * No bounds checks are performed: the global work size is assumed to match
 * the dimensions of C exactly (wB in dimension 0, height of A in
 * dimension 1) -- the launch configuration is not visible here, so confirm
 * against the host code.
 */
__kernel void matrixMul(__global float *C,
                        __global const float *A,
                        __global const float *B,
                        int wA,
                        int wB) {
    const int col = get_global_id(0);
    const int row = get_global_id(1);

    // Dot product of row `row` of A with column `col` of B.
    float value = 0.0f;
    for (int k = 0; k < wA; ++k) {
        float elementA = A[row * wA + k];
        float elementB = B[k * wB + col];
        value += elementA * elementB;
    }

    // C has as many columns as B, so its row stride is wB (not wA);
    // using wA here is only correct for square matrices.
    C[row * wB + col] = value;
}