/**
 * OpenCL C GPU kernel: matrix product C = A * B, one output element per
 * work-item.
 *
 * The work-item with global id (tx, ty) = (get_global_id(0), get_global_id(1))
 * accumulates the dot product of row `ty` of A with column `tx` of B and
 * stores it at row `ty`, column `tx` of C. All three matrices are addressed
 * row-major.
 *
 * @param C  Output matrix; written with a row stride of wB elements.
 * @param A  Left operand; read with a row stride of wA elements.
 * @param B  Right operand; read with a row stride of wB elements.
 * @param wA Number of columns of A (and number of rows of B that are read).
 * @param wB Number of columns of B, and therefore of C.
 *
 * The kernel takes no height arguments and performs no bounds checks, so the
 * launch range decides which elements are touched.
 * NOTE(review): presumably the host enqueues a 2-D range of exactly
 * wB x (rows of A) work-items -- confirm against the host code.
 */
__kernel void matrixMul(__global float *C,
                        __global const float *A,
                        __global const float *B,
                        int wA,
                        int wB) {
    int tx = get_global_id(0);  /* column of C / B handled by this work-item */
    int ty = get_global_id(1);  /* row of C / A handled by this work-item */

    /* Dot product of A's row `ty` with B's column `tx`. */
    float value = 0.0f;
    for (int k = 0; k < wA; ++k) {
        value += A[ty * wA + k] * B[k * wB + tx];
    }

    /* C has as many columns as B, so its row stride is wB (not wA). Using wA
     * here is only correct when the two widths happen to be equal. */
    C[ty * wB + tx] = value;
}