1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
/**
 * OpenCL C GPU kernel: dense matrix product C = A * B.
 *
 * Each work-item computes exactly one element of C: global id 0 selects
 * the column and global id 1 selects the row. All three matrices are
 * addressed as flat row-major arrays.
 *
 * @param C   output matrix; each row holds wB elements
 * @param A   left operand (read-only); each row holds wA elements
 * @param B   right operand (read-only); each row holds wB elements
 * @param wA  width of A, which is also the length of the dot product
 * @param wB  width of B, which is also the width of C
 *
 * NOTE(review): the kernel performs no bounds checking, so it presumably
 * expects the global work size to match the dimensions of C exactly --
 * confirm against the host-side launch code.
 */
__kernel void matrixMul(__global float *C,
                        __global const float *A,
                        __global const float *B,
                        int wA,
                        int wB) {

    int tx = get_global_id(0);  // column of C handled by this work-item
    int ty = get_global_id(1);  // row of C handled by this work-item

    // Dot product of row ty of A with column tx of B.
    float value = 0.0f;
    for (int k = 0; k < wA; ++k) {
        float elementA = A[ty * wA + k];
        float elementB = B[k * wB + tx];
        value += elementA * elementB;
    }

    // Each work-item writes one element to device memory. C has the same
    // number of columns as B, so its row stride is wB; using wA here would
    // store to the wrong location whenever wA != wB.
    C[ty * wB + tx] = value;
}