Chapter 3 Exercise

// Square matrix multiplication P = M * N where each thread produces one
// complete row of P. All three matrices are indexed as flat row-major arrays
// of Width*Width floats.
//
// Launch layout: 1D grid of 1D blocks providing at least Width threads in
// total; threads whose global index is >= Width do nothing.
//
// M, N  : input matrices, only read by this kernel.
// P     : output matrix; every element is overwritten.
// Width : number of rows (and columns) of each matrix.
//
// Access pattern: at any given step, neighbouring threads read M and write P
// at addresses Width elements apart, while all of them read the same element
// of N.
__global__ void MatrixMulKernel(float* M, float* N,
                                float* P, int Width) {
    int row = blockIdx.x*blockDim.x+threadIdx.x;
    if (row < Width) {
        // Flat offsets are formed in size_t: row*Width no longer fits in a
        // 32-bit int once Width exceeds 46340.
        const size_t width = static_cast<size_t>(Width);
        const size_t rowBase = static_cast<size_t>(row) * width;
        for (size_t col = 0; col < width; ++col) {
            float Pvalue = 0.0f;
            // Dot product of row `row` of M with column `col` of N.
            for (size_t k = 0; k < width; ++k) {
                Pvalue += M[rowBase + k] * N[k * width + col];
            }
            P[rowBase + col] = Pvalue;
        }
    }
}
// Host-side launch: 1D grid sized so that (assuming ceil_div rounds up) there
// is at least one thread per row of P. M, N and P must be pointers the device
// can dereference.
MatrixMulKernel<<<ceil_div(Width, THREADS_PER_BLOCK), THREADS_PER_BLOCK>>>(M, N, P, Width);

// Square matrix multiplication P = M * N where each thread produces one
// complete column of P. All three matrices are indexed as flat row-major
// arrays of Width*Width floats.
//
// Launch layout: 1D grid of 1D blocks providing at least Width threads in
// total; threads whose global index is >= Width do nothing.
//
// M, N  : input matrices, only read by this kernel.
// P     : output matrix; every element is overwritten.
// Width : number of rows (and columns) of each matrix.
//
// Access pattern: at any given step, neighbouring threads read N and write P
// at adjacent addresses, while all of them read the same element of M.
__global__ void MatrixMulKernel(float* M, float* N,
                                float* P, int Width) {
    int col = blockIdx.x*blockDim.x+threadIdx.x;
    if (col < Width) {
        // Flat offsets are formed in size_t: row*Width no longer fits in a
        // 32-bit int once Width exceeds 46340.
        const size_t width = static_cast<size_t>(Width);
        const size_t column = static_cast<size_t>(col);
        for (size_t row = 0; row < width; ++row) {
            const size_t rowBase = row * width;
            float Pvalue = 0.0f;
            // Dot product of row `row` of M with column `col` of N.
            for (size_t k = 0; k < width; ++k) {
                Pvalue += M[rowBase + k] * N[k * width + column];
            }
            P[rowBase + column] = Pvalue;
        }
    }
}
// Host-side launch: 1D grid sized so that (assuming ceil_div rounds up) there
// is at least one thread per column of P. M, N and P must be pointers the
// device can dereference.
MatrixMulKernel<<<ceil_div(Width, THREADS_PER_BLOCK), THREADS_PER_BLOCK>>>(M, N, P, Width);

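c. Within a single thread, for a., the access to N is not consecutive; for b., the access to M is consecutive. Across adjacent threads, design a. reads M and writes P with a stride of Width elements (uncoalesced), while design b. reads N and writes P at consecutive addresses (coalesced), so b. uses global memory bandwidth more efficiently.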

// Matrix-vector product A = B * C, one output element per thread.
//
// Launch layout: 1D grid of 1D blocks providing at least Width threads in
// total; threads whose global index is >= Width do nothing.
//
// A     : output vector of Width floats; element i is written by thread i.
// B     : input matrix, indexed as a flat row-major Width*Width array.
// C     : input vector of Width floats.
// Width : number of rows/columns of B and length of A and C.
__global__ void MatrixVectorMulKernel(float* A, float* B,
                                      float* C, int Width) {
    int i = blockIdx.x*blockDim.x+threadIdx.x;
    if (i < Width) {
        // The row offset is formed in size_t: i*Width no longer fits in a
        // 32-bit int once Width exceeds 46340.
        const size_t width = static_cast<size_t>(Width);
        const float* bRow = B + static_cast<size_t>(i) * width;
        float Pvalue = 0.0f;
        // Dot product of row i of B with the vector C.
        for (size_t k = 0; k < width; ++k) {
            Pvalue += bRow[k] * C[k];
        }
        A[i] = Pvalue;
    }
}
// Host-side launch: 1D grid sized so that (assuming ceil_div rounds up) there
// is at least one thread per element of A. A, B and C must be pointers the
// device can dereference.
MatrixVectorMulKernel<<<ceil_div(Width, THREADS_PER_BLOCK), THREADS_PER_BLOCK>>>(A, B, C, Width);

a. 16*32 = 512 b. 512 * 95 = 48640 c. ((300-1)/16 + 1) * ((150-1)/32 + 1) = 19 * 5 = 95 d. 150 * 300 = 45000
a. 20*400+10 = 8010 b. 10*500+20 = 5020

5. 5*(400*500) + 20*400 + 10 = 1008010