Parallel Programming With CUDA

This article shows how to accelerate large-scale problems on the GPU's many cores by writing CUDA kernel functions, using matrix transposition and four other problems as examples.

0.1 Matrix Transposition

0.1.1 Algorithm Process

  1. Store the matrix to be transposed in GPU memory.
  2. Allocate space on the GPU to store the transposed matrix.
  3. Define a CUDA kernel function that implements matrix transposition. The kernel uses a grid of thread blocks to cover every element of the matrix; within each block, threads stage the data in shared memory and then write the result back to global memory.
  4. Call the CUDA kernel function to execute matrix transposition.
  5. Copy the transposed matrix from GPU memory to host memory.
  6. Release GPU memory.

0.1.2 Code Implementation

Compared with a straightforward implementation, shared memory is used here so that global-memory accesses stay coalesced, and the shared-memory tile is padded to avoid bank conflicts.

#include <cstdlib>
#include <cuda_runtime.h>

const int N = 10000;
const int TILE_DIM = 32;
const int grid_size_x = (N + TILE_DIM - 1) / TILE_DIM;
const int grid_size_y = grid_size_x;
const dim3 block_size(TILE_DIM, TILE_DIM);
const dim3 grid_size(grid_size_x, grid_size_y);
__global__ void transpose(const float *A, float *B, const int N){  
    __shared__ float S[TILE_DIM][TILE_DIM + 1];   // +1 column of padding avoids shared-memory bank conflicts
    int bx = blockIdx.x * TILE_DIM;
    int by = blockIdx.y * TILE_DIM;
    // Coalesced read from global memory into the shared-memory tile
    int nx1 = bx + threadIdx.x;
    int ny1 = by + threadIdx.y;
    if (nx1 < N && ny1 < N)
        S[threadIdx.y][threadIdx.x] = A[ny1 * N + nx1];
    __syncthreads();
    // Coalesced write of the transposed tile back to global memory
    int nx2 = bx + threadIdx.y;
    int ny2 = by + threadIdx.x;
    if (nx2 < N && ny2 < N)
        B[nx2 * N + ny2] = S[threadIdx.x][threadIdx.y];
}

int main(){  
    int nn = N * N;
    int nBytes = nn * sizeof(float);
    float a = 1.1f;
    float b = 2.2f;
    // CPU space allocation
    float* A = (float*)malloc(nBytes);
    float* B = (float*)malloc(nBytes);
    // Matrix initialization
    for (int i = 1; i <= nn; i++) {
        if (i % 2 == 0)
            A[i-1] = a;
        else
            A[i-1] = b;
    }
    // GPU space allocation (device pointers are declared first, then allocated with cudaMalloc)
    float* d_A, * d_B;
    cudaMalloc((void**)&d_A, nBytes);
    cudaMalloc((void**)&d_B, nBytes);
    cudaMemcpy(d_A, A, nBytes, cudaMemcpyHostToDevice);
    transpose<<<grid_size, block_size>>>(d_A, d_B, N);
    cudaMemcpy(B, d_B, nBytes, cudaMemcpyDeviceToHost);
    free(A);
    free(B);
    cudaFree(d_A);
    cudaFree(d_B);
    return 0;
}
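
To sanity-check the result before the buffers are freed, a small host-side helper can compare B against the definition of the transpose. The helper below is a minimal sketch and is not part of the original listing; the name check_transpose is illustrative.

// Hypothetical verification helper: the transposed matrix must satisfy
// B[j * n + i] == A[i * n + j] for every element.
bool check_transpose(const float *A, const float *B, int n) {
    for (int i = 0; i < n; i++)
        for (int j = 0; j < n; j++)
            if (B[j * n + i] != A[i * n + j])
                return false;
    return true;
}

Calling check_transpose(A, B, N) right after the device-to-host copy (and before free) confirms that the kernel produced a correct transpose.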

0.2 Matrix Multiplication

0.2.1 Kernel Function Design

Use of shared memory: tiles of the two input matrices are staged in shared memory to reduce global-memory accesses and improve performance.

Handling non-square matrices: when multiplying two non-square matrices, the tile loads must respect the actual matrix dimensions, and boundary checks are needed to prevent out-of-bounds accesses (out-of-range tile elements are padded with zeros).

0.2.2 Code Implementation

__global__ void matrixmul(const float *A, const float *B, float *C, const int N, const int M, const int K){   // C (N x K) = A (N x M) * B (M x K)
    __shared__ float A_S[TILE_DIM][TILE_DIM];
    __shared__ float B_S[TILE_DIM][TILE_DIM];
    int tx = threadIdx.x;
    int ty = threadIdx.y;
    int bx = blockIdx.x;
    int by = blockIdx.y;
    int row = by * TILE_DIM + ty;
    int col = bx * TILE_DIM + tx;
    float value = 0;
    // Loop over the tiles along the shared dimension M (ceiling division covers a partial final tile)
    for (int ph = 0; ph < (M + TILE_DIM - 1) / TILE_DIM; ph++) {
        // Load one tile of A and one tile of B into shared memory, zero-padding out-of-range elements
        if (row < N && ph * TILE_DIM + tx < M)
            A_S[ty][tx] = A[row * M + ph * TILE_DIM + tx];
        else
            A_S[ty][tx] = 0.0;
        if (col < K && ph * TILE_DIM + ty < M) 
            B_S[ty][tx] = B[(ph * TILE_DIM + ty) * K + col];
        else
            B_S[ty][tx] = 0.0;
        __syncthreads();

        // Multiply the two tiles currently resident in shared memory
        for (int k = 0; k < TILE_DIM; k++)
            value += A_S[ty][k] * B_S[k][tx];
        __syncthreads();
    }

    if (row < N && col < K)
        C[row*K+col]=value;
}
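
The article lists only the kernel for this section. Assuming device buffers d_A, d_B, and d_C have already been allocated and filled (these names and the sizes below are illustrative, not from the article), a host-side launch for the N x M by M x K case might look like the following sketch; the grid covers the output matrix C, so its x dimension spans the columns and its y dimension spans the rows.

// Hypothetical launch for C (n x k) = A (n x m) * B (m x k); the sizes are examples only.
const int n = 1000, m = 500, k = 2000;
const dim3 block(TILE_DIM, TILE_DIM);
const dim3 grid((k + TILE_DIM - 1) / TILE_DIM,   // blocks across the columns of C
                (n + TILE_DIM - 1) / TILE_DIM);  // blocks across the rows of C
matrixmul<<<grid, block>>>(d_A, d_B, d_C, n, m, k);
cudaDeviceSynchronize();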

0.3 Histogram Statistics

Use of shared memory: a temporary histogram is kept in shared memory to reduce accesses to global memory and improve performance.

Grid-stride looping: for large inputs there are more elements than threads, so each thread processes multiple elements.

Atomic operations: whether the counts are accumulated in shared memory or in global memory, atomic operations are required to avoid update conflicts.

// Assumes 256 bins and a launch with exactly 256 threads per block (one thread per bin)
__global__ void histo_kernel(unsigned char *buffer, long size, unsigned int *histo){
	__shared__ unsigned int temp[256];
	temp[threadIdx.x] = 0;   // each thread clears one bin of the per-block histogram
	__syncthreads();
	int i = threadIdx.x + blockIdx.x * blockDim.x;
	int offset = blockDim.x * gridDim.x;   // grid-stride step
	while (i<size){
		atomicAdd(&temp[buffer[i]], 1);
		i += offset;
	}
	__syncthreads();
	atomicAdd(&(histo[threadIdx.x]), temp[threadIdx.x]);   // merge this block's bins into the global histogram
}
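
The kernel clears and merges the per-block histogram with one thread per bin, so it must be launched with exactly 256 threads per block. A possible host-side launch is sketched below; the buffer names, grid size, and input size are illustrative assumptions.

// Hypothetical launch: exactly 256 threads per block, one per histogram bin.
long size = 1 << 20;                                  // example input size in bytes
unsigned char *d_buffer;
unsigned int *d_histo;
cudaMalloc((void**)&d_buffer, size);                  // the input bytes would be copied in with cudaMemcpy
cudaMalloc((void**)&d_histo, 256 * sizeof(unsigned int));
cudaMemset(d_histo, 0, 256 * sizeof(unsigned int));
int blocks = 64;                                      // modest grid; the grid-stride loop covers the remaining data
histo_kernel<<<blocks, 256>>>(d_buffer, size, d_histo);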

0.4 Reduction Summation

Grid-stride looping: for massive data (more than a million elements), it is impractical to launch one thread per element even on a GPU, so each thread processes multiple elements.

Shared-memory optimization: the partial sum each thread accumulates is stored in shared memory, reducing accesses to global memory.

Interleaved pairing: performing the in-block reduction with interleaved pairing alleviates some warp divergence.

Atomic operations: accumulating the per-block results into global memory can conflict, so an atomic operation is used.

__global__ void _sum_gpu(int *input, int count, int *output)
{
    __shared__ int sum_per_block[BLOCK_SIZE];

    // Grid-stride loop: each thread accumulates multiple elements into a private sum
    int temp = 0;
    for (int idx = threadIdx.x + blockDim.x * blockIdx.x;
         idx < count; idx += gridDim.x * blockDim.x)
        temp += input[idx];
    sum_per_block[threadIdx.x] = temp;
    __syncthreads();

    //********** shared-memory summation stage **********
    // Interleaved pairing: in each step the active half of the threads adds in the other half
    for (int length = BLOCK_SIZE / 2; length >= 1; length /= 2)
    {
        int partial = -1;
        if (threadIdx.x < length)
            partial = sum_per_block[threadIdx.x] + sum_per_block[threadIdx.x + length];
        __syncthreads();  // first barrier: every thread must finish reading before shared memory is overwritten

        if (threadIdx.x < length)
            sum_per_block[threadIdx.x] = partial;
        __syncthreads();  // second barrier: every thread must see the updated partial sums
    } // the per-block partial sum is now in sum_per_block[0]

    if (threadIdx.x == 0) atomicAdd(output, sum_per_block[0]);  // accumulate the block's result into the global sum
}
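
The listing relies on a BLOCK_SIZE constant that is not shown. Assuming it is defined as 256 before the kernel, a host-side launch could look like the sketch below; d_input, d_output, the element count, and the grid size are illustrative assumptions.

// Hypothetical host side; assumes `#define BLOCK_SIZE 256` appears before the kernel.
int count = 10000000;                                 // example element count, not from the article
int *d_input, *d_output;
cudaMalloc((void**)&d_input, count * sizeof(int));    // the input values would be copied in with cudaMemcpy
cudaMalloc((void**)&d_output, sizeof(int));
cudaMemset(d_output, 0, sizeof(int));                 // the kernel accumulates with atomicAdd, so start from zero
int grid = 1024;                                      // fixed grid; the grid-stride loop covers all elements
_sum_gpu<<<grid, BLOCK_SIZE>>>(d_input, count, d_output);
int total = 0;
cudaMemcpy(&total, d_output, sizeof(int), cudaMemcpyDeviceToHost);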

0.5 TOP K Problem

The TOP K problem is to find the K largest or smallest elements in a dataset. A CUDA reduction can solve it efficiently. The steps for selecting the top K largest/smallest elements with a CUDA reduction are as follows:

  1. Define a tuple type containing value and index for storing the original data and its index (to restore the original data after sorting)
  2. Traverse the original data and store the data in tuples
  3. Perform reduction operations on the tuples to obtain the indices of the top K largest/smallest values
  4. Restore the original data and sort by index to get the top K largest/smallest values

Specific implementation process:

  1. Copy the data to GPU memory
float *d_data;
cudaMalloc((void**)&d_data, size);
cudaMemcpy(d_data, h_data, size, cudaMemcpyHostToDevice);
  2. Store data in tuples
typedef struct {
    float value;
    int index;
} Tuple;

Tuple *d_tuples;
cudaMalloc((void**)&d_tuples, n * sizeof(Tuple));
int threadsPerBlock = 256; 
int blocksPerGrid = (n + threadsPerBlock - 1) / threadsPerBlock;
initializeTuples<<<blocksPerGrid, threadsPerBlock>>>(d_data, d_tuples, n);
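
The initializeTuples kernel invoked above is not listed in the article. A minimal sketch of what it might look like is given below (this is an assumption, not the original code); in a real source file it would have to be defined, or at least declared, before the launch above.

// Hypothetical definition of the initializeTuples kernel used above:
// each thread packs one value together with its original index.
__global__ void initializeTuples(const float *data, Tuple *tuples, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        tuples[i].value = data[i];
        tuples[i].index = i;
    }
}
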
  3. Perform reduction operations on the tuples to obtain the indices of the top K largest/smallest values
int *d_indices;
cudaMalloc((void**)&d_indices, blocksPerGrid * sizeof(int));
// The kernel uses dynamically sized shared memory, so its size must be supplied as the third launch parameter
kReduceKernel<<<blocksPerGrid, threadsPerBlock, threadsPerBlock * sizeof(Tuple)>>>(d_tuples, d_indices, n, k);
__global__ void kReduceKernel(Tuple *input, int *output, int n, int k) {
    extern __shared__ Tuple shared[];
    int tid = threadIdx.x;
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    shared[tid] = (i < n) ? input[i] : Tuple{0, 0};   // sentinel for out-of-range threads (assumes non-negative values)
    __syncthreads();
    // Per-block max reduction on the tuple values
    for (int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (tid < s)
            shared[tid] = (shared[tid].value > shared[tid + s].value) ? shared[tid] : shared[tid + s];
        __syncthreads();
    }

    if (tid == 0)
        output[blockIdx.x] = shared[0].index;
}
  4. Restore the original data on the CPU and sort by index to get the top K largest/smallest values
cudaMemcpy(h_indices, d_indices, blocksPerGrid * sizeof(int), cudaMemcpyDeviceToHost);
for (int i = 0; i < k; ++i) {
    int index = h_indices[i];
    h_result[i] = h_data[index];   // map each winning index back to its original value
}
std::sort(h_result, h_result + k);