Implementing Parallel Programming With CUDA

This article demonstrates how to write CUDA kernel functions through five examples (matrix transposition, matrix multiplication, histogram statistics, reduction sum, and the TOP K problem), using the GPU's many cores to accelerate large-scale problems.

0.1 Matrix Transposition

0.1.1 Algorithm Process

  1. Copy the matrix to be transposed to GPU memory.
  2. Allocate space on the GPU for the transposed matrix.
  3. Define a CUDA kernel function that performs the transposition. The kernel uses a grid of thread blocks to cover all elements of the matrix; within each block, threads stage data in shared memory and then write the transposed result back to global memory.
  4. Call the CUDA kernel function to perform the matrix transposition.
  5. Copy the transposed matrix from GPU memory back to host memory.
  6. Release the GPU memory.

0.1.2 Code Implementation

Compared with a naive implementation, the code below uses shared memory so that both the global-memory reads and the global-memory writes are coalesced, and pads the shared-memory tile by one column to avoid bank conflicts.

#include <cstdlib>
#include <cuda_runtime.h>

const int N = 10000;
const int TILE_DIM = 32;
const int grid_size_x = (N + TILE_DIM - 1) / TILE_DIM;
const int grid_size_y = grid_size_x;
const dim3 block_size(TILE_DIM, TILE_DIM);
const dim3 grid_size(grid_size_x, grid_size_y);
__global__ void transpose(const float *A, float *B, const int N){  
    __shared__ float S[TILE_DIM][TILE_DIM+1];
    int bx = blockIdx.x * TILE_DIM;
    int by = blockIdx.y * TILE_DIM;
    // Coalesced (sequential) read from global memory into the shared-memory tile
    int nx1 = bx + threadIdx.x;
    int ny1 = by + threadIdx.y;
    if (nx1 < N && ny1 < N)
        S[threadIdx.y][threadIdx.x] = A[ny1 * N + nx1];
    __syncthreads();
    // Coalesced (sequential) write of the transposed tile back to global memory
    int nx2 = bx + threadIdx.y;
    int ny2 = by + threadIdx.x;
    if (nx2 < N && ny2 < N)
        B[nx2 * N + ny2] = S[threadIdx.x][threadIdx.y];
}

int main(){  
    int nn = N * N;
    int nBytes = nn * sizeof(float);
    float a = 1.1;
    float b = 2.2;
    //CPU space allocation
    float* A = (float*)malloc(nBytes);
    float* B = (float*)malloc(nBytes);
    //Matrix initialization
    for (int i = 1; i <= nn; i++) {
        if (i % 2 == 0)
            A[i-1] = a;
        else
            A[i-1] = b;
    }
    // GPU space allocation (device pointers must be declared before cudaMalloc is called)
    float *d_A, *d_B;
    cudaMalloc((void**)&d_A, nBytes);
    cudaMalloc((void**)&d_B, nBytes);
    cudaMemcpy(d_A, A, nBytes, cudaMemcpyHostToDevice);
    transpose<<<grid_size, block_size>>>(d_A, d_B, N);
    cudaMemcpy(B, d_B, nBytes, cudaMemcpyDeviceToHost);
    free(A);
    free(B);
    cudaFree(d_A);
    cudaFree(d_B);
    return 0;
}
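The program above does not check whether the kernel actually launched. As a minimal sketch (not part of the original program), a check like the following could be placed in main() after the device-to-host copy; it assumes <cstdio> is included for printf:

// Minimal launch check and spot verification (sketch; assumes #include <cstdio>)
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
    printf("transpose launch failed: %s\n", cudaGetErrorString(err));
// After the copy back, B should be the transpose of A: B[j * N + i] == A[i * N + j]
printf("A[0][1] = %f, B[1][0] = %f\n", A[0 * N + 1], B[1 * N + 0]);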

0.2 Matrix Multiplication

0.2.1 Kernel Function Design

Use of Shared Memory: tiles of the two input matrices are staged in shared memory, so each global-memory element is loaded once per tile instead of once per multiply-add, reducing global-memory traffic and improving performance.

Handling Non-square Matrices: when the matrices are not square, or their dimensions are not multiples of the tile width, the tile loads and the final write need boundary checks, and out-of-range tile entries are padded with zeros.

0.2.2 Code Implementation

__global__ void matrixmul(const float *A, const float *B, float *C,const int N,const int M,const int K){   //N*M x M*K
    __shared__ float A_S[TILE_DIM][TILE_DIM];
    __shared__ float B_S[TILE_DIM][TILE_DIM];
    int tx = threadIdx.x;
    int ty = threadIdx.y;
    int bx = blockIdx.x;
    int by = blockIdx.y;
    int row = by * TILE_DIM + ty;
    int col = bx * TILE_DIM + tx;
    float value = 0;
    // Number of tiles needed to cover the inner (M) dimension
    for (int ph = 0; ph < (M + TILE_DIM - 1) / TILE_DIM; ph++) {
        // Load one tile of A and one tile of B, padding out-of-range entries with zeros
        if (row < N && ph * TILE_DIM + tx < M)
            A_S[ty][tx] = A[row * M + ph * TILE_DIM + tx];
        else
            A_S[ty][tx] = 0.0f;
        if (col < K && ph * TILE_DIM + ty < M)
            B_S[ty][tx] = B[(ph * TILE_DIM + ty) * K + col];
        else
            B_S[ty][tx] = 0.0f;
        __syncthreads();
        // Multiply the two tiles and accumulate the partial dot product
        for (int k = 0; k < TILE_DIM; k++)
            value += A_S[ty][k] * B_S[k][tx];
        __syncthreads();
    }
    if (row < N && col < K)
        C[row * K + col] = value;
}
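The source shows only the kernel. A host-side launch for an N x M by M x K multiplication might look like the sketch below, where d_A, d_B, and d_C are assumed to be device buffers that have already been allocated and filled:

// Host-side launch (sketch): d_A, d_B, d_C are assumed device buffers of size N*M, M*K, N*K
dim3 block(TILE_DIM, TILE_DIM);
dim3 grid((K + TILE_DIM - 1) / TILE_DIM,    // blocks along the columns of C
          (N + TILE_DIM - 1) / TILE_DIM);   // blocks along the rows of C
matrixmul<<<grid, block>>>(d_A, d_B, d_C, N, M, K);
cudaDeviceSynchronize();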

0.3 Histogram Statistics

Use of Shared Memory: each block accumulates a private histogram in shared memory, so most atomic updates hit fast shared memory instead of global memory, improving performance.

Grid-Stride Loop: when the input is larger than the total number of threads, each thread strides through the data and processes multiple elements.

Atomic Operations: both the shared-memory updates and the final merge into global memory must use atomic additions to avoid conflicts between threads.

// Assumes the kernel is launched with exactly 256 threads per block, one thread per histogram bin
__global__ void histo_kernel(unsigned char *buffer, long size, unsigned int *histo){
    __shared__ unsigned int temp[256];   // per-block histogram in shared memory
    temp[threadIdx.x] = 0;
    __syncthreads();
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    int offset = blockDim.x * gridDim.x;
    while (i < size) {
        atomicAdd(&temp[buffer[i]], 1);
        i += offset;
    }
    __syncthreads();
    // Merge this block's shared-memory histogram into the global histogram
    atomicAdd(&(histo[threadIdx.x]), temp[threadIdx.x]);
}
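Only the kernel appears in the source. A host-side launch could look like the sketch below; the names d_buffer and h_histo and the grid size are assumptions, and the kernel requires exactly 256 threads per block and a zero-initialized histogram:

// Host-side launch (sketch): d_buffer is assumed to hold `size` bytes on the device
unsigned int *d_histo;
cudaMalloc((void**)&d_histo, 256 * sizeof(unsigned int));
cudaMemset(d_histo, 0, 256 * sizeof(unsigned int));   // kernel accumulates into histo, so it must start at zero
int blocks = 64;   // assumed grid size; the grid-stride loop covers any data size
histo_kernel<<<blocks, 256>>>(d_buffer, size, d_histo);
unsigned int h_histo[256];
cudaMemcpy(h_histo, d_histo, 256 * sizeof(unsigned int), cudaMemcpyDeviceToHost);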

0.4 Reduction Sum

Grid-Stride Loop: for massive inputs (more than about a million elements), launching one thread per element is impractical, so each thread first accumulates several elements by striding through the input.

Shared Memory Optimization: each thread stores its partial sum in shared memory, so the per-block reduction stage does not touch global memory.

Interleaved Pair Method: reducing in interleaved pairs keeps the active threads contiguous, which alleviates some of the thread divergence.

Atomic Operation: when each block adds its partial sum to the result in global memory, an atomic addition is required to avoid write conflicts.

#define BLOCK_SIZE 256   // threads per block; value assumed, must match the launch configuration

__global__ void _sum_gpu(int *input, int count, int *output)
{
    __shared__ int sum_per_block[BLOCK_SIZE];

    int temp = 0;
    for (int idx = threadIdx.x + blockDim.x * blockIdx.x;
         idx < count; idx += gridDim.x * blockDim.x)
        temp += input[idx];
    sum_per_block[threadIdx.x] = temp;
    __syncthreads();

    //********** shared-memory summation stage **********
    for (int length = BLOCK_SIZE / 2; length >= 1; length /= 2)
    {
        int double_kill = -1;
        if (threadIdx.x < length)
            double_kill = sum_per_block[threadIdx.x] + sum_per_block[threadIdx.x + length];
        __syncthreads();  // first barrier: every thread must read both operands before any of them is overwritten

        if (threadIdx.x < length)
            sum_per_block[threadIdx.x] = double_kill;
        __syncthreads();  // second barrier: the updated partial sums must be visible before the next iteration
    }   // the per-block partial sum is now in sum_per_block[0]

    // One thread per block adds its partial sum to the global result
    if (threadIdx.x == 0) atomicAdd(output, sum_per_block[0]);
}
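The kernel accumulates every block's partial sum into a single integer with atomicAdd, so the output must be zeroed before the launch. A host-side sketch, assuming d_input and count are already set up:

// Host-side launch (sketch): d_input holds `count` ints on the device
int *d_output;
cudaMalloc((void**)&d_output, sizeof(int));
cudaMemset(d_output, 0, sizeof(int));   // the kernel adds into *output, so it must start at zero
int grid_size = (count + BLOCK_SIZE - 1) / BLOCK_SIZE;   // a smaller grid also works thanks to the grid-stride loop
_sum_gpu<<<grid_size, BLOCK_SIZE>>>(d_input, count, d_output);
int h_sum;
cudaMemcpy(&h_sum, d_output, sizeof(int), cudaMemcpyDeviceToHost);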

0.5 TOP K Problem

The TOP K problem is to find the K largest or smallest elements in a data set. It can be solved efficiently with CUDA reduction. The detailed steps for selecting the top K largest/smallest elements with a reduction are as follows:

  1. Define a tuple type that holds a value and its index, so that the original data can be recovered after the reduction.
  2. Traverse the original data and store each element, together with its index, in a tuple.
  3. Perform a reduction over the tuples to obtain the indices of the top K largest/smallest values.
  4. Gather the original data by those indices and sort it to obtain the top K largest/smallest values.

The implementation process is as follows:

  1. Copy the data to GPU memory:

float *d_data;
cudaMalloc((void**)&d_data, size);
cudaMemcpy(d_data, h_data, size, cudaMemcpyHostToDevice);

  2. Store the data in tuples:
typedef struct {
    float value;
    int index;
} Tuple;

Tuple *d_tuples;
cudaMalloc((void**)&d_tuples, n * sizeof(Tuple));
int threadsPerBlock = 256; 
int blocksPerGrid = (n + threadsPerBlock - 1) / threadsPerBlock;
initializeTuples<<<blocksPerGrid, threadsPerBlock>>>(d_data, d_tuples, n);
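The initializeTuples kernel is not defined in the original listing; a minimal version, assuming one thread per element, might look like this:

// Sketch of initializeTuples (not in the original): each thread pairs one value with its index
__global__ void initializeTuples(const float *data, Tuple *tuples, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        tuples[i].value = data[i];
        tuples[i].index = i;
    }
}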
  3. Perform a reduction over the tuples to obtain the indices of the top K largest/smallest values:
int *d_indices;
cudaMalloc((void**)&d_indices, blocksPerGrid * sizeof(int));
// The kernel uses dynamically sized shared memory, so its size is passed as the third launch parameter
kReduceKernel<<<blocksPerGrid, threadsPerBlock, threadsPerBlock * sizeof(Tuple)>>>(d_tuples, d_indices, n, k);

__global__ void kReduceKernel(Tuple *input, int *output, int n, int k) {
    extern __shared__ Tuple shared[];
    int tid = threadIdx.x;
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    // Out-of-range threads load a sentinel (-FLT_MAX, from <cfloat>) so they never win the max comparison
    shared[tid] = (i < n) ? input[i] : Tuple{-FLT_MAX, -1};
    __syncthreads();
    for (int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (tid < s)
            shared[tid] = (shared[tid].value > shared[tid + s].value) ? shared[tid] : shared[tid + s];
        __syncthreads();
    }

    if (tid == 0)
        output[blockIdx.x] = shared[0].index;
}
  4. Copy the indices back to the host, gather the original data by index, and sort to obtain the top K largest/smallest values:
int *h_indices = (int*)malloc(blocksPerGrid * sizeof(int));
float *h_result = (float*)malloc(k * sizeof(float));
cudaMemcpy(h_indices, d_indices, blocksPerGrid * sizeof(int), cudaMemcpyDeviceToHost);
for (int i = 0; i < k; ++i) {
    int index = h_indices[i];
    h_result[i] = h_data[index];
}
std::sort(h_result, h_result + k);   // requires <algorithm>