CUDA实现并行编程

Tim 收录于计算机技术数据科学与机器学习

2023-05-08 2025-05-16 约 1900 字预计阅读 4 分钟 - 次阅读

本文通过编写CUDA核函数，以矩阵转置等5个问题为例，利用GPU众核对大规模问题的求解加速。

0.1 矩阵转置

0.1.1 算法流程

将需要转置的矩阵存储到GPU内存中。
在GPU上分配空间存储转置后的矩阵。
定义CUDA核函数来实现矩阵转置。该核函数应该使用线程块和线程格的概念来处理矩阵中的所有元素。在每个线程块中，线程可以使用共享内存来处理数据。最后，利用全局内存将结果写回到GPU。
调用CUDA核函数以执行矩阵转置。
将转置后的矩阵从GPU内存复制到主机内存中。
释放GPU内存

0.1.2 代码实现

在传统代码的基础上，利用共享内存优化访问全局内存的方式，同时使用padding操作，避免bank冲突。

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51


const int N = 10000;
const int TILE_DIM = 32;
const int grid_size_x = (N + TILE_DIM - 1) / TILE_DIM;
const int grid_size_y = grid_size_x;
const dim3 block_size(TILE_DIM, TILE_DIM);
const dim3 grid_size(grid_size_x, grid_size_y);
__global__ void transpose(const float *A, float *B, const int N){  
    __shared__ float S[TILE_DIM][TILE_DIM+1];
    int bx = blockIdx.x * TILE_DIM;
    int by = blockIdx.y * TILE_DIM;
    //顺序读
    int nx1 = bx + threadIdx.x;
    int ny1 = by + threadIdx.y;
    if (nx1 < N && ny1 < N)
        S[threadIdx.y][threadIdx.x] = A[ny1 * N + nx1];
    __syncthreads();
    //顺序写
    int nx2 = bx + threadIdx.y;
    int ny2 = by + threadIdx.x;
    if (nx2 < N && ny2 < N)
        B[nx2 * N + ny2] = S[threadIdx.x][threadIdx.y];
}

int main(){  
    int nn = N * N;
    int nBytes = nn * sizeof(float);
    float a = 1.1;
    float b = 2.2;
    //cpu空间分配
    float* A = (float*)malloc(nBytes);
    float* B = (float*)malloc(nBytes);
    //矩阵初始化
    for (int i = 1; i <= nn; i++) {
        if (i % 2 == 0)
            A[i-1] = a;
        else
            A[i-1] = b;
    }
    //gpu空间分配（需要先定义）
    float* d_A, * d_B;
    cudaMalloc((void**)&d_A, nBytes);
    cudaMalloc((void**)&d_B, nBytes);
    cudaMemcpy(d_A, A, nBytes, cudaMemcpyHostToDevice);
    transpose<<<grid_size, block_size >>> (d_A, d_B, N);
    cudaMemcpy(B, d_B, nBytes, cudaMemcpyDeviceToHost);
    free(A);
    free(B);
    cudaFree(d_A);
    cudaFree(d_B);
    return 0;
}

0.2 矩阵相乘

0.2.1 核函数设计

共享内存的使用：应该将两个原始矩阵的一部分抽取出来，当作瓦片，放入共享内存，减少对全局内存的访问，从而提高程序性能。 非方阵处理：非方阵的两个矩阵相乘，需要调整共享内存中瓦片的尺寸，同时需要添加判断语句，防止发生越界错误。

0.2.2 代码实现

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29


__global__ void matrixmul(const float *A, const float *B, float *C,const int N,const int M,const int K){   //N*M x M*K
    __shared__ float A_S[TILE_DIM][TILE_DIM];
    __shared__ float B_S[TILE_DIM][TILE_DIM];
    int tx = threadIdx.x;
    int ty = threadIdx.y;
    int bx = blockIdx.x;
    int by = blockIdx.y;
    int row = by * TILE_DIM + ty;
    int col = bx * TILE_DIM + tx;
    float value = 0;
    for (int ph = 0; ph < M / TILE_DIM+1; ph++) {
        if (row < N && ph * TILE_DIM + tx < M)
            A_S[ty][tx] = A[row * M + ph * TILE_DIM + tx];
        else
            A_S[ty][tx] = 0.0;
        if (col < K && ph * TILE_DIM + ty < M) 
            B_S[ty][tx] = B[(ph * TILE_DIM + ty) * K + col];
        else
            B_S[ty][tx] = 0.0;
        __syncthreads();

        for (int k = 0; k < TILE_DIM; k++)
            value += A_S[ty][k] * B_S[k][tx];
        __syncthreads();
    }

    if (row < N && col < K)
        C[row*K+col]=value;
}

0.3 统计直方图

共享内存的使用：在共享内存开辟临时数组，减少对全局内存的访问，从而提高程序性能。 跨网格循环：数据量很大，一个线程要处理多个数据。 原子操作：无论是共享内存还是全局内存的结果，都需要使用原子操作，避免冲突

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13


__global__ void histo_kernel(unsigned char *buffer, long size, unsigned int *histo){
	__shared__ unsigned int temp[256];
	temp[threadIdx.x] = 0;
	__syncthreads();
	int i = threadIdx.x + blockIdx.x * blockDim.x;
	int offset = blockDim.x *gridDim.x;
	while (i<size){
		atomicAdd(&temp[buffer[i]], 1);
		i += offset;
	}
	__syncthreads();
	atomicAdd(&(histo[threadIdx.x]), temp[threadIdx.x]);
}

0.4 规约求和

跨网格循环：对于海量数据(>100w)，即使是GPU众核，也做不到这么多的线程，因此一个线程要处理多个数据。 共享内存优化：将每个线程对多个元素求和的结果存于共享内存，可以减少对全局内存的访问。 交错配对法：利用交错配对法进行规约，可以缓解一部分的线程束分化现象。 原子操作:将结果累加到全局内存中，可能会发生冲突，需要使用原子操作。

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26


__global__ void _sum_gpu(int *input, int count, int *output)
{
    __shared__ int sum_per_block[BLOCK_SIZE];

    int temp = 0;
    for (int idx = threadIdx.x + blockDim.x * blockIdx.x;
         idx < count; idx += gridDim.x * blockDim.x
	)
        temp += input[idx];
    sum_per_block[threadIdx.x] = temp;
    __syncthreads();

    //**********shared memory summation stage***********
    for (int length = BLOCK_SIZE / 2; length >= 1; length /= 2)
    {
        int double_kill = -1;
	if (threadIdx.x < length)
	    double_kill = sum_per_block[threadIdx.x] + sum_per_block[threadIdx.x + length];
	__syncthreads();  //why we need two __syncthreads() here, and,
	
	if (threadIdx.x < length)
	    sum_per_block[threadIdx.x] = double_kill;
	__syncthreads(); 
    } //the per-block partial sum is sum_per_block[0]

    if (threadIdx.x == 0) atomicAdd(output, sum_per_block[0]);

0.5 TOP K 问题

TOP K问题是指在一组数据中，找到前K个最大或最小的元素。利用CUDA规约计算可以高效地解决TOP K问题。以下是利用CUDA规约计算来实现排序和选择前K个最大/最小元素的详细步骤：

定义一个二元组类型，包含值和索引，用于存储原始数据及其索引（为了在排序后恢复原始数据）
对原始数据进行遍历，将数据存储到二元组中
对二元组进行归约操作，得到前K个最大/最小值的索引
恢复原始数据并按照索引排序，得到前K个最大/最小值

具体实现流程如下：

将数据复制到GPU显存中 float *d_data; cudaMemcpy(d_data, h_data, size, cudaMemcpyHostToDevice);
将数据存储到二元组中

1
2
3
4
5


typedef struct {     float value;     int index; } Tuple;  
Tuple *d_tuples; 
int threadsPerBlock = 256; 
int blocksPerGrid = (n + threadsPerBlock - 1) / threadsPerBlock;
initializeTuples<<<blocksPerGrid, threadsPerBlock>>>(d_data, d_tuples, n);

对二元组进行归约操作，得到前K个最大/最小值的索引

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17


int *d_indices;
kReduceKernel<<<blocksPerGrid, threadsPerBlock>>>(d_tuples, d_indices, n, k);
__global__ void kReduceKernel(Tuple *input, int *output, int n, int k) {
    extern __shared__ Tuple shared[];
    int tid = threadIdx.x;
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    shared[tid] = (i < n) ? input[i] : Tuple{0, 0};
    __syncthreads();
    for (int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (tid < s)
            shared[tid] = (shared[tid].value > shared[tid + s].value) ? shared[tid] : shared[tid + s];
        __syncthreads();
    }

    if (tid == 0)
        output[blockIdx.x] = shared[0].index;
}

在CPU中恢复原始数据并按照索引排序，得到前K个最大/最小值

1
2
3
4
5


cudaMemcpy(h_indices, d_indices, size, cudaMemcpyDeviceToHost);  
for (int i = 0; i < k; ++i) {     
    int index = h_indices[i];     
    h_result[i] = h_data[index]; }  
std::sort(h_result, h_result + k);

Buy me a coffee~

赞赏

支付宝

贝宝

微信