/**
 * Tiled matrix multiplication: C = A * B.
 *
 * A is N x M, B is M x K and C is N x K, all stored row-major in flat arrays.
 *
 * Launch requirements (implied by the indexing below, not checked here):
 *   - blockDim must be (TILE_DIM, TILE_DIM): every thread loads exactly one
 *     element of each shared tile and the inner product walks TILE_DIM entries.
 *   - Each thread writes at most one element of C, so the grid has to span
 *     ceil(K / TILE_DIM) blocks in x and ceil(N / TILE_DIM) blocks in y.
 * Shared memory: two static TILE_DIM x TILE_DIM float tiles per block.
 *
 * @param A  input matrix, N x M
 * @param B  input matrix, M x K
 * @param C  output matrix, N x K
 * @param N  number of rows of A and C
 * @param M  number of columns of A / rows of B (the reduced dimension)
 * @param K  number of columns of B and C
 */
__global__ void matrixmul(const float* A, const float* B, float* C,
                          const int N, const int M, const int K) {
    __shared__ float A_S[TILE_DIM][TILE_DIM];
    __shared__ float B_S[TILE_DIM][TILE_DIM];

    const int tx = threadIdx.x;
    const int ty = threadIdx.y;
    const int bx = blockIdx.x;
    const int by = blockIdx.y;

    // Element of C this thread is responsible for.
    const int row = by * TILE_DIM + ty;
    const int col = bx * TILE_DIM + tx;

    // Ceil-div: exactly as many tile phases as are needed to cover M.
    // (M / TILE_DIM + 1 would run one extra all-zero phase whenever M is a
    // multiple of TILE_DIM.)
    const int numPhases = (M + TILE_DIM - 1) / TILE_DIM;

    float value = 0.0f;
    for (int ph = 0; ph < numPhases; ph++) {
        const int aCol = ph * TILE_DIM + tx;  // column of A loaded by this thread
        const int bRow = ph * TILE_DIM + ty;  // row of B loaded by this thread

        // Out-of-range positions are zero-padded so partial tiles at the
        // matrix edges contribute nothing to the dot product.
        // Flat offsets are computed in size_t to avoid int overflow on
        // large matrices.
        if (row < N && aCol < M)
            A_S[ty][tx] = A[(size_t)row * M + aCol];
        else
            A_S[ty][tx] = 0.0f;

        if (col < K && bRow < M)
            B_S[ty][tx] = B[(size_t)bRow * K + col];
        else
            B_S[ty][tx] = 0.0f;

        // Both tiles must be fully written before any thread reads them.
        __syncthreads();

        for (int k = 0; k < TILE_DIM; k++)
            value += A_S[ty][k] * B_S[k][tx];

        // All reads of this phase must finish before the tiles are overwritten.
        __syncthreads();
    }

    if (row < N && col < K)
        C[(size_t)row * K + col] = value;
}
/**
 * Adds the sum of input[0 .. count) to *output.
 *
 * Each thread first accumulates a private partial sum over a grid-stride walk
 * of the input, the block then combines its partial sums in shared memory,
 * and thread 0 of every block issues a single atomicAdd on *output.
 *
 * Preconditions (not checked here):
 *   - blockDim.x == BLOCK_SIZE: the shared array has BLOCK_SIZE slots indexed
 *     by threadIdx.x, and the reduction reads all BLOCK_SIZE of them.
 *   - BLOCK_SIZE is a power of two; otherwise the halving steps skip slots.
 *   - *output already holds the desired starting value (e.g. 0), because the
 *     kernel only ever adds to it.
 *
 * @param input   device array of at least count ints
 * @param count   number of elements to sum
 * @param output  device pointer to the accumulator
 */
__global__ void _sum_gpu(int* input, int count, int* output) {
    __shared__ int sum_per_block[BLOCK_SIZE];

    // Per-thread partial sum. The index and stride are kept in 64 bits so
    // that idx + stride cannot wrap around when count is close to INT_MAX.
    const long long stride = (long long)gridDim.x * blockDim.x;
    int temp = 0;
    for (long long idx = (long long)blockIdx.x * blockDim.x + threadIdx.x;
         idx < count; idx += stride)
        temp += input[idx];

    sum_per_block[threadIdx.x] = temp;
    __syncthreads();

    // ********** shared memory summation stage ***********
    // Within one step, thread t (t < length) writes only slot t and reads
    // slots t and t + length. Slots >= length are never written in that step,
    // so reads and writes of different threads cannot collide and a single
    // barrier per step (separating it from the next step) is sufficient.
    for (int length = BLOCK_SIZE / 2; length >= 1; length /= 2) {
        if (threadIdx.x < length)
            sum_per_block[threadIdx.x] += sum_per_block[threadIdx.x + length];
        __syncthreads();
    }

    // The per-block partial sum is sum_per_block[0].
    if (threadIdx.x == 0)
        atomicAdd(output, sum_per_block[0]);
}
TOP K 问题
TOP K 问题是指在一组数据中,找到前 K 个最大或最小的元素。利用 CUDA 归约计算可以高效地解决 TOP K 问题。
以下是利用 CUDA 归约计算来实现排序和选择前 K 个最大/最小元素的详细步骤: