Consider a typical summation loop:
int sum = 0;
for (int i = 1; i <= N; i++) {   // sum 1 + 2 + ... + N
    sum += i;
}
Suppose we want to parallelize this. The data dependency on sum must be considered: with N = 8, the additions can be rearranged into a pairwise (tree) reduction that finishes in 3 iterations (log2(8) = 3), as shown below:
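The same pairing can be written as a serial loop; a minimal sketch, assuming the N values (N a power of two) already sit in an array a[]:

// Tree reduction: each pass folds the upper half of the array into the
// lower half. For N = 8: pass 1 leaves 4 partial sums, pass 2 leaves 2,
// pass 3 leaves 1, so log2(8) = 3 passes; a[0] ends up holding the total.
for (int d = N / 2; d > 0; d >>= 1)
    for (int i = 0; i < d; i++)
        a[i] += a[i + d];

In the CUDA version below, the inner loop becomes one thread per element, and only the passes themselves remain serial.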
The program below is simplified: it assumes N = BlockSize*ThreadSize, with ThreadSize a power of two, and computes the sum 1+2+3+...+N. Each block computes one partial sum, and the host program then adds up all the block sums to obtain the final result. Because globalReduce modifies a[], sharedReduce is run first. The code is as follows:
#include "cuda_runtime.h" #include "device_launch_parameters.h" #include <stdio.h>
#define BlockSize 8 #define ThreadSize 1024 #define ArraySize (BlockSize*ThreadSize)
__global__ void globalReduceBlockSum(int *b, int *a)
{
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    for (int d = blockDim.x / 2; d > 0; d >>= 1) {
        if (threadIdx.x < d) {
            a[id] += a[id + d];    // reduce directly in global memory
        }
        __syncthreads();           // finish this pass before the next one reads
    }
    if (threadIdx.x == 0) {
        b[blockIdx.x] = a[id];     // thread 0 writes this block's sum
    }
}
__global__ void sharedReduceBlockSum(int *b, int *a)
{
    __shared__ int x[ThreadSize];
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    x[threadIdx.x] = a[id];        // copy to the block's shared memory
    __syncthreads();               // wait until all threads have copied
    for (int d = blockDim.x / 2; d > 0; d >>= 1) {
        if (threadIdx.x < d) {
            x[threadIdx.x] += x[threadIdx.x + d];
        }
        __syncthreads();
    }
    if (threadIdx.x == 0) {
        b[blockIdx.x] = x[0];      // thread 0 writes this block's sum
    }
}
int main()
{
    int host_a[ArraySize];
    int host_b[BlockSize];
    int *dev_a = 0;
    int *dev_b = 0;
    int sum = 0;
    float elapsedTime;

    // setup performance meter from CUDA ----------
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaSetDevice(0);

    // fill host_a with 1, 2, ..., ArraySize and copy it to the device
    cudaMalloc((void**)&dev_a, ArraySize * sizeof(int));
    for (int i = 0; i < ArraySize; i++) host_a[i] = i + 1;
    cudaMemcpy(dev_a, host_a, ArraySize * sizeof(int), cudaMemcpyHostToDevice);

    cudaMalloc((void**)&dev_b, BlockSize * sizeof(int));
    cudaMemset(dev_b, 0, BlockSize * sizeof(int));

    // Run sharedReduce first, because a[] is modified in globalReduce
    cudaEventRecord(start, 0);                                      // keep start time
    sharedReduceBlockSum<<<BlockSize, ThreadSize>>>(dev_b, dev_a);  // calculate
    cudaEventRecord(stop, 0);                                       // keep stop time
    cudaEventSynchronize(stop);                                     // wait for stop event
    cudaEventElapsedTime(&elapsedTime, start, stop);

    cudaMemcpy(host_b, dev_b, BlockSize * sizeof(int), cudaMemcpyDeviceToHost);

    // Print result
    int answer = (ArraySize + 1) * ArraySize / 2;
    printf("shared:\n");
    sum = 0;
    for (int i = 0; i < BlockSize; i++) { sum += host_b[i]; printf("%d ", host_b[i]); }
    printf("sum=%d answer=%d t=%f\n\n", sum, answer, elapsedTime);

    // Run globalReduce
    cudaEventRecord(start, 0);                                      // keep start time
    globalReduceBlockSum<<<BlockSize, ThreadSize>>>(dev_b, dev_a);  // calculate
    cudaEventRecord(stop, 0);                                       // keep stop time
    cudaEventSynchronize(stop);                                     // wait for stop event
    cudaEventElapsedTime(&elapsedTime, start, stop);

    cudaMemcpy(host_b, dev_b, BlockSize * sizeof(int), cudaMemcpyDeviceToHost);

    // Print result
    printf("global:\n");
    sum = 0;
    for (int i = 0; i < BlockSize; i++) { sum += host_b[i]; printf("%d ", host_b[i]); }
    printf("sum=%d answer=%d t=%f\n\n", sum, answer, elapsedTime);

    getchar();

    cudaFree(dev_a);
    cudaFree(dev_b);
    return 0;
}
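Assuming the listing is saved as reduce.cu (the file name here is only illustrative), it can be built and run with nvcc:

    nvcc reduce.cu -o reduce
    ./reduce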
Execution results:
Although shared memory is faster than global memory, the computation in this example is too simple, and sharedReduce pays for an extra copy into shared memory, so for small N it is actually slower.
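One common way to offset that copy cost (a variant not used in the program above, sketched here under the same BlockSize/ThreadSize assumptions) is to do the first addition while loading into shared memory, so each block covers twice the data and the kernel is launched with <<<BlockSize / 2, ThreadSize>>>:

__global__ void sharedReduceBlockSum2(int *b, int *a)
{
    __shared__ int x[ThreadSize];
    // each block now covers 2*blockDim.x elements of a[]
    int id = blockIdx.x * (blockDim.x * 2) + threadIdx.x;
    x[threadIdx.x] = a[id] + a[id + blockDim.x];  // first add during the load
    __syncthreads();
    for (int d = blockDim.x / 2; d > 0; d >>= 1) {
        if (threadIdx.x < d) x[threadIdx.x] += x[threadIdx.x + d];
        __syncthreads();
    }
    if (threadIdx.x == 0) b[blockIdx.x] = x[0];   // one sum per block
}

The host then adds up only BlockSize/2 partial results, and the shared-memory copy itself performs useful work.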