Consider a typical summation loop:
int sum = 0;
for (int i = 1; i <= N; i++) {   // sum 1 + 2 + ... + N
    sum += i;
}
Suppose we want to parallelize this. The data dependency on sum must be considered: with N = 8, the additions can be rearranged into a pairwise (tree) reduction that finishes in 3 iterations (log2(8) = 3), as shown below:
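The same pairing can be written as a serial loop; a minimal sketch, assuming the N values (N a power of two) already sit in an array a[]:

// Tree reduction: each pass folds the upper half of the array into the
// lower half. For N = 8: pass 1 leaves 4 partial sums, pass 2 leaves 2,
// pass 3 leaves 1, so log2(8) = 3 passes; a[0] ends up holding the total.
for (int d = N / 2; d > 0; d >>= 1)
    for (int i = 0; i < d; i++)
        a[i] += a[i + d];

In the CUDA version below, the inner loop becomes one thread per element, and only the passes themselves remain serial.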
The program below is simplified: it assumes N = BlockSize*ThreadSize, with ThreadSize a power of two, and computes the sum 1+2+3+...+N. Each block computes one partial sum, and the host program then adds up all the block sums to obtain the final result. Because globalReduce modifies a[], sharedReduce is run first. The code is as follows:
#include "cuda_runtime.h" #include "device_launch_parameters.h" #include <stdio.h>
#define BlockSize 8 #define ThreadSize 1024 #define ArraySize (BlockSize*ThreadSize)
__global__ void globalReduceBlockSum(int *b, int *a)
{
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    for (int d = blockDim.x / 2; d > 0; d >>= 1) {
        if (threadIdx.x < d) {
            a[id] += a[id + d];    // reduce directly in global memory
        }
        __syncthreads();           // finish this pass before the next one reads
    }
    if (threadIdx.x == 0) {
        b[blockIdx.x] = a[id];     // thread 0 writes this block's sum
    }
}
__global__ void sharedReduceBlockSum(int *b, int *a)
{
    __shared__ int x[ThreadSize];
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    x[threadIdx.x] = a[id];        // copy to the block's shared memory
    __syncthreads();               // wait until all threads have copied
    for (int d = blockDim.x / 2; d > 0; d >>= 1) {
        if (threadIdx.x < d) {
            x[threadIdx.x] += x[threadIdx.x + d];
        }
        __syncthreads();
    }
    if (threadIdx.x == 0) {
        b[blockIdx.x] = x[0];      // thread 0 writes this block's sum
    }
}
int main()
{
    int host_a[ArraySize];
    int host_b[BlockSize];
    int *dev_a = 0;
    int *dev_b = 0;
    int sum = 0;
    float elapsedTime;

    // setup performance meter from CUDA ----------
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaSetDevice(0);

    // fill host_a with 1, 2, ..., ArraySize and copy it to the device
    cudaMalloc((void**)&dev_a, ArraySize * sizeof(int));
    for (int i = 0; i < ArraySize; i++) host_a[i] = i + 1;
    cudaMemcpy(dev_a, host_a, ArraySize * sizeof(int), cudaMemcpyHostToDevice);

    cudaMalloc((void**)&dev_b, BlockSize * sizeof(int));
    cudaMemset(dev_b, 0, BlockSize * sizeof(int));

    // Run sharedReduce first, because a[] is modified in globalReduce
    cudaEventRecord(start, 0);                                      // keep start time
    sharedReduceBlockSum<<<BlockSize, ThreadSize>>>(dev_b, dev_a);  // calculate
    cudaEventRecord(stop, 0);                                       // keep stop time
    cudaEventSynchronize(stop);                                     // wait for stop event
    cudaEventElapsedTime(&elapsedTime, start, stop);

    cudaMemcpy(host_b, dev_b, BlockSize * sizeof(int), cudaMemcpyDeviceToHost);

    // Print result
    int answer = (ArraySize + 1) * ArraySize / 2;
    printf("shared:\n");
    sum = 0;
    for (int i = 0; i < BlockSize; i++) { sum += host_b[i]; printf("%d ", host_b[i]); }
    printf("sum=%d answer=%d t=%f\n\n", sum, answer, elapsedTime);

    // Run globalReduce
    cudaEventRecord(start, 0);                                      // keep start time
    globalReduceBlockSum<<<BlockSize, ThreadSize>>>(dev_b, dev_a);  // calculate
    cudaEventRecord(stop, 0);                                       // keep stop time
    cudaEventSynchronize(stop);                                     // wait for stop event
    cudaEventElapsedTime(&elapsedTime, start, stop);

    cudaMemcpy(host_b, dev_b, BlockSize * sizeof(int), cudaMemcpyDeviceToHost);

    // Print result
    printf("global:\n");
    sum = 0;
    for (int i = 0; i < BlockSize; i++) { sum += host_b[i]; printf("%d ", host_b[i]); }
    printf("sum=%d answer=%d t=%f\n\n", sum, answer, elapsedTime);

    getchar();

    cudaFree(dev_a);
    cudaFree(dev_b);
    return 0;
}
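Assuming the listing is saved as reduce.cu (the file name here is only illustrative), it can be built and run with nvcc:

    nvcc reduce.cu -o reduce
    ./reduce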
Execution results:
Although shared memory is faster than global memory, the computation in this example is too simple, and sharedReduce pays for an extra copy into shared memory, so for small N it is actually slower.
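One common way to offset that copy cost (a variant not used in the program above, sketched here under the same BlockSize/ThreadSize assumptions) is to do the first addition while loading into shared memory, so each block covers twice the data and the kernel is launched with <<<BlockSize / 2, ThreadSize>>>:

__global__ void sharedReduceBlockSum2(int *b, int *a)
{
    __shared__ int x[ThreadSize];
    // each block now covers 2*blockDim.x elements of a[]
    int id = blockIdx.x * (blockDim.x * 2) + threadIdx.x;
    x[threadIdx.x] = a[id] + a[id + blockDim.x];  // first add during the load
    __syncthreads();
    for (int d = blockDim.x / 2; d > 0; d >>= 1) {
        if (threadIdx.x < d) x[threadIdx.x] += x[threadIdx.x + d];
        __syncthreads();
    }
    if (threadIdx.x == 0) b[blockIdx.x] = x[0];   // one sum per block
}

The host then adds up only BlockSize/2 partial results, and the shared-memory copy itself performs useful work.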