考慮一個典型的總和運算 :

     int sum = 0;

     for (i = 1; i < N; i++) {

         sum += i;

   }

假設要做平行化, 考慮 data dependency 的問題, 假設 N=8,安排成3個 iteration 來做, 共需 3 個 iteration, 如下圖:

undefined


下面這個程式是簡化的, 假設 N = BlockSize*ThreadSize, 且 ThreadSize 必須為 2 的次方, 計算 1+2+3...+N 的總和, 每個 Block 負責計算一個總和, 在主程式中再把所有 Block 總和結果加起來, 得到數列的計算結果. 由於 globalReduce 會改變 a[] 的值, 所以先做 sharedReduce. 程式碼如下 :

 

#include "cuda_runtime.h"

#include "device_launch_parameters.h"

#include <stdio.h>

 

#define BlockSize 8

#define ThreadSize 1024

#define ArraySize (BlockSize*ThreadSize)

 

__device__ void __syncthreads();

__global__ void globalReduceBlockSum(int *b, int *a)

{

     int id = blockIdx.x * blockDim.x + threadIdx.x;

     for (int d = blockDim.x / 2; d > 0; d >>= 1)

     {

         if (threadIdx.x < d)

         {

              a[id] += a[id + d];

         }       

         __syncthreads();

     }

 

     if (threadIdx.x == 0) {

         b[blockIdx.x] = a[id];

     }

}

 

__global__ void sharedReduceBlockSum(int *b, int *a)

{

     __shared__ int x[ThreadSize];

 

     int id = blockIdx.x * blockDim.x + threadIdx.x;

     x[threadIdx.x] = a[id]; //copy to shared memory of block

     __syncthreads(); //wait all threads copy complete

 

     for (int d = blockDim.x / 2; d > 0; d >>= 1)

     {

         if (threadIdx.x < d)

         {

              x[threadIdx.x] += x[threadIdx.x + d];

         }

         __syncthreads();

     }

 

     if (threadIdx.x == 0) {

         b[blockIdx.x] = x[0];

     }

}

 

int main()

{

     int host_a[ArraySize];

     int host_b[BlockSize];

     int *dev_a = 0;

     int *dev_b = 0;

     int sum = 0;

     float elapsedTime;

 

     // setup performance meter from CUDA ----------

     cudaEvent_t start, stop;

     cudaEventCreate(&start);

     cudaEventCreate(&stop);

 

     cudaSetDevice(0);

 

     cudaMalloc((void**)&dev_a, ArraySize * sizeof(int));

     for (int i = 0; i <  ArraySize; i++)

         host_a[i] = i+1;

     cudaMemcpy(dev_a, host_a, ArraySize * sizeof(int), cudaMemcpyHostToDevice);

 

     cudaMalloc((void**)&dev_b, BlockSize * sizeof(int));

     cudaMemset(dev_b, 0, BlockSize * sizeof(int));

 

     // Run sharedReduce first, because b[] is modified in globalReduce

 

     cudaEventRecord(start, 0); //keep start time

     sharedReduceBlockSum << <BlockSize, ThreadSize >> > (dev_b, dev_a);     //calculate

     cudaEventRecord(stop, 0); //keep stop time

     cudaEventSynchronize(stop); //wait stop event    

     cudaEventElapsedTime(&elapsedTime, start, stop);

 

     cudaMemcpy(host_b, dev_b, BlockSize * sizeof(int), cudaMemcpyDeviceToHost);

 

     //Print result

     int answer = (ArraySize + 1)*ArraySize / 2;

 

     printf("shared:\n");

     sum = 0;

     for (int i = 0; i < BlockSize; i++) {

         sum += host_b[i];

         printf("%d ", host_b[i]);

     }

     printf("sum=%d answer=%d t=%f\n\n", sum, answer, elapsedTime);

 

     // run globalReduce

     cudaEventRecord(start, 0); //keep start time

     globalReduceBlockSum << <BlockSize, ThreadSize >> > (dev_b, dev_a);     //calculate

     cudaEventRecord(stop, 0); //keep stop time

     cudaEventSynchronize(stop); //wait stop event    

     cudaEventElapsedTime(&elapsedTime, start, stop);

 

     cudaMemcpy(host_b, dev_b, BlockSize * sizeof(int), cudaMemcpyDeviceToHost);

 

     //Print result

     printf("global:\n");

     sum = 0;

     for (int i = 0; i < BlockSize; i++) {

         sum += host_b[i];

         printf("%d ", host_b[i]);

     }

     printf("sum=%d answer=%d t=%f\n\n", sum, answer, elapsedTime);

     //cudaDeviceSynchronize();

     getchar();

 

     cudaFree(dev_a);

     cudaFree(dev_b);

     return 0;

}

 

 

執行結果:

undefined


雖然 shared memory 速度比 global memory 速度快, 但是在這個例子, 因為計算太過簡單, 且 sharedReduce因為多了一個搬移至 shared memory 的動作, 所以在 N 值較小時反而比較慢.
 

參考資料 :

[1] John Owens, "GPU Reduce, Scan, and Sort", CS223 Guest Lecture, Electrical and Computer Engineering, http://web.cs.ucdavis.edu/~amenta/f15/amenta-reduce-scan-sort.pdf

[2] Bedřich Beneš, "CUDA Application Reduction Algorithm ", Department of Computer Graphics , Purdue University, http://hpcg.purdue.edu/bbenes/classes/CGT620/lectures/CGT%20620-CUDA-09-CUDA%20Parallel%20Reduction.pdf

 

 

 

 

 

 

 

 

 

 

arrow
arrow
    全站熱搜

    ghostyguo 發表在 痞客邦 留言(0) 人氣()