以下程式碼展示 host memory 與 device memory 之間的關係：變數名稱以 host 開頭表示位於 host memory，以 dev 開頭表示位於 device memory。程式一開始初始化 host_a[]，再用 cudaMemcpy() 複製至 dev_a[]；接著平行呼叫 kernel 函數 shared()，在 kernel 內每個 thread 會將 dev_a[] 中對應的元素複製至 x[]。x 是 shared memory，可供同一個 block 內的所有 thread 使用。最後再用 cudaMemcpy() 將 dev_a[] 複製回 host_b[] 並輸出結果。
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <stdlib.h>
// Launch geometry for the demo. Each directive must sit on its own line:
// a #define consumes the rest of its physical line as replacement text.
// NOTE: despite its name, BlockSize is used as the number of blocks in the
// grid (first launch argument); ThreadSize is the number of threads per block.
#define BlockSize 2
#define ThreadSize 10
// Total element count; parenthesized so it expands safely inside expressions.
#define ArraySize (BlockSize * ThreadSize)
// Kernel (runs on the device): stages each block's slice of `a` in shared
// memory, then replaces every element in the first half of the slice with its
// right-hand neighbour. The second half is written back unchanged.
//
// Preconditions:
//   - 1-D launch with exactly ThreadSize threads per block (the shared array
//     and the global index are both sized by ThreadSize).
//   - `a` is a device pointer to at least gridDim.x * ThreadSize ints.
// Shared memory: ThreadSize * sizeof(int) bytes, statically allocated.
__global__ void shared(int* a)
{
    __shared__ int x[ThreadSize]; // shared by all threads of the same block

    int i = blockIdx.x * ThreadSize + threadIdx.x;
    x[threadIdx.x] = a[i]; // copy global to shared memory

    // Barrier: every slot of x must be written before any thread reads the
    // slot owned by its neighbour. Placed outside the branch so that all
    // threads of the block reach it.
    __syncthreads();

    // Keep the result in a register instead of writing it back into x:
    // an in-place update of x[threadIdx.x] would race with the neighbouring
    // thread that is still reading that same slot.
    int value = x[threadIdx.x];
    if (threadIdx.x < ThreadSize / 2) {
        value = x[threadIdx.x + 1];
    }

    a[i] = value; // copy the result back to global memory
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaSucceeded(cudaError_t status, const char* what)
{
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Prints ArraySize values as "<label>[i]=v ", starting a new row after every
// ThreadSize elements, followed by a blank line.
static void printArray(const char* label, const int* values)
{
    for (int i = 0; i < ArraySize; i++) {
        printf("%s[%d]=%d ", label, i, values[i]);
        if (i % ThreadSize == ThreadSize - 1) {
            printf("\n");
        }
    }
    printf("\n");
}

// Host entry point: fills host_a with 0..ArraySize-1, copies it to the device,
// runs the `shared` kernel on it, copies the result into host_b and prints
// both arrays. Returns EXIT_SUCCESS, or EXIT_FAILURE if any CUDA call failed.
int main() // running on host
{
    int host_a[ArraySize], host_b[ArraySize]; // memory in host
    int* dev_a = 0;                           // global memory on device
    const size_t bytes = ArraySize * sizeof(int);

    // select a device
    bool ok = cudaSucceeded(cudaSetDevice(0), "cudaSetDevice");

    // init array values
    for (int i = 0; i < ArraySize; i++) {
        host_a[i] = i;
    }
    printArray("a", host_a);

    // init device memory array values
    ok = ok && cudaSucceeded(cudaMalloc((void**)&dev_a, bytes), "cudaMalloc");
    ok = ok && cudaSucceeded(
                   cudaMemcpy(dev_a, host_a, bytes, cudaMemcpyHostToDevice),
                   "cudaMemcpy (host to device)");

    // running kernel in parallel
    if (ok) {
        shared<<<BlockSize, ThreadSize>>>(dev_a);
        // A launch does not return a status; configuration errors are only
        // visible through cudaGetLastError().
        ok = cudaSucceeded(cudaGetLastError(), "shared kernel launch");
    }

    // Wait for the kernel to finish before reading results, so that errors
    // raised during execution are reported against the kernel rather than
    // against the copy below.
    ok = ok && cudaSucceeded(cudaDeviceSynchronize(), "cudaDeviceSynchronize");

    // copy result back
    ok = ok && cudaSucceeded(
                   cudaMemcpy(host_b, dev_a, bytes, cudaMemcpyDeviceToHost),
                   "cudaMemcpy (device to host)");

    // output (only when host_b actually holds results)
    if (ok) {
        printArray("b", host_b);
    }

    // Release the device buffer; freeing a null pointer is a no-op, so this is
    // safe even when the allocation itself failed.
    cudaFree(dev_a);

    getchar(); // block until a character is read from stdin
    return ok ? EXIT_SUCCESS : EXIT_FAILURE;
}
|
留言列表