這裡使用兩種方式量測執行時間:

  • 使用 Windows 的 QueryPerformanceCounter, 只要 include <Windows.h> 即可使用
  • 使用 cudaEvent

arraySize 受限於 host 與 GPU 的記憶體容量, 可自行調整測試; repeat 為重複執行次數. 如果在 main() 中重複呼叫 addWithCuda(), 會因為記憶體搬移花費大量時間, 反而可能比 CPU 還慢, 因此直接在 addWithCuda() 中重複呼叫 kernel 去計算. 資料陣列可宣告為 global; 如果放在 local, 最好宣告為 static, 否則 arraySize 太大會造成 stack overflow.

從此例的執行畫面可以看到, 使用 GPU 速度大約提升 19 倍; 實際效能與 arraySize 及系統配備有關.

(執行畫面截圖)

程式碼:

 

#include "cuda_runtime.h"

#include "device_launch_parameters.h"

#include <stdio.h>

#include "cooperative_groups_helpers.h"

#include <iostream>

#include <memory>

#include <string>

#include <Windows.h>

 

const int repeat = 10000; // number of times the computation is repeated

const int arraySize = 1024 * 1024; // element count per array; limited by host and GPU memory size

// Redeclaration of the Win32 high-resolution counter API.
// NOTE(review): <Windows.h> presumably already declares this - confirm whether this line is needed.
BOOL WINAPI QueryPerformanceCounter(_Out_ LARGE_INTEGER *lpPerformanceCount);

// Host helper: runs the vector addition on the GPU and stores the result in c.
cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size);

// Host reference implementation: c[i] = a[i] + b[i] for i in [0, size).
void addWithCpu(int *c, const int *a, const int *b, unsigned int size);

// Device kernel performing the element-wise addition.
__global__ void addKernel(int *c, const int *a, const int *b, int size);

 

 

// Test data arrays: declare them global or local static (static storage, not the stack) to prevent stack overflow

/*

int a[arraySize];

int b[arraySize];

int c[arraySize];

int d[arraySize];

*/

// Benchmarks integer vector addition on the CPU against the GPU.
//
// The CPU path calls addWithCpu() `repeat` times and is timed with the Windows
// performance counter. The GPU path calls addWithCuda() once and is timed with
// CUDA events; because the events bracket the whole call, the reported GPU time
// also contains whatever addWithCuda() does besides running the kernel.
// Both times are printed in milliseconds.
//
// Returns 0 on success and 1 if any CUDA call fails.
int main()
{
    // Test data arrays: local static so they are not placed on the stack.
    static int a[arraySize];
    static int b[arraySize];
    static int c[arraySize];   // result computed by the CPU
    static int d[arraySize];   // result computed by the GPU

    // Windows performance counter state ----------
    LARGE_INTEGER frequency;        // ticks per second
    LARGE_INTEGER t1, t2;           // ticks
    float elapsedTime = 0.0f;       // milliseconds
    bool ok = true;

    QueryPerformanceFrequency(&frequency);

    // CUDA event timers ----------
    cudaEvent_t start = 0;
    cudaEvent_t stop = 0;
    cudaError_t cudaStatus = cudaEventCreate(&start);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaEventCreate failed: %s\n", cudaGetErrorString(cudaStatus));
        return 1;
    }
    cudaStatus = cudaEventCreate(&stop);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaEventCreate failed: %s\n", cudaGetErrorString(cudaStatus));
        cudaEventDestroy(start);
        return 1;
    }

    // Init input arrays ----------
    for (int i = 0; i < arraySize; i++) {
        a[i] = i;
        b[i] = arraySize + i;
    }

    // Add by CPU ----------
    QueryPerformanceCounter(&t1); // keep start time
    for (int i = 0; i < repeat; i++) {
        addWithCpu(c, a, b, arraySize);
    }
    QueryPerformanceCounter(&t2); // keep stop time

    // ticks -> milliseconds; the arithmetic is done in double and narrowed explicitly.
    elapsedTime = static_cast<float>(
        (t2.QuadPart - t1.QuadPart) * 1000.0 / frequency.QuadPart);
    printf("c[%d]=%d, cpu t=%f\n",
        arraySize - 1, c[arraySize - 1], elapsedTime);

    // Add by CUDA ----------
    cudaStatus = cudaEventRecord(start, 0); // keep start time
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaEventRecord failed: %s\n", cudaGetErrorString(cudaStatus));
        ok = false;
    }

    if (ok) {
        cudaStatus = addWithCuda(d, a, b, arraySize);
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "addWithCuda failed!");
            ok = false;
        }
    }

    if (ok) {
        cudaStatus = cudaEventRecord(stop, 0); // keep stop time
        if (cudaStatus == cudaSuccess) {
            cudaStatus = cudaEventSynchronize(stop); // wait for the stop event
        }
        if (cudaStatus == cudaSuccess) {
            cudaStatus = cudaEventElapsedTime(&elapsedTime, start, stop);
        }
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "CUDA event timing failed: %s\n", cudaGetErrorString(cudaStatus));
            ok = false;
        }
    }

    if (ok) {
        // Print the GPU result buffer d (not the CPU buffer c).
        printf("d[%d]=%d, gpu t=%f\n",
            arraySize - 1, d[arraySize - 1], elapsedTime);
    }

    // Release the events on every path, before the device is reset.
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    if (!ok) {
        return 1;
    }

    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }

    getchar(); // keep the console window open until a key is pressed
    return 0;
}

 

// Add with CPU ---------
// Host reference implementation of the element-wise sum.
// Writes a[k] + b[k] into c[k] for every k in [0, size), front to back.
// A size of 0 leaves c untouched.
void addWithCpu(int *c, const int *a, const int *b, unsigned int size)
{
    const int *lhs = a;
    const int *rhs = b;
    int *out = c;
    unsigned int remaining = size;

    while (remaining != 0) {
        *out = *lhs + *rhs;
        ++out;
        ++lhs;
        ++rhs;
        --remaining;
    }
}

 

// Add with GPU ---------
// Element-wise sum kernel: c[i] = a[i] + b[i] for every i in [0, size).
//
// Launch layout: 1D grid of 1D blocks; only the x dimensions are used.
// Each thread starts at its global x index and advances by the total number
// of launched threads, so every element is processed for any grid/block size,
// including launches with fewer threads than elements. No shared memory is used.
//
// Indices are computed in size_t: a 32-bit product of blockIdx.x * blockDim.x
// could wrap and, once stored in an int, compare as "less than size" while
// actually being negative.
//
// c, a, b must point to at least `size` ints each; a non-positive size is a no-op.
__global__ void addKernel(int *c, const int *a, const int *b, int size)
{
    if (size <= 0) {
        return;
    }

    const size_t count = static_cast<size_t>(size);
    const size_t stride = static_cast<size_t>(gridDim.x) * blockDim.x;

    for (size_t i = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
         i < count;
         i += stride) {
        c[i] = a[i] + b[i];
    }
}

 

 

// Helper function for using CUDA to add vectors in parallel.
//
// Allocates three device buffers of `size` ints, copies the host arrays a and b
// into two of them, launches addKernel `repeat` times on those buffers, and
// copies the resulting sums back into the host array c. The kernel is repeated
// inside this function so that the host<->device copies happen only once.
//
// Parameters:
//   c    - host output array, at least `size` ints
//   a, b - host input arrays, at least `size` ints each
//   size - number of elements; 0 is a no-op that returns cudaSuccess
//
// Returns cudaSuccess, or the first CUDA error encountered. Returns
// cudaErrorInvalidValue if `size` does not fit the kernel's int parameter.
// Device buffers are always freed before returning.
cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size)
{
    // Every local is declared before the first `goto Error`, so no jump to the
    // cleanup label crosses an initialization.
    int *dev_a = 0;
    int *dev_b = 0;
    int *dev_c = 0;
    const int dev = 0;                         // GPU to run on; change this on a multi-GPU system
    const unsigned int threadsPerBlock = 256;  // multiple of the warp size
    unsigned int blocks = 0;
    size_t bytes = 0;
    cudaError_t cudaStatus = cudaSuccess;

    // Nothing to compute; a launch with zero blocks would be an invalid configuration.
    if (size == 0) {
        return cudaSuccess;
    }

    // addKernel takes the element count as a signed int.
    if (size > 0x7FFFFFFFu) {
        fprintf(stderr, "addWithCuda: size %u exceeds the kernel's int range\n", size);
        return cudaErrorInvalidValue;
    }

    bytes = static_cast<size_t>(size) * sizeof(int);
    // Ceiling division: enough blocks for one thread per element, including the
    // last partial block.
    blocks = (size - 1) / threadsPerBlock + 1;

    cudaStatus = cudaSetDevice(dev);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
        goto Error;
    }

    // Allocate GPU buffers for three vectors (two input, one output).
    cudaStatus = cudaMalloc((void**)&dev_c, bytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_a, bytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_b, bytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    // Copy input vectors from host memory to GPU buffers.
    cudaStatus = cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    cudaStatus = cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    // Launch the kernel `repeat` times with one thread for each element.
    for (int i = 0; i < repeat; i++) {
        addKernel<<<blocks, threadsPerBlock>>>(dev_c, dev_a, dev_b, static_cast<int>(size));
    }

    // Check for any errors launching the kernel
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during the launch.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
        goto Error;
    }

    // Copy output vector from GPU buffer to host memory.
    cudaStatus = cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

Error:
    // Reached on success as well; buffers that were never allocated are still 0 here.
    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);

    return cudaStatus;
}

 

arrow
arrow
    全站熱搜

    ghostyguo 發表在 痞客邦 留言(0) 人氣()