[CUDA] 測量效能－No More Codes

這裡使用兩種方式:

使用 Windows 的 QueryPerformanceCounter (高解析度效能計數器), 只要 include <Windows.h> 即可使用
使用 cudaEvent

arraySize 受限於 host 與 GPU 的記憶體容量, 可自行調整測試; repeat 為重複執行次數. 如果在 main() 中重複呼叫 addWithCuda(), 會因為記憶體搬移花費大量時間, 反而可能比 CPU 還慢, 因此直接在 addWithCuda() 中重複呼叫 kernel 去計算. 資料陣列可宣告於 global; 如果放在 local, 最好宣告為 static, 否則 arraySize 太大會造成 stack overflow.

此例執行畫面中可以看到, 使用 GPU 速度大約提升 19 倍, 實際效能與 arraySize 及系統配備有關.

(此處原為執行畫面截圖, 圖片未能載入)

程式碼:

#include "cuda_runtime.h"

#include "device_launch_parameters.h"

#include <stdio.h>

#include "cooperative_groups_helpers.h"

#include <iostream>

#include <memory>

#include <string>

#include <Windows.h>

// Number of times the addition is repeated on both the CPU and the GPU path.
const int repeat = 10000; // repeat count for the computation

// Number of int elements in every test array.
const int arraySize = 1024 * 1024; // limited by host and GPU memory size

// NOTE(review): <Windows.h> is included above and should already declare this
// function; this re-declaration looks redundant -- confirm before removing.
BOOL WINAPI QueryPerformanceCounter(_Out_ LARGE_INTEGER *lpPerformanceCount);

// Host wrapper that runs addKernel on the GPU; defined later in this file.
cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size);

// CPU reference implementation of the element-wise addition.
void addWithCpu(int *c, const int *a, const int *b, unsigned int size);

// GPU kernel computing c[i] = a[i] + b[i] for i < size.
__global__ void addKernel(int *c, const int *a, const int *b, int size);

// Data arrays for the test. They are declared at global scope (or, alternatively,
// as function-local statics) so that they do not live on the stack, which would
// overflow for large arraySize.
// NOTE(review): main() declares static locals with the same names, which shadow
// these globals inside main().

int a[arraySize];

int b[arraySize];

int c[arraySize];

int d[arraySize];

// Benchmark driver.
//
// Fills two input arrays, adds them `repeat` times on the CPU (timed with the
// Windows QueryPerformanceCounter) and then on the GPU through addWithCuda()
// (timed with CUDA events), and prints the last element of each result together
// with the elapsed time in milliseconds.
//
// The CUDA events bracket the whole addWithCuda() call, so the reported GPU
// time covers everything that call does, not only the kernel launches.
//
// Returns 0 on success and 1 if any CUDA call fails.
int main()
{
    // Static storage keeps these large buffers off the stack.
    static int a[arraySize];
    static int b[arraySize];
    static int c[arraySize]; // result of the CPU path
    static int d[arraySize]; // result of the GPU path

    // Windows high-resolution timer state.
    LARGE_INTEGER frequency; // ticks per second
    LARGE_INTEGER t1, t2;    // start / stop ticks
    float elapsedTime = 0.0f;

    QueryPerformanceFrequency(&frequency);

    // CUDA event timer state.
    cudaEvent_t start, stop;
    cudaError_t cudaStatus = cudaEventCreate(&start);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaEventCreate failed: %s\n", cudaGetErrorString(cudaStatus));
        return 1;
    }
    cudaStatus = cudaEventCreate(&stop);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaEventCreate failed: %s\n", cudaGetErrorString(cudaStatus));
        cudaEventDestroy(start);
        return 1;
    }

    // Initialise the inputs.
    for (int i = 0; i < arraySize; i++) {
        a[i] = i;
        b[i] = arraySize + i;
    }

    // ---- Add by CPU ----
    QueryPerformanceCounter(&t1); // keep start time
    for (int i = 0; i < repeat; i++) {
        addWithCpu(c, a, b, arraySize);
    }
    QueryPerformanceCounter(&t2); // keep stop time

    // ticks * 1000 / (ticks per second) == milliseconds
    elapsedTime = static_cast<float>(
        (t2.QuadPart - t1.QuadPart) * 1000.0 / frequency.QuadPart);
    printf("c[%d]=%d, cpu t=%f\n",
           arraySize - 1, c[arraySize - 1], elapsedTime);

    // ---- Add by CUDA ----
    cudaEventRecord(start, 0); // keep start time
    cudaStatus = addWithCuda(d, a, b, arraySize);
    cudaEventRecord(stop, 0);   // keep stop time
    cudaEventSynchronize(stop); // wait for the stop event

    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addWithCuda failed!");
        cudaEventDestroy(start);
        cudaEventDestroy(stop);
        return 1;
    }

    cudaEventElapsedTime(&elapsedTime, start, stop);
    // Report the GPU result buffer d (the original printed c, the CPU buffer).
    printf("d[%d]=%d, gpu t=%f\n",
           arraySize - 1, d[arraySize - 1], elapsedTime);

    // Release the events before the device is reset.
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }

    getchar(); // keep the console window open until a key is pressed
    return 0;
}

// Add with CPU ---------
//
// Element-wise addition on the host: c[i] = a[i] + b[i] for every i in
// [0, size).
//
//   c    - output array, at least `size` elements
//   a, b - input arrays, at least `size` elements each
//   size - number of elements to process; 0 performs no work
void addWithCpu(int *c, const int *a, const int *b, unsigned int size)
{
    for (unsigned int i = 0; i < size; i++) {
        c[i] = a[i] + b[i];
    }
} // this closing brace was missing in the original, swallowing the next definitions

// Add with GPU ---------
//
// Kernel: each thread computes one element, c[i] = a[i] + b[i], where i is
// the thread's flat index along x (blockIdx.x * blockDim.x + threadIdx.x).
//
// Launch layout: 1-D grid of 1-D blocks; only the x dimension is used.
// Threads whose index is >= size do nothing, so the launch may over-cover.
// c, a and b are dereferenced on the device and must be device-accessible.
__global__ void addKernel(int *c, const int *a, const int *b, int size)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < size) {
        c[i] = a[i] + b[i];
    }
} // this closing brace was missing in the original

// Helper function for using CUDA to add vectors in parallel.
//
// Allocates three device buffers, copies a and b to the device, launches
// addKernel `repeat` times on the same data, waits for completion and copies
// the result back into c. The kernel is repeated here, rather than calling
// this function repeatedly, so the buffers are allocated and copied only once.
//
//   c    - host output array, at least `size` elements
//   a, b - host input arrays, at least `size` elements each
//   size - number of elements; 0 is a no-op that returns cudaSuccess
//
// Returns cudaSuccess, or the first CUDA error encountered. Device buffers are
// always freed before returning.
cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size)
{
    // Threads per block; the grid is sized so that blocks * threadsPerBlock >= size.
    const unsigned int threadsPerBlock = 256;

    // All locals are declared before the first `goto Error` so that no jump
    // skips over an initialization.
    int *dev_a = 0;
    int *dev_b = 0;
    int *dev_c = 0;
    int dev = 0; // which GPU to run on; change this on a multi-GPU system
    unsigned int blocks = 0;
    const size_t bytes = static_cast<size_t>(size) * sizeof(int);
    cudaError_t cudaStatus = cudaSuccess;

    // Nothing to add: avoid an invalid zero-thread launch.
    if (size == 0) {
        return cudaSuccess;
    }

    // addKernel takes the element count as a signed int.
    if (size > 0x7FFFFFFFu) {
        fprintf(stderr, "addWithCuda: size %u exceeds the kernel's int range\n", size);
        return cudaErrorInvalidValue;
    }

    cudaStatus = cudaSetDevice(dev);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
        goto Error;
    }

    // Allocate GPU buffers for three vectors (two input, one output).
    cudaStatus = cudaMalloc((void**)&dev_c, bytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_a, bytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_b, bytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    // Copy input vectors from host memory to GPU buffers.
    cudaStatus = cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    cudaStatus = cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    // Launch the kernel with one thread per element. Ceil-division written as
    // (size - 1) / T + 1 cannot overflow; size > 0 is guaranteed above.
    // (The original launched size - 1 threads when size <= 1024, leaving the
    // last element uncomputed.)
    blocks = (size - 1) / threadsPerBlock + 1;

    for (int i = 0; i < repeat; i++) {
        addKernel<<<blocks, threadsPerBlock>>>(dev_c, dev_a, dev_b, static_cast<int>(size));
    }

    // Check for any errors launching the kernel.
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during the launch.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
        goto Error;
    }

    // Copy output vector from GPU buffer to host memory.
    cudaStatus = cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

Error:
    // Reached on both success and failure; pointers that were never allocated
    // are still 0 here.
    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);

    return cudaStatus;
}

ghostyguo

No More Codes

ghostyguo 發表在痞客邦留言(0) 人氣()

E-mail轉寄

No More Codes

歡迎光臨ghostyguo在痞客邦的小天地

[CUDA] 測量效能

歷史上的今天

留言列表

文章分類

雜項記錄 (3)

硬體設計 (1)

訊號處理 (4)

Raspberry Pi (1)

程式設計 (19)

熱門文章

最新留言

參觀人氣

QR Code

文章搜尋

最新文章