CUDA: Pointers on the GPU

CS 441 Lecture, Dr. Lawlor
Here's probably the simplest possible CUDA program:
#include <iostream>
#include <cuda.h>

/* GPU code: set an array to a value */
__global__ void set_array(float *vals,float param) {
    int i=threadIdx.x; /* find my index */
    vals[i]=i+param;
}
int main(int argc,char *argv[]) {
    int n=16; /* total number of floats */
    float *vals; /* device array of n values */
    cudaMalloc( (void**) &vals, n*sizeof(float) ); // Allocate GPU space

    set_array<<<1,n>>>(vals,0.1234); /* Initialize the space on the GPU */

    /* Copy a few elements back to CPU for printing */
    int i=7;
    float f=-999.0; /* CPU copy of value */
    cudaMemcpy(&f,&vals[i],sizeof(float),cudaMemcpyDeviceToHost);
    std::cout<<"vals["<<i<<"] = "<<f<<"\n";
    return 0;
}

(Try this in NetRun now!)

Basically, we: allocate space on the GPU with cudaMalloc, fill that space by running a kernel with the funky <<<blocks,threads>>> launch syntax, and copy a result back to the CPU with cudaMemcpy.
In the version below, we actually check errors, and we allocate threads in "blocks", which lets you use multiple blocks to work around the hardware's limit of 512 threads per block:
#include <iostream>
#include <cuda.h>
/* error checking */
#define check(cudacall) { int err=cudacall; if (err!=cudaSuccess) std::cout<<"CUDA ERROR "<<err<<" at line "<<__LINE__<<"'s "<<#cudacall<<"\n";}

/* GPU code: set an array to a value */
__global__ void set_array(float *vals,float param) {
    int i=threadIdx.x+blockDim.x*blockIdx.x; /* find my index */
    vals[i]=i+param;
}
int main(int argc,char *argv[]) {
    int w=4, h=4; /* number of blocks, threads per block */
    int n=w*h; /* total number of floats */
    float *vals; /* device array of n values */
    check(cudaMalloc( (void**) &vals, n*sizeof(float) )); // Allocate some space

    set_array<<<w,h>>>(vals,0.1234); /* Initialize the space on the GPU */

    /* Copy a few elements back to CPU for printing */
    for (int i=0;i<n;i+=3) {
        float f=-999.0; /* CPU copy of value */
        check(cudaMemcpy(&f,&vals[i],sizeof(float),cudaMemcpyDeviceToHost));
        std::cout<<"vals["<<i<<"] = "<<f<<"\n";
    }
    return 0;
}

(Try this in NetRun now!)
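One caveat: the check macro only sees errors returned by calls like cudaMalloc and cudaMemcpy; the kernel launch itself returns nothing. If you also want to catch launch problems (like asking for too many threads per block), a minimal sketch of the usual approach is to call cudaGetLastError right after the launch, reusing the names from the example above:

    set_array<<<w,h>>>(vals,0.1234); /* Initialize the space on the GPU */
    check(cudaGetLastError());       /* catches bad launch configurations */
    check(cudaThreadSynchronize());  /* catches errors that happen while the kernel runs */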

CUDA Performance: Use Big Arrays!

One universal aspect of CUDA is that kernel calls (the <<< >>> syntax), cudaMalloc, and cudaMemcpy all take quite a long time to get started.  Once they're running they're fairly fast, but for maximum efficiency you should plan on touching about a megabyte each time!  Here's an example where I benchmark this length dependence:
#include <iostream>
#include <cuda.h>
#include "lib/inc.c"

float *dev_ptr=0, *host_ptr=0; /* device and host arrays */
int len=0; /* current test length */

/* Copy data from the CPU to the GPU, and wait for it to finish. */
int time_memcpy(void) {
    cudaMemcpy(dev_ptr,host_ptr,len,cudaMemcpyHostToDevice);
    cudaThreadSynchronize();
    return 0;
}

/* GPU code: each thread writes one float */
__global__ void doGPUdatawrite(float *arr,float val) {
    int i=blockIdx.x*blockDim.x+threadIdx.x;
    arr[i]=val;
}
/* Launch a kernel that writes len floats, and wait for it to finish. */
int time_datawrite(void) {
    int blocks=1, tpb=len;
    while (tpb>=512) {blocks*=2; tpb/=2;}
    doGPUdatawrite<<<blocks,tpb>>>(dev_ptr,1.2345);
    cudaThreadSynchronize();
    return 0;
}

/* Time the function f over a range of array lengths */
void time_sweep(const char *name,timeable_fn f) {
    int max=1024*1024*4;
    cudaMalloc((void **)&dev_ptr,max*sizeof(float));
    cudaMallocHost((void **)&host_ptr,max*sizeof(float)); /* pinned host memory */
    std::cout<<"Did mallocs\n";
    for (len=64;len<=max;len*=4) {
        double t=time_function(f);
        printf("%s size %d: %.2f ns/float (%.1f us)\n",
            name,len,t/len*1.0e9,t*1.0e6);
    }
    cudaFreeHost(host_ptr);
    cudaFree(dev_ptr);
}

int main(int argc,char *argv[]) {
    std::cout<<"Starting up...\n";
    time_sweep("memcpy",time_memcpy);
    time_sweep("GPU write",time_datawrite);
    return 0;
}

(Try this in NetRun now!)

Here's the output on NetRun's NVIDIA GeForce GTX 280:
memcpy size 64: 163.73 ns/float (10.5 us)
memcpy size 256: 41.11 ns/float (10.5 us)
memcpy size 1024: 10.91 ns/float (11.2 us)
memcpy size 4096: 2.95 ns/float (12.1 us)
memcpy size 16384: 1.01 ns/float (16.6 us)
memcpy size 65536: 0.53 ns/float (34.7 us)
memcpy size 262144: 0.41 ns/float (108.3 us)
memcpy size 1048576: 0.38 ns/float (401.3 us)
memcpy size 4194304: 0.38 ns/float (1574.1 us)

GPU write size 64: 200.15 ns/float (12.8 us)
GPU write size 256: 50.37 ns/float (12.9 us)
GPU write size 1024: 12.65 ns/float (13.0 us)
GPU write size 4096: 3.21 ns/float (13.2 us)
GPU write size 16384: 0.85 ns/float (13.9 us)
GPU write size 65536: 0.25 ns/float (16.5 us)
GPU write size 262144: 0.10 ns/float (25.7 us)
GPU write size 1048576: 0.06 ns/float (62.8 us)
GPU write size 4194304: 0.05 ns/float (210.4 us)
Note that really big arrays, with millions of floats, take much less time per float than smaller arrays.  The basic trouble is that going through the OS, across the PCIe bus, into the GPU, and back takes about 10 microseconds (10 us = 10,000 ns).  If you pay all that overhead just to move one or two floats, you'll get abysmal performance.
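The practical upshot: batch your work.  For example, instead of copying results back one float at a time, copy the whole array in one cudaMemcpy.  Here's a minimal sketch of the difference, reusing the vals array from the first example (copy_slow and copy_fast are just hypothetical helper names):

/* Slow: one ~10us PCIe round trip per float, n round trips total. */
void copy_slow(float *cpu,const float *vals,int n) {
    for (int i=0;i<n;i++)
        cudaMemcpy(&cpu[i],&vals[i],sizeof(float),cudaMemcpyDeviceToHost);
}
/* Fast: pay the ~10us startup cost once, then stream all n floats. */
void copy_fast(float *cpu,const float *vals,int n) {
    cudaMemcpy(cpu,vals,n*sizeof(float),cudaMemcpyDeviceToHost);
}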

CUDA Application Performance: Mandelbrot Rendering

Here's a little Mandelbrot set rendering application in CUDA.  It includes benchmarking code, which shows some surprising results.
#include <cuda.h>
#include <iostream>
#include <fstream>
#include "lib/inc.c" /* NetRun utility functions, like time_in_seconds */

#define check(cudacall) { int err=cudacall; if (err!=cudaSuccess) std::cout<<"CUDA ERROR "<<err<<" at line "<<__LINE__<<"'s "<<#cudacall<<"\n";}

/* GPU Code!  Each thread renders one pixel of the Mandelbrot set. */
__global__ void fill_in_array(float *arr,int wid) {
    int i=threadIdx.x+blockDim.x*blockIdx.x; // my thread's global number
    int x=i%wid, y=i/wid; // my thread's pixel to work on

    float cr=x*1.0/wid, ci=y*1.0/wid;
    float zr=cr, zi=ci;
    int count;
    const int max_count=256;
    for (count=0;count<max_count;count++) {
        // z = z*z+c
        float nzr=zr*zr-zi*zi + cr;
        float nzi=2.0*zr*zi + ci;
        if ((nzr*nzr+nzi*nzi)>4.0) break;
        zr=nzr; zi=nzi;
    }
    arr[y*wid+x]=count;
}

/* Run on CPU */
int main(int argc,char *argv[]) {
    int wid=512,ht=512;
    float *arr=0; /* LIVES ON THE GPU!!!! */
    double start=time_in_seconds(), elapsed;
    check(cudaMalloc((void **)&arr, wid*ht*sizeof(float)));
    elapsed=time_in_seconds()-start;
    std::cout<<"Allocated array: "<<elapsed*1.0e9/(wid*ht)<<"ns/pixel, time "<<elapsed<<" seconds\n";

    start=time_in_seconds(); // <- reset the timer!
    int threadsPerBlock=512;
    int nBlocks=wid*ht/threadsPerBlock;
    fill_in_array<<<nBlocks,threadsPerBlock>>>(arr,wid);

    check(cudaThreadSynchronize()); // look for errors in kernel
    elapsed=time_in_seconds()-start;
    std::cout<<"Rendered array: "<<elapsed*1.0e9/(wid*ht)<<"ns/pixel, time "<<elapsed<<" seconds\n";

    float harr[wid*ht]; /* CPU copy of the image (512*512 floats = 1MB on the stack) */
    check(cudaMemcpy(harr,arr,wid*ht*sizeof(float),cudaMemcpyDeviceToHost));
    elapsed=time_in_seconds()-start;
    std::cout<<"Copied out array: "<<elapsed*1.0e9/(wid*ht)<<"ns/pixel, time "<<elapsed<<" seconds\n";

    /* Write the image as a binary greyscale PPM file */
    std::ofstream of("out.ppm",std::ios_base::binary);
    of<<"P5\n"; // greyscale, binary
    of<<wid<<" "<<ht<<"\n"; // image size
    of<<"255\n"; // byte image
    for (int i=0;i<wid*ht;i++) {
        char c=(char)harr[i];
        of<<c;
    }
    return 0;
}

(Try this in NetRun now!)

The surprising part about this is the timing breakdown:
Allocated array: 152.6ns/pixel, time 0.040 seconds
Rendered array: 3.4ns/pixel, time 0.001 seconds
Copied out array: 10.6ns/pixel, time 0.003 seconds
The biggest surprise is the cudaMalloc time, which is ridiculously huge because this is the first CUDA call in the program, so the driver has to locate and set up the graphics card.   Adding a "dummy" cudaMalloc of four bytes reduces the allocation time to less than a nanosecond per pixel.
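In other words, you can pull that one-time driver startup cost out of the part you care about by making a throwaway CUDA call before you start timing.  A minimal sketch of that dummy allocation:

    void *dummy=0;
    cudaMalloc(&dummy,4); /* first CUDA call: driver locates and sets up the GPU */
    cudaFree(dummy);
    double start=time_in_seconds(); /* now the timer measures CUDA work, not driver startup */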

Second, it takes longer to copy the data off the card than it does to compute it!  And sadly, I get a crash when I try to use "char" data as a kernel parameter, so I'm stuck with the "float" data for now. 
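If that copy cost mattered more, one option (which I haven't verified on this setup, given the char trouble above) would be to convert to bytes on the GPU, so only a quarter as much data crosses the PCIe bus.  A hypothetical sketch, with fill_in_bytes as a made-up variant of the kernel above:

/* Hypothetical: store the iteration count as one byte per pixel on the GPU,
   so the copy back to the CPU moves wid*ht bytes instead of wid*ht floats. */
__global__ void fill_in_bytes(unsigned char *arr,int wid) {
    int i=threadIdx.x+blockDim.x*blockIdx.x; // my thread's global number
    int x=i%wid, y=i/wid; // my thread's pixel to work on
    float cr=x*1.0/wid, ci=y*1.0/wid;
    float zr=cr, zi=ci;
    int count;
    const int max_count=255; /* cap at 255 so the count fits in one byte */
    for (count=0;count<max_count;count++) {
        float nzr=zr*zr-zi*zi + cr;
        float nzi=2.0*zr*zi + ci;
        if ((nzr*nzr+nzi*nzi)>4.0) break;
        zr=nzr; zi=nzi;
    }
    arr[y*wid+x]=(unsigned char)count;
}
/* On the CPU side, cudaMalloc wid*ht unsigned chars and cudaMemcpy them
   straight into the PPM file's pixel data. */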

CUDA's generality does come at a small price.  This GLSL code runs on the same graphics card at about 2.7 ns/pixel, which is about 20% faster than CUDA, mostly due to GLSL's better branch and memory locality.  But GLSL does this by giving up pointers, and the ability to write to random memory locations.
float cr=texcoords.x, ci=texcoords.y;
float zr=cr, zi=ci;
int count;
const int max_count=256;
for (count=0;count<max_count;count++) {
    // z = z*z+c
    float nzr=zr*zr-zi*zi + cr;
    float nzi=2.0*zr*zi + ci;
    if ((nzr*nzr+nzi*nzi)>4.0) break;
    zr=nzr; zi=nzi;
}
gl_FragColor=vec4(count/255.0);

(Try this in NetRun now!)

Here's the same code, written for a multicore CPU using OpenMP.  On my quad-core Q6600, this runs in about 130 ns/pixel, which is about 38x slower than the 3.4 ns/pixel CUDA takes on the GPU.  Even counting the time for the cudaMemcpy back (10.6 ns/pixel including the render), the GPU advantage is still over 12x.

#include <iostream>
#include <fstream>
#include "lib/inc.c" /* NetRun utility functions, like time_in_seconds */

const int wid=512, ht=512;
int render_mandelbrot(int x,int y)
{
    float cr=x*1.0/wid, ci=y*1.0/ht;
    float zr=cr, zi=ci;
    int count;
    const int max_count=256;
    for (count=0;count<max_count;count++) {
        // z = z*z+c
        float nzr=zr*zr-zi*zi + cr;
        float nzi=2.0f*zr*zi + ci;
        if ((nzr*nzr+nzi*nzi)>4.0f) break;
        zr=nzr; zi=nzi;
    }
    return count;
}

/* Run on CPU */
int main(int argc,char *argv[]) {
    double start=time_in_seconds(), elapsed;
    unsigned char *arr=new unsigned char[wid*ht];
    elapsed=time_in_seconds()-start;
    std::cout<<"Allocated array: "<<elapsed*1.0e9/(wid*ht)<<"ns/pixel, time "<<elapsed<<" seconds\n";

    start=time_in_seconds(); // <- reset the timer!
    #pragma omp parallel for schedule(dynamic,10)
    for (int y=0;y<ht;y++)
        for (int x=0;x<wid;x++)
            arr[y*wid+x]=render_mandelbrot(x,y);
    elapsed=time_in_seconds()-start;
    std::cout<<"Rendered array: "<<elapsed*1.0e9/(wid*ht)<<"ns/pixel, time "<<elapsed<<" seconds\n";

    start=time_in_seconds(); // <- reset the timer!
    std::ofstream of("out.ppm",std::ios_base::binary);
    of<<"P5\n"; // greyscale, binary
    of<<wid<<" "<<ht<<"\n"; // image size
    of<<"255\n"; // byte image
    of.write((char *)&arr[0],wid*ht);

    elapsed=time_in_seconds()-start;
    std::cout<<"Wrote out array: "<<elapsed*1.0e9/(wid*ht)<<"ns/pixel, time "<<elapsed<<" seconds\n";
    return 0;
}

(Try this in NetRun now!)

Modern graphics cards are *very* fast!