Basically, we: allocate space on the GPU with cudaMalloc, launch a kernel to fill it, and cudaMemcpy one value back so the CPU can print it.
#include <iostream>
#include <cuda.h>
/* GPU code: set an array to a value */
__global__ void set_array(float *vals,float param) {
int i=threadIdx.x; /* find my index */
vals[i]=i+param;
}
int main(int argc,char *argv[]) {
int n=16; /* total number of floats */
float *vals; /* device array of n values */
cudaMalloc( (void**) &vals, n*sizeof(float) ); //Allocate GPU space
set_array<<<1,n>>>(vals,0.1234); /* Initialize the space on the GPU */
/* Copy a few elements back to CPU for printing */
int i=7;
float f=-999.0; /* CPU copy of value */
cudaMemcpy(&f,&vals[i],sizeof(float),cudaMemcpyDeviceToHost);
std::cout<<"vals["<<i<<"] = "<<f<<"\n";
return 0;
}
Here's the same program again, now with error checking on the CUDA calls, and the work split across several blocks of threads:
#include <iostream>
#include <cuda.h>
/* error checking */
#define check(cudacall) { int err=cudacall; if (err!=cudaSuccess) std::cout<<"CUDA ERROR "<<err<<" at line "<<__LINE__<<"'s "<<#cudacall<<"\n";}
/* GPU code: set an array to a value */
__global__ void set_array(float *vals,float param) {
int i=threadIdx.x+blockDim.x*blockIdx.x; /* find my index */
vals[i]=i+param;
}
int main(int argc,char *argv[]) {
int w=4, h=4; /* number of blocks, threads per block */
int n=w*h; /* total number of floats */
float *vals; /* device array of n values */
check(cudaMalloc( (void**) &vals, n*sizeof(float) )); //Allocate some space
set_array<<<w,h>>>(vals,0.1234); /* Initialize the space on the GPU */
/* Copy a few elements back to CPU for printing */
for (int i=0;i<n;i+=3) {
float f=-999.0; /* CPU copy of value */
check(cudaMemcpy(&f,&vals[i],sizeof(float),cudaMemcpyDeviceToHost));
std::cout<<"vals["<<i<<"] = "<<f<<"\n";
}
return 0;
}
Here's a little benchmark, using NetRun's time_function, that times both a cudaMemcpy into the GPU and a GPU kernel that writes the array directly, for a range of array sizes:
#include <iostream>
#include <cuda.h>
#include "lib/inc.c"
float *dev_ptr=0, *host_ptr=0;
int len=0;
int time_memcpy(void) {
cudaMemcpy(dev_ptr,host_ptr,len,cudaMemcpyHostToDevice);
cudaThreadSynchronize();
return 0;
}
__global__ void doGPUdatawrite(float *arr,float val) {
int i=blockIdx.x*blockDim.x+threadIdx.x;
arr[i]=val;
}
int time_datawrite(void) {
int blocks=1, tpb=len;
while (tpb>=512) {blocks*=2; tpb/=2;}
doGPUdatawrite<<<blocks,tpb>>>(dev_ptr,1.2345);
cudaThreadSynchronize();
return 0;
}
void time_sweep(const char *name,timeable_fn f) {
int max=1024*1024*4;
cudaMalloc((void **)&dev_ptr,max*sizeof(float));
cudaMallocHost((void **)&host_ptr,max*sizeof(float));
std::cout<<"Did mallocs\n";
for (len=64;len<=max;len*=4) {
double t=time_function(f);
printf("%s size %d: %.2f ns/float (%.1f us)\n",
name,len,t/len*1.0e9,t*1.0e6);
}
cudaFreeHost(host_ptr);
cudaFree(dev_ptr);
}
int main(int argc,char *argv[]) {
std::cout<<"Starting up...\n";
time_sweep("memcpy",time_memcpy);
time_sweep("GPU write",time_datawrite);
return 0;
}
Here's the output on NetRun's NVIDIA GeForce GTX 280:
memcpy size 64: 163.73 ns/float (10.5 us)
memcpy size 256: 41.11 ns/float (10.5 us)
memcpy size 1024: 10.91 ns/float (11.2 us)
memcpy size 4096: 2.95 ns/float (12.1 us)
memcpy size 16384: 1.01 ns/float (16.6 us)
memcpy size 65536: 0.53 ns/float (34.7 us)
memcpy size 262144: 0.41 ns/float (108.3 us)
memcpy size 1048576: 0.38 ns/float (401.3 us)
memcpy size 4194304: 0.38 ns/float (1574.1 us)
GPU write size 64: 200.15 ns/float (12.8 us)
GPU write size 256: 50.37 ns/float (12.9 us)
GPU write size 1024: 12.65 ns/float (13.0 us)
GPU write size 4096: 3.21 ns/float (13.2 us)
GPU write size 16384: 0.85 ns/float (13.9 us)
GPU write size 65536: 0.25 ns/float (16.5 us)
GPU write size 262144: 0.10 ns/float (25.7 us)
GPU write size 1048576: 0.06 ns/float (62.8 us)
GPU write size 4194304: 0.05 ns/float (210.4 us)
Note that really big arrays, with millions of floats, take much less time per float than smaller arrays. The basic trouble is that going through the OS, across the PCIe bus, into the GPU, and back takes about 10 microseconds (10us = 10000ns). If you go through all that just to move one or two floats, you'll get abysmal performance.
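For example (a sketch, not part of the benchmark above; copy_back_slow and copy_back_fast are made-up helpers), fetching floats one cudaMemcpy at a time pays that ~10us round trip for every float, while a single batched copy pays it once:
#include <cuda.h>
/* Hypothetical helpers contrasting per-element copies with one batched copy */
void copy_back_slow(float *host,float *dev,int n) {
  for (int i=0;i<n;i++) /* one driver call and PCIe round trip (~10us) per float */
    cudaMemcpy(&host[i],&dev[i],sizeof(float),cudaMemcpyDeviceToHost);
}
void copy_back_fast(float *host,float *dev,int n) {
  /* the same data in one call: the ~10us overhead is paid only once */
  cudaMemcpy(host,dev,n*sizeof(float),cudaMemcpyDeviceToHost);
}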
Here's a complete CUDA program that renders the Mandelbrot set and writes the image out as a PPM file:
#include <cuda.h>
#include <iostream>
#include <fstream>
#include "lib/inc.c" /* NetRun utility functions, like time_in_seconds */
#define check(cudacall) { int err=cudacall; if (err!=cudaSuccess) std::cout<<"CUDA ERROR "<<err<<" at line "<<__LINE__<<"'s "<<#cudacall<<"\n";}
/* GPU Code! */
__global__ void fill_in_array(float *arr,int wid) {
int i=threadIdx.x+blockDim.x*blockIdx.x; // my thread's global number
int x=i%wid, y=i/wid; // my thread's pixel to work on
float cr=x*1.0/wid, ci=y*1.0/wid;
float zr=cr, zi=ci;
int count;
const int max_count=256;
for (count=0;count<max_count;count++) {
// z= z*z+c
float nzr=zr*zr-zi*zi + cr;
float nzi=2.0*zr*zi + ci;
if ((nzr*nzr+nzi*nzi)>4.0) break;
zr=nzr; zi=nzi;
}
arr[y*wid+x]=count;
}
/* Run on CPU */
int main(int argc,char *argv[]) {
int wid=512,ht=512;
float *arr=0; /* LIVES ON THE GPU!!!! */
double start=time_in_seconds(), elapsed;
check(cudaMalloc((void **)&arr, wid*ht*sizeof(float)));
elapsed=time_in_seconds()-start;
std::cout<<"Allocated array: "<<elapsed*1.0e9/(wid*ht)<<"ns/pixel, time "<<elapsed<<" seconds\n";
start=time_in_seconds(); // <- reset the timer!
int threadsPerBlock=512;
int nBlocks=wid*ht/threadsPerBlock;
fill_in_array<<<nBlocks,threadsPerBlock>>>(arr,wid);
check(cudaThreadSynchronize()); // look for errors in kernel
elapsed=time_in_seconds()-start;
std::cout<<"Rendered array: "<<elapsed*1.0e9/(wid*ht)<<"ns/pixel, time "<<elapsed<<" seconds\n";
float harr[wid*ht];
check(cudaMemcpy(harr,arr,wid*ht*sizeof(float),cudaMemcpyDeviceToHost));
elapsed=time_in_seconds()-start;
std::cout<<"Copied out array: "<<elapsed*1.0e9/(wid*ht)<<"ns/pixel, time "<<elapsed<<" seconds\n";
std::ofstream of("out.ppm",std::ios_base::binary);
of<<"P5\n"; // greyscale, binary
of<<wid<<" "<<ht<<"\n"; // image size
of<<"255\n"; // byte image
for (int i=0;i<wid*ht;i++) {
char c=(char)harr[i];
of<<c;
}
return 0;
}
The surprising part about this is the timing breakdown:
Allocated array: 152.6ns/pixel, time 0.040 seconds
Rendered array: 3.4ns/pixel, time 0.001 seconds
Copied out array: 10.6ns/pixel, time 0.003 seconds
The biggest surprise is the cudaMalloc time, which is ridiculously huge because this is the first CUDA call in the program, so the driver has to locate and set up the graphics card. Adding a "dummy" cudaMalloc of four bytes reduces the allocation time to less than a nanosecond per pixel.
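The warm-up is just a throwaway allocation before the timer starts; here's a sketch of that change (the name dummy is arbitrary):
/* At the top of main(), before starting the timer: */
float *dummy=0;
check(cudaMalloc((void **)&dummy,4)); /* tiny 4-byte allocation absorbs the one-time driver/context startup */
With the dummy allocation in place, the timing becomes: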
Allocated array: 0.02ns/pixel, time 5.11634e-06 seconds
Rendered array: 2.74ns/pixel, time 0.000719074 seconds
Copied out array: 4.63ns/pixel, time 0.00121307 seconds
GLSL is as fast as the tuned char-size CUDA immediately, and the code is much shorter. But GLSL does this by giving up pointers, and the ability to write to arbitrary memory locations. Here's the Mandelbrot inner loop as a GLSL fragment shader:
float cr=texcoords.x, ci=texcoords.y;
float zr=cr, zi=ci;
int count;
const int max_count=256;
for (count=0;count<max_count;count++) {
// z= z*z+c
float nzr=zr*zr-zi*zi + cr;
float nzi=2.0*zr*zi + ci;
if ((nzr*nzr+nzi*nzi)>4.0) break;
zr=nzr; zi=nzi;
}
gl_FragColor=vec4(count/255.0);
Here's the same code, written for a multicore CPU using
OpenMP (which we'll cover at the end of this month). On my quad-core Q6600, this runs in about 130ns/pixel,
which is about 48x slower than CUDA on the GPU. Even counting
time for the cudaMemcpy, the GPU advantage is still over 28x.
#include <iostream>
#include <fstream>
#include "lib/inc.c" /* NetRun utility functions, like time_in_seconds */
const int wid=512, ht=512;
int render_mandelbrot(int x,int y)
{
float cr=x*1.0/wid, ci=y*1.0/ht;
float zr=cr, zi=ci;
int count;
const int max_count=256;
for (count=0;count<max_count;count++) {
// z= z*z+c
float nzr=zr*zr-zi*zi + cr;
float nzi=2.0f*zr*zi + ci;
if ((nzr*nzr+nzi*nzi)>4.0f) break;
zr=nzr; zi=nzi;
}
return count;
}
/* Run on CPU */
int main(int argc,char *argv[]) {
double start=time_in_seconds(), elapsed;
unsigned char *arr=new unsigned char[wid*ht];
elapsed=time_in_seconds()-start;
std::cout<<"Allocated array: "<<elapsed*1.0e9/(wid*ht)<<"ns/pixel, time "<<elapsed<<" seconds\n";
start=time_in_seconds(); // <- reset the timer!
#pragma omp parallel for schedule(dynamic,10)
for (int y=0;y<ht;y++)
for (int x=0;x<wid;x++)
arr[y*wid+x]=render_mandelbrot(x,y);
elapsed=time_in_seconds()-start;
std::cout<<"Rendered array: "<<elapsed*1.0e9/(wid*ht)<<"ns/pixel, time "<<elapsed<<" seconds\n";
start=time_in_seconds(); // <- reset the timer!
std::ofstream of("out.ppm",std::ios_base::binary);
of<<"P5\n"; // greyscale, binary
of<<wid<<" "<<ht<<"\n"; // image size
of<<"255\n"; // byte image
of.write((char *)&arr[0],wid*ht);
elapsed=time_in_seconds()-start;
std::cout<<"Wrote out array: "<<elapsed*1.0e9/(wid*ht)<<"ns/pixel, time "<<elapsed<<" seconds\n";
return 0;
}
Here we combine multicore (OpenMP) with SSE intrinsics, which runs in about 40ns/pixel. This is still over 10x slower than the GPU. Modern graphics cards are *very* fast!
#include <iostream>
#include <fstream>
#include "lib/inc.c" /* NetRun utility functions, like time_in_seconds */
#include <xmmintrin.h>
const int wid=512, ht=512;
/* Run on CPU */
int main(int argc,char *argv[]) {
double start=time_in_seconds(), elapsed;
unsigned char *arr=new unsigned char[wid*ht];
elapsed=time_in_seconds()-start;
std::cout<<"Allocated array: "<<elapsed*1.0e9/(wid*ht)<<"ns/pixel, time "<<elapsed<<" seconds\n";
start=time_in_seconds(); // <- reset the timer!
#pragma omp parallel for schedule(dynamic,10)
for (int y=0;y<ht;y++)
for (int x=0;x<wid;x+=4) {
float cr[4], ci[4];
for (int i=0;i<4;i++) {
cr[i]=(x+i)*1.0f/wid;
ci[i]=y*1.0f/ht;
}
float zero[4]={0.0,0.0,0.0,0.0};
float two[4]={2.0,2.0,2.0,2.0};
__m128 TWO=_mm_load_ps(two);
float four[4]={4.0,4.0,4.0,4.0};
__m128 FOUR=_mm_load_ps(four);
__m128 CR=_mm_load_ps(cr), CI=_mm_load_ps(ci);
__m128 ZR=_mm_load_ps(zero), ZI=_mm_load_ps(zero);
int count;
enum {max_count=100};
for (count=0;count<max_count;count++) {
__m128 tZR=ZR*ZR-ZI*ZI+CR; /*subtle: don't overwrite ZR yet!*/
__m128 tZI=TWO*ZR*ZI+CI;
__m128 LEN=tZR*tZR+tZI*tZI;
__m128 MASK=_mm_cmplt_ps(LEN,FOUR); /* set if len<4.0 */
ZR=_mm_or_ps(_mm_and_ps(MASK,tZR),_mm_andnot_ps(MASK,ZR));
ZI=_mm_or_ps(_mm_and_ps(MASK,tZI),_mm_andnot_ps(MASK,ZI));
if (_mm_movemask_ps(LEN-FOUR)==0) break; /* everybody's > 4 */
}
_mm_store_ps(cr,ZR); _mm_store_ps(ci,ZI);
for (int i=0;i<4;i++) {
arr[x+i+y*wid]=(256.0/2.0)*cr[i];
}
}
elapsed=time_in_seconds()-start;
std::cout<<"Rendered array: "<<elapsed*1.0e9/(wid*ht)<<"ns/pixel, time "<<elapsed<<" seconds\n";
start=time_in_seconds(); // <- reset the timer!
std::ofstream of("out.ppm",std::ios_base::binary);
of<<"P5\n"; // greyscale, binary
of<<wid<<" "<<ht<<"\n"; // image size
of<<"255\n"; // byte image
of.write((char *)&arr[0],wid*ht);
elapsed=time_in_seconds()-start;
std::cout<<"Wrote out array: "<<elapsed*1.0e9/(wid*ht)<<"ns/pixel, time "<<elapsed<<" seconds\n";
return 0;
}
The Thrust library provides STL-style host and device vectors, plus GPU implementations of algorithms like reduce and sort. Here's thrust::reduce summing a vector of ints on the GPU:
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/reduce.h>
#include <cstdlib>
int main(void)
{
// generate some random data on the host
thrust::host_vector<int> h_vec(10);
for (unsigned int i=0;i<h_vec.size();i++) h_vec[i]=rand()%10;
// transfer to device
thrust::device_vector<int> d_vec = h_vec;
// sum on device
int final_sum = thrust::reduce(d_vec.begin(), d_vec.end(),
0, thrust::plus<int>());
std::cout<<"Final sum="<<final_sum<<"\n";
return 0;
}
Here's how to use thrust::reduce to find the biggest and smallest elements in the array too.
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/reduce.h>
#include <cstdlib>
int main(void)
{
// generate some random data on the host
thrust::host_vector<int> h_vec(10);
for (unsigned int i=0;i<h_vec.size();i++) h_vec[i]=rand()%10;
// transfer to device
thrust::device_vector<int> d_vec = h_vec;
// sum on device
int final_sum = thrust::reduce(d_vec.begin(), d_vec.end(),
0, thrust::plus<int>());
int final_max = thrust::reduce(d_vec.begin(), d_vec.end(),
0, thrust::maximum<int>());
int final_min = thrust::reduce(d_vec.begin(), d_vec.end(),
999, thrust::minimum<int>());
std::cout<<"Final sum="<<final_sum<<" max="<<final_max<<" min="<<final_min<<"\n";
return 0;
}
It's not very efficient (see below) to call thrust::reduce three
times on the same vector. It's more efficient to call it once,
and collect up the sum, min, and max all in one go. To do this,
we need to write a weird 'functor' to pass to thrust::reduce.
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/reduce.h>
#include <cstdlib>
// This is used to store everything we know about the ints seen so far:
class sum_min_max {
public:
int sum, min, max;
sum_min_max() {sum=0; min=1000000000; max=-1000000000;}
__device__ __host__ sum_min_max(int value) {sum=value; min=value; max=value;}
};
// This 'functor' function object combines two sum_min_max objects
class smm_combiner {
public:
__device__ __host__
sum_min_max operator()(sum_min_max l,const sum_min_max &r) {
l.sum+=r.sum;
if (l.min>r.min) l.min=r.min;
if (l.max<r.max) l.max=r.max;
return l;
}
};
int main(void)
{
// generate some random data on the host
thrust::host_vector<int> h_vec(10);
for (unsigned int i=0;i<h_vec.size();i++) h_vec[i]=rand()%10;
// transfer to device
thrust::device_vector<int> d_vec = h_vec;
// sum/min/max on device
sum_min_max final = thrust::reduce(d_vec.begin(), d_vec.end(),
sum_min_max(), smm_combiner());
std::cout<<"Final sum="<<final.sum<<" max="<<final.max<<" min="<<final.min<<"\n";
return 0;
}
This same idea could probably be better written as a thrust::tuple.
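For example, here's a sketch of that approach, using thrust::transform_reduce to turn each int into a (sum, min, max) tuple and a tuple-combining functor (make_smm and combine_smm are just illustrative names, not part of Thrust):
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/transform_reduce.h>
#include <thrust/tuple.h>
#include <cstdlib>
#include <iostream>
typedef thrust::tuple<int,int,int> smm_tuple; /* (sum, min, max) */
/* Turn one int into a (sum,min,max) tuple */
struct make_smm {
  __host__ __device__ smm_tuple operator()(int v) const {
    return thrust::make_tuple(v,v,v);
  }
};
/* Combine two (sum,min,max) tuples */
struct combine_smm {
  __host__ __device__ smm_tuple operator()(const smm_tuple &l,const smm_tuple &r) const {
    int sum=thrust::get<0>(l)+thrust::get<0>(r);
    int mn=thrust::get<1>(l)<thrust::get<1>(r) ? thrust::get<1>(l) : thrust::get<1>(r);
    int mx=thrust::get<2>(l)>thrust::get<2>(r) ? thrust::get<2>(l) : thrust::get<2>(r);
    return thrust::make_tuple(sum,mn,mx);
  }
};
int main(void)
{
// generate some random data on the host
thrust::host_vector<int> h_vec(10);
for (unsigned int i=0;i<h_vec.size();i++) h_vec[i]=rand()%10;
// transfer to device
thrust::device_vector<int> d_vec = h_vec;
// sum/min/max on device, all in one pass
smm_tuple final = thrust::transform_reduce(d_vec.begin(), d_vec.end(),
    make_smm(), thrust::make_tuple(0,1000000000,-1000000000), combine_smm());
std::cout<<"Final sum="<<thrust::get<0>(final)
    <<" max="<<thrust::get<2>(final)
    <<" min="<<thrust::get<1>(final)<<"\n";
return 0;
}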
Thrust also provides a parallel sort. Here's thrust::sort run on a device_vector:
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <cstdlib>
#include "lib/inc.c" /* for netrun timing functions */
int main(void)
{
// generate some random data on the host
thrust::host_vector<int> h_vec(10);
for (unsigned int i=0;i<h_vec.size();i++) h_vec[i]=rand();
// transfer to device
thrust::device_vector<int> d_vec = h_vec;
// sort on device
thrust::sort(d_vec.begin(), d_vec.end());
// copy back, and print
h_vec = d_vec;
for (unsigned int i=0;i<h_vec.size();i++)
std::cout<<"val["<<i<<"] = "<<h_vec[i]<<"\n";
return 0;
}
Here's the same sort, timed for a range of array sizes:
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/generate.h>
#include <thrust/sort.h>
#include <cstdlib>
#include "lib/inc.c" /* for netrun timing functions */
thrust::device_vector<int> d_vec;
int sort_device_vector(void) {
// sort on device
thrust::sort(d_vec.begin(), d_vec.end());
return 0;
}
int main(void)
{
for (int vec_size=16;vec_size<=4*1024*1024;vec_size*=4)
{
// generate random data on the host
thrust::host_vector<int> h_vec(vec_size);
for (unsigned int i=0;i<h_vec.size();i++) h_vec[i]=rand();
// transfer to device
d_vec = h_vec;
double time=time_function(sort_device_vector);
printf("Time for sort of size %d: %.3f ns/elt (%.3f ms)\n",
h_vec.size(), time/h_vec.size()*1.0e9, time*1.0e3);
// copy back and print
h_vec = d_vec;
if (vec_size<=16)
for (unsigned int i=0;i<h_vec.size();i++)
std::cout<<"val["<<i<<"] = "<<h_vec[i]<<"\n";
}
return 0;
}
This outputs:
Time for sort of size 16: 21099.579 ns/elt (0.338 ms)
Time for sort of size 64: 5302.252 ns/elt (0.339 ms)
Time for sort of size 256: 1377.077 ns/elt (0.353 ms)
Time for sort of size 1024: 312.655 ns/elt (0.320 ms)
Time for sort of size 4096: 78.331 ns/elt (0.321 ms)
Time for sort of size 16384: 20.922 ns/elt (0.343 ms)
Time for sort of size 65536: 8.636 ns/elt (0.566 ms)
Time for sort of size 262144: 5.332 ns/elt (1.398 ms)
Time for sort of size 1048576: 3.527 ns/elt (3.699 ms)
Time for sort of size 4194304: 3.027 ns/elt (12.694 ms)
Note how sorting 16 elements and sorting 16 thousand elements take nearly exactly the same amount of time. This is common on the GPU: the startup time to get a kernel running is measured in microseconds, not nanoseconds like a function call. This means you need to do work in huge batches, millions of data items at a time, to get good efficiency.
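One way to batch up many small, independent pieces of work (a sketch, not from the timing above; the sizes and helper names are just an illustration) is to pack everything into one big device_vector with a segment key per element, then make a single call to thrust::reduce_by_key:
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/reduce.h>
#include <cstdlib>
#include <iostream>
int main(void)
{
// hypothetical: 1000 little arrays of 100 ints each, packed end to end
int n_arrays=1000, per_array=100;
thrust::host_vector<int> h_keys(n_arrays*per_array), h_vals(n_arrays*per_array);
for (int a=0;a<n_arrays;a++)
  for (int i=0;i<per_array;i++) {
    h_keys[a*per_array+i]=a; /* which little array this element belongs to */
    h_vals[a*per_array+i]=rand()%10;
  }
// transfer to device
thrust::device_vector<int> d_keys=h_keys, d_vals=h_vals;
thrust::device_vector<int> out_keys(n_arrays), out_sums(n_arrays);
// one GPU call computes all 1000 per-array sums
thrust::reduce_by_key(d_keys.begin(), d_keys.end(), d_vals.begin(),
    out_keys.begin(), out_sums.begin());
std::cout<<"Sum of little array 0 = "<<out_sums[0]<<"\n";
return 0;
}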