#include <iostream>Basically, we:
#include <cuda.h>
/* GPU code: set an array to a value */
__global__ void set_array(float *vals,float param) {
int i=threadIdx.x; /* find my index */
vals[i]=i+param;
}
/* CPU code: memory movement and kernel calls */
int main(int argc,char *argv[]) {
int n=16; /* total number of floats */
float *vals; /* device array of n values */
cudaMalloc( (void**) &vals, n*sizeof(float) ); //Allocate GPU space
set_array<<<1,n>>>(vals,0.1234); /* Initialize the space on the GPU */
/* Copy a few elements back to CPU for printing */
int i=7;
float f=-999.0; /* CPU copy of value */
cudaMemcpy(&f,&vals[i],sizeof(float),cudaMemcpyDeviceToHost);
std::cout<<"vals["<<i<<"] = "<<f<<"\n";
return 0;
}
#include <iostream>
#include <cuda.h>
/* error checking */
#define check(cudacall) { int err=cudacall; if (err!=cudaSuccess) std::cout<<"CUDA ERROR "<<err<<" at line "<<__LINE__<<"'s "<<#cudacall<<"\n";}
/* GPU code: set an array to a value */
__global__ void set_array(float *vals,float param) {
int i=threadIdx.x+blockDim.x*blockIdx.x; /* find my index */
vals[i]=i+param;
}
int main(int argc,char *argv[]) {
int w=4, h=4; /* number of blocks, threads per block */
int n=w*h; /* total number of floats */
float *vals; /* device array of n values */
check(cudaMalloc( (void**) &vals, n*sizeof(float) )); //Allocate some space
set_array<<<w,h>>>(vals,0.1234); /* Initialize the space on the GPU */
/* Copy a few elements back to CPU for printing */
for (int i=0;i<n;i+=3) {
float f=-999.0; /* CPU copy of value */
check(cudaMemcpy(&f,&vals[i],sizeof(float),cudaMemcpyDeviceToHost));
std::cout<<"vals["<<i<<"] = "<<f<<"\n";
}
return 0;
}
Several interesting architectural details crop up in the CUDA
documentation:
CUDA
Concept |
Purpose |
How big? |
CPU
Equivalent |
thread |
One single execution of a kernel. This is mostly conceptual, since the hardware operates on warps of threads. | 1 |
function call |
warp (of threads) |
A group of 32 threads that all take the same branches. The hardware does a good job with predication, so warps aren't "in your face" like with SSE instructions. | 32 "threads" |
SIMD unit ("floats") |
block (of threads) |
A group of a few hundred
threads
that have access to
the same "__shared__" memory. The block size is
specified in
software, and though limited by hardware to 512 or 1024
threads
maximum, typically 256 threads per block performs
best. Serious
slowdowns for non power of two, or less than 100
or so threads per block. The PTX manual calls blocks
"CTAs"
(Cooperative
Thread Arrays). |
About 256 threads |
One multicore thread (Blocks run SMP/SMT style) |
kernel |
A set of blocks of threads,
all running one __global__ function. |
About a million threads |
Parallel loop |
Notice that despite what I think is *intentionally* obfuscated
terminology, deep down there's a whole lot of similarity between
GPU
and CPU architectures; the GPU is just focused on wider
problems.
The GPU's SIMD units are much wider, 32 floats at a time versus 8
at a
time for AVX. A typical GPU will have perhaps 8
multiprocessors
(each called an SM or SMX), versus about 4 for a CPU. The
big
difference is each multiprocessor is deeply hyperthreaded, able to
run
up to a hundred or so threads, typically limited by the physical
register count, nowadays an absurd 65,536
registers per multiprocessor.
So you get really ridiculously good GPU performance if your code
can
fill all the threads, but if you've got single-threaded work, a
GPU actually gives ridiculously bad performance!
The memory model is also highly segmented and specialized:
Name |
Example |
Purpose |
Size (varies) |
Speed
(approx) |
registers |
int i; |
Local storage within a
thread. |
Several thousand per
multiprocessor. |
10TB/s (!!) |
shared memory |
__shared__ float arr[256]; |
Communication within a thread
block. |
16KB - 48KB/block |
1TB/s |
constant memory | __const__ float *ptr | Read-only prebroadcast memory, for kernel parameters and such. | 64KB total, 8KB per multiprocessor. | 1TB/s? |
global memory |
__global__ float *ptr |
Communication across blocks,
and with the CPU. |
1GB or so. |
100GB/s |
texture memory |
texture<float, 3> myTex;
|
2D and 3D read-only images,
with caching and interpolation. |
(same as global memory) |
100GB/s |
local memory |
__local__ float arr[10]; |
Place for compiler to spill
variables. Much slower than registers--do not use. |
(same as global memory) |
100GB/s |
host memory |
cudaMemcpy( &device[0],
&host[0], sizeof(float)*n, cudaMemcpyHostToDevice) |
Only the CPU can do memory
allocation, file I/O, network comms, etc. |
8-32GB |
2-4GB/s (PCIe) |
network |
skt_sendN(&buf[0],buflen); |
Communicate across a cluster. |
(big) |
100MB/s (gigabit) |
#include <iostream>Here's the output on NetRun's NVIDIA GeForce GTX 280:
#include <cuda.h>
#include "lib/inc.c"
float *dev_ptr=0, *host_ptr=0;
int len=0;
int time_memcpy(void) {
cudaMemcpy(dev_ptr,host_ptr,len,cudaMemcpyHostToDevice);
cudaThreadSynchronize();
return 0;
}
__global__ void doGPUdatawrite(float *arr,float val) {
int i=blockIdx.x*blockDim.x+threadIdx.x;
arr[i]=val;
}
int time_datawrite(void) {
int blocks=1, tpb=len;
while (tpb>=512) {blocks*=2; tpb/=2;}
doGPUdatawrite<<<blocks,tpb>>>(dev_ptr,1.2345);
cudaThreadSynchronize();
return 0;
}
void time_sweep(const char *name,timeable_fn f) {
int max=1024*1024*4;
cudaMalloc((void **)&dev_ptr,max*sizeof(float));
cudaMallocHost((void **)&host_ptr,max*sizeof(float));
std::cout<<"Did mallocs\n";
for (len=64;len<=max;len*=4) {
double t=time_function(f);
printf("%s size %d: %.2f ns/float (%.1f us)\n",
name,len,t/len*1.0e9,t*1.0e6);
}
cudaFreeHost(host_ptr);
cudaFree(dev_ptr);
}
int main(int argc,char *argv[]) {
std::cout<<"Starting up...\n";
time_sweep("memcpy",time_memcpy);
time_sweep("GPU write",time_datawrite);
return 0;
}
memcpy size 64: 163.73 ns/float (10.5 us)Note that really big arrays, with millions of floats, take much less time per float than smaller arrays. The basic trouble is that going through the OS, across the PCIe bus, into the GPU, and back takes like 10 microseconds (10us = 10000ns). If you go to all that just to get one or two floats, you'll get terrible performance.
memcpy size 256: 41.11 ns/float (10.5 us)
memcpy size 1024: 10.91 ns/float (11.2 us)
memcpy size 4096: 2.95 ns/float (12.1 us)
memcpy size 16384: 1.01 ns/float (16.6 us)
memcpy size 65536: 0.53 ns/float (34.7 us)
memcpy size 262144: 0.41 ns/float (108.3 us)
memcpy size 1048576: 0.38 ns/float (401.3 us)
memcpy size 4194304: 0.38 ns/float (1574.1 us)
GPU write size 64: 200.15 ns/float (12.8 us)
GPU write size 256: 50.37 ns/float (12.9 us)
GPU write size 1024: 12.65 ns/float (13.0 us)
GPU write size 4096: 3.21 ns/float (13.2 us)
GPU write size 16384: 0.85 ns/float (13.9 us)
GPU write size 65536: 0.25 ns/float (16.5 us)
GPU write size 262144: 0.10 ns/float (25.7 us)
GPU write size 1048576: 0.06 ns/float (62.8 us)
GPU write size 4194304: 0.05 ns/float (210.4 us)
#include <cuda.h>The surprising part about this is the timing breakdown:
#include <iostream>
#include <fstream>
#include "lib/inc.c" /* NetRun utility functions, like time_in_seconds */
#define check(cudacall) { int err=cudacall; if (err!=cudaSuccess) std::cout<<"CUDA ERROR "<<err<<" at line "<<__LINE__<<"'s "<<#cudacall<<"\n";}
/* GPU Code! */
__global__ void fill_in_array(float *arr,int wid) {
int i=threadIdx.x+blockDim.x*blockIdx.x; // my thread's global number
int x=i%wid, y=i/wid; // my thread's pixel to work on
float cr=x*1.0/wid, ci=y*1.0/wid;
float zr=cr, zi=ci;
int count;
const int max_count=256;
for (count=0;count<max_count;count++) {
// z= z*z+c
float nzr=zr*zr-zi*zi + cr;
float nzi=2.0*zr*zi + ci;
if ((nzr*nzr+nzi*nzi)>4.0) break;
zr=nzr; zi=nzi;
}
arr[y*wid+x]=count;
}
/* Run on CPU */
int main(int argc,char *argv[]) {
int wid=512,ht=512;
float *arr=0; /* LIVES ON THE GPU!!!! */
double start=time_in_seconds(), elapsed;
check(cudaMalloc((void **)&arr, wid*ht*sizeof(float)));
elapsed=time_in_seconds()-start;
std::cout<<"Allocated array: "<<elapsed*1.0e9/(wid*ht)<<"ns/pixel, time "<<elapsed<<" seconds\n";
start=time_in_seconds(); // <- reset the timer!
int threadsPerBlock=512;
int nBlocks=wid*ht/threadsPerBlock;
fill_in_array<<<nBlocks,threadsPerBlock>>>(arr,wid);
check(cudaThreadSynchronize()); // look for errors in kernel
elapsed=time_in_seconds()-start;
std::cout<<"Rendered array: "<<elapsed*1.0e9/(wid*ht)<<"ns/pixel, time "<<elapsed<<" seconds\n";
float harr[wid*ht];
check(cudaMemcpy(harr,arr,wid*ht*sizeof(float),cudaMemcpyDeviceToHost));
elapsed=time_in_seconds()-start;
std::cout<<"Copied out array: "<<elapsed*1.0e9/(wid*ht)<<"ns/pixel, time "<<elapsed<<" seconds\n";
std::ofstream of("out.ppm",std::ios_base::binary);
of<<"P5\n"; // greyscale, binary
of<<wid<<" "<<ht<<"\n"; // image size
of<<"255\n"; // byte image
for (int i=0;i<wid*ht;i++) {
char c=(char)harr[i];
of<<c;
}
return 0;
}
Allocated array: 152.6ns/pixel, time 0.040 secondsThe biggest surprise is the cudaMalloc time, which is ridiculously huge because this is the first CUDA call in the program, so the driver has to locate and set up the graphics card. Adding a "dummy" cudaMalloc of four bytes reduces the allocation time to less than a nanosecond per pixel.
Rendered array: 3.4ns/pixel, time 0.001 seconds
Copied out array: 10.6ns/pixel, time 0.003 seconds
Allocated array: 0.02ns/pixel, time 5.11634e-06 seconds
Rendered array: 2.74ns/pixel, time 0.000719074 seconds
Copied out array: 4.63ns/pixel, time 0.00121307 seconds
The easiest way to apply CUDA is to naturally parallel
problems--things like:
It gets much harder if the threads need to communicate in any
way; and many cryptographic operations with sequential
dependencies, like the repetitive rounds of nearly any cipher, or
the chained (dependent) blocks in CBC, can't be run on parallel
hardware at all.
Generally, graphics hardware is oriented towards floating point work (they didn't have integer hardware until 2006 or so, well after programmability), yet I don't know of any popular ciphers that use any floating point. For large-integer work, like most public key algorithms, you'd need a CUDA multiprecision arithmetic library; I've started looking at CUMP but I haven't found a modular arithmetic layer, and am beginning to doubt that one exists.