Multicore Performance Example
Here's a simple example where we simulate the flight path of a baseball for various starting conditions.
#include <iostream>

class baseball {
public:
    float px,py,pz; // position (meters)
    float vx,vy,vz; // velocity (meters/second)

    // Take one timestep of length dt seconds:
    void step(float dt) {
        float drag=-1.0; // crude linear air drag
        float ax=drag*vx, ay=drag*vy, az=-9.8+drag*vz; // drag plus gravity
        float coriolis=0.00001; // tiny sideways deflection
        ax+=coriolis*vy;
        ay-=coriolis*vx;
        vx+=ax*dt; vy+=ay*dt; vz+=az*dt; // Euler-integrate velocity...
        px+=vx*dt; py+=vy*dt; pz+=vz*dt; // ...and position
    }
};

float foo(void) {
#define ASCII 0 /* set to 1 to see ASCII art of the ball's path */
#if ASCII
    const int ASCIIART=100; // chars wide and high
    char buf[ASCIIART][ASCIIART];
    for (int y=0;y<ASCIIART;y++)
    for (int x=0;x<ASCIIART;x++)
        buf[y][x]=' ';
#endif
    baseball b;
    for (float angle=0.0;angle<1.0;angle+=0.125)
    {
        b.px=b.py=0.0; b.pz=2.0;
        b.vx=35.7; b.vy=0.2; b.vz=2.0+angle;

        // Move the baseball:
        for (int i=0;i<1000;i++) {
            b.step(0.001);
#if ASCII
            int y=(int)(b.pz*ASCIIART/100*40);
            int x=(int)(b.px*ASCIIART/100*4);
            if (y>=0 && y<ASCIIART && x>=0 && x<ASCIIART)
                buf[y][x]='O'+(char)(angle*8); // a different letter per angle
#endif
        }
    }
#if ASCII
    for (int y=0;y<ASCIIART;y++) {
        for (int x=0;x<ASCIIART;x++)
            std::cout<<buf[ASCIIART-1-y][x];
        std::cout<<std::endl;
    }
#endif
    return b.pz;
}
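For a baseline, it helps to time the serial version before parallelizing it. Here's a minimal timing-harness sketch, assuming the listing above is compiled in the same file; std::chrono is just one choice of wall-clock timer.

#include <iostream>
#include <chrono>

float foo(void); // the serial simulation from the listing above

int main() {
    auto start=std::chrono::steady_clock::now();
    float z=foo(); // runs all 8 launch angles back to back
    double elapsed=std::chrono::duration<double>(
        std::chrono::steady_clock::now()-start).count();
    std::cout<<"serial run took "<<elapsed*1.0e9<<" ns (final pz="<<z<<")\n";
    return 0;
}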
We can convert this to multicore code using OpenMP by adding a pragma to parallelize the angle loop:
#pragma omp parallel for
    for (int angle_i=0;angle_i<8;angle_i++)
    // for (float angle=0.0;angle<1.0;angle+=0.125) // omp can't do float loops
    {
        float angle=0.125*angle_i;
        baseball b; // declared inside the loop, so each thread gets its own copy
        b.px=b.py=0.0; b.pz=2.0;
        b.vx=35.7; b.vy=0.2; b.vz=2.0+angle;

        // Move the baseball:
        for (int i=0;i<1000;i++) {
            b.step(0.001);
            ...
        }
    }
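Put together, a self-contained version of the OpenMP variant might look like the sketch below (ASCII-art output dropped for brevity). omp_get_wtime is OpenMP's built-in wall-clock timer, and the compile line shown in the comment is just one reasonable choice.

// Self-contained sketch of the OpenMP version (ASCII art omitted).
// Compile with something like: g++ -O3 -fopenmp baseball_omp.cpp
#include <iostream>
#include <omp.h>

class baseball { // same physics as the serial listing above
public:
    float px,py,pz, vx,vy,vz;
    void step(float dt) {
        float drag=-1.0;
        float ax=drag*vx, ay=drag*vy, az=-9.8+drag*vz;
        float coriolis=0.00001;
        ax+=coriolis*vy; ay-=coriolis*vx;
        vx+=ax*dt; vy+=ay*dt; vz+=az*dt;
        px+=vx*dt; py+=vy*dt; pz+=vz*dt;
    }
};

int main() {
    double start=omp_get_wtime();
#pragma omp parallel for
    for (int angle_i=0;angle_i<8;angle_i++) {
        float angle=0.125*angle_i;
        baseball b; // per-thread copy
        b.px=b.py=0.0; b.pz=2.0;
        b.vx=35.7; b.vy=0.2; b.vz=2.0+angle;
        for (int i=0;i<1000;i++) b.step(0.001);
    }
    std::cout<<"parallel loop took "<<(omp_get_wtime()-start)*1.0e9<<" ns\n";
    return 0;
}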
Here's the CUDA baseball simulator:
#include <iostream>
#include <stdio.h>
#include "lib/inc.c"

class baseball {
public:
    float px,py,pz;
    float vx,vy,vz;

    // Same physics as before, but callable from both CPU and GPU:
    __device__ __host__ void step(float dt) {
        float drag=-1.0;
        float ax=drag*vx, ay=drag*vy, az=-9.8+drag*vz;
        float coriolis=0.00001;
        ax+=coriolis*vy;
        ay-=coriolis*vx;
        vx+=ax*dt; vy+=ay*dt; vz+=az*dt;
        px+=vx*dt; py+=vy*dt; pz+=vz*dt;
    }
};

// GPU kernel: each thread simulates one baseball at its own launch angle.
__global__ void move_baseball(baseball *b_array) {
    int i=blockIdx.x*blockDim.x+threadIdx.x; // global thread ID
    float angle=0.000125*i;
    baseball b;
    b.px=b.py=0.0; b.pz=2.0;
    b.vx=35.7; b.vy=0.2; b.vz=2.0+angle;

    // Move the baseball:
    for (int s=0;s<1000;s++) {
        b.step(0.001);
    }
    b_array[i]=b; // write the result back to GPU memory
}

int main() {
    int threads=512; // threads per block
    for (int num_blocks=1;num_blocks<=8192;num_blocks*=2) {
        baseball *b_array;
        //cudaMallocManaged(&b_array,threads*num_blocks*sizeof(baseball)); // managed memory also works, but is slow!
        cudaMalloc(&b_array,threads*num_blocks*sizeof(baseball));

        double start=time_in_seconds();
        move_baseball<<< num_blocks,threads >>>(b_array); // spawn threads
        cudaDeviceSynchronize(); // join threads

        // Copy back one baseball, just to show it's possible:
        baseball cpu_b;
        cudaMemcpy(&cpu_b,b_array,sizeof(baseball),cudaMemcpyDeviceToHost);

        double elapsed=time_in_seconds()-start;
        std::cout<<num_blocks<<" blocks took "<<elapsed*1.0e9<<" ns: "
            <<elapsed*1.0e9/(threads*num_blocks)<<" ns/thread\n";

        cudaFree(b_array); // free before the next, larger allocation
    }
    return 0;
}
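The helper time_in_seconds() comes from "lib/inc.c". If you're building this outside that environment, a plausible drop-in replacement using std::chrono is sketched below, assuming all the helper needs to provide is a wall-clock timestamp in seconds. The whole file builds with nvcc, e.g. nvcc -O3 baseball.cu.

// Drop-in replacement for time_in_seconds() (a sketch, not the original helper).
#include <chrono>

double time_in_seconds(void) {
    // Seconds (as a double) from a monotonic clock; only differences are meaningful.
    return std::chrono::duration<double>(
        std::chrono::steady_clock::now().time_since_epoch()).count();
}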
Performance for a single or a small number of baseballs is far worse than the CPU, because the fixed cost of launching the kernel and synchronizing dominates. Performance for many baseballs is far better than the CPU, since that launch overhead is amortized over thousands of threads and the per-thread cost keeps dropping as blocks are added.
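To make that comparison concrete, one way to generate matching CPU numbers is to run the same per-baseball work over the same counts and report ns/baseball. A rough sketch, reusing the baseball class and time_in_seconds() from the listings above (the OpenMP pragma on the CPU side is optional):

// CPU-side comparison loop (sketch): same work per baseball, same metric.
// Assumes the baseball class and time_in_seconds() defined above.
#include <iostream>

int main() {
    int threads=512; // match the GPU's threads per block
    for (int num_blocks=1;num_blocks<=8192;num_blocks*=2) {
        long n=(long)threads*num_blocks;
        double sum=0.0; // accumulate something so the work isn't optimized away
        double start=time_in_seconds();
    #pragma omp parallel for reduction(+:sum)
        for (long i=0;i<n;i++) {
            baseball b;
            b.px=b.py=0.0; b.pz=2.0;
            b.vx=35.7; b.vy=0.2; b.vz=2.0+0.000125f*i;
            for (int s=0;s<1000;s++) b.step(0.001);
            sum+=b.pz;
        }
        double elapsed=time_in_seconds()-start;
        std::cout<<n<<" baseballs: "<<elapsed*1.0e9/n
            <<" ns/baseball on the CPU (checksum "<<sum<<")\n";
    }
    return 0;
}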