Multicore Performance Example

Here's a simple example where we simulate the flight path of a baseball for various starting conditions.

class baseball {
public:
	float px,py,pz;
	float vx,vy,vz;
	// Take one timestep of length dt (forward Euler integration)
	void step(float dt) {
		float drag=-1.0; // linear drag, opposing the velocity
		float ax=drag*vx, ay=drag*vy, az=-9.8+drag*vz; // drag plus gravity
		float coriolis=0.00001; // small horizontal Coriolis deflection
		ax+=coriolis*vy; ay-=coriolis*vx; 
		vx+=ax*dt; vy+=ay*dt; vz+=az*dt; // integrate acceleration into velocity
		px+=vx*dt; py+=vy*dt; pz+=vz*dt; // integrate velocity into position
	}
};

float foo(void) {

#define ASCII 0 /* set to 1 to see ASCII art of the ball's path */
#if ASCII
	const int ASCIIART=100; // chars wide and high
	char buf[ASCIIART][ASCIIART];
	for (int y=0;y<ASCIIART;y++) for (int x=0;x<ASCIIART;x++) buf[y][x]=' ';
#endif

	baseball b;
	for (float angle=0.0;angle<1.0;angle+=0.125)
	{
		b.px=b.py=0.0; b.pz=2.0;
		b.vx=35.7; b.vy=0.2; b.vz=2.0+angle;
		
	//	Move the baseball:
		for (int i=0;i<1000;i++)
		{
			b.step(0.001);
	
	#if ASCII
			int y=(int)(b.pz*ASCIIART/100*40);
			int x=(int)(b.px*ASCIIART/100*4);
			if (y>=0 && y<ASCIIART && x>=0 && x<ASCIIART)
				buf[y][x]='O'+(char)(angle*8);
	#endif
		}
	}

#if ASCII
	for (int y=0;y<ASCIIART;y++) {
		for (int x=0;x<ASCIIART;x++) 
			std::cout<<buf[ASCIIART-1-y][x];
		std::cout<<std::endl;
	}
#endif

	return b.pz;
}

(Try this in NetRun now!)

We can convert this to multicore using OpenMP by adding a pragma to parallelize the angle loop. Because OpenMP can't parallelize a loop with a floating-point loop variable, we also switch to an integer counter angle_i and compute the angle from it:

#pragma omp parallel for
	for (int angle_i=0;angle_i<8;angle_i++)
//	for (float angle=0.0;angle<1.0;angle+=0.125) // omp can't do float loops
	{
		float angle=0.125*angle_i;
		baseball b;
		b.px=b.py=0.0; b.pz=2.0;
		b.vx=35.7; b.vy=0.2; b.vz=2.0+angle;
		
		for (int i=0;i<1000;i++)
		{
			b.step(0.001);
			// ... ASCII art plotting as before ...
		}
}

(Try this in NetRun now!)
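
One detail: the serial version returned b.pz from the last baseball, but in the parallel loop each iteration has its own private b that disappears when the iteration ends. A simple way to get results back out is to give each angle its own array slot, since distinct iterations then never touch the same memory. Here's a minimal sketch of that idea, assuming the same baseball class as above (outside NetRun, compile with g++ -fopenmp):

float foo(void) {
	float results[8]; // one final height per launch angle
#pragma omp parallel for
	for (int angle_i=0;angle_i<8;angle_i++)
	{
		float angle=0.125*angle_i;
		baseball b;
		b.px=b.py=0.0; b.pz=2.0;
		b.vx=35.7; b.vy=0.2; b.vz=2.0+angle;
		for (int i=0;i<1000;i++) b.step(0.001);
		results[angle_i]=b.pz; // each iteration owns its own slot, so no data race
	}
	float sum=0.0;
	for (int a=0;a<8;a++) sum+=results[a];
	return sum; // combine the results so the compiler can't discard the work
}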

Here's the CUDA baseball simulator. The baseball class is the same as before, except step is marked __device__ __host__ so it can be compiled for both the GPU and the CPU; the per-baseball work moves into a __global__ kernel, move_baseball, where each GPU thread simulates one baseball:

#include <iostream>
#include <stdio.h>
#include "lib/inc.c"

class baseball {
public:
	float px,py,pz;
	float vx,vy,vz;
	__device__ __host__ void step(float dt) {
		float drag=-1.0;
		float ax=drag*vx, ay=drag*vy, az=-9.8+drag*vz;
		float coriolis=0.00001;
		ax+=coriolis*vy; ay-=coriolis*vx; 
		vx+=ax*dt; vy+=ay*dt; vz+=az*dt;
		px+=vx*dt; py+=vy*dt; pz+=vz*dt;
	}
};

__global__ void move_baseball(baseball *b_array) {
	int i=blockIdx.x*blockDim.x+threadIdx.x; // global thread ID
	float angle=0.000125*i; // each thread gets its own launch angle
	baseball b;
	b.px=b.py=0.0; b.pz=2.0;
	b.vx=35.7; b.vy=0.2; b.vz=2.0+angle;
	
	//	Move the baseball:
	for (int t=0;t<1000;t++) // timestep loop; i is this thread's index
	{
		b.step(0.001);
	}

	b_array[i]=b;
}

int main() {
	int threads=512;
	for (int num_blocks=1;num_blocks<=8192;num_blocks*=2) {
		
		baseball *b_array;
		//cudaMallocManaged(&b_array,threads*num_blocks*sizeof(baseball)); // managed memory works too, but it's slow!
		cudaMalloc(&b_array,threads*num_blocks*sizeof(baseball));
		
		double start=time_in_seconds();
		
		move_baseball<<< num_blocks,threads >>>(b_array); // spawn threads
		cudaDeviceSynchronize(); // join threads
		
		// Copy back one random baseball, to show it's possible:
		baseball cpu_b;
		cudaMemcpy(&cpu_b,b_array,sizeof(baseball),cudaMemcpyDeviceToHost);
		double elapsed=time_in_seconds()-start;
		
		std::cout<<num_blocks<<" blocks took "<<elapsed*1.0e9<<" ns: "<<elapsed*1.0e9/(threads*num_blocks)<<" ns/thread\n";
		cudaFree(b_array); // release before the next, larger allocation
	}
}

(Try this in NetRun now!)
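
The cudaMemcpy above copies back just one baseball; to get the whole array of results onto the CPU you'd allocate a matching host array and copy the full size. A minimal sketch, reusing b_array, threads, and num_blocks from the loop above:

	baseball *cpu_array=new baseball[threads*num_blocks];
	cudaMemcpy(cpu_array,b_array,threads*num_blocks*sizeof(baseball),
		cudaMemcpyDeviceToHost); // copy every ball back to the host
	std::cout<<"Baseball 17 ended at height "<<cpu_array[17].pz<<"\n";
	delete[] cpu_array;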

Performance for a single baseball, or a handful of them, is way worse than the CPU, because the fixed cost of launching the kernel and copying data back dominates; performance for many thousands of baseballs is way better than the CPU.
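
To see that crossover yourself, one option is to time the same per-baseball work on the CPU with OpenMP and compare the ns/baseball numbers against what the CUDA program prints. A rough sketch, assuming the baseball class, the includes, and the time_in_seconds() helper from lib/inc.c used above:

void time_cpu(int nballs) {
	double sum=0.0;
	double start=time_in_seconds();
#pragma omp parallel for reduction(+:sum)
	for (int i=0;i<nballs;i++) {
		float angle=0.000125*i; // same angle spacing as the CUDA kernel
		baseball b;
		b.px=b.py=0.0; b.pz=2.0;
		b.vx=35.7; b.vy=0.2; b.vz=2.0+angle;
		for (int t=0;t<1000;t++) b.step(0.001);
		sum+=b.pz; // fold in each result so the work can't be optimized away
	}
	double elapsed=time_in_seconds()-start;
	std::cout<<nballs<<" baseballs: "<<elapsed*1.0e9/nballs<<" ns/baseball on the CPU (sum="<<sum<<")\n";
}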
