Mandelbrot: in MPI and EPGPU
CS 641 Lecture, Dr. Lawlor
The simplest way to divide up a Mandelbrot image rendering problem is
block by block: each process renders one contiguous band of rows. Here's a
simple block-send version. Speedup isn't very good, because the load balance
is poor: the slow pixels (the ones that reach max_count) are concentrated in
the low rows, so most of the work is done by processor 0.
#include <iostream>
#include <fstream>
#include <cstdio>  /* printf */
#include <cstdlib> /* atexit, exit */
#include <mpi.h>
void call_finalize(void) {
for (int leash=0;leash<10;leash++) {
int err=MPI_Finalize();
if (err==0) break;
printf("woa... dunno, man (%d)\n",err);
}
}
/**
MPI error checking macro. Use this like:
checkError(MPI_Something_something(stuff,otherstuff));
*/
#define checkError(code) do { \
int err=code; \
if (err!=0) { std::cerr<<"MPI error "<<err<<" at "<<__FILE__<<":"<<__LINE__<<" from "<<#code<<"\n"; \
exit(1); } \
} while(false)
/* Return the Mandelbrot escape count (0..255) for pixel (x,y). */
char mandelbrot(int x,int y) {
float scale=2.0/1000;
float cr=x*scale-1.0, ci=y*scale;
float zr=cr, zi=ci;
int count;
enum {max_count=255};
for (count=0;count<max_count;count++) {
if ((zr*zr+zi*zi)>4.0) break;
float tzr=zr*zr-zi*zi+cr;
zi=2.0f*zr*zi+ci;
zr=tzr;
}
return count;
}
int main(int argc,char *argv[]) {
MPI_Init(&argc,&argv);
atexit(call_finalize);
int rank=0; checkError(MPI_Comm_rank(MPI_COMM_WORLD,&rank));
int size=1; checkError(MPI_Comm_size(MPI_COMM_WORLD,&size));
int rows=1000; /* size of the problem: number of rows to render */
int per_rows=rows/size; /* rows per process (assumes size divides rows evenly) */
int first_row=per_rows*rank; int last_row=first_row+per_rows;
enum {cols=1000};
char storage[rows*cols]; /* 1D array storing 2D data: x + y*cols */
double start=MPI_Wtime();
for (int y=first_row;y<last_row;y++) {
for (int x=0;x<cols;x++) {
storage[x+y*cols]=mandelbrot(x,y);
}
}
/* Collect the whole image to rank 0 */
int tag=43219;
if (rank==0) { /* recv from each other process */
for (int his_rank=1;his_rank<size;his_rank++) {
int his_first_row=per_rows*his_rank;
MPI_Status sts;
checkError(MPI_Recv(&storage[his_first_row*cols], // buffer
per_rows*cols, // count
MPI_BYTE, // datatype
his_rank, // source
tag,
MPI_COMM_WORLD,
&sts));
}
} else { /* send my stuff to rank 0 */
checkError(MPI_Send(&storage[first_row*cols],per_rows*cols,MPI_BYTE,
0,tag,MPI_COMM_WORLD));
}
if (rank==0) { /* write the image as a binary grayscale PGM ("P5") */
std::ofstream of("out.ppm");
of<<"P5\n"<<cols<<" "<<rows<<" 255\n"; /* width height maxval */ of.write(storage,rows*cols);
}
double elapsed=MPI_Wtime()-start;
std::cout<<"Rank "<<rank<<" took "<<elapsed<<" seconds\n";
return 0;
}
(Try this in NetRun now!)
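Since every rank's block is the same size and the blocks land in rank order, the hand-rolled receive loop above could be replaced by one collective call. Here's a minimal sketch (not part of the NetRun example) that assumes each rank renders into a small local buffer and lets MPI_Gather reassemble the image on rank 0; it reuses rank, size, per_rows, first_row, last_row, cols, mandelbrot(), and checkError() exactly as defined above.
/* Sketch: gather equal-sized contiguous blocks in one call. */
char local[per_rows*cols];                  /* just this rank's band of rows */
for (int y=first_row;y<last_row;y++)
  for (int x=0;x<cols;x++)
    local[x+(y-first_row)*cols]=mandelbrot(x,y);

char image[rows*cols];                      /* reassembled image (only rank 0 looks at it) */
checkError(MPI_Gather(local,per_rows*cols,MPI_BYTE,   /* every rank contributes its band */
                      image,per_rows*cols,MPI_BYTE,   /* bands arrive in rank order on the root */
                      0,MPI_COMM_WORLD));
/* rank 0 can now write image[] as a PGM exactly as before */
With MPI_IN_PLACE as rank 0's send buffer you could even keep the original full-size storage[] array and gather the workers' bands straight into it.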
Here's a version where we interleave rows across processes, like an OpenMP
schedule(static,1) decomposition. Juggling the locally packed coordinate
system against the interleaved global rows makes unpacking the received data
painful (a derived-datatype alternative is sketched after the code), but
load balance, and hence performance, is much better.
#include <iostream>
#include <fstream>
#include <cstdio>  /* printf */
#include <cstdlib> /* atexit, exit */
#include <cstring> /* memcpy */
#include <mpi.h>
void call_finalize(void) {
for (int leash=0;leash<10;leash++) {
int err=MPI_Finalize();
if (err==0) break;
printf("woa... dunno, man (%d)\n",err);
}
}
/**
MPI error checking macro. Use this like:
checkError(MPI_Something_something(stuff,otherstuff));
*/
#define checkError(code) do { \
int err=code; \
if (err!=0) { std::cerr<<"MPI error "<<err<<" at "<<__FILE__<<":"<<__LINE__<<" from "<<#code<<"\n"; \
exit(1); } \
} while(false)
/* Return the Mandelbrot escape count (0..255) for pixel (x,y). */
char mandelbrot(int x,int y) {
float scale=2.0/1000;
float cr=x*scale-1.0, ci=y*scale;
float zr=cr, zi=ci;
int count;
enum {max_count=255};
for (count=0;count<max_count;count++) {
if ((zr*zr+zi*zi)>4.0) break;
float tzr=zr*zr-zi*zi+cr;
zi=2.0f*zr*zi+ci;
zr=tzr;
}
return count;
}
int main(int argc,char *argv[]) {
MPI_Init(&argc,&argv);
atexit(call_finalize);
int rank=0; checkError(MPI_Comm_rank(MPI_COMM_WORLD,&rank));
int size=1; checkError(MPI_Comm_size(MPI_COMM_WORLD,&size));
int rows=1000; /* size of the problem: number of rows to render */
int per_rows=rows/size; /* rows per process (assumes size divides rows evenly) */
int first_row=rank; /* interleaved: this rank renders rows rank, rank+size, rank+2*size, ... */
enum {cols=1000};
char storage[per_rows*cols]; /* 1D array storing 2D data: x + y*cols */
double start=MPI_Wtime();
for (int ly=0;ly<per_rows;ly++) { // local coordinates
int gy=first_row+size*ly; // global coordinates
for (int x=0;x<cols;x++) {
storage[x+ly*cols]=mandelbrot(x,gy);
}
}
/* Collect the whole image to rank 0 */
int tag=43219;
if (rank==0) { /* recv from each other process */
char big[rows*cols]; /* global image */
for (int his_rank=0;his_rank<size;his_rank++) {
if (his_rank!=0) { /* rank 0's own rows are already in storage */
MPI_Status sts;
checkError(MPI_Recv(&storage[0], // buffer (reused as scratch for his packed rows)
per_rows*cols, // count
MPI_BYTE, // datatype
his_rank, // source
tag,
MPI_COMM_WORLD,
&sts));
}
for (int ly=0;ly<per_rows;ly++) { /* copy packed rows to their interleaved spots in big */
int gy=his_rank+size*ly; // global coordinates
memcpy(&big[gy*cols],&storage[ly*cols],cols);
}
}
std::ofstream of("out.ppm"); /* binary grayscale PGM ("P5") */
of<<"P5\n"<<cols<<" "<<rows<<" 255\n"; /* width height maxval */ of.write(big,rows*cols);
} else { /* send my stuff to rank 0 */
checkError(MPI_Send(&storage[0],per_rows*cols,MPI_BYTE,
0,tag,MPI_COMM_WORLD));
}
double elapsed=MPI_Wtime()-start;
std::cout<<"Rank "<<rank<<" took "<<elapsed<<" seconds\n";
return 0;
}
(Try this in NetRun now!)
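That unpacking memcpy loop is exactly the sort of strided copy MPI derived datatypes are meant to hide. As a sketch (again, not part of the NetRun example), rank 0 could describe "per_rows blocks of cols bytes, strided size rows apart" once, then receive each worker's packed rows directly into their interleaved destinations in the big image; everything else (rank, size, per_rows, cols, rows, tag, storage[], checkError()) is as defined above.
/* Sketch: receive interleaved rows in place using MPI_Type_vector.
   Senders are unchanged: they still send per_rows*cols contiguous MPI_BYTEs. */
MPI_Datatype strided; /* per_rows blocks of cols bytes, stride size*cols bytes */
checkError(MPI_Type_vector(per_rows,cols,size*cols,MPI_BYTE,&strided));
checkError(MPI_Type_commit(&strided));
if (rank==0) {
  char big[rows*cols];
  for (int ly=0;ly<per_rows;ly++) /* rank 0's own rows, copied locally */
    memcpy(&big[size*ly*cols],&storage[ly*cols],cols);
  for (int his_rank=1;his_rank<size;his_rank++) {
    MPI_Status sts; /* his rows land directly in place: no unpack loop needed */
    checkError(MPI_Recv(&big[his_rank*cols],1,strided,
                        his_rank,tag,MPI_COMM_WORLD,&sts));
  }
  /* write big[] as a PGM exactly as before */
} else {
  checkError(MPI_Send(&storage[0],per_rows*cols,MPI_BYTE,0,tag,MPI_COMM_WORLD));
}
checkError(MPI_Type_free(&strided));
Whether this beats the memcpy loop depends on the MPI library's datatype engine, but it does let the receive side mirror the interleaving math instead of re-deriving it.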
Finally, just for fun, here's the same computation in my brand-new
OpenCL wrapper library, EPGPU.
One graphics card can handily beat the entire powerwall's CPUs!
#include "epgpu.h"
#include "epgpu.cpp"
#include "lib/inc.c"
#include <fstream>
#include <iostream> /* std::cout */
/* Per-pixel GPU kernel: i and j are this pixel's coordinates; "result" is the value stored. */
GPU_FILLKERNEL_2D(char,
mandelbrot,(),
float scale=2.0/1000;
float cr=i*scale-1.0; float ci=j*scale;
float zr=cr; float zi=ci;
int count;
enum {max_count=255};
for (count=0;count<max_count;count++) {
if ((zr*zr+zi*zi)>4.0) break;
float tzr=zr*zr-zi*zi+cr;
zi=2.0f*zr*zi+ci;
zr=tzr;
}
result=count;
)
int main() {
int w=1000,h=1000;
gpu_array2d<char> arr(w,h);
arr=mandelbrot(); /* throwaway run, to get it compiled! */
double start=time_in_seconds();
arr=mandelbrot();
char arrCPU[w*h];
arr.read(arrCPU);
std::ofstream of("out.ppm");
of<<"P5\n"<<w<<" "<<h<<" 255\n"; of.write(arrCPU,w*h);
double elapsed=time_in_seconds()-start;
std::cout<<"Total render time="<<elapsed<<"\n";
return 0;
}
(Try this in NetRun now!)
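One natural next step is to zoom: turn the hard-coded scale and offset into kernel arguments. The sketch below assumes EPGPU forwards the kernel's argument list from the call site (the empty () slot above suggests it does, but check epgpu.h); the name mandelzoom and its parameters are mine, not from the lecture.
/* Hypothetical parameterized kernel: same math as above, but the view is a runtime argument. */
GPU_FILLKERNEL_2D(char,
 mandelzoom,(float scale,float offr,float offi),
 float cr=i*scale+offr; float ci=j*scale+offi;
 float zr=cr; float zi=ci;
 int count;
 enum {max_count=255};
 for (count=0;count<max_count;count++) {
 if ((zr*zr+zi*zi)>4.0f) break;
 float tzr=zr*zr-zi*zi+cr;
 zi=2.0f*zr*zi+ci;
 zr=tzr;
 }
 result=count;
)
Inside main, arr=mandelzoom(2.0f/1000,-1.0f,0.0f); would reproduce the view above; shrinking scale and adjusting the offsets zooms in.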