Mandelbrot: in MPI and EPGPU
CS 641 Lecture, Dr. Lawlor
The simplest way to divide up a Mandelbrot image rendering problem is
block by block: each process renders one contiguous band of rows. Here's a
simple block-send version. Speedup isn't very good, because the load balance
is poor: the slow pixels (the ones that reach max_count) are concentrated in
the low rows, so most of the work is done by processor 0.
#include <iostream>
#include <fstream>
#include <cstdio>  /* printf */
#include <cstdlib> /* atexit, exit */
#include <mpi.h>
void call_finalize(void) {
for (int leash=0;leash<10;leash++) {
int err=MPI_Finalize();
if (err==0) break;
printf("woa... dunno, man (%d)\n",err);
}
}
/**
MPI error checking macro. Use this like:
checkError(MPI_Something_something(stuff,otherstuff));
*/
#define checkError(code) do { \
int err=code; \
if (err!=0) { std::cerr<<"MPI error "<<err<<" at "<<__FILE__<<":"<<__LINE__<<" from "<<#code<<"\n"; \
exit(1); } \
} while(false)
/* Return the Mandelbrot escape count (0..255) for pixel (x,y). */
char mandelbrot(int x,int y) {
float scale=2.0/1000;
float cr=x*scale-1.0, ci=y*scale;
float zr=cr, zi=ci;
int count;
enum {max_count=255};
for (count=0;count<max_count;count++) {
if ((zr*zr+zi*zi)>4.0) break;
float tzr=zr*zr-zi*zi+cr;
zi=2.0f*zr*zi+ci;
zr=tzr;
}
return count;
}
int main(int argc,char *argv[]) {
MPI_Init(&argc,&argv);
atexit(call_finalize);
int rank=0; checkError(MPI_Comm_rank(MPI_COMM_WORLD,&rank));
int size=1; checkError(MPI_Comm_size(MPI_COMM_WORLD,&size));
int rows=1000; /* size of the problem: number of rows to render */
int per_rows=rows/size; /* rows per process (assumes size divides rows evenly) */
int first_row=per_rows*rank; int last_row=first_row+per_rows;
enum {cols=1000};
char storage[rows*cols]; /* 1D array storing 2D data: x + y*cols */
double start=MPI_Wtime();
for (int y=first_row;y<last_row;y++) {
for (int x=0;x<cols;x++) {
storage[x+y*cols]=mandelbrot(x,y);
}
}
/* Collect the whole image to rank 0 */
int tag=43219;
if (rank==0) { /* recv from each other process */
for (int his_rank=1;his_rank<size;his_rank++) {
int his_first_row=per_rows*his_rank;
MPI_Status sts;
checkError(MPI_Recv(&storage[his_first_row*cols], // buffer
per_rows*cols, // count
MPI_BYTE, // datatype
his_rank, // source
tag,
MPI_COMM_WORLD,
&sts));
}
} else { /* send my stuff to rank 0 */
checkError(MPI_Send(&storage[first_row*cols],per_rows*cols,MPI_BYTE,
0,tag,MPI_COMM_WORLD));
}
if (rank==0) { /* write the image as a binary grayscale PGM ("P5") */
std::ofstream of("out.ppm");
of<<"P5\n"<<cols<<" "<<rows<<" 255\n"; /* width height maxval */ of.write(storage,rows*cols);
}
double elapsed=MPI_Wtime()-start;
std::cout<<"Rank "<<rank<<" took "<<elapsed<<" seconds\n";
return 0;
}
(Try this in NetRun now!)
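Since every rank's block is the same size and the blocks land in rank order, the hand-rolled receive loop above could be replaced by one collective call. Here's a minimal sketch (not part of the NetRun example) that assumes each rank renders into a small local buffer and lets MPI_Gather reassemble the image on rank 0; it reuses rank, size, per_rows, first_row, last_row, cols, mandelbrot(), and checkError() exactly as defined above.
/* Sketch: gather equal-sized contiguous blocks in one call. */
char local[per_rows*cols];                  /* just this rank's band of rows */
for (int y=first_row;y<last_row;y++)
  for (int x=0;x<cols;x++)
    local[x+(y-first_row)*cols]=mandelbrot(x,y);

char image[rows*cols];                      /* reassembled image (only rank 0 looks at it) */
checkError(MPI_Gather(local,per_rows*cols,MPI_BYTE,   /* every rank contributes its band */
                      image,per_rows*cols,MPI_BYTE,   /* bands arrive in rank order on the root */
                      0,MPI_COMM_WORLD));
/* rank 0 can now write image[] as a PGM exactly as before */
With MPI_IN_PLACE as rank 0's send buffer you could even keep the original full-size storage[] array and gather the workers' bands straight into it.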
Here's a version where we interleave rows across processes, like an OpenMP
schedule(static,1) decomposition. Juggling the locally packed coordinate
system against the interleaved global rows makes unpacking the received data
painful (a derived-datatype alternative is sketched after the code), but
load balance, and hence performance, is much better.
#include <iostream>
#include <fstream>
#include <cstdio>  /* printf */
#include <cstdlib> /* atexit, exit */
#include <cstring> /* memcpy */
#include <mpi.h>
void call_finalize(void) {
for (int leash=0;leash<10;leash++) {
int err=MPI_Finalize();
if (err==0) break;
printf("woa... dunno, man (%d)\n",err);
}
}
/**
MPI error checking macro. Use this like:
checkError(MPI_Something_something(stuff,otherstuff));
*/
#define checkError(code) do { \
int err=code; \
if (err!=0) { std::cerr<<"MPI error "<<err<<" at "<<__FILE__<<":"<<__LINE__<<" from "<<#code<<"\n"; \
exit(1); } \
} while(false)
/* Return the Mandelbrot escape count (0..255) for pixel (x,y). */
char mandelbrot(int x,int y) {
float scale=2.0/1000;
float cr=x*scale-1.0, ci=y*scale;
float zr=cr, zi=ci;
int count;
enum {max_count=255};
for (count=0;count<max_count;count++) {
if ((zr*zr+zi*zi)>4.0) break;
float tzr=zr*zr-zi*zi+cr;
zi=2.0f*zr*zi+ci;
zr=tzr;
}
return count;
}
int main(int argc,char *argv[]) {
MPI_Init(&argc,&argv);
atexit(call_finalize);
int rank=0; checkError(MPI_Comm_rank(MPI_COMM_WORLD,&rank));
int size=1; checkError(MPI_Comm_size(MPI_COMM_WORLD,&size));
int rows=1000; /* size of the problem: number of rows to render */
int per_rows=rows/size; /* rows per process (assumes size divides rows evenly) */
int first_row=rank; /* interleaved: this rank renders rows rank, rank+size, rank+2*size, ... */
enum {cols=1000};
char storage[per_rows*cols]; /* 1D array storing 2D data: x + y*cols */
double start=MPI_Wtime();
for (int ly=0;ly<per_rows;ly++) { // local coordinates
int gy=first_row+size*ly; // global coordinates
for (int x=0;x<cols;x++) {
storage[x+ly*cols]=mandelbrot(x,gy);
}
}
/* Collect the whole image to rank 0 */
int tag=43219;
if (rank==0) { /* recv from each other process */
char big[rows*cols]; /* global image */
for (int his_rank=0;his_rank<size;his_rank++) {
if (his_rank!=0) { /* rank 0's own rows are already in storage */
MPI_Status sts;
checkError(MPI_Recv(&storage[0], // buffer (reused as scratch for his packed rows)
per_rows*cols, // count
MPI_BYTE, // datatype
his_rank, // source
tag,
MPI_COMM_WORLD,
&sts));
}
for (int ly=0;ly<per_rows;ly++) { /* copy packed rows to their interleaved spots in big */
int gy=his_rank+size*ly; // global coordinates
memcpy(&big[gy*cols],&storage[ly*cols],cols);
}
}
std::ofstream of("out.ppm"); /* binary grayscale PGM ("P5") */
of<<"P5\n"<<cols<<" "<<rows<<" 255\n"; /* width height maxval */ of.write(big,rows*cols);
} else { /* send my stuff to rank 0 */
checkError(MPI_Send(&storage[0],per_rows*cols,MPI_BYTE,
0,tag,MPI_COMM_WORLD));
}
double elapsed=MPI_Wtime()-start;
std::cout<<"Rank "<<rank<<" took "<<elapsed<<" seconds\n";
return 0;
}
(Try this in NetRun now!)
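That unpacking memcpy loop is exactly the sort of strided copy MPI derived datatypes are meant to hide. As a sketch (again, not part of the NetRun example), rank 0 could describe "per_rows blocks of cols bytes, strided size rows apart" once, then receive each worker's packed rows directly into their interleaved destinations in the big image; everything else (rank, size, per_rows, cols, rows, tag, storage[], checkError()) is as defined above.
/* Sketch: receive interleaved rows in place using MPI_Type_vector.
   Senders are unchanged: they still send per_rows*cols contiguous MPI_BYTEs. */
MPI_Datatype strided; /* per_rows blocks of cols bytes, stride size*cols bytes */
checkError(MPI_Type_vector(per_rows,cols,size*cols,MPI_BYTE,&strided));
checkError(MPI_Type_commit(&strided));
if (rank==0) {
  char big[rows*cols];
  for (int ly=0;ly<per_rows;ly++) /* rank 0's own rows, copied locally */
    memcpy(&big[size*ly*cols],&storage[ly*cols],cols);
  for (int his_rank=1;his_rank<size;his_rank++) {
    MPI_Status sts; /* his rows land directly in place: no unpack loop needed */
    checkError(MPI_Recv(&big[his_rank*cols],1,strided,
                        his_rank,tag,MPI_COMM_WORLD,&sts));
  }
  /* write big[] as a PGM exactly as before */
} else {
  checkError(MPI_Send(&storage[0],per_rows*cols,MPI_BYTE,0,tag,MPI_COMM_WORLD));
}
checkError(MPI_Type_free(&strided));
Whether this beats the memcpy loop depends on the MPI library's datatype engine, but it does let the receive side mirror the interleaving math instead of re-deriving it.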
Finally, just for fun, here's the same computation in my brand-new
OpenCL wrapper library, EPGPU.
One graphics card can handily beat the entire powerwall's CPUs!
#include "epgpu.h"
#include "epgpu.cpp"
#include "lib/inc.c"
#include <fstream>
#include <iostream> /* std::cout */
/* Per-pixel GPU kernel: i and j are this pixel's coordinates; "result" is the value stored. */
GPU_FILLKERNEL_2D(char,
mandelbrot,(),
float scale=2.0/1000;
float cr=i*scale-1.0; float ci=j*scale;
float zr=cr; float zi=ci;
int count;
enum {max_count=255};
for (count=0;count<max_count;count++) {
if ((zr*zr+zi*zi)>4.0) break;
float tzr=zr*zr-zi*zi+cr;
zi=2.0f*zr*zi+ci;
zr=tzr;
}
result=count;
)
int main() {
int w=1000,h=1000;
gpu_array2d<char> arr(w,h);
arr=mandelbrot(); /* throwaway run, to get it compiled! */
double start=time_in_seconds();
arr=mandelbrot();
char arrCPU[w*h];
arr.read(arrCPU);
std::ofstream of("out.ppm");
of<<"P5\n"<<w<<" "<<h<<" 255\n"; of.write(arrCPU,w*h);
double elapsed=time_in_seconds()-start;
std::cout<<"Total render time="<<elapsed<<"\n";
return 0;
}
(Try this in NetRun now!)
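One natural next step is to zoom: turn the hard-coded scale and offset into kernel arguments. The sketch below assumes EPGPU forwards the kernel's argument list from the call site (the empty () slot above suggests it does, but check epgpu.h); the name mandelzoom and its parameters are mine, not from the lecture.
/* Hypothetical parameterized kernel: same math as above, but the view is a runtime argument. */
GPU_FILLKERNEL_2D(char,
 mandelzoom,(float scale,float offr,float offi),
 float cr=i*scale+offr; float ci=j*scale+offi;
 float zr=cr; float zi=ci;
 int count;
 enum {max_count=255};
 for (count=0;count<max_count;count++) {
 if ((zr*zr+zi*zi)>4.0f) break;
 float tzr=zr*zr-zi*zi+cr;
 zi=2.0f*zr*zi+ci;
 zr=tzr;
 }
 result=count;
)
Inside main, arr=mandelzoom(2.0f/1000,-1.0f,0.0f); would reproduce the view above; shrinking scale and adjusting the offsets zooms in.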