Course Review for Final Exam
CS 441 Lecture, Dr. Lawlor
Pre-midterm content on the final:
- Pipelining
- Superscalar execution and dependencies
- Register renaming
- Instruction reordering (out-of-order execution)
- Bits and representation of floating-point numbers
You should also understand the following performance pitfalls, and how to analyze an application for high performance:
- When is optimizing worthwhile?
  - If it would take you a month to write an amazing version that's twice as fast, it might actually get the answer sooner to just start the slow code running, and take a monthlong vacation as it runs.
  - If it would take you a week to get a 40% speedup, then depending on your hourly rate, it might be cheaper for the company to just buy you a faster computer.
- Load imbalance: not all units are busy, because work is unevenly assigned.
  - The only fix is to reassign work. Static work reassignment, such as round-robin task assignment, is usually pretty easy and effective. Dynamic reassignment is pretty easy in OpenMP ("schedule(dynamic,1)"; see the first sketch after this list), and is at least theoretically possible in SSE, but much tougher on the GPU or in MPI.
- Parallelizing the wrong thing: adding cores won't help an application that is limited by the performance of RAM, the disk, or the network. To help these applications, you need to add RAM, add disks, or add new network links.
- Alpha (startup) cost. Negligible for SSE, but in the microseconds for OpenMP (thread startup: 5us/thread), MPI (message startup: 50us on gigE), or GPUs (kernel startup: 10us).
  - The general fix is to "use bigger stuff", typically thousands of ints or floats at once, to amortize out the overhead (see the MPI sketch after this list). This is a limiting factor for some applications.
  - Sometimes, alpha cost can be overlapped with some other useful task.
- Dependencies. Some applications have very little inherent parallelism--for example, sorting is tough to parallelize, while "find the maximum value" is pretty easy (see the reduction sketch after this list).
  - Sometimes a new algorithm is needed. Don't be afraid to replicate work!
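Here's a minimal OpenMP sketch of dynamic work reassignment (my own illustration, not from the lecture; the cost function is made up): per-row cost varies wildly, so schedule(dynamic,1) hands out one row at a time to whichever thread finishes first, instead of freezing the assignment up front.

#include <math.h>
#include <stdio.h>
/* Made-up cost function: more work for bigger x+y, like high-count
   Mandelbrot pixels, so a static row split would be badly imbalanced. */
unsigned char expensive_pixel(int x,int y) {
	float v=0.0f;
	for (int i=0;i<x+y+1;i++) v+=sinf(v)+1.0f;
	return (unsigned char)v;
}
int main(void) {
	enum {wid=512, ht=512};
	static unsigned char img[wid*ht];
	/* dynamic,1: each thread grabs the next unclaimed row as it finishes,
	   so no thread gets stuck holding all the expensive rows. */
	#pragma omp parallel for schedule(dynamic,1)
	for (int y=0;y<ht;y++)
		for (int x=0;x<wid;x++)
			img[y*wid+x]=expensive_pixel(x,y);
	printf("corner pixel: %d\n",img[0]);
	return 0;
}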
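And a hedged MPI sketch of amortizing alpha cost (again my own illustration): sending 10,000 floats one at a time pays the ~50us message startup 10,000 times; batching them into one send pays it once.

#include <mpi.h>
#include <stdio.h>
int main(int argc,char *argv[]) {
	MPI_Init(&argc,&argv);
	int rank; MPI_Comm_rank(MPI_COMM_WORLD,&rank);
	enum {n=10000};
	static float data[n];
	int tag=1234;
	MPI_Status sts;
	if (rank==0) { /* send everything to rank 1 */
		/* Slow: n messages, so n alpha costs (about 50us each on gigE). */
		for (int i=0;i<n;i++)
			MPI_Send(&data[i],1,MPI_FLOAT, 1,tag,MPI_COMM_WORLD);
		/* Fast: one message, one alpha cost, amortized over n floats. */
		MPI_Send(data,n,MPI_FLOAT, 1,tag,MPI_COMM_WORLD);
	}
	else if (rank==1) { /* receive both versions */
		for (int i=0;i<n;i++)
			MPI_Recv(&data[i],1,MPI_FLOAT, 0,tag,MPI_COMM_WORLD,&sts);
		MPI_Recv(data,n,MPI_FLOAT, 0,tag,MPI_COMM_WORLD,&sts);
	}
	MPI_Finalize();
	return 0;
}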
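Finally, "find the maximum value" parallelizes easily because max is associative: each thread reduces its own chunk, then the per-thread results combine in one cheap step. A minimal OpenMP sketch (my illustration):

#include <stdio.h>
int main(void) {
	enum {n=1000000};
	static float arr[n];
	for (int i=0;i<n;i++) arr[i]=(float)((i*7)%100003); /* arbitrary test data */
	float best=arr[0];
	#pragma omp parallel
	{
		float mine=arr[0]; /* each thread's private maximum */
		#pragma omp for
		for (int i=0;i<n;i++)
			if (arr[i]>mine) mine=arr[i];
		#pragma omp critical /* combine the per-thread maxima */
		if (mine>best) best=mine;
	}
	printf("max = %.0f\n",best);
	return 0;
}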
You should also understand these explicit parallel programming models:
- SIMD/vector instructions, including SSE
- Multithreaded shared memory programming, including pthreads and OpenMP
- Distributed-memory programming, including sockets and MPI
- GPU programming, including GLSL
            | OpenMP                      | MPI              | GLSL                                 | SSE
Uses...     | threads                     | processes        | shader cores                         | floats
Where?      | #pragma before a "for" loop | Throughout main  | func<<<blocks,threads>>> kernel call | _mm_add_ps macro
Pitfalls?   | global or shared variables  | blocking calls   | data transfer and startup cost       | branch decoherence
Scalability | Typically 2-8x maximum      | Nearly unlimited | Up to 1000x or so (relative to CPU)  | Up to 4x
Mandelbench
Here's part of the Mandelbrot set fractal, rendered at 512x512 in each of the parallel programming models listed above.
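All of these versions iterate the same complex map; as a reminder (my notation, with variable names matching the code):

    z_{n+1} = z_n^2 + c, \qquad (z_r + i z_i)^2 + (c_r + i c_i) = (z_r^2 - z_i^2 + c_r) + i\,(2 z_r z_i + c_i)

The nzr/nzi lines in each version are exactly this expansion, and the escape test (nzr*nzr+nzi*nzi)>4.0f checks |z|^2 > 2^2 without needing a square root.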
Sequential C++
#include <iostream>
#include <fstream>
#include "lib/inc.c" /* NetRun utility functions, like time_in_seconds */
const int wid=512, ht=512;
int render_mandelbrot(int x,int y)
{
float cr=0.2+x*0.2/wid, ci=0.5+y*0.2/ht;
float zr=cr, zi=ci;
int count;
const int max_count=256;
for (count=0;count<max_count;count++) {
// z= z*z+c
float nzr=zr*zr-zi*zi + cr;
float nzi=2.0f*zr*zi + ci;
if ((nzr*nzr+nzi*nzi)>4.0f) break;
zr=nzr; zi=nzi;
}
return count;
}
/* Run on CPU */
int main(int argc,char *argv[]) {
double start=time_in_seconds(), elapsed;
unsigned char *arr=new unsigned char[wid*ht];
#define timer (time_in_seconds()-start)*1.0e9/(wid*ht)<<"ns/pixel\n"
std::cout<<"Allocated array: "<<timer;
start=time_in_seconds(); // <- reset the timer!
for (int y=0;y<ht;y++)
for (int x=0;x<wid;x++)
arr[y*wid+x]=render_mandelbrot(x,y);
elapsed=time_in_seconds()-start;
std::cout<<"Rendered array: "<<timer;
std::ofstream of("out.ppm",std::ios_base::binary);
of<<"P5\n"; // greyscale, binary
of<<wid<<" "<<ht<<"\n"; // image size
of<<"255\n"; // byte image
of.write((char *)&arr[0],wid*ht);
elapsed=time_in_seconds()-start;
std::cout<<"Render + IO: "<<timer;
return 0;
}
(Try this in NetRun now!)
Allocated array: 0.255568ns/pixel
Rendered array: 491.817ns/pixel
Render + IO: 493.858ns/pixel
(This is one core of a Quad-core Intel Q6600)
OpenMP
...
#pragma omp parallel for schedule(dynamic,1)
for (int y=0;y<ht;y++)
for (int x=0;x<wid;x++)
arr[y*wid+x]=render_mandelbrot(x,y);
...
(Try this in NetRun now!)
Allocated array: 0.240107ns/pixel
Rendered array: 138.046ns/pixel
Render + IO: 140.446ns/pixel
OpenMP gives about a 3.6x speedup on this Quad-core Intel Q6600.
OpenMP + SSE
#include <xmmintrin.h> /* SSE __m128 header */
#include <iostream>
#include <fstream>
#include "lib/inc.c" /* NetRun utility functions, like time_in_seconds */
const int wid=512, ht=512;
/* Render 4 adjacent pixels of the mandelbrot set: x+0 through x+3 */
void render_mandelbrot(int x,int y,unsigned char *output)
{
/* See http://www.cs.uaf.edu/2009/fall/cs441/lecture/11_19_mandelbrot_compare.html */
float cr[4], ci[4];
for (int i=0;i<4;i++) {
cr[i]=0.2+(x+i)*(0.2/wid);
ci[i]=0.5+y*(0.2/ht);
}
float two[4]={2.0,2.0,2.0,2.0};
__m128 TWO=_mm_load_ps(two);
float four[4]={4.0,4.0,4.0,4.0};
__m128 FOUR=_mm_load_ps(four);
__m128 CR=_mm_load_ps(cr), CI=_mm_load_ps(ci);
__m128 ZR=CR, ZI=CI;
int count;
enum {max_count=256};
for (count=0;count<max_count;count++) {
__m128 tZR=ZR*ZR-ZI*ZI+CR; /* gcc vector extension: +,-,* work on __m128; subtle: don't overwrite ZR yet! */
__m128 tZI=TWO*ZR*ZI+CI;
__m128 LEN=tZR*tZR+tZI*tZI;
__m128 MASK=_mm_cmplt_ps(LEN,FOUR); /* set if len<4.0 */
ZR=_mm_or_ps(_mm_and_ps(MASK,tZR),_mm_andnot_ps(MASK,ZR));
ZI=_mm_or_ps(_mm_and_ps(MASK,tZI),_mm_andnot_ps(MASK,ZI));
if (_mm_movemask_ps(LEN-FOUR)==0) break; /* everybody's > 4 */
}
_mm_store_ps(cr,ZR); _mm_store_ps(ci,ZI);
for (int i=0;i<4;i++) {
output[i]=(char)(cr[i]*(256/4.0));
}
}
/* Run on CPU */
int main(int argc,char *argv[]) {
double start=time_in_seconds(), elapsed;
unsigned char *arr=new unsigned char[wid*ht];
#define timer (time_in_seconds()-start)*1.0e9/(wid*ht)<<"ns/pixel\n"
std::cout<<"Allocated array: "<<timer;
start=time_in_seconds(); // <- reset the timer!
#pragma omp parallel for schedule(dynamic,1)
for (int y=0;y<ht;y++)
for (int x=0;x<wid;x+=4)
render_mandelbrot(x,y,&arr[y*wid+x]);
elapsed=time_in_seconds()-start;
std::cout<<"Rendered array: "<<timer;
std::ofstream of("out.ppm",std::ios_base::binary);
of<<"P5\n"; // greyscale, binary
of<<wid<<" "<<ht<<"\n"; // image size
of<<"255\n"; // byte image
of.write((char *)&arr[0],wid*ht);
elapsed=time_in_seconds()-start;
std::cout<<"Render + IO: "<<timer;
return 0;
}
(Try this in NetRun now!)
Allocated array: 0.241016ns/pixel
Rendered array: 85.1824ns/pixel
Render + IO: 87.4979ns/pixel
See my 2009 lecture for the development of the SSE version.
SSE gives us about a 1.7x speedup, and then OpenMP gives us about a 3.6x speedup, for a total of about 6x speedup over the sequential code. SSE is crucial to get the most performance from modern CPUs, but it is by far the ugliest code of the bunch! I need to rewrite this version using my "float4" class (a sketch is below); I'm also cheating a bit and returning zr instead of "count".
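For reference, here's a minimal sketch of what such a "float4" wrapper might look like (my guess at the design, not the actual class): operator overloading hides the _mm_* intrinsics, so the inner loop reads like the scalar version.

#include <xmmintrin.h> /* SSE intrinsics */
class float4 {
public:
	__m128 v;
	float4(__m128 m) :v(m) {}
	float4(float f) :v(_mm_set1_ps(f)) {} /* broadcast one float to all 4 lanes */
	friend float4 operator+(float4 a,float4 b) {return _mm_add_ps(a.v,b.v);}
	friend float4 operator-(float4 a,float4 b) {return _mm_sub_ps(a.v,b.v);}
	friend float4 operator*(float4 a,float4 b) {return _mm_mul_ps(a.v,b.v);}
};

With this, the update step could read "float4 tZR=ZR*ZR-ZI*ZI+CR;" with no visible intrinsics, and the compare-and-mask tricks could be wrapped the same way.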
fork()+sockets
// Socket-based multicore parallelism (for quad-core machine)
#include "osl/socket.h"
#include "osl/socket.cpp"
#include <sys/wait.h> /* for wait() */
#include <unistd.h> /* for fork() */
#include "lib/inc.c" /* NetRun, for time_in_seconds */
#include <iostream>
#include <fstream>
#include <math.h>
#include <string.h> /* for memcpy */
const int wid=512, ht=512;
int render_mandelbrot(int x,int y)
{
float cr=0.2+x*0.2/wid, ci=0.5+y*0.2/ht;
float zr=cr, zi=ci;
int count;
const int max_count=256;
for (count=0;count<max_count;count++) {
// z= z*z+c
float nzr=zr*zr-zi*zi + cr;
float nzi=2.0f*zr*zi + ci;
if ((nzr*nzr+nzi*nzi)>4.0f) break;
zr=nzr; zi=nzi;
}
return count;
}
/* Run as process "rank", one process among "size" others.
Each socket connects you with another rank: s[0] connects to rank 0.
*/
void run(int rank,int size,SOCKET *s)
{
double start=time_in_seconds();
float ns=1.0e9/(wid*ht); /* convert seconds to ns/pixel */
enum {n=512}; /* total size of the problem */
int lht=(ht+size-1)/size; /* local height = ht/size, rounded up */
char limg[wid*lht];
/* Render interleaved rows; each processor taking one and skipping size-1 */
for (int ly=0, gy=rank;gy<ht;ly++, gy+=size)
for (int x=0;x<wid;x++)
limg[ly*wid+x]=render_mandelbrot(x,gy);
if (rank!=0) /* send local data to 0 */
{
skt_sendN(s[0],limg,wid*lht);
} else /* I'm rank 0: receive and unpack rows */
{
printf("render: %.3f ns/pixel\n",(time_in_seconds()-start)*ns);
char img[wid*ht]; /* global image */
for (int srcrank=0;srcrank<size;srcrank++) {
if (srcrank>0) { /* receive somebody else's rows */
skt_recvN(s[srcrank],limg,wid*lht);
}
/* Copy out local rows into global image */
for (int ly=0, gy=srcrank;gy<ht;ly++, gy+=size)
memcpy(&img[wid*gy],&limg[wid*ly],wid);
}
printf("render+net: %.3f ns/pixel\n",(time_in_seconds()-start)*ns);
/* write the image */
std::ofstream of("out.ppm",std::ios_base::binary);
of<<"P5\n"; // greyscale, binary
of<<wid<<" "<<ht<<"\n"; // image size
of<<"255\n"; // byte image
of.write((char *)&img[0],wid*ht);
printf("render+net+IO: %.3f ns/pixel\n",(time_in_seconds()-start)*ns);
}
}
int main(void) {
double start=time_in_seconds();
unsigned int port=0;
const int size=4; /* quad-core machine */
SOCKET s[size];
SERVER_SOCKET serv=skt_server(&port);
for (int child=1;child<size;child++) {
int newpid=fork();
if (newpid==0) { /* I'm the child */
s[0]=skt_connect(skt_lookup_ip("127.0.0.1"),port,2);
run(child,size,s);
skt_close(s[0]);
exit(0); /* close out child process when done */
}
/* else I'm the parent */
s[child]=skt_accept(serv,0,0);
}
/* Now that all children are created, run as parent */
run(0,size,s);
/* Once parent is done, collect all the children */
for (int child=1;child<size;child++) {
skt_close(s[child]);
int status=0;
wait(&status); /* wait for child to finish */
}
printf("total_run: %.3f ms\n",(time_in_seconds()-start)*1.0e3);
return 0;
}
(Try this in NetRun now!)
render: 132.263 ns/pixel
render+net: 136.078 ns/pixel
render+net+IO: 138.461 ns/pixel
Speedup with sockets is actually a tiny bit better than with OpenMP, but still about 3.6x on this Intel Q6600.
MPI
#include <iostream>
#include <fstream>
#include <mpi.h>
#include <math.h>
#include <string.h> /* for memcpy */
const int wid=512, ht=512;
int render_mandelbrot(int x,int y)
{
float cr=0.2+x*0.2/wid, ci=0.5+y*0.2/ht;
float zr=cr, zi=ci;
int count;
const int max_count=256;
for (count=0;count<max_count;count++) {
// z= z*z+c
float nzr=zr*zr-zi*zi + cr;
float nzi=2.0f*zr*zi + ci;
if ((nzr*nzr+nzi*nzi)>4.0f) break;
zr=nzr; zi=nzi;
}
return count;
}
int main(int argc,char *argv[])
{
MPI_Init(&argc,&argv);
int rank,size;
MPI_Comm_rank(MPI_COMM_WORLD,&rank);
MPI_Comm_size(MPI_COMM_WORLD,&size);
double start=MPI_Wtime();
float ns=1.0e9/(wid*ht); /* convert seconds to ns/pixel */
enum {n=512}; /* total size of the problem */
int lht=(ht+size-1)/size; /* local height = ht/size, rounded up */
char limg[wid*lht];
/* Render interleaved rows; each processor taking one and skipping size-1 */
for (int ly=0, gy=rank;gy<ht;ly++, gy+=size)
for (int x=0;x<wid;x++)
limg[ly*wid+x]=render_mandelbrot(x,gy);
int tag=37331; /* MPI tag (random value) */
if (rank!=0) /* send local data to 0 */
{
MPI_Send(limg,wid*lht,MPI_CHAR, 0,tag,MPI_COMM_WORLD);
} else /* I'm rank 0: receive and unpack rows */
{
printf("render: %.3f ns/pixel\n",(MPI_Wtime()-start)*ns);
char img[wid*ht]; /* global image */
for (int srcrank=0;srcrank<size;srcrank++) {
if (srcrank>0) { /* receive somebody else's rows */
MPI_Status sts;
MPI_Recv(limg,wid*lht,MPI_CHAR, srcrank,tag,MPI_COMM_WORLD,&sts);
}
/* Copy out local rows into global image */
for (int ly=0, gy=srcrank;gy<ht;ly++, gy+=size)
memcpy(&img[wid*gy],&limg[wid*ly],wid);
}
printf("render+net: %.3f ns/pixel\n",(MPI_Wtime()-start)*ns);
/* write the image */
std::ofstream of("out.ppm",std::ios_base::binary);
of<<"P5\n"; // greyscale, binary
of<<wid<<" "<<ht<<"\n"; // image size
of<<"255\n"; // byte image
of.write((char *)&img[0],wid*ht);
printf("render+net+IO: %.3f ns/pixel\n",(MPI_Wtime()-start)*ns);
}
MPI_Finalize();
return 0;
}
(Try this in NetRun now!)
1 process:
render: 695.171 ns/pixel
render+net: 696.270 ns/pixel
render+net+IO: 699.032 ns/pixel
2 processes:
render: 348.187 ns/pixel
render+net: 356.354 ns/pixel
render+net+IO: 359.066 ns/pixel
4 processes:
render: 172.669 ns/pixel
render+net: 186.508 ns/pixel
render+net+IO: 189.327 ns/pixel
10 processes:
render: 70.209 ns/pixel
render+net: 81.886 ns/pixel
render+net+IO: 84.621 ns/pixel
MPI does an excellent job of connecting these Intel Core2 Duos with gigabit Ethernet. The baseline is a little slower than the newer Q6600 CPU, but we can beat even the hideous OpenMP+SSE version by adding enough processors.
GLSL
/* GLSL fragment shader body; "texcoords" runs from 0 to 1 across the image */
float cr=0.2+0.2*texcoords.x, ci=0.5+0.2*(1.0-texcoords.y);
float zr=cr, zi=ci;
int count;
const int max_count=256;
for (count=0;count<max_count;count++) {
// z= z*z+c
float nzr=zr*zr-zi*zi + cr;
float nzi=2.0*zr*zi + ci;
if ((nzr*nzr+nzi*nzi)>4.0) break;
zr=nzr; zi=nzi;
}
gl_FragColor=vec4(fract(count/255.0));
(Try this in NetRun now!)
Rendering: 2.929 ns/pixel
The NVIDIA GeForce GTX 280 just demolishes all the CPUs here: 167x faster than the serial version, and still dozens of times faster than all ten CPUs with MPI. The code is actually still quite readable, although I'm leaving out the code and time to copy the data off the GPU (shown below).
ogl/gpgpu.h (Orion's OpenGL Libraries)
#include <iostream>
#include <fstream> /* for std::ofstream */
#include "ogl/gpgpu.h" /* Dr. Lawlor's OpenGL wrappers for general-purpose GPU computing */
#include "ogl/glsl.cpp"
#include <GL/glew.c> /* include entire body here, for easy linking */
#include "lib/inc.c" /* NetRun file, for time_in_seconds */
int main() {
gpu_env e; /* make the environment (OpenGL window &c) */
const int wid=512, ht=512;
double start=time_in_seconds();
gpu_array img(e,"img",wid,ht,0,GL_LUMINANCE32F_ARB);
std::cout<<"Allocate: "<<(time_in_seconds()-start)*1.0e9/(wid*ht)<<"ns/pixel\n";
for (int run=0;run<2;run++) { //<- the first run is 30ms slower (compile time)
start=time_in_seconds(); // <- reset the timer for each run
GPU_RUN(img,
float cr=0.2+0.2*location.x; float ci=0.5+0.2*location.y;
float zr=cr; float zi=ci;
int count;
const int max_count=256;
for (count=0;count<max_count;count++) {
/* z= z*z+c */
float nzr=zr*zr-zi*zi + cr;
float nzi=2.0*zr*zi + ci;
if ((nzr*nzr+nzi*nzi)>4.0) break;
zr=nzr; zi=nzi;
}
gl_FragColor=vec4(fract(count/255.0));
)
glFinish();// <- needed for accurate timing only
std::cout<<"Render: "<<(time_in_seconds()-start)*1.0e9/(wid*ht)<<"ns/pixel\n";
/* Copy data off to CPU */
char out[wid*ht];
img.read((float *)out,wid,ht,0,0,GL_UNSIGNED_BYTE);
std::cout<<"Render + Mem: "<<(time_in_seconds()-start)*1.0e9/(wid*ht)<<"ns/pixel\n";
/* Write image file */
std::ofstream of("out.ppm",std::ios_base::binary);
of<<"P5\n"; // greyscale, binary
of<<wid<<" "<<ht<<"\n"; // image size
of<<"255\n"; // byte image
of.write(out,wid*ht);
std::cout<<"Render + Mem + IO: "<<(time_in_seconds()-start)*1.0e9/(wid*ht)<<"ns/pixel\n";
}
return 0;
}
(Try this in NetRun now!)
Allocate: 37.7193ns/pixel
Render: 3.21184ns/pixel
Render + Mem: 12.081ns/pixel
Render + Mem + IO: 15.0641ns/pixel
At the moment, the slowest thing you can do on a GPU is talk to a CPU. Copying the data off takes 4x longer than calculating it. The first call to GPU_RUN has to compile the GLSL code, so I call it twice in a loop here. I think the code is pretty readable, but I wrote the gpu_ library, so I would think that! A more standard interface, CUDA, is shown below.
CUDA
#include <cuda.h>
#include <iostream>
#include <fstream>
#include "lib/inc.c" /* NetRun utility functions, like time_in_seconds */
#define check(cudacall) { int err=cudacall; if (err!=cudaSuccess) std::cout<<"CUDA ERROR "<<err<<" at line "<<__LINE__<<"'s "<<#cudacall<<"\n";}
const int wid=512, ht=512;
/* GPU Code! */
__global__ void fill_in_array(float *arr) {
int i=threadIdx.x+blockDim.x*blockIdx.x; // my thread's global number
int x=i%wid, y=i/wid; // my thread's pixel to work on
float cr=0.2+x*0.2/wid, ci=0.5+y*0.2/ht;
float zr=cr, zi=ci;
int count;
const int max_count=256;
for (count=0;count<max_count;count++) {
// z= z*z+c
float nzr=zr*zr-zi*zi + cr;
float nzi=2.0*zr*zi + ci;
if ((nzr*nzr+nzi*nzi)>4.0) break;
zr=nzr; zi=nzi;
}
arr[y*wid+x]=count;
}
/* Run on CPU */
int main(int argc,char *argv[]) {
float *arr=0; /* DATA LIVES ON THE GPU!!!! */
cudaMalloc((void **)&arr,sizeof(float)); /* tiny throwaway allocation: pays the CUDA startup time up front (overwritten by the real allocation below) */
/* Allocate memory */
double start=time_in_seconds(), t;
check(cudaMalloc((void **)&arr, wid*ht*sizeof(float)));
t=time_in_seconds()-start;
std::cout<<"Allocated array: "<<t*1.0e9/(wid*ht)<<"ns/pixel\n";
start=time_in_seconds(); // <- reset the timer!
int threadsPerBlock=512;
int nBlocks=wid*ht/threadsPerBlock;
/* Call CUDA kernel */
fill_in_array<<<nBlocks,threadsPerBlock>>>(arr);
check(cudaThreadSynchronize()); // look for errors in kernel
t=time_in_seconds()-start;
std::cout<<"Rendered array: "<<t*1.0e9/(wid*ht)<<"ns/pixel\n";
/* Copy data back to CPU */
float harr[wid*ht];
check(cudaMemcpy(harr,arr,wid*ht*sizeof(float),cudaMemcpyDeviceToHost));
t=time_in_seconds()-start;
std::cout<<"Render + Mem: "<<t*1.0e9/(wid*ht)<<"ns/pixel\n";
/* Convert from float to char */
char carr[wid*ht];
for (int i=0;i<wid*ht;i++) carr[i]=(char)harr[i];
/* Write data to file */
std::ofstream of("out.ppm",std::ios_base::binary);
of<<"P5\n"; // greyscale, binary
of<<wid<<" "<<ht<<"\n"; // image size
of<<"255\n"; // byte image
of.write(carr,wid*ht);
std::cout<<"Render + Mem + IO: "<<(time_in_seconds()-start)*1.0e9/(wid*ht)<<"ns/pixel\n";
return 0;
}
(Try this in NetRun now!)
Allocate: 0.328004ns/pixel
Render: 4.31813ns/pixel
Render + Mem: 11.5661ns/pixel
Render + Mem + IO: 32.9589ns/pixel
CUDA actually seems to run the calculation portion more slowly than GLSL for this problem, but pulls slightly ahead when including the time to copy the answer off the card, and falls way behind by the time you finish the I/O. I'm hitting a weird CUDA 2.2 driver bug if I try to have the kernel write raw char data, so the I/O time is much larger because I have to do float-to-char conversion on the CPU. There's also a startup effect in CUDA: the first call takes about 40ms to find and initialize the card.