Multicore with OpenMP, and Streaming SIMD Extensions (SSE)

CS 463 Lecture, Dr. Lawlor

Here's a simple SHA-1 brute force hash search algorithm. It reads the target hash from the command line, and tries a bunch of source values to see if it can find one with that hash.

/* Secure Hash Algorithm-1 (SHA-1) implementation.
   Implemented and placed in the public domain by Steve Reid
Collected by Wei Dai (http://www.eskimo.com/~weidai/cryptlib.html)
Adapted by Orion Sky Lawlor, 7/20/2001
Re-adapted for single-file operation 2012-11.
*/
typedef unsigned int SHA1_word32;
enum {SHA1_nstate=5};
enum {SHA1_ndata=16};

/// Initialize SHA1_hash_words of state.
void SHA1_init(SHA1_word32 *state)
{
        state[0] = 0x67452301u;
        state[1] = 0xEFCDAB89u;
        state[2] = 0x98BADCFEu;
        state[3] = 0x10325476u;
        state[4] = 0xC3D2E1F0u;
}

/// Circular left shift in 32 bits
inline SHA1_word32 rotlFixed(SHA1_word32 x, SHA1_word32 y)
{
#if defined(_MSC_VER) || defined(__BCPLUSPLUS__)
	return y ? _lrotl(x, y) : x;
#elif defined(__MWERKS__) && TARGET_CPU_PPC
	return y ? __rlwinm(x,y,0,31) : x;
#else /*Default C version*/
/* works for int, don't and for SIMD
	return ((0xFFffFFffu)&(x<<y)) | (((0xFFffFFffu)&x)>>(32-y));
*/
	return ((x<<y)) | ((x)>>(32-y));
#endif
}

#define blk0(i) (W[i] = data[i])
#define blk1(i) (W[i&15] = rotlFixed(W[(i+13)&15]^W[(i+8)&15]^W[(i+2)&15]^W[i&15],1))

#define f1(x,y,z) (z^(x&(y^z)))
#define f2(x,y,z) (x^y^z)
#define f3(x,y,z) ((x&y)|(z&(x|y)))
#define f4(x,y,z) (x^y^z)

/* (R0+R1), R2, R3, R4 are the different operations used in SHA1 */
#define R0(v,w,x,y,z,i) z+=f1(w,x,y)+blk0(i)+0x5A827999u+rotlFixed(v,5);w=rotlFixed(w,30);
#define R1(v,w,x,y,z,i) z+=f1(w,x,y)+blk1(i)+0x5A827999u+rotlFixed(v,5);w=rotlFixed(w,30);
#define R2(v,w,x,y,z,i) z+=f2(w,x,y)+blk1(i)+0x6ED9EBA1u+rotlFixed(v,5);w=rotlFixed(w,30);
#define R3(v,w,x,y,z,i) z+=f3(w,x,y)+blk1(i)+0x8F1BBCDCu+rotlFixed(v,5);w=rotlFixed(w,30);
#define R4(v,w,x,y,z,i) z+=f4(w,x,y)+blk1(i)+0xCA62C1D6u+rotlFixed(v,5);w=rotlFixed(w,30);

void SHA1_transform(SHA1_word32 *state, const SHA1_word32 *data)
{
	SHA1_word32 W[16];
	/* Copy state to working vars */
	SHA1_word32 a = state[0];
	SHA1_word32 b = state[1];
	SHA1_word32 c = state[2];
	SHA1_word32 d = state[3];
	SHA1_word32 e = state[4];
	/* 4 rounds of 20 operations each. Loop unrolled. */
	R0(a,b,c,d,e, 0); R0(e,a,b,c,d, 1); R0(d,e,a,b,c, 2); R0(c,d,e,a,b, 3);
	R0(b,c,d,e,a, 4); R0(a,b,c,d,e, 5); R0(e,a,b,c,d, 6); R0(d,e,a,b,c, 7);
	R0(c,d,e,a,b, 8); R0(b,c,d,e,a, 9); R0(a,b,c,d,e,10); R0(e,a,b,c,d,11);
	R0(d,e,a,b,c,12); R0(c,d,e,a,b,13); R0(b,c,d,e,a,14); R0(a,b,c,d,e,15);
	R1(e,a,b,c,d,16); R1(d,e,a,b,c,17); R1(c,d,e,a,b,18); R1(b,c,d,e,a,19);
	R2(a,b,c,d,e,20); R2(e,a,b,c,d,21); R2(d,e,a,b,c,22); R2(c,d,e,a,b,23);
	R2(b,c,d,e,a,24); R2(a,b,c,d,e,25); R2(e,a,b,c,d,26); R2(d,e,a,b,c,27);
	R2(c,d,e,a,b,28); R2(b,c,d,e,a,29); R2(a,b,c,d,e,30); R2(e,a,b,c,d,31);
	R2(d,e,a,b,c,32); R2(c,d,e,a,b,33); R2(b,c,d,e,a,34); R2(a,b,c,d,e,35);
	R2(e,a,b,c,d,36); R2(d,e,a,b,c,37); R2(c,d,e,a,b,38); R2(b,c,d,e,a,39);
	R3(a,b,c,d,e,40); R3(e,a,b,c,d,41); R3(d,e,a,b,c,42); R3(c,d,e,a,b,43);
	R3(b,c,d,e,a,44); R3(a,b,c,d,e,45); R3(e,a,b,c,d,46); R3(d,e,a,b,c,47);
	R3(c,d,e,a,b,48); R3(b,c,d,e,a,49); R3(a,b,c,d,e,50); R3(e,a,b,c,d,51);
	R3(d,e,a,b,c,52); R3(c,d,e,a,b,53); R3(b,c,d,e,a,54); R3(a,b,c,d,e,55);
	R3(e,a,b,c,d,56); R3(d,e,a,b,c,57); R3(c,d,e,a,b,58); R3(b,c,d,e,a,59);
	R4(a,b,c,d,e,60); R4(e,a,b,c,d,61); R4(d,e,a,b,c,62); R4(c,d,e,a,b,63);
	R4(b,c,d,e,a,64); R4(a,b,c,d,e,65); R4(e,a,b,c,d,66); R4(d,e,a,b,c,67);
	R4(c,d,e,a,b,68); R4(b,c,d,e,a,69); R4(a,b,c,d,e,70); R4(e,a,b,c,d,71);
	R4(d,e,a,b,c,72); R4(c,d,e,a,b,73); R4(b,c,d,e,a,74); R4(a,b,c,d,e,75);
	R4(e,a,b,c,d,76); R4(d,e,a,b,c,77); R4(c,d,e,a,b,78); R4(b,c,d,e,a,79);
	/* Add the working vars back into context.state[] */
	state[0] += a;
	state[1] += b;
	state[2] += c;
	state[3] += d;
	state[4] += e;
}

/* Silly single-integer interface. 
  The real interface takes a variable-length bit string of data, transforms each
  SHA1_ndata size chunk, and finally tacks on an end-of-message 0x80 and a bit count.
  I'm ignoring all that here.
*/
void hash(unsigned int *src,unsigned int *hashes) {
	SHA1_word32 state[SHA1_nstate]; 
	SHA1_init(&state[0]);

	SHA1_word32 data[SHA1_ndata];
	for (int i=0;i<SHA1_ndata;i++) data[i]=0;
	data[0]=src[0];

	SHA1_transform(&state[0],&data[0]);
	hashes[0]=state[0];
}

/************ Main program ****************/
int foo(void) {
	unsigned int matches=0;
	int nbatches=1000000;
	unsigned int htarget=read_input();
	double start=time_in_seconds();
	for (int v=0;v<nbatches;v++) {
		unsigned int vs[1], hs[1];
		for (int s=0;s<1;s++) vs[s]=v+s;
		hash(vs,hs);
		for (int s=0;s<1;s++) {
		  unsigned int h=hs[s];
		  if (h==htarget) { /* we have a match! */
		 	matches++;
			std::cout<<"Hash["<<v+s<<"]="<<std::hex<<h<<"\n";
		  }
		}
	}
	double end=time_in_seconds();
	std::cout<<"Total time: "<< (end-start)*1.0e3 <<" milliseconds\n";
	std::cout<<"Time/hash: "<< (end-start)/nbatches*1.0e9 <<" nanoseconds\n";
	return matches;
}

(Try this in NetRun now!)

This runs in 178 nanoseconds per hash on my Sandy Bridge machine--about 5 million hashes per second, which is fairly respectable.

Multicore, via OpenMP

One simple way to run faster is to use multiple cores. The easiest way to use multiple cores is via OpenMP, a compiler directive that says "make this loop run with multiple threads across the cores".

To use it, add the line "#pragma omp parallel for" above the v loop in foo, and switch to "OpenMP" mode.
(Try this in NetRun now!)

This immediately gets us to 55 nanoseconds per hash, not quite a four-fold speedup due to overheads creating threads, and the single shared memory subsystem.

In real code, you must be very careful when using threads:

If the loop takes less than about 1000 nanoseconds, it will cost more to create the threads than you gain from multicore. This means adding the pragma willy-nilly can slow the program down, sometimes many hundreds of times slower!
If any of the functions executed in the parallel loop (the functions you call, or the functions *they* call, etc) access global variables, you can easily get the wrong answer or a crash at run time. The problem is that multiple threads will be reading and writing the global variables simultaneously, so unless protected by a "#pragma omp critical", it's easy for one thread to overwrite the other thread's work. This depends on the exact moment various pieces of code actually run, so it's not consistent from run to run--your code might only crash on a full moon when your boss is watching! The fix is to carefully audit your code and any libraries you call for "thread safety".

SSE: Streaming SIMD Extensions

Since threads take a long time to start up, hardware architects often use an orthogonal way to express parallelism called SIMD: Single Instruction, Multiple Data. Intel's version of this is called SSE, the Streaming SIMD Extensions. From C, you can #include <emmintrin.h>, and the crypo-friendly integer values are four-integer blocks named __m128i, and the operations end with "_epi32".

This means you can add two SIMD blocks of integers using:

#include <emmintrin.h> /* Intel SSE header */

int foo(void) {
	__m128i a=_mm_setr_epi32(1,2,3,4);
	__m128i b=_mm_setr_epi32(10,100,1000,1000);

	__m128i c=_mm_add_epi32(a,b);

	int out[4];
	_mm_store_si128((__m128i *)&out[0],c);
	for (int i=0;i<4;i++) std::cout<<out[i]<<" ";
	return 0;
}

(Try this in NetRun now!)

This is pretty darn ugly, so I built a header named "osl/floats.h" that defines a class named "ints". This class has overloaded operators to let you treat SSE values similar to ordinary single values. In fact, I can switch the typedef around in the SHA1 implementation above, and get a SSE version basically free!

Note that this version computes four totally independent hashes per run--this is the easiest way to use SIMD, computing unrelated values, because it eliminates dependencies between the slices of an SIMD register.

/* Secure Hash Algorithm-1 (SHA-1) implementation.
   Implemented and placed in the public domain by Steve Reid
Collected by Wei Dai (http://www.eskimo.com/~weidai/cryptlib.html)
Adapted by Orion Sky Lawlor, 7/20/2001
Re-adapted for single-file operation 2012-11.
*/
#undef __AVX__ // the SSE integer stuff is faster than AVX
#include "osl/floats.h"

typedef ints SHA1_word32;

... all SHA1 stuff works exactly the same way ...

void hash(unsigned int *src,unsigned int *hashes) {
	SHA1_word32 state[SHA1_nstate]; 
	SHA1_init(&state[0]);

	SHA1_word32 data[SHA1_ndata];
	for (int i=0;i<SHA1_ndata;i++) data[i]=0;
	for (int s=0;s<ints::n;s++) data[0][s]=src[s];

	SHA1_transform(&state[0],&data[0]);
	for (int s=0;s<ints::n;s++) hashes[s]=state[0][s];
}

/************ Main program ****************/
int foo(void) {
	unsigned int matches=0;
	int nbatches=1000000;
	unsigned int htarget=read_input();
	double start=time_in_seconds();

#pragma omp parallel for
	for (int v=0;v<nbatches;v+=ints::n) {
		unsigned int vs[ints::n], hs[ints::n];
		for (int s=0;s<ints::n;s++) vs[s]=v+s;
		hash(vs,hs);
		for (int s=0;s<ints::n;s++) {
		  unsigned int h=hs[s];
		  if (h==htarget) { /* we have a match! */
		 	matches++;
			std::cout<<"Hash["<<v+s<<"]="<<std::hex<<h<<"\n";
		  }
		}
	}
	double end=time_in_seconds();
	std::cout<<"Total time: "<< (end-start)*1.0e3 <<" milliseconds\n";
	std::cout<<"Time/hash: "<< (end-start)/nbatches*1.0e9 <<" nanoseconds\n";
	return matches;
}

(Try this in NetRun now!)

The end result is 19ns per hash, which is over 50 million hashes per second, and nearly 10x faster than our original sequential code!