/* Secure Hash Algorithm-1 (SHA-1) implementation. Implemented and placed in the public domain by Steve Reid Collected by Wei Dai (http://www.eskimo.com/~weidai/cryptlib.html) Adapted by Orion Sky Lawlor, 7/20/2001 Re-adapted for single-file operation 2012-11. */ typedef unsigned int SHA1_word32; enum {SHA1_nstate=5}; enum {SHA1_ndata=16}; /// Initialize SHA1_hash_words of state. void SHA1_init(SHA1_word32 *state) { state[0] = 0x67452301u; state[1] = 0xEFCDAB89u; state[2] = 0x98BADCFEu; state[3] = 0x10325476u; state[4] = 0xC3D2E1F0u; } /// Circular left shift in 32 bits inline SHA1_word32 rotlFixed(SHA1_word32 x, SHA1_word32 y) { #if defined(_MSC_VER) || defined(__BCPLUSPLUS__) return y ? _lrotl(x, y) : x; #elif defined(__MWERKS__) && TARGET_CPU_PPC return y ? __rlwinm(x,y,0,31) : x; #else /*Default C version*/ /* works for int, don't and for SIMD return ((0xFFffFFffu)&(x<<y)) | (((0xFFffFFffu)&x)>>(32-y)); */ return ((x<<y)) | ((x)>>(32-y)); #endif } #define blk0(i) (W[i] = data[i]) #define blk1(i) (W[i&15] = rotlFixed(W[(i+13)&15]^W[(i+8)&15]^W[(i+2)&15]^W[i&15],1)) #define f1(x,y,z) (z^(x&(y^z))) #define f2(x,y,z) (x^y^z) #define f3(x,y,z) ((x&y)|(z&(x|y))) #define f4(x,y,z) (x^y^z) /* (R0+R1), R2, R3, R4 are the different operations used in SHA1 */ #define R0(v,w,x,y,z,i) z+=f1(w,x,y)+blk0(i)+0x5A827999u+rotlFixed(v,5);w=rotlFixed(w,30); #define R1(v,w,x,y,z,i) z+=f1(w,x,y)+blk1(i)+0x5A827999u+rotlFixed(v,5);w=rotlFixed(w,30); #define R2(v,w,x,y,z,i) z+=f2(w,x,y)+blk1(i)+0x6ED9EBA1u+rotlFixed(v,5);w=rotlFixed(w,30); #define R3(v,w,x,y,z,i) z+=f3(w,x,y)+blk1(i)+0x8F1BBCDCu+rotlFixed(v,5);w=rotlFixed(w,30); #define R4(v,w,x,y,z,i) z+=f4(w,x,y)+blk1(i)+0xCA62C1D6u+rotlFixed(v,5);w=rotlFixed(w,30); void SHA1_transform(SHA1_word32 *state, const SHA1_word32 *data) { SHA1_word32 W[16]; /* Copy state to working vars */ SHA1_word32 a = state[0]; SHA1_word32 b = state[1]; SHA1_word32 c = state[2]; SHA1_word32 d = state[3]; SHA1_word32 e = state[4]; /* 4 rounds of 20 operations each. Loop unrolled. */ R0(a,b,c,d,e, 0); R0(e,a,b,c,d, 1); R0(d,e,a,b,c, 2); R0(c,d,e,a,b, 3); R0(b,c,d,e,a, 4); R0(a,b,c,d,e, 5); R0(e,a,b,c,d, 6); R0(d,e,a,b,c, 7); R0(c,d,e,a,b, 8); R0(b,c,d,e,a, 9); R0(a,b,c,d,e,10); R0(e,a,b,c,d,11); R0(d,e,a,b,c,12); R0(c,d,e,a,b,13); R0(b,c,d,e,a,14); R0(a,b,c,d,e,15); R1(e,a,b,c,d,16); R1(d,e,a,b,c,17); R1(c,d,e,a,b,18); R1(b,c,d,e,a,19); R2(a,b,c,d,e,20); R2(e,a,b,c,d,21); R2(d,e,a,b,c,22); R2(c,d,e,a,b,23); R2(b,c,d,e,a,24); R2(a,b,c,d,e,25); R2(e,a,b,c,d,26); R2(d,e,a,b,c,27); R2(c,d,e,a,b,28); R2(b,c,d,e,a,29); R2(a,b,c,d,e,30); R2(e,a,b,c,d,31); R2(d,e,a,b,c,32); R2(c,d,e,a,b,33); R2(b,c,d,e,a,34); R2(a,b,c,d,e,35); R2(e,a,b,c,d,36); R2(d,e,a,b,c,37); R2(c,d,e,a,b,38); R2(b,c,d,e,a,39); R3(a,b,c,d,e,40); R3(e,a,b,c,d,41); R3(d,e,a,b,c,42); R3(c,d,e,a,b,43); R3(b,c,d,e,a,44); R3(a,b,c,d,e,45); R3(e,a,b,c,d,46); R3(d,e,a,b,c,47); R3(c,d,e,a,b,48); R3(b,c,d,e,a,49); R3(a,b,c,d,e,50); R3(e,a,b,c,d,51); R3(d,e,a,b,c,52); R3(c,d,e,a,b,53); R3(b,c,d,e,a,54); R3(a,b,c,d,e,55); R3(e,a,b,c,d,56); R3(d,e,a,b,c,57); R3(c,d,e,a,b,58); R3(b,c,d,e,a,59); R4(a,b,c,d,e,60); R4(e,a,b,c,d,61); R4(d,e,a,b,c,62); R4(c,d,e,a,b,63); R4(b,c,d,e,a,64); R4(a,b,c,d,e,65); R4(e,a,b,c,d,66); R4(d,e,a,b,c,67); R4(c,d,e,a,b,68); R4(b,c,d,e,a,69); R4(a,b,c,d,e,70); R4(e,a,b,c,d,71); R4(d,e,a,b,c,72); R4(c,d,e,a,b,73); R4(b,c,d,e,a,74); R4(a,b,c,d,e,75); R4(e,a,b,c,d,76); R4(d,e,a,b,c,77); R4(c,d,e,a,b,78); R4(b,c,d,e,a,79); /* Add the working vars back into context.state[] */ state[0] += a; state[1] += b; state[2] += c; state[3] += d; state[4] += e; } /* Silly single-integer interface. The real interface takes a variable-length bit string of data, transforms each SHA1_ndata size chunk, and finally tacks on an end-of-message 0x80 and a bit count. I'm ignoring all that here. */ void hash(unsigned int *src,unsigned int *hashes) { SHA1_word32 state[SHA1_nstate]; SHA1_init(&state[0]); SHA1_word32 data[SHA1_ndata]; for (int i=0;i<SHA1_ndata;i++) data[i]=0; data[0]=src[0]; SHA1_transform(&state[0],&data[0]); hashes[0]=state[0]; } /************ Main program ****************/ int foo(void) { unsigned int matches=0; int nbatches=1000000; unsigned int htarget=read_input(); double start=time_in_seconds(); for (int v=0;v<nbatches;v++) { unsigned int vs[1], hs[1]; for (int s=0;s<1;s++) vs[s]=v+s; hash(vs,hs); for (int s=0;s<1;s++) { unsigned int h=hs[s]; if (h==htarget) { /* we have a match! */ matches++; std::cout<<"Hash["<<v+s<<"]="<<std::hex<<h<<"\n"; } } } double end=time_in_seconds(); std::cout<<"Total time: "<< (end-start)*1.0e3 <<" milliseconds\n"; std::cout<<"Time/hash: "<< (end-start)/nbatches*1.0e9 <<" nanoseconds\n"; return matches; }This runs in 178 nanoseconds per hash on my Sandy Bridge machine--about 5 million hashes per second, which is fairly respectable.
One simple way to run faster is to use multiple cores. The easiest way to use multiple cores is via OpenMP, a compiler directive that says "make this loop run with multiple threads across the cores".
To use it, add the line "#pragma omp parallel for" above the v
loop in foo, and switch to "OpenMP" mode.
(Try
this in NetRun now!)
This immediately gets us to 55 nanoseconds per hash, not quite a
four-fold speedup due to overheads creating threads, and the
single shared memory subsystem.
In real code, you must be very careful when using threads:
Since threads take a long time to start up, hardware architects
often use an orthogonal way to express parallelism called SIMD:
Single Instruction, Multiple Data. Intel's version of this
is called SSE, the Streaming SIMD Extensions. From C, you
can #include <emmintrin.h>, and
the crypo-friendly integer values are four-integer blocks named
__m128i, and the operations end with "_epi32".
This means you can add two SIMD blocks of integers using:
#include <emmintrin.h> /* Intel SSE header */ int foo(void) { __m128i a=_mm_setr_epi32(1,2,3,4); __m128i b=_mm_setr_epi32(10,100,1000,1000);
__m128i c=_mm_add_epi32(a,b);
int out[4]; _mm_store_si128((__m128i *)&out[0],c); for (int i=0;i<4;i++) std::cout<<out[i]<<" "; return 0; }
This is pretty darn ugly, so I built a header named "osl/floats.h"
that defines a class named "ints". This class has overloaded
operators to let you treat SSE values similar to ordinary single
values. In fact, I can switch the typedef around in the SHA1
implementation above, and get a SSE version basically free!
Note that this version computes four totally independent hashes
per run--this is the easiest way to use SIMD, computing unrelated
values, because it eliminates dependencies between the slices of
an SIMD register.
/* Secure Hash Algorithm-1 (SHA-1) implementation. Implemented and placed in the public domain by Steve Reid Collected by Wei Dai (http://www.eskimo.com/~weidai/cryptlib.html) Adapted by Orion Sky Lawlor, 7/20/2001 Re-adapted for single-file operation 2012-11. */ #undef __AVX__ // the SSE integer stuff is faster than AVX #include "osl/floats.h" typedef ints SHA1_word32;
... all SHA1 stuff works exactly the same way ...
void hash(unsigned int *src,unsigned int *hashes) { SHA1_word32 state[SHA1_nstate]; SHA1_init(&state[0]); SHA1_word32 data[SHA1_ndata]; for (int i=0;i<SHA1_ndata;i++) data[i]=0; for (int s=0;s<ints::n;s++) data[0][s]=src[s]; SHA1_transform(&state[0],&data[0]); for (int s=0;s<ints::n;s++) hashes[s]=state[0][s]; } /************ Main program ****************/ int foo(void) { unsigned int matches=0; int nbatches=1000000; unsigned int htarget=read_input(); double start=time_in_seconds(); #pragma omp parallel for for (int v=0;v<nbatches;v+=ints::n) { unsigned int vs[ints::n], hs[ints::n]; for (int s=0;s<ints::n;s++) vs[s]=v+s; hash(vs,hs); for (int s=0;s<ints::n;s++) { unsigned int h=hs[s]; if (h==htarget) { /* we have a match! */ matches++; std::cout<<"Hash["<<v+s<<"]="<<std::hex<<h<<"\n"; } } } double end=time_in_seconds(); std::cout<<"Total time: "<< (end-start)*1.0e3 <<" milliseconds\n"; std::cout<<"Time/hash: "<< (end-start)/nbatches*1.0e9 <<" nanoseconds\n"; return matches; }
The end result is 19ns per hash,
which is over 50 million hashes per second, and nearly 10x faster
than our original sequential code!