movups xmm1,[thing1]; <- copy the four floats into xmm1
addps xmm1,xmm1; <- add floats to themselves
movups [retval],xmm1; <- store the result into the global "retval"
; Print out retval
extern farray_print
push 4 ;<- number of floats to print
push retval ;<- points to array of floats
call farray_print
add esp,8 ; <- pop off arguments
ret
section .data
thing1: dd 10.2, 100.2, 1000.2, 10000.2;<- source constant
retval: dd 0.0, 0.0, 0.0, 0.0 ;<- our return value
The "dd" lines declare a 32-bit constant. NASM is smart enough to automatically use float format if you type a constant with a decimal point!
Here's a version that loads up two separate float constants, and adds them:
movups xmm1,[thing1]; <- copy the four floats into xmm1
movups xmm6,[thing2]; <- copy the four floats into xmm6
addps xmm1,xmm6; <- add floats
movups [retval],xmm1; <- store the result into the global "retval"
; Print out retval
extern farray_print
push 4 ;<- number of floats to print
push retval ;<- points to array of floats
call farray_print
add esp,8 ; <- pop off arguments
ret
section .data
thing1: dd 10.2, 100.2, 1000.2, 10000.2;<- source constant
thing2: dd 1.2, 2.2, 3.2, 4.2;<- source constant
retval: dd 0.0, 0.0, 0.0, 0.0 ;<- our return value
| | Scalar Single-precision (float) | Scalar Double-precision (double) | Packed Single-precision (4 floats) | Packed Double-precision (2 doubles) | Comments |
|---|---|---|---|---|---|
| add | addss | addsd | addps | addpd | sub, mul, div all work the same way |
| min | minss | minsd | minps | minpd | max works the same way |
| sqrt | sqrtss | sqrtsd | sqrtps | sqrtpd | Square root (sqrt), reciprocal (rcp), and reciprocal-square-root (rsqrt) all work the same way |
| mov | movss | movsd | movaps (aligned), movups (unaligned) | movapd (aligned), movupd (unaligned) | Aligned loads are up to 4x faster, but will crash if given an unaligned address! The stack is kept 16-byte aligned specifically for this instruction; use the "align 16" directive for static data. |
| cvt | cvtss2sd cvtss2si cvttss2si | cvtsd2ss cvtsd2si cvttsd2si | cvtps2pd cvtps2dq cvttps2dq | cvtpd2ps cvtpd2dq cvttpd2dq | Convert to ("2", get it?) a single integer (si, stored in a regular register like eax) or four DWORDs (dq, stored in an xmm register). "cvtt" versions truncate (round toward zero); "cvt" versions round to nearest. |
| com | ucomiss | ucomisd | n/a | n/a | Sets CPU flags like the normal x86 "cmp" instruction, but from SSE registers. |
| cmp | cmpeqss | cmpeqsd | cmpeqps | cmpeqpd | Compare for equality ("lt", "le", "neq", "nlt", "nle" versions work the same way). Sets all bits of the float to zero if false (0.0), or all bits to one if true (a NaN). The result is used as a bitmask for the bitwise AND and OR operations. |
| and | n/a | n/a | andps, andnps | andpd, andnpd | Bitwise AND operation. "andn" versions are bitwise AND-NOT operations (A=(~A) & B). The "or" version works the same way. |
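For example, the difference between the rounding "cvt" and truncating "cvtt" conversions is easy to see from C++ via the matching intrinsics (a minimal standalone sketch, not part of the original notes; _mm_cvtss_si32 and _mm_cvttss_si32 compile down to cvtss2si and cvttss2si):
#include <xmmintrin.h> /* SSE intrinsics */
#include <cstdio>
int main(void) {
    __m128 p=_mm_set_ss(3.7f);   /* +3.7 in the low float */
    __m128 m=_mm_set_ss(-3.7f);  /* -3.7 in the low float */
    printf("cvtss2si(3.7)=%d  cvttss2si(3.7)=%d\n", _mm_cvtss_si32(p), _mm_cvttss_si32(p));   /* 4 and 3 */
    printf("cvtss2si(-3.7)=%d cvttss2si(-3.7)=%d\n", _mm_cvtss_si32(m), _mm_cvttss_si32(m));  /* -4 and -3 */
    return 0;
}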
movss xmm3,[pi]; load up constant
addss xmm3,xmm3 ; add pi to itself
cvtss2si eax,xmm3 ; round to integer
ret
section .data
pi: dd 3.14159265358979 ; constant
It's annoyingly tricky to display full floating-point values. The trouble here is that our function "foo" returns an int back to main, so we have to call a function to print floating-point values. Also, with SSE floating point on a 64-bit machine you're supposed to keep the stack aligned to a 16-byte boundary (the SSE "movaps" instruction crashes if it's not given a 16-byte aligned address). Sadly, the "call" instruction messes up your stack's alignment by pushing an 8-byte return address, so we've got to use up another 8 bytes of stack space purely for stack alignment, like this:
movss xmm3,[pi]; load up constant
addss xmm3,xmm3 ; add pi to itself
movss [output],xmm3; write register out to memory
; Print floating-point output
mov rdi,output ; first parameter: pointer to floats
mov rsi,1 ; second parameter: number of floats
sub rsp,8 ; keep stack 16-byte aligned (else get crash!)
extern farray_print
call farray_print
add rsp,8
ret
section .data
pi: dd 3.14159265358979 ; constant
output: dd 0.0 ; overwritten at runtime
| Intrinsic | Purpose |
|---|---|
| __m128 _mm_load_ps(float *src) | Load 4 floats from a 16-byte aligned address. WARNING: Segfaults if the address isn't a multiple of 16! |
| __m128 _mm_loadu_ps(float *src) | Load 4 floats from an unaligned address (4x slower on older machines; the same speed on the latest machines *if* the data happens to be aligned) |
| __m128 _mm_load1_ps(float *src) | Load 1 individual float into all 4 fields of an __m128 |
| __m128 _mm_setr_ps(float a,float b,float c,float d) | Load 4 separate floats from parameters into an __m128 |
| void _mm_store_ps(float *dest,__m128 src) | Store 4 floats to a 16-byte aligned address |
| void _mm_storeu_ps(float *dest,__m128 src) | Store 4 floats to an unaligned address |
| __m128 _mm_add_ps(__m128 a,__m128 b) | Add corresponding floats (also "sub") |
| __m128 _mm_mul_ps(__m128 a,__m128 b) | Multiply corresponding floats (also "div", but it's slow) |
| __m128 _mm_min_ps(__m128 a,__m128 b) | Take the corresponding minimum (also "max") |
| __m128 _mm_sqrt_ps(__m128 a) | Take square roots of 4 floats (12ns, slow like divide) |
| __m128 _mm_rcp_ps(__m128 a) | Compute a rough (12-bit accuracy) reciprocal of all 4 floats (as fast as an add!) |
| __m128 _mm_rsqrt_ps(__m128 a) | Rough (12-bit) reciprocal-square-root of all 4 floats (fast) |
| __m128 _mm_shuffle_ps(__m128 lo,__m128 hi, _MM_SHUFFLE(hi3,hi2,lo1,lo0)) | Interleave inputs into the low 2 floats and high 2 floats of the output. Basically out[0]=lo[lo0]; out[1]=lo[lo1]; out[2]=hi[hi2]; out[3]=hi[hi3]; For example, _mm_shuffle_ps(a,a,_MM_SHUFFLE(i,i,i,i)) copies the float a[i] into all 4 output floats. |
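As a quick illustration of the shuffle entry above, here's a minimal standalone sketch (the test values are made up) that broadcasts one element of a vector into all four lanes:
#include <xmmintrin.h> /* SSE intrinsics, including _MM_SHUFFLE */
#include <cstdio>
int main(void) {
    float src[4]={1.0f,2.0f,3.0f,4.0f};
    __m128 a=_mm_loadu_ps(src);
    __m128 b=_mm_shuffle_ps(a,a,_MM_SHUFFLE(2,2,2,2)); /* copy element 2 (3.0) into all four lanes */
    float out[4];
    _mm_storeu_ps(out,b);
    printf("%f %f %f %f\n",out[0],out[1],out[2],out[3]); /* prints 3.0 four times */
    return 0;
}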
for (int i=0;i<n_vals;i++) {
vals[i]=vals[i]*a+b;
}
Unrolling this loop four times:
for (int i=0;i<n_vals;i+=4) {
vals[i+0]=vals[i+0]*a+b;
vals[i+1]=vals[i+1]*a+b;
vals[i+2]=vals[i+2]*a+b;
vals[i+3]=vals[i+3]*a+b;
}
And here's the same loop written with SSE intrinsics, processing four floats per trip through the loop:
__m128 SSEa=_mm_load1_ps(&a);
__m128 SSEb=_mm_load1_ps(&b);
for (int i=0;i<n_vals;i+=4) {
__m128 v=_mm_load_ps(&vals[i]);
v=_mm_add_ps(_mm_mul_ps(v,SSEa),SSEb);
_mm_store_ps(&vals[i],v);
}
This gives us about a 4x speedup over the original, and still a 2x speedup over the unrolled version!
Here's a straightforward 4x4 matrix-vector multiply:
enum {n=4};
float mat[n][n];
float vec[n];
float outvector[n];
int foo(void) {
for (int row=0;row<4;row++) {
float sum=0.0;
for (int col=0;col<n;col++) {
float m=mat[row][col];
float v=vec[col];
sum+=m*v;
}
outvector[row]=sum;
}
return 0;
}
Unrolling the inner loop, as ugly as it is, speeds things up substantially, to 26ns:
enum {n=4};
float mat[n][n];
float vec[n];
float outvector[n];
int foo(void) {
for (int row=0;row<4;row++) {
float sum=0.0, m,v;
m=mat[row][0];
v=vec[0];
sum+=m*v;
m=mat[row][1];
v=vec[1];
sum+=m*v;
m=mat[row][2];
v=vec[2];
sum+=m*v;
m=mat[row][3];
v=vec[3];
sum+=m*v;
outvector[row]=sum;
}
return 0;
}
Making a line-by-line transformation to SSE doesn't really buy any performance, at 25ns:
#include <pmmintrin.h> /* SSE3, for _mm_hadd_ps */
enum {n=4};
__m128 mat[n]; /* rows */
__m128 vec;
float outvector[n];
int foo(void) {
for (int row=0;row<n;row++) {
__m128 mrow=mat[row];
__m128 v=vec;
__m128 sum=mrow*v; /* operator* on __m128 is a GCC extension (same as _mm_mul_ps) */
sum=_mm_hadd_ps(sum,sum); /* adds adjacent-two floats */
_mm_store_ss(&outvector[row],_mm_hadd_ps(sum,sum)); /* adds those floats */
}
return 0;
}
The trouble here is that we can cheaply operate on 4-vectors, but summing up the elements of those 4-vectors (with the hadd instruction) is expensive. We can eliminate that horizontal summation by operating on columns, although now we need a new matrix layout. This is down to 19ns on a Pentium 4, and just 12ns on the Q6600!
#include <xmmintrin.h>
enum {n=4};
__m128 mat[n]; /* by column */
float vec[n];
__m128 outvector;
int foo(void) {
float z=0.0;
__m128 sum=_mm_load1_ps(&z);
for (int col=0;col<n;col++) {
__m128 mcol=mat[col];
float v=vec[col];
sum+=mcol*_mm_load1_ps(&v);
}
outvector=sum;
return 0;
}
The bottom line is that, as is *typical*, just making the inner
loop parallel requires extensive changes to the data layout and
processing throughout the program.
You can transform a conditional assignment
if (x) y=a; else y=b; // conditional
into any of these forms:
y=b+x*(a-b); // linear version (assumes a-b works)
y=x*a+(1-x)*b; // rearranged version of above
y=(x&a)|((~x)&b); // bitwise version (assumes x is all zero bits, or all one bits)
Note that the last one is just a software version of a hardware multiplexer, and it works well in SIMD mode.
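Here's a tiny standalone sketch (not from the original notes) of the bitwise-multiplexer form using integer masks, where the bitwise operators are plain legal C++:
#include <cstdio>
int main(void) {
    int a=3, b=7;
    for (int flag=0;flag<=1;flag++) {
        unsigned int x = flag ? 0xffffffffu : 0u; /* all ones if true, all zeros if false */
        int y = (x&a) | ((~x)&b);                 /* picks a when x is all ones, b when all zeros */
        printf("flag=%d -> y=%d\n",flag,y);       /* prints 7, then 3 */
    }
    return 0;
}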
For example, take this loop, which has a branch in the middle:
for (int i=0;i<n;i++) {
if (vec[i]<7)
vec[i]=vec[i]*a+b;
else
vec[i]=c;
}
Using the bitwise multiplexer, we can rewrite the loop body without a branch:
for (int i=0;i<n;i++) {
unsigned int mask=(vec[i]<7)?0xffFFffFF:0;
vec[i]=((vec[i]*a+b)&mask) | (c&~mask);
}
Written in ordinary sequential code, this is actually a slowdown, not a speedup! (It's also only conceptual C, since you can't apply bitwise operators directly to floats; the SSE intrinsics below do the equivalent bit manipulation for us.) But in SSE this branch-to-logical transformation means you can keep barreling along in parallel, without having to switch to sequential floating point to do the branches:
__m128 A=_mm_load1_ps(&a), B=_mm_load1_ps(&b), C=_mm_load1_ps(&c);
__m128 Thresh=_mm_load1_ps(&thresh); // thresh is the comparison value (7 in the scalar version above)
for (int i=0;i<n;i+=4) {
__m128 V=_mm_load_ps(&vec[i]);
__m128 mask=_mm_cmplt_ps(V,Thresh); // Do all four comparisons
__m128 V_then=_mm_add_ps(_mm_mul_ps(V,A),B); // "then" half of "if"
__m128 V_else=C; // "else" half of "if"
V=_mm_or_ps( _mm_and_ps(mask,V_then), _mm_andnot_ps(mask,V_else) );
_mm_store_ps(&vec[i],V);
}
This gives about a 3.8x speedup over the original loop on my machine... but the code is horrible!
#include <xmmintrin.h>
class fourfloats; /* forward declaration */
/* Wrapper around four bitmasks: 0 if false, all-ones (NAN) if true.
This class is used to implement comparisons on SSE registers.
*/
class fourmasks {
__m128 mask;
public:
fourmasks(__m128 m) {mask=m;}
__m128 if_then_else(fourfloats dthen,fourfloats delse);
};
/* Nice wrapper around __m128:
it represents four floating point values. */
class fourfloats {
__m128 v;
public:
fourfloats(float onevalue) { v=_mm_load1_ps(&onevalue); }
fourfloats(__m128 ssevalue) {v=ssevalue;} // put in an SSE value
operator __m128 () const {return v;} // take out an SSE value
fourfloats(const float *fourvalues) { v=_mm_load_ps(fourvalues);}
void store(float *fourvalues) {_mm_store_ps(fourvalues,v);}
/* arithmetic operations return blocks of floats */
fourfloats operator+(const fourfloats &right) {
return _mm_add_ps(v,right.v);
}
/* comparison operations return blocks of masks (bools) */
fourmasks operator<(const fourfloats &right) {
return _mm_cmplt_ps(v,right.v);
}
};
inline __m128 fourmasks::if_then_else(fourfloats dthen,fourfloats delse) {
return _mm_and_ps(mask,dthen)+ /* note: + on __m128 is a GCC extension; _mm_or_ps would also work */
_mm_andnot_ps(mask,delse);
}
float src[4]={1.0,5.0,3.0,4.0};
float dest[4];
int foo(void) {
/*
// Serial code
for (int i=0;i<4;i++) {
if (src[i]<4.0) dest[i]=src[i]*2.0; else dest[i]=17.0;
}
*/
// Parallel code
fourfloats s(src);
fourfloats d=(s<4.0).if_then_else(s+s,17.0);
d.store(dest);
//farray_print(dest,4); // <- for debugging
return 0;
}
Compilers are now good enough that there is zero speed penalty due to the nice "fourfloats" class: the nice syntax comes for free!
Here's a slightly better developed wrapper:
#include <iostream> /* for std::ostream, used by the printing support below */
#include <immintrin.h> /* Intel SIMD intrinsics header (use emmintrin.h if this fails) */
class floats; /* forward declaration */
/* This is a SIMD block of boolean flags. */
class booleans {
__m128 v;
public:
enum {n=4}; // how many bools we work on at once
booleans(__m128 n) :v(n) {}
/* Logical operations on booleans: */
friend booleans operator&&(const booleans &a,const booleans &b) {return _mm_and_ps(a.v,b.v);}
friend booleans operator||(const booleans &a,const booleans &b) {return _mm_or_ps(a.v,b.v);}
/*
Return "the_then" where we are true, and "the_else" where we are false.
Basically performs a per-element branch, but without branching.
Example:
booleans m=(a<b);
c=m.then_else(3,7);
*/
floats then_else(const floats &the_then,const floats &the_else);
};
/* This is a SIMD block of single-precision floats. */
class floats {
__m128 v;
public:
enum {n=4}; // floats::n is how many floats we work on at once
floats() {}
floats(__m128 n) :v(n) {}
floats(const float *src) {v=_mm_loadu_ps(src);}
floats(float single) {v=_mm_load1_ps(&single);}
operator __m128 (void) const {return v;}
void store(float *dest) const {_mm_storeu_ps(dest,v);}
/* Caution: this only works if dest is 16-byte aligned. */
void store_aligned(float *dest) const {_mm_store_ps(dest,v);}
/* Printing support */
friend std::ostream &operator<<(std::ostream &o,const floats &a) {
float out[floats::n];
a.store(out);
for (int i=0;i<floats::n;i++) o<<out[i]<<" ";
return o;
}
/* Arithmetic operations */
friend floats operator+(const floats &a,const floats &b) {return _mm_add_ps(a.v,b.v);}
friend floats operator-(const floats &a,const floats &b) {return _mm_sub_ps(a.v,b.v);}
friend floats operator*(const floats &a,const floats &b) {return _mm_mul_ps(a.v,b.v);}
friend floats operator/(const floats &a,const floats &b) {return _mm_div_ps(a.v,b.v);}
friend floats min(const floats &a,const floats &b) {return _mm_min_ps(a.v,b.v);}
friend floats max(const floats &a,const floats &b) {return _mm_max_ps(a.v,b.v);}
friend floats sqrt(const floats &a) {return _mm_sqrt_ps(a.v);}
/* Comparison operators */
friend booleans operator<(const floats &a,const floats &b) {return _mm_cmplt_ps(a.v,b.v);}
friend booleans operator<=(const floats &a,const floats &b) {return _mm_cmple_ps(a.v,b.v);}
friend booleans operator>=(const floats &a,const floats &b) {return _mm_cmpge_ps(a.v,b.v);}
friend booleans operator>(const floats &a,const floats &b) {return _mm_cmpgt_ps(a.v,b.v);}
friend booleans operator==(const floats &a,const floats &b) {return _mm_cmpeq_ps(a.v,b.v);}
friend booleans operator!=(const floats &a,const floats &b) {return _mm_cmpneq_ps(a.v,b.v);}
};
inline floats booleans::then_else(const floats &the_then,const floats &the_else)
{
return _mm_or_ps(
_mm_and_ps(v,the_then),
_mm_andnot_ps(v,the_else)
);
}
/* A little test code: */
float a[4]={1.375,1.567,1.876,1.999};
float b[4]={10.0,0.1,0.2,10.3};
float c[4]={5.0,5.1,5.2,5.3};
float d[4]={8.1,8.2,8.3,8.4};
float out[4];
int foo(void) {
floats A(a), B(b);
booleans m=(A<B);
std::cout<<"then_else="<<m.then_else(3,c)<<"\n";
return 0;
}
Here's the same idea extended to 8-float AVX vectors. Performance, as you might expect, is still about one instruction per clock cycle despite the longer vectors--Intel just built wider floating point hardware!
#include <iostream> /* for std::ostream, used by the printing support below */
#include <immintrin.h> /* AVX + SSE4 intrinsics header */
// Internal class: do not use...
class not_vec8 {
__m256 v; // bitwise inverse of our value (!!)
public:
not_vec8(__m256 val) {v=val;}
__m256 get(void) const {return v;} // returns INVERSE of our value (!!)
};
// This is the class to use!
class vec8 {
__m256 v;
public:
vec8(__m256 val) {v=val;}
vec8(const float *src) {v=_mm256_loadu_ps(src);}
vec8(float x) {v=_mm256_broadcast_ss(&x);}
vec8 operator+(const vec8 &rhs) const {return _mm256_add_ps(v,rhs.v);}
vec8 operator-(const vec8 &rhs) const {return _mm256_sub_ps(v,rhs.v);}
vec8 operator*(const vec8 &rhs) const {return _mm256_mul_ps(v,rhs.v);}
vec8 operator/(const vec8 &rhs) const {return _mm256_div_ps(v,rhs.v);}
vec8 operator&(const vec8 &rhs) const {return _mm256_and_ps(v,rhs.v);}
vec8 operator|(const vec8 &rhs) const {return _mm256_or_ps(v,rhs.v);}
vec8 operator^(const vec8 &rhs) const {return _mm256_xor_ps(v,rhs.v);}
vec8 operator==(const vec8 &rhs) const {return _mm256_cmp_ps(v,rhs.v,_CMP_EQ_OQ);}
vec8 operator!=(const vec8 &rhs) const {return _mm256_cmp_ps(v,rhs.v,_CMP_NEQ_OQ);}
vec8 operator<(const vec8 &rhs) const {return _mm256_cmp_ps(v,rhs.v,_CMP_LT_OQ);}
vec8 operator<=(const vec8 &rhs) const {return _mm256_cmp_ps(v,rhs.v,_CMP_LE_OQ);}
vec8 operator>(const vec8 &rhs) const {return _mm256_cmp_ps(v,rhs.v,_CMP_GT_OQ);}
vec8 operator>=(const vec8 &rhs) const {return _mm256_cmp_ps(v,rhs.v,_CMP_GE_OQ);}
not_vec8 operator~(void) const {return not_vec8(v);}
__m256 get(void) const {return v;}
float *store(float *ptr) { /* Caution: ptr must be 32-byte aligned (use _mm256_storeu_ps for unaligned stores) */
_mm256_store_ps(ptr,v);
return ptr;
}
float &operator[](int index) { return ((float *)&v)[index]; }
float operator[](int index) const { return ((const float *)&v)[index]; }
friend std::ostream &operator<<(std::ostream &o,const vec8 &y) {
for (int i=0;i<8;i++) o<<y[i]<<" ";
return o;
}
friend vec8 operator&(const vec8 &lhs,const not_vec8 &rhs)
{return _mm256_andnot_ps(rhs.get(),lhs.get());}
friend vec8 operator&(const not_vec8 &lhs, const vec8 &rhs)
{return _mm256_andnot_ps(lhs.get(),rhs.get());}
vec8 if_then_else(const vec8 &then,const vec8 &else_part) const {
return _mm256_or_ps( _mm256_and_ps(v,then.v),
_mm256_andnot_ps(v, else_part.v)
);
}
};
vec8 sqrt(const vec8 &v) {
return _mm256_sqrt_ps(v.get());
}
vec8 rsqrt(const vec8 &v) {
return _mm256_rsqrt_ps(v.get());
}
/* Returns, in each 128-bit lane, the dot product of that lane's 4 floats of a & b,
   replicated 4 times (_mm256_hadd_ps only adds within each 128-bit lane). */
inline vec8 dot(const vec8 &a,const vec8 &b) {
vec8 t=a*b;
__m256 vt=_mm256_hadd_ps(t.get(),t.get());
vt=_mm256_hadd_ps(vt,vt);
return vt;
}
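If you want a single dot product across all 8 floats, you have to combine the two 128-bit lanes yourself. Here's a minimal sketch of one way to do that; the helper name dot8 is mine, not part of the original wrapper:
/* Hypothetical helper: full 8-wide dot product of a & b, returned as one float. */
inline float dot8(const vec8 &a,const vec8 &b) {
    vec8 t=a*b;
    __m128 lo=_mm256_castps256_ps128(t.get()); /* low 4 floats */
    __m128 hi=_mm256_extractf128_ps(t.get(),1); /* high 4 floats */
    __m128 s=_mm_add_ps(lo,hi); /* element-wise sum of the two lanes */
    s=_mm_hadd_ps(s,s); /* adds adjacent pairs */
    s=_mm_hadd_ps(s,s); /* now every lane holds the total */
    return _mm_cvtss_f32(s); /* extract lane 0 */
}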
float data[8]={1.2,2.3,3.4,1.5,10.0,100.0,1000.0,10000.0};
vec8 a(data);
vec8 b(10.0);
vec8 c(0.0);
int foo(void) {
c=(a<b).if_then_else(3.7,c);
return 0;
}