/* Count the bits in integer type INTISH by doubling a single 1 bit until it
   falls off the top of the type, then print the type's size in bytes, the
   bit count, and the last nonzero value seen (the high bit) to std::cout.

   name: label printed at the start of the output line.
   INTISH must be an integer type no wider than unsigned long long.

   Lecture note: on our usual 64-bit machine, this prints out the listing
   shown after foo. */
template <class INTISH>
void count_bits(const char *name) {
	INTISH i=(INTISH)1, lastgood=0;
	int bits=0;
	do {
		bits++;
		lastgood=i;
		/* Double in an unsigned type: i+i on a signed type is undefined
		   behavior once it overflows, so an optimizer may never see i==0.
		   Converting back to INTISH keeps only the low bits, so the high
		   bit still eventually shifts out, leaving i==0. */
		i=(INTISH)((unsigned long long)i*2u);
	} while (i!=0);
	/* Unary + promotes char-sized values to int so they print as numbers
	   (-128, 128) instead of as raw characters. */
	std::cout<<name<<" has "<<sizeof(INTISH)<<" bytes, "<<bits<<" bits ("<<+lastgood<<" high bit)\n";
}
/* Entry point for the demo: runs count_bits on each built-in integer type,
   signed flavor first and then its unsigned twin, from narrowest to widest.
   Always returns 0. */
int foo(void)
{
	/* char-sized */
	count_bits<char>("char");
	count_bits<unsigned char>("unsigned char");

	/* short-sized */
	count_bits<short>("short");
	count_bits<unsigned short>("unsigned short");

	/* int-sized */
	count_bits<int>("int");
	count_bits<unsigned int>("unsigned int");

	/* long-sized */
	count_bits<long>("long");
	count_bits<unsigned long>("unsigned long");

	return 0;
}
char has 1 bytes, 8 bits (-128 high bit)
unsigned char has 1 bytes, 8 bits (128 high bit) assembly BYTE
short has 2 bytes, 16 bits (-32768 high bit)
unsigned short has 2 bytes, 16 bits (32768 high bit) assembly WORD
int has 4 bytes, 32 bits (-2147483648 high bit)
unsigned int has 4 bytes, 32 bits (2147483648 high bit) assembly DWORD
long has 8 bytes, 64 bits (-9223372036854775808 high bit)
unsigned long has 8 bytes, 64 bits (9223372036854775808 high bit) assembly QWORD
See the middle of the bytes lecture for more size information.
; Count number of bits in floating-point mantissa (single precision).
; This returns 24, meaning my 32-bit float can represent 1.0+1.0*2^-23
; exactly, but 1.0+1.0*2^-24 gets rounded off to 1.0.
;
; Syntax: NASM, x86-64, scalar SSE.
; Out:    eax = number of times testbit was halved before 1.0+testbit == 1.0
; Uses:   xmm0 (testbit), xmm2 (1+testbit), xmm5 (0.5), xmm10 (1.0), flags
        movss   xmm10, [rel one]        ; load constants; RIP-relative so no absolute address is needed
        movss   xmm5, [rel one_half]
        movss   xmm0, xmm10             ; testbit--drops by half every iteration
        xor     eax, eax                ; bit count = 0

loopstart:
        add     eax, 1                  ; one more halving of testbit
        mulss   xmm0, xmm5              ; multiply by one half: drops down to next test bit
        movss   xmm2, xmm0              ; build test pattern: copy testbit...
        addss   xmm2, xmm10             ; ...and compute 1+testbit
        ucomiss xmm2, xmm10             ; did the sum round back to exactly 1.0?
        jne     loopstart               ; not yet: testbit still fits in the mantissa, try again
        ret

section .data
one:            dd 1.0                  ; constants
one_half:       dd 0.5
; Count number of bits in floating-point mantissa (double precision).
; This returns 53: clearly, a 64-bit "double" uses most of its bits to
; represent the mantissa!
;
; Syntax: NASM, x86-64, scalar SSE2.
; Out:    eax = number of times testbit was halved before 1.0+testbit == 1.0
; Uses:   xmm0 (testbit), xmm2 (1+testbit), xmm5 (0.5), xmm10 (1.0), flags
        movsd   xmm10, [rel one]        ; load constants; RIP-relative so no absolute address is needed
        movsd   xmm5, [rel one_half]
        movsd   xmm0, xmm10             ; testbit--drops by half every iteration
        xor     eax, eax                ; bit count = 0

loopstart:
        add     eax, 1                  ; one more halving of testbit
        mulsd   xmm0, xmm5              ; multiply by one half: drops down to next test bit
        movsd   xmm2, xmm0              ; build test pattern: copy testbit...
        addsd   xmm2, xmm10             ; ...and compute 1+testbit
        ucomisd xmm2, xmm10             ; did the sum round back to exactly 1.0?
        jne     loopstart               ; not yet: testbit still fits in the mantissa, try again
        ret

section .data
one:            dq 1.0                  ; constants (dq: 8-byte doubles)
one_half:       dq 0.5
Scalar Single-precision (float) |
Scalar Double-precision (double) |
Packed Single-precision (4 floats) |
Packed Double-precision (2 doubles) |
Comments |
|
add |
addss |
addsd |
addps |
addpd |
sub, mul, div all work the same way |
min |
minss |
minsd |
minps |
minpd |
max works the same way |
sqrt |
sqrtss |
sqrtsd |
sqrtps |
sqrtpd |
Square root (sqrt), reciprocal (rcp), and reciprocal-square-root (rsqrt) all work the same way |
mov |
movss |
movsd |
movaps (aligned) movups (unaligned) |
movapd (aligned) movupd (unaligned) |
Aligned loads are up to 4x
faster, but will crash if given an unaligned address! The calling convention
keeps the stack 16-byte aligned at every "call" specifically for this instruction. Use "align 16" directive for static data. |
cvt | cvtss2sd cvtss2si cvttss2si |
cvtsd2ss cvtsd2si cvttsd2si |
cvtps2pd cvtps2dq cvttps2dq |
cvtpd2ps cvtpd2dq cvttpd2dq |
Convert to ("2", get it?) Single
Integer (si, stored in register like eax) or four DWORDs (dq, stored in
xmm register; only two DWORDs for the "pd" versions). "cvtt" versions do
truncation (round toward zero); "cvt" versions round to nearest by default. |
com |
ucomiss |
ucomisd |
n/a |
n/a |
Sets CPU flags like normal x86 "cmp" instruction, from SSE registers. |
cmp |
cmpeqss |
cmpeqsd |
cmpeqps |
cmpeqpd |
Compare for equality ("lt",
"le", "neq", "nlt", "nle" versions work the same way). Sets all bits
of float to zero if false (0.0), or all bits to ones if true (a NaN).
Result is used as a bitmask for the bitwise AND and OR operations. |
and |
n/a |
n/a |
andps andnps |
andpd andnpd |
Bitwise AND operation. "andn"
versions are bitwise AND-NOT operations (A=(~A) & B). "or"
version works the same way. |
; Add pi to itself as a float and return the sum converted to an integer.
;
; Syntax: NASM, x86-64, scalar SSE.
; Out:    eax = 2*pi rounded to an integer
; Uses:   xmm3
        movss   xmm3, [rel pi]          ; load up constant; RIP-relative so no absolute address is needed
        addss   xmm3, xmm3              ; add pi to itself
        cvtss2si eax, xmm3              ; round to integer
        ret

section .data
pi:     dd 3.14159265358979             ; constant

; It's annoyingly tricky to display full floating-point values. The trouble
; here is that our function "foo" returns an int to main, so we have to call
; a function to print floating-point values. Also, with SSE floating-point,
; on a 64-bit machine you're supposed to keep the stack aligned to a 16-byte
; boundary (the SSE "movaps" instruction crashes if it's not given a 16-byte
; aligned value). Sadly, the "call" instruction messes up your stack's
; alignment by pushing an 8-byte return address, so we've got to use up
; another 8 bytes of stack space purely for stack alignment, like this.
; Add pi to itself as a float, store the sum in "output", and print it by
; calling the external routine farray_print(pointer to floats, count).
;
; Syntax: NASM, x86-64, scalar SSE.
; Uses:   xmm3, rdi, rsi, plus whatever farray_print itself changes
        extern  farray_print

        movss   xmm3, [rel pi]          ; load up constant; RIP-relative so no absolute address is needed
        addss   xmm3, xmm3              ; add pi to itself
        movss   [rel output], xmm3      ; write register out to memory

; Print floating-point output
        lea     rdi, [rel output]       ; first parameter: pointer to floats
        mov     esi, 1                  ; second parameter: number of floats (32-bit write zero-extends into rsi)
        sub     rsp, 8                  ; keep stack 16-byte aligned (else get crash!):
                                        ; the call that got us here pushed an 8-byte return address
        call    farray_print
        add     rsp, 8                  ; give back the alignment padding before returning
        ret

section .data
pi:     dd 3.14159265358979             ; constant
output: dd 0.0                          ; overwritten at runtime