; Print the single float stored at "val" by calling the external farray_print.
push rax ; <- gotta align stack before calling farray_print
mov rdi,val ; first argument: address of the float
mov rsi,1 ; number of floats to print
extern farray_print
call farray_print
pop rax ; undo the alignment push
ret
section .data
val: dd 1.234 ; the 32-bit float that gets printed
; Here's how we'd modify that float using SSE instructions:
; Double the float stored at "val" with scalar SSE instructions, then print it.
        extern  farray_print            ; printing helper defined outside this snippet

        movss   xmm0, DWORD [val]       ; xmm0 = val
        addss   xmm0, xmm0              ; xmm0 = val + val
        movss   DWORD [val], xmm0       ; write the sum back to memory

        push    rax                     ; align the stack ahead of the call
        mov     rdi, val                ; address of the float
        mov     rsi, 1                  ; print one float
        call    farray_print
        pop     rax                     ; drop the alignment slot
        ret

section .data
val:    dd      1.234
Here's how we'd overwrite that float with the constant "pi" using the pre-SSE ancient "x87" floating point instructions:
; Overwrite the float stored at "val" with pi using the x87 FPU, then print it.
        extern  farray_print            ; printing helper defined outside this snippet

        fldpi                           ; push pi onto the x87 register stack
        fstp    DWORD [val]             ; store it over val as a 32-bit float, and pop

        push    rax                     ; align the stack ahead of the call
        mov     rdi, val                ; address of the float
        mov     rsi, 1                  ; print one float
        call    farray_print
        pop     rax                     ; drop the alignment slot
        ret

section .data
val:    dd      1.234
Here's how we'd load the float, and add it to itself:
; Add the float stored at "val" to itself on the x87 stack, then print it.
        extern  farray_print            ; printing helper defined outside this snippet

        fld     DWORD [val]             ; push the first copy of val
        fld     DWORD [val]             ; push the second copy of val
        faddp                           ; add the two top entries, popping one
        fstp    DWORD [val]             ; store the sum back to val, and pop

        push    rax                     ; align the stack ahead of the call
        mov     rdi, val                ; address of the float
        mov     rsi, 1                  ; print one float
        call    farray_print
        pop     rax                     ; drop the alignment slot
        ret

section .data
val:    dd      1.234
Obvious question...
fld DWORD[val] ; load one copy from memory
fld st0 ; duplicate the top of the x87 stack instead of loading again
faddp ; add the top copies (and pop)
fstp DWORD[val] ; store and pop
; Unlike faddp's popping the top two values, just "fadd" can manipulate existing values on the stack. For example, I can add the top value to itself like this:
fld DWORD[val] ; load val onto the x87 stack
fadd st0,st0 ; st0 = st0 + st0, nothing is popped
fstp DWORD[val] ; store and pop
; You can dig deeper in the stack, here ignoring st1 and adding st0 and st2.
fldpi ; push pi
fld1 ; push 1.0
fld DWORD[val] ; push val
fadd st0,st2 ; st0: val, st1: 1, st2: pi
fstp DWORD[val] ; store val+pi and pop; 1 and pi are still on the x87 stack
But you can't add st1 and st2: "fadd st1,st2" gives
error: invalid combination of opcode and operands
Why not? They didn't have enough bits to specify an arbitrary
source and arbitrary destination, so one value needs to be on the top
of the stack for every operation. This is easy for simple
functions, but ends up taking lots of loads and stores for
realistically long functions. This shuffling is one big reason we don't use the old stack-based floating point anymore.
Curiously, saving bits in the instructions is very important for
bytecode languages, like Java or .NET, so they typically use a
stack-based arithmetic model instead of registers. The
Just-In-Time compiler usually can pick registers when generating
machine code though, so this doesn't cost much speed.
; Convert pi to an integer with the x87 FPU, temporarily switching the
; control word so that fistp uses the new rounding mode; result is in eax.
fldpi ; value to convert
fnstcw WORD [oldcw] ; store old rounding mode
mov ax,WORD [oldcw]
mov ah,0xc ; overwrite high bits (rounding mode); 0xc sets both rounding-control bits (truncate)
mov WORD [newcw],ax ; store to memory
fldcw WORD [newcw] ; change rounding mode
fistp DWORD [value] ; do conversion to memory
fldcw WORD [oldcw] ; change back to original mode
mov eax,[value] ; load from memory
ret
section .data
oldcw: dw 0 ; storage for old control word
newcw: dw 0 ; storage for new control word
value: dd 0 ; place to put new value
; Ouch! Even on a new machine, this takes over 13 nanoseconds, while an SSE "cvttss2si" only takes 3ns:
; Convert the float stored at "value" to an integer in eax using SSE.
        movss   xmm0, [value]           ; xmm0 = float to convert
        cvttss2si eax, xmm0             ; eax = integer part of xmm0 (truncates toward zero)
        ret

section .data
value:  dd      3.14159
Recall from our float bits lecture that floats are stored using 32 perfectly ordinary bits:
| Sign | Exponent | Fraction (or "Mantissa") |
|------|----------|--------------------------|
| 1 bit -- 0 for positive, 1 for negative | 8 unsigned bits -- 127 means 2^0, 137 means 2^10 | 23 bits -- a binary fraction. Don't forget the implicit leading 1! |
union unholy_t { /* a union between a float and an integer */
public:
	float f; /* the value viewed as a float */
	int i;   /* the same storage viewed as an integer */
};

/* Return the raw bits of the float 3.0, read back through the union's int member. */
int foo(void) {
	unholy_t unholy;
	unholy.f=3.0; /* put in a float */
	return unholy.i; /* take out an integer */
}
/* For example, we can use integer bitwise operations to zero out the float's sign bit, making a quite cheap floating-point absolute value operation: */
float val=-3.1415;Back before SSE, floating point to integer conversion in C++ was really really slow. As we list above, the problem is that the same x86 FPU control word bits affect rounding both for float operations like addition and for float-to-int conversion. Thus you've got to save the old control word out to memory, switch its rounding mode to integer, load the new control word, do the integer conversion, and finally load the original control word to resume normal operation.
int foo(void) {
unholy_t unholy;
unholy.f=val; /* put in a negative float */
unholy.i=unholy.i&0x7fFFffFF; /* mask off the float's sign bit */
return unholy.f; /* now the float is positive! */
}
union unholy_t { /* a union between a float and an integer */
public:
	float f; /* the value viewed as a float */
	int i;   /* the same storage viewed as an integer */
};
float val=+3.1415;

/* Convert val to an integer without a float-to-int conversion instruction:
   adding 2^23 leaves no room for fraction bits in the float, so the integer
   part of val ends up in the low bits, which are then masked out. */
int foo(void) {
	unholy_t unholy;
	unholy.f=val+(1<<23); /* scrape off the fraction bits with the weird constant */
	return unholy.i&0x7FffFF; /* mask off the float's sign and exponent bits */
}
/* This "fast float-to-integer trick" has been independently discovered by many smart people, including: */