It is currently Fri May 31, 2024 6:16 am

// Four-component float vector with a batched, SSE3-based normalization helper.
struct vec4
{
    float x, y, z, w;

    // Normalizes four vectors in place and reports their original lengths.
    //
    // The SSE3 horizontal add (haddps) sums adjacent lanes, so three haddps
    // instructions reduce the sixteen squared components of four vectors to
    // four squared lengths without any scalar fadd. Working on four vectors
    // at once keeps every SIMD lane busy.
    //
    // Parameters:
    //   a, b, c, d - vectors to normalize; each is overwritten with
    //                itself divided by its own length. All four components
    //                (x, y, z, w) contribute to the length.
    //   lengths    - receives the original lengths:
    //                x = |a|, y = |b|, z = |c|, w = |d|.
    //
    // Notes:
    //   - Only unaligned loads/stores (movups) touch memory, so the
    //     pointers need no particular alignment.
    //   - All inputs are loaded before any output is stored.
    //   - A zero-length input is divided by zero; its components come out
    //     as NaN (0/0). No guard is applied.
    //   - Pointers are moved through eax, i.e. this is written for a
    //     32-bit x86 target using MSVC-style inline assembly.
    inline static void Normalize4Vectors
        (vec4* a, vec4* b, vec4* c, vec4* d, vec4* lengths)
    {
        __asm {
            // ---- load a and b, square their components ----
            mov     eax, a
            movups  xmm0, [eax]         // xmm0 = a
            movaps  xmm4, xmm0          // xmm4 = a (kept for the divide)
            mulps   xmm0, xmm0          // xmm0 = a * a, per component

            mov     eax, b
            movups  xmm1, [eax]         // xmm1 = b
            movaps  xmm5, xmm1          // xmm5 = b (kept for the divide)
            mulps   xmm1, xmm1          // xmm1 = b * b, per component

            // xmm0 = { a.x^2+a.y^2, a.z^2+a.w^2, b.x^2+b.y^2, b.z^2+b.w^2 }
            haddps  xmm0, xmm1

            // ---- load c and d, square their components ----
            mov     eax, c
            movups  xmm1, [eax]         // xmm1 = c
            movaps  xmm6, xmm1          // xmm6 = c (kept for the divide)
            mulps   xmm1, xmm1          // xmm1 = c * c, per component

            mov     eax, d              // fixed: previously reloaded c here
            movups  xmm2, [eax]         // xmm2 = d
            movaps  xmm7, xmm2          // xmm7 = d (kept for the divide)
            mulps   xmm2, xmm2          // xmm2 = d * d, per component

            // xmm1 = { c.x^2+c.y^2, c.z^2+c.w^2, d.x^2+d.y^2, d.z^2+d.w^2 }
            haddps  xmm1, xmm2

            // xmm0 = { |a|^2, |b|^2, |c|^2, |d|^2 }
            haddps  xmm0, xmm1

            // ---- lengths ----
            sqrtps  xmm1, xmm0          // xmm1 = { |a|, |b|, |c|, |d| }
            mov     eax, lengths        // fixed: was the undeclared name 'result'
            movups  [eax], xmm1         // *lengths = original lengths

            // ---- divide each saved vector by its broadcast length ----
            pshufd  xmm0, xmm1, 0       // xmm0 = |a| in all four lanes
            divps   xmm4, xmm0          // xmm4 = a / |a|
            mov     eax, a
            movups  [eax], xmm4         // *a = normalized a

            pshufd  xmm0, xmm1, 0x55    // xmm0 = |b| in all four lanes
            divps   xmm5, xmm0          // xmm5 = b / |b|
            mov     eax, b
            movups  [eax], xmm5         // *b = normalized b

            pshufd  xmm0, xmm1, 0xAA    // xmm0 = |c| in all four lanes
            divps   xmm6, xmm0          // xmm6 = c / |c|
            mov     eax, c
            movups  [eax], xmm6         // *c = normalized c

            pshufd  xmm0, xmm1, 0xFF    // xmm0 = |d| in all four lanes
            divps   xmm7, xmm0          // xmm7 = d / |d|
            mov     eax, d
            movups  [eax], xmm7         // *d = normalized d
        }
    }
};












zombie@computer wrote:cplusplus.com?
Zipfinator wrote:I have no idea what that block of text is trying to explain... The benchmarks are impressive though, I think...

Terr wrote:Zipfinator wrote:I have no idea what that block of text is trying to explain... The benchmarks are impressive though, I think...
Normalizing a vector... means taking a given arrow in 3D space, and shortening it to a length of 1 without changing the direction it points in. The block of text is a crazy low-level way of doing the algebra.







Users browsing this forum: No registered users