IA-32 Intel® Architecture Optimization
Example 5-9
Horizontal Add Using movhlps/movlhps (continued)
// START HORIZONTAL ADD
// Reduces four 4-float vectors to one vector holding the sum of each.
// Lane contents in the comments below are listed low-to-high.
// Per the listing's own annotations, xmm0..xmm3 hold the vectors A, B, C, D
// on entry; they are loaded before this excerpt -- confirm against the first
// half of Example 5-9.
// Clobbers xmm1-xmm6; xmm0 is left unchanged.
movaps  xmm5, xmm0          // xmm5= A1,A2,A3,A4
movlhps xmm5, xmm1          // xmm5= A1,A2,B1,B2
movhlps xmm1, xmm0          // xmm1= A3,A4,B3,B4
addps   xmm5, xmm1          // xmm5= A1+A3,A2+A4,B1+B3,B2+B4
movaps  xmm4, xmm2          // xmm4= C1,C2,C3,C4 (saved: next line overwrites xmm2's high half)
movlhps xmm2, xmm3          // xmm2= C1,C2,D1,D2
movhlps xmm3, xmm4          // xmm3= C3,C4,D3,D4
addps   xmm3, xmm2          // xmm3= C1+C3,C2+C4,D1+D3,D2+D4
// Gather the even-lane partial sums into xmm6 and the odd-lane partial sums
// into xmm5, with the A/B pairs as the shufps destination so the result
// lanes come out in A,B,C,D order. (The printed sequence shuffled xmm3 into
// a value that was never read and then added an unshuffled copy, which does
// not yield the per-vector sums its comments describe.)
movaps  xmm6, xmm5          // xmm6= A1+A3,A2+A4,B1+B3,B2+B4
shufps  xmm6, xmm3, 0x88    // xmm6= A1+A3,B1+B3,C1+C3,D1+D3
shufps  xmm5, xmm3, 0xDD    // xmm5= A2+A4,B2+B4,C2+C4,D2+D4
addps   xmm6, xmm5          // xmm6= sum(A),sum(B),sum(C),sum(D)  (i.e. D,C,B,A read high-to-low)
// END HORIZONTAL ADD
movaps  [edx], xmm6         // store the four sums; movaps requires [edx] to be 16-byte aligned
}
}
5-20
Need help?
Do you have a question about the IA-32 architecture that is not answered in the manual?