AMD Athlon Processor x86 Optimization Manual page 64

X86 code optimization
Table of Contents

Advertisement

AMD Athlon™ Processor x86 Code Optimization
48
;-----------------------------------------------------------------------
; a[i] = b[i] * c[i] on QWORD (double-precision) elements, unrolled 8x,
; with 3DNow! PREFETCH/PREFETCHW issued once per iteration per array.
;
; Symbols defined outside this listing (not visible here):
;   LARGE_NUM, ARR_SIZE, array_a, array_b, array_c
; NOTE(review): the addressing [base+ECX*8+ARR_SIZE] with ECX starting at
;   -LARGE_NUM reaches element 0 only if ARR_SIZE = 8*LARGE_NUM - confirm
;   against the definitions.
; NOTE(review): ECX advances by 8 and the loop exits on ECX == 0, so
;   LARGE_NUM must be a non-zero multiple of 8 for the loop to terminate
;   at the end of the arrays.
;
; Registers:
;   ECX = biased element index, runs from -LARGE_NUM up to 0
;   EAX = address of array_a (destination)
;   EDX = address of array_b
;   EBX = address of array_c
; The array_c base must not share ECX with the index: loading it into ECX
; would destroy the loop counter and make [ECX+ECX*8+...] meaningless.
; NOTE(review): EBX is callee-saved in the usual 32-bit calling
;   conventions; save/restore it if this fragment is placed in a procedure.
;-----------------------------------------------------------------------
        MOV     ECX, (-LARGE_NUM)                       ;used biased index
        MOV     EAX, OFFSET array_a                     ;get address of array_a
        MOV     EDX, OFFSET array_b                     ;get address of array_b
        MOV     EBX, OFFSET array_c                     ;get address of array_c

$loop:
; Prefetch addresses include the running index so that they move forward
; with the data; a bare [base+196] would hit the same lines every pass.
; NOTE(review): the source describes +196 as "two cachelines ahead";
;   196 is not a multiple of a cache-line size - confirm intended distance.
        PREFETCHW [EAX+ECX*8+ARR_SIZE+196]              ;a[]: ahead of a[i], will be written
        PREFETCH  [EDX+ECX*8+ARR_SIZE+196]              ;b[]: ahead of b[i]
        PREFETCH  [EBX+ECX*8+ARR_SIZE+196]              ;c[]: ahead of c[i]

        FLD     QWORD PTR [EDX+ECX*8+ARR_SIZE]          ;b[i]
        FMUL    QWORD PTR [EBX+ECX*8+ARR_SIZE]          ;b[i]*c[i]
        FSTP    QWORD PTR [EAX+ECX*8+ARR_SIZE]          ;a[i] = b[i]*c[i]

        FLD     QWORD PTR [EDX+ECX*8+ARR_SIZE+8]        ;b[i+1]
        FMUL    QWORD PTR [EBX+ECX*8+ARR_SIZE+8]        ;b[i+1]*c[i+1]
        FSTP    QWORD PTR [EAX+ECX*8+ARR_SIZE+8]        ;a[i+1] = b[i+1]*c[i+1]

        FLD     QWORD PTR [EDX+ECX*8+ARR_SIZE+16]       ;b[i+2]
        FMUL    QWORD PTR [EBX+ECX*8+ARR_SIZE+16]       ;b[i+2]*c[i+2]
        FSTP    QWORD PTR [EAX+ECX*8+ARR_SIZE+16]       ;a[i+2] = b[i+2]*c[i+2]

        FLD     QWORD PTR [EDX+ECX*8+ARR_SIZE+24]       ;b[i+3]
        FMUL    QWORD PTR [EBX+ECX*8+ARR_SIZE+24]       ;b[i+3]*c[i+3]
        FSTP    QWORD PTR [EAX+ECX*8+ARR_SIZE+24]       ;a[i+3] = b[i+3]*c[i+3]

        FLD     QWORD PTR [EDX+ECX*8+ARR_SIZE+32]       ;b[i+4]
        FMUL    QWORD PTR [EBX+ECX*8+ARR_SIZE+32]       ;b[i+4]*c[i+4]
        FSTP    QWORD PTR [EAX+ECX*8+ARR_SIZE+32]       ;a[i+4] = b[i+4]*c[i+4]

        FLD     QWORD PTR [EDX+ECX*8+ARR_SIZE+40]       ;b[i+5]
        FMUL    QWORD PTR [EBX+ECX*8+ARR_SIZE+40]       ;b[i+5]*c[i+5]
        FSTP    QWORD PTR [EAX+ECX*8+ARR_SIZE+40]       ;a[i+5] = b[i+5]*c[i+5]

        FLD     QWORD PTR [EDX+ECX*8+ARR_SIZE+48]       ;b[i+6]
        FMUL    QWORD PTR [EBX+ECX*8+ARR_SIZE+48]       ;b[i+6]*c[i+6]
        FSTP    QWORD PTR [EAX+ECX*8+ARR_SIZE+48]       ;a[i+6] = b[i+6]*c[i+6]

        FLD     QWORD PTR [EDX+ECX*8+ARR_SIZE+56]       ;b[i+7]
        FMUL    QWORD PTR [EBX+ECX*8+ARR_SIZE+56]       ;b[i+7]*c[i+7]
        FSTP    QWORD PTR [EAX+ECX*8+ARR_SIZE+56]       ;a[i+7] = b[i+7]*c[i+7]

        ADD     ECX, 8                                  ;next 8 products
        JNZ     $loop                                   ;until none left
        END
Use the 3DNow!™ PREFETCH and PREFETCHW Instructions
22007E/0—November 1999

Advertisement

Table of Contents
loading

Table of Contents