crc64: cache prefetch optimization
Change-Id: I4d4df3881e6fb8365cba605f7f5d010503ce73e9
Signed-off-by: Xiaodong Liu <xiaodong.liu@intel.com>
This commit is contained in:
parent
b933fe8341
commit
f42fc40db3
@ -37,6 +37,8 @@
|
||||
; yasm -f x64 -f elf64 -X gnu -g dwarf2 crc64_ecma_norm_by8
|
||||
%include "reg_sizes.asm"
|
||||
|
||||
%define fetch_dist 1024
|
||||
|
||||
[bits 64]
|
||||
default rel
|
||||
|
||||
@ -132,6 +134,7 @@ _fold_128_B_loop:
|
||||
; update the buffer pointer
|
||||
add arg2, 128 ; buf += 128;
|
||||
|
||||
prefetchnta [arg2+fetch_dist+0]
|
||||
movdqu xmm9, [arg2+16*0]
|
||||
movdqu xmm12, [arg2+16*1]
|
||||
pshufb xmm9, xmm11
|
||||
@ -147,6 +150,7 @@ _fold_128_B_loop:
|
||||
pxor xmm1, xmm12
|
||||
xorps xmm1, xmm13
|
||||
|
||||
prefetchnta [arg2+fetch_dist+32]
|
||||
movdqu xmm9, [arg2+16*2]
|
||||
movdqu xmm12, [arg2+16*3]
|
||||
pshufb xmm9, xmm11
|
||||
@ -162,6 +166,7 @@ _fold_128_B_loop:
|
||||
pxor xmm3, xmm12
|
||||
xorps xmm3, xmm13
|
||||
|
||||
prefetchnta [arg2+fetch_dist+64]
|
||||
movdqu xmm9, [arg2+16*4]
|
||||
movdqu xmm12, [arg2+16*5]
|
||||
pshufb xmm9, xmm11
|
||||
@ -177,6 +182,7 @@ _fold_128_B_loop:
|
||||
pxor xmm5, xmm12
|
||||
xorps xmm5, xmm13
|
||||
|
||||
prefetchnta [arg2+fetch_dist+96]
|
||||
movdqu xmm9, [arg2+16*6]
|
||||
movdqu xmm12, [arg2+16*7]
|
||||
pshufb xmm9, xmm11
|
||||
|
@ -40,6 +40,8 @@
|
||||
; yasm -f x64 -f elf64 -X gnu -g dwarf2 crc64_ecma_refl_by8
|
||||
%include "reg_sizes.asm"
|
||||
|
||||
%define fetch_dist 1024
|
||||
|
||||
[bits 64]
|
||||
default rel
|
||||
|
||||
@ -121,6 +123,7 @@ _fold_128_B_loop:
|
||||
; update the buffer pointer
|
||||
add arg2, 128
|
||||
|
||||
prefetchnta [arg2+fetch_dist+0]
|
||||
movdqu xmm9, [arg2+16*0]
|
||||
movdqu xmm12, [arg2+16*1]
|
||||
movdqa xmm8, xmm0
|
||||
@ -134,6 +137,7 @@ _fold_128_B_loop:
|
||||
pxor xmm1, xmm12
|
||||
xorps xmm1, xmm13
|
||||
|
||||
prefetchnta [arg2+fetch_dist+32]
|
||||
movdqu xmm9, [arg2+16*2]
|
||||
movdqu xmm12, [arg2+16*3]
|
||||
movdqa xmm8, xmm2
|
||||
@ -147,6 +151,7 @@ _fold_128_B_loop:
|
||||
pxor xmm3, xmm12
|
||||
xorps xmm3, xmm13
|
||||
|
||||
prefetchnta [arg2+fetch_dist+64]
|
||||
movdqu xmm9, [arg2+16*4]
|
||||
movdqu xmm12, [arg2+16*5]
|
||||
movdqa xmm8, xmm4
|
||||
@ -160,6 +165,7 @@ _fold_128_B_loop:
|
||||
pxor xmm5, xmm12
|
||||
xorps xmm5, xmm13
|
||||
|
||||
prefetchnta [arg2+fetch_dist+96]
|
||||
movdqu xmm9, [arg2+16*6]
|
||||
movdqu xmm12, [arg2+16*7]
|
||||
movdqa xmm8, xmm6
|
||||
|
@ -36,6 +36,8 @@
|
||||
;
|
||||
%include "reg_sizes.asm"
|
||||
|
||||
%define fetch_dist 1024
|
||||
|
||||
[bits 64]
|
||||
default rel
|
||||
|
||||
@ -131,6 +133,7 @@ _fold_128_B_loop:
|
||||
; update the buffer pointer
|
||||
add arg2, 128 ; buf += 128;
|
||||
|
||||
prefetchnta [arg2+fetch_dist+0]
|
||||
movdqu xmm9, [arg2+16*0]
|
||||
movdqu xmm12, [arg2+16*1]
|
||||
pshufb xmm9, xmm11
|
||||
@ -146,6 +149,7 @@ _fold_128_B_loop:
|
||||
pxor xmm1, xmm12
|
||||
xorps xmm1, xmm13
|
||||
|
||||
prefetchnta [arg2+fetch_dist+32]
|
||||
movdqu xmm9, [arg2+16*2]
|
||||
movdqu xmm12, [arg2+16*3]
|
||||
pshufb xmm9, xmm11
|
||||
@ -161,6 +165,7 @@ _fold_128_B_loop:
|
||||
pxor xmm3, xmm12
|
||||
xorps xmm3, xmm13
|
||||
|
||||
prefetchnta [arg2+fetch_dist+64]
|
||||
movdqu xmm9, [arg2+16*4]
|
||||
movdqu xmm12, [arg2+16*5]
|
||||
pshufb xmm9, xmm11
|
||||
@ -176,6 +181,7 @@ _fold_128_B_loop:
|
||||
pxor xmm5, xmm12
|
||||
xorps xmm5, xmm13
|
||||
|
||||
prefetchnta [arg2+fetch_dist+96]
|
||||
movdqu xmm9, [arg2+16*6]
|
||||
movdqu xmm12, [arg2+16*7]
|
||||
pshufb xmm9, xmm11
|
||||
|
@ -37,6 +37,8 @@
|
||||
;
|
||||
%include "reg_sizes.asm"
|
||||
|
||||
%define fetch_dist 1024
|
||||
|
||||
[bits 64]
|
||||
default rel
|
||||
|
||||
@ -118,6 +120,7 @@ _fold_128_B_loop:
|
||||
; update the buffer pointer
|
||||
add arg2, 128
|
||||
|
||||
prefetchnta [arg2+fetch_dist+0]
|
||||
movdqu xmm9, [arg2+16*0]
|
||||
movdqu xmm12, [arg2+16*1]
|
||||
movdqa xmm8, xmm0
|
||||
@ -131,6 +134,7 @@ _fold_128_B_loop:
|
||||
pxor xmm1, xmm12
|
||||
xorps xmm1, xmm13
|
||||
|
||||
prefetchnta [arg2+fetch_dist+32]
|
||||
movdqu xmm9, [arg2+16*2]
|
||||
movdqu xmm12, [arg2+16*3]
|
||||
movdqa xmm8, xmm2
|
||||
@ -144,6 +148,7 @@ _fold_128_B_loop:
|
||||
pxor xmm3, xmm12
|
||||
xorps xmm3, xmm13
|
||||
|
||||
prefetchnta [arg2+fetch_dist+64]
|
||||
movdqu xmm9, [arg2+16*4]
|
||||
movdqu xmm12, [arg2+16*5]
|
||||
movdqa xmm8, xmm4
|
||||
@ -157,6 +162,7 @@ _fold_128_B_loop:
|
||||
pxor xmm5, xmm12
|
||||
xorps xmm5, xmm13
|
||||
|
||||
prefetchnta [arg2+fetch_dist+96]
|
||||
movdqu xmm9, [arg2+16*6]
|
||||
movdqu xmm12, [arg2+16*7]
|
||||
movdqa xmm8, xmm6
|
||||
|
@ -36,6 +36,8 @@
|
||||
;
|
||||
%include "reg_sizes.asm"
|
||||
|
||||
%define fetch_dist 1024
|
||||
|
||||
[bits 64]
|
||||
default rel
|
||||
|
||||
@ -131,6 +133,7 @@ _fold_128_B_loop:
|
||||
; update the buffer pointer
|
||||
add arg2, 128 ; buf += 128;
|
||||
|
||||
prefetchnta [arg2+fetch_dist+0]
|
||||
movdqu xmm9, [arg2+16*0]
|
||||
movdqu xmm12, [arg2+16*1]
|
||||
pshufb xmm9, xmm11
|
||||
@ -146,6 +149,7 @@ _fold_128_B_loop:
|
||||
pxor xmm1, xmm12
|
||||
xorps xmm1, xmm13
|
||||
|
||||
prefetchnta [arg2+fetch_dist+32]
|
||||
movdqu xmm9, [arg2+16*2]
|
||||
movdqu xmm12, [arg2+16*3]
|
||||
pshufb xmm9, xmm11
|
||||
@ -161,6 +165,7 @@ _fold_128_B_loop:
|
||||
pxor xmm3, xmm12
|
||||
xorps xmm3, xmm13
|
||||
|
||||
prefetchnta [arg2+fetch_dist+64]
|
||||
movdqu xmm9, [arg2+16*4]
|
||||
movdqu xmm12, [arg2+16*5]
|
||||
pshufb xmm9, xmm11
|
||||
@ -176,6 +181,7 @@ _fold_128_B_loop:
|
||||
pxor xmm5, xmm12
|
||||
xorps xmm5, xmm13
|
||||
|
||||
prefetchnta [arg2+fetch_dist+96]
|
||||
movdqu xmm9, [arg2+16*6]
|
||||
movdqu xmm12, [arg2+16*7]
|
||||
pshufb xmm9, xmm11
|
||||
|
@ -37,6 +37,8 @@
|
||||
;
|
||||
%include "reg_sizes.asm"
|
||||
|
||||
%define fetch_dist 1024
|
||||
|
||||
[bits 64]
|
||||
default rel
|
||||
|
||||
@ -118,6 +120,7 @@ _fold_128_B_loop:
|
||||
; update the buffer pointer
|
||||
add arg2, 128
|
||||
|
||||
prefetchnta [arg2+fetch_dist+0]
|
||||
movdqu xmm9, [arg2+16*0]
|
||||
movdqu xmm12, [arg2+16*1]
|
||||
movdqa xmm8, xmm0
|
||||
@ -131,6 +134,7 @@ _fold_128_B_loop:
|
||||
pxor xmm1, xmm12
|
||||
xorps xmm1, xmm13
|
||||
|
||||
prefetchnta [arg2+fetch_dist+32]
|
||||
movdqu xmm9, [arg2+16*2]
|
||||
movdqu xmm12, [arg2+16*3]
|
||||
movdqa xmm8, xmm2
|
||||
@ -144,6 +148,7 @@ _fold_128_B_loop:
|
||||
pxor xmm3, xmm12
|
||||
xorps xmm3, xmm13
|
||||
|
||||
prefetchnta [arg2+fetch_dist+64]
|
||||
movdqu xmm9, [arg2+16*4]
|
||||
movdqu xmm12, [arg2+16*5]
|
||||
movdqa xmm8, xmm4
|
||||
@ -157,6 +162,7 @@ _fold_128_B_loop:
|
||||
pxor xmm5, xmm12
|
||||
xorps xmm5, xmm13
|
||||
|
||||
prefetchnta [arg2+fetch_dist+96]
|
||||
movdqu xmm9, [arg2+16*6]
|
||||
movdqu xmm12, [arg2+16*7]
|
||||
movdqa xmm8, xmm6
|
||||
|
Loading…
Reference in New Issue
Block a user