crc64: cache prefetch optimization
Change-Id: I4d4df3881e6fb8365cba605f7f5d010503ce73e9
Signed-off-by: Xiaodong Liu <xiaodong.liu@intel.com>
This commit is contained in:
parent
b933fe8341
commit
f42fc40db3
@ -37,6 +37,8 @@
|
|||||||
; yasm -f x64 -f elf64 -X gnu -g dwarf2 crc64_ecma_norm_by8
|
; yasm -f x64 -f elf64 -X gnu -g dwarf2 crc64_ecma_norm_by8
|
||||||
%include "reg_sizes.asm"
|
%include "reg_sizes.asm"
|
||||||
|
|
||||||
|
%define fetch_dist 1024
|
||||||
|
|
||||||
[bits 64]
|
[bits 64]
|
||||||
default rel
|
default rel
|
||||||
|
|
||||||
@ -132,6 +134,7 @@ _fold_128_B_loop:
|
|||||||
; update the buffer pointer
|
; update the buffer pointer
|
||||||
add arg2, 128 ; buf += 128;
|
add arg2, 128 ; buf += 128;
|
||||||
|
|
||||||
|
prefetchnta [arg2+fetch_dist+0]
|
||||||
movdqu xmm9, [arg2+16*0]
|
movdqu xmm9, [arg2+16*0]
|
||||||
movdqu xmm12, [arg2+16*1]
|
movdqu xmm12, [arg2+16*1]
|
||||||
pshufb xmm9, xmm11
|
pshufb xmm9, xmm11
|
||||||
@ -147,6 +150,7 @@ _fold_128_B_loop:
|
|||||||
pxor xmm1, xmm12
|
pxor xmm1, xmm12
|
||||||
xorps xmm1, xmm13
|
xorps xmm1, xmm13
|
||||||
|
|
||||||
|
prefetchnta [arg2+fetch_dist+32]
|
||||||
movdqu xmm9, [arg2+16*2]
|
movdqu xmm9, [arg2+16*2]
|
||||||
movdqu xmm12, [arg2+16*3]
|
movdqu xmm12, [arg2+16*3]
|
||||||
pshufb xmm9, xmm11
|
pshufb xmm9, xmm11
|
||||||
@ -162,6 +166,7 @@ _fold_128_B_loop:
|
|||||||
pxor xmm3, xmm12
|
pxor xmm3, xmm12
|
||||||
xorps xmm3, xmm13
|
xorps xmm3, xmm13
|
||||||
|
|
||||||
|
prefetchnta [arg2+fetch_dist+64]
|
||||||
movdqu xmm9, [arg2+16*4]
|
movdqu xmm9, [arg2+16*4]
|
||||||
movdqu xmm12, [arg2+16*5]
|
movdqu xmm12, [arg2+16*5]
|
||||||
pshufb xmm9, xmm11
|
pshufb xmm9, xmm11
|
||||||
@ -177,6 +182,7 @@ _fold_128_B_loop:
|
|||||||
pxor xmm5, xmm12
|
pxor xmm5, xmm12
|
||||||
xorps xmm5, xmm13
|
xorps xmm5, xmm13
|
||||||
|
|
||||||
|
prefetchnta [arg2+fetch_dist+96]
|
||||||
movdqu xmm9, [arg2+16*6]
|
movdqu xmm9, [arg2+16*6]
|
||||||
movdqu xmm12, [arg2+16*7]
|
movdqu xmm12, [arg2+16*7]
|
||||||
pshufb xmm9, xmm11
|
pshufb xmm9, xmm11
|
||||||
|
@ -40,6 +40,8 @@
|
|||||||
; yasm -f x64 -f elf64 -X gnu -g dwarf2 crc64_ecma_refl_by8
|
; yasm -f x64 -f elf64 -X gnu -g dwarf2 crc64_ecma_refl_by8
|
||||||
%include "reg_sizes.asm"
|
%include "reg_sizes.asm"
|
||||||
|
|
||||||
|
%define fetch_dist 1024
|
||||||
|
|
||||||
[bits 64]
|
[bits 64]
|
||||||
default rel
|
default rel
|
||||||
|
|
||||||
@ -121,6 +123,7 @@ _fold_128_B_loop:
|
|||||||
; update the buffer pointer
|
; update the buffer pointer
|
||||||
add arg2, 128
|
add arg2, 128
|
||||||
|
|
||||||
|
prefetchnta [arg2+fetch_dist+0]
|
||||||
movdqu xmm9, [arg2+16*0]
|
movdqu xmm9, [arg2+16*0]
|
||||||
movdqu xmm12, [arg2+16*1]
|
movdqu xmm12, [arg2+16*1]
|
||||||
movdqa xmm8, xmm0
|
movdqa xmm8, xmm0
|
||||||
@ -134,6 +137,7 @@ _fold_128_B_loop:
|
|||||||
pxor xmm1, xmm12
|
pxor xmm1, xmm12
|
||||||
xorps xmm1, xmm13
|
xorps xmm1, xmm13
|
||||||
|
|
||||||
|
prefetchnta [arg2+fetch_dist+32]
|
||||||
movdqu xmm9, [arg2+16*2]
|
movdqu xmm9, [arg2+16*2]
|
||||||
movdqu xmm12, [arg2+16*3]
|
movdqu xmm12, [arg2+16*3]
|
||||||
movdqa xmm8, xmm2
|
movdqa xmm8, xmm2
|
||||||
@ -147,6 +151,7 @@ _fold_128_B_loop:
|
|||||||
pxor xmm3, xmm12
|
pxor xmm3, xmm12
|
||||||
xorps xmm3, xmm13
|
xorps xmm3, xmm13
|
||||||
|
|
||||||
|
prefetchnta [arg2+fetch_dist+64]
|
||||||
movdqu xmm9, [arg2+16*4]
|
movdqu xmm9, [arg2+16*4]
|
||||||
movdqu xmm12, [arg2+16*5]
|
movdqu xmm12, [arg2+16*5]
|
||||||
movdqa xmm8, xmm4
|
movdqa xmm8, xmm4
|
||||||
@ -160,6 +165,7 @@ _fold_128_B_loop:
|
|||||||
pxor xmm5, xmm12
|
pxor xmm5, xmm12
|
||||||
xorps xmm5, xmm13
|
xorps xmm5, xmm13
|
||||||
|
|
||||||
|
prefetchnta [arg2+fetch_dist+96]
|
||||||
movdqu xmm9, [arg2+16*6]
|
movdqu xmm9, [arg2+16*6]
|
||||||
movdqu xmm12, [arg2+16*7]
|
movdqu xmm12, [arg2+16*7]
|
||||||
movdqa xmm8, xmm6
|
movdqa xmm8, xmm6
|
||||||
|
@ -36,6 +36,8 @@
|
|||||||
;
|
;
|
||||||
%include "reg_sizes.asm"
|
%include "reg_sizes.asm"
|
||||||
|
|
||||||
|
%define fetch_dist 1024
|
||||||
|
|
||||||
[bits 64]
|
[bits 64]
|
||||||
default rel
|
default rel
|
||||||
|
|
||||||
@ -131,6 +133,7 @@ _fold_128_B_loop:
|
|||||||
; update the buffer pointer
|
; update the buffer pointer
|
||||||
add arg2, 128 ; buf += 128;
|
add arg2, 128 ; buf += 128;
|
||||||
|
|
||||||
|
prefetchnta [arg2+fetch_dist+0]
|
||||||
movdqu xmm9, [arg2+16*0]
|
movdqu xmm9, [arg2+16*0]
|
||||||
movdqu xmm12, [arg2+16*1]
|
movdqu xmm12, [arg2+16*1]
|
||||||
pshufb xmm9, xmm11
|
pshufb xmm9, xmm11
|
||||||
@ -146,6 +149,7 @@ _fold_128_B_loop:
|
|||||||
pxor xmm1, xmm12
|
pxor xmm1, xmm12
|
||||||
xorps xmm1, xmm13
|
xorps xmm1, xmm13
|
||||||
|
|
||||||
|
prefetchnta [arg2+fetch_dist+32]
|
||||||
movdqu xmm9, [arg2+16*2]
|
movdqu xmm9, [arg2+16*2]
|
||||||
movdqu xmm12, [arg2+16*3]
|
movdqu xmm12, [arg2+16*3]
|
||||||
pshufb xmm9, xmm11
|
pshufb xmm9, xmm11
|
||||||
@ -161,6 +165,7 @@ _fold_128_B_loop:
|
|||||||
pxor xmm3, xmm12
|
pxor xmm3, xmm12
|
||||||
xorps xmm3, xmm13
|
xorps xmm3, xmm13
|
||||||
|
|
||||||
|
prefetchnta [arg2+fetch_dist+64]
|
||||||
movdqu xmm9, [arg2+16*4]
|
movdqu xmm9, [arg2+16*4]
|
||||||
movdqu xmm12, [arg2+16*5]
|
movdqu xmm12, [arg2+16*5]
|
||||||
pshufb xmm9, xmm11
|
pshufb xmm9, xmm11
|
||||||
@ -176,6 +181,7 @@ _fold_128_B_loop:
|
|||||||
pxor xmm5, xmm12
|
pxor xmm5, xmm12
|
||||||
xorps xmm5, xmm13
|
xorps xmm5, xmm13
|
||||||
|
|
||||||
|
prefetchnta [arg2+fetch_dist+96]
|
||||||
movdqu xmm9, [arg2+16*6]
|
movdqu xmm9, [arg2+16*6]
|
||||||
movdqu xmm12, [arg2+16*7]
|
movdqu xmm12, [arg2+16*7]
|
||||||
pshufb xmm9, xmm11
|
pshufb xmm9, xmm11
|
||||||
|
@ -37,6 +37,8 @@
|
|||||||
;
|
;
|
||||||
%include "reg_sizes.asm"
|
%include "reg_sizes.asm"
|
||||||
|
|
||||||
|
%define fetch_dist 1024
|
||||||
|
|
||||||
[bits 64]
|
[bits 64]
|
||||||
default rel
|
default rel
|
||||||
|
|
||||||
@ -118,6 +120,7 @@ _fold_128_B_loop:
|
|||||||
; update the buffer pointer
|
; update the buffer pointer
|
||||||
add arg2, 128
|
add arg2, 128
|
||||||
|
|
||||||
|
prefetchnta [arg2+fetch_dist+0]
|
||||||
movdqu xmm9, [arg2+16*0]
|
movdqu xmm9, [arg2+16*0]
|
||||||
movdqu xmm12, [arg2+16*1]
|
movdqu xmm12, [arg2+16*1]
|
||||||
movdqa xmm8, xmm0
|
movdqa xmm8, xmm0
|
||||||
@ -131,6 +134,7 @@ _fold_128_B_loop:
|
|||||||
pxor xmm1, xmm12
|
pxor xmm1, xmm12
|
||||||
xorps xmm1, xmm13
|
xorps xmm1, xmm13
|
||||||
|
|
||||||
|
prefetchnta [arg2+fetch_dist+32]
|
||||||
movdqu xmm9, [arg2+16*2]
|
movdqu xmm9, [arg2+16*2]
|
||||||
movdqu xmm12, [arg2+16*3]
|
movdqu xmm12, [arg2+16*3]
|
||||||
movdqa xmm8, xmm2
|
movdqa xmm8, xmm2
|
||||||
@ -144,6 +148,7 @@ _fold_128_B_loop:
|
|||||||
pxor xmm3, xmm12
|
pxor xmm3, xmm12
|
||||||
xorps xmm3, xmm13
|
xorps xmm3, xmm13
|
||||||
|
|
||||||
|
prefetchnta [arg2+fetch_dist+64]
|
||||||
movdqu xmm9, [arg2+16*4]
|
movdqu xmm9, [arg2+16*4]
|
||||||
movdqu xmm12, [arg2+16*5]
|
movdqu xmm12, [arg2+16*5]
|
||||||
movdqa xmm8, xmm4
|
movdqa xmm8, xmm4
|
||||||
@ -157,6 +162,7 @@ _fold_128_B_loop:
|
|||||||
pxor xmm5, xmm12
|
pxor xmm5, xmm12
|
||||||
xorps xmm5, xmm13
|
xorps xmm5, xmm13
|
||||||
|
|
||||||
|
prefetchnta [arg2+fetch_dist+96]
|
||||||
movdqu xmm9, [arg2+16*6]
|
movdqu xmm9, [arg2+16*6]
|
||||||
movdqu xmm12, [arg2+16*7]
|
movdqu xmm12, [arg2+16*7]
|
||||||
movdqa xmm8, xmm6
|
movdqa xmm8, xmm6
|
||||||
|
@ -36,6 +36,8 @@
|
|||||||
;
|
;
|
||||||
%include "reg_sizes.asm"
|
%include "reg_sizes.asm"
|
||||||
|
|
||||||
|
%define fetch_dist 1024
|
||||||
|
|
||||||
[bits 64]
|
[bits 64]
|
||||||
default rel
|
default rel
|
||||||
|
|
||||||
@ -131,6 +133,7 @@ _fold_128_B_loop:
|
|||||||
; update the buffer pointer
|
; update the buffer pointer
|
||||||
add arg2, 128 ; buf += 128;
|
add arg2, 128 ; buf += 128;
|
||||||
|
|
||||||
|
prefetchnta [arg2+fetch_dist+0]
|
||||||
movdqu xmm9, [arg2+16*0]
|
movdqu xmm9, [arg2+16*0]
|
||||||
movdqu xmm12, [arg2+16*1]
|
movdqu xmm12, [arg2+16*1]
|
||||||
pshufb xmm9, xmm11
|
pshufb xmm9, xmm11
|
||||||
@ -146,6 +149,7 @@ _fold_128_B_loop:
|
|||||||
pxor xmm1, xmm12
|
pxor xmm1, xmm12
|
||||||
xorps xmm1, xmm13
|
xorps xmm1, xmm13
|
||||||
|
|
||||||
|
prefetchnta [arg2+fetch_dist+32]
|
||||||
movdqu xmm9, [arg2+16*2]
|
movdqu xmm9, [arg2+16*2]
|
||||||
movdqu xmm12, [arg2+16*3]
|
movdqu xmm12, [arg2+16*3]
|
||||||
pshufb xmm9, xmm11
|
pshufb xmm9, xmm11
|
||||||
@ -161,6 +165,7 @@ _fold_128_B_loop:
|
|||||||
pxor xmm3, xmm12
|
pxor xmm3, xmm12
|
||||||
xorps xmm3, xmm13
|
xorps xmm3, xmm13
|
||||||
|
|
||||||
|
prefetchnta [arg2+fetch_dist+64]
|
||||||
movdqu xmm9, [arg2+16*4]
|
movdqu xmm9, [arg2+16*4]
|
||||||
movdqu xmm12, [arg2+16*5]
|
movdqu xmm12, [arg2+16*5]
|
||||||
pshufb xmm9, xmm11
|
pshufb xmm9, xmm11
|
||||||
@ -176,6 +181,7 @@ _fold_128_B_loop:
|
|||||||
pxor xmm5, xmm12
|
pxor xmm5, xmm12
|
||||||
xorps xmm5, xmm13
|
xorps xmm5, xmm13
|
||||||
|
|
||||||
|
prefetchnta [arg2+fetch_dist+96]
|
||||||
movdqu xmm9, [arg2+16*6]
|
movdqu xmm9, [arg2+16*6]
|
||||||
movdqu xmm12, [arg2+16*7]
|
movdqu xmm12, [arg2+16*7]
|
||||||
pshufb xmm9, xmm11
|
pshufb xmm9, xmm11
|
||||||
|
@ -37,6 +37,8 @@
|
|||||||
;
|
;
|
||||||
%include "reg_sizes.asm"
|
%include "reg_sizes.asm"
|
||||||
|
|
||||||
|
%define fetch_dist 1024
|
||||||
|
|
||||||
[bits 64]
|
[bits 64]
|
||||||
default rel
|
default rel
|
||||||
|
|
||||||
@ -118,6 +120,7 @@ _fold_128_B_loop:
|
|||||||
; update the buffer pointer
|
; update the buffer pointer
|
||||||
add arg2, 128
|
add arg2, 128
|
||||||
|
|
||||||
|
prefetchnta [arg2+fetch_dist+0]
|
||||||
movdqu xmm9, [arg2+16*0]
|
movdqu xmm9, [arg2+16*0]
|
||||||
movdqu xmm12, [arg2+16*1]
|
movdqu xmm12, [arg2+16*1]
|
||||||
movdqa xmm8, xmm0
|
movdqa xmm8, xmm0
|
||||||
@ -131,6 +134,7 @@ _fold_128_B_loop:
|
|||||||
pxor xmm1, xmm12
|
pxor xmm1, xmm12
|
||||||
xorps xmm1, xmm13
|
xorps xmm1, xmm13
|
||||||
|
|
||||||
|
prefetchnta [arg2+fetch_dist+32]
|
||||||
movdqu xmm9, [arg2+16*2]
|
movdqu xmm9, [arg2+16*2]
|
||||||
movdqu xmm12, [arg2+16*3]
|
movdqu xmm12, [arg2+16*3]
|
||||||
movdqa xmm8, xmm2
|
movdqa xmm8, xmm2
|
||||||
@ -144,6 +148,7 @@ _fold_128_B_loop:
|
|||||||
pxor xmm3, xmm12
|
pxor xmm3, xmm12
|
||||||
xorps xmm3, xmm13
|
xorps xmm3, xmm13
|
||||||
|
|
||||||
|
prefetchnta [arg2+fetch_dist+64]
|
||||||
movdqu xmm9, [arg2+16*4]
|
movdqu xmm9, [arg2+16*4]
|
||||||
movdqu xmm12, [arg2+16*5]
|
movdqu xmm12, [arg2+16*5]
|
||||||
movdqa xmm8, xmm4
|
movdqa xmm8, xmm4
|
||||||
@ -157,6 +162,7 @@ _fold_128_B_loop:
|
|||||||
pxor xmm5, xmm12
|
pxor xmm5, xmm12
|
||||||
xorps xmm5, xmm13
|
xorps xmm5, xmm13
|
||||||
|
|
||||||
|
prefetchnta [arg2+fetch_dist+96]
|
||||||
movdqu xmm9, [arg2+16*6]
|
movdqu xmm9, [arg2+16*6]
|
||||||
movdqu xmm12, [arg2+16*7]
|
movdqu xmm12, [arg2+16*7]
|
||||||
movdqa xmm8, xmm6
|
movdqa xmm8, xmm6
|
||||||
|
Loading…
Reference in New Issue
Block a user