crc64: cache prefetch optimization

Change-Id: I4d4df3881e6fb8365cba605f7f5d010503ce73e9
Signed-off-by: Xiaodong Liu <xiaodong.liu@intel.com>
Author:    Xiaodong Liu, 2016-11-07 22:15:57 -05:00
Committer: Greg Tucker
Parent:    b933fe8341
Commit:    f42fc40db3

6 changed files with 36 additions and 0 deletions
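The change is the same in all six crc64 *_by8 assembly routines: define a 1024-byte lookahead constant, fetch_dist, and issue prefetchnta hints inside _fold_128_B_loop ahead of each pair of 16-byte loads, so the next stretch of the buffer is already on its way to the cache when the folding reaches it. Below is a minimal C sketch of that pattern for orientation only; it is not the isa-l code, and crc_stream, fold_128_bytes, and the acc accumulator are hypothetical stand-ins for the PCLMULQDQ folding done in xmm registers.

#include <stddef.h>
#include <stdint.h>
#include <immintrin.h>

#define FETCH_DIST 1024            /* same lookahead as the asm's fetch_dist */

static uint64_t acc;               /* stand-in for the xmm folding state */

/* Placeholder for one 128-byte folding step (PCLMULQDQ work in the real code). */
static void fold_128_bytes(const uint8_t *block)
{
    for (int i = 0; i < 128; i++)
        acc ^= (uint64_t)block[i] << (i & 7);
}

void crc_stream(const uint8_t *buf, size_t len)
{
    /* Hot loop: consume 128 bytes per iteration and hint the lines that will
     * be needed FETCH_DIST bytes from now.  _MM_HINT_NTA mirrors prefetchnta:
     * the buffer is streamed once, so ask for minimal cache pollution.  The
     * asm interleaves the four hints with the four pairs of 16-byte loads;
     * grouping them up front keeps this sketch short. */
    while (len >= 128 + FETCH_DIST) {
        _mm_prefetch((const char *)(buf + FETCH_DIST +  0), _MM_HINT_NTA);
        _mm_prefetch((const char *)(buf + FETCH_DIST + 32), _MM_HINT_NTA);
        _mm_prefetch((const char *)(buf + FETCH_DIST + 64), _MM_HINT_NTA);
        _mm_prefetch((const char *)(buf + FETCH_DIST + 96), _MM_HINT_NTA);
        fold_128_bytes(buf);
        buf += 128;
        len -= 128;
    }
    /* Tail: the last FETCH_DIST bytes (and any remainder) get no hints. */
    while (len >= 128) {
        fold_128_bytes(buf);
        buf += 128;
        len -= 128;
    }
}

With 128 bytes folded per iteration, a 1024-byte fetch_dist issues each hint roughly eight iterations before the corresponding loads, giving the prefetch time to complete.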

(diff 1 of 6 changed files)

@@ -37,6 +37,8 @@
 ; yasm -f x64 -f elf64 -X gnu -g dwarf2 crc64_ecma_norm_by8
 %include "reg_sizes.asm"
+%define fetch_dist 1024
 [bits 64]
 default rel
@@ -132,6 +134,7 @@ _fold_128_B_loop:
 ; update the buffer pointer
 add arg2, 128 ; buf += 128;
+prefetchnta [arg2+fetch_dist+0]
 movdqu xmm9, [arg2+16*0]
 movdqu xmm12, [arg2+16*1]
 pshufb xmm9, xmm11
@@ -147,6 +150,7 @@ _fold_128_B_loop:
 pxor xmm1, xmm12
 xorps xmm1, xmm13
+prefetchnta [arg2+fetch_dist+32]
 movdqu xmm9, [arg2+16*2]
 movdqu xmm12, [arg2+16*3]
 pshufb xmm9, xmm11
@@ -162,6 +166,7 @@ _fold_128_B_loop:
 pxor xmm3, xmm12
 xorps xmm3, xmm13
+prefetchnta [arg2+fetch_dist+64]
 movdqu xmm9, [arg2+16*4]
 movdqu xmm12, [arg2+16*5]
 pshufb xmm9, xmm11
@@ -177,6 +182,7 @@ _fold_128_B_loop:
 pxor xmm5, xmm12
 xorps xmm5, xmm13
+prefetchnta [arg2+fetch_dist+96]
 movdqu xmm9, [arg2+16*6]
 movdqu xmm12, [arg2+16*7]
 pshufb xmm9, xmm11

(diff 2 of 6 changed files)

@@ -40,6 +40,8 @@
 ; yasm -f x64 -f elf64 -X gnu -g dwarf2 crc64_ecma_refl_by8
 %include "reg_sizes.asm"
+%define fetch_dist 1024
 [bits 64]
 default rel
@@ -121,6 +123,7 @@ _fold_128_B_loop:
 ; update the buffer pointer
 add arg2, 128
+prefetchnta [arg2+fetch_dist+0]
 movdqu xmm9, [arg2+16*0]
 movdqu xmm12, [arg2+16*1]
 movdqa xmm8, xmm0
@@ -134,6 +137,7 @@ _fold_128_B_loop:
 pxor xmm1, xmm12
 xorps xmm1, xmm13
+prefetchnta [arg2+fetch_dist+32]
 movdqu xmm9, [arg2+16*2]
 movdqu xmm12, [arg2+16*3]
 movdqa xmm8, xmm2
@@ -147,6 +151,7 @@ _fold_128_B_loop:
 pxor xmm3, xmm12
 xorps xmm3, xmm13
+prefetchnta [arg2+fetch_dist+64]
 movdqu xmm9, [arg2+16*4]
 movdqu xmm12, [arg2+16*5]
 movdqa xmm8, xmm4
@@ -160,6 +165,7 @@ _fold_128_B_loop:
 pxor xmm5, xmm12
 xorps xmm5, xmm13
+prefetchnta [arg2+fetch_dist+96]
 movdqu xmm9, [arg2+16*6]
 movdqu xmm12, [arg2+16*7]
 movdqa xmm8, xmm6

(diff 3 of 6 changed files)

@@ -36,6 +36,8 @@
 ;
 %include "reg_sizes.asm"
+%define fetch_dist 1024
 [bits 64]
 default rel
@@ -131,6 +133,7 @@ _fold_128_B_loop:
 ; update the buffer pointer
 add arg2, 128 ; buf += 128;
+prefetchnta [arg2+fetch_dist+0]
 movdqu xmm9, [arg2+16*0]
 movdqu xmm12, [arg2+16*1]
 pshufb xmm9, xmm11
@@ -146,6 +149,7 @@ _fold_128_B_loop:
 pxor xmm1, xmm12
 xorps xmm1, xmm13
+prefetchnta [arg2+fetch_dist+32]
 movdqu xmm9, [arg2+16*2]
 movdqu xmm12, [arg2+16*3]
 pshufb xmm9, xmm11
@@ -161,6 +165,7 @@ _fold_128_B_loop:
 pxor xmm3, xmm12
 xorps xmm3, xmm13
+prefetchnta [arg2+fetch_dist+64]
 movdqu xmm9, [arg2+16*4]
 movdqu xmm12, [arg2+16*5]
 pshufb xmm9, xmm11
@@ -176,6 +181,7 @@ _fold_128_B_loop:
 pxor xmm5, xmm12
 xorps xmm5, xmm13
+prefetchnta [arg2+fetch_dist+96]
 movdqu xmm9, [arg2+16*6]
 movdqu xmm12, [arg2+16*7]
 pshufb xmm9, xmm11

(diff 4 of 6 changed files)

@@ -37,6 +37,8 @@
 ;
 %include "reg_sizes.asm"
+%define fetch_dist 1024
 [bits 64]
 default rel
@@ -118,6 +120,7 @@ _fold_128_B_loop:
 ; update the buffer pointer
 add arg2, 128
+prefetchnta [arg2+fetch_dist+0]
 movdqu xmm9, [arg2+16*0]
 movdqu xmm12, [arg2+16*1]
 movdqa xmm8, xmm0
@@ -131,6 +134,7 @@ _fold_128_B_loop:
 pxor xmm1, xmm12
 xorps xmm1, xmm13
+prefetchnta [arg2+fetch_dist+32]
 movdqu xmm9, [arg2+16*2]
 movdqu xmm12, [arg2+16*3]
 movdqa xmm8, xmm2
@@ -144,6 +148,7 @@ _fold_128_B_loop:
 pxor xmm3, xmm12
 xorps xmm3, xmm13
+prefetchnta [arg2+fetch_dist+64]
 movdqu xmm9, [arg2+16*4]
 movdqu xmm12, [arg2+16*5]
 movdqa xmm8, xmm4
@@ -157,6 +162,7 @@ _fold_128_B_loop:
 pxor xmm5, xmm12
 xorps xmm5, xmm13
+prefetchnta [arg2+fetch_dist+96]
 movdqu xmm9, [arg2+16*6]
 movdqu xmm12, [arg2+16*7]
 movdqa xmm8, xmm6

(diff 5 of 6 changed files)

@@ -36,6 +36,8 @@
 ;
 %include "reg_sizes.asm"
+%define fetch_dist 1024
 [bits 64]
 default rel
@@ -131,6 +133,7 @@ _fold_128_B_loop:
 ; update the buffer pointer
 add arg2, 128 ; buf += 128;
+prefetchnta [arg2+fetch_dist+0]
 movdqu xmm9, [arg2+16*0]
 movdqu xmm12, [arg2+16*1]
 pshufb xmm9, xmm11
@@ -146,6 +149,7 @@ _fold_128_B_loop:
 pxor xmm1, xmm12
 xorps xmm1, xmm13
+prefetchnta [arg2+fetch_dist+32]
 movdqu xmm9, [arg2+16*2]
 movdqu xmm12, [arg2+16*3]
 pshufb xmm9, xmm11
@@ -161,6 +165,7 @@ _fold_128_B_loop:
 pxor xmm3, xmm12
 xorps xmm3, xmm13
+prefetchnta [arg2+fetch_dist+64]
 movdqu xmm9, [arg2+16*4]
 movdqu xmm12, [arg2+16*5]
 pshufb xmm9, xmm11
@@ -176,6 +181,7 @@ _fold_128_B_loop:
 pxor xmm5, xmm12
 xorps xmm5, xmm13
+prefetchnta [arg2+fetch_dist+96]
 movdqu xmm9, [arg2+16*6]
 movdqu xmm12, [arg2+16*7]
 pshufb xmm9, xmm11

(diff 6 of 6 changed files)

@@ -37,6 +37,8 @@
 ;
 %include "reg_sizes.asm"
+%define fetch_dist 1024
 [bits 64]
 default rel
@@ -118,6 +120,7 @@ _fold_128_B_loop:
 ; update the buffer pointer
 add arg2, 128
+prefetchnta [arg2+fetch_dist+0]
 movdqu xmm9, [arg2+16*0]
 movdqu xmm12, [arg2+16*1]
 movdqa xmm8, xmm0
@@ -131,6 +134,7 @@ _fold_128_B_loop:
 pxor xmm1, xmm12
 xorps xmm1, xmm13
+prefetchnta [arg2+fetch_dist+32]
 movdqu xmm9, [arg2+16*2]
 movdqu xmm12, [arg2+16*3]
 movdqa xmm8, xmm2
@@ -144,6 +148,7 @@ _fold_128_B_loop:
 pxor xmm3, xmm12
 xorps xmm3, xmm13
+prefetchnta [arg2+fetch_dist+64]
 movdqu xmm9, [arg2+16*4]
 movdqu xmm12, [arg2+16*5]
 movdqa xmm8, xmm4
@@ -157,6 +162,7 @@ _fold_128_B_loop:
 pxor xmm5, xmm12
 xorps xmm5, xmm13
+prefetchnta [arg2+fetch_dist+96]
 movdqu xmm9, [arg2+16*6]
 movdqu xmm12, [arg2+16*7]
 movdqa xmm8, xmm6