; Patch summary: add software prefetching to the 128-byte fold loop
; (_fold_128_B_loop) of six CRC64 "by8" kernels:
;   crc/crc64_ecma_norm_by8.asm   crc/crc64_ecma_refl_by8.asm
;   crc/crc64_iso_norm_by8.asm    crc/crc64_iso_refl_by8.asm
;   crc/crc64_jones_norm_by8.asm  crc/crc64_jones_refl_by8.asm
;
; Each file receives the same two kinds of change:
;   * `%define fetch_dist 1024` is inserted after `%include "reg_sizes.asm"`.
;     It is the byte offset, relative to the buffer pointer arg2, used as the
;     base of every prefetch address below.
;   * Four `prefetchnta [arg2+fetch_dist+K]` instructions (K = 0, 32, 64, 96)
;     are inserted into _fold_128_B_loop, one directly ahead of each pair of
;     movdqu loads ([arg2+16*0/1], [16*2/3], [16*4/5], [16*6/7]). The first
;     one follows `add arg2, 128`, so the addresses are relative to the
;     already-advanced pointer.
;
; Every hunk only inserts lines (old count 6, new count 7 or 8); no existing
; instruction is removed or modified by this patch.
;
; NOTE(review): the four prefetch addresses are 32 bytes apart. If the target
;   uses 64-byte cache lines, pairs of them request the same line; presumably
;   deliberate -- confirm with measurements before changing.
; NOTE(review): the prefetches address memory fetch_dist bytes ahead of what
;   the loop loads, which can lie past the end of the caller's buffer.
;   Presumably harmless because prefetch is a hint -- confirm against the ISA
;   reference.
; NOTE(review): fetch_dist is defined separately in each of the six files;
;   whether a shared definition (e.g. in reg_sizes.asm) is preferable cannot
;   be judged from this patch alone.
; NOTE(review): the patch text below appears to have lost its original line
;   breaks and indentation; the hunk line counts cannot be checked against it
;   as shown. Restore the original formatting before applying.
diff --git a/crc/crc64_ecma_norm_by8.asm b/crc/crc64_ecma_norm_by8.asm index 9fb5fdf..cff01e1 100644 --- a/crc/crc64_ecma_norm_by8.asm +++ b/crc/crc64_ecma_norm_by8.asm @@ -37,6 +37,8 @@ ; yasm -f x64 -f elf64 -X gnu -g dwarf2 crc64_ecma_norm_by8 %include "reg_sizes.asm" +%define fetch_dist 1024 + [bits 64] default rel @@ -132,6 +134,7 @@ _fold_128_B_loop: ; update the buffer pointer add arg2, 128 ; buf += 128; + prefetchnta [arg2+fetch_dist+0] movdqu xmm9, [arg2+16*0] movdqu xmm12, [arg2+16*1] pshufb xmm9, xmm11 @@ -147,6 +150,7 @@ _fold_128_B_loop: pxor xmm1, xmm12 xorps xmm1, xmm13 + prefetchnta [arg2+fetch_dist+32] movdqu xmm9, [arg2+16*2] movdqu xmm12, [arg2+16*3] pshufb xmm9, xmm11 @@ -162,6 +166,7 @@ _fold_128_B_loop: pxor xmm3, xmm12 xorps xmm3, xmm13 + prefetchnta [arg2+fetch_dist+64] movdqu xmm9, [arg2+16*4] movdqu xmm12, [arg2+16*5] pshufb xmm9, xmm11 @@ -177,6 +182,7 @@ _fold_128_B_loop: pxor xmm5, xmm12 xorps xmm5, xmm13 + prefetchnta [arg2+fetch_dist+96] movdqu xmm9, [arg2+16*6] movdqu xmm12, [arg2+16*7] pshufb xmm9, xmm11 diff --git a/crc/crc64_ecma_refl_by8.asm b/crc/crc64_ecma_refl_by8.asm index 226a394..9d3847e 100644 --- a/crc/crc64_ecma_refl_by8.asm +++ b/crc/crc64_ecma_refl_by8.asm @@ -40,6 +40,8 @@ ; yasm -f x64 -f elf64 -X gnu -g dwarf2 crc64_ecma_refl_by8 %include "reg_sizes.asm" +%define fetch_dist 1024 + [bits 64] default rel @@ -121,6 +123,7 @@ _fold_128_B_loop: ; update the buffer pointer add arg2, 128 + prefetchnta [arg2+fetch_dist+0] movdqu xmm9, [arg2+16*0] movdqu xmm12, [arg2+16*1] movdqa xmm8, xmm0 @@ -134,6 +137,7 @@ _fold_128_B_loop: pxor xmm1, xmm12 xorps xmm1, xmm13 + prefetchnta [arg2+fetch_dist+32] movdqu xmm9, [arg2+16*2] movdqu xmm12, [arg2+16*3] movdqa xmm8, xmm2 @@ -147,6 +151,7 @@ _fold_128_B_loop: pxor xmm3, xmm12 xorps xmm3, xmm13 + prefetchnta [arg2+fetch_dist+64] movdqu xmm9, [arg2+16*4] movdqu xmm12, [arg2+16*5] movdqa xmm8, xmm4 @@ -160,6 +165,7 @@ _fold_128_B_loop: pxor xmm5, xmm12 xorps xmm5, xmm13 + prefetchnta 
[arg2+fetch_dist+96] movdqu xmm9, [arg2+16*6] movdqu xmm12, [arg2+16*7] movdqa xmm8, xmm6 diff --git a/crc/crc64_iso_norm_by8.asm b/crc/crc64_iso_norm_by8.asm index f227d23..1a4195d 100644 --- a/crc/crc64_iso_norm_by8.asm +++ b/crc/crc64_iso_norm_by8.asm @@ -36,6 +36,8 @@ ; %include "reg_sizes.asm" +%define fetch_dist 1024 + [bits 64] default rel @@ -131,6 +133,7 @@ _fold_128_B_loop: ; update the buffer pointer add arg2, 128 ; buf += 128; + prefetchnta [arg2+fetch_dist+0] movdqu xmm9, [arg2+16*0] movdqu xmm12, [arg2+16*1] pshufb xmm9, xmm11 @@ -146,6 +149,7 @@ _fold_128_B_loop: pxor xmm1, xmm12 xorps xmm1, xmm13 + prefetchnta [arg2+fetch_dist+32] movdqu xmm9, [arg2+16*2] movdqu xmm12, [arg2+16*3] pshufb xmm9, xmm11 @@ -161,6 +165,7 @@ _fold_128_B_loop: pxor xmm3, xmm12 xorps xmm3, xmm13 + prefetchnta [arg2+fetch_dist+64] movdqu xmm9, [arg2+16*4] movdqu xmm12, [arg2+16*5] pshufb xmm9, xmm11 @@ -176,6 +181,7 @@ _fold_128_B_loop: pxor xmm5, xmm12 xorps xmm5, xmm13 + prefetchnta [arg2+fetch_dist+96] movdqu xmm9, [arg2+16*6] movdqu xmm12, [arg2+16*7] pshufb xmm9, xmm11 diff --git a/crc/crc64_iso_refl_by8.asm b/crc/crc64_iso_refl_by8.asm index 7ecd924..d7ed8ae 100644 --- a/crc/crc64_iso_refl_by8.asm +++ b/crc/crc64_iso_refl_by8.asm @@ -37,6 +37,8 @@ ; %include "reg_sizes.asm" +%define fetch_dist 1024 + [bits 64] default rel @@ -118,6 +120,7 @@ _fold_128_B_loop: ; update the buffer pointer add arg2, 128 + prefetchnta [arg2+fetch_dist+0] movdqu xmm9, [arg2+16*0] movdqu xmm12, [arg2+16*1] movdqa xmm8, xmm0 @@ -131,6 +134,7 @@ _fold_128_B_loop: pxor xmm1, xmm12 xorps xmm1, xmm13 + prefetchnta [arg2+fetch_dist+32] movdqu xmm9, [arg2+16*2] movdqu xmm12, [arg2+16*3] movdqa xmm8, xmm2 @@ -144,6 +148,7 @@ _fold_128_B_loop: pxor xmm3, xmm12 xorps xmm3, xmm13 + prefetchnta [arg2+fetch_dist+64] movdqu xmm9, [arg2+16*4] movdqu xmm12, [arg2+16*5] movdqa xmm8, xmm4 @@ -157,6 +162,7 @@ _fold_128_B_loop: pxor xmm5, xmm12 xorps xmm5, xmm13 + prefetchnta [arg2+fetch_dist+96] movdqu xmm9, 
[arg2+16*6] movdqu xmm12, [arg2+16*7] movdqa xmm8, xmm6 diff --git a/crc/crc64_jones_norm_by8.asm b/crc/crc64_jones_norm_by8.asm index 6cd358a..0e5e75a 100644 --- a/crc/crc64_jones_norm_by8.asm +++ b/crc/crc64_jones_norm_by8.asm @@ -36,6 +36,8 @@ ; %include "reg_sizes.asm" +%define fetch_dist 1024 + [bits 64] default rel @@ -131,6 +133,7 @@ _fold_128_B_loop: ; update the buffer pointer add arg2, 128 ; buf += 128; + prefetchnta [arg2+fetch_dist+0] movdqu xmm9, [arg2+16*0] movdqu xmm12, [arg2+16*1] pshufb xmm9, xmm11 @@ -146,6 +149,7 @@ _fold_128_B_loop: pxor xmm1, xmm12 xorps xmm1, xmm13 + prefetchnta [arg2+fetch_dist+32] movdqu xmm9, [arg2+16*2] movdqu xmm12, [arg2+16*3] pshufb xmm9, xmm11 @@ -161,6 +165,7 @@ _fold_128_B_loop: pxor xmm3, xmm12 xorps xmm3, xmm13 + prefetchnta [arg2+fetch_dist+64] movdqu xmm9, [arg2+16*4] movdqu xmm12, [arg2+16*5] pshufb xmm9, xmm11 @@ -176,6 +181,7 @@ _fold_128_B_loop: pxor xmm5, xmm12 xorps xmm5, xmm13 + prefetchnta [arg2+fetch_dist+96] movdqu xmm9, [arg2+16*6] movdqu xmm12, [arg2+16*7] pshufb xmm9, xmm11 diff --git a/crc/crc64_jones_refl_by8.asm b/crc/crc64_jones_refl_by8.asm index 33938c2..39da6b8 100644 --- a/crc/crc64_jones_refl_by8.asm +++ b/crc/crc64_jones_refl_by8.asm @@ -37,6 +37,8 @@ ; %include "reg_sizes.asm" +%define fetch_dist 1024 + [bits 64] default rel @@ -118,6 +120,7 @@ _fold_128_B_loop: ; update the buffer pointer add arg2, 128 + prefetchnta [arg2+fetch_dist+0] movdqu xmm9, [arg2+16*0] movdqu xmm12, [arg2+16*1] movdqa xmm8, xmm0 @@ -131,6 +134,7 @@ _fold_128_B_loop: pxor xmm1, xmm12 xorps xmm1, xmm13 + prefetchnta [arg2+fetch_dist+32] movdqu xmm9, [arg2+16*2] movdqu xmm12, [arg2+16*3] movdqa xmm8, xmm2 @@ -144,6 +148,7 @@ _fold_128_B_loop: pxor xmm3, xmm12 xorps xmm3, xmm13 + prefetchnta [arg2+fetch_dist+64] movdqu xmm9, [arg2+16*4] movdqu xmm12, [arg2+16*5] movdqa xmm8, xmm4 @@ -157,6 +162,7 @@ _fold_128_B_loop: pxor xmm5, xmm12 xorps xmm5, xmm13 + prefetchnta [arg2+fetch_dist+96] movdqu xmm9, [arg2+16*6] movdqu 
xmm12, [arg2+16*7] movdqa xmm8, xmm6