From cf30138c7b92bf65606f78d31a4e8d4f1a6d2f14 Mon Sep 17 00:00:00 2001 From: Roy Oursler Date: Fri, 17 Jun 2016 16:59:22 -0400 Subject: [PATCH] igzip: Load memory into xmm in stateless registers Signed-off-by: Roy Oursler Reviewed-by: Greg Tucker --- igzip/Makefile.am | 4 +- igzip/igzip_multibinary.asm | 3 +- igzip/igzip_stateless.asm | 46 ++++++++++--------- igzip/igzip_stateless_02.asm | 7 +++ igzip/igzip_update_histogram.asm | 2 +- igzip/stdmac.asm | 76 ++++++++++++++++++++++++++++++++ 6 files changed, 115 insertions(+), 23 deletions(-) create mode 100644 igzip/igzip_stateless_02.asm diff --git a/igzip/Makefile.am b/igzip/Makefile.am index cee0a44..a704753 100644 --- a/igzip/Makefile.am +++ b/igzip/Makefile.am @@ -31,7 +31,9 @@ lsrc += igzip/igzip.c igzip/hufftables_c.c \ igzip/crc_utils_01.asm \ igzip/crc_utils_04.asm \ igzip/igzip_body_01.asm igzip/igzip_body_04.asm igzip/igzip_finish.asm \ - igzip/igzip_stateless_01.asm igzip/igzip_stateless_04.asm \ + igzip/igzip_stateless_01.asm \ + igzip/igzip_stateless_02.asm \ + igzip/igzip_stateless_04.asm \ igzip/crc_data.asm \ igzip/crc32_gzip.asm igzip/detect_repeated_char.asm \ igzip/igzip_multibinary.asm \ diff --git a/igzip/igzip_multibinary.asm b/igzip/igzip_multibinary.asm index 32bcb8e..8743aea 100644 --- a/igzip/igzip_multibinary.asm +++ b/igzip/igzip_multibinary.asm @@ -40,6 +40,7 @@ default rel extern isal_deflate_body_stateless_base extern isal_deflate_body_stateless_01 +extern isal_deflate_body_stateless_02 extern isal_deflate_body_stateless_04 extern isal_deflate_body_base @@ -66,7 +67,7 @@ mbin_interface isal_deflate_init mbin_dispatch_init5 isal_deflate_init, isal_deflate_init_base, isal_deflate_init_01, isal_deflate_init_01, isal_deflate_init_01 mbin_interface isal_deflate_body_stateless -mbin_dispatch_init5 isal_deflate_body_stateless, isal_deflate_body_stateless_base, isal_deflate_body_stateless_01, isal_deflate_body_stateless_01, isal_deflate_body_stateless_04 +mbin_dispatch_init5 
isal_deflate_body_stateless, isal_deflate_body_stateless_base, isal_deflate_body_stateless_01, isal_deflate_body_stateless_02, isal_deflate_body_stateless_04 mbin_interface isal_deflate_body mbin_dispatch_init5 isal_deflate_body, isal_deflate_body_base, isal_deflate_body_01, isal_deflate_body_01, isal_deflate_body_04 diff --git a/igzip/igzip_stateless.asm b/igzip/igzip_stateless.asm index 38f20fc..16934cc 100644 --- a/igzip/igzip_stateless.asm +++ b/igzip/igzip_stateless.asm @@ -38,7 +38,7 @@ %include "stdmac.asm" %define LAST_BYTES_COUNT 3 ; Bytes to prevent reading out of array bounds -%define LA_STATELESS 264 ; Max number of bytes read in loop2 rounded up to 8 byte boundary +%define LA_STATELESS 280 ; Max number of bytes read in loop2 rounded up to 8 byte boundary %ifdef DEBUG %macro MARK 1 @@ -105,6 +105,7 @@ global %1 %define xtmp1 xmm1 ; tmp %define xhash xmm2 %define xmask xmm3 +%define xdata xmm4 %define ytmp0 ymm0 ; tmp %define ytmp1 ymm1 ; tmp @@ -158,7 +159,7 @@ skip1: mov stream, rcx mov dword [stream + _internal_state_has_eob], 0 - vmovdqu xmask, [mask] + MOVDQU xmask, [mask] ; state->bitbuf.set_buf(stream->next_out, stream->avail_out); mov m_out_buf, [stream + _next_out] @@ -209,9 +210,9 @@ MARK __stateless_compute_hash_ %+ ARCH shr tmp6, 8 compute_hash hash2, tmp6 - vmovd xhash, hash %+ d - vpinsrd xhash, hash2 %+ d, 1 - vpand xhash, xhash, xmask + MOVD xhash, hash %+ d + PINSRD xhash, hash2 %+ d, 1 + PAND xhash, xhash, xmask jmp write_lit_bits @@ -240,7 +241,8 @@ loop2: mov [stream + _internal_state_head + 2 * hash2], f_i %+ w dec dist2 - mov tmp8, [file_start + f_i + 1] + MOVQ tmp8, xdata + shr tmp8, 16 mov tmp6, tmp8 compute_hash tmp2, tmp8 @@ -258,18 +260,20 @@ loop2: shr tmp6, 8 compute_hash tmp3, tmp6 - vmovd xhash, tmp2 %+ d - vpinsrd xhash, tmp3 %+ d, 1 - vpand xhash, xmask + MOVD xhash, tmp2 %+ d + PINSRD xhash, tmp3 %+ d, 1 + PAND xhash, xhash, xmask MARK __stateless_compare_ %+ ARCH ;; Check for long len/dist match (>7) with first literal 
- mov len, [tmp1] + MOVQ len, xdata + mov curr_data, len + PSRLDQ xdata, 1 xor len, [tmp1 + dist] jz compare_loop ;; Check for len/dist match (>7) with second literal - mov len2, [tmp1 + 1] + MOVQ len2, xdata xor len2, [tmp1 + dist2 + 1] jz compare_loop2 @@ -300,7 +304,6 @@ len_dist_lit_huffman_pre: bsf len2, len2 shr len2, 3 - len_dist_lit_huffman: neg dist2 %ifndef LONGER_HUFFTABLE @@ -317,7 +320,8 @@ len_dist_lit_huffman: mov rcx, code_len3 - mov tmp5, [file_start + f_i + 3] + MOVQ tmp5, xdata + shr tmp5, 24 compute_hash tmp4, tmp5 and tmp4, HASH_MASK @@ -329,11 +333,12 @@ len_dist_lit_huffman: lea tmp3, [f_i + 1] ; tmp3 <= k add f_i, len2 + MOVDQU xdata, [file_start + f_i] mov curr_data, [file_start + f_i] mov curr_data2, curr_data - vmovd hash %+ d, xhash - vpextrd hash2 %+ d, xhash, 1 + MOVD hash %+ d, xhash + PEXTRD hash2 %+ d, xhash, 1 mov [stream + _internal_state_head + 2 * hash], tmp3 %+ w compute_hash hash, curr_data @@ -403,12 +408,13 @@ len_dist_huffman: lea tmp3, [f_i + 2] ; tmp3 <= k add f_i, len - vmovd hash %+ d, xhash - vpextrd hash2 %+ d, xhash, 1 + MOVD hash %+ d, xhash + PEXTRD hash2 %+ d, xhash, 1 mov [stream + _internal_state_head + 2 * hash], tmp3 %+ w add tmp3,1 mov [stream + _internal_state_head + 2 * hash2], tmp3 %+ w + MOVDQU xdata, [file_start + f_i] mov curr_data, [file_start + f_i] mov curr_data2, curr_data compute_hash hash, curr_data @@ -441,18 +447,18 @@ loop4_done: jl loop2 jmp end_loop_2 - MARK __stateless_write_lit_bits_ %+ ARCH write_lit_bits: + MOVDQU xdata, [file_start + f_i + 1] mov f_end_i, [rsp + f_end_i_mem_offset] add f_i, 1 mov curr_data, [file_start + f_i] - vmovd hash %+ d, xhash + MOVD hash %+ d, xhash write_bits m_bits, m_bit_count, code2, code_len2, m_out_buf, tmp3 - vpextrd hash2 %+ d, xhash, 1 + PEXTRD hash2 %+ d, xhash, 1 ; continue cmp f_i, f_end_i diff --git a/igzip/igzip_stateless_02.asm b/igzip/igzip_stateless_02.asm new file mode 100644 index 0000000..cb2a3ae --- /dev/null +++ 
b/igzip/igzip_stateless_02.asm
@@ -0,0 +1,7 @@
+%define ARCH 02
+
+%ifndef COMPARE_TYPE
+%define COMPARE_TYPE 1
+%endif
+
+%include "igzip_stateless.asm"
diff --git a/igzip/igzip_update_histogram.asm b/igzip/igzip_update_histogram.asm
index d975609..4c1ad67 100644
--- a/igzip/igzip_update_histogram.asm
+++ b/igzip/igzip_update_histogram.asm
@@ -14,7 +14,7 @@ extern rfc1951_lookup_table
 _len_to_code_offset equ 0
 
 %define LAST_BYTES_COUNT 3 ; Bytes to prevent reading out of array bounds
-%define LA_STATELESS 264 ; Max number of bytes read in loop2 rounded up to 8 byte boundary
+%define LA_STATELESS 280 ; Max number of bytes read in loop2 rounded up to 8 byte boundary
 %define LIT_LEN 286
 %define DIST_LEN 30
 %define HIST_ELEM_SIZE 8
diff --git a/igzip/stdmac.asm b/igzip/stdmac.asm
index a9e24e9..4ee1a71 100644
--- a/igzip/stdmac.asm
+++ b/igzip/stdmac.asm
@@ -271,3 +271,98 @@ ssc:
 	shl %%dest, cl
 %endif
 %endm
+
+;; SIMD instruction wrappers. Each macro below emits the VEX-encoded
+;; (v-prefixed) instruction when ARCH is 02, 03 or 04 and the legacy SSE
+;; encoding for any other ARCH value.
+
+;; MOVDQU dest, src: emits (v)movdqu dest, src.
+%macro MOVDQU 2
+%define %%dest %1
+%define %%src %2
+%if ((ARCH == 02) || (ARCH == 03) || (ARCH == 04))
+	vmovdqu	%%dest, %%src
+%else
+	movdqu	%%dest, %%src
+%endif
+%endm
+
+;; MOVD dest, src: emits (v)movd dest, src.
+%macro MOVD 2
+%define %%dest %1
+%define %%src %2
+%if ((ARCH == 02) || (ARCH == 03) || (ARCH == 04))
+	vmovd	%%dest, %%src
+%else
+	movd	%%dest, %%src
+%endif
+%endm
+
+;; MOVQ dest, src: emits (v)movq dest, src.
+%macro MOVQ 2
+%define %%dest %1
+%define %%src %2
+%if ((ARCH == 02) || (ARCH == 03) || (ARCH == 04))
+	vmovq	%%dest, %%src
+%else
+	movq	%%dest, %%src
+%endif
+%endm
+
+;; PINSRD dest, src, offset: insert dword src into lane offset of dest, in place.
+%macro PINSRD 3
+%define %%dest %1
+%define %%src %2
+%define %%offset %3
+%if ((ARCH == 02) || (ARCH == 03) || (ARCH == 04))
+	vpinsrd	%%dest, %%dest, %%src, %%offset
+%else
+	pinsrd	%%dest, %%src, %%offset
+%endif
+%endm
+
+;; PEXTRD dest, src, offset: emits (v)pextrd dest, src, offset.
+%macro PEXTRD 3
+%define %%dest %1
+%define %%src %2
+%define %%offset %3
+%if ((ARCH == 02) || (ARCH == 03) || (ARCH == 04))
+	vpextrd	%%dest, %%src, %%offset
+%else
+	pextrd	%%dest, %%src, %%offset
+%endif
+%endm
+
+;; PSRLDQ dest, offset: byte-wise right shift of dest by offset, in place.
+;; The VEX form names dest as both destination and source explicitly.
+%macro PSRLDQ 2
+%define %%dest %1
+%define %%offset %2
+%if ((ARCH == 02) || (ARCH == 03) || (ARCH == 04))
+	vpsrldq	%%dest, %%dest, %%offset
+%else
+	psrldq	%%dest, %%offset
+%endif
+%endm
+
+;; PAND dest, src1, src2: dest = src1 & src2.
+;; The SSE path copies src1 into dest first when the two differ. If dest is
+;; the same operand as src2 that copy would destroy src2, so in that case
+;; the operands are swapped instead (AND is commutative) and no copy is made.
+%macro PAND 3
+%define %%dest %1
+%define %%src1 %2
+%define %%src2 %3
+%if ((ARCH == 02) || (ARCH == 03) || (ARCH == 04))
+	vpand	%%dest, %%src1, %%src2
+%else
+%ifidn %%dest, %%src2
+	pand	%%dest, %%src1
+%else
+%ifnidn %%dest, %%src1
+	movdqa	%%dest, %%src1
+%endif
+	pand	%%dest, %%src2
+%endif
+%endif
+%endm