igzip: Load memory into xmm registers in the stateless body
Signed-off-by: Roy Oursler <roy.j.oursler@intel.com> Reviewed-by: Greg Tucker <greg.b.tucker@intel.com>
This commit is contained in:
parent
4d1fe78bfa
commit
cf30138c7b
@ -31,7 +31,9 @@ lsrc += igzip/igzip.c igzip/hufftables_c.c \
|
|||||||
igzip/crc_utils_01.asm \
|
igzip/crc_utils_01.asm \
|
||||||
igzip/crc_utils_04.asm \
|
igzip/crc_utils_04.asm \
|
||||||
igzip/igzip_body_01.asm igzip/igzip_body_04.asm igzip/igzip_finish.asm \
|
igzip/igzip_body_01.asm igzip/igzip_body_04.asm igzip/igzip_finish.asm \
|
||||||
igzip/igzip_stateless_01.asm igzip/igzip_stateless_04.asm \
|
igzip/igzip_stateless_01.asm \
|
||||||
|
igzip/igzip_stateless_02.asm \
|
||||||
|
igzip/igzip_stateless_04.asm \
|
||||||
igzip/crc_data.asm \
|
igzip/crc_data.asm \
|
||||||
igzip/crc32_gzip.asm igzip/detect_repeated_char.asm \
|
igzip/crc32_gzip.asm igzip/detect_repeated_char.asm \
|
||||||
igzip/igzip_multibinary.asm \
|
igzip/igzip_multibinary.asm \
|
||||||
|
@ -40,6 +40,7 @@ default rel
|
|||||||
|
|
||||||
extern isal_deflate_body_stateless_base
|
extern isal_deflate_body_stateless_base
|
||||||
extern isal_deflate_body_stateless_01
|
extern isal_deflate_body_stateless_01
|
||||||
|
extern isal_deflate_body_stateless_02
|
||||||
extern isal_deflate_body_stateless_04
|
extern isal_deflate_body_stateless_04
|
||||||
|
|
||||||
extern isal_deflate_body_base
|
extern isal_deflate_body_base
|
||||||
@ -66,7 +67,7 @@ mbin_interface isal_deflate_init
|
|||||||
mbin_dispatch_init5 isal_deflate_init, isal_deflate_init_base, isal_deflate_init_01, isal_deflate_init_01, isal_deflate_init_01
|
mbin_dispatch_init5 isal_deflate_init, isal_deflate_init_base, isal_deflate_init_01, isal_deflate_init_01, isal_deflate_init_01
|
||||||
|
|
||||||
mbin_interface isal_deflate_body_stateless
|
mbin_interface isal_deflate_body_stateless
|
||||||
mbin_dispatch_init5 isal_deflate_body_stateless, isal_deflate_body_stateless_base, isal_deflate_body_stateless_01, isal_deflate_body_stateless_01, isal_deflate_body_stateless_04
|
mbin_dispatch_init5 isal_deflate_body_stateless, isal_deflate_body_stateless_base, isal_deflate_body_stateless_01, isal_deflate_body_stateless_02, isal_deflate_body_stateless_04
|
||||||
|
|
||||||
mbin_interface isal_deflate_body
|
mbin_interface isal_deflate_body
|
||||||
mbin_dispatch_init5 isal_deflate_body, isal_deflate_body_base, isal_deflate_body_01, isal_deflate_body_01, isal_deflate_body_04
|
mbin_dispatch_init5 isal_deflate_body, isal_deflate_body_base, isal_deflate_body_01, isal_deflate_body_01, isal_deflate_body_04
|
||||||
|
@ -38,7 +38,7 @@
|
|||||||
%include "stdmac.asm"
|
%include "stdmac.asm"
|
||||||
|
|
||||||
%define LAST_BYTES_COUNT 3 ; Bytes to prevent reading out of array bounds
|
%define LAST_BYTES_COUNT 3 ; Bytes to prevent reading out of array bounds
|
||||||
%define LA_STATELESS 264 ; Max number of bytes read in loop2 rounded up to 8 byte boundary
|
%define LA_STATELESS 280 ; Max number of bytes read in loop2 rounded up to 8 byte boundary
|
||||||
|
|
||||||
%ifdef DEBUG
|
%ifdef DEBUG
|
||||||
%macro MARK 1
|
%macro MARK 1
|
||||||
@ -105,6 +105,7 @@ global %1
|
|||||||
%define xtmp1 xmm1 ; tmp
|
%define xtmp1 xmm1 ; tmp
|
||||||
%define xhash xmm2
|
%define xhash xmm2
|
||||||
%define xmask xmm3
|
%define xmask xmm3
|
||||||
|
%define xdata xmm4
|
||||||
|
|
||||||
%define ytmp0 ymm0 ; tmp
|
%define ytmp0 ymm0 ; tmp
|
||||||
%define ytmp1 ymm1 ; tmp
|
%define ytmp1 ymm1 ; tmp
|
||||||
@ -158,7 +159,7 @@ skip1:
|
|||||||
mov stream, rcx
|
mov stream, rcx
|
||||||
mov dword [stream + _internal_state_has_eob], 0
|
mov dword [stream + _internal_state_has_eob], 0
|
||||||
|
|
||||||
vmovdqu xmask, [mask]
|
MOVDQU xmask, [mask]
|
||||||
|
|
||||||
; state->bitbuf.set_buf(stream->next_out, stream->avail_out);
|
; state->bitbuf.set_buf(stream->next_out, stream->avail_out);
|
||||||
mov m_out_buf, [stream + _next_out]
|
mov m_out_buf, [stream + _next_out]
|
||||||
@ -209,9 +210,9 @@ MARK __stateless_compute_hash_ %+ ARCH
|
|||||||
shr tmp6, 8
|
shr tmp6, 8
|
||||||
compute_hash hash2, tmp6
|
compute_hash hash2, tmp6
|
||||||
|
|
||||||
vmovd xhash, hash %+ d
|
MOVD xhash, hash %+ d
|
||||||
vpinsrd xhash, hash2 %+ d, 1
|
PINSRD xhash, hash2 %+ d, 1
|
||||||
vpand xhash, xhash, xmask
|
PAND xhash, xhash, xmask
|
||||||
|
|
||||||
jmp write_lit_bits
|
jmp write_lit_bits
|
||||||
|
|
||||||
@ -240,7 +241,8 @@ loop2:
|
|||||||
mov [stream + _internal_state_head + 2 * hash2], f_i %+ w
|
mov [stream + _internal_state_head + 2 * hash2], f_i %+ w
|
||||||
dec dist2
|
dec dist2
|
||||||
|
|
||||||
mov tmp8, [file_start + f_i + 1]
|
MOVQ tmp8, xdata
|
||||||
|
shr tmp8, 16
|
||||||
mov tmp6, tmp8
|
mov tmp6, tmp8
|
||||||
compute_hash tmp2, tmp8
|
compute_hash tmp2, tmp8
|
||||||
|
|
||||||
@ -258,18 +260,20 @@ loop2:
|
|||||||
shr tmp6, 8
|
shr tmp6, 8
|
||||||
compute_hash tmp3, tmp6
|
compute_hash tmp3, tmp6
|
||||||
|
|
||||||
vmovd xhash, tmp2 %+ d
|
MOVD xhash, tmp2 %+ d
|
||||||
vpinsrd xhash, tmp3 %+ d, 1
|
PINSRD xhash, tmp3 %+ d, 1
|
||||||
vpand xhash, xmask
|
PAND xhash, xhash, xmask
|
||||||
|
|
||||||
MARK __stateless_compare_ %+ ARCH
|
MARK __stateless_compare_ %+ ARCH
|
||||||
;; Check for long len/dist match (>7) with first literal
|
;; Check for long len/dist match (>7) with first literal
|
||||||
mov len, [tmp1]
|
MOVQ len, xdata
|
||||||
|
mov curr_data, len
|
||||||
|
PSRLDQ xdata, 1
|
||||||
xor len, [tmp1 + dist]
|
xor len, [tmp1 + dist]
|
||||||
jz compare_loop
|
jz compare_loop
|
||||||
|
|
||||||
;; Check for len/dist match (>7) with second literal
|
;; Check for len/dist match (>7) with second literal
|
||||||
mov len2, [tmp1 + 1]
|
MOVQ len2, xdata
|
||||||
xor len2, [tmp1 + dist2 + 1]
|
xor len2, [tmp1 + dist2 + 1]
|
||||||
jz compare_loop2
|
jz compare_loop2
|
||||||
|
|
||||||
@ -300,7 +304,6 @@ len_dist_lit_huffman_pre:
|
|||||||
bsf len2, len2
|
bsf len2, len2
|
||||||
shr len2, 3
|
shr len2, 3
|
||||||
|
|
||||||
|
|
||||||
len_dist_lit_huffman:
|
len_dist_lit_huffman:
|
||||||
neg dist2
|
neg dist2
|
||||||
%ifndef LONGER_HUFFTABLE
|
%ifndef LONGER_HUFFTABLE
|
||||||
@ -317,7 +320,8 @@ len_dist_lit_huffman:
|
|||||||
|
|
||||||
mov rcx, code_len3
|
mov rcx, code_len3
|
||||||
|
|
||||||
mov tmp5, [file_start + f_i + 3]
|
MOVQ tmp5, xdata
|
||||||
|
shr tmp5, 24
|
||||||
compute_hash tmp4, tmp5
|
compute_hash tmp4, tmp5
|
||||||
and tmp4, HASH_MASK
|
and tmp4, HASH_MASK
|
||||||
|
|
||||||
@ -329,11 +333,12 @@ len_dist_lit_huffman:
|
|||||||
lea tmp3, [f_i + 1] ; tmp3 <= k
|
lea tmp3, [f_i + 1] ; tmp3 <= k
|
||||||
|
|
||||||
add f_i, len2
|
add f_i, len2
|
||||||
|
MOVDQU xdata, [file_start + f_i]
|
||||||
mov curr_data, [file_start + f_i]
|
mov curr_data, [file_start + f_i]
|
||||||
mov curr_data2, curr_data
|
mov curr_data2, curr_data
|
||||||
|
|
||||||
vmovd hash %+ d, xhash
|
MOVD hash %+ d, xhash
|
||||||
vpextrd hash2 %+ d, xhash, 1
|
PEXTRD hash2 %+ d, xhash, 1
|
||||||
mov [stream + _internal_state_head + 2 * hash], tmp3 %+ w
|
mov [stream + _internal_state_head + 2 * hash], tmp3 %+ w
|
||||||
|
|
||||||
compute_hash hash, curr_data
|
compute_hash hash, curr_data
|
||||||
@ -403,12 +408,13 @@ len_dist_huffman:
|
|||||||
lea tmp3, [f_i + 2] ; tmp3 <= k
|
lea tmp3, [f_i + 2] ; tmp3 <= k
|
||||||
add f_i, len
|
add f_i, len
|
||||||
|
|
||||||
vmovd hash %+ d, xhash
|
MOVD hash %+ d, xhash
|
||||||
vpextrd hash2 %+ d, xhash, 1
|
PEXTRD hash2 %+ d, xhash, 1
|
||||||
mov [stream + _internal_state_head + 2 * hash], tmp3 %+ w
|
mov [stream + _internal_state_head + 2 * hash], tmp3 %+ w
|
||||||
add tmp3,1
|
add tmp3,1
|
||||||
mov [stream + _internal_state_head + 2 * hash2], tmp3 %+ w
|
mov [stream + _internal_state_head + 2 * hash2], tmp3 %+ w
|
||||||
|
|
||||||
|
MOVDQU xdata, [file_start + f_i]
|
||||||
mov curr_data, [file_start + f_i]
|
mov curr_data, [file_start + f_i]
|
||||||
mov curr_data2, curr_data
|
mov curr_data2, curr_data
|
||||||
compute_hash hash, curr_data
|
compute_hash hash, curr_data
|
||||||
@ -441,18 +447,18 @@ loop4_done:
|
|||||||
jl loop2
|
jl loop2
|
||||||
jmp end_loop_2
|
jmp end_loop_2
|
||||||
|
|
||||||
|
|
||||||
MARK __stateless_write_lit_bits_ %+ ARCH
|
MARK __stateless_write_lit_bits_ %+ ARCH
|
||||||
write_lit_bits:
|
write_lit_bits:
|
||||||
|
MOVDQU xdata, [file_start + f_i + 1]
|
||||||
mov f_end_i, [rsp + f_end_i_mem_offset]
|
mov f_end_i, [rsp + f_end_i_mem_offset]
|
||||||
add f_i, 1
|
add f_i, 1
|
||||||
mov curr_data, [file_start + f_i]
|
mov curr_data, [file_start + f_i]
|
||||||
|
|
||||||
vmovd hash %+ d, xhash
|
MOVD hash %+ d, xhash
|
||||||
|
|
||||||
write_bits m_bits, m_bit_count, code2, code_len2, m_out_buf, tmp3
|
write_bits m_bits, m_bit_count, code2, code_len2, m_out_buf, tmp3
|
||||||
|
|
||||||
vpextrd hash2 %+ d, xhash, 1
|
PEXTRD hash2 %+ d, xhash, 1
|
||||||
|
|
||||||
; continue
|
; continue
|
||||||
cmp f_i, f_end_i
|
cmp f_i, f_end_i
|
||||||
|
7
igzip/igzip_stateless_02.asm
Normal file
7
igzip/igzip_stateless_02.asm
Normal file
@ -0,0 +1,7 @@
|
|||||||
|
;; Wrapper translation unit: sets ARCH to 02 and then includes the shared
;; igzip_stateless.asm source, so that source is assembled with ARCH == 02.
;; NOTE(review): presumably this yields the isal_deflate_body_stateless_02
;; symbol referenced by the multibinary dispatcher -- confirm against
;; igzip_stateless.asm.
%define ARCH 02
|
||||||
|
|
||||||
|
;; Provide a default COMPARE_TYPE of 1 unless it was already defined
;; before this file is assembled.
%ifndef COMPARE_TYPE
|
||||||
|
%define COMPARE_TYPE 1
|
||||||
|
%endif
|
||||||
|
|
||||||
|
;; Pull in the shared implementation with the settings chosen above.
%include "igzip_stateless.asm"
|
@ -14,7 +14,7 @@ extern rfc1951_lookup_table
|
|||||||
_len_to_code_offset equ 0
|
_len_to_code_offset equ 0
|
||||||
|
|
||||||
%define LAST_BYTES_COUNT 3 ; Bytes to prevent reading out of array bounds
|
%define LAST_BYTES_COUNT 3 ; Bytes to prevent reading out of array bounds
|
||||||
%define LA_STATELESS 264 ; Max number of bytes read in loop2 rounded up to 8 byte boundary
|
%define LA_STATELESS 280 ; Max number of bytes read in loop2 rounded up to 8 byte boundary
|
||||||
%define LIT_LEN 286
|
%define LIT_LEN 286
|
||||||
%define DIST_LEN 30
|
%define DIST_LEN 30
|
||||||
%define HIST_ELEM_SIZE 8
|
%define HIST_ELEM_SIZE 8
|
||||||
|
@ -271,3 +271,79 @@ ssc:
|
|||||||
shl %%dest, cl
|
shl %%dest, cl
|
||||||
%endif
|
%endif
|
||||||
%endm
|
%endm
|
||||||
|
|
||||||
|
;; MOVDQU dst, from
;; Unaligned 128-bit move. Emits the VEX form (vmovdqu) when ARCH is
;; 02, 03 or 04, and the legacy SSE form (movdqu) for any other ARCH.
;;   %1 = destination operand
;;   %2 = source operand
%macro MOVDQU 2
%define %%dst	%1
%define %%from	%2
%if (ARCH == 02) || (ARCH == 03) || (ARCH == 04)
	vmovdqu	%%dst, %%from
%else
	movdqu	%%dst, %%from
%endif
%endm
|
||||||
|
|
||||||
|
;; MOVD dst, from
;; 32-bit move between a general-purpose/memory operand and an xmm
;; register. Emits vmovd when ARCH is 02, 03 or 04, movd otherwise.
;;   %1 = destination operand
;;   %2 = source operand
%macro MOVD 2
%define %%dst	%1
%define %%from	%2
%if (ARCH == 02) || (ARCH == 03) || (ARCH == 04)
	vmovd	%%dst, %%from
%else
	movd	%%dst, %%from
%endif
%endm
|
||||||
|
|
||||||
|
;; MOVQ dst, from
;; 64-bit move involving an xmm register. Emits vmovq when ARCH is
;; 02, 03 or 04, movq otherwise.
;;   %1 = destination operand
;;   %2 = source operand
%macro MOVQ 2
%define %%dst	%1
%define %%from	%2
%if (ARCH == 02) || (ARCH == 03) || (ARCH == 04)
	vmovq	%%dst, %%from
%else
	movq	%%dst, %%from
%endif
%endm
|
||||||
|
|
||||||
|
;; PINSRD dst, from, lane
;; Insert a dword into the selected lane of an xmm register. Emits
;; vpinsrd when ARCH is 02, 03 or 04, pinsrd otherwise.
;;   %1 = destination xmm register
;;   %2 = dword source operand
;;   %3 = immediate lane index
%macro PINSRD 3
%define %%dst	%1
%define %%from	%2
%define %%lane	%3
%if (ARCH == 02) || (ARCH == 03) || (ARCH == 04)
	vpinsrd	%%dst, %%from, %%lane
%else
	pinsrd	%%dst, %%from, %%lane
%endif
%endm
|
||||||
|
|
||||||
|
;; PEXTRD dst, from, lane
;; Extract the selected dword lane of an xmm register. Emits vpextrd
;; when ARCH is 02, 03 or 04, pextrd otherwise.
;;   %1 = dword destination operand
;;   %2 = source xmm register
;;   %3 = immediate lane index
%macro PEXTRD 3
%define %%dst	%1
%define %%from	%2
%define %%lane	%3
%if (ARCH == 02) || (ARCH == 03) || (ARCH == 04)
	vpextrd	%%dst, %%from, %%lane
%else
	pextrd	%%dst, %%from, %%lane
%endif
%endm
|
||||||
|
|
||||||
|
;; PSRLDQ reg, count
;; In-place byte-wise logical right shift of an xmm register by an
;; immediate byte count. Emits vpsrldq when ARCH is 02, 03 or 04,
;; psrldq otherwise.
;;   %1 = xmm register, both source and destination
;;   %2 = immediate shift count in bytes
%macro PSRLDQ 2
%define %%reg	%1
%define %%count	%2
%if (ARCH == 02) || (ARCH == 03) || (ARCH == 04)
	;; Source operand written out explicitly; same encoding as the
	;; two-operand shorthand.
	vpsrldq	%%reg, %%reg, %%count
%else
	psrldq	%%reg, %%count
%endif
%endm
|
||||||
|
|
||||||
|
;; PAND dest, src1, src2
;; Three-operand bitwise AND: dest = src1 & src2.
;;   %1 = destination xmm register
;;   %2 = first source xmm register
;;   %3 = second source operand
;; When ARCH is 02, 03 or 04 this is a single vpand. For any other ARCH
;; the three-operand form is emulated with legacy SSE instructions:
;;   - dest is src1: AND src2 into dest directly.
;;   - dest is src2: AND src1 into dest instead (AND is commutative).
;;     Copying src1 into dest first would destroy src2 before it is used.
;;   - otherwise: copy src1 into dest, then AND with src2.
%macro PAND 3
%define %%dest %1
%define %%src1 %2
%define %%src2 %3
%if ((ARCH == 02) || (ARCH == 03) || (ARCH == 04))
	vpand	%%dest, %%src1, %%src2
%else
%ifidn %%dest, %%src1
	pand	%%dest, %%src2
%elifidn %%dest, %%src2
	pand	%%dest, %%src1
%else
	movdqa	%%dest, %%src1
	pand	%%dest, %%src2
%endif
%endif
%endm
|
||||||
|
Loading…
Reference in New Issue
Block a user