igzip: AVX512 version for encode_df

Change-Id: I1625a3d7e016805791cfd09e31909562f432fd71
Signed-off-by: Roy Oursler <roy.j.oursler@intel.com>
@@ -56,6 +56,7 @@ lsrc_x86_64 += \
igzip/igzip_decode_block_stateless_04.asm \
igzip/igzip_inflate_multibinary.asm \
igzip/encode_df_04.asm \
igzip/encode_df_06.asm \
igzip/proc_heap.asm

src_include += -I $(srcdir)/igzip
@@ -93,7 +94,6 @@ other_src += igzip/bitbuf2.asm \
igzip/inflate_std_vects.h \
igzip/flatten_ll.h \
igzip/encode_df.h \
igzip/encode_df_asm.asm \
igzip/heap_macros.asm \
igzip/igzip_checksums.h

@@ -1,4 +1,530 @@
%include "reg_sizes.asm"
%include "lz0a_const.asm"
%include "data_struct2.asm"
%include "stdmac.asm"

%define ARCH 04
%define USE_HSWNI

%include "encode_df_asm.asm"
; tree entry is 4 bytes:
; lit/len tree (513 entries)
; | 3 | 2 | 1 | 0 |
; | len | code |
;
; dist tree
; | 3 | 2 | 1 | 0 |
; |eblen:codlen| code |

; token format:
; DIST_OFFSET:0 : lit/len
; 31:(DIST_OFFSET + 5) : dist Extra Bits
; (DIST_OFFSET + 5):DIST_OFFSET : dist code
; lit/len: 0-256 (literal)
; 257-512 (dist + 254)
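;
; Illustrative decomposition of a token (mirrors the scalar .finish_loop below):
;   ll_sym  = token & LIT_MASK                    ; literal/length symbol, 0-512
;   d_sym   = (token >> DIST_OFFSET) & DIST_MASK  ; distance code index
;   d_extra = token >> EXTRA_BITS_OFFSET          ; distance extra bits
; DIST_OFFSET and EXTRA_BITS_OFFSET come from lz0a_const.asm.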

; returns final token pointer
; equal to token_end if successful
; uint32_t* encode_df(uint32_t *token_start, uint32_t *token_end,
; BitBuf *out_buf, uint32_t *trees);
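;
; Rough sketch of the per-token work (the vector code below batches eight
; dword tokens per iteration; the exact scalar form is in .finish_loop):
;   code  = lit_len_table[ll_sym]   ; bits 23:0 = code, bits 31:24 = code length
;   dcode = dist_table[d_sym]       ; bits 15:0 = code, 23:16 = extra-bit count, 31:24 = length
;   append code, dcode, then d_extra to the bit buffer, flushing whole bytes
;   to out_buf and keeping the 0-7 leftover bits for the next token.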

%ifidn __OUTPUT_FORMAT__, win64
%define arg1 rcx
%define arg2 rdx
%define arg3 r8
%define arg4 r9
%define sym rsi
%define dsym rdi
%define hufftables r9
%define ptr r11
%else
; Linux
%define arg1 rdi
%define arg2 rsi
%define arg3 rdx
%define arg4 rcx
%define sym r9
%define dsym r8
%define hufftables r11
%define ptr rdi
%endif

%define in_buf_end arg2
%define bitbuf arg3
%define out_buf bitbuf
; bit_count is rcx
%define bits rax
%define data r12
%define tmp rbx
%define len dsym
%define tmp2 r10
%define end_ptr rbp

%define LIT_MASK ((0x1 << LIT_LEN_BIT_COUNT) - 1)
%define DIST_MASK ((0x1 << DIST_LIT_BIT_COUNT) - 1)

%define codes1 ymm1
%define code_lens1 ymm2
%define codes2 ymm3
%define code_lens2 ymm4
%define codes3 ymm5
%define code_lens3 ymm6
%define codes4 ymm7
%define syms ymm7

%define code_lens4 ymm8
%define dsyms ymm8

%define ytmp ymm9
%define codes_lookup1 ymm10
%define codes_lookup2 ymm11
%define datas ymm12
%define ybits ymm13
%define ybits_count ymm14
%define yoffset_mask ymm15

%define VECTOR_SIZE 0x20
%define VECTOR_LOOP_PROCESSED (2 * VECTOR_SIZE)
%define VECTOR_SLOP 0x20 - 8
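; Note: each main-loop iteration consumes VECTOR_SIZE bytes of dword tokens.
; in_buf_end is pulled in by VECTOR_LOOP_PROCESSED because the loop starts the
; next iteration's table lookups before its end check, and end_ptr is pulled in
; by VECTOR_SLOP so the unaligned vector stores of packed bits stay within the
; output buffer (this reading of the slop values is an interpretation, not part
; of the original comments).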

gpr_save_mem_offset equ 0
gpr_save_mem_size equ 8 * 6
xmm_save_mem_offset equ gpr_save_mem_offset + gpr_save_mem_size
xmm_save_mem_size equ 10 * 16
bitbuf_mem_offset equ xmm_save_mem_offset + xmm_save_mem_size
bitbuf_mem_size equ 8
stack_size equ gpr_save_mem_size + xmm_save_mem_size + bitbuf_mem_size


%macro FUNC_SAVE 0
sub rsp, stack_size
mov [rsp + gpr_save_mem_offset + 0*8], rbx
mov [rsp + gpr_save_mem_offset + 1*8], rbp
mov [rsp + gpr_save_mem_offset + 2*8], r12

%ifidn __OUTPUT_FORMAT__, win64
mov [rsp + gpr_save_mem_offset + 3*8], rsi
mov [rsp + gpr_save_mem_offset + 4*8], rdi

MOVDQU [rsp + xmm_save_mem_offset + 0*8], xmm6
MOVDQU [rsp + xmm_save_mem_offset + 1*8], xmm7
MOVDQU [rsp + xmm_save_mem_offset + 2*8], xmm8
MOVDQU [rsp + xmm_save_mem_offset + 3*8], xmm9
MOVDQU [rsp + xmm_save_mem_offset + 4*8], xmm10
MOVDQU [rsp + xmm_save_mem_offset + 5*8], xmm11
MOVDQU [rsp + xmm_save_mem_offset + 6*8], xmm12
MOVDQU [rsp + xmm_save_mem_offset + 7*8], xmm13
MOVDQU [rsp + xmm_save_mem_offset + 8*8], xmm14
MOVDQU [rsp + xmm_save_mem_offset + 9*8], xmm15
%endif

%endm

%macro FUNC_RESTORE 0
mov rbx, [rsp + gpr_save_mem_offset + 0*8]
mov rbp, [rsp + gpr_save_mem_offset + 1*8]
mov r12, [rsp + gpr_save_mem_offset + 2*8]

%ifidn __OUTPUT_FORMAT__, win64
mov rsi, [rsp + gpr_save_mem_offset + 3*8]
mov rdi, [rsp + gpr_save_mem_offset + 4*8]

MOVDQU xmm6, [rsp + xmm_save_mem_offset + 0*8]
MOVDQU xmm7, [rsp + xmm_save_mem_offset + 1*8]
MOVDQU xmm8, [rsp + xmm_save_mem_offset + 2*8]
MOVDQU xmm9, [rsp + xmm_save_mem_offset + 3*8]
MOVDQU xmm10, [rsp + xmm_save_mem_offset + 4*8]
MOVDQU xmm11, [rsp + xmm_save_mem_offset + 5*8]
MOVDQU xmm12, [rsp + xmm_save_mem_offset + 6*8]
MOVDQU xmm13, [rsp + xmm_save_mem_offset + 7*8]
MOVDQU xmm14, [rsp + xmm_save_mem_offset + 8*8]
MOVDQU xmm15, [rsp + xmm_save_mem_offset + 9*8]
%endif
add rsp, stack_size

%endmacro

global encode_deflate_icf_ %+ ARCH
encode_deflate_icf_ %+ ARCH:
FUNC_SAVE

%ifnidn ptr, arg1
mov ptr, arg1
%endif
%ifnidn hufftables, arg4
mov hufftables, arg4
%endif

mov [rsp + bitbuf_mem_offset], bitbuf
mov bits, [bitbuf + _m_bits]
mov ecx, [bitbuf + _m_bit_count]
mov end_ptr, [bitbuf + _m_out_end]
mov out_buf, [bitbuf + _m_out_buf] ; clobbers bitbuf

sub end_ptr, VECTOR_SLOP
sub in_buf_end, VECTOR_LOOP_PROCESSED
cmp ptr, in_buf_end
jge .finish

vpcmpeqq ytmp, ytmp, ytmp
vmovdqu datas, [ptr]
vpand syms, datas, [lit_mask]
vpgatherdd codes_lookup1, [hufftables + _lit_len_table + 4 * syms], ytmp

vpcmpeqq ytmp, ytmp, ytmp
vpsrld dsyms, datas, DIST_OFFSET
vpand dsyms, dsyms, [dist_mask]
vpgatherdd codes_lookup2, [hufftables + _dist_table + 4 * dsyms], ytmp

vmovq ybits %+ x, bits
vmovq ybits_count %+ x, rcx
vmovdqa yoffset_mask, [offset_mask]

.main_loop:
;; Sets codes1 to contain lit/len codes and code_lens1 the corresponding lengths
vpsrld code_lens1, codes_lookup1, 24
vpand codes1, codes_lookup1, [lit_icr_mask]

;; Sets codes2 to contain dist codes, code_lens2 the corresponding lengths,
;; and code_lens3 the extra bit counts
vpblendw codes2, ybits, codes_lookup2, 0x55 ;Bits 8 and above of ybits are 0
vpsrld code_lens2, codes_lookup2, 24
vpsrld code_lens3, codes_lookup2, 16
vpand code_lens3, [eb_icr_mask]

;; Set codes3 to contain the extra bits
vpsrld codes3, datas, EXTRA_BITS_OFFSET

cmp out_buf, end_ptr
ja .main_loop_exit

;; Start code lookups for next iteration
add ptr, VECTOR_SIZE
vpcmpeqq ytmp, ytmp, ytmp
vmovdqu datas, [ptr]
vpand syms, datas, [lit_mask]
vpgatherdd codes_lookup1, [hufftables + _lit_len_table + 4 * syms], ytmp

vpcmpeqq ytmp, ytmp, ytmp
vpsrld dsyms, datas, DIST_OFFSET
vpand dsyms, dsyms, [dist_mask]
vpgatherdd codes_lookup2, [hufftables + _dist_table + 4 * dsyms], ytmp

;; Merge dist code with extra bits
vpsllvd codes3, codes3, code_lens2
vpxor codes2, codes2, codes3
vpaddd code_lens2, code_lens2, code_lens3

;; Check for long codes
vpaddd code_lens3, code_lens1, code_lens2
vpcmpgtd ytmp, code_lens3, [max_write_d]
vptest ytmp, ytmp
jnz .long_codes

;; Merge dist and len codes
vpsllvd codes2, codes2, code_lens1
vpxor codes1, codes1, codes2

;; Split buffer data into qwords, ytmp is 0 after last branch
vpblendd codes3, ytmp, codes1, 0x55
vpsrlq codes1, codes1, 32
vpsrlq code_lens1, code_lens3, 32
vpblendd code_lens3, ytmp, code_lens3, 0x55

;; Merge bitbuf bits
vpsllvq codes3, codes3, ybits_count
vpxor codes3, codes3, ybits
vpaddq code_lens3, code_lens3, ybits_count

;; Merge two symbols into qwords
vpsllvq codes1, codes1, code_lens3
vpxor codes1, codes1, codes3
vpaddq code_lens1, code_lens1, code_lens3
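;; Illustration of the shift-and-xor merge used here: if one lane holds bit
;; string a of length la and another holds b of length lb, then (b << la) ^ a
;; is the concatenation with total length la + lb.  The loop applies this
;; pairwise: dword codes into qwords, then qwords into dqwords below.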

;; Split buffer data into dqwords, ytmp is 0 after last branch
vpblendd codes2, ytmp, codes1, 0x33
vpblendd code_lens2, ytmp, code_lens1, 0x33
vpsrldq codes1, 8
vpsrldq code_lens1, 8

;; Merge two qwords into dqwords
vmovdqa ytmp, [q_64]
vpsubq code_lens3, ytmp, code_lens2
vpsrlvq codes3, codes1, code_lens3
vpslldq codes3, codes3, 8

vpsllvq codes1, codes1, code_lens2

vpxor codes1, codes1, codes3
vpxor codes1, codes1, codes2
vpaddq code_lens1, code_lens1, code_lens2

vmovq tmp, code_lens1 %+ x ;Number of bytes
shr tmp, 3
vpand ybits_count, code_lens1, yoffset_mask ;Extra bits

;; bit shift upper dqword combined bits to line up with lower dqword
vextracti128 codes2 %+ x, codes1, 1
vextracti128 code_lens2 %+ x, code_lens1, 1

vpbroadcastq ybits_count, ybits_count %+ x
vpsrldq codes3, codes2, 1
vpsllvq codes2, codes2, ybits_count
vpsllvq codes3, codes3, ybits_count
vpslldq codes3, codes3, 1
vpor codes2, codes2, codes3

; Write out lower dqword of combined bits
vmovdqu [out_buf], codes1
movzx bits, byte [out_buf + tmp]
vmovq codes1 %+ x, bits
vpaddq code_lens1, code_lens1, code_lens2

vmovq tmp2, code_lens1 %+ x ;Number of bytes
shr tmp2, 3
vpand ybits_count, code_lens1, yoffset_mask ;Extra bits

; Write out upper dqword of combined bits
vpor codes1 %+ x, codes1 %+ x, codes2 %+ x
vmovdqu [out_buf + tmp], codes1 %+ x
add out_buf, tmp2
movzx bits, byte [out_buf]
vmovq ybits %+ x, bits

cmp ptr, in_buf_end
jbe .main_loop

.main_loop_exit:
vmovq rcx, ybits_count %+ x
vmovq bits, ybits %+ x
jmp .finish

.long_codes:
add end_ptr, VECTOR_SLOP
sub ptr, VECTOR_SIZE

vpxor ytmp, ytmp, ytmp
vpblendd codes3, ytmp, codes1, 0x55
vpblendd code_lens3, ytmp, code_lens1, 0x55
vpblendd codes4, ytmp, codes2, 0x55

vpsllvq codes4, codes4, code_lens3
vpxor codes3, codes3, codes4
vpaddd code_lens3, code_lens1, code_lens2

vpsrlq codes1, codes1, 32
vpsrlq code_lens1, code_lens1, 32
vpsrlq codes2, codes2, 32

vpsllvq codes2, codes2, code_lens1
vpxor codes1, codes1, codes2

vpsrlq code_lens1, code_lens3, 32
vpblendd code_lens3, ytmp, code_lens3, 0x55

;; Merge bitbuf bits
vpsllvq codes3, codes3, ybits_count
vpxor codes3, codes3, ybits
vpaddq code_lens3, code_lens3, ybits_count
vpaddq code_lens1, code_lens1, code_lens3

xor bits, bits
xor rcx, rcx
vpsubq code_lens1, code_lens1, code_lens3
%rep 2
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
cmp out_buf, end_ptr
ja .overflow
;; insert LL code
vmovq sym, codes3 %+ x
vmovq tmp2, code_lens3 %+ x
SHLX sym, sym, rcx
or bits, sym
add rcx, tmp2

; empty bits
mov [out_buf], bits
mov tmp, rcx
shr tmp, 3 ; byte count
add out_buf, tmp
mov tmp, rcx
and rcx, ~7
SHRX bits, bits, rcx
mov rcx, tmp
and rcx, 7
add ptr, 4
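; The "empty bits" sequence above flushes whole bytes of the bit buffer: the
; low 64 bits are stored at out_buf, out_buf advances by rcx/8 bytes, the
; flushed bits are shifted out of `bits`, and rcx keeps the remaining 0-7 bits.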

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
cmp out_buf, end_ptr
ja .overflow
;; insert LL code
vmovq sym, codes1 %+ x
vmovq tmp2, code_lens1 %+ x
SHLX sym, sym, rcx
or bits, sym
add rcx, tmp2

; empty bits
mov [out_buf], bits
mov tmp, rcx
shr tmp, 3 ; byte count
add out_buf, tmp
mov tmp, rcx
and rcx, ~7
SHRX bits, bits, rcx
mov rcx, tmp
and rcx, 7
add ptr, 4

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
cmp out_buf, end_ptr
ja .overflow
;; insert LL code
vpextrq sym, codes3 %+ x, 1
vpextrq tmp2, code_lens3 %+ x, 1
SHLX sym, sym, rcx
or bits, sym
add rcx, tmp2

; empty bits
mov [out_buf], bits
mov tmp, rcx
shr tmp, 3 ; byte count
add out_buf, tmp
mov tmp, rcx
and rcx, ~7
SHRX bits, bits, rcx
mov rcx, tmp
and rcx, 7
add ptr, 4

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
cmp out_buf, end_ptr
ja .overflow
;; insert LL code
vpextrq sym, codes1 %+ x, 1
vpextrq tmp2, code_lens1 %+ x, 1
SHLX sym, sym, rcx
or bits, sym
add rcx, tmp2

; empty bits
mov [out_buf], bits
mov tmp, rcx
shr tmp, 3 ; byte count
add out_buf, tmp
mov tmp, rcx
and rcx, ~7
SHRX bits, bits, rcx
mov rcx, tmp
and rcx, 7
add ptr, 4

vextracti128 codes3 %+ x, codes3, 1
vextracti128 code_lens3 %+ x, code_lens3, 1
vextracti128 codes1 %+ x, codes1, 1
vextracti128 code_lens1 %+ x, code_lens1, 1
%endrep
sub end_ptr, VECTOR_SLOP

vmovq ybits %+ x, bits
vmovq ybits_count %+ x, rcx
cmp ptr, in_buf_end
jbe .main_loop

.finish:
add in_buf_end, VECTOR_LOOP_PROCESSED
add end_ptr, VECTOR_SLOP

cmp ptr, in_buf_end
jge .overflow

.finish_loop:
mov DWORD(data), [ptr]

cmp out_buf, end_ptr
ja .overflow

mov sym, data
and sym, LIT_MASK ; sym has ll_code
mov DWORD(sym), [hufftables + _lit_len_table + sym * 4]

; look up dist sym
mov dsym, data
shr dsym, DIST_OFFSET
and dsym, DIST_MASK
mov DWORD(dsym), [hufftables + _dist_table + dsym * 4]
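; dsym now holds the dist tree entry: bits 15:0 code, bits 23:16 extra-bit
; count, bits 31:24 code length (the "eblen:codlen | code" layout noted above).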

; insert LL code
; sym: 31:24 length; 23:0 code
mov tmp2, sym
and sym, 0xFFFFFF
SHLX sym, sym, rcx
shr tmp2, 24
or bits, sym
add rcx, tmp2

; insert dist code
movzx tmp, WORD(dsym)
SHLX tmp, tmp, rcx
or bits, tmp
mov tmp, dsym
shr tmp, 24
add rcx, tmp

; insert dist extra bits
shr data, EXTRA_BITS_OFFSET
add ptr, 4
SHLX data, data, rcx
or bits, data
shr dsym, 16
and dsym, 0xFF
add rcx, dsym

; empty bits
mov [out_buf], bits
mov tmp, rcx
shr tmp, 3 ; byte count
add out_buf, tmp
mov tmp, rcx
and rcx, ~7
SHRX bits, bits, rcx
mov rcx, tmp
and rcx, 7

cmp ptr, in_buf_end
jb .finish_loop

.overflow:
mov tmp, [rsp + bitbuf_mem_offset]
mov [tmp + _m_bits], bits
mov [tmp + _m_bit_count], ecx
mov [tmp + _m_out_buf], out_buf

mov rax, ptr

FUNC_RESTORE

ret

section .data
align 32
max_write_d:
dd 0x1c, 0x1d, 0x20, 0x20, 0x1e, 0x1e, 0x1e, 0x1e
offset_mask:
dq 0x0000000000000007, 0x0000000000000000
dq 0x0000000000000000, 0x0000000000000000
q_64:
dq 0x0000000000000040, 0x0000000000000000
dq 0x0000000000000040, 0x0000000000000000
lit_mask:
dd LIT_MASK, LIT_MASK, LIT_MASK, LIT_MASK
dd LIT_MASK, LIT_MASK, LIT_MASK, LIT_MASK
dist_mask:
dd DIST_MASK, DIST_MASK, DIST_MASK, DIST_MASK
dd DIST_MASK, DIST_MASK, DIST_MASK, DIST_MASK
lit_icr_mask:
dd 0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF
dd 0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF
eb_icr_mask:
dd 0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF
dd 0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF

@@ -3,6 +3,11 @@
%include "data_struct2.asm"
%include "stdmac.asm"

%ifdef HAVE_AS_KNOWS_AVX512

%define ARCH 06
%define USE_HSWNI

; tree entry is 4 bytes:
; lit/len tree (513 entries)
; | 3 | 2 | 1 | 0 |
@@ -59,29 +64,40 @@
%define LIT_MASK ((0x1 << LIT_LEN_BIT_COUNT) - 1)
%define DIST_MASK ((0x1 << DIST_LIT_BIT_COUNT) - 1)

%define codes1 ymm1
%define code_lens1 ymm2
%define codes2 ymm3
%define code_lens2 ymm4
%define codes3 ymm5
%define code_lens3 ymm6
%define codes4 ymm7
%define syms ymm7
%define codes1 zmm1
%define code_lens1 zmm2
%define codes2 zmm3
%define code_lens2 zmm4
%define codes3 zmm5
%define ztmp zmm5
%define code_lens3 zmm6
%define codes4 zmm7
%define syms zmm7

%define code_lens4 ymm8
%define dsyms ymm8
%define code_lens4 zmm8
%define dsyms zmm8
%define zbits_count_q zmm8

%define ytmp ymm9
%define codes_lookup1 ymm10
%define codes_lookup2 ymm11
%define datas ymm12
%define ybits ymm13
%define ybits_count ymm14
%define yoffset_mask ymm15
%define codes_lookup1 zmm9
%define codes_lookup2 zmm10
%define datas zmm11
%define zbits zmm12
%define zbits_count zmm13
%define zoffset_mask zmm14

%define VECTOR_SIZE 0x20
%define zq_64 zmm15
%define zlit_mask zmm16
%define zdist_mask zmm17
%define zlit_icr_mask zmm18
%define zeb_icr_mask zmm19
%define zmax_write zmm20
%define zrot_perm zmm21
%define zq_8 zmm22
%define zmin_write zmm23

%define VECTOR_SIZE 0x40
%define VECTOR_LOOP_PROCESSED (2 * VECTOR_SIZE)
%define VECTOR_SLOP 0x20 - 8
%define VECTOR_SLOP 0x40 - 8

gpr_save_mem_offset equ 0
gpr_save_mem_size equ 8 * 6
@@ -162,31 +178,48 @@ encode_deflate_icf_ %+ ARCH:
cmp ptr, in_buf_end
jge .finish

vpcmpeqq ytmp, ytmp, ytmp
vmovdqu datas, [ptr]
vpand syms, datas, [lit_mask]
vpgatherdd codes_lookup1, [hufftables + _lit_len_table + 4 * syms], ytmp
kxorq k0, k0, k0
kmovq k1, [k_mask_1]
kmovq k2, [k_mask_2]
kmovq k3, [k_mask_3]
kmovq k4, [k_mask_4]
kmovq k5, [k_mask_5]

vpcmpeqq ytmp, ytmp, ytmp
vmovdqa64 zoffset_mask, [offset_mask]
vmovdqa64 zlit_mask, [lit_mask]
vmovdqa64 zdist_mask, [dist_mask]
vmovdqa64 zlit_icr_mask, [lit_icr_mask]
vmovdqa64 zeb_icr_mask, [eb_icr_mask]
vmovdqa64 zmax_write, [max_write_d]
vmovdqa64 zq_64, [q_64]
vmovdqa64 zrot_perm, [rot_perm]
vmovdqa64 zq_8, [q_8]
vmovdqa64 zmin_write, [min_write_q]

knotq k6, k0
vmovdqu64 datas, [ptr]
vpandd syms, datas, [lit_mask]
vpgatherdd codes_lookup1 {k6}, [hufftables + _lit_len_table + 4 * syms]

knotq k7, k0
vpsrld dsyms, datas, DIST_OFFSET
vpand dsyms, dsyms, [dist_mask]
vpgatherdd codes_lookup2, [hufftables + _dist_table + 4 * dsyms], ytmp
vpandd dsyms, dsyms, [dist_mask]
vpgatherdd codes_lookup2 {k7}, [hufftables + _dist_table + 4 * dsyms]

vmovq ybits %+ x, bits
vmovq ybits_count %+ x, rcx
vmovdqa yoffset_mask, [offset_mask]
vmovq zbits %+ x, bits
vmovq zbits_count %+ x, rcx

.main_loop:
;; Sets codes1 to contain lit/len codes and code_lens1 the corresponding lengths
vpsrld code_lens1, codes_lookup1, 24
vpand codes1, codes_lookup1, [lit_icr_mask]
vpandd codes1, codes_lookup1, zlit_icr_mask

;; Sets codes2 to contain dist codes, code_lens2 the corresponding lengths,
;; and code_lens3 the extra bit counts
vpblendw codes2, ybits, codes_lookup2, 0x55 ;Bits 8 and above of ybits are 0
vmovdqu16 codes2 {k1}{z}, codes_lookup2 ;Bits 8 and above of zbits are 0
vpsrld code_lens2, codes_lookup2, 24
vpsrld code_lens3, codes_lookup2, 16
vpand code_lens3, [eb_icr_mask]
vpandd code_lens3, code_lens3, zeb_icr_mask

;; Set codes3 to contain the extra bits
vpsrld codes3, datas, EXTRA_BITS_OFFSET
@@ -195,117 +228,123 @@ encode_deflate_icf_ %+ ARCH:
ja .main_loop_exit

;; Start code lookups for next iteration
knotq k6, k0
add ptr, VECTOR_SIZE
vpcmpeqq ytmp, ytmp, ytmp
vmovdqu datas, [ptr]
vpand syms, datas, [lit_mask]
vpgatherdd codes_lookup1, [hufftables + _lit_len_table + 4 * syms], ytmp
vmovdqu64 datas, [ptr]
vpandd syms, datas, zlit_mask
vpgatherdd codes_lookup1 {k6}, [hufftables + _lit_len_table + 4 * syms]

vpcmpeqq ytmp, ytmp, ytmp
knotq k7, k0
vpsrld dsyms, datas, DIST_OFFSET
vpand dsyms, dsyms, [dist_mask]
vpgatherdd codes_lookup2, [hufftables + _dist_table + 4 * dsyms], ytmp
vpandd dsyms, dsyms, zdist_mask
vpgatherdd codes_lookup2 {k7}, [hufftables + _dist_table + 4 * dsyms]

;; Merge dist code with extra bits
vpsllvd codes3, codes3, code_lens2
vpxor codes2, codes2, codes3
vpxord codes2, codes2, codes3
vpaddd code_lens2, code_lens2, code_lens3

;; Check for long codes
vpaddd code_lens3, code_lens1, code_lens2
vpcmpgtd ytmp, code_lens3, [max_write_d]
vptest ytmp, ytmp
vpcmpgtd k6, code_lens3, zmax_write
ktestd k6, k6
jnz .long_codes

;; Merge dist and len codes
vpsllvd codes2, codes2, code_lens1
vpxor codes1, codes1, codes2
vpxord codes1, codes1, codes2

;; Split buffer data into qwords, ytmp is 0 after last branch
vpblendd codes3, ytmp, codes1, 0x55
vmovdqa32 codes3 {k1}{z}, codes1
vpsrlq codes1, codes1, 32
vpsrlq code_lens1, code_lens3, 32
vpblendd code_lens3, ytmp, code_lens3, 0x55
vmovdqa32 code_lens3 {k1}{z}, code_lens3

;; Merge bitbuf bits
vpsllvq codes3, codes3, ybits_count
vpxor codes3, codes3, ybits
vpaddq code_lens3, code_lens3, ybits_count
vpsllvq codes3, codes3, zbits_count
vpxord codes3, codes3, zbits
vpaddq code_lens3, code_lens3, zbits_count

;; Merge two symbols into qwords
vpsllvq codes1, codes1, code_lens3
vpxor codes1, codes1, codes3
vpxord codes1, codes1, codes3
vpaddq code_lens1, code_lens1, code_lens3

;; Split buffer data into dqwords, ytmp is 0 after last branch
vpblendd codes2, ytmp, codes1, 0x33
vpblendd code_lens2, ytmp, code_lens1, 0x33
vpsrldq codes1, 8
vpsrldq code_lens1, 8
;; Determine total bits at end of each qword
kshiftlq k7, k3, 2
vpermq zbits_count {k5}{z}, zrot_perm, code_lens1
vpaddq code_lens2, zbits_count, code_lens1
vshufi64x2 zbits_count {k3}{z}, code_lens2, code_lens2, 0x90
vpaddq code_lens2, code_lens2, zbits_count
vshufi64x2 zbits_count {k7}{z}, code_lens2, code_lens2, 0x40
vpaddq code_lens2, code_lens2, zbits_count
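;; The rotate/shuffle-and-add sequence above appears to form a running prefix
;; sum of the per-qword bit counts across the eight qword lanes, so each lane
;; ends up with the bits written by all preceding lanes plus its own.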

;; Merge two qwords into dqwords
vmovdqa ytmp, [q_64]
vpsubq code_lens3, ytmp, code_lens2
vpsrlvq codes3, codes1, code_lens3
vpslldq codes3, codes3, 8
;; Bit align quadwords
vpandd zbits_count, code_lens2, zoffset_mask
vpermq zbits_count_q {k5}{z}, zrot_perm, zbits_count
vpsllvq codes1, codes1, zbits_count_q

vpsllvq codes1, codes1, code_lens2
;; Get last byte in each qword
vpsrlq code_lens2, code_lens2, 3
vpaddq code_lens1, code_lens1, zbits_count_q
vpsrlq code_lens1, code_lens1, 3
vpaddq code_lens1, code_lens1, zq_8
vpshufb codes3 {k4}{z}, codes1, code_lens1

vpxor codes1, codes1, codes3
vpxor codes1, codes1, codes2
vpaddq code_lens1, code_lens1, code_lens2
;; Check whether any of the last bytes overlap
vpcmpq k6 {k5}, code_lens1, zmin_write, 0
ktestd k6, k6
jnz .small_codes

vmovq tmp, code_lens1 %+ x ;Number of bytes
shr tmp, 3
vpand ybits_count, code_lens1, yoffset_mask ;Extra bits
.small_codes_next:
;; Save off zbits and zbits_count for next loop
knotq k7, k5
vpermq zbits {k7}{z}, zrot_perm, codes3
vpermq zbits_count {k7}{z}, zrot_perm, zbits_count

;; bit shift upper dqword combined bits to line up with lower dqword
vextracti128 codes2 %+ x, codes1, 1
vextracti128 code_lens2 %+ x, code_lens1, 1
;; Merge last byte in each qword with the next qword
vpermq codes3 {k5}{z}, zrot_perm, codes3
vpxord codes1, codes1, codes3

vpbroadcastq ybits_count, ybits_count %+ x
vpsrldq codes3, codes2, 1
vpsllvq codes2, codes2, ybits_count
vpsllvq codes3, codes3, ybits_count
vpslldq codes3, codes3, 1
vpor codes2, codes2, codes3
;; Determine total bytes written
vextracti64x2 code_lens1 %+ x, code_lens2, 3
vpextrq tmp2, code_lens1 %+ x, 1

; Write out lower dqword of combined bits
vmovdqu [out_buf], codes1
movzx bits, byte [out_buf + tmp]
vmovq codes1 %+ x, bits
vpaddq code_lens1, code_lens1, code_lens2
;; Write out qwords
knotq k6, k0
vpermq code_lens2 {k5}{z}, zrot_perm, code_lens2
vpscatterqq [out_buf + code_lens2] {k6}, codes1
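;; Reading of the scatter above: each qword lane is written to out_buf at the
;; byte offset given by the prefix-summed bit counts, rotated down one lane so
;; a lane lands where its predecessor ends.  The last-byte merge above and the
;; .small_codes fixup handle the byte shared by neighbouring lanes.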

vmovq tmp2, code_lens1 %+ x ;Number of bytes
shr tmp2, 3
vpand ybits_count, code_lens1, yoffset_mask ;Extra bits

; Write out upper dqword of combined bits
vpor codes1 %+ x, codes1 %+ x, codes2 %+ x
vmovdqu [out_buf + tmp], codes1 %+ x
add out_buf, tmp2
movzx bits, byte [out_buf]
vmovq ybits %+ x, bits

cmp ptr, in_buf_end
jbe .main_loop

.main_loop_exit:
vmovq rcx, ybits_count %+ x
vmovq bits, ybits %+ x
vmovq rcx, zbits_count %+ x
vmovq bits, zbits %+ x
jmp .finish

.small_codes:
;; Merge overlapping last bytes
vpermq codes4 {k6}{z}, zrot_perm, codes3
vporq codes3, codes3, codes4
kshiftlq k7, k6, 1
ktestd k6, k7
jz .small_codes_next

kandq k6, k6, k7
jmp .small_codes

.long_codes:
add end_ptr, VECTOR_SLOP
sub ptr, VECTOR_SIZE

vpxor ytmp, ytmp, ytmp
vpblendd codes3, ytmp, codes1, 0x55
vpblendd code_lens3, ytmp, code_lens1, 0x55
vpblendd codes4, ytmp, codes2, 0x55
vmovdqa32 codes3 {k1}{z}, codes1
vmovdqa32 code_lens3 {k1}{z}, code_lens1
vmovdqa32 codes4 {k1}{z}, codes2

vpsllvq codes4, codes4, code_lens3
vpxor codes3, codes3, codes4
vpxord codes3, codes3, codes4
vpaddd code_lens3, code_lens1, code_lens2

vpsrlq codes1, codes1, 32
@@ -313,21 +352,28 @@ encode_deflate_icf_ %+ ARCH:
vpsrlq codes2, codes2, 32

vpsllvq codes2, codes2, code_lens1
vpxor codes1, codes1, codes2
vpxord codes1, codes1, codes2

vpsrlq code_lens1, code_lens3, 32
vpblendd code_lens3, ytmp, code_lens3, 0x55
vmovdqa32 code_lens3 {k1}{z}, code_lens3

;; Merge bitbuf bits
vpsllvq codes3, codes3, ybits_count
vpxor codes3, codes3, ybits
vpaddq code_lens3, code_lens3, ybits_count
vpsllvq codes3, codes3, zbits_count
vpxord codes3, codes3, zbits
vpaddq code_lens3, code_lens3, zbits_count
vpaddq code_lens1, code_lens1, code_lens3

xor bits, bits
xor rcx, rcx
vpsubq code_lens1, code_lens1, code_lens3
%rep 2

vmovdqu64 codes2, codes1
vmovdqu64 code_lens2, code_lens1
vmovdqu64 codes4, codes3
vmovdqu64 code_lens4, code_lens3
%assign i 0
%rep 4
%assign i (i + 1)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
cmp out_buf, end_ptr
ja .overflow
@@ -416,15 +462,15 @@ encode_deflate_icf_ %+ ARCH:
and rcx, 7
add ptr, 4

vextracti128 codes3 %+ x, codes3, 1
vextracti128 code_lens3 %+ x, code_lens3, 1
vextracti128 codes1 %+ x, codes1, 1
vextracti128 code_lens1 %+ x, code_lens1, 1
vextracti32x4 codes3 %+ x, codes4, i
vextracti32x4 code_lens3 %+ x, code_lens4, i
vextracti32x4 codes1 %+ x, codes2, i
vextracti32x4 code_lens1 %+ x, code_lens2, i
%endrep
sub end_ptr, VECTOR_SLOP

vmovq ybits %+ x, bits
vmovq ybits_count %+ x, rcx
vmovq zbits %+ x, bits
vmovq zbits_count %+ x, rcx
cmp ptr, in_buf_end
jbe .main_loop

@@ -504,24 +550,55 @@ encode_deflate_icf_ %+ ARCH:
ret

section .data
align 32
align 64
max_write_d:
dd 0x1c, 0x1d, 0x20, 0x20, 0x1e, 0x1e, 0x1e, 0x1e
dd 0x1c, 0x1c, 0x1c, 0x1c, 0x1c, 0x1c, 0x1c, 0x1c
dd 0x1c, 0x1c, 0x1c, 0x1c, 0x1c, 0x1c, 0x1c, 0x1c
min_write_q:
dq 0x00, 0x08, 0x00, 0x08, 0x00, 0x08, 0x00, 0x08
offset_mask:
dq 0x0000000000000007, 0x0000000000000000
dq 0x0000000000000000, 0x0000000000000000
dq 0x0000000000000007, 0x0000000000000007
dq 0x0000000000000007, 0x0000000000000007
dq 0x0000000000000007, 0x0000000000000007
dq 0x0000000000000007, 0x0000000000000007
q_64:
dq 0x0000000000000040, 0x0000000000000000
dq 0x0000000000000040, 0x0000000000000000
dq 0x0000000000000040, 0x0000000000000000
dq 0x0000000000000040, 0x0000000000000000
q_8 :
dq 0x0000000000000000, 0x0000000000000008
dq 0x0000000000000000, 0x0000000000000008
dq 0x0000000000000000, 0x0000000000000008
dq 0x0000000000000000, 0x0000000000000008
lit_mask:
dd LIT_MASK, LIT_MASK, LIT_MASK, LIT_MASK
dd LIT_MASK, LIT_MASK, LIT_MASK, LIT_MASK
dd LIT_MASK, LIT_MASK, LIT_MASK, LIT_MASK
dd LIT_MASK, LIT_MASK, LIT_MASK, LIT_MASK
dist_mask:
dd DIST_MASK, DIST_MASK, DIST_MASK, DIST_MASK
dd DIST_MASK, DIST_MASK, DIST_MASK, DIST_MASK
dd DIST_MASK, DIST_MASK, DIST_MASK, DIST_MASK
dd DIST_MASK, DIST_MASK, DIST_MASK, DIST_MASK
lit_icr_mask:
dd 0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF
dd 0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF
dd 0x00ffffff, 0x00ffffff, 0x00ffffff, 0x00ffffff
dd 0x00ffffff, 0x00ffffff, 0x00ffffff, 0x00ffffff
dd 0x00ffffff, 0x00ffffff, 0x00ffffff, 0x00ffffff
dd 0x00ffffff, 0x00ffffff, 0x00ffffff, 0x00ffffff
eb_icr_mask:
dd 0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF
dd 0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF
dd 0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff
dd 0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff
dd 0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff
dd 0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff
rot_perm:
dq 0x00000007, 0x00000000, 0x00000001, 0x00000002
dq 0x00000003, 0x00000004, 0x00000005, 0x00000006

k_mask_1: dq 0x55555555
k_mask_2: dq 0x11111111
k_mask_3: dq 0xfffffffc
k_mask_4: dw 0x0101, 0x0101, 0x0101, 0x0101
k_mask_5: dq 0xfffffffe

%endif
@@ -60,6 +60,10 @@ extern isal_update_histogram_04
extern encode_deflate_icf_base
extern encode_deflate_icf_04

%ifdef HAVE_AS_KNOWS_AVX512
extern encode_deflate_icf_06
%endif

extern crc32_gzip_base
extern crc32_gzip_01

@@ -82,8 +86,13 @@ mbin_dispatch_init5 isal_deflate_icf_finish, isal_deflate_icf_finish_base, isal_
mbin_interface isal_update_histogram
mbin_dispatch_init5 isal_update_histogram, isal_update_histogram_base, isal_update_histogram_01, isal_update_histogram_01, isal_update_histogram_04

%ifdef HAVE_AS_KNOWS_AVX512
mbin_interface encode_deflate_icf
mbin_dispatch_init6 encode_deflate_icf, encode_deflate_icf_base, encode_deflate_icf_base, encode_deflate_icf_base, encode_deflate_icf_04, encode_deflate_icf_06
%else
mbin_interface encode_deflate_icf
mbin_dispatch_init5 encode_deflate_icf, encode_deflate_icf_base, encode_deflate_icf_base, encode_deflate_icf_base, encode_deflate_icf_04
%endif

mbin_interface crc32_gzip
mbin_dispatch_init5 crc32_gzip, crc32_gzip_base, crc32_gzip_base, crc32_gzip_01, crc32_gzip_01

@@ -118,6 +118,72 @@
%define ymm14x xmm14
%define ymm15x xmm15

%define zmm0x xmm0
%define zmm1x xmm1
%define zmm2x xmm2
%define zmm3x xmm3
%define zmm4x xmm4
%define zmm5x xmm5
%define zmm6x xmm6
%define zmm7x xmm7
%define zmm8x xmm8
%define zmm9x xmm9
%define zmm10x xmm10
%define zmm11x xmm11
%define zmm12x xmm12
%define zmm13x xmm13
%define zmm14x xmm14
%define zmm15x xmm15
%define zmm16x xmm16
%define zmm17x xmm17
%define zmm18x xmm18
%define zmm19x xmm19
%define zmm20x xmm20
%define zmm21x xmm21
%define zmm22x xmm22
%define zmm23x xmm23
%define zmm24x xmm24
%define zmm25x xmm25
%define zmm26x xmm26
%define zmm27x xmm27
%define zmm28x xmm28
%define zmm29x xmm29
%define zmm30x xmm30
%define zmm31x xmm31

%define zmm0y ymm0
%define zmm1y ymm1
%define zmm2y ymm2
%define zmm3y ymm3
%define zmm4y ymm4
%define zmm5y ymm5
%define zmm6y ymm6
%define zmm7y ymm7
%define zmm8y ymm8
%define zmm9y ymm9
%define zmm10y ymm10
%define zmm11y ymm11
%define zmm12y ymm12
%define zmm13y ymm13
%define zmm14y ymm14
%define zmm15y ymm15
%define zmm16y ymm16
%define zmm17y ymm17
%define zmm18y ymm18
%define zmm19y ymm19
%define zmm20y ymm20
%define zmm21y ymm21
%define zmm22y ymm22
%define zmm23y ymm23
%define zmm24y ymm24
%define zmm25y ymm25
%define zmm26y ymm26
%define zmm27y ymm27
%define zmm28y ymm28
%define zmm29y ymm29
%define zmm30y ymm30
%define zmm31y ymm31

%define DWORD(reg) reg %+ d
%define WORD(reg) reg %+ w
%define BYTE(reg) reg %+ b