From 5a55e3096c848d7ae52e89e103ec30d2e7961958 Mon Sep 17 00:00:00 2001
From: Roy Oursler <roy.j.oursler@intel.com>
Date: Mon, 10 Apr 2017 15:05:30 -0700
Subject: [PATCH] igzip: AVX512 version for encode_df
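
Add an AVX-512 implementation of encode_deflate_icf in encode_df_06.asm
(renamed from encode_df_asm.asm), move the AVX2 body into encode_df_04.asm,
register the new routine in the multibinary dispatcher behind
HAVE_AS_KNOWS_AVX512, and add zmm register aliases to reg_sizes.asm.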

Change-Id: I1625a3d7e016805791cfd09e31909562f432fd71
Signed-off-by: Roy Oursler <roy.j.oursler@intel.com>
---
 igzip/Makefile.am                             |   2 +-
 igzip/encode_df_04.asm                        | 528 +++++++++++++++++-
 igzip/{encode_df_asm.asm => encode_df_06.asm} | 311 +++++++----
 igzip/igzip_multibinary.asm                   |   9 +
 include/reg_sizes.asm                         |  66 +++
 5 files changed, 797 insertions(+), 119 deletions(-)
 rename igzip/{encode_df_asm.asm => encode_df_06.asm} (58%)

diff --git a/igzip/Makefile.am b/igzip/Makefile.am
index 62d179d..3c88b5d 100644
--- a/igzip/Makefile.am
+++ b/igzip/Makefile.am
@@ -56,6 +56,7 @@ lsrc_x86_64 +=  \
 		igzip/igzip_decode_block_stateless_04.asm \
 		igzip/igzip_inflate_multibinary.asm \
 		igzip/encode_df_04.asm \
+		igzip/encode_df_06.asm \
 		igzip/proc_heap.asm
 
 src_include += -I $(srcdir)/igzip
@@ -93,7 +94,6 @@ other_src   += 	igzip/bitbuf2.asm  \
 		igzip/inflate_std_vects.h \
 		igzip/flatten_ll.h \
 		igzip/encode_df.h \
-		igzip/encode_df_asm.asm \
 		igzip/heap_macros.asm \
 		igzip/igzip_checksums.h
 
diff --git a/igzip/encode_df_04.asm b/igzip/encode_df_04.asm
index dbf01be..723037f 100644
--- a/igzip/encode_df_04.asm
+++ b/igzip/encode_df_04.asm
@@ -1,4 +1,530 @@
+%include "reg_sizes.asm"
+%include "lz0a_const.asm"
+%include "data_struct2.asm"
+%include "stdmac.asm"
+
 %define ARCH 04
 %define USE_HSWNI
 
-%include "encode_df_asm.asm"
+; tree entry is 4 bytes:
+; lit/len tree (513 entries)
+; |  3  |  2   |  1 | 0 |
+; | len |       code    |
+;
+; dist tree
+; |  3  |  2   |  1 | 0 |
+; |eblen:codlen|   code |
+
+; token format:
+; DIST_OFFSET:0 : lit/len
+; 31:(DIST_OFFSET + 5) : dist Extra Bits
+; (DIST_OFFSET + 5):DIST_OFFSET : dist code
+; lit/len: 0-256 (literal/EOB)
+;          257-512 (len + 254)
+
+; returns final token pointer
+; equal to token_end if successful
+;    uint32_t* encode_df(uint32_t *token_start, uint32_t *token_end,
+;                            BitBuf *out_buf, uint32_t *trees);
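+;
+; Illustrative only (C-style sketch, variable names hypothetical): a token
+; would be unpacked roughly as
+;   lit_len   = token & LIT_MASK;
+;   dist_code = (token >> DIST_OFFSET) & DIST_MASK;
+;   extra     = token >> EXTRA_BITS_OFFSET;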
+
+%ifidn __OUTPUT_FORMAT__, win64
+%define arg1 rcx
+%define arg2 rdx
+%define arg3 r8
+%define arg4 r9
+%define sym		rsi
+%define dsym		rdi
+%define hufftables	r9
+%define ptr		r11
+%else
+; Linux
+%define arg1 rdi
+%define arg2 rsi
+%define arg3 rdx
+%define arg4 rcx
+%define sym		r9
+%define dsym		r8
+%define hufftables	r11
+%define ptr		rdi
+%endif
+
+%define in_buf_end	arg2
+%define bitbuf		arg3
+%define out_buf		bitbuf
+; bit_count is rcx
+%define bits		rax
+%define data		r12
+%define tmp		rbx
+%define len 		dsym
+%define tmp2 		r10
+%define end_ptr		rbp
+
+%define LIT_MASK	((0x1 << LIT_LEN_BIT_COUNT) - 1)
+%define DIST_MASK	((0x1 << DIST_LIT_BIT_COUNT) - 1)
+
+%define codes1		ymm1
+%define code_lens1	ymm2
+%define codes2		ymm3
+%define code_lens2	ymm4
+%define codes3		ymm5
+%define	code_lens3	ymm6
+%define codes4		ymm7
+%define syms		ymm7
+
+%define code_lens4	ymm8
+%define dsyms		ymm8
+
+%define ytmp		ymm9
+%define codes_lookup1	ymm10
+%define	codes_lookup2	ymm11
+%define datas		ymm12
+%define ybits		ymm13
+%define ybits_count	ymm14
+%define yoffset_mask	ymm15
+
+%define VECTOR_SIZE 0x20
+%define VECTOR_LOOP_PROCESSED (2 * VECTOR_SIZE)
+%define VECTOR_SLOP 0x20 - 8
+
+gpr_save_mem_offset	equ	0
+gpr_save_mem_size	equ	8 * 6
+xmm_save_mem_offset	equ	gpr_save_mem_offset + gpr_save_mem_size
+xmm_save_mem_size	equ	10 * 16
+bitbuf_mem_offset	equ	xmm_save_mem_offset + xmm_save_mem_size
+bitbuf_mem_size		equ	8
+stack_size		equ	gpr_save_mem_size + xmm_save_mem_size + bitbuf_mem_size
+
+
+%macro FUNC_SAVE 0
+	sub	rsp, stack_size
+	mov	[rsp + gpr_save_mem_offset + 0*8], rbx
+	mov	[rsp + gpr_save_mem_offset + 1*8], rbp
+	mov	[rsp + gpr_save_mem_offset + 2*8], r12
+
+%ifidn __OUTPUT_FORMAT__, win64
+	mov	[rsp + gpr_save_mem_offset + 3*8], rsi
+	mov	[rsp + gpr_save_mem_offset + 4*8], rdi
+
+	MOVDQU	[rsp + xmm_save_mem_offset + 0*8], xmm6
+	MOVDQU	[rsp + xmm_save_mem_offset + 1*8], xmm7
+	MOVDQU	[rsp + xmm_save_mem_offset + 2*8], xmm8
+	MOVDQU	[rsp + xmm_save_mem_offset + 3*8], xmm9
+	MOVDQU	[rsp + xmm_save_mem_offset + 4*8], xmm10
+	MOVDQU	[rsp + xmm_save_mem_offset + 5*8], xmm11
+	MOVDQU	[rsp + xmm_save_mem_offset + 6*8], xmm12
+	MOVDQU	[rsp + xmm_save_mem_offset + 7*8], xmm13
+	MOVDQU	[rsp + xmm_save_mem_offset + 8*8], xmm14
+	MOVDQU	[rsp + xmm_save_mem_offset + 9*8], xmm15
+%endif
+
+%endm
+
+%macro FUNC_RESTORE 0
+	mov	rbx, [rsp + gpr_save_mem_offset + 0*8]
+	mov	rbp, [rsp + gpr_save_mem_offset + 1*8]
+	mov	r12, [rsp + gpr_save_mem_offset + 2*8]
+
+%ifidn __OUTPUT_FORMAT__, win64
+	mov	rsi, [rsp + gpr_save_mem_offset + 3*8]
+	mov	rdi, [rsp + gpr_save_mem_offset + 4*8]
+
+	MOVDQU	xmm6, [rsp + xmm_save_mem_offset + 0*8]
+	MOVDQU	xmm7, [rsp + xmm_save_mem_offset + 1*8]
+	MOVDQU	xmm8, [rsp + xmm_save_mem_offset + 2*8]
+	MOVDQU	xmm9, [rsp + xmm_save_mem_offset + 3*8]
+	MOVDQU	xmm10, [rsp + xmm_save_mem_offset + 4*8]
+	MOVDQU	xmm11, [rsp + xmm_save_mem_offset + 5*8]
+	MOVDQU	xmm12, [rsp + xmm_save_mem_offset + 6*8]
+	MOVDQU	xmm13, [rsp + xmm_save_mem_offset + 7*8]
+	MOVDQU	xmm14, [rsp + xmm_save_mem_offset + 8*8]
+	MOVDQU	xmm15, [rsp + xmm_save_mem_offset + 9*8]
+%endif
+	add	rsp, stack_size
+
+%endmacro
+
+global encode_deflate_icf_ %+ ARCH
+encode_deflate_icf_ %+ ARCH:
+	FUNC_SAVE
+
+%ifnidn ptr, arg1
+	mov	ptr, arg1
+%endif
+%ifnidn hufftables, arg4
+	mov	hufftables, arg4
+%endif
+
+	mov	[rsp + bitbuf_mem_offset], bitbuf
+	mov	bits, [bitbuf + _m_bits]
+	mov	ecx, [bitbuf + _m_bit_count]
+	mov	end_ptr, [bitbuf + _m_out_end]
+	mov	out_buf, [bitbuf + _m_out_buf]	; clobbers bitbuf
+
+	sub	end_ptr, VECTOR_SLOP
+	sub	in_buf_end, VECTOR_LOOP_PROCESSED
+	cmp	ptr, in_buf_end
+	jge	.finish
+
+	vpcmpeqq	ytmp, ytmp, ytmp
+	vmovdqu	datas, [ptr]
+	vpand	syms, datas, [lit_mask]
+	vpgatherdd codes_lookup1, [hufftables + _lit_len_table + 4 * syms], ytmp
+
+	vpcmpeqq	ytmp, ytmp, ytmp
+	vpsrld	dsyms, datas, DIST_OFFSET
+	vpand	dsyms, dsyms, [dist_mask]
+	vpgatherdd codes_lookup2, [hufftables + _dist_table + 4 * dsyms], ytmp
+
+	vmovq	ybits %+ x, bits
+	vmovq	ybits_count %+ x, rcx
+	vmovdqa	yoffset_mask, [offset_mask]
+
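+	;; Each iteration encodes 8 tokens: lit/len, dist and extra-bit codes
+	;; are merged per dword, pairs of dwords are combined into qwords and
+	;; then dqwords, and the result is appended to out_buf together with
+	;; the carried bit buffer; lookups for the next 8 tokens are issued
+	;; early to hide the gather latency.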
+.main_loop:
+	;; Sets codes1 to contain lit/len codes and code_lens1 the corresponding lengths
+	vpsrld	code_lens1, codes_lookup1, 24
+	vpand	codes1, codes_lookup1, [lit_icr_mask]
+
+	;; Sets codes2 to contain dist codes, code_lens2 the corresponding lengths,
+	;; and code_lens3 the extra bit counts
+	vpblendw	codes2, ybits, codes_lookup2, 0x55 ;Bits 8 and above of ybits are 0
+	vpsrld	code_lens2, codes_lookup2, 24
+	vpsrld	code_lens3, codes_lookup2, 16
+	vpand	code_lens3, [eb_icr_mask]
+
+	;; Set codes3 to contain the extra bits
+	vpsrld	codes3, datas, EXTRA_BITS_OFFSET
+
+	cmp	out_buf, end_ptr
+	ja	.main_loop_exit
+
+	;; Start code lookups for next iteration
+	add	ptr, VECTOR_SIZE
+	vpcmpeqq	ytmp, ytmp, ytmp
+	vmovdqu	datas, [ptr]
+	vpand	syms, datas, [lit_mask]
+	vpgatherdd codes_lookup1, [hufftables + _lit_len_table + 4 * syms], ytmp
+
+	vpcmpeqq	ytmp, ytmp, ytmp
+	vpsrld	dsyms, datas, DIST_OFFSET
+	vpand	dsyms, dsyms, [dist_mask]
+	vpgatherdd codes_lookup2, [hufftables + _dist_table + 4 * dsyms], ytmp
+
+	;; Merge dist code with extra bits
+	vpsllvd	codes3, codes3, code_lens2
+	vpxor	codes2, codes2, codes3
+	vpaddd	code_lens2, code_lens2, code_lens3
+
+	;; Check for long codes
+	vpaddd	code_lens3, code_lens1, code_lens2
+	vpcmpgtd	ytmp, code_lens3, [max_write_d]
+	vptest	ytmp, ytmp
+	jnz	.long_codes
+
+	;; Merge dist and len codes
+	vpsllvd	codes2, codes2, code_lens1
+	vpxor	codes1, codes1, codes2
+
+	;; Split buffer data into qwords, ytmp is 0 after last branch
+	vpblendd codes3, ytmp, codes1, 0x55
+	vpsrlq	codes1, codes1, 32
+	vpsrlq	code_lens1, code_lens3, 32
+	vpblendd	code_lens3, ytmp, code_lens3, 0x55
+
+	;; Merge bitbuf bits
+	vpsllvq codes3, codes3, ybits_count
+	vpxor	codes3, codes3, ybits
+	vpaddq	code_lens3, code_lens3, ybits_count
+
+	;; Merge two symbols into qwords
+	vpsllvq	codes1, codes1, code_lens3
+	vpxor codes1, codes1, codes3
+	vpaddq code_lens1, code_lens1, code_lens3
+
+	;; Split buffer data into dqwords, ytmp is 0 after last branch
+	vpblendd	codes2, ytmp, codes1, 0x33
+	vpblendd	code_lens2, ytmp, code_lens1, 0x33
+	vpsrldq	codes1, 8
+	vpsrldq	code_lens1, 8
+
+	;; Merge two qwords into dqwords
+	vmovdqa	ytmp, [q_64]
+	vpsubq	code_lens3, ytmp, code_lens2
+	vpsrlvq	codes3, codes1, code_lens3
+	vpslldq	codes3, codes3, 8
+
+	vpsllvq	codes1, codes1, code_lens2
+
+	vpxor	codes1, codes1, codes3
+	vpxor	codes1, codes1, codes2
+	vpaddq	code_lens1, code_lens1, code_lens2
+
+	vmovq	tmp, code_lens1 %+ x 	;Number of bytes
+	shr	tmp, 3
+	vpand	ybits_count, code_lens1, yoffset_mask ;Extra bits
+
+	;; bit shift upper dqword combined bits to line up with lower dqword
+	vextracti128	codes2 %+ x, codes1, 1
+	vextracti128	code_lens2 %+ x, code_lens1, 1
+
+	vpbroadcastq	ybits_count, ybits_count %+ x
+	vpsrldq	codes3, codes2, 1
+	vpsllvq	codes2, codes2, ybits_count
+	vpsllvq	codes3, codes3, ybits_count
+	vpslldq	codes3, codes3, 1
+	vpor	codes2, codes2, codes3
+
+	; Write out lower dqword of combined bits
+	vmovdqu	[out_buf], codes1
+	movzx	bits, byte [out_buf + tmp]
+	vmovq	codes1 %+ x, bits
+	vpaddq	code_lens1, code_lens1, code_lens2
+
+	vmovq	tmp2, code_lens1 %+ x	;Number of bytes
+	shr	tmp2, 3
+	vpand	ybits_count, code_lens1, yoffset_mask ;Extra bits
+
+	; Write out upper dqword of combined bits
+	vpor	codes1 %+ x, codes1 %+ x, codes2 %+ x
+	vmovdqu	[out_buf + tmp], codes1 %+ x
+	add	out_buf, tmp2
+	movzx	bits, byte [out_buf]
+	vmovq	ybits %+ x, bits
+
+	cmp	ptr, in_buf_end
+	jbe	.main_loop
+
+.main_loop_exit:
+	vmovq	rcx, ybits_count %+ x
+	vmovq	bits, ybits %+ x
+	jmp	.finish
+
+.long_codes:
+	add	end_ptr, VECTOR_SLOP
+	sub	ptr, VECTOR_SIZE
+
+	vpxor ytmp, ytmp, ytmp
+	vpblendd codes3, ytmp, codes1, 0x55
+	vpblendd code_lens3, ytmp, code_lens1, 0x55
+	vpblendd codes4, ytmp, codes2, 0x55
+
+	vpsllvq	codes4, codes4, code_lens3
+	vpxor	codes3, codes3, codes4
+	vpaddd	code_lens3, code_lens1, code_lens2
+
+	vpsrlq	codes1, codes1, 32
+	vpsrlq	code_lens1, code_lens1, 32
+	vpsrlq	codes2, codes2, 32
+
+	vpsllvq	codes2, codes2, code_lens1
+	vpxor codes1, codes1, codes2
+
+	vpsrlq code_lens1, code_lens3, 32
+	vpblendd	code_lens3, ytmp, code_lens3, 0x55
+
+	;; Merge bitbuf bits
+	vpsllvq codes3, codes3, ybits_count
+	vpxor	codes3, codes3, ybits
+	vpaddq	code_lens3, code_lens3, ybits_count
+	vpaddq code_lens1, code_lens1, code_lens3
+
+	xor	bits, bits
+	xor	rcx, rcx
+	vpsubq code_lens1, code_lens1, code_lens3
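+	;; Scalar fallback for long codes: each %rep pass below emits four
+	;; codes from the low 128 bits, flushing whole bytes of the bit buffer
+	;; after each, then moves the upper 128-bit lane down for the next pass.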
+%rep 2
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+	cmp	out_buf, end_ptr
+	ja	.overflow
+	;; insert LL code
+	vmovq	sym, codes3 %+ x
+	vmovq	tmp2, code_lens3 %+ x
+	SHLX	sym, sym, rcx
+	or	bits, sym
+	add	rcx, tmp2
+
+	; empty bits
+	mov	[out_buf], bits
+	mov	tmp, rcx
+	shr	tmp, 3		; byte count
+	add	out_buf, tmp
+	mov	tmp, rcx
+	and	rcx, ~7
+	SHRX	bits, bits, rcx
+	mov	rcx, tmp
+	and	rcx, 7
+	add	ptr, 4
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+	cmp	out_buf, end_ptr
+	ja	.overflow
+	;; insert LL code
+	vmovq	sym, codes1 %+ x
+	vmovq	tmp2, code_lens1 %+ x
+	SHLX	sym, sym, rcx
+	or	bits, sym
+	add	rcx, tmp2
+
+	; empty bits
+	mov	[out_buf], bits
+	mov	tmp, rcx
+	shr	tmp, 3		; byte count
+	add	out_buf, tmp
+	mov	tmp, rcx
+	and	rcx, ~7
+	SHRX	bits, bits, rcx
+	mov	rcx, tmp
+	and	rcx, 7
+	add	ptr, 4
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+	cmp	out_buf, end_ptr
+	ja	.overflow
+	;; insert LL code
+	vpextrq	sym, codes3 %+ x, 1
+	vpextrq	tmp2, code_lens3 %+ x, 1
+	SHLX	sym, sym, rcx
+	or	bits, sym
+	add	rcx, tmp2
+
+	; empty bits
+	mov	[out_buf], bits
+	mov	tmp, rcx
+	shr	tmp, 3		; byte count
+	add	out_buf, tmp
+	mov	tmp, rcx
+	and	rcx, ~7
+	SHRX	bits, bits, rcx
+	mov	rcx, tmp
+	and	rcx, 7
+	add	ptr, 4
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+	cmp	out_buf, end_ptr
+	ja	.overflow
+	;; insert LL code
+	vpextrq	sym, codes1 %+ x, 1
+	vpextrq	tmp2, code_lens1 %+ x, 1
+	SHLX	sym, sym, rcx
+	or	bits, sym
+	add	rcx, tmp2
+
+	; empty bits
+	mov	[out_buf], bits
+	mov	tmp, rcx
+	shr	tmp, 3		; byte count
+	add	out_buf, tmp
+	mov	tmp, rcx
+	and	rcx, ~7
+	SHRX	bits, bits, rcx
+	mov	rcx, tmp
+	and	rcx, 7
+	add	ptr, 4
+
+	vextracti128 codes3 %+ x, codes3, 1
+	vextracti128 code_lens3 %+ x, code_lens3, 1
+	vextracti128 codes1 %+ x, codes1, 1
+	vextracti128 code_lens1 %+ x, code_lens1, 1
+%endrep
+	sub	end_ptr, VECTOR_SLOP
+
+	vmovq	ybits %+ x, bits
+	vmovq	ybits_count %+ x, rcx
+	cmp	ptr, in_buf_end
+	jbe	.main_loop
+
+.finish:
+	add	in_buf_end, VECTOR_LOOP_PROCESSED
+	add	end_ptr, VECTOR_SLOP
+
+	cmp	ptr, in_buf_end
+	jge	.overflow
+
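+	;; Scalar tail: encode one token per iteration until in_buf_end is
+	;; reached or the output buffer fills.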
+.finish_loop:
+	mov	DWORD(data), [ptr]
+
+	cmp	out_buf, end_ptr
+	ja	.overflow
+
+	mov	sym, data
+	and	sym, LIT_MASK	; sym has ll_code
+	mov	DWORD(sym), [hufftables + _lit_len_table + sym * 4]
+
+	; look up dist sym
+	mov	dsym, data
+	shr	dsym, DIST_OFFSET
+	and	dsym, DIST_MASK
+	mov	DWORD(dsym), [hufftables + _dist_table + dsym * 4]
+
+	; insert LL code
+	; sym: 31:24 length; 23:0 code
+	mov	tmp2, sym
+	and	sym, 0xFFFFFF
+	SHLX	sym, sym, rcx
+	shr	tmp2, 24
+	or	bits, sym
+	add	rcx, tmp2
+
+	; insert dist code
+	movzx	tmp, WORD(dsym)
+	SHLX	tmp, tmp, rcx
+	or	bits, tmp
+	mov	tmp, dsym
+	shr	tmp, 24
+	add	rcx, tmp
+
+	; insert dist extra bits
+	shr	data, EXTRA_BITS_OFFSET
+	add	ptr, 4
+	SHLX	data, data, rcx
+	or	bits, data
+	shr	dsym, 16
+	and	dsym, 0xFF
+	add	rcx, dsym
+
+	; empty bits
+	mov	[out_buf], bits
+	mov	tmp, rcx
+	shr	tmp, 3		; byte count
+	add	out_buf, tmp
+	mov	tmp, rcx
+	and	rcx, ~7
+	SHRX	bits, bits, rcx
+	mov	rcx, tmp
+	and	rcx, 7
+
+	cmp	ptr, in_buf_end
+	jb	.finish_loop
+
+.overflow:
+	mov	tmp, [rsp + bitbuf_mem_offset]
+	mov	[tmp + _m_bits], bits
+	mov	[tmp + _m_bit_count], ecx
+	mov	[tmp + _m_out_buf], out_buf
+
+	mov	rax, ptr
+
+	FUNC_RESTORE
+
+	ret
+
+section .data
+	align 32
+max_write_d:
+	dd	0x1c, 0x1d, 0x20, 0x20, 0x1e, 0x1e, 0x1e, 0x1e
+offset_mask:
+	dq	0x0000000000000007, 0x0000000000000000
+	dq	0x0000000000000000, 0x0000000000000000
+q_64:
+	dq	0x0000000000000040, 0x0000000000000000
+	dq	0x0000000000000040, 0x0000000000000000
+lit_mask:
+	dd LIT_MASK, LIT_MASK, LIT_MASK, LIT_MASK
+	dd LIT_MASK, LIT_MASK, LIT_MASK, LIT_MASK
+dist_mask:
+	dd DIST_MASK, DIST_MASK, DIST_MASK, DIST_MASK
+	dd DIST_MASK, DIST_MASK, DIST_MASK, DIST_MASK
+lit_icr_mask:
+	dd 0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF
+	dd 0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF
+eb_icr_mask:
+	dd 0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF
+	dd 0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF
diff --git a/igzip/encode_df_asm.asm b/igzip/encode_df_06.asm
similarity index 58%
rename from igzip/encode_df_asm.asm
rename to igzip/encode_df_06.asm
index 62ada29..e958fd1 100644
--- a/igzip/encode_df_asm.asm
+++ b/igzip/encode_df_06.asm
@@ -3,6 +3,11 @@
 %include "data_struct2.asm"
 %include "stdmac.asm"
 
+%ifdef HAVE_AS_KNOWS_AVX512
+
+%define ARCH 06
+%define USE_HSWNI
+
 ; tree entry is 4 bytes:
 ; lit/len tree (513 entries)
 ; |  3  |  2   |  1 | 0 |
@@ -59,29 +64,40 @@
 %define LIT_MASK	((0x1 << LIT_LEN_BIT_COUNT) - 1)
 %define DIST_MASK	((0x1 << DIST_LIT_BIT_COUNT) - 1)
 
-%define codes1		ymm1
-%define code_lens1	ymm2
-%define codes2		ymm3
-%define code_lens2	ymm4
-%define codes3		ymm5
-%define	code_lens3	ymm6
-%define codes4		ymm7
-%define syms		ymm7
+%define codes1		zmm1
+%define code_lens1	zmm2
+%define codes2		zmm3
+%define code_lens2	zmm4
+%define codes3		zmm5
+%define ztmp		zmm5
+%define	code_lens3	zmm6
+%define codes4		zmm7
+%define syms		zmm7
 
-%define code_lens4	ymm8
-%define dsyms		ymm8
+%define code_lens4	zmm8
+%define dsyms		zmm8
+%define zbits_count_q	zmm8
 
-%define ytmp		ymm9
-%define codes_lookup1	ymm10
-%define	codes_lookup2	ymm11
-%define datas		ymm12
-%define ybits		ymm13
-%define ybits_count	ymm14
-%define yoffset_mask	ymm15
+%define codes_lookup1	zmm9
+%define	codes_lookup2	zmm10
+%define datas		zmm11
+%define zbits		zmm12
+%define zbits_count	zmm13
+%define zoffset_mask	zmm14
 
-%define VECTOR_SIZE 0x20
+%define zq_64		zmm15
+%define zlit_mask	zmm16
+%define zdist_mask	zmm17
+%define zlit_icr_mask	zmm18
+%define zeb_icr_mask	zmm19
+%define zmax_write	zmm20
+%define zrot_perm	zmm21
+%define zq_8		zmm22
+%define zmin_write	zmm23
+
+%define VECTOR_SIZE 0x40
 %define VECTOR_LOOP_PROCESSED (2 * VECTOR_SIZE)
-%define VECTOR_SLOP 0x20 - 8
+%define VECTOR_SLOP 0x40 - 8
 
 gpr_save_mem_offset	equ	0
 gpr_save_mem_size	equ	8 * 6
@@ -162,31 +178,48 @@ encode_deflate_icf_ %+ ARCH:
 	cmp	ptr, in_buf_end
 	jge	.finish
 
-	vpcmpeqq	ytmp, ytmp, ytmp
-	vmovdqu	datas, [ptr]
-	vpand	syms, datas, [lit_mask]
-	vpgatherdd codes_lookup1, [hufftables + _lit_len_table + 4 * syms], ytmp
+	kxorq	k0, k0, k0
+	kmovq	k1, [k_mask_1]
+	kmovq	k2, [k_mask_2]
+	kmovq	k3, [k_mask_3]
+	kmovq	k4, [k_mask_4]
+	kmovq	k5, [k_mask_5]
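+	;; Opmask registers replace the AVX2 blend/gather masks: k0 stays zero
+	;; so knotq of it produces the all-ones masks used by the gathers.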
 
-	vpcmpeqq	ytmp, ytmp, ytmp
+	vmovdqa64 zoffset_mask, [offset_mask]
+	vmovdqa64 zlit_mask, [lit_mask]
+	vmovdqa64 zdist_mask, [dist_mask]
+	vmovdqa64 zlit_icr_mask, [lit_icr_mask]
+	vmovdqa64 zeb_icr_mask, [eb_icr_mask]
+	vmovdqa64 zmax_write, [max_write_d]
+	vmovdqa64 zq_64, [q_64]
+	vmovdqa64 zrot_perm, [rot_perm]
+	vmovdqa64 zq_8, [q_8]
+	vmovdqa64 zmin_write, [min_write_q]
+
+	knotq	k6, k0
+	vmovdqu64	datas, [ptr]
+	vpandd	syms, datas, [lit_mask]
+	vpgatherdd codes_lookup1 {k6}, [hufftables + _lit_len_table + 4 * syms]
+
+	knotq	k7, k0
 	vpsrld	dsyms, datas, DIST_OFFSET
-	vpand	dsyms, dsyms, [dist_mask]
-	vpgatherdd codes_lookup2, [hufftables + _dist_table + 4 * dsyms], ytmp
+	vpandd	dsyms, dsyms, [dist_mask]
+	vpgatherdd codes_lookup2 {k7}, [hufftables + _dist_table + 4 * dsyms]
 
-	vmovq	ybits %+ x, bits
-	vmovq	ybits_count %+ x, rcx
-	vmovdqa	yoffset_mask, [offset_mask]
+	vmovq	zbits %+ x, bits
+	vmovq	zbits_count %+ x, rcx
 
 .main_loop:
 	;; Sets codes1 to contain lit/len codes and code_lens1 the corresponding lengths
 	vpsrld	code_lens1, codes_lookup1, 24
-	vpand	codes1, codes_lookup1, [lit_icr_mask]
+	vpandd	codes1, codes_lookup1, zlit_icr_mask
 
 	;; Sets codes2 to contain dist codes, code_lens2 the corresponding lengths,
 	;; and code_lens3 the extra bit counts
-	vpblendw	codes2, ybits, codes_lookup2, 0x55 ;Bits 8 and above of ybits are 0
+	vmovdqu16	codes2 {k1}{z}, codes_lookup2 ;Keep low word of each dword, zero the rest
 	vpsrld	code_lens2, codes_lookup2, 24
 	vpsrld	code_lens3, codes_lookup2, 16
-	vpand	code_lens3, [eb_icr_mask]
+	vpandd	code_lens3, code_lens3, zeb_icr_mask
 
 	;; Set codes3 to contain the extra bits
 	vpsrld	codes3, datas, EXTRA_BITS_OFFSET
@@ -195,117 +228,123 @@ encode_deflate_icf_ %+ ARCH:
 	ja	.main_loop_exit
 
 	;; Start code lookups for next iteration
+	knotq	k6, k0
 	add	ptr, VECTOR_SIZE
-	vpcmpeqq	ytmp, ytmp, ytmp
-	vmovdqu	datas, [ptr]
-	vpand	syms, datas, [lit_mask]
-	vpgatherdd codes_lookup1, [hufftables + _lit_len_table + 4 * syms], ytmp
+	vmovdqu64	datas, [ptr]
+	vpandd	syms, datas, zlit_mask
+	vpgatherdd codes_lookup1 {k6}, [hufftables + _lit_len_table + 4 * syms]
 
-	vpcmpeqq	ytmp, ytmp, ytmp
+	knotq	k7, k0
 	vpsrld	dsyms, datas, DIST_OFFSET
-	vpand	dsyms, dsyms, [dist_mask]
-	vpgatherdd codes_lookup2, [hufftables + _dist_table + 4 * dsyms], ytmp
+	vpandd	dsyms, dsyms, zdist_mask
+	vpgatherdd codes_lookup2 {k7}, [hufftables + _dist_table + 4 * dsyms]
 
 	;; Merge dist code with extra bits
 	vpsllvd	codes3, codes3, code_lens2
-	vpxor	codes2, codes2, codes3
+	vpxord	codes2, codes2, codes3
 	vpaddd	code_lens2, code_lens2, code_lens3
 
 	;; Check for long codes
 	vpaddd	code_lens3, code_lens1, code_lens2
-	vpcmpgtd	ytmp, code_lens3, [max_write_d]
-	vptest	ytmp, ytmp
+	vpcmpgtd	k6, code_lens3, zmax_write
+	ktestd	k6, k6
 	jnz	.long_codes
 
 	;; Merge dist and len codes
 	vpsllvd	codes2, codes2, code_lens1
-	vpxor	codes1, codes1, codes2
+	vpxord	codes1, codes1, codes2
 
-	;; Split buffer data into qwords, ytmp is 0 after last branch
-	vpblendd codes3, ytmp, codes1, 0x55
+	vmovdqa32 codes3 {k1}{z}, codes1
 	vpsrlq	codes1, codes1, 32
 	vpsrlq	code_lens1, code_lens3, 32
-	vpblendd	code_lens3, ytmp, code_lens3, 0x55
+	vmovdqa32	code_lens3 {k1}{z}, code_lens3
 
 	;; Merge bitbuf bits
-	vpsllvq codes3, codes3, ybits_count
-	vpxor	codes3, codes3, ybits
-	vpaddq	code_lens3, code_lens3, ybits_count
+	vpsllvq codes3, codes3, zbits_count
+	vpxord	codes3, codes3, zbits
+	vpaddq	code_lens3, code_lens3, zbits_count
 
 	;; Merge two symbols into qwords
 	vpsllvq	codes1, codes1, code_lens3
-	vpxor codes1, codes1, codes3
+	vpxord codes1, codes1, codes3
 	vpaddq code_lens1, code_lens1, code_lens3
 
-	;; Split buffer data into dqwords, ytmp is 0 after last branch
-	vpblendd	codes2, ytmp, codes1, 0x33
-	vpblendd	code_lens2, ytmp, code_lens1, 0x33
-	vpsrldq	codes1, 8
-	vpsrldq	code_lens1, 8
+	;; Determine total bits at end of each qword
+	kshiftlq k7, k3, 2
+	vpermq	zbits_count {k5}{z}, zrot_perm, code_lens1
+	vpaddq	code_lens2, zbits_count, code_lens1
+	vshufi64x2 zbits_count {k3}{z}, code_lens2, code_lens2, 0x90
+	vpaddq	code_lens2, code_lens2, zbits_count
+	vshufi64x2 zbits_count {k7}{z}, code_lens2, code_lens2, 0x40
+	vpaddq	code_lens2, code_lens2, zbits_count
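+	;; (log-step prefix sum over the eight qword lanes: rotate by one,
+	;; then two, then four lanes and accumulate, leaving each lane with the
+	;; running bit count through itself)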
 
-	;; Merge two qwords into dqwords
-	vmovdqa	ytmp, [q_64]
-	vpsubq	code_lens3, ytmp, code_lens2
-	vpsrlvq	codes3, codes1, code_lens3
-	vpslldq	codes3, codes3, 8
+	;; Bit align quadwords
+	vpandd	zbits_count, code_lens2, zoffset_mask
+	vpermq	zbits_count_q {k5}{z}, zrot_perm, zbits_count
+	vpsllvq	codes1, codes1, zbits_count_q
 
-	vpsllvq	codes1, codes1, code_lens2
+	;; Get last byte in each qword
+	vpsrlq	code_lens2, code_lens2, 3
+	vpaddq	code_lens1, code_lens1, zbits_count_q
+	vpsrlq	code_lens1, code_lens1, 3
+	vpaddq	code_lens1, code_lens1, zq_8
+	vpshufb	codes3 {k4}{z}, codes1, code_lens1
 
-	vpxor	codes1, codes1, codes3
-	vpxor	codes1, codes1, codes2
-	vpaddq	code_lens1, code_lens1, code_lens2
+	;; Check whether any of the last bytes overlap
+	vpcmpq k6 {k5}, code_lens1, zmin_write, 0
+	ktestd k6, k6
+	jnz .small_codes
 
-	vmovq	tmp, code_lens1 %+ x 	;Number of bytes
-	shr	tmp, 3
-	vpand	ybits_count, code_lens1, yoffset_mask ;Extra bits
+.small_codes_next:
+	;; Save off zbits and zbits_count for next loop
+	knotq	k7, k5
+	vpermq	zbits {k7}{z}, zrot_perm, codes3
+	vpermq	zbits_count {k7}{z}, zrot_perm, zbits_count
 
-	;; bit shift upper dqword combined bits to line up with lower dqword
-	vextracti128	codes2 %+ x, codes1, 1
-	vextracti128	code_lens2 %+ x, code_lens1, 1
+	;; Merge last byte in each qword with the next qword
+	vpermq	codes3 {k5}{z}, zrot_perm, codes3
+	vpxord codes1, codes1, codes3
 
-	vpbroadcastq	ybits_count, ybits_count %+ x
-	vpsrldq	codes3, codes2, 1
-	vpsllvq	codes2, codes2, ybits_count
-	vpsllvq	codes3, codes3, ybits_count
-	vpslldq	codes3, codes3, 1
-	vpor	codes2, codes2, codes3
+	;; Determine total bytes written
+	vextracti64x2 code_lens1 %+ x, code_lens2, 3
+	vpextrq tmp2, code_lens1 %+ x, 1
 
-	; Write out lower dqword of combined bits
-	vmovdqu	[out_buf], codes1
-	movzx	bits, byte [out_buf + tmp]
-	vmovq	codes1 %+ x, bits
-	vpaddq	code_lens1, code_lens1, code_lens2
+	;; Write out qwords
+	knotq	k6, k0
+	vpermq code_lens2 {k5}{z}, zrot_perm, code_lens2
+	vpscatterqq [out_buf + code_lens2] {k6}, codes1
 
-	vmovq	tmp2, code_lens1 %+ x	;Number of bytes
-	shr	tmp2, 3
-	vpand	ybits_count, code_lens1, yoffset_mask ;Extra bits
-
-	; Write out upper dqword of combined bits
-	vpor	codes1 %+ x, codes1 %+ x, codes2 %+ x
-	vmovdqu	[out_buf + tmp], codes1 %+ x
 	add	out_buf, tmp2
-	movzx	bits, byte [out_buf]
-	vmovq	ybits %+ x, bits
 
 	cmp	ptr, in_buf_end
 	jbe	.main_loop
 
 .main_loop_exit:
-	vmovq	rcx, ybits_count %+ x
-	vmovq	bits, ybits %+ x
+	vmovq	rcx, zbits_count %+ x
+	vmovq	bits, zbits %+ x
 	jmp	.finish
 
+.small_codes:
+	;; Merge overlapping last bytes
+	vpermq	codes4 {k6}{z}, zrot_perm, codes3
+	vporq codes3, codes3, codes4
+	kshiftlq k7, k6, 1
+	ktestd k6, k7
+	jz .small_codes_next
+
+	kandq k6, k6, k7
+	jmp .small_codes
+
 .long_codes:
 	add	end_ptr, VECTOR_SLOP
 	sub	ptr, VECTOR_SIZE
 
-	vpxor ytmp, ytmp, ytmp
-	vpblendd codes3, ytmp, codes1, 0x55
-	vpblendd code_lens3, ytmp, code_lens1, 0x55
-	vpblendd codes4, ytmp, codes2, 0x55
+	vmovdqa32 codes3 {k1}{z}, codes1
+	vmovdqa32 code_lens3 {k1}{z}, code_lens1
+	vmovdqa32 codes4 {k1}{z}, codes2
 
 	vpsllvq	codes4, codes4, code_lens3
-	vpxor	codes3, codes3, codes4
+	vpxord	codes3, codes3, codes4
 	vpaddd	code_lens3, code_lens1, code_lens2
 
 	vpsrlq	codes1, codes1, 32
@@ -313,21 +352,28 @@ encode_deflate_icf_ %+ ARCH:
 	vpsrlq	codes2, codes2, 32
 
 	vpsllvq	codes2, codes2, code_lens1
-	vpxor codes1, codes1, codes2
+	vpxord codes1, codes1, codes2
 
 	vpsrlq code_lens1, code_lens3, 32
-	vpblendd	code_lens3, ytmp, code_lens3, 0x55
+	vmovdqa32	code_lens3 {k1}{z}, code_lens3
 
 	;; Merge bitbuf bits
-	vpsllvq codes3, codes3, ybits_count
-	vpxor	codes3, codes3, ybits
-	vpaddq	code_lens3, code_lens3, ybits_count
+	vpsllvq codes3, codes3, zbits_count
+	vpxord	codes3, codes3, zbits
+	vpaddq	code_lens3, code_lens3, zbits_count
 	vpaddq code_lens1, code_lens1, code_lens3
 
 	xor	bits, bits
 	xor	rcx, rcx
 	vpsubq code_lens1, code_lens1, code_lens3
-%rep 2
+
+	vmovdqu64 codes2, codes1
+	vmovdqu64 code_lens2, code_lens1
+	vmovdqu64 codes4, codes3
+	vmovdqu64 code_lens4, code_lens3
+%assign i 0
+%rep 4
+%assign i (i + 1)
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 	cmp	out_buf, end_ptr
 	ja	.overflow
@@ -416,15 +462,15 @@ encode_deflate_icf_ %+ ARCH:
 	and	rcx, 7
 	add	ptr, 4
 
-	vextracti128 codes3 %+ x, codes3, 1
-	vextracti128 code_lens3 %+ x, code_lens3, 1
-	vextracti128 codes1 %+ x, codes1, 1
-	vextracti128 code_lens1 %+ x, code_lens1, 1
+	vextracti32x4 codes3 %+ x, codes4, i
+	vextracti32x4 code_lens3 %+ x, code_lens4, i
+	vextracti32x4 codes1 %+ x, codes2, i
+	vextracti32x4 code_lens1 %+ x, code_lens2, i
 %endrep
 	sub	end_ptr, VECTOR_SLOP
 
-	vmovq	ybits %+ x, bits
-	vmovq	ybits_count %+ x, rcx
+	vmovq	zbits %+ x, bits
+	vmovq	zbits_count %+ x, rcx
 	cmp	ptr, in_buf_end
 	jbe	.main_loop
 
@@ -504,24 +550,55 @@ encode_deflate_icf_ %+ ARCH:
 	ret
 
 section .data
-	align 32
+	align 64
 max_write_d:
-	dd	0x1c, 0x1d, 0x20, 0x20, 0x1e, 0x1e, 0x1e, 0x1e
+	dd	0x1c, 0x1c, 0x1c, 0x1c, 0x1c, 0x1c, 0x1c, 0x1c
+	dd	0x1c, 0x1c, 0x1c, 0x1c, 0x1c, 0x1c, 0x1c, 0x1c
+min_write_q:
+	dq	0x00, 0x08, 0x00, 0x08, 0x00, 0x08, 0x00, 0x08
 offset_mask:
-	dq	0x0000000000000007, 0x0000000000000000
-	dq	0x0000000000000000, 0x0000000000000000
+	dq	0x0000000000000007, 0x0000000000000007
+	dq	0x0000000000000007, 0x0000000000000007
+	dq	0x0000000000000007, 0x0000000000000007
+	dq	0x0000000000000007, 0x0000000000000007
 q_64:
 	dq	0x0000000000000040, 0x0000000000000000
 	dq	0x0000000000000040, 0x0000000000000000
+	dq	0x0000000000000040, 0x0000000000000000
+	dq	0x0000000000000040, 0x0000000000000000
+q_8:
+	dq	0x0000000000000000, 0x0000000000000008
+	dq	0x0000000000000000, 0x0000000000000008
+	dq	0x0000000000000000, 0x0000000000000008
+	dq	0x0000000000000000, 0x0000000000000008
 lit_mask:
 	dd LIT_MASK, LIT_MASK, LIT_MASK, LIT_MASK
 	dd LIT_MASK, LIT_MASK, LIT_MASK, LIT_MASK
+	dd LIT_MASK, LIT_MASK, LIT_MASK, LIT_MASK
+	dd LIT_MASK, LIT_MASK, LIT_MASK, LIT_MASK
 dist_mask:
 	dd DIST_MASK, DIST_MASK, DIST_MASK, DIST_MASK
 	dd DIST_MASK, DIST_MASK, DIST_MASK, DIST_MASK
+	dd DIST_MASK, DIST_MASK, DIST_MASK, DIST_MASK
+	dd DIST_MASK, DIST_MASK, DIST_MASK, DIST_MASK
 lit_icr_mask:
-	dd 0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF
-	dd 0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF
+	dd 0x00ffffff, 0x00ffffff, 0x00ffffff, 0x00ffffff
+	dd 0x00ffffff, 0x00ffffff, 0x00ffffff, 0x00ffffff
+	dd 0x00ffffff, 0x00ffffff, 0x00ffffff, 0x00ffffff
+	dd 0x00ffffff, 0x00ffffff, 0x00ffffff, 0x00ffffff
 eb_icr_mask:
-	dd 0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF
-	dd 0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF
+	dd 0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff
+	dd 0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff
+	dd 0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff
+	dd 0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff
+rot_perm:
+	dq 0x00000007, 0x00000000, 0x00000001, 0x00000002
+	dq 0x00000003, 0x00000004, 0x00000005, 0x00000006
+
+k_mask_1: dq 0x55555555
+k_mask_2: dq 0x11111111
+k_mask_3: dq 0xfffffffc
+k_mask_4: dw 0x0101, 0x0101, 0x0101, 0x0101
+k_mask_5: dq 0xfffffffe
+
+%endif
diff --git a/igzip/igzip_multibinary.asm b/igzip/igzip_multibinary.asm
index 8387ab0..b1163a4 100644
--- a/igzip/igzip_multibinary.asm
+++ b/igzip/igzip_multibinary.asm
@@ -60,6 +60,10 @@ extern isal_update_histogram_04
 extern encode_deflate_icf_base
 extern encode_deflate_icf_04
 
+%ifdef HAVE_AS_KNOWS_AVX512
+extern encode_deflate_icf_06
+%endif
+
 extern crc32_gzip_base
 extern crc32_gzip_01
 
@@ -82,8 +86,13 @@ mbin_dispatch_init5	isal_deflate_icf_finish, isal_deflate_icf_finish_base, isal_
 mbin_interface		isal_update_histogram
 mbin_dispatch_init5	isal_update_histogram, isal_update_histogram_base, isal_update_histogram_01, isal_update_histogram_01, isal_update_histogram_04
 
+%ifdef HAVE_AS_KNOWS_AVX512
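+; Register the AVX-512 routine only when the assembler supports it;
+; otherwise keep the 5-entry dispatch table below.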
+mbin_interface		encode_deflate_icf
+mbin_dispatch_init6	encode_deflate_icf, encode_deflate_icf_base, encode_deflate_icf_base, encode_deflate_icf_base, encode_deflate_icf_04, encode_deflate_icf_06
+%else
 mbin_interface		encode_deflate_icf
 mbin_dispatch_init5	encode_deflate_icf, encode_deflate_icf_base, encode_deflate_icf_base, encode_deflate_icf_base, encode_deflate_icf_04
+%endif
 
 mbin_interface		crc32_gzip
 mbin_dispatch_init5	crc32_gzip, crc32_gzip_base, crc32_gzip_base, crc32_gzip_01, crc32_gzip_01
diff --git a/include/reg_sizes.asm b/include/reg_sizes.asm
index cd689b7..bd44559 100644
--- a/include/reg_sizes.asm
+++ b/include/reg_sizes.asm
@@ -118,6 +118,72 @@
 %define ymm14x xmm14
 %define ymm15x xmm15
 
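+; Aliases so AVX-512 code can refer to the xmm/ymm views of each zmm
+; register with the same reg %+ x / reg %+ y convention used above.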
+%define zmm0x xmm0
+%define zmm1x xmm1
+%define zmm2x xmm2
+%define zmm3x xmm3
+%define zmm4x xmm4
+%define zmm5x xmm5
+%define zmm6x xmm6
+%define zmm7x xmm7
+%define zmm8x xmm8
+%define zmm9x xmm9
+%define zmm10x xmm10
+%define zmm11x xmm11
+%define zmm12x xmm12
+%define zmm13x xmm13
+%define zmm14x xmm14
+%define zmm15x xmm15
+%define zmm16x xmm16
+%define zmm17x xmm17
+%define zmm18x xmm18
+%define zmm19x xmm19
+%define zmm20x xmm20
+%define zmm21x xmm21
+%define zmm22x xmm22
+%define zmm23x xmm23
+%define zmm24x xmm24
+%define zmm25x xmm25
+%define zmm26x xmm26
+%define zmm27x xmm27
+%define zmm28x xmm28
+%define zmm29x xmm29
+%define zmm30x xmm30
+%define zmm31x xmm31
+
+%define zmm0y ymm0
+%define zmm1y ymm1
+%define zmm2y ymm2
+%define zmm3y ymm3
+%define zmm4y ymm4
+%define zmm5y ymm5
+%define zmm6y ymm6
+%define zmm7y ymm7
+%define zmm8y ymm8
+%define zmm9y ymm9
+%define zmm10y ymm10
+%define zmm11y ymm11
+%define zmm12y ymm12
+%define zmm13y ymm13
+%define zmm14y ymm14
+%define zmm15y ymm15
+%define zmm16y ymm16
+%define zmm17y ymm17
+%define zmm18y ymm18
+%define zmm19y ymm19
+%define zmm20y ymm20
+%define zmm21y ymm21
+%define zmm22y ymm22
+%define zmm23y ymm23
+%define zmm24y ymm24
+%define zmm25y ymm25
+%define zmm26y ymm26
+%define zmm27y ymm27
+%define zmm28y ymm28
+%define zmm29y ymm29
+%define zmm30y ymm30
+%define zmm31y ymm31
+
 %define DWORD(reg) reg %+ d
 %define WORD(reg)  reg %+ w
 %define BYTE(reg)  reg %+ b