crc: release crc32_gzip_refl code outside

Merge crc32_gzip_refl function definitions, base code, multi-binary code into crc32.h, crc32_base,c and crc_multibinary.asm in order to keep consistency. Add crc32_gzip_refl files into crc/Makefile.am Original crc32_gzip_refl removed NOT operation, re-add it. Change-Id: Ib0cbbeb1ab3c9fcafec324b392596d2514202424 Signed-off-by: Xiaodong Liu <xiaodong.liu@intel.com>
2017-02-24 02:00:24 -05:00 · 2017-02-24 02:00:24 -05:00 · 7137c4a5be
commit 7137c4a5be
parent 39ce870235
8 changed files with 994 additions and 3 deletions
--- a/crc/Makefile.am
+++ b/crc/Makefile.am
@ -48,7 +48,8 @@ lsrc_x86_64 += \
 	crc/crc64_iso_refl_by8.asm \
 	crc/crc64_iso_norm_by8.asm \
 	crc/crc64_jones_refl_by8.asm \
-	crc/crc64_jones_norm_by8.asm
+	crc/crc64_jones_norm_by8.asm \
 	crc/crc32_gzip_refl_by8.asm
 src_include += -I $(srcdir)/crc
 extern_hdrs +=  include/crc.h include/crc64.h
@ -56,9 +57,10 @@ extern_hdrs +=  include/crc.h include/crc64.h
 other_src   +=  include/reg_sizes.asm include/types.h include/test.h
 check_tests +=  crc/crc16_t10dif_test crc/crc32_ieee_test crc/crc32_iscsi_test \
-		crc/crc64_funcs_test
+		crc/crc64_funcs_test crc/crc32_gzip_refl_test
 perf_tests  +=  crc/crc16_t10dif_perf crc/crc32_ieee_perf crc/crc32_iscsi_perf \
-		crc/crc64_funcs_perf
+		crc/crc64_funcs_perf crc/crc32_gzip_refl_perf
 examples    +=  crc/crc_simple_test crc/crc64_example
--- a/crc/crc32_gzip_refl_by8.asm
+++ b/crc/crc32_gzip_refl_by8.asm
@ -0,0 +1,624 @@
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;  Copyright(c) 2011-2017 Intel Corporation All rights reserved.
 ;
 ;  Redistribution and use in source and binary forms, with or without
 ;  modification, are permitted provided that the following conditions
 ;  are met:
 ;    * Redistributions of source code must retain the above copyright
 ;      notice, this list of conditions and the following disclaimer.
 ;    * Redistributions in binary form must reproduce the above copyright
 ;      notice, this list of conditions and the following disclaimer in
 ;      the documentation and/or other materials provided with the
 ;      distribution.
 ;    * Neither the name of Intel Corporation nor the names of its
 ;      contributors may be used to endorse or promote products derived
 ;      from this software without specific prior written permission.
 ;
 ;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 ;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 ;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 ;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 ;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 ;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 ;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 ;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 ;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 ;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 ;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;       Function API:
 ;       UINT32 crc32_gzip_refl_by8(
 ;               UINT32 init_crc, //initial CRC value, 32 bits
 ;               const unsigned char *buf, //buffer pointer to calculate CRC on
 ;               UINT64 len //buffer length in bytes (64-bit data)
 ;       );
 ;
 ;       Authors:
 ;               Erdinc Ozturk
 ;               Vinodh Gopal
 ;               James Guilford
 ;
 ;       Reference paper titled "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
 ;       URL: http://download.intel.com/design/intarch/papers/323102.pdf
 ;
 ;
 ;       sample yasm command line:
 ;       yasm -f x64 -f elf64 -X gnu -g dwarf2 crc32_gzip_refl_by8
 ;
 ;       As explained here:
 ;       http://docs.oracle.com/javase/7/docs/api/java/util/zip/package-summary.html
 ;       CRC-32 checksum is described in RFC 1952
 ;       Implementing RFC 1952 CRC:
 ;       http://www.ietf.org/rfc/rfc1952.txt
 %include "reg_sizes.asm"
 %define	fetch_dist	1024
 [bits 64]
 default rel
 section .text
 %ifidn __OUTPUT_FORMAT__, win64
        %xdefine        arg1 rcx
        %xdefine        arg2 rdx
        %xdefine        arg3 r8
        %xdefine        arg1_low32 ecx
 %else
        %xdefine        arg1 rdi
        %xdefine        arg2 rsi
        %xdefine        arg3 rdx
        %xdefine        arg1_low32 edi
 %endif
 %define TMP 16*0
 %ifidn __OUTPUT_FORMAT__, win64
        %define XMM_SAVE 16*2
        %define VARIABLE_OFFSET 16*10+8
 %else
        %define VARIABLE_OFFSET 16*2+8
 %endif
 align 16
 global  crc32_gzip_refl_by8:function
 crc32_gzip_refl_by8:
        ; unsigned long c = crc ^ 0xffffffffL;
        not     arg1_low32	;
        sub     rsp, VARIABLE_OFFSET
 %ifidn __OUTPUT_FORMAT__, win64
        ; push the xmm registers into the stack to maintain
        movdqa  [rsp + XMM_SAVE + 16*0], xmm6
        movdqa  [rsp + XMM_SAVE + 16*1], xmm7
        movdqa  [rsp + XMM_SAVE + 16*2], xmm8
        movdqa  [rsp + XMM_SAVE + 16*3], xmm9
        movdqa  [rsp + XMM_SAVE + 16*4], xmm10
        movdqa  [rsp + XMM_SAVE + 16*5], xmm11
        movdqa  [rsp + XMM_SAVE + 16*6], xmm12
        movdqa  [rsp + XMM_SAVE + 16*7], xmm13
 %endif
        ; check if smaller than 256B
        cmp     arg3, 256
        ; for sizes less than 256, we can't fold 128B at a time...
        jl      _less_than_256
        ; load the initial crc value
        movd    xmm10, arg1_low32      ; initial crc
        ; receive the initial 64B data, xor the initial crc value
        movdqu  xmm0, [arg2+16*0]
        movdqu  xmm1, [arg2+16*1]
        movdqu  xmm2, [arg2+16*2]
        movdqu  xmm3, [arg2+16*3]
        movdqu  xmm4, [arg2+16*4]
        movdqu  xmm5, [arg2+16*5]
        movdqu  xmm6, [arg2+16*6]
        movdqu  xmm7, [arg2+16*7]
        ; XOR the initial_crc value
        pxor    xmm0, xmm10
 	movdqa	xmm10, [rk3]	;xmm10 has rk3 and rk4
                                        ;imm value of pclmulqdq instruction will determine which constant to use
        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        ; we subtract 256 instead of 128 to save one instruction from the loop
        sub     arg3, 256
        ; at this section of the code, there is 128*x+y (0<=y<128) bytes of buffer. The _fold_128_B_loop
        ; loop will fold 128B at a time until we have 128+y Bytes of buffer
        ; fold 128B at a time. This section of the code folds 8 xmm registers in parallel
 _fold_128_B_loop:
        ; update the buffer pointer
        add     arg2, 128
 	prefetchnta [arg2+fetch_dist+0]
        movdqu  xmm9, [arg2+16*0]
        movdqu  xmm12, [arg2+16*1]
        movdqa  xmm8, xmm0
        movdqa  xmm13, xmm1
        pclmulqdq       xmm0, xmm10, 0x10
        pclmulqdq       xmm8, xmm10 , 0x1
        pclmulqdq       xmm1, xmm10, 0x10
        pclmulqdq       xmm13, xmm10 , 0x1
        pxor    xmm0, xmm9
        xorps   xmm0, xmm8
        pxor    xmm1, xmm12
        xorps   xmm1, xmm13
 	prefetchnta [arg2+fetch_dist+32]
        movdqu  xmm9, [arg2+16*2]
        movdqu  xmm12, [arg2+16*3]
        movdqa  xmm8, xmm2
        movdqa  xmm13, xmm3
        pclmulqdq       xmm2, xmm10, 0x10
        pclmulqdq       xmm8, xmm10 , 0x1
        pclmulqdq       xmm3, xmm10, 0x10
        pclmulqdq       xmm13, xmm10 , 0x1
        pxor    xmm2, xmm9
        xorps   xmm2, xmm8
        pxor    xmm3, xmm12
        xorps   xmm3, xmm13
 	prefetchnta [arg2+fetch_dist+64]
        movdqu  xmm9, [arg2+16*4]
        movdqu  xmm12, [arg2+16*5]
        movdqa  xmm8, xmm4
        movdqa  xmm13, xmm5
        pclmulqdq       xmm4, xmm10, 0x10
        pclmulqdq       xmm8, xmm10 , 0x1
        pclmulqdq       xmm5, xmm10, 0x10
        pclmulqdq       xmm13, xmm10 , 0x1
        pxor    xmm4, xmm9
        xorps   xmm4, xmm8
        pxor    xmm5, xmm12
        xorps   xmm5, xmm13
 	prefetchnta [arg2+fetch_dist+96]
        movdqu  xmm9, [arg2+16*6]
        movdqu  xmm12, [arg2+16*7]
        movdqa  xmm8, xmm6
        movdqa  xmm13, xmm7
        pclmulqdq       xmm6, xmm10, 0x10
        pclmulqdq       xmm8, xmm10 , 0x1
        pclmulqdq       xmm7, xmm10, 0x10
        pclmulqdq       xmm13, xmm10 , 0x1
        pxor    xmm6, xmm9
        xorps   xmm6, xmm8
        pxor    xmm7, xmm12
        xorps   xmm7, xmm13
        sub     arg3, 128
        ; check if there is another 128B in the buffer to be able to fold
        jge     _fold_128_B_loop
        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        add     arg2, 128
        ; at this point, the buffer pointer is pointing at the last y Bytes of the buffer, where 0 <= y < 128
        ; the 128B of folded data is in 8 of the xmm registers: xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
        ; fold the 8 xmm registers to 1 xmm register with different constants
        movdqa  xmm10, [rk9]
        movdqa  xmm8, xmm0
        pclmulqdq       xmm0, xmm10, 0x1
        pclmulqdq       xmm8, xmm10, 0x10
        pxor    xmm7, xmm8
        xorps   xmm7, xmm0
        movdqa  xmm10, [rk11]
        movdqa  xmm8, xmm1
        pclmulqdq       xmm1, xmm10, 0x1
        pclmulqdq       xmm8, xmm10, 0x10
        pxor    xmm7, xmm8
        xorps   xmm7, xmm1
        movdqa  xmm10, [rk13]
        movdqa  xmm8, xmm2
        pclmulqdq       xmm2, xmm10, 0x1
        pclmulqdq       xmm8, xmm10, 0x10
        pxor    xmm7, xmm8
        pxor    xmm7, xmm2
        movdqa  xmm10, [rk15]
        movdqa  xmm8, xmm3
        pclmulqdq       xmm3, xmm10, 0x1
        pclmulqdq       xmm8, xmm10, 0x10
        pxor    xmm7, xmm8
        xorps   xmm7, xmm3
        movdqa  xmm10, [rk17]
        movdqa  xmm8, xmm4
        pclmulqdq       xmm4, xmm10, 0x1
        pclmulqdq       xmm8, xmm10, 0x10
        pxor    xmm7, xmm8
        pxor    xmm7, xmm4
        movdqa  xmm10, [rk19]
        movdqa  xmm8, xmm5
        pclmulqdq       xmm5, xmm10, 0x1
        pclmulqdq       xmm8, xmm10, 0x10
        pxor    xmm7, xmm8
        xorps   xmm7, xmm5
        movdqa  xmm10, [rk1]
        movdqa  xmm8, xmm6
        pclmulqdq       xmm6, xmm10, 0x1
        pclmulqdq       xmm8, xmm10, 0x10
        pxor    xmm7, xmm8
        pxor    xmm7, xmm6
        ; instead of 128, we add 128-16 to the loop counter to save 1 instruction from the loop
        ; instead of a cmp instruction, we use the negative flag with the jl instruction
        add     arg3, 128-16
        jl      _final_reduction_for_128
        ; now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7 and the rest is in memory
        ; we can fold 16 bytes at a time if y>=16
        ; continue folding 16B at a time
 _16B_reduction_loop:
        movdqa  xmm8, xmm7
        pclmulqdq       xmm7, xmm10, 0x1
        pclmulqdq       xmm8, xmm10, 0x10
        pxor    xmm7, xmm8
        movdqu  xmm0, [arg2]
        pxor    xmm7, xmm0
        add     arg2, 16
        sub     arg3, 16
        ; instead of a cmp instruction, we utilize the flags with the jge instruction
        ; equivalent of: cmp arg3, 16-16
        ; check if there is any more 16B in the buffer to be able to fold
        jge     _16B_reduction_loop
        ;now we have 16+z bytes left to reduce, where 0<= z < 16.
        ;first, we reduce the data in the xmm7 register
 _final_reduction_for_128:
        add arg3, 16
        je _128_done
 ; here we are getting data that is less than 16 bytes.
        ; since we know that there was data before the pointer, we can offset the input pointer before the actual point, to receive exactly 16 bytes.
        ; after that the registers need to be adjusted.
 _get_last_two_xmms:
        movdqa xmm2, xmm7
        movdqu xmm1, [arg2 - 16 + arg3]
        ; get rid of the extra data that was loaded before
        ; load the shift constant
        lea     rax, [pshufb_shf_table]
        add     rax, arg3
        movdqu  xmm0, [rax]
        pshufb  xmm7, xmm0
        pxor    xmm0, [mask3]
        pshufb  xmm2, xmm0
        pblendvb        xmm2, xmm1     ;xmm0 is implicit
        ;;;;;;;;;;
        movdqa  xmm8, xmm7
        pclmulqdq       xmm7, xmm10, 0x1
        pclmulqdq       xmm8, xmm10, 0x10
        pxor    xmm7, xmm8
        pxor    xmm7, xmm2
 _128_done:
        ; compute crc of a 128-bit value
        movdqa  xmm10, [rk5]
        movdqa  xmm0, xmm7
        ;64b fold
        pclmulqdq       xmm7, xmm10, 0
        psrldq  xmm0, 8
        pxor    xmm7, xmm0
        ;32b fold
        movdqa  xmm0, xmm7
        pslldq  xmm7, 4
        pclmulqdq       xmm7, xmm10, 0x10
        pxor    xmm7, xmm0
        ;barrett reduction
 _barrett:
        pand    xmm7, [mask2]
        movdqa  xmm1, xmm7
        movdqa  xmm2, xmm7
        movdqa  xmm10, [rk7]
        pclmulqdq       xmm7, xmm10, 0
        pxor    xmm7, xmm2
        pand    xmm7, [mask]
        movdqa  xmm2, xmm7
        pclmulqdq       xmm7, xmm10, 0x10
        pxor    xmm7, xmm2
        pxor    xmm7, xmm1
        pextrd  eax, xmm7, 2
 _cleanup:
        ; return c ^ 0xffffffffL;
        not     eax
 %ifidn __OUTPUT_FORMAT__, win64
        movdqa  xmm6, [rsp + XMM_SAVE + 16*0]
        movdqa  xmm7, [rsp + XMM_SAVE + 16*1]
        movdqa  xmm8, [rsp + XMM_SAVE + 16*2]
        movdqa  xmm9, [rsp + XMM_SAVE + 16*3]
        movdqa  xmm10, [rsp + XMM_SAVE + 16*4]
        movdqa  xmm11, [rsp + XMM_SAVE + 16*5]
        movdqa  xmm12, [rsp + XMM_SAVE + 16*6]
        movdqa  xmm13, [rsp + XMM_SAVE + 16*7]
 %endif
        add     rsp, VARIABLE_OFFSET
        ret
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 align 16
 _less_than_256:
        ; check if there is enough buffer to be able to fold 16B at a time
        cmp     arg3, 32
        jl      _less_than_32
        ; if there is, load the constants
        movdqa  xmm10, [rk1]    ; rk1 and rk2 in xmm10
        movd    xmm0, arg1_low32       ; get the initial crc value
        movdqu  xmm7, [arg2]            ; load the plaintext
        pxor    xmm7, xmm0
        ; update the buffer pointer
        add     arg2, 16
        ; update the counter. subtract 32 instead of 16 to save one instruction from the loop
        sub     arg3, 32
        jmp     _16B_reduction_loop
 align 16
 _less_than_32:
        ; mov initial crc to the return value. this is necessary for zero-length buffers.
        mov     eax, arg1_low32
        test    arg3, arg3
        je      _cleanup
        movd    xmm0, arg1_low32        ; get the initial crc value
        cmp     arg3, 16
        je      _exact_16_left
        jl      _less_than_16_left
        movdqu  xmm7, [arg2]            ; load the plaintext
        pxor    xmm7, xmm0              ; xor the initial crc value
        add     arg2, 16
        sub     arg3, 16
        movdqa  xmm10, [rk1]    ; rk1 and rk2 in xmm10
        jmp     _get_last_two_xmms
 align 16
 _less_than_16_left:
        ; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.
        pxor    xmm1, xmm1
        mov     r11, rsp
        movdqa  [r11], xmm1
        cmp     arg3, 4
        jl      _only_less_than_4
        ;       backup the counter value
        mov     r9, arg3
        cmp     arg3, 8
        jl      _less_than_8_left
        ; load 8 Bytes
        mov     rax, [arg2]
        mov     [r11], rax
        add     r11, 8
        sub     arg3, 8
        add     arg2, 8
 _less_than_8_left:
        cmp     arg3, 4
        jl      _less_than_4_left
        ; load 4 Bytes
        mov     eax, [arg2]
        mov     [r11], eax
        add     r11, 4
        sub     arg3, 4
        add     arg2, 4
 _less_than_4_left:
        cmp     arg3, 2
        jl      _less_than_2_left
        ; load 2 Bytes
        mov     ax, [arg2]
        mov     [r11], ax
        add     r11, 2
        sub     arg3, 2
        add     arg2, 2
 _less_than_2_left:
        cmp     arg3, 1
        jl      _zero_left
        ; load 1 Byte
        mov     al, [arg2]
        mov     [r11], al
 _zero_left:
        movdqa  xmm7, [rsp]
        pxor    xmm7, xmm0      ; xor the initial crc value
        lea rax,[pshufb_shf_table]
        movdqu  xmm0, [rax + r9]
        pshufb  xmm7,xmm0
        jmp     _128_done
 align 16
 _exact_16_left:
        movdqu  xmm7, [arg2]
        pxor    xmm7, xmm0      ; xor the initial crc value
        jmp     _128_done
 _only_less_than_4:
        cmp     arg3, 3
        jl      _only_less_than_3
        ; load 3 Bytes
        mov     al, [arg2]
        mov     [r11], al
        mov     al, [arg2+1]
        mov     [r11+1], al
        mov     al, [arg2+2]
        mov     [r11+2], al
        movdqa  xmm7, [rsp]
        pxor    xmm7, xmm0      ; xor the initial crc value
        pslldq  xmm7, 5
        jmp     _barrett
 _only_less_than_3:
        cmp     arg3, 2
        jl      _only_less_than_2
        ; load 2 Bytes
        mov     al, [arg2]
        mov     [r11], al
        mov     al, [arg2+1]
        mov     [r11+1], al
        movdqa  xmm7, [rsp]
        pxor    xmm7, xmm0      ; xor the initial crc value
        pslldq  xmm7, 6
        jmp     _barrett
 _only_less_than_2:
        ; load 1 Byte
        mov     al, [arg2]
        mov     [r11], al
        movdqa  xmm7, [rsp]
        pxor    xmm7, xmm0      ; xor the initial crc value
        pslldq  xmm7, 7
        jmp     _barrett
 section .data
 ; precomputed constants
 align 16
 rk1 :
 DQ 0x00000000ccaa009e
 rk2 :
 DQ 0x00000001751997d0
 rk3 :
 DQ 0x000000014a7fe880
 rk4 :
 DQ 0x00000001e88ef372
 rk5 :
 DQ 0x00000000ccaa009e
 rk6 :
 DQ 0x0000000163cd6124
 rk7 :
 DQ 0x00000001f7011640
 rk8 :
 DQ 0x00000001db710640
 rk9 :
 DQ 0x00000001d7cfc6ac
 rk10 :
 DQ 0x00000001ea89367e
 rk11 :
 DQ 0x000000018cb44e58
 rk12 :
 DQ 0x00000000df068dc2
 rk13 :
 DQ 0x00000000ae0b5394
 rk14 :
 DQ 0x00000001c7569e54
 rk15 :
 DQ 0x00000001c6e41596
 rk16 :
 DQ 0x0000000154442bd4
 rk17 :
 DQ 0x0000000174359406
 rk18 :
 DQ 0x000000003db1ecdc
 rk19 :
 DQ 0x000000015a546366
 rk20 :
 DQ 0x00000000f1da05aa
 mask:
 dq     0xFFFFFFFFFFFFFFFF, 0x0000000000000000
 mask2:
 dq     0xFFFFFFFF00000000, 0xFFFFFFFFFFFFFFFF
 mask3:
 dq     0x8080808080808080, 0x8080808080808080
 pshufb_shf_table:
 ; use these values for shift constants for the pshufb instruction
 ; different alignments result in values as shown:
 ;       dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
 ;       dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-3) / shr2
 ;       dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-4) / shr3
 ;       dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
 ;       dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
 ;       dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
 ;       dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9  (16-7) / shr7
 ;       dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8  (16-8) / shr8
 ;       dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7  (16-9) / shr9
 ;       dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6  (16-10) / shr10
 ;       dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5  (16-11) / shr11
 ;       dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4  (16-12) / shr12
 ;       dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3  (16-13) / shr13
 ;       dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2  (16-14) / shr14
 ;       dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1  (16-15) / shr15
 dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
 dq 0x0706050403020100, 0x000e0d0c0b0a0908
 ;;;       func        core, ver, snum
 slversion crc32_gzip_refl_by8, 01,   00,  002c
--- a/crc/crc32_gzip_refl_perf.c
+++ b/crc/crc32_gzip_refl_perf.c
@ -0,0 +1,103 @@
 /**********************************************************************
  Copyright(c) 2011-2017 Intel Corporation All rights reserved.
  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions
  are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in
      the documentation and/or other materials provided with the
      distribution.
    * Neither the name of Intel Corporation nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.
  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 **********************************************************************/
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <stdint.h>
 #include <sys/time.h>
 #include "crc.h"
 #include "test.h"
 //#define CACHED_TEST
 #ifdef CACHED_TEST
 // Cached test, loop many times over small dataset
 # define TEST_LEN     8*1024
 # define TEST_LOOPS   400000
 # define TEST_TYPE_STR "_warm"
 #else
 // Uncached test.  Pull from large mem base.
 #  define GT_L3_CACHE  32*1024*1024	/* some number > last level cache */
 #  define TEST_LEN     (2 * GT_L3_CACHE)
 #  define TEST_LOOPS   100
 #  define TEST_TYPE_STR "_cold"
 #endif
 #ifndef TEST_SEED
 # define TEST_SEED 0x1234
 #endif
 #define TEST_MEM TEST_LEN
 int main(int argc, char *argv[])
 {
 	int i;
 	void *buf;
 	uint32_t crc;
 	struct perf start, stop;
 	printf("crc32_gzip_refl_perf:\n");
 	if (posix_memalign(&buf, 1024, TEST_LEN)) {
 		printf("alloc error: Fail");
 		return -1;
 	}
 	printf("Start timed tests\n");
 	fflush(0);
 	memset(buf, 0, TEST_LEN);
 	crc = crc32_gzip_refl(TEST_SEED, buf, TEST_LEN);
 	perf_start(&start);
 	for (i = 0; i < TEST_LOOPS; i++) {
 		crc = crc32_gzip_refl(TEST_SEED, buf, TEST_LEN);
 	}
 	perf_stop(&stop);
 	printf("crc32_gzip_refl" TEST_TYPE_STR ": ");
 	perf_print(stop, start, (long long)TEST_LEN * i);
 	printf("finish 0x%x\n", crc);
 	printf("crc32_gzip_refl_base_perf:\n");
 	printf("Start timed tests\n");
 	fflush(0);
 	crc = crc32_gzip_refl_base(TEST_SEED, buf, TEST_LEN);
 	perf_start(&start);
 	for (i = 0; i < (TEST_LOOPS / 100 + 1); i++) {
 		crc = crc32_gzip_refl_base(TEST_SEED, buf, TEST_LEN);
 	}
 	perf_stop(&stop);
 	printf("crc32_gzip_refl_base" TEST_TYPE_STR ": ");
 	perf_print(stop, start, (long long)TEST_LEN * i);
 	printf("finish 0x%x\n", crc);
 	return 0;
 }
--- a/crc/crc32_gzip_refl_test.c
+++ b/crc/crc32_gzip_refl_test.c
@ -0,0 +1,174 @@
 /**********************************************************************
  Copyright(c) 2011-2017 Intel Corporation All rights reserved.
  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions
  are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in
      the documentation and/or other materials provided with the
      distribution.
    * Neither the name of Intel Corporation nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.
  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 **********************************************************************/
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <stdint.h>
 #include "crc.h"
 #include "types.h"
 #ifndef TEST_SEED
 # define TEST_SEED 0x1234
 #endif
 #define MAX_BUF   512
 #define TEST_SIZE  20
 typedef uint64_t u64;
 typedef uint32_t u32;
 typedef uint16_t u16;
 typedef uint8_t u8;
 // Generates pseudo-random data
 void rand_buffer(unsigned char *buf, long buffer_size)
 {
 	long i;
 	for (i = 0; i < buffer_size; i++)
 		buf[i] = rand();
 }
 int main(int argc, char *argv[])
 {
 	int fail = 0;
 	u32 r;
 	int verbose = argc - 1;
 	int i, s, ret;
 	void *buf_alloc;
 	unsigned char *buf;
 	printf("Test crc32_gzip_refl ");
 	// Align to MAX_BUF boundary
 	ret = posix_memalign(&buf_alloc, MAX_BUF, MAX_BUF * TEST_SIZE);
 	if (ret) {
 		printf("alloc error: Fail");
 		return -1;
 	}
 	buf = (unsigned char *)buf_alloc;
 	srand(TEST_SEED);
 	// Test of all zeros
 	memset(buf, 0, MAX_BUF * 10);
 	u32 crc = crc32_gzip_refl(TEST_SEED, buf, MAX_BUF);
 	u32 crc_ref = crc32_gzip_refl_base(TEST_SEED, buf, MAX_BUF);
 	if (crc != crc_ref) {
 		fail++;
 		printf("\n		   opt   ref\n");
 		printf("		 ------ ------\n");
 		printf("crc	zero = 0x%8x 0x%8x \n", crc, crc_ref);
 	} else
 		printf(".");
 	// Another simple test pattern
 	memset(buf, 0x8a, MAX_BUF);
 	crc = crc32_gzip_refl(TEST_SEED, buf, MAX_BUF);
 	crc_ref = crc32_gzip_refl_base(TEST_SEED, buf, MAX_BUF);
 	if (crc != crc_ref)
 		fail++;
 	if (verbose)
 		printf("crc  all 8a = 0x%8x 0x%8x\n", crc, crc_ref);
 	else
 		printf(".");
 	// Do a few random tests
 	r = rand();
 	rand_buffer(buf, MAX_BUF * TEST_SIZE);
 	for (i = 0; i < TEST_SIZE; i++) {
 		crc = crc32_gzip_refl(r, buf, MAX_BUF);
 		crc_ref = crc32_gzip_refl_base(r, buf, MAX_BUF);
 		if (crc != crc_ref)
 			fail++;
 		if (verbose)
 			printf("crc rand%3d = 0x%8x 0x%8x\n", i, crc, crc_ref);
 		else
 			printf(".");
 		buf += MAX_BUF;
 	}
 	// Do a few random sizes
 	buf = (unsigned char *)buf_alloc;	//reset buf
 	r = rand();
 	for (i = MAX_BUF; i >= 0; i--) {
 		crc = crc32_gzip_refl(r, buf, i);
 		crc_ref = crc32_gzip_refl_base(r, buf, i);
 		if (crc != crc_ref) {
 			fail++;
 			printf("fail random size%i 0x%8x 0x%8x\n", i, crc, crc_ref);
 		} else
 			printf(".");
 	}
 	// Try different seeds
 	for (s = 0; s < 20; s++) {
 		buf = (unsigned char *)buf_alloc;	//reset buf
 		r = rand();	// just to get a new seed
 		rand_buffer(buf, MAX_BUF * TEST_SIZE);	// new pseudo-rand data
 		if (verbose)
 			printf("seed = 0x%x\n", r);
 		for (i = 0; i < TEST_SIZE; i++) {
 			crc = crc32_gzip_refl(r, buf, MAX_BUF);
 			crc_ref = crc32_gzip_refl_base(r, buf, MAX_BUF);
 			if (crc != crc_ref)
 				fail++;
 			if (verbose)
 				printf("crc rand%3d = 0x%8x 0x%8x\n", i, crc, crc_ref);
 			else
 				printf(".");
 			buf += MAX_BUF;
 		}
 	}
 	// Run tests at end of buffer
 	buf = (unsigned char *)buf_alloc;	//reset buf
 	buf = buf + ((MAX_BUF - 1) * TEST_SIZE);	//Line up TEST_SIZE from end
 	for (i = 0; i < TEST_SIZE; i++) {
 		crc = crc32_gzip_refl(TEST_SEED, buf + i, TEST_SIZE - i);
 		crc_ref = crc32_gzip_refl_base(TEST_SEED, buf + i, TEST_SIZE - i);
 		if (crc != crc_ref)
 			fail++;
 		if (verbose)
 			printf("crc eob rand%3d = 0x%4x 0x%4x\n", i, crc, crc_ref);
 		else
 			printf(".");
 	}
 	printf("Test done: %s\n", fail ? "Fail" : "Pass");
 	if (fail)
 		printf("\nFailed %d tests\n", fail);
 	return fail;
 }
--- a/crc/crc_base.c
+++ b/crc/crc_base.c
@ -154,6 +154,26 @@ uint32_t crc32_ieee_base(uint32_t seed, uint8_t * buf, uint64_t len)
 	return ~rem;
 }
 // crc32_gzip_refl baseline function.
 // Please get difference details between crc32_gzip_ref and crc32_ieee
 // from crc.h.
 // Slow crc32 from the definition.  Can be sped up with a lookup table.
 uint32_t crc32_gzip_refl_base(uint32_t seed, uint8_t * buf, uint64_t len)
 {
 	uint64_t rem = ~seed;
 	int i, j;
 	uint32_t poly = 0xEDB88320;	// IEEE standard
 	for (i = 0; i < len; i++) {
 		rem = rem ^ (buf[i]);
 		for (j = 0; j < MAX_ITER; j++) {
 			rem = (rem & 0x1ULL) ? (rem >> 1) ^ poly : (rem >> 1);
 		}
 	}
 	return ~rem;
 }
 struct slver {
 	unsigned short snum;
 	unsigned char ver;
@ -168,3 +188,6 @@ struct slver crc16_t10dif_base_slver = { 0x011e, 0x02, 0x00 };
 struct slver crc32_ieee_base_slver_0001011f;
 struct slver crc32_ieee_base_slver = { 0x011f, 0x02, 0x00 };
 struct slver crc32_gzip_refl_base_slver_0000002b;
 struct slver crc32_gzip_refl_base_slver = { 0x002b, 0x00, 0x00 };
--- a/crc/crc_base_aliases.c
+++ b/crc/crc_base_aliases.c
@ -46,6 +46,11 @@ uint32_t crc32_ieee(uint32_t seed, const unsigned char *buf, uint64_t len)
 	return crc32_ieee_base(seed, (uint8_t *) buf, len);
 }
 uint32_t crc32_gzip_refl(uint32_t seed, const unsigned char *buf, uint64_t len)
 {
 	return crc32_gzip_refl_base(seed, (uint8_t *) buf, len);
 }
 uint64_t crc64_ecma_refl(uint64_t seed, const uint8_t * buf, uint64_t len)
 {
 	return crc64_ecma_refl_base(seed, buf, len);
--- a/crc/crc_multibinary.asm
+++ b/crc/crc_multibinary.asm
@ -50,6 +50,11 @@ extern crc16_t10dif_01
 extern crc16_t10dif_by4  ;; Optimized for SLM
 extern crc16_t10dif_base
 extern crc32_gzip_refl_by8
 extern crc32_gzip_refl_base
 %include "multibinary.asm"
 section .data
 ;;; *_mbinit are initial values for *_dispatched; is updated on first call.
 ;;; Therefore, *_dispatch_init is only executed on first call.
@ -174,7 +179,11 @@ use_t10dif_base:
 	pop     rax
 	ret
 mbin_interface			crc32_gzip_refl
 mbin_dispatch_init_clmul	crc32_gzip_refl, crc32_gzip_refl_base, crc32_gzip_refl_by8
 ;;;       func            	core, ver, snum
 slversion crc16_t10dif,		00,   03,  011a
 slversion crc32_ieee,		00,   03,  011b
 slversion crc32_iscsi,		00,   03,  011c
 slversion crc32_gzip_refl,		00,   00,  002a
--- a/include/crc.h
+++ b/include/crc.h
@ -66,6 +66,17 @@ uint16_t crc16_t10dif(
 *
 * This function determines what instruction sets are enabled and
 * selects the appropriate version at runtime.
 * Note: CRC32 IEEE standard is widely used in HDLC, Ethernet, Gzip and
 * many others. Its polynomial is 0x04C11DB7 in normal and 0xEDB88320
 * in reflection (or reverse). In ISA-L CRC, function crc32_ieee is
 * actually designed for normal CRC32 IEEE version. And function
 * crc32_gzip_refl is actually designed for reflected CRC32 IEEE.
 * These two versions of CRC32 IEEE are not compatible with each other.
 * Users who want to replace their not optimized crc32 ieee with ISA-L's
 * crc32 function should be careful of that.
 * Since many applications use CRC32 IEEE reflected version, Please have
 * a check whether crc32_gzip_refl is right one for you instead of
 * crc32_ieee.
 *
 * @returns 32 bit CRC
 */
@ -76,6 +87,34 @@ uint32_t crc32_ieee(
 	uint64_t len              //!< buffer length in bytes (64-bit data)
 	);
 /**
 * @brief Generate the customized CRC
 * based on RFC 1952 CRC (http://www.ietf.org/rfc/rfc1952.txt) standard,
 * runs appropriate version.
 *
 * This function determines what instruction sets are enabled and
 * selects the appropriate version at runtime.
 *
 * Note: CRC32 IEEE standard is widely used in HDLC, Ethernet, Gzip and
 * many others. Its polynomial is 0x04C11DB7 in normal and 0xEDB88320
 * in reflection (or reverse). In ISA-L CRC, function crc32_ieee is
 * actually designed for normal CRC32 IEEE version. And function
 * crc32_gzip_refl is actually designed for reflected CRC32 IEEE.
 * These two versions of CRC32 IEEE are not compatible with each other.
 * Users who want to replace their not optimized crc32 ieee with ISA-L's
 * crc32 function should be careful of that.
 * Since many applications use CRC32 IEEE reflected version, Please have
 * a check whether crc32_gzip_refl is right one for you instead of
 * crc32_ieee.
 *
 * @returns 32 bit CRC
 */
 uint32_t crc32_gzip_refl(
 	uint32_t init_crc,          //!< initial CRC value, 32 bits
 	const unsigned char *buf, //!< buffer to calculate CRC on
 	uint64_t len                //!< buffer length in bytes (64-bit data)
 	);
 /**
 * @brief ISCSI CRC function, runs appropriate version.
@ -126,6 +165,18 @@ uint32_t crc32_ieee_base(
 	uint64_t len 	//!< buffer length in bytes (64-bit data)
 	);
 /**
 * @brief Generate the customized CRC
 * based on RFC 1952 CRC (http://www.ietf.org/rfc/rfc1952.txt) standard,
 * runs baseline version
 * @returns 32 bit CRC
 */
 uint32_t crc32_gzip_refl_base(
 	uint32_t seed,	//!< initial CRC value, 32 bits
 	uint8_t *buf,	//!< buffer to calculate CRC on
 	uint64_t len	//!< buffer length in bytes (64-bit data)
 	);
 #ifdef __cplusplus
 }