; Revision: a5b324d2cd
; Commit notes: - Includes gf_nvect_dot_prod, gf_nvect_mad functions - Change ec multibinary to use common macros - Autoconf checks for nasm or yasm support and picks if available - Leave out compile of any avx512 code if assembler not available Signed-off-by: Greg Tucker <greg.b.tucker@intel.com>
; Listing info: 272 lines, 7.0 KiB, NASM
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
; are met:
; * Redistributions of source code must retain the above copyright
; notice, this list of conditions and the following disclaimer.
; * Redistributions in binary form must reproduce the above copyright
; notice, this list of conditions and the following disclaimer in
; the documentation and/or other materials provided with the
; distribution.
; * Neither the name of Intel Corporation nor the names of its
; contributors may be used to endorse or promote products derived
; from this software without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
%ifndef _MULTIBINARY_ASM_
|
|
%define _MULTIBINARY_ASM_
|
|
|
|
;;;;
; Pointer-width helpers used by the dispatch macros below.
;  mbin_def_ptr  - data directive that emits one pointer
;  mbin_ptr_sz   - operand-size keyword for a pointer-sized memory access
;  mbin_r??      - pointer-sized spelling of each general register
; Only the elf32 output format selects the 32-bit set; every other
; output format gets the 64-bit set.
;;;;
%ifnidn __OUTPUT_FORMAT__, elf32
 %define mbin_def_ptr	dq
 %define mbin_ptr_sz	qword
 %define mbin_rdi	rdi
 %define mbin_rsi	rsi
 %define mbin_rax	rax
 %define mbin_rbx	rbx
 %define mbin_rcx	rcx
 %define mbin_rdx	rdx
%else
 %define mbin_def_ptr	dd
 %define mbin_ptr_sz	dword
 %define mbin_rdi	edi
 %define mbin_rsi	esi
 %define mbin_rax	eax
 %define mbin_rbx	ebx
 %define mbin_rcx	ecx
 %define mbin_rdx	edx
%endif
|
|
|
|
;;;;
; multibinary macro:
;   creates the visible entry point that jumps through the HW optimized
;   call pointer
;   creates the first-call stub that initializes that pointer
;	1-> function name
;
; Symbols emitted for function name <f>:
;   <f>_dispatched  pointer-sized slot in .data, initially <f>_mbinit
;   <f>_mbinit      stub: calls <f>_dispatch_init, then falls into <f>
;   <f>             global entry: indirect jmp through <f>_dispatched
; <f>_dispatch_init is not defined here; the mbin_dispatch_init* macros
; in this file define it and have it overwrite <f>_dispatched.
; The macro leaves the current section set to .text.
;;;;
%macro mbin_interface 1
	section .data
%1_dispatched:
	mbin_def_ptr	%1_mbinit		; first jmp through the slot lands on the stub

	section .text
	global	%1:function
%1_mbinit:
	;; Reached only while the slot still holds its initial value
	call	%1_dispatch_init
	;; No ret/jmp here on purpose: continue straight into the entry point,
	;; which now jumps through the slot again
%1:
	jmp	mbin_ptr_sz [%1_dispatched]
%endmacro
|
|
|
|
;;;;;
; mbin_dispatch_init parameters
; Use this macro when SSE/00/01 is the minimum requirement
;	1-> function name
;	2-> SSE/00/01 optimized function used as base
;	3-> AVX or AVX/02 opt func
;	4-> AVX2 or AVX/04 opt func
;
; Defines <1>_dispatch_init, which picks one of the three functions and
; stores its address into <1>_dispatched.
; rsi/rax/rbx/rcx/rdx (esi.. on elf32) are saved and restored around the
; probing; the flags register is modified.
;;;;;
%macro mbin_dispatch_init 4
	section .text
%1_dispatch_init:
	push	mbin_rsi
	push	mbin_rax
	push	mbin_rbx
	push	mbin_rcx
	push	mbin_rdx
	lea	mbin_rsi, [%2 WRT_OPT]		; selection starts at the SSE 00/01 base

	;; CPUID leaf 1: both the AVX and the OSXSAVE bit must be set
	mov	eax, 1
	cpuid
	and	ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
	cmp	ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
	lea	mbin_rbx, [%3 WRT_OPT]		; AVX (gen2) candidate; lea leaves the cmp flags intact
	jnz	%%init_done			; a bit is missing, keep the base
	mov	mbin_rsi, mbin_rbx

	;; CPUID leaf 7, subleaf 0: upgrade to the gen4 function if AVX2 is set
	xor	ecx, ecx
	mov	eax, 7
	cpuid
	test	ebx, FLAG_CPUID7_EBX_AVX2
	lea	mbin_rbx, [%4 WRT_OPT]		; AVX2 (gen4) candidate
	cmovnz	mbin_rsi, mbin_rbx

	;; XGETBV with ecx = 0: are xmm and ymm state both enabled?
	xor	ecx, ecx
	xgetbv
	and	eax, FLAG_XGETBV_EAX_XMM_YMM
	cmp	eax, FLAG_XGETBV_EAX_XMM_YMM
	jz	%%init_done
	lea	mbin_rsi, [%2 WRT_OPT]		; not enabled, drop back to the base

%%init_done:
	pop	mbin_rdx
	pop	mbin_rcx
	pop	mbin_rbx
	pop	mbin_rax
	mov	[%1_dispatched], mbin_rsi	; publish before the caller's rsi is restored
	pop	mbin_rsi
	ret
%endmacro
|
|
|
|
;;;;;
; mbin_dispatch_init2 parameters
; For cases where only a base function is available
;	1-> function name
;	2-> base function
;
; Defines <1>_dispatch_init, which performs no CPU feature checks: it
; stores the address of <2> into <1>_dispatched and returns.
; rsi (esi on elf32) is saved and restored.
;;;;;
%macro mbin_dispatch_init2 2
	section .text
%1_dispatch_init:
	push	mbin_rsi
	lea	mbin_rsi, [%2 WRT_OPT]		; the one and only candidate
	mov	[%1_dispatched], mbin_rsi
	pop	mbin_rsi
	ret
%endmacro
|
|
|
|
;;;;;
; mbin_dispatch_init5 parameters
;	1-> function name
;	2-> base function
;	3-> SSE4_1 or 00/01 optimized function
;	4-> AVX/02 opt func
;	5-> AVX2/04 opt func
;
; Defines <1>_dispatch_init, which stores the selected function address
; into <1>_dispatched. Selection:
;	- start with <2>; use <3> instead if CPUID.1:ECX reports SSE4_1
;	- if CPUID.1:ECX reports both AVX and OSXSAVE and XGETBV shows
;	  xmm and ymm state enabled, use <4>, or <5> when CPUID.7:EBX
;	  reports AVX2
; The XGETBV check is done before <4>/<5> are considered, so when it
; fails the base/SSE4_1 decision made earlier is kept as is (<3> is
; never selected unless the SSE4_1 bit was actually reported).
; rsi/rax/rbx/rcx/rdx (esi.. on elf32) are saved and restored; the flags
; register is modified.
;;;;;
%macro mbin_dispatch_init5 5
	section .text
%1_dispatch_init:
	push	mbin_rsi
	push	mbin_rax
	push	mbin_rbx
	push	mbin_rcx
	push	mbin_rdx
	lea	mbin_rsi, [%2 WRT_OPT]		; Default - use base function

	mov	eax, 1
	cpuid
	;; Test for SSE4.1
	test	ecx, FLAG_CPUID1_ECX_SSE4_1
	lea	mbin_rbx, [%3 WRT_OPT]		; SSE opt func (lea keeps the test flags)
	cmovne	mbin_rsi, mbin_rbx

	;; AVX needs both the feature bit and OSXSAVE; OSXSAVE also makes
	;; the xgetbv below safe to execute
	and	ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
	cmp	ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
	jne	_%1_init_done			; AVX is not available so end

	;; Does it have xmm and ymm support
	xor	ecx, ecx
	xgetbv
	and	eax, FLAG_XGETBV_EAX_XMM_YMM
	cmp	eax, FLAG_XGETBV_EAX_XMM_YMM
	jne	_%1_init_done			; not enabled: keep the base/SSE4_1 choice
	lea	mbin_rsi, [%4 WRT_OPT]		; AVX (gen2) opt func

	;; Try for AVX2
	xor	ecx, ecx
	mov	eax, 7
	cpuid
	test	ebx, FLAG_CPUID7_EBX_AVX2
	lea	mbin_rbx, [%5 WRT_OPT]		; AVX (gen4) opt func
	cmovne	mbin_rsi, mbin_rbx

_%1_init_done:
	pop	mbin_rdx
	pop	mbin_rcx
	pop	mbin_rbx
	pop	mbin_rax
	mov	[%1_dispatched], mbin_rsi	; store the choice before restoring caller's rsi
	pop	mbin_rsi
	ret
%endmacro
|
|
|
|
;;;;;
; mbin_dispatch_init6 parameters
;	1-> function name
;	2-> base function
;	3-> SSE4_1 or 00/01 optimized function
;	4-> AVX/02 opt func
;	5-> AVX2/04 opt func
;	6-> AVX512/06 opt func
;
; Defines <1>_dispatch_init, which stores the selected function address
; into <1>_dispatched. The checks form a ladder: each rung that passes
; upgrades the selection, and the first rung that fails ends the probe
; with whatever was selected so far.
; rsi/rax/rbx/rcx/rdx/rdi (esi.. on elf32) are saved and restored; the
; flags register is modified.
;;;;;
%macro mbin_dispatch_init6 6
	section .text
%1_dispatch_init:
	push	mbin_rsi
	push	mbin_rax
	push	mbin_rbx
	push	mbin_rcx
	push	mbin_rdx
	push	mbin_rdi
	lea	mbin_rsi, [%2 WRT_OPT]		; selection starts at the base function

	;; Rung 1: SSE4_1 (CPUID leaf 1, ecx)
	mov	eax, 1
	cpuid
	mov	ebx, ecx			; keep cpuid1.ecx, ecx is reused for xgetbv
	test	ecx, FLAG_CPUID1_ECX_SSE4_1
	jz	%%init_done			; no SSE4_1: base function
	lea	mbin_rsi, [%3 WRT_OPT]		; 00/01 opt

	;; Rung 2: OSXSAVE, xmm/ymm state enabled, and the AVX bit
	test	ecx, FLAG_CPUID1_ECX_OSXSAVE
	jz	%%init_done
	xor	ecx, ecx
	xgetbv					; xcr -> edx:eax
	mov	edi, eax			; keep xgetbv.eax for the AVX512 rung

	and	eax, FLAG_XGETBV_EAX_XMM_YMM
	cmp	eax, FLAG_XGETBV_EAX_XMM_YMM
	jnz	%%init_done
	test	ebx, FLAG_CPUID1_ECX_AVX	; ebx still holds cpuid1.ecx here
	jz	%%init_done
	lea	mbin_rsi, [%4 WRT_OPT]		; AVX/02 opt

	;; Rung 3: AVX2 (CPUID leaf 7 subleaf 0, ebx)
	xor	ecx, ecx
	mov	eax, 7
	cpuid
	test	ebx, FLAG_CPUID7_EBX_AVX2
	jz	%%init_done			; no AVX2 possible
	lea	mbin_rsi, [%5 WRT_OPT]		; AVX2/04 opt func

	;; Rung 4: AVX512 - state bits from the saved xgetbv.eax, then the
	;; feature group from leaf-7 ebx
	and	edi, FLAG_XGETBV_EAX_ZMM_OPM
	cmp	edi, FLAG_XGETBV_EAX_ZMM_OPM
	jnz	%%init_done			; no AVX512 possible
	;; NOTE(review): the mask name says ECX but it is applied to leaf-7
	;; EBX - confirm its definition holds EBX bit positions
	and	ebx, FLAGS_CPUID7_ECX_AVX512_G1
	cmp	ebx, FLAGS_CPUID7_ECX_AVX512_G1
	lea	mbin_rbx, [%6 WRT_OPT]		; AVX512/06 opt; lea leaves the cmp flags intact
	cmovz	mbin_rsi, mbin_rbx

%%init_done:
	pop	mbin_rdi
	pop	mbin_rdx
	pop	mbin_rcx
	pop	mbin_rbx
	pop	mbin_rax
	mov	[%1_dispatched], mbin_rsi	; publish before the caller's rsi is restored
	pop	mbin_rsi
	ret
%endmacro
|
|
|
|
%endif ; ifndef _MULTIBINARY_ASM_
|