;***************************************************************************** ;* x86inc.asm: x264asm abstraction layer ;***************************************************************************** ;* Copyright (C) 2005-2019 x264 project ;* ;* Authors: Loren Merritt <lorenm@u.washington.edu> ;* Henrik Gramner <henrik@gramner.com> ;* Anton Mitrofanov <BugMaster@narod.ru> ;* Fiona Glaser <fiona@x264.com> ;* ;* Permission to use, copy, modify, and/or distribute this software for any ;* purpose with or without fee is hereby granted, provided that the above ;* copyright notice and this permission notice appear in all copies. ;* ;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES ;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF ;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES ;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF ;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. ;*****************************************************************************
; This is a header file for the x264ASM assembly language, which uses ; NASM/YASM syntax combined with a large number of macros to provide easy ; abstraction between different calling conventions (x86_32, win64, linux64). ; It also has various other useful features to simplify writing the kind of ; DSP functions that are most often used in x264.
; Unlike the rest of x264, this file is available under an ISC license, as it ; has significant usefulness outside of x264 and we want it to be available ; to the largest audience possible. Of course, if you modify it for your own ; purposes to add a new feature, we strongly encourage contributing a patch ; as this feature might be useful for others as well. Send patches or ideas ; to x264-devel@videolan.org .
; Set PREFIX for libaom builds.
%if FORMAT_ELF
%undef PREFIX
%elif WIN64
%undef PREFIX
%else
%define PREFIX
%endif
%ifdef PREFIX
%define mangle(x) _ %+ x
%else
%define mangle(x) x
%endif
; In some instances macho32 tables get misaligned when using .rodata. ; When looking at the disassembly it appears that the offset is either ; correct or consistently off by 90. Placing them in the .text section ; works around the issue. It appears to be specific to the way libaom ; handles the tables.
%macro SECTION_RODATA 0-1 16
%ifidn __OUTPUT_FORMAT__,win32
SECTION .rdata align=%1
%elif WIN64
SECTION .rdata align=%1
%elifidn __OUTPUT_FORMAT__,macho32
SECTION .text align=%1
fakegot:
%elifidn __OUTPUT_FORMAT__,aout
SECTION .text
%else
SECTION .rodata align=%1
%endif
%endmacro
; Macros to eliminate most code duplication between x86_32 and x86_64: ; Currently this works only for leaf functions which load all their arguments ; into registers at the start, and make no other use of the stack. Luckily that ; covers most of x264's asm.
; PROLOGUE: ; %1 = number of arguments. loads them from stack if needed. ; %2 = number of registers used. pushes callee-saved regs if needed. ; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed. ; %4 = (optional) stack size to be allocated. The stack will be aligned before ; allocating the specified stack size. If the required stack alignment is ; larger than the known stack alignment the stack will be manually aligned ; and an extra register will be allocated to hold the original stack ; pointer (to not invalidate r0m etc.). To prevent the use of an extra ; register as stack pointer, request a negative stack size. ; %4+/%5+ = list of names to define to registers ; PROLOGUE can also be invoked by adding the same options to cglobal
; e.g. ; cglobal foo, 2,3,7,0x40, dst, src, tmp ; declares a function (foo) that automatically loads two arguments (dst and ; src) into registers, uses one additional register (tmp) plus 7 vector ; registers (m0-m6) and allocates 0x40 bytes of stack space.
; TODO Some functions can use some args directly from the stack. If they're the ; last args then you can just not declare them, but if they're in the middle ; we need more flexible macro.
; RET: ; Pops anything that was pushed by PROLOGUE, and returns.
; REP_RET: ; Use this instead of RET if it's a branch target.
; registers: ; rN and rNq are the native-size register holding function argument N ; rNd, rNw, rNb are dword, word, and byte size ; rNh is the high 8 bits of the word size ; rNm is the original location of arg N (a register or on the stack), dword ; rNmp is native size
%xdefine %%stack_offset stack_offset
%undef stack_offset ; so that the current value of stack_offset doesn't get baked in by xdefine
%assign %%i 0
%rep %0
%xdefine %1q r %+ %%i %+ q
%xdefine %1d r %+ %%i %+ d
%xdefine %1w r %+ %%i %+ w
%xdefine %1h r %+ %%i %+ h
%xdefine %1b r %+ %%i %+ b
%xdefine %1m r %+ %%i %+ m
%xdefine %1mp r %+ %%i %+ mp
CAT_XDEFINE arg_name, %%i, %1
%assign %%i %%i+1
%rotate 1
%endrep
%xdefine stack_offset %%stack_offset
%assign n_arg_names %0
%endmacro
%macro ALLOC_STACK 1-2 0 ; stack_size, n_xmm_regs (for win64 only)
%ifnum %1
%if %1 != 0
%assign %%pad 0
%assign stack_size %1
%if stack_size < 0
%assign stack_size -stack_size
%endif
%if WIN64
%assign %%pad %%pad + 32 ; shadow space
%if mmsize != 8
%assign xmm_regs_used %2
%if xmm_regs_used > 8
%assign %%pad %%pad + (xmm_regs_used-8)*16 ; callee-saved xmm registers
%endif
%endif
%endif
%if required_stack_alignment <= STACK_ALIGNMENT ; maintain the current stack alignment
%assign stack_size_padded stack_size + %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1)) SUB rsp, stack_size_padded
%else
%assign %%reg_num (regs_used - 1)
%xdefine rstk r %+ %%reg_num ; align stack, and save original stack location directly above ; it, i.e. in [rsp+stack_size_padded], so we can restore the ; stack in a single instruction (i.e. mov rsp, rstk or mov ; rsp, [rsp+stack_size_padded])
%if %1 < 0 ; need to store rsp on stack
%xdefine rstkm [rsp + stack_size + %%pad]
%assign %%pad %%pad + gprsize
%else; can keep rsp in rstk during whole function
%xdefine rstkm rstk
%endif
%assign stack_size_padded stack_size + ((%%pad + required_stack_alignment-1) & ~(required_stack_alignment-1)) mov rstk, rsp and rsp, ~(required_stack_alignment-1) sub rsp, stack_size_padded
movifnidn rstkm, rstk
%endif
WIN64_PUSH_XMM
%endif
%endif
%endmacro
%macro SETUP_STACK_POINTER 1
%ifnum %1
%if %1 != 0 && required_stack_alignment > STACK_ALIGNMENT
%if %1 > 0 ; Reserve an additional register for storing the original stack pointer, but avoid using ; eax/rax for this purpose since it can potentially get overwritten as a return value.
%assign regs_used (regs_used + 1)
%if AOM_ARCH_X86_64 && regs_used == 7
%assign regs_used 8
%elif AOM_ARCH_X86_64 == 0 && regs_used == 1
%assign regs_used 2
%endif
%endif
%if AOM_ARCH_X86_64 && regs_used < 5 + UNIX64 * 3 ; Ensure that we don't clobber any registers containing arguments. For UNIX64 we also preserve r6 (rax) ; since it's used as a hidden argument in vararg functions to specify the number of vector registers used.
%assign regs_used 5 + UNIX64 * 3
%endif
%endif
%endif
%endmacro
; On AMD cpus <=K10, an ordinary ret is slow if it immediately follows either ; a branch or a branch target. So switch to a 2-byte form of ret in that case. ; We can automatically detect "follows a branch", but not a branch target. ; (SSSE3 is a sufficient condition to know that your cpu doesn't have this problem.)
%macro REP_RET 0
%if has_epilogue || cpuflag(ssse3) RET
%else repret
%endif
annotate_function_size
%endmacro
%define last_branch_adr $$
%macro AUTO_REP_RET 0
%if notcpuflag(ssse3)
times ((last_branch_adr-$)>>31)+1 rep; times 1 iff $ == last_branch_adr.
%endif ret
annotate_function_size
%endmacro
;============================================================================= ; arch-independent part ;=============================================================================
%assign function_align 16
; Begin a function. ; Applies any symbol mangling needed for C linkage, and sets up a define such that ; subsequent uses of the function name automatically refer to the mangled version. ; Appends cpuflags to the function name if cpuflags has been specified. ; The "" empty default parameter is a workaround for nasm, which fails if SUFFIX ; is empty and we call cglobal_internal with just %1 %+ SUFFIX (without %2).
%macro cglobal 1-2+ ""; name, [PROLOGUE args]
cglobal_internal 1, %1 %+ SUFFIX, %2
%endmacro
%macro cvisible 1-2+ ""; name, [PROLOGUE args]
cglobal_internal 0, %1 %+ SUFFIX, %2
%endmacro
%macro cglobal_internal 2-3+
annotate_function_size
%ifndef cglobaled_%2
%if %1
%xdefine %2 mangle(private_prefix %+ _ %+ %2)
%else
%xdefine %2 mangle(public_prefix %+ _ %+ %2)
%endif
%xdefine %2.skip_prologue %2 %+ .skip_prologue
CAT_XDEFINE cglobaled_, %2, 1
%endif
%xdefine current_function %2
%xdefine current_function_section __SECT__
%if FORMAT_ELF
%if %1
global %2:function VISIBILITY
%else
global %2:function
%endif
%elif FORMAT_MACHO && HAVE_PRIVATE_EXTERN && %1
global %2:private_extern
%else
global %2
%endif align function_align
%2:
RESET_MM_PERMUTATION ; needed for x86-64, also makes disassembly somewhat nicer
%xdefine rstk rsp ; copy of the original stack pointer, used when greater alignment than the known stack alignment is required
%assign stack_offset 0 ; stack pointer offset relative to the return address
%assign stack_size 0 ; amount of stack space that can be freely used inside a function
%assign stack_size_padded 0 ; total amount of allocated stack space, including space for callee-saved xmm registers on WIN64 and alignment padding
%assign xmm_regs_used 0 ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64 and vzeroupper
%ifnidn %3, ""
PROLOGUE %3
%endif
%endmacro
; Create a global symbol from a local label with the correct name mangling and type
%macro cglobal_label 1
%if FORMAT_ELF
global current_function %+ %1:function VISIBILITY
%elif FORMAT_MACHO && HAVE_PRIVATE_EXTERN
global current_function %+ %1:private_extern
%else
global current_function %+ %1
%endif
%1:
%endmacro
; like cextern, but without the prefix
%macro cextern_naked 1
%ifdef PREFIX
%xdefine %1 mangle(%1)
%endif
CAT_XDEFINE cglobaled_, %1, 1 extern %1
%endmacro
%macro const 1-2+
%xdefine %1 mangle(private_prefix %+ _ %+ %1)
%if FORMAT_ELF
global %1:data VISIBILITY
%elif FORMAT_MACHO && HAVE_PRIVATE_EXTERN
global %1:private_extern
%else
global %1
%endif
%1: %2
%endmacro
; This is needed for ELF, otherwise the GNU linker assumes the stack is executable by default.
%if FORMAT_ELF
[SECTION .note.GNU-stack noalloc noexec nowrite progbits]
%endif
; Tell debuggers how large the function was. ; This may be invoked multiple times per function; we rely on later instances overriding earlier ones. ; This is invoked by RET and similar macros, and also cglobal does it for the previous function, ; but if the last function in a source file doesn't use any of the standard macros for its epilogue, ; then its size might be unspecified.
%macro annotate_function_size 0
%ifdef __YASM_VER__
%ifdef current_function
%if FORMAT_ELF
current_function_section
%%ecf equ $
size current_function %%ecf - current_function
__SECT__
%endif
%endif
%endif
%endmacro
%assign cpuflags_cache32 (1<<22)
%assign cpuflags_cache64 (1<<23)
%assign cpuflags_aligned (1<<24) ; not a cpu feature, but a function variant
%assign cpuflags_atom (1<<25)
; Returns a boolean value expressing whether or not the specified cpuflag is enabled.
%define cpuflag(x) (((((cpuflags & (cpuflags_ %+ x)) ^ (cpuflags_ %+ x)) - 1) >> 31) & 1)
%define notcpuflag(x) (cpuflag(x) ^ 1)
; Takes an arbitrary number of cpuflags from the above list. ; All subsequent functions (up to the next INIT_CPUFLAGS) is built for the specified cpu. ; You shouldn't need to invoke this macro directly, it's a subroutine for INIT_MMX &co.
%macro INIT_CPUFLAGS 0-*
%xdefine SUFFIX
%undef cpuname
%assign cpuflags 0
; Merge mmx, sse*, and avx* ; m# is a simd register of the currently selected size ; xm# is the corresponding xmm register if mmsize >= 16, otherwise the same as m# ; ym# is the corresponding ymm register if mmsize >= 32, otherwise the same as m# ; zm# is the corresponding zmm register if mmsize >= 64, otherwise the same as m# ; (All 4 remain in sync through SWAP.)
%assign i 0
%rep 32
DECLARE_MMCAST i
%assign i i+1
%endrep
; I often want to use macros that permute their arguments. e.g. there's no ; efficient way to implement butterfly or transpose or dct without swapping some ; arguments. ; ; I would like to not have to manually keep track of the permutations: ; If I insert a permutation in the middle of a function, it should automatically ; change everything that follows. For more complex macros I may also have multiple ; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations. ; ; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that ; permutes its arguments. It's equivalent to exchanging the contents of the ; registers, except that this way you exchange the register names instead, so it ; doesn't cost any cycles.
%macro PERMUTE 2-* ; takes a list of pairs to swap
%rep %0/2
%xdefine %%tmp%2 m%2
%rotate 2
%endrep
%rep %0/2
%xdefine m%1 %%tmp%2
CAT_XDEFINE nn, m%1, %1
%rotate 2
%endrep
%endmacro
%macro SWAP 2+ ; swaps a single chain (sometimes more concise than pairs)
%ifnum %1 ; SWAP 0, 1, ...
SWAP_INTERNAL_NUM %1, %2
%else; SWAP m0, m1, ...
SWAP_INTERNAL_NAME %1, %2
%endif
%endmacro
; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later ; calls to that function will automatically load the permutation, so values can ; be returned in mmregs.
%macro SAVE_MM_PERMUTATION 0-1
%if %0
%xdefine %%f %1_m
%else
%xdefine %%f current_function %+ _m
%endif
%assign %%i 0
%rep num_mmregs
%xdefine %%tmp m %+ %%i
CAT_XDEFINE %%f, %%i, regnumof %+ %%tmp
%assign %%i %%i+1
%endrep
%endmacro
%assign i 0
%rep 32
%if i < 8
CAT_XDEFINE sizeofmm, i, 8
CAT_XDEFINE regnumofmm, i, i
%endif
CAT_XDEFINE sizeofxmm, i, 16
CAT_XDEFINE sizeofymm, i, 32
CAT_XDEFINE sizeofzmm, i, 64
CAT_XDEFINE regnumofxmm, i, i
CAT_XDEFINE regnumofymm, i, i
CAT_XDEFINE regnumofzmm, i, i
%assign i i+1
%endrep
%undef i
%macro CHECK_AVX_INSTR_EMU 3-*
%xdefine %%opcode %1
%xdefine %%dst %2
%rep %0-2
%ifidn %%dst, %3
%error non-avx emulation of ``%%opcode'' is not supported
%endif
%rotate 1
%endrep
%endmacro
;%1 == instruction ;%2 == minimal instruction set ;%3 == 1 if float, 0 if int ;%4 == 1 if 4-operand emulation, 0 if 3-operand emulation, 255 otherwise (no emulation) ;%5 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not ;%6+: operands
%macro RUN_AVX_INSTR 6-9+
%ifnum sizeof%7
%assign __sizeofreg sizeof%7
%elifnum sizeof%6
%assign __sizeofreg sizeof%6
%else
%assign __sizeofreg mmsize
%endif
%assign __emulate_avx 0
%if avx_enabled && __sizeofreg >= 16
%xdefine __instr v%1
%else
%xdefine __instr %1
%if %0 >= 8+%4
%assign __emulate_avx 1
%endif
%endif
%ifnidn %2, fnord
%ifdef cpuname
%if notcpuflag(%2)
%error use of ``%1'' %2 instruction in cpuname function: current_function
%elif %3 == 0 && __sizeofreg == 16 && notcpuflag(sse2)
%error use of ``%1'' sse2 instruction in cpuname function: current_function
%elif %3 == 0 && __sizeofreg == 32 && notcpuflag(avx2)
%error use of ``%1'' avx2 instruction in cpuname function: current_function
%elif __sizeofreg == 16 && notcpuflag(sse)
%error use of ``%1'' sse instruction in cpuname function: current_function
%elif __sizeofreg == 32 && notcpuflag(avx)
%error use of ``%1'' avx instruction in cpuname function: current_function
%elif __sizeofreg == 64 && notcpuflag(avx512)
%error use of ``%1'' avx512 instruction in cpuname function: current_function
%elifidn %1, pextrw ; special case because the base instruction is mmx2,
%ifnid %6 ; but sse4 is required for memory operands
%if notcpuflag(sse4)
%error use of ``%1'' sse4 instruction in cpuname function: current_function
%endif
%endif
%endif
%endif
%endif
%if __emulate_avx
%xdefine __src1 %7
%xdefine __src2 %8
%if %5 && %4 == 0
%ifnidn %6, %7
%ifidn %6, %8
%xdefine __src1 %8
%xdefine __src2 %7
%elifnnum sizeof%8 ; 3-operand AVX instructions with a memory arg can only have it in src2, ; whereas SSE emulation prefers to have it in src1 (i.e. the mov). ; So, if the instruction is commutative with a memory arg, swap them.
%xdefine __src1 %8
%xdefine __src2 %7
%endif
%endif
%endif
%ifnidn %6, __src1
%if %0 >= 9
CHECK_AVX_INSTR_EMU {%1 %6, %7, %8, %9}, %6, __src2, %9
%else
CHECK_AVX_INSTR_EMU {%1 %6, %7, %8}, %6, __src2
%endif
%if __sizeofreg == 8
MOVQ %6, __src1
%elif %3
MOVAPS %6, __src1
%else
MOVDQA %6, __src1
%endif
%endif
%if %0 >= 9
%1 %6, __src2, %9
%else
%1 %6, __src2
%endif
%elif %0 >= 9
__instr %6, %7, %8, %9
%elif %0 == 8
%if avx_enabled && %5
%xdefine __src1 %7
%xdefine __src2 %8
%ifnum regnumof%7
%ifnum regnumof%8
%if regnumof%7 < 8 && regnumof%8 >= 8 && regnumof%8 < 16 && sizeof%8 <= 32 ; Most VEX-encoded instructions require an additional byte to encode when ; src2 is a high register (e.g. m8..15). If the instruction is commutative ; we can swap src1 and src2 when doing so reduces the instruction length.
%xdefine __src1 %8
%xdefine __src2 %7
%endif
%endif
%endif
__instr %6, __src1, __src2
%else
__instr %6, %7, %8
%endif
%elif %0 == 7
%if avx_enabled && %5
%xdefine __src1 %6
%xdefine __src2 %7
%ifnum regnumof%6
%ifnum regnumof%7
%if regnumof%6 < 8 && regnumof%7 >= 8 && regnumof%7 < 16 && sizeof%7 <= 32
%xdefine __src1 %7
%xdefine __src2 %6
%endif
%endif
%endif
__instr %6, __src1, __src2
%else
__instr %6, %7
%endif
%else
__instr %6
%endif
%endmacro
; Macros for consolidating FMA3 and FMA4 using 4-operand (dst, src1, src2, src3) syntax. ; FMA3 is only possible if dst is the same as one of the src registers. ; Either src2 or src3 can be a memory operand.
%macro FMA4_INSTR 2-*
%push fma4_instr
%xdefine %$prefix %1
%rep %0 - 1
%macro %$prefix%2 4-6 %$prefix, %2
%if notcpuflag(fma3) && notcpuflag(fma4)
%error use of ``%5%6'' fma instruction in cpuname function: current_function
%elif cpuflag(fma4)
v%5%6 %1, %2, %3, %4
%elifidn %1, %2 ; If %3 or %4 is a memory operand it needs to be encoded as the last operand.
%ifnum sizeof%3
v%{5}213%6 %2, %3, %4
%else
v%{5}132%6 %2, %4, %3
%endif
%elifidn %1, %3
v%{5}213%6 %3, %2, %4
%elifidn %1, %4
v%{5}231%6 %4, %2, %3
%else
%error fma3 emulation of ``%5%6 %1, %2, %3, %4'' is not supported
%endif
%endmacro
%rotate 1
%endrep
%pop
%endmacro
FMA4_INSTR fmadd, pd, ps, sd, ss
FMA4_INSTR fmaddsub, pd, ps
FMA4_INSTR fmsub, pd, ps, sd, ss
FMA4_INSTR fmsubadd, pd, ps
FMA4_INSTR fnmadd, pd, ps, sd, ss
FMA4_INSTR fnmsub, pd, ps, sd, ss
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.