dnl Contributed to the GNU project by Torbjorn Granlund.
dnl Copyright 2008, 2009, 2011, 2012 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
dnl
dnl The GNU MP Library is free software; you can redistribute it and/or modify
dnl it under the terms of either:
dnl
dnl * the GNU Lesser General Public License as published by the Free
dnl Software Foundation; either version 3 of the License, or (at your
dnl option) any later version.
dnl
dnl or
dnl
dnl * the GNU General Public License as published by the Free Software
dnl Foundation; either version 2 of the License, or (at your option) any
dnl later version.
dnl
dnl or both in parallel, as here.
dnl
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
dnl for more details.
dnl
dnl You should have received copies of the GNU General Public License and the
dnl GNU Lesser General Public License along with the GNU MP Library. If not,
dnl see https://www.gnu.org/licenses/.
include(`../config.m4')
C The inner loops of this code are the result of running a code generation and
C optimization tool suite written by David Harvey and Torbjorn Granlund.
C NOTES
C * There is a major stupidity in that we call mpn_mul_1 initially, for a
C large trip count. Instead, we should follow the generic/sqr_basecase.c
C code which uses addmul_2s from the start, conditionally leaving a 1x1
C multiply to the end. (In assembly code, one would stop invoking
C addmul_2s loops when perhaps 3x2s respectively a 2x2s remains.)
C * Another stupidity is in the sqr_diag_addlsh1 code. It does not need to
C save/restore carry, instead it can propagate into the high product word.
C * Align more labels, should shave off a few cycles.
C * We can safely use 32-bit size operations, since operands with (2^32)
C limbs will lead to non-termination in practice.
C * The jump table could probably be optimized, at least for non-pic.
C * The special code for n <= 4 was quickly written. It is probably too
C large and unnecessarily slow.
C * Consider combining small cases code so that the n=k-1 code jumps into the
C middle of the n=k code.
C * Avoid saving registers for small cases code.
C * Needed variables:
C n r11 input size
C i r8 work left, initially n
C j r9 inner loop count
C r15 unused
C v0 r13
C v1 r14
C rp rdi
C up rsi
C w0 rbx
C w1 rcx
C w2 rbp
C w3 r10
C tp r12
C lo rax
C hi rdx
C rsp
C INPUT PARAMETERS
C Symbolic names for the incoming argument registers.
C rp is bound to rdi and up to rsi, as in the register list above.
C n_param is bound to rdx.  The entry code copies it into n using R32
C operands, which frees rdx for other use.
define(`rp', `%rdi')
define(`up', `%rsi')
define(`n_param', `%rdx')
ASM_START()
TEXT ALIGN(16)
PROLOGUE(mpn_sqr_basecase)
FUNC_ENTRY(3) mov R32(n_param), R32(%rcx) mov R32(n_param), R32(n) C free original n register (rdx)
add $-40, %rsp
and $3, R32(%rcx) cmp $4, R32(n_param) lea 4(%rcx), %r8
lea eval(2*8)(tp), tp C tp += 2 lea -8(up), up jmp L(dowhile)
L(1m4): lea 8(rp,n,8), tp C point tp in middle of result operand mov (up), v0 C u0 mov 8(up), %rax C u1 lea 8(up,n,8), up C point up at end of input operand
lea -3(n), i
C Function mpn_mul_2s_m0(tp, up - i, i, up - i - 1) lea -3(n), j neg j
mov %rax, v1 C u1 mul v0 C u0 * u1 mov %rdx, w1 xor R32(w2), R32(w2) mov %rax, 8(rp) jmp L(m0)
lea eval(2*8)(tp), tp C tp += 2 lea -8(up), up jmp L(dowhile_mid)
L(3m4): lea 8(rp,n,8), tp C point tp in middle of result operand mov (up), v0 C u0 mov 8(up), %rax C u1 lea 8(up,n,8), up C point up at end of input operand
lea -5(n), i
C Function mpn_mul_2s_m2(tp, up - i + 1, i - 1, up - i) lea -1(n), j neg j
mov %rax, v1 C u1 mul v0 C u0 * u1 mov %rdx, w3 xor R32(w0), R32(w0) xor R32(w1), R32(w1) mov %rax, 8(rp) jmp L(m2)
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.