dnl This file is part of the GNU MP Library.
dnl
dnl The GNU MP Library is free software; you can redistribute it and/or modify
dnl it under the terms of either:
dnl
dnl * the GNU Lesser General Public License as published by the Free
dnl Software Foundation; either version 3 of the License, or (at your
dnl option) any later version.
dnl
dnl or
dnl
dnl * the GNU General Public License as published by the Free Software
dnl Foundation; either version 2 of the License, or (at your option) any
dnl later version.
dnl
dnl or both in parallel, as here.
dnl
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
dnl for more details.
dnl
dnl You should have received copies of the GNU General Public License and the
dnl GNU Lesser General Public License along with the GNU MP Library. If not,
dnl see https://www.gnu.org/licenses/.
include(`../config.m4')
C cycles/limb
C POWER3/PPC630 -
C POWER4/PPC970 -
C POWER5 -
C POWER6 -
C POWER7 -
C POWER8 -
C POWER9 1.62
C TODO
C * Completely separate evn and odd code into two outer loops. Also consider
C unrolling these two outer loops, thereby eliminating all branches.
C * Avoid the reloading of u1 before every loop start.
C * Reduce register usage.
C * Consider getting rid of cy and instead load 3 u limbs, use addc+adde+adde.
C * Consider skewing conditional adjustments to allow mask creation with subfe
C like in the un=3 code. It might streamline the adjustments (or not).
C INPUT PARAMETERS
C Symbolic names for the three argument registers: rp = r3, up = r4, un = r5.
C NOTE(review): the roles (destination pointer, source pointer, limb count)
C are inferred from the names and from the ld/std usage further down; the
C function prologue is not part of this excerpt, so confirm there.
define(`rp', `r3')
define(`up', `r4')
define(`un', `r5')
C Copy both pointers into rp2 and up2. Those two names are defined outside
C this excerpt; all later loads and stores are relative to them.
mr rp2, rp
mr up2, up
C r22 = un - 1.
addi r22, un, -1 C count for loop FIXME: Adjust
C Zero OV before any addex is executed. Per the Power ISA, addex with CY=0
C takes its carry-in from OV and writes its carry-out back to OV, so OV acts
C as a second carry flag next to CA. r0 is rewritten by the next instruction,
C so only the OV side effect survives.
subfo r0, r0, r0 C clear OV (and r0)
C Rotate by 0 and keep only bit 63 (the least significant bit).
rldicl r0, un, 0, 63 C r0 = un & 1
C cr7 records whether un is even (r0 == 0). The branch that consumes cr7 is
C not visible in this excerpt.
cmpdi cr7, r0, 0
C L(odd): straight-line step of the odd code path.
C NOTE(review): u0, u1, rp2, up2 and the LSHU1U macro are defined outside
C this excerpt, and the branches into and out of this block are not visible,
C so the loop structure cannot be described from here.
C
C Notation below: (hi:lo) is a 128-bit value held in two registers. The
C descriptions assume the maddld and maddhdu macros expand to the POWER9
C instructions of the same name (low and unsigned high doubleword of
C RA*RB + RC).
C
C (r12:r23) = u0*u0 + r26
L(odd): maddld( r23, u0, u0, r26) C W u2^2
maddhdu(r12, u0, u0, r26) C W u2^2
C (r10:r5) = u1*u1 + r27. r5 is also the register named un, so the un
C argument is overwritten here.
maddld( r5, u1, u1, r27) C W u3^2
maddhdu(r10, u1, u1, r27) C W u3^2
C Load the doubleword at rp2+8 and the doubleword at up2-8.
ld r26, 8(rp2)
ld r8, -8(up2)
C An arithmetic right shift by 63 leaves 0 or all ones in the target (a copy
C of the sign bit of the source). sradi also writes CA.
sradi r8, r8, 63 C CAUTION: clobbers CA and r8, r8, u0
sradi r11, u0, 63 C CAUTION: clobbers CA and r11, r11, u1
C LSHU1U is a macro defined elsewhere; its expansion is not visible here.
LSHU1U
C r23 = r23 + r8, carry-out to CA.
addc r23, r23, r8
C Load the doublewords at up2+8 and up2+16.
ld r8, 8(up2)
ld r9, 16(up2)
C (r31:r28) = r8*u0 + r26
maddld( r28, r8, u0, r26) C W u3 x u2
maddhdu(r31, r8, u0, r26) C W u3 x u2
C Load the doubleword at rp2+24, then store r23 at rp2+0.
ld r26, 24(rp2)
std r23, 0(rp2) C W0
C L(evn): straight-line step of the even code path. It has the same shape as
C the L(odd) block but uses a different register assignment.
C NOTE(review): u0, u1, rp2, up2 and the LSHU1U macro are defined outside
C this excerpt, and the branches into and out of this block are not visible.
C
C Notation below: (hi:lo) is a 128-bit value held in two registers. The
C descriptions assume the maddld and maddhdu macros expand to the POWER9
C instructions of the same name (low and unsigned high doubleword of
C RA*RB + RC).
C
C (r5:r23) = u0*u0 + r26. r5 is also the register named un, so the un
C argument is overwritten here.
L(evn): maddld( r23, u0, u0, r26) C W u2^2
maddhdu(r5, u0, u0, r26) C W u2^2
C (r11:r12) = u1*u1 + r27
maddld( r12, u1, u1, r27) C W u3^2
maddhdu(r11, u1, u1, r27) C W u3^2
C Load the doubleword at rp2+8 and the doubleword at up2-8.
ld r27, 8(rp2)
ld r9, -8(up2)
C An arithmetic right shift by 63 leaves 0 or all ones in the target (a copy
C of the sign bit of the source). sradi also writes CA.
sradi r9, r9, 63 C CAUTION: clobbers CA and r9, r9, u0
sradi r10, u0, 63 C CAUTION: clobbers CA and r10, r10, u1
C LSHU1U is a macro defined elsewhere; its expansion is not visible here.
LSHU1U
C r23 = r23 + r9, carry-out to CA.
addc r23, r23, r9
C Load the doublewords at up2+8 and up2+16.
ld r9, 8(up2)
ld r8, 16(up2)
C (r30:r29) = r9*u0 + r27
maddld( r29, r9, u0, r27) C W u3 x u2
maddhdu(r30, r9, u0, r27) C W u3 x u2
C Load the doubleword at rp2+24, then store r23 at rp2+0.
ld r27, 24(rp2)
std r23, 0(rp2) C W0
C L(corner_evn): wind-down code for the even path. It stores four doublewords
C at rp2+32, rp2+40, rp2+48 and rp2+56 and then branches to L(ret).
C NOTE(review): u0, u1, rp2, up2, the LSHU1UHF macro and L(ret) are defined
C outside this excerpt.
C
C Two carry chains are interleaved: addc/adde/addze propagate through CA,
C while addex with a final operand of 0 propagates through OV (assuming the
C addex macro expands to the POWER9 addex instruction with CY=0). Instruction
C order therefore matters.
C
C Notation: (hi:lo) is a 128-bit value held in two registers.
L(corner_evn):
ld r27, 40(rp2)
C (r5:r23) = u0*u0 + r26
maddld( r23, u0, u0, r26) C W u2^2
maddhdu(r5, u0, u0, r26) C W u2^2
C (r11:r12) = u1*u1, with no addend.
mulld r12, u1, u1 C W u3^2
mulhdu r11, u1, u1 C W u3^2
ld r9, 8(up2)
C An arithmetic right shift by 63 leaves 0 or all ones in the target (a copy
C of the sign bit of the source). sradi also writes CA.
sradi r9, r9, 63 C CAUTION: clobbers CA and r9, r9, u0
sradi r10, u0, 63 C CAUTION: clobbers CA and r10, r10, u1
C LSHU1UHF is a macro defined elsewhere; its expansion is not visible here.
LSHU1UHF
C r23 = r23 + r9, starting the CA chain.
addc r23, r23, r9
ld r9, 24(up2)
C (r30:r29) = r9*u0 + r27
maddld( r29, r9, u0, r27) C W u3 x u2
maddhdu(r30, r9, u0, r27) C W u3 x u2
std r23, 32(rp2)
C r5 = r29 + r5 + CA
adde r5, r29, r5
std r5, 40(rp2)
C r12 = r12 + r30 + OV, then r12 = r12 + r10 + CA.
addex( r12, r12, r30, 0)
adde r12, r12, r10 C W FIXME can this co?
std r12, 48(rp2)
C r4 is the register named up; it is reused here as a zero operand.
li r4, 0
C r5 = r11 + 0 + OV, then r5 = r5 + CA: fold both pending carries into the
C top doubleword.
addex( r5, r11, r4, 0)
addze r5, r5
std r5, 56(rp2)
b L(ret)
C L(corner_odd): wind-down code for the odd path. It stores six doublewords
C at rp2+32 through rp2+72 in steps of 8.
C NOTE(review): u0, u1, rp2, up2 and the LSHU1UF macro are defined outside
C this excerpt. No branch ends this block in the visible text, so what it
C falls through to cannot be determined from here.
C
C Two carry chains are interleaved: addc/adde/addze propagate through CA,
C while addex with a final operand of 0 propagates through OV (assuming the
C addex macro expands to the POWER9 addex instruction with CY=0). Instruction
C order therefore matters.
C
C Notation: (hi:lo) is a 128-bit value held in two registers.
L(corner_odd):
ld r27, 48(rp2)
C (r12:r23) = u0*u0 + r26
maddld( r23, u0, u0, r26) C W u2^2
maddhdu(r12, u0, u0, r26) C W u2^2
C (r10:r5) = u1*u1 + r27
maddld( r5, u1, u1, r27) C W u3^2
maddhdu(r10, u1, u1, r27) C W u3^2
ld r26, 40(rp2)
ld r8, 8(up2)
C An arithmetic right shift by 63 leaves 0 or all ones in the target (a copy
C of the sign bit of the source). sradi also writes CA.
sradi r8, r8, 63 C CAUTION: clobbers CA and r8, r8, u0
sradi r11, u0, 63 C CAUTION: clobbers CA and r11, r11, u1
C LSHU1UF is a macro defined elsewhere; its expansion is not visible here.
LSHU1UF
C r23 = r23 + r8, starting the CA chain.
addc r23, r23, r8
ld r8, 24(up2)
ld r9, 32(up2)
C (r31:r28) = r8*u0 + r26
maddld( r28, r8, u0, r26) C W u3 x u2
maddhdu(r31, r8, u0, r26) C W u3 x u2
std r23, 32(rp2)
C (r30:r29) = r9*u0 + r11
maddld( r29, r9, u0, r11)
maddhdu(r30, r9, u0, r11)
C r12 = r28 + r12 + CA
adde r12, r28, r12
std r12, 40(rp2)
C (r11:r12) = r9*u1, with no addend.
mulld r12, r9, u1
mulhdu r11, r9, u1
C r5 = r5 + r31 + OV, then r5 = r29 + r5 + CA.
addex( r5, r5, r31, 0)
adde r5, r29, r5
std r5, 48(rp2)
C r12 = r12 + r30 + OV, then r12 = r12 + r10 + CA.
addex( r12, r12, r30, 0)
adde r12, r12, r10
std r12, 56(rp2)
C (r12:r23) = r9*r9, with no addend.
mulld r23, r9, r9 C W u2^2
mulhdu r12, r9, r9 C W u2^2
C r23 = r23 + r11 + CA, then r12 = r12 + CA. The CA chain ends here, before
C the sradi below rewrites CA.
adde r23, r23, r11
addze r12, r12
C r4 (the register named up) becomes 0 or all ones from the sign bit of r8.
sradi r4, r8, 63 C CAUTION: clobbers CA and r4, r4, r9
C r23 = r23 + r4 + OV
addex( r23, r23, r4, 0)
std r23, 64(rp2)
li r4, 0
C r12 = r12 + 0 + OV: fold the last OV carry into the top doubleword.
addex( r12, r12, r4, 0)
std r12, 72(rp2)
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.