dnl Contributed to the GNU project by Torbjörn Granlund.
dnl Copyright 2012, 2013, 2015 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
dnl
dnl The GNU MP Library is free software; you can redistribute it and/or modify
dnl it under the terms of either:
dnl
dnl * the GNU Lesser General Public License as published by the Free
dnl Software Foundation; either version 3 of the License, or (at your
dnl option) any later version.
dnl
dnl or
dnl
dnl * the GNU General Public License as published by the Free Software
dnl Foundation; either version 2 of the License, or (at your option) any
dnl later version.
dnl
dnl or both in parallel, as here.
dnl
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
dnl for more details.
dnl
dnl You should have received copies of the GNU General Public License and the
dnl GNU Lesser General Public License along with the GNU MP Library. If not,
dnl see https://www.gnu.org/licenses/.
include(`../config.m4')
C Code structure:
C
C
C        m_2(0m4)        m_2(2m4)        m_2(1m4)        m_2(3m4)
C           |               |               |               |
C           |               |               |               |
C           |               |               |               |
C          \|/             \|/             \|/             \|/
C             ____________                    ____________
C            /            \                  /            \
C          \|/             \               \|/             \
C       am_2(3m4)       am_2(1m4)       am_2(0m4)       am_2(2m4)
C           \              /|\              \              /|\
C            \____________/                  \____________/
C                   \                               /
C                    \                             /
C                     \                           /
C                     cor3                     cor2
C                         \                   /
C                          \                 /
C                           sqr_diag_addlsh1
C TODO
C * Align more labels.
C * Further tweak counter and updates in outer loops. (This could save
C perhaps 5n cycles).
C * Avoid sub-with-lsl in outer loops. We could keep n up-shifted, then
C initialise loop counter i with a right shift.
C * Try to use fewer registers. Perhaps coalesce r9 branch target and n_saved.
C (This could save 2-3 cycles for n > 4.)
C * Optimise sqr_diag_addlsh1 loop. The current code uses old-style carry
C propagation.
C * Stop loops earlier suppressing writes of upper-most rp[] values.
C * The addmul_2 loops here run well on all cores, but mul_2 runs poorly
C particularly on Cortex-A8.
ASM_START()
PROLOGUE(mpn_sqr_basecase) and r12, n, #3 cmp n, #4
addgt r12, r12, #4 add pc, pc, r12, lsl #2 nop
b L(4)
b L(1)
b L(2)
b L(3)
b L(0m4)
b L(1m4)
b L(2m4)
b L(3m4)
L(1m4): push {r4-r11, r14} mov n_saved, n sub i, n, #4 sub n, n, #2 add r10, pc, #L(am2_2m4)-.-8
ldm up, {v0,v1,u0} sub up, up, #4 mov cyb, #0 mov r5, #0
umull r4, cya, v1, v0
str r4, [rp], #-12 mov r4, #0
b L(ko0)
L(3m4): push {r4-r11, r14} mov n_saved, n sub i, n, #4 sub n, n, #2 add r10, pc, #L(am2_0m4)-.-8
ldm up, {v0,v1,u0} add up, up, #4 mov cyb, #0 mov r5, #0
umull r4, cya, v1, v0
str r4, [rp], #-4 mov r4, #0
b L(ko2)
L(2m4): push {r4-r11, r14} mov n_saved, n sub i, n, #4 sub n, n, #2 add r10, pc, #L(am2_3m4)-.-8
ldm up, {v0,v1,u1} mov cyb, #0 mov r4, #0
umull r5, cya, v1, v0
str r5, [rp], #-8 mov r5, #0
b L(ko1)
L(0m4): push {r4-r11, r14} mov n_saved, n sub i, n, #4 sub n, n, #2 add r10, pc, #L(am2_1m4)-.-8
ldm up, {v0,v1,u1} mov cyb, #0 mov r4, #0 add up, up, #8
umull r5, cya, v1, v0
str r5, [rp, #0] mov r5, #0
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.