dnl Intel Pentium-II mpn_divrem_1 -- mpn by limb division.
dnl Copyright 1999-2002 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
dnl
dnl The GNU MP Library is free software; you can redistribute it and/or modify
dnl it under the terms of either:
dnl
dnl * the GNU Lesser General Public License as published by the Free
dnl Software Foundation; either version 3 of the License, or (at your
dnl option) any later version.
dnl
dnl or
dnl
dnl * the GNU General Public License as published by the Free Software
dnl Foundation; either version 2 of the License, or (at your option) any
dnl later version.
dnl
dnl or both in parallel, as here.
dnl
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
dnl for more details.
dnl
dnl You should have received copies of the GNU General Public License and the
dnl GNU Lesser General Public License along with the GNU MP Library. If not,
dnl see https://www.gnu.org/licenses/.
include(`../config.m4')
C P6MMX: 25.0 cycles/limb integer part, 17.5 cycles/limb fraction part.
C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,
C mp_srcptr src, mp_size_t size,
C mp_limb_t divisor);
C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
C mp_srcptr src, mp_size_t size,
C mp_limb_t divisor, mp_limb_t carry);
C mp_limb_t mpn_preinv_divrem_1 (mp_ptr dst, mp_size_t xsize,
C mp_srcptr src, mp_size_t size,
C mp_limb_t divisor, mp_limb_t inverse,
C unsigned shift);
C
C Thiscode is a lightly reworked version of mpn/x86/k7/mmx/divrem_1.asm,
C see that file for some comments. It's possible what's here can be improved.
dnl MUL_THRESHOLD is the value of xsize+size at which the multiply by
dnl inverse method is used, rather than plain "divl"s. Minimum value 1.
dnl
dnl The different speeds of the integer and fraction parts means that using
dnl xsize+size isn't quite right. The threshold wants to be a bit higher
dnl for the integer part and a bit lower for the fraction part. (Or what's
dnl really wanted is to speed up the integer part!)
dnl
dnl The threshold is set to make the integer part right. At 4 limbs the
dnl divandmul are about the same there, but on the fractional part the
dnl mul is much faster.
L(start_preinv):
C eax inverse
C ebx size
C ecx shift
C edx
C esi src
C edicarry
C ebp divisor
C
C mm7 rshift
movl %eax, VAR_INVERSE
orl %ebx, %ebx C size
leal -12(%esi,%ebx,4), %eax C &src[size-3]
movl %eax, VAR_SRC jz L(start_zero)
movl 8(%eax), %esi C src high limb
cmpl $1, %ebx jz L(start_one)
L(start_two_or_more):
movl 4(%eax), %edx C src second highest limb
shldl( %cl, %esi, %edi) C n2 = carry,high << l
shldl( %cl, %edx, %esi) C n10 = high,second << l
cmpl $2, %ebx je L(integer_two_left) jmp L(integer_top)
L(start_one):
shldl( %cl, %esi, %edi) C n2 = carry,high << l
shll %cl, %esi C n10 = high << l jmp L(integer_one_left)
L(start_zero):
C Can be here with xsize==0 if mpn_preinv_divrem_1 had size==1 and
C skipped a division.
shll %cl, %edi C n2 = carry << l
movl %edi, %eax C return value for zero_done
cmpl $0, PARAM_XSIZE
je L(zero_done) jmp L(fraction_some)
C -----------------------------------------------------------------------------
C
C This loop runs at about 25 cycles, which is probably sub-optimal, and
C certainly more than the dependent chain would suggest. A better loop, or
C a better rough analysis of what's possible, would be welcomed.
C
C In the current implementation, the following successively dependent
C micro-ops seem to exist.
C
C uops
C n2+n1 1 (addl)
C mul 5
C q1+1 3 (addl/adcl)
C mul 5
C sub 3 (subl/sbbl)
C addback 2 (cmov)
C ---
C 19
C
C Lack of registers hinders explicit scheduling and it might be that the
C normal out of order execution isn't able to hide enough under the mul
C latencies.
C
C Using sarl/negl to pick out n1 for the n2+n1 stage is a touch faster than
C cmov (and takes one uop off the dependent chain). A sarl/andl/addl
C combination was tried for the addback (despite the fact it would lengthen
C the dependent chain) but found to be no faster.
ALIGN(16)
L(integer_top):
C eax scratch
C ebx scratch (nadj, q1)
C ecx scratch (src, dst)
C edx scratch
C esi n10
C edi n2
C ebp d
C
C mm0 scratch (src qword)
C mm7 rshift for normalization
movl %esi, %eax
movl %ebp, %ebx
sarl $31, %eax C -n1
movl VAR_SRC, %ecx
andl %eax, %ebx C -n1 & d
negl %eax C n1
addl %esi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow
addl %edi, %eax C n2+n1
movq (%ecx), %mm0 C next src limb and the one below it
mull VAR_INVERSE C m*(n2+n1)
subl $4, %ecx
movl %ecx, VAR_SRC
C
C
addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
movl %ebp, %eax C d
leal 1(%edi), %ebx C n2+1
sbbl %edx, %edi C n - (q1+1)*d
movl %esi, %edi C remainder -> n2
leal (%ebp,%esi), %edx
cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1 movd %mm0, %esi
sbbl $0, %ebx C q
subl $4, %ecx
movl %ebx, (%ecx)
cmpl %eax, %ecx
movl %ecx, VAR_DST jne L(integer_top)
L(integer_loop_done):
C -----------------------------------------------------------------------------
C
C Here, and in integer_one_left below, an sbbl $0 is used rather than a jz
C q1_ff special case. This make the code a bit smaller and simpler, and
C costs only 2 cycles (each).
L(integer_two_left):
C eax scratch
C ebx scratch (nadj, q1)
C ecx scratch (src, dst)
C edx scratch
C esi n10
C edi n2
C ebp divisor
C
C mm7 rshift
movl %esi, %eax
movl %ebp, %ebx
sarl $31, %eax C -n1
movl PARAM_SRC, %ecx
andl %eax, %ebx C -n1 & d
negl %eax C n1
addl %esi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow
addl %edi, %eax C n2+n1
mull VAR_INVERSE C m*(n2+n1)
movd (%ecx), %mm0 C src low limb
movl VAR_DST_STOP, %ecx
C
C
addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
leal 1(%edi), %ebx C n2+1
movl %ebp, %eax C d
sbbl %edx, %edi C n - (q1+1)*d
movl %esi, %edi C remainder -> n2
leal (%ebp,%esi), %edx
cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1 movd %mm0, %esi
sbbl $0, %ebx C q
movl %ebx, -4(%ecx)
C -----------------------------------------------------------------------------
L(integer_one_left):
C eax scratch
C ebx scratch (nadj, q1)
C ecx scratch (dst)
C edx scratch
C esi n10
C edi n2
C ebp divisor
C
C mm7 rshift
movl %esi, %eax
movl %ebp, %ebx
sarl $31, %eax C -n1
movl VAR_DST_STOP, %ecx
andl %eax, %ebx C -n1 & d
negl %eax C n1
addl %esi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow
addl %edi, %eax C n2+n1
mull VAR_INVERSE C m*(n2+n1)
C
C
C
addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
leal 1(%edi), %ebx C n2+1
movl %ebp, %eax C d
C -----------------------------------------------------------------------------
C
C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword
C of q*d is simply -d and the remainder n-q*d = n10+d
L(q1_ff):
C eax (divisor)
C ebx (q1+1 == 0)
C ecx
C edx
C esi n10
C edi n2
C ebp divisor
movl %ecx, VAR_DST
psrlq %mm7, %mm0
leal (%ebp,%esi), %edi C n-q*d remainder -> next n2
movl $-1, (%ecx) movd %mm0, %esi C next n10
cmpl %ecx, %edx jne L(integer_top)
jmp L(integer_loop_done)
C -----------------------------------------------------------------------------
C
C In the current implementation, the following successively dependent
C micro-ops seem to exist.
C
C uops
C mul 5
C q1+1 1 (addl)
C mul 5
C sub 3 (negl/sbbl)
C addback 2 (cmov)
C ---
C 16
C
C The loop in fact runs at about 17.5 cycles. Using a sarl/andl/addl for
C the addback was found to be a touch slower.
ALIGN(16)
L(fraction_some):
C eax
C ebx
C ecx
C edx
C esi
C edicarry
C ebp divisor
ALIGN(16)
L(fraction_top):
C eax n2, then scratch
C ebx scratch (nadj, q1)
C ecx dst, decrementing
C edx scratch
C esi dst stop point
C edi n2
C ebp divisor
mull VAR_INVERSE C m*n2
movl %ebp, %eax C d
subl $4, %ecx C dst
leal 1(%edi), %ebx
C
C
C
addl %edx, %ebx C 1 + high(n2<<32 + m*n2) = q1+1
mull %ebx C (q1+1)*d
C
C
C
C
negl %eax C low of n - (q1+1)*d
sbbl %edx, %edi C high of n - (q1+1)*d, caring only about carry
leal (%ebp,%eax), %edx
cmovc( %edx, %eax) C n - q1*d if underflow from using q1+1
sbbl $0, %ebx C q
movl %eax, %edi C remainder->n2
cmpl %esi, %ecx
movl %ebx, (%ecx) C previous q jne L(fraction_top)
jmp L(fraction_done)
EPILOGUE()
Messung V0.5
¤ Dauer der Verarbeitung: 0.12 Sekunden
(vorverarbeitet)
¤
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.