Quelle mode1o.asm Sprache: Masm

dnl  AMD64 mpn_modexact_1_odd -- Hensel norm remainder.

dnl  Copyright 2000-2006, 2011, 2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl

dnl  The GNU MP Library is free software; you can redistribute it and/or modify

dnl  it under the terms of either:

dnl

dnl    * the GNU Lesser General Public License as published by the Free

dnl      Software Foundation; either version 3 of the License, or (at your

dnl      option) any later version.

dnl

dnl  or

dnl

dnl    * the GNU General Public License as published by the Free Software

dnl      Foundation; either version 2 of the License, or (at your option) any

dnl      later version.

dnl

dnl  or both in parallel, as here.

dnl

dnl  The GNU MP Library is distributed in the hope that it will be useful, but

dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY

dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License

dnl  for more details.

dnl

dnl  You should have received copies of the GNU General Public License and the

dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,

dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C                 cycles/limb
C AMD K8,K9           10
C AMD K10             10
C Intel P4            33
C Intel core2         13
C Intel corei         14.5
C Intel atom          35
C VIA nano             ?

C The dependent chain in the main loop is

C

C                            cycles
C       sub   %rdx, %rax        1
C       imul  %r9, %rax         4
C       mul   %r8               5
C                             ----
C       total                  10

C

C The mov load from src seems to need to be scheduled back before the jz to
C achieve this speed; out-of-order execution apparently cannot completely hide
C the latency otherwise.

C

C The l=src[i]-cbit step is rotated back too, since that allows us to avoid it

C for the first iteration (where there's no cbit).

C

C The code alignment used (32-byte) for the loop also seems necessary.  Without

C that the non-PIC case has adc crossing the 0x60 offset, apparently making it

C run at 11 cycles instead of 10.

C Assembler preamble.  Every macro used here is defined outside this file
C (config.m4 and whatever it pulls in), so the notes below are inferred from
C the macro names and should be confirmed against those definitions.

C Presumably declares that this file handles both the DOS64 and the STD64
C calling conventions; the entry code uses FUNC_ENTRY and IFDOS accordingly.
ABI_SUPPORT(DOS64)

ABI_SUPPORT(STD64)

ASM_START()

C Presumably selects the code section.
 TEXT

C Alignment request for the first entry point that follows.
 ALIGN(32)

C mpn_modexact_1_odd   entry taking src, size, divisor
C mpn_modexact_1c_odd  entry taking src, size, divisor, carry
C
C Both entries share one body.  The three-argument entry zeroes the carry
C register and then reaches L(ent): through the IFDOS jump in DOS64 builds,
C otherwise by running on into the second PROLOGUE.  That fall-through
C presumably relies on FUNC_ENTRY emitting no instructions for STD64; the
C macro is not defined in this file -- TODO confirm.
C
C Register roles at L(ent):  rdi src, rsi size, rdx divisor, rcx carry.
C Result:                    rax = cbit + climb left after the last limb.
C
C Outline:
C   1. Fetch an 8-bit inverse of the divisor from binvert_limb_table, indexed
C      by bits 1..7 of d, and widen it to 16, 32 and 64 bits with three steps
C      of inv = 2*inv - inv*inv*d.
C   2. For each limb:  l = src[i] - cbit - climb,  q = l * inv,
C      climb = high limb of q * d.
C   3. Return climb + cbit from the final limb, which L(one) handles.
C
C src[0] is loaded unconditionally and the counter only stops when it reaches
C zero from below, so size is presumably required to be at least 1 -- TODO
C confirm against callers.  The table index drops bit 0 of d, which matches
C the odd-divisor requirement suggested by the function name.
PROLOGUE(mpn_modexact_1_odd)

 FUNC_ENTRY(3)

 mov $0, R32(%rcx) C no carry argument on this entry, so carry = 0

C DOS64 builds jump over the second entry sequence explicitly.
IFDOS(` jmp L(ent)  ')

PROLOGUE(mpn_modexact_1c_odd)

 FUNC_ENTRY(4)

L(ent):

 C rdi src

 C rsi size

 C rdx divisor

 C rcx carry

 mov %rdx, %r8  C d

 shr R32(%rdx)  C d/2

 LEA( binvert_limb_table, %r9) C r9 = table base, used as base register below

 and $127, R32(%rdx) C table index = (d/2) mod 128

 mov %rcx, %r10  C initial carry

 movzbl (%r9,%rdx), R32(%rdx) C inv 8 bits

 mov (%rdi), %rax  C src[0]

 lea (%rdi,%rsi,8), %r11 C src end

 mov %r8, %rdi  C d, made available to imull

 lea (%rdx,%rdx), R32(%rcx) C 2*inv

 imul R32(%rdx), R32(%rdx) C inv*inv

 neg %rsi   C -size

 imul R32(%rdi), R32(%rdx) C inv*inv*d

 sub R32(%rdx), R32(%rcx) C inv = 2*inv - inv*inv*d, 16 bits

 lea (%rcx,%rcx), R32(%rdx) C 2*inv

 imul R32(%rcx), R32(%rcx) C inv*inv

 imul R32(%rdi), R32(%rcx) C inv*inv*d

 sub R32(%rcx), R32(%rdx) C inv = 2*inv - inv*inv*d, 32 bits

 xor R32(%rcx), R32(%rcx) C initial cbit

 lea (%rdx,%rdx), %r9 C 2*inv

 imul %rdx, %rdx  C inv*inv

 imul %r8, %rdx  C inv*inv*d

 sub %rdx, %r9  C inv = 2*inv - inv*inv*d, 64 bits

 mov %r10, %rdx  C initial climb

C Self-check of the finished inverse.  ASSERT is defined outside this file;
C presumably it emits the quoted code only in assertion-enabled builds and
C expects the e (equal) condition to hold after the cmp -- TODO confirm.
C r10 is free to use as scratch here since its value was just copied to rdx.
 ASSERT(e,` C d*inv == 1 mod 2^64

 mov %r8, %r10

 imul %r9, %r10

 cmp $1, %r10')

 inc %rsi   C counter = 1-size, which is zero when there is a single limb

 jz L(one)  C single limb: skip the loop

C NOTE(review): the file header comment speaks of 32-byte alignment for the
C loop, while this directive asks for 16 and the function entry asks for 32.
C Confirm which alignment the timing remark refers to before changing either.
 ALIGN(16)

L(top):

 C rax l = src[i]-cbit

 C rcx new cbit, 0 or 1

 C rdx climb, high of last product

 C rsi counter, limbs, negative

 C rdi copy of d made before the loop, not read inside it

 C r8 divisor

 C r9 inverse

 C r11 src end ptr

 sub %rdx, %rax  C l = src[i]-cbit - climb

 adc $0, %rcx  C more cbit

 imul %r9, %rax  C q = l * inverse

 mul %r8   C climb = high (q * d)

 mov (%r11,%rsi,8), %rax C src[i+1]

 sub %rcx, %rax  C next l = src[i+1] - cbit

 setc R8(%rcx)  C new cbit

 inc %rsi   C step the negative counter toward zero

 jnz L(top)  C zero means the limb just loaded is the last one

C Final limb: same step as the loop body, but with no further load.
L(one):

 sub %rdx, %rax  C l = src[i]-cbit - climb

 adc $0, %rcx  C more cbit

 imul %r9, %rax  C q = l * inverse

 mul %r8   C climb = high (q * d)

 lea (%rcx,%rdx), %rax C climb+cbit

C FUNC_EXIT is defined outside this file; presumably it undoes whatever
C FUNC_ENTRY set up for the DOS64 ABI -- TODO confirm.
 FUNC_EXIT()

 ret

EPILOGUE(mpn_modexact_1c_odd)

EPILOGUE(mpn_modexact_1_odd)

Messung V0.5

¤ Dauer der Verarbeitung: 0.11 Sekunden (vorverarbeitet) ¤

Wurzel

Suchen

Beweissystem der NASA

Beweissystem Isabelle

NIST Cobol Testsuite

Cephes Mathematical Library

Wiener Entwicklungsmethode

Haftungshinweis

Die Informationen auf dieser Webseite wurden nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit, noch Qualität der bereitgestellten Informationen zugesichert.

Bemerkung:

Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.