Quelle mod_34lsub1.asm

Sprache: IA-64-Assembler (GNU as mit m4-Makros)
dnl  IA-64 mpn_mod_34lsub1

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 2003-2005, 2010 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl

dnl  The GNU MP Library is free software; you can redistribute it and/or modify

dnl  it under the terms of either:

dnl

dnl    * the GNU Lesser General Public License as published by the Free

dnl      Software Foundation; either version 3 of the License, or (at your

dnl      option) any later version.

dnl

dnl  or

dnl

dnl    * the GNU General Public License as published by the Free Software

dnl      Foundation; either version 2 of the License, or (at your option) any

dnl      later version.

dnl

dnl  or both in parallel, as here.

dnl

dnl  The GNU MP Library is distributed in the hope that it will be useful, but

dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY

dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License

dnl  for more details.

dnl

dnl  You should have received copies of the GNU General Public License and the

dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,

dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C           cycles/limb

C Itanium:      ?

C Itanium 2:    1

C INPUT PARAMETERS

C up is the address the limbs are read from.  Every load in this file uses
C it with an 8-byte post-increment, so it always points at the next limb.
define(`up', `r32')

C n is the limb count.  It is decremented as limbs are consumed and is used
C as the loop and tail-case selector.
define(`n',  `r33')

C Some useful aliases for registers we use
C   u0, u1, u2   limbs most recently loaded from up, used in rotation
C   a0, a1, a2   three accumulators; each loaded limb is subtracted from one
C                of them
C   c0, c1, c2   counters of the borrows produced by those subtractions
C                (a0 borrows go to c1, a1 borrows to c2, a2 borrows to c0)

define(`u0',`r14') define(`u1',`r15') define(`u2',`r16')

define(`a0',`r17') define(`a1',`r18') define(`a2',`r19')

define(`c0',`r20') define(`c1',`r21') define(`c2',`r22')

C This is a fairly simple-minded implementation.  One could approach 0.67 c/l

C with a more sophisticated implementation.  If we're really crazy, we could

C super-unroll, storing carries just in predicate registers, then copy them to

C a general register, and population count them from there.  That'd bring us

C close to 3 insn/limb, for nearly 0.5 c/l.

C Computing n/3 needs 16 cycles, which is a lot of startup overhead.

C We therefore use a plain while-style loop:

C add  n = -3, n

C cmp.le  p9, p0 = 3, n

C  (p9) br.cond  .Loop

C Alternatively, we could table n/3 for, say, n < 256, and predicate the

C 16-cycle code.

C The summing-up code at the end was written quickly, and could surely be

C vastly improved.

ASM_START()

C mpn_mod_34lsub1 -- fold an n-limb operand into a residue modulo 2^48-1.
C
C In:    up (r32)  address of the first 8-byte limb
C        n  (r33)  number of limbs.  The code has no special path for n = 0
C                  and appears to assume n >= 1 -- TODO confirm with callers.
C Out:   r8        a value congruent to the operand modulo 2^48-1.  It is not
C                  fully reduced: the n = 1 path, for instance, returns the
C                  top 16 bits of the limb plus its low 48 bits.
C Uses:  r8, r10, r11, r14-r22, r24-r31, p6-p11; up and n are modified.
C
C Method: limb i is subtracted from accumulator a(i mod 3) and the borrows
C are counted separately.  Modulo 2^48-1 we have 2^64 = 2^16, 2^128 = 2^32
C and 2^192 = 1, so a0, a1, a2 carry the weights 1, 2^16 and 2^32.  A borrow
C out of one accumulator has the weight of the next one in that cycle, which
C is why the a0 borrow is counted in c1, the a1 borrow in c2 and the a2
C borrow in c0.  The operand is then congruent to
C     (weighted sum of c0, c1, c2) - (weighted sum of a0, a1, a2)
C which the code after L(com) evaluates with 48-bit folding.

PROLOGUE(mpn_mod_34lsub1)

 .prologue

C NOTE(review): nothing in the active code copies ar.lc to r2 or writes
C ar.lc; the only ar.lc write is inside the disabled block further down.
C This unwind directive looks like a leftover -- confirm before relying on it.
 .save ar.lc, r2

 .body

C 32-bit ABI only: widen the pointer with addp4 and zero-extend the count
C from 4 bytes.
ifdef(`HAVE_ABI_32',`

 addp4  up = 0, up  C   M I

 nop.m  0

 zxt4  n = n   C   I

 ;;

')

C Disabled code (the ifelse condition compares 0 with 1 and never matches).
C It would take the high half of n * 0xAAAAAAAAAAAAAAAB, shift it right by
C one and place that value in ar.lc as a loop count.
ifelse(0,1,`

 movl  r14 = 0xAAAAAAAAAAAAAAAB

 ;;

 setf.sig f6 = r14

 setf.sig f7 = r33

 ;;

 xmpy.hu  f6 = f6, f7

 ;;

 getf.sig r8 = f6

 ;;

 shr.u  r8 = r8, 1  C Loop count

 ;;

 mov.i  ar.lc = r8

')

C Load the first limb unconditionally, then split off every case except
C n = 1.
 ld8 u0 = [up], 8

 cmp.ne p9, p0 = 1, n

  (p9) br L(gt1)

 ;;

C n = 1: return (limb >> 48) + (low 48 bits of limb).
 shr.u r8 = u0, 48

 dep.z r27 = u0, 0, 48

 ;;

 add r8 = r8, r27

 br.ret.sptk.many b0

C n != 1: zero the accumulators and borrow counters, load the second limb,
C clear the pending-borrow predicate p6, and account for the two limbs now
C held in u0 and u1 by subtracting 2 from n.  If fewer than 3 limbs remain
C (3 > n, signed) skip the loop and go straight to the tail.
L(gt1):

 {.mmi;	nop.m	0

 mov a0 = 0

 add n = -2, n

}{.mmi;	mov	c0 = 0

 mov c1 = 0

 mov c2 = 0

 ;;

}{.mmi;	ld8	u1 = [up], 8

 mov a1 = 0

 cmp.ltu p6, p0 = r0, r0  C clear p6

}{.mmb;	cmp.gt	p9, p0 = 3, n

 mov a2 = 0

  (p9) br.cond.dptk L(end)

 ;;

}

 ALIGN(32)

C Main loop, three limbs per pass.  On entry u0 and u1 hold limbs that are
C loaded but not yet subtracted, and p6 holds the borrow of the previous a2
C subtraction.  Each of the three instruction groups
C   - loads a limb that a later group consumes,
C   - adds the borrow found by the previous group to its counter,
C   - does an unsigned compare of accumulator against limb to find the new
C     borrow,
C   - subtracts the limb from the accumulator.
C The compare is in the same instruction group as the subtract, so it tests
C the accumulator value from before the subtraction.
C n drops by 3 per pass and the loop repeats while 3 <= n (signed compare).
C On exit u0 and u1 again hold two pending limbs and p6 a pending borrow.
L(top):

 {.mmi;	ld8	u2 = [up], 8

  (p6) add c0 = 1, c0

 cmp.ltu p7, p0 = a0, u0

}{.mmb;	sub	a0 = a0, u0

 add n = -3, n

 nop.b 0

 ;;

}{.mmi;	ld8	u0 = [up], 8

  (p7) add c1 = 1, c1

 cmp.ltu p8, p0 = a1, u1

}{.mmb;	sub	a1 = a1, u1

 cmp.le p9, p0 = 3, n

 nop.b 0

 ;;

}{.mmi;	ld8	u1 = [up], 8

  (p8) add c2 = 1, c2

 cmp.ltu p6, p0 = a2, u2

}{.mmb;	sub	a2 = a2, u2

 nop.m 0

dnl br.cloop.dptk L(top)

  (p9) br.cond.dptk L(top)

 ;;

}

C Tail.  Pending here: the limbs in u0 and u1, a possible borrow in p6, and
C n further limbs that are not loaded yet (n is 0, 1 or 2 when the entry
C count was at least 2).  Dispatch on n: 0 goes to L(0), 1 goes through the
C first group of L(2) and then to L(1), 2 runs all of L(2).
L(end):

 cmp.eq p10, p0 = 0, n

 cmp.eq p11, p0 = 1, n

  (p10) br L(0)

C n = 1 or 2: load one more limb into u2 and retire u0 into a0.  For n = 1
C branch to L(1).  For n = 2 load the last limb into u0, then retire u1 into
C a1, u2 into a2 and the last limb into a0, counting each borrow as in the
C loop.
L(2):

 {.mmi;	ld8	u2 = [up], 8

  (p6) add c0 = 1, c0

 cmp.ltu p7, p0 = a0, u0

}{.mmb;	sub	a0 = a0, u0

 nop.m 0

  (p11) br L(1)

 ;;

} ld8 u0 = [up], 8

  (p7) add c1 = 1, c1

 cmp.ltu p8, p0 = a1, u1

 sub a1 = a1, u1

 ;;

  (p8) add c2 = 1, c2

 cmp.ltu p6, p0 = a2, u2

 sub a2 = a2, u2

 ;;

  (p6) add c0 = 1, c0

 cmp.ltu p7, p0 = a0, u0

 sub a0 = a0, u0

 ;;

  (p7) add c1 = 1, c1

 br L(com)

C n = 1, entered after u0 was retired above: count that borrow (p7), then
C retire u1 into a1 and u2 into a2.
L(1):

  (p7) add c1 = 1, c1

 cmp.ltu p8, p0 = a1, u1

 sub a1 = a1, u1

 ;;

  (p8) add c2 = 1, c2

 cmp.ltu p6, p0 = a2, u2

 sub a2 = a2, u2

 ;;

  (p6) add c0 = 1, c0

 br L(com)

C n = 0: count the pending a2 borrow, then retire u0 into a0 and u1 into a1.
C Falls through to L(com).
L(0):

  (p6) add c0 = 1, c0

 cmp.ltu p7, p0 = a0, u0

 sub a0 = a0, u0

 ;;

  (p7) add c1 = 1, c1

 cmp.ltu p8, p0 = a1, u1

 sub a1 = a1, u1

 ;;

  (p8) add c2 = 1, c2

C Combine.  Each accumulator, and likewise each borrow counter, is cut at the
C bit that corresponds to 2^48 for its weight.  The part above the cut is
C taken with shr.u and used unshifted; the part below is placed at its weight
C (bit 0, 16 or 32) with dep.z.  The six accumulator pieces are summed into
C r24 and the six counter pieces into r10.  r14 and r15, the u0 and u1
C aliases, are reused as temporaries here.
C The diagram below shows a2, a1, a0 as three 64-bit words (first row) laid
C over four 48-bit fields (second row).
L(com):

C |     a2    |     a1    |     a0    |

C |        |        |        |        |

 shr.u r24 = a0, 48  C 16 bits

 shr.u r25 = a1, 32  C 32 bits

 shr.u r26 = a2, 16  C 48 bits

 ;;

 shr.u r10 = c0, 48  C 16 bits, always zero

 shr.u r11 = c1, 32  C 32 bits

 shr.u r30 = c2, 16  C 48 bits

 ;;

 dep.z r27 = a0,  0, 48 C 48 bits

 dep.z r28 = a1, 16, 32 C 48 bits

 dep.z r29 = a2, 32, 16 C 48 bits

 dep.z r31 = c0,  0, 48 C 48 bits

 dep.z r14 = c1, 16, 32 C 48 bits

 dep.z r15 = c2, 32, 16 C 48 bits

 ;;

 {.mmi;	add	r24 = r24, r25

 add r26 = r26, r27

 add r28 = r28, r29

}{.mmi;	add	r10 = r10, r11

 add r30 = r30, r31

 add r14 = r14, r15

 ;;

}

C Result: r8 = 0xffffffffffff0 - r24 + r10.  The constant is 16 * (2^48-1),
C a multiple of the modulus, so it leaves the residue unchanged.  Presumably
C it is there to keep the difference from going negative: r24 is a sum of
C six pieces that are each below 2^48.
 movl r8 = 0xffffffffffff0

 add r24 = r24, r26

 add r10 = r10, r30

 ;;

 add r24 = r24, r28

 add r10 = r10, r14

 ;;

 sub r8 = r8, r24

 ;;

 add r8 = r8, r10

 br.ret.sptk.many b0

EPILOGUE()

ASM_END()
Messung V0.5 in Prozent
¤ Dauer der Verarbeitung: 0.10 Sekunden (vorverarbeitet am 2026-04-30) ¤

Wurzel
Suchen
Beweissystem der NASA
Beweissystem Isabelle
NIST Cobol Testsuite
Cephes Mathematical Library
Wiener Entwicklungsmethode
Haftungshinweis

Die Informationen auf dieser Webseite wurden nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit noch Richtigkeit noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:

Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.