dnl  ARM64 Neon mpn_hamdist -- mpn bit hamming distance.

dnl  Copyright 2013, 2014 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
include(`../config.m4')

C	     cycles/limb
C Cortex-A53	 4.5
C Cortex-A57	 1.9
C X-Gene	 4.36

C TODO
C  * Consider greater unrolling.
C  * Arrange to align the pointer, if that helps performance.  Use the same
C    read-and-mask trick we use on PCs, for simplicity and performance.  (Sorry
C    valgrind!)
C  * Explore if explicit align directives, e.g., "[ptr:128]" help.
C  * See rth's gmp-devel 2013-02/03 messages about final summation tricks.

changecom(blah)

C INPUT PARAMETERS
define(`ap', x0)
define(`bp', x1)
define(`n', x2)

C We sum into 16 16-bit counters in v4,v5, but at the end we sum them and end
C up with 8 16-bit counters.  Therefore, we can sum to 8(2^16-1) bits, or
C (8*2^16-1)/64 = 0x1fff limbs.  We use a chunksize close to that, but which
C allows the huge count code to jump deep into the code (at L(chu)).
define(`maxsize',  0x1fff)
define(`chunksize',0x1ff0)
ASM_START()
PROLOGUE(mpn_hamdist)
	C In:  ap = x0, bp = x1, n = x2 (limb count).
	C Out: x0 = number of differing bits between {ap,n} and {bp,n},
	C      i.e. the population count of the limb-wise XOR, accumulated
	C      with Neon cnt/uadalp into 16-bit lane counters in v4/v5.
	mov	x11, #maxsize
	cmp	n, x11
	b.hi	L(gt8k)			C n too big for 16-bit counters: chunk it

L(lt8k):
	movi	v4.16b, #0		C clear summation register
	movi	v5.16b, #0		C clear summation register

	C Peel n mod 4 limbs (bits 0 and 1 of n) so the rest runs in
	C 4-limb groups.
	tbz	n, #0, L(xx0)
	sub	n, n, #1
	ld1	{v0.1d}, [ap], #8	C load 1 limb
	ld1	{v16.1d}, [bp], #8	C load 1 limb
	eor	v0.16b, v0.16b, v16.16b
	cnt	v6.16b, v0.16b
	uadalp	v4.8h, v6.16b		C could also splat

L(xx0):	tbz	n, #1, L(x00)
	sub	n, n, #2
	ld1	{v0.2d}, [ap], #16	C load 2 limbs
	ld1	{v16.2d}, [bp], #16	C load 2 limbs
	eor	v0.16b, v0.16b, v16.16b
	cnt	v6.16b, v0.16b
	uadalp	v4.8h, v6.16b

L(x00):	tbz	n, #2, L(000)
	subs	n, n, #4
	ld1	{v0.2d,v1.2d}, [ap], #32	C load 4 limbs
	ld1	{v16.2d,v17.2d}, [bp], #32	C load 4 limbs
	b.ls	L(sum)

L(gt4):	ld1	{v2.2d,v3.2d}, [ap], #32	C load 4 limbs
	ld1	{v18.2d,v19.2d}, [bp], #32	C load 4 limbs
	eor	v0.16b, v0.16b, v16.16b
	eor	v1.16b, v1.16b, v17.16b
	sub	n, n, #4
	cnt	v6.16b, v0.16b
	cnt	v7.16b, v1.16b
	b	L(mid)

L(000):	subs	n, n, #8
	b.lo	L(e0)

C Software-pipelined 8-limbs-per-iteration loop.  L(chu) is also the entry
C point the huge-count driver below reaches with bl.
L(chu):	ld1	{v2.2d,v3.2d}, [ap], #32	C load 4 limbs
	ld1	{v0.2d,v1.2d}, [ap], #32	C load 4 limbs
	ld1	{v18.2d,v19.2d}, [bp], #32	C load 4 limbs
	ld1	{v16.2d,v17.2d}, [bp], #32	C load 4 limbs
	eor	v2.16b, v2.16b, v18.16b
	eor	v3.16b, v3.16b, v19.16b
	cnt	v6.16b, v2.16b
	cnt	v7.16b, v3.16b
	subs	n, n, #8
	b.lo	L(end)

L(top):	ld1	{v2.2d,v3.2d}, [ap], #32	C load 4 limbs
	ld1	{v18.2d,v19.2d}, [bp], #32	C load 4 limbs
	eor	v0.16b, v0.16b, v16.16b
	eor	v1.16b, v1.16b, v17.16b
	uadalp	v4.8h, v6.16b
	cnt	v6.16b, v0.16b
	uadalp	v5.8h, v7.16b
	cnt	v7.16b, v1.16b
L(mid):	ld1	{v0.2d,v1.2d}, [ap], #32	C load 4 limbs
	ld1	{v16.2d,v17.2d}, [bp], #32	C load 4 limbs
	eor	v2.16b, v2.16b, v18.16b
	eor	v3.16b, v3.16b, v19.16b
	subs	n, n, #8
	uadalp	v4.8h, v6.16b
	cnt	v6.16b, v2.16b
	uadalp	v5.8h, v7.16b
	cnt	v7.16b, v3.16b
	b.hs	L(top)

C Drain the pipeline: v6/v7 hold counts not yet accumulated.
L(end):	uadalp	v4.8h, v6.16b
	uadalp	v5.8h, v7.16b

C Handle the final 4 limbs still sitting in v0/v1 and v16/v17.
L(sum):	eor	v0.16b, v0.16b, v16.16b
	eor	v1.16b, v1.16b, v17.16b
	cnt	v6.16b, v0.16b
	cnt	v7.16b, v1.16b
	uadalp	v4.8h, v6.16b
	uadalp	v5.8h, v7.16b
	add	v4.8h, v4.8h, v5.8h	C we have 8 16-bit counts
L(e0):	uaddlp	v4.4s, v4.8h		C we have 4 32-bit counts
	uaddlp	v4.2d, v4.4s		C we have 2 64-bit counts
	mov	x0, v4.d[0]
	mov	x1, v4.d[1]
	add	x0, x0, x1
	ret

C Code for count > maxsize.  Splits operand and calls above code.
define(`ap2', x5)			C caller-saves reg not used above
define(`bp2', x6)			C caller-saves reg not used above
L(gt8k):
	mov	x8, x30			C save return address across bl below
	mov	x7, n			C full count (caller-saves reg not used above)
	mov	x4, #0			C total sum (caller-saves reg not used above)
	mov	x9, #chunksize*8	C caller-saves reg not used above
	mov	x10, #chunksize		C caller-saves reg not used above

1:	add	ap2, ap, x9		C point at subsequent block
	add	bp2, bp, x9		C point at subsequent block
	mov	n, #chunksize-8		C count for this invocation, adjusted for entry pt
	movi	v4.16b, #0		C clear chunk summation register
	movi	v5.16b, #0		C clear chunk summation register
	bl	L(chu)			C jump deep inside code
	add	x4, x4, x0		C accumulate chunk result into total
	mov	ap, ap2			C put chunk pointer in place for calls
	mov	bp, bp2			C put chunk pointer in place for calls
	sub	x7, x7, x10
	cmp	x7, x11
	b.hi	1b

	mov	n, x7			C count for final invocation
	bl	L(lt8k)
	add	x0, x4, x0
	mov	x30, x8			C restore original return address
	ret
EPILOGUE()