/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Core of the accelerated CRC algorithm.
 * In your file, define the constants and CRC_FUNCTION_NAME
 * Then include this file.
 *
 * Calculate the checksum of data that is 16 byte aligned and a multiple of
 * 16 bytes.
 *
 * The first step is to reduce it to 1024 bits. We do this in 8 parallel
 * chunks in order to mask the latency of the vpmsum instructions. If we
 * have more than 32 kB of data to checksum we repeat this step multiple
 * times, passing in the previous 1024 bits.
 *
 * The next step is to reduce the 1024 bits to 64 bits. This step adds
 * 32 bits of 0s to the end - this matches what a CRC does. We just
 * calculate constants that land the data in this 32 bits.
 *
 * We then use fixed point Barrett reduction to compute a mod n over GF(2)
 * for n = CRC using POWER8 instructions. We use x = 32.
 *
 * https://en.wikipedia.org/wiki/Barrett_reduction
 *
 * Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM
 */
/*
 * Get the initial CRC value into v8. The value arrives in GPR r3
 * (moved into the vector register by MTVRD); it is then positioned
 * in the 32-bit lane the rest of the algorithm expects, which differs
 * between the bit-reflected and non-reflected variants.
 */
vxor v8,v8,v8
MTVRD(v8, R3)
#ifdef REFLECT
vsldoi v8,zeroes,v8,8 /* shift into bottom 32 bits */
#else
vsldoi v8,v8,zeroes,4 /* shift into top 32 bits */
#endif
/*
 * Checksum in blocks of MAX_SIZE bytes.
 * r6 = bytes remaining, r7 = bytes to process this pass.
 */
1: lis r7,MAX_SIZE@h
ori r7,r7,MAX_SIZE@l
mr r9,r7 /* r9 = MAX_SIZE; reused below to size the constants table */
cmpd r6,r7
bgt 2f
mr r7,r6 /* short/final pass: consume everything that is left */
2: subf r6,r7,r6 /* r6 -= bytes consumed by this pass */
/* our main loop does 128 bytes at a time */
srdi r7,r7,7 /* r7 = number of 128-byte loop iterations */
/*
 * Work out the offset into the constants table to start at. Each
 * constant is 16 bytes, and it is used against 128 bytes of input
 * data - 128 / 16 = 8
 */
sldi r8,r7,4 /* r8 = iterations * 16 = bytes of constants we will use */
srdi r9,r9,3 /* r9 = MAX_SIZE / 8 = constants bytes for a full pass */
subf r8,r8,r9 /* r8 = skip, so a short pass ends at the table's end */
/* We reduce our final 128 bytes in a separate step */
addi r7,r7,-1
mtctr r7 /* CTR = main-loop trip count */
LOAD_REG_ADDR(r3, .constants)
/* Find the start of our constants */
add r3,r3,r8
/* zero v0-v7 which will contain our checksums */
vxor v0,v0,v0
vxor v1,v1,v1
vxor v2,v2,v2
vxor v3,v3,v3
vxor v4,v4,v4
vxor v5,v5,v5
vxor v6,v6,v6
vxor v7,v7,v7
lvx const1,0,r3 /* preload the first multiply constant */
/*
 * If we are looping back to consume more data we use the values
 * already in v16-v23.
 */
cmpdi r0,1 /* r0 == 1 flags a loop-back pass: skip the priming loads */
beq 2f
/*
 * main loop. We modulo schedule it such that it takes three iterations
 * to complete - first iteration load, second iteration vpmsum, third
 * iteration xor. Each trip consumes 128 bytes of input (16-byte
 * vectors v16-v22 visible here) against the two constants
 * const1/const2 streamed from the table at r3.
 */
.balign 16
4: lvx const1,0,r3
addi r3,r3,16 /* advance to the next constants-table entry */
ori r2,r2,0 /* no-op; presumably paces dispatch grouping — TODO confirm */
vxor v0,v0,v8 /* fold product started two iterations ago into sum 0 */
VPMSUMD(v8,v16,const2) /* start multiply of data loaded last iteration */
lvx v16,0,r4 /* load the next 16 bytes of input */
VPERM(v16,v16,v16,byteswap) /* apply the byteswap permutation mask */
ori r2,r2,0
vxor v1,v1,v9
VPMSUMD(v9,v17,const2)
lvx v17,off16,r4
VPERM(v17,v17,v17,byteswap)
ori r2,r2,0
vxor v2,v2,v10
VPMSUMD(v10,v18,const2)
lvx v18,off32,r4
VPERM(v18,v18,v18,byteswap)
ori r2,r2,0
vxor v3,v3,v11
VPMSUMD(v11,v19,const2)
lvx v19,off48,r4
VPERM(v19,v19,v19,byteswap)
lvx const2,0,r3 /* constant used for the second half of the chunk */
ori r2,r2,0
vxor v4,v4,v12
VPMSUMD(v12,v20,const1)
lvx v20,off64,r4
VPERM(v20,v20,v20,byteswap)
ori r2,r2,0
vxor v5,v5,v13
VPMSUMD(v13,v21,const1)
lvx v21,off80,r4
VPERM(v21,v21,v21,byteswap)
ori r2,r2,0
vxor v6,v6,v14
VPMSUMD(v14,v22,const1)
lvx v22,off96,r4
VPERM(v22,v22,v22,byteswap)
ori r2,r2,0
/*
 * NOTE(review): the remainder of the unrolled body (the v7/v15/v23
 * leg and the loop-closing branch) falls outside this excerpt.
 */
#ifdef REFLECT
/*
 * vpmsumd produces a 96 bit result in the least significant bits
 * of the register. Since we are bit reflected we have to shift it
 * left 32 bits so it occupies the least significant bits in the
 * bit reflected domain.
 */
vsldoi v0,v0,zeroes,4 /* shift each partial sum left by 4 bytes */
vsldoi v1,v1,zeroes,4
vsldoi v2,v2,zeroes,4
vsldoi v3,v3,zeroes,4
vsldoi v4,v4,zeroes,4
vsldoi v5,v5,zeroes,4
vsldoi v6,v6,zeroes,4
vsldoi v7,v7,zeroes,4
#endif
/* Fold the two 64-bit halves of v0 into one doubleword */
vsldoi v1,v0,v0,8 /* v1 = v0 with its doublewords swapped */
vxor v0,v0,v1 /* xor two 64 bit results together */
#ifdef REFLECT
/* shift left one bit */
vspltisb v1,1 /* splat shift count of 1 into every byte of v1 */
vsl v0,v0,v1
#endif
vand v0,v0,mask_64bit /* keep only the low 64 bits of the fold */
#ifndef REFLECT
/*
 * Now for the Barrett reduction algorithm. The idea is to calculate q,
 * the multiple of our polynomial that we need to subtract. By
 * doing the computation 2x bits higher (ie 64 bits) and shifting the
 * result back down 2x bits, we round down to the nearest multiple.
 */
VPMSUMD(v1,v0,const1) /* ma */
vsldoi v1,zeroes,v1,8 /* q = floor(ma/(2^64)) */
VPMSUMD(v1,v1,const2) /* qn */
vxor v0,v0,v1 /* a - qn, subtraction is xor in GF(2) */
/*
 * Get the result into r3. We need to shift it left 8 bytes:
 * V0 [ 0 1 2 X ]
 * V0 [ 0 X 2 3 ]
 */
vsldoi v0,v0,zeroes,8 /* shift result into top 64 bits */
#else
/*
 * The reflected version of Barrett reduction. Instead of bit
 * reflecting our data (which is expensive to do), we bit reflect our
 * constants and our algorithm, which means the intermediate data in
 * our vector registers goes from 0-63 instead of 63-0. We can reflect
 * the algorithm because we don't carry in mod 2 arithmetic.
 */
vand v1,v0,mask_32bit /* bottom 32 bits of a */
VPMSUMD(v1,v1,const1) /* ma */
vand v1,v1,mask_32bit /* bottom 32bits of ma */
VPMSUMD(v1,v1,const2) /* qn */
vxor v0,v0,v1 /* a - qn, subtraction is xor in GF(2) */
/*
 * Since we are bit reflected, the result (ie the low 32 bits) is in
 * the high 32 bits. We just need to shift it left 4 bytes
 * V0 [ 0 1 X 3 ]
 * V0 [ 0 X 2 3 ]
 */
vsldoi v0,v0,zeroes,4 /* shift result into top 64 bits of */
#endif
/*
 * NOTE(review): the following lines are stray website-disclaimer text
 * (German, roughly: "the information on this website was compiled to the
 * best of our knowledge; no guarantee of completeness, correctness or
 * quality is given; syntax highlighting and measurement are experimental").
 * They are not part of the source and would not assemble; commented out
 * here for traceability — they should simply be deleted.
 *
 * Die Informationen auf dieser Webseite wurden
 * nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder
 * Vollständigkeit, noch Richtigkeit, noch Qualität der bereit gestellten
 * Informationen zugesichert.
 * Bemerkung:
 * Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.
 */