Source  pclmulqdq.rs   Language: Rust

use core::arch::x86_64::__m128i;
use core::{
    arch::x86_64::{
        _mm_and_si128, _mm_clmulepi64_si128, _mm_extract_epi32, _mm_load_si128, _mm_loadu_si128,
        _mm_or_si128, _mm_shuffle_epi8, _mm_slli_si128, _mm_srli_si128, _mm_storeu_si128,
        _mm_xor_si128,
    },
    mem::MaybeUninit,
};

use crate::{crc32::slice_to_uninit, CRC32_INITIAL_VALUE};

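// Wrapper that forces 16-byte alignment, so the scratch buffer below can be
// read with an aligned SSE load (`_mm_load_si128`).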
#[derive(Debug)]
#[repr(C, align(16))]
struct Align16<T>(T);

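// Builds a `__m128i` constant at compile time. On x86_64, lane 0 of the input
// array ends up in the least significant 32 bits of the vector.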
#[cfg(target_arch = "x86_64")]
const fn reg(input: [u32; 4]) -> __m128i {
    // safety: any valid [u32; 4] represents a valid __m128i
    unsafe { core::mem::transmute(input) }
}

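// 512 bits of folded CRC state, kept as four 128-bit lanes. Input is folded
// into this state 64 bytes at a time; `finish` reduces it to the final
// 32-bit CRC.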
#[derive(Debug, Clone, Copy)]
#[cfg(target_arch = "x86_64")]
pub(crate) struct Accumulator {
    fold: [__m128i; 4],
}

#[cfg(target_arch = "x86_64")]
impl Accumulator {
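    // Carryless-multiplication constants for folding the state forward by
    // 512 bits (four 128-bit lanes), following Intel's "Fast CRC Computation
    // for Generic Polynomials Using PCLMULQDQ Instruction" scheme.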
    const XMM_FOLD4: __m128i = reg([0xc6e41596u32, 0x00000001u32, 0x54442bd4u32, 0x00000001u32]);

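    // The single nonzero lane encodes the standard CRC-32 initial value
    // (0xFFFFFFFF) in this folded representation.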
    pub const fn new() -> Self {
        let xmm_crc0 = reg([0x9db42487, 0, 0, 0]);
        let xmm_zero = reg([0, 0, 0, 0]);

        Self {
            fold: [xmm_crc0, xmm_zero, xmm_zero, xmm_zero],
        }
    }

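    // Folds `src` into the accumulator; `start` carries a pending initial CRC
    // (CRC32_INITIAL_VALUE means "none"). The caller must only select this
    // implementation when pclmulqdq/sse2/sse4.1 are available.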
    pub fn fold(&mut self, src: &[u8], start: u32) {
        unsafe { self.fold_help::<false>(&mut [], src, start) }
    }

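    // Like `fold`, but also copies `src` into `dst`; the same CPU-feature
    // requirement applies.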
    pub fn fold_copy(&mut self, dst: &mut [MaybeUninit<u8>], src: &[u8]) {
        unsafe { self.fold_help::<true>(dst, src, 0) }
    }

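    // Reduces the 4 x 128-bit fold state to the final 32-bit CRC:
    // rk1/rk2 fold the four lanes into one 128-bit value, rk5/rk6 reduce
    // 128 bits to 64, and rk7/rk8 perform the final Barrett reduction modulo
    // the CRC polynomial.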
    #[target_feature(enable = "pclmulqdq", enable = "sse2", enable = "sse4.1")]
    pub unsafe fn finish(self) -> u32 {
        const CRC_MASK1: __m128i =
            reg([0xFFFFFFFFu32, 0xFFFFFFFFu32, 0x00000000u32, 0x00000000u32]);

        const CRC_MASK2: __m128i =
            reg([0x00000000u32, 0xFFFFFFFFu32, 0xFFFFFFFFu32, 0xFFFFFFFFu32]);

        const RK1_RK2: __m128i = reg([
            0xccaa009e, 0x00000000, /* rk1 */
            0x751997d0, 0x00000001, /* rk2 */
        ]);

        const RK5_RK6: __m128i = reg([
            0xccaa009e, 0x00000000, /* rk5 */
            0x63cd6124, 0x00000001, /* rk6 */
        ]);

        const RK7_RK8: __m128i = reg([
            0xf7011640, 0x00000001, /* rk7 */
            0xdb710640, 0x00000001, /* rk8 */
        ]);

        let [mut xmm_crc0, mut xmm_crc1, mut xmm_crc2, mut xmm_crc3] = self.fold;

        /*
         * k1
         */
        let mut crc_fold = RK1_RK2;

        let x_tmp0 = _mm_clmulepi64_si128(xmm_crc0, crc_fold, 0x10);
        xmm_crc0 = _mm_clmulepi64_si128(xmm_crc0, crc_fold, 0x01);
        xmm_crc1 = _mm_xor_si128(xmm_crc1, x_tmp0);
        xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_crc0);

        let x_tmp1 = _mm_clmulepi64_si128(xmm_crc1, crc_fold, 0x10);
        xmm_crc1 = _mm_clmulepi64_si128(xmm_crc1, crc_fold, 0x01);
        xmm_crc2 = _mm_xor_si128(xmm_crc2, x_tmp1);
        xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_crc1);

        let x_tmp2 = _mm_clmulepi64_si128(xmm_crc2, crc_fold, 0x10);
        xmm_crc2 = _mm_clmulepi64_si128(xmm_crc2, crc_fold, 0x01);
        xmm_crc3 = _mm_xor_si128(xmm_crc3, x_tmp2);
        xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2);

        /*
         * k5
         */
        crc_fold = RK5_RK6;

        xmm_crc0 = xmm_crc3;
        xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0);
        xmm_crc0 = _mm_srli_si128(xmm_crc0, 8);
        xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc0);

        xmm_crc0 = xmm_crc3;
        xmm_crc3 = _mm_slli_si128(xmm_crc3, 4);
        xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0x10);
        xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc0);
        xmm_crc3 = _mm_and_si128(xmm_crc3, CRC_MASK2);

        /*
         * k7
         */
        xmm_crc1 = xmm_crc3;
        xmm_crc2 = xmm_crc3;
        crc_fold = RK7_RK8;

        xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0);
        xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2);
        xmm_crc3 = _mm_and_si128(xmm_crc3, CRC_MASK1);

        xmm_crc2 = xmm_crc3;
        xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0x10);
        xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2);
        xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc1);

        !(_mm_extract_epi32(xmm_crc3, 2) as u32)
    }

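    // Shifts the fold window down by N lanes: fold[i] takes the old
    // fold[i + N], and vacated slots are refilled by folding the
    // wrapped-around lanes 512 bits forward with `step`.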
    fn fold_step<const N: usize>(&mut self) {
        self.fold = core::array::from_fn(|i| match self.fold.get(i + N) {
            Some(v) => *v,
            None => unsafe { Self::step(self.fold[(i + N) - 4]) },
        });
    }

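    // Folds a single 128-bit lane forward by 512 bits: each 64-bit half is
    // carryless-multiplied by its fold constant and the products are xored.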
    #[inline(always)]
    unsafe fn step(input: __m128i) -> __m128i {
        _mm_xor_si128(
            _mm_clmulepi64_si128(input, Self::XMM_FOLD4, 0x01),
            _mm_clmulepi64_si128(input, Self::XMM_FOLD4, 0x10),
        )
    }

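    // Merges a partial block of `len` (1..=15) bytes into the state: the
    // whole 64-byte state is shifted by `len` bytes via PSHUFB, and the bytes
    // pushed out of the first lane are folded back in with `step`.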
    unsafe fn partial_fold(&mut self, xmm_crc_part: __m128i, len: usize) {
        const PSHUFB_SHF_TABLE: [__m128i; 15] = [
            reg([0x84838281, 0x88878685, 0x8c8b8a89, 0x008f8e8d]), /* shl 15 (16 - 1)/shr1 */
            reg([0x85848382, 0x89888786, 0x8d8c8b8a, 0x01008f8e]), /* shl 14 (16 - 2)/shr2 */
            reg([0x86858483, 0x8a898887, 0x8e8d8c8b, 0x0201008f]), /* shl 13 (16 - 3)/shr3 */
            reg([0x87868584, 0x8b8a8988, 0x8f8e8d8c, 0x03020100]), /* shl 12 (16 - 4)/shr4 */
            reg([0x88878685, 0x8c8b8a89, 0x008f8e8d, 0x04030201]), /* shl 11 (16 - 5)/shr5 */
            reg([0x89888786, 0x8d8c8b8a, 0x01008f8e, 0x05040302]), /* shl 10 (16 - 6)/shr6 */
            reg([0x8a898887, 0x8e8d8c8b, 0x0201008f, 0x06050403]), /* shl  9 (16 - 7)/shr7 */
            reg([0x8b8a8988, 0x8f8e8d8c, 0x03020100, 0x07060504]), /* shl  8 (16 - 8)/shr8 */
            reg([0x8c8b8a89, 0x008f8e8d, 0x04030201, 0x08070605]), /* shl  7 (16 - 9)/shr9 */
            reg([0x8d8c8b8a, 0x01008f8e, 0x05040302, 0x09080706]), /* shl  6 (16 -10)/shr10*/
            reg([0x8e8d8c8b, 0x0201008f, 0x06050403, 0x0a090807]), /* shl  5 (16 -11)/shr11*/
            reg([0x8f8e8d8c, 0x03020100, 0x07060504, 0x0b0a0908]), /* shl  4 (16 -12)/shr12*/
            reg([0x008f8e8d, 0x04030201, 0x08070605, 0x0c0b0a09]), /* shl  3 (16 -13)/shr13*/
            reg([0x01008f8e, 0x05040302, 0x09080706, 0x0d0c0b0a]), /* shl  2 (16 -14)/shr14*/
            reg([0x0201008f, 0x06050403, 0x0a090807, 0x0e0d0c0b]), /* shl  1 (16 -15)/shr15*/
        ];

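        // `xmm_shl` shifts a register left by (16 - len) bytes. Xoring every
        // index byte with 0x80 yields the complementary right shift by `len`
        // bytes, because PSHUFB zeroes any lane whose index byte has its high
        // bit set.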
        let xmm_shl = PSHUFB_SHF_TABLE[len - 1];
        let xmm_shr = _mm_xor_si128(xmm_shl, reg([0x80808080u32; 4]));

        let xmm_a0 = Self::step(_mm_shuffle_epi8(self.fold[0], xmm_shl));

        self.fold[0] = _mm_shuffle_epi8(self.fold[0], xmm_shr);
        let xmm_tmp1 = _mm_shuffle_epi8(self.fold[1], xmm_shl);
        self.fold[0] = _mm_or_si128(self.fold[0], xmm_tmp1);

        self.fold[1] = _mm_shuffle_epi8(self.fold[1], xmm_shr);
        let xmm_tmp2 = _mm_shuffle_epi8(self.fold[2], xmm_shl);
        self.fold[1] = _mm_or_si128(self.fold[1], xmm_tmp2);

        self.fold[2] = _mm_shuffle_epi8(self.fold[2], xmm_shr);
        let xmm_tmp3 = _mm_shuffle_epi8(self.fold[3], xmm_shl);
        self.fold[2] = _mm_or_si128(self.fold[2], xmm_tmp3);

        self.fold[3] = _mm_shuffle_epi8(self.fold[3], xmm_shr);
        let xmm_crc_part = _mm_shuffle_epi8(xmm_crc_part, xmm_shl);
        self.fold[3] = _mm_or_si128(self.fold[3], xmm_crc_part);

        // zlib-ng uses casts and a floating-point xor instruction here. There is a theory that
        // this breaks dependency chains on some CPUs and gives better throughput. Other sources
        // claim that casting between integer and float has a cost and should be avoided. We can't
        // measure the difference, and choose the shorter code.
        self.fold[3] = _mm_xor_si128(self.fold[3], xmm_a0)
    }

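    // Consumes N aligned 16-byte blocks from `src`. With COPY, the blocks are
    // also written to `dst`; otherwise a pending initial CRC is xored into
    // the first block. The state is advanced with `fold_step` and the new
    // data is xored into the freed lanes. Returns the number of bytes copied.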
    #[allow(clippy::needless_range_loop)]
    fn progress<const N: usize, const COPY: bool>(
        &mut self,
        dst: &mut [MaybeUninit<u8>],
        src: &mut &[u8],
        init_crc: &mut u32,
    ) -> usize {
        let mut it = src.chunks_exact(16);
        let mut input: [_; N] = core::array::from_fn(|_| unsafe {
            _mm_load_si128(it.next().unwrap().as_ptr() as *const __m128i)
        });

        *src = &src[N * 16..];

        if COPY {
            for (s, d) in input[..N].iter().zip(dst.chunks_exact_mut(16)) {
                unsafe { _mm_storeu_si128(d.as_mut_ptr() as *mut __m128i, *s) };
            }
        } else if *init_crc != CRC32_INITIAL_VALUE {
            let xmm_initial = reg([*init_crc, 0, 0, 0]);
            input[0] = unsafe { _mm_xor_si128(input[0], xmm_initial) };
            *init_crc = CRC32_INITIAL_VALUE;
        }

        self.fold_step::<N>();

        for i in 0..N {
            self.fold[i + (4 - N)] = unsafe { _mm_xor_si128(self.fold[i + (4 - N)], input[i]) };
        }

        if COPY {
            N * 16
        } else {
            0
        }
    }

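    // Driver for folding (and optionally copying) a buffer: handles the
    // unaligned head and the sub-16-byte tail with `partial_fold`, and feeds
    // aligned 64/48/32/16-byte chunks through `progress`.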
    #[target_feature(enable = "pclmulqdq", enable = "sse2", enable = "sse4.1")]
    unsafe fn fold_help<const COPY: bool>(
        &mut self,
        mut dst: &mut [MaybeUninit<u8>],
        mut src: &[u8],
        mut init_crc: u32,
    ) {
        let mut xmm_crc_part = reg([0; 4]);

        let mut partial_buf = Align16([0u8; 16]);

        // Technically the CRC functions don't even call this for input < 64, but a bare minimum of 31
        // bytes of input is needed for the aligning load that occurs. If there's an initial CRC, to
        // carry it forward through the folded CRC there must be 16 - src % 16 + 16 bytes available, which
        // by definition can be up to 15 bytes + one full vector load.
        assert!(src.len() >= 31 || init_crc == CRC32_INITIAL_VALUE);

        if COPY {
            assert_eq!(dst.len(), src.len(), "dst and src must be the same length")
        }

        if src.len() < 16 {
            if COPY {
                if src.is_empty() {
                    return;
                }

                partial_buf.0[..src.len()].copy_from_slice(src);
                xmm_crc_part = _mm_load_si128(partial_buf.0.as_mut_ptr() as *mut __m128i);
                dst[..src.len()].copy_from_slice(slice_to_uninit(&partial_buf.0[..src.len()]));
            }
        } else {
            let (before, _, _) = unsafe { src.align_to::<__m128i>() };

            if !before.is_empty() {
                xmm_crc_part = _mm_loadu_si128(src.as_ptr() as *const __m128i);
                if COPY {
                    _mm_storeu_si128(dst.as_mut_ptr() as *mut __m128i, xmm_crc_part);
                    dst = &mut dst[before.len()..];
                } else {
                    let is_initial = init_crc == CRC32_INITIAL_VALUE;

                    if !is_initial {
                        let xmm_initial = reg([init_crc, 0, 0, 0]);
                        xmm_crc_part = _mm_xor_si128(xmm_crc_part, xmm_initial);
                        init_crc = CRC32_INITIAL_VALUE;
                    }

                    if before.len() < 4 && !is_initial {
                        let xmm_t0 = xmm_crc_part;
                        xmm_crc_part = _mm_loadu_si128((src.as_ptr() as *const __m128i).add(1));

                        self.fold_step::<1>();

                        self.fold[3] = _mm_xor_si128(self.fold[3], xmm_t0);
                        src = &src[16..];
                    }
                }

                self.partial_fold(xmm_crc_part, before.len());

                src = &src[before.len()..];
            }

            // if is_x86_feature_detected!("vpclmulqdq") {
            //     if src.len() >= 256 {
            //         if COPY {
            //             // size_t n = fold_16_vpclmulqdq_copy(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, dst, src, len);
            //             // dst += n;
            //         } else {
            //             // size_t n = fold_16_vpclmulqdq(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, src, len, xmm_initial, first);
            //             // first = false;
            //         }
            //         // len -= n;
            //         // src += n;
            //     }
            // }

            while src.len() >= 64 {
                let n = self.progress::<4, COPY>(dst, &mut src, &mut init_crc);
                dst = &mut dst[n..];
            }

            if src.len() >= 48 {
                let n = self.progress::<3, COPY>(dst, &mut src, &mut init_crc);
                dst = &mut dst[n..];
            } else if src.len() >= 32 {
                let n = self.progress::<2, COPY>(dst, &mut src, &mut init_crc);
                dst = &mut dst[n..];
            } else if src.len() >= 16 {
                let n = self.progress::<1, COPY>(dst, &mut src, &mut init_crc);
                dst = &mut dst[n..];
            }
        }

        if !src.is_empty() {
            core::ptr::copy_nonoverlapping(
                src.as_ptr(),
                &mut xmm_crc_part as *mut _ as *mut u8,
                src.len(),
            );
            if COPY {
                _mm_storeu_si128(partial_buf.0.as_mut_ptr() as *mut __m128i, xmm_crc_part);
                core::ptr::copy_nonoverlapping(
                    partial_buf.0.as_ptr() as *const MaybeUninit<u8>,
                    dst.as_mut_ptr(),
                    src.len(),
                );
            }

            self.partial_fold(xmm_crc_part, src.len());
        }
    }
}
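
// A minimal self-check sketch (not part of the original file): it compares
// the folded CRC against a bit-by-bit CRC-32 oracle. This assumes `fold` +
// `finish` implement the standard zlib CRC-32 (reflected, polynomial
// 0xEDB88320) and that tests run with `std` available for runtime feature
// detection; the helper and test names are illustrative.
#[cfg(all(test, target_arch = "x86_64"))]
mod tests {
    use super::*;

    // Straightforward bitwise CRC-32 used as the reference.
    fn crc32_bitwise(data: &[u8]) -> u32 {
        let mut crc = 0xFFFF_FFFFu32;
        for &byte in data {
            crc ^= byte as u32;
            for _ in 0..8 {
                crc = if crc & 1 != 0 {
                    (crc >> 1) ^ 0xEDB8_8320
                } else {
                    crc >> 1
                };
            }
        }
        !crc
    }

    #[test]
    fn fold_matches_bitwise_reference() {
        // Calling the pclmulqdq code without these features would be UB.
        if !std::is_x86_feature_detected!("pclmulqdq")
            || !std::is_x86_feature_detected!("sse4.1")
        {
            return;
        }

        let data: [u8; 64] = core::array::from_fn(|i| i as u8);

        let mut acc = Accumulator::new();
        acc.fold(&data, CRC32_INITIAL_VALUE);
        let crc = unsafe { acc.finish() };

        assert_eq!(crc, crc32_bitwise(&data));
    }
}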
