/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved. * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
// Save the higher 64 bit of each 128 bit lane. const __m256i sse_hi = _mm256_srli_si256(sse_256, 8); // Add the higher 64 bit to the low 64 bit.
sse_256 = _mm256_add_epi64(sse_256, sse_hi); // Accumulate the sse_256 register to get final sse const __m128i sse_128 = _mm_add_epi64(_mm256_castsi256_si128(sse_256),
_mm256_extractf128_si256(sse_256, 1));
// Store the results.
_mm_storel_epi64((__m128i *)&sse, sse_128); return sse;
}
// init sse and ssz registerd to zero
sse_reg = _mm256_setzero_si256();
ssz_reg = _mm256_setzero_si256();
for (i = 0; i < block_size; i += 16) { // load 32 bytes from coeff and dqcoeff
read_coeff(coeff, i, &coeff_reg);
read_coeff(dqcoeff, i, &dqcoeff_reg); // dqcoeff - coeff
dqcoeff_reg = _mm256_sub_epi16(dqcoeff_reg, coeff_reg); // madd (dqcoeff - coeff)
dqcoeff_reg = _mm256_madd_epi16(dqcoeff_reg, dqcoeff_reg); // madd coeff
coeff_reg = _mm256_madd_epi16(coeff_reg, coeff_reg); // expand each double word of madd (dqcoeff - coeff) to quad word
exp_dqcoeff_lo = _mm256_unpacklo_epi32(dqcoeff_reg, zero_reg);
exp_dqcoeff_hi = _mm256_unpackhi_epi32(dqcoeff_reg, zero_reg); // expand each double word of madd (coeff) to quad word
exp_coeff_lo = _mm256_unpacklo_epi32(coeff_reg, zero_reg);
exp_coeff_hi = _mm256_unpackhi_epi32(coeff_reg, zero_reg); // add each quad word of madd (dqcoeff - coeff) and madd (coeff)
sse_reg = _mm256_add_epi64(sse_reg, exp_dqcoeff_lo);
ssz_reg = _mm256_add_epi64(ssz_reg, exp_coeff_lo);
sse_reg = _mm256_add_epi64(sse_reg, exp_dqcoeff_hi);
ssz_reg = _mm256_add_epi64(ssz_reg, exp_coeff_hi);
} // save the higher 64 bit of each 128 bit lane
sse_reg_64hi = _mm256_srli_si256(sse_reg, 8);
ssz_reg_64hi = _mm256_srli_si256(ssz_reg, 8); // add the higher 64 bit to the low 64 bit
sse_reg = _mm256_add_epi64(sse_reg, sse_reg_64hi);
ssz_reg = _mm256_add_epi64(ssz_reg, ssz_reg_64hi);
// add each 64 bit from each of the 128 bit lane of the 256 bit
sse_reg128 = _mm_add_epi64(_mm256_castsi256_si128(sse_reg),
_mm256_extractf128_si256(sse_reg, 1));
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.