/*
 * jdsample-neon.c - upsampling (Arm Neon)
 *
 * Copyright (C) 2020, Arm Limited. All Rights Reserved.
 * Copyright (C) 2020, D. R. Commander. All Rights Reserved.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty. In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */
/* The diagram below shows a row of samples produced by h2v1 downsampling.
 *
 *                s0        s1        s2
 *            +---------+---------+---------+
 *            |         |         |         |
 *            | p0   p1 | p2   p3 | p4   p5 |
 *            |         |         |         |
 *            +---------+---------+---------+
 *
 * Samples s0-s2 were created by averaging the original pixel component values
 * centered at positions p0-p5 above. To approximate those original pixel
 * component values, we proportionally blend the adjacent samples in each row.
 *
 * An upsampled pixel component value is computed by blending the sample
 * containing the pixel center with the nearest neighboring sample, in the
 * ratio 3:1. For example:
 *     p1(upsampled) = 3/4 * s0 + 1/4 * s1
 *     p2(upsampled) = 3/4 * s1 + 1/4 * s0
 * When computing the first and last pixel component values in the row, there
 * is no adjacent sample to blend, so:
 *     p0(upsampled) = s0
 *     p5(upsampled) = s2
 */
/* NOTE(review): only the interior of this routine appears in this excerpt.
 * The function header, the local declarations, the three_u8/one_u16
 * constants, and the initial loads of s0/s1 together with the first set of
 * blended sums (which the column loop below consumes on its first pass) are
 * not visible here -- confirm them against the complete file.
 *
 * Each pass of the outer loop turns one row of downsampled_width input
 * samples (inptr) into one row of 2 * downsampled_width output values
 * (outptr).
 */
for (inrow = 0; inrow < max_v_samp_factor; inrow++) {
  inptr = input_data[inrow];
  outptr = output_data[inrow];
  /* First pixel component value in this row of the original image */
  *outptr = (JSAMPLE)GETJSAMPLE(*inptr);
  /* The offset is initially 1, because the first pixel component has already
   * been stored.  However, in subsequent iterations of the SIMD loop, this
   * offset is (2 * colctr - 1) to stay within the bounds of the sample
   * buffers without having to resort to a slow scalar tail case for the last
   * (downsampled_width % 16) samples.  See "Creation of 2-D sample arrays"
   * in jmemmgr.c for more details.
   */
  unsigned outptr_offset = 1;
  uint8x16x2_t output_pixels;
  /* We use software pipelining to maximise performance.  The code indented
   * an extra two spaces begins the next iteration of the loop.
   *
   * NOTE(review): the original indentation was lost in this excerpt; the
   * extra indentation below was reconstructed from data flow (the sums are
   * read by the narrowing step before they are recomputed).  No reload of
   * s0/s1 is visible inside the loop, so as shown every pass would blend the
   * same samples -- presumably the loads were dropped from this excerpt;
   * confirm.
   */
  for (colctr = 16; colctr < downsampled_width; colctr += 16) {
    /* Right-shift by 2 (divide by 4), narrow to 8-bit, and combine.
     * val[0] (s1 + 3 * s0) is narrowed with the rounding shift vrshrn;
     * val[1] (s0 + 3 * s1) is narrowed with the truncating shift vshrn and
     * relies on the +1 bias that was added to s0_add_3s1_* instead.
     */
    output_pixels.val[0] = vcombine_u8(vrshrn_n_u16(s1_add_3s0_l, 2),
                                       vrshrn_n_u16(s1_add_3s0_h, 2));
    output_pixels.val[1] = vcombine_u8(vshrn_n_u16(s0_add_3s1_l, 2),
                                       vshrn_n_u16(s0_add_3s1_h, 2));
      /* Multiplication makes vectors twice as wide.  '_l' and '_h' suffixes
       * denote low half and high half respectively.
       */
      s1_add_3s0_l =
        vmlal_u8(vmovl_u8(vget_low_u8(s1)), vget_low_u8(s0), three_u8);
      s1_add_3s0_h =
        vmlal_u8(vmovl_u8(vget_high_u8(s1)), vget_high_u8(s0), three_u8);
      s0_add_3s1_l =
        vmlal_u8(vmovl_u8(vget_low_u8(s0)), vget_low_u8(s1), three_u8);
      s0_add_3s1_h =
        vmlal_u8(vmovl_u8(vget_high_u8(s0)), vget_high_u8(s1), three_u8);
      /* Add ordered dithering bias to odd pixel values.
       * NOTE(review): these sums end up in val[1], which the interleaving
       * store writes to even 0-based output indices (the store offset is
       * always odd), so "odd" here seemingly counts pixels from 1 -- confirm.
       */
      s0_add_3s1_l = vaddq_u16(s0_add_3s1_l, one_u16);
      s0_add_3s1_h = vaddq_u16(s0_add_3s1_h, one_u16);
    /* Store pixel component values to memory.  vst2q_u8 interleaves val[0]
     * and val[1], so the two blends alternate in the output row.
     */
    vst2q_u8(outptr + outptr_offset, output_pixels);
    /* Output position for the next store (see the offset comment above). */
    outptr_offset = 2 * colctr - 1;
  }
  /* Complete the last iteration of the loop. */
  /* Right-shift by 2 (divide by 4), narrow to 8-bit, and combine. */
  output_pixels.val[0] = vcombine_u8(vrshrn_n_u16(s1_add_3s0_l, 2),
                                     vrshrn_n_u16(s1_add_3s0_h, 2));
  output_pixels.val[1] = vcombine_u8(vshrn_n_u16(s0_add_3s1_l, 2),
                                     vshrn_n_u16(s0_add_3s1_h, 2));
  /* Store pixel component values to memory. */
  vst2q_u8(outptr + outptr_offset, output_pixels);
  /* Last pixel component value in this row of the original image: copied
   * unblended from the last input sample, written after the vector store.
   */
  outptr[2 * downsampled_width - 1] =
    GETJSAMPLE(inptr[downsampled_width - 1]);
}
/* NOTE(review): presumably closes the enclosing function, whose header is not
 * visible in this excerpt.
 */
}
/* The diagram below shows an array of samples produced by h2v2 downsampling.
 *
 *                s0        s1        s2
 *            +---------+---------+---------+
 *            | p0   p1 | p2   p3 | p4   p5 |
 *       sA   |         |         |         |
 *            | p6   p7 | p8   p9 | p10  p11|
 *            +---------+---------+---------+
 *            | p12  p13| p14  p15| p16  p17|
 *       sB   |         |         |         |
 *            | p18  p19| p20  p21| p22  p23|
 *            +---------+---------+---------+
 *            | p24  p25| p26  p27| p28  p29|
 *       sC   |         |         |         |
 *            | p30  p31| p32  p33| p34  p35|
 *            +---------+---------+---------+
 *
 * Samples s0A-s2C were created by averaging the original pixel component
 * values centered at positions p0-p35 above. To approximate one of those
 * original pixel component values, we proportionally blend the sample
 * containing the pixel center with the nearest neighboring samples in each
 * row, column, and diagonal.
 *
 * An upsampled pixel component value is computed by first blending the sample
 * containing the pixel center with the nearest neighboring samples in the
 * same column, in the ratio 3:1, and then blending each column sum with the
 * nearest neighboring column sum, in the ratio 3:1. For example:
 *     p14(upsampled) = 3/4 * (3/4 * s1B + 1/4 * s1A) +
 *                      1/4 * (3/4 * s0B + 1/4 * s0A)
 *                    = 9/16 * s1B + 3/16 * s1A + 3/16 * s0B + 1/16 * s0A
 * When computing the first and last pixel component values in the row, there
 * is no horizontally adjacent sample to blend, so:
 *     p12(upsampled) = 3/4 * s0B + 1/4 * s0A
 *     p23(upsampled) = 3/4 * s2B + 1/4 * s2C
 * When computing the first and last pixel component values in the column,
 * there is no vertically adjacent sample to blend, so:
 *     p2(upsampled) = 3/4 * s1A + 1/4 * s0A
 *     p33(upsampled) = 3/4 * s1C + 1/4 * s2C
 * When computing the corner pixel component values, there is no adjacent
 * sample to blend, so:
 *     p0(upsampled) = s0A
 *     p35(upsampled) = s2C
 */
/* NOTE(review): only part of this routine's interior appears in this excerpt.
 * The function header, the local declarations, the SIMD code that computes
 * output_pixels0/output_pixels1, and the body of the column loop are not
 * visible here -- confirm them against the complete file.
 *
 * Each pass of the while loop consumes three neighboring input rows
 * (inrow - 1, inrow, inrow + 1) and writes two output rows.  Because inrow
 * starts at 0, the code indexes input_data[-1]; NOTE(review): this assumes the
 * caller supplies valid context rows above and below -- confirm.
 */
inrow = outrow = 0;
while (outrow < max_v_samp_factor) {
  inptr0 = input_data[inrow - 1];
  inptr1 = input_data[inrow];
  inptr2 = input_data[inrow + 1];
  /* Suffixes 0 and 1 denote the upper and lower rows of output pixels,
   * respectively.
   */
  outptr0 = output_data[outrow++];
  outptr1 = output_data[outrow++];
  /* First pixel component value in this row of the original image.
   * The column sum (3 * center row + neighboring row) carries a factor of 4;
   * multiplying by 4 again puts it on the 16x scale implied by the final
   * right shift by 4, and the +8 rounds to nearest.
   */
  int s0colsum0 = GETJSAMPLE(*inptr1) * 3 + GETJSAMPLE(*inptr0);
  *outptr0 = (JSAMPLE)((s0colsum0 * 4 + 8) >> 4);
  int s0colsum1 = GETJSAMPLE(*inptr1) * 3 + GETJSAMPLE(*inptr2);
  *outptr1 = (JSAMPLE)((s0colsum1 * 4 + 8) >> 4);
  /* Step 1: Blend samples vertically in columns s0 and s1.
   * Leave the divide by 4 until the end, when it can be done for both
   * dimensions at once, right-shifting by 4.
   *
   * NOTE(review): the code for this step is not visible in this excerpt.
   */
  /* Store pixel component values to memory.
   * The minimum size of the output buffer for each row is 64 bytes => no
   * need to worry about buffer overflow here.  See "Creation of 2-D sample
   * arrays" in jmemmgr.c for more details.
   */
  vst2q_u8(outptr0 + 1, output_pixels0);
  vst2q_u8(outptr1 + 1, output_pixels1);
  /* The first pixel of the image shifted our loads and stores by one byte.
   * We have to re-align on a 32-byte boundary at some point before the end
   * of the row (we do it now on the 32/33 pixel boundary) to stay within the
   * bounds of the sample buffers without having to resort to a slow scalar
   * tail case for the last (downsampled_width % 16) samples.  See "Creation
   * of 2-D sample arrays" in jmemmgr.c for more details.
   */
  for (colctr = 16; colctr < downsampled_width; colctr += 16) {
    /* Step 1: Blend samples vertically in columns s0 and s1. */
    /* NOTE(review): as excerpted, no SIMD work is visible in this loop, and
     * the "last pixel" stores and inrow++ below sit inside it.  That looks
     * like lost text rather than intent: inrow++ would then run once per
     * 16-column group (or not at all when downsampled_width <= 16) instead of
     * once per pair of output rows.  Confirm against the complete file.
     */
    /* Last pixel component value in this row of the original image.
     * Same scaling as the first pixel, but with a bias of 7 instead of 8
     * (presumably the ordered-dither counterpart -- confirm).
     */
    int s1colsum0 = GETJSAMPLE(inptr1[downsampled_width - 1]) * 3 +
                    GETJSAMPLE(inptr0[downsampled_width - 1]);
    outptr0[2 * downsampled_width - 1] = (JSAMPLE)((s1colsum0 * 4 + 7) >> 4);
    int s1colsum1 = GETJSAMPLE(inptr1[downsampled_width - 1]) * 3 +
                    GETJSAMPLE(inptr2[downsampled_width - 1]);
    outptr1[2 * downsampled_width - 1] = (JSAMPLE)((s1colsum1 * 4 + 7) >> 4);
    inrow++;
  }
}
/* The diagram below shows a column of samples produced by h1v2 downsampling
 * (or by losslessly rotating or transposing an h2v1-downsampled image.)
 *
 *            +---------+
 *            |   p0    |
 *     sA     |         |
 *            |   p1    |
 *            +---------+
 *            |   p2    |
 *     sB     |         |
 *            |   p3    |
 *            +---------+
 *            |   p4    |
 *     sC     |         |
 *            |   p5    |
 *            +---------+
 *
 * Samples sA-sC were created by averaging the original pixel component values
 * centered at positions p0-p5 above. To approximate those original pixel
 * component values, we proportionally blend the adjacent samples in each
 * column.
 *
 * An upsampled pixel component value is computed by blending the sample
 * containing the pixel center with the nearest neighboring sample, in the
 * ratio 3:1. For example:
 *     p1(upsampled) = 3/4 * sA + 1/4 * sB
 *     p2(upsampled) = 3/4 * sB + 1/4 * sA
 * When computing the first and last pixel component values in the column,
 * there is no adjacent sample to blend, so:
 *     p0(upsampled) = sA
 *     p5(upsampled) = sC
 */
/* NOTE(review): only the interior of this routine appears in this excerpt;
 * the function header, the local declarations, and the three_u8/one_u16
 * constants are not visible here.
 *
 * Each pass of the while loop blends the current input row with the row above
 * it (upper output row) and with the row below it (lower output row), so one
 * input row yields two output rows of the same width.  Because inrow starts
 * at 0, the code indexes input_data[-1]; NOTE(review): this assumes the
 * caller supplies valid context rows above and below -- confirm.
 */
inrow = outrow = 0;
while (outrow < max_v_samp_factor) {
  inptr0 = input_data[inrow - 1];
  inptr1 = input_data[inrow];
  inptr2 = input_data[inrow + 1];
  /* Suffixes 0 and 1 denote the upper and lower rows of output pixels,
   * respectively.
   */
  outptr0 = output_data[outrow++];
  outptr1 = output_data[outrow++];
  inrow++;
  /* The size of the input and output buffers is always a multiple of 32
   * bytes => no need to worry about buffer overflow when reading/writing
   * memory.  See "Creation of 2-D sample arrays" in jmemmgr.c for more
   * details.
   */
  for (colctr = 0; colctr < downsampled_width; colctr += 16) {
    /* Load samples: 16 from each of the three rows (above, current, below). */
    uint8x16_t sA = vld1q_u8(inptr0 + colctr);
    uint8x16_t sB = vld1q_u8(inptr1 + colctr);
    uint8x16_t sC = vld1q_u8(inptr2 + colctr);
    /* Blend samples vertically.  Each sum is neighbor + 3 * sB, widened to
     * 16 bits; '_l' and '_h' are the low and high halves of the vector.
     */
    uint16x8_t colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(sA)),
                                    vget_low_u8(sB), three_u8);
    uint16x8_t colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(sA)),
                                    vget_high_u8(sB), three_u8);
    uint16x8_t colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(sC)),
                                    vget_low_u8(sB), three_u8);
    uint16x8_t colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(sC)),
                                    vget_high_u8(sB), three_u8);
    /* Add ordered dithering bias to pixel values in even output rows. */
    colsum0_l = vaddq_u16(colsum0_l, one_u16);
    colsum0_h = vaddq_u16(colsum0_h, one_u16);
    /* Right-shift by 2 (divide by 4), narrow to 8-bit, and combine.
     * The upper row uses the truncating shift vshrn on the biased sums; the
     * lower row uses the rounding shift vrshrn on the unbiased sums.
     */
    uint8x16_t output_pixels0 = vcombine_u8(vshrn_n_u16(colsum0_l, 2),
                                            vshrn_n_u16(colsum0_h, 2));
    uint8x16_t output_pixels1 = vcombine_u8(vrshrn_n_u16(colsum1_l, 2),
                                            vrshrn_n_u16(colsum1_h, 2));
    /* Store pixel component values to memory. */
    vst1q_u8(outptr0 + colctr, output_pixels0);
    vst1q_u8(outptr1 + colctr, output_pixels1);
  }
}
/* NOTE(review): presumably closes the enclosing function, whose header is not
 * visible in this excerpt.
 */
}
/* The diagram below shows a row of samples produced by h2v1 downsampling.
 *
 *                s0        s1
 *            +---------+---------+
 *            |         |         |
 *            | p0   p1 | p2   p3 |
 *            |         |         |
 *            +---------+---------+
 *
 * Samples s0 and s1 were created by averaging the original pixel component
 * values centered at positions p0-p3 above. To approximate those original
 * pixel component values, we duplicate the samples horizontally:
 *     p0(upsampled) = p1(upsampled) = s0
 *     p2(upsampled) = p3(upsampled) = s1
 */
/* NOTE(review): only the interior of this routine appears in this excerpt;
 * the function header and the local declarations are not visible here.
 *
 * Each pass of the outer loop writes one output row in which every input
 * sample appears twice in succession.
 */
for (inrow = 0; inrow < max_v_samp_factor; inrow++) {
  inptr = input_data[inrow];
  outptr = output_data[inrow];
  /* colctr counts input samples; each one yields two output values, hence
   * the 2 * colctr comparison against output_width.
   */
  for (colctr = 0; 2 * colctr < output_width; colctr += 16) {
    uint8x16_t samples = vld1q_u8(inptr + colctr);
    /* Duplicate the samples.  The store operation below interleaves them so
     * that adjacent pixel component values take on the same sample value,
     * per above.
     */
    uint8x16x2_t output_pixels = { { samples, samples } };
    /* Store pixel component values to memory.
     * Due to the way sample buffers are allocated, we don't need to worry
     * about tail cases when output_width is not a multiple of 32.  See
     * "Creation of 2-D sample arrays" in jmemmgr.c for details.
     */
    vst2q_u8(outptr + 2 * colctr, output_pixels);
  }
}
/* NOTE(review): presumably closes the enclosing function, whose header is not
 * visible in this excerpt.
 */
}
/* The diagram below shows an array of samples produced by h2v2 downsampling.
 *
 *                s0        s1
 *            +---------+---------+
 *            | p0   p1 | p2   p3 |
 *       sA   |         |         |
 *            | p4   p5 | p6   p7 |
 *            +---------+---------+
 *            | p8   p9 | p10  p11|
 *       sB   |         |         |
 *            | p12  p13| p14  p15|
 *            +---------+---------+
 *
 * Samples s0A-s1B were created by averaging the original pixel component
 * values centered at positions p0-p15 above. To approximate those original
 * pixel component values, we duplicate the samples both horizontally and
 * vertically:
 *     p0(upsampled) = p1(upsampled) = p4(upsampled) = p5(upsampled) = s0A
 *     p2(upsampled) = p3(upsampled) = p6(upsampled) = p7(upsampled) = s1A
 *     p8(upsampled) = p9(upsampled) = p12(upsampled) = p13(upsampled) = s0B
 *     p10(upsampled) = p11(upsampled) = p14(upsampled) = p15(upsampled) = s1B
 */
/* NOTE(review): only the innermost loop of this routine appears in this
 * excerpt.  The function header, the local declarations, and the enclosing
 * row loop that sets inptr, outptr0 and outptr1 are not visible here.
 *
 * colctr counts input samples; each one yields two output values per row,
 * hence the 2 * colctr comparison against output_width.
 */
for (colctr = 0; 2 * colctr < output_width; colctr += 16) {
  uint8x16_t samples = vld1q_u8(inptr + colctr);
  /* Duplicate the samples.  The store operation below interleaves them so
   * that adjacent pixel component values take on the same sample value,
   * per above.
   */
  uint8x16x2_t output_pixels = { { samples, samples } };
  /* Store pixel component values for both output rows to memory.
   * Due to the way sample buffers are allocated, we don't need to worry
   * about tail cases when output_width is not a multiple of 32.  See
   * "Creation of 2-D sample arrays" in jmemmgr.c for details.
   */
  vst2q_u8(outptr0 + 2 * colctr, output_pixels);
  vst2q_u8(outptr1 + 2 * colctr, output_pixels);
}
/* NOTE(review): the two braces below presumably close the enclosing row loop
 * and the enclosing function, neither of whose openings is visible in this
 * excerpt.
 */
}
}
Messung V0.5
¤ Dauer der Verarbeitung: 0.12 Sekunden
(vorverarbeitet)
¤
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit noch Richtigkeit
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.