;
; jquant.asm - sample data conversion and quantization (MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, 2024, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler) or Yasm.
; Epilogue: pop the registers saved on entry (the matching pushes are outside
; this excerpt) and return to the caller.  The commented-out pops document
; registers that were deliberately not saved.
    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; need not be preserved
    pop         ebx
    pop         ebp
    ret
; --------------------------------------------------------------------------
;
; Quantize/descale the coefficients, and store into coef_block
;
; This implementation is based on an algorithm described in
;   "Optimizing subroutines in assembly language:
;   An optimization guide for x86 platforms" (https://agner.org/optimize).
;
; GLOBAL(void)
; jsimd_quantize_mmx(JCOEFPTR coef_block, DCTELEM *divisors,
;                    DCTELEM *workspace);
;
; Address helpers for the four sub-tables reachable from one base pointer.
; Each macro forwards to MMBLOCK (defined outside this excerpt) with the row
; index offset by a multiple of DCTSIZE, so the sub-tables are selected as
; consecutive DCTSIZE-row groups:
;   group 0 = reciprocal, 1 = correction, 2 = scale, 3 = shift.
;   row  - row index within the selected sub-table
;   col  - second MMBLOCK index, passed through unchanged
;   base - base address expression (e.g. a register), passed through unchanged
%define RECIPROCAL(row, col, base) MMBLOCK(DCTSIZE * 0 + (row), (col), (base), SIZEOF_DCTELEM)
%define CORRECTION(row, col, base) MMBLOCK(DCTSIZE * 1 + (row), (col), (base), SIZEOF_DCTELEM)
%define SCALE(row, col, base)      MMBLOCK(DCTSIZE * 2 + (row), (col), (base), SIZEOF_DCTELEM)
%define SHIFT(row, col, base)      MMBLOCK(DCTSIZE * 3 + (row), (col), (base), SIZEOF_DCTELEM)
;
; MMX is an annoyingly crappy instruction set. It has two
; misfeatures that are causing problems here:
;
; - All multiplications are signed.
;
; - The second operand for the shifts is not treated as packed.
;
;
; We work around the first problem by implementing this algorithm:
;
;   unsigned long unsigned_multiply(unsigned short x, unsigned short y)
;   {
;     enum { SHORT_BIT = 16 };
;     signed short sx = (signed short)x;
;     signed short sy = (signed short)y;
;     signed long sz;
;
;     sz = (long)sx * (long)sy;    /* signed multiply */
;
;     if (sx < 0) sz += (long)sy << SHORT_BIT;
;     if (sy < 0) sz += (long)sx << SHORT_BIT;
;
;     return (unsigned long)sz;
;   }
;
; (note that a negative sx adds _sy_ and vice versa)
;
; For the second problem, we replace the shift by a multiplication.
; Unfortunately that means we have to deal with the signed issue again.
;
; ---- Step 1: high word of (value * reciprocal), emulating an unsigned
; multiply with the signed-only pmulhw.
; NOTE(review): mm0/mm1 are prepared by code outside this excerpt; per the
; comment below they are non-negative here -- confirm against the full file.
; edx is the base handed to the RECIPROCAL/SCALE macros (presumably the
; divisors table from the function signature).
movq mm4, mm0 ; store current value for later (needed for the sign fix-up)
movq mm5, mm1
pmulhw mm0, MMWORD [RECIPROCAL(0,0,edx)] ; reciprocal: signed high-word product
pmulhw mm1, MMWORD [RECIPROCAL(0,1,edx)]
paddw mm0, mm4 ; reciprocal is always negative (MSB=1),
paddw mm1, mm5 ; so we always need to add the initial value
; (input value is never negative as we
; inverted it at the start of this routine),
; i.e. only the "sy < 0" fix-up of unsigned_multiply() applies here.

; ---- Step 2: high word of (result * scale), which replaces the shift.
; here it gets a bit tricky as both scale
; and mm0/mm1 can be negative, so both fix-ups of unsigned_multiply()
; are applied, each gated by a per-word sign mask.
movq mm6, MMWORD [SCALE(0,0,edx)] ; scale
movq mm7, MMWORD [SCALE(0,1,edx)]
movq mm4, mm0 ; keep the multiplicand; mm0/mm1 are overwritten by the product
movq mm5, mm1
pmulhw mm0, mm6 ; signed high-word product with scale
pmulhw mm1, mm7
psraw mm6, (WORD_BIT-1) ; determine if scale is negative (sign bit replicated into a mask)
psraw mm7, (WORD_BIT-1)
pand mm6, mm4 ; and add input if it is (mask selects the input, else zero)
pand mm7, mm5
paddw mm0, mm6
paddw mm1, mm7
psraw mm4, (WORD_BIT-1) ; then check if negative input (mask, as above)
psraw mm5, (WORD_BIT-1)
; scale is re-read from memory because mm6/mm7 were turned into masks above
pand mm4, MMWORD [SCALE(0,0,edx)] ; and add scale if it is
pand mm5, MMWORD [SCALE(0,1,edx)]
paddw mm0, mm4
paddw mm1, mm5
; Loop tail: step esi, edx and edi forward by 8 elements each, then count
; down the two nested loop counters (al = inner, ah = outer).  The loop
; labels and counter initialisation are outside this excerpt.
    add         esi, byte 8*SIZEOF_DCTELEM
    add         edx, byte 8*SIZEOF_DCTELEM
    add         edi, byte 8*SIZEOF_JCOEF
    dec         al
    jnz         near .quantloop2
    dec         ah
    jnz         near .quantloop1        ; to avoid branch misprediction

    emms                                ; empty MMX state

; Epilogue: pop the registers saved on entry (the matching pushes are outside
; this excerpt) and return.  The commented-out pops document registers that
; were deliberately not saved.
    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; unused
;   pop         ebx                     ; unused
    pop         ebp
    ret
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32
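; NOTE(review): the text below appears to be website boilerplate rather than
; part of the assembly source; it is kept as a comment so it cannot be parsed
; as code.
;
; Die Informationen auf dieser Webseite wurden
; nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
; noch Qualität der bereitgestellten Informationen zugesichert.
; Bemerkung:
; Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.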