void add_pixels_clamped_mvi(const int16_t *block, uint8_t *pixels,
                            ptrdiff_t line_size)
{
    int h = 8;
    /* Keep this function a leaf function by generating the constants
       manually (mainly for the hack value ;-). */
    uint64_t clampmask = zap(-1, 0xaa); /* 0x00ff00ff00ff00ff */
    uint64_t signmask  = zap(-1, 0x33);
    signmask ^= signmask >> 1;          /* 0x8000800080008000 */

    do {
        uint64_t shorts0, pix0, signs0;
        uint64_t shorts1, pix1, signs1;

        shorts0 = ldq(block);
        shorts1 = ldq(block + 4);

        pix0    = unpkbw(ldl(pixels));
        /* Signed subword add (MMX paddw). */
        signs0   = shorts0 & signmask;
        shorts0 &= ~signmask;
        shorts0 += pix0;
        shorts0 ^= signs0;
        /* Clamp. */
        shorts0 = maxsw4(shorts0, 0);
        shorts0 = minsw4(shorts0, clampmask);

        /* Next 4. */
        pix1     = unpkbw(ldl(pixels + 4));
        signs1   = shorts1 & signmask;
        shorts1 &= ~signmask;
        shorts1 += pix1;
        shorts1 ^= signs1;
        shorts1 = maxsw4(shorts1, 0);
        shorts1 = minsw4(shorts1, clampmask);

        stl(pkwb(shorts0), pixels);
        stl(pkwb(shorts1), pixels + 4);

        pixels += line_size;
        block  += 8;
    } while (--h);
}
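
/* A plain-C reference sketch of what the MVI routine above computes: add
   each 16-bit coefficient to the corresponding pixel and clamp the result
   to [0, 255].  The helper name add_pixels_clamped_ref is illustrative only
   and not part of the original source. */
static void add_pixels_clamped_ref(const int16_t *block, uint8_t *pixels,
                                   ptrdiff_t line_size)
{
    int i, j;

    for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++) {
            int sum = block[j] + pixels[j];

            /* Same clamp to the unsigned 8-bit range that maxsw4/minsw4
               perform per subword. */
            if (sum < 0)
                sum = 0;
            else if (sum > 255)
                sum = 255;
            pixels[j] = sum;
        }
        pixels += line_size;
        block  += 8;
    }
}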
static void dct_unquantize_h263_inter_axp(MpegEncContext *s, DCTELEM *block,
                                          int n, int qscale)
{
    int i, n_coeffs;
    uint64_t qmul, qadd;
    uint64_t correction;

    qadd = WORD_VEC((qscale - 1) | 1);
    qmul = qscale << 1;
    /* This mask kills spill from negative subwords to the next subword. */
    correction = WORD_VEC((qmul - 1) + 1); /* multiplication / addition */

    n_coeffs = s->intra_scantable.raster_end[s->block_last_index[n]];

    for (i = 0; i <= n_coeffs; block += 4, i += 4) {
        uint64_t levels, negmask, zeros, add;

        levels = ldq(block);
        if (levels == 0)
            continue;

#ifdef __alpha_max__
        /* I don't think the speed difference justifies runtime detection. */
        negmask = maxsw4(levels, -1); /* negative -> ffff (-1) */
        negmask = minsw4(negmask, 0); /* positive -> 0000 (0)  */
#else
        negmask = cmpbge(WORD_VEC(0x7fff), levels);
        negmask &= (negmask >> 1) | (1 << 7);
        negmask = zap(-1, negmask);
#endif

        zeros = cmpbge(0, levels);
        zeros &= zeros >> 1;
        /* zeros |= zeros << 1 is not needed since qadd <= 255, so zapping
           the lower byte suffices. */

        levels *= qmul;
        levels -= correction & (negmask << 16);

        /* Negate qadd for negative levels. */
        add = qadd ^ negmask;
        add += WORD_VEC(0x0001) & negmask;
        /* Set qadd to 0 for levels == 0. */
        add = zap(add, zeros);

        levels += add;

        stq(levels, block);
    }
}
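
/* A plain-C sketch of the per-coefficient H.263 inverse quantization that
   the packed loop above implements: level * qmul with a sign-matched qadd,
   and zero coefficients left untouched.  The helper name
   dct_unquantize_h263_ref is illustrative only and not part of the
   original source. */
static void dct_unquantize_h263_ref(DCTELEM *block, int n_coeffs, int qscale)
{
    int i, level;
    int qmul = qscale << 1;
    int qadd = (qscale - 1) | 1;

    for (i = 0; i <= n_coeffs; i++) {
        level = block[i];
        if (level) {
            if (level < 0)
                level = level * qmul - qadd;
            else
                level = level * qmul + qadd;
            block[i] = level;
        }
    }
}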
/* These functions were the base for the optimized assembler routines,
   and remain here for documentation purposes. */
static void put_pixels_clamped_mvi(const int16_t *block, uint8_t *pixels,
                                   ptrdiff_t line_size)
{
    int i = 8;
    uint64_t clampmask = zap(-1, 0xaa); /* 0x00ff00ff00ff00ff */

    do {
        uint64_t shorts0, shorts1;

        shorts0 = ldq(block);
        shorts0 = maxsw4(shorts0, 0);
        shorts0 = minsw4(shorts0, clampmask);
        stl(pkwb(shorts0), pixels);

        shorts1 = ldq(block + 4);
        shorts1 = maxsw4(shorts1, 0);
        shorts1 = minsw4(shorts1, clampmask);
        stl(pkwb(shorts1), pixels + 4);

        pixels += line_size;
        block  += 8;
    } while (--i);
}
static void dct_unquantize_h263_axp(DCTELEM *block, int n_coeffs,
                                    uint64_t qscale, uint64_t qadd)
{
    uint64_t qmul = qscale << 1;
    uint64_t correction = WORD_VEC(qmul * 255 >> 8);
    int i;

    qadd = WORD_VEC(qadd);

    for (i = 0; i <= n_coeffs; block += 4, i += 4) {
        uint64_t levels, negmask, zeros, add, sub;

        levels = ldq(block);
        if (levels == 0)
            continue;

#ifdef __alpha_max__
        /* I don't think the speed difference justifies runtime detection. */
        negmask = maxsw4(levels, -1); /* negative -> ffff (-1) */
        negmask = minsw4(negmask, 0); /* positive -> 0000 (0)  */
#else
        negmask = cmpbge(WORD_VEC(0x7fff), levels);
        negmask &= (negmask >> 1) | (1 << 7);
        negmask = zap(-1, negmask);
#endif

        zeros = cmpbge(0, levels);
        zeros &= zeros >> 1;
        /* zeros |= zeros << 1 is not needed since qadd <= 255, so zapping
           the lower byte suffices. */

        levels *= qmul;
        levels -= correction & (negmask << 16);

        add = qadd & ~negmask;
        sub = qadd &  negmask;
        /* Set qadd to 0 for levels == 0. */
        add = zap(add, zeros);

        levels += add;
        levels -= sub;

        stq(levels, block);
    }
}
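
/* A portable sketch of the per-lane sign mask that both the MAX path
   (maxsw4/minsw4) and the cmpbge/zap fallback above construct: 0xffff in
   every 16-bit subword whose element is negative, 0x0000 elsewhere.  The
   helper name negmask_ref is illustrative only and not part of the
   original source. */
static inline uint64_t negmask_ref(uint64_t levels)
{
    /* One bit per negative subword, at bit positions 0, 16, 32 and 48. */
    uint64_t signbits = (levels & 0x8000800080008000ULL) >> 15;

    /* Widen each single bit to a full 0xffff lane; lanes cannot carry into
       each other because 0xffff fits exactly in 16 bits. */
    return signbits * 0xffff;
}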
static void dct_unquantize_h263_intra_axp(MpegEncContext *s, DCTELEM *block,
                                          int n, int qscale)
{
    int i, n_coeffs;
    uint64_t qmul, qadd;
    uint64_t correction;
    DCTELEM *orig_block = block;
    DCTELEM block0; /* might not be used uninitialized */

    qadd = WORD_VEC((qscale - 1) | 1);
    qmul = qscale << 1;
    /* This mask kills spill from negative subwords to the next subword. */
    correction = WORD_VEC((qmul - 1) + 1); /* multiplication / addition */

    if (!s->h263_aic) {
        if (n < 4)
            block0 = block[0] * s->y_dc_scale;
        else
            block0 = block[0] * s->c_dc_scale;
    } else {
        qadd = 0;
    }
    n_coeffs = 63; // does not always use zigzag table

    for (i = 0; i <= n_coeffs; block += 4, i += 4) {
        uint64_t levels, negmask, zeros, add;

        levels = ldq(block);
        if (levels == 0)
            continue;

#ifdef __alpha_max__
        /* I don't think the speed difference justifies runtime detection. */
        negmask = maxsw4(levels, -1); /* negative -> ffff (-1) */
        negmask = minsw4(negmask, 0); /* positive -> 0000 (0)  */
#else
        negmask = cmpbge(WORD_VEC(0x7fff), levels);
        negmask &= (negmask >> 1) | (1 << 7);
        negmask = zap(-1, negmask);
#endif

        zeros = cmpbge(0, levels);
        zeros &= zeros >> 1;
        /* zeros |= zeros << 1 is not needed since qadd <= 255, so zapping
           the lower byte suffices. */

        levels *= qmul;
        levels -= correction & (negmask << 16);

        /* Negate qadd for negative levels. */
        add = qadd ^ negmask;
        add += WORD_VEC(0x0001) & negmask;
        /* Set qadd to 0 for levels == 0. */
        add = zap(add, zeros);

        levels += add;

        stq(levels, block);
    }

    if (s->mb_intra && !s->h263_aic)
        orig_block[0] = block0;
}