/**
 * Template: convert between two planar YUV layouts that share the same
 * chroma subsampling (SS_W/SS_H), applying a 3x3 color matrix `c` plus
 * per-range luma offsets `yuv_offset` in fixed point.
 *
 * Expanded per (IN_BIT_DEPTH, OUT_BIT_DEPTH, SS_W, SS_H) via the fn()/
 * ipixel/opixel macros; `c[i][j][0]` holds the scalar coefficient
 * (remaining 7 entries are SIMD replication — only index 0 is read here).
 *
 * @param _dst        output planes (Y, U, V)
 * @param dst_stride  output strides in bytes
 * @param _src        input planes (Y, U, V)
 * @param src_stride  input strides in bytes
 * @param w, h        luma dimensions in pixels
 * @param c           fixed-point 3x3 matrix, Q14 scaled by the depth delta
 * @param yuv_offset  [0] = input luma offset, [1] = output luma offset
 */
static void fn(yuv2yuv)(uint8_t *_dst[3], const ptrdiff_t dst_stride[3],
                        uint8_t *_src[3], const ptrdiff_t src_stride[3],
                        int w, int h, const int16_t c[3][3][8],
                        const int16_t yuv_offset[2][8])
{
    opixel **dst = (opixel **) _dst;
    ipixel **src = (ipixel **) _src;
    const ipixel *src0 = src[0], *src1 = src[1], *src2 = src[2];
    opixel *dst0 = dst[0], *dst1 = dst[1], *dst2 = dst[2];
    int y, x;
    /* total right-shift to go from (input depth * Q14) to output depth */
    const int sh = 14 + IN_BIT_DEPTH - OUT_BIT_DEPTH;
    const int rnd = 1 << (sh - 1);  /* rounding term for the >> sh */
    int y_off_in = yuv_offset[0][0];
    int y_off_out = yuv_offset[1][0] << sh;
    /* chroma is centered around half-scale at each bit depth */
    const int uv_off_in = 128 << (IN_BIT_DEPTH - 8);
    const int uv_off_out = rnd + (128 << (OUT_BIT_DEPTH - 8 + sh));
    int cyy = c[0][0][0], cyu = c[0][1][0], cyv = c[0][2][0];
    int cuu = c[1][1][0], cuv = c[1][2][0], cvu = c[2][1][0], cvv = c[2][2][0];

    /* yuv2yuv never mixes luma into chroma, so those coefficients must be 0 */
    av_assert2(c[1][0][0] == 0);
    av_assert2(c[2][0][0] == 0);
    /* iterate in chroma resolution; each step covers a 2^SS_W x 2^SS_H
     * luma block sharing one chroma sample pair */
    w = AV_CEIL_RSHIFT(w, SS_W);
    h = AV_CEIL_RSHIFT(h, SS_H);
    for (y = 0; y < h; y++) {
        for (x = 0; x < w; x++) {
            int y00 = src0[x << SS_W] - y_off_in;
#if SS_W == 1
            int y01 = src0[2 * x + 1] - y_off_in;
#if SS_H == 1
            int y10 = src0[src_stride[0] / sizeof(ipixel) + 2 * x] - y_off_in;
            int y11 = src0[src_stride[0] / sizeof(ipixel) + 2 * x + 1] - y_off_in;
#endif
#endif
            int u = src1[x] - uv_off_in, v = src2[x] - uv_off_in;
            /* chroma contribution to luma is shared by the whole block */
            int uv_val = cyu * u + cyv * v + rnd + y_off_out;

            dst0[x << SS_W] = av_clip_pixel((cyy * y00 + uv_val) >> sh);
#if SS_W == 1
            dst0[x * 2 + 1] = av_clip_pixel((cyy * y01 + uv_val) >> sh);
#if SS_H == 1
            dst0[x * 2 + 0 + dst_stride[0] / sizeof(opixel)] = av_clip_pixel((cyy * y10 + uv_val) >> sh);
            dst0[x * 2 + 1 + dst_stride[0] / sizeof(opixel)] = av_clip_pixel((cyy * y11 + uv_val) >> sh);
#endif
#endif
            dst1[x] = av_clip_pixel((u * cuu + v * cuv + uv_off_out) >> sh);
            dst2[x] = av_clip_pixel((u * cvu + v * cvv + uv_off_out) >> sh);
        }
        /* luma advances 2^SS_H rows per iteration, chroma one row */
        dst0 += (dst_stride[0] * (1 << SS_H)) / sizeof(opixel);
        dst1 += dst_stride[1] / sizeof(opixel);
        dst2 += dst_stride[2] / sizeof(opixel);
        src0 += (src_stride[0] * (1 << SS_H)) / sizeof(ipixel);
        src1 += src_stride[1] / sizeof(ipixel);
        src2 += src_stride[2] / sizeof(ipixel);
    }
}
static av_always_inline void emulated_edge_mc(uint8_t *dst, const uint8_t *src, ptrdiff_t dst_stride, ptrdiff_t src_stride, x86_reg block_w, x86_reg block_h, x86_reg src_x, x86_reg src_y, x86_reg w, x86_reg h, emu_edge_vfix_func * const *vfix_tbl, emu_edge_vvar_func *v_extend_var, emu_edge_hfix_func * const *hfix_tbl, emu_edge_hvar_func *h_extend_var) { x86_reg start_y, start_x, end_y, end_x, src_y_add = 0, p; if (!w || !h) return; av_assert2(block_w <= FFABS(dst_stride)); if (src_y >= h) { src -= src_y*src_stride; src_y_add = h - 1; src_y = h - 1; } else if (src_y <= -block_h) { src -= src_y*src_stride; src_y_add = 1 - block_h; src_y = 1 - block_h; } if (src_x >= w) { src += w - 1 - src_x; src_x = w - 1; } else if (src_x <= -block_w) { src += 1 - block_w - src_x; src_x = 1 - block_w; } start_y = FFMAX(0, -src_y); start_x = FFMAX(0, -src_x); end_y = FFMIN(block_h, h-src_y); end_x = FFMIN(block_w, w-src_x); av_assert2(start_x < end_x && block_w > 0); av_assert2(start_y < end_y && block_h > 0); // fill in the to-be-copied part plus all above/below src += (src_y_add + start_y) * src_stride + start_x; w = end_x - start_x; if (w <= 22) { vfix_tbl[w - 1](dst + start_x, dst_stride, src, src_stride, start_y, end_y, block_h); } else { v_extend_var(dst + start_x, dst_stride, src, src_stride, start_y, end_y, block_h, w); } // fill left if (start_x) { if (start_x <= 22) { hfix_tbl[(start_x - 1) >> 1](dst, dst_stride, start_x, block_h); } else {
/**
 * Select the AC VLC table for the given IntraX8 coding mode, reading a
 * 3-bit table index from the bitstream if no table is selected yet.
 * Subsequent calls with the same mode are no-ops.
 */
static inline void x8_select_ac_table(IntraX8Context * const w , int mode){
    MpegEncContext * const s= w->s;

    av_assert2(mode<4);

    if (!w->j_ac_vlc[mode]) {
        int idx = get_bits(&s->gb, 3);
        /* modes 0/1 and 2/3 each share one table group (mode>>1) */
        w->j_ac_vlc[mode] = &j_ac_vlc[w->quant<13][mode>>1][idx];
        av_assert2(w->j_ac_vlc[mode]);
    }
}
/**
 * Apply the 2x3 affine transform in `matrix` to the plane `src`,
 * writing the result to `dst`.
 *
 * For every output pixel the source position (x_s, y_s) is computed,
 * a fallback value `def` is chosen according to `fill`, and the chosen
 * interpolator produces the output sample.
 *
 * @return 0 on success, AVERROR(EINVAL) on unknown interpolation method
 */
int avfilter_transform(const uint8_t *src, uint8_t *dst,
                       int src_stride, int dst_stride,
                       int width, int height, const float *matrix,
                       enum InterpolateMethod interpolate, enum FillMethod fill)
{
    int x, y;
    uint8_t (*interp)(float, float, const uint8_t *, int, int, int, uint8_t);

    switch (interpolate) {
    case INTERPOLATE_NEAREST:     interp = interpolate_nearest;     break;
    case INTERPOLATE_BILINEAR:    interp = interpolate_bilinear;    break;
    case INTERPOLATE_BIQUADRATIC: interp = interpolate_biquadratic; break;
    default:
        return AVERROR(EINVAL);
    }

    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            uint8_t def = 0;
            float x_s = x * matrix[0] + y * matrix[1] + matrix[2];
            float y_s = x * matrix[3] + y * matrix[4] + matrix[5];

            switch (fill) {
            case FILL_ORIGINAL:
                def = src[y * src_stride + x];
                break;
            case FILL_CLAMP:
                y_s = av_clipf(y_s, 0, height - 1);
                x_s = av_clipf(x_s, 0, width - 1);
                def = src[(int)y_s * src_stride + (int)x_s];
                break;
            case FILL_MIRROR:
                x_s = mirror(x_s, width - 1);
                y_s = mirror(y_s, height - 1);
                av_assert2(x_s >= 0 && y_s >= 0);
                av_assert2(x_s < width && y_s < height);
                def = src[(int)y_s * src_stride + (int)x_s];
                /* FILL_BLANK (and any other value) leaves def == 0 */
            }

            dst[y * dst_stride + x] = interp(x_s, y_s, src, width, height,
                                             src_stride, def);
        }
    }
    return 0;
}
/**
 * 0th order modified Bessel function of the first kind.
 *
 * Evaluates the power series I0(x) = sum_k ((x^2/4)^k / (k!)^2) term by
 * term until the partial sum stops changing in double precision.  The
 * reciprocal squares 1/k^2 are taken from a precomputed table, which
 * bounds the series at 100 terms (asserted below).
 */
static double bessel(double x){
    static const double inv[100]={
        1.0/( 1* 1), 1.0/( 2* 2), 1.0/( 3* 3), 1.0/( 4* 4), 1.0/( 5* 5),
        1.0/( 6* 6), 1.0/( 7* 7), 1.0/( 8* 8), 1.0/( 9* 9), 1.0/(10*10),
        1.0/(11*11), 1.0/(12*12), 1.0/(13*13), 1.0/(14*14), 1.0/(15*15),
        1.0/(16*16), 1.0/(17*17), 1.0/(18*18), 1.0/(19*19), 1.0/(20*20),
        1.0/(21*21), 1.0/(22*22), 1.0/(23*23), 1.0/(24*24), 1.0/(25*25),
        1.0/(26*26), 1.0/(27*27), 1.0/(28*28), 1.0/(29*29), 1.0/(30*30),
        1.0/(31*31), 1.0/(32*32), 1.0/(33*33), 1.0/(34*34), 1.0/(35*35),
        1.0/(36*36), 1.0/(37*37), 1.0/(38*38), 1.0/(39*39), 1.0/(40*40),
        1.0/(41*41), 1.0/(42*42), 1.0/(43*43), 1.0/(44*44), 1.0/(45*45),
        1.0/(46*46), 1.0/(47*47), 1.0/(48*48), 1.0/(49*49), 1.0/(50*50),
        1.0/(51*51), 1.0/(52*52), 1.0/(53*53), 1.0/(54*54), 1.0/(55*55),
        1.0/(56*56), 1.0/(57*57), 1.0/(58*58), 1.0/(59*59), 1.0/(60*60),
        1.0/(61*61), 1.0/(62*62), 1.0/(63*63), 1.0/(64*64), 1.0/(65*65),
        1.0/(66*66), 1.0/(67*67), 1.0/(68*68), 1.0/(69*69), 1.0/(70*70),
        1.0/(71*71), 1.0/(72*72), 1.0/(73*73), 1.0/(74*74), 1.0/(75*75),
        1.0/(76*76), 1.0/(77*77), 1.0/(78*78), 1.0/(79*79), 1.0/(80*80),
        1.0/(81*81), 1.0/(82*82), 1.0/(83*83), 1.0/(84*84), 1.0/(85*85),
        1.0/(86*86), 1.0/(87*87), 1.0/(88*88), 1.0/(89*89), 1.0/(90*90),
        1.0/(91*91), 1.0/(92*92), 1.0/(93*93), 1.0/(94*94), 1.0/(95*95),
        1.0/(96*96), 1.0/(97*97), 1.0/(98*98), 1.0/(99*99), 1.0/(10000)
    };
    double sum  = 1;   /* running partial sum, starts with the k=0 term */
    double prev = 0;   /* partial sum from the previous iteration */
    double term = 1;   /* current series term */
    int k = 0;

    x = x*x/4;         /* the series variable is x^2/4 */
    while (sum != prev) {
        prev  = sum;
        term *= x * inv[k];
        sum  += term;
        av_assert2(k < 99);
        k++;
    }
    return sum;
}
/**
 * Read one delta table for a TM2 stream from the bitstream.
 *
 * The header gives the number of deltas (9 bits) and their width in
 * bits (5 bits); each delta is then read and sign-extended.  Remaining
 * table slots are zeroed.
 *
 * @return 0 on success, AVERROR_INVALIDDATA on an out-of-range header
 */
static int tm2_read_deltas(TM2Context *ctx, int stream_id)
{
    int i;
    int d  = get_bits(&ctx->gb, 9);
    int mb = get_bits(&ctx->gb, 5);

    av_assert2(mb < 32);
    if ((d < 1) || (d > TM2_DELTAS) || (mb < 1)) {
        av_log(ctx->avctx, AV_LOG_ERROR, "Incorrect delta table: %i deltas x %i bits\n", d, mb);
        return AVERROR_INVALIDDATA;
    }

    for (i = 0; i < d; i++) {
        int v = get_bits_long(&ctx->gb, mb);
        /* sign-extend from mb bits */
        ctx->deltas[stream_id][i] = (v & (1 << (mb - 1))) ? v - (1 << mb) : v;
    }

    for (; i < TM2_DELTAS; i++)
        ctx->deltas[stream_id][i] = 0;

    return 0;
}
int RENAME(swri_resample)(ResampleContext *c, DELEM *dst, const DELEM *src, int *consumed, int src_size, int dst_size, int update_ctx){ int dst_index, i; int index= c->index; int frac= c->frac; int dst_incr_frac= c->dst_incr % c->src_incr; int dst_incr= c->dst_incr / c->src_incr; av_assert1(c->filter_shift == FILTER_SHIFT); av_assert1(c->felem_size == sizeof(FELEM)); if (c->filter_length == 1 && c->phase_shift == 0) { int64_t index2= (1LL<<32)*c->frac/c->src_incr + (1LL<<32)*index; int64_t incr= (1LL<<32) * c->dst_incr / c->src_incr; int new_size = (src_size * (int64_t)c->src_incr - frac + c->dst_incr - 1) / c->dst_incr; dst_size= FFMIN(dst_size, new_size); for(dst_index=0; dst_index < dst_size; dst_index++){ dst[dst_index] = src[index2>>32]; index2 += incr; } index += dst_index * dst_incr; index += (frac + dst_index * (int64_t)dst_incr_frac) / c->src_incr; frac = (frac + dst_index * (int64_t)dst_incr_frac) % c->src_incr; av_assert2(index >= 0); *consumed= index; index = 0; } else if (index >= 0 &&
/**
 * H.263 intra dequantization, ARMv5TE path.
 *
 * The DC coefficient is rescaled in C (the scale depends on luma vs
 * chroma and on AIC mode), the AC coefficients are dequantized by the
 * assembly helper, then the DC value is written back over whatever the
 * helper left in block[0].
 */
static void dct_unquantize_h263_intra_armv5te(MpegEncContext *s,
                                              int16_t *block, int n, int qscale)
{
    int dc, qmul, qadd, nCoeffs;

    av_assert2(s->block_last_index[n]>=0);

    qmul = qscale << 1;
    if (s->h263_aic) {
        qadd = 0;
        dc   = block[0];
    } else {
        qadd = (qscale - 1) | 1;
        dc   = block[0] * (n < 4 ? s->y_dc_scale : s->c_dc_scale);
    }

    nCoeffs = s->ac_pred ? 63
                         : s->inter_scantable.raster_end[ s->block_last_index[n] ];

    ff_dct_unquantize_h263_armv5te(block, qmul, qadd, nCoeffs + 1);
    block[0] = dc;  /* DC is handled here, not by the asm helper */
}
int RENAME(swri_resample)(ResampleContext *c, DELEM *dst, const DELEM *src, int *consumed, int src_size, int dst_size, int update_ctx){ int dst_index, i; int index= c->index; int frac= c->frac; int dst_incr_frac= c->dst_incr % c->src_incr; int dst_incr= c->dst_incr / c->src_incr; int compensation_distance= c->compensation_distance; av_assert1(c->filter_shift == FILTER_SHIFT); av_assert1(c->felem_size == sizeof(FELEM)); if(compensation_distance == 0 && c->filter_length == 1 && c->phase_shift==0){ int64_t index2= ((int64_t)index)<<32; int64_t incr= (1LL<<32) * c->dst_incr / c->src_incr; dst_size= FFMIN(dst_size, (src_size-1-index) * (int64_t)c->src_incr / c->dst_incr); for(dst_index=0; dst_index < dst_size; dst_index++){ dst[dst_index] = src[index2>>32]; index2 += incr; } index += dst_index * dst_incr; index += (frac + dst_index * (int64_t)dst_incr_frac) / c->src_incr; frac = (frac + dst_index * (int64_t)dst_incr_frac) % c->src_incr; av_assert2(index >= 0); *consumed= index >> c->phase_shift; index &= c->phase_mask; }else if(compensation_distance == 0 && !c->linear && index >= 0){
/**
 * Hybrid window filtering, see blocks 36 and 49 of the G.728 specification.
 *
 * @param ractx   decoder context (provides the float DSP vector_fmul)
 * @param order   filter order
 * @param n       input length
 * @param non_rec number of non-recursive samples
 * @param out     pointer to the non-recursive part of the output
 * @param hist    pointer to the input history of the filter
 * @param out2    pointer to the recursive part of the output
 * @param window  pointer to the windowing function table
 */
static void do_hybrid_window(RA288Context *ractx,
                             int order, int n, int non_rec, float *out,
                             float *hist, float *out2, const float *window)
{
    int i;
    float buffer1[MAX_BACKWARD_FILTER_ORDER + 1];
    float buffer2[MAX_BACKWARD_FILTER_ORDER + 1];
    LOCAL_ALIGNED(32, float, work, [FFALIGN(MAX_BACKWARD_FILTER_ORDER + MAX_BACKWARD_FILTER_LEN + MAX_BACKWARD_FILTER_NONREC, 16)]);

    av_assert2(order>=0);

    /* apply the window to the whole history buffer */
    ractx->fdsp->vector_fmul(work, window, hist, FFALIGN(order + n + non_rec, 16));

    /* correlation terms for the recursive (n) and non-recursive parts */
    convolve(buffer1, work + order , n , order);
    convolve(buffer2, work + order + n, non_rec, order);

    for (i=0; i <= order; i++) {
        /* recursive part decays by 0.5625 on each update (block 49) */
        out2[i] = out2[i] * 0.5625 + buffer1[i];
        out [i] = out2[i] + buffer2[i];
    }

    /* Multiply by the white noise correcting factor (WNCF). */
    *out *= 257.0 / 256.0;
}
/**
 * Write a single byte to the I/O context, flushing the internal
 * buffer once it is full.
 */
void avio_w8(AVIOContext *s, int b)
{
    av_assert2(b >= -128 && b <= 255);

    *s->buf_ptr = b;
    s->buf_ptr++;
    if (s->buf_ptr >= s->buf_end)
        flush_buffer(s);
}
/**
 * H.263 inter-block dequantization, MMX version.
 *
 * In-place dequantizes block[0..nCoeffs] with level' = sign(level) *
 * (|level| * qmul + qadd), leaving zero coefficients untouched.  The
 * loop iterates backwards from block+nCoeffs using a negative offset
 * counted up to zero ("jng 1b").
 */
static void dct_unquantize_h263_inter_mmx(MpegEncContext *s,
                                          int16_t *block, int n, int qscale)
{
    x86_reg qmul, qadd, nCoeffs;

    qmul = qscale << 1;
    qadd = (qscale - 1) | 1;   /* force qadd odd */

    av_assert2(s->block_last_index[n]>=0 || s->h263_aic);

    nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];

    __asm__ volatile(
        "movd %1, %%mm6 \n\t" //qmul
        "packssdw %%mm6, %%mm6 \n\t"
        "packssdw %%mm6, %%mm6 \n\t"
        "movd %2, %%mm5 \n\t" //qadd
        "pxor %%mm7, %%mm7 \n\t"
        "packssdw %%mm5, %%mm5 \n\t"
        "packssdw %%mm5, %%mm5 \n\t"
        "psubw %%mm5, %%mm7 \n\t"  // mm7 = -qadd
        "pxor %%mm4, %%mm4 \n\t"
        ".p2align 4 \n\t"
        "1: \n\t"
        "movq (%0, %3), %%mm0 \n\t"
        "movq 8(%0, %3), %%mm1 \n\t"
        "pmullw %%mm6, %%mm0 \n\t"
        "pmullw %%mm6, %%mm1 \n\t"
        "movq (%0, %3), %%mm2 \n\t"
        "movq 8(%0, %3), %%mm3 \n\t"
        "pcmpgtw %%mm4, %%mm2 \n\t" // block[i] < 0 ? -1 : 0
        "pcmpgtw %%mm4, %%mm3 \n\t" // block[i] < 0 ? -1 : 0
        "pxor %%mm2, %%mm0 \n\t"
        "pxor %%mm3, %%mm1 \n\t"
        "paddw %%mm7, %%mm0 \n\t"
        "paddw %%mm7, %%mm1 \n\t"
        "pxor %%mm0, %%mm2 \n\t"
        "pxor %%mm1, %%mm3 \n\t"
        "pcmpeqw %%mm7, %%mm0 \n\t" // block[i] == 0 ? -1 : 0
        "pcmpeqw %%mm7, %%mm1 \n\t" // block[i] == 0 ? -1 : 0
        "pandn %%mm2, %%mm0 \n\t"   // keep zeros as zero
        "pandn %%mm3, %%mm1 \n\t"
        "movq %%mm0, (%0, %3) \n\t"
        "movq %%mm1, 8(%0, %3) \n\t"
        "add $16, %3 \n\t"
        "jng 1b \n\t"
        ::"r" (block+nCoeffs), "rm"(qmul), "rm" (qadd), "r" (2*(-nCoeffs))
        : "memory"
    );
}
/**
 * Hash function adding a character to an LZW prefix code.
 *
 * The single conditional subtraction keeps the result in range because
 * head ^ (add << LZW_HASH_SHIFT) is always below 2 * LZW_HASH_SIZE.
 *
 * @param head LZW code for prefix
 * @param add  Character to add
 * @return New hash value, in [0, LZW_HASH_SIZE)
 */
static inline int hash(int head, const int add)
{
    int h = head ^ (add << LZW_HASH_SHIFT);
    if (h >= LZW_HASH_SIZE)
        h -= LZW_HASH_SIZE;
    av_assert2(h >= 0 && h < LZW_HASH_SIZE);
    return h;
}
/** Discard `size` bytes from the FIFO by advancing the read pointer. */
void av_fifo_drain(AVFifoBuffer *f, int size)
{
    av_assert2(av_fifo_size(f) >= size);

    f->rptr += size;
    if (f->rptr >= f->end) {
        /* wrap the read pointer back into the ring buffer */
        f->rptr -= f->end - f->buffer;
    }
    f->rndx += size;
}
/**
 * Convert `len` samples from `in` to `out` using the converter `ctx`.
 *
 * If a SIMD conversion function is available, there is no channel
 * remapping, and all plane pointers satisfy the SIMD alignment masks,
 * the bulk of the data (len rounded down to a multiple of 16) is done
 * with ctx->simd_f; any remainder — and all other cases — go through
 * the scalar per-channel ctx->conv_f path.
 *
 * @return 0 on success
 */
int swri_audio_convert(AudioConvert *ctx, AudioData *out, AudioData *in, int len)
{
    int ch;
    int off=0;                                        /* samples already converted by SIMD */
    const int os= (out->planar ? 1 :out->ch_count) *out->bps; /* output stride per sample */
    unsigned misaligned = 0;

    av_assert0(ctx->channels == out->ch_count);

    /* OR all plane addresses together so one masked test covers them all */
    if (ctx->in_simd_align_mask) {
        int planes = in->planar ? in->ch_count : 1;
        unsigned m = 0;
        for (ch = 0; ch < planes; ch++)
            m |= (intptr_t)in->ch[ch];
        misaligned |= m & ctx->in_simd_align_mask;
    }
    if (ctx->out_simd_align_mask) {
        int planes = out->planar ? out->ch_count : 1;
        unsigned m = 0;
        for (ch = 0; ch < planes; ch++)
            m |= (intptr_t)out->ch[ch];
        misaligned |= m & ctx->out_simd_align_mask;
    }

    //FIXME optimize common cases

    if(ctx->simd_f && !ctx->ch_map && !misaligned){
        off = len&~15;                      /* SIMD path handles multiples of 16 */
        av_assert1(off>=0);
        av_assert1(off<=len);
        av_assert2(ctx->channels == SWR_CH_MAX || !in->ch[ctx->channels]);
        if(off>0){
            if(out->planar == in->planar){
                int planes = out->planar ? out->ch_count : 1;
                for(ch=0; ch<planes; ch++){
                    ctx->simd_f(out->ch+ch, (const uint8_t **)in->ch+ch, off * (out->planar ? 1 :out->ch_count));
                }
            }else{
                /* packed<->planar conversion is done in one call */
                ctx->simd_f(out->ch, (const uint8_t **)in->ch, off);
            }
        }
        if(off == len)
            return 0;
    }

    /* scalar fallback / tail: one conv_f call per output channel */
    for(ch=0; ch<ctx->channels; ch++){
        const int ich= ctx->ch_map ? ctx->ch_map[ch] : ch;   /* negative = silence */
        const int is= ich < 0 ? 0 : (in->planar ? 1 : in->ch_count) * in->bps;
        const uint8_t *pi= ich < 0 ? ctx->silence : in->ch[ich];
        uint8_t *po= out->ch[ch];
        uint8_t *end= po + os*len;
        if(!po)
            continue;
        ctx->conv_f(po+off*os, pi+off*is, is, os, end);
    }
    return 0;
}
/**
 * Reduce num/den to the best rational approximation with numerator and
 * denominator each bounded by `max`, using a continued-fraction
 * expansion with a semiconvergent check for the final term.
 *
 * @return 1 if the result is exact (den reached 0), 0 if it was rounded
 */
int av_reduce(int *dst_num, int *dst_den,
              int64_t num, int64_t den, int64_t max)
{
    /* a0/a1 are consecutive continued-fraction convergents */
    AVRational a0 = { 0, 1 }, a1 = { 1, 0 };
    int sign = (num < 0) ^ (den < 0);
    int64_t gcd = av_gcd(FFABS(num), FFABS(den));

    if (gcd) {
        num = FFABS(num) / gcd;
        den = FFABS(den) / gcd;
    }
    if (num <= max && den <= max) {
        /* already representable exactly; den = 0 terminates the loop */
        a1  = (AVRational) { num, den };
        den = 0;
    }

    while (den) {
        uint64_t x        = num / den;
        int64_t next_den  = num - den * x;
        int64_t a2n       = x * a1.num + a0.num;
        int64_t a2d       = x * a1.den + a0.den;

        if (a2n > max || a2d > max) {
            /* next convergent overflows; try the largest semiconvergent
             * that still fits, keeping it only if it is a better
             * approximation than a1 */
            if (a1.num) x =          (max - a0.num) / a1.num;
            if (a1.den) x = FFMIN(x, (max - a0.den) / a1.den);

            if (den * (2 * x * a1.den + a0.den) > num * a1.den)
                a1 = (AVRational) { x * a1.num + a0.num, x * a1.den + a0.den };
            break;
        }

        a0  = a1;
        a1  = (AVRational) { a2n, a2d };
        num = den;
        den = next_den;
    }

    av_assert2(av_gcd(a1.num, a1.den) <= 1U);
    av_assert2(a1.num <= max && a1.den <= max);

    *dst_num = sign ? -a1.num : a1.num;
    *dst_den = a1.den;

    return den == 0;
}
/**
 * Copy a block_w x block_h block from (src_x, src_y) in a w x h source
 * plane into `buf`, replicating edge pixels for the parts of the block
 * that fall outside the source.  The actual fill/extend work is done by
 * the supplied assembly core function.
 *
 * Positions entirely outside the source are first clamped to the
 * nearest position that still overlaps it by one row/column.
 */
static av_always_inline void emulated_edge_mc(uint8_t *buf, const uint8_t *src,
                                              ptrdiff_t linesize_arg,
                                              int block_w, int block_h,
                                              int src_x, int src_y,
                                              int w, int h,
                                              emu_edge_core_func *core_fn)
{
    int start_y, start_x, end_y, end_x, src_y_add = 0;
    int linesize = linesize_arg;

    if(!w || !h)
        return;

    /* clamp fully-outside vertical positions; src_y_add compensates the
     * pointer for the adjusted src_y */
    if (src_y >= h) {
        src -= src_y*linesize;
        src_y_add = h - 1;
        src_y = h - 1;
    } else if (src_y <= -block_h) {
        src -= src_y*linesize;
        src_y_add = 1 - block_h;
        src_y = 1 - block_h;
    }
    /* clamp fully-outside horizontal positions */
    if (src_x >= w) {
        src += w - 1 - src_x;
        src_x = w - 1;
    } else if (src_x <= -block_w) {
        src += 1 - block_w - src_x;
        src_x = 1 - block_w;
    }

    /* portion of the block actually covered by real source pixels */
    start_y = FFMAX(0, -src_y);
    start_x = FFMAX(0, -src_x);
    end_y = FFMIN(block_h, h-src_y);
    end_x = FFMIN(block_w, w-src_x);
    av_assert2(start_x < end_x && block_w > 0);
    av_assert2(start_y < end_y && block_h > 0);

    // fill in the to-be-copied part plus all above/below
    src += (src_y_add + start_y) * linesize + start_x;
    buf += start_x;
    core_fn(buf, src, linesize, start_y, end_y,
            block_h, start_x, end_x, block_w);
}
/**
 * Derive the luma QP predictor for the current quantization group,
 * averaging the QPs of the left (A) and above (B) neighbours when they
 * are available inside the same CTB, falling back to the running
 * predictor otherwise (HEVC QP prediction).
 */
static int get_qPy_pred(HEVCContext *s, int xC, int yC,
                        int xBase, int yBase, int log2_cb_size)
{
    HEVCLocalContext *lc = s->HEVClc;
    int ctb_size_mask        = (1 << s->sps->log2_ctb_size) - 1;
    int MinCuQpDeltaSizeMask = (1 << (s->sps->log2_ctb_size -
                                      s->pps->diff_cu_qp_delta_depth)) - 1;
    /* top-left corner of the quantization group containing (xBase,yBase) */
    int xQgBase              = xBase - (xBase & MinCuQpDeltaSizeMask);
    int yQgBase              = yBase - (yBase & MinCuQpDeltaSizeMask);
    int min_cb_width         = s->sps->min_cb_width;
    int x_cb                 = xQgBase >> s->sps->log2_min_cb_size;
    int y_cb                 = yQgBase >> s->sps->log2_min_cb_size;
    /* a neighbour is only available if it lies within the same CTB */
    int availableA           = (xBase & ctb_size_mask) &&
                               (xQgBase & ctb_size_mask);
    int availableB           = (yBase & ctb_size_mask) &&
                               (yQgBase & ctb_size_mask);
    int qPy_pred, qPy_a, qPy_b;

    // qPy_pred
    if (lc->first_qp_group || (!xQgBase && !yQgBase)) {
        lc->first_qp_group = !lc->tu.is_cu_qp_delta_coded;
        qPy_pred = s->sh.slice_qp;
    } else {
        qPy_pred = lc->qPy_pred;
    }

    // qPy_a
    if (availableA == 0)
        qPy_a = qPy_pred;
    else
        qPy_a = s->qp_y_tab[(x_cb - 1) + y_cb * min_cb_width];

    // qPy_b
    if (availableB == 0)
        qPy_b = qPy_pred;
    else
        qPy_b = s->qp_y_tab[x_cb + (y_cb - 1) * min_cb_width];

    av_assert2(qPy_a >= -s->sps->qp_bd_offset && qPy_a < 52);
    av_assert2(qPy_b >= -s->sps->qp_bd_offset && qPy_b < 52);

    return (qPy_a + qPy_b + 1) >> 1;
}
ERROR
#endif

/**
 * Noise-shaped dithering template (expanded per sample type via RENAME/
 * DELEM/CLIP).  For each sample, subtracts the noise-shaping filter's
 * prediction of past quantization errors, adds dither noise, rounds,
 * and records the new error in a ring buffer of `taps` entries.
 *
 * The error filter loop is unrolled by 4, hence the assertions on
 * taps&3 below (taps may be 4k, 4k+1 but not 4k+2; 4k+3 only with a
 * zero final coefficient).
 */
void RENAME(swri_noise_shaping)(SwrContext *s, AudioData *dsts,
                                const AudioData *srcs, const AudioData *noises,
                                int count)
{
    int pos = s->dither.ns_pos;
    int i, j, ch;
    int taps  = s->dither.ns_taps;
    float S   = s->dither.ns_scale;
    float S_1 = s->dither.ns_scale_1;

    av_assert2((taps&3) != 2);
    av_assert2((taps&3) != 3 || s->dither.ns_coeffs[taps] == 0);

    for (ch=0; ch<srcs->ch_count; ch++) {
        const float *noise = ((const float *)noises->ch[ch]) + s->dither.noise_pos;
        const DELEM *src = (const DELEM*)srcs->ch[ch];
        DELEM *dst = (DELEM*)dsts->ch[ch];
        float *ns_errors = s->dither.ns_errors[ch];
        const float *ns_coeffs = s->dither.ns_coeffs;
        pos  = s->dither.ns_pos;    /* each channel replays the same ring position */
        for (i=0; i<count; i++) {
            double d1, d = src[i]*S_1;
            /* subtract the filtered history of quantization errors */
            for(j=0; j<taps-2; j+=4) {
                d -= ns_coeffs[j    ] * ns_errors[pos + j    ]
                    +ns_coeffs[j + 1] * ns_errors[pos + j + 1]
                    +ns_coeffs[j + 2] * ns_errors[pos + j + 2]
                    +ns_coeffs[j + 3] * ns_errors[pos + j + 3];
            }
            if(j < taps)
                d -= ns_coeffs[j] * ns_errors[pos + j];
            pos = pos ? pos - 1 : taps - 1;
            d1 = rint(d + noise[i]);
            /* error buffer is mirrored at pos and pos+taps so reads never wrap */
            ns_errors[pos + taps] = ns_errors[pos] = d1 - d;
            d1 *= S;
            CLIP(d1);
            dst[i] = d1;
        }
    }

    s->dither.ns_pos = pos;
}
/**
 * 8x8 Hadamard transform based intra cost: sum of absolute transform
 * coefficients of the source block, with the DC term (the block mean
 * contribution) subtracted at the end.
 *
 * Rows are transformed with BUTTERFLY2/BUTTERFLY1 into temp[], then the
 * columns are transformed and accumulated with BUTTERFLYA (abs sums).
 */
static int hadamard8_intra8x8_c(MpegEncContext *s, uint8_t *src,
                                uint8_t *dummy, ptrdiff_t stride, int h)
{
    int i, temp[64], sum = 0;

    av_assert2(h == 8);

    for (i = 0; i < 8; i++) {
        // FIXME: try pointer walks
        /* horizontal (per-row) butterflies */
        BUTTERFLY2(temp[8 * i + 0], temp[8 * i + 1],
                   src[stride * i + 0], src[stride * i + 1]);
        BUTTERFLY2(temp[8 * i + 2], temp[8 * i + 3],
                   src[stride * i + 2], src[stride * i + 3]);
        BUTTERFLY2(temp[8 * i + 4], temp[8 * i + 5],
                   src[stride * i + 4], src[stride * i + 5]);
        BUTTERFLY2(temp[8 * i + 6], temp[8 * i + 7],
                   src[stride * i + 6], src[stride * i + 7]);

        BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 2]);
        BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 3]);
        BUTTERFLY1(temp[8 * i + 4], temp[8 * i + 6]);
        BUTTERFLY1(temp[8 * i + 5], temp[8 * i + 7]);

        BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 4]);
        BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 5]);
        BUTTERFLY1(temp[8 * i + 2], temp[8 * i + 6]);
        BUTTERFLY1(temp[8 * i + 3], temp[8 * i + 7]);
    }

    for (i = 0; i < 8; i++) {
        /* vertical (per-column) butterflies, accumulating |coeff| */
        BUTTERFLY1(temp[8 * 0 + i], temp[8 * 1 + i]);
        BUTTERFLY1(temp[8 * 2 + i], temp[8 * 3 + i]);
        BUTTERFLY1(temp[8 * 4 + i], temp[8 * 5 + i]);
        BUTTERFLY1(temp[8 * 6 + i], temp[8 * 7 + i]);

        BUTTERFLY1(temp[8 * 0 + i], temp[8 * 2 + i]);
        BUTTERFLY1(temp[8 * 1 + i], temp[8 * 3 + i]);
        BUTTERFLY1(temp[8 * 4 + i], temp[8 * 6 + i]);
        BUTTERFLY1(temp[8 * 5 + i], temp[8 * 7 + i]);

        sum += BUTTERFLYA(temp[8 * 0 + i], temp[8 * 4 + i]) +
               BUTTERFLYA(temp[8 * 1 + i], temp[8 * 5 + i]) +
               BUTTERFLYA(temp[8 * 2 + i], temp[8 * 6 + i]) +
               BUTTERFLYA(temp[8 * 3 + i], temp[8 * 7 + i]);
    }

    sum -= FFABS(temp[8 * 0] + temp[8 * 4]); // -mean

    return sum;
}
/**
 * H.263 inter dequantization, ARMv5TE path.
 *
 * Computes the quantizer multiplier/offset in C and delegates the whole
 * coefficient loop (including DC) to the assembly helper.
 */
static void dct_unquantize_h263_inter_armv5te(MpegEncContext *s,
                                              int16_t *block, int n, int qscale)
{
    int qmul, qadd, nCoeffs;

    av_assert2(s->block_last_index[n]>=0);

    qmul = qscale << 1;
    qadd = (qscale - 1) | 1;   /* forced odd, as in the scalar code */
    nCoeffs = s->inter_scantable.raster_end[ s->block_last_index[n] ];

    ff_dct_unquantize_h263_armv5te(block, qmul, qadd, nCoeffs + 1);
}
/**
 * Store the value v (nominally in [-1, 1]) as sample `index` of channel
 * `ch` into the buffers `a`, converting to the sample format f.
 * Planar layouts index into the per-channel plane; packed layouts
 * interleave into plane 0.
 */
static void set(uint8_t *a[], int ch, int index, int ch_count,
                enum AVSampleFormat f, double v)
{
    uint8_t *p;

    if (av_sample_fmt_is_planar(f)) {
        f = av_get_alt_sample_fmt(f, 0);   /* scalar equivalent of the planar fmt */
        p = a[ch];
    } else {
        p = a[0];
        index = ch + index * ch_count;     /* interleaved addressing */
    }

    switch (f) {
    case AV_SAMPLE_FMT_U8:
        ((uint8_t *)p)[index] = (v + 1.0) * 255.0 / 2;
        break;
    case AV_SAMPLE_FMT_S16:
        ((int16_t *)p)[index] = v * 32767;
        break;
    case AV_SAMPLE_FMT_S32:
        ((int32_t *)p)[index] = v * 2147483647;
        break;
    case AV_SAMPLE_FMT_FLT:
        ((float *)p)[index] = v;
        break;
    case AV_SAMPLE_FMT_DBL:
        ((double *)p)[index] = v;
        break;
    default:
        av_assert2(0);
    }
}
/**
 * Resample src into dst.
 *
 * The trivial filter (length 1, single phase) is handled inline with
 * pure index stepping in 32.32 fixed point; all other configurations
 * dispatch to the DSP resample_common/resample_linear implementations.
 *
 * @param consumed  receives the number of input samples consumed
 * @return the number of output samples produced
 */
static int swri_resample(ResampleContext *c,
                         uint8_t *dst, const uint8_t *src, int *consumed,
                         int src_size, int dst_size, int update_ctx)
{
    if (c->filter_length == 1 && c->phase_count == 1) {
        /* 64-bit source position in 32.32 fixed point */
        int index= c->index;
        int frac= c->frac;
        int64_t index2= (1LL<<32)*c->frac/c->src_incr + (1LL<<32)*index;
        int64_t incr= (1LL<<32) * c->dst_incr / c->src_incr;
        /* max output samples producible from src_size input samples */
        int new_size = (src_size * (int64_t)c->src_incr - frac + c->dst_incr - 1) / c->dst_incr;

        dst_size= FFMIN(dst_size, new_size);
        c->dsp.resample_one(dst, src, dst_size, index2, incr);

        /* advance integer index and fractional remainder */
        index += dst_size * c->dst_incr_div;
        index += (frac + dst_size * (int64_t)c->dst_incr_mod) / c->src_incr;
        av_assert2(index >= 0);
        *consumed= index;
        if (update_ctx) {
            c->frac   = (frac + dst_size * (int64_t)c->dst_incr_mod) % c->src_incr;
            c->index = 0;
        }
    } else {
        /* bound dst_size so the filter never reads past the input */
        int64_t end_index = (1LL + src_size - c->filter_length) * c->phase_count;
        int64_t delta_frac = (end_index - c->index) * c->src_incr - c->frac;
        int delta_n = (delta_frac + c->dst_incr - 1) / c->dst_incr;

        dst_size = FFMIN(dst_size, delta_n);
        if (dst_size > 0) {
            /* resample_linear and resample_common should have same behavior
             * when frac and dst_incr_mod are zero */
            if (c->linear && (c->frac || c->dst_incr_mod))
                *consumed = c->dsp.resample_linear(c, dst, src, dst_size, update_ctx);
            else
                *consumed = c->dsp.resample_common(c, dst, src, dst_size, update_ctx);
        } else {
            *consumed = 0;
        }
    }

    return dst_size;
}
/**
 * Extract AC-3 exponents from fixed-point mantissas: exp[i] = 23 -
 * floor(log2(|coef[i]|)), clamped to 24.  Coefficients too small to be
 * representable (exponent would exceed 24) are zeroed.
 */
static void ac3_extract_exponents_c(uint8_t *exp, int32_t *coef, int nb_coefs)
{
    int i;

    for (i = 0; i < nb_coefs; i++) {
        int e;
        int v = abs(coef[i]);

        if (!v) {
            e = 24;                 /* silence: maximum exponent */
        } else {
            e = 23 - av_log2(v);
            if (e >= 24) {
                e = 24;
                coef[i] = 0;        /* too small to represent */
            }
            av_assert2(e >= 0);
        }
        exp[i] = e;
    }
}
/**
 * Pick a codebook cell with probability proportional to its utility,
 * by drawing a random value in [1, total utility] and locating it in
 * the cumulative utility array utility_inc[].
 */
static int get_high_utility_cell(elbg_data *elbg)
{
    int i;
    uint64_t r;

    /* when the total fits in an int a single 32-bit draw is enough;
     * otherwise combine two draws into a 64-bit value */
    if (elbg->utility_inc[elbg->numCB-1] < INT_MAX) {
        r = av_lfg_get(elbg->rand_state) % (unsigned int)elbg->utility_inc[elbg->numCB-1] + 1;
    } else {
        r = av_lfg_get(elbg->rand_state);
        r = (av_lfg_get(elbg->rand_state) + (r<<32)) % elbg->utility_inc[elbg->numCB-1] + 1;
    }

    /* Using linear search, do binary if it ever turns to be speed critical */
    for (i = 0; elbg->utility_inc[i] < r; i++)
        ;

    av_assert2(elbg->cells[i]);
    return i;
}
/**
 * Decode one line of an FFV1 slice plane into sample[1], predicting
 * each sample from its neighbours (sample[0] = previous line).
 *
 * Three coding paths: raw bits per sample when slice_coding_mode == 1,
 * range-coded residuals when s->ac is set, otherwise Golomb/VLC coding
 * with a run mode for zero residuals in context 0.
 */
static av_always_inline void decode_line(FFV1Context *s, int w,
                                         int16_t *sample[2],
                                         int plane_index, int bits)
{
    PlaneContext *const p = &s->plane[plane_index];
    RangeCoder *const c   = &s->c;
    int x;
    int run_count = 0;
    int run_mode  = 0;     /* 0 = off, 1 = may start, 2 = counting */
    int run_index = s->run_index;

    if (s->slice_coding_mode == 1) {
        /* raw mode: each sample is `bits` individually range-coded bits */
        int i;
        for (x = 0; x < w; x++) {
            int v = 0;
            for (i=0; i<bits; i++) {
                uint8_t state = 128;
                v += v + get_rac(c, &state);
            }
            sample[1][x] = v;
        }
        return;
    }

    for (x = 0; x < w; x++) {
        int diff, context, sign;

        /* context from the neighbourhood; negative contexts carry the
         * residual's sign flip */
        context = get_context(p, sample[1] + x, sample[0] + x, sample[1] + x);
        if (context < 0) {
            context = -context;
            sign    = 1;
        } else
            sign = 0;

        av_assert2(context < p->context_count);

        if (s->ac) {
            diff = get_symbol_inline(c, p->state[context], 1);
        } else {
            if (context == 0 && run_mode == 0)
                run_mode = 1;

            if (run_mode) {
                if (run_count == 0 && run_mode == 1) {
                    /* one flag bit decides: full run of zeros, or an
                     * explicit (shorter) run length follows */
                    if (get_bits1(&s->gb)) {
                        run_count = 1 << ff_log2_run[run_index];
                        if (x + run_count <= w)
                            run_index++;
                    } else {
                        if (ff_log2_run[run_index])
                            run_count = get_bits(&s->gb, ff_log2_run[run_index]);
                        else
                            run_count = 0;
                        if (run_index)
                            run_index--;
                        run_mode = 2;
                    }
                }
                run_count--;
                if (run_count < 0) {
                    /* run ended: read the terminating nonzero residual */
                    run_mode  = 0;
                    run_count = 0;
                    diff      = get_vlc_symbol(&s->gb, &p->vlc_state[context],
                                               bits);
                    if (diff >= 0)
                        diff++;     /* 0 is reserved for "in run" */
                } else
                    diff = 0;
            } else
                diff = get_vlc_symbol(&s->gb, &p->vlc_state[context], bits);

            ff_dlog(s->avctx, "count:%d index:%d, mode:%d, x:%d pos:%d\n",
                    run_count, run_index, run_mode, x, get_bits_count(&s->gb));
        }

        if (sign)
            diff = -diff;

        sample[1][x] = av_mod_uintp2(predict(sample[1] + x, sample[0] + x) + diff, bits);
    }
    s->run_index = run_index;
}
/**
 * Encode one WMA block: write stereo/channel flags, the total gain,
 * (optionally) the exponents, and the run-length + VLC coded
 * coefficients for every coded channel.
 *
 * Several decoder features (variable block length, noise coding, LSP
 * exponents) are not implemented on the encoder side — see the
 * av_assert0(0) stubs below.
 *
 * @return 0 on success, 1 if no channel was coded,
 *         -1 if a coefficient is out of range for the bitstream
 */
static int encode_block(WMACodecContext *s, float (*src_coefs)[BLOCK_MAX_SIZE],
                        int total_gain)
{
    int v, bsize, ch, coef_nb_bits, parse_exponents;
    float mdct_norm;
    int nb_coefs[MAX_CHANNELS];
    /* encoder always uses a flat exponent curve */
    static const int fixed_exp[25] = {
        20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
        20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20
    };

    // FIXME remove duplication relative to decoder
    if (s->use_variable_block_len) {
        av_assert0(0); // FIXME not implemented
    } else {
        /* fixed block len */
        s->next_block_len_bits = s->frame_len_bits;
        s->prev_block_len_bits = s->frame_len_bits;
        s->block_len_bits      = s->frame_len_bits;
    }

    s->block_len = 1 << s->block_len_bits;
    // av_assert0((s->block_pos + s->block_len) <= s->frame_len);
    bsize = s->frame_len_bits - s->block_len_bits;

    // FIXME factor
    v = s->coefs_end[bsize] - s->coefs_start;
    for (ch = 0; ch < s->avctx->channels; ch++)
        nb_coefs[ch] = v;
    {
        /* MDCT scaling; version 1 uses a sqrt(n4)-normalized transform */
        int n4 = s->block_len / 2;
        mdct_norm = 1.0 / (float) n4;
        if (s->version == 1)
            mdct_norm *= sqrt(n4);
    }

    if (s->avctx->channels == 2)
        put_bits(&s->pb, 1, !!s->ms_stereo);

    for (ch = 0; ch < s->avctx->channels; ch++) {
        // FIXME only set channel_coded when needed, instead of always
        s->channel_coded[ch] = 1;
        if (s->channel_coded[ch])
            init_exp(s, ch, fixed_exp);
    }

    /* quantize: coefs1[i] = coef / (exponent * gain scale) */
    for (ch = 0; ch < s->avctx->channels; ch++) {
        if (s->channel_coded[ch]) {
            WMACoef *coefs1;
            float *coefs, *exponents, mult;
            int i, n;

            coefs1    = s->coefs1[ch];
            exponents = s->exponents[ch];
            mult      = pow(10, total_gain * 0.05) / s->max_exponent[ch];
            mult     *= mdct_norm;
            coefs     = src_coefs[ch];
            if (s->use_noise_coding && 0) {
                av_assert0(0); // FIXME not implemented
            } else {
                coefs += s->coefs_start;
                n      = nb_coefs[ch];
                for (i = 0; i < n; i++) {
                    double t = *coefs++ / (exponents[i] * mult);
                    if (t < -32768 || t > 32767)
                        return -1;      /* doesn't fit in 16 bits */
                    coefs1[i] = lrint(t);
                }
            }
        }
    }

    /* per-channel "coded" flags */
    v = 0;
    for (ch = 0; ch < s->avctx->channels; ch++) {
        int a = s->channel_coded[ch];
        put_bits(&s->pb, 1, a);
        v |= a;
    }

    if (!v)
        return 1;

    /* total gain, written as a sequence of 127s plus remainder */
    for (v = total_gain - 1; v >= 127; v -= 127)
        put_bits(&s->pb, 7, 127);
    put_bits(&s->pb, 7, v);

    coef_nb_bits = ff_wma_total_gain_to_bits(total_gain);

    if (s->use_noise_coding) {
        /* high bands are always signalled as not coded */
        for (ch = 0; ch < s->avctx->channels; ch++) {
            if (s->channel_coded[ch]) {
                int i, n;
                n = s->exponent_high_sizes[bsize];
                for (i = 0; i < n; i++) {
                    put_bits(&s->pb, 1, s->high_band_coded[ch][i] = 0);
                    if (0)
                        nb_coefs[ch] -= s->exponent_high_bands[bsize][i];
                }
            }
        }
    }

    parse_exponents = 1;
    if (s->block_len_bits != s->frame_len_bits)
        put_bits(&s->pb, 1, parse_exponents);

    if (parse_exponents) {
        for (ch = 0; ch < s->avctx->channels; ch++) {
            if (s->channel_coded[ch]) {
                if (s->use_exp_vlc) {
                    encode_exp_vlc(s, ch, fixed_exp);
                } else {
                    av_assert0(0); // FIXME not implemented
                    // encode_exp_lsp(s, ch);
                }
            }
        }
    } else
        av_assert0(0); // FIXME not implemented

    /* coefficients: (zero-run, level) pairs coded with the VLC table */
    for (ch = 0; ch < s->avctx->channels; ch++) {
        if (s->channel_coded[ch]) {
            int run, tindex;
            WMACoef *ptr, *eptr;
            tindex = (ch == 1 && s->ms_stereo);
            ptr    = &s->coefs1[ch][0];
            eptr   = ptr + nb_coefs[ch];

            run = 0;
            for (; ptr < eptr; ptr++) {
                if (*ptr) {
                    int level     = *ptr;
                    int abs_level = FFABS(level);
                    int code      = 0;
                    if (abs_level <= s->coef_vlcs[tindex]->max_level)
                        if (run < s->coef_vlcs[tindex]->levels[abs_level - 1])
                            code = run + s->int_table[tindex][abs_level - 1];

                    av_assert2(code < s->coef_vlcs[tindex]->n);
                    put_bits(&s->pb, s->coef_vlcs[tindex]->huffbits[code],
                             s->coef_vlcs[tindex]->huffcodes[code]);

                    if (code == 0) {
                        /* escape: explicit level and run */
                        if (1 << coef_nb_bits <= abs_level)
                            return -1;
                        put_bits(&s->pb, coef_nb_bits, abs_level);
                        put_bits(&s->pb, s->frame_len_bits, run);
                    }
                    // FIXME the sign is flipped somewhere
                    put_bits(&s->pb, 1, level < 0);
                    run = 0;
                } else
                    run++;
            }
            if (run)
                put_bits(&s->pb, s->coef_vlcs[tindex]->huffbits[1],
                         s->coef_vlcs[tindex]->huffcodes[1]);
        }
        if (s->version == 1 && s->avctx->channels >= 2)
            avpriv_align_put_bits(&s->pb);
    }
    return 0;
}
/**
 * MPEG-2 inter-block dequantization, MMX version.
 *
 * In-place computes level' = sign(level) * ((2*|level| + 1) * matrix *
 * qscale) >> 5 for each nonzero coefficient, accumulating XOR parity
 * in mm7 and applying the MPEG-2 mismatch control by flipping the LSB
 * of the last coefficient (offset 124 = coefficient 62's pair word).
 */
static void dct_unquantize_mpeg2_inter_mmx(MpegEncContext *s,
                                           int16_t *block, int n, int qscale)
{
    x86_reg nCoeffs;
    const uint16_t *quant_matrix;

    av_assert2(s->block_last_index[n]>=0);

    if (s->q_scale_type) qscale = ff_mpeg2_non_linear_qscale[qscale];
    else qscale <<= 1;

    if(s->alternate_scan) nCoeffs= 63; //FIXME
    else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ];

    quant_matrix = s->inter_matrix;
    __asm__ volatile(
        "pcmpeqw %%mm7, %%mm7 \n\t"
        "psrlq $48, %%mm7 \n\t"   /* mm7 = parity accumulator seed */
        "movd %2, %%mm6 \n\t"
        "packssdw %%mm6, %%mm6 \n\t"
        "packssdw %%mm6, %%mm6 \n\t"
        "mov %3, %%"REG_a" \n\t"
        ".p2align 4 \n\t"
        "1: \n\t"
        "movq (%0, %%"REG_a"), %%mm0 \n\t"
        "movq 8(%0, %%"REG_a"), %%mm1 \n\t"
        "movq (%1, %%"REG_a"), %%mm4 \n\t"
        "movq 8(%1, %%"REG_a"), %%mm5 \n\t"
        "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i]
        "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i]
        "pxor %%mm2, %%mm2 \n\t"
        "pxor %%mm3, %%mm3 \n\t"
        "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0
        "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0
        "pxor %%mm2, %%mm0 \n\t"
        "pxor %%mm3, %%mm1 \n\t"
        "psubw %%mm2, %%mm0 \n\t" // abs(block[i])
        "psubw %%mm3, %%mm1 \n\t" // abs(block[i])
        "paddw %%mm0, %%mm0 \n\t" // abs(block[i])*2
        "paddw %%mm1, %%mm1 \n\t" // abs(block[i])*2
        "pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*2*q
        "pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*2*q
        "paddw %%mm4, %%mm0 \n\t" // (abs(block[i])*2 + 1)*q
        "paddw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 + 1)*q
        "pxor %%mm4, %%mm4 \n\t"
        "pxor %%mm5, %%mm5 \n\t" // FIXME slow
        "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
        "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
        "psrlw $5, %%mm0 \n\t"
        "psrlw $5, %%mm1 \n\t"
        "pxor %%mm2, %%mm0 \n\t"
        "pxor %%mm3, %%mm1 \n\t"
        "psubw %%mm2, %%mm0 \n\t"  /* restore sign */
        "psubw %%mm3, %%mm1 \n\t"
        "pandn %%mm0, %%mm4 \n\t"  /* keep zeros as zero */
        "pandn %%mm1, %%mm5 \n\t"
        "pxor %%mm4, %%mm7 \n\t"   /* fold results into parity */
        "pxor %%mm5, %%mm7 \n\t"
        "movq %%mm4, (%0, %%"REG_a") \n\t"
        "movq %%mm5, 8(%0, %%"REG_a") \n\t"
        "add $16, %%"REG_a" \n\t"
        "jng 1b \n\t"
        /* mismatch control: reduce mm7 to one parity bit, XOR into
         * the word at byte offset 124 from the block end pointer */
        "movd 124(%0, %3), %%mm0 \n\t"
        "movq %%mm7, %%mm6 \n\t"
        "psrlq $32, %%mm7 \n\t"
        "pxor %%mm6, %%mm7 \n\t"
        "movq %%mm7, %%mm6 \n\t"
        "psrlq $16, %%mm7 \n\t"
        "pxor %%mm6, %%mm7 \n\t"
        "pslld $31, %%mm7 \n\t"
        "psrlq $15, %%mm7 \n\t"
        "pxor %%mm7, %%mm0 \n\t"
        "movd %%mm0, 124(%0, %3) \n\t"

        ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "r" (-2*nCoeffs)
        : "%"REG_a, "memory"
    );
}
/**
 * MPEG-2 intra-block dequantization, MMX version.
 *
 * In-place computes level' = sign(level) * (|level| * matrix * qscale)
 * >> 4 for nonzero AC coefficients; the DC coefficient is rescaled in C
 * (y/c DC scale) and written back after the loop.
 */
static void dct_unquantize_mpeg2_intra_mmx(MpegEncContext *s,
                                           int16_t *block, int n, int qscale)
{
    x86_reg nCoeffs;
    const uint16_t *quant_matrix;
    int block0;

    av_assert2(s->block_last_index[n]>=0);

    if (s->q_scale_type) qscale = ff_mpeg2_non_linear_qscale[qscale];
    else qscale <<= 1;

    if(s->alternate_scan) nCoeffs= 63; //FIXME
    else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ];

    if (n < 4)
        block0 = block[0] * s->y_dc_scale;
    else
        block0 = block[0] * s->c_dc_scale;
    quant_matrix = s->intra_matrix;
    __asm__ volatile(
        "pcmpeqw %%mm7, %%mm7 \n\t"
        "psrlw $15, %%mm7 \n\t"    /* mm7 = 0x0001 x4 (unused constant here) */
        "movd %2, %%mm6 \n\t"
        "packssdw %%mm6, %%mm6 \n\t"
        "packssdw %%mm6, %%mm6 \n\t"
        "mov %3, %%"REG_a" \n\t"
        ".p2align 4 \n\t"
        "1: \n\t"
        "movq (%0, %%"REG_a"), %%mm0 \n\t"
        "movq 8(%0, %%"REG_a"), %%mm1 \n\t"
        "movq (%1, %%"REG_a"), %%mm4 \n\t"
        "movq 8(%1, %%"REG_a"), %%mm5 \n\t"
        "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i]
        "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i]
        "pxor %%mm2, %%mm2 \n\t"
        "pxor %%mm3, %%mm3 \n\t"
        "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0
        "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0
        "pxor %%mm2, %%mm0 \n\t"
        "pxor %%mm3, %%mm1 \n\t"
        "psubw %%mm2, %%mm0 \n\t" // abs(block[i])
        "psubw %%mm3, %%mm1 \n\t" // abs(block[i])
        "pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*q
        "pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*q
        "pxor %%mm4, %%mm4 \n\t"
        "pxor %%mm5, %%mm5 \n\t" // FIXME slow
        "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
        "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
        "psraw $4, %%mm0 \n\t"
        "psraw $4, %%mm1 \n\t"
        "pxor %%mm2, %%mm0 \n\t"
        "pxor %%mm3, %%mm1 \n\t"
        "psubw %%mm2, %%mm0 \n\t"  /* restore sign */
        "psubw %%mm3, %%mm1 \n\t"
        "pandn %%mm0, %%mm4 \n\t"  /* keep zeros as zero */
        "pandn %%mm1, %%mm5 \n\t"
        "movq %%mm4, (%0, %%"REG_a") \n\t"
        "movq %%mm5, 8(%0, %%"REG_a") \n\t"

        "add $16, %%"REG_a" \n\t"
        "jng 1b \n\t"
        ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "g" (-2*nCoeffs)
        : "%"REG_a, "memory"
    );
    block[0]= block0;
    //Note, we do not do mismatch control for intra as errors cannot accumulate
}
/**
 * MPEG-1 inter-block dequantization, MMX version.
 *
 * In-place computes level' = sign(level) * (((2*|level| + 1) * matrix *
 * qscale) >> 4 rounded toward zero and forced odd) for each nonzero
 * coefficient (the psubw/por with mm7 implements the "subtract 1 and
 * force odd" oddification of MPEG-1).
 */
static void dct_unquantize_mpeg1_inter_mmx(MpegEncContext *s,
                                           int16_t *block, int n, int qscale)
{
    x86_reg nCoeffs;
    const uint16_t *quant_matrix;

    av_assert2(s->block_last_index[n]>=0);

    nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1;

    quant_matrix = s->inter_matrix;
    __asm__ volatile(
        "pcmpeqw %%mm7, %%mm7 \n\t"
        "psrlw $15, %%mm7 \n\t"    /* mm7 = 0x0001 replicated: the +1 / odd mask */
        "movd %2, %%mm6 \n\t"
        "packssdw %%mm6, %%mm6 \n\t"
        "packssdw %%mm6, %%mm6 \n\t"
        "mov %3, %%"REG_a" \n\t"
        ".p2align 4 \n\t"
        "1: \n\t"
        "movq (%0, %%"REG_a"), %%mm0 \n\t"
        "movq 8(%0, %%"REG_a"), %%mm1 \n\t"
        "movq (%1, %%"REG_a"), %%mm4 \n\t"
        "movq 8(%1, %%"REG_a"), %%mm5 \n\t"
        "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i]
        "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i]
        "pxor %%mm2, %%mm2 \n\t"
        "pxor %%mm3, %%mm3 \n\t"
        "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0
        "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0
        "pxor %%mm2, %%mm0 \n\t"
        "pxor %%mm3, %%mm1 \n\t"
        "psubw %%mm2, %%mm0 \n\t" // abs(block[i])
        "psubw %%mm3, %%mm1 \n\t" // abs(block[i])
        "paddw %%mm0, %%mm0 \n\t" // abs(block[i])*2
        "paddw %%mm1, %%mm1 \n\t" // abs(block[i])*2
        "paddw %%mm7, %%mm0 \n\t" // abs(block[i])*2 + 1
        "paddw %%mm7, %%mm1 \n\t" // abs(block[i])*2 + 1
        "pmullw %%mm4, %%mm0 \n\t" // (abs(block[i])*2 + 1)*q
        "pmullw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 + 1)*q
        "pxor %%mm4, %%mm4 \n\t"
        "pxor %%mm5, %%mm5 \n\t" // FIXME slow
        "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
        "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
        "psraw $4, %%mm0 \n\t"
        "psraw $4, %%mm1 \n\t"
        "psubw %%mm7, %%mm0 \n\t"  /* -1 ... */
        "psubw %%mm7, %%mm1 \n\t"
        "por %%mm7, %%mm0 \n\t"    /* ... then force odd */
        "por %%mm7, %%mm1 \n\t"
        "pxor %%mm2, %%mm0 \n\t"
        "pxor %%mm3, %%mm1 \n\t"
        "psubw %%mm2, %%mm0 \n\t"  /* restore sign */
        "psubw %%mm3, %%mm1 \n\t"
        "pandn %%mm0, %%mm4 \n\t"  /* keep zeros as zero */
        "pandn %%mm1, %%mm5 \n\t"
        "movq %%mm4, (%0, %%"REG_a") \n\t"
        "movq %%mm5, 8(%0, %%"REG_a") \n\t"

        "add $16, %%"REG_a" \n\t"
        "js 1b \n\t"
        ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "g" (-2*nCoeffs)
        : "%"REG_a, "memory"
    );
}