/** Encodes a single vector of integers (e.g., a partition within a
 *  coefficient block) using PVQ
 *
 * @param [in,out] ec          range encoder
 * @param [in]     qg          quantized gain
 * @param [in]     theta       quantized post-prediction theta
 * @param [in]     max_theta   maximum possible quantized theta value
 * @param [in]     in          coefficient vector to code
 * @param [in]     n           number of coefficients in partition
 * @param [in]     k           number of pulses in partition
 * @param [in,out] model       entropy encoder state
 * @param [in,out] adapt       adaptation context
 * @param [in,out] exg         ExQ16 expectation of gain value
 * @param [in,out] ext         ExQ16 expectation of theta value
 * @param [in]     nodesync    do not use information that depends on the
 *                             reference
 * @param [in]     cdf_ctx     selects which cdf context to use
 * @param [in]     is_keyframe whether we're encoding a keyframe
 * @param [in]     code_skip   whether the "skip rest" flag is allowed
 * @param [in]     skip_rest   when set, we skip all higher bands
 * @param [in]     encode_flip whether we need to encode the CfL flip flag now
 * @param [in]     flip        value of the CfL flip flag
 */
static void pvq_encode_partition(od_ec_enc *ec,
 int qg,
 int theta,
 int max_theta,
 const od_coeff *in,
 int n,
 int k,
 generic_encoder model[3],
 od_adapt_ctx *adapt,
 int *exg,
 int *ext,
 int nodesync,
 int cdf_ctx,
 int is_keyframe,
 int code_skip,
 int skip_rest,
 int encode_flip,
 int flip) {
  int noref;
  int id;
  noref = (theta == -1);
  id = (qg > 0) + 2*OD_MINI(theta + 1, 3) + 8*code_skip*skip_rest;
  if (is_keyframe) {
    OD_ASSERT(id != 8);
    if (id >= 8) id--;
  }
  else {
    OD_ASSERT(id != 10);
    if (id >= 10) id--;
  }
  /* Jointly code gain, theta and noref for small values. Then we handle
     larger gain and theta values. For noref, theta = -1. */
  od_encode_cdf_adapt(ec, id, &adapt->pvq.pvq_gaintheta_cdf[cdf_ctx][0],
   8 + 7*code_skip, adapt->pvq.pvq_gaintheta_increment);
  if (encode_flip) {
    /* We could eventually do some smarter entropy coding here, but it would
       have to be good enough to overcome the overhead of the entropy coder.
       An early attempt using a "toggle" flag with simple adaptation wasn't
       worth the trouble. */
    od_ec_enc_bits(ec, flip, 1);
  }
  if (qg > 0) {
    int tmp;
    tmp = *exg;
    generic_encode(ec, &model[!noref], qg - 1, -1, &tmp, 2);
    OD_IIR_DIADIC(*exg, qg << 16, 2);
  }
  if (theta > 1 && (nodesync || max_theta > 3)) {
    int tmp;
    tmp = *ext;
    generic_encode(ec, &model[2], theta - 2, nodesync ? -1 : max_theta - 3,
     &tmp, 2);
    OD_IIR_DIADIC(*ext, theta << 16, 2);
  }
  od_encode_pvq_codeword(ec, &adapt->pvq.pvq_codeword_ctx, in,
   n - (theta != -1), k);
}
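/* Illustration (not part of the codec): a minimal sketch of how the joint
   symbol coded above packs its three components, assuming the same layout as
   the encoder's id computation. A decoder-side unpack would first undo the
   id-- shift applied for the impossible ids (8 on keyframes, 10 on inter
   frames) before splitting the fields. The helper name is hypothetical. */
static void pvq_id_unpack_sketch(int id, int *has_gain, int *theta_bucket,
 int *skip_rest) {
  /* Bit 0: whether qg > 0. */
  *has_gain = id & 1;
  /* Bits 1-2: OD_MINI(theta + 1, 3), i.e. theta of -1 (noref), 0, 1,
     or ">= 2". */
  *theta_bucket = (id >> 1) & 3;
  /* Bit 3: the "skip rest" flag (only coded when code_skip is set). */
  *skip_rest = id >> 3;
}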
int oggbyte_readcopy(oggbyte_buffer *_b, void *_dest, uint32_t _bytes) {
  ptrdiff_t endbyte;
  endbyte = _b->ptr - _b->buf;
  OD_ASSERT(endbyte >= 0);
  OD_ASSERT(endbyte <= _b->storage);
  if ((size_t)(_b->storage - endbyte) < _bytes) return -1;
  memcpy(_dest, _b->ptr, _bytes);
  _b->ptr += _bytes;
  return 0;
}
void image_data_pred_block(image_data *_this,int _bi,int _bj){
  int b_sz;
  double *pred;
  int mode;
  od_coeff coeffs[5*B_SZ_MAX*B_SZ_MAX];
  b_sz=1<<_this->b_sz_log;
  pred=&_this->pred[_this->pred_stride*b_sz*_bj+b_sz*_bi];
#if MASK_BLOCKS
  if(!_this->mask[_this->nxblocks*_bj+_bi]){
    od_coeff *fdct;
    int j;
    int i;
    fdct=&_this->fdct[_this->fdct_stride*b_sz*(_bj+1)+b_sz*(_bi+1)];
    for(j=0;j<b_sz;j++){
      for(i=0;i<b_sz;i++){
        pred[_this->pred_stride*j+i]=fdct[_this->fdct_stride*j+i];
      }
    }
    return;
  }
#endif
  mode=_this->mode[_this->nxblocks*_bj+_bi];
  image_data_load_block(_this,_bi,_bj,coeffs);
  OD_ASSERT(_this->b_sz_log>=OD_LOG_BSIZE0&&_this->b_sz_log<=B_SZ_LOG_MAX);
  (*NE_INTRA_MULT[_this->b_sz_log-OD_LOG_BSIZE0])(pred,_this->pred_stride,
   coeffs,mode);
}
void oggbyte_readinit(oggbyte_buffer *_b, unsigned char *_buf,
 ptrdiff_t _bytes) {
  OD_ASSERT(_bytes >= 0);
  OD_CLEAR(_b, 1);
  _b->buf = _b->ptr = _buf;
  _b->storage = _bytes;
}
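/* Illustration (not part of the library): a minimal sketch of driving the two
   oggbyte read calls above together. The helper name and the idea of checking
   a 5-byte "daala" magic are made up for the example (the magic string itself
   matches what the header writer below emits). */
static int read_magic_sketch(unsigned char *buf, ptrdiff_t len) {
  oggbyte_buffer ob;
  char magic[5];
  oggbyte_readinit(&ob, buf, len);
  /* oggbyte_readcopy() returns -1 if fewer than 5 bytes remain. */
  if (oggbyte_readcopy(&ob, magic, sizeof(magic)) < 0) return -1;
  return memcmp(magic, "daala", sizeof(magic)) == 0;
}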
static const char *od_log_level_name(od_log_level level) {
  /* Check for invalid input */
  OD_ASSERT(level < OD_LOG_LEVEL_MAX);
  if (level >= OD_LOG_LEVEL_MAX) return "INVALID";
  return od_log_level_names[level];
}
static const char *od_log_facility_name(od_log_facility fac) {
  /* Check for invalid input */
  OD_ASSERT(fac < OD_LOG_FACILITY_MAX);
  if (fac >= OD_LOG_FACILITY_MAX) return "INVALID";
  return od_log_module_names[fac];
}
/** Synthesizes one partition of coefficient values from a PVQ-encoded
 *  vector. This 'partial' version is called by the encode loop where
 *  the Householder reflection has already been computed and there's no
 *  need to recompute it.
 *
 * @param [out] xcoeff output coefficient partition (x in math doc)
 * @param [in]  ypulse PVQ-encoded values (y in the math doc); in
 *                     the noref case, this vector has n entries,
 *                     in the reference case it contains n-1 entries
 *                     (the m-th entry is not included)
 * @param [in]  r      reference vector (prediction)
 * @param [in]  n      number of elements in this partition
 * @param [in]  noref  indicates presence or lack of prediction
 * @param [in]  g      decoded quantized vector gain
 * @param [in]  theta  decoded theta (prediction error)
 * @param [in]  m      alignment dimension of Householder reflection
 * @param [in]  s      sign of Householder reflection
 * @param [in]  qm_inv inverse of the QM with magnitude compensation
 */
void od_pvq_synthesis_partial(od_coeff *xcoeff, const od_coeff *ypulse,
 const double *r, int n, int noref, double g, double theta, int m, int s,
 const int16_t *qm_inv) {
  int i;
  int yy;
  double scale;
  int nn;
  OD_ASSERT(g != 0);
  nn = n - (!noref); /* when noref == 0, vector in is sized n - 1 */
  yy = 0;
  for (i = 0; i < nn; i++)
    yy += ypulse[i]*(int32_t)ypulse[i];
  if (yy == 0) scale = 0;
  else scale = g/sqrt(yy);
  if (noref) {
    for (i = 0; i < n; i++) {
      xcoeff[i] = (od_coeff)floor(.5
       + (ypulse[i]*scale)*(qm_inv[i]*OD_QM_INV_SCALE_1));
    }
  }
  else {
    double x[MAXN];
    scale *= sin(theta);
    for (i = 0; i < m; i++)
      x[i] = ypulse[i]*scale;
    x[m] = -s*g*cos(theta);
    for (i = m; i < nn; i++)
      x[i + 1] = ypulse[i]*scale;
    od_apply_householder(x, r, n);
    for (i = 0; i < n; i++) {
      xcoeff[i] = (od_coeff)floor(.5 + (x[i]*(qm_inv[i]*OD_QM_INV_SCALE_1)));
    }
  }
}
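/* Illustration (not part of the codec): in the noref branch above, before the
   inverse QM is applied, each output is simply the pulse vector renormalized
   to the decoded gain, i.e. x[i] = g*y[i]/sqrt(sum_j y[j]^2). A standalone
   restatement of just that step, with a hypothetical name: */
static void pvq_synth_noref_sketch(double *x, const int *y, int n, double g) {
  double yy;
  int i;
  yy = 0;
  for (i = 0; i < n; i++) yy += (double)y[i]*y[i];
  /* Scale the unit-energy direction y/||y|| up to gain g (all zeros if the
     pulse vector is empty). */
  for (i = 0; i < n; i++) x[i] = yy > 0 ? g*y[i]/sqrt(yy) : 0;
}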
/** Adapts a Q15 cdf after encoding/decoding a symbol. */
void od_cdf_adapt_q15(int val, uint16_t *cdf, int n, int *count, int rate) {
  int i;
  *count = OD_MINI(*count + 1, 1 << rate);
  OD_ASSERT(cdf[n - 1] == 32768);
  if (*count >= 1 << rate) {
    /* Steady-state adaptation based on a simple IIR with dyadic rate. */
    for (i = 0; i < n; i++) {
      int tmp;
      /* When (i < val), we want the adjustment ((cdf[i] - tmp) >> rate) to be
         positive so long as (cdf[i] > i + 1), and 0 when (cdf[i] == i + 1),
         to ensure we don't drive any probabilities to 0.
         Replacing cdf[i] with (i + 2) and solving
         ((i + 2 - tmp) >> rate == 1) for tmp produces
         tmp == i + 2 - (1 << rate).
         Using this value of tmp with cdf[i] == i + 1 instead gives an
         adjustment of 0 as desired.
         When (i >= val), we want ((cdf[i] - tmp) >> rate) to be negative so
         long as cdf[i] < 32768 - (n - 1 - i), and 0 when
         cdf[i] == 32768 - (n - 1 - i), again to ensure we don't drive any
         probabilities to 0.
         Since right-shifting any negative value is still negative, we can
         solve (32768 - (n - 1 - i) - tmp == 0) for tmp, producing
         tmp = 32769 - n + i.
         Using this value of tmp with smaller values of cdf[i] instead gives
         negative adjustments, as desired.
         Combining the two cases gives the expression below.
         These could be stored in a lookup table indexed by n and rate to
         avoid the arithmetic. */
      tmp = 2 - (1 << rate) + i + (32767 + (1 << rate) - n)*(i >= val);
      cdf[i] -= (cdf[i] - tmp) >> rate;
    }
  }
  else {
/*Computes the SATD using the actual DCT, instead of the Hadamard transform.
  This is useful for comparing SATD numbers to the frequency domain techniques
  without having to worry about scaling.*/
static unsigned od_satd(const unsigned char *_src,int _src_stride,
 const unsigned char *_ref,int _ref_stride){
  od_coeff buf[B_SZ*B_SZ];
  unsigned satd;
  int i;
  int j;
  OD_ASSERT(B_SZ==4);
  for(i=0;i<B_SZ;i++){
    for(j=0;j<B_SZ;j++){
      buf[B_SZ*i+j]=_src[j]-_ref[j];
    }
    _src+=_src_stride;
    _ref+=_ref_stride;
  }
#if B_SZ_LOG>=OD_LOG_BSIZE0&&B_SZ_LOG<OD_LOG_BSIZE0+OD_NBSIZES
  (*OD_FDCT_2D[B_SZ_LOG-OD_LOG_BSIZE0])(buf,B_SZ,buf,B_SZ);
#else
# error "Need an fDCT implementation for this block size."
#endif
  satd=0;
  for(i=0;i<B_SZ;i++){
    for(j=0;j<B_SZ;j++)satd+=abs(buf[B_SZ*i+j]);
  }
  return satd;
}
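/*Illustration (not part of the tools): the conventional SATD that the comment
  above contrasts against uses a Hadamard transform instead of the DCT. A
  minimal 4x4 Hadamard SATD for comparison, under the same argument convention
  as od_satd(); the helper name is hypothetical.*/
static unsigned od_satd_hadamard_sketch(const unsigned char *_src,
 int _src_stride,const unsigned char *_ref,int _ref_stride){
  int d[4][4];
  unsigned satd;
  int i;
  int j;
  for(i=0;i<4;i++){
    for(j=0;j<4;j++)d[i][j]=_src[_src_stride*i+j]-_ref[_ref_stride*i+j];
  }
  /*Order-4 Hadamard butterflies along rows, then columns.*/
  for(i=0;i<4;i++){
    int a;
    int b;
    int c;
    int e;
    a=d[i][0]+d[i][1];
    b=d[i][0]-d[i][1];
    c=d[i][2]+d[i][3];
    e=d[i][2]-d[i][3];
    d[i][0]=a+c;
    d[i][1]=b+e;
    d[i][2]=a-c;
    d[i][3]=b-e;
  }
  for(j=0;j<4;j++){
    int a;
    int b;
    int c;
    int e;
    a=d[0][j]+d[1][j];
    b=d[0][j]-d[1][j];
    c=d[2][j]+d[3][j];
    e=d[2][j]-d[3][j];
    d[0][j]=a+c;
    d[1][j]=b+e;
    d[2][j]=a-c;
    d[3][j]=b-e;
  }
  /*Sum of absolute transformed differences (unnormalized).*/
  satd=0;
  for(i=0;i<4;i++){
    for(j=0;j<4;j++)satd+=abs(d[i][j]);
  }
  return satd;
}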
void od_copy_nxm(unsigned char *_dst, int _dstride,
 const unsigned char *_src, int _sstride, int _log_n, int _log_m) {
  int j;
  OD_ASSERT(_log_n != _log_m);
  for (j = 0; j < 1 << _log_m; j++) {
    OD_COPY(_dst, _src, 1 << _log_n);
    _dst += _dstride;
    _src += _sstride;
  }
}
int od_logging_active_impl(od_log_facility fac, od_log_level level) {
  /* Check for invalid input */
  OD_ASSERT(fac < OD_LOG_FACILITY_MAX);
  if (fac >= OD_LOG_FACILITY_MAX) return 0;
  if (od_log_levels[fac] < level) return 0; /* Skipped */
  return 1;
}
/*Takes updated low and range values, renormalizes them so that
   32768 <= rng < 65536 (flushing bytes from low to the pre-carry buffer if
   necessary), and stores them back in the encoder context.
  low: The new value of low.
  rng: The new value of the range.*/
static void od_ec_enc_normalize(od_ec_enc *enc, od_ec_window low,
 unsigned rng) {
  int d;
  int c;
  int s;
  c = enc->cnt;
  OD_ASSERT(rng <= 65535U);
  d = 16 - OD_ILOG_NZ(rng);
  s = c + d;
  /*TODO: Right now we flush every time we have at least one byte available.
    Instead we should use an od_ec_window and flush right before we're about
    to shift bits off the end of the window.
    For a 32-bit window this is about the same amount of work, but for a
    64-bit window it should be a fair win.*/
  if (s >= 0) {
    ogg_uint16_t *buf;
    ogg_uint32_t storage;
    ogg_uint32_t offs;
    unsigned m;
    buf = enc->precarry_buf;
    storage = enc->precarry_storage;
    offs = enc->offs;
    if (offs + 2 > storage) {
      storage = 2*storage + 2;
      buf = (ogg_uint16_t *)_ogg_realloc(buf, sizeof(*buf)*storage);
      if (buf == NULL) {
        enc->error = -1;
        enc->offs = 0;
        return;
      }
      enc->precarry_buf = buf;
      enc->precarry_storage = storage;
    }
    c += 16;
    m = (1 << c) - 1;
    if (s >= 8) {
      OD_ASSERT(offs < storage);
      buf[offs++] = (ogg_uint16_t)(low >> c);
      low &= m;
      c -= 8;
      m >>= 8;
    }
static void od_pre_blocks(od_coeff *_out,int _out_stride,od_coeff *_in,
 int _in_stride,int _bx,int _by,int _b_sz_log){
  int b_sz;
  int by;
  int bx;
  int j;
  int i;
  b_sz=1<<_b_sz_log;
  for(by=0;by<_by;by++){
    int y;
    y=by<<_b_sz_log;
    for(bx=0;bx<_bx;bx++){
      int x;
      x=bx<<_b_sz_log;
      for(i=0;i<b_sz;i++){
        od_coeff col[B_SZ_MAX];
#if APPLY_FILTER
        for(j=0;j<b_sz;j++){
          col[j]=_in[_in_stride*(y+j)+x+i];
        }
        OD_ASSERT(_b_sz_log>=OD_LOG_BSIZE0&&_b_sz_log<=B_SZ_LOG_MAX);
        (*NE_PRE_FILTER[_b_sz_log-OD_LOG_BSIZE0])(col,col);
        for(j=0;j<b_sz;j++){
          _out[_out_stride*(y+j)+x+i]=col[j];
        }
#else
        for(j=0;j<b_sz;j++){
          _out[_out_stride*(y+j)+x+i]=_in[_in_stride*(y+j)+x+i];
        }
#endif
      }
#if APPLY_FILTER
      for(j=0;j<b_sz;j++){
        od_coeff *row;
        row=&_out[_out_stride*(y+j)+x];
        OD_ASSERT(_b_sz_log>=OD_LOG_BSIZE0&&_b_sz_log<=B_SZ_LOG_MAX);
        (*NE_PRE_FILTER[_b_sz_log-OD_LOG_BSIZE0])(row,row);
      }
#endif
    }
  }
}
void od_encode_quantizer_scaling(daala_enc_ctx *enc, int q_scaling,
 int sbx, int sby, int skip) {
  int nhsb;
  OD_ASSERT(skip == !!skip);
  nhsb = enc->state.nhsb;
  OD_ASSERT(sbx < nhsb);
  OD_ASSERT(sby < enc->state.nvsb);
  OD_ASSERT(!skip || q_scaling == 0);
  enc->state.sb_q_scaling[sby*nhsb + sbx] = q_scaling;
  if (!skip) {
    int above;
    int left;
    /* use value from neighbour if possible, otherwise use 0 */
    above = sby > 0 ?
     enc->state.sb_q_scaling[(sby - 1)*enc->state.nhsb + sbx] : 0;
    left = sbx > 0 ?
     enc->state.sb_q_scaling[sby*enc->state.nhsb + (sbx - 1)] : 0;
    od_encode_cdf_adapt(&enc->ec, q_scaling,
     enc->state.adapt.q_cdf[above + left*4], 4,
     enc->state.adapt.q_increment);
  }
}
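/* Illustration (not part of the codec): the call above selects one of 16
   adaptive cdfs from the two causal neighbours, each a q_scaling value in
   [0, 3]. A standalone restatement of that context computation, with a
   hypothetical name: */
static int q_scaling_ctx_sketch(int above, int left) {
  OD_ASSERT(above >= 0 && above <= 3);
  OD_ASSERT(left >= 0 && left <= 3);
  /* 16 contexts: the above neighbour occupies the low two bits, the left
     neighbour the next two. */
  return above + left*4;
}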
static void od_idct_blocks(od_coeff *_out,int _out_stride,od_coeff *_in,
 int _in_stride,int _bx,int _by,int _b_sz_log){
  int bx;
  int by;
  for(by=0;by<_by;by++){
    int y;
    y=by<<_b_sz_log;
    for(bx=0;bx<_bx;bx++){
      int x;
      x=bx<<_b_sz_log;
      OD_ASSERT(_b_sz_log>=OD_LOG_BSIZE0&&_b_sz_log<=B_SZ_LOG_MAX);
      (*OD_IDCT_2D[_b_sz_log-OD_LOG_BSIZE0])(&_out[_out_stride*y+x],
       _out_stride,&_in[_in_stride*y+x],_in_stride);
    }
  }
}
/*Perform SATD on 8x8 blocks within src and ref then sum the results of
  each one.*/
OD_SIMD_INLINE int32_t od_mc_compute_sum_8x8_satd8(int ln,
 const unsigned char *src, int systride,
 const unsigned char *ref, int rystride) {
  int n;
  int i;
  int j;
  int32_t satd;
  n = 1 << ln;
  OD_ASSERT(n >= 8);
  satd = 0;
  for (i = 0; i < n; i += 8) {
    for (j = 0; j < n; j += 8) {
      satd += od_mc_compute_satd8_8x8_part(src + i*systride + j, systride,
       ref + i*rystride + j, rystride);
    }
  }
  return satd;
}
/* There is a bug in get_intra_dims where chroma blocks do not line up
   under luma blocks. */
static void cfl_get_intra_dims(const video_input_info *_info,int _pli,
 int _padding,int *_x0,int *_y0,int *_nxblocks,int *_nyblocks){
  int xshift;
  int yshift;
  int padding;
  xshift=_pli!=0&&!(_info->pixel_fmt&1);
  yshift=_pli!=0&&!(_info->pixel_fmt&2);
  OD_ASSERT(xshift==yshift);
  padding=_padding<<B_SZ_LOG;
  /*An offset of 1 would be fine to provide enough context for VP8-style
    intra prediction, but for frequency-domain prediction, we'll want a full
    block, plus overlap.*/
  *_x0=(_info->pic_x>>xshift)+(padding>>(1+xshift));
  *_y0=(_info->pic_y>>yshift)+(padding>>(1+yshift));
  /*We take an extra block off the end to give enough context for above-right
    intra prediction.*/
  *_nxblocks=(_info->pic_w-padding)>>B_SZ_LOG;
  *_nyblocks=(_info->pic_h-padding)>>B_SZ_LOG;
}
/* Initialize the quantization matrix with the magnitude compensation applied.
   We need to compensate for the magnitude because lapping causes some basis
   functions to be smaller, so they would end up being quantized too finely
   (the same error in the quantized domain would result in a smaller pixel
   domain error). */
void od_init_qm(int16_t *x, int16_t *x_inv, const int *qm) {
  int i;
  int j;
  int16_t y[OD_QM_BSIZE];
  int16_t y_inv[OD_QM_BSIZE];
  int16_t *x1;
  int16_t *x1_inv;
  int off;
  int bs;
  int xydec;
  for (bs = 0; bs < OD_NBSIZES; bs++) {
    for (xydec = 0; xydec < 2; xydec++) {
      off = od_qm_offset(bs, xydec);
      x1 = x + off;
      x1_inv = x_inv + off;
      for (i = 0; i < 4 << bs; i++) {
        for (j = 0; j < 4 << bs; j++) {
          double mag;
#if OD_DEBLOCKING || OD_DISABLE_FILTER
          mag = 1.0;
#else
          mag = OD_BASIS_MAG[xydec][bs][i]*OD_BASIS_MAG[xydec][bs][j];
#endif
          if (i == 0 && j == 0) {
            mag = 1.0;
          }
          else {
            mag /= 0.0625*qm[(i << 1 >> bs)*8 + (j << 1 >> bs)];
            OD_ASSERT(mag > 0.0);
          }
          /*Convert to fit in 16 bits.*/
          y[i*(4 << bs) + j] = (int16_t)OD_MINI(OD_QM_SCALE_MAX,
           (int16_t)floor(.5 + mag*OD_QM_SCALE));
          y_inv[i*(4 << bs) + j] = (int16_t)floor(.5
           + OD_QM_SCALE*OD_QM_INV_SCALE/(double)y[i*(4 << bs) + j]);
        }
      }
      od_raster_to_coding_order_16(x1, 4 << bs, y, 4 << bs);
      od_raster_to_coding_order_16(x1_inv, 4 << bs, y_inv, 4 << bs);
    }
  }
}
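/* Illustration (not part of the codec): the per-coefficient weight above is
   mag = (col_mag*row_mag)/(qm_entry/16), so a flat qm entry of 16 leaves the
   basis-magnitude product unchanged. A standalone restatement of that
   arithmetic with a hypothetical name; the magnitude inputs are whatever
   OD_BASIS_MAG would supply. */
static int16_t qm_weight_sketch(double col_mag, double row_mag, int qm_entry) {
  double mag;
  /* 0.0625 == 1/16: qm entries are stored relative to a nominal value of
     16. */
  mag = col_mag*row_mag/(0.0625*qm_entry);
  /* Same 16-bit conversion as od_init_qm() above. */
  return (int16_t)OD_MINI(OD_QM_SCALE_MAX,
   (int16_t)floor(.5 + mag*OD_QM_SCALE));
}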
void image_data_mask(image_data *_this,const unsigned char *_data,int _stride){
  int j;
  int i;
  memset(_this->mask,0,_this->nyblocks*_this->nxblocks);
  /* process_block_size32 needs 32x32 image data with 6 pixel padding */
  OD_ASSERT(((_stride-_this->nxblocks*B_SZ)/2)>=6);
  for(j=0;j<_this->nyblocks*B_SZ/32;j++){
    for(i=0;i<_this->nxblocks*B_SZ/32;i++){
      const unsigned char *b;
      BlockSizeComp bs;
      int dec[4][4];
      int k;
      int l;
      b=&_data[_stride*32*j+32*i];
      process_block_size32(&bs,b,_stride,b,_stride,dec,21);
      for(l=0;l<32/B_SZ;l++){
        for(k=0;k<32/B_SZ;k++){
          /*printf("i=%i j=%i k=%i l=%i\n",i,j,k,l);
            printf("bx=%i by=%i\n",i*32/B_SZ+k*B_SZ,j*32/B_SZ+l*B_SZ);
            fflush(stdout);*/
#if B_SZ==4
          _this->mask[_this->nxblocks*(j*32/B_SZ+l)+i*32/B_SZ+k]=
           (dec[l>>1][k>>1]==0);
#elif B_SZ==8
          _this->mask[_this->nxblocks*(j*32/B_SZ+l)+i*32/B_SZ+k]=
           (dec[l][k]==1);
#elif B_SZ==16
          _this->mask[_this->nxblocks*(j*32/B_SZ+l)+i*32/B_SZ+k]=
           (dec[l<<1][k<<1]==2||dec[l<<1][k<<1]==3);
#else
# error "Invalid block size."
#endif
        }
      }
    }
  }
}
/*This is a smart copy that copies the intersection of the two img planes
  and performs any needed bitdepth or packing conversion.
  Note that this copy performs clamping as it's only used for copying into
  and out of the codec, not for internal reference->reference copies.
  Does not touch any padding/border/unintersected area.*/
void od_img_plane_copy(daala_image* dst, daala_image* src, int pli) {
  daala_image_plane *dst_plane;
  daala_image_plane *src_plane;
  unsigned char *dst_data;
  unsigned char *src_data;
  int dst_xstride;
  int dst_ystride;
  int src_xstride;
  int src_ystride;
  int dst_xdec;
  int dst_ydec;
  int src_xdec;
  int src_ydec;
  int dst_plane_width;
  int dst_plane_height;
  int src_plane_width;
  int src_plane_height;
  int w;
  int h;
  int x;
  int y;
  if (pli >= dst->nplanes || pli >= src->nplanes) {
    return;
  }
  dst_plane = dst->planes + pli;
  src_plane = src->planes + pli;
  dst_xstride = dst_plane->xstride;
  dst_ystride = dst_plane->ystride;
  src_xstride = src_plane->xstride;
  src_ystride = src_plane->ystride;
  OD_ASSERT(dst_xstride == 1 || dst_xstride == 2);
  OD_ASSERT(src_xstride == 1 || src_xstride == 2);
  dst_xdec = dst_plane->xdec;
  dst_ydec = dst_plane->ydec;
  src_xdec = src_plane->xdec;
  src_ydec = src_plane->ydec;
  dst_data = dst_plane->data;
  src_data = src_plane->data;
  dst_plane_width = OD_PLANE_SZ(dst->width, dst_xdec);
  dst_plane_height = OD_PLANE_SZ(dst->height, dst_ydec);
  src_plane_width = OD_PLANE_SZ(src->width, src_xdec);
  src_plane_height = OD_PLANE_SZ(src->height, src_ydec);
  w = OD_MINI(dst_plane_width, src_plane_width);
  h = OD_MINI(dst_plane_height, src_plane_height);
  for (y = 0; y < h; y++) {
    unsigned char *src_ptr;
    unsigned char *dst_ptr;
    src_ptr = src_data;
    dst_ptr = dst_data;
    if (src_plane->bitdepth > 8) {
      if (dst_plane->bitdepth > 8) {
        /*Both source and destination are multibyte.*/
        if (dst_plane->bitdepth >= src_plane->bitdepth) {
          /*Shift source up into destination.*/
          int upshift;
          upshift = dst_plane->bitdepth - src_plane->bitdepth;
          for (x = 0; x < w; x++) {
            *(int16_t *)dst_ptr = OD_CLAMPI(0,
             *(int16_t *)src_ptr << upshift,
             (1 << dst_plane->bitdepth) - 1);
            src_ptr += src_xstride;
            dst_ptr += dst_xstride;
          }
        }
        else {
          /*Round source down into destination.*/
          int dnshift;
          dnshift = src_plane->bitdepth - dst_plane->bitdepth;
          for (x = 0; x < w; x++) {
            *(int16_t *)dst_ptr = OD_CLAMPI(0,
             (*(int16_t *)src_ptr + (1 << dnshift >> 1)) >> dnshift,
             (1 << dst_plane->bitdepth) - 1);
            src_ptr += src_xstride;
            dst_ptr += dst_xstride;
          }
        }
      }
      else {
        /*Round multibyte source down into 8-bit destination.*/
        int dnshift;
        dnshift = src_plane->bitdepth - 8;
        OD_ASSERT(dst_plane->bitdepth == 8);
        for (x = 0; x < w; x++) {
          *dst_ptr = OD_CLAMP255((*(int16_t *)src_ptr
           + (1 << dnshift >> 1)) >> dnshift);
          src_ptr += src_xstride;
          dst_ptr += dst_xstride;
        }
      }
    }
    else {
/** Perform PVQ quantization with prediction, trying several
 * possible gains and angles. See draft-valin-videocodec-pvq and
 * http://jmvalin.ca/slides/pvq.pdf for more details.
 *
 * @param [out]    out       coefficients after quantization
 * @param [in]     x0        coefficients before quantization
 * @param [in]     r0        reference, aka predicted coefficients
 * @param [in]     n         number of dimensions
 * @param [in]     q0        quantization step size
 * @param [out]    y         pulse vector (i.e. selected PVQ codevector)
 * @param [out]    itheta    angle between input and reference (-1 if noref)
 * @param [out]    max_theta maximum value of itheta that could have been
 *                           coded
 * @param [out]    vk        total number of pulses
 * @param [in]     beta      per-band activity masking beta param
 * @param [out]    skip_diff distortion cost of skipping this block
 *                           (accumulated)
 * @param [in]     robust    make stream robust to error in the reference
 * @param [in]     is_keyframe whether we're encoding a keyframe
 * @param [in]     pli       plane index
 * @param [in]     adapt     probability adaptation context
 * @param [in]     qm        QM with magnitude compensation
 * @param [in]     qm_inv    Inverse of QM with magnitude compensation
 * @return         gain index of the quantized gain
 */
static int pvq_theta(od_coeff *out, const od_coeff *x0, const od_coeff *r0,
 int n, int q0, od_coeff *y, int *itheta, int *max_theta, int *vk,
 double beta, double *skip_diff, int robust, int is_keyframe, int pli,
 const od_adapt_ctx *adapt, const int16_t *qm, const int16_t *qm_inv) {
  od_val32 g;
  od_val32 gr;
  od_coeff y_tmp[MAXN];
  int i;
  /* Number of pulses. */
  int k;
  /* Companded gain of x and reference, normalized to q. */
  od_val32 cg;
  od_val32 cgr;
  int icgr;
  int qg;
  /* Best RDO cost (D + lambda*R) so far. */
  double best_cost;
  /* Distortion (D) that corresponds to the best RDO cost. */
  double best_dist;
  double dist;
  /* Sign of Householder reflection. */
  int s;
  /* Dimension on which Householder reflects. */
  int m;
  od_val32 theta;
  double corr;
  int best_k;
  od_val32 best_qtheta;
  od_val32 gain_offset;
  int noref;
  double lambda;
  double skip_dist;
  int cfl_enabled;
  int skip;
  double gain_weight;
  od_val16 x16[MAXN];
  od_val16 r16[MAXN];
  int xshift;
  int rshift;
  lambda = OD_PVQ_LAMBDA;
  /* Give more weight to gain error when calculating the total distortion. */
  gain_weight = 1.4;
  OD_ASSERT(n > 1);
  corr = 0;
#if !defined(OD_FLOAT_PVQ)
  /* Shift needed to make x fit in 16 bits even after rotation.
     This shift value is not normative (it can be changed without breaking
     the bitstream). */
  xshift = OD_MAXI(0, od_vector_log_mag(x0, n) - 15);
  /* Shift needed to make the reference fit in 15 bits, so that the
     Householder vector can fit in 16 bits.
     This shift value *is* normative, and has to match the decoder. */
  rshift = OD_MAXI(0, od_vector_log_mag(r0, n) - 14);
#else
  xshift = 0;
  rshift = 0;
#endif
  for (i = 0; i < n; i++) {
#if defined(OD_FLOAT_PVQ)
    /*This is slightly different from the original float PVQ code,
      where the qm was applied in the accumulation in od_pvq_compute_gain and
      the vectors were od_coeffs, not od_val16 (i.e. double).*/
    x16[i] = x0[i]*(double)qm[i]*OD_QM_SCALE_1;
    r16[i] = r0[i]*(double)qm[i]*OD_QM_SCALE_1;
#else
    x16[i] = OD_SHR_ROUND(x0[i]*qm[i], OD_QM_SHIFT + xshift);
    r16[i] = OD_SHR_ROUND(r0[i]*qm[i], OD_QM_SHIFT + rshift);
#endif
    corr += OD_MULT16_16(x16[i], r16[i]);
  }
  cfl_enabled = is_keyframe && pli != 0 && !OD_DISABLE_CFL;
  cg = od_pvq_compute_gain(x16, n, q0, &g, beta, xshift);
  cgr = od_pvq_compute_gain(r16, n, q0, &gr, beta, rshift);
  if (cfl_enabled) cgr = OD_CGAIN_SCALE;
  /* gain_offset is meant to make sure one of the quantized gains has
     exactly the same gain as the reference. */
#if defined(OD_FLOAT_PVQ)
  icgr = (int)floor(.5 + cgr);
#else
  icgr = OD_SHR_ROUND(cgr, OD_CGAIN_SHIFT);
#endif
  gain_offset = cgr - OD_SHL(icgr, OD_CGAIN_SHIFT);
  /* Start search with null case: gain=0, no pulse. */
  qg = 0;
  dist = gain_weight*cg*cg*OD_CGAIN_SCALE_2;
  best_dist = dist;
  best_cost = dist + lambda*od_pvq_rate(0, 0, -1, 0, adapt, NULL, 0, n,
   is_keyframe, pli);
  noref = 1;
  best_k = 0;
  *itheta = -1;
  *max_theta = 0;
  OD_CLEAR(y, n);
  best_qtheta = 0;
  m = 0;
  s = 1;
  corr = corr/(1e-100 + g*(double)gr/OD_SHL(1, xshift + rshift));
  corr = OD_MAXF(OD_MINF(corr, 1.), -1.);
  if (is_keyframe) skip_dist = gain_weight*cg*cg*OD_CGAIN_SCALE_2;
  else {
    skip_dist = gain_weight*(cg - cgr)*(cg - cgr)
     + cgr*(double)cg*(2 - 2*corr);
    skip_dist *= OD_CGAIN_SCALE_2;
  }
  if (!is_keyframe) {
    /* noref, gain=0 isn't allowed, but skip is allowed. */
    od_val32 scgr;
    scgr = OD_MAXF(0, gain_offset);
    if (icgr == 0) {
      best_dist = gain_weight*(cg - scgr)*(cg - scgr)
       + scgr*(double)cg*(2 - 2*corr);
      best_dist *= OD_CGAIN_SCALE_2;
    }
    best_cost = best_dist + lambda*od_pvq_rate(0, icgr, 0, 0, adapt, NULL,
     0, n, is_keyframe, pli);
    best_qtheta = 0;
    *itheta = 0;
    *max_theta = 0;
    noref = 0;
  }
  if (n <= OD_MAX_PVQ_SIZE && !od_vector_is_null(r0, n) && corr > 0) {
    od_val16 xr[MAXN];
    int gain_bound;
    gain_bound = OD_SHR(cg - gain_offset, OD_CGAIN_SHIFT);
    /* Perform theta search only if prediction is useful. */
    theta = OD_ROUND32(OD_THETA_SCALE*acos(corr));
    m = od_compute_householder(r16, n, gr, &s, rshift);
    od_apply_householder(xr, x16, r16, n);
    for (i = m; i < n - 1; i++) xr[i] = xr[i + 1];
    /* Search for the best gain within a reasonable range. */
    for (i = OD_MAXI(1, gain_bound - 1); i <= gain_bound + 1; i++) {
      int j;
      od_val32 qcg;
      int ts;
      /* Quantized companded gain */
      qcg = OD_SHL(i, OD_CGAIN_SHIFT) + gain_offset;
      /* Set angular resolution (in ra) to match the encoded gain */
      ts = od_pvq_compute_max_theta(qcg, beta);
      /* Search for the best angle within a reasonable range. */
      for (j = OD_MAXI(0,
       (int)floor(.5 + theta*OD_THETA_SCALE_1*2/M_PI*ts) - 2);
       j <= OD_MINI(ts - 1, (int)ceil(theta*OD_THETA_SCALE_1*2/M_PI*ts));
       j++) {
        double cos_dist;
        double cost;
        double dist_theta;
        double sin_prod;
        od_val32 qtheta;
        qtheta = od_pvq_compute_theta(j, ts);
        k = od_pvq_compute_k(qcg, j, qtheta, 0, n, beta,
         robust || is_keyframe);
        sin_prod = od_pvq_sin(theta)*OD_TRIG_SCALE_1*od_pvq_sin(qtheta)*
         OD_TRIG_SCALE_1;
        /* PVQ search, using a gain of qcg*cg*sin(theta)*sin(qtheta) since
           that's the factor by which cos_dist is multiplied to get the
           distortion metric. */
        cos_dist = pvq_search_rdo_double(xr, n - 1, k, y_tmp,
         qcg*(double)cg*sin_prod*OD_CGAIN_SCALE_2);
        /* See Jmspeex' Journal of Dubious Theoretical Results. */
        dist_theta = 2 - 2.*od_pvq_cos(theta - qtheta)*OD_TRIG_SCALE_1
         + sin_prod*(2 - 2*cos_dist);
        dist = gain_weight*(qcg - cg)*(qcg - cg) + qcg*(double)cg*dist_theta;
        dist *= OD_CGAIN_SCALE_2;
        /* Do approximate RDO. */
        cost = dist + lambda*od_pvq_rate(i, icgr, j, ts, adapt, y_tmp, k, n,
         is_keyframe, pli);
        if (cost < best_cost) {
          best_cost = cost;
          best_dist = dist;
          qg = i;
          best_k = k;
          best_qtheta = qtheta;
          *itheta = j;
          *max_theta = ts;
          noref = 0;
          OD_COPY(y, y_tmp, n - 1);
        }
      }
    }
  }
  /* Don't bother with no-reference version if there's a reasonable
     correlation. The only exception is luma on a keyframe because
     H/V prediction is unreliable. */
  if (n <= OD_MAX_PVQ_SIZE
   && ((is_keyframe && pli == 0) || corr < .5
   || cg < (od_val32)(OD_SHL(2, OD_CGAIN_SHIFT)))) {
    int gain_bound;
    gain_bound = OD_SHR(cg, OD_CGAIN_SHIFT);
    /* Search for the best gain (haven't determined reasonable range yet). */
    for (i = OD_MAXI(1, gain_bound); i <= gain_bound + 1; i++) {
      double cos_dist;
      double cost;
      od_val32 qcg;
      qcg = OD_SHL(i, OD_CGAIN_SHIFT);
      k = od_pvq_compute_k(qcg, -1, -1, 1, n, beta, robust || is_keyframe);
      cos_dist = pvq_search_rdo_double(x16, n, k, y_tmp,
       qcg*(double)cg*OD_CGAIN_SCALE_2);
      /* See Jmspeex' Journal of Dubious Theoretical Results. */
      dist = gain_weight*(qcg - cg)*(qcg - cg)
       + qcg*(double)cg*(2 - 2*cos_dist);
      dist *= OD_CGAIN_SCALE_2;
      /* Do approximate RDO. */
      cost = dist + lambda*od_pvq_rate(i, 0, -1, 0, adapt, y_tmp, k, n,
       is_keyframe, pli);
      if (cost <= best_cost) {
        best_cost = cost;
        best_dist = dist;
        qg = i;
        noref = 1;
        best_k = k;
        *itheta = -1;
        *max_theta = 0;
        OD_COPY(y, y_tmp, n);
      }
    }
  }
  k = best_k;
  theta = best_qtheta;
  skip = 0;
  if (noref) {
    if (qg == 0) skip = OD_PVQ_SKIP_ZERO;
  }
  else {
    if (!is_keyframe && qg == 0) {
      skip = (icgr ? OD_PVQ_SKIP_ZERO : OD_PVQ_SKIP_COPY);
    }
    if (qg == icgr && *itheta == 0 && !cfl_enabled) skip = OD_PVQ_SKIP_COPY;
  }
  /* Synthesize like the decoder would. */
  if (skip) {
    if (skip == OD_PVQ_SKIP_COPY) OD_COPY(out, r0, n);
    else OD_CLEAR(out, n);
  }
  else {
    if (noref) gain_offset = 0;
    g = od_gain_expand(OD_SHL(qg, OD_CGAIN_SHIFT) + gain_offset, q0, beta);
    od_pvq_synthesis_partial(out, y, r16, n, noref, g, theta, m, s, qm_inv);
  }
  *vk = k;
  *skip_diff += skip_dist - best_dist;
  /* Encode gain differently depending on whether we use prediction or not.
     Special encoding on inter frames where qg=0 is allowed for noref=0 but
     not noref=1. */
  if (is_keyframe) return noref ? qg : neg_interleave(qg, icgr);
  else return noref ? qg - 1 : neg_interleave(qg + 1, icgr + 1);
}
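/* Illustration (not part of the codec): neg_interleave() is used above but
   not defined in this excerpt. A sketch of the kind of mapping such a
   function typically performs, assuming the usual construction: the value
   equal to the reference codes to 0 and values near it get small indices,
   which suits the adaptive gain model. The name is hypothetical. */
static int neg_interleave_sketch(int x, int ref) {
  if (x < ref) return -2*(x - ref) - 1; /* below the reference: odd codes */
  else if (x < 2*ref) return 2*(x - ref); /* at/above, still near: even */
  else return x; /* far above: pass through unchanged */
}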
/* Indexing for the packed quantization matrices. */
int od_qm_get_index(int bs, int band) {
  /* The -band/3 term is due to the fact that we force corresponding
     horizontal and vertical bands to have the same quantization. */
  OD_ASSERT(bs >= 0 && bs < OD_NBSIZES);
  return bs*(bs + 1) + band - band/3;
}
int daala_encode_flush_header(daala_enc_ctx *_enc, daala_comment *_dc,
 ogg_packet *_op) {
  daala_info *info;
  if (_enc == NULL || _op == NULL) return OD_EFAULT;
  info = &_enc->state.info;
  switch (_enc->packet_state) {
    case OD_PACKET_INFO_HDR: {
      int pli;
      oggbyte_reset(&_enc->obb);
      oggbyte_write1(&_enc->obb, 0x80);
      oggbyte_writecopy(&_enc->obb, "daala", 5);
      oggbyte_write1(&_enc->obb, info->version_major);
      oggbyte_write1(&_enc->obb, info->version_minor);
      oggbyte_write1(&_enc->obb, info->version_sub);
      OD_ASSERT(info->pic_width > 0);
      oggbyte_write4(&_enc->obb, info->pic_width);
      OD_ASSERT(info->pic_height > 0);
      oggbyte_write4(&_enc->obb, info->pic_height);
      oggbyte_write4(&_enc->obb, info->pixel_aspect_numerator);
      oggbyte_write4(&_enc->obb, info->pixel_aspect_denominator);
      oggbyte_write4(&_enc->obb, info->timebase_numerator);
      oggbyte_write4(&_enc->obb, info->timebase_denominator);
      oggbyte_write4(&_enc->obb, info->frame_duration);
      OD_ASSERT(info->keyframe_granule_shift < 32);
      oggbyte_write1(&_enc->obb, info->keyframe_granule_shift);
      OD_ASSERT((info->nplanes >= 1) && (info->nplanes <= OD_NPLANES_MAX));
      oggbyte_write1(&_enc->obb, info->nplanes);
      for (pli = 0; pli < info->nplanes; ++pli) {
        oggbyte_write1(&_enc->obb, info->plane_info[pli].xdec);
        oggbyte_write1(&_enc->obb, info->plane_info[pli].ydec);
      }
      _op->b_o_s = 1;
    } break;
    case OD_PACKET_COMMENT_HDR: {
      const char *vendor;
      uint32_t vendor_len;
      int ci;
      oggbyte_reset(&_enc->obb);
      oggbyte_write1(&_enc->obb, 0x81);
      oggbyte_writecopy(&_enc->obb, "daala", 5);
      vendor = daala_version_string();
      vendor_len = strlen(vendor);
      oggbyte_write4(&_enc->obb, vendor_len);
      oggbyte_writecopy(&_enc->obb, vendor, vendor_len);
      oggbyte_write4(&_enc->obb, _dc->comments);
      for (ci = 0; ci < _dc->comments; ci++) {
        if (_dc->user_comments[ci] != NULL) {
          oggbyte_write4(&_enc->obb, _dc->comment_lengths[ci]);
          oggbyte_writecopy(&_enc->obb, _dc->user_comments[ci],
           _dc->comment_lengths[ci]);
        }
        else oggbyte_write4(&_enc->obb, 0);
      }
      _op->b_o_s = 0;
    } break;
    case OD_PACKET_SETUP_HDR: {
      /*TODO: Produce header contents.*/
      oggbyte_reset(&_enc->obb);
      oggbyte_write1(&_enc->obb, 0x82);
      oggbyte_writecopy(&_enc->obb, "daala", 5);
      _op->b_o_s = 0;
    } break;
    /*No more headers to emit.*/
    default: return 0;
  }
  /*This is kind of fugly: we hand the user a buffer which they do not own.
    We will overwrite it when the next packet is output, so the user better
    be done with it by then.
    Vorbis is little better: it hands back buffers that it will free the next
    time the headers are requested, or when the encoder is cleared.
    Hopefully libogg2 will make this much cleaner.*/
  _op->packet = oggbyte_get_buffer(&_enc->obb);
  _op->bytes = oggbyte_bytes(&_enc->obb);
  _op->e_o_s = 0;
  _op->granulepos = 0;
  /*Is this smart? Vorbis does not even set this field.*/
  _op->packetno = 0;
  return ++_enc->packet_state + 3;
}
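/* Illustration (not part of the library): per the return values above, the
   function yields a positive count while headers remain, 0 once all three
   have been emitted, and a negative error code otherwise, so callers would
   typically flush in a loop. A minimal sketch of such a driver, using the
   standard libogg ogg_stream_packetin(); the helper name is hypothetical. */
static int write_headers_sketch(daala_enc_ctx *enc, daala_comment *dc,
 ogg_stream_state *os) {
  ogg_packet op;
  int ret;
  for (;;) {
    ret = daala_encode_flush_header(enc, dc, &op);
    if (ret < 0) return ret; /* e.g. OD_EFAULT */
    if (ret == 0) return 0; /* all headers emitted */
    /* Hand the header packet off before it is overwritten by the next
       flush. */
    if (ogg_stream_packetin(os, &op) < 0) return -1;
  }
}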
void oggbyte_writetrunc(oggbyte_buffer *_b, ptrdiff_t _bytes) {
  OD_ASSERT(_bytes >= 0);
  _b->ptr = _b->buf + _bytes;
}