/*In 420jpeg the chroma samples are sited in the center of each 2x2 block of
   luma samples, while in 420mpeg2 they are horizontally co-sited with the
   left column of luma samples (but still centered vertically).
  We use a resampling filter to shift the site locations one quarter pixel
   (at the chroma plane's resolution) to the right.
  The 4:2:2 modes look exactly the same, except there are twice as many
   chroma lines, and they are vertically co-sited with the luma samples in
   both the mpeg2 and jpeg cases (thus requiring no vertical resampling).
  Filter: [4 -17 114 35 -9 1]/128, derived from a 6-tap Lanczos window.
  _dst: The destination chroma plane (_c_w by _c_h).
  _src: The source chroma plane (_c_w by _c_h).
  _c_w: The width of the chroma plane.
  _c_h: The height of the chroma plane.*/
static void y4m_42xmpeg2_42xjpeg_helper(unsigned char *_dst,
 const unsigned char *_src,int _c_w,int _c_h){
  int row;
  int col;
  for(row=0;row<_c_h;row++){
    int sum;
    /*Left edge: taps that would fall off the start of the row are clamped to
       column 0.*/
    for(col=0;col<OC_MINI(_c_w,2);col++){
      sum=4*_src[0]-17*_src[OC_MAXI(col-1,0)]+114*_src[col]
       +35*_src[OC_MINI(col+1,_c_w-1)]-9*_src[OC_MINI(col+2,_c_w-1)]
       +_src[OC_MINI(col+3,_c_w-1)]+64;
      _dst[col]=(unsigned char)OC_CLAMPI(0,sum>>7,255);
    }
    /*Interior: all six taps are in bounds; no clamping needed.*/
    for(;col<_c_w-3;col++){
      sum=4*_src[col-2]-17*_src[col-1]+114*_src[col]+35*_src[col+1]
       -9*_src[col+2]+_src[col+3]+64;
      _dst[col]=(unsigned char)OC_CLAMPI(0,sum>>7,255);
    }
    /*Right edge: taps past the end of the row are clamped to the last
       column.*/
    for(;col<_c_w;col++){
      sum=4*_src[col-2]-17*_src[col-1]+114*_src[col]
       +35*_src[OC_MINI(col+1,_c_w-1)]-9*_src[OC_MINI(col+2,_c_w-1)]
       +_src[_c_w-1]+64;
      _dst[col]=(unsigned char)OC_CLAMPI(0,sum>>7,255);
    }
    /*Advance both planes to the next row.*/
    _dst+=_c_w;
    _src+=_c_w;
  }
}
/*Computes the total storage required for the collapsed form of the sub-tree
   rooted at the given node, without actually building it.
  The depth-selection loop here is identical to the one in
   oc_huff_tree_collapse(), which presumably consumes the storage sized here;
   the two must be kept in lockstep.
  _binode: The root of the sub-tree to measure.
  _depth:  The number of levels remaining inside a collapsed node that a
            parent call has already accounted for; while positive, we only
            recurse without adding a new node (NOTE(review): inferred from
            the recursion pattern — confirm against the fill routine).
  Return: The number of bytes required for the collapsed sub-tree.*/
static size_t oc_huff_tree_collapse_size(oc_huff_node *_binode,int _depth){
  size_t size;
  int    mindepth;
  int    depth;
  int    loccupancy;
  int    occupancy;
  /*Still inside a node being collapsed by a parent call: just descend into
     both children until we fall off the end of that node.*/
  if(_binode->nbits!=0&&_depth>0){
    return oc_huff_tree_collapse_size(_binode->nodes[0],_depth-1)+
     oc_huff_tree_collapse_size(_binode->nodes[1],_depth-1);
  }
  /*Choose how many levels to collapse into one table node: start from the
     shallowest leaf and keep deepening while occupancy keeps growing and
     stays dense enough (within the OC_HUFF_SLUSH tolerance).*/
  depth=mindepth=oc_huff_tree_mindepth(_binode);
  occupancy=1<<mindepth;
  do{
    loccupancy=occupancy;
    occupancy=oc_huff_tree_occupancy(_binode,++depth);
  }
  while(occupancy>loccupancy&&occupancy>=1<<OC_MAXI(depth-OC_HUFF_SLUSH,0));
  /*The loop exits one level past the last acceptable depth; back up.*/
  depth--;
  size=oc_huff_node_size(depth);
  /*Add the sizes of the collapsed sub-trees hanging below this node.*/
  if(depth>0){
    size+=oc_huff_tree_collapse_size(_binode->nodes[0],depth-1);
    size+=oc_huff_tree_collapse_size(_binode->nodes[1],depth-1);
  }
  return size;
}
/*Computes the R-D lambda value for the current frame and selects the set of
   quantizers (qi values) the encoder may use for it.
  The lambda is stored in _enc->lambda, and the chosen quantizer list in
   _enc->state.qis/_enc->state.nqis.
  _enc: The encoding context.
  _qti: The frame type index used to look up the quantizer statistics.*/
void oc_enc_calc_lambda(oc_enc_ctx *_enc,int _qti){
  ogg_int64_t lq;
  int         qi;
  int         qi1;
  int         nqis;
  /*For now, lambda is fixed depending on the qi value and frame type:
     lambda=qscale*(qavg[qti][qi]**2), where qscale=0.2125.
    This was derived by exhaustively searching for the optimal quantizer for
     the AC coefficients in each block from a number of test sequences for a
     number of fixed lambda values and fitting the peaks of the resulting
     histograms (on the log(qavg) scale).
    The same model applies to both inter and intra frames.
    A more adaptive scheme might perform better.*/
  qi=_enc->state.qis[0];
  /*If rate control is active, use the lambda for the _target_ quantizer.
    This allows us to scale to rates slightly lower than we'd normally be
     able to reach, and give the rate control a semblance of "fractional qi"
     precision.
    TODO: Add API for changing QI, and allow extra precision.*/
  if(_enc->state.info.target_bitrate>0)lq=_enc->rc.log_qtarget;
  else lq=_enc->log_qavg[_qti][qi];
  /*The resulting lambda value is less than 0x500000.*/
  _enc->lambda=(int)oc_bexp64(2*lq-0x4780BD468D6B62BLL);
  /*Select additional quantizers.
    The R-D optimal block AC quantizer statistics suggest that the
     distribution is roughly Gaussian-like with a slight positive skew.
    K-means clustering on log_qavg to select 3 quantizers produces cluster
     centers of {log_qavg-0.6,log_qavg,log_qavg+0.7}.
    Experiments confirm these are relatively good choices.
    Although we do greedy R-D optimization of the qii flags to avoid
     switching too frequently, this becomes ineffective at low rates, either
     because we do a poor job of predicting the actual R-D cost, or the
     greedy optimization is not sufficient.
    Therefore adaptive quantization is disabled above an (experimentally
     suggested) threshold of log_qavg=7.00 (e.g., below INTRA qi=12 or INTER
     qi=20 with current matrices).
    This may need to be revised if the R-D cost estimation or qii flag
     optimization strategies change.*/
  nqis=1;
  if(lq<(OC_Q57(56)>>3)&&!_enc->vp3_compatible&&
   _enc->sp_level<OC_SP_LEVEL_FAST_ANALYSIS){
    /*One qi step below the target (clamped to 0), aiming roughly 0.7
       higher on the log_qavg scale.*/
    qi1=oc_enc_find_qi_for_target(_enc,_qti,OC_MAXI(qi-1,0),0,
     lq+(OC_Q57(7)+5)/10);
    if(qi1!=qi)_enc->state.qis[nqis++]=qi1;
    /*One qi step above the target (clamped to 63), aiming roughly 0.6
       lower on the log_qavg scale.*/
    qi1=oc_enc_find_qi_for_target(_enc,_qti,OC_MINI(qi+1,63),0,
     lq-(OC_Q57(6)+5)/10);
    /*Only add it if it's distinct from both quantizers chosen so far.*/
    if(qi1!=qi&&qi1!=_enc->state.qis[nqis-1])_enc->state.qis[nqis++]=qi1;
  }
  _enc->state.nqis=nqis;
}
/*Re-initialize the Bessel filter coefficients with the specified delay.
  This does not alter the x/y state, but changes the reaction time of the
   filter.
  Altering the time constant of a reactive filter without altering internal
   state is something that has to be done carefully, but our design operates
   at high enough delays and with small enough time constant changes to make
   it safe.
  _f:     The filter to re-initialize.
  _delay: The new filter delay, in frames; must be positive (it is used as a
           divisor).*/
static void oc_iir_filter_reinit(oc_iir_filter *_f,int _delay){
  int         alpha;
  ogg_int64_t one48;
  ogg_int64_t warp;
  ogg_int64_t k1;
  ogg_int64_t k2;
  ogg_int64_t d;
  ogg_int64_t a;
  ogg_int64_t ik2;
  ogg_int64_t b1;
  ogg_int64_t b2;
  /*This borrows some code from an unreleased version of Postfish.
    See the recipe at http://unicorn.us.com/alex/2polefilters.html for
     details on deriving the filter coefficients.*/
  /*alpha is Q24*/
  alpha=(1<<24)/_delay;
  one48=(ogg_int64_t)1<<48;
  /*warp is 7.12*/
  warp=OC_MAXI(oc_warp_alpha(alpha),1);
  /*k1 is 9.12*/
  k1=3*warp;
  /*k2 is 16.24.*/
  k2=k1*warp;
  /*d is 16.15.
    Note: the additions bind tighter than the shifts here on purpose; the
     +256 before >>9 rounds to nearest.*/
  d=((1<<12)+k1<<12)+k2+256>>9;
  /*a is 0.32, since d is larger than both 1.0 and k2.*/
  a=(k2<<23)/d;
  /*ik2 is 25.24.*/
  ik2=one48/k2;
  /*b1 is Q56; in practice, the integer ranges between -2 and 2.*/
  b1=2*a*(ik2-(1<<24));
  /*b2 is Q56; in practice, the integer ranges between -2 and 2.*/
  b2=(one48<<8)-(4*a<<24)-b1;
  /*All of the filter parameters are Q24.
    Each conversion below adds half the discarded precision before shifting,
     to round to nearest.*/
  _f->c[0]=(ogg_int32_t)(b1+((ogg_int64_t)1<<31)>>32);
  _f->c[1]=(ogg_int32_t)(b2+((ogg_int64_t)1<<31)>>32);
  _f->g=(ogg_int32_t)(a+128>>8);
}
/*Finds the largest complete sub-tree rooted at the current node and
   collapses it into a single node.
  This procedure is then applied recursively to all the children of that
   node.
  _binode:  The root of the sub-tree to collapse.
            _binode->nbits must be 0 or 1.
  _storage: The pre-allocated storage the collapsed nodes are carved from.
  Return: The new root of the collapsed sub-tree.*/
static oc_huff_node *oc_huff_tree_collapse(oc_huff_node *_binode,
 char **_storage){
  oc_huff_node *root;
  size_t        size;
  int           mindepth;
  int           depth;
  int           loccupancy;
  int           occupancy;
  /*Choose how many levels to collapse into one table node: start from the
     shallowest leaf and keep deepening while occupancy keeps growing and
     stays dense enough (within the OC_HUFF_SLUSH tolerance).
    This loop is identical to the one in oc_huff_tree_collapse_size(), which
     presumably sized the storage being consumed here; keep them in
     lockstep.*/
  depth=mindepth=oc_huff_tree_mindepth(_binode);
  occupancy=1<<mindepth;
  do{
    loccupancy=occupancy;
    occupancy=oc_huff_tree_occupancy(_binode,++depth);
  }
  while(occupancy>loccupancy&&occupancy>=1<<OC_MAXI(depth-OC_HUFF_SLUSH,0));
  /*The loop exits one level past the last acceptable depth; back up.*/
  depth--;
  /*Nothing worth collapsing: just copy the tree into the new storage.*/
  if(depth<=1)return oc_huff_tree_copy(_binode,_storage);
  size=oc_huff_node_size(depth);
  root=oc_huff_node_init(_storage,size,depth);
  root->depth=_binode->depth;
  /*Fill in the collapsed node's table entries, recursively collapsing the
     sub-trees that hang below it.*/
  oc_huff_node_fill(root->nodes,_binode,depth,depth,_storage);
  return root;
}
/*Encodes a description of the given Huffman tables.
  Although the codes are stored in the encoder as flat arrays, in the bit
   stream and in the decoder they are structured as a tree.
  This function recovers the tree structure from the flat array and then
   writes it out.
  Note that the codes MUST form a Huffman code, and not merely a prefix-free
   code, since the binary tree is assumed to be full.
  _opb:   The buffer to store the tree in.
  _codes: The Huffman tables to pack.
  Return: 0 on success, or a negative value if one of the given Huffman
           tables does not form a full, prefix-free code.*/
int oc_huff_codes_pack(oggpack_buffer *_opb,
 const th_huff_code _codes[TH_NHUFFMAN_TABLES][TH_NDCT_TOKENS]){
  int i;
  for(i=0;i<TH_NHUFFMAN_TABLES;i++){
    oc_huff_entry entries[TH_NDCT_TOKENS];
    int           bpos;
    int           maxlen;
    int           mask;
    int           j;
    /*First, find the maximum code length so we can align all the bit
       patterns.*/
    maxlen=_codes[i][0].nbits;
    for(j=1;j<TH_NDCT_TOKENS;j++)maxlen=OC_MAXI(_codes[i][j].nbits,maxlen);
    /*It's improbable that a code with more than 32 bits could pass the
       validation below, but abort early in any case.*/
    if(maxlen>32)return TH_EINVAL;
    /*A mask of maxlen ones, built in two shifts so that maxlen==32 does not
       invoke an undefined full-width shift.*/
    mask=(1<<(maxlen>>1)<<(maxlen+1>>1))-1;
    /*Copy over the codes into our temporary workspace.
      The bit patterns are aligned, and the original entry each code is from
       is stored as well.*/
    for(j=0;j<TH_NDCT_TOKENS;j++){
      entries[j].shift=maxlen-_codes[i][j].nbits;
      entries[j].pattern=_codes[i][j].pattern<<entries[j].shift&mask;
      entries[j].token=j;
    }
    /*Sort the codes into ascending order.
      This is the order the leaves of the tree will be traversed.*/
    qsort(entries,TH_NDCT_TOKENS,sizeof(entries[0]),huff_entry_cmp);
    /*For each leaf of the tree:*/
    bpos=maxlen;
    for(j=0;j<TH_NDCT_TOKENS;j++){
      ogg_uint32_t bit;
      /*Fail if this code has no bits at all.
        Technically a codebook with a single 0-bit entry is legal, but the
         encoder currently does not support codebooks which do not contain
         all the tokens.*/
      if(entries[j].shift>=maxlen)return TH_EINVAL;
      /*Descend into the tree, writing a bit for each branch.*/
      for(;bpos>entries[j].shift;bpos--)oggpackB_write(_opb,0,1);
      /*Mark this as a leaf node, and write its value.*/
      oggpackB_write(_opb,1,1);
      oggpackB_write(_opb,entries[j].token,5);
      /*For each 1 branch we've descended, back up the tree until we reach a
         0 branch.*/
      bit=(ogg_uint32_t)1<<bpos;
      for(;entries[j].pattern&bit;bpos++)bit<<=1;
      /*Validate the code.*/
      if(j+1<TH_NDCT_TOKENS){
        /*mask covers all bits strictly above the current branch point.*/
        mask=~(bit-1)<<1;
        /*The next entry should have a 1 bit where we had a 0, and should
           match our code above that bit.
          This verifies both fullness and prefix-freeness simultaneously.*/
        if(!(entries[j+1].pattern&bit)||
         (entries[j].pattern&mask)!=(entries[j+1].pattern&mask)){
          return TH_EINVAL;
        }
      }
      /*If there are no more codes, we should have ascended back to the top
         of the tree.*/
      else if(bpos<maxlen)return TH_EINVAL;
    }
  }
  return 0;
}
/*Perform a motion vector search for this macro block against a single reference frame. As a bonus, individual block motion vectors are computed as well, as much of the work can be shared. The actual motion vector is stored in the appropriate place in the oc_mb_enc_info structure. _accum: Drop frame/golden MV accumulators. _mbi: The macro block index. _frame: The frame to use for SATD calculations and refinement, either OC_FRAME_PREV or OC_FRAME_GOLD. _frame_full: The frame to perform the 1px search on, one of OC_FRAME_PREV, OC_FRAME_GOLD, OC_FRAME_PREV_ORIG, or OC_FRAME_GOLD_ORIG.*/ void oc_mcenc_search_frame(oc_enc_ctx *_enc,oc_mv _accum,int _mbi,int _frame, int _frame_full){ /*Note: Traditionally this search is done using a rate-distortion objective function of the form D+lambda*R. However, xiphmont tested this and found it produced a small degredation, while requiring extra computation. This is most likely due to Theora's peculiar MV encoding scheme: MVs are not coded relative to a predictor, and the only truly cheap way to use a MV is in the LAST or LAST2 MB modes, which are not being considered here. Therefore if we use the MV found here, it's only because both LAST and LAST2 performed poorly, and therefore the MB is not likely to be uniform or suffer from the aperture problem. Furthermore we would like to re-use the MV found here for as many MBs as possible, so picking a slightly sub-optimal vector to save a bit or two may cause increased degredation in many blocks to come. 
We could artificially reduce lambda to compensate, but it's faster to just disable it entirely, and use D (the distortion) as the sole criterion.*/ oc_mcenc_ctx mcenc; const ptrdiff_t *frag_buf_offs; const ptrdiff_t *fragis; const unsigned char *src; const unsigned char *ref; const unsigned char *satd_ref; int ystride; oc_mb_enc_info *embs; ogg_int32_t hit_cache[31]; ogg_int32_t hitbit; unsigned best_block_err[4]; unsigned block_err[4]; unsigned best_err; int best_vec[2]; int best_block_vec[4][2]; int candx; int candy; int bi; embs=_enc->mb_info; /*Find some candidate motion vectors.*/ oc_mcenc_find_candidates(_enc,&mcenc,_accum,_mbi,_frame); /*Clear the cache of locations we've examined.*/ memset(hit_cache,0,sizeof(hit_cache)); /*Start with the median predictor.*/ candx=mcenc.candidates[0][0]; candy=mcenc.candidates[0][1]; hit_cache[candy+15]|=(ogg_int32_t)1<<candx+15; frag_buf_offs=_enc->state.frag_buf_offs; fragis=_enc->state.mb_maps[_mbi][0]; src=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[OC_FRAME_IO]]; ref=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[_frame_full]]; satd_ref=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[_frame]]; ystride=_enc->state.ref_ystride[0]; /*TODO: customize error function for speed/(quality+size) tradeoff.*/ best_err=oc_mcenc_ysad_check_mbcandidate_fullpel(_enc, frag_buf_offs,fragis,candx,candy,src,ref,ystride,block_err); best_vec[0]=candx; best_vec[1]=candy; if(_frame==OC_FRAME_PREV){ for(bi=0;bi<4;bi++){ best_block_err[bi]=block_err[bi]; best_block_vec[bi][0]=candx; best_block_vec[bi][1]=candy; } } /*If this predictor fails, move on to set A.*/ if(best_err>OC_YSAD_THRESH1){ unsigned err; unsigned t2; int ncs; int ci; /*Compute the early termination threshold for set A.*/ t2=embs[_mbi].error[_frame]; ncs=OC_MINI(3,embs[_mbi].ncneighbors); for(ci=0;ci<ncs;ci++){ t2=OC_MAXI(t2,embs[embs[_mbi].cneighbors[ci]].error[_frame]); } t2+=(t2>>OC_YSAD_THRESH2_SCALE_BITS)+OC_YSAD_THRESH2_OFFSET; /*Examine the candidates 
in set A.*/ for(ci=1;ci<mcenc.setb0;ci++){ candx=mcenc.candidates[ci][0]; candy=mcenc.candidates[ci][1]; /*If we've already examined this vector, then we would be using it if it was better than what we are using.*/ hitbit=(ogg_int32_t)1<<candx+15; if(hit_cache[candy+15]&hitbit)continue; hit_cache[candy+15]|=hitbit; err=oc_mcenc_ysad_check_mbcandidate_fullpel(_enc, frag_buf_offs,fragis,candx,candy,src,ref,ystride,block_err); if(err<best_err){ best_err=err; best_vec[0]=candx; best_vec[1]=candy; } if(_frame==OC_FRAME_PREV){ for(bi=0;bi<4;bi++)if(block_err[bi]<best_block_err[bi]){ best_block_err[bi]=block_err[bi]; best_block_vec[bi][0]=candx; best_block_vec[bi][1]=candy; } } } if(best_err>t2){ /*Examine the candidates in set B.*/ for(;ci<mcenc.ncandidates;ci++){ candx=mcenc.candidates[ci][0]; candy=mcenc.candidates[ci][1]; hitbit=(ogg_int32_t)1<<candx+15; if(hit_cache[candy+15]&hitbit)continue; hit_cache[candy+15]|=hitbit; err=oc_mcenc_ysad_check_mbcandidate_fullpel(_enc, frag_buf_offs,fragis,candx,candy,src,ref,ystride,block_err); if(err<best_err){ best_err=err; best_vec[0]=candx; best_vec[1]=candy; } if(_frame==OC_FRAME_PREV){ for(bi=0;bi<4;bi++)if(block_err[bi]<best_block_err[bi]){ best_block_err[bi]=block_err[bi]; best_block_vec[bi][0]=candx; best_block_vec[bi][1]=candy; } } } /*Use the same threshold for set B as in set A.*/ if(best_err>t2){ int best_site; int nsites; int sitei; int site; int b; /*Square pattern search.*/ for(;;){ best_site=4; /*Compose the bit flags for boundary conditions.*/ b=OC_DIV16(-best_vec[0]+1)|OC_DIV16(best_vec[0]+1)<<1| OC_DIV16(-best_vec[1]+1)<<2|OC_DIV16(best_vec[1]+1)<<3; nsites=OC_SQUARE_NSITES[b]; for(sitei=0;sitei<nsites;sitei++){ site=OC_SQUARE_SITES[b][sitei]; candx=best_vec[0]+OC_SQUARE_DX[site]; candy=best_vec[1]+OC_SQUARE_DY[site]; hitbit=(ogg_int32_t)1<<candx+15; if(hit_cache[candy+15]&hitbit)continue; hit_cache[candy+15]|=hitbit; err=oc_mcenc_ysad_check_mbcandidate_fullpel(_enc, 
frag_buf_offs,fragis,candx,candy,src,ref,ystride,block_err); if(err<best_err){ best_err=err; best_site=site; } if(_frame==OC_FRAME_PREV){ for(bi=0;bi<4;bi++)if(block_err[bi]<best_block_err[bi]){ best_block_err[bi]=block_err[bi]; best_block_vec[bi][0]=candx; best_block_vec[bi][1]=candy; } } } if(best_site==4)break; best_vec[0]+=OC_SQUARE_DX[best_site]; best_vec[1]+=OC_SQUARE_DY[best_site]; } /*Final 4-MV search.*/ /*Simply use 1/4 of the macro block set A and B threshold as the individual block threshold.*/ if(_frame==OC_FRAME_PREV){ t2>>=2; for(bi=0;bi<4;bi++){ if(best_block_err[bi]>t2){ /*Square pattern search. We do this in a slightly interesting manner. We continue to check the SAD of all four blocks in the macro block. This gives us two things: 1) We can continue to use the hit_cache to avoid duplicate checks. Otherwise we could continue to read it, but not write to it without saving and restoring it for each block. Note that we could still eliminate a large number of duplicate checks by taking into account the site we came from when choosing the site list. We can still do that to avoid extra hit_cache queries, and it might even be a speed win. 2) It gives us a slightly better chance of escaping local minima. We would not be here if we weren't doing a fairly bad job in finding a good vector, and checking these vectors can save us from 100 to several thousand points off our SAD 1 in 15 times. TODO: Is this a good idea? Who knows. 
It needs more testing.*/ for(;;){ int bestx; int besty; int bj; bestx=best_block_vec[bi][0]; besty=best_block_vec[bi][1]; /*Compose the bit flags for boundary conditions.*/ b=OC_DIV16(-bestx+1)|OC_DIV16(bestx+1)<<1| OC_DIV16(-besty+1)<<2|OC_DIV16(besty+1)<<3; nsites=OC_SQUARE_NSITES[b]; for(sitei=0;sitei<nsites;sitei++){ site=OC_SQUARE_SITES[b][sitei]; candx=bestx+OC_SQUARE_DX[site]; candy=besty+OC_SQUARE_DY[site]; hitbit=(ogg_int32_t)1<<candx+15; if(hit_cache[candy+15]&hitbit)continue; hit_cache[candy+15]|=hitbit; err=oc_mcenc_ysad_check_mbcandidate_fullpel(_enc, frag_buf_offs,fragis,candx,candy,src,ref,ystride,block_err); if(err<best_err){ best_err=err; best_vec[0]=candx; best_vec[1]=candy; } for(bj=0;bj<4;bj++)if(block_err[bj]<best_block_err[bj]){ best_block_err[bj]=block_err[bj]; best_block_vec[bj][0]=candx; best_block_vec[bj][1]=candy; } } if(best_block_vec[bi][0]==bestx&&best_block_vec[bi][1]==besty){ break; } } } } } }
/*Packs the quantization parameters into the setup header.
  This writes the loop filter limits, the AC and DC scale factors, the list
   of unique base matrices, and the qi ranges that map those matrices to each
   (frame type, plane) pair, using as few bits as the actual values allow.
  _opb:   The buffer to pack the parameters into.
  _qinfo: The quantization parameters to pack.*/
void oc_quant_params_pack(oggpack_buffer *_opb,const th_quant_info *_qinfo){
  const th_quant_ranges *qranges;
  const th_quant_base   *base_mats[2*3*64];
  int                    indices[2][3][64];
  int                    nbase_mats;
  int                    nbits;
  int                    ci;
  int                    qi;
  int                    qri;
  int                    qti;
  int                    pli;
  int                    qtj;
  int                    plj;
  int                    bmi;
  int                    i;
  /*Find the largest loop filter limit, and use just enough bits for all of
     them.*/
  i=_qinfo->loop_filter_limits[0];
  for(qi=1;qi<64;qi++)i=OC_MAXI(i,_qinfo->loop_filter_limits[qi]);
  nbits=OC_ILOG_32(i);
  oggpackB_write(_opb,nbits,3);
  for(qi=0;qi<64;qi++){
    oggpackB_write(_opb,_qinfo->loop_filter_limits[qi],nbits);
  }
  /*580 bits for VP3.*/
  /*AC scale factors, again using just enough bits for the largest value.*/
  i=1;
  for(qi=0;qi<64;qi++)i=OC_MAXI(_qinfo->ac_scale[qi],i);
  nbits=OC_ILOGNZ_32(i);
  oggpackB_write(_opb,nbits-1,4);
  for(qi=0;qi<64;qi++)oggpackB_write(_opb,_qinfo->ac_scale[qi],nbits);
  /*516 bits for VP3.*/
  /*DC scale factors.*/
  i=1;
  for(qi=0;qi<64;qi++)i=OC_MAXI(_qinfo->dc_scale[qi],i);
  nbits=OC_ILOGNZ_32(i);
  oggpackB_write(_opb,nbits-1,4);
  for(qi=0;qi<64;qi++)oggpackB_write(_opb,_qinfo->dc_scale[qi],nbits);
  /*Consolidate any duplicate base matrices.
    Each (qti,pli) pair has nranges+1 base matrices; identical ones are
     stored once, and indices[][][] records which unique matrix each slot
     refers to.*/
  nbase_mats=0;
  for(qti=0;qti<2;qti++)for(pli=0;pli<3;pli++){
    qranges=_qinfo->qi_ranges[qti]+pli;
    for(qri=0;qri<=qranges->nranges;qri++){
      for(bmi=0;;bmi++){
        if(bmi>=nbase_mats){
          base_mats[bmi]=qranges->base_matrices+qri;
          indices[qti][pli][qri]=nbase_mats++;
          break;
        }
        else if(memcmp(base_mats[bmi][0],qranges->base_matrices[qri],
         sizeof(base_mats[bmi][0]))==0){
          indices[qti][pli][qri]=bmi;
          break;
        }
      }
    }
  }
  /*Write out the list of unique base matrices.
    1545 bits for VP3 matrices.*/
  oggpackB_write(_opb,nbase_mats-1,9);
  for(bmi=0;bmi<nbase_mats;bmi++){
    for(ci=0;ci<64;ci++)oggpackB_write(_opb,base_mats[bmi][0][ci],8);
  }
  /*Now store quant ranges and their associated indices into the base matrix
     list.
    46 bits for VP3 matrices.*/
  nbits=OC_ILOG_32(nbase_mats-1);
  for(i=0;i<6;i++){
    qti=i/3;
    pli=i%3;
    qranges=_qinfo->qi_ranges[qti]+pli;
    if(i>0){
      if(qti>0){
        /*If this plane's ranges exactly match the same plane of the other
           frame type, signal that with a 2-bit code of 1 and move on.*/
        if(qranges->nranges==_qinfo->qi_ranges[qti-1][pli].nranges&&
         memcmp(qranges->sizes,_qinfo->qi_ranges[qti-1][pli].sizes,
         qranges->nranges*sizeof(qranges->sizes[0]))==0&&
         memcmp(indices[qti][pli],indices[qti-1][pli],
         (qranges->nranges+1)*sizeof(indices[qti][pli][0]))==0){
          oggpackB_write(_opb,1,2);
          continue;
        }
      }
      qtj=(i-1)/3;
      plj=(i-1)%3;
      /*If they match the immediately preceding (qti,pli) pair instead,
         signal that with a 0 code (1 or 2 bits, depending on frame type).*/
      if(qranges->nranges==_qinfo->qi_ranges[qtj][plj].nranges&&
       memcmp(qranges->sizes,_qinfo->qi_ranges[qtj][plj].sizes,
       qranges->nranges*sizeof(qranges->sizes[0]))==0&&
       memcmp(indices[qti][pli],indices[qtj][plj],
       (qranges->nranges+1)*sizeof(indices[qti][pli][0]))==0){
        oggpackB_write(_opb,0,1+(qti>0));
        continue;
      }
      /*Otherwise, flag that a new set of ranges follows.*/
      oggpackB_write(_opb,1,1);
    }
    /*Write the matrix index for qi=0, then each range's size and the matrix
       index at its far end.
      The size field shrinks as fewer qi values remain.*/
    oggpackB_write(_opb,indices[qti][pli][0],nbits);
    for(qi=qri=0;qi<63;qri++){
      oggpackB_write(_opb,qranges->sizes[qri]-1,OC_ILOG_32(62-qi));
      qi+=qranges->sizes[qri];
      oggpackB_write(_opb,indices[qti][pli][qri+1],nbits);
    }
  }
}
/*Updates the rate-control state in response to a change in the target
   bitrate, frame rate, or buffer delay.
  If encoding has not yet begun, the whole buffer state is simply reset;
   otherwise the buffer bounds are updated without disturbing the current
   fullness.
  In 2-pass mode this also grows the frame metrics array so it can hold
   statistics for the full buffer.
  _enc: The encoding context.*/
void oc_enc_rc_resize(oc_enc_ctx *_enc){
  /*If encoding has not yet begun, reset the buffer state.*/
  if(_enc->state.curframe_num<0)oc_enc_rc_reset(_enc);
  else{
    int idt;
    /*Otherwise, update the bounds on the buffer, but not the current
       fullness.*/
    _enc->rc.bits_per_frame=(_enc->state.info.target_bitrate*
     (ogg_int64_t)_enc->state.info.fps_denominator)/
     _enc->state.info.fps_numerator;
    /*Insane framerates or frame sizes mean insane bitrates.
      Let's not get carried away.*/
    if(_enc->rc.bits_per_frame>0x400000000000LL){
      _enc->rc.bits_per_frame=(ogg_int64_t)0x400000000000LL;
    }
    else if(_enc->rc.bits_per_frame<32)_enc->rc.bits_per_frame=32;
    _enc->rc.buf_delay=OC_MAXI(_enc->rc.buf_delay,12);
    _enc->rc.max=_enc->rc.bits_per_frame*_enc->rc.buf_delay;
    _enc->rc.target=(_enc->rc.max+1>>1)+(_enc->rc.bits_per_frame+2>>2)*
     OC_MINI(_enc->keyframe_frequency_force,_enc->rc.buf_delay);
    /*Update the INTER-frame scale filter delay.
      We jump to it immediately if we've already seen enough frames;
       otherwise it is simply set as the new target.*/
    _enc->rc.inter_delay_target=idt=OC_MAXI(_enc->rc.buf_delay>>1,10);
    if(idt<OC_MINI(_enc->rc.inter_delay,_enc->rc.inter_count)){
      oc_iir_filter_init(&_enc->rc.scalefilter[1],idt,
       _enc->rc.scalefilter[1].y[0]);
      _enc->rc.inter_delay=idt;
    }
  }
  /*If we're in pass-2 mode, make sure the frame metrics array is big enough
     to hold frame statistics for the full buffer.*/
  if(_enc->rc.twopass==2){
    int cfm;
    int buf_delay;
    int reset_window;
    buf_delay=_enc->rc.buf_delay;
    reset_window=_enc->rc.frame_metrics==NULL&&(_enc->rc.frames_total[0]==0||
     buf_delay<_enc->rc.frames_total[0]+_enc->rc.frames_total[1]
     +_enc->rc.frames_total[2]);
    cfm=_enc->rc.cframe_metrics;
    /*Only try to resize the frame metrics buffer if a) it's too small and
       b) we were using a finite buffer, or are about to start.*/
    if(cfm<buf_delay&&(_enc->rc.frame_metrics!=NULL||reset_window)){
      oc_frame_metrics *fm;
      int               nfm;
      int               fmh;
      fm=(oc_frame_metrics *)_ogg_realloc(_enc->rc.frame_metrics,
       buf_delay*sizeof(*_enc->rc.frame_metrics));
      if(fm==NULL){
        /*We failed to allocate a finite buffer.*/
        /*If we don't have a valid 2-pass header yet, just return; we'll
           reset the buffer size when we read the header.*/
        if(_enc->rc.frames_total[0]==0)return;
        /*Otherwise revert to the largest finite buffer previously set, or
           to whole-file buffering if we were still using that.*/
        _enc->rc.buf_delay=_enc->rc.frame_metrics!=NULL?
         cfm:_enc->rc.frames_total[0]+_enc->rc.frames_total[1]
         +_enc->rc.frames_total[2];
        oc_enc_rc_resize(_enc);
        return;
      }
      _enc->rc.frame_metrics=fm;
      _enc->rc.cframe_metrics=buf_delay;
      /*Re-organize the circular buffer.
        Entries that had wrapped around to the front of the old buffer are
         copied into the newly available space at the end; any that still
         don't fit are shifted down to the front.*/
      fmh=_enc->rc.frame_metrics_head;
      nfm=_enc->rc.nframe_metrics;
      if(fmh+nfm>cfm){
        int shift;
        shift=OC_MINI(fmh+nfm-cfm,buf_delay-cfm);
        memcpy(fm+cfm,fm,shift*sizeof(*fm));
        /*Bug fix: memmove takes a byte count; the original passed the raw
           element count (fmh+nfm-buf_delay) without scaling it by
           sizeof(*fm), moving too few bytes.*/
        if(fmh+nfm>buf_delay){
          memmove(fm,fm+shift,(fmh+nfm-buf_delay)*sizeof(*fm));
        }
      }
    }
    /*We were using whole-file buffering; now we're not.*/
    if(reset_window){
      _enc->rc.nframes[0]=_enc->rc.nframes[1]=_enc->rc.nframes[2]=0;
      _enc->rc.scale_sum[0]=_enc->rc.scale_sum[1]=0;
      _enc->rc.scale_window_end=_enc->rc.scale_window0=
       _enc->state.curframe_num+_enc->prev_dup_count+1;
      if(_enc->rc.twopass_buffer_bytes){
        int qti;
        /*We already read the metrics for the first frame in the window.*/
        _enc->rc.frame_metrics[0]=_enc->rc.cur_metrics;
        _enc->rc.nframe_metrics++;
        qti=_enc->rc.cur_metrics.frame_type;
        _enc->rc.nframes[qti]++;
        _enc->rc.nframes[2]+=_enc->rc.cur_metrics.dup_count;
        _enc->rc.scale_sum[qti]+=oc_bexp_q24(_enc->rc.cur_metrics.log_scale);
        _enc->rc.scale_window_end+=_enc->rc.cur_metrics.dup_count+1;
        if(_enc->rc.scale_window_end-_enc->rc.scale_window0<buf_delay){
          /*We need more frame data.*/
          _enc->rc.twopass_buffer_bytes=0;
        }
      }
    }
    /*Otherwise, we could shrink the size of the current window, if
       necessary, but leaving it like it is lets us adapt to the new buffer
       size more gracefully.*/
  }
}
/*Resets the rate-control state for the start of encoding.
  This derives the per-frame bit budget from the target bitrate and frame
   rate, initializes the buffer fullness, picks the rate model exponents and
   initial scales from the inverse bits-per-pixel, and (re)initializes the
   second-order scale followers.
  _enc: The encoding context.*/
static void oc_enc_rc_reset(oc_enc_ctx *_enc){
  ogg_int64_t npixels;
  ogg_int64_t ibpp;
  ogg_int64_t bpf;
  int         inter_delay;
  int         exp0;
  int         scale0;
  int         exp1;
  int         scale1;
  /*TODO: These parameters should be exposed in a th_encode_ctl() API.*/
  bpf=(_enc->state.info.target_bitrate*
   (ogg_int64_t)_enc->state.info.fps_denominator)/
   _enc->state.info.fps_numerator;
  /*Insane framerates or frame sizes mean insane bitrates.
    Let's not get carried away.*/
  if(bpf>0x400000000000LL)bpf=0x400000000000LL;
  else if(bpf<32)bpf=32;
  _enc->rc.bits_per_frame=bpf;
  _enc->rc.buf_delay=OC_MAXI(_enc->rc.buf_delay,12);
  _enc->rc.max=bpf*_enc->rc.buf_delay;
  /*Start with a buffer fullness of 50% plus 25% of the amount we plan to
     spend on a single keyframe interval.
    We can require fully half the bits in an interval for a keyframe, so
     this initial level gives us maximum flexibility for over/under-shooting
     in subsequent frames.*/
  _enc->rc.target=((_enc->rc.max+1)>>1)+((bpf+2)>>2)*
   OC_MINI(_enc->keyframe_frequency_force,_enc->rc.buf_delay);
  _enc->rc.fullness=_enc->rc.target;
  /*Pick exponents and initial scales for quantizer selection.*/
  npixels=_enc->state.info.frame_width*
   (ogg_int64_t)_enc->state.info.frame_height;
  _enc->rc.log_npixels=oc_blog64(npixels);
  ibpp=npixels/bpf;
  /*INTRA model: constants chosen by the inverse bits-per-pixel bucket.*/
  if(ibpp<1){
    exp0=59;
    scale0=1997;
  }
  else if(ibpp<2){
    exp0=55;
    scale0=1604;
  }
  else{
    exp0=48;
    scale0=834;
  }
  _enc->rc.exp[0]=exp0;
  _enc->rc.log_scale[0]=oc_blog64(scale0)-OC_Q57(8);
  /*INTER model: same idea, different buckets and constants.*/
  if(ibpp<4){
    exp1=100;
    scale1=2249;
  }
  else if(ibpp<8){
    exp1=95;
    scale1=1751;
  }
  else{
    exp1=73;
    scale1=1260;
  }
  _enc->rc.exp[1]=exp1;
  _enc->rc.log_scale[1]=oc_blog64(scale1)-OC_Q57(8);
  _enc->rc.prev_drop_count=0;
  _enc->rc.log_drop_scale=OC_Q57(0);
  /*Set up second order followers, initialized according to corresponding
     time constants.*/
  oc_iir_filter_init(&_enc->rc.scalefilter[0],4,
   oc_q57_to_q24(_enc->rc.log_scale[0]));
  inter_delay=(_enc->rc.twopass?
   OC_MAXI(_enc->keyframe_frequency_force,12):_enc->rc.buf_delay)>>1;
  _enc->rc.inter_count=0;
  /*We clamp the actual inter_delay to a minimum of 10 to work within the
     range of values where later incrementing the delay works as designed.
    10 is not an exact choice, but rather a good working trade-off.*/
  _enc->rc.inter_delay=10;
  _enc->rc.inter_delay_target=inter_delay;
  oc_iir_filter_init(&_enc->rc.scalefilter[1],_enc->rc.inter_delay,
   oc_q57_to_q24(_enc->rc.log_scale[1]));
  oc_iir_filter_init(&_enc->rc.vfrfilter,4,
   oc_bexp64_q24(_enc->rc.log_drop_scale));
}
/*This format is only used for interlaced content, but is included for
   completeness.
  In 420jpeg the chroma samples are sited in the center of each 2x2 block of
   luma samples.
  In 420paldv the chroma samples are co-sited horizontally with alternate
   luma columns, with C_r and C_b on alternating luma rows (see the y4m
   documentation for the exact site diagram).
  We use a resampling filter to shift the site locations one quarter pixel
   (at the chroma plane's resolution) to the right.
  Then we use another filter to move the C_r location down one quarter
   pixel, and the C_b location up one quarter pixel.
  _y4m: The y4m input context (provides picture dimensions and chroma
         decimation).
  _dst: The destination frame buffer (luma plane followed by the two chroma
         planes).
  _aux: The buffer holding the source chroma planes, with scratch space for
         one plane after them.*/
static void y4m_convert_42xpaldv_42xjpeg(y4m_input *_y4m,unsigned char *_dst,
 unsigned char *_aux){
  unsigned char *tmp;
  int            c_w;
  int            c_h;
  int            c_sz;
  int            pli;
  int            y;
  int            x;
  /*Skip past the luma data.*/
  _dst+=_y4m->pic_w*_y4m->pic_h;
  /*Compute the size of each chroma plane.*/
  c_w=(_y4m->pic_w+1)/2;
  c_h=(_y4m->pic_h+_y4m->dst_c_dec_h-1)/_y4m->dst_c_dec_h;
  c_sz=c_w*c_h;
  /*The scratch plane lives after the two source chroma planes in _aux.*/
  tmp=_aux+2*c_sz;
  for(pli=1;pli<3;pli++){
    /*First do the horizontal re-sampling.
      This is the same as the mpeg2 case, except that after the horizontal
       case, we need to apply a second vertical filter.*/
    y4m_42xmpeg2_42xjpeg_helper(tmp,_aux,c_w,c_h);
    _aux+=c_sz;
    switch(pli){
      case 1:{
        /*Slide C_b up a quarter-pel.
          This is the same filter used above, but in the other order.
          Filter: [1 -9 35 114 -17 4]/128, applied down each column with the
           taps clamped at the top and bottom edges.*/
        for(x=0;x<c_w;x++){
          /*Top edge: taps above the plane clamp to row 0.*/
          for(y=0;y<OC_MINI(c_h,3);y++){
            _dst[y*c_w]=(unsigned char)OC_CLAMPI(0,(tmp[0]
             -9*tmp[OC_MAXI(y-2,0)*c_w]+35*tmp[OC_MAXI(y-1,0)*c_w]
             +114*tmp[y*c_w]-17*tmp[OC_MINI(y+1,c_h-1)*c_w]
             +4*tmp[OC_MINI(y+2,c_h-1)*c_w]+64)>>7,255);
          }
          /*Interior rows: all taps in bounds.*/
          for(;y<c_h-2;y++){
            _dst[y*c_w]=(unsigned char)OC_CLAMPI(0,(tmp[(y-3)*c_w]
             -9*tmp[(y-2)*c_w]+35*tmp[(y-1)*c_w]+114*tmp[y*c_w]
             -17*tmp[(y+1)*c_w]+4*tmp[(y+2)*c_w]+64)>>7,255);
          }
          /*Bottom edge: taps below the plane clamp to the last row.*/
          for(;y<c_h;y++){
            _dst[y*c_w]=(unsigned char)OC_CLAMPI(0,(tmp[(y-3)*c_w]
             -9*tmp[(y-2)*c_w]+35*tmp[(y-1)*c_w]+114*tmp[y*c_w]
             -17*tmp[OC_MINI(y+1,c_h-1)*c_w]+4*tmp[(c_h-1)*c_w]+64)>>7,255);
          }
          _dst++;
          tmp++;
        }
        /*Advance _dst to the next plane and rewind tmp to its start.*/
        _dst+=c_sz-c_w;
        tmp-=c_w;
      }break;
      case 2:{
        /*Slide C_r down a quarter-pel.
          This is the same as the horizontal filter.
          Filter: [4 -17 114 35 -9 1]/128, applied down each column with the
           taps clamped at the top and bottom edges.*/
        for(x=0;x<c_w;x++){
          /*Top edge: taps above the plane clamp to row 0.*/
          for(y=0;y<OC_MINI(c_h,2);y++){
            _dst[y*c_w]=(unsigned char)OC_CLAMPI(0,(4*tmp[0]
             -17*tmp[OC_MAXI(y-1,0)*c_w]+114*tmp[y*c_w]
             +35*tmp[OC_MINI(y+1,c_h-1)*c_w]-9*tmp[OC_MINI(y+2,c_h-1)*c_w]
             +tmp[OC_MINI(y+3,c_h-1)*c_w]+64)>>7,255);
          }
          /*Interior rows: all taps in bounds.*/
          for(;y<c_h-3;y++){
            _dst[y*c_w]=(unsigned char)OC_CLAMPI(0,(4*tmp[(y-2)*c_w]
             -17*tmp[(y-1)*c_w]+114*tmp[y*c_w]+35*tmp[(y+1)*c_w]
             -9*tmp[(y+2)*c_w]+tmp[(y+3)*c_w]+64)>>7,255);
          }
          /*Bottom edge: taps below the plane clamp to the last row.*/
          for(;y<c_h;y++){
            _dst[y*c_w]=(unsigned char)OC_CLAMPI(0,(4*tmp[(y-2)*c_w]
             -17*tmp[(y-1)*c_w]+114*tmp[y*c_w]+35*tmp[OC_MINI(y+1,c_h-1)*c_w]
             -9*tmp[OC_MINI(y+2,c_h-1)*c_w]+tmp[(c_h-1)*c_w]+64)>>7,255);
          }
          _dst++;
          tmp++;
        }
      }break;
    }
    /*For actual interlaced material, this would have to be done separately
       on each field, and the shift amounts would be different.
      C_r moves down 1/8, C_b up 3/8 in the top field, and C_r moves down
       3/8, C_b up 1/8 in the bottom field.
      The corresponding filters would be:
       Down 1/8 (reverse order for up): [3 -11 125 15 -4 0]/128
       Down 3/8 (reverse order for up): [4 -19 98 56 -13 2]/128*/
  }
}