/*Translates a legacy theora_info stream description into the equivalent
   th_info description.
  Every field written to _info is derived solely from _ci.
  _info: The th_info structure to fill in.
  _ci:   The legacy theora_info structure to translate.*/
void oc_theora_info2th_info(th_info *_info,const theora_info *_ci){
  /*Bitstream version triple.*/
  _info->version_major=_ci->version_major;
  _info->version_minor=_ci->version_minor;
  _info->version_subminor=_ci->version_subminor;
  /*Geometry.
    Note the naming shift: the legacy width/height feed frame_width/
     frame_height, while the legacy frame_width/frame_height feed
     pic_width/pic_height.*/
  _info->frame_width=_ci->width;
  _info->frame_height=_ci->height;
  _info->pic_width=_ci->frame_width;
  _info->pic_height=_ci->frame_height;
  _info->pic_x=_ci->offset_x;
  _info->pic_y=_ci->offset_y;
  /*Frame rate and pixel aspect ratio are copied through unchanged.*/
  _info->fps_numerator=_ci->fps_numerator;
  _info->fps_denominator=_ci->fps_denominator;
  _info->aspect_numerator=_ci->aspect_numerator;
  _info->aspect_denominator=_ci->aspect_denominator;
  /*Rate and quality targets.*/
  _info->target_bitrate=_ci->target_bitrate;
  _info->quality=_ci->quality;
  /*The granule shift is the bit length of (forced keyframe interval-1),
     capped at 31; a non-positive interval yields a shift of 0.*/
  if(_ci->keyframe_frequency_force>0){
    _info->keyframe_granule_shift=
     OC_MINI(31,oc_ilog(_ci->keyframe_frequency_force-1));
  }
  else _info->keyframe_granule_shift=0;
  /*Any colorspace that is not one of the two known values maps to
     TH_CS_UNSPECIFIED.*/
  if(_ci->colorspace==OC_CS_ITU_REC_470M){
    _info->colorspace=TH_CS_ITU_REC_470M;
  }
  else if(_ci->colorspace==OC_CS_ITU_REC_470BG){
    _info->colorspace=TH_CS_ITU_REC_470BG;
  }
  else _info->colorspace=TH_CS_UNSPECIFIED;
  /*Any pixel format that is not one of the three known values maps to
     TH_PF_RSVD.*/
  if(_ci->pixelformat==OC_PF_420)_info->pixel_fmt=TH_PF_420;
  else if(_ci->pixelformat==OC_PF_422)_info->pixel_fmt=TH_PF_422;
  else if(_ci->pixelformat==OC_PF_444)_info->pixel_fmt=TH_PF_444;
  else _info->pixel_fmt=TH_PF_RSVD;
}
/*420jpeg chroma samples are sited like:
  Y-------Y-------Y-------Y-------
  |       |       |       |
  |   BR  |       |   BR  |
  |       |       |       |
  Y-------Y-------Y-------Y-------
  |       |       |       |
  |       |       |       |
  |       |       |       |
  Y-------Y-------Y-------Y-------
  |       |       |       |
  |   BR  |       |   BR  |
  |       |       |       |
  Y-------Y-------Y-------Y-------
  |       |       |       |
  |       |       |       |
  |       |       |       |

  420mpeg2 chroma samples are sited like:
  Y-------Y-------Y-------Y-------
  |       |       |       |
  BR      |       BR      |
  |       |       |       |
  Y-------Y-------Y-------Y-------
  |       |       |       |
  |       |       |       |
  |       |       |       |
  Y-------Y-------Y-------Y-------
  |       |       |       |
  BR      |       BR      |
  |       |       |       |
  Y-------Y-------Y-------Y-------
  |       |       |       |
  |       |       |       |
  |       |       |       |

  A resampling filter shifts the chroma sites one quarter pixel (at the
   chroma plane's resolution) to the right.
  The 4:2:2 modes look exactly the same, except that there are twice as many
   chroma lines, and they are vertically co-sited with the luma samples in
   both the mpeg2 and jpeg cases (so no vertical resampling is needed).
  _y4m: Supplies the picture size and the destination chroma decimation.
  _dst: The output frame; the luma plane at its start is skipped, and the two
         chroma planes that follow it are written.
  _aux: The two source chroma planes, stored back to back.*/
static void y4m_convert_42xmpeg2_42xjpeg(y4m_input *_y4m,unsigned char *_dst,
 unsigned char *_aux){
  int plane_w;
  int plane_h;
  int edge;
  int plane;
  int row;
  int col;
  int acc;
  /*Only chroma is resampled, so step over the luma plane.*/
  _dst+=_y4m->pic_w*_y4m->pic_h;
  /*Chroma plane dimensions, rounded up for picture sizes that are not a
     multiple of the decimation factor.*/
  plane_w=(_y4m->pic_w+_y4m->dst_c_dec_h-1)/_y4m->dst_c_dec_h;
  plane_h=(_y4m->pic_h+_y4m->dst_c_dec_v-1)/_y4m->dst_c_dec_v;
  /*Index of the last sample in a row; taps beyond it are clamped to it.*/
  edge=plane_w-1;
  for(plane=1;plane<3;plane++){
    for(row=0;row<plane_h;row++){
      /*Filter: [4 -17 114 35 -9 1]/128, derived from a 6-tap Lanczos
         window.*/
      col=0;
      /*Left edge: taps that would fall before the row read sample 0.*/
      while(col<OC_MINI(plane_w,2)){
        acc=4*_aux[0]-17*_aux[OC_MAXI(col-1,0)]+114*_aux[col]
         +35*_aux[OC_MINI(col+1,edge)]-9*_aux[OC_MINI(col+2,edge)]
         +_aux[OC_MINI(col+3,edge)];
        _dst[col]=(unsigned char)OC_CLAMPI(0,(acc+64)>>7,255);
        col++;
      }
      /*Interior: all six taps lie inside the row.*/
      while(col<plane_w-3){
        acc=4*_aux[col-2]-17*_aux[col-1]+114*_aux[col]+35*_aux[col+1]
         -9*_aux[col+2]+_aux[col+3];
        _dst[col]=(unsigned char)OC_CLAMPI(0,(acc+64)>>7,255);
        col++;
      }
      /*Right edge: taps that would fall past the row read the last sample.*/
      while(col<plane_w){
        acc=4*_aux[col-2]-17*_aux[col-1]+114*_aux[col]
         +35*_aux[OC_MINI(col+1,edge)]-9*_aux[OC_MINI(col+2,edge)]
         +_aux[edge];
        _dst[col]=(unsigned char)OC_CLAMPI(0,(acc+64)>>7,255);
        col++;
      }
      _dst+=plane_w;
      _aux+=plane_w;
    }
  }
}
/*Computes the depth of the shallowest leaf in a binary Huffman sub-tree.
  _binode: The root of the sub-tree to examine.
           _binode->nbits must be 0 or 1.
  Return: The smallest depth at which a leaf occurs below _binode.
          0 means _binode is itself a leaf.*/
static int oc_huff_tree_mindepth(oc_huff_node *_binode){
  int shallowest;
  int other;
  /*A node with nbits!=0 is internal: recurse into both children and keep the
     smaller depth, plus one level for this node.*/
  if(_binode->nbits!=0){
    shallowest=oc_huff_tree_mindepth(_binode->nodes[0]);
    other=oc_huff_tree_mindepth(_binode->nodes[1]);
    if(other<shallowest)shallowest=other;
    return shallowest+1;
  }
  return 0;
}
/*420jpeg chroma samples are sited like:
  Y-------Y-------Y-------Y-------
  |       |       |       |
  |   BR  |       |   BR  |
  |       |       |       |
  Y-------Y-------Y-------Y-------
  |       |       |       |
  |       |       |       |
  |       |       |       |
  Y-------Y-------Y-------Y-------
  |       |       |       |
  |   BR  |       |   BR  |
  |       |       |       |
  Y-------Y-------Y-------Y-------
  |       |       |       |
  |       |       |       |
  |       |       |       |

  411 chroma samples are sited like:
  YBR-----Y-------Y-------Y-------
  |       |       |       |
  |       |       |       |
  |       |       |       |
  YBR-----Y-------Y-------Y-------
  |       |       |       |
  |       |       |       |
  |       |       |       |
  YBR-----Y-------Y-------Y-------
  |       |       |       |
  |       |       |       |
  |       |       |       |
  YBR-----Y-------Y-------Y-------
  |       |       |       |
  |       |       |       |
  |       |       |       |

  We use a filter to resample at site locations one eighth pixel (at the
   source chroma plane's horizontal resolution) and five eighths of a pixel
   to the right.
  Then we use another filter to decimate the planes by 2 in the vertical
   direction.
  _y4m: Supplies the picture size and the source/destination chroma
         decimation factors.
  _dst: The output frame; the luma plane at its start is skipped, and the two
         chroma planes that follow it are written.
  _aux: The two source chroma planes, stored back to back.
        The space directly after them (from _aux+2*c_w*c_h on) is used as
         scratch for one horizontally resampled plane.*/
static void y4m_convert_411_420jpeg(y4m_input *_y4m,unsigned char *_dst,
 unsigned char *_aux){
  unsigned char *tmp;
  int            c_w;
  int            c_h;
  int            c_sz;
  int            dst_c_w;
  int            dst_c_h;
  int            dst_c_sz;
  int            tmp_sz;
  int            pli;
  int            y;
  int            x;
  /*Skip past the luma data.*/
  _dst+=_y4m->pic_w*_y4m->pic_h;
  /*Compute the size of each chroma plane.*/
  c_w=(_y4m->pic_w+_y4m->src_c_dec_h-1)/_y4m->src_c_dec_h;
  c_h=_y4m->pic_h;
  dst_c_w=(_y4m->pic_w+_y4m->dst_c_dec_h-1)/_y4m->dst_c_dec_h;
  dst_c_h=(_y4m->pic_h+_y4m->dst_c_dec_v-1)/_y4m->dst_c_dec_v;
  c_sz=c_w*c_h;
  dst_c_sz=dst_c_w*dst_c_h;
  tmp_sz=dst_c_w*c_h;
  tmp=_aux+2*c_sz;
  for(pli=1;pli<3;pli++){
    /*In reality, the horizontal and vertical steps could be pipelined, for
       less memory consumption and better cache performance, but we do them
       separately for simplicity.*/
    /*First do horizontal filtering (convert to 422jpeg).*/
    for(y=0;y<c_h;y++){
      /*Filters: [1 110 18 -1]/128 and [-3 50 86 -5]/128, both derived from a
         4-tap Mitchell window.*/
      for(x=0;x<OC_MINI(c_w,1);x++){
        tmp[x<<1]=(unsigned char)OC_CLAMPI(0,(111*_aux[0]
         +18*_aux[OC_MINI(1,c_w-1)]-_aux[OC_MINI(2,c_w-1)]+64)>>7,255);
        /*A destination row may be only one sample wide, in which case the
           odd output sample does not exist and must not be written.*/
        if((x<<1|1)<dst_c_w){
          tmp[x<<1|1]=(unsigned char)OC_CLAMPI(0,(47*_aux[0]
           +86*_aux[OC_MINI(1,c_w-1)]-5*_aux[OC_MINI(2,c_w-1)]+64)>>7,255);
        }
      }
      for(;x<c_w-2;x++){
        tmp[x<<1]=(unsigned char)OC_CLAMPI(0,(_aux[x-1]+110*_aux[x]
         +18*_aux[x+1]-_aux[x+2]+64)>>7,255);
        tmp[x<<1|1]=(unsigned char)OC_CLAMPI(0,(-3*_aux[x-1]+50*_aux[x]
         +86*_aux[x+1]-5*_aux[x+2]+64)>>7,255);
      }
      for(;x<c_w;x++){
        tmp[x<<1]=(unsigned char)OC_CLAMPI(0,(_aux[x-1]+110*_aux[x]
         +18*_aux[OC_MINI(x+1,c_w-1)]-_aux[c_w-1]+64)>>7,255);
        if((x<<1|1)<dst_c_w){
          tmp[x<<1|1]=(unsigned char)OC_CLAMPI(0,(-3*_aux[x-1]+50*_aux[x]
           +86*_aux[OC_MINI(x+1,c_w-1)]-5*_aux[c_w-1]+64)>>7,255);
        }
      }
      tmp+=dst_c_w;
      _aux+=c_w;
    }
    /*Rewind the scratch pointer to the start of the plane just produced.*/
    tmp-=tmp_sz;
    /*Now do the vertical filtering.*/
    y4m_422jpeg_420jpeg_helper(_dst,tmp,dst_c_w,c_h);
    _dst+=dst_c_sz;
  }
}
/*Vertically filters a single plane to halve its height, taking it from 4:2:2
   to 4:2:0 chroma resolution.
  Several of the conversion routines share this as a helper.
  _dst: The output plane, _c_w samples wide; one row is written for every
         two source rows (rounding up).
  _src: The source plane, _c_w samples wide and _c_h rows tall.
  _c_w: The width of both planes.
  _c_h: The height of the source plane.*/
static void y4m_422jpeg_420jpeg_helper(unsigned char *_dst,
 const unsigned char *_src,int _c_w,int _c_h){
  int last_row;
  int col;
  int row;
  int acc;
  /*Rows past the bottom of the plane are clamped to this one.*/
  last_row=_c_h-1;
  /*Work down one column at a time.*/
  for(col=0;col<_c_w;col++){
    /*Filter: [3 -17 78 78 -17 3]/128, derived from a 6-tap Lanczos window.*/
    row=0;
    /*First output row: the two taps above the plane collapse onto row 0,
       which therefore carries a weight of 3-17+78=64.*/
    if(_c_h>0){
      acc=64*_src[0]+78*_src[OC_MINI(1,last_row)*_c_w]
       -17*_src[OC_MINI(2,last_row)*_c_w]
       +3*_src[OC_MINI(3,last_row)*_c_w];
      _dst[0]=OC_CLAMPI(0,(acc+64)>>7,255);
      row=2;
    }
    /*Interior: all six taps lie inside the plane.*/
    while(row<_c_h-3){
      acc=3*(_src[(row-2)*_c_w]+_src[(row+3)*_c_w])
       -17*(_src[(row-1)*_c_w]+_src[(row+2)*_c_w])
       +78*(_src[row*_c_w]+_src[(row+1)*_c_w]);
      _dst[(row>>1)*_c_w]=OC_CLAMPI(0,(acc+64)>>7,255);
      row+=2;
    }
    /*Bottom edge: taps below the plane read the last row instead.*/
    while(row<_c_h){
      acc=3*(_src[(row-2)*_c_w]+_src[last_row*_c_w])
       -17*(_src[(row-1)*_c_w]+_src[OC_MINI(row+2,last_row)*_c_w])
       +78*(_src[row*_c_w]+_src[OC_MINI(row+1,last_row)*_c_w]);
      _dst[(row>>1)*_c_w]=OC_CLAMPI(0,(acc+64)>>7,255);
      row+=2;
    }
    _src++;
    _dst++;
  }
}
/*420jpeg chroma samples are sited like:
  Y-------Y-------Y-------Y-------
  |       |       |       |
  |   BR  |       |   BR  |
  |       |       |       |
  Y-------Y-------Y-------Y-------
  |       |       |       |
  |       |       |       |
  |       |       |       |
  Y-------Y-------Y-------Y-------
  |       |       |       |
  |   BR  |       |   BR  |
  |       |       |       |
  Y-------Y-------Y-------Y-------
  |       |       |       |
  |       |       |       |
  |       |       |       |

  420mpeg2 chroma samples are sited like:
  Y-------Y-------Y-------Y-------
  |       |       |       |
  BR      |       BR      |
  |       |       |       |
  Y-------Y-------Y-------Y-------
  |       |       |       |
  |       |       |       |
  |       |       |       |
  Y-------Y-------Y-------Y-------
  |       |       |       |
  BR      |       BR      |
  |       |       |       |
  Y-------Y-------Y-------Y-------
  |       |       |       |
  |       |       |       |
  |       |       |       |

  A resampling filter shifts the chroma sites one quarter pixel (at the
   chroma plane's resolution) to the right.
  The 4:2:2 modes look exactly the same, except that there are twice as many
   chroma lines, and they are vertically co-sited with the luma samples in
   both the mpeg2 and jpeg cases (so no vertical resampling is needed).
  _dst: The output plane, _c_w samples wide and _c_h rows tall.
  _src: The source plane, with the same dimensions.
  _c_w: The width of the plane.
  _c_h: The height of the plane.*/
static void y4m_42xmpeg2_42xjpeg_helper(unsigned char *_dst,
 const unsigned char *_src,int _c_w,int _c_h){
  int edge;
  int row;
  int col;
  int acc;
  /*Index of the last sample in a row; taps beyond it are clamped to it.*/
  edge=_c_w-1;
  for(row=0;row<_c_h;row++){
    /*Filter: [4 -17 114 35 -9 1]/128, derived from a 6-tap Lanczos window.*/
    col=0;
    /*Left edge: taps that would fall before the row read sample 0.*/
    while(col<OC_MINI(_c_w,2)){
      acc=4*_src[0]-17*_src[OC_MAXI(col-1,0)]+114*_src[col]
       +35*_src[OC_MINI(col+1,edge)]-9*_src[OC_MINI(col+2,edge)]
       +_src[OC_MINI(col+3,edge)];
      _dst[col]=(unsigned char)OC_CLAMPI(0,(acc+64)>>7,255);
      col++;
    }
    /*Interior: all six taps lie inside the row.*/
    while(col<_c_w-3){
      acc=4*_src[col-2]-17*_src[col-1]+114*_src[col]+35*_src[col+1]
       -9*_src[col+2]+_src[col+3];
      _dst[col]=(unsigned char)OC_CLAMPI(0,(acc+64)>>7,255);
      col++;
    }
    /*Right edge: taps that would fall past the row read the last sample.*/
    while(col<_c_w){
      acc=4*_src[col-2]-17*_src[col-1]+114*_src[col]
       +35*_src[OC_MINI(col+1,edge)]-9*_src[OC_MINI(col+2,edge)]
       +_src[edge];
      _dst[col]=(unsigned char)OC_CLAMPI(0,(acc+64)>>7,255);
      col++;
    }
    _dst+=_c_w;
    _src+=_c_w;
  }
}
/*Chooses the Lagrangian multiplier for the current frame and, where
   permitted, up to two additional quantizer indices for adaptive
   quantization.
  Writes _enc->lambda, _enc->state.nqis, and possibly _enc->state.qis[1] and
   _enc->state.qis[2].
  _enc: The encoder context.
  _qti: The quantizer type index used to look up log_qavg and passed on to
         oc_enc_find_qi_for_target().*/
void oc_enc_calc_lambda(oc_enc_ctx *_enc,int _qti){
  ogg_int64_t lq;
  int         adaptive;
  int         nqis;
  int         qi1;
  int         qi;
  /*For now, lambda is fixed depending on the qi value and frame type:
      lambda=qscale*(qavg[qti][qi]**2),
     where qscale=0.2125.
    This was derived by exhaustively searching for the optimal quantizer for
     the AC coefficients in each block from a number of test sequences for a
     number of fixed lambda values and fitting the peaks of the resulting
     histograms (on the log(qavg) scale).
    The same model applies to both inter and intra frames.
    A more adaptive scheme might perform better.*/
  qi=_enc->state.qis[0];
  /*Without rate control, use the average quantizer for this qi.
    With rate control active, use the lambda for the _target_ quantizer
     instead.
    This allows us to scale to rates slightly lower than we'd normally be
     able to reach, and give the rate control a semblance of "fractional qi"
     precision.
    TODO: Add API for changing QI, and allow extra precision.*/
  if(_enc->state.info.target_bitrate<=0)lq=_enc->log_qavg[_qti][qi];
  else lq=_enc->rc.log_qtarget;
  /*The resulting lambda value is less than 0x500000.*/
  _enc->lambda=(int)oc_bexp64(2*lq-0x4780BD468D6B62BLL);
  /*Select additional quantizers.
    The R-D optimal block AC quantizer statistics suggest that the
     distribution is roughly Gaussian-like with a slight positive skew.
    K-means clustering on log_qavg to select 3 quantizers produces cluster
     centers of {log_qavg-0.6,log_qavg,log_qavg+0.7}.
    Experiments confirm these are relatively good choices.

    Although we do greedy R-D optimization of the qii flags to avoid switching
     too frequently, this becomes ineffective at low rates, either because we
     do a poor job of predicting the actual R-D cost, or the greedy
     optimization is not sufficient.
    Therefore adaptive quantization is disabled above an (experimentally
     suggested) threshold of log_qavg=7.00 (e.g., below INTRA qi=12 or
     INTER qi=20 with current matrices).
    This may need to be revised if the R-D cost estimation or qii flag
     optimization strategies change.*/
  adaptive=lq<(OC_Q57(56)>>3)&&!_enc->vp3_compatible&&
   _enc->sp_level<OC_SP_LEVEL_FAST_ANALYSIS;
  /*The primary quantizer in qis[0] always counts.*/
  nqis=1;
  if(adaptive){
    /*Candidate found by searching from qi-1 towards log target lq+0.7.*/
    qi1=oc_enc_find_qi_for_target(_enc,_qti,OC_MAXI(qi-1,0),0,
     lq+(OC_Q57(7)+5)/10);
    if(qi1!=qi){
      _enc->state.qis[nqis]=qi1;
      nqis++;
    }
    /*Candidate found by searching from qi+1 towards log target lq-0.6.
      It is skipped if it duplicates either quantizer already selected.*/
    qi1=oc_enc_find_qi_for_target(_enc,_qti,OC_MINI(qi+1,63),0,
     lq-(OC_Q57(6)+5)/10);
    if(qi1!=qi&&qi1!=_enc->state.qis[nqis-1]){
      _enc->state.qis[nqis]=qi1;
      nqis++;
    }
  }
  _enc->state.nqis=nqis;
}
/*422jpeg chroma samples are sited like:
  Y---BR--Y-------Y---BR--Y-------
  |       |       |       |
  |       |       |       |
  |       |       |       |
  Y---BR--Y-------Y---BR--Y-------
  |       |       |       |
  |       |       |       |
  |       |       |       |
  Y---BR--Y-------Y---BR--Y-------
  |       |       |       |
  |       |       |       |
  |       |       |       |
  Y---BR--Y-------Y---BR--Y-------
  |       |       |       |
  |       |       |       |
  |       |       |       |

  411 chroma samples are sited like:
  YBR-----Y-------Y-------Y-------
  |       |       |       |
  |       |       |       |
  |       |       |       |
  YBR-----Y-------Y-------Y-------
  |       |       |       |
  |       |       |       |
  |       |       |       |
  YBR-----Y-------Y-------Y-------
  |       |       |       |
  |       |       |       |
  |       |       |       |
  YBR-----Y-------Y-------Y-------
  |       |       |       |
  |       |       |       |
  |       |       |       |

  We use a filter to resample at site locations one eighth pixel (at the
   source chroma plane's horizontal resolution) and five eighths of a pixel
   to the right.
  _y4m: Supplies the picture size and the source/destination chroma
         decimation factors.
  _dst: The output frame; the luma plane at its start is skipped, and the two
         chroma planes that follow it are written.
  _aux: The two source chroma planes, stored back to back.*/
static void y4m_convert_411_422jpeg(y4m_input *_y4m,unsigned char *_dst,
 unsigned char *_aux){
  int c_w;
  int dst_c_w;
  int c_h;
  int pli;
  int y;
  int x;
  /*Skip past the luma data.*/
  _dst+=_y4m->pic_w*_y4m->pic_h;
  /*Compute the size of each chroma plane.*/
  c_w=(_y4m->pic_w+_y4m->src_c_dec_h-1)/_y4m->src_c_dec_h;
  dst_c_w=(_y4m->pic_w+_y4m->dst_c_dec_h-1)/_y4m->dst_c_dec_h;
  c_h=(_y4m->pic_h+_y4m->dst_c_dec_v-1)/_y4m->dst_c_dec_v;
  for(pli=1;pli<3;pli++){
    for(y=0;y<c_h;y++){
      /*Filters: [1 110 18 -1]/128 and [-3 50 86 -5]/128, both derived from a
         4-tap Mitchell window.*/
      for(x=0;x<OC_MINI(c_w,1);x++){
        _dst[x<<1]=(unsigned char)OC_CLAMPI(0,(111*_aux[0]+
         18*_aux[OC_MINI(1,c_w-1)]-_aux[OC_MINI(2,c_w-1)]+64)>>7,255);
        /*A destination row may be only one sample wide, in which case the
           odd output sample does not exist and must not be written (on the
           last row of the last plane it would land past the end of _dst).*/
        if((x<<1|1)<dst_c_w){
          _dst[x<<1|1]=(unsigned char)OC_CLAMPI(0,(47*_aux[0]+
           86*_aux[OC_MINI(1,c_w-1)]-5*_aux[OC_MINI(2,c_w-1)]+64)>>7,255);
        }
      }
      for(;x<c_w-2;x++){
        _dst[x<<1]=(unsigned char)OC_CLAMPI(0,(_aux[x-1]+110*_aux[x]+
         18*_aux[x+1]-_aux[x+2]+64)>>7,255);
        _dst[x<<1|1]=(unsigned char)OC_CLAMPI(0,(-3*_aux[x-1]+50*_aux[x]+
         86*_aux[x+1]-5*_aux[x+2]+64)>>7,255);
      }
      for(;x<c_w;x++){
        _dst[x<<1]=(unsigned char)OC_CLAMPI(0,(_aux[x-1]+110*_aux[x]+
         18*_aux[OC_MINI(x+1,c_w-1)]-_aux[c_w-1]+64)>>7,255);
        if((x<<1|1)<dst_c_w){
          _dst[x<<1|1]=(unsigned char)OC_CLAMPI(0,(-3*_aux[x-1]+50*_aux[x]+
           86*_aux[OC_MINI(x+1,c_w-1)]-5*_aux[c_w-1]+64)>>7,255);
        }
      }
      _dst+=dst_c_w;
      _aux+=c_w;
    }
  }
}
/*This format is only used for interlaced content, but is included for
   completeness.

  420jpeg chroma samples are sited like:
  Y-------Y-------Y-------Y-------
  |       |       |       |
  |   BR  |       |   BR  |
  |       |       |       |
  Y-------Y-------Y-------Y-------
  |       |       |       |
  |       |       |       |
  |       |       |       |
  Y-------Y-------Y-------Y-------
  |       |       |       |
  |   BR  |       |   BR  |
  |       |       |       |
  Y-------Y-------Y-------Y-------
  |       |       |       |
  |       |       |       |
  |       |       |       |

  420paldv chroma samples are sited like:
  YR------Y-------YR------Y-------
  |       |       |       |
  |       |       |       |
  |       |       |       |
  YB------Y-------YB------Y-------
  |       |       |       |
  |       |       |       |
  |       |       |       |
  YR------Y-------YR------Y-------
  |       |       |       |
  |       |       |       |
  |       |       |       |
  YB------Y-------YB------Y-------
  |       |       |       |
  |       |       |       |
  |       |       |       |

  We use a resampling filter to shift the site locations one quarter pixel (at
   the chroma plane's resolution) to the right.
  Then we use another filter to move the C_r location down one quarter pixel,
   and the C_b location up one quarter pixel.
  _y4m: Supplies the picture size and the destination chroma decimation.
  _dst: The output frame; the luma plane at its start is skipped, and the two
         chroma planes that follow it are written.
  _aux: The two source chroma planes, stored back to back.
        The space directly after them (from _aux+2*c_w*c_h on) is used as
         scratch for one horizontally resampled plane.*/
static void y4m_convert_42xpaldv_42xjpeg(y4m_input *_y4m,unsigned char *_dst,
 unsigned char *_aux){
  unsigned char *tmp;
  int            c_w;
  int            c_h;
  int            c_sz;
  int            pli;
  int            y;
  int            x;
  /*Skip past the luma data.*/
  _dst+=_y4m->pic_w*_y4m->pic_h;
  /*Compute the size of each chroma plane.
    The height is a vertical extent, so it is derived from the vertical
     decimation factor, as in the other converters.
    For 4:2:0 output both factors are equal, so the result is unchanged
     there.*/
  c_w=(_y4m->pic_w+1)/2;
  c_h=(_y4m->pic_h+_y4m->dst_c_dec_v-1)/_y4m->dst_c_dec_v;
  c_sz=c_w*c_h;
  /*First do the horizontal re-sampling.
    This is the same as the mpeg2 case, except that after the horizontal
     case, we need to apply a second vertical filter.*/
  tmp=_aux+2*c_sz;
  for(pli=1;pli<3;pli++){
    for(y=0;y<c_h;y++){
      /*Filter: [4 -17 114 35 -9 1]/128, derived from a 6-tap Lanczos
         window.*/
      for(x=0;x<OC_MINI(c_w,2);x++){
        tmp[x]=(unsigned char)OC_CLAMPI(0,(4*_aux[0]-17*_aux[OC_MAXI(x-1,0)]+
         114*_aux[x]+35*_aux[OC_MINI(x+1,c_w-1)]-9*_aux[OC_MINI(x+2,c_w-1)]+
         _aux[OC_MINI(x+3,c_w-1)]+64)>>7,255);
      }
      for(;x<c_w-3;x++){
        tmp[x]=(unsigned char)OC_CLAMPI(0,(4*_aux[x-2]-17*_aux[x-1]+
         114*_aux[x]+35*_aux[x+1]-9*_aux[x+2]+_aux[x+3]+64)>>7,255);
      }
      for(;x<c_w;x++){
        tmp[x]=(unsigned char)OC_CLAMPI(0,(4*_aux[x-2]-17*_aux[x-1]+
         114*_aux[x]+35*_aux[OC_MINI(x+1,c_w-1)]-9*_aux[OC_MINI(x+2,c_w-1)]+
         _aux[c_w-1]+64)>>7,255);
      }
      tmp+=c_w;
      _aux+=c_w;
    }
    switch(pli){
      case 1:{
        /*Rewind to the start of the plane just produced.*/
        tmp-=c_sz;
        /*Slide C_b up a quarter-pel.
          This is the same filter used above, but in the other order.*/
        for(x=0;x<c_w;x++){
          for(y=0;y<OC_MINI(c_h,3);y++){
            _dst[y*c_w]=(unsigned char)OC_CLAMPI(0,(tmp[0]-
             9*tmp[OC_MAXI(y-2,0)*c_w]+35*tmp[OC_MAXI(y-1,0)*c_w]+
             114*tmp[y*c_w]-17*tmp[OC_MINI(y+1,c_h-1)*c_w]+
             4*tmp[OC_MINI(y+2,c_h-1)*c_w]+64)>>7,255);
          }
          for(;y<c_h-2;y++){
            _dst[y*c_w]=(unsigned char)OC_CLAMPI(0,(tmp[(y-3)*c_w]-
             9*tmp[(y-2)*c_w]+35*tmp[(y-1)*c_w]+114*tmp[y*c_w]-
             17*tmp[(y+1)*c_w]+4*tmp[(y+2)*c_w]+64)>>7,255);
          }
          for(;y<c_h;y++){
            _dst[y*c_w]=(unsigned char)OC_CLAMPI(0,(tmp[(y-3)*c_w]-
             9*tmp[(y-2)*c_w]+35*tmp[(y-1)*c_w]+114*tmp[y*c_w]-
             17*tmp[OC_MINI(y+1,c_h-1)*c_w]+4*tmp[(c_h-1)*c_w]+64)>>7,255);
          }
          _dst++;
          tmp++;
        }
        /*The column walk advanced both pointers by one row; move _dst on to
           the next plane and put tmp back at the start of the scratch
           space.*/
        _dst+=c_sz-c_w;
        tmp-=c_w;
      }break;
      case 2:{
        /*Rewind to the start of the plane just produced.*/
        tmp-=c_sz;
        /*Slide C_r down a quarter-pel.
          This is the same as the horizontal filter.*/
        for(x=0;x<c_w;x++){
          for(y=0;y<OC_MINI(c_h,2);y++){
            _dst[y*c_w]=(unsigned char)OC_CLAMPI(0,(4*tmp[0]-
             17*tmp[OC_MAXI(y-1,0)*c_w]+114*tmp[y*c_w]+
             35*tmp[OC_MINI(y+1,c_h-1)*c_w]-9*tmp[OC_MINI(y+2,c_h-1)*c_w]+
             tmp[OC_MINI(y+3,c_h-1)*c_w]+64)>>7,255);
          }
          for(;y<c_h-3;y++){
            _dst[y*c_w]=(unsigned char)OC_CLAMPI(0,(4*tmp[(y-2)*c_w]-
             17*tmp[(y-1)*c_w]+114*tmp[y*c_w]+35*tmp[(y+1)*c_w]-
             9*tmp[(y+2)*c_w]+tmp[(y+3)*c_w]+64)>>7,255);
          }
          for(;y<c_h;y++){
            _dst[y*c_w]=(unsigned char)OC_CLAMPI(0,(4*tmp[(y-2)*c_w]-
             17*tmp[(y-1)*c_w]+114*tmp[y*c_w]+35*tmp[OC_MINI(y+1,c_h-1)*c_w]-
             9*tmp[OC_MINI(y+2,c_h-1)*c_w]+tmp[(c_h-1)*c_w]+64)>>7,255);
          }
          _dst++;
          tmp++;
        }
      }break;
    }
    /*For actual interlaced material, this would have to be done separately on
       each field, and the shift amounts would be different.
      C_r moves down 1/8, C_b up 3/8 in the top field, and C_r moves down 3/8,
       C_b up 1/8 in the bottom field.
      The corresponding filters would be:
       Down 1/8 (reverse order for up): [3 -11 125 15 -4 0]/128
       Down 3/8 (reverse order for up): [4 -19 98 56 -13 2]/128*/
  }
}
/*Perform a motion vector search for this macro block against a single
   reference frame.
  As a bonus, individual block motion vectors are computed as well, as much of
   the work can be shared.
  The actual motion vector is stored in the appropriate place in the
   oc_mb_enc_info structure.
  _enc:        The encoder context.
  _accum:      Drop frame/golden MV accumulators.
  _mbi:        The macro block index.
  _frame:      The frame to use for SATD calculations and refinement,
                either OC_FRAME_PREV or OC_FRAME_GOLD.
  _frame_full: The frame to perform the 1px search on, one of OC_FRAME_PREV,
                OC_FRAME_GOLD, OC_FRAME_PREV_ORIG, or OC_FRAME_GOLD_ORIG.*/
void oc_mcenc_search_frame(oc_enc_ctx *_enc,oc_mv _accum,int _mbi,int _frame,
 int _frame_full){
  /*Note: Traditionally this search is done using a rate-distortion objective
     function of the form D+lambda*R.
    However, xiphmont tested this and found it produced a small degradation,
     while requiring extra computation.
    This is most likely due to Theora's peculiar MV encoding scheme: MVs are
     not coded relative to a predictor, and the only truly cheap way to use a
     MV is in the LAST or LAST2 MB modes, which are not being considered here.
    Therefore if we use the MV found here, it's only because both LAST and
     LAST2 performed poorly, and therefore the MB is not likely to be uniform
     or suffer from the aperture problem.
    Furthermore we would like to re-use the MV found here for as many MBs as
     possible, so picking a slightly sub-optimal vector to save a bit or two
     may cause increased degradation in many blocks to come.
    We could artificially reduce lambda to compensate, but it's faster to just
     disable it entirely, and use D (the distortion) as the sole criterion.*/
  oc_mcenc_ctx mcenc;
  const ptrdiff_t *frag_buf_offs;
  const ptrdiff_t *fragis;
  const unsigned char *src;
  const unsigned char *ref;
  const unsigned char *satd_ref;
  int ystride;
  oc_mb_enc_info *embs;
  ogg_int32_t hit_cache[31];
  ogg_int32_t hitbit;
  unsigned best_block_err[4];
  unsigned block_err[4];
  unsigned best_err;
  int best_vec[2];
  int best_block_vec[4][2];
  int candx;
  int candy;
  int bi;
  embs=_enc->mb_info;
  /*Find some candidate motion vectors.*/
  oc_mcenc_find_candidates(_enc,&mcenc,_accum,_mbi,_frame);
  /*Clear the cache of locations we've examined.
    hit_cache holds one 32-bit word per vertical offset (index candy+15), with
     one bit per horizontal offset (bit candx+15).*/
  memset(hit_cache,0,sizeof(hit_cache));
  /*Start with the median predictor.*/
  candx=mcenc.candidates[0][0];
  candy=mcenc.candidates[0][1];
  hit_cache[candy+15]|=(ogg_int32_t)1<<candx+15;
  frag_buf_offs=_enc->state.frag_buf_offs;
  fragis=_enc->state.mb_maps[_mbi][0];
  /*src is the input frame, ref is the frame used for the full-pel search, and
     satd_ref is the frame selected by _frame.*/
  src=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[OC_FRAME_IO]];
  ref=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[_frame_full]];
  satd_ref=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[_frame]];
  ystride=_enc->state.ref_ystride[0];
  /*TODO: customize error function for speed/(quality+size) tradeoff.*/
  best_err=oc_mcenc_ysad_check_mbcandidate_fullpel(_enc,
   frag_buf_offs,fragis,candx,candy,src,ref,ystride,block_err);
  best_vec[0]=candx;
  best_vec[1]=candy;
  /*Per-block bests are only tracked when searching against OC_FRAME_PREV.*/
  if(_frame==OC_FRAME_PREV){
    for(bi=0;bi<4;bi++){
      best_block_err[bi]=block_err[bi];
      best_block_vec[bi][0]=candx;
      best_block_vec[bi][1]=candy;
    }
  }
  /*If this predictor fails, move on to set A.*/
  if(best_err>OC_YSAD_THRESH1){
    unsigned err;
    unsigned t2;
    int ncs;
    int ci;
    /*Compute the early termination threshold for set A.
      It starts from the largest error recorded for this macro block and up to
       three of its cneighbors, then adds a scaled margin and a fixed
       offset.*/
    t2=embs[_mbi].error[_frame];
    ncs=OC_MINI(3,embs[_mbi].ncneighbors);
    for(ci=0;ci<ncs;ci++){
      t2=OC_MAXI(t2,embs[embs[_mbi].cneighbors[ci]].error[_frame]);
    }
    t2+=(t2>>OC_YSAD_THRESH2_SCALE_BITS)+OC_YSAD_THRESH2_OFFSET;
    /*Examine the candidates in set A.*/
    for(ci=1;ci<mcenc.setb0;ci++){
      candx=mcenc.candidates[ci][0];
      candy=mcenc.candidates[ci][1];
      /*If we've already examined this vector, then we would be using it if it
         was better than what we are using.*/
      hitbit=(ogg_int32_t)1<<candx+15;
      if(hit_cache[candy+15]&hitbit)continue;
      hit_cache[candy+15]|=hitbit;
      err=oc_mcenc_ysad_check_mbcandidate_fullpel(_enc,
       frag_buf_offs,fragis,candx,candy,src,ref,ystride,block_err);
      if(err<best_err){
        best_err=err;
        best_vec[0]=candx;
        best_vec[1]=candy;
      }
      if(_frame==OC_FRAME_PREV){
        for(bi=0;bi<4;bi++)if(block_err[bi]<best_block_err[bi]){
          best_block_err[bi]=block_err[bi];
          best_block_vec[bi][0]=candx;
          best_block_vec[bi][1]=candy;
        }
      }
    }
    if(best_err>t2){
      /*Examine the candidates in set B.
        ci continues from where the set A loop stopped (mcenc.setb0).*/
      for(;ci<mcenc.ncandidates;ci++){
        candx=mcenc.candidates[ci][0];
        candy=mcenc.candidates[ci][1];
        hitbit=(ogg_int32_t)1<<candx+15;
        if(hit_cache[candy+15]&hitbit)continue;
        hit_cache[candy+15]|=hitbit;
        err=oc_mcenc_ysad_check_mbcandidate_fullpel(_enc,
         frag_buf_offs,fragis,candx,candy,src,ref,ystride,block_err);
        if(err<best_err){
          best_err=err;
          best_vec[0]=candx;
          best_vec[1]=candy;
        }
        if(_frame==OC_FRAME_PREV){
          for(bi=0;bi<4;bi++)if(block_err[bi]<best_block_err[bi]){
            best_block_err[bi]=block_err[bi];
            best_block_vec[bi][0]=candx;
            best_block_vec[bi][1]=candy;
          }
        }
      }
      /*Use the same threshold for set B as in set A.*/
      if(best_err>t2){
        int best_site;
        int nsites;
        int sitei;
        int site;
        int b;
        /*Square pattern search.*/
        for(;;){
          /*4 marks "no better site found this pass"; the search stops when it
             survives a full pass.*/
          best_site=4;
          /*Compose the bit flags for boundary conditions.*/
          b=OC_DIV16(-best_vec[0]+1)|OC_DIV16(best_vec[0]+1)<<1|
           OC_DIV16(-best_vec[1]+1)<<2|OC_DIV16(best_vec[1]+1)<<3;
          nsites=OC_SQUARE_NSITES[b];
          for(sitei=0;sitei<nsites;sitei++){
            site=OC_SQUARE_SITES[b][sitei];
            candx=best_vec[0]+OC_SQUARE_DX[site];
            candy=best_vec[1]+OC_SQUARE_DY[site];
            hitbit=(ogg_int32_t)1<<candx+15;
            if(hit_cache[candy+15]&hitbit)continue;
            hit_cache[candy+15]|=hitbit;
            err=oc_mcenc_ysad_check_mbcandidate_fullpel(_enc,
             frag_buf_offs,fragis,candx,candy,src,ref,ystride,block_err);
            if(err<best_err){
              best_err=err;
              best_site=site;
            }
            if(_frame==OC_FRAME_PREV){
              for(bi=0;bi<4;bi++)if(block_err[bi]<best_block_err[bi]){
                best_block_err[bi]=block_err[bi];
                best_block_vec[bi][0]=candx;
                best_block_vec[bi][1]=candy;
              }
            }
          }
          if(best_site==4)break;
          /*Re-center the pattern on the best site and go around again.*/
          best_vec[0]+=OC_SQUARE_DX[best_site];
          best_vec[1]+=OC_SQUARE_DY[best_site];
        }
        /*Final 4-MV search.*/
        /*Simply use 1/4 of the macro block set A and B threshold as the
           individual block threshold.*/
        if(_frame==OC_FRAME_PREV){
          t2>>=2;
          for(bi=0;bi<4;bi++){
            if(best_block_err[bi]>t2){
              /*Square pattern search.
                We do this in a slightly interesting manner.
                We continue to check the SAD of all four blocks in the
                 macro block.
                This gives us two things:
                 1) We can continue to use the hit_cache to avoid duplicate
                     checks.
                    Otherwise we could continue to read it, but not write to it
                     without saving and restoring it for each block.
                    Note that we could still eliminate a large number of
                     duplicate checks by taking into account the site we came
                     from when choosing the site list.
                    We can still do that to avoid extra hit_cache queries, and
                     it might even be a speed win.
                 2) It gives us a slightly better chance of escaping local
                     minima.
                    We would not be here if we weren't doing a fairly bad job
                     in finding a good vector, and checking these vectors can
                     save us from 100 to several thousand points off our SAD 1
                     in 15 times.
                TODO: Is this a good idea?
                Who knows.
                It needs more testing.*/
              for(;;){
                int bestx;
                int besty;
                int bj;
                bestx=best_block_vec[bi][0];
                besty=best_block_vec[bi][1];
                /*Compose the bit flags for boundary conditions.*/
                b=OC_DIV16(-bestx+1)|OC_DIV16(bestx+1)<<1|
                 OC_DIV16(-besty+1)<<2|OC_DIV16(besty+1)<<3;
                nsites=OC_SQUARE_NSITES[b];
                for(sitei=0;sitei<nsites;sitei++){
                  site=OC_SQUARE_SITES[b][sitei];
                  candx=bestx+OC_SQUARE_DX[site];
                  candy=besty+OC_SQUARE_DY[site];
                  hitbit=(ogg_int32_t)1<<candx+15;
                  if(hit_cache[candy+15]&hitbit)continue;
                  hit_cache[candy+15]|=hitbit;
                  err=oc_mcenc_ysad_check_mbcandidate_fullpel(_enc,
                   frag_buf_offs,fragis,candx,candy,src,ref,ystride,block_err);
                  if(err<best_err){
                    best_err=err;
                    best_vec[0]=candx;
                    best_vec[1]=candy;
                  }
                  for(bj=0;bj<4;bj++)if(block_err[bj]<best_block_err[bj]){
                    best_block_err[bj]=block_err[bj];
                    best_block_vec[bj][0]=candx;
                    best_block_vec[bj][1]=candy;
                  }
                }
                /*Stop once a full pass leaves this block's best vector
                   unchanged.*/
                if(best_block_vec[bi][0]==bestx&&best_block_vec[bi][1]==besty){
                  break;
                }
              }
            }
          }
        }
      }
/*Recomputes the rate control buffer bounds from the current target bitrate,
   frame rate and buf_delay, and, in pass-2 mode, grows the frame metrics
   buffer so it can hold statistics for the full buffer.
  If encoding has not yet begun, the whole rate control state is reset
   instead.
  _enc: The encoder context.*/
void oc_enc_rc_resize(oc_enc_ctx *_enc){
  /*If encoding has not yet begun, reset the buffer state.*/
  if(_enc->state.curframe_num<0)oc_enc_rc_reset(_enc);
  else{
    int idt;
    /*Otherwise, update the bounds on the buffer, but not the current
       fullness.*/
    _enc->rc.bits_per_frame=(_enc->state.info.target_bitrate*
     (ogg_int64_t)_enc->state.info.fps_denominator)/
     _enc->state.info.fps_numerator;
    /*Insane framerates or frame sizes mean insane bitrates.
      Let's not get carried away.*/
    if(_enc->rc.bits_per_frame>0x400000000000LL){
      _enc->rc.bits_per_frame=(ogg_int64_t)0x400000000000LL;
    }
    else if(_enc->rc.bits_per_frame<32)_enc->rc.bits_per_frame=32;
    _enc->rc.buf_delay=OC_MAXI(_enc->rc.buf_delay,12);
    _enc->rc.max=_enc->rc.bits_per_frame*_enc->rc.buf_delay;
    _enc->rc.target=(_enc->rc.max+1>>1)+(_enc->rc.bits_per_frame+2>>2)*
     OC_MINI(_enc->keyframe_frequency_force,_enc->rc.buf_delay);
    /*Update the INTER-frame scale filter delay.
      We jump to it immediately if we've already seen enough frames; otherwise
       it is simply set as the new target.*/
    _enc->rc.inter_delay_target=idt=OC_MAXI(_enc->rc.buf_delay>>1,10);
    if(idt<OC_MINI(_enc->rc.inter_delay,_enc->rc.inter_count)){
      oc_iir_filter_init(&_enc->rc.scalefilter[1],idt,
       _enc->rc.scalefilter[1].y[0]);
      _enc->rc.inter_delay=idt;
    }
  }
  /*If we're in pass-2 mode, make sure the frame metrics array is big enough
     to hold frame statistics for the full buffer.*/
  if(_enc->rc.twopass==2){
    int cfm;
    int buf_delay;
    int reset_window;
    buf_delay=_enc->rc.buf_delay;
    reset_window=_enc->rc.frame_metrics==NULL&&(_enc->rc.frames_total[0]==0||
     buf_delay<_enc->rc.frames_total[0]+_enc->rc.frames_total[1]
     +_enc->rc.frames_total[2]);
    cfm=_enc->rc.cframe_metrics;
    /*Only try to resize the frame metrics buffer if a) it's too small and
       b) we were using a finite buffer, or are about to start.*/
    if(cfm<buf_delay&&(_enc->rc.frame_metrics!=NULL||reset_window)){
      oc_frame_metrics *fm;
      int               nfm;
      int               fmh;
      fm=(oc_frame_metrics *)_ogg_realloc(_enc->rc.frame_metrics,
       buf_delay*sizeof(*_enc->rc.frame_metrics));
      if(fm==NULL){
        /*We failed to allocate a finite buffer.*/
        /*If we don't have a valid 2-pass header yet, just return; we'll reset
           the buffer size when we read the header.*/
        if(_enc->rc.frames_total[0]==0)return;
        /*Otherwise revert to the largest finite buffer previously set, or to
           whole-file buffering if we were still using that.*/
        _enc->rc.buf_delay=_enc->rc.frame_metrics!=NULL?
         cfm:_enc->rc.frames_total[0]+_enc->rc.frames_total[1]
         +_enc->rc.frames_total[2];
        oc_enc_rc_resize(_enc);
        return;
      }
      _enc->rc.frame_metrics=fm;
      _enc->rc.cframe_metrics=buf_delay;
      /*Re-organize the circular buffer.*/
      fmh=_enc->rc.frame_metrics_head;
      nfm=_enc->rc.nframe_metrics;
      if(fmh+nfm>cfm){
        int shift;
        /*The entries that had wrapped around to the front of the old buffer
           now continue past its old end, for as far as the new space
           allows.*/
        shift=OC_MINI(fmh+nfm-cfm,buf_delay-cfm);
        memcpy(fm+cfm,fm,shift*sizeof(*fm));
        /*Any wrapped entries that still do not fit slide down to the front.
          The count is in entries, so it must be scaled by the entry size to
           get the byte count memmove() expects.*/
        if(fmh+nfm>buf_delay){
          memmove(fm,fm+shift,(fmh+nfm-buf_delay)*sizeof(*fm));
        }
      }
    }
    /*We were using whole-file buffering; now we're not.*/
    if(reset_window){
      _enc->rc.nframes[0]=_enc->rc.nframes[1]=_enc->rc.nframes[2]=0;
      _enc->rc.scale_sum[0]=_enc->rc.scale_sum[1]=0;
      _enc->rc.scale_window_end=_enc->rc.scale_window0=
       _enc->state.curframe_num+_enc->prev_dup_count+1;
      if(_enc->rc.twopass_buffer_bytes){
        int qti;
        /*We already read the metrics for the first frame in the window.*/
        *(_enc->rc.frame_metrics)=*&_enc->rc.cur_metrics;
        _enc->rc.nframe_metrics++;
        qti=_enc->rc.cur_metrics.frame_type;
        _enc->rc.nframes[qti]++;
        _enc->rc.nframes[2]+=_enc->rc.cur_metrics.dup_count;
        _enc->rc.scale_sum[qti]+=oc_bexp_q24(_enc->rc.cur_metrics.log_scale);
        _enc->rc.scale_window_end+=_enc->rc.cur_metrics.dup_count+1;
        if(_enc->rc.scale_window_end-_enc->rc.scale_window0<buf_delay){
          /*We need more frame data.*/
          _enc->rc.twopass_buffer_bytes=0;
        }
      }
    }
    /*Otherwise, we could shrink the size of the current window, if necessary,
       but leaving it like it is lets us adapt to the new buffer size more
       gracefully.*/
  }
}
/*Re-initializes the rate control state from the current stream parameters:
   the buffer bounds and fullness, the initial quantizer-selection model, and
   the scale and frame-drop filters.
  _enc: The encoder context.*/
static void oc_enc_rc_reset(oc_enc_ctx *_enc){
  ogg_int64_t npixels;
  ogg_int64_t ibpp;
  int         inter_delay;
  int         model_exp;
  int         model_scale;
  /*TODO: These parameters should be exposed in a th_encode_ctl() API.*/
  _enc->rc.bits_per_frame=(_enc->state.info.target_bitrate*
   (ogg_int64_t)_enc->state.info.fps_denominator)/
   _enc->state.info.fps_numerator;
  /*Insane framerates or frame sizes mean insane bitrates.
    Let's not get carried away: keep the budget within [32,0x400000000000].*/
  if(_enc->rc.bits_per_frame<32)_enc->rc.bits_per_frame=32;
  else if(_enc->rc.bits_per_frame>0x400000000000LL){
    _enc->rc.bits_per_frame=(ogg_int64_t)0x400000000000LL;
  }
  /*The buffer always spans at least 12 frames.*/
  _enc->rc.buf_delay=OC_MAXI(_enc->rc.buf_delay,12);
  _enc->rc.max=_enc->rc.bits_per_frame*_enc->rc.buf_delay;
  /*Start with a buffer fullness of 50% plus 25% of the amount we plan to
     spend on a single keyframe interval.
    We can require fully half the bits in an interval for a keyframe, so this
     initial level gives us maximum flexibility for over/under-shooting in
     subsequent frames.*/
  _enc->rc.target=(_enc->rc.max+1>>1)+(_enc->rc.bits_per_frame+2>>2)*
   OC_MINI(_enc->keyframe_frequency_force,_enc->rc.buf_delay);
  _enc->rc.fullness=_enc->rc.target;
  /*No frames have been dropped or counted yet.*/
  _enc->rc.prev_drop_count=0;
  _enc->rc.log_drop_scale=OC_Q57(0);
  _enc->rc.inter_count=0;
  /*Pick exponents and initial scales for quantizer selection, based on the
     number of pixels per bit of per-frame budget.*/
  npixels=_enc->state.info.frame_width*
   (ogg_int64_t)_enc->state.info.frame_height;
  _enc->rc.log_npixels=oc_blog64(npixels);
  ibpp=npixels/_enc->rc.bits_per_frame;
  /*Model for index 0.*/
  if(ibpp>=2){
    model_exp=48;
    model_scale=834;
  }
  else if(ibpp>=1){
    model_exp=55;
    model_scale=1604;
  }
  else{
    model_exp=59;
    model_scale=1997;
  }
  _enc->rc.exp[0]=model_exp;
  _enc->rc.log_scale[0]=oc_blog64(model_scale)-OC_Q57(8);
  /*Model for index 1.*/
  if(ibpp>=8){
    model_exp=73;
    model_scale=1260;
  }
  else if(ibpp>=4){
    model_exp=95;
    model_scale=1751;
  }
  else{
    model_exp=100;
    model_scale=2249;
  }
  _enc->rc.exp[1]=model_exp;
  _enc->rc.log_scale[1]=oc_blog64(model_scale)-OC_Q57(8);
  /*Set up second order followers, initialized according to corresponding
     time constants.*/
  oc_iir_filter_init(&_enc->rc.scalefilter[0],4,
   oc_q57_to_q24(_enc->rc.log_scale[0]));
  /*The eventual delay for scalefilter[1] is half of either the buffer delay
     or, in two-pass mode, the forced keyframe interval (but at least 12).*/
  inter_delay=(_enc->rc.twopass?
   OC_MAXI(_enc->keyframe_frequency_force,12):_enc->rc.buf_delay)>>1;
  /*We clamp the actual inter_delay to a minimum of 10 to work within the
     range of values where later incrementing the delay works as designed.
    10 is not an exact choice, but rather a good working trade-off.
    The filter therefore starts at 10, and the value computed above is only
     recorded as the target.*/
  _enc->rc.inter_delay=10;
  _enc->rc.inter_delay_target=inter_delay;
  oc_iir_filter_init(&_enc->rc.scalefilter[1],_enc->rc.inter_delay,
   oc_q57_to_q24(_enc->rc.log_scale[1]));
  oc_iir_filter_init(&_enc->rc.vfrfilter,4,
   oc_bexp64_q24(_enc->rc.log_drop_scale));
}