/*Installs the MMX-accelerated 8x8 block reconstruction routines into the
  dispatch table, overriding whatever (presumably C) implementations were
  there before.*/
void dsp_mmx_recon_init(DspFunctions *funcs) {
  TH_DEBUG("enabling accelerated x86_32 mmx recon functions.\n");
  /*Inter (predicted) reconstruction, whole- and half-pel variants.*/
  funcs->recon_inter8x8      = recon_inter8x8__mmx;
  funcs->recon_inter8x8_half = recon_inter8x8_half__mmx;
  /*Intra reconstruction and the plain block copy.*/
  funcs->recon_intra8x8      = recon_intra8x8__mmx;
  funcs->copy8x8             = copy8x8__mmx;
}
/*Initializes the dequantization tables from a set of quantizer info.
  Currently the dequantizer (and elsewhere enquantizer) tables are expected to
   be initialized as pointing to the storage reserved for them in the
   oc_theora_state (resp. oc_enc_ctx) structure.
  If some tables are duplicates of others, the pointers will be adjusted to
   point to a single copy of the tables, but the storage for them will not be
   freed.
  If you're concerned about the memory footprint, the obvious thing to do is
   to move the storage out of its fixed place in the structures and allocate
   it on demand.
  However, a much, much better option is to only store the quantization
   matrices being used for the current frame, and to recalculate these as the
   qi values change between frames (this is what VP3 did).
  _dequant:     The 2 (intra/inter) x 3 (Y/U/V) table of output pointers;
                 entries may be re-pointed at earlier entries when the
                 computed tables are identical.
  _pp_dc_scale: Optional (may be NULL) 64-entry output of DC scale values
                 for the postprocessor.
  _qinfo:       The quantizer parameters to build the tables from.*/
void oc_dequant_tables_init(oc_quant_table *_dequant[2][3],
 int _pp_dc_scale[64],const th_quant_info *_qinfo){
  int qti; /* coding mode: intra or inter */
  int pli; /* Y U V */
  for(qti=0;qti<2;qti++){
    for(pli=0;pli<3;pli++){
      /*Build the full 64-qi table for this (mode,plane) pair in a local
        staging buffer first, so it can be compared against previously
        committed tables for de-duplication below.*/
      oc_quant_tables stage;
      int qi;  /* quality index */
      int qri; /* range iterator */
      /*Each range spans sizes[qri] quality indices and is interpolated
        between base_matrices[qri] and base_matrices[qri+1]; the loop runs
        one extra iteration (qri==nranges) to emit the final endpoint.*/
      for(qi=0,qri=0; qri<=_qinfo->qi_ranges[qti][pli].nranges; qri++){
        th_quant_base base;
        ogg_uint32_t q;
        int qi_start;
        int qi_end;
        int ci;
        memcpy(base,_qinfo->qi_ranges[qti][pli].base_matrices[qri],
         sizeof(base));
        qi_start=qi;
        if(qri==_qinfo->qi_ranges[qti][pli].nranges)
          qi_end=qi+1;
        else
          qi_end=qi+_qinfo->qi_ranges[qti][pli].sizes[qri];
        /*Iterate over quality indices in this range.*/
        for(;;){
          /*In the original VP3.2 code, the rounding offset and the size of
             the dead zone around 0 were controlled by a "sharpness"
             parameter.
            The size of our dead zone is now controlled by the
             per-coefficient quality thresholds returned by our HVS module.
            We round down from a more accurate value when the quality of the
             reconstruction does not fall below our threshold and it saves
             bits.
            Hence, all of that VP3.2 code is gone from here, and the
             remaining floating point code has been implemented as
             equivalent integer code with exact precision.*/
          /*For postprocess, not dequant.*/
          if(_pp_dc_scale!=NULL)
            _pp_dc_scale[qi]=(int)((ogg_uint32_t)_qinfo->dc_scale[qi]*base[0]/160);
          /*Scale the DC coefficient from the proper table.
            The <<2 accounts for the 2-bit shift the DCT output carries.*/
          q=((ogg_uint32_t)_qinfo->dc_scale[qi]*base[0]/100)<<2;
          q=OC_CLAMPI(OC_DC_QUANT_MIN[qti],q,OC_QUANT_MAX);
          stage[qi][0]=(ogg_uint16_t)q;
          /*Now scale AC coefficients from the proper table.*/
          for(ci=1;ci<64;ci++){
            q=((ogg_uint32_t)_qinfo->ac_scale[qi]*base[ci]/100)<<2;
            q=OC_CLAMPI(OC_AC_QUANT_MIN[qti],q,OC_QUANT_MAX);
            stage[qi][ci]=(ogg_uint16_t)q;
          }
          if(++qi>=qi_end)break;
          /*Interpolate the next base matrix: a linear blend between the two
            endpoint matrices of this range, computed with an added half-LSB
            (the +sizes[qri] over a 2*sizes[qri] denominator) so the result
            rounds to nearest instead of truncating.*/
          for(ci=0;ci<64;ci++){
            base[ci]=(unsigned char)
             ((2*((qi_end-qi)*_qinfo->qi_ranges[qti][pli].base_matrices[qri][ci]+
             (qi-qi_start)*_qinfo->qi_ranges[qti][pli].base_matrices[qri+1][ci])
             +_qinfo->qi_ranges[qti][pli].sizes[qri])/
             (2*_qinfo->qi_ranges[qti][pli].sizes[qri]));
          }
        }
      }
      /*Staging matrices complete; commit to memory only if this isn't a
        duplicate of a preceding plane.
        This simple check helps us improve cache coherency later.*/
      {
        int dupe = 0;
        int i,j;
        /*Scan all (i,j) pairs that were committed before (qti,pli); the
          inner bound (i<qti?3:pli) restricts the current mode's row to
          planes already done.  Note the breaks leave (i,j) pointing at the
          matching table, which the dupe branch below relies on.*/
        for(i=0;i<=qti;i++){
          for(j=0;j<(i<qti?3:pli);j++){
            if(!memcmp(stage,_dequant[i][j],sizeof(stage))){
              dupe = 1;
              break;
            }
          }
          if(dupe)break;
        }
        if(dupe){
          /*Alias the earlier identical table instead of storing a copy.*/
          _dequant[qti][pli]=_dequant[i][j];
        }else{
          memcpy(_dequant[qti][pli],stage,sizeof(stage));
        }
      }
    }
  }
#ifdef _TH_DEBUG_
  int i, j, k, l;
  /*Dump the calculated quantizer tables.*/
  for(i=0;i<2;i++){
    for(j=0;j<3;j++){
      for(k=0;k<64;k++){
        TH_DEBUG("quantizer table [%s][%s][Q%d] = {",
         (i==0?"intra":"inter"),(j==0?"Y":(j==1?"U":"V")),k);
        for(l=0;l<64;l++){
          if((l&7)==0) TH_DEBUG("\n ");
          TH_DEBUG("%4d ",_dequant[i][j][k][l]);
        }
        TH_DEBUG("}\n");
      }
    }
  }
#endif
}
/*Unpacks the quantization parameters from the setup header bitstream.
  Fills in loop filter limits, AC/DC scale tables, and the per-(mode,plane)
   quantizer range structures (allocating their sizes and base_matrices
   arrays).
  On a malformed header, returns TH_EBADHEADER; the caller is responsible
   for cleaning up any partially constructed _qinfo in that case (the
   locally allocated base_mats scratch buffer is always freed here).
  Returns 0 on success.*/
int oc_quant_params_unpack(oggpack_buffer *_opb,
 th_quant_info *_qinfo){
  th_quant_base *base_mats; /* scratch array of shared base matrices */
  long val;
  int nbase_mats;
  int sizes[64];            /* staging for one plane's range sizes */
  int indices[64];          /* staging for one plane's matrix indices */
  int nbits;
  int bmi;
  int ci;
  int qti;
  int pli;
  int qri;
  int qi;
  int i;
  /*Loop filter limits: 3 bits give the width of each of the 64 entries.*/
  theorapackB_read(_opb,3,&val);
  nbits=(int)val;
  for(qi=0;qi<64;qi++){
    theorapackB_read(_opb,nbits,&val);
    _qinfo->loop_filter_limits[qi]=(unsigned char)val;
  }
  /*AC scale table: 4 bits give the entry width minus one.*/
  theorapackB_read(_opb,4,&val);
  nbits=(int)val+1;
  for(qi=0;qi<64;qi++){
    theorapackB_read(_opb,nbits,&val);
    _qinfo->ac_scale[qi]=(ogg_uint16_t)val;
  }
  /*DC scale table: same encoding as the AC table.*/
  theorapackB_read(_opb,4,&val);
  nbits=(int)val+1;
  for(qi=0;qi<64;qi++){
    theorapackB_read(_opb,nbits,&val);
    _qinfo->dc_scale[qi]=(ogg_uint16_t)val;
  }
  /*Shared pool of base matrices: a 9-bit count minus one, then 64 bytes
    per matrix.*/
  theorapackB_read(_opb,9,&val);
  nbase_mats=(int)val+1;
  base_mats=_ogg_malloc(nbase_mats*sizeof(base_mats[0]));
  for(bmi=0;bmi<nbase_mats;bmi++){
    for(ci=0;ci<64;ci++){
      theorapackB_read(_opb,8,&val);
      base_mats[bmi][ci]=(unsigned char)val;
    }
  }
  /*Matrix indices are coded with just enough bits to address the pool.*/
  nbits=oc_ilog(nbase_mats-1);
  /*Six (mode,plane) combinations: i=0..2 intra Y/U/V, i=3..5 inter Y/U/V.*/
  for(i=0;i<6;i++){
    th_quant_ranges *qranges;
    th_quant_base *qrbms;
    int *qrsizes;
    qti=i/3;
    pli=i%3;
    qranges=_qinfo->qi_ranges[qti]+pli;
    if(i>0){
      /*A zero flag means this plane reuses an earlier plane's ranges:
        either the same plane of the previous mode, or the immediately
        preceding (mode,plane) pair.  Note this copies the struct, so the
        pointers inside are shared, not duplicated.*/
      theorapackB_read1(_opb,&val);
      if(!val){
        int qtj;
        int plj;
        if(qti>0){
          theorapackB_read1(_opb,&val);
          if(val){
            qtj=qti-1;
            plj=pli;
          }
          else{
            qtj=(i-1)/3;
            plj=(i-1)%3;
          }
        }
        else{
          qtj=(i-1)/3;
          plj=(i-1)%3;
        }
        *qranges=*(_qinfo->qi_ranges[qtj]+plj);
        continue;
      }
    }
    /*First matrix index, then (size,index) pairs until all 63 qi steps
      are covered.*/
    theorapackB_read(_opb,nbits,&val);
    indices[0]=(int)val;
    for(qi=qri=0;qi<63;){
      theorapackB_read(_opb,oc_ilog(62-qi),&val);
      sizes[qri]=(int)val+1;
      qi+=(int)val+1;
      theorapackB_read(_opb,nbits,&val);
      indices[++qri]=(int)val;
    }
    /*Note: The caller is responsible for cleaning up any partially
       constructed qinfo.*/
    if(qi>63){
      _ogg_free(base_mats);
      return TH_EBADHEADER;
    }
    qranges->nranges=qri;
    qranges->sizes=qrsizes=(int *)_ogg_malloc(qri*sizeof(qrsizes[0]));
    memcpy(qrsizes,sizes,qri*sizeof(qrsizes[0]));
    qrbms=(th_quant_base *)_ogg_malloc((qri+1)*sizeof(qrbms[0]));
    qranges->base_matrices=(const th_quant_base *)qrbms;
    /*Copy the qri+1 referenced base matrices out of the shared pool,
      walking the indices from last to first.*/
    do{
      bmi=indices[qri];
      /*Note: The caller is responsible for cleaning up any partially
         constructed qinfo.*/
      if(bmi>=nbase_mats){
        _ogg_free(base_mats);
        return TH_EBADHEADER;
      }
      memcpy(qrbms[qri],base_mats[bmi],sizeof(qrbms[qri]));
    }
    while(qri-->0);
  }
#ifdef _TH_DEBUG_
  /*Dump the tables.*/
  {
    int i, j, k, l, m;
    TH_DEBUG("loop filter limits = {");
    for(i=0;i<64;){
      TH_DEBUG("\n ");
      for(j=0;j<16;i++,j++)
        TH_DEBUG("%3d ",_qinfo->loop_filter_limits[i]);
    }
    TH_DEBUG("\n}\n\n");
    TH_DEBUG("ac scale = {");
    for(i=0;i<64;){
      TH_DEBUG("\n ");
      for(j=0;j<16;i++,j++)
        TH_DEBUG("%3d ",_qinfo->ac_scale[i]);
    }
    TH_DEBUG("\n}\n\n");
    TH_DEBUG("dc scale = {");
    for(i=0;i<64;){
      TH_DEBUG("\n ");
      for(j=0;j<16;i++,j++)
        TH_DEBUG("%3d ",_qinfo->dc_scale[i]);
    }
    TH_DEBUG("\n}\n\n");
    for(k=0;k<2;k++)
      for(l=0;l<3;l++){
        char *name[2][3]={
          {"intra Y bases","intra U bases", "intra V bases"},
          {"inter Y bases","inter U bases", "inter V bases"}
        };
        th_quant_ranges *r = &_qinfo->qi_ranges[k][l];
        TH_DEBUG("%s = {\n",name[k][l]);
        TH_DEBUG(" ranges = %d\n",r->nranges);
        TH_DEBUG(" intervals = { ");
        for(i=0;i<r->nranges;i++)
          TH_DEBUG("%3d ",r->sizes[i]);
        TH_DEBUG("}\n");
        TH_DEBUG("\n matricies = { ");
        for(m=0;m<r->nranges+1;m++){
          TH_DEBUG("\n { ");
          for(i=0;i<64;){
            TH_DEBUG("\n ");
            for(j=0;j<8;i++,j++)
              TH_DEBUG("%3d ",r->base_matrices[m][i]);
          }
          TH_DEBUG("\n }");
        }
        TH_DEBUG("\n }\n");
      }
  }
#endif
  _ogg_free(base_mats);
  return 0;
}
/*Probes the host x86 CPU for SIMD capabilities via CPUID and returns the
   supported feature set as a bitmask of OC_CPU_X86_* flags.
  Returns 0 when CPUID is unavailable or no recognized SIMD extension is
   present.*/
ogg_uint32_t oc_cpu_flags_get(void){
  ogg_uint32_t flags = 0;
  ogg_uint32_t eax;
  ogg_uint32_t ebx;
  ogg_uint32_t ecx;
  ogg_uint32_t edx;
# if !defined(_MSC_VER) && !defined(__amd64__) && !defined(__x86_64__)
  /*Check for CPUID availability: on 32-bit x86 the CPUID instruction only
    exists if bit 21 (0x200000) of EFLAGS can be toggled.  64-bit CPUs (and
    the MSVC build) always have CPUID, so the probe is skipped there.*/
  __asm__ __volatile__(
   "pushfl\n\t"
   "pushfl\n\t"
   "popl %0\n\t"
   "movl %0,%1\n\t"
   "xorl $0x200000,%0\n\t"
   "pushl %0\n\t"
   "popfl\n\t"
   "pushfl\n\t"
   "popl %0\n\t"
   "popfl\n\t"
   :"=r" (eax), "=r" (ebx)
   :
   :"cc"
  );
  /*No cpuid.*/
  if(eax==ebx)return 0;
# endif /* GCC, x86_32 */
  /*Leaf 0 returns the vendor string in EBX:EDX:ECX.*/
  cpuid(0,eax,ebx,ecx,edx);
  if(ebx==0x756e6547&&edx==0x49656e69&&ecx==0x6c65746e){
    /*Intel ("GenuineIntel"):*/
inteltest:
    /*Leaf 1, EDX: bit 23 MMX, bit 25 SSE (implies Intel's MMX extensions),
      bit 26 SSE2.*/
    cpuid(1,eax,ebx,ecx,edx);
    if((edx&0x00800000)==0)return 0;
    flags=OC_CPU_X86_MMX;
    if(edx&0x02000000)flags|=OC_CPU_X86_MMXEXT|OC_CPU_X86_SSE;
    if(edx&0x04000000)flags|=OC_CPU_X86_SSE2;
  }
  /*NOTE: && binds tighter than ||, so this matches either full vendor
    string; parentheses are omitted but the grouping is as intended.*/
  else if(ebx==0x68747541&&edx==0x69746e65&&ecx==0x444d4163 ||
   ebx==0x646f6547&&edx==0x79622065&&ecx==0x43534e20){
    /*AMD ("AuthenticAMD"):*/
    /*Geode ("Geode by NSC"):*/
    /*Check the extended leaves exist; if not, fall back to the Intel-style
      feature test above.*/
    cpuid(0x80000000,eax,ebx,ecx,edx);
    if(eax<0x80000001)goto inteltest;
    /*Extended leaf 0x80000001, EDX: bit 23 MMX, bit 31 3DNow!,
      bit 30 extended 3DNow!, bit 22 AMD MMX extensions.*/
    cpuid(0x80000001,eax,ebx,ecx,edx);
    if((edx&0x00800000)==0)return 0;
    flags=OC_CPU_X86_MMX;
    if(edx&0x80000000)flags|=OC_CPU_X86_3DNOW;
    if(edx&0x40000000)flags|=OC_CPU_X86_3DNOWEXT;
    if(edx&0x00400000)flags|=OC_CPU_X86_MMXEXT;
  }
  else{
    /*Unrecognized vendor; implement me.*/
    flags=0;
  }
# ifdef DEBUG
  if (flags) {
    TH_DEBUG("vectorized instruction sets supported:");
    if (flags & OC_CPU_X86_MMX)      TH_DEBUG(" mmx");
    if (flags & OC_CPU_X86_MMXEXT)   TH_DEBUG(" mmxext");
    if (flags & OC_CPU_X86_SSE)      TH_DEBUG(" sse");
    if (flags & OC_CPU_X86_SSE2)     TH_DEBUG(" sse2");
    if (flags & OC_CPU_X86_3DNOW)    TH_DEBUG(" 3dnow");
    if (flags & OC_CPU_X86_3DNOWEXT) TH_DEBUG(" 3dnowext");
    TH_DEBUG("\n");
  }
# endif
  return flags;
}