/*
 * x264_cpu_restore: call x264_emms() when the given cpu flag word contains
 * any of the MMX / MMXEXT / 3DNow / 3DNowExt capability bits.
 * Does nothing for any other flag combination.
 */
void x264_cpu_restore( uint32_t cpu )
{
    /* All capability bits for which x264_emms() has to be issued. */
    const uint32_t mmx_family = X264_CPU_MMX | X264_CPU_MMXEXT
                              | X264_CPU_3DNOW | X264_CPU_3DNOWEXT;

    if( !(cpu & mmx_family) )
        return;

    x264_emms();
}
/*
 * x264_speedcontrol_new: allocate the speedcontrol state, store it in h->sc
 * and initialise it from h->param.
 *
 * Side effects on h->param:
 *   - sc.f_speed is forced to 1 when it is <= 0
 *   - sc.i_buffer_size is raised to at least 3
 *
 * If the allocation fails, h->sc is left NULL and nothing else is touched,
 * so callers can detect the failure by checking h->sc.
 */
void x264_speedcontrol_new( x264_t *h )
{
    x264_speedcontrol_t *sc = h->sc = x264_malloc( sizeof(x264_speedcontrol_t) );
    x264_emms();

    /* x264_malloc() may fail; never write through a NULL state pointer. */
    if( !sc )
        return;
    memset( sc, 0, sizeof(x264_speedcontrol_t) );

    if( h->param.sc.f_speed <= 0 )
        h->param.sc.f_speed = 1;

    /* Do the division in floating point: with integer operands
     * i_fps_num / i_fps_den would drop the fractional part
     * (e.g. 30000/1001 -> 29) and skew every value derived from fps. */
    sc->fps = (double)h->param.i_fps_num / h->param.i_fps_den;
    sc->spf = 1e6 / sc->fps;

    h->param.sc.i_buffer_size = X264_MAX( 3, h->param.sc.i_buffer_size );
    sc->buffer_size = h->param.sc.i_buffer_size * 1e6 / sc->fps;
    sc->buffer_fill = sc->buffer_size * h->param.sc.f_buffer_init;
    /* Keep the initial fill within [one frame period, whole buffer]. */
    sc->buffer_fill = x264_clip3( sc->buffer_fill, sc->spf, sc->buffer_size );
    sc->compensation_period = sc->buffer_size/4;

    sc->timestamp = x264_mdate();
    sc->preset = -1; /* no preset has been applied yet */
    sc->prev_frame = 0;

    sc->cplx_num = 3e3; //FIXME estimate initial complexity
    sc->cplx_den = .1;
    sc->cplx_decay = 1 - 1./h->param.sc.i_buffer_size;

    /* Seed the statistics so the first observed fill updates both extremes. */
    sc->stat.min_buffer = sc->buffer_size;
    sc->stat.max_buffer = 0;

    /* Snapshot of the parameters as they are at this point. */
    sc->user_param = h->param;
}
/****************************************************************************
 * x264_nal_encode:
 ****************************************************************************/
/*
 * Serialise one NAL / structure from nal->p_payload into dst and update
 * nal->i_payload to the number of bytes written.
 *
 * MPEG2 path:
 *   Writes the 3-byte prefix 00 00 01 and one start-code byte, then copies
 *   the payload unmodified. i_payload grows by those 4 bytes.
 *
 * Other path:
 *   - b_annexb set:   writes 00 00 01, preceded by one extra 00 when
 *                     nal->b_long_startcode is set.
 *   - b_annexb clear: reserves 4 bytes that are filled afterwards with the
 *                     big-endian size of what follows them.
 *   Then writes the one-byte NAL header and the payload via
 *   h->bsf.nal_escape().
 *
 * dst must be large enough for the output; no bounds are checked here.
 */
void x264_nal_encode( x264_t *h, uint8_t *dst, x264_nal_t *nal )
{
    uint8_t *src = nal->p_payload;
    uint8_t *end = nal->p_payload + nal->i_payload;
    uint8_t *orig_dst = dst;

    if( MPEG2 )
    {
        *dst++ = 0x00;
        *dst++ = 0x00;
        *dst++ = 0x01;
        /* Write correct startcode if the structure is a slice */
        if( nal->i_type > 0 && nal->i_type < 0xb0 )
            *dst++ = nal->i_type;
        else
            *dst++ = structure_to_start_code[nal->i_type];
        memcpy( dst, src, nal->i_payload );
        nal->i_payload += 4;
    }
    else
    {
        if( h->param.b_annexb )
        {
            if( nal->b_long_startcode )
                *dst++ = 0x00;
            *dst++ = 0x00;
            *dst++ = 0x00;
            *dst++ = 0x01;
        }
        else /* save room for size later */
            dst += 4;

        /* nal header */
        *dst++ = ( 0x00 << 7 ) | ( nal->i_ref_idc << 5 ) | nal->i_type;

        dst = h->bsf.nal_escape( dst, src, end );
        int size = (dst - orig_dst) - 4;

        /* Write the size header for mp4/etc */
        if( !h->param.b_annexb )
        {
            /* Size doesn't include the size of the header we're writing now. */
            orig_dst[0] = size>>24;
            orig_dst[1] = size>>16;
            orig_dst[2] = size>> 8;
            orig_dst[3] = size>> 0;
        }

        nal->i_payload = size+4;
        x264_emms();
    }
}
/*
 * check_pixel: compare the pixel function table built for cpu_new against
 * the plain C table (cpu flags 0).
 *
 * Three tables are initialised: pixel_c (flags 0), pixel_ref (cpu_ref) and
 * pixel_asm (cpu_new). An entry is only exercised when its pixel_asm pointer
 * differs from the pixel_ref pointer, i.e. when cpu_new selects a different
 * implementation than cpu_ref does.
 *
 * Inputs come from the buffers buf1/buf2/buf3, which are defined elsewhere.
 * Returns ret. It is initialised to 0 and not written directly here;
 * NOTE(review): report() is defined elsewhere and presumably folds
 * ok/used_asm into ret - confirm.
 */
static int check_pixel( int cpu_ref, int cpu_new )
{
    x264_pixel_function_t pixel_c;
    x264_pixel_function_t pixel_ref;
    x264_pixel_function_t pixel_asm;
    x264_predict_t predict_16x16[4+3];
    x264_predict_t predict_8x8c[4+3];
    x264_predict_t predict_4x4[9+3];
    x264_predict8x8_t predict_8x8[9+3];
    DECLARE_ALIGNED_16( uint8_t edge[33] );
    uint16_t cost_mv[32];
    int ret = 0, ok, used_asm;
    int i, j;

    /* Function tables: C reference, baseline cpu, cpu under test. */
    x264_pixel_init( 0, &pixel_c );
    x264_pixel_init( cpu_ref, &pixel_ref );
    x264_pixel_init( cpu_new, &pixel_asm );
    /* Predictors are always the flags-0 versions; they only generate the
     * reference predictions used by TEST_INTRA_SATD below. */
    x264_predict_16x16_init( 0, predict_16x16 );
    x264_predict_8x8c_init( 0, predict_8x8c );
    x264_predict_8x8_init( 0, predict_8x8 );
    x264_predict_4x4_init( 0, predict_4x4 );
    /* Fills edge[] from buf2+40; edge is passed on in the 8x8 intra test. */
    x264_predict_8x8_filter( buf2+40, edge, ALL_NEIGHBORS, ALL_NEIGHBORS );

    /* TEST_PIXEL( name, align ): for each of the 7 entries of table `name`,
     * call the C and the tested function on buf1 vs buf2+offset for j = 0..63
     * and require equal results. The offset is j*!align, so a non-zero
     * `align` always uses offset 0. Stops an entry at its first mismatch. */
#define TEST_PIXEL( name, align ) \
    for( i = 0, ok = 1, used_asm = 0; i < 7; i++ ) \
    { \
        int res_c, res_asm; \
        if( pixel_asm.name[i] != pixel_ref.name[i] ) \
        { \
            set_func_name( "%s_%s", #name, pixel_names[i] ); \
            for( j=0; j<64; j++ ) \
            { \
                used_asm = 1; \
                res_c = call_c( pixel_c.name[i], buf1, 16, buf2+j*!align, 64 ); \
                res_asm = call_a( pixel_asm.name[i], buf1, 16, buf2+j*!align, 64 ); \
                if( res_c != res_asm ) \
                { \
                    ok = 0; \
                    fprintf( stderr, #name "[%d]: %d != %d [FAILED]\n", i, res_c, res_asm ); \
                    break; \
                } \
            } \
        } \
    } \
    report( "pixel " #name " :" );

    TEST_PIXEL( sad, 0 );
    TEST_PIXEL( ssd, 1 );
    TEST_PIXEL( satd, 0 );
    TEST_PIXEL( sa8d, 0 );

    /* TEST_PIXEL_X( N ): check sad_x3 / sad_x4 against N separate calls of
     * the C sad function at candidate offsets +0, +6, +1 (and +10 for N==4).
     * Both result arrays are zero-initialised, so the unused 4th slot also
     * compares equal for N==3. The trailing call_c2 of the C sad_xN writes
     * into res_asm and its output is not checked afterwards. */
#define TEST_PIXEL_X( N ) \
    for( i = 0, ok = 1, used_asm = 0; i < 7; i++ ) \
    { \
        int res_c[4]={0}, res_asm[4]={0}; \
        if( pixel_asm.sad_x##N[i] && pixel_asm.sad_x##N[i] != pixel_ref.sad_x##N[i] ) \
        { \
            set_func_name( "sad_x%d_%s", N, pixel_names[i] ); \
            for( j=0; j<64; j++) \
            { \
                uint8_t *pix2 = buf2+j; \
                used_asm = 1; \
                res_c[0] = pixel_c.sad[i]( buf1, 16, pix2, 64 ); \
                res_c[1] = pixel_c.sad[i]( buf1, 16, pix2+6, 64 ); \
                res_c[2] = pixel_c.sad[i]( buf1, 16, pix2+1, 64 ); \
                if(N==4) \
                { \
                    res_c[3] = pixel_c.sad[i]( buf1, 16, pix2+10, 64 ); \
                    call_a( pixel_asm.sad_x4[i], buf1, pix2, pix2+6, pix2+1, pix2+10, 64, res_asm ); \
                } \
                else \
                    call_a( pixel_asm.sad_x3[i], buf1, pix2, pix2+6, pix2+1, 64, res_asm ); \
                if( memcmp(res_c, res_asm, sizeof(res_c)) ) \
                { \
                    ok = 0; \
                    fprintf( stderr, "sad_x"#N"[%d]: %d,%d,%d,%d != %d,%d,%d,%d [FAILED]\n", \
                             i, res_c[0], res_c[1], res_c[2], res_c[3], \
                             res_asm[0], res_asm[1], res_asm[2], res_asm[3] ); \
                } \
                if(N==4) \
                    call_c2( pixel_c.sad_x4[i], buf1, pix2, pix2+6, pix2+1, pix2+10, 64, res_asm ); \
                else \
                    call_c2( pixel_c.sad_x3[i], buf1, pix2, pix2+6, pix2+1, 64, res_asm ); \
            } \
        } \
    } \
    report( "pixel sad_x"#N" :" );

    TEST_PIXEL_X(3);
    TEST_PIXEL_X(4);

    /* TEST_PIXEL_VAR( i ): single call on buf1; both the return value and
     * the value written through the last pointer argument must match.
     * Unlike the macros above it does not reset ok/used_asm itself. */
#define TEST_PIXEL_VAR( i ) \
    if( pixel_asm.var[i] != pixel_ref.var[i] ) \
    { \
        uint32_t res_c, res_asm; \
        uint32_t sad_c, sad_asm; \
        set_func_name( "%s_%s", "var", pixel_names[i] ); \
        used_asm = 1; \
        res_c = call_c( pixel_c.var[i], buf1, 16, &sad_c ); \
        res_asm = call_a( pixel_asm.var[i], buf1, 16, &sad_asm ); \
        if( (res_c != res_asm) || (sad_c != sad_asm) ) \
        { \
            ok = 0; \
            fprintf( stderr, "var[%d]: %d,%d != %d,%d [FAILED]\n", i, res_c, sad_c, res_asm, sad_asm ); \
        } \
    }

    ok = 1; used_asm = 0;
    TEST_PIXEL_VAR( PIXEL_16x16 );
    TEST_PIXEL_VAR( PIXEL_8x8 );
    report( "pixel var :" );

    /* TEST_INTRA_SATD( name, pred, satd, i8x8, ... ): build the reference by
     * running the first three predictors of `pred` into a copy of buf2
     * (buf3) and scoring each with the C `satd` entry, then compare with the
     * three results the tested `name` function writes in one call. When i8x8
     * is non-zero the tested function receives edge instead of buf3+40.
     * The variadic part is forwarded to the predictor call. */
#define TEST_INTRA_SATD( name, pred, satd, i8x8, ... ) \
    if( pixel_asm.name && pixel_asm.name != pixel_ref.name ) \
    { \
        int res_c[3], res_asm[3]; \
        set_func_name( #name );\
        used_asm = 1; \
        memcpy( buf3, buf2, 1024 ); \
        for( i=0; i<3; i++ ) \
        { \
            pred[i]( buf3+40, ##__VA_ARGS__ ); \
            res_c[i] = pixel_c.satd( buf1+40, 16, buf3+40, 32 ); \
        } \
        call_a( pixel_asm.name, buf1+40, i8x8 ? edge : buf3+40, res_asm ); \
        if( memcmp(res_c, res_asm, sizeof(res_c)) ) \
        { \
            ok = 0; \
            fprintf( stderr, #name": %d,%d,%d != %d,%d,%d [FAILED]\n", \
                     res_c[0], res_c[1], res_c[2], \
                     res_asm[0], res_asm[1], res_asm[2] ); \
        } \
    }

    ok = 1; used_asm = 0;
    TEST_INTRA_SATD( intra_satd_x3_16x16, predict_16x16, satd[PIXEL_16x16], 0 );
    TEST_INTRA_SATD( intra_satd_x3_8x8c, predict_8x8c, satd[PIXEL_8x8], 0 );
    TEST_INTRA_SATD( intra_satd_x3_4x4, predict_4x4, satd[PIXEL_4x4], 0 );
    TEST_INTRA_SATD( intra_sa8d_x3_8x8, predict_8x8, sa8d[PIXEL_8x8], 1, edge );
    report( "intra satd_x3 :" );

    /* SSIM: run when either ssim helper differs from the baseline. The two
     * whole-plane results are floats and must agree to within 1e-6. The
     * calls after that comparison do not check their output. */
    if( pixel_asm.ssim_4x4x2_core != pixel_ref.ssim_4x4x2_core ||
        pixel_asm.ssim_end4 != pixel_ref.ssim_end4 )
    {
        float res_c, res_a;
        int sums[5][4] = {{0}};
        used_asm = ok = 1;
        x264_emms();
        res_c = x264_pixel_ssim_wxh( &pixel_c, buf1+2, 32, buf2+2, 32, 32, 28 );
        res_a = x264_pixel_ssim_wxh( &pixel_asm, buf1+2, 32, buf2+2, 32, 32, 28 );
        if( fabs(res_c - res_a) > 1e-6 )
        {
            ok = 0;
            fprintf( stderr, "ssim: %.7f != %.7f [FAILED]\n", res_c, res_a );
        }
        set_func_name( "ssim_core" );
        call_c2( pixel_c.ssim_4x4x2_core, buf1+2, 32, buf2+2, 32, sums );
        call_a2( pixel_asm.ssim_4x4x2_core, buf1+2, 32, buf2+2, 32, sums );
        set_func_name( "ssim_end" );
        call_c2( pixel_c.ssim_end4, sums, sums, 4 );
        call_a2( pixel_asm.ssim_end4, sums, sums, 4 );
        report( "ssim :" );
    }

    /* ADS: up to 100 rounds, cycling through the 4 ads entries (i&3), with
     * random sums/dc/thresh masked to 14 bits. The loop condition stops at
     * the first failure. Both the returned count and the first mvn_c output
     * entries must match. */
    ok = 1; used_asm = 0;
    for( i=0; i<32; i++ )
        cost_mv[i] = i*10;
    for( i=0; i<100 && ok; i++ )
        if( pixel_asm.ads[i&3] != pixel_ref.ads[i&3] )
        {
            DECLARE_ALIGNED_16( uint16_t sums[72] );
            DECLARE_ALIGNED_16( int dc[4] );
            int16_t mvs_a[32], mvs_c[32];
            int mvn_a, mvn_c;
            int thresh = rand() & 0x3fff;
            set_func_name( "esa_ads" );
            for( j=0; j<72; j++ )
                sums[j] = rand() & 0x3fff;
            for( j=0; j<4; j++ )
                dc[j] = rand() & 0x3fff;
            used_asm = 1;
            mvn_c = call_c( pixel_c.ads[i&3], dc, sums, 32, cost_mv, mvs_c, 28, thresh );
            mvn_a = call_a( pixel_asm.ads[i&3], dc, sums, 32, cost_mv, mvs_a, 28, thresh );
            if( mvn_c != mvn_a || memcmp( mvs_c, mvs_a, mvn_c*sizeof(*mvs_c) ) )
            {
                ok = 0;
                /* Dump both result lists; note this goes to stdout, unlike
                 * the other failure messages in this function. */
                printf("c%d: ", i&3);
                for(j=0; j<mvn_c; j++)
                    printf("%d ", mvs_c[j]);
                printf("\na%d: ", i&3);
                for(j=0; j<mvn_a; j++)
                    printf("%d ", mvs_a[j]);
                printf("\n\n");
            }
        }
    report( "esa ads:" );

    return ret;
}
/*
 * x264_speedcontrol_frame: per-frame speedcontrol step.
 *
 *  1. Updates the modelled buffer fill from the frames finished and the
 *     wall-clock time elapsed since the previous call.
 *  2. When at least one frame finished, folds the measured time into the
 *     decaying complexity estimate and the preset statistics.
 *  3. Clamps an over-full buffer (logging the idle time at most once per
 *     1e6 timestamp units) or warns about an underflow.
 *  4. Chooses a fractional preset index from the buffer state and applies
 *     it through dither() / apply_preset().
 *
 * Uses function-local static accumulators for the idle and debug
 * statistics, so those are shared by every caller.
 */
void x264_speedcontrol_frame( x264_t *h )
{
    x264_speedcontrol_t *sc = h->sc;

    x264_emms();

    // update buffer state after encoding and outputting the previous frame(s)
    const int64_t now = x264_mdate();
    const int frames_done = h->i_frame - sc->prev_frame;
    const int64_t elapsed = now - sc->timestamp;
    const int64_t fill_change = frames_done * sc->spf / h->param.sc.f_speed - elapsed;

    sc->buffer_fill += fill_change;
    sc->prev_frame = h->i_frame;
    sc->timestamp = now;

    // update the time predictor
    if( frames_done != 0 )
    {
        int cpu_time = h->param.sc.b_alt_timer ? sc->cpu_time : elapsed;
        float fade = powf( sc->cplx_decay, frames_done );

        /* Age the running averages, then add the newest sample. */
        sc->cplx_num *= fade;
        sc->cplx_num += cpu_time / presets[sc->preset].time;
        sc->cplx_den *= fade;
        sc->cplx_den += frames_done;

        sc->stat.avg_preset += sc->preset * frames_done;
        sc->stat.den += frames_done;
    }

    sc->stat.min_buffer = X264_MIN( sc->buffer_fill, sc->stat.min_buffer );
    sc->stat.max_buffer = X264_MAX( sc->buffer_fill, sc->stat.max_buffer );

    if( sc->buffer_fill > sc->buffer_size )
    {
        // oops, cpu was idle
        // not really an error, but we'll warn for debugging purposes
        static int64_t idle_total = 0, last_report = 0;

        idle_total += sc->buffer_fill - sc->buffer_size;
        if( now - last_report > 1e6 )
        {
            x264_log( h, X264_LOG_WARNING, "speedcontrol idle (%.6f sec)\n", idle_total/1e6 );
            last_report = now;
            idle_total = 0;
        }
        sc->buffer_fill = sc->buffer_size;
    }
    else if( sc->buffer_fill < 0 && fill_change < 0 )
    {
        // oops, we're late
        // don't clip fullness to 0; we'll hope the real buffer was bigger than
        // specified, and maybe we can catch up. if the application had to drop
        // frames, then it should override the buffer fullness (FIXME implement this).
        x264_log( h, X264_LOG_WARNING, "speedcontrol underflow (%.6f sec)\n", sc->buffer_fill/1e6 );
    }

    // pick the preset that should return the buffer to 3/4-full within a time
    // specified by compensation_period
    float target = sc->spf / h->param.sc.f_speed
                 * (sc->buffer_fill + sc->compensation_period)
                 / (sc->buffer_size*3/4 + sc->compensation_period);
    float cplx = sc->cplx_num / sc->cplx_den;
    float filled = (float) sc->buffer_fill / sc->buffer_size;

    /* Walk up the preset table until the predicted time reaches the target
     * or the last preset is hit. lo/hi bracket the target afterwards.
     * The test is written as !(hi >= target) on purpose so that a NaN keeps
     * the search going exactly as a plain ">= then break" would. */
    int idx = 1;
    float lo = presets[0].time * cplx;
    float hi = presets[1].time * cplx;
    while( !(hi >= target) && idx != PRESETS-1 )
    {
        lo = hi;
        idx++;
        hi = presets[idx].time * cplx;
    }

    // linear interpolation between states
    float set = idx-1 + (target - lo) / (hi - lo);
    // Even if our time estimations in the PRESETS array are off
    // this will push us towards our target fullness
    set += (20 * (filled-0.75));
    set = x264_clip3f(set,0,PRESETS-1);
    apply_preset( h, dither( sc, set ) );

    // FIXME
    if( h->param.i_log_level >= X264_LOG_DEBUG )
    {
        static float cpu, wall, tgt, den;
        float decay = 1-1/100.;

        cpu = cpu*decay + sc->cpu_time;
        wall = wall*decay + elapsed;
        tgt = tgt*decay + target;
        den = den*decay + 1;
        fprintf( stderr, "speed: %.2f %d[%.5f] (t/c/w: %6.0f/%6.0f/%6.0f = %.4f) fps=%.2f\r",
                 set, sc->preset, (float)sc->buffer_fill / sc->buffer_size,
                 tgt/den, cpu/den, wall/den, cpu/wall, 1e6*den/wall );
    }
}