// Runs the loop filter for one frame using several workers in parallel.
//
//   frame, cm, planes  forwarded to vp10_loop_filter_data_reset() for each
//                      worker's job description.
//   start, stop        row range to filter; worker k begins at
//                      start + k * MI_BLOCK_SIZE and every worker shares the
//                      same stop. How a worker advances between rows is up to
//                      loop_filter_row_worker (not visible in this block).
//   y_only             copied verbatim into each worker's job description.
//   workers, nworkers  worker pool supplied by the caller; at most
//                      min(nworkers, number of tile columns) entries are used.
//   lf_sync            shared row-synchronisation state; (re)allocated here
//                      when it is missing or too small for this frame.
//
// The function returns only after every worker it started has been synced.
static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame, VP10_COMMON *cm,
                                struct macroblockd_plane planes[MAX_MB_PLANE],
                                int start, int stop, int y_only,
                                VPxWorker *workers, int nworkers,
                                VP9LfSync *lf_sync) {
  const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
  // Number of superblock rows. Note that the alignment helper is named for
  // columns but is applied to the row count here.
  const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2;
  // The decoder may have been given more threads than there are tile columns
  // (the thread count comes from user input), so clamp to the tile columns.
  const int tile_cols = 1 << cm->log2_tile_cols;
  const int num_workers = VPXMIN(nworkers, tile_cols);
  const int last_worker = num_workers - 1;
  // The sync state must be rebuilt when it was never set up, when the frame
  // has a different number of superblock rows, or when more workers are
  // needed than it was allocated for.
  const int sync_is_stale = !lf_sync->sync_range ||
                            sb_rows != lf_sync->rows ||
                            num_workers > lf_sync->num_workers;
  int w;

  if (sync_is_stale) {
    vp10_loop_filter_dealloc(lf_sync);
    vp10_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width, num_workers);
  }

  // Mark every superblock row as "no column processed yet" (-1).
  memset(lf_sync->cur_sb_col, -1, sizeof(*lf_sync->cur_sb_col) * sb_rows);

  // Prepare and start one job per worker.
  //
  // Why num_workers is capped: running more loop-filter threads than there
  // are cores has been observed to hurt performance on Android. The system
  // only schedules the tile decode workers on as many cores as there are tile
  // columns, so extra loop-filter threads just contend with each other. If
  // the multithreading code changes, revisit this worker count.
  for (w = 0; w < num_workers; ++w) {
    VPxWorker *const worker = workers + w;
    LFWorkerData *const job = lf_sync->lfdata + w;

    // Describe the job first...
    vp10_loop_filter_data_reset(job, frame, cm, planes);
    job->start = start + w * MI_BLOCK_SIZE;
    job->stop = stop;
    job->y_only = y_only;

    // ...then bind it to the worker.
    worker->hook = (VPxWorkerHook)loop_filter_row_worker;
    worker->data1 = lf_sync;
    worker->data2 = job;

    // All workers but the last are launched; the last one goes through
    // execute() instead. NOTE(review): presumably execute() runs the hook
    // synchronously on the calling thread -- confirm against the worker
    // interface.
    if (w != last_worker) {
      winterface->launch(worker);
    } else {
      winterface->execute(worker);
    }
  }

  // Block until every worker has finished its rows.
  for (w = 0; w < num_workers; ++w) {
    winterface->sync(workers + w);
  }
}
// Tears down a decoder instance and releases the memory it owns.
//
//   pbi  decoder to destroy; may be NULL, in which case this is a no-op
//        (so cleanup paths can call it unconditionally, like free()).
//
// Teardown order:
//   1. end the loop-filter worker and free its data1 payload,
//   2. free the tile data,
//   3. end every tile worker, then free the tile worker arrays,
//   4. release the loop-filter row-sync state, but only when tile workers
//      existed (num_tile_workers > 0),
//   5. free the decoder object itself.
//
// pbi must not be used after this call.
void vp10_decoder_remove(VP10Decoder *pbi) {
  int i;

  // Guard against a NULL decoder (e.g. destruction after a failed create);
  // every statement below dereferences pbi.
  if (!pbi) return;

  vpx_get_worker_interface()->end(&pbi->lf_worker);
  vpx_free(pbi->lf_worker.data1);
  vpx_free(pbi->tile_data);

  // Workers must be ended before the array holding them is freed.
  for (i = 0; i < pbi->num_tile_workers; ++i) {
    VPxWorker *const worker = &pbi->tile_workers[i];
    vpx_get_worker_interface()->end(worker);
  }
  vpx_free(pbi->tile_worker_data);
  vpx_free(pbi->tile_worker_info);
  vpx_free(pbi->tile_workers);

  if (pbi->num_tile_workers > 0) {
    vp10_loop_filter_dealloc(&pbi->lf_row_sync);
  }

  vpx_free(pbi);
}