// Blocks until the next output worker (round-robin order) finishes its
// frame, then moves the decoded frame, if any, into the codec's output
// frame cache for later retrieval by decoder_get_frame().
static void wait_worker_and_cache_frame(vpx_codec_alg_priv_t *ctx) {
  YV12_BUFFER_CONFIG sd;
  vp9_ppflags_t flags = {0, 0, 0};
  const VP9WorkerInterface *const winterface = vp9_get_worker_interface();
  VP9Worker *const worker = &ctx->frame_workers[ctx->next_output_worker_id];
  FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
  // Advance the round-robin output cursor before waiting on the worker.
  ctx->next_output_worker_id =
      (ctx->next_output_worker_id + 1) % ctx->num_frame_workers;
  // TODO(hkuang): Add worker error handling here.
  winterface->sync(worker);
  // Worker is idle again: clear its pending-frame flag and return its
  // slot to the available-thread count.
  frame_worker_data->received_frame = 0;
  ++ctx->available_threads;

  check_resync(ctx, frame_worker_data->pbi);

  if (vp9_get_raw_frame(frame_worker_data->pbi, &sd, &flags) == 0) {
    VP9_COMMON *const cm = &frame_worker_data->pbi->common;
    RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
    // Record the frame-buffer index alongside the image so the underlying
    // buffer can be released when this cached entry is finally output.
    ctx->frame_cache[ctx->frame_cache_write].fb_idx = cm->new_fb_idx;
    yuvconfig2image(&ctx->frame_cache[ctx->frame_cache_write].img, &sd,
                    frame_worker_data->user_priv);
    ctx->frame_cache[ctx->frame_cache_write].img.fb_priv =
        frame_bufs[cm->new_fb_idx].raw_frame_buffer.priv;
    ctx->frame_cache_write = (ctx->frame_cache_write + 1) % FRAME_CACHE_SIZE;
    ++ctx->num_cache_frames;
  }
}
// Creates the pool of encoder worker threads and the per-superblock-row
// synchronization state used by multi-threaded encoding.
// Fix: the thread-creation failure message previously said "Tile decoder
// thread creation failed" although this is encoder-side code.
void vp9_create_encoding_threads(VP9_COMP *cpi) {
  VP9_COMMON *const cm = &cpi->common;
  const VP9WorkerInterface *const winterface = vp9_get_worker_interface();
  int i;

  CHECK_MEM_ERROR(cm, cpi->enc_thread_hndl,
                  vpx_malloc(sizeof(*cpi->enc_thread_hndl) *
                             cpi->max_threads));
  for (i = 0; i < cpi->max_threads; ++i) {
    VP9Worker *const worker = &cpi->enc_thread_hndl[i];
    winterface->init(worker);
    CHECK_MEM_ERROR(cm, worker->data1,
                    vpx_memalign(32, sizeof(thread_context)));
    worker->data2 = NULL;
    // The last worker runs on the calling thread, so only the first
    // max_threads - 1 workers need a real OS thread created for them.
    if (i < cpi->max_threads - 1 && !winterface->reset(worker)) {
      vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
                         "Tile encoder thread creation failed");
    }
  }

  // Set the row encoding hook once every worker is idle.
  for (i = 0; i < cpi->max_threads; ++i) {
    winterface->sync(&cpi->enc_thread_hndl[i]);
    cpi->enc_thread_hndl[i].hook = (VP9WorkerHook) encoding_thread_process;
  }

  CHECK_MEM_ERROR(cm, cpi->cur_sb_col,
                  vpx_malloc(sizeof(*cpi->cur_sb_col) * cm->sb_rows));
  // init cur sb col: -1 means no column of that SB row is encoded yet.
  vpx_memset(cpi->cur_sb_col, -1, (sizeof(*cpi->cur_sb_col) * cm->sb_rows));
  // set up nsync (currently unused).
  cpi->sync_range = get_sync_range(cpi->oxcf.width);
}
// Tears down the decoder context: joins and destroys every frame worker
// and its per-worker decoder state, then releases the shared buffer pool
// and finally the context itself.
static vpx_codec_err_t decoder_destroy(vpx_codec_alg_priv_t *ctx) {
  if (ctx->frame_workers != NULL) {
    int worker_idx;
    for (worker_idx = 0; worker_idx < ctx->num_frame_workers; ++worker_idx) {
      VP9Worker *const worker = &ctx->frame_workers[worker_idx];
      FrameWorkerData *const frame_worker_data =
          (FrameWorkerData *)worker->data1;
      // Stop the worker thread before touching anything it may own.
      vp9_get_worker_interface()->end(worker);
      vp9_remove_common(&frame_worker_data->pbi->common);
#if CONFIG_VP9_POSTPROC
      vp9_free_postproc_buffers(&frame_worker_data->pbi->common);
#endif
      vp9_decoder_remove(frame_worker_data->pbi);
      vpx_free(frame_worker_data->scratch_buffer);
#if CONFIG_MULTITHREAD
      pthread_mutex_destroy(&frame_worker_data->stats_mutex);
      pthread_cond_destroy(&frame_worker_data->stats_cond);
#endif
      vpx_free(frame_worker_data);
    }
#if CONFIG_MULTITHREAD
    pthread_mutex_destroy(&ctx->buffer_pool->pool_mutex);
#endif
  }

  if (ctx->buffer_pool) {
    vp9_free_ref_frame_buffers(ctx->buffer_pool);
    vp9_free_internal_frame_buffers(&ctx->buffer_pool->int_frame_buffers);
  }

  vpx_free(ctx->frame_workers);
  vpx_free(ctx->buffer_pool);
  vpx_free(ctx);
  return VPX_CODEC_OK;
}
// VP9 Encoder: Implement multi-threaded loopfilter that uses the threads // used for encoding. void vp9e_loop_filter_frame_mt(VP9_COMP *cpi, int frame_filter_level, int y_only, int partial_frame) { VP9_COMMON *const cm = &cpi->common; const int sb_rows = cm->sb_rows; const int num_threads = cpi->max_threads; const VP9WorkerInterface *const winterface = vp9_get_worker_interface(); int thread_id; int start_mi_row = 0, end_mi_row, mi_rows_to_filter = cm->mi_rows; if (!frame_filter_level) return; // Initialize cur_sb_col to -1 for all SB rows. vpx_memset(cpi->cur_sb_col, -1, sizeof(*cpi->cur_sb_col) * sb_rows); if (partial_frame && cm->mi_rows > 8) { int i; start_mi_row = cm->mi_rows >> 1; start_mi_row &= 0xfffffff8; mi_rows_to_filter = MAX(cm->mi_rows / 8, 8); // Initialize cur_sb_col to sb_cols for top SB rows indicating // that deblocking is done. for (i = 0; i < start_mi_row >> MI_BLOCK_SIZE_LOG2; i++) cpi->cur_sb_col[i] = cm->sb_cols - 1; }
// Runs the loopfilter over SB rows [start, stop) using up to nworkers
// threads, capped at the frame's tile-column count. The last worker runs
// synchronously on the calling thread; all workers are joined before
// returning.
static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame, VP9_COMMON *cm,
                                struct macroblockd_plane planes[MAX_MB_PLANE],
                                int start, int stop, int y_only,
                                VP9Worker *workers, int nworkers,
                                VP9LfSync *lf_sync) {
  const VP9WorkerInterface *const winterface = vp9_get_worker_interface();
  // Number of superblock rows.
  const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2;
  const int tile_cols = 1 << cm->log2_tile_cols;
  // Cap the worker count at the number of tile columns: scheduling more
  // loopfilter threads than tile-decode cores has been observed to hurt
  // performance (contention), notably on Android.
  const int num_workers = MIN(nworkers, tile_cols);
  int n;

  // (Re)allocate the row-sync state whenever the frame geometry or the
  // requested worker count no longer matches the previous allocation.
  if (!lf_sync->sync_range || sb_rows != lf_sync->rows ||
      num_workers > lf_sync->num_workers) {
    vp9_loop_filter_dealloc(lf_sync);
    vp9_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width, num_workers);
  }

  // -1 marks "no column of this SB row filtered yet".
  vpx_memset(lf_sync->cur_sb_col, -1, sizeof(*lf_sync->cur_sb_col) * sb_rows);

  for (n = 0; n < num_workers; ++n) {
    VP9Worker *const worker = &workers[n];
    LFWorkerData *const lf_data = &lf_sync->lfdata[n];

    worker->hook = (VP9WorkerHook)loop_filter_row_worker;
    worker->data1 = lf_sync;
    worker->data2 = lf_data;

    // Each worker starts at a different SB row and strides by the worker
    // count (handled inside the hook via lf_sync).
    vp9_loop_filter_data_reset(lf_data, frame, cm, planes);
    lf_data->start = start + n * MI_BLOCK_SIZE;
    lf_data->stop = stop;
    lf_data->y_only = y_only;

    // Last worker runs in the current thread; the rest run asynchronously.
    if (n == num_workers - 1)
      winterface->execute(worker);
    else
      winterface->launch(worker);
  }

  // Join all workers before returning.
  for (n = 0; n < num_workers; ++n)
    winterface->sync(&workers[n]);
}
// Stops every worker owned by the decoder and frees all decoder-owned
// allocations, then the decoder instance itself.
void vp9_decoder_remove(VP9Decoder *pbi) {
  int n;

  vp9_get_worker_interface()->end(&pbi->lf_worker);
  vpx_free(pbi->lf_worker.data1);
  vpx_free(pbi->tile_data);

  // Join each tile worker before releasing the shared worker arrays.
  for (n = 0; n < pbi->num_tile_workers; ++n)
    vp9_get_worker_interface()->end(&pbi->tile_workers[n]);

  vpx_free(pbi->tile_worker_data);
  vpx_free(pbi->tile_worker_info);
  vpx_free(pbi->tile_workers);

  if (pbi->num_tile_workers > 0)
    vp9_loop_filter_dealloc(&pbi->lf_row_sync);

  vpx_free(pbi);
}
VP9Decoder *vp9_decoder_create() { VP9Decoder *const pbi = vpx_memalign(32, sizeof(*pbi)); VP9_COMMON *const cm = pbi ? &pbi->common : NULL; if (!cm) return NULL; vp9_zero(*pbi); if (setjmp(cm->error.jmp)) { cm->error.setjmp = 0; vp9_decoder_remove(pbi); return NULL; } cm->error.setjmp = 1; CHECK_MEM_ERROR(cm, cm->fc, (FRAME_CONTEXT *)vpx_calloc(1, sizeof(*cm->fc))); CHECK_MEM_ERROR(cm, cm->frame_contexts, (FRAME_CONTEXT *)vpx_calloc(FRAME_CONTEXTS, sizeof(*cm->frame_contexts))); pbi->need_resync = 1; initialize_dec(); // Initialize the references to not point to any frame buffers. vpx_memset(&cm->ref_frame_map, -1, sizeof(cm->ref_frame_map)); cm->current_video_frame = 0; pbi->ready_for_new_data = 1; cm->bit_depth = VPX_BITS_8; cm->dequant_bit_depth = VPX_BITS_8; cm->alloc_mi = vp9_dec_alloc_mi; cm->free_mi = vp9_dec_free_mi; cm->setup_mi = vp9_dec_setup_mi; // vp9_init_dequantizer() is first called here. Add check in // frame_init_dequantizer() to avoid unnecessary calling of // vp9_init_dequantizer() for every frame. vp9_init_dequantizer(cm); vp9_loop_filter_init(cm); cm->error.setjmp = 0; vp9_get_worker_interface()->init(&pbi->lf_worker); return pbi; }
// Allocates a VP9 decoder instance wired to the shared frame-buffer pool.
// Returns NULL if any allocation during setup fails.
VP9Decoder *vp9_decoder_create(BufferPool *const pool) {
  // volatile: these locals are read again after a longjmp back into the
  // setjmp below.
  VP9Decoder *volatile const pbi = vpx_memalign(32, sizeof(*pbi));
  VP9_COMMON *volatile const cm = pbi ? &pbi->common : NULL;

  if (cm == NULL) return NULL;

  vp9_zero(*pbi);

  // CHECK_MEM_ERROR longjmps here on allocation failure.
  if (setjmp(cm->error.jmp)) {
    cm->error.setjmp = 0;
    vp9_decoder_remove(pbi);
    return NULL;
  }
  cm->error.setjmp = 1;

  CHECK_MEM_ERROR(cm, cm->fc,
                  (FRAME_CONTEXT *)vpx_calloc(1, sizeof(*cm->fc)));
  CHECK_MEM_ERROR(cm, cm->frame_contexts,
                  (FRAME_CONTEXT *)vpx_calloc(FRAME_CONTEXTS,
                                              sizeof(*cm->frame_contexts)));

  pbi->need_resync = 1;
  once(initialize_dec);

  // No reference frames exist yet: point every map slot at "no buffer".
  vpx_memset(&cm->ref_frame_map, -1, sizeof(cm->ref_frame_map));
  vpx_memset(&cm->next_ref_frame_map, -1, sizeof(cm->next_ref_frame_map));

  cm->current_video_frame = 0;
  pbi->ready_for_new_data = 1;
  pbi->common.buffer_pool = pool;

  cm->bit_depth = VPX_BITS_8;
  cm->dequant_bit_depth = VPX_BITS_8;

  cm->alloc_mi = vp9_dec_alloc_mi;
  cm->free_mi = vp9_dec_free_mi;
  cm->setup_mi = vp9_dec_setup_mi;

  vp9_loop_filter_init(cm);

  cm->error.setjmp = 0;

  vp9_get_worker_interface()->init(&pbi->lf_worker);
  return pbi;
}
// Releases everything owned by this decoder instance — loopfilter worker,
// tile workers and their data, row-sync state, common frame state — and
// then the instance itself.
void vp9_decoder_remove(VP9Decoder *pbi) {
  VP9_COMMON *const cm = &pbi->common;
  int n;

  // Join the loopfilter worker before freeing its data.
  vp9_get_worker_interface()->end(&pbi->lf_worker);
  vpx_free(pbi->lf_worker.data1);
  vpx_free(pbi->tile_data);

  for (n = 0; n < pbi->num_tile_workers; ++n) {
    VP9Worker *const worker = &pbi->tile_workers[n];
    vp9_get_worker_interface()->end(worker);
    vpx_free(worker->data1);
    vpx_free(worker->data2);
  }
  vpx_free(pbi->tile_workers);

  if (pbi->num_tile_workers) {
    const int sb_rows =
        mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2;
    vp9_loop_filter_dealloc(&pbi->lf_row_sync, sb_rows);
  }

  vp9_remove_common(cm);
  vpx_free(pbi);
}
VP9Decoder *vp9_decoder_create() { VP9Decoder *const pbi = vpx_memalign(32, sizeof(*pbi)); VP9_COMMON *const cm = pbi ? &pbi->common : NULL; if (!cm) return NULL; vp9_zero(*pbi); if (setjmp(cm->error.jmp)) { cm->error.setjmp = 0; vp9_decoder_remove(pbi); return NULL; } cm->error.setjmp = 1; initialize_dec(); vp9_rtcd(); // Initialize the references to not point to any frame buffers. vpx_memset(&cm->ref_frame_map, -1, sizeof(cm->ref_frame_map)); cm->current_video_frame = 0; pbi->ready_for_new_data = 1; // vp9_init_dequantizer() is first called here. Add check in // frame_init_dequantizer() to avoid unnecessary calling of // vp9_init_dequantizer() for every frame. vp9_init_dequantizer(cm); vp9_loop_filter_init(cm); cm->error.setjmp = 0; vp9_get_worker_interface()->init(&pbi->lf_worker); return pbi; }
// VP9 decoder: Implement multi-threaded loopfilter that uses the tile
// threads.
void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame,
                              VP9Decoder *pbi, VP9_COMMON *cm,
                              int frame_filter_level,
                              int y_only) {
  VP9LfSync *const lf_sync = &pbi->lf_row_sync;
  const VP9WorkerInterface *const winterface = vp9_get_worker_interface();
  // Number of superblock rows and cols
  const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2;
  const int tile_cols = 1 << cm->log2_tile_cols;
  // NOTE(review): `max_threads & ~1` rounds the thread count down to an
  // even number, so max_threads == 1 yields num_workers == 0 and both
  // loops below are skipped (no filtering). Presumably callers take a
  // single-threaded path in that case — verify against call sites.
  const int num_workers = MIN(pbi->max_threads & ~1, tile_cols);
  int i;

  // Filter level 0 disables the loopfilter.
  if (!frame_filter_level) return;

  // Reallocate the row-sync state on first use or when the frame height
  // changed since the last allocation.
  if (!lf_sync->sync_range || cm->last_height != cm->height) {
    vp9_loop_filter_dealloc(lf_sync);
    vp9_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width);
  }

  vp9_loop_filter_frame_init(cm, frame_filter_level);

  // Initialize cur_sb_col to -1 for all SB rows.
  vpx_memset(lf_sync->cur_sb_col, -1, sizeof(*lf_sync->cur_sb_col) * sb_rows);

  // Set up loopfilter thread data.
  // The decoder is using num_workers instead of pbi->num_tile_workers
  // because it has been observed that using more threads on the
  // loopfilter, than there are tile columns in the frame will hurt
  // performance on Android. This is because the system will only
  // schedule the tile decode workers on cores equal to the number
  // of tile columns. Then if the decoder tries to use more threads for the
  // loopfilter, it will hurt performance because of contention. If the
  // multithreading code changes in the future then the number of workers
  // used by the loopfilter should be revisited.
  for (i = 0; i < num_workers; ++i) {
    VP9Worker *const worker = &pbi->tile_workers[i];
    TileWorkerData *const tile_data = (TileWorkerData*)worker->data1;
    LFWorkerData *const lf_data = &tile_data->lfdata;

    worker->hook = (VP9WorkerHook)loop_filter_row_worker;

    // Loopfilter data
    lf_data->frame_buffer = frame;
    lf_data->cm = cm;
    vp9_copy(lf_data->planes, pbi->mb.plane);
    lf_data->start = i;
    lf_data->stop = sb_rows;
    lf_data->y_only = y_only;   // always do all planes in decoder
    lf_data->lf_sync = lf_sync;
    lf_data->num_lf_workers = num_workers;

    // Start loopfiltering: last worker runs on the calling thread, the
    // rest run asynchronously.
    if (i == num_workers - 1) {
      winterface->execute(worker);
    } else {
      winterface->launch(worker);
    }
  }

  // Wait till all rows are finished
  for (i = 0; i < num_workers; ++i) {
    winterface->sync(&pbi->tile_workers[i]);
  }
}
// Returns the next decoded frame, or NULL when none is available.
// Serves frames from the output cache first; otherwise (first call per
// decode, signalled by *iter == NULL) waits on frame workers in
// round-robin order until one produces a frame or all submitted workers
// have been consumed.
static vpx_image_t *decoder_get_frame(vpx_codec_alg_priv_t *ctx,
                                      vpx_codec_iter_t *iter) {
  vpx_image_t *img = NULL;

  // Only return frame when all the cpu are busy or
  // application flushed the decoder in frame parallel decode.
  if (ctx->frame_parallel_decode && ctx->available_threads > 0 &&
      !ctx->flushed) {
    return NULL;
  }

  // Output the frames in the cache first.
  if (ctx->num_cache_frames > 0) {
    release_last_output_frame(ctx);
    ctx->last_show_frame = ctx->frame_cache[ctx->frame_cache_read].fb_idx;
    if (ctx->need_resync)
      return NULL;
    img = &ctx->frame_cache[ctx->frame_cache_read].img;
    ctx->frame_cache_read = (ctx->frame_cache_read + 1) % FRAME_CACHE_SIZE;
    --ctx->num_cache_frames;
    return img;
  }

  // iter acts as a flip flop, so an image is only returned on the first
  // call to get_frame.
  if (*iter == NULL && ctx->frame_workers != NULL) {
    do {
      YV12_BUFFER_CONFIG sd;
      vp9_ppflags_t flags = {0, 0, 0};
      const VP9WorkerInterface *const winterface = vp9_get_worker_interface();
      VP9Worker *const worker =
          &ctx->frame_workers[ctx->next_output_worker_id];
      FrameWorkerData *const frame_worker_data =
          (FrameWorkerData *)worker->data1;
      // Advance the round-robin output cursor before waiting.
      ctx->next_output_worker_id =
          (ctx->next_output_worker_id + 1) % ctx->num_frame_workers;
      if (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC)
        set_ppflags(ctx, &flags);
      // Wait for the frame from worker thread.
      if (winterface->sync(worker)) {
        // Check if worker has received any frames.
        if (frame_worker_data->received_frame == 1) {
          // Worker finished cleanly: return its slot to the pool.
          ++ctx->available_threads;
          frame_worker_data->received_frame = 0;
          check_resync(ctx, frame_worker_data->pbi);
        }
        if (vp9_get_raw_frame(frame_worker_data->pbi, &sd, &flags) == 0) {
          VP9_COMMON *const cm = &frame_worker_data->pbi->common;
          RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
          release_last_output_frame(ctx);
          ctx->last_show_frame = frame_worker_data->pbi->common.new_fb_idx;
          if (ctx->need_resync)
            return NULL;
          yuvconfig2image(&ctx->img, &sd, frame_worker_data->user_priv);
          ctx->img.fb_priv = frame_bufs[cm->new_fb_idx].raw_frame_buffer.priv;
          img = &ctx->img;
          return img;
        }
      } else {
        // Decoding failed. Release the worker thread.
        frame_worker_data->received_frame = 0;
        ++ctx->available_threads;
        ctx->need_resync = 1;
        if (ctx->flushed != 1)
          return NULL;
      }
      // Keep draining workers until we catch up with the submit cursor.
    } while (ctx->next_output_worker_id != ctx->next_submit_worker_id);
  }
  return NULL;
}
// Decodes one compressed frame. In serial mode the single worker runs
// synchronously on this thread; in frame-parallel mode the data is copied
// into the next submit worker's scratch buffer and the worker is launched
// asynchronously.
static vpx_codec_err_t decode_one(vpx_codec_alg_priv_t *ctx,
                                  const uint8_t **data, unsigned int data_sz,
                                  void *user_priv, int64_t deadline) {
  vp9_ppflags_t flags = {0, 0, 0};
  const VP9WorkerInterface *const winterface = vp9_get_worker_interface();
  (void)deadline;

  // Determine the stream parameters. Note that we rely on peek_si to
  // validate that we have a buffer that does not wrap around the top
  // of the heap.
  if (!ctx->si.h) {
    int is_intra_only = 0;
    const vpx_codec_err_t res =
        decoder_peek_si_internal(*data, data_sz, &ctx->si, &is_intra_only,
                                 ctx->decrypt_cb, ctx->decrypt_state);
    if (res != VPX_CODEC_OK)
      return res;
    // The first decodable frame must be a keyframe or intra-only frame.
    if (!ctx->si.is_kf && !is_intra_only)
      return VPX_CODEC_ERROR;
  }

  if (!ctx->frame_parallel_decode) {
    // Serial mode: run the single frame worker synchronously.
    VP9Worker *const worker = ctx->frame_workers;
    FrameWorkerData *const frame_worker_data =
        (FrameWorkerData *)worker->data1;
    frame_worker_data->data = *data;
    frame_worker_data->data_size = data_sz;
    frame_worker_data->user_priv = user_priv;
    frame_worker_data->received_frame = 1;
    // Set these even if already initialized. The caller may have changed
    // the decrypt config between frames.
    frame_worker_data->pbi->decrypt_cb = ctx->decrypt_cb;
    frame_worker_data->pbi->decrypt_state = ctx->decrypt_state;
    worker->had_error = 0;
    winterface->execute(worker);

    // Update data pointer after decode.
    *data = frame_worker_data->data_end;

    if (worker->had_error)
      return update_error_state(ctx, &frame_worker_data->pbi->common.error);

    check_resync(ctx, frame_worker_data->pbi);
  } else {
    // Frame-parallel mode: hand the frame off to the next submit worker.
    VP9Worker *const worker = &ctx->frame_workers[ctx->next_submit_worker_id];
    FrameWorkerData *const frame_worker_data =
        (FrameWorkerData *)worker->data1;
    // Copy context from last worker thread to next worker thread.
    if (ctx->next_submit_worker_id != ctx->last_submit_worker_id)
      vp9_frameworker_copy_context(
          &ctx->frame_workers[ctx->next_submit_worker_id],
          &ctx->frame_workers[ctx->last_submit_worker_id]);

    frame_worker_data->pbi->ready_for_new_data = 0;

    // Copy the compressed data into worker's internal buffer.
    // TODO(hkuang): Will all the workers allocate the same size
    // as the size of the first intra frame be better? This will
    // avoid too many deallocate and allocate.
    if (frame_worker_data->scratch_buffer_size < data_sz) {
      frame_worker_data->scratch_buffer =
          (uint8_t *)vpx_realloc(frame_worker_data->scratch_buffer, data_sz);
      if (frame_worker_data->scratch_buffer == NULL) {
        set_error_detail(ctx, "Failed to reallocate scratch buffer");
        return VPX_CODEC_MEM_ERROR;
      }
      frame_worker_data->scratch_buffer_size = data_sz;
    }
    frame_worker_data->data_size = data_sz;
    vpx_memcpy(frame_worker_data->scratch_buffer, *data, data_sz);

    frame_worker_data->frame_decoded = 0;
    frame_worker_data->frame_context_ready = 0;
    frame_worker_data->received_frame = 1;
    frame_worker_data->data = frame_worker_data->scratch_buffer;
    frame_worker_data->user_priv = user_priv;

    // Advance the submit cursors; last_submit trails next_submit by one
    // once the pipeline is primed.
    if (ctx->next_submit_worker_id != ctx->last_submit_worker_id)
      ctx->last_submit_worker_id =
          (ctx->last_submit_worker_id + 1) % ctx->num_frame_workers;

    ctx->next_submit_worker_id =
        (ctx->next_submit_worker_id + 1) % ctx->num_frame_workers;

    --ctx->available_threads;
    worker->had_error = 0;
    winterface->launch(worker);
  }

  if (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC)
    set_ppflags(ctx, &flags);

  return VPX_CODEC_OK;
}
// Allocates and initializes the codec's frame workers, the shared buffer
// pool, and per-worker decoder instances. In frame-parallel mode one
// worker (and thread) is created per configured thread, capped at
// MAX_DECODE_THREADS; in serial mode a single worker is used and the
// thread budget is handed to that worker's internal tile/loopfilter
// threads instead.
static vpx_codec_err_t init_decoder(vpx_codec_alg_priv_t *ctx) {
  int i;
  const VP9WorkerInterface *const winterface = vp9_get_worker_interface();

  // Reset all bookkeeping state.
  ctx->last_show_frame = -1;
  ctx->next_submit_worker_id = 0;
  ctx->last_submit_worker_id = 0;
  ctx->next_output_worker_id = 0;
  ctx->frame_cache_read = 0;
  ctx->frame_cache_write = 0;
  ctx->num_cache_frames = 0;
  ctx->need_resync = 1;
  ctx->num_frame_workers =
      (ctx->frame_parallel_decode == 1) ? ctx->cfg.threads: 1;
  if (ctx->num_frame_workers > MAX_DECODE_THREADS)
    ctx->num_frame_workers = MAX_DECODE_THREADS;
  ctx->available_threads = ctx->num_frame_workers;
  ctx->flushed = 0;

  ctx->buffer_pool = (BufferPool *)vpx_calloc(1, sizeof(BufferPool));
  if (ctx->buffer_pool == NULL)
    return VPX_CODEC_MEM_ERROR;

#if CONFIG_MULTITHREAD
  if (pthread_mutex_init(&ctx->buffer_pool->pool_mutex, NULL)) {
    set_error_detail(ctx, "Failed to allocate buffer pool mutex");
    return VPX_CODEC_MEM_ERROR;
  }
#endif

  ctx->frame_workers = (VP9Worker *)
      vpx_malloc(ctx->num_frame_workers * sizeof(*ctx->frame_workers));
  if (ctx->frame_workers == NULL) {
    set_error_detail(ctx, "Failed to allocate frame_workers");
    return VPX_CODEC_MEM_ERROR;
  }

  for (i = 0; i < ctx->num_frame_workers; ++i) {
    VP9Worker *const worker = &ctx->frame_workers[i];
    FrameWorkerData *frame_worker_data = NULL;
    winterface->init(worker);
    worker->data1 = vpx_memalign(32, sizeof(FrameWorkerData));
    if (worker->data1 == NULL) {
      set_error_detail(ctx, "Failed to allocate frame_worker_data");
      return VPX_CODEC_MEM_ERROR;
    }
    frame_worker_data = (FrameWorkerData *)worker->data1;
    frame_worker_data->pbi = vp9_decoder_create(ctx->buffer_pool);
    if (frame_worker_data->pbi == NULL) {
      set_error_detail(ctx, "Failed to allocate frame_worker_data");
      return VPX_CODEC_MEM_ERROR;
    }
    frame_worker_data->pbi->frame_worker_owner = worker;
    frame_worker_data->worker_id = i;
    frame_worker_data->scratch_buffer = NULL;
    frame_worker_data->scratch_buffer_size = 0;
    frame_worker_data->frame_context_ready = 0;
    frame_worker_data->received_frame = 0;
#if CONFIG_MULTITHREAD
    if (pthread_mutex_init(&frame_worker_data->stats_mutex, NULL)) {
      set_error_detail(ctx, "Failed to allocate frame_worker_data mutex");
      return VPX_CODEC_MEM_ERROR;
    }

    if (pthread_cond_init(&frame_worker_data->stats_cond, NULL)) {
      set_error_detail(ctx, "Failed to allocate frame_worker_data cond");
      return VPX_CODEC_MEM_ERROR;
    }
#endif
    // If decoding in serial mode, FrameWorker thread could create tile worker
    // thread or loopfilter thread.
    frame_worker_data->pbi->max_threads =
        (ctx->frame_parallel_decode == 0) ? ctx->cfg.threads : 0;

    frame_worker_data->pbi->inv_tile_order = ctx->invert_tile_order;
    frame_worker_data->pbi->frame_parallel_decode = ctx->frame_parallel_decode;
    frame_worker_data->pbi->common.frame_parallel_decode =
        ctx->frame_parallel_decode;
    worker->hook = (VP9WorkerHook)frame_worker_hook;
    if (!winterface->reset(worker)) {
      set_error_detail(ctx, "Frame Worker thread creation failed");
      return VPX_CODEC_MEM_ERROR;
    }
  }

  // If postprocessing was enabled by the application and a
  // configuration has not been provided, default it.
  if (!ctx->postproc_cfg_set &&
      (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC))
    set_default_ppflags(&ctx->postproc_cfg);

  init_buffer_callbacks(ctx);

  return VPX_CODEC_OK;
}
// Accepts one compressed frame for decoding. Acquires a new frame buffer,
// records worker ownership in frame-parallel mode, and installs the
// longjmp-based error handler that releases all held buffer references on
// a decode error.
// NOTE(review): this function body appears truncated in this chunk; the
// code below ends inside the setjmp error-recovery block.
int vp9_receive_compressed_data(VP9Decoder *pbi,
                                size_t size, const uint8_t **psource) {
  // volatile: these are read again after a longjmp into the setjmp below.
  VP9_COMMON *volatile const cm = &pbi->common;
  BufferPool *volatile const pool = cm->buffer_pool;
  RefCntBuffer *volatile const frame_bufs = cm->buffer_pool->frame_bufs;
  const uint8_t *source = *psource;
  int retcode = 0;
  cm->error.error_code = VPX_CODEC_OK;

  if (size == 0) {
    // This is used to signal that we are missing frames.
    // We do not know if the missing frame(s) was supposed to update
    // any of the reference buffers, but we act conservative and
    // mark only the last buffer as corrupted.
    //
    // TODO(jkoleszar): Error concealment is undefined and non-normative
    // at this point, but if it becomes so, [0] may not always be the correct
    // thing to do here.
    if (cm->frame_refs[0].idx > 0) {
      assert(cm->frame_refs[0].buf != NULL);
      cm->frame_refs[0].buf->corrupted = 1;
    }
  }

  pbi->ready_for_new_data = 0;

  // Check if the previous frame was a frame without any references to it.
  // Release frame buffer if not decoding in frame parallel mode.
  if (!pbi->frame_parallel_decode && cm->new_fb_idx >= 0 &&
      frame_bufs[cm->new_fb_idx].ref_count == 0)
    pool->release_fb_cb(pool->cb_priv,
                        &frame_bufs[cm->new_fb_idx].raw_frame_buffer);
  cm->new_fb_idx = get_free_fb(cm);

  // Assign a MV array to the frame buffer.
  cm->cur_frame = &pool->frame_bufs[cm->new_fb_idx];

  pbi->hold_ref_buf = 0;
  if (pbi->frame_parallel_decode) {
    VP9Worker *const worker = pbi->frame_worker_owner;
    vp9_frameworker_lock_stats(worker);
    frame_bufs[cm->new_fb_idx].frame_worker_owner = worker;
    // Reset decoding progress: -1 means no row/col decoded yet.
    pbi->cur_buf = &frame_bufs[cm->new_fb_idx];
    pbi->cur_buf->row = -1;
    pbi->cur_buf->col = -1;
    vp9_frameworker_unlock_stats(worker);
  } else {
    pbi->cur_buf = &frame_bufs[cm->new_fb_idx];
  }

  // Decode errors during this frame longjmp here; release every buffer
  // reference this thread still holds before reporting failure.
  if (setjmp(cm->error.jmp)) {
    const VP9WorkerInterface *const winterface = vp9_get_worker_interface();
    int i;

    cm->error.setjmp = 0;
    pbi->ready_for_new_data = 1;

    // Synchronize all threads immediately as a subsequent decode call may
    // cause a resize invalidating some allocations.
    winterface->sync(&pbi->lf_worker);
    for (i = 0; i < pbi->num_tile_workers; ++i) {
      winterface->sync(&pbi->tile_workers[i]);
    }

    lock_buffer_pool(pool);
    // Release all the reference buffers if worker thread is holding them.
    if (pbi->hold_ref_buf == 1) {
      int ref_index = 0, mask;
      for (mask = pbi->refresh_frame_flags; mask; mask >>= 1) {
        const int old_idx = cm->ref_frame_map[ref_index];
        // Current thread releases the holding of reference frame.
        decrease_ref_count(old_idx, frame_bufs, pool);

        // Release the reference frame in reference map.
        if ((mask & 1) && old_idx >= 0) {
          decrease_ref_count(old_idx, frame_bufs, pool);
        }
        ++ref_index;
      }

      // Current thread releases the holding of reference frame.
      for (; ref_index < REF_FRAMES && !cm->show_existing_frame;
           ++ref_index) {
        const int old_idx = cm->ref_frame_map[ref_index];
        decrease_ref_count(old_idx, frame_bufs, pool);
      }
      pbi->hold_ref_buf = 0;
    }
    // Release current frame.
    decrease_ref_count(cm->new_fb_idx, frame_bufs, pool);
    unlock_buffer_pool(pool);

    vp9_clear_system_state();
    return -1;
  }
// Encodes the frame's tiles using up to one worker per tile column.
// Workers and their thread data are created lazily on the first call;
// afterwards each call copies per-frame state into the workers, runs them
// (the last one synchronously on this thread), and accumulates the
// resulting counts back into cpi.
void vp9_encode_tiles_mt(VP9_COMP *cpi) {
  VP9_COMMON *const cm = &cpi->common;
  const int tile_cols = 1 << cm->log2_tile_cols;
  const VP9WorkerInterface *const winterface = vp9_get_worker_interface();
  const int num_workers = MIN(cpi->oxcf.max_threads, tile_cols);
  int i;

  vp9_init_tile_data(cpi);

  // Only run once to create threads and allocate thread data.
  if (cpi->num_workers == 0) {
    CHECK_MEM_ERROR(cm, cpi->workers,
                    vpx_malloc(num_workers * sizeof(*cpi->workers)));

    CHECK_MEM_ERROR(cm, cpi->tile_thr_data,
                    vpx_calloc(num_workers, sizeof(*cpi->tile_thr_data)));

    for (i = 0; i < num_workers; i++) {
      VP9Worker *const worker = &cpi->workers[i];
      EncWorkerData *thread_data = &cpi->tile_thr_data[i];

      ++cpi->num_workers;

      winterface->init(worker);

      if (i < num_workers - 1) {
        thread_data->cpi = cpi;

        // Allocate thread data.
        CHECK_MEM_ERROR(cm, thread_data->td,
                        vpx_memalign(32, sizeof(*thread_data->td)));
        vp9_zero(*thread_data->td);

        // Set up pc_tree.
        thread_data->td->leaf_tree = NULL;
        thread_data->td->pc_tree = NULL;
        vp9_setup_pc_tree(cm, thread_data->td);

        // Allocate frame counters in thread data.
        CHECK_MEM_ERROR(cm, thread_data->td->counts,
                        vpx_calloc(1, sizeof(*thread_data->td->counts)));

        // Create threads
        if (!winterface->reset(worker))
          vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
                             "Tile encoder thread creation failed");
      } else {
        // Main thread acts as a worker and uses the thread data in cpi.
        thread_data->cpi = cpi;
        thread_data->td = &cpi->td;
      }

      winterface->sync(worker);
    }
  }

  for (i = 0; i < num_workers; i++) {
    VP9Worker *const worker = &cpi->workers[i];
    EncWorkerData *thread_data;

    worker->hook = (VP9WorkerHook)enc_worker_hook;
    worker->data1 = &cpi->tile_thr_data[i];
    worker->data2 = NULL;
    thread_data = (EncWorkerData*)worker->data1;

    // Before encoding a frame, copy the thread data from cpi.
    if (thread_data->td != &cpi->td) {
      thread_data->td->mb = cpi->td.mb;
      thread_data->td->rd_counts = cpi->td.rd_counts;
    }
    if (thread_data->td->counts != &cpi->common.counts) {
      memcpy(thread_data->td->counts, &cpi->common.counts,
             sizeof(cpi->common.counts));
    }

    // Handle use_nonrd_pick_mode case: point the per-thread coefficient
    // buffers at the pick-mode context's buffers.
    if (cpi->sf.use_nonrd_pick_mode) {
      MACROBLOCK *const x = &thread_data->td->mb;
      MACROBLOCKD *const xd = &x->e_mbd;
      struct macroblock_plane *const p = x->plane;
      struct macroblockd_plane *const pd = xd->plane;
      PICK_MODE_CONTEXT *ctx = &thread_data->td->pc_root->none;
      int j;

      for (j = 0; j < MAX_MB_PLANE; ++j) {
        p[j].coeff = ctx->coeff_pbuf[j][0];
        p[j].qcoeff = ctx->qcoeff_pbuf[j][0];
        pd[j].dqcoeff = ctx->dqcoeff_pbuf[j][0];
        p[j].eobs = ctx->eobs_pbuf[j][0];
      }
    }
  }

  // Encode a frame: the last worker runs on this thread, the rest async.
  for (i = 0; i < num_workers; i++) {
    VP9Worker *const worker = &cpi->workers[i];
    EncWorkerData *const thread_data = (EncWorkerData*)worker->data1;

    // Set the starting tile for each thread.
    thread_data->start = i;

    if (i == num_workers - 1)
      winterface->execute(worker);
    else
      winterface->launch(worker);
  }

  // Encoding ends.
  for (i = 0; i < num_workers; i++) {
    VP9Worker *const worker = &cpi->workers[i];
    winterface->sync(worker);
  }

  for (i = 0; i < num_workers; i++) {
    VP9Worker *const worker = &cpi->workers[i];
    EncWorkerData *const thread_data = (EncWorkerData*)worker->data1;

    // Accumulate counters from the side workers; the last worker shares
    // cpi's own thread data, so its counts are already in place.
    if (i < num_workers - 1) {
      vp9_accumulate_frame_counts(cm, thread_data->td->counts, 0);
      accumulate_rd_opt(&cpi->td, thread_data->td);
    }
  }
}