// Decides, for every coefficient probability slot, whether to keep the default
// value from VP8CoeffsProba0[] or to replace it by the value derived from the
// statistics gathered in proba->stats_[].
// The retained value is written to proba->coeffs_[], proba->dirty_ is set to 1
// if at least one retained value differs from its default (0 otherwise), and
// the accumulated cost of signaling all these decisions is returned.
static int FinalizeTokenProbas(VP8Proba* const proba) {
  int changed = 0;
  int total_size = 0;
  int type;
  for (type = 0; type < NUM_TYPES; ++type) {
    int band;
    for (band = 0; band < NUM_BANDS; ++band) {
      int ctx;
      for (ctx = 0; ctx < NUM_CTX; ++ctx) {
        int i;
        for (i = 0; i < NUM_PROBAS; ++i) {
          // Each stats entry packs two 16-bit counters: 'nb' in the low half
          // and 'total' in the high half.
          const proba_t stats = proba->stats_[type][band][ctx][i];
          const int nb = (stats >> 0) & 0xffff;
          const int total = (stats >> 16) & 0xffff;
          const int update_proba = VP8CoeffsUpdateProba[type][band][ctx][i];
          const int old_p = VP8CoeffsProba0[type][band][ctx][i];
          const int new_p = CalcTokenProba(nb, total);
          // Cost of keeping the default: the branches coded with old_p plus
          // a '0' update flag.
          const int keep_cost =
              BranchCost(nb, total, old_p) + VP8BitCost(0, update_proba);
          // Cost of switching: the branches coded with new_p, a '1' update
          // flag, and a fixed 8 * 256 charge for the new value.
          const int replace_cost = BranchCost(nb, total, new_p) +
                                   VP8BitCost(1, update_proba) +
                                   8 * 256;
          if (keep_cost > replace_cost) {
            // Only use probabilities that seem meaningful enough.
            total_size += VP8BitCost(1, update_proba);
            proba->coeffs_[type][band][ctx][i] = new_p;
            if (new_p != old_p) changed = 1;
            total_size += 8 * 256;
          } else {
            total_size += VP8BitCost(0, update_proba);
            proba->coeffs_[type][band][ctx][i] = old_p;
          }
        }
      }
    }
  }
  proba->dirty_ = changed;
  return total_size;
}
// Rebuilds the per-level cost tables proba->level_cost_[] from the current
// coefficient probabilities proba->coeffs_[], then refreshes the per-position
// pointers in proba->remapped_costs_[]. Does nothing unless proba->dirty_ is
// set; clears proba->dirty_ once the tables are up to date.
void VP8CalculateLevelCosts(VP8EncProba* const proba) {
  int type;

  if (!proba->dirty_) return;   // tables are already up to date.

  for (type = 0; type < NUM_TYPES; ++type) {
    int band, pos;
    for (band = 0; band < NUM_BANDS; ++band) {
      int ctx;
      for (ctx = 0; ctx < NUM_CTX; ++ctx) {
        const uint8_t* const probas = proba->coeffs_[type][band][ctx];
        uint16_t* const costs = proba->level_cost_[type][band][ctx];
        // The cost of a '1' coded with probas[0] is folded into the table,
        // but only for non-zero contexts.
        const int ctx_cost = (ctx == 0) ? 0 : VP8BitCost(1, probas[0]);
        const int nonzero_cost = VP8BitCost(1, probas[1]) + ctx_cost;
        int level = 1;
        costs[0] = VP8BitCost(0, probas[1]) + ctx_cost;
        while (level <= MAX_VARIABLE_LEVEL) {
          costs[level] = nonzero_cost + VariableLevelCost(level, probas);
          ++level;
        }
        // Starting at level 67 and up, the variable part of the cost is
        // actually constant.
      }
    }
    // Replicate bands: one table pointer per coefficient position and
    // context. No sentinel entry is needed.
    for (pos = 0; pos < 16; ++pos) {
      const int mapped_band = VP8EncBands[pos];
      int ctx;
      for (ctx = 0; ctx < NUM_CTX; ++ctx) {
        proba->remapped_costs_[type][pos][ctx] =
            proba->level_cost_[type][mapped_band][ctx];
      }
    }
  }
  proba->dirty_ = 0;
}
static int GetResidualCost(int ctx0, const VP8Residual* const res) { int n = res->first; // should be prob[VP8EncBands[n]], but it's equivalent for n=0 or 1 const int p0 = res->prob[n][ctx0][0]; const uint16_t* t = res->cost[n][ctx0]; // bit_cost(1, p0) is already incorporated in t[] tables, but only if ctx != 0 // (as required by the syntax). For ctx0 == 0, we need to add it here or it'll // be missing during the loop. int cost = (ctx0 == 0) ? VP8BitCost(1, p0) : 0; if (res->last < 0) { return VP8BitCost(0, p0); } for (; n < res->last; ++n) { const int v = abs(res->coeffs[n]); const int b = VP8EncBands[n + 1]; const int ctx = (v >= 2) ? 2 : v; cost += VP8LevelCost(t, v); t = res->cost[b][ctx]; } // Last coefficient is always non-zero { const int v = abs(res->coeffs[n]); assert(v != 0); cost += VP8LevelCost(t, v); if (n < 15) { const int b = VP8EncBands[n + 1]; const int ctx = (v == 1) ? 1 : 2; const int last_p0 = res->prob[b][ctx][0]; cost += VP8BitCost(0, last_p0); } } return cost; }
static int GetResidualCostSSE2(int ctx0, const VP8Residual* const res) { uint8_t levels[16], ctxs[16]; uint16_t abs_levels[16]; int n = res->first; // should be prob[VP8EncBands[n]], but it's equivalent for n=0 or 1 const int p0 = res->prob[n][ctx0][0]; CostArrayPtr const costs = res->costs; const uint16_t* t = costs[n][ctx0]; // bit_cost(1, p0) is already incorporated in t[] tables, but only if ctx != 0 // (as required by the syntax). For ctx0 == 0, we need to add it here or it'll // be missing during the loop. int cost = (ctx0 == 0) ? VP8BitCost(1, p0) : 0; if (res->last < 0) { return VP8BitCost(0, p0); } { // precompute clamped levels and contexts, packed to 8b. const __m128i zero = _mm_setzero_si128(); const __m128i kCst2 = _mm_set1_epi8(2); const __m128i kCst67 = _mm_set1_epi8(MAX_VARIABLE_LEVEL); const __m128i c0 = _mm_loadu_si128((const __m128i*)&res->coeffs[0]); const __m128i c1 = _mm_loadu_si128((const __m128i*)&res->coeffs[8]); const __m128i D0 = _mm_sub_epi16(zero, c0); const __m128i D1 = _mm_sub_epi16(zero, c1); const __m128i E0 = _mm_max_epi16(c0, D0); // abs(v), 16b const __m128i E1 = _mm_max_epi16(c1, D1); const __m128i F = _mm_packs_epi16(E0, E1); const __m128i G = _mm_min_epu8(F, kCst2); // context = 0,1,2 const __m128i H = _mm_min_epu8(F, kCst67); // clamp_level in [0..67] _mm_storeu_si128((__m128i*)&ctxs[0], G); _mm_storeu_si128((__m128i*)&levels[0], H); _mm_storeu_si128((__m128i*)&abs_levels[0], E0); _mm_storeu_si128((__m128i*)&abs_levels[8], E1); } for (; n < res->last; ++n) { const int ctx = ctxs[n]; const int level = levels[n]; const int flevel = abs_levels[n]; // full level cost += VP8LevelFixedCosts[flevel] + t[level]; // simplified VP8LevelCost() t = costs[n + 1][ctx]; } // Last coefficient is always non-zero { const int level = levels[n]; const int flevel = abs_levels[n]; assert(flevel != 0); cost += VP8LevelFixedCosts[flevel] + t[level]; if (n < 15) { const int b = VP8EncBands[n + 1]; const int ctx = ctxs[n]; const int last_p0 = 
res->prob[b][ctx][0]; cost += VP8BitCost(0, last_p0); } } return cost; }
// Counts how many macroblocks fall into each segment and derives from these
// counts the three segment-map probabilities stored in enc->proba_.segments_,
// together with enc->segment_hdr_.update_map_ and enc->segment_hdr_.size_.
// The per-segment counts are also copied to enc->pic_->stats when available.
static void SetSegmentProbas(VP8Encoder* const enc) {
  int counts[NUM_MB_SEGMENTS] = { 0 };
  const int num_mbs = enc->mb_w_ * enc->mb_h_;
  int i;

  for (i = 0; i < num_mbs; ++i) {
    ++counts[enc->mb_info_[i].segment_];
  }

  if (enc->pic_->stats != NULL) {
    for (i = 0; i < NUM_MB_SEGMENTS; ++i) {
      enc->pic_->stats->segment_size[i] = counts[i];
    }
  }

  if (enc->segment_hdr_.num_segments_ <= 1) {
    // Single segment: there is no map to transmit.
    enc->segment_hdr_.update_map_ = 0;
    enc->segment_hdr_.size_ = 0;
    return;
  }

  {
    uint8_t* const probas = enc->proba_.segments_;
    int root0, root1, left0, left1, right0, right1;

    // probas[0] separates segments {0,1} from {2,3}; probas[1] separates
    // segment 0 from 1 and probas[2] separates segment 2 from 3.
    probas[0] = GetProba(counts[0] + counts[1], counts[2] + counts[3]);
    probas[1] = GetProba(counts[0], counts[1]);
    probas[2] = GetProba(counts[2], counts[3]);

    // The map is flagged for update unless all three values equal 255.
    enc->segment_hdr_.update_map_ =
        !(probas[0] == 255 && probas[1] == 255 && probas[2] == 255);

    // Every macroblock costs one decision with probas[0] plus one decision
    // with the probability of the pair its segment belongs to.
    root0 = VP8BitCost(0, probas[0]);
    root1 = VP8BitCost(1, probas[0]);
    left0 = VP8BitCost(0, probas[1]);
    left1 = VP8BitCost(1, probas[1]);
    right0 = VP8BitCost(0, probas[2]);
    right1 = VP8BitCost(1, probas[2]);
    enc->segment_hdr_.size_ =
        counts[0] * (root0 + left0) +
        counts[1] * (root0 + left1) +
        counts[2] * (root1 + right0) +
        counts[3] * (root1 + right1);
  }
}
// Returns the bit-cost for coding the skip probability. static int FinalizeSkipProba(VP8Encoder* const enc) { VP8Proba* const proba = &enc->proba_; const int nb_mbs = enc->mb_w_ * enc->mb_h_; const int nb_events = proba->nb_skip_; int size; proba->skip_proba_ = CalcSkipProba(nb_events, nb_mbs); proba->use_skip_proba_ = (proba->skip_proba_ < SKIP_PROBA_THRESHOLD); size = 256; // 'use_skip_proba' bit if (proba->use_skip_proba_) { size += nb_events * VP8BitCost(1, proba->skip_proba_) + (nb_mbs - nb_events) * VP8BitCost(0, proba->skip_proba_); size += 8 * 256; // cost of signaling the skip_proba_ itself. } return size; }
// Fills proba->level_cost_[] for every (type, band, context) triplet from the
// coefficient probabilities in proba->coeffs_[]. Entry 0 of each table is the
// cost of a '0' coded with probability #1; entries 1..MAX_VARIABLE_LEVEL are
// the cost of a '1' coded with that same probability plus the variable cost of
// the level.
void VP8CalculateLevelCosts(VP8Proba* const proba) {
  int type;
  for (type = 0; type < NUM_TYPES; ++type) {
    int band;
    for (band = 0; band < NUM_BANDS; ++band) {
      int ctx;
      for (ctx = 0; ctx < NUM_CTX; ++ctx) {
        const uint8_t* const probas = proba->coeffs_[type][band][ctx];
        uint16_t* const costs = proba->level_cost_[type][band][ctx];
        const int nonzero_cost = VP8BitCost(1, probas[1]);
        int level = 1;
        costs[0] = VP8BitCost(0, probas[1]);
        while (level <= MAX_VARIABLE_LEVEL) {
          costs[level] = nonzero_cost + VariableLevelCost(level, probas);
          ++level;
        }
        // Starting at level 67 and up, the variable part of the cost is
        // actually constant.
      }
    }
  }
}
// Returns the probability-dependent part of the cost of coding 'level'.
// VP8LevelCodes[level - 1] supplies two bit-fields that are consumed in
// parallel, least significant bit first: a mask telling which probabilities
// (starting at probas[2]) are involved, and the bit value coded with each.
static int VariableLevelCost(int level, const uint8_t probas[NUM_PROBAS]) {
  int mask = VP8LevelCodes[level - 1][0];
  int values = VP8LevelCodes[level - 1][1];
  int total = 0;
  int idx = 2;
  while (mask) {
    if (mask & 1) {
      total += VP8BitCost(values & 1, probas[idx]);
    }
    values >>= 1;
    mask >>= 1;
    ++idx;
  }
  return total;
}
// Cost of coding 'nb' 1's and 'total-nb' 0's using 'proba' probability.
static int BranchCost(int nb, int total, int proba) {
  const int ones_cost = nb * VP8BitCost(1, proba);
  const int zeros_cost = (total - nb) * VP8BitCost(0, proba);
  return ones_cost + zeros_cost;
}
static int TrellisQuantizeBlock(const VP8EncIterator* const it, int16_t in[16], int16_t out[16], int ctx0, int coeff_type, const VP8Matrix* const mtx, int lambda) { ProbaArray* const last_costs = it->enc_->proba_.coeffs_[coeff_type]; CostArray* const costs = it->enc_->proba_.level_cost_[coeff_type]; const int first = (coeff_type == 0) ? 1 : 0; Node nodes[17][NUM_NODES]; int best_path[3] = {-1, -1, -1}; // store best-last/best-level/best-previous score_t best_score; int best_node; int last = first - 1; int n, m, p, nz; { score_t cost; score_t max_error; const int thresh = mtx->q_[1] * mtx->q_[1] / 4; const int last_proba = last_costs[VP8EncBands[first]][ctx0][0]; // compute maximal distortion. max_error = 0; for (n = first; n < 16; ++n) { const int j = kZigzag[n]; const int err = in[j] * in[j]; max_error += kWeightTrellis[j] * err; if (err > thresh) last = n; } // we don't need to go inspect up to n = 16 coeffs. We can just go up // to last + 1 (inclusive) without losing much. if (last < 15) ++last; // compute 'skip' score. This is the max score one can do. cost = VP8BitCost(0, last_proba); best_score = RDScoreTrellis(lambda, cost, max_error); // initialize source node. n = first - 1; for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) { NODE(n, m).cost = 0; NODE(n, m).error = max_error; NODE(n, m).ctx = ctx0; } } // traverse trellis. for (n = first; n <= last; ++n) { const int j = kZigzag[n]; const int Q = mtx->q_[j]; const int iQ = mtx->iq_[j]; const int B = BIAS(0x00); // neutral bias // note: it's important to take sign of the _original_ coeff, // so we don't have to consider level < 0 afterward. const int sign = (in[j] < 0); int coeff0 = (sign ? -in[j] : in[j]) + mtx->sharpen_[j]; int level0; if (coeff0 > 2047) coeff0 = 2047; level0 = QUANTDIV(coeff0, iQ, B); // test all alternate level values around level0. 
for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) { Node* const cur = &NODE(n, m); int delta_error, new_error; score_t cur_score = MAX_COST; int level = level0 + m; int last_proba; cur->sign = sign; cur->level = level; cur->ctx = (level == 0) ? 0 : (level == 1) ? 1 : 2; if (level >= 2048 || level < 0) { // node is dead? cur->cost = MAX_COST; continue; } last_proba = last_costs[VP8EncBands[n + 1]][cur->ctx][0]; // Compute delta_error = how much coding this level will // subtract as distortion to max_error new_error = coeff0 - level * Q; delta_error = kWeightTrellis[j] * (coeff0 * coeff0 - new_error * new_error); // Inspect all possible non-dead predecessors. Retain only the best one. for (p = -MIN_DELTA; p <= MAX_DELTA; ++p) { const Node* const prev = &NODE(n - 1, p); const int prev_ctx = prev->ctx; const uint16_t* const tcost = costs[VP8EncBands[n]][prev_ctx]; const score_t total_error = prev->error - delta_error; score_t cost, base_cost, score; if (prev->cost >= MAX_COST) { // dead node? continue; } // Base cost of both terminal/non-terminal base_cost = prev->cost + VP8LevelCost(tcost, level); // Examine node assuming it's a non-terminal one. cost = base_cost; if (level && n < 15) { cost += VP8BitCost(1, last_proba); } score = RDScoreTrellis(lambda, cost, total_error); if (score < cur_score) { cur_score = score; cur->cost = cost; cur->error = total_error; cur->prev = p; } // Now, record best terminal node (and thus best entry in the graph). if (level) { cost = base_cost; if (n < 15) cost += VP8BitCost(0, last_proba); score = RDScoreTrellis(lambda, cost, total_error); if (score < best_score) { best_score = score; best_path[0] = n; // best eob position best_path[1] = m; // best level best_path[2] = p; // best predecessor } } } } } // Fresh start memset(in + first, 0, (16 - first) * sizeof(*in)); memset(out + first, 0, (16 - first) * sizeof(*out)); if (best_path[0] == -1) { return 0; // skip! } // Unwind the best path. 
// Note: best-prev on terminal node is not necessarily equal to the // best_prev for non-terminal. So we patch best_path[2] in. n = best_path[0]; best_node = best_path[1]; NODE(n, best_node).prev = best_path[2]; // force best-prev for terminal nz = 0; for (; n >= first; --n) { const Node* const node = &NODE(n, best_node); const int j = kZigzag[n]; out[n] = node->sign ? -node->level : node->level; nz |= (node->level != 0); in[j] = out[n] * mtx->q_[j]; best_node = node->prev; } return nz; }