/**
 * Build the raw pruning coefficients that the later optimisation passes
 * start from.
 *
 * The starting vector is taken from `pr` when PRUNER_START_FROM_INPUT is set
 * and produced by greedy() otherwise. When a gradient or Nelder-Mead pass is
 * requested, min_pruning_coefficients is recomputed here as well.
 *
 * @param pr  in/out: read only when PRUNER_START_FROM_INPUT is set; the
 *            prepared coefficients are written back through
 *            save_coefficients() before returning.
 */
template <class FT>
void Pruner<FT>::optimize_coefficients_preparation(/*io*/ vector<double> &pr)
{
  evec b(d);

  if (flags & PRUNER_START_FROM_INPUT)
  {
    // the caller supplied the starting point
    load_coefficients(b, pr);
  }
  else
  {
    // no starting point given: fall back to the greedy method
    greedy(b);
#ifdef DEBUG_PRUNER_OPTIMIZE_TC
    cerr << "# [Greedy]" << endl;
    cerr << b << endl;
    cerr << "# [Greedy] single_enum_cost = " << single_enum_cost(b) << endl;
    cerr << "# [Greedy] succ_probability = " << measure_metric(b) << endl;
    cerr << "# [Greedy] all_enum_cost = " << repeated_enum_cost(b) << endl;
#endif
  }

  // Greedy run for the minimal pruning coefficients. The result is a lower
  // bound for the pruning parameters; per the original note it is the bound
  // that enforce() relies on.
  if (flags & (PRUNER_GRADIENT | PRUNER_NELDER_MEAD))
  {
    // preproc_cost is scaled down for this greedy run only and scaled back
    // up at the end of the branch
    preproc_cost *= .1;
    greedy(min_pruning_coefficients);

    if (!opt_single)
    {
      // Fixed target probability: if the greedy lower bound still yields a
      // metric above the target, push it down further, otherwise the target
      // probability might never be reachable.
      vector<double> pr_min(n);
      save_coefficients(pr_min, min_pruning_coefficients);

      if (measure_metric(min_pruning_coefficients) > target)
      {
        // clear the current bound before decreasing pr_min
        fill(min_pruning_coefficients.begin(), min_pruning_coefficients.end(), 0.);
        optimize_coefficients_decr_prob(pr_min);
      }

      load_coefficients(min_pruning_coefficients, pr_min);
    }

    preproc_cost *= 10;
  }

  save_coefficients(pr, b);
}
/**
 * Smooth the pruning coefficients with one in-place, left-to-right pass over
 * the interior indices 1 .. n-2.
 *
 * For each interior index i two independent corrections are applied, in this
 * order:
 *   1. if the ratio of the neighbouring quotients (b[i+1]/b[i]) / (b[i]/b[i-1])
 *      lies outside [0.8, 1.25], b[i] becomes sqrt(b[i-1] * b[i+1]);
 *   2. if either neighbouring difference exceeds 1/n, b[i] becomes the
 *      average of its two neighbours.
 * Because the pass is in place, b[i-1] is already the adjusted value when
 * index i is processed.
 *
 * @param pr  in/out: coefficients to smooth; overwritten with the result.
 */
template <class FT>
void Pruner<FT>::optimize_coefficients_local_adjust_smooth(/*io*/ vector<double> &pr)
{
  vec b(n);
  load_coefficients(b, pr);

  FT th = 1.0 / n;  // largest tolerated gap between neighbouring coefficients
  FT lr, rr;        // left quotient b[i]/b[i-1] and right quotient b[i+1]/b[i]

  for (int i = 1; i < n - 1; ++i)
  {
    lr = b[i] / b[i - 1];
    rr = b[i + 1] / b[i];

    // quotients too unbalanced: replace by the geometric mean of the neighbours
    if ((rr / lr > 1.25) || (rr / lr < 0.8))
    {
      b[i] = sqrt(b[i - 1] * b[i + 1]);
    }

    // jump too large on either side: replace by the arithmetic mean
    if ((b[i + 1] - b[i]) > th || (b[i] - b[i - 1]) > th)
    {
      b[i] = (b[i - 1] + b[i + 1]) / 2.0;
    }
  }

  save_coefficients(pr, b);
}
/**
 * Run the full-dimension optimisers, i.e. without the constraint
 * b_i = b_{i+1} for even i.
 *
 * Gradient descent runs first when PRUNER_GRADIENT is set; Nelder-Mead runs
 * afterwards when PRUNER_NELDER_MEAD is set. Both operate on the same working
 * vector, so the second one starts from the output of the first.
 *
 * @param pr  in/out: starting coefficients; overwritten with the result.
 */
template <class FT>
void Pruner<FT>::optimize_coefficients_full_core(/*io*/ vector<double> &pr)
{
  vec b(n);

  // Loading is unconditional: this pass is used after
  // optimize_coefficients_evec, so `pr` always carries a starting point.
  load_coefficients(b, pr);

  if (flags & PRUNER_GRADIENT)
  {
    if (verbosity)
    {
      cerr << "\nGradient descent start (dim=" << n << ")" << endl;
    }

    gradient_descent(b);

#ifdef DEBUG_PRUNER_OPTIMIZE_TC
    cerr << "# [Descent]" << endl;
    cerr << b << endl;
    cerr << "# [Descent] single_enum_cost = " << single_enum_cost(b) << endl;
    cerr << "# [Descent] succ_probability = " << measure_metric(b) << endl;
    cerr << "# [Descent] all_enum_cost = " << repeated_enum_cost(b) << endl;
#endif
  }

  if (flags & PRUNER_NELDER_MEAD)
  {
    if (verbosity)
    {
      cerr << "\nNelder-Mead start (dim=" << n << ")" << endl;
    }

    // keep stepping for as long as nelder_mead_step() returns true
    while (nelder_mead_step(b))
    {
    }

#ifdef DEBUG_PRUNER_OPTIMIZE_TC
    cerr << "# [Nelder-Mead]" << endl;
    cerr << b << endl;
    cerr << "# [Nelder-Mead] single_enum_cost = " << single_enum_cost(b) << endl;
    cerr << "# [Nelder-Mead] succ_probability = " << measure_metric(b) << endl;
    cerr << "# [Nelder-Mead] all_enum_cost = " << repeated_enum_cost(b) << endl;
#endif
  }

  save_coefficients(pr, b);
}
/**
 * Local adjustment that raises the coefficients sitting below the
 * enumeration bottleneck.
 *
 * Each tour:
 *   - locates the first position with the largest entry in the per-level
 *     cost vector filled by single_enum_cost(), and maps it to coefficient
 *     index ind = n - maxi - 1; the tour loop stops when ind <= 1;
 *   - for i = ind down to 1, moves b[i-1] towards b[i] in up to 10 steps of
 *     (b[i] - b[i-1]) / slices[i-1]. A step is undone, and the index
 *     abandoned, as soon as target_function() reaches 1.2 times its value
 *     before that step; otherwise the step is kept and the divisor for that
 *     index grows by a factor 1.2 while it is below 1024.
 * The tour loop also stops once the overall cost exceeds 1.1 times the
 * initial cost, or after the fifth tour.
 *
 * @param pr  in/out: coefficients to adjust; overwritten with the result.
 */
template <class FT>
void Pruner<FT>::optimize_coefficients_local_adjust_incr_prob(/*io*/ vector<double> &pr)
{
  FT old_cf, old_cf0, old_cfs, new_cf, old_b;
  vector<double> detailed_cost(n);
  // per-index step divisor: b[j] is raised by (b[j+1] - b[j]) / slices[j]
  vector<double> slices(n, 10.0);
  vec b(n);

  load_coefficients(b, pr);

  // cost of the input coefficients, reference for the final stop test
  old_cf0 = target_function(b);

  for (int tours = 1;; ++tours)
  {
    // cost at the start of this tour
    old_cf = target_function(b);

    // find the bottleneck: first position holding the largest per-level cost
    old_cfs = single_enum_cost(b, &(detailed_cost));
    double current_max = 0.0;
    int maxi = 0;
    for (int k = 0; k < n; k++)
    {
      if (detailed_cost[k] > current_max)
      {
        current_max = detailed_cost[k];
        maxi = k;
      }
    }

    // the cost vector is indexed in the opposite direction to b
    const int ind = n - maxi - 1;
    if (ind <= 1)
      break;

#ifdef BALANCE_HEURISTIC_PRUNER_OPTIMIZE
    if (old_cfs > sqrt(old_cf) / 10.0)
      break;
#endif

    for (int i = ind; i >= 1; --i)
    {
      // nothing to gain when b[i-1] is not below b[i]
      if (b[i] <= b[i - 1])
        continue;

      // b[i] stays fixed; try to increase b[i-1], at most 10 times
      for (int trials = 0; trials < 10; ++trials)
      {
        // cost before the step
        old_cf = target_function(b);

        // tentative increase
        old_b    = b[i - 1];
        b[i - 1] = b[i - 1] + (b[i] - b[i - 1]) / slices[i - 1];

        // cost after the step
        new_cf = target_function(b);

        // too expensive: undo the step and give up on this index
        if (new_cf >= (old_cf * 1.2))
        {
          b[i - 1] = old_b;
          break;
        }

        // step kept: use a finer step for this index next time
        if (slices[i - 1] < 1024)
          slices[i - 1] = slices[i - 1] * 1.2;
      }
    }

    new_cf = target_function(b);
    if (new_cf > (old_cf0 * 1.1) || tours > 4)
      break;
  }

#ifdef DEBUG_PRUNER_OPTIMIZE_TC
  cerr << "# [TuningProb]" << endl;
  cerr << b << endl;
  cerr << "# [TuningProb] all_enum_cost = " << repeated_enum_cost(b) << endl;
  cerr << "# [TuningProb] succ_probability = " << measure_metric(b) << endl;
#endif

  save_coefficients(pr, b);
}
/**
 * Local adjustment that lowers the coefficient at the enumeration bottleneck
 * to reduce the cost.
 *
 * Each iteration picks the position with the largest entry in the per-level
 * cost vector filled by single_enum_cost(), skipping the most recently
 * rejected index and any index whose rejection budget (3 per index) is used
 * up. The matching coefficient b[ind], ind = n - maxi - 1, is moved towards
 * b[ind-1] by (b[ind] - b[ind-1]) / slices[ind]. The step is kept only when
 * target_function() drops below 0.995 times its previous value; otherwise it
 * is undone and counted as a failure.
 *
 * The loop ends when the selected index is 0 or once more than 10 failures
 * have occurred in a row.
 *
 * @param pr  in/out: coefficients to adjust; overwritten with the result.
 */
template <class FT>
void Pruner<FT>::optimize_coefficients_local_adjust_decr_single(/*io*/ vector<double> &pr)
{
  FT old_cf, old_cfs, new_cf, old_b;
  vector<double> detailed_cost(n);
  // per-index step divisor: b[j] is lowered by (b[j] - b[j-1]) / slices[j]
  vector<double> slices(n, 10.0);
  // remaining number of rejected steps tolerated for each index
  vector<int> thresholds(n, 3);
  vec b(n);

  load_coefficients(b, pr);

  // a step is accepted only if the cost falls below this fraction of the old cost
  const double improved_ratio = 0.995;
  // most recently rejected index; it is not selected again right away
  int lasti = -1;
  // length of the current run of rejected steps
  int consecutive_fails = 0;

  while (1)
  {
    // cost before the step
    old_cf = target_function(b);

    // per-level costs used to find the bottleneck
    old_cfs = single_enum_cost(b, &(detailed_cost));

    // heuristic
#ifdef BALANCE_HEURISTIC_PRUNER_OPTIMIZE
    if (old_cfs < sqrt(old_cf) / 10.0)
      break;
#endif

    double current_max = 0.0;
    int maxi           = 0;
    for (int i = 0; i < n; i++)
    {
      // position i of the cost vector corresponds to coefficient n - i - 1
      const bool just_rejected = (i == (n - lasti - 1));
      if (just_rejected || thresholds[n - i - 1] <= 0)
        continue;

      if (detailed_cost[i] > current_max)
      {
        current_max = detailed_cost[i];
        maxi        = i;
      }
    }

    // b[ind] is the coefficient to be reduced
    const int ind = n - maxi - 1;
    old_b         = b[ind];
    if (ind == 0)
      break;  // b[0] has no lower neighbour to move towards
    b[ind] = b[ind] - (b[ind] - b[ind - 1]) / slices[ind];

    // cost after the step
    new_cf = target_function(b);

    if (new_cf >= (old_cf * improved_ratio))
    {
      // not improved enough: undo the step and charge the index
      b[ind] = old_b;
      lasti  = ind;
      thresholds[lasti]--;
      consecutive_fails++;
    }
    else
    {
      // improved: use a finer step for this index next time
      if (slices[ind] < 1024)
        slices[ind] = slices[ind] * 1.05;
      consecutive_fails = 0;
    }

    // quit once more than 10 steps in a row have been rejected
    if (consecutive_fails > 10)
    {
      break;
    }
  }

#ifdef DEBUG_PRUNER_OPTIMIZE_TC
  cerr << "# [TuningCost]" << endl;
  cerr << b << endl;
  cerr << "# [TuningCost] all_enum_cost = " << repeated_enum_cost(b) << endl;
  cerr << "# [TuningCost] succ_probability = " << measure_metric(b) << endl;
#endif

  save_coefficients(pr, b);
}
// SSE2 quantizer: quantizes n_coeffs coefficients 16 at a time, writes the
// quantized values to qcoeff_ptr and the dequantized values to dqcoeff_ptr,
// and stores the end-of-block count in *eob_ptr.
//
// coeff_ptr        input coefficients.
// n_coeffs         number of coefficients. The first 16 are handled
//                  unconditionally and the index then advances 16 at a time,
//                  so this presumably has to be a positive multiple of 16 --
//                  TODO confirm against callers.
// skip_block       non-zero: only zero both output buffers and set
//                  *eob_ptr to 0.
// zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr, dequant_ptr
//                  8-lane 16-bit parameter vectors, read with
//                  _mm_load_si128 (the aligned 128-bit load). The upper four
//                  lanes are reused for every coefficient after the first
//                  eight; going by the "DC"/"AC" comments below, lane 0
//                  presumably holds the DC value and the upper lanes the AC
//                  value.
// qcoeff_ptr       output: quantized coefficients.
// dqcoeff_ptr      output: quantized coefficients multiplied by dequant.
// eob_ptr          output: largest (iscan value + 1) over the coefficients
//                  whose dequantized value is non-zero, or 0 if there is none.
// scan_ptr         unused.
// iscan_ptr        per-coefficient scan positions, also read with
//                  _mm_load_si128.
//
// load_coefficients()/store_coefficients() are helpers defined outside this
// function; here they move eight coefficients between a tran_low_t buffer and
// a __m128i.
void vpx_quantize_b_sse2(const tran_low_t* coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t* zbin_ptr, const int16_t* round_ptr, const int16_t* quant_ptr, const int16_t* quant_shift_ptr, tran_low_t* qcoeff_ptr, tran_low_t* dqcoeff_ptr, const int16_t* dequant_ptr, uint16_t* eob_ptr, const int16_t* scan_ptr, const int16_t* iscan_ptr) {
  __m128i zero;
  (void)scan_ptr;

  // Point every buffer at its end and count n_coeffs up from -n_coeffs to 0,
  // so that `ptr + n_coeffs` walks forward and the loops can test against 0.
  coeff_ptr += n_coeffs;
  iscan_ptr += n_coeffs;
  qcoeff_ptr += n_coeffs;
  dqcoeff_ptr += n_coeffs;
  n_coeffs = -n_coeffs;
  zero = _mm_setzero_si128();
  if (!skip_block) {
    __m128i eob;
    __m128i zbin;
    __m128i round, quant, dequant, shift;
    {
      __m128i coeff0, coeff1;

      // Setup global values
      {
        __m128i pw_1;
        zbin = _mm_load_si128((const __m128i*)zbin_ptr);
        round = _mm_load_si128((const __m128i*)round_ptr);
        quant = _mm_load_si128((const __m128i*)quant_ptr);
        pw_1 = _mm_set1_epi16(1);
        // Use zbin - 1 so that the signed "greater than" compare below acts
        // as "greater than or equal to zbin".
        zbin = _mm_sub_epi16(zbin, pw_1);
        dequant = _mm_load_si128((const __m128i*)dequant_ptr);
        shift = _mm_load_si128((const __m128i*)quant_shift_ptr);
      }

      {
        __m128i coeff0_sign, coeff1_sign;
        __m128i qcoeff0, qcoeff1;
        __m128i qtmp0, qtmp1;
        __m128i cmp_mask0, cmp_mask1;
        // Do DC and first 15 AC
        coeff0 = load_coefficients(coeff_ptr + n_coeffs);
        coeff1 = load_coefficients(coeff_ptr + n_coeffs + 8);

        // Poor man's sign extract
        // (arithmetic shift by 15 gives 0 or -1 per lane; xor followed by
        // subtract of that mask yields the absolute value)
        coeff0_sign = _mm_srai_epi16(coeff0, 15);
        coeff1_sign = _mm_srai_epi16(coeff1, 15);
        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);

        // Each parameter vector is used once as loaded (first 8 lanes) and
        // is then overwritten with its upper half duplicated; that widened
        // form is what the second 8 lanes and the AC-only loop use. The
        // statement order below therefore matters.
        cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
        zbin = _mm_unpackhi_epi64(zbin, zbin);  // Switch DC to AC
        cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
        // saturating add of the rounding term
        qcoeff0 = _mm_adds_epi16(qcoeff0, round);
        round = _mm_unpackhi_epi64(round, round);
        qcoeff1 = _mm_adds_epi16(qcoeff1, round);
        // qtmp = high16(q * quant) + q
        qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
        quant = _mm_unpackhi_epi64(quant, quant);
        qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
        qtmp0 = _mm_add_epi16(qtmp0, qcoeff0);
        qtmp1 = _mm_add_epi16(qtmp1, qcoeff1);
        // q = high16(qtmp * shift)
        qcoeff0 = _mm_mulhi_epi16(qtmp0, shift);
        shift = _mm_unpackhi_epi64(shift, shift);
        qcoeff1 = _mm_mulhi_epi16(qtmp1, shift);

        // Reinsert signs
        qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign);
        qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign);
        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);

        // Mask out zbin threshold coeffs
        qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
        qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
        store_coefficients(qcoeff0, qcoeff_ptr + n_coeffs);
        store_coefficients(qcoeff1, qcoeff_ptr + n_coeffs + 8);

        // coeff0/coeff1 are reused to hold the dequantized values
        // (low 16 bits of q * dequant); the eob scan below reads them.
        coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
        dequant = _mm_unpackhi_epi64(dequant, dequant);
        coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
        store_coefficients(coeff0, dqcoeff_ptr + n_coeffs);
        store_coefficients(coeff1, dqcoeff_ptr + n_coeffs + 8);
      }

      {
        // Scan for eob
        __m128i zero_coeff0, zero_coeff1;
        __m128i nzero_coeff0, nzero_coeff1;
        __m128i iscan0, iscan1;
        __m128i eob1;
        // all-ones mask in the lanes whose dequantized value is zero ...
        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
        // ... inverted into an all-ones mask for the non-zero lanes
        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
        iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs));
        iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1);
        // Add one to convert from indices to counts
        // (subtracting the -1 mask adds 1 in the non-zero lanes)
        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
        // keep the counts of the non-zero lanes only, then take the
        // lane-wise maximum
        eob = _mm_and_si128(iscan0, nzero_coeff0);
        eob1 = _mm_and_si128(iscan1, nzero_coeff1);
        eob = _mm_max_epi16(eob, eob1);
      }
      n_coeffs += 8 * 2;
    }

    // AC only loop
    // Same steps as above for every further group of 16 coefficients; the
    // parameter vectors already hold their widened (upper-half) form.
    while (n_coeffs < 0) {
      __m128i coeff0, coeff1;
      {
        __m128i coeff0_sign, coeff1_sign;
        __m128i qcoeff0, qcoeff1;
        __m128i qtmp0, qtmp1;
        __m128i cmp_mask0, cmp_mask1;

        coeff0 = load_coefficients(coeff_ptr + n_coeffs);
        coeff1 = load_coefficients(coeff_ptr + n_coeffs + 8);

        // Poor man's sign extract
        coeff0_sign = _mm_srai_epi16(coeff0, 15);
        coeff1_sign = _mm_srai_epi16(coeff1, 15);
        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);

        cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
        cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
        qcoeff0 = _mm_adds_epi16(qcoeff0, round);
        qcoeff1 = _mm_adds_epi16(qcoeff1, round);
        qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
        qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
        qtmp0 = _mm_add_epi16(qtmp0, qcoeff0);
        qtmp1 = _mm_add_epi16(qtmp1, qcoeff1);
        qcoeff0 = _mm_mulhi_epi16(qtmp0, shift);
        qcoeff1 = _mm_mulhi_epi16(qtmp1, shift);

        // Reinsert signs
        qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign);
        qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign);
        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);

        // Mask out zbin threshold coeffs
        qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
        qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
        store_coefficients(qcoeff0, qcoeff_ptr + n_coeffs);
        store_coefficients(qcoeff1, qcoeff_ptr + n_coeffs + 8);

        coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
        coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
        store_coefficients(coeff0, dqcoeff_ptr + n_coeffs);
        store_coefficients(coeff1, dqcoeff_ptr + n_coeffs + 8);
      }

      {
        // Scan for eob
        __m128i zero_coeff0, zero_coeff1;
        __m128i nzero_coeff0, nzero_coeff1;
        __m128i iscan0, iscan1;
        __m128i eob0, eob1;
        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
        iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs));
        iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1);
        // Add one to convert from indices to counts
        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
        eob0 = _mm_and_si128(iscan0, nzero_coeff0);
        eob1 = _mm_and_si128(iscan1, nzero_coeff1);
        eob0 = _mm_max_epi16(eob0, eob1);
        // fold this group's maximum into the running lane-wise maximum
        eob = _mm_max_epi16(eob, eob0);
      }
      n_coeffs += 8 * 2;
    }

    // Accumulate EOB
    // Horizontal maximum over the eight lanes: fold the upper 64 bits onto
    // the lower, then 32-bit halves, then 16-bit neighbours. After the last
    // step lane 1 holds the overall maximum, which is what gets extracted.
    {
      __m128i eob_shuffled;
      eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
      eob = _mm_max_epi16(eob, eob_shuffled);
      eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
      eob = _mm_max_epi16(eob, eob_shuffled);
      eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
      eob = _mm_max_epi16(eob, eob_shuffled);
      *eob_ptr = _mm_extract_epi16(eob, 1);
    }
  } else {
    // Skipped block: zero both outputs, 16 coefficients per iteration. The
    // do/while writes the first 16 unconditionally.
    do {
      store_coefficients(zero, dqcoeff_ptr + n_coeffs);
      store_coefficients(zero, dqcoeff_ptr + n_coeffs + 8);
      store_coefficients(zero, qcoeff_ptr + n_coeffs);
      store_coefficients(zero, qcoeff_ptr + n_coeffs + 8);
      n_coeffs += 8 * 2;
    } while (n_coeffs < 0);
    *eob_ptr = 0;
  }
}
/**
 * Decrease the coefficients until measure_metric() is no longer above
 * `target`.
 *
 * Each tour computes one weight per index: the reciprocal of the sum of the
 * per-level costs detailed_cost[i .. dn-1], floored at OPTIMIZE_PROB_MINSTEP
 * and then normalised so that the weights sum to 1. Every coefficient is
 * lowered by its weight (and floored at OPTIMIZE_PROB_MINSTEP) before
 * enforce() is applied.
 *
 * The loop ends when the metric is at or below `target`, when a tour leaves
 * every coefficient unchanged, or when the tour counter has exceeded
 * OPTIMIZE_PROB_MAXSTEP.
 *
 * @param pr  in/out: coefficients to decrease; its size fixes the working
 *            dimension. Overwritten with the result.
 */
template <class FT>
void Pruner<FT>::optimize_coefficients_decr_prob(/*io*/ vector<double> &pr)
{
  int dn = pr.size();
  vec b(dn), old_b(dn);
  vector<double> detailed_cost(dn);
  vector<double> weight(dn);

  load_coefficients(b, pr);

  // decrease b until the target is achieved
  int tours = 0;
  while (1)
  {
    if (tours > OPTIMIZE_PROB_MAXSTEP)
      break;
    tours++;

    FT prob = measure_metric(b);
    if (prob <= target)
      break;

    // only the per-level breakdown is needed; the total cost is not used
    single_enum_cost(b, &(detailed_cost));

    // weight[i] = 1 / (detailed_cost[i] + ... + detailed_cost[dn-1]),
    // floored at the minimal step
    double normalized = 0.0;
    for (int i = 0; i < dn; i++)
    {
      weight[i] = 0.0;
      for (int j = i; j < dn; j++)
      {
        weight[i] = weight[i] + detailed_cost[j];
      }
      weight[i] = 1.0 / weight[i];
      if (weight[i] < OPTIMIZE_PROB_MINSTEP)
        weight[i] = OPTIMIZE_PROB_MINSTEP;
      normalized += weight[i];
    }

    // normalise so that the weights sum to 1
    for (int i = 0; i < dn; i++)
    {
      weight[i] = weight[i] / normalized;
    }

    // remember the current value, then lower each coefficient by its weight
    for (int i = dn - 1; i >= 0; --i)
    {
      old_b[i] = b[i];
      b[i]     = b[i] - weight[i];
      if (b[i] < OPTIMIZE_PROB_MINSTEP)
        b[i] = OPTIMIZE_PROB_MINSTEP;
    }

    enforce(b);

    // stop when the tour had no effect on any coefficient
    bool not_changed = true;
    for (int i = dn - 1; i >= 0; --i)
    {
      if (b[i] != old_b[i])
      {
        not_changed = false;
        break;
      }
    }
    if (not_changed)
    {
      break;
    }
  }

  save_coefficients(pr, b);
}
/**
 * Nudge all coefficients up or down until measure_metric() is within 5% of
 * `target`.
 *
 * Each tour computes ratio = measure_metric(b) / target. When the ratio lies
 * strictly between 0.95 and 1.05 the loop stops. Otherwise every coefficient
 * is moved by OPTIMIZE_PROB_MINSTEP: upwards (capped at 1.0) when the ratio
 * is below 1, downwards (floored at OPTIMIZE_PROB_MINSTEP) otherwise, and
 * enforce() is applied. The loop also stops when a tour leaves every
 * coefficient unchanged.
 *
 * Because the step size is fixed, a single step can jump from one side of
 * the accepted band to the other, in which case the loop would alternate
 * between two states indefinitely. The tour counter is therefore capped with
 * the same OPTIMIZE_PROB_MAXSTEP test that optimize_coefficients_decr_prob
 * uses.
 *
 * @param pr  in/out: coefficients to adjust; its size fixes the working
 *            dimension. Overwritten with the result.
 */
template <class FT>
void Pruner<FT>::optimize_coefficients_local_adjust_prob(/*io*/ vector<double> &pr)
{
  int dn = pr.size();
  FT prob, ratio;
  vec b(dn), old_b(dn);

  load_coefficients(b, pr);

  // adjust b until the target is achieved
  int tours = 0;
  while (1)
  {
    // bound the number of tours so an oscillating adjustment terminates
    if (tours > OPTIMIZE_PROB_MAXSTEP)
      break;
    tours++;

    prob  = measure_metric(b);
    ratio = prob / target;

    // good enough
    if (ratio < 1.05 && ratio > 0.95)
      break;

    // tune
    if (ratio < 1)
    {
      // metric too low: raise every coefficient, capped at 1.0
      for (int i = dn - 1; i >= 0; --i)
      {
        old_b[i] = b[i];
        b[i]     = b[i] + OPTIMIZE_PROB_MINSTEP;
        if (b[i] >= 1.0)
          b[i] = 1.0;
      }
    }
    else
    {
      // metric too high: lower every coefficient, floored at the minimal step
      for (int i = dn - 1; i >= 0; --i)
      {
        old_b[i] = b[i];
        b[i]     = b[i] - OPTIMIZE_PROB_MINSTEP;
        if (b[i] < OPTIMIZE_PROB_MINSTEP)
          b[i] = OPTIMIZE_PROB_MINSTEP;
      }
    }

    enforce(b);

    // stop when the tour had no effect on any coefficient
    bool not_changed = true;
    for (int i = dn - 1; i >= 0; --i)
    {
      if (b[i] != old_b[i])
      {
        not_changed = false;
        break;
      }
    }
    if (not_changed)
      break;
  }

  save_coefficients(pr, b);
}