Esempio n. 1
0
/**
 *  Preparation work producing raw (starting) pruning coefficients.
 *
 *  Behaviour depends on `flags`:
 *   - PRUNER_START_FROM_INPUT set: the starting point is loaded from `pr`;
 *     otherwise it is produced by greedy().
 *   - PRUNER_GRADIENT or PRUNER_NELDER_MEAD set: min_pruning_coefficients is
 *     recomputed by a greedy() run performed with preproc_cost temporarily
 *     scaled by 0.1.
 *
 *  @param pr  in/out; read only when PRUNER_START_FROM_INPUT is set, and always
 *             overwritten with the resulting coefficients before returning.
 */
template <class FT> void Pruner<FT>::optimize_coefficients_preparation(/*io*/ vector<double> &pr)
{
  evec b(d);

  if (flags & PRUNER_START_FROM_INPUT)
  {
    // start from the coefficients supplied by the caller
    load_coefficients(b, pr);
  }
  else
  {
    // greedy method
    greedy(b);
#ifdef DEBUG_PRUNER_OPTIMIZE_TC
    cerr << "# [Greedy]" << endl;
    cerr << b << endl;
    cerr << "# [Greedy] single_enum_cost  = " << single_enum_cost(b) << endl;
    cerr << "# [Greedy] succ_probability  = " << measure_metric(b) << endl;
    cerr << "# [Greedy]    all_enum_cost  = " << repeated_enum_cost(b) << endl;
#endif
  }

  // greedy method for min pruning coefficients.
  if (flags & (PRUNER_GRADIENT | PRUNER_NELDER_MEAD))
  {
    // Keep the exact original value: multiplying by .1 and then by 10 does not
    // round-trip in floating point, so preproc_cost would drift on every call.
    const FT saved_preproc_cost = preproc_cost;
    preproc_cost *= .1;
    greedy(min_pruning_coefficients);

    // The aim is to get a lower bound for the pruning parameters; this lower
    // bound is used later by enforce().
    // In the case of a fixed input probability, check whether
    // min_pruning_coefficients is small enough. Otherwise, reduce it further.
    // This is important since otherwise one may never achieve the target
    // probability.
    if (!opt_single)
    {
      vector<double> pr_min(n);
      save_coefficients(pr_min, min_pruning_coefficients);
      if (measure_metric(min_pruning_coefficients) > target)
      {
        // clear the current lower bound before decreasing further
        fill(min_pruning_coefficients.begin(), min_pruning_coefficients.end(), 0.);
        optimize_coefficients_decr_prob(pr_min);
      }
      load_coefficients(min_pruning_coefficients, pr_min);
    }
    preproc_cost = saved_preproc_cost;
  }
  save_coefficients(pr, b);
}
Esempio n. 2
0
void Pruner<FT>::optimize_coefficients_local_adjust_smooth(/*io*/ vector<double> &pr)
{
  // Smooths the coefficient curve in place: every interior entry is compared
  // with its two neighbours and replaced by a mean of them when it sticks out.
  // The first and last entries are never modified.
  vec coeffs(n);
  FT left_ratio, right_ratio, skew;
  FT max_gap = 1.0 / n;  // largest tolerated difference between neighbours
  load_coefficients(coeffs, pr);

  for (int k = 1; k + 1 < n; ++k)
  {
    left_ratio  = coeffs[k] / coeffs[k - 1];
    right_ratio = coeffs[k + 1] / coeffs[k];
    skew        = right_ratio / left_ratio;

    // growth ratios on both sides differ too much: use the geometric mean
    if ((skew > 1.25) || (skew < 0.8))
    {
      coeffs[k] = sqrt(coeffs[k - 1] * coeffs[k + 1]);
    }

    // a step to either neighbour is still too large: use the arithmetic mean
    // (this sees the value possibly rewritten just above)
    if ((coeffs[k + 1] - coeffs[k]) > max_gap || (coeffs[k] - coeffs[k - 1]) > max_gap)
    {
      coeffs[k] = (coeffs[k - 1] + coeffs[k + 1]) / 2.0;
    }
  }

  save_coefficients(pr, coeffs);
}
Esempio n. 3
0
/**
 *  Optimization pass over all n coefficients, i.e. without the
 *  constraint b_i = b_{i+1} for even i.
 *
 *  Runs gradient descent and/or Nelder-Mead depending on `flags`, reading the
 *  starting point from `pr` and writing the result back to `pr`.
 */
template <class FT> void Pruner<FT>::optimize_coefficients_full_core(/*io*/ vector<double> &pr)
{
  vec coeffs(n);

  // the input is always loaded: this pass runs after optimize_coefficients_evec
  load_coefficients(coeffs, pr);

  // first stage: gradient descent
  if ((flags & PRUNER_GRADIENT) != 0)
  {
    if (verbosity)
    {
      cerr << "\nGradient descent start (dim=" << n << ")" << endl;
    }

    gradient_descent(coeffs);
#ifdef DEBUG_PRUNER_OPTIMIZE_TC
    cerr << "# [Descent]" << endl;
    cerr << coeffs << endl;
    cerr << "# [Descent] single_enum_cost  = " << single_enum_cost(coeffs) << endl;
    cerr << "# [Descent] succ_probability  = " << measure_metric(coeffs) << endl;
    cerr << "# [Descent]    all_enum_cost  = " << repeated_enum_cost(coeffs) << endl;
#endif
  }

  // second stage: Nelder-Mead
  if ((flags & PRUNER_NELDER_MEAD) != 0)
  {
    if (verbosity)
    {
      cerr << "\nNelder-Mead start (dim=" << n << ")" << endl;
    }

    // keep stepping for as long as a step reports a non-zero result
    for (;;)
    {
      if (!nelder_mead_step(coeffs))
        break;
    }

#ifdef DEBUG_PRUNER_OPTIMIZE_TC
    cerr << "# [Nelder-Mead]" << endl;
    cerr << coeffs << endl;
    cerr << "# [Nelder-Mead] single_enum_cost  = " << single_enum_cost(coeffs) << endl;
    cerr << "# [Nelder-Mead] succ_probability  = " << measure_metric(coeffs) << endl;
    cerr << "# [Nelder-Mead]    all_enum_cost  = " << repeated_enum_cost(coeffs) << endl;
#endif
  }

  save_coefficients(pr, coeffs);
}
Esempio n. 4
0
void Pruner<FT>::optimize_coefficients_local_adjust_incr_prob(/*io*/ vector<double> &pr)
{
  // Local tuning that raises coefficients below the costliest level, as long
  // as target_function() does not grow too much. Reads the starting point
  // from `pr` and writes the tuned coefficients back to `pr`.
  vector<double> detailed_cost(n);
  // (b[i] - b[i-1]) / step_div[i-1] is the increment tried on b[i-1]
  vector<double> step_div(n, 10.0);
  vec b(n);
  load_coefficients(b, pr);

  FT cost_before, single_cost, cost_after, saved_entry;

  // cost of the untouched input; bounds the total growth over all tours
  FT initial_cost = target_function(b);

  for (int tours = 1;; ++tours)
  {
    // cost at the start of this tour
    cost_before = target_function(b);

    // locate the level with the largest per-level cost
    single_cost = single_enum_cost(b, &(detailed_cost));
    int heaviest         = 0;
    double heaviest_cost = 0.0;
    for (int k = 0; k < n; k++)
    {
      if (detailed_cost[k] > heaviest_cost)
      {
        heaviest_cost = detailed_cost[k];
        heaviest      = k;
      }
    }

    // per-level costs are indexed in reverse order of the coefficients
    const int ind = n - heaviest - 1;
    if (ind <= 1)
      break;

#ifdef BALANCE_HEURISTIC_PRUNER_OPTIMIZE
    if (single_cost > sqrt(cost_before) / 10.0)
      break;
#endif

    for (int i = ind; i >= 1; --i)
    {
      // no room between b[i-1] and b[i]
      if (b[i] <= b[i - 1])
        continue;

      // b[i] stays fixed; push b[i-1] towards it, at most 10 times
      for (int trials = 0; trials < 10; ++trials)
      {
        cost_before = target_function(b);

        saved_entry = b[i - 1];
        b[i - 1]    = b[i - 1] + (b[i] - b[i - 1]) / step_div[i - 1];

        cost_after = target_function(b);

        // cost grew by 20% or more: undo the step and give up on this index
        if (cost_after >= (cost_before * 1.2))
        {
          b[i - 1] = saved_entry;
          break;
        }

        // step accepted: use a finer step next time
        if (step_div[i - 1] < 1024)
          step_div[i - 1] *= 1.2;
      }
    }

    // stop once the cost exceeds 110% of the initial one, or after 5 tours
    cost_after = target_function(b);
    if (cost_after > (initial_cost * 1.1) || tours > 4)
      break;
  }

#ifdef DEBUG_PRUNER_OPTIMIZE_TC
  cerr << "# [TuningProb]" << endl;
  cerr << b << endl;
  cerr << "# [TuningProb] all_enum_cost    = " << repeated_enum_cost(b) << endl;
  cerr << "# [TuningProb] succ_probability = " << measure_metric(b) << endl;
#endif

  save_coefficients(pr, b);
}
Esempio n. 5
0
void Pruner<FT>::optimize_coefficients_local_adjust_decr_single(/*io*/ vector<double> &pr)
{
  // Local tuning that repeatedly lowers the coefficient matching the costliest
  // level, keeping a step only if target_function() drops below
  // improved_ratio times its previous value. Reads the starting point from
  // `pr` and writes the tuned coefficients back to `pr`.
  vector<double> detailed_cost(n);
  // (b[i] - b[i-1]) / step_div[i] is the decrement tried on b[i]
  vector<double> step_div(n, 10.0);
  // remaining failed attempts allowed per coefficient index
  vector<int> budget(n, 3);
  vec b(n);

  load_coefficients(b, pr);

  const double improved_ratio = 0.995;  // required new/old cost ratio
  int last_failed             = -1;     // index that failed most recently; skipped next
  int fail_streak             = 0;      // failures in a row
  FT cost_before, single_cost, cost_after, saved_entry;

  for (;;)
  {
    cost_before = target_function(b);

    // per-level costs, used to find the bottleneck
    single_cost = single_enum_cost(b, &(detailed_cost));

// heuristic
#ifdef BALANCE_HEURISTIC_PRUNER_OPTIMIZE
    if (single_cost < sqrt(cost_before) / 10.0)
      break;
#endif

    // costliest level among those still eligible; per-level costs are
    // indexed in reverse order of the coefficients
    int heaviest         = 0;
    double heaviest_cost = 0.0;
    for (int k = 0; k < n; k++)
    {
      if ((k == (n - last_failed - 1)) || (budget[n - k - 1] <= 0))
        continue;
      if (detailed_cost[k] > heaviest_cost)
      {
        heaviest_cost = detailed_cost[k];
        heaviest      = k;
      }
    }

    // b[ind] is the entry to be reduced; index 0 has no lower neighbour
    const int ind = n - heaviest - 1;
    if (ind == 0)
      break;

    saved_entry = b[ind];
    b[ind]      = b[ind] - (b[ind] - b[ind - 1]) / step_div[ind];

    cost_after = target_function(b);

    if (cost_after >= (cost_before * improved_ratio))
    {
      // not improved enough: undo, and charge the failure to this index
      b[ind]      = saved_entry;
      last_failed = ind;
      budget[last_failed]--;
      fail_streak++;
    }
    else
    {
      // improved: use a finer step next time and reset the failure streak
      if (step_div[ind] < 1024)
        step_div[ind] *= 1.05;
      fail_streak = 0;
    }

    // give up once more than 10 attempts in a row have failed
    if (fail_streak > 10)
      break;
  }

#ifdef DEBUG_PRUNER_OPTIMIZE_TC
  cerr << "# [TuningCost]" << endl;
  cerr << b << endl;
  cerr << "# [TuningCost] all_enum_cost    = " << repeated_enum_cost(b) << endl;
  cerr << "# [TuningCost] succ_probability = " << measure_metric(b) << endl;
#endif

  save_coefficients(pr, b);
}
Esempio n. 6
0
// Quantizes a block of transform coefficients with SSE2, 16 coefficients per
// step, writing the quantized values to qcoeff_ptr, the dequantized values to
// dqcoeff_ptr and the end-of-block count to *eob_ptr.
//
//   coeff_ptr        input coefficients
//   n_coeffs         number of coefficients; the first 16 are processed
//                    unconditionally and the rest in steps of 16, so this
//                    assumes a positive multiple of 16 -- TODO confirm callers
//   skip_block       non-zero: only zero the outputs and set *eob_ptr to 0
//   zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr, dequant_ptr
//                    8-entry int16 tables read with aligned 128-bit loads;
//                    presumably lane 0 is the DC entry and the upper lanes
//                    hold the AC entry (see "Switch DC to AC" below)
//   qcoeff_ptr       output: quantized coefficients
//   dqcoeff_ptr      output: qcoeff * dequant
//   eob_ptr          output: max over non-zero coefficients of (iscan + 1),
//                    or 0 if there is none
//   scan_ptr         unused
//   iscan_ptr        per-coefficient scan positions, read with aligned loads
//
// load_coefficients()/store_coefficients() are helpers defined elsewhere;
// presumably they move 8 coefficients between tran_low_t storage and 16-bit
// lanes -- verify against their definition.
void vpx_quantize_b_sse2(const tran_low_t* coeff_ptr, intptr_t n_coeffs,
                         int skip_block, const int16_t* zbin_ptr,
                         const int16_t* round_ptr, const int16_t* quant_ptr,
                         const int16_t* quant_shift_ptr, tran_low_t* qcoeff_ptr,
                         tran_low_t* dqcoeff_ptr, const int16_t* dequant_ptr,
                         uint16_t* eob_ptr, const int16_t* scan_ptr,
                         const int16_t* iscan_ptr) {
  __m128i zero;
  (void)scan_ptr;

  // Point every array at its end and negate the count, so that
  // `ptr + n_coeffs` walks forward while n_coeffs counts up towards zero.
  coeff_ptr += n_coeffs;
  iscan_ptr += n_coeffs;
  qcoeff_ptr += n_coeffs;
  dqcoeff_ptr += n_coeffs;
  n_coeffs = -n_coeffs;
  zero = _mm_setzero_si128();
  if (!skip_block) {
    __m128i eob;
    __m128i zbin;
    __m128i round, quant, dequant, shift;
    {
      __m128i coeff0, coeff1;

      // Setup global values
      {
        __m128i pw_1;
        zbin = _mm_load_si128((const __m128i*)zbin_ptr);
        round = _mm_load_si128((const __m128i*)round_ptr);
        quant = _mm_load_si128((const __m128i*)quant_ptr);
        pw_1 = _mm_set1_epi16(1);
        // Use zbin - 1 so the strict greater-than compare below behaves as
        // "abs(coeff) >= zbin".
        zbin = _mm_sub_epi16(zbin, pw_1);
        dequant = _mm_load_si128((const __m128i*)dequant_ptr);
        shift = _mm_load_si128((const __m128i*)quant_shift_ptr);
      }

      {
        __m128i coeff0_sign, coeff1_sign;
        __m128i qcoeff0, qcoeff1;
        __m128i qtmp0, qtmp1;
        __m128i cmp_mask0, cmp_mask1;
        // Do DC and first 15 AC
        coeff0 = load_coefficients(coeff_ptr + n_coeffs);
        coeff1 = load_coefficients(coeff_ptr + n_coeffs + 8);

        // Poor man's sign extract
        // The arithmetic shift by 15 gives 0 for non-negative lanes and -1 for
        // negative ones; (x ^ sign) - sign is then the absolute value.
        coeff0_sign = _mm_srai_epi16(coeff0, 15);
        coeff1_sign = _mm_srai_epi16(coeff1, 15);
        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);

        // Each table is used once as loaded (for the first 8 coefficients)
        // and then has its upper 64 bits copied over its lower 64 bits, so
        // the second 8 coefficients and the AC-only loop below see the upper
        // entries in every lane.
        cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
        zbin = _mm_unpackhi_epi64(zbin, zbin);  // Switch DC to AC
        cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
        // Saturating add of the rounding term.
        qcoeff0 = _mm_adds_epi16(qcoeff0, round);
        round = _mm_unpackhi_epi64(round, round);
        qcoeff1 = _mm_adds_epi16(qcoeff1, round);
        // q = (((x * quant) >> 16) + x) * shift >> 16, per 16-bit lane.
        qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
        quant = _mm_unpackhi_epi64(quant, quant);
        qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
        qtmp0 = _mm_add_epi16(qtmp0, qcoeff0);
        qtmp1 = _mm_add_epi16(qtmp1, qcoeff1);
        qcoeff0 = _mm_mulhi_epi16(qtmp0, shift);
        shift = _mm_unpackhi_epi64(shift, shift);
        qcoeff1 = _mm_mulhi_epi16(qtmp1, shift);

        // Reinsert signs
        qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign);
        qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign);
        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);

        // Mask out zbin threshold coeffs
        qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
        qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);

        store_coefficients(qcoeff0, qcoeff_ptr + n_coeffs);
        store_coefficients(qcoeff1, qcoeff_ptr + n_coeffs + 8);

        // coeff0/coeff1 are reused to hold the dequantized values (low 16
        // bits of qcoeff * dequant); the eob scan below tests these.
        coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
        dequant = _mm_unpackhi_epi64(dequant, dequant);
        coeff1 = _mm_mullo_epi16(qcoeff1, dequant);

        store_coefficients(coeff0, dqcoeff_ptr + n_coeffs);
        store_coefficients(coeff1, dqcoeff_ptr + n_coeffs + 8);
      }

      {
        // Scan for eob
        __m128i zero_coeff0, zero_coeff1;
        __m128i nzero_coeff0, nzero_coeff1;
        __m128i iscan0, iscan1;
        __m128i eob1;
        // nzero_coeffN is all-ones (-1) in lanes whose dequantized value is
        // non-zero, and zero elsewhere.
        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
        iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs));
        iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1);
        // Add one to convert from indices to counts
        // (subtracting the -1 mask adds 1 in the non-zero lanes)
        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
        // Keep the counts of non-zero lanes only, then take the lane-wise max.
        eob = _mm_and_si128(iscan0, nzero_coeff0);
        eob1 = _mm_and_si128(iscan1, nzero_coeff1);
        eob = _mm_max_epi16(eob, eob1);
      }
      n_coeffs += 8 * 2;
    }

    // AC only loop
    // Same computation as above for each remaining group of 16 coefficients,
    // using the tables as left by the first step (no further switching).
    while (n_coeffs < 0) {
      __m128i coeff0, coeff1;
      {
        __m128i coeff0_sign, coeff1_sign;
        __m128i qcoeff0, qcoeff1;
        __m128i qtmp0, qtmp1;
        __m128i cmp_mask0, cmp_mask1;

        coeff0 = load_coefficients(coeff_ptr + n_coeffs);
        coeff1 = load_coefficients(coeff_ptr + n_coeffs + 8);

        // Poor man's sign extract
        coeff0_sign = _mm_srai_epi16(coeff0, 15);
        coeff1_sign = _mm_srai_epi16(coeff1, 15);
        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);

        cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
        cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
        qcoeff0 = _mm_adds_epi16(qcoeff0, round);
        qcoeff1 = _mm_adds_epi16(qcoeff1, round);
        qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
        qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
        qtmp0 = _mm_add_epi16(qtmp0, qcoeff0);
        qtmp1 = _mm_add_epi16(qtmp1, qcoeff1);
        qcoeff0 = _mm_mulhi_epi16(qtmp0, shift);
        qcoeff1 = _mm_mulhi_epi16(qtmp1, shift);

        // Reinsert signs
        qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign);
        qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign);
        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);

        // Mask out zbin threshold coeffs
        qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
        qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);

        store_coefficients(qcoeff0, qcoeff_ptr + n_coeffs);
        store_coefficients(qcoeff1, qcoeff_ptr + n_coeffs + 8);

        coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
        coeff1 = _mm_mullo_epi16(qcoeff1, dequant);

        store_coefficients(coeff0, dqcoeff_ptr + n_coeffs);
        store_coefficients(coeff1, dqcoeff_ptr + n_coeffs + 8);
      }

      {
        // Scan for eob
        __m128i zero_coeff0, zero_coeff1;
        __m128i nzero_coeff0, nzero_coeff1;
        __m128i iscan0, iscan1;
        __m128i eob0, eob1;
        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
        iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs));
        iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1);
        // Add one to convert from indices to counts
        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
        eob0 = _mm_and_si128(iscan0, nzero_coeff0);
        eob1 = _mm_and_si128(iscan1, nzero_coeff1);
        eob0 = _mm_max_epi16(eob0, eob1);
        // Fold this group's maximum into the running lane-wise maximum.
        eob = _mm_max_epi16(eob, eob0);
      }
      n_coeffs += 8 * 2;
    }

    // Accumulate EOB
    // Horizontal max of the eight 16-bit lanes: fold the upper 64 bits onto
    // the lower, then words 2..3 onto words 0..1, then swap words 0 and 1.
    // Afterwards word 1 (and word 0) holds the overall maximum.
    {
      __m128i eob_shuffled;
      eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
      eob = _mm_max_epi16(eob, eob_shuffled);
      eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
      eob = _mm_max_epi16(eob, eob_shuffled);
      eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
      eob = _mm_max_epi16(eob, eob_shuffled);
      *eob_ptr = _mm_extract_epi16(eob, 1);
    }
  } else {
    // Skipped block: zero both outputs in groups of 16 (the do-while runs at
    // least once) and report an empty block.
    do {
      store_coefficients(zero, dqcoeff_ptr + n_coeffs);
      store_coefficients(zero, dqcoeff_ptr + n_coeffs + 8);
      store_coefficients(zero, qcoeff_ptr + n_coeffs);
      store_coefficients(zero, qcoeff_ptr + n_coeffs + 8);
      n_coeffs += 8 * 2;
    } while (n_coeffs < 0);
    *eob_ptr = 0;
  }
}
Esempio n. 7
0
/**
 *  Lowers the pruning coefficients until measure_metric() no longer exceeds
 *  `target`.
 *
 *  Each round subtracts a per-index weight from every coefficient. The weight
 *  of index i is the reciprocal of the summed per-level costs
 *  detailed_cost[i..dn-1], floored at OPTIMIZE_PROB_MINSTEP and then normalized
 *  so that all weights sum to 1. The loop stops when the target is met, when
 *  the round counter exceeds OPTIMIZE_PROB_MAXSTEP, or when a round followed
 *  by enforce() leaves every coefficient unchanged.
 *
 *  @param pr  in/out; starting coefficients on entry, lowered ones on return.
 */
template <class FT> void Pruner<FT>::optimize_coefficients_decr_prob(/*io*/ vector<double> &pr)
{
  int dn = pr.size();
  vec b(dn), old_b(dn);
  vector<double> detailed_cost(dn);
  vector<double> weight(dn);

  load_coefficients(b, pr);

  // decrease b until the target is achieved
  int tours = 0;
  while (1)
  {
    if (tours > OPTIMIZE_PROB_MAXSTEP)
      break;
    tours++;

    FT prob = measure_metric(b);
    if (prob <= target)
      break;

    // only the per-level costs are needed here; the returned total is ignored
    single_enum_cost(b, &(detailed_cost));

    double normalized = 0.0;
    for (int i = 0; i < dn; i++)
    {
      // sum of the per-level costs from index i to the end
      weight[i] = 0.0;
      for (int j = i; j < dn; j++)
      {
        weight[i] = weight[i] + detailed_cost[j];
      }
      weight[i] = 1.0 / weight[i];
      if (weight[i] < OPTIMIZE_PROB_MINSTEP)
        weight[i] = OPTIMIZE_PROB_MINSTEP;
      normalized += weight[i];
    }
    for (int i = 0; i < dn; i++)
    {
      weight[i] = weight[i] / normalized;
    }

    // take one step down, never going below OPTIMIZE_PROB_MINSTEP
    for (int i = dn - 1; i >= 0; --i)
    {
      old_b[i] = b[i];
      b[i]     = b[i] - weight[i];
      if (b[i] < OPTIMIZE_PROB_MINSTEP)
        b[i] = OPTIMIZE_PROB_MINSTEP;
    }

    enforce(b);

    // stop if the step followed by enforce() changed nothing
    bool not_changed = true;
    for (int i = dn - 1; i >= 0; --i)
    {
      if (b[i] != old_b[i])
      {
        not_changed = false;
        break;
      }
    }
    if (not_changed)
    {
      break;
    }
  }
  save_coefficients(pr, b);
}
Esempio n. 8
0
void Pruner<FT>::optimize_coefficients_local_adjust_prob(/*io*/ vector<double> &pr)
{
  // Nudges all coefficients up or down by OPTIMIZE_PROB_MINSTEP per round until
  // measure_metric(b) / target lies strictly between 0.95 and 1.05, or until a
  // round followed by enforce() leaves every coefficient unchanged.
  // `pr` holds the starting coefficients on entry and the adjusted ones on
  // return.
  //
  // NOTE(review): there is no cap on the number of rounds. If one step moves
  // the ratio from below 0.95 to above 1.05 (or the reverse), the loop looks
  // like it can alternate between the two branches indefinitely -- confirm
  // whether a bound such as the one in optimize_coefficients_decr_prob is
  // wanted here.
  int dn = pr.size();
  FT prob, ratio;
  vec b(dn), old_b(dn);

  load_coefficients(b, pr);

  while (1)
  {
    prob  = measure_metric(b);
    ratio = prob / target;

    // good enough
    if (ratio < 1.05 && ratio > 0.95)
      break;

    // tune
    if (ratio < 1)
    {
      // metric too low: raise every coefficient, capped at 1.0
      for (int i = dn - 1; i >= 0; --i)
      {
        old_b[i] = b[i];
        b[i]     = b[i] + OPTIMIZE_PROB_MINSTEP;
        if (b[i] >= 1.0)
          b[i] = 1.0;
      }
    }
    else
    {
      // metric too high: lower every coefficient, floored at the step size
      for (int i = dn - 1; i >= 0; --i)
      {
        old_b[i] = b[i];
        b[i]     = b[i] - OPTIMIZE_PROB_MINSTEP;
        if (b[i] < OPTIMIZE_PROB_MINSTEP)
          b[i] = OPTIMIZE_PROB_MINSTEP;
      }
    }

    enforce(b);

    // stop if the step followed by enforce() changed nothing
    bool not_changed = true;
    for (int i = dn - 1; i >= 0; --i)
    {
      if (b[i] != old_b[i])
      {
        not_changed = false;
        break;
      }
    }
    if (not_changed)
      break;
  }

  save_coefficients(pr, b);
}