/** Find the codepoint on the given PSphere closest to the desired * vector. Double-precision PVQ search just to make sure our tests * aren't limited by numerical accuracy. * * @param [in] xcoeff input vector to quantize (x in the math doc) * @param [in] n number of dimensions * @param [in] k number of pulses * @param [out] ypulse optimal codevector found (y in the math doc) * @return cosine distance between x and y (between 0 and 1) */ static double pvq_search_double(const double *xcoeff, int n, int k, od_coeff *ypulse) { int i, j; double xy; double yy; double x[1024]; double xx; xx = xy = yy = 0; for (j = 0; j < n; j++) { x[j] = fabs(xcoeff[j]); xx += x[j]*x[j]; } i = 0; if (k > 2) { double l1_norm; double l1_inv; l1_norm = 0; for (j = 0; j < n; j++) l1_norm += x[j]; l1_inv = 1./OD_MAXF(l1_norm, 1e-100); for (j = 0; j < n; j++) { ypulse[j] = OD_MAXI(0, (int)floor(k*x[j]*l1_inv)); xy += x[j]*ypulse[j]; yy += ypulse[j]*ypulse[j]; i += ypulse[j]; } } else { for (j = 0; j < n; j++) ypulse[j] = 0; } /* Search one pulse at a time */ for (; i < k; i++) { int pos; double best_xy; double best_yy; pos = 0; best_xy = -10; best_yy = 1; for (j = 0; j < n; j++) { double tmp_xy; double tmp_yy; tmp_xy = xy + x[j]; tmp_yy = yy + 2*ypulse[j] + 1; tmp_xy *= tmp_xy; if (j == 0 || tmp_xy*best_yy > best_xy*tmp_yy) { best_xy = tmp_xy; best_yy = tmp_yy; pos = j; } } xy = xy + x[pos]; yy = yy + 2*ypulse[pos] + 1; ypulse[pos]++; } for (i = 0; i < n; i++) { if (xcoeff[i] < 0) ypulse[i] = -ypulse[i]; } return xy/(1e-100 + sqrt(xx*yy)); }
/** Find the codepoint on the given PSphere closest to the desired * vector. Double-precision PVQ search just to make sure our tests * aren't limited by numerical accuracy. * * @param [in] xcoeff input vector to quantize (x in the math doc) * @param [in] n number of dimensions * @param [in] k number of pulses * @param [out] ypulse optimal codevector found (y in the math doc) * @param [out] g2 multiplier for the distortion (typically squared * gain units) * @return cosine distance between x and y (between 0 and 1) */ static double pvq_search_rdo_double(const od_val16 *xcoeff, int n, int k, od_coeff *ypulse, double g2) { int i, j; double xy; double yy; /* TODO - This blows our 8kB stack space budget and should be fixed when converting PVQ to fixed point. */ double x[MAXN]; double xx; double lambda; double norm_1; int rdo_pulses; double delta_rate; xx = xy = yy = 0; for (j = 0; j < n; j++) { x[j] = fabs((float)xcoeff[j]); xx += x[j]*x[j]; } norm_1 = 1./sqrt(1e-30 + xx); lambda = OD_PVQ_LAMBDA/(1e-30 + g2); i = 0; if (k > 2) { double l1_norm; double l1_inv; l1_norm = 0; for (j = 0; j < n; j++) l1_norm += x[j]; l1_inv = 1./OD_MAXF(l1_norm, 1e-100); for (j = 0; j < n; j++) { ypulse[j] = OD_MAXI(0, (int)floor(k*x[j]*l1_inv)); xy += x[j]*ypulse[j]; yy += ypulse[j]*ypulse[j]; i += ypulse[j]; } } else { for (j = 0; j < n; j++) ypulse[j] = 0; } /* Only use RDO on the last few pulses. This not only saves CPU, but using RDO on all pulses actually makes the results worse for reasons I don't fully understand. */ rdo_pulses = 1 + k/4; /* Rough assumption for now, the last position costs about 3 bits more than the first. */ delta_rate = 3./n; /* Search one pulse at a time */ for (; i < k - rdo_pulses; i++) { int pos; double best_xy; double best_yy; pos = 0; best_xy = -10; best_yy = 1; for (j = 0; j < n; j++) { double tmp_xy; double tmp_yy; tmp_xy = xy + x[j]; tmp_yy = yy + 2*ypulse[j] + 1; tmp_xy *= tmp_xy; if (j == 0 || tmp_xy*best_yy > best_xy*tmp_yy) { best_xy = tmp_xy; best_yy = tmp_yy; pos = j; } } xy = xy + x[pos]; yy = yy + 2*ypulse[pos] + 1; ypulse[pos]++; } /* Search last pulses with RDO. Distortion is D = (x-y)^2 = x^2 - x*y + y^2 and since x^2 and y^2 are constant, we just maximize x*y, plus a lambda*rate term. Note that since x and y aren't normalized here, we need to divide by sqrt(x^2)*sqrt(y^2). */ for (; i < k; i++) { double rsqrt_table[4]; int rsqrt_table_size = 4; int pos; double best_cost; pos = 0; best_cost = -1e5; /*Fill the small rsqrt lookup table with inputs relative to yy. Specifically, the table of n values is filled with rsqrt(yy + 1), rsqrt(yy + 2 + 1) .. rsqrt(yy + 2*(n-1) + 1).*/ od_fill_dynamic_rqrt_table(rsqrt_table, rsqrt_table_size, yy); for (j = 0; j < n; j++) { double tmp_xy; double tmp_yy; tmp_xy = xy + x[j]; /*Calculate rsqrt(yy + 2*ypulse[j] + 1) using an optimized method.*/ tmp_yy = od_custom_rsqrt_dynamic_table(rsqrt_table, rsqrt_table_size, yy, ypulse[j]); tmp_xy = 2*tmp_xy*norm_1*tmp_yy - lambda*j*delta_rate; if (j == 0 || tmp_xy > best_cost) { best_cost = tmp_xy; pos = j; } } xy = xy + x[pos]; yy = yy + 2*ypulse[pos] + 1; ypulse[pos]++; } for (i = 0; i < n; i++) { if (xcoeff[i] < 0) ypulse[i] = -ypulse[i]; } return xy/(1e-100 + sqrt(xx*yy)); }
/** Perform PVQ quantization with prediction, trying several * possible gains and angles. See draft-valin-videocodec-pvq and * http://jmvalin.ca/slides/pvq.pdf for more details. * * @param [out] out coefficients after quantization * @param [in] x0 coefficients before quantization * @param [in] r0 reference, aka predicted coefficients * @param [in] n number of dimensions * @param [in] q0 quantization step size * @param [out] y pulse vector (i.e. selected PVQ codevector) * @param [out] itheta angle between input and reference (-1 if noref) * @param [out] max_theta maximum value of itheta that could have been * @param [out] vk total number of pulses * @param [in] beta per-band activity masking beta param * @param [out] skip_diff distortion cost of skipping this block * (accumulated) * @param [in] robust make stream robust to error in the reference * @param [in] is_keyframe whether we're encoding a keyframe * @param [in] pli plane index * @param [in] adapt probability adaptation context * @param [in] qm QM with magnitude compensation * @param [in] qm_inv Inverse of QM with magnitude compensation * @return gain index of the quatized gain */ static int pvq_theta(od_coeff *out, const od_coeff *x0, const od_coeff *r0, int n, int q0, od_coeff *y, int *itheta, int *max_theta, int *vk, double beta, double *skip_diff, int robust, int is_keyframe, int pli, const od_adapt_ctx *adapt, const int16_t *qm, const int16_t *qm_inv) { od_val32 g; od_val32 gr; od_coeff y_tmp[MAXN]; int i; /* Number of pulses. */ int k; /* Companded gain of x and reference, normalized to q. */ od_val32 cg; od_val32 cgr; int icgr; int qg; /* Best RDO cost (D + lamdba*R) so far. */ double best_cost; /* Distortion (D) that corresponds to the best RDO cost. */ double best_dist; double dist; /* Sign of Householder reflection. */ int s; /* Dimension on which Householder reflects. */ int m; od_val32 theta; double corr; int best_k; od_val32 best_qtheta; od_val32 gain_offset; int noref; double lambda; double skip_dist; int cfl_enabled; int skip; double gain_weight; od_val16 x16[MAXN]; od_val16 r16[MAXN]; int xshift; int rshift; lambda = OD_PVQ_LAMBDA; /* Give more weight to gain error when calculating the total distortion. */ gain_weight = 1.4; OD_ASSERT(n > 1); corr = 0; #if !defined(OD_FLOAT_PVQ) /* Shift needed to make x fit in 16 bits even after rotation. This shift value is not normative (it can be changed without breaking the bitstream) */ xshift = OD_MAXI(0, od_vector_log_mag(x0, n) - 15); /* Shift needed to make the reference fit in 15 bits, so that the Householder vector can fit in 16 bits. This shift value *is* normative, and has to match the decoder. */ rshift = OD_MAXI(0, od_vector_log_mag(r0, n) - 14); #else xshift = 0; rshift = 0; #endif for (i = 0; i < n; i++) { #if defined(OD_FLOAT_PVQ) /*This is slightly different from the original float PVQ code, where the qm was applied in the accumulation in od_pvq_compute_gain and the vectors were od_coeffs, not od_val16 (i.e. double).*/ x16[i] = x0[i]*(double)qm[i]*OD_QM_SCALE_1; r16[i] = r0[i]*(double)qm[i]*OD_QM_SCALE_1; #else x16[i] = OD_SHR_ROUND(x0[i]*qm[i], OD_QM_SHIFT + xshift); r16[i] = OD_SHR_ROUND(r0[i]*qm[i], OD_QM_SHIFT + rshift); #endif corr += OD_MULT16_16(x16[i], r16[i]); } cfl_enabled = is_keyframe && pli != 0 && !OD_DISABLE_CFL; cg = od_pvq_compute_gain(x16, n, q0, &g, beta, xshift); cgr = od_pvq_compute_gain(r16, n, q0, &gr, beta, rshift); if (cfl_enabled) cgr = OD_CGAIN_SCALE; /* gain_offset is meant to make sure one of the quantized gains has exactly the same gain as the reference. */ #if defined(OD_FLOAT_PVQ) icgr = (int)floor(.5 + cgr); #else icgr = OD_SHR_ROUND(cgr, OD_CGAIN_SHIFT); #endif gain_offset = cgr - OD_SHL(icgr, OD_CGAIN_SHIFT); /* Start search with null case: gain=0, no pulse. */ qg = 0; dist = gain_weight*cg*cg*OD_CGAIN_SCALE_2; best_dist = dist; best_cost = dist + lambda*od_pvq_rate(0, 0, -1, 0, adapt, NULL, 0, n, is_keyframe, pli); noref = 1; best_k = 0; *itheta = -1; *max_theta = 0; OD_CLEAR(y, n); best_qtheta = 0; m = 0; s = 1; corr = corr/(1e-100 + g*(double)gr/OD_SHL(1, xshift + rshift)); corr = OD_MAXF(OD_MINF(corr, 1.), -1.); if (is_keyframe) skip_dist = gain_weight*cg*cg*OD_CGAIN_SCALE_2; else { skip_dist = gain_weight*(cg - cgr)*(cg - cgr) + cgr*(double)cg*(2 - 2*corr); skip_dist *= OD_CGAIN_SCALE_2; } if (!is_keyframe) { /* noref, gain=0 isn't allowed, but skip is allowed. */ od_val32 scgr; scgr = OD_MAXF(0,gain_offset); if (icgr == 0) { best_dist = gain_weight*(cg - scgr)*(cg - scgr) + scgr*(double)cg*(2 - 2*corr); best_dist *= OD_CGAIN_SCALE_2; } best_cost = best_dist + lambda*od_pvq_rate(0, icgr, 0, 0, adapt, NULL, 0, n, is_keyframe, pli); best_qtheta = 0; *itheta = 0; *max_theta = 0; noref = 0; } if (n <= OD_MAX_PVQ_SIZE && !od_vector_is_null(r0, n) && corr > 0) { od_val16 xr[MAXN]; int gain_bound; gain_bound = OD_SHR(cg - gain_offset, OD_CGAIN_SHIFT); /* Perform theta search only if prediction is useful. */ theta = OD_ROUND32(OD_THETA_SCALE*acos(corr)); m = od_compute_householder(r16, n, gr, &s, rshift); od_apply_householder(xr, x16, r16, n); for (i = m; i < n - 1; i++) xr[i] = xr[i + 1]; /* Search for the best gain within a reasonable range. */ for (i = OD_MAXI(1, gain_bound - 1); i <= gain_bound + 1; i++) { int j; od_val32 qcg; int ts; /* Quantized companded gain */ qcg = OD_SHL(i, OD_CGAIN_SHIFT) + gain_offset; /* Set angular resolution (in ra) to match the encoded gain */ ts = od_pvq_compute_max_theta(qcg, beta); /* Search for the best angle within a reasonable range. */ for (j = OD_MAXI(0, (int)floor(.5 + theta*OD_THETA_SCALE_1*2/M_PI*ts) - 2); j <= OD_MINI(ts - 1, (int)ceil(theta*OD_THETA_SCALE_1*2/M_PI*ts)); j++) { double cos_dist; double cost; double dist_theta; double sin_prod; od_val32 qtheta; qtheta = od_pvq_compute_theta(j, ts); k = od_pvq_compute_k(qcg, j, qtheta, 0, n, beta, robust || is_keyframe); sin_prod = od_pvq_sin(theta)*OD_TRIG_SCALE_1*od_pvq_sin(qtheta)* OD_TRIG_SCALE_1; /* PVQ search, using a gain of qcg*cg*sin(theta)*sin(qtheta) since that's the factor by which cos_dist is multiplied to get the distortion metric. */ cos_dist = pvq_search_rdo_double(xr, n - 1, k, y_tmp, qcg*(double)cg*sin_prod*OD_CGAIN_SCALE_2); /* See Jmspeex' Journal of Dubious Theoretical Results. */ dist_theta = 2 - 2.*od_pvq_cos(theta - qtheta)*OD_TRIG_SCALE_1 + sin_prod*(2 - 2*cos_dist); dist = gain_weight*(qcg - cg)*(qcg - cg) + qcg*(double)cg*dist_theta; dist *= OD_CGAIN_SCALE_2; /* Do approximate RDO. */ cost = dist + lambda*od_pvq_rate(i, icgr, j, ts, adapt, y_tmp, k, n, is_keyframe, pli); if (cost < best_cost) { best_cost = cost; best_dist = dist; qg = i; best_k = k; best_qtheta = qtheta; *itheta = j; *max_theta = ts; noref = 0; OD_COPY(y, y_tmp, n - 1); } } } } /* Don't bother with no-reference version if there's a reasonable correlation. The only exception is luma on a keyframe because H/V prediction is unreliable. */ if (n <= OD_MAX_PVQ_SIZE && ((is_keyframe && pli == 0) || corr < .5 || cg < (od_val32)(OD_SHL(2, OD_CGAIN_SHIFT)))) { int gain_bound; gain_bound = OD_SHR(cg, OD_CGAIN_SHIFT); /* Search for the best gain (haven't determined reasonable range yet). */ for (i = OD_MAXI(1, gain_bound); i <= gain_bound + 1; i++) { double cos_dist; double cost; od_val32 qcg; qcg = OD_SHL(i, OD_CGAIN_SHIFT); k = od_pvq_compute_k(qcg, -1, -1, 1, n, beta, robust || is_keyframe); cos_dist = pvq_search_rdo_double(x16, n, k, y_tmp, qcg*(double)cg*OD_CGAIN_SCALE_2); /* See Jmspeex' Journal of Dubious Theoretical Results. */ dist = gain_weight*(qcg - cg)*(qcg - cg) + qcg*(double)cg*(2 - 2*cos_dist); dist *= OD_CGAIN_SCALE_2; /* Do approximate RDO. */ cost = dist + lambda*od_pvq_rate(i, 0, -1, 0, adapt, y_tmp, k, n, is_keyframe, pli); if (cost <= best_cost) { best_cost = cost; best_dist = dist; qg = i; noref = 1; best_k = k; *itheta = -1; *max_theta = 0; OD_COPY(y, y_tmp, n); } } } k = best_k; theta = best_qtheta; skip = 0; if (noref) { if (qg == 0) skip = OD_PVQ_SKIP_ZERO; } else { if (!is_keyframe && qg == 0) { skip = (icgr ? OD_PVQ_SKIP_ZERO : OD_PVQ_SKIP_COPY); } if (qg == icgr && *itheta == 0 && !cfl_enabled) skip = OD_PVQ_SKIP_COPY; } /* Synthesize like the decoder would. */ if (skip) { if (skip == OD_PVQ_SKIP_COPY) OD_COPY(out, r0, n); else OD_CLEAR(out, n); } else { if (noref) gain_offset = 0; g = od_gain_expand(OD_SHL(qg, OD_CGAIN_SHIFT) + gain_offset, q0, beta); od_pvq_synthesis_partial(out, y, r16, n, noref, g, theta, m, s, qm_inv); } *vk = k; *skip_diff += skip_dist - best_dist; /* Encode gain differently depending on whether we use prediction or not. Special encoding on inter frames where qg=0 is allowed for noref=0 but not noref=1.*/ if (is_keyframe) return noref ? qg : neg_interleave(qg, icgr); else return noref ? qg - 1 : neg_interleave(qg + 1, icgr + 1); }