// MSA implementation of the coefficient histogram collection.
// For each block index in [start_block, end_block), runs VP8FTransform() on
// 'ref' and 'pred' at offset VP8DspScan[index], turns each of the 16 output
// coefficients into a bin index (magnitude, shifted right by 3, capped at
// MAX_COEFF_THRESH) and counts it. The accumulated counts are passed to
// VP8SetHistogramData() together with 'histo'.
static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
                             int start_block, int end_block,
                             VP8Histogram* const histo) {
  int distribution[MAX_COEFF_THRESH + 1] = { 0 };
  // Loop-invariant vector constants.
  const v8i16 zero = { 0 };
  const v8i16 max_coeff_thr = __msa_ldi_h(MAX_COEFF_THRESH);
  int blk;

  for (blk = start_block; blk < end_block; ++blk) {
    int16_t coeffs[16];
    v8i16 lo, hi;
    int i;

    VP8FTransform(ref + VP8DspScan[blk], pred + VP8DspScan[blk], coeffs);

    // Load the 16 coefficients as two vectors of eight 16-bit lanes.
    LD_SH2(&coeffs[0], 8, lo, hi);
    // add_a against zero: used here to obtain the per-lane magnitude.
    lo = __msa_add_a_h(lo, zero);
    hi = __msa_add_a_h(hi, zero);
    // bin = min(magnitude >> 3, MAX_COEFF_THRESH)
    SRAI_H2_SH(lo, hi, 3);
    lo = __msa_min_s_h(lo, max_coeff_thr);
    hi = __msa_min_s_h(hi, max_coeff_thr);
    // Write the bin indices back over the coefficients.
    ST_SH2(lo, hi, &coeffs[0], 8);

    // Count one hit per coefficient.
    for (i = 0; i < 16; ++i) {
      distribution[coeffs[i]] += 1;
    }
  }
  VP8SetHistogramData(distribution, histo);
}
// Transforms, quantizes and reconstructs the luma samples of the current
// macroblock using the intra-16x16 prediction selected by 'mode'.
//   it      - encoder iterator; supplies the source (yuv_in_), the predictions
//             (yuv_p_), the segment of the current macroblock and, on the
//             trellis path, the top/left non-zero contexts (which get updated).
//   rd      - receives the quantized levels (y_dc_levels, y_ac_levels).
//   yuv_out - destination of the reconstructed samples.
// Returns a bit field: bit n (0..15) is set from the quantizer result of
// block n, and the quantizer result of the DC block is shifted to bit 24.
static int ReconstructIntra16(VP8EncIterator* const it,
                              VP8ModeScore* const rd,
                              uint8_t* const yuv_out,
                              int mode) {
  VP8Encoder* const enc = it->enc_;
  const uint8_t* const pred = it->yuv_p_ + VP8I16ModeOffsets[mode];
  const uint8_t* const input = it->yuv_in_ + Y_OFF;
  VP8SegmentInfo* const seg = &enc->dqm_[it->mb_->segment_];
  int16_t coeffs[16][16], dc_coeffs[16];
  int nz_bits = 0;
  int blk;

  // Forward transform of the 16 sub-blocks (residual of input vs prediction).
  for (blk = 0; blk < 16; ++blk) {
    VP8FTransform(input + VP8Scan[blk], pred + VP8Scan[blk], coeffs[blk]);
  }

  // Second-level transform over the blocks, then quantize its output with the
  // y2_ matrix. NOTE(review): presumably this gathers the DC of each block.
  VP8FTransformWHT(coeffs[0], dc_coeffs);
  nz_bits |= VP8EncQuantizeBlock(dc_coeffs, rd->y_dc_levels, 0, &seg->y2_) << 24;

  if (!(DO_TRELLIS_I16 && it->do_trellis_)) {
    // Plain quantization of each block with the y1_ matrix.
    for (blk = 0; blk < 16; ++blk) {
      nz_bits |= VP8EncQuantizeBlock(coeffs[blk], rd->y_ac_levels[blk],
                                     1, &seg->y1_) << blk;
    }
  } else {
    // Trellis quantization, visiting blocks in raster order (4 per row) and
    // threading the top/left non-zero flags through as context.
    VP8IteratorNzToBytes(it);
    for (blk = 0; blk < 16; ++blk) {
      const int col = blk & 3;
      const int row = blk >> 2;
      const int ctx = it->top_nz_[col] + it->left_nz_[row];
      const int non_zero =
          TrellisQuantizeBlock(it, coeffs[blk], rd->y_ac_levels[blk], ctx, 0,
                               &seg->y1_, seg->lambda_trellis_i16_);
      it->top_nz_[col] = it->left_nz_[row] = non_zero;
      nz_bits |= non_zero << blk;
    }
  }

  // Transform back
  VP8ITransformWHT(dc_coeffs, coeffs[0]);
  // Steps by two with a trailing argument of 1.
  // NOTE(review): presumably each call reconstructs a pair of blocks.
  for (blk = 0; blk < 16; blk += 2) {
    VP8ITransform(pred + VP8Scan[blk], coeffs[blk], yuv_out + VP8Scan[blk], 1);
  }

  return nz_bits;
}
static int ReconstructIntra4(VP8EncIterator* const it, int16_t levels[16], const uint8_t* const src, uint8_t* const yuv_out, int mode) { const VP8Encoder* const enc = it->enc_; const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode]; const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_]; int nz = 0; int16_t tmp[16]; VP8FTransform(src, ref, tmp); if (DO_TRELLIS_I4 && it->do_trellis_) { const int x = it->i4_ & 3, y = it->i4_ >> 2; const int ctx = it->top_nz_[x] + it->left_nz_[y]; nz = TrellisQuantizeBlock(it, tmp, levels, ctx, 3, &dqm->y1_, dqm->lambda_trellis_i4_); } else {
// SSE2 implementation of the coefficient histogram collection.
// For each block index j in [start_block, end_block), runs VP8FTransform() on
// 'ref' and 'pred' at offset VP8DspScan[j], maps each of the 16 output
// coefficients to bin = min(abs(coeff) >> 3, MAX_COEFF_THRESH) and counts it
// in a local distribution. The distribution is finally passed to
// VP8SetHistogramData() together with 'histo'.
static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
                             int start_block, int end_block,
                             VP8Histogram* const histo) {
  const __m128i max_coeff_thresh = _mm_set1_epi16(MAX_COEFF_THRESH);
  int j;
  int distribution[MAX_COEFF_THRESH + 1] = { 0 };
  for (j = start_block; j < end_block; ++j) {
    int16_t out[16];
    int k;

    VP8FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);

    // Convert coefficients to bin (within out[]).
    {
      // Load.
      const __m128i out0 = _mm_loadu_si128((__m128i*)&out[0]);
      const __m128i out1 = _mm_loadu_si128((__m128i*)&out[8]);
      // sign(out) = out >> 15  (0x0000 if positive, 0xffff if negative)
      const __m128i sign0 = _mm_srai_epi16(out0, 15);
      const __m128i sign1 = _mm_srai_epi16(out1, 15);
      // abs(out) = (out ^ sign) - sign
      const __m128i xor0 = _mm_xor_si128(out0, sign0);
      const __m128i xor1 = _mm_xor_si128(out1, sign1);
      const __m128i abs0 = _mm_sub_epi16(xor0, sign0);
      const __m128i abs1 = _mm_sub_epi16(xor1, sign1);
      // v = abs(out) >> 3
      const __m128i v0 = _mm_srai_epi16(abs0, 3);
      const __m128i v1 = _mm_srai_epi16(abs1, 3);
      // bin = min(v, MAX_COEFF_THRESH)
      const __m128i bin0 = _mm_min_epi16(v0, max_coeff_thresh);
      const __m128i bin1 = _mm_min_epi16(v1, max_coeff_thresh);
      // Store.
      _mm_storeu_si128((__m128i*)&out[0], bin0);
      _mm_storeu_si128((__m128i*)&out[8], bin1);
    }

    // Use the bins (now stored in out[]) to update the distribution.
    for (k = 0; k < 16; ++k) {
      ++distribution[out[k]];
    }
  }
  // Lossy (VP8) helper: the VP8L-prefixed name used previously does not match
  // the helper called by the MSA implementation of this same function.
  VP8SetHistogramData(distribution, histo);
}
// SSE2 histogram collection returning an alpha value.
// For each block index in [start_block, end_block), runs VP8FTransform() on
// 'ref' and 'pred' at offset VP8DspScan[index], maps each of the 16 output
// coefficients to bin = min(abs(coeff) >> 2, MAX_COEFF_THRESH) and counts it.
// Returns VP8GetAlpha() evaluated on the resulting counts.
static int CollectHistogramSSE2(const uint8_t* ref, const uint8_t* pred,
                                int start_block, int end_block) {
  const __m128i bin_limit = _mm_set1_epi16(MAX_COEFF_THRESH);
  int bins[MAX_COEFF_THRESH + 1] = { 0 };
  int16_t coeffs[16];
  int blk;

  for (blk = start_block; blk < end_block; ++blk) {
    int half, i;

    VP8FTransform(ref + VP8DspScan[blk], pred + VP8DspScan[blk], coeffs);

    // Turn the coefficients into bin indices in place, eight lanes at a time.
    for (half = 0; half < 16; half += 8) {
      __m128i* const lanes = (__m128i*)&coeffs[half];
      const __m128i raw = _mm_loadu_si128(lanes);
      // sign = raw >> 15  (0x0000 if positive, 0xffff if negative)
      const __m128i sign = _mm_srai_epi16(raw, 15);
      // abs(raw) = (raw ^ sign) - sign
      const __m128i mag = _mm_sub_epi16(_mm_xor_si128(raw, sign), sign);
      // bin = min(abs(raw) >> 2, MAX_COEFF_THRESH)
      const __m128i bin = _mm_min_epi16(_mm_srai_epi16(mag, 2), bin_limit);
      _mm_storeu_si128(lanes, bin);
    }

    // Count one hit per coefficient.
    for (i = 0; i < 16; ++i) {
      bins[coeffs[i]] += 1;
    }
  }
  return VP8GetAlpha(bins);
}