void BenchmarkScale(unsigned int in_w, unsigned int in_h, unsigned int out_w, unsigned int out_h) { std::mt19937 rng(12345); bool use_ssse3 = (CPUFeatures::HasMMX() && CPUFeatures::HasSSE() && CPUFeatures::HasSSE2() && CPUFeatures::HasSSE3() && CPUFeatures::HasSSSE3()); // the queue needs to use enough memory to make sure that the CPU cache is flushed unsigned int pixels = std::max(in_w * in_h, out_w * out_h); unsigned int queue_size = 1 + 20000000 / pixels; unsigned int run_size = queue_size * 20; // create queue std::vector<std::unique_ptr<ImageGeneric> > queue_in(queue_size); std::vector<std::unique_ptr<ImageGeneric> > queue_out(queue_size); for(unsigned int i = 0; i < queue_size; ++i) { queue_in[i] = NewImageBGRA(in_w, in_h, rng); queue_out[i] = NewImageBGRA(out_w, out_h, rng); } // run test unsigned int time_fallback = 0, time_ssse3 = 0; { int64_t t1 = hrt_time_micro(); for(unsigned int i = 0; i < run_size; ++i) { unsigned int ii = i % queue_size; Scale_BGRA_Fallback(in_w, in_h, queue_in[ii]->m_data[0], queue_in[ii]->m_stride[0], out_w, out_h, queue_out[ii]->m_data[0], queue_out[ii]->m_stride[0]); } int64_t t2 = hrt_time_micro(); time_fallback = (t2 - t1) / run_size; } if(use_ssse3) { int64_t t1 = hrt_time_micro(); for(unsigned int i = 0; i < run_size; ++i) { unsigned int ii = i % queue_size; Scale_BGRA_SSSE3(in_w, in_h, queue_in[ii]->m_data[0], queue_in[ii]->m_stride[0], out_w, out_h, queue_out[ii]->m_data[0], queue_out[ii]->m_stride[0]); } int64_t t2 = hrt_time_micro(); time_ssse3 = (t2 - t1) / run_size; } // print result QString in_size = QString("%1x%2").arg(in_w).arg(in_h); QString out_size = QString("%1x%2").arg(out_w).arg(out_h); Logger::LogInfo("[BenchmarkScale] " + Logger::tr("BGRA %1 to BGRA %2 | Fallback %3 ms | SSSE3 %4 ms | %5%") .arg(in_size, 9).arg(out_size, 9).arg(time_fallback, 6).arg(time_ssse3, 6).arg(100 * time_ssse3 / time_fallback, 3)); }
void BenchmarkScale(unsigned int in_w, unsigned int in_h, unsigned int out_w, unsigned int out_h) { std::mt19937 rng(12345); #if SSR_USE_X86_ASM bool use_ssse3 = (CPUFeatures::HasMMX() && CPUFeatures::HasSSE() && CPUFeatures::HasSSE2() && CPUFeatures::HasSSE3() && CPUFeatures::HasSSSE3()); #endif // the queue needs to use enough memory to make sure that the CPU cache is flushed unsigned int pixels = std::max(in_w * in_h, out_w * out_h); unsigned int queue_size = 1 + 20000000 / pixels; unsigned int run_size = queue_size * 20; // create queue std::vector<std::unique_ptr<ImageGeneric> > queue_in(queue_size); std::vector<std::unique_ptr<ImageGeneric> > queue_out(queue_size); for(unsigned int i = 0; i < queue_size; ++i) { queue_in[i] = NewImageBGRA(in_w, in_h, rng); queue_out[i] = NewImageBGRA(out_w, out_h, rng); } // run test unsigned int time_swscale = 0, time_fallback = 0, time_ssse3 = 0; { SwsContext *sws = sws_getCachedContext(NULL, in_w, in_h, AV_PIX_FMT_BGRA, out_w, out_h, AV_PIX_FMT_BGRA, SWS_BILINEAR, NULL, NULL, NULL); if(sws == NULL) { Logger::LogError("[BenchmarkScale] " + Logger::tr("Error: Can't get swscale context!", "Don't translate 'swscale'")); throw LibavException(); } sws_setColorspaceDetails(sws, sws_getCoefficients(SWS_CS_ITU709), 0, sws_getCoefficients(SWS_CS_DEFAULT), 0, 0, 1 << 16, 1 << 16); int64_t t1 = hrt_time_micro(); for(unsigned int i = 0; i < run_size / 2; ++i) { unsigned int ii = i % queue_size; sws_scale(sws, queue_in[ii]->m_data.data(), queue_in[ii]->m_stride.data(), 0, in_h, queue_out[ii]->m_data.data(), queue_out[ii]->m_stride.data()); } int64_t t2 = hrt_time_micro(); time_swscale = (t2 - t1) / (run_size / 2); } { int64_t t1 = hrt_time_micro(); for(unsigned int i = 0; i < run_size; ++i) { unsigned int ii = i % queue_size; Scale_BGRA_Fallback(in_w, in_h, queue_in[ii]->m_data[0], queue_in[ii]->m_stride[0], out_w, out_h, queue_out[ii]->m_data[0], queue_out[ii]->m_stride[0]); } int64_t t2 = hrt_time_micro(); time_fallback = (t2 - t1) / run_size; } #if SSR_USE_X86_ASM if(use_ssse3) { int64_t t1 = hrt_time_micro(); for(unsigned int i = 0; i < run_size; ++i) { unsigned int ii = i % queue_size; Scale_BGRA_SSSE3(in_w, in_h, queue_in[ii]->m_data[0], queue_in[ii]->m_stride[0], out_w, out_h, queue_out[ii]->m_data[0], queue_out[ii]->m_stride[0]); } int64_t t2 = hrt_time_micro(); time_ssse3 = (t2 - t1) / run_size; } #endif // print result QString in_size = QString("%1x%2").arg(in_w).arg(in_h); QString out_size = QString("%1x%2").arg(out_w).arg(out_h); Logger::LogInfo("[BenchmarkScale] " + Logger::tr("BGRA %1 to BGRA %2 | SWScale %3 us | Fallback %4 us (%5%) | SSSE3 %6 us (%7%)") .arg(in_size, 9).arg(out_size, 9) .arg(time_swscale, 6) .arg(time_fallback, 6).arg(100 * time_fallback / time_swscale, 3) .arg(time_ssse3, 6).arg(100 * time_ssse3 / time_fallback, 3)); }