static void merge_plane(BYTE* srcp, const BYTE* otherp, int src_pitch, int other_pitch, int src_width, int src_height, float weight, IScriptEnvironment *env) { if ((weight>0.4961f) && (weight<0.5039f)) { //average of two planes if ((env->GetCPUFlags() & CPUF_SSE2) && IsPtrAligned(srcp, 16) && IsPtrAligned(otherp, 16)) { average_plane_sse2(srcp, otherp, src_pitch, other_pitch, src_width, src_height); } else #ifdef X86_32 if (env->GetCPUFlags() & CPUF_INTEGER_SSE) { average_plane_isse(srcp, otherp, src_pitch, other_pitch, src_width, src_height); } else #endif { average_plane_c(srcp, otherp, src_pitch, other_pitch, src_width, src_height); } } else { int iweight = (int)(weight*32767.0f); int invweight = 32767-iweight; //real merge if ((env->GetCPUFlags() & CPUF_SSE2) && IsPtrAligned(srcp, 16) && IsPtrAligned(otherp, 16)) { weighted_merge_planar_sse2(srcp, otherp, src_pitch, other_pitch, src_width, src_height, iweight, invweight); } else #ifdef X86_32 if (env->GetCPUFlags() & CPUF_MMX) { weighted_merge_planar_mmx(srcp, otherp, src_pitch, other_pitch, src_width, src_height, iweight, invweight); } else #endif { iweight = (int)(weight*65535.0f); invweight = 65535-iweight; weighted_merge_planar_c(srcp, otherp, src_pitch, other_pitch, src_width, src_height, iweight, invweight); } } }
// Fills len consecutive 32-bit pixels starting at dst with the value rgba.
//
// dst must be 4-byte aligned (asserted). Runs longer than 15 pixels take the
// SSE2 path: a few leading single-pixel writes, then 128-bit stores, then a
// scalar tail. Shorter runs are written one pixel at a time.
extern "C" void Store_SSE2(UINT32* dst, UINT32 rgba, unsigned len)
{
	OP_ASSERT(IsPtrAligned(dst, 4));

	// Short runs do not repay the setup cost of the unrolled 128-bit stores.
	// The threshold is also high enough that the leading single-pixel writes
	// below need no separate length check.
	if (len > 15)
	{
		// Aligned memory accesses can be significantly faster, so write
		// single pixels until an aligned address is reached.
		// NOTE(review): assumes PixelsUntilAligned() yields the pixel count
		// up to the next 16-byte boundary (0..3) - confirm against its
		// definition.
		const int lead = PixelsUntilAligned(dst);
		if (lead)
		{
			len -= lead;
			// Only counts of 1, 2 or 3 produce leading writes.
			if (lead >= 1 && lead <= 3)
			{
				for (int i = 0; i < lead; ++i)
					*dst++ = rgba;
			}
		}

		// Broadcast the colour into all four 32-bit lanes of one register;
		// both store loops below reuse it.
		const __m128i fill = _mm_shuffle_epi32(_mm_cvtsi32_si128(rgba), _MM_SHUFFLE(0, 0, 0, 0));

		// Unrolled loop: 8 stores of 4 pixels each, i.e. 32 pixels per pass.
		for (unsigned int blocks = len / 32; blocks != 0; --blocks)
		{
			__m128i* const out = reinterpret_cast<__m128i*>(dst);
			_mm_store_si128(out + 0, fill);
			_mm_store_si128(out + 1, fill);
			_mm_store_si128(out + 2, fill);
			_mm_store_si128(out + 3, fill);
			_mm_store_si128(out + 4, fill);
			_mm_store_si128(out + 5, fill);
			_mm_store_si128(out + 6, fill);
			_mm_store_si128(out + 7, fill);
			dst += 32;
		}

		// Whatever is left of the last partial block, four pixels at a time.
		len &= 31;
		for (unsigned int quads = len / 4; quads != 0; --quads)
		{
			_mm_store_si128(reinterpret_cast<__m128i*>(dst), fill);
			dst += 4;
		}

		// The final 0-3 pixels are left for the scalar loop below.
		len &= 3;
	}

	// One pixel at a time.
	for (; len != 0; --len)
		*dst++ = rgba;
}
PVideoFrame __stdcall MergeLuma::GetFrame(int n, IScriptEnvironment* env) { PVideoFrame src = child->GetFrame(n, env); if (weight<0.0039f) return src; PVideoFrame luma = clip->GetFrame(n, env); if (vi.IsYUY2()) { env->MakeWritable(&src); BYTE* srcp = src->GetWritePtr(); const BYTE* lumap = luma->GetReadPtr(); int isrc_pitch = src->GetPitch(); int iluma_pitch = luma->GetPitch(); int h = src->GetHeight(); int w = src->GetRowSize(); if (weight<0.9961f) { if ((env->GetCPUFlags() & CPUF_SSE2) && IsPtrAligned(srcp, 16) && IsPtrAligned(lumap, 16)) { weighted_merge_luma_yuy2_sse2(srcp, lumap, isrc_pitch, iluma_pitch, w, h, (int)(weight*32768.0f), 32768-(int)(weight*32768.0f)); } else #ifdef X86_32 if (env->GetCPUFlags() & CPUF_MMX) { weighted_merge_luma_yuy2_mmx(srcp, lumap, isrc_pitch, iluma_pitch, w, h, (int)(weight*32768.0f), 32768-(int)(weight*32768.0f)); } else #endif { weighted_merge_luma_yuy2_c(srcp, lumap, isrc_pitch, iluma_pitch, w, h, (int)(weight*32768.0f), 32768-(int)(weight*32768.0f)); } } else { if ((env->GetCPUFlags() & CPUF_SSE2) && IsPtrAligned(srcp, 16) && IsPtrAligned(lumap, 16)) { replace_luma_yuy2_sse2(srcp,lumap,isrc_pitch,iluma_pitch,w,h); } else #ifdef X86_32 if (env->GetCPUFlags() & CPUF_MMX) { replace_luma_yuy2_mmx(srcp,lumap,isrc_pitch,iluma_pitch,w,h); } else #endif { replace_luma_yuy2_c(srcp,lumap,isrc_pitch,iluma_pitch,w,h); } } return src; } // Planar if (weight>0.9961f) { const VideoInfo& vi2 = clip->GetVideoInfo(); if (luma->IsWritable() && vi.IsSameColorspace(vi2)) { if (luma->GetRowSize(PLANAR_U)) { luma->GetWritePtr(PLANAR_Y); //Must be requested BUT only if we actually do something env->BitBlt(luma->GetWritePtr(PLANAR_U),luma->GetPitch(PLANAR_U),src->GetReadPtr(PLANAR_U),src->GetPitch(PLANAR_U),src->GetRowSize(PLANAR_U),src->GetHeight(PLANAR_U)); env->BitBlt(luma->GetWritePtr(PLANAR_V),luma->GetPitch(PLANAR_V),src->GetReadPtr(PLANAR_V),src->GetPitch(PLANAR_V),src->GetRowSize(PLANAR_V),src->GetHeight(PLANAR_V)); } return luma; } else { 
// avoid the cost of 2 chroma blits PVideoFrame dst = env->NewVideoFrame(vi); env->BitBlt(dst->GetWritePtr(PLANAR_Y),dst->GetPitch(PLANAR_Y),luma->GetReadPtr(PLANAR_Y),luma->GetPitch(PLANAR_Y),luma->GetRowSize(PLANAR_Y),luma->GetHeight(PLANAR_Y)); if (src->GetRowSize(PLANAR_U) && dst->GetRowSize(PLANAR_U)) { env->BitBlt(dst->GetWritePtr(PLANAR_U),dst->GetPitch(PLANAR_U),src->GetReadPtr(PLANAR_U),src->GetPitch(PLANAR_U),src->GetRowSize(PLANAR_U),src->GetHeight(PLANAR_U)); env->BitBlt(dst->GetWritePtr(PLANAR_V),dst->GetPitch(PLANAR_V),src->GetReadPtr(PLANAR_V),src->GetPitch(PLANAR_V),src->GetRowSize(PLANAR_V),src->GetHeight(PLANAR_V)); } return dst; } } else { // weight <= 0.9961f env->MakeWritable(&src); BYTE* srcpY = (BYTE*)src->GetWritePtr(PLANAR_Y); BYTE* lumapY = (BYTE*)luma->GetReadPtr(PLANAR_Y); int src_pitch = src->GetPitch(PLANAR_Y); int luma_pitch = luma->GetPitch(PLANAR_Y); int src_width = src->GetRowSize(PLANAR_Y); int src_height = src->GetHeight(PLANAR_Y); merge_plane(srcpY, lumapY, src_pitch, luma_pitch, src_width, src_height, weight, env); } return src; }
// Produces frame n with the chroma of `clip` merged into the frame of `child`
// according to `weight`.
//
//   weight <  0.0039 : the child frame is returned untouched and the chroma
//                      clip is not even fetched.
//   weight <  0.9961 : chroma is blended by the weighted merge routines.
//   otherwise        : chroma is replaced outright.
PVideoFrame __stdcall MergeChroma::GetFrame(int n, IScriptEnvironment* env)
{
  PVideoFrame src = child->GetFrame(n, env);

  if (weight < 0.0039f)
    return src;

  PVideoFrame chroma = clip->GetFrame(n, env);

  // Extent of the default plane of src; only the YUY2 paths use these.
  const int height = src->GetHeight();
  const int row_size = src->GetRowSize();

  const bool blend = weight < 0.9961f;
  const bool yuy2 = vi.IsYUY2();

  // --- Weighted blend, YUY2 ------------------------------------------------
  if (blend && yuy2) {
    env->MakeWritable(&src);
    BYTE* const dst_ptr = src->GetWritePtr();
    const BYTE* const chroma_ptr = chroma->GetReadPtr();
    const int dst_pitch = src->GetPitch();
    const int chroma_pitch = chroma->GetPitch();

    // The weight is passed as a fraction of 32768.
    const int chroma_weight = static_cast<int>(weight * 32768.0f);
    const int src_weight = 32768 - chroma_weight;

    if ((env->GetCPUFlags() & CPUF_SSE2) && IsPtrAligned(dst_ptr, 16) && IsPtrAligned(chroma_ptr, 16)) {
      weighted_merge_chroma_yuy2_sse2(dst_ptr, chroma_ptr, dst_pitch, chroma_pitch, row_size, height, chroma_weight, src_weight);
    }
#ifdef X86_32
    else if (env->GetCPUFlags() & CPUF_MMX) {
      weighted_merge_chroma_yuy2_mmx(dst_ptr, chroma_ptr, dst_pitch, chroma_pitch, row_size, height, chroma_weight, src_weight);
    }
#endif
    else {
      weighted_merge_chroma_yuy2_c(dst_ptr, chroma_ptr, dst_pitch, chroma_pitch, row_size, height, chroma_weight, src_weight);
    }
    return src;
  }

  // --- Weighted blend, planar ----------------------------------------------
  if (blend) {
    env->MakeWritable(&src);
    src->GetWritePtr(PLANAR_Y); // Must be requested

    BYTE* const u_dst = src->GetWritePtr(PLANAR_U);
    const BYTE* const u_chroma = chroma->GetReadPtr(PLANAR_U);
    BYTE* const v_dst = src->GetWritePtr(PLANAR_V);
    const BYTE* const v_chroma = chroma->GetReadPtr(PLANAR_V);

    // The U pitches and U height are used for both chroma planes.
    const int uv_dst_pitch = src->GetPitch(PLANAR_U);
    const int uv_chroma_pitch = chroma->GetPitch(PLANAR_U);
    const int u_row_size = src->GetRowSize(PLANAR_U_ALIGNED);
    const int v_row_size = src->GetRowSize(PLANAR_V_ALIGNED);
    const int uv_height = src->GetHeight(PLANAR_U);

    merge_plane(u_dst, u_chroma, uv_dst_pitch, uv_chroma_pitch, u_row_size, uv_height, weight, env);
    merge_plane(v_dst, v_chroma, uv_dst_pitch, uv_chroma_pitch, v_row_size, uv_height, weight, env);
    return src;
  }

  // --- Full replacement, YUY2 ----------------------------------------------
  if (yuy2) {
    // Keeping the chroma of `chroma` and taking the luma of `src` is done by
    // replacing the luma inside a writable copy of the chroma frame.
    const BYTE* const luma_ptr = src->GetReadPtr();
    env->MakeWritable(&chroma);
    BYTE* const out_ptr = chroma->GetWritePtr();
    const int luma_pitch = src->GetPitch();
    const int out_pitch = chroma->GetPitch();

    if ((env->GetCPUFlags() & CPUF_SSE2) && IsPtrAligned(out_ptr, 16) && IsPtrAligned(luma_ptr, 16)) {
      replace_luma_yuy2_sse2(out_ptr, luma_ptr, out_pitch, luma_pitch, row_size, height); // Just swap luma/chroma
    }
#ifdef X86_32
    else if (env->GetCPUFlags() & CPUF_MMX) {
      replace_luma_yuy2_mmx(out_ptr, luma_ptr, out_pitch, luma_pitch, row_size, height); // Just swap luma/chroma
    }
#endif
    else {
      replace_luma_yuy2_c(out_ptr, luma_ptr, out_pitch, luma_pitch, row_size, height); // Just swap luma/chroma
    }
    return chroma;
  }

  // --- Full replacement, planar --------------------------------------------
  if (src->IsWritable()) {
    // src can be modified in place: overwrite only its two chroma planes.
    src->GetWritePtr(PLANAR_Y); // Must be requested
    env->BitBlt(src->GetWritePtr(PLANAR_U), src->GetPitch(PLANAR_U),
                chroma->GetReadPtr(PLANAR_U), chroma->GetPitch(PLANAR_U),
                chroma->GetRowSize(PLANAR_U), chroma->GetHeight(PLANAR_U));
    env->BitBlt(src->GetWritePtr(PLANAR_V), src->GetPitch(PLANAR_V),
                chroma->GetReadPtr(PLANAR_V), chroma->GetPitch(PLANAR_V),
                chroma->GetRowSize(PLANAR_V), chroma->GetHeight(PLANAR_V));
    return src;
  }

  // src is not writable: assemble a fresh frame with Y from src and U/V from
  // the chroma frame.
  PVideoFrame dst = env->NewVideoFrame(vi);
  env->BitBlt(dst->GetWritePtr(PLANAR_Y), dst->GetPitch(PLANAR_Y),
              src->GetReadPtr(PLANAR_Y), src->GetPitch(PLANAR_Y),
              src->GetRowSize(PLANAR_Y), src->GetHeight(PLANAR_Y));
  env->BitBlt(dst->GetWritePtr(PLANAR_U), dst->GetPitch(PLANAR_U),
              chroma->GetReadPtr(PLANAR_U), chroma->GetPitch(PLANAR_U),
              chroma->GetRowSize(PLANAR_U), chroma->GetHeight(PLANAR_U));
  env->BitBlt(dst->GetWritePtr(PLANAR_V), dst->GetPitch(PLANAR_V),
              chroma->GetReadPtr(PLANAR_V), chroma->GetPitch(PLANAR_V),
              chroma->GetRowSize(PLANAR_V), chroma->GetHeight(PLANAR_V));
  return dst;
}