static int make_frame_planar_yuv
(
    lw_video_output_handler_t *vohp,
    int                        height,
    AVFrame                   *av_frame,
    PVideoFrame               &as_frame
)
{
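    /* Wrap the destination AviSynth planar YUV frame as an as_picture_t so the converter can write into its planes directly. */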
    as_picture_t as_picture = { { { NULL } } };
    as_assign_planar_yuv( as_frame, &as_picture );
    return convert_av_pixel_format( vohp->scaler.sws_ctx, height, av_frame, &as_picture );
}
static int make_frame_packed_rgb
(
    lw_video_output_handler_t *vohp,
    int                        height,
    AVFrame                   *av_frame,
    PVideoFrame               &as_frame
)
{
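    /* AviSynth packed RGB frames are stored bottom-up, so start at the last row and use a negative pitch
     * to write the converted image flipped into place. */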
    as_picture_t as_picture = { { { NULL } } };
    as_picture.data    [0] = as_frame->GetWritePtr() + as_frame->GetPitch() * (as_frame->GetHeight() - 1);
    as_picture.linesize[0] = -as_frame->GetPitch();
    return convert_av_pixel_format( vohp->scaler.sws_ctx, height, av_frame, &as_picture );
}
VSFrameRef *make_frame
(
    lw_video_output_handler_t *vohp,
    AVCodecContext            *ctx,
    AVFrame                   *av_frame
)
{
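    /* Hand back the decoder's own frame buffer when direct rendering is possible;
     * otherwise convert the decoded frame and copy it into a newly allocated VapourSynth frame. */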
    vs_video_output_handler_t *vs_vohp = (vs_video_output_handler_t *)vohp->private_handler;
    VSFrameContext *frame_ctx = vs_vohp->frame_ctx;
    VSCore         *core      = vs_vohp->core;
    const VSAPI    *vsapi     = vs_vohp->vsapi;
    if( vs_vohp->direct_rendering && !vohp->scaler.enabled && av_frame->opaque )
    {
        /* Render from the decoder directly. */
        vs_video_buffer_handler_t *vs_vbhp = (vs_video_buffer_handler_t *)av_frame->opaque;
        return vs_vbhp ? (VSFrameRef *)vs_vbhp->vsapi->cloneFrameRef( vs_vbhp->vs_frame_buffer ) : NULL;
    }
    if( !vs_vohp->make_frame )
        return NULL;
    /* Convert pixel format if needed. We don't change the presentation resolution. */
    enum AVPixelFormat *input_pixel_format = (enum AVPixelFormat *)&av_frame->format;
    int yuv_range = avoid_yuv_scale_conversion( input_pixel_format );
    lw_video_scaler_handler_t *vshp = &vohp->scaler;
    if( !vshp->sws_ctx
     || vshp->input_width        != ctx->width
     || vshp->input_height       != ctx->height
     || vshp->input_pixel_format != *input_pixel_format
     || vshp->input_colorspace   != ctx->colorspace
     || vshp->input_yuv_range    != yuv_range )
    {
        /* Update scaler. */
        vshp->sws_ctx = update_scaler_configuration( vshp->sws_ctx, vshp->flags,
                                                     ctx->width, ctx->height,
                                                     *input_pixel_format, vshp->output_pixel_format,
                                                     ctx->colorspace, yuv_range );
        if( !vshp->sws_ctx )
        {
            if( frame_ctx )
                vsapi->setFilterError( "lsmas: failed to update scaler settings.", frame_ctx );
            return NULL;
        }
        vshp->input_width        = ctx->width;
        vshp->input_height       = ctx->height;
        vshp->input_pixel_format = *input_pixel_format;
        vshp->input_colorspace   = ctx->colorspace;
        vshp->input_yuv_range    = yuv_range;
    }
    /* Make video frame. */
    AVPicture av_picture;
    int ret = convert_av_pixel_format( vshp, ctx->width, ctx->height, av_frame, &av_picture );
    if( ret < 0 )
    {
        if( frame_ctx )
            vsapi->setFilterError( "lsmas: failed to av_image_alloc.", frame_ctx );
        return NULL;
    }
    VSFrameRef *vs_frame = new_output_video_frame( vohp, ctx->width, ctx->height, *input_pixel_format, frame_ctx, core, vsapi );
    if( vs_frame )
        vs_vohp->make_frame( &av_picture, ctx->width, ctx->height, vs_vohp->component_reorder, vs_frame, frame_ctx, vsapi );
    else if( frame_ctx )
        vsapi->setFilterError( "lsmas: failed to alloc a output video frame.", frame_ctx );
    if( ret > 0 )
        av_free( av_picture.data[0] );
    return vs_frame;
}
static int make_frame_planar_yuv_stacked
(
    lw_video_output_handler_t *vohp,
    int                        height,
    AVFrame                   *av_frame,
    PVideoFrame               &as_frame
)
{
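    /* Split each little-endian 16-bit sample into 8-bit MSB and LSB planes stacked vertically:
     * the MSB half occupies the top of each destination plane and the LSB half follows below it. */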
    as_picture_t dst_picture = { { { NULL } } };
    as_picture_t src_picture = { { { NULL } } };
    as_assign_planar_yuv( as_frame, &dst_picture );
    lw_video_scaler_handler_t *vshp = &vohp->scaler;
    as_video_output_handler_t *as_vohp = (as_video_output_handler_t *)vohp->private_handler;
    if( vshp->input_pixel_format == vshp->output_pixel_format )
        for( int i = 0; i < 3; i++ )
        {
            src_picture.data    [i] = av_frame->data    [i];
            src_picture.linesize[i] = av_frame->linesize[i];
        }
    else
    {
        if( convert_av_pixel_format( vshp->sws_ctx, height, av_frame, &as_vohp->scaled ) < 0 )
            return -1;
        src_picture = as_vohp->scaled;
    }
    for( int i = 0; i < 3; i++ )
    {
        const int src_height = height >> (i ? as_vohp->sub_height : 0);
        const int width      = vshp->input_width >> (i ? as_vohp->sub_width : 0);
        const int width16    = sse2_available > 0 ? (width & ~15) : 0;
        const int width32    = avx2_available > 0 ? (width & ~31) : 0;
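        /* The LSB half of each stacked plane starts src_height rows below its MSB half. */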
        const int lsb_offset = src_height * dst_picture.linesize[i];
        for( int j = 0; j < src_height; j++ )
        {
            /* Use SIMD instructions here if available.
             * Note: we assume the given addresses are divisible by 32 or 16.
             *       The destination is always 32-byte aligned unless AviSynth legacy alignment is used.
             *       The source is not necessarily 32- or 16-byte aligned when the frame buffer comes directly from libavcodec. */
            static const uint8_t LW_ALIGN(32) sp16[32] =
                {
                    /* saturation protector:
                     * clears the upper 8 bits of each 16-bit sample so that the unsigned saturation in packus has no effect. */
                    0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00,
                    0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00
                };
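            /* ANDing with the mask keeps the low byte of each sample (LSB plane), while a logical right shift by 8
             * keeps the high byte (MSB plane); packus then narrows the 16-bit lanes to bytes. */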
                  uint8_t *dst     = dst_picture.data[i] + j * dst_picture.linesize[i];     /* MSB: dst +     k,     LSB: dst +     k + lsb_offset */
            const uint8_t *src     = src_picture.data[i] + j * src_picture.linesize[i];     /* MSB: src + 2 * k + 1, LSB: src + 2 * k */
            const int     _width16 = ((intptr_t)src & 15) == 0 ? width16 : 0;               /* Don't use SSE2 instructions if set to 0. */
            const int     _width32 = ((intptr_t)src & 31) == 0 ? width32 : 0;               /* Don't use AVX(2) instructions if set to 0. */
#if VC_HAS_AVX2
            /* AVX, AVX2 */
            for( int k = 0; k < _width32; k += 32 )
            {
                __m256i ymm0 = _mm256_load_si256( (__m256i *)(src + 2 * k     ) );
                __m256i ymm1 = _mm256_load_si256( (__m256i *)(src + 2 * k + 32) );
                __m256i mask = _mm256_load_si256( (__m256i *)sp16 );
                __m256i ymm2 = _mm256_packus_epi16( _mm256_and_si256 ( ymm0, mask ), _mm256_and_si256 ( ymm1, mask ) );
                __m256i ymm3 = _mm256_packus_epi16( _mm256_srli_epi16( ymm0,    8 ), _mm256_srli_epi16( ymm1,    8 ) );
                _mm256_store_si256( (__m256i *)(dst + k + lsb_offset), _mm256_permute4x64_epi64( ymm2, _MM_SHUFFLE( 3, 1, 2, 0 ) ) );
                _mm256_store_si256( (__m256i *)(dst + k             ), _mm256_permute4x64_epi64( ymm3, _MM_SHUFFLE( 3, 1, 2, 0 ) ) );
            }
#endif
            /* SSE2 */
            for( int k = _width32; k < _width16; k += 16 )
            {
                __m128i xmm0 = _mm_load_si128( (__m128i *)(src + 2 * k     ) );
                __m128i xmm1 = _mm_load_si128( (__m128i *)(src + 2 * k + 16) );
                __m128i mask = _mm_load_si128( (__m128i *)sp16 );
                _mm_store_si128( (__m128i *)(dst + k + lsb_offset), _mm_packus_epi16( _mm_and_si128 ( xmm0, mask ), _mm_and_si128 ( xmm1, mask ) ) );
                _mm_store_si128( (__m128i *)(dst + k             ), _mm_packus_epi16( _mm_srli_epi16( xmm0,    8 ), _mm_srli_epi16( xmm1,    8 ) ) );
            }
            for( int k = _width16; k < width; k++ )
            {
                *(dst + k + lsb_offset) = *(src + 2 * k    );
                *(dst + k             ) = *(src + 2 * k + 1);
            }
        }
    }
    return 0;
}