Example #1
0
// Scales the x/y/z channels of every pixel in srcImage by that pixel's w (alpha)
// channel and stores the result in destImage; the w channel is carried over
// unmodified.
//
// Returns E_OUTOFMEMORY if the scratch scanline cannot be allocated, E_POINTER if
// either pixel buffer is null, E_FAIL if a row cannot be loaded or stored, and
// S_OK otherwise.
static HRESULT _PremultiplyAlpha( _In_ const Image& srcImage, _In_ const Image& destImage )
{
    assert( srcImage.width == destImage.width );
    assert( srcImage.height == destImage.height );

    const size_t width = srcImage.width;

    // Scratch buffer holding one row of pixels, allocated on a 16-byte boundary
    ScopedAlignedArrayXMVECTOR row( reinterpret_cast<XMVECTOR*>( _aligned_malloc( (sizeof(XMVECTOR)*width), 16 ) ) );
    if ( !row )
        return E_OUTOFMEMORY;

    if ( !srcImage.pixels || !destImage.pixels )
        return E_POINTER;

    const uint8_t *srcRow = srcImage.pixels;
    uint8_t *destRow = destImage.pixels;

    for( size_t y = 0; y < srcImage.height; ++y, srcRow += srcImage.rowPitch, destRow += destImage.rowPitch )
    {
        XMVECTOR* const pixels = row.get();

        if ( !_LoadScanline( pixels, width, srcRow, srcImage.rowPitch, srcImage.format ) )
            return E_FAIL;

        for( size_t x = 0; x < width; ++x )
        {
            const XMVECTOR color = pixels[x];
            const XMVECTOR scaled = XMVectorMultiply( color, XMVectorSplatW( color ) );

            // x/y/z come from the scaled color, w from the untouched original
            pixels[x] = XMVectorSelect( color, scaled, g_XMSelect1110 );
        }

        if ( !_StoreScanline( destRow, destImage.rowPitch, destImage.format, pixels, width ) )
            return E_FAIL;
    }

    return S_OK;
}
//-------------------------------------------------------------------------------------
// Single-threaded BC compressor: walks the source image in 4x4 texel blocks and
// encodes each block into the format given by result.format.
//
//   image      - source pixels; formats below 8 bits per pixel are rejected
//   result     - destination image, same width/height as 'image'
//   bcflags    - forwarded unchanged to the block encoder
//   alphaRef   - forwarded to the BC1 encoder only
//   degenerate - caller's indication that the image is smaller than 4 texels in
//                at least one dimension (only checked by an assert; partial
//                blocks are padded whether or not it is set)
//
// Returns E_POINTER for null pixel buffers, E_FAIL when the bits-per-pixel
// lookup or a scanline load fails, HRESULT_FROM_WIN32(ERROR_NOT_SUPPORTED) for
// sub-byte source formats or unrecognized target formats, and S_OK otherwise.
static HRESULT _CompressBC( _In_ const Image& image, _In_ const Image& result, _In_ DWORD bcflags,
                            _In_ float alphaRef, _In_ bool degenerate )
{
    if ( !image.pixels || !result.pixels )
        return E_POINTER;

    assert( image.width == result.width );
    assert( image.height == result.height );

    // The degenerate flag is only expected for images smaller than a full block
    assert( !degenerate || image.width < 4 || image.height < 4 );
    (void)degenerate;

    const DXGI_FORMAT format = image.format;
    size_t sbpp = BitsPerPixel( format );
    if ( !sbpp )
        return E_FAIL;

    if ( sbpp < 8 )
    {
        // We don't support compressing from monochrome (DXGI_FORMAT_R1_UNORM)
        return HRESULT_FROM_WIN32( ERROR_NOT_SUPPORTED );
    }

    // Round to bytes
    sbpp = ( sbpp + 7 ) / 8;

    uint8_t *pDest = result.pixels;

    // Determine BC format encoder (nullptr selects the BC1 path, which takes an extra alphaRef argument)
    BC_ENCODE pfEncode;
    size_t blocksize;
    switch(result.format)
    {
    case DXGI_FORMAT_BC1_UNORM:
    case DXGI_FORMAT_BC1_UNORM_SRGB:    pfEncode = nullptr;         blocksize = 8;   break;
    case DXGI_FORMAT_BC2_UNORM:
    case DXGI_FORMAT_BC2_UNORM_SRGB:    pfEncode = D3DXEncodeBC2;   blocksize = 16;  break;
    case DXGI_FORMAT_BC3_UNORM:
    case DXGI_FORMAT_BC3_UNORM_SRGB:    pfEncode = D3DXEncodeBC3;   blocksize = 16;  break;
    case DXGI_FORMAT_BC4_UNORM:         pfEncode = D3DXEncodeBC4U;  blocksize = 8;   break;
    case DXGI_FORMAT_BC4_SNORM:         pfEncode = D3DXEncodeBC4S;  blocksize = 8;   break;
    case DXGI_FORMAT_BC5_UNORM:         pfEncode = D3DXEncodeBC5U;  blocksize = 16;  break;
    case DXGI_FORMAT_BC5_SNORM:         pfEncode = D3DXEncodeBC5S;  blocksize = 16;  break;
    case DXGI_FORMAT_BC6H_UF16:         pfEncode = D3DXEncodeBC6HU; blocksize = 16;  break;
    case DXGI_FORMAT_BC6H_SF16:         pfEncode = D3DXEncodeBC6HS; blocksize = 16;  break;
    case DXGI_FORMAT_BC7_UNORM:
    case DXGI_FORMAT_BC7_UNORM_SRGB:    pfEncode = D3DXEncodeBC7;   blocksize = 16;  break;
    default:
        return HRESULT_FROM_WIN32( ERROR_NOT_SUPPORTED );
    }

    XMVECTOR temp[16];
    const uint8_t *pSrc = image.pixels;
    const size_t rowPitch = image.rowPitch;
    for( size_t h=0; h < image.height; h += 4 )
    {
        const uint8_t *sptr = pSrc;
        uint8_t* dptr = pDest;

        // Number of source rows that really exist in this row of blocks
        const size_t rowsLeft = image.height - h;
        const size_t ph = ( rowsLeft < 4 ) ? rowsLeft : 4;

        size_t w = 0;
        for( size_t count = 0; count < rowPitch && w < image.width; count += sbpp*4, w += 4 )
        {
            // Number of source texels that really exist in this block's rows
            const size_t colsLeft = image.width - w;
            const size_t pw = ( colsLeft < 4 ) ? colsLeft : 4;

            // Load only texels that exist; asking for a full 4x4 on an edge
            // block would read past the end of the row or of the image
            for( size_t t = 0; t < ph; ++t )
            {
                if ( !_LoadScanline( &temp[ t*4 ], pw, sptr + rowPitch*t, rowPitch, format ) )
                    return E_FAIL;
            }

            if ( pw < 4 || ph < 4 )
            {
                // Partial block: fill the missing slots by replicating loaded
                // texels. uSrc[i] is the column (or row) copied into slot i.
                static const size_t uSrc[] = { 0, 0, 0, 1 };

                for( size_t t = 0; t < ph; ++t )
                {
                    for( size_t s = pw; s < 4; ++s )
                    {
                        temp[ t*4 + s ] = temp[ t*4 + uSrc[s] ];
                    }
                }

                for( size_t t = ph; t < 4; ++t )
                {
                    for( size_t s = 0; s < 4; ++s )
                    {
                        temp[ t*4 + s ] = temp[ uSrc[t]*4 + s ];
                    }
                }
            }

            _ConvertScanline( temp, 16, result.format, format, 0 );

            if ( pfEncode )
                pfEncode( dptr, temp, bcflags );
            else
                D3DXEncodeBC1( dptr, temp, alphaRef, bcflags );

            sptr += sbpp*4;
            dptr += blocksize;
        }

        pSrc += rowPitch*4;
        pDest += result.rowPitch;
    }

    return S_OK;
}
// OpenMP variant of the BC compressor: every 4x4 block is encoded independently,
// so the blocks are numbered linearly and distributed with 'omp parallel for'.
//
//   image    - source pixels; width and height are asserted to be multiples of 4,
//              and formats below 8 bits per pixel are rejected
//   result   - destination image, same width/height as 'image'
//   bcflags  - forwarded unchanged to the block encoder
//   alphaRef - forwarded to the BC1 encoder only
//
// Returns E_POINTER for null pixel buffers, E_FAIL when the bits-per-pixel
// lookup fails or any block could not be loaded,
// HRESULT_FROM_WIN32(ERROR_NOT_SUPPORTED) for sub-byte source formats or
// unrecognized target formats, and S_OK otherwise.
static HRESULT _CompressBC_Parallel( _In_ const Image& image, _In_ const Image& result, _In_ DWORD bcflags,
                                     _In_ float alphaRef )
{
    if ( !image.pixels || !result.pixels )
        return E_POINTER;

    // Parallel version doesn't support degenerate case
    assert( ((image.width % 4) == 0) && ((image.height % 4) == 0 ) );

    assert( image.width == result.width );
    assert( image.height == result.height );

    const DXGI_FORMAT format = image.format;
    size_t sbpp = BitsPerPixel( format );
    if ( !sbpp )
        return E_FAIL;

    if ( sbpp < 8 )
    {
        // We don't support compressing from monochrome (DXGI_FORMAT_R1_UNORM)
        return HRESULT_FROM_WIN32( ERROR_NOT_SUPPORTED );
    }

    // Round to bytes
    sbpp = ( sbpp + 7 ) / 8;

    // Determine BC format encoder (nullptr selects the BC1 path, which takes an extra alphaRef argument)
    BC_ENCODE pfEncode;
    size_t blocksize;
    switch(result.format)
    {
    case DXGI_FORMAT_BC1_UNORM:
    case DXGI_FORMAT_BC1_UNORM_SRGB:    pfEncode = nullptr;         blocksize = 8;   break;
    case DXGI_FORMAT_BC2_UNORM:
    case DXGI_FORMAT_BC2_UNORM_SRGB:    pfEncode = D3DXEncodeBC2;   blocksize = 16;  break;
    case DXGI_FORMAT_BC3_UNORM:
    case DXGI_FORMAT_BC3_UNORM_SRGB:    pfEncode = D3DXEncodeBC3;   blocksize = 16;  break;
    case DXGI_FORMAT_BC4_UNORM:         pfEncode = D3DXEncodeBC4U;  blocksize = 8;   break;
    case DXGI_FORMAT_BC4_SNORM:         pfEncode = D3DXEncodeBC4S;  blocksize = 8;   break;
    case DXGI_FORMAT_BC5_UNORM:         pfEncode = D3DXEncodeBC5U;  blocksize = 16;  break;
    case DXGI_FORMAT_BC5_SNORM:         pfEncode = D3DXEncodeBC5S;  blocksize = 16;  break;
    case DXGI_FORMAT_BC6H_UF16:         pfEncode = D3DXEncodeBC6HU; blocksize = 16;  break;
    case DXGI_FORMAT_BC6H_SF16:         pfEncode = D3DXEncodeBC6HS; blocksize = 16;  break;
    case DXGI_FORMAT_BC7_UNORM:
    case DXGI_FORMAT_BC7_UNORM_SRGB:    pfEncode = D3DXEncodeBC7;   blocksize = 16;  break;
    default:
        return HRESULT_FROM_WIN32( ERROR_NOT_SUPPORTED );
    }

    // Refactored version of loop to support parallel independence
    const size_t nbWidth = std::max<size_t>(1, image.width / 4);
    const size_t nbHeight = std::max<size_t>(1, image.height / 4);
    const size_t nBlocks = nbWidth * nbHeight;
    const size_t rowPitch = image.rowPitch;

    // Shared across iterations; every writer stores the same value (true)
    bool fail = false;

#pragma omp parallel for
    for( int nb=0; nb < static_cast<int>( nBlocks ); ++nb )
    {
        const size_t block = static_cast<size_t>( nb );

        // Top-left texel of this block
        const size_t y = ( block / nbWidth ) * 4;
        const size_t x = ( block % nbWidth ) * 4;

        assert( x < image.width && y < image.height );

        const uint8_t *pSrc = image.pixels + (y*rowPitch) + (x*sbpp);

        // Blocks are written back-to-back: this assumes the destination rows are
        // tightly packed (result.rowPitch is not consulted here)
        uint8_t *pDest = result.pixels + (block*blocksize);

        XMVECTOR temp[16];
        bool loaded = true;
        for( size_t t = 0; t < 4; ++t )
        {
            if ( !_LoadScanline( &temp[ t*4 ], 4, pSrc + rowPitch*t, rowPitch, format ) )
            {
                loaded = false;
                break;
            }
        }

        if ( !loaded )
        {
            // Don't feed a partially-initialized block to the encoder; just
            // remember the failure and move on to the next block
            fail = true;
            continue;
        }

        _ConvertScanline( temp, 16, result.format, format, 0 );

        if ( pfEncode )
            pfEncode( pDest, temp, bcflags );
        else
            D3DXEncodeBC1( pDest, temp, alphaRef, bcflags );
    }

    return (fail) ? E_FAIL : S_OK;
}
//-------------------------------------------------------------------------------------
// Single-threaded BC compressor: walks the source image in 4x4 texel blocks,
// pads partial edge blocks by replicating texels, and encodes each block into
// the format given by result.format.
//
//   image    - source pixels; formats below 8 bits per pixel are rejected
//   result   - destination image, same width/height as 'image'
//   bcflags  - forwarded unchanged to the block encoder
//   srgb     - OR'ed with the encoder's conversion flags for _ConvertScanline
//   alphaRef - forwarded to the BC1 encoder only
//
// Returns E_POINTER for null pixel buffers, E_FAIL when the bits-per-pixel
// lookup or a scanline load fails, HRESULT_FROM_WIN32(ERROR_NOT_SUPPORTED) for
// sub-byte source formats or when _DetermineEncoderSettings rejects the target
// format, and S_OK otherwise.
static HRESULT _CompressBC( _In_ const Image& image, _In_ const Image& result, _In_ DWORD bcflags,
                            _In_ DWORD srgb, _In_ float alphaRef )
{
    if ( !image.pixels || !result.pixels )
        return E_POINTER;

    assert( image.width == result.width );
    assert( image.height == result.height );

    const DXGI_FORMAT format = image.format;
    size_t sbpp = BitsPerPixel( format );
    if ( !sbpp )
        return E_FAIL;

    if ( sbpp < 8 )
    {
        // We don't support compressing from monochrome (DXGI_FORMAT_R1_UNORM)
        return HRESULT_FROM_WIN32( ERROR_NOT_SUPPORTED );
    }

    // Round to bytes
    sbpp = ( sbpp + 7 ) / 8;

    uint8_t *pDest = result.pixels;

    // Determine BC format encoder (a null pfEncode selects the BC1 path below)
    BC_ENCODE pfEncode;
    size_t blocksize;
    DWORD cflags;
    if ( !_DetermineEncoderSettings( result.format, pfEncode, blocksize, cflags ) )
        return HRESULT_FROM_WIN32( ERROR_NOT_SUPPORTED );

    XMVECTOR temp[16];
    const uint8_t *pSrc = image.pixels;
    const size_t rowPitch = image.rowPitch;
    for( size_t h=0; h < image.height; h += 4 )
    {
        const uint8_t *sptr = pSrc;
        uint8_t* dptr = pDest;

        // Number of source rows that really exist in this row of blocks
        const size_t ph = std::min<size_t>( 4, image.height - h );

        // Stop at the image width as well as the row pitch: with a padded pitch
        // 'w' could otherwise reach image.width, making 'image.width - w' wrap
        // and extra blocks be written past this destination row
        size_t w = 0;
        for( size_t count = 0; count < rowPitch && w < image.width; count += sbpp*4, w += 4 )
        {
            const size_t pw = std::min<size_t>( 4, image.width - w );
            assert( pw > 0 && ph > 0 );

            // Load only the rows/texels that exist for this block
            for( size_t t = 0; t < ph; ++t )
            {
                if ( !_LoadScanline( &temp[ t << 2 ], pw, sptr + rowPitch*t, rowPitch, format ) )
                    return E_FAIL;
            }

            if ( pw != 4 || ph != 4 )
            {
                // Replicate pixels for partial block.
                // uSrc[i] is the column (or row) copied into missing slot i.
                static const size_t uSrc[] = { 0, 0, 0, 1 };

                if ( pw < 4 )
                {
                    for( size_t t = 0; t < ph && t < 4; ++t )
                    {
                        for( size_t s = pw; s < 4; ++s )
                        {
#pragma prefast(suppress: 26000, "PREFAST false positive")
                            temp[ (t << 2) | s ] = temp[ (t << 2) | uSrc[s] ];
                        }
                    }
                }

                if ( ph < 4 )
                {
                    for( size_t t = ph; t < 4; ++t )
                    {
                        for( size_t s = 0; s < 4; ++s )
                        {
#pragma prefast(suppress: 26000, "PREFAST false positive")
                            temp[ (t << 2) | s ] = temp[ (uSrc[t] << 2) | s ];
                        }
                    }
                }
            }

            _ConvertScanline( temp, 16, result.format, format, cflags | srgb );

            if ( pfEncode )
                pfEncode( dptr, temp, bcflags );
            else
                D3DXEncodeBC1( dptr, temp, alphaRef, bcflags );

            sptr += sbpp*4;
            dptr += blocksize;
        }

        pSrc += rowPitch*4;
        pDest += result.rowPitch;
    }

    return S_OK;
}
// OpenMP variant of the BC compressor: every 4x4 block is encoded independently,
// so the blocks are numbered linearly and distributed with 'omp parallel for'.
// Partial edge blocks are padded by replicating texels.
//
//   image    - source pixels; formats below 8 bits per pixel are rejected
//   result   - destination image, same width/height as 'image'
//   bcflags  - forwarded unchanged to the block encoder
//   srgb     - OR'ed with the encoder's conversion flags for _ConvertScanline
//   alphaRef - forwarded to the BC1 encoder only
//
// Returns E_POINTER for null pixel buffers, E_FAIL when the bits-per-pixel
// lookup fails or any block could not be loaded,
// HRESULT_FROM_WIN32(ERROR_NOT_SUPPORTED) for sub-byte source formats or when
// _DetermineEncoderSettings rejects the target format, and S_OK otherwise.
static HRESULT _CompressBC_Parallel( _In_ const Image& image, _In_ const Image& result, _In_ DWORD bcflags,
                                     _In_ DWORD srgb, _In_ float alphaRef )
{
    if ( !image.pixels || !result.pixels )
        return E_POINTER;

    assert( image.width == result.width );
    assert( image.height == result.height );

    const DXGI_FORMAT format = image.format;
    size_t sbpp = BitsPerPixel( format );
    if ( !sbpp )
        return E_FAIL;

    if ( sbpp < 8 )
    {
        // We don't support compressing from monochrome (DXGI_FORMAT_R1_UNORM)
        return HRESULT_FROM_WIN32( ERROR_NOT_SUPPORTED );
    }

    // Round to bytes
    sbpp = ( sbpp + 7 ) / 8;

    // Determine BC format encoder (a null pfEncode selects the BC1 path below)
    BC_ENCODE pfEncode;
    size_t blocksize;
    DWORD cflags;
    if ( !_DetermineEncoderSettings( result.format, pfEncode, blocksize, cflags ) )
        return HRESULT_FROM_WIN32( ERROR_NOT_SUPPORTED );

    // Refactored version of loop to support parallel independence
    const size_t nbWidth = std::max<size_t>(1, (image.width + 3) / 4 );
    const size_t nbHeight = std::max<size_t>(1, (image.height + 3) / 4 );
    const size_t nBlocks = nbWidth * nbHeight;
    const size_t rowPitch = image.rowPitch;

    // Shared across iterations; every writer stores the same value (true)
    bool fail = false;

#pragma omp parallel for
    for( int nb=0; nb < static_cast<int>( nBlocks ); ++nb )
    {
        const size_t block = static_cast<size_t>( nb );

        // Top-left texel of this block, in pixel (not block) coordinates
        const size_t y = ( block / nbWidth ) * 4;
        const size_t x = ( block % nbWidth ) * 4;

        assert( x < image.width && y < image.height );

        const uint8_t *pSrc = image.pixels + (y*rowPitch) + (x*sbpp);

        // Blocks are written back-to-back: this assumes the destination rows are
        // tightly packed (result.rowPitch is not consulted here)
        uint8_t *pDest = result.pixels + (block*blocksize);

        // Texels that really exist in this block. These must be measured from
        // the pixel coordinates; using the block indices would treat edge blocks
        // as full and read past the image.
        const size_t ph = std::min<size_t>( 4, image.height - y );
        const size_t pw = std::min<size_t>( 4, image.width - x );
        assert( pw > 0 && ph > 0 );

        XMVECTOR temp[16];
        bool loaded = true;
        for( size_t t = 0; t < ph; ++t )
        {
            if ( !_LoadScanline( &temp[ t << 2 ], pw, pSrc + rowPitch*t, rowPitch, format ) )
            {
                loaded = false;
                break;
            }
        }

        if ( !loaded )
        {
            // Don't feed a partially-initialized block to the encoder; just
            // remember the failure and move on to the next block
            fail = true;
            continue;
        }

        if ( pw != 4 || ph != 4 )
        {
            // Replicate pixels for partial block.
            // uSrc[i] is the column (or row) copied into missing slot i.
            static const size_t uSrc[] = { 0, 0, 0, 1 };

            if ( pw < 4 )
            {
                for( size_t t = 0; t < ph && t < 4; ++t )
                {
                    for( size_t s = pw; s < 4; ++s )
                    {
                        temp[ (t << 2) | s ] = temp[ (t << 2) | uSrc[s] ];
                    }
                }
            }

            if ( ph < 4 )
            {
                for( size_t t = ph; t < 4; ++t )
                {
                    for( size_t s = 0; s < 4; ++s )
                    {
                        temp[ (t << 2) | s ] = temp[ (uSrc[t] << 2) | s ];
                    }
                }
            }
        }

        _ConvertScanline( temp, 16, result.format, format, cflags | srgb );

        if ( pfEncode )
            pfEncode( pDest, temp, bcflags );
        else
            D3DXEncodeBC1( pDest, temp, alphaRef, bcflags );
    }

    return (fail) ? E_FAIL : S_OK;
}