// Writes a copy of srcImage into destImage in which the x, y and z channels of every
// pixel have been multiplied by that pixel's w channel; the w channel itself is copied
// through unchanged. Both images are expected to have identical dimensions.
//
// Returns E_OUTOFMEMORY if the scratch scanline cannot be allocated, E_POINTER if either
// image has a null pixel pointer, E_FAIL if a scanline cannot be loaded or stored, and
// S_OK otherwise.
static HRESULT _PremultiplyAlpha( _In_ const Image& srcImage, _In_ const Image& destImage )
{
    assert( srcImage.width == destImage.width );
    assert( srcImage.height == destImage.height );

    // One row of XMVECTOR texels, 16-byte aligned, reused for every scanline
    ScopedAlignedArrayXMVECTOR scanline( reinterpret_cast<XMVECTOR*>( _aligned_malloc( (sizeof(XMVECTOR)*srcImage.width), 16 ) ) );
    if ( !scanline )
        return E_OUTOFMEMORY;

    const uint8_t *srcRow = srcImage.pixels;
    uint8_t *destRow = destImage.pixels;
    if ( !srcRow || !destRow )
        return E_POINTER;

    XMVECTOR* const texels = scanline.get();

    for( size_t row = 0; row < srcImage.height; ++row, srcRow += srcImage.rowPitch, destRow += destImage.rowPitch )
    {
        if ( !_LoadScanline( texels, srcImage.width, srcRow, srcImage.rowPitch, srcImage.format ) )
            return E_FAIL;

        for( size_t col = 0; col < srcImage.width; ++col )
        {
            const XMVECTOR color = texels[ col ];
            const XMVECTOR scaled = XMVectorMultiply( color, XMVectorSplatW( color ) );

            // Take x, y, z from the scaled value and keep the original w
            texels[ col ] = XMVectorSelect( color, scaled, g_XMSelect1110 );
        }

        if ( !_StoreScanline( destRow, destImage.rowPitch, destImage.format, texels, srcImage.width ) )
            return E_FAIL;
    }

    return S_OK;
}
//------------------------------------------------------------------------------------- static HRESULT _CompressBC( _In_ const Image& image, _In_ const Image& result, _In_ DWORD bcflags, _In_ float alphaRef, _In_ bool degenerate ) { if ( !image.pixels || !result.pixels ) return E_POINTER; assert( image.width == result.width ); assert( image.height == result.height ); const DXGI_FORMAT format = image.format; size_t sbpp = BitsPerPixel( format ); if ( !sbpp ) return E_FAIL; if ( sbpp < 8 ) { // We don't support compressing from monochrome (DXGI_FORMAT_R1_UNORM) return HRESULT_FROM_WIN32( ERROR_NOT_SUPPORTED ); } // Round to bytes sbpp = ( sbpp + 7 ) / 8; uint8_t *pDest = result.pixels; // Determine BC format encoder BC_ENCODE pfEncode; size_t blocksize; switch(result.format) { case DXGI_FORMAT_BC1_UNORM: case DXGI_FORMAT_BC1_UNORM_SRGB: pfEncode = nullptr; blocksize = 8; break; case DXGI_FORMAT_BC2_UNORM: case DXGI_FORMAT_BC2_UNORM_SRGB: pfEncode = D3DXEncodeBC2; blocksize = 16; break; case DXGI_FORMAT_BC3_UNORM: case DXGI_FORMAT_BC3_UNORM_SRGB: pfEncode = D3DXEncodeBC3; blocksize = 16; break; case DXGI_FORMAT_BC4_UNORM: pfEncode = D3DXEncodeBC4U; blocksize = 8; break; case DXGI_FORMAT_BC4_SNORM: pfEncode = D3DXEncodeBC4S; blocksize = 8; break; case DXGI_FORMAT_BC5_UNORM: pfEncode = D3DXEncodeBC5U; blocksize = 16; break; case DXGI_FORMAT_BC5_SNORM: pfEncode = D3DXEncodeBC5S; blocksize = 16; break; case DXGI_FORMAT_BC6H_UF16: pfEncode = D3DXEncodeBC6HU; blocksize = 16; break; case DXGI_FORMAT_BC6H_SF16: pfEncode = D3DXEncodeBC6HS; blocksize = 16; break; case DXGI_FORMAT_BC7_UNORM: case DXGI_FORMAT_BC7_UNORM_SRGB: pfEncode = D3DXEncodeBC7; blocksize = 16; break; default: return HRESULT_FROM_WIN32( ERROR_NOT_SUPPORTED ); } XMVECTOR temp[16]; const uint8_t *pSrc = image.pixels; const size_t rowPitch = image.rowPitch; for( size_t h=0; h < image.height; h += 4 ) { const uint8_t *sptr = pSrc; uint8_t* dptr = pDest; for( size_t count = 0; count < rowPitch; count += sbpp*4 ) 
{ if ( !_LoadScanline( &temp[0], 4, sptr, rowPitch, format ) ) return E_FAIL; if ( image.height > 1 ) { if ( !_LoadScanline( &temp[4], 4, sptr + rowPitch, rowPitch, format ) ) return E_FAIL; if ( image.height > 2 ) { if ( !_LoadScanline( &temp[8], 4, sptr + rowPitch*2, rowPitch, format ) ) return E_FAIL; if ( !_LoadScanline( &temp[12], 4, sptr + rowPitch*3, rowPitch, format ) ) return E_FAIL; } } if ( degenerate ) { assert( image.width < 4 || image.height < 4 ); const size_t uSrc[] = { 0, 0, 0, 1 }; if ( image.width < 4 ) { for( size_t t=0; t < image.height && t < 4; ++t ) { for( size_t s = image.width; s < 4; ++s ) { temp[ t*4 + s ] = temp[ t*4 + uSrc[s] ]; } } } if ( image.height < 4 ) { for( size_t t=image.height; t < 4; ++t ) { for( size_t s =0; s < 4; ++s ) { temp[ t*4 + s ] = temp[ uSrc[t]*4 + s ]; } } } } _ConvertScanline( temp, 16, result.format, format, 0 ); if ( pfEncode ) pfEncode( dptr, temp, bcflags ); else D3DXEncodeBC1( dptr, temp, alphaRef, bcflags ); sptr += sbpp*4; dptr += blocksize; } pSrc += rowPitch*4; pDest += result.rowPitch; } return S_OK; }
// OpenMP variant of the block compressor: every 4x4 block is loaded, converted and
// encoded in its own loop iteration. Partial blocks are not handled, so the image
// dimensions are asserted to be multiples of 4.
//
//   image    - uncompressed source, read through _LoadScanline using image.format
//   result   - destination; blocks are written consecutively from result.pixels
//   bcflags  - forwarded to the block encoder
//   alphaRef - forwarded to D3DXEncodeBC1 (used for the BC1 formats only)
//
// Returns S_OK on success; E_POINTER if either pixel pointer is null; E_FAIL if the
// source format reports zero bits per pixel or any scanline fails to load;
// HRESULT_FROM_WIN32( ERROR_NOT_SUPPORTED ) for sources narrower than 8 bits per pixel
// or for a result format that is not one of the BC formats handled below.
static HRESULT _CompressBC_Parallel( _In_ const Image& image, _In_ const Image& result, _In_ DWORD bcflags,
                                     _In_ float alphaRef )
{
    if ( !image.pixels || !result.pixels )
        return E_POINTER;

    // Parallel version doesn't support degenerate case
    assert( ((image.width % 4) == 0) && ((image.height % 4) == 0 ) );

    assert( image.width == result.width );
    assert( image.height == result.height );

    const DXGI_FORMAT srcFormat = image.format;

    const size_t srcBits = BitsPerPixel( srcFormat );
    if ( !srcBits )
        return E_FAIL;

    // We don't support compressing from monochrome (DXGI_FORMAT_R1_UNORM)
    if ( srcBits < 8 )
        return HRESULT_FROM_WIN32( ERROR_NOT_SUPPORTED );

    // Round to bytes
    const size_t srcBytes = ( srcBits + 7 ) / 8;

    // Determine BC format encoder (a null encoder means the BC1 path)
    BC_ENCODE encoder;
    size_t blockBytes;
    switch( result.format )
    {
    case DXGI_FORMAT_BC1_UNORM:
    case DXGI_FORMAT_BC1_UNORM_SRGB:
        encoder = nullptr;
        blockBytes = 8;
        break;

    case DXGI_FORMAT_BC2_UNORM:
    case DXGI_FORMAT_BC2_UNORM_SRGB:
        encoder = D3DXEncodeBC2;
        blockBytes = 16;
        break;

    case DXGI_FORMAT_BC3_UNORM:
    case DXGI_FORMAT_BC3_UNORM_SRGB:
        encoder = D3DXEncodeBC3;
        blockBytes = 16;
        break;

    case DXGI_FORMAT_BC4_UNORM:
        encoder = D3DXEncodeBC4U;
        blockBytes = 8;
        break;

    case DXGI_FORMAT_BC4_SNORM:
        encoder = D3DXEncodeBC4S;
        blockBytes = 8;
        break;

    case DXGI_FORMAT_BC5_UNORM:
        encoder = D3DXEncodeBC5U;
        blockBytes = 16;
        break;

    case DXGI_FORMAT_BC5_SNORM:
        encoder = D3DXEncodeBC5S;
        blockBytes = 16;
        break;

    case DXGI_FORMAT_BC6H_UF16:
        encoder = D3DXEncodeBC6HU;
        blockBytes = 16;
        break;

    case DXGI_FORMAT_BC6H_SF16:
        encoder = D3DXEncodeBC6HS;
        blockBytes = 16;
        break;

    case DXGI_FORMAT_BC7_UNORM:
    case DXGI_FORMAT_BC7_UNORM_SRGB:
        encoder = D3DXEncodeBC7;
        blockBytes = 16;
        break;

    default:
        return HRESULT_FROM_WIN32( ERROR_NOT_SUPPORTED );
    }

    // Refactored version of loop to support parallel independence
    const size_t blocksAcross = std::max<size_t>(1, image.width / 4);
    const size_t blocksDown = std::max<size_t>(1, image.height / 4);
    const size_t totalBlocks = blocksAcross * blocksDown;

    // NOTE(review): this flag is set from inside the parallel loop without any
    // synchronization; every writer stores the same value.
    bool fail = false;

#pragma omp parallel for
    for( int nb=0; nb < static_cast<int>( totalBlocks ); ++nb )
    {
        // Block coordinates of this iteration
        const size_t y = nb / blocksAcross;
        const size_t x = nb - (y*blocksAcross);

        assert( x < image.width && y < image.height );

        const size_t srcPitch = image.rowPitch;

        const uint8_t *blockSrc = image.pixels + (y*4*srcPitch) + (x*4*srcBytes);
        uint8_t *blockDest = result.pixels + (nb*blockBytes);

        XMVECTOR texels[16];

        // A failed load is recorded, but the block is still converted and encoded
        for( size_t row = 0; row < 4; ++row )
        {
            if ( !_LoadScanline( &texels[ row*4 ], 4, blockSrc + srcPitch*row, srcPitch, srcFormat ) )
                fail = true;
        }

        _ConvertScanline( texels, 16, result.format, srcFormat, 0 );

        if ( encoder )
        {
            encoder( blockDest, texels, bcflags );
        }
        else
        {
            D3DXEncodeBC1( blockDest, texels, alphaRef, bcflags );
        }
    }

    return fail ? E_FAIL : S_OK;
}
//------------------------------------------------------------------------------------- static HRESULT _CompressBC( _In_ const Image& image, _In_ const Image& result, _In_ DWORD bcflags, _In_ DWORD srgb, _In_ float alphaRef ) { if ( !image.pixels || !result.pixels ) return E_POINTER; assert( image.width == result.width ); assert( image.height == result.height ); const DXGI_FORMAT format = image.format; size_t sbpp = BitsPerPixel( format ); if ( !sbpp ) return E_FAIL; if ( sbpp < 8 ) { // We don't support compressing from monochrome (DXGI_FORMAT_R1_UNORM) return HRESULT_FROM_WIN32( ERROR_NOT_SUPPORTED ); } // Round to bytes sbpp = ( sbpp + 7 ) / 8; uint8_t *pDest = result.pixels; // Determine BC format encoder BC_ENCODE pfEncode; size_t blocksize; DWORD cflags; if ( !_DetermineEncoderSettings( result.format, pfEncode, blocksize, cflags ) ) return HRESULT_FROM_WIN32( ERROR_NOT_SUPPORTED ); XMVECTOR temp[16]; const uint8_t *pSrc = image.pixels; const size_t rowPitch = image.rowPitch; for( size_t h=0; h < image.height; h += 4 ) { const uint8_t *sptr = pSrc; uint8_t* dptr = pDest; size_t ph = std::min<size_t>( 4, image.height - h ); size_t w = 0; for( size_t count = 0; count < rowPitch; count += sbpp*4, w += 4 ) { size_t pw = std::min<size_t>( 4, image.width - w ); assert( pw > 0 && ph > 0 ); if ( !_LoadScanline( &temp[0], pw, sptr, rowPitch, format ) ) return E_FAIL; if ( ph > 1 ) { if ( !_LoadScanline( &temp[4], pw, sptr + rowPitch, rowPitch, format ) ) return E_FAIL; if ( ph > 2 ) { if ( !_LoadScanline( &temp[8], pw, sptr + rowPitch*2, rowPitch, format ) ) return E_FAIL; if ( ph > 3 ) { if ( !_LoadScanline( &temp[12], pw, sptr + rowPitch*3, rowPitch, format ) ) return E_FAIL; } } } if ( pw != 4 || ph != 4 ) { // Replicate pixels for partial block static const size_t uSrc[] = { 0, 0, 0, 1 }; if ( pw < 4 ) { for( size_t t = 0; t < ph && t < 4; ++t ) { for( size_t s = pw; s < 4; ++s ) { #pragma prefast(suppress: 26000, "PREFAST false positive") temp[ (t << 2) | s ] = 
temp[ (t << 2) | uSrc[s] ]; } } } if ( ph < 4 ) { for( size_t t = ph; t < 4; ++t ) { for( size_t s = 0; s < 4; ++s ) { #pragma prefast(suppress: 26000, "PREFAST false positive") temp[ (t << 2) | s ] = temp[ (uSrc[t] << 2) | s ]; } } } } _ConvertScanline( temp, 16, result.format, format, cflags | srgb ); if ( pfEncode ) pfEncode( dptr, temp, bcflags ); else D3DXEncodeBC1( dptr, temp, alphaRef, bcflags ); sptr += sbpp*4; dptr += blocksize; } pSrc += rowPitch*4; pDest += result.rowPitch; } return S_OK; }
// OpenMP variant of the block compressor: every 4x4 block is loaded, converted and
// encoded in its own loop iteration. Blocks that extend past the right or bottom edge
// of the image are filled by replicating existing texels.
//
//   image    - uncompressed source, read through _LoadScanline using image.format
//   result   - destination; blocks are written consecutively from result.pixels
//   bcflags  - forwarded to the block encoder
//   srgb     - OR'ed with the encoder's conversion flags and passed to _ConvertScanline
//   alphaRef - forwarded to D3DXEncodeBC1 (used when no encoder function is returned)
//
// Returns S_OK on success; E_POINTER if either pixel pointer is null; E_FAIL if the
// source format reports zero bits per pixel or any scanline fails to load;
// HRESULT_FROM_WIN32( ERROR_NOT_SUPPORTED ) for sources narrower than 8 bits per pixel
// or when _DetermineEncoderSettings rejects the result format.
static HRESULT _CompressBC_Parallel( _In_ const Image& image, _In_ const Image& result, _In_ DWORD bcflags,
                                     _In_ DWORD srgb, _In_ float alphaRef )
{
    if ( !image.pixels || !result.pixels )
        return E_POINTER;

    assert( image.width == result.width );
    assert( image.height == result.height );

    const DXGI_FORMAT format = image.format;
    size_t sbpp = BitsPerPixel( format );
    if ( !sbpp )
        return E_FAIL;

    if ( sbpp < 8 )
    {
        // We don't support compressing from monochrome (DXGI_FORMAT_R1_UNORM)
        return HRESULT_FROM_WIN32( ERROR_NOT_SUPPORTED );
    }

    // Round to bytes
    sbpp = ( sbpp + 7 ) / 8;

    // Determine BC format encoder
    BC_ENCODE pfEncode;
    size_t blocksize;
    DWORD cflags;
    if ( !_DetermineEncoderSettings( result.format, pfEncode, blocksize, cflags ) )
        return HRESULT_FROM_WIN32( ERROR_NOT_SUPPORTED );

    // Refactored version of loop to support parallel independence
    const size_t nbWidth = std::max<size_t>(1, (image.width + 3) / 4 );
    const size_t nbHeight = std::max<size_t>(1, (image.height + 3) / 4 );
    const size_t nBlocks = nbWidth * nbHeight;

    // NOTE(review): this flag is set from inside the parallel loop without any
    // synchronization; every writer stores the same value.
    bool fail = false;

#pragma omp parallel for
    for( int nb=0; nb < static_cast<int>( nBlocks ); ++nb )
    {
        // Block coordinates of this iteration
        const size_t by = nb / nbWidth;
        const size_t bx = nb - (by*nbWidth);

        // Pixel coordinates of the block's top-left texel. The partial-block sizes
        // below must be measured from these, not from the block indices; otherwise
        // edge blocks beyond the first row/column are treated as full 4x4 blocks and
        // read past the image.
        const size_t x = bx * 4;
        const size_t y = by * 4;

        assert( x < image.width && y < image.height );

        size_t rowPitch = image.rowPitch;

        const uint8_t *pSrc = image.pixels + (y*rowPitch) + (x*sbpp);

        uint8_t *pDest = result.pixels + (nb*blocksize);

        // Rows and texels-per-row of this block that actually exist in the source
        size_t ph = std::min<size_t>( 4, image.height - y );
        size_t pw = std::min<size_t>( 4, image.width - x );
        assert( pw > 0 && ph > 0 );

        XMVECTOR temp[16];

        // A failed load is recorded, but the block is still converted and encoded
        if ( !_LoadScanline( &temp[0], pw, pSrc, rowPitch, format ) )
            fail = true;

        if ( ph > 1 )
        {
            if ( !_LoadScanline( &temp[4], pw, pSrc + rowPitch, rowPitch, format ) )
                fail = true;

            if ( ph > 2 )
            {
                if ( !_LoadScanline( &temp[8], pw, pSrc + rowPitch*2, rowPitch, format ) )
                    fail = true;

                if ( ph > 3 )
                {
                    if ( !_LoadScanline( &temp[12], pw, pSrc + rowPitch*3, rowPitch, format ) )
                        fail = true;
                }
            }
        }

        if ( pw != 4 || ph != 4 )
        {
            // Replicate pixels for partial block. uSrc maps a missing column/row index
            // to the existing column/row it is copied from.
            static const size_t uSrc[] = { 0, 0, 0, 1 };

            if ( pw < 4 )
            {
                for( size_t t = 0; t < ph && t < 4; ++t )
                {
                    for( size_t s = pw; s < 4; ++s )
                    {
                        temp[ (t << 2) | s ] = temp[ (t << 2) | uSrc[s] ];
                    }
                }
            }

            if ( ph < 4 )
            {
                for( size_t t = ph; t < 4; ++t )
                {
                    for( size_t s = 0; s < 4; ++s )
                    {
                        temp[ (t << 2) | s ] = temp[ (uSrc[t] << 2) | s ];
                    }
                }
            }
        }

        _ConvertScanline( temp, 16, result.format, format, cflags | srgb );

        if ( pfEncode )
            pfEncode( pDest, temp, bcflags );
        else
            D3DXEncodeBC1( pDest, temp, alphaRef, bcflags );
    }

    return (fail) ? E_FAIL : S_OK;
}