// notes:
// Slow CPU based decompression : Should look into also using HW based decompression with this interface
//
CodecError CCodec_ASTC::Decompress(CCodecBuffer& bufferIn, CCodecBuffer& bufferOut, Codec_Feedback_Proc pFeedbackProc, CMP_DWORD_PTR pUser1, CMP_DWORD_PTR pUser2)
{
    m_xdim = bufferIn.GetBlockWidth();
    m_ydim = bufferIn.GetBlockHeight();
    m_zdim = 1;

    CodecError err = InitializeASTCLibrary();
    if (err != CE_OK) return err;

    // Our Compressed data Blocks are always 128 bit long (4x4 blocks)
    const CMP_DWORD imageWidth  = bufferIn.GetWidth();
    const CMP_DWORD imageHeight = bufferIn.GetHeight();
    const CMP_DWORD imageDepth  = 1;
    const BYTE      bitness     = 8;

    const CMP_DWORD CompBlockX  = bufferIn.GetBlockWidth();
    const CMP_DWORD CompBlockY  = bufferIn.GetBlockHeight();
    CMP_BYTE  Block_Width       = bufferIn.GetBlockWidth();
    CMP_BYTE  Block_Height      = bufferIn.GetBlockHeight();

    const CMP_DWORD dwBlocksX = ((bufferIn.GetWidth() + (CompBlockX - 1)) / CompBlockX);
    const CMP_DWORD dwBlocksY = ((bufferIn.GetHeight()+ (CompBlockY - 1)) / CompBlockY);
    const CMP_DWORD dwBlocksZ = 1;
    const CMP_DWORD dwBufferInDepth = 1;

    // Override the current input buffer Pitch size  (Since it will be set according to the Compressed Block Sizes
    // and not to the Compressed Codec data which is for ASTC 16 Bytes per block x Number of blocks per row
    bufferIn.SetPitch(16 * dwBlocksX);

    // Output data size Pitch
    CMP_DWORD  dwPitch = bufferOut.GetPitch();

    // Output Buffer
    BYTE *pDataOut      = bufferOut.GetData();

    const CMP_DWORD dwBlocksXY = dwBlocksX*dwBlocksY;

    for(CMP_DWORD cmpRowY = 0; cmpRowY < dwBlocksY; cmpRowY++)        // Compressed images row = height
    {
        for(CMP_DWORD cmpColX = 0; cmpColX < dwBlocksX; cmpColX++)    // Compressed images Col = width
        {
            union FBLOCKS
            {
                float decodedBlock[144][4];            // max 12x12 block size
                float destBlock[576];                  // max 12x12x4
            } DecData;
    
            union BBLOCKS
            {
                CMP_DWORD       compressedBlock[4];
                BYTE            out[16];
                BYTE            in[16];
            } CompData;

            bufferIn.ReadBlock(cmpColX*4, cmpRowY*4, CompData.compressedBlock, 4);

            // Encode to the appropriate location in the compressed image
            m_decoder->DecompressBlock(Block_Width, Block_Height, bitness, DecData.decodedBlock,CompData.in);
            
            // Now that we have a decoded block lets copy that data over to the target image buffer
            CMP_DWORD outCol = cmpColX*Block_Width;
            CMP_DWORD outRow = cmpRowY*Block_Height;
            CMP_DWORD outImgRow = outRow;
            CMP_DWORD outImgCol = outCol;

            for (int row = 0; row < Block_Height; row++)
            {
                CMP_DWORD  nextRowCol  = (outRow+row)*dwPitch + (outCol * 4);
                CMP_BYTE*  pData       = (CMP_BYTE*)(pDataOut + nextRowCol);
                if ((outImgRow + row) < imageHeight)
                {
                    outImgCol = outCol;
                    for (int col = 0; col < Block_Width; col++)
                    {
                        CMP_DWORD w = outImgCol + col;
                        if (w < imageWidth)
                        {
                            int index = row*Block_Width + col;
                            *pData++ = (CMP_BYTE)DecData.decodedBlock[index][BC_COMP_RED];
                            *pData++ = (CMP_BYTE)DecData.decodedBlock[index][BC_COMP_GREEN];
                            *pData++ = (CMP_BYTE)DecData.decodedBlock[index][BC_COMP_BLUE];
                            *pData++ = (CMP_BYTE)DecData.decodedBlock[index][BC_COMP_ALPHA];
                        }
                        else break;
                    }
                }
            }
        }

        if (pFeedbackProc)
        {
            float fProgress = 100.f * (cmpRowY * dwBlocksX) / dwBlocksXY;
            if (pFeedbackProc(fProgress, pUser1, pUser2))
            {
                return CE_Aborted;
            }
        }
    }

    return CE_OK;
}
CodecError CCodec_ASTC::Compress(CCodecBuffer& bufferIn, CCodecBuffer& bufferOut, Codec_Feedback_Proc pFeedbackProc, CMP_DWORD_PTR pUser1, CMP_DWORD_PTR pUser2)
{
    m_AbortRequested = false;

    int xsize = bufferIn.GetWidth();
    int ysize = bufferIn.GetHeight();
    int zsize = 1; //todo: add depth to support 3d textures
    m_xdim = bufferOut.GetBlockWidth();
    m_ydim = bufferOut.GetBlockHeight();
    m_zdim = 1;

    CodecError err = InitializeASTCLibrary();
    if (err != CE_OK) return err;

#ifdef ASTC_COMPDEBUGGER
    CompViewerClient    g_CompClient;
    if (g_CompClient.connect())
    {
        #ifdef USE_DBGTRACE
            DbgTrace(("-------> Remote Server Connected"));
        #endif
    }
#endif


#ifdef USE_DBGTRACE
    DbgTrace(("IN : BufferType %d ChannelCount %d ChannelDepth %d", bufferIn.GetBufferType(), bufferIn.GetChannelCount(), bufferIn.GetChannelDepth()));
    DbgTrace(("   : Height %d Width %d Pitch %d isFloat %d", bufferIn.GetHeight(), bufferIn.GetWidth(), bufferIn.GetWidth(), bufferIn.IsFloat()));

    DbgTrace(("OUT: BufferType %d ChannelCount %d ChannelDepth %d", bufferOut.GetBufferType(), bufferOut.GetChannelCount(), bufferOut.GetChannelDepth()));
    DbgTrace(("   : Height %d Width %d Pitch %d isFloat %d", bufferOut.GetHeight(), bufferOut.GetWidth(), bufferOut.GetWidth(), bufferOut.IsFloat()));
#endif;


    int bitness = 0; //todo: replace astc_codec_image with bufferIn and rewrite fetch_imageblock()
    switch (bufferIn.GetBufferType())
    {
    case CBT_RGBA8888:
    case CBT_BGRA8888:
    case CBT_ARGB8888:
    case CBT_RGB888:
    case CBT_RG8:
    case CBT_R8:
        bitness = 8;
        break;
    case CBT_RGBA2101010:
        break;
    case CBT_RGBA16:
    case CBT_RG16:
    case CBT_R16:
        break;
    case CBT_RGBA32:
    case CBT_RG32:
    case CBT_R32:
        break;
    case CBT_RGBA16F:
    case CBT_RG16F:
    case CBT_R16F:
        break;
    case CBT_RGBA32F:
    case CBT_RG32F:
    case CBT_R32F:
        break;
    default:
        break;
    }

    if (bitness != 8)
        assert("Unsupported type of input buffer");

    astc_codec_image_cpu *input_image = allocate_image_cpu(bitness, xsize, ysize, zsize, 0);

    if (!input_image)
        assert("Unable to allocate image buffer");

    // Loop through the original input image and setup compression threads for each 
    // block to encode  we will load the buffer to pass to ASTC code as 8 bit 4x4 blocks
    // the fill in source image. ASTC code will then use the adaptive sizes for process on the input
    BYTE *pData = bufferIn.GetData();
    int ii = 0;
    for (int y = 0; y < ysize; y++) {
        for (int x = 0; x < xsize; x++) {
            input_image->imagedata8[0][y][4*x      ] = pData[ii];      // Red
            ii++;
            input_image->imagedata8[0][y][4 * x + 1] = pData[ii];      // Green
            ii++;
            input_image->imagedata8[0][y][4 * x + 2] = pData[ii];      // Blue
            ii++;
            input_image->imagedata8[0][y][4 * x + 3] = pData[ii];      // Alpha
            ii++;
        }
    }

    m_NumEncodingThreads = min(m_NumThreads, (decltype(m_NumThreads))MAX_ASTC_THREADS);
    if (m_NumEncodingThreads == 0) m_NumEncodingThreads = 1;

// Common ARM and AMD Code
    CodecError result = CE_OK;
    int xdim = m_xdim;
    int ydim = m_ydim;
    int zdim = m_zdim;
    uint8_t *bufferOutput = bufferOut.GetData();

    // Common ARM and Compressonator Code
    int x, y, z, i;
    int xblocks = (xsize + xdim - 1) / xdim;
    int yblocks = (ysize + ydim - 1) / ydim;
    int zblocks = (zsize + zdim - 1) / zdim;
    float TotalBlocks = (float) (yblocks * xblocks);
    int processingBlock = 0;

    for (z = 0; z < zblocks; z++)
    {
        for (y = 0; y < yblocks; y++)
        {
            for (x = 0; x < xblocks; x++)
            {
                int offset = ((z * yblocks + y) * xblocks + x) * 16;
                uint8_t *bp = bufferOutput + offset;
                EncodeASTCBlock((astc_codec_image *)input_image, bp, xdim, ydim, zdim, x * xdim, y * ydim, z * zdim);
                processingBlock++;
            }

            if (pFeedbackProc)
            {
                float fProgress = 100.f * ((float)(processingBlock) / TotalBlocks);
                if (pFeedbackProc(fProgress, pUser1, pUser2))
                {
                    result = CE_Aborted;
                    break;
                }
            }

        }
    }

    CodecError EncodeResult = FinishASTCEncoding();

    if (result != CE_Aborted)
        result = EncodeResult;

    destroy_image_cpu(input_image);

#ifdef ASTC_COMPDEBUGGER
    g_CompClient.disconnect();
#endif

    return result;
}