/** Documented at declaration */
int
gpujpeg_encoder_encode(struct gpujpeg_encoder* encoder, struct gpujpeg_encoder_input* input, uint8_t** image_compressed, int* image_compressed_size)
{
    // Get coder
    struct gpujpeg_coder* coder = &encoder->coder;

    // Reset durations
    coder->duration_memory_to = 0.0;
    coder->duration_memory_from = 0.0;
    coder->duration_memory_map = 0.0;
    coder->duration_memory_unmap = 0.0;
    coder->duration_preprocessor = 0.0;
    coder->duration_dct_quantization = 0.0;
    coder->duration_huffman_coder = 0.0;
    coder->duration_stream = 0.0;
    coder->duration_in_gpu = 0.0;

    // Load input image
    if ( input->type == GPUJPEG_ENCODER_INPUT_IMAGE ) {
        GPUJPEG_CUSTOM_TIMER_START(encoder->def);

        // Copy image to device memory
        if ( cudaSuccess != cudaMemcpy(coder->d_data_raw, input->image, coder->data_raw_size * sizeof(uint8_t), cudaMemcpyHostToDevice) )
            return -1;

        GPUJPEG_CUSTOM_TIMER_STOP(encoder->def);
        coder->duration_memory_to = GPUJPEG_CUSTOM_TIMER_DURATION(encoder->def);
    } else if ( input->type == GPUJPEG_ENCODER_INPUT_OPENGL_TEXTURE ) {
        assert(input->texture != NULL);

        GPUJPEG_CUSTOM_TIMER_START(encoder->def);

        // Map texture to CUDA
        int data_size = 0;
        uint8_t* d_data = gpujpeg_opengl_texture_map(input->texture, &data_size);
        assert(data_size == (coder->data_raw_size));

        GPUJPEG_CUSTOM_TIMER_STOP(encoder->def);
        coder->duration_memory_map = GPUJPEG_CUSTOM_TIMER_DURATION(encoder->def);

        GPUJPEG_CUSTOM_TIMER_START(encoder->def);

        // Copy image data from texture pixel buffer object to device data
        cudaMemcpy(coder->d_data_raw, d_data, coder->data_raw_size * sizeof(uint8_t), cudaMemcpyDeviceToDevice);

        GPUJPEG_CUSTOM_TIMER_STOP(encoder->def);
        coder->duration_memory_to = GPUJPEG_CUSTOM_TIMER_DURATION(encoder->def);

        GPUJPEG_CUSTOM_TIMER_START(encoder->def);

        // Unmap texture from CUDA
        gpujpeg_opengl_texture_unmap(input->texture);

        GPUJPEG_CUSTOM_TIMER_STOP(encoder->def);
        coder->duration_memory_unmap = GPUJPEG_CUSTOM_TIMER_DURATION(encoder->def);
    } else if(input->type == GPUJPEG_ENCODER_INPUT_IMAGE_ON_GPU) {
        GPUJPEG_CUSTOM_TIMER_START(encoder->def);

        coder->d_data_raw = input->image;

        GPUJPEG_CUSTOM_TIMER_STOP(encoder->def);
        coder->duration_memory_to = GPUJPEG_CUSTOM_TIMER_DURATION(encoder->def);
    } else {
        // Unknown output type
        assert(0);
    }

    //gpujpeg_table_print(encoder->table[JPEG_COMPONENT_LUMINANCE]);
    //gpujpeg_table_print(encoder->table[JPEG_COMPONENT_CHROMINANCE]);

    GPUJPEG_CUSTOM_TIMER_START(encoder->in_gpu);
    GPUJPEG_CUSTOM_TIMER_START(encoder->def);

    // Preprocessing
    if ( gpujpeg_preprocessor_encode(&encoder->coder) != 0 )
        return -1;

    GPUJPEG_CUSTOM_TIMER_STOP(encoder->def);
    coder->duration_preprocessor = GPUJPEG_CUSTOM_TIMER_DURATION(encoder->def);
    GPUJPEG_CUSTOM_TIMER_START(encoder->def);

    // Perform DCT and quantization
    if ( gpujpeg_dct_gpu(encoder) != 0 )
        return -1;

    // If restart interval is 0 then the GPU processing is in the end (even huffman coder will be performed on CPU)
    if ( coder->param.restart_interval == 0 ) {
        GPUJPEG_CUSTOM_TIMER_STOP(encoder->in_gpu);
        coder->duration_in_gpu = GPUJPEG_CUSTOM_TIMER_DURATION(encoder->in_gpu);
    }

    // Initialize writer output buffer current position
    encoder->writer->buffer_current = encoder->writer->buffer;

    // Write header
    gpujpeg_writer_write_header(encoder);

    GPUJPEG_CUSTOM_TIMER_STOP(encoder->def);
    coder->duration_dct_quantization = GPUJPEG_CUSTOM_TIMER_DURATION(encoder->def);
    GPUJPEG_CUSTOM_TIMER_START(encoder->def);

    // Perform huffman coding on CPU (when restart interval is not set)
    if ( coder->param.restart_interval == 0 ) {
        // Copy quantized data from device memory to cpu memory
        cudaMemcpy(coder->data_quantized, coder->d_data_quantized, coder->data_size * sizeof(int16_t), cudaMemcpyDeviceToHost);

        GPUJPEG_CUSTOM_TIMER_STOP(encoder->def);
        coder->duration_memory_from = GPUJPEG_CUSTOM_TIMER_DURATION(encoder->def);
        GPUJPEG_CUSTOM_TIMER_START(encoder->def);

        // Perform huffman coding
        if ( gpujpeg_huffman_cpu_encoder_encode(encoder) != 0 ) {
            fprintf(stderr, "[GPUJPEG] [Error] Huffman encoder on CPU failed!\n");
            return -1;
        }

        GPUJPEG_CUSTOM_TIMER_STOP(encoder->def);
        coder->duration_huffman_coder = GPUJPEG_CUSTOM_TIMER_DURATION(encoder->def);
    }
    // Perform huffman coding on GPU (when restart interval is set)
    else {
        // Perform huffman coding
        unsigned int output_size;
        if ( gpujpeg_huffman_gpu_encoder_encode(encoder, &output_size) != 0 ) {
            fprintf(stderr, "[GPUJPEG] [Error] Huffman encoder on GPU failed!\n");
            return -1;
        }

        GPUJPEG_CUSTOM_TIMER_STOP(encoder->in_gpu);
        coder->duration_in_gpu = GPUJPEG_CUSTOM_TIMER_DURATION(encoder->in_gpu);

        GPUJPEG_CUSTOM_TIMER_STOP(encoder->def);
        coder->duration_huffman_coder = GPUJPEG_CUSTOM_TIMER_DURATION(encoder->def);
        GPUJPEG_CUSTOM_TIMER_START(encoder->def);

        // Copy compressed data from device memory to cpu memory
        if ( cudaSuccess != cudaMemcpy(coder->data_compressed, coder->d_data_compressed, output_size, cudaMemcpyDeviceToHost) != 0 )
            return -1;
        // Copy segments from device memory
        if ( cudaSuccess != cudaMemcpy(coder->segment, coder->d_segment, coder->segment_count * sizeof(struct gpujpeg_segment), cudaMemcpyDeviceToHost) )
            return -1;

        GPUJPEG_CUSTOM_TIMER_STOP(encoder->def);
        coder->duration_memory_from = GPUJPEG_CUSTOM_TIMER_DURATION(encoder->def);
        GPUJPEG_CUSTOM_TIMER_START(encoder->def);

        if ( coder->param.interleaved == 1 ) {
            // Write scan header (only one scan is written, that contains all color components data)
            gpujpeg_writer_write_scan_header(encoder, 0);

            // Write scan data
            for ( int segment_index = 0; segment_index < coder->segment_count; segment_index++ ) {
                struct gpujpeg_segment* segment = &coder->segment[segment_index];

                gpujpeg_writer_write_segment_info(encoder);

                // Copy compressed data to writer
                memcpy(
                    encoder->writer->buffer_current,
                    &coder->data_compressed[segment->data_compressed_index],
                    segment->data_compressed_size
                );
                encoder->writer->buffer_current += segment->data_compressed_size;
                //printf("Compressed data %d bytes\n", segment->data_compressed_size);
            }
            // Remove last restart marker in scan (is not needed)
            encoder->writer->buffer_current -= 2;

            gpujpeg_writer_write_segment_info(encoder);
        } else {
            // Write huffman coder results as one scan for each color component
            int segment_index = 0;
            for ( int comp = 0; comp < coder->param_image.comp_count; comp++ ) {
                // Write scan header
                gpujpeg_writer_write_scan_header(encoder, comp);
                // Write scan data
                for ( int index = 0; index < coder->component[comp].segment_count; index++ ) {
                    struct gpujpeg_segment* segment = &coder->segment[segment_index];

                    gpujpeg_writer_write_segment_info(encoder);

                    // Copy compressed data to writer
                    memcpy(
                        encoder->writer->buffer_current,
                        &coder->data_compressed[segment->data_compressed_index],
                        segment->data_compressed_size
                    );
                    encoder->writer->buffer_current += segment->data_compressed_size;
                    //printf("Compressed data %d bytes\n", segment->data_compressed_size);

                    segment_index++;
                }
                // Remove last restart marker in scan (is not needed)
                encoder->writer->buffer_current -= 2;

                gpujpeg_writer_write_segment_info(encoder);
            }
        }

        GPUJPEG_CUSTOM_TIMER_STOP(encoder->def);
        coder->duration_stream = GPUJPEG_CUSTOM_TIMER_DURATION(encoder->def);
    }
    gpujpeg_writer_emit_marker(encoder->writer, GPUJPEG_MARKER_EOI);

    // Set compressed image
    *image_compressed = encoder->writer->buffer;
    *image_compressed_size = encoder->writer->buffer_current - encoder->writer->buffer;

    return 0;
}
/** Documented at declaration */
int
gpujpeg_decoder_decode(struct gpujpeg_decoder* decoder, uint8_t* image, int image_size, struct gpujpeg_decoder_output* output)
{
    // Get coder
    struct gpujpeg_coder* coder = &decoder->coder;
    
    // Reset durations
    coder->duration_memory_to = 0.0;
    coder->duration_memory_from = 0.0;
    coder->duration_memory_map = 0.0;
    coder->duration_memory_unmap = 0.0;
    coder->duration_preprocessor = 0.0;
    coder->duration_dct_quantization = 0.0;
    coder->duration_huffman_coder = 0.0;
    coder->duration_stream = 0.0;
    coder->duration_in_gpu = 0.0;

    GPUJPEG_CUSTOM_TIMER_START(decoder->def);
    
    // Read JPEG image data
    if ( gpujpeg_reader_read_image(decoder, image, image_size) != 0 ) {
        fprintf(stderr, "[GPUJPEG] [Error] Decoder failed when decoding image data!\n");
        return -1;
    }
    
    GPUJPEG_CUSTOM_TIMER_STOP(decoder->def);
    coder->duration_stream = GPUJPEG_CUSTOM_TIMER_DURATION(decoder->def);
    GPUJPEG_CUSTOM_TIMER_START(decoder->def);
    
    // Perform huffman decoding on CPU (when restart interval is not set)
    if ( coder->param.restart_interval == 0 ) {
        if ( gpujpeg_huffman_cpu_decoder_decode(decoder) != 0 ) {
            fprintf(stderr, "[GPUJPEG] [Error] Huffman decoder failed!\n");
            return -1;
        }
        GPUJPEG_CUSTOM_TIMER_STOP(decoder->def);
        coder->duration_huffman_coder = GPUJPEG_CUSTOM_TIMER_DURATION(decoder->def);
        GPUJPEG_CUSTOM_TIMER_START(decoder->def);

        // Copy quantized data to device memory from cpu memory
        cudaMemcpy(coder->d_data_quantized, coder->data_quantized, coder->data_size * sizeof(int16_t), cudaMemcpyHostToDevice);

        GPUJPEG_CUSTOM_TIMER_STOP(decoder->def);
        coder->duration_memory_to = GPUJPEG_CUSTOM_TIMER_DURATION(decoder->def);
        GPUJPEG_CUSTOM_TIMER_START(decoder->def);

        GPUJPEG_CUSTOM_TIMER_START(decoder->in_gpu);
    }
    // Perform huffman decoding on GPU (when restart interval is set)
    else {
        // Reset huffman output
        cudaMemset(coder->d_data_quantized, 0, coder->data_size * sizeof(int16_t));
        
        // Copy scan data to device memory
        cudaMemcpy(coder->d_data_compressed, coder->data_compressed, decoder->data_compressed_size * sizeof(uint8_t), cudaMemcpyHostToDevice);
        gpujpeg_cuda_check_error("Decoder copy compressed data");
        
        // Copy segments to device memory
        cudaMemcpy(coder->d_segment, coder->segment, decoder->segment_count * sizeof(struct gpujpeg_segment), cudaMemcpyHostToDevice);
        gpujpeg_cuda_check_error("Decoder copy compressed data");
        
        // Zero output memory
        cudaMemset(coder->d_data_quantized, 0, coder->data_size * sizeof(int16_t));
        
        GPUJPEG_CUSTOM_TIMER_STOP(decoder->def);
        coder->duration_memory_to = GPUJPEG_CUSTOM_TIMER_DURATION(decoder->def);
        GPUJPEG_CUSTOM_TIMER_START(decoder->def);

        GPUJPEG_CUSTOM_TIMER_START(decoder->in_gpu);

        // Perform huffman decoding
        if ( gpujpeg_huffman_gpu_decoder_decode(decoder) != 0 ) {
            fprintf(stderr, "[GPUJPEG] [Error] Huffman decoder on GPU failed!\n");
            return -1;
        }

        GPUJPEG_CUSTOM_TIMER_STOP(decoder->def);
        coder->duration_huffman_coder = GPUJPEG_CUSTOM_TIMER_DURATION(decoder->def);
        GPUJPEG_CUSTOM_TIMER_START(decoder->def);
    }
    
    // Perform IDCT and dequantization (own CUDA implementation)
    gpujpeg_idct_gpu(decoder);
    
    GPUJPEG_CUSTOM_TIMER_STOP(decoder->def);
    coder->duration_dct_quantization = GPUJPEG_CUSTOM_TIMER_DURATION(decoder->def);
    GPUJPEG_CUSTOM_TIMER_START(decoder->def);
    
    // Preprocessing
    if ( gpujpeg_preprocessor_decode(&decoder->coder) != 0 )
        return -1;

    GPUJPEG_CUSTOM_TIMER_STOP(decoder->in_gpu);
    coder->duration_in_gpu = GPUJPEG_CUSTOM_TIMER_DURATION(decoder->in_gpu);

    GPUJPEG_CUSTOM_TIMER_STOP(decoder->def);
    coder->duration_preprocessor = GPUJPEG_CUSTOM_TIMER_DURATION(decoder->def);
    
    // Set decompressed image size
    output->data_size = coder->data_raw_size * sizeof(uint8_t);
    
    // Set decompressed image
    if ( output->type == GPUJPEG_DECODER_OUTPUT_INTERNAL_BUFFER ) {
        GPUJPEG_CUSTOM_TIMER_START(decoder->def);

        // Copy decompressed image to host memory
        cudaMemcpy(coder->data_raw, coder->d_data_raw, coder->data_raw_size * sizeof(uint8_t), cudaMemcpyDeviceToHost);
        
        GPUJPEG_CUSTOM_TIMER_STOP(decoder->def);
        coder->duration_memory_from = GPUJPEG_CUSTOM_TIMER_DURATION(decoder->def);

        // Set output to internal buffer
        output->data = coder->data_raw;
    } else if ( output->type == GPUJPEG_DECODER_OUTPUT_CUSTOM_BUFFER ) {
        GPUJPEG_CUSTOM_TIMER_START(decoder->def);
        
        assert(output->data != NULL);

        // Copy decompressed image to host memory
        cudaMemcpy(output->data, coder->d_data_raw, coder->data_raw_size * sizeof(uint8_t), cudaMemcpyDeviceToHost);
        
        GPUJPEG_CUSTOM_TIMER_STOP(decoder->def);
        coder->duration_memory_from = GPUJPEG_CUSTOM_TIMER_DURATION(decoder->def);
    } else if ( output->type == GPUJPEG_DECODER_OUTPUT_OPENGL_TEXTURE ) {
        GPUJPEG_CUSTOM_TIMER_START(decoder->def);

        // Map OpenGL texture
        int data_size = 0;
        uint8_t* d_data = gpujpeg_opengl_texture_map(output->texture, &data_size);
        assert(data_size == coder->data_raw_size);

        GPUJPEG_CUSTOM_TIMER_STOP(decoder->def);
        coder->duration_memory_map = GPUJPEG_CUSTOM_TIMER_DURATION(decoder->def);

        GPUJPEG_CUSTOM_TIMER_START(decoder->def);
            
        // Copy decompressed image to texture pixel buffer object device data
        cudaMemcpy(d_data, coder->d_data_raw, coder->data_raw_size * sizeof(uint8_t), cudaMemcpyDeviceToDevice);

        GPUJPEG_CUSTOM_TIMER_STOP(decoder->def);
        coder->duration_memory_from = GPUJPEG_CUSTOM_TIMER_DURATION(decoder->def);

        GPUJPEG_CUSTOM_TIMER_START(decoder->def);
            
        // Unmap OpenGL texture
        gpujpeg_opengl_texture_unmap(output->texture);

        GPUJPEG_CUSTOM_TIMER_STOP(decoder->def);
        coder->duration_memory_unmap = GPUJPEG_CUSTOM_TIMER_DURATION(decoder->def);
    } else {
        // Unknown output type
        assert(0);
    }

    return 0;
}
/** Documented at declaration */
int
gpujpeg_decoder_decode(struct gpujpeg_decoder* decoder, uint8_t* image, int image_size, struct gpujpeg_decoder_output* output)
{
    // Get coder
    struct gpujpeg_coder* coder = &decoder->coder;
    
    // Reset durations
    coder->duration_memory_to = 0.0;
    coder->duration_memory_from = 0.0;
    coder->duration_memory_map = 0.0;
    coder->duration_memory_unmap = 0.0;
    coder->duration_preprocessor = 0.0;
    coder->duration_dct_quantization = 0.0;
    coder->duration_huffman_coder = 0.0;
    coder->duration_stream = 0.0;
    coder->duration_in_gpu = 0.0;

    GPUJPEG_CUSTOM_TIMER_START(decoder->def);
    
    // Read JPEG image data
    if ( gpujpeg_reader_read_image(decoder, image, image_size) != 0 ) {
        fprintf(stderr, "[GPUJPEG] [Error] Decoder failed when decoding image data!\n");
        return -1;
    }
    
    GPUJPEG_CUSTOM_TIMER_STOP(decoder->def);
    coder->duration_stream = GPUJPEG_CUSTOM_TIMER_DURATION(decoder->def);
    GPUJPEG_CUSTOM_TIMER_START(decoder->def);
    
    // Perform huffman decoding on CPU (when restart interval is not set)
    if ( coder->param.restart_interval == 0 ) {
        if ( gpujpeg_huffman_cpu_decoder_decode(decoder) != 0 ) {
            fprintf(stderr, "[GPUJPEG] [Error] Huffman decoder failed!\n");
            return -1;
        }
        GPUJPEG_CUSTOM_TIMER_STOP(decoder->def);
        coder->duration_huffman_coder = GPUJPEG_CUSTOM_TIMER_DURATION(decoder->def);
        GPUJPEG_CUSTOM_TIMER_START(decoder->def);

        // Copy quantized data to device memory from cpu memory
        cudaMemcpy(coder->d_data_quantized, coder->data_quantized, coder->data_size * sizeof(int16_t), cudaMemcpyHostToDevice);

        GPUJPEG_CUSTOM_TIMER_STOP(decoder->def);
        coder->duration_memory_to = GPUJPEG_CUSTOM_TIMER_DURATION(decoder->def);
        GPUJPEG_CUSTOM_TIMER_START(decoder->def);

        GPUJPEG_CUSTOM_TIMER_START(decoder->in_gpu);
    }
    // Perform huffman decoding on GPU (when restart interval is set)
    else {
    #ifdef GPUJPEG_HUFFMAN_CODER_TABLES_IN_CONSTANT
        // Copy huffman tables to constant memory
        for ( int comp_type = 0; comp_type < GPUJPEG_COMPONENT_TYPE_COUNT; comp_type++ ) {
            for ( int huff_type = 0; huff_type < GPUJPEG_HUFFMAN_TYPE_COUNT; huff_type++ ) {
                int index = (comp_type * GPUJPEG_HUFFMAN_TYPE_COUNT + huff_type);
                cudaMemcpyToSymbol(
                    (char*)gpujpeg_decoder_table_huffman, 
                    &decoder->table_huffman[comp_type][huff_type], 
                    sizeof(struct gpujpeg_table_huffman_decoder), 
                    index * sizeof(struct gpujpeg_table_huffman_decoder), 
                    cudaMemcpyHostToDevice
                );
            }
        }
        gpujpeg_cuda_check_error("Decoder copy huffman tables to constant memory");
    #endif
    
        // Reset huffman output
        cudaMemset(coder->d_data_quantized, 0, coder->data_size * sizeof(int16_t));
        
        // Copy scan data to device memory
        cudaMemcpy(coder->d_data_compressed, coder->data_compressed, decoder->data_compressed_size * sizeof(uint8_t), cudaMemcpyHostToDevice);
        gpujpeg_cuda_check_error("Decoder copy compressed data");
        
        // Copy segments to device memory
        cudaMemcpy(coder->d_segment, coder->segment, decoder->segment_count * sizeof(struct gpujpeg_segment), cudaMemcpyHostToDevice);
        gpujpeg_cuda_check_error("Decoder copy compressed data");
        
        // Zero output memory
        cudaMemset(coder->d_data_quantized, 0, coder->data_size * sizeof(int16_t));
        
        GPUJPEG_CUSTOM_TIMER_STOP(decoder->def);
        coder->duration_memory_to = GPUJPEG_CUSTOM_TIMER_DURATION(decoder->def);
        GPUJPEG_CUSTOM_TIMER_START(decoder->def);

        GPUJPEG_CUSTOM_TIMER_START(decoder->in_gpu);

        // Perform huffman decoding
        if ( gpujpeg_huffman_gpu_decoder_decode(decoder) != 0 ) {
            fprintf(stderr, "[GPUJPEG] [Error] Huffman decoder on GPU failed!\n");
            return -1;
        }

        GPUJPEG_CUSTOM_TIMER_STOP(decoder->def);
        coder->duration_huffman_coder = GPUJPEG_CUSTOM_TIMER_DURATION(decoder->def);
        GPUJPEG_CUSTOM_TIMER_START(decoder->def);
    }
    
#ifdef GPUJPEG_DCT_FROM_NPP
    // Perform IDCT and dequantization (implementation from NPP)
    for ( int comp = 0; comp < coder->param_image.comp_count; comp++ ) {
        // Get component
        struct gpujpeg_component* component = &coder->component[comp];
                
        // Determine table type
        enum gpujpeg_component_type type = (comp == 0) ? GPUJPEG_COMPONENT_LUMINANCE : GPUJPEG_COMPONENT_CHROMINANCE;
        
        //gpujpeg_component_print16(component, component->d_data_quantized);
        
        cudaMemset(component->d_data, 0, component->data_size * sizeof(uint8_t));
        
        //Perform inverse DCT
        NppiSize inv_roi;
        inv_roi.width = component->data_width * GPUJPEG_BLOCK_SIZE;
        inv_roi.height = component->data_height / GPUJPEG_BLOCK_SIZE;
        assert(GPUJPEG_BLOCK_SIZE == 8);
        NppStatus status = nppiDCTQuantInv8x8LS_JPEG_16s8u_C1R(
            component->d_data_quantized, 
            component->data_width * GPUJPEG_BLOCK_SIZE * sizeof(int16_t), 
            component->d_data, 
            component->data_width * sizeof(uint8_t), 
            decoder->table_quantization[type].d_table, 
            inv_roi
        );
        if ( status != 0 ) {
            fprintf(stderr, "[GPUJPEG] [Error] Inverse DCT failed (error %d)!\n", status);
        }
            
        //gpujpeg_component_print8(component, component->d_data);
    }
#else
    // Perform IDCT and dequantization (own CUDA implementation)
    gpujpeg_idct_gpu(decoder);
    
    // Perform IDCT and dequantization (own CPU implementation)
    // gpujpeg_idct_cpu(decoder);
#endif
    
    GPUJPEG_CUSTOM_TIMER_STOP(decoder->def);
    coder->duration_dct_quantization = GPUJPEG_CUSTOM_TIMER_DURATION(decoder->def);
    GPUJPEG_CUSTOM_TIMER_START(decoder->def);
    
    // Preprocessing
    if ( gpujpeg_preprocessor_decode(&decoder->coder) != 0 )
        return -1;

    GPUJPEG_CUSTOM_TIMER_STOP(decoder->in_gpu);
    coder->duration_in_gpu = GPUJPEG_CUSTOM_TIMER_DURATION(decoder->in_gpu);

    GPUJPEG_CUSTOM_TIMER_STOP(decoder->def);
    coder->duration_preprocessor = GPUJPEG_CUSTOM_TIMER_DURATION(decoder->def);
    
    // Set decompressed image size
    output->data_size = coder->data_raw_size * sizeof(uint8_t);
    
    // Set decompressed image
    if ( output->type == GPUJPEG_DECODER_OUTPUT_INTERNAL_BUFFER ) {
        GPUJPEG_CUSTOM_TIMER_START(decoder->def);

        // Copy decompressed image to host memory
        cudaMemcpy(coder->data_raw, coder->d_data_raw, coder->data_raw_size * sizeof(uint8_t), cudaMemcpyDeviceToHost);
        
        GPUJPEG_CUSTOM_TIMER_STOP(decoder->def);
        coder->duration_memory_from = GPUJPEG_CUSTOM_TIMER_DURATION(decoder->def);

        // Set output to internal buffer
        output->data = coder->data_raw;
    } else if ( output->type == GPUJPEG_DECODER_OUTPUT_CUSTOM_BUFFER ) {
        GPUJPEG_CUSTOM_TIMER_START(decoder->def);
        
        assert(output->data != NULL);

        // Copy decompressed image to host memory
        cudaMemcpy(output->data, coder->d_data_raw, coder->data_raw_size * sizeof(uint8_t), cudaMemcpyDeviceToHost);
        
        GPUJPEG_CUSTOM_TIMER_STOP(decoder->def);
        coder->duration_memory_from = GPUJPEG_CUSTOM_TIMER_DURATION(decoder->def);
    } else if ( output->type == GPUJPEG_DECODER_OUTPUT_OPENGL_TEXTURE ) {
        GPUJPEG_CUSTOM_TIMER_START(decoder->def);

        // Map OpenGL texture
        int data_size = 0;
        uint8_t* d_data = gpujpeg_opengl_texture_map(output->texture, &data_size);
        assert(data_size == coder->data_raw_size);

        GPUJPEG_CUSTOM_TIMER_STOP(decoder->def);
        coder->duration_memory_map = GPUJPEG_CUSTOM_TIMER_DURATION(decoder->def);

        GPUJPEG_CUSTOM_TIMER_START(decoder->def);
            
        // Copy decompressed image to texture pixel buffer object device data
        cudaMemcpy(d_data, coder->d_data_raw, coder->data_raw_size * sizeof(uint8_t), cudaMemcpyDeviceToDevice);

        GPUJPEG_CUSTOM_TIMER_STOP(decoder->def);
        coder->duration_memory_from = GPUJPEG_CUSTOM_TIMER_DURATION(decoder->def);

        GPUJPEG_CUSTOM_TIMER_START(decoder->def);
            
        // Unmap OpenGL texture
        gpujpeg_opengl_texture_unmap(output->texture);

        GPUJPEG_CUSTOM_TIMER_STOP(decoder->def);
        coder->duration_memory_unmap = GPUJPEG_CUSTOM_TIMER_DURATION(decoder->def);
    } else {
        // Unknown output type
        assert(0);
    }

    return 0;
}