Example #1
/** Documented at declaration */
void
gpujpeg_opengl_texture_unmap(struct gpujpeg_opengl_texture* texture)
{
    // Unmap pbo
    cudaGraphicsUnmapResources(1, &texture->texture_pbo_resource, 0);
    gpujpeg_cuda_check_error("Encoder unmap texture PBO resource");

#ifdef GPUJPEG_USE_OPENGL
    if ( texture->texture_type == GPUJPEG_OPENGL_TEXTURE_WRITE ) {
        assert(texture->texture_pbo_type == GL_PIXEL_UNPACK_BUFFER);

        glBindTexture(GL_TEXTURE_2D, texture->texture_id);
        glBindBuffer(GL_PIXEL_UNPACK_BUFFER, texture->texture_pbo_id);

        glTexImage2D(GL_TEXTURE_2D, 0, GL_RGB, texture->texture_width, texture->texture_height, 0, GL_RGB, GL_UNSIGNED_BYTE, NULL);

        glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
        glBindTexture(GL_TEXTURE_2D, 0);
        glFinish();
    }
#else
    GPUJPEG_EXIT_MISSING_OPENGL();
#endif

    // Detach OpenGL context by callback
    if ( texture->texture_callback_detach_opengl != NULL )
        texture->texture_callback_detach_opengl(texture->texture_callback_param);
}
Example #2
/** Documented at declaration */
struct gpujpeg_decoder*
gpujpeg_decoder_create()
{    
    struct gpujpeg_decoder* decoder = malloc(sizeof(struct gpujpeg_decoder));
    if ( decoder == NULL )
        return NULL;
        
    // Get coder
    struct gpujpeg_coder* coder = &decoder->coder;
    
    // Set parameters
    memset(decoder, 0, sizeof(struct gpujpeg_decoder));
    gpujpeg_set_default_parameters(&coder->param);
    gpujpeg_image_set_default_parameters(&coder->param_image);
    coder->param_image.comp_count = 0;
    coder->param_image.width = 0;
    coder->param_image.height = 0;
    coder->param.restart_interval = 0;
    
    int result = 1;
    
    // Create reader
    decoder->reader = gpujpeg_reader_create();
    if ( decoder->reader == NULL )
        result = 0;
    
    // Allocate quantization tables in device memory
    for ( int comp_type = 0; comp_type < GPUJPEG_COMPONENT_TYPE_COUNT; comp_type++ ) {
        if ( cudaSuccess != cudaMalloc((void**)&decoder->table_quantization[comp_type].d_table, 64 * sizeof(uint16_t)) ) 
            result = 0;
    }
    // Allocate huffman tables in device memory
    for ( int comp_type = 0; comp_type < GPUJPEG_COMPONENT_TYPE_COUNT; comp_type++ ) {
        for ( int huff_type = 0; huff_type < GPUJPEG_HUFFMAN_TYPE_COUNT; huff_type++ ) {
            if ( cudaSuccess != cudaMalloc((void**)&decoder->d_table_huffman[comp_type][huff_type], sizeof(struct gpujpeg_table_huffman_decoder)) )
                result = 0;
        }
    }
    gpujpeg_cuda_check_error("Decoder table allocation");
    
    // Init Huffman decoder
    if ( gpujpeg_huffman_gpu_decoder_init() != 0 )
        result = 0;
    
    if ( result == 0 ) {
        gpujpeg_decoder_destroy(decoder);
        return NULL;
    }
    
    // Timers
    GPUJPEG_CUSTOM_TIMER_CREATE(decoder->def);
    GPUJPEG_CUSTOM_TIMER_CREATE(decoder->in_gpu);

    return decoder;
}
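
A minimal usage sketch (hedged; it relies only on functions that appear in these examples, including the gpujpeg_decoder_destroy() already called on the failure path above). Note that the constructor allocates device memory, so a CUDA device must be selected first:

    if ( gpujpeg_init_device(0, 0) != 0 )
        return -1;

    struct gpujpeg_decoder* decoder = gpujpeg_decoder_create();
    if ( decoder == NULL ) {
        fprintf(stderr, "Failed to create decoder\n");
        return -1;
    }

    // ... decode images here ...

    gpujpeg_decoder_destroy(decoder);
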
Example #3
/** Documented at declaration */
uint8_t*
gpujpeg_opengl_texture_map(struct gpujpeg_opengl_texture* texture, int* data_size)
{
    assert(texture->texture_pbo_resource != NULL);
    assert((texture->texture_callback_attach_opengl == NULL && texture->texture_callback_detach_opengl == NULL) ||
           (texture->texture_callback_attach_opengl != NULL && texture->texture_callback_detach_opengl != NULL));

    // Attach OpenGL context by callback
    if ( texture->texture_callback_attach_opengl != NULL )
        texture->texture_callback_attach_opengl(texture->texture_callback_param);

    uint8_t* d_data = NULL;

#ifdef GPUJPEG_USE_OPENGL
    if ( texture->texture_type == GPUJPEG_OPENGL_TEXTURE_READ ) {
        assert(texture->texture_pbo_type == GL_PIXEL_PACK_BUFFER);

        glBindTexture(GL_TEXTURE_2D, texture->texture_id);
        glBindBuffer(GL_PIXEL_PACK_BUFFER, texture->texture_pbo_id);

        glGetTexImage(GL_TEXTURE_2D, 0, GL_RGB, GL_UNSIGNED_BYTE, 0);

        glBindBuffer(GL_PIXEL_PACK_BUFFER, 0);
        glBindTexture(GL_TEXTURE_2D, 0);
    }
#else
    GPUJPEG_EXIT_MISSING_OPENGL();
#endif

    // Map the pixel buffer object into CUDA
    cudaGraphicsMapResources(1, &texture->texture_pbo_resource, 0);
    gpujpeg_cuda_check_error("Encoder map texture PBO resource");

    // Get device data pointer to pixel buffer object data
    size_t d_data_size;
    cudaGraphicsResourceGetMappedPointer((void **)&d_data, &d_data_size, texture->texture_pbo_resource);
    gpujpeg_cuda_check_error("Encoder get device pointer for texture PBO resource");
    if ( data_size != NULL )
        *data_size = (int)d_data_size;

    return d_data;
}
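
The map/unmap pair is intended to bracket any CUDA work on the texture's pixel buffer object. A hedged read-path sketch (host_buffer is an illustrative, caller-provided buffer, not part of the library):

    int data_size = 0;
    uint8_t* d_rgb = gpujpeg_opengl_texture_map(texture, &data_size);
    if ( d_rgb != NULL ) {
        // d_rgb is a device pointer to the texture's RGB data
        cudaMemcpy(host_buffer, d_rgb, data_size, cudaMemcpyDeviceToHost);
    }
    gpujpeg_opengl_texture_unmap(texture);
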
Example #4
/** Documented at declaration */
struct gpujpeg_opengl_texture*
gpujpeg_opengl_texture_register(int texture_id, enum gpujpeg_opengl_texture_type texture_type)
{
    struct gpujpeg_opengl_texture* texture = NULL;
    cudaMallocHost((void**)&texture, sizeof(struct gpujpeg_opengl_texture));
    assert(texture != NULL);

    texture->texture_id = texture_id;
    texture->texture_type = texture_type;
    texture->texture_width = 0;
    texture->texture_height = 0;
    texture->texture_pbo_id = 0;
    texture->texture_pbo_type = 0;
    texture->texture_pbo_resource = 0;
    texture->texture_callback_param = NULL;
    texture->texture_callback_attach_opengl = NULL;
    texture->texture_callback_detach_opengl = NULL;

#ifdef GPUJPEG_USE_OPENGL
    glBindTexture(GL_TEXTURE_2D, texture->texture_id);
    glGetTexLevelParameteriv(GL_TEXTURE_2D, 0, GL_TEXTURE_WIDTH, &texture->texture_width);
    glGetTexLevelParameteriv(GL_TEXTURE_2D, 0, GL_TEXTURE_HEIGHT, &texture->texture_height);
    glBindTexture(GL_TEXTURE_2D, 0);
    assert(texture->texture_width != 0 && texture->texture_height != 0);

    // Select PBO type
    if ( texture->texture_type == GPUJPEG_OPENGL_TEXTURE_READ ) {
        texture->texture_pbo_type = GL_PIXEL_PACK_BUFFER;
    } else if ( texture->texture_type == GPUJPEG_OPENGL_TEXTURE_WRITE ) {
        texture->texture_pbo_type = GL_PIXEL_UNPACK_BUFFER;
    } else {
        assert(0);
    }

    // Create PBO
    glGenBuffers(1, &texture->texture_pbo_id);
    glBindBuffer(texture->texture_pbo_type, texture->texture_pbo_id);
    glBufferData(texture->texture_pbo_type, texture->texture_width * texture->texture_height * 3 * sizeof(uint8_t), NULL, GL_DYNAMIC_DRAW);
    glBindBuffer(texture->texture_pbo_type, 0);

    // Create CUDA PBO Resource
    cudaGraphicsGLRegisterBuffer(&texture->texture_pbo_resource, texture->texture_pbo_id, cudaGraphicsMapFlagsNone);
    gpujpeg_cuda_check_error("Register OpenGL buffer");
#else
    GPUJPEG_EXIT_MISSING_OPENGL();
#endif

    return texture;
}
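
Registration sketch for an existing texture. Assumptions: texture_id names a valid GL_TEXTURE_2D, an OpenGL context is current, and the library provides a matching gpujpeg_opengl_texture_unregister() counterpart:

    // texture_id: an existing GL_TEXTURE_2D object created by the application
    struct gpujpeg_opengl_texture* texture =
        gpujpeg_opengl_texture_register(texture_id, GPUJPEG_OPENGL_TEXTURE_READ);

    // ... map, use, unmap as in the previous example ...

    gpujpeg_opengl_texture_unregister(texture);  // assumed cleanup counterpart
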
Example #5
/** Documented at declaration */
struct gpujpeg_devices_info
gpujpeg_get_devices_info()
{
    struct gpujpeg_devices_info devices_info;

    cudaGetDeviceCount(&devices_info.device_count);
    gpujpeg_cuda_check_error("Cannot get number of CUDA devices", exit(-1));

    if ( devices_info.device_count > GPUJPEG_MAX_DEVICE_COUNT ) {
        fprintf(stderr, "[GPUJPEG] [Warning] There are available more CUDA devices (%d) than maximum count (%d).\n",
                devices_info.device_count, GPUJPEG_MAX_DEVICE_COUNT);
        fprintf(stderr, "[GPUJPEG] [Warning] Using maximum count (%d).\n", GPUJPEG_MAX_DEVICE_COUNT);
        devices_info.device_count = GPUJPEG_MAX_DEVICE_COUNT;
    }

    for ( int device_id = 0; device_id < devices_info.device_count; device_id++ ) {
        struct cudaDeviceProp device_properties;
        cudaGetDeviceProperties(&device_properties, device_id);

        struct gpujpeg_device_info* device_info = &devices_info.device[device_id];

        device_info->id = device_id;
        strncpy(device_info->name, device_properties.name, 255);
        device_info->name[255] = '\0'; // strncpy does not NUL-terminate long sources (name is assumed to hold 256 bytes)
        device_info->cc_major = device_properties.major;
        device_info->cc_minor = device_properties.minor;
        device_info->global_memory = device_properties.totalGlobalMem;
        device_info->constant_memory = device_properties.totalConstMem;
        device_info->shared_memory = device_properties.sharedMemPerBlock;
        device_info->register_count = device_properties.regsPerBlock;
#if CUDART_VERSION >= 2000
        device_info->multiprocessor_count = device_properties.multiProcessorCount;
#endif
    }

    return devices_info;
}
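
A sketch of consuming the returned structure, using only the fields filled in above (the cast assumes global_memory is an integer byte count):

    struct gpujpeg_devices_info info = gpujpeg_get_devices_info();
    for ( int i = 0; i < info.device_count; i++ ) {
        printf("Device #%d: %s (c.c. %d.%d, %zu MB global memory)\n",
               info.device[i].id, info.device[i].name,
               info.device[i].cc_major, info.device[i].cc_minor,
               (size_t)info.device[i].global_memory / (1024 * 1024));
    }
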
Example #6
/** Documented at declaration */
int
gpujpeg_coder_init(struct gpujpeg_coder* coder)
{
    int result = 1;

    // Get info about the device
    struct cudaDeviceProp device_properties;
    int device_idx;
    cudaGetDevice(&device_idx);
    cudaGetDeviceProperties(&device_properties, device_idx);
    gpujpeg_cuda_check_error("Device info getting");
    coder->cuda_cc_major = device_properties.major;
    coder->cuda_cc_minor = device_properties.minor;

    coder->preprocessor = NULL;

    // Allocate color components
    cudaMallocHost((void**)&coder->component, coder->param_image.comp_count * sizeof(struct gpujpeg_component));
    if ( coder->component == NULL )
        result = 0;
    // Allocate color components in device memory
    if ( cudaSuccess != cudaMalloc((void**)&coder->d_component, coder->param_image.comp_count * sizeof(struct gpujpeg_component)) )
        result = 0;
    gpujpeg_cuda_check_error("Coder color component allocation");

    // Initialize sampling factors and compute maximum sampling factor to coder->sampling_factor
    coder->sampling_factor.horizontal = 0;
    coder->sampling_factor.vertical = 0;
    for ( int comp = 0; comp < coder->param_image.comp_count; comp++ ) {
        assert(coder->param.sampling_factor[comp].horizontal >= 1 && coder->param.sampling_factor[comp].horizontal <= 15);
        assert(coder->param.sampling_factor[comp].vertical >= 1 && coder->param.sampling_factor[comp].vertical <= 15);
        coder->component[comp].sampling_factor = coder->param.sampling_factor[comp];
        if ( coder->component[comp].sampling_factor.horizontal > coder->sampling_factor.horizontal )
            coder->sampling_factor.horizontal = coder->component[comp].sampling_factor.horizontal;
        if ( coder->component[comp].sampling_factor.vertical > coder->sampling_factor.vertical )
            coder->sampling_factor.vertical = coder->component[comp].sampling_factor.vertical;
    }

    // Calculate data size
    coder->data_raw_size = gpujpeg_image_calculate_size(&coder->param_image);
    coder->data_size = 0;

    // Initialize color components
    for ( int comp = 0; comp < coder->param_image.comp_count; comp++ ) {
        // Get component
        struct gpujpeg_component* component = &coder->component[comp];

        // Set type
        component->type = (comp == 0) ? GPUJPEG_COMPONENT_LUMINANCE : GPUJPEG_COMPONENT_CHROMINANCE;

        // Set proper color component sizes in pixels based on sampling factors
        int samp_factor_h = component->sampling_factor.horizontal;
        int samp_factor_v = component->sampling_factor.vertical;
        component->width = (coder->param_image.width * samp_factor_h) / coder->sampling_factor.horizontal;
        component->height = (coder->param_image.height * samp_factor_v) / coder->sampling_factor.vertical;

        // Compute component MCU size
        component->mcu_size_x = GPUJPEG_BLOCK_SIZE;
        component->mcu_size_y = GPUJPEG_BLOCK_SIZE;
        if ( coder->param.interleaved == 1 ) {
            component->mcu_compressed_size = GPUJPEG_MAX_BLOCK_COMPRESSED_SIZE * samp_factor_h * samp_factor_v;
            component->mcu_size_x *= samp_factor_h;
            component->mcu_size_y *= samp_factor_v;
        } else {
            component->mcu_compressed_size = GPUJPEG_MAX_BLOCK_COMPRESSED_SIZE;
        }
        component->mcu_size = component->mcu_size_x * component->mcu_size_y;

        // Compute allocated data size
        component->data_width = gpujpeg_div_and_round_up(component->width, component->mcu_size_x) * component->mcu_size_x;
        component->data_height = gpujpeg_div_and_round_up(component->height, component->mcu_size_y) * component->mcu_size_y;
        component->data_size = component->data_width * component->data_height;
        // Increase total data size
        coder->data_size += component->data_size;

        // Compute component MCU count
        component->mcu_count_x = gpujpeg_div_and_round_up(component->data_width, component->mcu_size_x);
        component->mcu_count_y = gpujpeg_div_and_round_up(component->data_height, component->mcu_size_y);
        component->mcu_count = component->mcu_count_x * component->mcu_count_y;

        // Compute MCU count per segment
        component->segment_mcu_count = coder->param.restart_interval;
        if ( component->segment_mcu_count == 0 ) {
            // If the restart interval is disabled, the segment MCU count equals the component MCU count
            component->segment_mcu_count = component->mcu_count;
        }

        // Calculate segment count
        component->segment_count = gpujpeg_div_and_round_up(component->mcu_count, component->segment_mcu_count);

        //printf("Subsampling %dx%d, Resolution %d, %d, mcu size %d, mcu count %d\n",
        //    coder->param.sampling_factor[comp].horizontal, coder->param.sampling_factor[comp].vertical,
        //    component->data_width, component->data_height,
        //    component->mcu_compressed_size, component->mcu_count
        //);
    }

    // Maximum component data size for allocated buffers
    coder->data_width = gpujpeg_div_and_round_up(coder->param_image.width, GPUJPEG_BLOCK_SIZE) * GPUJPEG_BLOCK_SIZE;
    coder->data_height = gpujpeg_div_and_round_up(coder->param_image.height, GPUJPEG_BLOCK_SIZE) * GPUJPEG_BLOCK_SIZE;

    // Compute MCU size, MCU count, segment count and compressed data allocation size
    coder->mcu_count = 0;
    coder->mcu_size = 0;
    coder->mcu_compressed_size = 0;
    coder->segment_count = 0;
    coder->data_compressed_size = 0;
    if ( coder->param.interleaved == 1 ) {
        assert(coder->param_image.comp_count > 0);
        coder->mcu_count = coder->component[0].mcu_count;
        coder->segment_count = coder->component[0].segment_count;
        coder->segment_mcu_count = coder->component[0].segment_mcu_count;
        for ( int comp = 0; comp < coder->param_image.comp_count; comp++ ) {
            struct gpujpeg_component* component = &coder->component[comp];
            assert(coder->mcu_count == component->mcu_count);
            assert(coder->segment_mcu_count == component->segment_mcu_count);
            coder->mcu_size += component->mcu_size;
            coder->mcu_compressed_size += component->mcu_compressed_size;
        }
    } else {
        assert(coder->param_image.comp_count > 0);
        coder->mcu_size = coder->component[0].mcu_size;
        coder->mcu_compressed_size = coder->component[0].mcu_compressed_size;
        coder->segment_mcu_count = 0;
        for ( int comp = 0; comp < coder->param_image.comp_count; comp++ ) {
            struct gpujpeg_component* component = &coder->component[comp];
            assert(coder->mcu_size == component->mcu_size);
            assert(coder->mcu_compressed_size == component->mcu_compressed_size);
            coder->mcu_count += component->mcu_count;
            coder->segment_count += component->segment_count;
        }
    }
    //printf("mcu size %d -> %d, mcu count %d, segment mcu count %d\n", coder->mcu_size, coder->mcu_compressed_size, coder->mcu_count, coder->segment_mcu_count);

    // Allocate segments
    cudaMallocHost((void**)&coder->segment, coder->segment_count * sizeof(struct gpujpeg_segment));
    if ( coder->segment == NULL )
        result = 0;
    // Allocate segments in device memory
    if ( cudaSuccess != cudaMalloc((void**)&coder->d_segment, coder->segment_count * sizeof(struct gpujpeg_segment)) )
        result = 0;
    gpujpeg_cuda_check_error("Coder segment allocation");

    // Prepare segments
    if ( result == 1 ) {
        // While preparing segments compute input size and compressed size
        int data_index = 0;
        int data_compressed_index = 0;

        // Prepare segments based on (non-)interleaved mode
        if ( coder->param.interleaved == 1 ) {
            // Prepare segments for encoding (only one scan for all color components)
            int mcu_index = 0;
            for ( int index = 0; index < coder->segment_count; index++ ) {
                // Prepare segment MCU count
                int mcu_count = coder->segment_mcu_count;
                if ( (mcu_index + mcu_count) >= coder->mcu_count )
                    mcu_count = coder->mcu_count - mcu_index;
                // Set parameters for segment
                coder->segment[index].scan_index = 0;
                coder->segment[index].scan_segment_index = index;
                coder->segment[index].mcu_count = mcu_count;
                coder->segment[index].data_compressed_index = data_compressed_index;
                coder->segment[index].data_temp_index = data_compressed_index;
                coder->segment[index].data_compressed_size = 0;
                // Increase parameters for next segment
                data_index += mcu_count * coder->mcu_size;
                data_compressed_index += SEGMENT_ALIGN(mcu_count * coder->mcu_compressed_size);
                mcu_index += mcu_count;
            }
        } else {
            // Prepare segments for encoding (one scan for each color component)
            int index = 0;
            for ( int comp = 0; comp < coder->param_image.comp_count; comp++ ) {
                // Get component
                struct gpujpeg_component* component = &coder->component[comp];
                // Prepare component segments
                int mcu_index = 0;
                for ( int segment = 0; segment < component->segment_count; segment++ ) {
                    // Prepare segment MCU count
                    int mcu_count = component->segment_mcu_count;
                    if ( (mcu_index + mcu_count) >= component->mcu_count )
                        mcu_count = component->mcu_count - mcu_index;
                    // Set parameters for segment
                    coder->segment[index].scan_index = comp;
                    coder->segment[index].scan_segment_index = segment;
                    coder->segment[index].mcu_count = mcu_count;
                    coder->segment[index].data_compressed_index = data_compressed_index;
                    coder->segment[index].data_temp_index = data_compressed_index;
                    coder->segment[index].data_compressed_size = 0;
                    // Increase parameters for next segment
                    data_index += mcu_count * component->mcu_size;
                    data_compressed_index += SEGMENT_ALIGN(mcu_count * component->mcu_compressed_size);
                    mcu_index += mcu_count;
                    index++;
                }
            }
        }

        // Check data size
        //printf("%d == %d\n", coder->data_size, data_index);
        assert(coder->data_size == data_index);

        // Set compressed size
        coder->data_compressed_size = data_compressed_index;
    }
    //printf("Compressed size %d (segments %d)\n", coder->data_compressed_size, coder->segment_count);

    // Print allocation info
    if ( coder->param.verbose ) {
        int structures_size = 0;
        structures_size += coder->segment_count * sizeof(struct gpujpeg_segment);
        structures_size += coder->param_image.comp_count * sizeof(struct gpujpeg_component);
        int total_size = 0;
        total_size += structures_size;
        total_size += coder->data_raw_size;
        total_size += coder->data_size;
        total_size += coder->data_size * 2;
        total_size += coder->data_compressed_size;  // for Huffman coding output
        total_size += coder->data_compressed_size;  // for Huffman coding temp buffer

        printf("\nAllocation Info:\n");
        printf("    Segment Count:            %d\n", coder->segment_count);
        printf("    Allocated Data Size:      %dx%d\n", coder->data_width, coder->data_height);
        printf("    Raw Buffer Size:          %0.1f MB\n", (double)coder->data_raw_size / (1024.0 * 1024.0));
        printf("    Preprocessor Buffer Size: %0.1f MB\n", (double)coder->data_size / (1024.0 * 1024.0));
        printf("    DCT Buffer Size:          %0.1f MB\n", (double)2 * coder->data_size / (1024.0 * 1024.0));
        printf("    Compressed Buffer Size:   %0.1f MB\n", (double)coder->data_compressed_size / (1024.0 * 1024.0));
        printf("    Huffman Temp buffer Size: %0.1f MB\n", (double)coder->data_compressed_size / (1024.0 * 1024.0));
        printf("    Structures Size:          %0.1f kB\n", (double)structures_size / (1024.0));
        printf("    Total GPU Memory Size:    %0.1f MB\n", (double)total_size / (1024.0 * 1024.0));
        printf("");
    }

    // Allocate data buffers for all color components
    if ( cudaSuccess != cudaMallocHost((void**)&coder->data_raw, coder->data_raw_size * sizeof(uint8_t)) )
        return -1;
    if ( cudaSuccess != cudaMalloc((void**)&coder->d_data_raw, coder->data_raw_size * sizeof(uint8_t)) )
        result = 0;
    if ( cudaSuccess != cudaMalloc((void**)&coder->d_data, coder->data_size * sizeof(uint8_t)) )
        result = 0;
    if ( cudaSuccess != cudaMallocHost((void**)&coder->data_quantized, coder->data_size * sizeof(int16_t)) )
        result = 0;
    if ( cudaSuccess != cudaMalloc((void**)&coder->d_data_quantized, coder->data_size * sizeof(int16_t)) )
        result = 0;
    gpujpeg_cuda_check_error("Coder data allocation");

    // Set data buffer to color components
    uint8_t* d_comp_data = coder->d_data;
    int16_t* d_comp_data_quantized = coder->d_data_quantized;
    int16_t* comp_data_quantized = coder->data_quantized;
    unsigned int data_quantized_index = 0;
    for ( int comp = 0; comp < coder->param_image.comp_count; comp++ ) {
        struct gpujpeg_component* component = &coder->component[comp];
        component->d_data = d_comp_data;
        component->d_data_quantized = d_comp_data_quantized;
        component->data_quantized_index = data_quantized_index;
        component->data_quantized = comp_data_quantized;
        d_comp_data += component->data_width * component->data_height;
        d_comp_data_quantized += component->data_width * component->data_height;
        comp_data_quantized += component->data_width * component->data_height;
        data_quantized_index += component->data_width * component->data_height;
    }

    // Copy components to device memory
    if ( cudaSuccess != cudaMemcpy(coder->d_component, coder->component, coder->param_image.comp_count * sizeof(struct gpujpeg_component), cudaMemcpyHostToDevice) )
        result = 0;
    gpujpeg_cuda_check_error("Coder component copy");

    // Allocate compressed data
    int max_compressed_data_size = coder->data_compressed_size;
    max_compressed_data_size += GPUJPEG_BLOCK_SIZE * GPUJPEG_BLOCK_SIZE;
    //max_compressed_data_size *= 2;
    if ( cudaSuccess != cudaMallocHost((void**)&coder->data_compressed, max_compressed_data_size * sizeof(uint8_t)) )
        result = 0;
    if ( cudaSuccess != cudaMalloc((void**)&coder->d_data_compressed, max_compressed_data_size * sizeof(uint8_t)) )
        result = 0;
    gpujpeg_cuda_check_error("Coder data compressed allocation");

    // Allocate Huffman coder temporary buffer
    if ( cudaSuccess != cudaMalloc((void**)&coder->d_temp_huffman, max_compressed_data_size * sizeof(uint8_t)) )
        result = 0;
    gpujpeg_cuda_check_error("Huffman temp buffer allocation");

    // Initialize block lists in host memory
    coder->block_count = 0;
    for ( int comp = 0; comp < coder->param_image.comp_count; comp++ )
        coder->block_count += (coder->component[comp].data_width * coder->component[comp].data_height) / (8 * 8);
    if ( cudaSuccess != cudaMallocHost((void**)&coder->block_list, coder->block_count * sizeof(*coder->block_list)) )
        result = 0;
    if ( cudaSuccess != cudaMalloc((void**)&coder->d_block_list, coder->block_count * sizeof(*coder->d_block_list)) )
        result = 0;
    if ( result == 0 )
        return -1;
    int block_idx = 0;
    int comp_count = 1;
    if ( coder->param.interleaved == 1 )
        comp_count = coder->param_image.comp_count;
    assert(comp_count >= 1 && comp_count <= GPUJPEG_MAX_COMPONENT_COUNT);
    for( int segment_idx = 0; segment_idx < coder->segment_count; segment_idx++ ) {
        struct gpujpeg_segment* const segment = &coder->segment[segment_idx];
        segment->block_index_list_begin = block_idx;

        // Non-interleaving mode
        if ( comp_count == 1 ) {
            // Inspect MCUs in segment
            for ( int mcu_index = 0; mcu_index < segment->mcu_count; mcu_index++ ) {
                // Component for the scan
                struct gpujpeg_component* component = &coder->component[segment->scan_index];

                // Offset of component data for MCU
                uint64_t data_index = component->data_quantized_index + (segment->scan_segment_index * component->segment_mcu_count + mcu_index) * component->mcu_size;
                uint64_t component_type = component->type == GPUJPEG_COMPONENT_LUMINANCE ? 0x00 : 0x80;
                uint64_t dc_index = segment->scan_index;
                coder->block_list[block_idx++] = dc_index | component_type | (data_index << 8);
            }
        }
        // Interleaving mode
        else {
            // Encode MCUs in segment
            for ( int mcu_index = 0; mcu_index < segment->mcu_count; mcu_index++ ) {
                //assert(segment->scan_index == 0);
                for ( int comp = 0; comp < comp_count; comp++ ) {
                    struct gpujpeg_component* component = &coder->component[comp];

                    // Prepare mcu indexes
                    int mcu_index_x = (segment->scan_segment_index * component->segment_mcu_count + mcu_index) % component->mcu_count_x;
                    int mcu_index_y = (segment->scan_segment_index * component->segment_mcu_count + mcu_index) / component->mcu_count_x;
                    // Compute base data index
                    int data_index_base = component->data_quantized_index + mcu_index_y * (component->mcu_size * component->mcu_count_x) + mcu_index_x * (component->mcu_size_x * GPUJPEG_BLOCK_SIZE);

                    // For all vertical 8x8 blocks
                    for ( int y = 0; y < component->sampling_factor.vertical; y++ ) {
                        // Compute base row data index
                        int data_index_row = data_index_base + y * (component->mcu_count_x * component->mcu_size_x * GPUJPEG_BLOCK_SIZE);
                        // For all horizontal 8x8 blocks
                        for ( int x = 0; x < component->sampling_factor.horizontal; x++ ) {
                            // Compute 8x8 block data index
                            uint64_t data_index = data_index_row + x * GPUJPEG_BLOCK_SIZE * GPUJPEG_BLOCK_SIZE;
                            uint64_t component_type = component->type == GPUJPEG_COMPONENT_LUMINANCE ? 0x00 : 0x80;
                            uint64_t dc_index = comp;
                            coder->block_list[block_idx++] = dc_index | component_type | (data_index << 8);
                        }
                    }
                }
            }
        }
        segment->block_count = block_idx - segment->block_index_list_begin;
    }
    assert(block_idx == coder->block_count);

    // Copy block lists to device memory
    if ( cudaSuccess != cudaMemcpy(coder->d_block_list, coder->block_list, coder->block_count * sizeof(*coder->d_block_list), cudaMemcpyHostToDevice) )
        result = 0;

    // Copy segments to device memory
    if ( cudaSuccess != cudaMemcpy(coder->d_segment, coder->segment, coder->segment_count * sizeof(struct gpujpeg_segment), cudaMemcpyHostToDevice) )
        result = 0;

    return (result == 1) ? 0 : -1;
}
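
Each block-list entry built above packs three fields into one uint64_t: the low 7 bits carry the DC predictor (scan/component) index, bit 7 (0x80) flags a chrominance component, and the remaining bits carry the offset of the 8x8 block within the quantized data. A small helper that reverses the packing (illustrative; the kernels presumably unpack inline):

    static void unpack_block_entry(uint64_t entry, unsigned* dc_index,
                                   int* is_chrominance, uint64_t* data_index)
    {
        *dc_index       = (unsigned)(entry & 0x7F); // scan/component index
        *is_chrominance = (entry & 0x80) != 0;      // GPUJPEG_COMPONENT_CHROMINANCE flag
        *data_index     = entry >> 8;               // 8x8 block offset in quantized data
    }
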
Example #7
/** Documented at declaration */
int
gpujpeg_init_device(int device_id, int flags)
{
    int dev_count;
    cudaGetDeviceCount(&dev_count);
    if ( dev_count == 0 ) {
        fprintf(stderr, "[GPUJPEG] [Error] No CUDA enabled device\n");
        return -1;
    }

    if ( device_id < 0 || device_id >= dev_count ) {
        fprintf(stderr, "[GPUJPEG] [Error] Selected device %d is out of bound. Devices on your system are in range %d - %d\n",
                device_id, 0, dev_count - 1);
        return -1;
    }

    struct cudaDeviceProp devProp;
    if ( cudaSuccess != cudaGetDeviceProperties(&devProp, device_id) ) {
        fprintf(stderr,
                "[GPUJPEG] [Error] Can't get CUDA device properties!\n"
                "[GPUJPEG] [Error] Do you have proper driver for CUDA installed?\n"
               );
        return -1;
    }

    if ( devProp.major < 1 ) {
        fprintf(stderr, "[GPUJPEG] [Error] Device %d does not support CUDA\n", device_id);
        return -1;
    }

    if ( flags & GPUJPEG_OPENGL_INTEROPERABILITY ) {
        cudaGLSetGLDevice(device_id);
        gpujpeg_cuda_check_error("Enabling OpenGL interoperability");
    }

    if ( flags & GPUJPEG_VERBOSE ) {
        int cuda_driver_version = 0;
        cudaDriverGetVersion(&cuda_driver_version);
        printf("CUDA driver version:   %d.%d\n", cuda_driver_version / 1000, (cuda_driver_version % 100) / 10);

        int cuda_runtime_version = 0;
        cudaRuntimeGetVersion(&cuda_runtime_version);
        printf("CUDA runtime version:  %d.%d\n", cuda_runtime_version / 1000, (cuda_runtime_version % 100) / 10);

        printf("Using Device #%d:       %s (c.c. %d.%d)\n", device_id, devProp.name, devProp.major, devProp.minor);
    }

    cudaSetDevice(device_id);
    gpujpeg_cuda_check_error("Set CUDA device");

    // Test by simple copying that the device is ready
    uint8_t data[] = {8};
    uint8_t* d_data = NULL;
    cudaMalloc((void**)&d_data, 1);
    cudaMemcpy(d_data, data, 1, cudaMemcpyHostToDevice);
    cudaFree(d_data);
    cudaError_t error = cudaGetLastError();
    if ( cudaSuccess != error ) {
        fprintf(stderr, "[GPUJPEG] [Error] Failed to initialize CUDA device.\n");
        if ( flags & GPUJPEG_OPENGL_INTEROPERABILITY )
            fprintf(stderr, "[GPUJPEG] [Info]  OpenGL interoperability is used, is OpenGL context available?\n");
        return -1;
    }

    return 0;
}
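
Typical call, made once before creating encoders or decoders (flags may be 0 or combine the GPUJPEG_VERBOSE and GPUJPEG_OPENGL_INTEROPERABILITY bits tested above):

    if ( gpujpeg_init_device(0, GPUJPEG_VERBOSE) != 0 ) {
        fprintf(stderr, "CUDA device initialization failed\n");
        return -1;
    }
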
Example #8
/** Documented at declaration */
int
gpujpeg_decoder_decode(struct gpujpeg_decoder* decoder, uint8_t* image, int image_size, struct gpujpeg_decoder_output* output)
{
    // Get coder
    struct gpujpeg_coder* coder = &decoder->coder;
    
    // Reset durations
    coder->duration_memory_to = 0.0;
    coder->duration_memory_from = 0.0;
    coder->duration_memory_map = 0.0;
    coder->duration_memory_unmap = 0.0;
    coder->duration_preprocessor = 0.0;
    coder->duration_dct_quantization = 0.0;
    coder->duration_huffman_coder = 0.0;
    coder->duration_stream = 0.0;
    coder->duration_in_gpu = 0.0;

    GPUJPEG_CUSTOM_TIMER_START(decoder->def);
    
    // Read JPEG image data
    if ( gpujpeg_reader_read_image(decoder, image, image_size) != 0 ) {
        fprintf(stderr, "[GPUJPEG] [Error] Decoder failed when decoding image data!\n");
        return -1;
    }
    
    GPUJPEG_CUSTOM_TIMER_STOP(decoder->def);
    coder->duration_stream = GPUJPEG_CUSTOM_TIMER_DURATION(decoder->def);
    GPUJPEG_CUSTOM_TIMER_START(decoder->def);
    
    // Perform huffman decoding on CPU (when restart interval is not set)
    if ( coder->param.restart_interval == 0 ) {
        if ( gpujpeg_huffman_cpu_decoder_decode(decoder) != 0 ) {
            fprintf(stderr, "[GPUJPEG] [Error] Huffman decoder failed!\n");
            return -1;
        }
        GPUJPEG_CUSTOM_TIMER_STOP(decoder->def);
        coder->duration_huffman_coder = GPUJPEG_CUSTOM_TIMER_DURATION(decoder->def);
        GPUJPEG_CUSTOM_TIMER_START(decoder->def);

        // Copy quantized data to device memory from cpu memory
        cudaMemcpy(coder->d_data_quantized, coder->data_quantized, coder->data_size * sizeof(int16_t), cudaMemcpyHostToDevice);

        GPUJPEG_CUSTOM_TIMER_STOP(decoder->def);
        coder->duration_memory_to = GPUJPEG_CUSTOM_TIMER_DURATION(decoder->def);
        GPUJPEG_CUSTOM_TIMER_START(decoder->def);

        GPUJPEG_CUSTOM_TIMER_START(decoder->in_gpu);
    }
    // Perform huffman decoding on GPU (when restart interval is set)
    else {
        // Reset huffman output
        cudaMemset(coder->d_data_quantized, 0, coder->data_size * sizeof(int16_t));
        
        // Copy scan data to device memory
        cudaMemcpy(coder->d_data_compressed, coder->data_compressed, decoder->data_compressed_size * sizeof(uint8_t), cudaMemcpyHostToDevice);
        gpujpeg_cuda_check_error("Decoder copy compressed data");
        
        // Copy segments to device memory
        cudaMemcpy(coder->d_segment, coder->segment, decoder->segment_count * sizeof(struct gpujpeg_segment), cudaMemcpyHostToDevice);
        gpujpeg_cuda_check_error("Decoder copy compressed data");
        
        GPUJPEG_CUSTOM_TIMER_STOP(decoder->def);
        coder->duration_memory_to = GPUJPEG_CUSTOM_TIMER_DURATION(decoder->def);
        GPUJPEG_CUSTOM_TIMER_START(decoder->def);

        GPUJPEG_CUSTOM_TIMER_START(decoder->in_gpu);

        // Perform huffman decoding
        if ( gpujpeg_huffman_gpu_decoder_decode(decoder) != 0 ) {
            fprintf(stderr, "[GPUJPEG] [Error] Huffman decoder on GPU failed!\n");
            return -1;
        }

        GPUJPEG_CUSTOM_TIMER_STOP(decoder->def);
        coder->duration_huffman_coder = GPUJPEG_CUSTOM_TIMER_DURATION(decoder->def);
        GPUJPEG_CUSTOM_TIMER_START(decoder->def);
    }
    
    // Perform IDCT and dequantization (own CUDA implementation)
    gpujpeg_idct_gpu(decoder);
    
    GPUJPEG_CUSTOM_TIMER_STOP(decoder->def);
    coder->duration_dct_quantization = GPUJPEG_CUSTOM_TIMER_DURATION(decoder->def);
    GPUJPEG_CUSTOM_TIMER_START(decoder->def);
    
    // Preprocessing
    if ( gpujpeg_preprocessor_decode(&decoder->coder) != 0 )
        return -1;

    GPUJPEG_CUSTOM_TIMER_STOP(decoder->in_gpu);
    coder->duration_in_gpu = GPUJPEG_CUSTOM_TIMER_DURATION(decoder->in_gpu);

    GPUJPEG_CUSTOM_TIMER_STOP(decoder->def);
    coder->duration_preprocessor = GPUJPEG_CUSTOM_TIMER_DURATION(decoder->def);
    
    // Set decompressed image size
    output->data_size = coder->data_raw_size * sizeof(uint8_t);
    
    // Set decompressed image
    if ( output->type == GPUJPEG_DECODER_OUTPUT_INTERNAL_BUFFER ) {
        GPUJPEG_CUSTOM_TIMER_START(decoder->def);

        // Copy decompressed image to host memory
        cudaMemcpy(coder->data_raw, coder->d_data_raw, coder->data_raw_size * sizeof(uint8_t), cudaMemcpyDeviceToHost);
        
        GPUJPEG_CUSTOM_TIMER_STOP(decoder->def);
        coder->duration_memory_from = GPUJPEG_CUSTOM_TIMER_DURATION(decoder->def);

        // Set output to internal buffer
        output->data = coder->data_raw;
    } else if ( output->type == GPUJPEG_DECODER_OUTPUT_CUSTOM_BUFFER ) {
        GPUJPEG_CUSTOM_TIMER_START(decoder->def);
        
        assert(output->data != NULL);

        // Copy decompressed image to host memory
        cudaMemcpy(output->data, coder->d_data_raw, coder->data_raw_size * sizeof(uint8_t), cudaMemcpyDeviceToHost);
        
        GPUJPEG_CUSTOM_TIMER_STOP(decoder->def);
        coder->duration_memory_from = GPUJPEG_CUSTOM_TIMER_DURATION(decoder->def);
    } else if ( output->type == GPUJPEG_DECODER_OUTPUT_OPENGL_TEXTURE ) {
        GPUJPEG_CUSTOM_TIMER_START(decoder->def);

        // Map OpenGL texture
        int data_size = 0;
        uint8_t* d_data = gpujpeg_opengl_texture_map(output->texture, &data_size);
        assert(data_size == coder->data_raw_size);

        GPUJPEG_CUSTOM_TIMER_STOP(decoder->def);
        coder->duration_memory_map = GPUJPEG_CUSTOM_TIMER_DURATION(decoder->def);

        GPUJPEG_CUSTOM_TIMER_START(decoder->def);
            
        // Copy decompressed image to texture pixel buffer object device data
        cudaMemcpy(d_data, coder->d_data_raw, coder->data_raw_size * sizeof(uint8_t), cudaMemcpyDeviceToDevice);

        GPUJPEG_CUSTOM_TIMER_STOP(decoder->def);
        coder->duration_memory_from = GPUJPEG_CUSTOM_TIMER_DURATION(decoder->def);

        GPUJPEG_CUSTOM_TIMER_START(decoder->def);
            
        // Unmap OpenGL texture
        gpujpeg_opengl_texture_unmap(output->texture);

        GPUJPEG_CUSTOM_TIMER_STOP(decoder->def);
        coder->duration_memory_unmap = GPUJPEG_CUSTOM_TIMER_DURATION(decoder->def);
    } else {
        // Unknown output type
        assert(0);
    }

    return 0;
}
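
A hedged end-to-end sketch; gpujpeg_decoder_output_set_default() is assumed from the library's public header and is taken to select the internal-buffer output handled in the first branch above:

    struct gpujpeg_decoder_output output;
    gpujpeg_decoder_output_set_default(&output);  // assumed: selects internal buffer output

    // jpeg_data/jpeg_size: the compressed JPEG stream, loaded by the caller
    if ( gpujpeg_decoder_decode(decoder, jpeg_data, jpeg_size, &output) != 0 ) {
        fprintf(stderr, "Decoding failed\n");
        return -1;
    }
    // output.data now holds the raw decoded image (output.data_size bytes)
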
Example #9
/** Documented at declaration */
int
gpujpeg_decoder_decode(struct gpujpeg_decoder* decoder, uint8_t* image, int image_size, struct gpujpeg_decoder_output* output)
{
    // Get coder
    struct gpujpeg_coder* coder = &decoder->coder;
    
    // Reset durations
    coder->duration_memory_to = 0.0;
    coder->duration_memory_from = 0.0;
    coder->duration_memory_map = 0.0;
    coder->duration_memory_unmap = 0.0;
    coder->duration_preprocessor = 0.0;
    coder->duration_dct_quantization = 0.0;
    coder->duration_huffman_coder = 0.0;
    coder->duration_stream = 0.0;
    coder->duration_in_gpu = 0.0;

    GPUJPEG_CUSTOM_TIMER_START(decoder->def);
    
    // Read JPEG image data
    if ( gpujpeg_reader_read_image(decoder, image, image_size) != 0 ) {
        fprintf(stderr, "[GPUJPEG] [Error] Decoder failed when decoding image data!\n");
        return -1;
    }
    
    GPUJPEG_CUSTOM_TIMER_STOP(decoder->def);
    coder->duration_stream = GPUJPEG_CUSTOM_TIMER_DURATION(decoder->def);
    GPUJPEG_CUSTOM_TIMER_START(decoder->def);
    
    // Perform huffman decoding on CPU (when restart interval is not set)
    if ( coder->param.restart_interval == 0 ) {
        if ( gpujpeg_huffman_cpu_decoder_decode(decoder) != 0 ) {
            fprintf(stderr, "[GPUJPEG] [Error] Huffman decoder failed!\n");
            return -1;
        }
        GPUJPEG_CUSTOM_TIMER_STOP(decoder->def);
        coder->duration_huffman_coder = GPUJPEG_CUSTOM_TIMER_DURATION(decoder->def);
        GPUJPEG_CUSTOM_TIMER_START(decoder->def);

        // Copy quantized data to device memory from cpu memory
        cudaMemcpy(coder->d_data_quantized, coder->data_quantized, coder->data_size * sizeof(int16_t), cudaMemcpyHostToDevice);

        GPUJPEG_CUSTOM_TIMER_STOP(decoder->def);
        coder->duration_memory_to = GPUJPEG_CUSTOM_TIMER_DURATION(decoder->def);
        GPUJPEG_CUSTOM_TIMER_START(decoder->def);

        GPUJPEG_CUSTOM_TIMER_START(decoder->in_gpu);
    }
    // Perform huffman decoding on GPU (when restart interval is set)
    else {
    #ifdef GPUJPEG_HUFFMAN_CODER_TABLES_IN_CONSTANT
        // Copy huffman tables to constant memory
        for ( int comp_type = 0; comp_type < GPUJPEG_COMPONENT_TYPE_COUNT; comp_type++ ) {
            for ( int huff_type = 0; huff_type < GPUJPEG_HUFFMAN_TYPE_COUNT; huff_type++ ) {
                int index = (comp_type * GPUJPEG_HUFFMAN_TYPE_COUNT + huff_type);
                cudaMemcpyToSymbol(
                    (char*)gpujpeg_decoder_table_huffman, 
                    &decoder->table_huffman[comp_type][huff_type], 
                    sizeof(struct gpujpeg_table_huffman_decoder), 
                    index * sizeof(struct gpujpeg_table_huffman_decoder), 
                    cudaMemcpyHostToDevice
                );
            }
        }
        gpujpeg_cuda_check_error("Decoder copy huffman tables to constant memory");
    #endif
    
        // Reset huffman output
        cudaMemset(coder->d_data_quantized, 0, coder->data_size * sizeof(int16_t));
        
        // Copy scan data to device memory
        cudaMemcpy(coder->d_data_compressed, coder->data_compressed, decoder->data_compressed_size * sizeof(uint8_t), cudaMemcpyHostToDevice);
        gpujpeg_cuda_check_error("Decoder copy compressed data");
        
        // Copy segments to device memory
        cudaMemcpy(coder->d_segment, coder->segment, decoder->segment_count * sizeof(struct gpujpeg_segment), cudaMemcpyHostToDevice);
        gpujpeg_cuda_check_error("Decoder copy compressed data");
        
        GPUJPEG_CUSTOM_TIMER_STOP(decoder->def);
        coder->duration_memory_to = GPUJPEG_CUSTOM_TIMER_DURATION(decoder->def);
        GPUJPEG_CUSTOM_TIMER_START(decoder->def);

        GPUJPEG_CUSTOM_TIMER_START(decoder->in_gpu);

        // Perform huffman decoding
        if ( gpujpeg_huffman_gpu_decoder_decode(decoder) != 0 ) {
            fprintf(stderr, "[GPUJPEG] [Error] Huffman decoder on GPU failed!\n");
            return -1;
        }

        GPUJPEG_CUSTOM_TIMER_STOP(decoder->def);
        coder->duration_huffman_coder = GPUJPEG_CUSTOM_TIMER_DURATION(decoder->def);
        GPUJPEG_CUSTOM_TIMER_START(decoder->def);
    }
    
#ifdef GPUJPEG_DCT_FROM_NPP
    // Perform IDCT and dequantization (implementation from NPP)
    for ( int comp = 0; comp < coder->param_image.comp_count; comp++ ) {
        // Get component
        struct gpujpeg_component* component = &coder->component[comp];
                
        // Determine table type
        enum gpujpeg_component_type type = (comp == 0) ? GPUJPEG_COMPONENT_LUMINANCE : GPUJPEG_COMPONENT_CHROMINANCE;
        
        //gpujpeg_component_print16(component, component->d_data_quantized);
        
        cudaMemset(component->d_data, 0, component->data_size * sizeof(uint8_t));
        
        // Perform inverse DCT
        NppiSize inv_roi;
        inv_roi.width = component->data_width * GPUJPEG_BLOCK_SIZE;
        inv_roi.height = component->data_height / GPUJPEG_BLOCK_SIZE;
        assert(GPUJPEG_BLOCK_SIZE == 8);
        NppStatus status = nppiDCTQuantInv8x8LS_JPEG_16s8u_C1R(
            component->d_data_quantized, 
            component->data_width * GPUJPEG_BLOCK_SIZE * sizeof(int16_t), 
            component->d_data, 
            component->data_width * sizeof(uint8_t), 
            decoder->table_quantization[type].d_table, 
            inv_roi
        );
        if ( status != 0 ) {
            fprintf(stderr, "[GPUJPEG] [Error] Inverse DCT failed (error %d)!\n", status);
        }
            
        //gpujpeg_component_print8(component, component->d_data);
    }
#else
    // Perform IDCT and dequantization (own CUDA implementation)
    gpujpeg_idct_gpu(decoder);
    
    // Perform IDCT and dequantization (own CPU implementation)
    // gpujpeg_idct_cpu(decoder);
#endif
    
    GPUJPEG_CUSTOM_TIMER_STOP(decoder->def);
    coder->duration_dct_quantization = GPUJPEG_CUSTOM_TIMER_DURATION(decoder->def);
    GPUJPEG_CUSTOM_TIMER_START(decoder->def);
    
    // Preprocessing
    if ( gpujpeg_preprocessor_decode(&decoder->coder) != 0 )
        return -1;

    GPUJPEG_CUSTOM_TIMER_STOP(decoder->in_gpu);
    coder->duration_in_gpu = GPUJPEG_CUSTOM_TIMER_DURATION(decoder->in_gpu);

    GPUJPEG_CUSTOM_TIMER_STOP(decoder->def);
    coder->duration_preprocessor = GPUJPEG_CUSTOM_TIMER_DURATION(decoder->def);
    
    // Set decompressed image size
    output->data_size = coder->data_raw_size * sizeof(uint8_t);
    
    // Set decompressed image
    if ( output->type == GPUJPEG_DECODER_OUTPUT_INTERNAL_BUFFER ) {
        GPUJPEG_CUSTOM_TIMER_START(decoder->def);

        // Copy decompressed image to host memory
        cudaMemcpy(coder->data_raw, coder->d_data_raw, coder->data_raw_size * sizeof(uint8_t), cudaMemcpyDeviceToHost);
        
        GPUJPEG_CUSTOM_TIMER_STOP(decoder->def);
        coder->duration_memory_from = GPUJPEG_CUSTOM_TIMER_DURATION(decoder->def);

        // Set output to internal buffer
        output->data = coder->data_raw;
    } else if ( output->type == GPUJPEG_DECODER_OUTPUT_CUSTOM_BUFFER ) {
        GPUJPEG_CUSTOM_TIMER_START(decoder->def);
        
        assert(output->data != NULL);

        // Copy decompressed image to host memory
        cudaMemcpy(output->data, coder->d_data_raw, coder->data_raw_size * sizeof(uint8_t), cudaMemcpyDeviceToHost);
        
        GPUJPEG_CUSTOM_TIMER_STOP(decoder->def);
        coder->duration_memory_from = GPUJPEG_CUSTOM_TIMER_DURATION(decoder->def);
    } else if ( output->type == GPUJPEG_DECODER_OUTPUT_OPENGL_TEXTURE ) {
        GPUJPEG_CUSTOM_TIMER_START(decoder->def);

        // Map OpenGL texture
        int data_size = 0;
        uint8_t* d_data = gpujpeg_opengl_texture_map(output->texture, &data_size);
        assert(data_size == coder->data_raw_size);

        GPUJPEG_CUSTOM_TIMER_STOP(decoder->def);
        coder->duration_memory_map = GPUJPEG_CUSTOM_TIMER_DURATION(decoder->def);

        GPUJPEG_CUSTOM_TIMER_START(decoder->def);
            
        // Copy decompressed image to texture pixel buffer object device data
        cudaMemcpy(d_data, coder->d_data_raw, coder->data_raw_size * sizeof(uint8_t), cudaMemcpyDeviceToDevice);

        GPUJPEG_CUSTOM_TIMER_STOP(decoder->def);
        coder->duration_memory_from = GPUJPEG_CUSTOM_TIMER_DURATION(decoder->def);

        GPUJPEG_CUSTOM_TIMER_START(decoder->def);
            
        // Unmap OpenGL texture
        gpujpeg_opengl_texture_unmap(output->texture);

        GPUJPEG_CUSTOM_TIMER_STOP(decoder->def);
        coder->duration_memory_unmap = GPUJPEG_CUSTOM_TIMER_DURATION(decoder->def);
    } else {
        // Unknown output type
        assert(0);
    }

    return 0;
}
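
Which Huffman path runs is decided by the restart interval carried in the stream: images encoded with a non-zero restart interval take the GPU branch above, everything else falls back to the CPU decoder. On the encoder side that is a single parameter (sketch, using only gpujpeg_set_default_parameters() and the parameter fields already seen in these examples):

    struct gpujpeg_parameters param;
    gpujpeg_set_default_parameters(&param);
    param.restart_interval = 16;  // non-zero: emit restart markers, enabling GPU Huffman decoding
    param.interleaved = 1;        // single interleaved scan, as handled in gpujpeg_coder_init()
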