/** Documented at declaration */
void
gpujpeg_opengl_texture_unmap(struct gpujpeg_opengl_texture* texture)
{
    // Unmap PBO
    cudaGraphicsUnmapResources(1, &texture->texture_pbo_resource, 0);
    gpujpeg_cuda_check_error("Encoder unmap texture PBO resource");

#ifdef GPUJPEG_USE_OPENGL
    if ( texture->texture_type == GPUJPEG_OPENGL_TEXTURE_WRITE ) {
        assert(texture->texture_pbo_type == GL_PIXEL_UNPACK_BUFFER);

        // Copy pixel buffer object data into the texture
        glBindTexture(GL_TEXTURE_2D, texture->texture_id);
        glBindBuffer(GL_PIXEL_UNPACK_BUFFER, texture->texture_pbo_id);
        glTexImage2D(GL_TEXTURE_2D, 0, GL_RGB, texture->texture_width, texture->texture_height, 0, GL_RGB, GL_UNSIGNED_BYTE, NULL);
        glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
        glBindTexture(GL_TEXTURE_2D, 0);
        glFinish();
    }
#else
    GPUJPEG_EXIT_MISSING_OPENGL();
#endif

    // Detach OpenGL context by callback
    if ( texture->texture_callback_detach_opengl != NULL )
        texture->texture_callback_detach_opengl(texture->texture_callback_param);
}
/** Documented at declaration */
struct gpujpeg_decoder*
gpujpeg_decoder_create()
{
    struct gpujpeg_decoder* decoder = malloc(sizeof(struct gpujpeg_decoder));
    if ( decoder == NULL )
        return NULL;

    // Get coder
    struct gpujpeg_coder* coder = &decoder->coder;

    // Set parameters
    memset(decoder, 0, sizeof(struct gpujpeg_decoder));
    gpujpeg_set_default_parameters(&coder->param);
    gpujpeg_image_set_default_parameters(&coder->param_image);
    coder->param_image.comp_count = 0;
    coder->param_image.width = 0;
    coder->param_image.height = 0;
    coder->param.restart_interval = 0;

    int result = 1;

    // Create reader
    decoder->reader = gpujpeg_reader_create();
    if ( decoder->reader == NULL )
        result = 0;

    // Allocate quantization tables in device memory
    for ( int comp_type = 0; comp_type < GPUJPEG_COMPONENT_TYPE_COUNT; comp_type++ ) {
        if ( cudaSuccess != cudaMalloc((void**)&decoder->table_quantization[comp_type].d_table, 64 * sizeof(uint16_t)) )
            result = 0;
    }
    // Allocate Huffman tables in device memory
    for ( int comp_type = 0; comp_type < GPUJPEG_COMPONENT_TYPE_COUNT; comp_type++ ) {
        for ( int huff_type = 0; huff_type < GPUJPEG_HUFFMAN_TYPE_COUNT; huff_type++ ) {
            if ( cudaSuccess != cudaMalloc((void**)&decoder->d_table_huffman[comp_type][huff_type], sizeof(struct gpujpeg_table_huffman_decoder)) )
                result = 0;
        }
    }
    gpujpeg_cuda_check_error("Decoder table allocation");

    // Init Huffman decoder
    if ( gpujpeg_huffman_gpu_decoder_init() != 0 )
        result = 0;

    if ( result == 0 ) {
        gpujpeg_decoder_destroy(decoder);
        return NULL;
    }

    // Timers
    GPUJPEG_CUSTOM_TIMER_CREATE(decoder->def);
    GPUJPEG_CUSTOM_TIMER_CREATE(decoder->in_gpu);

    return decoder;
}
/** Documented at declaration */
uint8_t*
gpujpeg_opengl_texture_map(struct gpujpeg_opengl_texture* texture, int* data_size)
{
    assert(texture->texture_pbo_resource != NULL);
    assert((texture->texture_callback_attach_opengl == NULL && texture->texture_callback_detach_opengl == NULL) ||
           (texture->texture_callback_attach_opengl != NULL && texture->texture_callback_detach_opengl != NULL));

    // Attach OpenGL context by callback
    if ( texture->texture_callback_attach_opengl != NULL )
        texture->texture_callback_attach_opengl(texture->texture_callback_param);

    uint8_t* d_data = NULL;

#ifdef GPUJPEG_USE_OPENGL
    if ( texture->texture_type == GPUJPEG_OPENGL_TEXTURE_READ ) {
        assert(texture->texture_pbo_type == GL_PIXEL_PACK_BUFFER);

        // Read texture data into the pixel buffer object
        glBindTexture(GL_TEXTURE_2D, texture->texture_id);
        glBindBuffer(GL_PIXEL_PACK_BUFFER, texture->texture_pbo_id);
        glGetTexImage(GL_TEXTURE_2D, 0, GL_RGB, GL_UNSIGNED_BYTE, 0);
        glBindBuffer(GL_PIXEL_PACK_BUFFER, 0);
        glBindTexture(GL_TEXTURE_2D, 0);
    }
#else
    GPUJPEG_EXIT_MISSING_OPENGL();
#endif

    // Map pixel buffer object to CUDA
    cudaGraphicsMapResources(1, &texture->texture_pbo_resource, 0);
    gpujpeg_cuda_check_error("Encoder map texture PBO resource");

    // Get device data pointer to pixel buffer object data
    size_t d_data_size;
    cudaGraphicsResourceGetMappedPointer((void**)&d_data, &d_data_size, texture->texture_pbo_resource);
    gpujpeg_cuda_check_error("Encoder get device pointer for texture PBO resource");

    if ( data_size != NULL )
        *data_size = d_data_size;

    return d_data;
}
/** Documented at declaration */
struct gpujpeg_opengl_texture*
gpujpeg_opengl_texture_register(int texture_id, enum gpujpeg_opengl_texture_type texture_type)
{
    struct gpujpeg_opengl_texture* texture = NULL;
    cudaMallocHost((void**)&texture, sizeof(struct gpujpeg_opengl_texture));
    assert(texture != NULL);

    texture->texture_id = texture_id;
    texture->texture_type = texture_type;
    texture->texture_width = 0;
    texture->texture_height = 0;
    texture->texture_pbo_id = 0;
    texture->texture_pbo_type = 0;
    texture->texture_pbo_resource = 0;
    texture->texture_callback_param = NULL;
    texture->texture_callback_attach_opengl = NULL;
    texture->texture_callback_detach_opengl = NULL;

#ifdef GPUJPEG_USE_OPENGL
    glBindTexture(GL_TEXTURE_2D, texture->texture_id);
    glGetTexLevelParameteriv(GL_TEXTURE_2D, 0, GL_TEXTURE_WIDTH, &texture->texture_width);
    glGetTexLevelParameteriv(GL_TEXTURE_2D, 0, GL_TEXTURE_HEIGHT, &texture->texture_height);
    glBindTexture(GL_TEXTURE_2D, 0);
    assert(texture->texture_width != 0 && texture->texture_height != 0);

    // Select PBO type
    if ( texture->texture_type == GPUJPEG_OPENGL_TEXTURE_READ ) {
        texture->texture_pbo_type = GL_PIXEL_PACK_BUFFER;
    } else if ( texture->texture_type == GPUJPEG_OPENGL_TEXTURE_WRITE ) {
        texture->texture_pbo_type = GL_PIXEL_UNPACK_BUFFER;
    } else {
        assert(0);
    }

    // Create PBO
    glGenBuffers(1, &texture->texture_pbo_id);
    glBindBuffer(texture->texture_pbo_type, texture->texture_pbo_id);
    glBufferData(texture->texture_pbo_type, texture->texture_width * texture->texture_height * 3 * sizeof(uint8_t), NULL, GL_DYNAMIC_DRAW);
    glBindBuffer(texture->texture_pbo_type, 0);

    // Create CUDA PBO resource
    cudaGraphicsGLRegisterBuffer(&texture->texture_pbo_resource, texture->texture_pbo_id, cudaGraphicsMapFlagsNone);
    gpujpeg_cuda_check_error("Register OpenGL buffer");
#else
    GPUJPEG_EXIT_MISSING_OPENGL();
#endif

    return texture;
}
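
/*
 * Illustrative usage of the three texture helpers above (an added sketch, not
 * part of the original source; `gl_texture_id` is assumed to name a valid
 * GL_TEXTURE_2D in the current OpenGL context). Kept inside #if 0 so it does
 * not affect the build: register the texture once, then map/unmap around each
 * CUDA access.
 */
#if 0
static void
example_texture_roundtrip(int gl_texture_id)
{
    // Register an existing RGB texture for reading by CUDA
    struct gpujpeg_opengl_texture* texture =
        gpujpeg_opengl_texture_register(gl_texture_id, GPUJPEG_OPENGL_TEXTURE_READ);

    // Map: texture content is packed into the PBO and exposed as a device pointer
    int data_size = 0;
    uint8_t* d_data = gpujpeg_opengl_texture_map(texture, &data_size);

    // ... use d_data (data_size bytes of RGB data) from CUDA here ...

    // Unmap: return PBO ownership to OpenGL
    gpujpeg_opengl_texture_unmap(texture);
}
#endif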
/** Documented at declaration */
struct gpujpeg_devices_info
gpujpeg_get_devices_info()
{
    struct gpujpeg_devices_info devices_info;

    cudaGetDeviceCount(&devices_info.device_count);
    gpujpeg_cuda_check_error("Cannot get number of CUDA devices");

    if ( devices_info.device_count > GPUJPEG_MAX_DEVICE_COUNT ) {
        fprintf(stderr, "[GPUJPEG] [Warning] More CUDA devices are available (%d) than the maximum count (%d).\n",
            devices_info.device_count, GPUJPEG_MAX_DEVICE_COUNT);
        fprintf(stderr, "[GPUJPEG] [Warning] Using maximum count (%d).\n", GPUJPEG_MAX_DEVICE_COUNT);
        devices_info.device_count = GPUJPEG_MAX_DEVICE_COUNT;
    }

    for ( int device_id = 0; device_id < devices_info.device_count; device_id++ ) {
        struct cudaDeviceProp device_properties;
        cudaGetDeviceProperties(&device_properties, device_id);

        struct gpujpeg_device_info* device_info = &devices_info.device[device_id];
        device_info->id = device_id;
        strncpy(device_info->name, device_properties.name, 255);
        device_info->cc_major = device_properties.major;
        device_info->cc_minor = device_properties.minor;
        device_info->global_memory = device_properties.totalGlobalMem;
        device_info->constant_memory = device_properties.totalConstMem;
        device_info->shared_memory = device_properties.sharedMemPerBlock;
        device_info->register_count = device_properties.regsPerBlock;

#if CUDART_VERSION >= 2000
        device_info->multiprocessor_count = device_properties.multiProcessorCount;
#endif
    }

    return devices_info;
}
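
/*
 * Illustrative sketch (added, not in the original source): enumerating the
 * devices returned by gpujpeg_get_devices_info(). The %zu cast assumes
 * global_memory holds a byte count; kept inside #if 0 so it does not affect
 * the build.
 */
#if 0
static void
example_list_devices(void)
{
    struct gpujpeg_devices_info devices_info = gpujpeg_get_devices_info();
    for ( int i = 0; i < devices_info.device_count; i++ ) {
        struct gpujpeg_device_info* info = &devices_info.device[i];
        printf("Device #%d: %s (c.c. %d.%d, %zu bytes of global memory)\n",
            info->id, info->name, info->cc_major, info->cc_minor,
            (size_t)info->global_memory);
    }
}
#endif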
/** Documented at declaration */
int
gpujpeg_coder_init(struct gpujpeg_coder* coder)
{
    int result = 1;

    // Get info about the device
    struct cudaDeviceProp device_properties;
    int device_idx;
    cudaGetDevice(&device_idx);
    cudaGetDeviceProperties(&device_properties, device_idx);
    gpujpeg_cuda_check_error("Device info getting");
    coder->cuda_cc_major = device_properties.major;
    coder->cuda_cc_minor = device_properties.minor;

    coder->preprocessor = NULL;

    // Allocate color components
    cudaMallocHost((void**)&coder->component, coder->param_image.comp_count * sizeof(struct gpujpeg_component));
    if ( coder->component == NULL )
        result = 0;
    // Allocate color components in device memory
    if ( cudaSuccess != cudaMalloc((void**)&coder->d_component, coder->param_image.comp_count * sizeof(struct gpujpeg_component)) )
        result = 0;
    gpujpeg_cuda_check_error("Coder color component allocation");

    // Initialize sampling factors and compute maximum sampling factor to coder->sampling_factor
    coder->sampling_factor.horizontal = 0;
    coder->sampling_factor.vertical = 0;
    for ( int comp = 0; comp < coder->param_image.comp_count; comp++ ) {
        assert(coder->param.sampling_factor[comp].horizontal >= 1 && coder->param.sampling_factor[comp].horizontal <= 15);
        assert(coder->param.sampling_factor[comp].vertical >= 1 && coder->param.sampling_factor[comp].vertical <= 15);
        coder->component[comp].sampling_factor = coder->param.sampling_factor[comp];
        if ( coder->component[comp].sampling_factor.horizontal > coder->sampling_factor.horizontal )
            coder->sampling_factor.horizontal = coder->component[comp].sampling_factor.horizontal;
        if ( coder->component[comp].sampling_factor.vertical > coder->sampling_factor.vertical )
            coder->sampling_factor.vertical = coder->component[comp].sampling_factor.vertical;
    }

    // Calculate data size
    coder->data_raw_size = gpujpeg_image_calculate_size(&coder->param_image);
    coder->data_size = 0;

    // Initialize color components
    for ( int comp = 0; comp < coder->param_image.comp_count; comp++ ) {
        // Get component
        struct gpujpeg_component* component = &coder->component[comp];

        // Set type
        component->type = (comp == 0) ? GPUJPEG_COMPONENT_LUMINANCE : GPUJPEG_COMPONENT_CHROMINANCE;

        // Set proper color component sizes in pixels based on sampling factors
        int samp_factor_h = component->sampling_factor.horizontal;
        int samp_factor_v = component->sampling_factor.vertical;
        component->width = (coder->param_image.width * samp_factor_h) / coder->sampling_factor.horizontal;
        component->height = (coder->param_image.height * samp_factor_v) / coder->sampling_factor.vertical;

        // Compute component MCU size
        component->mcu_size_x = GPUJPEG_BLOCK_SIZE;
        component->mcu_size_y = GPUJPEG_BLOCK_SIZE;
        if ( coder->param.interleaved == 1 ) {
            component->mcu_compressed_size = GPUJPEG_MAX_BLOCK_COMPRESSED_SIZE * samp_factor_h * samp_factor_v;
            component->mcu_size_x *= samp_factor_h;
            component->mcu_size_y *= samp_factor_v;
        } else {
            component->mcu_compressed_size = GPUJPEG_MAX_BLOCK_COMPRESSED_SIZE;
        }
        component->mcu_size = component->mcu_size_x * component->mcu_size_y;

        // Compute allocated data size
        component->data_width = gpujpeg_div_and_round_up(component->width, component->mcu_size_x) * component->mcu_size_x;
        component->data_height = gpujpeg_div_and_round_up(component->height, component->mcu_size_y) * component->mcu_size_y;
        component->data_size = component->data_width * component->data_height;
        // Increase total data size
        coder->data_size += component->data_size;

        // Compute component MCU count
        component->mcu_count_x = gpujpeg_div_and_round_up(component->data_width, component->mcu_size_x);
        component->mcu_count_y = gpujpeg_div_and_round_up(component->data_height, component->mcu_size_y);
        component->mcu_count = component->mcu_count_x * component->mcu_count_y;

        // Compute MCU count per segment
        component->segment_mcu_count = coder->param.restart_interval;
        if ( component->segment_mcu_count == 0 ) {
            // If restart interval is disabled, a single segment contains all of the component's MCUs
            component->segment_mcu_count = component->mcu_count;
        }

        // Calculate segment count
        component->segment_count = gpujpeg_div_and_round_up(component->mcu_count, component->segment_mcu_count);

        //printf("Subsampling %dx%d, Resolution %d, %d, mcu size %d, mcu count %d\n",
        //    coder->param.sampling_factor[comp].horizontal, coder->param.sampling_factor[comp].vertical,
        //    component->data_width, component->data_height,
        //    component->mcu_compressed_size, component->mcu_count
        //);
    }

    // Maximum component data size for allocated buffers
    coder->data_width = gpujpeg_div_and_round_up(coder->param_image.width, GPUJPEG_BLOCK_SIZE) * GPUJPEG_BLOCK_SIZE;
    coder->data_height = gpujpeg_div_and_round_up(coder->param_image.height, GPUJPEG_BLOCK_SIZE) * GPUJPEG_BLOCK_SIZE;

    // Compute MCU size, MCU count, segment count and compressed data allocation size
    coder->mcu_count = 0;
    coder->mcu_size = 0;
    coder->mcu_compressed_size = 0;
    coder->segment_count = 0;
    coder->data_compressed_size = 0;
    if ( coder->param.interleaved == 1 ) {
        assert(coder->param_image.comp_count > 0);
        coder->mcu_count = coder->component[0].mcu_count;
        coder->segment_count = coder->component[0].segment_count;
        coder->segment_mcu_count = coder->component[0].segment_mcu_count;
        for ( int comp = 0; comp < coder->param_image.comp_count; comp++ ) {
            struct gpujpeg_component* component = &coder->component[comp];
            assert(coder->mcu_count == component->mcu_count);
            assert(coder->segment_mcu_count == component->segment_mcu_count);
            coder->mcu_size += component->mcu_size;
            coder->mcu_compressed_size += component->mcu_compressed_size;
        }
    } else {
        assert(coder->param_image.comp_count > 0);
        coder->mcu_size = coder->component[0].mcu_size;
        coder->mcu_compressed_size = coder->component[0].mcu_compressed_size;
        coder->segment_mcu_count = 0;
        for ( int comp = 0; comp < coder->param_image.comp_count; comp++ ) {
            struct gpujpeg_component* component = &coder->component[comp];
            assert(coder->mcu_size == component->mcu_size);
            assert(coder->mcu_compressed_size == component->mcu_compressed_size);
            coder->mcu_count += component->mcu_count;
            coder->segment_count += component->segment_count;
        }
    }
    //printf("mcu size %d -> %d, mcu count %d, segment mcu count %d\n",
    //    coder->mcu_size, coder->mcu_compressed_size, coder->mcu_count, coder->segment_mcu_count);

    // Allocate segments
    cudaMallocHost((void**)&coder->segment, coder->segment_count * sizeof(struct gpujpeg_segment));
    if ( coder->segment == NULL )
        result = 0;
    // Allocate segments in device memory
    if ( cudaSuccess != cudaMalloc((void**)&coder->d_segment, coder->segment_count * sizeof(struct gpujpeg_segment)) )
        result = 0;
    gpujpeg_cuda_check_error("Coder segment allocation");

    // Prepare segments
    if ( result == 1 ) {
        // While preparing segments, compute input size and compressed size
        int data_index = 0;
        int data_compressed_index = 0;

        // Prepare segments based on (non-)interleaved mode
        if ( coder->param.interleaved == 1 ) {
            // Prepare segments for encoding (only one scan for all color components)
            int mcu_index = 0;
            for ( int index = 0; index < coder->segment_count; index++ ) {
                // Prepare segment MCU count
                int mcu_count = coder->segment_mcu_count;
                if ( (mcu_index + mcu_count) >= coder->mcu_count )
                    mcu_count = coder->mcu_count - mcu_index;
                // Set parameters for segment
                coder->segment[index].scan_index = 0;
                coder->segment[index].scan_segment_index = index;
                coder->segment[index].mcu_count = mcu_count;
                coder->segment[index].data_compressed_index = data_compressed_index;
                coder->segment[index].data_temp_index = data_compressed_index;
                coder->segment[index].data_compressed_size = 0;
                // Increase parameters for next segment
                data_index += mcu_count * coder->mcu_size;
                data_compressed_index += SEGMENT_ALIGN(mcu_count * coder->mcu_compressed_size);
                mcu_index += mcu_count;
            }
        } else {
            // Prepare segments for encoding (one scan for each color component)
            int index = 0;
            for ( int comp = 0; comp < coder->param_image.comp_count; comp++ ) {
                // Get component
                struct gpujpeg_component* component = &coder->component[comp];
                // Prepare component segments
                int mcu_index = 0;
                for ( int segment = 0; segment < component->segment_count; segment++ ) {
                    // Prepare segment MCU count
                    int mcu_count = component->segment_mcu_count;
                    if ( (mcu_index + mcu_count) >= component->mcu_count )
                        mcu_count = component->mcu_count - mcu_index;
                    // Set parameters for segment
                    coder->segment[index].scan_index = comp;
                    coder->segment[index].scan_segment_index = segment;
                    coder->segment[index].mcu_count = mcu_count;
                    coder->segment[index].data_compressed_index = data_compressed_index;
                    coder->segment[index].data_temp_index = data_compressed_index;
                    coder->segment[index].data_compressed_size = 0;
                    // Increase parameters for next segment
                    data_index += mcu_count * component->mcu_size;
                    data_compressed_index += SEGMENT_ALIGN(mcu_count * component->mcu_compressed_size);
                    mcu_index += mcu_count;
                    index++;
                }
            }
        }

        // Check data size
        //printf("%d == %d\n", coder->data_size, data_index);
        assert(coder->data_size == data_index);

        // Set compressed size
        coder->data_compressed_size = data_compressed_index;
    }
    //printf("Compressed size %d (segments %d)\n", coder->data_compressed_size, coder->segment_count);

    // Print allocation info
    if ( coder->param.verbose ) {
        int structures_size = 0;
        structures_size += coder->segment_count * sizeof(struct gpujpeg_segment);
        structures_size += coder->param_image.comp_count * sizeof(struct gpujpeg_component);
        int total_size = 0;
        total_size += structures_size;
        total_size += coder->data_raw_size;
        total_size += coder->data_size;
        total_size += coder->data_size * 2;
        total_size += coder->data_compressed_size; // for Huffman coding output
        total_size += coder->data_compressed_size; // for Huffman coding temp buffer

        printf("\nAllocation Info:\n");
        printf("    Segment Count:            %d\n", coder->segment_count);
        printf("    Allocated Data Size:      %dx%d\n", coder->data_width, coder->data_height);
        printf("    Raw Buffer Size:          %0.1f MB\n", (double)coder->data_raw_size / (1024.0 * 1024.0));
        printf("    Preprocessor Buffer Size: %0.1f MB\n", (double)coder->data_size / (1024.0 * 1024.0));
        printf("    DCT Buffer Size:          %0.1f MB\n", (double)2 * coder->data_size / (1024.0 * 1024.0));
        printf("    Compressed Buffer Size:   %0.1f MB\n", (double)coder->data_compressed_size / (1024.0 * 1024.0));
        printf("    Huffman Temp Buffer Size: %0.1f MB\n", (double)coder->data_compressed_size / (1024.0 * 1024.0));
        printf("    Structures Size:          %0.1f kB\n", (double)structures_size / (1024.0));
        printf("    Total GPU Memory Size:    %0.1f MB\n", (double)total_size / (1024.0 * 1024.0));
    }

    // Allocate data buffers for all color components
    if ( cudaSuccess != cudaMallocHost((void**)&coder->data_raw, coder->data_raw_size * sizeof(uint8_t)) )
        result = 0;
    if ( cudaSuccess != cudaMalloc((void**)&coder->d_data_raw, coder->data_raw_size * sizeof(uint8_t)) )
        result = 0;
    if ( cudaSuccess != cudaMalloc((void**)&coder->d_data, coder->data_size * sizeof(uint8_t)) )
        result = 0;
    if ( cudaSuccess != cudaMallocHost((void**)&coder->data_quantized, coder->data_size * sizeof(int16_t)) )
        result = 0;
    if ( cudaSuccess != cudaMalloc((void**)&coder->d_data_quantized, coder->data_size * sizeof(int16_t)) )
        result = 0;
    gpujpeg_cuda_check_error("Coder data allocation");

    // Set data buffer to color components
    uint8_t* d_comp_data = coder->d_data;
    int16_t* d_comp_data_quantized = coder->d_data_quantized;
    int16_t* comp_data_quantized = coder->data_quantized;
    unsigned int data_quantized_index = 0;
    for ( int comp = 0; comp < coder->param_image.comp_count; comp++ ) {
        struct gpujpeg_component* component = &coder->component[comp];
        component->d_data = d_comp_data;
        component->d_data_quantized = d_comp_data_quantized;
        component->data_quantized_index = data_quantized_index;
        component->data_quantized = comp_data_quantized;
        d_comp_data += component->data_width * component->data_height;
        d_comp_data_quantized += component->data_width * component->data_height;
        comp_data_quantized += component->data_width * component->data_height;
        data_quantized_index += component->data_width * component->data_height;
    }

    // Copy components to device memory
    if ( cudaSuccess != cudaMemcpy(coder->d_component, coder->component, coder->param_image.comp_count * sizeof(struct gpujpeg_component), cudaMemcpyHostToDevice) )
        result = 0;
    gpujpeg_cuda_check_error("Coder component copy");

    // Allocate compressed data
    int max_compressed_data_size = coder->data_compressed_size;
    max_compressed_data_size += GPUJPEG_BLOCK_SIZE * GPUJPEG_BLOCK_SIZE;
    //max_compressed_data_size *= 2;
    if ( cudaSuccess != cudaMallocHost((void**)&coder->data_compressed, max_compressed_data_size * sizeof(uint8_t)) )
        result = 0;
    if ( cudaSuccess != cudaMalloc((void**)&coder->d_data_compressed, max_compressed_data_size * sizeof(uint8_t)) )
        result = 0;
    gpujpeg_cuda_check_error("Coder data compressed allocation");

    // Allocate Huffman coder temporary buffer
    if ( cudaSuccess != cudaMalloc((void**)&coder->d_temp_huffman, max_compressed_data_size * sizeof(uint8_t)) )
        result = 0;
    gpujpeg_cuda_check_error("Huffman temp buffer allocation");

    // Initialize block lists in host memory
    coder->block_count = 0;
    for ( int comp = 0; comp < coder->param_image.comp_count; comp++ )
        coder->block_count += (coder->component[comp].data_width * coder->component[comp].data_height) / (8 * 8);
    if ( cudaSuccess != cudaMallocHost((void**)&coder->block_list, coder->block_count * sizeof(*coder->block_list)) )
        result = 0;
    if ( cudaSuccess != cudaMalloc((void**)&coder->d_block_list, coder->block_count * sizeof(*coder->d_block_list)) )
        result = 0;
    if ( result == 0 )
        return -1;

    int block_idx = 0;
    int comp_count = 1;
    if ( coder->param.interleaved == 1 )
        comp_count = coder->param_image.comp_count;
    assert(comp_count >= 1 && comp_count <= GPUJPEG_MAX_COMPONENT_COUNT);
    for ( int segment_idx = 0; segment_idx < coder->segment_count; segment_idx++ ) {
        struct gpujpeg_segment* const segment = &coder->segment[segment_idx];
        segment->block_index_list_begin = block_idx;

        // Non-interleaving mode
        if ( comp_count == 1 ) {
            // Inspect MCUs in segment
            for ( int mcu_index = 0; mcu_index < segment->mcu_count; mcu_index++ ) {
                // Component for the scan
                struct gpujpeg_component* component = &coder->component[segment->scan_index];

                // Offset of component data for MCU
                uint64_t data_index = component->data_quantized_index + (segment->scan_segment_index * component->segment_mcu_count + mcu_index) * component->mcu_size;
                uint64_t component_type = component->type == GPUJPEG_COMPONENT_LUMINANCE ? 0x00 : 0x80;
                uint64_t dc_index = segment->scan_index;
                coder->block_list[block_idx++] = dc_index | component_type | (data_index << 8);
            }
        }
        // Interleaving mode
        else {
            // Encode MCUs in segment
            for ( int mcu_index = 0; mcu_index < segment->mcu_count; mcu_index++ ) {
                //assert(segment->scan_index == 0);
                for ( int comp = 0; comp < comp_count; comp++ ) {
                    struct gpujpeg_component* component = &coder->component[comp];

                    // Prepare MCU indexes
                    int mcu_index_x = (segment->scan_segment_index * component->segment_mcu_count + mcu_index) % component->mcu_count_x;
                    int mcu_index_y = (segment->scan_segment_index * component->segment_mcu_count + mcu_index) / component->mcu_count_x;
                    // Compute base data index
                    int data_index_base = component->data_quantized_index + mcu_index_y * (component->mcu_size * component->mcu_count_x) + mcu_index_x * (component->mcu_size_x * GPUJPEG_BLOCK_SIZE);

                    // For all vertical 8x8 blocks
                    for ( int y = 0; y < component->sampling_factor.vertical; y++ ) {
                        // Compute base row data index
                        int data_index_row = data_index_base + y * (component->mcu_count_x * component->mcu_size_x * GPUJPEG_BLOCK_SIZE);
                        // For all horizontal 8x8 blocks
                        for ( int x = 0; x < component->sampling_factor.horizontal; x++ ) {
                            // Compute 8x8 block data index
                            uint64_t data_index = data_index_row + x * GPUJPEG_BLOCK_SIZE * GPUJPEG_BLOCK_SIZE;
                            uint64_t component_type = component->type == GPUJPEG_COMPONENT_LUMINANCE ? 0x00 : 0x80;
                            uint64_t dc_index = comp;
                            coder->block_list[block_idx++] = dc_index | component_type | (data_index << 8);
                        }
                    }
                }
            }
        }
        segment->block_count = block_idx - segment->block_index_list_begin;
    }
    assert(block_idx == coder->block_count);

    // Copy block lists to device memory
    if ( cudaSuccess != cudaMemcpy(coder->d_block_list, coder->block_list, coder->block_count * sizeof(*coder->d_block_list), cudaMemcpyHostToDevice) )
        result = 0;
    // Copy segments to device memory
    if ( cudaSuccess != cudaMemcpy(coder->d_segment, coder->segment, coder->segment_count * sizeof(struct gpujpeg_segment), cudaMemcpyHostToDevice) )
        result = 0;

    if ( result == 0 )
        return -1;

    return 0;
}
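
/*
 * Note on the block list built in gpujpeg_coder_init() above: each 64-bit
 * entry packs three fields of an 8x8 block descriptor,
 *
 *     entry = dc_index | component_type | (data_index << 8)
 *
 * i.e. the low 7 bits select the DC predictor (the scan or component index),
 * bit 7 (0x80) flags a chrominance component, and the remaining high bits
 * hold the block's offset into the quantized data buffer.
 */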
/** Documented at declaration */
int
gpujpeg_init_device(int device_id, int flags)
{
    int dev_count;
    cudaGetDeviceCount(&dev_count);
    if ( dev_count == 0 ) {
        fprintf(stderr, "[GPUJPEG] [Error] No CUDA enabled device\n");
        return -1;
    }

    if ( device_id < 0 || device_id >= dev_count ) {
        fprintf(stderr, "[GPUJPEG] [Error] Selected device %d is out of bounds. Devices on your system are in range %d - %d\n",
            device_id, 0, dev_count - 1);
        return -1;
    }

    struct cudaDeviceProp devProp;
    if ( cudaSuccess != cudaGetDeviceProperties(&devProp, device_id) ) {
        fprintf(stderr,
            "[GPUJPEG] [Error] Can't get CUDA device properties!\n"
            "[GPUJPEG] [Error] Do you have proper driver for CUDA installed?\n"
        );
        return -1;
    }

    if ( devProp.major < 1 ) {
        fprintf(stderr, "[GPUJPEG] [Error] Device %d does not support CUDA\n", device_id);
        return -1;
    }

    if ( flags & GPUJPEG_OPENGL_INTEROPERABILITY ) {
        cudaGLSetGLDevice(device_id);
        gpujpeg_cuda_check_error("Enabling OpenGL interoperability");
    }

    if ( flags & GPUJPEG_VERBOSE ) {
        int cuda_driver_version = 0;
        cudaDriverGetVersion(&cuda_driver_version);
        printf("CUDA driver version:   %d.%d\n", cuda_driver_version / 1000, (cuda_driver_version % 100) / 10);

        int cuda_runtime_version = 0;
        cudaRuntimeGetVersion(&cuda_runtime_version);
        printf("CUDA runtime version:  %d.%d\n", cuda_runtime_version / 1000, (cuda_runtime_version % 100) / 10);

        printf("Using Device #%d: %s (c.c. %d.%d)\n", device_id, devProp.name, devProp.major, devProp.minor);
    }

    cudaSetDevice(device_id);
    gpujpeg_cuda_check_error("Set CUDA device");

    // Test by a simple copy that the device is ready
    uint8_t data[] = {8};
    uint8_t* d_data = NULL;
    cudaMalloc((void**)&d_data, 1);
    cudaMemcpy(d_data, data, 1, cudaMemcpyHostToDevice);
    cudaFree(d_data);
    cudaError_t error = cudaGetLastError();
    if ( cudaSuccess != error ) {
        fprintf(stderr, "[GPUJPEG] [Error] Failed to initialize CUDA device.\n");
        if ( flags & GPUJPEG_OPENGL_INTEROPERABILITY )
            fprintf(stderr, "[GPUJPEG] [Info] OpenGL interoperability is used, is OpenGL context available?\n");
        return -1;
    }

    return 0;
}
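
/*
 * Minimal initialization sketch (added example, not part of the original
 * source). Selects device 0 with verbose output; GPUJPEG_OPENGL_INTEROPERABILITY
 * would additionally be passed when a current OpenGL context is available.
 * Kept inside #if 0 so it does not affect the build.
 */
#if 0
static int
example_init(void)
{
    if ( gpujpeg_init_device(0, GPUJPEG_VERBOSE) != 0 )
        return -1;
    return 0;
}
#endif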
/** Documented at declaration */
int
gpujpeg_decoder_decode(struct gpujpeg_decoder* decoder, uint8_t* image, int image_size, struct gpujpeg_decoder_output* output)
{
    // Get coder
    struct gpujpeg_coder* coder = &decoder->coder;

    // Reset durations
    coder->duration_memory_to = 0.0;
    coder->duration_memory_from = 0.0;
    coder->duration_memory_map = 0.0;
    coder->duration_memory_unmap = 0.0;
    coder->duration_preprocessor = 0.0;
    coder->duration_dct_quantization = 0.0;
    coder->duration_huffman_coder = 0.0;
    coder->duration_stream = 0.0;
    coder->duration_in_gpu = 0.0;

    GPUJPEG_CUSTOM_TIMER_START(decoder->def);

    // Read JPEG image data
    if ( gpujpeg_reader_read_image(decoder, image, image_size) != 0 ) {
        fprintf(stderr, "[GPUJPEG] [Error] Decoder failed when decoding image data!\n");
        return -1;
    }

    GPUJPEG_CUSTOM_TIMER_STOP(decoder->def);
    coder->duration_stream = GPUJPEG_CUSTOM_TIMER_DURATION(decoder->def);
    GPUJPEG_CUSTOM_TIMER_START(decoder->def);

    // Perform Huffman decoding on CPU (when restart interval is not set)
    if ( coder->param.restart_interval == 0 ) {
        if ( gpujpeg_huffman_cpu_decoder_decode(decoder) != 0 ) {
            fprintf(stderr, "[GPUJPEG] [Error] Huffman decoder failed!\n");
            return -1;
        }

        GPUJPEG_CUSTOM_TIMER_STOP(decoder->def);
        coder->duration_huffman_coder = GPUJPEG_CUSTOM_TIMER_DURATION(decoder->def);
        GPUJPEG_CUSTOM_TIMER_START(decoder->def);

        // Copy quantized data to device memory from CPU memory
        cudaMemcpy(coder->d_data_quantized, coder->data_quantized, coder->data_size * sizeof(int16_t), cudaMemcpyHostToDevice);

        GPUJPEG_CUSTOM_TIMER_STOP(decoder->def);
        coder->duration_memory_to = GPUJPEG_CUSTOM_TIMER_DURATION(decoder->def);
        GPUJPEG_CUSTOM_TIMER_START(decoder->def);
        GPUJPEG_CUSTOM_TIMER_START(decoder->in_gpu);
    }
    // Perform Huffman decoding on GPU (when restart interval is set)
    else {
#ifdef GPUJPEG_HUFFMAN_CODER_TABLES_IN_CONSTANT
        // Copy Huffman tables to constant memory
        for ( int comp_type = 0; comp_type < GPUJPEG_COMPONENT_TYPE_COUNT; comp_type++ ) {
            for ( int huff_type = 0; huff_type < GPUJPEG_HUFFMAN_TYPE_COUNT; huff_type++ ) {
                int index = (comp_type * GPUJPEG_HUFFMAN_TYPE_COUNT + huff_type);
                cudaMemcpyToSymbol(
                    (char*)gpujpeg_decoder_table_huffman,
                    &decoder->table_huffman[comp_type][huff_type],
                    sizeof(struct gpujpeg_table_huffman_decoder),
                    index * sizeof(struct gpujpeg_table_huffman_decoder),
                    cudaMemcpyHostToDevice
                );
            }
        }
        gpujpeg_cuda_check_error("Decoder copy huffman tables to constant memory");
#endif

        // Zero Huffman output memory
        cudaMemset(coder->d_data_quantized, 0, coder->data_size * sizeof(int16_t));

        // Copy scan data to device memory
        cudaMemcpy(coder->d_data_compressed, coder->data_compressed, decoder->data_compressed_size * sizeof(uint8_t), cudaMemcpyHostToDevice);
        gpujpeg_cuda_check_error("Decoder copy compressed data");

        // Copy segments to device memory
        cudaMemcpy(coder->d_segment, coder->segment, decoder->segment_count * sizeof(struct gpujpeg_segment), cudaMemcpyHostToDevice);
        gpujpeg_cuda_check_error("Decoder copy segments");

        GPUJPEG_CUSTOM_TIMER_STOP(decoder->def);
        coder->duration_memory_to = GPUJPEG_CUSTOM_TIMER_DURATION(decoder->def);
        GPUJPEG_CUSTOM_TIMER_START(decoder->def);
        GPUJPEG_CUSTOM_TIMER_START(decoder->in_gpu);

        // Perform Huffman decoding
        if ( gpujpeg_huffman_gpu_decoder_decode(decoder) != 0 ) {
            fprintf(stderr, "[GPUJPEG] [Error] Huffman decoder on GPU failed!\n");
            return -1;
        }

        GPUJPEG_CUSTOM_TIMER_STOP(decoder->def);
        coder->duration_huffman_coder = GPUJPEG_CUSTOM_TIMER_DURATION(decoder->def);
        GPUJPEG_CUSTOM_TIMER_START(decoder->def);
    }

#ifdef GPUJPEG_DCT_FROM_NPP
    // Perform IDCT and dequantization (implementation from NPP)
    for ( int comp = 0; comp < coder->param_image.comp_count; comp++ ) {
        // Get component
        struct gpujpeg_component* component = &coder->component[comp];

        // Determine table type
        enum gpujpeg_component_type type = (comp == 0) ? GPUJPEG_COMPONENT_LUMINANCE : GPUJPEG_COMPONENT_CHROMINANCE;

        //gpujpeg_component_print16(component, component->d_data_quantized);
        cudaMemset(component->d_data, 0, component->data_size * sizeof(uint8_t));

        // Perform inverse DCT
        NppiSize inv_roi;
        inv_roi.width = component->data_width * GPUJPEG_BLOCK_SIZE;
        inv_roi.height = component->data_height / GPUJPEG_BLOCK_SIZE;
        assert(GPUJPEG_BLOCK_SIZE == 8);
        NppStatus status = nppiDCTQuantInv8x8LS_JPEG_16s8u_C1R(
            component->d_data_quantized,
            component->data_width * GPUJPEG_BLOCK_SIZE * sizeof(int16_t),
            component->d_data,
            component->data_width * sizeof(uint8_t),
            decoder->table_quantization[type].d_table,
            inv_roi
        );
        if ( status != 0 ) {
            fprintf(stderr, "[GPUJPEG] [Error] Inverse DCT failed (error %d)!\n", status);
        }
        //gpujpeg_component_print8(component, component->d_data);
    }
#else
    // Perform IDCT and dequantization (own CUDA implementation)
    gpujpeg_idct_gpu(decoder);

    // Perform IDCT and dequantization (own CPU implementation)
    // gpujpeg_idct_cpu(decoder);
#endif

    GPUJPEG_CUSTOM_TIMER_STOP(decoder->def);
    coder->duration_dct_quantization = GPUJPEG_CUSTOM_TIMER_DURATION(decoder->def);
    GPUJPEG_CUSTOM_TIMER_START(decoder->def);

    // Preprocessing
    if ( gpujpeg_preprocessor_decode(&decoder->coder) != 0 )
        return -1;

    GPUJPEG_CUSTOM_TIMER_STOP(decoder->in_gpu);
    coder->duration_in_gpu = GPUJPEG_CUSTOM_TIMER_DURATION(decoder->in_gpu);
    GPUJPEG_CUSTOM_TIMER_STOP(decoder->def);
    coder->duration_preprocessor = GPUJPEG_CUSTOM_TIMER_DURATION(decoder->def);

    // Set decompressed image size
    output->data_size = coder->data_raw_size * sizeof(uint8_t);

    // Set decompressed image
    if ( output->type == GPUJPEG_DECODER_OUTPUT_INTERNAL_BUFFER ) {
        GPUJPEG_CUSTOM_TIMER_START(decoder->def);

        // Copy decompressed image to host memory
        cudaMemcpy(coder->data_raw, coder->d_data_raw, coder->data_raw_size * sizeof(uint8_t), cudaMemcpyDeviceToHost);

        GPUJPEG_CUSTOM_TIMER_STOP(decoder->def);
        coder->duration_memory_from = GPUJPEG_CUSTOM_TIMER_DURATION(decoder->def);

        // Set output to internal buffer
        output->data = coder->data_raw;
    } else if ( output->type == GPUJPEG_DECODER_OUTPUT_CUSTOM_BUFFER ) {
        GPUJPEG_CUSTOM_TIMER_START(decoder->def);

        assert(output->data != NULL);

        // Copy decompressed image to host memory
        cudaMemcpy(output->data, coder->d_data_raw, coder->data_raw_size * sizeof(uint8_t), cudaMemcpyDeviceToHost);

        GPUJPEG_CUSTOM_TIMER_STOP(decoder->def);
        coder->duration_memory_from = GPUJPEG_CUSTOM_TIMER_DURATION(decoder->def);
    } else if ( output->type == GPUJPEG_DECODER_OUTPUT_OPENGL_TEXTURE ) {
        GPUJPEG_CUSTOM_TIMER_START(decoder->def);

        // Map OpenGL texture
        int data_size = 0;
        uint8_t* d_data = gpujpeg_opengl_texture_map(output->texture, &data_size);
        assert(data_size == coder->data_raw_size);

        GPUJPEG_CUSTOM_TIMER_STOP(decoder->def);
        coder->duration_memory_map = GPUJPEG_CUSTOM_TIMER_DURATION(decoder->def);
        GPUJPEG_CUSTOM_TIMER_START(decoder->def);

        // Copy decompressed image to texture pixel buffer object device data
        cudaMemcpy(d_data, coder->d_data_raw, coder->data_raw_size * sizeof(uint8_t), cudaMemcpyDeviceToDevice);

        GPUJPEG_CUSTOM_TIMER_STOP(decoder->def);
        coder->duration_memory_from = GPUJPEG_CUSTOM_TIMER_DURATION(decoder->def);
        GPUJPEG_CUSTOM_TIMER_START(decoder->def);

        // Unmap OpenGL texture
        gpujpeg_opengl_texture_unmap(output->texture);

        GPUJPEG_CUSTOM_TIMER_STOP(decoder->def);
        coder->duration_memory_unmap = GPUJPEG_CUSTOM_TIMER_DURATION(decoder->def);
    } else {
        // Unknown output type
        assert(0);
    }

    return 0;
}
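
/*
 * End-to-end decoding sketch (added example, not part of the original source;
 * zero-initializing the output struct before setting its type is an assumption
 * about reasonable defaults). Kept inside #if 0 so it does not affect the build.
 */
#if 0
static int
example_decode(uint8_t* jpeg_data, int jpeg_size)
{
    struct gpujpeg_decoder* decoder = gpujpeg_decoder_create();
    if ( decoder == NULL )
        return -1;

    // Decode into the decoder's internal buffer
    struct gpujpeg_decoder_output output;
    memset(&output, 0, sizeof(output));
    output.type = GPUJPEG_DECODER_OUTPUT_INTERNAL_BUFFER;
    output.data = NULL;

    if ( gpujpeg_decoder_decode(decoder, jpeg_data, jpeg_size, &output) != 0 ) {
        gpujpeg_decoder_destroy(decoder);
        return -1;
    }

    // output.data now points to output.data_size bytes of raw image data
    gpujpeg_decoder_destroy(decoder);
    return 0;
}
#endif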