void enc_astc_file(char* filename, char* dst_filename) { rgba_surface src_img; load_bmp(&src_img, filename); flip_image(&src_img); int block_width = 6; int block_height = 6; rgba_surface output_tex; output_tex.width = idiv_ceil(src_img.width, block_width); output_tex.height = idiv_ceil(src_img.height, block_height); output_tex.stride = output_tex.width * 16; output_tex.ptr = (uint8_t*)malloc(output_tex.height * output_tex.stride); rgba_surface edged_img; fill_borders(&edged_img, &src_img, block_width, block_height); printf("encoding <%s>...", filename); compress_astc_tex(&output_tex, &edged_img, block_width, block_height); printf("done.\n"); output_tex.width = src_img.width; output_tex.height = src_img.height; store_astc(&output_tex, block_width, block_height, dst_filename); }
/*
 * Entry point of the CUDA image filter.
 *
 * Usage: <program> <iterations>
 *
 * Reads the input image, converts it to float with a border of B elements on
 * every side, uploads it to the device, applies the filter `iterations`
 * times while ping-ponging between two device buffers, downloads the result
 * and writes it out through write_channels().
 *
 * An iteration count of 0 makes the filter loop run without bound; this is
 * the behaviour of the original loop condition and is kept as is. Note that
 * atoi() also yields 0 for non-numeric input. Negative counts are rejected,
 * because they used to be converted to a huge unsigned loop bound.
 *
 * Returns EXIT_SUCCESS when every step succeeded, EXIT_FAILURE otherwise.
 */
int main(int argc, char** argv)
{
    if (argc != 2) {
        printf("Usage: %s <iterations>\n", argv[0]);
        return (EXIT_FAILURE);
    }

    int iterations = atoi(argv[1]);
    if (iterations < 0) {
        printf("Usage: %s <iterations>\n", argv[0]);
        return (EXIT_FAILURE);
    }

    printf("main_cuda()\n");
    printf("Iterations: %d\n", iterations);

    /* Size in bytes of one bordered float image, on host and on device. */
    const size_t image_bytes =
            (size_t) (B + HEIGHT + B) * (B + WIDTH + B) * CHANNELS * sizeof (float);

    bool ok = true;

    /* Every buffer pointer starts out as NULL so that the cleanup code at
     * the end can tell which allocations were actually attempted. */

    /* Read input file into buffer. */
    unsigned char (**image_buffer_h)[CHANNELS] = NULL;
    if (ok)
        ok = read_image((unsigned char ***) &image_buffer_h);

    /* Allocate memory for image data. */
    float (**image_h)[CHANNELS] = NULL;
    if (ok)
        ok = alloc_float_array((float ***) &image_h, B + HEIGHT + B, B + WIDTH + B, CHANNELS);

    /* Convert input. */
    unsigned int i, j, c;
    if (ok) {
        for (i = 0; i < HEIGHT; i++)
            for (j = 0; j < WIDTH; j++)
                for (c = 0; c < CHANNELS; c++)
                    image_h[i + B][j + B][c] = (float) image_buffer_h[i][j][c];
    }

    /* Device memory allocation. */
    float (**prev_image_d)[CHANNELS] = NULL;
    float (**curr_image_d)[CHANNELS] = NULL;
    float *prev_image_p = NULL;
    float *curr_image_p = NULL;
    if (ok)
        ok = alloc_float_array_cuda((float ***) &prev_image_d, &prev_image_p, B + HEIGHT + B, B + WIDTH + B, CHANNELS);
    if (ok)
        ok = alloc_float_array_cuda((float ***) &curr_image_d, &curr_image_p, B + HEIGHT + B, B + WIDTH + B, CHANNELS);

    /* Initialize filter in device memory space. */
    float (**filter_d)[1] = NULL;
    float *filter_p = NULL;
    if (ok)
        ok = init_filter(&filter_d, &filter_p, filter);

    /* Device parameters for nVidia 9600GT (G94), passed to main filter function. */

    /* nVidia G94 supports 8 resident blocks per SMP, 768 resident threads per SMP. */
    unsigned int block_size = 64; // maximum 512 threads per block for nVidia G94
    printf("Block size: %u\n", block_size);

    /* nVidia G94 supports 2-dimensional grids with a maximum of 65535 for x,y dimension. */
    /* The grid is square: truncated square root of the block count, plus one. */
    unsigned int grid_dim = HEIGHT * WIDTH / block_size;
    grid_dim = (unsigned int) sqrt((double) grid_dim);
    grid_dim++;
    printf("Grid: %ux%u\n", grid_dim, grid_dim);

    /* Start timing. */
    float memcopy, compute;
    timestamp t_start;
    t_start = getTimestamp();

    /* Copy image data to device. */
    if (ok)
        ok = (cudaSuccess == cudaMemcpy(curr_image_p, &(image_h[0][0][0]), image_bytes, cudaMemcpyHostToDevice));
    memcopy = getElapsedtime(t_start);

    /* Clear host image data. Only done while ok: after an earlier failure
     * image_h may never have been allocated and must not be dereferenced. */
    if (ok)
        memset(&(image_h[0][0][0]), 0, image_bytes);

    /* Apply filter. */
    t_start = getTimestamp();
    if (ok) {
        unsigned int n;
        for (n = 0; iterations == 0 || n < (unsigned int) iterations; n++) {
            /* Fill borders with edge image data. */
            fill_borders(curr_image_d, HEIGHT, WIDTH);

            /* Apply filter. */
            apply_filter_cuda(prev_image_d, curr_image_d, filter_d, block_size, grid_dim);

            /* Switch current / previous image buffers (row tables and flat pointers together). */
            float (**temp)[CHANNELS] = prev_image_d;
            prev_image_d = curr_image_d;
            curr_image_d = temp;

            float *tmp = prev_image_p;
            prev_image_p = curr_image_p;
            curr_image_p = tmp;
        }
    }

    /* Stop time measurement. The result of the synchronization is checked
     * so that a failure reported here is not silently dropped. */
    if (cudaSuccess != cudaThreadSynchronize())
        ok = false;
    compute = getElapsedtime(t_start);

    /* Copy processed image data from device. */
    t_start = getTimestamp();
    if (ok)
        ok = (cudaSuccess == cudaMemcpy(&(image_h[0][0][0]), curr_image_p, image_bytes, cudaMemcpyDeviceToHost));
    memcopy += getElapsedtime(t_start);

    /* Print times. */
    printf("Completed in %.3f sec\n", compute / 1000);
    printf("Memory copy in %.3f sec\n", memcopy / 1000);

    /* Convert output. */
    if (ok) {
        for (i = 0; i < HEIGHT; i++)
            for (j = 0; j < WIDTH; j++)
                for (c = 0; c < CHANNELS; c++)
                    image_buffer_h[i][j][c] = (unsigned char) image_h[i + B][j + B][c];
    }

    /* Create output files, one for each channel. */
    if (ok)
        ok = write_channels(image_buffer_h, HEIGHT, WIDTH);

    /* Free allocated memory. Buffers whose pointers are still NULL were
     * never allocated and are skipped. */
    if (image_buffer_h != NULL)
        dealloc_uchar_array((unsigned char ***) &image_buffer_h);
    if (image_h != NULL)
        dealloc_float_array((float ***) &image_h);
    if (prev_image_d != NULL || prev_image_p != NULL)
        dealloc_float_array_cuda((float ***) &prev_image_d, &prev_image_p);
    if (curr_image_d != NULL || curr_image_p != NULL)
        dealloc_float_array_cuda((float ***) &curr_image_d, &curr_image_p);
    if (filter_d != NULL || filter_p != NULL)
        destroy_filter(&filter_d, &filter_p);

    return ok ? (EXIT_SUCCESS) : (EXIT_FAILURE);
}