/*
 * Encode a BMP image into an ASTC file using a fixed 6x6 block footprint.
 *
 * The source image is read with load_bmp and passed through flip_image.
 * A second surface is produced from it by fill_borders (given the block
 * dimensions), compressed with compress_astc_tex into a buffer that reserves
 * 16 bytes per block, and finally written out with store_astc.
 *
 * filename      path handed to load_bmp; also echoed in the progress message
 * dst_filename  path handed to store_astc
 *
 * Returns nothing. If the output buffer cannot be allocated, an error is
 * printed to stderr and the function returns without encoding anything.
 */
void enc_astc_file(char* filename, char* dst_filename)
{
    rgba_surface src_img;
    load_bmp(&src_img, filename);
    flip_image(&src_img);

    int block_width = 6;
    int block_height = 6;

    /* While compressing, width/height of the output surface count blocks,
       not pixels; each block occupies 16 bytes in a row. */
    rgba_surface output_tex;
    output_tex.width = idiv_ceil(src_img.width, block_width);
    output_tex.height = idiv_ceil(src_img.height, block_height);
    output_tex.stride = output_tex.width * 16;

    /* Compute the byte count in size_t so the product cannot overflow int. */
    size_t output_bytes = (size_t)output_tex.height * (size_t)output_tex.stride;
    output_tex.ptr = (uint8_t*)malloc(output_bytes);

    /* malloc(0) may legitimately return NULL, so only a non-empty request
       that comes back NULL is treated as a failure. */
    if (output_tex.ptr == NULL && output_bytes != 0)
    {
        fprintf(stderr, "enc_astc_file: out of memory (%lu bytes) for <%s>\n",
                (unsigned long)output_bytes, filename);
        return;
    }

    rgba_surface edged_img;
    fill_borders(&edged_img, &src_img, block_width, block_height);

    printf("encoding <%s>...", filename);

    compress_astc_tex(&output_tex, &edged_img, block_width, block_height);

    printf("done.\n");

    /* store_astc is given the pixel dimensions of the source image, so the
       block counts stored above are replaced before the call. */
    output_tex.width = src_img.width;
    output_tex.height = src_img.height;
    store_astc(&output_tex, block_width, block_height, dst_filename);

    /* The compressed buffer was allocated here and is not needed once it
       has been handed to store_astc. */
    free(output_tex.ptr);
    output_tex.ptr = NULL;

    /* NOTE(review): src_img and edged_img presumably own pixel buffers
       allocated inside load_bmp / fill_borders. How they must be released is
       not visible from this function -- confirm and free them here too. */
}
Example #2
0
/*
 * Program entry point: repeatedly applies an image filter on the GPU.
 *
 * Usage: <program> <iterations>
 *
 * Steps, in order:
 *   1. read the input image into a HEIGHT x WIDTH x CHANNELS byte array;
 *   2. convert it to float inside a host array padded by B elements on every
 *      side (pixels are stored at [i + B][j + B]);
 *   3. allocate two equally sized device images and the device filter;
 *   4. copy the host image to the device, then per iteration call
 *      fill_borders, apply_filter_cuda, and swap the two device images;
 *   5. copy the result back, convert to bytes and write the channel files.
 *
 * Every step is skipped once `ok` has become false, and all resources that
 * were actually obtained are released before returning.
 *
 * An iteration count of 0 makes the loop condition always true, so the loop
 * does not terminate on its own. Negative counts are rejected.
 *
 * Returns EXIT_SUCCESS if every step succeeded, EXIT_FAILURE otherwise.
 */
int main(int argc, char** argv)
{
    if (argc != 2)
    {
        printf("Usage: %s <iterations>\n", argv[0]);
        return (EXIT_FAILURE);
    }

    int iterations = atoi(argv[1]);

    /* The loop counter below is unsigned; a negative value would be converted
       to a huge positive iteration count, so refuse it up front. */
    if (iterations < 0)
    {
        printf("Usage: %s <iterations>\n", argv[0]);
        return (EXIT_FAILURE);
    }

    printf("main_cuda()\n");
    printf("Iterations: %d\n", iterations);

    bool ok = true;

    /* All buffers start out as NULL so that the cleanup code at the end never
       looks at an indeterminate pointer when an earlier step failed. */

    /* Read input file into buffer. */

    unsigned char (**image_buffer_h)[CHANNELS] = NULL;

    if (ok)
        ok = read_image((unsigned char ***) &image_buffer_h);

    /* Allocate memory for image data. */

    float (**image_h)[CHANNELS] = NULL;

    if (ok)
        ok = alloc_float_array((float ***) &image_h,
            B + HEIGHT + B, B + WIDTH + B, CHANNELS);

    /* Convert input. */

    unsigned int i, j, c;

    if (ok)
    {
        for (i = 0; i < HEIGHT; i++)
            for (j = 0; j < WIDTH; j++)
                for (c = 0; c < CHANNELS; c++)
                    image_h[i + B][j + B][c] = (float) image_buffer_h[i][j][c];
    }

    /* Device memory allocation. */

    float (**prev_image_d)[CHANNELS] = NULL;
    float (**curr_image_d)[CHANNELS] = NULL;

    float *prev_image_p = NULL;
    float *curr_image_p = NULL;

    if (ok)
        ok = alloc_float_array_cuda((float ***) &prev_image_d, &prev_image_p,
            B + HEIGHT + B, B + WIDTH + B, CHANNELS);

    if (ok)
        ok = alloc_float_array_cuda((float ***) &curr_image_d, &curr_image_p,
            B + HEIGHT + B, B + WIDTH + B, CHANNELS);

    /* Initialize filter in device memory space. */

    float (**filter_d)[1] = NULL;
    float *filter_p = NULL;

    if (ok)
        ok = init_filter(&filter_d, &filter_p, filter);

    /* Device parameters for nVidia 9600GT (G94), passed to main filter function. */

    /* nVidia G94 supports 8 resident blocks per SMP, 768 resident threads per SMP. */

    unsigned int block_size = 64; // maximum 512 threads per block for nVidia G94
    printf("Block size: %u\n", block_size);

    /* nVidia G94 supports 2-dimensional grids with a maximum of 65535 for x,y dimension. */

    /* Square grid: floor(sqrt(pixels / block_size)) + 1 blocks per side, so
       grid_dim * grid_dim * block_size is at least HEIGHT * WIDTH. */
    unsigned int grid_dim = HEIGHT * WIDTH / block_size;
    double sqr = sqrt((double) grid_dim);
    grid_dim = (unsigned int) sqr;
    grid_dim++;
    printf("Grid: %ux%u\n", grid_dim, grid_dim);

    /* Start timing. */

    float memcopy, compute;
    timestamp t_start;
    t_start = getTimestamp();

    /* Copy image data to device. */

    /* A single copy starting at element [0][0][0] covers the whole padded
       image -- assumes alloc_float_array stores it contiguously (TODO confirm). */
    if (ok)
        ok = (cudaSuccess == cudaMemcpy(curr_image_p, &(image_h[0][0][0]),
            (B + HEIGHT + B) * (B + WIDTH + B) * CHANNELS * sizeof (float),
            cudaMemcpyHostToDevice));

    memcopy = getElapsedtime(t_start);

    /* Clear host image data. */

    /* Guarded by ok: image_h is only valid if the allocation succeeded. */
    if (ok)
        memset(&(image_h[0][0][0]), 0, (B + HEIGHT + B) * (B + WIDTH + B) * CHANNELS * sizeof (float));

    /* Apply filter. */

    t_start = getTimestamp();

    unsigned int n;

    if (ok)
    {
        /* iterations == 0 keeps the condition true forever (see header). */
        for (n = 0; iterations == 0 || n < (unsigned int) iterations; n++)
        {
            /* Fill borders with edge image data. */

            fill_borders(curr_image_d, HEIGHT, WIDTH);

            /* Apply filter. */

            apply_filter_cuda(prev_image_d, curr_image_d, filter_d, block_size, grid_dim);

            /* Switch current / previous image buffers. */

            /* After the swap, curr_* names the buffer that was passed as the
               first argument above -- presumably the filter output (confirm). */
            float (**temp)[CHANNELS];
            temp = prev_image_d;
            prev_image_d = curr_image_d;
            curr_image_d = temp;

            float *tmp;
            tmp = prev_image_p;
            prev_image_p = curr_image_p;
            curr_image_p = tmp;
        }
    }

    /* Stop time measurement, print time. */

    /* Wait for outstanding device work so the measured time is complete, and
       treat an error reported here as a failure of the run.
       NOTE(review): cudaThreadSynchronize is deprecated in favour of
       cudaDeviceSynchronize on CUDA 4.0 and later; kept for compatibility
       with the toolkit generation this code targets. */
    if (cudaSuccess != cudaThreadSynchronize())
        ok = false;

    compute = getElapsedtime(t_start);
    t_start = getTimestamp();

    /* Copy processed image data from device. */

    if (ok)
        ok = (cudaSuccess == cudaMemcpy(&(image_h[0][0][0]), curr_image_p,
            (B + HEIGHT + B) * (B + WIDTH + B) * CHANNELS * sizeof (float),
            cudaMemcpyDeviceToHost));

    memcopy += getElapsedtime(t_start);

    printf("Completed in %.3f sec\n", compute / 1000);
    printf("Memory copy in %.3f sec\n", memcopy / 1000);

    /* Convert output. */

    /* NOTE(review): the cast assumes the filtered values stay within the
       range of unsigned char; nothing here clamps them -- confirm. */
    if (ok)
    {
        for (i = 0; i < HEIGHT; i++)
            for (j = 0; j < WIDTH; j++)
                for (c = 0; c < CHANNELS; c++)
                    image_buffer_h[i][j][c] = (unsigned char) image_h[i + B][j + B][c];
    }

    /* Create output files, one for each channel. */

    if (ok)
        ok = write_channels(image_buffer_h, HEIGHT, WIDTH);

    /* Free allocated memory. */

    /* Only release what was actually obtained; pointers that are still NULL
       belong to steps that never ran. */
    if (image_buffer_h != NULL)
        dealloc_uchar_array((unsigned char ***) &image_buffer_h);
    if (image_h != NULL)
        dealloc_float_array((float ***) &image_h);
    if (prev_image_d != NULL || prev_image_p != NULL)
        dealloc_float_array_cuda((float ***) &prev_image_d, &prev_image_p);
    if (curr_image_d != NULL || curr_image_p != NULL)
        dealloc_float_array_cuda((float ***) &curr_image_d, &curr_image_p);
    if (filter_d != NULL || filter_p != NULL)
        destroy_filter(&filter_d, &filter_p);

    return ok ? (EXIT_SUCCESS) : (EXIT_FAILURE);
}