int main(int argc, char **argv) { tut1_error res; int retval = EXIT_FAILURE; VkInstance vk; struct tut1_physical_device phy_devs[MAX_DEVICES]; struct tut2_device devs[MAX_DEVICES]; uint32_t dev_count = MAX_DEVICES; VkShaderModule shaders[MAX_DEVICES] = {NULL}; struct tut3_pipelines pipelines[MAX_DEVICES]; struct tut4_data test_data[MAX_DEVICES]; int success = 0; /* How many threads to do the work on */ size_t thread_count = 8; /* Whether the threads should take some CPU time as well */ bool busy_threads = false; /* Default to 1MB of buffer data to work on */ size_t buffer_size = 1024 * 1024 / sizeof(float); bool bad_args = false; if (argc < 2) bad_args = true; if (argc > 2 && sscanf(argv[2], "%zu", &thread_count) != 1) bad_args = true; if (argc > 3) { int temp; if (sscanf(argv[3], "%d", &temp) != 1) bad_args = true; else busy_threads = temp; } if (argc > 4) { if (sscanf(argv[4], "%zu", &buffer_size) != 1) bad_args = true; else buffer_size /= sizeof(float); } if (bad_args) { printf("Usage: %s shader_file [thread_count(8) [busy_threads(0) [buffer_size(1MB)]]]\n\n", argv[0]); return EXIT_FAILURE; } /* Fire up Vulkan */ res = tut1_init(&vk); if (!tut1_error_is_success(&res)) { tut1_error_printf(&res, "Could not initialize Vulkan\n"); goto exit_bad_init; } /* Enumerate devices */ res = tut1_enumerate_devices(vk, phy_devs, &dev_count); if (tut1_error_is_error(&res)) { tut1_error_printf(&res, "Could not enumerate devices\n"); goto exit_bad_enumerate; } /* Set up devices */ for (uint32_t i = 0; i < dev_count; ++i) { res = tut2_setup(&phy_devs[i], &devs[i], VK_QUEUE_COMPUTE_BIT); if (!tut1_error_is_success(&res)) { tut1_error_printf(&res, "Could not setup logical device %u, command pools and queues\n", i); goto exit_bad_setup; } } /* Load our compute shader */ for (uint32_t i = 0; i < dev_count; ++i) { res = tut3_load_shader(&devs[i], argv[1], &shaders[i]); if (!tut1_error_is_success(&res)) { tut1_error_printf(&res, "Could not load shader on device %u\n", i); goto 
exit_bad_shader; } } /* * Create the pipelines. There are as many pipelines created as command buffers (just for example). If * there are not actually enough resources for them, as many as possible are created. In this test, we are * not going to handle the case where some pipelines are not created. */ for (uint32_t i = 0; i < dev_count; ++i) { res = tut3_make_compute_pipeline(&devs[i], &pipelines[i], shaders[i]); if (!tut1_error_is_success(&res)) { tut1_error_printf(&res, "Could not allocate enough pipelines on device %u\n", i); goto exit_bad_pipeline; } } /* * Prepare our test. Both the buffers and threads are divided near-equally among the physical devices, which * are likely to be just 1 in your case, but who knows. */ for (uint32_t i = 0; i < dev_count; ++i) { size_t this_buffer_size = buffer_size / dev_count; size_t this_thread_count = thread_count / dev_count; /* Make sure the last device gets all the left-over */ if (i == dev_count - 1) { this_buffer_size = buffer_size - buffer_size / dev_count * (dev_count - 1); this_thread_count = thread_count - thread_count / dev_count * (dev_count - 1); } res = tut4_prepare_test(&phy_devs[i], &devs[i], &pipelines[i], &test_data[i], this_buffer_size, this_thread_count); if (!tut1_error_is_success(&res)) { tut1_error_printf(&res, "Could not allocate resources on device %u\n", i); goto exit_bad_test_prepare; } } /* * Ok, this was a LOT of initializing! But we are finally ready to run something. tut4_start_test() creates * a test thread for us, which further spawns the corresponding device's thread_count threads that do the * calculations. We then wait for the tests to finish with tut4_wait_test_end(). 
*/ for (uint32_t i = 0; i < dev_count; ++i) { if (tut4_start_test(&test_data[i], busy_threads)) { printf("Could not start the test threads for device %u\n", i); perror("Error"); } } printf("Running the tests...\n"); for (uint32_t i = 0; i < dev_count; ++i) tut4_wait_test_end(&test_data[i]); success = 1; for (uint32_t i = 0; i < dev_count; ++i) if (!test_data[i].success) { if (!tut1_error_is_success(&test_data[i].error)) tut1_error_printf(&test_data[i].error, "Error starting test on device %u\n", i); else printf("The test didn't produce expected results (device %u)\n", i); success = 0; } if (success) printf("Everything went well :) We just wasted your GPU doing something stupid\n"); /* * You can time the execution of the program with time(1): * * $ time ./tut4/tut4 shaders/tut3.comp.spv <threads> ... * * Then try to play with different number of threads and see if the total execution time of the application * changes and how! * * ... * * Did you try that? Already? Well, that was disappointing. More threads probably resulted in higher * execution time, right? That actually makes sense. You see, we have N data to compute, and whether you tell * the GPU to do N computations from one thread, or N/T computations each from T threads, you aren't actually * doing any less computation. You probably just have more overhead from the threads. * * So what's the deal with multi-threaded and Vulkan? Well, the problem is that this test was heavily * GPU-bound, and as you have noticed, multi-CPU-threaded doesn't help. For this reason, this test has a * little feature to "fake" some execution on the CPU threads as well. If you run the program like this: * * $ time ./tut4/tut4 shaders/tut3.comp.spv <threads> <fake> ... * * where <fake> can be either 0 (no CPU usage) or 1 (some fake CPU usage), and then experiment with different * number of threads, you can see the benefit of multi-threading. In this case, while the GPU is working, the * CPU thread spends time fake-doing something. 
If there is only one thread, the CPU cannot keep the GPU * constantly busy, so the computation slows down. On the other hand, with multiple threads, the same amount * of CPU work is spread out and done in parallel, so the threads together can feed the GPU with instructions * faster. * * In this test, the total amount of time to waste is 3.2 seconds (32ms for each "render" operation, and there * are a hundred of them). Depending on your GPU, you may notice that above a certain number of threads, there * is no more any speedup. That is when the amount of time spent in each CPU thread becomes less than the time * spent in the GPU for that thread's task, so whether the CPU spent time doing something before waiting for * the GPU doesn't make a difference in the execution time. */ retval = 0; /* Cleanup after yourself */ exit_bad_test_prepare: for (uint32_t i = 0; i < dev_count; ++i) tut4_free_test(&devs[i], &test_data[i]); exit_bad_pipeline: for (uint32_t i = 0; i < dev_count; ++i) tut3_destroy_pipeline(&devs[i], &pipelines[i]); exit_bad_shader: for (uint32_t i = 0; i < dev_count; ++i) tut3_free_shader(&devs[i], shaders[i]); exit_bad_setup: for (uint32_t i = 0; i < dev_count; ++i) tut2_cleanup(&devs[i]); exit_bad_enumerate: tut1_exit(vk); exit_bad_init: return retval; }
int main(int argc, char **argv) { tut1_error res; int retval = EXIT_FAILURE; VkInstance vk; struct tut1_physical_device phy_devs[MAX_DEVICES]; struct tut2_device devs[MAX_DEVICES]; uint32_t dev_count = MAX_DEVICES; VkShaderModule shaders[MAX_DEVICES] = {NULL}; struct tut3_pipelines pipelines[MAX_DEVICES]; if (argc < 2) { printf("Usage: %s shader_file\n\n", argv[0]); return EXIT_FAILURE; } /* Fire up Vulkan */ res = tut1_init(&vk); if (!tut1_error_is_success(&res)) { tut1_error_printf(&res, "Could not initialize Vulkan\n"); goto exit_bad_init; } /* Enumerate devices */ res = tut1_enumerate_devices(vk, phy_devs, &dev_count); if (tut1_error_is_error(&res)) { tut1_error_printf(&res, "Could not enumerate devices\n"); goto exit_bad_enumerate; } /* Set up devices */ for (uint32_t i = 0; i < dev_count; ++i) { res = tut2_setup(&phy_devs[i], &devs[i], VK_QUEUE_COMPUTE_BIT); if (!tut1_error_is_success(&res)) { tut1_error_printf(&res, "Could not setup logical device %u, command pools and queues\n", i); goto exit_bad_setup; } } /* Load our compute shader */ for (uint32_t i = 0; i < dev_count; ++i) { res = tut3_load_shader(&devs[i], argv[1], &shaders[i]); if (!tut1_error_is_success(&res)) { tut1_error_printf(&res, "Could not load shader on device %u\n", i); goto exit_bad_shader; } } printf("Loaded the shader, awesome!\n"); /* * Create the pipelines. There are as many pipelines created as command buffers (just for example). If * there are not actually enough resources for them, as many as possible are created. */ for (uint32_t i = 0; i < dev_count; ++i) tut3_make_compute_pipeline(&devs[i], &pipelines[i], shaders[i]); /* * Like tutorial 2, we have covered a lot of ground in this tutorial. Let's keep actual usage of our compute * shader to the next tutorial, where we would see the effect of multiple threads on the processing speed. 
*/ for (uint32_t i = 0; i < dev_count; ++i) { uint32_t count = 0; for (uint32_t j = 0; j < pipelines[i].pipeline_count; ++j) if (pipelines[i].pipelines[j].pipeline) ++count; printf("Created %u pipeline%s on device %u\n", count, count == 1?"":"s", i); } retval = 0; /* Cleanup after yourself */ for (uint32_t i = 0; i < dev_count; ++i) tut3_destroy_pipeline(&devs[i], &pipelines[i]); exit_bad_shader: for (uint32_t i = 0; i < dev_count; ++i) tut3_free_shader(&devs[i], shaders[i]); exit_bad_setup: for (uint32_t i = 0; i < dev_count; ++i) tut2_cleanup(&devs[i]); exit_bad_enumerate: tut1_exit(vk); exit_bad_init: return retval; }
/*
 * Tutorial 1 entry point: initialize Vulkan, enumerate the physical devices
 * and print a (non-exhaustive) report of each device's properties, queue
 * families and memory types — a miniature `vulkaninfo`.
 */
int main(int argc, char **argv)
{
	VkResult res;
	int retval = EXIT_FAILURE;
	VkInstance vk;
	struct tut1_physical_device devs[MAX_DEVICES];
	uint32_t dev_count = MAX_DEVICES;

	/* Fire up Vulkan */
	res = tut1_init(&vk);
	if (res)
	{
		printf("Could not initialize Vulkan: %s\n", tut1_VkResult_string(res));
		goto exit_bad_init;
	}

	printf("Vulkan is in the house.\n");

	/* Take a look at what devices there are */
	res = tut1_enumerate_devices(vk, devs, &dev_count);
	if (res < 0)
	{
		/* Negative VkResult values are errors; VK_INCOMPLETE (positive) is handled below */
		printf("Could not enumerate devices: %s\n", tut1_VkResult_string(res));
		goto exit_bad_enumerate;
	}
	else if (res == VK_INCOMPLETE)
	{
		/* More devices exist than fit in devs[MAX_DEVICES] */
		print_surprise("", "you've got", "devices", "dream of");
		printf("I have information on only %"PRIu32" of them:\n", dev_count);
	}
	else
		printf("I detected the following %"PRIu32" device%s:\n", dev_count, dev_count == 1?"":"s");

	/*
	 * Print out some of the information taken when enumerating physical devices. This is by no means an
	 * exhaustive printout, but to give you the idea.
	 */
	for (uint32_t i = 0; i < dev_count; ++i)
	{
		struct tut1_physical_device *dev = &devs[i];
		VkPhysicalDeviceProperties *pr = &dev->properties;

		printf(" - %s: %s (id: 0x%04X) from vendor 0x%04X [driver version: 0x%04X, API version: 0x%04X]\n",
				tut1_VkPhysicalDeviceType_string(pr->deviceType), pr->deviceName,
				pr->deviceID, pr->vendorID, pr->driverVersion, pr->apiVersion);

		if (dev->queue_families_incomplete)
		{
			/* Same VK_INCOMPLETE situation, but for this device's queue families */
			print_surprise(" ", "your device", "queue families", "imagine");
			printf(" I have information on only %"PRIu32" of them:\n", dev->queue_family_count);
		}
		else
			printf(" The device supports the following %"PRIu32" queue famil%s:\n",
					dev->queue_family_count, dev->queue_family_count == 1?"y":"ies");

		for (uint32_t j = 0; j < dev->queue_family_count; ++j)
		{
			VkQueueFamilyProperties *qf = &dev->queue_families[j];

			/*
			 * Plural selection: "capability" iff exactly one flag bit is set.
			 * `x && (x & (x - 1)) == 0` is the classic single-bit-set test.
			 */
			printf(" * %"PRIu32" queue%s with the following capabilit%s:\n", qf->queueCount,
					qf->queueCount == 1?"":"s",
					qf->queueFlags && (qf->queueFlags & (qf->queueFlags - 1)) == 0?"y":"ies");
			if (qf->queueFlags == 0)
				printf(" None\n");
			if ((qf->queueFlags & VK_QUEUE_GRAPHICS_BIT))
				printf(" Graphics\n");
			if ((qf->queueFlags & VK_QUEUE_COMPUTE_BIT))
				printf(" Compute\n");
			if ((qf->queueFlags & VK_QUEUE_TRANSFER_BIT))
				printf(" Transfer\n");
			if ((qf->queueFlags & VK_QUEUE_SPARSE_BINDING_BIT))
				printf(" Sparse binding\n");
		}

		printf(" The device supports memories of the following types:\n");
		for (uint32_t j = 0; j < dev->memories.memoryTypeCount; ++j)
		{
			printf(" *");
			/* A memory type may legitimately have no property flags at all */
			if (dev->memories.memoryTypes[j].propertyFlags == 0)
				printf(" <no properties>");
			if ((dev->memories.memoryTypes[j].propertyFlags & VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT))
				printf(" device-local");
			if ((dev->memories.memoryTypes[j].propertyFlags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT))
				printf(" host-visible");
			if ((dev->memories.memoryTypes[j].propertyFlags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT))
				printf(" host-coherent");
			if ((dev->memories.memoryTypes[j].propertyFlags & VK_MEMORY_PROPERTY_HOST_CACHED_BIT))
				printf(" host-cached");
			if ((dev->memories.memoryTypes[j].propertyFlags & VK_MEMORY_PROPERTY_LAZILY_ALLOCATED_BIT))
				printf(" lazy");
			/* Each memory type lives in one heap; report that heap's size in MB */
			printf(": Available in Heap of size %"PRIu64"MB\n",
					dev->memories.memoryHeaps[dev->memories.memoryTypes[j].heapIndex].size / (1024 * 1024));
		}
	}

	/* Congratulations, you can now duplicate the `vulkaninfo` program. */

	retval = 0;

	/* Cleanup after yourself (labels fall through intentionally) */
exit_bad_enumerate:
	tut1_exit(vk);
exit_bad_init:
	return retval;
}
static void render_loop(uint32_t dev_count, struct tut1_physical_device *phy_devs, struct tut2_device *devs, struct tut6_swapchain *swapchains) { int res; struct tut7_render_essentials essentials[dev_count]; /* Allocate render essentials. See this function in tut7_render.c for explanations. */ for (uint32_t i = 0; i < dev_count; ++i) { res = tut7_render_get_essentials(&essentials[i], &phy_devs[i], &devs[i], &swapchains[i]); if (res) { printf("-- failed for device %u\n", i); return; } } unsigned int frames = 0; time_t before = time(NULL); uint8_t color = 0; /* Process events from SDL and render. If process_events returns non-zero, it signals application exit. */ while (process_events() == 0) { /* * A simple imprecise FPS calculator. Try the --no-vsync option to this program to see the difference. * * On Linux, with Nvidia GTX 970, and Vulkan 1.0.8, --no-vsync got me about 12000 FPS. */ time_t now = time(NULL); if (now != before) { printf("%lds: %u frames\n", now - before, frames); frames = 0; before = now; } ++frames; /* * We are not yet ready to actually render something. For that, we would need descriptor sets and * pipelines, but we'll get to that soon. In tut7.c, we have a repository of functions to create * resources for the eventual rendering. Here, we'll ignore all that and do what we ignored in * Tutorial 6, and that is properly transitioning the swapchain images between "present src" and * something we can render to. With a graphics pipeline, we would want to transition to * "color attachment optimal". Since we don't have one, we are going to "clear" the screen which * doesn't need a graphics pipeline. In that case, the layout of the image should be GENERAL. 
*/ for (uint32_t i = 0; i < dev_count; ++i) { uint32_t image_index; /* * To render to an image and present it on the screen, the following sequence of operations * needs to be done: * * - acquire from swapchain * - transition to color attachment optimal * - render * - transition to present src * - present the image * * One way to implement this would be to call the corresponding functions one by one, wait and * make sure the image passes through each section, and repeat. The problem with this way is * that there is wasted time between each function call. Not that function call itself takes * measurable time, but the setup and finish times of each call, especially because we are * interacting with the GPU. * * Vulkan is made for parallelism and efficiency, so naturally it's not stupid in this regard! * There are different ways to do the above in parallel, and synchronize them. One nice thing * is that command buffers can call other secondary command buffers. So, while a small part of * the command buffer requires knowledge of which presentable image it is working with, the * majority of it doesn't, so they could be pre-recorded or recorded in parallel by other * threads. Another nice thing is that many of the functions work asynchronously, such as * submission to queue for rendering. This allows the CPU to go ahead with executing the rest * of the above algorithm, only wait for the GPU to finish rendering when it has to, and let * synchronization mechanisms take care of handling the flow of execution in the back. 
* * One could imagine different ways of doing things, but here is a simple example: * * - acquire from swapchain, signalling semaphore A * - wait on fence C (for previous frame to finish) * - create a command buffer with 1) first transition, 2) render, 3) second transition * - submit the command buffer with semaphore A waiting in the beginning and semaphore B * signalling the end, with fence C signalling the end as well * - present to swapchain, waiting on the second semaphore * * The significance of the fence above is the following. In Tutorial 6, we used `usleep` to * avoid busy looping. That was bad, because it put a hard limit and the frame rate. The * issue is not just busy looping though. Since the submissions to queues happen * asynchronously, we risk submitting work faster than the card can actually perform them, with * the result being that frames we send now are rendered much later, after all our previous * work is finished. This delay can easily become unacceptable; imagine a player has hit the * key to move forwards, you detect this and generate the next frame accordingly, but the * player doesn't actually see her character move forward while several older frames are still * being rendered. * * The location of the fence is chosen as such, to allow maximum overlap between GPU and CPU * work. In this case, while the GPU is still rendering, the CPU can wait for the swapchain * image to be acquired. The wait on the fence could not be delayed any further, because we * can't re-record a command buffer that is being executed. Interestingly, if we use two * command buffers and alternate between them, we could also wait for the fence later! Let's * not go that far yet. */ /* See this function in tut7_render.c for explanations */ res = tut7_render_start(&essentials[i], &devs[i], &swapchains[i], VK_IMAGE_LAYOUT_GENERAL, &image_index); if (res) { printf("-- failed for device %u\n", i); goto exit_fail; } /* * We did everything just to clear the image. 
Like I said, it's possible to clear an image * outside a pipeline. It is also possible to clear it inside a pipeline, so fear not! When * we have a graphics pipeline, we can transition the image directly to "color attachment * optimal" and clear it, and we don't have to first transition to "general" and then * transition again to "color attachment optimal". * * Clearing the image outside the pipeline is quite straightforward, and in fact has no notion * of the image being used for presentation later. It's just clearing a general image. * * The vkCmdClearColorImage takes the command buffer, the image, the layout the image is in * (which is "general", we just transitioned it), the color to clear the image with, and a set * of "subresources" to clear. We are going to clear everything, and we have just a single mip * level and a single array layer, so the subresource range to be cleared is similar to the * `subresourceRange` in image barrier. * * The clear color needs to be specified based on the format of the image. The * `VkClearColorValue` is a union which accepts RGBA values in float, uint32_t or int32_t, and * we should choose the appropriate field based on swapchains[i].surface_format.format. If we * weren't so lazy, we could write a simple lookup table that tells us which field to use for * each format, but luckily we are lazy, so let's assume `float` is good for now and hope it's * portable enough. * * For fun, let's change the background color on each frame! 
*/ VkImageSubresourceRange clear_subresource_range = { .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, .baseMipLevel = 0, .levelCount = 1, .baseArrayLayer = 0, .layerCount = 1, }; VkClearColorValue clear_color = { .float32 = {color, (color + 64) % 256 / 255.0f, (color + 128) % 256 / 255.0f, 1}, }; ++color; vkCmdClearColorImage(essentials[i].cmd_buffer, essentials[i].images[image_index], VK_IMAGE_LAYOUT_GENERAL, &clear_color, 1, &clear_subresource_range); /* See this function in tut7_render.c for explanations */ res = tut7_render_finish(&essentials[i], &devs[i], &swapchains[i], VK_IMAGE_LAYOUT_GENERAL, image_index); if (res) { printf("-- failed for device %u\n", i); goto exit_fail; } } } exit_fail: for (uint32_t i = 0; i < dev_count; ++i) tut7_render_cleanup_essentials(&essentials[i], &devs[i]); } int main(int argc, char **argv) { tut1_error res; int retval = EXIT_FAILURE; VkInstance vk; struct tut1_physical_device phy_devs[MAX_DEVICES]; struct tut2_device devs[MAX_DEVICES]; struct tut6_swapchain swapchains[MAX_DEVICES] = {0}; SDL_Window *windows[MAX_DEVICES] = {NULL}; uint32_t dev_count = MAX_DEVICES; bool no_vsync = false; for (int i = 1; i < argc; ++i) { if (strcmp(argv[1], "--help") == 0) { printf("Usage: %s [--no-vsync]\n\n", argv[0]); return 0; } if (strcmp(argv[1], "--no-vsync") == 0) no_vsync = true; } /* Fire up Vulkan */ res = tut6_init(&vk); if (!tut1_error_is_success(&res)) { tut1_error_printf(&res, "Could not initialize Vulkan\n"); goto exit_bad_init; } /* Enumerate devices */ res = tut1_enumerate_devices(vk, phy_devs, &dev_count); if (tut1_error_is_error(&res)) { tut1_error_printf(&res, "Could not enumerate devices\n"); goto exit_bad_enumerate; } /* Get logical devices and enable WSI extensions */ for (uint32_t i = 0; i < dev_count; ++i) { res = tut6_setup(&phy_devs[i], &devs[i], VK_QUEUE_GRAPHICS_BIT); if (tut1_error_is_error(&res)) { tut1_error_printf(&res, "Could not setup logical device %u, command pools and queues\n", i); goto exit_bad_setup; } } /* Set 
up SDL */ if (SDL_Init(SDL_INIT_VIDEO)) { printf("Could not initialize SDL: %s\n", SDL_GetError()); goto exit_bad_sdl; } for (uint32_t i = 0; i < dev_count; ++i) { char title[50]; snprintf(title, sizeof title, "Vk on device %u\n", i); windows[i] = SDL_CreateWindow(title, SDL_WINDOWPOS_UNDEFINED, SDL_WINDOWPOS_UNDEFINED, 1024, 768, 0); if (windows[i] == NULL) { printf("Could not create window #%u: %s\n", i + 1, SDL_GetError()); goto exit_bad_window; } } /* Get the surface and swapchain */ for (uint32_t i = 0; i < dev_count; ++i) { /* Let's still not bother with threads and use just 1 (the current thread) */ res = tut6_get_swapchain(vk, &phy_devs[i], &devs[i], &swapchains[i], windows[i], 1, no_vsync); if (tut1_error_is_error(&res)) { tut1_error_printf(&res, "Could not create surface and swapchain for device %u\n", i); goto exit_bad_swapchain; } } /* Render loop similar to Tutorial 6 */ render_loop(dev_count, phy_devs, devs, swapchains); retval = 0; /* Cleanup after yourself */ exit_bad_swapchain: for (uint32_t i = 0; i < dev_count; ++i) tut6_free_swapchain(vk, &devs[i], &swapchains[i]); exit_bad_window: for (uint32_t i = 0; i < dev_count; ++i) if (windows[i]) SDL_DestroyWindow(windows[i]); exit_bad_sdl: SDL_Quit(); exit_bad_setup: for (uint32_t i = 0; i < dev_count; ++i) tut2_cleanup(&devs[i]); exit_bad_enumerate: tut1_exit(vk); exit_bad_init: return retval; }