int main(int argc, char **argv)
{
    VkResult res;
    int retval = EXIT_FAILURE;
    VkInstance vk;
    struct tut1_physical_device devs[MAX_DEVICES];
    uint32_t dev_count = MAX_DEVICES;

    /* Fire up Vulkan */
    res = tut1_init(&vk);
    if (res)
    {
        printf("Could not initialize Vulkan: %s\n", tut1_VkResult_string(res));
        goto exit_bad_init;
    }

    printf("Vulkan is in the house.\n");

    /* Take a look at what devices there are */
    res = tut1_enumerate_devices(vk, devs, &dev_count);
    if (res < 0)
    {
        printf("Could not enumerate devices: %s\n", tut1_VkResult_string(res));
        goto exit_bad_enumerate;
    }
    else if (res == VK_INCOMPLETE)
    {
        print_surprise("", "you've got", "devices", "dream of");
        printf("I have information on only %"PRIu32" of them:\n", dev_count);
    }
    else
        printf("I detected the following %"PRIu32" device%s:\n", dev_count, dev_count == 1?"":"s");

    /*
     * Print out some of the information taken when enumerating physical devices.  This is by no means an
     * exhaustive printout, but to give you the idea.
     */
    for (uint32_t i = 0; i < dev_count; ++i)
    {
        struct tut1_physical_device *dev = &devs[i];
        VkPhysicalDeviceProperties *pr = &dev->properties;

        printf("  - %s: %s (id: 0x%04X) from vendor 0x%04X [driver version: 0x%04X, API version: 0x%04X]\n",
                tut1_VkPhysicalDeviceType_string(pr->deviceType), pr->deviceName,
                pr->deviceID, pr->vendorID, pr->driverVersion, pr->apiVersion);

        if (dev->queue_families_incomplete)
        {
            print_surprise("    ", "your device", "queue families", "imagine");
            printf("    I have information on only %"PRIu32" of them:\n", dev->queue_family_count);
        }
        else
            printf("    The device supports the following %"PRIu32" queue famil%s:\n",
                    dev->queue_family_count, dev->queue_family_count == 1?"y":"ies");

        for (uint32_t j = 0; j < dev->queue_family_count; ++j)
        {
            VkQueueFamilyProperties *qf = &dev->queue_families[j];

            printf("    * %"PRIu32" queue%s with the following capabilit%s:\n",
                    qf->queueCount, qf->queueCount == 1?"":"s",
                    qf->queueFlags && (qf->queueFlags & (qf->queueFlags - 1)) == 0?"y":"ies");
            if (qf->queueFlags == 0)
                printf("          None\n");
            if ((qf->queueFlags & VK_QUEUE_GRAPHICS_BIT))
                printf("          Graphics\n");
            if ((qf->queueFlags & VK_QUEUE_COMPUTE_BIT))
                printf("          Compute\n");
            if ((qf->queueFlags & VK_QUEUE_TRANSFER_BIT))
                printf("          Transfer\n");
            if ((qf->queueFlags & VK_QUEUE_SPARSE_BINDING_BIT))
                printf("          Sparse binding\n");
        }

        printf("    The device supports memories of the following types:\n");
        for (uint32_t j = 0; j < dev->memories.memoryTypeCount; ++j)
        {
            printf("    *");
            if (dev->memories.memoryTypes[j].propertyFlags == 0)
                printf(" <no properties>");
            if ((dev->memories.memoryTypes[j].propertyFlags & VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT))
                printf(" device-local");
            if ((dev->memories.memoryTypes[j].propertyFlags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT))
                printf(" host-visible");
            if ((dev->memories.memoryTypes[j].propertyFlags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT))
                printf(" host-coherent");
            if ((dev->memories.memoryTypes[j].propertyFlags & VK_MEMORY_PROPERTY_HOST_CACHED_BIT))
                printf(" host-cached");
            if ((dev->memories.memoryTypes[j].propertyFlags & VK_MEMORY_PROPERTY_LAZILY_ALLOCATED_BIT))
                printf(" lazy");
            printf(": Available in Heap of size %"PRIu64"MB\n",
                    dev->memories.memoryHeaps[dev->memories.memoryTypes[j].heapIndex].size / (1024 * 1024));
        }
    }

    /* Congratulations, you can now duplicate the `vulkaninfo` program. */

    retval = 0;

    /* Cleanup after yourself */

exit_bad_enumerate:
    tut1_exit(vk);

exit_bad_init:
    return retval;
}
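The printout above relies entirely on the data gathered by tut1_enumerate_devices(), whose implementation is not part of this listing. As a rough orientation, the sketch below shows how such a helper could be built on the core Vulkan 1.0 queries; the helper name, the physical_device field and the MAX_QUEUE_FAMILIES capacity are assumptions for illustration, while the properties, memories, queue_families, queue_family_count and queue_families_incomplete fields are the ones used by the printout.

/*
 * A minimal sketch, not the tutorial's actual implementation: fill one tut1_physical_device
 * per enumerated VkPhysicalDevice using the standard "get" calls.
 */
static VkResult enumerate_devices_sketch(VkInstance vk, struct tut1_physical_device *devs, uint32_t *count)
{
    VkPhysicalDevice phy_devs[MAX_DEVICES];

    /* *count comes in as the capacity of devs[] (MAX_DEVICES in main() above) */
    if (*count > MAX_DEVICES)
        *count = MAX_DEVICES;

    /* may return VK_INCOMPLETE if there are more devices than we have room for */
    VkResult res = vkEnumeratePhysicalDevices(vk, count, phy_devs);
    if (res < 0)
        return res;

    for (uint32_t i = 0; i < *count; ++i)
    {
        devs[i].physical_device = phy_devs[i];                       /* field name assumed */
        vkGetPhysicalDeviceProperties(phy_devs[i], &devs[i].properties);
        vkGetPhysicalDeviceMemoryProperties(phy_devs[i], &devs[i].memories);

        /* first ask how many queue families there are, then fetch as many as fit */
        uint32_t qf_count = 0;
        vkGetPhysicalDeviceQueueFamilyProperties(phy_devs[i], &qf_count, NULL);
        devs[i].queue_families_incomplete = qf_count > MAX_QUEUE_FAMILIES;   /* capacity assumed */
        if (qf_count > MAX_QUEUE_FAMILIES)
            qf_count = MAX_QUEUE_FAMILIES;
        vkGetPhysicalDeviceQueueFamilyProperties(phy_devs[i], &qf_count, devs[i].queue_families);
        devs[i].queue_family_count = qf_count;
    }

    return res;
}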
int main(int argc, char **argv)
{
    tut1_error res;
    int retval = EXIT_FAILURE;
    VkInstance vk;
    struct tut1_physical_device phy_devs[MAX_DEVICES];
    struct tut2_device devs[MAX_DEVICES];
    uint32_t dev_count = MAX_DEVICES;
    VkShaderModule shaders[MAX_DEVICES] = {NULL};
    struct tut3_pipelines pipelines[MAX_DEVICES];
    struct tut4_data test_data[MAX_DEVICES];
    int success = 0;

    /* How many threads to do the work on */
    size_t thread_count = 8;
    /* Whether the threads should take some CPU time as well */
    bool busy_threads = false;
    /* Default to 1MB of buffer data to work on */
    size_t buffer_size = 1024 * 1024 / sizeof(float);

    bool bad_args = false;
    if (argc < 2)
        bad_args = true;
    if (argc > 2 && sscanf(argv[2], "%zu", &thread_count) != 1)
        bad_args = true;
    if (argc > 3)
    {
        int temp;
        if (sscanf(argv[3], "%d", &temp) != 1)
            bad_args = true;
        else
            busy_threads = temp;
    }
    if (argc > 4)
    {
        if (sscanf(argv[4], "%zu", &buffer_size) != 1)
            bad_args = true;
        else
            buffer_size /= sizeof(float);
    }

    if (bad_args)
    {
        printf("Usage: %s shader_file [thread_count(8) [busy_threads(0) [buffer_size(1MB)]]]\n\n", argv[0]);
        return EXIT_FAILURE;
    }

    /* Fire up Vulkan */
    res = tut1_init(&vk);
    if (!tut1_error_is_success(&res))
    {
        tut1_error_printf(&res, "Could not initialize Vulkan\n");
        goto exit_bad_init;
    }

    /* Enumerate devices */
    res = tut1_enumerate_devices(vk, phy_devs, &dev_count);
    if (tut1_error_is_error(&res))
    {
        tut1_error_printf(&res, "Could not enumerate devices\n");
        goto exit_bad_enumerate;
    }

    /* Set up devices */
    for (uint32_t i = 0; i < dev_count; ++i)
    {
        res = tut2_setup(&phy_devs[i], &devs[i], VK_QUEUE_COMPUTE_BIT);
        if (!tut1_error_is_success(&res))
        {
            tut1_error_printf(&res, "Could not setup logical device %u, command pools and queues\n", i);
            goto exit_bad_setup;
        }
    }

    /* Load our compute shader */
    for (uint32_t i = 0; i < dev_count; ++i)
    {
        res = tut3_load_shader(&devs[i], argv[1], &shaders[i]);
        if (!tut1_error_is_success(&res))
        {
            tut1_error_printf(&res, "Could not load shader on device %u\n", i);
            goto exit_bad_shader;
        }
    }

    /*
     * Create the pipelines.  There are as many pipelines created as command buffers (just for example).  If
     * there are not actually enough resources for them, as many as possible are created.  In this test, we are
     * not going to handle the case where some pipelines are not created.
     */
    for (uint32_t i = 0; i < dev_count; ++i)
    {
        res = tut3_make_compute_pipeline(&devs[i], &pipelines[i], shaders[i]);
        if (!tut1_error_is_success(&res))
        {
            tut1_error_printf(&res, "Could not allocate enough pipelines on device %u\n", i);
            goto exit_bad_pipeline;
        }
    }

    /*
     * Prepare our test.  Both the buffers and threads are divided near-equally among the physical devices,
     * of which there is likely just one in your case, but who knows.
     */
    for (uint32_t i = 0; i < dev_count; ++i)
    {
        size_t this_buffer_size = buffer_size / dev_count;
        size_t this_thread_count = thread_count / dev_count;

        /* Make sure the last device gets all the left-over */
        if (i == dev_count - 1)
        {
            this_buffer_size = buffer_size - buffer_size / dev_count * (dev_count - 1);
            this_thread_count = thread_count - thread_count / dev_count * (dev_count - 1);
        }

        res = tut4_prepare_test(&phy_devs[i], &devs[i], &pipelines[i], &test_data[i], this_buffer_size, this_thread_count);
        if (!tut1_error_is_success(&res))
        {
            tut1_error_printf(&res, "Could not allocate resources on device %u\n", i);
            goto exit_bad_test_prepare;
        }
    }

    /*
     * Ok, this was a LOT of initializing!  But we are finally ready to run something.  tut4_start_test() creates
     * a test thread for us, which further spawns the corresponding device's thread_count threads that do the
     * calculations.  We then wait for the tests to finish with tut4_wait_test_end().
     */
    for (uint32_t i = 0; i < dev_count; ++i)
    {
        if (tut4_start_test(&test_data[i], busy_threads))
        {
            printf("Could not start the test threads for device %u\n", i);
            perror("Error");
        }
    }

    printf("Running the tests...\n");

    for (uint32_t i = 0; i < dev_count; ++i)
        tut4_wait_test_end(&test_data[i]);

    success = 1;
    for (uint32_t i = 0; i < dev_count; ++i)
        if (!test_data[i].success)
        {
            if (!tut1_error_is_success(&test_data[i].error))
                tut1_error_printf(&test_data[i].error, "Error starting test on device %u\n", i);
            else
                printf("The test didn't produce expected results (device %u)\n", i);
            success = 0;
        }

    if (success)
        printf("Everything went well :) We just wasted your GPU doing something stupid\n");

    /*
     * You can time the execution of the program with time(1):
     *
     * $ time ./tut4/tut4 shaders/tut3.comp.spv <threads> ...
     *
     * Then try playing with different numbers of threads and see if the total execution time of the application
     * changes and how!
     *
     * ...
     *
     * Did you try that?  Already?  Well, that was disappointing.  More threads probably resulted in higher
     * execution time, right?  That actually makes sense.  You see, we have N data to compute, and whether you tell
     * the GPU to do N computations from one thread, or N/T computations each from T threads, you aren't actually
     * doing any less computation.  You probably just have more overhead from the threads.
     *
     * So what's the deal with multi-threading and Vulkan?  Well, the problem is that this test was heavily
     * GPU-bound, and as you have noticed, adding CPU threads doesn't help there.  For this reason, this test has a
     * little feature to "fake" some execution on the CPU threads as well.  If you run the program like this:
     *
     * $ time ./tut4/tut4 shaders/tut3.comp.spv <threads> <fake> ...
     *
     * where <fake> can be either 0 (no CPU usage) or 1 (some fake CPU usage), and then experiment with different
     * numbers of threads, you can see the benefit of multi-threading.  In this case, while the GPU is working, the
     * CPU thread spends time fake-doing something.  If there is only one thread, the CPU cannot keep the GPU
     * constantly busy, so the computation slows down.  On the other hand, with multiple threads, the same amount
     * of CPU work is spread out and done in parallel, so the threads together can feed the GPU with instructions
     * faster.
     *
     * In this test, the total amount of time to waste is 3.2 seconds (32ms for each "render" operation, and there
     * are a hundred of them).  Depending on your GPU, you may notice that above a certain number of threads, there
     * is no longer any speedup.  That is when the amount of time spent in each CPU thread becomes less than the time
     * spent in the GPU for that thread's task, so whether the CPU spent time doing something before waiting for
     * the GPU doesn't make a difference in the execution time.
     */
    retval = 0;

    /* Cleanup after yourself */

exit_bad_test_prepare:
    for (uint32_t i = 0; i < dev_count; ++i)
        tut4_free_test(&devs[i], &test_data[i]);

exit_bad_pipeline:
    for (uint32_t i = 0; i < dev_count; ++i)
        tut3_destroy_pipeline(&devs[i], &pipelines[i]);

exit_bad_shader:
    for (uint32_t i = 0; i < dev_count; ++i)
        tut3_free_shader(&devs[i], shaders[i]);

exit_bad_setup:
    for (uint32_t i = 0; i < dev_count; ++i)
        tut2_cleanup(&devs[i]);

exit_bad_enumerate:
    tut1_exit(vk);

exit_bad_init:
    return retval;
}
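tut4_start_test() and tut4_wait_test_end() themselves are not part of this listing. The sketch below is only an assumption about the general shape such a pair could take on top of pthreads: the work is split near-equally among thread_count workers, with the last worker taking the left-over, mirroring how main() above splits buffers and threads among devices. The worker_sketch struct and all its fields are hypothetical, not the real tut4_data layout.

#include <pthread.h>
#include <stdbool.h>
#include <stddef.h>

/* Hypothetical per-worker state for illustration only */
struct worker_sketch
{
    pthread_t thread;
    size_t start, count;   /* slice of the buffer this worker is responsible for */
    bool busy;             /* whether to also burn some CPU time, as discussed above */
};

static void *worker_main_sketch(void *arg)
{
    struct worker_sketch *w = arg;
    /* Here the real code would record and submit command buffers covering
     * w->start .. w->start + w->count, optionally wasting some CPU time if w->busy. */
    (void)w;
    return NULL;
}

/* Returns non-zero on failure so the caller can report it, mirroring how main()
 * treats tut4_start_test() above. */
static int start_test_sketch(struct worker_sketch *workers, size_t thread_count, size_t buffer_size, bool busy)
{
    size_t per_thread = buffer_size / thread_count;

    for (size_t t = 0; t < thread_count; ++t)
    {
        workers[t].start = t * per_thread;
        workers[t].count = t == thread_count - 1 ? buffer_size - t * per_thread : per_thread;
        workers[t].busy = busy;
        if (pthread_create(&workers[t].thread, NULL, worker_main_sketch, &workers[t]))
            return -1;
    }
    return 0;
}

static void wait_test_end_sketch(struct worker_sketch *workers, size_t thread_count)
{
    for (size_t t = 0; t < thread_count; ++t)
        pthread_join(workers[t].thread, NULL);
}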
int main(int argc, char **argv)
{
    tut1_error res;
    int retval = EXIT_FAILURE;
    VkInstance vk;
    struct tut1_physical_device phy_devs[MAX_DEVICES];
    struct tut2_device devs[MAX_DEVICES];
    uint32_t dev_count = MAX_DEVICES;
    VkShaderModule shaders[MAX_DEVICES] = {NULL};
    struct tut3_pipelines pipelines[MAX_DEVICES];

    if (argc < 2)
    {
        printf("Usage: %s shader_file\n\n", argv[0]);
        return EXIT_FAILURE;
    }

    /* Fire up Vulkan */
    res = tut1_init(&vk);
    if (!tut1_error_is_success(&res))
    {
        tut1_error_printf(&res, "Could not initialize Vulkan\n");
        goto exit_bad_init;
    }

    /* Enumerate devices */
    res = tut1_enumerate_devices(vk, phy_devs, &dev_count);
    if (tut1_error_is_error(&res))
    {
        tut1_error_printf(&res, "Could not enumerate devices\n");
        goto exit_bad_enumerate;
    }

    /* Set up devices */
    for (uint32_t i = 0; i < dev_count; ++i)
    {
        res = tut2_setup(&phy_devs[i], &devs[i], VK_QUEUE_COMPUTE_BIT);
        if (!tut1_error_is_success(&res))
        {
            tut1_error_printf(&res, "Could not setup logical device %u, command pools and queues\n", i);
            goto exit_bad_setup;
        }
    }

    /* Load our compute shader */
    for (uint32_t i = 0; i < dev_count; ++i)
    {
        res = tut3_load_shader(&devs[i], argv[1], &shaders[i]);
        if (!tut1_error_is_success(&res))
        {
            tut1_error_printf(&res, "Could not load shader on device %u\n", i);
            goto exit_bad_shader;
        }
    }

    printf("Loaded the shader, awesome!\n");

    /*
     * Create the pipelines.  There are as many pipelines created as command buffers (just for example).  If
     * there are not actually enough resources for them, as many as possible are created.
     */
    for (uint32_t i = 0; i < dev_count; ++i)
        tut3_make_compute_pipeline(&devs[i], &pipelines[i], shaders[i]);

    /*
     * Like tutorial 2, we have covered a lot of ground in this tutorial.  Let's leave actual usage of our compute
     * shader to the next tutorial, where we will see the effect of multiple threads on the processing speed.
     */
    for (uint32_t i = 0; i < dev_count; ++i)
    {
        uint32_t count = 0;
        for (uint32_t j = 0; j < pipelines[i].pipeline_count; ++j)
            if (pipelines[i].pipelines[j].pipeline)
                ++count;
        printf("Created %u pipeline%s on device %u\n", count, count == 1?"":"s", i);
    }

    retval = 0;

    /* Cleanup after yourself */

    for (uint32_t i = 0; i < dev_count; ++i)
        tut3_destroy_pipeline(&devs[i], &pipelines[i]);

exit_bad_shader:
    for (uint32_t i = 0; i < dev_count; ++i)
        tut3_free_shader(&devs[i], shaders[i]);

exit_bad_setup:
    for (uint32_t i = 0; i < dev_count; ++i)
        tut2_cleanup(&devs[i]);

exit_bad_enumerate:
    tut1_exit(vk);

exit_bad_init:
    return retval;
}
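tut3_make_compute_pipeline() is where the real work happens above, but it is not part of this listing. For orientation, the core Vulkan calls involved in building a compute pipeline look roughly like the sketch below. This is not the tutorial's implementation, which also has to deal with descriptor set layouts and one pipeline per command buffer; it is just the bare minimum the API requires, using an empty pipeline layout and the VkShaderModule obtained earlier.

/*
 * A minimal sketch: one compute pipeline from one shader module, with an empty pipeline
 * layout.  A useful shader would also need descriptor set layouts for its buffers; the
 * tutorial's own tut3_make_compute_pipeline() is assumed to handle all of that.
 */
static VkResult make_compute_pipeline_sketch(VkDevice device, VkShaderModule shader,
        VkPipelineLayout *layout, VkPipeline *pipeline)
{
    VkPipelineLayoutCreateInfo layout_info = {
        .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
        /* no descriptor set layouts and no push constants in this sketch */
    };
    VkResult res = vkCreatePipelineLayout(device, &layout_info, NULL, layout);
    if (res)
        return res;

    VkComputePipelineCreateInfo pipeline_info = {
        .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
        .stage = {
            .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
            .stage = VK_SHADER_STAGE_COMPUTE_BIT,
            .module = shader,
            .pName = "main",      /* entry point of the compute shader */
        },
        .layout = *layout,
    };
    return vkCreateComputePipelines(device, VK_NULL_HANDLE, 1, &pipeline_info, NULL, pipeline);
}

Tearing this down would use vkDestroyPipeline() and vkDestroyPipelineLayout() in reverse order, which is presumably part of what tut3_destroy_pipeline() above takes care of.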