Ejemplo n.º 1
0
Archivo: main.c Proyecto: ShabbyX/vktut
/*
 * Entry point of the multi-threaded compute test.
 *
 * Command line (see the usage string below):
 *   argv[1]  path of the compute shader file to load (required)
 *   argv[2]  number of threads to do the work on (optional, default 8)
 *   argv[3]  non-zero to make the threads take some CPU time as well (optional, default 0)
 *   argv[4]  size of the buffer data in bytes (optional, default 1MB)
 *
 * Returns 0 only if every device's test was started and reported success; EXIT_FAILURE otherwise (bad
 * arguments, any initialization failure, a test that could not be started or a test that failed).
 */
int main(int argc, char **argv)
{
	tut1_error res;
	int retval = EXIT_FAILURE;
	VkInstance vk;
	struct tut1_physical_device phy_devs[MAX_DEVICES];
	struct tut2_device devs[MAX_DEVICES];
	uint32_t dev_count = MAX_DEVICES;
	VkShaderModule shaders[MAX_DEVICES] = {NULL};
	struct tut3_pipelines pipelines[MAX_DEVICES];
	struct tut4_data test_data[MAX_DEVICES];
	/* Remembers for which devices tut4_start_test() succeeded, so only those are waited on and inspected */
	bool started[MAX_DEVICES] = {false};
	int success = 0;

	/* How many threads to do the work on */
	size_t thread_count = 8;
	/* Whether the threads should take some CPU time as well */
	bool busy_threads = false;
	/* Default to 1MB of buffer data to work on */
	size_t buffer_size = 1024 * 1024 / sizeof(float);

	bool bad_args = false;
	if (argc < 2)
		bad_args = true;
	if (argc > 2 && sscanf(argv[2], "%zu", &thread_count) != 1)
		bad_args = true;
	if (argc > 3)
	{
		int temp;
		if (sscanf(argv[3], "%d", &temp) != 1)
			bad_args = true;
		else
			busy_threads = temp;
	}
	if (argc > 4)
	{
		if (sscanf(argv[4], "%zu", &buffer_size) != 1)
			bad_args = true;
		else
			buffer_size /= sizeof(float);	/* the argument is in bytes, the buffer is counted in floats */
	}

	if (bad_args)
	{
		printf("Usage: %s shader_file [thread_count(8) [busy_threads(0) [buffer_size(1MB)]]]\n\n", argv[0]);
		return EXIT_FAILURE;
	}

	/* Fire up Vulkan */
	res = tut1_init(&vk);
	if (!tut1_error_is_success(&res))
	{
		tut1_error_printf(&res, "Could not initialize Vulkan\n");
		goto exit_bad_init;
	}

	/* Enumerate devices */
	res = tut1_enumerate_devices(vk, phy_devs, &dev_count);
	if (tut1_error_is_error(&res))
	{
		tut1_error_printf(&res, "Could not enumerate devices\n");
		goto exit_bad_enumerate;
	}

	/* Set up devices */
	for (uint32_t i = 0; i < dev_count; ++i)
	{
		res = tut2_setup(&phy_devs[i], &devs[i], VK_QUEUE_COMPUTE_BIT);
		if (!tut1_error_is_success(&res))
		{
			tut1_error_printf(&res, "Could not setup logical device %u, command pools and queues\n", i);
			goto exit_bad_setup;
		}
	}

	/* Load our compute shader */
	for (uint32_t i = 0; i < dev_count; ++i)
	{
		res = tut3_load_shader(&devs[i], argv[1], &shaders[i]);
		if (!tut1_error_is_success(&res))
		{
			tut1_error_printf(&res, "Could not load shader on device %u\n", i);
			goto exit_bad_shader;
		}
	}

	/*
	 * Create the pipelines.  There are as many pipelines created as command buffers (just for example).  If
	 * there are not actually enough resources for them, as many as possible are created.  In this test, we are
	 * not going to handle the case where some pipelines are not created.
	 */
	for (uint32_t i = 0; i < dev_count; ++i)
	{
		res = tut3_make_compute_pipeline(&devs[i], &pipelines[i], shaders[i]);
		if (!tut1_error_is_success(&res))
		{
			tut1_error_printf(&res, "Could not allocate enough pipelines on device %u\n", i);
			goto exit_bad_pipeline;
		}
	}

	/*
	 * Prepare our test.  Both the buffers and threads are divided near-equally among the physical devices, which
	 * are likely to be just 1 in your case, but who knows.
	 */
	for (uint32_t i = 0; i < dev_count; ++i)
	{
		size_t this_buffer_size = buffer_size / dev_count;
		size_t this_thread_count = thread_count / dev_count;

		/* Make sure the last device gets all the left-over */
		if (i == dev_count - 1)
		{
			this_buffer_size = buffer_size - buffer_size / dev_count * (dev_count - 1);
			this_thread_count = thread_count - thread_count / dev_count * (dev_count - 1);
		}

		res = tut4_prepare_test(&phy_devs[i], &devs[i], &pipelines[i], &test_data[i], this_buffer_size, this_thread_count);
		if (!tut1_error_is_success(&res))
		{
			tut1_error_printf(&res, "Could not allocate resources on device %u\n", i);
			goto exit_bad_test_prepare;
		}
	}

	/*
	 * Ok, this was a LOT of initializing!  But we are finally ready to run something.  tut4_start_test() creates
	 * a test thread for us, which further spawns the corresponding device's thread_count threads that do the
	 * calculations.  We then wait for the tests to finish with tut4_wait_test_end().
	 */
	for (uint32_t i = 0; i < dev_count; ++i)
	{
		if (tut4_start_test(&test_data[i], busy_threads))
		{
			printf("Could not start the test threads for device %u\n", i);
			perror("Error");
			/* Leave started[i] false: there is nothing to wait for and no result to read on this device */
			continue;
		}
		started[i] = true;
	}

	printf("Running the tests...\n");

	/*
	 * NOTE(review): assumes tut4_wait_test_end() is only meaningful for a test that was actually started --
	 * confirm against its implementation.
	 */
	for (uint32_t i = 0; i < dev_count; ++i)
		if (started[i])
			tut4_wait_test_end(&test_data[i]);

	success = 1;
	for (uint32_t i = 0; i < dev_count; ++i)
	{
		if (!started[i])
		{
			/* The start failure was already reported above; it still makes the whole run a failure */
			success = 0;
			continue;
		}

		if (!test_data[i].success)
		{
			if (!tut1_error_is_success(&test_data[i].error))
				tut1_error_printf(&test_data[i].error, "Error starting test on device %u\n", i);
			else
				printf("The test didn't produce expected results (device %u)\n", i);
			success = 0;
		}
	}

	if (success)
		printf("Everything went well :) We just wasted your GPU doing something stupid\n");

	/*
	 * You can time the execution of the program with time(1):
	 *
	 *     $ time ./tut4/tut4 shaders/tut3.comp.spv <threads> ...
	 *
	 * Then try to play with different number of threads and see if the total execution time of the application
	 * changes and how!
	 *
	 * ...
	 *
	 * Did you try that?  Already?  Well, that was disappointing.  More threads probably resulted in higher
	 * execution time, right?  That actually makes sense.  You see, we have N data to compute, and whether you tell
	 * the GPU to do N computations from one thread, or N/T computations each from T threads, you aren't actually
	 * doing any less computation.  You probably just have more overhead from the threads.
	 *
	 * So what's the deal with multi-threaded and Vulkan?  Well, the problem is that this test was heavily
	 * GPU-bound, and as you have noticed, multi-CPU-threaded doesn't help.  For this reason, this test has a
	 * little feature to "fake" some execution on the CPU threads as well.  If you run the program like this:
	 *
	 *     $ time ./tut4/tut4 shaders/tut3.comp.spv <threads> <fake> ...
	 *
	 * where <fake> can be either 0 (no CPU usage) or 1 (some fake CPU usage), and then experiment with different
	 * number of threads, you can see the benefit of multi-threading.  In this case, while the GPU is working, the
	 * CPU thread spends time fake-doing something.  If there is only one thread, the CPU cannot keep the GPU
	 * constantly busy, so the computation slows down.  On the other hand, with multiple threads, the same amount
	 * of CPU work is spread out and done in parallel, so the threads together can feed the GPU with instructions
	 * faster.
	 *
	 * In this test, the total amount of time to waste is 3.2 seconds (32ms for each "render" operation, and there
	 * are a hundred of them).  Depending on your GPU, you may notice that above a certain number of threads, there
	 * is no more any speedup.  That is when the amount of time spent in each CPU thread becomes less than the time
	 * spent in the GPU for that thread's task, so whether the CPU spent time doing something before waiting for
	 * the GPU doesn't make a difference in the execution time.
	 */

	/* Report success to the caller only if the tests really succeeded; retval is EXIT_FAILURE otherwise */
	if (success)
		retval = 0;

	/* Cleanup after yourself */

exit_bad_test_prepare:
	for (uint32_t i = 0; i < dev_count; ++i)
		tut4_free_test(&devs[i], &test_data[i]);

exit_bad_pipeline:
	for (uint32_t i = 0; i < dev_count; ++i)
		tut3_destroy_pipeline(&devs[i], &pipelines[i]);

exit_bad_shader:
	for (uint32_t i = 0; i < dev_count; ++i)
		tut3_free_shader(&devs[i], shaders[i]);

exit_bad_setup:
	for (uint32_t i = 0; i < dev_count; ++i)
		tut2_cleanup(&devs[i]);

exit_bad_enumerate:
	tut1_exit(vk);

exit_bad_init:
	return retval;
}
Ejemplo n.º 2
0
Archivo: main.c Proyecto: ShabbyX/vktut
/*
 * Loads the shader file named by argv[1] on every enumerated device, asks for compute pipelines to be built
 * from it and reports how many pipelines each device ended up with.
 *
 * Returns 0 once the report has been printed, EXIT_FAILURE on a missing argument or on any initialization,
 * device setup or shader loading failure.
 */
int main(int argc, char **argv)
{
	tut1_error res;
	int retval = EXIT_FAILURE;
	VkInstance vk;
	struct tut1_physical_device phy_devs[MAX_DEVICES];
	struct tut2_device devs[MAX_DEVICES];
	uint32_t dev_count = MAX_DEVICES;
	VkShaderModule shaders[MAX_DEVICES] = {NULL};
	struct tut3_pipelines pipelines[MAX_DEVICES];
	uint32_t d;

	/* The shader file is the one mandatory argument */
	if (argc <= 1)
	{
		printf("Usage: %s shader_file\n\n", argv[0]);
		return EXIT_FAILURE;
	}

	/* Bring up the Vulkan instance */
	res = tut1_init(&vk);
	if (!tut1_error_is_success(&res))
	{
		tut1_error_printf(&res, "Could not initialize Vulkan\n");
		goto exit_bad_init;
	}

	/* Find out which physical devices exist; dev_count is updated by the call */
	res = tut1_enumerate_devices(vk, phy_devs, &dev_count);
	if (tut1_error_is_error(&res))
	{
		tut1_error_printf(&res, "Could not enumerate devices\n");
		goto exit_bad_enumerate;
	}

	/* One logical device per physical device, requesting compute queues */
	d = 0;
	while (d < dev_count)
	{
		res = tut2_setup(&phy_devs[d], &devs[d], VK_QUEUE_COMPUTE_BIT);
		if (!tut1_error_is_success(&res))
		{
			tut1_error_printf(&res, "Could not setup logical device %u, command pools and queues\n", d);
			goto exit_bad_setup;
		}
		++d;
	}

	/* The same shader file is loaded once per device */
	d = 0;
	while (d < dev_count)
	{
		res = tut3_load_shader(&devs[d], argv[1], &shaders[d]);
		if (!tut1_error_is_success(&res))
		{
			tut1_error_printf(&res, "Could not load shader on device %u\n", d);
			goto exit_bad_shader;
		}
		++d;
	}

	printf("Loaded the shader, awesome!\n");

	/*
	 * Build the pipelines.  As many pipelines are requested as there are command buffers (just for example), and
	 * if resources run short as many as possible are created.  The return value is not checked here; the
	 * pipelines that did get created are counted right below instead.
	 */
	for (d = 0; d < dev_count; ++d)
		tut3_make_compute_pipeline(&devs[d], &pipelines[d], shaders[d]);

	/*
	 * As in tutorial 2, a lot of ground was covered already.  Actually using the compute shader is left for the
	 * next tutorial, which looks at the effect of multiple threads on the processing speed.  Here we only count
	 * the pipelines whose handle is set.
	 */
	for (d = 0; d < dev_count; ++d)
	{
		uint32_t created = 0;
		uint32_t p = 0;

		while (p < pipelines[d].pipeline_count)
		{
			if (pipelines[d].pipelines[p].pipeline)
				created++;
			p++;
		}

		printf("Created %u pipeline%s on device %u\n", created, created == 1?"":"s", d);
	}

	retval = 0;

	/* Tear everything down in reverse order of creation; the labels below are entry points for failures */

	for (d = 0; d < dev_count; ++d)
		tut3_destroy_pipeline(&devs[d], &pipelines[d]);

exit_bad_shader:
	for (d = 0; d < dev_count; ++d)
		tut3_free_shader(&devs[d], shaders[d]);

exit_bad_setup:
	for (d = 0; d < dev_count; ++d)
		tut2_cleanup(&devs[d]);

exit_bad_enumerate:
	tut1_exit(vk);

exit_bad_init:
	return retval;
}
Ejemplo n.º 3
0
Archivo: main.c Proyecto: barotto/vktut
int main(int argc, char **argv)
{
	VkResult res;
	int retval = EXIT_FAILURE;
	VkInstance vk;
	struct tut1_physical_device devs[MAX_DEVICES];
	uint32_t dev_count = MAX_DEVICES;

	/* Fire up Vulkan */
	res = tut1_init(&vk);
	if (res)
	{
		printf("Could not initialize Vulkan: %s\n", tut1_VkResult_string(res));
		goto exit_bad_init;
	}

	printf("Vulkan is in the house.\n");

	/* Take a look at what devices there are */
	res = tut1_enumerate_devices(vk, devs, &dev_count);
	if (res < 0)
	{
		printf("Could not enumerate devices: %s\n", tut1_VkResult_string(res));
		goto exit_bad_enumerate;
	}
	else if (res == VK_INCOMPLETE)
	{
		print_surprise("", "you've got", "devices", "dream of");
		printf("I have information on only %"PRIu32" of them:\n", dev_count);
	}
	else
		printf("I detected the following %"PRIu32" device%s:\n", dev_count, dev_count == 1?"":"s");

	/*
	 * Print out some of the information taken when enumerating physical devices.  This is by no means an
	 * exhaustive printout, but to give you the idea.
	 */
	for (uint32_t i = 0; i < dev_count; ++i)
	{
		struct tut1_physical_device *dev = &devs[i];
		VkPhysicalDeviceProperties *pr = &dev->properties;

		printf("  - %s: %s (id: 0x%04X) from vendor 0x%04X [driver version: 0x%04X, API version: 0x%04X]\n",
				tut1_VkPhysicalDeviceType_string(pr->deviceType), pr->deviceName,
				pr->deviceID, pr->vendorID, pr->driverVersion, pr->apiVersion);
		if (dev->queue_families_incomplete)
		{
			print_surprise("    ", "your device", "queue families", "imagine");
			printf("    I have information on only %"PRIu32" of them:\n", dev->queue_family_count);
		}
		else
			printf("    The device supports the following %"PRIu32" queue famil%s:\n", dev->queue_family_count, dev->queue_family_count == 1?"y":"ies");

		for (uint32_t j = 0; j < dev->queue_family_count; ++j)
		{
			VkQueueFamilyProperties *qf = &dev->queue_families[j];

			printf("    * %"PRIu32" queue%s with the following capabilit%s:\n", qf->queueCount, qf->queueCount == 1?"":"s",
					qf->queueFlags && (qf->queueFlags & (qf->queueFlags - 1)) == 0?"y":"ies");
			if (qf->queueFlags == 0)
				printf("          None\n");
			if ((qf->queueFlags & VK_QUEUE_GRAPHICS_BIT))
				printf("          Graphics\n");
			if ((qf->queueFlags & VK_QUEUE_COMPUTE_BIT))
				printf("          Compute\n");
			if ((qf->queueFlags & VK_QUEUE_TRANSFER_BIT))
				printf("          Transfer\n");
			if ((qf->queueFlags & VK_QUEUE_SPARSE_BINDING_BIT))
				printf("          Sparse binding\n");
		}

		printf("    The device supports memories of the following types:\n");
		for (uint32_t j = 0; j < dev->memories.memoryTypeCount; ++j)
		{
			printf("    *");
			if (dev->memories.memoryTypes[j].propertyFlags == 0)
				printf(" <no properties>");
			if ((dev->memories.memoryTypes[j].propertyFlags & VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT))
				printf(" device-local");
			if ((dev->memories.memoryTypes[j].propertyFlags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT))
				printf(" host-visible");
			if ((dev->memories.memoryTypes[j].propertyFlags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT))
				printf(" host-coherent");
			if ((dev->memories.memoryTypes[j].propertyFlags & VK_MEMORY_PROPERTY_HOST_CACHED_BIT))
				printf(" host-cached");
			if ((dev->memories.memoryTypes[j].propertyFlags & VK_MEMORY_PROPERTY_LAZILY_ALLOCATED_BIT))
				printf(" lazy");
			printf(": Available in Heap of size %"PRIu64"MB\n", dev->memories.memoryHeaps[dev->memories.memoryTypes[j].heapIndex].size / (1024 * 1024));
		}
	}

	/* Congratulations, you can now duplicate the `vulkaninfo` program. */

	retval = 0;

	/* Cleanup after yourself */

exit_bad_enumerate:
	tut1_exit(vk);

exit_bad_init:
	return retval;
}
Ejemplo n.º 4
0
Archivo: main.c Proyecto: ShabbyX/vktut
/*
 * Runs the event/render loop: on every iteration, each device's swapchain image is acquired, cleared to a
 * colour that changes from frame to frame, and presented.  A rough frames-per-second figure is printed
 * whenever the wall-clock second changes.
 *
 * dev_count   number of entries in the three arrays below
 * phy_devs    physical devices, one per entry
 * devs        logical devices, one per entry
 * swapchains  swapchains, one per entry
 *
 * The loop ends when process_events() returns non-zero or when rendering fails on any device.  Render
 * essentials obtained here are released before returning, whichever way the function exits.
 */
static void render_loop(uint32_t dev_count, struct tut1_physical_device *phy_devs, struct tut2_device *devs, struct tut6_swapchain *swapchains)
{
	int res;

	/* A variable length array must have a size greater than zero; with no devices there is nothing to render */
	if (dev_count == 0)
		return;

	struct tut7_render_essentials essentials[dev_count];
	/* Number of leading entries of essentials[] that were successfully obtained and so must be cleaned up */
	uint32_t essentials_count = 0;

	unsigned int frames = 0;
	time_t before = time(NULL);

	uint8_t color = 0;

	/* Allocate render essentials.  See this function in tut7_render.c for explanations. */
	for (uint32_t i = 0; i < dev_count; ++i)
	{
		res = tut7_render_get_essentials(&essentials[i], &phy_devs[i], &devs[i], &swapchains[i]);
		if (res)
		{
			printf("-- failed for device %u\n", i);
			/* Go through the common exit so the essentials of the earlier devices are not leaked */
			goto exit_fail;
		}
		++essentials_count;
	}

	/* Process events from SDL and render.  If process_events returns non-zero, it signals application exit. */
	while (process_events() == 0)
	{
		/*
		 * A simple imprecise FPS calculator.  Try the --no-vsync option to this program to see the difference.
		 *
		 * On Linux, with Nvidia GTX 970, and Vulkan 1.0.8, --no-vsync got me about 12000 FPS.
		 */
		time_t now = time(NULL);
		if (now != before)
		{
			/* time_t is not necessarily long, so convert explicitly to match the %ld conversion */
			printf("%lds: %u frames\n", (long)(now - before), frames);
			frames = 0;
			before = now;
		}
		++frames;

		/*
		 * We are not yet ready to actually render something.  For that, we would need descriptor sets and
		 * pipelines, but we'll get to that soon.  In tut7.c, we have a repository of functions to create
		 * resources for the eventual rendering.  Here, we'll ignore all that and do what we ignored in
		 * Tutorial 6, and that is properly transitioning the swapchain images between "present src" and
		 * something we can render to.  With a graphics pipeline, we would want to transition to
		 * "color attachment optimal".  Since we don't have one, we are going to "clear" the screen which
		 * doesn't need a graphics pipeline.  In that case, the layout of the image should be GENERAL.
		 */

		for (uint32_t i = 0; i < dev_count; ++i)
		{
			uint32_t image_index;

			/*
			 * To render to an image and present it on the screen, the following sequence of operations
			 * needs to be done:
			 *
			 * - acquire from swapchain
			 * - transition to color attachment optimal
			 * - render
			 * - transition to present src
			 * - present the image
			 *
			 * One way to implement this would be to call the corresponding functions one by one, wait and
			 * make sure the image passes through each section, and repeat.  The problem with this way is
			 * that there is wasted time between each function call.  Not that function call itself takes
			 * measurable time, but the setup and finish times of each call, especially because we are
			 * interacting with the GPU.
			 *
			 * Vulkan is made for parallelism and efficiency, so naturally it's not stupid in this regard!
			 * There are different ways to do the above in parallel, and synchronize them.  One nice thing
			 * is that command buffers can call other secondary command buffers.  So, while a small part of
			 * the command buffer requires knowledge of which presentable image it is working with, the
			 * majority of it doesn't, so they could be pre-recorded or recorded in parallel by other
			 * threads.  Another nice thing is that many of the functions work asynchronously, such as
			 * submission to queue for rendering.  This allows the CPU to go ahead with executing the rest
			 * of the above algorithm, only wait for the GPU to finish rendering when it has to, and let
			 * synchronization mechanisms take care of handling the flow of execution in the back.
			 *
			 * One could imagine different ways of doing things, but here is a simple example:
			 *
			 * - acquire from swapchain, signalling semaphore A
			 * - wait on fence C (for previous frame to finish)
			 * - create a command buffer with 1) first transition, 2) render, 3) second transition
			 * - submit the command buffer with semaphore A waiting in the beginning and semaphore B
			 *   signalling the end, with fence C signalling the end as well
			 * - present to swapchain, waiting on the second semaphore
			 *
			 * The significance of the fence above is the following.  In Tutorial 6, we used `usleep` to
			 * avoid busy looping.  That was bad, because it put a hard limit on the frame rate.  The
			 * issue is not just busy looping though.  Since the submissions to queues happen
			 * asynchronously, we risk submitting work faster than the card can actually perform them, with
			 * the result being that frames we send now are rendered much later, after all our previous
			 * work is finished.  This delay can easily become unacceptable; imagine a player has hit the
			 * key to move forwards, you detect this and generate the next frame accordingly, but the
			 * player doesn't actually see her character move forward while several older frames are still
			 * being rendered.
			 *
			 * The location of the fence is chosen as such, to allow maximum overlap between GPU and CPU
			 * work.  In this case, while the GPU is still rendering, the CPU can wait for the swapchain
			 * image to be acquired.  The wait on the fence could not be delayed any further, because we
			 * can't re-record a command buffer that is being executed.  Interestingly, if we use two
			 * command buffers and alternate between them, we could also wait for the fence later!  Let's
			 * not go that far yet.
			 */

			/* See this function in tut7_render.c for explanations */
			res = tut7_render_start(&essentials[i], &devs[i], &swapchains[i], VK_IMAGE_LAYOUT_GENERAL, &image_index);
			if (res)
			{
				printf("-- failed for device %u\n", i);
				goto exit_fail;
			}

			/*
			 * We did everything just to clear the image.  Like I said, it's possible to clear an image
			 * outside a pipeline.  It is also possible to clear it inside a pipeline, so fear not!  When
			 * we have a graphics pipeline, we can transition the image directly to "color attachment
			 * optimal" and clear it, and we don't have to first transition to "general" and then
			 * transition again to "color attachment optimal".
			 *
			 * Clearing the image outside the pipeline is quite straightforward, and in fact has no notion
			 * of the image being used for presentation later.  It's just clearing a general image.
			 *
			 * The vkCmdClearColorImage takes the command buffer, the image, the layout the image is in
			 * (which is "general", we just transitioned it), the color to clear the image with, and a set
			 * of "subresources" to clear.  We are going to clear everything, and we have just a single mip
			 * level and a single array layer, so the subresource range to be cleared is similar to the
			 * `subresourceRange` in image barrier.
			 *
			 * The clear color needs to be specified based on the format of the image.  The
			 * `VkClearColorValue` is a union which accepts RGBA values in float, uint32_t or int32_t, and
			 * we should choose the appropriate field based on swapchains[i].surface_format.format.  If we
			 * weren't so lazy, we could write a simple lookup table that tells us which field to use for
			 * each format, but luckily we are lazy, so let's assume `float` is good for now and hope it's
			 * portable enough.
			 *
			 * For fun, let's change the background color on each frame!
			 */
			VkImageSubresourceRange clear_subresource_range = {
				.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
				.baseMipLevel = 0,
				.levelCount = 1,
				.baseArrayLayer = 0,
				.layerCount = 1,
			};
			/* All three colour channels are scaled from the 0..255 counter to the 0..1 float range */
			VkClearColorValue clear_color = {
				.float32 = {color / 255.0f, (color + 64) % 256 / 255.0f, (color + 128) % 256 / 255.0f, 1},
			};
			++color;	/* uint8_t, so this wraps around from 255 back to 0 */
			vkCmdClearColorImage(essentials[i].cmd_buffer, essentials[i].images[image_index], VK_IMAGE_LAYOUT_GENERAL, &clear_color, 1, &clear_subresource_range);

			/* See this function in tut7_render.c for explanations */
			res = tut7_render_finish(&essentials[i], &devs[i], &swapchains[i], VK_IMAGE_LAYOUT_GENERAL, image_index);
			if (res)
			{
				printf("-- failed for device %u\n", i);
				goto exit_fail;
			}
		}
	}

exit_fail:
	/* Only the entries that were successfully obtained are cleaned up */
	for (uint32_t i = 0; i < essentials_count; ++i)
		tut7_render_cleanup_essentials(&essentials[i], &devs[i]);
}

/*
 * Entry point: sets up Vulkan, one logical device, SDL window and swapchain per physical device, then hands
 * over to render_loop() until the application is asked to exit.
 *
 * Command line options (accepted in any position):
 *   --help      print the usage line and exit with status 0
 *   --no-vsync  passed on to tut6_get_swapchain() as its no_vsync argument
 *
 * Returns 0 after the render loop has ended, EXIT_FAILURE if any of the initialization steps failed.
 */
int main(int argc, char **argv)
{
	tut1_error res;
	int retval = EXIT_FAILURE;
	VkInstance vk;
	struct tut1_physical_device phy_devs[MAX_DEVICES];
	struct tut2_device devs[MAX_DEVICES];
	struct tut6_swapchain swapchains[MAX_DEVICES] = {0};
	SDL_Window *windows[MAX_DEVICES] = {NULL};
	uint32_t dev_count = MAX_DEVICES;

	bool no_vsync = false;

	/* Look at every argument, not just the first one, so the options work wherever they appear */
	for (int i = 1; i < argc; ++i)
	{
		if (strcmp(argv[i], "--help") == 0)
		{
			printf("Usage: %s [--no-vsync]\n\n", argv[0]);
			return 0;
		}
		if (strcmp(argv[i], "--no-vsync") == 0)
			no_vsync = true;
	}

	/* Fire up Vulkan */
	res = tut6_init(&vk);
	if (!tut1_error_is_success(&res))
	{
		tut1_error_printf(&res, "Could not initialize Vulkan\n");
		goto exit_bad_init;
	}

	/* Enumerate devices */
	res = tut1_enumerate_devices(vk, phy_devs, &dev_count);
	if (tut1_error_is_error(&res))
	{
		tut1_error_printf(&res, "Could not enumerate devices\n");
		goto exit_bad_enumerate;
	}

	/* Get logical devices and enable WSI extensions */
	for (uint32_t i = 0; i < dev_count; ++i)
	{
		res = tut6_setup(&phy_devs[i], &devs[i], VK_QUEUE_GRAPHICS_BIT);
		if (tut1_error_is_error(&res))
		{
			tut1_error_printf(&res, "Could not setup logical device %u, command pools and queues\n", i);
			goto exit_bad_setup;
		}
	}

	/* Set up SDL */
	if (SDL_Init(SDL_INIT_VIDEO))
	{
		printf("Could not initialize SDL: %s\n", SDL_GetError());
		goto exit_bad_sdl;
	}

	/* One window per device; windows[] starts out all NULL so cleanup can skip the ones never created */
	for (uint32_t i = 0; i < dev_count; ++i)
	{
		char title[50];
		/* A window title is a single line of text, so no trailing newline */
		snprintf(title, sizeof title, "Vk on device %u", i);
		windows[i] = SDL_CreateWindow(title, SDL_WINDOWPOS_UNDEFINED, SDL_WINDOWPOS_UNDEFINED, 1024, 768, 0);
		if (windows[i] == NULL)
		{
			printf("Could not create window #%u: %s\n", i + 1, SDL_GetError());
			goto exit_bad_window;
		}
	}

	/* Get the surface and swapchain */
	for (uint32_t i = 0; i < dev_count; ++i)
	{
		/* Let's still not bother with threads and use just 1 (the current thread) */
		res = tut6_get_swapchain(vk, &phy_devs[i], &devs[i], &swapchains[i], windows[i], 1, no_vsync);
		if (tut1_error_is_error(&res))
		{
			tut1_error_printf(&res, "Could not create surface and swapchain for device %u\n", i);
			goto exit_bad_swapchain;
		}
	}

	/* Render loop similar to Tutorial 6 */
	render_loop(dev_count, phy_devs, devs, swapchains);

	retval = 0;

	/* Cleanup after yourself */

exit_bad_swapchain:
	for (uint32_t i = 0; i < dev_count; ++i)
		tut6_free_swapchain(vk, &devs[i], &swapchains[i]);

exit_bad_window:
	for (uint32_t i = 0; i < dev_count; ++i)
		if (windows[i])
			SDL_DestroyWindow(windows[i]);
exit_bad_sdl:
	SDL_Quit();

exit_bad_setup:
	for (uint32_t i = 0; i < dev_count; ++i)
		tut2_cleanup(&devs[i]);

exit_bad_enumerate:
	tut1_exit(vk);

exit_bad_init:
	return retval;
}