int main() { IDevice* device = gf::createDevice(EDT_DIRECT3D11, 800, 600); IVideoDriver* driver = device->getVideoDriver(); ISceneManager* smgr = device->createSceneManager(); IResourceGroupManager* resourceGroupManager = driver->getResourceGroupManager(); resourceGroupManager->init("Resources.cfg"); ITimer* timer = device->getTimer(); timer->reset(); ITextureManager* textureManager = ITextureManager::getInstance(); IShaderManager* shaderMgr = driver->getShaderManager(); IShader* shader = shaderMgr->load(EST_COMPUTE_SHADER, "matmul.hlsl", "cs_main"); const u32 dimension = 512; const u32 sq_dimension = dimension * dimension; std::vector<f32> A(sq_dimension); std::vector<f32> B(sq_dimension); std::vector<f32> C(sq_dimension); std::vector<f32> D(sq_dimension); // init data for (u32 i = 0; i < sq_dimension; i++) { A[i] = math::RandomFloat(0, 10.0f); B[i] = math::RandomFloat(0, 10.0f); } f32 start_time, end_time; start_time = timer->getTime(); // store the right answers to D for (u32 i = 0; i < dimension; i++) { for (u32 j = 0; j < dimension; j++) { f32 sum = 0; for (u32 k = 0; k < dimension; k++) { sum += A[i * dimension + k] * B[k * dimension + j]; } D[i * dimension + j] = sum; } } end_time = timer->getTime(); printf("The computation time by CPU: %fs\n", end_time - start_time); start_time = timer->getTime(); ITexture* inputTexture1 = textureManager->createTexture2D("input1", dimension, dimension, ETBT_SHADER_RESOURCE, &A[0], 1, EGF_R32_FLOAT, 0); ITexture* inputTexture2 = textureManager->createTexture2D("input2", dimension, dimension, ETBT_SHADER_RESOURCE, &B[0], 1, EGF_R32_FLOAT, 0); ITexture* outputTexture = textureManager->createTexture2D("output", dimension, dimension, ETBT_UNORDERED_ACCESS, nullptr, 1, EGF_R32_FLOAT, 0); ITexture* copyTexture = textureManager->createTexture2D("copy", dimension, dimension, ETBT_CPU_ACCESS_READ, nullptr, 1, EGF_R32_FLOAT, 0); shader->setTexture("gInputA", inputTexture1); shader->setTexture("gInputB", inputTexture2); shader->setTexture("gOutput", outputTexture); u32 blockNum = dimension / 8; if (blockNum * 8 != dimension) blockNum++; driver->runComputeShader(shader, blockNum, blockNum, 1); //driver->resetRWTextures(); //driver->resetTextures(); outputTexture->copyDataToAnotherTexture(copyTexture); STextureData outputData; copyTexture->lock(ETLT_READ, &outputData); u8* data = (u8*)outputData.Data; for (u32 i = 0; i < dimension; i++) { // copy each row. memcpy(&C[i * dimension], data + outputData.RowPitch * i, dimension * sizeof(f32)); } copyTexture->unlock(); end_time = timer->getTime(); printf("The computation time by GPU: %fs\n", end_time - start_time); for (u32 i = 0; i < sq_dimension; i++) { assert(math::FloatEqual(C[i], D[i])); } // destory textures. if (!textureManager->destroy(inputTexture1)) printf("Destory texture failed!"); if (!textureManager->destroy(inputTexture2)) printf("Destory texture failed!"); if (!textureManager->destroy(outputTexture)) printf("Destory texture failed!"); if (!textureManager->destroy(copyTexture)) printf("Destory texture failed!"); //sphereMaterial.drop(); smgr->destroy(); device->drop(); system("pause"); return 0; }