예제 #1
0
	/// Rebuilds the BVH and re-uploads the flattened scene geometry to the device.
	///
	/// The rebuild happens when no BVH exists yet, when world.has_changed() is
	/// true, or when world.GetStateChange() differs from
	/// ShapeImpl::kStateChangeNone. Otherwise the call does nothing.
	///
	/// On rebuild the method:
	///  - releases the GPU buffers created by the previous build (if any),
	///  - partitions the shapes into meshes followed by instances,
	///  - collects one bounding box per face and builds the BVH from them,
	///  - uploads translated BVH nodes, transformed vertices, faces permuted
	///    into BVH order, per-shape id/mask data and a ray counter buffer.
	///
	/// @param world  Scene to preprocess; shapes_ and options_ are read.
	void BvhStrategy::Preprocess(World const& world)
	{
		// If something has been changed we need to rebuild BVH
		if (!m_bvh || world.has_changed() || world.GetStateChange() != ShapeImpl::kStateChangeNone)
		{
			// A non-null BVH means an earlier build already created the GPU buffers.
			// Release them before they get overwritten, otherwise each rebuild leaks them.
			if (m_bvh)
			{
				m_device->DeleteBuffer(m_gpudata->bvh);
				m_device->DeleteBuffer(m_gpudata->vertices);
				m_device->DeleteBuffer(m_gpudata->faces);
				m_device->DeleteBuffer(m_gpudata->shapes);
				m_device->DeleteBuffer(m_gpudata->raycnt);
			}

			int const numshapes = static_cast<int>(world.shapes_.size());
			int numvertices = 0;
			int numfaces = 0;

			// Start offsets of every shape inside the flattened vertex / face arrays
			// (mesh face indices are relative to 0, so they need rebasing later)
			std::vector<int> mesh_vertices_start_idx(numshapes);
			std::vector<int> mesh_faces_start_idx(numshapes);

			// Use the SAH builder only if the "bvh.builder" option asks for it
			auto builder = world.options_.GetOption("bvh.builder");
			bool const enablesah = builder && builder->AsString() == "sah";

			m_bvh.reset(new Bvh(enablesah));

			// Partition a copy of the shape array: meshes first, instances after them
			std::vector<Shape const*> shapes(world.shapes_);

			auto firstinst = std::partition(shapes.begin(), shapes.end(),
				[](Shape const* shape)
				{
					return !static_cast<ShapeImpl const*>(shape)->is_instance();
				});

			// Count the number of meshes
			int const nummeshes = static_cast<int>(std::distance(shapes.begin(), firstinst));
			// Count the number of instances
			int const numinstances = static_cast<int>(std::distance(firstinst, shapes.end()));

			// Record start offsets and accumulate totals for meshes...
			for (int i = 0; i < nummeshes; ++i)
			{
				Mesh const* mesh = static_cast<Mesh const*>(shapes[i]);

				mesh_faces_start_idx[i] = numfaces;
				mesh_vertices_start_idx[i] = numvertices;

				numfaces += mesh->num_faces();
				numvertices += mesh->num_vertices();
			}

			// ...and for instances, which contribute a full copy of their base mesh
			for (int i = nummeshes; i < nummeshes + numinstances; ++i)
			{
				Instance const* instance = static_cast<Instance const*>(shapes[i]);
				Mesh const* mesh = static_cast<Mesh const*>(instance->GetBaseShape());

				mesh_faces_start_idx[i] = numfaces;
				mesh_vertices_start_idx[i] = numvertices;

				numfaces += mesh->num_faces();
				numvertices += mesh->num_vertices();
			}

			// We can't avoid allocating it here, since bounds aren't stored anywhere
			std::vector<bbox> bounds(numfaces);
			std::vector<ShapeData> shapedata(numshapes);

			// We handle meshes first collecting their world space bounds.
			// Each iteration writes a disjoint slice of bounds / shapedata.
#pragma omp parallel for
			for (int i = 0; i < nummeshes; ++i)
			{
				Mesh const* mesh = static_cast<Mesh const*>(shapes[i]);

				for (int j = 0; j < mesh->num_faces(); ++j)
				{
					// Here we directly get world space bounds
					mesh->GetFaceBounds(j, false, bounds[mesh_faces_start_idx[i] + j]);
				}

				shapedata[i].id = mesh->GetId();
				shapedata[i].mask = mesh->GetMask();
			}

			// Then we handle instances. Need to flatten them into actual geometry.
#pragma omp parallel for
			for (int i = nummeshes; i < nummeshes + numinstances; ++i)
			{
				Instance const* instance = static_cast<Instance const*>(shapes[i]);
				Mesh const* mesh = static_cast<Mesh const*>(instance->GetBaseShape());

				// Instance is using its own transform for base shape geometry
				// so we need to get object space bounds and transform them manually
				matrix m, minv;
				instance->GetTransform(m, minv);

				for (int j = 0; j < mesh->num_faces(); ++j)
				{
					bbox tmp;
					mesh->GetFaceBounds(j, true, tmp);
					bounds[mesh_faces_start_idx[i] + j] = transform_bbox(tmp, m);
				}

				shapedata[i].id = instance->GetId();
				shapedata[i].mask = instance->GetMask();
			}

			// data() stays well-defined even when the scene has no faces
			m_bvh->Build(bounds.data(), numfaces);

			PlainBvhTranslator translator;
			translator.Process(*m_bvh);

			// Update GPU data
			// Copy translated nodes first
			m_gpudata->bvh = m_device->CreateBuffer(translator.nodes_.size() * sizeof(PlainBvhTranslator::Node), Calc::BufferType::kRead, &translator.nodes_[0]);

			// Create vertex buffer
			{
				// Vertices
				m_gpudata->vertices = m_device->CreateBuffer(numvertices * sizeof(float3), Calc::BufferType::kRead);

				// Get the pointer to mapped data
				float3* vertexdata = nullptr;
				Calc::Event* e = nullptr;
				m_device->MapBuffer(m_gpudata->vertices, 0, 0, numvertices * sizeof(float3), Calc::MapType::kMapWrite, reinterpret_cast<void**>(&vertexdata), &e);

				e->Wait();
				m_device->DeleteEvent(e);

				// Here we need to put data in world space rather than object space
				// So we need to get the transform from the mesh and multiply each vertex
#pragma omp parallel for
				for (int i = 0; i < nummeshes; ++i)
				{
					// Get the mesh
					Mesh const* mesh = static_cast<Mesh const*>(shapes[i]);
					// Get vertex buffer of the current mesh
					float3 const* myvertexdata = mesh->GetVertexData();

					// The matrices must be local to the iteration: a single pair declared
					// outside the parallel loop would be shared and raced on by all threads
					matrix m, minv;
					mesh->GetTransform(m, minv);

					// Iterate thru vertices multiply and append them to GPU buffer
					for (int j = 0; j < mesh->num_vertices(); ++j)
					{
						vertexdata[mesh_vertices_start_idx[i] + j] = transform_point(myvertexdata[j], m);
					}
				}

#pragma omp parallel for
				for (int i = nummeshes; i < nummeshes + numinstances; ++i)
				{
					Instance const* instance = static_cast<Instance const*>(shapes[i]);
					// Get the base mesh
					Mesh const* mesh = static_cast<Mesh const*>(instance->GetBaseShape());
					// Get vertex buffer of the base mesh
					float3 const* myvertexdata = mesh->GetVertexData();

					// Thread-private matrices (see the mesh loop above); the instance
					// transform is the one applied to the base mesh vertices
					matrix m, minv;
					instance->GetTransform(m, minv);

					// Iterate thru vertices multiply and append them to GPU buffer
					for (int j = 0; j < mesh->num_vertices(); ++j)
					{
						vertexdata[mesh_vertices_start_idx[i] + j] = transform_point(myvertexdata[j], m);
					}
				}

				m_device->UnmapBuffer(m_gpudata->vertices, 0, vertexdata, &e);

				e->Wait();
				m_device->DeleteEvent(e);
			}

			// Create face buffer
			{
				struct Face
				{
					// Up to 3 indices
					int idx[3];
					// Shape index
					int shapeidx;
					// Primitive ID within the mesh
					int id;
					// Idx count
					int cnt;
				};

				// Create face buffer
				m_gpudata->faces = m_device->CreateBuffer(numfaces * sizeof(Face), Calc::BufferType::kRead);

				// Get the pointer to mapped data
				Face* facedata = nullptr;
				Calc::Event* e = nullptr;

				// The fifth argument is a map type (as in the vertex mapping above),
				// so pass MapType::kMapWrite rather than a BufferType value
				m_device->MapBuffer(m_gpudata->faces, 0, 0, numfaces * sizeof(Face), Calc::MapType::kMapWrite, reinterpret_cast<void**>(&facedata), &e);

				e->Wait();
				m_device->DeleteEvent(e);

				// Here the point is to add mesh starting index to actual index contained within the mesh,
				// getting absolute index in the buffer.
				// Besides that we need to permute the faces according to the BVH reordering
				// returned by m_bvh->GetIndices()
				int const* reordering = m_bvh->GetIndices();
				for (int i = 0; i < numfaces; ++i)
				{
					int const indextolook4 = reordering[i];

					// Find the shape owning this face: the last shape whose first
					// face index is <= indextolook4
					auto iter = std::upper_bound(mesh_faces_start_idx.cbegin(), mesh_faces_start_idx.cend(), indextolook4);

					// Find the index of the shape
					int const shapeidx = static_cast<int>(std::distance(mesh_faces_start_idx.cbegin(), iter) - 1);

					// Get the mesh directly or out of instance
					Mesh const* mesh = nullptr;
					if (shapeidx < nummeshes)
					{
						mesh = static_cast<Mesh const*>(shapes[shapeidx]);
					}
					else
					{
						mesh = static_cast<Mesh const*>(static_cast<Instance const*>(shapes[shapeidx])->GetBaseShape());
					}

					// Get face buffer of the current mesh
					Mesh::Face const* myfacedata = mesh->GetFaceData();
					// Find face idx
					int const faceidx = indextolook4 - mesh_faces_start_idx[shapeidx];
					// Find mesh start idx
					int const mystartidx = mesh_vertices_start_idx[shapeidx];

					// Copy face data to GPU buffer, rebasing vertex indices
					facedata[i].idx[0] = myfacedata[faceidx].idx[0] + mystartidx;
					facedata[i].idx[1] = myfacedata[faceidx].idx[1] + mystartidx;
					facedata[i].idx[2] = myfacedata[faceidx].idx[2] + mystartidx;

					facedata[i].shapeidx = shapeidx;
					// NOTE(review): cnt is always written as 0 here - confirm the consumer ignores it
					facedata[i].cnt = 0;
					facedata[i].id = faceidx;
				}

				m_device->UnmapBuffer(m_gpudata->faces, 0, facedata, &e);

				e->Wait();
				m_device->DeleteEvent(e);
			}

			// Create shapes buffer
			m_gpudata->shapes = m_device->CreateBuffer(numshapes * sizeof(ShapeData), Calc::BufferType::kRead, shapedata.data());
			// Create helper raycounter buffer
			m_gpudata->raycnt = m_device->CreateBuffer(sizeof(int), Calc::BufferType::kWrite);

			// Make sure everything is commited
			m_device->Finish(0);
		}
	}
    /// Rebuilds the BVH and the device buffers used by the short-stack intersector.
    ///
    /// The rebuild happens when no BVH exists yet, when world.has_changed() is
    /// true, or when world.GetStateChange() differs from
    /// ShapeImpl::kStateChangeNone. Otherwise the call does nothing.
    ///
    /// Builder behaviour is controlled through world.options_:
    /// "bvh.builder" ("sah" enables SAH), "bvh.sah.use_splits" (> 0 selects
    /// SplitBvh), "bvh.sah.max_split_depth" (default 10), "bvh.sah.num_bins"
    /// (default 64), "bvh.sah.min_overlap" (default 0.05),
    /// "bvh.sah.traversal_cost" (default 10) and "bvh.sah.extra_node_budget"
    /// (default 0.5).
    ///
    /// @param world  Scene to preprocess; shapes_ and options_ are read.
    /// @throws ExceptionImpl if the device's max allocation size is not larger
    ///         than kMaxBatchSize * kMaxStackSize * sizeof(int), or if the built
    ///         tree's height is >= kMaxStackSize (m_bvh is reset in that case).
    void IntersectorShortStack::Process(World const& world)
    {
        // If something has been changed we need to rebuild BVH
        if (!m_bvh || world.has_changed() || world.GetStateChange() != ShapeImpl::kStateChangeNone)
        {
            // Check if we can allocate enough stack memory.
            // This runs before any buffer is released: throwing after the old
            // buffers were deleted (with m_bvh still set) would make the next
            // call delete the same buffers a second time.
            Calc::DeviceSpec spec;
            m_device->GetSpec(spec);
            if (spec.max_alloc_size <= kMaxBatchSize * kMaxStackSize * sizeof(int))
            {
                throw ExceptionImpl("fatbvh accelerator can't allocate enough stack memory, try using bvh instead");
            }

            // Release everything the previous build created, including the
            // traversal stack which is re-created at the end of this function
            if (m_bvh)
            {
                m_device->DeleteBuffer(m_gpudata->bvh);
                m_device->DeleteBuffer(m_gpudata->vertices);
                m_device->DeleteBuffer(m_gpudata->stack);
            }

            int const numshapes = static_cast<int>(world.shapes_.size());
            int numvertices = 0;
            int numfaces = 0;

            // Start offsets of every shape inside the flattened vertex / face arrays
            // (mesh face indices are relative to 0, so they need rebasing later)
            std::vector<int> mesh_vertices_start_idx(numshapes);
            std::vector<int> mesh_faces_start_idx(numshapes);

            // Read builder options; missing options fall back to the defaults below
            auto builder = world.options_.GetOption("bvh.builder");
            auto splits = world.options_.GetOption("bvh.sah.use_splits");
            auto maxdepth = world.options_.GetOption("bvh.sah.max_split_depth");
            auto overlap = world.options_.GetOption("bvh.sah.min_overlap");
            auto tcost = world.options_.GetOption("bvh.sah.traversal_cost");
            auto node_budget = world.options_.GetOption("bvh.sah.extra_node_budget");
            auto nbins = world.options_.GetOption("bvh.sah.num_bins");

            int const max_split_depth = maxdepth ? static_cast<int>(maxdepth->AsFloat()) : 10;
            int const num_bins = nbins ? static_cast<int>(nbins->AsFloat()) : 64;
            float const min_overlap = overlap ? overlap->AsFloat() : 0.05f;
            float const traversal_cost = tcost ? tcost->AsFloat() : 10.f;
            float const extra_node_budget = node_budget ? node_budget->AsFloat() : 0.5f;

            bool const use_sah = builder && builder->AsString() == "sah";
            bool const use_splits = splits && splits->AsFloat() > 0.f;

            m_bvh.reset(use_splits ?
                new SplitBvh(traversal_cost, num_bins, max_split_depth, min_overlap, extra_node_budget) :
                new Bvh(traversal_cost, num_bins, use_sah)
            );

            // Partition a copy of the shape array: meshes first, instances after them
            std::vector<Shape const*> shapes(world.shapes_);

            auto firstinst = std::partition(shapes.begin(), shapes.end(),
                [](Shape const* shape)
                {
                    return !static_cast<ShapeImpl const*>(shape)->is_instance();
                });

            // Count the number of meshes
            int const nummeshes = static_cast<int>(std::distance(shapes.begin(), firstinst));
            // Count the number of instances
            int const numinstances = static_cast<int>(std::distance(firstinst, shapes.end()));

            // Record start offsets and accumulate totals for meshes...
            for (int i = 0; i < nummeshes; ++i)
            {
                Mesh const* mesh = static_cast<Mesh const*>(shapes[i]);

                mesh_faces_start_idx[i] = numfaces;
                mesh_vertices_start_idx[i] = numvertices;

                numfaces += mesh->num_faces();
                numvertices += mesh->num_vertices();
            }

            // ...and for instances, which contribute a full copy of their base mesh
            for (int i = nummeshes; i < nummeshes + numinstances; ++i)
            {
                Instance const* instance = static_cast<Instance const*>(shapes[i]);
                Mesh const* mesh = static_cast<Mesh const*>(instance->GetBaseShape());

                mesh_faces_start_idx[i] = numfaces;
                mesh_vertices_start_idx[i] = numvertices;

                numfaces += mesh->num_faces();
                numvertices += mesh->num_vertices();
            }

            // We can't avoid allocating it here, since bounds aren't stored anywhere
            std::vector<bbox> bounds(numfaces);

            // We handle meshes first collecting their world space bounds.
            // Each iteration writes a disjoint slice of bounds.
#pragma omp parallel for
            for (int i = 0; i < nummeshes; ++i)
            {
                Mesh const* mesh = static_cast<Mesh const*>(shapes[i]);

                for (int j = 0; j < mesh->num_faces(); ++j)
                {
                    // Here we directly get world space bounds
                    mesh->GetFaceBounds(j, false, bounds[mesh_faces_start_idx[i] + j]);
                }
            }

            // Then we handle instances. Need to flatten them into actual geometry.
#pragma omp parallel for
            for (int i = nummeshes; i < nummeshes + numinstances; ++i)
            {
                Instance const* instance = static_cast<Instance const*>(shapes[i]);
                Mesh const* mesh = static_cast<Mesh const*>(instance->GetBaseShape());

                // Instance is using its own transform for base shape geometry
                // so we need to get object space bounds and transform them manually
                matrix m, minv;
                instance->GetTransform(m, minv);

                for (int j = 0; j < mesh->num_faces(); ++j)
                {
                    bbox tmp;
                    mesh->GetFaceBounds(j, true, tmp);
                    bounds[mesh_faces_start_idx[i] + j] = transform_bbox(tmp, m);
                }
            }

            // data() stays well-defined even when the scene has no faces
            m_bvh->Build(bounds.data(), numfaces);

#ifdef RR_PROFILE
            m_bvh->PrintStatistics(std::cout);
#endif

            // Check if the tree height is reasonable
            if (m_bvh->GetHeight() >= kMaxStackSize)
            {
                // Dropping the BVH makes the next call start from scratch and
                // skip the buffer deletion above (the old buffers are already gone)
                m_bvh.reset(nullptr);
                throw ExceptionImpl("fatbvh accelerator can cause stack overflow for this scene, try using bvh instead");
            }

            FatNodeBvhTranslator translator;
            translator.Process(*m_bvh);

            // Update GPU data

            // Create vertex buffer
            {
                // Vertices
                m_gpudata->vertices = m_device->CreateBuffer(numvertices * sizeof(float3), Calc::BufferType::kRead);

                // Get the pointer to mapped data
                float3* vertexdata = nullptr;
                Calc::Event* e = nullptr;

                m_device->MapBuffer(m_gpudata->vertices, 0, 0, numvertices * sizeof(float3), Calc::MapType::kMapWrite, reinterpret_cast<void**>(&vertexdata), &e);

                e->Wait();
                m_device->DeleteEvent(e);

                // Here we need to put data in world space rather than object space
                // So we need to get the transform from the mesh and multiply each vertex
#pragma omp parallel for
                for (int i = 0; i < nummeshes; ++i)
                {
                    // Get the mesh
                    Mesh const* mesh = static_cast<Mesh const*>(shapes[i]);
                    // Get vertex buffer of the current mesh
                    float3 const* myvertexdata = mesh->GetVertexData();

                    // The matrices must be local to the iteration: a single pair declared
                    // outside the parallel loop would be shared and raced on by all threads
                    matrix m, minv;
                    mesh->GetTransform(m, minv);

                    // Iterate thru vertices multiply and append them to GPU buffer
                    for (int j = 0; j < mesh->num_vertices(); ++j)
                    {
                        vertexdata[mesh_vertices_start_idx[i] + j] = transform_point(myvertexdata[j], m);
                    }
                }

#pragma omp parallel for
                for (int i = nummeshes; i < nummeshes + numinstances; ++i)
                {
                    Instance const* instance = static_cast<Instance const*>(shapes[i]);
                    // Get the base mesh
                    Mesh const* mesh = static_cast<Mesh const*>(instance->GetBaseShape());
                    // Get vertex buffer of the base mesh
                    float3 const* myvertexdata = mesh->GetVertexData();

                    // Thread-private matrices (see the mesh loop above); the instance
                    // transform is the one applied to the base mesh vertices
                    matrix m, minv;
                    instance->GetTransform(m, minv);

                    // Iterate thru vertices multiply and append them to GPU buffer
                    for (int j = 0; j < mesh->num_vertices(); ++j)
                    {
                        vertexdata[mesh_vertices_start_idx[i] + j] = transform_point(myvertexdata[j], m);
                    }
                }

                m_device->UnmapBuffer(m_gpudata->vertices, 0, vertexdata, &e);
                e->Wait();
                m_device->DeleteEvent(e);
            }

            // Create face data and inject it into the translated nodes
            {
                // This number is different from the number of faces for some BVHs
                int const numindices = static_cast<int>(m_bvh->GetNumIndices());
                std::vector<FatNodeBvhTranslator::Face> facedata(numindices);

                // Here the point is to add mesh starting index to actual index contained within the mesh,
                // getting absolute index in the buffer.
                // Besides that we need to permute the faces according to the BVH reordering
                // returned by m_bvh->GetIndices()
                int const* reordering = m_bvh->GetIndices();
                for (int i = 0; i < numindices; ++i)
                {
                    int const indextolook4 = reordering[i];

                    // Find the shape owning this face: the last shape whose first
                    // face index is <= indextolook4
                    auto iter = std::upper_bound(mesh_faces_start_idx.cbegin(), mesh_faces_start_idx.cend(), indextolook4);

                    // Find the index of the shape
                    int const shapeidx = static_cast<int>(std::distance(mesh_faces_start_idx.cbegin(), iter) - 1);

                    // Get the mesh directly or out of instance
                    Mesh const* mesh = nullptr;
                    if (shapeidx < nummeshes)
                    {
                        mesh = static_cast<Mesh const*>(shapes[shapeidx]);
                    }
                    else
                    {
                        mesh = static_cast<Mesh const*>(static_cast<Instance const*>(shapes[shapeidx])->GetBaseShape());
                    }

                    // Get face buffer of the current mesh
                    Mesh::Face const* myfacedata = mesh->GetFaceData();
                    // Find face idx
                    int const faceidx = indextolook4 - mesh_faces_start_idx[shapeidx];
                    // Find mesh start idx
                    int const mystartidx = mesh_vertices_start_idx[shapeidx];

                    // Copy face data, rebasing vertex indices
                    facedata[i].idx[0] = myfacedata[faceidx].idx[0] + mystartidx;
                    facedata[i].idx[1] = myfacedata[faceidx].idx[1] + mystartidx;
                    facedata[i].idx[2] = myfacedata[faceidx].idx[2] + mystartidx;

                    // Note: shapeidx here stores the shape's id, not its array position
                    facedata[i].shapeidx = shapes[shapeidx]->GetId();
                    facedata[i].shape_mask = shapes[shapeidx]->GetMask();
                    facedata[i].id = faceidx;
                }

                // data() stays well-defined even when there are no indices
                translator.InjectIndices(facedata.data());
            }

            // Copy translated nodes (faces have been injected into them above)
            m_gpudata->bvh = m_device->CreateBuffer(translator.nodes_.size() * sizeof(FatNodeBvhTranslator::Node), Calc::BufferType::kRead, &translator.nodes_[0]);

            // Stack: sized in bytes to match the allocation validated against
            // spec.max_alloc_size at the top of this function. The previous size
            // omitted sizeof(int) and was four times smaller than what was checked.
            m_gpudata->stack = m_device->CreateBuffer(kMaxBatchSize * kMaxStackSize * sizeof(int), Calc::BufferType::kWrite);

            // Make sure everything is commited
            m_device->Finish(0);
        }
    }
예제 #3
0
	/// Brings the HLBVH and the device geometry buffers up to date with the world.
	///
	/// Two update paths exist:
	///  - full rebuild, when no BVH exists yet or world.has_changed() is true:
	///    a new Hlbvh is created and built, and the vertex, face, shape,
	///    ray counter and stack buffers are (re)created;
	///  - refresh, when only world.GetStateChange() differs from
	///    ShapeImpl::kStateChangeNone: the existing Hlbvh is rebuilt from fresh
	///    face bounds and only the vertex buffer is re-created.
	/// If neither condition holds the call does nothing.
	///
	/// All shapes are treated as meshes (see the comment in the body).
	///
	/// @param world  Scene to preprocess; shapes_ is read.
	void HlbvhStrategy::Preprocess(World const& world)
	{
		// Full rebuild: no BVH yet or the world itself has changed
		bool const rebuild = !m_bvh || world.has_changed();
		// Refresh: same world, but some shape state has changed
		bool const refresh = !rebuild && world.GetStateChange() != ShapeImpl::kStateChangeNone;

		if (!rebuild && !refresh)
		{
			return;
		}

		if (rebuild)
		{
			// Release everything a previous build created, including the stack
			// buffer which is re-created at the end of the rebuild path
			if (m_bvh)
			{
				m_device->DeleteBuffer(m_gpudata->vertices);
				m_device->DeleteBuffer(m_gpudata->faces);
				m_device->DeleteBuffer(m_gpudata->shapes);
				m_device->DeleteBuffer(m_gpudata->raycnt);
				m_device->DeleteBuffer(m_gpudata->stack);
			}

			m_bvh.reset(new Hlbvh(m_device));
		}

		int const numshapes = static_cast<int>(world.shapes_.size());
		int numvertices = 0;
		int numfaces = 0;

		// Start offsets of every mesh inside the flattened vertex / face arrays
		// (mesh face indices are relative to 0, so they need rebasing later)
		std::vector<int> mesh_vertices_start_idx(numshapes);
		std::vector<int> mesh_faces_start_idx(numshapes);

		// Here we know that only Meshes are present, otherwise 2level strategy would have been used
		for (int i = 0; i < numshapes; ++i)
		{
			Mesh const* mesh = static_cast<Mesh const*>(world.shapes_[i]);

			mesh_faces_start_idx[i] = numfaces;
			mesh_vertices_start_idx[i] = numvertices;

			numfaces += mesh->num_faces();
			numvertices += mesh->num_vertices();
		}

		// We can't avoid allocating it here, since bounds aren't stored anywhere
		std::vector<bbox> bounds(numfaces);
		// Per-shape id/mask data is only uploaded on a full rebuild
		std::vector<ShapeData> shapes(rebuild ? numshapes : 0);

		// Each iteration writes a disjoint slice of bounds / shapes
#pragma omp parallel for
		for (int i = 0; i < numshapes; ++i)
		{
			Mesh const* mesh = static_cast<Mesh const*>(world.shapes_[i]);

			for (int j = 0; j < mesh->num_faces(); ++j)
			{
				mesh->GetFaceBounds(j, false, bounds[mesh_faces_start_idx[i] + j]);
			}

			if (rebuild)
			{
				shapes[i].id = mesh->GetId();
				shapes[i].mask = mesh->GetMask();
			}
		}

		// data() stays well-defined even when the scene has no faces
		m_bvh->Build(bounds.data(), numfaces);

		// Create vertex buffer
		{
			// On refresh the previous vertex buffer is still alive; release it
			// before its handle is overwritten, otherwise every refresh leaks it.
			// (On rebuild it has already been released above.)
			if (refresh)
			{
				m_device->DeleteBuffer(m_gpudata->vertices);
			}

			// Vertices
			m_gpudata->vertices = m_device->CreateBuffer(numvertices * sizeof(float3), Calc::BufferType::kRead);

			// Get the pointer to mapped data
			float3* vertexdata = nullptr;
			Calc::Event* e = nullptr;

			m_device->MapBuffer(m_gpudata->vertices, 0, 0, numvertices * sizeof(float3), Calc::MapType::kMapWrite, reinterpret_cast<void**>(&vertexdata), &e);

			e->Wait();
			m_device->DeleteEvent(e);

			// Here we need to put data in world space rather than object space
			// So we need to get the transform from the mesh and multiply each vertex
#pragma omp parallel for
			for (int i = 0; i < numshapes; ++i)
			{
				// Get the mesh
				Mesh const* mesh = static_cast<Mesh const*>(world.shapes_[i]);
				// Get vertex buffer of the current mesh
				float3 const* myvertexdata = mesh->GetVertexData();

				// The matrices must be local to the iteration: a single pair declared
				// outside the parallel loop would be shared and raced on by all threads
				matrix m, minv;
				mesh->GetTransform(m, minv);

				// Iterate thru vertices multiply and append them to GPU buffer
				for (int j = 0; j < mesh->num_vertices(); ++j)
				{
					vertexdata[mesh_vertices_start_idx[i] + j] = transform_point(myvertexdata[j], m);
				}
			}

			m_device->UnmapBuffer(m_gpudata->vertices, 0, vertexdata, &e);

			e->Wait();
			m_device->DeleteEvent(e);
		}

		// A refresh only updates the BVH and the vertices
		if (!rebuild)
		{
			return;
		}

		// Create face buffer
		{
			struct Face
			{
				// Up to 3 indices
				int idx[3];
				// Shape index
				int shapeidx;
				// Primitive ID within the mesh
				int id;
				// Idx count
				int cnt;
			};

			m_gpudata->faces = m_device->CreateBuffer(numfaces * sizeof(Face), Calc::BufferType::kRead);

			// Get the pointer to mapped data
			Face* facedata = nullptr;
			Calc::Event* e = nullptr;

			// The fifth argument is a map type (as in the vertex mapping above),
			// so pass MapType::kMapWrite rather than a BufferType value
			m_device->MapBuffer(m_gpudata->faces, 0, 0, numfaces * sizeof(Face), Calc::MapType::kMapWrite, reinterpret_cast<void**>(&facedata), &e);

			e->Wait();
			m_device->DeleteEvent(e);

			// Here the point is to add mesh starting index to actual index contained within the mesh,
			// getting absolute index in the buffer.
			// Faces are written in their natural order: no BVH permutation is applied here.
			for (int i = 0; i < numfaces; ++i)
			{
				// Find the mesh owning this face: the last mesh whose first
				// face index is <= i
				auto iter = std::upper_bound(mesh_faces_start_idx.cbegin(), mesh_faces_start_idx.cend(), i);

				// Find the index of the shape
				int const shapeidx = static_cast<int>(std::distance(mesh_faces_start_idx.cbegin(), iter) - 1);

				// Get the mesh
				Mesh const* mesh = static_cast<Mesh const*>(world.shapes_[shapeidx]);
				// Get face buffer of the current mesh
				Mesh::Face const* myfacedata = mesh->GetFaceData();
				// Find face idx
				int const faceidx = i - mesh_faces_start_idx[shapeidx];
				// Find mesh start idx
				int const mystartidx = mesh_vertices_start_idx[shapeidx];

				// Copy face data to GPU buffer, rebasing vertex indices
				facedata[i].idx[0] = myfacedata[faceidx].idx[0] + mystartidx;
				facedata[i].idx[1] = myfacedata[faceidx].idx[1] + mystartidx;
				facedata[i].idx[2] = myfacedata[faceidx].idx[2] + mystartidx;

				facedata[i].shapeidx = shapeidx;
				facedata[i].cnt = (myfacedata[faceidx].type_ == Mesh::FaceType::QUAD ? 4 : 3);
				facedata[i].id = faceidx;
			}

			m_device->UnmapBuffer(m_gpudata->faces, 0, facedata, &e);

			e->Wait();
			m_device->DeleteEvent(e);
		}

		// Create shapes buffer
		m_gpudata->shapes = m_device->CreateBuffer(numshapes * sizeof(ShapeData), Calc::BufferType::kRead, shapes.data());
		// Create helper raycounter buffer
		m_gpudata->raycnt = m_device->CreateBuffer(sizeof(int), Calc::BufferType::kWrite);
		// Stack
		// NOTE(review): the size is kMaxBatchSize*kMaxStackSize bytes; if stack
		// entries are wider than one byte this is undersized - confirm against the kernel
		m_gpudata->stack = m_device->CreateBuffer(kMaxBatchSize*kMaxStackSize, Calc::BufferType::kWrite);
		// Make sure everything is commited
		m_device->Finish(0);
	}