void VHParticlesRender::draw(){ if (pSys->nParts == -1) return; if(displayMode == SHADOWED_SPRITES) { calcVectors(); cu::float3 halfVec = cu::make_float3(halfVector.x,halfVector.y,halfVector.z); calcDepthCu(pSys->dev_pos, pSys->dev_keys, pSys->dev_indices, halfVec, pSys->nParts); if (sortParts) cudppSort(m_sortHandle, pSys->dev_keys, pSys->dev_indices, 32, pSys->nParts); } if((displayMode == SPRITES || displayMode == POINTS) && sortParts) { glGetFloatv(GL_MODELVIEW_MATRIX, (float *) modelView.get_value()); viewVector = -vec3f(modelView.get_row(2)); cu::float3 viewVec = cu::make_float3(viewVector.x, viewVector.y, viewVector.y); //printf("view vec : %f, %f, %f \n", viewVector.x, viewVector.y, viewVector.z); calcDepthCu(pSys->dev_pos, pSys->dev_keys, pSys->dev_indices, viewVec, pSys->nParts); cudppSort(m_sortHandle, pSys->dev_keys, pSys->dev_indices, 32, pSys->nParts); } pSys->posVbo->map(); pSys->colourVbo->map(); pSys->indexVbo->map(); pSys->updateVBOs(); pSys->posVbo->unmap(); pSys->colourVbo->unmap(); pSys->indexVbo->unmap(); switch (displayMode) { case POINTS: glPointSize(pointSize); glDisable(GL_DEPTH_TEST); glEnable(GL_BLEND); pSys->posVbo->bind(); glVertexPointer(3, GL_FLOAT, 0, 0); glEnableClientState(GL_VERTEX_ARRAY); pSys->colourVbo->bind(); glColorPointer(4, GL_FLOAT, 0, 0); glEnableClientState(GL_COLOR_ARRAY); if (blendingMode == ADD) { glBlendFunc( GL_SRC_ALPHA, GL_ONE ); glDrawArrays(GL_POINTS, 0, pSys->nParts); } else { glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA); if(sortParts){ pSys->indexVbo->bind(); glDrawElements(GL_POINTS, pSys->nParts, GL_UNSIGNED_INT, 0); pSys->indexVbo->unbind(); } else { glDrawArrays(GL_POINTS, 0, pSys->nParts); } } pSys->posVbo->unbind(); glDisableClientState(GL_VERTEX_ARRAY); glDisableClientState(GL_COLOR_ARRAY); glDisable(GL_BLEND); break; case LINES: glDisable(GL_DEPTH_TEST); glEnable(GL_BLEND); glBlendFunc( GL_SRC_ALPHA, GL_ONE ); pSys->posVbo->bind(); glVertexPointer(3, GL_FLOAT, 0, 0); glEnableClientState(GL_VERTEX_ARRAY); pSys->colourVbo->bind(); glColorPointer(4, GL_FLOAT, 0, 0); glEnableClientState(GL_COLOR_ARRAY); glLineWidth(lineWidth); for (int i = 0; i<pSys->nLeadParts; i++) { glDrawArrays(GL_LINE_STRIP, i*pSys->trailLength, pSys->trailLength); } pSys->posVbo->unbind(); glDisableClientState(GL_VERTEX_ARRAY); glDisableClientState(GL_COLOR_ARRAY); glDisable(GL_BLEND); break; case SPRITES: glDisable(GL_DEPTH_TEST); glEnable(GL_TEXTURE_2D); glEnable(GL_BLEND); glBlendFunc( GL_SRC_ALPHA, GL_ONE ); pSys->posVbo->bind(); glVertexPointer(3, GL_FLOAT, 0, 0); glEnableClientState(GL_VERTEX_ARRAY); pSys->colourVbo->bind(); glColorPointer(4, GL_FLOAT, 0, 0); glEnableClientState(GL_COLOR_ARRAY); simpleSpriteProg->enable(); simpleSpriteProg->setUniform1f("pointRadius",pointSize); simpleSpriteProg->bindTexture("sDiffuseMap",TextureManager::Inst()->m_texID[id1],GL_TEXTURE_2D,0); if (blendingMode == ADD) { glBlendFunc( GL_SRC_ALPHA, GL_ONE ); glDrawArrays(GL_POINTS, 0, pSys->nParts); } else { glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA); if(sortParts){ pSys->indexVbo->bind(); glDrawElements(GL_POINTS, pSys->nParts, GL_UNSIGNED_INT, 0); pSys->indexVbo->unbind(); } else { glDrawArrays(GL_POINTS, 0, pSys->nParts); } } simpleSpriteProg->disable(); pSys->posVbo->unbind(); glDisableClientState(GL_VERTEX_ARRAY); glDisableClientState(GL_COLOR_ARRAY); glDisable(GL_BLEND); glDisable(GL_TEXTURE_2D); break; case SHADOWED_SPRITES : GLfloat currentViewport[4]; glGetFloatv(GL_VIEWPORT, currentViewport); if(width != currentViewport[2] || height != currentViewport[3]) initFbos(currentViewport[2],currentViewport[3], false); drawSlices(); //glutReportErrors(); Fbo::unbind(); glViewport(0, 0, width, height); glDisable(GL_DEPTH_TEST); glBlendFunc(GL_ONE, GL_ONE_MINUS_SRC_ALPHA); glEnable(GL_BLEND); int mm; glGetIntegerv ( GL_MATRIX_MODE, &mm ); displayTexProg->enable(); displayTexProg->bindTexture("tex", imageTex, GL_TEXTURE_2D, 0); drawQuad(); displayTexProg->disable(); if(displayLightBuffer) { displayTexProg->bindTexture("tex", lightTex[srcLightTexture], GL_TEXTURE_2D, 0); glViewport(0, 0, lightBufferSize, lightBufferSize); drawQuad(); displayTexProg->disable(); } //calcVectors(); glViewport(0, 0, width, height); if (displayVectors) { debugVectors(); } glutReportErrors(); break; } }
void IntIntSorter::executeOnGPUAsync(void * const keys, void * const vals, const int numKeys, int & numUniqueKeys, int ** keyOffsets, int ** valOffsets, int ** numVals) { if (numKeys == 0) { numUniqueKeys = 0; *keyOffsets = *valOffsets = NULL; *numVals = NULL; return; } if (numKeys > 32 * 1048576) { executeOnCPUAsync(keys, vals, numKeys, numUniqueKeys, keyOffsets, valOffsets, numVals); return; } int commRank; MPI_Comm_rank(MPI_COMM_WORLD, &commRank); CUDPPConfiguration cudppConfig; CUDPPHandle planHandle; void * gpuInputKeys = cudacpp::Runtime::malloc(sizeof(int) * numKeys); void * gpuInputVals = cudacpp::Runtime::malloc(sizeof(int) * numKeys); void * gpuUniqueFlags = cudacpp::Runtime::malloc(sizeof(int) * numKeys); void * gpuValOffsets = cudacpp::Runtime::malloc(sizeof(int) * numKeys); cudacpp::Runtime::memcpyHtoD(gpuInputKeys, keys, sizeof(int) * numKeys); cudacpp::Runtime::memcpyHtoD(gpuInputVals, vals, sizeof(int) * numKeys); /* what we need to get out of here: 1 - sorted keys and values 2 - num unique keys 3 - number of values for each key 4 - value offsets 5 - compacted keys to get: simply sort A = find unique values B = reverse exclusive scan of "A" C = if A[i] == 1 C[B[0] - B[i]] = i D = [0] = C[0] + 1 [N] = #keys - C[#keys - 1] [i] = C[i + 1] - C[i] E = forward exclusive scan D F = keys[E[i]] 1 = result of sort (only copy the values) 2 = B[0] 3 = D 4 = E 5 = F */ // 1 cudppConfig.algorithm = CUDPP_SORT_RADIX; cudppConfig.op = CUDPP_ADD; // ignored cudppConfig.datatype = CUDPP_UINT; cudppConfig.options = CUDPP_OPTION_KEY_VALUE_PAIRS; cudppPlan(&planHandle, cudppConfig, numKeys, 1, numKeys * sizeof(int)); cudppSort(planHandle, gpuInputKeys, gpuInputVals, sizeof(int) * 8, numKeys); cudppDestroyPlan(planHandle); cudacpp::Runtime::sync(); // cudacpp::Runtime::memcpyDtoH(keys, gpuInputKeys, sizeof(int) * numKeys); cudacpp::Runtime::memcpyDtoH(vals, gpuInputVals, sizeof(int) * numKeys); // 2 - A = gpuUniqueFlags gpmrIntIntSorterMarkUnique(gpuInputKeys, gpuUniqueFlags, numKeys); // 2 - B = gpuValOffsets cudppConfig.algorithm = CUDPP_SCAN; cudppConfig.op = CUDPP_ADD; // ignored cudppConfig.datatype = CUDPP_INT; cudppConfig.options = CUDPP_OPTION_EXCLUSIVE | CUDPP_OPTION_BACKWARD; cudppPlan(&planHandle, cudppConfig, numKeys, 1, numKeys * sizeof(int)); cudppScan(planHandle, gpuValOffsets, gpuUniqueFlags, numKeys); cudppDestroyPlan(planHandle); cudacpp::Runtime::sync(); cudacpp::Runtime::memcpyDtoH(&numUniqueKeys, gpuValOffsets, sizeof(int)); ++numUniqueKeys; // 2 - C = gpuInputVals and // 3 - D = gpuValOffsets cudacpp::Runtime::sync(); gpmrIntIntSorterFindOffsets(gpuInputKeys, gpuUniqueFlags, gpuValOffsets, gpuInputVals, gpuValOffsets, numKeys, numUniqueKeys); *numVals = reinterpret_cast<int * >(cudacpp::Runtime::mallocHost(numUniqueKeys * sizeof(int))); cudacpp::Runtime::sync(); cudacpp::Runtime::memcpyDtoH(*numVals, gpuValOffsets, sizeof(int) * numUniqueKeys); cudacpp::Runtime::sync(); // 4 - E = gpuUniqueFlags cudppConfig.algorithm = CUDPP_SCAN; cudppConfig.op = CUDPP_ADD; // ignored cudppConfig.datatype = CUDPP_INT; cudppConfig.options = CUDPP_OPTION_EXCLUSIVE | CUDPP_OPTION_FORWARD; cudppPlan(&planHandle, cudppConfig, numKeys, 1, numKeys * sizeof(int)); cudppScan(planHandle, gpuUniqueFlags, gpuValOffsets, numKeys); cudppDestroyPlan(planHandle); cudacpp::Runtime::sync(); *valOffsets = reinterpret_cast<int * >(cudacpp::Runtime::mallocHost(numUniqueKeys * sizeof(int))); cudacpp::Runtime::memcpyDtoH(*valOffsets, gpuUniqueFlags, sizeof(int) * numUniqueKeys); // 4 - F = gpuInputVals gpmrIntIntSorterSetCompactedKeys(gpuInputKeys, gpuUniqueFlags, gpuInputVals, numUniqueKeys); cudacpp::Runtime::memcpyDtoH(keys, gpuInputVals, sizeof(int) * numUniqueKeys); cudacpp::Runtime::free(gpuInputKeys); cudacpp::Runtime::free(gpuInputVals); cudacpp::Runtime::free(gpuUniqueFlags); cudacpp::Runtime::free(gpuValOffsets); }