/* ============================================================================
Function:     =        Debugger
Purpose:      =        Print the debug phase and element types at various
              =        stages of the sort.
==============================================================================
Input arg:    =        1. flag:         Selects the output. 1 and 2 announce the
              =                         butterfly that is about to execute; any
              =                         other value reports the completed stage
              =                         and gathers the list.
              =        2. my_rank:      The processor's rank.
              =        3. phase:        The current phase of the algorithm.
              =        4. partner_size: The size of the pairing among processors.
              =        5. my_list:      Local list for each processor.
              =        6. global_size:  The size of the input array.
              =        7. list_size:    The size of the processor's array.
=========================================================================== */
void Debugger(int flag, int my_rank, int phase, int partner_size,
              int my_list[], int global_size, int list_size)
{
    switch (flag) {
    case 1:
    case 2:
        /* Both flags produce the same announcement. */
        printf("Processor %d's #%d %d-element butterfly now executing. \n",
               my_rank, phase, partner_size);
        break;

    default:
        /* Only rank 0 prints the stage banner; every rank takes part in Gather. */
        if (my_rank == 0)
            printf(" \nStage %d completed. \n", phase);
        Gather(my_list, global_size, my_rank, list_size);
        break;
    }
    fflush(stdout);
}
// Gathers one batch of AVPLs: the antiradiance AVPLs go into the antiradiance
// gather target, the shadow-map AVPLs into the shadow-map gather target.
// Does nothing when debug mode is enabled and m_FinishedDebug is set.
void Renderer::Gather(std::vector<Avpl>& avpls_shadowmap, std::vector<Avpl>& avpls_antiradiance)
{
	const bool debugAlreadyFinished =
		m_confManager->GetConfVars()->UseDebugMode && m_FinishedDebug;

	if(!debugAlreadyFinished)
	{
		Gather(avpls_antiradiance, m_gatherAntiradianceRenderTarget.get());
		GatherRadianceWithShadowMap(avpls_shadowmap, m_gatherShadowmapRenderTarget.get());
	}
}
// Tests this bonus against the player's rectangle. On overlap the bonus is
// gathered and destroyed and true is returned; otherwise returns false.
bool GameBonus::CollidesWith(GamePlayer Player)
{
	if (!CollisionRecRec({ Player.x, Player.y, Player.w, Player.h }))
		return false;

	Gather();
	Destroy();
	return true;
}
// Builds the flag set (configuration) for one package of the workspace.
//
// wspc      - workspace being built
// package   - index of the package inside wspc (0 is passed to SplitFlags as "main")
// bm        - build-method variables (RELEASE_FLAGS / DEBUG_FLAGS / BUILDER are read)
// mainparam - main configuration string; the method flags are appended to this copy
// host, b   - both receive the flags via AddFlags() before package-specific flags are added
// target    - optional out-parameter; when non-null it receives the gathered pkg.target
//
// Returns the resulting flag index.
Index<String> MakeBuild::PackageConfig(const Workspace& wspc, int package,
                                       const VectorMap<String, String>& bm, String mainparam,
                                       Host& host, Builder& b, String *target)
{
	// NOTE(review): packagepath is never read below; kept so the call is not dropped.
	String packagepath = PackagePath(wspc[package]);
	const Package& pkg = wspc.package[package];

	mainparam << ' ' << bm.Get(targetmode ? "RELEASE_FLAGS" : "DEBUG_FLAGS", NULL);

	Index<String> cfg;
	cfg = SplitFlags(mainparam, package == 0, wspc.GetAllAccepts(package));
	cfg.FindAdd(bm.Get("BUILDER", "GCC"));

	const TargetMode& m = GetTargetMode();
	if(targetmode == 0)
		cfg.FindAdd("DEBUG");

	// Link mode 2 implies both SO and SHARED; link mode 1 implies SHARED only.
	if(m.linkmode == 2)
		cfg.FindAdd("SO");
	if(m.linkmode == 2 || m.linkmode == 1)
		cfg.FindAdd("SHARED");

	if(targetmode == 2)
		cfg.FindAdd("FORCE_SPEED");
	if(targetmode == 3)
		cfg.FindAdd("FORCE_SIZE");

	// Start from the mode-wide defaults, then let a per-package entry override
	// whichever settings it specifies (a negative value means "not specified").
	int debugLevel = m.def.debug;
	int useBlitz = m.def.blitz;
	int q = m.package.Find(wspc[package]);
	if(q >= 0) {
		const PackageMode& p = m.package[q];
		if(p.debug >= 0)
			debugLevel = p.debug;
		if(p.blitz >= 0)
			useBlitz = p.blitz;
	}

	if(debugLevel == 1)
		cfg.FindAdd("DEBUG_MINIMAL");
	else if(debugLevel == 2)
		cfg.FindAdd("DEBUG_FULL");
	if(!pkg.noblitz && useBlitz)
		cfg.FindAdd("BLITZ");

	host.AddFlags(cfg);
	b.AddFlags(cfg);

	// Package-declared flags are added after host/builder have seen the base set.
	const int flagCount = pkg.flag.GetCount();
	for(int i = 0; i < flagCount; i++) {
		if(!MatchWhen(pkg.flag[i].when, cfg.GetKeys()))
			continue;
		cfg.Add(pkg.flag[i].text);
	}

	if(target)
		*target = Gather(pkg.target, cfg.GetKeys(), true);
	return cfg;
}
// Command-line entry point.
//
//   <exe> gather  <arg1> <arg2>
//   <exe> compile <databasePath> <cpuArchName> <imageFormatName> <outputPath>
//
// Returns 0 on success and -1 on a usage error, an unknown command, or a
// failed compile.
int main(int argc, char** argv)
{
	if(argc <= 2)
	{
		PrintUsage();
		return -1;
	}

	const char* command = argv[1];

	if(!strcmp(command, "gather"))
	{
		if(argc < 4)
		{
			PrintUsage();
			return -1;
		}
		Gather(argv[2], argv[3]);
		return 0;
	}

	if(!strcmp(command, "compile"))
	{
		if(argc < 6)
		{
			PrintUsage();
			return -1;
		}
		try
		{
			const char* databasePath = argv[2];
			const char* cpuArchName = argv[3];
			const char* imageFormatName = argv[4];
			const char* outputPath = argv[5];
			Compile(databasePath, cpuArchName, imageFormatName, outputPath);
		}
		catch(const std::exception& exception)
		{
			printf("Failed to compile: %s\r\n", exception.what());
			return -1;
		}
		return 0;
	}

	// An unrecognized command used to fall through and report success without
	// doing anything; treat it as a usage error instead.
	PrintUsage();
	return -1;
}
// Collects the libraries declared by package `index` and by every package it
// uses, without duplicates.
// Warning: This does not seem to do what it is supposed to do...
Vector<String> MakeBuild::GetAllLibraries(const Workspace& wspc, int index,
                                          const VectorMap<String, String>& bm, String mainparam,
                                          Host& host, Builder& builder)
{
	Vector<String> packages = GetAllUses(wspc, index);
	packages.Add(wspc[index]);

	Index<String> libraries;
	const int packageCount = packages.GetCount();
	for(int i = 0; i < packageCount; i++) {
		int found = wspc.package.Find(UnixPath(packages[i]));
		if(found < 0)
			continue; // not a package of this workspace

		const Package& pkg = wspc.package[found];
		// The libraries a package contributes depend on its own flag set.
		Index<String> flags = PackageConfig(wspc, found, bm, mainparam, host, builder);
		Vector<String> pkgLibs = Split(Gather(pkg.library, flags.GetKeys()), ' ');
		FindAppend(libraries, pkgLibs);
	}
	return libraries.PickKeys();
}
/* ============================================================================
Function:     =        Mpi_Bitonic_sort
Purpose:      =        The root iteration logic for the bitonic sequence sort.
              =        The group size doubles every stage until it exceeds p;
              =        the final list is then gathered.
==============================================================================
Input arg:    =        1. my_rank:          The rank of the calling processor.
              =        2. p:                Upper bound for the group size; the
              =                             global list size is taken to be
              =                             p * list_size.
              =        3. my_list[]:        A pointer to the local array.
              =        4. neighbors_list[]: A pointer to an array, passed on
              =                             to the incr/decr sort steps.
              =        5. list_size:        The size of the processor's array.
              =        6. comm:             The MPI communicator.
=========================================================================== */
void Mpi_Bitonic_sort(int my_rank, int p, int my_list[], int neighbors_list[],
                      int list_size, MPI_Comm comm)
{
    int      phase        = 1;
    int      partner_size = 2;
    unsigned and_bit      = 2;

    while (partner_size <= p) {
        /* The rank bit matching the current group size selects the direction. */
        const int increasing = ((my_rank & and_bit) == 0);

#ifdef DEBUG
        Debugger(increasing ? 1 : 2, my_rank, phase, partner_size,
                 my_list, p*list_size, list_size);
#endif
        if (increasing)
            Bitonic_sort_incr(my_rank, my_list, neighbors_list, list_size,
                              partner_size, comm);
        else
            Bitonic_sort_decr(my_rank, my_list, neighbors_list, list_size,
                              partner_size, comm);

#ifdef DEBUG
        Debugger(3, my_rank, phase, partner_size, my_list, p*list_size, list_size);
#endif
        phase++;
        partner_size = partner_size * 2;
        and_bit      = and_bit << 1;
    }

    if (my_rank == 0)
        printf("The list is now sorted. \n");
    Gather(my_list, p*list_size, my_rank, list_size);
}
//-------------------------------------------------------------------------------- // Bin the screen space transformed triangles into tiles. For single threaded version //-------------------------------------------------------------------------------- void SoftOccluderMeshScalar::BinTransformedTrianglesST(UINT taskId, UINT modelId, UINT meshId, UINT start, UINT end, UINT* pBin, USHORT* pBinModel, USHORT* pBinMesh, USHORT* pNumTrisInBin, UINT idx) { // working on one triangle at a time for(UINT index = start; index <= end; index++) { float4 xformedPos[3]; Gather(xformedPos, index, idx); int fxPtX[3], fxPtY[3]; for(int i = 0; i < 3; i++) { fxPtX[i] = (int)(xformedPos[i].x + 0.5); fxPtY[i] = (int)(xformedPos[i].y + 0.5); } // Compute triangle area int triArea = (fxPtX[1] - fxPtX[0]) * (fxPtY[2] - fxPtY[0]) - (fxPtX[0] - fxPtX[2]) * (fxPtY[0] - fxPtY[1]); // Find bounding box for screen space triangle in terms of pixels int startX = max(min(min(fxPtX[0], fxPtX[1]), fxPtX[2]), 0); int endX = min(max(max(fxPtX[0], fxPtX[1]), fxPtX[2]), mRasterData->mScreenWidth - 1); int startY = max(min(min(fxPtY[0], fxPtY[1]), fxPtY[2]), 0 ); int endY = min(max(max(fxPtY[0], fxPtY[1]), fxPtY[2]), mRasterData->mScreenHeight - 1); // Skip triangle if area is zero if(triArea <= 0) continue; // Dont bin screen-clipped triangles if(endX < startX || endY < startY) continue; // Reject the triangle if any of its verts is behind the nearclip plane if(xformedPos[0].w > 0.0f && xformedPos[1].w > 0.0f && xformedPos[2].w > 0.0f) { // Convert bounding box in terms of pixels to bounding box in terms of tiles int startXx = max(startX/mRasterData->mTileWidthInPixels, 0); int endXx = min(endX/mRasterData->mTileWidthInPixels, mRasterData->mScreenWidthInTiles-1); int startYy = max(startY/mRasterData->mTileHeightInPixels, 0); int endYy = min(endY/mRasterData->mTileHeightInPixels, mRasterData->mScreenHeightInTiles-1); // Add triangle to the tiles or bins that the bounding box covers int row, col; for(row = 
startYy; row <= endYy; row++) { int offset1 = mRasterData->mYOffset1_ST * row; int offset2 = mRasterData->mYOffset2_ST * row; for(col = startXx; col <= endXx; col++) { int idx1 = offset1 + (mRasterData->mXOffset1_ST * col) + taskId; int idx2 = offset2 + (mRasterData->mXOffset2_ST * col) + (taskId * MAX_TRIS_IN_BIN_ST) + pNumTrisInBin[idx1]; pBin[idx2] = index; pBinModel[idx2] = modelId; pBinMesh[idx2] = meshId; pNumTrisInBin[idx1] += 1; } } } } }
//-----------------------------------------------------------------------------------------
// Rasterize the occludee AABB and depth test it against the CPU rasterized depth buffer.
// If any of the rasterized AABB pixels passes the depth test, exit early and mark the
// occludee as visible (*mVisible = true). If all rasterized AABB pixels are occluded the
// function returns without touching *mVisible.
//
// pRenderTargetPixels - depth buffer, reinterpreted as an array of float
//-----------------------------------------------------------------------------------------
void TransformedAABBoxSSE::RasterizeAndDepthTestAABBox(UINT *pRenderTargetPixels)
{
	// Set DAZ and FZ MXCSR bits to flush denormals to zero (i.e., make it faster)
	// Denormals-are-zero (DAZ) is bit 6 and flush-to-zero (FZ) is bit 15,
	// so enabling both means setting bits 6 and 15: 1000 0000 0100 0000 = 0x8040
	_mm_setcsr( _mm_getcsr() | 0x8040 );

	// Per-lane x / y offsets of the four pixels of a 2x2 quad
	__m128i colOffset = _mm_set_epi32(0, 1, 0, 1);
	__m128i rowOffset = _mm_set_epi32(0, 0, 1, 1);

	__m128i fxptZero = _mm_setzero_si128();
	float* pDepthBuffer = (float*)pRenderTargetPixels;

	// Rasterize the AABB triangles 4 at a time
	for(UINT i = 0; i < AABB_TRIANGLES; i += SSE)
	{
		vFloat4 xformedPos[3];
		Gather(xformedPos, i);

		// Convert the vertex positions to integers.
		// NOTE(review): only X and Y of the converted values are read below, yet Z and W
		// are converted as well.
		vFxPt4 xFormedFxPtPos[3];
		for(int m = 0; m < 3; m++)
		{
			xFormedFxPtPos[m].X = _mm_cvtps_epi32(xformedPos[m].X);
			xFormedFxPtPos[m].Y = _mm_cvtps_epi32(xformedPos[m].Y);
			xFormedFxPtPos[m].Z = _mm_cvtps_epi32(xformedPos[m].Z);
			xFormedFxPtPos[m].W = _mm_cvtps_epi32(xformedPos[m].W);
		}

		// Fab(x, y) =     Ax       +       By     +      C              = 0
		// Fab(x, y) = (ya - yb)x   +   (xb - xa)y + (xa * yb - xb * ya) = 0
		// Compute A = (ya - yb) for the 3 line segments that make up each triangle
		__m128i A0 = _mm_sub_epi32(xFormedFxPtPos[1].Y, xFormedFxPtPos[2].Y);
		__m128i A1 = _mm_sub_epi32(xFormedFxPtPos[2].Y, xFormedFxPtPos[0].Y);
		__m128i A2 = _mm_sub_epi32(xFormedFxPtPos[0].Y, xFormedFxPtPos[1].Y);

		// Compute B = (xb - xa) for the 3 line segments that make up each triangle
		__m128i B0 = _mm_sub_epi32(xFormedFxPtPos[2].X, xFormedFxPtPos[1].X);
		__m128i B1 = _mm_sub_epi32(xFormedFxPtPos[0].X, xFormedFxPtPos[2].X);
		__m128i B2 = _mm_sub_epi32(xFormedFxPtPos[1].X, xFormedFxPtPos[0].X);

		// Compute C = (xa * yb - xb * ya) for the 3 line segments that make up each triangle
		__m128i C0 = _mm_sub_epi32(_mm_mullo_epi32(xFormedFxPtPos[1].X, xFormedFxPtPos[2].Y), _mm_mullo_epi32(xFormedFxPtPos[2].X, xFormedFxPtPos[1].Y));
		__m128i C1 = _mm_sub_epi32(_mm_mullo_epi32(xFormedFxPtPos[2].X, xFormedFxPtPos[0].Y), _mm_mullo_epi32(xFormedFxPtPos[0].X, xFormedFxPtPos[2].Y));
		__m128i C2 = _mm_sub_epi32(_mm_mullo_epi32(xFormedFxPtPos[0].X, xFormedFxPtPos[1].Y), _mm_mullo_epi32(xFormedFxPtPos[1].X, xFormedFxPtPos[0].Y));

		// Compute triangle area: edge function 0 evaluated at vertex 0
		__m128i triArea = _mm_mullo_epi32(A0, xFormedFxPtPos[0].X);
		triArea = _mm_add_epi32(triArea, _mm_mullo_epi32(B0, xFormedFxPtPos[0].Y));
		triArea = _mm_add_epi32(triArea, C0);

		__m128 oneOverTriArea = _mm_div_ps(_mm_set1_ps(1.0f), _mm_cvtepi32_ps(triArea));

		// Use bounding box traversal strategy to determine which pixels to rasterize.
		// The start coordinates are clamped to 0 and rounded down to an even value
		// (mask 0xFFFFFFFE); the end coordinates are max + 1, clamped to SCREENW / SCREENH.
		__m128i startX = _mm_and_si128(Max(Min(Min(xFormedFxPtPos[0].X, xFormedFxPtPos[1].X), xFormedFxPtPos[2].X), _mm_set1_epi32(0)), _mm_set1_epi32(0xFFFFFFFE));
		__m128i endX = Min(_mm_add_epi32(Max(Max(xFormedFxPtPos[0].X, xFormedFxPtPos[1].X), xFormedFxPtPos[2].X), _mm_set1_epi32(1)), _mm_set1_epi32(SCREENW));

		__m128i startY = _mm_and_si128(Max(Min(Min(xFormedFxPtPos[0].Y, xFormedFxPtPos[1].Y), xFormedFxPtPos[2].Y), _mm_set1_epi32(0)), _mm_set1_epi32(0xFFFFFFFE));
		__m128i endY = Min(_mm_add_epi32(Max(Max(xFormedFxPtPos[0].Y, xFormedFxPtPos[1].Y), xFormedFxPtPos[2].Y), _mm_set1_epi32(1)), _mm_set1_epi32(SCREENH));

		for(int vv = 0; vv < 3; vv++)
		{
			// If W (holding 1/w in our case) is not between 0 and 1,
			// then the vertex is behind the near clip plane (1.0 in our case).
			// If W < 1, then verify 1/W > 1 (for W > 0), and 1/W < 0 (for W < 0).
			__m128 nearClipMask0 = _mm_cmple_ps(xformedPos[vv].W, _mm_set1_ps(0.0f));
			__m128 nearClipMask1 = _mm_cmpge_ps(xformedPos[vv].W, _mm_set1_ps(1.0f));
			__m128 nearClipMask = _mm_or_ps(nearClipMask0, nearClipMask1);

			if(!_mm_test_all_zeros(*(__m128i*)&nearClipMask, *(__m128i*)&nearClipMask))
			{
				// At least one of the four triangles processed in this SSE batch has this
				// vertex failing the 0 < W < 1 test; report the occludee as visible.
				*mVisible = true;
				return;
			}
		}

		// Now we have 4 triangles set up.  Rasterize them each individually.
		for(int lane=0; lane < SSE; lane++)
		{
			// Skip triangle if area is zero or negative
			if(triArea.m128i_i32[lane] <= 0)
			{
				continue;
			}

			// Extract this triangle's properties from the SIMD versions
			// (oneOverW is filled in here but not read again in this function)
			__m128 zz[3], oneOverW[3];
			for(int vv = 0; vv < 3; vv++)
			{
				zz[vv] = _mm_set1_ps(xformedPos[vv].Z.m128_f32[lane]);
				oneOverW[vv] = _mm_set1_ps(xformedPos[vv].W.m128_f32[lane]);
			}

			// Pre-divide the vertex depths by the area so that the integer edge-function
			// values can be used directly as interpolation weights below
			__m128 oneOverTotalArea = _mm_set1_ps(oneOverTriArea.m128_f32[lane]);
			zz[0] *= oneOverTotalArea;
			zz[1] *= oneOverTotalArea;
			zz[2] *= oneOverTotalArea;

			int startXx = startX.m128i_i32[lane];
			int endXx = endX.m128i_i32[lane];
			int startYy = startY.m128i_i32[lane];
			int endYy = endY.m128i_i32[lane];

			// Broadcast this triangle's edge coefficients to all four lanes
			__m128i aa0 = _mm_set1_epi32(A0.m128i_i32[lane]);
			__m128i aa1 = _mm_set1_epi32(A1.m128i_i32[lane]);
			__m128i aa2 = _mm_set1_epi32(A2.m128i_i32[lane]);

			__m128i bb0 = _mm_set1_epi32(B0.m128i_i32[lane]);
			__m128i bb1 = _mm_set1_epi32(B1.m128i_i32[lane]);
			__m128i bb2 = _mm_set1_epi32(B2.m128i_i32[lane]);

			__m128i cc0 = _mm_set1_epi32(C0.m128i_i32[lane]);
			__m128i cc1 = _mm_set1_epi32(C1.m128i_i32[lane]);
			__m128i cc2 = _mm_set1_epi32(C2.m128i_i32[lane]);

			// Stepping two pixels in x changes each edge function by 2 * A
			__m128i aa0Inc = _mm_slli_epi32(aa0, 1);
			__m128i aa1Inc = _mm_slli_epi32(aa1, 1);
			__m128i aa2Inc = _mm_slli_epi32(aa2, 1);

			__m128i row, col;

			int rowIdx;
			// To avoid this branching, choose one method to traverse and store the pixel depth
			if(gVisualizeDepthBuffer)
			{
				// Sequentially traverse and store pixel depths contiguously
				rowIdx = (startYy * SCREENW + startXx);
			}
			else
			{
				// Traverse pixels in 2x2 blocks and store 2x2 pixel quad depths contiguously in memory ==> 2*X
				// This method provides better performance
				rowIdx = (startYy * SCREENW + 2 * startXx);
			}

			col = _mm_add_epi32(colOffset, _mm_set1_epi32(startXx));
			__m128i aa0Col = _mm_mullo_epi32(aa0, col);
			__m128i aa1Col = _mm_mullo_epi32(aa1, col);
			__m128i aa2Col = _mm_mullo_epi32(aa2, col);

			row = _mm_add_epi32(rowOffset, _mm_set1_epi32(startYy));
			__m128i bb0Row = _mm_add_epi32(_mm_mullo_epi32(bb0, row), cc0);
			__m128i bb1Row = _mm_add_epi32(_mm_mullo_epi32(bb1, row), cc1);
			__m128i bb2Row = _mm_add_epi32(_mm_mullo_epi32(bb2, row), cc2);

			// Stepping two pixels in y changes each edge function by 2 * B
			__m128i bb0Inc = _mm_slli_epi32(bb0, 1);
			__m128i bb1Inc = _mm_slli_epi32(bb1, 1);
			__m128i bb2Inc = _mm_slli_epi32(bb2, 1);

			// Incrementally compute Fab(x, y) for all the pixels inside the bounding box formed by (startX, endX) and (startY, endY)
			for(int r = startYy; r < endYy; r += 2,
											row  = _mm_add_epi32(row, _mm_set1_epi32(2)),
											rowIdx = rowIdx + 2 * SCREENW,
											bb0Row = _mm_add_epi32(bb0Row, bb0Inc),
											bb1Row = _mm_add_epi32(bb1Row, bb1Inc),
											bb2Row = _mm_add_epi32(bb2Row, bb2Inc))
			{
				// Compute barycentric coordinates
				int idx = rowIdx;
				__m128i alpha = _mm_add_epi32(aa0Col, bb0Row);
				__m128i beta = _mm_add_epi32(aa1Col, bb1Row);
				__m128i gama = _mm_add_epi32(aa2Col, bb2Row);

				// Depth-buffer index step per 2-pixel column step; depends on the layout chosen above
				int idxIncr;
				if(gVisualizeDepthBuffer)
				{
					idxIncr = 2;
				}
				else
				{
					idxIncr = 4;
				}

				for(int c = startXx; c < endXx; c += 2,
												idx = idx + idxIncr,
												alpha = _mm_add_epi32(alpha, aa0Inc),
												beta  = _mm_add_epi32(beta, aa1Inc),
												gama  = _mm_add_epi32(gama, aa2Inc))
				{
					// Test pixel inside triangle: compares 0 < (alpha | beta | gama) per lane
					__m128i mask = _mm_cmplt_epi32(fxptZero, _mm_or_si128(_mm_or_si128(alpha, beta), gama));

					// Early out if all of this quad's pixels are outside the triangle.
					if(_mm_test_all_zeros(mask, mask))
					{
						continue;
					}

					// Compute barycentric-interpolated depth
					__m128 depth = _mm_mul_ps(_mm_cvtepi32_ps(alpha), zz[0]);
					depth = _mm_add_ps(depth, _mm_mul_ps(_mm_cvtepi32_ps(beta), zz[1]));
					depth = _mm_add_ps(depth, _mm_mul_ps(_mm_cvtepi32_ps(gama), zz[2]));

					__m128 previousDepthValue;
					if(gVisualizeDepthBuffer)
					{
						// Row-major layout: assemble the 2x2 quad from two consecutive rows
						previousDepthValue = _mm_set_ps(pDepthBuffer[idx], pDepthBuffer[idx + 1], pDepthBuffer[idx + SCREENW], pDepthBuffer[idx + SCREENW + 1]);
					}
					else
					{
						// Quad layout: the four depths of the 2x2 quad are contiguous
						previousDepthValue = *(__m128*)&pDepthBuffer[idx];
					}

					// A pixel passes when it is inside the triangle and its depth is >= the stored depth
					__m128 depthMask = _mm_cmpge_ps( depth, previousDepthValue);
					__m128i finalMask = _mm_and_si128( mask, _mm_castps_si128(depthMask));
					if(!_mm_test_all_zeros(finalMask, finalMask))
					{
						*mVisible = true;
						return; //early exit
					}
				}//for each column
			}// for each row
		}// for each triangle
	}// for each set of SIMD# triangles
}
// <ctype.h> classification is undefined for negative values other than EOF,
// and a plain char can be negative. These wrappers keep the original result
// for a character equal to EOF (never a digit / letter) and make every other
// value well defined by converting through unsigned char.
static bool CfgScanIsDigit(char c)
{
	return c != EOF && isdigit((unsigned char)c) != 0;
}

static bool CfgScanIsAlpha(char c)
{
	return c != EOF && isalpha((unsigned char)c) != 0;
}

// Reads the next token from the input.
//
// Blanks and tabs are skipped; a newline or ';' makes the scanner call
// ReadLine() and continue. The token kinds produced are:
//   TINTEGER / TREAL - number (value in v.nval / v.fval)
//   reserved word    - id taken from the rwords table
//   TSYMBOL          - identifier (v.szval is a strdup'ed copy)
//   TSTRING          - quoted string (v.szval is a strdup'ed copy)
//   TEQUAL, TSECTION, TUNARY - '=', '!', '-'
//   TEOF             - end of input
//   TINVALID         - anything else, a malformed number, or an
//                      unterminated string
//
// The strings returned in v.szval are allocated with strdup().
CFGTOKEN CfgFile::Scan()
{
	CFGTOKEN ret;
	ret.id = TINVALID;

	for(;;){
		char c = Input();

		// Whitespace
		if(c=='\t' || c==' ')
			continue;

		// End of line, or ';' (the rest of the line is skipped via ReadLine)
		if(c=='\n' || c==';'){
			ReadLine();
			continue;
		}

		// NUMBER
		if(CfgScanIsDigit(c) || c=='.'){
			m_nTextLen = 0;
			if( GetDigit(c) > 0 )
				ret.id = TREAL;
			else
				ret.id = TINTEGER;

			// Nothing collected, or a lone '.', is not a number
			if(m_nTextLen == 0 || (m_nTextLen == 1 && m_szText[0]=='.')){
				ret.id = TINVALID;
				return ret;
			}

			Gather('\0');
			if( ret.id == TINTEGER )
				ret.v.nval = atoi(m_szText);
			else
				ret.v.fval = (float)atof(m_szText);
			return ret;
		}

		// SYMBOL
		if( CfgScanIsAlpha(c) ){
			m_nTextLen = 0;
			Gather(c);
			for( c = Input(); CfgScanIsAlpha(c) || CfgScanIsDigit(c); c = Input() )
				Gather(c);
			Unput(c);
			Gather('\0');

			// Reserved words take precedence over plain symbols
			int i;
			for( i = 0; rwords[i].val != TINVALID; i++ ){
				if( strcmp(m_szText, rwords[i].szText) == 0 )
					break;
			}
			if( rwords[i].val != TINVALID ){
				ret.id = rwords[i].val;
			}
			else {
				ret.id = TSYMBOL;
				ret.v.szval = strdup(m_szText);
			}
			return ret;
		}

		// STRING CONSTANT
		if(c == '\"'){
			m_nTextLen = 0;
			for( c = Input(); ; c = Input() ){
				if(c == '\"'){
					// A doubled quote is stored as an escaped quote; a single one ends the string
					if((c = Input()) == '\"'){
						Gather('\\');
						Gather('\"');
					}
					else {
						break;
					}
				}
				else if(c == '\\'){
					c = Input();
					if(c == EOF){
						// Input ended right after the backslash: unterminated string
						Unput(c);
						ret.id = TINVALID;
						return ret;
					}
					Gather(c);
					if(c == '\n')
						ReadLine();
				}
				else if(c == '\n'){
					// Unescaped newline inside a string
					ret.id = TINVALID;
					ReadLine();
					return ret;
				}
				else if(c == EOF){
					// Unterminated string at end of input. Without this check the loop
					// never ends. The EOF is pushed back so the next Scan() can
					// report it.
					Unput(c);
					ret.id = TINVALID;
					return ret;
				}
				else {
					Gather(c);
				}
			}
			Unput(c);
			Gather('\0');
			ret.id = TSTRING;
			ret.v.szval = strdup(m_szText);
			return ret;
		}

		// Single-character tokens; anything unrecognized stays TINVALID
		if(c == '=')
			ret.id = TEQUAL;
		else if(c == '!')
			ret.id = TSECTION;
		else if(c == '-')
			ret.id = TUNARY;
		else if(c == EOF)
			ret.id = TEOF;
		return ret;
	}
}
void CppBuilder::AddMakeFile(MakeFile& makefile, String package, const Vector<String>& all_uses, const Vector<String>& all_libraries, const Index<String>& common_config, bool exporting) { String packagepath = PackagePath(package); Package pkg; pkg.Load(packagepath); String packagedir = GetFileFolder(packagepath); Vector<String> src = GetUppDirs(); for(int i = 0; i < src.GetCount(); i++) src[i] = UnixPath(src[i]); bool main = HasFlag("MAIN"); bool is_shared = HasFlag("SO"); bool libout = !main && !HasFlag("NOLIB"); bool win32 = HasFlag("WIN32"); String pack_ident = MakeIdent(package); String outdir = "OutDir_" + pack_ident; String macros = "Macro_" + pack_ident; String macdef = "$(Macro)"; String objext = (HasFlag("MSC") || HasFlag("EVC") ? ".obj" : ".o"); Vector<String> x(config.GetKeys(), 1); Sort(x); for(int i = 0; i < x.GetCount(); i++) { if(common_config.Find(x[i]) < 0) macdef << " -Dflag" << x[i]; x[i] = InitCaps(x[i]); } makefile.outdir << "$(" << outdir << ")"; makefile.outfile << AdjustMakePath(GetFileTitle(NativePath(package))); if(main) makefile.outfile << GetTargetExt(); else if(is_shared) makefile.outfile << (win32 ? ".dll" : ".so"); else makefile.outfile << (win32 && HasFlag("MSC") ? ".lib" : ".a"); makefile.output << (main ? String("$(OutDir)") : makefile.outdir) << makefile.outfile; if(main) { makefile.config << "CXX = c++\n" "LINKER = $(CXX)\n"; String flags; if(HasFlag("DEBUG")) flags << " -D_DEBUG " << debug_options; else flags << ' ' << release_options; if(HasFlag("DEBUG_MINIMAL")) flags << " -ggdb -g1"; if(HasFlag("DEBUG_FULL")) flags << " -ggdb -g2"; if(is_shared && !win32) flags << " -fPIC "; flags << ' ' << Gather(pkg.option, config.GetKeys()); makefile.config << "CFLAGS =" << flags << "\n" "CXXFLAGS =" << flags << "\n" "LDFLAGS = " << (HasFlag("DEBUG") ? 
debug_link : release_link) << " $(LINKOPTIONS)\n" "LIBPATH ="; for(int i = 0; i < libpath.GetCount(); i++) makefile.config << " -L" << GetMakePath(AdjustMakePath(GetHostPathQ(libpath[i]))); makefile.config << "\n" "AR = ar -sr\n\n"; makefile.install << "\t-mkdir -p $(OutDir)\n"; Vector<String> lib; String lnk; lnk << "$(LINKER)"; if(!HasFlag("SHARED")) lnk << " -static"; if(HasFlag("WIN32")) { lnk << " -mwindows"; if(!HasFlag("GUI")) makefile.linkfiles << " -mconsole"; } lnk << " -o $(OutFile)"; if(HasFlag("DEBUG") || HasFlag("DEBUG_MINIMAL") || HasFlag("DEBUG_FULL")) lnk << " -ggdb"; else lnk << (!HasFlag("OSX11") ? " -Wl,-s" : ""); lnk << " $(LIBPATH)"; if (!HasFlag("OSX11")) lnk << " -Wl,-O,2"; lnk << " $(LDFLAGS) -Wl,--start-group "; makefile.linkfiles = lnk; } makefile.config << outdir << " = $(UPPOUT)" << GetMakePath(AdjustMakePath(String().Cat() << package << '/' << method << '-' << Join(x, "-") << '/')) << "\n" << macros << " = " << macdef << "\n"; makefile.install << "\t-mkdir -p $(" << outdir << ")\n"; String libdep, libfiles; libdep << makefile.output << ":"; if(is_shared) { libfiles = "c++ -shared -fPIC"; // -v"; Point p = ExtractVersion(); if(!IsNull(p.x)) { libfiles << " -Xlinker --major-image-version -Xlinker " << p.x; if(!IsNull(p.y)) libfiles << " -Xlinker --minor-image-version -Xlinker " << p.y; } libfiles << " -o "; } else libfiles = "$(AR) "; libfiles << makefile.output; Vector<String> libs = Split(Gather(pkg.library, config.GetKeys()), ' '); for(int i = 0; i < libs.GetCount(); i++) { String ln = libs[i]; String ext = ToLower(GetFileExt(ln)); if(ext == ".a" || ext == ".so" || ext == ".dll") makefile.linkfileend << " \\\n\t\t\t" << GetHostPathQ(FindInDirs(libpath, ln)); else makefile.linkfileend << " \\\n\t\t\t-l" << ln; } for(int i = 0; i < pkg.GetCount(); i++) if(!pkg[i].separator) { String gop = Gather(pkg[i].option, config.GetKeys()); String fn = SourcePath(package, pkg[i]); String ext = ToLower(GetFileExt(fn)); bool isc = ext == ".c"; bool 
isrc = (ext == ".rc" && HasFlag("WIN32")); bool iscpp = (ext == ".cpp" || ext == ".cc" || ext == ".cxx"); bool isicpp = (ext == ".icpp"); if(ext == ".brc") { isc = true; fn << "c"; } if(isc || isrc || iscpp || isicpp) { String outfile; outfile << makefile.outdir << AdjustMakePath(GetFileTitle(fn)) << (isrc ? "_rc" : "") << objext; String srcfile = GetMakePath(MakeSourcePath(src, fn, false, exporting)); makefile.rules << outfile << ": " << srcfile; Vector<String> dep = HdependGetDependencies(fn); Sort(dep, GetLanguageInfo()); for(int d = 0; d < dep.GetCount(); d++) { String dfn = MakeSourcePath(src, dep[d], true, exporting); if(!IsNull(dfn)) makefile.rules << " \\\n\t" << GetMakePath(dfn); } makefile.rules << "\n" "\t$(CXX) -c " << (isc ? "-x c $(CFLAGS)" : "-x c++ $(CXXFLAGS)") << " $(CINC) $(" << macros << ") " << gop << " " << srcfile << " -o " << outfile << "\n\n"; if(!libout || isicpp) { makefile.linkdep << " \\\n\t" << outfile; makefile.linkfiles << " \\\n\t\t" << outfile; } else { libdep << " \\\n\t" << outfile; libfiles << " \\\n\t\t" << outfile; } } else if(ext == ".o" || ext == ".obj" || ext == ".a" || ext == ".so" || ext == ".lib" || ext == ".dll") { makefile.linkdep << " \\\n\t" << fn; makefile.linkfiles << ' ' << fn; } } if(libout) { makefile.rules << libdep << "\n\t" << libfiles << "\n\n"; makefile.linkdep << " \\\n\t" << makefile.output; makefile.linkfiles << " \\\n\t\t\t" << makefile.output; } /* if(main) { if(!HasFlag("SOLARIS")&&!HasFlag("OSX11")) makefile.linkfiles << " \\\n\t\t-Wl,--start-group "; DDUMPC(all_libraries); for(int i = 0; i < all_libraries.GetCount(); i++) { String ln = all_libraries[i]; String ext = ToLower(GetFileExt(ln)); if(ext == ".a" || ext == ".so" || ext == ".dll") makefile.linkfileend << " \\\n\t\t\t" << GetHostPathQ(FindInDirs(libpath, ln)); else makefile.linkfileend << " \\\n\t\t\t-l" << ln; } if(!HasFlag("SOLARIS")&&!HasFlag("OSX11")) makefile.linkfileend << " \\\n\t\t-Wl,--end-group\n\n"; } */ }
//-----------------------------------------------------------------------------------------
// Rasterize the occludee AABB and depth test it against the CPU rasterized depth buffer.
// Returns true as soon as a row contains a pixel that lies inside one of the AABB
// triangles and whose interpolated depth is >= the stored depth (presumably meaning the
// occludee is visible - confirm against the caller). Returns false when no rasterized
// pixel passes.
//
// pRenderTargetPixels - depth buffer, reinterpreted as an array of float
// pXformedPos, idx    - forwarded to Gather() to fetch the transformed vertices
//-----------------------------------------------------------------------------------------
bool TransformedAABBoxSSE::RasterizeAndDepthTestAABBox(UINT *pRenderTargetPixels, const __m128 pXformedPos[], UINT idx)
{
	// Set DAZ and FZ MXCSR bits to flush denormals to zero (i.e., make it faster)
	// Denormals-are-zero (DAZ) is bit 6 and flush-to-zero (FZ) is bit 15,
	// so enabling both means setting bits 6 and 15: 1000 0000 0100 0000 = 0x8040
	ssp_setcsr( ssp_getcsr() | 0x8040 );

	// Per-lane x / y offsets of the four pixels of a 2x2 quad
	__m128i colOffset = ssp_setr_epi32(0, 1, 0, 1);
	__m128i rowOffset = ssp_setr_epi32(0, 0, 1, 1);

	float* pDepthBuffer = (float*)pRenderTargetPixels;

	// Rasterize the AABB triangles 4 at a time
	for(UINT i = 0; i < AABB_TRIANGLES; i += SSE)
	{
		vFloat4 xformedPos[3];
		Gather(xformedPos, i, pXformedPos, idx);

		// use fixed-point only for X and Y.  Avoid work for Z and W.
		__m128i fxPtX[3], fxPtY[3];
		for(int m = 0; m < 3; m++)
		{
			fxPtX[m] = ssp_cvtps_epi32(xformedPos[m].X);
			fxPtY[m] = ssp_cvtps_epi32(xformedPos[m].Y);
		}

		// Fab(x, y) =     Ax       +       By     +      C              = 0
		// Fab(x, y) = (ya - yb)x   +   (xb - xa)y + (xa * yb - xb * ya) = 0
		// Compute A = (ya - yb) for the 3 line segments that make up each triangle
		__m128i A0 = ssp_sub_epi32(fxPtY[1], fxPtY[2]);
		__m128i A1 = ssp_sub_epi32(fxPtY[2], fxPtY[0]);
		__m128i A2 = ssp_sub_epi32(fxPtY[0], fxPtY[1]);

		// Compute B = (xb - xa) for the 3 line segments that make up each triangle
		__m128i B0 = ssp_sub_epi32(fxPtX[2], fxPtX[1]);
		__m128i B1 = ssp_sub_epi32(fxPtX[0], fxPtX[2]);
		__m128i B2 = ssp_sub_epi32(fxPtX[1], fxPtX[0]);

		// Compute C = (xa * yb - xb * ya) for the 3 line segments that make up each triangle
		__m128i C0 = ssp_sub_epi32(ssp_mullo_epi32(fxPtX[1], fxPtY[2]), ssp_mullo_epi32(fxPtX[2], fxPtY[1]));
		__m128i C1 = ssp_sub_epi32(ssp_mullo_epi32(fxPtX[2], fxPtY[0]), ssp_mullo_epi32(fxPtX[0], fxPtY[2]));
		__m128i C2 = ssp_sub_epi32(ssp_mullo_epi32(fxPtX[0], fxPtY[1]), ssp_mullo_epi32(fxPtX[1], fxPtY[0]));

		// Compute triangle area as B2 * A1 - B1 * A2
		__m128i triArea = ssp_mullo_epi32(B2, A1);
		triArea = ssp_sub_epi32(triArea, ssp_mullo_epi32(B1, A2));
		__m128 oneOverTriArea = ssp_div_ps(ssp_set1_ps(1.0f), ssp_cvtepi32_ps(triArea));

		// Depth is interpolated as Z[0] + beta * Z[1] + gama * Z[2], where Z[1] and Z[2]
		// hold the depth differences to vertex 0 divided by the triangle area
		__m128 Z[3];
		Z[0] = xformedPos[0].Z;
		Z[1] = ssp_mul_ps(ssp_sub_ps(xformedPos[1].Z, Z[0]), oneOverTriArea);
		Z[2] = ssp_mul_ps(ssp_sub_ps(xformedPos[2].Z, Z[0]), oneOverTriArea);

		// Use bounding box traversal strategy to determine which pixels to rasterize.
		// The start coordinates are clamped to 0 and rounded down to an even value (mask ~1);
		// the end coordinates are clamped to SCREENW - 1 / SCREENH - 1.
		__m128i startX = ssp_and_si128(Max(Min(Min(fxPtX[0], fxPtX[1]), fxPtX[2]), ssp_set1_epi32(0)), ssp_set1_epi32(~1));
		__m128i endX = Min(Max(Max(fxPtX[0], fxPtX[1]), fxPtX[2]), ssp_set1_epi32(SCREENW - 1));

		__m128i startY = ssp_and_si128(Max(Min(Min(fxPtY[0], fxPtY[1]), fxPtY[2]), ssp_set1_epi32(0)), ssp_set1_epi32(~1));
		__m128i endY = Min(Max(Max(fxPtY[0], fxPtY[1]), fxPtY[2]), ssp_set1_epi32(SCREENH - 1));

		// Now we have 4 triangles set up.  Rasterize them each individually.
		for(int lane=0; lane < SSE; lane++)
		{
			// Skip triangle if area is zero or negative
			if(triArea.m128i_i32[lane] <= 0)
			{
				continue;
			}

			// Extract this triangle's properties from the SIMD versions
			__m128 zz[3];
			for(int vv = 0; vv < 3; vv++)
			{
				zz[vv] = ssp_set1_ps(Z[vv].m128_f32[lane]);
			}

			int startXx = startX.m128i_i32[lane];
			int endXx = endX.m128i_i32[lane];
			int startYy = startY.m128i_i32[lane];
			int endYy = endY.m128i_i32[lane];

			// Broadcast this triangle's edge coefficients to all four lanes
			__m128i aa0 = ssp_set1_epi32(A0.m128i_i32[lane]);
			__m128i aa1 = ssp_set1_epi32(A1.m128i_i32[lane]);
			__m128i aa2 = ssp_set1_epi32(A2.m128i_i32[lane]);

			__m128i bb0 = ssp_set1_epi32(B0.m128i_i32[lane]);
			__m128i bb1 = ssp_set1_epi32(B1.m128i_i32[lane]);
			__m128i bb2 = ssp_set1_epi32(B2.m128i_i32[lane]);

			// Stepping two pixels in x / y changes each edge function by 2 * A / 2 * B
			__m128i aa0Inc = ssp_slli_epi32(aa0, 1);
			__m128i aa1Inc = ssp_slli_epi32(aa1, 1);
			__m128i aa2Inc = ssp_slli_epi32(aa2, 1);

			__m128i bb0Inc = ssp_slli_epi32(bb0, 1);
			__m128i bb1Inc = ssp_slli_epi32(bb1, 1);
			__m128i bb2Inc = ssp_slli_epi32(bb2, 1);

			__m128i row, col;

			// Traverse pixels in 2x2 blocks and store 2x2 pixel quad depths contiguously in memory ==> 2*X
			// This method provides better performance
			int rowIdx = (startYy * SCREENW + 2 * startXx);

			col = ssp_add_epi32(colOffset, ssp_set1_epi32(startXx));
			__m128i aa0Col = ssp_mullo_epi32(aa0, col);
			__m128i aa1Col = ssp_mullo_epi32(aa1, col);
			__m128i aa2Col = ssp_mullo_epi32(aa2, col);

			row = ssp_add_epi32(rowOffset, ssp_set1_epi32(startYy));
			__m128i bb0Row = ssp_add_epi32(ssp_mullo_epi32(bb0, row), ssp_set1_epi32(C0.m128i_i32[lane]));
			__m128i bb1Row = ssp_add_epi32(ssp_mullo_epi32(bb1, row), ssp_set1_epi32(C1.m128i_i32[lane]));
			__m128i bb2Row = ssp_add_epi32(ssp_mullo_epi32(bb2, row), ssp_set1_epi32(C2.m128i_i32[lane]));

			// Edge-function values for the first quad of the first row
			__m128i sum0Row = ssp_add_epi32(aa0Col, bb0Row);
			__m128i sum1Row = ssp_add_epi32(aa1Col, bb1Row);
			__m128i sum2Row = ssp_add_epi32(aa2Col, bb2Row);

			// Depth change for one 2-pixel step in x
			__m128 zx = ssp_mul_ps(ssp_cvtepi32_ps(aa1Inc), zz[1]);
			zx = ssp_add_ps(zx, ssp_mul_ps(ssp_cvtepi32_ps(aa2Inc), zz[2]));

			// Incrementally compute Fab(x, y) for all the pixels inside the bounding box formed by (startX, endX) and (startY, endY)
			for(int r = startYy; r < endYy; r += 2,
				rowIdx += 2 * SCREENW,
				sum0Row = ssp_add_epi32(sum0Row, bb0Inc),
				sum1Row = ssp_add_epi32(sum1Row, bb1Inc),
				sum2Row = ssp_add_epi32(sum2Row, bb2Inc))
			{
				// Compute barycentric coordinates
				int index = rowIdx;
				__m128i alpha = sum0Row;
				__m128i beta = sum1Row;
				__m128i gama = sum2Row;

				//Compute barycentric-interpolated depth
				__m128 depth = zz[0];
				depth = ssp_add_ps(depth, ssp_mul_ps(ssp_cvtepi32_ps(beta), zz[1]));
				depth = ssp_add_ps(depth, ssp_mul_ps(ssp_cvtepi32_ps(gama), zz[2]));

				// Accumulates the pass mask of every quad in this row; tested once after the row
				__m128i anyOut = ssp_setzero_si128();

				for(int c = startXx; c < endXx; c += 2,
					index += 4,
					alpha = ssp_add_epi32(alpha, aa0Inc),
					beta  = ssp_add_epi32(beta, aa1Inc),
					gama  = ssp_add_epi32(gama, aa2Inc),
					depth = ssp_add_ps(depth, zx))
				{
					//Test Pixel inside triangle: the sign bit of (alpha | beta | gama) is set
					//when any of the three edge functions is negative
					__m128i mask = ssp_or_si128(ssp_or_si128(alpha, beta), gama);

					__m128 previousDepthValue = ssp_load_ps(&pDepthBuffer[index]);

					// (~mask) & depthMask: the sign bit survives only where no edge function is
					// negative and depth >= the stored depth
					__m128 depthMask  = ssp_cmpge_ps(depth, previousDepthValue);
					__m128i finalMask = ssp_andnot_si128(mask, ssp_castps_si128(depthMask));
					anyOut = ssp_or_si128(anyOut, finalMask);
				}//for each column

				// Only the sign bit of each lane is examined
				if(!ssp_testz_si128(anyOut, ssp_set1_epi32(0x80000000)))
				{
					return true; //early exit
				}
			}// for each row
		}// for each triangle
	}// for each set of SIMD# triangles

	return false;
}
void Renderer::Render() { if(m_ClearLighting) ClearLighting(); if(m_ClearAccumulationBuffer) ClearAccumulationBuffer(); CTimer frameTimer(CTimer::OGL); CTimer timer(CTimer::OGL); if(m_ProfileFrame) { std::cout << std::endl; std::cout << "Profile frame --------------- " << std::endl; std::cout << std::endl; frameTimer.Start(); timer.Start(); } SetUpRender(); if(m_ProfileFrame) timer.Stop("set up render"); if(m_ProfileFrame) timer.Start(); UpdateUniformBuffers(); if(m_ProfileFrame) timer.Stop("update ubs"); if(m_CurrentPathAntiradiance == 0 && m_CurrentPathShadowmap == 0) { m_experimentData->Init("test", "nois.data"); m_experimentData->MaxTime(450); m_globalTimer->Start(); m_resultTimer->Start(); m_glTimer->Start(); CreateGBuffer(); m_cudaGather->rebuildVisiblePointsBvh(); } if (m_confManager->GetConfVars()->drawGBufferTextures) { int border = 10; int width = (m_camera->GetWidth() - 4 * border) / 2; int height = (m_camera->GetHeight() - 4 * border) / 2; m_textureViewer->drawTexture(m_gbuffer->GetNormalTexture(), border, border, width, height); m_textureViewer->drawTexture(m_gbuffer->GetPositionTextureWS(), 3 * border + width, border, width, height); m_textureViewer->drawTexture(m_normalizeAntiradianceRenderTarget->GetTarget(2), border, 3 * border + height, width, height); m_textureViewer->drawTexture(m_depthBuffer.get(), 3 * border + width, 3 * border + height, width, height); return; } std::vector<Avpl> avpls_shadowmap; std::vector<Avpl> avpls_antiradiance; if(m_ProfileFrame) timer.Start(); //GetAVPLs(avpls_shadowmap, avpls_antiradiance); m_avplShooter->shoot(avpls_shadowmap, avpls_antiradiance, m_confManager->GetConfVars()->NumAVPLsPerFrame); m_CurrentPathAntiradiance += m_confManager->GetConfVars()->NumAVPLsPerFrame; if(m_ProfileFrame) timer.Stop("get avpls"); if(m_ProfileFrame) timer.Start(); if (m_confManager->GetConfVars()->gatherWithCuda) { if (avpls_antiradiance.size() > 0) { m_cudaGather->run(avpls_antiradiance, m_camera->GetPosition(), m_sceneProbe.get(), 
m_scene->getSceneExtent(), m_ProfileFrame); Add(m_gatherAntiradianceRenderTarget.get(), m_cudaRenderTarget.get()); } if(m_ProfileFrame) timer.Stop("gather"); if(m_ProfileFrame) timer.Start(); } else { Gather(avpls_shadowmap, avpls_antiradiance); if(m_ProfileFrame) timer.Stop("gather"); if(m_ProfileFrame) timer.Start(); } Normalize(m_normalizeShadowmapRenderTarget.get(), m_gatherShadowmapRenderTarget.get(), m_CurrentPathShadowmap); Normalize(m_normalizeAntiradianceRenderTarget.get(), m_gatherAntiradianceRenderTarget.get(), m_CurrentPathAntiradiance); if(m_ProfileFrame) timer.Stop("normalize"); if(m_ProfileFrame) timer.Start(); if(m_confManager->GetConfVars()->LightingMode == 2) { drawAreaLight(m_normalizeShadowmapRenderTarget.get(), glm::vec3(0.f, 0.f, 0.f)); drawAreaLight(m_normalizeAntiradianceRenderTarget.get(), glm::vec3(0.f, 0.f, 0.f)); } else { drawAreaLight(m_normalizeShadowmapRenderTarget.get(), m_scene->getAreaLight()->getRadiance()); } SetTransformToCamera(); Add(m_resultRenderTarget.get(), m_normalizeAntiradianceRenderTarget.get(), m_normalizeShadowmapRenderTarget.get()); if (m_confManager->GetConfVars()->UseDebugMode) { if (m_confManager->GetConfVars()->DrawClusterLights) { CRenderTargetLock lock(m_resultRenderTarget.get()); PointCloud pc(m_cudaGather->getVisiblePointsBvh()->centerPositions, m_cudaGather->getVisiblePointsBvh()->colors, m_ubTransform.get(), m_confManager->GetConfVars()->lightRadiusScale * m_scene->getSceneExtent() / 100.f); pc.Draw(); //m_cudaGather->getPointCloud()->Draw(); } if (m_confManager->GetConfVars()->DrawClusterAABBs) { CRenderTargetLock lock(m_resultRenderTarget.get()); AABBCloud aabb(m_cudaGather->getVisiblePointsBvh()->clusterMin, m_cudaGather->getVisiblePointsBvh()->clusterMax, m_ubTransform.get()); aabb.Draw(); //m_cudaGather->getAABBCloud()->Draw(); } if (m_confManager->GetConfVars()->DrawLights) { CRenderTargetLock lock(m_resultRenderTarget.get()); m_pointCloud->Draw(); } if (m_sceneProbe) { 
m_sceneProbe->draw(m_resultRenderTarget.get(), m_debugProgram.get(), m_ubTransform.get(), m_camera); m_pointCloud->Draw(); } } DrawDebug(); if(m_ProfileFrame) timer.Stop("draw debug"); m_postProcess->postprocess(m_resultRenderTarget->GetTarget(0), m_postProcessRenderTarget.get()); m_textureViewer->drawTexture(m_postProcessRenderTarget->GetTarget(0), 0, 0, m_camera->GetWidth(), m_camera->GetHeight()); m_NumAVPLs += (int)avpls_antiradiance.size(); m_NumAVPLs += (int)avpls_shadowmap.size(); if(m_ProfileFrame) timer.Start(); avpls_antiradiance.clear(); avpls_shadowmap.clear(); if(m_ProfileFrame) timer.Stop("clear avpls"); CheckExport(); m_Frame++; if(m_ProfileFrame) frameTimer.Stop("frame time"); m_ProfileFrame = false; m_FinishedDebug = true; }
//-------------------------------------------------------------------------------- // Bin the screen space transformed triangles into tiles. For single threaded version //-------------------------------------------------------------------------------- void TransformedMeshScalar::BinTransformedTrianglesST(UINT taskId, UINT modelId, UINT meshId, UINT start, UINT end, UINT* pBin, USHORT* pBinModel, USHORT* pBinMesh, USHORT* pNumTrisInBin) { // working on one triangle at a time for(UINT index = start; index <= end; index++) { float4 xformedPos[3]; Gather(xformedPos, index); // TODO: Maybe convert to Fixed pt and store it once so that dont have to convert to fixedPt again during rasterization int4 xFormedFxPtPos[3] = { int4(xformedPos[0]), int4(xformedPos[1]), int4(xformedPos[2]), }; // Compute triangle are int A0 = xFormedFxPtPos[1].y - xFormedFxPtPos[2].y; int B0 = xFormedFxPtPos[2].x - xFormedFxPtPos[1].x; int C0 = (xFormedFxPtPos[1].x * xFormedFxPtPos[2].y) - (xFormedFxPtPos[2].x * xFormedFxPtPos[1].y); int triArea = A0 * xFormedFxPtPos[0].x + B0 * xFormedFxPtPos[0].y + C0; // Find bounding box for screen space triangle in terms of pixels int startX = max(min(min(xFormedFxPtPos[0].x, xFormedFxPtPos[1].x), xFormedFxPtPos[2].x), 0); int endX = min(max(max(xFormedFxPtPos[0].x, xFormedFxPtPos[1].x), xFormedFxPtPos[2].x) + 1, SCREENW); int startY = max(min(min(xFormedFxPtPos[0].y, xFormedFxPtPos[1].y), xFormedFxPtPos[2].y), 0 ); int endY = min(max(max(xFormedFxPtPos[0].y, xFormedFxPtPos[1].y), xFormedFxPtPos[2].y) + 1, SCREENH); // Skip triangle if area is zero if(triArea <= 0) continue; float oneOverW[3]; for(int j = 0; j < 3; j++) { oneOverW[j] = xformedPos[j].w; } // Reject the triangle if any of its verts is behind the nearclip plane if(oneOverW[0] > 1.0f || oneOverW[1] > 1.0f || oneOverW[2] > 1.0f) continue; // Convert bounding box in terms of pixels to bounding box in terms of tiles int startXx = max(startX/TILE_WIDTH_IN_PIXELS, 0); int endXx = 
min(endX/TILE_WIDTH_IN_PIXELS, SCREENW_IN_TILES-1); int startYy = max(startY/TILE_HEIGHT_IN_PIXELS, 0); int endYy = min(endY/TILE_HEIGHT_IN_PIXELS, SCREENH_IN_TILES-1); // Add triangle to the tiles or bins that the bounding box covers int row, col; for(row = startYy; row <= endYy; row++) { int offset1 = YOFFSET1_ST * row; int offset2 = YOFFSET2_ST * row; for(col = startXx; col <= endXx; col++) { int idx1 = offset1 + (XOFFSET1_ST * col) + taskId; int idx2 = offset2 + (XOFFSET2_ST * col) + (taskId * MAX_TRIS_IN_BIN_ST) + pNumTrisInBin[idx1]; pBin[idx2] = index; pBinModel[idx2] = modelId; pBinMesh[idx2] = meshId; pNumTrisInBin[idx1] += 1; } } } }