int run_hsl(int c, char* src, char* dst, float hh, float ss, float ll, int times) { BMP* bmp = bmp_read(src); if(bmp==0) { return -1;} // open error if(ss>1) ss=1; else if(ss<-1) ss=-1; if(ll>1) ll=1; else if(ll<-1) ll=-1; uint8_t* data = bmp_get_data(bmp); uint32_t h = *(bmp_get_h(bmp)); uint32_t w = *(bmp_get_w(bmp)); if(w%4!=0) { return -1;} // do not support padding uint8_t* dataC = 0; if(*(bmp_get_bitcount(bmp)) == 24) { dataC = malloc(sizeof(uint8_t)*4*h*w); to32(w,h,data,dataC); } else { dataC = data; } unsigned long start, end; switch(c){ case 0: RDTSC_START(start); C_hsl(w,h,dataC,hh,ss,ll); RDTSC_STOP(end); break; case 1: RDTSC_START(start); ASM_hsl1(w,h,dataC,hh,ss,ll); RDTSC_STOP(end); break; case 2: RDTSC_START(start); ASM_hsl2(w,h,dataC,hh,ss,ll); RDTSC_STOP(end); break; default: return -1; break; } unsigned long delta = end - start; printf("%lu", delta); if(*(bmp_get_bitcount(bmp)) == 24) { to24(w,h,dataC,data); free(dataC); } bmp_delete(bmp); return 0; }
int run_blur(int c, char* src, char* dst, int times){ BMP* bmp = bmp_read(src); if(bmp==0) { return -1;} // open error uint8_t* data = bmp_get_data(bmp); uint32_t h = *(bmp_get_h(bmp)); uint32_t w = *(bmp_get_w(bmp)); if(w%4!=0) { return -1;} // do not support padding uint8_t* dataC = 0; if(*(bmp_get_bitcount(bmp)) == 24) { dataC = malloc(sizeof(uint8_t)*4*h*w); to32(w,h,data,dataC); } else { dataC = data; } unsigned long start, end; switch(c){ case 0: RDTSC_START(start); C_blur(w,h,dataC); RDTSC_STOP(end); break; case 1: RDTSC_START(start); ASM_blur1(w,h,dataC); RDTSC_STOP(end); break; case 2: RDTSC_START(start); ASM_blur2(w,h,dataC); RDTSC_STOP(end); break; default: // return -1; break; } unsigned long delta = end - start; printf("%lu", delta); if(*(bmp_get_bitcount(bmp)) == 24) { to24(w,h,dataC,data); free(dataC); } bmp_delete(bmp); return 0; }
///@todo Combine this with QueueDraw void QueueDispatch(SWR_CONTEXT *pContext) { _ReadWriteBarrier(); pContext->DrawEnqueued++; if (KNOB_SINGLE_THREADED) { // flush denormals to 0 uint32_t mxcsr = _mm_getcsr(); _mm_setcsr(mxcsr | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON); WorkOnCompute(pContext, 0, pContext->WorkerBE[0]); // restore csr _mm_setcsr(mxcsr); } else { RDTSC_START(APIDrawWakeAllThreads); WakeAllThreads(pContext); RDTSC_STOP(APIDrawWakeAllThreads, 1, 0); } // Set current draw context to NULL so that next state call forces a new draw context to be created and populated. pContext->pPrevDrawContext = pContext->pCurDrawContext; pContext->pCurDrawContext = nullptr; }
/// Queues a clear of the render target(s) selected by clearMask.
/// @param hContext   - context handle; cast directly to SWR_CONTEXT*.
/// @param clearMask  - stored into CLEAR_FLAGS::mask of the clear descriptor.
/// @param clearColor - four colour components copied into clearRTColor[0..3].
/// @param z          - value stored as clearDepth.
/// @param stencil    - value stored as clearStencil.
void SwrClearRenderTarget(
    HANDLE hContext,
    uint32_t clearMask,
    const float clearColor[4],
    float z,
    BYTE stencil)
{
    RDTSC_START(APIClearRenderTarget);

    SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
    DRAW_CONTEXT* pDC = GetDrawContext(pContext);

    SetupMacroTileScissors(pDC);
    pDC->inUse = true;

    CLEAR_FLAGS flags;
    flags.mask = clearMask;

    // Describe the front-end work item for this clear.
    auto &feWork = pDC->FeWork;
    feWork.type = CLEAR;
    feWork.pfnWork = ProcessClear;

    auto &clearDesc = feWork.desc.clear;
    clearDesc.flags = flags;
    clearDesc.clearDepth = z;
    for (uint32_t comp = 0; comp < 4; ++comp)
    {
        clearDesc.clearRTColor[comp] = clearColor[comp];
    }
    clearDesc.clearStencil = stencil;

    // enqueue draw
    QueueDraw(pContext);

    RDTSC_STOP(APIClearRenderTarget, 0, pDC->drawId);
}
// Deswizzles, converts and stores current contents of the hot tiles to surface // described by pState void SwrStoreTiles( HANDLE hContext, SWR_RENDERTARGET_ATTACHMENT attachment, SWR_TILE_STATE postStoreTileState) // TODO: Implement postStoreTileState { RDTSC_START(APIStoreTiles); SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext; DRAW_CONTEXT* pDC = GetDrawContext(pContext); pDC->inUse = true; SetupMacroTileScissors(pDC); pDC->FeWork.type = STORETILES; pDC->FeWork.pfnWork = ProcessStoreTiles; pDC->FeWork.desc.storeTiles.attachment = attachment; pDC->FeWork.desc.storeTiles.postStoreTileState = postStoreTileState; //enqueue QueueDraw(pContext); RDTSC_STOP(APIStoreTiles, 0, 0); if (attachment == SWR_ATTACHMENT_COLOR0) { RDTSC_ENDFRAME(); } }
////////////////////////////////////////////////////////////////////////// /// @brief SwrDispatch /// @param hContext - Handle passed back from SwrCreateContext /// @param threadGroupCountX - Number of thread groups dispatched in X direction /// @param threadGroupCountY - Number of thread groups dispatched in Y direction /// @param threadGroupCountZ - Number of thread groups dispatched in Z direction void SwrDispatch( HANDLE hContext, uint32_t threadGroupCountX, uint32_t threadGroupCountY, uint32_t threadGroupCountZ) { RDTSC_START(APIDispatch); SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext; DRAW_CONTEXT* pDC = GetDrawContext(pContext); pDC->isCompute = true; // This is a compute context. pDC->inUse = true; COMPUTE_DESC* pTaskData = (COMPUTE_DESC*)pDC->arena.AllocAligned(sizeof(COMPUTE_DESC), 64); pTaskData->threadGroupCountX = threadGroupCountX; pTaskData->threadGroupCountY = threadGroupCountY; pTaskData->threadGroupCountZ = threadGroupCountZ; uint32_t totalThreadGroups = threadGroupCountX * threadGroupCountY * threadGroupCountZ; pDC->pDispatch->initialize(totalThreadGroups, pTaskData); QueueDispatch(pContext); RDTSC_STOP(APIDispatch, threadGroupCountX * threadGroupCountY * threadGroupCountZ, 0); }
/*
 * JNI entry point: lets RDTSC_STOP fill a local counter and returns it to
 * Java as a jlong. Neither `env` nor `clazz` is used.
 */
JNIEXPORT jlong JNICALL Java_com_rr_core_os_NativeHooksImpl_jniNanoRDTSCStop(JNIEnv *env, jclass clazz)
{
    long stopCycles;

    RDTSC_STOP(stopCycles);

    return stopCycles;
}
void Flip() { RDTSC_START(APIFlip); if (mIsDisplay) { XSync(mpDisplay, False); // copy render target to Xshm surface, mirroring on Y to account for X/GL origin differences (X is top-left, GL is bottom-left) UINT pitch = mWidth * 4; OGL::GetDDProcTable().pfnPresent2(OGL::GetDDHandle(), mpImages[mCurBackBuffer]->data, pitch); // copy to display surface if (useShm) XShmPutImage(mpDisplay, mDrawable, swapGC, mpImages[mCurBackBuffer], 0, 0, 0, 0, mWidth, mHeight, False); else XPutImage(mpDisplay, mDrawable, swapGC, mpImages[mCurBackBuffer], 0, 0, 0, 0, mWidth, mHeight); // flip back buffer mCurBackBuffer ^= 1; } RDTSC_STOP(APIFlip, 1, 0); }
/// Queues a SYNC work item that carries a callback and two user values.
/// @param hContext  - context handle, resolved through GetContext().
/// @param pfnFunc   - callback stored in the sync descriptor.
/// @param userData  - first user value stored in the sync descriptor.
/// @param userData2 - second user value stored in the sync descriptor.
void SwrSync(HANDLE hContext, PFN_CALLBACK_FUNC pfnFunc, uint64_t userData, uint64_t userData2)
{
    RDTSC_START(APISync);

    SWR_CONTEXT *pContext = GetContext(hContext);
    DRAW_CONTEXT* pDC = GetDrawContext(pContext);

    pDC->inUse = true;

    auto &feWork = pDC->FeWork;
    feWork.type = SYNC;
    feWork.pfnWork = ProcessSync;

    auto &syncDesc = feWork.desc.sync;
    syncDesc.pfnCallbackFunc = pfnFunc;
    syncDesc.userData = userData;
    syncDesc.userData2 = userData2;

    // cannot execute until all previous draws have completed
    pDC->dependency = pDC->drawId - 1;

    //enqueue
    QueueDraw(pContext);

    RDTSC_STOP(APISync, 1, 0);
}
/// Returns the current draw context, first claiming and initialising the next
/// entry of the draw-context ring when there is no current one.
/// @param pContext    - SWR context owning the DC and DS rings.
/// @param isSplitDraw - true when the new DC continues a draw that was split;
///                      it then shares the previous DC's state instead of
///                      getting a fresh state-ring entry. Only meaningful when
///                      a new DC is actually obtained.
DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false)
{
    RDTSC_START(APIGetDrawContext);

    if (pContext->pCurDrawContext != nullptr)
    {
        SWR_ASSERT(isSplitDraw == false, "Split draw should only be used when obtaining a new DC");
    }
    else
    {
        // No current draw context: obtain a new one from the ring.
        uint32_t ringSlot = pContext->nextDrawId % KNOB_MAX_DRAWS_IN_FLIGHT;

        DRAW_CONTEXT* pNewDC = &pContext->dcRing[ringSlot];
        pContext->pCurDrawContext = pNewDC;

        // Update LastRetiredId
        UpdateLastRetiredId(pContext);

        // Need to wait until this draw context is available to use.
        while (StillDrawing(pContext, pNewDC))
        {
            // Make sure workers are working.
            WakeAllThreads(pContext);

            _mm_pause();
        }

        // Assign next available entry in DS ring to this DC.
        uint32_t stateSlot = pContext->curStateId % KNOB_MAX_DRAWS_IN_FLIGHT;
        pNewDC->pState = &pContext->dsRing[stateSlot];

        Arena& stateArena = pNewDC->pState->arena;

        DRAW_CONTEXT* pPrevDC = pContext->pPrevDrawContext;
        if (!pPrevDC)
        {
            // Nothing to inherit from: start with an empty state arena.
            stateArena.Reset();
            pContext->curStateId++;  // Progress state ring index forward.
        }
        else if (isSplitDraw)
        {
            // A split draw is still the same draw, so it simply reuses the
            // previous DC's state. The DS ring index is not advanced, so the
            // next non-split draw can receive this entry.
            pNewDC->pState = pPrevDC->pState;
        }
        else
        {
            // Copy previous state to current state.
            CopyState(*pNewDC->pState, *pPrevDC->pState);

            stateArena.Reset();  // Reset memory.

            // Copy private state to new context.
            if (pPrevDC->pState->pPrivateState != nullptr)
            {
                pNewDC->pState->pPrivateState =
                    stateArena.AllocAligned(pContext->privateStateSize, KNOB_SIMD_WIDTH*sizeof(float));
                memcpy(pNewDC->pState->pPrivateState,
                       pPrevDC->pState->pPrivateState,
                       pContext->privateStateSize);
            }

            pContext->curStateId++;  // Progress state ring index forward.
        }

        // Reset the per-draw bookkeeping.
        pNewDC->dependency = 0;
        pNewDC->arena.Reset();
        pNewDC->pContext = pContext;
        pNewDC->isCompute = false;  // Dispatch has to set this to true.
        pNewDC->inUse = false;

        pNewDC->doneCompute = false;
        pNewDC->doneFE = false;
        pNewDC->FeLock = 0;

        pNewDC->pTileMgr->initialize();

        // Assign unique drawId for this DC
        pNewDC->drawId = pContext->nextDrawId++;
    }

    RDTSC_STOP(APIGetDrawContext, 0, 0);
    return pContext->pCurDrawContext;
}
////////////////////////////////////////////////////////////////////////// /// @brief DrawIndexedInstanced /// @param hContext - Handle passed back from SwrCreateContext /// @param topology - Specifies topology for draw. /// @param numIndices - Number of indices to read sequentially from index buffer. /// @param indexOffset - Starting index into index buffer. /// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed. /// @param numInstances - Number of instances to render. /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data) void DrawIndexedInstance( HANDLE hContext, PRIMITIVE_TOPOLOGY topology, uint32_t numIndices, uint32_t indexOffset, int32_t baseVertex, uint32_t numInstances = 1, uint32_t startInstance = 0) { RDTSC_START(APIDrawIndexed); SWR_CONTEXT *pContext = GetContext(hContext); DRAW_CONTEXT* pDC = GetDrawContext(pContext); API_STATE* pState = &pDC->pState->state; int32_t maxIndicesPerDraw = MaxVertsPerDraw(pDC, numIndices, topology); uint32_t primsPerDraw = GetNumPrims(topology, maxIndicesPerDraw); int32_t remainingIndices = numIndices; uint32_t indexSize = 0; switch (pState->indexBuffer.format) { case R32_UINT: indexSize = sizeof(uint32_t); break; case R16_UINT: indexSize = sizeof(uint16_t); break; case R8_UINT: indexSize = sizeof(uint8_t); break; default: SWR_ASSERT(0); } int draw = 0; uint8_t *pIB = (uint8_t*)pState->indexBuffer.pIndices; pIB += (uint64_t)indexOffset * (uint64_t)indexSize; pState->topology = topology; pState->forceFront = false; // disable culling for points/lines uint32_t oldCullMode = pState->rastState.cullMode; if (topology == TOP_POINT_LIST) { pState->rastState.cullMode = SWR_CULLMODE_NONE; pState->forceFront = true; } while (remainingIndices) { uint32_t numIndicesForDraw = (remainingIndices < maxIndicesPerDraw) ? remainingIndices : maxIndicesPerDraw; // When breaking up draw, we need to obtain new draw context for each iteration. 
bool isSplitDraw = (draw > 0) ? true : false; pDC = GetDrawContext(pContext, isSplitDraw); InitDraw(pDC, isSplitDraw); pDC->FeWork.type = DRAW; pDC->FeWork.pfnWork = GetFEDrawFunc( true, // IsIndexed pState->tsState.tsEnable, pState->gsState.gsEnable, pState->soState.soEnable, pDC->pState->pfnProcessPrims != nullptr); pDC->FeWork.desc.draw.pDC = pDC; pDC->FeWork.desc.draw.numIndices = numIndicesForDraw; pDC->FeWork.desc.draw.pIB = (int*)pIB; pDC->FeWork.desc.draw.type = pDC->pState->state.indexBuffer.format; pDC->FeWork.desc.draw.numInstances = numInstances; pDC->FeWork.desc.draw.startInstance = startInstance; pDC->FeWork.desc.draw.baseVertex = baseVertex; pDC->FeWork.desc.draw.startPrimID = draw * primsPerDraw; //enqueue DC QueueDraw(pContext); pIB += maxIndicesPerDraw * indexSize; remainingIndices -= numIndicesForDraw; draw++; } // restore culling state pDC = GetDrawContext(pContext); pDC->pState->state.rastState.cullMode = oldCullMode; RDTSC_STOP(APIDrawIndexed, numIndices * numInstances, 0); }
int run_merge(int c, char* src1, char* src2, char* dst, float value, int times){ if(value>1) value=1; else if(value<0) value=0; BMP* bmp1 = bmp_read(src1); BMP* bmp2 = bmp_read(src2); if(bmp1==0 || bmp2==0) { return -1;} // open error uint8_t* data1 = bmp_get_data(bmp1); uint8_t* data2 = bmp_get_data(bmp2); uint32_t h1 = *(bmp_get_h(bmp1)); uint32_t w1 = *(bmp_get_w(bmp1)); uint32_t h2 = *(bmp_get_h(bmp2)); uint32_t w2 = *(bmp_get_w(bmp2)); if(w1%4!=0 || w2%4!=0) { return -1;} // do not support padding if( w1!=w2 || h1!=h2 ) { return -1;} // different image size uint8_t* data1C = 0; uint8_t* data2C = 0; if(*(bmp_get_bitcount(bmp1)) == 24) { data1C = malloc(sizeof(uint8_t)*4*h1*w1); data2C = malloc(sizeof(uint8_t)*4*h2*w2); to32(w1,h1,data1,data1C); to32(w2,h2,data2,data2C); } else { data1C = data1; data2C = data2; } unsigned long start, end; switch(c){ case 0: RDTSC_START(start); C_merge(w1,h1,data1C,data2C,value); RDTSC_STOP(end); break; case 1: RDTSC_START(start); ASM_merge1(w1,h1,data1C,data2C,value); RDTSC_STOP(end); break; case 2: RDTSC_START(start); ASM_merge2(w1,h1,data1C,data2C,value); RDTSC_STOP(end); break; default: return -1; break; } unsigned long delta = end - start; printf("%lu", delta); if(*(bmp_get_bitcount(bmp1)) == 24) { to24(w1,h1,data1C,data1); free(data1C); free(data2C); } bmp_delete(bmp1); bmp_delete(bmp2); return 0; }
/*
 * RDTSC micro-benchmark driver.
 *
 * Times, in order: a sleep(1), a printf() call, an empty region, and a tiny
 * store loop. Each of the last three is repeated REPEAT times and the average
 * is printed. RDTSC_START()/RDTSC_STOP() take no arguments here; the counter
 * halves are read back through the locals start_hi/start_lo/end_hi/end_lo and
 * combined by elapsed().
 *
 * The 64-bit results are printed with "%llu" and an explicit cast to
 * unsigned long long, so the conversion specifier always matches the argument
 * type regardless of how wide `long` is on the target.
 */
int main(void) {
  uint32_t start_hi=0, start_lo=0;
  uint32_t end_hi=0, end_lo=0;

  RDTSC_START();
  sleep(1);
  RDTSC_STOP();
  printf("elapsed: %llu (sleep(1))\n",
         (unsigned long long)elapsed(start_hi, start_lo, end_hi, end_lo));
  printf("\n\n\n");

  // For the rest of our tests, lets use loops to get more accurate numbers.
#define REPEAT 100

  uint64_t totalTime = 0;
  for(int i=0; i<REPEAT; i++) {
    RDTSC_START();
    printf("printing!\n"); // how fast is printf()?
    RDTSC_STOP();
    uint64_t e = elapsed(start_hi, start_lo, end_hi, end_lo);
    printf("trial %d: %llu (printf)\n", i, (unsigned long long)e);
    totalTime += e;
  }
  printf("average: %f\n", totalTime/(float)REPEAT);
  printf("\n\n\n");

  totalTime = 0;
  for(int i=0; i<REPEAT; i++) {
    RDTSC_START();
    // how fast is nothing at all?
    RDTSC_STOP();
    uint64_t e = elapsed(start_hi, start_lo, end_hi, end_lo);
    printf("trial %d: %llu (NOTHING)\n", i, (unsigned long long)e);
    totalTime += e;
  }
  printf("average: %f\n", totalTime/(float)REPEAT);
  printf("\n\n\n");

  totalTime = 0;
  for(int i=0; i<REPEAT; i++) {
    volatile int var = 0;  // volatile so the stores below are not optimised away
    int k=0;
    RDTSC_START();
    // how fast is a loop that we can choose how many times it runs?
    for(; k<2; k++) // Change how many times this loop runs, see what happens.
      (var) = 1;
    RDTSC_STOP();
    uint64_t e = elapsed(start_hi, start_lo, end_hi, end_lo);
    printf("trial %d: %llu (loop)\n", i, (unsigned long long)e);
    totalTime += e;
  }
  printf("average: %f\n", totalTime/(float)REPEAT);

  return 0;
}
// Worker thread entry point.
// pData points at the THREAD_DATA for this worker. The thread binds itself,
// enables flush-to-zero / denormals-are-zero, and then loops until
// threadPool.inThreadShutdown is set: spin briefly for work, sleep on
// FifosNotEmpty when none shows up, otherwise run the BE and/or FE stages.
// NOTE(review): IsBEThread / IsFEThread are not declared inside this block —
// presumably template parameters declared just above it; confirm.
DWORD workerThreadMain(LPVOID pData)
{
    THREAD_DATA *pThreadData = (THREAD_DATA*)pData;
    SWR_CONTEXT *pContext = pThreadData->pContext;
    uint32_t threadId = pThreadData->threadId;
    uint32_t workerId = pThreadData->workerId;

    bindThread(threadId, pThreadData->procGroupId, pThreadData->forceBindProcGroup);

    RDTSC_INIT(threadId);

    uint32_t numaNode = pThreadData->numaId;
    uint32_t numaMask = pContext->threadPool.numaMask;

    // flush denormals to 0
    _mm_setcsr(_mm_getcsr() | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON);

    // Macrotiles found locked by another thread are remembered here so this
    // worker does not keep retrying them.
    TileSet lockedTiles;

    // Any worker may work on any queued draw as long as certain conditions
    // are met. A draw's data is guaranteed to stay alive until every worker
    // has signalled that it moved on to the next draw; the api thread will
    // not advance the head of the dc ring before that.
    //
    // What to work on is decided as follows:
    // 1- FE: try to work on the FE of any queued draw. For now FE work has no
    //    dependencies, so any worker can grab any FE and process in parallel
    //    (dependency tracking will eventually be needed to serialize FEs).
    //    A worker picks an FE by atomically incrementing a counter in the swr
    //    context and keeps trying until it reaches the tail.
    // 2- BE: must be done in strict order. Work is pulled off the oldest draw
    //    (ie the head) of the dcRing. A draw has no work left when its total
    //    number of binned work items equals its number of completed work
    //    items; the worker can then safely increment its oldestDraw counter
    //    and move on to the next draw.

    std::unique_lock<std::mutex> lock(pContext->WaitLock, std::defer_lock);

    auto threadHasWork = [&](uint64_t curDraw) { return curDraw != pContext->dcRing.GetHead(); };

    uint64_t curDrawBE = 0;
    uint64_t curDrawFE = 0;

    while (!pContext->threadPool.inThreadShutdown)
    {
        // Spin for a bounded number of iterations before going to sleep.
        for (uint32_t spins = 0;
             spins < KNOB_WORKER_SPIN_LOOP_COUNT && !threadHasWork(curDrawBE);
             ++spins)
        {
            _mm_pause();
        }

        if (!threadHasWork(curDrawBE))
        {
            lock.lock();

            // check for thread idle condition again under lock
            if (threadHasWork(curDrawBE))
            {
                lock.unlock();
                continue;
            }

            if (pContext->threadPool.inThreadShutdown)
            {
                lock.unlock();
                break;
            }

            RDTSC_START(WorkerWaitForThreadEvent);

            pContext->FifosNotEmpty.wait(lock);
            lock.unlock();

            RDTSC_STOP(WorkerWaitForThreadEvent, 0, 0);

            if (pContext->threadPool.inThreadShutdown)
            {
                break;
            }
        }

        if (IsBEThread)
        {
            RDTSC_START(WorkerWorkOnFifoBE);
            WorkOnFifoBE(pContext, workerId, curDrawBE, lockedTiles, numaNode, numaMask);
            RDTSC_STOP(WorkerWorkOnFifoBE, 0, 0);

            WorkOnCompute(pContext, workerId, curDrawBE);
        }

        if (IsFEThread)
        {
            WorkOnFifoFE(pContext, workerId, curDrawFE);

            // An FE-only thread has no BE progress of its own, so its BE
            // cursor simply follows its FE cursor.
            if (!IsBEThread)
            {
                curDrawBE = curDrawFE;
            }
        }
    }

    return 0;
}
////////////////////////////////////////////////////////////////////////// /// @brief If there is any BE work then go work on it. /// @param pContext - pointer to SWR context. /// @param workerId - The unique worker ID that is assigned to this thread. /// @param curDrawBE - This tracks the draw contexts that this thread has processed. Each worker thread /// has its own curDrawBE counter and this ensures that each worker processes all the /// draws in order. /// @param lockedTiles - This is the set of tiles locked by other threads. Each thread maintains its /// own set and each time it fails to lock a macrotile, because its already locked, /// then it will add that tile to the lockedTiles set. As a worker begins to work /// on future draws the lockedTiles ensure that it doesn't work on tiles that may /// still have work pending in a previous draw. Additionally, the lockedTiles is /// hueristic that can steer a worker back to the same macrotile that it had been /// working on in a previous draw. void WorkOnFifoBE( SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawBE, TileSet& lockedTiles, uint32_t numaNode, uint32_t numaMask) { // Find the first incomplete draw that has pending work. If no such draw is found then // return. FindFirstIncompleteDraw is responsible for incrementing the curDrawBE. uint64_t drawEnqueued = 0; if (FindFirstIncompleteDraw(pContext, curDrawBE, drawEnqueued) == false) { return; } uint64_t lastRetiredDraw = pContext->dcRing[curDrawBE % KNOB_MAX_DRAWS_IN_FLIGHT].drawId - 1; // Reset our history for locked tiles. We'll have to re-learn which tiles are locked. lockedTiles.clear(); // Try to work on each draw in order of the available draws in flight. // 1. If we're on curDrawBE, we can work on any macrotile that is available. // 2. If we're trying to work on draws after curDrawBE, we are restricted to // working on those macrotiles that are known to be complete in the prior draw to // maintain order. 
The locked tiles provides the history to ensures this. for (uint64_t i = curDrawBE; i < drawEnqueued; ++i) { DRAW_CONTEXT *pDC = &pContext->dcRing[i % KNOB_MAX_DRAWS_IN_FLIGHT]; if (pDC->isCompute) return; // We don't look at compute work. // First wait for FE to be finished with this draw. This keeps threading model simple // but if there are lots of bubbles between draws then serializing FE and BE may // need to be revisited. if (!pDC->doneFE) return; // If this draw is dependent on a previous draw then we need to bail. if (CheckDependency(pContext, pDC, lastRetiredDraw)) { return; } // Grab the list of all dirty macrotiles. A tile is dirty if it has work queued to it. std::vector<uint32_t> ¯oTiles = pDC->pTileMgr->getDirtyTiles(); for (uint32_t tileID : macroTiles) { // Only work on tiles for for this numa node uint32_t x, y; pDC->pTileMgr->getTileIndices(tileID, x, y); if (((x ^ y) & numaMask) != numaNode) { continue; } MacroTileQueue &tile = pDC->pTileMgr->getMacroTileQueue(tileID); if (!tile.getNumQueued()) { continue; } // can only work on this draw if it's not in use by other threads if (lockedTiles.find(tileID) != lockedTiles.end()) { continue; } if (tile.tryLock()) { BE_WORK *pWork; RDTSC_START(WorkerFoundWork); uint32_t numWorkItems = tile.getNumQueued(); SWR_ASSERT(numWorkItems); pWork = tile.peek(); SWR_ASSERT(pWork); if (pWork->type == DRAW) { pContext->pHotTileMgr->InitializeHotTiles(pContext, pDC, tileID); } while ((pWork = tile.peek()) != nullptr) { pWork->pfnWork(pDC, workerId, tileID, &pWork->desc); tile.dequeue(); } RDTSC_STOP(WorkerFoundWork, numWorkItems, pDC->drawId); _ReadWriteBarrier(); pDC->pTileMgr->markTileComplete(tileID); // Optimization: If the draw is complete and we're the last one to have worked on it then // we can reset the locked list as we know that all previous draws before the next are guaranteed to be complete. 
if ((curDrawBE == i) && pDC->pTileMgr->isWorkComplete()) { // We can increment the current BE and safely move to next draw since we know this draw is complete. curDrawBE++; CompleteDrawContext(pContext, pDC); lastRetiredDraw++; lockedTiles.clear(); break; } } else { // This tile is already locked. So let's add it to our locked tiles set. This way we don't try locking this one again. lockedTiles.insert(tileID); } } } }
////////////////////////////////////////////////////////////////////////// /// @brief DrawInstanced /// @param hContext - Handle passed back from SwrCreateContext /// @param topology - Specifies topology for draw. /// @param numVerts - How many vertices to read sequentially from vertex data (per instance). /// @param startVertex - Specifies start vertex for draw. (vertex data) /// @param numInstances - How many instances to render. /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data) void DrawInstanced( HANDLE hContext, PRIMITIVE_TOPOLOGY topology, uint32_t numVertices, uint32_t startVertex, uint32_t numInstances = 1, uint32_t startInstance = 0) { RDTSC_START(APIDraw); #if KNOB_ENABLE_TOSS_POINTS if (KNOB_TOSS_DRAW) { return; } #endif SWR_CONTEXT *pContext = GetContext(hContext); DRAW_CONTEXT* pDC = GetDrawContext(pContext); int32_t maxVertsPerDraw = MaxVertsPerDraw(pDC, numVertices, topology); uint32_t primsPerDraw = GetNumPrims(topology, maxVertsPerDraw); int32_t remainingVerts = numVertices; API_STATE *pState = &pDC->pState->state; pState->topology = topology; pState->forceFront = false; // disable culling for points/lines uint32_t oldCullMode = pState->rastState.cullMode; if (topology == TOP_POINT_LIST) { pState->rastState.cullMode = SWR_CULLMODE_NONE; pState->forceFront = true; } int draw = 0; while (remainingVerts) { uint32_t numVertsForDraw = (remainingVerts < maxVertsPerDraw) ? remainingVerts : maxVertsPerDraw; bool isSplitDraw = (draw > 0) ? 
true : false; DRAW_CONTEXT* pDC = GetDrawContext(pContext, isSplitDraw); InitDraw(pDC, isSplitDraw); pDC->FeWork.type = DRAW; pDC->FeWork.pfnWork = GetFEDrawFunc( false, // IsIndexed pState->tsState.tsEnable, pState->gsState.gsEnable, pState->soState.soEnable, pDC->pState->pfnProcessPrims != nullptr); pDC->FeWork.desc.draw.numVerts = numVertsForDraw; pDC->FeWork.desc.draw.startVertex = startVertex + draw * maxVertsPerDraw; pDC->FeWork.desc.draw.numInstances = numInstances; pDC->FeWork.desc.draw.startInstance = startInstance; pDC->FeWork.desc.draw.startPrimID = draw * primsPerDraw; //enqueue DC QueueDraw(pContext); remainingVerts -= numVertsForDraw; draw++; } // restore culling state pDC = GetDrawContext(pContext); pDC->pState->state.rastState.cullMode = oldCullMode; RDTSC_STOP(APIDraw, numVertices * numInstances, 0); }
int main() { /** Fm, where modulo = m */ mpz_t a, b, k, r, modulo; int i = 0; // loop variable Point p, next_p; p = init_point(p); mpz_init(a); mpz_init(b); mpz_init(k); mpz_init(r); // order mpz_init(modulo); /** Initialize parameters of ECC (F2p) */ mpz_set_str(a, a_v, 10); mpz_set_str(b, b_v, 16); mpz_set_str(modulo, p_v, 10); mpz_set_str(r, r_v, 10); mpz_set_str(p.x, gx_v, 16); mpz_set_str(p.y, gy_v, 16); mpz_t zero_value, k2; mpz_init(zero_value); mpz_init(k2); RDTSC_START(t1); sleep(1); // sleep for 1 second RDTSC_STOP(t2); uint64_t one_second = t2 - t1 - rdtscp_cycle; printf("Approximate number of cycles in 1 second: %lld\n\n", one_second); uint64_t one_us = one_second / 1e6; while (mpz_cmp(k, zero_value) == 0) { get_random(k, 32); // generate random test (256 bits) positive_modulo(k, k, modulo); } printf("Random k (in Binary): "); mpz_out_str(stdout, 2, k); printf("\n"); while (mpz_cmp(k2, zero_value) == 0) { get_random(k2, 32); // generate random test (256 bits) positive_modulo(k2, k2, modulo); } printf("Random k2 (in Binary): "); mpz_out_str(stdout, 2, k2); printf("\n"); /** Compare ADDITION, SHIFTING, MULTIPLICATION, and INVERSION */ if (TEST_MODULAR_OPERATION) { max_iteration = 10000; /** Addition */ i = 0; uint64_t total = 0; while (i < max_iteration) { RDTSC_START(t1); // start operation mpz_add(k, k, k2); positive_modulo(k, k, modulo); RDTSC_STOP(t2); // stop operation total += t2 - t1 - rdtscp_cycle; i++; } printf("--[ADDITION]--\n"); print_result(total, one_us); /** Shifting */ i = 0; uint64_t total2 = 0; mpz_t two; mpz_init(two); mpz_set_si(two, 2); while (i < max_iteration) { RDTSC_START(t1); // start operation mpz_mul_2exp(k, k, 1); // left shift positive_modulo(k, k, modulo); RDTSC_STOP(t2); // stop operation total2 += t2 - t1 - rdtscp_cycle; i++; } printf("--[SHIFTING 2 * k]--\n"); print_result(total2, one_us); /** Multiplication */ i = 0; uint64_t total3 = 0; while (i < max_iteration) { RDTSC_START(t1); // start operation mpz_mul(k, k, k2); 
positive_modulo(k, k, modulo); RDTSC_STOP(t2); // stop operation total3 += t2 - t1 - rdtscp_cycle; i++; } printf("--[MULTIPLICATION k * k2]--\n"); print_result(total3, one_us); /** Inversion */ i = 0; uint64_t total4 = 0; while (i < max_iteration) { RDTSC_START(t1); // start operation mpz_invert(k, k, modulo); RDTSC_STOP(t2); // stop operation total4 += t2 - t1 - rdtscp_cycle; i++; } printf("--[INVERSION]--\n"); print_result(total4, one_us); } /** -------------------------------------------------------------------------*/ // Convert Affine coordinate to Jacobian coordinate J_Point j_p, j_next_p; j_next_p = init_j_point(j_next_p); j_p = affine_to_jacobian(p); // Generator point if (TEST_SCALAR_OPERATION) { max_iteration = 100; Point p1, p2, p3; J_Point j_p1, j_p2, j_p3; /** Point preparation */ p1 = init_point(p1); p2 = init_point(p2); j_p1 = init_j_point(j_p1); j_p2 = init_j_point(j_p2); j_p1 = jacobian_affine_sliding_NAF(j_p, p, a, k, modulo, 4); j_p2 = jacobian_affine_sliding_NAF(j_p, p, a, k2, modulo, 4); p1 = jacobian_to_affine(j_p1, modulo); p2 = jacobian_to_affine(j_p2, modulo); /** Affine addition */ i = 0; uint64_t total = 0; while (i < max_iteration) { RDTSC_START(t1); // start operation p3 = affine_curve_addition(p1, p2, a, modulo); RDTSC_STOP(t2); // stop operation total += t2 - t1 - rdtscp_cycle; i++; } printf("--[ADDITION in AFFINE]--\n"); print_result(total, one_us); /** Affine doubling */ i = 0; uint64_t total2 = 0; while (i < max_iteration) { RDTSC_START(t1); // start operation p3 = affine_curve_doubling(p1, a, modulo); RDTSC_STOP(t2); // stop operation total2 += t2 - t1 - rdtscp_cycle; i++; } printf("--[DOUBLING in AFFINE]--\n"); print_result(total2, one_us); /** Jacobian addition */ i = 0; uint64_t total3 = 0; while (i < max_iteration) { RDTSC_START(t1); // start operation j_p3 = jacobian_curve_addition(j_p1, j_p2, a, modulo); RDTSC_STOP(t2); // stop operation total3 += t2 - t1 - rdtscp_cycle; i++; } printf("--[ADDITION in JACOBIAN]--\n"); 
print_result(total3, one_us); /** Jacobian doubling */ i = 0; uint64_t total4 = 0; while (i < max_iteration) { RDTSC_START(t1); // start operation j_p3 = jacobian_curve_doubling(j_p1, a, modulo); RDTSC_STOP(t2); // stop operation total4 += t2 - t1 - rdtscp_cycle; i++; } printf("--[DOUBLING in JACOBIAN]--\n"); print_result(total4, one_us); /** Affine-Jacobian addition */ i = 0; uint64_t total5 = 0; while (i < max_iteration) { RDTSC_START(t1); // start operation j_p3 = jacobian_affine_curve_addition(j_p1, p2, a, modulo); RDTSC_STOP(t2); // stop operation total5 += t2 - t1 - rdtscp_cycle; i++; } printf("--[ADDITION in JACOBIAN-AFFINE]--\n"); print_result(total5, one_us); } /** -------------------------------------------------------------------------*/ if (TEST_SCALAR_ALGORITHM) { max_iteration = 100; /** Test Left-to-right binary algorithm */ i = 0; uint64_t total = 0; while (i < max_iteration) { RDTSC_START(t1); // start operation next_p = affine_left_to_right_binary(p, a, k, modulo); // Q = [k]P // gmp_printf("%Zd %Zd\n", next_p.x, next_p.y); RDTSC_STOP(t2); // stop operation total += t2 - t1 - rdtscp_cycle; i++; } printf("--[AFFINE] Left to right binary algorithm--\n"); print_result(total, one_us); i = 0; uint64_t total2 = 0; while (i < max_iteration) { RDTSC_START(t1); // start operation j_next_p = jacobian_left_to_right_binary(j_p, a, k, modulo); // Q = [k]P // gmp_printf("%Zd %Zd\n", j_next_p.X, j_next_p.Y); next_p = jacobian_to_affine(j_next_p, modulo); // gmp_printf("%Zd %Zd\n", next_p.x, next_p.y); RDTSC_STOP(t2); // stop operation total2 += t2 - t1 - rdtscp_cycle; i++; } printf("--[JACOBIAN] Left to right binary algorithm--\n"); print_result(total2, one_us); i = 0; uint64_t total3 = 0; while (i < max_iteration) { RDTSC_START(t1); // start operation j_next_p = jacobian_affine_left_to_right_binary(j_p, p, a, k, modulo); // Q = [k]P // gmp_printf("%Zd %Zd\n", j_next_p.X, j_next_p.Y); next_p = jacobian_to_affine(j_next_p, modulo); // gmp_printf("%Zd %Zd\n", 
next_p.x, next_p.y); RDTSC_STOP(t2); // stop operation total3 += t2 - t1 - rdtscp_cycle; i++; } printf("--[JACOBIAN-AFFINE] Left to right binary algorithm--\n"); print_result(total3, one_us); int w = 4; // windows size i = 0; uint64_t total4 = 0; while (i < max_iteration) { RDTSC_START(t1); // start operation j_next_p = jacobian_affine_sliding_NAF(j_p, p, a, k, modulo, w); // Q = [k]P // gmp_printf("%Zd %Zd\n", j_next_p.X, j_next_p.Y); next_p = jacobian_to_affine(j_next_p, modulo); // gmp_printf("%Zd %Zd\n", next_p.x, next_p.y); RDTSC_STOP(t2); // stop operation total4 += t2 - t1 - rdtscp_cycle; i++; } printf("--[JACOBIAN-AFFINE] Sliding NAF Left to right binary algorithm (w = 4)--\n"); print_result(total4, one_us); w = 5; // windows size i = 0; uint64_t total5 = 0; while (i < max_iteration) { RDTSC_START(t1); // start operation j_next_p = jacobian_affine_sliding_NAF(j_p, p, a, k, modulo, w); // Q = [k]P // gmp_printf("%Zd %Zd\n", j_next_p.X, j_next_p.Y); next_p = jacobian_to_affine(j_next_p, modulo); // gmp_printf("%Zd %Zd\n", next_p.x, next_p.y); RDTSC_STOP(t2); // stop operation total5 += t2 - t1 - rdtscp_cycle; i++; } printf("--[JACOBIAN-AFFINE] Sliding NAF Left to right binary algorithm (w = 5)--\n"); print_result(total5, one_us); /** Test Right-to-left binary algorithm */ i = 0; uint64_t total6 = 0; while (i < max_iteration) { RDTSC_START(t1); // start operation next_p = affine_right_to_left_binary(p, a, k, modulo); // Q = [k]P // gmp_printf("%Zd %Zd\n", next_p.x, next_p.y); RDTSC_STOP(t2); // stop operation total6 += t2 - t1 - rdtscp_cycle; i++; } printf("--[AFFINE] Right to left binary algorithm--\n"); print_result(total6, one_us); /** Test Montgomery ladder algorithm (Against time-based attack) */ i = 0; uint64_t total7 = 0; while (i < max_iteration) { RDTSC_START(t1); // start operation j_next_p = jacobian_montgomery_ladder(j_p, a, k, modulo); // Q = [k]P // gmp_printf("%Zd %Zd\n", j_next_p.X, j_next_p.Y); next_p = jacobian_to_affine(j_next_p, modulo); // 
gmp_printf("%Zd %Zd\n", next_p.x, next_p.y); RDTSC_STOP(t2); // stop operation total7 += t2 - t1 - rdtscp_cycle; i++; } printf("--[JACOBIAN] Montgomery ladder algorithm--\n"); print_result(total7, one_us); } /** -------------------------------------------------------------------------*/ J_Point public_key_1, public_key_2, shared_key; mpz_t private_key_1, private_key_2; mpz_init(private_key_1); mpz_init(private_key_2); // TODO : Key should be padded to fixed size (serializable) // Note: (2^-256 chance of failure, can be ignored) while (mpz_cmp(private_key_1, zero_value) == 0) { get_random(private_key_1, 32); // 256 bit positive_modulo(private_key_1, private_key_1, modulo); } while (mpz_cmp(private_key_2, zero_value) == 0) { get_random(private_key_2, 32); // 256 bit positive_modulo(private_key_2, private_key_2, modulo); } gmp_printf("Private key [A B]: %Zd %Zd\n\n", private_key_1, private_key_2); public_key_1 = jacobian_left_to_right_binary(j_p, a, private_key_1, modulo); public_key_2 = jacobian_left_to_right_binary(j_p, a, private_key_2, modulo); gmp_printf("Public key 1 - Jacobian [X Y Z]: %Zd %Zd %Zd\n", public_key_1.X, public_key_1.Y, public_key_1.Z); gmp_printf("Public key 2 - Jacobian [X Y Z]: %Zd %Zd %Zd\n", public_key_2.X, public_key_2.Y, public_key_2.Z); Point public_key_1_decoded = jacobian_to_affine(public_key_1, modulo); Point public_key_2_decoded = jacobian_to_affine(public_key_2, modulo); gmp_printf("Public key 1 - Affine [X Y]: %Zd %Zd\n", public_key_1_decoded.x, public_key_1_decoded.y); gmp_printf("Public key 2 - Affine [X Y]: %Zd %Zd\n\n", public_key_2_decoded.x, public_key_2_decoded.y); /** -------------------------------------------------------------------------*/ if (TEST_ENCRYPT_DECRYPT) { // ElGamal Encrypt - Decrypt (Map message to chunk of points in EC) J_Point message, chosen_point, encoded_point, decoded_point; mpz_t k_message; mpz_init(k_message); mpz_set_ui(k_message, 123456789); message = jacobian_left_to_right_binary(j_p, a, k_message, 
modulo); Point message_decoded = jacobian_to_affine(message, modulo); gmp_printf("[Encrypt] Message - Affine [X Y] %Zd %Zd\n", message_decoded.x, message_decoded.y); gmp_printf("[Encrypt] Message - Jacobian [X Y Z]: %Zd %Zd %Zd\n", message.X, message.Y, message.Z); while (mpz_cmp(k_message, zero_value) == 0) { get_random(k_message, 32); positive_modulo(k_message, k_message, modulo); } // Encrypt example chosen_point = jacobian_left_to_right_binary(j_p, a, k_message, modulo); // chosen point (r) gmp_printf("[Encrypt] Chosen point - Jacobian [X Y Z]: %Zd %Zd %Zd\n", chosen_point.X, chosen_point.Y, chosen_point.Z); encoded_point = jacobian_left_to_right_binary(public_key_2, a, k_message, modulo); // r * Pu2 encoded_point = jacobian_curve_addition(message, encoded_point, a, modulo); // TODO : chosen_point & encoded_point should be padded to P-bit gmp_printf("[Decrypt] Encoded point - Jacobian [X Y Z]: %Zd %Zd %Zd\n", encoded_point.X, encoded_point.Y, encoded_point.Z); // Decrypt example (encoded_point - private_key * chosen_point) decoded_point = jacobian_left_to_right_binary(chosen_point, a, private_key_2, modulo); decoded_point = jacobian_curve_subtraction(encoded_point, decoded_point, a, modulo); gmp_printf("[Decrypt] Original message - Jacobian [X Y Z]: %Zd %Zd %Zd\n", decoded_point.X, decoded_point.Y, decoded_point.Z); message_decoded = jacobian_to_affine(decoded_point, modulo); gmp_printf("[Decrypt] Original message - Affine [X Y] %Zd %Zd\n\n", message_decoded.x, message_decoded.y); } /** -------------------------------------------------------------------------*/ if (TEST_SIMPLIFIED_ECIES) { // Simplified ECIES (Ref: Page 256 Cryptography Theory & Practice 2nd Ed. 
- Douglas) char* message_string = "hello"; // 0..9, a..z (base 36) mpz_t encrypted_message; mpz_init(encrypted_message); int partition = strlen(message_string) / 24; int partition_modulo = strlen(message_string) % 24; if (partition_modulo != 0) partition++; for (i = 0; i < partition; i++) { // 24 characters from message_string + 1 null-terminate char* chunked_message_string = (char*) malloc(25 * sizeof(char)); int size = 24; if ((i == partition - 1) && (partition_modulo != 0)) size = partition_modulo; strncpy(chunked_message_string, message_string + i*24, size); chunked_message_string[size] = '\0'; // null-terminate Point c_point = encrypt_ECIES(encrypted_message, chunked_message_string, public_key_2_decoded, p, a, modulo); gmp_printf("[SIMPLIFIED ECIES] Encrypted message: %Zd\n", encrypted_message); decrypt_ECIES(encrypted_message, c_point, private_key_2, p, a, modulo); } } /**-------------------------------------------------------------------------*/ // TODO : Public key validation! // Shared key (ECDH) - key secure exchange shared_key = jacobian_left_to_right_binary(public_key_2, a, private_key_1, modulo); gmp_printf("Shared key - Jacobian [X Y Z]: %Zd %Zd %Zd\n", shared_key.X, shared_key.Y, shared_key.Z); Point shared_key_decoded = jacobian_to_affine(shared_key, modulo); gmp_printf("Shared key - Affine [X Y]: %Zd %Zd\n", shared_key_decoded.x, shared_key_decoded.y); // TODO : ECDSA - digital signature algorithm /** Cleaning up */ mpz_clear(a); mpz_clear(b); mpz_clear(k); mpz_clear(r); mpz_clear(modulo); mpz_clear(private_key_1); mpz_clear(private_key_2); return EXIT_SUCCESS; }
// for draw calls, we initialize the active hot tiles and perform deferred // load on them if tile is in invalid state. we do this in the outer thread loop instead of inside // the draw routine itself mainly for performance, to avoid unnecessary setup // every triangle // @todo support deferred clear INLINE void InitializeHotTiles(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID, const TRIANGLE_WORK_DESC* pWork) { const API_STATE& state = GetApiState(pDC); HotTileMgr *pHotTileMgr = pContext->pHotTileMgr; const SWR_PS_STATE& psState = state.psState; uint32_t numRTs = psState.maxRTSlotUsed + 1; uint32_t x, y; MacroTileMgr::getTileIndices(macroID, x, y); x *= KNOB_MACROTILE_X_DIM; y *= KNOB_MACROTILE_Y_DIM; uint32_t numSamples = GetNumSamples(state.rastState.sampleCount); // check RT if enabled if (state.psState.pfnPixelShader != nullptr) { for (uint32_t rt = 0; rt < numRTs; ++rt) { HOTTILE* pHotTile = pHotTileMgr->GetHotTile(pContext, pDC, macroID, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rt), true, numSamples); if (pHotTile->state == HOTTILE_INVALID) { RDTSC_START(BELoadTiles); // invalid hottile before draw requires a load from surface before we can draw to it pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_COLOR_HOT_TILE_FORMAT, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rt), x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer); pHotTile->state = HOTTILE_DIRTY; RDTSC_STOP(BELoadTiles, 0, 0); } else if (pHotTile->state == HOTTILE_CLEAR) { RDTSC_START(BELoadTiles); // Clear the tile. 
ClearColorHotTile(pHotTile); pHotTile->state = HOTTILE_DIRTY; RDTSC_STOP(BELoadTiles, 0, 0); } } } // check depth if enabled if (state.depthStencilState.depthTestEnable || state.depthStencilState.depthWriteEnable) { HOTTILE* pHotTile = pHotTileMgr->GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_DEPTH, true, numSamples); if (pHotTile->state == HOTTILE_INVALID) { RDTSC_START(BELoadTiles); // invalid hottile before draw requires a load from surface before we can draw to it pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_DEPTH_HOT_TILE_FORMAT, SWR_ATTACHMENT_DEPTH, x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer); pHotTile->state = HOTTILE_DIRTY; RDTSC_STOP(BELoadTiles, 0, 0); } else if (pHotTile->state == HOTTILE_CLEAR) { RDTSC_START(BELoadTiles); // Clear the tile. ClearDepthHotTile(pHotTile); pHotTile->state = HOTTILE_DIRTY; RDTSC_STOP(BELoadTiles, 0, 0); } } // check stencil if enabled if (state.depthStencilState.stencilTestEnable || state.depthStencilState.stencilWriteEnable) { HOTTILE* pHotTile = pHotTileMgr->GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_STENCIL, true, numSamples); if (pHotTile->state == HOTTILE_INVALID) { RDTSC_START(BELoadTiles); // invalid hottile before draw requires a load from surface before we can draw to it pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_STENCIL_HOT_TILE_FORMAT, SWR_ATTACHMENT_STENCIL, x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer); pHotTile->state = HOTTILE_DIRTY; RDTSC_STOP(BELoadTiles, 0, 0); } else if (pHotTile->state == HOTTILE_CLEAR) { RDTSC_START(BELoadTiles); // Clear the tile. ClearStencilHotTile(pHotTile); pHotTile->state = HOTTILE_DIRTY; RDTSC_STOP(BELoadTiles, 0, 0); } } }