Exemple #1
0
/**
 * Times one HSL-adjust implementation on a BMP image and prints the cycle delta.
 *
 * The image is read from `src`. 24-bit images are expanded into a temporary
 * 32-bit buffer with to32() before the filter runs and converted back with
 * to24() afterwards; other bit depths are processed in place.
 *
 * @param c     Implementation selector: 0 = C_hsl, 1 = ASM_hsl1, 2 = ASM_hsl2.
 * @param src   Path of the BMP file to read.
 * @param dst   Accepted for interface compatibility; not used by this function.
 * @param hh    Hue parameter, forwarded unchanged to the filter.
 * @param ss    Saturation parameter, clamped to [-1, 1].
 * @param ll    Lightness parameter, clamped to [-1, 1].
 * @param times Accepted for interface compatibility; not used by this function.
 * @return 0 on success; -1 if the image cannot be opened, its width is not a
 *         multiple of 4, the scratch buffer cannot be allocated, or `c` is unknown.
 */
int run_hsl(int c, char* src, char* dst, float hh, float ss, float ll, int times) {
  (void)dst;
  (void)times;

  BMP* bmp = bmp_read(src);
  if (bmp == NULL) { return -1; }  // open error

  if (ss > 1) ss = 1; else if (ss < -1) ss = -1;
  if (ll > 1) ll = 1; else if (ll < -1) ll = -1;

  uint8_t* data = bmp_get_data(bmp);
  uint32_t h = *(bmp_get_h(bmp));
  uint32_t w = *(bmp_get_w(bmp));
  if (w % 4 != 0) {  // do not support padding
    bmp_delete(bmp);
    return -1;
  }

  // 24-bit images are processed through a private 32-bit copy.
  int is24 = (*(bmp_get_bitcount(bmp)) == 24);
  uint8_t* dataC = data;
  if (is24) {
    dataC = malloc(sizeof(uint8_t) * 4 * h * w);
    if (dataC == NULL) {
      bmp_delete(bmp);
      return -1;
    }
    to32(w, h, data, dataC);
  }

  int rc = 0;
  unsigned long start = 0, end = 0;
  switch (c) {
    case 0:
      RDTSC_START(start);
      C_hsl(w, h, dataC, hh, ss, ll);
      RDTSC_STOP(end);
      break;
    case 1:
      RDTSC_START(start);
      ASM_hsl1(w, h, dataC, hh, ss, ll);
      RDTSC_STOP(end);
      break;
    case 2:
      RDTSC_START(start);
      ASM_hsl2(w, h, dataC, hh, ss, ll);
      RDTSC_STOP(end);
      break;
    default:
      rc = -1;  // unknown implementation: fall through to cleanup
      break;
  }

  if (rc == 0) {
    unsigned long delta = end - start;
    printf("%lu", delta);
    if (is24) {
      to24(w, h, dataC, data);
    }
  }

  // Release everything on both the success and the error path.
  if (is24) {
    free(dataC);
  }
  bmp_delete(bmp);
  return rc;
}
Exemple #2
0
/**
 * Times one blur implementation on a BMP image and prints the cycle delta.
 *
 * The image is read from `src`. 24-bit images are expanded into a temporary
 * 32-bit buffer with to32() before the filter runs and converted back with
 * to24() afterwards; other bit depths are processed in place.
 *
 * @param c     Implementation selector: 0 = C_blur, 1 = ASM_blur1, 2 = ASM_blur2.
 *              Any other value runs no filter; the printed delta is then 0.
 * @param src   Path of the BMP file to read.
 * @param dst   Accepted for interface compatibility; not used by this function.
 * @param times Accepted for interface compatibility; not used by this function.
 * @return 0 on success (including an unknown `c`); -1 if the image cannot be
 *         opened, its width is not a multiple of 4, or the scratch buffer
 *         cannot be allocated.
 */
int run_blur(int c, char* src, char* dst, int times){
  (void)dst;
  (void)times;

  BMP* bmp = bmp_read(src);
  if (bmp == NULL) { return -1; }  // open error

  uint8_t* data = bmp_get_data(bmp);
  uint32_t h = *(bmp_get_h(bmp));
  uint32_t w = *(bmp_get_w(bmp));
  if (w % 4 != 0) {  // do not support padding
    bmp_delete(bmp);
    return -1;
  }

  // 24-bit images are processed through a private 32-bit copy.
  int is24 = (*(bmp_get_bitcount(bmp)) == 24);
  uint8_t* dataC = data;
  if (is24) {
    dataC = malloc(sizeof(uint8_t) * 4 * h * w);
    if (dataC == NULL) {
      bmp_delete(bmp);
      return -1;
    }
    to32(w, h, data, dataC);
  }

  // Zero-initialised so that an unknown selector yields a defined delta of 0
  // instead of reading indeterminate values.
  unsigned long start = 0, end = 0;
  switch (c) {
    case 0:
      RDTSC_START(start);
      C_blur(w, h, dataC);
      RDTSC_STOP(end);
      break;
    case 1:
      RDTSC_START(start);
      ASM_blur1(w, h, dataC);
      RDTSC_STOP(end);
      break;
    case 2:
      RDTSC_START(start);
      ASM_blur2(w, h, dataC);
      RDTSC_STOP(end);
      break;
    default:
      // Unknown selector: no filter is run, the function still reports success.
      break;
  }
  unsigned long delta = end - start;
  printf("%lu", delta);

  if (is24) {
    to24(w, h, dataC, data);
    free(dataC);
  }
  bmp_delete(bmp);
  return 0;
}
Exemple #3
0
///@todo Combine this with QueueDraw
/// @brief Counts one more enqueued draw and starts processing of it: inline on the
///        calling thread when KNOB_SINGLE_THREADED is set, otherwise by waking the
///        worker threads. Afterwards the current draw context becomes the previous one.
/// @param pContext - SWR context being dispatched on.
void QueueDispatch(SWR_CONTEXT *pContext)
{
    _ReadWriteBarrier();
    ++pContext->DrawEnqueued;

    if (!(KNOB_SINGLE_THREADED))
    {
        RDTSC_START(APIDrawWakeAllThreads);
        WakeAllThreads(pContext);
        RDTSC_STOP(APIDrawWakeAllThreads, 1, 0);
    }
    else
    {
        // Run with denormals flushed to zero, remembering the caller's MXCSR.
        const uint32_t savedCsr = _mm_getcsr();
        const uint32_t flushBits = _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON;
        _mm_setcsr(savedCsr | flushBits);

        WorkOnCompute(pContext, 0, pContext->WorkerBE[0]);

        // Put the caller's MXCSR back.
        _mm_setcsr(savedCsr);
    }

    // Clearing the current draw context forces the next state call to create
    // and populate a new one; the old one is kept as the previous context.
    pContext->pPrevDrawContext = pContext->pCurDrawContext;
    pContext->pCurDrawContext = nullptr;
}
Exemple #4
0
/// @brief Fills in a CLEAR front-end work item on the current draw context and enqueues it.
/// @param hContext - Handle that is cast to the SWR_CONTEXT to operate on.
/// @param clearMask - Stored as the mask of the clear flags.
/// @param clearColor - Four color components copied into the clear descriptor.
/// @param z - Value stored as the clear depth.
/// @param stencil - Value stored as the clear stencil.
void SwrClearRenderTarget(
    HANDLE hContext,
    uint32_t clearMask,
    const float clearColor[4],
    float z,
    BYTE stencil)
{
    RDTSC_START(APIClearRenderTarget);

    SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
    DRAW_CONTEXT* pDrawCtx = GetDrawContext(pContext);

    SetupMacroTileScissors(pDrawCtx);
    pDrawCtx->inUse = true;

    CLEAR_FLAGS clearFlags;
    clearFlags.mask = clearMask;

    // Describe the clear as front-end work.
    pDrawCtx->FeWork.type = CLEAR;
    pDrawCtx->FeWork.pfnWork = ProcessClear;

    auto &clearDesc = pDrawCtx->FeWork.desc.clear;
    clearDesc.flags = clearFlags;
    clearDesc.clearDepth = z;
    for (uint32_t channel = 0; channel < 4; ++channel)
    {
        clearDesc.clearRTColor[channel] = clearColor[channel];
    }
    clearDesc.clearStencil = stencil;

    // Hand the draw context to the queue.
    QueueDraw(pContext);

    RDTSC_STOP(APIClearRenderTarget, 0, pDrawCtx->drawId);
}
Exemple #5
0
// Queues a STORETILES work item (processed by ProcessStoreTiles) for the given
// attachment on the current draw context. Storing SWR_ATTACHMENT_COLOR0 also
// marks the end of a frame for the RDTSC profiling macros.
void SwrStoreTiles(
    HANDLE hContext,
    SWR_RENDERTARGET_ATTACHMENT attachment,
    SWR_TILE_STATE postStoreTileState) // TODO: Implement postStoreTileState
{
    RDTSC_START(APIStoreTiles);

    SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
    DRAW_CONTEXT* pDrawCtx = GetDrawContext(pContext);
    pDrawCtx->inUse = true;

    SetupMacroTileScissors(pDrawCtx);

    // Describe the store as front-end work.
    pDrawCtx->FeWork.type = STORETILES;
    pDrawCtx->FeWork.pfnWork = ProcessStoreTiles;

    auto &storeDesc = pDrawCtx->FeWork.desc.storeTiles;
    storeDesc.attachment = attachment;
    storeDesc.postStoreTileState = postStoreTileState;

    // Hand the draw context to the queue.
    QueueDraw(pContext);

    RDTSC_STOP(APIStoreTiles, 0, 0);

    const bool storesColor0 = (attachment == SWR_ATTACHMENT_COLOR0);
    if (storesColor0)
    {
        RDTSC_ENDFRAME();
    }
}
Exemple #6
0
//////////////////////////////////////////////////////////////////////////
/// @brief SwrDispatch - marks the current draw context as a compute context,
///        records the thread group counts and queues the dispatch.
/// @param hContext - Handle passed back from SwrCreateContext
/// @param threadGroupCountX - Number of thread groups dispatched in X direction
/// @param threadGroupCountY - Number of thread groups dispatched in Y direction
/// @param threadGroupCountZ - Number of thread groups dispatched in Z direction
void SwrDispatch(
    HANDLE hContext,
    uint32_t threadGroupCountX,
    uint32_t threadGroupCountY,
    uint32_t threadGroupCountZ)
{
    RDTSC_START(APIDispatch);

    SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
    DRAW_CONTEXT* pDrawCtx = GetDrawContext(pContext);

    // Flag the context as compute work that is now in use.
    pDrawCtx->isCompute = true;
    pDrawCtx->inUse = true;

    // The descriptor is carved out of the draw context's arena, 64-byte aligned.
    COMPUTE_DESC* pDesc = (COMPUTE_DESC*)pDrawCtx->arena.AllocAligned(sizeof(COMPUTE_DESC), 64);
    pDesc->threadGroupCountX = threadGroupCountX;
    pDesc->threadGroupCountY = threadGroupCountY;
    pDesc->threadGroupCountZ = threadGroupCountZ;

    const uint32_t totalThreadGroups = threadGroupCountX * threadGroupCountY * threadGroupCountZ;
    pDrawCtx->pDispatch->initialize(totalThreadGroups, pDesc);

    QueueDispatch(pContext);

    RDTSC_STOP(APIDispatch, totalThreadGroups, 0);
}
/* JNI entry point: captures a value with RDTSC_STOP and returns it to Java as a jlong.
 * Neither env nor clazz is used. */
JNIEXPORT jlong JNICALL Java_com_rr_core_os_NativeHooksImpl_jniNanoRDTSCStop(JNIEnv *env, jclass clazz) {
    long stopValue;

    RDTSC_STOP( stopValue );

    return stopValue;
}
Exemple #8
0
    void Flip()
    {
        RDTSC_START(APIFlip);

        if (mIsDisplay)
        {
            XSync(mpDisplay, False);
            // copy render target to Xshm surface, mirroring on Y to account for X/GL origin differences (X is top-left, GL is bottom-left)
            UINT pitch = mWidth * 4;
            OGL::GetDDProcTable().pfnPresent2(OGL::GetDDHandle(), mpImages[mCurBackBuffer]->data, pitch);

            // copy to display surface
            if (useShm)
                XShmPutImage(mpDisplay, mDrawable, swapGC, mpImages[mCurBackBuffer], 0, 0, 0, 0, mWidth, mHeight, False);
            else
                XPutImage(mpDisplay, mDrawable, swapGC, mpImages[mCurBackBuffer], 0, 0, 0, 0, mWidth, mHeight);

            // flip back buffer
            mCurBackBuffer ^= 1;
        }

        RDTSC_STOP(APIFlip, 1, 0);
    }
Exemple #9
0
/// @brief Queues a SYNC work item that carries a callback and two user data values.
///        The item depends on the draw immediately preceding it.
/// @param hContext - Handle resolved to the SWR context via GetContext.
/// @param pfnFunc - Callback stored in the sync descriptor.
/// @param userData - First value stored alongside the callback.
/// @param userData2 - Second value stored alongside the callback.
void SwrSync(HANDLE hContext, PFN_CALLBACK_FUNC pfnFunc, uint64_t userData, uint64_t userData2)
{
    RDTSC_START(APISync);

    SWR_CONTEXT *pContext = GetContext(hContext);
    DRAW_CONTEXT* pDrawCtx = GetDrawContext(pContext);

    pDrawCtx->inUse = true;

    // Describe the sync as front-end work.
    pDrawCtx->FeWork.type = SYNC;
    pDrawCtx->FeWork.pfnWork = ProcessSync;

    auto &syncDesc = pDrawCtx->FeWork.desc.sync;
    syncDesc.pfnCallbackFunc = pfnFunc;
    syncDesc.userData = userData;
    syncDesc.userData2 = userData2;

    // All earlier draws must have completed before this one may execute.
    pDrawCtx->dependency = pDrawCtx->drawId - 1;

    // Hand the draw context to the queue.
    QueueDraw(pContext);

    RDTSC_STOP(APISync, 1, 0);
}
Exemple #10
0
//////////////////////////////////////////////////////////////////////////
/// @brief Returns the context's current draw context (DC). When none is current,
///        takes the next entry of the DC ring, waits until it is no longer being
///        drawn, attaches draw state to it, resets its per-draw fields and
///        assigns it a new draw id.
/// @param pContext - SWR context owning the DC ring (dcRing) and draw-state ring (dsRing).
/// @param isSplitDraw - When true and a previous DC exists, the new DC reuses the
///                      previous DC's state pointer instead of copying the state into
///                      the next dsRing entry. Asserted to be false when a current DC
///                      already exists.
/// @return pContext->pCurDrawContext.
DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false)
{
    RDTSC_START(APIGetDrawContext);
    // If current draw context is null then need to obtain a new draw context to use from ring.
    if (pContext->pCurDrawContext == nullptr)
    {
        // Ring slot is selected by the id the new draw is about to receive.
        uint32_t dcIndex = pContext->nextDrawId % KNOB_MAX_DRAWS_IN_FLIGHT;

        DRAW_CONTEXT* pCurDrawContext = &pContext->dcRing[dcIndex];
        pContext->pCurDrawContext = pCurDrawContext;

        // Update LastRetiredId
        UpdateLastRetiredId(pContext);

        // Need to wait until this draw context is available to use.
        // Spins (with a pause hint) for as long as StillDrawing() reports true.
        while (StillDrawing(pContext, pCurDrawContext))
        {
            // Make sure workers are working.
            WakeAllThreads(pContext);

            _mm_pause();
        }

        // Assign next available entry in DS ring to this DC.
        uint32_t dsIndex = pContext->curStateId % KNOB_MAX_DRAWS_IN_FLIGHT;
        pCurDrawContext->pState = &pContext->dsRing[dsIndex];

        // Arena of the dsRing entry selected above. It is bound before the split-draw
        // branch below may repoint pState at the previous DC's state.
        Arena& stateArena = pCurDrawContext->pState->arena;

        // Copy previous state to current state.
        if (pContext->pPrevDrawContext)
        {
            DRAW_CONTEXT* pPrevDrawContext = pContext->pPrevDrawContext;

            // If we're splitting our draw then we can just use the same state from the previous
            // draw. In this case, we won't increment the DS ring index so the next non-split
            // draw can receive the state.
            if (isSplitDraw == false)
            {
                CopyState(*pCurDrawContext->pState, *pPrevDrawContext->pState);

                stateArena.Reset();    // Reset memory.

                // Copy private state to new context.
                // The copy is allocated from the freshly reset state arena.
                if (pPrevDrawContext->pState->pPrivateState != nullptr)
                {
                    pCurDrawContext->pState->pPrivateState = stateArena.AllocAligned(pContext->privateStateSize, KNOB_SIMD_WIDTH*sizeof(float));
                    memcpy(pCurDrawContext->pState->pPrivateState, pPrevDrawContext->pState->pPrivateState, pContext->privateStateSize);
                }

                pContext->curStateId++;  // Progress state ring index forward.
            }
            else
            {
                // If its a split draw then just copy the state pointer over
                // since its the same draw.
                pCurDrawContext->pState = pPrevDrawContext->pState;
            }
        }
        else
        {
            // No previous DC: nothing to copy, just start from a reset arena.
            stateArena.Reset();    // Reset memory.
            pContext->curStateId++;  // Progress state ring index forward.
        }

        // Reset the per-draw bookkeeping of the recycled DC.
        pCurDrawContext->dependency = 0;
        pCurDrawContext->arena.Reset();
        pCurDrawContext->pContext = pContext;
        pCurDrawContext->isCompute = false; // Dispatch has to set this to true.
        pCurDrawContext->inUse = false;

        pCurDrawContext->doneCompute = false;
        pCurDrawContext->doneFE = false;
        pCurDrawContext->FeLock = 0;

        pCurDrawContext->pTileMgr->initialize();

        // Assign unique drawId for this DC
        pCurDrawContext->drawId = pContext->nextDrawId++;
    }
    else
    {
        // A current DC already exists; it is returned unchanged.
        SWR_ASSERT(isSplitDraw == false, "Split draw should only be used when obtaining a new DC");
    }

    RDTSC_STOP(APIGetDrawContext, 0, 0);
    return pContext->pCurDrawContext;
}
Exemple #11
0
//////////////////////////////////////////////////////////////////////////
/// @brief DrawIndexedInstanced - queues an indexed draw, splitting it into several
///        draw contexts when numIndices exceeds the per-draw maximum returned by
///        MaxVertsPerDraw.
/// @param hContext - Handle passed back from SwrCreateContext
/// @param topology - Specifies topology for draw.
/// @param numIndices - Number of indices to read sequentially from index buffer.
/// @param indexOffset - Starting index into index buffer.
/// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
/// @param numInstances - Number of instances to render.
/// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
void DrawIndexedInstance(
    HANDLE hContext,
    PRIMITIVE_TOPOLOGY topology,
    uint32_t numIndices,
    uint32_t indexOffset,
    int32_t baseVertex,
    uint32_t numInstances = 1,
    uint32_t startInstance = 0)
{
    RDTSC_START(APIDrawIndexed);

    SWR_CONTEXT *pContext = GetContext(hContext);
    DRAW_CONTEXT* pDC = GetDrawContext(pContext);
    API_STATE* pState = &pDC->pState->state;

    // Upper bound of indices per queued draw, and the primitive count that bound yields.
    int32_t maxIndicesPerDraw = MaxVertsPerDraw(pDC, numIndices, topology);
    uint32_t primsPerDraw = GetNumPrims(topology, maxIndicesPerDraw);
    int32_t remainingIndices = numIndices;

    // Size in bytes of one index, derived from the bound index buffer format.
    // NOTE(review): an unsupported format only asserts; indexSize then stays 0.
    uint32_t indexSize = 0;
    switch (pState->indexBuffer.format)
    {
    case R32_UINT: indexSize = sizeof(uint32_t); break;
    case R16_UINT: indexSize = sizeof(uint16_t); break;
    case R8_UINT: indexSize = sizeof(uint8_t); break;
    default:
        SWR_ASSERT(0);
    }

    int draw = 0;
    // Byte pointer to the first index to read; the offset is computed in 64 bits.
    uint8_t *pIB = (uint8_t*)pState->indexBuffer.pIndices;
    pIB += (uint64_t)indexOffset * (uint64_t)indexSize;

    pState->topology = topology;
    pState->forceFront = false;

    // disable culling for points/lines
    // (only TOP_POINT_LIST is checked here)
    uint32_t oldCullMode = pState->rastState.cullMode;
    if (topology == TOP_POINT_LIST)
    {
        pState->rastState.cullMode = SWR_CULLMODE_NONE;
        pState->forceFront = true;
    }

    // Queue one draw per chunk of at most maxIndicesPerDraw indices.
    while (remainingIndices)
    {
        uint32_t numIndicesForDraw = (remainingIndices < maxIndicesPerDraw) ?
        remainingIndices : maxIndicesPerDraw;

        // When breaking up draw, we need to obtain new draw context for each iteration.
        bool isSplitDraw = (draw > 0) ? true : false;
        pDC = GetDrawContext(pContext, isSplitDraw);
        InitDraw(pDC, isSplitDraw);

        pDC->FeWork.type = DRAW;
        pDC->FeWork.pfnWork = GetFEDrawFunc(
            true,   // IsIndexed
            pState->tsState.tsEnable,
            pState->gsState.gsEnable,
            pState->soState.soEnable,
            pDC->pState->pfnProcessPrims != nullptr);
        pDC->FeWork.desc.draw.pDC = pDC;
        pDC->FeWork.desc.draw.numIndices = numIndicesForDraw;
        pDC->FeWork.desc.draw.pIB = (int*)pIB;
        pDC->FeWork.desc.draw.type = pDC->pState->state.indexBuffer.format;

        pDC->FeWork.desc.draw.numInstances = numInstances;
        pDC->FeWork.desc.draw.startInstance = startInstance;
        pDC->FeWork.desc.draw.baseVertex = baseVertex;
        // Primitive ids continue across the chunks of a split draw.
        pDC->FeWork.desc.draw.startPrimID = draw * primsPerDraw;

        //enqueue DC
        QueueDraw(pContext);

        // Step to the indices of the next chunk.
        pIB += maxIndicesPerDraw * indexSize;
        remainingIndices -= numIndicesForDraw;
        draw++;
    }

    // restore culling state
    // (written to the state of the draw context obtained after the last queued draw)
    pDC = GetDrawContext(pContext);
    pDC->pState->state.rastState.cullMode = oldCullMode;

    RDTSC_STOP(APIDrawIndexed, numIndices * numInstances, 0);
}
Exemple #12
0
/**
 * Times one merge implementation on two BMP images and prints the cycle delta.
 *
 * Both images must have the same dimensions and the same bit depth. 24-bit
 * images are expanded into temporary 32-bit buffers with to32() before the
 * filter runs; afterwards the first buffer is converted back into image 1
 * with to24(). Other bit depths are processed in place.
 *
 * @param c     Implementation selector: 0 = C_merge, 1 = ASM_merge1, 2 = ASM_merge2.
 * @param src1  Path of the first BMP file.
 * @param src2  Path of the second BMP file.
 * @param dst   Accepted for interface compatibility; not used by this function.
 * @param value Merge parameter, clamped to [0, 1] and forwarded to the filter.
 * @param times Accepted for interface compatibility; not used by this function.
 * @return 0 on success; -1 if either image cannot be opened, a width is not a
 *         multiple of 4, the sizes or bit depths differ, a scratch buffer
 *         cannot be allocated, or `c` is unknown.
 */
int run_merge(int c, char* src1, char* src2, char* dst, float value, int times){
  (void)dst;
  (void)times;

  // Everything is declared up front so that every `goto out` is well-defined.
  int rc = -1;
  int is24 = 0;
  uint8_t* data1 = NULL;
  uint8_t* data2 = NULL;
  uint8_t* data1C = NULL;
  uint8_t* data2C = NULL;
  uint32_t h1 = 0, w1 = 0, h2 = 0, w2 = 0;
  unsigned long start = 0, end = 0;
  BMP* bmp1 = NULL;
  BMP* bmp2 = NULL;

  if (value > 1) value = 1; else if (value < 0) value = 0;

  bmp1 = bmp_read(src1);
  bmp2 = bmp_read(src2);
  if (bmp1 == NULL || bmp2 == NULL) { goto out; }  // open error

  data1 = bmp_get_data(bmp1);
  data2 = bmp_get_data(bmp2);
  h1 = *(bmp_get_h(bmp1));
  w1 = *(bmp_get_w(bmp1));
  h2 = *(bmp_get_h(bmp2));
  w2 = *(bmp_get_w(bmp2));
  if (w1 % 4 != 0 || w2 % 4 != 0) { goto out; }  // do not support padding
  if (w1 != w2 || h1 != h2) { goto out; }        // different image size

  // Both images go through the same conversion path, so their bit depths must
  // agree; otherwise one buffer would be read with the wrong pixel stride.
  if (*(bmp_get_bitcount(bmp1)) != *(bmp_get_bitcount(bmp2))) { goto out; }

  if (*(bmp_get_bitcount(bmp1)) == 24) {
    is24 = 1;
    data1C = malloc(sizeof(uint8_t) * 4 * h1 * w1);
    data2C = malloc(sizeof(uint8_t) * 4 * h2 * w2);
    if (data1C == NULL || data2C == NULL) { goto out; }
    to32(w1, h1, data1, data1C);
    to32(w2, h2, data2, data2C);
  } else {
    data1C = data1;
    data2C = data2;
  }

  switch (c) {
    case 0:
      RDTSC_START(start);
      C_merge(w1, h1, data1C, data2C, value);
      RDTSC_STOP(end);
      break;
    case 1:
      RDTSC_START(start);
      ASM_merge1(w1, h1, data1C, data2C, value);
      RDTSC_STOP(end);
      break;
    case 2:
      RDTSC_START(start);
      ASM_merge2(w1, h1, data1C, data2C, value);
      RDTSC_STOP(end);
      break;
    default:
      goto out;  // unknown implementation
  }

  printf("%lu", end - start);

  if (is24) {
    to24(w1, h1, data1C, data1);
  }
  rc = 0;

out:
  // The scratch buffers are owned by this function only in the 24-bit case.
  if (is24) {
    free(data1C);
    free(data2C);
  }
  if (bmp1 != NULL) { bmp_delete(bmp1); }
  if (bmp2 != NULL) { bmp_delete(bmp2); }
  return rc;
}
Exemple #13
0
/*
 * Timing demo: measures, with RDTSC_START()/RDTSC_STOP() and elapsed(), the
 * cost of sleep(1), of a printf() call, of an empty region and of a tiny loop.
 * The macros take no arguments; the start_hi/start_lo and end_hi/end_lo locals
 * are the values handed to elapsed() after each measurement.
 *
 * Cycle counts are uint64_t and are printed as unsigned long long with %llu so
 * that the format specifier matches the argument type on every platform.
 */
int main(void)
{
	uint32_t start_hi=0, start_lo=0; 
	uint32_t   end_hi=0,   end_lo=0;

	RDTSC_START();
	sleep(1);
	RDTSC_STOP();
	uint64_t sleepCycles = elapsed(start_hi, start_lo, end_hi, end_lo);
	printf("elapsed: %llu (sleep(1))\n", (unsigned long long)sleepCycles);
	printf("\n\n\n");


	// For the rest of our tests, lets use loops to get more accurate numbers.

#define REPEAT 100
	
	uint64_t totalTime = 0;
	for(int i=0; i<REPEAT; i++)
	{
		RDTSC_START();
		printf("printing!\n"); // how fast is printf()?
		RDTSC_STOP();
		uint64_t e = elapsed(start_hi, start_lo, end_hi, end_lo);
		printf("trial %d: %llu (printf)\n", i, (unsigned long long)e);
		totalTime += e;
	}
	printf("average: %f\n", totalTime/(float)REPEAT);

	
	printf("\n\n\n");
	totalTime = 0;
	for(int i=0; i<REPEAT; i++)
	{
		RDTSC_START();
		// how fast is nothing at all?
		RDTSC_STOP();
		uint64_t e = elapsed(start_hi, start_lo, end_hi, end_lo);
		printf("trial %d: %llu (NOTHING)\n", i, (unsigned long long)e);
		totalTime += e;
	}
	printf("average: %f\n", totalTime/(float)REPEAT);
	printf("\n\n\n");

	totalTime = 0;
	for(int i=0; i<REPEAT; i++)
	{
		// volatile so that the stores in the measured loop are not optimised away
		volatile int var = 0;
		int k=0; 
		RDTSC_START();
		// how fast is a loop that we can choose how many times it runs?
		for(; k<2; k++) // Change how many times this loop runs, see what happens.
			(var) = 1;
		RDTSC_STOP();
		uint64_t e = elapsed(start_hi, start_lo, end_hi, end_lo);
		printf("trial %d: %llu (loop)\n", i, (unsigned long long)e);
		totalTime += e;
	}
	printf("average: %f\n", totalTime/(float)REPEAT);


	return 0;
}
Exemple #14
0
//////////////////////////////////////////////////////////////////////////
/// @brief Main loop of a worker thread. Binds the thread, then repeatedly waits for
///        work (spinning first, then blocking on the FifosNotEmpty condition) and
///        processes back-end, compute and front-end work until thread shutdown is
///        requested.
/// @param pData - Pointer to this thread's THREAD_DATA (context, thread id, worker id,
///                processor group and NUMA id).
/// @return Always 0.
/// @note IsBEThread and IsFEThread are declared outside this block — presumably
///       template parameters selecting the thread's role; TODO confirm.
DWORD workerThreadMain(LPVOID pData)
{
    THREAD_DATA *pThreadData = (THREAD_DATA*)pData;
    SWR_CONTEXT *pContext = pThreadData->pContext;
    uint32_t threadId = pThreadData->threadId;
    uint32_t workerId = pThreadData->workerId;

    bindThread(threadId, pThreadData->procGroupId, pThreadData->forceBindProcGroup); 

    RDTSC_INIT(threadId);

    uint32_t numaNode = pThreadData->numaId;
    uint32_t numaMask = pContext->threadPool.numaMask;

    // flush denormals to 0
    _mm_setcsr(_mm_getcsr() | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON);

    // Track tiles locked by other threads. If we try to lock a macrotile and find its already
    // locked then we'll add it to this list so that we don't try and lock it again.
    TileSet lockedTiles;

    // each worker has the ability to work on any of the queued draws as long as certain
    // conditions are met. the data associated
    // with a draw is guaranteed to be active as long as a worker hasn't signaled that he 
    // has moved on to the next draw when he determines there is no more work to do. The api
    // thread will not increment the head of the dc ring until all workers have moved past the
    // current head.
    // the logic to determine what to work on is:
    // 1- try to work on the FE any draw that is queued. For now there are no dependencies
    //    on the FE work, so any worker can grab any FE and process in parallel.  Eventually
    //    we'll need dependency tracking to force serialization on FEs.  The worker will try
    //    to pick an FE by atomically incrementing a counter in the swr context.  he'll keep
    //    trying until he reaches the tail.
    // 2- BE work must be done in strict order. we accomplish this today by pulling work off
    //    the oldest draw (ie the head) of the dcRing. the worker can determine if there is
    //    any work left by comparing the total # of binned work items and the total # of completed
    //    work items. If they are equal, then there is no more work to do for this draw, and
    //    the worker can safely increment its oldestDraw counter and move on to the next draw.

    // Created unlocked (defer_lock); the lock is only taken on the idle path below.
    std::unique_lock<std::mutex> lock(pContext->WaitLock, std::defer_lock);

    // A thread has work whenever its draw counter differs from the dcRing head.
    auto threadHasWork = [&](uint64_t curDraw) { return curDraw != pContext->dcRing.GetHead(); };

    uint64_t curDrawBE = 0;
    uint64_t curDrawFE = 0;

    while (pContext->threadPool.inThreadShutdown == false)
    {
        // Spin for a bounded number of iterations before falling back to blocking.
        uint32_t loop = 0;
        while (loop++ < KNOB_WORKER_SPIN_LOOP_COUNT && !threadHasWork(curDrawBE))
        {
            _mm_pause();
        }

        if (!threadHasWork(curDrawBE))
        {
            lock.lock();

            // check for thread idle condition again under lock
            if (threadHasWork(curDrawBE))
            {
                lock.unlock();
                continue;
            }

            // Leave the loop without waiting if shutdown was requested meanwhile.
            if (pContext->threadPool.inThreadShutdown)
            {
                lock.unlock();
                break;
            }

            RDTSC_START(WorkerWaitForThreadEvent);

            // Block until FifosNotEmpty is signalled.
            pContext->FifosNotEmpty.wait(lock);
            lock.unlock();

            RDTSC_STOP(WorkerWaitForThreadEvent, 0, 0);

            // The wake-up may have been caused by shutdown.
            if (pContext->threadPool.inThreadShutdown)
            {
                break;
            }
        }

        if (IsBEThread)
        {
            RDTSC_START(WorkerWorkOnFifoBE);
            WorkOnFifoBE(pContext, workerId, curDrawBE, lockedTiles, numaNode, numaMask);
            RDTSC_STOP(WorkerWorkOnFifoBE, 0, 0);

            WorkOnCompute(pContext, workerId, curDrawBE);
        }

        if (IsFEThread)
        {
            WorkOnFifoFE(pContext, workerId, curDrawFE);

            // An FE-only thread mirrors its FE progress into curDrawBE, which is the
            // counter the threadHasWork checks above are made against.
            if (!IsBEThread)
            {
                curDrawBE = curDrawFE;
            }
        }
    }

    return 0;
}
Exemple #15
0
//////////////////////////////////////////////////////////////////////////
/// @brief If there is any BE work then go work on it.
/// @param pContext - pointer to SWR context.
/// @param workerId - The unique worker ID that is assigned to this thread.
/// @param curDrawBE - This tracks the draw contexts that this thread has processed. Each worker thread
///                    has its own curDrawBE counter and this ensures that each worker processes all the
///                    draws in order.
/// @param lockedTiles - This is the set of tiles locked by other threads. Each thread maintains its
///                      own set and each time it fails to lock a macrotile, because it's already locked,
///                      then it will add that tile to the lockedTiles set. As a worker begins to work
///                      on future draws the lockedTiles ensure that it doesn't work on tiles that may
///                      still have work pending in a previous draw. Additionally, the lockedTiles is a
///                      heuristic that can steer a worker back to the same macrotile that it had been
///                      working on in a previous draw.
/// @param numaNode - Value a tile's ((x ^ y) & numaMask) must equal for this thread to work on it.
/// @param numaMask - Mask applied to (x ^ y) of a tile's indices before comparing against numaNode.
void WorkOnFifoBE(
    SWR_CONTEXT *pContext,
    uint32_t workerId,
    uint64_t &curDrawBE,
    TileSet& lockedTiles,
    uint32_t numaNode,
    uint32_t numaMask)
{
    // Find the first incomplete draw that has pending work. If no such draw is found then
    // return. FindFirstIncompleteDraw is responsible for incrementing the curDrawBE.
    uint64_t drawEnqueued = 0;
    if (FindFirstIncompleteDraw(pContext, curDrawBE, drawEnqueued) == false)
    {
        return;
    }

    // Id of the draw immediately before the one at curDrawBE.
    uint64_t lastRetiredDraw = pContext->dcRing[curDrawBE % KNOB_MAX_DRAWS_IN_FLIGHT].drawId - 1;

    // Reset our history for locked tiles. We'll have to re-learn which tiles are locked.
    lockedTiles.clear();

    // Try to work on each draw in order of the available draws in flight.
    //   1. If we're on curDrawBE, we can work on any macrotile that is available.
    //   2. If we're trying to work on draws after curDrawBE, we are restricted to 
    //      working on those macrotiles that are known to be complete in the prior draw to
    //      maintain order. The locked tiles provides the history to ensures this.
    for (uint64_t i = curDrawBE; i < drawEnqueued; ++i)
    {
        DRAW_CONTEXT *pDC = &pContext->dcRing[i % KNOB_MAX_DRAWS_IN_FLIGHT];

        if (pDC->isCompute) return; // We don't look at compute work.

        // First wait for FE to be finished with this draw. This keeps threading model simple
        // but if there are lots of bubbles between draws then serializing FE and BE may
        // need to be revisited.
        if (!pDC->doneFE) return;
        
        // If this draw is dependent on a previous draw then we need to bail.
        if (CheckDependency(pContext, pDC, lastRetiredDraw))
        {
            return;
        }

        // Grab the list of all dirty macrotiles. A tile is dirty if it has work queued to it.
        std::vector<uint32_t> &macroTiles = pDC->pTileMgr->getDirtyTiles();

        for (uint32_t tileID : macroTiles)
        {
            // Only work on tiles for this numa node
            uint32_t x, y;
            pDC->pTileMgr->getTileIndices(tileID, x, y);
            if (((x ^ y) & numaMask) != numaNode)
            {
                continue;
            }

            MacroTileQueue &tile = pDC->pTileMgr->getMacroTileQueue(tileID);
            
            // Skip tiles with nothing queued.
            if (!tile.getNumQueued())
            {
                continue;
            }

            // can only work on this draw if it's not in use by other threads
            if (lockedTiles.find(tileID) != lockedTiles.end())
            {
                continue;
            }

            if (tile.tryLock())
            {
                BE_WORK *pWork;

                RDTSC_START(WorkerFoundWork);

                uint32_t numWorkItems = tile.getNumQueued();
                SWR_ASSERT(numWorkItems);

                // Hot tiles are initialized only when the first queued item is a DRAW.
                pWork = tile.peek();
                SWR_ASSERT(pWork);
                if (pWork->type == DRAW)
                {
                    pContext->pHotTileMgr->InitializeHotTiles(pContext, pDC, tileID);
                }

                // Drain the tile's queue: run each work item, then remove it.
                while ((pWork = tile.peek()) != nullptr)
                {
                    pWork->pfnWork(pDC, workerId, tileID, &pWork->desc);
                    tile.dequeue();
                }
                RDTSC_STOP(WorkerFoundWork, numWorkItems, pDC->drawId);

                _ReadWriteBarrier();

                pDC->pTileMgr->markTileComplete(tileID);

                // Optimization: If the draw is complete and we're the last one to have worked on it then
                // we can reset the locked list as we know that all previous draws before the next are guaranteed to be complete.
                if ((curDrawBE == i) && pDC->pTileMgr->isWorkComplete())
                {
                    // We can increment the current BE and safely move to next draw since we know this draw is complete.
                    curDrawBE++;
                    CompleteDrawContext(pContext, pDC);

                    lastRetiredDraw++;

                    lockedTiles.clear();
                    // Stop iterating this draw's tiles; the outer loop moves on to the next draw.
                    break;
                }
            }
            else
            {
                // This tile is already locked. So let's add it to our locked tiles set. This way we don't try locking this one again.
                lockedTiles.insert(tileID);
            }
        }
    }
}
Exemple #16
0
//////////////////////////////////////////////////////////////////////////
/// @brief DrawInstanced - queues a non-indexed draw, splitting it into several draw
///        contexts when numVertices exceeds the per-draw maximum returned by
///        MaxVertsPerDraw.
/// @param hContext - Handle passed back from SwrCreateContext
/// @param topology - Specifies topology for draw.
/// @param numVertices - How many vertices to read sequentially from vertex data (per instance).
/// @param startVertex - Specifies start vertex for draw. (vertex data)
/// @param numInstances - How many instances to render.
/// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
void DrawInstanced(
    HANDLE hContext,
    PRIMITIVE_TOPOLOGY topology,
    uint32_t numVertices,
    uint32_t startVertex,
    uint32_t numInstances = 1,
    uint32_t startInstance = 0)
{
#if KNOB_ENABLE_TOSS_POINTS
    // The toss-point check runs before RDTSC_START so that the early return never
    // leaves an APIDraw start without its matching RDTSC_STOP.
    if (KNOB_TOSS_DRAW)
    {
        return;
    }
#endif

    RDTSC_START(APIDraw);

    SWR_CONTEXT *pContext = GetContext(hContext);
    DRAW_CONTEXT* pDC = GetDrawContext(pContext);

    // Upper bound of vertices per queued draw, and the primitive count that bound yields.
    int32_t maxVertsPerDraw = MaxVertsPerDraw(pDC, numVertices, topology);
    uint32_t primsPerDraw = GetNumPrims(topology, maxVertsPerDraw);
    int32_t remainingVerts = numVertices;

    API_STATE    *pState = &pDC->pState->state;
    pState->topology = topology;
    pState->forceFront = false;

    // disable culling for points (only TOP_POINT_LIST is checked here)
    uint32_t oldCullMode = pState->rastState.cullMode;
    if (topology == TOP_POINT_LIST)
    {
        pState->rastState.cullMode = SWR_CULLMODE_NONE;
        pState->forceFront = true;
    }

    // Queue one draw per chunk of at most maxVertsPerDraw vertices.
    int draw = 0;
    while (remainingVerts)
    {
        uint32_t numVertsForDraw = (remainingVerts < maxVertsPerDraw) ?
        remainingVerts : maxVertsPerDraw;

        // Every chunk after the first is a split draw and needs a new draw context.
        // The outer pDC is reused rather than shadowed by a second declaration.
        bool isSplitDraw = (draw > 0) ? true : false;
        pDC = GetDrawContext(pContext, isSplitDraw);
        InitDraw(pDC, isSplitDraw);

        pDC->FeWork.type = DRAW;
        pDC->FeWork.pfnWork = GetFEDrawFunc(
            false,  // IsIndexed
            pState->tsState.tsEnable,
            pState->gsState.gsEnable,
            pState->soState.soEnable,
            pDC->pState->pfnProcessPrims != nullptr);
        pDC->FeWork.desc.draw.numVerts = numVertsForDraw;
        pDC->FeWork.desc.draw.startVertex = startVertex + draw * maxVertsPerDraw;
        pDC->FeWork.desc.draw.numInstances = numInstances;
        pDC->FeWork.desc.draw.startInstance = startInstance;
        // Primitive ids continue across the chunks of a split draw.
        pDC->FeWork.desc.draw.startPrimID = draw * primsPerDraw;

        //enqueue DC
        QueueDraw(pContext);

        remainingVerts -= numVertsForDraw;
        draw++;
    }

    // restore culling state
    // (written to the state of the draw context obtained after the last queued draw)
    pDC = GetDrawContext(pContext);
    pDC->pState->state.rastState.cullMode = oldCullMode;

    RDTSC_STOP(APIDraw, numVertices * numInstances, 0);
}
Exemple #17
0
/**
 * Benchmark / demo driver for the elliptic-curve routines of this project.
 *
 * In order, it:
 *   1. loads the curve parameters and the generator point from the *_v strings,
 *   2. measures how many cycle-counter ticks elapse during sleep(1),
 *   3. depending on the TEST_* switches, times modular operations, point
 *      operations and several scalar-multiplication algorithms,
 *   4. generates two key pairs, optionally runs an ElGamal and a simplified
 *      ECIES round trip, and finally derives an ECDH shared key.
 *
 * t1, t2, rdtscp_cycle and max_iteration are not declared in this function;
 * they are expected to exist at file scope.
 *
 * @return EXIT_SUCCESS
 */
int main() {
	/** Fm, where modulo = m */
	mpz_t a, b, k, r, modulo;
	int i = 0; // loop variable
	Point p, next_p;
	p = init_point(p);
	mpz_init(a);
	mpz_init(b);
	mpz_init(k);
	mpz_init(r); // order
	mpz_init(modulo);

	/**
	 * Load the curve parameters. Note the mixed radices: a, the modulus and
	 * the order are parsed as base 10, b and the generator as base 16.
	 */
	mpz_set_str(a, a_v, 10);
	mpz_set_str(b, b_v, 16);
	mpz_set_str(modulo, p_v, 10);
	mpz_set_str(r, r_v, 10);
	mpz_set_str(p.x, gx_v, 16);
	mpz_set_str(p.y, gy_v, 16);

	// zero_value stays at its initial value 0 and is only used for comparisons.
	mpz_t zero_value, k2;
	mpz_init(zero_value);
	mpz_init(k2);

	/** Calibrate: count the ticks that elapse while sleeping for one second */
	RDTSC_START(t1);
	sleep(1); // sleep for 1 second
	RDTSC_STOP(t2);
	uint64_t one_second = t2 - t1 - rdtscp_cycle;
	// uint64_t is not guaranteed to be (unsigned) long long, so cast explicitly
	// to the type the conversion specifier expects.
	printf("Approximate number of cycles in 1 second: %llu\n\n", (unsigned long long) one_second);
	uint64_t one_us = one_second / 1e6;

	// Draw random scalars until they are non-zero after reduction.
	while (mpz_cmp(k, zero_value) == 0) {
		get_random(k, 32); // generate random test (256 bits)
		positive_modulo(k, k, modulo);
	}
	printf("Random k (in Binary): ");
	mpz_out_str(stdout, 2, k);
	printf("\n");

	while (mpz_cmp(k2, zero_value) == 0) {
		get_random(k2, 32); // generate random test (256 bits)
		positive_modulo(k2, k2, modulo);
	}
	printf("Random k2 (in Binary): ");
	mpz_out_str(stdout, 2, k2);
	printf("\n");

	/** Compare ADDITION, SHIFTING, MULTIPLICATION, and INVERSION */
	// NOTE: every benchmark below overwrites k, so each one starts from the
	// value left behind by the previous one.
	if (TEST_MODULAR_OPERATION) {
		max_iteration = 10000;

		/** Addition */
		i = 0;
		uint64_t total = 0;
		while (i < max_iteration) {
			RDTSC_START(t1); // start operation
			mpz_add(k, k, k2);
			positive_modulo(k, k, modulo);
			RDTSC_STOP(t2); // stop operation
			total += t2 - t1 - rdtscp_cycle;
			i++;
		}
		printf("--[ADDITION]--\n");
		print_result(total, one_us);

		/** Shifting */
		i = 0;
		uint64_t total2 = 0;
		while (i < max_iteration) {
			RDTSC_START(t1); // start operation
			mpz_mul_2exp(k, k, 1); // left shift
			positive_modulo(k, k, modulo);
			RDTSC_STOP(t2); // stop operation
			total2 += t2 - t1 - rdtscp_cycle;
			i++;
		}
		printf("--[SHIFTING 2 * k]--\n");
		print_result(total2, one_us);

		/** Multiplication */
		i = 0;
		uint64_t total3 = 0;
		while (i < max_iteration) {
			RDTSC_START(t1); // start operation
			mpz_mul(k, k, k2);
			positive_modulo(k, k, modulo);
			RDTSC_STOP(t2); // stop operation
			total3 += t2 - t1 - rdtscp_cycle;
			i++;
		}
		printf("--[MULTIPLICATION k * k2]--\n");
		print_result(total3, one_us);

		/** Inversion */
		i = 0;
		uint64_t total4 = 0;
		while (i < max_iteration) {
			RDTSC_START(t1); // start operation
			mpz_invert(k, k, modulo);
			RDTSC_STOP(t2); // stop operation
			total4 += t2 - t1 - rdtscp_cycle;
			i++;
		}
		printf("--[INVERSION]--\n");
		print_result(total4, one_us);
	}

	/** -------------------------------------------------------------------------*/
	// Convert Affine coordinate to Jacobian coordinate
	J_Point j_p, j_next_p;
	j_next_p = init_j_point(j_next_p);
	j_p = affine_to_jacobian(p); // Generator point

	if (TEST_SCALAR_OPERATION) {
		max_iteration = 100;
		Point p1, p2, p3;
		J_Point j_p1, j_p2, j_p3;

		/** Point preparation: p1 = [k]G and p2 = [k2]G in both representations */
		p1 = init_point(p1); p2 = init_point(p2);
		j_p1 = init_j_point(j_p1); j_p2 = init_j_point(j_p2);
		j_p1 = jacobian_affine_sliding_NAF(j_p, p, a, k, modulo, 4);
		j_p2 = jacobian_affine_sliding_NAF(j_p, p, a, k2, modulo, 4);
		p1 = jacobian_to_affine(j_p1, modulo);
		p2 = jacobian_to_affine(j_p2, modulo);

		/** Affine addition */
		i = 0;
		uint64_t total = 0;
		while (i < max_iteration) {
			RDTSC_START(t1); // start operation
			p3 = affine_curve_addition(p1, p2, a, modulo);
			RDTSC_STOP(t2); // stop operation
			total += t2 - t1 - rdtscp_cycle;
			i++;
		}
		printf("--[ADDITION in AFFINE]--\n");
		print_result(total, one_us);

		/** Affine doubling */
		i = 0;
		uint64_t total2 = 0;
		while (i < max_iteration) {
			RDTSC_START(t1); // start operation
			p3 = affine_curve_doubling(p1, a, modulo);
			RDTSC_STOP(t2); // stop operation
			total2 += t2 - t1 - rdtscp_cycle;
			i++;
		}
		printf("--[DOUBLING in AFFINE]--\n");
		print_result(total2, one_us);

		/** Jacobian addition */
		i = 0;
		uint64_t total3 = 0;
		while (i < max_iteration) {
			RDTSC_START(t1); // start operation
			j_p3 = jacobian_curve_addition(j_p1, j_p2, a, modulo);
			RDTSC_STOP(t2); // stop operation
			total3 += t2 - t1 - rdtscp_cycle;
			i++;
		}
		printf("--[ADDITION in JACOBIAN]--\n");
		print_result(total3, one_us);

		/** Jacobian doubling */
		i = 0;
		uint64_t total4 = 0;
		while (i < max_iteration) {
			RDTSC_START(t1); // start operation
			j_p3 = jacobian_curve_doubling(j_p1, a, modulo);
			RDTSC_STOP(t2); // stop operation
			total4 += t2 - t1 - rdtscp_cycle;
			i++;
		}
		printf("--[DOUBLING in JACOBIAN]--\n");
		print_result(total4, one_us);

		/** Affine-Jacobian addition */
		i = 0;
		uint64_t total5 = 0;
		while (i < max_iteration) {
			RDTSC_START(t1); // start operation
			j_p3 = jacobian_affine_curve_addition(j_p1, p2, a, modulo);
			RDTSC_STOP(t2); // stop operation
			total5 += t2 - t1 - rdtscp_cycle;
			i++;
		}
		printf("--[ADDITION in JACOBIAN-AFFINE]--\n");
		print_result(total5, one_us);
	}

	/** -------------------------------------------------------------------------*/
	if (TEST_SCALAR_ALGORITHM) {
		max_iteration = 100;

		/** Test Left-to-right binary algorithm */
		i = 0;
		uint64_t total = 0;
		while (i < max_iteration) {
			RDTSC_START(t1); // start operation
			next_p = affine_left_to_right_binary(p, a, k, modulo); // Q = [k]P
			// gmp_printf("%Zd %Zd\n", next_p.x, next_p.y);
			RDTSC_STOP(t2); // stop operation
			total += t2 - t1 - rdtscp_cycle;
			i++;
		}
		printf("--[AFFINE] Left to right binary algorithm--\n");
		print_result(total, one_us);

		// The Jacobian variants below include the conversion back to affine
		// coordinates in the timed region.
		i = 0;
		uint64_t total2 = 0;
		while (i < max_iteration) {
			RDTSC_START(t1); // start operation
			j_next_p = jacobian_left_to_right_binary(j_p, a, k, modulo); // Q = [k]P
			// gmp_printf("%Zd %Zd\n", j_next_p.X, j_next_p.Y);
			next_p = jacobian_to_affine(j_next_p, modulo);
			// gmp_printf("%Zd %Zd\n", next_p.x, next_p.y);
			RDTSC_STOP(t2); // stop operation
			total2 += t2 - t1 - rdtscp_cycle;
			i++;
		}
		printf("--[JACOBIAN] Left to right binary algorithm--\n");
		print_result(total2, one_us);

		i = 0;
		uint64_t total3 = 0;
		while (i < max_iteration) {
			RDTSC_START(t1); // start operation
			j_next_p = jacobian_affine_left_to_right_binary(j_p, p, a, k, modulo); // Q = [k]P
			// gmp_printf("%Zd %Zd\n", j_next_p.X, j_next_p.Y);
			next_p = jacobian_to_affine(j_next_p, modulo);
			// gmp_printf("%Zd %Zd\n", next_p.x, next_p.y);
			RDTSC_STOP(t2); // stop operation
			total3 += t2 - t1 - rdtscp_cycle;
			i++;
		}
		printf("--[JACOBIAN-AFFINE] Left to right binary algorithm--\n");
		print_result(total3, one_us);

		int w = 4; // windows size
		i = 0;
		uint64_t total4 = 0;
		while (i < max_iteration) {
			RDTSC_START(t1); // start operation
			j_next_p = jacobian_affine_sliding_NAF(j_p, p, a, k, modulo, w); // Q = [k]P
			// gmp_printf("%Zd %Zd\n", j_next_p.X, j_next_p.Y);
			next_p = jacobian_to_affine(j_next_p, modulo);
			// gmp_printf("%Zd %Zd\n", next_p.x, next_p.y);
			RDTSC_STOP(t2); // stop operation
			total4 += t2 - t1 - rdtscp_cycle;
			i++;
		}
		printf("--[JACOBIAN-AFFINE] Sliding NAF Left to right binary algorithm (w = 4)--\n");
		print_result(total4, one_us);

		w = 5; // windows size
		i = 0;
		uint64_t total5 = 0;
		while (i < max_iteration) {
			RDTSC_START(t1); // start operation
			j_next_p = jacobian_affine_sliding_NAF(j_p, p, a, k, modulo, w); // Q = [k]P
			// gmp_printf("%Zd %Zd\n", j_next_p.X, j_next_p.Y);
			next_p = jacobian_to_affine(j_next_p, modulo);
			// gmp_printf("%Zd %Zd\n", next_p.x, next_p.y);
			RDTSC_STOP(t2); // stop operation
			total5 += t2 - t1 - rdtscp_cycle;
			i++;
		}
		printf("--[JACOBIAN-AFFINE] Sliding NAF Left to right binary algorithm (w = 5)--\n");
		print_result(total5, one_us);

		/** Test Right-to-left binary algorithm */
		i = 0;
		uint64_t total6 = 0;
		while (i < max_iteration) {
			RDTSC_START(t1); // start operation
			next_p = affine_right_to_left_binary(p, a, k, modulo); // Q = [k]P
			// gmp_printf("%Zd %Zd\n", next_p.x, next_p.y);
			RDTSC_STOP(t2); // stop operation
			total6 += t2 - t1 - rdtscp_cycle;
			i++;
		}
		printf("--[AFFINE] Right to left binary algorithm--\n");
		print_result(total6, one_us);

		/** Test Montgomery ladder algorithm (Against time-based attack) */
		i = 0;
		uint64_t total7 = 0;
		while (i < max_iteration) {
			RDTSC_START(t1); // start operation
			j_next_p = jacobian_montgomery_ladder(j_p, a, k, modulo); // Q = [k]P
			// gmp_printf("%Zd %Zd\n", j_next_p.X, j_next_p.Y);
			next_p = jacobian_to_affine(j_next_p, modulo);
			// gmp_printf("%Zd %Zd\n", next_p.x, next_p.y);
			RDTSC_STOP(t2); // stop operation
			total7 += t2 - t1 - rdtscp_cycle;
			i++;
		}
		printf("--[JACOBIAN] Montgomery ladder algorithm--\n");
		print_result(total7, one_us);
	}

	/** -------------------------------------------------------------------------*/
	/** Key generation: two random non-zero private scalars and their public points */
	J_Point public_key_1, public_key_2, shared_key;
	mpz_t private_key_1, private_key_2;
	mpz_init(private_key_1); mpz_init(private_key_2);
	// TODO : Key should be padded to fixed size (serializable)
	// Note: (2^-256 chance of failure, can be ignored)
	while (mpz_cmp(private_key_1, zero_value) == 0) {
		get_random(private_key_1, 32); // 256 bit
		positive_modulo(private_key_1, private_key_1, modulo);
	}
	while (mpz_cmp(private_key_2, zero_value) == 0) {
		get_random(private_key_2, 32); // 256 bit
		positive_modulo(private_key_2, private_key_2, modulo);
	}

	gmp_printf("Private key [A B]: %Zd %Zd\n\n", private_key_1, private_key_2);
	public_key_1 = jacobian_left_to_right_binary(j_p, a, private_key_1, modulo);
	public_key_2 = jacobian_left_to_right_binary(j_p, a, private_key_2, modulo);

	gmp_printf("Public key 1 - Jacobian [X Y Z]: %Zd %Zd %Zd\n", public_key_1.X, public_key_1.Y, public_key_1.Z);
	gmp_printf("Public key 2 - Jacobian [X Y Z]: %Zd %Zd %Zd\n", public_key_2.X, public_key_2.Y, public_key_2.Z);

	Point public_key_1_decoded = jacobian_to_affine(public_key_1, modulo);
	Point public_key_2_decoded = jacobian_to_affine(public_key_2, modulo);

	gmp_printf("Public key 1 - Affine [X Y]: %Zd %Zd\n", public_key_1_decoded.x, public_key_1_decoded.y);
	gmp_printf("Public key 2 - Affine [X Y]: %Zd %Zd\n\n", public_key_2_decoded.x, public_key_2_decoded.y);

	/** -------------------------------------------------------------------------*/
	if (TEST_ENCRYPT_DECRYPT) {
		// ElGamal Encrypt - Decrypt (Map message to chunk of points in EC)
		J_Point message, chosen_point, encoded_point, decoded_point;
		mpz_t k_message;
		mpz_init(k_message);
		mpz_set_ui(k_message, 123456789);
		message = jacobian_left_to_right_binary(j_p, a, k_message, modulo);

		Point message_decoded = jacobian_to_affine(message, modulo);
		gmp_printf("[Encrypt] Message - Affine [X Y] %Zd %Zd\n", message_decoded.x, message_decoded.y);
		gmp_printf("[Encrypt] Message - Jacobian [X Y Z]: %Zd %Zd %Zd\n", message.X, message.Y, message.Z);

		// k_message still holds the message scalar at this point. Reset it so the
		// loop below really draws a fresh random ephemeral scalar; without the
		// reset the loop never runs and the chosen point equals the message point.
		mpz_set_ui(k_message, 0);
		while (mpz_cmp(k_message, zero_value) == 0) {
			get_random(k_message, 32);
			positive_modulo(k_message, k_message, modulo);
		}
		// Encrypt example
		chosen_point = jacobian_left_to_right_binary(j_p, a, k_message, modulo); // chosen point (r)
		gmp_printf("[Encrypt] Chosen point - Jacobian [X Y Z]: %Zd %Zd %Zd\n", chosen_point.X, chosen_point.Y, chosen_point.Z);
		encoded_point = jacobian_left_to_right_binary(public_key_2, a, k_message, modulo); // r * Pu2
		encoded_point = jacobian_curve_addition(message, encoded_point, a, modulo);
		// TODO : chosen_point & encoded_point should be padded to P-bit
		gmp_printf("[Decrypt] Encoded point - Jacobian [X Y Z]: %Zd %Zd %Zd\n", encoded_point.X, encoded_point.Y, encoded_point.Z);

		// Decrypt example (encoded_point - private_key * chosen_point)
		decoded_point = jacobian_left_to_right_binary(chosen_point, a, private_key_2, modulo);
		decoded_point = jacobian_curve_subtraction(encoded_point, decoded_point, a, modulo);
		gmp_printf("[Decrypt] Original message - Jacobian [X Y Z]: %Zd %Zd %Zd\n", decoded_point.X, decoded_point.Y, decoded_point.Z);
		message_decoded = jacobian_to_affine(decoded_point, modulo);
		gmp_printf("[Decrypt] Original message - Affine [X Y] %Zd %Zd\n\n", message_decoded.x, message_decoded.y);

		mpz_clear(k_message);
	}
	/** -------------------------------------------------------------------------*/
	if (TEST_SIMPLIFIED_ECIES) {
		// Simplified ECIES (Ref: Page 256 Cryptography Theory & Practice 2nd Ed. - Douglas)
		char* message_string = "hello"; // 0..9, a..z (base 36)
		mpz_t encrypted_message;
		mpz_init(encrypted_message);

		// The message is processed in chunks of at most 24 characters; a
		// trailing partial chunk counts as one more partition.
		int partition = strlen(message_string) / 24;
		int partition_modulo = strlen(message_string) % 24;
		if (partition_modulo != 0) partition++;

		for (i = 0; i < partition; i++) {
			// 24 characters from message_string + 1 null-terminate.
			// Small fixed-size scratch buffer: kept on the stack so there is
			// nothing to allocate, check or free per iteration.
			char chunked_message_string[25];
			int size = 24;
			if ((i == partition - 1) && (partition_modulo != 0)) size = partition_modulo;
			memcpy(chunked_message_string, message_string + i*24, (size_t) size);
			chunked_message_string[size] = '\0'; // null-terminate

			Point c_point = encrypt_ECIES(encrypted_message, chunked_message_string, public_key_2_decoded, p, a, modulo);
			gmp_printf("[SIMPLIFIED ECIES] Encrypted message: %Zd\n", encrypted_message);
			decrypt_ECIES(encrypted_message, c_point, private_key_2, p, a, modulo);
		}

		mpz_clear(encrypted_message);
	}
	/**-------------------------------------------------------------------------*/
	// TODO : Public key validation!
	// Shared key (ECDH) - key secure exchange
	shared_key = jacobian_left_to_right_binary(public_key_2, a, private_key_1, modulo);
	gmp_printf("Shared key - Jacobian [X Y Z]: %Zd %Zd %Zd\n", shared_key.X, shared_key.Y, shared_key.Z);
	Point shared_key_decoded = jacobian_to_affine(shared_key, modulo);
	gmp_printf("Shared key - Affine [X Y]: %Zd %Zd\n", shared_key_decoded.x, shared_key_decoded.y);

	// TODO : ECDSA - digital signature algorithm

	/** Cleaning up */
	// NOTE(review): the Point / J_Point values are not released here; whether
	// they own GMP storage depends on their definition — confirm.
	mpz_clear(a);
	mpz_clear(b);
	mpz_clear(k);
	mpz_clear(k2);
	mpz_clear(r);
	mpz_clear(modulo);
	mpz_clear(zero_value);
	mpz_clear(private_key_1);
	mpz_clear(private_key_2);

	return EXIT_SUCCESS;
}
// Prepares the hot tiles of one macrotile ahead of a draw.
//
// For each attachment the current state uses (color render targets when a
// pixel shader is bound, depth when depth test or write is enabled, stencil
// when stencil test or write is enabled) the hot tile is fetched and, based
// on its state:
//   - HOTTILE_INVALID: loaded from the surface through pContext->pfnLoadTile
//   - HOTTILE_CLEAR:   cleared with the matching Clear*HotTile helper
// In both cases the tile is then marked HOTTILE_DIRTY. Tiles in any other
// state are left untouched.
//
// This runs in the outer thread loop rather than inside the draw routine
// itself, mainly for performance: it avoids repeating the setup for every
// triangle.
//
// pWork is accepted but not used by this function.
// @todo support deferred clear
INLINE
void InitializeHotTiles(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID, const TRIANGLE_WORK_DESC* pWork)
{
    const API_STATE& apiState = GetApiState(pDC);
    HotTileMgr *pTileMgr = pContext->pHotTileMgr;

    // Color render-target slots 0..maxRTSlotUsed are visited.
    const uint32_t rtCount = apiState.psState.maxRTSlotUsed + 1;

    // Tile indices of the macrotile, scaled by the macrotile dimensions;
    // these are the x/y values handed to the load callback.
    uint32_t tileX, tileY;
    MacroTileMgr::getTileIndices(macroID, tileX, tileY);
    tileX = tileX * KNOB_MACROTILE_X_DIM;
    tileY = tileY * KNOB_MACROTILE_Y_DIM;

    uint32_t sampleCount = GetNumSamples(apiState.rastState.sampleCount);

    // Color attachments: only relevant when a pixel shader is bound.
    if (apiState.psState.pfnPixelShader != nullptr)
    {
        for (uint32_t slot = 0; slot < rtCount; slot++)
        {
            const SWR_RENDERTARGET_ATTACHMENT colorAttachment =
                (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + slot);
            HOTTILE* pColorTile = pTileMgr->GetHotTile(pContext, pDC, macroID, colorAttachment, true, sampleCount);

            switch (pColorTile->state)
            {
            case HOTTILE_INVALID:
            {
                // An invalid tile must be loaded from the surface before drawing to it.
                RDTSC_START(BELoadTiles);
                pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_COLOR_HOT_TILE_FORMAT, colorAttachment, tileX, tileY, pColorTile->renderTargetArrayIndex, pColorTile->pBuffer);
                pColorTile->state = HOTTILE_DIRTY;
                RDTSC_STOP(BELoadTiles, 0, 0);
                break;
            }
            case HOTTILE_CLEAR:
            {
                // Apply the pending clear now.
                RDTSC_START(BELoadTiles);
                ClearColorHotTile(pColorTile);
                pColorTile->state = HOTTILE_DIRTY;
                RDTSC_STOP(BELoadTiles, 0, 0);
                break;
            }
            default:
                break;
            }
        }
    }

    // Depth attachment: needed when depth is either tested or written.
    if (apiState.depthStencilState.depthTestEnable || apiState.depthStencilState.depthWriteEnable)
    {
        HOTTILE* pDepthTile = pTileMgr->GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_DEPTH, true, sampleCount);

        switch (pDepthTile->state)
        {
        case HOTTILE_INVALID:
        {
            // An invalid tile must be loaded from the surface before drawing to it.
            RDTSC_START(BELoadTiles);
            pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_DEPTH_HOT_TILE_FORMAT, SWR_ATTACHMENT_DEPTH, tileX, tileY, pDepthTile->renderTargetArrayIndex, pDepthTile->pBuffer);
            pDepthTile->state = HOTTILE_DIRTY;
            RDTSC_STOP(BELoadTiles, 0, 0);
            break;
        }
        case HOTTILE_CLEAR:
        {
            // Apply the pending clear now.
            RDTSC_START(BELoadTiles);
            ClearDepthHotTile(pDepthTile);
            pDepthTile->state = HOTTILE_DIRTY;
            RDTSC_STOP(BELoadTiles, 0, 0);
            break;
        }
        default:
            break;
        }
    }

    // Stencil attachment: needed when stencil is either tested or written.
    if (apiState.depthStencilState.stencilTestEnable || apiState.depthStencilState.stencilWriteEnable)
    {
        HOTTILE* pStencilTile = pTileMgr->GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_STENCIL, true, sampleCount);

        switch (pStencilTile->state)
        {
        case HOTTILE_INVALID:
        {
            // An invalid tile must be loaded from the surface before drawing to it.
            RDTSC_START(BELoadTiles);
            pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_STENCIL_HOT_TILE_FORMAT, SWR_ATTACHMENT_STENCIL, tileX, tileY, pStencilTile->renderTargetArrayIndex, pStencilTile->pBuffer);
            pStencilTile->state = HOTTILE_DIRTY;
            RDTSC_STOP(BELoadTiles, 0, 0);
            break;
        }
        case HOTTILE_CLEAR:
        {
            // Apply the pending clear now.
            RDTSC_START(BELoadTiles);
            ClearStencilHotTile(pStencilTile);
            pStencilTile->state = HOTTILE_DIRTY;
            RDTSC_STOP(BELoadTiles, 0, 0);
            break;
        }
        default:
            break;
        }
    }
}