int run_hsl(int c, char* src, char* dst, float hh, float ss, float ll, int times) { BMP* bmp = bmp_read(src); if(bmp==0) { return -1;} // open error if(ss>1) ss=1; else if(ss<-1) ss=-1; if(ll>1) ll=1; else if(ll<-1) ll=-1; uint8_t* data = bmp_get_data(bmp); uint32_t h = *(bmp_get_h(bmp)); uint32_t w = *(bmp_get_w(bmp)); if(w%4!=0) { return -1;} // do not support padding uint8_t* dataC = 0; if(*(bmp_get_bitcount(bmp)) == 24) { dataC = malloc(sizeof(uint8_t)*4*h*w); to32(w,h,data,dataC); } else { dataC = data; } unsigned long start, end; switch(c){ case 0: RDTSC_START(start); C_hsl(w,h,dataC,hh,ss,ll); RDTSC_STOP(end); break; case 1: RDTSC_START(start); ASM_hsl1(w,h,dataC,hh,ss,ll); RDTSC_STOP(end); break; case 2: RDTSC_START(start); ASM_hsl2(w,h,dataC,hh,ss,ll); RDTSC_STOP(end); break; default: return -1; break; } unsigned long delta = end - start; printf("%lu", delta); if(*(bmp_get_bitcount(bmp)) == 24) { to24(w,h,dataC,data); free(dataC); } bmp_delete(bmp); return 0; }
int run_blur(int c, char* src, char* dst, int times){ BMP* bmp = bmp_read(src); if(bmp==0) { return -1;} // open error uint8_t* data = bmp_get_data(bmp); uint32_t h = *(bmp_get_h(bmp)); uint32_t w = *(bmp_get_w(bmp)); if(w%4!=0) { return -1;} // do not support padding uint8_t* dataC = 0; if(*(bmp_get_bitcount(bmp)) == 24) { dataC = malloc(sizeof(uint8_t)*4*h*w); to32(w,h,data,dataC); } else { dataC = data; } unsigned long start, end; switch(c){ case 0: RDTSC_START(start); C_blur(w,h,dataC); RDTSC_STOP(end); break; case 1: RDTSC_START(start); ASM_blur1(w,h,dataC); RDTSC_STOP(end); break; case 2: RDTSC_START(start); ASM_blur2(w,h,dataC); RDTSC_STOP(end); break; default: // return -1; break; } unsigned long delta = end - start; printf("%lu", delta); if(*(bmp_get_bitcount(bmp)) == 24) { to24(w,h,dataC,data); free(dataC); } bmp_delete(bmp); return 0; }
// Deswizzles, converts and stores current contents of the hot tiles to surface // described by pState void SwrStoreTiles( HANDLE hContext, SWR_RENDERTARGET_ATTACHMENT attachment, SWR_TILE_STATE postStoreTileState) // TODO: Implement postStoreTileState { RDTSC_START(APIStoreTiles); SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext; DRAW_CONTEXT* pDC = GetDrawContext(pContext); pDC->inUse = true; SetupMacroTileScissors(pDC); pDC->FeWork.type = STORETILES; pDC->FeWork.pfnWork = ProcessStoreTiles; pDC->FeWork.desc.storeTiles.attachment = attachment; pDC->FeWork.desc.storeTiles.postStoreTileState = postStoreTileState; //enqueue QueueDraw(pContext); RDTSC_STOP(APIStoreTiles, 0, 0); if (attachment == SWR_ATTACHMENT_COLOR0) { RDTSC_ENDFRAME(); } }
// Queues a CLEAR front-end work item (ProcessClear) carrying the clear mask,
// the four-component clear colour, the depth value and the stencil value.
void SwrClearRenderTarget(
    HANDLE hContext,
    uint32_t clearMask,
    const float clearColor[4],
    float z,
    BYTE stencil)
{
    RDTSC_START(APIClearRenderTarget);

    SWR_CONTEXT* const pCtx = (SWR_CONTEXT*)hContext;
    DRAW_CONTEXT* const pDrawCtx = GetDrawContext(pCtx);

    SetupMacroTileScissors(pDrawCtx);
    pDrawCtx->inUse = true;

    CLEAR_FLAGS clearFlags;
    clearFlags.mask = clearMask;

    auto& feWork = pDrawCtx->FeWork;
    feWork.type = CLEAR;
    feWork.pfnWork = ProcessClear;

    auto& clearDesc = feWork.desc.clear;
    clearDesc.flags = clearFlags;
    clearDesc.clearDepth = z;
    for (int channel = 0; channel < 4; ++channel)
    {
        clearDesc.clearRTColor[channel] = clearColor[channel];
    }
    clearDesc.clearStencil = stencil;

    // Hand the draw context over to the queue.
    QueueDraw(pCtx);

    RDTSC_STOP(APIClearRenderTarget, 0, pDrawCtx->drawId);
}
/*
 * JNI entry point: takes a timestamp with RDTSC_START and returns it to Java.
 * `env` and `clazz` are not used.
 */
JNIEXPORT jlong JNICALL Java_com_rr_core_os_NativeHooksImpl_jniNanoRDTSCStart(JNIEnv *env, jclass clazz)
{
    long startCycles;

    RDTSC_START(startCycles);

    return startCycles;
}
////////////////////////////////////////////////////////////////////////// /// @brief SwrDispatch /// @param hContext - Handle passed back from SwrCreateContext /// @param threadGroupCountX - Number of thread groups dispatched in X direction /// @param threadGroupCountY - Number of thread groups dispatched in Y direction /// @param threadGroupCountZ - Number of thread groups dispatched in Z direction void SwrDispatch( HANDLE hContext, uint32_t threadGroupCountX, uint32_t threadGroupCountY, uint32_t threadGroupCountZ) { RDTSC_START(APIDispatch); SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext; DRAW_CONTEXT* pDC = GetDrawContext(pContext); pDC->isCompute = true; // This is a compute context. pDC->inUse = true; COMPUTE_DESC* pTaskData = (COMPUTE_DESC*)pDC->arena.AllocAligned(sizeof(COMPUTE_DESC), 64); pTaskData->threadGroupCountX = threadGroupCountX; pTaskData->threadGroupCountY = threadGroupCountY; pTaskData->threadGroupCountZ = threadGroupCountZ; uint32_t totalThreadGroups = threadGroupCountX * threadGroupCountY * threadGroupCountZ; pDC->pDispatch->initialize(totalThreadGroups, pTaskData); QueueDispatch(pContext); RDTSC_STOP(APIDispatch, threadGroupCountX * threadGroupCountY * threadGroupCountZ, 0); }
///@todo Combine this with QueueDraw void QueueDispatch(SWR_CONTEXT *pContext) { _ReadWriteBarrier(); pContext->DrawEnqueued++; if (KNOB_SINGLE_THREADED) { // flush denormals to 0 uint32_t mxcsr = _mm_getcsr(); _mm_setcsr(mxcsr | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON); WorkOnCompute(pContext, 0, pContext->WorkerBE[0]); // restore csr _mm_setcsr(mxcsr); } else { RDTSC_START(APIDrawWakeAllThreads); WakeAllThreads(pContext); RDTSC_STOP(APIDrawWakeAllThreads, 1, 0); } // Set current draw context to NULL so that next state call forces a new draw context to be created and populated. pContext->pPrevDrawContext = pContext->pCurDrawContext; pContext->pCurDrawContext = nullptr; }
// see https://github.com/saulius/croaring-rs/issues/6#issuecomment-243341270 int main() { size_t N = 1000000; uint64_t cycles_start, cycles_final; RDTSC_START(cycles_start); for(size_t i = 0; i < N; i++) { roaring_bitmap_t * bm = roaring_bitmap_create(); roaring_bitmap_free(bm); } RDTSC_FINAL(cycles_final); printf("%f cycles per object created \n",(cycles_final-cycles_start)*1.0/N); return 0; }
// Queues a SYNC front-end work item (ProcessSync) that carries a callback and
// two user data values. The draw context is made dependent on the draw with
// the preceding id.
void SwrSync(HANDLE hContext, PFN_CALLBACK_FUNC pfnFunc, uint64_t userData, uint64_t userData2)
{
    RDTSC_START(APISync);

    SWR_CONTEXT* const pCtx = GetContext(hContext);
    DRAW_CONTEXT* const pDrawCtx = GetDrawContext(pCtx);

    pDrawCtx->inUse = true;

    auto& feWork = pDrawCtx->FeWork;
    feWork.type = SYNC;
    feWork.pfnWork = ProcessSync;

    auto& syncDesc = feWork.desc.sync;
    syncDesc.pfnCallbackFunc = pfnFunc;
    syncDesc.userData = userData;
    syncDesc.userData2 = userData2;

    // cannot execute until all previous draws have completed
    pDrawCtx->dependency = pDrawCtx->drawId - 1;

    // Hand the draw context over to the queue.
    QueueDraw(pCtx);

    RDTSC_STOP(APISync, 1, 0);
}
void Flip() { RDTSC_START(APIFlip); if (mIsDisplay) { XSync(mpDisplay, False); // copy render target to Xshm surface, mirroring on Y to account for X/GL origin differences (X is top-left, GL is bottom-left) UINT pitch = mWidth * 4; OGL::GetDDProcTable().pfnPresent2(OGL::GetDDHandle(), mpImages[mCurBackBuffer]->data, pitch); // copy to display surface if (useShm) XShmPutImage(mpDisplay, mDrawable, swapGC, mpImages[mCurBackBuffer], 0, 0, 0, 0, mWidth, mHeight, False); else XPutImage(mpDisplay, mDrawable, swapGC, mpImages[mCurBackBuffer], 0, 0, 0, 0, mWidth, mHeight); // flip back buffer mCurBackBuffer ^= 1; } RDTSC_STOP(APIFlip, 1, 0); }
//////////////////////////////////////////////////////////////////////////
/// @brief DrawIndexedInstance - queues an indexed, instanced draw. The index
///        range is issued as one or more draws of at most
///        MaxVertsPerDraw(...) indices each; every piece after the first
///        requests its draw context as a split draw.
/// @param hContext - Handle passed back from SwrCreateContext
/// @param topology - Specifies topology for draw.
/// @param numIndices - Number of indices to read sequentially from index buffer.
/// @param indexOffset - Starting index into index buffer.
/// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
/// @param numInstances - Number of instances to render.
/// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
void DrawIndexedInstance(
    HANDLE hContext,
    PRIMITIVE_TOPOLOGY topology,
    uint32_t numIndices,
    uint32_t indexOffset,
    int32_t baseVertex,
    uint32_t numInstances = 1,
    uint32_t startInstance = 0)
{
    RDTSC_START(APIDrawIndexed);

    SWR_CONTEXT *pContext = GetContext(hContext);
    DRAW_CONTEXT* pDC = GetDrawContext(pContext);
    API_STATE* pState = &pDC->pState->state;

    int32_t maxIndicesPerDraw = MaxVertsPerDraw(pDC, numIndices, topology);
    uint32_t primsPerDraw = GetNumPrims(topology, maxIndicesPerDraw);
    int32_t remainingIndices = numIndices;

    // Size in bytes of one index, derived from the bound index buffer format.
    uint32_t indexSize = 0;
    switch (pState->indexBuffer.format)
    {
    case R32_UINT: indexSize = sizeof(uint32_t); break;
    case R16_UINT: indexSize = sizeof(uint16_t); break;
    case R8_UINT: indexSize = sizeof(uint8_t); break;
    default:
        SWR_ASSERT(0);
    }

    int draw = 0;

    // Byte pointer to the first index of this draw.
    uint8_t *pIB = (uint8_t*)pState->indexBuffer.pIndices;
    pIB += (uint64_t)indexOffset * (uint64_t)indexSize;

    pState->topology = topology;
    pState->forceFront = false;

    // disable culling for points/lines
    // NOTE(review): only TOP_POINT_LIST is checked here; line topologies are
    // not handled by this condition.
    uint32_t oldCullMode = pState->rastState.cullMode;
    if (topology == TOP_POINT_LIST)
    {
        pState->rastState.cullMode = SWR_CULLMODE_NONE;
        pState->forceFront = true;
    }

    while (remainingIndices)
    {
        uint32_t numIndicesForDraw = (remainingIndices < maxIndicesPerDraw) ?
            remainingIndices : maxIndicesPerDraw;

        // When breaking up draw, we need to obtain new draw context for each iteration.
        bool isSplitDraw = (draw > 0) ? true : false;
        pDC = GetDrawContext(pContext, isSplitDraw);
        InitDraw(pDC, isSplitDraw);

        pDC->FeWork.type = DRAW;
        pDC->FeWork.pfnWork = GetFEDrawFunc(
            true,   // IsIndexed
            pState->tsState.tsEnable,
            pState->gsState.gsEnable,
            pState->soState.soEnable,
            pDC->pState->pfnProcessPrims != nullptr);
        pDC->FeWork.desc.draw.pDC = pDC;
        pDC->FeWork.desc.draw.numIndices = numIndicesForDraw;
        pDC->FeWork.desc.draw.pIB = (int*)pIB;
        pDC->FeWork.desc.draw.type = pDC->pState->state.indexBuffer.format;

        pDC->FeWork.desc.draw.numInstances = numInstances;
        pDC->FeWork.desc.draw.startInstance = startInstance;
        pDC->FeWork.desc.draw.baseVertex = baseVertex;
        // Primitive ids continue where the previous piece stopped.
        pDC->FeWork.desc.draw.startPrimID = draw * primsPerDraw;

        //enqueue DC
        QueueDraw(pContext);

        // Advance to the indices of the next piece.
        pIB += maxIndicesPerDraw * indexSize;
        remainingIndices -= numIndicesForDraw;
        draw++;
    }

    // restore culling state
    pDC = GetDrawContext(pContext);
    pDC->pState->state.rastState.cullMode = oldCullMode;

    RDTSC_STOP(APIDrawIndexed, numIndices * numInstances, 0);
}
////////////////////////////////////////////////////////////////////////// /// @brief DrawInstanced /// @param hContext - Handle passed back from SwrCreateContext /// @param topology - Specifies topology for draw. /// @param numVerts - How many vertices to read sequentially from vertex data (per instance). /// @param startVertex - Specifies start vertex for draw. (vertex data) /// @param numInstances - How many instances to render. /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data) void DrawInstanced( HANDLE hContext, PRIMITIVE_TOPOLOGY topology, uint32_t numVertices, uint32_t startVertex, uint32_t numInstances = 1, uint32_t startInstance = 0) { RDTSC_START(APIDraw); #if KNOB_ENABLE_TOSS_POINTS if (KNOB_TOSS_DRAW) { return; } #endif SWR_CONTEXT *pContext = GetContext(hContext); DRAW_CONTEXT* pDC = GetDrawContext(pContext); int32_t maxVertsPerDraw = MaxVertsPerDraw(pDC, numVertices, topology); uint32_t primsPerDraw = GetNumPrims(topology, maxVertsPerDraw); int32_t remainingVerts = numVertices; API_STATE *pState = &pDC->pState->state; pState->topology = topology; pState->forceFront = false; // disable culling for points/lines uint32_t oldCullMode = pState->rastState.cullMode; if (topology == TOP_POINT_LIST) { pState->rastState.cullMode = SWR_CULLMODE_NONE; pState->forceFront = true; } int draw = 0; while (remainingVerts) { uint32_t numVertsForDraw = (remainingVerts < maxVertsPerDraw) ? remainingVerts : maxVertsPerDraw; bool isSplitDraw = (draw > 0) ? 
true : false; DRAW_CONTEXT* pDC = GetDrawContext(pContext, isSplitDraw); InitDraw(pDC, isSplitDraw); pDC->FeWork.type = DRAW; pDC->FeWork.pfnWork = GetFEDrawFunc( false, // IsIndexed pState->tsState.tsEnable, pState->gsState.gsEnable, pState->soState.soEnable, pDC->pState->pfnProcessPrims != nullptr); pDC->FeWork.desc.draw.numVerts = numVertsForDraw; pDC->FeWork.desc.draw.startVertex = startVertex + draw * maxVertsPerDraw; pDC->FeWork.desc.draw.numInstances = numInstances; pDC->FeWork.desc.draw.startInstance = startInstance; pDC->FeWork.desc.draw.startPrimID = draw * primsPerDraw; //enqueue DC QueueDraw(pContext); remainingVerts -= numVertsForDraw; draw++; } // restore culling state pDC = GetDrawContext(pContext); pDC->pState->state.rastState.cullMode = oldCullMode; RDTSC_STOP(APIDraw, numVertices * numInstances, 0); }
int main(int argc, char **argv) { int c; char *extension = ".txt"; bool copy_on_write = false; while ((c = getopt(argc, argv, "e:h")) != -1) switch (c) { case 'e': extension = optarg; break; case 'h': printusage(argv[0]); return 0; default: abort(); } if (optind >= argc) { printusage(argv[0]); return -1; } char *dirname = argv[optind]; size_t count; size_t *howmany = NULL; uint32_t **numbers = read_all_integer_files(dirname, extension, &howmany, &count); if (numbers == NULL) { printf( "I could not find or load any data file with extension %s in " "directory %s.\n", extension, dirname); return -1; } uint64_t cycles_start = 0, cycles_final = 0; RDTSC_START(cycles_start); roaring_bitmap_t **bitmaps = create_all_bitmaps(howmany, numbers, count, copy_on_write); RDTSC_FINAL(cycles_final); if (bitmaps == NULL) return -1; printf("Loaded %d bitmaps from directory %s \n", (int)count, dirname); printf("Creating %zu bitmaps took %" PRIu64 " cycles\n", count, cycles_final - cycles_start); RDTSC_START(cycles_start); for (int i = 0; i < (int)count; i += 2) { roaring_bitmap_t *CI = roaring_bitmap_copy( bitmaps[i]); // to test the inplace version we create a copy roaring_bitmap_free(CI); } RDTSC_FINAL(cycles_final); printf("Copying and freeing %zu bitmaps took %" PRIu64 " cycles\n", count, cycles_final - cycles_start); uint64_t successive_and = 0; uint64_t successive_or = 0; // try ANDing and ORing together consecutive pairs for (int i = 0; i < (int)count - 1; ++i) { uint32_t c1 = roaring_bitmap_get_cardinality(bitmaps[i]); uint32_t c2 = roaring_bitmap_get_cardinality(bitmaps[i + 1]); RDTSC_START(cycles_start); roaring_bitmap_t *tempand = roaring_bitmap_and(bitmaps[i], bitmaps[i + 1]); RDTSC_FINAL(cycles_final); successive_and += cycles_final - cycles_start; uint32_t ci = roaring_bitmap_get_cardinality(tempand); roaring_bitmap_free(tempand); RDTSC_START(cycles_start); roaring_bitmap_t *tempor = roaring_bitmap_or(bitmaps[i], bitmaps[i + 1]); RDTSC_FINAL(cycles_final); successive_or 
+= cycles_final - cycles_start; uint32_t co = roaring_bitmap_get_cardinality(tempor); roaring_bitmap_free(tempor); if (c1 + c2 != co + ci) { printf(KRED "cardinalities are wrong somehow\n"); printf("c1 = %d, c2 = %d, co = %d, ci = %d\n", c1, c2, co, ci); return -1; } } printf(" %zu successive bitmaps intersections took %" PRIu64 " cycles\n", count - 1, successive_and); printf(" %zu successive bitmaps unions took %" PRIu64 " cycles\n", count - 1, successive_or); roaring_bitmap_t **copyofr = malloc(sizeof(roaring_bitmap_t *) * count); for (int i = 0; i < (int)count; i++) { copyofr[i] = roaring_bitmap_copy(bitmaps[i]); } RDTSC_START(cycles_start); for (int i = 0; i < (int)count - 1; i++) { roaring_bitmap_and_inplace(copyofr[i], bitmaps[i + 1]); } RDTSC_FINAL(cycles_final); printf(" %zu successive in-place bitmaps intersections took %" PRIu64 " cycles\n", count - 1, cycles_final - cycles_start); free(copyofr); copyofr = malloc(sizeof(roaring_bitmap_t *) * count); for (int i = 0; i < (int)count; i++) { copyofr[i] = roaring_bitmap_copy(bitmaps[i]); } RDTSC_START(cycles_start); for (int i = 0; i < (int)count - 1; i++) { roaring_bitmap_or_inplace(copyofr[i], bitmaps[i + 1]); } RDTSC_FINAL(cycles_final); printf(" %zu successive in-place bitmaps unions took %" PRIu64 " cycles\n", count - 1, cycles_final - cycles_start); for (int i = 0; i < (int)count; ++i) { free(numbers[i]); numbers[i] = NULL; // paranoid roaring_bitmap_free(bitmaps[i]); bitmaps[i] = NULL; // paranoid } free(bitmaps); free(howmany); free(numbers); return 0; }
/**
 * Benchmark driver for sorted-vector "bitmaps": loads every integer file with
 * the given extension from a directory, builds one vector per file and times
 * a series of set operations with RDTSC_START/RDTSC_FINAL.
 *
 * Usage: prog [-v] [-e extension] [-h] directory
 *
 * Collected measurements (printed as one line at the end, each normalised):
 *   data[0]  memory usage in bytes as reported by getMemUsageInBytes()
 *   data[1]  successive intersections     data[2]  successive unions
 *   data[3]  total union, pairwise        data[4]  total union, fast_logicalor
 *   data[5]  quartile membership queries  data[6]  successive differences
 *   data[7]  successive symmetric diffs   data[8]  full iteration
 *   data[9]..data[12]  and / or / andnot / xor through inserter(counter)
 *
 * Returns 0 on success, -1 on usage error, load failure or when no bitmaps
 * were created.
 */
int main(int argc, char **argv) {
    int c;
    const char *extension = ".txt";
    bool verbose = false;
    uint64_t data[13];
    initializeMemUsageCounter();

    while ((c = getopt(argc, argv, "ve:h")) != -1) switch (c) {
        case 'e':
            extension = optarg;
            break;
        case 'v':
            verbose = true;
            break;
        case 'h':
            printusage(argv[0]);
            return 0;
        default:
            abort();
        }
    if (optind >= argc) {
        printusage(argv[0]);
        return -1;
    }
    char *dirname = argv[optind];
    size_t count;

    size_t *howmany = NULL;
    uint32_t **numbers =
        read_all_integer_files(dirname, extension, &howmany, &count);
    if (numbers == NULL) {
        printf(
            "I could not find or load any data file with extension %s in "
            "directory %s.\n",
            extension, dirname);
        return -1;
    }

    // Largest last element over all non-empty inputs.
    // presumably each input array is sorted ascending, so this is the global
    // maximum; verify against read_all_integer_files.
    uint32_t maxvalue = 0;
    for (size_t i = 0; i < count; i++) {
        if( howmany[i] > 0 ) {
            if(maxvalue < numbers[i][howmany[i]-1]) {
                maxvalue = numbers[i][howmany[i]-1];
            }
        }
    }

    // Normalisation factors: total number of values, and the summed sizes of
    // each consecutive pair of inputs.
    uint64_t totalcard = 0;
    for (size_t i = 0; i < count; i++) {
        totalcard += howmany[i];
    }
    uint64_t successivecard = 0;
    for (size_t i = 1; i < count; i++) {
        successivecard += howmany[i-1] + howmany[i];
    }

    uint64_t cycles_start = 0, cycles_final = 0;

    // The creation time is measured here but not stored in data[].
    RDTSC_START(cycles_start);
    std::vector<vector> bitmaps = create_all_bitmaps(howmany, numbers, count);
    RDTSC_FINAL(cycles_final);
    if (bitmaps.empty()) return -1;
    if(verbose) printf("Loaded %d bitmaps from directory %s \n", (int)count, dirname);
    uint64_t totalsize = getMemUsageInBytes();
    data[0] = totalsize;

    if(verbose) printf("Total size in bytes = %" PRIu64 " \n", totalsize);

    uint64_t successive_and = 0;
    uint64_t successive_or = 0;
    uint64_t total_or = 0;
    uint64_t total_count = 0;
    uint64_t successive_andnot = 0;
    uint64_t successive_xor = 0;

    // data[1]: intersection of each consecutive pair, materialised.
    RDTSC_START(cycles_start);
    for (int i = 0; i < (int)count - 1; ++i) {
        vector v;
        std::set_intersection(bitmaps[i].begin(), bitmaps[i].end(),bitmaps[i+1].begin(), bitmaps[i+1].end(),std::back_inserter(v));
        successive_and += v.size();
    }
    RDTSC_FINAL(cycles_final);
    data[1] = cycles_final - cycles_start;
    if(verbose) printf("Successive intersections on %zu bitmaps took %" PRIu64 " cycles\n", count,
                           cycles_final - cycles_start);

    // data[2]: union of each consecutive pair, materialised.
    RDTSC_START(cycles_start);
    for (int i = 0; i < (int)count - 1; ++i) {
        vector v;
        std::set_union(bitmaps[i].begin(), bitmaps[i].end(),bitmaps[i+1].begin(), bitmaps[i+1].end(),std::back_inserter(v));
        successive_or += v.size();
    }
    RDTSC_FINAL(cycles_final);
    data[2] = cycles_final - cycles_start;
    if(verbose) printf("Successive unions on %zu bitmaps took %" PRIu64 " cycles\n", count,
                           cycles_final - cycles_start);

    // data[3]: union of all inputs, folding one input at a time.
    RDTSC_START(cycles_start);
    if(count>1) {
        vector v;
        std::set_union(bitmaps[0].begin(), bitmaps[0].end(),bitmaps[1].begin(), bitmaps[1].end(),std::back_inserter(v));
        for (int i = 2; i < (int)count ; ++i) {
            vector newv;
            std::set_union(v.begin(), v.end(),bitmaps[i].begin(), bitmaps[i].end(),std::back_inserter(newv));
            v.swap(newv);
        }
        total_or = v.size();
    }
    RDTSC_FINAL(cycles_final);
    data[3] = cycles_final - cycles_start;
    if(verbose) printf("Total naive unions on %zu bitmaps took %" PRIu64 " cycles\n", count,
                           cycles_final - cycles_start);

    // data[4]: union of all inputs through fast_logicalor; overwrites total_or.
    RDTSC_START(cycles_start);
    if(count>1) {
        const vector ** allofthem = new const vector* [count];
        for(int i = 0 ; i < (int) count; ++i) allofthem[i] = & bitmaps[i];
        vector totalorbitmap = fast_logicalor(count, allofthem);
        total_or = totalorbitmap.size();
        delete[] allofthem;
    }
    RDTSC_FINAL(cycles_final);
    data[4] = cycles_final - cycles_start;
    if(verbose) printf("Total heap unions on %zu bitmaps took %" PRIu64 " cycles\n", count,
                           cycles_final - cycles_start);

    // data[5]: three binary searches per input, at 1/4, 1/2 and 3/4 of maxvalue.
    RDTSC_START(cycles_start);
    uint64_t quartcount = 0;
    for (size_t i = 0; i < count ; ++i) {
        if ( std::binary_search(bitmaps[i].begin(),bitmaps[i].end(),maxvalue/4 ) )
            quartcount ++;
        if ( std::binary_search(bitmaps[i].begin(),bitmaps[i].end(),maxvalue/2 ) )
            quartcount ++;
        if ( std::binary_search(bitmaps[i].begin(),bitmaps[i].end(),3*maxvalue/4 ) )
            quartcount ++;
    }
    RDTSC_FINAL(cycles_final);
    data[5] = cycles_final - cycles_start;

    if(verbose) printf("Quartile queries on %zu bitmaps took %" PRIu64 " cycles\n", count,
                           cycles_final - cycles_start);

    if(verbose) printf("Collected stats  %" PRIu64 "  %" PRIu64 "  %" PRIu64 " %" PRIu64 "\n",successive_and,successive_or,total_or,quartcount);

    // data[6]: difference of each consecutive pair, materialised.
    RDTSC_START(cycles_start);
    for (int i = 0; i < (int)count - 1; ++i) {
        vector v;
        std::set_difference(bitmaps[i].begin(), bitmaps[i].end(),bitmaps[i+1].begin(), bitmaps[i+1].end(),std::back_inserter(v));
        successive_andnot += v.size();
    }
    RDTSC_FINAL(cycles_final);
    data[6] = cycles_final - cycles_start;
    if(verbose) printf("Successive differences on %zu bitmaps took %" PRIu64 " cycles\n", count,
                           cycles_final - cycles_start);

    // data[7]: symmetric difference of each consecutive pair, materialised.
    RDTSC_START(cycles_start);
    for (int i = 0; i < (int)count - 1; ++i) {
        vector v;
        std::set_symmetric_difference(bitmaps[i].begin(), bitmaps[i].end(),bitmaps[i+1].begin(), bitmaps[i+1].end(),std::back_inserter(v));
        successive_xor += v.size();
    }
    RDTSC_FINAL(cycles_final);
    data[7] = cycles_final - cycles_start;
    if(verbose) printf("Successive symmetric differences on %zu bitmaps took %" PRIu64 " cycles\n", count,
                           cycles_final - cycles_start);

    // data[8]: visit every element of every input once.
    RDTSC_START(cycles_start);
    for (size_t i = 0; i < count; ++i) {
        vector & b = bitmaps[i];
        for(auto j = b.begin(); j != b.end() ; j++) {
            total_count++;
        }
    }
    RDTSC_FINAL(cycles_final);
    data[8] = cycles_final - cycles_start;
    assert(total_count == totalcard);

    if(verbose) printf("Iterating over %zu bitmaps took %" PRIu64 " cycles\n", count,
                           cycles_final - cycles_start);

    // Sanity check: |A xor B| + |A and B| == |A or B| summed over all pairs.
    assert(successive_xor + successive_and == successive_or);

    /**
    * and, or, andnot and xor cardinality
    */
    // These runs repeat the four pairwise operations but send the output to
    // inserter(counter) instead of a vector.
    // NOTE(review): inserter is defined elsewhere; presumably it only counts
    // the produced elements — the asserts below rely on that.
    uint64_t successive_andcard = 0;
    uint64_t successive_orcard = 0;
    uint64_t successive_andnotcard = 0;
    uint64_t successive_xorcard = 0;

    RDTSC_START(cycles_start);
    for (int i = 0; i < (int)count - 1; ++i) {
        std::set_intersection(bitmaps[i].begin(), bitmaps[i].end(),bitmaps[i+1].begin(), bitmaps[i+1].end(),inserter(successive_andcard));
    }
    RDTSC_FINAL(cycles_final);
    data[9] = cycles_final - cycles_start;

    RDTSC_START(cycles_start);
    for (int i = 0; i < (int)count - 1; ++i) {
        std::set_union(bitmaps[i].begin(), bitmaps[i].end(),bitmaps[i+1].begin(), bitmaps[i+1].end(),inserter(successive_orcard));
    }
    RDTSC_FINAL(cycles_final);
    data[10] = cycles_final - cycles_start;

    RDTSC_START(cycles_start);
    for (int i = 0; i < (int)count - 1; ++i) {
        std::set_difference(bitmaps[i].begin(), bitmaps[i].end(),bitmaps[i+1].begin(), bitmaps[i+1].end(),inserter(successive_andnotcard));
    }
    RDTSC_FINAL(cycles_final);
    data[11] = cycles_final - cycles_start;

    RDTSC_START(cycles_start);
    for (int i = 0; i < (int)count - 1; ++i) {
        std::set_symmetric_difference(bitmaps[i].begin(), bitmaps[i].end(),bitmaps[i+1].begin(), bitmaps[i+1].end(),inserter(successive_xorcard));
    }
    RDTSC_FINAL(cycles_final);
    data[12] = cycles_final - cycles_start;

    // The counting runs must agree with the materialised runs.
    assert(successive_andcard == successive_and);
    assert(successive_orcard == successive_or);
    assert(successive_xorcard == successive_xor);
    assert(successive_andnotcard == successive_andnot);

    /**
    * end and, or, andnot and xor cardinality
    */

    // One result line: data[0] is scaled by 8 and divided by the value count,
    // the timings are divided by the number of values they processed.
    printf(" %20.2f %20.2f %20.2f %20.2f %20.2f %20.2f %20.2f %20.2f %20.2f %20.2f %20.2f %20.2f %20.2f\n",
           data[0]*8.0/totalcard,
           data[1]*1.0/successivecard,
           data[2]*1.0/successivecard,
           data[3]*1.0/totalcard,
           data[4]*1.0/totalcard,
           data[5]*1.0/(3*count),
           data[6]*1.0/successivecard,
           data[7]*1.0/successivecard,
           data[8]*1.0/totalcard,
           data[9]*1.0/successivecard,
           data[10]*1.0/successivecard,
           data[11]*1.0/successivecard,
           data[12]*1.0/successivecard
          );

    for (int i = 0; i < (int)count; ++i) {
        free(numbers[i]);
        numbers[i] = NULL;  // paranoid
    }
    free(howmany);
    free(numbers);

    return 0;
}
/*
 * Small demonstration of cycle counting with RDTSC_START()/RDTSC_STOP().
 * The macros take no arguments here, so they are expected to fill the local
 * variables start_hi/start_lo and end_hi/end_lo; elapsed() combines them.
 *
 * Four measurements are printed: one sleep(1), then REPEAT trials each of a
 * printf call, an empty region, and a two-iteration loop, with averages.
 *
 * Cycle counts are 64-bit values and are printed with %llu after a cast to
 * unsigned long long, because %ld does not match uint64_t and is wrong on
 * targets where long is 32 bits.
 */
int main(void)
{
	uint32_t start_hi=0, start_lo=0;
	uint32_t end_hi=0,   end_lo=0;

	RDTSC_START();
	sleep(1);
	RDTSC_STOP();
	printf("elapsed: %llu (sleep(1))\n",
	       (unsigned long long)elapsed(start_hi, start_lo, end_hi, end_lo));

	printf("\n\n\n");

	// For the rest of our tests, lets use loops to get more accurate numbers.
#define REPEAT 100

	uint64_t totalTime = 0;
	for(int i=0; i<REPEAT; i++)
	{
		RDTSC_START();
		printf("printing!\n"); // how fast is printf()?
		RDTSC_STOP();
		uint64_t e = elapsed(start_hi, start_lo, end_hi, end_lo);
		printf("trial %d: %llu (printf)\n", i, (unsigned long long)e);
		totalTime += e;
	}
	printf("average: %f\n", totalTime/(float)REPEAT);

	printf("\n\n\n");

	totalTime = 0;
	for(int i=0; i<REPEAT; i++)
	{
		RDTSC_START();
		// how fast is nothing at all?
		RDTSC_STOP();
		uint64_t e = elapsed(start_hi, start_lo, end_hi, end_lo);
		printf("trial %d: %llu (NOTHING)\n", i, (unsigned long long)e);
		totalTime += e;
	}
	printf("average: %f\n", totalTime/(float)REPEAT);

	printf("\n\n\n");

	totalTime = 0;
	for(int i=0; i<REPEAT; i++)
	{
		/* volatile keeps the store inside the timed loop from being removed */
		volatile int var = 0;
		int k=0;
		RDTSC_START();
		// how fast is a loop that we can choose how many times it runs?
		for(; k<2; k++) // Change how many times this loop runs, see what happens.
			(var) = 1;
		RDTSC_STOP();
		uint64_t e = elapsed(start_hi, start_lo, end_hi, end_lo);
		printf("trial %d: %llu (loop)\n", i, (unsigned long long)e);
		totalTime += e;
	}
	printf("average: %f\n", totalTime/(float)REPEAT);

	return 0;
}
// Entry point of a worker thread. pData points to the THREAD_DATA for this
// worker. The thread binds itself to its processor, enables flush-to-zero /
// denormals-are-zero, then loops until threadPool.inThreadShutdown is set:
// it spins briefly waiting for work, sleeps on FifosNotEmpty when there is
// none, and otherwise runs back-end and/or front-end work.
// NOTE(review): IsBEThread and IsFEThread are not declared in this block;
// presumably they are template parameters or constants declared just above.
// Always returns 0.
DWORD workerThreadMain(LPVOID pData)
{
    THREAD_DATA *pThreadData = (THREAD_DATA*)pData;
    SWR_CONTEXT *pContext = pThreadData->pContext;
    uint32_t threadId = pThreadData->threadId;
    uint32_t workerId = pThreadData->workerId;

    bindThread(threadId, pThreadData->procGroupId, pThreadData->forceBindProcGroup);

    RDTSC_INIT(threadId);

    uint32_t numaNode = pThreadData->numaId;
    uint32_t numaMask = pContext->threadPool.numaMask;

    // flush denormals to 0
    _mm_setcsr(_mm_getcsr() | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON);

    // Track tiles locked by other threads. If we try to lock a macrotile and find its already
    // locked then we'll add it to this list so that we don't try and lock it again.
    TileSet lockedTiles;

    // each worker has the ability to work on any of the queued draws as long as certain
    // conditions are met. the data associated
    // with a draw is guaranteed to be active as long as a worker hasn't signaled that he
    // has moved on to the next draw when he determines there is no more work to do. The api
    // thread will not increment the head of the dc ring until all workers have moved past the
    // current head.
    // the logic to determine what to work on is:
    // 1- try to work on the FE any draw that is queued. For now there are no dependencies
    //    on the FE work, so any worker can grab any FE and process in parallel.  Eventually
    //    we'll need dependency tracking to force serialization on FEs.  The worker will try
    //    to pick an FE by atomically incrementing a counter in the swr context.  he'll keep
    //    trying until he reaches the tail.
    // 2- BE work must be done in strict order. we accomplish this today by pulling work off
    //    the oldest draw (ie the head) of the dcRing. the worker can determine if there is
    //    any work left by comparing the total # of binned work items and the total # of completed
    //    work items. If they are equal, then there is no more work to do for this draw, and
    //    the worker can safely increment its oldestDraw counter and move on to the next draw.

    // The lock is created unlocked (defer_lock) and only taken before waiting.
    std::unique_lock<std::mutex> lock(pContext->WaitLock, std::defer_lock);

    // A thread has work while its draw counter differs from the ring head.
    auto threadHasWork = [&](uint64_t curDraw) { return curDraw != pContext->dcRing.GetHead(); };

    uint64_t curDrawBE = 0;
    uint64_t curDrawFE = 0;

    while (pContext->threadPool.inThreadShutdown == false)
    {
        // Spin for up to KNOB_WORKER_SPIN_LOOP_COUNT iterations before
        // falling back to a blocking wait.
        uint32_t loop = 0;
        while (loop++ < KNOB_WORKER_SPIN_LOOP_COUNT && !threadHasWork(curDrawBE))
        {
            _mm_pause();
        }

        if (!threadHasWork(curDrawBE))
        {
            lock.lock();

            // check for thread idle condition again under lock
            if (threadHasWork(curDrawBE))
            {
                lock.unlock();
                continue;
            }

            if (pContext->threadPool.inThreadShutdown)
            {
                lock.unlock();
                break;
            }

            RDTSC_START(WorkerWaitForThreadEvent);

            pContext->FifosNotEmpty.wait(lock);
            lock.unlock();

            RDTSC_STOP(WorkerWaitForThreadEvent, 0, 0);

            // Shutdown may have been requested while this thread was asleep.
            if (pContext->threadPool.inThreadShutdown)
            {
                break;
            }
        }

        if (IsBEThread)
        {
            RDTSC_START(WorkerWorkOnFifoBE);
            WorkOnFifoBE(pContext, workerId, curDrawBE, lockedTiles, numaNode, numaMask);
            RDTSC_STOP(WorkerWorkOnFifoBE, 0, 0);

            WorkOnCompute(pContext, workerId, curDrawBE);
        }

        if (IsFEThread)
        {
            WorkOnFifoFE(pContext, workerId, curDrawFE);

            // An FE-only thread never advances curDrawBE through back-end
            // work, so mirror the FE counter; threadHasWork() above tests
            // curDrawBE.
            if (!IsBEThread)
            {
                curDrawBE = curDrawFE;
            }
        }
    }

    return 0;
}
////////////////////////////////////////////////////////////////////////// /// @brief If there is any BE work then go work on it. /// @param pContext - pointer to SWR context. /// @param workerId - The unique worker ID that is assigned to this thread. /// @param curDrawBE - This tracks the draw contexts that this thread has processed. Each worker thread /// has its own curDrawBE counter and this ensures that each worker processes all the /// draws in order. /// @param lockedTiles - This is the set of tiles locked by other threads. Each thread maintains its /// own set and each time it fails to lock a macrotile, because its already locked, /// then it will add that tile to the lockedTiles set. As a worker begins to work /// on future draws the lockedTiles ensure that it doesn't work on tiles that may /// still have work pending in a previous draw. Additionally, the lockedTiles is /// hueristic that can steer a worker back to the same macrotile that it had been /// working on in a previous draw. void WorkOnFifoBE( SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawBE, TileSet& lockedTiles, uint32_t numaNode, uint32_t numaMask) { // Find the first incomplete draw that has pending work. If no such draw is found then // return. FindFirstIncompleteDraw is responsible for incrementing the curDrawBE. uint64_t drawEnqueued = 0; if (FindFirstIncompleteDraw(pContext, curDrawBE, drawEnqueued) == false) { return; } uint64_t lastRetiredDraw = pContext->dcRing[curDrawBE % KNOB_MAX_DRAWS_IN_FLIGHT].drawId - 1; // Reset our history for locked tiles. We'll have to re-learn which tiles are locked. lockedTiles.clear(); // Try to work on each draw in order of the available draws in flight. // 1. If we're on curDrawBE, we can work on any macrotile that is available. // 2. If we're trying to work on draws after curDrawBE, we are restricted to // working on those macrotiles that are known to be complete in the prior draw to // maintain order. 
The locked tiles provides the history to ensures this. for (uint64_t i = curDrawBE; i < drawEnqueued; ++i) { DRAW_CONTEXT *pDC = &pContext->dcRing[i % KNOB_MAX_DRAWS_IN_FLIGHT]; if (pDC->isCompute) return; // We don't look at compute work. // First wait for FE to be finished with this draw. This keeps threading model simple // but if there are lots of bubbles between draws then serializing FE and BE may // need to be revisited. if (!pDC->doneFE) return; // If this draw is dependent on a previous draw then we need to bail. if (CheckDependency(pContext, pDC, lastRetiredDraw)) { return; } // Grab the list of all dirty macrotiles. A tile is dirty if it has work queued to it. std::vector<uint32_t> ¯oTiles = pDC->pTileMgr->getDirtyTiles(); for (uint32_t tileID : macroTiles) { // Only work on tiles for for this numa node uint32_t x, y; pDC->pTileMgr->getTileIndices(tileID, x, y); if (((x ^ y) & numaMask) != numaNode) { continue; } MacroTileQueue &tile = pDC->pTileMgr->getMacroTileQueue(tileID); if (!tile.getNumQueued()) { continue; } // can only work on this draw if it's not in use by other threads if (lockedTiles.find(tileID) != lockedTiles.end()) { continue; } if (tile.tryLock()) { BE_WORK *pWork; RDTSC_START(WorkerFoundWork); uint32_t numWorkItems = tile.getNumQueued(); SWR_ASSERT(numWorkItems); pWork = tile.peek(); SWR_ASSERT(pWork); if (pWork->type == DRAW) { pContext->pHotTileMgr->InitializeHotTiles(pContext, pDC, tileID); } while ((pWork = tile.peek()) != nullptr) { pWork->pfnWork(pDC, workerId, tileID, &pWork->desc); tile.dequeue(); } RDTSC_STOP(WorkerFoundWork, numWorkItems, pDC->drawId); _ReadWriteBarrier(); pDC->pTileMgr->markTileComplete(tileID); // Optimization: If the draw is complete and we're the last one to have worked on it then // we can reset the locked list as we know that all previous draws before the next are guaranteed to be complete. 
if ((curDrawBE == i) && pDC->pTileMgr->isWorkComplete()) { // We can increment the current BE and safely move to next draw since we know this draw is complete. curDrawBE++; CompleteDrawContext(pContext, pDC); lastRetiredDraw++; lockedTiles.clear(); break; } } else { // This tile is already locked. So let's add it to our locked tiles set. This way we don't try locking this one again. lockedTiles.insert(tileID); } } } }
// Returns the current draw context of pContext, obtaining and initialising a
// new one from the draw context ring when there is none.
//   pContext    - SWR context that owns the rings.
//   isSplitDraw - when true, the new draw context shares the state of the
//                 previous draw context instead of receiving a copy in a
//                 fresh state ring entry. Asserted to be false when a
//                 current draw context already exists.
DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false)
{
    RDTSC_START(APIGetDrawContext);
    // If current draw context is null then need to obtain a new draw context to use from ring.
    if (pContext->pCurDrawContext == nullptr)
    {
        uint32_t dcIndex = pContext->nextDrawId % KNOB_MAX_DRAWS_IN_FLIGHT;

        DRAW_CONTEXT* pCurDrawContext = &pContext->dcRing[dcIndex];
        pContext->pCurDrawContext = pCurDrawContext;

        // Update LastRetiredId
        UpdateLastRetiredId(pContext);

        // Need to wait until this draw context is available to use.
        while (StillDrawing(pContext, pCurDrawContext))
        {
            // Make sure workers are working.
            WakeAllThreads(pContext);

            _mm_pause();
        }

        // Assign next available entry in DS ring to this DC.
        uint32_t dsIndex = pContext->curStateId % KNOB_MAX_DRAWS_IN_FLIGHT;
        pCurDrawContext->pState = &pContext->dsRing[dsIndex];

        // Arena of the state entry selected above (bound before any pointer
        // swap in the split-draw branch below).
        Arena& stateArena = pCurDrawContext->pState->arena;

        // Copy previous state to current state.
        if (pContext->pPrevDrawContext)
        {
            DRAW_CONTEXT* pPrevDrawContext = pContext->pPrevDrawContext;

            // If we're splitting our draw then we can just use the same state from the previous
            // draw. In this case, we won't increment the DS ring index so the next non-split
            // draw can receive the state.
            if (isSplitDraw == false)
            {
                CopyState(*pCurDrawContext->pState, *pPrevDrawContext->pState);

                stateArena.Reset();    // Reset memory.

                // Copy private state to new context.
                if (pPrevDrawContext->pState->pPrivateState != nullptr)
                {
                    pCurDrawContext->pState->pPrivateState = stateArena.AllocAligned(pContext->privateStateSize, KNOB_SIMD_WIDTH*sizeof(float));
                    memcpy(pCurDrawContext->pState->pPrivateState, pPrevDrawContext->pState->pPrivateState, pContext->privateStateSize);
                }

                pContext->curStateId++;  // Progress state ring index forward.
            }
            else
            {
                // If its a split draw then just copy the state pointer over
                // since its the same draw.
                pCurDrawContext->pState = pPrevDrawContext->pState;
            }
        }
        else
        {
            // No previous draw context: nothing to copy, start from the
            // selected state entry with its arena reset.
            stateArena.Reset(); // Reset memory.
            pContext->curStateId++;  // Progress state ring index forward.
        }

        // Reset the per-draw bookkeeping of the new draw context.
        pCurDrawContext->dependency = 0;
        pCurDrawContext->arena.Reset();
        pCurDrawContext->pContext = pContext;
        pCurDrawContext->isCompute = false; // Dispatch has to set this to true.
        pCurDrawContext->inUse = false;

        pCurDrawContext->doneCompute = false;
        pCurDrawContext->doneFE = false;
        pCurDrawContext->FeLock = 0;

        pCurDrawContext->pTileMgr->initialize();

        // Assign unique drawId for this DC
        pCurDrawContext->drawId = pContext->nextDrawId++;
    }
    else
    {
        SWR_ASSERT(isSplitDraw == false, "Split draw should only be used when obtaining a new DC");
    }

    RDTSC_STOP(APIGetDrawContext, 0, 0);
    return pContext->pCurDrawContext;
}
int run_merge(int c, char* src1, char* src2, char* dst, float value, int times){ if(value>1) value=1; else if(value<0) value=0; BMP* bmp1 = bmp_read(src1); BMP* bmp2 = bmp_read(src2); if(bmp1==0 || bmp2==0) { return -1;} // open error uint8_t* data1 = bmp_get_data(bmp1); uint8_t* data2 = bmp_get_data(bmp2); uint32_t h1 = *(bmp_get_h(bmp1)); uint32_t w1 = *(bmp_get_w(bmp1)); uint32_t h2 = *(bmp_get_h(bmp2)); uint32_t w2 = *(bmp_get_w(bmp2)); if(w1%4!=0 || w2%4!=0) { return -1;} // do not support padding if( w1!=w2 || h1!=h2 ) { return -1;} // different image size uint8_t* data1C = 0; uint8_t* data2C = 0; if(*(bmp_get_bitcount(bmp1)) == 24) { data1C = malloc(sizeof(uint8_t)*4*h1*w1); data2C = malloc(sizeof(uint8_t)*4*h2*w2); to32(w1,h1,data1,data1C); to32(w2,h2,data2,data2C); } else { data1C = data1; data2C = data2; } unsigned long start, end; switch(c){ case 0: RDTSC_START(start); C_merge(w1,h1,data1C,data2C,value); RDTSC_STOP(end); break; case 1: RDTSC_START(start); ASM_merge1(w1,h1,data1C,data2C,value); RDTSC_STOP(end); break; case 2: RDTSC_START(start); ASM_merge2(w1,h1,data1C,data2C,value); RDTSC_STOP(end); break; default: return -1; break; } unsigned long delta = end - start; printf("%lu", delta); if(*(bmp_get_bitcount(bmp1)) == 24) { to24(w1,h1,data1C,data1); free(data1C); free(data2C); } bmp_delete(bmp1); bmp_delete(bmp2); return 0; }
/*
 * Benchmark driver: binsearch [random-seed]
 *
 * For each array length, builds a sorted array and a set of search targets,
 * computes reference indexes, and then times parallel_search,
 * portable_parallel_search and portable_parallel_search2 for a range of
 * batch sizes, printing cycles per search. Only the parallel_search result
 * is checked against the reference via verify_indexes.
 *
 * Returns 0 on success, 1 if a result buffer cannot be allocated.
 */
int main(int argc, char **argv) {
    // Optional first argument seeds the random number generator.
    if (argc > 1) srand(atoi(argv[1]));

#ifdef POWER_OF_TWO
    size_t lengths[] = { 256, 1024, 256 * 256, 256 * 1024, 1024*1024,
                         16*1024*1024, 256*1024*1024 };
#else
    size_t lengths[] = { 100, 1000, 10*1000, 100*1000, 1000*1000,
                         10*1000*1000, 100*1000*1000 };
#endif

    // Batch sizes are clamped to the target count below.
    size_t batches[] = { 1, 2, 4, 6, 8, 10, 12, 14, 16, 20, 40, 60, 100, 200,
                         400, 1000, UINT32_MAX };

    for (size_t n = 0; n < COUNT(lengths); n++) {
        size_t length = lengths[n];
        size_t count = 1000*1000;

        uint32_t *targets = create_search_targets(count, 2*length);
        printf("%zu Targets with Array Length", count);
        fflush(NULL);

        uint32_t *array = create_sorted_array(length, 2*length);
        printf(" %zu\n", length);

        size_t *reference_indexes = malloc(count * sizeof(*reference_indexes));
        size_t *indexes = malloc(count * sizeof(*indexes));
        if (reference_indexes == NULL || indexes == NULL) {
            fprintf(stderr, "out of memory\n");
            free(indexes);
            free(reference_indexes);
            free(array);
            free(targets);
            return 1;
        }

#ifdef LINEAR_REFERENCE
        linear_search(array, length, targets, reference_indexes, count);
#else
        parallel_search(array, length, targets, reference_indexes, count);
#endif

        for (size_t i = 0; i < COUNT(batches); i++) {
            size_t batch = batches[i];
            float cycles_per_search;
            uint64_t cycles_start, cycles_final;

            memset(indexes, 0, count * sizeof(*indexes));
            if (batch > count) batch = count;

            printf(" Batch %3zu: ", batch);
            fflush(NULL);

            // The final chunk may be shorter than `batch`. It is computed
            // per iteration so that `batch` itself is never modified and all
            // three variants are timed with the batch size printed above.
            RDTSC_START(cycles_start);
            for (size_t c = 0; c < count; c += batch) {
                size_t chunk = (count - c < batch) ? count - c : batch;
                parallel_search(array, length, targets + c, indexes + c, chunk);
            }
            RDTSC_FINAL(cycles_final);
            cycles_per_search = (cycles_final - cycles_start) / (float) count;
            printf("cmov?: %.2f cycles/search ", cycles_per_search);

            verify_indexes(reference_indexes, indexes, count, array, targets, length);

            RDTSC_START(cycles_start);
            for (size_t c = 0; c < count; c += batch) {
                size_t chunk = (count - c < batch) ? count - c : batch;
                portable_parallel_search(array, length, targets + c, indexes + c, chunk);
            }
            RDTSC_FINAL(cycles_final);
            cycles_per_search = (cycles_final - cycles_start) / (float) count;
            printf(", portable: %.2f cycles/search", cycles_per_search);

            RDTSC_START(cycles_start);
            for (size_t c = 0; c < count; c += batch) {
                size_t chunk = (count - c < batch) ? count - c : batch;
                portable_parallel_search2(array, length, targets + c, indexes + c, chunk);
            }
            RDTSC_FINAL(cycles_final);
            cycles_per_search = (cycles_final - cycles_start) / (float) count;
            printf(", portable2: %.2f cycles/search", cycles_per_search);

            printf("\n");
        }

        free(array);
        free(targets);
        free(indexes);
        free(reference_indexes);
        printf("\n");
    }

    return 0;
}
int main(int argc, char **argv) { int c; const char *extension = ".txt"; bool copy_on_write = false; bool runoptimize = true; while ((c = getopt(argc, argv, "e:h")) != -1) switch (c) { case 'e': extension = optarg; break; case 'h': printusage(argv[0]); return 0; default: abort(); } if (optind >= argc) { printusage(argv[0]); return -1; } char *dirname = argv[optind]; size_t count; size_t *howmany = NULL; uint32_t **numbers = read_all_integer_files(dirname, extension, &howmany, &count); if (numbers == NULL) { printf( "I could not find or load any data file with extension %s in " "directory %s.\n", extension, dirname); return -1; } uint64_t cycles_start = 0, cycles_final = 0; RDTSC_START(cycles_start); roaring_bitmap_t **bitmaps = create_all_bitmaps(howmany, numbers, count, runoptimize, copy_on_write); RDTSC_FINAL(cycles_final); if (bitmaps == NULL) return -1; printf("Loaded %d bitmaps from directory %s \n", (int)count, dirname); printf("Creating %zu bitmaps took %" PRIu64 " cycles\n", count, cycles_final - cycles_start); if(count == 0) return -1; uint32_t maxvalue = roaring_bitmap_maximum(bitmaps[0]); for (int i = 1; i < (int)count; i ++) { uint32_t thismax = roaring_bitmap_maximum(bitmaps[0]); if(thismax > maxvalue) maxvalue = thismax; } const int quartile_test_repetitions = 1000; uint64_t quartcount; uint64_t cycles; STARTBEST(quartile_test_repetitions) quartcount = 0; for (size_t i = 0; i < count ; ++i) { quartcount += roaring_bitmap_contains(bitmaps[i],maxvalue/4); quartcount += roaring_bitmap_contains(bitmaps[i],maxvalue/2); quartcount += roaring_bitmap_contains(bitmaps[i],3*maxvalue/4); } ENDBEST(cycles) printf("Quartile queries on %zu bitmaps took %" PRIu64 " cycles\n", count, cycles); for (int i = 0; i < (int)count; ++i) { free(numbers[i]); numbers[i] = NULL; // paranoid roaring_bitmap_free(bitmaps[i]); bitmaps[i] = NULL; // paranoid } free(bitmaps); free(howmany); free(numbers); return (int) quartcount; }
int main() { /** Fm, where modulo = m */ mpz_t a, b, k, r, modulo; int i = 0; // loop variable Point p, next_p; p = init_point(p); mpz_init(a); mpz_init(b); mpz_init(k); mpz_init(r); // order mpz_init(modulo); /** Initialize parameters of ECC (F2p) */ mpz_set_str(a, a_v, 10); mpz_set_str(b, b_v, 16); mpz_set_str(modulo, p_v, 10); mpz_set_str(r, r_v, 10); mpz_set_str(p.x, gx_v, 16); mpz_set_str(p.y, gy_v, 16); mpz_t zero_value, k2; mpz_init(zero_value); mpz_init(k2); RDTSC_START(t1); sleep(1); // sleep for 1 second RDTSC_STOP(t2); uint64_t one_second = t2 - t1 - rdtscp_cycle; printf("Approximate number of cycles in 1 second: %lld\n\n", one_second); uint64_t one_us = one_second / 1e6; while (mpz_cmp(k, zero_value) == 0) { get_random(k, 32); // generate random test (256 bits) positive_modulo(k, k, modulo); } printf("Random k (in Binary): "); mpz_out_str(stdout, 2, k); printf("\n"); while (mpz_cmp(k2, zero_value) == 0) { get_random(k2, 32); // generate random test (256 bits) positive_modulo(k2, k2, modulo); } printf("Random k2 (in Binary): "); mpz_out_str(stdout, 2, k2); printf("\n"); /** Compare ADDITION, SHIFTING, MULTIPLICATION, and INVERSION */ if (TEST_MODULAR_OPERATION) { max_iteration = 10000; /** Addition */ i = 0; uint64_t total = 0; while (i < max_iteration) { RDTSC_START(t1); // start operation mpz_add(k, k, k2); positive_modulo(k, k, modulo); RDTSC_STOP(t2); // stop operation total += t2 - t1 - rdtscp_cycle; i++; } printf("--[ADDITION]--\n"); print_result(total, one_us); /** Shifting */ i = 0; uint64_t total2 = 0; mpz_t two; mpz_init(two); mpz_set_si(two, 2); while (i < max_iteration) { RDTSC_START(t1); // start operation mpz_mul_2exp(k, k, 1); // left shift positive_modulo(k, k, modulo); RDTSC_STOP(t2); // stop operation total2 += t2 - t1 - rdtscp_cycle; i++; } printf("--[SHIFTING 2 * k]--\n"); print_result(total2, one_us); /** Multiplication */ i = 0; uint64_t total3 = 0; while (i < max_iteration) { RDTSC_START(t1); // start operation mpz_mul(k, k, k2); 
positive_modulo(k, k, modulo); RDTSC_STOP(t2); // stop operation total3 += t2 - t1 - rdtscp_cycle; i++; } printf("--[MULTIPLICATION k * k2]--\n"); print_result(total3, one_us); /** Inversion */ i = 0; uint64_t total4 = 0; while (i < max_iteration) { RDTSC_START(t1); // start operation mpz_invert(k, k, modulo); RDTSC_STOP(t2); // stop operation total4 += t2 - t1 - rdtscp_cycle; i++; } printf("--[INVERSION]--\n"); print_result(total4, one_us); } /** -------------------------------------------------------------------------*/ // Convert Affine coordinate to Jacobian coordinate J_Point j_p, j_next_p; j_next_p = init_j_point(j_next_p); j_p = affine_to_jacobian(p); // Generator point if (TEST_SCALAR_OPERATION) { max_iteration = 100; Point p1, p2, p3; J_Point j_p1, j_p2, j_p3; /** Point preparation */ p1 = init_point(p1); p2 = init_point(p2); j_p1 = init_j_point(j_p1); j_p2 = init_j_point(j_p2); j_p1 = jacobian_affine_sliding_NAF(j_p, p, a, k, modulo, 4); j_p2 = jacobian_affine_sliding_NAF(j_p, p, a, k2, modulo, 4); p1 = jacobian_to_affine(j_p1, modulo); p2 = jacobian_to_affine(j_p2, modulo); /** Affine addition */ i = 0; uint64_t total = 0; while (i < max_iteration) { RDTSC_START(t1); // start operation p3 = affine_curve_addition(p1, p2, a, modulo); RDTSC_STOP(t2); // stop operation total += t2 - t1 - rdtscp_cycle; i++; } printf("--[ADDITION in AFFINE]--\n"); print_result(total, one_us); /** Affine doubling */ i = 0; uint64_t total2 = 0; while (i < max_iteration) { RDTSC_START(t1); // start operation p3 = affine_curve_doubling(p1, a, modulo); RDTSC_STOP(t2); // stop operation total2 += t2 - t1 - rdtscp_cycle; i++; } printf("--[DOUBLING in AFFINE]--\n"); print_result(total2, one_us); /** Jacobian addition */ i = 0; uint64_t total3 = 0; while (i < max_iteration) { RDTSC_START(t1); // start operation j_p3 = jacobian_curve_addition(j_p1, j_p2, a, modulo); RDTSC_STOP(t2); // stop operation total3 += t2 - t1 - rdtscp_cycle; i++; } printf("--[ADDITION in JACOBIAN]--\n"); 
print_result(total3, one_us); /** Jacobian doubling */ i = 0; uint64_t total4 = 0; while (i < max_iteration) { RDTSC_START(t1); // start operation j_p3 = jacobian_curve_doubling(j_p1, a, modulo); RDTSC_STOP(t2); // stop operation total4 += t2 - t1 - rdtscp_cycle; i++; } printf("--[DOUBLING in JACOBIAN]--\n"); print_result(total4, one_us); /** Affine-Jacobian addition */ i = 0; uint64_t total5 = 0; while (i < max_iteration) { RDTSC_START(t1); // start operation j_p3 = jacobian_affine_curve_addition(j_p1, p2, a, modulo); RDTSC_STOP(t2); // stop operation total5 += t2 - t1 - rdtscp_cycle; i++; } printf("--[ADDITION in JACOBIAN-AFFINE]--\n"); print_result(total5, one_us); } /** -------------------------------------------------------------------------*/ if (TEST_SCALAR_ALGORITHM) { max_iteration = 100; /** Test Left-to-right binary algorithm */ i = 0; uint64_t total = 0; while (i < max_iteration) { RDTSC_START(t1); // start operation next_p = affine_left_to_right_binary(p, a, k, modulo); // Q = [k]P // gmp_printf("%Zd %Zd\n", next_p.x, next_p.y); RDTSC_STOP(t2); // stop operation total += t2 - t1 - rdtscp_cycle; i++; } printf("--[AFFINE] Left to right binary algorithm--\n"); print_result(total, one_us); i = 0; uint64_t total2 = 0; while (i < max_iteration) { RDTSC_START(t1); // start operation j_next_p = jacobian_left_to_right_binary(j_p, a, k, modulo); // Q = [k]P // gmp_printf("%Zd %Zd\n", j_next_p.X, j_next_p.Y); next_p = jacobian_to_affine(j_next_p, modulo); // gmp_printf("%Zd %Zd\n", next_p.x, next_p.y); RDTSC_STOP(t2); // stop operation total2 += t2 - t1 - rdtscp_cycle; i++; } printf("--[JACOBIAN] Left to right binary algorithm--\n"); print_result(total2, one_us); i = 0; uint64_t total3 = 0; while (i < max_iteration) { RDTSC_START(t1); // start operation j_next_p = jacobian_affine_left_to_right_binary(j_p, p, a, k, modulo); // Q = [k]P // gmp_printf("%Zd %Zd\n", j_next_p.X, j_next_p.Y); next_p = jacobian_to_affine(j_next_p, modulo); // gmp_printf("%Zd %Zd\n", 
next_p.x, next_p.y); RDTSC_STOP(t2); // stop operation total3 += t2 - t1 - rdtscp_cycle; i++; } printf("--[JACOBIAN-AFFINE] Left to right binary algorithm--\n"); print_result(total3, one_us); int w = 4; // windows size i = 0; uint64_t total4 = 0; while (i < max_iteration) { RDTSC_START(t1); // start operation j_next_p = jacobian_affine_sliding_NAF(j_p, p, a, k, modulo, w); // Q = [k]P // gmp_printf("%Zd %Zd\n", j_next_p.X, j_next_p.Y); next_p = jacobian_to_affine(j_next_p, modulo); // gmp_printf("%Zd %Zd\n", next_p.x, next_p.y); RDTSC_STOP(t2); // stop operation total4 += t2 - t1 - rdtscp_cycle; i++; } printf("--[JACOBIAN-AFFINE] Sliding NAF Left to right binary algorithm (w = 4)--\n"); print_result(total4, one_us); w = 5; // windows size i = 0; uint64_t total5 = 0; while (i < max_iteration) { RDTSC_START(t1); // start operation j_next_p = jacobian_affine_sliding_NAF(j_p, p, a, k, modulo, w); // Q = [k]P // gmp_printf("%Zd %Zd\n", j_next_p.X, j_next_p.Y); next_p = jacobian_to_affine(j_next_p, modulo); // gmp_printf("%Zd %Zd\n", next_p.x, next_p.y); RDTSC_STOP(t2); // stop operation total5 += t2 - t1 - rdtscp_cycle; i++; } printf("--[JACOBIAN-AFFINE] Sliding NAF Left to right binary algorithm (w = 5)--\n"); print_result(total5, one_us); /** Test Right-to-left binary algorithm */ i = 0; uint64_t total6 = 0; while (i < max_iteration) { RDTSC_START(t1); // start operation next_p = affine_right_to_left_binary(p, a, k, modulo); // Q = [k]P // gmp_printf("%Zd %Zd\n", next_p.x, next_p.y); RDTSC_STOP(t2); // stop operation total6 += t2 - t1 - rdtscp_cycle; i++; } printf("--[AFFINE] Right to left binary algorithm--\n"); print_result(total6, one_us); /** Test Montgomery ladder algorithm (Against time-based attack) */ i = 0; uint64_t total7 = 0; while (i < max_iteration) { RDTSC_START(t1); // start operation j_next_p = jacobian_montgomery_ladder(j_p, a, k, modulo); // Q = [k]P // gmp_printf("%Zd %Zd\n", j_next_p.X, j_next_p.Y); next_p = jacobian_to_affine(j_next_p, modulo); // 
gmp_printf("%Zd %Zd\n", next_p.x, next_p.y); RDTSC_STOP(t2); // stop operation total7 += t2 - t1 - rdtscp_cycle; i++; } printf("--[JACOBIAN] Montgomery ladder algorithm--\n"); print_result(total7, one_us); } /** -------------------------------------------------------------------------*/ J_Point public_key_1, public_key_2, shared_key; mpz_t private_key_1, private_key_2; mpz_init(private_key_1); mpz_init(private_key_2); // TODO : Key should be padded to fixed size (serializable) // Note: (2^-256 chance of failure, can be ignored) while (mpz_cmp(private_key_1, zero_value) == 0) { get_random(private_key_1, 32); // 256 bit positive_modulo(private_key_1, private_key_1, modulo); } while (mpz_cmp(private_key_2, zero_value) == 0) { get_random(private_key_2, 32); // 256 bit positive_modulo(private_key_2, private_key_2, modulo); } gmp_printf("Private key [A B]: %Zd %Zd\n\n", private_key_1, private_key_2); public_key_1 = jacobian_left_to_right_binary(j_p, a, private_key_1, modulo); public_key_2 = jacobian_left_to_right_binary(j_p, a, private_key_2, modulo); gmp_printf("Public key 1 - Jacobian [X Y Z]: %Zd %Zd %Zd\n", public_key_1.X, public_key_1.Y, public_key_1.Z); gmp_printf("Public key 2 - Jacobian [X Y Z]: %Zd %Zd %Zd\n", public_key_2.X, public_key_2.Y, public_key_2.Z); Point public_key_1_decoded = jacobian_to_affine(public_key_1, modulo); Point public_key_2_decoded = jacobian_to_affine(public_key_2, modulo); gmp_printf("Public key 1 - Affine [X Y]: %Zd %Zd\n", public_key_1_decoded.x, public_key_1_decoded.y); gmp_printf("Public key 2 - Affine [X Y]: %Zd %Zd\n\n", public_key_2_decoded.x, public_key_2_decoded.y); /** -------------------------------------------------------------------------*/ if (TEST_ENCRYPT_DECRYPT) { // ElGamal Encrypt - Decrypt (Map message to chunk of points in EC) J_Point message, chosen_point, encoded_point, decoded_point; mpz_t k_message; mpz_init(k_message); mpz_set_ui(k_message, 123456789); message = jacobian_left_to_right_binary(j_p, a, k_message, 
modulo); Point message_decoded = jacobian_to_affine(message, modulo); gmp_printf("[Encrypt] Message - Affine [X Y] %Zd %Zd\n", message_decoded.x, message_decoded.y); gmp_printf("[Encrypt] Message - Jacobian [X Y Z]: %Zd %Zd %Zd\n", message.X, message.Y, message.Z); while (mpz_cmp(k_message, zero_value) == 0) { get_random(k_message, 32); positive_modulo(k_message, k_message, modulo); } // Encrypt example chosen_point = jacobian_left_to_right_binary(j_p, a, k_message, modulo); // chosen point (r) gmp_printf("[Encrypt] Chosen point - Jacobian [X Y Z]: %Zd %Zd %Zd\n", chosen_point.X, chosen_point.Y, chosen_point.Z); encoded_point = jacobian_left_to_right_binary(public_key_2, a, k_message, modulo); // r * Pu2 encoded_point = jacobian_curve_addition(message, encoded_point, a, modulo); // TODO : chosen_point & encoded_point should be padded to P-bit gmp_printf("[Decrypt] Encoded point - Jacobian [X Y Z]: %Zd %Zd %Zd\n", encoded_point.X, encoded_point.Y, encoded_point.Z); // Decrypt example (encoded_point - private_key * chosen_point) decoded_point = jacobian_left_to_right_binary(chosen_point, a, private_key_2, modulo); decoded_point = jacobian_curve_subtraction(encoded_point, decoded_point, a, modulo); gmp_printf("[Decrypt] Original message - Jacobian [X Y Z]: %Zd %Zd %Zd\n", decoded_point.X, decoded_point.Y, decoded_point.Z); message_decoded = jacobian_to_affine(decoded_point, modulo); gmp_printf("[Decrypt] Original message - Affine [X Y] %Zd %Zd\n\n", message_decoded.x, message_decoded.y); } /** -------------------------------------------------------------------------*/ if (TEST_SIMPLIFIED_ECIES) { // Simplified ECIES (Ref: Page 256 Cryptography Theory & Practice 2nd Ed. 
- Douglas) char* message_string = "hello"; // 0..9, a..z (base 36) mpz_t encrypted_message; mpz_init(encrypted_message); int partition = strlen(message_string) / 24; int partition_modulo = strlen(message_string) % 24; if (partition_modulo != 0) partition++; for (i = 0; i < partition; i++) { // 24 characters from message_string + 1 null-terminate char* chunked_message_string = (char*) malloc(25 * sizeof(char)); int size = 24; if ((i == partition - 1) && (partition_modulo != 0)) size = partition_modulo; strncpy(chunked_message_string, message_string + i*24, size); chunked_message_string[size] = '\0'; // null-terminate Point c_point = encrypt_ECIES(encrypted_message, chunked_message_string, public_key_2_decoded, p, a, modulo); gmp_printf("[SIMPLIFIED ECIES] Encrypted message: %Zd\n", encrypted_message); decrypt_ECIES(encrypted_message, c_point, private_key_2, p, a, modulo); } } /**-------------------------------------------------------------------------*/ // TODO : Public key validation! // Shared key (ECDH) - key secure exchange shared_key = jacobian_left_to_right_binary(public_key_2, a, private_key_1, modulo); gmp_printf("Shared key - Jacobian [X Y Z]: %Zd %Zd %Zd\n", shared_key.X, shared_key.Y, shared_key.Z); Point shared_key_decoded = jacobian_to_affine(shared_key, modulo); gmp_printf("Shared key - Affine [X Y]: %Zd %Zd\n", shared_key_decoded.x, shared_key_decoded.y); // TODO : ECDSA - digital signature algorithm /** Cleaning up */ mpz_clear(a); mpz_clear(b); mpz_clear(k); mpz_clear(r); mpz_clear(modulo); mpz_clear(private_key_1); mpz_clear(private_key_2); return EXIT_SUCCESS; }
// for draw calls, we initialize the active hot tiles and perform deferred // load on them if tile is in invalid state. we do this in the outer thread loop instead of inside // the draw routine itself mainly for performance, to avoid unnecessary setup // every triangle // @todo support deferred clear INLINE void InitializeHotTiles(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID, const TRIANGLE_WORK_DESC* pWork) { const API_STATE& state = GetApiState(pDC); HotTileMgr *pHotTileMgr = pContext->pHotTileMgr; const SWR_PS_STATE& psState = state.psState; uint32_t numRTs = psState.maxRTSlotUsed + 1; uint32_t x, y; MacroTileMgr::getTileIndices(macroID, x, y); x *= KNOB_MACROTILE_X_DIM; y *= KNOB_MACROTILE_Y_DIM; uint32_t numSamples = GetNumSamples(state.rastState.sampleCount); // check RT if enabled if (state.psState.pfnPixelShader != nullptr) { for (uint32_t rt = 0; rt < numRTs; ++rt) { HOTTILE* pHotTile = pHotTileMgr->GetHotTile(pContext, pDC, macroID, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rt), true, numSamples); if (pHotTile->state == HOTTILE_INVALID) { RDTSC_START(BELoadTiles); // invalid hottile before draw requires a load from surface before we can draw to it pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_COLOR_HOT_TILE_FORMAT, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rt), x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer); pHotTile->state = HOTTILE_DIRTY; RDTSC_STOP(BELoadTiles, 0, 0); } else if (pHotTile->state == HOTTILE_CLEAR) { RDTSC_START(BELoadTiles); // Clear the tile. 
ClearColorHotTile(pHotTile); pHotTile->state = HOTTILE_DIRTY; RDTSC_STOP(BELoadTiles, 0, 0); } } } // check depth if enabled if (state.depthStencilState.depthTestEnable || state.depthStencilState.depthWriteEnable) { HOTTILE* pHotTile = pHotTileMgr->GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_DEPTH, true, numSamples); if (pHotTile->state == HOTTILE_INVALID) { RDTSC_START(BELoadTiles); // invalid hottile before draw requires a load from surface before we can draw to it pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_DEPTH_HOT_TILE_FORMAT, SWR_ATTACHMENT_DEPTH, x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer); pHotTile->state = HOTTILE_DIRTY; RDTSC_STOP(BELoadTiles, 0, 0); } else if (pHotTile->state == HOTTILE_CLEAR) { RDTSC_START(BELoadTiles); // Clear the tile. ClearDepthHotTile(pHotTile); pHotTile->state = HOTTILE_DIRTY; RDTSC_STOP(BELoadTiles, 0, 0); } } // check stencil if enabled if (state.depthStencilState.stencilTestEnable || state.depthStencilState.stencilWriteEnable) { HOTTILE* pHotTile = pHotTileMgr->GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_STENCIL, true, numSamples); if (pHotTile->state == HOTTILE_INVALID) { RDTSC_START(BELoadTiles); // invalid hottile before draw requires a load from surface before we can draw to it pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_STENCIL_HOT_TILE_FORMAT, SWR_ATTACHMENT_STENCIL, x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer); pHotTile->state = HOTTILE_DIRTY; RDTSC_STOP(BELoadTiles, 0, 0); } else if (pHotTile->state == HOTTILE_CLEAR) { RDTSC_START(BELoadTiles); // Clear the tile. ClearStencilHotTile(pHotTile); pHotTile->state = HOTTILE_DIRTY; RDTSC_STOP(BELoadTiles, 0, 0); } } }