const unsigned long CUDARunner::RunStep() { unsigned int best=0; unsigned int bestg=~0; if(m_in==0 || m_out==0 || m_devin==0 || m_devout==0) { AllocateResources(m_numb,m_numt); } cutilSafeCall(cudaMemcpy(m_devin,m_in,sizeof(cuda_in),cudaMemcpyHostToDevice)); cuda_process_helper(m_devin,m_devout,GetStepIterations(),GetStepBitShift(),m_numb,m_numt); cutilSafeCall(cudaMemcpy(m_out,m_devout,m_numb*m_numt*sizeof(cuda_out),cudaMemcpyDeviceToHost)); for(int i=0; i<m_numb*m_numt; i++) { if(m_out[i].m_bestnonce!=0 && m_out[i].m_bestg<bestg) { best=m_out[i].m_bestnonce; bestg=m_out[i].m_bestg; } } return CryptoPP::ByteReverse(best); }
// Initializes the 8-bit software-rendered display on top of a 32-bit
// DirectX 9 back buffer. The requested depth parameter is ignored.
// Returns zero on success, the base class error code on base-init failure,
// or 10 when the vertex buffer could not be allocated.
Word Burger::DisplayDirectX9Software8::Init(Word uWidth,Word uHeight,Word /* uDepth */,Word uFlags)
{
	// The underlying device is always created at 32 bits per pixel.
	Word uError = DisplayDirectX9::Init(uWidth,uHeight,32,uFlags);
	if (uError) {
		return uError;
	}

	// Report 8 bits per pixel to the software layer.
	m_uDepth = 8;

	//
	// Create the vertex buffer for software rendering
	//
	if (AllocateResources()!=D3D_OK) {
		uError = 10;
	} else {
		// Clip the software renderer to the full display area.
		m_Renderer.SetClip(0,0,static_cast<int>(uWidth),static_cast<int>(uHeight));

		// Start with an all-black palette; force the final entry to white.
		Word8 TempPalette[768];
		MemoryClear(TempPalette,sizeof(TempPalette));
		TempPalette[765]=255;
		TempPalette[766]=255;
		TempPalette[767]=255;
		//SetPalette(pSelf,TempPalette);
		FillVertexBuffer();
	}
	return uError;
}
void GaussianBlurView::OnSizeSet(const Vector3& targetSize) { mTargetSize = Vector2(targetSize); mChildrenRoot.SetSize(targetSize); if( !mBlurUserImage ) { mImageActorComposite.SetSize(targetSize); mTargetActor.SetSize(targetSize); // Children render camera must move when GaussianBlurView object is resized. This is since we cannot change render target size - so we need to remap the child actors' rendering // accordingly so they still exactly fill the render target. Note that this means the effective resolution of the child render changes as the GaussianBlurView object changes // size, this is the trade off for not being able to modify render target size // Change camera z position based on GaussianBlurView actor height float cameraPosConstraintScale = 0.5f / tanf(ARBITRARY_FIELD_OF_VIEW * 0.5f); mRenderFullSizeCamera.SetZ(mTargetSize.height * cameraPosConstraintScale); } // if we are already on stage, need to update render target sizes now to reflect the new size of this actor if(Self().OnStage()) { AllocateResources(); } }
// Runs one mining step using the CUDA driver API: uploads the work unit,
// launches m_function over an m_numb x m_numt grid, and reads back only the
// first output slot.
// Returns the first winning nonce byte-reversed, or 0 when no hash with H==0
// was found this step.
const unsigned long CUDARunner::RunStep()
{
	//unsigned int best=0;
	//unsigned int bestg=~0;
	int offset=0;

	// Lazily (re)allocate host and device buffers on first use.
	if(m_in==0 || m_out==0 || m_devin==0 || m_devout==0)
	{
		AllocateResources(m_numb,m_numt);
	}

	// Clear the previous result slot so a stale nonce is never reported.
	m_out[0].m_bestnonce=0;
	cuMemcpyHtoD(m_devout,m_out,/*m_numb*m_numt*/sizeof(cuda_out));

	cuMemcpyHtoD(m_devin,m_in,sizeof(cuda_in));

	int loops=GetStepIterations();
	int bits=GetStepBitShift()-1;

	// Pack the kernel parameters manually (driver API): each argument is
	// aligned to its own alignment requirement before being appended, and the
	// running offset is committed with cuParamSetSize at the end. The order
	// here must match the kernel's parameter list exactly.
	void *ptr=(void *)(size_t)m_devin;
	ALIGN_UP(offset, __alignof(ptr));
	cuParamSetv(m_function,offset,&ptr,sizeof(ptr));
	offset+=sizeof(ptr);

	ptr=(void *)(size_t)m_devout;
	ALIGN_UP(offset, __alignof(ptr));
	cuParamSetv(m_function,offset,&ptr,sizeof(ptr));
	offset+=sizeof(ptr);

	ALIGN_UP(offset, __alignof(loops));
	cuParamSeti(m_function,offset,loops);
	offset+=sizeof(loops);

	ALIGN_UP(offset, __alignof(bits));
	cuParamSeti(m_function,offset,bits);
	offset+=sizeof(bits);

	cuParamSetSize(m_function,offset);

	cuFuncSetBlockShape(m_function,m_numt,1,1);
	cuLaunchGrid(m_function,m_numb,1);

	// Only the first output slot is copied back (see comment below).
	cuMemcpyDtoH(m_out,m_devout,/*m_numb*m_numt*/sizeof(cuda_out));

	// very unlikely that we will find more than 1 hash with H=0
	// so we'll just return the first one and not even worry about G
	for(int i=0; i<1/*m_numb*m_numt*/; i++)
	{
		if(m_out[i].m_bestnonce!=0)// && m_out[i].m_bestg<bestg)
		{
			return CryptoPP::ByteReverse(m_out[i].m_bestnonce);
			//best=m_out[i].m_bestnonce;
			//bestg=m_out[i].m_bestg;
		}
	}

	return 0;
}
// Constructs a texture description and immediately allocates the underlying
// resources for it via AllocateResources().
// pData may point to initial pixel data; ownership semantics depend on
// AllocateResources() (not visible here).
Texture::Texture(unsigned int width,
                 unsigned int height,
                 unsigned int bytesPerPixel,
                 InternalFormat internalFormat,
                 DataType dataType,
                 FilterMode filter,
                 WrapMode wrapMode,
                 void* pData)
	: mTextureId(0)   // no texture handle yet; created during allocation
	, mWidth(width)
	, mHeight(height)
	, mBytesPerPixel(bytesPerPixel)
	, mInternalFormat(internalFormat)
	, mDataType(dataType)
	, mFilter(filter)
	, mWrapMode(wrapMode)
	, mpData(pData)
{
	AllocateResources();
}
// Attempts to recover a lost DirectX 9 device. Releases all device-bound
// resources, resets the device with the current present parameters, and on
// success reallocates the resources.
// Returns the result of Reset() on failure, otherwise the result of
// AllocateResources().
long Burger::DisplayDirectX9Software8::ResetLostDevice(void)
{
	// Device-bound resources must be released before Reset() can succeed.
	ReleaseResources();

	D3DPRESENT_PARAMETERS Parms;
	m_D3D9Settings.GetPresentParameters(&Parms);
	HRESULT hResult = m_pDirect3DDevice9->Reset(&Parms);

	if (hResult>=0) {
		// Device restored; rebuild everything that was released above.
		return AllocateResources();
	}

	if (hResult==D3DERR_DEVICELOST) {
		// Device still lost; restore the full retry budget for later attempts.
		m_uResetAttempts = DIRECTXRESETATTEMPTS;
	} else {
		// Hard failure; consume one retry attempt.
		--m_uResetAttempts;
	}
	return hResult;
}
const unsigned long RemoteCUDARunner::RunStep() { if(m_in==0 || m_out==0 || m_devin==0 || m_devout==0) { AllocateResources(m_numb,m_numt); } cutilSafeCall(cudaMemcpy(m_devin,m_in,sizeof(remote_cuda_in),cudaMemcpyHostToDevice)); remote_cuda_process_helper(m_devin,m_devout,m_devmetahash,GetStepIterations(),GetStepBitShift(),m_numb,m_numt); cutilSafeCall(cudaMemcpy(m_out,m_devout,m_numb*m_numt*sizeof(remote_cuda_out),cudaMemcpyDeviceToHost)); cutilSafeCall(cudaMemcpy(m_metahash,m_devmetahash,m_numb*m_numt*GetStepIterations(),cudaMemcpyDeviceToHost)); return 0; }
void RemoteCUDARunner::FindBestConfiguration() { unsigned long lowb=16; unsigned long highb=128; unsigned long lowt=16; unsigned long hight=256; unsigned long bestb=16; unsigned long bestt=16; int64 besttime=std::numeric_limits<int64>::max(); int m_savebits=m_bits; m_bits=7; if(m_requestedgrid>0 && m_requestedgrid<=65536) { lowb=m_requestedgrid; highb=m_requestedgrid; } if(m_requestedthreads>0 && m_requestedthreads<=65536) { lowt=m_requestedthreads; hight=m_requestedthreads; } std::cout << "CUDA finding best kernel configuration" << std::endl; for(int numb=lowb; numb<=highb; numb*=2) { for(int numt=lowt; numt<=hight; numt*=2) { AllocateResources(numb,numt); // clear out any existing error cudaError_t err=cudaGetLastError(); err=cudaSuccess; int64 st=GetTimeMillis(); for(int it=0; it<128*256*2 && err==0; it+=(numb*numt)) { cutilSafeCall(cudaMemcpy(m_devin,m_in,sizeof(remote_cuda_in),cudaMemcpyHostToDevice)); remote_cuda_process_helper(m_devin,m_devout,m_devmetahash,64,6,numb,numt); cutilSafeCall(cudaMemcpy(m_out,m_devout,numb*numt*sizeof(remote_cuda_out),cudaMemcpyDeviceToHost)); err=cudaGetLastError(); if(err!=cudaSuccess) { std::cout << "CUDA error " << err << std::endl; } } int64 et=GetTimeMillis(); std::cout << "Finding best configuration step end (" << numb << "," << numt << ") " << et-st << "ms prev best=" << besttime << "ms" << std::endl; if((et-st)<besttime && err==cudaSuccess) { bestb=numb; bestt=numt; besttime=et-st; } } } m_numb=bestb; m_numt=bestt; m_bits=m_savebits; AllocateResources(m_numb,m_numt); }
// Top level loop for TLM algorithm void MainLoop(void) { int nIterations = 0; int n = 0; int nThreads; int nCores; int *Priorities; int **BusiestThreads; bool Empty = false; HANDLE *hReadyEventArray; HANDLE *hWorkerThreadArray; // Calculate the absolute threshold from the path loss AbsoluteThreshold = SQUARE(4*M_PI*GridSpacing/KAPPA*Frequency/SPEED_OF_LIGHT)*pow(10, MaxPathLoss/10.0); RelativeThreshold *= RelativeThreshold; // Calculate the boundaries CalculateSectionIndices(); AllocateResources(); CalculateInitialBoundaries(); nThreads = MaxThreadIndex.X * MaxThreadIndex.Y * MaxThreadIndex.Z; nCores = 2; // Evaluate source output if (nIterations < ImpulseSource.Duration) { EvaluateSource(nIterations); } // Allocate memory for the ready event array and worker threads hReadyEventArray = (HANDLE*)malloc(nThreads * sizeof(HANDLE)); hWorkerThreadArray = (HANDLE*)malloc(nThreads * sizeof(HANDLE)); BusiestThreads = (int**)malloc(nThreads*sizeof(int*)); Priorities = (int*)malloc(nThreads*sizeof(int)); int CurrentPriority = 6; for (int i=0; i<MaxThreadIndex.X; i++) { for (int j=0; j<MaxThreadIndex.Y; j++) { for (int k=0; k<MaxThreadIndex.Z; k++) { hReadyEventArray[n] = hReadyEvent[i][j][k]; hWorkerThreadArray[n] = hWorkerThreads[i][j][k]; BusiestThreads[n] = (int*)malloc(3*sizeof(int)); BusiestThreads[n][0] = i; BusiestThreads[n][1] = j; BusiestThreads[n][2] = k; Priorities[n] = CurrentPriority; if (n%nCores == nCores-1 && CurrentPriority) { CurrentPriority--; } printf("Priority %d = %d\n", n, Priorities[n]); n++; } } } // Wait for the workers to become ready WaitForMultipleObjects(nThreads, hReadyEventArray, true, INFINITE); // Repeat the algorithm while the active set is not empty while (Empty == false) { // Tell the worker threads to scatter for (int i=0; i<MaxThreadIndex.X; i++) { for (int j=0; j<MaxThreadIndex.Y; j++) { for (int k=0; k<MaxThreadIndex.Z; k++) { SetEvent(hScatterEvent[i][j][k]); } } } // Wait for the workers to acknowledge 
WaitForMultipleObjects(nThreads, hReadyEventArray, true, INFINITE); // Tell the worker threads to connect for (int i=0; i<MaxThreadIndex.X; i++) { for (int j=0; j<MaxThreadIndex.Y; j++) { for (int k=0; k<MaxThreadIndex.Z; k++) { SetEvent(hConnectEvent[i][j][k]); } } } // Wait for the workers to acknowledge WaitForMultipleObjects(nThreads, hReadyEventArray, true, INFINITE); /*for (int i=0; i<MaxThreadIndex.X; i++) { for (int j=0; j<MaxThreadIndex.Y; j++) { for (int k=0; k<MaxThreadIndex.Z; k++) { ThreadPriority = 0; for (int l=0; l<nCores; l++) { if (ActiveJunctions[i][j][k] > ActiveJunctions[BusiestThreads[l][0]][BusiestThreads[l][1]][BusiestThreads[l][2]]) { for (int m=nCores-1; m>l; m--) { BusiestThreads[m][0] = BusiestThreads[m-1][0]; BusiestThreads[m][1] = BusiestThreads[m-1][1]; BusiestThreads[m][2] = BusiestThreads[m-1][2]; } BusiestThreads[l][0] = i; BusiestThreads[l][1] = j; BusiestThreads[l][2] = k; if (l<MAX(nCores/2,2)) { ThreadPriority = 4; } else { ThreadPriority = 2; } } } SetThreadPriority(hWorkerThreads[i][j][k], ThreadPriority); } } }*/ qsort(BusiestThreads, nThreads, sizeof(int*), CompareActiveJunctions); for (int i=0; i<nThreads; i++) { SetThreadPriority(hWorkerThreads[BusiestThreads[i][0]][BusiestThreads[i][1]][BusiestThreads[i][2]], Priorities[i]); } // Increment the number of iterations completed nIterations++; // Check for empty active sets Empty = true; for (int i=0; i<MaxThreadIndex.X; i++) { for (int j=0; j<MaxThreadIndex.Y; j++) { for (int k=0; k<MaxThreadIndex.Z; k++) { if (ActiveSet[i][j][k] != NULL) { Empty = false; } } } } printf("Completed %d iterations\n", nIterations); for (int i=0; i<MaxThreadIndex.X; i++) { for (int j=0; j<MaxThreadIndex.Y; j++) { for (int k=0; k<MaxThreadIndex.Z; k++) { printf("\tSection (%d,%d,%d):\t%d active junctions\n", i+1, j+1, k+1, ActiveJunctions[i][j][k]); } } } } // Tell the worker threads to finish for (int i=0; i<MaxThreadIndex.X; i++) { for (int j=0; j<MaxThreadIndex.Y; j++) { for (int k=0; 
k<MaxThreadIndex.Z; k++) { SetEvent(hEndEvent[i][j][k]); } } } // Wait for the worker threads to terminate WaitForMultipleObjects(nThreads, hWorkerThreadArray, true, INFINITE); // Free memory allocated to the synchronisation FreeResources(); printf("Algorithm complete, took %d iterations\n", nIterations); }
void CUDARunner::FindBestConfiguration() { unsigned long lowb=16; unsigned long highb=128; unsigned long lowt=16; unsigned long hight=256; unsigned long bestb=16; unsigned long bestt=16; int offset=0; void *ptr=0; int64 besttime=std::numeric_limits<int64>::max(); if(m_requestedgrid>0 && m_requestedgrid<=65536) { lowb=m_requestedgrid; highb=m_requestedgrid; } if(m_requestedthreads>0 && m_requestedthreads<=65536) { lowt=m_requestedthreads; hight=m_requestedthreads; } for(int numb=lowb; numb<=highb; numb*=2) { for(int numt=lowt; numt<=hight; numt*=2) { if(AllocateResources(numb,numt)==true) { // clear out any existing error CUresult err=CUDA_SUCCESS; int64 st=GetTimeMillis(); for(int it=0; it<128*256*2 && err==CUDA_SUCCESS; it+=(numb*numt)) { cuMemcpyHtoD(m_devin,m_in,sizeof(cuda_in)); offset=0; int loops=64; int bits=5; ptr=(void *)(size_t)m_devin; ALIGN_UP(offset, __alignof(ptr)); cuParamSetv(m_function,offset,&ptr,sizeof(ptr)); offset+=sizeof(ptr); ptr=(void *)(size_t)m_devout; ALIGN_UP(offset, __alignof(ptr)); cuParamSetv(m_function,offset,&ptr,sizeof(ptr)); offset+=sizeof(ptr); ALIGN_UP(offset, __alignof(loops)); cuParamSeti(m_function,offset,loops); offset+=sizeof(loops); ALIGN_UP(offset, __alignof(bits)); cuParamSeti(m_function,offset,bits); offset+=sizeof(bits); cuParamSetSize(m_function,offset); err=cuFuncSetBlockShape(m_function,numt,1,1); if(err!=CUDA_SUCCESS) { printf("cuFuncSetBlockShape error %d\n",err); continue; } err=cuLaunchGrid(m_function,numb,1); if(err!=CUDA_SUCCESS) { printf("cuLaunchGrid error %d\n",err); continue; } cuMemcpyDtoH(m_out,m_devout,numt*numb*sizeof(cuda_out)); if(err!=CUDA_SUCCESS) { printf("CUDA error %d\n",err); } } int64 et=GetTimeMillis(); printf("Finding best configuration step end (%d,%d) %"PRI64d"ms prev best=%"PRI64d"ms\n",numb,numt,et-st,besttime); if((et-st)<besttime && err==CUDA_SUCCESS) { bestb=numb; bestt=numt; besttime=et-st; } } } } m_numb=bestb; m_numt=bestt; AllocateResources(m_numb,m_numt); }
void CUDARunner::FindBestConfiguration() { unsigned long lowb=16; unsigned long highb=128; unsigned long lowt=16; unsigned long hight=256; unsigned long bestb=16; unsigned long bestt=16; int64 besttime=std::numeric_limits<int64>::max(); if(m_requestedgrid>0 && m_requestedgrid<=65536) { lowb=m_requestedgrid; highb=m_requestedgrid; } if(m_requestedthreads>0 && m_requestedthreads<=65536) { lowt=m_requestedthreads; hight=m_requestedthreads; } for(int numb=lowb; numb<=highb; numb*=2) { for(int numt=lowt; numt<=hight; numt*=2) { AllocateResources(numb,numt); // clear out any existing error cudaError_t err=cudaGetLastError(); err=cudaSuccess; int64 st=GetTimeMillis(); for(int it=0; it<128*256*2 && err==0; it+=(numb*numt)) { cutilSafeCall(cudaMemcpy(m_devin,m_in,sizeof(cuda_in),cudaMemcpyHostToDevice)); cuda_process_helper(m_devin,m_devout,64,6,numb,numt); cutilSafeCall(cudaMemcpy(m_out,m_devout,numb*numt*sizeof(cuda_out),cudaMemcpyDeviceToHost)); err=cudaGetLastError(); if(err!=cudaSuccess) { printf("CUDA error %d\n",err); } } int64 et=GetTimeMillis(); printf("Finding best configuration step end (%d,%d) %"PRI64d"ms prev best=%"PRI64d"ms\n",numb,numt,et-st,besttime); if((et-st)<besttime && err==cudaSuccess) { bestb=numb; bestt=numt; besttime=et-st; } } } m_numb=bestb; m_numt=bestt; AllocateResources(m_numb,m_numt); }
// Top level loop for TLM algorithm
//
// Mutex/message-based variant: the main thread drives the workers through a
// SCATTER_1 / SCATTER_2 / CONNECT cycle each iteration by writing Msg[] and
// releasing handover mutexes, looping until every active set is empty, then
// sends END and waits for the workers to terminate.
void MainLoop(void)
{
	int nIterations = 0;
	// Index selects the current mutex generation; the three generations
	// (0,1,2) are rotated with "++Index %= 3" to hand control between the
	// main thread and the workers.
	int Index = 0;
	bool Success;
	bool Empty = false;

	// Calculate the absolute threshold from the path loss
	AbsoluteThreshold = SQUARE(4*M_PI*GridSpacing/KAPPA*Frequency/SPEED_OF_LIGHT)*pow(10, MaxPathLoss/10.0);
	RelativeThreshold *= RelativeThreshold;

	Sets = 2*Threads;

	// Calculate the boundaries
	AllocateResources();

	// Evaluate source output
	if (nIterations < ImpulseSource.Duration)
	{
		EvaluateSource(nIterations);
	}

	// Seed the simulation with the impulse source junction; the set is picked
	// by hashing the source coordinates over the thread count.
	ActiveJunctions[(ImpulseSource.X+ImpulseSource.Y+ImpulseSource.Z)%Threads] = 1;
	ActiveSet[(ImpulseSource.X+ImpulseSource.Y+ImpulseSource.Z)%Threads] = AddJunctionToSet(ImpulseSource.X, ImpulseSource.Y, ImpulseSource.Z);

	// Wait for the workers to become ready
	do
	{
		Sleep(1);
		Success = true;
		for (int i=0; i<Threads; i++)
		{
			WaitForSingleObject(hMutexMsg[i], INFINITE);
			if (Msg[i] != READY)
			{
				Success = false;
			}
			ReleaseMutex(hMutexMsg[i]);
		}
	} while (Success == false);

	// Repeat the algorithm while the active set is not empty
	while (Empty == false)
	{
		SetupNodeAdditions();

		// Tell the worker threads to scatter
		for (int i=0; i<Threads; i++)
		{
			Msg[i] = SCATTER_1;
			ReleaseMutex(hMutex[Index][i]);
		}
		++Index %= 3;

		// Wait for the workers to finish scattering
		WaitForMultipleObjects(Threads, hMutex[Index], true, INFINITE);
		++Index %= 3;

		// Tell the worker threads to scatter
		for (int i=0; i<Threads; i++)
		{
			Msg[i] = SCATTER_2;
			ReleaseMutex(hMutex[Index][i]);
		}
		++Index %= 3;

		// Wait for the workers to finish scattering
		WaitForMultipleObjects(Threads, hMutex[Index], true, INFINITE);
		++Index %= 3;

		// Tell the worker threads to connect
		for (int i=0; i<Threads; i++)
		{
			Msg[i] = CONNECT;
			ReleaseMutex(hMutex[Index][i]);
		}
		++Index %= 3;

		WaitForMultipleObjects(Threads, hMutex[Index], true, INFINITE);
		++Index %= 3;

		// PROCESSING HERE

		// Increment the number of iterations completed
		nIterations++;

		// Check for empty active sets
		Empty = true;
		for (int i=0; i<Sets; i++)
		{
			if (ActiveSet[i] != NULL)
			{
				Empty = false;
			}
		}

		printf("Completed %d iterations\n", nIterations);
		for (int i=0; i<Sets; i++)
		{
			printf("\tSet %d:\t%d active junctions\n", i+1, ActiveJunctions[i]);
		}
	}

	// Tell the worker threads to finish
	for (int i=0; i<Threads; i++)
	{
		Msg[i] = END;
		ReleaseMutex(hMutex[Index][i]);
	}

	// Wait for the worker threads to terminate
	WaitForMultipleObjects(Threads, hWorkerThreads, true, INFINITE);

	// Free memory allocated to the synchronisation
	FreeResources();

	printf("Algorithm complete, took %d iterations\n", nIterations);
}
// Activates the blur: ensures the render targets and related resources exist,
// then creates the render tasks that perform the blur passes.
void GaussianBlurView::Activate()
{
	// make sure resources are allocated and start the render tasks processing
	AllocateResources();
	CreateRenderTasks();
}