示例#1
0
const unsigned long CUDARunner::RunStep()
{
	// Execute one hashing step on the GPU and return the byte-reversed nonce
	// with the lowest G value found across all thread result slots (0 if none).

	// Lazily (re)create host/device buffers if any are missing.
	if(m_in==0 || m_out==0 || m_devin==0 || m_devout==0)
	{
		AllocateResources(m_numb,m_numt);
	}

	// Upload the work unit, run the kernel, pull back every result slot.
	cutilSafeCall(cudaMemcpy(m_devin,m_in,sizeof(cuda_in),cudaMemcpyHostToDevice));

	cuda_process_helper(m_devin,m_devout,GetStepIterations(),GetStepBitShift(),m_numb,m_numt);

	cutilSafeCall(cudaMemcpy(m_out,m_devout,m_numb*m_numt*sizeof(cuda_out),cudaMemcpyDeviceToHost));

	// Scan the per-thread results and keep the nonce with the smallest G.
	unsigned int winner=0;
	unsigned int lowestg=~0;
	const int slots=m_numb*m_numt;
	for(int slot=0; slot<slots; ++slot)
	{
		if(m_out[slot].m_bestnonce==0)
		{
			continue;	// this thread found nothing
		}
		if(m_out[slot].m_bestg<lowestg)
		{
			winner=m_out[slot].m_bestnonce;
			lowestg=m_out[slot].m_bestg;
		}
	}

	return CryptoPP::ByteReverse(winner);
}
Word Burger::DisplayDirectX9Software8::Init(Word uWidth,Word uHeight,Word /* uDepth */,Word uFlags)
{
	// Bring up the underlying DirectX 9 display at 32 bits per pixel; the
	// caller's requested depth is ignored because this class always renders
	// into an 8 bit software buffer.
	Word uResult = DisplayDirectX9::Init(uWidth,uHeight,32,uFlags);
	if (uResult) {
		// Base class failed; hand its error code straight back.
		return uResult;
	}

	m_uDepth = 8;

	//
	// Create the vertex buffer for software rendering
	//

	if (AllocateResources()!=D3D_OK) {
		return 10;
	}

	m_Renderer.SetClip(0,0,static_cast<int>(uWidth),static_cast<int>(uHeight));

	// Default palette: all black except the last entry, which is white.
	Word8 TempPalette[768];
	MemoryClear(TempPalette,sizeof(TempPalette));
	TempPalette[765]=255;
	TempPalette[766]=255;
	TempPalette[767]=255;
	//SetPalette(pSelf,TempPalette);
	FillVertexBuffer();
	return 0;
}
void GaussianBlurView::OnSizeSet(const Vector3& targetSize)
{
    // Record the new size and propagate it to the child hierarchy.
    mTargetSize = Vector2(targetSize);
    mChildrenRoot.SetSize(targetSize);

    if( !mBlurUserImage )
    {
        mImageActorComposite.SetSize(targetSize);
        mTargetActor.SetSize(targetSize);

        // The render target cannot be resized, so instead the children render
        // camera is moved so the child actors are remapped to exactly fill the
        // existing render target. The effective resolution of the child render
        // therefore changes with this actor's size — that is the trade-off for
        // a fixed render target. Camera z is derived from the actor height.
        const float halfHeightOverTanHalfFov = 0.5f / tanf(ARBITRARY_FIELD_OF_VIEW * 0.5f);
        mRenderFullSizeCamera.SetZ(mTargetSize.height * halfHeightOverTanHalfFov);
    }

    // When already staged, rebuild render targets now so they match the new size.
    if(Self().OnStage())
    {
        AllocateResources();
    }
}
const unsigned long CUDARunner::RunStep()
{
	// Runs one hashing step through the CUDA driver API: upload the work unit,
	// hand-pack the kernel parameter block, launch, then read back only the
	// first result slot.  Returns the byte-reversed winning nonce, or 0.
	//unsigned int best=0;
	//unsigned int bestg=~0;
	int offset=0;	// running byte offset into the kernel's parameter block

	// Lazily (re)create host/device buffers if any are missing.
	if(m_in==0 || m_out==0 || m_devin==0 || m_devout==0)
	{
		AllocateResources(m_numb,m_numt);
	}
	// Clear the first result slot on the device so a stale nonce from a
	// previous step cannot be reported again.
	m_out[0].m_bestnonce=0;
	cuMemcpyHtoD(m_devout,m_out,/*m_numb*m_numt*/sizeof(cuda_out));

	cuMemcpyHtoD(m_devin,m_in,sizeof(cuda_in));

	int loops=GetStepIterations();
	int bits=GetStepBitShift()-1;

	// Driver-API parameter packing: each argument is placed at an offset
	// aligned to its own alignment, in the exact order the kernel declares
	// them.  Do not reorder these blocks.
	void *ptr=(void *)(size_t)m_devin;
	ALIGN_UP(offset, __alignof(ptr));
	cuParamSetv(m_function,offset,&ptr,sizeof(ptr));
	offset+=sizeof(ptr);

	ptr=(void *)(size_t)m_devout;
	ALIGN_UP(offset, __alignof(ptr));
	cuParamSetv(m_function,offset,&ptr,sizeof(ptr));
	offset+=sizeof(ptr);

	ALIGN_UP(offset, __alignof(loops));
	cuParamSeti(m_function,offset,loops);
	offset+=sizeof(loops);

	ALIGN_UP(offset, __alignof(bits));
	cuParamSeti(m_function,offset,bits);
	offset+=sizeof(bits);

	cuParamSetSize(m_function,offset);

	// Launch m_numb blocks of m_numt threads each.
	cuFuncSetBlockShape(m_function,m_numt,1,1);
	cuLaunchGrid(m_function,m_numb,1);

	// Only the first slot is copied back — see the comment below.
	cuMemcpyDtoH(m_out,m_devout,/*m_numb*m_numt*/sizeof(cuda_out));

	// very unlikely that we will find more than 1 hash with H=0
	// so we'll just return the first one and not even worry about G
	for(int i=0; i<1/*m_numb*m_numt*/; i++)
	{
		if(m_out[i].m_bestnonce!=0)// && m_out[i].m_bestg<bestg)
		{
			return CryptoPP::ByteReverse(m_out[i].m_bestnonce);
			//best=m_out[i].m_bestnonce;
			//bestg=m_out[i].m_bestg;
		}
	}

	return 0;

}
示例#5
0
// Constructs a texture: records the creation parameters and immediately
// allocates the underlying resource (see AllocateResources) from pData.
// NOTE(review): whether pData may be null for an empty texture depends on
// AllocateResources, which is not visible here — confirm before relying on it.
Texture::Texture(unsigned int width, unsigned int height, unsigned int bytesPerPixel, InternalFormat internalFormat, DataType dataType, FilterMode filter, WrapMode wrapMode, void* pData) :
	mTextureId(0),
	mWidth(width), 
	mHeight(height), 
	mBytesPerPixel(bytesPerPixel), 
	mInternalFormat(internalFormat), 
	mDataType(dataType), 
	mFilter(filter), 
	mWrapMode(wrapMode), 
	mpData(pData)
{
	AllocateResources();
}
// Attempt to recover a lost Direct3D 9 device: release device-bound
// resources, Reset() the device with the current presentation parameters,
// and rebuild the resources on success.  Returns the last HRESULT.
long Burger::DisplayDirectX9Software8::ResetLostDevice(void)
{
	ReleaseResources();
	D3DPRESENT_PARAMETERS Parms;
	m_D3D9Settings.GetPresentParameters(&Parms);
	HRESULT hResult = m_pDirect3DDevice9->Reset(&Parms);
	if (hResult<0) {
		if (hResult!=D3DERR_DEVICELOST) {
			// Some other failure: burn one retry attempt.
			// NOTE(review): decrementing only on non-DEVICELOST errors (and
			// restoring the full budget while the device is still lost) looks
			// intentional, but confirm against the caller's retry loop.
			--m_uResetAttempts;
		} else {
			// Device still lost; keep the full retry budget.
			m_uResetAttempts = DIRECTXRESETATTEMPTS;
		}
	} else {
		// Reset succeeded; recreate the resources released above.
		hResult = AllocateResources();
	}
	return hResult;
}
const unsigned long RemoteCUDARunner::RunStep()
{
	// Execute one remote hashing step: upload the input block, run the kernel,
	// and pull back both the per-thread results and the metahash buffer.
	// Always returns 0; results are inspected by the caller via m_out/m_metahash.

	// Lazily (re)create host/device buffers if any are missing.
	if(m_in==0 || m_out==0 || m_devin==0 || m_devout==0)
	{
		AllocateResources(m_numb,m_numt);
	}

	cutilSafeCall(cudaMemcpy(m_devin,m_in,sizeof(remote_cuda_in),cudaMemcpyHostToDevice));

	remote_cuda_process_helper(m_devin,m_devout,m_devmetahash,GetStepIterations(),GetStepBitShift(),m_numb,m_numt);

	// Copy back one result record per launched thread, plus the metahash
	// bytes (one byte per thread per iteration).
	const size_t outbytes=m_numb*m_numt*sizeof(remote_cuda_out);
	const size_t metabytes=m_numb*m_numt*GetStepIterations();
	cutilSafeCall(cudaMemcpy(m_out,m_devout,outbytes,cudaMemcpyDeviceToHost));
	cutilSafeCall(cudaMemcpy(m_metahash,m_devmetahash,metabytes,cudaMemcpyDeviceToHost));

	return 0;
}
// Benchmarks every power-of-two (grid,thread) combination in the ranges
// below and stores the fastest pairing in m_numb/m_numt.  Timing depends on
// the exact sequence of error-state clears and synchronous memcpys, so the
// call order inside the loops must not change.
void RemoteCUDARunner::FindBestConfiguration()
{
	unsigned long lowb=16;
	unsigned long highb=128;
	unsigned long lowt=16;
	unsigned long hight=256;
	unsigned long bestb=16;
	unsigned long bestt=16;
	int64 besttime=std::numeric_limits<int64>::max();
	int m_savebits=m_bits;	// preserve caller's difficulty across the benchmark

	m_bits=7;	// fixed difficulty used only while benchmarking

	// A user-requested grid size pins the grid search to that single value.
	if(m_requestedgrid>0 && m_requestedgrid<=65536)
	{
		lowb=m_requestedgrid;
		highb=m_requestedgrid;
	}

	// Likewise for a user-requested thread count.
	if(m_requestedthreads>0 && m_requestedthreads<=65536)
	{
		lowt=m_requestedthreads;
		hight=m_requestedthreads;
	}

	std::cout << "CUDA finding best kernel configuration" << std::endl;

	// NOTE(review): loop counters are int while the bounds are unsigned long;
	// fine for the <=65536 ranges used here, but confirm if bounds grow.
	for(int numb=lowb; numb<=highb; numb*=2)
	{
		for(int numt=lowt; numt<=hight; numt*=2)
		{
			AllocateResources(numb,numt);
			// clear out any existing error
			cudaError_t err=cudaGetLastError();
			err=cudaSuccess;

			int64 st=GetTimeMillis();

			// Run enough launches to cover 128*256*2 hashes at this config,
			// aborting the timing loop on the first CUDA error.
			for(int it=0; it<128*256*2 && err==0; it+=(numb*numt))
			{
				cutilSafeCall(cudaMemcpy(m_devin,m_in,sizeof(remote_cuda_in),cudaMemcpyHostToDevice));

				remote_cuda_process_helper(m_devin,m_devout,m_devmetahash,64,6,numb,numt);

				cutilSafeCall(cudaMemcpy(m_out,m_devout,numb*numt*sizeof(remote_cuda_out),cudaMemcpyDeviceToHost));

				err=cudaGetLastError();
				if(err!=cudaSuccess)
				{
					std::cout << "CUDA error " << err << std::endl;
				}
			}

			int64 et=GetTimeMillis();

			std::cout << "Finding best configuration step end (" << numb << "," << numt << ") " << et-st << "ms  prev best=" << besttime << "ms" << std::endl;

			// Keep this configuration only if it beat the best wall-clock
			// time AND completed without error.
			if((et-st)<besttime && err==cudaSuccess)
			{
				bestb=numb;
				bestt=numt;
				besttime=et-st;
			}
		}
	}

	m_numb=bestb;
	m_numt=bestt;

	m_bits=m_savebits;	// restore the caller's difficulty setting

	// Re-allocate buffers sized for the winning configuration.
	AllocateResources(m_numb,m_numt);

}
示例#9
0
// Top level loop for TLM algorithm.
//
// Drives the worker threads through repeated scatter/connect phases until no
// section has active junctions left, re-prioritising threads each iteration so
// the busiest sections get scheduled first.  Fix applied: the per-thread
// bookkeeping arrays (event/thread handles, priorities, BusiestThreads rows)
// were malloc'd but never freed — they are now released before returning.
void MainLoop(void)
{
	int nIterations = 0;
	int n = 0;
	int nThreads;
	int nCores;
	int *Priorities;
	int **BusiestThreads;
	bool Empty = false;
	HANDLE *hReadyEventArray;
	HANDLE *hWorkerThreadArray;

	// Calculate the absolute threshold from the path loss
	AbsoluteThreshold = SQUARE(4*M_PI*GridSpacing/KAPPA*Frequency/SPEED_OF_LIGHT)*pow(10, MaxPathLoss/10.0);
	RelativeThreshold *= RelativeThreshold;

	// Calculate the boundaries
	CalculateSectionIndices();
	AllocateResources();
	CalculateInitialBoundaries();

	nThreads = MaxThreadIndex.X * MaxThreadIndex.Y * MaxThreadIndex.Z;
	nCores = 2;	// NOTE(review): hard-coded core count — confirm against target hardware

	// Evaluate source output.  NOTE(review): checked once with nIterations==0,
	// so the impulse is injected only before the first iteration — presumably
	// by design; confirm.
	if (nIterations < ImpulseSource.Duration) {
		EvaluateSource(nIterations);
	}

	// Allocate memory for the ready event array and worker threads
	hReadyEventArray = (HANDLE*)malloc(nThreads * sizeof(HANDLE));
	hWorkerThreadArray = (HANDLE*)malloc(nThreads * sizeof(HANDLE));
	BusiestThreads = (int**)malloc(nThreads*sizeof(int*));
	Priorities = (int*)malloc(nThreads*sizeof(int));
	// Flatten the 3D thread grid into linear arrays and assign descending
	// priorities, dropping one priority level every nCores threads.
	int CurrentPriority = 6;
	for (int i=0; i<MaxThreadIndex.X; i++) {
		for (int j=0; j<MaxThreadIndex.Y; j++) {
			for (int k=0; k<MaxThreadIndex.Z; k++) {
				hReadyEventArray[n] = hReadyEvent[i][j][k];
				hWorkerThreadArray[n] = hWorkerThreads[i][j][k];
				BusiestThreads[n] = (int*)malloc(3*sizeof(int));
				BusiestThreads[n][0] = i;
				BusiestThreads[n][1] = j;
				BusiestThreads[n][2] = k;
				Priorities[n] = CurrentPriority;
				if (n%nCores == nCores-1 && CurrentPriority) {
					CurrentPriority--;
				}
				printf("Priority %d = %d\n", n, Priorities[n]);
				n++;
			}
		}
	}

	// Wait for the workers to become ready
	WaitForMultipleObjects(nThreads, hReadyEventArray, true, INFINITE);

	// Repeat the algorithm while the active set is not empty
	while (Empty == false) {

		// Tell the worker threads to scatter
		for (int i=0; i<MaxThreadIndex.X; i++) {
			for (int j=0; j<MaxThreadIndex.Y; j++) {
				for (int k=0; k<MaxThreadIndex.Z; k++) {
					SetEvent(hScatterEvent[i][j][k]);
				}
			}
		}

		// Wait for the workers to acknowledge
		WaitForMultipleObjects(nThreads, hReadyEventArray, true, INFINITE);

		// Tell the worker threads to connect
		for (int i=0; i<MaxThreadIndex.X; i++) {
			for (int j=0; j<MaxThreadIndex.Y; j++) {
				for (int k=0; k<MaxThreadIndex.Z; k++) {
					SetEvent(hConnectEvent[i][j][k]);
				}
			}
		}

		// Wait for the workers to acknowledge
		WaitForMultipleObjects(nThreads, hReadyEventArray, true, INFINITE);

		/*for (int i=0; i<MaxThreadIndex.X; i++) {
			for (int j=0; j<MaxThreadIndex.Y; j++) {
				for (int k=0; k<MaxThreadIndex.Z; k++) {
					ThreadPriority = 0;
					for (int l=0; l<nCores; l++) {
						if (ActiveJunctions[i][j][k] > ActiveJunctions[BusiestThreads[l][0]][BusiestThreads[l][1]][BusiestThreads[l][2]]) {
							for (int m=nCores-1; m>l; m--) {
								BusiestThreads[m][0] = BusiestThreads[m-1][0];
								BusiestThreads[m][1] = BusiestThreads[m-1][1];
								BusiestThreads[m][2] = BusiestThreads[m-1][2];
							}
							BusiestThreads[l][0] = i;
							BusiestThreads[l][1] = j;
							BusiestThreads[l][2] = k;
							if (l<MAX(nCores/2,2)) {
								ThreadPriority = 4;
							}
							else {
								ThreadPriority = 2;
							}
						}
					}
					SetThreadPriority(hWorkerThreads[i][j][k], ThreadPriority);
				}
			}
		}*/

		// Rank sections by activity and hand the busiest ones the highest
		// priorities computed during setup.
		qsort(BusiestThreads, nThreads, sizeof(int*), CompareActiveJunctions);
		for (int i=0; i<nThreads; i++) {
			SetThreadPriority(hWorkerThreads[BusiestThreads[i][0]][BusiestThreads[i][1]][BusiestThreads[i][2]], Priorities[i]);
		}

		// Increment the number of iterations completed
		nIterations++;

		// Check for empty active sets
		Empty = true;
		for (int i=0; i<MaxThreadIndex.X; i++) {
			for (int j=0; j<MaxThreadIndex.Y; j++) {
				for (int k=0; k<MaxThreadIndex.Z; k++) {
					if (ActiveSet[i][j][k] != NULL) {
						Empty = false;
					}
				}
			}
		}

		printf("Completed %d iterations\n", nIterations);
		for (int i=0; i<MaxThreadIndex.X; i++) {
			for (int j=0; j<MaxThreadIndex.Y; j++) {
				for (int k=0; k<MaxThreadIndex.Z; k++) {
					printf("\tSection (%d,%d,%d):\t%d active junctions\n", i+1, j+1, k+1, ActiveJunctions[i][j][k]);
				}
			}
		}
	}

	// Tell the worker threads to finish
	for (int i=0; i<MaxThreadIndex.X; i++) {
		for (int j=0; j<MaxThreadIndex.Y; j++) {
			for (int k=0; k<MaxThreadIndex.Z; k++) {
				SetEvent(hEndEvent[i][j][k]);
			}
		}
	}

	// Wait for the worker threads to terminate
	WaitForMultipleObjects(nThreads, hWorkerThreadArray, true, INFINITE);

	// Free memory allocated to the synchronisation
	FreeResources();

	// Fix: release the local bookkeeping allocated above (previously leaked).
	// The qsort only reorders the row pointers, so every row is still freed.
	for (n = 0; n < nThreads; n++) {
		free(BusiestThreads[n]);
	}
	free(BusiestThreads);
	free(Priorities);
	free(hWorkerThreadArray);
	free(hReadyEventArray);

	printf("Algorithm complete, took %d iterations\n", nIterations);
}
// Benchmarks every power-of-two (grid,thread) combination via the CUDA driver
// API and stores the fastest pairing in m_numb/m_numt.  The driver-API
// parameter packing inside the loop is offset/order-critical; do not reorder.
void CUDARunner::FindBestConfiguration()
{
	unsigned long lowb=16;
	unsigned long highb=128;
	unsigned long lowt=16;
	unsigned long hight=256;
	unsigned long bestb=16;
	unsigned long bestt=16;
	int offset=0;	// running byte offset into the kernel's parameter block
	void *ptr=0;
	int64 besttime=std::numeric_limits<int64>::max();

	// A user-requested grid size pins the grid search to that single value.
	if(m_requestedgrid>0 && m_requestedgrid<=65536)
	{
		lowb=m_requestedgrid;
		highb=m_requestedgrid;
	}

	// Likewise for a user-requested thread count.
	if(m_requestedthreads>0 && m_requestedthreads<=65536)
	{
		lowt=m_requestedthreads;
		hight=m_requestedthreads;
	}

	for(int numb=lowb; numb<=highb; numb*=2)
	{
		for(int numt=lowt; numt<=hight; numt*=2)
		{
			// Skip configurations whose buffers cannot be allocated.
			if(AllocateResources(numb,numt)==true)
			{
				// clear out any existing error
				CUresult err=CUDA_SUCCESS;

				int64 st=GetTimeMillis();

				// Run enough launches to cover 128*256*2 hashes at this
				// config; any error ends the timing loop via the condition.
				for(int it=0; it<128*256*2 && err==CUDA_SUCCESS; it+=(numb*numt))
				{

					cuMemcpyHtoD(m_devin,m_in,sizeof(cuda_in));

					// Pack kernel arguments: each at an offset aligned to its
					// own alignment, in declaration order.
					offset=0;
					int loops=64;
					int bits=5;

					ptr=(void *)(size_t)m_devin;
					ALIGN_UP(offset, __alignof(ptr));
					cuParamSetv(m_function,offset,&ptr,sizeof(ptr));
					offset+=sizeof(ptr);

					ptr=(void *)(size_t)m_devout;
					ALIGN_UP(offset, __alignof(ptr));
					cuParamSetv(m_function,offset,&ptr,sizeof(ptr));
					offset+=sizeof(ptr);

					ALIGN_UP(offset, __alignof(loops));
					cuParamSeti(m_function,offset,loops);
					offset+=sizeof(loops);

					ALIGN_UP(offset, __alignof(bits));
					cuParamSeti(m_function,offset,bits);
					offset+=sizeof(bits);

					cuParamSetSize(m_function,offset);

					// The continue statements below still terminate the
					// timing loop, because err is tested in its condition.
					err=cuFuncSetBlockShape(m_function,numt,1,1);
					if(err!=CUDA_SUCCESS)
					{
						printf("cuFuncSetBlockShape error %d\n",err);
						continue;
					}

					err=cuLaunchGrid(m_function,numb,1);
					if(err!=CUDA_SUCCESS)
					{
						printf("cuLaunchGrid error %d\n",err);
						continue;
					}

					cuMemcpyDtoH(m_out,m_devout,numt*numb*sizeof(cuda_out));

					if(err!=CUDA_SUCCESS)
					{
						printf("CUDA error %d\n",err);
					}
				}

				int64 et=GetTimeMillis();

				printf("Finding best configuration step end (%d,%d) %"PRI64d"ms  prev best=%"PRI64d"ms\n",numb,numt,et-st,besttime);

				// Keep this configuration only if it beat the best wall-clock
				// time AND completed without error.
				if((et-st)<besttime && err==CUDA_SUCCESS)
				{
					bestb=numb;
					bestt=numt;
					besttime=et-st;
				}
			}
		}
	}

	m_numb=bestb;
	m_numt=bestt;

	// Re-allocate buffers sized for the winning configuration.
	AllocateResources(m_numb,m_numt);

}
示例#11
0
// Benchmarks every power-of-two (grid,thread) combination via the CUDA
// runtime API and stores the fastest pairing in m_numb/m_numt.  Timing relies
// on the synchronous memcpys and the cudaGetLastError() clear; keep the call
// order inside the loops unchanged.
void CUDARunner::FindBestConfiguration()
{
	unsigned long lowb=16;
	unsigned long highb=128;
	unsigned long lowt=16;
	unsigned long hight=256;
	unsigned long bestb=16;
	unsigned long bestt=16;
	int64 besttime=std::numeric_limits<int64>::max();

	// A user-requested grid size pins the grid search to that single value.
	if(m_requestedgrid>0 && m_requestedgrid<=65536)
	{
		lowb=m_requestedgrid;
		highb=m_requestedgrid;
	}

	// Likewise for a user-requested thread count.
	if(m_requestedthreads>0 && m_requestedthreads<=65536)
	{
		lowt=m_requestedthreads;
		hight=m_requestedthreads;
	}

	for(int numb=lowb; numb<=highb; numb*=2)
	{
		for(int numt=lowt; numt<=hight; numt*=2)
		{
			AllocateResources(numb,numt);
			// clear out any existing error
			cudaError_t err=cudaGetLastError();
			err=cudaSuccess;

			int64 st=GetTimeMillis();

			// Run enough launches to cover 128*256*2 hashes at this config,
			// aborting the timing loop on the first CUDA error.
			for(int it=0; it<128*256*2 && err==0; it+=(numb*numt))
			{
				cutilSafeCall(cudaMemcpy(m_devin,m_in,sizeof(cuda_in),cudaMemcpyHostToDevice));

				cuda_process_helper(m_devin,m_devout,64,6,numb,numt);

				cutilSafeCall(cudaMemcpy(m_out,m_devout,numb*numt*sizeof(cuda_out),cudaMemcpyDeviceToHost));

				err=cudaGetLastError();
				if(err!=cudaSuccess)
				{
					printf("CUDA error %d\n",err);
				}
			}

			int64 et=GetTimeMillis();

			printf("Finding best configuration step end (%d,%d) %"PRI64d"ms  prev best=%"PRI64d"ms\n",numb,numt,et-st,besttime);

			// Keep this configuration only if it beat the best wall-clock
			// time AND completed without error.
			if((et-st)<besttime && err==cudaSuccess)
			{
				bestb=numb;
				bestt=numt;
				besttime=et-st;
			}
		}
	}

	m_numb=bestb;
	m_numt=bestt;

	// Re-allocate buffers sized for the winning configuration.
	AllocateResources(m_numb,m_numt);

}
示例#12
0
// Top level loop for TLM algorithm
void MainLoop(void)
{
	int nIterations = 0;
	int Index = 0;
	bool Success;
	bool Empty = false;

	// Calculate the absolute threshold from the path loss
	AbsoluteThreshold = SQUARE(4*M_PI*GridSpacing/KAPPA*Frequency/SPEED_OF_LIGHT)*pow(10, MaxPathLoss/10.0);
	RelativeThreshold *= RelativeThreshold;
	Sets = 2*Threads;

	// Calculate the boundaries
	AllocateResources();

	// Evaluate source output
	if (nIterations < ImpulseSource.Duration) {
		EvaluateSource(nIterations);
	}
	ActiveJunctions[(ImpulseSource.X+ImpulseSource.Y+ImpulseSource.Z)%Threads] = 1;
	ActiveSet[(ImpulseSource.X+ImpulseSource.Y+ImpulseSource.Z)%Threads] = AddJunctionToSet(ImpulseSource.X, ImpulseSource.Y, ImpulseSource.Z);

	// Wait for the workers to become ready
	do {
		Sleep(1);
		Success = true;
		for (int i=0; i<Threads; i++) {
			WaitForSingleObject(hMutexMsg[i], INFINITE);
			if (Msg[i] != READY) {
				Success = false;
			}
			ReleaseMutex(hMutexMsg[i]);
		}
	}
	while (Success == false);

	// Repeat the algorithm while the active set is not empty
	while (Empty == false) {

		SetupNodeAdditions();
		
		// Tell the worker threads to scatter
		for (int i=0; i<Threads; i++) {
			Msg[i] = SCATTER_1;
			ReleaseMutex(hMutex[Index][i]);
		}
		++Index %= 3;

		// Wait for the workers to finish scattering
		WaitForMultipleObjects(Threads, hMutex[Index], true, INFINITE);
		++Index %= 3;

		// Tell the worker threads to scatter
		for (int i=0; i<Threads; i++) {
			Msg[i] = SCATTER_2;
			ReleaseMutex(hMutex[Index][i]);
		}
		++Index %= 3;

		// Wait for the workers to finish scattering
		WaitForMultipleObjects(Threads, hMutex[Index], true, INFINITE);
		++Index %= 3;

		// Tell the worker threads to connect
		for (int i=0; i<Threads; i++) {
			Msg[i] = CONNECT;
			ReleaseMutex(hMutex[Index][i]);
		}
		++Index %= 3;

		WaitForMultipleObjects(Threads, hMutex[Index], true, INFINITE);
		++Index %= 3;

		// PROCESSING HERE

		// Increment the number of iterations completed
		nIterations++;

		// Check for empty active sets
		Empty = true;
		for (int i=0; i<Sets; i++) {
			if (ActiveSet[i] != NULL) {
				Empty = false;
			}
		}
	
		printf("Completed %d iterations\n", nIterations);
		for (int i=0; i<Sets; i++) {
			printf("\tSet %d:\t%d active junctions\n", i+1, ActiveJunctions[i]);
		}
	}

	// Tell the worker threads to finish
	for (int i=0; i<Threads; i++) {
		Msg[i] = END;
		ReleaseMutex(hMutex[Index][i]);
	}

	// Wait for the worker threads to terminate
	WaitForMultipleObjects(Threads, hWorkerThreads, true, INFINITE);

	// Free memory allocated to the synchronisation
	FreeResources();

	printf("Algorithm complete, took %d iterations\n", nIterations);
}
// Starts the blur effect: (re)allocates the resources it needs (see
// AllocateResources) and then creates the render tasks that perform the
// blur.  Resources must exist before the tasks are created, so the call
// order here matters.
void GaussianBlurView::Activate()
{
    // make sure resources are allocated and start the render tasks processing
    AllocateResources();
    CreateRenderTasks();
}