Exemplo n.º 1
0
// Builds the lookup tables used after clustering: the step sizes, the scale
// normalisation table, two (h+1)x(w+1) summed tables (one over per-cluster
// magnitudes, one over thresholded values of E), the per-row and per-column
// lists of cluster ids with their index images, and the scoreBox() scratch
// arrays. Reads h, w, _segCnt, _segMag, _segC, _segR and _segIds, so those
// members must already be populated when this runs.
void EdgeBoxGenerator::prepDataStructs( arrayf &E )
{
  // step sizes are all derived from _alpha
  _scStep = sqrt(1/_alpha);
  _arStep = (1+_alpha)/(2*_alpha);
  _rcStepRatio = (1-_alpha)/(1+_alpha);

  // _scaleNorm[k] = (1/k)^_kappa; note that entry 0 is computed from 1.f/0
  const int nScales = 10000;
  _scaleNorm.resize(nScales);
  for( int k=0; k<nScales; ++k ) {
    _scaleNorm[k] = pow(1.f/k,_kappa);
  }

  // place every positive cluster magnitude at that cluster's (_segC,_segR) cell
  arrayf segMagImg;
  segMagImg.init(h,w);
  for( int s=0; s<_segCnt; ++s ) {
    if( !(_segMag[s]>0) ) continue;
    segMagImg.val(_segC[s],_segR[s]) = _segMag[s];
  }

  // _segIImg: integral-image recurrence over segMagImg (column 0 / row 0 of
  // the source are never visited by these loops)
  _segIImg.init(h+1,w+1);
  for( int col=1; col<w; ++col ) {
    for( int row=1; row<h; ++row ) {
      _segIImg.val(col+1,row+1) = segMagImg.val(col,row) + _segIImg.val(col,row+1) +
        _segIImg.val(col+1,row) - _segIImg.val(col,row);
    }
  }

  // _magIImg: same recurrence over E, counting only values above _edgeMinMag
  _magIImg.init(h+1,w+1);
  for( int col=1; col<w; ++col ) {
    for( int row=1; row<h; ++row ) {
      float mag = 0;
      if( E.val(col,row) > _edgeMinMag ) mag = E.val(col,row);
      _magIImg.val(col+1,row+1) = mag + _magIImg.val(col,row+1) +
        _magIImg.val(col+1,row) - _magIImg.val(col,row);
    }
  }

  // per row: list of cluster ids in the order they change while walking the
  // columns (seeded with 0); _hIdxImg stores, per pixel, the list position
  _hIdxs.resize(h);
  _hIdxImg.init(h,w);
  for( int row=0; row<h; ++row ) {
    int cur = 0;
    _hIdxs[row].push_back(cur);
    for( int col=0; col<w; ++col ) {
      const int id = _segIds.val(col,row);
      if( id!=cur ) {
        cur = id;
        _hIdxs[row].push_back(cur);
      }
      _hIdxImg.val(col,row) = int(_hIdxs[row].size())-1;
    }
  }

  // per column: the same construction while walking the rows
  _vIdxs.resize(w);
  _vIdxImg.init(h,w);
  for( int col=0; col<w; ++col ) {
    int cur = 0;
    _vIdxs[col].push_back(cur);
    for( int row=0; row<h; ++row ) {
      const int id = _segIds.val(col,row);
      if( id!=cur ) {
        cur = id;
        _vIdxs[col].push_back(cur);
      }
      _vIdxImg.val(col,row) = int(_vIdxs[col].size())-1;
    }
  }

  // scoreBox() scratch arrays: _segCnt+1 slots each, _sDone filled with -1
  const int nSlots = _segCnt+1;
  _sWts.init(nSlots,1);
  _sDone.init(nSlots,1);
  _sMap.init(nSlots,1);
  _sIds.init(nSlots,1);
  for( int k=0; k<nSlots; ++k ) _sDone.val(0,k) = -1;
  _sId = 0;
}
Exemplo n.º 2
0
// Groups pixels of E into clusters and derives the per-cluster data used by
// the rest of the generator.
//
//   E  per-pixel values compared against _edgeMinMag and summed per cluster
//      (presumably edge magnitudes -- confirm against the caller)
//   O  per-pixel angles; differences are divided by PI (presumably edge
//      orientations in radians -- confirm against the caller)
//   V  optional visualisation output; written only when V._x is non-null and
//      assumed to hold 3*w*h values
//
// Writes h, w, _segIds, _segCnt, _segMag, _segAff, _segAffIdx, _segC, _segR.
// In _segIds, -1 marks border pixels and pixels not above _edgeMinMag, 0 marks
// pixels left without a cluster, and ids >= 1 identify clusters.
void EdgeBoxGenerator::clusterEdges( arrayf &E, arrayf &O, arrayf &V )
{
  int c, r, cd, rd, i, j; h=E._h; w=E._w;

  // greedily merge connected edge pixels into clusters (create _segIds)
  _segIds.init(h,w); _segCnt=1;
  for( c=0; c<w; c++ ) for( r=0; r<h; r++ ) {
    if( c==0 || r==0 || c==w-1 || r==h-1 || E.val(c,r)<=_edgeMinMag )
      _segIds.val(c,r)=-1; else _segIds.val(c,r)=0;
  }
  for( c=1; c<w-1; c++ ) for( r=1; r<h-1; r++ ) {
    if(_segIds.val(c,r)!=0) continue;
    // grow one cluster from (c,r): repeatedly absorb the queued candidate with
    // the smallest orientation difference until the summed differences reach
    // _edgeMergeThr (or no candidate is left, which adds 1000 and ends the loop)
    float sumv=0; int c0=c, r0=r; vectorf vs; vectori cs, rs;
    while( sumv < _edgeMergeThr ) {
      _segIds.val(c0,r0)=_segCnt;
      float o0 = O.val(c0,r0), o1, v; bool found;
      for( cd=-1; cd<=1; cd++ ) for( rd=-1; rd<=1; rd++ ) {
        if( _segIds.val(c0+cd,r0+rd)!=0 ) continue;
        // skip neighbours that are already queued as candidates
        found=false;
        for( i=0; i<cs.size(); i++ )
          if( cs[i]==c0+cd && rs[i]==r0+rd ) { found=true; break; }
        if( found ) continue;
        o1=O.val(c0+cd,r0+rd);
        // difference as a fraction of PI, folded into [0,.5]
        v=fabs(o1-o0)/PI; if(v>.5) v=1-v;
        vs.push_back(v); cs.push_back(c0+cd); rs.push_back(r0+rd);
      }
      float minv=1000; j=0;
      for( i=0; i<vs.size(); i++ ) if( vs[i]<minv ) {
        minv=vs[i]; c0=cs[i]; r0=rs[i]; j=i;
      }
      // 1000 marks a candidate as consumed so it is never selected again
      sumv+=minv; if(minv<1000) vs[j]=1000;
    }
    _segCnt++;
  }

  // merge or remove small segments
  // assign (not resize): the sums below use +=, so values left over from an
  // earlier call on the same object must be zeroed first
  _segMag.assign(_segCnt,0);
  for( c=1; c<w-1; c++ ) for( r=1; r<h-1; r++ )
    if( (j=_segIds.val(c,r))>0 ) _segMag[j]+=E.val(c,r);
  for( c=1; c<w-1; c++ ) for( r=1; r<h-1; r++ )
    if( (j=_segIds.val(c,r))>0 && _segMag[j]<=_clusterMinMag)
      _segIds.val(c,r)=0;
  // re-attach released pixels to the neighbouring cluster with the closest
  // orientation; repeat until a full pass attaches nothing
  i=1; while(i>0) {
    i=0; for( c=1; c<w-1; c++ ) for( r=1; r<h-1; r++ ) {
      if( _segIds.val(c,r)!=0 ) continue;
      float o0=O.val(c,r), o1, v, minv=1000; j=0;
      for( cd=-1; cd<=1; cd++ ) for( rd=-1; rd<=1; rd++ ) {
        if( _segIds.val(c+cd,r+rd)<=0 ) continue; o1=O.val(c+cd,r+rd);
        v=fabs(o1-o0)/PI; if(v>.5) v=1-v;
        if( v<minv ) { minv=v; j=_segIds.val(c+cd,r+rd); }
      }
      _segIds.val(c,r)=j; if(j>0) i++;
    }
  }

  // compactify representation
  // (ids whose summed magnitude is still positive are renumbered 1,2,...)
  _segMag.assign(_segCnt,0); vectori map(_segCnt,0); _segCnt=1;
  for( c=1; c<w-1; c++ ) for( r=1; r<h-1; r++ )
    if( (j=_segIds.val(c,r))>0 ) _segMag[j]+=E.val(c,r);
  for( i=0; i<_segMag.size(); i++ ) if( _segMag[i]>0 ) map[i]=_segCnt++;
  for( c=1; c<w-1; c++ ) for( r=1; r<h-1; r++ )
    if( (j=_segIds.val(c,r))>0 ) _segIds.val(c,r)=map[j];

  // compute positional means and recompute _segMag
  _segMag.assign(_segCnt,0); vectorf meanX(_segCnt,0), meanY(_segCnt,0);
  vectorf meanOx(_segCnt,0), meanOy(_segCnt,0), meanO(_segCnt,0);
  for( c=1; c<w-1; c++ ) for( r=1; r<h-1; r++ ) {
    j=_segIds.val(c,r); if(j<=0) continue;
    float m=E.val(c,r), o=O.val(c,r); _segMag[j]+=m;
    // the angle is doubled before averaging and halved again below
    meanOx[j]+=m*cos(2*o); meanOy[j]+=m*sin(2*o);
    meanX[j]+=m*c; meanY[j]+=m*r;
  }
  for( i=0; i<_segCnt; i++ ) if( _segMag[i]>0 ) {
    float m=_segMag[i]; meanX[i]/=m; meanY[i]/=m;
    meanO[i]=atan2(meanOy[i]/m,meanOx[i]/m)/2;
  }

  // compute segment affinities
  _segAff.resize(_segCnt); _segAffIdx.resize(_segCnt);
  for(i=0; i<_segCnt; i++) _segAff[i].resize(0);
  for(i=0; i<_segCnt; i++) _segAffIdx[i].resize(0);
  const int rad = 2;
  for( c=rad; c<w-rad; c++ ) for( r=rad; r<h-rad; r++ ) {
    int s0=_segIds.val(c,r); if( s0<=0 ) continue;
    for( cd=-rad; cd<=rad; cd++ ) for( rd=-rad; rd<=rad; rd++ ) {
      // s1>s0 only, so each unordered pair is handled from its smaller id
      int s1=_segIds.val(c+cd,r+rd); if(s1<=s0) continue;
      bool found = false; for(i=0;i<_segAffIdx[s0].size();i++)
        if(_segAffIdx[s0][i] == s1) { found=true; break; }
      if( found ) continue;
      float o=atan2(meanY[s0]-meanY[s1],meanX[s0]-meanX[s1])+PI/2;
      float a=fabs(cos(meanO[s0]-o)*cos(meanO[s1]-o)); a=pow(a,_gamma);
      // the affinity is stored symmetrically for both clusters
      _segAff[s0].push_back(a); _segAffIdx[s0].push_back(s1);
      _segAff[s1].push_back(a); _segAffIdx[s1].push_back(s0);
    }
  }

  // compute _segC and _segR
  // (the last pixel visited in scan order wins for each cluster)
  _segC.resize(_segCnt); _segR.resize(_segCnt);
  for( c=1; c<w-1; c++ ) for( r=1; r<h-1; r++ )
    if( (j=_segIds.val(c,r))>0 ) { _segC[j]=c; _segR[j]=r; }

  // optionally create visualization (assume memory initialized is 3*w*h)
  if( V._x ) for( c=0; c<w; c++ ) for( r=0; r<h; r++ ) {
    i=_segIds.val(c,r);
    V.val(c+w*0,r) = i<=0 ? 1 : ((123*i + 128)%255)/255.0f;
    V.val(c+w*1,r) = i<=0 ? 1 : ((7*i + 3)%255)/255.0f;
    V.val(c+w*2,r) = i<=0 ? 1 : ((174*i + 80)%255)/255.0f;
  }
}
Exemplo n.º 3
0
/**
 * Runs the fixed-step simulation in batches on the available GPU devices and
 * returns the time vector and the output samples.
 *
 * Per batch, every device runs an up-sweep over its share of the steps, the
 * host chains the per-device reduction results into a running accumulator
 * (accMatrix / accVector), every device runs a down-sweep, and the results are
 * read back into xVect.
 *
 * @param A, B   passed to InitializeSolverData()
 * @param C, D   passed to BuildOutputObjects(); also used on the host to fill
 *               the columns of xVect that precede tStart
 * @param tStart,tEnd,tStep  passed to InitializeSolverData(); tStart and tStep
 *               are also used to build tVect and the per-batch start times
 * @param x0     passed to InitializeSolverData()
 * @param tVect  [out] resized to _nTotalStepsSimulation and filled with times
 * @param xVect  [out] resized to _nOutputSize x _nTotalStepsSimulation
 *
 * @return ParODE_OK on success; ParODE_NotEnoughResources when initialisation
 *         or the batch loop throws a std::exception; ParODE_InvalidSolverType
 *         when computing the initial output columns throws.
 */
ErrorCode GPUODESolverFixedStepIterative::SimulateODE(	const matrixf& A, const matrixf& B, const matrixf& C, const matrixf& D, 
														double tStart, double tEnd, double tStep, 
														const vectorf& x0, vectorf & tVect, matrixf & xVect){
	try{
		InitializeSolverData( A, B, tStart, tEnd, tStep, x0);
		BuildOutputObjects(C, D);
	}
	catch(const std::exception&){
		return ParODE_NotEnoughResources;
	}
	// time vector: entries before index (_nTotalStepsSimulation - _nSteps) are
	// the times preceding tStart, the remaining entries start at tStart
	tVect.resize(_nTotalStepsSimulation);
	for(int ii = 0; ii < _nTotalStepsSimulation - _nSteps; ii++)
		tVect[ii] = tStart - (_nTotalStepsSimulation - _nSteps - ii) * tStep;
	tVect[ _nTotalStepsSimulation - _nSteps ] = tStart;
	for(int ii = 1; ii < _nSteps; ii++)
		tVect[(_nTotalStepsSimulation - _nSteps) + ii] = tVect[ (_nTotalStepsSimulation - _nSteps) + ii - 1] + tStep;
	xVect.resize(_nOutputSize, _nTotalStepsSimulation);
	try{
		if(_nTotalStepsSimulation != _nSteps){
			matrixf xVectInit;
			matrixf uVectInit;
			GetInitialStatesAndInputs(xVectInit, uVectInit);
			// compute outVectInit
			if(_bZeroC == 1 && _bZeroD == 1){
				// copy the state in the output
				for(int ii = 0; ii < _nTotalStepsSimulation - _nSteps; ii++)
					for(int jj = 0; jj < _nOutputSize; jj++)
						xVect(jj, ii) = xVectInit(jj, ii);
			} else{
				// output = C * state + D * input, skipping a term whose matrix is flagged zero
				for(int ii = 0; ii < _nTotalStepsSimulation - _nSteps; ii++)
					for(int jj = 0; jj < _nOutputSize; jj++){
						fType val = 0.0;
						if(!_bZeroC)
							for(int kk = 0; kk < _nSystemSize; kk++)
								val += C(jj, kk) * xVectInit(kk, ii);
						if(!_bZeroD)
							for(int kk = 0; kk < _nInputs; kk++)
								val += D(jj, kk) * uVectInit(kk, ii);
						xVect(jj, ii) = val;
					}
			}
		}
	}
	catch(const std::exception &e){
		// NOTE(review): unlike the other exits after a successful initialisation,
		// this path returns without CleanSolverData()/DeleteGPUFSIObjects() --
		// confirm whether that is intended.
		std::cerr << "Exception caught : " << e.what() << std::endl;
		return ParODE_InvalidSolverType;
	}
	// compute the total number of steps per batch
	unsigned long NSB = 0; // number of steps per batch
	unsigned long maxStepsPerDevice = 0;
	unsigned int nDevices = _pGPUM->GetNumberOfDevices();
	assert(_nStepsPerDevice.size() == nDevices);
	for(int ii = 0; ii < _nStepsPerDevice.size(); ii++){
		NSB += _nStepsPerDevice[ii];
		if(maxStepsPerDevice < _nStepsPerDevice[ii])
			maxStepsPerDevice = _nStepsPerDevice[ii];
	}
	// NOTE(review): NSB is used as a divisor below; a configuration in which
	// every device has zero steps would divide by zero.
	int nB;
	// accumulators for computing the global scan; held in vectors so that they
	// are released on every exit path
	std::vector<fType> accMatrix(_nMatrixSize * _nMatrixSize);
	std::vector<fType> accVector(_nMatrixSize);
	// initialize the accumulator vector with the initial value X0
	// initialize the accumulator matrix with _M0
	for(int ii = 0;  ii < _nMatrixSize; ii++){
		accVector[ii] = _X0(ii);
		for(int jj = 0; jj < _nMatrixSize; jj++)
			accMatrix[ii * _nMatrixSize + jj] = _M0(ii, jj);
	}
	// first column of xVect that receives a simulated sample
	long globalResultTIndex(_nTotalStepsSimulation - _nSteps);
	// global simulation time
	try{
		// host staging buffers for the per-device reduction results; allocated
		// once and reused by every batch, released automatically even when a GPU
		// call throws
		std::vector<fType> bufferMatrix(_nMatrixSize * _nMatrixSize);
		std::vector<fType> bufferVector(_nMatrixSize);
		std::vector<fType> tempMatrix(_nMatrixSize * _nMatrixSize);
		std::vector<fType> tempVector(_nMatrixSize);
		for(nB = 0; nB < (_nSteps - 1) / NSB + 1; nB++){
			// test if  this  the last iteration
			if( nB == (_nSteps - 1) / NSB ){
				long nStepsLeft = _nSteps - nB * NSB;
				// distribute the steps left
				int ii;
				for(ii = 0; ii < _nStepsPerDevice.size(); ii++){
					if(nStepsLeft <= _nStepsPerDevice[ii])
						break;
					nStepsLeft -= _nStepsPerDevice[ii];
				}
				assert(ii < _nStepsPerDevice.size());
				// re-adjust to a power of two
				int nStepsii(1);
				while(nStepsii < nStepsLeft)
					nStepsii <<= 1;
				_nStepsPerDevice[ii++] = nStepsii;
				// devices after the one taking the remainder get no work
				for(;ii < _nStepsPerDevice.size();  ii++) _nStepsPerDevice[ii] = 0;
			}
			unsigned int nDevices = _nStepsPerDevice.size();
			// per device: which of the two ping-pong buffers is read (nSource)
			// and which is written (nDest) by the next kernel pass
			vector<int> nSource(nDevices, 0), nDest(nDevices, 1);
			double tStartBatch = tStart + nB * NSB * tStep;
			// index of the first step handled by each device within this batch
			vector<int> nElementStart( _nStepsPerDevice.size(), 0 );
			for(int ii = 1; ii < nElementStart.size(); ii++)
				nElementStart[ii] = nElementStart[ii-1] + _nStepsPerDevice[ii-1];
			for(int nDI = 0; nDI < nDevices; nDI++)
				InitializeBatchData(_BufferMatrices[nSource[nDI]][nDI],
									_BufferVectors[nSource[nDI]][nDI],
									nDI,
									tStartBatch + nElementStart[nDI] * tStep,
									tStep,
									_nStepsPerDevice[nDI]);
			// up-sweep
			for(int d = 1; d  < maxStepsPerDevice; d <<= 1 ){
				for(int nDI = 0; nDI < nDevices; nDI++)
					if(d  < _nStepsPerDevice[nDI]){
						// set parameters for MM upsweep
						vector<size_t> szLocal(2), szGlobal(2);
						szLocal[0] = szLocal[1] = _nLocalSizeMM;
						szGlobal[0] = szLocal[0] * _nStepsPerDevice[nDI] / (2 * d);
						szGlobal[1] = szLocal[1];
						_vectGPUKernels[nDI * _nKernels]->GetKernel<MMKernelScanUpsweep>()->Launch(
													 szLocal, szGlobal, 0,
													_BufferMatrices[nSource[nDI]][nDI],
													_BufferMatrices[nDest[nDI]][nDI],
													d,
													_nMatrixSize);
						// set parameters for MVV upsweep
						szLocal[0] = _nLocalSizeMVVColumn;
						szLocal[1] = _nLocalSizeMVVRow;
						szGlobal[0] = szLocal[0] * _nStepsPerDevice[nDI] / (2 * d);
						szGlobal[1] = szLocal[1];
						_vectGPUKernels[nDI * _nKernels + 1]->GetKernel<MVVKernelScanUpsweep>()->Launch(
													szLocal, szGlobal, 1,
													_BufferMatrices[nSource[nDI]][nDI],
													_BufferVectors[nSource[nDI]][nDI],
													_BufferVectors[nDest[nDI]][nDI],
													d,
													_nMatrixSize);
						// copy the left leaf
						// set parameters for copy kernel for the matrix component
						int nMSZ = _nMatrixSize * _nMatrixSize;
						vector<size_t> szLocalCopy(1), szGlobalCopy(1);
						szLocalCopy[0] = min(nMSZ, (int)(_pGPUM->GetDeviceAndContext(nDI)->MaxWorkgroupSize()));
						szGlobalCopy[0] = szLocalCopy[0] * _nStepsPerDevice[nDI] / (2 * d);
						_vectGPUKernels[nDI * _nKernels + 2]->GetKernel<LeftLeafCopyKernelScanUpsweep>()->Launch(
								szLocalCopy, szGlobalCopy, 0,
								_BufferMatrices[nSource[nDI]][nDI],
								_BufferMatrices[nDest[nDI]][nDI],
								d,
								nMSZ);
						// set parameters for copy kernel for the vector component
						szLocalCopy[0] = _nLocalSizeMM;
						szGlobalCopy[0] = szLocalCopy[0] * _nStepsPerDevice[nDI] / (2 * d);
						_vectGPUKernels[nDI * _nKernels + 2]->GetKernel<LeftLeafCopyKernelScanUpsweep>()->Launch(
								szLocalCopy, szGlobalCopy, 1,
								_BufferVectors[nSource[nDI]][nDI],
								_BufferVectors[nDest[nDI]][nDI],
								d,
								_nMatrixSize);
					}
				for(int nDI = 0; nDI < _nStepsPerDevice.size(); nDI++)
					if(d  < _nStepsPerDevice[nDI]){
						// synchronize both queues for device nDI
						_pGPUM->GetDeviceAndContext(nDI)->GetQueue(0)->Synchronize();
						_pGPUM->GetDeviceAndContext(nDI)->GetQueue(1)->Synchronize();
						// change source with destination
						nSource[nDI] = nDest[nDI];
						nDest[nDI] = (nSource[nDI] + 1) % 2;
					}
			}
			// copy the reduction data from the devices to the host memory and
			// chain it into the running accumulator, device by device
			for(int nDI = 0; nDI < _nStepsPerDevice.size(); nDI++){
				// copy the reduction result (last element of the device buffers)
				_BufferMatrices[nSource[nDI]][nDI]->MemRead(	static_cast<void*>(bufferMatrix.data()), _BufferMatrices[nSource[nDI]][nDI]->GetContext()->GetQueue(0),
														_nMatrixSize * _nMatrixSize * (_nStepsPerDevice[nDI] - 1) * sizeof(fType), _nMatrixSize * _nMatrixSize * sizeof(fType));
				_BufferVectors[nSource[nDI]][nDI]->MemRead(	static_cast<void*>(bufferVector.data()), _BufferVectors[nSource[nDI]][nDI]->GetContext()->GetQueue(0),
														_nMatrixSize * (_nStepsPerDevice[nDI] - 1) * sizeof(fType), _nMatrixSize * sizeof(fType));
				// wait for the memcopy to complete
				_BufferMatrices[nSource[nDI]][nDI]->GetContext()->GetQueue(0)->Synchronize();
				// save the accumulator into the temporary buffers (same sizes, so
				// the assignments do not reallocate)
				tempMatrix = accMatrix;
				tempVector = accVector;
				// compute the new accumulator matrix: bufferMatrix * old accumulator
				for(int row = 0; row < _nMatrixSize; row++)
					for(int column = 0; column < _nMatrixSize; column++){
						fType resProd = 0.0;
						for(int kk = 0; kk < _nMatrixSize; kk++)
							resProd += bufferMatrix[row * _nMatrixSize + kk] * tempMatrix[kk * _nMatrixSize + column];
						accMatrix[row * _nMatrixSize + column] = resProd;
					}
				// compute the new accumulator vector: bufferMatrix * old vector + bufferVector
				for(int row = 0; row < _nMatrixSize; row++){
					fType resVal(0.0);
					for(int column = 0; column < _nMatrixSize; column++)
						resVal += bufferMatrix[row * _nMatrixSize + column] * tempVector[column];
					accVector[row] = resVal + bufferVector[row];
				}
				// copy the old accumulators in the reduction vectors GPU buffers
				_BufferMatrices[nSource[nDI]][nDI]->MemWrite(tempMatrix.data(), _BufferMatrices[nSource[nDI]][nDI]->GetContext()->GetQueue(0),
					_nMatrixSize * _nMatrixSize * (_nStepsPerDevice[nDI] - 1) * sizeof(fType), _nMatrixSize * _nMatrixSize * sizeof(fType));
				_BufferVectors[nSource[nDI]][nDI]->MemWrite(tempVector.data(), _BufferVectors[nSource[nDI]][nDI]->GetContext()->GetQueue(0),
					_nMatrixSize * (_nStepsPerDevice[nDI] - 1) * sizeof(fType), _nMatrixSize * sizeof(fType));
				// wait for the memcpy to finish
				_BufferVectors[nSource[nDI]][nDI]->GetContext()->GetQueue(0)->Synchronize();
			}
			// downsweep stage
			for(int d = maxStepsPerDevice / 2; d  > 0; d >>= 1 ){
				for(int nDI = 0; nDI < nDevices; nDI++)
					if(d  < _nStepsPerDevice[nDI]){
						// set parameters for MM kernel downsweep
						vector<size_t> szLocal(2), szGlobal(2);
						szLocal[0] = szLocal[1] = _nLocalSizeMM;
						szGlobal[0] = szLocal[0] * _nStepsPerDevice[nDI] / (2 * d);
						szGlobal[1] = szLocal[1];
						_vectGPUKernels[nDI * _nKernels + 3]->GetKernel<MMKernelScanDownsweep>()->Launch(
								szLocal, szGlobal, 0,
								_BufferMatrices[nSource[nDI]][nDI],
								_BufferMatrices[nDest[nDI]][nDI],
								d,
								_nMatrixSize);
						// call the MVV kernel for downsweep; queue 1
						szLocal[0] = _nLocalSizeMVVColumn;
						szLocal[1] = _nLocalSizeMVVRow;
						szGlobal[0] = szLocal[0] * _nStepsPerDevice[nDI] / (2 * d);
						szGlobal[1] = szLocal[1];
						_vectGPUKernels[nDI * _nKernels + 4]->GetKernel<MVVKernelScanDownsweep>()->Launch(
								szLocal, szGlobal, 1,
								_BufferMatrices[nSource[nDI]][nDI],
								_BufferVectors[nSource[nDI]][nDI],
								_BufferVectors[nDest[nDI]][nDI],
								d,
								_nMatrixSize);
						// set parameters Matrix copy downsweep
						// call the copy kernel for downsweep; queue 0 (copy the matrix component)
						vector<size_t> szLocalCopy(1), szGlobalCopy(1);
						int nMSZ = _nMatrixSize * _nMatrixSize;
						szLocalCopy[0] = min(nMSZ, (int)(_pGPUM->GetDeviceAndContext(nDI)->MaxWorkgroupSize()));
						szGlobalCopy[0] = szLocalCopy[0] * _nStepsPerDevice[nDI] / (2 * d);
						_vectGPUKernels[nDI * _nKernels + 5]->GetKernel<RootCopyKernelScanDownsweep>()->Launch(
							szLocalCopy, szGlobalCopy, 0,
							_BufferMatrices[nSource[nDI]][nDI],
							_BufferMatrices[nDest[nDI]][nDI],
							d,
							nMSZ);
						// set parameters Vector copy downsweep
						// call the copy kernel for downsweep; queue 1 (copy the vector component)
						szLocalCopy[0] = _nLocalSizeMM;
						szGlobalCopy[0] = szLocalCopy[0] * _nStepsPerDevice[nDI] / (2 * d);
						_vectGPUKernels[nDI * _nKernels + 5]->GetKernel<RootCopyKernelScanDownsweep>()->Launch(
							szLocalCopy, szGlobalCopy, 1,
							_BufferVectors[nSource[nDI]][nDI],
							_BufferVectors[nDest[nDI]][nDI],
							d,
							_nMatrixSize);
					}
				// synchronize all threads
				for(int nDI = 0; nDI < _nStepsPerDevice.size(); nDI++)
					if(d  < _nStepsPerDevice[nDI]){
						// synchronize both queues for device nDI
						_pGPUM->GetDeviceAndContext(nDI)->GetQueue(0)->Synchronize();
						_pGPUM->GetDeviceAndContext(nDI)->GetQueue(1)->Synchronize();
						// change source with destination
						nSource[nDI] = nDest[nDI];
						nDest[nDI] = (nSource[nDI] + 1) % 2;
					}
			}
			// the result is in nSource[nDI] for each device
			// copy the result from the device memory to the result vector
			for(int nDI = 0; nDI < nDevices; nDI++)
				if(_nStepsPerDevice[nDI] > 0){
					// host copy of this device's results; freed automatically
					std::vector<fType> buffer;
					int bufferStride;
					if(_bZeroC == 1 && _bZeroD == 1){
						// the output is the state itself: read the state vectors back
						buffer.resize(_nStepsPerDevice[nDI] * _nMatrixSize);
						_BufferVectors[nSource[nDI]][nDI]->MemRead(	static_cast<void*>(buffer.data()), _BufferVectors[nSource[nDI]][nDI]->GetContext()->GetQueue(0),
																	0, _nStepsPerDevice[nDI] * _nMatrixSize * sizeof(fType));
						// wait for the read before touching buffer on the host, as is
						// done for the reduction read-back above (harmless if MemRead
						// already blocks)
						_BufferVectors[nSource[nDI]][nDI]->GetContext()->GetQueue(0)->Synchronize();
						bufferStride = _nMatrixSize;
					}
					else{
						// compute the output Y
						// set parameters for SystemOutput Kernel
						vector<size_t> szLocalOutputKernel(2), szGlobalOutputKernel(2);
						szLocalOutputKernel[0] = _nLocalSizeMVVColumn;
						szLocalOutputKernel[1] = _nLocalSizeMVVRow;
						szGlobalOutputKernel[0] = _nStepsPerDevice[nDI] * _nLocalSizeMVVColumn; // this should actually be nElementsLocal - 1
						szGlobalOutputKernel[1] = szLocalOutputKernel[1];
						_vectGPUKernels[nDI * _nKernels + 8]->GetKernel<SystemOutput>()->Launch(
							szLocalOutputKernel, szGlobalOutputKernel, 0,
							_BufferC[nDI],
							_BufferD[nDI],
							_BufferVectors[nSource[nDI]][nDI],
							_vectUBuffers[nDI],
							_BufferYVectors[nDI],
							_nStateStride,
							_nStateOffset,
							_nInputStride,
							_nInputOffset,
							_nSystemSize,
							_nInputs,
							_nOutputSize,
							_bZeroC,
							_bZeroD);
						_pGPUM->GetDeviceAndContext(nDI)->GetQueue(0)->Synchronize();
						// copy the result from the GPU memory to the CPU memory
						buffer.resize(_nStepsPerDevice[nDI] * _nOutputSize);
						_BufferYVectors[nDI]->MemRead(static_cast<void*>(buffer.data()), _BufferYVectors[nDI]->GetContext()->GetQueue(0),
														0, _nStepsPerDevice[nDI] * _nOutputSize * sizeof(fType));
						// wait for the read before touching buffer on the host
						// (harmless if MemRead already blocks)
						_BufferYVectors[nDI]->GetContext()->GetQueue(0)->Synchronize();
						bufferStride = _nOutputSize;
					}
					// we assume that for state vectors that include more than one state
					// the current 'original' state is stored at the end of the vector,
					// hence the last _nOutputSize entries of each stride are taken;
					// samples that would fall past the end of xVect are dropped
					for(int ii = 0; ii < _nStepsPerDevice[nDI]; ii++)
						if(globalResultTIndex + ii < xVect.size2())
							for(int jj = 0; jj < _nOutputSize; jj++)
								xVect(jj, globalResultTIndex +  ii) =
										buffer[(ii + 1)  * bufferStride - _nOutputSize + jj];
					globalResultTIndex += _nStepsPerDevice[nDI];
				}
		}
	}
	catch(const std::exception&){
		DeleteGPUFSIObjects();
		CleanSolverData();
		return ParODE_NotEnoughResources;
	}
	CleanSolverData();
	DeleteGPUFSIObjects();
	return ParODE_OK;
}
Exemplo n.º 4
0
	//! Stores \p confidence as the root score of the detection, i.e. in
	//! element 0 of confidence_; confidence_ is first grown to one element
	//! when it currently holds none.
	void setScore(float confidence)
	{
		if (!confidence_.size()) {
			confidence_.resize(1);
		}
		confidence_[0] = confidence;
	}