Esempio n. 1
0
int _main(void) {
	// Init and resolve libraries
	initKernel();
	
	initLibc();
	initNetwork();


	// Connect to server and send message
	char socketName[] = "debug";

	struct sockaddr_in server;

	server.sin_len = sizeof(server);
	server.sin_family = AF_INET;
	server.sin_addr.s_addr = IP(192, 168, 0, 4);
	server.sin_port = sceNetHtons(9023);
	memset(server.sin_zero, 0, sizeof(server.sin_zero));

	int sock = sceNetSocket(socketName, AF_INET, SOCK_STREAM, 0);
	sceNetConnect(sock, (struct sockaddr *)&server, sizeof(server));

	debug(sock, "PID: %d", syscall(20));

	sceNetSocketClose(sock);


	// Return to browser
	return 0;
}
Esempio n. 2
0
// Payload entry point: connect the netdebug socket, then run an FTP
// server forever.  Never returns in normal operation.
int _main(void)
{
	// Init and resolve libraries
	initKernel();
	initLibc();
	initNetwork();
	initPthread();

	// Init netdebug: connect to the debug host at 192.168.0.4:9023.
	struct sockaddr_in server;
	// BUGFIX: sin_family is a host-byte-order field and must not be passed
	// through sceNetHtons().  htons(AF_INET) == 0x0200 is not a valid
	// address family, so the connect was made with a bogus sockaddr.
	server.sin_family = AF_INET;
	server.sin_addr.s_addr = IP(192, 168, 0, 4);
	server.sin_port = sceNetHtons(9023);
	memset(server.sin_zero, 0, sizeof(server.sin_zero));

	netdbg_sock = sceNetSocket("netdebug", AF_INET, SOCK_STREAM, 0);
	sceNetConnect(netdbg_sock, (struct sockaddr *)&server, sizeof(server));


	// Start the FTP server on the configured address/port.
	ftp_init(PS4_IP, PS4_PORT);

	//INFO("PS4 listening on IP %s Port %i\n", PS4_IP, PS4_PORT);

	// Keep the payload alive; the FTP server runs on its own threads.
	while (1) {
		sceKernelUsleep(100 * 1000);
	}

	// NOTE(review): unreachable — the loop above never exits.
	ftp_fini();

	return 0;
}
Esempio n. 3
0
// OpenCPI RCC worker "run" method for the Gaussian_blur component.
//
// Each invocation consumes one input line, keeps a short ring of recent
// lines in worker memory (s->buffers, HISTORY_SIZE slots), and emits one
// output line once two prior lines are available.  A zero-length input
// message is treated as end-of-stream: the worker arms the "end" run
// condition, and on its final invocation forwards a zero-length output
// message and reports RCC_DONE.
//
// Returns RCC_OK while streaming, RCC_DONE after the end-of-stream
// message has been forwarded.
static RCCResult run(RCCWorker *self,
                     RCCBoolean timedOut,
                     RCCBoolean *newRunCondition) {
  Gaussian_blurProperties *p = self->properties;
  Gaussian_blurState *s = self->memories[0];
  RCCPort *in = &self->ports[GAUSSIAN_BLUR_IN],
          *out = &self->ports[GAUSSIAN_BLUR_OUT];
  const RCCContainer *c = self->container;
  
  // Not used; the worker is driven purely by port readiness.
  (void)timedOut;

  // End state:  just send the zero length message to indicate "done"
  // This will be unnecessary when EOS indication is fixed
  if (self->runCondition == &end) {
    out->output.length = 0;
    c->advance(out, 0);
    return RCC_DONE;
  }

	// Current buffer (ring index for the line arriving now)
	unsigned cur = s->inLine % HISTORY_SIZE;

	// First line: init the convolution kernel from the worker properties;
	// nothing is emitted yet.
  if(s->inLine == 0) {
    initKernel(p->sigmaX, p->sigmaY);
  }
	// Second line: emit a zero (black) border line
	else if(s->inLine == 1) {
		memset(out->current.data, 0, LINE_BYTES);
    out->output.length = LINE_BYTES;
		c->advance(out, LINE_BYTES);
	}
	// Middle line: convolve the two previously captured lines with the
	// current input to produce one output line.
	else if(s->inLine > 1) {
		doLine(s->buffers[(cur - 2 + HISTORY_SIZE) % HISTORY_SIZE].data,
			s->buffers[(cur - 1 + HISTORY_SIZE) % HISTORY_SIZE].data,
			in->current.data,
			out->current.data,
			p->width);
    out->output.length = LINE_BYTES;
		c->advance(out, LINE_BYTES);
	}

	// Go to next: capture the incoming line into the ring.  Once the ring
	// is full, hand the oldest slot back to the container while taking the
	// new one.
	unsigned prev = (cur - 2 + HISTORY_SIZE) % HISTORY_SIZE;
	if(s->inLine < HISTORY_SIZE - 1)
		c->take(in, NULL, &s->buffers[cur]);
	else
		c->take(in, &s->buffers[prev], &s->buffers[cur]);
	s->inLine++;

  // Arrange to send the zero-length message after the last line of last image
  // This will be unnecessary when EOS indication is fixed
  if (in->input.length == 0) {
    self->runCondition = &end;
    *newRunCondition = 1;
  }
  return RCC_OK;
}
Esempio n. 4
0
// Entry point: initialize the kernel, create its threads, then start it.
// The call order is significant — threads are created between init and
// start.
int main(void)
{
   initKernel();
   // Create kernel threads etc.
   createKernelThreads();
   startKernel();
   return 0;
}
Esempio n. 5
0
// Main processing loop for the OpenCL test: repeatedly runs m_kernel over
// the test image, sweeping an integer "pos" argument across the image
// width, and displays each result, cycling through the output-image ring
// until m_process is cleared (presumably by another thread — confirm).
void TestOpenCL::process()
{
	//create opencl context and command queue
	if(!initOpenCl())
		return;

	//create texture, must be done after openCl context created (at least for AMD)
	if(!initTexture())
		return;

	//load and compile kernel
	if(!initKernel())
		return;

	cl::Event event;
	cl_int status;
	// One work-item per pixel of the test image.
	cl::NDRange globalThreads(m_testImage.width(), m_testImage.height());
	int pos=0;
	int outputIndex=0;

	while(m_process)
	{
		//setup kernel arguments
		// NOTE(review): assert(false) compiles out in release builds, so
		// OpenCL failures would be silently ignored there.
		status=m_kernel.setArg(0, *m_testImage.image());
		if(status != CL_SUCCESS)
			assert(false);
		
		status=m_kernel.setArg(1, *m_outputImage[outputIndex].image());
		if(status != CL_SUCCESS)
			assert(false);

		status=m_kernel.setArg(2, pos);
		if(status != CL_SUCCESS)
			assert(false);

		//execute kernel (default local work size; completion tracked via event)
		status=m_openCLComandQueue.enqueueNDRangeKernel(m_kernel, cl::NullRange, globalThreads, cl::NullRange, NULL, &event);
		if(status != CL_SUCCESS)
			assert(false);

		m_openCLComandQueue.flush();

		// Advance the sweep position, wrapping after passing the width.
		pos++;
		if(pos > m_testImage.width())
			pos=0;

		event.wait();
		m_openCLComandQueue.finish();

		m_mainView->display(&m_outputImage[outputIndex]);
		m_auxView->display(&m_outputImage[outputIndex]);

		// Round-robin through the available output images.
		outputIndex++;
		if(outputIndex >= m_outputImage.size())
			outputIndex=0;
	}
	m_openCLComandQueue.finish();
}
Esempio n. 6
0
// Payload entry point: connect to a remote debug server and stream the
// system font file (DFHEI5-SONY.ttf) to it, then return to the browser.
int _main(void) {
	// Init and resolve libraries
	initKernel();
	
	initLibc();
	initNetwork();
	
	
	// Connect to the debug server at 192.168.0.4:9023
	char socketName[] = "debug";
	
	struct sockaddr_in server;
	
	server.sin_len = sizeof(server);
	server.sin_family = AF_INET;
	server.sin_addr.s_addr = IP(192, 168, 0, 4);
	server.sin_port = sceNetHtons(9023);
	memset(server.sin_zero, 0, sizeof(server.sin_zero));
	
	int sock = sceNetSocket(socketName, AF_INET, SOCK_STREAM, 0);
	sceNetConnect(sock, (struct sockaddr *)&server, sizeof(server));
	
	
	// Build the font path: "/<sandbox dir>/common/font/DFHEI5-SONY.ttf".
	// Assumes the sandbox directory name fills bytes 1..10 of `path`
	// (hence the fixed offset 11) — TODO confirm getSandboxDirectory contract.
	char path[256] = "/";
	int length = 11;
	getSandboxDirectory(path + 1, &length);
	strcpy(path + 11, "/common/font/DFHEI5-SONY.ttf");
	
	
	// Open for reading, and get size.
	// BUGFIX: previously a failed open() went unnoticed and fd == -1 was
	// passed to fstat()/read(); bail out cleanly instead.
	int fd = open(path, O_RDONLY, 0);
	if (fd < 0) {
		sceNetSocketClose(sock);
		return 0;
	}
	
	struct stat s;
	fstat(fd, &s);
	
	
	// Allocate buffer and read the whole file.
	// BUGFIX: guard against malloc failure before read() writes through it.
	char *buffer = malloc(s.st_size);
	if (buffer == NULL) {
		close(fd);
		sceNetSocketClose(sock);
		return 0;
	}
	
	read(fd, buffer, s.st_size);
	
	close(fd);
	
	
	// Send the file contents to the server.
	sceNetSend(sock, buffer, s.st_size, 0);
	
	free(buffer);
	
	sceNetSocketClose(sock);
	
	
	// Return to browser
	return 0;
}
Esempio n. 7
0
// Create all OpenCL kernels used by the 3D GPU demo and bind their
// buffer arguments once up front.  Argument index 0 is left unset for
// every kernel — presumably bound per-dispatch (e.g. element count);
// confirm at the launch sites.
void btGpuDemo3dOCLWrap::initKernels()
{
	// Zero the accumulated-impulse buffer.
	initKernel(GPUDEMO3D_KERNEL_CLEAR_ACCUM_IMPULSE, "kClearAccumImpulse");
	setKernelArg(GPUDEMO3D_KERNEL_CLEAR_ACCUM_IMPULSE, 1, sizeof(cl_mem),	(void*)&m_dLambdaDtBox);
	setKernelArg(GPUDEMO3D_KERNEL_CLEAR_ACCUM_IMPULSE, 2, sizeof(int),		(void*)&m_maxPointsPerConstr);

	// Box-vs-wall collision response (reads/writes transforms and velocities).
	initKernel(GPUDEMO3D_KERNEL_COLLISION_WITH_WALL, "kCollisionWithWallBox");
	setKernelArg(GPUDEMO3D_KERNEL_COLLISION_WITH_WALL, 1, sizeof(cl_mem),	(void*)&m_dTrans);
	setKernelArg(GPUDEMO3D_KERNEL_COLLISION_WITH_WALL, 2, sizeof(cl_mem),	(void*)&m_dVel);
	setKernelArg(GPUDEMO3D_KERNEL_COLLISION_WITH_WALL, 3, sizeof(cl_mem),	(void*)&m_dAngVel);
	setKernelArg(GPUDEMO3D_KERNEL_COLLISION_WITH_WALL, 4, sizeof(cl_mem),	(void*)&m_dParams);

	// Constraint solver: consumes batched constraint ids and contact data.
	initKernel(GPUDEMO3D_KERNEL_SOLVE_CONSTRAINTS, "kSolveConstraint");
	setKernelArg(GPUDEMO3D_KERNEL_SOLVE_CONSTRAINTS, 1, sizeof(cl_mem),	(void*)&m_dIds);
	setKernelArg(GPUDEMO3D_KERNEL_SOLVE_CONSTRAINTS, 2, sizeof(cl_mem),	(void*)&m_dBatchIds);
	setKernelArg(GPUDEMO3D_KERNEL_SOLVE_CONSTRAINTS, 3, sizeof(cl_mem),	(void*)&m_dTrans);
	setKernelArg(GPUDEMO3D_KERNEL_SOLVE_CONSTRAINTS, 4, sizeof(cl_mem),	(void*)&m_dVel);
	setKernelArg(GPUDEMO3D_KERNEL_SOLVE_CONSTRAINTS, 5, sizeof(cl_mem),	(void*)&m_dAngVel);
	setKernelArg(GPUDEMO3D_KERNEL_SOLVE_CONSTRAINTS, 6, sizeof(cl_mem),	(void*)&m_dLambdaDtBox);
	setKernelArg(GPUDEMO3D_KERNEL_SOLVE_CONSTRAINTS, 7, sizeof(cl_mem),	(void*)&m_dPositionConstraint);
	setKernelArg(GPUDEMO3D_KERNEL_SOLVE_CONSTRAINTS, 8, sizeof(cl_mem),	(void*)&m_dNormal);
	setKernelArg(GPUDEMO3D_KERNEL_SOLVE_CONSTRAINTS, 9, sizeof(cl_mem),	(void*)&m_dContact);

	// Integrate forces/torques into linear and angular velocities.
	initKernel(GPUDEMO3D_KERNEL_INTEGRATE_VELOCITIES, "kIntegrateVelocities");
	setKernelArg(GPUDEMO3D_KERNEL_INTEGRATE_VELOCITIES, 1, sizeof(cl_mem),	(void*)&m_dForceTorqueDamp);
	setKernelArg(GPUDEMO3D_KERNEL_INTEGRATE_VELOCITIES, 2, sizeof(cl_mem),	(void*)&m_dInvInertiaMass);
	setKernelArg(GPUDEMO3D_KERNEL_INTEGRATE_VELOCITIES, 3, sizeof(cl_mem),	(void*)&m_dVel);	
	setKernelArg(GPUDEMO3D_KERNEL_INTEGRATE_VELOCITIES, 4, sizeof(cl_mem),	(void*)&m_dAngVel);	

	// Integrate velocities into body transforms.
	initKernel(GPUDEMO3D_KERNEL_INTEGRATE_TRANSFORMS, "kIntegrateTransforms");
	setKernelArg(GPUDEMO3D_KERNEL_INTEGRATE_TRANSFORMS, 1, sizeof(cl_mem),	(void*)&m_dTrans);
	setKernelArg(GPUDEMO3D_KERNEL_INTEGRATE_TRANSFORMS, 2, sizeof(cl_mem),	(void*)&m_dVel);
	setKernelArg(GPUDEMO3D_KERNEL_INTEGRATE_TRANSFORMS, 3, sizeof(cl_mem),	(void*)&m_dAngVel);
	setKernelArg(GPUDEMO3D_KERNEL_INTEGRATE_TRANSFORMS, 4, sizeof(cl_mem),	(void*)&m_dInvInertiaMass);

}
Esempio n. 8
0
// Boot the emulated system: initialize the kernel, spawn the guest's main
// thread at the binary's entry point, and block until that thread exits.
// Always returns true — failures inside ExCreateThread are not surfaced
// here (NOTE(review): consider checking its result).
bool System::start()
{
   // Big-endian 32-bit guest pointer; filled in by ExCreateThread below.
   be_ptr32_t<Thread> thread;

   // Initialize kernel
   initKernel();

   // Create main thread.  Stack size and entry point come from the loaded
   // binary's header; unused thread parameters are passed as null/zero.
   ExCreateThread(reinterpret_cast<be_uint32_t*>(&thread),
                  mBinary.header.defaultStackSize.size,
                  nullptr,
                  nullptr,
                  reinterpret_cast<void*>(mBinary.header.entryPoint.address),
                  nullptr,
                  0);

   // Swap pointer back to little endian memory...
   thread->join();

   return true;
}
/**
 * Price an option on the binomial lattice using the up/down-triangle
 * work-group decomposition.
 *
 * @param optionSpec option parameters (spot, strike, steps, type, ...)
 * @param stepSize   lattice levels folded per kernel pass; work groups
 *                   contain stepSize + 1 items, so stepSize must be < 512
 *                   to respect the per-group work-item limit
 * @return the discounted option value at the lattice root
 */
double OpenCLPricer::priceImplTriangle(OptionSpec& optionSpec, int stepSize) {
    if (stepSize >= 512) {
        std::cerr << "[Error] Step size not valid."
            << "Cannot have more than 512 work items per work group" 
            << std::endl;
        exit(5);
    }

    // ------------------------Derived Parameters------------------------------
    float deltaT = optionSpec.yearsToMaturity / optionSpec.numSteps;

    float upFactor = exp(optionSpec.volatility * sqrt(deltaT));
    float downFactor = 1.0f / upFactor;

    float discountFactor = exp(optionSpec.riskFreeRate * deltaT);

    // Risk-neutral up/down probabilities.
    float upWeight = (discountFactor - downFactor) / (upFactor - downFactor);
    float downWeight = 1.0f - upWeight;
    
    // Create buffers on the devices
    cl::Buffer valueBuffer(*context, 
                           CL_MEM_READ_WRITE,
                           sizeof(float) * (optionSpec.numSteps + 1));

    cl::Buffer triangleBuffer(*context, 
                           CL_MEM_READ_WRITE,
                           sizeof(float) * (optionSpec.numSteps + 1));

    // Create queue to push commands for the devices
    cl::CommandQueue queue(*context, *defaultDevice);
    
    // Build and run init kernel: one work-item per terminal lattice node.
    cl::Kernel initKernel(*program, "init");
    initKernel.setArg(0, optionSpec.stockPrice);
    initKernel.setArg(1, optionSpec.strikePrice);
    initKernel.setArg(2, optionSpec.numSteps);
    initKernel.setArg(3, optionSpec.type);
    initKernel.setArg(4, deltaT);
    initKernel.setArg(5, upFactor);
    initKernel.setArg(6, downFactor);
    initKernel.setArg(7, valueBuffer);
    queue.enqueueNDRangeKernel(initKernel, 
                              cl::NullRange, 
                              cl::NDRange(optionSpec.numSteps + 1), 
                              cl::NullRange);
    // std::cout << "[INFO] Executing init kernel with " << optionSpec.numSteps + 1
    //         << " work items" << std::endl;

    // Block until init kernel finishes execution
    queue.enqueueBarrierWithWaitList();

    // Note(disiok): Here we use work groups of size stepSize + 1 
    // so that after each iteration, the number of nodes is reduced by stepSize
    int groupSize = stepSize + 1;

    cl::Kernel upKernel(*program, "upTriangle");
    upKernel.setArg(0, upWeight);
    upKernel.setArg(1, downWeight);
    upKernel.setArg(2, discountFactor);
    upKernel.setArg(3, valueBuffer);
    upKernel.setArg(4, cl::Local(sizeof(float) * groupSize));
    upKernel.setArg(5, triangleBuffer);

    cl::Kernel downKernel(*program, "downTriangle");
    downKernel.setArg(0, upWeight);
    downKernel.setArg(1, downWeight);
    downKernel.setArg(2, discountFactor);
    downKernel.setArg(3, valueBuffer);
    downKernel.setArg(4, cl::Local(sizeof(float) * groupSize));
    downKernel.setArg(5, triangleBuffer);
    for (int i = 0; i < optionSpec.numSteps / stepSize; i ++) {
        int numWorkGroupsUp = optionSpec.numSteps / stepSize - i;
        int numWorkGroupsDown = numWorkGroupsUp - 1;
        int numWorkItemsUp = numWorkGroupsUp * groupSize;
        int numWorkItemsDown = numWorkGroupsDown * groupSize;

        // BUGFIX: the local work size was previously outside the call —
        // `...cl::NDRange(numWorkItemsUp)), cl::NDRange(groupSize);`
        // discarded the group size via the comma operator, letting the
        // runtime pick an arbitrary local size even though the kernels
        // depend on groups of exactly groupSize items.
        queue.enqueueNDRangeKernel(upKernel,
                            cl::NullRange,
                            cl::NDRange(numWorkItemsUp),
                            cl::NDRange(groupSize));
        // std::cout << "[INFO] Executing up kernel with " << numWorkGroupsUp
        //         << " work groups and " << groupSize << " work items per group"
        //         << std::endl; 

        queue.enqueueBarrierWithWaitList();

        if (numWorkGroupsDown > 0) {
            // BUGFIX: same misplaced parenthesis as the up-kernel dispatch.
            queue.enqueueNDRangeKernel(downKernel,
                    cl::NullRange,
                    cl::NDRange(numWorkItemsDown),
                    cl::NDRange(groupSize));
            // std::cout << "[INFO] Executing down kernel with " << numWorkGroupsDown
            //     << " work groups and " << groupSize << " work items per group"
            //     << std::endl; 
            queue.enqueueBarrierWithWaitList();
        }
    }

    // Read results into a stack variable (the previous `new float` was
    // never deleted — a leak on every call).
    float value;
    queue.enqueueReadBuffer(valueBuffer, 
                            CL_TRUE, 
                            0, 
                            sizeof(float), 
                            &value);
    return value; 
}
/**
 * Algorithm:
 *  init kernel:
 *      Use (optionSpec.numSteps + 1) work-items to compute the option values
 *      at expiry
 *      Only executed once
 *
 *  group kernel:
 *      Each work-item calculate the previous option values of (groupSize)
 *      lattice points
 *      Kernel executed (optionSpec.numSteps) times
 *      Each execution reduces the number of lattice points by 1
 */
/**
 * Price an option on the binomial lattice, one backward-induction step
 * per kernel launch, with each work-item reducing `groupSize` lattice
 * points.  Ping-pongs between two value buffers each step.
 *
 * @param optionSpec option parameters (spot, strike, steps, type, ...)
 * @param groupSize  lattice points handled per work-item per step
 * @return the discounted option value at the lattice root
 */
double OpenCLPricer::priceImplGroup(OptionSpec& optionSpec, int groupSize) {
    // ------------------------Derived Parameters------------------------------
    float deltaT = optionSpec.yearsToMaturity / optionSpec.numSteps;

    float upFactor = exp(optionSpec.volatility * sqrt(deltaT));
    float downFactor = 1.0f / upFactor;

    float discountFactor = exp(optionSpec.riskFreeRate * deltaT);

    // Risk-neutral up/down probabilities.
    float upWeight = (discountFactor - downFactor) / (upFactor - downFactor);
    float downWeight = 1.0f - upWeight;
    
    // Create buffers on the devices
    cl::Buffer valueBufferA(*context, 
                           CL_MEM_READ_WRITE,
                           sizeof(float) * (optionSpec.numSteps + 1));

    cl::Buffer valueBufferB(*context, 
                           CL_MEM_READ_WRITE,
                           sizeof(float) * (optionSpec.numSteps + 1));

    // Create queue to push commands for the devices
    cl::CommandQueue queue(*context, *defaultDevice);
    
    // Build and run init kernel: one work-item per terminal lattice node.
    cl::Kernel initKernel(*program, "init");
    initKernel.setArg(0, optionSpec.stockPrice);
    initKernel.setArg(1, optionSpec.strikePrice);
    initKernel.setArg(2, optionSpec.numSteps);
    initKernel.setArg(3, optionSpec.type);
    initKernel.setArg(4, deltaT);
    initKernel.setArg(5, upFactor);
    initKernel.setArg(6, downFactor);
    initKernel.setArg(7, valueBufferA);
    queue.enqueueNDRangeKernel(initKernel, 
                              cl::NullRange, 
                              cl::NDRange(optionSpec.numSteps + 1), 
                              cl::NullRange);
    // std::cout << "[INFO] Executing init kernel with " << optionSpec.numSteps + 1
    //        << " work items" << std::endl;

    // Block until init kernel finishes execution
    queue.enqueueBarrierWithWaitList();

    // Build and run group kernel; args 3/4 alternate so each step reads
    // the previous step's output and writes the other buffer.
    cl::Kernel groupKernel(*program, "group");
    groupKernel.setArg(0, upWeight);
    groupKernel.setArg(1, downWeight);
    groupKernel.setArg(2, discountFactor);
    for (int i = 1; i <= optionSpec.numSteps; i ++) {
        int numLatticePoints = optionSpec.numSteps + 1 - i;
        int numWorkItems = ceil((float) numLatticePoints / groupSize);
        groupKernel.setArg(3, i % 2 == 1 ? valueBufferA : valueBufferB);
        groupKernel.setArg(4, i % 2 == 1 ? valueBufferB: valueBufferA);
        groupKernel.setArg(5, numLatticePoints);
        groupKernel.setArg(6, groupSize);
        queue.enqueueNDRangeKernel(groupKernel,
                            cl::NullRange,
                            cl::NDRange(numWorkItems),
                            cl::NullRange);

        // std::cout << "[INFO] Executing group kernel with " << numWorkItems
        //         << " work items" << std::endl;
        queue.enqueueBarrierWithWaitList();
    }

    // Read the root value from whichever buffer the final step wrote.
    // Read into a stack variable (the previous `new float` was never
    // deleted — a leak on every call).
    float value;
    queue.enqueueReadBuffer(optionSpec.numSteps % 2 == 1? 
                            valueBufferB : valueBufferA, 
                            CL_TRUE, 
                            0, 
                            sizeof(float), 
                            &value);
    return value; 
}
 /// Construct a texture-based gpuNUFFT operator.
 ///
 /// Forwards all sizing parameters to the GpuNUFFTOperator base
 /// (balanced mode off, operator type TEXTURE), selects 2D texture
 /// lookup interpolation, and initializes the interpolation kernel.
 TextureGpuNUFFTOperator(IndType kernelWidth, IndType sectorWidth, DType osf, Dimensions imgDims): 
 GpuNUFFTOperator(kernelWidth,sectorWidth,osf,imgDims,false,TEXTURE),interpolationType(gpuNUFFT::TEXTURE2D_LOOKUP)
 {
   initKernel();	
 }
void btParticlesDynamicsWorld::initCLKernels(int argc, char** argv)
{
    cl_int ciErrNum;

	if (!m_cxMainContext)
	{
//		m_cxMainContext = clCreateContextFromType(0, CL_DEVICE_TYPE_ALL, NULL, NULL, &ciErrNum);
		m_cxMainContext = btOclCommon::createContextFromType(CL_DEVICE_TYPE_ALL, &ciErrNum);
		oclCHECKERROR(ciErrNum, CL_SUCCESS);
		m_cdDevice = btOclGetMaxFlopsDev(m_cxMainContext);
		
		btOclPrintDevInfo(m_cdDevice);

		// create a command-queue
		m_cqCommandQue = clCreateCommandQueue(m_cxMainContext, m_cdDevice, 0, &ciErrNum);
		oclCHECKERROR(ciErrNum, CL_SUCCESS);
	}
	// Program Setup
	size_t program_length;
	char* fileName = "ParticlesOCL.cl";
	FILE * fp = fopen(fileName, "rb");
	char newFileName[512];
	
	if (fp == NULL)
	{
		sprintf(newFileName,"..//%s",fileName);
		fp = fopen(newFileName, "rb");
		if (fp)
			fileName = newFileName;
	}
	
	if (fp == NULL)
	{
		sprintf(newFileName,"Demos//ParticlesOpenCL//%s",fileName);
		fp = fopen(newFileName, "rb");
		if (fp)
			fileName = newFileName;
	}

	if (fp == NULL)
	{
		sprintf(newFileName,"..//..//..//..//..//Demos//ParticlesOpenCL//%s",fileName);
		fp = fopen(newFileName, "rb");
		if (fp)
			fileName = newFileName;
		else
		{
			printf("cannot find %s\n",newFileName);
			exit(0);
		}
	}

//	char *source = oclLoadProgSource(".//Demos//SpheresGrid//SpheresGrid.cl", "", &program_length);
	//char *source = btOclLoadProgSource(".//Demos//SpheresOpenCL//Shared//SpheresGrid.cl", "", &program_length);

	char *source = btOclLoadProgSource(fileName, "", &program_length);
	if(source == NULL)
	{
		printf("ERROR : OpenCL can't load file %s\n", fileName);
	}
//	oclCHECKERROR (source == NULL, oclFALSE);   
	btAssert(source != NULL);

	// create the program
	printf("OpenCL compiles %s ...", fileName);
	m_cpProgram = clCreateProgramWithSource(m_cxMainContext, 1, (const char**)&source, &program_length, &ciErrNum);
	oclCHECKERROR(ciErrNum, CL_SUCCESS);
	free(source);

	// build the program
	ciErrNum = clBuildProgram(m_cpProgram, 0, NULL, "-I .", NULL, NULL);
	if(ciErrNum != CL_SUCCESS)
	{
		// write out standard error
//		oclLog(LOGBOTH | ERRORMSG, (double)ciErrNum, STDERROR);
		// write out the build log and ptx, then exit
		char cBuildLog[10240];
//		char* cPtx;
//		size_t szPtxLength;
		clGetProgramBuildInfo(m_cpProgram, btOclGetFirstDev(m_cxMainContext), CL_PROGRAM_BUILD_LOG, 
							  sizeof(cBuildLog), cBuildLog, NULL );
//		oclGetProgBinary(m_cpProgram, oclGetFirstDev(m_cxMainContext), &cPtx, &szPtxLength);
//		oclLog(LOGBOTH | CLOSELOG, 0.0, "\n\nLog:\n%s\n\n\n\n\nPtx:\n%s\n\n\n", cBuildLog, cPtx);
		printf("\n\n%s\n\n\n", cBuildLog);
		printf("Press ENTER key to terminate the program\n");
		getchar();
		exit(-1); 
	}
	printf("OK\n");

	// create the kernels

	postInitDeviceData();

	initKernel(PARTICLES_KERNEL_COMPUTE_CELL_ID, "kComputeCellId");
	ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_COMPUTE_CELL_ID].m_kernel, 1, sizeof(cl_mem), (void*) &m_dPos);
	ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_COMPUTE_CELL_ID].m_kernel, 2, sizeof(cl_mem), (void*) &m_dPosHash);
	ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_COMPUTE_CELL_ID].m_kernel, 3, sizeof(cl_mem), (void*) &m_dSimParams);
	oclCHECKERROR(ciErrNum, CL_SUCCESS);

	initKernel(PARTICLES_KERNEL_INTEGRATE_MOTION, "kIntegrateMotion");
	ciErrNum  = clSetKernelArg(m_kernels[PARTICLES_KERNEL_INTEGRATE_MOTION].m_kernel, 1, sizeof(cl_mem), (void *) &m_dPos);
	ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_INTEGRATE_MOTION].m_kernel, 2, sizeof(cl_mem), (void *) &m_dVel);
	ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_INTEGRATE_MOTION].m_kernel, 3, sizeof(cl_mem), (void *) &m_dSimParams);
	oclCHECKERROR(ciErrNum, CL_SUCCESS);


	initKernel(PARTICLES_KERNEL_CLEAR_CELL_START, "kClearCellStart");
	ciErrNum  = clSetKernelArg(m_kernels[PARTICLES_KERNEL_CLEAR_CELL_START].m_kernel, 0, sizeof(int),		(void *) &m_numGridCells);
	ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_CLEAR_CELL_START].m_kernel, 1, sizeof(cl_mem),	(void*) &m_dCellStart);

	initKernel(PARTICLES_KERNEL_FIND_CELL_START, "kFindCellStart");
	ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_FIND_CELL_START].m_kernel, 1, sizeof(cl_mem),	(void*) &m_dPosHash);
	ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_FIND_CELL_START].m_kernel, 2, sizeof(cl_mem),	(void*) &m_dCellStart);
	ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_FIND_CELL_START].m_kernel, 3, sizeof(cl_mem),	(void*) &m_dPos);
	ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_FIND_CELL_START].m_kernel, 4, sizeof(cl_mem),	(void*) &m_dVel);
	ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_FIND_CELL_START].m_kernel, 5, sizeof(cl_mem),	(void*) &m_dSortedPos);
	ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_FIND_CELL_START].m_kernel, 6, sizeof(cl_mem),	(void*) &m_dSortedVel);
	oclCHECKERROR(ciErrNum, CL_SUCCESS);

	initKernel(PARTICLES_KERNEL_COLLIDE_PARTICLES, "kCollideParticles");
	ciErrNum  = clSetKernelArg(m_kernels[PARTICLES_KERNEL_COLLIDE_PARTICLES].m_kernel, 1, sizeof(cl_mem),	(void*) &m_dVel);
	ciErrNum  = clSetKernelArg(m_kernels[PARTICLES_KERNEL_COLLIDE_PARTICLES].m_kernel, 2, sizeof(cl_mem),	(void*) &m_dSortedPos);
	ciErrNum  = clSetKernelArg(m_kernels[PARTICLES_KERNEL_COLLIDE_PARTICLES].m_kernel, 3, sizeof(cl_mem),	(void*) &m_dSortedVel);
	ciErrNum  = clSetKernelArg(m_kernels[PARTICLES_KERNEL_COLLIDE_PARTICLES].m_kernel, 4, sizeof(cl_mem),	(void*) &m_dPosHash);
	ciErrNum  = clSetKernelArg(m_kernels[PARTICLES_KERNEL_COLLIDE_PARTICLES].m_kernel, 5, sizeof(cl_mem),	(void*) &m_dCellStart);
	ciErrNum  = clSetKernelArg(m_kernels[PARTICLES_KERNEL_COLLIDE_PARTICLES].m_kernel, 6, sizeof(cl_mem),	(void*) &m_dSimParams);

	initKernel(PARTICLES_KERNEL_BITONIC_SORT_CELL_ID_LOCAL, "kBitonicSortCellIdLocal");
	initKernel(PARTICLES_KERNEL_BITONIC_SORT_CELL_ID_LOCAL_1, "kBitonicSortCellIdLocal1");
	initKernel(PARTICLES_KERNEL_BITONIC_SORT_CELL_ID_MERGE_GLOBAL, "kBitonicSortCellIdMergeGlobal");
	initKernel(PARTICLES_KERNEL_BITONIC_SORT_CELL_ID_MERGE_LOCAL, "kBitonicSortCellIdMergeLocal");
}
Esempio n. 13
0
//------------------------------------------------------------------------------
int main(int argc, char** argv) {
    if(argc < 5) {
        std::cout << "usage: " << argv[0]
                << " <platform id(0, 1, ...)>"
                   " <device type: default | cpu | gpu | acc>"
                   " <device id(0, 1, ...)>"
                   " <number of double prec. elements>\n";

        exit(EXIT_FAILURE);          
    }
    std::vector<cl::Platform> platforms;
    std::vector<cl::Device> devices;
    const int platformID = atoi(argv[1]);
    cl_device_type deviceType;
    const std::string kernelName(argv[4]);
    const std::string dt = std::string(argv[2]);
    if(dt == "default") deviceType = CL_DEVICE_TYPE_DEFAULT;
    else if(dt == "cpu") deviceType = CL_DEVICE_TYPE_CPU;
    else if(dt == "gpu") deviceType = CL_DEVICE_TYPE_GPU;
    else if(dt == "acc") deviceType = CL_DEVICE_TYPE_ACCELERATOR;
    else {
      std::cerr << "ERROR - unrecognized device type " << dt << std::endl;
      exit(EXIT_FAILURE);
    } 
    const int deviceID = atoi(argv[3]);
    const size_t SIZE = atoll(argv[4]);
    const size_t BYTE_SIZE = SIZE * sizeof(real_t);
    // init MPI environment
    MPI_Init(&argc, &argv);
    int task = -1;
   
    MPI_Comm_rank(MPI_COMM_WORLD, &task);
    try {
       
        //OpenCL init
        cl::Platform::get(&platforms);
        if(platforms.size() <= platformID) {
            std::cerr << "Platform id " << platformID << " is not available\n";
            exit(EXIT_FAILURE);
        }
   
        platforms[platformID].getDevices(deviceType, &devices);
        cl::Context context(devices);
        cl::CommandQueue queue(context, devices[deviceID],
                               CL_QUEUE_PROFILING_ENABLE);

        std::vector< real_t > data(SIZE, -1);
        //device buffer #1: holds local data
        cl::Buffer devData(context,
                            CL_MEM_READ_WRITE 
                            | CL_MEM_ALLOC_HOST_PTR //<-- page locked memory
                            | CL_MEM_COPY_HOST_PTR, //<-- copy data from 'data'
                            BYTE_SIZE,
                            const_cast< double* >(&data[0]));
        //device buffer #2: holds data received from other node
        cl::Buffer devRecvData(context,
                            CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,
                            BYTE_SIZE);
        //process data on the GPU(set array elements to local MPI id)  
        const char CLCODE_INIT[] =
            "#pragma OPENCL EXTENSION cl_khr_fp64: enable\n"
            "typedef double real_t;\n"
            "__kernel void arrayset(__global real_t* outputArray,\n"
            "                       real_t value) {\n"
            "//get global thread id for dimension 0\n"
            "const int id = get_global_id(0);\n"
            "outputArray[id] = value;\n" 
            "}";
    
        cl::Program::Sources initSource(1, 
                                        std::make_pair(CLCODE_INIT,
                                                       sizeof(CLCODE_INIT)));
        cl::Program initProgram(context, initSource);
        initProgram.build(devices);
        cl::Kernel initKernel(initProgram, "arrayset");        
        initKernel.setArg(0, devData);
        initKernel.setArg(1, real_t(task));
       
        queue.enqueueNDRangeKernel(initKernel,
                                 cl::NDRange(0),
                                 cl::NDRange(SIZE),
                                 cl::NDRange(1));

        //perform data exchange:
        //1) map device buffers to host memory
        void* sendHostPtr = queue.enqueueMapBuffer(devData,
                                               CL_FALSE,
                                               CL_MAP_READ,
                                               0,
                                               BYTE_SIZE);

        if(sendHostPtr == 0) throw std::runtime_error("NULL mapped ptr");
    
        void* recvHostPtr = queue.enqueueMapBuffer(devRecvData,
                                               CL_FALSE,
                                               CL_MAP_WRITE,
                                               0,
                                               BYTE_SIZE);
       
        if(recvHostPtr == 0) throw std::runtime_error("NULL mapped ptr");

        queue.finish();

        //2) copy data to from remote process
        const int tag0to1 = 0x01;
        const int tag1to0 = 0x10;
        MPI_Request send_req;
        MPI_Request recv_req;
        int source = -1;
        int dest = -1;
        if(task == 0 ) {
            source = 1;
            dest   = 1;
        } else {
            source = 0;
            dest   = 0;
        }

        MPI_Status status;
        if(task == 0) {
            MPI_Isend(sendHostPtr, SIZE, MPI_DOUBLE, dest,
                      tag0to1, MPI_COMM_WORLD, &send_req);
            MPI_Irecv(recvHostPtr, SIZE, MPI_DOUBLE, source,
                      tag1to0, MPI_COMM_WORLD, &recv_req);
        } else {
            MPI_Isend(sendHostPtr, SIZE, MPI_DOUBLE, dest,
                      tag1to0, MPI_COMM_WORLD, &send_req);
            MPI_Irecv(recvHostPtr, SIZE, MPI_DOUBLE, source,
                      tag0to1, MPI_COMM_WORLD, &recv_req);
        }
        //3) as soon as data is copied do unmap buffers, indirectlry
        //   triggering a host --> device copy
        MPI_Wait(&recv_req, &status);
        queue.enqueueUnmapMemObject(devRecvData, recvHostPtr);
        MPI_Wait(&send_req, &status);
        queue.enqueueUnmapMemObject(devData, sendHostPtr);

        //note that instead of having each process compile the code
        //you could e.g. send the size and content of the source buffer
        //to each process from root; or even send the precompiled code,
        //in this case all nodes of the clusted must be the same whereas
        //in the case of source code compilation hybrid systems are
        //automatically supported by OpenCL

        //process data on the GPU: increment local data array with value
        //received from other process
        const char CLCODE_COMPUTE[] =
            "#pragma OPENCL EXTENSION cl_khr_fp64: enable\n"
            "typedef double real_t;\n"
            "__kernel void sum( __global const real_t* in,\n"
            "                   __global real_t* inout) {\n"
            "const int id = get_global_id(0);\n"
            "inout[id] += in[id];\n" 
            "}";
        cl::Program::Sources computeSource(1,
                                           std::make_pair(CLCODE_COMPUTE,
                                                          sizeof(CLCODE_COMPUTE)));
        cl::Program computeProgram(context, computeSource);
        computeProgram.build(devices);
        cl::Kernel computeKernel(computeProgram, "sum");        
        computeKernel.setArg(0, devRecvData);
        computeKernel.setArg(1, devData);

        queue.enqueueNDRangeKernel(computeKernel,
                                 cl::NDRange(0),
                                 cl::NDRange(SIZE),
                                 cl::NDRange(1));
        
        //map device data to host memory for validation and output
        real_t* computedDataHPtr = reinterpret_cast< real_t* >(
                                        queue.enqueueMapBuffer(devData,
                                               CL_FALSE,
                                               CL_MAP_READ,
                                               0,
                                               BYTE_SIZE));
        if(computedDataHPtr == 0) throw std::runtime_error("NULL mapped ptr");

        queue.finish();

        const int value = 1; // task id 0 + task id 1
        const std::vector< real_t > reference(SIZE, value);
        if(std::equal(computedDataHPtr, computedDataHPtr + SIZE,
                      reference.begin())) {
            std::cout << '[' << task << "]: PASSED" << std::endl;
        } else {
            std::cout << '[' << task << "]: FAILED" << std::endl;
        }
        //release mapped pointer
        queue.enqueueUnmapMemObject(devData, computedDataHPtr);
        //release MPI resources
        MPI_Finalize();
    } catch(cl::Error e) {
      std::cerr << e.what() << ": Error code " << e.err() << std::endl;
      MPI_Finalize();
      exit(EXIT_FAILURE);   
    }   
    return 0;
}
/// Initializes the OpenCL runtime for the particle demo: lazily creates the
/// context and command queue, loads and builds the ParticlesOCL.cl program,
/// then creates each simulation kernel and binds its persistent cl_mem
/// arguments (per-launch arguments such as particle counts are set elsewhere).
///
/// @param argc Unused; kept for interface compatibility with callers.
/// @param argv Unused; kept for interface compatibility with callers.
///
/// Fix: the kCollideParticles argument-setup sequence previously assigned
/// every clSetKernelArg status with '=' instead of accumulating with '|=',
/// silently discarding all but the last result, and was the only kernel whose
/// setup was never followed by an oclCHECKERROR guard. Both are corrected to
/// match the pattern used by the other kernels in this function.
void btParticlesDynamicsWorld::initCLKernels(int argc, char** argv)
{
    cl_int ciErrNum;

	if (!m_cxMainContext)
	{
		// First call: create a context on any available device type.
		cl_device_type deviceType = CL_DEVICE_TYPE_ALL;
		m_cxMainContext = btOpenCLUtils::createContextFromType(deviceType, &ciErrNum, 0, 0);
	
		int numDev = btOpenCLUtils::getNumDevices(m_cxMainContext);
		if (!numDev)
		{
			btAssert(0);
			exit(0);//this is just a demo, exit now
		}

		m_cdDevice =  btOpenCLUtils::getDevice(m_cxMainContext,0);
		// NOTE(review): getDevice() does not write ciErrNum, so this still
		// validates the createContextFromType() status from above.
    	oclCHECKERROR(ciErrNum, CL_SUCCESS);

		btOpenCLDeviceInfo clInfo;
		btOpenCLUtils::getDeviceInfo(m_cdDevice,clInfo);
		btOpenCLUtils::printDeviceInfo(m_cdDevice);

		// create a command-queue
		m_cqCommandQue = clCreateCommandQueue(m_cxMainContext, m_cdDevice, 0, &ciErrNum);
		oclCHECKERROR(ciErrNum, CL_SUCCESS);
	}
	// Program Setup: obtain the kernel source either from the embedded
	// 'source' string or by probing a few well-known relative paths.
	size_t program_length;


#ifdef LOAD_FROM_MEMORY
	program_length = strlen(source);
	printf("OpenCL compiles ParticlesOCL.cl ... ");
#else

	const char* fileName = "ParticlesOCL.cl";
	FILE * fp = fopen(fileName, "rb");
	char newFileName[512];
	
	// The demo may be launched from the binary directory, the demo
	// directory, or deep inside the build tree — try each in turn.
	if (fp == NULL)
	{
		sprintf(newFileName,"..//%s",fileName);
		fp = fopen(newFileName, "rb");
		if (fp)
			fileName = newFileName;
	}
	
	if (fp == NULL)
	{
		sprintf(newFileName,"Demos//ParticlesOpenCL//%s",fileName);
		fp = fopen(newFileName, "rb");
		if (fp)
			fileName = newFileName;
	}

	if (fp == NULL)
	{
		sprintf(newFileName,"..//..//..//..//..//Demos//ParticlesOpenCL//%s",fileName);
		fp = fopen(newFileName, "rb");
		if (fp)
			fileName = newFileName;
		else
		{
			printf("cannot find %s\n",newFileName);
			exit(0);
		}
	}

//	char *source = oclLoadProgSource(".//Demos//SpheresGrid//SpheresGrid.cl", "", &program_length);
	//char *source = btOclLoadProgSource(".//Demos//SpheresOpenCL//Shared//SpheresGrid.cl", "", &program_length);

	char *source = btOclLoadProgSource(fileName, "", &program_length);
	if(source == NULL)
	{
		printf("ERROR : OpenCL can't load file %s\n", fileName);
	}
//	oclCHECKERROR (source == NULL, oclFALSE);   
	btAssert(source != NULL);

	// create the program
	printf("OpenCL compiles %s ...", fileName);

#endif //LOAD_FROM_MEMORY


	//printf("%s\n", source);

	m_cpProgram = clCreateProgramWithSource(m_cxMainContext, 1, (const char**)&source, &program_length, &ciErrNum);
	oclCHECKERROR(ciErrNum, CL_SUCCESS);
#ifndef LOAD_FROM_MEMORY
	// Source was heap-allocated by btOclLoadProgSource; the program object
	// owns its own copy after clCreateProgramWithSource.
	free(source);
#endif //LOAD_FROM_MEMORY

	//#define LOCAL_SIZE_LIMIT 1024U
#define LOCAL_SIZE_MAX 1024U

		    // Build the program with 'mad' Optimization option
#ifdef MAC
	const char* flags = "-I. -DLOCAL_SIZE_MAX=1024U -cl-mad-enable -DMAC -DGUID_ARG";
#else
	const char* flags = "-I. -DLOCAL_SIZE_MAX=1024U -DGUID_ARG= ";
#endif
	// build the program
	ciErrNum = clBuildProgram(m_cpProgram, 0, NULL, flags, NULL, NULL);
	if(ciErrNum != CL_SUCCESS)
	{
		// write out standard error
//		oclLog(LOGBOTH | ERRORMSG, (double)ciErrNum, STDERROR);
		// write out the build log and ptx, then exit
		char cBuildLog[10240];
//		char* cPtx;
//		size_t szPtxLength;
		clGetProgramBuildInfo(m_cpProgram, m_cdDevice, CL_PROGRAM_BUILD_LOG, 
							  sizeof(cBuildLog), cBuildLog, NULL );
//		oclGetProgBinary(m_cpProgram, oclGetFirstDev(m_cxMainContext), &cPtx, &szPtxLength);
//		oclLog(LOGBOTH | CLOSELOG, 0.0, "\n\nLog:\n%s\n\n\n\n\nPtx:\n%s\n\n\n", cBuildLog, cPtx);
		printf("\n\n%s\n\n\n", cBuildLog);
		printf("Press ENTER key to terminate the program\n");
		getchar();
		exit(-1); 
	}
	printf("OK\n");

	// create the kernels

	postInitDeviceData();

	// Each kernel binds its cl_mem buffer arguments once here; argument 0 is
	// typically a scalar set per launch (hence the skipped index below).
	initKernel(PARTICLES_KERNEL_COMPUTE_CELL_ID, "kComputeCellId");
	ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_COMPUTE_CELL_ID].m_kernel, 1, sizeof(cl_mem), (void*) &m_dPos);
	ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_COMPUTE_CELL_ID].m_kernel, 2, sizeof(cl_mem), (void*) &m_dPosHash);
	ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_COMPUTE_CELL_ID].m_kernel, 3, sizeof(cl_mem), (void*) &m_dSimParams);
	oclCHECKERROR(ciErrNum, CL_SUCCESS);

	initKernel(PARTICLES_KERNEL_INTEGRATE_MOTION, "kIntegrateMotion");
	ciErrNum  = clSetKernelArg(m_kernels[PARTICLES_KERNEL_INTEGRATE_MOTION].m_kernel, 1, sizeof(cl_mem), (void *) &m_dPos);
	ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_INTEGRATE_MOTION].m_kernel, 2, sizeof(cl_mem), (void *) &m_dVel);
	ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_INTEGRATE_MOTION].m_kernel, 3, sizeof(cl_mem), (void *) &m_dSimParams);
	oclCHECKERROR(ciErrNum, CL_SUCCESS);


	initKernel(PARTICLES_KERNEL_CLEAR_CELL_START, "kClearCellStart");
	ciErrNum  = clSetKernelArg(m_kernels[PARTICLES_KERNEL_CLEAR_CELL_START].m_kernel, 0, sizeof(int),		(void *) &m_numGridCells);
	ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_CLEAR_CELL_START].m_kernel, 1, sizeof(cl_mem),	(void*) &m_dCellStart);
	// No check here: ciErrNum keeps accumulating through the next kernel's
	// setup and is verified by the oclCHECKERROR that follows it.

	initKernel(PARTICLES_KERNEL_FIND_CELL_START, "kFindCellStart");
//	ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_FIND_CELL_START].m_kernel, 0, sizeof(int),	(void*) &m_numParticles);
	ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_FIND_CELL_START].m_kernel, 1, sizeof(cl_mem),	(void*) &m_dPosHash);
	ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_FIND_CELL_START].m_kernel, 2, sizeof(cl_mem),	(void*) &m_dCellStart);
	ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_FIND_CELL_START].m_kernel, 3, sizeof(cl_mem),	(void*) &m_dPos);
	ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_FIND_CELL_START].m_kernel, 4, sizeof(cl_mem),	(void*) &m_dVel);
	ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_FIND_CELL_START].m_kernel, 5, sizeof(cl_mem),	(void*) &m_dSortedPos);
	ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_FIND_CELL_START].m_kernel, 6, sizeof(cl_mem),	(void*) &m_dSortedVel);
	oclCHECKERROR(ciErrNum, CL_SUCCESS);

	initKernel(PARTICLES_KERNEL_COLLIDE_PARTICLES, "kCollideParticles");
	// FIX: accumulate every clSetKernelArg status with '|=' (previously plain
	// '=' discarded all results but the last) and check like the others.
	ciErrNum  = clSetKernelArg(m_kernels[PARTICLES_KERNEL_COLLIDE_PARTICLES].m_kernel, 1, sizeof(cl_mem),	(void*) &m_dVel);
	ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_COLLIDE_PARTICLES].m_kernel, 2, sizeof(cl_mem),	(void*) &m_dSortedPos);
	ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_COLLIDE_PARTICLES].m_kernel, 3, sizeof(cl_mem),	(void*) &m_dSortedVel);
	ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_COLLIDE_PARTICLES].m_kernel, 4, sizeof(cl_mem),	(void*) &m_dPosHash);
	ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_COLLIDE_PARTICLES].m_kernel, 5, sizeof(cl_mem),	(void*) &m_dCellStart);
	ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_COLLIDE_PARTICLES].m_kernel, 6, sizeof(cl_mem),	(void*) &m_dSimParams);
	oclCHECKERROR(ciErrNum, CL_SUCCESS);

	// Bitonic-sort kernels take all their arguments per launch.
	initKernel(PARTICLES_KERNEL_BITONIC_SORT_CELL_ID_LOCAL, "kBitonicSortCellIdLocal");
	initKernel(PARTICLES_KERNEL_BITONIC_SORT_CELL_ID_LOCAL_1, "kBitonicSortCellIdLocal1");
	initKernel(PARTICLES_KERNEL_BITONIC_SORT_CELL_ID_MERGE_GLOBAL, "kBitonicSortCellIdMergeGlobal");
	initKernel(PARTICLES_KERNEL_BITONIC_SORT_CELL_ID_MERGE_LOCAL, "kBitonicSortCellIdMergeLocal");
}