int main(int argc, char **argv) { int i; int ret; spe_context_ptr_t spe[NUM_SPE]; spe_program_handle_t *prog; pthread_t thread[NUM_SPE]; prog = spe_image_open("increment_spe.elf"); if (!prog) { perror("spe_image_open"); exit(1); } for (i = 0; i < NUM_SPE; i++) { spe[i] = spe_context_create(0, NULL); if (!spe) { perror("spe_context_create"); exit(1); } ret = spe_program_load(spe[i], prog); if (ret) { perror("spe_program_load"); exit(1); } } for (i = 0; i < NUM_SPE; i++) { ret = pthread_create(&thread[i], NULL, run_increment_spe, &spe[i]); if (ret) { perror("pthread_create"); exit(1); } } for (i = 0; i < NUM_SPE; i++) { pthread_join(thread[i], NULL); ret = spe_context_destroy(spe[i]); if (ret < 0) { perror("spe_context_destroy"); exit(1); } } ret = spe_image_close(prog); if (ret) { perror("spe_image_close"); exit(1); } printf("result=%d\n", counter[0]); return 0; }
/*
 * Runs print_param_spe.elf on a single SPE context, passing the integer
 * value 12345678 to the SPE program through the argp argument of
 * spe_context_run() (the value is carried in the pointer itself).
 *
 * Every failing libspe2 call is reported through perror() and terminates
 * the process with exit status 1.
 */
int main(int argc, char **argv)
{
    int ret;
    spe_context_ptr_t spe;
    spe_program_handle_t *prog;
    unsigned int entry;
    spe_stop_info_t stop_info;
    unsigned long param;

    prog = spe_image_open("print_param_spe.elf");
    if (!prog) {
        perror("spe_image_open");
        exit(1);
    }

    spe = spe_context_create(0, NULL);
    if (!spe) {
        perror("spe_context_create");
        exit(1);
    }

    ret = spe_program_load(spe, prog);
    if (ret) {
        perror("spe_program_load");
        exit(1);
    }

    param = 12345678;
    /* param is unsigned long, so the matching conversion is %lu. */
    printf("[PPE] param=%lu\n", param);

    entry = SPE_DEFAULT_ENTRY;
    ret = spe_context_run(spe, &entry, 0, (void *) param, NULL, &stop_info);
    if (ret < 0) {
        perror("spe_context_run");
        exit(1);
    }

    ret = spe_context_destroy(spe);
    if (ret) {
        perror("spe_context_destroy");
        exit(1);
    }

    ret = spe_image_close(prog);
    if (ret) {
        perror("spe_image_close");
        exit(1);
    }

    return 0;
}
initDisp( unsigned int numspes ) { // Get the number of available SPEs speThreads = spe_cpu_info_get(SPE_COUNT_USABLE_SPES, -1); // Clamp to the defined number of SPEs used if ( speThreads > MAX_SPU_NUM ) { speThreads = MAX_SPU_NUM; } if( speThreads > numspes ) { speThreads = numspes; } //printf("InitDist. speThreads is: %d\n",speThreads); unsigned int i; unsigned int temp; // Get dispatcher //printf("Getting the dispatcher\n"); //spe_program_handle_t *dispatcher = spe_image_open( "/home/jens/numpycbe_dispatcher" ); spe_program_handle_t *dispatcher = spe_image_open( "./../../../../numpycbe_dispatcher" ); //printf("After getting the dispatcher\n"); // Initialize threads for( i = 0 ; i < speThreads ; i++ ) { CreateSPEThread( &speData[i], dispatcher, &spe_pointer_addr[i] ); // Sending the SPE its id //printf("spe_write MULTIARRAYMODULE Sending id to SPE %d.\n",i); spe_in_mbox_write ( speData[i].spe_ctx, &i, 1, SPE_MBOX_ALL_BLOCKING ); // Sending the SPE its seed. This should be something like time instead of id? //printf("spe_write MULTIARRAYMODULE Sending seed to SPE %d.\n",i); spe_in_mbox_write ( speData[i].spe_ctx, &i, 1, SPE_MBOX_ALL_BLOCKING ); } //printf("speData[i].spe_ctx is : %d\n",speData[i].spe_ctx); //spe_in_mbox_write ( (void*)temp, &i, 1, SPE_MBOX_ALL_BLOCKING ); return 0; }
/*
 * Writes `length` bytes of `binary` to the file /tmp/opencl.bin.1.0, marks
 * the file rwx for the owner and opens it as an SPE image, storing the
 * handle in the file-scope variable spe_program.
 *
 * The file is sized by TouchFile(), filled through a shared mapping and
 * then handed to spe_image_open() by name.  The temporary mapping and the
 * descriptor are released on every path, including success.
 *
 * Returns 1 when spe_program holds a valid image, 0 on any failure.
 */
static int LoadBinary(size_t length, const unsigned char * binary)
{
    char filename[512];
    int loaded = 0;
    int fd;
    void * data;

    snprintf(filename, sizeof filename, "/tmp/opencl.bin.%lu.%u",
             (long unsigned int) 1, 0u);
    TouchFile(filename, length);

    fd = open(filename, O_RDWR);
    if(fd < 0)
        return 0;

    data = mmap((caddr_t)0, length, PROT_WRITE | PROT_READ, MAP_SHARED, fd, 0);
    if(data == MAP_FAILED)
    {
        close(fd);
        return 0;
    }

    memcpy(data, (const void *) binary, length);

    /* the file must have execution privileges. set execution for user */
    if(chmod(filename, S_IRWXU) != 0)
        goto UnMapFile;

    spe_program = spe_image_open(filename);
    if(spe_program == NULL)
    {
        if(errno == EACCES)
            printf("EACCES\n");
        else if(errno == EFAULT)
            printf("EFAULT\n");
        else
            perror(NULL);
        goto UnMapFile;
    }

    /* we have a good binary */
    loaded = 1;

UnMapFile:
    /* The staging mapping and descriptor are only needed to fill the file. */
    munmap(data, length);
    close(fd);
    return loaded;
}
int main(int argc, char **argv) { int i; int ret; spe_context_ptr_t spe; spe_program_handle_t *prog; unsigned int entry; spe_stop_info_t stop_info; if (argc == 1) { fprintf(stderr, "usage: %s <spu_image>\n", argv[0]); return -1; } prog = spe_image_open(argv[1]); if (!prog) { perror("spe_image_open"); exit(1); } spe = spe_context_create(0, NULL); if (!spe) { perror("spe_context_create"); exit(1); } ret = spe_program_load(spe, prog); if (ret) { perror("spe_program_load"); exit(1); } abs_params.ea_in = (unsigned long) in; abs_params.ea_out = (unsigned long) out; abs_params.size = SIZE; entry = SPE_DEFAULT_ENTRY; ret = spe_context_run(spe, &entry, 0, &abs_params, NULL, &stop_info); if (ret < 0) { perror("spe_context_run"); exit(1); } ret = spe_context_destroy(spe); if (ret) { perror("spe_context_destroy"); exit(1); } ret = spe_image_close(prog); if (ret) { perror("spe_image_close"); exit(1); } for (i = 0; i < SIZE; i++) { printf("%5.0f ", i, out[i]); if ((i+1) % 4 == 0) printf("\n"); } return 0; }
// Builds the physics scene for the CCD demo.
//
// Steps, in the order they appear below:
//  - enables texturing and disables shadows;
//  - registers two collision shapes: a ground shape (static plane when
//    USE_GROUND_PLANE is defined, otherwise a 200 x CUBE_HALF_EXTENTS x 200 box)
//    and an object shape (box when DO_BENCHMARK_PYRAMIDS is defined, otherwise
//    a cylinder);
//  - creates the collision configuration, the dispatcher
//    (SpuGatheringCollisionDispatcher when USE_PARALLEL_DISPATCHER is defined,
//    otherwise btCollisionDispatcher), a btDbvtBroadphase, the constraint
//    solver and the btDiscreteDynamicsWorld, then sets m_enableSPU and
//    gravity (0,-10,0);
//  - fills shapeIndex[] (0 for the first object, 1 for all others);
//  - when useCompound is set, replaces shape 1 by a compound shape wrapping
//    the old shape (plus a sphere unless CENTER_OF_MASS_SHIFT is defined);
//  - DO_WALL: creates gNumObjects bodies, the first one static, and sets
//    their CCD motion threshold / swept sphere radius;
//  - DO_BENCHMARK_PYRAMIDS: creates a static ground body, 15 stacks via
//    createStack() and (DESTROYER_BALL) a heavy sphere moving along -Z.
//
// NOTE(review): in the USE_LIBSPE2 branch the new SpuLibspe2Support is stored
// in a local named threadSupportCollision, while the dispatcher is built from
// the member m_threadSupportCollision -- confirm which one is intended.
// NOTE(review): when spe_image_open() fails only perror() is called; the NULL
// handle is still passed on to SpuLibspe2Support -- confirm this is handled.
// NOTE(review): under USER_DEFINED_FRICTION_MODEL the code uses `solver`,
// which is only declared in the non-USE_PARALLEL_SOLVER branch.
void CcdPhysicsDemo::initPhysics() { setTexturing(true); setShadows(false); #ifdef USE_PARALLEL_DISPATCHER #ifdef _WIN32 m_threadSupportSolver = 0; m_threadSupportCollision = 0; #endif // #endif //#define USE_GROUND_PLANE 1 #ifdef USE_GROUND_PLANE m_collisionShapes.push_back(new btStaticPlaneShape(btVector3(0,1,0),0.5)); #else ///Please don't make the box sizes larger then 1000: the collision detection will be inaccurate. ///See http://www.continuousphysics.com/Bullet/phpBB2/viewtopic.php?t=346 m_collisionShapes.push_back(new btBoxShape (btVector3(200,CUBE_HALF_EXTENTS,200))); #endif #ifdef DO_BENCHMARK_PYRAMIDS m_collisionShapes.push_back(new btBoxShape (btVector3(CUBE_HALF_EXTENTS,CUBE_HALF_EXTENTS,CUBE_HALF_EXTENTS))); #else // m_collisionShapes.push_back(new btBoxShape (btVector3(CUBE_HALF_EXTENTS,CUBE_HALF_EXTENTS,CUBE_HALF_EXTENTS))); m_collisionShapes.push_back(new btCylinderShape (btVector3(CUBE_HALF_EXTENTS,CUBE_HALF_EXTENTS,CUBE_HALF_EXTENTS))); #endif #ifdef DO_BENCHMARK_PYRAMIDS setCameraDistance(32.5f); #endif #ifdef DO_BENCHMARK_PYRAMIDS m_azi = 90.f; #endif //DO_BENCHMARK_PYRAMIDS m_dispatcher=0; m_collisionConfiguration = new btDefaultCollisionConfiguration(); #ifdef USE_PARALLEL_DISPATCHER int maxNumOutstandingTasks = 4; #ifdef USE_WIN32_THREADING m_threadSupportCollision = new Win32ThreadSupport(Win32ThreadSupport::Win32ThreadConstructionInfo( "collision", processCollisionTask, createCollisionLocalStoreMemory, maxNumOutstandingTasks)); #else #ifdef USE_LIBSPE2 spe_program_handle_t * program_handle; #ifndef USE_CESOF program_handle = spe_image_open ("./spuCollision.elf"); if (program_handle == NULL) { perror( "SPU OPEN IMAGE ERROR\n"); } else { printf( "IMAGE OPENED\n"); } #else extern spe_program_handle_t spu_program; program_handle = &spu_program; #endif SpuLibspe2Support* threadSupportCollision = new SpuLibspe2Support( program_handle, maxNumOutstandingTasks); #endif //USE_LIBSPE2 ///Playstation 3 SPU (SPURS) version is available through PS3 
Devnet /// For Unix/Mac someone could implement a pthreads version of btThreadSupportInterface? ///you can hook it up to your custom task scheduler by deriving from btThreadSupportInterface #endif m_dispatcher = new SpuGatheringCollisionDispatcher(m_threadSupportCollision,maxNumOutstandingTasks,m_collisionConfiguration); // m_dispatcher = new btCollisionDispatcher(m_collisionConfiguration); #else m_dispatcher = new btCollisionDispatcher(m_collisionConfiguration); #endif //USE_PARALLEL_DISPATCHER #ifdef USE_CUSTOM_NEAR_CALLBACK //this is optional m_dispatcher->setNearCallback(customNearCallback); #endif m_broadphase = new btDbvtBroadphase(); #ifdef COMPARE_WITH_QUICKSTEP m_solver = new btOdeQuickstepConstraintSolver(); #else #ifdef USE_PARALLEL_SOLVER m_threadSupportSolver = new Win32ThreadSupport(Win32ThreadSupport::Win32ThreadConstructionInfo( "solver", processSolverTask, createSolverLocalStoreMemory, maxNumOutstandingTasks)); m_solver = new btParallelSequentialImpulseSolver(m_threadSupportSolver,maxNumOutstandingTasks); #else btSequentialImpulseConstraintSolver* solver = new btSequentialImpulseConstraintSolver(); m_solver = solver;//new btOdeQuickstepConstraintSolver(); #endif //USE_PARALLEL_SOLVER #endif btDiscreteDynamicsWorld* world = new btDiscreteDynamicsWorld(m_dispatcher,m_broadphase,m_solver,m_collisionConfiguration); m_dynamicsWorld = world; ///SOLVER_RANDMIZE_ORDER makes cylinder stacking a bit more stable world->getSolverInfo().m_solverMode |= SOLVER_RANDMIZE_ORDER; #ifdef USER_DEFINED_FRICTION_MODEL //user defined friction model is not supported in 'cache friendly' solver yet, so switch to old solver world->getSolverInfo().m_solverMode = SOLVER_RANDMIZE_ORDER; #endif //USER_DEFINED_FRICTION_MODEL #ifdef DO_BENCHMARK_PYRAMIDS world->getSolverInfo().m_numIterations = 4; #endif //DO_BENCHMARK_PYRAMIDS m_dynamicsWorld->getDispatchInfo().m_enableSPU = true; m_dynamicsWorld->setGravity(btVector3(0,-10,0)); #ifdef USER_DEFINED_FRICTION_MODEL { 
//m_solver->setContactSolverFunc(ContactSolverFunc func,USER_CONTACT_SOLVER_TYPE1,DEFAULT_CONTACT_SOLVER_TYPE); solver->SetFrictionSolverFunc(myFrictionModel,USER_CONTACT_SOLVER_TYPE1,DEFAULT_CONTACT_SOLVER_TYPE); solver->SetFrictionSolverFunc(myFrictionModel,DEFAULT_CONTACT_SOLVER_TYPE,USER_CONTACT_SOLVER_TYPE1); solver->SetFrictionSolverFunc(myFrictionModel,USER_CONTACT_SOLVER_TYPE1,USER_CONTACT_SOLVER_TYPE1); //m_physicsEnvironmentPtr->setNumIterations(2); } #endif //USER_DEFINED_FRICTION_MODEL int i; btTransform tr; tr.setIdentity(); for (i=0;i<gNumObjects;i++) { if (i>0) { shapeIndex[i] = 1;//sphere } else shapeIndex[i] = 0; } if (useCompound) { btCompoundShape* compoundShape = new btCompoundShape(); btCollisionShape* oldShape = m_collisionShapes[1]; m_collisionShapes[1] = compoundShape; btVector3 sphereOffset(0,0,2); comOffset.setIdentity(); #ifdef CENTER_OF_MASS_SHIFT comOffset.setOrigin(comOffsetVec); compoundShape->addChildShape(comOffset,oldShape); #else compoundShape->addChildShape(tr,oldShape); tr.setOrigin(sphereOffset); compoundShape->addChildShape(tr,new btSphereShape(0.9)); #endif } #ifdef DO_WALL for (i=0;i<gNumObjects;i++) { btCollisionShape* shape = m_collisionShapes[shapeIndex[i]]; shape->setMargin(gCollisionMargin); bool isDyna = i>0; btTransform trans; trans.setIdentity(); if (i>0) { //stack them int colsize = 10; int row = (i*CUBE_HALF_EXTENTS*2)/(colsize*2*CUBE_HALF_EXTENTS); int row2 = row; int col = (i)%(colsize)-colsize/2; if (col>3) { col=11; row2 |=1; } btVector3 pos(col*2*CUBE_HALF_EXTENTS + (row2%2)*CUBE_HALF_EXTENTS, row*2*CUBE_HALF_EXTENTS+CUBE_HALF_EXTENTS+EXTRA_HEIGHT,0); trans.setOrigin(pos); } else { trans.setOrigin(btVector3(0,EXTRA_HEIGHT-CUBE_HALF_EXTENTS,0)); } float mass = 1.f; if (!isDyna) mass = 0.f; btRigidBody* body = localCreateRigidBody(mass,trans,shape); #ifdef USE_KINEMATIC_GROUND if (mass == 0.f) { body->setCollisionFlags( body->getCollisionFlags() | btCollisionObject::CF_KINEMATIC_OBJECT); 
body->setActivationState(DISABLE_DEACTIVATION); } #endif //USE_KINEMATIC_GROUND // Only do CCD if motion in one timestep (1.f/60.f) exceeds CUBE_HALF_EXTENTS body->setCcdMotionThreshold( CUBE_HALF_EXTENTS ); //Experimental: better estimation of CCD Time of Impact: body->setCcdSweptSphereRadius( 0.2*CUBE_HALF_EXTENTS ); #ifdef USER_DEFINED_FRICTION_MODEL ///Advanced use: override the friction solver body->m_frictionSolverType = USER_CONTACT_SOLVER_TYPE1; #endif //USER_DEFINED_FRICTION_MODEL } #endif #ifdef DO_BENCHMARK_PYRAMIDS btTransform trans; trans.setIdentity(); btScalar halfExtents = CUBE_HALF_EXTENTS; trans.setOrigin(btVector3(0,-halfExtents,0)); localCreateRigidBody(0.f,trans,m_collisionShapes[shapeIndex[0]]); int numWalls = 15; int wallHeight = 15; float wallDistance = 3; for (int i=0;i<numWalls;i++) { float zPos = (i-numWalls/2) * wallDistance; createStack(m_collisionShapes[shapeIndex[1]],halfExtents,wallHeight,zPos); } // createStack(m_collisionShapes[shapeIndex[1]],halfExtends,20,10); // createStack(m_collisionShapes[shapeIndex[1]],halfExtends,20,20); #define DESTROYER_BALL 1 #ifdef DESTROYER_BALL btTransform sphereTrans; sphereTrans.setIdentity(); sphereTrans.setOrigin(btVector3(0,2,40)); btSphereShape* ball = new btSphereShape(2.f); m_collisionShapes.push_back(ball); btRigidBody* ballBody = localCreateRigidBody(10000.f,sphereTrans,ball); ballBody->setLinearVelocity(btVector3(0,0,-10)); #endif #endif //DO_BENCHMARK_PYRAMIDS // clientResetScene(); }
float calc_integral(float start, float end, float delta) { int i; int ret; float sum = 0.0f; spe_program_handle_t *prog; spe_context_ptr_t spe[NUM_SPE]; pthread_t thread[NUM_SPE]; thread_arg_t arg[NUM_SPE]; prog = spe_image_open("integral_spe.elf"); if (!prog) { perror("spe_image_open"); exit(1); } for (i = 0; i < NUM_SPE; i++) { spe[i] = spe_context_create(0, NULL); if (!spe) { perror("spe_context_create"); exit(1); } ret = spe_program_load(spe[i], prog); if (ret) { perror("spe_program_load"); exit(1); } } for (i = 0; i < NUM_SPE; i++) { integral_params[i].start = start + (end-start)/NUM_SPE * i; integral_params[i].end = start + (end-start)/NUM_SPE * (i+1); integral_params[i].delta = delta; integral_params[i].sum = 0.0f; arg[i].spe = spe[i]; arg[i].integral_params = &integral_params[i]; ret = pthread_create(&thread[i], NULL, run_integral_spe, &arg[i]); if (ret) { perror("pthread_create"); exit(1); } } for (i = 0; i < NUM_SPE; i++) { pthread_join(thread[i], NULL); ret = spe_context_destroy(spe[i]); if (ret) { perror("spe_context_destroy"); exit(1); } } ret = spe_image_close(prog); if (ret) { perror("spe_image_close"); exit(1); } for (i = 0; i < NUM_SPE; i++) { printf("[PPE] sum = %f\n", integral_params[i].sum); sum += integral_params[i].sum; } return sum; }
void BasicDemo::initPhysics() { btDefaultCollisionConfiguration* collisionConfiguration = new btDefaultCollisionConfiguration(); #ifdef USE_PARALLEL_DISPATCHER int maxNumOutstandingTasks = 1;//number of maximum outstanding tasks #ifdef USE_WIN32_THREADING Win32ThreadSupport* threadSupport = new Win32ThreadSupport(Win32ThreadSupport::Win32ThreadConstructionInfo( "collision", processCollisionTask, createCollisionLocalStoreMemory, maxNumOutstandingTasks)); #else spe_program_handle_t * program_handle; #ifndef USE_CESOF char* spuFileName = "../../../src/BulletMultiThreaded/out/spuCollision.elf"; program_handle = spe_image_open (spuFileName); if (program_handle == NULL) { printf( "SPU OPEN IMAGE ERROR:%s\n",spuFileName); exit(0); } else { printf( "IMAGE OPENED:%s\n",spuFileName); } #else extern spe_program_handle_t spu_program; program_handle = &spu_program; #endif SpuLibspe2Support* threadSupport = new SpuLibspe2Support( program_handle, maxNumOutstandingTasks); #endif // WIN32 m_dispatcher = new SpuGatheringCollisionDispatcher(threadSupport,maxNumOutstandingTasks,collisionConfiguration); #else m_dispatcher = new btCollisionDispatcher(collisionConfiguration); #endif //USE_PARALLEL_DISPATCHER m_collisionConfiguration = new btDefaultCollisionConfiguration(); #define USE_SWEEP_AND_PRUNE 1 #ifdef USE_SWEEP_AND_PRUNE #define maxProxies 8192 btVector3 worldAabbMin(-10000,-10000,-10000); btVector3 worldAabbMax(10000,10000,10000); m_overlappingPairCache = new btAxisSweep3(worldAabbMin,worldAabbMax,maxProxies); //m_overlappingPairCache = new btMultiSapBroadphase(); #else m_overlappingPairCache = new btSimpleBroadphase; #endif //USE_SWEEP_AND_PRUNE btSequentialImpulseConstraintSolver* sol = new btSequentialImpulseConstraintSolver; m_solver = sol; m_dynamicsWorld = new btDiscreteDynamicsWorld(m_dispatcher,m_overlappingPairCache,m_solver,m_collisionConfiguration); m_dynamicsWorld->getDispatchInfo().m_enableSPU = true; m_dynamicsWorld->setGravity(btVector3(0,-10,0)); ///create a few 
basic rigid bodies //static ground #ifdef USE_GROUND_BOX btCollisionShape* groundShape = new btBoxShape(btVector3(btScalar(50.),btScalar(50.),btScalar(50.))); #else btCollisionShape* groundShape = new btSphereShape(btScalar(50.)); #endif//USE_GROUND_BOX m_collisionShapes.push_back(groundShape); btTransform groundTransform; groundTransform.setIdentity(); groundTransform.setOrigin(btVector3(0,-50,0)); localCreateRigidBody(btScalar(0.),groundTransform,groundShape); //create a few dynamic sphere rigidbodies (re-using the same sphere shape) //btCollisionShape* sphereShape = new btBoxShape(btVector3(1,1,1)); btCollisionShape* sphereShape = new btSphereShape(btScalar(1.)); m_collisionShapes.push_back(sphereShape); int i; for (i=0;i<gNumObjects;i++) { sphereShape->setMargin(gCollisionMargin); btTransform trans; trans.setIdentity(); //stack them int colsize = 2; int row = (int)((i*HALF_EXTENTS*2)/(colsize*2*HALF_EXTENTS)); int row2 = row; int col = (i)%(colsize)-colsize/2; btVector3 pos(col*2*HALF_EXTENTS + (row2%2)*HALF_EXTENTS, row*2*HALF_EXTENTS+HALF_EXTENTS,0); trans.setOrigin(pos); //btRigidBody* body = localCreateRigidBody(btScalar(1.),trans,sphereShape); localCreateRigidBody(btScalar(1.),trans,sphereShape); } //clientResetScene(); }
/*
 * Creates a program object from a "binary" that, in this implementation, is
 * the path of an SPE ELF image: the first lengths[0] bytes of binaries[0]
 * are copied, NUL-terminated and handed to spe_image_open().
 *
 * Only the first entry of lengths/binaries is used.  device_list and lengths
 * are stored by pointer (not copied), so the caller must keep them alive for
 * as long as the program object is used.
 *
 * Returns the new program, or (cl_program)0 on failure.  When errcode_ret is
 * non-NULL it receives CL_SUCCESS, CL_INVALID_CONTEXT, CL_INVALID_VALUE,
 * CL_OUT_OF_HOST_MEMORY or CL_INVALID_BINARY.  When binary_status is
 * non-NULL, binary_status[0] receives CL_SUCCESS or CL_INVALID_BINARY once
 * the image has been probed.
 */
extern cl_program clCreateProgramWithBinary (cl_context context, cl_uint num_devices, const cl_device_id *device_list, const size_t *lengths, const char **binaries, cl_int *binary_status, cl_int *errcode_ret)
{
    cl_int status = CL_SUCCESS;
    cl_program program = NULL;
    char *name = NULL;

    if (context == NULL)
    {
        status = CL_INVALID_CONTEXT;
        goto done;
    }

    if (num_devices < 1 || device_list == NULL)
    {
        status = CL_INVALID_VALUE;
        goto done;
    }

    if (lengths == NULL || binaries == NULL
        || lengths[0] == 0 || binaries[0] == NULL)
    {
        status = CL_INVALID_VALUE;
        goto done;
    }

    PRINT_DEBUG("\n====\tCreating program \t====\n");

    program = malloc(sizeof(struct _cl_program));
    if (program == NULL)
    {
        status = CL_OUT_OF_HOST_MEMORY;
        goto done;
    }

    program->program_ref_count = 1;
    program->program_context = context;
    program->program_num_devices = num_devices;
    //Should memcpy() these
    program->program_devices = device_list;
    PRINT_DEBUG("Set devices\n");
    program->program_source = NULL;
    program->program_binary_sizes = lengths;
    program->program_elfs = NULL;

    PRINT_DEBUG("Before malloc\n");
    program->program_binaries = malloc(sizeof(char *));
    if (program->program_binaries == NULL)
    {
        status = CL_OUT_OF_HOST_MEMORY;
        goto fail_program;
    }
    PRINT_DEBUG("After first malloc\n");

    name = malloc(lengths[0] + 1);
    *(program->program_binaries) = name;
    if (name == NULL)
    {
        status = CL_OUT_OF_HOST_MEMORY;
        goto fail_binaries;
    }
    PRINT_DEBUG("After second malloc\n");

    /* Bounded copy: never writes past lengths[0] bytes and never reads past
       the source terminator; the terminator is then forced explicitly. */
    strncpy(name, binaries[0], lengths[0]);
    name[lengths[0]] = '\0';
    PRINT_DEBUG("After strcpy\n");

    PRINT_DEBUG("Opening spe image %s\n", name);
    program->program_elfs = spe_image_open(name);
    if (!program->program_elfs)
    {
        PRINT_DEBUG("Could not open spe image\n");
        status = CL_INVALID_BINARY;
        if (binary_status != NULL)
            binary_status[0] = CL_INVALID_BINARY;
        goto fail_name;
    }

    if (binary_status != NULL)
        binary_status[0] = CL_SUCCESS;

    PRINT_DEBUG("\n====\tReturning program \t====\n");
    goto done;

fail_name:
    free(name);
fail_binaries:
    free(program->program_binaries);
fail_program:
    free(program);
    program = NULL;
done:
    /* errcode_ret is optional in the OpenCL API. */
    if (errcode_ret != NULL)
        *errcode_ret = status;
    return (status == CL_SUCCESS) ? program : (cl_program)0;
}
/*
 * Runs one kernel command on the Cell SPU device.
 *
 * data : device data, cast to `struct data *` (asserted non-NULL).
 * cmd  : command node; the kernel, temporary directory and work-group
 *        geometry are read from cmd->command.run.
 *
 * Steps, in the order they appear below:
 *  1. If <tmp_dir>/parallel.so does not exist, build it: link the parallel
 *     bitcode with pocl-llvm-ld into linked.bc, run LLC to produce
 *     parallel.s, then compile that with spe_wrap.c using spu-gcc.  Each
 *     external command is run through system().
 *  2. Open the module with spe_image_open() into the global hello_spu and
 *     load it into the global spe_context.
 *  3. Fill a __kernel_exec_cmd with work_dim, num_groups and global_offset
 *     and marshal the kernel arguments into dev_cmd.args: local arguments
 *     get a freshly allocated local chunk, pointer arguments the start
 *     address of device_ptrs[0] (or NULL), image and sampler arguments
 *     abort as unimplemented, everything else the value pointer itself.
 *  4. Write a __kernel_metadata record and the command into SPU memory with
 *     cellspu_memwrite() and start the SPU with spe_context_run().
 *  5. Free the local arguments and kernel locals.
 *
 * NOTE(review): the result of spe_image_open() is not checked before it is
 * passed to spe_program_load().
 * NOTE(review): build-step failures are only caught by assert(), so they
 * go undetected when assertions are compiled out -- confirm this is intended.
 * NOTE(review): host pointers (malloc result, al->value) are stored as
 * uint32_t and later cast back to pointers; this presumably relies on a
 * 32-bit host address space -- TODO confirm.
 * NOTE(review): for arg_is_local arguments the slot holds
 * local_chunk->start_address, but the clean-up loop dereferences the slot
 * as a host `void **` -- verify against pocl_cellspu_free's contract.
 * NOTE(review): bytecode, device, x, y, z, w, pc and kern_func appear to be
 * unused or only partially used in the live code.
 */
void pocl_cellspu_run (void *data, _cl_command_node* cmd) { struct data *d; int error; char bytecode[POCL_FILENAME_LENGTH]; char assembly[POCL_FILENAME_LENGTH]; char module[POCL_FILENAME_LENGTH]; char command[COMMAND_LENGTH]; char workgroup_string[WORKGROUP_STRING_LENGTH]; unsigned device; struct pocl_argument *al; size_t x, y, z; unsigned i; pocl_workgroup w; char* tmpdir = cmd->command.run.tmp_dir; cl_kernel kernel = cmd->command.run.kernel; struct pocl_context *pc = &cmd->command.run.pc; const char* kern_func = kernel->function_name; unsigned int entry = SPE_DEFAULT_ENTRY; assert (data != NULL); d = (struct data *) data; error = snprintf (module, POCL_FILENAME_LENGTH, "%s/parallel.so", tmpdir); assert (error >= 0); // This is the entry to the kenrel. We currently hard-code it // into the SPU binary. Resulting in only one entry-point per // SPU image. // TODO: figure out which function to call given what conditions snprintf (workgroup_string, WORKGROUP_STRING_LENGTH, "_%s_workgroup_fast", kernel->function_name); if ( access (module, F_OK) != 0) { char *llvm_ld; error = snprintf (bytecode, POCL_FILENAME_LENGTH, "%s/linked.bc", tmpdir); assert (error >= 0); if (getenv("POCL_BUILDING") != NULL) llvm_ld = BUILDDIR "/tools/llvm-ld/pocl-llvm-ld"; else if (access(PKGLIBEXECDIR "/pocl-llvm-ld", X_OK) == 0) llvm_ld = PKGLIBEXECDIR "/pocl-llvm-ld"; else llvm_ld = "pocl-llvm-ld"; error = snprintf (command, COMMAND_LENGTH, "%s --disable-opt -link-as-library -o %s %s/%s", llvm_ld, bytecode, tmpdir, POCL_PARALLEL_BC_FILENAME); assert (error >= 0); error = system(command); assert (error == 0); error = snprintf (assembly, POCL_FILENAME_LENGTH, "%s/parallel.s", tmpdir); assert (error >= 0); // "-relocation-model=dynamic-no-pic" is a magic string, // I do not know why it has to be there to produce valid // sos on x86_64 error = snprintf (command, COMMAND_LENGTH, LLC " " HOST_LLC_FLAGS " -o %s %s", assembly, bytecode); assert (error >= 0); error = system (command); assert (error == 
0); // Compile the assembly version of the OCL kernel with the // C wrapper to get a spulet error = snprintf (command, COMMAND_LENGTH, "spu-gcc lib/CL/devices/cellspu/spe_wrap.c -o %s %s " " -Xlinker --defsym -Xlinker _ocl_buffer=%d" " -Xlinker --defsym -Xlinker kernel_command=%d" " -I . -D_KERNEL=%s -std=c99", module, assembly, CELLSPU_OCL_BUFFERS_START, CELLSPU_KERNEL_CMD_ADDR, workgroup_string); assert (error >= 0); #ifdef DEBUG_CELLSPU_DRIVER printf("compiling: %s\n", command); fflush(stdout); #endif error = system (command); assert (error == 0); } // Load the SPU with the newly generated binary hello_spu = spe_image_open( (const char*)module ); if( spe_program_load( spe_context, hello_spu) ) perror("spe_program_load fails"); // // /* Find which device number within the context correspond // to current device. */ // for (i = 0; i < kernel->context->num_devices; ++i) // { // if (kernel->context->devices[i]->data == data) // { // device = i; // break; // } // } // // This structure gets passed to the device. // It contains all the info needed to run a kernel __kernel_exec_cmd dev_cmd; dev_cmd.work_dim = cmd->command.run.pc.work_dim; dev_cmd.num_groups[0] = cmd->command.run.pc.num_groups[0]; dev_cmd.num_groups[1] = cmd->command.run.pc.num_groups[1]; dev_cmd.num_groups[2] = cmd->command.run.pc.num_groups[2]; dev_cmd.global_offset[0] = cmd->command.run.pc.global_offset[0]; dev_cmd.global_offset[1] = cmd->command.run.pc.global_offset[1]; dev_cmd.global_offset[2] = cmd->command.run.pc.global_offset[2]; // the code below is lifted from pthreads :) uint32_t *arguments = dev_cmd.args; for (i = 0; i < kernel->num_args; ++i) { al = &(kernel->dyn_arguments[i]); if (kernel->arg_is_local[i]) { chunk_info_t* local_chunk = cellspu_malloc_local (d, al->size); if (local_chunk == NULL) POCL_ABORT ("Could not allocate memory for a local argument. 
Out of local mem?\n"); dev_cmd.args[i] = local_chunk->start_address; } else if (kernel->arg_is_pointer[i]) { /* It's legal to pass a NULL pointer to clSetKernelArguments. In that case we must pass the same NULL forward to the kernel. Otherwise, the user must have created a buffer with per device pointers stored in the cl_mem. */ if (al->value == NULL) arguments[i] = (uint32_t)NULL; else arguments[i] = \ ((chunk_info_t*)((*(cl_mem *)\ (al->value))->device_ptrs[0]))->start_address; //TODO: '0' above is the device number... don't hard-code! } else if (kernel->arg_is_image[i]) { POCL_ABORT_UNIMPLEMENTED(); // dev_image2d_t di; // cl_mem mem = *(cl_mem*)al->value; // di.data = &((*(cl_mem *) (al->value))->device_ptrs[device]); // di.data = ((*(cl_mem *) (al->value))->device_ptrs[device]); // di.width = mem->image_width; // di.height = mem->image_height; // di.rowpitch = mem->image_row_pitch; // di.order = mem->image_channel_order; // di.data_type = mem->image_channel_data_type; // void* devptr = pocl_cellspu_malloc(data, 0, sizeof(dev_image2d_t), NULL); // arguments[i] = malloc (sizeof (void *)); // *(void **)(arguments[i]) = devptr; // pocl_cellspu_write (data, &di, devptr, sizeof(dev_image2d_t)); } else if (kernel->arg_is_sampler[i]) { POCL_ABORT_UNIMPLEMENTED(); // dev_sampler_t ds; // // arguments[i] = malloc (sizeof (void *)); // *(void **)(arguments[i]) = pocl_cellspu_malloc(data, 0, sizeof(dev_sampler_t), NULL); // pocl_cellspu_write (data, &ds, *(void**)arguments[i], sizeof(dev_sampler_t)); } else { arguments[i] = (uint32_t)al->value; } } // allocate memory for kernel local variables for (i = kernel->num_args; i < kernel->num_args + kernel->num_locals; ++i) { al = &(kernel->dyn_arguments[i]); arguments[i] = (uint32_t)malloc (sizeof (void *)); *(void **)(arguments[i]) = cellspu_malloc_local(data, al->size); } // the main loop on the spe needs an auxiliary struct for to get the // number of arguments and such. 
__kernel_metadata kmd; strncpy( kmd.name, workgroup_string, sizeof( kmd.name ) ); kmd.num_args = kernel->num_args; kmd.num_locals = kernel->num_locals; // TODO: fill in the rest, if used by the spu main function. // TODO malloc_local should be given the 'device data'. as long as teh // spu context is global this is ok. void *chunk = cellspu_malloc_local( NULL, sizeof(__kernel_metadata) ); void *kernel_area = ((chunk_info_t*)chunk)->start_address; cellspu_memwrite( kernel_area, &kmd, sizeof(__kernel_metadata) ); dev_cmd.kernel = kernel_area; // finish up the command, send it to SPE dev_cmd.status =POCL_KST_READY; cellspu_memwrite( (void*)CELLSPU_KERNEL_CMD_ADDR, &dev_cmd, sizeof(__kernel_exec_cmd) ); // Execute code on SPU. This starts with the main() in the spu - see spe_wrap.c if (spe_context_run(spe_context,&entry,0,NULL,NULL,NULL) < 0) perror("context_run error"); // for (z = 0; z < pc->num_groups[2]; ++z) // { // for (y = 0; y < pc->num_groups[1]; ++y) // { // for (x = 0; x < pc->num_groups[0]; ++x) // { // pc->group_id[0] = x; // pc->group_id[1] = y; // pc->group_id[2] = z; // // w (arguments, pc); // // } // } // } // Clean-up ? for (i = 0; i < kernel->num_args; ++i) { if (kernel->arg_is_local[i]) pocl_cellspu_free(data, 0, *(void **)(arguments[i])); } for (i = kernel->num_args; i < kernel->num_args + kernel->num_locals; ++i) pocl_cellspu_free(data, 0, *(void **)(arguments[i])); }