/*
 * Allocate a context object for the Cell device.
 *
 * errcode_ret is handed to setErrCode() and receives CL_SUCCESS on success,
 * CL_DEVICE_NOT_AVAILABLE when libspe2 reports no physical Cell node, or
 * CL_OUT_OF_HOST_MEMORY when the context object cannot be allocated.
 *
 * Returns the newly allocated context, or (cl_context) 0 on failure.  The
 * returned memory is NOT initialised here and is owned by the caller.
 */
extern cl_context createCellContext (cl_int * errcode_ret)
{
    /* CBE Programmer's Guide - page 43 */
    setErrCode(errcode_ret, CL_SUCCESS);

    int node_count = spe_cpu_info_get (SPE_COUNT_PHYSICAL_CPU_NODES, -1);
    PRINT_DEBUG("Num nodes: %d\n", node_count);

    int phys_spes = spe_cpu_info_get (SPE_COUNT_PHYSICAL_SPES, -1);
    PRINT_DEBUG("Num physical spes: %d\n", phys_spes);

    int usable_spes = spe_cpu_info_get (SPE_COUNT_USABLE_SPES, -1);
    PRINT_DEBUG("Num usable spes: %d\n", usable_spes);

    /* Also covers the -1 error return of spe_cpu_info_get(). */
    if (node_count < 1) {
        setErrCode(errcode_ret, CL_DEVICE_NOT_AVAILABLE);
        return (cl_context) 0;
    }

    /* sizeof yields a size_t, so it must be printed with %zu, not %d. */
    PRINT_DEBUG("sizeof(cl_context) == %zu\n", sizeof (struct _cl_context));

    cl_context context = malloc (sizeof (struct _cl_context));
    if (context == NULL) {
        /* Report the failure instead of returning NULL with CL_SUCCESS. */
        setErrCode(errcode_ret, CL_OUT_OF_HOST_MEMORY);
        return (cl_context) 0;
    }

    return context;
}
int main() { int i, spu_threads; spe_context_ptr_t ctxs[MAX_SPU_THREADS]; pthread_t threads[MAX_SPU_THREADS]; /* * Determine the number of SPE threads to create. */ spu_threads = spe_cpu_info_get(SPE_COUNT_USABLE_SPES, -1); if (spu_threads > MAX_SPU_THREADS) spu_threads = MAX_SPU_THREADS; /* * Create several SPE-threads to execute 'simple_spu'. */ for(i=0; i<spu_threads; i++) { /* Create context */ if ((ctxs[i] = spe_context_create (0, NULL)) == NULL) { perror ("Failed creating context"); exit (1); } /* Load program into context */ if (spe_program_load (ctxs[i], &simple_spu)) { perror ("Failed loading program"); exit (1); } /* Create thread for each SPE context */ if (pthread_create (&threads[i], NULL, &ppu_pthread_function, &ctxs[i])) { perror ("Failed creating thread"); exit (1); } } /* Wait for SPU-thread to complete execution. */ for (i=0; i<spu_threads; i++) { if (pthread_join (threads[i], NULL)) { perror("Failed pthread_join"); exit (1); } /* Destroy context */ if (spe_context_destroy (ctxs[i]) != 0) { perror("Failed destroying context"); exit (1); } } printf("\nThe program has successfully executed.\n"); return 0; }
/*
 * Start the Spu threads.
 *
 * spu_threads: number of SPU threads requested; a negative value means
 *              "use every usable SPE".  Requests above the usable count are
 *              clamped and a warning is printed.
 * spu_data:    receives the final thread count (no_spu_threads), a freshly
 *              malloc'ed array of per-SPU records (spus, owned by spu_data),
 *              and the gang context (gang).
 *
 * For each SPU a context is created inside the gang, the bootloader image is
 * loaded into it, and a pthread running spu_bootstrap_thread is started.
 * Any failure is reported with perror() and terminates the process.
 */
void startSpuThreads(int spu_threads, SpuThreadData * spu_data)
{
    int i, no_spus;

    /* Determine the number of SPE threads to create */
    no_spus = spe_cpu_info_get(SPE_COUNT_USABLE_SPES, -1);
    if (no_spus < 0) {
        /* -1 is the error return; it must not be used as an array length. */
        perror("Failed to query usable Cell SPUs");
        exit(1);
    }

    if (spu_threads < 0) {
        spu_threads = no_spus;
    } else if (no_spus < spu_threads) {
        spu_threads = no_spus;
        printf("Warning: Only %i Cell SPU processors available\n", spu_threads);
    }

    spu_data->no_spu_threads = spu_threads;
    spu_data->spus = (SpuData *) malloc(sizeof(SpuData) * spu_threads);
    /* malloc(0) may legitimately return NULL, so only a non-empty request
     * counts as a failure.  Bail out instead of dereferencing NULL below. */
    if (spu_data->spus == NULL && spu_threads > 0) {
        perror("Failed to allocate SPU data for threads");
        exit(1);
    }

    printf("Bringing up %i Cell SPU threads\n", spu_threads);

    /* create the context gang */
    if ((spu_data->gang = spe_gang_context_create(0)) == NULL) {
        perror("Failed creating Cell SPU gang context");
        exit(1);
    }

    for (i = 0; i < spu_threads; i++) {
        /* Create context */
        if ((spu_data->spus[i].ctx = spe_context_create(CTX_FLAGS, spu_data->gang)) == NULL) {
            perror("Failed creating Cell SPU context");
            exit(1);
        }

        /* load bootloader into spu's */
        if (spe_program_load(spu_data->spus[i].ctx, &cellspu_bootloader)) {
            perror("Failed loading Cell SPU bootloader");
            exit(1);
        }

        /* create a thread for each SPU */
        if (pthread_create(&(spu_data->spus[i].boot_thread), NULL,
                           &spu_bootstrap_thread, &(spu_data->spus[i].ctx))) {
            perror("Failed creating Cell SPU thread");
            exit(1);
        }
    }
}
void MMGP_init(){ MMGP_pid = getpid(); NUM_SPE = spe_cpu_info_get(SPE_COUNT_PHYSICAL_SPES, -1); /* In the sampling phase use the functions * with the timing instructions, othervise use the * functions without the timing instructions * (to avoid the overhead) */ MMGP_offload = &_empty; MMGP_prediction = &_empty; MMGP_wait_SPE = &_wait_SPE; MMGP_start_SPE = &_start_SPE; MMGP_create_threads = &_create_threads; }
initDisp( unsigned int numspes ) { // Get the number of available SPEs speThreads = spe_cpu_info_get(SPE_COUNT_USABLE_SPES, -1); // Clamp to the defined number of SPEs used if ( speThreads > MAX_SPU_NUM ) { speThreads = MAX_SPU_NUM; } if( speThreads > numspes ) { speThreads = numspes; } //printf("InitDist. speThreads is: %d\n",speThreads); unsigned int i; unsigned int temp; // Get dispatcher //printf("Getting the dispatcher\n"); //spe_program_handle_t *dispatcher = spe_image_open( "/home/jens/numpycbe_dispatcher" ); spe_program_handle_t *dispatcher = spe_image_open( "./../../../../numpycbe_dispatcher" ); //printf("After getting the dispatcher\n"); // Initialize threads for( i = 0 ; i < speThreads ; i++ ) { CreateSPEThread( &speData[i], dispatcher, &spe_pointer_addr[i] ); // Sending the SPE its id //printf("spe_write MULTIARRAYMODULE Sending id to SPE %d.\n",i); spe_in_mbox_write ( speData[i].spe_ctx, &i, 1, SPE_MBOX_ALL_BLOCKING ); // Sending the SPE its seed. This should be something like time instead of id? //printf("spe_write MULTIARRAYMODULE Sending seed to SPE %d.\n",i); spe_in_mbox_write ( speData[i].spe_ctx, &i, 1, SPE_MBOX_ALL_BLOCKING ); } //printf("speData[i].spe_ctx is : %d\n",speData[i].spe_ctx); //spe_in_mbox_write ( (void*)temp, &i, 1, SPE_MBOX_ALL_BLOCKING ); return 0; }
int main() { float A[ARR_SIZE] __attribute__ ((aligned(16))); float B[ARR_SIZE] __attribute__ ((aligned(16))); float C[ARR_SIZE] __attribute__ ((aligned(16))); int i, spu_threads; pthread_t threads[MAX_SPU_THREADS]; pointers_t thread_arg[MAX_SPU_THREADS] __attribute__ ((aligned(16))); /* * Initialization */ for (i=0;i<ARR_SIZE;i++) { A[i] = i; B[i] = ARR_SIZE - i; C[i] = 0; } /* * Determine the number of SPE threads to create. */ spu_threads = spe_cpu_info_get(SPE_COUNT_USABLE_SPES, -1); if (spu_threads > MAX_SPU_THREADS) spu_threads = MAX_SPU_THREADS; /* * Create several SPE-threads to execute 'ex1_spu'. */ for(i = 0; i < spu_threads; i++) { int no_elems = ARR_SIZE / spu_threads; int dim = no_elems * sizeof(float); thread_arg[i].A = A + i*no_elems; thread_arg[i].B = B + i*no_elems; thread_arg[i].C = C + i*no_elems; thread_arg[i].dim = dim; /* Create thread for each SPE context */ if (pthread_create (&threads[i], NULL, &ppu_pthread_function, &thread_arg[i])) { perror ("Failed creating thread"); exit (1); } } /* Wait for SPU-thread to complete execution. */ for (i = 0; i < spu_threads; i++) { if (pthread_join (threads[i], NULL)) { perror("Failed pthread_join"); exit (1); } } printf("\nThe program has successfully executed.\n"); int pass = 1; for (i=0; i<ARR_SIZE;i++) if (C[i]!=ARR_SIZE) { //printf("%d %f\n",i,C[i]); pass = 0; } if (pass) printf("Result is correct.\n"); else printf("RESULT IS INCORRECT!\n"); return 0; }
int main(int argc, char** argv) { /* Iterators */ int i, j, k; uint32_t block; /* Time (seconds) */ long t_0; long t_end; long dt; long steps; long iter; /* Emission control */ bool emflag = TRUE; /* Start wall clock timer */ timer_start(TIMER_WALLCLOCK); /* Initialize parallelization */ nprocs = spe_cpu_info_get(SPE_COUNT_USABLE_SPES, -1); nprocs = nprocs > MAX_THREADS ? MAX_THREADS : nprocs; if(argc > 1) { i = atoi(argv[1]); if(i < 1) { fprintf(stderr, "Invalid number of SPUs: %d < 1.\n", i); exit(1); } if(i < nprocs) { nprocs = i; } else { printf("%d SPUs unavailable. Using %d instead.\n", i, nprocs); } } /* Create SPE threads */ for(i=0; i<nprocs; i++) { threads[i].argp = (void*)(&spe_argvs[i]); /* Create context */ if((threads[i].speid = spe_context_create(0, NULL)) == NULL) { fprintf(stderr, "Failed spe_context_create(errno=%d strerror=%s)\n", errno, strerror(errno)); exit(1); } /* Load program into context */ if(spe_program_load(threads[i].speid, &fixedgrid_spu)) { fprintf(stderr, "Failed spe_program_load(errno=%d strerror=%s)\n", errno, strerror(errno)); exit(1); } /* Create thread for each SPE context */ if(pthread_create(&threads[i].pthread, NULL, &ppu_pthread_function, &threads[i])) { fprintf(stderr, "Failed pthread_create(errno=%d strerror=%s)\n", errno, strerror(errno)); exit(1); } spe_set_status(i, SPE_STATUS_WAITING); } printf("\nRunning %d threads (%d SPU + 1 PPU).\n", (nprocs+1), nprocs); /* Allocate concentration memory */ //conc = _malloc_align(NROWS*NCOLS*sizeof(double), 7); //conc_buff = (double*)_malloc_align(MAX_THREADS*NY*sizeof(double), 7); /* Allocation wind vector filed memory */ //wind_u = _malloc_align(NROWS*NCOLS*sizeof(double), 7); //wind_v = _malloc_align(NROWS*NCOLS*sizeof(double), 7); //wind_u_buff = (double*)_malloc_align(MAX_THREADS*NY*sizeof(double), 7); //wind_v_buff = (double*)_malloc_align(MAX_THREADS*NY*sizeof(double), 7); /* Allocation diffusion tensor memory */ //diff = _malloc_align(NROWS*NCOLS*sizeof(double), 7); 
//diff_buff = (double*)_malloc_align(MAX_THREADS*NY*sizeof(double), 7); /* Initialize concentration data */ double_array_init(NROWS*NCOLS, conc, O3_INIT); /* Initialize wind field */ double_array_init(NROWS*NCOLS, wind_u, WIND_U_INIT); double_array_init(NROWS*NCOLS, wind_v, WIND_V_INIT); /* Initialize diffusion field */ double_array_init(NROWS*NCOLS, diff, DIFF_INIT); /* Initialize time */ t_0 = 0.0; t_end = year2sec(END_YEAR - START_YEAR) + day2sec(END_DOY - START_DOY) + hour2sec(END_HOUR - START_HOUR) + minute2sec(END_MIN - START_MIN); dt = STEP_SIZE; steps = (long)( (t_end - t_0)/dt ); /* Print startup banner */ print_start_banner(NX*DX, NY*DY, 0.0, t_end, steps); /* Store initial concentration */ write_conc(&(conc[0]), 0, 0); /* BEGIN CALCULATIONS */ for(iter = 1; iter <= steps; iter++) { emflag = iter*dt < 6*3600.0 ? TRUE : FALSE; timer_start(TIMER_ROW_DISCRET); /* Discretize rows 1/2 timestep */ block = NROWS / nprocs; for(i=0; i<nprocs; i++) { /* Configure SPE arguments */ spe_argvs[i].arg[0].u64 = (uint64_t)(&conc[i*block*NX]); spe_argvs[i].arg[1].u64 = (uint64_t)(&wind_u[i*block*NX]); spe_argvs[i].arg[2].u64 = (uint64_t)(&diff[i*block*NX]); spe_argvs[i].arg[3].dbl = dt/2; spe_argvs[i].arg[4].dbl = DX; spe_argvs[i].arg[5].u32[0] = NX; spe_argvs[i].arg[5].u32[1] = (i == nprocs - 1 ? 
block + NROWS % nprocs : block); //FIXME /* Signal SPE */ spe_set_status(i, SPE_STATUS_WORKING); } /* Wait for SPEs to finish */ wait_all_spes(); timer_stop(TIMER_ROW_DISCRET); timer_start(TIMER_COL_DISCRET); /* Discretize colums 1 timestep */ for(i=0; i<NCOLS; i++) { k = i % nprocs; while(spe_get_status(k) > 0) ; //intentional wait if(i >= nprocs) { timer_start(TIMER_ARRAY_COPY); for(j=0; j<NY; j++) { conc[i-nprocs + j*NX] = ccol[k*NY+j]; } timer_stop(TIMER_ARRAY_COPY); } timer_start(TIMER_ARRAY_COPY); for(j=0; j<NY; j++) { ccol[k*NY + j] = conc[i + j*NX]; wcol[k*NY + j] = wind_v[i + j*NX]; dcol[k*NY + j] = diff[i + j*NX]; } timer_stop(TIMER_ARRAY_COPY); // Configure SPE arguments spe_argvs[k].arg[0].u64 = (uint64_t)(&ccol[k*NY]); spe_argvs[k].arg[1].u64 = (uint64_t)(&wcol[k*NY]); spe_argvs[k].arg[2].u64 = (uint64_t)(&dcol[k*NY]); spe_argvs[k].arg[3].dbl = dt; spe_argvs[k].arg[4].dbl = DY; spe_argvs[k].arg[5].u32[0] = NY; spe_argvs[k].arg[5].u32[1] = 1; // Signal SPE spe_set_status(k, SPE_STATUS_WORKING); } /* Wait for SPEs to finish */ wait_all_spes(); timer_stop(TIMER_COL_DISCRET); timer_start(TIMER_ROW_DISCRET); /* Discretize rows 1/2 timestep */ block = NROWS / nprocs; for(i=0; i<nprocs; i++) { /* Configure SPE arguments */ spe_argvs[i].arg[0].u64 = (uint64_t)(&conc[i*block*NX]); spe_argvs[i].arg[1].u64 = (uint64_t)(&wind_u[i*block*NX]); spe_argvs[i].arg[2].u64 = (uint64_t)(&diff[i*block*NX]); spe_argvs[i].arg[3].dbl = dt/2; spe_argvs[i].arg[4].dbl = DX; spe_argvs[i].arg[5].u32[0] = NX; spe_argvs[i].arg[5].u32[1] = (i == nprocs - 1 ? block + NROWS % nprocs : block); //FIXME /* Signal SPE */ spe_set_status(i, SPE_STATUS_WORKING); } /* Wait for SPEs to finish */ wait_all_spes(); timer_stop(TIMER_ROW_DISCRET); /* * Could update wind field here... */ /* * Could update diffusion tensor here... 
*/ /* Add emissions */ if(emflag) { conc[SOURCE_Y*NX + SOURCE_X] += dt * (SOURCE_RATE) / (DX * DY * 1000.0); } /* Store concentration */ #ifdef WRITE_EACH_ITER write_conc(conc, iter, 0); #endif /* Indicate progress */ if(iter % 10 == 0) { printf("Iteration %ld of %ld. Time = %ld seconds.\n", iter, steps, iter*dt); } } /* END CALCULATIONS */ /* Wait for SPU-thread to complete execution. */ for(i=0; i<nprocs; i++) { spe_set_status(i, SPE_STATUS_STOPPED); if(pthread_join(threads[i].pthread, NULL)) { perror("Failed pthread_join"); exit(1); } } /* Store concentration */ write_conc(conc, iter-1, 0); /* Show final time */ printf("Final time: %ld seconds.\n", (iter-1)*dt); timer_stop(TIMER_WALLCLOCK); print_timer_summary("===PPU Timers==="); /* Cleanup and exit */ return 0; }
int main(int argc, char **argv) { int i, retval, spus; /* Determine number of available SPUs */ spus = spe_cpu_info_get(SPE_COUNT_USABLE_SPES, 0); if (argc != 2) { printf("Usage: 'ppu_threads <1-%u>'\n", spus); exit(1); } else if ((atoi(argv[1]) < 1) || (atoi(argv[1]) > spus)) { printf("Usage: 'ppu_threads <1-%u>'\n", spus); exit(1); } else { spus = atoi(argv[1]); } /* Create a context and thread for each SPU */ for (i=0; i<spus; i++) { /* Create context */ if ((data[i].speid = spe_context_create(0, NULL)) == NULL) { perror("spe_context_create"); exit(1); } /* Load program into the context */ if ((retval = spe_program_load(data[i].speid, &spu_threads)) != 0) { perror("spe_program_load"); exit (1); } /* Initialize control block and thread data */ control_block = i; data[i].argp = (void*)control_block; /* Create thread */ if ((retval = pthread_create( &data[i].pthread, NULL, &ppu_pthread_function, &data[i])) != 0) { perror("pthread_create"); exit (1); } } /* Wait for the threads to finish processing */ for (i = 0; i < spus; i++) { if ((retval = pthread_join(data[i].pthread, NULL)) != 0) { perror("pthread_join"); exit (1); } if ((retval = spe_context_destroy (data[i].speid)) != 0) { perror("spe_context_destroy"); exit (1); } } return 0; }
struct pipe_context * cell_create_context(struct pipe_screen *screen, void *priv ) { struct cell_context *cell; uint i; /* some fields need to be 16-byte aligned, so align the whole object */ cell = (struct cell_context*) align_malloc(sizeof(struct cell_context), 16); if (!cell) return NULL; memset(cell, 0, sizeof(*cell)); cell->winsys = NULL; /* XXX: fixme - get this from screen? */ cell->pipe.winsys = NULL; cell->pipe.screen = screen; cell->pipe.priv = priv; cell->pipe.destroy = cell_destroy_context; cell->pipe.clear = cell_clear; cell->pipe.flush = cell_flush; #if 0 cell->pipe.begin_query = cell_begin_query; cell->pipe.end_query = cell_end_query; cell->pipe.wait_query = cell_wait_query; #endif cell_init_draw_functions(cell); cell_init_state_functions(cell); cell_init_shader_functions(cell); cell_init_surface_functions(cell); cell_init_vertex_functions(cell); cell_init_texture_transfer_funcs(cell); cell->draw = cell_draw_create(cell); /* Create cache of fragment ops generated code */ cell->fragment_ops_cache = util_new_keymap(sizeof(struct cell_fragment_ops_key), ~0, NULL); cell_init_vbuf(cell); draw_set_rasterize_stage(cell->draw, cell->vbuf); /* convert all points/lines to tris for the time being */ draw_wide_point_threshold(cell->draw, 0.0); draw_wide_line_threshold(cell->draw, 0.0); /* get env vars or read config file to get debug flags */ cell->debug_flags = debug_get_flags_option("CELL_DEBUG", cell_debug_flags, 0 ); for (i = 0; i < CELL_NUM_BUFFERS; i++) cell_fence_init(&cell->fenced_buffers[i].fence); /* * SPU stuff */ /* This call only works with SDK 3.0. Anyone still using 2.1??? 
*/ cell->num_cells = spe_cpu_info_get(SPE_COUNT_PHYSICAL_CPU_NODES, -1); cell->num_spus = spe_cpu_info_get(SPE_COUNT_USABLE_SPES, -1); if (cell->debug_flags) { printf("Cell: found %d Cell(s) with %u SPUs\n", cell->num_cells, cell->num_spus); } if (getenv("CELL_NUM_SPUS")) { cell->num_spus = atoi(getenv("CELL_NUM_SPUS")); assert(cell->num_spus > 0); } cell_start_spus(cell); cell_init_batch_buffers(cell); /* make sure SPU initializations are done before proceeding */ cell_flush_int(cell, CELL_FLUSH_WAIT); return &cell->pipe; }
//SpuLibspe2Support helps to initialize/shutdown libspe2, start/stop SPU tasks and communication ///Setup and initialize SPU/CELL/Libspe2 SpuLibspe2Support::SpuLibspe2Support(spe_program_handle_t *speprog, int numThreads) { this->program = speprog; this->numThreads = ((numThreads <= spe_cpu_info_get(SPE_COUNT_PHYSICAL_SPES, -1)) ? numThreads : spe_cpu_info_get(SPE_COUNT_PHYSICAL_SPES, -1)); }
int main(int argc, char **argv) { int i, n, retval, nspus; char temp[256]; struct dirent **spu_files; FILE *fh; unsigned int one = 1; nspus = spe_cpu_info_get(SPE_COUNT_USABLE_SPES, 0); printf(" [PPU]: # usable synergistic processing units = %d\n", nspus); for (i = 0; i < nspus; ++i) { if (NULL == (data[i].id = spe_context_create(0, NULL))) { perror("spe_context_create"); exit(128); } retval = spe_program_load(data[i].id, &spu_program); if (unlikely(retval)) { perror("spe_program_load"); exit(128); } data[i].argp = (void *)(ull )i; retval = pthread_create(&data[i].pthread, NULL, ppu_pthread_function, &data[i]); if (unlikely(retval)) { perror("pthread_create"); exit(128); } } n = scandir("/spu", &spu_files, NULL, alphasort); while (n--) { if (!strncmp(spu_files[n]->d_name, "spethread", 9)) { snprintf(temp, sizeof(temp), "/spu/%s/phys-id", spu_files[n]->d_name); if (NULL == (fh = fopen(temp, "r"))) { perror("fopen"); exit(128); } fgets(temp, 128, fh); fclose(fh); printf(" [PPU]: context = %s: physical id = %s", spu_files[n]->d_name, temp); } } free(spu_files); for (i = 0; i < nspus; ++i) { retval = spe_in_mbox_write(data[i].id, &one, 1, SPE_MBOX_ALL_BLOCKING); if (unlikely(1 != retval)) { perror("spe_in_mbox_write"); exit(128); } retval = pthread_join(data[i].pthread, NULL); if (unlikely(retval)) { perror("pthread_join"); exit(128); } retval = spe_context_destroy(data[i].id); if (unlikely(retval)) { perror("spe_context_destroy"); exit(128); } } return 0; }
/*
 * N-body simulation driver (PPU side).  The code below is unchanged; only
 * this header was added.
 *
 * Steps, in the order the code performs them:
 *  1. Seeds rand() with time(NULL) and, for each of the PARTICLES_MAXCOUNT
 *     entries of particle_Array_PPU, sets position[0..2] to
 *     rand() % GRID_SIZE - GRID_SIZE/2 and velocity[3] (used as the mass
 *     slot) to PARTICLES_DEFAULTMASS.
 *  2. Overwrites particles 0..6 with fixed values: zeroVector/earthMass,
 *     iss*, sat1..sat4 (satMass) and moon*.  Every particle is printed.
 *  3. Copies the whole array into spe1_Data .. spe6_Data.
 *  4. Per particle, derives a value from the comparison of position with
 *     zeroVector (vec_cmpgt / vec_andc / vec_sl / vec_or), stores it in
 *     position[3] and increments octantCount[value]; per the original
 *     comments this is the octant id, 0-7.
 *  5. For ITERATION_COUNT iterations: creates six pthreads running
 *     spe_code_launch_1 .. spe_code_launch_6, joins all of them, copies
 *     slice N of speN_Data back into particle_Array_PPU (slice bounds are
 *     multiples of PARTICLES_MAXCOUNT/SPU_COUNT; the sixth slice runs to
 *     PARTICLES_MAXCOUNT), re-copies the merged array into all six SPE
 *     arrays and records it in fullSimilationData[iterCount].
 *  6. Prints the final particles, resets and recomputes the octant counts,
 *     prints the elapsed time (computed in milliseconds from
 *     gettimeofday) and writes every recorded position to "fileLog1.txt"
 *     as "x,y,z|" per particle, one line per iteration.
 *
 * Returns 0.
 *
 * NOTE(review): 'i', 'speNumber' and 'octantCount' are not declared in this
 *   function — presumably file-scope; verify.  Whether the launch functions
 *   read speNumber is not visible here, so the statement order around the
 *   thread creation should be kept as is.
 * NOTE(review): 'retval' is assigned by every pthread_create/pthread_join
 *   call but never checked, and 'speCount' is never used.
 * NOTE(review): the fopen() result is not checked before fprintf/fclose.
 * NOTE(review): the empty 'else { }' pairs only with 'if(pC == 6)'.
 */
int main(int argc, char **argv) { // setup, assign particles initla positions and masses // this is done in scalar fashion, NOT SIMD // insignificant to performance since it's only done once //time_t startTime = time(NULL); //seed random generator srand( time(NULL) ); printf("\n\n\n~~~~~~~~Printing out particles and their randomly assigned positions: \n\n"); int pC = 0; for(pC = 0; pC < PARTICLES_MAXCOUNT; ++pC) { int grideSize = GRID_SIZE; // printf("\n grideSize/2: %d", grideSize/2); float xPos = (float)( rand() % grideSize - grideSize/2); float yPos = (float)( rand() % grideSize - grideSize/2); float zPos = (float)( rand() % grideSize - grideSize/2); particle_Array_PPU[pC].position[0] = xPos; particle_Array_PPU[pC].position[1] = yPos; particle_Array_PPU[pC].position[2] = zPos; particle_Array_PPU[pC].velocity[3] = PARTICLES_DEFAULTMASS; if(pC == 0) { // center, high mass particle_Array_PPU[pC].position = zeroVector; particle_Array_PPU[pC].velocity = zeroVector; //initialVelocityVector_Y_minus; printf("Earth mass: %f\n", earthMass ); particle_Array_PPU[pC].velocity[3] = earthMass; // PARTICLES_DEFAULTMASS * 500.0f; } if(pC == 1) { particle_Array_PPU[pC].position = issPosition; //initPositionVector; particle_Array_PPU[pC].velocity = issVelocity; //initialVelocityVector_Y; particle_Array_PPU[pC].velocity[3] = issMass; //PARTICLES_DEFAULTMASS * 500.0f; } if(pC == 2) { particle_Array_PPU[pC].position = sat1Position; //initPositionVector; particle_Array_PPU[pC].velocity = sat1Velocity; //initialVelocityVector_Y; particle_Array_PPU[pC].velocity[3] = satMass; } if(pC == 3) { particle_Array_PPU[pC].position = sat2Position; //initPositionVector; particle_Array_PPU[pC].velocity = sat2Velocity; //initialVelocityVector_Y; particle_Array_PPU[pC].velocity[3] = satMass; } if(pC == 4) { particle_Array_PPU[pC].position = sat3Position; //initPositionVector; particle_Array_PPU[pC].velocity = sat3Velocity; //initialVelocityVector_Y; particle_Array_PPU[pC].velocity[3] = satMass; } 
if(pC == 5) { particle_Array_PPU[pC].position = sat4Position; //initPositionVector; particle_Array_PPU[pC].velocity = sat4Velocity; //initialVelocityVector_Y; particle_Array_PPU[pC].velocity[3] = satMass; } if(pC == 6) { particle_Array_PPU[pC].position = moonPosition; //initPositionVector; particle_Array_PPU[pC].velocity = moonVelocity; //initialVelocityVector_Y; particle_Array_PPU[pC].velocity[3] = moonMass; } else { } //particle_Array_PPU[pC].position = vec_splat(particle_Array_PPU[pC].position, 1); //particle_Array_PPU[pC].position = vec_splats((float)GRAVITATIONALCONSTANT); --> use splats, seems faster printf("Particle %d: ", pC ); printf("x= %f, y=%f, z=%f , mass:%f", particle_Array_PPU[pC].position[0], particle_Array_PPU[pC].position[1], particle_Array_PPU[pC].position[2], particle_Array_PPU[pC].velocity[3]); printf("\n"); } // copy arrays into spe ones pC = 0; for(pC = 0; pC < PARTICLES_MAXCOUNT; ++pC) { spe1_Data[pC] = particle_Array_PPU[pC]; spe2_Data[pC] = particle_Array_PPU[pC]; spe3_Data[pC] = particle_Array_PPU[pC]; spe4_Data[pC] = particle_Array_PPU[pC]; spe5_Data[pC] = particle_Array_PPU[pC]; spe6_Data[pC] = particle_Array_PPU[pC]; } for(i = 0; i<PARTICLES_MAXCOUNT; ++i) { /////// INSERT QUADRANT CODE HERE , actually octant --> 8 equal sub cubes // compare with zero vector to get on which side of each axis the particle is // 0 is negative, 1 is positive side of the axis __vector bool int axisDirection = vec_cmpgt(particle_Array_PPU[i].position, zeroVector); // need to manually set, can't cast due to size difference error __vector unsigned int shiftedAxis = { (unsigned int)axisDirection[0], (unsigned int)axisDirection[1], (unsigned int)axisDirection[2], 0}; // need to do this to revert 1s into NON 2s complement form --> vec_cmgt doc LIES shiftedAxis = vec_andc(oneVector, shiftedAxis); /* printf("Particle %d axis sign: ", i ); printf("x= %x, y=%x, z=%x", shiftedAxis[0], shiftedAxis[1], shiftedAxis[2]); printf("\n"); */ // shift 3 axies simultaneously 
(actually only 2, 1 stays in origina positon //, with intent to OR them later shiftedAxis = vec_sl(shiftedAxis, axisBitShiftMask); // will also use as x vector __vector unsigned int axis_Y = vec_splats(shiftedAxis[1]); __vector unsigned int axis_Z = vec_splats(shiftedAxis[2]); // merge shhifted x y z values by OR-ing // this gives the octant id, range from 0-7 (000 to 111 in binary) shiftedAxis = vec_or(shiftedAxis, axis_Y); shiftedAxis = vec_or(shiftedAxis, axis_Z); // insert octant value into last slot of position vector of particle particle_Array_PPU[i].position[3] = (float)shiftedAxis[0]; //printf("Oct ID: %d \n", shiftedAxis[0]); /////// Update octant vector by incrementing octant that the particle is in // The only possible non SIMD line in the entire program, //irreleant since quadrant counting should occur on PPU anyways octantCount[shiftedAxis[0]] ++ ; } i=0; printf("\n"); printf("Particle disttribution across the octants: \n"); printf("O0: %d O1: %d O2: %d O3: %d O4: %d O5: %d O6: %d O7: %d\n", octantCount[0], octantCount[1], octantCount[2], octantCount[3], octantCount[4], octantCount[5], octantCount[6], octantCount[7]); printf("\n"); int speCount = spe_cpu_info_get(SPE_COUNT_PHYSICAL_SPES,-1); /* printf("\n"); printf("%d", speCount); printf("\n"); printf("\n"); printf("--------------\n"); printf("Starting spe1 part\n"); */ /* // wait for user input, gives time to start graphics printf("Press Enter to continue\n"); getchar(); */ struct timeval start; gettimeofday(&start,NULL); int iterCount = 0; for (iterCount = 0; iterCount< ITERATION_COUNT; iterCount++) { //printf("++++++++++++++ START of ITERATION # %d of %d +++++++++++++++\n", i, ITERATION_COUNT ); int retval; pthread_t spe1_Thread; pthread_t spe2_Thread; pthread_t spe3_Thread; pthread_t spe4_Thread; pthread_t spe5_Thread; pthread_t spe6_Thread; //speData = spe1_Data; speNumber = 0; /* Create Thread */ // printf("spe1_Data value: %d\n", (int)spe1_Data ); retval = pthread_create(&spe1_Thread, // Thread 
object NULL, // Thread attributes spe_code_launch_1, // Thread function NULL // Thread argument ); // printf("spe2_Data value: %d\n", (int)spe2_Data ); retval = pthread_create(&spe2_Thread, // Thread object NULL, // Thread attributes spe_code_launch_2, // Thread function NULL // Thread argument ); retval = pthread_create(&spe3_Thread, // Thread object NULL, // Thread attributes spe_code_launch_3, // Thread function NULL // Thread argument ); retval = pthread_create(&spe4_Thread, // Thread object NULL, // Thread attributes spe_code_launch_4, // Thread function NULL // Thread argument ); retval = pthread_create(&spe5_Thread, // Thread object NULL, // Thread attributes spe_code_launch_5, // Thread function NULL // Thread argument ); retval = pthread_create(&spe6_Thread, // Thread object NULL, // Thread attributes spe_code_launch_6, // Thread function NULL // Thread argument ); //Wait for Thread Completion retval = pthread_join(spe1_Thread, NULL); retval = pthread_join(spe2_Thread, NULL); retval = pthread_join(spe3_Thread, NULL); retval = pthread_join(spe4_Thread, NULL); retval = pthread_join(spe5_Thread, NULL); retval = pthread_join(spe6_Thread, NULL); speNumber = 1; for(i=(speNumber-1)*PARTICLES_MAXCOUNT/SPU_COUNT; i<speNumber*PARTICLES_MAXCOUNT/SPU_COUNT; ++i) { particle_Array_PPU[i] = spe1_Data[i]; } speNumber = 2; for(i=(speNumber-1)*PARTICLES_MAXCOUNT/SPU_COUNT; i<speNumber*PARTICLES_MAXCOUNT/SPU_COUNT; ++i) { particle_Array_PPU[i] = spe2_Data[i]; } speNumber = 3; for(i=(speNumber-1)*PARTICLES_MAXCOUNT/SPU_COUNT; i<speNumber*PARTICLES_MAXCOUNT/SPU_COUNT; ++i) { particle_Array_PPU[i] = spe3_Data[i]; } speNumber = 4; for(i=(speNumber-1)*PARTICLES_MAXCOUNT/SPU_COUNT; i<speNumber*PARTICLES_MAXCOUNT/SPU_COUNT; ++i) { particle_Array_PPU[i] = spe4_Data[i]; } speNumber = 5; for(i=(speNumber-1)*PARTICLES_MAXCOUNT/SPU_COUNT; i<speNumber*PARTICLES_MAXCOUNT/SPU_COUNT; ++i) { particle_Array_PPU[i] = spe5_Data[i]; } speNumber = 6; 
for(i=(speNumber-1)*PARTICLES_MAXCOUNT/SPU_COUNT; i<PARTICLES_MAXCOUNT; ++i) { particle_Array_PPU[i] = spe6_Data[i]; } // reset spe counter speNumber = 0; // copy arrays into spe ones pC = 0; for(pC = 0; pC < PARTICLES_MAXCOUNT; ++pC) { spe1_Data[pC] = particle_Array_PPU[pC]; spe2_Data[pC] = particle_Array_PPU[pC]; spe3_Data[pC] = particle_Array_PPU[pC]; spe4_Data[pC] = particle_Array_PPU[pC]; spe5_Data[pC] = particle_Array_PPU[pC]; spe6_Data[pC] = particle_Array_PPU[pC]; // update values for shared array (graphics) /* particle_Array_Shared[pC].position[0] = particle_Array_PPU[pC].position[0]; particle_Array_Shared[pC].position[1] = particle_Array_PPU[pC].position[1]; particle_Array_Shared[pC].position[2] = particle_Array_PPU[pC].position[2]; particle_Array_Shared[pC].position[3] = particle_Array_PPU[pC].position[3]; */ /* printf("Particle %d positions: ", pC ); printf("x= %f, y=%f, z=%f , mass:%f", particle_Array_PPU[pC].position[0], particle_Array_PPU[pC].position[1], particle_Array_PPU[pC].position[2], particle_Array_PPU[pC].velocity[3]); printf("\n"); */ fullSimilationData[iterCount].particleArray[pC]= particle_Array_PPU[pC]; } // printf("++++++++++++++ END of ITERATION # %d of %d +++++++++++++++\n", iterCount, ITERATION_COUNT ); } struct timeval end; gettimeofday(&end,NULL); float deltaTime = ((end.tv_sec - start.tv_sec)*1000.0f + (end.tv_usec -start.tv_usec)/1000.0f); printf("print out values from post spe calculations\n"); i = 0; for(i = 0; i<PARTICLES_MAXCOUNT; ++i) { printf("Particle %d positions: ", i ); printf("x= %f, y=%f, z=%f , mass:%f", particle_Array_PPU[i].position[0], particle_Array_PPU[i].position[1], particle_Array_PPU[i].position[2], particle_Array_PPU[i].velocity[3]); printf("\n"); } //cleaining the array octantCount = resetOctantCount; for(i = 0; i<PARTICLES_MAXCOUNT; ++i) { /////// INSERT QUADRANT CODE HERE , actually octant --> 8 equal sub cubes // compare with zero vector to get on which side of each axis the particle is // 0 is negative, 
1 is positive side of the axis __vector bool int axisDirection = vec_cmpgt(particle_Array_PPU[i].position, zeroVector); // need to manually set, can't cast due to size difference error __vector unsigned int shiftedAxis = { (unsigned int)axisDirection[0], (unsigned int)axisDirection[1], (unsigned int)axisDirection[2], 0}; // need to do this to revert 1s into NON 2s complement form --> vec_cmgt doc LIES shiftedAxis = vec_andc(oneVector, shiftedAxis); /* printf("Particle %d axis sign: ", i ); printf("x= %x, y=%x, z=%x", shiftedAxis[0], shiftedAxis[1], shiftedAxis[2]); printf("\n"); */ // shift 3 axies simultaneously (actually only 2, 1 stays in origina positon //, with intent to OR them later shiftedAxis = vec_sl(shiftedAxis, axisBitShiftMask); // will also use as x vector __vector unsigned int axis_Y = vec_splats(shiftedAxis[1]); __vector unsigned int axis_Z = vec_splats(shiftedAxis[2]); // merge shhifted x y z values by OR-ing // this gives the octant id, range from 0-7 (000 to 111 in binary) shiftedAxis = vec_or(shiftedAxis, axis_Y); shiftedAxis = vec_or(shiftedAxis, axis_Z); // insert octant value into last slot of position vector of particle particle_Array_PPU[i].position[3] = (float)shiftedAxis[0]; //printf("Oct ID: %d \n", shiftedAxis[0]); /////// Update octant vector by incrementing octant that the particle is in // The only possible non SIMD line in the entire program, //irreleant since quadrant counting should occur on PPU anyways octantCount[shiftedAxis[0]] ++ ; } i=0; printf("\n"); printf("Particle disttribution across the octants: \n"); printf("O0: %d O1: %d O2: %d O3: %d O4: %d O5: %d O6: %d O7: %d\n", octantCount[0], octantCount[1], octantCount[2], octantCount[3], octantCount[4], octantCount[5], octantCount[6], octantCount[7]); printf("\n"); /* time_t endTime = time(NULL); int deltaTime = endTime - startTime; */ // need to look into http://www.xmlsoft.org/ printf("Execution time: %f\n",deltaTime); FILE *filePointer; filePointer = 
fopen("fileLog1.txt","w"); //fprintf(filePointer, "<SimulationData>\n"); iterCount = 0; for (iterCount = 0; iterCount< ITERATION_COUNT; iterCount++) { //printf("Iteration: %d\n", iterCount); //fprintf(filePointer,"<Iter>\n"); fprintf(filePointer,"\n"); pC = 0; for(pC = 0; pC < PARTICLES_MAXCOUNT; ++pC) { //printf("Particle %d positions: ", pC ); // fprintf(filePointer,"<Obj>\n"); //printf("x= %f, y=%f, z=%f", fullSimilationData[iterCount].particleArray[pC].position[0], fullSimilationData[iterCount].particleArray[pC].position[1], fullSimilationData[iterCount].particleArray[pC].position[2]); //printf("\n"); /* fprintf(filePointer,"<PX>%f</PX>\n",fullSimilationData[iterCount].particleArray[pC].position[0]); fprintf(filePointer,"<PY>%f</PY>\n",fullSimilationData[iterCount].particleArray[pC].position[1]); fprintf(filePointer,"<PZ>%f</PZ>\n",fullSimilationData[iterCount].particleArray[pC].position[2]); */ fprintf(filePointer,"%f,",fullSimilationData[iterCount].particleArray[pC].position[0]); fprintf(filePointer,"%f,",fullSimilationData[iterCount].particleArray[pC].position[1]); fprintf(filePointer,"%f",fullSimilationData[iterCount].particleArray[pC].position[2]); fprintf(filePointer,"|"); //fprintf(filePointer,"</Obj>\n"); //fullSimilationData[fullDataCounter].particleArray[pC]= particle_Array_PPU[pC]; } //fprintf(filePointer,"</Iter>\n"); } //fprintf(filePointer, "</SimulationData>\n"); fclose(filePointer); return 0; }
/**
 * Returns the number of SPEs usable by this process.
 *
 * Queries libspe2 via spe_cpu_info_get(SPE_COUNT_USABLE_SPES, -1). That call
 * returns a signed int and reports failure as a negative value; converting it
 * straight to unsigned would turn the error into a huge bogus count, so a
 * failed query is reported as 0 usable SPEs instead.
 *
 * @return The usable SPE count, or 0 if the query failed.
 */
unsigned int GetNumSPEs()
{
    const int usable = spe_cpu_info_get( SPE_COUNT_USABLE_SPES, -1 );

    if ( usable < 0 )
    {
        return 0u;
    }
    return (unsigned int) usable;
}
/** * @brief Classifies a set of test points using a set of training points. * * @param k The number of k nearest neighbours. * @param test_points The set of test points. * @param training_points The set of training points. * * @return An array of calculated labels for the set of test points. * The element at the first position represents the calculated * label of the first test points. */ unsigned char *classify(int k, Points<unsigned char, unsigned char> &test_points, Points<unsigned char, unsigned char> &training_points) { time_t start_time, end_time; time(&start_time); cb.k = k; cb.values_size = training_points.getVSize(); cb.label_size = training_points.getLSize(); cb.training_dimension = training_points.getDimension(); cb.training_count = training_points.getCount(); cb.training_data_size = training_points.getCount() * training_points.getVSize(); cb.training_points_per_transfer = TRAINING_VALUES_MAX_SIZE / training_points.getVSize(); cb.test_dimension = test_points.getDimension(); cb.test_count = test_points.getCount(); cb.test_data_size = test_points.getCount() * test_points.getVSize(); cb.test_points_per_transfer = TEST_VALUES_MAX_SIZE / test_points.getVSize(); cb.ea_training_points = (uint64_t) training_points.getValues(0); cb.ea_training_labels = (uint64_t) training_points.getLabel(0); cb.ea_test_points = (uint64_t) test_points.getValues(0); cb.ea_test_labels = (uint64_t) test_points.getLabel(0); Points<unsigned char, unsigned char> test_points_results(test_points.getCount(), test_points.getDimension()); cb.ea_test_labels_calculated = (uint64_t) ((char *) test_points_results.getLabel(0)); cb.num_spes = spe_cpu_info_get(SPE_COUNT_USABLE_SPES, -1); if (cb.num_spes > MAX_NUM_SPES) { cb.num_spes = MAX_NUM_SPES; } #ifdef PRINT printf("PPE:\t Num spes = %d\n", cb.num_spes); #endif uint32_t num; printf("PPE:\t Start calculating\n"); fflush(stdout); // create SPE context and load SPE program into the SPE context for (num=0; num<cb.num_spes; num++) { if 
((data[num].spe_ctx = spe_context_create(SPE_MAP_PS |SPE_CFG_SIGNOTIFY1_OR|SPE_CFG_SIGNOTIFY2_OR, NULL))==NULL) { perror("Failed creating context"); exit(1); } if (spe_program_load(data[num].spe_ctx, &cellknn_spu)) { perror("Failed loading program"); exit(1); } } // create SPE pthreads for (num=0; num<cb.num_spes; num++) { if (pthread_create(&data[num].pthread, NULL, &spu_pthread, &data[num])) { perror("Failed creating thread"); exit(1); } } // map SPE's MFC problem state to main storage (get effective address) for (num=0; num<cb.num_spes; num++) { if ((cb.spu_mfc_ctl[num] = (uint64_t)spe_ps_area_get(data[num].spe_ctx, SPE_CONTROL_AREA))==0) { perror("Failed mapping MFC control area"); exit(1); } if ((cb.spu_ls[num] = (uint64_t)spe_ls_area_get(data[num].spe_ctx))==0) { perror("Failed mapping SPU local store"); exit(1); } if ((cb.spu_sig1[num] = (uint64_t)spe_ps_area_get(data[num].spe_ctx, SPE_SIG_NOTIFY_1_AREA))==0) { perror("Failed mapping Signal1 area"); exit(1); } if ((cb.spu_sig2[num] = (uint64_t)spe_ps_area_get(data[num].spe_ctx, SPE_SIG_NOTIFY_2_AREA))==0) { perror("Failed mapping Signal2 area"); exit(1); } } // send each SPE its number using BLOCKING mailbox write for (num=0; num<cb.num_spes; num++) { // write 1 entry to in_mailbox - we don't know if we have availalbe space so use blocking // cb parameter have to be loaded after receiving local id!!! 
spe_in_mbox_write(data[num].spe_ctx, (uint32_t*)&num, 1, SPE_MBOX_ALL_BLOCKING); } // wait for all SPEs to complete for (num=0; num<cb.num_spes; num++) { // wait for all the SPE pthread to complete if (pthread_join(data[num].pthread, NULL)) { perror("Failed joining thread"); exit(1); } // destroy the SPE contexts if (spe_context_destroy(data[num].spe_ctx)) { perror("Failed spe_context_destroy"); exit(1); } } time(&end_time); double difference = difftime(end_time, start_time); printf("It took %.2lf seconds to calculate %d test points and %d training points\n", difference, cb.test_count, cb.training_count); // We have to create a new array, since the Points object is destroyed after this block. // This array has to be freed somewhere outside this function. unsigned char *result = (unsigned char *) malloc(test_points.getCount() * sizeof(unsigned char)); for (int i = 0; i < test_points.getCount(); i++) { result[i] = test_points_results.getLabel(i)[0]; } return result; }
/** * PPU program entry point. */ int main(int argc, char** argv) { /* Get global memory pointer */ fixedgrid_t* const G = &G_GLOBAL; /* Iterators */ uint32_t i, k, iter; /* Start wall clock timer */ timer_start(&G->ppe_metrics.wallclock); /* Calculate available SPEs */ G->nprocs = spe_cpu_info_get(SPE_COUNT_USABLE_SPES, -1); G->nprocs = G->nprocs > SPE_MAX_THREADS ? SPE_MAX_THREADS : G->nprocs; /* Parse command line arguments */ if(argc > 1) { i = atoi(argv[1]); if(i < 1) { fprintf(stderr, "Invalid number of SPUs: %d < 1.\n", i); exit(1); } if(i > G->nprocs) { printf("%d SPUs unavailable. Using %d instead.\n", i, G->nprocs); } else { G->nprocs = i; } } /* Check dimensions */ if(NROWS < 5) { fprintf(stderr, "%d rows < 5 rows is too small for discretization.\n", NROWS); } if(NCOLS < 5) { fprintf(stderr, "%d columns < 5 columns is too small for discretization.\n", NCOLS); } /* Don't use more SPEs than there are rows or columns */ if(NROWS < G->nprocs) { printf("%d SPUs available, but only %d rows, so using %d SPUs\n", G->nprocs, NROWS, NROWS); G->nprocs = NROWS; } if(NCOLS / VECTOR_LENGTH < G->nprocs) { printf("%d SPUs available, but only %d column vectors of size %d, so using %d SPUs\n", G->nprocs, (NCOLS/VECTOR_LENGTH), VECTOR_LENGTH, (NCOLS/VECTOR_LENGTH)); G->nprocs = (NCOLS/VECTOR_LENGTH); } /* Initialize the model parameters */ init_model(G); /* Create SPE threads */ create_spe_pthreads(G); /* Wait for SPEs to finish initialization */ wait_all_spes(G); printf("\nRunning %d threads (%d SPU + 1 PPU).\n", (G->nprocs+1), G->nprocs); /* Add emissions */ process_emissions(G); /* Print startup banner */ print_start_banner(G); /* Store initial concentration */ printf("Writing initial concentration data... 
"); write_conc(G, 0, 0); printf("done.\n"); /* BEGIN CALCULATIONS */ for(iter=1, G->time = G->tstart; G->time <= G->tend; G->time += G->dt, ++iter) { start_saprc99(G); for(k=0; k<NLOOKAT; k++) { start_discretize_row(G, LOOKAT[k], G->dt/2.0); start_discretize_col(G, LOOKAT[k], G->dt); start_discretize_row(G, LOOKAT[k], G->dt/2.0); } update_model(G); #if WRITE_EACH_ITER == 1 write_conc(G, iter, 0); #endif printf(" After iteration %02d: Model time = %07.2f sec.\n", iter, iter*G->dt); } /* END CALCULATIONS */ /* Wait for SPU-thread to complete execution. */ join_all_spes(G); /* Store concentration */ #if WRITE_EACH_ITER != 1 write_conc(G, iter-1, 0); #endif /* Show final time */ printf("\nFinal time: %f seconds.\n", (iter-1)*G->dt); timer_stop(&G->ppe_metrics.wallclock); /* Write metrics to CSV file */ write_metrics_as_csv(G, "Cell B.E."); /* Cleanup and exit */ free_global_memory(G); return 0; }