Beispiel #1
0
/*
 * Create a context object for the Cell device.
 *
 * Queries libspe2 for the number of physical CPU nodes, physical SPEs and
 * usable SPEs (the two SPE counts are only used for debug output), then
 * allocates a struct _cl_context with malloc().  The fields of the new
 * object are NOT initialized here.
 *
 * errcode_ret is passed to setErrCode() and receives:
 *   CL_SUCCESS               on success,
 *   CL_DEVICE_NOT_AVAILABLE  when fewer than one CPU node is reported,
 *   CL_OUT_OF_HOST_MEMORY    when the allocation fails.
 *
 * Returns the new context, or a null context on failure.
 */
extern cl_context
createCellContext (cl_int * errcode_ret)
{

  //CBE Programmer's Guide - page 43
  setErrCode(errcode_ret, CL_SUCCESS);

  int node_count = spe_cpu_info_get (SPE_COUNT_PHYSICAL_CPU_NODES, -1);
  PRINT_DEBUG("Num nodes: %d\n", node_count);

  int phys_spes = spe_cpu_info_get (SPE_COUNT_PHYSICAL_SPES, -1);
  PRINT_DEBUG("Num physical spes: %d\n", phys_spes);

  int usable_spes = spe_cpu_info_get (SPE_COUNT_USABLE_SPES, -1);
  PRINT_DEBUG("Num usable spes: %d\n", usable_spes);

  /* spe_cpu_info_get() reports -1 on error, which also lands here. */
  if (node_count < 1)
    {
      setErrCode(errcode_ret, CL_DEVICE_NOT_AVAILABLE);
      return (cl_context) 0;
    }

  /* sizeof yields a size_t, so it must be printed with %zu, not %d. */
  PRINT_DEBUG("sizeof(cl_context) == %zu\n", sizeof (struct _cl_context));

  cl_context context = malloc (sizeof (struct _cl_context));
  if (context == (cl_context) 0)
    {
      /* Do not report CL_SUCCESS together with a null context. */
      setErrCode(errcode_ret, CL_OUT_OF_HOST_MEMORY);
      return (cl_context) 0;
    }

  return context;
}
Beispiel #2
0
/*
 * Run 'simple_spu' on every usable SPE (at most MAX_SPU_THREADS of them).
 *
 * One SPE context and one pthread are created per SPE; afterwards each
 * thread is joined and its context destroyed.  Any failure prints a
 * message with perror() and terminates the process with exit(1).
 */
int main()
{
	spe_context_ptr_t spe_contexts[MAX_SPU_THREADS];
	pthread_t workers[MAX_SPU_THREADS];
	int count;
	int n;

	/* Number of SPE threads: all usable SPEs, capped at MAX_SPU_THREADS. */
	count = spe_cpu_info_get(SPE_COUNT_USABLE_SPES, -1);
	if (count > MAX_SPU_THREADS) {
		count = MAX_SPU_THREADS;
	}

	/* Bring up one context and one thread per SPE. */
	for (n = 0; n < count; n++) {
		spe_contexts[n] = spe_context_create(0, NULL);
		if (spe_contexts[n] == NULL) {
			perror("Failed creating context");
			exit(1);
		}

		/* The SPE program has to be loaded before the thread runs it. */
		if (spe_program_load(spe_contexts[n], &simple_spu) != 0) {
			perror("Failed loading program");
			exit(1);
		}

		/* The thread receives a pointer to its own context slot. */
		if (pthread_create(&workers[n], NULL, &ppu_pthread_function,
				   &spe_contexts[n]) != 0) {
			perror("Failed creating thread");
			exit(1);
		}
	}

	/* Collect every thread, then release the context it was using. */
	for (n = 0; n < count; n++) {
		if (pthread_join(workers[n], NULL) != 0) {
			perror("Failed pthread_join");
			exit(1);
		}

		if (spe_context_destroy(spe_contexts[n]) != 0) {
			perror("Failed destroying context");
			exit(1);
		}
	}

	printf("\nThe program has successfully executed.\n");
	return 0;
}
Beispiel #3
0
/*
 * Start the SPU threads.
 *
 * spu_threads: requested number of SPU threads.  A negative value means
 *              "use every usable SPE"; a request larger than the number of
 *              usable SPEs is reduced to that number with a warning.
 * spu_data:    receives the final thread count, the per-SPU array
 *              (allocated here with malloc) and the gang context.
 *
 * For each SPU a context is created inside the gang, cellspu_bootloader is
 * loaded into it, and a pthread running spu_bootstrap_thread is started
 * with a pointer to that context.  Any failure prints a message with
 * perror() and terminates the process with exit(1).
 */
void startSpuThreads(int spu_threads, SpuThreadData * spu_data) {

	int i, no_spus;

	/* Determine the number of SPE threads to create */
	no_spus = spe_cpu_info_get(SPE_COUNT_USABLE_SPES, -1);

	/* spe_cpu_info_get() returns -1 on error; treat that as "no SPEs" so a
	 * negative count never reaches the allocation size below. */
	if (no_spus < 0) {
		no_spus = 0;
	}

	if (spu_threads < 0) {
		spu_threads = no_spus;
	} else if (no_spus < spu_threads) {
		spu_threads = no_spus;
		printf("Warning: Only %i Cell SPU processors available\n", spu_threads);
	}

	spu_data->no_spu_threads = spu_threads;
	spu_data->spus = (SpuData *) malloc(sizeof(SpuData) * spu_threads);

	/* malloc(0) may legitimately return NULL, so only a NULL result for a
	 * non-zero request is a failure.  Continuing past it would dereference
	 * the NULL array in the loop below. */
	if (spu_threads > 0 && spu_data->spus == NULL) {
		perror("Failed to allocate SPU data for threads");
		exit(1);
	}

	printf("Bringing up %i Cell SPU threads\n", spu_threads);

	/* create the context gang */
	if ((spu_data->gang = spe_gang_context_create(0)) == NULL) {
		perror("Failed creating Cell SPU gang context");
		exit(1);
	}

	for(i=0; i<spu_threads; i++) {
		/* Create context */
		if ((spu_data->spus[i].ctx = spe_context_create (CTX_FLAGS, spu_data->gang)) == NULL) {
			perror ("Failed creating Cell SPU context");
			exit (1);
		}

		/* load bootloader into spu's */
		if (spe_program_load (spu_data->spus[i].ctx, &cellspu_bootloader)) {
			perror ("Failed loading Cell SPU bootloader");
			exit (1);
		}

		/* create a thread for each SPU */
		if (pthread_create (&(spu_data->spus[i].boot_thread),
												NULL,
												&spu_bootstrap_thread,
												&(spu_data->spus[i].ctx))) {
			perror ("Failed creating Cell SPU thread");
			exit (1);
		}
	}
}
Beispiel #4
0
/*
 * Initialize the MMGP globals: the process id, the number of physical
 * SPEs, and the table of function pointers used by the runtime.
 */
void MMGP_init()
{
    MMGP_pid = getpid();
    NUM_SPE = spe_cpu_info_get(SPE_COUNT_PHYSICAL_SPES, -1);

    /*
     * The sampling phase is meant to use the variants that contain timing
     * instructions; otherwise the variants without them are used so that
     * the timing overhead is avoided.  Here the offload and prediction
     * hooks are both set to _empty.
     */
    MMGP_prediction = &_empty;
    MMGP_offload = &_empty;

    MMGP_create_threads = &_create_threads;
    MMGP_start_SPE = &_start_SPE;
    MMGP_wait_SPE = &_wait_SPE;
}
Beispiel #5
0
/*
 * Start the SPE dispatcher threads.
 *
 * The number of threads is the number of usable SPEs, clamped first to
 * MAX_SPU_NUM and then to numspes; the result is stored in the global
 * speThreads.  The dispatcher image is opened from a relative path, one SPE
 * thread is created per slot with CreateSPEThread(), and each SPE is sent
 * two mailbox words: its index as id, then its index again as seed.
 *
 * numspes: upper bound on the number of SPEs to use.
 *
 * Returns 0 on success, -1 if the dispatcher image could not be opened.
 */
initDisp( unsigned int numspes )
{
  unsigned int i;
  spe_program_handle_t *dispatcher;

  // Get the number of available SPEs
  speThreads = spe_cpu_info_get(SPE_COUNT_USABLE_SPES, -1);
  // Clamp to the defined number of SPEs used
  if ( speThreads > MAX_SPU_NUM )
    {
      speThreads = MAX_SPU_NUM;
    }
  if( speThreads > numspes )
    {
      speThreads = numspes;
    }

  // Get dispatcher
  dispatcher = spe_image_open( "./../../../../numpycbe_dispatcher" );
  if ( dispatcher == NULL )
    {
      // Without an image there is nothing to load into the SPE threads;
      // passing NULL on to CreateSPEThread would only fail later.
      perror( "spe_image_open" );
      return -1;
    }

  // Initialize threads
  for( i = 0 ; i < speThreads ; i++ )
    {
      CreateSPEThread( &speData[i], dispatcher, &spe_pointer_addr[i] );

      // Sending the SPE its id
      spe_in_mbox_write ( speData[i].spe_ctx, &i, 1, SPE_MBOX_ALL_BLOCKING );
      // Sending the SPE its seed. This should be something like time instead of id?
      spe_in_mbox_write ( speData[i].spe_ctx, &i, 1, SPE_MBOX_ALL_BLOCKING );
    }

  return 0;
}
Beispiel #6
0
/*
 * Element-wise work on A and B, split across the usable SPEs.
 *
 * A[i] = i and B[i] = ARR_SIZE - i are prepared on the PPU, every SPE thread
 * is handed a contiguous slice of A, B and C, and after all threads are
 * joined the result is accepted when every C[i] equals ARR_SIZE.
 */
int main()
{
    float A[ARR_SIZE] __attribute__ ((aligned(16)));
    float B[ARR_SIZE] __attribute__ ((aligned(16)));
    float C[ARR_SIZE] __attribute__ ((aligned(16)));

    pointers_t slices[MAX_SPU_THREADS] __attribute__ ((aligned(16)));
    pthread_t workers[MAX_SPU_THREADS];
    int idx;
    int worker_count;
    int all_ok;

    /* Fill the inputs and clear the output. */
    for (idx = 0; idx < ARR_SIZE; idx++) {
        A[idx] = idx;
        B[idx] = ARR_SIZE - idx;
        C[idx] = 0;
    }

    /* Use every usable SPE, but never more than MAX_SPU_THREADS. */
    worker_count = spe_cpu_info_get(SPE_COUNT_USABLE_SPES, -1);
    if (worker_count > MAX_SPU_THREADS) {
        worker_count = MAX_SPU_THREADS;
    }

    /* Hand each thread its slice and start it. */
    for (idx = 0; idx < worker_count; idx++) {
        /* Computed inside the loop so nothing is divided when there are
         * no workers at all. */
        int slice_len = ARR_SIZE / worker_count;
        int slice_bytes = slice_len * sizeof(float);
        int offset = idx * slice_len;

        slices[idx].A = A + offset;
        slices[idx].B = B + offset;
        slices[idx].C = C + offset;
        slices[idx].dim = slice_bytes;

        if (pthread_create(&workers[idx], NULL, &ppu_pthread_function,
                           &slices[idx]) != 0) {
            perror ("Failed creating thread");
            exit (1);
        }
    }

    /* Wait until every SPU thread is done. */
    for (idx = 0; idx < worker_count; idx++) {
        if (pthread_join(workers[idx], NULL) != 0) {
            perror("Failed pthread_join");
            exit (1);
        }
    }

    printf("\nThe program has successfully executed.\n");

    /* Every element of C is expected to equal ARR_SIZE. */
    all_ok = 1;
    for (idx = 0; idx < ARR_SIZE; idx++) {
        if (C[idx] != ARR_SIZE) {
            all_ok = 0;
        }
    }

    if (all_ok) {
        printf("Result is correct.\n");
    } else {
        printf("RESULT IS INCORRECT!\n");
    }
    return 0;
}
Beispiel #7
0
int main(int argc, char** argv)
{
    /* Iterators */
    int i, j, k;
    
    uint32_t block;
    
    /* Time (seconds) */
    long t_0;
    long t_end;
    long dt;
    long steps;
    long iter;
    
    /* Emission control */
    bool emflag = TRUE;
    
    /* Start wall clock timer */
    timer_start(TIMER_WALLCLOCK);
    
    /* Initialize parallelization */
    nprocs = spe_cpu_info_get(SPE_COUNT_USABLE_SPES, -1);
    nprocs = nprocs > MAX_THREADS ? MAX_THREADS : nprocs;
    
    if(argc > 1)
    {
        i = atoi(argv[1]);
        if(i < 1)
        {
            fprintf(stderr, "Invalid number of SPUs: %d < 1.\n", i);
            exit(1);
        }
        
        if(i < nprocs)
        {
            nprocs = i;
        }
        else 
        {
            printf("%d SPUs unavailable.  Using %d instead.\n", i, nprocs);
        }
    }
    
    /* Create SPE threads */
    for(i=0; i<nprocs; i++) 
    {
        threads[i].argp = (void*)(&spe_argvs[i]);
        
        /* Create context */
        if((threads[i].speid = spe_context_create(0, NULL)) == NULL) 
        {
            fprintf(stderr, "Failed spe_context_create(errno=%d strerror=%s)\n", errno, strerror(errno));
            exit(1);
        }
        
        /* Load program into context */
        if(spe_program_load(threads[i].speid, &fixedgrid_spu)) 
        {
            fprintf(stderr, "Failed spe_program_load(errno=%d strerror=%s)\n", errno, strerror(errno));
            exit(1);
        }
            
        /* Create thread for each SPE context */
        if(pthread_create(&threads[i].pthread, NULL, &ppu_pthread_function, &threads[i])) 
        {
            fprintf(stderr, "Failed pthread_create(errno=%d strerror=%s)\n", errno, strerror(errno));
            exit(1);
        }
        
        spe_set_status(i, SPE_STATUS_WAITING);
    }
    
    printf("\nRunning %d threads (%d SPU + 1 PPU).\n", (nprocs+1), nprocs);
    
    /* Allocate concentration memory */
    //conc = _malloc_align(NROWS*NCOLS*sizeof(double), 7);
    //conc_buff = (double*)_malloc_align(MAX_THREADS*NY*sizeof(double), 7);

    /* Allocation wind vector filed memory */
    //wind_u = _malloc_align(NROWS*NCOLS*sizeof(double), 7);
    //wind_v = _malloc_align(NROWS*NCOLS*sizeof(double), 7);
    //wind_u_buff = (double*)_malloc_align(MAX_THREADS*NY*sizeof(double), 7);
    //wind_v_buff = (double*)_malloc_align(MAX_THREADS*NY*sizeof(double), 7);
 
    /* Allocation diffusion tensor memory */
    //diff = _malloc_align(NROWS*NCOLS*sizeof(double), 7);
    //diff_buff = (double*)_malloc_align(MAX_THREADS*NY*sizeof(double), 7);

    /* Initialize concentration data */
    double_array_init(NROWS*NCOLS, conc, O3_INIT);
        
    /* Initialize wind field */
    double_array_init(NROWS*NCOLS, wind_u, WIND_U_INIT);
    double_array_init(NROWS*NCOLS, wind_v, WIND_V_INIT);
    
    /* Initialize diffusion field */
    double_array_init(NROWS*NCOLS, diff, DIFF_INIT);
    
    /* Initialize time */
    t_0 = 0.0;
    t_end = year2sec(END_YEAR - START_YEAR) + day2sec(END_DOY - START_DOY) + 
            hour2sec(END_HOUR - START_HOUR) + minute2sec(END_MIN - START_MIN);
    dt = STEP_SIZE;
    steps = (long)( (t_end - t_0)/dt );
    
    /* Print startup banner */
    print_start_banner(NX*DX, NY*DY, 0.0, t_end, steps);
    
    /* Store initial concentration */
    write_conc(&(conc[0]), 0, 0);
    
    /* BEGIN CALCULATIONS */
    for(iter = 1; iter <= steps; iter++)
    {
        emflag = iter*dt < 6*3600.0 ? TRUE : FALSE;
        
        timer_start(TIMER_ROW_DISCRET);
        
        /* Discretize rows 1/2 timestep */
        block = NROWS / nprocs;
        for(i=0; i<nprocs; i++)
        {
            /* Configure SPE arguments */
            spe_argvs[i].arg[0].u64 = (uint64_t)(&conc[i*block*NX]);
            spe_argvs[i].arg[1].u64 = (uint64_t)(&wind_u[i*block*NX]);
            spe_argvs[i].arg[2].u64 = (uint64_t)(&diff[i*block*NX]);
            spe_argvs[i].arg[3].dbl = dt/2;
            spe_argvs[i].arg[4].dbl = DX;
            spe_argvs[i].arg[5].u32[0] = NX;
            spe_argvs[i].arg[5].u32[1] = (i == nprocs - 1 ? block + NROWS % nprocs : block);  //FIXME
            
            /* Signal SPE */
            spe_set_status(i, SPE_STATUS_WORKING);
        }
        
        /* Wait for SPEs to finish */
        wait_all_spes();
        
        timer_stop(TIMER_ROW_DISCRET);
        
        timer_start(TIMER_COL_DISCRET);
        
        /* Discretize colums 1 timestep */
        for(i=0; i<NCOLS; i++)
        {
            k = i % nprocs;

            while(spe_get_status(k) > 0) ; //intentional wait
            
            if(i >= nprocs)
            {
                timer_start(TIMER_ARRAY_COPY);
                for(j=0; j<NY; j++)
                {
                    conc[i-nprocs + j*NX] = ccol[k*NY+j];
                }
                timer_stop(TIMER_ARRAY_COPY);
            }
            
            timer_start(TIMER_ARRAY_COPY);
            for(j=0; j<NY; j++)
            {
                ccol[k*NY + j] = conc[i + j*NX];
                wcol[k*NY + j] = wind_v[i + j*NX];
                dcol[k*NY + j] = diff[i + j*NX];
            }
            timer_stop(TIMER_ARRAY_COPY);

            // Configure SPE arguments 
            spe_argvs[k].arg[0].u64 = (uint64_t)(&ccol[k*NY]);
            spe_argvs[k].arg[1].u64 = (uint64_t)(&wcol[k*NY]);
            spe_argvs[k].arg[2].u64 = (uint64_t)(&dcol[k*NY]);
            spe_argvs[k].arg[3].dbl = dt;
            spe_argvs[k].arg[4].dbl = DY;
            spe_argvs[k].arg[5].u32[0] = NY;
            spe_argvs[k].arg[5].u32[1] = 1;

            // Signal SPE 
            spe_set_status(k, SPE_STATUS_WORKING);
        }

        /* Wait for SPEs to finish */
        wait_all_spes();
        
        timer_stop(TIMER_COL_DISCRET);
        
        timer_start(TIMER_ROW_DISCRET);
        
        /* Discretize rows 1/2 timestep */
        block = NROWS / nprocs;
        for(i=0; i<nprocs; i++)
        {
            /* Configure SPE arguments */
            spe_argvs[i].arg[0].u64 = (uint64_t)(&conc[i*block*NX]);
            spe_argvs[i].arg[1].u64 = (uint64_t)(&wind_u[i*block*NX]);
            spe_argvs[i].arg[2].u64 = (uint64_t)(&diff[i*block*NX]);
            spe_argvs[i].arg[3].dbl = dt/2;
            spe_argvs[i].arg[4].dbl = DX;
            spe_argvs[i].arg[5].u32[0] = NX;
            spe_argvs[i].arg[5].u32[1] = (i == nprocs - 1 ? block + NROWS % nprocs : block);  //FIXME
            
            /* Signal SPE */
            spe_set_status(i, SPE_STATUS_WORKING);
        }
        
        /* Wait for SPEs to finish */
        wait_all_spes();
        
        timer_stop(TIMER_ROW_DISCRET);
        
        /*
         * Could update wind field here...
         */
         
        /*
         * Could update diffusion tensor here...
         */
        
        /* Add emissions */
        if(emflag)
        {
            conc[SOURCE_Y*NX + SOURCE_X] += dt * (SOURCE_RATE) / (DX * DY * 1000.0);
        }
        
        /* Store concentration */
        #ifdef WRITE_EACH_ITER
        write_conc(conc, iter, 0);
        #endif
        
        /* Indicate progress */
        if(iter % 10 == 0)
        {
            printf("Iteration %ld of %ld.  Time = %ld seconds.\n", iter, steps, iter*dt);
        }
        
    }
    /* END CALCULATIONS */
    
    /* Wait for SPU-thread to complete execution. */
    for(i=0; i<nprocs; i++) 
    {
        spe_set_status(i, SPE_STATUS_STOPPED);
        if(pthread_join(threads[i].pthread, NULL)) 
        {
            perror("Failed pthread_join");
            exit(1);
        }
    }
    
    /* Store concentration */
    write_conc(conc, iter-1, 0);
    
    /* Show final time */
    printf("Final time: %ld seconds.\n", (iter-1)*dt);
    
    timer_stop(TIMER_WALLCLOCK);

    print_timer_summary("===PPU Timers===");    
    
    /* Cleanup and exit */
    return 0;
}
Beispiel #8
0
/*
 * Start the requested number of SPU threads and wait for them.
 *
 * Usage: ppu_threads <count>, where count is between 1 and the number of
 * usable SPEs reported for CPU node 0.  For each SPU a context is created,
 * spu_threads is loaded into it, the SPU index is stored as the thread
 * argument, and a pthread running ppu_pthread_function is started.
 * Afterwards every thread is joined and its context destroyed.
 *
 * Exits with status 1 on a usage error or when any step fails.
 */
int main(int argc, char **argv) {
   int i, retval, spus, requested;

   /* Determine number of available SPUs */
   spus = spe_cpu_info_get(SPE_COUNT_USABLE_SPES, 0);
   if (argc != 2) {
      /* spus is an int, so it is printed with %d. */
      printf("Usage: 'ppu_threads <1-%d>'\n", spus);
      exit(1);
   }

   /* Parse the argument once instead of on every comparison. */
   requested = atoi(argv[1]);
   if ((requested < 1) || (requested > spus)) {
      printf("Usage: 'ppu_threads <1-%d>'\n", spus);
      exit(1);
   }
   spus = requested;

   /* Create a context and thread for each SPU */
   for (i=0; i<spus; i++) {

      /* Create context */
      if ((data[i].speid = spe_context_create(0, NULL)) == NULL)
      {
         perror("spe_context_create");
         exit(1);
      }

      /* Load program into the context */
      if ((retval =
          spe_program_load(data[i].speid, &spu_threads)) != 0)
      {
         perror("spe_program_load");
         exit (1);
      }

      /* Initialize control block and thread data:
       * the SPU index itself is passed as the argument pointer value. */
      control_block = i;
      data[i].argp = (void*)control_block;

      /* Create thread */
      if ((retval =
          pthread_create(
              &data[i].pthread,
              NULL,
              &ppu_pthread_function,
              &data[i])) != 0)
      {
         perror("pthread_create");
         exit (1);
      }
   }

   /* Wait for the threads to finish processing */
   for (i = 0; i < spus; i++)
   {
      if ((retval = pthread_join(data[i].pthread, NULL)) != 0)
      {
          perror("pthread_join");
          exit (1);
      }

      if ((retval = spe_context_destroy (data[i].speid)) != 0)
      {
          perror("spe_context_destroy");
          exit (1);
      }
   }
   return 0;
}
Beispiel #9
0
/**
 * Create and initialize a Cell pipe context.
 *
 * Allocates a 16-byte aligned, zero-filled struct cell_context, fills in
 * the pipe_context fields and callbacks, runs the cell_init_* helpers,
 * creates the draw module, the fragment-ops keymap cache and the vbuf
 * stage, reads the debug flags, initializes the buffer fences, determines
 * the number of Cell nodes and SPUs, starts the SPUs, sets up the batch
 * buffers and finally flushes with CELL_FLUSH_WAIT.
 *
 * \param screen  stored in cell->pipe.screen
 * \param priv    stored in cell->pipe.priv
 * \return pointer to the embedded pipe_context, or NULL if the
 *         cell_context allocation fails.
 *
 * NOTE(review): the results of cell_draw_create() and util_new_keymap()
 * are not checked in this function.
 */
struct pipe_context *
cell_create_context(struct pipe_screen *screen,
                    void *priv )
{
   struct cell_context *cell;
   uint i;

   /* some fields need to be 16-byte aligned, so align the whole object */
   cell = (struct cell_context*) align_malloc(sizeof(struct cell_context), 16);
   if (!cell)
      return NULL;

   memset(cell, 0, sizeof(*cell));

   cell->winsys = NULL;		/* XXX: fixme - get this from screen? */
   cell->pipe.winsys = NULL;
   cell->pipe.screen = screen;
   cell->pipe.priv = priv;
   cell->pipe.destroy = cell_destroy_context;

   cell->pipe.clear = cell_clear;
   cell->pipe.flush = cell_flush;

   /* query callbacks are compiled out */
#if 0
   cell->pipe.begin_query = cell_begin_query;
   cell->pipe.end_query = cell_end_query;
   cell->pipe.wait_query = cell_wait_query;
#endif

   cell_init_draw_functions(cell);
   cell_init_state_functions(cell);
   cell_init_shader_functions(cell);
   cell_init_surface_functions(cell);
   cell_init_vertex_functions(cell);
   cell_init_texture_transfer_funcs(cell);

   cell->draw = cell_draw_create(cell);

   /* Create cache of fragment ops generated code */
   cell->fragment_ops_cache =
      util_new_keymap(sizeof(struct cell_fragment_ops_key), ~0, NULL);

   cell_init_vbuf(cell);

   /* cell->vbuf becomes the rasterize stage of the draw module */
   draw_set_rasterize_stage(cell->draw, cell->vbuf);

   /* convert all points/lines to tris for the time being */
   draw_wide_point_threshold(cell->draw, 0.0);
   draw_wide_line_threshold(cell->draw, 0.0);

   /* get env vars or read config file to get debug flags */
   cell->debug_flags = debug_get_flags_option("CELL_DEBUG", 
                                              cell_debug_flags, 
                                              0 );

   for (i = 0; i < CELL_NUM_BUFFERS; i++)
      cell_fence_init(&cell->fenced_buffers[i].fence);


   /*
    * SPU stuff
    */
   /* This call only works with SDK 3.0.  Anyone still using 2.1??? */
   cell->num_cells = spe_cpu_info_get(SPE_COUNT_PHYSICAL_CPU_NODES, -1);
   cell->num_spus = spe_cpu_info_get(SPE_COUNT_USABLE_SPES, -1);
   if (cell->debug_flags) {
      printf("Cell: found %d Cell(s) with %u SPUs\n",
             cell->num_cells, cell->num_spus);
   }
   /* CELL_NUM_SPUS, when set, replaces the detected SPU count.
    * NOTE(review): the value is only validated by assert(), which is a
    * no-op when NDEBUG is defined. */
   if (getenv("CELL_NUM_SPUS")) {
      cell->num_spus = atoi(getenv("CELL_NUM_SPUS"));
      assert(cell->num_spus > 0);
   }

   cell_start_spus(cell);

   cell_init_batch_buffers(cell);

   /* make sure SPU initializations are done before proceeding */
   cell_flush_int(cell, CELL_FLUSH_WAIT);

   return &cell->pipe;
}
//SpuLibspe2Support helps to initialize/shutdown libspe2, start/stop SPU tasks and communication
///Setup and initialize SPU/CELL/Libspe2
SpuLibspe2Support::SpuLibspe2Support(spe_program_handle_t *speprog, int numThreads)
{
	// Keep the SPE program handle that was passed in.
	this->program = speprog;

	// Use the requested thread count unless it exceeds the number of
	// physical SPEs, in which case that number is used instead.
	if (numThreads <= spe_cpu_info_get(SPE_COUNT_PHYSICAL_SPES, -1))
	{
		this->numThreads = numThreads;
	}
	else
	{
		this->numThreads = spe_cpu_info_get(SPE_COUNT_PHYSICAL_SPES, -1);
	}
}
Beispiel #11
0
/*
 * Start spu_program on every usable SPE of CPU node 0 and report which
 * physical SPE each context ended up on.
 *
 * After all threads are started, the entries of /spu whose names begin with
 * "spethread" are listed together with the contents of their phys-id file.
 * Each SPE is then sent the value 1 through its inbound mailbox, its thread
 * is joined and its context destroyed.
 *
 * Exits with status 128 when any libspe2, pthread, scandir or fopen call
 * fails.
 */
int main(int argc, char **argv)
{
	int i, n, retval, nspus;
	char temp[256];
	struct dirent **spu_files;
	FILE *fh;
	unsigned int one = 1;

	nspus = spe_cpu_info_get(SPE_COUNT_USABLE_SPES, 0);
	printf(" [PPU]: # usable synergistic processing units = %d\n", nspus);

	for (i = 0; i < nspus; ++i) {
		if (NULL == (data[i].id = spe_context_create(0, NULL))) {
			perror("spe_context_create");
			exit(128);
		}

		retval = spe_program_load(data[i].id, &spu_program);
		if (unlikely(retval)) {
			perror("spe_program_load");
			exit(128);
		}

		/* The SPE index is passed as the argument pointer value. */
		data[i].argp = (void *)(ull )i;

		retval = pthread_create(&data[i].pthread, NULL,
		                        ppu_pthread_function, &data[i]);
		if (unlikely(retval)) {
			perror("pthread_create");
			exit(128);
		}
	}

	n = scandir("/spu", &spu_files, NULL, alphasort);
	if (n < 0) {
		/* On failure scandir() returns -1 and leaves spu_files unset;
		 * "while (n--)" would then walk an invalid array. */
		perror("scandir");
		exit(128);
	}

	while (n--) {
		if (!strncmp(spu_files[n]->d_name, "spethread", 9)) {
			snprintf(temp, sizeof(temp), "/spu/%s/phys-id",
				 spu_files[n]->d_name);
			if (NULL == (fh = fopen(temp, "r"))) {
				perror("fopen");
				exit(128);
			}

			/* temp still holds the path here; never print it as
			 * the id when nothing could be read. */
			if (NULL == fgets(temp, sizeof(temp), fh))
				snprintf(temp, sizeof(temp), "<unknown>\n");
			fclose(fh);

			printf(" [PPU]: context = %s: physical id = %s",
			       spu_files[n]->d_name, temp);
		}

		/* Every entry returned by scandir() is allocated separately. */
		free(spu_files[n]);
	}
	free(spu_files);

	for (i = 0; i < nspus; ++i) {
		retval = spe_in_mbox_write(data[i].id, &one, 1,
		                           SPE_MBOX_ALL_BLOCKING);
		if (unlikely(1 != retval)) {
			perror("spe_in_mbox_write");
			exit(128);
		}

		retval = pthread_join(data[i].pthread, NULL);
		if (unlikely(retval)) {
			perror("pthread_join");
			exit(128);
		}

		retval = spe_context_destroy(data[i].id);
		if (unlikely(retval)) {
			perror("spe_context_destroy");
			exit(128);
		}
	}

	return 0;
}
Beispiel #12
0
int main(int argc, char **argv)
{
	


// setup, assign particles initla positions and masses
// this is done in scalar fashion, NOT SIMD
// insignificant to performance since it's only done once

	//time_t startTime = time(NULL);



	//seed random generator
	srand( time(NULL) );

	printf("\n\n\n~~~~~~~~Printing out particles and their randomly assigned positions: \n\n");

	int pC = 0;
	for(pC = 0; pC < PARTICLES_MAXCOUNT; ++pC)
	{
		int grideSize = GRID_SIZE;

	//	printf("\n grideSize/2: %d", grideSize/2);

		float xPos = (float)( rand() % grideSize  - grideSize/2);
		float yPos = (float)( rand() % grideSize  - grideSize/2);
		float zPos = (float)( rand() % grideSize  - grideSize/2);

		particle_Array_PPU[pC].position[0] = xPos;
		particle_Array_PPU[pC].position[1] = yPos;
		particle_Array_PPU[pC].position[2] = zPos;

		particle_Array_PPU[pC].velocity[3] = PARTICLES_DEFAULTMASS;

		if(pC == 0)
		{
			// center, high mass
			particle_Array_PPU[pC].position = zeroVector;
			particle_Array_PPU[pC].velocity = zeroVector; //initialVelocityVector_Y_minus;

			printf("Earth mass: %f\n", earthMass );
			particle_Array_PPU[pC].velocity[3] = earthMass; // PARTICLES_DEFAULTMASS * 500.0f;
		}
		if(pC == 1)
		{
			particle_Array_PPU[pC].position = issPosition; //initPositionVector;
			particle_Array_PPU[pC].velocity = issVelocity; //initialVelocityVector_Y;

			particle_Array_PPU[pC].velocity[3] = issMass; //PARTICLES_DEFAULTMASS * 500.0f;

		}
		if(pC == 2)
		{
			particle_Array_PPU[pC].position = sat1Position; //initPositionVector;
			particle_Array_PPU[pC].velocity = sat1Velocity; //initialVelocityVector_Y;


			particle_Array_PPU[pC].velocity[3] = satMass; 

		}
		if(pC == 3)
		{
			particle_Array_PPU[pC].position = sat2Position; //initPositionVector;
			particle_Array_PPU[pC].velocity = sat2Velocity; //initialVelocityVector_Y;


			particle_Array_PPU[pC].velocity[3] = satMass; 

		}
		if(pC == 4)
		{
			particle_Array_PPU[pC].position = sat3Position; //initPositionVector;
			particle_Array_PPU[pC].velocity = sat3Velocity; //initialVelocityVector_Y;


			particle_Array_PPU[pC].velocity[3] = satMass; 

		}
		if(pC == 5)
		{
			particle_Array_PPU[pC].position = sat4Position; //initPositionVector;
			particle_Array_PPU[pC].velocity = sat4Velocity; //initialVelocityVector_Y;


			particle_Array_PPU[pC].velocity[3] = satMass; 

		}
		if(pC == 6)
		{
			particle_Array_PPU[pC].position = moonPosition; //initPositionVector;
			particle_Array_PPU[pC].velocity = moonVelocity; //initialVelocityVector_Y;


			particle_Array_PPU[pC].velocity[3] = moonMass; 

		}
		else
		{



		}

		//particle_Array_PPU[pC].position = vec_splat(particle_Array_PPU[pC].position, 1);
		//particle_Array_PPU[pC].position = vec_splats((float)GRAVITATIONALCONSTANT); --> use splats, seems faster
		
		printf("Particle %d:   ", pC );
		printf("x= %f, y=%f, z=%f , mass:%f", particle_Array_PPU[pC].position[0], particle_Array_PPU[pC].position[1], particle_Array_PPU[pC].position[2], particle_Array_PPU[pC].velocity[3]);
		printf("\n");
		
	}


	// copy arrays into spe ones
	pC = 0;
	for(pC = 0; pC < PARTICLES_MAXCOUNT; ++pC)
	{

		spe1_Data[pC] = particle_Array_PPU[pC];	
		spe2_Data[pC] = particle_Array_PPU[pC];	
		spe3_Data[pC] = particle_Array_PPU[pC];	
		spe4_Data[pC] = particle_Array_PPU[pC];	
		spe5_Data[pC] = particle_Array_PPU[pC];	
		spe6_Data[pC] = particle_Array_PPU[pC];		
	}

	for(i = 0; i<PARTICLES_MAXCOUNT; ++i)
	{
     /////// INSERT QUADRANT CODE HERE , actually octant --> 8 equal sub cubes 
		
		// compare with zero vector to get on which side of each axis the particle is
		// 0 is negative, 1 is positive side of the axis
		__vector bool int axisDirection = vec_cmpgt(particle_Array_PPU[i].position, zeroVector);



		// need to manually set, can't cast due to size difference error
		__vector unsigned int shiftedAxis = { (unsigned int)axisDirection[0],
											  (unsigned int)axisDirection[1],
											  (unsigned int)axisDirection[2],
												0};
		// need to do this to revert 1s into NON 2s complement form --> vec_cmgt doc LIES
		shiftedAxis = vec_andc(oneVector, shiftedAxis);

		/*
		printf("Particle %d axis sign:   ", i );
		printf("x= %x, y=%x, z=%x", shiftedAxis[0], shiftedAxis[1], shiftedAxis[2]);
		printf("\n");
		*/

		// shift 3 axies simultaneously (actually only 2, 1 stays in origina positon
		//, with intent to OR them later
		shiftedAxis = vec_sl(shiftedAxis, axisBitShiftMask); // will also use as x vector

		__vector unsigned int axis_Y = vec_splats(shiftedAxis[1]);
		__vector unsigned int axis_Z = vec_splats(shiftedAxis[2]);
		// merge shhifted x y z values by OR-ing
		// this gives the octant id, range from 0-7 (000 to 111 in binary)
		shiftedAxis = vec_or(shiftedAxis, axis_Y);
		shiftedAxis = vec_or(shiftedAxis, axis_Z);
		// insert octant value into last slot of position vector of particle
		particle_Array_PPU[i].position[3] = (float)shiftedAxis[0];

		//printf("Oct ID: %d \n", shiftedAxis[0]);

		/////// Update octant vector by incrementing octant that the particle is in
		// The only possible non SIMD line in the entire program, 
		//irreleant since quadrant counting should occur on PPU anyways
		octantCount[shiftedAxis[0]] ++ ;
		
	}
	i=0;

	printf("\n");

		printf("Particle disttribution across the octants: \n");
		printf("O0: %d    O1: %d    O2: %d    O3: %d    O4: %d    O5: %d    O6: %d    O7: %d\n",
				octantCount[0], octantCount[1], octantCount[2], octantCount[3], 
				octantCount[4],	octantCount[5], octantCount[6], octantCount[7]);
		printf("\n");


	int speCount = spe_cpu_info_get(SPE_COUNT_PHYSICAL_SPES,-1);
/*
	printf("\n");
	printf("%d", speCount);

	printf("\n");
	printf("\n");
	printf("--------------\n");
	printf("Starting spe1 part\n");
*/
/*
	// wait for user input, gives time to start graphics
	printf("Press Enter to continue\n");

	getchar();
*/

	struct timeval start;
	gettimeofday(&start,NULL);


	int iterCount = 0;
	for (iterCount = 0; iterCount< ITERATION_COUNT; iterCount++)
	{

		//printf("++++++++++++++ START of ITERATION # %d of %d +++++++++++++++\n", i, ITERATION_COUNT );

		int retval;
		pthread_t spe1_Thread;
		pthread_t spe2_Thread;
		pthread_t spe3_Thread;
		pthread_t spe4_Thread;
		pthread_t spe5_Thread;
		pthread_t spe6_Thread;


		//speData = spe1_Data;
		speNumber = 0;
		/* Create Thread */
	//	printf("spe1_Data value: %d\n", (int)spe1_Data );
		retval = pthread_create(&spe1_Thread, // Thread object
								NULL, // Thread attributes
								spe_code_launch_1, // Thread function
								NULL // Thread argument
								);

	//	printf("spe2_Data value: %d\n", (int)spe2_Data );
		
		retval = pthread_create(&spe2_Thread, // Thread object
								NULL, // Thread attributes
								spe_code_launch_2, // Thread function
								NULL // Thread argument
								);
		
		
		retval = pthread_create(&spe3_Thread, // Thread object
								NULL, // Thread attributes
								spe_code_launch_3, // Thread function
								NULL // Thread argument
								);

		
		retval = pthread_create(&spe4_Thread, // Thread object
								NULL, // Thread attributes
								spe_code_launch_4, // Thread function
								NULL // Thread argument
								);

		retval = pthread_create(&spe5_Thread, // Thread object
								NULL, // Thread attributes
								spe_code_launch_5, // Thread function
								NULL // Thread argument
								);

		retval = pthread_create(&spe6_Thread, // Thread object
								NULL, // Thread attributes
								spe_code_launch_6, // Thread function
								NULL // Thread argument
								);
		


		//Wait for Thread Completion
		retval = pthread_join(spe1_Thread, NULL);


		retval = pthread_join(spe2_Thread, NULL);

		
		retval = pthread_join(spe3_Thread, NULL);

		retval = pthread_join(spe4_Thread, NULL);
		
		retval = pthread_join(spe5_Thread, NULL);
		
		retval = pthread_join(spe6_Thread, NULL);
		

		
		speNumber = 1;
		
		for(i=(speNumber-1)*PARTICLES_MAXCOUNT/SPU_COUNT; i<speNumber*PARTICLES_MAXCOUNT/SPU_COUNT; ++i)
		{
			particle_Array_PPU[i] = spe1_Data[i];
		}

		speNumber = 2;
		for(i=(speNumber-1)*PARTICLES_MAXCOUNT/SPU_COUNT; i<speNumber*PARTICLES_MAXCOUNT/SPU_COUNT; ++i)
		{
			particle_Array_PPU[i] = spe2_Data[i];
		}

		speNumber = 3;
		for(i=(speNumber-1)*PARTICLES_MAXCOUNT/SPU_COUNT; i<speNumber*PARTICLES_MAXCOUNT/SPU_COUNT; ++i)
		{
			particle_Array_PPU[i] = spe3_Data[i];
		}

		speNumber = 4;
		for(i=(speNumber-1)*PARTICLES_MAXCOUNT/SPU_COUNT; i<speNumber*PARTICLES_MAXCOUNT/SPU_COUNT; ++i)
		{
			particle_Array_PPU[i] = spe4_Data[i];
		}

		speNumber = 5;
		for(i=(speNumber-1)*PARTICLES_MAXCOUNT/SPU_COUNT; i<speNumber*PARTICLES_MAXCOUNT/SPU_COUNT; ++i)
		{
			particle_Array_PPU[i] = spe5_Data[i];
		}

		speNumber = 6;
		for(i=(speNumber-1)*PARTICLES_MAXCOUNT/SPU_COUNT; i<PARTICLES_MAXCOUNT; ++i)
		{
			particle_Array_PPU[i] = spe6_Data[i];
		}

		// reset spe counter
		speNumber = 0;
		


		// copy arrays into spe ones
		pC = 0;
		for(pC = 0; pC < PARTICLES_MAXCOUNT; ++pC)
		{

			spe1_Data[pC] = particle_Array_PPU[pC];	
			spe2_Data[pC] = particle_Array_PPU[pC];	
			spe3_Data[pC] = particle_Array_PPU[pC];	
			spe4_Data[pC] = particle_Array_PPU[pC];	
			spe5_Data[pC] = particle_Array_PPU[pC];	
			spe6_Data[pC] = particle_Array_PPU[pC];	


			// update values for shared array (graphics)
			/*
			particle_Array_Shared[pC].position[0] = particle_Array_PPU[pC].position[0];
			particle_Array_Shared[pC].position[1] = particle_Array_PPU[pC].position[1];
			particle_Array_Shared[pC].position[2] = particle_Array_PPU[pC].position[2];
			particle_Array_Shared[pC].position[3] = particle_Array_PPU[pC].position[3];
			*/

			/*		
			printf("Particle %d positions:   ", pC );
			printf("x= %f, y=%f, z=%f , mass:%f", particle_Array_PPU[pC].position[0], particle_Array_PPU[pC].position[1], particle_Array_PPU[pC].position[2], particle_Array_PPU[pC].velocity[3]);
			printf("\n");
			*/


			fullSimilationData[iterCount].particleArray[pC]= particle_Array_PPU[pC];
		}

		

	//	printf("++++++++++++++ END of ITERATION # %d of %d +++++++++++++++\n", iterCount, ITERATION_COUNT );


	}

	struct timeval end;
	gettimeofday(&end,NULL);
	float deltaTime = ((end.tv_sec - start.tv_sec)*1000.0f + (end.tv_usec -start.tv_usec)/1000.0f);


	printf("print out values from post spe calculations\n");
	i = 0;
	for(i = 0; i<PARTICLES_MAXCOUNT; ++i)
	{

		printf("Particle %d positions:   ", i );
		printf("x= %f, y=%f, z=%f , mass:%f", particle_Array_PPU[i].position[0], particle_Array_PPU[i].position[1], particle_Array_PPU[i].position[2], particle_Array_PPU[i].velocity[3]);
		printf("\n");
	
	}
	//cleaining the array
	octantCount = resetOctantCount;
	for(i = 0; i<PARTICLES_MAXCOUNT; ++i)
	{
     /////// INSERT QUADRANT CODE HERE , actually octant --> 8 equal sub cubes 
		
		// compare with zero vector to get on which side of each axis the particle is
		// 0 is negative, 1 is positive side of the axis
		__vector bool int axisDirection = vec_cmpgt(particle_Array_PPU[i].position, zeroVector);



		// need to manually set, can't cast due to size difference error
		__vector unsigned int shiftedAxis = { (unsigned int)axisDirection[0],
											  (unsigned int)axisDirection[1],
											  (unsigned int)axisDirection[2],
												0};
		// need to do this to revert 1s into NON 2s complement form --> vec_cmgt doc LIES
		shiftedAxis = vec_andc(oneVector, shiftedAxis);

		/*
		printf("Particle %d axis sign:   ", i );
		printf("x= %x, y=%x, z=%x", shiftedAxis[0], shiftedAxis[1], shiftedAxis[2]);
		printf("\n");
		*/

		// shift 3 axies simultaneously (actually only 2, 1 stays in origina positon
		//, with intent to OR them later
		shiftedAxis = vec_sl(shiftedAxis, axisBitShiftMask); // will also use as x vector

		__vector unsigned int axis_Y = vec_splats(shiftedAxis[1]);
		__vector unsigned int axis_Z = vec_splats(shiftedAxis[2]);
		// merge shhifted x y z values by OR-ing
		// this gives the octant id, range from 0-7 (000 to 111 in binary)
		shiftedAxis = vec_or(shiftedAxis, axis_Y);
		shiftedAxis = vec_or(shiftedAxis, axis_Z);
		// insert octant value into last slot of position vector of particle
		particle_Array_PPU[i].position[3] = (float)shiftedAxis[0];

		//printf("Oct ID: %d \n", shiftedAxis[0]);

		/////// Update octant vector by incrementing octant that the particle is in
		// The only possible non SIMD line in the entire program, 
		//irreleant since quadrant counting should occur on PPU anyways
		octantCount[shiftedAxis[0]] ++ ;
		
	}
	i=0;

	printf("\n");

		printf("Particle disttribution across the octants: \n");
		printf("O0: %d    O1: %d    O2: %d    O3: %d    O4: %d    O5: %d    O6: %d    O7: %d\n",
				octantCount[0], octantCount[1], octantCount[2], octantCount[3], 
				octantCount[4],	octantCount[5], octantCount[6], octantCount[7]);
		printf("\n");



/*
	time_t endTime = time(NULL);
	int deltaTime = endTime - startTime;
*/

	// need to look into http://www.xmlsoft.org/


	printf("Execution time:    %f\n",deltaTime);


	FILE *filePointer;
	filePointer = fopen("fileLog1.txt","w");
	//fprintf(filePointer, "<SimulationData>\n");
	

	iterCount = 0;
	for (iterCount = 0; iterCount< ITERATION_COUNT; iterCount++)
	{
		//printf("Iteration: %d\n", iterCount);
		//fprintf(filePointer,"<Iter>\n");
		fprintf(filePointer,"\n");

		pC = 0;
	    for(pC = 0; pC < PARTICLES_MAXCOUNT; ++pC)
	    {
		
			//printf("Particle %d positions:   ", pC );
		//	fprintf(filePointer,"<Obj>\n");
	    	

			//printf("x= %f, y=%f, z=%f", fullSimilationData[iterCount].particleArray[pC].position[0], fullSimilationData[iterCount].particleArray[pC].position[1], fullSimilationData[iterCount].particleArray[pC].position[2]);
			//printf("\n");
			
	    	/*
			fprintf(filePointer,"<PX>%f</PX>\n",fullSimilationData[iterCount].particleArray[pC].position[0]);
			fprintf(filePointer,"<PY>%f</PY>\n",fullSimilationData[iterCount].particleArray[pC].position[1]);
			fprintf(filePointer,"<PZ>%f</PZ>\n",fullSimilationData[iterCount].particleArray[pC].position[2]);
			*/

			fprintf(filePointer,"%f,",fullSimilationData[iterCount].particleArray[pC].position[0]);
			fprintf(filePointer,"%f,",fullSimilationData[iterCount].particleArray[pC].position[1]);
			fprintf(filePointer,"%f",fullSimilationData[iterCount].particleArray[pC].position[2]);

			fprintf(filePointer,"|");
			//fprintf(filePointer,"</Obj>\n");			
			//fullSimilationData[fullDataCounter].particleArray[pC]= particle_Array_PPU[pC];
			
		}

		//fprintf(filePointer,"</Iter>\n");


	}


	//fprintf(filePointer, "</SimulationData>\n");


	fclose(filePointer);


	return 0;
}
Beispiel #13
0
/**
 * Returns the number of SPEs that are usable by this process.
 *
 * The count is queried from spe_cpu_info_get() across all CPU nodes (-1).
 * That call reports failure with a negative value; returning it unchanged
 * through an unsigned return type would turn the error into a huge SPE
 * count, so any non-positive result is reported as 0 instead.
 *
 * @return Number of usable SPEs, or 0 when the query fails.
 */
unsigned int GetNumSPEs()
{
	int usable = spe_cpu_info_get( SPE_COUNT_USABLE_SPES, -1 );

	return usable > 0 ? (unsigned int)usable : 0u;
}
Beispiel #14
0
/**
 * @brief Classifies a set of test points using a set of training points.
 *
 * Fills the global control block `cb` with the sizes and effective addresses
 * of both point sets, creates one SPE context and one pthread per usable SPE
 * (capped at MAX_NUM_SPES), sends every SPE its index through its inbound
 * mailbox and then waits for all SPE threads to finish.
 *
 * Every failing libspe2 / pthread / allocation step prints a diagnostic with
 * perror() and terminates the process with exit(1).
 *
 * @param k The number of k nearest neighbours.
 * @param test_points The set of test points.
 * @param training_points The set of training points.
 * 
 * @return An array of calculated labels for the set of test points. 
 *         The element at the first position represents the calculated 
 *         label of the first test points. The array is allocated with
 *         malloc() and must be released by the caller with free().
 */
unsigned char *classify(int k, Points<unsigned char, unsigned char> &test_points,
		Points<unsigned char, unsigned char> &training_points) {	
	time_t start_time, end_time;

	time(&start_time);

	// describe the problem to the SPEs through the shared control block
	cb.k = k;
	cb.values_size = training_points.getVSize();
	cb.label_size = training_points.getLSize();

	cb.training_dimension = training_points.getDimension();
	cb.training_count = training_points.getCount();
	cb.training_data_size = training_points.getCount()
			* training_points.getVSize();
	cb.training_points_per_transfer = TRAINING_VALUES_MAX_SIZE
			/ training_points.getVSize();

	cb.test_dimension = test_points.getDimension();
	cb.test_count = test_points.getCount();
	cb.test_data_size = test_points.getCount() * test_points.getVSize();
	cb.test_points_per_transfer = TEST_VALUES_MAX_SIZE / test_points.getVSize();

	// effective addresses of the input buffers
	cb.ea_training_points = (uint64_t) training_points.getValues(0);
	cb.ea_training_labels = (uint64_t) training_points.getLabel(0);
	cb.ea_test_points = (uint64_t) test_points.getValues(0);
	cb.ea_test_labels = (uint64_t) test_points.getLabel(0);
	
	// scratch Points object whose label storage receives the calculated labels
	Points<unsigned char, unsigned char> test_points_results(test_points.getCount(), test_points.getDimension());
	cb.ea_test_labels_calculated = (uint64_t) ((char *) test_points_results.getLabel(0));

	cb.num_spes = spe_cpu_info_get(SPE_COUNT_USABLE_SPES, -1);
	if (cb.num_spes > MAX_NUM_SPES) {
		cb.num_spes = MAX_NUM_SPES;
	}

	#ifdef PRINT
	printf("PPE:\t Num spes = %d\n", cb.num_spes);
	#endif

	uint32_t num;

	printf("PPE:\t Start calculating\n");
	fflush(stdout);

	// create SPE context and load SPE program into the SPE context
	for (num=0; num<cb.num_spes; num++) {
		if ((data[num].spe_ctx = spe_context_create(SPE_MAP_PS
				|SPE_CFG_SIGNOTIFY1_OR|SPE_CFG_SIGNOTIFY2_OR, NULL))==NULL) {
			perror("Failed creating context");
			exit(1);
		}
		if (spe_program_load(data[num].spe_ctx, &cellknn_spu)) {
			perror("Failed loading program");
			exit(1);
		}
	}

	// create SPE pthreads
	for (num=0; num<cb.num_spes; num++) {
		if (pthread_create(&data[num].pthread, NULL, &spu_pthread, &data[num])) {
			perror("Failed creating thread");
			exit(1);
		}
	}

	// map SPE's MFC problem state to main storage (get effective address)
	for (num=0; num<cb.num_spes; num++) {
		if ((cb.spu_mfc_ctl[num] = (uint64_t)spe_ps_area_get(data[num].spe_ctx,
				SPE_CONTROL_AREA))==0) {
			perror("Failed mapping MFC control area");
			exit(1);
		}
		if ((cb.spu_ls[num] = (uint64_t)spe_ls_area_get(data[num].spe_ctx))==0) {
			perror("Failed mapping SPU local store");
			exit(1);
		}
		if ((cb.spu_sig1[num] = (uint64_t)spe_ps_area_get(data[num].spe_ctx,
				SPE_SIG_NOTIFY_1_AREA))==0) {
			perror("Failed mapping Signal1 area");
			exit(1);
		}
		if ((cb.spu_sig2[num] = (uint64_t)spe_ps_area_get(data[num].spe_ctx,
				SPE_SIG_NOTIFY_2_AREA))==0) {
			perror("Failed mapping Signal2 area");
			exit(1);
		}
	}

	// send each SPE its number using BLOCKING mailbox write
	for (num=0; num<cb.num_spes; num++) {
		// write 1 entry to in_mailbox - we don't know if we have available space so use blocking
		// cb parameter have to be loaded after receiving local id!!!
		// A failed write would leave the SPE without its id and the join below
		// waiting on it, so treat it like every other setup failure.
		if (spe_in_mbox_write(data[num].spe_ctx, &num, 1,
				SPE_MBOX_ALL_BLOCKING) < 0) {
			perror("Failed writing SPE number to in_mailbox");
			exit(1);
		}
	}

	// wait for all SPEs to complete
	for (num=0; num<cb.num_spes; num++) {
		// wait for all the SPE pthread to complete
		if (pthread_join(data[num].pthread, NULL)) {
			perror("Failed joining thread");
			exit(1);
		}

		// destroy the SPE contexts
		if (spe_context_destroy(data[num].spe_ctx)) {
			perror("Failed spe_context_destroy");
			exit(1);
		}
	}

	time(&end_time);

	double difference = difftime(end_time, start_time);
	printf("It took %.2lf seconds to calculate %d test points and %d training points\n",
			difference, cb.test_count, cb.training_count);
	
	// We have to create a new array, since the Points object is destroyed after this block.
	// This array has to be freed somewhere outside this function.
	unsigned char *result = (unsigned char *) malloc(test_points.getCount() * sizeof(unsigned char));
	// malloc(0) may legitimately return NULL, so only a non-empty request counts as a failure
	if (result == NULL && test_points.getCount() > 0) {
		perror("Failed allocating result array");
		exit(1);
	}
	for (int i = 0; i < test_points.getCount(); i++) {
		result[i] = test_points_results.getLabel(i)[0];
	}

	return result;
}
Beispiel #15
0
/**
 * PPU program entry point.
 */
int main(int argc, char** argv)
{
    /* Get global memory pointer */
    fixedgrid_t* const G = &G_GLOBAL;
    
    /* Iterators */
    uint32_t i, k, iter;
    
    /* Start wall clock timer */
    timer_start(&G->ppe_metrics.wallclock);
    
    /* Calculate available SPEs */
    G->nprocs = spe_cpu_info_get(SPE_COUNT_USABLE_SPES, -1);
    G->nprocs = G->nprocs > SPE_MAX_THREADS ? SPE_MAX_THREADS : G->nprocs;
    
    /* Parse command line arguments */
    if(argc > 1)
    {
        i = atoi(argv[1]);
        if(i < 1)
        {
            fprintf(stderr, "Invalid number of SPUs: %d < 1.\n", i);
            exit(1);
        }
        
        if(i > G->nprocs)
        {
            printf("%d SPUs unavailable.  Using %d instead.\n", i, G->nprocs);
        }
        else 
        {
            G->nprocs = i;
        }
    }
    
    /* Check dimensions */
    if(NROWS < 5)
    {
        fprintf(stderr, "%d rows < 5 rows is too small for discretization.\n", NROWS);
    }
    if(NCOLS < 5)
    {
        fprintf(stderr, "%d columns < 5 columns is too small for discretization.\n", NCOLS);
    }
    
    /* Don't use more SPEs than there are rows or columns */
    if(NROWS < G->nprocs)
    {
        printf("%d SPUs available, but only %d rows, so using %d SPUs\n", G->nprocs, NROWS, NROWS);
        G->nprocs = NROWS;
    }
    if(NCOLS / VECTOR_LENGTH < G->nprocs)
    {
        printf("%d SPUs available, but only %d column vectors of size %d, so using %d SPUs\n", G->nprocs, (NCOLS/VECTOR_LENGTH), VECTOR_LENGTH, (NCOLS/VECTOR_LENGTH));
        G->nprocs = (NCOLS/VECTOR_LENGTH);
    }
        
    /* Initialize the model parameters */
    init_model(G);
    
    /* Create SPE threads */
    create_spe_pthreads(G);
    
    /* Wait for SPEs to finish initialization */
    wait_all_spes(G);
    
    printf("\nRunning %d threads (%d SPU + 1 PPU).\n", (G->nprocs+1), G->nprocs);    

    /* Add emissions */
    process_emissions(G);
    
    /* Print startup banner */
    print_start_banner(G);
    
    /* Store initial concentration */
    printf("Writing initial concentration data... ");
    write_conc(G, 0, 0);
    printf("done.\n");
    
    /* BEGIN CALCULATIONS */
    for(iter=1, G->time = G->tstart; G->time <= G->tend; G->time += G->dt, ++iter)
    {
        start_saprc99(G);
        
        for(k=0; k<NLOOKAT; k++)
        {
            start_discretize_row(G, LOOKAT[k], G->dt/2.0);
          
            start_discretize_col(G, LOOKAT[k], G->dt);
            
            start_discretize_row(G, LOOKAT[k], G->dt/2.0);
        }

        update_model(G);
        
        #if WRITE_EACH_ITER == 1
        write_conc(G, iter, 0);
        #endif

        printf("  After iteration %02d: Model time = %07.2f sec.\n", iter, iter*G->dt);
    }
    /* END CALCULATIONS */
    
    /* Wait for SPU-thread to complete execution. */
    join_all_spes(G);
    
    /* Store concentration */
    #if WRITE_EACH_ITER != 1
    write_conc(G, iter-1, 0);
    #endif
    
    /* Show final time */
    printf("\nFinal time: %f seconds.\n", (iter-1)*G->dt);
    
    timer_stop(&G->ppe_metrics.wallclock);
    
    /* Write metrics to CSV file */
    write_metrics_as_csv(G, "Cell B.E.");
    
    /* Cleanup and exit */
    free_global_memory(G);
    return 0;
}