Example 1
void barrier(int ID)
{
  //Release token that circulates the ring; its value carries no information.
  uint_reg_t bcast_sig = 0;

  //The thread running on core 0 injects the token into the ring.
  if(core_map[ID]==0)
  {
    DynamicHeader header = tmc_udn_header_from_cpu(barrier_map[0]);
    tmc_udn_send_1(header, UDN1_DEMUX_TAG, bcast_sig);
  }

  //Every thread blocks here until the token reaches it.
  bcast_sig = tmc_udn1_receive();

  //All other threads forward the token to their successor in the ring.
  if(core_map[ID]!=0)
  {
    DynamicHeader header = tmc_udn_header_from_cpu(barrier_map[ID]);
    tmc_udn_send_1(header, UDN1_DEMUX_TAG, bcast_sig);
  }
}
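
The barrier relies on globals defined elsewhere. A minimal sketch of one
plausible setup, assuming core_map[i] holds the CPU of thread i and
barrier_map[i] holds the CPU of thread i's successor in the ring (the names
come from the examples; the initialization below is hypothetical):

#include <tmc/udn.h>

#define MAX_THREADS 64          //hypothetical upper bound

int core_map[MAX_THREADS];      //thread id -> CPU number
int barrier_map[MAX_THREADS];   //thread id -> CPU of the next thread

//Build the ring: thread i signals thread (i+1) % nthreads, so the token
//injected by the thread on core 0 makes one full pass before returning.
void barrier_maps_init(int nthreads)
{
  int i;
  for(i=0; i<nthreads; i++)
    barrier_map[i] = core_map[(i+1)%nthreads];
}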
Example 2
void
ssmp_mem_init_platf(int id, int num_ues) 
{  
  ssmp_id_ = id;
  ssmp_num_ues_ = num_ues;

  // Now that we're bound to a core, attach to our UDN rectangle.
  if (tmc_udn_activate() < 0)
    tmc_task_die("Failure in 'tmc_udn_activate()'.");

  udn_header = (DynamicHeader*)memalign(SSMP_CACHE_LINE_SIZE, num_ues * sizeof(DynamicHeader));
  if (udn_header == NULL)
    {
      tmc_task_die("Failure in allocating dynamic headers");
    }

  int r;
  for (r = 0; r < num_ues; r++)
    {
      int _cpu = tmc_cpus_find_nth_cpu(&cpus, id_to_core[r]);
      DynamicHeader header = tmc_udn_header_from_cpu(_cpu);
      udn_header[r] = header;
    }
}
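
This initializer also assumes surrounding state. A minimal sketch of the
declarations it uses (the names are taken from the ssmp code; the
SSMP_CACHE_LINE_SIZE value shown here is an assumption):

#include <malloc.h>
#include <tmc/cpus.h>
#include <tmc/task.h>
#include <tmc/udn.h>

#define SSMP_CACHE_LINE_SIZE 64 //assumed; use the platform's cache-line size

extern cpu_set_t cpus;          //tile set the process was hardwalled to
extern int id_to_core[];        //logical UE id -> nth CPU in 'cpus'

DynamicHeader* udn_header;      //per-UE send headers, filled by the loop above
int ssmp_id_;
int ssmp_num_ues_;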
Example 3
//Assumed includes for this example; the BME driver/example headers that
//define struct bme_user_mem_desc_io, struct bme_phys_mem_desc_io, BME_IOC_*,
//EX_MSG_*, PROCESSING_BUFSIZE, and udn0_receive() belong to the BME example
//tree and are not named here.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>

#include <tmc/alloc.h>
#include <tmc/cpus.h>
#include <tmc/udn.h>

int
main(int argc, char** argv)
{
  // Process arguments.

  int i = 1;

  while (i < argc)
  {
    // Allow "-i FILE" to override STDIN.
    if (i + 2 <= argc && !strcmp(argv[i], "-i"))
    {
      const char* file = argv[i+1];
      if (dup2(open(file, O_RDONLY), STDIN_FILENO) < 0)
      {
        fprintf(stderr, "Could not open '%s'.\n", file);
        exit(1);
      }
      i += 2;
    }

    // Allow "-o FILE" to override STDOUT.
    else if (i + 2 <= argc && !strcmp(argv[i], "-o"))
    {
      const char* file = argv[i+1];
      int fd = open(file, O_WRONLY | O_CREAT | O_TRUNC, 0666);
      if (dup2(fd, STDOUT_FILENO) < 0)
      {
        fprintf(stderr, "Could not open '%s'.\n", file);
        exit(1);
      }
      i += 2;
    }

    else
    {
      break;
    }
  }

  // Get the UDN coordinates of the BME server tile from our arguments.
  int server_x, server_y;
  if (i + 1 != argc || sscanf(argv[i], "%d,%d", &server_x, &server_y) != 2)
  {
    fprintf(stderr,
            "usage: linux_client [-i IN] [-o OUT] <server_x>,<server_y>\n");
    exit(1);
  }

  // Create a UDN header for the server.
  DynamicHeader bme_server =
    { .bits.dest_x = server_x, .bits.dest_y = server_y };


  // Bind ourselves to our current CPU, and set up a UDN hardwall
  // which encompasses the entire chip, so that we can communicate
  // with the BME server.

  cpu_set_t cpus;

  tmc_cpus_clear(&cpus);
  tmc_cpus_grid_add_all(&cpus);

  tmc_cpus_set_my_cpu(tmc_cpus_get_my_current_cpu());

  if (tmc_udn_init(&cpus) != 0)
  {
    perror("UDN hardwall create failed");
    exit(1);
  }

  if (tmc_udn_activate() != 0)
  {
    perror("UDN hardwall activate failed");
    exit(1);
  }


  // Get one huge page of memory.
  tmc_alloc_t alloc = TMC_ALLOC_INIT;
  tmc_alloc_set_huge(&alloc);
  tmc_alloc_set_home(&alloc, 0);
  tmc_alloc_set_shared(&alloc);
  int mlength = 1 << 24;
  void* maddr = tmc_alloc_map(&alloc, mlength);
  if (maddr == NULL)
  {
    perror("can't mmap");
    exit(1);
  }


  // Lock down that memory and get its physical address and caching
  // information, using the bme_mem device driver.

  struct bme_user_mem_desc_io user_mem_desc;
  struct bme_phys_mem_desc_io phys_mem_desc;
  int fd = open("/dev/bme/mem", O_RDWR);

  if (fd < 0)
  {
    perror("couldn't open /dev/bme/mem");
    exit(1);
  }


  // First we find out how many pages are in the region to be locked down.
  // (Given our allocation above, we know we must have exactly one large page,
  // but this is an example of what you would do for large regions.)

  user_mem_desc.user.va = (uintptr_t)maddr;
  user_mem_desc.user.len = mlength;

  if (ioctl(fd, BME_IOC_GET_NUM_PAGES, &user_mem_desc) != 0)
  {
    perror("BME_IOC_GET_NUM_PAGES ioctl failed");
    exit(1);
  }


  // Now that we know how many pages are there, we can request that they be
  // locked into physical memory, and retrieve their physical address and
  // cache mapping information.

  phys_mem_desc.user.va = (uintptr_t)maddr;
  phys_mem_desc.user.len = mlength;

  phys_mem_desc.phys =
    (uintptr_t)malloc(sizeof(struct bme_phys_mem_desc) *
                      user_mem_desc.num_pages);

  phys_mem_desc.num_pages = user_mem_desc.num_pages;

  if (ioctl(fd, BME_IOC_LOCK_MEMORY, &phys_mem_desc) != 0)
  {
    perror("BME_IOC_LOCK_MEMORY ioctl failed");
    exit(1);
  }


  // Send the BME application a message telling it about the memory we
  // just locked down.  Since this is an example, we're only sending one
  // message, for one page.

  DynamicHeader my_hdr = tmc_udn_header_from_cpu(tmc_cpus_get_my_cpu());

  struct bme_phys_mem_desc *phys =
    (struct bme_phys_mem_desc *)(uintptr_t)phys_mem_desc.phys;
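
  // Message layout: the opcode, our return header (so the server can reply),
  // then the 64-bit physical address and PTE, each split into low and high
  // words.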

  tmc_udn_send_6(bme_server, UDN0_DEMUX_TAG,
                 EX_MSG_MAPPING,
                 my_hdr.word,
                 phys->pa,
                 phys->pa >> 32,
                 phys->pte,
                 phys->pte >> 32);

  uint32_t reply = udn0_receive();
  if (reply)
  {
    fprintf(stderr, "client: got bad response %d to MAPPING message\n",
            reply);
    exit(1);
  }


  // Now read our standard input into a buffer in the shared page; send
  // a request to the BME tile to process that data, putting the output
  // elsewhere in the shared page; and then write it to standard output.

  char* inbuf = maddr;
  char* outbuf = inbuf + PROCESSING_BUFSIZE;
  
  int len;
  while ((len = read(STDIN_FILENO, inbuf, PROCESSING_BUFSIZE)) > 0)
  {
    // Note that our message gives the server the offsets of the input and
    // output buffers, rather than pointers to them.  This is because the
    // server has not mapped in the data at the same set of virtual addresses
    // we're using.  We could arrange this, if desired, although it would
    // require more coordination between the client and server.

    tmc_udn_send_5(bme_server, UDN0_DEMUX_TAG,
                   EX_MSG_PROCESS,
                   my_hdr.word,
                   0,
                   len,
                   PROCESSING_BUFSIZE);

    reply = udn0_receive();
    if (reply != len)
    {
      fprintf(stderr, "client: got bad response %d to PROCESS "
              "message (expected %d)\n", reply, len);
      exit(1);
    }

    if (write(STDOUT_FILENO, outbuf, len) != len)
    {
      perror("write");
      exit(1);
    }
  }

  return 0;
}
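
For orientation, here is a hedged sketch of the receive loop a matching BME
server might run. Everything in it is an assumption layered on the message
layout the client sends (same demux queue, same EX_MSG_* opcodes); the real
server in the BME example tree runs bare-metal and is structured differently.

for (;;)
{
  uint32_t op = udn0_receive();                       //opcode word
  DynamicHeader client = { .word = udn0_receive() };  //where to reply

  if (op == EX_MSG_MAPPING)
  {
    uint64_t pa  = udn0_receive();
    pa  |= (uint64_t)udn0_receive() << 32;
    uint64_t pte = udn0_receive();
    pte |= (uint64_t)udn0_receive() << 32;
    // ...record the locked page, then acknowledge success with 0...
    tmc_udn_send_1(client, UDN0_DEMUX_TAG, 0);
  }
  else if (op == EX_MSG_PROCESS)
  {
    uint32_t in_off  = udn0_receive();
    uint32_t len     = udn0_receive();
    uint32_t out_off = udn0_receive();
    // ...process len bytes at in_off, write the result at out_off,
    //    then reply with the number of bytes produced...
    (void)in_off; (void)out_off;
    tmc_udn_send_1(client, UDN0_DEMUX_TAG, len);
  }
}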
Example 4
//------------------------------------------------------------------------------
//--------------------------Thread function-------------------------------------
//------------------------------------------------------------------------------
void *thread_fn(void *arg)
{
  int ID=(*((int*)arg));
  
  //Necessary local variables
  int count, count1, start_addr, iterator;
  DATA_TYPE temp_sum;
  uint_reg_t gather_sig = 0;    //dummy payload for the gather handshake
  DATA_TYPE weight[3][3] = {{ -1,  0,  1 },{ -2,  0,  2 },{ -1,  0,  1 }}; //Sobel Gx kernel
  
  
  //Set cpu and activate udn
  if(tmc_cpus_set_my_cpu(core_map[ID])!=0)
  {
    printf("Thread: %d CPU setting failed.\n",ID);
    exit(1);
  }
  if(tmc_udn_activate() < 0)
  {
    printf("Thread: %d UDN activation failed.\n",ID);
    exit(1);
  }
  
  
  
  //Thread memory initialization
  DATA_TYPE *image, *gx, *gy;
  int n_out_rows=nrows-2;
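  //factor = DATA_TYPE elements per uint_reg_t: the UDN helpers below stream
  //whole registers, so element counts are divided by 'factor'.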
  int factor=sizeof(uint_reg_t)/sizeof(DATA_TYPE);
  if(ID==0)
  {
    image = (DATA_TYPE*)malloc(sizeof(DATA_TYPE)*nrows*ncols);
    gx    = (DATA_TYPE*)malloc(sizeof(DATA_TYPE)*nrows*ncols);
    gy    = (DATA_TYPE*)malloc(sizeof(DATA_TYPE)*nrows*ncols);
    
    for(count=0; count<nrows; count++)
    {
      for(count1=0; count1<ncols; count1++)
      {
        image[count*ncols+count1]=count;
      }
    }
  }
  else
  {
    if(ID<n_out_rows%nthreads)
    {
      image = (DATA_TYPE*)malloc(sizeof(DATA_TYPE)*(n_out_rows/nthreads+1+2)*ncols);
      gx    = (DATA_TYPE*)malloc(sizeof(DATA_TYPE)*(n_out_rows/nthreads+1  )*ncols);
      gy    = (DATA_TYPE*)malloc(sizeof(DATA_TYPE)*(n_out_rows/nthreads+1  )*ncols);
    }
    else
    {
      image = (DATA_TYPE*)malloc(sizeof(DATA_TYPE)*(n_out_rows/nthreads+2)*ncols);
      gx    = (DATA_TYPE*)malloc(sizeof(DATA_TYPE)*(n_out_rows/nthreads  )*ncols);
      gy    = (DATA_TYPE*)malloc(sizeof(DATA_TYPE)*(n_out_rows/nthreads  )*ncols);
    }
  }
  
  
  
  //calculate x and y co-ordinates
  int x=ID%(xmax+1);
  int y=ID/(xmax+1);



  //Time management variables
  double zero; //Reference time for iterations
  double scatter_s, scatter_e, scatter_d;
  double compute_s, compute_e, compute_d;
  double gather_s, gather_e, gather_d;
  double total_s, total_e, total_d=0.0; //total_d accumulates over all iterations
  struct timespec st;


  for(iterator=0; iterator<iterations; iterator++)
  {
    
    //--------------------------------------------------------------------------
    //----------------------------Start of benchmark----------------------------
    //------------------------Repeated 'iterations' times-----------------------
    //--------------------------------------------------------------------------
  

  
    
    //-------------------------Set reference time ------------------------------
    clock_gettime(CLOCK_THREAD_CPUTIME_ID,&st);
    zero=(double)st.tv_sec*1e6 + (double)st.tv_nsec*1e-3;        
    
    
    
    clock_gettime(CLOCK_THREAD_CPUTIME_ID,&st);
    total_s=(double)st.tv_sec*1e6 + (double)st.tv_nsec*1e-3;  
   
    
    
    //------------------------Step 1: Naive scatter-----------------------------
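    //Thread 0 sends each worker its block of rows plus two halo rows; the
    //send starts one row early so the upper halo comes along with the block.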
    //Set start time
    clock_gettime(CLOCK_THREAD_CPUTIME_ID,&st);
    scatter_s=((double)st.tv_sec*1e6 + (double)st.tv_nsec*1e-3)-zero;  
    
    if(ID==0)
    {
      start_addr=((n_out_rows%nthreads==0)?(n_out_rows/nthreads+1):(n_out_rows/nthreads+1+1));
    
      for(count=1; count<nthreads; count++)
      {
        if(count<n_out_rows%nthreads)
        {
          send126((uint_reg_t*)(&image[(start_addr-1)*ncols]),(n_out_rows/nthreads+1+1+1)*ncols/factor,core_map[count],UDN0_DEMUX_TAG);          
          start_addr+=n_out_rows/nthreads+1;
        }
        else
        {
          send126((uint_reg_t*)(&image[(start_addr-1)*ncols]),(n_out_rows/nthreads+1+1)*ncols/factor,core_map[count],UDN0_DEMUX_TAG);
          start_addr+=n_out_rows/nthreads;
        }
      }
    }
    else
    {
      if(ID<n_out_rows%nthreads)
      {
        receive126((uint_reg_t*)image,(n_out_rows/nthreads+1+2)*ncols/factor,core_map[0],UDN0_DEMUX_TAG);
      }
      else
      {
        receive126((uint_reg_t*)image,(n_out_rows/nthreads+2)*ncols/factor,core_map[0],UDN0_DEMUX_TAG);
      }
    }
    
    //Set end time
    clock_gettime(CLOCK_THREAD_CPUTIME_ID,&st);
    scatter_e=((double)st.tv_sec*1e6 + (double)st.tv_nsec*1e-3)-zero;    
    scatter_d=scatter_e-scatter_s;
    
    
    
    
    
    
    
    
    //-------------------------------Sobel compute------------------------------
    //Set start time
    clock_gettime(CLOCK_THREAD_CPUTIME_ID,&st);
    compute_s=((double)st.tv_sec*1e6 + (double)st.tv_nsec*1e-3)-zero;  
    
    //Compute Gx
    int i,j;
    if(ID<n_out_rows%nthreads)
    {
      temp_sum=(DATA_TYPE)0;
      for(count=1; count<(n_out_rows/nthreads+1+2-1); count++)
      {
        for(count1=1; count1<(ncols-1); count1++)
        {
          for(j=-1;j<=1;j++)
          {
            for(i=-1; i<=1; i++)
            {
              temp_sum+=weight[j + 1][i + 1] * image[(count+j)*ncols + count1 + i];
            }
          }
          
          gx[(count-1)*ncols+count1]=temp_sum;
        }
      }
    }
    else
    {
      temp_sum=(DATA_TYPE)0;
      for(count=1; count<(n_out_rows/nthreads+2-1); count++)
      {
        for(count1=1; count1<(ncols-1); count1++)
        {
          for(j=-1;j<=1;j++)
          {
            for(i=-1; i<=1; i++)
            {
              temp_sum+=weight[j + 1][i + 1] * image[(count+j)*ncols + count1 + i];
            }
          }
          
          gx[(count-1)*ncols+count2]=temp_sum;
        }
      }
    }
    
    
    //Compute Gy
    if(ID<n_out_rows%nthreads)
    {
      temp_sum=(DATA_TYPE)0;
      for(count1=1; count1<(ncols-1); count1++)
      {
        for(count=1; count<(n_out_rows/nthreads+1+2-1); count++)
        {
          for(j=-1;j<=1;j++)
          {
            for(i=-1; i<=1; i++)
            {
              temp_sum+=weight[i + 1][j + 1] * image[(count+i)*ncols + count1 + j];
            }
          }
          
          gy[(count-1)*ncols+count1]=temp_sum;
        }
      }
    }
    else
    {
      temp_sum=(DATA_TYPE)0;
      for(count1=1; count1<(ncols-1); count1++)
      {
        for(count=1; count<(n_out_rows/nthreads+2-1); count++)
        {
          for(j=-1;j<=1;j++)
          {
            for(i=-1; i<=1; i++)
            {
              temp_sum+=weight[i + 1][j + 1] * image[(count+i)*ncols + count1 + j];
            }
          }
          
          gy[(count-1)*ncols+count2]=temp_sum;
        }
      }
    }
    
    //Set end time
    clock_gettime(CLOCK_THREAD_CPUTIME_ID,&st);
    compute_e=((double)st.tv_sec*1e6 + (double)st.tv_nsec*1e-3)-zero;    
    compute_d=compute_e-compute_s;
    
    
    
    
    
    
    
    
    //---------------------------Gather-----------------------------------------
    
    //Set start time
    clock_gettime(CLOCK_THREAD_CPUTIME_ID,&st);
    gather_s=((double)st.tv_sec*1e6 + (double)st.tv_nsec*1e-3)-zero;  
    
    
    
    if(ID==0)
    {
      start_addr=1;
     
      for(count=1; count<nthreads; count++)
      {
        if(count<n_out_rows%nthreads)
        {
          //Send a signal to a certain waiting thread
          DynamicHeader header= tmc_udn_header_from_cpu(core_map[count]);
          tmc_udn_send_1(header,UDN2_DEMUX_TAG,gather_sig);
          
          //Collect Gx from that thread
          receive126((uint_reg_t*)(&gx[start_addr*ncols]),(n_out_rows/nthreads+1)*ncols/factor,core_map[count],UDN0_DEMUX_TAG);  
          
          //Collect Gy from that thread
          receive126((uint_reg_t*)(&gy[start_addr*ncols]),(n_out_rows/nthreads+1)*ncols/factor,core_map[count],UDN0_DEMUX_TAG);  
                  
          start_addr+=(n_out_rows/nthreads+1);
        }
        else
        {
          //Send a signal to a certain waiting thread
          DynamicHeader header= tmc_udn_header_from_cpu(core_map[count]);
          tmc_udn_send_1(header,UDN2_DEMUX_TAG,gather_sig);
          
          //Collect Gx from that thread
          receive126((uint_reg_t*)(&gx[start_addr*ncols]),(n_out_rows/nthreads)*ncols/factor,core_map[count],UDN0_DEMUX_TAG);  
          
          //Collect Gy from that thread
          receive126((uint_reg_t*)(&gy[start_addr*ncols]),(n_out_rows/nthreads)*ncols/factor,core_map[count],UDN0_DEMUX_TAG);  
                  
          start_addr+=(n_out_rows/nthreads);
        }
      }
    }
    else
    {
      if(ID<n_out_rows%nthreads)
      {
        //Wait for the go signal from the main thread
        gather_sig=tmc_udn2_receive();
        
        //Send the partial gx 
        send126((uint_reg_t*)gx,(n_out_rows/nthreads+1)*ncols/factor,core_map[0],UDN0_DEMUX_TAG);  
        
        
        //Send the partial gy 
        send126((uint_reg_t*)gy,(n_out_rows/nthreads+1)*ncols/factor,core_map[0],UDN0_DEMUX_TAG);  
        
      }
      else
      {
        //Wait for the go signal from the main thread
        gather_sig=tmc_udn2_receive();
        
        //Send the partial gx 
        send126((uint_reg_t*)gx,(n_out_rows/nthreads)*ncols/factor,core_map[0],UDN0_DEMUX_TAG);  
        
        
        //Send the partial gy 
        send126((uint_reg_t*)gy,(n_out_rows/nthreads)*ncols/factor,core_map[0],UDN0_DEMUX_TAG);  
      }
    }
    
    //Set end time
    clock_gettime(CLOCK_THREAD_CPUTIME_ID,&st);
    gather_e=((double)st.tv_sec*1e6 + (double)st.tv_nsec*1e-3)-zero;    
    gather_d=gather_e-gather_s;
    
    
    
    clock_gettime(CLOCK_THREAD_CPUTIME_ID,&st);
    total_e=(double)st.tv_sec*1e6 + (double)st.tv_nsec*1e-3;  
    total_d+=(total_e-total_s);
    
    
    
    
    
    //-----------------Log the data (9 fields per thread per iteration)---------
    log_memory[iterator*nthreads*9+ID*9+SCA_S]=scatter_s;
    log_memory[iterator*nthreads*9+ID*9+SCA_E]=scatter_e;
    log_memory[iterator*nthreads*9+ID*9+SCA_D]=scatter_d;
    
    log_memory[iterator*nthreads*9+ID*9+COMP_S]=compute_s;
    log_memory[iterator*nthreads*9+ID*9+COMP_E]=compute_e;
    log_memory[iterator*nthreads*9+ID*9+COMP_D]=compute_d;
    
    log_memory[iterator*nthreads*9+ID*9+GATH_S]=gather_s;
    log_memory[iterator*nthreads*9+ID*9+GATH_E]=gather_e;
    log_memory[iterator*nthreads*9+ID*9+GATH_D]=gather_d;
    
     
     
    //---------------------Barrier----------------------------------------------
    barrier(ID);
    
    
    if(ID==0)
      printf("Iteration: %d\n",iterator);
    
    
     
    //--------------------------------------------------------------------------
    //------------------------------End of benchmark----------------------------
    //--------------------------------------------------------------------------
  
  }
  
  //Print the average total time per iteration (microseconds)
  printf("%lf\n",total_d/iterations);
  
  //Free thread local memory
  free(image);
  free(gx);
  free(gy);
  
  pthread_exit(NULL);
}
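
The scatter and gather above go through send126/receive126, whose definitions
are not shown. A minimal sketch of what they plausibly do, assuming the name
refers to the UDN demux queue depth: payloads are streamed through queue 0 in
chunks of at most 126 words, with a one-word credit returned on queue 1
between chunks so the receiver's queue cannot overflow. The real helpers may
pipeline differently.

#define UDN_CHUNK 126   //assumed per-chunk word budget, per the helpers' name

void send126(uint_reg_t* buf, int nwords, int dest_cpu, int tag)
{
  DynamicHeader hdr = tmc_udn_header_from_cpu(dest_cpu);
  int sent = 0;
  while(sent < nwords)
  {
    int chunk = (nwords-sent > UDN_CHUNK) ? UDN_CHUNK : (nwords-sent);
    int w;
    for(w=0; w<chunk; w++)
      tmc_udn_send_1(hdr, tag, buf[sent+w]);
    sent += chunk;
    (void)tmc_udn1_receive();   //wait for the receiver's per-chunk credit
  }
}

void receive126(uint_reg_t* buf, int nwords, int src_cpu, int tag)
{
  (void)tag;                    //receives are per queue; queue 0 assumed here
  DynamicHeader ack = tmc_udn_header_from_cpu(src_cpu);
  int got = 0;
  while(got < nwords)
  {
    int chunk = (nwords-got > UDN_CHUNK) ? UDN_CHUNK : (nwords-got);
    int w;
    for(w=0; w<chunk; w++)
      buf[got+w] = tmc_udn0_receive();
    got += chunk;
    tmc_udn_send_1(ack, UDN1_DEMUX_TAG, 0);  //grant credit for the next chunk
  }
}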