void *
task_pingpong_black(void *arg)
{
  arg_t *	Arg = (arg_t *)arg;
  register int	i;
  int		result = 0;

  ERRHAND_TILERA(tmc_cpus_set_my_cpu(Arg->cpu));
  /*ERRHAND_TILERA*/(tmc_udn_activate());
  pthread_barrier_wait(&computation_start);

#if TEST_VERBOSE >= 1  
  printf("[INFO] black: %d\n", tmc_cpus_get_my_cpu());
#endif

  for (i=0; i<Arg->num_scambi; i++) {
    int * received;

    received = ch_receive(CH0_IMPL)(Arg->ch[0]);
    ch_send(CH1_IMPL)(Arg->ch[1], received);

    if (NULL == received) {
      result ++;
      fprintf(stderr, "[ERROR] black: null received\n");
    }
  }

  ERRHAND_TILERA(tmc_udn_close());
  return (void *)(intptr_t)result;
}
void *
task_pingpong_white(void *arg)
{
  arg_t *	Arg = (arg_t *)arg;
  register int	i;
  int		integer;
  int		result = 0;

  ERRHAND_TILERA(tmc_cpus_set_my_cpu(Arg->cpu));
  /*ERRHAND_TILERA*/(tmc_udn_activate());
  pthread_barrier_wait(&computation_start);

#if TEST_VERBOSE >= 1
  printf("[INFO] white: cpu %d\n", tmc_cpus_get_my_cpu());
#endif
  
  for (i=0; i<Arg->num_scambi; i++) {
    int *	received = NULL;
    uint_reg_t	a, b;
    integer = i;

    a = GET_CLOCK_CYCLE;
    atomic_compiler_barrier();
    ch_send(CH0_IMPL)(Arg->ch[0], &integer);
    received = ch_receive(CH1_IMPL)(Arg->ch[1]);
    atomic_compiler_barrier();
    b = GET_CLOCK_CYCLE;

    if (b>a) { prepareStatistics(Tscambio, b-a); }
    else {
      /* non-monotonic cycle-counter reading; discard this sample */
    }

#if TEST_DEBUG >= 1
    if (b-a > 700) {
      fprintf(stderr, "i %-20d Tscambio %"PRIu64"\n", i, Tscambio[0]);      
    }
#endif

    if (NULL == received) {
      result ++;
      fprintf(stderr, "[ERROR] white: null received\n");
    } else {
      if (i != *received) result ++;
    }
  }

  ERRHAND_TILERA(tmc_udn_close());
  return (void *)(intptr_t)result;
}
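The ch_send()/ch_receive() pairs above come from this benchmark's own channel layer, which is not part of the listing. Purely to illustrate the UDN mechanism such a layer presumably wraps (a minimal sketch under assumed names, not the benchmark's actual implementation), a pointer-sized payload can be pushed through demux queue 0 with the plain TMC primitives:

#include <stdint.h>
#include <tmc/udn.h>

/* Hypothetical peer header; a real channel layer would fill this in when the
 * channel is created, e.g. via tmc_udn_header_from_cpu(). */
static DynamicHeader peer_header;

/* Send a pointer as a single UDN word on demux queue 0. */
static inline void udn_ch_send(int *msg)
{
  tmc_udn_send_1(peer_header, UDN0_DEMUX_TAG, (uint_reg_t)(uintptr_t)msg);
}

/* Block until one word arrives on demux queue 0 and return it as a pointer. */
static inline int *udn_ch_receive(void)
{
  return (int *)(uintptr_t)tmc_udn0_receive();
}

Passing raw pointers this way only works because both threads share one address space; across processes the payload has to be a plain value or an offset, as in the later examples.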
Example #3
void
ssmp_mem_init_platf(int id, int num_ues) 
{  
  ssmp_id_ = id;
  ssmp_num_ues_ = num_ues;

  // Now that we're bound to a core, attach to our UDN rectangle.
  if (tmc_udn_activate() < 0)
    tmc_task_die("Failure in 'tmc_udn_activate()'.");

  udn_header = (DynamicHeader* ) memalign(SSMP_CACHE_LINE_SIZE, num_ues * sizeof (DynamicHeader));
  if (udn_header == NULL)
    {
      tmc_task_die("Failure in allocating dynamic headers");
    }

  int r;
  for (r = 0; r < num_ues; r++)
    {
      int _cpu = tmc_cpus_find_nth_cpu(&cpus, id_to_core[r]);
      DynamicHeader header = tmc_udn_header_from_cpu(_cpu);
      udn_header[r] = header;
    }
}
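With the per-unit headers cached in udn_header[], a point-to-point word transfer reduces to one send and one blocking receive. A minimal sketch, assuming the globals initialized above and the TMC demux-queue-0 calls; ssmp_send_word/ssmp_recv_word are illustrative names, not the real ssmp interface:

#include <tmc/udn.h>

/* Send one register-sized word to unit 'to' over UDN demux queue 0. */
static inline void ssmp_send_word(int to, uint_reg_t word)
{
  tmc_udn_send_1(udn_header[to], UDN0_DEMUX_TAG, word);
}

/* Block until a word arrives on demux queue 0 and return it. */
static inline uint_reg_t ssmp_recv_word(void)
{
  return tmc_udn0_receive();
}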
Example #4
/** Main function. */
int main(int argc, char** argv)
{
  // Number of instances of this program to run
  // (including the initial parent process).
  int instances = 4;

  // Detect whether we're the parent or an exec'd child.
  int is_parent = is_parent_process();

  // Get the application's affinity set.
  // We'll use the first N available cpus from this set.
  // NOTE: this means the parent should _not_ call any functions
  // that shrink the affinity set prior to go_parallel().
  cpu_set_t cpus;
  int status = tmc_cpus_get_my_affinity(&cpus);
  check_tmc_status(status, "tmc_cpus_get_my_affinity()");

  // Define UDN cpu set as first N available cpus
  status = udn_init(instances, &cpus);
  check_tmc_status(status, "udn_init()");

  // Initialize "common" shared memory with default size.
  status = tmc_cmem_init(0);
  check_tmc_status(status, "tmc_cmem_init()");

  // Allocate barrier data structure in shared memory.
  tmc_sync_barrier_t* barrier = NULL;
  if (is_parent)
  {
    // Allocate/initialize barrier data structure in common memory.
    barrier = (tmc_sync_barrier_t*) tmc_cmem_malloc(sizeof(*barrier));
    if (barrier == NULL)
      tmc_task_die("barrier_init(): "
        "Failed to allocate barrier data structure.");
    tmc_sync_barrier_init(barrier, instances);
  }

  // Pass the barrier pointer to any exec'd children.
  share_pointer("SHARED_BARRIER_POINTER", (void**) &barrier);

  // Fork/exec any additional child processes,
  // each locked to its own tile,
  // and get index [0 -- instances-1] of current process.
  int index = go_parallel(instances, &cpus, argc, argv);
  pid_t pid = getpid();
  printf("Process(pid=%i), index=%i: started.\n", pid, index);

  // Enable UDN access for this process (parent or child).
  // Note: this needs to be done after we're locked to a tile.
  status = tmc_udn_activate();
  check_tmc_status(status, "tmc_udn_activate()");

  // Wait here until all other processes have caught up.
  tmc_sync_barrier_wait(barrier);

  // Send/receive a value over the UDN.
  int from = 0;
  int to = instances - 1;
  if (index == from)
  {
    int value = 42;
    printf("Process(pid=%i), index=%i: sending value %i to cpu %i...\n",
        pid, index, value, to);
    udn_send_to_nth_cpu(to, &cpus, value);
    printf("Process(pid=%i), index=%i: sent value %i to cpu %i.\n",
        pid, index, value, to);
  }
  else if (index == to)
  {
    int received = 0;
    printf("Process(pid=%i), index=%i: receiving value...\n",
        pid, index);
    received = udn_receive();
    printf("Process(pid=%i), index=%i: received value %i...\n",
        pid, index, received);
  }

  // Wait here until all other processes have caught up.
  tmc_sync_barrier_wait(barrier);

  printf("Process(pid=%i), index=%i: finished.\n",
      pid, index);

  // We're done.
  return 0;
}
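udn_init(), go_parallel(), share_pointer() and the two UDN helpers used above are defined elsewhere in this example's sources. One plausible shape for the send/receive pair, sketched from the standard TMC calls seen in the other examples (an assumption, not the example's actual code):

#include <tmc/cpus.h>
#include <tmc/udn.h>

// Send one word to the n-th cpu of 'cpus' over UDN demux queue 0.
static void udn_send_to_nth_cpu(int n, cpu_set_t* cpus, int value)
{
  int cpu = tmc_cpus_find_nth_cpu(cpus, n);
  DynamicHeader header = tmc_udn_header_from_cpu(cpu);
  tmc_udn_send_1(header, UDN0_DEMUX_TAG, (uint_reg_t)value);
}

// Block until one word arrives on demux queue 0.
static int udn_receive(void)
{
  return (int)tmc_udn0_receive();
}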
Example #5
int
main(int argc, char** argv)
{
  // Process arguments.

  int i = 1;

  while (i < argc)
  {
    // Allow "-i FILE" to override STDIN.
    if (i + 2 <= argc && !strcmp(argv[i], "-i"))
    {
      const char* file = argv[i+1];
      int fd = open(file, O_RDONLY);
      if (fd < 0 || dup2(fd, STDIN_FILENO) < 0)
      {
        fprintf(stderr, "Could not open '%s'.\n", file);
        exit(1);
      }
      i += 2;
    }

    // Allow "-o FILE" to override STDOUT.
    else if (i + 2 <= argc && !strcmp(argv[i], "-o"))
    {
      const char* file = argv[i+1];
      int fd = open(file, O_WRONLY | O_CREAT | O_TRUNC, 0666);
      if (fd < 0 || dup2(fd, STDOUT_FILENO) < 0)
      {
        fprintf(stderr, "Could not open '%s'.\n", file);
        exit(1);
      }
      i += 2;
    }

    else
    {
      break;
    }
  }

  // Get the UDN coordinates of the BME server tile from our arguments.
  int server_x, server_y;
  if (i + 1 != argc || sscanf(argv[i], "%d,%d", &server_x, &server_y) != 2)
  {
    fprintf(stderr,
            "usage: linux_client [-i IN] [-o OUT] <server_x>,<server_y>\n");
    exit(1);
  }

  // Create a UDN header for the server.
  DynamicHeader bme_server =
    { .bits.dest_x = server_x, .bits.dest_y = server_y };


  // Bind ourselves to our current CPU, and set up a UDN hardwall
  // which encompasses the entire chip, so that we can communicate
  // with the BME server.

  cpu_set_t cpus;

  tmc_cpus_clear(&cpus);
  tmc_cpus_grid_add_all(&cpus);

  tmc_cpus_set_my_cpu(tmc_cpus_get_my_current_cpu());

  if (tmc_udn_init(&cpus) != 0)
  {
    perror("UDN hardwall create failed");
    exit(1);
  }

  if (tmc_udn_activate() != 0)
  {
    perror("UDN hardwall activate failed");
    exit(1);
  }


  // Get one huge page of memory.
  tmc_alloc_t alloc = TMC_ALLOC_INIT;
  tmc_alloc_set_huge(&alloc);
  tmc_alloc_set_home(&alloc, 0);
  tmc_alloc_set_shared(&alloc);
  int mlength = 1 << 24;
  void* maddr = tmc_alloc_map(&alloc, mlength);
  if (maddr == NULL)
  {
    perror("can't mmap");
    exit(1);
  }


  // Lock down that memory and get its physical address and caching
  // information, using the bme_mem device driver.

  struct bme_user_mem_desc_io user_mem_desc;
  struct bme_phys_mem_desc_io phys_mem_desc;
  int fd = open("/dev/bme/mem", O_RDWR);

  if (fd < 0)
  {
    perror("couldn't open /dev/bme/mem");
    exit(1);
  }


  // First we find out how many pages are in the region to be locked down.
  // (Given our allocation above, we know we must have exactly one large page,
  // but this is an example of what you would do for large regions.)

  user_mem_desc.user.va = (uintptr_t)maddr;
  user_mem_desc.user.len = mlength;

  if (ioctl(fd, BME_IOC_GET_NUM_PAGES, &user_mem_desc) != 0)
  {
    perror("BME_IOC_GET_NUM_PAGES ioctl failed");
    exit(1);
  }


  // Now that we know how many pages are there, we can request that they be
  // locked into physical memory, and retrieve their physical address and
  // cache mapping information.

  phys_mem_desc.user.va = (uintptr_t)maddr;
  phys_mem_desc.user.len = mlength;

  phys_mem_desc.phys =
    (uintptr_t)malloc(sizeof(struct bme_phys_mem_desc) *
                      user_mem_desc.num_pages);

  phys_mem_desc.num_pages = user_mem_desc.num_pages;

  if (ioctl(fd, BME_IOC_LOCK_MEMORY, &phys_mem_desc) != 0)
  {
    perror("BME_IOC_LOCK_MEMORY ioctl failed");
    exit(1);
  }


  // Send the BME application a message telling it about the memory we
  // just locked down.  Since this is an example, we're only sending one
  // message, for one page.

  DynamicHeader my_hdr = tmc_udn_header_from_cpu(tmc_cpus_get_my_cpu());

  struct bme_phys_mem_desc *phys =
    (struct bme_phys_mem_desc *)(uintptr_t)phys_mem_desc.phys;

  tmc_udn_send_6(bme_server, UDN0_DEMUX_TAG,
                 EX_MSG_MAPPING,
                 my_hdr.word,
                 phys->pa,
                 phys->pa >> 32,
                 phys->pte,
                 phys->pte >> 32);

  uint32_t reply = udn0_receive();
  if (reply)
  {
    fprintf(stderr, "client: got bad response %d to MAPPING message\n",
            reply);
    exit(1);
  }


  // Now read our standard input into a buffer in the shared page; send
  // a request to the BME tile to process that data, putting the output
  // elsewhere in the shared page; and then write it to standard output.

  char* inbuf = maddr;
  char* outbuf = inbuf + PROCESSING_BUFSIZE;
  
  int len;
  while ((len = read(STDIN_FILENO, inbuf, PROCESSING_BUFSIZE)) > 0)
  {
    // Note that our message gives the server the offsets of the input and
    // output buffers, rather than pointers to them.  This is because the
    // server has not mapped in the data at the same set of virtual addresses
    // we're using.  We could arrange this, if desired, although it would
    // require more coordination between the client and server.

    tmc_udn_send_5(bme_server, UDN0_DEMUX_TAG,
                   EX_MSG_PROCESS,
                   my_hdr.word,
                   0,
                   len,
                   PROCESSING_BUFSIZE);

    reply = udn0_receive();
    if (reply != len)
    {
      fprintf(stderr, "client: got bad response %d to PROCESS "
              "message (expected %d)\n", reply, len);
      exit(1);
    }

    if (write(STDOUT_FILENO, outbuf, len) != len)
    {
      perror("write");
      exit(1);
    }
  }

  return 0;
}
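The bare-metal (BME) server this client talks to is a separate program that is not shown here. Just to make the message format above concrete, its handling of EX_MSG_MAPPING could look roughly like the following; the word layout and the "0 means success" reply convention are taken from the client code, while the function name, headers and everything else are assumptions (a real BME server would use the bare-metal environment's own UDN interface rather than necessarily these exact calls):

#include <stdint.h>

// Hypothetical server-side handler for one EX_MSG_MAPPING request, invoked
// after a dispatch loop has already consumed the EX_MSG_MAPPING opcode word.
static void handle_mapping_message(void)
{
  DynamicHeader client = { .word = udn0_receive() };  // where to send the reply

  uint64_t pa_lo  = (uint32_t)udn0_receive();         // the client sent pa,
  uint64_t pa_hi  = (uint32_t)udn0_receive();         // pa >> 32,
  uint64_t pte_lo = (uint32_t)udn0_receive();         // pte,
  uint64_t pte_hi = (uint32_t)udn0_receive();         // pte >> 32
  uint64_t pa  = (pa_hi  << 32) | pa_lo;              // physical address of the page
  uint64_t pte = (pte_hi << 32) | pte_lo;             // PTE / caching attributes

  // ... a real server would record pa and pte here for later PROCESS requests ...
  (void)pa; (void)pte;

  // Acknowledge: the client treats any non-zero reply as an error.
  tmc_udn_send_1(client, UDN0_DEMUX_TAG, 0);
}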
Example #6
//------------------------------------------------------------------------------
//--------------------------Thread function-------------------------------------
//------------------------------------------------------------------------------
void *thread_fn(void *arg)
{
  int ID=(*((int*)arg));
  
  //Necessary local variables
  int count, count1, count2, start_addr, iterator;
  int row_count_A, col_count_B, el_count;
  DATA_TYPE temp_sum;
  uint_reg_t gather_sig = 0; //payload value is irrelevant; it only acts as a wake-up signal
  DATA_TYPE weight[3][3] = {{ -1,  0,  1 },{ -2,  0,  2 },{ -1,  0,  1 }};
  
  
  //Set cpu and activate udn
  if(tmc_cpus_set_my_cpu(core_map[ID])!=0)
  {
    printf("Thread: %d CPU setting failed.\n",ID);
    exit(1);
  }
  tmc_udn_activate();
  
  
  
  //Thread memory initialization
  DATA_TYPE *image, *gx, *gy;
  int n_out_rows=nrows-2;
  int factor=sizeof(uint_reg_t)/sizeof(DATA_TYPE);
  if(ID==0)
  {
    image = (DATA_TYPE*)malloc(sizeof(DATA_TYPE)*nrows*ncols);
    gx    = (DATA_TYPE*)malloc(sizeof(DATA_TYPE)*nrows*ncols);
    gy    = (DATA_TYPE*)malloc(sizeof(DATA_TYPE)*nrows*ncols);
    
    for(count=0; count<nrows; count++)
    {
      for(count1=0; count1<ncols; count1++)
      {
        image[count*ncols+count1]=count;
      }
    }
  }
  else
  {
    if(ID<n_out_rows%nthreads)
    {
      image = (DATA_TYPE*)malloc(sizeof(DATA_TYPE)*(n_out_rows/nthreads+1+2)*ncols);
      gx    = (DATA_TYPE*)malloc(sizeof(DATA_TYPE)*(n_out_rows/nthreads+1  )*ncols);
      gy    = (DATA_TYPE*)malloc(sizeof(DATA_TYPE)*(n_out_rows/nthreads+1  )*ncols);
    }
    else
    {
      image = (DATA_TYPE*)malloc(sizeof(DATA_TYPE)*(n_out_rows/nthreads+2)*ncols);
      gx    = (DATA_TYPE*)malloc(sizeof(DATA_TYPE)*(n_out_rows/nthreads  )*ncols);
      gy    = (DATA_TYPE*)malloc(sizeof(DATA_TYPE)*(n_out_rows/nthreads  )*ncols);
    }
  }
  
  
  
  //calculate x and y co-ordinates
  int x=ID%(xmax+1);
  int y=ID/(xmax+1);



  //Time management variables
  double zero; //Reference time for iterations
  double scatter_s, scatter_e, scatter_d, compute_s, compute_e, compute_d, gather_s, gather_e, gather_d, total_s, total_e, total_d=0.0;
  struct timespec st;

  for(iterator=0; iterator<iterations; iterator++)
  {
    
    //--------------------------------------------------------------------------
    //----------------------------Start of benchmark----------------------------
    //--------------------Repeated for each benchmark iteration-----------------
    //--------------------------------------------------------------------------
  

  
    
    //-------------------------Set reference time ------------------------------
    clock_gettime(CLOCK_THREAD_CPUTIME_ID,&st);
    zero=(double)st.tv_sec*1e6 + (double)st.tv_nsec*1e-3;        
    
    
    
    clock_gettime(CLOCK_THREAD_CPUTIME_ID,&st);
    total_s=(double)st.tv_sec*1e6 + (double)st.tv_nsec*1e-3;  
   
    
    
    //------------------------Step 1: Naive scatter-----------------------------
    //Set start time
    clock_gettime(CLOCK_THREAD_CPUTIME_ID,&st);
    scatter_s=((double)st.tv_sec*1e6 + (double)st.tv_nsec*1e-3)-zero;  
    
    if(ID==0)
    {
      start_addr=((n_out_rows%nthreads==0)?(n_out_rows/nthreads+1):(n_out_rows/nthreads+1+1));
    
      for(count=1; count<nthreads; count++)
      {
        if(count<n_out_rows%nthreads)
        {
          send126((uint_reg_t*)(&image[(start_addr-1)*ncols]),(n_out_rows/nthreads+1+1+1)*ncols/factor,core_map[count],UDN0_DEMUX_TAG);          
          start_addr+=n_out_rows/nthreads+1;
        }
        else
        {
          send126((uint_reg_t*)(&image[(start_addr-1)*ncols]),(n_out_rows/nthreads+1+1)*ncols/factor,core_map[count],UDN0_DEMUX_TAG);
          start_addr+=n_out_rows/nthreads;
        }
      }
    }
    else
    {
      if(ID<n_out_rows%nthreads)
      {
        receive126((uint_reg_t*)image,(n_out_rows/nthreads+1+2)*ncols/factor,core_map[0],UDN0_DEMUX_TAG);
      }
      else
      {
        receive126((uint_reg_t*)image,(n_out_rows/nthreads+2)*ncols/factor,core_map[0],UDN0_DEMUX_TAG);
      }
    }
    
    //Set end time
    clock_gettime(CLOCK_THREAD_CPUTIME_ID,&st);
    scatter_e=((double)st.tv_sec*1e6 + (double)st.tv_nsec*1e-3)-zero;    
    scatter_d=scatter_e-scatter_s;
    
    
    
    
    
    
    
    
    //-------------------------------Sobel compute------------------------------
    //Set start time
    clock_gettime(CLOCK_THREAD_CPUTIME_ID,&st);
    compute_s=((double)st.tv_sec*1e6 + (double)st.tv_nsec*1e-3)-zero;  
    
    //Compute Gx
    int i,j;
    if(ID<n_out_rows%nthreads)
    {
      for(count=1; count<(n_out_rows/nthreads+1+2-1); count++)
      {
        for(count1=1; count1<(ncols-1); count1++)
        {
          temp_sum=(DATA_TYPE)0; //reset the accumulator for each output pixel
          for(j=-1;j<=1;j++)
          {
            for(i=-1; i<=1; i++)
            {
              temp_sum+=weight[j + 1][i + 1] * image[(count+j)*ncols + count1 + i];
            }
          }
          
          gx[(count-1)*ncols+count1]=temp_sum;
        }
      }
    }
    else
    {
      for(count=1; count<(n_out_rows/nthreads+2-1); count++)
      {
        for(count1=1; count1<(ncols-1); count1++)
        {
          temp_sum=(DATA_TYPE)0; //reset the accumulator for each output pixel
          for(j=-1;j<=1;j++)
          {
            for(i=-1; i<=1; i++)
            {
              temp_sum+=weight[j + 1][i + 1] * image[(count+j)*ncols + count1 + i];
            }
          }
          
          gx[(count-1)*ncols+count1]=temp_sum;
        }
      }
    }
    
    
    //Compute Gy
    if(ID<n_out_rows%nthreads)
    {
      for(count1=1; count1<(ncols-1); count1++)
      {
        for(count=1; count<(n_out_rows/nthreads+1+2-1); count++)
        {
          temp_sum=(DATA_TYPE)0; //reset the accumulator for each output pixel
          for(j=-1;j<=1;j++)
          {
            for(i=-1; i<=1; i++)
            {
              temp_sum+=weight[i + 1][j + 1] * image[(count+i)*ncols + count1 + j];
            }
          }
          
          gy[(count-1)*ncols+count1]=temp_sum;
        }
      }
    }
    else
    {
      for(count1=1; count1<(ncols-1); count1++)
      {
        for(count=1; count<(n_out_rows/nthreads+2-1); count++)
        {
          temp_sum=(DATA_TYPE)0; //reset the accumulator for each output pixel
          for(j=-1;j<=1;j++)
          {
            for(i=-1; i<=1; i++)
            {
              temp_sum+=weight[i + 1][j + 1] * image[(count+i)*ncols + count1 + j];
            }
          }
          
          gy[(count-1)*ncols+count1]=temp_sum;
        }
      }
    }
    
    //Set end time
    clock_gettime(CLOCK_THREAD_CPUTIME_ID,&st);
    compute_e=((double)st.tv_sec*1e6 + (double)st.tv_nsec*1e-3)-zero;    
    compute_d=compute_e-compute_s;
    
    
    
    
    
    
    
    
    //---------------------------Gather-----------------------------------------
    
    //Set start time
    clock_gettime(CLOCK_THREAD_CPUTIME_ID,&st);
    gather_s=((double)st.tv_sec*1e6 + (double)st.tv_nsec*1e-3)-zero;  
    
    
    
    if(ID==0)
    {
      start_addr=1;
     
      for(count=1; count<nthreads; count++)
      {
        if(count<n_out_rows%nthreads)
        {
          //Send a signal to a certain waiting thread
          DynamicHeader header= tmc_udn_header_from_cpu(core_map[count]);
          tmc_udn_send_1(header,UDN2_DEMUX_TAG,gather_sig);
          
          //Collect Gx from that thread
          receive126((uint_reg_t*)(&gx[start_addr*ncols]),(n_out_rows/nthreads+1)*ncols/factor,core_map[count],UDN0_DEMUX_TAG);  
          
          //Collect Gy from that thread
          receive126((uint_reg_t*)(&gy[start_addr*ncols]),(n_out_rows/nthreads+1)*ncols/factor,core_map[count],UDN0_DEMUX_TAG);  
                  
          start_addr+=(n_out_rows/nthreads+1);
        }
        else
        {
          //Send a signal to a certain waiting thread
          DynamicHeader header= tmc_udn_header_from_cpu(core_map[count]);
          tmc_udn_send_1(header,UDN2_DEMUX_TAG,gather_sig);
          
          //Collect Gx from that thread
          receive126((uint_reg_t*)(&gx[start_addr*ncols]),(n_out_rows/nthreads)*ncols/factor,core_map[count],UDN0_DEMUX_TAG);  
          
          //Collect Gy from that thread
          receive126((uint_reg_t*)(&gy[start_addr*ncols]),(n_out_rows/nthreads)*ncols/factor,core_map[count],UDN0_DEMUX_TAG);  
                  
          start_addr+=(n_out_rows/nthreads);
        }
      }
    }
    else
    {
      if(ID<n_out_rows%nthreads)
      {
        //Wait for the go signal from the main thread
        gather_sig=tmc_udn2_receive();
        
        //Send the partial gx 
        send126((uint_reg_t*)gx,(n_out_rows/nthreads+1)*ncols/factor,core_map[0],UDN0_DEMUX_TAG);  
        
        
        //Send the partial gy 
        send126((uint_reg_t*)gy,(n_out_rows/nthreads+1)*ncols/factor,core_map[0],UDN0_DEMUX_TAG);  
        
      }
      else
      {
        //Wait for the go signal from the main thread
        gather_sig=tmc_udn2_receive();
        
        //Send the partial gx 
        send126((uint_reg_t*)gx,(n_out_rows/nthreads)*ncols/factor,core_map[0],UDN0_DEMUX_TAG);  
        
        
        //Send the partial gy 
        send126((uint_reg_t*)gy,(n_out_rows/nthreads)*ncols/factor,core_map[0],UDN0_DEMUX_TAG);  
      }
    }
    
    //Set end time
    clock_gettime(CLOCK_THREAD_CPUTIME_ID,&st);
    gather_e=((double)st.tv_sec*1e6 + (double)st.tv_nsec*1e-3)-zero;    
    gather_d=gather_e-gather_s;
    
    
    
    clock_gettime(CLOCK_THREAD_CPUTIME_ID,&st);
    total_e=(double)st.tv_sec*1e6 + (double)st.tv_nsec*1e-3;  
    total_d+=(total_e-total_s);
    
    
    
    
    
    //---------------------------Log the data-----------------------------------
    log_memory[iterator*nthreads*9+ID*9+SCA_S]=scatter_s;
    log_memory[iterator*nthreads*9+ID*9+SCA_E]=scatter_e;
    log_memory[iterator*nthreads*9+ID*9+SCA_D]=scatter_d;
    
    log_memory[iterator*nthreads*9+ID*9+COMP_S]=compute_s;
    log_memory[iterator*nthreads*9+ID*9+COMP_E]=compute_e;
    log_memory[iterator*nthreads*9+ID*9+COMP_D]=compute_d;
    
    log_memory[iterator*nthreads*9+ID*9+GATH_S]=gather_s;
    log_memory[iterator*nthreads*9+ID*9+GATH_E]=gather_e;
    log_memory[iterator*nthreads*9+ID*9+GATH_D]=gather_d;
    
     
     
    //---------------------Barrier----------------------------------------------
    barrier(ID);
    
    
    if(ID==0)
      printf("Iteration: %d\n",iterator);
    
    
     
    //--------------------------------------------------------------------------
    //------------------------------End of benchmark----------------------------
    //--------------------------------------------------------------------------
  
  }
  
  //Print the average time per iteration
  printf("%lf\n",total_d/iterations);
  
  //Free thread local memory
  free(image);
  free(gx);
  free(gy);
  
  pthread_exit(NULL);
}
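send126() and receive126() are helpers defined elsewhere in this benchmark; the name suggests they move a buffer as UDN packets of up to 126 words each. The simplified stand-in below pushes one word per packet instead, and it ignores the packetization and flow-control handling the real helpers presumably do, so it is only meant to show the data path under assumed signatures matching the calls above:

#include <tmc/udn.h>

//Send 'len' register-sized words from 'buf' to 'cpu' on the given demux queue.
void send126(uint_reg_t *buf, int len, int cpu, int tag)
{
  DynamicHeader header = tmc_udn_header_from_cpu(cpu);
  int k;
  for(k=0; k<len; k++)
    tmc_udn_send_1(header, tag, buf[k]);
}

//Receive 'len' register-sized words into 'buf'; this sketch always reads demux queue 0.
void receive126(uint_reg_t *buf, int len, int cpu, int tag)
{
  int k;
  (void)cpu; (void)tag;   //the demux queue identifies the stream in this sketch
  for(k=0; k<len; k++)
    buf[k] = tmc_udn0_receive();
}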