/*
 * Host stub for the direct (no indirection) "save_soln" parallel loop.
 *
 * The set is cut into one contiguous [start, finish) slice per thread and
 * op_x86_save_soln is run on each slice.  Afterwards set_dirtybit is called
 * for every OP_ARG_DAT argument and the timing record OP_kernels[0] is
 * updated (name, call count, wall time, bytes transferred).
 *
 *   name  kernel name stored in the timing record
 *   set   set being iterated; only set->size is read here
 *   arg0  first argument; arg0.data is handed to the kernel as double*
 *   arg1  second argument; arg1.data is handed to the kernel as double*
 */
void op_par_loop_save_soln(char const *name, op_set set,
  op_arg arg0,
  op_arg arg1 ){

  int nargs   = 2;
  op_arg args[2] = {arg0,arg1};

  if (OP_diags>2) {
    printf(" kernel routine w/o indirection:  save_soln \n");
  }

  // initialise timers

  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timers_core(&cpu_t1, &wall_t1);

  // set number of threads

#ifdef _OPENMP
  int nthreads = omp_get_max_threads( );
#else
  int nthreads = 1;
#endif

  // execute plan

#pragma omp parallel for
  for (int thr=0; thr<nthreads; thr++) {
    // The product is formed in long long: set->size*(thr+1) can exceed
    // INT_MAX for large sets, and signed int overflow is undefined.
    // The quotient never exceeds set->size, so narrowing back is safe.
    int start  = (int)(((long long)set->size *  thr     ) / nthreads);
    int finish = (int)(((long long)set->size * (thr + 1)) / nthreads);
    op_x86_save_soln( (double *) arg0.data,
                      (double *) arg1.data,
                      start, finish );
  }

  // set dirty bit on direct/indirect datasets with access OP_INC, OP_WRITE, OP_RW
  for (int i = 0; i<nargs; i++) {
    if (args[i].argtype == OP_ARG_DAT) {
      set_dirtybit(args[i]);
    }
  }

  // perform any global operations
  // - NONE

  // update kernel record

  op_timers_core(&cpu_t2, &wall_t2);
  op_timing_realloc(0);
  OP_kernels[0].name      = name;
  OP_kernels[0].count    += 1;
  OP_kernels[0].time     += wall_t2 - wall_t1;
  OP_kernels[0].transfer += (double)set->size * arg0.size;
  OP_kernels[0].transfer += (double)set->size * arg1.size;
}
/*
 * Host stub for the indirect "adt_calc" parallel loop.
 *
 * Sequence, as performed below:
 *   1. for every OP_ARG_DAT argument: optionally reset_halo (OP_diags == 1),
 *      call exchange_halo and, when it returns 1, wait_all straight away;
 *   2. fetch the execution plan with op_plan_get;
 *   3. walk the plan colour by colour, running op_x86_adt_calc over the
 *      blocks of each colour inside an OpenMP parallel for;
 *   4. call set_dirtybit for every OP_ARG_DAT argument;
 *   5. add this call to the timing record OP_kernels[1].
 *
 *   name        kernel name (plan lookup key and timing-record label)
 *   set         set being iterated
 *   arg0..arg3  arguments sharing indirection index 0 (see inds[])
 *   arg4, arg5  arguments with indirection index -1; their data pointers are
 *               passed to the kernel as double*
 */
void op_par_loop_adt_calc(char const *name, op_set set,
  op_arg arg0,
  op_arg arg1,
  op_arg arg2,
  op_arg arg3,
  op_arg arg4,
  op_arg arg5 ){

  int    nargs   = 6;
  op_arg args[6] = {arg0,arg1,arg2,arg3,arg4,arg5};

  int    ninds   = 1;
  int    inds[6] = {0,0,0,0,-1,-1};

  // indirect loop: exchange each dataset's halo and block on it immediately
  if (ninds > 0) {
    for (int a = 0; a < nargs; ++a) {
      if (args[a].argtype != OP_ARG_DAT) {
        continue;
      }
      if (OP_diags == 1) {
        reset_halo(args[a]);
      }
      int halo_sent = exchange_halo(args[a]);
      if (halo_sent == 1) {
        wait_all(args[a]);
      }
    }
  }

  if (OP_diags > 2) {
    printf(" kernel routine with indirection: adt_calc \n");
  }

  // get plan; a compile-time OP_PART_SIZE_1 overrides the global OP_part_size

  #ifdef OP_PART_SIZE_1
    int part_size = OP_PART_SIZE_1;
  #else
    int part_size = OP_part_size;
  #endif

  op_plan *Plan = op_plan_get(name, set, part_size, nargs, args, ninds, inds);

  // initialise timers

  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timers(&cpu_t1, &wall_t1);

  // set number of threads (queried here but not consumed further down)

#ifdef _OPENMP
  int nthreads = omp_get_max_threads( );
#else
  int nthreads = 1;
#endif

  // execute plan: colours run one after another, the blocks of one colour
  // are distributed over the OpenMP threads

  int block_offset = 0;

  for (int color = 0; color < Plan->ncolors; ++color) {
    int blocks_in_color = Plan->ncolblk[color];

#pragma omp parallel for
    for (int blk = 0; blk < blocks_in_color; ++blk) {
      op_x86_adt_calc( blk,
        (double *)arg0.data, Plan->ind_maps[0],
        Plan->loc_maps[0],
        Plan->loc_maps[1],
        Plan->loc_maps[2],
        Plan->loc_maps[3],
        (double *)arg4.data,
        (double *)arg5.data,
        Plan->ind_sizes,
        Plan->ind_offs,
        block_offset,
        Plan->blkmap,
        Plan->offset,
        Plan->nelems,
        Plan->nthrcol,
        Plan->thrcol);
    }

    block_offset += blocks_in_color;
  }

  // set dirty bit on direct/indirect datasets with access OP_INC, OP_WRITE, OP_RW
  for (int a = 0; a < nargs; ++a) {
    if (args[a].argtype == OP_ARG_DAT) {
      set_dirtybit(args[a]);
    }
  }

  // perform any global operations
  // - NONE

  // update kernel record

  op_timers(&cpu_t2, &wall_t2);
  op_timing_realloc(1);
  OP_kernels[1].name       = name;
  OP_kernels[1].count     += 1;
  OP_kernels[1].time      += wall_t2 - wall_t1;
  OP_kernels[1].transfer  += Plan->transfer;
  OP_kernels[1].transfer2 += Plan->transfer2;
}
/*
 * Host stub for the direct (no indirection) "update" parallel loop with one
 * global reduction argument.
 *
 * The set is cut into one contiguous [start, finish) slice per thread and
 * op_x86_update is run on each slice.  Every slice accumulates into its own
 * slot of a local scratch array; the slots are then summed into arg4.data.
 * Afterwards set_dirtybit is called for every OP_ARG_DAT argument,
 * global_reduce for every OP_ARG_GBL argument, and the timing record
 * OP_kernels[4] is updated.
 *
 *   name        kernel name stored in the timing record
 *   set         set being iterated; only set->size is read here
 *   arg0..arg3  arguments whose data is passed to the kernel as double*
 *   arg4        reduction argument; arg4.data is read and written as a
 *               single double
 */
void op_par_loop_update(char const *name, op_set set,
  op_arg arg0,
  op_arg arg1,
  op_arg arg2,
  op_arg arg3,
  op_arg arg4 ){

  // Layout of the per-thread reduction scratch array: slot `thr` starts at
  // element thr*RED_STRIDE (spacing presumably chosen to keep slots on
  // separate cache lines -- TODO confirm).
  enum { RED_STRIDE = 64, RED_MAX_SLOTS = 64 };

  int nargs   = 5;
  op_arg args[5] = {arg0,arg1,arg2,arg3,arg4};

  double *arg4h = (double *)arg4.data;

  if (OP_diags>2) {
    printf(" kernel routine w/o indirection:  update \n");
  }

  // initialise timers

  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timers(&cpu_t1, &wall_t1);

  // set number of threads

#ifdef _OPENMP
  int nthreads = omp_get_max_threads( );
#else
  int nthreads = 1;
#endif

  // The scratch array below has a fixed number of slots.  Cap the number of
  // slices so a machine with more hardware threads cannot index past its
  // end; the whole set is still covered because start/finish are derived
  // from the capped value.
  if (nthreads > RED_MAX_SLOTS) {
    nthreads = RED_MAX_SLOTS;
  }

  // allocate and initialise arrays for global reduction

  double arg4_l[1 + RED_STRIDE*RED_MAX_SLOTS];
  for (int thr=0; thr<nthreads; thr++) {
    for (int d=0; d<1; d++) {
      arg4_l[d + thr*RED_STRIDE] = ZERO_double;
    }
  }

  // execute plan

#pragma omp parallel for
  for (int thr=0; thr<nthreads; thr++) {
    // The product is formed in long long: set->size*(thr+1) can exceed
    // INT_MAX for large sets, and signed int overflow is undefined.
    int start  = (int)(((long long)set->size *  thr     ) / nthreads);
    int finish = (int)(((long long)set->size * (thr + 1)) / nthreads);
    op_x86_update( (double *) arg0.data,
                   (double *) arg1.data,
                   (double *) arg2.data,
                   (double *) arg3.data,
                   arg4_l + thr*RED_STRIDE,
                   start, finish );
  }

  // combine reduction data

  for (int thr=0; thr<nthreads; thr++) {
    for (int d=0; d<1; d++) {
      arg4h[d] += arg4_l[d + thr*RED_STRIDE];
    }
  }

  // set dirty bit on direct/indirect datasets with access OP_INC, OP_WRITE, OP_RW
  for (int i = 0; i<nargs; i++) {
    if (args[i].argtype == OP_ARG_DAT) {
      set_dirtybit(args[i]);
    }
  }

  // perform any global operations
  for (int i = 0; i<nargs; i++) {
    if (args[i].argtype == OP_ARG_GBL) {
      global_reduce(&args[i]);
    }
  }

  // update kernel record

  op_timers(&cpu_t2, &wall_t2);
  op_timing_realloc(4);
  OP_kernels[4].name      = name;
  OP_kernels[4].count    += 1;
  OP_kernels[4].time     += wall_t2 - wall_t1;
  OP_kernels[4].transfer += (double)set->size * arg0.size;
  OP_kernels[4].transfer += (double)set->size * arg1.size;
  // arg2 is counted twice (presumably because it is both read and written)
  OP_kernels[4].transfer += (double)set->size * arg2.size * 2.0f;
  OP_kernels[4].transfer += (double)set->size * arg3.size;
}
void op_par_loop_res_calc(char const *name, op_set set,                 
  op_arg arg0,                                                          
  op_arg arg1,                                                          
  op_arg arg2,                                                          
  op_arg arg3,                                                          
  op_arg arg4,                                                          
  op_arg arg5,                                                          
  op_arg arg6,                                                          
  op_arg arg7 ){     

  int    nargs   = 8;                                                   
  op_arg args[8] = {arg0,arg1,arg2,arg3,arg4,arg5,arg6,arg7};  

  int    ninds   = 4;                                                   
  int    inds[8] = {0,0,1,1,2,2,3,3}; 
  
  int sent[8] = {0,0,0,0,0,0,0,0}; //array to set if halo is exchanged
  if(ninds > 0) //indirect loop
  {
      for(int i = 0; i<nargs; i++)
      {
      	  if(args[i].argtype == OP_ARG_DAT)
      	  {
      	      if (OP_diags==1) reset_halo(args[i]);
      	      sent[i] = exchange_halo(args[i]); 
      	      //if(sent[i] == 1)wait_all(args[i]);
      	  }
      }
  }
                                                                        
  if (OP_diags>2) {                                                     
    printf(" kernel routine with indirection: res_calc \n");            
  }                                                                     
                                                                        
  // get plan             
  int block_offset;  
  op_plan *Plan;
                                                                        
  #ifdef OP_PART_SIZE_2                                                 
    int part_size = OP_PART_SIZE_2;                                     
  #else                                                                 
    int part_size = OP_part_size;                                       
  #endif                                                                
     
  //get offsets
  int core_len = core_num[set->index];
  int noncore_len = set->size + OP_import_exec_list[set->index]->size - core_len;
  
  double cpu_t1, cpu_t2, wall_t1, wall_t2;    

  //process core set
  if (core_len>0) {
      if (OP_latency_sets[set->index].core_set == NULL) {
	op_set core_set = (op_set)malloc(sizeof(op_set_core));
	core_set->index = set->index;
	core_set->name = set->name;
	core_set->size = core_len;
	core_set->exec_size = 0;
	core_set->nonexec_size = 0;
	OP_latency_sets[set->index].core_set = core_set;
      }
      Plan = op_plan_get_offset(name,OP_latency_sets[set->index].core_set,
      	  0,part_size,nargs,args,ninds,inds);
                                  
	  op_timers_core(&cpu_t1, &wall_t1);
	
      // set number of threads                                          
      #ifdef _OPENMP                                                          
      	int nthreads = omp_get_max_threads( );                                
      #else                                                                   
      	int nthreads = 1;                                                     
      #endif                                                                  
                       
      // execute plan                                                       
      int block_offset = 0;                                                 
                                                                        
      for(int col=0; col < Plan->ncolors; col++) {                         
      	  int nblocks = Plan->ncolblk[col];                                   
                                                                        
      	  #pragma omp parallel for                                                
      	  for (int blockIdx=0; blockIdx<nblocks; blockIdx++)  
      	  op_x86_res_calc( blockIdx,
      	      (double *)arg0.data, Plan->ind_maps[0],
      	      (double *)arg2.data, Plan->ind_maps[1],
      	      (double *)arg4.data, Plan->ind_maps[2],
      	      (double *)arg6.data, Plan->ind_maps[3],                           
      	      Plan->loc_maps[0],                                               
      	      Plan->loc_maps[1],                                               
      	      Plan->loc_maps[2],                                               
      	      Plan->loc_maps[3],                                               
  		Plan->loc_maps[4],                                               
  		Plan->loc_maps[5],                                               
  		Plan->loc_maps[6],                                               
  		Plan->loc_maps[7],                                               
  		Plan->ind_sizes,                                                 
  		Plan->ind_offs,                                                  
  		block_offset,                                                    
  		Plan->blkmap,                                                    
  		Plan->offset,                                                    
  		Plan->nelems,                                                    
  		Plan->nthrcol,                                                   
  		Plan->thrcol);                                                   
                                                                        
  	  block_offset += nblocks;                                            
      }
	op_timers_core(&cpu_t2, &wall_t2);
    OP_kernels[2].time     += wall_t2 - wall_t1;
    OP_kernels[2].transfer  += Plan->transfer;                            
    OP_kernels[2].transfer2 += Plan->transfer2;     
  }

  if(ninds > 0) //indirect loop
  {
      for(int i = 0; i<nargs; i++)
      {
      	  if(args[i].argtype == OP_ARG_DAT)
      	  {
      	      if(sent[i] == 1)wait_all(args[i]);
      	  }
      }
  }

  if (noncore_len>0) {
  	if (OP_latency_sets[set->index].noncore_set == NULL) {
		op_set noncore_set = (op_set)malloc(sizeof (op_set_core));
		noncore_set->size = noncore_len;
		noncore_set->name = set->name;
		noncore_set->index = set->index;
		noncore_set->exec_size = 0;
		noncore_set->nonexec_size = 0;
		OP_latency_sets[set->index].noncore_set = noncore_set;
	  }
	  Plan = op_plan_get_offset(name,OP_latency_sets[set->index].noncore_set,core_len,
	       part_size,nargs,args,ninds,inds);
	
	   op_timers_core(&cpu_t1, &wall_t1);
	
	   // set number of threads                                              
	   #ifdef _OPENMP                                                          
	   	int nthreads = omp_get_max_threads( );                                
	   #else
	   	int nthreads = 1;
	   #endif                                                                  
                       
	   // execute plan                                                       
	   int block_offset = 0;                                                 
                                                                        
	   for (int col=0; col < Plan->ncolors; col++) {                         
	       int nblocks = Plan->ncolblk[col];                                   
                                                                        
	       #pragma omp parallel for                                                
	       for (int blockIdx=0; blockIdx<nblocks; blockIdx++)                  
	       op_x86_res_calc( blockIdx,                                         
	       	   (double *)arg0.data, Plan->ind_maps[0],                           
	       	   (double *)arg2.data, Plan->ind_maps[1],                           
	       	   (double *)arg4.data, Plan->ind_maps[2],                           
	       	   (double *)arg6.data, Plan->ind_maps[3],                           
	       	   Plan->loc_maps[0],                                               
	       	   Plan->loc_maps[1],                                               
	       	   Plan->loc_maps[2],                                               
	       	   Plan->loc_maps[3],                                               
	       	   Plan->loc_maps[4],                                               
	       	   Plan->loc_maps[5],                                               
	       	   Plan->loc_maps[6],                                               
	       	   Plan->loc_maps[7],                                               
	       	   Plan->ind_sizes,                                                 
	       	   Plan->ind_offs,                                                  
	       	   block_offset,                                                    
	       	   Plan->blkmap,                                                    
	       	   Plan->offset,                                                    
	       	   Plan->nelems,                                                    
	       	   Plan->nthrcol,                                                   
	       	   Plan->thrcol);                                                   
                                                                        
	       block_offset += nblocks;       
	   } 
	   op_timers_core(&cpu_t2, &wall_t2);
    	OP_kernels[2].time     += wall_t2 - wall_t1;
	OP_kernels[2].transfer  += Plan->transfer;                            
	OP_kernels[2].transfer2 += Plan->transfer2;     
    }
  
  
  //set dirty bit on direct/indirect datasets with access OP_INC,OP_WRITE, OP_RW
  for(int i = 0; i<nargs; i++)
      if(args[i].argtype == OP_ARG_DAT)
      	set_dirtybit(args[i]);
  
  //performe any global operations
  // - NONE
  
  // update kernel record                                               

  op_timing_realloc(3);                                                 
  OP_kernels[2].name      = name;                                       
  OP_kernels[2].count    += 1;                                          
}