/* identity_reduce() - default reduce: emits each (key, value) pair unchanged. */
static void identity_reduce(void *key, void **vals, int vals_len)
{
   int i;
   for (i = 0; i < vals_len; i++)
   {
      emit_inline(key, vals[i]);
   }
}
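
/* emit() - public wrapper that forwards a final key/value pair to
 * emit_inline(). */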
void emit(void *key, void *val)
{
   emit_inline(key, val);
}
/** schedule_tasks()
 *  th_arg - wrapper argument carrying the task type (MAP, REDUCE, or MERGE)
 *           and shared scheduling state
 *  Runs tasks in a new thread on each of the available processors and
 *  waits for all of them to complete before returning.
 */
static inline void schedule_tasks(thread_wrapper_arg_t *th_arg)
{
   assert(th_arg);

   pthread_attr_t attr;   // parameter for pthread creation
   thread_wrapper_arg_t * curr_th_arg; // arg for thread_wrapper()
   
   int thread_cnt;        // counter of number of threads assigned
   int curr_proc;
   int curr_thread;

   int num_threads = getNumTaskThreads(th_arg->func_type);
   int threads_per_proc = num_threads / g_state.num_procs; 
   int threads_mod_procs = num_threads % g_state.num_procs;

   int pos = 0; // position of next result in the array
   pthread_mutex_t splitter_lock; // lock for splitter function

   g_state.tinfo = (thread_info_t *)CALLOC(num_threads, sizeof(thread_info_t));
   CHECK_ERROR(pthread_mutex_init(&splitter_lock, NULL) != 0);   
   
   dprintf("Number of available processors = %d\n", g_state.num_procs);
   dprintf("Number of Threads to schedule = %d per(%d) mod(%d)\n", 
      num_threads, threads_per_proc, threads_mod_procs);

   th_arg->pos = &pos;
   th_arg->splitter_lock = &splitter_lock;
   
   // thread must be scheduled systemwide
   pthread_attr_init(&attr);
   pthread_attr_setscope(&attr, PTHREAD_SCOPE_SYSTEM);

#ifdef _LINUX_
   unsigned long cpu_set; // bit array of available processors
   // Create a thread for each available processor to handle the split data
   CHECK_ERROR(sched_getaffinity(0, sizeof(cpu_set), &cpu_set) == -1);
   // cpu_set is a bit array, so the processor index ranges over its bits
   for (thread_cnt = curr_proc = 0; 
        curr_proc < sizeof(cpu_set) * 8 && thread_cnt < num_threads; 
        curr_proc++)
   {
      if (isCpuAvailable(cpu_set, curr_proc))
      {
#endif
#ifdef _SOLARIS_
   int max_procs = sysconf(_SC_NPROCESSORS_ONLN);
   for (thread_cnt = curr_proc = 0; thread_cnt < num_threads; curr_proc++)
   {
      if (P_ONLINE == p_online(curr_proc, P_STATUS))
      {
#endif
         
          // Distribute threads evenly: each processor gets threads_per_proc
          // threads, and the first threads_mod_procs processors get one
          // extra (curr_thread starts at 0 while extras remain, else at 1)
          for (curr_thread = !(threads_mod_procs-- > 0); 
               curr_thread <= threads_per_proc && thread_cnt < num_threads; 
               curr_thread++, thread_cnt++)
         {
            // Setup data to be passed to each thread
            curr_th_arg = (thread_wrapper_arg_t*)MALLOC(sizeof(thread_wrapper_arg_t));
            memcpy(curr_th_arg, th_arg, sizeof(thread_wrapper_arg_t));
            curr_th_arg->cpu_id = curr_proc;

            g_state.tinfo[thread_cnt].cpuid = curr_proc;

            //fprintf(stderr, "Starting thread %d on cpu %d\n", thread_cnt, curr_th_arg->cpu_id);
            switch (th_arg->func_type)
            {
            case MAP:
               CHECK_ERROR(pthread_create(&g_state.tinfo[thread_cnt].tid, &attr, 
                                                map_worker, curr_th_arg) != 0);
               break;
            case REDUCE:
               CHECK_ERROR(pthread_create(&g_state.tinfo[thread_cnt].tid, &attr, 
                                                reduce_worker, curr_th_arg) != 0);
               break;
            case MERGE:
               CHECK_ERROR(pthread_create(&g_state.tinfo[thread_cnt].tid, &attr, 
                                                merge_worker, curr_th_arg) != 0);
               break;
            default:
               assert(0);
               break;
            }
         }
      }
      
#ifdef _SOLARIS_
      /*** ADDED BY RAM TO ASSIGN EACH PTHREAD TO HARDWARE THREADS ON DIFFERENT
      PROCESSORS ON THE ULTRASPARC T1 ****/
      // Note: max_procs is only defined in the Solaris path above, so this
      // block must be guarded for Solaris builds
      if (getenv("MR_AFARA") != NULL)
      {
         //fprintf(stderr, "Using sparse threads\n");
         // Advance three extra strands (plus the loop's increment) so
         // successive threads land on different T1 cores
         curr_proc += 3;
         if (curr_proc >= max_procs-1) {
            curr_proc++;
            curr_proc = curr_proc % max_procs; 
         }
      }
#endif
   }

   
   dprintf("Status: All %d threads have been created\n", num_threads);
   
   // barrier, wait for all threads to finish           
   for (thread_cnt = 0; thread_cnt < num_threads; thread_cnt++)
   {
      void *ret_val;
      CHECK_ERROR(pthread_join(g_state.tinfo[thread_cnt].tid, &ret_val) != 0);

      // If a thread returned an error (nonzero ret_val), it could be
      // restarted here; for now the result is ignored.
   }
   
   pthread_attr_destroy(&attr);
   pthread_mutex_destroy(&splitter_lock);
   free(g_state.tinfo);
   dprintf("Status: All tasks have completed\n"); 
   
   return;
}

/** map_worker()
 * args - pointer to thread_wrapper_arg_t
 * returns 0 on success
 * Runs the user's map function until there is no more data from the splitter().
 * Pointers to the results are stored in the intermediate value arrays.
 */
static void *map_worker(void *args) 
{
   thread_wrapper_arg_t *th_arg = (thread_wrapper_arg_t *)args;
   int thread_index = getCurrThreadIndex(MAP);
   map_args_t thread_func_arg;
   int num_assigned = 0;
   int ret; // return value of splitter func. 0 = no more data to provide
   int isOneQueuePerTask = g_state.isOneQueuePerTask;

   assert(th_arg);
   
#ifdef _LINUX_
   // Bind thread to run on cpu_id
   unsigned long cpu_set = 0;
   setCpuAvailable(&cpu_set, th_arg->cpu_id);
   CHECK_ERROR(sched_setaffinity(0, sizeof(cpu_set), &cpu_set) != 0);
#endif

#ifdef _SOLARIS_
   dprintf("Binding thread to processor %d\n", th_arg->cpu_id);
   CHECK_ERROR(processor_bind(P_LWPID, P_MYID, th_arg->cpu_id, NULL)!= 0);
#endif

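   // Repeatedly grab a chunk from the splitter (serialized by
   // splitter_lock) and run the user's map function on it until the
   // splitter reports no more data.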
   while (1)
   {
      pthread_mutex_lock(th_arg->splitter_lock);
            
      ret = g_state.splitter(g_state.args->task_data, g_state.chunk_size, &thread_func_arg);
      if (ret != 0) 
      {
         int alloc_len = g_state.intermediate_task_alloc_len;
         g_state.tinfo[thread_index].curr_task = g_state.map_tasks++;
         num_assigned++;

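         // With one queue per map task, each reduce task's row of
         // intermediate arrays may need to grow: double the allocation
         // and zero out the newly added half.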
         if (isOneQueuePerTask && g_state.map_tasks > alloc_len)
         {
            dprintf("MAP TASK QUEUE REALLOC\n");
            int i;

            g_state.intermediate_task_alloc_len *= 2;

            for (i = 0; i < g_state.reduce_tasks; i++)
            {
               g_state.intermediate_vals[i] = (keyvals_arr_t *)REALLOC(
                  g_state.intermediate_vals[i], 
                  g_state.intermediate_task_alloc_len*sizeof(keyvals_arr_t));
               memset(&g_state.intermediate_vals[i][alloc_len], 0, 
                  alloc_len*sizeof(keyvals_arr_t));
            }
         }
      }
      
      pthread_mutex_unlock(th_arg->splitter_lock);

      // Stop if there is no more data
      if (ret == 0) break;
      
      dprintf("Task %d: cpu_id -> %d - Started\n", num_assigned, th_arg->cpu_id);

      g_state.args->map(&thread_func_arg);

      dprintf("Task %d: cpu_id -> %d - Done\n", num_assigned, th_arg->cpu_id);
   }

   dprintf("Status: Total of %d tasks were assigned to cpu_id %d\n", 
      num_assigned, th_arg->cpu_id);

   free(args);
   
   return (void *)0;
}


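/** reduce_worker()
 * args - pointer to thread_wrapper_arg_t
 * returns 0 on success
 * Claims reduce tasks one at a time; for each task it merges the
 * per-map-thread intermediate arrays in key order, gathering all values
 * for a key and passing them to the user's reduce() (identity_reduce
 * values are emitted directly as they are found).
 */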
static void *reduce_worker(void *args)
{
   thread_wrapper_arg_t *th_arg = (thread_wrapper_arg_t *)args;   
   int thread_index = getCurrThreadIndex(REDUCE);
   int isOneQueuePerTask = g_state.isOneQueuePerTask;
   
   assert(th_arg);
   
#ifdef _LINUX_
   // Bind thread to run on cpu_id
   unsigned long cpu_set = 0;
   setCpuAvailable(&cpu_set, th_arg->cpu_id);
   CHECK_ERROR(sched_setaffinity(0, sizeof(cpu_set), &cpu_set) != 0);
#endif

#ifdef _SOLARIS_
   CHECK_ERROR(processor_bind(P_LWPID, P_MYID, th_arg->cpu_id, NULL) != 0);
#endif

   int curr_thread, done;
   int curr_reduce_task = 0;
   int ret;
   int num_map_threads;
   if (isOneQueuePerTask)
      num_map_threads = g_state.map_tasks;
   else
      num_map_threads = g_state.num_map_threads;

   int startsize = DEFAULT_VALS_ARR_LEN;
   keyvals_arr_t* thread_array;
   int vals_len, max_len, next_min_pos;
   keyvals_t *curr_key_val, *min_key_val, *next_min;

   int * thread_position = (int *)MALLOC(num_map_threads * sizeof(int));
   void** vals = MALLOC(sizeof(char*)*startsize);

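   // Merge strategy: the intermediate arrays for a reduce task are scanned
   // in parallel; each pass gathers every value belonging to the current
   // minimum key into 'vals' while tracking the next-smallest key for the
   // following pass.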
   while (1)
   {
      // Get the next reduce task
      pthread_mutex_lock(th_arg->splitter_lock);

      ret = (*th_arg->pos >= g_state.reduce_tasks);
      if (!ret)
      {
         g_state.tinfo[thread_index].curr_task = curr_reduce_task = 
                                                         (*th_arg->pos)++;
      }
      pthread_mutex_unlock(th_arg->splitter_lock);

      // No more reduce tasks
      if (ret) break;

      bzero((char *)thread_position, num_map_threads*sizeof(int));

      vals_len = 0;
      max_len = startsize;

      min_key_val = NULL;
      next_min = NULL;
      done = 0;

      while (!done)
      {
         for (curr_thread = 0; curr_thread < num_map_threads; curr_thread++)
         {
            /* Find the next array to search */
            thread_array = 
               &g_state.intermediate_vals[curr_reduce_task][curr_thread];

            /* Check if the current processor array has been completely searched */
            if (thread_position[curr_thread] >= thread_array->len) continue;

            /* Get the next key in the processor array */
            curr_key_val = &thread_array->arr[thread_position[curr_thread]];

            /* If the key matches the current minimum, add its values to the
               list of values for that key */
            if (min_key_val != NULL && 
                !g_state.args->key_cmp(curr_key_val->key, min_key_val->key))
            {
               if (g_state.reduce == identity_reduce)
               {
                  int j;
                  for (j = 0; j < curr_key_val->len; j++)
                  {
                     emit_inline(min_key_val->key, curr_key_val->vals[j]);
                  }
               }
               else
               {
                  if (vals_len + curr_key_val->len >= max_len)
                  {
                     while (vals_len + curr_key_val->len >= max_len)
                        max_len *= 2;

                     vals = REALLOC(vals, sizeof(char*)*(max_len));
                  }
                  memcpy(&vals[vals_len], curr_key_val->vals, 
                         curr_key_val->len*sizeof(char*));
                  vals_len += curr_key_val->len;
               }

               // Consume this entry, then step back so the same array's
               // next key is considered again in this pass
               thread_position[curr_thread--]++;
            }
            /* Find the location of the next min */
            else if (next_min == NULL || 
                     g_state.args->key_cmp(curr_key_val->key, next_min->key) < 0)
            {
               next_min = curr_key_val;
               next_min_pos = curr_thread;
            }
         }
         }

         if (min_key_val != NULL)
         {
            if (g_state.reduce != identity_reduce)
            {
               g_state.reduce(min_key_val->key, vals, vals_len);
            }

            vals_len = 0;
            min_key_val = NULL;
         }

         if (next_min != NULL)
         {
            min_key_val = next_min;
            next_min = NULL;
         }
         
         // See if there are any elements left
         for(curr_thread = 0; curr_thread < num_map_threads && 
             thread_position[curr_thread] >= 
             g_state.intermediate_vals[curr_reduce_task][curr_thread].len; 
             curr_thread++);
	      done = (curr_thread == num_map_threads);
      }
      
      for (curr_thread = 0; curr_thread < num_map_threads; curr_thread++)
      {
         keyvals_arr_t * arr = &g_state.intermediate_vals[curr_reduce_task][curr_thread];
         int j;
         for(j = 0; j < arr->len; j++)
         {
            free(arr->arr[j].vals);
         }
         free(arr->arr);
      }
      free(g_state.intermediate_vals[curr_reduce_task]);
   }

   free(thread_position);
   free(vals);
   free(args);

   return (void *)0;
}
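
/** emit_struct()
 * def - parsed struct definition
 * Generates the body of the XDR routine for a struct.  With inlining
 * disabled (doinline == 0) every member is serialized through an
 * individual print_stat() call; otherwise runs of basic-type members
 * that reach the doinline threshold are batched through a single
 * XDR_INLINE buffer access, with a per-member fallback when the buffer
 * is unavailable.  A run of three int members, for instance, compiles
 * to roughly (a sketch of the emitted shape, not verbatim output):
 *    buf = (int32_t *)XDR_INLINE(xdrs, 3 * BYTES_PER_XDR_UNIT);
 *    if (buf == NULL) { ... per-member xdr calls ... }
 *    else { ... direct buffer accesses ... }
 */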
static void
emit_struct (definition * def)
{
  decl_list *dl;
  int i, j, size, flag;
  decl_list *cur = NULL, *psav;
  bas_type *ptr;
  char *sizestr, *plus;
  char ptemp[256];
  int can_inline;


  if (doinline == 0) {
    for (dl = def->def.st.decls; dl != NULL; dl = dl->next)
      print_stat (1, &dl->decl);
    return;
  }
  for (dl = def->def.st.decls; dl != NULL; dl = dl->next)
    if (dl->decl.rel == REL_VECTOR) {
      f_print (fout, "\t int i;\n");
      break;
    }
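  /* First pass over the members: decide whether inlining is worthwhile.
   * Any basic-type vector qualifies immediately; otherwise a contiguous
   * run of basic-type scalars must reach the doinline threshold. */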
  size = 0;
  can_inline = 0;
  for (dl = def->def.st.decls; dl != NULL; dl = dl->next)
    if ((dl->decl.prefix == NULL) &&
	((ptr = find_type (dl->decl.type)) != NULL) &&
	((dl->decl.rel == REL_ALIAS) || (dl->decl.rel == REL_VECTOR))) {

      if (dl->decl.rel == REL_ALIAS)
	size += ptr->length;
      else {
	can_inline = 1;
	break;			/* can be inlined */
      };
    }
    else {
      if (size >= doinline) {
	can_inline = 1;
	break;			/* can be inlined */
      }
      size = 0;
    }
  if (size > doinline)
    can_inline = 1;

  if (can_inline == 0) {	/* cannot inline; fall back to old mode */
    for (dl = def->def.st.decls; dl != NULL; dl = dl->next)
      print_stat (1, &dl->decl);
    return;
  };

  /* Two passes: generate the XDR_ENCODE (PUT) branch first, then the
   * XDR_DECODE (GET) branch. */
  flag = PUT;
  for (j = 0; j < 2; j++) {

    if (flag == PUT)
      f_print (fout, "\n\t if (xdrs->x_op == XDR_ENCODE) {\n");
    else
      f_print (fout, "\n \t return (TRUE);\n\t} else if (xdrs->x_op == XDR_DECODE) {\n");


    i = 0;
    size = 0;
    sizestr = NULL;
    for (dl = def->def.st.decls; dl != NULL; dl = dl->next) {

      /* now walk down the list and check for basic types */
      if ((dl->decl.prefix == NULL) && ((ptr = find_type (dl->decl.type)) != NULL) && ((dl->decl.rel == REL_ALIAS) || (dl->decl.rel == REL_VECTOR))) {
	if (i == 0)
	  cur = dl;
	i++;

	if (dl->decl.rel == REL_ALIAS)
	  size += ptr->length;
	else {
	  /* this is required to handle arrays */

	  if (sizestr == NULL)
	    plus = " ";
	  else
	    plus = "+";

	  if (ptr->length != 1)
	    s_print (ptemp, " %s %s * %d", plus, dl->decl.array_max, ptr->length);
	  else
	    s_print (ptemp, " %s %s ", plus, dl->decl.array_max);

	  /* now concatenate to sizestr !!!! */
	  if (sizestr == NULL)
	    sizestr = strdup (ptemp);
	  else {
	    sizestr = (char *) realloc (sizestr, strlen (sizestr) + strlen (ptemp) + 1);
	    if (sizestr == NULL) {

	      f_print (stderr, "Fatal error : no memory \n");
	      crash ();
	    };
	    sizestr = strcat (sizestr, ptemp);	/* build up length of
						 * array */

	  }
	}

      }
      else {
	if (i > 0) {
	  if (sizestr == NULL && size < doinline) {
	    /* don't expand into inline
	     * code if size < doinline */
	    while (cur != dl) {
	      print_stat (1, &cur->decl);
	      cur = cur->next;
	    }
	  }
	  else {
	    /* we're already looking at an
	     * xdr_inlineable structure */
	    if (sizestr == NULL)
	      f_print (fout, "\t buf = (int32_t *)XDR_INLINE(xdrs,%d * BYTES_PER_XDR_UNIT);",
		       size);
	    else if (size == 0)
	      f_print (fout,
		       "\t buf = (int32_t *)XDR_INLINE(xdrs,%s * BYTES_PER_XDR_UNIT);",
		       sizestr);
	    else
	      f_print (fout,
		       "\t buf = (int32_t *)XDR_INLINE(xdrs,(%d + %s)* BYTES_PER_XDR_UNIT);",
		       size, sizestr);

	    f_print (fout, "\n\t   if (buf == NULL) {\n");

	    psav = cur;
	    while (cur != dl) {
	      print_stat (2, &cur->decl);
	      cur = cur->next;
	    }

	    f_print (fout, "\n\t  }\n\t  else {\n");

	    cur = psav;
	    while (cur != dl) {
	      emit_inline (&cur->decl, flag);
	      cur = cur->next;
	    }

	    f_print (fout, "\t  }\n");
	  }
	}
	size = 0;
	i = 0;
	sizestr = NULL;
	print_stat (1, &dl->decl);
      }

    }
    if (i > 0) {
      if (sizestr == NULL && size < doinline) {
	/* don't expand into inline code if size <
	 * doinline */
	while (cur != dl) {
	  print_stat (1, &cur->decl);
	  cur = cur->next;
	}
      }
      else {

	/* we're already looking at an xdr_inlineable
	 * structure */
	if (sizestr == NULL)
	  f_print (fout, "\t\tbuf = (int32_t *)XDR_INLINE(xdrs,%d * BYTES_PER_XDR_UNIT);",
		   size);
	else if (size == 0)
	  f_print (fout,
	   "\t\tbuf = (int32_t *)XDR_INLINE(xdrs,%s * BYTES_PER_XDR_UNIT);",
		   sizestr);
	else
	  f_print (fout,
		   "\t\tbuf = (int32_t *)XDR_INLINE(xdrs,(%d + %s)* BYTES_PER_XDR_UNIT);",
		   size, sizestr);

	f_print (fout, "\n\t\tif (buf == NULL) {\n");

	psav = cur;
	while (cur != NULL) {
	  print_stat (2, &cur->decl);
	  cur = cur->next;
	}
	f_print (fout, "\n\t  }\n\t  else {\n");

	cur = psav;
	while (cur != dl) {
	  emit_inline (&cur->decl, flag);
	  cur = cur->next;
	}

	f_print (fout, "\t  }\n");

      }
    }
    flag = GET;
  }
  f_print (fout, "\t return(TRUE);\n\t}\n\n");

  /* now take care of XDR_FREE case */

  for (dl = def->def.st.decls; dl != NULL; dl = dl->next)
    print_stat (1, &dl->decl);
}