Example #1
0
char *get_host_by_tid(struct pvmhostinfo *hostp, int nhost, int tid) {
  int j, host_tid;
  char *host_name = "unknown";
  host_tid = pvm_tidtohost(tid);
  for (j=0; j<nhost; j++)
    if (hostp[j].hi_tid == host_tid) {
      host_name = hostp[j].hi_name;
      break;
    }
  return host_name;
}
Example #2
0
/* 
   Create the PE Tasks. We spawn (nPEs-1) pvm threads: the Main Thread 
   (which starts execution and performs IO) is created by forking SysMan 
*/
static int
createPEs(int total_nPEs) {
  int i, spawn_nPEs, iSpawn = 0, nArch, nHost;
  struct pvmhostinfo *hostp; 
  int sysman_host;

  spawn_nPEs = total_nPEs-1;
  if (spawn_nPEs > 0) {
    IF_PAR_DEBUG(verbose,
		 fprintf(stderr, "==== [%x] Spawning %d PEs(%s) ...\n", 
			 sysman_id, spawn_nPEs, petask);
		 fprintf(stderr, "  args: ");
		 for (i = 0; pargv[i]; ++i)
		   fprintf(stderr, "%s, ", pargv[i]);
		 fprintf(stderr, "\n"));

    pvm_config(&nHost,&nArch,&hostp);
    sysman_host=pvm_tidtohost(sysman_id);
	
    /* create PEs on the specific machines in the specified order! */
    for (i=0; (iSpawn<spawn_nPEs) && (i<nHost); i++)
      if (hostp[i].hi_tid != sysman_host) { 
	checkComms(pvm_spawn(petask, pargv, spawn_flag+PvmTaskHost, 
			     hostp[i].hi_name, 1, gtids+iSpawn),
		   "SysMan startup");
	IF_PAR_DEBUG(verbose,
		     fprintf(stderr, "==== [%x] Spawned PE %d onto %s\n",
			     sysman_id, i, hostp[i].hi_name));
	iSpawn++;
      }
      
    /* create additional PEs anywhere you like */
    if (iSpawn<spawn_nPEs) { 
      checkComms(pvm_spawn(petask, pargv, spawn_flag, "", 
			   spawn_nPEs-iSpawn, gtids+iSpawn),
		 "SysMan startup");
	IF_PAR_DEBUG(verbose,
		     fprintf(stderr,"==== [%x] Spawned %d additional PEs anywhere\n",
			     sysman_id, spawn_nPEs-iSpawn));
      }   
    }
Example #3
0
void BBSDirect::start() {
	char* client = 0;
	int tid, host_mytid;
	int i, n, ncpu, nncpu;
	struct pvmhostinfo* hostp;
	if (started_) { return; }
	BBSImpl::start();
	mytid_ = pvm_mytid();
	nrnmpi_myid = 0;
	if (mytid_ < 0) { perror("start"); }
	host_mytid = pvm_tidtohost(mytid_);
	tid = pvm_parent();
	if (tid == PvmSysErr) {
		perror("start");
	}else if (tid == PvmNoParent) {
		is_master_ = true;
		pvm_catchout(stdout);
		pvm_setopt(PvmRoute, PvmRouteDirect);
		pvm_config(&n, NULL, &hostp);
		nncpu = 0;
		for (i=0; i < n; ++i) {
			ncpu = hostp[i].hi_speed;
			if (ncpu%1000) {
				hoc_warning(hostp[i].hi_name,
" speed in pvm configuration file is not a multiple of 1000. Assuming 1000.");
				ncpu = 1000;
			}
			nncpu += ncpu/1000;
		}
		nrnmpi_numprocs = nncpu;
		ncids = 0;
	}else{ // a worker, impossible
		assert(false);
	}
	if (nrnmpi_numprocs > 1 && tid == PvmNoParent) {
		char ** sargv;
	// args are workingdirectory specialOrNrniv -bbs_nhost nhost args
		sargv = new char*[nrn_global_argc + 4];
		for (i=1; i < nrn_global_argc; ++i) {
			sargv[i+3] = nrn_global_argv[i];
		}
		sargv[nrn_global_argc + 3] = 0;
		sargv[0] = rel_working_dir();
//printf("sargv[0]=|%s|\n", sargv[0]);
		sargv[1] = nrn_global_argv[0];
		sargv[2] = "-bbs_nhost";
		sargv[3] = new char[10];
		sprintf(sargv[3], "%d", nrnmpi_numprocs);
		cids = new int[nrnmpi_numprocs-1];
if (nrn_global_argv[nrn_global_argc] != 0) {
	printf("argv not null terminated\n");
	exit(1);
}

		BBSDirectServer::server_->start();
		bbs_sig_set();
		bbs_handle();

		//spawn according to number of cpu's (but master has one less)
//printf("%d total number of cpus on %d machines\n", nncpu, n);
		int icid = 0;
		bool first = true;
	    while (icid < nrnmpi_numprocs - 1) {
		for (i=0; i < n; ++i) {
			ncpu = hostp[i].hi_speed;
			if (ncpu%1000) {
				ncpu = 1000;
			}
			ncpu /= 1000;
//printf("%d cpu for machine %d (%s)\n", ncpu, i, hostp[i].hi_name);
			if (first && hostp[i].hi_tid == host_mytid) {
				// spawn one fewer on master first time through
				--ncpu;
			}
			if (icid + ncpu >= nrnmpi_numprocs) {
				ncpu = nrnmpi_numprocs - icid - 1;
			}
//printf("before spawn %d processes (icid=%d) for machine %d (%s)\n", ncpu, icid, i, hostp[i].hi_name);
			if (ncpu) {
				ncids = pvm_spawn("bbswork.sh", sargv,
					PvmTaskHost, hostp[i].hi_name, ncpu, cids + icid);
				if (ncids != ncpu) {
fprintf(stderr, "Tried to spawn %d tasks, only %d succeeded on %s\n", ncpu, ncids, hostp[i].hi_name);
hoc_execerror("Could not spawn all the requested tasks for", hostp[i].hi_name);
				}
//printf("spawned %d for %s with cids starting at %d\n", ncpu, hostp[i].hi_name, icid);
				icid += ncpu;
			}
			if (icid >= nrnmpi_numprocs) {
				break;
			}
		}
		first = false;
	    }
		ncids = icid;
printf("spawned %d more %s on %d cpus on %d machines\n", ncids, nrn_global_argv[0], nncpu, n);
		delete [] sargv[3];
		delete [] sargv;
	}
}
Example #4
0
int main(int argc, char** argv)
{
   int pid = pvm_mytid();

   if (argc != 2 && argc != 3)
      stop("Usage: tm_driver <control_file> [TM_tid]\n");

   int tm_tid = 0;
   char * control_file = strdup(argv[1]);

   if (argc == 3)
      sscanf(argv[2], "t%x", &tm_tid);

   int info = 0;

   // Get the machine configuration
   int nhost = 0;
   int narch = 0;
   struct pvmhostinfo *hostp = 0;
   info = pvm_config(&nhost, &narch, &hostp);

   // Parse the control file
   int to_delete_size = 0;  // # of machsto delete
   char ** to_delete = 0;   // names of machs to delete
   int to_add_size = 0;     // # of machsto add
   char ** to_add = 0;      // names of machs to add

   int delete_proc_num = 0; // # of procs to delete
   int * tid_delete = 0;    // the tids of procs to delete

   // # of various procs to start
   int lp_num = 0;
   int cg_num = 0;
   int vg_num = 0;
   int cp_num = 0;
   int vp_num = 0;
   // the mach names where the procs shoud be started
   char ** lp_mach = 0;
   char ** cg_mach = 0;
   char ** vg_mach = 0;
   char ** cp_mach = 0;
   char ** vp_mach = 0;

   // Do the parsing. First count
   ifstream ctl(control_file);
   if (!ctl)
      stop("Cannot open parameter file... Aborting.\n");
   // Get the lines of the parameter file one-by-one and if a line contains a
   // (keyword, value) pair then interpret it.
   const int MAX_PARAM_LINE_LENGTH = 1024;
   char line[MAX_PARAM_LINE_LENGTH+1], *end_of_line, *keyword, *value, *ctmp;
   char ch;
   while (ctl) {
      ctl.get(line, MAX_PARAM_LINE_LENGTH);
      if (ctl) {
	 ctl.get(ch);
	 if (ch != '\n') {
	    printf("Too long (>= %i chars) line in the parameter file.\n",
		   MAX_PARAM_LINE_LENGTH);
	    stop("This is absurd. Aborting.\n");
	 }
      }
      end_of_line = line + strlen(line);
      //-------------------------- First separate the keyword and value ------
      keyword = find_if(line, end_of_line, isgraph);
      if (keyword == end_of_line) // empty line
	 continue;
      ctmp = find_if(keyword, end_of_line, isspace);
      if (ctmp == end_of_line) // line is just one word. must be a comment
	 continue;
      *ctmp = 0; // terminate the keyword with a 0 character
      ++ctmp;

      value = find_if(ctmp, end_of_line, isgraph);
      if (value == end_of_line) // line is just one word. must be a comment
	 continue;
      
      ctmp = find_if(value, end_of_line, isspace);
      *ctmp = 0; // terminate the value with a 0 character. this is good even
		 // if ctmp == end_ofline

      if (str_eq(keyword, "BCP_delete_machine")) {
	 ++to_delete_size;
      } else if (str_eq(keyword, "BCP_add_machine")) {
	 ++to_add_size;
      } else if (str_eq(keyword, "BCP_delete_proc")) {
	 ++delete_proc_num;
      } else if (str_eq(keyword, "BCP_lp_process")) {
	 ++lp_num;
      } else if (str_eq(keyword, "BCP_cg_process")) {
	 ++cg_num;
      } else if (str_eq(keyword, "BCP_vg_process")) {
	 ++vg_num;
      } else if (str_eq(keyword, "BCP_cp_process")) {
	 ++cp_num;
      } else if (str_eq(keyword, "BCP_vp_process")) {
	 ++vp_num;
      }
   }
   ctl.close();

   if (to_delete_size > 0) {
      to_delete = new char*[to_delete_size];
      to_delete_size = 0;
   }
   if (to_add_size > 0) {
      to_add = new char*[to_add_size];
      to_add_size = 0;
   }
   if (delete_proc_num > 0) {
      tid_delete = new int[delete_proc_num];
      delete_proc_num = 0;
   }
   if (lp_num) {
      lp_mach = new char*[lp_num];
      lp_num = 0;
   }
   if (cg_num) {
      cg_mach = new char*[cg_num];
      cg_num = 0;
   }
   if (vg_num) {
      vg_mach = new char*[vg_num];
      vg_num = 0;
   }
   if (cp_num) {
      cp_mach = new char*[cp_num];
      cp_num = 0;
   }
   if (vp_num) {
      vp_mach = new char*[vp_num];
      vp_num = 0;
   }

   ctl.open(control_file);
   while (ctl) {
      ctl.get(line, MAX_PARAM_LINE_LENGTH);
      if (ctl) {
	 ctl.get(ch);
	 if (ch != '\n') {
	    printf("Too long (>= %i chars) line in the parameter file.\n",
		   MAX_PARAM_LINE_LENGTH);
	    stop("This is absurd. Aborting.\n");
	 }
      }
      end_of_line = line + strlen(line);
      //-------------------------- First separate the keyword and value ------
      keyword = find_if(line, end_of_line, isgraph);
      if (keyword == end_of_line) // empty line
	 continue;
      ctmp = find_if(keyword, end_of_line, isspace);
      if (ctmp == end_of_line) // line is just one word. must be a comment
	 continue;
      *ctmp = 0; // terminate the keyword with a 0 character
      ++ctmp;

      value = find_if(ctmp, end_of_line, isgraph);
      if (value == end_of_line) // line is just one word. must be a comment
	 continue;
      
      ctmp = find_if(value, end_of_line, isspace);
      *ctmp = 0; // terminate the value with a 0 character. this is good even
		 // if ctmp == end_ofline

      if (str_eq(keyword, "BCP_delete_machine")) {
	 to_delete[to_delete_size++] = strdup(value);
      } else if (str_eq(keyword, "BCP_add_machine")) {
	 to_add[to_add_size++] = strdup(value);
      } else if (str_eq(keyword, "BCP_delete_proc")) {
	 sscanf(value, "t%x", &tid_delete[delete_proc_num++]);
      } else if (str_eq(keyword, "BCP_lp_process")) {
	 lp_mach[lp_num++] = strdup(value);
      } else if (str_eq(keyword, "BCP_cg_process")) {
	 cg_mach[cg_num++] = strdup(value);
      } else if (str_eq(keyword, "BCP_vg_process")) {
	 vg_mach[vg_num++] = strdup(value);
      } else if (str_eq(keyword, "BCP_cp_process")) {
	 cp_mach[cp_num++] = strdup(value);
      } else if (str_eq(keyword, "BCP_vp_process")) {
	 vp_mach[vp_num++] = strdup(value);
      }
   }
   ctl.close();

   // Check that machine deletions and additions are correct

   char ** last = 0;

   // Are there duplicates on the to be deleted list ?
   if (to_delete_size > 0) {
      sort(to_delete, to_delete + to_delete_size, str_lt);
      last = unique(to_delete, to_delete + to_delete_size, str_eq);
      if (to_delete_size != last - to_delete)
	 stop("A machine to be deleted is listed twice... Aborting.\n");
   }

   // Are there duplicates on the to be added list?
   if (to_add_size > 0) {
      sort(to_add, to_add + to_add_size, str_lt);
      last = unique(to_add, to_add + to_add_size, str_eq);
      if (to_add_size != last - to_add)
	 stop("A machine to be added is listed twice... Aborting.\n");
   }

   int i;
   char ** mach_list = new char*[nhost + to_add_size];
   for (i = 0; i < nhost; ++i)
      mach_list[i] = strdup(hostp[i].hi_name);
   sort(mach_list, mach_list + nhost, str_lt);

   char ** current_list = new char*[nhost + to_add_size];

   // Is there a nonexisting machine to be deleted?
   if (to_delete_size > 0) {
      last = set_difference(to_delete, to_delete + to_delete_size,
			    mach_list, mach_list + nhost,
			    current_list, str_lt);
      if (last != current_list)
	 stop("A nonexisting machine is to be deleted... Aborting.\n");
      last = set_difference(mach_list, mach_list + nhost,
			    to_delete, to_delete + to_delete_size,
			    current_list, str_lt);
      ::swap(mach_list, current_list);
   }

   // Is there an already existing machine to be added?
   if (to_add_size > 0) {
      last = set_intersection(to_add, to_add + to_add_size,
			      mach_list, mach_list + nhost,
			      current_list, str_lt);
      if (last != current_list)
	 stop("A machine to be added is already there... Aborting.\n");
      last = merge(to_add, to_add + to_add_size,
		   mach_list, mach_list + nhost,
		   current_list, str_lt);
      ::swap(mach_list, current_list);
   }

   const int mach_num = nhost - to_delete_size + to_add_size;

   // Check that the machines the new processes are supposed to be started on
   // really exist.

   if (lp_num > 0) {
      sort(lp_mach, lp_mach + lp_num, str_lt);
      if (set_difference(lp_mach, lp_mach + lp_num,
			 mach_list, mach_list + mach_num,
			 current_list, str_lt) != current_list)
	 stop("An lp machine is not in the final machine list... Aborting.\n");
   }
   if (cg_num > 0) {
      sort(cg_mach, cg_mach + cg_num, str_lt);
      if (set_difference(cg_mach, cg_mach + cg_num,
			 mach_list, mach_list + mach_num,
			 current_list, str_lt) != current_list)
	 stop("An cg machine is not in the final machine list... Aborting.\n");
   }
   if (vg_num > 0) {
      sort(vg_mach, vg_mach + vg_num, str_lt);
      if (set_difference(vg_mach, vg_mach + vg_num,
			 mach_list, mach_list + mach_num,
			 current_list, str_lt) != current_list)
	 stop("An vg machine is not in the final machine list... Aborting.\n");
   }
   if (cp_num > 0) {
      sort(cp_mach, cp_mach + cp_num, str_lt);
      if (set_difference(cp_mach, cp_mach + cp_num,
			 mach_list, mach_list + mach_num,
			 current_list, str_lt) != current_list)
	 stop("An cp machine is not in the final machine list... Aborting.\n");
   }
   if (vp_num > 0) {
      sort(vp_mach, vp_mach + vp_num, str_lt);
      if (set_difference(vp_mach, vp_mach + vp_num,
			 mach_list, mach_list + mach_num,
			 current_list, str_lt) != current_list)
	 stop("An vp machine is not in the final machine list... Aborting.\n");
   }

   // Find the tree manager
   find_tree_manager(pid, tm_tid);

   // Check that the TM is not on one of the machines to be deleted.
   if (to_delete_size > 0) {
      const int dtid = pvm_tidtohost(tm_tid);
      for (i = 0; i < nhost; ++i) {
	 if (hostp[i].hi_tid == dtid)
	    for (int j = 0; j < to_delete_size; ++j) {
	       if (str_eq(hostp[i].hi_name, to_delete[j]))
		  stop("Can't delete the machine the TM is on. Aborting.\n");
	    }
      }
   }

   // Check that the TM is not one of the processes to be deleted
   if (delete_proc_num > 0) {
      if (find(tid_delete, tid_delete + delete_proc_num, tm_tid) !=
	  tid_delete + delete_proc_num)
	 stop("Can't delete the TM... Aborting.\n");
   }

   // Modify the machine configuration
   if (to_delete_size > 0 || to_add_size > 0) {
      int * infos = new int[max(to_delete_size, to_add_size)];
      if (to_delete_size > 0)
	 if (pvm_delhosts(to_delete, to_delete_size, infos) < 0) {
	    printf("Failed to delete all specified machines...\n");
	    stop("Please check the situation manually... Aborting.\n");
	 }
      if (to_add_size > 0)
	 if (pvm_addhosts(to_add, to_add_size, infos) < 0) {
	    printf("Failed to add all specified machines...\n");
	    stop("Please check the situation manually... Aborting.\n");
	 }
   }

   // Kill the processes to be killed
   for (i = 0; i < delete_proc_num; ++i)
      pvm_kill(tid_delete[i]);

   // Put together a message to be sent to the TM that contains the machine
   // names on which the new processes should be spawned
   int len = (lp_num + cg_num + vg_num + cp_num + vp_num) * sizeof(int);
   if (len > 0) {
      len += 5 * sizeof(int);
      for (i = 0; i < lp_num; ++i) len += strlen(lp_mach[i]);
      for (i = 0; i < cg_num; ++i) len += strlen(cg_mach[i]);
      for (i = 0; i < vg_num; ++i) len += strlen(vg_mach[i]);
      for (i = 0; i < cp_num; ++i) len += strlen(cp_mach[i]);
      for (i = 0; i < vp_num; ++i) len += strlen(vp_mach[i]);

      char * buf = new char[len];

      memcpy(buf, &lp_num, sizeof(int));
      buf += sizeof(int);
      for (i = 0; i < lp_num; ++i) {
	 const int l = strlen(lp_mach[i]);
	 memcpy(buf, &l, sizeof(int));
	 buf += sizeof(int);
	 memcpy(buf, lp_mach[i], l);
	 buf += l;
      }

      memcpy(buf, &cg_num, sizeof(int));
      buf += sizeof(int);
      for (i = 0; i < cg_num; ++i) {
	 const int l = strlen(cg_mach[i]);
	 memcpy(buf, &l, sizeof(int));
	 buf += sizeof(int);
	 memcpy(buf, cg_mach[i], l);
	 buf += l;
      }

      memcpy(buf, &vg_num, sizeof(int));
      buf += sizeof(int);
      for (i = 0; i < vg_num; ++i) {
	 const int l = strlen(vg_mach[i]);
	 memcpy(buf, &l, sizeof(int));
	 buf += sizeof(int);
	 memcpy(buf, vg_mach[i], l);
	 buf += l;
      }

      memcpy(buf, &cp_num, sizeof(int));
      buf += sizeof(int);
      for (i = 0; i < cp_num; ++i) {
	 const int l = strlen(cp_mach[i]);
	 memcpy(buf, &l, sizeof(int));
	 buf += sizeof(int);
	 memcpy(buf, cp_mach[i], l);
	 buf += l;
      }

      memcpy(buf, &vp_num, sizeof(int));
      buf += sizeof(int);
      for (i = 0; i < vp_num; ++i) {
	 const int l = strlen(vp_mach[i]);
	 memcpy(buf, &l, sizeof(int));
	 buf += sizeof(int);
	 memcpy(buf, vp_mach[i], l);
	 buf += l;
      }

      buf -= len;

      pvm_initsend(PvmDataRaw);
      pvm_pkbyte(buf, len, 1);
      pvm_send(tm_tid, BCP_CONFIG_CHANGE);

      int bufid = pvm_recv(tm_tid, -1);
      int bytes = 0, msgtag = 0;
      pvm_bufinfo(bufid, &bytes, &msgtag, &tm_tid);
      if (msgtag == BCP_CONFIG_ERROR)
	 stop("TM had difficulties. Please check the situation manually.\n");
   }

   pvm_exit();
   return 0;
}