/* local push */ void ss_put_work(StealStack *s, void* node_c) { StealStackNode* n; void *work; /* If the stack is empty, push an empty StealStackNode. */ if (deq_isEmpty(localQueue)) { n = malloc(sizeof(StealStackNode)); work = malloc(s->chunk_size*s->work_size); if (!n || !work) ss_error("ss_put_work(): Out of virtual memory", 3); n->work = work; n->head = 0; deq_pushFront(localQueue, n); } n = deq_peekFront(localQueue); /* If the current StealStackNode is full, push a new one. */ if (n->head == s->chunk_size) { n = malloc(sizeof(StealStackNode)); work = malloc(s->chunk_size*s->work_size); if (!n || !work) ss_error("ss_put_work(): Out of virtual memory", 3); n->head = 0; n->work = work; deq_pushFront(localQueue, n); } else if (n->head > s->chunk_size) ss_error("ss_put_work(): Block has overflowed!", 3); /* Copy the work to the local queue, increment head */ memcpy(((uint8_t*)n->work)+(s->work_size*n->head), node_c, s->work_size); n->head++; s->localWork++; s->maxStackDepth = max(s->globalWork + s->localWork, s->maxStackDepth); /* If there is sufficient local work, release a chunk to the global queue */ if (s->localWork > 2*s->chunk_size) { if (s->nNodes % polling_interval == 0) { #ifndef TRACE ss_setState(s, SS_OVH); #endif release(s); #ifndef TRACE ss_setState(s, SS_WORK); #endif } } }
/* initialize the stack */ StealStack* ss_init(int *argc, char ***argv) { StealStack* s = &stealStack; //only one s per thread needed /* Start up MPI */ MPI_Init(argc, argv); MPI_Comm_size(MPI_COMM_WORLD, &comm_size); MPI_Comm_rank(MPI_COMM_WORLD, &comm_rank); if (comm_rank == 0 && comm_size == 1) ss_error("Error: Worksharing requires 2 or more MPI processes (1 work server, >= 1 worker)", 1); /* Reset timestamps */ msg_counter = 0; s->globalWork = 0; s->localWork = 0; s->nNodes = 0; s->nLeaves = 0; s->nAcquire = 0; s->nRelease = 0; s->nSteal = 0; s->nFail = 0; s->maxStackDepth = 0; s->maxTreeDepth = 0; localQueue = deq_create(); globalQueue = deq_create(); mkEmpty(s); return s; }
/* release k values from bottom of local stack */ void release(StealStack *s) { StealStackNode *node; void *work; int work_queue_id; #ifdef NONBLOCK MPI_Status status; #endif /* Get a node from the back of the queue to release */ node = deq_popBack(localQueue); if (node) { /* If this node is not full we can't release it. */ if (node->head != s->chunk_size) ss_error("release(): Attempted to release a non-full node", 1); work = node->work; work_queue_id = getWorkQueueId(); ctrk_put(comm_rank, work); #ifdef NONBLOCK if (rls_handle != MPI_REQUEST_NULL) { MPI_Wait(&rls_handle, &status); free(rls_buff); } rls_handle = work; MPI_Isend(rls_handle, s->work_size*s->chunk_size, MPI_BYTE, work_queue_id, MPI_MAKEWORKGLOBAL_TAG, MPI_COMM_WORLD, &rls_handle); #else MPI_Send(work, s->work_size*s->chunk_size, MPI_BYTE, work_queue_id, MPI_MAKEWORKGLOBAL_TAG, MPI_COMM_WORLD); #endif /* NONBLOCK */ ++msg_counter; free(node); s->localWork -= s->chunk_size; s->nRelease ++; } else ss_error("release(): Do not have a chunk to release", 1); }
/** if no work is found no local work is found, and none can be stolen, return original s and c is null if work is found, return the StealStack and set c to return node **/ int ss_get_work(StealStack *s, void* node_c) { StealStackNode* n; /* Call ensureLocalWork() to make sure there is work on our local queue. * If the local queue is empty, this will get work from the global queue */ if (ensureLocalWork(s) == -1) { if (DEBUG & 1) printf("StealStack::pop - stack is empty and no work can be found"); ss_setState(s, SS_IDLE); node_c = NULL; return STATUS_TERM; } /* We have work */ ss_setState(s, SS_WORK); /* ensureLocalWork() ensures that the local work queue is not empty, * so at this point we know there must be work available */ n = deq_peekFront(localQueue); /* head always points at the next free entry in the work array */ n->head--; memcpy(node_c,((uint8_t*)n->work) + ((s->work_size)*(n->head)),s->work_size); /* This chunk in the queue is empty so dequeue it */ if(n->head == 0) { deq_popFront(localQueue); free(n->work); free(n); } else if (n->head < 0) { /* This happens if an empty chunk is left on the queue */ fprintf(stderr, "ss_get_work(): called with n->head = 0, s->localWork=%d or %d (mod %d)\n", s->localWork, s->localWork % s->chunk_size, s->chunk_size); ss_error("ss_get_work(): Underflow!", 5); } s->nNodes++; s->localWork--; return STATUS_HAVEWORK; }
int ss_listen (int sci_idx) { char *cp; ss_data *info; sigret_t (*sig_int)(int), (*old_sig_cont)(int); char input[BUFSIZ]; sigset_t omask, igmask; int code; jmp_buf old_jmpb; ss_data *old_info = current_info; char *line; current_info = info = ss_info(sci_idx); sig_cont = (sigret_t (*)(int)) 0; info->abort = 0; sigemptyset(&igmask); sigaddset(&igmask, SIGINT); sigprocmask(SIG_BLOCK, &igmask, &omask); memcpy(old_jmpb, listen_jmpb, sizeof(jmp_buf)); sig_int = signal(SIGINT, listen_int_handler); setjmp(listen_jmpb); sigprocmask(SIG_SETMASK, &omask, (sigset_t *) 0); while(!info->abort) { old_sig_cont = sig_cont; sig_cont = signal(SIGCONT, print_prompt); if (sig_cont == print_prompt) sig_cont = old_sig_cont; if (info->readline) { line = (*info->readline)(current_info->prompt); } else { print_prompt(0); if (fgets(input, BUFSIZ, stdin) == input) line = input; else line = NULL; input[BUFSIZ-1] = 0; } if (line == NULL) { code = SS_ET_EOF; (void) signal(SIGCONT, sig_cont); goto egress; } cp = strchr(line, '\n'); if (cp) { *cp = '\0'; if (cp == line) continue; } (void) signal(SIGCONT, sig_cont); if (info->add_history) (*info->add_history)(line); code = ss_execute_line (sci_idx, line); if (code == SS_ET_COMMAND_NOT_FOUND) { register char *c = line; while (*c == ' ' || *c == '\t') c++; cp = strchr (c, ' '); if (cp) *cp = '\0'; cp = strchr (c, '\t'); if (cp) *cp = '\0'; ss_error (sci_idx, 0, "Unknown request \"%s\". Type \"?\" for a request list.", c); } if (info->readline) free(line); } code = 0; egress: (void) signal(SIGINT, sig_int); memcpy(listen_jmpb, old_jmpb, sizeof(jmp_buf)); current_info = old_info; return code; }
void ss_perror(int sci_idx, long code, char const *msg) /* for compatibility */ { ss_error (sci_idx, code, "%s", msg); }
void showStats() { int i, j; counter_t tnodes = 0, tleaves = 0, trel = 0, tacq = 0, tsteal = 0, tfail= 0; counter_t mdepth = 0, mheight = 0; double twork = 0.0, tsearch = 0.0, tidle = 0.0, tovh = 0.0; double max_times[SS_NSTATES]; double min_times[SS_NSTATES]; double elapsedSecs; int num_workers; StealStack *stealStack; stealStack = malloc(sizeof(StealStack)*ss_get_num_threads()); if (!stealStack) ss_error("showStats(): out of memory\n", 10); /* Gather the stats and return if I'm not the one that has them */ if (!ss_gather_stats(stealStack, &num_workers)) return; for (i = 0; i < SS_NSTATES; i++) { max_times[i] = 0.0; min_times[i] = stealStack[0].time[i]; } elapsedSecs = stealStack[0].walltime; // combine measurements from all threads for (i = 0; i < num_workers; i++) { tnodes += stealStack[i].nNodes; tleaves += stealStack[i].nLeaves; trel += stealStack[i].nRelease; tacq += stealStack[i].nAcquire; tsteal += stealStack[i].nSteal; tfail += stealStack[i].nFail; twork += stealStack[i].time[SS_WORK]; tsearch += stealStack[i].time[SS_SEARCH]; tidle += stealStack[i].time[SS_IDLE]; tovh += stealStack[i].time[SS_OVH]; mdepth = max(mdepth, stealStack[i].maxStackDepth); mheight = max(mheight, stealStack[i].maxTreeDepth); for (j = 0; j < SS_NSTATES; j++) { if (max_times[j] < stealStack[i].time[j]) max_times[j] = stealStack[i].time[j]; if (min_times[j] > stealStack[i].time[j]) min_times[j] = stealStack[i].time[j]; } } if (trel != tacq + tsteal) { printf("*** error! total released != total acquired + total stolen\n"); } uts_showStats(ss_get_num_threads(), chunkSize, elapsedSecs, tnodes, tleaves, mheight); if (verbose > 1) { printf("Total chunks released = %d, of which %d reacquired and %d stolen\n", trel, tacq, tsteal); printf("Failed steals = %d, Max queue size = %d\n", tfail, mdepth); printf("Avg time per thread: Work = %.6f, Overhead = %6f, Search = %.6f, Idle = %.6f.\n", (twork / ss_get_num_threads()), (tovh / ss_get_num_threads()), (tsearch / ss_get_num_threads()), (tidle / ss_get_num_threads())); printf("Min time per thread: Work = %.6f, Overhead = %6f, Search = %.6f, Idle = %.6f.\n", min_times[SS_WORK], min_times[SS_OVH], min_times[SS_SEARCH], min_times[SS_IDLE]); printf("Max time per thread: Work = %.6f, Overhead = %6f, Search = %.6f, Idle = %.6f.\n\n", max_times[SS_WORK], max_times[SS_OVH], max_times[SS_SEARCH], max_times[SS_IDLE]); } // per thread execution info if (verbose > 2) { for (i = 0; i < num_workers; i++) { printf("** Thread %d\n", i); printf(" # nodes explored = %d\n", stealStack[i].nNodes); printf(" # chunks released = %d\n", stealStack[i].nRelease); printf(" # chunks reacquired = %d\n", stealStack[i].nAcquire); printf(" # chunks stolen = %d\n", stealStack[i].nSteal); printf(" # failed steals = %d\n", stealStack[i].nFail); printf(" maximum stack depth = %d\n", stealStack[i].maxStackDepth); printf(" work time = %.6f secs (%d sessions)\n", stealStack[i].time[SS_WORK], stealStack[i].entries[SS_WORK]); printf(" overhead time = %.6f secs (%d sessions)\n", stealStack[i].time[SS_OVH], stealStack[i].entries[SS_OVH]); printf(" search time = %.6f secs (%d sessions)\n", stealStack[i].time[SS_SEARCH], stealStack[i].entries[SS_SEARCH]); printf(" idle time = %.6f secs (%d sessions)\n", stealStack[i].time[SS_IDLE], stealStack[i].entries[SS_IDLE]); printf("\n"); } } #ifdef TRACE ss_printTrace(stealStack, num_workers); #endif }
int ss_listen (int sci_idx) { char *cp; ss_data *info; sigret_t (*sig_int)(int), (*sig_cont)(int), (*old_sig_cont)(int); char input[BUFSIZ]; char buffer[BUFSIZ]; char *end = buffer; #ifdef POSIX_SIGNALS sigset_t omask, igmask; #else int mask; #endif int code; jmp_buf old_jmpb; ss_data *old_info = current_info; current_info = info = ss_info(sci_idx); sig_cont = (sigret_t (*)(int)) 0; info->abort = 0; #ifdef POSIX_SIGNALS sigemptyset(&igmask); sigaddset(&igmask, SIGINT); sigprocmask(SIG_BLOCK, &igmask, &omask); #else mask = sigblock(sigmask(SIGINT)); #endif memcpy(old_jmpb, listen_jmpb, sizeof(jmp_buf)); sig_int = signal(SIGINT, listen_int_handler); setjmp(listen_jmpb); #ifdef POSIX_SIGNALS sigprocmask(SIG_SETMASK, &omask, (sigset_t *) 0); #else (void) sigsetmask(mask); #endif while(!info->abort) { print_prompt(0); *end = '\0'; old_sig_cont = sig_cont; sig_cont = signal(SIGCONT, print_prompt); if (sig_cont == print_prompt) sig_cont = old_sig_cont; if (fgets(input, BUFSIZ, stdin) != input) { code = SS_ET_EOF; goto egress; } cp = strchr(input, '\n'); if (cp) { *cp = '\0'; if (cp == input) continue; } (void) signal(SIGCONT, sig_cont); for (end = input; *end; end++) ; code = ss_execute_line (sci_idx, input); if (code == SS_ET_COMMAND_NOT_FOUND) { register char *c = input; while (*c == ' ' || *c == '\t') c++; cp = strchr (c, ' '); if (cp) *cp = '\0'; cp = strchr (c, '\t'); if (cp) *cp = '\0'; ss_error (sci_idx, 0, "Unknown request \"%s\". Type \"?\" for a request list.", c); } } code = 0; egress: (void) signal(SIGINT, sig_int); memcpy(listen_jmpb, old_jmpb, sizeof(jmp_buf)); current_info = old_info; return code; }
void doWorkQueueManager(int size, StealStack *s) { MPI_Request request[size*3]; //make one array so we can do a Waitall on all comm MPI_Request *req_make_global = &request[0]; MPI_Request *req_work_request = &request[size]; MPI_Request *req_response = &request[2*size]; MPI_Status request_status; //, send_status; MPI_Status wait_all_status[3*size]; void *shared_work_buf[size]; unsigned long work_request_buf[size]; int flag, who, i; struct waiting_entry waiting[size]; unsigned long timestamps[size]; unsigned long msg_counts[size]; int work_response_send_count=0; int done=0; #ifdef TRACE_RELEASES /* Track releases */ ss_setState(s, SS_WORK); #else /* Attribute the WQM's time to overhead */ ss_setState(s, SS_WORK); ss_setState(s, SS_IDLE); #endif /* Init the receieve buffers */ for(i = 0; i < size; i++) { waiting[i].flag = 0; /*init waiting to not waiting*/ waiting[i].buf = NULL; /*init waiting to not waiting*/ timestamps[i] = 0; msg_counts[i] = 0; shared_work_buf[i] = malloc(s->work_size*s->chunk_size); } /* Setup non-block recieves for communicating with workers */ for(i=0; i < size; i++) { /* Listen for work releases */ MPI_Irecv(shared_work_buf[i], s->work_size*s->chunk_size, MPI_BYTE, i, MPI_MAKEWORKGLOBAL_TAG, MPI_COMM_WORLD, &req_make_global[i]); /* Listen for work requests (A WORKREQUEST should be the chunksize requested) */ MPI_Irecv(&work_request_buf[i], 1, MPI_LONG, i, MPI_WORKREQUEST_TAG, MPI_COMM_WORLD, &req_work_request[i]); } /** BEGIN WORK MANAGEMENT LOOP */ while(!done) { /* Wait for someone to send work or to request work */ MPI_Waitany(2*size, request, &who, &request_status); /* Sending shared work to the queue */ if(who < size) { void *w = malloc(s->work_size*s->chunk_size); #ifdef TRACE_RELEASES /* Mark this release as a "steal" event */ ss_markSteal(s, who); ss_setState(s, SS_SEARCH); ss_setState(s, SS_WORK); #endif /* Update timestamp */ msg_counts[who]++; memcpy(w, shared_work_buf[who], s->work_size*s->chunk_size); deq_pushFront(globalQueue, w); s->globalWork += s->chunk_size; MPI_Irecv(shared_work_buf[who], s->work_size*s->chunk_size, MPI_BYTE, who, MPI_MAKEWORKGLOBAL_TAG, MPI_COMM_WORLD, &req_make_global[who]); } /* Requesting shared work from the queue */ else { // (who >= size) who -= size; /* mark this id is waiting for work */ waiting[who].flag = 1; /* Update timestamp */ msg_counts[who]++; timestamps[who] = work_request_buf[who]; /* This should be an invariant.. */ if (timestamps[who] < msg_counts[who]) { ss_error("WQM: message delivery failure!\n", 10); } MPI_Irecv(&work_request_buf[who], 1, MPI_LONG, who, MPI_WORKREQUEST_TAG, MPI_COMM_WORLD, &req_work_request[who]); } /* finish last round of sends before start to send more data */ if (work_response_send_count) { MPI_Waitall(work_response_send_count, req_response, wait_all_status); // Free all the buffers used in the last round for (i = 0; i < size; i++) { if (waiting[i].buf != NULL) { free(waiting[i].buf); waiting[i].buf = NULL; } } } /* Attempt to send work to everyone who is waiting. */ work_response_send_count = 0; for (i = 0; i < size; i++) { if (waiting[i].flag && !deq_isEmpty(globalQueue)) { void* work_ptr = deq_popFront(globalQueue); MPI_Isend(work_ptr, s->work_size*s->chunk_size, MPI_BYTE, i, MPI_RESPONDWORK_TAG, MPI_COMM_WORLD, &req_response[work_response_send_count]); work_response_send_count++; s->globalWork -= s->chunk_size; waiting[i].flag = 0; waiting[i].buf = work_ptr; } } /** Check for termination **/ /* If everyone is still waiting and there are no outstanding messages then we are done. */ done = 1; for(i=0; i < size; i++) { if(!waiting[i].flag || (msg_counts[i] != timestamps[i])) { done=0; break; //no need to check everyone else } } /* Sanity check */ if(done && !deq_isEmpty(globalQueue)) { ss_error("WQM: Something evil happened. We are terminating but I still have work!", 13); } } /* END: while (!done) */ if (DEBUG & 2) printf("Queue Manager: We are done. Letting everyone know.\n"); /* This is a sanity test to make sure our prioritazation above works. If this testany were to return true, the cancels below would error out. */ MPI_Testany(2*size, request, &who, &flag, &request_status); if (flag) { ss_error("WQM: Attempted to terminate with inbound work!", 13); } /* Cancel the outstanding MPI_Irecvs */ for (i = 0; i < size; i++) { MPI_Cancel(&req_make_global[i]); MPI_Cancel(&req_work_request[i]); } /* send a msg to everyone that no work exists, everyone should be waiting on an MPI_recv here */ work_response_send_count = 0; for(i=0; i < size; i++) { MPI_Isend(NULL, 0, MPI_BYTE, i, MPI_RESPONDWORK_TAG, MPI_COMM_WORLD, &req_response[i]); work_response_send_count++; } MPI_Waitall(work_response_send_count, req_response, wait_all_status); ss_setState(s, SS_IDLE); }
/** * ensure local work exists, find it if it doesnt * returns process id where work is stolen from if no can be found locally * returns -1 if no local work exists and none could be stolen **/ int ensureLocalWork(StealStack *s) { int work_queue_id = getWorkQueueId(); int work_rcv; void *work_ptr; StealStackNode *node; MPI_Status status; if (s->localWork < 0) ss_error("ensureLocalWork(): localWork count is less than 0!", 2); /* If no more work */ if (s->localWork == 0) { ss_setState(s, SS_SEARCH); work_ptr = malloc(s->work_size*s->chunk_size); node = (StealStackNode*)malloc(sizeof(StealStackNode)); if (!work_ptr || !node) ss_error("ensureLocalWork(): Out of virtual memory.", 10); /* Send a work request - will block until work is available or program terminates. */ #ifdef NONBLOCK MPI_Wait(&rls_handle, &status); #endif ++msg_counter; // Increase our timestamp MPI_Send(&msg_counter, 1, MPI_LONG, work_queue_id, MPI_WORKREQUEST_TAG, MPI_COMM_WORLD); MPI_Recv(work_ptr, s->work_size*s->chunk_size, MPI_BYTE, work_queue_id, MPI_RESPONDWORK_TAG, MPI_COMM_WORLD, &status); // FIXME: Safe to assume chunk is full? node->head = s->chunk_size; node->work = work_ptr; // FIXME: Should we check the tag instead? MPI_Get_count(&status, MPI_BYTE, &work_rcv); if (work_rcv == 0) { /* No more work, time to terminate */ if (DEBUG & 2) printf("Thread %d terminating\n", comm_rank); fflush(NULL); return -1; } else if (work_rcv != s->work_size * s->chunk_size) { ss_error("ensureLocalWork(): Work received size does not equal chunk size", 10); } ctrk_get(comm_rank, work_ptr); s->nSteal++; s->localWork += s->chunk_size; #ifdef TRACE ss_markSteal(s, getWorkQueueId()); #endif /* Push stolen work onto the back of the queue */ deq_pushBack(localQueue, node); return work_queue_id; } return 0; //local work already exists }