void on_init1_finished()
{
    long i;

    /* Verify that every page of the trigger region was initialized. */
    for (i = 0; i < N / PSIZE * N_LONG_PAGE; i = i + N_LONG_PAGE) {
        if (trigger[i] != 1) {
            abort();
        }
    }

    // mark this watcher committed
    trigger[0] = 3;

    /* Load the initial centroids. */
    for (i = 0; i < K; i = i + 1) {
#ifdef INPUT_PMEM
        cx[i] = pmem_in[4 * i];
        cy[i] = pmem_in[4 * i + 1];
        cz[i] = pmem_in[4 * i + 2];
        cw[i] = pmem_in[4 * i + 3];
#else
        cx[i] = x[i];
        cy[i] = y[i];
        cz[i] = z[i];
        cw[i] = w[i];
#endif
    }

    // number of iterations
    iter = 0;

#ifdef DEBUG
    output_char('i');
    output_char('t');
    output_char('e');
    output_char('r');
    output_char(':');
    output_q(-1);
    output_char(':');
    output_centroids();
#endif

    start_iteration();
    commitd;
}
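/*
 * PSIZE, N, and N_LONG_PAGE are defined elsewhere. The verification loop
 * above reads one long per page across an N-byte trigger region, so the
 * initializer presumably wrote a 1 into the first long of every page. A
 * minimal sketch of that presumed layout (all constant values here are
 * assumptions, marked with a _SKETCH suffix to avoid clashing with the
 * real definitions):
 */
#include <stddef.h>

#define PSIZE_SKETCH        4096                           /* assumed page size in bytes */
#define N_SKETCH            (1 << 20)                      /* assumed region size in bytes */
#define N_LONG_PAGE_SKETCH  (PSIZE_SKETCH / sizeof(long))  /* longs per page */

static long trigger_sketch[N_SKETCH / sizeof(long)];

/* Writer side: mark the first long of every page as initialized. */
static void mark_pages_initialized(void)
{
    size_t i;
    for (i = 0; i < N_SKETCH / PSIZE_SKETCH * N_LONG_PAGE_SKETCH; i += N_LONG_PAGE_SKETCH)
        trigger_sketch[i] = 1;
}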
/*
 * Send a status update to all connected clients
 */
void broadcast_status()
{
    struct cl_entry *current;
    struct net_status *status;
    uint8_t dcount;
    uint16_t scount;
    uint8_t tcount;
    uint8_t permission;
    int ret;

    /*
     * Start iteration over the complete client list
     * This locks the mutex for the client list
     */
    current = start_iteration();

    /* Get number of users */
    dcount = docent_exists();
    tcount = tutor_exists();
    scount = get_client_count() - dcount - tcount;

    while (current != NULL) {
        if (current == get_write_user()) {
            permission = 1;
        } else {
            permission = 0;
        }

        /* Build the status message */
        status = build_status(current->cdata->role, current->cdata->cid,
                              permission, dcount, tcount, scount);

        ret = send(current->cdata->sfd, status, sizeof(struct net_status), 0);
        if (ret < 0) {
            perror("send");
        }
        free(status);

        /* Select next client in list */
        current = iteration_next();
    }

    log_debug("broadcasting agent: status sent to all connected clients");

    /*
     * End of iteration
     * This unlocks the mutex of the client list
     */
    end_iteration();
}
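/*
 * Both broadcast routines in this file walk the client list through a
 * start_iteration / iteration_next / end_iteration protocol that holds the
 * client-list mutex for the whole traversal. Those functions are defined
 * elsewhere; what follows is only a minimal sketch of what such an API
 * could look like, assuming a singly linked list guarded by one pthread
 * mutex and a cl_entry with a `next` pointer (the head, cursor, and mutex
 * names are hypothetical).
 */
#include <pthread.h>
#include <stddef.h>

static struct cl_entry *client_head;    /* assumed list head */
static struct cl_entry *client_cursor;  /* current position of the traversal */
static pthread_mutex_t client_mutex = PTHREAD_MUTEX_INITIALIZER;

/* Lock the list and return its first entry (NULL if the list is empty). */
struct cl_entry *start_iteration(void)
{
    pthread_mutex_lock(&client_mutex);
    client_cursor = client_head;
    return client_cursor;
}

/* Advance the cursor; only valid between start_iteration and end_iteration. */
struct cl_entry *iteration_next(void)
{
    if (client_cursor != NULL)
        client_cursor = client_cursor->next;
    return client_cursor;
}

/* Drop the cursor and release the list lock. */
void end_iteration(void)
{
    client_cursor = NULL;
    pthread_mutex_unlock(&client_mutex);
}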
void *monitor_run(void *ptr)
{
    struct mt_context *thread_context = (struct mt_context *) ptr;
    struct monitor_ctx *monitor_context = (struct monitor_ctx *) thread_context->param;
    struct rtgtargets *targets = monitor_context->targets;
    struct rtgconf *config = monitor_context->config;
    unsigned poll_interval = config->interval;

    /* Use the specified next iteration value.
     * If it is set to zero, we'll run as soon as possible. */
    curms_t next_iteration = monitor_context->next_iteration;
    curms_t this_iteration = 0;

    struct timespec loopdelay = { 0, 250 * 1000 * 1000 };
    nanosleep(&loopdelay, NULL);

    while (!thread_stop_requested) {
        curms_t now = curms();

        /* If there are no threads running and it's time for a new iteration,
         * start a new iteration. */
        if (active_threads == 0 && now >= next_iteration) {
            start_iteration(targets);
            this_iteration = now;
            next_iteration = next_interval(now, poll_interval);
        }

        /* Give the threads a chance to start, so we don't
         * mistakenly think they're all done already. */
        nanosleep(&loopdelay, NULL);

        /* If there are no threads running, and we have started an iteration,
         * then we are done. End the iteration. */
        if (active_threads == 0 && this_iteration) {
            end_iteration(next_iteration, this_iteration);
            this_iteration = 0;
        }
    }

    /* Save the next iteration value, so we can reuse it if necessary. */
    monitor_context->next_iteration = next_iteration;
    return NULL;
}
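/*
 * next_interval() is not defined in this file. A minimal sketch of one
 * plausible implementation, assuming curms_t is a millisecond timestamp
 * and that iterations are meant to start on whole multiples of the poll
 * interval (both are assumptions, not confirmed by this source).
 */
typedef unsigned long long curms_t;  /* assumed: milliseconds since the epoch */

/* Round `now` up to the start of the next poll interval.
 * Example: now = 10400 ms, poll_interval = 5 s  ->  returns 15000 ms. */
curms_t next_interval(curms_t now, unsigned poll_interval)
{
    curms_t interval_ms = (curms_t) poll_interval * 1000;
    return (now / interval_ms + 1) * interval_ms;
}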
/*
 * Send a board update message to all connected clients
 *
 * Excludes the user with write access if the parameter
 * excl_w is set to 1. If the blackboard should be broadcast to all
 * connected clients, excl_w should be 0.
 */
void broadcast_blackboard(char *blackboard, int bsem_id, int excl_w)
{
    struct cl_entry *current;
    struct net_board *board;
    int ret;
    int length;

    /*
     * Prepare the blackboard message
     */
    lock_sem(bsem_id);
    length = strlen(blackboard);
    board = build_board(blackboard, length);
    unlock_sem(bsem_id);

    /*
     * Iterate over the complete client list
     * This locks the mutex of the client list
     */
    current = start_iteration();

    while (current != NULL) {
        /* Don't broadcast to the user with write access */
        if ((current == get_write_user()) && excl_w) {
            current = iteration_next();
            continue;
        }

        ret = send(current->cdata->sfd, board,
                   sizeof(struct net_header) + length, 0);
        if (ret < 0) {
            perror("send");
        }

        /* Select next client in list */
        current = iteration_next();
    }

    /* Unlock mutex of client list */
    end_iteration();

    free(board);
    log_debug("broadcasting agent: blackboard sent to all connected clients");
}
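/*
 * build_board() and the net_board wire layout are defined elsewhere. The
 * send() above transmits sizeof(struct net_header) + length bytes, which
 * suggests a fixed header immediately followed by the board text. A minimal
 * sketch under that assumption; the header fields here are hypothetical
 * (marked with a _sketch suffix), not the project's actual format.
 */
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

struct net_header_sketch {
    uint8_t  type;      /* message type tag (assumed) */
    uint16_t length;    /* payload length in bytes (assumed) */
};

struct net_board_sketch {
    struct net_header_sketch header;
    char text[];        /* board contents follow the header on the wire */
};

struct net_board_sketch *build_board_sketch(const char *blackboard, int length)
{
    struct net_board_sketch *board =
        malloc(sizeof(struct net_header_sketch) + length);
    if (board == NULL)
        return NULL;
    board->header.type = 0;                    /* placeholder tag */
    board->header.length = (uint16_t) length;  /* host order; real code may convert */
    memcpy(board->text, blackboard, length);
    return board;
}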
// Main ------------------------------------------------------------------------------------------
int main(int argc, char **argv)
{
    const Params p(argc, argv);
    CUDASetup setcuda(p.device);
    Timer timer;
    cudaError_t cudaStatus;
    int it_cpu = 0;
    int it_gpu = 0;
    int err = 0;

#ifdef LOGS
    set_iter_interval_print(10);
    char test_info[500];
    snprintf(test_info, 500, "-i %d -g %d -t %d -f %s -l %d\n",
             p.n_gpu_threads, p.n_gpu_blocks, p.n_threads,
             p.file_name, p.switching_limit);
    start_log_file("cudaSingleSourceShortestPath", test_info);
    //printf("With LOG\n");
#endif

    // Allocate
    int n_nodes, n_edges;
    // int n_nodes_o;
    read_input_size(n_nodes, n_edges, p);
    timer.start("Allocation");
    Node *h_nodes = (Node *) malloc(sizeof(Node) * n_nodes);

    // Allocate memory for the gold (reference) output
    Gold *gold = (Gold *) malloc(sizeof(Gold) * n_nodes);
    if (p.mode == 1) {
        // Read the gold
        read_gold(gold, p);
    }

    Node *d_nodes;
    cudaStatus = cudaMalloc((void **) &d_nodes, sizeof(Node) * n_nodes);
    Edge *h_edges = (Edge *) malloc(sizeof(Edge) * n_edges);
    Edge *d_edges;
    cudaStatus = cudaMalloc((void **) &d_edges, sizeof(Edge) * n_edges);
    std::atomic_int *h_color = (std::atomic_int *) malloc(sizeof(std::atomic_int) * n_nodes);
    int *d_color;
    cudaStatus = cudaMalloc((void **) &d_color, sizeof(int) * n_nodes);
    std::atomic_int *h_cost = (std::atomic_int *) malloc(sizeof(std::atomic_int) * n_nodes);
    int *d_cost;
    cudaStatus = cudaMalloc((void **) &d_cost, sizeof(int) * n_nodes);
    int *h_q1 = (int *) malloc(n_nodes * sizeof(int));
    int *d_q1;
    cudaStatus = cudaMalloc((void **) &d_q1, sizeof(int) * n_nodes);
    int *h_q2 = (int *) malloc(n_nodes * sizeof(int));
    int *d_q2;
    cudaStatus = cudaMalloc((void **) &d_q2, sizeof(int) * n_nodes);
    std::atomic_int h_head[1];
    int *d_head;
    cudaStatus = cudaMalloc((void **) &d_head, sizeof(int));
    std::atomic_int h_tail[1];
    int *d_tail;
    cudaStatus = cudaMalloc((void **) &d_tail, sizeof(int));
    std::atomic_int h_threads_end[1];
    int *d_threads_end;
    cudaStatus = cudaMalloc((void **) &d_threads_end, sizeof(int));
    std::atomic_int h_threads_run[1];
    int *d_threads_run;
    cudaStatus = cudaMalloc((void **) &d_threads_run, sizeof(int));
    int h_num_t[1];
    int *d_num_t;
    cudaStatus = cudaMalloc((void **) &d_num_t, sizeof(int));
    int h_overflow[1];
    int *d_overflow;
    cudaStatus = cudaMalloc((void **) &d_overflow, sizeof(int));
    std::atomic_int h_gray_shade[1];
    int *d_gray_shade;
    cudaStatus = cudaMalloc((void **) &d_gray_shade, sizeof(int));
    std::atomic_int h_iter[1];
    int *d_iter;
    cudaStatus = cudaMalloc((void **) &d_iter, sizeof(int));
    cudaDeviceSynchronize();
    CUDA_ERR();
    ALLOC_ERR(h_nodes, h_edges, h_color, h_cost, h_q1, h_q2);
    timer.stop("Allocation");

    // Initialize
    timer.start("Initialization");
    const int max_gpu_threads = setcuda.max_gpu_threads();
    int source;
    read_input(source, h_nodes, h_edges, p);
    for (int i = 0; i < n_nodes; i++) {
        h_cost[i].store(INF);
    }
    h_cost[source].store(0);
    for (int i = 0; i < n_nodes; i++) {
        h_color[i].store(WHITE);
    }
    h_tail[0].store(0);
    h_head[0].store(0);
    h_threads_end[0].store(0);
    h_threads_run[0].store(0);
    h_q1[0] = source;
    h_iter[0].store(0);
    h_overflow[0] = 0;
    h_gray_shade[0].store(GRAY0);
    timer.stop("Initialization");
    //timer.print("Initialization", 1);

    // Copy to device
    timer.start("Copy To Device");
    cudaStatus = cudaMemcpy(d_nodes, h_nodes, sizeof(Node) * n_nodes, cudaMemcpyHostToDevice);
    cudaStatus = cudaMemcpy(d_edges, h_edges, sizeof(Edge) * n_edges, cudaMemcpyHostToDevice);
    cudaDeviceSynchronize();
    CUDA_ERR();
    timer.stop("Copy To Device");

    for (int rep = 0; rep < p.n_reps; rep++) {
        // Reset
        for (int i = 0; i < n_nodes; i++) {
            h_cost[i].store(INF);
        }
        h_cost[source].store(0);
        for (int i = 0; i < n_nodes; i++) {
            h_color[i].store(WHITE);
        }
        it_cpu = 0;
        it_gpu = 0;
        h_tail[0].store(0);
        h_head[0].store(0);
        h_threads_end[0].store(0);
        h_threads_run[0].store(0);
        h_q1[0] = source;
        h_iter[0].store(0);
        h_overflow[0] = 0;
        h_gray_shade[0].store(GRAY0);

        // if (rep >= p.n_warmup)
        timer.start("Kernel");
#ifdef LOGS
        start_iteration();
#endif
        // Run first iteration in master CPU thread
        h_num_t[0] = 1;
        int pid;
        int index_i, index_o;
        for (index_i = 0; index_i < h_num_t[0]; index_i++) {
            pid = h_q1[index_i];
            h_color[pid].store(BLACK);
            int cur_cost = h_cost[pid].load();
            for (int i = h_nodes[pid].x; i < (h_nodes[pid].y + h_nodes[pid].x); i++) {
                int id = h_edges[i].x;
                int cost = h_edges[i].y;
                cost += cur_cost;
                h_cost[id].store(cost);
                h_color[id].store(GRAY0);
                index_o = h_tail[0].fetch_add(1);
                h_q2[index_o] = id;
            }
        }
        h_num_t[0] = h_tail[0].load();
        h_tail[0].store(0);
        h_threads_run[0].fetch_add(1);
        h_gray_shade[0].store(GRAY1);
        h_iter[0].fetch_add(1);
        // if (rep >= p.n_warmup)
        timer.stop("Kernel");

        // Pointers to input and output queues
        int *h_qin = h_q2;
        int *h_qout = h_q1;
        int *d_qin = d_q2;
        int *d_qout = d_q1;

        const int CPU_EXEC = (p.n_threads > 0) ? 1 : 0;
        const int GPU_EXEC = (p.n_gpu_blocks > 0 && p.n_gpu_threads > 0) ? 1 : 0;

        // Run subsequent iterations on CPU or GPU until the input queue is empty
        while (*h_num_t != 0) {
            if ((*h_num_t < p.switching_limit || GPU_EXEC == 0) && CPU_EXEC == 1) {
                // The number of input queue elements is below switching_limit: run on CPU
                it_cpu = it_cpu + 1;
                // if (rep >= p.n_warmup)
                timer.start("Kernel");
                // Continue until the switching_limit condition no longer holds
                while ((*h_num_t != 0) && (*h_num_t < p.switching_limit || GPU_EXEC == 0) && CPU_EXEC == 1) {
                    // Swap queues
                    if (h_iter[0] % 2 == 0) {
                        h_qin = h_q1;
                        h_qout = h_q2;
                    } else {
                        h_qin = h_q2;
                        h_qout = h_q1;
                    }
                    std::thread main_thread(run_cpu_threads, h_nodes, h_edges, h_cost, h_color,
                                            h_qin, h_qout, h_num_t, h_head, h_tail,
                                            h_threads_end, h_threads_run, h_gray_shade, h_iter,
                                            p.n_threads, p.switching_limit, GPU_EXEC);
                    main_thread.join();
                    h_num_t[0] = h_tail[0].load(); // Number of elements in output queue
                    h_tail[0].store(0);
                    h_head[0].store(0);
                    if (h_iter[0].load() % 2 == 0)
                        h_gray_shade[0].store(GRAY0);
                    else
                        h_gray_shade[0].store(GRAY1);
                }
                // if (rep >= p.n_warmup)
                timer.stop("Kernel");
            } else if ((*h_num_t >= p.switching_limit || CPU_EXEC == 0) && GPU_EXEC == 1) {
                // The number of input queue elements is at or above switching_limit: run on GPU
                it_gpu = it_gpu + 1;
                // if (rep >= p.n_warmup)
                timer.start("Copy To Device");
                cudaStatus = cudaMemcpy(d_cost, h_cost, sizeof(int) * n_nodes, cudaMemcpyHostToDevice);
                cudaStatus = cudaMemcpy(d_color, h_color, sizeof(int) * n_nodes, cudaMemcpyHostToDevice);
                cudaStatus = cudaMemcpy(d_threads_run, h_threads_run, sizeof(int), cudaMemcpyHostToDevice);
                cudaStatus = cudaMemcpy(d_threads_end, h_threads_end, sizeof(int), cudaMemcpyHostToDevice);
                cudaStatus = cudaMemcpy(d_overflow, h_overflow, sizeof(int), cudaMemcpyHostToDevice);
                cudaStatus = cudaMemcpy(d_q1, h_q1, sizeof(int) * n_nodes, cudaMemcpyHostToDevice);
                cudaStatus = cudaMemcpy(d_q2, h_q2, sizeof(int) * n_nodes, cudaMemcpyHostToDevice);
                cudaStatus = cudaMemcpy(d_iter, h_iter, sizeof(int), cudaMemcpyHostToDevice);
                cudaDeviceSynchronize();
                CUDA_ERR();
                // if (rep >= p.n_warmup)
                timer.stop("Copy To Device");
                // Continue until the switching_limit condition no longer holds
                while ((*h_num_t != 0) && (*h_num_t >= p.switching_limit || CPU_EXEC == 0) && GPU_EXEC == 1) {
                    // Swap queues
                    if (h_iter[0] % 2 == 0) {
                        d_qin = d_q1;
                        d_qout = d_q2;
                    } else {
                        d_qin = d_q2;
                        d_qout = d_q1;
                    }
                    // if (rep >= p.n_warmup)
                    timer.start("Copy To Device");
                    cudaStatus = cudaMemcpy(d_num_t, h_num_t, sizeof(int), cudaMemcpyHostToDevice);
                    cudaStatus = cudaMemcpy(d_tail, h_tail, sizeof(int), cudaMemcpyHostToDevice);
                    cudaStatus = cudaMemcpy(d_head, h_head, sizeof(int), cudaMemcpyHostToDevice);
                    cudaStatus = cudaMemcpy(d_gray_shade, h_gray_shade, sizeof(int), cudaMemcpyHostToDevice);
                    cudaDeviceSynchronize();
                    CUDA_ERR();
                    // if (rep >= p.n_warmup)
                    timer.stop("Copy To Device");
                    // if (rep >= p.n_warmup)
                    timer.start("Kernel");
                    assert(p.n_gpu_threads <= max_gpu_threads
                           && "The thread block size is greater than the maximum thread block size that can be used on this device");
                    cudaStatus = call_SSSP_gpu(p.n_gpu_blocks, p.n_gpu_threads, d_nodes, d_edges,
                                               d_cost, d_color, d_qin, d_qout, d_num_t, d_head,
                                               d_tail, d_threads_end, d_threads_run, d_overflow,
                                               d_gray_shade, d_iter, p.switching_limit, CPU_EXEC,
                                               sizeof(int) * (W_QUEUE_SIZE + 3));
                    cudaDeviceSynchronize();
                    CUDA_ERR();
                    // if (rep >= p.n_warmup)
                    timer.stop("Kernel");
                    // if (rep >= p.n_warmup)
                    timer.start("Copy Back and Merge");
                    cudaStatus = cudaMemcpy(h_tail, d_tail, sizeof(int), cudaMemcpyDeviceToHost);
                    cudaStatus = cudaMemcpy(h_iter, d_iter, sizeof(int), cudaMemcpyDeviceToHost);
                    cudaDeviceSynchronize();
                    CUDA_ERR();
                    // if (rep >= p.n_warmup)
                    timer.stop("Copy Back and Merge");
                    h_num_t[0] = h_tail[0].load(); // Number of elements in output queue
                    h_tail[0].store(0);
                    h_head[0].store(0);
                    if (h_iter[0].load() % 2 == 0)
                        h_gray_shade[0].store(GRAY0);
                    else
                        h_gray_shade[0].store(GRAY1);
                }
                // if (rep >= p.n_warmup)
                timer.start("Copy Back and Merge");
                cudaStatus = cudaMemcpy(h_cost, d_cost, sizeof(int) * n_nodes, cudaMemcpyDeviceToHost);
                cudaStatus = cudaMemcpy(h_color, d_color, sizeof(int) * n_nodes, cudaMemcpyDeviceToHost);
                cudaStatus = cudaMemcpy(h_threads_run, d_threads_run, sizeof(int), cudaMemcpyDeviceToHost);
                cudaStatus = cudaMemcpy(h_threads_end, d_threads_end, sizeof(int), cudaMemcpyDeviceToHost);
                cudaStatus = cudaMemcpy(h_overflow, d_overflow, sizeof(int), cudaMemcpyDeviceToHost);
                cudaStatus = cudaMemcpy(h_q1, d_q1, sizeof(int) * n_nodes, cudaMemcpyDeviceToHost);
                cudaStatus = cudaMemcpy(h_q2, d_q2, sizeof(int) * n_nodes, cudaMemcpyDeviceToHost);
                cudaDeviceSynchronize();
                CUDA_ERR();
                // if (rep >= p.n_warmup)
                timer.stop("Copy Back and Merge");
            }
        }
#ifdef LOGS
        end_iteration();
#endif
        // printf("IT CPU:%d\t", it_cpu);
        // printf("IT GPU:%d\n", it_gpu);
        if (p.mode == 1) {
            err = newest_verify(h_cost, n_nodes, n_nodes, gold, it_cpu, it_gpu);
        }
        //err=new_verify(h_cost, n_nodes,,it_cpu,it_gpu);
        if (err > 0) {
            printf("Errors: %d\n", err);
            read_input(source, h_nodes, h_edges, p);
            read_gold(gold, p);
        } else {
            printf(".ITERATION %d\n", rep);
        }
#ifdef LOGS
        log_error_count(err);
#endif
        // Read the input again
        //read_input(source, h_nodes, h_edges, p);
        //read_gold(gold, p);
    } // end of repetition loop

#ifdef LOGS
    end_log_file();
#endif
    // timer.print("Allocation", 1);
    //timer.print("Copy To Device", p.n_reps);
    // timer.print("Kernel", p.n_reps);
    // timer.print("Copy Back and Merge", p.n_reps);

    if (p.mode == 0) {
        create_output(h_cost, n_nodes, n_edges, std::string(p.comparison_file));
    }

    // Verify answer
    verify(h_cost, n_nodes, p.comparison_file);

    // Free memory
    timer.start("Deallocation");
    free(h_nodes);
    free(h_edges);
    free(h_color);
    free(h_cost);
    free(h_q1);
    free(h_q2);
    cudaStatus = cudaFree(d_nodes);
    cudaStatus = cudaFree(d_edges);
    cudaStatus = cudaFree(d_cost);
    cudaStatus = cudaFree(d_color);
    cudaStatus = cudaFree(d_q1);
    cudaStatus = cudaFree(d_q2);
    cudaStatus = cudaFree(d_num_t);
    cudaStatus = cudaFree(d_head);
    cudaStatus = cudaFree(d_tail);
    cudaStatus = cudaFree(d_threads_end);
    cudaStatus = cudaFree(d_threads_run);
    cudaStatus = cudaFree(d_overflow);
    cudaStatus = cudaFree(d_iter);
    cudaStatus = cudaFree(d_gray_shade);
    CUDA_ERR();
    cudaDeviceSynchronize();
    timer.stop("Deallocation");
    //timer.print("Deallocation", 1);

    // Release timers
    timer.release("Allocation");
    timer.release("Initialization");
    timer.release("Copy To Device");
    timer.release("Kernel");
    timer.release("Copy Back and Merge");
    timer.release("Deallocation");

    printf("Test Passed\n");
    return 0;
}
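/*
 * The host loop above ping-pongs between two frontier queues (h_q1/h_q2 on
 * the CPU, d_q1/d_q2 on the GPU), choosing the input and output queue from
 * the parity of the iteration counter and switching between CPU and GPU
 * workers based on frontier size. Below is a standalone sketch of just the
 * double-buffered frontier pattern, distilled from that loop; the toy graph
 * (each node's only neighbor is node+1) is hypothetical.
 */
#include <atomic>
#include <cstdio>
#include <vector>

int main()
{
    const int n = 8;
    std::vector<int> q1(n), q2(n);
    std::atomic<int> tail{0};
    int num_t = 1;   // number of elements in the input queue
    int iter = 0;
    q1[0] = 0;       // source node

    while (num_t != 0) {
        // Swap queues on iteration parity, as the SSSP host loop does.
        int *qin  = (iter % 2 == 0) ? q1.data() : q2.data();
        int *qout = (iter % 2 == 0) ? q2.data() : q1.data();

        for (int i = 0; i < num_t; i++) {
            int node = qin[i];
            // "Visit" node; push its single hypothetical neighbor node+1.
            if (node + 1 < n)
                qout[tail.fetch_add(1)] = node + 1;
        }
        num_t = tail.exchange(0);  // the output queue becomes the next input
        iter++;
    }
    std::printf("frontier levels expanded: %d\n", iter);
    return 0;
}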