// Main ------------------------------------------------------------------------------------------
int main(int argc, char **argv) {

    const Params p(argc, argv);
    CUDASetup setcuda(p.device);
    Timer timer;
    cudaError_t cudaStatus;
    int it_cpu = 0;
    int it_gpu = 0;
    int err = 0;
#ifdef LOGS
    set_iter_interval_print(10);
    char test_info[500];
    snprintf(test_info, 500, "-i %d -g %d -t %d -f %s -l %d\n", p.n_gpu_threads,
            p.n_gpu_blocks, p.n_threads, p.file_name, p.switching_limit);
    start_log_file("cudaSingleSourceShortestPath", test_info);
#endif

    // Allocate
    int n_nodes, n_edges;
    read_input_size(n_nodes, n_edges, p);
    timer.start("Allocation");
    Node *h_nodes = (Node *) malloc(sizeof(Node) * n_nodes);
    // Allocate memory for the gold (reference) output
    Gold *gold = (Gold *) malloc(sizeof(Gold) * n_nodes);
    if (p.mode == 1) {
        // Read the gold output
        read_gold(gold, p);
    }
    Node *d_nodes;
    cudaStatus = cudaMalloc((void**) &d_nodes, sizeof(Node) * n_nodes);
    Edge *h_edges = (Edge *) malloc(sizeof(Edge) * n_edges);
    Edge *d_edges;
    cudaStatus = cudaMalloc((void**) &d_edges, sizeof(Edge) * n_edges);
    std::atomic_int *h_color = (std::atomic_int *) malloc(sizeof(std::atomic_int) * n_nodes);
    int *d_color;
    cudaStatus = cudaMalloc((void**) &d_color, sizeof(int) * n_nodes);
    std::atomic_int *h_cost = (std::atomic_int *) malloc(sizeof(std::atomic_int) * n_nodes);
    int *d_cost;
    cudaStatus = cudaMalloc((void**) &d_cost, sizeof(int) * n_nodes);
    int *h_q1 = (int *) malloc(n_nodes * sizeof(int));
    int *d_q1;
    cudaStatus = cudaMalloc((void**) &d_q1, sizeof(int) * n_nodes);
    int *h_q2 = (int *) malloc(n_nodes * sizeof(int));
    int *d_q2;
    cudaStatus = cudaMalloc((void**) &d_q2, sizeof(int) * n_nodes);
    std::atomic_int h_head[1];
    int *d_head;
    cudaStatus = cudaMalloc((void**) &d_head, sizeof(int));
    std::atomic_int h_tail[1];
    int *d_tail;
    cudaStatus = cudaMalloc((void**) &d_tail, sizeof(int));
    std::atomic_int h_threads_end[1];
    int *d_threads_end;
    cudaStatus = cudaMalloc((void**) &d_threads_end, sizeof(int));
    std::atomic_int h_threads_run[1];
    int *d_threads_run;
    cudaStatus = cudaMalloc((void**) &d_threads_run, sizeof(int));
    int h_num_t[1];
    int *d_num_t;
    cudaStatus = cudaMalloc((void**) &d_num_t, sizeof(int));
    int h_overflow[1];
    int *d_overflow;
    cudaStatus = cudaMalloc((void**) &d_overflow, sizeof(int));
    std::atomic_int h_gray_shade[1];
    int *d_gray_shade;
    cudaStatus = cudaMalloc((void**) &d_gray_shade, sizeof(int));
    std::atomic_int h_iter[1];
    int *d_iter;
    cudaStatus = cudaMalloc((void**) &d_iter, sizeof(int));
    cudaDeviceSynchronize();
    CUDA_ERR();
    ALLOC_ERR(h_nodes, h_edges, h_color, h_cost, h_q1, h_q2);
    timer.stop("Allocation");

    // Initialize
    timer.start("Initialization");
    const int max_gpu_threads = setcuda.max_gpu_threads();
    int source;
    read_input(source, h_nodes, h_edges, p);
    for (int i = 0; i < n_nodes; i++) {
        h_cost[i].store(INF);
    }
    h_cost[source].store(0);
    for (int i = 0; i < n_nodes; i++) {
        h_color[i].store(WHITE);
    }
    h_tail[0].store(0);
    h_head[0].store(0);
    h_threads_end[0].store(0);
    h_threads_run[0].store(0);
    h_q1[0] = source;
    h_iter[0].store(0);
    h_overflow[0] = 0;
    h_gray_shade[0].store(GRAY0);
    timer.stop("Initialization");
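    // Note (editorial, not part of the original benchmark): the host-side state
    // uses std::atomic_int while the device mirrors are plain int, and the two
    // are copied into each other byte-wise with cudaMemcpy below. That is only
    // safe when std::atomic_int is lock-free and layout-compatible with int,
    // which holds on typical x86-64/ARM toolchains; a cheap compile-time guard:
    static_assert(sizeof(std::atomic_int) == sizeof(int),
            "cudaMemcpy between std::atomic_int and int requires identical layout");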
    // Copy to device
    timer.start("Copy To Device");
    cudaStatus = cudaMemcpy(d_nodes, h_nodes, sizeof(Node) * n_nodes, cudaMemcpyHostToDevice);
    cudaStatus = cudaMemcpy(d_edges, h_edges, sizeof(Edge) * n_edges, cudaMemcpyHostToDevice);
    cudaDeviceSynchronize();
    CUDA_ERR();
    timer.stop("Copy To Device");

    for (int rep = 0; rep < p.n_reps; rep++) {

        // Reset
        for (int i = 0; i < n_nodes; i++) {
            h_cost[i].store(INF);
        }
        h_cost[source].store(0);
        for (int i = 0; i < n_nodes; i++) {
            h_color[i].store(WHITE);
        }
        it_cpu = 0;
        it_gpu = 0;
        h_tail[0].store(0);
        h_head[0].store(0);
        h_threads_end[0].store(0);
        h_threads_run[0].store(0);
        h_q1[0] = source;
        h_iter[0].store(0);
        h_overflow[0] = 0;
        h_gray_shade[0].store(GRAY0);

        //if (rep >= p.n_warmup) timer.start("Kernel");
#ifdef LOGS
        start_iteration();
#endif

        // Run first iteration in master CPU thread
        h_num_t[0] = 1;
        int pid;
        int index_i, index_o;
        for (index_i = 0; index_i < h_num_t[0]; index_i++) {
            pid = h_q1[index_i];
            h_color[pid].store(BLACK);
            int cur_cost = h_cost[pid].load();
            for (int i = h_nodes[pid].x; i < (h_nodes[pid].y + h_nodes[pid].x); i++) {
                int id = h_edges[i].x;
                int cost = h_edges[i].y;
                cost += cur_cost;
                h_cost[id].store(cost);
                h_color[id].store(GRAY0);
                index_o = h_tail[0].fetch_add(1);
                h_q2[index_o] = id;
            }
        }
        h_num_t[0] = h_tail[0].load();
        h_tail[0].store(0);
        h_threads_run[0].fetch_add(1);
        h_gray_shade[0].store(GRAY1);
        h_iter[0].fetch_add(1);
        //if (rep >= p.n_warmup) timer.stop("Kernel");

        // Pointers to input and output queues
        int *h_qin = h_q2;
        int *h_qout = h_q1;
        int *d_qin = d_q2;
        int *d_qout = d_q1;

        const int CPU_EXEC = (p.n_threads > 0) ? 1 : 0;
        const int GPU_EXEC = (p.n_gpu_blocks > 0 && p.n_gpu_threads > 0) ? 1 : 0;

        // Run subsequent iterations on CPU or GPU until the input queue is empty
        while (*h_num_t != 0) {

            if ((*h_num_t < p.switching_limit || GPU_EXEC == 0) && CPU_EXEC == 1) {
                // Number of input queue elements is below switching_limit: run on the CPU
                it_cpu = it_cpu + 1;

                //if (rep >= p.n_warmup) timer.start("Kernel");
                // Continue until the switching_limit condition is no longer satisfied
                while ((*h_num_t != 0) && (*h_num_t < p.switching_limit || GPU_EXEC == 0) && CPU_EXEC == 1) {

                    // Swap queues
                    if (h_iter[0] % 2 == 0) {
                        h_qin = h_q1;
                        h_qout = h_q2;
                    } else {
                        h_qin = h_q2;
                        h_qout = h_q1;
                    }

                    std::thread main_thread(run_cpu_threads, h_nodes, h_edges, h_cost,
                            h_color, h_qin, h_qout, h_num_t, h_head, h_tail,
                            h_threads_end, h_threads_run, h_gray_shade, h_iter,
                            p.n_threads, p.switching_limit, GPU_EXEC);
                    main_thread.join();

                    h_num_t[0] = h_tail[0].load(); // Number of elements in output queue
                    h_tail[0].store(0);
                    h_head[0].store(0);
                    if (h_iter[0].load() % 2 == 0)
                        h_gray_shade[0].store(GRAY0);
                    else
                        h_gray_shade[0].store(GRAY1);
                }
                //if (rep >= p.n_warmup) timer.stop("Kernel");

            } else if ((*h_num_t >= p.switching_limit || CPU_EXEC == 0) && GPU_EXEC == 1) {
                // Number of input queue elements is at or above switching_limit: run on the GPU
                it_gpu = it_gpu + 1;

                //if (rep >= p.n_warmup) timer.start("Copy To Device");
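                // Note (editorial): the traversal alternates between CPU and GPU
                // without unified memory, so the full frontier state (costs, colors,
                // both queues, and the bookkeeping counters) must be pushed to the
                // device each time execution switches to the GPU, and copied back
                // when it switches away.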
                cudaStatus = cudaMemcpy(d_cost, h_cost, sizeof(int) * n_nodes, cudaMemcpyHostToDevice);
                cudaStatus = cudaMemcpy(d_color, h_color, sizeof(int) * n_nodes, cudaMemcpyHostToDevice);
                cudaStatus = cudaMemcpy(d_threads_run, h_threads_run, sizeof(int), cudaMemcpyHostToDevice);
                cudaStatus = cudaMemcpy(d_threads_end, h_threads_end, sizeof(int), cudaMemcpyHostToDevice);
                cudaStatus = cudaMemcpy(d_overflow, h_overflow, sizeof(int), cudaMemcpyHostToDevice);
                cudaStatus = cudaMemcpy(d_q1, h_q1, sizeof(int) * n_nodes, cudaMemcpyHostToDevice);
                cudaStatus = cudaMemcpy(d_q2, h_q2, sizeof(int) * n_nodes, cudaMemcpyHostToDevice);
                cudaStatus = cudaMemcpy(d_iter, h_iter, sizeof(int), cudaMemcpyHostToDevice);
                cudaDeviceSynchronize();
                CUDA_ERR();
                //if (rep >= p.n_warmup) timer.stop("Copy To Device");

                // Continue until the switching_limit condition is no longer satisfied
                while ((*h_num_t != 0) && (*h_num_t >= p.switching_limit || CPU_EXEC == 0) && GPU_EXEC == 1) {

                    // Swap queues
                    if (h_iter[0] % 2 == 0) {
                        d_qin = d_q1;
                        d_qout = d_q2;
                    } else {
                        d_qin = d_q2;
                        d_qout = d_q1;
                    }

                    //if (rep >= p.n_warmup) timer.start("Copy To Device");
                    cudaStatus = cudaMemcpy(d_num_t, h_num_t, sizeof(int), cudaMemcpyHostToDevice);
                    cudaStatus = cudaMemcpy(d_tail, h_tail, sizeof(int), cudaMemcpyHostToDevice);
                    cudaStatus = cudaMemcpy(d_head, h_head, sizeof(int), cudaMemcpyHostToDevice);
                    cudaStatus = cudaMemcpy(d_gray_shade, h_gray_shade, sizeof(int), cudaMemcpyHostToDevice);
                    cudaDeviceSynchronize();
                    CUDA_ERR();
                    //if (rep >= p.n_warmup) timer.stop("Copy To Device");

                    //if (rep >= p.n_warmup) timer.start("Kernel");
                    assert(p.n_gpu_threads <= max_gpu_threads
                            && "The thread block size is greater than the maximum thread block size that can be used on this device");
                    cudaStatus = call_SSSP_gpu(p.n_gpu_blocks, p.n_gpu_threads, d_nodes,
                            d_edges, d_cost, d_color, d_qin, d_qout, d_num_t, d_head,
                            d_tail, d_threads_end, d_threads_run, d_overflow,
                            d_gray_shade, d_iter, p.switching_limit, CPU_EXEC,
                            sizeof(int) * (W_QUEUE_SIZE + 3));
                    cudaDeviceSynchronize();
                    CUDA_ERR();
                    //if (rep >= p.n_warmup) timer.stop("Kernel");

                    //if (rep >= p.n_warmup) timer.start("Copy Back and Merge");
                    cudaStatus = cudaMemcpy(h_tail, d_tail, sizeof(int), cudaMemcpyDeviceToHost);
                    cudaStatus = cudaMemcpy(h_iter, d_iter, sizeof(int), cudaMemcpyDeviceToHost);
                    cudaDeviceSynchronize();
                    CUDA_ERR();
                    //if (rep >= p.n_warmup) timer.stop("Copy Back and Merge");

                    h_num_t[0] = h_tail[0].load(); // Number of elements in output queue
                    h_tail[0].store(0);
                    h_head[0].store(0);
                    if (h_iter[0].load() % 2 == 0)
                        h_gray_shade[0].store(GRAY0);
                    else
                        h_gray_shade[0].store(GRAY1);
                }

                //if (rep >= p.n_warmup) timer.start("Copy Back and Merge");
                cudaStatus = cudaMemcpy(h_cost, d_cost, sizeof(int) * n_nodes, cudaMemcpyDeviceToHost);
                cudaStatus = cudaMemcpy(h_color, d_color, sizeof(int) * n_nodes, cudaMemcpyDeviceToHost);
                cudaStatus = cudaMemcpy(h_threads_run, d_threads_run, sizeof(int), cudaMemcpyDeviceToHost);
                cudaStatus = cudaMemcpy(h_threads_end, d_threads_end, sizeof(int), cudaMemcpyDeviceToHost);
                cudaStatus = cudaMemcpy(h_overflow, d_overflow, sizeof(int), cudaMemcpyDeviceToHost);
                cudaStatus = cudaMemcpy(h_q1, d_q1, sizeof(int) * n_nodes, cudaMemcpyDeviceToHost);
                cudaStatus = cudaMemcpy(h_q2, d_q2, sizeof(int) * n_nodes, cudaMemcpyDeviceToHost);
                cudaDeviceSynchronize();
                CUDA_ERR();
                //if (rep >= p.n_warmup) timer.stop("Copy Back and Merge");
            }
        }

#ifdef LOGS
        end_iteration();
#endif

        if (p.mode == 1) {
            err = newest_verify(h_cost, n_nodes, n_nodes, gold, it_cpu, it_gpu);
        }
        if (err > 0) {
            printf("Errors: %d\n", err);
            // Read the input and the gold again after an error
            read_input(source, h_nodes, h_edges, p);
            read_gold(gold, p);
        } else {
            printf(".ITERATION %d\n", rep);
        }
#ifdef LOGS
        log_error_count(err);
#endif
    } // end of iteration

#ifdef LOGS
    end_log_file();
#endif

    //timer.print("Allocation", 1);
    //timer.print("Copy To Device", p.n_reps);
    //timer.print("Kernel", p.n_reps);
    //timer.print("Copy Back and Merge", p.n_reps);
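    // Note (editorial): the benchmark runs in two modes. With p.mode == 0 it
    // writes the computed costs out through create_output(), producing the
    // reference ("gold") file; with p.mode == 1 it reads that file back
    // (read_gold above) and checks every repetition against it with
    // newest_verify().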
    if (p.mode == 0) {
        create_output(h_cost, n_nodes, n_edges, std::string(p.comparison_file));
    }

    // Verify answer
    verify(h_cost, n_nodes, p.comparison_file);

    // Free memory
    timer.start("Deallocation");
    free(h_nodes);
    free(h_edges);
    free(h_color);
    free(h_cost);
    free(h_q1);
    free(h_q2);
    free(gold);
    cudaStatus = cudaFree(d_nodes);
    cudaStatus = cudaFree(d_edges);
    cudaStatus = cudaFree(d_cost);
    cudaStatus = cudaFree(d_color);
    cudaStatus = cudaFree(d_q1);
    cudaStatus = cudaFree(d_q2);
    cudaStatus = cudaFree(d_num_t);
    cudaStatus = cudaFree(d_head);
    cudaStatus = cudaFree(d_tail);
    cudaStatus = cudaFree(d_threads_end);
    cudaStatus = cudaFree(d_threads_run);
    cudaStatus = cudaFree(d_overflow);
    cudaStatus = cudaFree(d_iter);
    cudaStatus = cudaFree(d_gray_shade);
    CUDA_ERR();
    cudaDeviceSynchronize();
    timer.stop("Deallocation");

    // Release timers
    timer.release("Allocation");
    timer.release("Initialization");
    timer.release("Copy To Device");
    timer.release("Kernel");
    timer.release("Copy Back and Merge");
    timer.release("Deallocation");

    printf("Test Passed\n");
    return 0;
}
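// Editorial sketch, not part of the original benchmark: the code above stores
// every CUDA return value in cudaStatus but relies on the CUDA_ERR() macro to
// surface failures. If per-call checking were wanted, a helper along these
// lines could wrap each call (hypothetical name; assumes <cstdio> and
// <cstdlib> are reachable through the existing includes):
static void check_cuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}
// Usage: check_cuda(cudaMalloc((void **) &d_nodes, sizeof(Node) * n_nodes), "cudaMalloc d_nodes");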
// Main ------------------------------------------------------------------------------------------
int main(int argc, char **argv) {

    const Params p(argc, argv);
    CUDASetup setcuda(p.device);
    Timer timer;
    cudaError_t cudaStatus;

    // Allocate
    timer.start("Allocation");
    int n_flow_vectors = read_input_size(p);
    int best_model = -1;
    int best_outliers = n_flow_vectors;
#ifdef CUDA_8_0
    flowvector *h_flow_vector_array;
    cudaStatus = cudaMallocManaged(&h_flow_vector_array, n_flow_vectors * sizeof(flowvector));
    int *h_random_numbers;
    cudaStatus = cudaMallocManaged(&h_random_numbers, 2 * p.max_iter * sizeof(int));
    int *h_model_candidate;
    cudaStatus = cudaMallocManaged(&h_model_candidate, p.max_iter * sizeof(int));
    int *h_outliers_candidate;
    cudaStatus = cudaMallocManaged(&h_outliers_candidate, p.max_iter * sizeof(int));
    float *h_model_param_local;
    cudaStatus = cudaMallocManaged(&h_model_param_local, 4 * p.max_iter * sizeof(float));
    std::atomic_int *h_g_out_id;
    cudaStatus = cudaMallocManaged(&h_g_out_id, sizeof(std::atomic_int));
    flowvector *d_flow_vector_array = h_flow_vector_array;
    int *d_random_numbers = h_random_numbers;
    int *d_model_candidate = h_model_candidate;
    int *d_outliers_candidate = h_outliers_candidate;
    float *d_model_param_local = h_model_param_local;
    std::atomic_int *d_g_out_id = h_g_out_id;
    std::atomic_int *worklist;
    cudaStatus = cudaMallocManaged(&worklist, sizeof(std::atomic_int));
#else
    flowvector *h_flow_vector_array = (flowvector *) malloc(n_flow_vectors * sizeof(flowvector));
    int *h_random_numbers = (int *) malloc(2 * p.max_iter * sizeof(int));
    int *h_model_candidate = (int *) malloc(p.max_iter * sizeof(int));
    int *h_outliers_candidate = (int *) malloc(p.max_iter * sizeof(int));
    float *h_model_param_local = (float *) malloc(4 * p.max_iter * sizeof(float));
    std::atomic_int *h_g_out_id = (std::atomic_int *) malloc(sizeof(std::atomic_int));
    flowvector *d_flow_vector_array;
    cudaStatus = cudaMalloc((void**) &d_flow_vector_array, n_flow_vectors * sizeof(flowvector));
    int *d_random_numbers;
    cudaStatus = cudaMalloc((void**) &d_random_numbers, 2 * p.max_iter * sizeof(int));
    int *d_model_candidate;
    cudaStatus = cudaMalloc((void**) &d_model_candidate, p.max_iter * sizeof(int));
    int *d_outliers_candidate;
    cudaStatus = cudaMalloc((void**) &d_outliers_candidate, p.max_iter * sizeof(int));
    float *d_model_param_local;
    cudaStatus = cudaMalloc((void**) &d_model_param_local, 4 * p.max_iter * sizeof(float));
    int *d_g_out_id;
    cudaStatus = cudaMalloc((void**) &d_g_out_id, sizeof(int));
    ALLOC_ERR(h_flow_vector_array, h_random_numbers, h_model_candidate,
            h_outliers_candidate, h_model_param_local, h_g_out_id);
#endif
    CUDA_ERR();
    cudaDeviceSynchronize();
    timer.stop("Allocation");
    timer.print("Allocation", 1);

    // Initialize
    timer.start("Initialization");
    const int max_gpu_threads = setcuda.max_gpu_threads();
    read_input(h_flow_vector_array, h_random_numbers, p);
    cudaDeviceSynchronize();
    timer.stop("Initialization");
    timer.print("Initialization", 1);
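    // Note (editorial): with CUDA_8_0 defined, the buffers above come from
    // cudaMallocManaged, so the h_* and d_* pointers alias the same
    // unified-memory allocation and no explicit host<->device copies are
    // needed; the #ifndef block below performs those copies only for the
    // discrete-memory build.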
#ifndef CUDA_8_0
    // Copy to device
    timer.start("Copy To Device");
    cudaStatus = cudaMemcpy(d_flow_vector_array, h_flow_vector_array,
            n_flow_vectors * sizeof(flowvector), cudaMemcpyHostToDevice);
    cudaStatus = cudaMemcpy(d_random_numbers, h_random_numbers,
            2 * p.max_iter * sizeof(int), cudaMemcpyHostToDevice);
    cudaStatus = cudaMemcpy(d_model_candidate, h_model_candidate,
            p.max_iter * sizeof(int), cudaMemcpyHostToDevice);
    cudaStatus = cudaMemcpy(d_outliers_candidate, h_outliers_candidate,
            p.max_iter * sizeof(int), cudaMemcpyHostToDevice);
    cudaStatus = cudaMemcpy(d_model_param_local, h_model_param_local,
            4 * p.max_iter * sizeof(float), cudaMemcpyHostToDevice);
    cudaStatus = cudaMemcpy(d_g_out_id, h_g_out_id, sizeof(int), cudaMemcpyHostToDevice);
    cudaDeviceSynchronize();
    CUDA_ERR();
    timer.stop("Copy To Device");
    timer.print("Copy To Device", 1);
#endif

    for (int rep = 0; rep < p.n_warmup + p.n_reps; rep++) {

        // Reset
        memset((void *) h_model_candidate, 0, p.max_iter * sizeof(int));
        memset((void *) h_outliers_candidate, 0, p.max_iter * sizeof(int));
        memset((void *) h_model_param_local, 0, 4 * p.max_iter * sizeof(float));
#ifdef CUDA_8_0
        h_g_out_id[0].store(0);
        if (p.alpha < 0.0 || p.alpha > 1.0) { // Dynamic partitioning
            worklist[0].store(0);
        }
#else
        h_g_out_id[0] = 0;
        cudaStatus = cudaMemcpy(d_model_candidate, h_model_candidate, p.max_iter * sizeof(int), cudaMemcpyHostToDevice);
        cudaStatus = cudaMemcpy(d_outliers_candidate, h_outliers_candidate, p.max_iter * sizeof(int), cudaMemcpyHostToDevice);
        cudaStatus = cudaMemcpy(d_model_param_local, h_model_param_local, 4 * p.max_iter * sizeof(float), cudaMemcpyHostToDevice);
        cudaStatus = cudaMemcpy(d_g_out_id, h_g_out_id, sizeof(int), cudaMemcpyHostToDevice);
        CUDA_ERR();
#endif
        cudaDeviceSynchronize();

        if (rep >= p.n_warmup)
            timer.start("Kernel");

        // Launch GPU threads
        if (p.n_gpu_blocks > 0) {
            assert(p.n_gpu_threads <= max_gpu_threads
                    && "The thread block size is greater than the maximum thread block size that can be used on this device");
            cudaStatus = call_RANSAC_kernel_block(p.n_gpu_blocks, p.n_gpu_threads,
                    n_flow_vectors, p.max_iter, p.error_threshold, p.convergence_threshold,
                    p.max_iter, p.alpha, d_model_param_local, d_flow_vector_array,
                    d_random_numbers, d_model_candidate, d_outliers_candidate,
                    (int*) d_g_out_id, sizeof(int)
#ifdef CUDA_8_0
                    + sizeof(int), (int*) worklist
#endif
                    );
            CUDA_ERR();
        }

        // Launch CPU threads
        std::thread main_thread(run_cpu_threads, h_model_candidate, h_outliers_candidate,
                h_model_param_local, h_flow_vector_array, n_flow_vectors, h_random_numbers,
                p.max_iter, p.error_threshold, p.convergence_threshold, h_g_out_id,
                p.n_threads, p.max_iter, p.alpha
#ifdef CUDA_8_0
                , worklist);
#else
                );
#endif

        cudaDeviceSynchronize();
        main_thread.join();
        if (rep >= p.n_warmup)
            timer.stop("Kernel");

#ifndef CUDA_8_0
        // Copy back
        if (rep >= p.n_warmup)
            timer.start("Copy Back and Merge");
        int d_candidates = 0;
        if (p.alpha < 1.0) {
            cudaStatus = cudaMemcpy(&d_candidates, d_g_out_id, sizeof(int), cudaMemcpyDeviceToHost);
            cudaStatus = cudaMemcpy(&h_model_candidate[h_g_out_id[0]], d_model_candidate,
                    d_candidates * sizeof(int), cudaMemcpyDeviceToHost);
            cudaStatus = cudaMemcpy(&h_outliers_candidate[h_g_out_id[0]], d_outliers_candidate,
                    d_candidates * sizeof(int), cudaMemcpyDeviceToHost);
            CUDA_ERR();
        }
        // Append the GPU-produced candidates after the CPU-produced ones
        h_g_out_id[0] += d_candidates;
        cudaDeviceSynchronize();
        if (rep >= p.n_warmup)
            timer.stop("Copy Back and Merge");
#endif

        // Post-processing (chooses the best model among the candidates)
        if (rep >= p.n_warmup)
            timer.start("Kernel");
        for (int i = 0; i < h_g_out_id[0]; i++) {
            if (h_outliers_candidate[i] < best_outliers) {
                best_outliers = h_outliers_candidate[i];
                best_model = h_model_candidate[i];
            }
        }
        if (rep >= p.n_warmup)
            timer.stop("Kernel");
    }