/**
 * @brief
 *  main - the initialization and main loop of pbs_comm
 *
 * @param[in] argc - argument count.
 * @param[in] argv - argument values.
 *
 * @return int
 * @retval 0 - success
 */
int
main(int argc, char *argv[])
{
    SC_HANDLE schManager;
    SC_HANDLE schSelf;
    int reg = 0;
    int unreg = 0;
    TCHAR szFileName[MAX_PATH];

    /* the real deal or just pbs_version and exit */
    execution_mode(argc, argv);

    if (argc > 1) {
        if (strcmp(argv[1], "-R") == 0)
            reg = 1;
        else if (strcmp(argv[1], "-U") == 0)
            unreg = 1;
        else if (strcmp(argv[1], "-N") == 0)
            stalone = 1;
    }

    if (reg || unreg) {
        schManager = OpenSCManager(0, 0, SC_MANAGER_ALL_ACCESS);
        if (schManager == 0) {
            ErrorMessage("OpenSCManager");
        }

        if (reg) {
            GetModuleFileName(0, szFileName, sizeof(szFileName) / sizeof(*szFileName));
            printf("Installing service %s\n", g_PbsCommName);
            schSelf = CreateService(schManager, g_PbsCommName, __TEXT("PBS COMM"),
                SERVICE_ALL_ACCESS, SERVICE_WIN32_OWN_PROCESS, SERVICE_AUTO_START,
                SERVICE_ERROR_NORMAL, replace_space(szFileName, ""),
                0, 0, 0, 0, 0);
            if (schSelf) {
                printf("Service %s installed successfully!\n", g_PbsCommName);
            } else {
                ErrorMessage("CreateService");
            }
            if (schSelf != 0)
                CloseServiceHandle(schSelf);
        } else if (unreg) {
            schSelf = OpenService(schManager, g_PbsCommName, DELETE);
            if (schSelf) {
                if (DeleteService(schSelf)) {
                    printf("Service %s uninstalled successfully!\n", g_PbsCommName);
                } else {
                    ErrorMessage("DeleteService");
                }
            } else {
                ErrorMessage("OpenService failed");
            }
            if (schSelf != 0)
                CloseServiceHandle(schSelf);
        }
        if (schManager != 0)
            CloseServiceHandle(schManager);
    } else if (stalone) {
        /* standalone mode: run the communication loop on the main thread */
        struct arg_param *pap;
        int i, j;

        pap = create_arg_param();
        if (pap == NULL)
            ErrorMessage("create_arg_param");

        pap->argc = argc - 1; /* don't pass the second ("-N") argument */
        for (i = j = 0; i < argc; i++) {
            if (i == 1)
                continue;
            pap->argv[j] = strdup(argv[i]);
            j++;
        }
        main_thread((void *)pap);
        free_arg_param(pap);
    } else { /* running as a service */
        SERVICE_TABLE_ENTRY rgste[] = {
            {(TCHAR *)g_PbsCommName, PbsCommMain},
            {0, 0}
        };

        if (getenv("PBS_CONF_FILE") == NULL) {
            char conf_path[80];
            char *p;
            char psave;
            struct stat sbuf;

            /* look for pbs.conf next to the "exec" directory in argv[0] */
            if ((p = strstr(argv[0], "exec")) != NULL) {
                psave = *p;
                *p = '\0';
                _snprintf(conf_path, 79, "%spbs.conf", argv[0]);
                *p = psave;
                if (stat(conf_path, &sbuf) == 0) {
                    setenv("PBS_CONF_FILE", conf_path, 1);
                }
            }
        }

        if (!StartServiceCtrlDispatcher(rgste)) {
            ErrorMessage("StartServiceCtrlDispatcher");
        }
    }
    return (0);
}
/*
 * Bounded round-robin sequentialization harness: in each round every active
 * thread is given one context-switch window whose length is an unconstrained
 * (nondeterministic) bitvector bounded by assume().
 */
int main(void)
{
    unsigned __CPROVER_bitvector[3] tmp_t0_r0;
    unsigned __CPROVER_bitvector[3] tmp_t1_r0;
    unsigned __CPROVER_bitvector[3] tmp_t2_r0;
    unsigned __CPROVER_bitvector[3] tmp_t0_r1;
    unsigned __CPROVER_bitvector[3] tmp_t1_r1;
    unsigned __CPROVER_bitvector[3] tmp_t2_r1;
    unsigned __CPROVER_bitvector[3] tmp_t0_r2;
    unsigned __CPROVER_bitvector[3] tmp_t1_r2;
    unsigned __CPROVER_bitvector[3] tmp_t2_r2;
    unsigned __CPROVER_bitvector[3] tmp_t0_r3;
    unsigned __CPROVER_bitvector[3] tmp_t1_r3;
    unsigned __CPROVER_bitvector[3] tmp_t2_r3;
    unsigned __CPROVER_bitvector[3] tmp_t0_r4;
    unsigned __CPROVER_bitvector[3] tmp_t1_r4;
    unsigned __CPROVER_bitvector[3] tmp_t2_r4;
    unsigned __CPROVER_bitvector[3] tmp_t0_r5;
    unsigned __CPROVER_bitvector[3] tmp_t1_r5;
    unsigned __CPROVER_bitvector[3] tmp_t2_r5;
    unsigned __CPROVER_bitvector[3] tmp_t0_r6;
    unsigned __CPROVER_bitvector[3] tmp_t1_r6;
    unsigned __CPROVER_bitvector[3] tmp_t2_r6;
    unsigned __CPROVER_bitvector[3] tmp_t0_r7;
    unsigned __CPROVER_bitvector[3] tmp_t1_r7;
    unsigned __CPROVER_bitvector[3] tmp_t2_r7;
    unsigned __CPROVER_bitvector[3] tmp_t0_r8;
    unsigned __CPROVER_bitvector[3] tmp_t1_r8;
    unsigned __CPROVER_bitvector[3] tmp_t2_r8;
    unsigned __CPROVER_bitvector[3] tmp_t0_r9;
    unsigned __CPROVER_bitvector[3] tmp_t1_r9;
    unsigned __CPROVER_bitvector[3] tmp_t2_r9;
    unsigned __CPROVER_bitvector[3] tmp_t0_r10;
    unsigned __CPROVER_bitvector[3] tmp_t1_r10;
    unsigned __CPROVER_bitvector[3] tmp_t2_r10;
    unsigned __CPROVER_bitvector[3] tmp_t0_r11;

    // round 0: the main thread always runs first
    thread_index = 0;
    pc_cs[0] = pc[0] + tmp_t0_r0;
    assume(pc_cs[0] > 0);
    assume(pc_cs[0] <= 4);
    main_thread();
    pc[0] = pc_cs[0];
    thread_index = 1;
    if (active_thread[thread_index] == 1) { // thr1_0
        pc_cs[1] = pc[1] + tmp_t1_r0;
        assume(pc_cs[1] <= 6);
        thr1_0(threadargs[1]);
        pc[1] = pc_cs[1];
    }
    thread_index = 2;
    if (active_thread[thread_index] == 1) { // thr2_0
        pc_cs[2] = pc[2] + tmp_t2_r0;
        assume(pc_cs[2] <= 6);
        thr2_0(threadargs[2]);
        pc[2] = pc_cs[2];
    }

    // round 1
    thread_index = 0;
    if (active_thread[thread_index] == 1) { // main
        pc_cs[0] = pc[0] + tmp_t0_r1;
        assume(pc_cs[0] <= 4);
        main_thread();
        pc[0] = pc_cs[0];
    }
    thread_index = 1;
    if (active_thread[thread_index] == 1) { // thr1_0
        pc_cs[1] = pc[1] + tmp_t1_r1;
        assume(pc_cs[1] <= 6);
        thr1_0(threadargs[thread_index]);
        pc[1] = pc_cs[1];
    }
    thread_index = 2;
    if (active_thread[thread_index] == 1) { // thr2_0
        pc_cs[2] = pc[2] + tmp_t2_r1;
        assume(pc_cs[2] <= 6);
        thr2_0(threadargs[thread_index]);
        pc[2] = pc_cs[2];
    }

    // round 2
    thread_index = 0;
    if (active_thread[thread_index] == 1) { // main
        pc_cs[0] = pc[0] + tmp_t0_r2;
        assume(pc_cs[0] <= 4);
        main_thread();
        pc[0] = pc_cs[0];
    }
    thread_index = 1;
    if (active_thread[thread_index] == 1) { // thr1_0
        pc_cs[1] = pc[1] + tmp_t1_r2;
        assume(pc_cs[1] <= 6);
        thr1_0(threadargs[thread_index]);
        pc[1] = pc_cs[1];
    }
    thread_index = 2;
    if (active_thread[thread_index] == 1) { // thr2_0
        pc_cs[2] = pc[2] + tmp_t2_r2;
        assume(pc_cs[2] <= 6);
        thr2_0(threadargs[thread_index]);
        pc[2] = pc_cs[2];
    }

    // round 3
    thread_index = 0;
    if (active_thread[thread_index] == 1) { // main
        pc_cs[0] = pc[0] + tmp_t0_r3;
        assume(pc_cs[0] <= 4);
        main_thread();
        pc[0] = pc_cs[0];
    }
    thread_index = 1;
    if (active_thread[thread_index] == 1) { // thr1_0
        pc_cs[1] = pc[1] + tmp_t1_r3;
        assume(pc_cs[1] <= 6);
        thr1_0(threadargs[thread_index]);
        pc[1] = pc_cs[1];
    }
    thread_index = 2;
    if (active_thread[thread_index] == 1) { // thr2_0
        pc_cs[2] = pc[2] + tmp_t2_r3;
        assume(pc_cs[2] <= 6);
        thr2_0(threadargs[thread_index]);
        pc[2] = pc_cs[2];
    }

    // round 4
    thread_index = 0;
    if (active_thread[thread_index] == 1) { // main
        pc_cs[0] = pc[0] + tmp_t0_r4;
        assume(pc_cs[0] <= 4);
        main_thread();
        pc[0] = pc_cs[0];
    }
    thread_index = 1;
    if (active_thread[thread_index] == 1) { // thr1_0
        pc_cs[1] = pc[1] + tmp_t1_r4;
        assume(pc_cs[1] <= 6);
        thr1_0(threadargs[thread_index]);
        pc[1] = pc_cs[1];
    }
    thread_index = 2;
    if (active_thread[thread_index] == 1) { // thr2_0
        pc_cs[2] = pc[2] + tmp_t2_r4;
        assume(pc_cs[2] <= 6);
        thr2_0(threadargs[thread_index]);
        pc[2] = pc_cs[2];
    }

    // round 5
    thread_index = 0;
    if (active_thread[thread_index] == 1) { // main
        pc_cs[0] = pc[0] + tmp_t0_r5;
        assume(pc_cs[0] <= 4);
        main_thread();
        pc[0] = pc_cs[0];
    }
    thread_index = 1;
    if (active_thread[thread_index] == 1) { // thr1_0
        pc_cs[1] = pc[1] + tmp_t1_r5;
        assume(pc_cs[1] <= 6);
        thr1_0(threadargs[thread_index]);
        pc[1] = pc_cs[1];
    }
    thread_index = 2;
    if (active_thread[thread_index] == 1) { // thr2_0
        pc_cs[2] = pc[2] + tmp_t2_r5;
        assume(pc_cs[2] <= 6);
        thr2_0(threadargs[thread_index]);
        pc[2] = pc_cs[2];
    }

    // round 6
    thread_index = 0;
    if (active_thread[thread_index] == 1) { // main
        pc_cs[0] = pc[0] + tmp_t0_r6;
        assume(pc_cs[0] <= 4);
        main_thread();
        pc[0] = pc_cs[0];
    }
    thread_index = 1;
    if (active_thread[thread_index] == 1) { // thr1_0
        pc_cs[1] = pc[1] + tmp_t1_r6;
        assume(pc_cs[1] <= 6);
        thr1_0(threadargs[thread_index]);
        pc[1] = pc_cs[1];
    }
    thread_index = 2;
    if (active_thread[thread_index] == 1) { // thr2_0
        pc_cs[2] = pc[2] + tmp_t2_r6;
        assume(pc_cs[2] <= 6);
        thr2_0(threadargs[thread_index]);
        pc[2] = pc_cs[2];
    }

    // round 7
    thread_index = 0;
    if (active_thread[thread_index] == 1) { // main
        pc_cs[0] = pc[0] + tmp_t0_r7;
        assume(pc_cs[0] <= 4);
        main_thread();
        pc[0] = pc_cs[0];
    }
    thread_index = 1;
    if (active_thread[thread_index] == 1) { // thr1_0
        pc_cs[1] = pc[1] + tmp_t1_r7;
        assume(pc_cs[1] <= 6);
        thr1_0(threadargs[thread_index]);
        pc[1] = pc_cs[1];
    }
    thread_index = 2;
    if (active_thread[thread_index] == 1) { // thr2_0
        pc_cs[2] = pc[2] + tmp_t2_r7;
        assume(pc_cs[2] <= 6);
        thr2_0(threadargs[thread_index]);
        pc[2] = pc_cs[2];
    }

    // round 8
    thread_index = 0;
    if (active_thread[thread_index] == 1) { // main
        pc_cs[0] = pc[0] + tmp_t0_r8;
        assume(pc_cs[0] <= 4);
        main_thread();
        pc[0] = pc_cs[0];
    }
    thread_index = 1;
    if (active_thread[thread_index] == 1) { // thr1_0
        pc_cs[1] = pc[1] + tmp_t1_r8;
        assume(pc_cs[1] <= 6);
        thr1_0(threadargs[thread_index]);
        pc[1] = pc_cs[1];
    }
    thread_index = 2;
    if (active_thread[thread_index] == 1) { // thr2_0
        pc_cs[2] = pc[2] + tmp_t2_r8;
        assume(pc_cs[2] <= 6);
        thr2_0(threadargs[thread_index]);
        pc[2] = pc_cs[2];
    }

    // round 9
    thread_index = 0;
    if (active_thread[thread_index] == 1) { // main
        pc_cs[0] = pc[0] + tmp_t0_r9;
        assume(pc_cs[0] <= 4);
        main_thread();
        pc[0] = pc_cs[0];
    }
    thread_index = 1;
    if (active_thread[thread_index] == 1) { // thr1_0
        pc_cs[1] = pc[1] + tmp_t1_r9;
        assume(pc_cs[1] <= 6);
        thr1_0(threadargs[thread_index]);
        pc[1] = pc_cs[1];
    }
    thread_index = 2;
    if (active_thread[thread_index] == 1) { // thr2_0
        pc_cs[2] = pc[2] + tmp_t2_r9;
        assume(pc_cs[2] <= 6);
        thr2_0(threadargs[thread_index]);
        pc[2] = pc_cs[2];
    }

    // round 10
    thread_index = 0;
    if (active_thread[thread_index] == 1) { // main
        pc_cs[0] = pc[0] + tmp_t0_r10;
        assume(pc_cs[0] <= 4);
        main_thread();
        pc[0] = pc_cs[0];
    }
    thread_index = 1;
    if (active_thread[thread_index] == 1) { // thr1_0
        pc_cs[1] = pc[1] + tmp_t1_r10;
        assume(pc_cs[1] <= 6);
        thr1_0(threadargs[thread_index]);
        pc[1] = pc_cs[1];
    }
    thread_index = 2;
    if (active_thread[thread_index] == 1) { // thr2_0
        pc_cs[2] = pc[2] + tmp_t2_r10;
        assume(pc_cs[2] <= 6);
        thr2_0(threadargs[thread_index]);
        pc[2] = pc_cs[2];
    }

    // round 11: only the main thread is resumed to finish
    thread_index = 0;
    if (active_thread[0] == 1) {
        pc_cs[0] = pc[0] + tmp_t0_r11;
        assume(pc_cs[0] <= 4);
        main_thread();
    }
    return 0;
}
int main(void)
{
    unsigned __CPROVER_bitvector[5] __cs_tmp_t0_r0 = (unsigned __CPROVER_bitvector[5])nondet_uint();
    unsigned __CPROVER_bitvector[5] __cs_tmp_t1_r0 = (unsigned __CPROVER_bitvector[5])nondet_uint();
    unsigned __CPROVER_bitvector[4] __cs_tmp_t2_r0 = (unsigned __CPROVER_bitvector[4])nondet_uint();
    unsigned __CPROVER_bitvector[4] __cs_tmp_t3_r0 = (unsigned __CPROVER_bitvector[4])nondet_uint();
    unsigned __CPROVER_bitvector[5] __cs_tmp_t4_r0 = (unsigned __CPROVER_bitvector[5])nondet_uint();
    unsigned __CPROVER_bitvector[5] __cs_tmp_t0_r1 = (unsigned __CPROVER_bitvector[5])nondet_uint();
    unsigned __CPROVER_bitvector[5] __cs_tmp_t1_r1 = (unsigned __CPROVER_bitvector[5])nondet_uint();
    unsigned __CPROVER_bitvector[4] __cs_tmp_t2_r1 = (unsigned __CPROVER_bitvector[4])nondet_uint();
    unsigned __CPROVER_bitvector[4] __cs_tmp_t3_r1 = (unsigned __CPROVER_bitvector[4])nondet_uint();
    unsigned __CPROVER_bitvector[5] __cs_tmp_t4_r1 = (unsigned __CPROVER_bitvector[5])nondet_uint();
    unsigned __CPROVER_bitvector[5] __cs_tmp_t0_r2 = (unsigned __CPROVER_bitvector[5])nondet_uint();

    /* round 0 */
    __CPROVER_assume(__cs_tmp_t0_r0 > 0);
    __cs_thread_index = 0;
    __cs_pc_cs[0] = __cs_pc[0] + __cs_tmp_t0_r0;
    __CPROVER_assume(__cs_pc_cs[0] > 0);
    __CPROVER_assume(__cs_pc_cs[0] <= 16);
    main_thread();
    __cs_pc[0] = __cs_pc_cs[0];
    if (__cs_active_thread[1] == 1) {
        __cs_thread_index = 1;
        __cs_pc_cs[1] = __cs_pc[1] + __cs_tmp_t1_r0;
        __CPROVER_assume(__cs_pc_cs[1] <= 22);
        P0_0(__cs_threadargs[1]);
        __cs_pc[1] = __cs_pc_cs[1];
    }
    if (__cs_active_thread[2] == 1) {
        __cs_thread_index = 2;
        __cs_pc_cs[2] = __cs_pc[2] + __cs_tmp_t2_r0;
        __CPROVER_assume(__cs_pc_cs[2] <= 14);
        P1_0(__cs_threadargs[2]);
        __cs_pc[2] = __cs_pc_cs[2];
    }
    if (__cs_active_thread[3] == 1) {
        __cs_thread_index = 3;
        __cs_pc_cs[3] = __cs_pc[3] + __cs_tmp_t3_r0;
        __CPROVER_assume(__cs_pc_cs[3] <= 12);
        P2_0(__cs_threadargs[3]);
        __cs_pc[3] = __cs_pc_cs[3];
    }
    if (__cs_active_thread[4] == 1) {
        __cs_thread_index = 4;
        __cs_pc_cs[4] = __cs_pc[4] + __cs_tmp_t4_r0;
        __CPROVER_assume(__cs_pc_cs[4] <= 27);
        P3_0(__cs_threadargs[4]);
        __cs_pc[4] = __cs_pc_cs[4];
    }

    /* round 1 */
    if (__cs_active_thread[0] == 1) {
        __cs_thread_index = 0;
        __cs_pc_cs[0] = __cs_pc[0] + __cs_tmp_t0_r1;
        __CPROVER_assume(__cs_pc_cs[0] >= __cs_pc[0]);
        __CPROVER_assume(__cs_pc_cs[0] <= 16);
        main_thread();
        __cs_pc[0] = __cs_pc_cs[0];
    }
    if (__cs_active_thread[1] == 1) {
        __cs_thread_index = 1;
        __cs_pc_cs[1] = __cs_pc[1] + __cs_tmp_t1_r1;
        __CPROVER_assume(__cs_pc_cs[1] >= __cs_pc[1]);
        __CPROVER_assume(__cs_pc_cs[1] <= 22);
        P0_0(__cs_threadargs[__cs_thread_index]);
        __cs_pc[1] = __cs_pc_cs[1];
    }
    if (__cs_active_thread[2] == 1) {
        __cs_thread_index = 2;
        __cs_pc_cs[2] = __cs_pc[2] + __cs_tmp_t2_r1;
        __CPROVER_assume(__cs_pc_cs[2] >= __cs_pc[2]);
        __CPROVER_assume(__cs_pc_cs[2] <= 14);
        P1_0(__cs_threadargs[__cs_thread_index]);
        __cs_pc[2] = __cs_pc_cs[2];
    }
    if (__cs_active_thread[3] == 1) {
        __cs_thread_index = 3;
        __cs_pc_cs[3] = __cs_pc[3] + __cs_tmp_t3_r1;
        __CPROVER_assume(__cs_pc_cs[3] >= __cs_pc[3]);
        __CPROVER_assume(__cs_pc_cs[3] <= 12);
        P2_0(__cs_threadargs[__cs_thread_index]);
        __cs_pc[3] = __cs_pc_cs[3];
    }
    if (__cs_active_thread[4] == 1) {
        __cs_thread_index = 4;
        __cs_pc_cs[4] = __cs_pc[4] + __cs_tmp_t4_r1;
        __CPROVER_assume(__cs_pc_cs[4] >= __cs_pc[4]);
        __CPROVER_assume(__cs_pc_cs[4] <= 27);
        P3_0(__cs_threadargs[__cs_thread_index]);
        __cs_pc[4] = __cs_pc_cs[4];
    }

    if (__cs_active_thread[0] == 1) {
        __cs_thread_index = 0;
        __cs_pc_cs[0] = __cs_pc[0] + __cs_tmp_t0_r2;
        __CPROVER_assume(__cs_pc_cs[0] >= __cs_pc[0]);
        __CPROVER_assume(__cs_pc_cs[0] <= 16);
        main_thread();
    }
    return 0;
}
// Main ------------------------------------------------------------------------------------------
int main(int argc, char **argv) {

    const Params p(argc, argv);
    CUDASetup setcuda(p.device);
    Timer timer;
    cudaError_t cudaStatus;

    // Allocate
    timer.start("Allocation");
    int n_flow_vectors = read_input_size(p);
    int best_model = -1;
    int best_outliers = n_flow_vectors;
#ifdef CUDA_8_0
    flowvector *h_flow_vector_array;
    cudaStatus = cudaMallocManaged(&h_flow_vector_array, n_flow_vectors * sizeof(flowvector));
    int *h_random_numbers;
    cudaStatus = cudaMallocManaged(&h_random_numbers, 2 * p.max_iter * sizeof(int));
    int *h_model_candidate;
    cudaStatus = cudaMallocManaged(&h_model_candidate, p.max_iter * sizeof(int));
    int *h_outliers_candidate;
    cudaStatus = cudaMallocManaged(&h_outliers_candidate, p.max_iter * sizeof(int));
    float *h_model_param_local;
    cudaStatus = cudaMallocManaged(&h_model_param_local, 4 * p.max_iter * sizeof(float));
    std::atomic_int *h_g_out_id;
    cudaStatus = cudaMallocManaged(&h_g_out_id, sizeof(std::atomic_int));
    flowvector *d_flow_vector_array = h_flow_vector_array;
    int *d_random_numbers = h_random_numbers;
    int *d_model_candidate = h_model_candidate;
    int *d_outliers_candidate = h_outliers_candidate;
    float *d_model_param_local = h_model_param_local;
    std::atomic_int *d_g_out_id = h_g_out_id;
    std::atomic_int *worklist;
    cudaStatus = cudaMallocManaged(&worklist, sizeof(std::atomic_int));
#else
    flowvector *h_flow_vector_array = (flowvector *)malloc(n_flow_vectors * sizeof(flowvector));
    int *h_random_numbers = (int *)malloc(2 * p.max_iter * sizeof(int));
    int *h_model_candidate = (int *)malloc(p.max_iter * sizeof(int));
    int *h_outliers_candidate = (int *)malloc(p.max_iter * sizeof(int));
    float *h_model_param_local = (float *)malloc(4 * p.max_iter * sizeof(float));
    std::atomic_int *h_g_out_id = (std::atomic_int *)malloc(sizeof(std::atomic_int));
    flowvector *d_flow_vector_array;
    cudaStatus = cudaMalloc((void**)&d_flow_vector_array, n_flow_vectors * sizeof(flowvector));
    int *d_random_numbers;
    cudaStatus = cudaMalloc((void**)&d_random_numbers, 2 * p.max_iter * sizeof(int));
    int *d_model_candidate;
    cudaStatus = cudaMalloc((void**)&d_model_candidate, p.max_iter * sizeof(int));
    int *d_outliers_candidate;
    cudaStatus = cudaMalloc((void**)&d_outliers_candidate, p.max_iter * sizeof(int));
    float *d_model_param_local;
    cudaStatus = cudaMalloc((void**)&d_model_param_local, 4 * p.max_iter * sizeof(float));
    int *d_g_out_id;
    cudaStatus = cudaMalloc((void**)&d_g_out_id, sizeof(int));
    ALLOC_ERR(h_flow_vector_array, h_random_numbers, h_model_candidate, h_outliers_candidate,
        h_model_param_local, h_g_out_id);
#endif
    CUDA_ERR();
    cudaDeviceSynchronize();
    timer.stop("Allocation");
    timer.print("Allocation", 1);

    // Initialize
    timer.start("Initialization");
    const int max_gpu_threads = setcuda.max_gpu_threads();
    read_input(h_flow_vector_array, h_random_numbers, p);
    cudaDeviceSynchronize();
    timer.stop("Initialization");
    timer.print("Initialization", 1);

#ifndef CUDA_8_0
    // Copy to device
    timer.start("Copy To Device");
    cudaStatus = cudaMemcpy(d_flow_vector_array, h_flow_vector_array, n_flow_vectors * sizeof(flowvector), cudaMemcpyHostToDevice);
    cudaStatus = cudaMemcpy(d_random_numbers, h_random_numbers, 2 * p.max_iter * sizeof(int), cudaMemcpyHostToDevice);
    cudaStatus = cudaMemcpy(d_model_candidate, h_model_candidate, p.max_iter * sizeof(int), cudaMemcpyHostToDevice);
    cudaStatus = cudaMemcpy(d_outliers_candidate, h_outliers_candidate, p.max_iter * sizeof(int), cudaMemcpyHostToDevice);
    cudaStatus = cudaMemcpy(d_model_param_local, h_model_param_local, 4 * p.max_iter * sizeof(float), cudaMemcpyHostToDevice);
    cudaStatus = cudaMemcpy(d_g_out_id, h_g_out_id, sizeof(int), cudaMemcpyHostToDevice);
    cudaDeviceSynchronize();
    CUDA_ERR();
    timer.stop("Copy To Device");
    timer.print("Copy To Device", 1);
#endif

    for(int rep = 0; rep < p.n_warmup + p.n_reps; rep++) {

        // Reset
        memset((void *)h_model_candidate, 0, p.max_iter * sizeof(int));
        memset((void *)h_outliers_candidate, 0, p.max_iter * sizeof(int));
        memset((void *)h_model_param_local, 0, 4 * p.max_iter * sizeof(float));
#ifdef CUDA_8_0
        h_g_out_id[0].store(0);
        if(p.alpha < 0.0 || p.alpha > 1.0) { // Dynamic partitioning
            worklist[0].store(0);
        }
#else
        h_g_out_id[0] = 0;
        cudaStatus = cudaMemcpy(d_model_candidate, h_model_candidate, p.max_iter * sizeof(int), cudaMemcpyHostToDevice);
        cudaStatus = cudaMemcpy(d_outliers_candidate, h_outliers_candidate, p.max_iter * sizeof(int), cudaMemcpyHostToDevice);
        cudaStatus = cudaMemcpy(d_model_param_local, h_model_param_local, 4 * p.max_iter * sizeof(float), cudaMemcpyHostToDevice);
        cudaStatus = cudaMemcpy(d_g_out_id, h_g_out_id, sizeof(int), cudaMemcpyHostToDevice);
        CUDA_ERR();
#endif
        cudaDeviceSynchronize();

        if(rep >= p.n_warmup)
            timer.start("Kernel");

        // Launch GPU threads
        // Kernel launch
        if(p.n_gpu_blocks > 0) {
            assert(p.n_gpu_threads <= max_gpu_threads
                && "The thread block size is greater than the maximum thread block size that can be used on this device");
            cudaStatus = call_RANSAC_kernel_block(p.n_gpu_blocks, p.n_gpu_threads, n_flow_vectors, p.max_iter,
                p.error_threshold, p.convergence_threshold, p.max_iter, p.alpha, d_model_param_local,
                d_flow_vector_array, d_random_numbers, d_model_candidate, d_outliers_candidate, (int*)d_g_out_id,
                sizeof(int)
#ifdef CUDA_8_0
                    + sizeof(int),
                (int*)worklist
#endif
                );
            CUDA_ERR();
        }

        // Launch CPU threads
        std::thread main_thread(run_cpu_threads, h_model_candidate, h_outliers_candidate, h_model_param_local,
            h_flow_vector_array, n_flow_vectors, h_random_numbers, p.max_iter, p.error_threshold,
            p.convergence_threshold, h_g_out_id, p.n_threads, p.max_iter, p.alpha
#ifdef CUDA_8_0
            , worklist);
#else
            );
#endif

        cudaDeviceSynchronize();
        main_thread.join();

        if(rep >= p.n_warmup)
            timer.stop("Kernel");

#ifndef CUDA_8_0
        // Copy back
        if(rep >= p.n_warmup)
            timer.start("Copy Back and Merge");
        int d_candidates = 0;
        if(p.alpha < 1.0) {
            cudaStatus = cudaMemcpy(&d_candidates, d_g_out_id, sizeof(int), cudaMemcpyDeviceToHost);
            cudaStatus = cudaMemcpy(&h_model_candidate[h_g_out_id[0]], d_model_candidate, d_candidates * sizeof(int), cudaMemcpyDeviceToHost);
            cudaStatus = cudaMemcpy(&h_outliers_candidate[h_g_out_id[0]], d_outliers_candidate, d_candidates * sizeof(int), cudaMemcpyDeviceToHost);
            CUDA_ERR();
        }
        h_g_out_id[0] += d_candidates;
        cudaDeviceSynchronize();
        if(rep >= p.n_warmup)
            timer.stop("Copy Back and Merge");
#endif

        // Post-processing (chooses the best model among the candidates)
        if(rep >= p.n_warmup)
            timer.start("Kernel");
        for(int i = 0; i < h_g_out_id[0]; i++) {
            if(h_outliers_candidate[i] < best_outliers) {
                best_outliers = h_outliers_candidate[i];
                best_model = h_model_candidate[i];
            }
        }
        if(rep >= p.n_warmup)
            timer.stop("Kernel");
    }