Beispiel #1
0
/**
 * @brief
 *		main - the initialization and main loop of pbs_comm
 *
 *		The first command-line argument selects the mode:
 *		  -R	create the service named g_PbsCommName for this executable,
 *		  -U	delete the service named g_PbsCommName,
 *		  -N	run stand-alone by calling main_thread() directly,
 *		anything else: hand control to the service control dispatcher.
 *
 * @param[in]	argc	- argument count.
 * @param[in]	argv	- argument values.
 *
 * @return	int
 * @retval	0	- success
 * @retval	1	- the service control manager could not be opened, the
 *			  module file name could not be obtained, or the
 *			  stand-alone argument block could not be allocated
 */
int
main(int argc, char *argv[])
{
	SC_HANDLE schManager;
	SC_HANDLE schSelf;
	int reg = 0;
	int unreg = 0;
	TCHAR	  szFileName[MAX_PATH];

	/*the real deal or just pbs_version and exit*/

	execution_mode(argc, argv);

	if (argc > 1) {
		if (strcmp(argv[1], "-R") == 0)
			reg = 1;
		else if (strcmp(argv[1], "-U") == 0)
			unreg = 1;
		else if (strcmp(argv[1], "-N") == 0)
			stalone = 1;
	}

	if (reg || unreg) {
		schManager = OpenSCManager(0, 0, SC_MANAGER_ALL_ACCESS);
		if (schManager == 0) {
			ErrorMessage("OpenSCManager");
			/* neither CreateService nor OpenService can work without it */
			return (1);
		}

		if (reg) {
			/* a return of 0 means failure and leaves szFileName unset */
			if (GetModuleFileName(0, szFileName,
				sizeof(szFileName)/sizeof(*szFileName)) == 0) {
				ErrorMessage("GetModuleFileName");
				CloseServiceHandle(schManager);
				return (1);
			}
			printf("Installing service %s\n", g_PbsCommName);
			schSelf =
				CreateService(schManager, g_PbsCommName, __TEXT("PBS COMM"),
				SERVICE_ALL_ACCESS,
				SERVICE_WIN32_OWN_PROCESS,
				SERVICE_AUTO_START, SERVICE_ERROR_NORMAL,
				replace_space(szFileName, ""), 0, 0, 0, 0, 0);

			if (schSelf) {
				printf("Service %s installed succesfully!\n", g_PbsCommName);
				CloseServiceHandle(schSelf);
			} else {
				ErrorMessage("CreateService");
			}
		} else {	/* unreg */
			schSelf = OpenService(schManager, g_PbsCommName, DELETE);

			if (schSelf) {
				if (DeleteService(schSelf)) {
					printf("Service %s uninstalled successfully!\n", g_PbsCommName);
				} else {
					ErrorMessage("DeleteService");
				}
				CloseServiceHandle(schSelf);
			} else {
				ErrorMessage("OpenService failed");
			}
		}

		CloseServiceHandle(schManager);
	} else if (stalone) {
		struct arg_param *pap;
		int	i, j;

		pap = create_arg_param();
		if (pap == NULL) {
			ErrorMessage("create_arg_param");
			/* pap is dereferenced below; never continue with NULL */
			return (1);
		}

		pap->argc = argc-1;	/* don't pass the second argument */
		/*
		 * Copy every argument except argv[1] (the mode switch).
		 * NOTE(review): the capacity of pap->argv and the handling of a
		 * failed strdup() are not visible from here - confirm against
		 * create_arg_param()/free_arg_param().
		 */
		for (i=j=0; i < argc; i++) {
			if (i == 1)
				continue;
			pap->argv[j] = strdup(argv[i]);
			j++;
		}
		main_thread((void *)pap);

		free_arg_param(pap);
	} else {	/* running as a service */
		SERVICE_TABLE_ENTRY rgste[] = { {(TCHAR*)g_PbsCommName, PbsCommMain },
			{ 0, 0 } };

		if (getenv("PBS_CONF_FILE") == NULL) {
			/* sized for a full Windows path instead of a fixed 80 bytes */
			char conf_path[MAX_PATH + 1];
			char *p;
			char psave;
			struct stat sbuf;

			/*
			 * Build "<prefix>pbs.conf", where <prefix> is the part of
			 * argv[0] that precedes the first occurrence of "exec".
			 */
			if ((p = strstr(argv[0], "exec")) != NULL) {
				psave = *p;
				*p = '\0';	/* temporarily cut argv[0] at "exec" */
				_snprintf(conf_path, sizeof(conf_path) - 1, "%spbs.conf", argv[0]);
				/* _snprintf() does not terminate the buffer on truncation */
				conf_path[sizeof(conf_path) - 1] = '\0';
				*p = psave;	/* restore argv[0] */
				if (stat(conf_path, &sbuf) == 0) {
					setenv("PBS_CONF_FILE", conf_path, 1);
				}
			}
		}
		if (!StartServiceCtrlDispatcher(rgste)) {
			ErrorMessage("StartServiceCntrlDispatcher");
		}
	}
	return (0);
}
 // Sequential driver: calls main_thread(), thr1_0() and thr2_0() in a fixed
 // order for rounds 0..10, followed by one last main_thread() call.
 // Each step sets thread_index, computes pc_cs[i] = pc[i] + tmp_t<i>_r<round>,
 // bounds pc_cs[i] with assume(), calls the thread function and then stores
 // pc_cs[i] back into pc[i].
 // NOTE(review): this looks like model-checker (CPROVER) input produced by a
 // sequentialization tool. pc, pc_cs, active_thread, threadargs, thread_index
 // and assume() are declared outside this block - confirm their semantics there.
 int main(void) {
           // One 3-bit unsigned offset per (thread, round) step. None of them is
           // assigned in this function.
           // NOTE(review): presumably the verifier treats these uninitialized
           // locals as nondeterministic values - confirm.
           unsigned __CPROVER_bitvector[3] tmp_t0_r0;
           unsigned __CPROVER_bitvector[3] tmp_t1_r0;
           unsigned __CPROVER_bitvector[3] tmp_t2_r0;
           unsigned __CPROVER_bitvector[3] tmp_t0_r1;
           unsigned __CPROVER_bitvector[3] tmp_t1_r1;
           unsigned __CPROVER_bitvector[3] tmp_t2_r1;
           unsigned __CPROVER_bitvector[3] tmp_t0_r2;
           unsigned __CPROVER_bitvector[3] tmp_t1_r2;
           unsigned __CPROVER_bitvector[3] tmp_t2_r2;
           unsigned __CPROVER_bitvector[3] tmp_t0_r3;
           unsigned __CPROVER_bitvector[3] tmp_t1_r3;
           unsigned __CPROVER_bitvector[3] tmp_t2_r3;
           unsigned __CPROVER_bitvector[3] tmp_t0_r4;
           unsigned __CPROVER_bitvector[3] tmp_t1_r4;
           unsigned __CPROVER_bitvector[3] tmp_t2_r4;
           unsigned __CPROVER_bitvector[3] tmp_t0_r5;
           unsigned __CPROVER_bitvector[3] tmp_t1_r5;
           unsigned __CPROVER_bitvector[3] tmp_t2_r5;
           unsigned __CPROVER_bitvector[3] tmp_t0_r6;
           unsigned __CPROVER_bitvector[3] tmp_t1_r6;
           unsigned __CPROVER_bitvector[3] tmp_t2_r6;
           unsigned __CPROVER_bitvector[3] tmp_t0_r7;
           unsigned __CPROVER_bitvector[3] tmp_t1_r7;
           unsigned __CPROVER_bitvector[3] tmp_t2_r7;
           unsigned __CPROVER_bitvector[3] tmp_t0_r8;
           unsigned __CPROVER_bitvector[3] tmp_t1_r8;
           unsigned __CPROVER_bitvector[3] tmp_t2_r8;
           unsigned __CPROVER_bitvector[3] tmp_t0_r9;
           unsigned __CPROVER_bitvector[3] tmp_t1_r9;
           unsigned __CPROVER_bitvector[3] tmp_t2_r9;
           unsigned __CPROVER_bitvector[3] tmp_t0_r10;
           unsigned __CPROVER_bitvector[3] tmp_t1_r10;
           unsigned __CPROVER_bitvector[3] tmp_t2_r10;
           unsigned __CPROVER_bitvector[3] tmp_t0_r11;
 
           // round 0
           // Thread 0 is run without an active_thread check in this round, and
           // its new position is additionally required to be > 0.
           thread_index = 0;
           pc_cs[0] = pc[0] + tmp_t0_r0;
           assume(pc_cs[0] > 0);
           assume(pc_cs[0] <= 4);
           main_thread();
           pc[0] = pc_cs[0];
 
           // Threads 1 and 2 only run when marked active; their positions are
           // bounded by 6 (thread 0 is bounded by 4).
           thread_index = 1;
           if (active_thread[thread_index] == 1) { // thr1_0
              pc_cs[1] = pc[1] + tmp_t1_r0;
              assume(pc_cs[1] <= 6);
              thr1_0(threadargs[1]);
              pc[1] = pc_cs[1];
           }
 
           thread_index = 2;
           if (active_thread[thread_index] == 1) { // thr2_0
              pc_cs[2] = pc[2] + tmp_t2_r0;
              assume(pc_cs[2] <= 6);
              thr2_0(threadargs[2]);
              pc[2] = pc_cs[2];
           }
 
           // round 1
           // From here on every thread, including thread 0, is guarded by its
           // active_thread flag; rounds 1..10 all have the same shape.
           thread_index = 0;
           if (active_thread[thread_index] == 1) { // main
               pc_cs[0] = pc[0] + tmp_t0_r1;
               assume(pc_cs[0] <= 4);
               main_thread();
               pc[0] = pc_cs[0];
           }
 
           thread_index = 1;
           if (active_thread[thread_index] == 1) { // thr1_0
              pc_cs[1] = pc[1] + tmp_t1_r1;
              assume(pc_cs[1] <= 6);
              thr1_0(threadargs[thread_index]);
              pc[1] = pc_cs[1];
           }
 
           thread_index = 2;
           if (active_thread[thread_index] == 1) { // thr2_0
              pc_cs[2] = pc[2] + tmp_t2_r1;
              assume(pc_cs[2] <= 6);
              thr2_0(threadargs[thread_index]);
              pc[2] = pc_cs[2];
           }
 
           // round 2
           thread_index = 0;
           if (active_thread[thread_index] == 1) { // main
               pc_cs[0] = pc[0] + tmp_t0_r2;
               assume(pc_cs[0] <= 4);
               main_thread();
               pc[0] = pc_cs[0];
           }
 
           thread_index = 1;
           if (active_thread[thread_index] == 1) { // thr1_0
              pc_cs[1] = pc[1] + tmp_t1_r2;
              assume(pc_cs[1] <= 6);
              thr1_0(threadargs[thread_index]);
              pc[1] = pc_cs[1];
           }
 
           thread_index = 2;
           if (active_thread[thread_index] == 1) { // thr2_0
              pc_cs[2] = pc[2] + tmp_t2_r2;
              assume(pc_cs[2] <= 6);
              thr2_0(threadargs[thread_index]);
              pc[2] = pc_cs[2];
           }
 
           // round 3
           thread_index = 0;
           if (active_thread[thread_index] == 1) { // main
               pc_cs[0] = pc[0] + tmp_t0_r3;
               assume(pc_cs[0] <= 4);
               main_thread();
               pc[0] = pc_cs[0];
           }
 
           thread_index = 1;
           if (active_thread[thread_index] == 1) { // thr1_0
              pc_cs[1] = pc[1] + tmp_t1_r3;
              assume(pc_cs[1] <= 6);
              thr1_0(threadargs[thread_index]);
              pc[1] = pc_cs[1];
           }
 
           thread_index = 2;
           if (active_thread[thread_index] == 1) { // thr2_0
              pc_cs[2] = pc[2] + tmp_t2_r3;
              assume(pc_cs[2] <= 6);
              thr2_0(threadargs[thread_index]);
              pc[2] = pc_cs[2];
           }
 
           // round 4
           thread_index = 0;
           if (active_thread[thread_index] == 1) { // main
               pc_cs[0] = pc[0] + tmp_t0_r4;
               assume(pc_cs[0] <= 4);
               main_thread();
               pc[0] = pc_cs[0];
           }
 
           thread_index = 1;
           if (active_thread[thread_index] == 1) { // thr1_0
              pc_cs[1] = pc[1] + tmp_t1_r4;
              assume(pc_cs[1] <= 6);
              thr1_0(threadargs[thread_index]);
              pc[1] = pc_cs[1];
           }
 
           thread_index = 2;
           if (active_thread[thread_index] == 1) { // thr2_0
              pc_cs[2] = pc[2] + tmp_t2_r4;
              assume(pc_cs[2] <= 6);
              thr2_0(threadargs[thread_index]);
              pc[2] = pc_cs[2];
           }
 
           // round 5
           thread_index = 0;
           if (active_thread[thread_index] == 1) { // main
               pc_cs[0] = pc[0] + tmp_t0_r5;
               assume(pc_cs[0] <= 4);
               main_thread();
               pc[0] = pc_cs[0];
           }
 
           thread_index = 1;
           if (active_thread[thread_index] == 1) { // thr1_0
              pc_cs[1] = pc[1] + tmp_t1_r5;
              assume(pc_cs[1] <= 6);
              thr1_0(threadargs[thread_index]);
              pc[1] = pc_cs[1];
           }
 
           thread_index = 2;
           if (active_thread[thread_index] == 1) { // thr2_0
              pc_cs[2] = pc[2] + tmp_t2_r5;
              assume(pc_cs[2] <= 6);
              thr2_0(threadargs[thread_index]);
              pc[2] = pc_cs[2];
           }
 
           // round 6
           thread_index = 0;
           if (active_thread[thread_index] == 1) { // main
               pc_cs[0] = pc[0] + tmp_t0_r6;
               assume(pc_cs[0] <= 4);
               main_thread();
               pc[0] = pc_cs[0];
           }
 
           thread_index = 1;
           if (active_thread[thread_index] == 1) { // thr1_0
              pc_cs[1] = pc[1] + tmp_t1_r6;
              assume(pc_cs[1] <= 6);
              thr1_0(threadargs[thread_index]);
              pc[1] = pc_cs[1];
           }
 
           thread_index = 2;
           if (active_thread[thread_index] == 1) { // thr2_0
              pc_cs[2] = pc[2] + tmp_t2_r6;
              assume(pc_cs[2] <= 6);
              thr2_0(threadargs[thread_index]);
              pc[2] = pc_cs[2];
           }
 
           // round 7
           thread_index = 0;
           if (active_thread[thread_index] == 1) { // main
               pc_cs[0] = pc[0] + tmp_t0_r7;
               assume(pc_cs[0] <= 4);
               main_thread();
               pc[0] = pc_cs[0];
           }
 
           thread_index = 1;
           if (active_thread[thread_index] == 1) { // thr1_0
              pc_cs[1] = pc[1] + tmp_t1_r7;
              assume(pc_cs[1] <= 6);
              thr1_0(threadargs[thread_index]);
              pc[1] = pc_cs[1];
           }
 
           thread_index = 2;
           if (active_thread[thread_index] == 1) { // thr2_0
              pc_cs[2] = pc[2] + tmp_t2_r7;
              assume(pc_cs[2] <= 6);
              thr2_0(threadargs[thread_index]);
              pc[2] = pc_cs[2];
           }
 
           // round 8
           thread_index = 0;
           if (active_thread[thread_index] == 1) { // main
               pc_cs[0] = pc[0] + tmp_t0_r8;
               assume(pc_cs[0] <= 4);
               main_thread();
               pc[0] = pc_cs[0];
           }
 
           thread_index = 1;
           if (active_thread[thread_index] == 1) { // thr1_0
              pc_cs[1] = pc[1] + tmp_t1_r8;
              assume(pc_cs[1] <= 6);
              thr1_0(threadargs[thread_index]);
              pc[1] = pc_cs[1];
           }
 
           thread_index = 2;
           if (active_thread[thread_index] == 1) { // thr2_0
              pc_cs[2] = pc[2] + tmp_t2_r8;
              assume(pc_cs[2] <= 6);
              thr2_0(threadargs[thread_index]);
              pc[2] = pc_cs[2];
           }
 
           // round 9
           thread_index = 0;
           if (active_thread[thread_index] == 1) { // main
               pc_cs[0] = pc[0] + tmp_t0_r9;
               assume(pc_cs[0] <= 4);
               main_thread();
               pc[0] = pc_cs[0];
           }
 
           thread_index = 1;
           if (active_thread[thread_index] == 1) { // thr1_0
              pc_cs[1] = pc[1] + tmp_t1_r9;
              assume(pc_cs[1] <= 6);
              thr1_0(threadargs[thread_index]);
              pc[1] = pc_cs[1];
           }
 
           thread_index = 2;
           if (active_thread[thread_index] == 1) { // thr2_0
              pc_cs[2] = pc[2] + tmp_t2_r9;
              assume(pc_cs[2] <= 6);
              thr2_0(threadargs[thread_index]);
              pc[2] = pc_cs[2];
           }
 
           // round 10
           thread_index = 0;
           if (active_thread[thread_index] == 1) { // main
               pc_cs[0] = pc[0] + tmp_t0_r10;
               assume(pc_cs[0] <= 4);
               main_thread();
               pc[0] = pc_cs[0];
           }
 
           thread_index = 1;
           if (active_thread[thread_index] == 1) { // thr1_0
              pc_cs[1] = pc[1] + tmp_t1_r10;
              assume(pc_cs[1] <= 6);
              thr1_0(threadargs[thread_index]);
              pc[1] = pc_cs[1];
           }
 
           thread_index = 2;
           if (active_thread[thread_index] == 1) { // thr2_0
              pc_cs[2] = pc[2] + tmp_t2_r10;
              assume(pc_cs[2] <= 6);
              thr2_0(threadargs[thread_index]);
              pc[2] = pc_cs[2];
           }
 
           // Final step: thread 0 gets one more call after round 10. Unlike the
           // rounds above, pc_cs[0] is not written back to pc[0] afterwards.
           thread_index = 0;
           if (active_thread[0] == 1) {
              pc_cs[0] = pc[0] + tmp_t0_r11;
              assume(pc_cs[0] <= 4);
              main_thread();
           }
 
    return 0;
 }
                                        /*
                                         * Sequential driver: calls main_thread(), P0_0(), P1_0(), P2_0() and
                                         * P3_0() in a fixed order for rounds 0 and 1, followed by one last
                                         * main_thread() call.
                                         * Each step sets __cs_thread_index, computes
                                         * __cs_pc_cs[i] = __cs_pc[i] + __cs_tmp_t<i>_r<round>, bounds the result
                                         * with __CPROVER_assume(), calls the thread function and stores
                                         * __cs_pc_cs[i] back into __cs_pc[i].
                                         * NOTE(review): this looks like model-checker (CPROVER) input produced by
                                         * a sequentialization tool. The __cs_* globals and nondet_uint() are
                                         * declared outside this block - confirm their semantics there.
                                         */
                                        int main(void)
                                        {
                                        /*
                                         * One offset per (thread, round) step, each initialised from
                                         * nondet_uint() cast to a 5-bit (threads 0, 1, 4) or 4-bit
                                         * (threads 2, 3) unsigned bitvector.
                                         */
                                        unsigned __CPROVER_bitvector[5] __cs_tmp_t0_r0 = (unsigned __CPROVER_bitvector[5])nondet_uint();
                                        unsigned __CPROVER_bitvector[5] __cs_tmp_t1_r0 = (unsigned __CPROVER_bitvector[5])nondet_uint();
                                        unsigned __CPROVER_bitvector[4] __cs_tmp_t2_r0 = (unsigned __CPROVER_bitvector[4])nondet_uint();
                                        unsigned __CPROVER_bitvector[4] __cs_tmp_t3_r0 = (unsigned __CPROVER_bitvector[4])nondet_uint();
                                        unsigned __CPROVER_bitvector[5] __cs_tmp_t4_r0 = (unsigned __CPROVER_bitvector[5])nondet_uint();
                                        unsigned __CPROVER_bitvector[5] __cs_tmp_t0_r1 = (unsigned __CPROVER_bitvector[5])nondet_uint();
                                        unsigned __CPROVER_bitvector[5] __cs_tmp_t1_r1 = (unsigned __CPROVER_bitvector[5])nondet_uint();
                                        unsigned __CPROVER_bitvector[4] __cs_tmp_t2_r1 = (unsigned __CPROVER_bitvector[4])nondet_uint();
                                        unsigned __CPROVER_bitvector[4] __cs_tmp_t3_r1 = (unsigned __CPROVER_bitvector[4])nondet_uint();
                                        unsigned __CPROVER_bitvector[5] __cs_tmp_t4_r1 = (unsigned __CPROVER_bitvector[5])nondet_uint();
                                        unsigned __CPROVER_bitvector[5] __cs_tmp_t0_r2 = (unsigned __CPROVER_bitvector[5])nondet_uint();
/* round  0 */
                                        /*
                                         * Thread 0 runs without an active-thread check in this round; both its
                                         * offset and its new position are required to be > 0.
                                         */
                                        __CPROVER_assume(__cs_tmp_t0_r0 > 0);
                                        __cs_thread_index = 0;
                                        __cs_pc_cs[0] = __cs_pc[0] + __cs_tmp_t0_r0;
                                        __CPROVER_assume(__cs_pc_cs[0] > 0);
                                        __CPROVER_assume(__cs_pc_cs[0] <= 16);
                                        main_thread();
                                        __cs_pc[0] = __cs_pc_cs[0];
                                        /*
                                         * Threads 1..4 run only when marked active. Upper bounds on the new
                                         * position per thread: 22, 14, 12 and 27 (thread 0: 16).
                                         */
                                        if (__cs_active_thread[1] == 1)
                                        		{
                                        __cs_thread_index = 1;
                                        __cs_pc_cs[1] = __cs_pc[1] + __cs_tmp_t1_r0;
                                        __CPROVER_assume(__cs_pc_cs[1] <= 22);
                                        P0_0(__cs_threadargs[1]);
                                        __cs_pc[1] = __cs_pc_cs[1];
                                        		}
                                        if (__cs_active_thread[2] == 1)
                                        		{
                                        __cs_thread_index = 2;
                                        __cs_pc_cs[2] = __cs_pc[2] + __cs_tmp_t2_r0;
                                        __CPROVER_assume(__cs_pc_cs[2] <= 14);
                                        P1_0(__cs_threadargs[2]);
                                        __cs_pc[2] = __cs_pc_cs[2];
                                        		}
                                        if (__cs_active_thread[3] == 1)
                                        		{
                                        __cs_thread_index = 3;
                                        __cs_pc_cs[3] = __cs_pc[3] + __cs_tmp_t3_r0;
                                        __CPROVER_assume(__cs_pc_cs[3] <= 12);
                                        P2_0(__cs_threadargs[3]);
                                        __cs_pc[3] = __cs_pc_cs[3];
                                        		}
                                        if (__cs_active_thread[4] == 1)
                                        		{
                                        __cs_thread_index = 4;
                                        __cs_pc_cs[4] = __cs_pc[4] + __cs_tmp_t4_r0;
                                        __CPROVER_assume(__cs_pc_cs[4] <= 27);
                                        P3_0(__cs_threadargs[4]);
                                        __cs_pc[4] = __cs_pc_cs[4];
                                        		}
/* round  1 */
                                        /*
                                         * From here on every step also assumes the new position is not below
                                         * the old one.
                                         * NOTE(review): presumably this rules out wrap-around of the addition -
                                         * confirm against the element type of __cs_pc / __cs_pc_cs.
                                         */
                                        if (__cs_active_thread[0] == 1)
                                        		{
                                        				__cs_thread_index = 0;
                                        __cs_pc_cs[0] = __cs_pc[0] + __cs_tmp_t0_r1;
                                        __CPROVER_assume(__cs_pc_cs[0] >= __cs_pc[0]);
                                        __CPROVER_assume(__cs_pc_cs[0] <= 16);
                                        main_thread();
                                        __cs_pc[0] = __cs_pc_cs[0];
                                        		}
                                        if (__cs_active_thread[1] == 1)
                                        		{
                                        __cs_thread_index = 1;
                                        __cs_pc_cs[1] = __cs_pc[1] + __cs_tmp_t1_r1;
                                        __CPROVER_assume(__cs_pc_cs[1] >= __cs_pc[1]);
                                        __CPROVER_assume(__cs_pc_cs[1] <= 22);
                                        P0_0(__cs_threadargs[__cs_thread_index]);
                                        __cs_pc[1] = __cs_pc_cs[1];
                                        		}
                                        if (__cs_active_thread[2] == 1)
                                        		{
                                        __cs_thread_index = 2;
                                        __cs_pc_cs[2] = __cs_pc[2] + __cs_tmp_t2_r1;
                                        __CPROVER_assume(__cs_pc_cs[2] >= __cs_pc[2]);
                                        __CPROVER_assume(__cs_pc_cs[2] <= 14);
                                        P1_0(__cs_threadargs[__cs_thread_index]);
                                        __cs_pc[2] = __cs_pc_cs[2];
                                        		}
                                        if (__cs_active_thread[3] == 1)
                                        		{
                                        __cs_thread_index = 3;
                                        __cs_pc_cs[3] = __cs_pc[3] + __cs_tmp_t3_r1;
                                        __CPROVER_assume(__cs_pc_cs[3] >= __cs_pc[3]);
                                        __CPROVER_assume(__cs_pc_cs[3] <= 12);
                                        P2_0(__cs_threadargs[__cs_thread_index]);
                                        __cs_pc[3] = __cs_pc_cs[3];
                                        		}
                                        if (__cs_active_thread[4] == 1)
                                        		{
                                        __cs_thread_index = 4;
                                        __cs_pc_cs[4] = __cs_pc[4] + __cs_tmp_t4_r1;
                                        __CPROVER_assume(__cs_pc_cs[4] >= __cs_pc[4]);
                                        __CPROVER_assume(__cs_pc_cs[4] <= 27);
                                        P3_0(__cs_threadargs[__cs_thread_index]);
                                        __cs_pc[4] = __cs_pc_cs[4];
                                        		}
                                        /*
                                         * Final step: thread 0 gets one more call after round 1. Unlike the
                                         * steps above, __cs_pc_cs[0] is not written back to __cs_pc[0].
                                         */
                                        if (__cs_active_thread[0] == 1)
                                        		{
                                        __cs_thread_index = 0;
                                        __cs_pc_cs[0] = __cs_pc[0] + __cs_tmp_t0_r2;
                                        __CPROVER_assume(__cs_pc_cs[0] >= __cs_pc[0]);
                                        __CPROVER_assume(__cs_pc_cs[0] <= 16);
                                        main_thread();
                                        		}
                                        return 0;
                                        }
Beispiel #4
0
// Main ------------------------------------------------------------------------------------------
int main(int argc, char **argv) {

    const Params p(argc, argv);
    CUDASetup    setcuda(p.device);
    Timer        timer;
    cudaError_t  cudaStatus;

    // Allocate
    timer.start("Allocation");
    int n_flow_vectors = read_input_size(p);
    int best_model     = -1;
    int best_outliers  = n_flow_vectors;
#ifdef CUDA_8_0
    flowvector *h_flow_vector_array;
    cudaStatus = cudaMallocManaged(&h_flow_vector_array, n_flow_vectors * sizeof(flowvector));
    int *h_random_numbers;
    cudaStatus = cudaMallocManaged(&h_random_numbers, 2 * p.max_iter * sizeof(int));
    int *h_model_candidate;
    cudaStatus = cudaMallocManaged(&h_model_candidate, p.max_iter * sizeof(int));
    int *h_outliers_candidate;
    cudaStatus = cudaMallocManaged(&h_outliers_candidate, p.max_iter * sizeof(int));
    float *h_model_param_local;
    cudaStatus = cudaMallocManaged(&h_model_param_local, 4 * p.max_iter * sizeof(float));
    std::atomic_int *h_g_out_id;
    cudaStatus = cudaMallocManaged(&h_g_out_id, sizeof(std::atomic_int));
    flowvector *     d_flow_vector_array  = h_flow_vector_array;
    int *            d_random_numbers     = h_random_numbers;
    int *            d_model_candidate    = h_model_candidate;
    int *            d_outliers_candidate = h_outliers_candidate;
    float *          d_model_param_local  = h_model_param_local;
    std::atomic_int *d_g_out_id           = h_g_out_id;
    std::atomic_int * worklist;
    cudaStatus = cudaMallocManaged(&worklist, sizeof(std::atomic_int));
#else
    flowvector *     h_flow_vector_array  = (flowvector *)malloc(n_flow_vectors * sizeof(flowvector));
    int *            h_random_numbers     = (int *)malloc(2 * p.max_iter * sizeof(int));
    int *            h_model_candidate    = (int *)malloc(p.max_iter * sizeof(int));
    int *            h_outliers_candidate = (int *)malloc(p.max_iter * sizeof(int));
    float *          h_model_param_local  = (float *)malloc(4 * p.max_iter * sizeof(float));
    std::atomic_int *h_g_out_id           = (std::atomic_int *)malloc(sizeof(std::atomic_int));
    flowvector *     d_flow_vector_array;
    cudaStatus = cudaMalloc((void**)&d_flow_vector_array, n_flow_vectors * sizeof(flowvector));
    int *            d_random_numbers;
    cudaStatus = cudaMalloc((void**)&d_random_numbers, 2 * p.max_iter * sizeof(int));
    int *            d_model_candidate;
    cudaStatus = cudaMalloc((void**)&d_model_candidate, p.max_iter * sizeof(int));
    int *            d_outliers_candidate;
    cudaStatus = cudaMalloc((void**)&d_outliers_candidate, p.max_iter * sizeof(int));
    float *          d_model_param_local;
    cudaStatus = cudaMalloc((void**)&d_model_param_local, 4 * p.max_iter * sizeof(float));
    int *d_g_out_id;
    cudaStatus = cudaMalloc((void**)&d_g_out_id, sizeof(int));
    ALLOC_ERR(h_flow_vector_array, h_random_numbers, h_model_candidate, h_outliers_candidate, h_model_param_local, h_g_out_id);
#endif
    CUDA_ERR();
    cudaDeviceSynchronize();
    timer.stop("Allocation");
    timer.print("Allocation", 1);

    // Initialize
    timer.start("Initialization");
    const int max_gpu_threads = setcuda.max_gpu_threads();
    read_input(h_flow_vector_array, h_random_numbers, p);
    cudaDeviceSynchronize();
    timer.stop("Initialization");
    timer.print("Initialization", 1);

#ifndef CUDA_8_0
    // Copy to device
    timer.start("Copy To Device");
    cudaStatus = cudaMemcpy(d_flow_vector_array, h_flow_vector_array, n_flow_vectors * sizeof(flowvector), cudaMemcpyHostToDevice);
    cudaStatus = cudaMemcpy(d_random_numbers, h_random_numbers, 2 * p.max_iter * sizeof(int), cudaMemcpyHostToDevice);
    cudaStatus = cudaMemcpy(d_model_candidate, h_model_candidate, p.max_iter * sizeof(int), cudaMemcpyHostToDevice);
    cudaStatus = cudaMemcpy(d_outliers_candidate, h_outliers_candidate, p.max_iter * sizeof(int), cudaMemcpyHostToDevice);
    cudaStatus = cudaMemcpy(d_model_param_local, h_model_param_local, 4 * p.max_iter * sizeof(float), cudaMemcpyHostToDevice);
    cudaStatus = cudaMemcpy(d_g_out_id, h_g_out_id, sizeof(int), cudaMemcpyHostToDevice);
    cudaDeviceSynchronize();
    CUDA_ERR();
    timer.stop("Copy To Device");
    timer.print("Copy To Device", 1);
#endif

    for(int rep = 0; rep < p.n_warmup + p.n_reps; rep++) {

        // Reset
        memset((void *)h_model_candidate, 0, p.max_iter * sizeof(int));
        memset((void *)h_outliers_candidate, 0, p.max_iter * sizeof(int));
        memset((void *)h_model_param_local, 0, 4 * p.max_iter * sizeof(float));
#ifdef CUDA_8_0
        h_g_out_id[0].store(0);
        if(p.alpha < 0.0 || p.alpha > 1.0) { // Dynamic partitioning
            worklist[0].store(0);
        }
#else
        h_g_out_id[0] = 0;
        cudaStatus = cudaMemcpy(d_model_candidate, h_model_candidate, p.max_iter * sizeof(int), cudaMemcpyHostToDevice);
        cudaStatus = cudaMemcpy(d_outliers_candidate, h_outliers_candidate, p.max_iter * sizeof(int), cudaMemcpyHostToDevice);
        cudaStatus = cudaMemcpy(d_model_param_local, h_model_param_local, 4 * p.max_iter * sizeof(float), cudaMemcpyHostToDevice);
        cudaStatus = cudaMemcpy(d_g_out_id, h_g_out_id, sizeof(int), cudaMemcpyHostToDevice);
        CUDA_ERR();
#endif
        cudaDeviceSynchronize();

        if(rep >= p.n_warmup)
            timer.start("Kernel");

        // Launch GPU threads
        // Kernel launch
        if(p.n_gpu_blocks > 0) {
            assert(p.n_gpu_threads <= max_gpu_threads && 
                "The thread block size is greater than the maximum thread block size that can be used on this device");
            cudaStatus = call_RANSAC_kernel_block(p.n_gpu_blocks, p.n_gpu_threads, n_flow_vectors, p.max_iter, 
                p.error_threshold, p.convergence_threshold, p.max_iter, p.alpha, d_model_param_local, 
                d_flow_vector_array, d_random_numbers, d_model_candidate, d_outliers_candidate, (int*)d_g_out_id, 
                sizeof(int)
#ifdef CUDA_8_0
                + sizeof(int), (int*)worklist
#endif
                );
            CUDA_ERR();
        }
        // Launch CPU threads
        std::thread main_thread(run_cpu_threads, h_model_candidate, h_outliers_candidate, h_model_param_local,
            h_flow_vector_array, n_flow_vectors, h_random_numbers, p.max_iter, p.error_threshold,
            p.convergence_threshold, h_g_out_id, p.n_threads, p.max_iter, p.alpha
#ifdef CUDA_8_0
            ,
            worklist);
#else
            );
#endif

        cudaDeviceSynchronize();
        main_thread.join();

        if(rep >= p.n_warmup)
            timer.stop("Kernel");

#ifndef CUDA_8_0
        // Copy back
        if(rep >= p.n_warmup)
            timer.start("Copy Back and Merge");
        int d_candidates = 0;
        if(p.alpha < 1.0) {
            cudaStatus = cudaMemcpy(&d_candidates, d_g_out_id, sizeof(int), cudaMemcpyDeviceToHost);
            cudaStatus = cudaMemcpy(&h_model_candidate[h_g_out_id[0]], d_model_candidate, d_candidates * sizeof(int), cudaMemcpyDeviceToHost);
            cudaStatus = cudaMemcpy(&h_outliers_candidate[h_g_out_id[0]], d_outliers_candidate, d_candidates * sizeof(int), cudaMemcpyDeviceToHost);
            CUDA_ERR();
        }
        h_g_out_id[0] += d_candidates;
        cudaDeviceSynchronize();
        if(rep >= p.n_warmup)
            timer.stop("Copy Back and Merge");
#endif

        // Post-processing (chooses the best model among the candidates)
        if(rep >= p.n_warmup)
            timer.start("Kernel");
        for(int i = 0; i < h_g_out_id[0]; i++) {
            if(h_outliers_candidate[i] < best_outliers) {
                best_outliers = h_outliers_candidate[i];
                best_model    = h_model_candidate[i];
            }
        }
        if(rep >= p.n_warmup)
            timer.stop("Kernel");
    }