Пример #1
0
sampler* initialize_sampler(cl_int chain_length, cl_int dimension,
                            cl_int walkers_per_group, size_t work_group_size,
                            double a, cl_int pdf_number,
                            cl_int data_length, cl_float *data,
                            cl_int num_to_save, cl_int *indices_to_save,
                            const char *plat_name, const char *dev_name){

    /*
     Initialize stretch move MCMC sampler struct.
     Arrange parameters into sampler struct pointer.
     Allocate arrays on host, initialize walkers and other values as appropriate.
     Start OpenCL context and queue.
     Allocate device memory and transfer from host.
     Compile and initialize random number generator.
     Compile stretch move OpenCL kernel.

     Input:
          cl_int chain_length                Allocate space for this many samples in the sampler struct.
                                                 Sampler fills this array when run_sampler is called.
          cl_int dimension                   Dimension of state vector of Markov chain. Must be positive.
          cl_int walkers_per_group           Number of walkers in each of two groups. Total walkers is twice this.
                                                 Total walkers must exceed the dimension.
          size_t work_group_size             Work group size. Must be nonzero.
                                                 For CPU this must be set to one.
                                                 For GPU this should be set larger, powers of two are optimal, try 64, 128 or 256.
                                                 This number must divide walkers_per_group.
          double a                           Coefficient for range of 'z' random variable.
                                                 Must be greater than one.
                                                 Standard value is 2.
                                                 Decrease a to increase low acceptance rate, especially in high dimensions.
                                                 A value that is not greater than one (including NaN) is replaced
                                                 by 2, and the replacement is what the kernel is compiled with.
          cl_int pdf_number                  Which PDF to sample. Passed to pdf.h as a compile time definition.
          cl_int data_length                 Length of observation data. If no data set this to zero.
          cl_float *data                     Observation data. When data_length is nonzero the pointer is stored
                                                 in the sampler (not copied), so it must outlive the sampler.
          cl_int num_to_save                 Number of components to save in the chain
          cl_int *indices_to_save            Indices of components to save in the chain, each in [0, dimension).
                                                 The pointer is stored in the sampler (not copied).
          const char *plat_name              String for platform name. Set to CHOOSE_INTERACTIVELY (no quotes) to do so.
          const char *dev_name               String for device name. Set to CHOOSE_INTERACTIVELY (no quotes) to do so.

     Output:
          returned: sampler *samp            Pointer to sampler struct with parameters, arrays, context, queue, kernel initialized.

     Errors:
          Host allocation failures and invalid sizes/indices print a message to stderr and abort().
     */


    if(OUTPUT_LEVEL > 0) printf("Initializing Stretch Move sampler.\n");


    // --------------------------------------------------------------------------
    // Set parameters
    // --------------------------------------------------------------------------

    // This environment variable forces headers to be reloaded each time.
    // If not set and the pdf is changed, changes may not be picked up.
    setenv("CUDA_CACHE_DISABLE", "1", 1);

    // allocate the structure for all the sampler parameters and arrays
    sampler * samp = (sampler *) malloc(sizeof(sampler));
    if(!samp) { perror("Allocation failure sampler"); abort(); }

    // user set parameters
    samp->M = chain_length;                           // Number of steps to run
    samp->N = dimension;                              // Dimension of the problem and the walkers
    samp->K_over_two = walkers_per_group ;            // Number of walkers in each group

    // derived parameters
    samp->K = 2 * samp->K_over_two;                   // Total walkers
    samp->total_samples = samp->M * samp->K;          // Total samples produced

    // indices to save (caller's array is referenced, not copied)
    samp->num_to_save = num_to_save;
    samp->indices_to_save_host = indices_to_save;

    // Allocate the structure and set values
    samp->data_st = (data_struct *) malloc(sizeof(data_struct));
    if(!(samp->data_st)) { perror("Allocation failure data_struct"); abort(); }

    // default value one, unless performing simulated annealing
    (samp->data_st)->beta         = 1.0f;
    (samp->data_st)->save         = 1;
    (samp->data_st)->num_to_save  = num_to_save;

    // Validate 'a' BEFORE deriving the kernel coefficients from it, so that the
    // fallback value is the one actually compiled into the kernel.
    // The negated comparison also rejects NaN.
    double a_used = a;
    if(!(a_used > 1.0)){
        fprintf(stderr, "Error: Value of a must be greater than one.\nDefaulting to 2.\n");
        a_used = 2.0;
    }
    samp->a = a_used;

    // coefficients on Z random variable, passed to the kernel as compile time definitions
    double a_coeffs[3];
    a_coeffs[0] = 1.0 / a_used;
    a_coeffs[1] = 2.0 * (1.0 - 1.0/a_used);
    a_coeffs[2] = a_used - 2.0 + 1.0/a_used;


    // error check on dimensions
    if(samp->N <= 0){
        fprintf(stderr, "Error: Dimension must be positive.\nExiting\n");
        abort();
    }

    if(samp->K <= samp->N){
        fprintf(stderr, "Error: Must have more walkers than the dimension.\nExiting\n");
        abort();
    }

    // error check on work sizes (zero would make the modulo below undefined)
    if(work_group_size == 0){
        fprintf(stderr, "Error: Work group size must be positive.\nExiting\n");
        abort();
    }

    if( (samp->K_over_two % work_group_size) != 0){
        fprintf(stderr, "Error: Number of walkers in each group must be multiple of work group size.\nExiting\n");
        abort();
    }

    // error check on dimensions to save
    for(cl_int i=0; i<num_to_save; i++){
        if(samp->indices_to_save_host[i] < 0){
            fprintf(stderr, "Error: Cannot save a negative index.\nExiting\n");
            abort();
        }
        if(samp->indices_to_save_host[i] >= samp->N){
            fprintf(stderr, "Error: Cannot save an index larger than the dimension of the problem.\nExiting\n");
            abort();
        }
    }


    // for later output
    samp->acor_times  = (double *) malloc(samp->num_to_save * sizeof(double));
    if(!samp->acor_times) { perror("Allocation failure"); abort(); }
    samp->acor_pass   = (char   *) malloc(samp->num_to_save * sizeof(char));
    if(!samp->acor_pass) { perror("Allocation failure"); abort(); }
    samp->sigma       = (double *) malloc(samp->num_to_save * sizeof(double));
    if(!samp->sigma)      { perror("Allocation failure"); abort(); }
    samp->means       = (double *) malloc(samp->num_to_save * sizeof(double));
    if(!samp->means)      { perror("Allocation failure"); abort(); }
    samp->err_bar     = (double *) malloc(samp->num_to_save * sizeof(double));
    if(!samp->err_bar)    { perror("Allocation failure"); abort(); }

    // write parameter file for plotting
    write_parameter_file_matlab(samp->M, samp->N, samp->K, "Stretch Move",
                            samp->indices_to_save_host, samp->num_to_save, pdf_number);

    // --------------------------------------------------------------------------
    // Set up OpenCL context and queues
    // --------------------------------------------------------------------------
    if(OUTPUT_LEVEL > 0) printf("Begin opencl contexts.\n");

    create_context_on(plat_name, dev_name, 0, &(samp->ctx), NULL, 0);

    {
      cl_int status;
      cl_device_id my_dev;

      CALL_CL_GUARDED(clGetContextInfo, (samp->ctx, CL_CONTEXT_DEVICES,
            sizeof(my_dev), &my_dev, NULL));

      // two queues on the same device: one for kernels, one for memory traffic
      samp->queue = clCreateCommandQueue(samp->ctx, my_dev, 0, &status);
      CHECK_CL_ERROR(status, "clCreateCommandQueue");
      samp->queue_mem = clCreateCommandQueue(samp->ctx, my_dev, 0, &status);
      CHECK_CL_ERROR(status, "clCreateCommandQueue");
    }

    // print information on selected device
    if(OUTPUT_LEVEL > 1)  print_device_info_from_queue(samp->queue);

    // set the work group sizes: one work item per walker in a group
    samp->ldim[0] = work_group_size;
    samp->gdim[0] = samp->K_over_two;

    if(OUTPUT_LEVEL > 0) printf("Context built.\n");


    // --------------------------------------------------------------------------
    // Start total timing
    // --------------------------------------------------------------------------
    if(OUTPUT_LEVEL > 0) printf("Begin total timing.\n");
    get_timestamp(&(samp->time1_total));


    // --------------------------------------------------------------------------
    // Allocate host memory
    // --------------------------------------------------------------------------

    // counter for number of samples accepted
    samp->accepted_host = (cl_ulong *) malloc(samp->K_over_two * sizeof(cl_ulong));
    if(!(samp->accepted_host)){ perror("Allocation failure accepted host"); abort(); }
    for(int i=0; i< (samp->K_over_two); i++) samp->accepted_host[i] = 0;

    // Adjacent memory on x_red moves within the walker
    // To access the ith component of walker j, take x_red[i + j*N];

    // red walkers
    samp->X_red_host = (cl_float *) malloc(samp->N * samp->K_over_two * sizeof(cl_float));
    if(!(samp->X_red_host)){ perror("Allocation failure X_red_host"); abort(); }

    // log likelihood, started at negative infinity
    samp->log_pdf_red_host = (cl_float *) malloc(samp->K_over_two * sizeof(cl_float));
    if(!(samp->log_pdf_red_host)){ perror("Allocation failure log_pdf_red_host"); abort(); }
    for(int i=0; i<(samp->K_over_two); i++) samp->log_pdf_red_host[i] = (-1.0f) / 0.0f;

    // black walkers
    samp->X_black_host = (cl_float *) malloc(samp->N * samp->K_over_two * sizeof(cl_float));
    if(!(samp->X_black_host)){ perror("Allocation failure X_black_host"); abort(); }

    // log likelihood, started at negative infinity
    samp->log_pdf_black_host = (cl_float *) malloc(samp->K_over_two * sizeof(cl_float));
    if(!(samp->log_pdf_black_host)){ perror("Allocation failure log_pdf_black_host"); abort(); }
    for(int i=0; i< (samp->K_over_two); i++) samp->log_pdf_black_host[i] = (-1.0f) / 0.0f;

    // samples on host
    // The element count is formed in size_t: the product of three cl_int values
    // easily exceeds the range of cl_int for long chains.
    size_t samples_length = (size_t) samp->num_to_save * (size_t) samp->M * (size_t) samp->K;   // length of the samples array
    samp->samples_host = (cl_float *) malloc(samples_length * sizeof(cl_float));                 // samples to return
    if(!(samp->samples_host)){ perror("Allocation failure samples_host"); abort(); }


    // initialize the walkers to random values
    // fixed seed, so the starting positions are the same on every run
    srand48(0);

    // initialize the walkers to small random values
    for(int j=0; j < samp->N * samp->K_over_two; j++){
        if(NONNEGATIVE_BOX){
            samp->X_black_host[j] = (cl_float) drand48();
            samp->X_red_host[j]   = (cl_float) drand48();
        }
        else{
            samp->X_black_host[j] = (cl_float) (0.1 * (drand48()-0.5));
            samp->X_red_host[j]   = (cl_float) (0.1 * (drand48()-0.5));
        }

    }


    // set up observations
    samp->data_length = data_length;

    // there are lots of complications that appear if this is empty
    // make it length one instead
    if(samp->data_length == 0){
        samp->data_length = 1;
        samp->data_host = (cl_float *) malloc(samp->data_length * sizeof(cl_float)) ;
        if(!(samp->data_host)){ perror("Allocation failure data_host"); abort(); }
        samp->data_host[0] = 0.0f;
    }
    else{
        // standard case: reference the caller's array
        samp->data_host = data;
    }


    // --------------------------------------------------------------------------
    // load kernels
    // --------------------------------------------------------------------------

    // stretch move kernel
    char *knl_text = read_file("stretch_move.cl");
    char options[300];

    // Bounded formatting: a large 'a' printed with %.10f can be arbitrarily long,
    // so refuse to continue rather than overrun the buffer.
    int options_len = snprintf(options, sizeof(options),
            "-D NN=%d -D K_OVER_TWO=%d -D WORK_GROUP_SIZE=%d -D DATA_LEN=%d -D PDF_NUMBER=%d -D A_COEFF_0=%.10ff -D A_COEFF_1=%.10ff -D A_COEFF_2=%.10ff  -I . ",
            samp->N, samp->K_over_two, (int) work_group_size, samp->data_length, pdf_number, a_coeffs[0], a_coeffs[1], a_coeffs[2]);
    if(options_len < 0 || (size_t) options_len >= sizeof(options)){
        fprintf(stderr, "Error: Kernel build options do not fit in the options buffer.\nExiting\n");
        abort();
    }

    if(OUTPUT_LEVEL > 0) printf("Options string for stretch move kernel:%s\n", options);

    samp->stretch_knl = kernel_from_string(samp->ctx, knl_text, "stretch_move", options);
    free(knl_text);

    if(OUTPUT_LEVEL > 0) printf("Stretch Move kernel compiled.\n");

    // random number generator initialization
    char * knl_text_rand = read_file("Kernel_Ranluxcl_Init.cl");
    char options_rand_lux[100];

    if(AMD)
        snprintf(options_rand_lux, sizeof(options_rand_lux), "-DRANLUXCL_LUX=4 -I .");
    else
        snprintf(options_rand_lux, sizeof(options_rand_lux), "-DRANLUXCL_LUX=4");

    samp->init_rand_lux_knl = kernel_from_string(samp->ctx, knl_text_rand, "Kernel_Ranluxcl_Init", options_rand_lux);
    free(knl_text_rand);

    if(OUTPUT_LEVEL > 0) printf("Ranluxcl init kernel compiled.\n");



    // --------------------------------------------------------------------------
    // allocate device memory
    // --------------------------------------------------------------------------
    cl_int status;

    samp->X_red_device = clCreateBuffer(samp->ctx, CL_MEM_READ_WRITE,
      sizeof(cl_float) * samp->N * samp->K_over_two, 0, &status);
    CHECK_CL_ERROR(status, "clCreateBuffer");

    samp->log_pdf_red_device = clCreateBuffer(samp->ctx, CL_MEM_READ_WRITE,
      sizeof(cl_float) * samp->K_over_two, 0, &status);
    CHECK_CL_ERROR(status, "clCreateBuffer");

    samp->X_red_save = clCreateBuffer(samp->ctx, CL_MEM_WRITE_ONLY,
      sizeof(cl_float) * samp->num_to_save * samp->K_over_two, 0, &status);
    CHECK_CL_ERROR(status, "clCreateBuffer");

    samp->X_black_device = clCreateBuffer(samp->ctx, CL_MEM_READ_WRITE,
      sizeof(cl_float) * samp->N * samp->K_over_two, 0, &status);
    CHECK_CL_ERROR(status, "clCreateBuffer");

    samp->log_pdf_black_device = clCreateBuffer(samp->ctx, CL_MEM_READ_WRITE,
      sizeof(cl_float) * samp->K_over_two, 0, &status);
    CHECK_CL_ERROR(status, "clCreateBuffer");

    samp->X_black_save = clCreateBuffer(samp->ctx, CL_MEM_WRITE_ONLY,
      sizeof(cl_float) * samp->num_to_save * samp->K_over_two, 0, &status);
    CHECK_CL_ERROR(status, "clCreateBuffer");

    samp->accepted_device = clCreateBuffer(samp->ctx, CL_MEM_READ_WRITE,
            samp->K_over_two * sizeof(cl_ulong), 0, &status);
    CHECK_CL_ERROR(status, "clCreateBuffer");

    samp->indices_to_save_device = clCreateBuffer(samp->ctx, CL_MEM_READ_ONLY,
            samp->num_to_save * sizeof(cl_int), 0, &status);
    CHECK_CL_ERROR(status, "clCreateBuffer");


    // allocate for the observations
    samp->data_device = clCreateBuffer(samp->ctx, CL_MEM_READ_WRITE,
      sizeof(cl_float) * samp->data_length, 0, &status);
    CHECK_CL_ERROR(status, "clCreateBuffer");

    // data struct on device
    samp->data_st_device = clCreateBuffer(samp->ctx, CL_MEM_READ_WRITE,
            sizeof(data_struct), 0, &status);
    CHECK_CL_ERROR(status, "clCreateBuffer");


    // allocate for the state array for ranluxcl
    // use a 1d work group; seven cl_float4 of state per work item
    size_t rand_lux_state_buffer_size = samp->gdim[0] * 7 * sizeof(cl_float4);
    samp->ranluxcltab = clCreateBuffer(samp->ctx, CL_MEM_READ_WRITE,
        rand_lux_state_buffer_size, 0, &status);
    CHECK_CL_ERROR(status, "clCreateBuffer");


    // --------------------------------------------------------------------------
    // transfer to device
    // --------------------------------------------------------------------------

    CALL_CL_GUARDED(clEnqueueWriteBuffer, (
        samp->queue, samp->X_red_device, /*blocking*/ CL_TRUE, /*offset*/ 0,
        samp->N * samp->K_over_two * sizeof(cl_float), samp->X_red_host,
        0, NULL, NULL));

    CALL_CL_GUARDED(clEnqueueWriteBuffer, (
        samp->queue, samp->log_pdf_red_device, /*blocking*/ CL_TRUE, /*offset*/ 0,
        samp->K_over_two * sizeof(cl_float), samp->log_pdf_red_host,
        0, NULL, NULL));

    CALL_CL_GUARDED(clEnqueueWriteBuffer, (
        samp->queue, samp->X_black_device, /*blocking*/ CL_TRUE, /*offset*/ 0,
        samp->N * samp->K_over_two * sizeof(cl_float), samp->X_black_host,
        0, NULL, NULL));

    CALL_CL_GUARDED(clEnqueueWriteBuffer, (
        samp->queue, samp->log_pdf_black_device, /*blocking*/ CL_TRUE, /*offset*/ 0,
        samp->K_over_two * sizeof(cl_float), samp->log_pdf_black_host,
        0, NULL, NULL));

    CALL_CL_GUARDED(clEnqueueWriteBuffer, (
        samp->queue, samp->data_device, /*blocking*/ CL_TRUE, /*offset*/ 0,
        samp->data_length * sizeof(cl_float), samp->data_host,
        0, NULL, NULL));

    CALL_CL_GUARDED(clEnqueueWriteBuffer, (
        samp->queue, samp->data_st_device, /*blocking*/ CL_TRUE, /*offset*/ 0,
        sizeof(data_struct), samp->data_st,
        0, NULL, NULL));

    CALL_CL_GUARDED(clEnqueueWriteBuffer, (
        samp->queue, samp->indices_to_save_device, /*blocking*/ CL_TRUE, /*offset*/ 0,
        samp->num_to_save * sizeof(cl_int), samp->indices_to_save_host,
        0, NULL, NULL));

    CALL_CL_GUARDED(clFinish, (samp->queue));


    // --------------------------------------------------------------------------
    // Initialize random number generator
    // --------------------------------------------------------------------------

    // int for state variable initialization
    cl_int ins = 1;
    SET_2_KERNEL_ARGS(samp->init_rand_lux_knl, ins, samp->ranluxcltab);

    CALL_CL_GUARDED(clEnqueueNDRangeKernel,
          (samp->queue, samp->init_rand_lux_knl,
           /*dimensions*/ 1, NULL, samp->gdim, samp->ldim,
           0, NULL, NULL));

    CALL_CL_GUARDED(clFinish, (samp->queue));

    // --------------------------------------------------------------------------
    // Sampler initialization is done
    // --------------------------------------------------------------------------
    if(OUTPUT_LEVEL > 0) printf("Sampler initialized.\n");
    return samp;
}
Пример #2
0
/*
 * Run the energy reduction on the device and return the first float of d.
 *
 * The reduct_eng kernel is launched once over N work items with arguments
 * (a, b, c, d, N, epsilon).  When N is larger than 128 the reduct kernel is
 * then launched repeatedly on d with arguments (d, count), dividing count by
 * 128 after every launch, until a single value is left.  When N is at most
 * 128 the single reduct_eng launch uses one work group of N items.
 *
 * NOTE(review): the kernels are not visible here; presumably each launch
 * leaves one partial result per work group at the front of d -- confirm.
 * NOTE(review): the launches use a local size of 128 with a global size equal
 * to the current count, so counts above 128 are presumably expected to be
 * multiples of 128 -- confirm against the callers.
 *
 * Returns the float read back from offset 0 of d (blocking read).
 */
float reduct_energy(cl_mem a,cl_mem b, cl_mem c,cl_mem d, int N, float epsilon, 
			cl_kernel reduct_eng,cl_kernel reduct, cl_command_queue queue)
{
	int remaining = N;
	float result;
	const int multi_pass = (remaining > 128);

	/* First stage: identical in both cases except for the work group size. */
	SET_6_KERNEL_ARGS(reduct_eng, a, b, c, d, remaining, epsilon);
	{
		size_t local_size[] = { multi_pass ? 128 : remaining };
		size_t global_size[] = { remaining };
		CALL_CL_GUARDED(clEnqueueNDRangeKernel,
			(queue, reduct_eng,
			/*dimensions*/ 1, NULL, global_size, local_size,
			0, NULL, NULL));
	}

	if (multi_pass)
	{
		remaining /= 128;
		CALL_CL_GUARDED(clFinish, (queue));

		/* Full passes with work groups of 128 while enough items remain. */
		for (; remaining >= 128; remaining /= 128)
		{
			size_t local_size[] = { 128 };
			size_t global_size[] = { remaining };
			SET_2_KERNEL_ARGS(reduct, d, remaining);
			CALL_CL_GUARDED(clEnqueueNDRangeKernel,
				(queue, reduct,
				/*dimensions*/ 1, NULL, global_size, local_size,
				0, NULL, NULL));
			CALL_CL_GUARDED(clFinish, (queue));
		}

		/* Final pass over the leftovers in a single work group. */
		if (remaining != 1)
		{
			size_t local_size[] = { remaining };
			size_t global_size[] = { remaining };
			SET_2_KERNEL_ARGS(reduct, d, remaining);
			CALL_CL_GUARDED(clEnqueueNDRangeKernel,
				(queue, reduct,
				/*dimensions*/ 1, NULL, global_size, local_size,
				0, NULL, NULL));
		}
	}

	/* Blocking read of the reduced value from the start of d. */
	CALL_CL_GUARDED(clEnqueueReadBuffer, (
		queue, d, /*blocking*/ CL_TRUE, /*offset*/ 0,
		sizeof(float), &result,
		0, NULL, NULL));

	CALL_CL_GUARDED(clFinish, (queue));
	return result;
}