Example #1
0
/* Run the compiled-in execution target over all iterators.
 * Initializes one solver per iterator, dispatches to the CPU, OpenMP
 * or GPU executor selected at compile time, then frees the solvers.
 * Returns the executor's status code. */
int exec_loop(solver_props *props){
  int iter;
  int status = SUCCESS;

  // Bring up solver state for every iterator before execution
#if defined TARGET_GPU
  gpu_init();
#endif
  for(iter = 0; iter < NUM_ITERATORS; iter++){
    solver_init(&props[iter]);
  }

  // Dispatch to whichever target this binary was built for
#if defined(TARGET_CPU)
  status = exec_cpu(props, 0);
#elif defined(TARGET_OPENMP)
  status = exec_parallel_cpu(props);
#elif defined(TARGET_GPU)
  status = exec_parallel_gpu(props);
#else
#error Invalid target
#endif

  // Tear down solver state for every iterator
  for(iter = 0; iter < NUM_ITERATORS; iter++){
    solver_free(&props[iter]);
  }
#if defined TARGET_GPU
  gpu_exit();
#endif

  return status;
}
Example #2
0
/*------------------------------------------------------------------------*/
/* Stage 1 of polynomial selection: derive coefficient search bounds
 * from the job parameters and search for coefficients until the
 * deadline expires. Always returns 1. On CUDA builds the process
 * exits immediately if no usable GPU is available. */
uint32
poly_stage1_run(msieve_obj *obj, poly_stage1_t *data)
{
	bounds_t bounds;
	poly_search_t poly;
#ifdef HAVE_CUDA
	gpu_config_t gpu_config;

	/* Enumerate CUDA devices and validate the user's requested device
	   index; both failure cases are fatal (nothing to clean up yet). */
	gpu_init(&gpu_config);
	if (gpu_config.num_gpu == 0) {
		printf("error: no CUDA-enabled GPUs found\n");
		exit(-1);
	}
	if (obj->which_gpu >= (uint32)gpu_config.num_gpu) {
		printf("error: GPU %u does not exist "
			"or is not CUDA-enabled\n", obj->which_gpu);
		exit(-1);
	}
	logprintf(obj, "using GPU %u (%s)\n", obj->which_gpu,
			gpu_config.info[obj->which_gpu].name);

	/* hand the selected device's descriptor to the search state */
	poly.gpu_info = gpu_config.info + obj->which_gpu; 
#endif

	stage1_bounds_init(&bounds, data);
	poly_search_init(&poly, data);

	search_coeffs(obj, &poly, &bounds, data->deadline);

	/* free in reverse order of initialization */
	poly_search_free(&poly);
	stage1_bounds_free(&bounds);
	return 1;
}
Example #3
0
/* Benchmark driver: parse the problem size N from argv[1] and the
 * kernel index from the KERNEL environment variable, optionally warm
 * up the CUDA runtime, run the selected kernel under a timer, and
 * (unless _NOCHECK) validate the result against the reference
 * CPU kernel (index 0). Returns EXIT_SUCCESS. */
int main(int argc, char **argv)
{
    set_program_name(argv[0]);
    if (argc < 2) {
        warning(0, "too few arguments");
        print_usage();
        exit(EXIT_FAILURE);
    }

    /* atoi() cannot distinguish "0" from unparsable input, and a
     * negative value is meaningless as a problem size, so reject
     * anything non-positive (previously negative N slipped through). */
    int N = atoi(argv[1]);
    if (N <= 0)
        error(0, "invalid argument: %s", argv[1]);

    /* Kernel selection; default 0 is the reference CPU kernel.
     * A negative id would index out of the kernels[] table.
     * NOTE(review): the upper bound of kernels[] is not visible here,
     * so only the negative case is rejected -- confirm the table size
     * and add an upper-bound check if one exists. */
    char *kernel_env = getenv("KERNEL");
    int kernel_id = 0;
    if (kernel_env)
        kernel_id = atoi(kernel_env);
    if (kernel_id < 0)
        error(0, "invalid kernel id: %s", kernel_env);

    // Initialize runtime, so as not to pay the cost at the first call
    if (kernel_id > CPU_PARALLEL) {
        printf("Initializing CUDA runtime ... ");
        fflush(stdout);
        gpu_init();
        printf("DONE\n");
    }

    kernel_data_t *data = NULL;
    data = data_create_CPU(N);
    data_init(data);

#ifndef _NOCHECK
    /* Compute a reference result with kernel 0 for later validation */
    kernel_data_t *check_data = NULL;
    check_data = data_create_CPU(N);
    data_copy(check_data, data);
    kernels[0].fn(check_data);
#endif

    // Run and time the selected kernel
    printf("Launching kernel:   %s\n", kernels[kernel_id].descr);
    fflush(stdout);
    xtimer_t timer;
    timer_clear(&timer);
    timer_start(&timer);
    kernels[kernel_id].fn(data);
    timer_stop(&timer);

    report_results(&timer, N*N);
#ifndef _NOCHECK
    check_result(data, check_data);
#endif

    // Cleanup
    data_free_CPU(data);
#ifndef _NOCHECK
    data_free_CPU(check_data);
#endif

    return EXIT_SUCCESS;
}
Example #4
0
/* One-time bring-up of the emulated hardware units.
 * NOTE(review): the call order suggests the memory map must exist
 * before the GPU/timer/interrupt units are initialized -- confirm
 * against the individual init routines before reordering. */
void supervision_init(void)
{
	//fprintf(log_get(), "supervision: init\n");
	#ifndef DEBUG
	//iprintf("supervision: init\n");
	#endif

	memorymap_init();
	gpu_init();
	timer_init();
	/*sound_init();*/ /* sound subsystem currently disabled */
	interrupts_init();
}
Example #5
0
/* Emulator entry point: load the ROM given on the command line, run
 * the CPU/GPU/joypad loop one video frame at a time with basic frame
 * pacing, and tear everything down when the CPU requests exit. */
int main(int argc, char * argv[]) {
    // Exactly one argument (the ROM path) is required.
    if (argc != 2) {
        usage(argv[0]);
        exit(1);
    }

    if (SDL_Init(SDL_INIT_VIDEO) != 0) {
        printf("Echec init : %s\n", SDL_GetError());
        exit(1);
    }

    GBContext   *cpu  = cpu_init(argv[1]);
    GPUContext  *gpu  = gpu_init(cpu);
    JOYPContext *joyp = joyp_init(cpu);

    uint8_t opcode;
    unsigned int frame_begin, frame_end, frame_cost;
    while (!cpu->exit) {
        frame_begin = SDL_GetTicks();

        // Emulate until one frame's worth of cycles has elapsed.
        while (cpu->t_clock < VSYNC_CYCLES) {
            opcode = read8(cpu, cpu->PC++);
            execute(cpu, opcode);

            // "realtime" bookkeeping must run after every instruction.
            cpu_ctx_update(cpu);
            gpu_ctx_update(cpu, gpu);
            joyp_ctx_update(cpu, joyp);
            handle_interrupt(cpu);
        }
        cpu->t_clock = 0;

        // Basic frame pacing: sleep away whatever is left of the frame.
        frame_end = SDL_GetTicks();
        frame_cost = frame_end - frame_begin;
        if (frame_cost < VSYNC_TIME_MS) {
            SDL_Delay(VSYNC_TIME_MS - frame_cost);
        }

        gpu_render(cpu, gpu);
    }

    joyp_destroy(joyp);
    gpu_destroy(gpu);
    cpu_destroy(cpu);
    SDL_Quit();

    return 0;
}
Example #6
0
/* Kernel entry point. Brings up logging, the GPU framebuffer and the
 * text console, then writes some smoke-test output to the screen.
 * NOTE(review): r0/r1/atags look like the ARM boot-protocol registers
 * handed over by the bootloader; they are unused here -- confirm. */
void kmain(uint32_t r0, uint32_t r1, uint32_t atags) {
    // Enable logging, the first thing we do
    log_init();

    // Initialize the GPU at 800x600 resolution
    gpu_init(800, 600);
    // Clear to a black color
    gpu_clear(0);

    // Test to write a character
    gpu_putchar('A', 10, 10, RGB(255, 0, 0));

    // Test the console (same resolution as the GPU surface)
    console_init(800, 600);
    console_puts("ABCDEFGHIJKLMNOPQRSTUVXYZ abcdefghijklmnopqrstuvxyz \n0123456789\n");
    console_puts("Hejj\b du!\n");
}
Example #7
0
/* Program-start hook: initialize every module (RNGs, node grid,
 * cells, ghosts, interaction tables, optional GPU/ADRESS/ELP3M
 * subsystems), then run master-node-only setup and register the Tcl
 * commands. Runs on every node; returns TCL_OK unconditionally. */
int on_program_start(Tcl_Interp *interp)
{
  EVENT_TRACE(fprintf(stderr, "%d: on_program_start\n", this_node));

#ifdef CUDA
  gpu_init();
#endif

  /*
    call the initialization of the modules here
  */
  init_random();
  init_bit_random();

  setup_node_grid();
  /* calculate initial minimimal number of cells (see tclcallback_min_num_cells) */
  min_num_cells = calc_processor_min_num_cells();

  cells_pre_init();
  ghost_init();
  /* Initialise force and energy tables */
  force_and_energy_tables_init();
#ifdef ADRESS
#ifdef INTERFACE_CORRECTION
  adress_force_and_energy_tables_init();
#endif
  /** #ifdef THERMODYNAMIC_FORCE */
  tf_tables_init();
  /** #endif */
#endif
#ifdef ELP3M
  fft_pre_init();
#endif

  /*
    call all initializations to do only on the master node here.
  */
  if (this_node == 0) {
    /* interaction_data.c: make sure 0<->0 ia always exists */
    make_particle_type_exist(0);
    
    init_tcl(interp);
  }
  return TCL_OK;
}
Example #8
0
/*
 * One-time host-side initialization: open the kgpu device, allocate
 * and pin the host<->GPU staging buffer, bring up the GPU runtime,
 * mmap the device's shared area, and register the staging buffer with
 * the kernel via ioctl. Aborts on any failure; returns 0 on success.
 */
static int kh_init(void)
{
    int  r;
    void *p;

    devfd = ssc(open(kgpudev, O_RDWR));

    /* alloc GPU pinned memory buffers; the extra PAGE_SIZE presumably
     * leaves room for alignment -- NOTE(review): confirm against
     * gpu_alloc_pinned_mem(). Previously the result was memset without
     * a NULL check. */
    p = (void*)gpu_alloc_pinned_mem(KGPU_BUF_SIZE+PAGE_SIZE);
    if (!p) {
	kh_log(KGPU_LOG_ERROR, "pinned memory allocation failed\n");
	abort();
    }
    hostbuf.uva = p;
    hostbuf.size = KGPU_BUF_SIZE;
    dbg("%p \n", hostbuf.uva);
    memset(hostbuf.uva, 0, KGPU_BUF_SIZE);
    ssc( mlock(hostbuf.uva, KGPU_BUF_SIZE));

    gpu_init();

    hostvma.uva = (void*)mmap(
	NULL, KGPU_BUF_SIZE, PROT_READ|PROT_WRITE, MAP_SHARED, devfd, 0);
    hostvma.size = KGPU_BUF_SIZE;
    if (hostvma.uva == MAP_FAILED) {
	kh_log(KGPU_LOG_ERROR,
		 "set up mmap area failed\n");
	perror("mmap for GPU");
	abort();
    }
    /* %lX expects an unsigned long; passing the void* directly was
     * undefined behavior, so cast explicitly. */
    kh_log(KGPU_LOG_PRINT,
	   "mmap start 0x%lX\n", (unsigned long)hostvma.uva);

    /* tell kernel the buffers */
    r = ioctl(devfd, KGPU_IOC_SET_GPU_BUFS, (unsigned long)&hostbuf);
    if (r < 0) {
	perror("Write req file for buffers.");
	abort();
    }

    return 0;
}
Example #9
0
/* Framebuffer smoke test: bring up the GPU, record its geometry,
 * install a SIGINT handler, draw the test pattern, and clean up. */
int main(void)
{
	int rc = gpu_init(&gpu);
	if (rc < 0)
	{
		// Nothing has been set up yet, so just report and bail.
		fprintf(stderr, "Failed to initialize GPU\n");
		return -1;
	}

	// Cache the framebuffer geometry; 4 bytes per pixel.
	width  = gpu.res.xres;
	height = gpu.res.yres;
	pitch  = width * 4;
	signal(SIGINT, sigint_handler);

	// Exercise the GPU with the test pattern.
	gfx_test( &gpu, 0xfeed0007 );

	gpu_cleanup(&gpu);

	return 0;
}
Example #10
0
// simengine_runmodel()
//
//    executes the model for the given parameters, states and simulation time
simengine_result *simengine_runmodel(simengine_opts *opts){
  double start_time = opts->start_time;
  double stop_time = opts->stop_time;
  unsigned int num_models = opts->num_models;
  const char *outputs_dirname = opts->outputs_dirname;

  CDATAFORMAT model_states[PARALLEL_MODELS * NUM_STATES];
  unsigned int stateid;
  unsigned int modelid;

  unsigned int models_executed;
  unsigned int models_per_batch;

  double *progress;
  int progress_fd;

  int output_fd;

  int resuming = 0;
  int random_initialized = 0;

# if defined TARGET_GPU
  gpu_init();
# endif

  open_progress_file(outputs_dirname, &progress, &progress_fd, num_models);
	     
  // Create result structure
  simengine_result *seresult = (simengine_result*)malloc(sizeof(simengine_result));
	     
  // Couldn't allocate return structure, return NULL
  if(!seresult) return NULL;
	     
  if(seint.num_states){
    seresult->final_states = (double*)malloc(num_models * seint.num_states * sizeof(double));
  }
  else{
    seresult->final_states = NULL;
  }
  seresult->final_time = (double*)malloc(num_models * sizeof(double));
  if((seint.num_states && !seresult->final_states) ||!seresult->final_time){
    seresult->status = ERRMEM;
    seresult->status_message = (char*) simengine_errors[ERRMEM];
    seresult->final_states = NULL;
    seresult->final_time = NULL;
    return seresult;
  }

  init_output_buffers(outputs_dirname, &output_fd);

  // Run the parallel simulation repeatedly until all requested models have been executed
  for(models_executed = 0 ; models_executed < num_models; models_executed += PARALLEL_MODELS){
    models_per_batch = MIN(num_models - models_executed, PARALLEL_MODELS);
    
    // Copy inputs and state initial values to internal representation
    unsigned int modelid_offset = global_modelid_offset + models_executed;
#if NUM_CONSTANT_INPUTS > 0
#if defined TARGET_GPU
    host_constant_inputs = (CDATAFORMAT *)malloc(PARALLEL_MODELS * NUM_CONSTANT_INPUTS * sizeof(CDATAFORMAT));
#else
    host_constant_inputs = constant_inputs;
#endif
#else
    CDATAFORMAT *host_constant_inputs = NULL;
#endif

#if NUM_SAMPLED_INPUTS > 0
#if defined TARGET_GPU
    host_sampled_inputs = (sampled_input_t *)malloc(STRUCT_SIZE * NUM_SAMPLED_INPUTS * sizeof(sampled_input_t));
#else
    host_sampled_inputs = sampled_inputs;
#endif
#else
    sampled_input_t *host_sampled_inputs = NULL;
#endif

    resuming = initialize_states(model_states, outputs_dirname, num_models, models_per_batch, modelid_offset);
    initialize_inputs(host_constant_inputs, host_sampled_inputs, outputs_dirname, num_models, models_per_batch, modelid_offset, start_time);

#if defined TARGET_GPU && NUM_CONSTANT_INPUTS > 0
    CDATAFORMAT *g_constant_inputs;
    cutilSafeCall(cudaGetSymbolAddress((void **)&g_constant_inputs, constant_inputs));
    cutilSafeCall(cudaMemcpy(g_constant_inputs, host_constant_inputs, PARALLEL_MODELS * NUM_CONSTANT_INPUTS * sizeof(CDATAFORMAT), cudaMemcpyHostToDevice));
#endif

#if defined TARGET_GPU && NUM_SAMPLED_INPUTS > 0
    sampled_input_t *g_sampled_inputs;
    cutilSafeCall(cudaGetSymbolAddress((void **)&g_sampled_inputs, sampled_inputs));
    cutilSafeCall(cudaMemcpy(g_sampled_inputs, host_sampled_inputs, STRUCT_SIZE * NUM_SAMPLED_INPUTS * sizeof(sampled_input_t), cudaMemcpyHostToDevice));
#endif

    // Initialize the solver properties and internal simulation memory structures
    solver_props *props = init_solver_props(start_time, stop_time, models_per_batch, model_states, models_executed+global_modelid_offset);

    // Initialize random number generator
    if (!random_initialized || opts->seeded) {
      random_init(models_per_batch);
      random_initialized = 1;
    }

    // If no initial states were passed in
    if(!resuming){
      if(seint.num_states > 0){
	// Initialize default states in next_states
	for(modelid=0;modelid<models_per_batch;modelid++){
	  init_states(props, modelid);
	  // Copy states from next_states to model_states
	  unsigned int iterid;
	  for(iterid=0;iterid<seint.num_iterators;iterid++){
	    solver_writeback(&props[iterid], modelid);
	  }
	}
      }
    }

    // Run the model
    seresult->status = exec_loop(props, outputs_dirname, progress + models_executed, resuming);
    seresult->status_message = (char*) simengine_errors[seresult->status];

    // Copy the final time from simulation
    for(modelid=0; modelid<models_per_batch; modelid++){
      seresult->final_time[models_executed + modelid] = props->time[modelid]; // Time from the first solver
    }

    // Free all internal simulation memory and make sure that model_states has the final state values
    free_solver_props(props, model_states);

    // Copy state values back to state initial value structure
    for(modelid=0; modelid<models_per_batch; modelid++){
      for(stateid=0;stateid<seint.num_states;stateid++){
	seresult->final_states[AS_IDX(seint.num_states, num_models, stateid, models_executed + modelid)] = model_states[TARGET_IDX(seint.num_states, PARALLEL_MODELS, stateid, modelid)];
      }
    }
  }

  close_progress_file(progress, progress_fd, num_models);
  clean_up_output_buffers(output_fd);

  return seresult;
}
Example #11
0
File: m2s.c Project: abhaykadam/vm
/* Multi2Sim entry point. Parses the command line, runs any requested
 * one-shot tools (disassemblers, visualizers, network sim), sets up
 * debug/trace categories, initializes functional and (optionally)
 * detailed CPU/GPU simulation, loads the guest programs, runs the
 * simulation loop, and finalizes everything in reverse order. */
int main(int argc, char **argv)
{
	/* Initial information */
	fprintf(stderr, "\n");
	fprintf(stderr, "; Multi2Sim %s - A Simulation Framework for CPU-GPU Heterogeneous Computing\n",
		VERSION);
	fprintf(stderr, "; Please use command 'm2s --help' for a list of command-line options.\n");
	fprintf(stderr, "; Last compilation: %s %s\n", __DATE__, __TIME__);
	fprintf(stderr, "\n");

	/* Read command line */
	sim_read_command_line(&argc, argv);

	/* CPU disassembler tool */
	if (*cpu_disasm_file_name)
		ke_disasm(cpu_disasm_file_name);

	/* GPU disassembler tool */
	if (*gpu_disasm_file_name)
		gk_disasm(gpu_disasm_file_name);

	/* OpenGL disassembler tool */
	if (*opengl_disasm_file_name)
		gl_disasm(opengl_disasm_file_name, opengl_disasm_shader_index);	

	/* GPU visualization tool */
	if (*gpu_visual_file_name)
		vgpu_run(gpu_visual_file_name);
	
	/* Memory hierarchy visualization tool */
	if (*visual_file_name)
		vmem_run(visual_file_name);

	/* Network simulation tool */
	if (*net_sim_network_name)
		net_sim(net_debug_file_name);

	/* Debug: one category per subsystem, each with its own log file */
	debug_init();
	isa_inst_debug_category = debug_new_category(isa_inst_debug_file_name);
	isa_call_debug_category = debug_new_category(isa_call_debug_file_name);
	elf_debug_category = debug_new_category(elf_debug_file_name);
	net_debug_category = debug_new_category(net_debug_file_name);
	ld_debug_category = debug_new_category(loader_debug_file_name);
	sys_debug_category = debug_new_category(syscall_debug_file_name);
	ctx_debug_category = debug_new_category(ctx_debug_file_name);
	mem_debug_category = debug_new_category(mem_debug_file_name);
	opencl_debug_category = debug_new_category(opencl_debug_file_name);
	gpu_isa_debug_category = debug_new_category(gpu_isa_debug_file_name);
	gpu_stack_debug_category = debug_new_category(gpu_stack_debug_file_name);  /* GPU-REL */
	gpu_faults_debug_category = debug_new_category(gpu_faults_debug_file_name);  /* GPU-REL */
	gpu_pipeline_debug_category = debug_new_category(gpu_pipeline_debug_file_name);
	error_debug_category = debug_new_category(error_debug_file_name);
	esim_debug_init(esim_debug_file_name);

	/* Trace */
	trace_init(trace_file_name);
	mem_trace_category = trace_new_category();

	/* Initialization for functional simulation */
	esim_init();
	ke_init();
	net_init();

	/* Initialization for detailed simulation */
	if (cpu_sim_kind == cpu_sim_detailed)
		cpu_init();
	if (gpu_sim_kind == gpu_sim_detailed)
		gpu_init();

	/* Memory hierarchy initialization, done after we initialized CPU cores
	 * and GPU compute units. */
	mem_system_init();

	/* Load programs */
	cpu_load_progs(argc, argv, ctxconfig_file_name);

	/* Simulation loop: only run if the loader produced runnable contexts */
	if (ke->running_list_head)
	{
		if (cpu_sim_kind == cpu_sim_detailed)
			cpu_run();
		else
			ke_run();
	}

	/* Flush event-driven simulation */
	esim_process_all_events(0);

	/* Dump statistics summary */
	sim_stats_summary();

	/* Finalization of memory system */
	mem_system_done();

	/* Finalization of detailed CPU simulation */
	if (cpu_sim_kind == cpu_sim_detailed)
	{
		esim_debug_done();
		cpu_done();
	}

	/* Finalization of detailed GPU simulation */
	if (gpu_sim_kind == gpu_sim_detailed)
		gpu_done();

	/* Finalization */
	net_done();
	esim_done();
	trace_done();
	ke_done();
	debug_done();
	mhandle_done();

	/* End */
	return 0;
}
Example #12
0
// Construct the GPU-backed autoencoder: set up the OpenCL environment,
// allocate device buffers for weights, biases, layer activations/errors
// and their deltas, build the kernel program from source, create the
// kernel objects, and upload the initial weights/biases inherited from
// the base-class constructor.
autoencoder_GPU::autoencoder_GPU():autoencoder(){

	// initialize the OpenCL environment (device 0)
	gpu_init(gpu_env, 0);

	// Inter-layer weight matrices
	d_weight0 = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize0 * nLayerSize1 * sizeof(floatType), NULL, &gpu_env.status);
	d_weight1 = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize1 * nLayerSize2 * sizeof(floatType), NULL, &gpu_env.status);
	d_weight2 = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize2 * nLayerSize3 * sizeof(floatType), NULL, &gpu_env.status);
	d_weight3 = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize3 * nLayerSize4 * sizeof(floatType), NULL, &gpu_env.status);
	d_weight4 = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize4 * nLayerSize5 * sizeof(floatType), NULL, &gpu_env.status);
	d_weight5 = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize5 * nLayerSize6 * sizeof(floatType), NULL, &gpu_env.status);
	d_weight6 = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize6 * nLayerSize7 * sizeof(floatType), NULL, &gpu_env.status);
	d_weight7 = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize7 * nLayerSize8 * sizeof(floatType), NULL, &gpu_env.status);

	// Per-layer bias vectors
	d_bias0 = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize1 * sizeof(floatType), NULL, &gpu_env.status);
	d_bias1 = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize2 * sizeof(floatType), NULL, &gpu_env.status);
	d_bias2 = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize3 * sizeof(floatType), NULL, &gpu_env.status);
	d_bias3 = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize4 * sizeof(floatType), NULL, &gpu_env.status);
	d_bias4 = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize5 * sizeof(floatType), NULL, &gpu_env.status);
	d_bias5 = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize6 * sizeof(floatType), NULL, &gpu_env.status);
	d_bias6 = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize7 * sizeof(floatType), NULL, &gpu_env.status);
	d_bias7 = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize8 * sizeof(floatType), NULL, &gpu_env.status);

	// Per-layer activations and back-propagated errors (batch-sized)
	d_layer0act = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize0 * nVectorPerBatch * sizeof(floatType), NULL, &gpu_env.status);
	d_layer0err = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize0 * nVectorPerBatch * sizeof(floatType), NULL, &gpu_env.status);
	d_layer1act = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize1 * nVectorPerBatch * sizeof(floatType), NULL, &gpu_env.status);
	d_layer1err = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize1 * nVectorPerBatch * sizeof(floatType), NULL, &gpu_env.status);
	d_layer2act = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize2 * nVectorPerBatch * sizeof(floatType), NULL, &gpu_env.status);
	d_layer2err = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize2 * nVectorPerBatch * sizeof(floatType), NULL, &gpu_env.status);
	d_layer3act = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize3 * nVectorPerBatch * sizeof(floatType), NULL, &gpu_env.status);
	d_layer3err = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize3 * nVectorPerBatch * sizeof(floatType), NULL, &gpu_env.status);
	d_layer4act = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize4 * nVectorPerBatch * sizeof(floatType), NULL, &gpu_env.status);
	d_layer4err = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize4 * nVectorPerBatch * sizeof(floatType), NULL, &gpu_env.status);
	d_layer4state = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize4 * nVectorPerBatch * sizeof(floatType), NULL, &gpu_env.status);
	d_layer5act = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize5 * nVectorPerBatch * sizeof(floatType), NULL, &gpu_env.status);
	d_layer5err = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize5 * nVectorPerBatch * sizeof(floatType), NULL, &gpu_env.status);
	d_layer6act = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize6 * nVectorPerBatch * sizeof(floatType), NULL, &gpu_env.status);
	d_layer6err = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize6 * nVectorPerBatch * sizeof(floatType), NULL, &gpu_env.status);
	d_layer7act = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize7 * nVectorPerBatch * sizeof(floatType), NULL, &gpu_env.status);
	d_layer7err = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize7 * nVectorPerBatch * sizeof(floatType), NULL, &gpu_env.status);
	d_layer8act = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize8 * nVectorPerBatch * sizeof(floatType), NULL, &gpu_env.status);
	d_layer8err = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize8 * nVectorPerBatch * sizeof(floatType), NULL, &gpu_env.status);

	// Weight update accumulators
	d_delta_weight0 = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize0 * nLayerSize1 * sizeof(floatType), NULL, &gpu_env.status);
	d_delta_weight1 = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize1 * nLayerSize2 * sizeof(floatType), NULL, &gpu_env.status);
	d_delta_weight2 = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize2 * nLayerSize3 * sizeof(floatType), NULL, &gpu_env.status);
	d_delta_weight3 = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize3 * nLayerSize4 * sizeof(floatType), NULL, &gpu_env.status);
	d_delta_weight4 = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize4 * nLayerSize5 * sizeof(floatType), NULL, &gpu_env.status);
	d_delta_weight5 = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize5 * nLayerSize6 * sizeof(floatType), NULL, &gpu_env.status);
	d_delta_weight6 = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize6 * nLayerSize7 * sizeof(floatType), NULL, &gpu_env.status);
	d_delta_weight7 = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize7 * nLayerSize8 * sizeof(floatType), NULL, &gpu_env.status);

	// Bias update accumulators
	d_delta_bias0 = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize1 * sizeof(floatType), NULL, &gpu_env.status);
	d_delta_bias1 = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize2 * sizeof(floatType), NULL, &gpu_env.status);
	d_delta_bias2 = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize3 * sizeof(floatType), NULL, &gpu_env.status);
	d_delta_bias3 = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize4 * sizeof(floatType), NULL, &gpu_env.status);
	d_delta_bias4 = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize5 * sizeof(floatType), NULL, &gpu_env.status);
	d_delta_bias5 = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize6 * sizeof(floatType), NULL, &gpu_env.status);
	d_delta_bias6 = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize7 * sizeof(floatType), NULL, &gpu_env.status);
	d_delta_bias7 = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize8 * sizeof(floatType), NULL, &gpu_env.status);

	// error vector
	d_error = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize0 * nVectorPerBatch * sizeof(floatType), NULL, &gpu_env.status);

	// transfer data from CPU to GPU, TO DO

	// build OpenCL kernels from source; the source buffer is only
	// needed until the program object has been created
	char* source = new char[KERNEL_SOURCE_LENGTH];
	loadKernelSource("../src/gpu_rbm.cl", source);
	gpu_env.prog = clCreateProgramWithSource(gpu_env.ctx, 1, (const char**)&source, NULL, &gpu_env.status);
	delete[] source;   // previously leaked

	gpu_env.status = clBuildProgram(gpu_env.prog, 0, NULL, NULL, NULL, NULL);

	if (gpu_env.status == CL_BUILD_PROGRAM_FAILURE) {
		// Determine the size of the log
		size_t log_size;
		clGetProgramBuildInfo(gpu_env.prog, gpu_env.device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);

		// Allocate memory for the log
		char *log = (char *) malloc(log_size);

		// Get the log
		clGetProgramBuildInfo(gpu_env.prog, gpu_env.device, CL_PROGRAM_BUILD_LOG, log_size, log, NULL);

		// Print the log, then exit with a failure status (previously
		// this exited with 0, signalling success to the caller)
		printf("%s\n", log);
		free(log);

		exit(1);
	}

	// Create one kernel object per entry point in the program
	squareError		= clCreateKernel(gpu_env.prog, "squareError", &gpu_env.status);
	sigmoid			= clCreateKernel(gpu_env.prog, "sigmoid", &gpu_env.status);
	addBias			= clCreateKernel(gpu_env.prog, "addBias", &gpu_env.status);
	sumBatch		= clCreateKernel(gpu_env.prog, "sumBatch", &gpu_env.status);
	add				= clCreateKernel(gpu_env.prog, "add", &gpu_env.status);
	getStates		= clCreateKernel(gpu_env.prog, "getStates", &gpu_env.status);
	updateWeights	= clCreateKernel(gpu_env.prog, "updateWeights", &gpu_env.status);
	updateBias		= clCreateKernel(gpu_env.prog, "updateBias", &gpu_env.status);
	randNum			= clCreateKernel(gpu_env.prog, "PRNG_threefry4x32", &gpu_env.status);
	randn			= clCreateKernel(gpu_env.prog, "PRNGn_threefry4x32", &gpu_env.status);
	reset			= clCreateKernel(gpu_env.prog, "reset", &gpu_env.status);
	rounding		= clCreateKernel(gpu_env.prog, "rounding", &gpu_env.status);
	subtract		= clCreateKernel(gpu_env.prog, "subtract", &gpu_env.status);
	deriv			= clCreateKernel(gpu_env.prog, "deriv", &gpu_env.status);
	updateAE		= clCreateKernel(gpu_env.prog, "update", &gpu_env.status);

	// Upload the host-side initial weights (blocking writes)
	gpu_env.status = clEnqueueWriteBuffer(gpu_env.queue, d_weight0, CL_TRUE, 0, nLayerSize0 * nLayerSize1 * sizeof(floatType), (void*)weight0, 0, NULL, NULL);
	gpu_env.status = clEnqueueWriteBuffer(gpu_env.queue, d_weight1, CL_TRUE, 0, nLayerSize1 * nLayerSize2 * sizeof(floatType), (void*)weight1, 0, NULL, NULL);
	gpu_env.status = clEnqueueWriteBuffer(gpu_env.queue, d_weight2, CL_TRUE, 0, nLayerSize2 * nLayerSize3 * sizeof(floatType), (void*)weight2, 0, NULL, NULL);
	gpu_env.status = clEnqueueWriteBuffer(gpu_env.queue, d_weight3, CL_TRUE, 0, nLayerSize3 * nLayerSize4 * sizeof(floatType), (void*)weight3, 0, NULL, NULL);
	gpu_env.status = clEnqueueWriteBuffer(gpu_env.queue, d_weight4, CL_TRUE, 0, nLayerSize4 * nLayerSize5 * sizeof(floatType), (void*)weight4, 0, NULL, NULL);
	gpu_env.status = clEnqueueWriteBuffer(gpu_env.queue, d_weight5, CL_TRUE, 0, nLayerSize5 * nLayerSize6 * sizeof(floatType), (void*)weight5, 0, NULL, NULL);
	gpu_env.status = clEnqueueWriteBuffer(gpu_env.queue, d_weight6, CL_TRUE, 0, nLayerSize6 * nLayerSize7 * sizeof(floatType), (void*)weight6, 0, NULL, NULL);
	gpu_env.status = clEnqueueWriteBuffer(gpu_env.queue, d_weight7, CL_TRUE, 0, nLayerSize7 * nLayerSize8 * sizeof(floatType), (void*)weight7, 0, NULL, NULL);

	// Upload the host-side initial biases (blocking writes)
	gpu_env.status = clEnqueueWriteBuffer(gpu_env.queue, d_bias0, CL_TRUE, 0, nLayerSize1 * sizeof(floatType), (void*)bias0, 0, NULL, NULL);
	gpu_env.status = clEnqueueWriteBuffer(gpu_env.queue, d_bias1, CL_TRUE, 0, nLayerSize2 * sizeof(floatType), (void*)bias1, 0, NULL, NULL);
	gpu_env.status = clEnqueueWriteBuffer(gpu_env.queue, d_bias2, CL_TRUE, 0, nLayerSize3 * sizeof(floatType), (void*)bias2, 0, NULL, NULL);
	gpu_env.status = clEnqueueWriteBuffer(gpu_env.queue, d_bias3, CL_TRUE, 0, nLayerSize4 * sizeof(floatType), (void*)bias3, 0, NULL, NULL);
	gpu_env.status = clEnqueueWriteBuffer(gpu_env.queue, d_bias4, CL_TRUE, 0, nLayerSize5 * sizeof(floatType), (void*)bias4, 0, NULL, NULL);
	gpu_env.status = clEnqueueWriteBuffer(gpu_env.queue, d_bias5, CL_TRUE, 0, nLayerSize6 * sizeof(floatType), (void*)bias5, 0, NULL, NULL);
	gpu_env.status = clEnqueueWriteBuffer(gpu_env.queue, d_bias6, CL_TRUE, 0, nLayerSize7 * sizeof(floatType), (void*)bias6, 0, NULL, NULL);
	gpu_env.status = clEnqueueWriteBuffer(gpu_env.queue, d_bias7, CL_TRUE, 0, nLayerSize8 * sizeof(floatType), (void*)bias7, 0, NULL, NULL);

}