int exec_loop(solver_props *props){
    int i;
    int status = SUCCESS;

    // Initialize solvers for all iterators
#if defined(TARGET_GPU)
    gpu_init();
#endif
    for(i=0;i<NUM_ITERATORS;i++){
        solver_init(&props[i]);
    }

    // Execute the model(s) on the appropriate target
#if defined(TARGET_CPU)
    status = exec_cpu(props, 0);
#elif defined(TARGET_OPENMP)
    status = exec_parallel_cpu(props);
#elif defined(TARGET_GPU)
    status = exec_parallel_gpu(props);
#else
#error Invalid target
#endif

    // Free solvers for all iterators
    for(i=0;i<NUM_ITERATORS;i++){
        solver_free(&props[i]);
    }
#if defined(TARGET_GPU)
    gpu_exit();
#endif

    return status;
}
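/* A minimal caller sketch for exec_loop(), assuming solver_props and
   NUM_ITERATORS are defined elsewhere in this codebase; the zeroed
   initialization and the run_simulation() wrapper are illustrative only: */
static int run_simulation(void) {
    solver_props props[NUM_ITERATORS];
    memset(props, 0, sizeof(props));    /* requires <string.h> */
    /* ... populate time ranges, state pointers, etc. per iterator ... */
    return exec_loop(props);            /* init, run on target, free */
}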
/*------------------------------------------------------------------------*/
uint32 poly_stage1_run(msieve_obj *obj, poly_stage1_t *data)
{
    bounds_t bounds;
    poly_search_t poly;

#ifdef HAVE_CUDA
    gpu_config_t gpu_config;

    gpu_init(&gpu_config);
    if (gpu_config.num_gpu == 0) {
        printf("error: no CUDA-enabled GPUs found\n");
        exit(-1);
    }
    if (obj->which_gpu >= (uint32)gpu_config.num_gpu) {
        printf("error: GPU %u does not exist "
               "or is not CUDA-enabled\n", obj->which_gpu);
        exit(-1);
    }
    logprintf(obj, "using GPU %u (%s)\n", obj->which_gpu,
              gpu_config.info[obj->which_gpu].name);
    poly.gpu_info = gpu_config.info + obj->which_gpu;
#endif

    stage1_bounds_init(&bounds, data);
    poly_search_init(&poly, data);

    search_coeffs(obj, &poly, &bounds, data->deadline);

    poly_search_free(&poly);
    stage1_bounds_free(&bounds);
    return 1;
}
int main(int argc, char **argv)
{
    set_program_name(argv[0]);

    if (argc < 2) {
        warning(0, "too few arguments");
        print_usage();
        exit(EXIT_FAILURE);
    }

    int N = atoi(argv[1]);
    if (!N)
        error(0, "invalid argument: %s", argv[1]);

    char *kernel_env = getenv("KERNEL");
    int kernel_id = 0;
    if (kernel_env)
        kernel_id = atoi(kernel_env);

    // Initialize the runtime up front, so the first kernel call does not pay the cost
    if (kernel_id > CPU_PARALLEL) {
        printf("Initializing CUDA runtime ... ");
        fflush(stdout);
        gpu_init();
        printf("DONE\n");
    }

    kernel_data_t *data = NULL;
    data = data_create_CPU(N);
    data_init(data);

#ifndef _NOCHECK
    // Run the reference kernel to produce data for the correctness check
    kernel_data_t *check_data = NULL;
    check_data = data_create_CPU(N);
    data_copy(check_data, data);
    kernels[0].fn(check_data);
#endif

    // Run and time the selected kernel
    printf("Launching kernel: %s\n", kernels[kernel_id].descr);
    fflush(stdout);

    xtimer_t timer;
    timer_clear(&timer);
    timer_start(&timer);
    kernels[kernel_id].fn(data);
    timer_stop(&timer);
    report_results(&timer, N*N);

#ifndef _NOCHECK
    check_result(data, check_data);
#endif

    // Cleanup
    data_free_CPU(data);
#ifndef _NOCHECK
    data_free_CPU(check_data);
#endif
    return EXIT_SUCCESS;
}
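/* xtimer_t and its helpers are not defined in this excerpt. A minimal sketch
   of a compatible implementation, assuming the accumulating clear/start/stop
   semantics used above (POSIX clock_gettime from <time.h>; the names match
   the calls above, but the real implementation may differ): */
typedef struct { struct timespec start; double elapsed_sec; } xtimer_t;
static void timer_clear(xtimer_t *t) { t->elapsed_sec = 0.0; }
static void timer_start(xtimer_t *t) { clock_gettime(CLOCK_MONOTONIC, &t->start); }
static void timer_stop(xtimer_t *t) {
    struct timespec now;
    clock_gettime(CLOCK_MONOTONIC, &now);
    t->elapsed_sec += (now.tv_sec - t->start.tv_sec)
                    + (now.tv_nsec - t->start.tv_nsec) * 1e-9;
}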
void supervision_init(void)
{
    //fprintf(log_get(), "supervision: init\n");
#ifndef DEBUG
    //iprintf("supervision: init\n");
#endif
    memorymap_init();
    gpu_init();
    timer_init();
    /*sound_init();*/
    interrupts_init();
}
int main(int argc, char *argv[])
{
    if (argc != 2) {
        usage(argv[0]);
        exit(1);
    }

    if (SDL_Init(SDL_INIT_VIDEO) != 0) {
        printf("Init failed: %s\n", SDL_GetError());
        exit(1);
    }

    GBContext *cpu = cpu_init(argv[1]);
    GPUContext *gpu = gpu_init(cpu);
    JOYPContext *joyp = joyp_init(cpu);

    uint8_t op;
    unsigned int start_time, last_time, elapsed_time;

    while (!cpu->exit) {
        start_time = SDL_GetTicks();
        while (cpu->t_clock < VSYNC_CYCLES) {
            op = read8(cpu, cpu->PC++);
            execute(cpu, op);
            // "realtime" operations must happen here
            cpu_ctx_update(cpu);
            gpu_ctx_update(cpu, gpu);
            joyp_ctx_update(cpu, joyp);
            handle_interrupt(cpu);
        }
        cpu->t_clock = 0;

        last_time = SDL_GetTicks();
        elapsed_time = last_time - start_time;
        if (elapsed_time < VSYNC_TIME_MS) { // basic frame-rate wait loop
            SDL_Delay(VSYNC_TIME_MS - elapsed_time);
        }
        gpu_render(cpu, gpu);
        // printf("elapsed: %d\n", elapsed_time);
    }

    joyp_destroy(joyp);
    gpu_destroy(gpu);
    cpu_destroy(cpu);
    SDL_Quit();
    return 0;
}
void kmain(uint32_t r0, uint32_t r1, uint32_t atags)
{
    // Enable logging first
    log_init();

    // Initialize the GPU at 800x600 resolution
    gpu_init(800, 600);

    // Clear the screen to black
    gpu_clear(0);

    // Test writing a single character
    gpu_putchar('A', 10, 10, RGB(255, 0, 0));

    // Test the console
    console_init(800, 600);
    console_puts("ABCDEFGHIJKLMNOPQRSTUVXYZ abcdefghijklmnopqrstuvxyz \n0123456789\n");
    console_puts("Hejj\b du!\n");
}
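/* RGB() is used above but not defined in this excerpt. A minimal sketch,
   assuming gpu_init() negotiated a 32-bit 0xAARRGGBB framebuffer; the actual
   packing depends on the pixel format the GPU reports (requires <stdint.h>): */
#define RGB(r, g, b) ((uint32_t)(0xFF000000u | ((uint32_t)(r) << 16) | \
                                 ((uint32_t)(g) << 8) | (uint32_t)(b)))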
int on_program_start(Tcl_Interp *interp)
{
    EVENT_TRACE(fprintf(stderr, "%d: on_program_start\n", this_node));

#ifdef CUDA
    gpu_init();
#endif

    /* call the initialization of the modules here */
    init_random();
    init_bit_random();

    setup_node_grid();
    /* calculate initial minimal number of cells (see tclcallback_min_num_cells) */
    min_num_cells = calc_processor_min_num_cells();

    cells_pre_init();
    ghost_init();
    /* Initialise force and energy tables */
    force_and_energy_tables_init();
#ifdef ADRESS
#ifdef INTERFACE_CORRECTION
    adress_force_and_energy_tables_init();
#endif
    /** #ifdef THERMODYNAMIC_FORCE */
    tf_tables_init();
    /** #endif */
#endif
#ifdef ELP3M
    fft_pre_init();
#endif

    /* call all initializations to do only on the master node here. */
    if (this_node == 0) {
        /* interaction_data.c: make sure the 0<->0 interaction always exists */
        make_particle_type_exist(0);

        init_tcl(interp);
    }
    return TCL_OK;
}
static int kh_init(void)
{
    int len, r;
    void *p;

    devfd = ssc(open(kgpudev, O_RDWR));

    /* alloc GPU pinned memory buffers */
    p = (void*)gpu_alloc_pinned_mem(KGPU_BUF_SIZE + PAGE_SIZE);
    hostbuf.uva = p;
    hostbuf.size = KGPU_BUF_SIZE;
    dbg("%p \n", hostbuf.uva);
    memset(hostbuf.uva, 0, KGPU_BUF_SIZE);
    ssc(mlock(hostbuf.uva, KGPU_BUF_SIZE));

    gpu_init();

    hostvma.uva = (void*)mmap(NULL, KGPU_BUF_SIZE, PROT_READ|PROT_WRITE,
                              MAP_SHARED, devfd, 0);
    hostvma.size = KGPU_BUF_SIZE;
    if (hostvma.uva == MAP_FAILED) {
        kh_log(KGPU_LOG_ERROR, "set up mmap area failed\n");
        perror("mmap for GPU");
        abort();
    }
    kh_log(KGPU_LOG_PRINT, "mmap start 0x%lX\n", (unsigned long)hostvma.uva);

    len = sizeof(struct kgpu_gpu_mem_info);

    /* tell the kernel about the buffers */
    r = ioctl(devfd, KGPU_IOC_SET_GPU_BUFS, (unsigned long)&hostbuf);
    if (r < 0) {
        perror("ioctl KGPU_IOC_SET_GPU_BUFS");
        abort();
    }

    return 0;
}
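/* ssc() wraps a syscall-style call and aborts on failure; its definition is
   not part of this excerpt. A minimal sketch consistent with how it is used
   above (GCC statement-expression; the real KGPU helper may differ): */
#define ssc(call)                                   \
    ({ long __r = (long)(call);                     \
       if (__r < 0) { perror(#call); abort(); }     \
       __r; })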
int main(void)
{
    if (gpu_init(&gpu) < 0) {
        fprintf(stderr, "Failed to initialize GPU\n");
        return -1;
    }

    width = gpu.res.xres;
    height = gpu.res.yres;
    pitch = width * 4;

    signal(SIGINT, sigint_handler);

    gfx_test(&gpu, 0xfeed0007);
    //sleep( 3 );

    gpu_cleanup(&gpu);
    return 0;
}
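/* sigint_handler is installed above but its body is not part of this excerpt.
   A minimal sketch that tears down the GPU on Ctrl-C and re-raises the signal;
   it assumes the global "gpu" used in main(). Note that gpu_cleanup() is not
   strictly async-signal-safe, which is usually tolerated in a test harness
   like this: */
static void sigint_handler(int signum) {
    gpu_cleanup(&gpu);          /* best-effort teardown before exiting */
    signal(signum, SIG_DFL);    /* restore the default disposition */
    raise(signum);              /* terminate with the conventional status */
}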
// simengine_runmodel()
//
// executes the model for the given parameters, states and simulation time
simengine_result *simengine_runmodel(simengine_opts *opts){
    double start_time = opts->start_time;
    double stop_time = opts->stop_time;
    unsigned int num_models = opts->num_models;
    const char *outputs_dirname = opts->outputs_dirname;

    CDATAFORMAT model_states[PARALLEL_MODELS * NUM_STATES];
    unsigned int stateid;
    unsigned int modelid;

    unsigned int models_executed;
    unsigned int models_per_batch;

    double *progress;
    int progress_fd;
    int output_fd;

    int resuming = 0;
    int random_initialized = 0;

#if defined(TARGET_GPU)
    gpu_init();
#endif

    open_progress_file(outputs_dirname, &progress, &progress_fd, num_models);

    // Create the result structure; return NULL if it cannot be allocated
    simengine_result *seresult = (simengine_result*)malloc(sizeof(simengine_result));
    if(!seresult) return NULL;

    if(seint.num_states){
        seresult->final_states = (double*)malloc(num_models * seint.num_states * sizeof(double));
    }
    else{
        seresult->final_states = NULL;
    }
    seresult->final_time = (double*)malloc(num_models * sizeof(double));
    if((seint.num_states && !seresult->final_states) || !seresult->final_time){
        seresult->status = ERRMEM;
        seresult->status_message = (char*) simengine_errors[ERRMEM];
        free(seresult->final_states);
        free(seresult->final_time);
        seresult->final_states = NULL;
        seresult->final_time = NULL;
        return seresult;
    }

    init_output_buffers(outputs_dirname, &output_fd);

    // Run the parallel simulation repeatedly until all requested models have been executed
    for(models_executed = 0; models_executed < num_models; models_executed += PARALLEL_MODELS){
        models_per_batch = MIN(num_models - models_executed, PARALLEL_MODELS);

        // Copy inputs and state initial values to the internal representation
        unsigned int modelid_offset = global_modelid_offset + models_executed;
#if NUM_CONSTANT_INPUTS > 0
#if defined(TARGET_GPU)
        CDATAFORMAT *host_constant_inputs = (CDATAFORMAT *)malloc(PARALLEL_MODELS * NUM_CONSTANT_INPUTS * sizeof(CDATAFORMAT));
#else
        CDATAFORMAT *host_constant_inputs = constant_inputs;
#endif
#else
        CDATAFORMAT *host_constant_inputs = NULL;
#endif

#if NUM_SAMPLED_INPUTS > 0
#if defined(TARGET_GPU)
        sampled_input_t *host_sampled_inputs = (sampled_input_t *)malloc(STRUCT_SIZE * NUM_SAMPLED_INPUTS * sizeof(sampled_input_t));
#else
        sampled_input_t *host_sampled_inputs = sampled_inputs;
#endif
#else
        sampled_input_t *host_sampled_inputs = NULL;
#endif

        resuming = initialize_states(model_states, outputs_dirname, num_models, models_per_batch, modelid_offset);
        initialize_inputs(host_constant_inputs, host_sampled_inputs, outputs_dirname, num_models, models_per_batch, modelid_offset, start_time);

#if defined(TARGET_GPU) && NUM_CONSTANT_INPUTS > 0
        CDATAFORMAT *g_constant_inputs;
        cutilSafeCall(cudaGetSymbolAddress((void **)&g_constant_inputs, constant_inputs));
        cutilSafeCall(cudaMemcpy(g_constant_inputs, host_constant_inputs, PARALLEL_MODELS * NUM_CONSTANT_INPUTS * sizeof(CDATAFORMAT), cudaMemcpyHostToDevice));
#endif

#if defined(TARGET_GPU) && NUM_SAMPLED_INPUTS > 0
        sampled_input_t *g_sampled_inputs;
        cutilSafeCall(cudaGetSymbolAddress((void **)&g_sampled_inputs, sampled_inputs));
        cutilSafeCall(cudaMemcpy(g_sampled_inputs, host_sampled_inputs, STRUCT_SIZE * NUM_SAMPLED_INPUTS * sizeof(sampled_input_t), cudaMemcpyHostToDevice));
#endif

        // Initialize the solver properties and internal simulation memory structures
        solver_props *props = init_solver_props(start_time, stop_time, models_per_batch, model_states, models_executed + global_modelid_offset);

        // Initialize the random number generator
        if (!random_initialized || opts->seeded) {
            random_init(models_per_batch);
            random_initialized = 1;
        }

        // If no initial states were passed in
        if(!resuming){
            if(seint.num_states > 0){
                // Initialize default states in next_states
                for(modelid=0;modelid<models_per_batch;modelid++){
                    init_states(props, modelid);
                    // Copy states from next_states to model_states
                    unsigned int iterid;
                    for(iterid=0;iterid<seint.num_iterators;iterid++){
                        solver_writeback(&props[iterid], modelid);
                    }
                }
            }
        }

        // Run the model
        seresult->status = exec_loop(props, outputs_dirname, progress + models_executed, resuming);
        seresult->status_message = (char*) simengine_errors[seresult->status];

        // Copy the final time from the simulation
        for(modelid=0; modelid<models_per_batch; modelid++){
            seresult->final_time[models_executed + modelid] = props->time[modelid]; // Time from the first solver
        }

        // Free all internal simulation memory and make sure that model_states has the final state values
        free_solver_props(props, model_states);

        // Copy state values back to the state initial value structure
        for(modelid=0; modelid<models_per_batch; modelid++){
            for(stateid=0;stateid<seint.num_states;stateid++){
                seresult->final_states[AS_IDX(seint.num_states, num_models, stateid, models_executed + modelid)] = model_states[TARGET_IDX(seint.num_states, PARALLEL_MODELS, stateid, modelid)];
            }
        }
    }

    close_progress_file(progress, progress_fd, num_models);
    clean_up_output_buffers(output_fd);

    return seresult;
}
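/* The final-state copy in simengine_runmodel() relies on two indexing macros
   that are not defined in this excerpt. A minimal sketch of plausible
   definitions, assuming AS_IDX addresses a model-major layout and TARGET_IDX
   a state-major (structure-of-arrays) layout as the arguments suggest; the
   real definitions live elsewhere in the simEngine sources: */
#define AS_IDX(num_states, num_models, stateid, modelid) \
    ((modelid) * (num_states) + (stateid))
#define TARGET_IDX(num_states, num_models, stateid, modelid) \
    ((stateid) * (num_models) + (modelid))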
int main(int argc, char **argv)
{
    /* Initial information */
    fprintf(stderr, "\n");
    fprintf(stderr, "; Multi2Sim %s - A Simulation Framework for CPU-GPU Heterogeneous Computing\n", VERSION);
    fprintf(stderr, "; Please use command 'm2s --help' for a list of command-line options.\n");
    fprintf(stderr, "; Last compilation: %s %s\n", __DATE__, __TIME__);
    fprintf(stderr, "\n");

    /* Read command line */
    sim_read_command_line(&argc, argv);

    /* CPU disassembler tool */
    if (*cpu_disasm_file_name)
        ke_disasm(cpu_disasm_file_name);

    /* GPU disassembler tool */
    if (*gpu_disasm_file_name)
        gk_disasm(gpu_disasm_file_name);

    /* OpenGL disassembler tool */
    if (*opengl_disasm_file_name)
        gl_disasm(opengl_disasm_file_name, opengl_disasm_shader_index);

    /* GPU visualization tool */
    if (*gpu_visual_file_name)
        vgpu_run(gpu_visual_file_name);

    /* Memory hierarchy visualization tool */
    if (*visual_file_name)
        vmem_run(visual_file_name);

    /* Network simulation tool */
    if (*net_sim_network_name)
        net_sim(net_debug_file_name);

    /* Debug */
    debug_init();
    isa_inst_debug_category = debug_new_category(isa_inst_debug_file_name);
    isa_call_debug_category = debug_new_category(isa_call_debug_file_name);
    elf_debug_category = debug_new_category(elf_debug_file_name);
    net_debug_category = debug_new_category(net_debug_file_name);
    ld_debug_category = debug_new_category(loader_debug_file_name);
    sys_debug_category = debug_new_category(syscall_debug_file_name);
    ctx_debug_category = debug_new_category(ctx_debug_file_name);
    mem_debug_category = debug_new_category(mem_debug_file_name);
    opencl_debug_category = debug_new_category(opencl_debug_file_name);
    gpu_isa_debug_category = debug_new_category(gpu_isa_debug_file_name);
    gpu_stack_debug_category = debug_new_category(gpu_stack_debug_file_name);  /* GPU-REL */
    gpu_faults_debug_category = debug_new_category(gpu_faults_debug_file_name);  /* GPU-REL */
    gpu_pipeline_debug_category = debug_new_category(gpu_pipeline_debug_file_name);
    error_debug_category = debug_new_category(error_debug_file_name);
    esim_debug_init(esim_debug_file_name);

    /* Trace */
    trace_init(trace_file_name);
    mem_trace_category = trace_new_category();

    /* Initialization for functional simulation */
    esim_init();
    ke_init();
    net_init();

    /* Initialization for detailed simulation */
    if (cpu_sim_kind == cpu_sim_detailed)
        cpu_init();
    if (gpu_sim_kind == gpu_sim_detailed)
        gpu_init();

    /* Memory hierarchy initialization, done after we initialized CPU cores
     * and GPU compute units. */
    mem_system_init();

    /* Load programs */
    cpu_load_progs(argc, argv, ctxconfig_file_name);

    /* Simulation loop */
    if (ke->running_list_head) {
        if (cpu_sim_kind == cpu_sim_detailed)
            cpu_run();
        else
            ke_run();
    }

    /* Flush event-driven simulation */
    esim_process_all_events(0);

    /* Dump statistics summary */
    sim_stats_summary();

    /* Finalization of memory system */
    mem_system_done();

    /* Finalization of detailed CPU simulation */
    if (cpu_sim_kind == cpu_sim_detailed) {
        esim_debug_done();
        cpu_done();
    }

    /* Finalization of detailed GPU simulation */
    if (gpu_sim_kind == gpu_sim_detailed)
        gpu_done();

    /* Finalization */
    net_done();
    esim_done();
    trace_done();
    ke_done();
    debug_done();
    mhandle_done();

    /* End */
    return 0;
}
autoencoder_GPU::autoencoder_GPU() : autoencoder() {
    // initialize the OpenCL environment
    gpu_init(gpu_env, 0);

    // device buffers for the weight matrices between consecutive layers
    d_weight0 = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize0 * nLayerSize1 * sizeof(floatType), NULL, &gpu_env.status);
    d_weight1 = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize1 * nLayerSize2 * sizeof(floatType), NULL, &gpu_env.status);
    d_weight2 = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize2 * nLayerSize3 * sizeof(floatType), NULL, &gpu_env.status);
    d_weight3 = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize3 * nLayerSize4 * sizeof(floatType), NULL, &gpu_env.status);
    d_weight4 = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize4 * nLayerSize5 * sizeof(floatType), NULL, &gpu_env.status);
    d_weight5 = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize5 * nLayerSize6 * sizeof(floatType), NULL, &gpu_env.status);
    d_weight6 = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize6 * nLayerSize7 * sizeof(floatType), NULL, &gpu_env.status);
    d_weight7 = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize7 * nLayerSize8 * sizeof(floatType), NULL, &gpu_env.status);

    // device buffers for the per-layer bias vectors
    d_bias0 = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize1 * sizeof(floatType), NULL, &gpu_env.status);
    d_bias1 = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize2 * sizeof(floatType), NULL, &gpu_env.status);
    d_bias2 = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize3 * sizeof(floatType), NULL, &gpu_env.status);
    d_bias3 = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize4 * sizeof(floatType), NULL, &gpu_env.status);
    d_bias4 = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize5 * sizeof(floatType), NULL, &gpu_env.status);
    d_bias5 = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize6 * sizeof(floatType), NULL, &gpu_env.status);
    d_bias6 = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize7 * sizeof(floatType), NULL, &gpu_env.status);
    d_bias7 = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize8 * sizeof(floatType), NULL, &gpu_env.status);

    // per-layer activation and error buffers, one batch wide
    d_layer0act = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize0 * nVectorPerBatch * sizeof(floatType), NULL, &gpu_env.status);
    d_layer0err = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize0 * nVectorPerBatch * sizeof(floatType), NULL, &gpu_env.status);
    d_layer1act = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize1 * nVectorPerBatch * sizeof(floatType), NULL, &gpu_env.status);
    d_layer1err = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize1 * nVectorPerBatch * sizeof(floatType), NULL, &gpu_env.status);
    d_layer2act = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize2 * nVectorPerBatch * sizeof(floatType), NULL, &gpu_env.status);
    d_layer2err = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize2 * nVectorPerBatch * sizeof(floatType), NULL, &gpu_env.status);
    d_layer3act = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize3 * nVectorPerBatch * sizeof(floatType), NULL, &gpu_env.status);
    d_layer3err = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize3 * nVectorPerBatch * sizeof(floatType), NULL, &gpu_env.status);
    d_layer4act = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize4 * nVectorPerBatch * sizeof(floatType), NULL, &gpu_env.status);
    d_layer4err = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize4 * nVectorPerBatch * sizeof(floatType), NULL, &gpu_env.status);
    d_layer4state = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize4 * nVectorPerBatch * sizeof(floatType), NULL, &gpu_env.status);
    d_layer5act = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize5 * nVectorPerBatch * sizeof(floatType), NULL, &gpu_env.status);
    d_layer5err = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize5 * nVectorPerBatch * sizeof(floatType), NULL, &gpu_env.status);
    d_layer6act = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize6 * nVectorPerBatch * sizeof(floatType), NULL, &gpu_env.status);
    d_layer6err = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize6 * nVectorPerBatch * sizeof(floatType), NULL, &gpu_env.status);
    d_layer7act = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize7 * nVectorPerBatch * sizeof(floatType), NULL, &gpu_env.status);
    d_layer7err = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize7 * nVectorPerBatch * sizeof(floatType), NULL, &gpu_env.status);
    d_layer8act = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize8 * nVectorPerBatch * sizeof(floatType), NULL, &gpu_env.status);
    d_layer8err = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize8 * nVectorPerBatch * sizeof(floatType), NULL, &gpu_env.status);

    // weight- and bias-update (delta) buffers
    d_delta_weight0 = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize0 * nLayerSize1 * sizeof(floatType), NULL, &gpu_env.status);
    d_delta_weight1 = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize1 * nLayerSize2 * sizeof(floatType), NULL, &gpu_env.status);
    d_delta_weight2 = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize2 * nLayerSize3 * sizeof(floatType), NULL, &gpu_env.status);
    d_delta_weight3 = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize3 * nLayerSize4 * sizeof(floatType), NULL, &gpu_env.status);
    d_delta_weight4 = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize4 * nLayerSize5 * sizeof(floatType), NULL, &gpu_env.status);
    d_delta_weight5 = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize5 * nLayerSize6 * sizeof(floatType), NULL, &gpu_env.status);
    d_delta_weight6 = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize6 * nLayerSize7 * sizeof(floatType), NULL, &gpu_env.status);
    d_delta_weight7 = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize7 * nLayerSize8 * sizeof(floatType), NULL, &gpu_env.status);
    d_delta_bias0 = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize1 * sizeof(floatType), NULL, &gpu_env.status);
    d_delta_bias1 = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize2 * sizeof(floatType), NULL, &gpu_env.status);
    d_delta_bias2 = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize3 * sizeof(floatType), NULL, &gpu_env.status);
    d_delta_bias3 = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize4 * sizeof(floatType), NULL, &gpu_env.status);
    d_delta_bias4 = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize5 * sizeof(floatType), NULL, &gpu_env.status);
    d_delta_bias5 = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize6 * sizeof(floatType), NULL, &gpu_env.status);
    d_delta_bias6 = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize7 * sizeof(floatType), NULL, &gpu_env.status);
    d_delta_bias7 = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize8 * sizeof(floatType), NULL, &gpu_env.status);

    // error vector
    d_error = clCreateBuffer(gpu_env.ctx, CL_MEM_READ_WRITE, nLayerSize0 * nVectorPerBatch * sizeof(floatType), NULL, &gpu_env.status);

    // build the OpenCL kernels
    char* source = new char[KERNEL_SOURCE_LENGTH];
    loadKernelSource("../src/gpu_rbm.cl", source);
    gpu_env.prog = clCreateProgramWithSource(gpu_env.ctx, 1, (const char**)&source, NULL, &gpu_env.status);
    gpu_env.status = clBuildProgram(gpu_env.prog, 0, NULL, NULL, NULL, NULL);
    if (gpu_env.status == CL_BUILD_PROGRAM_FAILURE) {
        // Determine the size of the build log
        size_t log_size;
        clGetProgramBuildInfo(gpu_env.prog, gpu_env.device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
        // Allocate memory for the log
        char *log = (char *) malloc(log_size);
        // Get and print the log
        clGetProgramBuildInfo(gpu_env.prog, gpu_env.device, CL_PROGRAM_BUILD_LOG, log_size, log, NULL);
        printf("%s\n", log);
        free(log);
        exit(1); // a failed build is an error, not success
    }
    delete[] source; // clCreateProgramWithSource copies the source

    squareError = clCreateKernel(gpu_env.prog, "squareError", &gpu_env.status);
    sigmoid = clCreateKernel(gpu_env.prog, "sigmoid", &gpu_env.status);
    addBias = clCreateKernel(gpu_env.prog, "addBias", &gpu_env.status);
    sumBatch = clCreateKernel(gpu_env.prog, "sumBatch", &gpu_env.status);
    add = clCreateKernel(gpu_env.prog, "add", &gpu_env.status);
    getStates = clCreateKernel(gpu_env.prog, "getStates", &gpu_env.status);
    updateWeights = clCreateKernel(gpu_env.prog, "updateWeights", &gpu_env.status);
    updateBias = clCreateKernel(gpu_env.prog, "updateBias", &gpu_env.status);
    randNum = clCreateKernel(gpu_env.prog, "PRNG_threefry4x32", &gpu_env.status);
    randn = clCreateKernel(gpu_env.prog, "PRNGn_threefry4x32", &gpu_env.status);
    reset = clCreateKernel(gpu_env.prog, "reset", &gpu_env.status);
    rounding = clCreateKernel(gpu_env.prog, "rounding", &gpu_env.status);
    subtract = clCreateKernel(gpu_env.prog, "subtract", &gpu_env.status);
    deriv = clCreateKernel(gpu_env.prog, "deriv", &gpu_env.status);
    updateAE = clCreateKernel(gpu_env.prog, "update", &gpu_env.status);

    // transfer the initial weights and biases from CPU to GPU
    gpu_env.status = clEnqueueWriteBuffer(gpu_env.queue, d_weight0, CL_TRUE, 0, nLayerSize0 * nLayerSize1 * sizeof(floatType), (void*)weight0, 0, NULL, NULL);
    gpu_env.status = clEnqueueWriteBuffer(gpu_env.queue, d_weight1, CL_TRUE, 0, nLayerSize1 * nLayerSize2 * sizeof(floatType), (void*)weight1, 0, NULL, NULL);
    gpu_env.status = clEnqueueWriteBuffer(gpu_env.queue, d_weight2, CL_TRUE, 0, nLayerSize2 * nLayerSize3 * sizeof(floatType), (void*)weight2, 0, NULL, NULL);
    gpu_env.status = clEnqueueWriteBuffer(gpu_env.queue, d_weight3, CL_TRUE, 0, nLayerSize3 * nLayerSize4 * sizeof(floatType), (void*)weight3, 0, NULL, NULL);
    gpu_env.status = clEnqueueWriteBuffer(gpu_env.queue, d_weight4, CL_TRUE, 0, nLayerSize4 * nLayerSize5 * sizeof(floatType), (void*)weight4, 0, NULL, NULL);
    gpu_env.status = clEnqueueWriteBuffer(gpu_env.queue, d_weight5, CL_TRUE, 0, nLayerSize5 * nLayerSize6 * sizeof(floatType), (void*)weight5, 0, NULL, NULL);
    gpu_env.status = clEnqueueWriteBuffer(gpu_env.queue, d_weight6, CL_TRUE, 0, nLayerSize6 * nLayerSize7 * sizeof(floatType), (void*)weight6, 0, NULL, NULL);
    gpu_env.status = clEnqueueWriteBuffer(gpu_env.queue, d_weight7, CL_TRUE, 0, nLayerSize7 * nLayerSize8 * sizeof(floatType), (void*)weight7, 0, NULL, NULL);
    gpu_env.status = clEnqueueWriteBuffer(gpu_env.queue, d_bias0, CL_TRUE, 0, nLayerSize1 * sizeof(floatType), (void*)bias0, 0, NULL, NULL);
    gpu_env.status = clEnqueueWriteBuffer(gpu_env.queue, d_bias1, CL_TRUE, 0, nLayerSize2 * sizeof(floatType), (void*)bias1, 0, NULL, NULL);
    gpu_env.status = clEnqueueWriteBuffer(gpu_env.queue, d_bias2, CL_TRUE, 0, nLayerSize3 * sizeof(floatType), (void*)bias2, 0, NULL, NULL);
    gpu_env.status = clEnqueueWriteBuffer(gpu_env.queue, d_bias3, CL_TRUE, 0, nLayerSize4 * sizeof(floatType), (void*)bias3, 0, NULL, NULL);
    gpu_env.status = clEnqueueWriteBuffer(gpu_env.queue, d_bias4, CL_TRUE, 0, nLayerSize5 * sizeof(floatType), (void*)bias4, 0, NULL, NULL);
    gpu_env.status = clEnqueueWriteBuffer(gpu_env.queue, d_bias5, CL_TRUE, 0, nLayerSize6 * sizeof(floatType), (void*)bias5, 0, NULL, NULL);
    gpu_env.status = clEnqueueWriteBuffer(gpu_env.queue, d_bias6, CL_TRUE, 0, nLayerSize7 * sizeof(floatType), (void*)bias6, 0, NULL, NULL);
    gpu_env.status = clEnqueueWriteBuffer(gpu_env.queue, d_bias7, CL_TRUE, 0, nLayerSize8 * sizeof(floatType), (void*)bias7, 0, NULL, NULL);
}
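/* Every clCreateBuffer and clEnqueueWriteBuffer call above overwrites
   gpu_env.status without checking it, so a failed allocation is silently
   lost. A minimal sketch of a checked wrapper that could replace the direct
   calls; checked_buffer() is a hypothetical helper, not part of the original
   sources (assumes <cstdio> and <cstdlib>): */
static cl_mem checked_buffer(cl_context ctx, size_t bytes) {
    cl_int status;
    cl_mem buf = clCreateBuffer(ctx, CL_MEM_READ_WRITE, bytes, NULL, &status);
    if (status != CL_SUCCESS) {
        fprintf(stderr, "clCreateBuffer(%zu bytes) failed: %d\n", bytes, (int)status);
        exit(1);
    }
    return buf;
}
// usage: d_weight0 = checked_buffer(gpu_env.ctx, nLayerSize0 * nLayerSize1 * sizeof(floatType));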