/*
 * Run one complete training pass entirely on the host:
 * forward-propagate the input activations, compute the output/hidden
 * error terms, then back-propagate the weight adjustments.
 * The output-layer and hidden-layer error sums are written to *eo / *eh.
 */
void bpnn_train(BPNN *net, float *eo, float *eh)
{
  float err_out, err_hid;
  int n_in = net->input_n;
  int n_hid = net->hidden_n;
  int n_out = net->output_n;

  /* Forward pass through both layers. */
  bpnn_layerforward(net->input_units, net->hidden_units, net->input_weights, n_in, n_hid);
  bpnn_layerforward(net->hidden_units, net->output_units, net->hidden_weights, n_hid, n_out);

  /* Error terms on output units, then on hidden units. */
  bpnn_output_error(net->output_delta, net->target, net->output_units, n_out, &err_out);
  bpnn_hidden_error(net->hidden_delta, n_hid, net->output_delta, n_out, net->hidden_weights, net->hidden_units, &err_hid);
  *eo = err_out;
  *eh = err_hid;

  /* Backward pass: hidden->output weights first, then input->hidden. */
  bpnn_adjust_weights(net->output_delta, n_out, net->hidden_units, n_hid, net->hidden_weights, net->hidden_prev_weights);
  bpnn_adjust_weights(net->hidden_delta, n_hid, net->input_units, n_in, net->input_weights, net->input_prev_weights);
}
/*
 * OpenACC-annotated variant of one training pass.  The acc data region
 * stages the network arrays on the accelerator; the helper routines
 * perform the actual computation.
 *
 * Fix: out_err / hid_err are now reported back through eo / eh.  They
 * were previously computed and then discarded, leaving the caller's
 * outputs uninitialized (the plain bpnn_train() does store them).
 */
void bpnn_train_kernel(BPNN *net, float *eo, float *eh)
{
  int in, hid, out;
  float out_err, hid_err;
  float *input_units, *hidden_units, *output_units;
  float **input_weights, **hidden_weights;
  float *target, *hidden_delta, *output_delta;
  float **hidden_prev_weights, **input_prev_weights;

  in = net->input_n;
  hid = net->hidden_n;
  out = net->output_n;

  /* Plain-pointer aliases so the data clauses can spell array shapes. */
  input_units = net->input_units;
  hidden_units = net->hidden_units;
  output_units = net->output_units;
  input_weights = net->input_weights;
  hidden_weights = net->hidden_weights;
  target = net->target;
  hidden_delta = net->hidden_delta;
  output_delta = net->output_delta;
  hidden_prev_weights = net->hidden_prev_weights;
  input_prev_weights = net->input_prev_weights;

#pragma acc data copyin(input_units[0:in]) \
  create(hidden_units[0:hid], output_units[0:out]) \
  copyin(input_weights[0:in][0:hid], hidden_weights[0:hid][0:out]) \
  create(hidden_delta[0:hid], output_delta[0:out]) \
  create(input_prev_weights[0:in][0:hid], hidden_prev_weights[0:hid][0:out]) \
  copyin(target[0:out])
  {
    printf("Performing CPU computation\n");
    /* Forward pass. */
    bpnn_layerforward(input_units, hidden_units, input_weights, in, hid);
    bpnn_layerforward(hidden_units, output_units, hidden_weights, hid, out);
    /* Error terms. */
    bpnn_output_error(output_delta, target, output_units, out, &out_err);
    bpnn_hidden_error(hidden_delta, hid, output_delta, out, hidden_weights, hidden_units, &hid_err);
    /* Weight updates. */
    bpnn_adjust_weights(output_delta, out, hidden_units, hid, hidden_weights, hidden_prev_weights);
    bpnn_adjust_weights(hidden_delta, hid, input_units, in, input_weights, input_prev_weights);
  } /* end acc data */

  /* Report the error sums to the caller (was missing). */
  *eo = out_err;
  *eh = hid_err;
}
/*
 * OpenCL (Dividend-instrumented) implementation of one training pass.
 * The input->hidden forward step and the input-weight adjustment run on
 * the device; the remaining layers run on the host.
 * Returns 0 on success, -1 on any setup/OpenCL failure.
 *
 * Fixes in this revision:
 *  - num_blocks is an int (it sizes host buffers, device buffers and
 *    the NDRange; it was declared float, poisoning size arithmetic);
 *  - the kernel source buffer, both cl_kernel objects and
 *    hidden_delta_ocl are now released (previously leaked);
 *  - the computed error sums are reported back through eo/eh;
 *  - the C++-only float(...) functional cast is now a C-style cast.
 */
int bpnn_train_kernel(BPNN *net, float *eo, float *eh)
{
  int in, hid, out;
  float out_err, hid_err;

  in = net->input_n;
  hid = net->hidden_n;
  out = net->output_n;

  /* Read the kernel source into a zero-filled buffer. */
  int sourcesize = 1024*1024;
  char * source = (char *)calloc(sourcesize, sizeof(char));
  if(!source) { printf("ERROR: calloc(%d) failed\n", sourcesize); return -1; }

  char * kernel_bp1 = "bpnn_layerforward_ocl";
  char * kernel_bp2 = "bpnn_adjust_weights_ocl";
  char * tempchar = "./backprop_kernel.cl";
  FILE * fp = fopen(tempchar, "rb");
  if(!fp) { printf("ERROR: unable to open '%s'\n", tempchar); free(source); return -1; }
  fread(source + strlen(source), sourcesize, 1, fp);
  fclose(fp);

  int use_gpu = 1;
  if(initialize(use_gpu)) { free(source); return -1; }

  /* Compile the kernels. */
  cl_int err = 0;
  const char * slist[2] = { source, 0 };
  cl_program prog = clCreateProgramWithSource(context, 1, slist, NULL, &err);
  free(source);  /* the OpenCL runtime keeps its own copy of the source */
  source = NULL;
  if(err != CL_SUCCESS) { printf("ERROR: clCreateProgramWithSource() => %d\n", err); return -1; }
  err = DIVIDEND_CL_WRAP(clBuildProgram)(prog, 0, NULL, NULL, NULL, NULL);
  { // show warnings/errors
    //static char log[65536]; memset(log, 0, sizeof(log));
    //cl_device_id device_id = 0;
    //err = clGetContextInfo(context, CL_CONTEXT_DEVICES, sizeof(device_id), &device_id, NULL);
    //clGetProgramBuildInfo(prog, device_id, CL_PROGRAM_BUILD_LOG, sizeof(log)-1, log, NULL);
    //if(err || strstr(log,"warning:") || strstr(log, "error:")) printf("<<<<\n%s\n>>>>\n", log);
  }
  if(err != CL_SUCCESS) { printf("ERROR: clBuildProgram() => %d\n", err); return -1; }

  cl_kernel kernel1;
  cl_kernel kernel2;
  kernel1 = clCreateKernel(prog, kernel_bp1, &err);
  kernel2 = clCreateKernel(prog, kernel_bp2, &err);
  if(err != CL_SUCCESS) { printf("ERROR: clCreateKernel() 0 => %d\n", err); return -1; }
  clReleaseProgram(prog);

  float *input_weights_one_dim;
  float *input_weights_prev_one_dim;
  float * partial_sum;
  float sum;
  int num_blocks = in / BLOCK_SIZE;  /* was float: an integral block count */
  input_weights_one_dim = (float *) malloc((in + 1)* (hid + 1) * sizeof(float));
  input_weights_prev_one_dim = (float *) malloc((in + 1)* (hid + 1) * sizeof(float));
  partial_sum = (float *) malloc(num_blocks * WIDTH * sizeof(float));

  /* set global and local workitems (local_work is the name referenced
     by the dividend pragmas below) */
  size_t global_work[3] = { BLOCK_SIZE, BLOCK_SIZE * num_blocks, 1 };
  size_t local_work[3] = { BLOCK_SIZE, BLOCK_SIZE, 1 };

  /* Flatten the 2-D weight matrices into contiguous 1-D copies so a
     single clEnqueueWriteBuffer transfers them correctly.
     todo: fix mem allocation */
  int m = 0;
  for (int k = 0; k <= in; k++) {
    for (int j = 0; j <= hid; j++) {
      input_weights_one_dim[m] = net->input_weights[k][j];
      input_weights_prev_one_dim[m] = net->input_prev_weights[k][j];
      m++;
    }
  }

  cl_mem input_hidden_ocl;
  cl_mem input_ocl;
  cl_mem output_hidden_ocl;
  cl_mem hidden_partial_sum;
  cl_mem hidden_delta_ocl;
  cl_mem input_prev_weights_ocl;

  input_ocl = clCreateBuffer(context, CL_MEM_READ_WRITE, (in + 1) * sizeof(float), NULL, &err );
  if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer input_ocl\n"); return -1;}
  input_hidden_ocl = clCreateBuffer(context, CL_MEM_READ_WRITE, (in + 1) * (hid + 1) * sizeof(float), NULL, &err );
  if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer input_hidden_ocl\n"); return -1;}
  output_hidden_ocl = clCreateBuffer(context, CL_MEM_READ_WRITE, (hid + 1) * sizeof(float), NULL, &err );
  if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer output_hidden_ocl\n"); return -1;}
  hidden_partial_sum = clCreateBuffer(context, CL_MEM_READ_WRITE, num_blocks * WIDTH * sizeof(float), NULL, &err );
  if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer hidden_partial_sum\n"); return -1;}
  hidden_delta_ocl = clCreateBuffer(context, CL_MEM_READ_WRITE, (hid + 1) * sizeof(float), NULL, &err );
  if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer hidden_delta_ocl\n"); return -1;}
  input_prev_weights_ocl = clCreateBuffer(context, CL_MEM_READ_WRITE, (in + 1) * (hid + 1) * sizeof(float), NULL, &err );
  if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer input_prev_weights_ocl\n"); return -1;}

  printf("Performing GPU computation\n");

  //write buffers
  err = clEnqueueWriteBuffer(cmd_queue, input_ocl, 1, 0, (in + 1) * sizeof(float), net->input_units, 0, 0, 0);
  if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer input_ocl\n"); return -1; }
  err = clEnqueueWriteBuffer(cmd_queue, input_hidden_ocl, 1, 0, (in + 1) * (hid + 1) * sizeof(float), input_weights_one_dim, 0, 0, 0);
  if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer input_hidden_ocl\n"); return -1; }

  /* args 4 and 5 are per-work-group local-memory scratch areas */
  clSetKernelArg(kernel1, 0, sizeof(void *), (void*) &input_ocl);
  clSetKernelArg(kernel1, 1, sizeof(void *), (void*) &output_hidden_ocl);
  clSetKernelArg(kernel1, 2, sizeof(void *), (void*) &input_hidden_ocl);
  clSetKernelArg(kernel1, 3, sizeof(void *), (void*) &hidden_partial_sum );
  clSetKernelArg(kernel1, 4, sizeof(float) * HEIGHT, (void*)NULL );
  clSetKernelArg(kernel1, 5, sizeof(float ) * HEIGHT * WIDTH, (void*)NULL );
  clSetKernelArg(kernel1, 6, sizeof(cl_int), (void*) &in);
  clSetKernelArg(kernel1, 7, sizeof(cl_int), (void*) &hid);

#pragma dividend local_work_group_size local_work dim 2 dim1(2:64:2:32) dim2(2:64:2:32)
  //This lws will be used to profile the OpenCL kernel with id 1
  size_t _dividend_lws_local_work_k1[3];
  {
    _dividend_lws_local_work_k1[0] = getLWSValue("DIVIDEND_LWS1_D0",DIVIDEND_LWS1_D0_DEFAULT_VAL);
    _dividend_lws_local_work_k1[1] = getLWSValue("DIVIDEND_LWS1_D1",DIVIDEND_LWS1_D1_DEFAULT_VAL);
    //Dividend extension: store the kernel id as the last element
    _dividend_lws_local_work_k1[2] = 1;
  }
  err = DIVIDEND_CL_WRAP(clEnqueueNDRangeKernel)(cmd_queue, kernel1, 2, NULL, global_work, _dividend_lws_local_work_k1, 0, 0, 0);
  if(err != CL_SUCCESS) { printf("ERROR: 1 clEnqueueNDRangeKernel()=>%d failed\n", err); return -1; }
  err = clEnqueueReadBuffer(cmd_queue, hidden_partial_sum, 1, 0, num_blocks * WIDTH * sizeof(float), partial_sum, 0, 0, 0);
  if(err != CL_SUCCESS) { printf("ERROR: 1 clEnqueueReadBuffer: partial sum\n"); return -1; }

  /* Reduce the per-block partial sums on the host and finish the
     input->hidden forward step (bias weight + sigmoid). */
  for (int j = 1; j <= hid; j++) {
    sum = 0.0;
    for (int k = 0; k < num_blocks; k++) {
      sum += partial_sum[k * hid + j-1] ;
    }
    sum += net->input_weights[0][j];
    net->hidden_units[j] = (float)(1.0 / (1.0 + exp(-sum)));
  }

  /* Remaining forward/error/adjust steps run on the host. */
  bpnn_layerforward(net->hidden_units, net->output_units, net->hidden_weights, hid, out);
  bpnn_output_error(net->output_delta, net->target, net->output_units, out, &out_err);
  bpnn_hidden_error(net->hidden_delta, hid, net->output_delta, out, net->hidden_weights, net->hidden_units, &hid_err);
  *eo = out_err;  /* report error sums to the caller (was missing) */
  *eh = hid_err;
  bpnn_adjust_weights(net->output_delta, out, net->hidden_units, hid, net->hidden_weights, net->hidden_prev_weights);

  err = clEnqueueWriteBuffer(cmd_queue, hidden_delta_ocl, 1, 0, (hid + 1) * sizeof(float), net->hidden_delta, 0, 0, 0);
  if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer hidden_delta_ocl\n"); return -1; }
  err = clEnqueueWriteBuffer(cmd_queue, input_prev_weights_ocl, 1, 0, (in + 1) * (hid + 1) * sizeof(float), input_weights_prev_one_dim, 0, 0, 0);
  if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer input_prev_weights_ocl\n"); return -1; }
  err = clEnqueueWriteBuffer(cmd_queue, input_hidden_ocl, 1, 0, (in + 1) * (hid + 1) * sizeof(float), input_weights_one_dim, 0, 0, 0);
  if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer input_hidden_ocl\n"); return -1; }

  clSetKernelArg(kernel2, 0, sizeof(void *), (void*) &hidden_delta_ocl);
  clSetKernelArg(kernel2, 1, sizeof(cl_int), (void*) &hid);
  clSetKernelArg(kernel2, 2, sizeof(void *), (void*) &input_ocl);
  clSetKernelArg(kernel2, 3, sizeof(cl_int), (void*) &in);
  clSetKernelArg(kernel2, 4, sizeof(void *), (void*) &input_hidden_ocl);
  clSetKernelArg(kernel2, 5, sizeof(void *), (void*) &input_prev_weights_ocl );

#pragma dividend local_work_group_size local_work dim 2 dim1(8:32:2:32) dim2(16:32:2:32)
  //This lws will be used to profile the OpenCL kernel with id 2
  size_t _dividend_lws_local_work_k2[3];
  {
    _dividend_lws_local_work_k2[0] = getLWSValue("DIVIDEND_LWS2_D0",DIVIDEND_LWS2_D0_DEFAULT_VAL);
    _dividend_lws_local_work_k2[1] = getLWSValue("DIVIDEND_LWS2_D1",DIVIDEND_LWS2_D1_DEFAULT_VAL);
    //Dividend extension: store the kernel id as the last element
    _dividend_lws_local_work_k2[2] = 2;
  }
  err = DIVIDEND_CL_WRAP(clEnqueueNDRangeKernel)(cmd_queue, kernel2, 2, NULL, global_work, _dividend_lws_local_work_k2, 0, 0, 0);
  if(err != CL_SUCCESS) { printf("ERROR: 1 clEnqueueNDRangeKernel()=>%d failed\n", err); return -1; }
  err = clEnqueueReadBuffer(cmd_queue, input_ocl, 1, 0, (in + 1) * sizeof(float), net->input_units, 0, 0, 0);
  if(err != CL_SUCCESS) { printf("ERROR: 1 clEnqueueReadBuffer: input_ocl\n"); return -1; }
  err = clEnqueueReadBuffer(cmd_queue, input_hidden_ocl, 1, 0, (in + 1) * (hid + 1) * sizeof(float), input_weights_one_dim, 0, 0, 0);
  if(err != CL_SUCCESS) { printf("ERROR: 1 clEnqueueReadBuffer: input_hidden_ocl\n"); return -1; }
  DIVIDEND_CL_WRAP(clFinish)(cmd_queue);

  /* NOTE(review): the adjusted input weights read back into
     input_weights_one_dim are never copied back into the 2-D
     net->input_weights — confirm whether upstream callers rely on it. */

  /* Release device objects (hidden_delta_ocl and the two kernels were
     previously leaked). */
  clReleaseKernel(kernel1);
  clReleaseKernel(kernel2);
  clReleaseMemObject(input_ocl);
  clReleaseMemObject(output_hidden_ocl);
  clReleaseMemObject(input_hidden_ocl);
  clReleaseMemObject(hidden_partial_sum);
  clReleaseMemObject(hidden_delta_ocl);
  clReleaseMemObject(input_prev_weights_ocl);
  free(input_weights_prev_one_dim);
  free(partial_sum);
  free(input_weights_one_dim);
  return 0;
}
//extern "C" void bpnn_train_cuda(BPNN *net, float *eo, float *eh) { int in, hid, out; float out_err, hid_err; in = net->input_n; hid = net->hidden_n; out = net->output_n; #ifdef GPU int m = 0; float *input_hidden_cuda; float *input_cuda; //float *output_hidden_cuda; float *partial_sum; float *hidden_partial_sum; float *hidden_delta_cuda; float *input_prev_weights_cuda; float sum; float *input_weights_one_dim; float *input_weights_prev_one_dim; num_blocks = in / 16; dim3 grid;//( 1 , num_blocks, 0); grid.x = 1; grid.y = num_blocks; grid.z = 1; dim3 threads;//(16 , 16, 0); threads.x = 16; threads.y = 16; threads.z = 1; input_weights_one_dim = (float *) malloc((in + 1)* (hid + 1) * sizeof(float)); input_weights_prev_one_dim = (float *) malloc((in + 1)* (hid + 1) * sizeof(float)); partial_sum = (float *) malloc(num_blocks * WIDTH * sizeof(float)); // this preprocessing stage is added to correct the bugs of wrong memcopy using two-dimensional net->inputweights for (int k = 0; k <= in; k++) { for (int j = 0; j <= hid; j++) { input_weights_one_dim[m] = net->input_weights[k][j]; input_weights_prev_one_dim[m] = net-> input_prev_weights[k][j]; m++; } } //cudaMalloc((void**) &input_cuda, (in + 1) * sizeof(float)); //cudaMalloc((void**) &output_hidden_cuda, (hid + 1) * sizeof(float)); //cudaMalloc((void**) &input_hidden_cuda, (in + 1) * (hid + 1) * sizeof(float)); //cudaMalloc((void**) &hidden_partial_sum, num_blocks * WIDTH * sizeof(float)); input_cuda = (float*)malloc((in + 1) * sizeof(float)); //output_hidden_cuda = (float*)malloc((hid + 1) * sizeof(float)); input_hidden_cuda = (float*)malloc((in + 1) * (hid + 1) * sizeof(float)); hidden_partial_sum = (float*)malloc(num_blocks * WIDTH * sizeof(float)); #endif #ifdef CPU printf("Performing CPU computation\n"); unsigned int start = gettime(); bpnn_layerforward(net->input_units, net->hidden_units,net->input_weights, in, hid); unsigned int end = gettime(); printf("CPU time: \t%f\n", (end - start) * 1e-6); #endif #ifdef GPU 
printf("Performing GPU computation\n"); //cudaMemcpy(input_cuda, net->input_units, (in + 1) * sizeof(float), cudaMemcpyHostToDevice); //cudaMemcpy(input_hidden_cuda, input_weights_one_dim, (in + 1) * (hid + 1) * sizeof(float), cudaMemcpyHostToDevice); memcpy(input_cuda, net->input_units, (in + 1) * sizeof(float)); memcpy(input_hidden_cuda, input_weights_one_dim, (in + 1) * (hid + 1) * sizeof(float)); start = gettime(); bpnn_layerforward_CUDA(input_cuda, input_hidden_cuda, hidden_partial_sum, in, hid, grid, threads, 1, 0); end = gettime(); printf("GPU time: \t%f\n", (end - start) * 1e-6); //cudaThreadSynchronize(); //cudaError_t error = cudaGetLastError(); // if (error != cudaSuccess) { // printf("bpnn kernel error: %s\n", cudaGetErrorString(error)); // exit(EXIT_FAILURE); // } //cudaMemcpy(partial_sum, hidden_partial_sum, num_blocks * WIDTH * sizeof(float), cudaMemcpyDeviceToHost); memcpy(partial_sum, hidden_partial_sum, num_blocks * WIDTH * sizeof(float)); for (int j = 1; j <= hid; j++) { sum = 0.0; for (int k = 0; k < num_blocks; k++) { sum += partial_sum[k * hid + j-1] ; } sum += net->input_weights[0][j]; net-> hidden_units[j] = (float)(1.0 / (1.0 + exp(-sum))); } #endif bpnn_layerforward(net->hidden_units, net->output_units, net->hidden_weights, hid, out); bpnn_output_error(net->output_delta, net->target, net->output_units, out, &out_err); bpnn_hidden_error(net->hidden_delta, hid, net->output_delta, out, net->hidden_weights, net->hidden_units, &hid_err); bpnn_adjust_weights(net->output_delta, out, net->hidden_units, hid, net->hidden_weights, net->hidden_prev_weights); #ifdef CPU bpnn_adjust_weights(net->hidden_delta, hid, net->input_units, in, net->input_weights, net->input_prev_weights); #endif #ifdef GPU //cudaMalloc((void**) &hidden_delta_cuda, (hid + 1) * sizeof(float)); //cudaMalloc((void**) &input_prev_weights_cuda, (in + 1) * (hid + 1) * sizeof(float)); hidden_delta_cuda = (float*)malloc((hid + 1) * sizeof(float)); input_prev_weights_cuda = 
(float*)malloc((in + 1) * (hid + 1) * sizeof(float)); //cudaMemcpy(hidden_delta_cuda, net->hidden_delta, (hid + 1) * sizeof(float), cudaMemcpyHostToDevice); //cudaMemcpy(input_prev_weights_cuda, input_weights_prev_one_dim, (in + 1) * (hid + 1) * sizeof(float), cudaMemcpyHostToDevice); //cudaMemcpy(input_hidden_cuda, input_weights_one_dim, (in + 1) * (hid + 1) * sizeof(float), cudaMemcpyHostToDevice); memcpy(hidden_delta_cuda, net->hidden_delta, (hid + 1) * sizeof(float)); memcpy(input_prev_weights_cuda, input_weights_prev_one_dim, (in + 1) * (hid + 1) * sizeof(float)); memcpy(input_hidden_cuda, input_weights_one_dim, (in + 1) * (hid + 1) * sizeof(float)); printf("%d %d\n", hid + 1, (in + 1) * (hid + 1)); bpnn_adjust_weights_cuda(hidden_delta_cuda, hid, input_cuda, in, input_hidden_cuda, input_prev_weights_cuda, grid, threads, 1, 0); //cudaMemcpy(net->input_units, input_cuda, (in + 1) * sizeof(float), cudaMemcpyDeviceToHost); //cudaMemcpy(input_weights_one_dim, input_hidden_cuda, (in + 1) * (hid + 1) * sizeof(float), cudaMemcpyDeviceToHost); memcpy(net->input_units, input_cuda, (in + 1) * sizeof(float)); memcpy(input_weights_one_dim, input_hidden_cuda, (in + 1) * (hid + 1) * sizeof(float)); int status = 1; float EPSILON = 0.001f; FILE *pFile; pFile = fopen("cuda/gold_output.txt", "r"); if (pFile == NULL) { fputs("fopen example", pFile); } //fprintf(pFile, "net->input_units\n"); float gold_input_units_val; for (int k = 0; k < in + 1; k++) { //fprintf(pFile, "%f\n", net->input_units[k]); fscanf(pFile, "%f\n", &gold_input_units_val); if (gold_input_units_val - net->input_units[k] < -EPSILON || gold_input_units_val - net->input_units[k] > EPSILON) { printf("Mismatch at %d: gold = %f, calc = %f.\n", k, gold_input_units_val, net->input_units[k]); status = 0; break; } } float gold_weights_one_dim_val; //fprintf(pFile, "input_weights_one_dim\n"); for (int k = 0; k < (in + 1) * (hid + 1); k++) { //fprintf(pFile, "%f\n", input_weights_one_dim[k]); fscanf(pFile, "%f\n", 
&gold_weights_one_dim_val); if (gold_weights_one_dim_val - input_weights_one_dim[k] < -EPSILON || gold_weights_one_dim_val - input_weights_one_dim[k] > EPSILON) { printf("Mismatch at %d: gold = %f, calc = %f.\n", k, gold_weights_one_dim_val, input_weights_one_dim[k]); status = 0; break; } } //cudaFree(input_cuda); //cudaFree(output_hidden_cuda); //cudaFree(input_hidden_cuda); //cudaFree(hidden_partial_sum); //cudaFree(input_prev_weights_cuda); //cudaFree(hidden_delta_cuda); free(input_cuda); //free(output_hidden_cuda); free(input_hidden_cuda); free(hidden_partial_sum); free(input_prev_weights_cuda); free(hidden_delta_cuda); free(partial_sum); free(input_weights_one_dim); free(input_weights_prev_one_dim); if (status == 1) printf("PASSED.\n"); else printf("FAILED.\n"); #endif }
/*
 * CUDA Driver API implementation of one back-propagation training pass.
 * The input->hidden forward step and the input-weight adjustment are
 * launched from a precompiled cubin ("./backprop.cubin"); the remaining
 * layers run on the host.  Timing covers the whole GPU section.
 *
 * NOTE(review): eo/eh are never written by this variant (out_err and
 * hid_err are computed but discarded), and early returns on CUDA errors
 * leak the host-side malloc'd buffers — confirm whether callers rely on
 * eo/eh before changing.
 */
void bpnn_train_cuda(BPNN *net, float *eo, float *eh)
{
  int j, k;
  int in, hid, out;
  float out_err, hid_err;
  struct timeval tv;

  in = net->input_n;
  hid = net->hidden_n;
  out = net->output_n;

#ifdef GPU
  int m = 0;
  float *partial_sum;
  float sum;
  float *input_weights_one_dim;
  float *input_weights_prev_one_dim;
  /* NOTE(review): num_blocks is not declared here — presumably a
     file-scope global shared with the other ports; confirm. */
  num_blocks = in / 16;
  CUdeviceptr input_cuda;
  CUdeviceptr input_hidden_cuda;
  CUdeviceptr output_hidden_cuda;
  CUdeviceptr hidden_partial_sum;
  CUdeviceptr hidden_delta_cuda;
  CUdeviceptr input_prev_weights_cuda;
  CUcontext ctx;
  CUmodule mod;
  CUresult res;

  input_weights_one_dim = (float *) malloc((in + 1) * (hid + 1) * sizeof(float));
  input_weights_prev_one_dim = (float *) malloc((in + 1) * (hid + 1) * sizeof(float));
  partial_sum = (float *) malloc(num_blocks * WIDTH * sizeof(float));

  /* this preprocessing stage is added to correct the bugs of wrong
     memcopy using two-dimensional net->inputweights: flatten the 2-D
     weight matrices into contiguous 1-D copies for the HtoD transfers */
  for (k = 0; k <= in; k++) {
    for (j = 0; j <= hid; j++) {
      input_weights_one_dim[m] = net->input_weights[k][j];
      input_weights_prev_one_dim[m] = net->input_prev_weights[k][j];
      m++;
    }
  }

  /* call our common CUDA initialization utility function. */
  res = cuda_driver_api_init(&ctx, &mod, "./backprop.cubin");
  if (res != CUDA_SUCCESS) { printf("cuda_driver_api_init failed: res = %u\n", res); return ; }

  /* allocate device memory space */
  res = cuMemAlloc(&input_cuda, (in + 1) * sizeof(float));
  if (res != CUDA_SUCCESS) { printf("cuMemAlloc failed: res = %u\n", res); return ; }
  res = cuMemAlloc(&output_hidden_cuda, (hid + 1) * sizeof(float));
  if (res != CUDA_SUCCESS) { printf("cuMemAlloc failed: res = %u\n", res); return ; }
  res = cuMemAlloc(&input_hidden_cuda, (in + 1) * (hid + 1) * sizeof(float));
  if (res != CUDA_SUCCESS) { printf("cuMemAlloc failed: res = %u\n", res); return ; }
  res = cuMemAlloc(&hidden_partial_sum, num_blocks * WIDTH * sizeof(float));
  if (res != CUDA_SUCCESS) { printf("cuMemAlloc failed: res = %u\n", res); return ; }
  res = cuMemAlloc(&hidden_delta_cuda, (hid + 1) * sizeof(float));
  if (res != CUDA_SUCCESS) { printf("cuMemAlloc failed: res = %u\n", res); return ; }
  res = cuMemAlloc(&input_prev_weights_cuda, (in + 1) * (hid + 1) * sizeof(float));
  if (res != CUDA_SUCCESS) { printf("cuMemAlloc failed: res = %u\n", res); return ; }
#endif

#ifdef CPU
  printf("Performing CPU computation\n");
  bpnn_layerforward(net->input_units, net->hidden_units,net->input_weights, in, hid);
#endif

#ifdef GPU
  printf("Performing GPU computation\n");
  //printf("in= %d, hid = %d, numblocks = %d\n", in, hid, num_blocks);

  /* measurement start! */
  time_measure_start(&tv);

  /* stage inputs and flattened weights on the device */
  res = cuMemcpyHtoD(input_cuda, net->input_units, (in + 1) * sizeof(float));
  if (res != CUDA_SUCCESS) { printf("cuMemcpyHtoD failed: res = %u\n", res); return ; }
  res = cuMemcpyHtoD(input_hidden_cuda, input_weights_one_dim, (in + 1) * (hid + 1) * sizeof(float));
  if (res != CUDA_SUCCESS) { printf("cuMemcpyHtoD failed: res = %u\n", res); return ; }

  /* launch the input->hidden forward kernel */
  res = bpnn_layerforward_launch(mod, input_cuda, output_hidden_cuda, input_hidden_cuda, hidden_partial_sum, in, hid);
  if (res != CUDA_SUCCESS) { printf("bpnn_layerforward failed: res = %u\n", res); return ; }
  cuCtxSynchronize();

#if 0
  cudaError_t error = cudaGetLastError();
  if (error != cudaSuccess) {
    printf("bpnn kernel error: %s\n", cudaGetErrorString(error));
    exit(EXIT_FAILURE);
  }
#endif

  res = cuMemcpyDtoH(partial_sum, hidden_partial_sum, num_blocks * WIDTH * sizeof(float));
  if (res != CUDA_SUCCESS) { printf("cuMemcpyDtoH(layerforward) failed: res = %u\n", res); return ; }

  /* host-side reduction of the per-block partial sums, plus the bias
     weight and sigmoid, completes the input->hidden forward step */
  for (j = 1; j <= hid; j++) {
    sum = 0.0;
    for (k = 0; k < num_blocks; k++) {
      sum += partial_sum[k * hid + j-1] ;
    }
    sum += net->input_weights[0][j];
    net->hidden_units[j] = (float) (1.0 / (1.0 + exp(-sum)));
  }
#endif

  /* remaining forward/error/adjust steps run on the host */
  bpnn_layerforward(net->hidden_units, net->output_units, net->hidden_weights, hid, out);
  bpnn_output_error(net->output_delta, net->target, net->output_units, out, &out_err);
  bpnn_hidden_error(net->hidden_delta, hid, net->output_delta, out, net->hidden_weights, net->hidden_units, &hid_err);
  bpnn_adjust_weights(net->output_delta, out, net->hidden_units, hid, net->hidden_weights, net->hidden_prev_weights);

#ifdef CPU
  bpnn_adjust_weights(net->hidden_delta, hid, net->input_units, in, net->input_weights, net->input_prev_weights);
#endif

#ifdef GPU
  /* stage the hidden deltas and weights, then launch the input-weight
     adjustment kernel */
  res = cuMemcpyHtoD(hidden_delta_cuda, net->hidden_delta, (hid + 1) * sizeof(float));
  if (res != CUDA_SUCCESS) { printf("cuMemcpyHtoD failed: res = %u\n", res); return ; }
  res = cuMemcpyHtoD(input_prev_weights_cuda, input_weights_prev_one_dim, (in + 1) * (hid + 1) * sizeof(float));
  if (res != CUDA_SUCCESS) { printf("cuMemcpyHtoD failed: res = %u\n", res); return ; }
  res = cuMemcpyHtoD(input_hidden_cuda, input_weights_one_dim, (in + 1) * (hid + 1) * sizeof(float));
  if (res != CUDA_SUCCESS) { printf("cuMemcpyHtoD failed: res = %u\n", res); return ; }
  res = bpnn_adjust_weights_launch(mod, hidden_delta_cuda, hid, input_cuda, in, input_hidden_cuda, input_prev_weights_cuda);
  if (res != CUDA_SUCCESS) { printf("bpnn_adjust_weights failed: res = %u\n", res); return ; }

  /* read results back to the host */
  res = cuMemcpyDtoH(net->input_units, input_cuda, (in + 1) * sizeof(float));
  if (res != CUDA_SUCCESS) { printf("cuMemcpyDtoH(adjust_weights) failed: res = %u\n", res); return ; }
  res = cuMemcpyDtoH(input_weights_one_dim, input_hidden_cuda, (in + 1) * (hid + 1) * sizeof(float));
  if (res != CUDA_SUCCESS) { printf("cuMemcpyDtoH(adjust_weights) failed: res = %u\n", res); return ; }

  cuMemFree(input_cuda);
  cuMemFree(output_hidden_cuda);
  cuMemFree(input_hidden_cuda);
  cuMemFree(hidden_partial_sum);
  cuMemFree(input_prev_weights_cuda);
  cuMemFree(hidden_delta_cuda);

  /* measurement end! will print out the time. */
  time_measure_end(&tv);

  res = cuda_driver_api_exit(ctx, mod);
  if (res != CUDA_SUCCESS) { printf("cuda_driver_api_exit faild: res = %u\n", res); return ; }

  free(partial_sum);
  free(input_weights_one_dim);
  free(input_weights_prev_one_dim);
#endif
}
/*
 * OpenCL implementation of one training pass that can target CPU/GPU
 * (kernels compiled from source) or FPGA (kernels loaded from a
 * prebuilt .xclbin binary, selected by use_device).  The input->hidden
 * forward step and the input-weight adjustment run on the device; the
 * remaining layers run on the host.  Returns 0 on success, -1 on error.
 *
 * Fixes in this revision:
 *  - the function now returns 0 on success (it previously fell off the
 *    end of a non-void function — undefined behavior);
 *  - num_blocks is an int (was float) since it sizes buffers/NDRange;
 *  - source, krnl_bin, both cl_kernel objects and hidden_delta_ocl are
 *    now released (previously leaked);
 *  - the computed error sums are reported back through eo/eh;
 *  - the C++-only float(...) functional cast is now a C-style cast.
 */
int bpnn_train_kernel(BPNN *net, float *eo, float *eh)
{
  int in, hid, out;
  float out_err, hid_err;

  in = net->input_n;
  hid = net->hidden_n;
  out = net->output_n;

  //int use_device = 0; // use CPU as device
  int use_device = 2; // use GPU as device
  //int use_device = 2; // use FPGA as device
  if(initialize(use_device)) return -1;

  int sourcesize = 1024*1024;
  char * source = (char *)calloc(sourcesize, sizeof(char));
  if(!source) { printf("ERROR: calloc(%d) failed\n", sourcesize); return -1; }

  // read the kernel core source
  char * kernel_bp1 = "bpnn_layerforward_ocl";
  char * kernel_bp2 = "bpnn_adjust_weights_ocl";
  char * tempchar = "./backprop_kernel.cl";
  char * krnl_file = "./binary/backprop_kernel_default.xclbin";

  cl_int err = 0;
  cl_program prog;
  // create program from source
  if (use_device < 2 ) {
    FILE * fp = fopen(tempchar, "rb");
    if(!fp) { printf("ERROR: unable to open '%s'\n", tempchar); free(source); return -1; }
    fread(source + strlen(source), sourcesize, 1, fp);
    fclose(fp);
    // compile kernel
    err = 0;
    const char * slist[2] = { source, 0 };
    prog = clCreateProgramWithSource(context, 1, slist, NULL, &err);
    if(err != CL_SUCCESS) { printf("ERROR: clCreateProgramWithSource() => %d\n", err); free(source); return -1; }
  }
  // create program from binary
  else {
    char *krnl_bin;
    /* NOTE(review): a negative load_file_to_memory() error code would
       wrap to a huge size_t here — confirm its failure convention. */
    const size_t krnl_size = load_file_to_memory(krnl_file, &krnl_bin);
    err = 0;
    prog = clCreateProgramWithBinary(context, 1, &device_list[0], &krnl_size, (const unsigned char**) &krnl_bin, NULL, &err);
    if ((!prog) || (err!=CL_SUCCESS)) {
      printf("Error: Failed to create compute program from binary %d!\n", err);
      printf("Test failed\n");
      exit(EXIT_FAILURE);
    }
    free(krnl_bin);  /* the runtime keeps its own copy (was leaked) */
  }
  free(source);  /* no longer needed once the program object exists */
  source = NULL;

  err = clBuildProgram(prog, 0, NULL, NULL, NULL, NULL);
  { // show warnings/errors
    //static char log[65536]; memset(log, 0, sizeof(log));
    //cl_device_id device_id = 0;
    //err = clGetContextInfo(context, CL_CONTEXT_DEVICES, sizeof(device_id), &device_id, NULL);
    //clGetProgramBuildInfo(prog, device_id, CL_PROGRAM_BUILD_LOG, sizeof(log)-1, log, NULL);
    //if(err || strstr(log,"warning:") || strstr(log, "error:")) printf("<<<<\n%s\n>>>>\n", log);
  }
  if(err != CL_SUCCESS) { printf("ERROR: clBuildProgram() => %d\n", err); return -1; }

  cl_kernel kernel1;
  cl_kernel kernel2;
  kernel1 = clCreateKernel(prog, kernel_bp1, &err);
  if(err != CL_SUCCESS) { printf("ERROR: clCreateKernel(kernel1) 0 => %d\n", err); return -1; }
  kernel2 = clCreateKernel(prog, kernel_bp2, &err);
  if(err != CL_SUCCESS) { printf("ERROR: clCreateKernel(kernel2) 0 => %d\n", err); return -1; }
  /* clReleaseProgram(prog); */

  float *input_weights_one_dim;
  float *input_weights_prev_one_dim;
  float * partial_sum;
  float sum;
  int num_blocks = in / BLOCK_SIZE;  /* was float: an integral block count */
  input_weights_one_dim = (float *) malloc((in + 1)* (hid + 1) * sizeof(float));
  input_weights_prev_one_dim = (float *) malloc((in + 1)* (hid + 1) * sizeof(float));
  partial_sum = (float *) malloc(num_blocks * WIDTH * sizeof(float));

  // set global and local workitems
  size_t global_work[3] = { BLOCK_SIZE, BLOCK_SIZE * num_blocks, 1 };
  size_t local_work[3] = { BLOCK_SIZE, BLOCK_SIZE, 1 };

  /* Flatten the 2-D weight matrices into contiguous 1-D copies so a
     single clEnqueueWriteBuffer transfers them correctly.
     todo: fix mem allocation */
  int m = 0;
  for (int k = 0; k <= in; k++) {
    for (int j = 0; j <= hid; j++) {
      input_weights_one_dim[m] = net->input_weights[k][j];
      input_weights_prev_one_dim[m] = net->input_prev_weights[k][j];
      m++;
    }
  }

  cl_mem input_hidden_ocl;
  cl_mem input_ocl;
  cl_mem output_hidden_ocl;
  cl_mem hidden_partial_sum;
  cl_mem hidden_delta_ocl;
  cl_mem input_prev_weights_ocl;

  input_ocl = clCreateBuffer(context, CL_MEM_READ_WRITE, (in + 1) * sizeof(float), NULL, &err );
  if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer input_ocl\n"); return -1;}
  input_hidden_ocl = clCreateBuffer(context, CL_MEM_READ_WRITE, (in + 1) * (hid + 1) * sizeof(float), NULL, &err );
  if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer input_hidden_ocl\n"); return -1;}
  output_hidden_ocl = clCreateBuffer(context, CL_MEM_READ_WRITE, (hid + 1) * sizeof(float), NULL, &err );
  if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer output_hidden_ocl\n"); return -1;}
  hidden_partial_sum = clCreateBuffer(context, CL_MEM_READ_WRITE, num_blocks * WIDTH * sizeof(float), NULL, &err );
  if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer hidden_partial_sum\n"); return -1;}
  hidden_delta_ocl = clCreateBuffer(context, CL_MEM_READ_WRITE, (hid + 1) * sizeof(float), NULL, &err );
  if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer hidden_delta_ocl\n"); return -1;}
  input_prev_weights_ocl = clCreateBuffer(context, CL_MEM_READ_WRITE, (in + 1) * (hid + 1) * sizeof(float), NULL, &err );
  if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer input_prev_weights_ocl\n"); return -1;}

  printf("Performing GPU computation\n");

  //write buffers
  err = clEnqueueWriteBuffer(cmd_queue, input_ocl, 1, 0, (in + 1) * sizeof(float), net->input_units, 0, 0, 0);
  if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer input_ocl\n"); return -1; }
  err = clEnqueueWriteBuffer(cmd_queue, input_hidden_ocl, 1, 0, (in + 1) * (hid + 1) * sizeof(float), input_weights_one_dim, 0, 0, 0);
  if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer input_hidden_ocl\n"); return -1; }

  /* args 4 and 5 are per-work-group local-memory scratch areas */
  clSetKernelArg(kernel1, 0, sizeof(void *), (void*) &input_ocl);
  clSetKernelArg(kernel1, 1, sizeof(void *), (void*) &output_hidden_ocl);
  clSetKernelArg(kernel1, 2, sizeof(void *), (void*) &input_hidden_ocl);
  clSetKernelArg(kernel1, 3, sizeof(void *), (void*) &hidden_partial_sum );
  clSetKernelArg(kernel1, 4, sizeof(float) * HEIGHT, (void*)NULL );
  clSetKernelArg(kernel1, 5, sizeof(float ) * HEIGHT * WIDTH, (void*)NULL );
  clSetKernelArg(kernel1, 6, sizeof(cl_int), (void*) &in);
  clSetKernelArg(kernel1, 7, sizeof(cl_int), (void*) &hid);

  err = clEnqueueNDRangeKernel(cmd_queue, kernel1, 3, NULL, global_work, local_work, 0, NULL, 0);
  if(err == CL_INVALID_KERNEL) {printf("Error is invalid kernel\n");}
  if(err != CL_SUCCESS) { printf("ERROR: 1 kernel1 clEnqueueNDRangeKernel()=>%d failed\n", err); return -1; }
  err = clEnqueueReadBuffer(cmd_queue, hidden_partial_sum, 1, 0, num_blocks * WIDTH * sizeof(float), partial_sum, 0, 0, 0);
  if(err != CL_SUCCESS) { printf("ERROR: 1 clEnqueueReadBuffer: partial sum\n"); return -1; }

  /* Reduce the per-block partial sums on the host and finish the
     input->hidden forward step (bias weight + sigmoid). */
  for (int j = 1; j <= hid; j++) {
    sum = 0.0;
    for (int k = 0; k < num_blocks; k++) {
      sum += partial_sum[k * hid + j-1] ;
    }
    sum += net->input_weights[0][j];
    net->hidden_units[j] = (float)(1.0 / (1.0 + exp(-sum)));
  }

  /* Remaining forward/error/adjust steps run on the host. */
  bpnn_layerforward(net->hidden_units, net->output_units, net->hidden_weights, hid, out);
  bpnn_output_error(net->output_delta, net->target, net->output_units, out, &out_err);
  bpnn_hidden_error(net->hidden_delta, hid, net->output_delta, out, net->hidden_weights, net->hidden_units, &hid_err);
  *eo = out_err;  /* report error sums to the caller (was missing) */
  *eh = hid_err;
  bpnn_adjust_weights(net->output_delta, out, net->hidden_units, hid, net->hidden_weights, net->hidden_prev_weights);

  err = clEnqueueWriteBuffer(cmd_queue, hidden_delta_ocl, 1, 0, (hid + 1) * sizeof(float), net->hidden_delta, 0, 0, 0);
  if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer hidden_delta_ocl\n"); return -1; }
  err = clEnqueueWriteBuffer(cmd_queue, input_prev_weights_ocl, 1, 0, (in + 1) * (hid + 1) * sizeof(float), input_weights_prev_one_dim, 0, 0, 0);
  if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer input_prev_weights_ocl\n"); return -1; }
  err = clEnqueueWriteBuffer(cmd_queue, input_hidden_ocl, 1, 0, (in + 1) * (hid + 1) * sizeof(float), input_weights_one_dim, 0, 0, 0);
  if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer input_hidden_ocl\n"); return -1; }

  clSetKernelArg(kernel2, 0, sizeof(void *), (void*) &hidden_delta_ocl);
  clSetKernelArg(kernel2, 1, sizeof(cl_int), (void*) &hid);
  clSetKernelArg(kernel2, 2, sizeof(void *), (void*) &input_ocl);
  clSetKernelArg(kernel2, 3, sizeof(cl_int), (void*) &in);
  clSetKernelArg(kernel2, 4, sizeof(void *), (void*) &input_hidden_ocl);
  clSetKernelArg(kernel2, 5, sizeof(void *), (void*) &input_prev_weights_ocl );

  err = clEnqueueNDRangeKernel(cmd_queue, kernel2, 2, NULL, global_work, local_work, 0, 0, 0);
  if(err != CL_SUCCESS) { printf("ERROR: 1 clEnqueueNDRangeKernel()=>%d failed\n", err); return -1; }
  err = clEnqueueReadBuffer(cmd_queue, input_ocl, 1, 0, (in + 1) * sizeof(float), net->input_units, 0, 0, 0);
  if(err != CL_SUCCESS) { printf("ERROR: 1 clEnqueueReadBuffer: input_ocl\n"); return -1; }
  err = clEnqueueReadBuffer(cmd_queue, input_hidden_ocl, 1, 0, (in + 1) * (hid + 1) * sizeof(float), input_weights_one_dim, 0, 0, 0);
  if(err != CL_SUCCESS) { printf("ERROR: 1 clEnqueueReadBuffer: input_hidden_ocl\n"); return -1; }

  /* Release device objects (hidden_delta_ocl and the two kernels were
     previously leaked). */
  clReleaseKernel(kernel1);
  clReleaseKernel(kernel2);
  clReleaseMemObject(input_ocl);
  clReleaseMemObject(output_hidden_ocl);
  clReleaseMemObject(input_hidden_ocl);
  clReleaseMemObject(hidden_partial_sum);
  clReleaseMemObject(hidden_delta_ocl);
  clReleaseMemObject(input_prev_weights_ocl);
  free(input_weights_prev_one_dim);
  free(partial_sum);
  free(input_weights_one_dim);
  return 0;  /* was missing: non-void function fell off the end (UB) */
}