Exemple #1
0
void bpnn_train(BPNN *net, float *eo, float *eh)
{
  int in, hid, out;
  float out_err, hid_err;

  in = net->input_n;
  hid = net->hidden_n;
  out = net->output_n;

  /*** Feed forward input activations. ***/
  bpnn_layerforward(net->input_units, net->hidden_units,
      net->input_weights, in, hid);
  bpnn_layerforward(net->hidden_units, net->output_units,
      net->hidden_weights, hid, out);

  /*** Compute error on output and hidden units. ***/
  bpnn_output_error(net->output_delta, net->target, net->output_units,
      out, &out_err);
  bpnn_hidden_error(net->hidden_delta, hid, net->output_delta, out,
      net->hidden_weights, net->hidden_units, &hid_err);
  *eo = out_err;
  *eh = hid_err;

  /*** Adjust input and hidden weights. ***/
  bpnn_adjust_weights(net->output_delta, out, net->hidden_units, hid,
      net->hidden_weights, net->hidden_prev_weights);
  bpnn_adjust_weights(net->hidden_delta, hid, net->input_units, in,
      net->input_weights, net->input_prev_weights);

}
Exemple #2
0
void bpnn_train_kernel(BPNN *net, float *eo, float *eh)
{
    int in, hid, out;
    float out_err, hid_err;
    float *input_units, *hidden_units, *output_units;
    float **input_weights, **hidden_weights;
    float *target, *hidden_delta, *output_delta;
    float **hidden_prev_weights, **input_prev_weights;

    in = net->input_n;
    hid = net->hidden_n;
    out = net->output_n;

    input_units = net->input_units;
    hidden_units = net->hidden_units;
    output_units = net->output_units;

    input_weights = net->input_weights;
    hidden_weights = net->hidden_weights;

    target = net->target;
    hidden_delta = net->hidden_delta;
    output_delta = net->output_delta;

    hidden_prev_weights = net->hidden_prev_weights;
    input_prev_weights = net->input_prev_weights;

#pragma acc data copyin(input_units[0:in]) \
  create(hidden_units[0:hid], output_units[0:out]) \
  copyin(input_weights[0:in][0:hid], hidden_weights[0:hid][0:out]) \
  create(hidden_delta[0:hid], output_delta[0:out]) \
  create(input_prev_weights[0:in][0:hid], hidden_prev_weights[0:hid][0:out]) \
  copyin(target[0:out])
    {
        printf("Performing CPU computation\n");
        bpnn_layerforward(input_units, hidden_units, input_weights, in, hid);
        bpnn_layerforward(hidden_units, output_units, hidden_weights, hid, out);
        bpnn_output_error(output_delta, target, output_units, out, &out_err);
        bpnn_hidden_error(hidden_delta, hid, output_delta, out, hidden_weights, hidden_units, &hid_err);
        bpnn_adjust_weights(output_delta, out, hidden_units, hid, hidden_weights, hidden_prev_weights);
        bpnn_adjust_weights(hidden_delta, hid, input_units, in, input_weights, input_prev_weights);
    } /* end acc data */

}
Exemple #3
0
int bpnn_train_kernel(BPNN *net, float *eo, float *eh)
{
	int in, hid, out;
	float out_err, hid_err;

	in = net->input_n;
	hid = net->hidden_n;
	out = net->output_n;   

	int sourcesize = 1024*1024;
	char * source = (char *)calloc(sourcesize, sizeof(char)); 
	if(!source) { printf("ERROR: calloc(%d) failed\n", sourcesize); return -1; }

	// read the kernel core source
	char * kernel_bp1  = "bpnn_layerforward_ocl";
	char * kernel_bp2  = "bpnn_adjust_weights_ocl";
	char * tempchar = "./backprop_kernel.cl";
	FILE * fp = fopen(tempchar, "rb"); 
	if(!fp) { printf("ERROR: unable to open '%s'\n", tempchar); return -1; }
	fread(source + strlen(source), sourcesize, 1, fp);
	fclose(fp);

	int use_gpu = 1;
	if(initialize(use_gpu)) return -1;

	// compile kernel
	cl_int err = 0;
	const char * slist[2] = { source, 0 };
	cl_program prog = clCreateProgramWithSource(context, 1, slist, NULL, &err);
	if(err != CL_SUCCESS) { printf("ERROR: clCreateProgramWithSource() => %d\n", err); return -1; }
	err = DIVIDEND_CL_WRAP(clBuildProgram)(prog, 0, NULL, NULL, NULL, NULL);
	{ // show warnings/errors
		//static char log[65536]; memset(log, 0, sizeof(log));
		//cl_device_id device_id = 0;
		//err = clGetContextInfo(context, CL_CONTEXT_DEVICES, sizeof(device_id), &device_id, NULL);
		//clGetProgramBuildInfo(prog, device_id, CL_PROGRAM_BUILD_LOG, sizeof(log)-1, log, NULL);
		//if(err || strstr(log,"warning:") || strstr(log, "error:")) printf("<<<<\n%s\n>>>>\n", log);
	}
	if(err != CL_SUCCESS) { printf("ERROR: clBuildProgram() => %d\n", err); return -1; }

	cl_kernel kernel1;
	cl_kernel kernel2;
	kernel1 = clCreateKernel(prog, kernel_bp1, &err);  
	kernel2 = clCreateKernel(prog, kernel_bp2, &err);  
	if(err != CL_SUCCESS) { printf("ERROR: clCreateKernel() 0 => %d\n", err); return -1; }
	clReleaseProgram(prog);

	float *input_weights_one_dim;
	float *input_weights_prev_one_dim;
	float * partial_sum;
	float sum;
	float num_blocks = in / BLOCK_SIZE;

	input_weights_one_dim = (float *) malloc((in + 1)* (hid + 1) * sizeof(float));
	input_weights_prev_one_dim = (float *) malloc((in + 1)* (hid + 1) * sizeof(float));
	partial_sum = (float *) malloc(num_blocks * WIDTH * sizeof(float));

	// set global and local workitems
	size_t global_work[3] = { BLOCK_SIZE, BLOCK_SIZE * num_blocks, 1 }; 
	size_t local_work[3] = { BLOCK_SIZE, BLOCK_SIZE, 1 };

	// this preprocessing stage is temporarily added to correct the bug of wrong memcopy using two-dimensional net->inputweights
	// todo: fix mem allocation
	int m = 0;
	for (int k = 0; k <= in; k++) {	
		for (int j = 0; j <= hid; j++) {
			input_weights_one_dim[m] = net->input_weights[k][j];
			input_weights_prev_one_dim[m] = net-> input_prev_weights[k][j];
			m++;
		}
	}

	cl_mem input_hidden_ocl;
	cl_mem input_ocl;
	cl_mem output_hidden_ocl;
	cl_mem hidden_partial_sum;
	cl_mem hidden_delta_ocl;
	cl_mem input_prev_weights_ocl;

	input_ocl = clCreateBuffer(context, CL_MEM_READ_WRITE, (in + 1) * sizeof(float), NULL, &err );
	if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer input_ocl\n"); return -1;}
	input_hidden_ocl = clCreateBuffer(context, CL_MEM_READ_WRITE, (in + 1) * (hid + 1) * sizeof(float), NULL, &err );
	if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer input_hidden_ocl\n"); return -1;}
	output_hidden_ocl = clCreateBuffer(context, CL_MEM_READ_WRITE, (hid + 1) * sizeof(float), NULL, &err );
	if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer output_hidden_ocl\n"); return -1;}
	hidden_partial_sum = clCreateBuffer(context, CL_MEM_READ_WRITE, num_blocks * WIDTH * sizeof(float), NULL, &err );
	if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer hidden_partial_sum\n"); return -1;}
	hidden_delta_ocl = clCreateBuffer(context, CL_MEM_READ_WRITE, (hid + 1) * sizeof(float), NULL, &err );
	if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer hidden_delta_ocl\n"); return -1;}
	input_prev_weights_ocl = clCreateBuffer(context, CL_MEM_READ_WRITE, (in + 1) * (hid + 1) * sizeof(float), NULL, &err );
	if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer input_prev_weights_ocl\n"); return -1;}

	printf("Performing GPU computation\n");

	//write buffers
	err = clEnqueueWriteBuffer(cmd_queue, input_ocl, 1, 0, (in + 1) * sizeof(float), net->input_units, 0, 0, 0);
	if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer input_ocl\n"); return -1; }
	err = clEnqueueWriteBuffer(cmd_queue, input_hidden_ocl, 1, 0, (in + 1) * (hid + 1) * sizeof(float), input_weights_one_dim, 0, 0, 0);
	if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer input_hidden_ocl\n"); return -1; }

	clSetKernelArg(kernel1, 0, sizeof(void *), (void*) &input_ocl);
	clSetKernelArg(kernel1, 1, sizeof(void *), (void*) &output_hidden_ocl);
	clSetKernelArg(kernel1, 2, sizeof(void *), (void*) &input_hidden_ocl);
	clSetKernelArg(kernel1, 3, sizeof(void *), (void*) &hidden_partial_sum );
	clSetKernelArg(kernel1, 4, sizeof(float) *  HEIGHT, (void*)NULL );
	clSetKernelArg(kernel1, 5, sizeof(float ) *  HEIGHT * WIDTH, (void*)NULL );
	clSetKernelArg(kernel1, 6, sizeof(cl_int), (void*) &in);
	clSetKernelArg(kernel1, 7, sizeof(cl_int), (void*) &hid);

#pragma dividend local_work_group_size local_work  dim 2 dim1(2:64:2:32) dim2(2:64:2:32)
	//This lws will be used to profile the OpenCL kernel with id 1
		size_t _dividend_lws_local_work_k1[3];
	{
	_dividend_lws_local_work_k1[0] = getLWSValue("DIVIDEND_LWS1_D0",DIVIDEND_LWS1_D0_DEFAULT_VAL);
	_dividend_lws_local_work_k1[1] = getLWSValue("DIVIDEND_LWS1_D1",DIVIDEND_LWS1_D1_DEFAULT_VAL);
	//Dividend extension: store the kernel id as the last element
	_dividend_lws_local_work_k1[2] = 1;
	}
		err = DIVIDEND_CL_WRAP(clEnqueueNDRangeKernel)(cmd_queue, kernel1, 2, NULL, global_work, _dividend_lws_local_work_k1, 0, 0, 0);
	if(err != CL_SUCCESS) { printf("ERROR: 1  clEnqueueNDRangeKernel()=>%d failed\n", err); return -1; }	

	err = clEnqueueReadBuffer(cmd_queue, hidden_partial_sum, 1, 0, num_blocks * WIDTH * sizeof(float), partial_sum, 0, 0, 0);
	if(err != CL_SUCCESS) { printf("ERROR: 1  clEnqueueReadBuffer: partial sum\n"); return -1; }	

	for (int j = 1; j <= hid; j++) {
		sum = 0.0;
		for (int k = 0; k < num_blocks; k++) {	
			sum += partial_sum[k * hid + j-1] ;
		}
		sum += net->input_weights[0][j];
		net-> hidden_units[j] = float(1.0 / (1.0 + exp(-sum)));
	}


	bpnn_layerforward(net->hidden_units, net->output_units, net->hidden_weights, hid, out);
	bpnn_output_error(net->output_delta, net->target, net->output_units, out, &out_err);
	bpnn_hidden_error(net->hidden_delta, hid, net->output_delta, out, net->hidden_weights, net->hidden_units, &hid_err);  
	bpnn_adjust_weights(net->output_delta, out, net->hidden_units, hid, net->hidden_weights, net->hidden_prev_weights);

	err = clEnqueueWriteBuffer(cmd_queue, hidden_delta_ocl,       1, 0, (hid + 1) * sizeof(float), net->hidden_delta, 0, 0, 0);
	if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer hidden_delta_ocl\n"); return -1; }
	err = clEnqueueWriteBuffer(cmd_queue, input_prev_weights_ocl, 1, 0, (in + 1) * (hid + 1) * sizeof(float), input_weights_prev_one_dim, 0, 0, 0);
	if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer input_prev_weights_ocl\n"); return -1; }
	err = clEnqueueWriteBuffer(cmd_queue, input_hidden_ocl,       1, 0, (in + 1) * (hid + 1) * sizeof(float), input_weights_one_dim, 0, 0, 0);
	if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer input_hidden_ocl\n"); return -1; }

	clSetKernelArg(kernel2, 0, sizeof(void *), (void*) &hidden_delta_ocl);
	clSetKernelArg(kernel2, 1, sizeof(cl_int), (void*) &hid);
	clSetKernelArg(kernel2, 2, sizeof(void *), (void*) &input_ocl);
	clSetKernelArg(kernel2, 3, sizeof(cl_int), (void*) &in);
	clSetKernelArg(kernel2, 4, sizeof(void *), (void*) &input_hidden_ocl);
	clSetKernelArg(kernel2, 5, sizeof(void *), (void*) &input_prev_weights_ocl );

#pragma dividend local_work_group_size local_work  dim 2 dim1(8:32:2:32) dim2(16:32:2:32)
	//This lws will be used to profile the OpenCL kernel with id 2
		size_t _dividend_lws_local_work_k2[3];
	{
	_dividend_lws_local_work_k2[0] = getLWSValue("DIVIDEND_LWS2_D0",DIVIDEND_LWS2_D0_DEFAULT_VAL);
	_dividend_lws_local_work_k2[1] = getLWSValue("DIVIDEND_LWS2_D1",DIVIDEND_LWS2_D1_DEFAULT_VAL);
	//Dividend extension: store the kernel id as the last element
	_dividend_lws_local_work_k2[2] = 2;
	}
		err = DIVIDEND_CL_WRAP(clEnqueueNDRangeKernel)(cmd_queue, kernel2, 2, NULL, global_work, _dividend_lws_local_work_k2, 0, 0, 0);
	if(err != CL_SUCCESS) { printf("ERROR: 1  clEnqueueNDRangeKernel()=>%d failed\n", err); return -1; }	

	err = clEnqueueReadBuffer(cmd_queue, input_ocl, 1, 0, (in + 1) * sizeof(float), net->input_units, 0, 0, 0);
	if(err != CL_SUCCESS) { printf("ERROR: 1  clEnqueueReadBuffer: input_ocl\n"); return -1; }	
	err = clEnqueueReadBuffer(cmd_queue, input_hidden_ocl, 1, 0, (in + 1) * (hid + 1) * sizeof(float), input_weights_one_dim, 0, 0, 0);
	if(err != CL_SUCCESS) { printf("ERROR: 1  clEnqueueReadBuffer: input_hidden_ocl\n"); return -1; }	

	DIVIDEND_CL_WRAP(clFinish)(cmd_queue);

	clReleaseMemObject(input_ocl);
	clReleaseMemObject(output_hidden_ocl);
	clReleaseMemObject(input_hidden_ocl);
	clReleaseMemObject(hidden_partial_sum);
	clReleaseMemObject(input_prev_weights_ocl);

	free(input_weights_prev_one_dim);
	free(partial_sum);
	free(input_weights_one_dim);

	return 0;

}
//extern "C"
void bpnn_train_cuda(BPNN *net, float *eo, float *eh)
{
  int in, hid, out;
  float out_err, hid_err;

  in = net->input_n;
  hid = net->hidden_n;
  out = net->output_n;   

#ifdef GPU  
  int m = 0;
  float *input_hidden_cuda;
  float *input_cuda;
  //float *output_hidden_cuda;
  float *partial_sum;
  float *hidden_partial_sum;
  float *hidden_delta_cuda;
  float *input_prev_weights_cuda;
  float sum;
  float *input_weights_one_dim;
  float *input_weights_prev_one_dim;
  num_blocks = in / 16;  
  dim3  grid;//( 1 , num_blocks, 0);
  grid.x = 1;
  grid.y = num_blocks;
  grid.z = 1;
  dim3  threads;//(16 , 16, 0);
  threads.x = 16;
  threads.y = 16;
  threads.z = 1;
  input_weights_one_dim = (float *) malloc((in + 1)* (hid + 1) * sizeof(float));
  input_weights_prev_one_dim = (float *) malloc((in + 1)* (hid + 1) * sizeof(float));
  partial_sum = (float *) malloc(num_blocks * WIDTH * sizeof(float));

  // this preprocessing stage is added to correct the bugs of wrong memcopy using two-dimensional net->inputweights
  for (int k = 0; k <= in; k++) {	
    for (int j = 0; j <= hid; j++) {
      input_weights_one_dim[m] = net->input_weights[k][j];
      input_weights_prev_one_dim[m] = net-> input_prev_weights[k][j];
      m++;
    }
  }

  //cudaMalloc((void**) &input_cuda, (in + 1) * sizeof(float));
  //cudaMalloc((void**) &output_hidden_cuda, (hid + 1) * sizeof(float));
  //cudaMalloc((void**) &input_hidden_cuda, (in + 1) * (hid + 1) * sizeof(float));
  //cudaMalloc((void**) &hidden_partial_sum, num_blocks * WIDTH * sizeof(float));
  input_cuda = (float*)malloc((in + 1) * sizeof(float));
  //output_hidden_cuda = (float*)malloc((hid + 1) * sizeof(float));
  input_hidden_cuda = (float*)malloc((in + 1) * (hid + 1) * sizeof(float));
  hidden_partial_sum = (float*)malloc(num_blocks * WIDTH * sizeof(float));
#endif

#ifdef CPU

  printf("Performing CPU computation\n");
  unsigned int start = gettime();
  bpnn_layerforward(net->input_units, net->hidden_units,net->input_weights, in, hid);
  unsigned int end = gettime();
  printf("CPU time: \t%f\n", (end - start) * 1e-6);
#endif

#ifdef GPU

  printf("Performing GPU computation\n");


  //cudaMemcpy(input_cuda, net->input_units, (in + 1) * sizeof(float), cudaMemcpyHostToDevice);
  //cudaMemcpy(input_hidden_cuda, input_weights_one_dim, (in + 1) * (hid + 1) * sizeof(float), cudaMemcpyHostToDevice);
  memcpy(input_cuda, net->input_units, (in + 1) * sizeof(float));
  memcpy(input_hidden_cuda, input_weights_one_dim, (in + 1) * (hid + 1) * sizeof(float));

  start = gettime();
  bpnn_layerforward_CUDA(input_cuda, input_hidden_cuda, hidden_partial_sum, in, hid, grid, threads, 1, 0);
  end = gettime();
  printf("GPU time: \t%f\n", (end - start) * 1e-6);
  //cudaThreadSynchronize();

  //cudaError_t error = cudaGetLastError();
  //	if (error != cudaSuccess) {
  //		printf("bpnn kernel error: %s\n", cudaGetErrorString(error));
  //		exit(EXIT_FAILURE);
  //	}

  //cudaMemcpy(partial_sum, hidden_partial_sum, num_blocks * WIDTH * sizeof(float), cudaMemcpyDeviceToHost);
  memcpy(partial_sum, hidden_partial_sum, num_blocks * WIDTH * sizeof(float));   
  for (int j = 1; j <= hid; j++) {
    sum = 0.0;
    for (int k = 0; k < num_blocks; k++) {	
      sum += partial_sum[k * hid + j-1] ;
    }
    sum += net->input_weights[0][j];
    net-> hidden_units[j] = (float)(1.0 / (1.0 + exp(-sum)));
  }
#endif

  bpnn_layerforward(net->hidden_units, net->output_units, net->hidden_weights, hid, out);
  bpnn_output_error(net->output_delta, net->target, net->output_units, out, &out_err);
  bpnn_hidden_error(net->hidden_delta, hid, net->output_delta, out, net->hidden_weights, net->hidden_units, &hid_err);  
  bpnn_adjust_weights(net->output_delta, out, net->hidden_units, hid, net->hidden_weights, net->hidden_prev_weights);

#ifdef CPU

  bpnn_adjust_weights(net->hidden_delta, hid, net->input_units, in, net->input_weights, net->input_prev_weights);

#endif  


#ifdef GPU

  //cudaMalloc((void**) &hidden_delta_cuda, (hid + 1) * sizeof(float));
  //cudaMalloc((void**) &input_prev_weights_cuda, (in + 1) * (hid + 1) * sizeof(float));
  hidden_delta_cuda = (float*)malloc((hid + 1) * sizeof(float));
  input_prev_weights_cuda = (float*)malloc((in + 1) * (hid + 1) * sizeof(float));


  //cudaMemcpy(hidden_delta_cuda, net->hidden_delta, (hid + 1) * sizeof(float), cudaMemcpyHostToDevice);
  //cudaMemcpy(input_prev_weights_cuda, input_weights_prev_one_dim, (in + 1) * (hid + 1) * sizeof(float), cudaMemcpyHostToDevice);
  //cudaMemcpy(input_hidden_cuda, input_weights_one_dim, (in + 1) * (hid + 1) * sizeof(float), cudaMemcpyHostToDevice);
  memcpy(hidden_delta_cuda, net->hidden_delta, (hid + 1) * sizeof(float));
  memcpy(input_prev_weights_cuda, input_weights_prev_one_dim, (in + 1) * (hid + 1) * sizeof(float));
  memcpy(input_hidden_cuda, input_weights_one_dim, (in + 1) * (hid + 1) * sizeof(float));

  printf("%d %d\n", hid + 1, (in + 1) * (hid + 1));
  bpnn_adjust_weights_cuda(hidden_delta_cuda, hid, input_cuda, in, input_hidden_cuda, input_prev_weights_cuda, grid, threads, 1, 0);



  //cudaMemcpy(net->input_units, input_cuda, (in + 1) * sizeof(float), cudaMemcpyDeviceToHost);
  //cudaMemcpy(input_weights_one_dim, input_hidden_cuda, (in + 1) * (hid + 1) * sizeof(float), cudaMemcpyDeviceToHost);
  memcpy(net->input_units, input_cuda, (in + 1) * sizeof(float));
  memcpy(input_weights_one_dim, input_hidden_cuda, (in + 1) * (hid + 1) * sizeof(float)); 

  int status = 1;
  float EPSILON = 0.001f;
  FILE *pFile;
  pFile = fopen("cuda/gold_output.txt", "r");
  if (pFile == NULL) {
    fputs("fopen example", pFile);
  }
  //fprintf(pFile, "net->input_units\n");
  float gold_input_units_val;
  for (int k = 0; k < in + 1; k++) {
    //fprintf(pFile, "%f\n", net->input_units[k]);
    fscanf(pFile, "%f\n", &gold_input_units_val);
    if (gold_input_units_val - net->input_units[k] < -EPSILON ||
        gold_input_units_val - net->input_units[k] > EPSILON) {
      printf("Mismatch at %d: gold = %f, calc = %f.\n",
          k, gold_input_units_val, net->input_units[k]);
      status = 0;
      break;
    }
  }

  float gold_weights_one_dim_val;
  //fprintf(pFile, "input_weights_one_dim\n");
  for (int k = 0; k < (in + 1) * (hid + 1); k++) {
    //fprintf(pFile, "%f\n", input_weights_one_dim[k]);
    fscanf(pFile, "%f\n", &gold_weights_one_dim_val);
    if (gold_weights_one_dim_val - input_weights_one_dim[k] < -EPSILON ||
        gold_weights_one_dim_val - input_weights_one_dim[k] > EPSILON) {
      printf("Mismatch at %d: gold = %f, calc = %f.\n",
          k, gold_weights_one_dim_val, input_weights_one_dim[k]);
      status = 0;
      break;
    }
  }

  //cudaFree(input_cuda);
  //cudaFree(output_hidden_cuda);
  //cudaFree(input_hidden_cuda);
  //cudaFree(hidden_partial_sum);
  //cudaFree(input_prev_weights_cuda);
  //cudaFree(hidden_delta_cuda);
  free(input_cuda);
  //free(output_hidden_cuda);
  free(input_hidden_cuda);
  free(hidden_partial_sum);
  free(input_prev_weights_cuda);
  free(hidden_delta_cuda);

  free(partial_sum);
  free(input_weights_one_dim);
  free(input_weights_prev_one_dim);

  if (status == 1)
    printf("PASSED.\n");
  else
    printf("FAILED.\n");
#endif   

}
void bpnn_train_cuda(BPNN *net, float *eo, float *eh)
{
	int j, k;
	int in, hid, out;
	float out_err, hid_err;
	struct timeval tv;
	
	in = net->input_n;
	hid = net->hidden_n;
	out = net->output_n;   
	
#ifdef GPU  
	int m = 0;
	float *partial_sum;
	float sum;
	float *input_weights_one_dim;
	float *input_weights_prev_one_dim;
	num_blocks = in / 16;  
	CUdeviceptr input_cuda;
	CUdeviceptr input_hidden_cuda;
	CUdeviceptr output_hidden_cuda;
	CUdeviceptr hidden_partial_sum;
	CUdeviceptr hidden_delta_cuda;
	CUdeviceptr input_prev_weights_cuda;
	CUcontext ctx;
	CUmodule mod;
	CUresult res;
	
	input_weights_one_dim = 
		(float *) malloc((in + 1) * (hid + 1) * sizeof(float));
	input_weights_prev_one_dim = 
		(float *) malloc((in + 1) * (hid + 1) * sizeof(float));
	partial_sum = (float *) malloc(num_blocks * WIDTH * sizeof(float));
	
	/* this preprocessing stage is added to correct the bugs of wrong 
	   memcopy using two-dimensional net->inputweights */
	for (k = 0; k <= in; k++) {	
		for (j = 0; j <= hid; j++) {
			input_weights_one_dim[m] = net->input_weights[k][j];
			input_weights_prev_one_dim[m] = net-> input_prev_weights[k][j];
			m++;
		}
	}
	
	/*
	 * call our common CUDA initialization utility function.
	 */
	res = cuda_driver_api_init(&ctx, &mod, "./backprop.cubin");
	if (res != CUDA_SUCCESS) {
		printf("cuda_driver_api_init failed: res = %u\n", res);
		return ;
	}
	
	/*
	 * allocate device memory space
	 */
	res = cuMemAlloc(&input_cuda, (in + 1) * sizeof(float));
	if (res != CUDA_SUCCESS) {
		printf("cuMemAlloc failed: res = %u\n", res);
		return ;
	}
	res = cuMemAlloc(&output_hidden_cuda, (hid + 1) * sizeof(float));
	if (res != CUDA_SUCCESS) {
		printf("cuMemAlloc failed: res = %u\n", res);
		return ;
	}
	res = cuMemAlloc(&input_hidden_cuda, (in + 1) * (hid + 1) * sizeof(float));
	if (res != CUDA_SUCCESS) {
		printf("cuMemAlloc failed: res = %u\n", res);
		return ;
	}
	res = cuMemAlloc(&hidden_partial_sum, num_blocks * WIDTH * sizeof(float));
	if (res != CUDA_SUCCESS) {
		printf("cuMemAlloc failed: res = %u\n", res);
		return ;
	}
	res = cuMemAlloc(&hidden_delta_cuda, (hid + 1) * sizeof(float));
	if (res != CUDA_SUCCESS) {
		printf("cuMemAlloc failed: res = %u\n", res);
		return ;
	}
	res = cuMemAlloc(&input_prev_weights_cuda, (in + 1) * (hid + 1) * sizeof(float));
	if (res != CUDA_SUCCESS) {
		printf("cuMemAlloc failed: res = %u\n", res);
		return ;
	}
#endif

#ifdef CPU
	printf("Performing CPU computation\n");
	bpnn_layerforward(net->input_units, net->hidden_units,net->input_weights, in, hid);
#endif

#ifdef GPU
	printf("Performing GPU computation\n");
    //printf("in= %d, hid = %d, numblocks = %d\n", in, hid, num_blocks);
  
	/*
	 * measurement start!
	 */
	time_measure_start(&tv);

	res = cuMemcpyHtoD(input_cuda, net->input_units, (in + 1) * sizeof(float));
	if (res != CUDA_SUCCESS) {
		printf("cuMemcpyHtoD failed: res = %u\n", res);
		return ;
	}
	res = cuMemcpyHtoD(input_hidden_cuda, input_weights_one_dim, (in + 1) * (hid + 1) * sizeof(float));
	if (res != CUDA_SUCCESS) {
		printf("cuMemcpyHtoD failed: res = %u\n", res);
		return ;
	}

	res = bpnn_layerforward_launch(mod, input_cuda, output_hidden_cuda,
								   input_hidden_cuda, hidden_partial_sum,
								   in, hid);
	if (res != CUDA_SUCCESS) {
		printf("bpnn_layerforward failed: res = %u\n", res);
		return ;
	}
 
	cuCtxSynchronize();
  
#if 0
	cudaError_t error = cudaGetLastError();
	if (error != cudaSuccess) {
		printf("bpnn kernel error: %s\n", cudaGetErrorString(error));
		exit(EXIT_FAILURE);
	}
#endif
  
	res = cuMemcpyDtoH(partial_sum, hidden_partial_sum, num_blocks * WIDTH * sizeof(float));
	if (res != CUDA_SUCCESS) {
		printf("cuMemcpyDtoH(layerforward) failed: res = %u\n", res);
		return ;
	}

	for (j = 1; j <= hid; j++) {
		sum = 0.0;
		for (k = 0; k < num_blocks; k++) {	
			sum += partial_sum[k * hid + j-1] ;
		}
		sum += net->input_weights[0][j];
		net-> hidden_units[j] = (float) (1.0 / (1.0 + exp(-sum)));
	}
  #endif

	bpnn_layerforward(net->hidden_units, net->output_units, net->hidden_weights, hid, out);
	bpnn_output_error(net->output_delta, net->target, net->output_units, out, &out_err);
	bpnn_hidden_error(net->hidden_delta, hid, net->output_delta, out, net->hidden_weights, net->hidden_units, &hid_err);  
	bpnn_adjust_weights(net->output_delta, out, net->hidden_units, hid, net->hidden_weights, net->hidden_prev_weights);
	
#ifdef CPU
	bpnn_adjust_weights(net->hidden_delta, hid, net->input_units, in, net->input_weights, net->input_prev_weights);
#endif  

#ifdef GPU
	res = cuMemcpyHtoD(hidden_delta_cuda, net->hidden_delta, (hid + 1) * sizeof(float));
	if (res != CUDA_SUCCESS) {
		printf("cuMemcpyHtoD failed: res = %u\n", res);
		return ;
	}
	res = cuMemcpyHtoD(input_prev_weights_cuda, input_weights_prev_one_dim, (in + 1) * (hid + 1) * sizeof(float));
	if (res != CUDA_SUCCESS) {
		printf("cuMemcpyHtoD failed: res = %u\n", res);
		return ;
	}
	res = cuMemcpyHtoD(input_hidden_cuda, input_weights_one_dim, (in + 1) * (hid + 1) * sizeof(float));
	if (res != CUDA_SUCCESS) {
		printf("cuMemcpyHtoD failed: res = %u\n", res);
		return ;
	}

	res = bpnn_adjust_weights_launch(mod, hidden_delta_cuda, hid, 
									 input_cuda, in, 
									 input_hidden_cuda, 
									 input_prev_weights_cuda);
	if (res != CUDA_SUCCESS) {
		printf("bpnn_adjust_weights failed: res = %u\n", res);
		return ;
	}

	res = cuMemcpyDtoH(net->input_units, input_cuda, (in + 1) * sizeof(float));
	if (res != CUDA_SUCCESS) {
		printf("cuMemcpyDtoH(adjust_weights) failed: res = %u\n", res);
		return ;
	}

	res = cuMemcpyDtoH(input_weights_one_dim, input_hidden_cuda, (in + 1) * (hid + 1) * sizeof(float));
	if (res != CUDA_SUCCESS) {
		printf("cuMemcpyDtoH(adjust_weights) failed: res = %u\n", res);
		return ;
	}
    
	cuMemFree(input_cuda);
	cuMemFree(output_hidden_cuda);
	cuMemFree(input_hidden_cuda);
	cuMemFree(hidden_partial_sum);
	cuMemFree(input_prev_weights_cuda);
	cuMemFree(hidden_delta_cuda);

	/*
	 * measurement end! will print out the time.
	 */
	time_measure_end(&tv);

	res = cuda_driver_api_exit(ctx, mod);
	if (res != CUDA_SUCCESS) {
		printf("cuda_driver_api_exit faild: res = %u\n", res);
		return ;
	}
	
	free(partial_sum);
	free(input_weights_one_dim);
	free(input_weights_prev_one_dim);
#endif   
}
int bpnn_train_kernel(BPNN *net, float *eo, float *eh)
{
	int in, hid, out;
	float out_err, hid_err;
  
	in = net->input_n;
	hid = net->hidden_n;
	out = net->output_n;   

        //int use_device = 0;  // use CPU as device
	int use_device = 2;  // use GPU as device
        //int use_device = 2;  // use FPGA as device
	if(initialize(use_device)) return -1;
         
	int sourcesize = 1024*1024;
	char * source = (char *)calloc(sourcesize, sizeof(char)); 
	if(!source) { printf("ERROR: calloc(%d) failed\n", sourcesize); return -1; }

	// read the kernel core source
	char * kernel_bp1  = "bpnn_layerforward_ocl";
	char * kernel_bp2  = "bpnn_adjust_weights_ocl";
	char * tempchar = "./backprop_kernel.cl";
        char * krnl_file = "./binary/backprop_kernel_default.xclbin";
 
        cl_int err = 0;
        cl_program prog;
        // create program from source
        if (use_device < 2 ) {
	    FILE * fp = fopen(tempchar, "rb"); 
	    if(!fp) { printf("ERROR: unable to open '%s'\n", tempchar); return -1; }
	    fread(source + strlen(source), sourcesize, 1, fp);
	    fclose(fp);
		
	    // compile kernel
	    err = 0;
	    const char * slist[2] = { source, 0 };
	    prog = clCreateProgramWithSource(context, 1, slist, NULL, &err);
	    if(err != CL_SUCCESS) { printf("ERROR: clCreateProgramWithSource() => %d\n", err); return -1; }
        } 
        // create program from binary
        else {
            char *krnl_bin;
	    const size_t krnl_size = load_file_to_memory(krnl_file, &krnl_bin);

            err = 0;
            prog = clCreateProgramWithBinary(context, 1,
	                                    &device_list[0], &krnl_size,
	                                    (const unsigned char**) &krnl_bin,
	                                    NULL, &err);
            if ((!prog) || (err!=CL_SUCCESS)) {
		printf("Error: Failed to create compute program from binary %d!\n",
		       err);
		printf("Test failed\n");
		exit(EXIT_FAILURE);
	    }
        }
        
	err = clBuildProgram(prog, 0, NULL, NULL, NULL, NULL);
	{ // show warnings/errors
		//static char log[65536]; memset(log, 0, sizeof(log));
		//cl_device_id device_id = 0;
		//err = clGetContextInfo(context, CL_CONTEXT_DEVICES, sizeof(device_id), &device_id, NULL);
		//clGetProgramBuildInfo(prog, device_id, CL_PROGRAM_BUILD_LOG, sizeof(log)-1, log, NULL);
		//if(err || strstr(log,"warning:") || strstr(log, "error:")) printf("<<<<\n%s\n>>>>\n", log);
	}
	if(err != CL_SUCCESS) { printf("ERROR: clBuildProgram() => %d\n", err); return -1; }
    	
	cl_kernel kernel1;
	cl_kernel kernel2;
	kernel1 = clCreateKernel(prog, kernel_bp1, &err);  
        if(err != CL_SUCCESS) { printf("ERROR: clCreateKernel(kernel1) 0 => %d\n", err); return -1; }
	kernel2 = clCreateKernel(prog, kernel_bp2, &err);  
	if(err != CL_SUCCESS) { printf("ERROR: clCreateKernel(kernel2) 0 => %d\n", err); return -1; }
	/* clReleaseProgram(prog); */
	
	float *input_weights_one_dim;
    float *input_weights_prev_one_dim;
	float * partial_sum;
	float sum;
	float num_blocks = in / BLOCK_SIZE;
	
	input_weights_one_dim = (float *) malloc((in + 1)* (hid + 1) * sizeof(float));
	input_weights_prev_one_dim = (float *) malloc((in + 1)* (hid + 1) * sizeof(float));
	partial_sum = (float *) malloc(num_blocks * WIDTH * sizeof(float));
	
	// set global and local workitems
	size_t global_work[3] = { BLOCK_SIZE, BLOCK_SIZE * num_blocks, 1 }; 
	size_t local_work[3] = { BLOCK_SIZE, BLOCK_SIZE, 1 };
	
	// this preprocessing stage is temporarily added to correct the bug of wrong memcopy using two-dimensional net->inputweights
	// todo: fix mem allocation
	int m = 0;
	for (int k = 0; k <= in; k++) {	
		for (int j = 0; j <= hid; j++) {
		input_weights_one_dim[m] = net->input_weights[k][j];
		input_weights_prev_one_dim[m] = net-> input_prev_weights[k][j];
	    m++;
		}
	}
	
	cl_mem input_hidden_ocl;
	cl_mem input_ocl;
	cl_mem output_hidden_ocl;
	cl_mem hidden_partial_sum;
	cl_mem hidden_delta_ocl;
	cl_mem input_prev_weights_ocl;
  
	input_ocl = clCreateBuffer(context, CL_MEM_READ_WRITE, (in + 1) * sizeof(float), NULL, &err );
	if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer input_ocl\n"); return -1;}
	input_hidden_ocl = clCreateBuffer(context, CL_MEM_READ_WRITE, (in + 1) * (hid + 1) * sizeof(float), NULL, &err );
	if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer input_hidden_ocl\n"); return -1;}
	output_hidden_ocl = clCreateBuffer(context, CL_MEM_READ_WRITE, (hid + 1) * sizeof(float), NULL, &err );
	if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer output_hidden_ocl\n"); return -1;}
	hidden_partial_sum = clCreateBuffer(context, CL_MEM_READ_WRITE, num_blocks * WIDTH * sizeof(float), NULL, &err );
	if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer hidden_partial_sum\n"); return -1;}
	hidden_delta_ocl = clCreateBuffer(context, CL_MEM_READ_WRITE, (hid + 1) * sizeof(float), NULL, &err );
	if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer hidden_delta_ocl\n"); return -1;}
	input_prev_weights_ocl = clCreateBuffer(context, CL_MEM_READ_WRITE, (in + 1) * (hid + 1) * sizeof(float), NULL, &err );
	if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer input_prev_weights_ocl\n"); return -1;}
		
	printf("Performing GPU computation\n");
	
	//write buffers
	err = clEnqueueWriteBuffer(cmd_queue, input_ocl, 1, 0, (in + 1) * sizeof(float), net->input_units, 0, 0, 0);
	if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer input_ocl\n"); return -1; }
	err = clEnqueueWriteBuffer(cmd_queue, input_hidden_ocl, 1, 0, (in + 1) * (hid + 1) * sizeof(float), input_weights_one_dim, 0, 0, 0);
	if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer input_hidden_ocl\n"); return -1; }
 
	clSetKernelArg(kernel1, 0, sizeof(void *), (void*) &input_ocl);
	clSetKernelArg(kernel1, 1, sizeof(void *), (void*) &output_hidden_ocl);
	clSetKernelArg(kernel1, 2, sizeof(void *), (void*) &input_hidden_ocl);
	clSetKernelArg(kernel1, 3, sizeof(void *), (void*) &hidden_partial_sum );
	clSetKernelArg(kernel1, 4, sizeof(float) *  HEIGHT, (void*)NULL );
	clSetKernelArg(kernel1, 5, sizeof(float ) *  HEIGHT * WIDTH, (void*)NULL );
	clSetKernelArg(kernel1, 6, sizeof(cl_int), (void*) &in);
	clSetKernelArg(kernel1, 7, sizeof(cl_int), (void*) &hid);
  
	err = clEnqueueNDRangeKernel(cmd_queue, kernel1, 3, NULL, global_work, local_work, 0, NULL, 0);
        if(err == CL_INVALID_KERNEL) {printf("Error is invalid kernel\n");}
	if(err != CL_SUCCESS) { printf("ERROR: 1 kernel1 clEnqueueNDRangeKernel()=>%d failed\n", err); return -1; }	
  
	err = clEnqueueReadBuffer(cmd_queue, hidden_partial_sum, 1, 0, num_blocks * WIDTH * sizeof(float), partial_sum, 0, 0, 0);
	if(err != CL_SUCCESS) { printf("ERROR: 1  clEnqueueReadBuffer: partial sum\n"); return -1; }	
  
	for (int j = 1; j <= hid; j++) {
		sum = 0.0;
		for (int k = 0; k < num_blocks; k++) {	
		sum += partial_sum[k * hid + j-1] ;
    }
		sum += net->input_weights[0][j];
		net-> hidden_units[j] = float(1.0 / (1.0 + exp(-sum)));
	}

	
	bpnn_layerforward(net->hidden_units, net->output_units, net->hidden_weights, hid, out);
	bpnn_output_error(net->output_delta, net->target, net->output_units, out, &out_err);
	bpnn_hidden_error(net->hidden_delta, hid, net->output_delta, out, net->hidden_weights, net->hidden_units, &hid_err);  
	bpnn_adjust_weights(net->output_delta, out, net->hidden_units, hid, net->hidden_weights, net->hidden_prev_weights);

	err = clEnqueueWriteBuffer(cmd_queue, hidden_delta_ocl,       1, 0, (hid + 1) * sizeof(float), net->hidden_delta, 0, 0, 0);
	if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer hidden_delta_ocl\n"); return -1; }
	err = clEnqueueWriteBuffer(cmd_queue, input_prev_weights_ocl, 1, 0, (in + 1) * (hid + 1) * sizeof(float), input_weights_prev_one_dim, 0, 0, 0);
	if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer input_prev_weights_ocl\n"); return -1; }
	err = clEnqueueWriteBuffer(cmd_queue, input_hidden_ocl,       1, 0, (in + 1) * (hid + 1) * sizeof(float), input_weights_one_dim, 0, 0, 0);
	if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer input_hidden_ocl\n"); return -1; }
  
	clSetKernelArg(kernel2, 0, sizeof(void *), (void*) &hidden_delta_ocl);
	clSetKernelArg(kernel2, 1, sizeof(cl_int), (void*) &hid);
	clSetKernelArg(kernel2, 2, sizeof(void *), (void*) &input_ocl);
	clSetKernelArg(kernel2, 3, sizeof(cl_int), (void*) &in);
	clSetKernelArg(kernel2, 4, sizeof(void *), (void*) &input_hidden_ocl);
	clSetKernelArg(kernel2, 5, sizeof(void *), (void*) &input_prev_weights_ocl );
  
	err = clEnqueueNDRangeKernel(cmd_queue, kernel2, 2, NULL, global_work, local_work, 0, 0, 0);
	if(err != CL_SUCCESS) { printf("ERROR: 1  clEnqueueNDRangeKernel()=>%d failed\n", err); return -1; }	
  
	err = clEnqueueReadBuffer(cmd_queue, input_ocl, 1, 0, (in + 1) * sizeof(float), net->input_units, 0, 0, 0);
	if(err != CL_SUCCESS) { printf("ERROR: 1  clEnqueueReadBuffer: input_ocl\n"); return -1; }	
	err = clEnqueueReadBuffer(cmd_queue, input_hidden_ocl, 1, 0, (in + 1) * (hid + 1) * sizeof(float), input_weights_one_dim, 0, 0, 0);
	if(err != CL_SUCCESS) { printf("ERROR: 1  clEnqueueReadBuffer: input_hidden_ocl\n"); return -1; }	
  
	clReleaseMemObject(input_ocl);
	clReleaseMemObject(output_hidden_ocl);
	clReleaseMemObject(input_hidden_ocl);
	clReleaseMemObject(hidden_partial_sum);
	clReleaseMemObject(input_prev_weights_ocl);
  
	free(input_weights_prev_one_dim);
	free(partial_sum);
	free(input_weights_one_dim);

}