std::unique_ptr<TorchStage> SpatialDivisiveNormalization::loadFromFile(
    std::ifstream& file) {
  // Deserializes a SpatialDivisiveNormalization stage from a binary stream.
  // The fields are consumed in this order:
  //   int32  inner kernel extent
  //   int32  outer kernel extent (a value <= 1 selects a 1D kernel)
  //   float  kernel weights, kernel->nelems() of them
  //   float  threshold
  // The original author's note still applies: the weights are pushed into a
  // Tensor here and the constructor copies them back out again, which is
  // wasteful but was judged good enough.
  // NOTE(review): none of the reads are checked for failure -- confirm that
  // callers only hand in well-formed streams.
  int32_t inner_len;
  int32_t outer_len;
  file.read(reinterpret_cast<char*>(&inner_len), sizeof(inner_len));
  file.read(reinterpret_cast<char*>(&outer_len), sizeof(outer_len));

  std::shared_ptr<Tensor<float>> kernel;
  if (outer_len <= 1) {
    // Single row of weights --> 1D kernel
    const uint32_t rank = 1;
    uint32_t extent[1] = {static_cast<uint32_t>(inner_len)};
    kernel.reset(new Tensor<float>(rank, extent));
  } else {
    // More than one row --> 2D kernel, inner extent first
    const uint32_t rank = 2;
    uint32_t extent[2] = {static_cast<uint32_t>(inner_len),
                          static_cast<uint32_t>(outer_len)};
    kernel.reset(new Tensor<float>(rank, extent));
  }

  // Stage the weights in host memory, then hand them to the tensor.
  const auto num_weights = kernel->nelems();
  std::unique_ptr<float[]> weights(new float[num_weights]);
  file.read(reinterpret_cast<char*>(weights.get()),
            num_weights * sizeof(float));
  kernel->setData(weights.get());

  float threshold;
  file.read(reinterpret_cast<char*>(&threshold), sizeof(threshold));

  return std::unique_ptr<TorchStage>(
      new SpatialDivisiveNormalization(kernel, threshold));
}
// Example no. 2
// 0
// Parses the value that follows the option `name` located at argv[index].
//
// argc, argv : the program arguments as received by main
// index      : position of the option itself; its value is expected at index+1
// name       : option text (e.g. "-cores"), used only in the error messages
//
// Returns the parsed value (always > 0) on success. On any problem (missing
// value, non-numeric value, value <= 0) prints an error and returns -1.
static int
parse_positive_option(	int argc,
						char *argv [],
						int index,
						const char *name)
{
	int value;

	// argv[argc] is a null pointer, so a value is present only if index+1 < argc
	if(index+1 >= argc){
		printf("ERROR: Missing value to %s parameter\n", name);
		return -1;
	}

	// value must be a number
	if(isInteger(argv[index+1])!=1){
		printf("ERROR: Value to %s parameter in not a number\n", name);
		return -1;
	}

	// value must be strictly positive, as the message states
	value = atoi(argv[index+1]);
	if(value<=0){
		printf("ERROR: Wrong value to %s parameter, cannot be <=0\n", name);
		return -1;
	}

	return value;
}

// Program entry point.
//
// Accepted arguments: "-cores <n>" and "-boxes1d <n>" (both default to 1 and
// must be > 0). Builds a boxes1d^3 grid of boxes with their neighbor lists,
// fills positions and charges with pseudo-random values seeded by SEED, zeroes
// the force array, runs kernel_cpu() on them and releases the memory.
//
// Returns 0 on success. Argument errors also return 0, as they always have, so
// existing scripts see the same exit status; an allocation failure returns 1.
int 
main(	int argc, 
		char *argv [])
{

	//======================================================================================================================================================150
	//	CPU/MCPU VARIABLES
	//======================================================================================================================================================150

	// timer
	long long time0;

	time0 = get_time();

	// timers (only consumed by the commented-out timing report at the end)
	long long time1;
	long long time2;
	long long time3;
	long long time4;
	long long time5;
	long long time6;
	long long time7;

	// counters
	int i, j, k, l, m, n;

	// scratch values
	int value;
	int slot;

	// system memory
	par_str par_cpu;
	dim_str dim_cpu;
	box_str* box_cpu;
	FOUR_VECTOR* rv_cpu;
	fp* qv_cpu;
	FOUR_VECTOR* fv_cpu;
	int nh;

	time1 = get_time();

	//======================================================================================================================================================150
	//	CHECK INPUT ARGUMENTS
	//======================================================================================================================================================150

	// assign default values
	dim_cpu.cores_arg = 1;
	dim_cpu.boxes1d_arg = 1;

	// go through arguments
	for(dim_cpu.cur_arg=1; dim_cpu.cur_arg<argc; dim_cpu.cur_arg++){
		// check if -cores
		if(strcmp(argv[dim_cpu.cur_arg], "-cores")==0){
			value = parse_positive_option(argc, argv, dim_cpu.cur_arg, "-cores");
			if(value<0){
				return 0;
			}
			dim_cpu.cores_arg = value;
			// skip over the value that was just consumed
			dim_cpu.cur_arg = dim_cpu.cur_arg+1;
		}
		// check if -boxes1d
		else if(strcmp(argv[dim_cpu.cur_arg], "-boxes1d")==0){
			value = parse_positive_option(argc, argv, dim_cpu.cur_arg, "-boxes1d");
			if(value<0){
				return 0;
			}
			dim_cpu.boxes1d_arg = value;
			// skip over the value that was just consumed
			dim_cpu.cur_arg = dim_cpu.cur_arg+1;
		}
		// unknown
		else{
			printf("ERROR: Unknown parameter\n");
			return 0;
		}
	}

	// Print configuration
	printf("Configuration used: cores = %d, boxes1d = %d\n", dim_cpu.cores_arg, dim_cpu.boxes1d_arg);

	time2 = get_time();

	//======================================================================================================================================================150
	//	INPUTS
	//======================================================================================================================================================150

	par_cpu.alpha = 0.5;

	time3 = get_time();

	//======================================================================================================================================================150
	//	DIMENSIONS
	//======================================================================================================================================================150

	// total number of boxes
	dim_cpu.number_boxes = dim_cpu.boxes1d_arg * dim_cpu.boxes1d_arg * dim_cpu.boxes1d_arg;

	// total number of particles and the size in bytes of the per-particle arrays
	dim_cpu.space_elem = dim_cpu.number_boxes * NUMBER_PAR_PER_BOX;
	dim_cpu.space_mem = dim_cpu.space_elem * sizeof(FOUR_VECTOR);
	dim_cpu.space_mem2 = dim_cpu.space_elem * sizeof(fp);

	// box array
	dim_cpu.box_mem = dim_cpu.number_boxes * sizeof(box_str);

	time4 = get_time();

	//======================================================================================================================================================150
	//	SYSTEM MEMORY
	//======================================================================================================================================================150

	//====================================================================================================100
	//	BOX
	//====================================================================================================100

	// allocate boxes
	box_cpu = (box_str*)malloc(dim_cpu.box_mem);
	if(box_cpu==NULL){
		printf("ERROR: Failed to allocate memory for boxes\n");
		return 1;
	}

	// initialize number of home boxes
	nh = 0;

	// home boxes in z direction
	for(i=0; i<dim_cpu.boxes1d_arg; i++){
		// home boxes in y direction
		for(j=0; j<dim_cpu.boxes1d_arg; j++){
			// home boxes in x direction
			for(k=0; k<dim_cpu.boxes1d_arg; k++){

				// current home box
				box_cpu[nh].x = k;
				box_cpu[nh].y = j;
				box_cpu[nh].z = i;
				box_cpu[nh].number = nh;
				box_cpu[nh].offset = nh * NUMBER_PAR_PER_BOX;

				// initialize number of neighbor boxes
				box_cpu[nh].nn = 0;

				// neighbor boxes in z direction
				for(l=-1; l<2; l++){
					// neighbor boxes in y direction
					for(m=-1; m<2; m++){
						// neighbor boxes in x direction
						for(n=-1; n<2; n++){

							// the home box is not its own neighbor
							if(l==0 && m==0 && n==0){
								continue;
							}

							// neighbors that fall outside the grid do not exist
							if(	(i+l)<0 || (j+m)<0 || (k+n)<0 ||
								(i+l)>=dim_cpu.boxes1d_arg || (j+m)>=dim_cpu.boxes1d_arg || (k+n)>=dim_cpu.boxes1d_arg){
								continue;
							}

							// current neighbor box
							slot = box_cpu[nh].nn;
							box_cpu[nh].nei[slot].x = (k+n);
							box_cpu[nh].nei[slot].y = (j+m);
							box_cpu[nh].nei[slot].z = (i+l);
							box_cpu[nh].nei[slot].number =	(box_cpu[nh].nei[slot].z * dim_cpu.boxes1d_arg * dim_cpu.boxes1d_arg) + 
															(box_cpu[nh].nei[slot].y * dim_cpu.boxes1d_arg) + 
															 box_cpu[nh].nei[slot].x;
							box_cpu[nh].nei[slot].offset = box_cpu[nh].nei[slot].number * NUMBER_PAR_PER_BOX;

							// increment neighbor box
							box_cpu[nh].nn = box_cpu[nh].nn + 1;

						} // neighbor boxes in x direction
					} // neighbor boxes in y direction
				} // neighbor boxes in z direction

				// increment home box
				nh = nh + 1;

			} // home boxes in x direction
		} // home boxes in y direction
	} // home boxes in z direction

	//====================================================================================================100
	//	PARAMETERS, DISTANCE, CHARGE AND FORCE
	//====================================================================================================100

	// random generator seeded with the SEED constant
	srand(SEED);

	// input (distances)
	rv_cpu = (FOUR_VECTOR*)malloc(dim_cpu.space_mem);
	if(rv_cpu==NULL){
		printf("ERROR: Failed to allocate memory for distances\n");
		free(box_cpu);
		return 1;
	}
	for(i=0; i<dim_cpu.space_elem; i=i+1){
		rv_cpu[i].v = (rand()%10 + 1) / 10.0;			// get a number in the range 0.1 - 1.0
		rv_cpu[i].x = (rand()%10 + 1) / 10.0;			// get a number in the range 0.1 - 1.0
		rv_cpu[i].y = (rand()%10 + 1) / 10.0;			// get a number in the range 0.1 - 1.0
		rv_cpu[i].z = (rand()%10 + 1) / 10.0;			// get a number in the range 0.1 - 1.0
	}

	// input (charge)
	qv_cpu = (fp*)malloc(dim_cpu.space_mem2);
	if(qv_cpu==NULL){
		printf("ERROR: Failed to allocate memory for charges\n");
		free(rv_cpu);
		free(box_cpu);
		return 1;
	}
	for(i=0; i<dim_cpu.space_elem; i=i+1){
		qv_cpu[i] = (rand()%10 + 1) / 10.0;			// get a number in the range 0.1 - 1.0
	}

	// output (forces)
	fv_cpu = (FOUR_VECTOR*)malloc(dim_cpu.space_mem);
	if(fv_cpu==NULL){
		printf("ERROR: Failed to allocate memory for forces\n");
		free(qv_cpu);
		free(rv_cpu);
		free(box_cpu);
		return 1;
	}
	for(i=0; i<dim_cpu.space_elem; i=i+1){
		fv_cpu[i].v = 0;								// set to 0, because kernels keeps adding to initial value
		fv_cpu[i].x = 0;								// set to 0, because kernels keeps adding to initial value
		fv_cpu[i].y = 0;								// set to 0, because kernels keeps adding to initial value
		fv_cpu[i].z = 0;								// set to 0, because kernels keeps adding to initial value
	}

	time5 = get_time();

	//======================================================================================================================================================150
	//	KERNEL
	//======================================================================================================================================================150

	//====================================================================================================100
	//	CPU/MCPU
	//====================================================================================================100

	kernel_cpu(	par_cpu,
				dim_cpu,
				box_cpu,
				rv_cpu,
				qv_cpu,
				fv_cpu);

	time6 = get_time();

#ifdef BENCH_PRINT
	for(i=0; i<dim_cpu.space_elem; i=i+1){
		printf("(%f, [%f, %f, %f])\t", fv_cpu[i].v, fv_cpu[i].x, fv_cpu[i].y, fv_cpu[i].z);							
	}
	printf("\n");	
#endif

	//======================================================================================================================================================150
	//	SYSTEM MEMORY DEALLOCATION
	//======================================================================================================================================================150

	free(rv_cpu);
	free(qv_cpu);
	free(fv_cpu);
	free(box_cpu);

	time7 = get_time();

	//======================================================================================================================================================150
	//	DISPLAY TIMING
	//======================================================================================================================================================150

	// Timing report, currently disabled. A literal percent sign is written as
	// %% so the format strings are valid if this is ever re-enabled.

	// printf("Time spent in different stages of the application:\n");

	// printf("%15.12f s, %15.12f %% : VARIABLES\n",						(float) (time1-time0) / 1000000, (float) (time1-time0) / (float) (time7-time0) * 100);
	// printf("%15.12f s, %15.12f %% : INPUT ARGUMENTS\n", 					(float) (time2-time1) / 1000000, (float) (time2-time1) / (float) (time7-time0) * 100);
	// printf("%15.12f s, %15.12f %% : INPUTS\n",							(float) (time3-time2) / 1000000, (float) (time3-time2) / (float) (time7-time0) * 100);
	// printf("%15.12f s, %15.12f %% : dim_cpu\n", 							(float) (time4-time3) / 1000000, (float) (time4-time3) / (float) (time7-time0) * 100);
	// printf("%15.12f s, %15.12f %% : SYS MEM: ALO\n",						(float) (time5-time4) / 1000000, (float) (time5-time4) / (float) (time7-time0) * 100);

	// printf("%15.12f s, %15.12f %% : KERNEL: COMPUTE\n",					(float) (time6-time5) / 1000000, (float) (time6-time5) / (float) (time7-time0) * 100);

	// printf("%15.12f s, %15.12f %% : SYS MEM: FRE\n", 					(float) (time7-time6) / 1000000, (float) (time7-time6) / (float) (time7-time0) * 100);

	// printf("Total time:\n");
	// printf("%.12f s\n", 												(float) (time7-time0) / 1000000);

	//======================================================================================================================================================150
	//	RETURN
	//======================================================================================================================================================150

	return 0;																					// success

}
void SpatialSubtractiveNormalization::init(std::shared_ptr<TorchData> input) {
  RASSERT(input->type() == TorchDataType::TENSOR_DATA);
  Tensor<float>* in = TO_TENSOR_PTR(input.get());

  RASSERT(in->dim() == 3);

  if (output != nullptr) {
    if (!in->isSameSizeAs(*TO_TENSOR_PTR(output.get()))) {
      // Input dimension has changed!
      cleanup();
    }
  }

  if (output == nullptr) {
    output.reset(new Tensor<float>(in->dim(), in->size()));
    mean_pass1_.reset(new Tensor<float>(in->dim(), in->size()));
    mean_pass2_.reset(new Tensor<float>(in->dim(), in->size()));
  }

  if (mean_coef_ == nullptr) {
    uint32_t mean_coeff_size[2];
    mean_coeff_size[0] = TO_TENSOR_PTR(output.get())->size()[0];
    mean_coeff_size[1] = TO_TENSOR_PTR(output.get())->size()[1];
    mean_coef_.reset(new Tensor<float>(2, mean_coeff_size));

    std::unique_ptr<float[]> mean_coef_cpu(new float[mean_coef_->nelems()]);
    std::unique_ptr<float[]> kernel_cpu(new float[kernel_->nelems()]);
    kernel_->getData(kernel_cpu.get());
    bool onedim_kernel = kernel_->dim() == 1;

    // Filter an image of all 1 values to create the normalization constants
    // See norm_test.lua for proof that this works as well as:
    // https://github.com/andresy/torch/blob/master/extra/nn/SpatialSubtractiveNormalization.lua
    int32_t n_feats = TO_TENSOR_PTR(output.get())->size()[2];
    int32_t height = TO_TENSOR_PTR(output.get())->size()[1];
    int32_t width = TO_TENSOR_PTR(output.get())->size()[0];
    if (onedim_kernel) {
      // 1D case - The filter is seperable, but we'll just do the dumb 2D
      // version since we only do this once on startup.  --> O(n * m)
      uint32_t kernel_size = kernel_->size()[0];
      int32_t filt_rad = (kernel_size - 1) / 2;
      for (int32_t v = 0; v < height; v++) {
        for (int32_t u = 0; u < width; u++) {
          float tmp = 0.0f;
          for (int32_t v_filt = -filt_rad; v_filt <= filt_rad; v_filt++) {
            for (int32_t u_filt = -filt_rad; u_filt <= filt_rad; u_filt++) {
              int32_t u_in = u + u_filt;
              int32_t v_in = v + v_filt;
              if (u_in >= 0 && u_in < width && v_in >= 0 && v_in < height) {
                // Pixel is inside --> We'll effectively clamp zeros elsewhere.
                tmp += (kernel_cpu[v_filt + filt_rad] *
                        kernel_cpu[u_filt + filt_rad]);
              }
            }
          }
          mean_coef_cpu[v * width + u] = tmp / n_feats;
        }
      }
    } else {
      // 2D case
      int32_t kernel_size_u = kernel_->size()[0];
      int32_t kernel_size_v = kernel_->size()[1];
      int32_t filt_rad_u = (kernel_size_u - 1) / 2;
      int32_t filt_rad_v = (kernel_size_v - 1) / 2;
      for (int32_t v = 0; v < height; v++) {
        for (int32_t u = 0; u < width; u++) {
          float tmp = 0.0f;
          for (int32_t v_filt = -filt_rad_v; v_filt <= filt_rad_v; v_filt++) {
            for (int32_t u_filt = -filt_rad_u; u_filt <= filt_rad_u; u_filt++) {
              int32_t u_in = u + u_filt;
              int32_t v_in = v + v_filt;
              if (u_in >= 0 && u_in < width && v_in >= 0 && v_in < height) {
                // Pixel is inside --> We'll effectively clamp zeros elsewhere.
                tmp += kernel_cpu[(v_filt + filt_rad_v) * kernel_size_u +
                                  (u_filt + filt_rad_u)];
              }
            }
          }
          mean_coef_cpu[v * width + u] = tmp / n_feats;
        }
      }
    }
    mean_coef_->setData(mean_coef_cpu.get());
  }
  if (mean_ == nullptr) {
    uint32_t mean_coeff_size[2];
    mean_coeff_size[0] = TO_TENSOR_PTR(output.get())->size()[0];
    mean_coeff_size[1] = TO_TENSOR_PTR(output.get())->size()[1];
    mean_.reset(new Tensor<float>(2, mean_coeff_size));
  }
}