std::unique_ptr<TorchStage> SpatialDivisiveNormalization::loadFromFile( std::ifstream& file) { // This whole thing is a little wasteful. I copy to GPU here, and then // I copy it back down in the constructor anyway... But it's good enough // for now. int32_t kernel_size_2, kernel_size_1; // kernel_size_1 is the inner dim file.read((char*)(&kernel_size_1), sizeof(kernel_size_1)); file.read((char*)(&kernel_size_2), sizeof(kernel_size_2)); std::shared_ptr<Tensor<float>> kernel; if (kernel_size_2 > 1) { // The kernel is 2D uint32_t dim = 2; uint32_t size[2] = {static_cast<uint32_t>(kernel_size_1), static_cast<uint32_t>(kernel_size_2)}; kernel.reset(new Tensor<float>(dim, size)); } else { uint32_t dim = 1; uint32_t size[1] = {static_cast<uint32_t>(kernel_size_1)}; kernel.reset(new Tensor<float>(dim, size)); } std::unique_ptr<float[]> kernel_cpu(new float[kernel->nelems()]); file.read((char*)(kernel_cpu.get()), kernel->nelems() * sizeof(kernel_cpu[0])); kernel->setData(kernel_cpu.get()); float threshold; file.read((char*)(&threshold), sizeof(threshold)); return std::unique_ptr<TorchStage>( new SpatialDivisiveNormalization(kernel, threshold)); }
int main( int argc, char *argv []) { //======================================================================================================================================================150 // CPU/MCPU VARIABLES //======================================================================================================================================================150 // timer long long time0; time0 = get_time(); // timer long long time1; long long time2; long long time3; long long time4; long long time5; long long time6; long long time7; // counters int i, j, k, l, m, n; // system memory par_str par_cpu; dim_str dim_cpu; box_str* box_cpu; FOUR_VECTOR* rv_cpu; fp* qv_cpu; FOUR_VECTOR* fv_cpu; int nh; time1 = get_time(); //======================================================================================================================================================150 // CHECK INPUT ARGUMENTS //======================================================================================================================================================150 // assing default values dim_cpu.cores_arg = 1; dim_cpu.boxes1d_arg = 1; // go through arguments for(dim_cpu.cur_arg=1; dim_cpu.cur_arg<argc; dim_cpu.cur_arg++){ // check if -cores if(strcmp(argv[dim_cpu.cur_arg], "-cores")==0){ // check if value provided if(argc>=dim_cpu.cur_arg+1){ // check if value is a number if(isInteger(argv[dim_cpu.cur_arg+1])==1){ dim_cpu.cores_arg = atoi(argv[dim_cpu.cur_arg+1]); if(dim_cpu.cores_arg<0){ printf("ERROR: Wrong value to -cores parameter, cannot be <=0\n"); return 0; } dim_cpu.cur_arg = dim_cpu.cur_arg+1; } // value is not a number else{ printf("ERROR: Value to -cores parameter in not a number\n"); return 0; } } // value not provided else{ printf("ERROR: Missing value to -cores parameter\n"); return 0; } } // check if -boxes1d else if(strcmp(argv[dim_cpu.cur_arg], "-boxes1d")==0){ // check if value provided if(argc>=dim_cpu.cur_arg+1){ // check if value is a number if(isInteger(argv[dim_cpu.cur_arg+1])==1){ dim_cpu.boxes1d_arg = atoi(argv[dim_cpu.cur_arg+1]); if(dim_cpu.boxes1d_arg<0){ printf("ERROR: Wrong value to -boxes1d parameter, cannot be <=0\n"); return 0; } dim_cpu.cur_arg = dim_cpu.cur_arg+1; } // value is not a number else{ printf("ERROR: Value to -boxes1d parameter in not a number\n"); return 0; } } // value not provided else{ printf("ERROR: Missing value to -boxes1d parameter\n"); return 0; } } // unknown else{ printf("ERROR: Unknown parameter\n"); return 0; } } // Print configuration printf("Configuration used: cores = %d, boxes1d = %d\n", dim_cpu.cores_arg, dim_cpu.boxes1d_arg); time2 = get_time(); //======================================================================================================================================================150 // INPUTS //======================================================================================================================================================150 par_cpu.alpha = 0.5; time3 = get_time(); //======================================================================================================================================================150 // DIMENSIONS //======================================================================================================================================================150 // total number of boxes dim_cpu.number_boxes = dim_cpu.boxes1d_arg * dim_cpu.boxes1d_arg * dim_cpu.boxes1d_arg; // how many particles space has in each direction dim_cpu.space_elem = dim_cpu.number_boxes * NUMBER_PAR_PER_BOX; dim_cpu.space_mem = dim_cpu.space_elem * sizeof(FOUR_VECTOR); dim_cpu.space_mem2 = dim_cpu.space_elem * sizeof(fp); // box array dim_cpu.box_mem = dim_cpu.number_boxes * sizeof(box_str); time4 = get_time(); //======================================================================================================================================================150 // SYSTEM MEMORY //======================================================================================================================================================150 //====================================================================================================100 // BOX //====================================================================================================100 // allocate boxes box_cpu = (box_str*)malloc(dim_cpu.box_mem); // initialize number of home boxes nh = 0; // home boxes in z direction for(i=0; i<dim_cpu.boxes1d_arg; i++){ // home boxes in y direction for(j=0; j<dim_cpu.boxes1d_arg; j++){ // home boxes in x direction for(k=0; k<dim_cpu.boxes1d_arg; k++){ // current home box box_cpu[nh].x = k; box_cpu[nh].y = j; box_cpu[nh].z = i; box_cpu[nh].number = nh; box_cpu[nh].offset = nh * NUMBER_PAR_PER_BOX; // initialize number of neighbor boxes box_cpu[nh].nn = 0; // neighbor boxes in z direction for(l=-1; l<2; l++){ // neighbor boxes in y direction for(m=-1; m<2; m++){ // neighbor boxes in x direction for(n=-1; n<2; n++){ // check if (this neighbor exists) and (it is not the same as home box) if( (((i+l)>=0 && (j+m)>=0 && (k+n)>=0)==true && ((i+l)<dim_cpu.boxes1d_arg && (j+m)<dim_cpu.boxes1d_arg && (k+n)<dim_cpu.boxes1d_arg)==true) && (l==0 && m==0 && n==0)==false ){ // current neighbor box box_cpu[nh].nei[box_cpu[nh].nn].x = (k+n); box_cpu[nh].nei[box_cpu[nh].nn].y = (j+m); box_cpu[nh].nei[box_cpu[nh].nn].z = (i+l); box_cpu[nh].nei[box_cpu[nh].nn].number = (box_cpu[nh].nei[box_cpu[nh].nn].z * dim_cpu.boxes1d_arg * dim_cpu.boxes1d_arg) + (box_cpu[nh].nei[box_cpu[nh].nn].y * dim_cpu.boxes1d_arg) + box_cpu[nh].nei[box_cpu[nh].nn].x; box_cpu[nh].nei[box_cpu[nh].nn].offset = box_cpu[nh].nei[box_cpu[nh].nn].number * NUMBER_PAR_PER_BOX; // increment neighbor box box_cpu[nh].nn = box_cpu[nh].nn + 1; } } // neighbor boxes in x direction } // neighbor boxes in y direction } // neighbor boxes in z direction // increment home box nh = nh + 1; } // home boxes in x direction } // home boxes in y direction } // home boxes in z direction //====================================================================================================100 // PARAMETERS, DISTANCE, CHARGE AND FORCE //====================================================================================================100 // random generator seed set to random value - time in this case srand(SEED); // input (distances) rv_cpu = (FOUR_VECTOR*)malloc(dim_cpu.space_mem); for(i=0; i<dim_cpu.space_elem; i=i+1){ rv_cpu[i].v = (rand()%10 + 1) / 10.0; // get a number in the range 0.1 - 1.0 rv_cpu[i].x = (rand()%10 + 1) / 10.0; // get a number in the range 0.1 - 1.0 rv_cpu[i].y = (rand()%10 + 1) / 10.0; // get a number in the range 0.1 - 1.0 rv_cpu[i].z = (rand()%10 + 1) / 10.0; // get a number in the range 0.1 - 1.0 } // input (charge) qv_cpu = (fp*)malloc(dim_cpu.space_mem2); for(i=0; i<dim_cpu.space_elem; i=i+1){ qv_cpu[i] = (rand()%10 + 1) / 10.0; // get a number in the range 0.1 - 1.0 } // output (forces) fv_cpu = (FOUR_VECTOR*)malloc(dim_cpu.space_mem); for(i=0; i<dim_cpu.space_elem; i=i+1){ fv_cpu[i].v = 0; // set to 0, because kernels keeps adding to initial value fv_cpu[i].x = 0; // set to 0, because kernels keeps adding to initial value fv_cpu[i].y = 0; // set to 0, because kernels keeps adding to initial value fv_cpu[i].z = 0; // set to 0, because kernels keeps adding to initial value } time5 = get_time(); //======================================================================================================================================================150 // KERNEL //======================================================================================================================================================150 //====================================================================================================100 // CPU/MCPU //====================================================================================================100 kernel_cpu( par_cpu, dim_cpu, box_cpu, rv_cpu, qv_cpu, fv_cpu); time6 = get_time(); #ifdef BENCH_PRINT for(i=0; i<dim_cpu.space_elem; i=i+1){ printf("(%f, [%f, %f, %f])\t", fv_cpu[i].v, fv_cpu[i].x, fv_cpu[i].y, fv_cpu[i].z); } printf("\n"); #endif //======================================================================================================================================================150 // SYSTEM MEMORY DEALLOCATION //======================================================================================================================================================150 free(rv_cpu); free(qv_cpu); free(fv_cpu); free(box_cpu); time7 = get_time(); //======================================================================================================================================================150 // DISPLAY TIMING //======================================================================================================================================================150 // printf("Time spent in different stages of the application:\n"); // printf("%15.12f s, %15.12f % : VARIABLES\n", (float) (time1-time0) / 1000000, (float) (time1-time0) / (float) (time7-time0) * 100); // printf("%15.12f s, %15.12f % : INPUT ARGUMENTS\n", (float) (time2-time1) / 1000000, (float) (time2-time1) / (float) (time7-time0) * 100); // printf("%15.12f s, %15.12f % : INPUTS\n", (float) (time3-time2) / 1000000, (float) (time3-time2) / (float) (time7-time0) * 100); // printf("%15.12f s, %15.12f % : dim_cpu\n", (float) (time4-time3) / 1000000, (float) (time4-time3) / (float) (time7-time0) * 100); // printf("%15.12f s, %15.12f % : SYS MEM: ALO\n", (float) (time5-time4) / 1000000, (float) (time5-time4) / (float) (time7-time0) * 100); // printf("%15.12f s, %15.12f % : KERNEL: COMPUTE\n", (float) (time6-time5) / 1000000, (float) (time6-time5) / (float) (time7-time0) * 100); // printf("%15.12f s, %15.12f % : SYS MEM: FRE\n", (float) (time7-time6) / 1000000, (float) (time7-time6) / (float) (time7-time0) * 100); // printf("Total time:\n"); // printf("%.12f s\n", (float) (time7-time0) / 1000000); //======================================================================================================================================================150 // RETURN //======================================================================================================================================================150 return 0.0; // always returns 0.0 }
void SpatialSubtractiveNormalization::init(std::shared_ptr<TorchData> input) { RASSERT(input->type() == TorchDataType::TENSOR_DATA); Tensor<float>* in = TO_TENSOR_PTR(input.get()); RASSERT(in->dim() == 3); if (output != nullptr) { if (!in->isSameSizeAs(*TO_TENSOR_PTR(output.get()))) { // Input dimension has changed! cleanup(); } } if (output == nullptr) { output.reset(new Tensor<float>(in->dim(), in->size())); mean_pass1_.reset(new Tensor<float>(in->dim(), in->size())); mean_pass2_.reset(new Tensor<float>(in->dim(), in->size())); } if (mean_coef_ == nullptr) { uint32_t mean_coeff_size[2]; mean_coeff_size[0] = TO_TENSOR_PTR(output.get())->size()[0]; mean_coeff_size[1] = TO_TENSOR_PTR(output.get())->size()[1]; mean_coef_.reset(new Tensor<float>(2, mean_coeff_size)); std::unique_ptr<float[]> mean_coef_cpu(new float[mean_coef_->nelems()]); std::unique_ptr<float[]> kernel_cpu(new float[kernel_->nelems()]); kernel_->getData(kernel_cpu.get()); bool onedim_kernel = kernel_->dim() == 1; // Filter an image of all 1 values to create the normalization constants // See norm_test.lua for proof that this works as well as: // https://github.com/andresy/torch/blob/master/extra/nn/SpatialSubtractiveNormalization.lua int32_t n_feats = TO_TENSOR_PTR(output.get())->size()[2]; int32_t height = TO_TENSOR_PTR(output.get())->size()[1]; int32_t width = TO_TENSOR_PTR(output.get())->size()[0]; if (onedim_kernel) { // 1D case - The filter is seperable, but we'll just do the dumb 2D // version since we only do this once on startup. --> O(n * m) uint32_t kernel_size = kernel_->size()[0]; int32_t filt_rad = (kernel_size - 1) / 2; for (int32_t v = 0; v < height; v++) { for (int32_t u = 0; u < width; u++) { float tmp = 0.0f; for (int32_t v_filt = -filt_rad; v_filt <= filt_rad; v_filt++) { for (int32_t u_filt = -filt_rad; u_filt <= filt_rad; u_filt++) { int32_t u_in = u + u_filt; int32_t v_in = v + v_filt; if (u_in >= 0 && u_in < width && v_in >= 0 && v_in < height) { // Pixel is inside --> We'll effectively clamp zeros elsewhere. tmp += (kernel_cpu[v_filt + filt_rad] * kernel_cpu[u_filt + filt_rad]); } } } mean_coef_cpu[v * width + u] = tmp / n_feats; } } } else { // 2D case int32_t kernel_size_u = kernel_->size()[0]; int32_t kernel_size_v = kernel_->size()[1]; int32_t filt_rad_u = (kernel_size_u - 1) / 2; int32_t filt_rad_v = (kernel_size_v - 1) / 2; for (int32_t v = 0; v < height; v++) { for (int32_t u = 0; u < width; u++) { float tmp = 0.0f; for (int32_t v_filt = -filt_rad_v; v_filt <= filt_rad_v; v_filt++) { for (int32_t u_filt = -filt_rad_u; u_filt <= filt_rad_u; u_filt++) { int32_t u_in = u + u_filt; int32_t v_in = v + v_filt; if (u_in >= 0 && u_in < width && v_in >= 0 && v_in < height) { // Pixel is inside --> We'll effectively clamp zeros elsewhere. tmp += kernel_cpu[(v_filt + filt_rad_v) * kernel_size_u + (u_filt + filt_rad_u)]; } } } mean_coef_cpu[v * width + u] = tmp / n_feats; } } } mean_coef_->setData(mean_coef_cpu.get()); } if (mean_ == nullptr) { uint32_t mean_coeff_size[2]; mean_coeff_size[0] = TO_TENSOR_PTR(output.get())->size()[0]; mean_coeff_size[1] = TO_TENSOR_PTR(output.get())->size()[1]; mean_.reset(new Tensor<float>(2, mean_coeff_size)); } }