/** * Function to read the input shape file. */ unsigned int NumericFormFactor::read_shapes_file_dat(const char* filename, real_vec_t &shape_def) { std::ifstream f(filename); if(!f.is_open()) { std::cout << "Cannot open file " << filename << std::endl; return 1; } // if real_t s = 0.0, cx = 0.0, cy = 0.0, cz = 0.0, nx = 0.0, ny = 0.0, nz = 0.0; while(true) { f >> s; if(f.eof() || !f.good()) break; f >> nx; f >> ny; f >> nz; f >> cx; f >> cy; f >> cz; shape_def.push_back(s); shape_def.push_back(nx); shape_def.push_back(ny); shape_def.push_back(nz); shape_def.push_back(cx); shape_def.push_back(cy); shape_def.push_back(cz); } // while f.close(); return shape_def.size() / 7; } // NumericFormFactor::read_shapes_file_dat()
/** * Function to read the shape definition input file in HDF5 format. */ unsigned int NumericFormFactor::read_shapes_file(const char* filename, // #ifndef __SSE3__ real_vec_t &shape_def // #else // #ifdef USE_GPU // real_vec_t &shape_def // #else // real_t* &shape_def // #endif // #endif ) { unsigned int num_triangles = 0; double* temp_shape_def = NULL; // TODO: shape definition is already in HigInput ... // utilize ... ShapeFileType type = get_shapes_file_format(filename); if(type == shape_file_data) { RawShapeReader temp(filename, temp_shape_def, num_triangles); } else if(type == shape_file_object) { ObjectShapeReader temp(filename, temp_shape_def, num_triangles); } else if(type == shape_file_hdf5) { #ifdef USE_PARALLEL_HDF5 h5_shape_reader(filename, &temp_shape_def, &num_triangles); #else std::cerr << "error: use of parallel hdf5 format has not been enabled in your installation. " << "Please reinstall with the support enabled." << std::endl; return false; #endif } else if(type == shape_file_null) { std::cerr << "error: shape definition file extension is null" << std::endl; return 0; } else if(type == shape_file_error) { std::cerr << "error: shape definition file format unknown" << std::endl; return 0; } else { std::cerr << "error: shape definition file format unknown" << std::endl; return 0; } // if-else #ifdef FF_NUM_GPU #ifndef KERNEL2 for(unsigned int i = 0; i < num_triangles * 7; ++ i) shape_def.push_back((real_t)temp_shape_def[i]); #else // KERNEL2 for(unsigned int i = 0, j = 0; i < num_triangles * T_PROP_SIZE_; ++ i) { if((i + 1) % T_PROP_SIZE_ == 0) shape_def.push_back((real_t) 0.0); // padding else { shape_def.push_back((real_t)temp_shape_def[j]); ++ j; } } // for #endif // KERNEL2 //#elif defined USE_MIC // using MIC // for(unsigned int i = 0; i < num_triangles * 7; ++ i) // shape_def.push_back((real_t)temp_shape_def[i]); #else // using CPU or MIC // #ifndef __SSE3__ for(unsigned int i = 0, j = 0; i < num_triangles * CPU_T_PROP_SIZE_; ++ i) { if((i + 1) % CPU_T_PROP_SIZE_ == 0) shape_def.push_back((real_t) 0.0); // padding else { shape_def.push_back((real_t)temp_shape_def[j]); ++ j; } } // for /* #else // using SSE3, so store data differently: FOR CPU AND MIC (vectorization) #ifndef USE_MIC // generic cpu version with SSE3 or AVX #ifdef INTEL_SB_AVX // CPU version with AVX // group all 's', 'nx', 'ny', 'nz', 'x', 'y', 'z' together // for alignment at 32 bytes, make sure each of the 7 groups is padded // compute amount of padding // 32 bytes = 8 floats or 4 doubles. FIXME: assuming float only for now ... unsigned int padding = (8 - (num_triangles & 7)) & 7; unsigned int shape_size = (num_triangles + padding) * 7; shape_def = (real_t*) _mm_malloc(shape_size * sizeof(real_t), 32); if(shape_def == NULL) { std::cerr << "error: failed to allocate aligned memory for shape_def" << std::endl; return 0; } // if memset(shape_def, 0, shape_size * sizeof(real_t)); for(int i = 0; i < num_triangles; ++ i) { for(int j = 0; j < 7; ++ j) { shape_def[(num_triangles + padding) * j + i] = temp_shape_def[7 * i + j]; } // for } // for #else // CPU version with SSE3 // group all 's', 'nx', 'ny', 'nz', 'x', 'y', 'z' together // for alignment at 16 bytes, make sure each of the 7 groups is padded // compute amount of padding // 16 bytes = 4 floats or 2 doubles. FIXME: assuming float only for now ... unsigned int padding = (4 - (num_triangles & 3)) & 3; unsigned int shape_size = (num_triangles + padding) * 7; shape_def = (real_t*) _mm_malloc(shape_size * sizeof(real_t), 16); if(shape_def == NULL) { std::cerr << "error: failed to allocate aligned memory for shape_def" << std::endl; return 0; } // if memset(shape_def, 0, shape_size * sizeof(real_t)); for(int i = 0; i < num_triangles; ++ i) { for(int j = 0; j < 7; ++ j) { shape_def[(num_triangles + padding) * j + i] = temp_shape_def[7 * i + j]; } // for } // for #endif #else // optimized for MIC only: AVX2, 64 byte alignments (512-bit vector registers) // FIXME: float only for now: 16 floats in one vector! unsigned int padding = (16 - (num_triangles & 15)) & 15; unsigned int shape_size = (num_triangles + padding) * 7; shape_def = (real_t*) _mm_malloc(shape_size * sizeof(real_t), 64); if(shape_def == NULL) { std::cerr << "error: failed to allocate aligned memory for shape_def" << std::endl; return 0; } // if memset(shape_def, 0, shape_size * sizeof(real_t)); for(int i = 0; i < num_triangles; ++ i) { for(int j = 0; j < 7; ++ j) { shape_def[(num_triangles + padding) * j + i] = temp_shape_def[7 * i + j]; } // for } // for // TODO: try grouping 16 triangles together ... // that will give completely sequential memory access! #endif #endif // __SSE3__ */ #endif // FF_NUM_GPU return num_triangles; } // NumericFormFactor::read_shapes_file()