/**
 * Obtain the average of a column (lagged or unlagged) on every rank.
 *
 * The rank reported by interval_mapper() for this candidate evaluates the
 * average through A.average(); the value is then broadcast so that all
 * ranks return the same number.
 *
 * @param[in] candidate The column whose average is requested.
 * @param[in] lag_index Passed through unchanged to A.average(); defaults
 *                      to -1.
 *
 * @return The average as computed by the owning rank.
 */
double get_average (const int candidate, const int lag_index=-1) {
  /* Which rank holds this candidate? */
  const int root = interval_mapper (candidate);

  /* Only the owner fills the value in locally; everyone else gets it below */
  double result;
  if (root == mpi_rank) result = A.average (candidate, lag_index);

  /* Collective call: the owner sends, all other ranks receive */
  MPI_Bcast (&result, 1, MPI_DOUBLE, root, MPI_COMM_WORLD);

  return result;
}
/** * There are bunch of selected time-series for the current regressor (y). * We gather these selected time-series and solve the linear system. * Variables A, y, x, and r are class variables. * * @param[in] candidate The currently added candidate. * * CAVEAT: The assumption is that all the candidates selected prior to the * current candidate are already in selected. */ void add_group (const int candidate) { /* Figure out where in test_A this candidate should start */ double* test_A_start = &(test_A[0]) + (selected.size()*(M*N)); /** * Find out if we own this candidate. If we do, then send the relevant * columns across. Else, receive the relevant columns from the owner. */ const int owner = interval_mapper (candidate); if (mpi_rank == owner) A.materialize_X (candidate, test_A_start); MPI_Bcast (test_A_start, (M*N), MPI_DOUBLE, owner, MPI_COMM_WORLD); }
// Recomputes the per-cascade depth intervals, scales and biases for the
// current frame and then refreshes the crop matrices.
//
// Two paths exist:
//  * cs_support_ set: a chain of compute dispatches fills the interval/scale/
//    bias buffers, which are copied into the CPU-side buffer slot selected by
//    the frame parity. The values consumed on the CPU come from the other
//    slot (i.e. the one written on the previous call), except on the very
//    first frame where the slot just written is read.
//  * otherwise: post-process passes reduce the depth bounds through the mip
//    chain and compute the intervals; scale and bias are then derived on the
//    CPU from the frustum extents of every interval.
//
// camera              supplies near/far planes and the inverse view/projection matrices
// light_view_proj     the light's view-projection matrix
// light_space_border  border added around each cascade's extents
void SDSMCascadedShadowLayer::UpdateCascades(Camera const & camera, float4x4 const & light_view_proj,
	float3 const & light_space_border)
{
	RenderFactory& factory = Context::Instance().RenderFactoryInstance();
	RenderEngine& engine = factory.RenderEngineInstance();

	uint32_t const cascade_count = static_cast<uint32_t>(intervals_.size());

	// Slot written this frame, and slot read back this frame.
	uint32_t const write_slot = frame_index_ & 1;
	uint32_t const read_slot = (0 == frame_index_) ? write_slot : !write_slot;

	if (cs_support_)
	{
		engine.BindFrameBuffer(FrameBufferPtr());

		float blur_limit = 8.0f / 1024;
		float3 scale_limit(blur_limit / light_space_border.x(),
			blur_limit / light_space_border.y(),
			std::numeric_limits<float>::max());

		// One thread group per TILE_DIM x TILE_DIM tile of the depth texture, rounded up.
		int const TILE_DIM = 128;
		int groups_x = (depth_tex_->Width(0) + TILE_DIM - 1) / TILE_DIM;
		int groups_y = (depth_tex_->Height(0) + TILE_DIM - 1) / TILE_DIM;

		// Resource bindings.
		*interval_buff_param_ = interval_buff_;
		*interval_buff_uint_param_ = interval_buff_;
		*interval_buff_read_param_ = interval_buff_;
		*cascade_min_buff_uint_param_ = cascade_min_buff_;
		*cascade_max_buff_uint_param_ = cascade_max_buff_;
		*cascade_min_buff_read_param_ = cascade_min_buff_;
		*cascade_max_buff_read_param_ = cascade_max_buff_;
		*scale_buff_param_ = scale_buff_;
		*bias_buff_param_ = bias_buff_;
		*depth_tex_param_ = depth_tex_;

		// Scalar / vector constants.
		*num_cascades_param_ = static_cast<int32_t>(cascade_count);
		*inv_depth_width_height_param_ = float2(1.0f / depth_tex_->Width(0), 1.0f / depth_tex_->Height(0));
		*near_far_param_ = float2(camera.NearPlane(), camera.FarPlane());

		// Three clip-space corners at depth 1 transformed by the inverse projection.
		float4x4 const & inverse_proj = camera.InverseProjMatrix();
		float3 corner_ul = MathLib::transform_coord(float3(-1, +1, 1), inverse_proj);
		float3 corner_ur = MathLib::transform_coord(float3(+1, +1, 1), inverse_proj);
		float3 corner_ll = MathLib::transform_coord(float3(-1, -1, 1), inverse_proj);
		*upper_left_param_ = corner_ul;
		*xy_dir_param_ = float2(corner_ur.x() - corner_ul.x(), corner_ll.y() - corner_ul.y());

		*view_to_light_view_proj_param_ = camera.InverseViewMatrix() * light_view_proj;
		*light_space_border_param_ = light_space_border;
		*max_cascade_scale_param_ = scale_limit;

		// The dispatch order below is significant; do not reorder.
		engine.Dispatch(*clear_z_bounds_tech_, 1, 1, 1);
		engine.Dispatch(*reduce_z_bounds_from_depth_tech_, groups_x, groups_y, 1);
		engine.Dispatch(*compute_log_cascades_from_z_bounds_tech_, 1, 1, 1);
		engine.Dispatch(*clear_cascade_bounds_tech_, 1, 1, 1);
		engine.Dispatch(*reduce_bounds_from_depth_tech_, groups_x, groups_y, 1);
		engine.Dispatch(*compute_custom_cascades_tech_, 1, 1, 1);

		// Stash this frame's results in the write slot ...
		interval_buff_->CopyToBuffer(*interval_cpu_buffs_[write_slot]);
		scale_buff_->CopyToBuffer(*scale_cpu_buffs_[write_slot]);
		bias_buff_->CopyToBuffer(*bias_cpu_buffs_[write_slot]);

		// ... and consume whatever sits in the read slot.
		GraphicsBuffer::Mapper interval_map(*interval_cpu_buffs_[read_slot], BA_Read_Only);
		GraphicsBuffer::Mapper scale_map(*scale_cpu_buffs_[read_slot], BA_Read_Only);
		GraphicsBuffer::Mapper bias_map(*bias_cpu_buffs_[read_slot], BA_Read_Only);
		float2* interval_data = interval_map.Pointer<float2>();
		float3* scale_data = scale_map.Pointer<float3>();
		float3* bias_data = bias_map.Pointer<float3>();

		for (size_t cascade = 0; cascade < intervals_.size(); ++ cascade)
		{
			intervals_[cascade] = interval_data[cascade];
			scales_[cascade] = scale_data[cascade];
			biases_[cascade] = bias_data[cascade];
		}
	}
	else
	{
		float2 const near_far(camera.NearPlane(), camera.FarPlane());

		reduce_z_bounds_from_depth_pp_->SetParam(1, near_far);
		reduce_z_bounds_from_depth_pp_->Apply();

		// Reduce level (mip - 1) into the scratch texture, then copy the result
		// into level mip of the main texture.
		for (uint32_t mip = 1; mip < depth_deriative_tex_->NumMipMaps(); ++ mip)
		{
			int src_width = depth_deriative_tex_->Width(mip - 1);
			int src_height = depth_deriative_tex_->Height(mip - 1);

			float texel_w = 1.0f / src_width;
			float texel_h = 1.0f / src_height;
			float4 delta_offset(texel_w, texel_h, -texel_w / 2, -texel_h / 2);
			reduce_z_bounds_from_depth_mip_map_pp_->SetParam(0, delta_offset);

			reduce_z_bounds_from_depth_mip_map_pp_->OutputPin(0, depth_deriative_small_tex_, mip - 1);
			reduce_z_bounds_from_depth_mip_map_pp_->Apply();

			int dst_width = depth_deriative_tex_->Width(mip);
			int dst_height = depth_deriative_tex_->Height(mip);
			depth_deriative_small_tex_->CopyToSubTexture2D(*depth_deriative_tex_,
				0, mip, 0, 0, dst_width, dst_height,
				0, mip - 1, 0, 0, dst_width, dst_height);
		}

		compute_log_cascades_from_z_bounds_pp_->SetParam(1, static_cast<int32_t>(cascade_count));
		compute_log_cascades_from_z_bounds_pp_->SetParam(2, near_far);
		compute_log_cascades_from_z_bounds_pp_->Apply();

		// Write slot receives this frame's intervals; the read slot is mapped for the CPU.
		interval_tex_->CopyToSubTexture2D(*interval_cpu_texs_[write_slot],
			0, 0, 0, 0, cascade_count, 1,
			0, 0, 0, 0, cascade_count, 1);

		Texture::Mapper interval_map(*interval_cpu_texs_[read_slot], 0, 0,
			TMA_Read_Only, 0, 0, cascade_count, 1);
		Vector_T<half, 2>* interval_data = interval_map.Pointer<Vector_T<half, 2> >();

		for (size_t cascade = 0; cascade < intervals_.size(); ++ cascade)
		{
			// Intervals are stored as half pairs; widen to float.
			float2 const interval(static_cast<float>(interval_data[cascade].x()),
				static_cast<float>(interval_data[cascade].y()));

			// Extents of this depth slice, intersected with [-1, +1]^3 and grown by the border.
			AABBox extents = CalcFrustumExtents(camera, interval.x(), interval.y(), light_view_proj);
			extents &= AABBox(float3(-1, -1, -1), float3(+1, +1, +1));
			extents.Min() -= light_space_border;
			extents.Max() += light_space_border;

			// Remap x/y from [-1, +1] to [0, 1] with y flipped; the flip inverts the
			// y ordering, hence the swap afterwards.
			extents.Min().x() = +extents.Min().x() * 0.5f + 0.5f;
			extents.Min().y() = -extents.Min().y() * 0.5f + 0.5f;
			extents.Max().x() = +extents.Max().x() * 0.5f + 0.5f;
			extents.Max().y() = -extents.Max().y() * 0.5f + 0.5f;
			std::swap(extents.Min().y(), extents.Max().y());

			float3 const scale = float3(1.0f, 1.0f, 1.0f) / (extents.Max() - extents.Min());
			float3 const bias = -extents.Min() * scale;

			intervals_[cascade] = interval;
			scales_[cascade] = scale;
			biases_[cascade] = bias;
		}
	}

	this->UpdateCropMats();

	++ frame_index_;
}
int main (int argc, char** argv) { /* Initialize MPI */ MPI_Init (&argc, &argv); /* Figure out the rank and size */ MPI_Comm_rank (MPI_COMM_WORLD, &mpi_rank); MPI_Comm_size (MPI_COMM_WORLD, &mpi_size); /* MPI sends argc and argv everywhere --- parse everywhere */ parse_parameters (argc,argv); /** * Now, we read the input matrix, FORCED, and PROHIBIT maps. To do this * we first create a partition of the total space so that we know which * range of KPIs is ours. Input matrix is stored per KPI and the maps * are also ordered according to KPIs although not all the KPIs need to * be present. */ pfunc::space_1D kpi_space = partitioner_t<int>::create (0, int_params[NUM_KPIS_INDEX], mpi_rank, mpi_size); std::pair<int,int> full_kpi_range (0, int_params[NUM_KPIS_INDEX]); std::pair<int,int> my_kpi_range (kpi_space.begin(), kpi_space.end()); std::vector<double> values ((my_kpi_range.second-my_kpi_range.first)* int_params [NUM_INTERVALS_INDEX]); int_vec_map_t prohibit_map; int_vec_map_t forced_map; std::vector<double> kpi_weights (int_params[NUM_KPIS_INDEX], 1.0); read_dense_matrix (chr_params [INPUT_MATRIX_PATH_INDEX], my_kpi_range, values.begin()); if (0!=strcmp ("",chr_params[PROHIBIT_LIST_PATH_INDEX])) { read_map (chr_params [PROHIBIT_LIST_PATH_INDEX], prohibit_map); } if (0!=strcmp ("",chr_params[FORCED_LIST_PATH_INDEX])) { read_map (chr_params [FORCED_LIST_PATH_INDEX], forced_map); } if (0!=strcmp ("",chr_params[FORCED_LIST_PATH_INDEX])) { read_dense_matrix (chr_params [KPI_WEIGHTS_PATH_INDEX], full_kpi_range, kpi_weights.begin()); } if (4<int_params[DEBUG_INDEX]) { print_matrix (values.begin(), int_params[NUM_INTERVALS_INDEX], my_kpi_range.second- my_kpi_range.first, "A"); print_map (prohibit_map.begin(), prohibit_map.end(), "PROHIBIT"); print_map (forced_map.begin(), forced_map.end(), "FORCED"); } #if USE_PFUNC /** * Define the PFunc instance. Note that we HAVE TO USE PFUNC::USE_DEFAULT as * the type of the FUNCTOR so that we can use pfunc::parallel_reduce. 
*/ typedef pfunc::generator <pfunc::cilkS, /* Cilk-style scheduling */ pfunc::use_default, /* No task priorities needed */ pfunc::use_default /* any function type*/> generator_type; typedef generator_type::attribute attribute; typedef generator_type::task task; typedef generator_type::taskmgr taskmgr; /* Create an instance of PFunc if that is what is needed */ taskmgr* global_taskmgr; const int n_queues = int_params [NUM_THREADS_INDEX]; unsigned int* thds_per_q_arr = new unsigned int [n_queues]; for (int i=0; i<n_queues; ++i) thds_per_q_arr [i] = ONE_STEP; global_taskmgr = new taskmgr (n_queues, thds_per_q_arr); delete [] thds_per_q_arr; /* Create a task handle for all the tasks that we will use */ task root_task; attribute root_attribute (false /*nested*/, false /*grouped*/); #endif /*************************************************************************/ /* Set the base case size for all the tasks */ pfunc::space_1D::base_case_size = int_params [TASK_SIZE_INDEX]; /*************************************************************************/ /* Create a range mapper that knows about the ownership of each column */ std::vector<int> column_intervals (mpi_size+1); partitioner_t<int>::intervals (0, int_params[NUM_KPIS_INDEX], mpi_size, column_intervals.begin()); typedef interval_mapper_t<std::vector<int> > interval_mapper_t; interval_mapper_t interval_mapper (column_intervals); /* Populate the data frame with the given input matrix */ data_frame_t<double> data_frame (my_kpi_range.first, int_params [NUM_INTERVALS_INDEX], my_kpi_range.second-my_kpi_range.first, int_params [LAG_INDEX]); data_frame.set (values.begin(), values.end(), true); /* Compute the mean and the length of each of the materialized X columns */ double normalization_time = micro_time (); typedef normalizer_t <data_frame_t<double>, identity_mapper_t<int> > my_normalizer_t; identity_mapper_t<int> identity_mapper; my_normalizer_t normalizer (&data_frame, &identity_mapper); #if USE_PFUNC 
pfunc::parallel_reduce<generator_type, my_normalizer_t, pfunc::space_1D> normalize (kpi_space, normalizer, *global_taskmgr); pfunc::spawn (*global_taskmgr, root_task, root_attribute, normalize); pfunc::wait (*global_taskmgr, root_task); #else normalizer (kpi_space); #endif normalization_time = micro_time() - normalization_time; /*************************************************************************/ /* Rule out all the candidates that have no variation in their columns */ double selection_time = micro_time (); typedef selector_t <data_frame_t<double>, int_set_t, identity_mapper_t<int> > my_selector_t; my_selector_t selector (&data_frame, &identity_mapper); #if USE_PFUNC pfunc::parallel_reduce<generator_type, my_selector_t, pfunc::space_1D> select (kpi_space, selector, *global_taskmgr); pfunc::spawn (*global_taskmgr, root_task, root_attribute, select); pfunc::wait (*global_taskmgr, root_task); #else selector (kpi_space); #endif selection_time = micro_time() - selection_time; /*************************************************************************/ /* Factorize all the columns so that Xg'Xg is formed and ready to go */ double factorization_time = micro_time (); typedef factorizer_t <data_frame_t<double>, std::vector<double>, identity_mapper_t<int>, SolverType> my_factorizer_t; my_factorizer_t factorizer (&data_frame, &identity_mapper, int_params[NUM_INTERVALS_INDEX]- int_params[LAG_INDEX], int_params[LAG_INDEX], dbl_params[LAMBDA_RIDGE_INDEX]); #if USE_PFUNC pfunc::parallel_reduce<generator_type, my_factorizer_t, pfunc::space_1D> factorize (kpi_space, factorizer, *global_taskmgr); pfunc::spawn (*global_taskmgr, root_task, root_attribute, factorize); pfunc::wait (*global_taskmgr, root_task); #else factorizer (kpi_space); #endif factorization_time = micro_time() - factorization_time; /*************************************************************************/ double total_time = 0.0; random_filter_t<int> filter (int_params[RAND_SEED_INDEX], 
dbl_params[SAMPLE_RATIO_INDEX]); /* For each KPI, build model and output it one by one */ int num_kpis_processed = 0; for (int kpi=0; kpi<int_params[NUM_KPIS_INDEX]; ++kpi) { /** * We need to figure out if this is a useless kpi, in which case, we * will not bother with trying to form a model for this kpi. All we * need to do is a BROADCAST from from the OWNER of this particular kpi. */ int my_vote = 0; /* process */ int result; if (false==filter(kpi) || (selector.get_list().end()!=selector.get_list().find(kpi)))my_vote=1; MPI_Allreduce (&my_vote, &result, 1, MPI_INT, MPI_MAX, /*If there is a single 1 --- we all ranks get 1*/ MPI_COMM_WORLD); if (1 == result) continue; /* we are processing */ ++num_kpis_processed; const int num_rows = (int_params[NUM_INTERVALS_INDEX]- int_params[LAG_INDEX]); /* Populate 'y' */ std::vector<double> y (num_rows); const int owner = interval_mapper (kpi); if (mpi_rank == owner) data_frame.materialize_Y (kpi, y.begin()); MPI_Bcast (&(y[0]), num_rows, MPI_DOUBLE, owner, MPI_COMM_WORLD); /* * Create space for 'beta'. As we are modeling a normalized and centered X * with normalized 'Y', we do not have to worry about the intercept --- we * simply need enough space for the coefficients --- (M-L). 
The length of * each beta is at most MAX_ITERS * LAG */ std::vector<double> beta (int_params[MAX_ITERS_INDEX] * int_params[LAG_INDEX]); /* Instantiate the modeler */ typedef std::less<double> compare_t; typedef modeler_t<data_frame_t<double>, /* type for the data_frame */ std::vector<double>, /* type for Y and BETA */ std::vector<int>, /*type for storing KPI predictors*/ int_set_t, /* type for FORCED and PROHIBIT */ SolverType, /* type for the solver */ stopper_t, /* stopping functor */ compare_t, /* comparison operator */ interval_mapper_t, /* determine ownership */ my_factorizer_t /* type of factorizer */ #if USE_PFUNC , generator_type /* the generator type */ #endif > my_modeler_t; const double stop_factor = (STOP_ON_OBJ_GAIN==int_params[STOPPING_CRITERIA_INDEX]) ? dbl_params[MIN_OBJ_GAIN_INDEX]:dbl_params[MIN_BIC_GAIN_INDEX]; const stopper_t stopper (stop_factor, int_params[STOPPING_CRITERIA_INDEX]); /* Create a map of the prohibited regressors for this KPI */ int_set_t prohibit_set; if (prohibit_map.end() != prohibit_map.find(kpi)) { prohibit_set.insert ((prohibit_map[kpi]).begin(), (prohibit_map[kpi]).end()); } /* Insert the candidates that we don't want screened */ prohibit_set.insert (selector.get_list().begin(), selector.get_list().end()); /* Create a map of the forced regressors for this KPI */ int_set_t forced_set; if (forced_map.end() != forced_map.find(kpi)) { forced_set.insert ((forced_map[kpi]).begin(), (forced_map[kpi]).end()); } /* Create an instance of the modeler */ std::vector<int> selected; double variance; double intercept; my_modeler_t my_modeler (data_frame, /* data frame */ y, /* regressor */ beta, /* the output */ selected, /* the selected KPIs in order */ prohibit_set,/* prohibited regressors */ forced_set, /* forced regressors */ kpi_weights, /* weights to use for each kpi */ variance, /* variance */ intercept, /* intercept */ kpi, /* target */ stopper, /* stopping criteria */ interval_mapper, /* determine ownership */ factorizer, /* 
factorizer for Xg'Xg */ dbl_params[LAMBDA_RIDGE_INDEX], /*ridge penalty*/ num_rows, /* num rows */ int_params[LAG_INDEX], /* num columns */ int_params[MAX_ITERS_INDEX], int_params[DEBUG_INDEX] #if USE_PFUNC ,global_taskmgr /* task manager for pfunc */ #endif ); /* Let the model compute */ double time = micro_time (); my_modeler (); time = micro_time () - time; total_time += time; /* Print out the coefficients if asked for */ if (ROOT==mpi_rank && 1<int_params[DEBUG_INDEX]) { printf ("Model for KPI %d (Variance=%lf, Intercept=%lf)\n", kpi, variance, intercept); for (size_t i=0;i<selected.size();++i) { printf("%d (",selected[i]); for (int j=0; j<int_params[LAG_INDEX]; ++j) { printf ("%lf", beta[i*int_params[LAG_INDEX]+j]); if (j!=(int_params[LAG_INDEX]-1)) printf(","); } printf(")\n"); } } /* Print out the coefficients to file if asked for */ if (ROOT==mpi_rank && 0<int_params[WRITE_FILES_INDEX]) { const std::string base_dir = chr_params[OUTPUT_FILE_PATH_INDEX]; const std::string par_path = base_dir + "/parents.txt"; const std::string coeffs_path = base_dir + "/coeffs.txt"; const std::string var_path = base_dir + "/variance.txt"; const std::string int_path = base_dir + "/intercept.txt"; std::ofstream par_file (par_path.c_str(), std::ios_base::app); std::ofstream coeffs_file (coeffs_path.c_str(), std::ios_base::app); std::ofstream var_file (var_path.c_str(), std::ios_base::app); std::ofstream int_file (int_path.c_str(), std::ios_base::app); par_file << kpi << ":"; coeffs_file << kpi << ":"; var_file << kpi << ":"; int_file << kpi << ":"; for (size_t i=0;i<selected.size();++i) { par_file << selected[i] << " "; for (int j=0; j<int_params[LAG_INDEX]; ++j) coeffs_file << beta[i*int_params[LAG_INDEX]+j] << " "; } var_file << variance; int_file << intercept; par_file << "\n"; coeffs_file << "\n"; var_file << "\n"; int_file << "\n"; par_file.close(); coeffs_file.close(); var_file.close(); int_file.close(); } } if (ROOT==mpi_rank) printf ("Built %d models in %lf (secs) at 
rate of %lf (per sec)\n", num_kpis_processed, total_time, total_time/num_kpis_processed); #if USE_PFUNC delete global_taskmgr; #endif /* Finalize MPI */ MPI_Finalize (); return 0; }