/// \brief Fill the padding cells around the data area by repeating the outermost data cells
///
/// The data area spans indices [padding_offset, num + padding_offset - 1] in each dimension.
/// First, for every row that util::rows() reports, the value in the last data column is copied
/// into the padding_offset cells to its right and the value in the first data column into the
/// padding_offset cells to its left. Then the same is done for every column that util::cols()
/// reports, using the last and first data rows.
///
/// NOTE(review): presumably util::rows()/util::cols() cover the padded extent, so that the second
/// pass also fills the corners — confirm against util.
void fill_padding_repeat() {
  const auto pad = ValueLayer::padding_offset;
  const auto bottom_row = this->num_rows() + pad - 1;
  const auto right_col = this->num_cols() + pad - 1;
  auto& grid = this->layer_array;

  // Horizontal pass: extend each row to the left and to the right
  const auto row_count = util::rows(grid);
  for (unsigned long r = 0; r < row_count; ++r) {
    auto const& right_value = grid[r][right_col];
    auto const& left_value = grid[r][pad];
    for (int step = 1; step < pad + 1; ++step) {
      grid[r][right_col + step] = right_value;
      grid[r][pad - step] = left_value;
    }
  }

  // Vertical pass: extend each column upwards and downwards
  const auto col_count = util::cols(grid);
  for (unsigned long c = 0; c < col_count; ++c) {
    auto const& bottom_value = grid[bottom_row][c];
    auto const& top_value = grid[pad][c];
    for (int step = 1; step < pad + 1; ++step) {
      grid[bottom_row + step][c] = bottom_value;
      grid[pad - step][c] = top_value;
    }
  }
}
/// \brief TODOCUMENT /// /// \todo Could this be generalised (perhaps with a non-member function /// to make it convenient for the particulars of aligned_pair_score_value_list) ? void score_classn_value_results_set::add_aligned_pair_score_value_list(const aligned_pair_score_value_list &arg_aligned_pair_score_value_list, ///< TODOCUMENT const bool &arg_condition_is_positive, ///< TODOCUMENT const string &arg_instance_label ///< TODOCUMENT ) { const size_t num_score_values = arg_aligned_pair_score_value_list.size(); if ( empty() ) { for (const size_t &score_value_ctr : irange( 0_z, num_score_values ) ) { const auto &score = arg_aligned_pair_score_value_list.get_aligned_pair_score_of_index( score_value_ctr ); const string &name = score.human_friendly_short_name(); const bool &higher_is_better = score.higher_is_better(); score_classn_value_lists.push_back( make_score_classn_value_list( {}, higher_is_better, name ) ); } sort_score_classn_value_lists(); } else { const auto new_names = get_all_names( arg_aligned_pair_score_value_list ); const str_set new_names_set( common::cbegin( new_names ), common::cend( new_names ) ); const size_t num_series = size(); const auto existing_names = transform_build<str_vec>( irange( 0_z, num_series ), [&] (const size_t &x) { return get_name_of_index( *this, x ); } ); if ( ! 
equal( existing_names, new_names_set ) ) { // cerr << "existing_names : " << join( existing_names, ", " ) << endl; // cerr << "new_names_set : " << join( new_names_set, ", " ) << endl; BOOST_THROW_EXCEPTION(invalid_argument_exception("Cannot add aligned_pair_score_value_list because the series names don't match the existing ones")); } } for (const size_t &score_value_ctr : irange( 0_z, num_score_values ) ) { const auto &value = arg_aligned_pair_score_value_list.get_value_of_index ( score_value_ctr ); const auto &score = arg_aligned_pair_score_value_list.get_aligned_pair_score_of_index( score_value_ctr ); const string &name = score.human_friendly_short_name(); const bool &higher_is_better = score.higher_is_better(); score_classn_value_list &the_list = get_score_classn_value_list_of_name( name ); if ( get_higher_is_better( the_list ) != higher_is_better ) { BOOST_THROW_EXCEPTION(invalid_argument_exception( "Cannot add aligned_pair_score_value_list to score_classn_value_results_set because they conflict regarding the higher_is_better value for score " + name )); } the_list.add_score_classn_value( score_classn_value( value, arg_condition_is_positive, arg_instance_label ) ); } }
/// \brief TODOCUMENT size_vec common_residue_select_best_score_percent_policy::do_select_common_residues_with_scores(const doub_doub_pair_vec &arg_scores ///< TODOCUMENT ) const { // Grab the a list of the smaller value from each pair of scores const auto min_scores = transform_build<doub_vec>( arg_scores, [] (const doub_doub_pair &x) { return min( x.first, x.second ); } ); // Build a stable_sorted list of indices in descending order of the score to which each corresponds const auto score_sorted_indices = stable_sort_build<size_vec>( irange( 0_z, min_scores.size() ), [&] (const size_t &x, const size_t &y) { return ( min_scores.at( x ) > min_scores.at( y ) ); } ); // Calculate the total score of all values and the cutoff for best_score_percentage % of it const double total_score = accumulate( min_scores, 0.0 ); const double percentile_total_score = total_score * best_score_percentage / 100.0; // Step through score_sorted_indices and grab them until the sum of their scores exceeds // percentile_total_score, then return a sorted copy of all those indices double temp_sum( 0.0 ); return sort_build<size_vec>( common::cbegin( score_sorted_indices ), find_if( score_sorted_indices, [&] (const size_t &x) { temp_sum += min_scores[ x ]; return ( temp_sum > percentile_total_score ); } ) ); }
/// \brief Build a 2D vector of MATRIX_TYPE by calling convert_element for every (row, col) position
///
/// The dimensions are taken from the first image in the input; convert_element is handed the
/// whole input along with the row and column and its result is stored at result[row][col].
///
/// An empty input produces a 0 x 0 result (previously the first image was read unconditionally,
/// which is undefined behaviour for an empty vector).
///
/// \param input           The images to convert
/// \param convert_element Callable invoked as convert_element(input, row, col)
/// \returns The 2D vector made by util::make_vector_2d<MATRIX_TYPE>() and filled as described
decltype(auto) convert_helper(const std::vector<cv::Mat>& input, CONVERT_FUN convert_element) {
  using boost::irange;

  // Without a first image there are no dimensions to read, so fall back to an empty grid
  const auto ROWS = input.empty() ? 0 : input[0].rows;
  const auto COLS = input.empty() ? 0 : input[0].cols;

  auto result = util::make_vector_2d<MATRIX_TYPE>(ROWS, COLS);
  for (auto row : irange(0, ROWS)) {
    for (auto col : irange(0, COLS)) {
      result[row][col] = convert_element(input, row, col);
    }
  }
  return result;
}
/// \brief Generate names to use in the viewer for the specified number of colours str_vec cath::generate_colour_names(const size_t &arg_num_colours ///< The total number of colours ) { return transform_build<str_vec>( irange( 0_z, arg_num_colours ), [&] (const size_t &x) { return generate_colour_name( x, arg_num_colours ); } ); }
/// \brief TODOCUMENT /// /// \relates score_classn_value_results_set void cath::score::write_to_svm_light_data_files(const score_classn_value_results_set &arg_results_set, ///< TODOCUMENT const path &arg_output_file_stem, ///< TODOCUMENT const size_t &arg_num_repeats, ///< TODOCUMENT mt19937 &arg_rng, ///< TODOCUMENT const double &arg_fraction_train ///< TODOCUMENT ) { const auto num_instances = get_num_instances ( arg_results_set ); const auto names = get_names ( arg_results_set ); const auto instance_labels = get_instance_labels ( arg_results_set ); const auto scalings = get_value_list_scalings( arg_results_set ); // Get a data structure in which all lists are sorted in the same way const auto results = transform_build<score_classn_value_vec_vec>( names, [&] (const string &x) { const auto &results_list = arg_results_set.get_score_classn_value_list_of_name( x ); return get_score_classn_values_of_instance_labels( results_list, instance_labels ); } ); for (const size_t &repeat_ctr : irange( 1_z, arg_num_repeats + 1 ) ) { const auto repeat_ctr_str = lexical_cast<string>( repeat_ctr ); const path train_file = replace_extension_copy( arg_output_file_stem, "." + repeat_ctr_str + ".train" ); const path test_file = replace_extension_copy( arg_output_file_stem, "." + repeat_ctr_str + ".test" ); const size_vec_size_vec_pair split_indices = random_split( arg_rng, num_instances, arg_fraction_train ); write_to_svm_light_data_files_impl( results, scalings, train_file, split_indices.first ); write_to_svm_light_data_files_impl( results, scalings, test_file, split_indices.second ); } }
/// \brief TODOCUMENT /// /// Things for investigation: /// * raw/final score from alignment /// * expected raw/final score /// * top categorised reasons by raw score /// * repeat for varying scan_stride (can just be called by nmnf fn) alignment_scan_comparison check_scan_on_final_alignment::do_check(const alignment &arg_alignment, ///< TODOCUMENT const protein &arg_protein_a, ///< TODOCUMENT const protein &arg_protein_b, ///< TODOCUMENT const quad_criteria &arg_criteria, ///< TODOCUMENT const scan_stride &arg_scan_stride ///< TODOCUMENT ) const { const auto aln_range = irange( 0_z, arg_alignment.length() ); cerr << "SHOULD THE RANGE BE 7.0 RATHER THAN SQRT(40.0)????\n"; return accumulate( cross( aln_range, aln_range ), alignment_scan_comparison{}, [&] (alignment_scan_comparison x, const size_size_tpl &y) { const size_t aln_from_ctr = get<0>( y ); const size_t aln_to_ctr = get<1>( y ); const bool from_alns_both = has_both_positions_of_index( arg_alignment, aln_from_ctr ); const bool to_alns_both = has_both_positions_of_index( arg_alignment, aln_to_ctr ); if ( from_alns_both && to_alns_both ) { const auto a_from = get_a_position_of_index( arg_alignment, aln_from_ctr ); const auto b_from = get_b_position_of_index( arg_alignment, aln_from_ctr ); const auto a_to = get_a_position_of_index( arg_alignment, aln_to_ctr ); const auto b_to = get_b_position_of_index( arg_alignment, aln_to_ctr ); const bool a_included = difference( a_from, a_to ) > NUM_EXCLUDED_ON_SIDES; const bool b_included = difference( b_from, b_to ) > NUM_EXCLUDED_ON_SIDES; if ( a_included && b_included ) { const auto the_distance = distance_between_points( view_vector_of_residue_pair( arg_protein_a.get_residue_ref_of_index( a_from ), arg_protein_a.get_residue_ref_of_index( a_to ) ), view_vector_of_residue_pair( arg_protein_b.get_residue_ref_of_index( b_from ), arg_protein_b.get_residue_ref_of_index( b_to ) ) ); // const auto score = ( the_distance >= 7.0 ) ? 
0.0 : ( 1.0 - the_distance / 7.0 ); const auto score = ( the_distance >= sqrt( 40.0 ) ) ? 0.0 : ( 1.0 - the_distance / 7.0 ); if ( score > 0.0 ) { const auto scan_result = quad_and_rep_criteria_result_of( arg_protein_a, arg_protein_b, arg_criteria, arg_scan_stride, numeric_cast<index_type>( a_from ), numeric_cast<index_type>( a_to ), numeric_cast<index_type>( b_from ), numeric_cast<index_type>( b_to ) ); x += make_pair( scan_result, score ); } } } return x; } ); }
/// \brief Generate a string describing the difference between the specified dssp_dupl_ress parsed from a DSSP file /// and the specified bifur_hbond_list or return none if there isn't any difference /// /// \relates dssp_dupl_res str_opt cath::sec::difference_string(const dssp_dupl_res_vec &arg_dssp_dupl_res_vec, ///< A vector of dssp_dupl_res parsed from a DSSP file const bifur_hbond_list &arg_bifur_hbond_list ///< The dssp_dupl_res_list to be compared to the dssp_dupl_res vector ) { const auto num_dssp_dupl_res = arg_dssp_dupl_res_vec.size(); const auto arg_bifur_hbonds = arg_bifur_hbond_list.size(); const auto dssp_non_null_indices = copy_build<size_vec>( irange( 0_z, num_dssp_dupl_res ) | filtered( [&] (const size_t &x) { return ! arg_dssp_dupl_res_vec[ x ].pdb_residue_name.is_null(); } ) ); const size_t num_non_null_residues = dssp_non_null_indices.size(); const str_opt num_prob = make_optional( ( num_non_null_residues != arg_bifur_hbonds ), "Number of (non-null) DSSP hbond residues (" + ::std::to_string( num_non_null_residues ) + "), doesn't match the number of calculated hbond residues (" + ::std::to_string( arg_bifur_hbonds ) + "). " ); const auto normal_index_of_dssp_index = [&] () { size_vec result( num_dssp_dupl_res, 0 ); for (const size_t &x : irange( 0_z, dssp_non_null_indices.size() ) ) { result[ dssp_non_null_indices[ x ] ] = x; } return result; }(); for (const size_t &index : irange( 0_z, min( num_dssp_dupl_res, arg_bifur_hbonds ) ) ) { const auto diff_str = difference_string( arg_dssp_dupl_res_vec[ dssp_non_null_indices[ index ] ], arg_bifur_hbond_list[ index ], normal_index_of_dssp_index ); if ( diff_str ) { return ( num_prob ? *num_prob : string{} ) + *diff_str; } } return num_prob ? make_optional( *num_prob + " No differences spotted until the end of the shortest" ) : none; }
/// \brief TODOCUMENT /// /// \relates score_classn_value_results_set str_vec cath::score::get_names(const score_classn_value_results_set &arg_results_set ///< TODOCUMENT ) { return transform_build<str_vec>( irange( 0_z, arg_results_set.size() ), [&] (const size_t &x) { return get_name_of_index( arg_results_set, x ); } ); }
std::vector<std::string> generateStitchSet(const std::string& directory, boost::format pattern, bool reverse, int start, int rowSize) { std::cout << format("Generating stitch set for range %1% to %2%\n") % start % (start + rowSize); std::cout << format("Reversing? %1%\n") % reverse; std::vector<std::string> files; for (auto i : irange(start, rowSize + start + 1)) { std::string filename = boost::str(pattern % directory % i); files.push_back(filename); } if (reverse) std::reverse(files.begin(), files.end()); return files; }
/// \brief Private static method that implements the process of building the views from proteins coord_vec_vec view_cache::build_views(const protein &arg_protein ///< The protein which the view_cache should be built to represent ) { // Grab the number of residues and prepare the views accordingly const size_t num_residues = arg_protein.get_length(); coord_vec_vec new_views( num_residues ); for (coord_vec &view_of : new_views) { view_of.reserve( num_residues ); } // Loop over the all from-versus-to residue pairs and add the resulting views for (const size_t &from_res_ctr : irange( 0_z, num_residues ) ) { for (const size_t &to_res_ctr : irange( 0_z, num_residues ) ) { coord_vec &view_of_from = new_views[ from_res_ctr ]; view_of_from.push_back( view_vector_of_residue_pair( arg_protein.get_residue_ref_of_index( from_res_ctr ), arg_protein.get_residue_ref_of_index( to_res_ctr ) ) ); } } return new_views; }
/// \brief TODOCUMENT broad_display_colour_spec display_colourer_consecutive::do_get_colour_spec_from_num_entries(const size_t &arg_num_entries ///< The number of structures to be coloured ) const { // Create a new display_colour_spec and populate it for the entries with colours broad_display_colour_spec new_spec; for (const size_t entry_ctr : irange( 0_z, arg_num_entries ) ) { new_spec.colour_pdb( entry_ctr, colour_of_mod_index( colours, entry_ctr ) ); } // Return the generated display_colour_spec return new_spec; }
/// \brief TODOCUMENT hmmer_scores_entry_vec hmmer_scores_file::remove_duplicates(const hmmer_scores_entry_vec &arg_hmmer_scores_entries ///< TODOCUMENT ) { hmmer_scores_entry_vec results; str_str_pair_size_map index_of_previously_seen; return transform_build<hmmer_scores_entry_vec>( irange( 0_z, arg_hmmer_scores_entries.size() ) | filtered( [&] (const size_t &x) { const auto &entry = arg_hmmer_scores_entries[ x ]; const auto &id1 = entry.get_name_1(); const auto &id2 = entry.get_name_2(); const auto &evalue = entry.get_full_sequence_evalue(); const auto name_pair = make_pair( id1, id2 ); if ( ! contains( index_of_previously_seen, name_pair ) ) { index_of_previously_seen.emplace( name_pair, x ); return true; } cerr << "Ooo - crazy things happening with HMMER parsing" << "\n"; const auto prev_entry = arg_hmmer_scores_entries[ index_of_previously_seen.at( name_pair ) ]; // if ( entry.get_hit_num() <= prev_entry.get_hit_num() ) { // BOOST_THROW_EXCEPTION(invalid_argument_exception( // "When parsing PRC results, found hit between " + id1 + " and " + id2 + " with hit number that isn't higher than for previous result" // )); // BOOST_LOG_TRIVIAL( warning ) << "When parsing PRC results, found hit between " << id1 << " and " << id2 << " with hit number that isn't higher than for previous result"; // } if ( evalue < prev_entry.get_full_sequence_evalue() ) { BOOST_THROW_EXCEPTION(invalid_argument_exception( "When parsing HMMER results, found hit between " + id1 + " and " + id2 + " with evalue better than for previous result" )); } if ( entry.get_full_sequence_score() > prev_entry.get_full_sequence_score() ) { BOOST_THROW_EXCEPTION(invalid_argument_exception( "When parsing HMMER results, found hit between " + id1 + " and " + id2 + " with simple score better than for previous result " + " " + std::to_string( entry.get_full_sequence_score() ) + " " + std::to_string( prev_entry.get_full_sequence_score() ) )); } return false; } ), [&] (const size_t &x) { return 
arg_hmmer_scores_entries[ x ]; } ); return results; }
/// \brief TODOCUMENT size_size_pair_vec cath::align::get_alignment_break_pairs(const alignment &arg_alignment ///< TODOCUMENT ) { const auto alignment_breaks = get_alignment_breaks( arg_alignment ); const auto num_alignment_breaks = alignment_breaks.size(); size_size_pair_vec break_pairs; break_pairs.reserve( num_alignment_breaks ); for (const size_t &idx_one : irange( 0_z, num_alignment_breaks ) ) { for (const size_t &idx_two : irange( idx_one, num_alignment_breaks ) ) { const size_t &break_one = alignment_breaks[ idx_one ]; const size_t &break_two = alignment_breaks[ idx_two ]; const auto pair_result = check_pair( arg_alignment, break_one, break_two ); if ( pair_result.first == break_pair_validity::GOOD ) { break_pairs.emplace_back( break_one, break_two ); } if ( pair_result.second == break_pair_future::NEVER_AGAIN ) { break; } } } return break_pairs; }
/// \brief Add one truth table entry per element of the permutation
///
/// For each index i, an entry mapping the cube of i to the cube of permutation[i] is added,
/// both cubes being built by number_to_truth_table_cube() with n bits, where n is the
/// smallest number of bits for which 2^n >= permutation.size().
///
/// The bit width is computed with integer arithmetic: the previous ceil(log(size) / log(2))
/// was subject to floating-point rounding and was undefined for an empty permutation.
///
/// \param spec        The truth table to which the entries are added
/// \param permutation The output value for each input index
void add_entries_from_permutation( binary_truth_table& spec, const std::vector<unsigned>& permutation )
{
  using boost::combine;
  using boost::irange;
  using boost::get;

  const auto num_entries = permutation.size();

  // ceil(log2(num_entries)) equals the number of bits needed to write (num_entries - 1)
  unsigned n = 0u;
  if ( num_entries > 1u )
  {
    for ( auto remaining = num_entries - 1u; remaining != 0u; remaining >>= 1 )
    {
      ++n;
    }
  }

  for ( const auto& i : combine( irange( 0u, static_cast<unsigned>( num_entries ) ), permutation ) )
  {
    spec.add_entry( number_to_truth_table_cube( get<0>( i ), n ),
                    number_to_truth_table_cube( get<1>( i ), n ) );
  }
}
void Pyramid::iterate() { using std::cout; using std::endl; using std::flush; using boost::irange; using accumulators::sum; using accumulators::weight; auto iteration_cnt = 0; auto levels = irange(1lu, height); for (const auto lv : levels) { cout << endl << "=== Level : " << lv << " === " << shape(lv) << " ===" << flush; assert(lv > 0); cout << endl << "Erasing bad links ..." << flush; erase_bad_links(lv); while (true) { cout << endl << "=== Iteration: " << iteration_cnt << " ==="; auto spec_dist_fun = [this](auto node, auto desc) { return wishart_distance(node, desc); }; iteration_step(lv, spec_dist_fun); if (link.activity(lv) < conf.activity_eps || conf.max_iterations == iteration_cnt) { iteration_cnt++; break; } iteration_cnt++; } if (conf.max_iterations == iteration_cnt) { cout << endl << "Maximum Iterations reached." << endl; } else { cout << endl << "Converged." << endl; } } }
int main(int argc, char** argv) { bool useGpu = true; float matchConfidence = 0.3f; std::string directory = "/Users/billylee/source/gladstone/data/cultured-neuron/ch1"; boost::format filePattern("%1%/PID20151028_JL_T0_0_D1_%2%_488_642_Empty_684_1_0.0_AndorZyla110XELWD.tif"); // read all images into memory, then do an estimateRigidTransform on all pairs // of images, and then do a cv::gpu::warpAffine /* std::vector<std::vector<cv::Mat>> imageGrid; for (auto i : irange(1, 6)) { for (auto j : irange(1, 6)) { std::string file = filePattern % directory % (i * j); imageGrid[i].push_back(cv::imread(file)); } } std::vector<cv::MatchesInfo> pairwise_matches; cv::BestOf2NearestMatcher matcher(useGpu, matchConfidence); cv::Mat matchMask(features.size(), features.size(), cv::CV_8U, ) cv::Mat stitched; cv::Stitcher stitcher = cv::Stitcher::createDefault(true); cv::Stitcher::Status status = stitcher.stitch(images, stitched); if (status != cv::Stitcher::OK) { std::cout << "Can't stitch!" } */ for (auto i : irange (0, 5)) { bool reverse = (i + 1) % 2 != 0; auto files = generateStitchSet(directory, filePattern, reverse, 5*i + 1, 4); std::array<cv::Mat, 5> images; // load set into memory int c = 0; for (auto f : files) { images[c] = cv::imread(f); c += 1; } stitchSet(images); } }
/// \brief TODOCUMENT /// /// \relates score_classn_value_results_set named_true_false_pos_neg_list_list cath::score::make_named_true_false_pos_neg_list_list(const score_classn_value_results_set &arg_score_classn_value_results_set ///< TODOCUMENT ) { return { transform_build<named_true_false_pos_neg_list_vec>( irange( 0_z, arg_score_classn_value_results_set.size() ), [&] (const size_t &x) { const auto &name = get_name_of_index( arg_score_classn_value_results_set, x ); const auto &scores = arg_score_classn_value_results_set.get_score_classn_value_list_of_name( name ); // \todo Put this check in get_score_classn_value_list_of_name() if ( name != scores.get_name() ) { BOOST_THROW_EXCEPTION(invalid_argument_exception("Mismatching names in score_classn_value_results_set")); } return make_named_true_false_pos_neg_list( scores ); } ) }; }
/// \brief Make the 4 x 4 matrix of spatial distances
///
/// The matrix is filled with the constants below and then every coefficient is replaced
/// by its square root (so the constants are presumably squared distances — TODO confirm).
Mat4f Pyramid::make_spatial_distance_matrix() const {
  using std::sqrt;

  auto distances = Mat4f(4, 4);
  /* clang-format off */
  distances << 9, 5, 5, 9,
               5, 1, 1, 5,
               5, 1, 1, 5,
               9, 5, 5, 9;
  /* clang-format on */

  // Take the root of each coefficient via linear indexing
  for (long idx = 0; idx < distances.size(); ++idx) {
    distances(idx) = sqrt(distances(idx));
  }
  return distances;
}
/// \brief Apply a smoothing function to each level from top_level down to 0 and store the result
///
/// The smoothing works on copies of the pyramid's values and links; the values returned by the
/// smoothing function are summed and reported as the number of segments. Afterwards the bottom
/// of the smoothed values is set as the pyramid's result. Progress is written to std::cout.
///
/// \param p         The pyramid to read from and to store the result in
/// \param smooth    Called as smooth(values, links, level) for each level
/// \param top_level The first (highest) level to smooth
void smooth_helper(Pyramid& p, smooth_level_fun smooth, const size_t top_level) {
  using namespace std;

  auto segment_count = 0;
  auto values = ImagePyramid{p.get_value2()};
  auto links = LinkPyramid{p.get_links()};

  // Count down from top_level to 0 (inclusive) without letting an unsigned index go below zero
  for (size_t levels_left = top_level + 1; levels_left > 0; --levels_left) {
    const size_t lv = levels_left - 1;
    cout << "Smoothing Level " << lv << " ... ";
    segment_count += smooth(values, links, lv);
    cout << endl;
  }

  p.set_result(values.bottom());
  cout << "Done." << endl;
  cout << "Number of Segments: " << segment_count << endl;
}
/// \brief TODOCUMENT size_vec cath::align::get_alignment_breaks(const alignment &arg_alignment ///< TODOCUMENT ) { // For each alignment index *after* zero, check whether there's any overlap // between the entries that are present at the previous index and the entries that are // present at this index. If not, add this index to the list. // // Note: Would prefer to use Boost Range's adjacent_filtered adaptor here but // it doesn't do what's required here: it always lets the first pair through, // even if they fail the predicate. return copy_build<size_vec>( irange( 1_z, arg_alignment.length() ) | filtered ( [&] (const size_t &curr_idx) { // Return whether the set of present entries for the previous index // is disjoint with the set of present entries for this index return sets_are_disjoint( entries_present_at_index( arg_alignment, curr_idx - 1 ), entries_present_at_index( arg_alignment, curr_idx ) ); } ) ); }
/// \brief Build a ssaps_and_prcs_of_query from the specified ssap_scores_entries and prc_scores_entries /// /// \relates ssaps_and_prcs_of_query ssaps_and_prcs_of_query cath::homcheck::make_ssaps_and_prcs_of_query(const ssap_scores_entry_vec &arg_ssaps, ///< The SSAPs from which the ssaps_and_prcs_of_query should be built const prc_scores_entry_vec &arg_prcs ///< The PRCs from which the ssaps_and_prcs_of_query should be built ) { // Sanity check the inputs - step 1: check SSAP results have identical name_1s const bool ssap_query_ids_identical = all_of( arg_ssaps | adjacented, [] (const pair<const ssap_scores_entry &, const ssap_scores_entry &> &x) { return x.first.get_name_1() == x.second.get_name_1(); } ); if ( ! ssap_query_ids_identical ) { BOOST_THROW_EXCEPTION(invalid_argument_exception( "Cannot construct ssaps_and_prcs_of_query from data because the SSAP results have inconsistent query IDs (first is \"" + arg_ssaps.front().get_name_1() + "\")" )); } // Sanity check the inputs - step 2: check PRC results have identical name_1s const bool prc_query_ids_identical = all_of( arg_prcs | adjacented, [] (const pair<const prc_scores_entry &, const prc_scores_entry &> &x) { return x.first.get_name_1() == x.second.get_name_1(); } ); if ( ! prc_query_ids_identical ) { BOOST_THROW_EXCEPTION(invalid_argument_exception("Cannot construct ssaps_and_prcs_of_query from data because the PRC results have inconsistent query IDs (first is \"" + arg_prcs.front().get_name_1() + "\")" )); } // Sanity check the inputs - step 3: check the SSAP name_1 matches the PRC name_1 if ( ! arg_ssaps.empty() && ! 
arg_prcs.empty() ) { const string &ssaps_name_1 = arg_ssaps.front().get_name_1(); const string &prcs_name_1 = arg_prcs.front().get_name_1(); if ( ssaps_name_1 != prcs_name_1 ) { BOOST_THROW_EXCEPTION(invalid_argument_exception( "Cannot construct ssaps_and_prcs_of_query from data because the SSAP results query ID \"" + ssaps_name_1 + "\" does not match the PRC results query ID \"" + prcs_name_1 + "\"" )); } } const auto prc_index_of_ids = transform_build<unordered_map<str_str_pair, size_t, pair_hash> >( irange( 0_z, arg_prcs.size() ), [&] (const size_t &x) { const auto &the_prc = arg_prcs[ x ]; return make_pair( make_pair( the_prc.get_name_1(), the_prc.get_name_2() ), x ); } ); vector<ssap_and_prc> ssap_and_prc_entries; for (const ssap_scores_entry &the_ssap : arg_ssaps) { const string &query_id = the_ssap.get_name_1(); const string &match_id = the_ssap.get_name_2(); const auto prc_index_itr = prc_index_of_ids.find( make_pair( query_id, match_id ) ); if ( prc_index_itr != common::cend( prc_index_of_ids ) ) { ssap_and_prc_entries.emplace_back( the_ssap, arg_prcs[ prc_index_itr->second ] ); } } const auto num_ssaps = arg_ssaps.size(); const auto num_prcs = arg_prcs.size(); const auto num_comb = ssap_and_prc_entries.size(); const string query_id_str = ( ! ssap_and_prc_entries.empty() ) ? ( " (query: " + get_query_id( ssap_and_prc_entries ) + ")" ) : ""; const string unmatched_ssaps_str = ( num_ssaps > num_comb ) ? ( std::to_string( num_ssaps - num_comb ) + " unmatched SSAP results from " + std::to_string( num_ssaps ) ) : ""; const string unmatched_prcs_str = ( num_prcs > num_comb ) ? ( std::to_string( num_prcs - num_comb ) + " unmatched PRC results from " + std::to_string( num_prcs ) ) : ""; const string conjuction_str = ( unmatched_ssaps_str.empty() || unmatched_prcs_str.empty() ) ? "" : " and "; if ( ! unmatched_ssaps_str.empty() || ! 
unmatched_prcs_str.empty() ) { BOOST_LOG_TRIVIAL( warning ) << "After parsing " << num_comb << " ssaps_and_prcs_of_query" << query_id_str << ", was left with " << unmatched_ssaps_str << conjuction_str << unmatched_prcs_str; } return { ssap_and_prc_entries }; }
/// \brief TODOCUMENT void cath::score::detail::write_to_svm_light_data_files_impl(const score_classn_value_vec_vec &arg_results, ///< TODOCUMENT const value_list_scaling_vec &arg_value_list_scalings, ///< TODOCUMENT const path &arg_output_file, ///< TODOCUMENT const size_vec &arg_indices ///< TODOCUMENT ) { // Sanity check the inputs if ( ! is_sorted( arg_indices ) ) { BOOST_THROW_EXCEPTION(invalid_argument_exception("Cannot write SVM data for indices that aren't sorted")); } if ( contains_adjacent_match( arg_results, [] (const score_classn_value_vec &x, const score_classn_value_vec &y) { return x.size() != y.size(); } ) ) { BOOST_THROW_EXCEPTION(invalid_argument_exception("Not all results to be written to an SVM data file are of equal size")); } if ( contains_if( arg_results, [&] (const score_classn_value_vec &x) { return x.size() <= arg_indices.back(); } ) ) { BOOST_THROW_EXCEPTION(invalid_argument_exception("The results to be written to an SVM data file are not big enough for the specified indices")); } if ( arg_value_list_scalings.size() != arg_results.size() ) { BOOST_THROW_EXCEPTION(invalid_argument_exception("The number of scalings doesn't match the number of score_classn_value_vecs when attempting to write SVM data ")); } // if ( ! results.empty() ) { // const auto first_results_size = results.front().size(); // if ( ! all_of( results, [] (const score_classn_value_vec &x) { return x.size() == first_results_size; } ) ) { // BOOST_THROW_EXCEPTION(invalid_argument_exception("Not all results to be written to an SVM data file are of equal size")); // } // } // if ( ! arg_indices.empty() && ! 
results.empty() ) { // if ( arg_indices.back() >= arg_results.front().size() ) { // BOOST_THROW_EXCEPTION(invalid_argument_exception("The results to be written to an SVM data file are not big enough for the specified indices")); // } // } // Open an ostream for the file ofstream out_stream; open_ofstream( out_stream, arg_output_file ); // Loop over the instances of the requested indices for (const size_t &index : arg_indices) { // Check for any mismatching entries wrt their instance label/is_positive value const bool mismatching_instances = contains_adjacent_match( arg_results, [&] (const score_classn_value_vec &x, const score_classn_value_vec &y) { return ( ( x[ index ].get_instance_is_positive() != y[ index ].get_instance_is_positive() ) || ( x[ index ].get_instance_label() != y[ index ].get_instance_label() ) ); } ); if ( mismatching_instances ) { BOOST_THROW_EXCEPTION(invalid_argument_exception("When attempting to write SVM data, detected mismatching entries wrt their instance label/is_positive value")); } // If there are arg_results, then output a line of data if ( ! arg_results.empty() ) { const auto &first_value = arg_results.front()[ index ]; // First, output '+1 ' if this instance is positive and '-1 ' otherwise out_stream << ( first_value.get_instance_is_positive() ? string( "+1 " ) : string( "-1 " ) ); // Next output the data in the format : '1:value_1 2:value_2 [...] n:value_n' out_stream << join( transform_build<str_vec>( irange( 0_z, arg_results.size() ), [&] (const size_t &x) { const auto &scaling = arg_value_list_scalings[ x ]; const auto scaled_score = scale_value_copy( scaling, arg_results[ x ][ index ].get_score_value() ); return lexical_cast<string>( x + 1 ) + ":" + lexical_cast<string>( scaled_score ); } ), " " ); // Finally, append a comment containing the label of the instance out_stream << " # " << first_value.get_instance_label() << "\n"; } } // Flush and close the output stream out_stream << flush; out_stream.close(); }
/// \brief Tally up the residue records parsed from a PDB file with those parsed from a corresponding DSSP/WOLF file /// /// \pre Every valid DSSP/WOLF residue should have an equivalent PDB residue and matching residues should be in the same order in both. /// /// The DSSP file may have a NULL residue to indicate a break in the chain or a residue that cannot be properly represented. /// The WOLF file may just skip some residues. /// /// Since this is attempting to find the PDB residue that matches each DSSP/WOLF residue, it is implemented /// with a simple loop through each of the DSSP/WOLF residues whilst maintaining a counter to point to the /// current PDB residue name. /// /// \returns A list of pairs of equivalent indices (offset 0) between residues in the PDB and DSSP/WOLF size_size_pair_vec cath::file::tally_residue_ids(const residue_id_vec &arg_pdb_residue_ids, ///< A list of residue_ids parsed from the PDB file const residue_id_vec &arg_dssp_or_wolf_residue_ids, ///< A list of residue_ids parsed from the DSSP/WOLF file (with a null residue represented with an empty string) const bool &arg_permit_breaks_without_null_residues, ///< (true for WOLF files and false for DSSP files (at least >= v2.0) const bool &arg_permit_head_tail_break_without_null_residue, ///< (true even for DSSP v2.0.4: file for chain A of 1bvs stops with neither residue 203 or null residue (verbose message: "ignoring incomplete residue ARG (203)") const size_set &arg_skippable_pdb_indices ///< A list of the indices of PDB residue names that should always be considered for being skipped over to find a match to the next DSSP/WOLF residue ) { BOOST_LOG_TRIVIAL( trace ) << "Tallying PDB residue names: " << join( arg_pdb_residue_ids | lexical_casted<string>(), "," ) << " with DSSP/WOLF residue names: " << join( arg_dssp_or_wolf_residue_ids | lexical_casted<string>(), "," ); // Sanity check the inputs // /// \todo Add check that arg_dssp_or_wolf_residue_ids has no duplicates other than empty 
strings and // Check that arg_pdb_residue_ids contains no empty strings if ( contains( arg_pdb_residue_ids, residue_id{} ) ) { BOOST_THROW_EXCEPTION(invalid_argument_exception("PDB residues should not contain empty residues names")); } // Check that arg_pdb_residue_ids contains no duplicates if ( ! is_uniq_for_unordered( arg_pdb_residue_ids ) ) { BOOST_THROW_EXCEPTION(invalid_argument_exception("PDB residues should not contain duplicate entries " + join( arg_pdb_residue_ids | lexical_casted<string>(), "," ) )); } // Check that arg_dssp_or_wolf_residue_ids contains no consecutive duplicates (not even duplicate empty strings) if ( contains_adjacent_match( arg_dssp_or_wolf_residue_ids ) ) { BOOST_THROW_EXCEPTION(invalid_argument_exception("DSSP residues should not contain duplicate consecutive entries (not even null residues)")); } size_size_pair_vec alignment; const auto num_pdb_residues = arg_pdb_residue_ids.size(); const auto num_dssp_or_wolf_residues = arg_dssp_or_wolf_residue_ids.size(); const auto min_num_residues = min( num_pdb_residues, num_dssp_or_wolf_residues ); alignment.reserve( min_num_residues ); // Loop through the DSSP/WOLF residues, whilst also indexing through the PDB residues size_t pdb_residue_ctr = 0; for (const size_t &dssp_residue_ctr : irange( 0_z, num_dssp_or_wolf_residues ) ) { // Grab the DSSP/WOLF residue name const residue_id &dssp_or_wolf_res_id = arg_dssp_or_wolf_residue_ids[ dssp_residue_ctr ]; const bool &dssp_or_wolf_res_is_null = is_null( dssp_or_wolf_res_id ); // If the PDB index has stepped past the end of the PDB residues if (pdb_residue_ctr >= num_pdb_residues) { // Then if this DSSP/WOLF residue is a NULL entry then just skip it if ( dssp_or_wolf_res_is_null ) { continue; } // Otherwise, this is a valid DSSP/WOLF residue with no match in the PDB so throw a wobbly else { BOOST_THROW_EXCEPTION(invalid_argument_exception("DSSP/WOLF residue " + to_string( dssp_or_wolf_res_id ) + " overshoots the end of the PDB residues")); } } // 
Record whether this is a permitted head break region const bool permitted_head_break = ( arg_permit_head_tail_break_without_null_residue && pdb_residue_ctr == 0 ); // Create a lambda function to calculate whether the specified PDB residue counter should/can be advanced // to find a match with the specified DSSP/WOLF residue name target const auto should_advance_pdb_res_ctr_for_target_fn = [&] (const size_t &arg_pdb_res_ctr, const residue_id &arg_target ) { const bool mismatches = ( arg_pdb_residue_ids[ arg_pdb_res_ctr ] != arg_target ); const bool reason_for_mismatch = ( dssp_or_wolf_res_is_null || arg_permit_breaks_without_null_residues || permitted_head_break || contains( arg_skippable_pdb_indices, arg_pdb_res_ctr ) ); return ( mismatches && reason_for_mismatch ); }; // If should advance... if ( should_advance_pdb_res_ctr_for_target_fn( pdb_residue_ctr, dssp_or_wolf_res_id ) ) { // If is a null residue at the end of the DSSP/WOLF, then set pdb_residue_ctr to the end of the PDB if ( dssp_or_wolf_res_is_null && dssp_residue_ctr + 1 >= num_dssp_or_wolf_residues) { pdb_residue_ctr = num_pdb_residues; } // Otherwise, it's necessary to search for the PDB residue that matches: // * the next DSSP/WOLF residue if this one is empty or // * this mismatching DSSP/WOLF residue otherwise else { // Grab the string to search for const residue_id &dssp_or_wolf_res_id_to_find = dssp_or_wolf_res_is_null ? 
arg_dssp_or_wolf_residue_ids[ dssp_residue_ctr + 1 ] : arg_dssp_or_wolf_residue_ids[ dssp_residue_ctr ]; // Scan through the PDB residues to find a match while ( pdb_residue_ctr < num_pdb_residues && should_advance_pdb_res_ctr_for_target_fn( pdb_residue_ctr, dssp_or_wolf_res_id_to_find ) ) { ++pdb_residue_ctr; } // If no matching residue was found in the PDB then throw a wobbly if ( pdb_residue_ctr >= num_pdb_residues ) { BOOST_THROW_EXCEPTION(invalid_argument_exception("Cannot find a match for DSSP/WOLF residue " + to_string( dssp_or_wolf_res_id_to_find ) )); } } // If this DSSP/WOLF residue is an empty then there is nothing more to do so move to the next loop if ( dssp_or_wolf_res_is_null ) { continue; } } // If these two residue names don't match then throw a wobbly const residue_id &pdb_res_id = arg_pdb_residue_ids[ pdb_residue_ctr ]; if ( pdb_res_id != dssp_or_wolf_res_id ) { BOOST_THROW_EXCEPTION(invalid_argument_exception( "Cannot match PDB residue " + to_string( pdb_res_id ) + " with DSSP/WOLF residue " + to_string( dssp_or_wolf_res_id ) + " - it may be worth double-checking the DSSP/WOLF file is generated from" " this version of the PDB file and, if you're using DSSP files, it may be" " worth ensuring you're using an up-to-date DSSP binary" )); } // Add this pair of residues to the alignment alignment.push_back(make_pair(pdb_residue_ctr, dssp_residue_ctr)); // Increment the pdb_residue_ctr ++pdb_residue_ctr; } // If there are further residues remaining in the PDB and permit_tail_break_without_null_residue is off then throw a wobbly if ( ! arg_permit_head_tail_break_without_null_residue && pdb_residue_ctr < num_pdb_residues ) { BOOST_THROW_EXCEPTION(invalid_argument_exception("PDB contains residues at the end that are not present at the end of the DSSP/WOLF")); } // No problem has been spotted so return the constructed alignment return alignment; }
/// \brief Combine a dssp_file and pdb representing the same structure in a sensible protein object /// /// \relates dssp_file /// /// \TODO Consider taking an ostream_ref_opt argument rather than assuming cerr /// (fix all errors, *then* provide default of boost::none) protein cath::file::protein_from_dssp_and_pdb(const dssp_file &arg_dssp_file, ///< The dssp_file object for a given structure const pdb &arg_pdb_file, ///< The dssp_file object for a given structure const dssp_skip_policy &arg_dssp_skip_policy, ///< Whether to exclude residues that are in the PDB but not the DSSP const string &arg_name, ///< The name to set as the title of the protein const ostream_ref_opt &arg_ostream ///< An optional reference to an ostream to which any logging should be sent ) { // Build a rough protein object from the pdb object const auto pdb_protein = build_protein_of_pdb( arg_pdb_file, arg_ostream, ( arg_dssp_skip_policy == dssp_skip_policy::SKIP__BREAK_ANGLES ) ? dssp_skip_policy::DONT_SKIP__BREAK_ANGLES : arg_dssp_skip_policy ); const auto pdb_skip_indices = get_protein_res_indices_that_dssp_might_skip( arg_pdb_file, arg_ostream ); // Grab the number of residues in the protein and dssp_file objects const auto num_dssp_residues = arg_dssp_file.get_num_residues(); const auto num_pdb_residues = pdb_protein.get_length(); // Grab the residues names from the DSSP and PDB and then tally them up const auto pdb_res_names = get_residue_ids ( pdb_protein ); const auto dssp_res_names = get_residue_ids ( arg_dssp_file, false ); const auto alignment = tally_residue_ids( pdb_res_names, dssp_res_names, false, true, pdb_skip_indices ); // Prepare a list of new residue to populate residue_vec new_residues; new_residues.reserve( ( arg_dssp_skip_policy == dssp_skip_policy::SKIP__BREAK_ANGLES ) ? 
num_dssp_residues : num_pdb_residues ); // Loop over the residues size_t alignment_ctr = 0; for (const size_t &pdb_residue_ctr : irange( 0_z, num_pdb_residues ) ) { const residue &the_pdb_residue = pdb_protein.get_residue_ref_of_index( pdb_residue_ctr ); // If this PDB residue is in the alignment then it can be combined with the equivalent DSSP residue const bool is_in_alignment = ( (alignment_ctr < alignment.size() ) && ( alignment[alignment_ctr].first == pdb_residue_ctr ) ); if ( is_in_alignment ) { // Combine the two residues and add them to the back const residue &the_dssp_residue = arg_dssp_file.get_residue_of_index( alignment[alignment_ctr].second ); new_residues.push_back( combine_residues_from_dssp_and_pdb( the_dssp_residue, the_pdb_residue, angle_skipping_of_dssp_skip_policy( arg_dssp_skip_policy ) ) ); // Increment the alignment counter ++alignment_ctr; } else if ( res_skipping_of_dssp_skip_policy( arg_dssp_skip_policy ) == dssp_skip_res_skipping::DONT_SKIP ) { new_residues.push_back( the_pdb_residue ); } } // Construct a new protein from the new list of residues return { arg_name, new_residues }; }